{
 "cells": [
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": "## Test word vectors",
   "id": "b79941bf4553fd6"
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-11-18T23:28:55.563335Z",
     "start_time": "2025-11-18T23:28:53.763429Z"
    }
   },
   "cell_type": "code",
   "source": [
    "# NOTE(review): import_ipynb is not used by any visible cell; confirm it is still needed.\n",
    "import import_ipynb\n",
    "import spacy\n",
    "from sklearn.metrics.pairwise import cosine_similarity\n",
    "\n",
    "nlp = spacy.load(\"en_core_web_md\")\n",
    "\n",
    "words = [\"cat\", \"dog\", \"feline\", \"vehicle\", \"car\"]\n",
    "\n",
    "# Test word similarities for every ordered pair of distinct words.\n",
    "# NOTE(review): the saved output shows cat/dog at 1.000 and identical scores for\n",
    "# cat and dog against every other word, which suggests both keys resolve to the\n",
    "# same vector in en_core_web_md. Confirm before trusting these numbers; a model\n",
    "# with one vector per key (e.g. en_core_web_lg) may be needed.\n",
    "for word1 in words:\n",
    "    for word2 in words:\n",
    "        if word1 != word2:\n",
    "            similarity = nlp.vocab[word1].similarity(nlp.vocab[word2])\n",
    "            print(f\"{word1} - {word2}: {similarity:.3f}\")"
   ],
   "id": "8a3c4314a90086fe",
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "cat - dog: 1.000\n",
      "cat - feline: 0.363\n",
      "cat - vehicle: 0.078\n",
      "cat - car: 0.193\n",
      "dog - cat: 1.000\n",
      "dog - feline: 0.363\n",
      "dog - vehicle: 0.078\n",
      "dog - car: 0.193\n",
      "feline - cat: 0.363\n",
      "feline - dog: 0.363\n",
      "feline - vehicle: 0.180\n",
      "feline - car: 0.050\n",
      "vehicle - cat: 0.078\n",
      "vehicle - dog: 0.078\n",
      "vehicle - feline: 0.180\n",
      "vehicle - car: 0.205\n",
      "car - cat: 0.193\n",
      "car - dog: 0.193\n",
      "car - feline: 0.050\n",
      "car - vehicle: 0.205\n"
     ]
    }
   ],
   "execution_count": 1
  },
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": "## Simple averaging",
   "id": "8f32b5695f554268"
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-11-18T23:45:03.085563Z",
     "start_time": "2025-11-18T23:45:03.082190Z"
    }
   },
   "cell_type": "code",
   "source": [
    "def sentence_similarity_avg(sent1, sent2):\n",
    "    \"\"\"Return the cosine similarity of two sentences' averaged token vectors.\n",
    "\n",
    "    Both sentences are parsed with the notebook-level ``nlp`` pipeline. Tokens\n",
    "    that report no vector are skipped; the remaining vectors are averaged per\n",
    "    sentence and the two averages are compared.\n",
    "\n",
    "    Args:\n",
    "        sent1: Text of the first sentence.\n",
    "        sent2: Text of the second sentence.\n",
    "\n",
    "    Returns:\n",
    "        The cosine similarity of the two averaged vectors, or 0.0 when either\n",
    "        sentence has no token with a vector.\n",
    "    \"\"\"\n",
    "    doc1 = nlp(sent1)\n",
    "    doc2 = nlp(sent2)\n",
    "\n",
    "    # Vectors for each word, filter out words without vectors (medium model)\n",
    "    vecs1 = [token.vector for token in doc1 if token.has_vector]\n",
    "    vecs2 = [token.vector for token in doc2 if token.has_vector]\n",
    "\n",
    "    # Nothing to average: avoid dividing by zero below.\n",
    "    if not vecs1 or not vecs2:\n",
    "        return 0.0\n",
    "\n",
    "    # Average vectors\n",
    "    avg1 = sum(vecs1) / len(vecs1)\n",
    "    avg2 = sum(vecs2) / len(vecs2)\n",
    "\n",
    "    # cosine_similarity works on 2-D inputs, so wrap each vector in a list and\n",
    "    # unwrap the single entry of the resulting 1x1 matrix.\n",
    "    return cosine_similarity([avg1], [avg2])[0][0]"
   ],
   "id": "68a6757447e4a1c7",
   "outputs": [],
   "execution_count": 3
  },
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": "## SIF - Smooth Inverse Frequency",
   "id": "a9c3aa050f5bc0fe"
  },
  {
   "metadata": {},
   "cell_type": "code",
   "outputs": [],
   "execution_count": null,
   "source": [
    "def sentence_similarity_sif(sent1, sent2):\n",
    "    \"\"\"SIF-based sentence similarity (work in progress).\n",
    "\n",
    "    TODO: only the parsing step is implemented; the weighting and the\n",
    "    similarity computation are missing, so this currently returns None.\n",
    "\n",
    "    Args:\n",
    "        sent1: Text of the first sentence.\n",
    "        sent2: Text of the second sentence.\n",
    "    \"\"\"\n",
    "    doc1 = nlp(sent1)\n",
    "    doc2 = nlp(sent2)"
   ],
   "id": "c100956f89d9b581"
  }
 ],
 "metadata": {
  "kernelspec": {
   "name": "python3",
   "language": "python",
   "display_name": "Python 3 (ipykernel)"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}