# %%
import spacy
# Module-level pipeline shared by every cell below; they read it as the global `nlp`.
nlp = spacy.load("en_core_web_lg") # Large model ("lg"); the medium package would be "en_core_web_md"

# %% [markdown]
# Test word vectors
\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mword2\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00msimilarity\u001b[38;5;132;01m:\u001b[39;00m\u001b[33m.3f\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m)\n", "\u001b[36mFile \u001b[39m\u001b[32m~/code/plagiarism-detector/.venv/lib/python3.13/site-packages/spacy/lexeme.pyx:146\u001b[39m, in \u001b[36mspacy.lexeme.Lexeme.similarity\u001b[39m\u001b[34m()\u001b[39m\n", "\u001b[36mFile \u001b[39m\u001b[32m~/code/plagiarism-detector/.venv/lib/python3.13/site-packages/spacy/lexeme.pyx:164\u001b[39m, in \u001b[36mspacy.lexeme.Lexeme.vector_norm.__get__\u001b[39m\u001b[34m()\u001b[39m\n", "\u001b[36mFile \u001b[39m\u001b[32m~/code/plagiarism-detector/.venv/lib/python3.13/site-packages/spacy/lexeme.pyx:176\u001b[39m, in \u001b[36mspacy.lexeme.Lexeme.vector.__get__\u001b[39m\u001b[34m()\u001b[39m\n", "\u001b[31mValueError\u001b[39m: [E010] Word vectors set to length 0. This may be because you don't have a model installed or loaded, or because your model doesn't include word vectors. 
# %%
def test_word_vectors(word):
    """Print ``word`` together with the shape of its vocabulary vector.

    Args:
        word: Surface form to look up in the global ``nlp.vocab``.
    """
    print(word, nlp.vocab[word].vector.shape)


words = ["cat", "dog", "feline", "feral", "vehicle", "car"]

# Fail early with an actionable message. Without this check the loop below
# dies inside spaCy with the generic E010 error when the pipeline currently
# bound to `nlp` carries no word vectors (for example a stale kernel where
# the loading cell was not re-run).
if nlp.vocab.vectors_length == 0:
    raise ValueError(
        "The spaCy pipeline bound to `nlp` has no word vectors; "
        "re-run the cell that loads en_core_web_lg before this one."
    )

# Test word similarities: every ordered pair of distinct words.
for word1 in words:
    for word2 in words:
        if word1 != word2:
            similarity = nlp.vocab[word1].similarity(nlp.vocab[word2])
            print(f"{word1} - {word2}: {similarity:.3f}")

# %% [markdown]
# Simple averaging

# %%
def _mean_token_vector(doc):
    """Return the mean of the vectors of tokens in ``doc`` that have one.

    Returns ``None`` when no token in ``doc`` has a vector.
    """
    # Skip out-of-vocabulary tokens so they do not enter the average.
    vectors = [token.vector for token in doc if token.has_vector]
    if not vectors:
        return None
    return sum(vectors) / len(vectors)


def sentence_similarity_avg(sent1, sent2):
    """Cosine similarity between the averaged token vectors of two sentences.

    Both sentences are parsed with the global ``nlp`` pipeline.

    Args:
        sent1: First sentence (text).
        sent2: Second sentence (text).

    Returns:
        The cosine similarity as a Python ``float``. ``0.0`` is returned when
        either sentence has no token with a vector, or when an averaged
        vector has zero length.
    """
    doc1 = nlp(sent1)
    doc2 = nlp(sent2)

    avg1 = _mean_token_vector(doc1)
    avg2 = _mean_token_vector(doc2)
    if avg1 is None or avg2 is None:
        return 0.0

    # Cosine similarity computed directly on the averaged arrays; no extra
    # library import is needed for a single pair of vectors.
    norm_product = (float(avg1.dot(avg1)) ** 0.5) * (float(avg2.dot(avg2)) ** 0.5)
    if norm_product == 0.0:
        # A zero-length vector has no direction; report "no similarity".
        return 0.0
    return float(avg1.dot(avg2)) / norm_product

# %% [markdown]
# SIF - Smooth Inverse Frequency

# %%
def sentence_similarity_sif(sent1, sent2):
    """Placeholder for a SIF-based sentence similarity (not implemented yet).

    Args:
        sent1: First sentence (text).
        sent2: Second sentence (text).

    Raises:
        NotImplementedError: Always, until the SIF weighting is written.
            Raising keeps callers from mistaking an implicit ``None`` for a
            similarity score.
    """
    # Parsing is kept as the starting point for the real implementation.
    doc1 = nlp(sent1)
    doc2 = nlp(sent2)
    # TODO: weight token vectors by smooth inverse frequency and compare.
    raise NotImplementedError("sentence_similarity_sif is not implemented yet")
"name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.13.7" } }, "nbformat": 4, "nbformat_minor": 5 }