research text
This commit is contained in:
@@ -1,35 +1,55 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "1638b7b97e3bd6f",
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2025-11-22T11:40:21.711998Z",
|
||||
"start_time": "2025-11-22T11:40:20.129376Z"
|
||||
}
|
||||
},
|
||||
"cell_type": "code",
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import spacy\n",
|
||||
"nlp = spacy.load(\"en_core_web_md\") # Medium model"
|
||||
],
|
||||
"id": "1638b7b97e3bd6f",
|
||||
"outputs": [],
|
||||
"execution_count": 11
|
||||
"nlp = spacy.load(\"en_core_web_lg\") # Large model"
|
||||
]
|
||||
},
|
||||
{
|
||||
"metadata": {},
|
||||
"cell_type": "markdown",
|
||||
"source": "Test word vectors",
|
||||
"id": "b79941bf4553fd6"
|
||||
"id": "b79941bf4553fd6",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Test word vectors"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"id": "8a3c4314a90086fe",
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2025-11-22T11:47:39.286432Z",
|
||||
"start_time": "2025-11-22T11:47:39.271377Z"
|
||||
}
|
||||
},
|
||||
"cell_type": "code",
|
||||
"outputs": [
|
||||
{
|
||||
"ename": "ValueError",
|
||||
"evalue": "[E010] Word vectors set to length 0. This may be because you don't have a model installed or loaded, or because your model doesn't include word vectors. For more info, see the docs:\nhttps://spacy.io/usage/models",
|
||||
"output_type": "error",
|
||||
"traceback": [
|
||||
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
|
||||
"\u001b[31mValueError\u001b[39m Traceback (most recent call last)",
|
||||
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[9]\u001b[39m\u001b[32m, line 9\u001b[39m\n\u001b[32m 7\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m word2 \u001b[38;5;129;01min\u001b[39;00m words:\n\u001b[32m 8\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m word1 != word2:\n\u001b[32m----> \u001b[39m\u001b[32m9\u001b[39m similarity = \u001b[43mnlp\u001b[49m\u001b[43m.\u001b[49m\u001b[43mvocab\u001b[49m\u001b[43m[\u001b[49m\u001b[43mword1\u001b[49m\u001b[43m]\u001b[49m\u001b[43m.\u001b[49m\u001b[43msimilarity\u001b[49m\u001b[43m(\u001b[49m\u001b[43mnlp\u001b[49m\u001b[43m.\u001b[49m\u001b[43mvocab\u001b[49m\u001b[43m[\u001b[49m\u001b[43mword2\u001b[49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 10\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mword1\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m - \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mword2\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00msimilarity\u001b[38;5;132;01m:\u001b[39;00m\u001b[33m.3f\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m)\n",
|
||||
"\u001b[36mFile \u001b[39m\u001b[32m~/code/plagiarism-detector/.venv/lib/python3.13/site-packages/spacy/lexeme.pyx:146\u001b[39m, in \u001b[36mspacy.lexeme.Lexeme.similarity\u001b[39m\u001b[34m()\u001b[39m\n",
|
||||
"\u001b[36mFile \u001b[39m\u001b[32m~/code/plagiarism-detector/.venv/lib/python3.13/site-packages/spacy/lexeme.pyx:164\u001b[39m, in \u001b[36mspacy.lexeme.Lexeme.vector_norm.__get__\u001b[39m\u001b[34m()\u001b[39m\n",
|
||||
"\u001b[36mFile \u001b[39m\u001b[32m~/code/plagiarism-detector/.venv/lib/python3.13/site-packages/spacy/lexeme.pyx:176\u001b[39m, in \u001b[36mspacy.lexeme.Lexeme.vector.__get__\u001b[39m\u001b[34m()\u001b[39m\n",
|
||||
"\u001b[31mValueError\u001b[39m: [E010] Word vectors set to length 0. This may be because you don't have a model installed or loaded, or because your model doesn't include word vectors. For more info, see the docs:\nhttps://spacy.io/usage/models"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"def test_word_vectors(word):\n",
|
||||
" print(word, nlp.vocab[word].vector.shape)\n",
|
||||
@@ -43,62 +63,27 @@
|
||||
" print(f\"{word1} - {word2}: {similarity:.3f}\")\n",
|
||||
"\n",
|
||||
"\n"
|
||||
],
|
||||
"id": "8a3c4314a90086fe",
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"cat - dog: 1.000\n",
|
||||
"cat - feline: 0.363\n",
|
||||
"cat - feral: 0.483\n",
|
||||
"cat - vehicle: 0.078\n",
|
||||
"cat - car: 0.193\n",
|
||||
"dog - cat: 1.000\n",
|
||||
"dog - feline: 0.363\n",
|
||||
"dog - feral: 0.483\n",
|
||||
"dog - vehicle: 0.078\n",
|
||||
"dog - car: 0.193\n",
|
||||
"feline - cat: 0.363\n",
|
||||
"feline - dog: 0.363\n",
|
||||
"feline - feral: 0.412\n",
|
||||
"feline - vehicle: 0.180\n",
|
||||
"feline - car: 0.050\n",
|
||||
"feral - cat: 0.483\n",
|
||||
"feral - dog: 0.483\n",
|
||||
"feral - feline: 0.412\n",
|
||||
"feral - vehicle: 0.175\n",
|
||||
"feral - car: 0.161\n",
|
||||
"vehicle - cat: 0.078\n",
|
||||
"vehicle - dog: 0.078\n",
|
||||
"vehicle - feline: 0.180\n",
|
||||
"vehicle - feral: 0.175\n",
|
||||
"vehicle - car: 0.205\n",
|
||||
"car - cat: 0.193\n",
|
||||
"car - dog: 0.193\n",
|
||||
"car - feline: 0.050\n",
|
||||
"car - feral: 0.161\n",
|
||||
"car - vehicle: 0.205\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"execution_count": 15
|
||||
]
|
||||
},
|
||||
{
|
||||
"metadata": {},
|
||||
"cell_type": "markdown",
|
||||
"source": "Simple averaging",
|
||||
"id": "8f32b5695f554268"
|
||||
"id": "8f32b5695f554268",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Simple averaging"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "68a6757447e4a1c7",
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2025-11-18T23:45:03.085563Z",
|
||||
"start_time": "2025-11-18T23:45:03.082190Z"
|
||||
}
|
||||
},
|
||||
"cell_type": "code",
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def sentence_similarity_avg(sent1, sent2):\n",
|
||||
" doc1 = nlp(sent1)\n",
|
||||
@@ -118,35 +103,46 @@
|
||||
" #cosine similarity\n",
|
||||
" from sklearn.metrics.pairwise import cosine_similarity\n",
|
||||
" return cosine_similarity([avg1], [avg2])[0][0]\n"
|
||||
],
|
||||
"id": "68a6757447e4a1c7",
|
||||
"outputs": [],
|
||||
"execution_count": 3
|
||||
]
|
||||
},
|
||||
{
|
||||
"metadata": {},
|
||||
"cell_type": "markdown",
|
||||
"source": "SIF - Smooth Inverse Frequency",
|
||||
"id": "a9c3aa050f5bc0fe"
|
||||
"id": "a9c3aa050f5bc0fe",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"SIF - Smooth Inverse Frequency"
|
||||
]
|
||||
},
|
||||
{
|
||||
"metadata": {},
|
||||
"cell_type": "code",
|
||||
"outputs": [],
|
||||
"execution_count": null,
|
||||
"id": "c100956f89d9b581",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def sentence_similarity_sif(sent1, sent2):\n",
|
||||
" doc1 = nlp(sent1)\n",
|
||||
" doc2 = nlp(sent2)"
|
||||
],
|
||||
"id": "c100956f89d9b581"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"name": "python3",
|
||||
"display_name": ".venv",
|
||||
"language": "python",
|
||||
"display_name": "Python 3 (ipykernel)"
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.13.7"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
||||
Reference in New Issue
Block a user