More advanced data exploration and notebook htmls

This commit is contained in:
Henry Dowd
2025-11-30 21:50:08 +00:00
parent fe2c087093
commit 828c49e9b7
4 changed files with 18218 additions and 466 deletions

File diff suppressed because one or more lines are too long

View File

@@ -13,7 +13,8 @@
"outputs": [],
"source": [
"import spacy\n",
"nlp = spacy.load(\"en_core_web_lg\") # Medium model"
"nlp = spacy.load(\"en_core_web_lg\") # Large model\n",
"nlp_trf = spacy.load(\"en_core_web_trf\") # Transformer Model"
]
},
{
@@ -26,7 +27,7 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 2,
"id": "8a3c4314a90086fe",
"metadata": {
"ExecuteTime": {
@@ -36,17 +37,39 @@
},
"outputs": [
{
"ename": "ValueError",
"evalue": "[E010] Word vectors set to length 0. This may be because you don't have a model installed or loaded, or because your model doesn't include word vectors. For more info, see the docs:\nhttps://spacy.io/usage/models",
"output_type": "error",
"traceback": [
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
"\u001b[31mValueError\u001b[39m Traceback (most recent call last)",
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[9]\u001b[39m\u001b[32m, line 9\u001b[39m\n\u001b[32m 7\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m word2 \u001b[38;5;129;01min\u001b[39;00m words:\n\u001b[32m 8\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m word1 != word2:\n\u001b[32m----> \u001b[39m\u001b[32m9\u001b[39m similarity = \u001b[43mnlp\u001b[49m\u001b[43m.\u001b[49m\u001b[43mvocab\u001b[49m\u001b[43m[\u001b[49m\u001b[43mword1\u001b[49m\u001b[43m]\u001b[49m\u001b[43m.\u001b[49m\u001b[43msimilarity\u001b[49m\u001b[43m(\u001b[49m\u001b[43mnlp\u001b[49m\u001b[43m.\u001b[49m\u001b[43mvocab\u001b[49m\u001b[43m[\u001b[49m\u001b[43mword2\u001b[49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 10\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mword1\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m - \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mword2\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00msimilarity\u001b[38;5;132;01m:\u001b[39;00m\u001b[33m.3f\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m)\n",
"\u001b[36mFile \u001b[39m\u001b[32m~/code/plagiarism-detector/.venv/lib/python3.13/site-packages/spacy/lexeme.pyx:146\u001b[39m, in \u001b[36mspacy.lexeme.Lexeme.similarity\u001b[39m\u001b[34m()\u001b[39m\n",
"\u001b[36mFile \u001b[39m\u001b[32m~/code/plagiarism-detector/.venv/lib/python3.13/site-packages/spacy/lexeme.pyx:164\u001b[39m, in \u001b[36mspacy.lexeme.Lexeme.vector_norm.__get__\u001b[39m\u001b[34m()\u001b[39m\n",
"\u001b[36mFile \u001b[39m\u001b[32m~/code/plagiarism-detector/.venv/lib/python3.13/site-packages/spacy/lexeme.pyx:176\u001b[39m, in \u001b[36mspacy.lexeme.Lexeme.vector.__get__\u001b[39m\u001b[34m()\u001b[39m\n",
"\u001b[31mValueError\u001b[39m: [E010] Word vectors set to length 0. This may be because you don't have a model installed or loaded, or because your model doesn't include word vectors. For more info, see the docs:\nhttps://spacy.io/usage/models"
"name": "stdout",
"output_type": "stream",
"text": [
"cat - dog: 0.802\n",
"cat - feline: 0.699\n",
"cat - feral: 0.486\n",
"cat - vehicle: 0.190\n",
"cat - car: 0.319\n",
"dog - cat: 0.802\n",
"dog - feline: 0.566\n",
"dog - feral: 0.400\n",
"dog - vehicle: 0.258\n",
"dog - car: 0.356\n",
"feline - cat: 0.699\n",
"feline - dog: 0.566\n",
"feline - feral: 0.543\n",
"feline - vehicle: 0.103\n",
"feline - car: 0.095\n",
"feral - cat: 0.486\n",
"feral - dog: 0.400\n",
"feral - feline: 0.543\n",
"feral - vehicle: 0.088\n",
"feral - car: 0.040\n",
"vehicle - cat: 0.190\n",
"vehicle - dog: 0.258\n",
"vehicle - feline: 0.103\n",
"vehicle - feral: 0.088\n",
"vehicle - car: 0.767\n",
"car - cat: 0.319\n",
"car - dog: 0.356\n",
"car - feline: 0.095\n",
"car - feral: 0.040\n",
"car - vehicle: 0.767\n"
]
}
],
@@ -115,7 +138,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 4,
"id": "c100956f89d9b581",
"metadata": {},
"outputs": [],

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long