Files
paraphrase_detector/notebooks/03_semantic_methods.ipynb
2025-11-19 00:31:43 +00:00

114 lines
2.9 KiB
Plaintext

{
"cells": [
{
"metadata": {},
"cell_type": "markdown",
"source": "Test word vectors",
"id": "b79941bf4553fd6"
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-11-18T23:28:55.563335Z",
"start_time": "2025-11-18T23:28:53.763429Z"
}
},
"cell_type": "code",
"source": [
"import importlib\n",
"\n",
"import import_ipynb  # imported for its side effect: enables importing .ipynb files\n",
"import spacy\n",
"\n",
"# A module name that starts with a digit cannot be written in an `import`\n",
"# statement (it is a SyntaxError), so the baseline notebook is loaded by its\n",
"# string name and its public names are re-exported, mirroring\n",
"# `from <module> import *`.\n",
"# NOTE(review): assumes the directory containing `notebooks/` is on sys.path - confirm.\n",
"_baseline = importlib.import_module(\"notebooks.02_baseline_experiments\")\n",
"_public = getattr(_baseline, \"__all__\", None)\n",
"if _public is None:\n",
"    # Without __all__, a star import takes every name not starting with an underscore.\n",
"    _public = [name for name in vars(_baseline) if not name.startswith(\"_\")]\n",
"globals().update({name: getattr(_baseline, name) for name in _public})\n",
"\n",
"nlp = spacy.load(\"en_core_web_md\")\n",
"\n",
"words = [\"cat\", \"dog\", \"feline\", \"vehicle\", \"car\"]\n",
"# Test word similarities: print the lexeme-vector similarity of every ordered pair\n",
"for word1 in words:\n",
"    for word2 in words:\n",
"        if word1 != word2:\n",
"            similarity = nlp.vocab[word1].similarity(nlp.vocab[word2])\n",
"            print(f\"{word1} - {word2}: {similarity:.3f}\")\n"
],
"id": "8a3c4314a90086fe",
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"cat - dog: 1.000\n",
"cat - feline: 0.363\n",
"cat - vehicle: 0.078\n",
"cat - car: 0.193\n",
"dog - cat: 1.000\n",
"dog - feline: 0.363\n",
"dog - vehicle: 0.078\n",
"dog - car: 0.193\n",
"feline - cat: 0.363\n",
"feline - dog: 0.363\n",
"feline - vehicle: 0.180\n",
"feline - car: 0.050\n",
"vehicle - cat: 0.078\n",
"vehicle - dog: 0.078\n",
"vehicle - feline: 0.180\n",
"vehicle - car: 0.205\n",
"car - cat: 0.193\n",
"car - dog: 0.193\n",
"car - feline: 0.050\n",
"car - vehicle: 0.205\n"
]
}
],
"execution_count": 1
},
{
"metadata": {},
"cell_type": "markdown",
"source": "Simple averaging",
"id": "8f32b5695f554268"
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-11-18T23:45:03.085563Z",
"start_time": "2025-11-18T23:45:03.082190Z"
}
},
"cell_type": "code",
"source": [
"from sklearn.metrics.pairwise import cosine_similarity\n",
"\n",
"\n",
"def sentence_similarity_ang(sent1, sent2):\n",
"    \"\"\"Cosine similarity between the averaged word vectors of two sentences.\n",
"\n",
"    Each sentence is parsed with the notebook-level spaCy pipeline `nlp`.\n",
"    Tokens that have no vector are skipped, the remaining token vectors are\n",
"    averaged per sentence, and the two averages are compared.\n",
"\n",
"    Args:\n",
"        sent1: First sentence, passed to `nlp`.\n",
"        sent2: Second sentence, passed to `nlp`.\n",
"\n",
"    Returns:\n",
"        float: Cosine similarity of the two mean vectors, or 0.0 when either\n",
"        sentence has no token with a vector.\n",
"    \"\"\"\n",
"    doc1 = nlp(sent1)\n",
"    doc2 = nlp(sent2)\n",
"\n",
"    # Vectors for each word, filter out words without vectors (medium model)\n",
"    vecs1 = [token.vector for token in doc1 if token.has_vector]\n",
"    vecs2 = [token.vector for token in doc2 if token.has_vector]\n",
"\n",
"    # Nothing to average for at least one sentence: report no similarity.\n",
"    if not vecs1 or not vecs2:\n",
"        return 0.0\n",
"\n",
"    # Average vectors\n",
"    avg1 = sum(vecs1) / len(vecs1)\n",
"    avg2 = sum(vecs2) / len(vecs2)\n",
"\n",
"    # cosine_similarity returns a 1x1 matrix for two single-row inputs; unwrap it\n",
"    # to a plain Python float so both return paths yield the same type.\n",
"    return float(cosine_similarity([avg1], [avg2])[0][0])\n"
],
"id": "68a6757447e4a1c7",
"outputs": [],
"execution_count": 3
}
],
"metadata": {
"kernelspec": {
"name": "python3",
"language": "python",
"display_name": "Python 3 (ipykernel)"
}
},
"nbformat": 4,
"nbformat_minor": 5
}