174 lines
4.2 KiB
Plaintext
174 lines
4.2 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "1638b7b97e3bd6f",
|
|
"metadata": {
|
|
"ExecuteTime": {
|
|
"end_time": "2025-11-22T11:40:21.711998Z",
|
|
"start_time": "2025-11-22T11:40:20.129376Z"
|
|
}
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"import spacy\n",
|
|
"nlp = spacy.load(\"en_core_web_lg\") # Large model\n",
|
|
"nlp_trf = spacy.load(\"en_core_web_trf\") # Transformer Model"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "b79941bf4553fd6",
|
|
"metadata": {},
|
|
"source": [
|
|
"Test word vectors"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 2,
|
|
"id": "8a3c4314a90086fe",
|
|
"metadata": {
|
|
"ExecuteTime": {
|
|
"end_time": "2025-11-22T11:47:39.286432Z",
|
|
"start_time": "2025-11-22T11:47:39.271377Z"
|
|
}
|
|
},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"cat - dog: 0.802\n",
|
|
"cat - feline: 0.699\n",
|
|
"cat - feral: 0.486\n",
|
|
"cat - vehicle: 0.190\n",
|
|
"cat - car: 0.319\n",
|
|
"dog - cat: 0.802\n",
|
|
"dog - feline: 0.566\n",
|
|
"dog - feral: 0.400\n",
|
|
"dog - vehicle: 0.258\n",
|
|
"dog - car: 0.356\n",
|
|
"feline - cat: 0.699\n",
|
|
"feline - dog: 0.566\n",
|
|
"feline - feral: 0.543\n",
|
|
"feline - vehicle: 0.103\n",
|
|
"feline - car: 0.095\n",
|
|
"feral - cat: 0.486\n",
|
|
"feral - dog: 0.400\n",
|
|
"feral - feline: 0.543\n",
|
|
"feral - vehicle: 0.088\n",
|
|
"feral - car: 0.040\n",
|
|
"vehicle - cat: 0.190\n",
|
|
"vehicle - dog: 0.258\n",
|
|
"vehicle - feline: 0.103\n",
|
|
"vehicle - feral: 0.088\n",
|
|
"vehicle - car: 0.767\n",
|
|
"car - cat: 0.319\n",
|
|
"car - dog: 0.356\n",
|
|
"car - feline: 0.095\n",
|
|
"car - feral: 0.040\n",
|
|
"car - vehicle: 0.767\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"def test_word_vectors(word):\n",
|
|
" print(word, nlp.vocab[word].vector.shape)\n",
|
|
"\n",
|
|
"words = [\"cat\", \"dog\", \"feline\", \"feral\", \"vehicle\", \"car\"]\n",
|
|
"# Test word similarities\n",
|
|
"for word1 in words:\n",
|
|
" for word2 in words:\n",
|
|
" if word1 != word2:\n",
|
|
" similarity = nlp.vocab[word1].similarity(nlp.vocab[word2])\n",
|
|
" print(f\"{word1} - {word2}: {similarity:.3f}\")\n",
|
|
"\n",
|
|
"\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "8f32b5695f554268",
|
|
"metadata": {},
|
|
"source": [
|
|
"Simple averaging"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 3,
|
|
"id": "68a6757447e4a1c7",
|
|
"metadata": {
|
|
"ExecuteTime": {
|
|
"end_time": "2025-11-18T23:45:03.085563Z",
|
|
"start_time": "2025-11-18T23:45:03.082190Z"
|
|
}
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"def sentence_similarity_avg(sent1, sent2):\n",
|
|
" doc1 = nlp(sent1)\n",
|
|
" doc2 = nlp(sent2)\n",
|
|
"\n",
|
|
" # Vectors for each token; skip tokens without a vector in the loaded model (en_core_web_lg)\n",
|
|
" vecs1 = [token.vector for token in doc1 if token.has_vector]\n",
|
|
" vecs2 = [token.vector for token in doc2 if token.has_vector]\n",
|
|
"\n",
|
|
" if not vecs1 or not vecs2:\n",
|
|
" return 0.0\n",
|
|
"\n",
|
|
" # Average vectors\n",
|
|
" avg1 = sum(vecs1) / len(vecs1)\n",
|
|
" avg2 = sum(vecs2) / len(vecs2)\n",
|
|
"\n",
|
|
" #cosine similarity\n",
|
|
" from sklearn.metrics.pairwise import cosine_similarity\n",
|
|
" return cosine_similarity([avg1], [avg2])[0][0]\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "a9c3aa050f5bc0fe",
|
|
"metadata": {},
|
|
"source": [
|
|
"SIF - Smooth Inverse Frequency"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 4,
|
|
"id": "c100956f89d9b581",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"def sentence_similarity_sif(sent1, sent2):\n",
|
|
" doc1 = nlp(sent1)\n",
|
|
" doc2 = nlp(sent2)"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": ".venv",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.13.7"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 5
|
|
}
|