more notebook testing
This commit is contained in:
@@ -9,12 +9,13 @@
|
|||||||
{
|
{
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"ExecuteTime": {
|
"ExecuteTime": {
|
||||||
"end_time": "2025-11-18T23:01:17.888318Z",
|
"end_time": "2025-11-19T00:00:02.012627Z",
|
||||||
"start_time": "2025-11-18T23:01:16.494987Z"
|
"start_time": "2025-11-19T00:00:00.160731Z"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"source": [
|
"source": [
|
||||||
|
"import import_ipynb\n",
|
||||||
"import spacy\n",
|
"import spacy\n",
|
||||||
"\n",
|
"\n",
|
||||||
"nlp = spacy.load(\"en_core_web_md\") # Can swap for large model if required\n",
|
"nlp = spacy.load(\"en_core_web_md\") # Can swap for large model if required\n",
|
||||||
@@ -50,7 +51,7 @@
|
|||||||
]
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"execution_count": 1
|
"execution_count": 2
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
|
|||||||
@@ -3,12 +3,15 @@
|
|||||||
{
|
{
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"ExecuteTime": {
|
"ExecuteTime": {
|
||||||
"end_time": "2025-11-18T23:15:35.056834Z",
|
"end_time": "2025-11-19T00:00:04.962487Z",
|
||||||
"start_time": "2025-11-18T23:15:35.051218Z"
|
"start_time": "2025-11-19T00:00:04.958995Z"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"source": [
|
"source": [
|
||||||
|
"import import_ipynb\n",
|
||||||
|
"from notebooks.01_data_exploration import *\n",
|
||||||
|
"\n",
|
||||||
"def jaccard_similarity(sent1, sent2):\n",
|
"def jaccard_similarity(sent1, sent2):\n",
|
||||||
" # make lowercase and split into words\n",
|
" # make lowercase and split into words\n",
|
||||||
" words1 = set(sent1.lower().split())\n",
|
" words1 = set(sent1.lower().split())\n",
|
||||||
@@ -30,16 +33,15 @@
|
|||||||
"id": "e60d024e969254a",
|
"id": "e60d024e969254a",
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
"name": "stdout",
|
"ename": "SyntaxError",
|
||||||
"output_type": "stream",
|
"evalue": "invalid decimal literal (2501033926.py, line 2)",
|
||||||
"text": [
|
"output_type": "error",
|
||||||
"'The cat sat on the mat.' vs 'The cat sat on the mat.': 1.000\n",
|
"traceback": [
|
||||||
"'The cat sat on the mat.' vs 'On the mat sat the cat.': 0.429\n",
|
" \u001B[36mCell\u001B[39m\u001B[36m \u001B[39m\u001B[32mIn[8]\u001B[39m\u001B[32m, line 2\u001B[39m\n\u001B[31m \u001B[39m\u001B[31mfrom notebooks.01_data_exploration import *\u001B[39m\n ^\n\u001B[31mSyntaxError\u001B[39m\u001B[31m:\u001B[39m invalid decimal literal\n"
|
||||||
"'The cat sat on the mat.' vs 'The dog ran in the park': 0.111\n"
|
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"execution_count": 7
|
"execution_count": 8
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"metadata": {
|
"metadata": {
|
||||||
|
|||||||
@@ -2,14 +2,112 @@
|
|||||||
"cells": [
|
"cells": [
|
||||||
{
|
{
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"source": "Test word vectors",
|
||||||
|
"id": "b79941bf4553fd6"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"metadata": {
|
||||||
|
"ExecuteTime": {
|
||||||
|
"end_time": "2025-11-18T23:28:55.563335Z",
|
||||||
|
"start_time": "2025-11-18T23:28:53.763429Z"
|
||||||
|
}
|
||||||
|
},
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"outputs": [],
|
"source": [
|
||||||
"execution_count": null,
|
"import import_ipynb\n",
|
||||||
"source": "",
|
"from notebooks.02_baseline_experiments.ipynb import *\n",
|
||||||
"id": "8a3c4314a90086fe"
|
"\n",
|
||||||
|
"import spacy\n",
|
||||||
|
"nlp = spacy.load(\"en_core_web_md\")\n",
|
||||||
|
"\n",
|
||||||
|
"words = [\"cat\", \"dog\", \"feline\", \"vehicle\", \"car\"]\n",
|
||||||
|
"# Test word similarities\n",
|
||||||
|
"for word1 in words:\n",
|
||||||
|
" for word2 in words:\n",
|
||||||
|
" if word1 != word2:\n",
|
||||||
|
" similarity = nlp.vocab[word1].similarity(nlp.vocab[word2])\n",
|
||||||
|
" print(f\"{word1} - {word2}: {similarity:.3f}\")\n",
|
||||||
|
"\n",
|
||||||
|
"\n"
|
||||||
|
],
|
||||||
|
"id": "8a3c4314a90086fe",
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"cat - dog: 1.000\n",
|
||||||
|
"cat - feline: 0.363\n",
|
||||||
|
"cat - vehicle: 0.078\n",
|
||||||
|
"cat - car: 0.193\n",
|
||||||
|
"dog - cat: 1.000\n",
|
||||||
|
"dog - feline: 0.363\n",
|
||||||
|
"dog - vehicle: 0.078\n",
|
||||||
|
"dog - car: 0.193\n",
|
||||||
|
"feline - cat: 0.363\n",
|
||||||
|
"feline - dog: 0.363\n",
|
||||||
|
"feline - vehicle: 0.180\n",
|
||||||
|
"feline - car: 0.050\n",
|
||||||
|
"vehicle - cat: 0.078\n",
|
||||||
|
"vehicle - dog: 0.078\n",
|
||||||
|
"vehicle - feline: 0.180\n",
|
||||||
|
"vehicle - car: 0.205\n",
|
||||||
|
"car - cat: 0.193\n",
|
||||||
|
"car - dog: 0.193\n",
|
||||||
|
"car - feline: 0.050\n",
|
||||||
|
"car - vehicle: 0.205\n"
|
||||||
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
|
"execution_count": 1
|
||||||
|
},
|
||||||
|
{
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"source": "Simple averaging",
|
||||||
|
"id": "8f32b5695f554268"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"metadata": {
|
||||||
|
"ExecuteTime": {
|
||||||
|
"end_time": "2025-11-18T23:45:03.085563Z",
|
||||||
|
"start_time": "2025-11-18T23:45:03.082190Z"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"cell_type": "code",
|
||||||
|
"source": [
|
||||||
|
"def sentence_similarity_ang(sent1, sent2):\n",
|
||||||
|
" doc1 = nlp(sent1)\n",
|
||||||
|
" doc2 = nlp(sent2)\n",
|
||||||
|
"\n",
|
||||||
|
" # Vectors for each word, filter out words without vectors (medium model)\n",
|
||||||
|
" vecs1 = [token.vector for token in doc1 if token.has_vector]\n",
|
||||||
|
" vecs2 = [token.vector for token in doc2 if token.has_vector]\n",
|
||||||
|
"\n",
|
||||||
|
" if not vecs1 or not vecs2:\n",
|
||||||
|
" return 0.0\n",
|
||||||
|
"\n",
|
||||||
|
" # Average vectors\n",
|
||||||
|
" avg1 = sum(vecs1) / len(vecs1)\n",
|
||||||
|
" avg2 = sum(vecs2) / len(vecs2)\n",
|
||||||
|
"\n",
|
||||||
|
" #cosine similarity\n",
|
||||||
|
" from sklearn.metrics.pairwise import cosine_similarity\n",
|
||||||
|
" return cosine_similarity([avg1], [avg2])[0][0]\n"
|
||||||
|
],
|
||||||
|
"id": "68a6757447e4a1c7",
|
||||||
|
"outputs": [],
|
||||||
|
"execution_count": 3
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"name": "python3",
|
||||||
|
"language": "python",
|
||||||
|
"display_name": "Python 3 (ipykernel)"
|
||||||
|
}
|
||||||
|
},
|
||||||
"nbformat": 4,
|
"nbformat": 4,
|
||||||
"nbformat_minor": 5
|
"nbformat_minor": 5
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -0,0 +1,48 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"metadata": {},
|
||||||
|
"cell_type": "code",
|
||||||
|
"outputs": [],
|
||||||
|
"execution_count": null,
|
||||||
|
"source": [
|
||||||
|
"import import_ipynb\n",
|
||||||
|
"from notebooks.03_semantic_methods.ipynb import *\n",
|
||||||
|
"\n",
|
||||||
|
"def extract_all_features(sentence_pairs):\n",
|
||||||
|
" features = []\n",
|
||||||
|
" for sent1, sent2 in sentence_pairs:\n",
|
||||||
|
" feature_vector = [\n",
|
||||||
|
" jaccard_similarity(sent1, sent2),\n",
|
||||||
|
" sentence_similarity_avg(sent1, sent2),\n",
|
||||||
|
" sentence_similarity_sif(sent1, sent2),\n",
|
||||||
|
" syntactic_similarity(sent1, sent2)\n",
|
||||||
|
" ]"
|
||||||
|
],
|
||||||
|
"id": "1c45d83192facfc6"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"metadata": {},
|
||||||
|
"cell_type": "code",
|
||||||
|
"outputs": [],
|
||||||
|
"execution_count": null,
|
||||||
|
"source": [
|
||||||
|
"from sklearn.linear_model import LogisticRegression\n",
|
||||||
|
"from sklearn.model_selection import train_test_split\n",
|
||||||
|
"\n",
|
||||||
|
"labled_pairs = []\n",
|
||||||
|
"\n",
|
||||||
|
"X = extract_all_features(labled_pairs)\n",
|
||||||
|
"y = [0,1,0,1...] # Labels for pairs\n",
|
||||||
|
"\n",
|
||||||
|
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)\n",
|
||||||
|
"model = LogisticRegression()\n",
|
||||||
|
"model.fit(X_train, y_train)\n"
|
||||||
|
],
|
||||||
|
"id": "9665682bd5a7951e"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user