more notebook testing

2025-11-19 00:31:43 +00:00
parent 8d6b1cab2c
commit 7ad4068a55
4 changed files with 165 additions and 16 deletions
--- a/notebooks/01_data_exploration.ipynb
+++ b/notebooks/01_data_exploration.ipynb
@@ -9,12 +9,13 @@
  {
   "metadata": {
    "ExecuteTime": {
-     "end_time": "2025-11-18T23:01:17.888318Z",
+     "end_time": "2025-11-19T00:00:02.012627Z",
-     "start_time": "2025-11-18T23:01:16.494987Z"
+     "start_time": "2025-11-19T00:00:00.160731Z"
    }
   },
   "cell_type": "code",
   "source": [
    "import import_ipynb\n",
    "import spacy\n",
    "\n",
    "nlp = spacy.load(\"en_core_web_md\")  # Can swap for large model if required\n",
@@ -50,7 +51,7 @@
     ]
    }
   ],
-   "execution_count": 1
+   "execution_count": 2
  },
  {
   "metadata": {},
--- a/notebooks/02_baseline_experiments.ipynb
+++ b/notebooks/02_baseline_experiments.ipynb
@@ -3,12 +3,15 @@
  {
   "metadata": {
    "ExecuteTime": {
-     "end_time": "2025-11-18T23:15:35.056834Z",
+     "end_time": "2025-11-19T00:00:04.962487Z",
-     "start_time": "2025-11-18T23:15:35.051218Z"
+     "start_time": "2025-11-19T00:00:04.958995Z"
    }
   },
   "cell_type": "code",
   "source": [
    "import import_ipynb\n",
    "from notebooks.01_data_exploration import *\n",
    "\n",
    "def jaccard_similarity(sent1, sent2):\n",
    "    # make lowercase and split into words\n",
    "    words1 = set(sent1.lower().split())\n",
@@ -30,16 +33,15 @@
   "id": "e60d024e969254a",
   "outputs": [
    {
-     "name": "stdout",
+     "ename": "SyntaxError",
-     "output_type": "stream",
+     "evalue": "invalid decimal literal (2501033926.py, line 2)",
-     "text": [
+     "output_type": "error",
-      "'The cat sat on the mat.' vs 'The cat sat on the mat.': 1.000\n",
+     "traceback": [
-      "'The cat sat on the mat.' vs 'On the mat sat the cat.': 0.429\n",
+      "  \u001B[36mCell\u001B[39m\u001B[36m \u001B[39m\u001B[32mIn[8]\u001B[39m\u001B[32m, line 2\u001B[39m\n\u001B[31m    \u001B[39m\u001B[31mfrom notebooks.01_data_exploration import *\u001B[39m\n                     ^\n\u001B[31mSyntaxError\u001B[39m\u001B[31m:\u001B[39m invalid decimal literal\n"
      "'The cat sat on the mat.' vs 'The dog ran in the park': 0.111\n"
     ]
    }
   ],
-   "execution_count": 7
+   "execution_count": 8
  }
 ],
 "metadata": {
--- a/notebooks/03_semantic_methods.ipynb
+++ b/notebooks/03_semantic_methods.ipynb
@@ -2,14 +2,112 @@
 "cells": [
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": "Test word vectors",
   "id": "b79941bf4553fd6"
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-11-18T23:28:55.563335Z",
     "start_time": "2025-11-18T23:28:53.763429Z"
    }
   },
   "cell_type": "code",
-   "outputs": [],
+   "source": [
-   "execution_count": null,
+    "import import_ipynb\n",
-   "source": "",
+    "from notebooks.02_baseline_experiments.ipynb import *\n",
-   "id": "8a3c4314a90086fe"
+    "\n",
    "import spacy\n",
    "nlp = spacy.load(\"en_core_web_md\")\n",
    "\n",
    "words = [\"cat\", \"dog\", \"feline\", \"vehicle\", \"car\"]\n",
    "# Test work similarities\n",
    "for word1 in words:\n",
    "    for word2 in words:\n",
    "        if word1 != word2:\n",
    "            similarity = nlp.vocab[word1].similarity(nlp.vocab[word2])\n",
    "            print(f\"{word1} - {word2}: {similarity:.3f}\")\n",
    "\n",
    "\n"
   ],
   "id": "8a3c4314a90086fe",
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "cat - dog: 1.000\n",
      "cat - feline: 0.363\n",
      "cat - vehicle: 0.078\n",
      "cat - car: 0.193\n",
      "dog - cat: 1.000\n",
      "dog - feline: 0.363\n",
      "dog - vehicle: 0.078\n",
      "dog - car: 0.193\n",
      "feline - cat: 0.363\n",
      "feline - dog: 0.363\n",
      "feline - vehicle: 0.180\n",
      "feline - car: 0.050\n",
      "vehicle - cat: 0.078\n",
      "vehicle - dog: 0.078\n",
      "vehicle - feline: 0.180\n",
      "vehicle - car: 0.205\n",
      "car - cat: 0.193\n",
      "car - dog: 0.193\n",
      "car - feline: 0.050\n",
      "car - vehicle: 0.205\n"
     ]
    }
   ],
   "execution_count": 1
  },
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": "Simple averaging",
   "id": "8f32b5695f554268"
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-11-18T23:45:03.085563Z",
     "start_time": "2025-11-18T23:45:03.082190Z"
    }
   },
   "cell_type": "code",
   "source": [
    "def sentence_similarity_ang(sent1, sent2):\n",
    "    doc1 = nlp(sent1)\n",
    "    doc2 = nlp(sent2)\n",
    "\n",
    "    # Vectors for each word, filter out words without vectors (medium model)\n",
    "    vecs1 = [token.vector for token in doc1 if token.has_vector]\n",
    "    vecs2 = [token.vector for token in doc2 if token.has_vector]\n",
    "\n",
    "    if not vecs1 or not vecs2:\n",
    "        return 0.0\n",
    "\n",
    "    # Average vectors\n",
    "    avg1 = sum(vecs1) / len(vecs1)\n",
    "    avg2 = sum(vecs2) / len(vecs2)\n",
    "\n",
    "    #cosine similarity\n",
    "    from sklearn.metrics.pairwise import cosine_similarity\n",
    "    return cosine_similarity([avg1], [avg2])[0][0]\n"
   ],
   "id": "68a6757447e4a1c7",
   "outputs": [],
   "execution_count": 3
  }
 ],
 "metadata": {
  "kernelspec": {
   "name": "python3",
   "language": "python",
   "display_name": "Python 3 (ipykernel)"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
 }
--- a/notebooks/04_fusion_model.ipynb
+++ b/notebooks/04_fusion_model.ipynb
@@ -0,0 +1,48 @@
 {
 "cells": [
  {
   "metadata": {},
   "cell_type": "code",
   "outputs": [],
   "execution_count": null,
   "source": [
    "import import_ipynb\n",
    "from notebooks.03_semantic_methods.ipynb import *\n",
    "\n",
    "def extract_all_features(sentence_pairs):\n",
    "    features = []\n",
    "    for sent1, sent2 in sentence_pairs:\n",
    "        feature_vector = [\n",
    "            jaccard_similarity(sent1, sent2),\n",
    "            sentence_similarity_avg(sent1, sent2),\n",
    "            sentence_similarity_sif(sent1, sent2),\n",
    "            syntactic_similarity(sent1, sent2)\n",
    "        ]"
   ],
   "id": "1c45d83192facfc6"
  },
  {
   "metadata": {},
   "cell_type": "code",
   "outputs": [],
   "execution_count": null,
   "source": [
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "labled_pairs = []\n",
    "\n",
    "X = extract_all_features(labled_pairs)\n",
    "y = [0,1,0,1...] #Lables for pairs\n",
    "\n",
    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)\n",
    "model = LogisticRegression()\n",
    "model.fit(X_train, y_train)\n"
   ],
   "id": "9665682bd5a7951e"
  }
 ],
 "metadata": {},
 "nbformat": 4,
 "nbformat_minor": 5
 }