From c2e88b26ca44ea0e8ee4461392729541a0d6a512 Mon Sep 17 00:00:00 2001 From: Henry Dowd Date: Fri, 21 Nov 2025 11:56:34 +0000 Subject: [PATCH] Dependency tree for sentence structure and made notebook funcions importable wit new py file --- notebooks/01_data_exploration.ipynb | 504 +++++++++++++++++++++++- notebooks/02_baseline_experiments.ipynb | 19 +- notebooks/03_semantic_methods.ipynb | 22 +- notebooks/04_fusion_model.ipynb | 4 +- notebooks/05_final_evaluation.ipynb | 15 + notebooks/notebook_functions.py | 48 +++ 6 files changed, 592 insertions(+), 20 deletions(-) create mode 100644 notebooks/notebook_functions.py diff --git a/notebooks/01_data_exploration.ipynb b/notebooks/01_data_exploration.ipynb index 229a62f..ab59c02 100644 --- a/notebooks/01_data_exploration.ipynb +++ b/notebooks/01_data_exploration.ipynb @@ -9,13 +9,13 @@ { "metadata": { "ExecuteTime": { - "end_time": "2025-11-19T00:00:02.012627Z", - "start_time": "2025-11-19T00:00:00.160731Z" + "end_time": "2025-11-20T19:03:29.658876Z", + "start_time": "2025-11-20T19:03:27.809309Z" } }, "cell_type": "code", "source": [ - "import import_ipynb\n", + "import token\n", "import spacy\n", "\n", "nlp = spacy.load(\"en_core_web_md\") # Can swap for large model if required\n", @@ -30,8 +30,7 @@ " doc = nlp(sent)\n", " print(f\"Sentence: {sent}\")\n", " print(f\"Tokens: {[token.text for token in doc]}\")\n", - " print(\"---\")\n", - "\n" + " print(\"---\")\n" ], "id": "e003ac06a58cfbb4", "outputs": [ @@ -51,7 +50,498 @@ ] } ], - "execution_count": 2 + "execution_count": 17 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-11-20T19:53:11.868566Z", + "start_time": "2025-11-20T19:53:11.861295Z" + } + }, + "cell_type": "code", + "source": [ + "import spacy\n", + "import token\n", + "\n", + "nlp = spacy.load(\"en_core_web_md\")\n", + "\n", + "test_sentences = [\n", + " \"The cat sat on the mat.\",\n", + " \"On the mat, the cat was sitting.\",\n", + " \"A completely different sentence about something 
else.\"\n", + "]\n", + "\n", + "class TextPreprocessor:\n", + " def __init__(self):\n", + " self.nlp = spacy.load(\"en_core_web_md\")\n", + "\n", + " def direct_detection(self, text):\n", + " \"\"\"For direct copy detection\"\"\"\n", + " #Keep punctuation\n", + " return text.lower().strip()\n", + "\n", + " def semantic_analysis(self, text):\n", + " \"\"\"Semantic Similarity\"\"\"\n", + " doc = self.nlp(text)\n", + " tokens = []\n", + " for token in doc:\n", + " if (not token.is_punct and not token.is_space and token.is_alpha and token.is_stop and len(token.lemma_) > 1): #Remove single char tokens\n", + " tokens.append(token.lemma_.lower())\n", + " return \" \".join(tokens)\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "for sent in test_sentences:\n", + " print(f\"Original Sentence: {sent}\")\n", + " print(\"---\")\n", + " print(f\"Preprocessed Sentence: {preprocess_semantic(sent)}\")\n", + " print(\"-\" * 50)" + ], + "id": "5e488a878a5cfccb", + "outputs": [ + { + "ename": "IndentationError", + "evalue": "expected an indented block after 'if' statement on line 26 (400725648.py, line 31)", + "output_type": "error", + "traceback": [ + " \u001B[36mCell\u001B[39m\u001B[36m \u001B[39m\u001B[32mIn[19]\u001B[39m\u001B[32m, line 31\u001B[39m\n\u001B[31m \u001B[39m\u001B[31mfor sent in test_sentences:\u001B[39m\n ^\n\u001B[31mIndentationError\u001B[39m\u001B[31m:\u001B[39m expected an indented block after 'if' statement on line 26\n" + ] + } + ], + "execution_count": 19 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-11-20T15:51:42.074798Z", + "start_time": "2025-11-20T15:51:42.050593Z" + } + }, + "cell_type": "code", + "source": [ + "import spacy\n", + "\n", + "nlp = spacy.load(\"en_core_web_md\")\n", + "\n", + "def extract_parse_tree(text):\n", + " doc = nlp(text)\n", + "\n", + " print(f\"Sentence: {text}\")\n", + " print(\"\\nDependenct Parse Tree:\")\n", + " print(\"-\" * 50)\n", + "\n", + " for token in doc:\n", + " 
print(f\"{token.text:<12} {token.dep_:<12} {token.head.text:<12} {[child.text for child in token.children]}\")\n", + "\n", + " return doc\n", + "\n", + "test_sentences = [\n", + " \"The cat sat on the mat.\",\n", + " \"On the mat, the cat was sitting.\",\n", + " \"A completely different sentence about something else.\"\n", + "]\n", + "\n", + "for sentence in test_sentences:\n", + " doc = extract_parse_tree(sentence)\n", + " print(\"\\n\" + \"=\"*60 + \"\\n\")" + ], + "id": "83fc18c9de2e354", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Sentence: The cat sat on the mat.\n", + "\n", + "Dependenct Parse Tree:\n", + "--------------------------------------------------\n", + "The det cat []\n", + "cat nsubj sat ['The']\n", + "sat ROOT sat ['cat', 'on', '.']\n", + "on prep sat ['mat']\n", + "the det mat []\n", + "mat pobj on ['the']\n", + ". punct sat []\n", + "\n", + "============================================================\n", + "\n", + "Sentence: On the mat, the cat was sitting.\n", + "\n", + "Dependenct Parse Tree:\n", + "--------------------------------------------------\n", + "On prep sitting ['mat']\n", + "the det mat []\n", + "mat pobj On ['the']\n", + ", punct sitting []\n", + "the det cat []\n", + "cat nsubj sitting ['the']\n", + "was aux sitting []\n", + "sitting ROOT sitting ['On', ',', 'cat', 'was', '.']\n", + ". punct sitting []\n", + "\n", + "============================================================\n", + "\n", + "Sentence: A completely different sentence about something else.\n", + "\n", + "Dependenct Parse Tree:\n", + "--------------------------------------------------\n", + "A det sentence []\n", + "completely advmod different []\n", + "different amod sentence ['completely']\n", + "sentence ROOT sentence ['A', 'different', 'about', '.']\n", + "about prep sentence ['something']\n", + "something pobj about ['else']\n", + "else advmod something []\n", + ". 
punct sentence []\n", + "\n", + "============================================================\n", + "\n" + ] + } + ], + "execution_count": 15 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-11-20T15:46:08.461059Z", + "start_time": "2025-11-20T15:45:47.529073Z" + } + }, + "cell_type": "code", + "source": [ + "import spacy\n", + "from spacy import displacy\n", + "from IPython.display import display, HTML\n", + "\n", + "nlp = spacy.load(\"en_core_web_md\")\n", + "\n", + "test_sentences = [\n", + " \"The cat sat on the mat.\",\n", + " \"On the mat, the cat was sitting.\",\n", + " \"A completely different sentence about something else.\"\n", + "]\n", + "\n", + "def visualize_parse_tree(text):\n", + " doc = nlp(text)\n", + " html = displacy.render(doc, style=\"dep\", jupyter=False, options={\"distance\": 100})\n", + " display(HTML(html))\n", + "\n", + "for sentence in test_sentences:\n", + " print(f\"Sentence: {sentence}\")\n", + " visualize_parse_tree(sentence)" + ], + "id": "e413238c1af12f62", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Sentence: The cat sat on the mat.\n" + ] + }, + { + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "\n", + "\n", + " The\n", + " DET\n", + "\n", + "\n", + "\n", + " cat\n", + " NOUN\n", + "\n", + "\n", + "\n", + " sat\n", + " VERB\n", + "\n", + "\n", + "\n", + " on\n", + " ADP\n", + "\n", + "\n", + "\n", + " the\n", + " DET\n", + "\n", + "\n", + "\n", + " mat.\n", + " NOUN\n", + "\n", + "\n", + "\n", + " \n", + " \n", + " det\n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " nsubj\n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " prep\n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " det\n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " pobj\n", + " \n", + " \n", + "\n", + "" + ] + }, + "metadata": {}, + "output_type": "display_data", + "jetTransient": { + "display_id": null + } + }, + { + "name": 
"stdout", + "output_type": "stream", + "text": [ + "Sentence: On the mat, the cat was sitting.\n" + ] + }, + { + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "\n", + "\n", + " On\n", + " ADP\n", + "\n", + "\n", + "\n", + " the\n", + " DET\n", + "\n", + "\n", + "\n", + " mat,\n", + " NOUN\n", + "\n", + "\n", + "\n", + " the\n", + " DET\n", + "\n", + "\n", + "\n", + " cat\n", + " NOUN\n", + "\n", + "\n", + "\n", + " was\n", + " AUX\n", + "\n", + "\n", + "\n", + " sitting.\n", + " VERB\n", + "\n", + "\n", + "\n", + " \n", + " \n", + " prep\n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " det\n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " pobj\n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " det\n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " nsubj\n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " aux\n", + " \n", + " \n", + "\n", + "" + ] + }, + "metadata": {}, + "output_type": "display_data", + "jetTransient": { + "display_id": null + } + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Sentence: A completely different sentence about something else.\n" + ] + }, + { + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "\n", + "\n", + " A\n", + " DET\n", + "\n", + "\n", + "\n", + " completely\n", + " ADV\n", + "\n", + "\n", + "\n", + " different\n", + " ADJ\n", + "\n", + "\n", + "\n", + " sentence\n", + " NOUN\n", + "\n", + "\n", + "\n", + " about\n", + " ADP\n", + "\n", + "\n", + "\n", + " something\n", + " PRON\n", + "\n", + "\n", + "\n", + " else.\n", + " ADV\n", + "\n", + "\n", + "\n", + " \n", + " \n", + " det\n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " advmod\n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " amod\n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " prep\n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " pobj\n", + " \n", + " \n", + "\n", + 
"\n", + "\n", + " \n", + " \n", + " advmod\n", + " \n", + " \n", + "\n", + "" + ] + }, + "metadata": {}, + "output_type": "display_data", + "jetTransient": { + "display_id": null + } + } + ], + "execution_count": 14 }, { "metadata": {}, @@ -59,7 +549,7 @@ "outputs": [], "execution_count": null, "source": "", - "id": "83fc18c9de2e354" + "id": "6aff51eb71eb2238" } ], "metadata": { diff --git a/notebooks/02_baseline_experiments.ipynb b/notebooks/02_baseline_experiments.ipynb index 126ec8a..2454db0 100644 --- a/notebooks/02_baseline_experiments.ipynb +++ b/notebooks/02_baseline_experiments.ipynb @@ -3,14 +3,14 @@ { "metadata": { "ExecuteTime": { - "end_time": "2025-11-19T00:00:04.962487Z", - "start_time": "2025-11-19T00:00:04.958995Z" + "end_time": "2025-11-19T10:01:11.039074Z", + "start_time": "2025-11-19T10:01:09.613806Z" } }, "cell_type": "code", "source": [ "import import_ipynb\n", - "from notebooks.01_data_exploration import *\n", + "#from notebooks.01_data_exploration import *\n", "\n", "def jaccard_similarity(sent1, sent2):\n", " # make lowercase and split into words\n", @@ -33,15 +33,16 @@ "id": "e60d024e969254a", "outputs": [ { - "ename": "SyntaxError", - "evalue": "invalid decimal literal (2501033926.py, line 2)", - "output_type": "error", - "traceback": [ - " \u001B[36mCell\u001B[39m\u001B[36m \u001B[39m\u001B[32mIn[8]\u001B[39m\u001B[32m, line 2\u001B[39m\n\u001B[31m \u001B[39m\u001B[31mfrom notebooks.01_data_exploration import *\u001B[39m\n ^\n\u001B[31mSyntaxError\u001B[39m\u001B[31m:\u001B[39m invalid decimal literal\n" + "name": "stdout", + "output_type": "stream", + "text": [ + "'The cat sat on the mat.' vs 'The cat sat on the mat.': 1.000\n", + "'The cat sat on the mat.' vs 'On the mat sat the cat.': 0.429\n", + "'The cat sat on the mat.' 
vs 'The dog ran in the park': 0.111\n" ] } ], - "execution_count": 8 + "execution_count": 9 } ], "metadata": { diff --git a/notebooks/03_semantic_methods.ipynb b/notebooks/03_semantic_methods.ipynb index 09d9e85..640b581 100644 --- a/notebooks/03_semantic_methods.ipynb +++ b/notebooks/03_semantic_methods.ipynb @@ -16,7 +16,7 @@ "cell_type": "code", "source": [ "import import_ipynb\n", - "from notebooks.02_baseline_experiments.ipynb import *\n", + "#from notebooks. import *\n", "\n", "import spacy\n", "nlp = spacy.load(\"en_core_web_md\")\n", @@ -77,7 +77,7 @@ }, "cell_type": "code", "source": [ - "def sentence_similarity_ang(sent1, sent2):\n", + "def sentence_similarity_avg(sent1, sent2):\n", " doc1 = nlp(sent1)\n", " doc2 = nlp(sent2)\n", "\n", @@ -99,6 +99,24 @@ "id": "68a6757447e4a1c7", "outputs": [], "execution_count": 3 + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": "SIF - Smooth Inverse Similarity", + "id": "a9c3aa050f5bc0fe" + }, + { + "metadata": {}, + "cell_type": "code", + "outputs": [], + "execution_count": null, + "source": [ + "def sentence_similarity_sif(sent1, sent2):\n", + " doc1 = nlp(sent1)\n", + " doc2 = nlp(sent2)" + ], + "id": "c100956f89d9b581" } ], "metadata": { diff --git a/notebooks/04_fusion_model.ipynb b/notebooks/04_fusion_model.ipynb index ef126da..ca25e4e 100644 --- a/notebooks/04_fusion_model.ipynb +++ b/notebooks/04_fusion_model.ipynb @@ -6,8 +6,8 @@ "outputs": [], "execution_count": null, "source": [ - "import import_ipynb\n", - "from notebooks.03_semantic_methods.ipynb import *\n", + "import spacy\n", + "from notebook_functions import *\n", "\n", "def extract_all_features(sentence_pairs):\n", " features = []\n", diff --git a/notebooks/05_final_evaluation.ipynb b/notebooks/05_final_evaluation.ipynb index e69de29..5ec7fbc 100644 --- a/notebooks/05_final_evaluation.ipynb +++ b/notebooks/05_final_evaluation.ipynb @@ -0,0 +1,15 @@ +{ + "cells": [ + { + "metadata": {}, + "cell_type": "code", + "outputs": [], + 
"execution_count": null,
+      "source": "",
+      "id": "91a93904e31e87aa"
+    }
+  ],
+  "metadata": {},
+  "nbformat": 4,
+  "nbformat_minor": 5
+}
diff --git a/notebooks/notebook_functions.py b/notebooks/notebook_functions.py
new file mode 100644
index 0000000..60423d2
--- /dev/null
+++ b/notebooks/notebook_functions.py
@@ -0,0 +1,48 @@
+# Notebook functions: shared helpers importable from the experiment notebooks
+import spacy
+from spacy import displacy
+from IPython.display import display, HTML
+
+nlp = spacy.load("en_core_web_md")  # Medium model
+
+def jaccard_similarity(sent1, sent2):
+    # make lowercase and split into words
+    words1 = set(sent1.lower().split())
+    words2 = set(sent2.lower().split())
+    intersection = words1.intersection(words2)
+    union = words1.union(words2)
+    return float(len(intersection)) / len(union) if union else 0.0
+
+def sentence_similarity_avg(sent1, sent2):
+    doc1 = nlp(sent1)
+    doc2 = nlp(sent2)
+
+    # Vectors for each word, filter out words without vectors (medium model)
+    vecs1 = [token.vector for token in doc1 if token.has_vector]
+    vecs2 = [token.vector for token in doc2 if token.has_vector]
+
+    if not vecs1 or not vecs2:
+        return 0.0
+
+    # Average vectors
+    avg1 = sum(vecs1) / len(vecs1)
+    avg2 = sum(vecs2) / len(vecs2)
+
+    #cosine similarity
+    from sklearn.metrics.pairwise import cosine_similarity
+    return cosine_similarity([avg1], [avg2])[0][0]
+
+def extract_all_features(sentence_pairs):
+    # NOTE(review): sif/syntactic helpers are not defined in this module yet — TODO move them here
+    features = []
+    for sent1, sent2 in sentence_pairs:
+        features.append([jaccard_similarity(sent1, sent2),
+                         sentence_similarity_avg(sent1, sent2),
+                         sentence_similarity_sif(sent1, sent2),
+                         syntactic_similarity(sent1, sent2)])
+    return features
+
+def visualize_parse_tree(text):
+    doc = nlp(text)
+    html = displacy.render(doc, style="dep", jupyter=False, options={"distance": 100})
+    display(HTML(html))
\ No newline at end of file