From c2e88b26ca44ea0e8ee4461392729541a0d6a512 Mon Sep 17 00:00:00 2001 From: Henry Dowd Date: Fri, 21 Nov 2025 11:56:34 +0000 Subject: [PATCH] Dependency tree for sentence structure and made notebook funcions importable wit new py file --- notebooks/01_data_exploration.ipynb | 504 +++++++++++++++++++++++- notebooks/02_baseline_experiments.ipynb | 19 +- notebooks/03_semantic_methods.ipynb | 22 +- notebooks/04_fusion_model.ipynb | 4 +- notebooks/05_final_evaluation.ipynb | 15 + notebooks/notebook_functions.py | 48 +++ 6 files changed, 592 insertions(+), 20 deletions(-) create mode 100644 notebooks/notebook_functions.py diff --git a/notebooks/01_data_exploration.ipynb b/notebooks/01_data_exploration.ipynb index 229a62f..ab59c02 100644 --- a/notebooks/01_data_exploration.ipynb +++ b/notebooks/01_data_exploration.ipynb @@ -9,13 +9,13 @@ { "metadata": { "ExecuteTime": { - "end_time": "2025-11-19T00:00:02.012627Z", - "start_time": "2025-11-19T00:00:00.160731Z" + "end_time": "2025-11-20T19:03:29.658876Z", + "start_time": "2025-11-20T19:03:27.809309Z" } }, "cell_type": "code", "source": [ - "import import_ipynb\n", + "import token\n", "import spacy\n", "\n", "nlp = spacy.load(\"en_core_web_md\") # Can swap for large model if required\n", @@ -30,8 +30,7 @@ " doc = nlp(sent)\n", " print(f\"Sentence: {sent}\")\n", " print(f\"Tokens: {[token.text for token in doc]}\")\n", - " print(\"---\")\n", - "\n" + " print(\"---\")\n" ], "id": "e003ac06a58cfbb4", "outputs": [ @@ -51,7 +50,498 @@ ] } ], - "execution_count": 2 + "execution_count": 17 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-11-20T19:53:11.868566Z", + "start_time": "2025-11-20T19:53:11.861295Z" + } + }, + "cell_type": "code", + "source": [ + "import spacy\n", + "import token\n", + "\n", + "nlp = spacy.load(\"en_core_web_md\")\n", + "\n", + "test_sentences = [\n", + " \"The cat sat on the mat.\",\n", + " \"On the mat, the cat was sitting.\",\n", + " \"A completely different sentence about something 
else.\"\n", + "]\n", + "\n", + "class TextPreprocessor:\n", + " def __init__(self):\n", + " self.nlp = spacy.load(\"en_core_web_md\")\n", + "\n", + " def direct_detection(self, text):\n", + " \"\"\"For direct copy detection\"\"\"\n", + " #Keep punctuation\n", + " return text.lower().strip()\n", + "\n", + " def semantic_analysis(self, text):\n", + " \"\"\"Semantic Similarity\"\"\"\n", + " doc = self.nlp(text)\n", + " tokens = []\n", + " for token in doc:\n", + " if (not token.is_punct and not token.is_space and token.is_alpha and token.is_stop and len(token.lemma_) > 1): #Remove single char tokens\n", + " tokens.append(token.lemma_.lower())\n", + " return \" \".join(tokens)\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "for sent in test_sentences:\n", + " print(f\"Original Sentence: {sent}\")\n", + " print(\"---\")\n", + " print(f\"Preprocessed Sentence: {preprocess_semantic(sent)}\")\n", + " print(\"-\" * 50)" + ], + "id": "5e488a878a5cfccb", + "outputs": [ + { + "ename": "IndentationError", + "evalue": "expected an indented block after 'if' statement on line 26 (400725648.py, line 31)", + "output_type": "error", + "traceback": [ + " \u001B[36mCell\u001B[39m\u001B[36m \u001B[39m\u001B[32mIn[19]\u001B[39m\u001B[32m, line 31\u001B[39m\n\u001B[31m \u001B[39m\u001B[31mfor sent in test_sentences:\u001B[39m\n ^\n\u001B[31mIndentationError\u001B[39m\u001B[31m:\u001B[39m expected an indented block after 'if' statement on line 26\n" + ] + } + ], + "execution_count": 19 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-11-20T15:51:42.074798Z", + "start_time": "2025-11-20T15:51:42.050593Z" + } + }, + "cell_type": "code", + "source": [ + "import spacy\n", + "\n", + "nlp = spacy.load(\"en_core_web_md\")\n", + "\n", + "def extract_parse_tree(text):\n", + " doc = nlp(text)\n", + "\n", + " print(f\"Sentence: {text}\")\n", + " print(\"\\nDependenct Parse Tree:\")\n", + " print(\"-\" * 50)\n", + "\n", + " for token in doc:\n", + " 
print(f\"{token.text:<12} {token.dep_:<12} {token.head.text:<12} {[child.text for child in token.children]}\")\n", + "\n", + " return doc\n", + "\n", + "test_sentences = [\n", + " \"The cat sat on the mat.\",\n", + " \"On the mat, the cat was sitting.\",\n", + " \"A completely different sentence about something else.\"\n", + "]\n", + "\n", + "for sentence in test_sentences:\n", + " doc = extract_parse_tree(sentence)\n", + " print(\"\\n\" + \"=\"*60 + \"\\n\")" + ], + "id": "83fc18c9de2e354", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Sentence: The cat sat on the mat.\n", + "\n", + "Dependenct Parse Tree:\n", + "--------------------------------------------------\n", + "The det cat []\n", + "cat nsubj sat ['The']\n", + "sat ROOT sat ['cat', 'on', '.']\n", + "on prep sat ['mat']\n", + "the det mat []\n", + "mat pobj on ['the']\n", + ". punct sat []\n", + "\n", + "============================================================\n", + "\n", + "Sentence: On the mat, the cat was sitting.\n", + "\n", + "Dependenct Parse Tree:\n", + "--------------------------------------------------\n", + "On prep sitting ['mat']\n", + "the det mat []\n", + "mat pobj On ['the']\n", + ", punct sitting []\n", + "the det cat []\n", + "cat nsubj sitting ['the']\n", + "was aux sitting []\n", + "sitting ROOT sitting ['On', ',', 'cat', 'was', '.']\n", + ". punct sitting []\n", + "\n", + "============================================================\n", + "\n", + "Sentence: A completely different sentence about something else.\n", + "\n", + "Dependenct Parse Tree:\n", + "--------------------------------------------------\n", + "A det sentence []\n", + "completely advmod different []\n", + "different amod sentence ['completely']\n", + "sentence ROOT sentence ['A', 'different', 'about', '.']\n", + "about prep sentence ['something']\n", + "something pobj about ['else']\n", + "else advmod something []\n", + ". 
punct sentence []\n", + "\n", + "============================================================\n", + "\n" + ] + } + ], + "execution_count": 15 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-11-20T15:46:08.461059Z", + "start_time": "2025-11-20T15:45:47.529073Z" + } + }, + "cell_type": "code", + "source": [ + "import spacy\n", + "from spacy import displacy\n", + "from IPython.display import display, HTML\n", + "\n", + "nlp = spacy.load(\"en_core_web_md\")\n", + "\n", + "test_sentences = [\n", + " \"The cat sat on the mat.\",\n", + " \"On the mat, the cat was sitting.\",\n", + " \"A completely different sentence about something else.\"\n", + "]\n", + "\n", + "def visualize_parse_tree(text):\n", + " doc = nlp(text)\n", + " html = displacy.render(doc, style=\"dep\", jupyter=False, options={\"distance\": 100})\n", + " display(HTML(html))\n", + "\n", + "for sentence in test_sentences:\n", + " print(f\"Sentence: {sentence}\")\n", + " visualize_parse_tree(sentence)" + ], + "id": "e413238c1af12f62", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Sentence: The cat sat on the mat.\n" + ] + }, + { + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "\n", + "\n", + " The\n", + " DET\n", + "\n", + "\n", + "\n", + " cat\n", + " NOUN\n", + "\n", + "\n", + "\n", + " sat\n", + " VERB\n", + "\n", + "\n", + "\n", + " on\n", + " ADP\n", + "\n", + "\n", + "\n", + " the\n", + " DET\n", + "\n", + "\n", + "\n", + " mat.\n", + " NOUN\n", + "\n", + "\n", + "\n", + " \n", + " \n", + " det\n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " nsubj\n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " prep\n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " det\n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " pobj\n", + " \n", + " \n", + "\n", + "" + ] + }, + "metadata": {}, + "output_type": "display_data", + "jetTransient": { + "display_id": null + } + }, + { + "name": 
"stdout", + "output_type": "stream", + "text": [ + "Sentence: On the mat, the cat was sitting.\n" + ] + }, + { + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "\n", + "\n", + " On\n", + " ADP\n", + "\n", + "\n", + "\n", + " the\n", + " DET\n", + "\n", + "\n", + "\n", + " mat,\n", + " NOUN\n", + "\n", + "\n", + "\n", + " the\n", + " DET\n", + "\n", + "\n", + "\n", + " cat\n", + " NOUN\n", + "\n", + "\n", + "\n", + " was\n", + " AUX\n", + "\n", + "\n", + "\n", + " sitting.\n", + " VERB\n", + "\n", + "\n", + "\n", + " \n", + " \n", + " prep\n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " det\n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " pobj\n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " det\n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " nsubj\n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " aux\n", + " \n", + " \n", + "\n", + "" + ] + }, + "metadata": {}, + "output_type": "display_data", + "jetTransient": { + "display_id": null + } + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Sentence: A completely different sentence about something else.\n" + ] + }, + { + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "\n", + "\n", + " A\n", + " DET\n", + "\n", + "\n", + "\n", + " completely\n", + " ADV\n", + "\n", + "\n", + "\n", + " different\n", + " ADJ\n", + "\n", + "\n", + "\n", + " sentence\n", + " NOUN\n", + "\n", + "\n", + "\n", + " about\n", + " ADP\n", + "\n", + "\n", + "\n", + " something\n", + " PRON\n", + "\n", + "\n", + "\n", + " else.\n", + " ADV\n", + "\n", + "\n", + "\n", + " \n", + " \n", + " det\n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " advmod\n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " amod\n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " prep\n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " pobj\n", + " \n", + " \n", + "\n", + 
"\n", + "\n", + " \n", + " \n", + " advmod\n", + " \n", + " \n", + "\n", + "" + ] + }, + "metadata": {}, + "output_type": "display_data", + "jetTransient": { + "display_id": null + } + } + ], + "execution_count": 14 }, { "metadata": {}, @@ -59,7 +549,7 @@ "outputs": [], "execution_count": null, "source": "", - "id": "83fc18c9de2e354" + "id": "6aff51eb71eb2238" } ], "metadata": { diff --git a/notebooks/02_baseline_experiments.ipynb b/notebooks/02_baseline_experiments.ipynb index 126ec8a..2454db0 100644 --- a/notebooks/02_baseline_experiments.ipynb +++ b/notebooks/02_baseline_experiments.ipynb @@ -3,14 +3,14 @@ { "metadata": { "ExecuteTime": { - "end_time": "2025-11-19T00:00:04.962487Z", - "start_time": "2025-11-19T00:00:04.958995Z" + "end_time": "2025-11-19T10:01:11.039074Z", + "start_time": "2025-11-19T10:01:09.613806Z" } }, "cell_type": "code", "source": [ "import import_ipynb\n", - "from notebooks.01_data_exploration import *\n", + "#from notebooks.01_data_exploration import *\n", "\n", "def jaccard_similarity(sent1, sent2):\n", " # make lowercase and split into words\n", @@ -33,15 +33,16 @@ "id": "e60d024e969254a", "outputs": [ { - "ename": "SyntaxError", - "evalue": "invalid decimal literal (2501033926.py, line 2)", - "output_type": "error", - "traceback": [ - " \u001B[36mCell\u001B[39m\u001B[36m \u001B[39m\u001B[32mIn[8]\u001B[39m\u001B[32m, line 2\u001B[39m\n\u001B[31m \u001B[39m\u001B[31mfrom notebooks.01_data_exploration import *\u001B[39m\n ^\n\u001B[31mSyntaxError\u001B[39m\u001B[31m:\u001B[39m invalid decimal literal\n" + "name": "stdout", + "output_type": "stream", + "text": [ + "'The cat sat on the mat.' vs 'The cat sat on the mat.': 1.000\n", + "'The cat sat on the mat.' vs 'On the mat sat the cat.': 0.429\n", + "'The cat sat on the mat.' 
vs 'The dog ran in the park': 0.111\n" ] } ], - "execution_count": 8 + "execution_count": 9 } ], "metadata": { diff --git a/notebooks/03_semantic_methods.ipynb b/notebooks/03_semantic_methods.ipynb index 09d9e85..640b581 100644 --- a/notebooks/03_semantic_methods.ipynb +++ b/notebooks/03_semantic_methods.ipynb @@ -16,7 +16,7 @@ "cell_type": "code", "source": [ "import import_ipynb\n", - "from notebooks.02_baseline_experiments.ipynb import *\n", + "#from notebooks. import *\n", "\n", "import spacy\n", "nlp = spacy.load(\"en_core_web_md\")\n", @@ -77,7 +77,7 @@ }, "cell_type": "code", "source": [ - "def sentence_similarity_ang(sent1, sent2):\n", + "def sentence_similarity_avg(sent1, sent2):\n", " doc1 = nlp(sent1)\n", " doc2 = nlp(sent2)\n", "\n", @@ -99,6 +99,24 @@ "id": "68a6757447e4a1c7", "outputs": [], "execution_count": 3 + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": "SIF - Smooth Inverse Similarity", + "id": "a9c3aa050f5bc0fe" + }, + { + "metadata": {}, + "cell_type": "code", + "outputs": [], + "execution_count": null, + "source": [ + "def sentence_similarity_sif(sent1, sent2):\n", + " doc1 = nlp(sent1)\n", + " doc2 = nlp(sent2)" + ], + "id": "c100956f89d9b581" } ], "metadata": { diff --git a/notebooks/04_fusion_model.ipynb b/notebooks/04_fusion_model.ipynb index ef126da..ca25e4e 100644 --- a/notebooks/04_fusion_model.ipynb +++ b/notebooks/04_fusion_model.ipynb @@ -6,8 +6,8 @@ "outputs": [], "execution_count": null, "source": [ - "import import_ipynb\n", - "from notebooks.03_semantic_methods.ipynb import *\n", + "import spacy\n", + "from notebook_functions import *\n", "\n", "def extract_all_features(sentence_pairs):\n", " features = []\n", diff --git a/notebooks/05_final_evaluation.ipynb b/notebooks/05_final_evaluation.ipynb index e69de29..5ec7fbc 100644 --- a/notebooks/05_final_evaluation.ipynb +++ b/notebooks/05_final_evaluation.ipynb @@ -0,0 +1,15 @@ +{ + "cells": [ + { + "metadata": {}, + "cell_type": "code", + "outputs": [], + 
"execution_count": null,
+      "source": "",
+      "id": "91a93904e31e87aa"
+    }
+  ],
+  "metadata": {},
+  "nbformat": 4,
+  "nbformat_minor": 5
+}
diff --git a/notebooks/notebook_functions.py b/notebooks/notebook_functions.py
new file mode 100644
index 0000000..60423d2
--- /dev/null
+++ b/notebooks/notebook_functions.py
@@ -0,0 +1,48 @@
+# Notebook functions: shared helpers importable from the experiment notebooks
+import spacy
+from spacy import displacy
+from IPython.display import display, HTML
+
+nlp = spacy.load("en_core_web_md")  # Medium model
+
+def jaccard_similarity(sent1, sent2):
+    # make lowercase and split into words
+    words1 = set(sent1.lower().split())
+    words2 = set(sent2.lower().split())
+    intersection = words1.intersection(words2)
+    union = words1.union(words2)
+    return float(len(intersection)) / len(union) if union else 0.0
+
+def sentence_similarity_avg(sent1, sent2):
+    doc1 = nlp(sent1)
+    doc2 = nlp(sent2)
+
+    # Vectors for each word, filter out words without vectors (medium model)
+    vecs1 = [token.vector for token in doc1 if token.has_vector]
+    vecs2 = [token.vector for token in doc2 if token.has_vector]
+
+    if not vecs1 or not vecs2:
+        return 0.0
+
+    # Average vectors
+    avg1 = sum(vecs1) / len(vecs1)
+    avg2 = sum(vecs2) / len(vecs2)
+
+    #cosine similarity
+    from sklearn.metrics.pairwise import cosine_similarity
+    return cosine_similarity([avg1], [avg2])[0][0]
+
+def extract_all_features(sentence_pairs):
+    # NOTE(review): sif/syntactic helpers are not defined in this module yet — TODO move them here
+    features = []
+    for sent1, sent2 in sentence_pairs:
+        features.append([jaccard_similarity(sent1, sent2),
+                         sentence_similarity_avg(sent1, sent2),
+                         sentence_similarity_sif(sent1, sent2),
+                         syntactic_similarity(sent1, sent2)])
+    return features
+
+def visualize_parse_tree(text):
+    doc = nlp(text)
+    html = displacy.render(doc, style="dep", jupyter=False, options={"distance": 100})
+    display(HTML(html))
\ No newline at end of file