Testing parsing and Jaccard similarity in notebooks

2025-11-18 23:25:04 +00:00
parent 42deee19f6
commit 8d6b1cab2c
3 changed files with 142 additions and 0 deletions
--- a/notebooks/01_data_exploration.ipynb
+++ b/notebooks/01_data_exploration.ipynb
@@ -0,0 +1,73 @@
 {
 "cells": [
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": "Keep punctuation for direct copy detection but remove for semantic/keyword based methods",
   "id": "1c26616777253f10"
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-11-18T23:01:17.888318Z",
     "start_time": "2025-11-18T23:01:16.494987Z"
    }
   },
   "cell_type": "code",
   "source": [
    "import spacy\n",
    "\n",
    "nlp = spacy.load(\"en_core_web_md\")  # Can swap for large model if required\n",
    "\n",
    "test_sentences = [\n",
    "    \"The cat sat on the mat.\",\n",
    "    \"On the mat, the cat was sitting.\",\n",
    "    \"A completely different sentence about something else.\"\n",
    "]\n",
    "\n",
    "for sent in test_sentences:\n",
    "    doc = nlp(sent)\n",
    "    print(f\"Sentence: {sent}\")\n",
    "    print(f\"Tokens: {[token.text for token in doc]}\")\n",
    "    print(\"---\")\n",
    "\n"
   ],
   "id": "e003ac06a58cfbb4",
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Sentence: The cat sat on the mat.\n",
      "Tokens: ['The', 'cat', 'sat', 'on', 'the', 'mat', '.']\n",
      "---\n",
      "Sentence: On the mat, the cat was sitting.\n",
      "Tokens: ['On', 'the', 'mat', ',', 'the', 'cat', 'was', 'sitting', '.']\n",
      "---\n",
      "Sentence: A completely different sentence about something else.\n",
      "Tokens: ['A', 'completely', 'different', 'sentence', 'about', 'something', 'else', '.']\n",
      "---\n"
     ]
    }
   ],
   "execution_count": 1
  },
  {
   "metadata": {},
   "cell_type": "code",
   "outputs": [],
   "execution_count": null,
   "source": "",
   "id": "83fc18c9de2e354"
  }
 ],
 "metadata": {
  "kernelspec": {
   "name": "python3",
   "language": "python",
   "display_name": "Python 3 (ipykernel)"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
 }
--- a/notebooks/02_baseline_experiments.ipynb
+++ b/notebooks/02_baseline_experiments.ipynb
@@ -0,0 +1,54 @@
 {
 "cells": [
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-11-18T23:15:35.056834Z",
     "start_time": "2025-11-18T23:15:35.051218Z"
    }
   },
   "cell_type": "code",
   "source": [
    "def jaccard_similarity(sent1, sent2):\n",
    "    # make lowercase and split into words\n",
    "    words1 = set(sent1.lower().split())\n",
    "    words2 = set(sent2.lower().split())\n",
    "    intersection = words1.intersection(words2)\n",
    "    union = words1.union(words2)\n",
    "    return float(len(intersection)) / len(union) if union else 0.0\n",
    "\n",
    "test_pairs = [\n",
    "    (\"The cat sat on the mat.\", \"The cat sat on the mat.\"),     # Copy\n",
    "    (\"The cat sat on the mat.\", \"On the mat sat the cat.\"),     # Same words rearranged\n",
    "    (\"The cat sat on the mat.\", \"The dog ran in the park\")      # Different\n",
    "]\n",
    "\n",
    "for sent1, sent2 in test_pairs:\n",
    "    similarity = jaccard_similarity(sent1, sent2)\n",
    "    print(f\"'{sent1}' vs '{sent2}': {similarity:.3f}\") # 3 decimal places\n"
   ],
   "id": "e60d024e969254a",
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "'The cat sat on the mat.' vs 'The cat sat on the mat.': 1.000\n",
      "'The cat sat on the mat.' vs 'On the mat sat the cat.': 0.429\n",
      "'The cat sat on the mat.' vs 'The dog ran in the park': 0.111\n"
     ]
    }
   ],
   "execution_count": 7
  }
 ],
 "metadata": {
  "kernelspec": {
   "name": "python3",
   "language": "python",
   "display_name": "Python 3 (ipykernel)"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
 }
--- a/notebooks/03_semantic_methods.ipynb
+++ b/notebooks/03_semantic_methods.ipynb
@@ -0,0 +1,15 @@
 {
 "cells": [
  {
   "metadata": {},
   "cell_type": "code",
   "outputs": [],
   "execution_count": null,
   "source": "",
   "id": "8a3c4314a90086fe"
  }
 ],
 "metadata": {},
 "nbformat": 4,
 "nbformat_minor": 5
 }