Baseline experimentation on test data

2025-11-30 17:38:00 +00:00
parent 02cdc7bac6
commit 37ccc03ac9
4 changed files with 561 additions and 18 deletions
--- a/notebooks/01_data_exploration.ipynb
+++ b/notebooks/01_data_exploration.ipynb
@@ -18,7 +18,19 @@
     "start_time": "2025-11-23T13:53:56.325948Z"
    }
   },
-   "outputs": [],
+   "outputs": [
+    {
+     "ename": "ModuleNotFoundError",
+     "evalue": "No module named 'spacy'",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[31m---------------------------------------------------------------------------\u001b[39m",
+      "\u001b[31mModuleNotFoundError\u001b[39m                       Traceback (most recent call last)",
+      "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[2]\u001b[39m\u001b[32m, line 2\u001b[39m\n\u001b[32m      1\u001b[39m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mtoken\u001b[39;00m\n\u001b[32m----> \u001b[39m\u001b[32m2\u001b[39m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mspacy\u001b[39;00m\n\u001b[32m      3\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mspacy\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m displacy\n\u001b[32m      4\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mIPython\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mdisplay\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m display, HTML\n",
+      "\u001b[31mModuleNotFoundError\u001b[39m: No module named 'spacy'"
+     ]
+    }
+   ],
   "source": [
    "import token\n",
    "import spacy\n",
--- a/notebooks/02_baseline_experiments.ipynb
+++ b/notebooks/02_baseline_experiments.ipynb
@@ -1,17 +1,102 @@
 {
 "cells": [
  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "d2aa2997",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "done\n"
+     ]
+    }
+   ],
+   "source": [
+    "# pip install rapidfuzz scikit-learn numpy\n",
+    "\n",
+    "import numpy as np\n",
+    "from collections import Counter\n",
+    "import string\n",
+    "from rapidfuzz import fuzz, distance\n",
+    "\n",
+    "test_pairs = [\n",
+    "    # Direct copies and near-copies\n",
+    "    (\"The cat sat on the mat.\", \"The cat sat on the mat.\"),           # Exact copy\n",
+    "    (\"The cat sat on the mat.\", \"The cat sat on the mat\"),            # No punctuation\n",
+    "    (\"The cat sat on the mat.\", \"The  cat  sat  on  the  mat.\"),      # Extra spaces\n",
+    "    \n",
+    "    # Paraphrases with same meaning\n",
+    "    (\"The cat sat on the mat.\", \"On the mat, the cat was sitting.\"),  # Structural change\n",
+    "    (\"The cat sat on the mat.\", \"The feline rested on the rug.\"),     # Synonym replacement\n",
+    "    (\"The quick brown fox jumps.\", \"A fast brown fox leaps.\"),        # Partial synonym\n",
+    "    \n",
+    "    # Different sentences\n",
+    "    (\"The cat sat on the mat.\", \"The dog ran in the park.\"),          # Different content\n",
+    "    (\"I love programming.\", \"She enjoys reading books.\"),             # Completely different\n",
+    "    (\"The weather is nice today.\", \"It's raining outside.\"),          # Opposite meaning\n",
+    "    \n",
+    "    # Edge cases\n",
+    "    (\"Short.\", \"Short.\"),                                             # Very short\n",
+    "    (\"A B C D E F G\", \"A B C D E F G\"),                              # Single-letter tokens\n",
+    "    (\"\", \"\"),                                                         # Empty strings\n",
+    "]\n",
+    "print(\"done\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b06d10d0",
+   "metadata": {},
+   "source": [
+    "### Jaccard Similarity "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "e60d024e969254a",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-11-19T10:01:11.039074Z",
     "start_time": "2025-11-19T10:01:09.613806Z"
    }
   },
-   "cell_type": "code",
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "'The cat sat on the mat.' vs 'The cat sat on the mat.': 1.000\n",
+      "--------------------------------------------------\n",
+      "'The cat sat on the mat.' vs 'The cat sat on the mat': 0.667\n",
+      "--------------------------------------------------\n",
+      "'The cat sat on the mat.' vs 'The  cat  sat  on  the  mat.': 1.000\n",
+      "--------------------------------------------------\n",
+      "'The cat sat on the mat.' vs 'On the mat, the cat was sitting.': 0.375\n",
+      "--------------------------------------------------\n",
+      "'The cat sat on the mat.' vs 'The feline rested on the rug.': 0.250\n",
+      "--------------------------------------------------\n",
+      "'The quick brown fox jumps.' vs 'A fast brown fox leaps.': 0.250\n",
+      "--------------------------------------------------\n",
+      "'The cat sat on the mat.' vs 'The dog ran in the park.': 0.111\n",
+      "--------------------------------------------------\n",
+      "'I love programming.' vs 'She enjoys reading books.': 0.000\n",
+      "--------------------------------------------------\n",
+      "'The weather is nice today.' vs 'It's raining outside.': 0.000\n",
+      "--------------------------------------------------\n",
+      "'Short.' vs 'Short.': 1.000\n",
+      "--------------------------------------------------\n",
+      "'A B C D E F G' vs 'A B C D E F G': 1.000\n",
+      "--------------------------------------------------\n",
+      "'' vs '': 0.000\n",
+      "--------------------------------------------------\n"
+     ]
+    }
+   ],
   "source": [
-    "import import_ipynb\n",
-    "#from notebooks.01_data_exploration import *\n",
-    "\n",
    "def jaccard_similarity(sent1, sent2):\n",
    "    # make lowercase and split into words\n",
    "    words1 = set(sent1.lower().split())\n",
@@ -20,7 +105,7 @@
    "    union = words1.union(words2)\n",
    "    return float(len(intersection)) / len(union) if union else 0.0\n",
    "\n",
-    "test_pairs = [\n",
+    "small_test_pairs = [\n",
    "    (\"The cat sat on the mat.\", \"The cat sat on the mat.\"),     # Copy\n",
    "    (\"The cat sat on the mat.\", \"On the mat sat the cat.\"),     # Same words rearranged\n",
    "    (\"The cat sat on the mat.\", \"The dog ran in the park\")      # Different\n",
@@ -28,28 +113,469 @@
    "\n",
    "for sent1, sent2 in test_pairs:\n",
    "    similarity = jaccard_similarity(sent1, sent2)\n",
-    "    print(f\"'{sent1}' vs '{sent2}': {similarity:.3f}\") # 3 decimal places\n"
+    "    print(f\"'{sent1}' vs '{sent2}': {similarity:.3f}\") # 3 decimal places\n",
+    "    print(\"-\"* 50)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "337a1072",
+   "metadata": {},
+   "source": [
+    "### --- Levenshtein Similarity ---\n",
+    " Character & Word"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "id": "0b68fdcd",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "'The cat sat on the mat.' vs 'The cat sat on the mat.':\n",
+      " Char similarity: 1.000 --- Word similarity: 1.000\n",
+      "--------------------------------------------------\n",
+      "'The cat sat on the mat.' vs 'The cat sat on the mat':\n",
+      " Char similarity: 0.957 --- Word similarity: 0.833\n",
+      "--------------------------------------------------\n",
+      "'The cat sat on the mat.' vs 'The  cat  sat  on  the  mat.':\n",
+      " Char similarity: 0.821 --- Word similarity: 1.000\n",
+      "--------------------------------------------------\n",
+      "'The cat sat on the mat.' vs 'On the mat, the cat was sitting.':\n",
+      " Char similarity: 0.344 --- Word similarity: 0.143\n",
+      "--------------------------------------------------\n",
+      "'The cat sat on the mat.' vs 'The feline rested on the rug.':\n",
+      " Char similarity: 0.517 --- Word similarity: 0.500\n",
+      "--------------------------------------------------\n",
+      "'The quick brown fox jumps.' vs 'A fast brown fox leaps.':\n",
+      " Char similarity: 0.577 --- Word similarity: 0.400\n",
+      "--------------------------------------------------\n",
+      "'The cat sat on the mat.' vs 'The dog ran in the park.':\n",
+      " Char similarity: 0.625 --- Word similarity: 0.333\n",
+      "--------------------------------------------------\n",
+      "'I love programming.' vs 'She enjoys reading books.':\n",
+      " Char similarity: 0.200 --- Word similarity: 0.000\n",
+      "--------------------------------------------------\n",
+      "'The weather is nice today.' vs 'It's raining outside.':\n",
+      " Char similarity: 0.192 --- Word similarity: 0.000\n",
+      "--------------------------------------------------\n",
+      "'Short.' vs 'Short.':\n",
+      " Char similarity: 1.000 --- Word similarity: 1.000\n",
+      "--------------------------------------------------\n",
+      "'A B C D E F G' vs 'A B C D E F G':\n",
+      " Char similarity: 1.000 --- Word similarity: 1.000\n",
+      "--------------------------------------------------\n",
+      "'' vs '':\n",
+      " Char similarity: 1.000 --- Word similarity: 1.000\n",
+      "--------------------------------------------------\n"
+     ]
+    }
   ],
-   "id": "e60d024e969254a",
+   "source": [
+    "def char_levenshtein_similarity(sent1, sent2):\n",
+    "    \"\"\"Similarity derived from the character-level Levenshtein distance.\n",
+    "\n",
+    "    The edit distance between the raw strings (case and punctuation are kept)\n",
+    "    is divided by the longer string's length and subtracted from 1.\n",
+    "    Two empty strings score 1.0; exactly one empty string scores 0.0.\n",
+    "    \"\"\"\n",
+    "    if not (sent1 or sent2):\n",
+    "        return 1.0\n",
+    "    if not (sent1 and sent2):\n",
+    "        return 0.0\n",
+    "\n",
+    "    longest = max(len(sent1), len(sent2))\n",
+    "    edits = distance.Levenshtein.distance(sent1, sent2)\n",
+    "    return 1 - (edits / longest)\n",
+    "\n",
+    "def word_levenshtein_similarity(sent1, sent2):\n",
+    "    \"\"\"Similarity derived from the word-level Levenshtein distance.\n",
+    "\n",
+    "    Both sentences are lowercased and split on whitespace; the edit distance\n",
+    "    is computed over the resulting token lists (one edit = one whole word)\n",
+    "    and normalised by the longer list's length. Two token-less inputs score\n",
+    "    1.0; exactly one token-less input scores 0.0.\n",
+    "    \"\"\"\n",
+    "    tokens_a, tokens_b = sent1.lower().split(), sent2.lower().split()\n",
+    "\n",
+    "    if not (tokens_a or tokens_b):\n",
+    "        return 1.0\n",
+    "    if not (tokens_a and tokens_b):\n",
+    "        return 0.0\n",
+    "\n",
+    "    longest = max(len(tokens_a), len(tokens_b))\n",
+    "    edits = distance.Levenshtein.distance(tokens_a, tokens_b)\n",
+    "    return 1 - (edits / longest)\n",
+    "\n",
+    "# Show both Levenshtein variants for every pair in test_pairs.\n",
+    "for sent1, sent2 in test_pairs:\n",
+    "    char_similarity, word_similarity = (\n",
+    "        char_levenshtein_similarity(sent1, sent2),\n",
+    "        word_levenshtein_similarity(sent1, sent2),\n",
+    "    )\n",
+    "    print(f\"'{sent1}' vs '{sent2}':\")\n",
+    "    # Scores are rendered with three decimal places.\n",
+    "    print(f\" Char similarity: {char_similarity:.3f} --- Word similarity: {word_similarity:.3f}\")\n",
+    "    print(\"-\" * 50)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "bae45c9a",
+   "metadata": {},
+   "source": [
+    "### --- Cosine Similarity ---"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "id": "46a985b4",
+   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "'The cat sat on the mat.' vs 'The cat sat on the mat.': 1.000\n",
-      "'The cat sat on the mat.' vs 'On the mat sat the cat.': 0.429\n",
-      "'The cat sat on the mat.' vs 'The dog ran in the park': 0.111\n"
+      "--------------------------------------------------\n",
+      "'The cat sat on the mat.' vs 'The cat sat on the mat': 1.000\n",
+      "--------------------------------------------------\n",
+      "'The cat sat on the mat.' vs 'The  cat  sat  on  the  mat.': 1.000\n",
+      "--------------------------------------------------\n",
+      "'The cat sat on the mat.' vs 'On the mat, the cat was sitting.': 0.825\n",
+      "--------------------------------------------------\n",
+      "'The cat sat on the mat.' vs 'The feline rested on the rug.': 0.625\n",
+      "--------------------------------------------------\n",
+      "'The quick brown fox jumps.' vs 'A fast brown fox leaps.': 0.400\n",
+      "--------------------------------------------------\n",
+      "'The cat sat on the mat.' vs 'The dog ran in the park.': 0.500\n",
+      "--------------------------------------------------\n",
+      "'I love programming.' vs 'She enjoys reading books.': 0.000\n",
+      "--------------------------------------------------\n",
+      "'The weather is nice today.' vs 'It's raining outside.': 0.000\n",
+      "--------------------------------------------------\n",
+      "'Short.' vs 'Short.': 1.000\n",
+      "--------------------------------------------------\n",
+      "'A B C D E F G' vs 'A B C D E F G': 1.000\n",
+      "--------------------------------------------------\n",
+      "'' vs '': 1.000\n",
+      "--------------------------------------------------\n"
     ]
    }
   ],
-   "execution_count": 9
+   "source": [
+    "def cosine_similarity_bow(sent1, sent2):\n",
+    "    \"\"\"Cosine similarity between bag-of-words count vectors.\n",
+    "\n",
+    "    Each sentence is lowercased, stripped of ASCII punctuation and split on\n",
+    "    whitespace; the word counts over the joint vocabulary form the two vectors.\n",
+    "\n",
+    "    Args:\n",
+    "        sent1: First sentence.\n",
+    "        sent2: Second sentence.\n",
+    "\n",
+    "    Returns:\n",
+    "        A plain float: 1.0 when neither sentence contains a word, 0.0 when\n",
+    "        exactly one of them contains none, otherwise the cosine of the angle\n",
+    "        between the two count vectors.\n",
+    "    \"\"\"\n",
+    "    # Build the punctuation-stripping table once instead of once per sentence.\n",
+    "    strip_punct = str.maketrans('', '', string.punctuation)\n",
+    "    words1 = sent1.lower().translate(strip_punct).split()\n",
+    "    words2 = sent2.lower().translate(strip_punct).split()\n",
+    "\n",
+    "    if not words1 and not words2:\n",
+    "        return 1.0\n",
+    "    # A word-less sentence has a zero vector, for which the cosine is undefined;\n",
+    "    # report no similarity instead of dividing by a zero norm.\n",
+    "    if not words1 or not words2:\n",
+    "        return 0.0\n",
+    "\n",
+    "    freq1 = Counter(words1)\n",
+    "    freq2 = Counter(words2)\n",
+    "\n",
+    "    # Counter returns 0 for absent words, so both vectors share one component order.\n",
+    "    vocabulary = freq1.keys() | freq2.keys()\n",
+    "    vec1 = np.array([freq1[word] for word in vocabulary])\n",
+    "    vec2 = np.array([freq2[word] for word in vocabulary])\n",
+    "\n",
+    "    similarity = np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))\n",
+    "    # Return a built-in float rather than a NumPy scalar, like the other baselines.\n",
+    "    return float(similarity)\n",
+    "\n",
+    "# Score every pair in test_pairs; the value is shown with three decimal places.\n",
+    "for sent1, sent2 in test_pairs:\n",
+    "    similarity = cosine_similarity_bow(sent1, sent2)\n",
+    "    print(f\"'{sent1}' vs '{sent2}': {similarity:.3f}\", \"-\" * 50, sep=\"\\n\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "658276dc",
+   "metadata": {},
+   "source": [
+    "### --- Fuzzy ratios ---"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "id": "7dc7ac2e",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Fuzzy ratio Examples\n",
+      "==========================================================================================\n",
+      "Case                           Ratio    Partial  Token Sort   Token Set Description\n",
+      "==========================================================================================\n",
+      "The cat sat on the mat.        100      100      100          100      Exact copy\n",
+      "The cat sat on the mat.        97       100      97           97       Different content\n",
+      "The cat sat on the mat.        90       82       100          100      Same words, different order\n",
+      "The cat sat on the mat.        47       57       58           62       Different content\n",
+      "The cat sat on the mat.        61       60       53           60       Different content\n",
+      "The quick brown fox jumps.     61       70       57           57       Different content\n",
+      "The cat sat on the mat.        63       63       55           55       Different content\n",
+      "I love programming.            40       43       40           40       Different content\n",
+      "The weather is nice today.     38       47       29           29       Different content\n",
+      "Short.                         100      100      100          100      Exact copy\n",
+      "A B C D E F G                  100      100      100          100      Exact copy\n",
+      "                               100      100      100          0        Exact copy\n"
+     ]
+    }
+   ],
+   "source": [
+    "def fuzzy_ratio_similarity(sent1, sent2):\n",
+    "    \"\"\"rapidfuzz `fuzz.ratio` of the lowercased sentences, divided by 100.\"\"\"\n",
+    "    lowered1, lowered2 = sent1.lower(), sent2.lower()\n",
+    "    score = fuzz.ratio(lowered1, lowered2)\n",
+    "    return score / 100.0\n",
+    "\n",
+    "def fuzzy_partial_ratio(sent1, sent2):\n",
+    "    \"\"\"rapidfuzz `fuzz.partial_ratio` of the lowercased sentences, divided by 100.\"\"\"\n",
+    "    lowered1, lowered2 = sent1.lower(), sent2.lower()\n",
+    "    score = fuzz.partial_ratio(lowered1, lowered2)\n",
+    "    return score / 100.0\n",
+    "\n",
+    "def fuzzy_token_sort_ratio(sent1, sent2):\n",
+    "    \"\"\"rapidfuzz `fuzz.token_sort_ratio` (word order ignored) of the lowercased sentences, divided by 100.\"\"\"\n",
+    "    lowered1, lowered2 = sent1.lower(), sent2.lower()\n",
+    "    score = fuzz.token_sort_ratio(lowered1, lowered2)\n",
+    "    return score / 100.0\n",
+    "\n",
+    "def fuzzy_token_set_ratio(sent1, sent2):\n",
+    "    \"\"\"rapidfuzz `fuzz.token_set_ratio` (compares unique words) of the lowercased sentences, divided by 100.\"\"\"\n",
+    "    lowered1, lowered2 = sent1.lower(), sent2.lower()\n",
+    "    score = fuzz.token_set_ratio(lowered1, lowered2)\n",
+    "    return score / 100.0\n",
+    "\n",
+    "# Tabulate the raw rapidfuzz scores (0-100, original casing) for every pair in test_pairs.\n",
+    "print(\"Fuzzy ratio Examples\")\n",
+    "print(\"=\" * 90)\n",
+    "print(f\"{'Case':<30} {'Ratio':<8} {'Partial':<8} {'Token Sort':<12} {'Token Set':<8} Description\")\n",
+    "print(\"=\" * 90)\n",
+    "\n",
+    "# The description compares words case-insensitively and without punctuation, so\n",
+    "# formatting-only differences (a dropped full stop, doubled spaces) are not\n",
+    "# reported as different content or as a reordering.\n",
+    "strip_punct = str.maketrans('', '', string.punctuation)\n",
+    "\n",
+    "for sent1, sent2 in test_pairs:\n",
+    "    ratio = int(fuzz.ratio(sent1, sent2))\n",
+    "    partial = int(fuzz.partial_ratio(sent1, sent2))\n",
+    "    token_sort = int(fuzz.token_sort_ratio(sent1, sent2))\n",
+    "    token_set = int(fuzz.token_set_ratio(sent1, sent2))\n",
+    "\n",
+    "    tokens1 = sent1.lower().translate(strip_punct).split()\n",
+    "    tokens2 = sent2.lower().translate(strip_punct).split()\n",
+    "\n",
+    "    # Most specific relationship first.\n",
+    "    if sent1 == sent2:\n",
+    "        desc = \"Exact copy\"\n",
+    "    elif tokens1 == tokens2:\n",
+    "        desc = \"Same words, same order\"\n",
+    "    elif sorted(tokens1) == sorted(tokens2):\n",
+    "        desc = \"Same words, different order\"\n",
+    "    elif set(tokens1) <= set(tokens2) or set(tokens2) <= set(tokens1):\n",
+    "        desc = \"Subset relationship\"\n",
+    "    else:\n",
+    "        desc = \"Different content\"\n",
+    "\n",
+    "    # Only the first 28 characters of sent1 are shown to keep the columns aligned.\n",
+    "    print(f\"{sent1[:28]:<30} {ratio:<8} {partial:<8} {token_sort:<12} {token_set:<8} {desc}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a48774d4",
+   "metadata": {},
+   "source": [
+    "### --- Longest common sub-sequence ----"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "id": "e6a4d4e2",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "'The cat sat on the mat.' vs 'The cat sat on the mat.': 1.000\n",
+      "--------------------------------------------------\n",
+      "'The cat sat on the mat.' vs 'The cat sat on the mat': 1.000\n",
+      "--------------------------------------------------\n",
+      "'The cat sat on the mat.' vs 'The  cat  sat  on  the  mat.': 0.815\n",
+      "--------------------------------------------------\n",
+      "'The cat sat on the mat.' vs 'On the mat, the cat was sitting.': 0.433\n",
+      "--------------------------------------------------\n",
+      "'The cat sat on the mat.' vs 'The feline rested on the rug.': 0.536\n",
+      "--------------------------------------------------\n",
+      "'The quick brown fox jumps.' vs 'A fast brown fox leaps.': 0.560\n",
+      "--------------------------------------------------\n",
+      "'The cat sat on the mat.' vs 'The dog ran in the park.': 0.609\n",
+      "--------------------------------------------------\n",
+      "'I love programming.' vs 'She enjoys reading books.': 0.333\n",
+      "--------------------------------------------------\n",
+      "'The weather is nice today.' vs 'It's raining outside.': 0.360\n",
+      "--------------------------------------------------\n",
+      "'Short.' vs 'Short.': 1.000\n",
+      "--------------------------------------------------\n",
+      "'A B C D E F G' vs 'A B C D E F G': 1.000\n",
+      "--------------------------------------------------\n",
+      "'' vs '': 0.000\n",
+      "--------------------------------------------------\n"
+     ]
+    }
+   ],
+   "source": [
+    "def longest_common_subsequence(sent1, sent2):\n",
+    "    \"\"\"Character-level longest-common-subsequence similarity.\n",
+    "\n",
+    "    Both inputs are lowercased and stripped of ASCII punctuation; the LCS\n",
+    "    length of the cleaned strings is divided by the longer cleaned length.\n",
+    "    Returns 0.0 when either raw input is empty or nothing is left after cleaning.\n",
+    "    \"\"\"\n",
+    "    if not (sent1 and sent2):\n",
+    "        return 0.0\n",
+    "\n",
+    "    punct_table = str.maketrans('', '', string.punctuation)\n",
+    "    text_a = sent1.lower().translate(punct_table)\n",
+    "    text_b = sent2.lower().translate(punct_table)\n",
+    "\n",
+    "    longest = max(len(text_a), len(text_b))\n",
+    "    if longest == 0:\n",
+    "        return 0.0\n",
+    "\n",
+    "    # Rolling-row DP: previous[j] holds the LCS length of the text_a prefix\n",
+    "    # consumed so far against text_b[:j].\n",
+    "    previous = [0] * (len(text_b) + 1)\n",
+    "    for char_a in text_a:\n",
+    "        current = [0]\n",
+    "        for j, char_b in enumerate(text_b, start=1):\n",
+    "            if char_a == char_b:\n",
+    "                current.append(previous[j - 1] + 1)\n",
+    "            else:\n",
+    "                current.append(max(previous[j], current[j - 1]))\n",
+    "        previous = current\n",
+    "\n",
+    "    return previous[-1] / longest\n",
+    "\n",
+    "# Score every pair in test_pairs; the value is shown with three decimal places.\n",
+    "for sent1, sent2 in test_pairs:\n",
+    "    similarity = longest_common_subsequence(sent1, sent2)\n",
+    "    print(f\"'{sent1}' vs '{sent2}': {similarity:.3f}\", \"-\" * 50, sep=\"\\n\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "1a532335",
+   "metadata": {},
+   "source": [
+    "### --- Containment Similarity ---\n",
+    "Percentage of Sentence A's words that also appear in Sentence B <br>\n",
+    "containment(A, B) = |words(A) ∩ words(B)| / |words(A)|"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "493979a4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def containment_similarity(sent1, sent2):\n",
+    "    \"\"\"Fraction of sent1's distinct words that also occur in sent2.\n",
+    "\n",
+    "    Asymmetric: it measures how much of sent1 is contained in sent2. Words are\n",
+    "    lowercased, stripped of ASCII punctuation and de-duplicated first.\n",
+    "    Returns 0.0 when sent1 has no words.\n",
+    "    \"\"\"\n",
+    "    punct_table = str.maketrans('', '', string.punctuation)\n",
+    "    vocab_a = set(sent1.lower().translate(punct_table).split())\n",
+    "    vocab_b = set(sent2.lower().translate(punct_table).split())\n",
+    "\n",
+    "    if len(vocab_a) == 0:\n",
+    "        return 0.0\n",
+    "\n",
+    "    return len(vocab_a & vocab_b) / len(vocab_a)\n",
+    "\n",
+    " "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "912a9c7c",
+   "metadata": {},
+   "source": [
+    "### Evaluate BaseLine Methods\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b3d07562",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Pair                                     Sentence 1                     Sentence 2                    \n",
+      "====================================================================================================\n",
+      "Pair 1:                                  The cat sat on the mat.        The cat sat on the mat.       \n"
+     ]
+    },
+    {
+     "ename": "TypeError",
+     "evalue": "tuple indices must be integers or slices, not dict",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[31m---------------------------------------------------------------------------\u001b[39m",
+      "\u001b[31mTypeError\u001b[39m                                 Traceback (most recent call last)",
+      "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[28]\u001b[39m\u001b[32m, line 44\u001b[39m\n\u001b[32m     41\u001b[39m         \u001b[38;5;28mprint\u001b[39m(\u001b[33m\"\u001b[39m\u001b[33m-\u001b[39m\u001b[33m\"\u001b[39m * \u001b[32m100\u001b[39m)\n\u001b[32m     43\u001b[39m results = evaluate_baseline_methods(test_pairs)\n\u001b[32m---> \u001b[39m\u001b[32m44\u001b[39m \u001b[43mprint_comparison_table\u001b[49m\u001b[43m(\u001b[49m\u001b[43mresults\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtest_pairs\u001b[49m\u001b[43m)\u001b[49m\n",
+      "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[28]\u001b[39m\u001b[32m, line 39\u001b[39m, in \u001b[36mprint_comparison_table\u001b[39m\u001b[34m(results, pairs)\u001b[39m\n\u001b[32m     37\u001b[39m \u001b[38;5;66;03m# Print similarities for this pair\u001b[39;00m\n\u001b[32m     38\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m method_name \u001b[38;5;129;01min\u001b[39;00m results:\n\u001b[32m---> \u001b[39m\u001b[32m39\u001b[39m     similarity = \u001b[43mresults\u001b[49m\u001b[43m[\u001b[49m\u001b[43mmethod_name\u001b[49m\u001b[43m]\u001b[49m[i]\n\u001b[32m     40\u001b[39m     \u001b[38;5;28mprint\u001b[39m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[33m'\u001b[39m\u001b[33m'\u001b[39m\u001b[38;5;132;01m:\u001b[39;00m\u001b[33m<40\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mmethod_name\u001b[38;5;250m \u001b[39m+\u001b[38;5;250m \u001b[39m\u001b[33m'\u001b[39m\u001b[33m:\u001b[39m\u001b[33m'\u001b[39m\u001b[38;5;132;01m:\u001b[39;00m\u001b[33m<20\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m \u001b[39m\u001b[38;5;132;01m{\u001b[39;00msimilarity\u001b[38;5;132;01m:\u001b[39;00m\u001b[33m.3f\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m)\n\u001b[32m     41\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33m\"\u001b[39m\u001b[33m-\u001b[39m\u001b[33m\"\u001b[39m * \u001b[32m100\u001b[39m)\n",
+      "\u001b[31mTypeError\u001b[39m: tuple indices must be integers or slices, not dict"
+     ]
+    }
+   ],
+   "source": [
+    "def evaluate_baseline_methods(pairs):\n",
+    "    \"\"\"Score every sentence pair with every baseline similarity function.\n",
+    "\n",
+    "    Returns a 2-tuple ``(results, methods)``: ``results`` maps each method's\n",
+    "    display name to its list of scores, in the same order as ``pairs``;\n",
+    "    ``methods`` maps the same names to the scoring functions.\n",
+    "    \"\"\"\n",
+    "    methods = {\n",
+    "        'Jaccard': jaccard_similarity,\n",
+    "        'Levenshtein (char)': char_levenshtein_similarity,\n",
+    "        'Levenshtein (word)': word_levenshtein_similarity,\n",
+    "        'Cosine BOW': cosine_similarity_bow,\n",
+    "        'Fuzzy Ratio': fuzzy_ratio_similarity,\n",
+    "        'Fuzzy Partial': fuzzy_partial_ratio,\n",
+    "        'Fuzzy Token Sort': fuzzy_token_sort_ratio,\n",
+    "        'Fuzzy Token Set': fuzzy_token_set_ratio,\n",
+    "        'LCS': longest_common_subsequence,\n",
+    "        'Containment': containment_similarity,\n",
+    "    }\n",
+    "\n",
+    "    scorers = list(methods.items())\n",
+    "    results = {label: [] for label, _ in scorers}\n",
+    "\n",
+    "    # Pair-major order: each pair is consumed once and scored by all methods.\n",
+    "    for first, second in pairs:\n",
+    "        for label, scorer in scorers:\n",
+    "            results[label].append(scorer(first, second))\n",
+    "\n",
+    "    return results, methods\n",
+    "\n",
+    "def print_comparison_table(results, pairs):\n",
+    "    \"\"\"Print, for each sentence pair, one line per method with its score.\n",
+    "\n",
+    "    ``results`` maps a method name to a list of scores indexed like ``pairs``.\n",
+    "    Sentences longer than 30 characters are shown as their first 27\n",
+    "    characters followed by \"...\".\n",
+    "    \"\"\"\n",
+    "    def _shorten(text):\n",
+    "        # Keep the sentence within its 30-character column.\n",
+    "        if len(text) > 30:\n",
+    "            return text[:27] + \"...\"\n",
+    "        return text\n",
+    "\n",
+    "    print(f\"{'Pair':<40} {'Sentence 1':<30} {'Sentence 2':<30}\")\n",
+    "    print(\"=\" * 100)\n",
+    "\n",
+    "    for number, (sent1, sent2) in enumerate(pairs, start=1):\n",
+    "        shown1, shown2 = _shorten(sent1), _shorten(sent2)\n",
+    "        label = f\"Pair {number}:\"\n",
+    "        print(f\"{label:<40} {shown1:<30} {shown2:<30}\")\n",
+    "\n",
+    "        # One indented line per method, score with three decimals.\n",
+    "        for method_name in results:\n",
+    "            score = results[method_name][number - 1]\n",
+    "            print(f\"{'':<40} {method_name + ':':<20} {score:.3f}\")\n",
+    "        print(\"-\" * 100)\n",
+    "\n",
+    "# evaluate_baseline_methods returns (results, methods); unpack it so the table\n",
+    "# receives the scores dict rather than the whole tuple (which raised TypeError).\n",
+    "results, methods = evaluate_baseline_methods(test_pairs)\n",
+    "print_comparison_table(results, test_pairs)"
+   ]
  }
 ],
 "metadata": {
  "kernelspec": {
-   "name": "python3",
+   "display_name": ".venv",
   "language": "python",
-   "display_name": "Python 3 (ipykernel)"
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.13.7"
  }
 },
 "nbformat": 4,
--- a/requirments.txt
+++ b/requirments.txt
@@ -1,9 +1,12 @@
-datasets
-huggingface-hub
 pandas
 numpy
 scikit-learn
-spacy
 matplotlib
-seaborn
-jupyter
+
+spacy
+tensor
+datasets
+jupyter
+
+huggingface-hub
+seaborn
--- a/2
+++ b/2
@@ -0,0 +1,2 @@
+python -m spacy download en_core_web_lg
+python -m spacy download en_core_web_trf