Baseline experimentation on test data

This commit is contained in:
Henry Dowd
2025-11-30 17:38:00 +00:00
parent 02cdc7bac6
commit 37ccc03ac9
4 changed files with 561 additions and 18 deletions

View File

@@ -18,7 +18,19 @@
"start_time": "2025-11-23T13:53:56.325948Z"
}
},
"outputs": [],
"outputs": [
{
"ename": "ModuleNotFoundError",
"evalue": "No module named 'spacy'",
"output_type": "error",
"traceback": [
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
"\u001b[31mModuleNotFoundError\u001b[39m Traceback (most recent call last)",
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[2]\u001b[39m\u001b[32m, line 2\u001b[39m\n\u001b[32m 1\u001b[39m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mtoken\u001b[39;00m\n\u001b[32m----> \u001b[39m\u001b[32m2\u001b[39m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mspacy\u001b[39;00m\n\u001b[32m 3\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mspacy\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m displacy\n\u001b[32m 4\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mIPython\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mdisplay\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m display, HTML\n",
"\u001b[31mModuleNotFoundError\u001b[39m: No module named 'spacy'"
]
}
],
"source": [
"import token\n",
"import spacy\n",

View File

@@ -1,17 +1,102 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 3,
"id": "d2aa2997",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"done\n"
]
}
],
"source": [
"# pip install rapidfuzz scikit-learn numpy\n",
"\n",
"import numpy as np\n",
"from collections import Counter\n",
"import string\n",
"from rapidfuzz import fuzz, distance\n",
"\n",
"test_pairs = [\n",
" # Direct copies and near-copies\n",
" (\"The cat sat on the mat.\", \"The cat sat on the mat.\"), # Exact copy\n",
" (\"The cat sat on the mat.\", \"The cat sat on the mat\"), # No punctuation\n",
" (\"The cat sat on the mat.\", \"The cat sat on the mat.\"), # Extra spaces\n",
" \n",
" # Paraphrases with same meaning\n",
" (\"The cat sat on the mat.\", \"On the mat, the cat was sitting.\"), # Structural change\n",
" (\"The cat sat on the mat.\", \"The feline rested on the rug.\"), # Synonym replacement\n",
" (\"The quick brown fox jumps.\", \"A fast brown fox leaps.\"), # Partial synonym\n",
" \n",
" # Different sentences\n",
" (\"The cat sat on the mat.\", \"The dog ran in the park.\"), # Different content\n",
" (\"I love programming.\", \"She enjoys reading books.\"), # Completely different\n",
" (\"The weather is nice today.\", \"It's raining outside.\"), # Opposite meaning\n",
" \n",
" # Edge cases\n",
" (\"Short.\", \"Short.\"), # Very short\n",
" (\"A B C D E F G\", \"A B C D E F G\"), # Repeated words\n",
" (\"\", \"\"), # Empty strings\n",
"]\n",
"print(\"done\")"
]
},
{
"cell_type": "markdown",
"id": "b06d10d0",
"metadata": {},
"source": [
"### Jaccard Similarity "
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "e60d024e969254a",
"metadata": {
"ExecuteTime": {
"end_time": "2025-11-19T10:01:11.039074Z",
"start_time": "2025-11-19T10:01:09.613806Z"
}
},
"cell_type": "code",
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"'The cat sat on the mat.' vs 'The cat sat on the mat.': 1.000\n",
"--------------------------------------------------\n",
"'The cat sat on the mat.' vs 'The cat sat on the mat': 0.667\n",
"--------------------------------------------------\n",
"'The cat sat on the mat.' vs 'The cat sat on the mat.': 1.000\n",
"--------------------------------------------------\n",
"'The cat sat on the mat.' vs 'On the mat, the cat was sitting.': 0.375\n",
"--------------------------------------------------\n",
"'The cat sat on the mat.' vs 'The feline rested on the rug.': 0.250\n",
"--------------------------------------------------\n",
"'The quick brown fox jumps.' vs 'A fast brown fox leaps.': 0.250\n",
"--------------------------------------------------\n",
"'The cat sat on the mat.' vs 'The dog ran in the park.': 0.111\n",
"--------------------------------------------------\n",
"'I love programming.' vs 'She enjoys reading books.': 0.000\n",
"--------------------------------------------------\n",
"'The weather is nice today.' vs 'It's raining outside.': 0.000\n",
"--------------------------------------------------\n",
"'Short.' vs 'Short.': 1.000\n",
"--------------------------------------------------\n",
"'A B C D E F G' vs 'A B C D E F G': 1.000\n",
"--------------------------------------------------\n",
"'' vs '': 0.000\n",
"--------------------------------------------------\n"
]
}
],
"source": [
"import import_ipynb\n",
"#from notebooks.01_data_exploration import *\n",
"\n",
"def jaccard_similarity(sent1, sent2):\n",
" # make lowercase and split into words\n",
" words1 = set(sent1.lower().split())\n",
@@ -20,7 +105,7 @@
" union = words1.union(words2)\n",
" return float(len(intersection)) / len(union) if union else 0.0\n",
"\n",
"test_pairs = [\n",
"small_test_pairs = [\n",
" (\"The cat sat on the mat.\", \"The cat sat on the mat.\"), # Copy\n",
" (\"The cat sat on the mat.\", \"On the mat sat the cat.\"), # Same words rearranged\n",
" (\"The cat sat on the mat.\", \"The dog ran in the park\") # Different\n",
@@ -28,28 +113,469 @@
"\n",
"for sent1, sent2 in test_pairs:\n",
" similarity = jaccard_similarity(sent1, sent2)\n",
" print(f\"'{sent1}' vs '{sent2}': {similarity:.3f}\") # 3 decimal places\n"
" print(f\"'{sent1}' vs '{sent2}': {similarity:.3f}\") # 3 decimal places\n",
" print(\"-\"* 50)\n"
]
},
{
"cell_type": "markdown",
"id": "337a1072",
"metadata": {},
"source": [
"### --- Levenshtein Similarity ---\n",
" Character & Word"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "0b68fdcd",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"'The cat sat on the mat.' vs 'The cat sat on the mat.':\n",
" Char similarity: 1.000 --- Word similarity: 1.000\n",
"--------------------------------------------------\n",
"'The cat sat on the mat.' vs 'The cat sat on the mat':\n",
" Char similarity: 0.957 --- Word similarity: 0.833\n",
"--------------------------------------------------\n",
"'The cat sat on the mat.' vs 'The cat sat on the mat.':\n",
" Char similarity: 0.821 --- Word similarity: 1.000\n",
"--------------------------------------------------\n",
"'The cat sat on the mat.' vs 'On the mat, the cat was sitting.':\n",
" Char similarity: 0.344 --- Word similarity: 0.143\n",
"--------------------------------------------------\n",
"'The cat sat on the mat.' vs 'The feline rested on the rug.':\n",
" Char similarity: 0.517 --- Word similarity: 0.500\n",
"--------------------------------------------------\n",
"'The quick brown fox jumps.' vs 'A fast brown fox leaps.':\n",
" Char similarity: 0.577 --- Word similarity: 0.400\n",
"--------------------------------------------------\n",
"'The cat sat on the mat.' vs 'The dog ran in the park.':\n",
" Char similarity: 0.625 --- Word similarity: 0.333\n",
"--------------------------------------------------\n",
"'I love programming.' vs 'She enjoys reading books.':\n",
" Char similarity: 0.200 --- Word similarity: 0.000\n",
"--------------------------------------------------\n",
"'The weather is nice today.' vs 'It's raining outside.':\n",
" Char similarity: 0.192 --- Word similarity: 0.000\n",
"--------------------------------------------------\n",
"'Short.' vs 'Short.':\n",
" Char similarity: 1.000 --- Word similarity: 1.000\n",
"--------------------------------------------------\n",
"'A B C D E F G' vs 'A B C D E F G':\n",
" Char similarity: 1.000 --- Word similarity: 1.000\n",
"--------------------------------------------------\n",
"'' vs '':\n",
" Char similarity: 1.000 --- Word similarity: 1.000\n",
"--------------------------------------------------\n"
]
}
],
"id": "e60d024e969254a",
"source": [
"def char_levenshtein_similarity(sent1, sent2):\n",
"    \"\"\"Return a character-level Levenshtein similarity.\n",
"\n",
"    The edit distance between the two strings (compared as given, with no\n",
"    lower-casing or punctuation stripping) is divided by the length of\n",
"    the longer string and subtracted from 1. Two empty strings give 1.0;\n",
"    exactly one empty string gives 0.0.\n",
"    \"\"\"\n",
"    if sent1 and sent2:\n",
"        longest = max(len(sent1), len(sent2))\n",
"        edits = distance.Levenshtein.distance(sent1, sent2)\n",
"        return 1 - (edits / longest)\n",
"    # At least one side is empty: they match only when both are.\n",
"    return 0.0 if (sent1 or sent2) else 1.0\n",
"\n",
"def word_levenshtein_similarity(sent1, sent2):\n",
"    \"\"\"Return a word-level Levenshtein similarity.\n",
"\n",
"    Both sentences are lower-cased and split on whitespace; the edit\n",
"    distance is taken over the two token lists and divided by the length\n",
"    of the longer list, then subtracted from 1. Two token-less inputs\n",
"    give 1.0; exactly one token-less input gives 0.0.\n",
"    \"\"\"\n",
"    tokens_a = sent1.lower().split()\n",
"    tokens_b = sent2.lower().split()\n",
"\n",
"    if tokens_a and tokens_b:\n",
"        longest = max(len(tokens_a), len(tokens_b))\n",
"        edits = distance.Levenshtein.distance(tokens_a, tokens_b)\n",
"        return 1 - (edits / longest)\n",
"    # At least one side has no tokens: they match only when both are empty.\n",
"    return 0.0 if (tokens_a or tokens_b) else 1.0\n",
"\n",
"for sent1, sent2 in test_pairs:\n",
"    # Score the pair at character level and at word level.\n",
"    char_similarity, word_similarity = (\n",
"        char_levenshtein_similarity(sent1, sent2),\n",
"        word_levenshtein_similarity(sent1, sent2),\n",
"    )\n",
"    # One print call emits the pair, its scores and the 50-dash divider.\n",
"    print(\n",
"        f\"'{sent1}' vs '{sent2}':\",\n",
"        f\" Char similarity: {char_similarity:.3f} --- Word similarity: {word_similarity:.3f}\",\n",
"        \"-\" * 50,\n",
"        sep=\"\\n\",\n",
"    )"
]
},
{
"cell_type": "markdown",
"id": "bae45c9a",
"metadata": {},
"source": [
"### --- Cosine Similarity ---"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "46a985b4",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"'The cat sat on the mat.' vs 'The cat sat on the mat.': 1.000\n",
"'The cat sat on the mat.' vs 'On the mat sat the cat.': 0.429\n",
"'The cat sat on the mat.' vs 'The dog ran in the park': 0.111\n"
"--------------------------------------------------\n",
"'The cat sat on the mat.' vs 'The cat sat on the mat': 1.000\n",
"--------------------------------------------------\n",
"'The cat sat on the mat.' vs 'The cat sat on the mat.': 1.000\n",
"--------------------------------------------------\n",
"'The cat sat on the mat.' vs 'On the mat, the cat was sitting.': 0.825\n",
"--------------------------------------------------\n",
"'The cat sat on the mat.' vs 'The feline rested on the rug.': 0.625\n",
"--------------------------------------------------\n",
"'The quick brown fox jumps.' vs 'A fast brown fox leaps.': 0.400\n",
"--------------------------------------------------\n",
"'The cat sat on the mat.' vs 'The dog ran in the park.': 0.500\n",
"--------------------------------------------------\n",
"'I love programming.' vs 'She enjoys reading books.': 0.000\n",
"--------------------------------------------------\n",
"'The weather is nice today.' vs 'It's raining outside.': 0.000\n",
"--------------------------------------------------\n",
"'Short.' vs 'Short.': 1.000\n",
"--------------------------------------------------\n",
"'A B C D E F G' vs 'A B C D E F G': 1.000\n",
"--------------------------------------------------\n",
"'' vs '': 1.000\n",
"--------------------------------------------------\n"
]
}
],
"execution_count": 9
"source": [
"def cosine_similarity_bow(sent1, sent2):\n",
"    \"\"\"Cosine similarity between bag-of-words count vectors.\n",
"\n",
"    Each sentence is lower-cased, stripped of ASCII punctuation and split\n",
"    on whitespace; the resulting word counts are compared over the joint\n",
"    vocabulary.\n",
"\n",
"    Returns 1.0 when neither sentence has any words, 0.0 when exactly one\n",
"    of them has none, and otherwise dot(v1, v2) / (|v1| * |v2|).\n",
"    \"\"\"\n",
"    # Build the punctuation-stripping table once and reuse it.\n",
"    strip_punct = str.maketrans('', '', string.punctuation)\n",
"    words1 = sent1.lower().translate(strip_punct).split()\n",
"    words2 = sent2.lower().translate(strip_punct).split()\n",
"\n",
"    if not words1 and not words2:\n",
"        return 1.0\n",
"    if not words1 or not words2:\n",
"        # A zero vector has no direction; score it as no overlap.\n",
"        return 0.0\n",
"\n",
"    freq1 = Counter(words1)\n",
"    freq2 = Counter(words2)\n",
"\n",
"    # Fix one vocabulary order so both vectors index the same words.\n",
"    vocabulary = sorted(set(freq1) | set(freq2))\n",
"    vec1 = np.array([freq1[word] for word in vocabulary])\n",
"    vec2 = np.array([freq2[word] for word in vocabulary])\n",
"\n",
"    # Both norms are non-zero here because each side has at least one word.\n",
"    return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))\n",
"\n",
"for sent1, sent2 in test_pairs:\n",
"    similarity = cosine_similarity_bow(sent1, sent2)\n",
"    # One print call emits the score line and the 50-dash divider.\n",
"    print(f\"'{sent1}' vs '{sent2}': {similarity:.3f}\", \"-\" * 50, sep=\"\\n\")"
]
},
{
"cell_type": "markdown",
"id": "658276dc",
"metadata": {},
"source": [
"### --- Fuzzy ratios ---"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "7dc7ac2e",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Fuzzy ratio Examples\n",
"==========================================================================================\n",
"Case Ratio Partial Token Sort Token Set Description\n",
"==========================================================================================\n",
"The cat sat on the mat. 100 100 100 100 Exact copy\n",
"The cat sat on the mat. 97 100 97 97 Different content\n",
"The cat sat on the mat. 90 82 100 100 Same words, different order\n",
"The cat sat on the mat. 47 57 58 62 Different content\n",
"The cat sat on the mat. 61 60 53 60 Different content\n",
"The quick brown fox jumps. 61 70 57 57 Different content\n",
"The cat sat on the mat. 63 63 55 55 Different content\n",
"I love programming. 40 43 40 40 Different content\n",
"The weather is nice today. 38 47 29 29 Different content\n",
"Short. 100 100 100 100 Exact copy\n",
"A B C D E F G 100 100 100 100 Exact copy\n",
" 100 100 100 0 Exact copy\n"
]
}
],
"source": [
"def fuzzy_ratio_similarity(sent1, sent2):\n",
"    \"\"\"Return `fuzz.ratio` of the lower-cased inputs, divided by 100.\"\"\"\n",
"    left, right = sent1.lower(), sent2.lower()\n",
"    score = fuzz.ratio(left, right)\n",
"    return score / 100.0\n",
"\n",
"def fuzzy_partial_ratio(sent1, sent2):\n",
"    \"\"\"Return `fuzz.partial_ratio` of the lower-cased inputs, divided by 100.\"\"\"\n",
"    left, right = sent1.lower(), sent2.lower()\n",
"    score = fuzz.partial_ratio(left, right)\n",
"    return score / 100.0\n",
"\n",
"def fuzzy_token_sort_ratio(sent1, sent2):\n",
"    \"\"\"Return `fuzz.token_sort_ratio` of the lower-cased inputs, divided by 100.\"\"\"\n",
"    left, right = sent1.lower(), sent2.lower()\n",
"    score = fuzz.token_sort_ratio(left, right)\n",
"    return score / 100.0\n",
"\n",
"def fuzzy_token_set_ratio(sent1, sent2):\n",
"    \"\"\"Return `fuzz.token_set_ratio` of the lower-cased inputs, divided by 100.\"\"\"\n",
"    left, right = sent1.lower(), sent2.lower()\n",
"    score = fuzz.token_set_ratio(left, right)\n",
"    return score / 100.0\n",
"\n",
"print(\"Fuzzy ratio Examples\")\n",
"print(\"=\" * 90)\n",
"print(f\"{'Case':<30} {'Ratio':<8} {'Partial':<8} {'Token Sort':<12} {'Token Set':<8} Description\")\n",
"print(\"=\" * 90)\n",
"\n",
"# Table used to strip ASCII punctuation before classifying a pair.\n",
"strip_punct = str.maketrans('', '', string.punctuation)\n",
"\n",
"for sent1, sent2 in test_pairs:\n",
"    # Scores on the unmodified strings, truncated to int for display.\n",
"    ratio = int(fuzz.ratio(sent1, sent2))\n",
"    partial = int(fuzz.partial_ratio(sent1, sent2))\n",
"    token_sort = int(fuzz.token_sort_ratio(sent1, sent2))\n",
"    token_set = int(fuzz.token_set_ratio(sent1, sent2))\n",
"\n",
"    # Classify on normalised tokens so that a missing full stop or extra\n",
"    # spacing is not reported as different content or as a reordering.\n",
"    tokens1 = sent1.lower().translate(strip_punct).split()\n",
"    tokens2 = sent2.lower().translate(strip_punct).split()\n",
"    vocab1, vocab2 = set(tokens1), set(tokens2)\n",
"\n",
"    if sent1 == sent2:\n",
"        desc = \"Exact copy\"\n",
"    elif tokens1 == tokens2:\n",
"        desc = \"Same words, formatting differs\"\n",
"    elif sorted(tokens1) == sorted(tokens2):\n",
"        desc = \"Same words, different order\"\n",
"    elif vocab1 <= vocab2 or vocab2 <= vocab1:\n",
"        desc = \"Subset relationship\"\n",
"    else:\n",
"        desc = \"Different content\"\n",
"\n",
"    print(f\"{sent1[:28]:<30} {ratio:<8} {partial:<8} {token_sort:<12} {token_set:<8} {desc}\")"
]
},
{
"cell_type": "markdown",
"id": "a48774d4",
"metadata": {},
"source": [
"### --- Longest common sub-sequence ----"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "e6a4d4e2",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"'The cat sat on the mat.' vs 'The cat sat on the mat.': 1.000\n",
"--------------------------------------------------\n",
"'The cat sat on the mat.' vs 'The cat sat on the mat': 1.000\n",
"--------------------------------------------------\n",
"'The cat sat on the mat.' vs 'The cat sat on the mat.': 0.815\n",
"--------------------------------------------------\n",
"'The cat sat on the mat.' vs 'On the mat, the cat was sitting.': 0.433\n",
"--------------------------------------------------\n",
"'The cat sat on the mat.' vs 'The feline rested on the rug.': 0.536\n",
"--------------------------------------------------\n",
"'The quick brown fox jumps.' vs 'A fast brown fox leaps.': 0.560\n",
"--------------------------------------------------\n",
"'The cat sat on the mat.' vs 'The dog ran in the park.': 0.609\n",
"--------------------------------------------------\n",
"'I love programming.' vs 'She enjoys reading books.': 0.333\n",
"--------------------------------------------------\n",
"'The weather is nice today.' vs 'It's raining outside.': 0.360\n",
"--------------------------------------------------\n",
"'Short.' vs 'Short.': 1.000\n",
"--------------------------------------------------\n",
"'A B C D E F G' vs 'A B C D E F G': 1.000\n",
"--------------------------------------------------\n",
"'' vs '': 0.000\n",
"--------------------------------------------------\n"
]
}
],
"source": [
"def longest_common_subsequence(sent1, sent2):\n",
"    \"\"\"Character-level longest-common-subsequence similarity.\n",
"\n",
"    Returns 0.0 if either raw input is empty. Otherwise both inputs are\n",
"    lower-cased and stripped of ASCII punctuation, and the LCS length is\n",
"    divided by the length of the longer cleaned string (0.0 if both\n",
"    cleaned strings are empty).\n",
"    \"\"\"\n",
"    if not (sent1 and sent2):\n",
"        return 0.0\n",
"\n",
"    punct_table = str.maketrans('', '', string.punctuation)\n",
"    text_a = sent1.lower().translate(punct_table)\n",
"    text_b = sent2.lower().translate(punct_table)\n",
"    rows, cols = len(text_a), len(text_b)\n",
"\n",
"    # table[r][c] holds the LCS length of text_a[:r] and text_b[:c].\n",
"    table = [[0] * (cols + 1) for _ in range(rows + 1)]\n",
"    for r, char_a in enumerate(text_a, start=1):\n",
"        for c, char_b in enumerate(text_b, start=1):\n",
"            if char_a == char_b:\n",
"                table[r][c] = table[r - 1][c - 1] + 1\n",
"            else:\n",
"                table[r][c] = max(table[r - 1][c], table[r][c - 1])\n",
"\n",
"    longest = max(rows, cols)\n",
"    if longest == 0:\n",
"        return 0.0\n",
"    return table[rows][cols] / longest\n",
"\n",
"for sent1, sent2 in test_pairs:\n",
"    similarity = longest_common_subsequence(sent1, sent2)\n",
"    # One print call emits the score line and the 50-dash divider.\n",
"    print(f\"'{sent1}' vs '{sent2}': {similarity:.3f}\", \"-\" * 50, sep=\"\\n\")"
]
},
{
"cell_type": "markdown",
"id": "1a532335",
"metadata": {},
"source": [
"### --- Containment Similarity ---\n",
"Percentage of Sentence A in Sentence B <br>\n",
"containment(A, B) = |words(A) ∩ words(B)| / |words(A)|"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "493979a4",
"metadata": {},
"outputs": [],
"source": [
"def containment_similarity(sent1, sent2):\n",
"    \"\"\"Fraction of sent1's distinct words that also occur in sent2.\n",
"\n",
"    Asymmetric: only sent1's vocabulary forms the denominator. Words are\n",
"    lower-cased with ASCII punctuation removed. Returns 0.0 when sent1\n",
"    has no words.\n",
"    \"\"\"\n",
"    def vocab(sentence):\n",
"        cleaned = sentence.lower().translate(str.maketrans('', '', string.punctuation))\n",
"        return set(cleaned.split())\n",
"\n",
"    source_words, target_words = vocab(sent1), vocab(sent2)\n",
"    if len(source_words) == 0:\n",
"        return 0.0\n",
"    return len(source_words & target_words) / len(source_words)\n",
"\n",
" "
]
},
{
"cell_type": "markdown",
"id": "912a9c7c",
"metadata": {},
"source": [
"### Evaluate Baseline Methods\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b3d07562",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Pair Sentence 1 Sentence 2 \n",
"====================================================================================================\n",
"Pair 1: The cat sat on the mat. The cat sat on the mat. \n"
]
},
{
"ename": "TypeError",
"evalue": "tuple indices must be integers or slices, not dict",
"output_type": "error",
"traceback": [
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
"\u001b[31mTypeError\u001b[39m Traceback (most recent call last)",
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[28]\u001b[39m\u001b[32m, line 44\u001b[39m\n\u001b[32m 41\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33m\"\u001b[39m\u001b[33m-\u001b[39m\u001b[33m\"\u001b[39m * \u001b[32m100\u001b[39m)\n\u001b[32m 43\u001b[39m results = evaluate_baseline_methods(test_pairs)\n\u001b[32m---> \u001b[39m\u001b[32m44\u001b[39m \u001b[43mprint_comparison_table\u001b[49m\u001b[43m(\u001b[49m\u001b[43mresults\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtest_pairs\u001b[49m\u001b[43m)\u001b[49m\n",
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[28]\u001b[39m\u001b[32m, line 39\u001b[39m, in \u001b[36mprint_comparison_table\u001b[39m\u001b[34m(results, pairs)\u001b[39m\n\u001b[32m 37\u001b[39m \u001b[38;5;66;03m# Print similarities for this pair\u001b[39;00m\n\u001b[32m 38\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m method_name \u001b[38;5;129;01min\u001b[39;00m results:\n\u001b[32m---> \u001b[39m\u001b[32m39\u001b[39m similarity = \u001b[43mresults\u001b[49m\u001b[43m[\u001b[49m\u001b[43mmethod_name\u001b[49m\u001b[43m]\u001b[49m[i]\n\u001b[32m 40\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[33m'\u001b[39m\u001b[33m'\u001b[39m\u001b[38;5;132;01m:\u001b[39;00m\u001b[33m<40\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mmethod_name\u001b[38;5;250m \u001b[39m+\u001b[38;5;250m \u001b[39m\u001b[33m'\u001b[39m\u001b[33m:\u001b[39m\u001b[33m'\u001b[39m\u001b[38;5;132;01m:\u001b[39;00m\u001b[33m<20\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m \u001b[39m\u001b[38;5;132;01m{\u001b[39;00msimilarity\u001b[38;5;132;01m:\u001b[39;00m\u001b[33m.3f\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m)\n\u001b[32m 41\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33m\"\u001b[39m\u001b[33m-\u001b[39m\u001b[33m\"\u001b[39m * \u001b[32m100\u001b[39m)\n",
"\u001b[31mTypeError\u001b[39m: tuple indices must be integers or slices, not dict"
]
}
],
"source": [
"def evaluate_baseline_methods(pairs):\n",
"    \"\"\"Score every sentence pair with each baseline similarity method.\n",
"\n",
"    Returns a ``(results, methods)`` tuple: ``results`` maps a method\n",
"    label to its list of scores in pair order, and ``methods`` maps the\n",
"    same labels to the scoring functions.\n",
"    \"\"\"\n",
"    methods = {\n",
"        'Jaccard': jaccard_similarity,\n",
"        'Levenshtein (char)': char_levenshtein_similarity,\n",
"        'Levenshtein (word)': word_levenshtein_similarity,\n",
"        'Cosine BOW': cosine_similarity_bow,\n",
"        'Fuzzy Ratio': fuzzy_ratio_similarity,\n",
"        'Fuzzy Partial': fuzzy_partial_ratio,\n",
"        'Fuzzy Token Sort': fuzzy_token_sort_ratio,\n",
"        'Fuzzy Token Set': fuzzy_token_set_ratio,\n",
"        'LCS': longest_common_subsequence,\n",
"        'Containment': containment_similarity,\n",
"    }\n",
"\n",
"    # Start every method with an empty score list.\n",
"    results = {}\n",
"    for label in methods:\n",
"        results[label] = []\n",
"\n",
"    # Walk the pairs once, appending one score per method for each pair.\n",
"    for first, second in pairs:\n",
"        for label, scorer in methods.items():\n",
"            results[label].append(scorer(first, second))\n",
"\n",
"    return results, methods\n",
"\n",
"def print_comparison_table(results, pairs):\n",
"    \"\"\"Print every pair followed by one score line per method.\n",
"\n",
"    ``results`` maps a method label to a list of scores indexed like\n",
"    ``pairs``. Sentences longer than 30 characters are shown as their\n",
"    first 27 characters followed by three dots.\n",
"    \"\"\"\n",
"    def shorten(text):\n",
"        if len(text) > 30:\n",
"            return text[:27] + \"...\"\n",
"        return text\n",
"\n",
"    print(f\"{'Pair':<40} {'Sentence 1':<30} {'Sentence 2':<30}\")\n",
"    print(\"=\" * 100)\n",
"\n",
"    for i, (sent1, sent2) in enumerate(pairs):\n",
"        display_sent1, display_sent2 = shorten(sent1), shorten(sent2)\n",
"        print(f\"{f'Pair {i+1}:':<40} {display_sent1:<30} {display_sent2:<30}\")\n",
"\n",
"        # One indented line per method, then a divider closing the pair.\n",
"        for method_name, scores in results.items():\n",
"            similarity = scores[i]\n",
"            print(f\"{'':<40} {method_name + ':':<20} {similarity:.3f}\")\n",
"        print(\"-\" * 100)\n",
"\n",
"# evaluate_baseline_methods returns (results, methods); unpack so that\n",
"# print_comparison_table receives the scores dict, not the whole tuple.\n",
"results, methods = evaluate_baseline_methods(test_pairs)\n",
"print_comparison_table(results, test_pairs)"
]
}
],
"metadata": {
"kernelspec": {
"name": "python3",
"display_name": ".venv",
"language": "python",
"display_name": "Python 3 (ipykernel)"
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.7"
}
},
"nbformat": 4,

View File

@@ -1,9 +1,12 @@
datasets
huggingface-hub
pandas
numpy
scikit-learn
spacy
matplotlib
seaborn
jupyter
spacy
tensor
datasets
jupyter
huggingface-hub
seaborn

2
spacy_models Normal file
View File

@@ -0,0 +1,2 @@
python -m spacy download en_core_web_lg
python -m spacy download en_core_web_trf