Files
paraphrase_detector/notebooks/02_baseline_experiments.ipynb
Henry Dowd 726c6b276b notebook2
2025-12-10 19:43:28 +00:00

674 lines
34 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "d2aa2997",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"done\n"
]
}
],
"source": [
"# pip install rapidfuzz scikit-learn numpy\n",
"\n",
"import numpy as np\n",
"from collections import Counter\n",
"import string\n",
"from rapidfuzz import fuzz, distance\n",
"\n",
"test_pairs = [\n",
" # Direct copies and near-copies\n",
" (\"The cat sat on the mat.\", \"The cat sat on the mat.\"), # Exact copy\n",
" (\"The cat sat on the mat.\", \"The cat sat on the mat\"), # No punctuation\n",
" (\"The cat sat on the mat.\", \"The cat sat on the mat.\"), # Extra spaces\n",
" \n",
" # Paraphrases with same meaning\n",
" (\"The cat sat on the mat.\", \"On the mat, the cat was sitting.\"), # Structural change\n",
" (\"The cat sat on the mat.\", \"The feline rested on the rug.\"), # Synonym replacement\n",
" (\"The quick brown fox jumps.\", \"A fast brown fox leaps.\"), # Partial synonym\n",
" \n",
" # Different sentences\n",
" (\"The cat sat on the mat.\", \"The dog ran in the park.\"), # Different content\n",
" (\"I love programming.\", \"She enjoys reading books.\"), # Completely different\n",
" (\"The weather is nice today.\", \"It's raining outside.\"), # Opposite meaning\n",
" \n",
" # Edge cases\n",
" (\"Short.\", \"Short.\"), # Very short\n",
" (\"A B C D E F G\", \"A B C D E F G\"), # Repeated words\n",
" (\"\", \"\"), # Empty strings\n",
"]\n",
"print(\"done\")"
]
},
{
"cell_type": "markdown",
"id": "b06d10d0",
"metadata": {},
"source": [
"### Jaccard Similarity "
]
},
{
"cell_type": "code",
"execution_count": 58,
"id": "e60d024e969254a",
"metadata": {
"ExecuteTime": {
"end_time": "2025-11-19T10:01:11.039074Z",
"start_time": "2025-11-19T10:01:09.613806Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Sentence 1 Sentence 2: Similarity Score:\n",
"====================================================================================================\n",
"The cat sat on the mat. vs The cat sat on the mat. 1.000\n",
"The cat sat on the mat. vs The cat sat on the mat 0.667\n",
"The cat sat on the mat. vs The cat sat on the mat. 1.000\n",
"The cat sat on the mat. vs On the mat, the cat was sitting. 0.375\n",
"The cat sat on the mat. vs The feline rested on the rug. 0.250\n",
"The quick brown fox jumps. vs A fast brown fox leaps. 0.250\n",
"The cat sat on the mat. vs The dog ran in the park. 0.111\n",
"I love programming. vs She enjoys reading books. 0.000\n",
"The weather is nice today. vs It's raining outside. 0.000\n",
"Short. vs Short. 1.000\n",
"A B C D E F G vs A B C D E F G 1.000\n",
" vs 0.000\n"
]
}
],
"source": [
"def jaccard_similarity(sent1, sent2):\n",
"    \"\"\"Jaccard similarity over lowercased word sets.\n",
"\n",
"    Returns |intersection| / |union| in [0, 1]; 0.0 when both\n",
"    sentences are empty (empty union). Punctuation is NOT stripped,\n",
"    so 'mat.' and 'mat' count as different words.\n",
"    \"\"\"\n",
"    words1 = set(sent1.lower().split())\n",
"    words2 = set(sent2.lower().split())\n",
"    intersection = words1.intersection(words2)\n",
"    union = words1.union(words2)\n",
"    return float(len(intersection)) / len(union) if union else 0.0\n",
"\n",
"# Removed unused `small_test_pairs` (it was defined but never referenced;\n",
"# the demo below iterates over `test_pairs` from the first cell).\n",
"\n",
"print(f\"{'Sentence 1':<41} {'Sentence 2:':<40} {'Similarity Score:'}\")\n",
"print(\"=\" * 100)\n",
"\n",
"for sent1, sent2 in test_pairs:\n",
"    similarity = jaccard_similarity(sent1, sent2)\n",
"    print(f\"{sent1:<30} {'vs':<10} {sent2:<40} {similarity:.3f}\")  # 3 decimal places"
]
},
{
"cell_type": "markdown",
"id": "337a1072",
"metadata": {},
"source": [
"### --- Levenshtein Similarity ---\n",
" Character & Word"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "0b68fdcd",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Sentence 1 Sentence 2: Similarities -> Char: Word:\n",
"====================================================================================================\n"
]
},
{
"ename": "NameError",
"evalue": "name 'test_pairs' is not defined",
"output_type": "error",
"traceback": [
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
"\u001b[31mNameError\u001b[39m Traceback (most recent call last)",
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[1]\u001b[39m\u001b[32m, line 31\u001b[39m\n\u001b[32m 28\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[33m'\u001b[39m\u001b[33mSentence 1\u001b[39m\u001b[33m'\u001b[39m\u001b[38;5;132;01m:\u001b[39;00m\u001b[33m<41\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[33m'\u001b[39m\u001b[33mSentence 2:\u001b[39m\u001b[33m'\u001b[39m\u001b[38;5;132;01m:\u001b[39;00m\u001b[33m<20\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[33m'\u001b[39m\u001b[33mSimilarities ->\u001b[39m\u001b[33m'\u001b[39m\u001b[38;5;132;01m:\u001b[39;00m\u001b[33m<19\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[33m'\u001b[39m\u001b[33mChar:\u001b[39m\u001b[33m'\u001b[39m\u001b[38;5;132;01m:\u001b[39;00m\u001b[33m<10\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[33m'\u001b[39m\u001b[33mWord:\u001b[39m\u001b[33m'\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m)\n\u001b[32m 29\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33m\"\u001b[39m\u001b[33m=\u001b[39m\u001b[33m\"\u001b[39m * \u001b[32m100\u001b[39m)\n\u001b[32m---> \u001b[39m\u001b[32m31\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m sent1, sent2 \u001b[38;5;129;01min\u001b[39;00m \u001b[43mtest_pairs\u001b[49m:\n\u001b[32m 32\u001b[39m char_similarity = char_levenshtein_similarity(sent1, sent2)\n\u001b[32m 33\u001b[39m word_similarity = word_levenshtein_similarity(sent1, sent2)\n",
"\u001b[31mNameError\u001b[39m: name 'test_pairs' is not defined"
]
}
],
"source": [
"def char_levenshtein_similarity(sent1, sent2):\n",
"    \"\"\"Character-based edit-distance similarity in [0, 1].\n",
"\n",
"    1 - (Levenshtein distance / length of the longer string).\n",
"    Returns 1.0 for two empty strings, 0.0 when exactly one is empty.\n",
"    NOTE(review): unlike word_levenshtein_similarity below, this does\n",
"    NOT lowercase its inputs, so it is case-sensitive -- confirm that\n",
"    is intended before changing (the saved outputs depend on it).\n",
"    \"\"\"\n",
"    if not sent1 and not sent2:\n",
"        return 1.0\n",
"    if not sent1 or not sent2:\n",
"        return 0.0\n",
"\n",
"    # Normalise the raw edit distance by the longer string's length.\n",
"    max_len = max(len(sent1), len(sent2))\n",
"    edit_distance = distance.Levenshtein.distance(sent1, sent2)\n",
"    return 1 - (edit_distance / max_len)\n",
"\n",
"def word_levenshtein_similarity(sent1, sent2):\n",
"    \"\"\"Word-based (token-level) edit-distance similarity in [0, 1].\n",
"\n",
"    Lowercases and splits on whitespace, then computes Levenshtein\n",
"    distance over the token sequences. Returns 1.0 for two empty\n",
"    inputs, 0.0 when exactly one side has no tokens.\n",
"    \"\"\"\n",
"    words1 = sent1.lower().split()\n",
"    words2 = sent2.lower().split()\n",
"\n",
"    if not words1 and not words2:\n",
"        return 1.0\n",
"    if not words1 or not words2:\n",
"        return 0.0\n",
"\n",
"    # rapidfuzz accepts sequences of hashables, so token lists work here.\n",
"    max_len = max(len(words1), len(words2))\n",
"    edit_distance = distance.Levenshtein.distance(words1, words2)\n",
"    return 1 - (edit_distance / max_len)\n",
"\n",
"\n",
"print(f\"{'Sentence 1':<41} {'Sentence 2:':<20} {'Similarities ->':<19} {'Char:':<10} {'Word:'}\")\n",
"print(\"=\" * 100)\n",
"\n",
"for sent1, sent2 in test_pairs:\n",
"    char_similarity = char_levenshtein_similarity(sent1, sent2)\n",
"    word_similarity = word_levenshtein_similarity(sent1, sent2)\n",
"    # NOTE(review): ':.3F' (uppercase F) behaves like ':.3f' for finite\n",
"    # values; probably meant lowercase 'f'.\n",
"    print(f\"{sent1:<30} {'vs':<10} {sent2:<40} {char_similarity:.3f}{'':<5} {word_similarity:.3F} \") # 3 decimal places\n",
"    # print(\"-\" * 50)"
]
},
{
"cell_type": "markdown",
"id": "bae45c9a",
"metadata": {},
"source": [
"### --- Cosine Similarity ---"
]
},
{
"cell_type": "code",
"execution_count": 38,
"id": "46a985b4",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Sentence 1 Sentence 2: Similarity Score:\n",
"====================================================================================================\n",
"The cat sat on the mat. vs The cat sat on the mat. 1.000\n",
"The cat sat on the mat. vs The cat sat on the mat 1.000\n",
"The cat sat on the mat. vs The cat sat on the mat. 1.000\n",
"The cat sat on the mat. vs On the mat, the cat was sitting. 0.825\n",
"The cat sat on the mat. vs The feline rested on the rug. 0.625\n",
"The quick brown fox jumps. vs A fast brown fox leaps. 0.400\n",
"The cat sat on the mat. vs The dog ran in the park. 0.500\n",
"I love programming. vs She enjoys reading books. 0.000\n",
"The weather is nice today. vs It's raining outside. 0.000\n",
"Short. vs Short. 1.000\n",
"A B C D E F G vs A B C D E F G 1.000\n",
" vs 1.000\n"
]
}
],
"source": [
"def cosine_similarity_bow(sent1, sent2):\n",
"    \"\"\"Cosine similarity between bag-of-words count vectors.\n",
"\n",
"    Lowercases, strips punctuation, then compares term-frequency\n",
"    vectors over the joint vocabulary. Returns 1.0 for two empty\n",
"    inputs and 0.0 when exactly one side ends up with no words.\n",
"    \"\"\"\n",
"    strip_punct = str.maketrans('', '', string.punctuation)\n",
"    words1 = sent1.lower().translate(strip_punct).split()\n",
"    words2 = sent2.lower().translate(strip_punct).split()\n",
"\n",
"    if not words1 and not words2:\n",
"        return 1.0\n",
"\n",
"    # Joint vocabulary; non-empty here because at least one side has\n",
"    # words. (A previous `if not vocabulary` guard was unreachable --\n",
"    # the both-empty case already returned above -- and was removed.)\n",
"    vocabulary = set(words1 + words2)\n",
"\n",
"    # Term-frequency vectors laid out in the same vocabulary order.\n",
"    freq1 = Counter(words1)\n",
"    freq2 = Counter(words2)\n",
"    vec1 = np.array([freq1[word] for word in vocabulary])\n",
"    vec2 = np.array([freq2[word] for word in vocabulary])\n",
"\n",
"    dot_product = np.dot(vec1, vec2)\n",
"    norm1 = np.linalg.norm(vec1)\n",
"    norm2 = np.linalg.norm(vec2)\n",
"\n",
"    # One side all-punctuation collapses to a zero vector -> similarity 0.\n",
"    if norm1 == 0 or norm2 == 0:\n",
"        return 0.0\n",
"\n",
"    return dot_product / (norm1 * norm2)\n",
"\n",
"print(f\"{'Sentence 1':<41} {'Sentence 2:':<40} {'Similarity Score:'}\")\n",
"print(\"=\" * 100)\n",
"\n",
"for sent1, sent2 in test_pairs:\n",
"    similarity = cosine_similarity_bow(sent1, sent2)\n",
"    print(f\"{sent1:<30} {'vs':<10} {sent2:<40} {similarity:.3f}\")  # 3 decimal places"
]
},
{
"cell_type": "markdown",
"id": "658276dc",
"metadata": {},
"source": [
"### --- Fuzzy ratios ---"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "7dc7ac2e",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Fuzzy ratio Examples\n",
"==========================================================================================\n",
"Case Ratio Partial Token Sort Token Set Description\n",
"==========================================================================================\n",
"The cat sat on the mat. 100 100 100 100 Exact copy\n",
"The cat sat on the mat. 97 100 97 97 Different content\n",
"The cat sat on the mat. 90 82 100 100 Same words, different order\n",
"The cat sat on the mat. 47 57 58 62 Different content\n",
"The cat sat on the mat. 61 60 53 60 Different content\n",
"The quick brown fox jumps. 61 70 57 57 Different content\n",
"The cat sat on the mat. 63 63 55 55 Different content\n",
"I love programming. 40 43 40 40 Different content\n",
"The weather is nice today. 38 47 29 29 Different content\n",
"Short. 100 100 100 100 Exact copy\n",
"A B C D E F G 100 100 100 100 Exact copy\n",
" 100 100 100 0 Exact copy\n"
]
}
],
"source": [
"def fuzzy_ratio_similarity(sent1, sent2):\n",
"    \"\"\"Plain fuzz ratio on lowercased text, rescaled to [0, 1].\"\"\"\n",
"    score = fuzz.ratio(sent1.lower(), sent2.lower())\n",
"    return score / 100.0\n",
"\n",
"def fuzzy_partial_ratio(sent1, sent2):\n",
"    \"\"\"Best-matching-substring fuzz ratio, lowercased, in [0, 1].\"\"\"\n",
"    score = fuzz.partial_ratio(sent1.lower(), sent2.lower())\n",
"    return score / 100.0\n",
"\n",
"def fuzzy_token_sort_ratio(sent1, sent2):\n",
"    \"\"\"Fuzz ratio after sorting tokens (order-insensitive), in [0, 1].\"\"\"\n",
"    score = fuzz.token_sort_ratio(sent1.lower(), sent2.lower())\n",
"    return score / 100.0\n",
"\n",
"def fuzzy_token_set_ratio(sent1, sent2):\n",
"    \"\"\"Token-set fuzz ratio (robust to duplicates), in [0, 1].\"\"\"\n",
"    score = fuzz.token_set_ratio(sent1.lower(), sent2.lower())\n",
"    return score / 100.0\n",
"\n",
"def _describe_pair(a, b):\n",
"    \"\"\"Short label for how two raw (uncased) sentences relate.\"\"\"\n",
"    if a == b:\n",
"        return \"Exact copy\"\n",
"    tokens_a, tokens_b = a.split(), b.split()\n",
"    if sorted(tokens_a) == sorted(tokens_b):\n",
"        return \"Same words, different order\"\n",
"    set_a, set_b = set(tokens_a), set(tokens_b)\n",
"    if set_a <= set_b or set_b <= set_a:\n",
"        return \"Subset relationship\"\n",
"    return \"Different content\"\n",
"\n",
"print(\"Fuzzy ratio Examples\")\n",
"print(\"=\" * 90)\n",
"print(f\"{'Case':<30} {'Ratio':<8} {'Partial':<8} {'Token Sort':<12} {'Token Set':<8} Description\")\n",
"print(\"=\" * 90)\n",
"\n",
"for left, right in test_pairs:\n",
"    # Raw (case-sensitive) scores, truncated to whole percentages,\n",
"    # in the same order as the table header.\n",
"    scorers = (fuzz.ratio, fuzz.partial_ratio, fuzz.token_sort_ratio, fuzz.token_set_ratio)\n",
"    ratio, partial, token_sort, token_set = (int(scorer(left, right)) for scorer in scorers)\n",
"    desc = _describe_pair(left, right)\n",
"    print(f\"{left[:28]:<30} {ratio:<8} {partial:<8} {token_sort:<12} {token_set:<8} {desc}\")"
]
},
{
"cell_type": "markdown",
"id": "a48774d4",
"metadata": {},
"source": [
"### --- Longest common sub-sequence ----"
]
},
{
"cell_type": "code",
"execution_count": 37,
"id": "e6a4d4e2",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Sentence 1 Sentence 2: Similarity Score:\n",
"====================================================================================================\n",
"The cat sat on the mat. vs The cat sat on the mat. 1.000\n",
"The cat sat on the mat. vs The cat sat on the mat 1.000\n",
"The cat sat on the mat. vs The cat sat on the mat. 0.815\n",
"The cat sat on the mat. vs On the mat, the cat was sitting. 0.433\n",
"The cat sat on the mat. vs The feline rested on the rug. 0.536\n",
"The quick brown fox jumps. vs A fast brown fox leaps. 0.560\n",
"The cat sat on the mat. vs The dog ran in the park. 0.609\n",
"I love programming. vs She enjoys reading books. 0.333\n",
"The weather is nice today. vs It's raining outside. 0.360\n",
"Short. vs Short. 1.000\n",
"A B C D E F G vs A B C D E F G 1.000\n",
" vs 0.000\n"
]
}
],
"source": [
"def longest_common_subsequence(sent1, sent2):\n",
"    \"\"\"LCS-based character similarity in [0, 1].\n",
"\n",
"    Lowercases and strips punctuation, then normalises the LCS length\n",
"    by the longer cleaned string. Returns 0.0 if either raw input is\n",
"    empty (so two empty strings score 0.0 here, unlike some of the\n",
"    other metrics in this notebook).\n",
"    \"\"\"\n",
"    if not sent1 or not sent2:\n",
"        return 0.0\n",
"\n",
"    # Clean both sides the same way before comparing characters.\n",
"    strip_punct = str.maketrans('', '', string.punctuation)\n",
"    a = sent1.lower().translate(strip_punct)\n",
"    b = sent2.lower().translate(strip_punct)\n",
"\n",
"    # Classic prefix DP, kept to two rolling rows instead of the full\n",
"    # (m+1) x (n+1) table; produces the identical LCS length.\n",
"    prev = [0] * (len(b) + 1)\n",
"    for ch_a in a:\n",
"        curr = [0]\n",
"        for j, ch_b in enumerate(b, start=1):\n",
"            if ch_a == ch_b:\n",
"                curr.append(prev[j - 1] + 1)\n",
"            else:\n",
"                curr.append(max(prev[j], curr[j - 1]))\n",
"        prev = curr\n",
"\n",
"    longest = max(len(a), len(b))\n",
"    return prev[len(b)] / longest if longest > 0 else 0.0\n",
"\n",
"print(f\"{'Sentence 1':<41} {'Sentence 2:':<40} {'Similarity Score:'}\")\n",
"print(\"=\" * 100)\n",
"\n",
"for left, right in test_pairs:\n",
"    similarity = longest_common_subsequence(left, right)\n",
"    print(f\"{left:<30} {'vs':<10} {right:<40} {similarity:.3f}\")  # 3 decimal places"
]
},
{
"cell_type": "markdown",
"id": "1a532335",
"metadata": {},
"source": [
"### --- Containment Similarity ---\n",
"Percentage of Sentence A in Sentence B <br>\n",
"containment(A, B) = |words(A) ∩ words(B)| / |words(A)|"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "493979a4",
"metadata": {},
"outputs": [],
"source": [
"def containment_similarity(sent1, sent2):\n",
"    \"\"\"Asymmetric containment: fraction of sent1's words found in sent2.\n",
"\n",
"    containment(A, B) = |words(A) & words(B)| / |words(A)|, computed on\n",
"    lowercased, punctuation-stripped word sets. Returns 0.0 when sent1\n",
"    has no words after cleaning. Note the asymmetry: swapping the\n",
"    arguments generally changes the score.\n",
"    \"\"\"\n",
"    strip_punct = str.maketrans('', '', string.punctuation)\n",
"    vocab_a = set(sent1.lower().translate(strip_punct).split())\n",
"    vocab_b = set(sent2.lower().translate(strip_punct).split())\n",
"\n",
"    if not vocab_a:\n",
"        return 0.0\n",
"\n",
"    return len(vocab_a & vocab_b) / len(vocab_a)\n",
"\n"
]
},
{
"cell_type": "markdown",
"id": "912a9c7c",
"metadata": {},
"source": [
"### Evaluate Baseline Methods\n"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "b3d07562",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Pair Sentence 1 Sentence 2 \n",
"====================================================================================================\n",
"Pair 1: The cat sat on the mat. The cat sat on the mat. \n",
" Jaccard: 1.000\n",
" Levenshtein (char): 1.000\n",
" Levenshtein (word): 1.000\n",
" Cosine BOW: 1.000\n",
" Fuzzy Ratio: 1.000\n",
" Fuzzy Partial: 1.000\n",
" Fuzzy Token Sort: 1.000\n",
" Fuzzy Token Set: 1.000\n",
" LCS: 1.000\n",
" Containment: 1.000\n",
"----------------------------------------------------------------------------------------------------\n",
"Pair 2: The cat sat on the mat. The cat sat on the mat \n",
" Jaccard: 0.667\n",
" Levenshtein (char): 0.957\n",
" Levenshtein (word): 0.833\n",
" Cosine BOW: 1.000\n",
" Fuzzy Ratio: 0.978\n",
" Fuzzy Partial: 1.000\n",
" Fuzzy Token Sort: 0.978\n",
" Fuzzy Token Set: 0.973\n",
" LCS: 1.000\n",
" Containment: 1.000\n",
"----------------------------------------------------------------------------------------------------\n",
"Pair 3: The cat sat on the mat. The cat sat on the mat. \n",
" Jaccard: 1.000\n",
" Levenshtein (char): 0.821\n",
" Levenshtein (word): 1.000\n",
" Cosine BOW: 1.000\n",
" Fuzzy Ratio: 0.902\n",
" Fuzzy Partial: 0.826\n",
" Fuzzy Token Sort: 1.000\n",
" Fuzzy Token Set: 1.000\n",
" LCS: 0.815\n",
" Containment: 1.000\n",
"----------------------------------------------------------------------------------------------------\n",
"Pair 4: The cat sat on the mat. On the mat, the cat was sit...\n",
" Jaccard: 0.375\n",
" Levenshtein (char): 0.344\n",
" Levenshtein (word): 0.143\n",
" Cosine BOW: 0.825\n",
" Fuzzy Ratio: 0.509\n",
" Fuzzy Partial: 0.619\n",
" Fuzzy Token Sort: 0.764\n",
" Fuzzy Token Set: 0.723\n",
" LCS: 0.433\n",
" Containment: 0.800\n",
"----------------------------------------------------------------------------------------------------\n",
"Pair 5: The cat sat on the mat. The feline rested on the rug. \n",
" Jaccard: 0.250\n",
" Levenshtein (char): 0.517\n",
" Levenshtein (word): 0.500\n",
" Cosine BOW: 0.625\n",
" Fuzzy Ratio: 0.615\n",
" Fuzzy Partial: 0.605\n",
" Fuzzy Token Sort: 0.538\n",
" Fuzzy Token Set: 0.480\n",
" LCS: 0.536\n",
" Containment: 0.400\n",
"----------------------------------------------------------------------------------------------------\n",
"Pair 6: The quick brown fox jumps. A fast brown fox leaps. \n",
" Jaccard: 0.250\n",
" Levenshtein (char): 0.577\n",
" Levenshtein (word): 0.400\n",
" Cosine BOW: 0.400\n",
" Fuzzy Ratio: 0.612\n",
" Fuzzy Partial: 0.700\n",
" Fuzzy Token Sort: 0.531\n",
" Fuzzy Token Set: 0.562\n",
" LCS: 0.560\n",
" Containment: 0.400\n",
"----------------------------------------------------------------------------------------------------\n",
"Pair 7: The cat sat on the mat. The dog ran in the park. \n",
" Jaccard: 0.111\n",
" Levenshtein (char): 0.625\n",
" Levenshtein (word): 0.333\n",
" Cosine BOW: 0.500\n",
" Fuzzy Ratio: 0.638\n",
" Fuzzy Partial: 0.636\n",
" Fuzzy Token Sort: 0.553\n",
" Fuzzy Token Set: 0.462\n",
" LCS: 0.609\n",
" Containment: 0.200\n",
"----------------------------------------------------------------------------------------------------\n",
"Pair 8: I love programming. She enjoys reading books. \n",
" Jaccard: 0.000\n",
" Levenshtein (char): 0.200\n",
" Levenshtein (word): 0.000\n",
" Cosine BOW: 0.000\n",
" Fuzzy Ratio: 0.409\n",
" Fuzzy Partial: 0.432\n",
" Fuzzy Token Sort: 0.364\n",
" Fuzzy Token Set: 0.364\n",
" LCS: 0.333\n",
" Containment: 0.000\n",
"----------------------------------------------------------------------------------------------------\n",
"Pair 9: The weather is nice today. It's raining outside. \n",
" Jaccard: 0.000\n",
" Levenshtein (char): 0.192\n",
" Levenshtein (word): 0.000\n",
" Cosine BOW: 0.000\n",
" Fuzzy Ratio: 0.426\n",
" Fuzzy Partial: 0.514\n",
" Fuzzy Token Sort: 0.340\n",
" Fuzzy Token Set: 0.340\n",
" LCS: 0.360\n",
" Containment: 0.000\n",
"----------------------------------------------------------------------------------------------------\n",
"Pair 10: Short. Short. \n",
" Jaccard: 1.000\n",
" Levenshtein (char): 1.000\n",
" Levenshtein (word): 1.000\n",
" Cosine BOW: 1.000\n",
" Fuzzy Ratio: 1.000\n",
" Fuzzy Partial: 1.000\n",
" Fuzzy Token Sort: 1.000\n",
" Fuzzy Token Set: 1.000\n",
" LCS: 1.000\n",
" Containment: 1.000\n",
"----------------------------------------------------------------------------------------------------\n",
"Pair 11: A B C D E F G A B C D E F G \n",
" Jaccard: 1.000\n",
" Levenshtein (char): 1.000\n",
" Levenshtein (word): 1.000\n",
" Cosine BOW: 1.000\n",
" Fuzzy Ratio: 1.000\n",
" Fuzzy Partial: 1.000\n",
" Fuzzy Token Sort: 1.000\n",
" Fuzzy Token Set: 1.000\n",
" LCS: 1.000\n",
" Containment: 1.000\n",
"----------------------------------------------------------------------------------------------------\n",
"Pair 12: \n",
" Jaccard: 0.000\n",
" Levenshtein (char): 1.000\n",
" Levenshtein (word): 1.000\n",
" Cosine BOW: 1.000\n",
" Fuzzy Ratio: 1.000\n",
" Fuzzy Partial: 1.000\n",
" Fuzzy Token Sort: 1.000\n",
" Fuzzy Token Set: 0.000\n",
" LCS: 0.000\n",
" Containment: 0.000\n",
"----------------------------------------------------------------------------------------------------\n"
]
}
],
"source": [
"def evaluate_baseline_methods(pairs):\n",
"    \"\"\"Run every baseline similarity method over `pairs`.\n",
"\n",
"    Returns (results, methods): `results` maps method name -> list of\n",
"    scores aligned with `pairs`; `methods` maps name -> function.\n",
"    \"\"\"\n",
"    methods = {\n",
"        'Jaccard': jaccard_similarity,\n",
"        'Levenshtein (char)': char_levenshtein_similarity,\n",
"        'Levenshtein (word)': word_levenshtein_similarity,\n",
"        'Cosine BOW': cosine_similarity_bow,\n",
"        'Fuzzy Ratio': fuzzy_ratio_similarity,\n",
"        'Fuzzy Partial': fuzzy_partial_ratio,\n",
"        'Fuzzy Token Sort': fuzzy_token_sort_ratio,\n",
"        'Fuzzy Token Set': fuzzy_token_set_ratio,\n",
"        'LCS': longest_common_subsequence,\n",
"        'Containment': containment_similarity,\n",
"    }\n",
"\n",
"    # One score list per method, in the same order as `pairs`.\n",
"    results = {\n",
"        name: [func(s1, s2) for s1, s2 in pairs]\n",
"        for name, func in methods.items()\n",
"    }\n",
"    return results, methods\n",
"\n",
"def print_comparison_table(results, pairs):\n",
"    \"\"\"Pretty-print per-pair scores for every method in `results`.\"\"\"\n",
"    print(f\"{'Pair':<30} {'Sentence 1':<30} {'Sentence 2':<30}\")\n",
"    print(\"=\" * 100)\n",
"\n",
"    for i, (s1, s2) in enumerate(pairs):\n",
"        # Truncate long sentences so the columns stay aligned.\n",
"        shown1 = s1 if len(s1) <= 30 else s1[:27] + \"...\"\n",
"        shown2 = s2 if len(s2) <= 30 else s2[:27] + \"...\"\n",
"        print(f\"{f'Pair {i+1}:':<30} {shown1:<30} {shown2:<30}\")\n",
"\n",
"        for method_name in results:\n",
"            print(f\"{'':<30} {method_name + ':':<20} {results[method_name][i]:.3f}\")\n",
"        print(\"-\" * 100)\n",
"\n",
"results, methods = evaluate_baseline_methods(test_pairs)\n",
"print_comparison_table(results, test_pairs)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}