674 lines
34 KiB
Plaintext
674 lines
34 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 1,
|
|
"id": "d2aa2997",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"done\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# pip install rapidfuzz scikit-learn numpy\n",
"\n",
"import numpy as np\n",
"from collections import Counter\n",
"import string\n",
"from rapidfuzz import fuzz, distance\n",
"\n",
"# Shared test pairs: every similarity cell below iterates over these,\n",
"# so this cell must run first on a fresh kernel.\n",
"test_pairs = [\n",
"    # Direct copies and near-copies\n",
"    (\"The cat sat on the mat.\", \"The cat sat on the mat.\"), # Exact copy\n",
"    (\"The cat sat on the mat.\", \"The cat sat on the mat\"), # No punctuation\n",
"    (\"The cat sat on the mat.\", \"The  cat  sat  on  the  mat.\"), # Extra spaces\n",
"\n",
"    # Paraphrases with same meaning\n",
"    (\"The cat sat on the mat.\", \"On the mat, the cat was sitting.\"), # Structural change\n",
"    (\"The cat sat on the mat.\", \"The feline rested on the rug.\"), # Synonym replacement\n",
"    (\"The quick brown fox jumps.\", \"A fast brown fox leaps.\"), # Partial synonym\n",
"\n",
"    # Different sentences\n",
"    (\"The cat sat on the mat.\", \"The dog ran in the park.\"), # Different content\n",
"    (\"I love programming.\", \"She enjoys reading books.\"), # Completely different\n",
"    (\"The weather is nice today.\", \"It's raining outside.\"), # Opposite meaning\n",
"\n",
"    # Edge cases\n",
"    (\"Short.\", \"Short.\"), # Very short\n",
"    (\"A B C D E F G\", \"A B C D E F G\"), # Repeated words\n",
"    (\"\", \"\"), # Empty strings\n",
"]\n",
"print(\"done\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "b06d10d0",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Jaccard Similarity "
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 58,
|
|
"id": "e60d024e969254a",
|
|
"metadata": {
|
|
"ExecuteTime": {
|
|
"end_time": "2025-11-19T10:01:11.039074Z",
|
|
"start_time": "2025-11-19T10:01:09.613806Z"
|
|
}
|
|
},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Sentence 1 Sentence 2: Similarity Score:\n",
|
|
"====================================================================================================\n",
|
|
"The cat sat on the mat. vs The cat sat on the mat. 1.000\n",
|
|
"The cat sat on the mat. vs The cat sat on the mat 0.667\n",
|
|
"The cat sat on the mat. vs The cat sat on the mat. 1.000\n",
|
|
"The cat sat on the mat. vs On the mat, the cat was sitting. 0.375\n",
|
|
"The cat sat on the mat. vs The feline rested on the rug. 0.250\n",
|
|
"The quick brown fox jumps. vs A fast brown fox leaps. 0.250\n",
|
|
"The cat sat on the mat. vs The dog ran in the park. 0.111\n",
|
|
"I love programming. vs She enjoys reading books. 0.000\n",
|
|
"The weather is nice today. vs It's raining outside. 0.000\n",
|
|
"Short. vs Short. 1.000\n",
|
|
"A B C D E F G vs A B C D E F G 1.000\n",
|
|
" vs 0.000\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"def jaccard_similarity(sent1, sent2):\n",
"    \"\"\"Jaccard similarity of the two sentences' word sets, in [0, 1].\n",
"\n",
"    Lowercases and splits on whitespace; punctuation stays attached to\n",
"    words (so 'mat.' != 'mat'). Returns intersection-over-union of the\n",
"    resulting word sets, or 0.0 when both sentences are empty.\n",
"    \"\"\"\n",
"    words1 = set(sent1.lower().split())\n",
"    words2 = set(sent2.lower().split())\n",
"    intersection = words1.intersection(words2)\n",
"    union = words1.union(words2)\n",
"    return float(len(intersection)) / len(union) if union else 0.0\n",
"\n",
"print(f\"{'Sentence 1':<41} {'Sentence 2:':<40} {'Similarity Score:'}\")\n",
"print(\"=\" * 100)\n",
"\n",
"for sent1, sent2 in test_pairs:\n",
"    similarity = jaccard_similarity(sent1, sent2)\n",
"    print(f\"{sent1:<30} {'vs':<10} {sent2:<40} {similarity:.3f}\") # 3 decimal places\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "337a1072",
|
|
"metadata": {},
|
|
"source": [
|
|
"### --- Levenshtein Similarity ---\n",
|
|
" Character & Word"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 1,
|
|
"id": "0b68fdcd",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Sentence 1 Sentence 2: Similarities -> Char: Word:\n",
|
|
"====================================================================================================\n"
|
|
]
|
|
},
|
|
{
|
|
"ename": "NameError",
|
|
"evalue": "name 'test_pairs' is not defined",
|
|
"output_type": "error",
|
|
"traceback": [
|
|
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
|
|
"\u001b[31mNameError\u001b[39m Traceback (most recent call last)",
|
|
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[1]\u001b[39m\u001b[32m, line 31\u001b[39m\n\u001b[32m 28\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[33m'\u001b[39m\u001b[33mSentence 1\u001b[39m\u001b[33m'\u001b[39m\u001b[38;5;132;01m:\u001b[39;00m\u001b[33m<41\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[33m'\u001b[39m\u001b[33mSentence 2:\u001b[39m\u001b[33m'\u001b[39m\u001b[38;5;132;01m:\u001b[39;00m\u001b[33m<20\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[33m'\u001b[39m\u001b[33mSimilarities ->\u001b[39m\u001b[33m'\u001b[39m\u001b[38;5;132;01m:\u001b[39;00m\u001b[33m<19\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[33m'\u001b[39m\u001b[33mChar:\u001b[39m\u001b[33m'\u001b[39m\u001b[38;5;132;01m:\u001b[39;00m\u001b[33m<10\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[33m'\u001b[39m\u001b[33mWord:\u001b[39m\u001b[33m'\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m)\n\u001b[32m 29\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33m\"\u001b[39m\u001b[33m=\u001b[39m\u001b[33m\"\u001b[39m * \u001b[32m100\u001b[39m)\n\u001b[32m---> \u001b[39m\u001b[32m31\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m sent1, sent2 \u001b[38;5;129;01min\u001b[39;00m \u001b[43mtest_pairs\u001b[49m:\n\u001b[32m 32\u001b[39m char_similarity = char_levenshtein_similarity(sent1, sent2)\n\u001b[32m 33\u001b[39m word_similarity = word_levenshtein_similarity(sent1, sent2)\n",
|
|
"\u001b[31mNameError\u001b[39m: name 'test_pairs' is not defined"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"def char_levenshtein_similarity(sent1, sent2):\n",
"    \"\"\"Character-level edit-distance similarity in [0, 1].\n",
"\n",
"    Computed as 1 - Levenshtein(sent1, sent2) / max(len(sent1), len(sent2))\n",
"    on the raw strings.\n",
"    NOTE(review): unlike word_levenshtein_similarity below, this does NOT\n",
"    lowercase its inputs, so it is case-sensitive - confirm that is intended.\n",
"    \"\"\"\n",
"    if not sent1 and not sent2:\n",
"        return 1.0  # two empty strings are identical\n",
"    if not sent1 or not sent2:\n",
"        return 0.0  # exactly one empty: nothing in common\n",
"\n",
"    max_len = max(len(sent1), len(sent2))\n",
"    edit_distance = distance.Levenshtein.distance(sent1, sent2)\n",
"    return 1 - (edit_distance / max_len)\n",
"\n",
"def word_levenshtein_similarity(sent1, sent2):\n",
"    \"\"\"Word-level (token) edit-distance similarity in [0, 1].\n",
"\n",
"    Lowercases, splits on whitespace, then applies Levenshtein over the\n",
"    token sequences: 1 - distance / max(token count).\n",
"    \"\"\"\n",
"    words1 = sent1.lower().split()\n",
"    words2 = sent2.lower().split()\n",
"\n",
"    if not words1 and not words2:\n",
"        return 1.0  # both empty -> identical\n",
"    if not words1 or not words2:\n",
"        return 0.0\n",
"\n",
"    max_len = max(len(words1), len(words2))\n",
"    edit_distance = distance.Levenshtein.distance(words1, words2)\n",
"    return 1 - (edit_distance / max_len)\n",
"\n",
"\n",
"\n",
"print(f\"{'Sentence 1':<41} {'Sentence 2:':<20} {'Similarities ->':<19} {'Char:':<10} {'Word:'}\")\n",
"print(\"=\" * 100)\n",
"\n",
"# Requires test_pairs from the first cell - run that cell first.\n",
"for sent1, sent2 in test_pairs:\n",
"    char_similarity = char_levenshtein_similarity(sent1, sent2)\n",
"    word_similarity = word_levenshtein_similarity(sent1, sent2)\n",
"    print(f\"{sent1:<30} {'vs':<10} {sent2:<40} {char_similarity:.3f}{'':<5} {word_similarity:.3F} \") # 3 decimal places\n",
"    #print(\"-\"* 50"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "bae45c9a",
|
|
"metadata": {},
|
|
"source": [
|
|
"### --- Cosine Similarity ---"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 38,
|
|
"id": "46a985b4",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Sentence 1 Sentence 2: Similarity Score:\n",
|
|
"====================================================================================================\n",
|
|
"The cat sat on the mat. vs The cat sat on the mat. 1.000\n",
|
|
"The cat sat on the mat. vs The cat sat on the mat 1.000\n",
|
|
"The cat sat on the mat. vs The cat sat on the mat. 1.000\n",
|
|
"The cat sat on the mat. vs On the mat, the cat was sitting. 0.825\n",
|
|
"The cat sat on the mat. vs The feline rested on the rug. 0.625\n",
|
|
"The quick brown fox jumps. vs A fast brown fox leaps. 0.400\n",
|
|
"The cat sat on the mat. vs The dog ran in the park. 0.500\n",
|
|
"I love programming. vs She enjoys reading books. 0.000\n",
|
|
"The weather is nice today. vs It's raining outside. 0.000\n",
|
|
"Short. vs Short. 1.000\n",
|
|
"A B C D E F G vs A B C D E F G 1.000\n",
|
|
" vs 1.000\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"def cosine_similarity_bow(sent1, sent2):\n",
"    \"\"\"Cosine similarity between bag-of-words count vectors, in [0, 1].\n",
"\n",
"    Lowercases, strips punctuation, splits on whitespace, builds a term\n",
"    frequency vector per sentence over the joint vocabulary, and returns\n",
"    the cosine of the angle between them as a plain Python float.\n",
"    Two empty sentences count as identical (1.0); if exactly one vector\n",
"    ends up all zeros the similarity is 0.0.\n",
"    \"\"\"\n",
"    words1 = sent1.lower().translate(str.maketrans('', '', string.punctuation)).split()\n",
"    words2 = sent2.lower().translate(str.maketrans('', '', string.punctuation)).split()\n",
"\n",
"    if not words1 and not words2:\n",
"        return 1.0\n",
"\n",
"    # Joint vocabulary; non-empty here because at least one list has words\n",
"    # (the both-empty case already returned above).\n",
"    vocabulary = set(words1 + words2)\n",
"\n",
"    # Term-frequency vectors over the shared vocabulary\n",
"    freq1 = Counter(words1)\n",
"    freq2 = Counter(words2)\n",
"    vec1 = np.array([freq1[word] for word in vocabulary])\n",
"    vec2 = np.array([freq2[word] for word in vocabulary])\n",
"\n",
"    # Compute cosine similarity\n",
"    dot_product = np.dot(vec1, vec2)\n",
"    norm1 = np.linalg.norm(vec1)\n",
"    norm2 = np.linalg.norm(vec2)\n",
"\n",
"    if norm1 == 0 or norm2 == 0:\n",
"        return 0.0  # one sentence had only punctuation/whitespace\n",
"\n",
"    return float(dot_product / (norm1 * norm2))\n",
"\n",
"print(f\"{'Sentence 1':<41} {'Sentence 2:':<40} {'Similarity Score:'}\")\n",
"print(\"=\" * 100)\n",
"\n",
"for sent1, sent2 in test_pairs:\n",
"    similarity = cosine_similarity_bow(sent1, sent2)\n",
"    print(f\"{sent1:<30} {'vs':<10} {sent2:<40} {similarity:.3f}\") # 3 decimal places\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "658276dc",
|
|
"metadata": {},
|
|
"source": [
|
|
"### --- Fuzzy ratios ---"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 5,
|
|
"id": "7dc7ac2e",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Fuzzy ratio Examples\n",
|
|
"==========================================================================================\n",
|
|
"Case Ratio Partial Token Sort Token Set Description\n",
|
|
"==========================================================================================\n",
|
|
"The cat sat on the mat. 100 100 100 100 Exact copy\n",
|
|
"The cat sat on the mat. 97 100 97 97 Different content\n",
|
|
"The cat sat on the mat. 90 82 100 100 Same words, different order\n",
|
|
"The cat sat on the mat. 47 57 58 62 Different content\n",
|
|
"The cat sat on the mat. 61 60 53 60 Different content\n",
|
|
"The quick brown fox jumps. 61 70 57 57 Different content\n",
|
|
"The cat sat on the mat. 63 63 55 55 Different content\n",
|
|
"I love programming. 40 43 40 40 Different content\n",
|
|
"The weather is nice today. 38 47 29 29 Different content\n",
|
|
"Short. 100 100 100 100 Exact copy\n",
|
|
"A B C D E F G 100 100 100 100 Exact copy\n",
|
|
" 100 100 100 0 Exact copy\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"def fuzzy_ratio_similarity(sent1, sent2):\n",
"    \"\"\"Normalised fuzz.ratio on the lowercased sentences, in [0, 1].\"\"\"\n",
"    a, b = sent1.lower(), sent2.lower()\n",
"    return fuzz.ratio(a, b) / 100.0\n",
"\n",
"def fuzzy_partial_ratio(sent1, sent2):\n",
"    \"\"\"Normalised fuzz.partial_ratio (best substring alignment), in [0, 1].\"\"\"\n",
"    a, b = sent1.lower(), sent2.lower()\n",
"    return fuzz.partial_ratio(a, b) / 100.0\n",
"\n",
"def fuzzy_token_sort_ratio(sent1, sent2):\n",
"    \"\"\"Normalised fuzz.token_sort_ratio (order-insensitive), in [0, 1].\"\"\"\n",
"    a, b = sent1.lower(), sent2.lower()\n",
"    return fuzz.token_sort_ratio(a, b) / 100.0\n",
"\n",
"def fuzzy_token_set_ratio(sent1, sent2):\n",
"    \"\"\"Normalised fuzz.token_set_ratio (duplicate-tolerant), in [0, 1].\"\"\"\n",
"    a, b = sent1.lower(), sent2.lower()\n",
"    return fuzz.token_set_ratio(a, b) / 100.0\n",
"\n",
"def _describe_pair(sent1, sent2):\n",
"    \"\"\"Human-readable label for how the two raw sentences relate.\"\"\"\n",
"    if sent1 == sent2:\n",
"        return \"Exact copy\"\n",
"    tokens1, tokens2 = sent1.split(), sent2.split()\n",
"    if sorted(tokens1) == sorted(tokens2):\n",
"        return \"Same words, different order\"\n",
"    set1, set2 = set(tokens1), set(tokens2)\n",
"    if set1 <= set2 or set2 <= set1:\n",
"        return \"Subset relationship\"\n",
"    return \"Different content\"\n",
"\n",
"print(\"Fuzzy ratio Examples\")\n",
"print(\"=\" * 90)\n",
"print(f\"{'Case':<30} {'Ratio':<8} {'Partial':<8} {'Token Sort':<12} {'Token Set':<8} Description\")\n",
"print(\"=\" * 90)\n",
"\n",
"# Raw (case-sensitive, 0-100) scores for the table; the normalised\n",
"# helpers above are used by the evaluation cell at the end.\n",
"scorers = (fuzz.ratio, fuzz.partial_ratio, fuzz.token_sort_ratio, fuzz.token_set_ratio)\n",
"for sent1, sent2 in test_pairs:\n",
"    ratio, partial, token_sort, token_set = [int(score(sent1, sent2)) for score in scorers]\n",
"    desc = _describe_pair(sent1, sent2)\n",
"    print(f\"{sent1[:28]:<30} {ratio:<8} {partial:<8} {token_sort:<12} {token_set:<8} {desc}\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "a48774d4",
|
|
"metadata": {},
|
|
"source": [
|
|
"### --- Longest common sub-sequence ----"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 37,
|
|
"id": "e6a4d4e2",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Sentence 1 Sentence 2: Similarity Score:\n",
|
|
"====================================================================================================\n",
|
|
"The cat sat on the mat. vs The cat sat on the mat. 1.000\n",
|
|
"The cat sat on the mat. vs The cat sat on the mat 1.000\n",
|
|
"The cat sat on the mat. vs The cat sat on the mat. 0.815\n",
|
|
"The cat sat on the mat. vs On the mat, the cat was sitting. 0.433\n",
|
|
"The cat sat on the mat. vs The feline rested on the rug. 0.536\n",
|
|
"The quick brown fox jumps. vs A fast brown fox leaps. 0.560\n",
|
|
"The cat sat on the mat. vs The dog ran in the park. 0.609\n",
|
|
"I love programming. vs She enjoys reading books. 0.333\n",
|
|
"The weather is nice today. vs It's raining outside. 0.360\n",
|
|
"Short. vs Short. 1.000\n",
|
|
"A B C D E F G vs A B C D E F G 1.000\n",
|
|
" vs 0.000\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"def longest_common_subsequence(sent1, sent2):\n",
"    \"\"\"Longest-common-subsequence similarity in [0, 1].\n",
"\n",
"    Lowercases and strips punctuation, then runs the classic O(m*n)\n",
"    dynamic program over characters and normalises the LCS length by\n",
"    the longer cleaned string. Two empty inputs count as identical\n",
"    (1.0), consistent with the Levenshtein similarities above; if only\n",
"    one input is empty the similarity is 0.0.\n",
"    \"\"\"\n",
"    if not sent1 and not sent2:\n",
"        return 1.0  # both empty -> identical (was 0.0, inconsistent with siblings)\n",
"    if not sent1 or not sent2:\n",
"        return 0.0\n",
"\n",
"    # Remove punctuation for better matching\n",
"    sent1_clean = sent1.lower().translate(str.maketrans('', '', string.punctuation))\n",
"    sent2_clean = sent2.lower().translate(str.maketrans('', '', string.punctuation))\n",
"\n",
"    m, n = len(sent1_clean), len(sent2_clean)\n",
"    dp = [[0] * (n + 1) for _ in range(m + 1)]\n",
"\n",
"    for i in range(1, m + 1):\n",
"        for j in range(1, n + 1):\n",
"            if sent1_clean[i-1] == sent2_clean[j-1]:\n",
"                dp[i][j] = dp[i-1][j-1] + 1\n",
"            else:\n",
"                dp[i][j] = max(dp[i-1][j], dp[i][j-1])\n",
"\n",
"    lcs_length = dp[m][n]\n",
"    # Guard max(m, n) == 0: inputs that were pure punctuation clean to ''\n",
"    return lcs_length / max(m, n) if max(m, n) > 0 else 0.0\n",
"\n",
"print(f\"{'Sentence 1':<41} {'Sentence 2:':<40} {'Similarity Score:'}\")\n",
"print(\"=\" * 100)\n",
"\n",
"for sent1, sent2 in test_pairs:\n",
"    similarity = longest_common_subsequence(sent1, sent2)\n",
"    print(f\"{sent1:<30} {'vs':<10} {sent2:<40} {similarity:.3f}\") # 3 decimal places\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "1a532335",
|
|
"metadata": {},
|
|
"source": [
|
|
"### --- Containment Similarity ---\n",
|
|
"Percentage of Sentence A in Sentence B <br>\n",
|
|
"containment(A, B) = |words(A) ∩ words(B)| / |words(A)|"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "493979a4",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"def containment_similarity(sent1, sent2):\n",
"    \"\"\"Asymmetric containment: fraction of sent1's words found in sent2.\n",
"\n",
"    Both sentences are lowercased and punctuation-stripped before being\n",
"    split into word sets. Returns |words(A) & words(B)| / |words(A)|,\n",
"    or 0.0 when sent1 has no words.\n",
"    \"\"\"\n",
"    strip_punct = str.maketrans('', '', string.punctuation)\n",
"    words1 = set(sent1.lower().translate(strip_punct).split())\n",
"    words2 = set(sent2.lower().translate(strip_punct).split())\n",
"\n",
"    if not words1:\n",
"        return 0.0\n",
"\n",
"    return len(words1 & words2) / len(words1)\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "912a9c7c",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Evaluate Baseline Methods\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 25,
|
|
"id": "b3d07562",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Pair Sentence 1 Sentence 2 \n",
|
|
"====================================================================================================\n",
|
|
"Pair 1: The cat sat on the mat. The cat sat on the mat. \n",
|
|
" Jaccard: 1.000\n",
|
|
" Levenshtein (char): 1.000\n",
|
|
" Levenshtein (word): 1.000\n",
|
|
" Cosine BOW: 1.000\n",
|
|
" Fuzzy Ratio: 1.000\n",
|
|
" Fuzzy Partial: 1.000\n",
|
|
" Fuzzy Token Sort: 1.000\n",
|
|
" Fuzzy Token Set: 1.000\n",
|
|
" LCS: 1.000\n",
|
|
" Containment: 1.000\n",
|
|
"----------------------------------------------------------------------------------------------------\n",
|
|
"Pair 2: The cat sat on the mat. The cat sat on the mat \n",
|
|
" Jaccard: 0.667\n",
|
|
" Levenshtein (char): 0.957\n",
|
|
" Levenshtein (word): 0.833\n",
|
|
" Cosine BOW: 1.000\n",
|
|
" Fuzzy Ratio: 0.978\n",
|
|
" Fuzzy Partial: 1.000\n",
|
|
" Fuzzy Token Sort: 0.978\n",
|
|
" Fuzzy Token Set: 0.973\n",
|
|
" LCS: 1.000\n",
|
|
" Containment: 1.000\n",
|
|
"----------------------------------------------------------------------------------------------------\n",
|
|
"Pair 3: The cat sat on the mat. The cat sat on the mat. \n",
|
|
" Jaccard: 1.000\n",
|
|
" Levenshtein (char): 0.821\n",
|
|
" Levenshtein (word): 1.000\n",
|
|
" Cosine BOW: 1.000\n",
|
|
" Fuzzy Ratio: 0.902\n",
|
|
" Fuzzy Partial: 0.826\n",
|
|
" Fuzzy Token Sort: 1.000\n",
|
|
" Fuzzy Token Set: 1.000\n",
|
|
" LCS: 0.815\n",
|
|
" Containment: 1.000\n",
|
|
"----------------------------------------------------------------------------------------------------\n",
|
|
"Pair 4: The cat sat on the mat. On the mat, the cat was sit...\n",
|
|
" Jaccard: 0.375\n",
|
|
" Levenshtein (char): 0.344\n",
|
|
" Levenshtein (word): 0.143\n",
|
|
" Cosine BOW: 0.825\n",
|
|
" Fuzzy Ratio: 0.509\n",
|
|
" Fuzzy Partial: 0.619\n",
|
|
" Fuzzy Token Sort: 0.764\n",
|
|
" Fuzzy Token Set: 0.723\n",
|
|
" LCS: 0.433\n",
|
|
" Containment: 0.800\n",
|
|
"----------------------------------------------------------------------------------------------------\n",
|
|
"Pair 5: The cat sat on the mat. The feline rested on the rug. \n",
|
|
" Jaccard: 0.250\n",
|
|
" Levenshtein (char): 0.517\n",
|
|
" Levenshtein (word): 0.500\n",
|
|
" Cosine BOW: 0.625\n",
|
|
" Fuzzy Ratio: 0.615\n",
|
|
" Fuzzy Partial: 0.605\n",
|
|
" Fuzzy Token Sort: 0.538\n",
|
|
" Fuzzy Token Set: 0.480\n",
|
|
" LCS: 0.536\n",
|
|
" Containment: 0.400\n",
|
|
"----------------------------------------------------------------------------------------------------\n",
|
|
"Pair 6: The quick brown fox jumps. A fast brown fox leaps. \n",
|
|
" Jaccard: 0.250\n",
|
|
" Levenshtein (char): 0.577\n",
|
|
" Levenshtein (word): 0.400\n",
|
|
" Cosine BOW: 0.400\n",
|
|
" Fuzzy Ratio: 0.612\n",
|
|
" Fuzzy Partial: 0.700\n",
|
|
" Fuzzy Token Sort: 0.531\n",
|
|
" Fuzzy Token Set: 0.562\n",
|
|
" LCS: 0.560\n",
|
|
" Containment: 0.400\n",
|
|
"----------------------------------------------------------------------------------------------------\n",
|
|
"Pair 7: The cat sat on the mat. The dog ran in the park. \n",
|
|
" Jaccard: 0.111\n",
|
|
" Levenshtein (char): 0.625\n",
|
|
" Levenshtein (word): 0.333\n",
|
|
" Cosine BOW: 0.500\n",
|
|
" Fuzzy Ratio: 0.638\n",
|
|
" Fuzzy Partial: 0.636\n",
|
|
" Fuzzy Token Sort: 0.553\n",
|
|
" Fuzzy Token Set: 0.462\n",
|
|
" LCS: 0.609\n",
|
|
" Containment: 0.200\n",
|
|
"----------------------------------------------------------------------------------------------------\n",
|
|
"Pair 8: I love programming. She enjoys reading books. \n",
|
|
" Jaccard: 0.000\n",
|
|
" Levenshtein (char): 0.200\n",
|
|
" Levenshtein (word): 0.000\n",
|
|
" Cosine BOW: 0.000\n",
|
|
" Fuzzy Ratio: 0.409\n",
|
|
" Fuzzy Partial: 0.432\n",
|
|
" Fuzzy Token Sort: 0.364\n",
|
|
" Fuzzy Token Set: 0.364\n",
|
|
" LCS: 0.333\n",
|
|
" Containment: 0.000\n",
|
|
"----------------------------------------------------------------------------------------------------\n",
|
|
"Pair 9: The weather is nice today. It's raining outside. \n",
|
|
" Jaccard: 0.000\n",
|
|
" Levenshtein (char): 0.192\n",
|
|
" Levenshtein (word): 0.000\n",
|
|
" Cosine BOW: 0.000\n",
|
|
" Fuzzy Ratio: 0.426\n",
|
|
" Fuzzy Partial: 0.514\n",
|
|
" Fuzzy Token Sort: 0.340\n",
|
|
" Fuzzy Token Set: 0.340\n",
|
|
" LCS: 0.360\n",
|
|
" Containment: 0.000\n",
|
|
"----------------------------------------------------------------------------------------------------\n",
|
|
"Pair 10: Short. Short. \n",
|
|
" Jaccard: 1.000\n",
|
|
" Levenshtein (char): 1.000\n",
|
|
" Levenshtein (word): 1.000\n",
|
|
" Cosine BOW: 1.000\n",
|
|
" Fuzzy Ratio: 1.000\n",
|
|
" Fuzzy Partial: 1.000\n",
|
|
" Fuzzy Token Sort: 1.000\n",
|
|
" Fuzzy Token Set: 1.000\n",
|
|
" LCS: 1.000\n",
|
|
" Containment: 1.000\n",
|
|
"----------------------------------------------------------------------------------------------------\n",
|
|
"Pair 11: A B C D E F G A B C D E F G \n",
|
|
" Jaccard: 1.000\n",
|
|
" Levenshtein (char): 1.000\n",
|
|
" Levenshtein (word): 1.000\n",
|
|
" Cosine BOW: 1.000\n",
|
|
" Fuzzy Ratio: 1.000\n",
|
|
" Fuzzy Partial: 1.000\n",
|
|
" Fuzzy Token Sort: 1.000\n",
|
|
" Fuzzy Token Set: 1.000\n",
|
|
" LCS: 1.000\n",
|
|
" Containment: 1.000\n",
|
|
"----------------------------------------------------------------------------------------------------\n",
|
|
"Pair 12: \n",
|
|
" Jaccard: 0.000\n",
|
|
" Levenshtein (char): 1.000\n",
|
|
" Levenshtein (word): 1.000\n",
|
|
" Cosine BOW: 1.000\n",
|
|
" Fuzzy Ratio: 1.000\n",
|
|
" Fuzzy Partial: 1.000\n",
|
|
" Fuzzy Token Sort: 1.000\n",
|
|
" Fuzzy Token Set: 0.000\n",
|
|
" LCS: 0.000\n",
|
|
" Containment: 0.000\n",
|
|
"----------------------------------------------------------------------------------------------------\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"def evaluate_baseline_methods(pairs):\n",
"    \"\"\"Run every baseline similarity method over the given pairs.\n",
"\n",
"    Requires all the similarity functions from the cells above to be\n",
"    defined. Returns (results, methods): results maps method name to a\n",
"    list of scores, one per pair and in the same order as `pairs`;\n",
"    methods maps method name to the scoring function itself.\n",
"    \"\"\"\n",
"    methods = {\n",
"        'Jaccard': jaccard_similarity,\n",
"        'Levenshtein (char)': char_levenshtein_similarity,\n",
"        'Levenshtein (word)': word_levenshtein_similarity,\n",
"        'Cosine BOW': cosine_similarity_bow,\n",
"        'Fuzzy Ratio': fuzzy_ratio_similarity,\n",
"        'Fuzzy Partial': fuzzy_partial_ratio,\n",
"        'Fuzzy Token Sort': fuzzy_token_sort_ratio,\n",
"        'Fuzzy Token Set': fuzzy_token_set_ratio,\n",
"        'LCS': longest_common_subsequence,\n",
"        'Containment': containment_similarity,\n",
"    }\n",
"\n",
"    results = {method: [] for method in methods}\n",
"\n",
"    for sent1, sent2 in pairs:\n",
"        for method_name, method_func in methods.items():\n",
"            similarity = method_func(sent1, sent2)\n",
"            results[method_name].append(similarity)\n",
"\n",
"    return results, methods\n",
"\n",
"def print_comparison_table(results, pairs):\n",
"    \"\"\"Print a per-pair table of every method's similarity score.\n",
"\n",
"    `results` is the mapping returned by evaluate_baseline_methods;\n",
"    `pairs` must be the same sequence (and order) it was computed from,\n",
"    since rows are matched up by index.\n",
"    \"\"\"\n",
"    print(f\"{'Pair':<30} {'Sentence 1':<30} {'Sentence 2':<30}\")\n",
"    print(\"=\" * 100)\n",
"\n",
"    for i, (sent1, sent2) in enumerate(pairs):\n",
"        # Truncate long sentences so the columns stay aligned\n",
"        display_sent1 = sent1[:27] + \"...\" if len(sent1) > 30 else sent1\n",
"        display_sent2 = sent2[:27] + \"...\" if len(sent2) > 30 else sent2\n",
"\n",
"        print(f\"{f'Pair {i+1}:':<30} {display_sent1:<30} {display_sent2:<30}\")\n",
"\n",
"        # Print similarities for this pair, one row per method\n",
"        for method_name in results:\n",
"            similarity = results[method_name][i]\n",
"            print(f\"{'':<30} {method_name + ':':<20} {similarity:.3f}\")\n",
"        print(\"-\" * 100)\n",
"\n",
"results, methods = evaluate_baseline_methods(test_pairs)\n",
"print_comparison_table(results, test_pairs)"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": ".venv",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.13.7"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 5
|
|
}
|