Expanded on Baseline experiments, analyzed and compared

This commit is contained in:
Henry Dowd
2025-11-30 19:38:42 +00:00
parent 37ccc03ac9
commit fe2c087093

View File

@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 1,
"id": "d2aa2997",
"metadata": {},
"outputs": [
@@ -56,7 +56,7 @@
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": 58,
"id": "e60d024e969254a",
"metadata": {
"ExecuteTime": {
@@ -69,30 +69,20 @@
"name": "stdout",
"output_type": "stream",
"text": [
"'The cat sat on the mat.' vs 'The cat sat on the mat.': 1.000\n",
"--------------------------------------------------\n",
"'The cat sat on the mat.' vs 'The cat sat on the mat': 0.667\n",
"--------------------------------------------------\n",
"'The cat sat on the mat.' vs 'The cat sat on the mat.': 1.000\n",
"--------------------------------------------------\n",
"'The cat sat on the mat.' vs 'On the mat, the cat was sitting.': 0.375\n",
"--------------------------------------------------\n",
"'The cat sat on the mat.' vs 'The feline rested on the rug.': 0.250\n",
"--------------------------------------------------\n",
"'The quick brown fox jumps.' vs 'A fast brown fox leaps.': 0.250\n",
"--------------------------------------------------\n",
"'The cat sat on the mat.' vs 'The dog ran in the park.': 0.111\n",
"--------------------------------------------------\n",
"'I love programming.' vs 'She enjoys reading books.': 0.000\n",
"--------------------------------------------------\n",
"'The weather is nice today.' vs 'It's raining outside.': 0.000\n",
"--------------------------------------------------\n",
"'Short.' vs 'Short.': 1.000\n",
"--------------------------------------------------\n",
"'A B C D E F G' vs 'A B C D E F G': 1.000\n",
"--------------------------------------------------\n",
"'' vs '': 0.000\n",
"--------------------------------------------------\n"
"Sentence 1 Sentence 2: Similarity Score:\n",
"====================================================================================================\n",
"The cat sat on the mat. vs The cat sat on the mat. 1.000\n",
"The cat sat on the mat. vs The cat sat on the mat 0.667\n",
"The cat sat on the mat. vs The cat sat on the mat. 1.000\n",
"The cat sat on the mat. vs On the mat, the cat was sitting. 0.375\n",
"The cat sat on the mat. vs The feline rested on the rug. 0.250\n",
"The quick brown fox jumps. vs A fast brown fox leaps. 0.250\n",
"The cat sat on the mat. vs The dog ran in the park. 0.111\n",
"I love programming. vs She enjoys reading books. 0.000\n",
"The weather is nice today. vs It's raining outside. 0.000\n",
"Short. vs Short. 1.000\n",
"A B C D E F G vs A B C D E F G 1.000\n",
" vs 0.000\n"
]
}
],
@@ -111,10 +101,13 @@
" (\"The cat sat on the mat.\", \"The dog ran in the park\") # Different\n",
"]\n",
"\n",
"print(f\"{'Sentence 1':<41} {'Sentence 2:':<40} {'Similarity Score:'}\")\n",
"print(\"=\" * 100)\n",
"\n",
"for sent1, sent2 in test_pairs:\n",
" similarity = jaccard_similarity(sent1, sent2)\n",
" print(f\"'{sent1}' vs '{sent2}': {similarity:.3f}\") # 3 decimal places\n",
" print(\"-\"* 50)\n"
" print(f\"{sent1:<30} {'vs':<10} {sent2:<40} {similarity:.3f}\") # 3 decimal places\n",
" #print(\"-\"* 50)\n"
]
},
{
@@ -128,7 +121,7 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 57,
"id": "0b68fdcd",
"metadata": {},
"outputs": [
@@ -136,42 +129,20 @@
"name": "stdout",
"output_type": "stream",
"text": [
"'The cat sat on the mat.' vs 'The cat sat on the mat.':\n",
" Char similarity: 1.000 --- Word similarity: 1.000\n",
"--------------------------------------------------\n",
"'The cat sat on the mat.' vs 'The cat sat on the mat':\n",
" Char similarity: 0.957 --- Word similarity: 0.833\n",
"--------------------------------------------------\n",
"'The cat sat on the mat.' vs 'The cat sat on the mat.':\n",
" Char similarity: 0.821 --- Word similarity: 1.000\n",
"--------------------------------------------------\n",
"'The cat sat on the mat.' vs 'On the mat, the cat was sitting.':\n",
" Char similarity: 0.344 --- Word similarity: 0.143\n",
"--------------------------------------------------\n",
"'The cat sat on the mat.' vs 'The feline rested on the rug.':\n",
" Char similarity: 0.517 --- Word similarity: 0.500\n",
"--------------------------------------------------\n",
"'The quick brown fox jumps.' vs 'A fast brown fox leaps.':\n",
" Char similarity: 0.577 --- Word similarity: 0.400\n",
"--------------------------------------------------\n",
"'The cat sat on the mat.' vs 'The dog ran in the park.':\n",
" Char similarity: 0.625 --- Word similarity: 0.333\n",
"--------------------------------------------------\n",
"'I love programming.' vs 'She enjoys reading books.':\n",
" Char similarity: 0.200 --- Word similarity: 0.000\n",
"--------------------------------------------------\n",
"'The weather is nice today.' vs 'It's raining outside.':\n",
" Char similarity: 0.192 --- Word similarity: 0.000\n",
"--------------------------------------------------\n",
"'Short.' vs 'Short.':\n",
" Char similarity: 1.000 --- Word similarity: 1.000\n",
"--------------------------------------------------\n",
"'A B C D E F G' vs 'A B C D E F G':\n",
" Char similarity: 1.000 --- Word similarity: 1.000\n",
"--------------------------------------------------\n",
"'' vs '':\n",
" Char similarity: 1.000 --- Word similarity: 1.000\n",
"--------------------------------------------------\n"
"Sentence 1 Sentence 2: Similarities -> Char: Word:\n",
"====================================================================================================\n",
"The cat sat on the mat. vs The cat sat on the mat. 1.000 1.000 \n",
"The cat sat on the mat. vs The cat sat on the mat 0.957 0.833 \n",
"The cat sat on the mat. vs The cat sat on the mat. 0.821 1.000 \n",
"The cat sat on the mat. vs On the mat, the cat was sitting. 0.344 0.143 \n",
"The cat sat on the mat. vs The feline rested on the rug. 0.517 0.500 \n",
"The quick brown fox jumps. vs A fast brown fox leaps. 0.577 0.400 \n",
"The cat sat on the mat. vs The dog ran in the park. 0.625 0.333 \n",
"I love programming. vs She enjoys reading books. 0.200 0.000 \n",
"The weather is nice today. vs It's raining outside. 0.192 0.000 \n",
"Short. vs Short. 1.000 1.000 \n",
"A B C D E F G vs A B C D E F G 1.000 1.000 \n",
" vs 1.000 1.000 \n"
]
}
],
@@ -201,12 +172,16 @@
" edit_distance = distance.Levenshtein.distance(words1, words2)\n",
" return 1 - (edit_distance / max_len)\n",
"\n",
" \n",
"\n",
"print(f\"{'Sentence 1':<41} {'Sentence 2:':<20} {'Similarities ->':<19} {'Char:':<10} {'Word:'}\")\n",
"print(\"=\" * 100)\n",
"\n",
"for sent1, sent2 in test_pairs:\n",
" char_similarity = char_levenshtein_similarity(sent1, sent2)\n",
" word_similarity = word_levenshtein_similarity(sent1, sent2)\n",
" print(f\"'{sent1}' vs '{sent2}':\") \n",
" print(f\" Char similarity: {char_similarity:.3f} --- Word similarity: {word_similarity:.3f}\") # 3 decimal place\n",
" print(\"-\"* 50)"
" print(f\"{sent1:<30} {'vs':<10} {sent2:<40} {char_similarity:.3f}{'':<5} {word_similarity:.3F} \") # 3 decimal places\n",
" #print(\"-\"* 50"
]
},
{
@@ -219,7 +194,7 @@
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": 38,
"id": "46a985b4",
"metadata": {},
"outputs": [
@@ -227,30 +202,20 @@
"name": "stdout",
"output_type": "stream",
"text": [
"'The cat sat on the mat.' vs 'The cat sat on the mat.': 1.000\n",
"--------------------------------------------------\n",
"'The cat sat on the mat.' vs 'The cat sat on the mat': 1.000\n",
"--------------------------------------------------\n",
"'The cat sat on the mat.' vs 'The cat sat on the mat.': 1.000\n",
"--------------------------------------------------\n",
"'The cat sat on the mat.' vs 'On the mat, the cat was sitting.': 0.825\n",
"--------------------------------------------------\n",
"'The cat sat on the mat.' vs 'The feline rested on the rug.': 0.625\n",
"--------------------------------------------------\n",
"'The quick brown fox jumps.' vs 'A fast brown fox leaps.': 0.400\n",
"--------------------------------------------------\n",
"'The cat sat on the mat.' vs 'The dog ran in the park.': 0.500\n",
"--------------------------------------------------\n",
"'I love programming.' vs 'She enjoys reading books.': 0.000\n",
"--------------------------------------------------\n",
"'The weather is nice today.' vs 'It's raining outside.': 0.000\n",
"--------------------------------------------------\n",
"'Short.' vs 'Short.': 1.000\n",
"--------------------------------------------------\n",
"'A B C D E F G' vs 'A B C D E F G': 1.000\n",
"--------------------------------------------------\n",
"'' vs '': 1.000\n",
"--------------------------------------------------\n"
"Sentence 1 Sentence 2: Similarity Score:\n",
"====================================================================================================\n",
"The cat sat on the mat. vs The cat sat on the mat. 1.000\n",
"The cat sat on the mat. vs The cat sat on the mat 1.000\n",
"The cat sat on the mat. vs The cat sat on the mat. 1.000\n",
"The cat sat on the mat. vs On the mat, the cat was sitting. 0.825\n",
"The cat sat on the mat. vs The feline rested on the rug. 0.625\n",
"The quick brown fox jumps. vs A fast brown fox leaps. 0.400\n",
"The cat sat on the mat. vs The dog ran in the park. 0.500\n",
"I love programming. vs She enjoys reading books. 0.000\n",
"The weather is nice today. vs It's raining outside. 0.000\n",
"Short. vs Short. 1.000\n",
"A B C D E F G vs A B C D E F G 1.000\n",
" vs 1.000\n"
]
}
],
@@ -285,10 +250,13 @@
" \n",
" return dot_product / (norm1 * norm2)\n",
"\n",
"print(f\"{'Sentence 1':<41} {'Sentence 2:':<40} {'Similarity Score:'}\")\n",
"print(\"=\" * 100)\n",
"\n",
"for sent1, sent2 in test_pairs:\n",
" similarity = cosine_similarity_bow(sent1, sent2)\n",
" print(f\"'{sent1}' vs '{sent2}': {similarity:.3f}\") # 3 decimal places\n",
" print(\"-\"* 50)"
" print(f\"{sent1:<30} {'vs':<10} {sent2:<40} {similarity:.3f}\") # 3 decimal places\n",
" #print(\"-\"* 50)"
]
},
{
@@ -301,7 +269,7 @@
},
{
"cell_type": "code",
"execution_count": 23,
"execution_count": 5,
"id": "7dc7ac2e",
"metadata": {},
"outputs": [
@@ -379,7 +347,7 @@
},
{
"cell_type": "code",
"execution_count": 25,
"execution_count": 37,
"id": "e6a4d4e2",
"metadata": {},
"outputs": [
@@ -387,30 +355,20 @@
"name": "stdout",
"output_type": "stream",
"text": [
"'The cat sat on the mat.' vs 'The cat sat on the mat.': 1.000\n",
"--------------------------------------------------\n",
"'The cat sat on the mat.' vs 'The cat sat on the mat': 1.000\n",
"--------------------------------------------------\n",
"'The cat sat on the mat.' vs 'The cat sat on the mat.': 0.815\n",
"--------------------------------------------------\n",
"'The cat sat on the mat.' vs 'On the mat, the cat was sitting.': 0.433\n",
"--------------------------------------------------\n",
"'The cat sat on the mat.' vs 'The feline rested on the rug.': 0.536\n",
"--------------------------------------------------\n",
"'The quick brown fox jumps.' vs 'A fast brown fox leaps.': 0.560\n",
"--------------------------------------------------\n",
"'The cat sat on the mat.' vs 'The dog ran in the park.': 0.609\n",
"--------------------------------------------------\n",
"'I love programming.' vs 'She enjoys reading books.': 0.333\n",
"--------------------------------------------------\n",
"'The weather is nice today.' vs 'It's raining outside.': 0.360\n",
"--------------------------------------------------\n",
"'Short.' vs 'Short.': 1.000\n",
"--------------------------------------------------\n",
"'A B C D E F G' vs 'A B C D E F G': 1.000\n",
"--------------------------------------------------\n",
"'' vs '': 0.000\n",
"--------------------------------------------------\n"
"Sentence 1 Sentence 2: Similarity Score:\n",
"====================================================================================================\n",
"The cat sat on the mat. vs The cat sat on the mat. 1.000\n",
"The cat sat on the mat. vs The cat sat on the mat 1.000\n",
"The cat sat on the mat. vs The cat sat on the mat. 0.815\n",
"The cat sat on the mat. vs On the mat, the cat was sitting. 0.433\n",
"The cat sat on the mat. vs The feline rested on the rug. 0.536\n",
"The quick brown fox jumps. vs A fast brown fox leaps. 0.560\n",
"The cat sat on the mat. vs The dog ran in the park. 0.609\n",
"I love programming. vs She enjoys reading books. 0.333\n",
"The weather is nice today. vs It's raining outside. 0.360\n",
"Short. vs Short. 1.000\n",
"A B C D E F G vs A B C D E F G 1.000\n",
" vs 0.000\n"
]
}
],
@@ -437,10 +395,13 @@
" lcs_length = dp[m][n]\n",
" return lcs_length / max(m, n) if max(m, n) > 0 else 0.0\n",
"\n",
"print(f\"{'Sentence 1':<41} {'Sentence 2:':<40} {'Similarity Score:'}\")\n",
"print(\"=\" * 100)\n",
"\n",
"for sent1, sent2 in test_pairs:\n",
" similarity = longest_common_subsequence(sent1, sent2)\n",
" print(f\"'{sent1}' vs '{sent2}': {similarity:.3f}\") # 3 decimal places\n",
" print(\"-\"* 50)"
" print(f\"{sent1:<30} {'vs':<10} {sent2:<40} {similarity:.3f}\") # 3 decimal places\n",
" #print(\"-\"* 50)"
]
},
{
@@ -471,8 +432,7 @@
" \n",
" common = words1.intersection(words2)\n",
" return len(common) / len(words1)\n",
"\n",
" "
"\n"
]
},
{
@@ -485,7 +445,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 25,
"id": "b3d07562",
"metadata": {},
"outputs": [
@@ -495,19 +455,150 @@
"text": [
"Pair Sentence 1 Sentence 2 \n",
"====================================================================================================\n",
"Pair 1: The cat sat on the mat. The cat sat on the mat. \n"
]
},
{
"ename": "TypeError",
"evalue": "tuple indices must be integers or slices, not dict",
"output_type": "error",
"traceback": [
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
"\u001b[31mTypeError\u001b[39m Traceback (most recent call last)",
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[28]\u001b[39m\u001b[32m, line 44\u001b[39m\n\u001b[32m 41\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33m\"\u001b[39m\u001b[33m-\u001b[39m\u001b[33m\"\u001b[39m * \u001b[32m100\u001b[39m)\n\u001b[32m 43\u001b[39m results = evaluate_baseline_methods(test_pairs)\n\u001b[32m---> \u001b[39m\u001b[32m44\u001b[39m \u001b[43mprint_comparison_table\u001b[49m\u001b[43m(\u001b[49m\u001b[43mresults\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtest_pairs\u001b[49m\u001b[43m)\u001b[49m\n",
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[28]\u001b[39m\u001b[32m, line 39\u001b[39m, in \u001b[36mprint_comparison_table\u001b[39m\u001b[34m(results, pairs)\u001b[39m\n\u001b[32m 37\u001b[39m \u001b[38;5;66;03m# Print similarities for this pair\u001b[39;00m\n\u001b[32m 38\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m method_name \u001b[38;5;129;01min\u001b[39;00m results:\n\u001b[32m---> \u001b[39m\u001b[32m39\u001b[39m similarity = \u001b[43mresults\u001b[49m\u001b[43m[\u001b[49m\u001b[43mmethod_name\u001b[49m\u001b[43m]\u001b[49m[i]\n\u001b[32m 40\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[33m'\u001b[39m\u001b[33m'\u001b[39m\u001b[38;5;132;01m:\u001b[39;00m\u001b[33m<40\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mmethod_name\u001b[38;5;250m \u001b[39m+\u001b[38;5;250m \u001b[39m\u001b[33m'\u001b[39m\u001b[33m:\u001b[39m\u001b[33m'\u001b[39m\u001b[38;5;132;01m:\u001b[39;00m\u001b[33m<20\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m \u001b[39m\u001b[38;5;132;01m{\u001b[39;00msimilarity\u001b[38;5;132;01m:\u001b[39;00m\u001b[33m.3f\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m)\n\u001b[32m 41\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33m\"\u001b[39m\u001b[33m-\u001b[39m\u001b[33m\"\u001b[39m * \u001b[32m100\u001b[39m)\n",
"\u001b[31mTypeError\u001b[39m: tuple indices must be integers or slices, not dict"
"Pair 1: The cat sat on the mat. The cat sat on the mat. \n",
" Jaccard: 1.000\n",
" Levenshtein (char): 1.000\n",
" Levenshtein (word): 1.000\n",
" Cosine BOW: 1.000\n",
" Fuzzy Ratio: 1.000\n",
" Fuzzy Partial: 1.000\n",
" Fuzzy Token Sort: 1.000\n",
" Fuzzy Token Set: 1.000\n",
" LCS: 1.000\n",
" Containment: 1.000\n",
"----------------------------------------------------------------------------------------------------\n",
"Pair 2: The cat sat on the mat. The cat sat on the mat \n",
" Jaccard: 0.667\n",
" Levenshtein (char): 0.957\n",
" Levenshtein (word): 0.833\n",
" Cosine BOW: 1.000\n",
" Fuzzy Ratio: 0.978\n",
" Fuzzy Partial: 1.000\n",
" Fuzzy Token Sort: 0.978\n",
" Fuzzy Token Set: 0.973\n",
" LCS: 1.000\n",
" Containment: 1.000\n",
"----------------------------------------------------------------------------------------------------\n",
"Pair 3: The cat sat on the mat. The cat sat on the mat. \n",
" Jaccard: 1.000\n",
" Levenshtein (char): 0.821\n",
" Levenshtein (word): 1.000\n",
" Cosine BOW: 1.000\n",
" Fuzzy Ratio: 0.902\n",
" Fuzzy Partial: 0.826\n",
" Fuzzy Token Sort: 1.000\n",
" Fuzzy Token Set: 1.000\n",
" LCS: 0.815\n",
" Containment: 1.000\n",
"----------------------------------------------------------------------------------------------------\n",
"Pair 4: The cat sat on the mat. On the mat, the cat was sit...\n",
" Jaccard: 0.375\n",
" Levenshtein (char): 0.344\n",
" Levenshtein (word): 0.143\n",
" Cosine BOW: 0.825\n",
" Fuzzy Ratio: 0.509\n",
" Fuzzy Partial: 0.619\n",
" Fuzzy Token Sort: 0.764\n",
" Fuzzy Token Set: 0.723\n",
" LCS: 0.433\n",
" Containment: 0.800\n",
"----------------------------------------------------------------------------------------------------\n",
"Pair 5: The cat sat on the mat. The feline rested on the rug. \n",
" Jaccard: 0.250\n",
" Levenshtein (char): 0.517\n",
" Levenshtein (word): 0.500\n",
" Cosine BOW: 0.625\n",
" Fuzzy Ratio: 0.615\n",
" Fuzzy Partial: 0.605\n",
" Fuzzy Token Sort: 0.538\n",
" Fuzzy Token Set: 0.480\n",
" LCS: 0.536\n",
" Containment: 0.400\n",
"----------------------------------------------------------------------------------------------------\n",
"Pair 6: The quick brown fox jumps. A fast brown fox leaps. \n",
" Jaccard: 0.250\n",
" Levenshtein (char): 0.577\n",
" Levenshtein (word): 0.400\n",
" Cosine BOW: 0.400\n",
" Fuzzy Ratio: 0.612\n",
" Fuzzy Partial: 0.700\n",
" Fuzzy Token Sort: 0.531\n",
" Fuzzy Token Set: 0.562\n",
" LCS: 0.560\n",
" Containment: 0.400\n",
"----------------------------------------------------------------------------------------------------\n",
"Pair 7: The cat sat on the mat. The dog ran in the park. \n",
" Jaccard: 0.111\n",
" Levenshtein (char): 0.625\n",
" Levenshtein (word): 0.333\n",
" Cosine BOW: 0.500\n",
" Fuzzy Ratio: 0.638\n",
" Fuzzy Partial: 0.636\n",
" Fuzzy Token Sort: 0.553\n",
" Fuzzy Token Set: 0.462\n",
" LCS: 0.609\n",
" Containment: 0.200\n",
"----------------------------------------------------------------------------------------------------\n",
"Pair 8: I love programming. She enjoys reading books. \n",
" Jaccard: 0.000\n",
" Levenshtein (char): 0.200\n",
" Levenshtein (word): 0.000\n",
" Cosine BOW: 0.000\n",
" Fuzzy Ratio: 0.409\n",
" Fuzzy Partial: 0.432\n",
" Fuzzy Token Sort: 0.364\n",
" Fuzzy Token Set: 0.364\n",
" LCS: 0.333\n",
" Containment: 0.000\n",
"----------------------------------------------------------------------------------------------------\n",
"Pair 9: The weather is nice today. It's raining outside. \n",
" Jaccard: 0.000\n",
" Levenshtein (char): 0.192\n",
" Levenshtein (word): 0.000\n",
" Cosine BOW: 0.000\n",
" Fuzzy Ratio: 0.426\n",
" Fuzzy Partial: 0.514\n",
" Fuzzy Token Sort: 0.340\n",
" Fuzzy Token Set: 0.340\n",
" LCS: 0.360\n",
" Containment: 0.000\n",
"----------------------------------------------------------------------------------------------------\n",
"Pair 10: Short. Short. \n",
" Jaccard: 1.000\n",
" Levenshtein (char): 1.000\n",
" Levenshtein (word): 1.000\n",
" Cosine BOW: 1.000\n",
" Fuzzy Ratio: 1.000\n",
" Fuzzy Partial: 1.000\n",
" Fuzzy Token Sort: 1.000\n",
" Fuzzy Token Set: 1.000\n",
" LCS: 1.000\n",
" Containment: 1.000\n",
"----------------------------------------------------------------------------------------------------\n",
"Pair 11: A B C D E F G A B C D E F G \n",
" Jaccard: 1.000\n",
" Levenshtein (char): 1.000\n",
" Levenshtein (word): 1.000\n",
" Cosine BOW: 1.000\n",
" Fuzzy Ratio: 1.000\n",
" Fuzzy Partial: 1.000\n",
" Fuzzy Token Sort: 1.000\n",
" Fuzzy Token Set: 1.000\n",
" LCS: 1.000\n",
" Containment: 1.000\n",
"----------------------------------------------------------------------------------------------------\n",
"Pair 12: \n",
" Jaccard: 0.000\n",
" Levenshtein (char): 1.000\n",
" Levenshtein (word): 1.000\n",
" Cosine BOW: 1.000\n",
" Fuzzy Ratio: 1.000\n",
" Fuzzy Partial: 1.000\n",
" Fuzzy Token Sort: 1.000\n",
" Fuzzy Token Set: 0.000\n",
" LCS: 0.000\n",
" Containment: 0.000\n",
"----------------------------------------------------------------------------------------------------\n"
]
}
],
@@ -538,23 +629,23 @@
"\n",
"def print_comparison_table(results, pairs):\n",
" \"\"\" Print a formatted comparison table \"\"\"\n",
" print(f\"{'Pair':<40} {'Sentence 1':<30} {'Sentence 2':<30}\")\n",
" print(f\"{'Pair':<30} {'Sentence 1':<30} {'Sentence 2':<30}\")\n",
" print(\"=\" * 100)\n",
" \n",
" for i, (sent1, sent2) in enumerate(pairs):\n",
" # Truncate long sentences for display\n",
" # if sentence too long\n",
" display_sent1 = sent1[:27] + \"...\" if len(sent1) > 30 else sent1\n",
" display_sent2 = sent2[:27] + \"...\" if len(sent2) > 30 else sent2\n",
" \n",
" print(f\"{f'Pair {i+1}:':<40} {display_sent1:<30} {display_sent2:<30}\")\n",
" print(f\"{f'Pair {i+1}:':<30} {display_sent1:<30} {display_sent2:<30}\")\n",
" \n",
" # Print similarities for this pair\n",
" for method_name in results:\n",
" similarity = results[method_name][i]\n",
" print(f\"{'':<40} {method_name + ':':<20} {similarity:.3f}\")\n",
" print(f\"{'':<30} {method_name + ':':<20} {similarity:.3f}\")\n",
" print(\"-\" * 100)\n",
"\n",
"results = evaluate_baseline_methods(test_pairs)\n",
"results, methods = evaluate_baseline_methods(test_pairs)\n",
"print_comparison_table(results, test_pairs)"
]
}