Expanded on Baseline experiments, analyzed and compared

This commit is contained in:
Henry Dowd
2025-11-30 19:38:42 +00:00
parent 37ccc03ac9
commit fe2c087093

View File

@@ -2,7 +2,7 @@
"cells": [ "cells": [
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 3, "execution_count": 1,
"id": "d2aa2997", "id": "d2aa2997",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@@ -56,7 +56,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 15, "execution_count": 58,
"id": "e60d024e969254a", "id": "e60d024e969254a",
"metadata": { "metadata": {
"ExecuteTime": { "ExecuteTime": {
@@ -69,30 +69,20 @@
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"'The cat sat on the mat.' vs 'The cat sat on the mat.': 1.000\n", "Sentence 1 Sentence 2: Similarity Score:\n",
"--------------------------------------------------\n", "====================================================================================================\n",
"'The cat sat on the mat.' vs 'The cat sat on the mat': 0.667\n", "The cat sat on the mat. vs The cat sat on the mat. 1.000\n",
"--------------------------------------------------\n", "The cat sat on the mat. vs The cat sat on the mat 0.667\n",
"'The cat sat on the mat.' vs 'The cat sat on the mat.': 1.000\n", "The cat sat on the mat. vs The cat sat on the mat. 1.000\n",
"--------------------------------------------------\n", "The cat sat on the mat. vs On the mat, the cat was sitting. 0.375\n",
"'The cat sat on the mat.' vs 'On the mat, the cat was sitting.': 0.375\n", "The cat sat on the mat. vs The feline rested on the rug. 0.250\n",
"--------------------------------------------------\n", "The quick brown fox jumps. vs A fast brown fox leaps. 0.250\n",
"'The cat sat on the mat.' vs 'The feline rested on the rug.': 0.250\n", "The cat sat on the mat. vs The dog ran in the park. 0.111\n",
"--------------------------------------------------\n", "I love programming. vs She enjoys reading books. 0.000\n",
"'The quick brown fox jumps.' vs 'A fast brown fox leaps.': 0.250\n", "The weather is nice today. vs It's raining outside. 0.000\n",
"--------------------------------------------------\n", "Short. vs Short. 1.000\n",
"'The cat sat on the mat.' vs 'The dog ran in the park.': 0.111\n", "A B C D E F G vs A B C D E F G 1.000\n",
"--------------------------------------------------\n", " vs 0.000\n"
"'I love programming.' vs 'She enjoys reading books.': 0.000\n",
"--------------------------------------------------\n",
"'The weather is nice today.' vs 'It's raining outside.': 0.000\n",
"--------------------------------------------------\n",
"'Short.' vs 'Short.': 1.000\n",
"--------------------------------------------------\n",
"'A B C D E F G' vs 'A B C D E F G': 1.000\n",
"--------------------------------------------------\n",
"'' vs '': 0.000\n",
"--------------------------------------------------\n"
] ]
} }
], ],
@@ -111,10 +101,13 @@
" (\"The cat sat on the mat.\", \"The dog ran in the park\") # Different\n", " (\"The cat sat on the mat.\", \"The dog ran in the park\") # Different\n",
"]\n", "]\n",
"\n", "\n",
"print(f\"{'Sentence 1':<41} {'Sentence 2:':<40} {'Similarity Score:'}\")\n",
"print(\"=\" * 100)\n",
"\n",
"for sent1, sent2 in test_pairs:\n", "for sent1, sent2 in test_pairs:\n",
" similarity = jaccard_similarity(sent1, sent2)\n", " similarity = jaccard_similarity(sent1, sent2)\n",
" print(f\"'{sent1}' vs '{sent2}': {similarity:.3f}\") # 3 decimal places\n", " print(f\"{sent1:<30} {'vs':<10} {sent2:<40} {similarity:.3f}\") # 3 decimal places\n",
" print(\"-\"* 50)\n" " #print(\"-\"* 50)\n"
] ]
}, },
{ {
@@ -128,7 +121,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 14, "execution_count": 57,
"id": "0b68fdcd", "id": "0b68fdcd",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@@ -136,42 +129,20 @@
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"'The cat sat on the mat.' vs 'The cat sat on the mat.':\n", "Sentence 1 Sentence 2: Similarities -> Char: Word:\n",
" Char similarity: 1.000 --- Word similarity: 1.000\n", "====================================================================================================\n",
"--------------------------------------------------\n", "The cat sat on the mat. vs The cat sat on the mat. 1.000 1.000 \n",
"'The cat sat on the mat.' vs 'The cat sat on the mat':\n", "The cat sat on the mat. vs The cat sat on the mat 0.957 0.833 \n",
" Char similarity: 0.957 --- Word similarity: 0.833\n", "The cat sat on the mat. vs The cat sat on the mat. 0.821 1.000 \n",
"--------------------------------------------------\n", "The cat sat on the mat. vs On the mat, the cat was sitting. 0.344 0.143 \n",
"'The cat sat on the mat.' vs 'The cat sat on the mat.':\n", "The cat sat on the mat. vs The feline rested on the rug. 0.517 0.500 \n",
" Char similarity: 0.821 --- Word similarity: 1.000\n", "The quick brown fox jumps. vs A fast brown fox leaps. 0.577 0.400 \n",
"--------------------------------------------------\n", "The cat sat on the mat. vs The dog ran in the park. 0.625 0.333 \n",
"'The cat sat on the mat.' vs 'On the mat, the cat was sitting.':\n", "I love programming. vs She enjoys reading books. 0.200 0.000 \n",
" Char similarity: 0.344 --- Word similarity: 0.143\n", "The weather is nice today. vs It's raining outside. 0.192 0.000 \n",
"--------------------------------------------------\n", "Short. vs Short. 1.000 1.000 \n",
"'The cat sat on the mat.' vs 'The feline rested on the rug.':\n", "A B C D E F G vs A B C D E F G 1.000 1.000 \n",
" Char similarity: 0.517 --- Word similarity: 0.500\n", " vs 1.000 1.000 \n"
"--------------------------------------------------\n",
"'The quick brown fox jumps.' vs 'A fast brown fox leaps.':\n",
" Char similarity: 0.577 --- Word similarity: 0.400\n",
"--------------------------------------------------\n",
"'The cat sat on the mat.' vs 'The dog ran in the park.':\n",
" Char similarity: 0.625 --- Word similarity: 0.333\n",
"--------------------------------------------------\n",
"'I love programming.' vs 'She enjoys reading books.':\n",
" Char similarity: 0.200 --- Word similarity: 0.000\n",
"--------------------------------------------------\n",
"'The weather is nice today.' vs 'It's raining outside.':\n",
" Char similarity: 0.192 --- Word similarity: 0.000\n",
"--------------------------------------------------\n",
"'Short.' vs 'Short.':\n",
" Char similarity: 1.000 --- Word similarity: 1.000\n",
"--------------------------------------------------\n",
"'A B C D E F G' vs 'A B C D E F G':\n",
" Char similarity: 1.000 --- Word similarity: 1.000\n",
"--------------------------------------------------\n",
"'' vs '':\n",
" Char similarity: 1.000 --- Word similarity: 1.000\n",
"--------------------------------------------------\n"
] ]
} }
], ],
@@ -201,12 +172,16 @@
" edit_distance = distance.Levenshtein.distance(words1, words2)\n", " edit_distance = distance.Levenshtein.distance(words1, words2)\n",
" return 1 - (edit_distance / max_len)\n", " return 1 - (edit_distance / max_len)\n",
"\n", "\n",
" \n",
"\n",
"print(f\"{'Sentence 1':<41} {'Sentence 2:':<20} {'Similarities ->':<19} {'Char:':<10} {'Word:'}\")\n",
"print(\"=\" * 100)\n",
"\n",
"for sent1, sent2 in test_pairs:\n", "for sent1, sent2 in test_pairs:\n",
" char_similarity = char_levenshtein_similarity(sent1, sent2)\n", " char_similarity = char_levenshtein_similarity(sent1, sent2)\n",
" word_similarity = word_levenshtein_similarity(sent1, sent2)\n", " word_similarity = word_levenshtein_similarity(sent1, sent2)\n",
" print(f\"'{sent1}' vs '{sent2}':\") \n", " print(f\"{sent1:<30} {'vs':<10} {sent2:<40} {char_similarity:.3f}{'':<5} {word_similarity:.3F} \") # 3 decimal places\n",
" print(f\" Char similarity: {char_similarity:.3f} --- Word similarity: {word_similarity:.3f}\") # 3 decimal place\n", " #print(\"-\"* 50"
" print(\"-\"* 50)"
] ]
}, },
{ {
@@ -219,7 +194,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 16, "execution_count": 38,
"id": "46a985b4", "id": "46a985b4",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@@ -227,30 +202,20 @@
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"'The cat sat on the mat.' vs 'The cat sat on the mat.': 1.000\n", "Sentence 1 Sentence 2: Similarity Score:\n",
"--------------------------------------------------\n", "====================================================================================================\n",
"'The cat sat on the mat.' vs 'The cat sat on the mat': 1.000\n", "The cat sat on the mat. vs The cat sat on the mat. 1.000\n",
"--------------------------------------------------\n", "The cat sat on the mat. vs The cat sat on the mat 1.000\n",
"'The cat sat on the mat.' vs 'The cat sat on the mat.': 1.000\n", "The cat sat on the mat. vs The cat sat on the mat. 1.000\n",
"--------------------------------------------------\n", "The cat sat on the mat. vs On the mat, the cat was sitting. 0.825\n",
"'The cat sat on the mat.' vs 'On the mat, the cat was sitting.': 0.825\n", "The cat sat on the mat. vs The feline rested on the rug. 0.625\n",
"--------------------------------------------------\n", "The quick brown fox jumps. vs A fast brown fox leaps. 0.400\n",
"'The cat sat on the mat.' vs 'The feline rested on the rug.': 0.625\n", "The cat sat on the mat. vs The dog ran in the park. 0.500\n",
"--------------------------------------------------\n", "I love programming. vs She enjoys reading books. 0.000\n",
"'The quick brown fox jumps.' vs 'A fast brown fox leaps.': 0.400\n", "The weather is nice today. vs It's raining outside. 0.000\n",
"--------------------------------------------------\n", "Short. vs Short. 1.000\n",
"'The cat sat on the mat.' vs 'The dog ran in the park.': 0.500\n", "A B C D E F G vs A B C D E F G 1.000\n",
"--------------------------------------------------\n", " vs 1.000\n"
"'I love programming.' vs 'She enjoys reading books.': 0.000\n",
"--------------------------------------------------\n",
"'The weather is nice today.' vs 'It's raining outside.': 0.000\n",
"--------------------------------------------------\n",
"'Short.' vs 'Short.': 1.000\n",
"--------------------------------------------------\n",
"'A B C D E F G' vs 'A B C D E F G': 1.000\n",
"--------------------------------------------------\n",
"'' vs '': 1.000\n",
"--------------------------------------------------\n"
] ]
} }
], ],
@@ -285,10 +250,13 @@
" \n", " \n",
" return dot_product / (norm1 * norm2)\n", " return dot_product / (norm1 * norm2)\n",
"\n", "\n",
"print(f\"{'Sentence 1':<41} {'Sentence 2:':<40} {'Similarity Score:'}\")\n",
"print(\"=\" * 100)\n",
"\n",
"for sent1, sent2 in test_pairs:\n", "for sent1, sent2 in test_pairs:\n",
" similarity = cosine_similarity_bow(sent1, sent2)\n", " similarity = cosine_similarity_bow(sent1, sent2)\n",
" print(f\"'{sent1}' vs '{sent2}': {similarity:.3f}\") # 3 decimal places\n", " print(f\"{sent1:<30} {'vs':<10} {sent2:<40} {similarity:.3f}\") # 3 decimal places\n",
" print(\"-\"* 50)" " #print(\"-\"* 50)"
] ]
}, },
{ {
@@ -301,7 +269,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 23, "execution_count": 5,
"id": "7dc7ac2e", "id": "7dc7ac2e",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@@ -379,7 +347,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 25, "execution_count": 37,
"id": "e6a4d4e2", "id": "e6a4d4e2",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@@ -387,30 +355,20 @@
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"'The cat sat on the mat.' vs 'The cat sat on the mat.': 1.000\n", "Sentence 1 Sentence 2: Similarity Score:\n",
"--------------------------------------------------\n", "====================================================================================================\n",
"'The cat sat on the mat.' vs 'The cat sat on the mat': 1.000\n", "The cat sat on the mat. vs The cat sat on the mat. 1.000\n",
"--------------------------------------------------\n", "The cat sat on the mat. vs The cat sat on the mat 1.000\n",
"'The cat sat on the mat.' vs 'The cat sat on the mat.': 0.815\n", "The cat sat on the mat. vs The cat sat on the mat. 0.815\n",
"--------------------------------------------------\n", "The cat sat on the mat. vs On the mat, the cat was sitting. 0.433\n",
"'The cat sat on the mat.' vs 'On the mat, the cat was sitting.': 0.433\n", "The cat sat on the mat. vs The feline rested on the rug. 0.536\n",
"--------------------------------------------------\n", "The quick brown fox jumps. vs A fast brown fox leaps. 0.560\n",
"'The cat sat on the mat.' vs 'The feline rested on the rug.': 0.536\n", "The cat sat on the mat. vs The dog ran in the park. 0.609\n",
"--------------------------------------------------\n", "I love programming. vs She enjoys reading books. 0.333\n",
"'The quick brown fox jumps.' vs 'A fast brown fox leaps.': 0.560\n", "The weather is nice today. vs It's raining outside. 0.360\n",
"--------------------------------------------------\n", "Short. vs Short. 1.000\n",
"'The cat sat on the mat.' vs 'The dog ran in the park.': 0.609\n", "A B C D E F G vs A B C D E F G 1.000\n",
"--------------------------------------------------\n", " vs 0.000\n"
"'I love programming.' vs 'She enjoys reading books.': 0.333\n",
"--------------------------------------------------\n",
"'The weather is nice today.' vs 'It's raining outside.': 0.360\n",
"--------------------------------------------------\n",
"'Short.' vs 'Short.': 1.000\n",
"--------------------------------------------------\n",
"'A B C D E F G' vs 'A B C D E F G': 1.000\n",
"--------------------------------------------------\n",
"'' vs '': 0.000\n",
"--------------------------------------------------\n"
] ]
} }
], ],
@@ -437,10 +395,13 @@
" lcs_length = dp[m][n]\n", " lcs_length = dp[m][n]\n",
" return lcs_length / max(m, n) if max(m, n) > 0 else 0.0\n", " return lcs_length / max(m, n) if max(m, n) > 0 else 0.0\n",
"\n", "\n",
"print(f\"{'Sentence 1':<41} {'Sentence 2:':<40} {'Similarity Score:'}\")\n",
"print(\"=\" * 100)\n",
"\n",
"for sent1, sent2 in test_pairs:\n", "for sent1, sent2 in test_pairs:\n",
" similarity = longest_common_subsequence(sent1, sent2)\n", " similarity = longest_common_subsequence(sent1, sent2)\n",
" print(f\"'{sent1}' vs '{sent2}': {similarity:.3f}\") # 3 decimal places\n", " print(f\"{sent1:<30} {'vs':<10} {sent2:<40} {similarity:.3f}\") # 3 decimal places\n",
" print(\"-\"* 50)" " #print(\"-\"* 50)"
] ]
}, },
{ {
@@ -471,8 +432,7 @@
" \n", " \n",
" common = words1.intersection(words2)\n", " common = words1.intersection(words2)\n",
" return len(common) / len(words1)\n", " return len(common) / len(words1)\n",
"\n", "\n"
" "
] ]
}, },
{ {
@@ -485,7 +445,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 25,
"id": "b3d07562", "id": "b3d07562",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@@ -495,19 +455,150 @@
"text": [ "text": [
"Pair Sentence 1 Sentence 2 \n", "Pair Sentence 1 Sentence 2 \n",
"====================================================================================================\n", "====================================================================================================\n",
"Pair 1: The cat sat on the mat. The cat sat on the mat. \n" "Pair 1: The cat sat on the mat. The cat sat on the mat. \n",
] " Jaccard: 1.000\n",
}, " Levenshtein (char): 1.000\n",
{ " Levenshtein (word): 1.000\n",
"ename": "TypeError", " Cosine BOW: 1.000\n",
"evalue": "tuple indices must be integers or slices, not dict", " Fuzzy Ratio: 1.000\n",
"output_type": "error", " Fuzzy Partial: 1.000\n",
"traceback": [ " Fuzzy Token Sort: 1.000\n",
"\u001b[31m---------------------------------------------------------------------------\u001b[39m", " Fuzzy Token Set: 1.000\n",
"\u001b[31mTypeError\u001b[39m Traceback (most recent call last)", " LCS: 1.000\n",
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[28]\u001b[39m\u001b[32m, line 44\u001b[39m\n\u001b[32m 41\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33m\"\u001b[39m\u001b[33m-\u001b[39m\u001b[33m\"\u001b[39m * \u001b[32m100\u001b[39m)\n\u001b[32m 43\u001b[39m results = evaluate_baseline_methods(test_pairs)\n\u001b[32m---> \u001b[39m\u001b[32m44\u001b[39m \u001b[43mprint_comparison_table\u001b[49m\u001b[43m(\u001b[49m\u001b[43mresults\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtest_pairs\u001b[49m\u001b[43m)\u001b[49m\n", " Containment: 1.000\n",
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[28]\u001b[39m\u001b[32m, line 39\u001b[39m, in \u001b[36mprint_comparison_table\u001b[39m\u001b[34m(results, pairs)\u001b[39m\n\u001b[32m 37\u001b[39m \u001b[38;5;66;03m# Print similarities for this pair\u001b[39;00m\n\u001b[32m 38\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m method_name \u001b[38;5;129;01min\u001b[39;00m results:\n\u001b[32m---> \u001b[39m\u001b[32m39\u001b[39m similarity = \u001b[43mresults\u001b[49m\u001b[43m[\u001b[49m\u001b[43mmethod_name\u001b[49m\u001b[43m]\u001b[49m[i]\n\u001b[32m 40\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[33m'\u001b[39m\u001b[33m'\u001b[39m\u001b[38;5;132;01m:\u001b[39;00m\u001b[33m<40\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mmethod_name\u001b[38;5;250m \u001b[39m+\u001b[38;5;250m \u001b[39m\u001b[33m'\u001b[39m\u001b[33m:\u001b[39m\u001b[33m'\u001b[39m\u001b[38;5;132;01m:\u001b[39;00m\u001b[33m<20\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m \u001b[39m\u001b[38;5;132;01m{\u001b[39;00msimilarity\u001b[38;5;132;01m:\u001b[39;00m\u001b[33m.3f\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m)\n\u001b[32m 41\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33m\"\u001b[39m\u001b[33m-\u001b[39m\u001b[33m\"\u001b[39m * \u001b[32m100\u001b[39m)\n", "----------------------------------------------------------------------------------------------------\n",
"\u001b[31mTypeError\u001b[39m: tuple indices must be integers or slices, not dict" "Pair 2: The cat sat on the mat. The cat sat on the mat \n",
" Jaccard: 0.667\n",
" Levenshtein (char): 0.957\n",
" Levenshtein (word): 0.833\n",
" Cosine BOW: 1.000\n",
" Fuzzy Ratio: 0.978\n",
" Fuzzy Partial: 1.000\n",
" Fuzzy Token Sort: 0.978\n",
" Fuzzy Token Set: 0.973\n",
" LCS: 1.000\n",
" Containment: 1.000\n",
"----------------------------------------------------------------------------------------------------\n",
"Pair 3: The cat sat on the mat. The cat sat on the mat. \n",
" Jaccard: 1.000\n",
" Levenshtein (char): 0.821\n",
" Levenshtein (word): 1.000\n",
" Cosine BOW: 1.000\n",
" Fuzzy Ratio: 0.902\n",
" Fuzzy Partial: 0.826\n",
" Fuzzy Token Sort: 1.000\n",
" Fuzzy Token Set: 1.000\n",
" LCS: 0.815\n",
" Containment: 1.000\n",
"----------------------------------------------------------------------------------------------------\n",
"Pair 4: The cat sat on the mat. On the mat, the cat was sit...\n",
" Jaccard: 0.375\n",
" Levenshtein (char): 0.344\n",
" Levenshtein (word): 0.143\n",
" Cosine BOW: 0.825\n",
" Fuzzy Ratio: 0.509\n",
" Fuzzy Partial: 0.619\n",
" Fuzzy Token Sort: 0.764\n",
" Fuzzy Token Set: 0.723\n",
" LCS: 0.433\n",
" Containment: 0.800\n",
"----------------------------------------------------------------------------------------------------\n",
"Pair 5: The cat sat on the mat. The feline rested on the rug. \n",
" Jaccard: 0.250\n",
" Levenshtein (char): 0.517\n",
" Levenshtein (word): 0.500\n",
" Cosine BOW: 0.625\n",
" Fuzzy Ratio: 0.615\n",
" Fuzzy Partial: 0.605\n",
" Fuzzy Token Sort: 0.538\n",
" Fuzzy Token Set: 0.480\n",
" LCS: 0.536\n",
" Containment: 0.400\n",
"----------------------------------------------------------------------------------------------------\n",
"Pair 6: The quick brown fox jumps. A fast brown fox leaps. \n",
" Jaccard: 0.250\n",
" Levenshtein (char): 0.577\n",
" Levenshtein (word): 0.400\n",
" Cosine BOW: 0.400\n",
" Fuzzy Ratio: 0.612\n",
" Fuzzy Partial: 0.700\n",
" Fuzzy Token Sort: 0.531\n",
" Fuzzy Token Set: 0.562\n",
" LCS: 0.560\n",
" Containment: 0.400\n",
"----------------------------------------------------------------------------------------------------\n",
"Pair 7: The cat sat on the mat. The dog ran in the park. \n",
" Jaccard: 0.111\n",
" Levenshtein (char): 0.625\n",
" Levenshtein (word): 0.333\n",
" Cosine BOW: 0.500\n",
" Fuzzy Ratio: 0.638\n",
" Fuzzy Partial: 0.636\n",
" Fuzzy Token Sort: 0.553\n",
" Fuzzy Token Set: 0.462\n",
" LCS: 0.609\n",
" Containment: 0.200\n",
"----------------------------------------------------------------------------------------------------\n",
"Pair 8: I love programming. She enjoys reading books. \n",
" Jaccard: 0.000\n",
" Levenshtein (char): 0.200\n",
" Levenshtein (word): 0.000\n",
" Cosine BOW: 0.000\n",
" Fuzzy Ratio: 0.409\n",
" Fuzzy Partial: 0.432\n",
" Fuzzy Token Sort: 0.364\n",
" Fuzzy Token Set: 0.364\n",
" LCS: 0.333\n",
" Containment: 0.000\n",
"----------------------------------------------------------------------------------------------------\n",
"Pair 9: The weather is nice today. It's raining outside. \n",
" Jaccard: 0.000\n",
" Levenshtein (char): 0.192\n",
" Levenshtein (word): 0.000\n",
" Cosine BOW: 0.000\n",
" Fuzzy Ratio: 0.426\n",
" Fuzzy Partial: 0.514\n",
" Fuzzy Token Sort: 0.340\n",
" Fuzzy Token Set: 0.340\n",
" LCS: 0.360\n",
" Containment: 0.000\n",
"----------------------------------------------------------------------------------------------------\n",
"Pair 10: Short. Short. \n",
" Jaccard: 1.000\n",
" Levenshtein (char): 1.000\n",
" Levenshtein (word): 1.000\n",
" Cosine BOW: 1.000\n",
" Fuzzy Ratio: 1.000\n",
" Fuzzy Partial: 1.000\n",
" Fuzzy Token Sort: 1.000\n",
" Fuzzy Token Set: 1.000\n",
" LCS: 1.000\n",
" Containment: 1.000\n",
"----------------------------------------------------------------------------------------------------\n",
"Pair 11: A B C D E F G A B C D E F G \n",
" Jaccard: 1.000\n",
" Levenshtein (char): 1.000\n",
" Levenshtein (word): 1.000\n",
" Cosine BOW: 1.000\n",
" Fuzzy Ratio: 1.000\n",
" Fuzzy Partial: 1.000\n",
" Fuzzy Token Sort: 1.000\n",
" Fuzzy Token Set: 1.000\n",
" LCS: 1.000\n",
" Containment: 1.000\n",
"----------------------------------------------------------------------------------------------------\n",
"Pair 12: \n",
" Jaccard: 0.000\n",
" Levenshtein (char): 1.000\n",
" Levenshtein (word): 1.000\n",
" Cosine BOW: 1.000\n",
" Fuzzy Ratio: 1.000\n",
" Fuzzy Partial: 1.000\n",
" Fuzzy Token Sort: 1.000\n",
" Fuzzy Token Set: 0.000\n",
" LCS: 0.000\n",
" Containment: 0.000\n",
"----------------------------------------------------------------------------------------------------\n"
] ]
} }
], ],
@@ -538,23 +629,23 @@
"\n", "\n",
"def print_comparison_table(results, pairs):\n", "def print_comparison_table(results, pairs):\n",
" \"\"\" Print a formatted comparison table \"\"\"\n", " \"\"\" Print a formatted comparison table \"\"\"\n",
" print(f\"{'Pair':<40} {'Sentence 1':<30} {'Sentence 2':<30}\")\n", " print(f\"{'Pair':<30} {'Sentence 1':<30} {'Sentence 2':<30}\")\n",
" print(\"=\" * 100)\n", " print(\"=\" * 100)\n",
" \n", " \n",
" for i, (sent1, sent2) in enumerate(pairs):\n", " for i, (sent1, sent2) in enumerate(pairs):\n",
" # Truncate long sentences for display\n", " # if sentence too long\n",
" display_sent1 = sent1[:27] + \"...\" if len(sent1) > 30 else sent1\n", " display_sent1 = sent1[:27] + \"...\" if len(sent1) > 30 else sent1\n",
" display_sent2 = sent2[:27] + \"...\" if len(sent2) > 30 else sent2\n", " display_sent2 = sent2[:27] + \"...\" if len(sent2) > 30 else sent2\n",
" \n", " \n",
" print(f\"{f'Pair {i+1}:':<40} {display_sent1:<30} {display_sent2:<30}\")\n", " print(f\"{f'Pair {i+1}:':<30} {display_sent1:<30} {display_sent2:<30}\")\n",
" \n", " \n",
" # Print similarities for this pair\n", " # Print similarities for this pair\n",
" for method_name in results:\n", " for method_name in results:\n",
" similarity = results[method_name][i]\n", " similarity = results[method_name][i]\n",
" print(f\"{'':<40} {method_name + ':':<20} {similarity:.3f}\")\n", " print(f\"{'':<30} {method_name + ':':<20} {similarity:.3f}\")\n",
" print(\"-\" * 100)\n", " print(\"-\" * 100)\n",
"\n", "\n",
"results = evaluate_baseline_methods(test_pairs)\n", "results, methods = evaluate_baseline_methods(test_pairs)\n",
"print_comparison_table(results, test_pairs)" "print_comparison_table(results, test_pairs)"
] ]
} }