diff --git a/notebooks/02_baseline_experiments.ipynb b/notebooks/02_baseline_experiments.ipynb index f4a5c48..b75fa67 100644 --- a/notebooks/02_baseline_experiments.ipynb +++ b/notebooks/02_baseline_experiments.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 3, + "execution_count": 1, "id": "d2aa2997", "metadata": {}, "outputs": [ @@ -56,7 +56,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 58, "id": "e60d024e969254a", "metadata": { "ExecuteTime": { @@ -69,30 +69,20 @@ "name": "stdout", "output_type": "stream", "text": [ - "'The cat sat on the mat.' vs 'The cat sat on the mat.': 1.000\n", - "--------------------------------------------------\n", - "'The cat sat on the mat.' vs 'The cat sat on the mat': 0.667\n", - "--------------------------------------------------\n", - "'The cat sat on the mat.' vs 'The cat sat on the mat.': 1.000\n", - "--------------------------------------------------\n", - "'The cat sat on the mat.' vs 'On the mat, the cat was sitting.': 0.375\n", - "--------------------------------------------------\n", - "'The cat sat on the mat.' vs 'The feline rested on the rug.': 0.250\n", - "--------------------------------------------------\n", - "'The quick brown fox jumps.' vs 'A fast brown fox leaps.': 0.250\n", - "--------------------------------------------------\n", - "'The cat sat on the mat.' vs 'The dog ran in the park.': 0.111\n", - "--------------------------------------------------\n", - "'I love programming.' vs 'She enjoys reading books.': 0.000\n", - "--------------------------------------------------\n", - "'The weather is nice today.' vs 'It's raining outside.': 0.000\n", - "--------------------------------------------------\n", - "'Short.' 
# Table header. "Sentence 1" is padded to 41 so that the second heading starts
# in the same column as the second sentence of each row below
# (30 for the sentence + 1 space + 10 for the "vs" gutter).
print(
    f"{'Sentence 1':<41} "
    f"{'Sentence 2:':<40} "
    f"{'Similarity Score:'}"
)
print("=" * 100)

# One left-aligned row per sentence pair; the Jaccard score is rendered to
# 3 decimal places.
for sent1, sent2 in test_pairs:
    similarity = jaccard_similarity(sent1, sent2)
    print(
        f"{sent1:<30} "
        f"{'vs':<10} "
        f"{sent2:<40} "
        f"{similarity:.3f}"
    )
vs 'The cat sat on the mat.':\n", - " Char similarity: 1.000 --- Word similarity: 1.000\n", - "--------------------------------------------------\n", - "'The cat sat on the mat.' vs 'The cat sat on the mat':\n", - " Char similarity: 0.957 --- Word similarity: 0.833\n", - "--------------------------------------------------\n", - "'The cat sat on the mat.' vs 'The cat sat on the mat.':\n", - " Char similarity: 0.821 --- Word similarity: 1.000\n", - "--------------------------------------------------\n", - "'The cat sat on the mat.' vs 'On the mat, the cat was sitting.':\n", - " Char similarity: 0.344 --- Word similarity: 0.143\n", - "--------------------------------------------------\n", - "'The cat sat on the mat.' vs 'The feline rested on the rug.':\n", - " Char similarity: 0.517 --- Word similarity: 0.500\n", - "--------------------------------------------------\n", - "'The quick brown fox jumps.' vs 'A fast brown fox leaps.':\n", - " Char similarity: 0.577 --- Word similarity: 0.400\n", - "--------------------------------------------------\n", - "'The cat sat on the mat.' vs 'The dog ran in the park.':\n", - " Char similarity: 0.625 --- Word similarity: 0.333\n", - "--------------------------------------------------\n", - "'I love programming.' vs 'She enjoys reading books.':\n", - " Char similarity: 0.200 --- Word similarity: 0.000\n", - "--------------------------------------------------\n", - "'The weather is nice today.' vs 'It's raining outside.':\n", - " Char similarity: 0.192 --- Word similarity: 0.000\n", - "--------------------------------------------------\n", - "'Short.' 
vs 'Short.':\n", - " Char similarity: 1.000 --- Word similarity: 1.000\n", - "--------------------------------------------------\n", - "'A B C D E F G' vs 'A B C D E F G':\n", - " Char similarity: 1.000 --- Word similarity: 1.000\n", - "--------------------------------------------------\n", - "'' vs '':\n", - " Char similarity: 1.000 --- Word similarity: 1.000\n", - "--------------------------------------------------\n" + "Sentence 1 Sentence 2: Similarities -> Char: Word:\n", + "====================================================================================================\n", + "The cat sat on the mat. vs The cat sat on the mat. 1.000 1.000 \n", + "The cat sat on the mat. vs The cat sat on the mat 0.957 0.833 \n", + "The cat sat on the mat. vs The cat sat on the mat. 0.821 1.000 \n", + "The cat sat on the mat. vs On the mat, the cat was sitting. 0.344 0.143 \n", + "The cat sat on the mat. vs The feline rested on the rug. 0.517 0.500 \n", + "The quick brown fox jumps. vs A fast brown fox leaps. 0.577 0.400 \n", + "The cat sat on the mat. vs The dog ran in the park. 0.625 0.333 \n", + "I love programming. vs She enjoys reading books. 0.200 0.000 \n", + "The weather is nice today. vs It's raining outside. 0.192 0.000 \n", + "Short. vs Short. 
# Column headings. The first heading is 41 wide so that "Sentence 2:" starts in
# the same column as the second sentence of each data row (30 + 1 + 10), and
# the 20 + 19 wide middle headings place "Char:" above the first score.
print(f"{'Sentence 1':<41} {'Sentence 2:':<20} {'Similarities ->':<19} {'Char:':<10} {'Word:'}")
print("=" * 100)

# One row per sentence pair: the character-level score followed by the
# word-level score, both to 3 decimal places.
for sent1, sent2 in test_pairs:
    char_similarity = char_levenshtein_similarity(sent1, sent2)
    word_similarity = word_levenshtein_similarity(sent1, sent2)
    # Both scores use lowercase "f" (the word score previously used "F", which
    # only differs in spelling nan/inf as NAN/INF) so the two columns and the
    # other tables in this notebook are formatted the same way.
    # The {'':<5} filler pads the 5-character score out to the "Char:" column width.
    print(f"{sent1:<30} {'vs':<10} {sent2:<40} {char_similarity:.3f}{'':<5} {word_similarity:.3f} ")
vs 'The dog ran in the park.': 0.500\n", - "--------------------------------------------------\n", - "'I love programming.' vs 'She enjoys reading books.': 0.000\n", - "--------------------------------------------------\n", - "'The weather is nice today.' vs 'It's raining outside.': 0.000\n", - "--------------------------------------------------\n", - "'Short.' vs 'Short.': 1.000\n", - "--------------------------------------------------\n", - "'A B C D E F G' vs 'A B C D E F G': 1.000\n", - "--------------------------------------------------\n", - "'' vs '': 1.000\n", - "--------------------------------------------------\n" + "Sentence 1 Sentence 2: Similarity Score:\n", + "====================================================================================================\n", + "The cat sat on the mat. vs The cat sat on the mat. 1.000\n", + "The cat sat on the mat. vs The cat sat on the mat 1.000\n", + "The cat sat on the mat. vs The cat sat on the mat. 1.000\n", + "The cat sat on the mat. vs On the mat, the cat was sitting. 0.825\n", + "The cat sat on the mat. vs The feline rested on the rug. 0.625\n", + "The quick brown fox jumps. vs A fast brown fox leaps. 0.400\n", + "The cat sat on the mat. vs The dog ran in the park. 0.500\n", + "I love programming. vs She enjoys reading books. 0.000\n", + "The weather is nice today. vs It's raining outside. 0.000\n", + "Short. vs Short. 
# Table header. "Sentence 1" is padded to 41 so that the second heading starts
# in the same column as the second sentence of each row below
# (30 for the sentence + 1 space + 10 for the "vs" gutter).
print(
    f"{'Sentence 1':<41} "
    f"{'Sentence 2:':<40} "
    f"{'Similarity Score:'}"
)
print("=" * 100)

# One left-aligned row per sentence pair; the bag-of-words cosine score is
# rendered to 3 decimal places.
for sent1, sent2 in test_pairs:
    similarity = cosine_similarity_bow(sent1, sent2)
    print(
        f"{sent1:<30} "
        f"{'vs':<10} "
        f"{sent2:<40} "
        f"{similarity:.3f}"
    )
vs 'She enjoys reading books.': 0.333\n", - "--------------------------------------------------\n", - "'The weather is nice today.' vs 'It's raining outside.': 0.360\n", - "--------------------------------------------------\n", - "'Short.' vs 'Short.': 1.000\n", - "--------------------------------------------------\n", - "'A B C D E F G' vs 'A B C D E F G': 1.000\n", - "--------------------------------------------------\n", - "'' vs '': 0.000\n", - "--------------------------------------------------\n" + "Sentence 1 Sentence 2: Similarity Score:\n", + "====================================================================================================\n", + "The cat sat on the mat. vs The cat sat on the mat. 1.000\n", + "The cat sat on the mat. vs The cat sat on the mat 1.000\n", + "The cat sat on the mat. vs The cat sat on the mat. 0.815\n", + "The cat sat on the mat. vs On the mat, the cat was sitting. 0.433\n", + "The cat sat on the mat. vs The feline rested on the rug. 0.536\n", + "The quick brown fox jumps. vs A fast brown fox leaps. 0.560\n", + "The cat sat on the mat. vs The dog ran in the park. 0.609\n", + "I love programming. vs She enjoys reading books. 0.333\n", + "The weather is nice today. vs It's raining outside. 0.360\n", + "Short. vs Short. 
# Table header. "Sentence 1" is padded to 41 so that the second heading starts
# in the same column as the second sentence of each row below
# (30 for the sentence + 1 space + 10 for the "vs" gutter).
print(
    f"{'Sentence 1':<41} "
    f"{'Sentence 2:':<40} "
    f"{'Similarity Score:'}"
)
print("=" * 100)

# One left-aligned row per sentence pair; the value returned by
# longest_common_subsequence is rendered to 3 decimal places.
for sent1, sent2 in test_pairs:
    similarity = longest_common_subsequence(sent1, sent2)
    print(
        f"{sent1:<30} "
        f"{'vs':<10} "
        f"{sent2:<40} "
        f"{similarity:.3f}"
    )
\n" - ] - }, - { - "ename": "TypeError", - "evalue": "tuple indices must be integers or slices, not dict", - "output_type": "error", - "traceback": [ - "\u001b[31m---------------------------------------------------------------------------\u001b[39m", - "\u001b[31mTypeError\u001b[39m Traceback (most recent call last)", - "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[28]\u001b[39m\u001b[32m, line 44\u001b[39m\n\u001b[32m 41\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33m\"\u001b[39m\u001b[33m-\u001b[39m\u001b[33m\"\u001b[39m * \u001b[32m100\u001b[39m)\n\u001b[32m 43\u001b[39m results = evaluate_baseline_methods(test_pairs)\n\u001b[32m---> \u001b[39m\u001b[32m44\u001b[39m \u001b[43mprint_comparison_table\u001b[49m\u001b[43m(\u001b[49m\u001b[43mresults\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtest_pairs\u001b[49m\u001b[43m)\u001b[49m\n", - "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[28]\u001b[39m\u001b[32m, line 39\u001b[39m, in \u001b[36mprint_comparison_table\u001b[39m\u001b[34m(results, pairs)\u001b[39m\n\u001b[32m 37\u001b[39m \u001b[38;5;66;03m# Print similarities for this pair\u001b[39;00m\n\u001b[32m 38\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m method_name \u001b[38;5;129;01min\u001b[39;00m results:\n\u001b[32m---> \u001b[39m\u001b[32m39\u001b[39m similarity = \u001b[43mresults\u001b[49m\u001b[43m[\u001b[49m\u001b[43mmethod_name\u001b[49m\u001b[43m]\u001b[49m[i]\n\u001b[32m 40\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[33m'\u001b[39m\u001b[33m'\u001b[39m\u001b[38;5;132;01m:\u001b[39;00m\u001b[33m<40\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mmethod_name\u001b[38;5;250m \u001b[39m+\u001b[38;5;250m \u001b[39m\u001b[33m'\u001b[39m\u001b[33m:\u001b[39m\u001b[33m'\u001b[39m\u001b[38;5;132;01m:\u001b[39;00m\u001b[33m<20\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m 
\u001b[39m\u001b[38;5;132;01m{\u001b[39;00msimilarity\u001b[38;5;132;01m:\u001b[39;00m\u001b[33m.3f\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m)\n\u001b[32m 41\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33m\"\u001b[39m\u001b[33m-\u001b[39m\u001b[33m\"\u001b[39m * \u001b[32m100\u001b[39m)\n", - "\u001b[31mTypeError\u001b[39m: tuple indices must be integers or slices, not dict" + "Pair 1: The cat sat on the mat. The cat sat on the mat. \n", + " Jaccard: 1.000\n", + " Levenshtein (char): 1.000\n", + " Levenshtein (word): 1.000\n", + " Cosine BOW: 1.000\n", + " Fuzzy Ratio: 1.000\n", + " Fuzzy Partial: 1.000\n", + " Fuzzy Token Sort: 1.000\n", + " Fuzzy Token Set: 1.000\n", + " LCS: 1.000\n", + " Containment: 1.000\n", + "----------------------------------------------------------------------------------------------------\n", + "Pair 2: The cat sat on the mat. The cat sat on the mat \n", + " Jaccard: 0.667\n", + " Levenshtein (char): 0.957\n", + " Levenshtein (word): 0.833\n", + " Cosine BOW: 1.000\n", + " Fuzzy Ratio: 0.978\n", + " Fuzzy Partial: 1.000\n", + " Fuzzy Token Sort: 0.978\n", + " Fuzzy Token Set: 0.973\n", + " LCS: 1.000\n", + " Containment: 1.000\n", + "----------------------------------------------------------------------------------------------------\n", + "Pair 3: The cat sat on the mat. The cat sat on the mat. \n", + " Jaccard: 1.000\n", + " Levenshtein (char): 0.821\n", + " Levenshtein (word): 1.000\n", + " Cosine BOW: 1.000\n", + " Fuzzy Ratio: 0.902\n", + " Fuzzy Partial: 0.826\n", + " Fuzzy Token Sort: 1.000\n", + " Fuzzy Token Set: 1.000\n", + " LCS: 0.815\n", + " Containment: 1.000\n", + "----------------------------------------------------------------------------------------------------\n", + "Pair 4: The cat sat on the mat. 
On the mat, the cat was sit...\n", + " Jaccard: 0.375\n", + " Levenshtein (char): 0.344\n", + " Levenshtein (word): 0.143\n", + " Cosine BOW: 0.825\n", + " Fuzzy Ratio: 0.509\n", + " Fuzzy Partial: 0.619\n", + " Fuzzy Token Sort: 0.764\n", + " Fuzzy Token Set: 0.723\n", + " LCS: 0.433\n", + " Containment: 0.800\n", + "----------------------------------------------------------------------------------------------------\n", + "Pair 5: The cat sat on the mat. The feline rested on the rug. \n", + " Jaccard: 0.250\n", + " Levenshtein (char): 0.517\n", + " Levenshtein (word): 0.500\n", + " Cosine BOW: 0.625\n", + " Fuzzy Ratio: 0.615\n", + " Fuzzy Partial: 0.605\n", + " Fuzzy Token Sort: 0.538\n", + " Fuzzy Token Set: 0.480\n", + " LCS: 0.536\n", + " Containment: 0.400\n", + "----------------------------------------------------------------------------------------------------\n", + "Pair 6: The quick brown fox jumps. A fast brown fox leaps. \n", + " Jaccard: 0.250\n", + " Levenshtein (char): 0.577\n", + " Levenshtein (word): 0.400\n", + " Cosine BOW: 0.400\n", + " Fuzzy Ratio: 0.612\n", + " Fuzzy Partial: 0.700\n", + " Fuzzy Token Sort: 0.531\n", + " Fuzzy Token Set: 0.562\n", + " LCS: 0.560\n", + " Containment: 0.400\n", + "----------------------------------------------------------------------------------------------------\n", + "Pair 7: The cat sat on the mat. The dog ran in the park. \n", + " Jaccard: 0.111\n", + " Levenshtein (char): 0.625\n", + " Levenshtein (word): 0.333\n", + " Cosine BOW: 0.500\n", + " Fuzzy Ratio: 0.638\n", + " Fuzzy Partial: 0.636\n", + " Fuzzy Token Sort: 0.553\n", + " Fuzzy Token Set: 0.462\n", + " LCS: 0.609\n", + " Containment: 0.200\n", + "----------------------------------------------------------------------------------------------------\n", + "Pair 8: I love programming. She enjoys reading books. 
\n", + " Jaccard: 0.000\n", + " Levenshtein (char): 0.200\n", + " Levenshtein (word): 0.000\n", + " Cosine BOW: 0.000\n", + " Fuzzy Ratio: 0.409\n", + " Fuzzy Partial: 0.432\n", + " Fuzzy Token Sort: 0.364\n", + " Fuzzy Token Set: 0.364\n", + " LCS: 0.333\n", + " Containment: 0.000\n", + "----------------------------------------------------------------------------------------------------\n", + "Pair 9: The weather is nice today. It's raining outside. \n", + " Jaccard: 0.000\n", + " Levenshtein (char): 0.192\n", + " Levenshtein (word): 0.000\n", + " Cosine BOW: 0.000\n", + " Fuzzy Ratio: 0.426\n", + " Fuzzy Partial: 0.514\n", + " Fuzzy Token Sort: 0.340\n", + " Fuzzy Token Set: 0.340\n", + " LCS: 0.360\n", + " Containment: 0.000\n", + "----------------------------------------------------------------------------------------------------\n", + "Pair 10: Short. Short. \n", + " Jaccard: 1.000\n", + " Levenshtein (char): 1.000\n", + " Levenshtein (word): 1.000\n", + " Cosine BOW: 1.000\n", + " Fuzzy Ratio: 1.000\n", + " Fuzzy Partial: 1.000\n", + " Fuzzy Token Sort: 1.000\n", + " Fuzzy Token Set: 1.000\n", + " LCS: 1.000\n", + " Containment: 1.000\n", + "----------------------------------------------------------------------------------------------------\n", + "Pair 11: A B C D E F G A B C D E F G \n", + " Jaccard: 1.000\n", + " Levenshtein (char): 1.000\n", + " Levenshtein (word): 1.000\n", + " Cosine BOW: 1.000\n", + " Fuzzy Ratio: 1.000\n", + " Fuzzy Partial: 1.000\n", + " Fuzzy Token Sort: 1.000\n", + " Fuzzy Token Set: 1.000\n", + " LCS: 1.000\n", + " Containment: 1.000\n", + "----------------------------------------------------------------------------------------------------\n", + "Pair 12: \n", + " Jaccard: 0.000\n", + " Levenshtein (char): 1.000\n", + " Levenshtein (word): 1.000\n", + " Cosine BOW: 1.000\n", + " Fuzzy Ratio: 1.000\n", + " Fuzzy Partial: 1.000\n", + " Fuzzy Token Sort: 1.000\n", + " Fuzzy Token Set: 0.000\n", + " LCS: 0.000\n", + " Containment: 
def _truncate_for_display(text, width):
    """Return text cut to at most width characters, ending in '...' when shortened."""
    if len(text) <= width:
        return text
    if width <= 3:
        # No room for an ellipsis; hard-cut rather than overflow the column.
        return text[:width]
    return text[: width - 3] + "..."


def print_comparison_table(results, pairs, col_width=30, rule_width=100):
    """Print every method's similarity score for each sentence pair.

    For each pair a heading row ("Pair N:", sentence 1, sentence 2) is printed,
    followed by one indented "<method>: <score>" row per method and a dashed
    rule.

    Args:
        results: Mapping of method name -> sequence of scores, where position
            i holds the score for pairs[i].
        pairs: Sequence of (sentence_1, sentence_2) string tuples.
        col_width: Width of the "Pair", "Sentence 1" and "Sentence 2" columns.
            Sentences longer than this are shortened and end in "...".
            Defaults to 30.
        rule_width: Length of the "=" and "-" separator lines. Defaults to 100.

    Raises:
        TypeError: If results is a tuple or list instead of a mapping, e.g.
            when a (results, methods) return value is passed without being
            unpacked. Raised before anything is printed.
    """
    if isinstance(results, (tuple, list)):
        raise TypeError(
            "results must map method names to score lists, got "
            f"{type(results).__name__}; unpack it first "
            "(e.g. `results, methods = ...`)"
        )

    print(f"{'Pair':<{col_width}} {'Sentence 1':<{col_width}} {'Sentence 2':<{col_width}}")
    print("=" * rule_width)

    for i, (sent1, sent2) in enumerate(pairs):
        # Shorten sentences that would overflow their column so rows stay aligned.
        display_sent1 = _truncate_for_display(sent1, col_width)
        display_sent2 = _truncate_for_display(sent2, col_width)
        pair_label = f"Pair {i+1}:"

        print(f"{pair_label:<{col_width}} {display_sent1:<{col_width}} {display_sent2:<{col_width}}")

        # One row per method, indented under the "Pair" column.
        for method_name, scores in results.items():
            method_label = f"{method_name}:"
            print(f"{'':<{col_width}} {method_label:<20} {scores[i]:.3f}")
        print("-" * rule_width)