{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "jaccard-intro",
   "metadata": {},
   "source": [
    "# Jaccard similarity between sentences\n",
    "\n",
    "Word-level Jaccard similarity: the number of distinct words two sentences share, divided by the number of distinct words that appear in either. Words are compared case-insensitively and without surrounding punctuation."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "jaccard-imports",
   "metadata": {},
   "outputs": [],
   "source": [
    "import importlib\n",
    "import string\n",
    "\n",
    "import import_ipynb  # noqa: F401 - imported for its side effect: installs the .ipynb import hook\n",
    "\n",
    "# The notebook name starts with a digit, so it is not a valid identifier and\n",
    "# `from notebooks.01_data_exploration import *` is a SyntaxError. Import it by\n",
    "# string name instead and keep it behind a single module object.\n",
    "data_exploration = importlib.import_module(\"notebooks.01_data_exploration\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e60d024e969254a",
   "metadata": {},
   "outputs": [],
   "source": [
    "def _word_set(sentence: str) -> set:\n",
    "    \"\"\"Return the distinct lower-cased words of ``sentence``.\n",
    "\n",
    "    Leading and trailing punctuation is stripped from every whitespace-separated\n",
    "    token so that \"mat.\" and \"mat\" count as the same word; tokens that consist\n",
    "    only of punctuation are dropped.\n",
    "    \"\"\"\n",
    "    tokens = (token.strip(string.punctuation) for token in sentence.lower().split())\n",
    "    return {token for token in tokens if token}\n",
    "\n",
    "\n",
    "def jaccard_similarity(sent1: str, sent2: str) -> float:\n",
    "    \"\"\"Return the Jaccard similarity of the word sets of two sentences.\n",
    "\n",
    "    Args:\n",
    "        sent1: First sentence.\n",
    "        sent2: Second sentence.\n",
    "\n",
    "    Returns:\n",
    "        ``len(A & B) / len(A | B)`` where ``A`` and ``B`` are the sets of distinct\n",
    "        words in each sentence, as a float in ``[0.0, 1.0]``. Returns ``0.0``\n",
    "        when neither sentence contains a word, which avoids dividing by zero.\n",
    "    \"\"\"\n",
    "    words1 = _word_set(sent1)\n",
    "    words2 = _word_set(sent2)\n",
    "    union = words1 | words2\n",
    "    if not union:\n",
    "        return 0.0\n",
    "    return len(words1 & words2) / len(union)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "jaccard-demo",
   "metadata": {},
   "outputs": [],
   "source": [
    "test_pairs = [\n",
    "    (\"The cat sat on the mat.\", \"The cat sat on the mat.\"),  # Copy\n",
    "    (\"The cat sat on the mat.\", \"On the mat sat the cat.\"),  # Same words rearranged\n",
    "    (\"The cat sat on the mat.\", \"The dog ran in the park\"),  # Different\n",
    "]\n",
    "\n",
    "for sent1, sent2 in test_pairs:\n",
    "    similarity = jaccard_similarity(sent1, sent2)\n",
    "    print(f\"'{sent1}' vs '{sent2}': {similarity:.3f}\")  # 3 decimal places\n",
    "\n",
    "# Neither word order nor punctuation may change the score.\n",
    "assert jaccard_similarity(*test_pairs[0]) == 1.0\n",
    "assert jaccard_similarity(*test_pairs[1]) == 1.0"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "name": "python3",
   "language": "python",
   "display_name": "Python 3 (ipykernel)"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}