{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "dd72d1539056a64",
   "metadata": {},
   "source": [
    "Import Cell\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "12579bf734bb1a92",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-11-23T13:53:57.753560Z",
     "start_time": "2025-11-23T13:53:56.325948Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loaded 13 test sentences\n"
     ]
    }
   ],
   "source": [
    "# Standard library\n",
    "import string\n",
    "from collections import Counter\n",
    "\n",
    "# Third-party\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import seaborn as sns\n",
    "import spacy\n",
    "from IPython.display import HTML, display\n",
    "\n",
    "# Load both spaCy pipelines; `nlp` is an alias for the en_core_web_lg one.\n",
    "nlp_lg = spacy.load(\"en_core_web_lg\")\n",
    "nlp_trf = spacy.load(\"en_core_web_trf\")\n",
    "nlp = nlp_lg\n",
    "\n",
    "# Test sentences covering different similarity types.\n",
    "# Each tuple below is one labelled group; the comprehension flattens the\n",
    "# groups in order into a single flat list of strings.\n",
    "test_sentences = [\n",
    "    sentence\n",
    "    for group in (\n",
    "        # Direct copies and near-copies\n",
    "        (\n",
    "            \"The cat sat on the mat.\",\n",
    "            \"The cat sat on the mat\",\n",
    "            \"The cat sat on the mat.\",\n",
    "        ),\n",
    "        # Paraphrases\n",
    "        (\n",
    "            \"On the mat, the cat was sitting.\",\n",
    "            \"The feline rested on the rug.\",\n",
    "        ),\n",
    "        # Structural changes\n",
    "        (\n",
    "            \"The quick brown fox jumps over the lazy dog.\",\n",
    "            \"Over the lazy dog jumps the quick brown fox.\",\n",
    "        ),\n",
    "        # Different content\n",
    "        (\n",
    "            \"The dog ran in the park.\",\n",
    "            \"I love programming.\",\n",
    "            \"She enjoys reading books.\",\n",
    "        ),\n",
    "        # Edge cases (the last entry is the empty string)\n",
    "        (\n",
    "            \"Short.\",\n",
    "            \"A B C D E F G\",\n",
    "            \"\",\n",
    "        ),\n",
    "    )\n",
    "    for sentence in group\n",
    "]\n",
    "\n",
    "print(f\"Loaded {len(test_sentences)} test sentences\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1c26616777253f10",
   "metadata": {},
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "e003ac06a58cfbb4",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-11-23T13:54:12.922343Z",
     "start_time": "2025-11-23T13:54:12.896440Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "
| \n", " | sentence_id | \n", "sentence | \n", "char_length | \n", "word_count | \n", "alpha_word_count | \n", "avg_word_length | \n", "has_punctuation | \n", "is_empty | \n", "
|---|---|---|---|---|---|---|---|---|
| 0 | \n", "0 | \n", "The cat sat on the mat. | \n", "23 | \n", "7 | \n", "6 | \n", "2.833333 | \n", "True | \n", "False | \n", "
| 1 | \n", "1 | \n", "The cat sat on the mat | \n", "22 | \n", "6 | \n", "6 | \n", "2.833333 | \n", "False | \n", "False | \n", "
| 2 | \n", "2 | \n", "The cat sat on the mat. | \n", "28 | \n", "7 | \n", "6 | \n", "2.833333 | \n", "True | \n", "False | \n", "
| 3 | \n", "3 | \n", "On the mat, the cat was sitting. | \n", "32 | \n", "9 | \n", "7 | \n", "3.428571 | \n", "True | \n", "False | \n", "
| 4 | \n", "4 | \n", "The feline rested on the rug. | \n", "29 | \n", "7 | \n", "6 | \n", "3.833333 | \n", "True | \n", "False | \n", "
| 5 | \n", "5 | \n", "The quick brown fox jumps over the lazy dog. | \n", "44 | \n", "10 | \n", "9 | \n", "3.888889 | \n", "True | \n", "False | \n", "
| 6 | \n", "6 | \n", "Over the lazy dog jumps the quick brown fox. | \n", "44 | \n", "10 | \n", "9 | \n", "3.888889 | \n", "True | \n", "False | \n", "
| 7 | \n", "7 | \n", "The dog ran in the park. | \n", "24 | \n", "7 | \n", "6 | \n", "3.000000 | \n", "True | \n", "False | \n", "
| 8 | \n", "8 | \n", "I love programming. | \n", "19 | \n", "4 | \n", "3 | \n", "5.333333 | \n", "True | \n", "False | \n", "
| 9 | \n", "9 | \n", "She enjoys reading books. | \n", "25 | \n", "5 | \n", "4 | \n", "5.250000 | \n", "True | \n", "False | \n", "
| 10 | \n", "10 | \n", "Short. | \n", "6 | \n", "2 | \n", "1 | \n", "5.000000 | \n", "True | \n", "False | \n", "
| 11 | \n", "11 | \n", "A B C D E F G | \n", "13 | \n", "7 | \n", "7 | \n", "1.000000 | \n", "False | \n", "False | \n", "
| 12 | \n", "12 | \n", "\n", " | 0 | \n", "0 | \n", "0 | \n", "0.000000 | \n", "False | \n", "True | \n", "