{ "cells": [ { "cell_type": "markdown", "id": "dd72d1539056a64", "metadata": {}, "source": [ "Import Cell\n" ] }, { "cell_type": "code", "id": "12579bf734bb1a92", "metadata": { "ExecuteTime": { "end_time": "2025-12-14T01:12:41.942156089Z", "start_time": "2025-12-14T01:12:31.060344929Z" } }, "source": [ "import spacy\n", "import pandas as pd\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "from collections import Counter\n", "import string\n", "from IPython.display import display, HTML\n", "\n", "nlp_lg = spacy.load(\"en_core_web_lg\")\n", "nlp_trf = spacy.load(\"en_core_web_trf\")\n", "\n", "nlp = nlp_lg\n", "\n", "# Test sentences covering different similarity types\n", "test_sentences = [\n", " # Direct copies and near-copies\n", " \"The cat sat on the mat.\",\n", " \"The cat sat on the mat\",\n", " \"The cat sat on the mat.\",\n", " \n", " # Paraphrases\n", " \"On the mat, the cat was sitting.\",\n", " \"The feline rested on the rug.\",\n", " \n", " # Structural changes\n", " \"The quick brown fox jumps over the lazy dog.\",\n", " \"Over the lazy dog jumps the quick brown fox.\",\n", " \n", " # Different content\n", " \"The dog ran in the park.\",\n", " \"I love programming.\",\n", " \"She enjoys reading books.\",\n", " \n", " # Edge cases\n", " \"Short.\",\n", " \"A B C D E F G\",\n", " \"\"\n", "]\n", "\n", "print(f\"Loaded {len(test_sentences)} test sentences\")" ], "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Loaded 13 test sentences\n" ] } ], "execution_count": 1 }, { "cell_type": "markdown", "id": "1c26616777253f10", "metadata": {}, "source": [] }, { "cell_type": "code", "id": "e003ac06a58cfbb4", "metadata": { "ExecuteTime": { "end_time": "2025-12-14T01:12:42.630828162Z", "start_time": "2025-12-14T01:12:41.953456434Z" } }, "source": [ "def analyze_text_statistics(sentences):\n", " stats = []\n", " for i, sent in enumerate(sentences):\n", " doc = nlp(sent)\n", " words = [token.text for token in doc if not token.is_space]\n", " alpha_words = [token.text for token in doc if token.is_alpha]\n", " \n", " stats.append({\n", " 'sentence_id': i,\n", " 'sentence': sent,\n", " 'char_length': len(sent),\n", " 'word_count': len(words),\n", " 'alpha_word_count': len(alpha_words),\n", " 'avg_word_length': np.mean([len(word) for word in alpha_words]) if alpha_words else 0,\n", " 'has_punctuation': any(token.is_punct for token in doc),\n", " 'is_empty': len(sent.strip()) == 0\n", " })\n", " \n", " return pd.DataFrame(stats)\n", "\n", "stats_df = analyze_text_statistics(test_sentences)\n", "display(stats_df)\n", "\n", "# Visualize basic statistics\n", "fig, axes = plt.subplots(2, 2, figsize=(12, 8))\n", "\n", "# Character length distribution\n", "axes[0,0].bar(range(len(stats_df)), stats_df['char_length'])\n", "axes[0,0].set_title('Character Length by Sentence')\n", "axes[0,0].set_xlabel('Sentence ID')\n", "axes[0,0].set_ylabel('Characters')\n", "\n", "# Word count distribution\n", "axes[0,1].bar(range(len(stats_df)), stats_df['word_count'])\n", "axes[0,1].set_title('Word Count by Sentence')\n", "axes[0,1].set_xlabel('Sentence ID')\n", "axes[0,1].set_ylabel('Words')\n", "\n", "# Average word length\n", "axes[1,0].bar(range(len(stats_df)), stats_df['avg_word_length'])\n", "axes[1,0].set_title('Average Word Length')\n", "axes[1,0].set_xlabel('Sentence ID')\n", "axes[1,0].set_ylabel('Characters')\n", "\n", "# Sentence type breakdown\n", "types = ['With Punctuation' if x else 'No Punctuation' for x in stats_df['has_punctuation']]\n", "type_counts = pd.Series(types).value_counts()\n", "axes[1,1].pie(type_counts.values, labels=type_counts.index, autopct='%1.1f%%')\n", "axes[1,1].set_title('Punctuation Distribution')\n", "\n", "plt.tight_layout()\n", "plt.show()" ], "outputs": [ { "data": { "text/plain": [ " sentence_id sentence char_length \\\n", "0 0 The cat sat on the mat. 23 \n", "1 1 The cat sat on the mat 22 \n", "2 2 The cat sat on the mat. 28 \n", "3 3 On the mat, the cat was sitting. 32 \n", "4 4 The feline rested on the rug. 29 \n", "5 5 The quick brown fox jumps over the lazy dog. 44 \n", "6 6 Over the lazy dog jumps the quick brown fox. 44 \n", "7 7 The dog ran in the park. 24 \n", "8 8 I love programming. 19 \n", "9 9 She enjoys reading books. 25 \n", "10 10 Short. 6 \n", "11 11 A B C D E F G 13 \n", "12 12 0 \n", "\n", " word_count alpha_word_count avg_word_length has_punctuation is_empty \n", "0 7 6 2.833333 True False \n", "1 6 6 2.833333 False False \n", "2 7 6 2.833333 True False \n", "3 9 7 3.428571 True False \n", "4 7 6 3.833333 True False \n", "5 10 9 3.888889 True False \n", "6 10 9 3.888889 True False \n", "7 7 6 3.000000 True False \n", "8 4 3 5.333333 True False \n", "9 5 4 5.250000 True False \n", "10 2 1 5.000000 True False \n", "11 7 7 1.000000 False False \n", "12 0 0 0.000000 False True " ], "text/html": [ "
| \n", " | sentence_id | \n", "sentence | \n", "char_length | \n", "word_count | \n", "alpha_word_count | \n", "avg_word_length | \n", "has_punctuation | \n", "is_empty | \n", "
|---|---|---|---|---|---|---|---|---|
| 0 | \n", "0 | \n", "The cat sat on the mat. | \n", "23 | \n", "7 | \n", "6 | \n", "2.833333 | \n", "True | \n", "False | \n", "
| 1 | \n", "1 | \n", "The cat sat on the mat | \n", "22 | \n", "6 | \n", "6 | \n", "2.833333 | \n", "False | \n", "False | \n", "
| 2 | \n", "2 | \n", "The cat sat on the mat. | \n", "28 | \n", "7 | \n", "6 | \n", "2.833333 | \n", "True | \n", "False | \n", "
| 3 | \n", "3 | \n", "On the mat, the cat was sitting. | \n", "32 | \n", "9 | \n", "7 | \n", "3.428571 | \n", "True | \n", "False | \n", "
| 4 | \n", "4 | \n", "The feline rested on the rug. | \n", "29 | \n", "7 | \n", "6 | \n", "3.833333 | \n", "True | \n", "False | \n", "
| 5 | \n", "5 | \n", "The quick brown fox jumps over the lazy dog. | \n", "44 | \n", "10 | \n", "9 | \n", "3.888889 | \n", "True | \n", "False | \n", "
| 6 | \n", "6 | \n", "Over the lazy dog jumps the quick brown fox. | \n", "44 | \n", "10 | \n", "9 | \n", "3.888889 | \n", "True | \n", "False | \n", "
| 7 | \n", "7 | \n", "The dog ran in the park. | \n", "24 | \n", "7 | \n", "6 | \n", "3.000000 | \n", "True | \n", "False | \n", "
| 8 | \n", "8 | \n", "I love programming. | \n", "19 | \n", "4 | \n", "3 | \n", "5.333333 | \n", "True | \n", "False | \n", "
| 9 | \n", "9 | \n", "She enjoys reading books. | \n", "25 | \n", "5 | \n", "4 | \n", "5.250000 | \n", "True | \n", "False | \n", "
| 10 | \n", "10 | \n", "Short. | \n", "6 | \n", "2 | \n", "1 | \n", "5.000000 | \n", "True | \n", "False | \n", "
| 11 | \n", "11 | \n", "A B C D E F G | \n", "13 | \n", "7 | \n", "7 | \n", "1.000000 | \n", "False | \n", "False | \n", "
| 12 | \n", "12 | \n", "\n", " | 0 | \n", "0 | \n", "0 | \n", "0.000000 | \n", "False | \n", "True | \n", "