# --- Imports & test data (cell 1) ---
# NOTE(review): the original also had `import token` (stdlib) — it was never
# used, and the name was immediately shadowed by the `for token in doc` loop
# variables below, so it has been removed.
import spacy
from spacy import displacy
from IPython.display import display, HTML

nlp = spacy.load("en_core_web_md")  # medium-size English pipeline (has word vectors)

test_sentences = [
    "The cat sat on the mat.",
    "On the mat, the cat was sitting.",
    "A completely different sentence about something else."
]

# --- Tokenization preview (cell 2) ---
# Keep punctuation for direct-copy detection; it is stripped later only for
# the semantic/keyword-based methods.
for sent in test_sentences:
    doc = nlp(sent)
    print(f"Sentence: {sent}")
    print(f"Tokens: {[token.text for token in doc]}")
    print("---")

# --- Preprocessing strategies (cell 3) ---
class TextPreprocessor:
    """Task-specific text normalizations for plagiarism-style comparison.

    Three strategies, ordered from most to least surface-preserving:
      * direct_detection   — lowercase + strip only (punctuation kept),
                             for exact/direct-copy matching.
      * semantic_analysis  — content-word lemmas only (stopwords, punctuation,
                             non-alphabetic tokens dropped), for semantic
                             similarity.
      * syntactic_analysis — every token kept in order; punctuation preserved
                             verbatim, all other tokens lemmatized and
                             lowercased, for structural similarity.
    """

    def __init__(self, nlp_model=None):
        """Create a preprocessor.

        Parameters
        ----------
        nlp_model : spacy.Language, optional
            An already-loaded spaCy pipeline to reuse. Loading the model is
            slow, so callers (e.g. this notebook, which already has a
            module-level `nlp`) can pass it in; when omitted, the model is
            loaded fresh for backward compatibility.
        """
        self.nlp = nlp_model if nlp_model is not None else spacy.load("en_core_web_md")

    @staticmethod
    def direct_detection(text):
        """Normalize for direct-copy detection: lowercase and strip outer
        whitespace, keeping punctuation intact."""
        return text.lower().strip()

    def semantic_analysis(self, text):
        """Reduce `text` to space-joined, lowercased lemmas of content words.

        Drops punctuation, whitespace tokens, non-alphabetic tokens, and
        stopwords — suitable for semantic-similarity comparison.
        """
        doc = self.nlp(text)
        kept = [
            tok.lemma_.lower()
            for tok in doc
            if not tok.is_punct and not tok.is_space and tok.is_alpha and not tok.is_stop
        ]
        return " ".join(kept)

    def syntactic_analysis(self, text):
        """Normalize `text` while preserving sentence structure.

        Punctuation tokens are kept verbatim; every other token (stopword or
        content word alike) is lemmatized and lowercased. Whitespace tokens
        are skipped. Returns the tokens space-joined.
        """
        doc = self.nlp(text)
        processed_tokens = []
        for tok in doc:
            if tok.is_space:
                continue
            if tok.is_punct:
                processed_tokens.append(tok.text)  # keep punctuation as-is
            else:
                # The original had separate (identical) branches for stopwords
                # and content words; both lemmatize + lowercase, so they are
                # collapsed into one.
                processed_tokens.append(tok.lemma_.lower())
        return " ".join(processed_tokens)


# Reuse the module-level pipeline instead of loading the model a second time.
preprocessor = TextPreprocessor(nlp)

processed_direct = []
processed_semantic = []
processed_syntactic = []

for sentence in test_sentences:
    print("-" * 50)
    print(f"Sentence: {sentence}")
    direct = preprocessor.direct_detection(sentence)
    processed_direct.append(direct)
    print("--- Direct Sentence ---")
    print(f"{direct}")
    semantic = preprocessor.semantic_analysis(sentence)
    processed_semantic.append(semantic)
    print("--- Semantic Sentence ---")
    print(f"{semantic}")
    syntactic = preprocessor.syntactic_analysis(sentence)
    processed_syntactic.append(syntactic)
    print("--- Syntactic Sentence ---")
    print(f"{syntactic}")

print("-" * 50)
# --- Dependency parse inspection (cell 4) ---
def extract_parse_tree(text):
    """Parse `text` with the module-level `nlp` pipeline and print its
    dependency parse as a table: token, dependency label, head token, and
    the token's immediate children — one row per token.

    Parameters
    ----------
    text : str
        Sentence (here: a syntactically-normalized sentence) to parse.

    Returns
    -------
    spacy.tokens.Doc
        The parsed document, so callers can inspect it further.
    """
    doc = nlp(text)

    print(f"Sentence: {text}")
    # Fixed typo: the header previously read "Dependenct Parse Tree".
    print("\nDependency Parse Tree:")
    print("-" * 50)

    for token in doc:
        print(
            f"{token.text:<12} {token.dep_:<12} {token.head.text:<12} "
            f"{[child.text for child in token.children]}"
        )

    return doc


for sentence in processed_syntactic:
    doc = extract_parse_tree(sentence)
    print("\n" + "=" * 60 + "\n")
# --- Parse-tree visualization (cell 5) ---
def visualize_parse_tree(text):
    """Render the dependency parse of `text` as inline SVG using displaCy.

    `jupyter=False` makes displacy.render return the raw markup instead of
    displaying it itself, so we control the display call explicitly.
    """
    doc = nlp(text)
    html = displacy.render(doc, style="dep", jupyter=False, options={"distance": 100})
    display(HTML(html))


for sentence in processed_syntactic:
    # The original printed the same value twice ("Sentence: ..." followed by
    # "Processed Sentence: ..." — `sentence` already iterates the processed
    # list), so the duplicate line is removed.
    print(f"Sentence: {sentence}")
    print("---")
    visualize_parse_tree(sentence)