{
"cells": [
{
"cell_type": "markdown",
"id": "dd72d1539056a64",
"metadata": {},
"source": [
"## Imports and test data\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "12579bf734bb1a92",
"metadata": {
"ExecuteTime": {
"end_time": "2025-11-23T13:53:57.753560Z",
"start_time": "2025-11-23T13:53:56.325948Z"
}
},
"outputs": [],
"source": [
"import spacy\n",
"from spacy import displacy\n",
"from IPython.display import display, HTML\n",
"import torch  # NOTE(review): torch is never used in this notebook — remove or use\n",
"\n",
"# Load the medium English pipeline once; every cell below reuses this `nlp`.\n",
"nlp = spacy.load(\"en_core_web_md\")  # Medium size model\n",
"\n",
"test_sentences = [\n",
"    \"The cat sat on the mat.\",\n",
"    \"On the mat, the cat was sitting.\",\n",
"    \"A completely different sentence about something else.\"\n",
"]"
]
},
{
"cell_type": "markdown",
"id": "1c26616777253f10",
"metadata": {},
"source": [
"Keep punctuation for direct-copy detection, but remove it for semantic/keyword-based methods."
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "e003ac06a58cfbb4",
"metadata": {
"ExecuteTime": {
"end_time": "2025-11-23T13:54:12.922343Z",
"start_time": "2025-11-23T13:54:12.896440Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Sentence: The cat sat on the mat.\n",
"Tokens: ['The', 'cat', 'sat', 'on', 'the', 'mat', '.']\n",
"---\n",
"Sentence: On the mat, the cat was sitting.\n",
"Tokens: ['On', 'the', 'mat', ',', 'the', 'cat', 'was', 'sitting', '.']\n",
"---\n",
"Sentence: A completely different sentence about something else.\n",
"Tokens: ['A', 'completely', 'different', 'sentence', 'about', 'something', 'else', '.']\n",
"---\n"
]
}
],
"source": [
"# Show spaCy's tokenization for each test sentence.\n",
"for sentence in test_sentences:\n",
"    tokens = [t.text for t in nlp(sentence)]\n",
"    print(f\"Sentence: {sentence}\")\n",
"    print(f\"Tokens: {tokens}\")\n",
"    print(\"---\")\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "5e488a878a5cfccb",
"metadata": {
"ExecuteTime": {
"end_time": "2025-11-23T13:55:23.734853Z",
"start_time": "2025-11-23T13:55:22.744266Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"--------------------------------------------------\n",
"Sentence: The cat sat on the mat.\n",
"--- Direct Sentence ---\n",
"the cat sat on the mat.\n",
"--- Semantic Sentence ---\n",
"cat sit mat\n",
"--- Syntactic Sentence ---\n",
"the cat sit on the mat .\n",
"--------------------------------------------------\n",
"Sentence: On the mat, the cat was sitting.\n",
"--- Direct Sentence ---\n",
"on the mat, the cat was sitting.\n",
"--- Semantic Sentence ---\n",
"mat cat sit\n",
"--- Syntactic Sentence ---\n",
"on the mat , the cat be sit .\n",
"--------------------------------------------------\n",
"Sentence: A completely different sentence about something else.\n",
"--- Direct Sentence ---\n",
"a completely different sentence about something else.\n",
"--- Semantic Sentence ---\n",
"completely different sentence\n",
"--- Syntactic Sentence ---\n",
"a completely different sentence about something else .\n",
"--------------------------------------------------\n"
]
}
],
"source": [
"\n",
"class TextPreprocessor:\n",
"    \"\"\"Produce three normalized views of a text for similarity comparison:\n",
"\n",
"    - direct:    lowercased verbatim text, punctuation kept (exact-copy detection)\n",
"    - semantic:  lowercase lemmas of content words only (no stopwords/punctuation)\n",
"    - syntactic: every token lemmatized, punctuation kept (structure comparison)\n",
"    \"\"\"\n",
"\n",
"    def __init__(self, model=None):\n",
"        # Reuse the pipeline loaded in the import cell instead of paying the\n",
"        # model-load cost a second time; pass `model` explicitly to override.\n",
"        self.nlp = model if model is not None else nlp\n",
"\n",
"    @staticmethod\n",
"    def direct_detection(text):\n",
"        \"\"\"For direct copy detection: lowercase + strip, punctuation kept.\"\"\"\n",
"        return text.lower().strip()\n",
"\n",
"    def semantic_analysis(self, text):\n",
"        \"\"\"Semantic similarity view: space-joined lowercase lemmas of content words.\"\"\"\n",
"        doc = self.nlp(text)\n",
"        # is_alpha already excludes punctuation, whitespace, and numbers;\n",
"        # additionally drop stopwords so only content words remain.\n",
"        return \" \".join(\n",
"            token.lemma_.lower()\n",
"            for token in doc\n",
"            if token.is_alpha and not token.is_stop\n",
"        )\n",
"\n",
"    def syntactic_analysis(self, text):\n",
"        \"\"\"Syntactic similarity view: punctuation kept verbatim, all words lemmatized.\"\"\"\n",
"        doc = self.nlp(text)\n",
"        processed_tokens = []\n",
"        for token in doc:\n",
"            if token.is_space:\n",
"                continue\n",
"            # Keep punctuation as-is; normalize every word (stopword or not)\n",
"            # to its lowercase lemma.\n",
"            processed_tokens.append(token.text if token.is_punct else token.lemma_.lower())\n",
"        return \" \".join(processed_tokens)\n",
"\n",
"\n",
"preprocessor = TextPreprocessor()\n",
"\n",
"processed_direct = []\n",
"processed_semantic = []\n",
"processed_syntactic = []\n",
"\n",
"for sentence in test_sentences:\n",
"    print(\"-\" * 50)\n",
"    print(f\"Sentence: {sentence}\")\n",
"    direct = preprocessor.direct_detection(sentence)\n",
"    processed_direct.append(direct)\n",
"    print(\"--- Direct Sentence ---\")\n",
"    print(direct)\n",
"    semantic = preprocessor.semantic_analysis(sentence)\n",
"    processed_semantic.append(semantic)\n",
"    print(\"--- Semantic Sentence ---\")\n",
"    print(semantic)\n",
"    syntactic = preprocessor.syntactic_analysis(sentence)\n",
"    processed_syntactic.append(syntactic)\n",
"    print(\"--- Syntactic Sentence ---\")\n",
"    print(syntactic)\n",
"\n",
"print(\"-\" * 50)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "83fc18c9de2e354",
"metadata": {
"ExecuteTime": {
"end_time": "2025-11-23T13:55:33.587912Z",
"start_time": "2025-11-23T13:55:33.565711Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Sentence: the cat sit on the mat .\n",
"\n",
"Dependenct Parse Tree:\n",
"--------------------------------------------------\n",
"the det cat []\n",
"cat nsubj sit ['the']\n",
"sit ROOT sit ['cat', 'on', '.']\n",
"on prep sit ['mat']\n",
"the det mat []\n",
"mat pobj on ['the']\n",
". punct sit []\n",
"\n",
"============================================================\n",
"\n",
"Sentence: on the mat , the cat be sit .\n",
"\n",
"Dependenct Parse Tree:\n",
"--------------------------------------------------\n",
"on prep sit ['mat']\n",
"the det mat []\n",
"mat pobj on ['the']\n",
", punct sit []\n",
"the det cat []\n",
"cat nsubj sit ['the']\n",
"be aux sit []\n",
"sit ROOT sit ['on', ',', 'cat', 'be', '.']\n",
". punct sit []\n",
"\n",
"============================================================\n",
"\n",
"Sentence: a completely different sentence about something else .\n",
"\n",
"Dependenct Parse Tree:\n",
"--------------------------------------------------\n",
"a det sentence []\n",
"completely advmod different []\n",
"different amod sentence ['completely']\n",
"sentence ROOT sentence ['a', 'different', 'about', '.']\n",
"about prep sentence ['something']\n",
"something pobj about ['else']\n",
"else advmod something []\n",
". punct sentence []\n",
"\n",
"============================================================\n",
"\n"
]
}
],
"source": [
"\n",
"def extract_parse_tree(text):\n",
"    \"\"\"Parse `text` with the global `nlp` pipeline and print each token's\n",
"    dependency label, head token, and immediate children; return the Doc.\"\"\"\n",
"    doc = nlp(text)\n",
"\n",
"    print(f\"Sentence: {text}\")\n",
"    print(\"\\nDependency Parse Tree:\")\n",
"    print(\"-\" * 50)\n",
"\n",
"    # Columns: token | dependency relation | head | children\n",
"    for token in doc:\n",
"        print(f\"{token.text:<12} {token.dep_:<12} {token.head.text:<12} {[child.text for child in token.children]}\")\n",
"\n",
"    return doc\n",
"\n",
"for sentence in processed_syntactic:\n",
"    doc = extract_parse_tree(sentence)\n",
"    print(\"\\n\" + \"=\"*60 + \"\\n\")"
]
},
{
"cell_type": "markdown",
"id": "5b5c8742d7c4c4c5",
"metadata": {},
"source": [
"**TODO:** Use NetworkX to represent the dependency parse trees as graphs."
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "e413238c1af12f62",
"metadata": {
"ExecuteTime": {
"end_time": "2025-11-23T13:56:21.733459Z",
"start_time": "2025-11-23T13:56:21.702279Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Sentence: the cat sit on the mat .\n",
"---\n",
"Processed Sentence: the cat sit on the mat .\n"
]
},
{
"data": {
"text/html": [
""
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Sentence: on the mat , the cat be sit .\n",
"---\n",
"Processed Sentence: on the mat , the cat be sit .\n"
]
},
{
"data": {
"text/html": [
""
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Sentence: a completely different sentence about something else .\n",
"---\n",
"Processed Sentence: a completely different sentence about something else .\n"
]
},
{
"data": {
"text/html": [
""
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"\n",
"def visualize_parse_tree(text):\n",
"    \"\"\"Render the dependency parse of `text` inline with displaCy.\"\"\"\n",
"    doc = nlp(text)\n",
"    html = displacy.render(doc, style=\"dep\", jupyter=False, options={\"distance\": 100})\n",
"    display(HTML(html))\n",
"\n",
"\n",
"for sentence in processed_syntactic:\n",
"    # `sentence` is already the syntactically normalized form from the cell above,\n",
"    # so print it once with the correct label (the original printed it twice).\n",
"    print(f\"Processed Sentence: {sentence}\")\n",
"    visualize_parse_tree(sentence)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6aff51eb71eb2238",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}