{
"cells": [
{
"cell_type": "markdown",
"id": "dd72d1539056a64",
"metadata": {},
"source": [
"## Imports and test data\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "12579bf734bb1a92",
"metadata": {
"ExecuteTime": {
"end_time": "2025-11-23T13:53:57.753560Z",
"start_time": "2025-11-23T13:53:56.325948Z"
}
},
"outputs": [],
"source": [
"import spacy\n",
"from spacy import displacy\n",
"from IPython.display import display, HTML\n",
"import torch  # NOTE(review): torch is never used in this notebook — remove or use\n",
"\n",
"# Load the medium English pipeline once; every cell below reuses this `nlp`.\n",
"nlp = spacy.load(\"en_core_web_md\")  # Medium size model\n",
"\n",
"test_sentences = [\n",
"    \"The cat sat on the mat.\",\n",
"    \"On the mat, the cat was sitting.\",\n",
"    \"A completely different sentence about something else.\"\n",
"]"
]
},
{
"cell_type": "markdown",
"id": "1c26616777253f10",
"metadata": {},
"source": [
"Keep punctuation for direct-copy detection, but remove it for semantic/keyword-based methods."
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "e003ac06a58cfbb4",
"metadata": {
"ExecuteTime": {
"end_time": "2025-11-23T13:54:12.922343Z",
"start_time": "2025-11-23T13:54:12.896440Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Sentence: The cat sat on the mat.\n",
"Tokens: ['The', 'cat', 'sat', 'on', 'the', 'mat', '.']\n",
"---\n",
"Sentence: On the mat, the cat was sitting.\n",
"Tokens: ['On', 'the', 'mat', ',', 'the', 'cat', 'was', 'sitting', '.']\n",
"---\n",
"Sentence: A completely different sentence about something else.\n",
"Tokens: ['A', 'completely', 'different', 'sentence', 'about', 'something', 'else', '.']\n",
"---\n"
]
}
],
"source": [
"# Show spaCy's tokenization for each test sentence.\n",
"for sentence in test_sentences:\n",
"    tokens = [t.text for t in nlp(sentence)]\n",
"    print(f\"Sentence: {sentence}\")\n",
"    print(f\"Tokens: {tokens}\")\n",
"    print(\"---\")\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "5e488a878a5cfccb",
"metadata": {
"ExecuteTime": {
"end_time": "2025-11-23T13:55:23.734853Z",
"start_time": "2025-11-23T13:55:22.744266Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"--------------------------------------------------\n",
"Sentence: The cat sat on the mat.\n",
"--- Direct Sentence ---\n",
"the cat sat on the mat.\n",
"--- Semantic Sentence ---\n",
"cat sit mat\n",
"--- Syntactic Sentence ---\n",
"the cat sit on the mat .\n",
"--------------------------------------------------\n",
"Sentence: On the mat, the cat was sitting.\n",
"--- Direct Sentence ---\n",
"on the mat, the cat was sitting.\n",
"--- Semantic Sentence ---\n",
"mat cat sit\n",
"--- Syntactic Sentence ---\n",
"on the mat , the cat be sit .\n",
"--------------------------------------------------\n",
"Sentence: A completely different sentence about something else.\n",
"--- Direct Sentence ---\n",
"a completely different sentence about something else.\n",
"--- Semantic Sentence ---\n",
"completely different sentence\n",
"--- Syntactic Sentence ---\n",
"a completely different sentence about something else .\n",
"--------------------------------------------------\n"
]
}
],
"source": [
"\n",
"class TextPreprocessor:\n",
"    \"\"\"Produce three normalized views of a text for similarity comparison:\n",
"\n",
"    - direct:    lowercased verbatim text, punctuation kept (exact-copy detection)\n",
"    - semantic:  lowercase lemmas of content words only (no stopwords/punctuation)\n",
"    - syntactic: every token lemmatized, punctuation kept (structure comparison)\n",
"    \"\"\"\n",
"\n",
"    def __init__(self, model=None):\n",
"        # Reuse the pipeline loaded in the import cell instead of paying the\n",
"        # model-load cost a second time; pass `model` explicitly to override.\n",
"        self.nlp = model if model is not None else nlp\n",
"\n",
"    @staticmethod\n",
"    def direct_detection(text):\n",
"        \"\"\"For direct copy detection: lowercase + strip, punctuation kept.\"\"\"\n",
"        return text.lower().strip()\n",
"\n",
"    def semantic_analysis(self, text):\n",
"        \"\"\"Semantic similarity view: space-joined lowercase lemmas of content words.\"\"\"\n",
"        doc = self.nlp(text)\n",
"        # is_alpha already excludes punctuation, whitespace, and numbers;\n",
"        # additionally drop stopwords so only content words remain.\n",
"        return \" \".join(\n",
"            token.lemma_.lower()\n",
"            for token in doc\n",
"            if token.is_alpha and not token.is_stop\n",
"        )\n",
"\n",
"    def syntactic_analysis(self, text):\n",
"        \"\"\"Syntactic similarity view: punctuation kept verbatim, all words lemmatized.\"\"\"\n",
"        doc = self.nlp(text)\n",
"        processed_tokens = []\n",
"        for token in doc:\n",
"            if token.is_space:\n",
"                continue\n",
"            # Keep punctuation as-is; normalize every word (stopword or not)\n",
"            # to its lowercase lemma.\n",
"            processed_tokens.append(token.text if token.is_punct else token.lemma_.lower())\n",
"        return \" \".join(processed_tokens)\n",
"\n",
"\n",
"preprocessor = TextPreprocessor()\n",
"\n",
"processed_direct = []\n",
"processed_semantic = []\n",
"processed_syntactic = []\n",
"\n",
"for sentence in test_sentences:\n",
"    print(\"-\" * 50)\n",
"    print(f\"Sentence: {sentence}\")\n",
"    direct = preprocessor.direct_detection(sentence)\n",
"    processed_direct.append(direct)\n",
"    print(\"--- Direct Sentence ---\")\n",
"    print(direct)\n",
"    semantic = preprocessor.semantic_analysis(sentence)\n",
"    processed_semantic.append(semantic)\n",
"    print(\"--- Semantic Sentence ---\")\n",
"    print(semantic)\n",
"    syntactic = preprocessor.syntactic_analysis(sentence)\n",
"    processed_syntactic.append(syntactic)\n",
"    print(\"--- Syntactic Sentence ---\")\n",
"    print(syntactic)\n",
"\n",
"print(\"-\" * 50)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "83fc18c9de2e354",
"metadata": {
"ExecuteTime": {
"end_time": "2025-11-23T13:55:33.587912Z",
"start_time": "2025-11-23T13:55:33.565711Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Sentence: the cat sit on the mat .\n",
"\n",
"Dependenct Parse Tree:\n",
"--------------------------------------------------\n",
"the det cat []\n",
"cat nsubj sit ['the']\n",
"sit ROOT sit ['cat', 'on', '.']\n",
"on prep sit ['mat']\n",
"the det mat []\n",
"mat pobj on ['the']\n",
". punct sit []\n",
"\n",
"============================================================\n",
"\n",
"Sentence: on the mat , the cat be sit .\n",
"\n",
"Dependenct Parse Tree:\n",
"--------------------------------------------------\n",
"on prep sit ['mat']\n",
"the det mat []\n",
"mat pobj on ['the']\n",
", punct sit []\n",
"the det cat []\n",
"cat nsubj sit ['the']\n",
"be aux sit []\n",
"sit ROOT sit ['on', ',', 'cat', 'be', '.']\n",
". punct sit []\n",
"\n",
"============================================================\n",
"\n",
"Sentence: a completely different sentence about something else .\n",
"\n",
"Dependenct Parse Tree:\n",
"--------------------------------------------------\n",
"a det sentence []\n",
"completely advmod different []\n",
"different amod sentence ['completely']\n",
"sentence ROOT sentence ['a', 'different', 'about', '.']\n",
"about prep sentence ['something']\n",
"something pobj about ['else']\n",
"else advmod something []\n",
". punct sentence []\n",
"\n",
"============================================================\n",
"\n"
]
}
],
"source": [
"\n",
"def extract_parse_tree(text):\n",
"    \"\"\"Parse `text` with the global `nlp` pipeline and print each token's\n",
"    dependency label, head token, and immediate children; return the Doc.\"\"\"\n",
"    doc = nlp(text)\n",
"\n",
"    print(f\"Sentence: {text}\")\n",
"    print(\"\\nDependency Parse Tree:\")\n",
"    print(\"-\" * 50)\n",
"\n",
"    # Columns: token | dependency relation | head | children\n",
"    for token in doc:\n",
"        print(f\"{token.text:<12} {token.dep_:<12} {token.head.text:<12} {[child.text for child in token.children]}\")\n",
"\n",
"    return doc\n",
"\n",
"for sentence in processed_syntactic:\n",
"    doc = extract_parse_tree(sentence)\n",
"    print(\"\\n\" + \"=\"*60 + \"\\n\")"
]
},
{
"cell_type": "markdown",
"id": "5b5c8742d7c4c4c5",
"metadata": {},
"source": [
"**TODO:** Use NetworkX to represent the dependency parse trees as graphs."
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "e413238c1af12f62",
"metadata": {
"ExecuteTime": {
"end_time": "2025-11-23T13:56:21.733459Z",
"start_time": "2025-11-23T13:56:21.702279Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Sentence: the cat sit on the mat .\n",
"---\n",
"Processed Sentence: the cat sit on the mat .\n"
]
},
{
"data": {
"text/html": [
""
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Sentence: on the mat , the cat be sit .\n",
"---\n",
"Processed Sentence: on the mat , the cat be sit .\n"
]
},
{
"data": {
"text/html": [
""
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Sentence: a completely different sentence about something else .\n",
"---\n",
"Processed Sentence: a completely different sentence about something else .\n"
]
},
{
"data": {
"text/html": [
""
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"\n",
"def visualize_parse_tree(text):\n",
"    \"\"\"Render the dependency parse of `text` inline with displaCy.\"\"\"\n",
"    doc = nlp(text)\n",
"    html = displacy.render(doc, style=\"dep\", jupyter=False, options={\"distance\": 100})\n",
"    display(HTML(html))\n",
"\n",
"\n",
"for sentence in processed_syntactic:\n",
"    # `sentence` is already the syntactically normalized form from the cell above,\n",
"    # so print it once with the correct label (the original printed it twice).\n",
"    print(f\"Processed Sentence: {sentence}\")\n",
"    visualize_parse_tree(sentence)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6aff51eb71eb2238",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}