From fb68bc869afc0ccd6d171782fadb913e5d30f8bc Mon Sep 17 00:00:00 2001 From: Henry Dowd Date: Sun, 23 Nov 2025 14:48:13 +0000 Subject: [PATCH] add jupyter to requirments --- notebooks/01_data_exploration.ipynb | 453 ++++++---------------------- requirments.txt | 3 +- 2 files changed, 90 insertions(+), 366 deletions(-) diff --git a/notebooks/01_data_exploration.ipynb b/notebooks/01_data_exploration.ipynb index ddf0a9a..95221c3 100644 --- a/notebooks/01_data_exploration.ipynb +++ b/notebooks/01_data_exploration.ipynb @@ -1,19 +1,22 @@ { "cells": [ { - "metadata": {}, "cell_type": "markdown", - "source": "Import Cell\n", - "id": "dd72d1539056a64" + "id": "dd72d1539056a64", + "metadata": {}, + "source": [ + "Import Cell\n" + ] }, { + "cell_type": "code", + "id": "12579bf734bb1a92", "metadata": { "ExecuteTime": { - "end_time": "2025-11-21T17:01:35.974978Z", - "start_time": "2025-11-21T17:01:34.412508Z" + "end_time": "2025-11-23T13:53:57.753560Z", + "start_time": "2025-11-23T13:53:56.325948Z" } }, - "cell_type": "code", "source": [ "import token\n", "import spacy\n", @@ -28,24 +31,26 @@ " \"A completely different sentence about something else.\"\n", "]" ], - "id": "12579bf734bb1a92", "outputs": [], - "execution_count": 21 + "execution_count": 1 }, { - "metadata": {}, "cell_type": "markdown", - "source": "Keep punctuation for direct copy detection but remove for semantic/keyword based methods", - "id": "1c26616777253f10" + "id": "1c26616777253f10", + "metadata": {}, + "source": [ + "Keep punctuation for direct copy detection but remove for semantic/keyword based methods" + ] }, { + "cell_type": "code", + "id": "e003ac06a58cfbb4", "metadata": { "ExecuteTime": { - "end_time": "2025-11-20T19:03:29.658876Z", - "start_time": "2025-11-20T19:03:27.809309Z" + "end_time": "2025-11-23T13:54:12.922343Z", + "start_time": "2025-11-23T13:54:12.896440Z" } }, - "cell_type": "code", "source": [ "\n", "for sent in test_sentences:\n", @@ -54,7 +59,6 @@ " print(f\"Tokens: {[token.text for token in doc]}\")\n", " print(\"---\")\n" ], - "id": "e003ac06a58cfbb4", "outputs": [ { "name": "stdout", @@ -72,16 +76,17 @@ ] } ], - "execution_count": 17 + "execution_count": 2 }, { + "cell_type": "code", + "id": "5e488a878a5cfccb", "metadata": { "ExecuteTime": { - "end_time": "2025-11-21T23:26:23.172796Z", - "start_time": "2025-11-21T23:26:21.541542Z" + "end_time": "2025-11-23T13:55:23.734853Z", + "start_time": "2025-11-23T13:55:22.744266Z" } }, - "cell_type": "code", "source": [ "\n", "class TextPreprocessor:\n", @@ -153,7 +158,6 @@ "# print(f\"Preprocessed Sentence: {preprocessor.syntactic_analysis(sent)}\")\n", "# print(\"-\" * 50)" ], - "id": "5e488a878a5cfccb", "outputs": [ { "name": "stdout", @@ -187,16 +191,17 @@ ] } ], - "execution_count": 37 + "execution_count": 3 }, { + "cell_type": "code", + "id": "83fc18c9de2e354", "metadata": { "ExecuteTime": { - "end_time": "2025-11-21T23:39:48.443022Z", - "start_time": "2025-11-21T23:39:48.411766Z" + "end_time": "2025-11-23T13:55:33.587912Z", + "start_time": "2025-11-23T13:55:33.565711Z" } }, - "cell_type": "code", "source": [ "\n", "def extract_parse_tree(text):\n", @@ -215,7 +220,6 @@ " doc = extract_parse_tree(sentence)\n", " print(\"\\n\" + \"=\"*60 + \"\\n\")" ], - "id": "83fc18c9de2e354", "outputs": [ { "name": "stdout", @@ -269,22 +273,25 @@ ] } ], - "execution_count": 39 + "execution_count": 4 }, { - "metadata": {}, "cell_type": "markdown", - "source": "***USE NetworkX", - "id": "5b5c8742d7c4c4c5" + "id": "5b5c8742d7c4c4c5", + "metadata": {}, + "source": [ + "***USE NetworkX" + ] }, { + "cell_type": "code", + "id": "e413238c1af12f62", "metadata": { "ExecuteTime": { - "end_time": "2025-11-21T18:20:09.575176Z", - "start_time": "2025-11-21T18:20:09.465504Z" + "end_time": "2025-11-23T13:56:21.733459Z", + "start_time": "2025-11-23T13:56:21.702279Z" } }, - "cell_type": "code", "source": [ "\n", "\n", @@ -295,21 +302,18 @@ "\n", "\n", "\n", - "for sentence in test_sentences:\n", + "for sentence in processed_syntactic:\n", " print(f\"Sentence: {sentence}\")\n", " print(\"---\")\n", - " processed_sentence = preprocessor.syntactic_analysis(sentence)\n", - " print(f\"Processed Sentence: \" + processed_sentence)\n", - " visualize_parse_tree(processed_sentence)\n", + " print(f\"Processed Sentence: \" + sentence)\n", " visualize_parse_tree(sentence)" ], - "id": "e413238c1af12f62", "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Sentence: The cat sat on the mat.\n", + "Sentence: the cat sit on the mat .\n", "---\n", "Processed Sentence: the cat sit on the mat .\n" ] @@ -320,7 +324,7 @@ "" ], "text/html": [ - "\n", + "\n", "\n", " the\n", " DET\n", @@ -352,126 +356,41 @@ "\n", "\n", "\n", - " \n", + " \n", " \n", - " det\n", + " det\n", " \n", " \n", "\n", "\n", "\n", - " \n", + " \n", " \n", - " nsubj\n", + " nsubj\n", " \n", " \n", "\n", "\n", "\n", - " \n", + " \n", " \n", - " prep\n", + " prep\n", " \n", " \n", "\n", "\n", "\n", - " \n", + " \n", " \n", - " det\n", + " det\n", " \n", " \n", "\n", "\n", "\n", - " \n", + " \n", " \n", - " pobj\n", - " \n", - " \n", - "\n", - "" - ] - }, - "metadata": {}, - "output_type": "display_data", - "jetTransient": { - "display_id": null - } - }, - { - "data": { - "text/plain": [ - "" - ], - "text/html": [ - "\n", - "\n", - " The\n", - " DET\n", - "\n", - "\n", - "\n", - " cat\n", - " NOUN\n", - "\n", - "\n", - "\n", - " sat\n", - " VERB\n", - "\n", - "\n", - "\n", - " on\n", - " ADP\n", - "\n", - "\n", - "\n", - " the\n", - " DET\n", - "\n", - "\n", - "\n", - " mat.\n", - " NOUN\n", - "\n", - "\n", - "\n", - " \n", - " \n", - " det\n", - " \n", - " \n", - "\n", - "\n", - "\n", - " \n", - " \n", - " nsubj\n", - " \n", - " \n", - "\n", - "\n", - "\n", - " \n", - " \n", - " prep\n", - " \n", - " \n", - "\n", - "\n", - "\n", - " \n", - " \n", - " det\n", - " \n", - " \n", - "\n", - "\n", - "\n", - " \n", - " \n", - " pobj\n", + " pobj\n", " \n", " \n", "\n", @@ -488,7 +407,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Sentence: On the mat, the cat was sitting.\n", + "Sentence: on the mat , the cat be sit .\n", "---\n", "Processed Sentence: on the mat , the cat be sit .\n" ] @@ -499,7 +418,7 @@ "" ], "text/html": [ - "\n", + "\n", "\n", " on\n", " ADP\n", @@ -536,147 +455,49 @@ "\n", "\n", "\n", - " \n", + " \n", " \n", - " prep\n", + " prep\n", " \n", " \n", "\n", "\n", "\n", - " \n", + " \n", " \n", - " det\n", + " det\n", " \n", " \n", "\n", "\n", "\n", - " \n", + " \n", " \n", - " pobj\n", + " pobj\n", " \n", " \n", "\n", "\n", "\n", - " \n", + " \n", " \n", - " det\n", + " det\n", " \n", " \n", "\n", "\n", "\n", - " \n", + " \n", " \n", - " nsubj\n", + " nsubj\n", " \n", " \n", "\n", "\n", "\n", - " \n", + " \n", " \n", - " aux\n", - " \n", - " \n", - "\n", - "" - ] - }, - "metadata": {}, - "output_type": "display_data", - "jetTransient": { - "display_id": null - } - }, - { - "data": { - "text/plain": [ - "" - ], - "text/html": [ - "\n", - "\n", - " On\n", - " ADP\n", - "\n", - "\n", - "\n", - " the\n", - " DET\n", - "\n", - "\n", - "\n", - " mat,\n", - " NOUN\n", - "\n", - "\n", - "\n", - " the\n", - " DET\n", - "\n", - "\n", - "\n", - " cat\n", - " NOUN\n", - "\n", - "\n", - "\n", - " was\n", - " AUX\n", - "\n", - "\n", - "\n", - " sitting.\n", - " VERB\n", - "\n", - "\n", - "\n", - " \n", - " \n", - " prep\n", - " \n", - " \n", - "\n", - "\n", - "\n", - " \n", - " \n", - " det\n", - " \n", - " \n", - "\n", - "\n", - "\n", - " \n", - " \n", - " pobj\n", - " \n", - " \n", - "\n", - "\n", - "\n", - " \n", - " \n", - " det\n", - " \n", - " \n", - "\n", - "\n", - "\n", - " \n", - " \n", - " nsubj\n", - " \n", - " \n", - "\n", - "\n", - "\n", - " \n", - " \n", - " aux\n", + " aux\n", " \n", " \n", "\n", @@ -693,7 +514,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Sentence: A completely different sentence about something else.\n", + "Sentence: a completely different sentence about something else .\n", "---\n", "Processed Sentence: a completely different sentence about something else .\n" ] @@ -704,7 +525,7 @@ "" ], "text/html": [ - "\n", + "\n", "\n", " a\n", " DET\n", @@ -741,147 +562,49 @@ "\n", "\n", "\n", - " \n", + " \n", " \n", - " det\n", + " det\n", " \n", " \n", "\n", "\n", "\n", - " \n", + " \n", " \n", - " advmod\n", + " advmod\n", " \n", " \n", "\n", "\n", "\n", - " \n", + " \n", " \n", - " amod\n", + " amod\n", " \n", " \n", "\n", "\n", "\n", - " \n", + " \n", " \n", - " prep\n", + " prep\n", " \n", " \n", "\n", "\n", "\n", - " \n", + " \n", " \n", - " pobj\n", + " pobj\n", " \n", " \n", "\n", "\n", "\n", - " \n", + " \n", " \n", - " advmod\n", - " \n", - " \n", - "\n", - "" - ] - }, - "metadata": {}, - "output_type": "display_data", - "jetTransient": { - "display_id": null - } - }, - { - "data": { - "text/plain": [ - "" - ], - "text/html": [ - "\n", - "\n", - " A\n", - " DET\n", - "\n", - "\n", - "\n", - " completely\n", - " ADV\n", - "\n", - "\n", - "\n", - " different\n", - " ADJ\n", - "\n", - "\n", - "\n", - " sentence\n", - " NOUN\n", - "\n", - "\n", - "\n", - " about\n", - " ADP\n", - "\n", - "\n", - "\n", - " something\n", - " PRON\n", - "\n", - "\n", - "\n", - " else.\n", - " ADV\n", - "\n", - "\n", - "\n", - " \n", - " \n", - " det\n", - " \n", - " \n", - "\n", - "\n", - "\n", - " \n", - " \n", - " advmod\n", - " \n", - " \n", - "\n", - "\n", - "\n", - " \n", - " \n", - " amod\n", - " \n", - " \n", - "\n", - "\n", - "\n", - " \n", - " \n", - " prep\n", - " \n", - " \n", - "\n", - "\n", - "\n", - " \n", - " \n", - " pobj\n", - " \n", - " \n", - "\n", - "\n", - "\n", - " \n", - " \n", - " advmod\n", + " advmod\n", " \n", " \n", "\n", @@ -895,22 +618,22 @@ } } ], - "execution_count": 32 + "execution_count": 6 }, { - "metadata": {}, "cell_type": "code", - "outputs": [], "execution_count": null, - "source": "", - "id": "6aff51eb71eb2238" + "id": "6aff51eb71eb2238", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { "kernelspec": { - "name": "python3", + "display_name": "Python 3 (ipykernel)", "language": "python", - "display_name": "Python 3 (ipykernel)" + "name": "python3" } }, "nbformat": 4, diff --git a/requirments.txt b/requirments.txt index 0005b3d..4691ee1 100644 --- a/requirments.txt +++ b/requirments.txt @@ -5,4 +5,5 @@ numpy scikit-learn spacy matplotlib -seaborn \ No newline at end of file +seaborn +jupyter \ No newline at end of file