Compare commits

..

10 Commits

Author SHA1 Message Date
Henry Dowd
6223c1c7db Final_Evaluation notebook and model 2026-03-11 02:40:20 +00:00
Henry Dowd
52e8e2ec50 Updated TED method 2026-02-27 16:00:10 +00:00
Henry Dowd
3d755fca3e Ran files 2026-02-27 04:56:34 +00:00
Henry Dowd
84ee1dc246 Semantic methods and begginin fusion of methods to create model 2026-02-27 01:33:32 +00:00
Henry Dowd
722de6c575 Interim Report 2026-02-26 19:51:21 +00:00
Henry Dowd
0a8da7dc07 notebook1 2025-12-14 14:51:25 +00:00
Henry Dowd
726c6b276b notebook2 2025-12-10 19:43:28 +00:00
Henry Dowd
a370b68cb1 remove test file 2025-12-02 17:54:54 +00:00
Henry Dowd
ed0b34a8bf test file for git 2025-12-02 17:53:59 +00:00
Henry Dowd
828c49e9b7 Mode advanced data exploration and notebook htmls 2025-11-30 21:50:08 +00:00
9 changed files with 22559 additions and 574 deletions

BIN
22535753_Dowd.pdf Normal file

Binary file not shown.

File diff suppressed because one or more lines are too long

View File

@@ -2,7 +2,7 @@
"cells": [ "cells": [
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 1, "execution_count": null,
"id": "d2aa2997", "id": "d2aa2997",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@@ -56,7 +56,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 58, "execution_count": 3,
"id": "e60d024e969254a", "id": "e60d024e969254a",
"metadata": { "metadata": {
"ExecuteTime": { "ExecuteTime": {
@@ -121,7 +121,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 57, "execution_count": 4,
"id": "0b68fdcd", "id": "0b68fdcd",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@@ -194,7 +194,15 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 38, "execution_count": null,
"id": "9e20739a",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 5,
"id": "46a985b4", "id": "46a985b4",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@@ -269,7 +277,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 5, "execution_count": 6,
"id": "7dc7ac2e", "id": "7dc7ac2e",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@@ -347,7 +355,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 37, "execution_count": 7,
"id": "e6a4d4e2", "id": "e6a4d4e2",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@@ -416,7 +424,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 8,
"id": "493979a4", "id": "493979a4",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@@ -445,7 +453,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 25, "execution_count": 9,
"id": "b3d07562", "id": "b3d07562",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@@ -648,6 +656,223 @@
"results, methods = evaluate_baseline_methods(test_pairs)\n", "results, methods = evaluate_baseline_methods(test_pairs)\n",
"print_comparison_table(results, test_pairs)" "print_comparison_table(results, test_pairs)"
] ]
},
{
"cell_type": "markdown",
"id": "06db7e83",
"metadata": {},
"source": [
"### Tree edit Distance analysis"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c62457eb",
"metadata": {},
"outputs": [],
"source": [
"import spacy\n",
"\n",
"# Load spaCy model for dependency parsing\n",
"nlp = spacy.load(\"en_core_web_lg\")\n",
"print(\"Loaded spaCy model for Tree Edit Distance\")"
]
},
{
"cell_type": "markdown",
"id": "c5905e3b",
"metadata": {},
"source": [
"##### Build trees"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2aea0b77",
"metadata": {},
"outputs": [],
"source": [
"def build_tree_from_dependencies(text):\n",
" \"\"\"Build hierarchical tree structure from dependency parse.\"\"\"\n",
" doc = nlp(text)\n",
" \n",
" # Create node dictionary with parent-child relationships\n",
" tree = {}\n",
" root_token = None\n",
" \n",
" for token in doc:\n",
" tree[token.i] = {\n",
" 'text': token.text,\n",
" 'lemma': token.lemma_,\n",
" 'pos': token.pos_,\n",
" 'dep': token.dep_,\n",
" 'head_id': token.head.i,\n",
" 'children': []\n",
" }\n",
" \n",
" # Build parent-child relationships\n",
" for token in doc:\n",
" if token.head.i != token.i: # Not root\n",
" tree[token.head.i]['children'].append(token.i)\n",
" else: # Root node\n",
" root_token = token.i\n",
" \n",
" return tree, root_token\n",
"\n",
"print(\"Tree building function loaded\")"
]
},
{
"cell_type": "markdown",
"id": "b45d5916",
"metadata": {},
"source": [
"##### Helper Functions"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a5e5fa5a",
"metadata": {},
"outputs": [],
"source": [
"def get_nodes_postorder(node_id, tree):\n",
" \"\"\"Get all nodes in postorder traversal (children before parents).\"\"\"\n",
" nodes = []\n",
" if node_id in tree:\n",
" for child_id in tree[node_id]['children']:\n",
" nodes.extend(get_nodes_postorder(child_id, tree))\n",
" nodes.append(node_id)\n",
" return nodes\n",
"\n",
"def node_label(node_id, tree):\n",
" \"\"\"Get label for a node (POS tag + dependency relation).\"\"\"\n",
" if node_id in tree:\n",
" return (tree[node_id]['pos'], tree[node_id]['dep'])\n",
" return ('', '')\n",
"\n",
"def nodes_match(node_id1, tree1, node_id2, tree2):\n",
" \"\"\"Check if two nodes have matching labels.\"\"\"\n",
" label1 = node_label(node_id1, tree1)\n",
" label2 = node_label(node_id2, tree2)\n",
" return label1 == label2\n",
"\n",
"print(\"Helper functions loaded\")"
]
},
{
"cell_type": "markdown",
"id": "5176cf24",
"metadata": {},
"source": [
"##### Tree edit distance function"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3ab8c40c",
"metadata": {},
"outputs": [],
"source": [
"def tree_edit_distance_zhang_shasha(tree1, tree2, root1, root2):\n",
" \"\"\"\n",
" Full Tree Edit Distance using Zhang-Shasha algorithm.\n",
" Preserves hierarchical structure and node ordering.\n",
" Returns: Normalized similarity score (0-1)\n",
" \"\"\"\n",
" \n",
" # get postorder traversals\n",
" nodes1 = get_nodes_postorder(root1, tree1)\n",
" nodes2 = get_nodes_postorder(root2, tree2)\n",
" \n",
" m, n = len(nodes1), len(nodes2)\n",
" \n",
" # DP table\n",
" dp = [[0] * (n + 1) for _ in range(m + 1)]\n",
" \n",
" # Base cases\n",
" for i in range(m + 1):\n",
" dp[i][0] = i\n",
" for j in range(n + 1):\n",
" dp[0][j] = j\n",
" \n",
" # Fill DP table\n",
" for i in range(1, m + 1):\n",
" for j in range(1, n + 1):\n",
" node1 = nodes1[i - 1]\n",
" node2 = nodes2[j - 1]\n",
" \n",
" if nodes_match(node1, tree1, node2, tree2):\n",
" cost = dp[i - 1][j - 1]\n",
" else:\n",
" delete_cost = dp[i - 1][j] + 1\n",
" insert_cost = dp[i][j - 1] + 1\n",
" replace_cost = dp[i - 1][j - 1] + 1\n",
" cost = min(delete_cost, insert_cost, replace_cost)\n",
" \n",
" dp[i][j] = cost\n",
" \n",
" # convert to similarity\n",
" max_size = max(m, n)\n",
" if max_size == 0:\n",
" return 1.0\n",
" \n",
" edit_distance = dp[m][n]\n",
" similarity = 1.0 - (edit_distance / max_size)\n",
" \n",
" return max(0.0, similarity)\n",
"\n",
"def tree_edit_distance_similarity(sent1, sent2):\n",
" \"\"\"Wrapper function for Tree Edit Distance similarity.\"\"\"\n",
" if not sent1.strip() or not sent2.strip():\n",
" return 0.0\n",
" \n",
" try:\n",
" tree1, root1 = build_tree_from_dependencies(sent1)\n",
" tree2, root2 = build_tree_from_dependencies(sent2)\n",
" return tree_edit_distance_zhang_shasha(tree1, tree2, root1, root2)\n",
" except:\n",
" return 0.0\n",
"\n",
"print(\"Tree Edit Distance algorithm loaded\")"
]
},
{
"cell_type": "markdown",
"id": "3bfc1dcd",
"metadata": {},
"source": [
"##### Standalone running TED"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "80f72ef3",
"metadata": {},
"outputs": [],
"source": [
"print(\"Tree Edit Distance - Standalone Tests\")\n",
"print(\"=\" * 100)\n",
"\n",
"test_cases = [\n",
" (\"The cat sat on the mat.\", \"The cat sat on the mat.\"), # Exact copy\n",
" (\"The cat sat on the mat.\", \"On the mat, the cat was sitting.\"), # Structural change\n",
" (\"The cat sat on the mat.\", \"The feline rested on the rug.\"), # Synonym replacement\n",
" (\"The cat sat on the mat.\", \"The dog ran in the park.\"), # Different content\n",
"]\n",
"\n",
"for sent1, sent2 in test_cases:\n",
" sim = tree_edit_distance_similarity(sent1, sent2)\n",
" print(f\"\\n{sent1:<45} vs\")\n",
" print(f\"{sent2:<45}\")\n",
" print(f\"TED Similarity: {sim:.3f}\")\n",
" print(\"-\" * 100)"
]
} }
], ],
"metadata": { "metadata": {
@@ -666,7 +891,7 @@
"name": "python", "name": "python",
"nbconvert_exporter": "python", "nbconvert_exporter": "python",
"pygments_lexer": "ipython3", "pygments_lexer": "ipython3",
"version": "3.13.7" "version": "3.14.2"
} }
}, },
"nbformat": 4, "nbformat": 4,

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@@ -2,6 +2,10 @@ pandas
numpy numpy
scikit-learn scikit-learn
matplotlib matplotlib
sentence-transformers
rapidfuzz
statsmodels
spacy spacy
tensor tensor