Re-ran notebooks; renamed `auc` to `auc_score` and added a fusion-model summary cell

This commit is contained in:
Henry Dowd
2026-02-27 04:56:34 +00:00
parent 84ee1dc246
commit 3d755fca3e
4 changed files with 829 additions and 769 deletions

File diff suppressed because one or more lines are too long

View File

@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"execution_count": null,
"id": "d2aa2997",
"metadata": {},
"outputs": [
@@ -56,7 +56,7 @@
},
{
"cell_type": "code",
"execution_count": 58,
"execution_count": 3,
"id": "e60d024e969254a",
"metadata": {
"ExecuteTime": {
@@ -121,7 +121,7 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 4,
"id": "0b68fdcd",
"metadata": {},
"outputs": [
@@ -130,18 +130,19 @@
"output_type": "stream",
"text": [
"Sentence 1 Sentence 2: Similarities -> Char: Word:\n",
"====================================================================================================\n"
]
},
{
"ename": "NameError",
"evalue": "name 'test_pairs' is not defined",
"output_type": "error",
"traceback": [
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
"\u001b[31mNameError\u001b[39m Traceback (most recent call last)",
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[1]\u001b[39m\u001b[32m, line 31\u001b[39m\n\u001b[32m 28\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[33m'\u001b[39m\u001b[33mSentence 1\u001b[39m\u001b[33m'\u001b[39m\u001b[38;5;132;01m:\u001b[39;00m\u001b[33m<41\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[33m'\u001b[39m\u001b[33mSentence 2:\u001b[39m\u001b[33m'\u001b[39m\u001b[38;5;132;01m:\u001b[39;00m\u001b[33m<20\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[33m'\u001b[39m\u001b[33mSimilarities ->\u001b[39m\u001b[33m'\u001b[39m\u001b[38;5;132;01m:\u001b[39;00m\u001b[33m<19\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[33m'\u001b[39m\u001b[33mChar:\u001b[39m\u001b[33m'\u001b[39m\u001b[38;5;132;01m:\u001b[39;00m\u001b[33m<10\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[33m'\u001b[39m\u001b[33mWord:\u001b[39m\u001b[33m'\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m)\n\u001b[32m 29\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33m\"\u001b[39m\u001b[33m=\u001b[39m\u001b[33m\"\u001b[39m * \u001b[32m100\u001b[39m)\n\u001b[32m---> \u001b[39m\u001b[32m31\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m sent1, sent2 \u001b[38;5;129;01min\u001b[39;00m \u001b[43mtest_pairs\u001b[49m:\n\u001b[32m 32\u001b[39m char_similarity = char_levenshtein_similarity(sent1, sent2)\n\u001b[32m 33\u001b[39m word_similarity = word_levenshtein_similarity(sent1, sent2)\n",
"\u001b[31mNameError\u001b[39m: name 'test_pairs' is not defined"
"====================================================================================================\n",
"The cat sat on the mat. vs The cat sat on the mat. 1.000 1.000 \n",
"The cat sat on the mat. vs The cat sat on the mat 0.957 0.833 \n",
"The cat sat on the mat. vs The cat sat on the mat. 0.821 1.000 \n",
"The cat sat on the mat. vs On the mat, the cat was sitting. 0.344 0.143 \n",
"The cat sat on the mat. vs The feline rested on the rug. 0.517 0.500 \n",
"The quick brown fox jumps. vs A fast brown fox leaps. 0.577 0.400 \n",
"The cat sat on the mat. vs The dog ran in the park. 0.625 0.333 \n",
"I love programming. vs She enjoys reading books. 0.200 0.000 \n",
"The weather is nice today. vs It's raining outside. 0.192 0.000 \n",
"Short. vs Short. 1.000 1.000 \n",
"A B C D E F G vs A B C D E F G 1.000 1.000 \n",
" vs 1.000 1.000 \n"
]
}
],
@@ -193,7 +194,15 @@
},
{
"cell_type": "code",
"execution_count": 38,
"execution_count": null,
"id": "9e20739a",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 5,
"id": "46a985b4",
"metadata": {},
"outputs": [
@@ -268,7 +277,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 6,
"id": "7dc7ac2e",
"metadata": {},
"outputs": [
@@ -346,7 +355,7 @@
},
{
"cell_type": "code",
"execution_count": 37,
"execution_count": 7,
"id": "e6a4d4e2",
"metadata": {},
"outputs": [
@@ -415,7 +424,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 8,
"id": "493979a4",
"metadata": {},
"outputs": [],
@@ -444,7 +453,7 @@
},
{
"cell_type": "code",
"execution_count": 25,
"execution_count": 9,
"id": "b3d07562",
"metadata": {},
"outputs": [
@@ -665,7 +674,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.7"
"version": "3.13.12"
}
},
"nbformat": 4,

View File

@@ -1006,7 +1006,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.12"
"version": "3.14.2"
}
},
"nbformat": 4,

View File

@@ -13,7 +13,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": null,
"id": "1c45d83192facfc6",
"metadata": {},
"outputs": [
@@ -63,6 +63,7 @@
" (\"Short.\", \"Short.\"), # Very short\n",
" (\"A B C D E F G\", \"A B C D E F G\"), # Repeated words\n",
" (\"\", \"\"), # Empty strings\n",
" \n",
"## ADDITIONAL TEST PAIRS (for semantic evaluation)\n",
" # Polysemy (word sense ambiguity) - Critical for BERT vs SpaCy\n",
" (\"He went to the bank to deposit money.\", \"He went to the river bank to fish.\"), # Bank: financial vs. riverbank\n",
@@ -110,7 +111,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 29,
"id": "9665682bd5a7951e",
"metadata": {},
"outputs": [
@@ -191,7 +192,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 30,
"id": "91282393",
"metadata": {},
"outputs": [
@@ -241,7 +242,7 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 31,
"id": "54916598",
"metadata": {},
"outputs": [
@@ -302,7 +303,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 32,
"id": "e8a13c18",
"metadata": {},
"outputs": [
@@ -456,7 +457,7 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 33,
"id": "292e3d91",
"metadata": {},
"outputs": [
@@ -512,7 +513,7 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 34,
"id": "4bea7544",
"metadata": {},
"outputs": [
@@ -578,7 +579,7 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 35,
"id": "0d0a4596",
"metadata": {},
"outputs": [
@@ -645,7 +646,7 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 36,
"id": "ed9f0973",
"metadata": {},
"outputs": [
@@ -701,8 +702,8 @@
"\n",
"# ROC Curve\n",
"fpr, tpr, _ = roc_curve(y_test, y_pred_proba)\n",
"auc = roc_auc_score(y_test, y_pred_proba)\n",
"axes[1].plot(fpr, tpr, label=f'ROC Curve (AUC = {auc:.3f})', linewidth=2)\n",
"auc_score = roc_auc_score(y_test, y_pred_proba)\n",
"axes[1].plot(fpr, tpr, label=f'ROC Curve (AUC = {auc_score:.3f})', linewidth=2)\n",
"axes[1].plot([0, 1], [0, 1], 'k--', label='Random Classifier')\n",
"axes[1].set_xlabel('False Positive Rate')\n",
"axes[1].set_ylabel('True Positive Rate')\n",
@@ -724,7 +725,7 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 37,
"id": "6dc8a0c6",
"metadata": {},
"outputs": [
@@ -783,7 +784,7 @@
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": 38,
"id": "611b74f9",
"metadata": {},
"outputs": [
@@ -942,6 +943,80 @@
"plt.tight_layout()\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"id": "e7949424",
"metadata": {},
"source": [
"#### Summary"
]
},
{
"cell_type": "code",
"execution_count": 39,
"id": "53e53eb5",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"FUSION MODEL ANALYSIS SUMMARY\n",
"================================================================================\n",
"\n",
"1. Feature Correlations:\n",
" - BERT & TED correlation: 0.257\n",
" → Low = both branches provide independent information ✓\n",
"\n",
"2. Feature Importance:\n",
" - Most important feature: BERT (Semantic)\n",
" → Coefficient: 1.7731\n",
"\n",
"3. Model Performance:\n",
" - Test Accuracy: 0.833\n",
" - Test F1-Score: 0.800\n",
" - ROC AUC: 0.889\n",
"\n",
"4. Ablation Insights:\n",
" - Max performance drop when removing feature: 0.400\n",
" - All features contribute to model performance ✓\n",
"\n",
"5. Recommendations for Next Phase:\n",
" - Train on full MSRP corpus (35 pairs too small)\n",
" - Experiment with SVM or Random Forest for non-linear relationships\n",
" - Optimize threshold for precision vs. recall trade-off\n"
]
}
],
"source": [
"# Summarise the fusion-model analysis using values computed in earlier cells.\n",
"# Conclusions are derived from the data rather than printed unconditionally.\n",
"print(\"FUSION MODEL ANALYSIS SUMMARY\")\n",
"print(\"=\" * 80)\n",
"\n",
"print(\"\\n1. Feature Correlations:\")\n",
"# Rule-of-thumb cut-off: below this absolute correlation the two branches are\n",
"# treated as carrying largely independent information. Adjust if needed.\n",
"LOW_CORR_THRESHOLD = 0.5\n",
"bert_ted_corr = feature_corr.loc['bert', 'ted']\n",
"print(f\" - BERT & TED correlation: {bert_ted_corr:.3f}\")\n",
"if abs(bert_ted_corr) < LOW_CORR_THRESHOLD:\n",
"    print(\" → Low = both branches provide independent information ✓\")\n",
"else:\n",
"    print(\" → High = branches carry overlapping information\")\n",
"\n",
"print(\"\\n2. Feature Importance:\")\n",
"# Rank features by coefficient magnitude; compute the magnitudes only once.\n",
"abs_coefs = np.abs(model.coef_[0])\n",
"top_idx = np.argmax(abs_coefs)\n",
"most_important = feature_names[top_idx]\n",
"print(f\" - Most important feature: {most_important}\")\n",
"print(f\" → Coefficient: {abs_coefs[top_idx]:.4f}\")\n",
"\n",
"print(\"\\n3. Model Performance:\")\n",
"print(f\" - Test Accuracy: {model.score(X_test, y_test):.3f}\")\n",
"print(f\" - Test F1-Score: {f1_score(y_test, y_pred):.3f}\")\n",
"print(f\" - ROC AUC: {auc_score:.3f}\")\n",
"\n",
"print(\"\\n4. Ablation Insights:\")\n",
"# Row 0 is taken as the baseline and the remaining rows as the ablated\n",
"# variants, matching the indexing this cell has always used.\n",
"full_f1 = ablation_df.iloc[0]['F1-Score']\n",
"ablated_f1 = ablation_df['F1-Score'].iloc[1:]\n",
"performance_drop = full_f1 - ablated_f1.min()\n",
"print(f\" - Max performance drop when removing feature: {performance_drop:.3f}\")\n",
"# Only claim that every feature matters when every ablation lowers F1.\n",
"if (ablated_f1 < full_f1).all():\n",
"    print(\" - All features contribute to model performance ✓\")\n",
"else:\n",
"    print(\" - Not all features contribute: some ablations do not lower F1\")\n",
"\n",
"print(\"\\n5. Recommendations for Next Phase:\")\n",
"print(\" - Train on full MSRP corpus (35 pairs too small)\")\n",
"print(\" - Experiment with SVM or Random Forest for non-linear relationships\")\n",
"print(\" - Optimize threshold for precision vs. recall trade-off\")"
]
}
],
"metadata": {