Ran files

2026-02-27 04:56:34 +00:00
parent 84ee1dc246
commit 3d755fca3e
4 changed files with 829 additions and 769 deletions
--- a/notebooks/04_fusion_model.ipynb
+++ b/notebooks/04_fusion_model.ipynb
@@ -13,7 +13,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": null,
   "id": "1c45d83192facfc6",
   "metadata": {},
   "outputs": [
@@ -61,8 +61,9 @@
    "    \n",
    "    # Edge cases\n",
    "    (\"Short.\", \"Short.\"),                                             # Very short\n",
-    "    (\"A B C D E F G\", \"A B C D E F G\"),                              # Repeated words\n",
+    "    (\"A B C D E F G\", \"A B C D E F G\"),                               # Repeated words\n",
    "    (\"\", \"\"),                                                         # Empty strings\n",
+    "    \n",
    "## ADDITIONAL TEST PAIRS (for semantic evaluation)\n",
    "    # Polysemy (word sense ambiguity) - Critical for BERT vs SpaCy\n",
    "    (\"He went to the bank to deposit money.\", \"He went to the river bank to fish.\"),           # Bank: financial vs. riverbank\n",
@@ -110,7 +111,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 29,
   "id": "9665682bd5a7951e",
   "metadata": {},
   "outputs": [
@@ -191,7 +192,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 30,
   "id": "91282393",
   "metadata": {},
   "outputs": [
@@ -241,7 +242,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 31,
   "id": "54916598",
   "metadata": {},
   "outputs": [
@@ -302,7 +303,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 32,
   "id": "e8a13c18",
   "metadata": {},
   "outputs": [
@@ -456,7 +457,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 33,
   "id": "292e3d91",
   "metadata": {},
   "outputs": [
@@ -512,7 +513,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 34,
   "id": "4bea7544",
   "metadata": {},
   "outputs": [
@@ -578,7 +579,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 35,
   "id": "0d0a4596",
   "metadata": {},
   "outputs": [
@@ -645,7 +646,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 36,
   "id": "ed9f0973",
   "metadata": {},
   "outputs": [
@@ -701,8 +702,8 @@
    "\n",
    "# ROC Curve\n",
    "fpr, tpr, _ = roc_curve(y_test, y_pred_proba)\n",
-    "auc = roc_auc_score(y_test, y_pred_proba)\n",
-    "axes[1].plot(fpr, tpr, label=f'ROC Curve (AUC = {auc:.3f})', linewidth=2)\n",
+    "auc_score = roc_auc_score(y_test, y_pred_proba)\n",
+    "axes[1].plot(fpr, tpr, label=f'ROC Curve (AUC = {auc_score:.3f})', linewidth=2)\n",
    "axes[1].plot([0, 1], [0, 1], 'k--', label='Random Classifier')\n",
    "axes[1].set_xlabel('False Positive Rate')\n",
    "axes[1].set_ylabel('True Positive Rate')\n",
@@ -724,7 +725,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 37,
   "id": "6dc8a0c6",
   "metadata": {},
   "outputs": [
@@ -783,7 +784,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": 38,
   "id": "611b74f9",
   "metadata": {},
   "outputs": [
@@ -942,6 +943,80 @@
    "plt.tight_layout()\n",
    "plt.show()"
   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "e7949424",
+   "metadata": {},
+   "source": [
+    "#### Summary"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 39,
+   "id": "53e53eb5",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "FUSION MODEL ANALYSIS SUMMARY\n",
+      "================================================================================\n",
+      "\n",
+      "1. Feature Correlations:\n",
+      "   - BERT & TED correlation: 0.257\n",
+      "     → Low = both branches provide independent information ✓\n",
+      "\n",
+      "2. Feature Importance:\n",
+      "   - Most important feature: BERT (Semantic)\n",
+      "     → Coefficient: 1.7731\n",
+      "\n",
+      "3. Model Performance:\n",
+      "   - Test Accuracy: 0.833\n",
+      "   - Test F1-Score: 0.800\n",
+      "   - ROC AUC: 0.889\n",
+      "\n",
+      "4. Ablation Insights:\n",
+      "   - Max performance drop when removing feature: 0.400\n",
+      "   - All features contribute to model performance ✓\n",
+      "\n",
+      "5. Recommendations for Next Phase:\n",
+      "   - Train on full MSRP corpus (35 pairs too small)\n",
+      "   - Experiment with SVM or Random Forest for non-linear relationships\n",
+      "   - Optimize threshold for precision vs. recall trade-off\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Summary report. Uses names not defined in this cell (feature_corr, feature_names, model, X_test, y_test, y_pred, f1_score, auc_score, ablation_df), so earlier cells must run first.\n",
+    "print(\"FUSION MODEL ANALYSIS SUMMARY\")\n",
+    "print(\"=\" * 80)\n",
+    "print(\"\\n1. Feature Correlations:\")\n",
+    "print(f\"   - BERT & TED correlation: {feature_corr.loc['bert', 'ted']:.3f}\")\n",
+    "print(\"     → Low = both branches provide independent information ✓\")  # NOTE(review): printed unconditionally, not derived from the correlation value\n",
+    "\n",
+    "print(\"\\n2. Feature Importance:\")\n",
+    "top_idx = np.argmax(np.abs(model.coef_[0]))  # rank by magnitude, but report the signed coefficient below\n",
+    "most_important = feature_names[top_idx]\n",
+    "print(f\"   - Most important feature: {most_important}\")\n",
+    "print(f\"     → Coefficient: {model.coef_[0][top_idx]:.4f}\")\n",
+    "\n",
+    "print(\"\\n3. Model Performance:\")\n",
+    "print(f\"   - Test Accuracy: {model.score(X_test, y_test):.3f}\")\n",
+    "print(f\"   - Test F1-Score: {f1_score(y_test, y_pred):.3f}\")\n",
+    "print(f\"   - ROC AUC: {auc_score:.3f}\")\n",
+    "\n",
+    "print(\"\\n4. Ablation Insights:\")\n",
+    "performance_drop = ablation_df.iloc[0]['F1-Score'] - ablation_df['F1-Score'].iloc[1:].min()  # assumes row 0 is the full-feature baseline (TODO confirm)\n",
+    "print(f\"   - Max performance drop when removing feature: {performance_drop:.3f}\")\n",
+    "print(\"   - All features contribute to model performance ✓\")  # NOTE(review): printed unconditionally, not checked against ablation_df\n",
+    "print(\"\\n5. Recommendations for Next Phase:\")\n",
+    "print(\"   - Train on full MSRP corpus (35 pairs too small)\")\n",
+    "print(\"   - Experiment with SVM or Random Forest for non-linear relationships\")\n",
+    "print(\"   - Optimize threshold for precision vs. recall trade-off\")"
+   ]
  }
 ],
 "metadata": {