Removed old files and added MSRParaphrase parser

2025-11-18 19:44:39 +00:00
parent 6bae7a2168
commit 42deee19f6
11 changed files with 96 additions and 77 deletions
--- a/basic-testing/parse_file.py
+++ b/basic-testing/parse_file.py
@@ -0,0 +1,6 @@
+import pandas as pd
+
+def load_msr_data(file_path):
+    """Load the MSR Paraphrase Corpus from the TSV file at ``file_path`` and return it as a DataFrame."""
+    df = pd.read_csv(file_path, sep='\t', quoting=3)  # honor the caller's path; quoting=3 is csv.QUOTE_NONE
+    return df
--- a/basic-testing/parse_tree.py
+++ b/basic-testing/parse_tree.py
@@ -0,0 +1,29 @@
+import spacy
+from spacy import displacy
+
+# Load the spaCy pipeline once at import time; extract_parse_tree() reuses it
+nlp = spacy.load("en_core_web_sm")
+
+def extract_parse_tree(text):
+    """Parse ``text`` with spaCy, print one row per token, and return the Doc."""
+    parsed = nlp(text)
+
+    # Header: the raw sentence, a section title, and a 50-dash rule.
+    print(f"Sentence: {text}", "\nDependency Parse Tree:", "-" * 50, sep="\n")
+
+    # Row columns: token, dependency label, head token, list of child tokens.
+    for tok in parsed:
+        children = [kid.text for kid in tok.children]
+        print(f"{tok.text:<12} {tok.dep_:<12} {tok.head.text:<12} {children}")
+    return parsed
+
+# Demo run: parse each sample sentence below and print a separator after it
+test_sentences = [
+    "The cat sat on the mat.",
+    "A quick brown fox jumps over the lazy dog.",
+    "She gave him the book yesterday."
+]
+
+for sentence in test_sentences:
+    doc = extract_parse_tree(sentence)  # return value is not used here; the function prints the tree itself
+    print("\n" + "="*60 + "\n")
--- a/data/processed/msr_test.pkl
+++ b/data/processed/msr_test.pkl
--- a/data/processed/msr_train.pkl
+++ b/data/processed/msr_train.pkl
--- a/data_preprocessing/msr_data_to_pickle.py
+++ b/data_preprocessing/msr_data_to_pickle.py
@@ -0,0 +1,53 @@
+import pandas as pd
+import os
+
+raw_data_path = "./data/raw/"  # no live code in this file reads this constant
+processed_data_path = "./data/processed/"  # no live code in this file reads this constant
+
+def load_msr_data(file_path):
+    """Read the tab-separated MSR Paraphrase Corpus at ``file_path``.
+
+    Prints the number of rows loaded and returns the resulting DataFrame.
+    """
+    # quoting=3 is csv.QUOTE_NONE: quote characters are kept as ordinary text.
+    corpus = pd.read_csv(file_path, quoting=3, sep="\t")
+    print(f"Loaded {corpus.shape[0]} sentence pairs")
+    return corpus
+
+
+def save_to_pickle(df, pickle_path):
+    """Pickle ``df`` to ``pickle_path`` and print where it was written."""
+    df.to_pickle(path=pickle_path)
+    print(f"DataFrame saved to {pickle_path!s}")
+
+
+def load_and_save_data():
+    """Interactively convert an MSR corpus TSV into a pickle file.
+
+    Prompts for the corpus path, the output directory and the file name
+    (".pkl" is appended); failures are printed and None is always returned.
+    """
+    print("Enter current relative path to MSR Corpus\n")
+    corpus_path = input("./ : ").strip()
+    try:
+        frame = load_msr_data(corpus_path)
+    except Exception as err:
+        print(f"❌ Error loading data: {err}")
+        return None
+
+    out_dir = input("Enter relative path to save pickle: ").strip()
+    if not os.path.isdir(out_dir):
+        print(f"❌ Directory does not exist: ./{out_dir}")
+        return None
+
+    out_name = input("Enter pickle filename: ").strip() + ".pkl"
+    target = os.path.join(out_dir, out_name)
+    try:
+        save_to_pickle(frame, target)
+    except Exception as err:
+        print(f"❌ Error saving pickle: {err}")
+        return None
+    print("✅ Data loading and saving completed successfully.")
+    print(f"Pickle saved at: {target}")
+
+load_and_save_data()  # unguarded call: the interactive prompts start whenever this module is run or imported
--- a/dataset_testing.py
+++ b/dataset_testing.py
@@ -1,42 +0,0 @@
-import spacy
-from datasets import load_dataset
-from tools import parser
-
-# Load spaCy and dataset
-nlp = spacy.load("en_core_web_sm")
-dataset = load_dataset("glue", "mrpc")
-
-def process_sentence_pair(sentence1, sentence2):
-    """Parse both sentences and extract their dependency structures"""
-    
-    # Parse both sentences
-    doc1 = nlp(sentence1)
-    doc2 = nlp(sentence2)
-    
-    # Extract dependency graphs
-    deps1 = parser.extract_dependency_relationships(doc1)
-    deps2 = parser.extract_dependency_relationships(doc2)
-    
-    return {
-        'sentence1': sentence1,
-        'sentence2': sentence2,
-        'dependencies1': deps1,
-        'dependencies2': deps2,
-        'doc1': doc1,
-        'doc2': doc2
-    }
-
-# Process a few examples from the dataset
-print("Processing MRPC examples...")
-for i in range(5):  # Just do first 5 examples
-    example = dataset['train'][i]
-    result = process_sentence_pair(example['sentence1'], example['sentence2'])
-    
-    print(f"\nExample {i+1}:")
-    print(f"Sentence 1: {result['sentence1']}")
-    print(f"Sentence 2: {result['sentence2']}")
-    print(f"Label: {example['label']} (1=paraphrase, 0=not paraphrase)")
-    
-    print(f"\nDependencies for Sentence 1:")
-    for dep in result['dependencies1'][:55]:  # Show first 55 dependencies (Likely All)
-        print(f"  {dep['word']} --{dep['dep_type']}--> {dep['head']}")
--- a/requirments.txt
+++ b/requirments.txt
@@ -0,0 +1,8 @@
+datasets
+huggingface-hub
+pandas
+numpy
+scikit-learn
+spacy
+matplotlib
+seaborn
--- a/tools/init.py
+++ b/tools/init.py
--- a/tools/pycache/init.cpython-313.pyc
+++ b/tools/pycache/init.cpython-313.pyc
--- a/tools/pycache/parser.cpython-313.pyc
+++ b/tools/pycache/parser.cpython-313.pyc
--- a/tools/parser.py
+++ b/tools/parser.py
@@ -1,35 +0,0 @@
-import spacy
-
-# English model
-nlp = spacy.load("en_core_web_sm")
-
-# Parse a single sentence
-def parse_sentence(sentence):
-    doc = nlp(sentence)
-
-    print("Token-by-token analysis:")
-    for token in doc:
-        print(f"Text: {token.text:<12} Dep: {token.dep_:<10} Head: {token.head.text:<10} POS: {token.pos_:<8}")
-
-    return doc
-
-def extract_dependency_relationships(doc):
-    """Extract dependency relationships for graph representation"""
-    dependencies = []
-    
-    for token in doc:
-        # Skip punctuation
-        if token.is_punct:
-            continue
-            
-        dependency = {
-            'word': token.text,
-            'lemma': token.lemma_,
-            'dep_type': token.dep_,
-            'head': token.head.text,
-            'head_lemma': token.head.lemma_,
-            'pos': token.pos_
-        }
-        dependencies.append(dependency)
-    
-    return dependencies