Removed old files and added MSRParaphrase parser
This commit is contained in:
6
basic-testing/parse_file.py
Normal file
6
basic-testing/parse_file.py
Normal file
@@ -0,0 +1,6 @@
|
||||
import pandas as pd
|
||||
|
||||
def load_msr_data(file_path):
    """Load the MSR Paraphrase Corpus from a TSV file.

    Args:
        file_path: Path to the tab-separated corpus file.

    Returns:
        pandas.DataFrame with one row per sentence pair.
    """
    # Bug fix: read from the file_path argument instead of the previously
    # hard-coded "../data/processed/msr_paraphrase_train.txt" path.
    # quoting=3 (csv.QUOTE_NONE) because corpus sentences contain
    # unbalanced quote characters that would otherwise break parsing.
    df = pd.read_csv(file_path, sep='\t', quoting=3)
    return df
|
||||
29
basic-testing/parse_tree.py
Normal file
29
basic-testing/parse_tree.py
Normal file
@@ -0,0 +1,29 @@
|
||||
import spacy
|
||||
from spacy import displacy
|
||||
|
||||
# Load the model
# Small English spaCy pipeline; must be downloaded beforehand with
# `python -m spacy download en_core_web_sm`, otherwise this raises OSError.
nlp = spacy.load("en_core_web_sm")
|
||||
|
||||
def extract_parse_tree(text):
    """Parse *text* with spaCy and print its dependency structure.

    Prints the sentence, a header, and one aligned line per token
    (token, dependency label, head, children), then returns the Doc.
    """
    parsed = nlp(text)

    print(f"Sentence: {text}")
    print("\nDependency Parse Tree:")
    print("-" * 50)

    for tok in parsed:
        children = [kid.text for kid in tok.children]
        print(f"{tok.text:<12} {tok.dep_:<12} {tok.head.text:<12} {children}")

    return parsed
|
||||
|
||||
# Smoke-test the parser on a few sentences of varying structure.
test_sentences = [
    "The cat sat on the mat.",
    "A quick brown fox jumps over the lazy dog.",
    "She gave him the book yesterday."
]

for sentence in test_sentences:
    doc = extract_parse_tree(sentence)
    # Visual separator between examples.
    separator = "\n" + "=" * 60 + "\n"
    print(separator)
|
||||
BIN
data/processed/msr_test.pkl
Normal file
BIN
data/processed/msr_test.pkl
Normal file
Binary file not shown.
BIN
data/processed/msr_train.pkl
Normal file
BIN
data/processed/msr_train.pkl
Normal file
Binary file not shown.
53
data_preprocessing/msr_data_to_pickle.py
Normal file
53
data_preprocessing/msr_data_to_pickle.py
Normal file
@@ -0,0 +1,53 @@
|
||||
import pandas as pd
|
||||
import os
|
||||
|
||||
# Default locations of the raw and preprocessed corpus files, relative
# to the repository root.
# NOTE(review): raw_data_path is currently unused — load_and_save_data()
# takes paths from user input instead; confirm before removing.
raw_data_path = "./data/raw/"
processed_data_path = "./data/processed/"
|
||||
|
||||
def load_msr_data(file_path):
    """Read the MSR Paraphrase Corpus TSV at *file_path* into a DataFrame.

    quoting=3 (csv.QUOTE_NONE) is required because the corpus sentences
    contain unbalanced quote characters.
    """
    frame = pd.read_csv(file_path, sep='\t', quoting=3)

    print(f"Loaded {len(frame)} sentence pairs")

    return frame
|
||||
|
||||
|
||||
def save_to_pickle(df, pickle_path):
    """Serialize *df* to *pickle_path* and report where it was written."""
    target = pickle_path
    # Delegate serialization to pandas, then confirm on stdout.
    df.to_pickle(target)
    print(f"DataFrame saved to {target}")
|
||||
|
||||
|
||||
def load_and_save_data():
    """Interactively load the MSR corpus and save it as a pickle.

    Prompts the user for the corpus path, loads it via load_msr_data,
    then prompts for an existing output directory and a filename.

    Returns:
        The full path of the written pickle on success, or None if
        loading fails, the output directory is missing, or saving fails.
    """
    print("Enter current relative path to MSR Corpus\n")
    relative_path = input("./ : ").strip()

    try:
        df = load_msr_data(relative_path)
    except Exception as e:
        print(f"❌ Error loading data: {e}")
        return None

    pkl_save_path = input("Enter relative path to save pickle: ").strip()
    if not os.path.isdir(pkl_save_path):
        print(f"❌ Directory does not exist: ./{pkl_save_path}")
        return None

    pkl_filename = input("Enter pickle filename: ").strip() + ".pkl"
    full_pkl_path = os.path.join(pkl_save_path, pkl_filename)

    try:
        save_to_pickle(df, full_pkl_path)
    except Exception as e:
        print(f"❌ Error saving pickle: {e}")
        return None

    print("✅ Data loading and saving completed successfully.")
    print(f"Pickle saved at: {full_pkl_path}")
    # Fix: return the written path so callers can distinguish success
    # (a path) from failure (None); previously success also returned None.
    return full_pkl_path


# Guard the interactive entry point so importing this module does not
# immediately start prompting for input.
if __name__ == "__main__":
    load_and_save_data()
|
||||
@@ -1,42 +0,0 @@
|
||||
import spacy
|
||||
from datasets import load_dataset
|
||||
from tools import parser
|
||||
|
||||
# Load spaCy and dataset
# Small English pipeline plus the GLUE MRPC paraphrase dataset
# (downloaded/cached by the `datasets` library on first use).
nlp = spacy.load("en_core_web_sm")
dataset = load_dataset("glue", "mrpc")
|
||||
|
||||
def process_sentence_pair(sentence1, sentence2):
    """Parse a sentence pair and bundle both parses with their
    dependency relationships.

    Returns a dict containing the raw sentences, the extracted
    dependency lists for each, and the underlying spaCy Doc objects.
    """
    first_doc = nlp(sentence1)
    second_doc = nlp(sentence2)

    result = {
        'sentence1': sentence1,
        'sentence2': sentence2,
        'dependencies1': parser.extract_dependency_relationships(first_doc),
        'dependencies2': parser.extract_dependency_relationships(second_doc),
        'doc1': first_doc,
        'doc2': second_doc,
    }
    return result
|
||||
|
||||
# Walk the first few MRPC pairs and display their parses.
print("Processing MRPC examples...")
for i in range(5):  # Just do first 5 examples
    example = dataset['train'][i]
    result = process_sentence_pair(example['sentence1'], example['sentence2'])

    summary_lines = [
        f"\nExample {i+1}:",
        f"Sentence 1: {result['sentence1']}",
        f"Sentence 2: {result['sentence2']}",
        f"Label: {example['label']} (1=paraphrase, 0=not paraphrase)",
    ]
    for line in summary_lines:
        print(line)

    print(f"\nDependencies for Sentence 1:")
    # Show first 55 dependencies (likely all of them).
    for dep in result['dependencies1'][:55]:
        print(f" {dep['word']} --{dep['dep_type']}--> {dep['head']}")
|
||||
@@ -0,0 +1,8 @@
|
||||
datasets
|
||||
huggingface-hub
|
||||
pandas
|
||||
numpy
|
||||
scikit-learn
|
||||
spacy
|
||||
matplotlib
|
||||
seaborn
|
||||
Binary file not shown.
Binary file not shown.
@@ -1,35 +0,0 @@
|
||||
import spacy
|
||||
|
||||
# English model
# spaCy's small English pipeline; install with
# `python -m spacy download en_core_web_sm` before running.
nlp = spacy.load("en_core_web_sm")
|
||||
|
||||
# Parse a single sentence
|
||||
def parse_sentence(sentence):
    """Parse *sentence*, print a per-token analysis, and return the Doc."""
    analyzed = nlp(sentence)

    print("Token-by-token analysis:")
    for tok in analyzed:
        row = f"Text: {tok.text:<12} Dep: {tok.dep_:<10} Head: {tok.head.text:<10} POS: {tok.pos_:<8}"
        print(row)

    return analyzed
|
||||
|
||||
def extract_dependency_relationships(doc):
    """Extract dependency relationships for graph representation.

    Produces one record per non-punctuation token: the token's text and
    lemma, its dependency label, its head's text and lemma, and its POS.
    """
    def _edge(tok):
        # Flatten a token into a plain-dict edge description.
        return {
            'word': tok.text,
            'lemma': tok.lemma_,
            'dep_type': tok.dep_,
            'head': tok.head.text,
            'head_lemma': tok.head.lemma_,
            'pos': tok.pos_
        }

    # Punctuation carries no useful dependency edge — skip it.
    return [_edge(tok) for tok in doc if not tok.is_punct]
|
||||
Reference in New Issue
Block a user