diff --git a/.gitignore b/.gitignore
new file mode 100644
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,4 @@
+venv
+.vscode
+__pycache__/
+*.pyc
diff --git a/__init__.py b/__init__.py
new file mode 100644
diff --git a/main.py b/main.py
new file mode 100644
--- /dev/null
+++ b/main.py
@@ -0,0 +1,14 @@
+from tools.parser import parse_sentence, extract_dependency_relationships
+
+def main():
+
+    sentence = "The quick brown fox jumps over the lazy dog."
+    doc = parse_sentence(sentence)
+
+    dependencies = extract_dependency_relationships(doc)
+    print("\nDependency relationships:")
+    for dep in dependencies:
+        print(f"{dep['word']} --{dep['dep_type']}--> {dep['head']}")
+
+if __name__ == "__main__":
+    main()
diff --git a/testing/__init__.py b/testing/__init__.py
new file mode 100644
diff --git a/testing/dataset_testing.py b/testing/dataset_testing.py
new file mode 100644
--- /dev/null
+++ b/testing/dataset_testing.py
@@ -0,0 +1,43 @@
+import spacy
+from datasets import load_dataset
+
+from tools import parser
+
+# Load spaCy and dataset
+nlp = spacy.load("en_core_web_sm")
+dataset = load_dataset("glue", "mrpc")
+
+def process_sentence_pair(sentence1, sentence2):
+    """Parse both sentences and extract their dependency structures"""
+
+    # Parse both sentences
+    doc1 = nlp(sentence1)
+    doc2 = nlp(sentence2)
+
+    # Extract dependency graphs
+    deps1 = parser.extract_dependency_relationships(doc1)
+    deps2 = parser.extract_dependency_relationships(doc2)
+
+    return {
+        'sentence1': sentence1,
+        'sentence2': sentence2,
+        'dependencies1': deps1,
+        'dependencies2': deps2,
+        'doc1': doc1,
+        'doc2': doc2
+    }
+
+# Process a few examples from the dataset
+print("Processing MRPC examples...")
+for i in range(3):  # Just do first 3 examples
+    example = dataset['train'][i]
+    result = process_sentence_pair(example['sentence1'], example['sentence2'])
+
+    print(f"\nExample {i+1}:")
+    print(f"Sentence 1: {result['sentence1']}")
+    print(f"Sentence 2: {result['sentence2']}")
+    print(f"Label: {example['label']} (1=paraphrase, 0=not paraphrase)")
+
+    print(f"\nDependencies for Sentence 1:")
+    for dep in result['dependencies1'][:5]:  # Show first 5 dependencies
+        print(f"  {dep['word']} --{dep['dep_type']}--> {dep['head']}")
diff --git a/tools/__init__.py b/tools/__init__.py
new file mode 100644
diff --git a/tools/parser.py b/tools/parser.py
new file mode 100644
--- /dev/null
+++ b/tools/parser.py
@@ -0,0 +1,35 @@
+import spacy
+
+# English model
+nlp = spacy.load("en_core_web_sm")
+
+# Parse a single sentence
+def parse_sentence(sentence):
+    doc = nlp(sentence)
+
+    print("Token-by-token analysis:")
+    for token in doc:
+        print(f"Text: {token.text:<12} Dep: {token.dep_:<10} Head: {token.head.text:<10} POS: {token.pos_:<8}")
+
+    return doc
+
+def extract_dependency_relationships(doc):
+    """Extract dependency relationships for graph representation"""
+    dependencies = []
+
+    for token in doc:
+        # Skip punctuation
+        if token.is_punct:
+            continue
+
+        dependency = {
+            'word': token.text,
+            'lemma': token.lemma_,
+            'dep_type': token.dep_,
+            'head': token.head.text,
+            'head_lemma': token.head.lemma_,
+            'pos': token.pos_
+        }
+        dependencies.append(dependency)
+
+    return dependencies