Files
paraphrase_detector/notebooks/01_data_exploration.ipynb
2025-11-18 23:25:04 +00:00

74 lines
1.9 KiB
Plaintext

{
"cells": [
{
"metadata": {},
"cell_type": "markdown",
"source": "Keep punctuation for direct-copy detection, but remove it for semantic/keyword-based methods.",
"id": "1c26616777253f10"
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-11-18T23:01:17.888318Z",
"start_time": "2025-11-18T23:01:16.494987Z"
}
},
"cell_type": "code",
"source": [
"import spacy\n",
"\n",
"# The large model can be swapped in here if required.\n",
"nlp = spacy.load(\"en_core_web_md\")\n",
"\n",
"test_sentences = [\n",
"    \"The cat sat on the mat.\",\n",
"    \"On the mat, the cat was sitting.\",\n",
"    \"A completely different sentence about something else.\",\n",
"]\n",
"\n",
"# Run each sample through the pipeline and show the text of every token it yields.\n",
"for sentence in test_sentences:\n",
"    token_texts = [tok.text for tok in nlp(sentence)]\n",
"    print(f\"Sentence: {sentence}\")\n",
"    print(f\"Tokens: {token_texts}\")\n",
"    print(\"---\")\n"
],
"id": "e003ac06a58cfbb4",
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Sentence: The cat sat on the mat.\n",
"Tokens: ['The', 'cat', 'sat', 'on', 'the', 'mat', '.']\n",
"---\n",
"Sentence: On the mat, the cat was sitting.\n",
"Tokens: ['On', 'the', 'mat', ',', 'the', 'cat', 'was', 'sitting', '.']\n",
"---\n",
"Sentence: A completely different sentence about something else.\n",
"Tokens: ['A', 'completely', 'different', 'sentence', 'about', 'something', 'else', '.']\n",
"---\n"
]
}
],
"execution_count": 1
},
{
"metadata": {},
"cell_type": "code",
"outputs": [],
"execution_count": null,
"source": "",
"id": "83fc18c9de2e354"
}
],
"metadata": {
"kernelspec": {
"name": "python3",
"language": "python",
"display_name": "Python 3 (ipykernel)"
}
},
"nbformat": 4,
"nbformat_minor": 5
}