# NoteBook Functions
|
|
import spacy
|
|
from spacy import displacy
|
|
from IPython.display import display, HTML
|
|
|
|
# Shared spaCy pipeline for the whole notebook; the medium model ships
# word vectors, which sentence_similarity_avg relies on below.
nlp = spacy.load("en_core_web_md") # Medium model
|
|
|
|
def jaccard_similarity(sent1, sent2):
    """Jaccard similarity of the two sentences' lowercase word sets.

    Returns |A & B| / |A | B| as a float, or 0.0 when both sentences
    contain no words (empty union).
    """
    words_a = set(sent1.lower().split())
    words_b = set(sent2.lower().split())

    union = words_a | words_b
    if not union:
        # Both sentences empty: define similarity as 0.0 to avoid 0/0.
        return 0.0
    return len(words_a & words_b) / len(union)
|
|
|
|
def sentence_similarity_avg(sent1, sent2):
    """Cosine similarity between the averaged word vectors of two sentences.

    Tokens the model has no vector for are ignored. Returns 0.0 when either
    sentence has no vectored tokens, or when an averaged vector is all-zero
    (cosine would be undefined).
    """
    # Local import keeps module import time low; numpy is already a spaCy
    # dependency, so this removes the previous scikit-learn requirement.
    import numpy as np

    doc1 = nlp(sent1)
    doc2 = nlp(sent2)

    # Vectors for each word; filter out words without vectors (medium model).
    vecs1 = [token.vector for token in doc1 if token.has_vector]
    vecs2 = [token.vector for token in doc2 if token.has_vector]

    if not vecs1 or not vecs2:
        return 0.0

    # Average vectors.
    avg1 = np.mean(vecs1, axis=0)
    avg2 = np.mean(vecs2, axis=0)

    # Guard against zero-norm averages: cosine similarity would divide by 0.
    norm1 = np.linalg.norm(avg1)
    norm2 = np.linalg.norm(avg2)
    if norm1 == 0.0 or norm2 == 0.0:
        return 0.0

    # Cosine similarity; cast to a plain float for consistency with the
    # other similarity helpers in this file.
    return float(np.dot(avg1, avg2) / (norm1 * norm2))
|
|
|
|
def extract_all_features(sentence_pairs):
    """Build one feature row of similarity scores per sentence pair.

    Each row is [jaccard, averaged-vector cosine, SIF cosine, syntactic].
    Returns the list of feature rows.
    """
    features = []
    for sent1, sent2 in sentence_pairs:
        feature_vector = [
            jaccard_similarity(sent1, sent2),
            sentence_similarity_avg(sent1, sent2),
            sentence_similarity_sif(sent1, sent2),
            syntactic_similarity(sent1, sent2),
        ]
        # Bug fix: the computed row was previously discarded.
        features.append(feature_vector)
    # Bug fix: the function previously returned None.
    return features
|
|
|
|
def visualize_parse_tree(text):
    """Render the dependency parse of *text* and display it in the notebook."""
    parsed = nlp(text)
    markup = displacy.render(
        parsed,
        style="dep",
        jupyter=False,
        options={"distance": 100},
    )
    display(HTML(markup))