-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcleanData.py
46 lines (32 loc) · 1.24 KB
/
cleanData.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
#print('hello')
import spacy
from spacy.tokens import Span
import collections
def print_line():
    """Print a blank line followed by a dashed horizontal rule to stdout."""
    separator = "\n-----------------------------------------------------"
    print(separator)
def get_entities():
    """Print noun phrases, verb lemmas and named entities found in ``text``.

    Runs the module-level ``nlp`` pipeline over the module-level ``text``
    and writes to stdout:

    * one line with the list of noun-chunk texts,
    * one line with the list of lemmas of tokens tagged ``VERB``,
    * one line per named entity, showing its text and its label.
    """
    parsed = nlp(text)

    noun_phrases = [phrase.text for phrase in parsed.noun_chunks]
    print("Noun phrases:", noun_phrases)

    verb_lemmas = [tok.lemma_ for tok in parsed if tok.pos_ == "VERB"]
    print("Verbs:", verb_lemmas)

    # One "<entity text> <entity label>" line per recognised entity.
    for ent in parsed.ents:
        print(ent.text, ent.label_)
def get_lpd():
    """Print text, lemma, part-of-speech tag and dependency label per token.

    Runs the module-level ``nlp`` pipeline over the module-level ``text``,
    writes one formatted line per token to stdout, then finishes with the
    separator emitted by ``print_line()``.
    """
    row_template = (
        "Text: {} | Lemmatization: {} | "
        "Part of Speech: {} | Dependency Parsing: {}"
    )
    parsed = nlp(text)
    for tok in parsed:
        row = row_template.format(tok.text, tok.lemma_, tok.pos_, tok.dep_)
        print(row)
    print_line()
def most_frequent():
    """Print the ten most common non-punctuation tokens of ``text``.

    Runs the module-level ``nlp`` pipeline over the module-level ``text``,
    counts the ``text`` of every token whose part-of-speech tag is not
    ``PUNCT``, prints the ten most common ``(token, count)`` pairs as a
    list, then prints the separator emitted by ``print_line()``.
    """
    parsed = nlp(text)
    # Feed the counter straight from a generator; no intermediate list needed.
    tally = collections.Counter(
        tok.text for tok in parsed if tok.pos_ != "PUNCT"
    )
    print(tally.most_common(10))
    print_line()
# spaCy pipeline shared by every helper in this script (read as a global).
nlp = spacy.load("en_core_web_sm")

# Sample passage analysed by the helpers (read as a global).
text = ("When Sebastian Thrun started working on self-driving cars at "
        "Google in 2007, few people outside of the company took him "
        "seriously. “I can tell you very senior CEOs of major American "
        "car companies would shake my hand and turn away because I wasn’t "
        "worth talking to,” said Thrun, in an interview with Recode earlier "
        "this week.")

get_entities()
# TODO: this script used to call update_entities() at this point, but no
# function with that name is defined in this file, so the run stopped with
# a NameError before most_frequent() was reached. The call is omitted until
# the helper is implemented.
most_frequent()