-
Notifications
You must be signed in to change notification settings - Fork 1
Tweet Classification VITAL Workflow
Sajjadur Rahman edited this page Dec 18, 2020
·
2 revisions
# Open the training CSV through the VITAL entry point.
data = VTA("train.csv")

# The cleaning steps below are called on `text`, so the column has to be
# fetched before its first use (the walkthrough only assigned it further down).
text = data.get_column("text")
text.project().remove_urls()
text.project().strip_html()
text.project().remove_emoji()
text.project().remove_punctuation()

# Fetch the saved model by file name and run prediction, passing the
# column name "text".
model = data.get_model("disaster_tweets.h5")
model.predict("text")

# Count over the "target" column; the result is tagged "sample_counts" and
# the bar chart below refers to that same tag.
col = data.get_column("target")
col.aggregate().count("sample_counts")
col.visualize("barchart", md_tag="sample_counts")

# Re-fetch the text column as the original walkthrough does at this point,
# derive a word-count column from it, and plot it together with "target".
text = data.get_column("text")
new_col = text.mutate().num_words()
data.visualize([new_col, "target"], "histogram")
def get_ngrams(corpus, n: int, g: int) -> dict:
    """Return the ``n`` most frequent ``g``-grams found in ``corpus``.

    Args:
        corpus: Collection of text documents; it is passed to
            ``CountVectorizer.fit`` and ``CountVectorizer.transform``.
        n: Maximum number of entries to return.
        g: N-gram size; used as both bounds of ``ngram_range`` so only
            n-grams of exactly this size are counted.

    Returns:
        A dict mapping each n-gram to its total count over the whole
        corpus, inserted in descending order of count.
    """
    vec = CountVectorizer(ngram_range=(g, g)).fit(corpus)
    bow = vec.transform(corpus)
    # Summing over axis 0 collapses the documents, leaving one total per
    # vocabulary column.
    sum_words = bow.sum(axis=0)
    # vocabulary_ maps each n-gram to its column index in the matrix.
    words_freq = [
        (word, int(sum_words[0, idx])) for word, idx in vec.vocabulary_.items()
    ]
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    return dict(words_freq[:n])
# Hand the n-gram helper to the dataset object; it is referred to by the
# string "get_ngrams" in the next call.
data.add_udf(get_ngrams)
# Invoke the UDF by name on the text column with the extra arguments 10 and 2
# (presumably n=10 and g=2, with the column supplied as `corpus` - confirm
# against the VITAL API). The result is tagged "ngrams".
text.apply_udf("get_ngrams", 10, 2, md_tag="ngrams")
# Bar chart referring to the same "ngrams" tag.
text.visualize("barchart", md_tag="ngrams")