Skip to content

Tweet Classification VITAL Workflow

Sajjadur Rahman edited this page Dec 18, 2020 · 2 revisions

Getting References

data = VTA("train.csv")

Clean text

text.project().remove_urls()
text.project().strip_html()
text.project().remove_emoji()
text.project().remove_punctuation()

Load and apply model

model = data.get_model("disaster_tweets.h5")
model.predict("text")

Display the distribution of disaster vs. non-disaster tweets

col = data.get_column("target")
col.aggregate().count("sample_counts")
col.visualize("barchart", md_tag="sample_counts")

Display word counts

text = data.get_column("text")
new_col = text.mutate().num_words()
data.visualize([new_col, "target"], "histogram")

Create and Add UDF

def get_ngrams(corpus, n, g):
    """Return the ``n`` most frequent ``g``-grams found in ``corpus``.

    Args:
        corpus: Iterable of text documents, handed directly to
            ``CountVectorizer`` (its default tokenization applies).
        n: Maximum number of n-grams to return.
        g: N-gram size; only n-grams of exactly ``g`` tokens are counted.

    Returns:
        A dict mapping each selected n-gram to its total count summed over
        all documents, inserted in descending order of count.
    """
    vec = CountVectorizer(ngram_range=(g, g))
    # Single pass: learn the vocabulary and build the document-term matrix
    # together, so the corpus is tokenized once and may be a one-shot
    # iterable such as a generator.
    bow = vec.fit_transform(corpus)
    # Column sums give the corpus-wide count of every vocabulary entry.
    sum_words = bow.sum(axis=0)
    words_freq = [
        (word, int(sum_words[0, idx])) for word, idx in vec.vocabulary_.items()
    ]
    # Stable sort: n-grams with equal counts keep their vocabulary order.
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return dict(words_freq[:n])
data.add_udf(get_ngrams)

Apply UDF

Download model

text.apply_udf("get_ngrams", 10, 2, md_tag="ngrams")
text.visualize("barchart", md_tag="ngrams")