-
Notifications
You must be signed in to change notification settings - Fork 1
SPAM Detection VITAL Workflow
Sajjadur Rahman edited this page Dec 18, 2020
·
2 revisions
# Build the workflow's dataset object from "spam.csv".
# NOTE(review): VTA is not defined in this snippet — presumably the VITAL
# entry point for loading a text dataset; confirm against the library.
data = VTA("spam.csv")
# Give the raw "v1"/"v2" fields the descriptive names used by every later step.
data.rename("v1", "category")
data.rename("v2", "text")
# Aggregate the "category" column with count(), tagging the result
# "spam_counts"; the bar chart below refers to it through the same md_tag.
col = data.get_column("category")
col.aggregate().count("spam_counts")
col.visualize("barchart", md_tag="spam_counts")
# Text clean-up steps applied to the "text" column, in this order.
# NOTE(review): whether project() modifies the column in place or returns a
# new view is not visible here — verify before reordering or chaining.
text = data.get_column("text")
text.project().strip_html()
text.project().remove_urls()
text.project().remove_square_brackets()
text.project().remove_stopwords()
# Derive a word-count value from the text and plot it together with
# "category" as a histogram.
new_col = text.mutate().num_words()
data.visualize([new_col, "category"], "histogram")
# Replace the string labels in "category" with integers: "ham" -> 0, "spam" -> 1.
data.replace("category", "ham", 0)
data.replace("category", "spam", 1)
def get_ngrams(corpus, n, g):
    """Return the ``n`` highest-count ``g``-grams found in ``corpus``.

    A ``CountVectorizer`` restricted to n-grams of exactly length ``g`` is
    fitted on ``corpus``; the per-term counts are summed over all documents
    and the ``n`` largest totals are returned.

    NOTE(review): ``CountVectorizer`` is not imported in this snippet —
    presumably scikit-learn's; confirm it is in scope where the UDF runs.

    Args:
        corpus: Collection of text documents passed to the vectorizer.
        n: Maximum number of entries in the returned mapping.
        g: N-gram length, used as both bounds of ``ngram_range``.

    Returns:
        dict mapping each selected term to its total count (``int``),
        inserted in descending order of count.
    """
    vectorizer = CountVectorizer(ngram_range=(g, g))
    vectorizer.fit(corpus)

    # Column-wise sum collapses the document axis: one total per vocabulary term.
    totals = vectorizer.transform(corpus).sum(axis=0)

    counts = []
    for term, column in vectorizer.vocabulary_.items():
        counts.append((term, int(totals[0, column])))

    # Stable descending sort by count; ties keep vocabulary iteration order.
    counts.sort(key=lambda pair: pair[1], reverse=True)
    return dict(counts[:n])
# Register get_ngrams as a user-defined function on the dataset so it can be
# invoked by name below.
data.add_udf(get_ngrams)
# Run the UDF on the "text" column. Given the signature get_ngrams(corpus, n, g),
# the arguments 10 and 2 presumably bind to n=10 and g=2 (top 10 bigrams) with
# the column contents as corpus — TODO confirm how apply_udf passes arguments.
text.apply_udf("get_ngrams", 10, 2, md_tag="ngrams")
# Bar chart of the result tagged "ngrams" above.
text.visualize("barchart", md_tag="ngrams")
# Obtain a model from "sms_spam.h5" and call predict on "text".
# NOTE(review): the .h5 extension suggests a saved Keras/HDF5 model, and
# predict presumably scores the "text" column — neither is visible here.
model = data.get_model("sms_spam.h5")
model.predict("text")