Skip to content

SPAM Detection VITAL Workflow

Sajjadur Rahman edited this page Dec 18, 2020 · 2 revisions

Getting References

# Build the VTA object for "spam.csv", then rename the columns "v1" and "v2"
# to the descriptive names "category" and "text" used by the later steps.
# NOTE(review): assumes rename(old, new) updates `data` in place, since the
# return value is not assigned — confirm against the VTA API.
data = VTA("spam.csv")
data.rename("v1", "category")
data.rename("v2", "text")

Display the distribution of spam vs. ham (non-spam) messages

# Fetch the "category" column, run a count aggregation tagged "spam_counts",
# and draw a bar chart from the result carrying that same tag.
# NOTE(review): presumably count() stores per-label counts under the metadata
# tag that visualize(md_tag=...) reads back — confirm against the VTA API.
col = data.get_column("category")
col.aggregate().count("spam_counts")
col.visualize("barchart", md_tag="spam_counts")

Clean text

# Fetch the "text" column and apply the cleaning steps in order: strip HTML,
# remove URLs, remove square brackets, remove stopwords.
# NOTE(review): the results are not assigned, so each project() step
# presumably modifies the column in place — confirm against the VTA API.
text = data.get_column("text")
text.project().strip_html()
text.project().remove_urls()
text.project().remove_square_brackets()
text.project().remove_stopwords()

Display word counts

# Derive a word-count value from the "text" column and plot it together with
# "category" as a histogram.
# NOTE(review): presumably num_words() returns a new per-row column (or its
# name) that visualize() accepts alongside a column name — confirm.
new_col = text.mutate().num_words()
data.visualize([new_col, "category"], "histogram")

Now replace ham/spam with 0/1

# Encode the labels in "category" numerically: "ham" -> 0, "spam" -> 1.
# NOTE(review): assumes replace(column, old, new) updates `data` in place —
# confirm against the VTA API.
data.replace("category", "ham", 0)
data.replace("category", "spam", 1)

Create and add a UDF (user-defined function)

def get_ngrams(corpus, n, g):
    """Return the ``n`` most frequent ``g``-grams in ``corpus`` with their counts.

    Args:
        corpus: The documents handed to ``CountVectorizer`` for fitting and
            transforming (presumably an iterable of strings; confirm
            against the caller).
        n: Maximum number of n-grams to keep.
        g: Size of the n-grams; used as both bounds of ``ngram_range``.

    Returns:
        A dict mapping each kept n-gram to its total count across the
        corpus, inserted in descending order of count.
    """
    vectorizer = CountVectorizer(ngram_range=(g, g))
    vectorizer.fit(corpus)

    # Column sums of the document-term matrix: one total per vocabulary entry.
    totals = vectorizer.transform(corpus).sum(axis=0)

    counts = []
    for term, column in vectorizer.vocabulary_.items():
        counts.append((term, int(totals[0, column])))

    # Stable sort, highest count first; ties keep vocabulary order.
    counts.sort(key=lambda pair: pair[1], reverse=True)
    return dict(counts[:n])
# Hand get_ngrams to `data` as a user-defined function.
# NOTE(review): presumably this registers it under its function name so that
# apply_udf("get_ngrams", ...) can look it up — confirm against the VTA API.
data.add_udf(get_ngrams)

Apply UDF

# Run the "get_ngrams" UDF on the text column, tag the result "ngrams", and
# draw a bar chart from the result carrying that tag.
# NOTE(review): presumably the column contents are passed as `corpus` and the
# positional arguments map to n=10, g=2 (the ten most frequent bigrams) —
# confirm how apply_udf forwards its arguments.
text.apply_udf("get_ngrams", 10, 2, md_tag="ngrams")
text.visualize("barchart", md_tag="ngrams")

Load and apply model

Download model

# Obtain the model stored as "sms_spam.h5" through `data` and run prediction
# on the "text" column.
# NOTE(review): the return value of predict() is discarded here — confirm
# whether VTA displays or stores the predictions itself.
model = data.get_model("sms_spam.h5")
model.predict("text")