-
Notifications
You must be signed in to change notification settings - Fork 10
/
Copy path4. bags.py
41 lines (30 loc) · 1.57 KB
/
4. bags.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
from spam_data import training_spam_docs, training_doc_tokens, training_labels, test_labels, test_spam_docs, training_docs, test_docs
from sklearn.naive_bayes import MultinomialNB
def create_features_dictionary(document_tokens):
    """Build a token -> index lookup for a bag-of-words vocabulary.

    Each distinct token receives the next unused integer index, starting at 0,
    in order of first appearance. Repeated tokens keep the index they were
    given the first time.

    Args:
        document_tokens: Iterable of hashable tokens.

    Returns:
        A dict mapping every distinct token to its index. Its length equals
        the number of distinct tokens, so the indices are exactly
        ``0 .. len(result) - 1``.
    """
    features_dictionary = {}
    for token in document_tokens:
        # The default is evaluated before insertion, so a new token gets the
        # current size as its index; an already-known token is left untouched.
        features_dictionary.setdefault(token, len(features_dictionary))
    return features_dictionary
def tokens_to_bow_vector(document_tokens, features_dictionary):
    """Turn a document's tokens into a bag-of-words count vector.

    Args:
        document_tokens: Iterable of tokens belonging to one document.
        features_dictionary: Mapping of token -> position in the vector.

    Returns:
        A list of ints with one slot per entry of ``features_dictionary``;
        each slot holds how many times the corresponding token occurred in
        ``document_tokens``.
    """
    bow_vector = [0] * len(features_dictionary)
    for token in document_tokens:
        # Tokens outside the vocabulary have no slot and are skipped.
        if token in features_dictionary:
            feature_index = features_dictionary[token]
            bow_vector[feature_index] += 1
    return bow_vector
# Vocabulary: one index per distinct token found in the training tokens.
bow_sms_dictionary = create_features_dictionary(training_doc_tokens)

# One bag-of-words count vector per training document.
# NOTE(review): assumes each entry of training_spam_docs is a sequence of
# tokens (it is iterated token by token) -- confirm against spam_data.
training_vectors = [
    tokens_to_bow_vector(doc_tokens, bow_sms_dictionary)
    for doc_tokens in training_spam_docs
]

# Test documents are vectorized against the same training vocabulary, so
# tokens that never appeared in training are ignored.
test_vectors = [
    tokens_to_bow_vector(doc_tokens, bow_sms_dictionary)
    for doc_tokens in test_spam_docs
]

# Classifier instance with default settings.
spam_classifier = MultinomialNB()
def spam_or_not(label):
    """Return a human-readable name for a label.

    Args:
        label: Any value; only its truthiness is inspected.

    Returns:
        ``"spam"`` when ``label`` is truthy, otherwise ``"not spam"``.
    """
    return "spam" if label else "not spam"
# Train the classifier on the bag-of-words vectors of the training documents.
spam_classifier.fit(training_vectors, training_labels)

# NOTE: despite its name, `predictions` holds the value returned by score(),
# i.e. the classifier's mean accuracy on the test set, not per-document output.
predictions = spam_classifier.score(test_vectors, test_labels)

# Ask the model for its verdict on the two sample messages so the report shows
# what the classifier actually predicted rather than the ground-truth labels.
# NOTE(review): assumes predicted labels are truthy for spam, the same
# convention spam_or_not() already relies on for test_labels -- confirm.
first_prediction, second_prediction = spam_classifier.predict(
    [test_vectors[0], test_vectors[10]]
)

print(
    "The predictions for the test data were {0}% accurate.\n\n"
    "For example, '{1}' was classified as {2}.\n\n"
    "Meanwhile, '{3}' was classified as {4}.".format(
        predictions * 100,
        test_docs[0],
        spam_or_not(first_prediction),
        test_docs[10],
        spam_or_not(second_prediction),
    )
)