forked from Soinull/assimilate
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathassimilate-train.py
59 lines (46 loc) · 2.22 KB
/
assimilate-train.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
#
# Assimilate-Train.py
# Copyright 2017 Tim Crothers
# Credit for the excellent BroLogReader code goes to Mike Sconzo - https://github.com/ClickSecurity/data_hacking/blob/master/browser_fingerprinting/bro_log_reader.py
#
import io
import numpy
from sklearn.externals import joblib
from pandas import DataFrame
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from optparse import OptionParser
from assimilate_utils import BroLogReader
__version__ = '1.0'


def parse_options(argv=None):
    """Parse and validate the command-line options for assimilate-train.

    Args:
        argv: Argument strings to parse. ``None`` (the default) lets
            optparse fall back to ``sys.argv[1:]``.

    Returns:
        The optparse values object carrying the ``normaldata``,
        ``maliciousdata``, ``bayesianfile`` and ``vectorizerfile`` options.

    Raises:
        SystemExit: raised through ``parser.error`` when either required
            data directory option is missing.
    """
    usage = """assimilate-train [options]"""
    parser = OptionParser(usage=usage, version=__version__)
    parser.add_option("-n", "--normaldata", action="store", type="string",
                      default=None,
                      help="A directory of normal http header logs (required)")
    parser.add_option("-m", "--maliciousdata", action="store", type="string",
                      default=None,
                      help="A directory of malicious http header logs (required)")
    parser.add_option("-b", "--bayesianfile", action="store", type="string",
                      default='./nb.pkl',
                      help="the location to store the bayesian classifier")
    parser.add_option("-x", "--vectorizerfile", action="store", type="string",
                      default='./vectorizers.pkl',
                      help="the location to store the vectorizer")
    opts, _args = parser.parse_args(argv)

    # optparse has no notion of a required option, so both are checked by hand.
    if opts.normaldata is None:
        parser.error('Normal data directory needed')
    if opts.maliciousdata is None:
        parser.error('Malicious data directory needed')
    return opts


def load_training_data(reader, normal_dir, malicious_dir):
    """Read both log directories and stack them into one labelled DataFrame.

    Args:
        reader: Object exposing ``dataFrameFromDirectory(path, label)``; the
            script passes a ``BroLogReader``. Presumably each call returns a
            DataFrame with 'header' and 'class' columns -- confirm against
            assimilate_utils.
        normal_dir: Directory passed to the reader with the label 'good'.
        malicious_dir: Directory passed to the reader with the label 'bad'.

    Returns:
        A DataFrame holding the normal rows followed by the malicious rows.
    """
    # DataFrame.append was removed in pandas 2.0; concat is its replacement.
    from pandas import concat

    # The empty seed frame guarantees the 'header' and 'class' columns exist
    # in the result even if the reader hands back a frame without them.
    frames = [DataFrame({'header': [], 'class': []})]
    print('Reading normal data...')
    frames.append(reader.dataFrameFromDirectory(normal_dir, 'good'))
    print('Reading malicious data...')
    frames.append(reader.dataFrameFromDirectory(malicious_dir, 'bad'))
    return concat(frames)


def main(argv=None):
    """Train the header classifier and write both models to disk.

    Fits a CountVectorizer on the 'header' column, fits a MultinomialNB on
    the resulting counts against the 'class' column, then dumps the
    vectorizer and the classifier to the paths given on the command line.

    Args:
        argv: Argument strings forwarded to ``parse_options``; ``None`` means
            the process arguments are used.
    """
    opts = parse_options(argv)
    data = load_training_data(BroLogReader(), opts.normaldata,
                              opts.maliciousdata)

    print('Vectorizing data...')
    vectorizer = CountVectorizer()
    counts = vectorizer.fit_transform(data['header'].values)

    classifier = MultinomialNB()
    targets = data['class'].values
    classifier.fit(counts, targets)

    print('Writing out models...')
    # NOTE(review): joblib comes from the file-level `sklearn.externals`
    # import, which recent scikit-learn releases no longer ship -- confirm
    # the pinned scikit-learn version or switch to the standalone package.
    joblib.dump(vectorizer, opts.vectorizerfile)
    joblib.dump(classifier, opts.bayesianfile)
    print('Done!')


if __name__ == "__main__":
    main()