-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathutil.py
51 lines (42 loc) · 1.73 KB
/
util.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
# Utility/reusable functions:
# loadCanonDict: Loads up the canonical dictionary from the db
# In: Nothing
# Out: A python dict where the key is the word and the value is the unique ID for that word
from newsbot.models import *
import numpy as np
def loadCanonDict():
canonDict = DictEntry.objects.all()
dictSize = canonDict.count() + 1
cDict = {}
for cw in canonDict:
cDict[cw.canonWord] = cw.pk
return cDict
#buildExampleRow: Take in some text, spit out a row vector ready for stacking or analysis
#In: body text in canonical-word form and the canonical dictionary we're using
#Out: a numpy array with 1s and 0s set up properly
def buildExampleRow(body_text, cDict):
dictSize = len(cDict.keys())
one_ex_vector = np.zeros(dictSize+2)
cwords = body_text.split()
for word in cwords:
if(word in cDict.keys()):
one_ex_vector[cDict[word]-1] = 1
else:
print("This word doesn't exist in the dict:" + word)
return(one_ex_vector)
#processExamples: Using examples from the DB, process them into a training set
#in: The examples as a Django queryset, the canonical dictionary
#out: A tuple with Y_vector and Examples_matrix
def processExamples(qs_Examples, cDict):
Y_vector = np.zeros(qs_Examples.count(), dtype=np.int8)
Y_vec_count = 0
examplesMatrix = None
for ex in qs_Examples:
Y_vector[Y_vec_count] = int(ex.quality_class)
Y_vec_count = Y_vec_count + 1
if(examplesMatrix is None):
examplesMatrix = buildExampleRow(ex.body_text, cDict)
else:
examplesMatrix = np.vstack([examplesMatrix, buildExampleRow(ex.body_text, cDict)])
print('.', end='', flush=True)
return( (Y_vector, examplesMatrix))