-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
169 lines (140 loc) · 4.26 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
import nltk
import json
import pandas as pd
import os
from plot_pca import plotpca
from string import punctuation
from nltk import word_tokenize
from nltk.probability import FreqDist
def main():
    """
    Script entry point.

    Only announces start-up; the individual pipeline steps are kept
    below as comments and are meant to be enabled by hand when needed.
    """
    print('Starting..')
    # Disabled pipeline steps (uncomment the ones to run):
    # nltk.download('punkt')
    # generateMatrix('democrats/json/', 'master_merged.json', 'democrats/dem_matrix.csv', 1000)
    # generateMatrix('republicans/json/', 'master_merged.json', 'republicans/repub_matrix.csv', 1000)
    # plotpca("matrix_master.csv")
def fitdict(target_keys, input):
    """
    Project a frequency mapping onto a fixed list of words.

    Returns a new list with one slot per entry of target_keys, in the
    same order: the value stored in input for that word, or 0 when the
    word does not occur in input. Used to line a single document's
    frequencies up with the merged frequency set.
    """
    return [input[word] if word in input else 0 for word in target_keys]
def generateMatrix(directory, source_file, output_file, limit, factor=1000):
    """
    Generates matrix for all .json in folder and writes as a .csv file

    The first column ('words') holds the `limit` highest-valued words of
    the master dictionary; every other column is one .json file from
    `directory`, named after the file without its '.json' suffix and
    holding that file's normalized frequencies for those words (0 when
    the word is missing from the file).

    directory   - e.g. 'democrats/json/' or 'republicans/json/'
                  (a trailing separator is optional)
    source_file - Master merged.json dictionary; a file with the same
                  base name inside `directory` is skipped
    output_file - file name, will overwrite (.csv)
    limit       - use N words from the master dictionary
    factor      - exaggerate values by a factor
    """
    # clear file if it exists, so a failing run does not leave old data behind
    with open(output_file, 'w'):
        pass

    master = getfirstN(readDist(source_file), limit)
    keys = list(master)
    skip_name = os.path.basename(source_file)

    # Collect all columns first and build the frame once; inserting columns
    # one by one into an existing DataFrame fragments it.
    columns = {'words': keys}
    # os.listdir() order is arbitrary; sorting keeps the column order stable.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith(".json") or filename == skip_name:
            continue
        frequencies = readDist(os.path.join(directory, filename))
        normalize(frequencies, factor)
        columns[filename[:-5]] = fitdict(keys, frequencies)

    pd.DataFrame(columns).to_csv(output_file, index=False)
def normalize(dict, exaggerate_factor):
    """
    Rescale the values of dict in place to relative frequencies.

    Each value is divided by the sum of all values and then multiplied
    by exaggerate_factor, which only serves to make the small ratios
    easier to read. Nothing is returned.
    """
    total = sum(dict.values())
    for word in dict:
        dict[word] = dict[word] / total * exaggerate_factor
def sortedKeys(dict):
    """
    Return the keys of dict as a list, ordered the way sortdict()
    orders the entries (by value, highest first).
    """
    return [key for key, _ in sortdict(dict)]
def sortdict(dict):
    """
    Return the (key, value) pairs of dict as a list sorted by value,
    highest value first.
    """
    pairs = list(dict.items())
    pairs.sort(key=lambda pair: pair[1], reverse=True)
    return pairs
def printdict(dict):
    """
    Print every entry of dict on its own line as "key value".
    """
    for entry in dict.items():
        print(*entry)
def getfirstN(dict, N):
    """
    get first N elements of a dict

    Entries are ranked with sortdict() (by value, highest first) and the
    top N are returned as a new dict in that order. When N is larger
    than the number of entries, all of them are returned.

    N <= 0 returns an empty dict. The previous counting loop compared
    the counter to N only after incrementing it, so it never stopped for
    such N and returned every entry instead.
    """
    if N <= 0:
        return {}
    return {key: value for key, value in sortdict(dict)[:N]}
def readDist(dir):
    """
    Load a frequency distribution that was stored as JSON.

    dir is the path of the .json file; the parsed JSON content is
    returned as-is.
    """
    with open(dir) as handle:
        return json.load(handle)
def dist(dir):
    """
    Build a frequency distribution from a text file.

    The file at path dir is read completely, split with word_tokenize,
    and every token is lower-cased before being counted in a FreqDist.
    """
    with open(dir) as source:
        text = source.read()
    tokens = word_tokenize(text)
    return FreqDist(token.lower() for token in tokens)
def writeDist(input, output):
    """
    Convert a text file into a JSON file of word frequencies.

    input  - path of the text file handed to dist()
    output - path of the JSON file to write (overwritten if present)
    """
    # Compute the frequencies first so a failure in dist() leaves the
    # output file untouched.
    frequencies = dist(input)
    with open(output, 'w') as target:
        json.dump(frequencies, target)
def toJSON(directory):
    """
    write all text files in directory to FreqDist JSON

    Every '<name>.txt' in directory is converted with writeDist() into
    '<directory>/json/<name>.json'. The 'json' sub-folder is created
    when it does not exist yet.

    param: directory -> 'democrats/' or 'republicans/'
           (a trailing separator is optional)
    """
    # List first: a wrong directory should still fail here rather than
    # be silently created by makedirs() below.
    filenames = os.listdir(directory)

    json_dir = os.path.join(directory, "json")
    # writeDist() opens its output path directly, so the folder must exist.
    os.makedirs(json_dir, exist_ok=True)

    for filename in filenames:
        if not filename.endswith(".txt"):
            continue
        input_file = os.path.join(directory, filename)
        output_file = os.path.join(json_dir, filename[:-4] + ".json")
        writeDist(input_file, output_file)
def mergeTXT(directory, f):
    """
    merge all .TXT files in directory, then output as f

    directory - folder to scan for files ending in '.txt'
                (a trailing separator is optional)
    f         - path of the merged output file; created or overwritten

    Files are appended in sorted name order so the result is
    reproducible. If f itself lives in directory it is skipped, so the
    output is never merged into itself. Prints how many files were
    merged.
    """
    target = os.path.abspath(f)
    count = 0
    # 'w' truncates any previous result; both handles are closed by the
    # with-blocks even when a read or write fails.
    with open(f, 'w') as output_file:
        for filename in sorted(os.listdir(directory)):
            if not filename.endswith(".txt"):
                continue
            path = os.path.join(directory, filename)
            if os.path.abspath(path) == target:
                continue
            with open(path) as part:
                output_file.write(part.read())
            count += 1
    print("Merged " + str(count) + " files.")
# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()