# entropy.py
import csv
import math

import index    # local module providing index.get_probs (n-gram probabilities and counts)
import shuffle  # local module providing shuffle.shuffle (used by avg_ent's shuffle_mode)
def calculate_bias(filepath, nrange):
    '''
    Create a bias term to add to the entropy estimates of p_to_ent, based on sample size and
    other characteristics of the input strings. Not yet implemented.
    '''
    pass
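# NOTE: calculate_bias is still a stub. As a point of reference only, the sketch below shows
# one standard small-sample correction (Miller-Madow), which adds (K - 1) / (2 * N * ln 2) bits,
# where K is the number of observed outcomes and N the sample size. The helper name and its use
# of only (K, N) are assumptions here, not the correction this module necessarily plans to use.
def _miller_madow_bias(k_observed, n_samples):
    '''Hypothetical helper: Miller-Madow bias term (in bits) for a plug-in entropy estimate.'''
    if n_samples <= 0:
        return 0.0
    return (k_observed - 1) / (2 * n_samples * math.log(2))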
def gather_branchpoints(filepath, nrange, backwards=False, ExpectedProbability=False):
    '''
    Compiles the transition probabilities from index.get_probs into probability distributions
    for branch points (prefixes). For each n in the output dict, these distributions are stored
    under ['beginnings_dict'], while ['total_ngram_count'] stores the total number of ngram
    tokens for that n. (ExpectedProbability is accepted for interface consistency but is not
    used in this function.)
    '''
    idct = index.get_probs(filepath, nrange, backwards=backwards)
    out_dict = {}
    for n, ngram_dict in idct.items():
        # Each value in ngram_dict is a (probability, count) pair; sum the counts for this n.
        total_ngram_count = sum(value[1] for value in ngram_dict.values())
        # Group ngrams by their (n-1)-gram prefix into three parallel lists:
        # [ngrams, probabilities, counts].
        beginnings_dict = {}
        for ngram, prob_count in ngram_dict.items():
            entry = beginnings_dict.setdefault(ngram[:-1], [[], [], []])
            entry[0].append(ngram)
            entry[1].append(prob_count[0])
            entry[2].append(prob_count[1])
        out_dict[n] = {
            'beginnings_dict': beginnings_dict,
            'total_ngram_count': total_ngram_count}
    return out_dict
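# For orientation, the structure returned by gather_branchpoints looks like this
# (the symbols and numbers below are illustrative only, not taken from any real corpus):
#
#   {2: {'beginnings_dict': {('A',): [[('A', 'B'), ('A', 'C')],   # ngrams sharing prefix ('A',)
#                                     [0.75, 0.25],               # transition probabilities
#                                     [3, 1]]},                   # raw counts
#        'total_ngram_count': 4}}
#
# i.e. each (n-1)-gram prefix maps to three parallel lists: the full ngrams, their
# transition probabilities, and their counts.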
def branchpoints(filepath, nrange):
    '''
    Reshapes the output of gather_branchpoints: for each n, maps each branch point (prefix) to a
    dict with its total 'count' and a 'transitions' dict of {final element: probability}.
    '''
    idct = gather_branchpoints(filepath, nrange)
    out_dict = {}
    for n, ndict in idct.items():
        n_out_dict = {}
        beginnings_dict = ndict['beginnings_dict']
        for branchpoint, matrix in beginnings_dict.items():
            n_out_dict[branchpoint] = {'transitions': {}, 'count': sum(matrix[2])}
            for ngram, probability in zip(matrix[0], matrix[1]):
                transition = ngram[-1]
                n_out_dict[branchpoint]['transitions'][transition] = probability
        out_dict[n] = n_out_dict
    return out_dict
def p_to_ent(filepath, nrange, backwards=False, ExpectedProbability=False):
    '''
    Calculates probability distributions for the songs in filepath for the nth-order MMs included
    in nrange, then, using each nth-order probability distribution, calculates the entropy at
    each (n-1)-gram. With ExpectedProbability=True, returns the expected probability (the sum of
    squared transition probabilities) instead of entropy. For hapax legomena, returns an H of 0.
    '''
    idct = gather_branchpoints(filepath, nrange, backwards=backwards,
                               ExpectedProbability=ExpectedProbability)
    out_dict = {}
    for n, n_dict in idct.items():
        beginnings_dict = n_dict['beginnings_dict']
        entropy_dict = {}
        for beginning, (_ngrams, probabilities_list, counts_list) in beginnings_dict.items():
            entropy_terms = []
            for probability in probabilities_list:
                if ExpectedProbability:
                    # Expected probability: each term is p^2.
                    entropy_terms.append(probability * probability)
                else:
                    # Shannon entropy: each term is p * log2(p).
                    entropy_terms.append(probability * math.log(probability, 2))
            if ExpectedProbability:
                entropy_dict[beginning] = (sum(entropy_terms), sum(counts_list))
            else:
                entropy_dict[beginning] = (-1 * sum(entropy_terms), sum(counts_list))
        out_dict[n] = entropy_dict
    return out_dict
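# For a prefix with transition probabilities p_1, ..., p_k, p_to_ent stores
#     H  = -sum_i p_i * log2(p_i)        (ExpectedProbability=False)
#     EP =  sum_i p_i ** 2               (ExpectedProbability=True)
# e.g. two equiprobable continuations (p = 0.5, 0.5) give H = 1.0 bit and EP = 0.5, while a
# single continuation (p = 1.0) gives H = 0.0, which is why hapax legomena come out with an
# entropy of 0.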
def avg_ent(filepath, nrange, shuffle_mode=False, min_count=1, backwards=False, ExpectedProbability=False):
    '''
    For each n (Markov order) in nrange, averages entropy across all n-grams (weighted by how
    often each prefix occurs), estimating the entropy rate of the songs in filepath. Prefixes
    occurring min_count times or fewer are excluded. Results are also written to
    ./output/entropy.csv.
    '''
    if shuffle_mode:
        # Shuffle the corpus first and run the analysis on the shuffled copy.
        shuffle.shuffle(filepath)
        filepath = './output/shuffle.csv'
    ndct = p_to_ent(filepath, nrange, backwards=backwards, ExpectedProbability=ExpectedProbability)
    result = {}
    for n, entropy_dict in ndct.items():
        ls = []
        for entropy, count in entropy_dict.values():
            if count > min_count:
                # Weight each prefix's entropy by its count.
                ls.extend([entropy] * count)
        if len(ls) > 0:
            result[n] = sum(ls) / len(ls)
        # else: result[n] = '-'
    with open("./output/entropy.csv", 'w', newline='') as output_file:
        writer = csv.writer(output_file)
        for n, value in result.items():
            writer.writerow([n, value])
    return result
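# avg_ent is a count-weighted mean: for each order n it computes
#     sum_over_prefixes(count * H(prefix)) / sum_over_prefixes(count)
# over the prefixes whose count exceeds min_count, which is what estimates the entropy rate
# at that order.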
def get_ngram_entropy(filepath, ngram, backwards=False, ExpectedProbability=False):
    '''
    Looks up the (entropy, count) pair for a single ngram (prefix) in filepath. Strings are
    treated as tuples of characters. Returns 'ngram_not_found' if the ngram does not occur.
    '''
    if isinstance(ngram, str):
        ngram = tuple(ngram)
    nrange = [2, len(ngram) + 2]
    entropy_dict = p_to_ent(filepath, nrange, backwards=backwards,
                            ExpectedProbability=ExpectedProbability)
    # An ngram of length k is a branch point of the (k+1)th-order model.
    relevant_dict = entropy_dict[len(ngram) + 1]
    if ngram in relevant_dict:
        result = relevant_dict[ngram]
    else:
        result = 'ngram_not_found'
    return result
def get_ngram_counts(filepath, ngram):
    '''
    Looks up the raw count for a single ngram in filepath. Strings are treated as tuples of
    characters. Returns 'ngram_not_found' if the ngram does not occur.
    '''
    if isinstance(ngram, str):
        ngram = tuple(ngram)
    nrange = [2, len(ngram) + 2]
    probs_counts_dict = index.get_probs(filepath, nrange)
    relevant_dict = probs_counts_dict[len(ngram)]
    if ngram in relevant_dict:
        # Each value is a (probability, count) pair; return just the count.
        result = relevant_dict[ngram][1]
    else:
        result = 'ngram_not_found'
    return result
def batch(filepath, ngram_list, mode):
    '''
    Runs get_ngram_counts (mode='counts') or get_ngram_entropy (mode='entropy') over a list of
    ngrams and returns the results in the same order.
    '''
    out_list = []
    if mode == 'counts':
        for ngram in ngram_list:
            out_list.append(get_ngram_counts(filepath, ngram))
    elif mode == 'entropy':
        for ngram in ngram_list:
            out_list.append(get_ngram_entropy(filepath, ngram))
    return out_list
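# Minimal usage sketch. The corpus path and the example ngrams below are placeholders, not files
# or symbols shipped with this repository; only the call pattern follows the signatures above.
if __name__ == '__main__':
    corpus = './data/songs.csv'                           # hypothetical input file
    print(avg_ent(corpus, [2, 4]))                        # nrange given as in get_ngram_entropy
    print(get_ngram_entropy(corpus, ('A', 'B')))          # entropy after a hypothetical prefix
    print(batch(corpus, [('A', 'B'), ('B', 'A')], mode='counts'))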