import Levenshtein
from alignment.sequence import Sequence
from alignment.vocabulary import Vocabulary
from alignment.sequencealigner import SimpleScoring, GlobalSequenceAligner


def to_translation_map(iterable):
    # Map every junk character to None so that str.translate() removes it.
    return {key: None for key in iterable}


def text_to_text_alignment_and_score(text_ref, text_pred):
    """
    Find a word-to-word alignment between two texts, where the first is the
    reference and the second is the predicted text.
    :param text_ref: reference text
    :param text_pred: predicted text
    :return: the alignment and the recall / precision percentages
    """
    text_ref = text_ref.lower()
    text_pred = text_pred.lower()
    iterable = [".", ","]
    # Remove "." and "," (junk characters) from the reference text.
    translation_map = str.maketrans(to_translation_map(iterable))
    text_ref = text_ref.translate(translation_map)
    # Create sequences to be aligned.
    a = Sequence(text_ref.split())
    b = Sequence(text_pred.split())
    # Create a vocabulary and encode the sequences.
    v = Vocabulary()
    a_enc = v.encodeSequence(a)
    b_enc = v.encodeSequence(b)
    # Create a scoring and align the sequences using the global aligner.
    scoring = SimpleScoring(1, 0)
    aligner = GlobalSequenceAligner(scoring, 0)
    f, score, encodeds = aligner.align(a_enc, b_enc, text_ref.split(),
                                       text_pred.split(), backtrace=True)
    # Keep the first alignment, if one exists.
    if len(encodeds[0]) > 0:
        alignment = v.decodeSequenceAlignment(encodeds[0])
        # Fix first and last missing words of the ASR (predicted) text:
        # prepend the leading ASR words that fell outside the alignment and
        # pad the reference side with '-'.
        list_asr = []
        list_pred = []
        for word in text_pred.split():
            if word != alignment.second.elements[0]:
                list_asr.append(word)
                list_pred.append('-')
            else:
                alignment.second.elements = list_asr + alignment.second.elements
                alignment.first.elements = list_pred + alignment.first.elements
                break
        # Same for the trailing ASR words that fell outside the alignment.
        list_asr = []
        list_pred = []
        for word in reversed(text_pred.split()):
            if word != alignment.second.elements[-1]:
                list_asr = [word] + list_asr
                list_pred.append('-')
            else:
                alignment.second.elements = alignment.second.elements + list_asr
                alignment.first.elements = alignment.first.elements + list_pred
                break
        # Fix first and last missing words of the reference text in the same way.
        list_asr = []
        list_pred = []
        for word in text_ref.split():
            if word != alignment.first.elements[0]:
                list_pred.append(word)
                list_asr.append('-')
            else:
                alignment.second.elements = list_asr + alignment.second.elements
                alignment.first.elements = list_pred + alignment.first.elements
                break
        list_asr = []
        list_pred = []
        for word in reversed(text_ref.split()):
            if word != alignment.first.elements[-1]:
                list_pred = [word] + list_pred
                list_asr.append('-')
            else:
                alignment.second.elements = alignment.second.elements + list_asr
                alignment.first.elements = alignment.first.elements + list_pred
                break
        rec = alignment.score * 100 / len(text_ref.split())
        pre = alignment.score * 100 / len(text_pred.split())
    else:
        alignment = []
        rec, pre = 0, 0
    return alignment, rec, pre
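
# Illustrative usage of text_to_text_alignment_and_score (a sketch; the two
# texts below are made-up samples, not project data):
#     alignment, rec, pre = text_to_text_alignment_and_score(
#         "The quick brown fox jumps.", "the quick fox jumps")
#     # rec and pre are percentages over the reference / predicted word counts.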


def adjust_asr_results(asr_results, second, dur):
    # Attach a timestamp to every word of the aligned ASR sequence: recognized
    # words keep their original timing, gap words get an interpolated one.
    adjusted_results = []
    i = 0
    max_i = len(asr_results)
    for j in range(0, len(second)):
        if i == max_i:
            # All recognized words are consumed; extrapolate towards the end.
            if adjusted_results[j - 1]['word'] == '-':
                adjusted_results.append(adjusted_results[j - 1])
            else:
                k = adjusted_results[j - 1]['et']
                mean = (k + dur) / 2
                adjusted_results.append({"word": second[j], "st": mean,
                                         "et": mean})
        else:
            if asr_results[i]['word'].lower() == second[j]:
                adjusted_results.append(asr_results[i])
                i += 1
            else:
                if j == 0:
                    l = asr_results[i]['st']
                    mean = l / 2
                    adjusted_results.append({"word": second[j], "st": mean,
                                             "et": mean})
                elif adjusted_results[j - 1]['word'] == '-':
                    adjusted_results.append(adjusted_results[j - 1])
                else:
                    k = adjusted_results[j - 1]['et']
                    l = asr_results[i]['st']
                    mean = (k + l) / 2
                    adjusted_results.append({"word": second[j], "st": mean,
                                             "et": mean})
    return adjusted_results


def calculate_score_after_alignment(A, B):
    # Score two aligned word sequences ('-' marks a gap): exact matches count
    # 1.0, mismatched word pairs count their Levenshtein similarity ratio.
    total_score = 0.0
    k = len(A)
    ref_words = 0
    asr_words = 0
    for j in range(0, k):
        if A[j] != '-':
            ref_words += 1
        if B[j] != '-':
            asr_words += 1
        if A[j] == B[j]:
            total_score += 1.0
        elif B[j] != '-' and A[j] != '-':
            total_score += Levenshtein.ratio(A[j], B[j])
    if asr_words == 0 or ref_words == 0:
        rec = 0.0
        pre = 0.0
    else:
        rec = total_score * 100 / ref_words
        pre = total_score * 100 / asr_words
    return rec, pre


def windows(first, second, adjusted_results, length, step, dur):
    # Slide a window of +/- `length` seconds over the recording in `step`-second
    # increments and score the words whose start time falls inside each window.
    if step == 0:
        raise ValueError("Parameter 'step' can't be 0")
    i = length  # center of the first window
    recall_list = []
    precision_list = []
    f1_list = []
    ref_text = []
    asr_text = []
    while (i + length) < dur:
        list_a = []
        list_b = []
        for j in range(0, len(second)):
            bottom = i - length
            up = i + length
            if bottom <= adjusted_results[j]['st'] <= up:
                list_a.append(first[j])
                list_b.append(second[j])
        rec, pre = calculate_score_after_alignment(list_a, list_b)
        if rec == 0.0 or pre == 0.0:
            f1 = 0.0
        else:
            f1 = 2 * rec * pre / (rec + pre)
        recall_list.append({"x": i, "y": rec})
        precision_list.append({"x": i, "y": pre})
        ref_text.append(list_a)
        asr_text.append(list_b)
        f1_list.append({"x": i, "y": f1})
        i = i + step
    return recall_list, precision_list, f1_list, ref_text, asr_text
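

if __name__ == "__main__":
    # Minimal, self-contained sketch of the windowed scoring step; the aligned
    # word sequences, ASR timings, and duration below are made-up sample values.
    first = ["the", "quick", "brown", "fox", "-"]    # reference side of an alignment
    second = ["the", "quick", "-", "fox", "jumps"]   # ASR side ('-' marks a gap)
    dur = 5.0                                        # assumed audio duration in seconds
    asr_results = [{"word": "the", "st": 0.5, "et": 0.8},
                   {"word": "quick", "st": 0.9, "et": 1.3},
                   {"word": "fox", "st": 2.0, "et": 2.4},
                   {"word": "jumps", "st": 2.5, "et": 3.0}]
    adjusted = adjust_asr_results(asr_results, second, dur)
    recall_list, precision_list, f1_list, ref_text, asr_text = windows(
        first, second, adjusted, length=1, step=1, dur=dur)
    print(f1_list)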