-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathutils.py
270 lines (230 loc) · 9.28 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
# -*- coding: utf-8 -*-
"""
Completed by team:
Goal: Build a spell-check system using bigrams.
This Python file holds all the functions needed by the GUI, including the core functions.
@author: team
"""
# Importing needed only library
import string
import json, collections
import spacy
import numpy as np
from collections import Counter
import re
# Get the configuration files
from config import dic_path, bigram_freq_path, bigrams_text_path
# Load the spaCy English pipeline once; every function below shares it.
nlp = spacy.load("en")

# Read the frequency tables up front. Using ``with`` closes each file handle
# as soon as its JSON has been parsed instead of leaving it open.
with open(dic_path) as dic_file:
    dictionary_freq = json.load(dic_file)
with open(bigram_freq_path) as bigram_freq_file:
    bigram_freq = json.load(bigram_freq_file)
with open(bigrams_text_path) as bigrams_text_file:
    bigrams_text = json.load(bigrams_text_file)

# dicWords maps every dictionary word to a list whose first element is the
# word's frequency taken from dictionary_freq.
# NOTE(review): the default factory is ``int`` although the stored values are
# lists, so ``dicWords[missing_key]`` would insert 0 rather than a list.
# Confirm whether a plain dict or defaultdict(list) was intended.
dicWords = collections.defaultdict(int)
for k, v in dictionary_freq.items():
    dicWords.setdefault(k, []).append(v)
def isWord(word):
    """Report whether ``word`` should be treated as a known token.

    A token is accepted when it is a key of ``dicWords`` exactly as given,
    when its lower-cased form is a key, or when it occurs inside the string
    made of ``string.punctuation`` followed by a single space (this covers
    every single punctuation character and a lone space).

    Returns True when the token is accepted and False otherwise.
    """
    if word in dicWords or word.lower() in dicWords:
        return True
    # Punctuation and a blank are never flagged as misspellings.
    return word in string.punctuation + ' '
def add2Dict(word):
    """Record ``word`` in the shared ``dicWords`` table.

    A word that is not present yet gets the entry ``[1]``; for a word that is
    already present a ``1`` is appended to its existing list.
    """
    entries = dicWords.setdefault(word, [])
    entries.append(1)
def get_dic():
    """Return the module-level ``dicWords`` mapping itself (not a copy)."""
    return dicWords
"""
Calculate the edit distance from string 1 to string 2, e.g. from 'from' to 'for'.
Edit distance operations: insertion, deletion and substitution, each costing 1.
The minimum edit distance needed to convert 'from' to 'for' is 2:
From 'from'
count 1: deleted 'r' -> 'fom'
count 2: substituted 'm' with 'r' -> 'for'
To 'for'
@author: team
"""
def edit_distance(str1, str2):
    """Return the minimum number of single-character edits turning str1 into str2.

    Insertions, deletions and substitutions each cost 1, so for example
    ``edit_distance("from", "for")`` is 2.

    The table is filled row by row; only the previous row is kept because
    each cell depends solely on its left, upper and upper-left neighbours.
    """
    # Row for the empty prefix of str1: reaching str2[:j] takes j insertions.
    previous_row = list(range(len(str2) + 1))
    for row_number, char1 in enumerate(str1, start=1):
        # First column: reaching the empty string takes row_number deletions.
        current_row = [row_number]
        for column, char2 in enumerate(str2, start=1):
            if char1 == char2:
                # Matching characters cost nothing extra.
                current_row.append(previous_row[column - 1])
            else:
                current_row.append(1 + min(current_row[column - 1],   # insertion
                                           previous_row[column],      # deletion
                                           previous_row[column - 1])) # substitution
        previous_row = current_row
    return previous_row[-1]
class UniSuggestion:
    """A single-word correction candidate.

    The constructor stores its arguments unchanged as attributes:
    ``value`` (the candidate word), ``ed`` (edit distance, default 1) and
    ``freq`` (frequency, default 1).
    """

    def __init__(self, value, ed=1, freq=1):
        self.value, self.ed, self.freq = value, ed, freq
class BiSuggestion:
    """Placeholder type for a bigram suggestion; it carries no state."""

    def __init__(self):
        """Create an instance without setting any attributes."""
def get_correct_word(w='thiis', ed=4):
    """Collect every dictionary word within ``ed`` edits of ``w``.

    Each key of ``dicWords`` is compared against ``w`` with
    ``edit_distance``; keys whose distance is at most ``ed`` are returned as
    ``UniSuggestion`` objects carrying the key, its distance and the first
    element of its ``dicWords`` entry as the frequency.

    Returns a list of ``UniSuggestion`` in dictionary iteration order.
    """
    candidates = []
    for entry, counts in dicWords.items():
        distance = edit_distance(w, entry)
        frequency = counts[0]
        if distance > ed:
            continue
        candidates.append(UniSuggestion(entry, distance, frequency))
    return candidates
def check_real_word_occurance(sent, boolSent, index, val):
    """Decide whether the bigram at ``index`` should be reported as correct.

    Consecutive bigrams overlap by one word, so two failing neighbours that
    share that word are counted once: the later bigram is cleared so that
    only one of them stays flagged.

    Args:
        sent: sequence of bigrams, each indexable as (first_word, second_word).
        boolSent: list of flags parallel to ``sent``; a flag equal to False
            marks a bigram that failed the check. Updated in place when the
            bigram at ``index`` is cleared.
        index: position of the bigram being examined.
        val: flag of the bigram at ``index``; a truthy value returns True
            immediately.

    Returns:
        True when ``val`` is truthy, or when both this bigram and the previous
        one are flagged False and this bigram's first word equals the previous
        bigram's second word (``boolSent[index]`` is then set to True).
        False for the first bigram, when the previous flag is True and the
        current one is False, or when both are False but the words differ.
        None when none of these rules applies.
    """
    if val:
        return True

    # The first bigram has no predecessor. Test the position rather than the
    # bigram's value: comparing ``sent[index] == sent[0]`` also matched any
    # later bigram that merely repeats the first one and skipped its check.
    if index == 0:
        return False

    previous_flag = boolSent[index - 1]
    current_flag = boolSent[index]
    # True when the two bigrams overlap on the same word.
    shares_word = sent[index][0] == sent[index - 1][1]

    # Explicit comparisons with True/False are kept so that flags which are
    # neither (e.g. None) fall through to the final ``return None``.
    if previous_flag == False and current_flag == False:
        if shares_word:
            boolSent[index] = True
            return True
        return False
    if previous_flag == True and current_flag == False:
        return False
    return None
def get_ordered_correct_word(w='thiis', ed=4, limit=5):
    """Group the suggestions for ``w`` by edit distance and rank each group.

    ``ed`` is the largest edit distance considered and ``limit`` the largest
    number of suggestions kept per distance.

    Returns a list with ``ed`` entries; entry ``k`` holds the suggestions at
    distance ``k + 1``, ordered by descending frequency and cut to ``limit``
    items. Candidates at distance 0 do not appear in the result.
    """
    # One bucket per distance 1..ed.
    buckets = [[] for _ in range(ed)]
    for suggestion in get_correct_word(w=w, ed=ed):
        if suggestion.ed > 0:
            buckets[suggestion.ed - 1].append(suggestion)
    return [
        sorted(bucket, key=lambda candidate: candidate.freq, reverse=True)[:limit]
        for bucket in buckets
    ]
"""
Detecting Errors: Bigram probability suggestion
Approximates the probability of a word given all the previous words by conditioning only on the word immediately before it:
P(the|that) = P(that the) / P(that)
Main operation of this class is to:
1. Calculate bigram score
2. Check if real word is correct
@author: team
"""
class ErrorDetection:
    """Bigram-based detection of real-word errors in a piece of text.

    Construct with the raw text, call ``getBigram()`` to replace
    ``self.bigram`` with the list of (word, next_word) pairs, then call
    ``check_realword_error()`` which replaces ``self.text`` with one boolean
    per bigram (False marks a suspected real-word error).
    """

    # A bigram is discarded when either of its words is one of these tokens.
    _PUNCTUATION = ("?", ":", ",", ";", ".", "!")

    def __init__(self, sentence):
        # Both attributes start as the raw input; getBigram() later overwrites
        # ``bigram`` and check_realword_error() later overwrites ``text``.
        self.bigram = sentence
        self.text = sentence

    def getBigram(self):
        """Split ``self.text`` into lower-cased bigrams and store them in ``self.bigram``.

        The text is tokenised, re-joined with single spaces, split into
        sentences, and every pair of adjacent space-separated words within a
        sentence becomes a tuple. Pairs containing a punctuation token from
        ``_PUNCTUATION`` are dropped.
        """
        complete_list = " ".join(str(i) for i in nlp(self.text))
        doc = nlp(complete_list)
        sentences = [sent.string.strip().lower() for sent in doc.sents]
        bigrams = []
        for sentence in sentences:
            words = sentence.split(" ")
            bigrams.extend(zip(words[:-1], words[1:]))
        self.bigram = [
            pair for pair in bigrams
            if not any(mark in pair for mark in self._PUNCTUATION)
        ]

    @staticmethod
    def get_bigram_score(bigram):
        """Return the score of ``bigram`` ("word1 word2"), or 0 if it cannot be computed.

        A bigram or first word missing from the frequency tables, a zero
        frequency for the first word, or a non-string argument all yield 0.
        """
        try:
            return ErrorDetection.calculateScore(bigram)
        except (KeyError, ZeroDivisionError, TypeError):
            return 0

    @staticmethod
    def calculateScore(bigram):
        """Return freq("word1;word2") / freq("word1") for ``bigram`` ("word1 word2").

        Raises KeyError when the bigram or its first word is not present in
        ``bigram_freq`` / ``dictionary_freq``.
        """
        # The frequency table keys bigrams as "word1;word2".
        key = re.sub(' ', ';', bigram)
        freq_bigram = bigram_freq[key]
        freq_first_string = dictionary_freq[key.split(";")[0]]
        return freq_bigram / freq_first_string

    def check_realword_error(self):
        """Flag each bigram in ``self.bigram`` and store the flags in ``self.text``.

        A bigram whose score is 0 is flagged False (real-word error).
        Otherwise its score is compared with the score of its lemmatised
        form: the bigram is flagged True when the lemmatised score is 0,
        equals the original score, is lower than it, or when the original
        score exceeds 1% of the lemmatised score; False otherwise.
        """
        flags = []
        for note in self.bigram:
            content = note[0] + ' ' + note[1]
            content_score = ErrorDetection.get_bigram_score(content)
            if content_score == 0:
                # Unseen bigram: treat as a real-word error.
                flags.append(False)
                continue
            # Lemmatise only when the comparison below actually needs it.
            stemmed_word = " ".join(token.lemma_.strip() for token in nlp(content))
            stemmed_score = ErrorDetection.get_bigram_score(stemmed_word)
            flags.append(
                stemmed_score == 0
                or stemmed_score == content_score
                or content_score > stemmed_score
                or content_score > stemmed_score * 0.01
            )
        self.text = flags
def suggest_word(word1, word2, selection):
    """Suggest replacement bigrams for the pair (``word1``, ``word2``).

    The known bigrams are read from ``bigrams_text_path`` on every call.

    Args:
        word1: first word of the bigram under inspection.
        word2: second word of the bigram under inspection.
        selection: 1 to replace ``word1`` (candidates are known bigrams whose
            second word equals ``word2``); any other value to replace
            ``word2`` (candidates are known bigrams whose first word equals
            ``word1``).

    Returns:
        A dict mapping each candidate "first second" string to the edit
        distance between the replaced word and its candidate. Only candidates
        with a distance below 10 are included, in the order they first appear
        in the bigram file.
    """
    # ``with`` releases the file handle once the list has been parsed.
    with open(bigrams_text_path) as bigrams_file:
        known_bigrams = json.load(bigrams_file)

    replace_first = selection == 1
    score_list = {}
    for pair in known_bigrams:
        if replace_first:
            if pair[1] != word2:
                continue
            distance = edit_distance(word1, pair[0])
        else:
            if pair[0] != word1:
                continue
            distance = edit_distance(word2, pair[1])
        # Repeated bigrams map to the same key and value, so no separate
        # de-duplication pass is needed.
        if distance < 10:
            score_list[pair[0] + " " + pair[1]] = distance
    return score_list
# Manual smoke test: print every suggestion found for a misspelled word.
if __name__ == '__main__':
    for bucket in get_ordered_correct_word('Thiis'):
        for suggestion in bucket:
            print(suggestion.ed)
            print(suggestion.value)
            print(suggestion.freq)
            print('----------------------------------------')