-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbi_gram_calculation.py
115 lines (93 loc) · 4.39 KB
/
bi_gram_calculation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
import math
import pandas as pd
# Module-level accumulators. They are shared, mutable state: the functions
# below fill them in place and return the very same list objects.
# Filled by n_gram_list_bi_gram() with the word windows it produces.
n_gram_string_list = []
# Filled by n_gram_probability_calculation_bi_gram() with the matching rows
# of the 2-gram table, one entry per scored bi-gram.
excel_2_gram_probabilities_list = []
# The next four lists are not referenced by the functions visible in this
# part of the file -- presumably used by other n-gram sizes; verify before
# removing.
excel_3_gram_probabilities_list = []
empty_2_gram_list = []
empty_3_gram_list = []
cleaned_input_string_list = []
# Filled by n_gram_probability_calculation_bi_gram() with one probability
# per scored bi-gram.
probability_value_list = []
# Frequency tables, loaded once when the module is imported. The paths are
# absolute and machine-specific, so importing this module fails on any machine
# where these files are missing.
# 1-gram table: queried below through its 'Word_One' and 'Frequency' columns.
df_N_gram_1 = pd.read_csv("/Users/cvkrishnarao/Desktop/RA/n_gram_data_converted/n_gram_coca_x1w_utf_8.csv")
# 2-gram table: queried below through its 'Word_One', 'Word_Two' and
# 'Frequency' columns.
df_N_gram_2 = pd.read_csv("/Users/cvkrishnarao/Desktop/RA/n_gram_data_converted/n_gram_coca_x2w_utf_8.csv")
def cleanupString_bi_gram(uncleanedString):
    """
    Strip punctuation from a text and lower-case its words.

    The characters ``* ( ) - , . : ? !`` are deleted outright, so
    "well-known" becomes "wellknown". Newlines and tabs are treated as word
    separators (like any other whitespace) rather than being deleted, so the
    last word of one line is never glued to the first word of the next.

    :param uncleanedString: raw text that may contain punctuation, line breaks and tabs
    :return: the lower-cased words, each preceded by a single space
             (e.g. " hello world"); an empty string when no word is left
    """
    # One pass over the text instead of a chain of str.replace() calls.
    without_punctuation = uncleanedString.translate(str.maketrans('', '', '*()-,.:?!'))
    # split() without arguments splits on any run of whitespace, which
    # already covers '\n' and '\t' and discards empty pieces.
    words = without_punctuation.split()
    # Each word is prefixed with a space so the result keeps the
    # leading-space format this function has always returned.
    return "".join(" " + word.lower() for word in words)
def n_gram_list_bi_gram(input_Cleaned_string, n_gram_value):
    """
    Split a cleaned text into its consecutive n-word windows.

    The windows are stored in the module-level ``n_gram_string_list``, which
    is emptied first so that each call returns only the n-grams of the text
    it was given, not those left over from earlier calls.

    :param input_Cleaned_string: cleaned text whose words are separated by whitespace
    :param n_gram_value: number of words per window (2 for bi-grams, 3, 4, 5, ...)
    :return: ``n_gram_string_list`` holding one list of ``n_gram_value`` words
             per window, in text order; empty when the text has fewer words
             than ``n_gram_value``
    """
    words = input_Cleaned_string.split()
    # Reset in place (not rebind) so code holding a reference to the
    # module-level list keeps seeing the current result.
    n_gram_string_list.clear()
    for start in range(len(words)):
        window = words[start:start + n_gram_value]
        # Slices near the end of the text come out short; keep only full windows.
        if len(window) == n_gram_value:
            n_gram_string_list.append(window)
    return n_gram_string_list
def _first_frequency_or_one(matching_rows):
    """Return the first "Frequency" value of matching_rows, or 1 when there is no row or the value is missing."""
    # Iterating a Series yields plain Python scalars; None marks "no row".
    frequency = next(iter(matching_rows["Frequency"]), None)
    # pd.isna() is true for None as well as for the NaN pandas uses for
    # empty cells, which an "is None" test would not catch.
    if pd.isna(frequency):
        return 1
    return frequency


def n_gram_probability_calculation_bi_gram(n_gram_value, n_gram):
    """
    Score a list of bi-grams against the 1-gram and 2-gram frequency tables.

    For every bi-gram ``[w1, w2]`` the probability is estimated as
    ``Frequency(w1 w2) / Frequency(w1)``. A frequency that cannot be found
    (no matching row, or a missing value) is taken to be 1.

    Only ``n_gram_value == 2`` is scored. For any other value nothing is
    looked up and the mean is 0.0.

    The module-level ``probability_value_list`` is always cleared, and
    ``excel_2_gram_probabilities_list`` is cleared when ``n_gram_value == 2``;
    both are refilled in place and the same list objects are returned.

    :param n_gram_value: size of the n-grams in ``n_gram``; only 2 is scored here
    :param n_gram: list of n-grams, each a list of words
    :return: tuple ``(excel_2_gram_probabilities_list, mean_values, probability_value_list)``:
             the matching 2-gram table rows per bi-gram, the mean of the
             probabilities (0.0 when ``n_gram`` is empty), and the
             probability of each bi-gram
    """
    sum_frequencies = 0
    count_n_gram = len(n_gram)
    probability_value_list.clear()
    if n_gram_value == 2:
        excel_2_gram_probabilities_list.clear()
        for query_string in n_gram:
            first_word, second_word = query_string[0], query_string[1]
            # Rows of the 1-gram table for the first word (the n-1 gram).
            query_dataframe_1_gram = df_N_gram_1.loc[df_N_gram_1['Word_One'] == first_word]
            # Rows of the 2-gram table for the exact word pair.
            query_dataframe_2_gram = df_N_gram_2.loc[
                (df_N_gram_2['Word_One'] == first_word) & (df_N_gram_2['Word_Two'] == second_word)]
            values_frequency_one_gram = _first_frequency_or_one(query_dataframe_1_gram)
            values_frequency_two_gram = _first_frequency_or_one(query_dataframe_2_gram)
            # The ratio is used directly: exp(log(x)) only reproduces x and
            # raises ValueError when x is 0.
            probability_value = values_frequency_two_gram / values_frequency_one_gram
            probability_value_list.append(probability_value)
            # Plain running total, so that the value below is a true mean.
            sum_frequencies += probability_value
            excel_2_gram_probabilities_list.append(query_dataframe_2_gram.values.tolist())
    # An empty n-gram list has no mean; report 0.0 instead of dividing by zero.
    mean_values = sum_frequencies / count_n_gram if count_n_gram else 0.0
    # Returning the list frequency and the mean values
    return excel_2_gram_probabilities_list, mean_values, probability_value_list