-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathFEASRL.py
235 lines (186 loc) · 9.31 KB
/
FEASRL.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
# .conll file analysis
import os
import nltk
from nltk.corpus import framenet as fn
# Change this to the file you are looking to analyze
file_name = "everyone_chat.conll"
# `orig` records whether a temporary copy (temp.txt) was created below, so the
# script knows later that it has a temporary file to delete.
orig = False
if ".conll" in file_name:
    correct_os = input("Is your OS either Linux or Windows (Y/N)? ")
    if correct_os == "Y":
        # `oper` is "Y" for Linux; any other answer is treated as Windows.
        oper = input("Is it Linux? (Type Y or N): ")
        orig = True
        if oper == "Y":
            # The source file name has to be substituted into the command;
            # without it the shell would try to copy a file literally named "{}".
            os.system("cp {} temp.txt".format(file_name))
        else:
            os.system("copy {} temp.conll".format(file_name))
            os.system("ren temp.conll temp.txt")
        # From here on the script reads the temporary .txt copy.
        file_name = "temp.txt"
#filename = "Webex_comp.txt"
# ----------------------------------------------------------------------------------------------------------
# How the .txt/.conll file is turned into the 4-d `sentences` list:
#
# Phase 1 reads the file line by line. Every non-blank line is split on tabs and collected into a
# temporary frame; a blank line ends the current frame.
#
# Phase 2 groups the frames into sentences. Consecutive frames belong to the same sentence when their
# first 5 words (column index 1) match; a frame whose leading words differ starts a new sentence.
#
# 4-d sentence order: All sentences -> sentence # -> frame # -> line # -> column #
# ----------------------------------------------------------------------------------------------------------

# Phase 1: split the file into frames (lists of tab-split lines).
frames = []
temp_frame = []
with open(file_name, "r", encoding="utf8") as conll_file:
    for line in conll_file:
        if not line.strip():
            # Blank line: the current frame is complete. Consecutive blank
            # lines do not create empty frames.
            if temp_frame:
                frames.append(temp_frame)
            temp_frame = []
        else:
            temp_frame.append(line.split("\t"))
# Keep the last frame even when the file does not end with a blank line.
if temp_frame:
    frames.append(temp_frame)

# Phase 2: group consecutive frames that share their leading words.
sentences = [[]]
current_sent = []
for frame in frames:
    # Up to five leading words identify the sentence; comparing at the frame
    # boundary means frames of five tokens or fewer are compared as well.
    first_5 = [row[1] for row in frame[:5]]
    if current_sent and first_5 != current_sent:
        sentences.append([])
    current_sent = first_5
    sentences[-1].append(frame)
# Delete the temporary copy if one was created. os.remove works on every
# platform, so no OS-specific shell command is needed.
if orig:
    try:
        os.remove("temp.txt")
    except FileNotFoundError:
        # Best-effort cleanup: nothing to delete if the copy was never made.
        pass
# Scan every frame for its main word (a ".v" or "Temporal" entry).
# Used to store sentences and main words for output later
all_sentence = []
main_words = []
main_word_lu = []
# Note: pop_list is 1-d and stores flat pairs as [frame #, sentence #, frame #, sentence #, ...]
pop_list = []
# Substrings that mark a line as the frame's main word
substr = ".v"
substr2 = "Temporal"
# ------------------------------------------------------------------------------------------
# How reading the 4-d list works and what is taken out of it:
#
# Each frame of each sentence is searched for the first line whose column index 12
# contains ".v" or whose column index 13 contains "Temporal". When such a line exists,
# its word (column index 1) and its column-index-13 value are saved for the output
# stage. When no line matches, the frame is irrelevant and its frame/sentence indices
# are noted in pop_list so it can be removed later.
#
# One text string per sentence is also saved in all_sentence. It is built from ALL
# tokens of the sentence's last frame (the search for the main word no longer cuts it
# short), with the token "unk" written as "the".
# NOTE(review): the original comment called these columns "part of speech" and "LU";
# confirm the column meanings against the tool that produced the .conll file.
# ------------------------------------------------------------------------------------------
for sent_index, sentence in enumerate(sentences):
    for frame_index, frame in enumerate(sentence):
        # First line of the frame that qualifies as the main word, if any.
        target = next(
            (line for line in frame if substr in line[12] or substr2 in line[13]),
            None,
        )
        if target is None:
            pop_list.append(frame_index)
            pop_list.append(sent_index)
        else:
            main_words.append(target[1])
            main_word_lu.append(target[13])
    # Frames were grouped into a sentence by their matching leading words, so
    # any one of them is expected to carry the sentence's tokens.
    last_frame = sentence[-1] if sentence else []
    all_sentence.append(
        ' '.join("the" if line[1] == "unk" else line[1] for line in last_frame)
    )
print(all_sentence)
# Remove the frames that have no main word. pop_list holds flat
# [frame #, sentence #, ...] pairs; reversing it yields [sentence #, frame #, ...]
# pairs with the latest entries first, so popping a frame never shifts the
# index of a frame that is still waiting to be popped.
pop_list.reverse()
for sent_no, frame_no in zip(pop_list[0::2], pop_list[1::2]):
    sentences[sent_no].pop(frame_no)
# ------------------------------------------------------------------------------------------
# How outputting to Results.txt works:
#
# For each sentence the saved sentence text is written, followed by one section per
# remaining frame: the frame's main word, every token whose column index 14 carries a
# label other than "O" or "_" (written as "word -> label"), and finally the FEs of the
# frame that were not found among those labels ("Missing FEs").
# Note: the token "unk" is written as "the", so it never shows up in the output.
# ------------------------------------------------------------------------------------------
# w_count indexes main_words / main_word_lu; it advances once per frame written.
w_count = 0
with open("Results.txt", "w") as results:
    for sent_no, sentence in enumerate(sentences):
        # Sentences after the first are separated by a blank line.
        lead = "" if sent_no == 0 else "\n"
        results.write("{}Sentence: {}\n".format(lead, all_sentence[sent_no]))
        for frame_no, frame in enumerate(sentence):
            frame_name = main_word_lu[w_count]
            lead = "" if frame_no == 0 else "\n"
            results.write("{}\tMain Word: {} (LU: {})\n".format(lead, main_words[w_count], frame_name))
            results.write("\tAssociated FEs in Sentence (Word associated -> FE):\n")
            existing_fe = []
            # Every line of the frame is examined, including the last one.
            for line in frame:
                # Column index 14 is the last column, so it normally carries
                # the line's trailing newline; drop it before comparing.
                label = line[14].rstrip("\n")
                if label in ("O", "_"):
                    continue
                word = "the" if line[1] == "unk" else line[1]
                results.write("\t\t{} -> {}\n".format(word, label))
                existing_fe.append(label)
            # This bottom section adds in the missing FEs from each frame by using NLTK.
            # fn.frames() matches names by pattern, so prefer the frame whose name is
            # exactly the one we saved and only fall back to the first match otherwise.
            candidates = fn.frames(frame_name)
            chosen = next(
                (cand for cand in candidates if cand.get("name") == frame_name),
                candidates[0],
            )
            f = fn.frame(chosen.get("ID"))
            FEs = sorted(f.FE)
            # An FE counts as present when its name occurs inside any label found
            # above; every FE is checked, including the last one in sorted order.
            # NOTE(review): this is a substring test, so an FE whose name is part of a
            # longer FE name is also treated as present - confirm this is acceptable.
            missing = [
                fe for fe in FEs
                if not any(fe in exist for exist in existing_fe)
            ]
            results.write("\tMissing FEs in Sentence:\n\t\t")
            results.write("".join("{}, ".format(fe) for fe in missing))
            results.write("\n")
            w_count += 1