This repository has been archived by the owner on Sep 5, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathretrieved-sentences.py
108 lines (91 loc) · 3.34 KB
/
retrieved-sentences.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
import unicodedata
import json
import re
import sys
import os
import argparse
# from retrieval.fever_doc_db import FeverDocDB
def _normalize_title(raw):
    """Return ``str(raw)`` in Unicode NFD form.

    Titles coming from the predictions, the gold evidence and the Wikipedia
    dump all go through this function so that they compare equal no matter
    how accented characters were composed in each file.
    """
    return unicodedata.normalize('NFD', str(raw))


def _read_jsonl(path):
    """Yield one decoded JSON value per non-blank line of the file at *path*.

    The file is always read as UTF-8 instead of the locale default, and
    whitespace-only lines (e.g. a trailing newline at end of file) are
    skipped rather than handed to the JSON decoder.
    """
    with open(path, "r", encoding="utf-8") as fp:
        for line in fp:
            if line.strip():
                yield json.loads(line)


def _list_page_files(pages_dir):
    """Return the sorted paths of all ``*.jsonl`` files directly in *pages_dir*."""
    return sorted(
        os.path.join(pages_dir, name)
        for name in os.listdir(pages_dir)
        if name.endswith(".jsonl")
    )


def _collect_needed(claims):
    """Map each predicted page title to the set of line numbers wanted from it.

    *claims* is an iterable of decoded input records; each must carry a
    ``"predicted_sentences"`` list whose items start with ``[title, linenum]``.
    Predictions whose title is ``None`` are ignored here because the labelling
    step ignores them as well.
    """
    needed = {}
    for struct in claims:
        for finding in struct["predicted_sentences"]:
            if finding[0] is None:
                continue
            title = _normalize_title(finding[0])
            needed.setdefault(title, set()).add(finding[1])
    return needed


def _extract_sentences(title, lines_field, wanted):
    """Return ``{line number: sentence}`` for the *wanted* lines of one page.

    *lines_field* is the page's ``"lines"`` string: newline-separated records
    whose second tab-separated field is taken as the sentence text.  A line's
    number is its 0-based position in that list.  A wanted record with fewer
    than two fields is reported and replaced by a placeholder sentence.
    """
    sentences = {}
    for linenum, linerecord in enumerate(lines_field.split("\n")):
        if linenum not in wanted:
            continue
        fields = linerecord.split("\t")
        if len(fields) < 2:
            print("Problem retrieving from " + title + " line " + str(linenum))
            sentences[linenum] = "This sentence is intentionally left blank ."
        else:
            sentences[linenum] = fields[1]
    return sentences


def _load_sentences(page_files, needed):
    """Scan the dump files and return ``{title: {line number: sentence}}``.

    Only pages whose normalized ``"id"`` is a key of *needed* are kept.
    """
    found = {}
    for path in page_files:
        print("Processing " + path)
        for struct in _read_jsonl(path):
            title = _normalize_title(struct["id"])
            if title in needed:
                found[title] = _extract_sentences(
                    title, struct["lines"], needed[title]
                )
    return found


def _label_claim(struct, found):
    """Append ``label`` and ``sentence`` to each resolvable prediction in place.

    Returns ``False`` (leaving *struct* untouched) when any evidence group
    does not consist of exactly one sentence, i.e. the claim needs several
    statements combined; returns ``True`` otherwise.

    A prediction is labelled with the claim's own ``"label"`` when its
    (title, line number) appears in the gold evidence, and with
    ``"NOT ENOUGH INFO"`` otherwise.  Predictions with a ``None`` title, an
    unknown page or an unknown line are left as they are.
    """
    evidence = struct["evidence"]
    # Decide on skipping before touching eg[0]: an empty group would
    # otherwise raise IndexError instead of being skipped.
    if any(len(eg) != 1 for eg in evidence):
        return False  # multiple supporting statements required

    supports = {}
    for eg in evidence:
        title = _normalize_title(eg[0][2])
        supports.setdefault(title, []).append(eg[0][3])  # line number

    for finding in struct["predicted_sentences"]:
        if finding[0] is None:
            continue
        title = _normalize_title(finding[0])
        linenum = finding[1]
        if title not in found:
            print("Page not found: " + title)
            continue
        if linenum not in found[title]:
            print("Line not found: " + title + " " + str(linenum))
            continue
        if len(finding) != 2:
            print("Bad finding length: " + title + " " + str(linenum))
        label = "NOT ENOUGH INFO"
        if linenum in supports.get(title, ()):
            label = struct["label"]  # REFUTES or SUPPORTS
        finding.append(label)
        finding.append(found[title][linenum])
    return True


def main(argv=None):
    """Attach sentence text and a label to every predicted sentence.

    Reads ``--in_file`` (JSON lines with ``evidence``, ``label`` and
    ``predicted_sentences``), looks the predicted sentences up in the
    ``*.jsonl`` files of ``--fever_pages_dir`` and writes the annotated
    records, one JSON object per line, to ``--out_file``.

    Args:
        argv: Command-line arguments without the program name; ``None``
            makes argparse fall back to ``sys.argv[1:]``.

    Raises:
        ValueError: If ``--out_file`` already exists.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--in_file", type=str, required=True)
    parser.add_argument("--out_file", type=str, required=True)
    parser.add_argument("--fever_pages_dir", help="Wikipedia dump", type=str, required=True)
    args = parser.parse_args(argv)

    # Never overwrite the results of a previous run.
    if os.path.exists(args.out_file):
        raise ValueError("Output already exists")

    fever_pages = _list_page_files(args.fever_pages_dir)
    needed = _collect_needed(_read_jsonl(args.in_file))
    found = _load_sentences(fever_pages, needed)

    print("Filling in answers")
    with open(args.out_file, "w", encoding="utf-8") as out_fp:
        for struct in _read_jsonl(args.in_file):
            # NOTE(review): skipped claims are assumed to be omitted from the
            # output entirely rather than written unlabelled -- confirm.
            if _label_claim(struct, found):
                out_fp.write(json.dumps(struct) + "\n")


if __name__ == "__main__":
    main()