This repository has been archived by the owner on Sep 5, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 4
/
entailment-title-input.py
71 lines (56 loc) · 2.18 KB
/
entailment-title-input.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
from allennlp.data.tokenizers.word_splitter import SimpleWordSplitter
from tqdm import tqdm
import unicodedata
import json
import re
import sys
import os
import argparse
# Command-line interface. --in_file, --out_dir and --split are mandatory
# strings; --max_sent_len is an optional integer that defaults to 200.
parser = argparse.ArgumentParser()
parser.add_argument("--in_file", required=True, type=str)
parser.add_argument("--out_dir", required=True, type=str)
parser.add_argument(
    "--split",
    required=True,
    type=str,
    help="which part of dataset",
)
parser.add_argument(
    "--max_sent_len",
    default=200,
    type=int,
    help="maximum sentence length",
)
args = parser.parse_args()

# Unpack the parsed options into the module-level names the rest of the
# script reads.
in_file, out_dir = args.in_file, args.out_dir
split, max_sent_len = args.split, args.max_sent_len
# Integer class id written to the .label file for each gold label string.
# Follow ESIM standards: lukecq1231: preprocess_data.py
labels = {
    "NOT ENOUGH INFO": 1,
    "SUPPORTS": 0,
    "REFUTES": 2,
}


def truncate_words(text, max_words):
    """Return *text* cut down to its first *max_words* space-separated tokens.

    Splitting is on single spaces only, so splitting and re-joining is an
    identity operation: text that is already short enough comes back
    unchanged, including any runs of repeated spaces.
    """
    return " ".join(text.split(" ")[:max_words])


def title_prefixed_premise(title_tokens, sentence_text, max_words):
    """Build the premise line for one candidate sentence.

    The sentence is prefixed with ``[ TITLE ]`` (the tokens of the source
    article title joined by single spaces) and the result is truncated to
    *max_words* space-separated tokens. The bracket and title tokens count
    towards that limit.
    """
    prefixed = "[ " + " ".join(title_tokens) + " ] " + sentence_text
    return truncate_words(prefixed, max_words)


def index_line(claim_id, premise_idx, page, sentence_ref):
    """Return the tab-separated .index record for one premise.

    Fields are: claim id, position of the premise among the claim's
    predicted sentences, the raw (underscored) page title, and the second
    field of the predicted-sentence entry (presumably the sentence number
    within the page - confirm against the producer of the input file).
    """
    return "\t".join(str(field) for field in (claim_id, premise_idx, page, sentence_ref))


def write_entailment_files(in_file, out_dir, split, max_sent_len, tokenizer):
    """Convert a JSON-lines file into four line-aligned text files.

    Every line of *in_file* is decoded as a JSON object from which the keys
    ``"id"``, ``"claim"`` and ``"predicted_sentences"`` are read. Each entry
    of ``"predicted_sentences"`` is indexed as
    ``[page_title, sentence_ref, label, sentence_text]``. For each entry one
    line is appended to each of ``<split>.premise``, ``<split>.hypothesis``,
    ``<split>.label`` and ``<split>.index`` inside *out_dir*.

    Args:
        in_file: path of the JSON-lines input.
        out_dir: directory that receives the four output files.
        split: base name of the output files.
        max_sent_len: maximum number of space-separated tokens kept per
            premise (title prefix included).
        tokenizer: object whose ``split_words(str)`` returns tokens exposing
            a ``.text`` attribute; used to tokenize the page title.

    Raises:
        KeyError: if a record lacks a required key or carries a label that
            is not in ``labels``.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    out_prefix = os.path.join(out_dir, split)
    # A single ``with`` guarantees every handle is flushed and closed even
    # when a malformed record aborts the run. The input is opened first so a
    # missing input file does not truncate existing outputs. JSON text is
    # UTF-8, so the encoding is pinned instead of relying on the locale.
    with open(in_file, "r", encoding="utf-8") as in_fp, \
            open(out_prefix + ".premise", "w", encoding="utf-8") as premise_fp, \
            open(out_prefix + ".hypothesis", "w", encoding="utf-8") as hypothesis_fp, \
            open(out_prefix + ".label", "w", encoding="utf-8") as label_fp, \
            open(out_prefix + ".index", "w", encoding="utf-8") as index_fp:
        # readlines() is kept on purpose: a sized list lets tqdm show a total.
        for line in tqdm(in_fp.readlines()):
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            struct = json.loads(line)
            hypothesis = struct["claim"]
            for premise_idx, sentence in enumerate(struct["predicted_sentences"]):
                page = sentence[0]
                label = labels[sentence[2]]
                # Prefix the premise sentence with [ TITLE ] (from source article)
                title = page.replace("_", " ")
                title_tokens = [token.text for token in tokenizer.split_words(title)]
                premise = title_prefixed_premise(title_tokens, sentence[3], max_sent_len)
                info = index_line(struct["id"], premise_idx, page, sentence[1])
                # All fields are computed before anything is written so the
                # four files never get out of step on a bad record.
                premise_fp.write(premise + "\n")
                hypothesis_fp.write(hypothesis + "\n")
                label_fp.write(str(label) + "\n")
                index_fp.write(info + "\n")


# Run the conversion only when executed as a script, so the helpers above can
# be imported without touching the filesystem.
if __name__ == "__main__":
    write_entailment_files(in_file, out_dir, split, max_sent_len, SimpleWordSplitter())