-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathconvert_copa.py
99 lines (78 loc) · 3.75 KB
/
convert_copa.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
import os
import sys
import csv
import ast
import json
import random
from utils import convert_copa_data
# Original COPA (`dev + test`): https://people.ict.usc.edu/~gordon/copa.html
# Balanced-COPA (`dev`): https://github.com/Balanced-COPA/Balanced-COPA
# BCOPA-CE (`test`): https://github.com/badbadcode/weakCOPA
# the 'm' extension is for the multi-choice formatted data
dir_path = 'data/copa/'
copa_dev, copa_dev_ordered, copa_dev_m = convert_copa_data(dir_path + 'original/copa-dev.xml')
bcopa_dev, bcopa_dev_ordered, bcopa_dev_m = convert_copa_data(dir_path + 'original/balanced-copa-dev-all.xml')
copa_test, copa_test_ordered, copa_test_m = convert_copa_data(dir_path + 'original/copa-test.xml')
bcopa_ce_test, bcopa_ce_test_ordered, bcopa_ce_test_m = convert_copa_data(dir_path + 'original/BCOPA-CE.xml')
# ---------------------------------------------
# creating easy and hard subsets from COPA-test
# these subsets are defined by:
# Kavumba, Pride, et al. "When Choosing Plausible Alternatives, Clever Hans can be Clever."
# Proceedings of the First Workshop on Commonsense Inference in Natural Language Processing. 2019.
easy_hard_path = 'original/easy_hard_subsets.json'
with open(dir_path + easy_hard_path) as f:
easy_hard_data = json.load(f)
easy_test_m = []
hard_test_m = []
for item in copa_test_m:
item_id = int(item['id'])
if item_id in easy_hard_data['easy']:
easy_test_m.append(item)
elif item_id in easy_hard_data['hard']:
hard_test_m.append(item)
# ---------------------------------------------
all_data = {"copa_dev": copa_dev, "copa_dev_ordered": copa_dev_ordered, "copa_dev_m": copa_dev_m,
"bcopa_dev": bcopa_dev, "bcopa_dev_ordered": bcopa_dev_ordered, "bcopa_dev_m": bcopa_dev_m,
"copa_test": copa_test, "copa_test_ordered": copa_test_ordered, "copa_test_m": copa_test_m,
"bcopa_ce_test": bcopa_ce_test, "bcopa_ce_test_ordered": bcopa_ce_test_ordered,
"bcopa_ce_test_m": bcopa_ce_test_m,
"easy_test_m": easy_test_m, "hard_test_m": hard_test_m}
for data_name, data in all_data.items():
data = list(map(str, data))
with open(dir_path + "{}.txt".format(data_name), "w") as output:
output.writelines(f'{s}\n' for s in data)
# ---------------------------------------------
col_names_seq = ['sent1', 'sent2', 'label']
col_names_multi = ['startphrase', 'sent1', 'sent2', 'ending0', 'ending1', 'label']
def create_csv_seq(examples, file_path):
with open('{}.csv'.format(file_path), 'w', newline='') as file:
writer = csv.writer(file)
writer.writerow(col_names_seq)
for example in examples:
sent1 = example['sent1']
sent2 = example['sent2']
label = int(example['label'])
writer.writerow([sent1, sent2, label])
def create_csv_multi(examples, file_path):
with open('{}.csv'.format(file_path), 'w', newline='') as file:
writer = csv.writer(file)
writer.writerow(col_names_multi)
for example in examples:
example = ast.literal_eval(example)
sent1 = example['premise']
ending0 = example['choice0']
ending1 = example['choice1']
label = int(example['label'])
if 'cause' in example['question']:
sent2 = 'It is because'
startphrase = sent1 + ' ' + sent2
else:
sent2 = 'As a result,'
startphrase = sent1 + ' ' + sent2
writer.writerow([startphrase, sent1, sent2, ending0, ending1, label])
for data_name, data in all_data.items():
if data_name.endswith('_m'):
data = list(map(str, data))
create_csv_multi(data, dir_path + '{}'.format(data_name))
else:
create_csv_seq(data, dir_path + '{}'.format(data_name))