-
Notifications
You must be signed in to change notification settings - Fork 13
/
Copy pathpreprocess_data.py
160 lines (129 loc) · 6.48 KB
/
preprocess_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
import json
import os
import re
from utils.fix_label import fix_general_label_error
# Directory the script reads "ontology.json" and the dialogue files from;
# the "*_v2.json" outputs are written back into the same directory.
data_dir = "data/mwz2.1"
# Dialogue files processed one after another by the main loop below.
data_files = ["train_dials.json", "dev_dials.json", "test_dials.json"]
#--------------------------------
def normalize_time(text):
    """Rewrite time expressions in *text* into zero-padded ``HH:MM`` form.

    The substitutions run in a fixed order and each one relies on the output
    of the previous ones, so they must not be reordered.

    Args:
        text: Utterance or slot value that may contain time expressions.

    Returns:
        The text with recognised time expressions normalised, e.g. ``"5pm"``
        becomes ``"17:00"`` and ``"at 5"`` becomes ``"at 05:00"``. Only a
        ``p.m.`` suffix is consumed by the 24-hour conversion; an ``a.m.``
        suffix stays in the text.
    """

    def _pm_to_24h(match):
        # Hours below 12 are moved into the afternoon; 12 and above are kept
        # unchanged (they are treated as already being in 24-hour form).
        hour = int(match.group(1))
        if hour < 12:
            hour += 12
        return str(hour) + match.group(2)

    # All patterns are raw strings so that "\d" and "\." reach the regex
    # engine untouched instead of being parsed as invalid string escapes.
    text = re.sub(r"(\d{1})(a\.?m\.?|p\.?m\.?)", r"\1 \2", text)  # am/pm without space
    text = re.sub(r"(^| )(\d{1,2}) (a\.?m\.?|p\.?m\.?)", r"\1\2:00 \3", text)  # am/pm short to long form
    text = re.sub(
        r"(^| )(at|from|by|until|after) ?(\d{1,2}) ?(\d{2})([^0-9]|$)",
        r"\1\2 \3:\4\5",
        text,
    )  # Missing separator
    text = re.sub(r"(^| )(\d{2})[;.,](\d{2})", r"\1\2:\3", text)  # Wrong separator
    text = re.sub(
        r"(^| )(at|from|by|until|after) ?(\d{1,2})([;., ]|$)",
        r"\1\2 \3:00\4",
        text,
    )  # normalize simple full hour time
    text = re.sub(r"(^| )(\d{1}:\d{2})", r"\g<1>0\2", text)  # Add missing leading 0
    # Map 12 hour times to 24 hour times
    text = re.sub(r"(\d{2})(:\d{2}) ?p\.?m\.?", _pm_to_24h, text)
    text = re.sub(r"(^| )24:(\d{2})", r"\g<1>00:\2", text)  # Correct times that use 24 as hour
    return text
def normalize_text(text):
    """Normalise an utterance: times, contractions, star ratings and typos.

    Args:
        text: Raw transcript string.

    Returns:
        The transcript after time normalisation followed by the fixed list
        of substitutions below, applied in order.
    """
    cleaned = normalize_time(text)
    cleaned = re.sub("n't", " not", cleaned)

    # Spelled-out star ratings ("four-star", "four star") become digits.
    number_words = ("zero", "one", "two", "three", "four", "five")
    for digit, word in enumerate(number_words):
        star_pattern = "(^| )" + word + "(-| )star([s.,? ]|$)"
        star_repl = "\\g<1>" + str(digit) + " star\\3"
        cleaned = re.sub(star_pattern, star_repl, cleaned)

    # Systematic typos first, then spelling/format normalisation.
    literal_fixes = (
        ("archaelogy", "archaeology"),
        ("anthropogy", "anthropology"),
        ("theater", "theatre"),
        ("musuem", "museum"),
        ("the weat", "the west"),
        ("the wast", "the west"),
        (" wendesday ", " wednesday "),
        (" wednes ", " wednesday "),
        ("thurtsday", "thursday"),
        ("mdoerate", "moderate"),
        ("portugese", "portuguese"),
        ("guesthouse", "guest house"),
    )
    for wrong, right in literal_fixes:
        cleaned = re.sub(wrong, right, cleaned)

    # "b&b" / "b & b" as a standalone token, then the long "&" spelling.
    cleaned = re.sub("(^| )b ?& ?b([.,? ]|$)", r"\1bed and breakfast\2", cleaned)
    cleaned = re.sub("bed & breakfast", "bed and breakfast", cleaned)
    return cleaned
def normalize_label(slot, value_label):
    """Normalise one belief-state value for the given slot.

    Args:
        slot: Slot name such as ``"hotel-parking"``.
        value_label: Raw value string annotated for that slot.

    Returns:
        ``"none"`` for empty / ``"not mentioned"`` values, a normalised time
        string for time slots, ``"true"`` / ``"false"`` for the mapped hotel
        slots, otherwise the spelling-corrected value.
    """
    # Some general typos
    for typo, fixed in (
        ("theater", "theatre"),
        ("archaelogy", "archaeology"),
        ("anthropogy", "anthropology"),
        ("portugese", "portuguese"),
    ):
        value_label = re.sub(typo, fixed, value_label)

    # Normalization of empty slots
    if value_label in ("", "not mentioned"):
        return "none"

    # Normalization of time slots
    is_time_slot = slot == 'restaurant-book time' or any(
        marker in slot for marker in ("leaveat", "arriveby")
    )
    if is_time_slot:
        return normalize_time(value_label)

    # Normalization
    if any(marker in slot for marker in ("type", "name", "destination", "departure")):
        value_label = re.sub("guesthouse", "guest house", value_label)
    if slot == "hotel-name":
        value_label = re.sub("b & b", "bed and breakfast", value_label)

    # Map to boolean slots; values without an entry pass through unchanged.
    if slot in ('hotel-parking', 'hotel-internet'):
        boolean_values = {"yes": "true", "free": "true", "no": "false"}
    elif slot == 'hotel-type':
        boolean_values = {"hotel": "true", "guest house": "false"}
    else:
        boolean_values = {}
    return boolean_values.get(value_label, value_label)
#--------------------------------
def make_slot_meta(ontology):
    """Derive the slot list and a per-slot value lookup from an ontology.

    Args:
        ontology: Mapping whose keys have the form ``"<domain>-<slot>"``.

    Returns:
        A ``(meta, change)`` tuple. ``meta`` is the sorted list of
        ``"<domain>-<slot>"`` names, with spaces removed from slot parts
        containing ``price``, ``leave`` or ``arrive``. ``change`` maps each
        of those names to the value stored under the original key.
    """
    meta = []
    change = {}
    for key, values in ontology.items():
        # Split on the first hyphen only, so a slot part that itself contains
        # a hyphen no longer breaks the two-way unpacking.
        domain, slot = key.split('-', 1)
        if 'price' in slot or 'leave' in slot or 'arrive' in slot:
            slot = slot.replace(' ', '')
        name = '-'.join([domain, slot])
        meta.append(name)
        change[name] = values
    return sorted(meta), change
### Read ontology file
with open(os.path.join(data_dir, "ontology.json"), "r") as fp_ont:
    data_ont = json.load(fp_ont)

slot_meta, _ = make_slot_meta(data_ont)

# Every normalised value observed for each slot, accumulated over all files.
ontology_modified = {slot: [] for slot in slot_meta}

### normalize text and fix label errors
for file_id in data_files:
    # The context manager closes the input file once it has been parsed.
    with open(os.path.join(data_dir, file_id), "r") as fp_data:
        dials = json.load(fp_data)

    dials_v2 = []
    for dial_dict in dials:
        dial_domains = dial_dict["domains"]
        # Before the first turn every slot counts as unset.
        prev_turn_state = {slot: "none" for slot in slot_meta}

        for turn in dial_dict["dialogue"]:
            turn["system_transcript"] = normalize_text(turn["system_transcript"])
            turn["transcript"] = normalize_text(turn["transcript"])

            # state
            # NOTE(review): fix_general_label_error is a project helper; it is
            # used below as a mutable slot -> value mapping -- confirm.
            turn_dialog_state = fix_general_label_error(turn["belief_state"], False, slot_meta)
            for slot in slot_meta:
                # Slots that are missing, or whose domain is not part of this
                # dialogue, are forced to "none".
                if slot not in turn_dialog_state or slot.split('-')[0] not in dial_domains:
                    turn_dialog_state[slot] = "none"
                else:
                    turn_dialog_state[slot] = normalize_label(slot, turn_dialog_state[slot])
                if turn_dialog_state[slot] == "dontcare":
                    turn_dialog_state[slot] = "do not care"
                ontology_modified[slot].append(turn_dialog_state[slot])
            # The raw belief state is emptied in the output; the per-turn
            # changes are stored under "turn_label" instead.
            turn["belief_state"] = []

            # turn label
            # Only slots whose value differs from the previous turn are listed.
            turn["turn_label"] = [
                [slot, turn_dialog_state[slot]]
                for slot in slot_meta
                if turn_dialog_state[slot] != prev_turn_state[slot]
            ]
            prev_turn_state = turn_dialog_state

        dials_v2.append(dial_dict)

    with open(os.path.join(data_dir, file_id.split(".")[0]+"_v2.json"), 'w') as outfile:
        json.dump(dials_v2, outfile, indent=4)

# ontology extracted from dataset
for slot in slot_meta:
    ontology_modified[slot] = sorted(set(ontology_modified[slot]))

with open(os.path.join(data_dir, 'ontology-modified_v2.json'), 'w') as outfile:
    json.dump(ontology_modified, outfile, indent=4)