#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# This file is called by mgproc.py
#
# It defines:
#
# - Comparison as a new base class with methods for
#   - computing the feasibility of all metrics wrt
#     a given psycholinguistic contrast
#   - resetting the results for all metrics
#
# - ComparisonSet as a new base class that collects multiple comparisons;
#   its methods are:
#   - .add for adding comparisons to the collection
#   - .compare for running every comparison in the set
#   - .show for printing the winners, ties, and losers among the metrics
#
# - functions for defining Comparison(Set)s with text files

import os
import pprint
import re
import tabulate
from mgproc import tree_from_file


class Comparison:
    """
    Compare metrics with respect to a specific processing contrast.

    A processing contrast consists of two trees, one of which is parsed
    faster than the other. A Comparison collects the two IOTrees, records
    their empirical processing difficulty, and keeps track of how well each
    metric predicts this contrast. A usage sketch follows the class
    definition.

    Public Methods
    --------------
    .name: str
        name of comparison (e.g. Eng-SRC-ORC)
    .winner: IOTree
        tree that is processed faster
    .loser: IOTree
        tree that is processed more slowly
    .metrics: set
        set of metrics to be used in comparison
    .latex: str
        LaTeX command for name of comparison
    .success: set
        set of metrics that correctly pick the winner
    .tie: set
        set of metrics that predict a tie
    .failure: set
        set of metrics that incorrectly pick the loser
    .compare: list of metrics -> updated Comparison
        for each metric, compute the values it assigns to the two trees
        and update its viability accordingly
    .reset:
        wipe all comparison results calculated so far
    """
    def __init__(self, name: str='',
                 winner: 'IOTree'=None, loser: 'IOTree'=None,
                 metrics: set=None, latex: str='',
                 success: set=None, tie: set=None, failure: set=None):
        # default to None rather than set() so that distinct Comparisons
        # never share the same mutable set object
        self.name = name
        self.winner = winner
        self.loser = loser
        self.latex = latex
        self.metrics = metrics if metrics is not None else set()
        self.success = success if success is not None else set()
        self.tie = tie if tie is not None else set()
        self.failure = failure if failure is not None else set()

    def compare(self, metrics: set=set()):
        if not metrics:
            metrics = self.metrics
        for metric in metrics:
            # check how the metric does
            metric.compare(self.name, self.winner, self.loser)
            # and add it to the correct group, removing it from the others
            # if necessary (set.discard never raises, unlike set.remove)
            if metric.viable == (True, True):
                # predicts the first tree to win, the second to lose
                self.tie.discard(metric)
                self.failure.discard(metric)
                self.success.add(metric)
            elif metric.viable == (False, True):
                # predicts a tie
                self.success.discard(metric)
                self.failure.discard(metric)
                self.tie.add(metric)
            else:
                # predicts the first tree to lose, the second to win
                self.success.discard(metric)
                self.tie.discard(metric)
                self.failure.add(metric)

    def reset(self):
        self.metrics = set()
        self.success = set()
        self.tie = set()
        self.failure = set()
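

# A minimal usage sketch for Comparison, kept as a comment so that importing
# this module has no side effects. The metric object and the tree paths below
# are hypothetical placeholders; any ranked metric and any pair of IOTrees
# built via mgproc should work the same way:
#
#     src = tree_from_file('trees/Eng-SRC')       # faster tree
#     orc = tree_from_file('trees/Eng-ORC')       # slower tree
#     contrast = Comparison(name='Eng-SRC-ORC', winner=src, loser=orc,
#                           metrics={some_metric})
#     contrast.compare()
#     # some_metric now sits in exactly one of these three sets
#     print(contrast.success, contrast.tie, contrast.failure)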


class ComparisonSet:
    """
    Collection of Comparisons.

    Multiple comparisons can be collected into a single ComparisonSet. This
    makes it easy for the user to run multiple comparisons at once. A usage
    sketch follows the class definition.

    Public Methods
    --------------
    .comparisons: list
        stores all Comparisons belonging to this ComparisonSet
    .add: Comparison -> updated ComparisonSet
        add a Comparison to .comparisons
    .compare:
        call .compare for every member of the ComparisonSet
    .merge:
        merge another ComparisonSet into this one; to be implemented
    .show():
        print the overview of successful, tying, and failing metrics
    .table():
        print tabular overview of comparison results per metric
    .trees():
        print list of trees used in comparison
    """
    def __init__(self, args: list, name: str='', metrics: set=None,
                 success: set=None, tie: set=None, failure: set=None):
        # default to None rather than set() so that distinct ComparisonSets
        # never share the same mutable set object
        self.name = name
        self.metrics = metrics if metrics is not None else set()
        self.success = success if success is not None else set()
        self.tie = tie if tie is not None else set()
        self.failure = failure if failure is not None else set()
        self.comparisons = []
        self._winners = []
        self._losers = []
        self._trees = []
        for arg in args:
            try:
                if isinstance(arg, dict):
                    self.add(Comparison(**arg))
                elif isinstance(arg, tuple):
                    self.add(Comparison(*arg))
                else:
                    print('something is going wrong')
            except Exception:
                print('Comparison specification of {0} is illicit'.format(arg))

    def trees(self, split: bool=False, update: bool=False):
        if update or not self._losers:
            self._losers = [comp.loser for comp in self.comparisons]
        if update or not self._winners:
            self._winners = [comp.winner for comp in self.comparisons]
        if split and (update or not self._trees):
            self._trees = [self._winners, self._losers]
        elif update or not self._trees:
            self._trees = self._winners + self._losers
        return self._trees

    def add(self, comparison):
        self.comparisons.append(comparison)

    def compare(self, comparisons: set=None):
        # by default no Comparisons are passed;
        # in that case, use the full collection
        if not comparisons:
            comparisons = self.comparisons
        for comparison in comparisons:
            comparison.compare(self.metrics)
        # update our record of how the metrics did:
        # a metric succeeds only if it succeeds on every comparison,
        # fails if it fails on at least one, and ties otherwise
        self.success = set.intersection(*[comparison.success
                                          for comparison in comparisons])
        self.failure = set.union(*[comparison.failure
                                   for comparison in comparisons])
        self.tie = set(self.metrics).difference(
            self.success.union(self.failure))
        # self.tie = set.intersection(*[comparison.tie
        #                               for comparison in comparisons])

    def merge(self, compset: 'ComparisonSet') -> 'ComparisonSet':
        # fixme: to be implemented
        pass

    def _metric_id(self, metric: 'RankedMetric'):
        return '{0}_{1}'.format(metric._name(), metric._filters())

    def _metric_dict(self, function: 'function'=None):
        if not function:
            function = lambda x: x
        metric_dict = {}
        metric_dict['success'] = [function(metric)
                                  for metric in self.success]
        metric_dict['tie'] = [function(metric)
                              for metric in self.tie]
        metric_dict['failure'] = [function(metric)
                                  for metric in self.failure]
        return metric_dict

    def show(self, subtype=None):
        if subtype:
            pprint.pprint(self._metric_dict(function=self._metric_id)[subtype])
        else:
            pprint.pprint(self._metric_dict(function=self._metric_id))

    def _matrix(self, numerical: bool=False):
        metrics = self.metrics
        rows = []
        for metric in metrics:
            row = [metric.name, metric.filters]
            for comparison in self.comparisons:
                if numerical:
                    winner = str(
                        metric.profile[comparison.name]['desired winner'][2])
                    loser = str(
                        metric.profile[comparison.name]['desired loser'][2])
                    result = '{0}/{1}'.format(winner, loser)
                    row.append(result)
                else:
                    result = metric.profile[comparison.name]['captured']
                    row.append(_rewrite_tuple(result))
            rows.append(row)
        return rows

    def table(self, numerical: bool=False, filename: str=None):
        headers = ['Metric', 'Filters'] +\
            [comp.name for comp in self.comparisons]
        table = tabulate.tabulate(sorted(self._matrix(numerical=numerical)),
                                  tablefmt='orgtbl', headers=headers)
        if filename:
            # use a context manager so the file is closed even on errors
            with open(filename, 'w') as outfile:
                outfile.write(table)
        else:
            print(table)
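

# A minimal usage sketch for ComparisonSet, kept as a comment so that
# importing this module has no side effects. The specification dictionary
# and the metric set below are hypothetical placeholders:
#
#     specs = [{'name': 'Eng-SRC-ORC', 'latex': '\\EngSrcOrc',
#               'winner': src, 'loser': orc, 'metrics': my_metrics}]
#     comps = ComparisonSet(specs, name='relative-clauses',
#                           metrics=my_metrics)
#     comps.compare()
#     comps.show()     # dict of successful, tying, and failing metrics
#     comps.table()    # org-mode table with one row per metric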


def _rewrite_tuple(tuplepair: tuple) -> str:
    """Rewrite a (bool, bool) pair as a human-friendly string."""
    rewrite = {(True, True): 'Yes',
               (False, True): 'Tie',
               (False, False): 'No'}
    return rewrite.get(tuplepair, 'Error')
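
# For example (return values taken from the dictionary above):
#
#     _rewrite_tuple((True, True))    # 'Yes' -- metric picks the winner
#     _rewrite_tuple((False, True))   # 'Tie' -- metric predicts a tie
#     _rewrite_tuple((False, False))  # 'No'  -- metric picks the loser
#     _rewrite_tuple('anything else') # 'Error'
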
#########################################
# Comparison Specifications from Text #
#########################################

def _comparison_from_line(comparison_line: str, metrics: set=set(),
                          inputfile: str='', directory: str=None) -> dict:
    """
    Construct Comparison from line in *.compare file.

    The lines of a *.compare file are of the form

        name; LaTeX command; winner; loser

    name: Python-internal name of comparison
    LaTeX: LaTeX command for the comparison name
    winner: path to .tree.forest for more quickly processed tree
    loser: path to .tree.forest for more slowly processed tree

    (An example line is sketched in the comment after this function.)

    Parameters
    ----------
    comparison_line: str
        line from *.compare file that is to be processed
    metrics: set
        set of metrics to be used in comparison
    inputfile: str
        path to *.compare file
    directory: str
        if specified, this will be prepended to the paths for winner and loser
    """
    # split line at every ; and keep the first four values
    parameters = [field.strip() for field in comparison_line.split(';')]
    try:
        name, latex, winner_path, loser_path = parameters[:4]
    except ValueError:
        message = 'Error in file {0}:\nnot enough parameters specified'
        raise Exception(message.format(inputfile))
    # construct IOTrees for winner and loser
    if directory:
        winner_path = os.path.join(directory, winner_path)
        loser_path = os.path.join(directory, loser_path)
    winner = tree_from_file(winner_path)
    loser = tree_from_file(loser_path)
    # return dictionary from which the Comparison will be built
    return {'name': name, 'latex': latex, 'metrics': metrics,
            'winner': winner, 'loser': loser}
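
# For example, a *.compare file might contain a line like the following
# (all names and paths are hypothetical):
#
#     Eng-SRC-ORC; \EngSrcOrc; eng_src.tree.forest; eng_orc.tree.forest
#
# which _comparison_from_line turns into a dictionary with the keys
# 'name', 'latex', 'metrics', 'winner', and 'loser'.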


def comparisons_from_file(inputfile: str=None,
                          directory: str=None,
                          extension: str='.compare',
                          metrics: set=set()) -> 'ComparisonSet':
    """
    Build collection of Comparisons from *.compare file.

    Users can define a ComparisonSet with a *.compare file,
    where each line is of the form

        name; LaTeX command; winner; loser

    name: Python-internal name of comparison
    LaTeX: LaTeX command for the comparison name
    winner: path to .tree.forest for more quickly processed tree
    loser: path to .tree.forest for more slowly processed tree

    A full usage sketch can be found at the end of this file.

    Parameters
    ----------
    inputfile: str
        path to *.compare file
    directory: str
        if specified, this will be prepended to the paths for winner and loser
    extension: str
        override default file extension for *.compare files
    metrics: set
        set of metrics to be used in the comparisons
    """
    # ask for input file if necessary
    if not inputfile:
        inputfile =\
            input("File to read in (without .compare extension):\n")
    # remove trailing extension if specified
    if inputfile.endswith(extension):
        inputfile = inputfile[:-len(extension)]
    # set basename
    basename = os.path.basename(inputfile)
    # read in specification file
    with open(inputfile + extension, 'r') as compfile:
        # create a list of dictionaries, each of which defines a Comparison;
        # blank lines and comment lines (starting with #) are skipped
        parameter_dicts = [_comparison_from_line(line, metrics,
                                                 inputfile, directory)
                           for line in compfile.readlines()
                           if not (re.match(r'^\s*$', line) or
                                   re.match(r'\s*#.*', line))]
    comp = ComparisonSet(parameter_dicts, name=basename,
                         metrics=metrics)
    comp.compare()
    return comp
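
# End-to-end usage sketch, kept as a comment so that importing this module
# has no side effects. The file names and the metric set are hypothetical
# placeholders:
#
#     comps = comparisons_from_file(inputfile='rc_studies',
#                                   directory='trees',
#                                   metrics=my_metrics)
#     comps.show()                         # which metrics succeed, tie, fail
#     comps.table(filename='rc_table.org') # write org-mode table to a file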