-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy patheval.py
659 lines (564 loc) · 21.9 KB
/
eval.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
"""Evaluation code for GQA."""
# pylint: skip-file
# mypy: ignore-errors
# Computes a suite of metrics such as accuracy, consistency, plausibility and
# scores per question type and length.
# Visit https://gqadataset.org/ for all information about the dataset,
# including examples, visualizations, paper and slides.
#
#
# Metrics:
# - Accuracy: Standard accuracy, computed over the balanced version of the
# dataset, which is more robust against cheating by making educated
# guesses. For each question-answer pair (q,a), we give 1 point if the
# predicted answer p matches a and 0 otherwise, and average over all
# questions in the dataset.
#
# - Consistency: A metric for the level of model's consistency across different
# questions. For each question-answer pair (q,a), we define a set
# Eq={q1, q2, ..., qn} of entailed questions, the answers to which
# can be unambiguously inferred given (q,a). Denote Q the set of
# all questions the model answered correctly. For each question
# q in Q, we measure the model's accuracy over the entailed
# questions Eq to get the score sq and finally average these
# results across all questions in Q.
#
# - Validity: Measures whether the model gives a "valid" answer - one that can
# theoretically be an answer to the question (e.g. a color to a
# color question, yes/no to a binary question etc.). We provide a
# set of valid answers to each question over the final answer
# vocabulary, in the choices file, and use it to compute average
# validity across the dataset.
#
# - Plausibility: Measures whether the model's answers are plausible, i.e. ones
# that make sense in the real world, e.g. not answering "purple"
# to a question about apple color (unless it's really purple).
# We provide a set of all plausible answers to each question,
# computed by looking at all attributes and relations that hold for
# various objects throughout the whole dataset scene graphs,
# and use it to compute average model plausibility across the data.
#
# - Grounding: Only for attention models. Measures whether the model looks at
# the relevant regions in the image when answering a question.
# Each question in the dataset is annotated with the visual regions
# they refer to, which are then used to compute the level to which
# the model has a correct visual attention, which makes it possible to
# identify whether it really answers based on the image or by
# language-based guesses. Supports both spatial features and
# object-based features.
#
# - Distribution: Measures the overall match between the true answer distribution
# for different questions, vs the overall distribution predicted
# by the model through its answers for all the data. We use
# chi-square statistic to measure the degree of similarity between
# the distributions, giving indication to the level of overall
# world-knowledge of the model
#
# - Accuracy per type: accuracy per question structural types
# (logic, compare, choose), and semantic type
# (questions about attributes, relations, categories,
# objects or the whole scene).
#
# - Accuracy for length: accuracy as a function of the question length, in terms
# of (1) words number, and (2) semantic
# complexity - number of reasoning steps.
#
# We may support additional metrics (e.g. coverage) in the future.
#
#
# Files format:
# - predictions file format: JSON array: [{"questionId": str, "prediction": str}]
# - attentions file format: JSON array:
# Spatial attention: [{
# "questionId": str,
# "attention": [mapSize x mapSize: float]
# }]
# Object-based attention:[{
# "questionId": str,
# "attention": [[x0, y0, x1, y1, float] x #regions]
# }]. 0 < x,y < 1.
# - questions and choices files are provided as part of the dataset.
# see https://gqadataset.org/download.html for information about their format.
#
#
# If you have any questions or comments, please feel free to send an email,
# at dorarad@cs.stanford.edu. We hope you'll enjoy using the GQA dataset! :)
#
#
import argparse
import glob
import json
import os.path
from collections import defaultdict
from tqdm import tqdm
# Arguments
# Command-line interface. Path-like defaults are resolved relative to this
# file and may contain a "{tier}" placeholder that is filled in at load time.
_HERE = os.path.dirname(__file__)

parser = argparse.ArgumentParser()
parser.add_argument("--tier", type=str, default="val", help="Tier, e.g. train, val")
parser.add_argument(
    "--scenes",
    type=str,
    default=os.path.join(_HERE, "data/gqa/sceneGraphs/{tier}_sceneGraphs.json"),
    help="Scene graphs file name format.",
)
parser.add_argument(
    "--questions",
    type=str,
    default=os.path.join(_HERE, "data/gqa/questions/{tier}_all_questions.json"),
    help="Questions file name format.",
)
parser.add_argument(
    "--choices",
    type=str,
    default=os.path.join(_HERE, "data/gqa/eval/{tier}_choices.json"),
    help="Choices file name format.",
)

# --include-ids / --exclude-ids restrict the metrics to a subset of a tier.
# In this project the model is trained on the full balanced training set,
# evaluated on the *first half* of the balanced val tier, and tested on the
# second half of the balanced val tier.
# Test consistency needs predictions for every question of the tier (not just
# the balanced ones), as given by --questions. --include-ids then names the ids
# the test metrics are computed on (i.e. the second half of the val set), and
# --exclude-ids names the ids that were used for evaluation, so that they are
# skipped when computing consistency over entailed questions.
parser.add_argument(
    "--include-ids",
    type=str,
    default=None,
    help=(
        "A path to a list of question IDs on which to compute metrics. "
        "If None, compute metrics for all provided IDs"
    ),
)
parser.add_argument(
    "--exclude-ids",
    type=str,
    default=None,
    help=(
        "A path to a list of question IDs on which to exclude when computing "
        "metrics. If None, no IDs are excluded"
    ),
)
parser.add_argument(
    "--predictions",
    type=str,
    default=None,
    required=True,
    help="Answers file name.",
)
parser.add_argument(
    "--attentions",
    type=str,
    default=os.path.join(_HERE, "{tier}_attentions.json"),
    help="Attentions file name format.",
)
parser.add_argument(
    "--consistency",
    action="store_true",
    help=(
        "True to compute consistency score "
        "(Need to provide answers to questions in val_all_questions.json)."
    ),
)
parser.add_argument(
    "--grounding",
    action="store_true",
    help="True to compute grounding score (If model uses attention).",
)
parser.add_argument(
    "--objectFeatures",
    action="store_true",
    help="True for object-based attention (False for spatial).",
)
parser.add_argument(
    "--mapSize",
    type=int,
    default=7,
    help=(
        "Optional, only to get attention score. "
        "Images features map size, mapSize * mapSize"
    ),
)

args = parser.parse_args()
# Advisory messages shown on every run. Each tuple is printed as one line,
# with its parts joined by single spaces (the default `print` separator).
_FEATURES_NOTICE = (
    "Please make sure to use our provided visual features as gqadataset.org for",
    "better comparability. We provide both spatial and object-based features",
    "trained on GQA train set.",
)
_BOTTOM_UP_NOTICE = (
    "In particular please avoid using features from",
    "https://github.com/peteanderson80/bottom-up-attention since they were",
    "trained on images contained in the GQA validation set and thus may give",
    "false scores improvement.\n",
)
print(*_FEATURES_NOTICE)
print(*_BOTTOM_UP_NOTICE)

# Suggest the optional metrics only when they were not requested.
if not args.consistency:
    print(
        "Please consider using --consistency to compute consistency scores for",
        "entailed questions.",
    )
    print(
        "If you do so, please provide answers to all",
        "questions in val_all_questions.json.\n",
    )

if not args.grounding:
    print("Please consider using --grounding to compute attention scores.")
    print("If you do so, please provide attention maps through --attentions.\n")
# Files Loading
def load_file(name):
    """Load JSON data from `name`, or from its chunk files.

    The path is printed first. If `name` is a regular file it is parsed as
    JSON and returned. Otherwise, if the parent directory of `name` exists,
    every file matching "<parent>/<stem>/<stem>_*<ext>" is parsed and merged
    (via `dict.update`) into a single dictionary; the list of matched chunk
    paths is printed before merging.

    Raises:
        Exception: if `name` is not a file and its parent directory does not
            exist.
    """
    print(name)

    # Plain, single-file case.
    if os.path.isfile(name):
        with open(name) as handle:
            return json.load(handle)

    parent = os.path.dirname(name)
    if not os.path.isdir(parent):
        raise Exception("Can't find {}".format(name))

    # Chunked case: the data is split across files in a sibling directory
    # that is named after the file's stem.
    stem, ext = os.path.splitext(os.path.basename(name))
    pattern = os.path.join(parent, "{dir}/{dir}_*{ext}".format(dir=stem, ext=ext))
    chunks = glob.glob(pattern)
    print(chunks)

    merged = {}
    for chunk in chunks:
        with open(chunk) as handle:
            merged.update(json.load(handle))
    return merged
# Load scene graphs
# Scene graphs, questions and choices: the file name templates are filled in
# with the requested tier.
print("Loading scene graphs...")
scenes = load_file(args.scenes.format(tier=args.tier))

print("Loading questions...")
questions = load_file(args.questions.format(tier=args.tier))

print("Loading choices...")
choices = load_file(args.choices.format(tier=args.tier))

# Predictions come as a list of records; re-key them by question id.
print("Loading predictions...")
predictions = {
    record["questionId"]: record["prediction"]
    for record in load_file(args.predictions)
}

# Optional id masks; None means "no mask".
include_ids = None
if args.include_ids is not None:
    print("Loading include ids...")
    include_ids = load_file(args.include_ids)

exclude_ids = None
if args.exclude_ids is not None:
    print("Loading exclude ids...")
    exclude_ids = load_file(args.exclude_ids)

# Make sure all questions have predictions
# TODO check for include and exclude IDs?
# for qid in questions:
#     if (qid not in predictions) and \
#             (args.consistency or questions[qid]["isBalanced"]):
#         if include_ids is None or qid in include_ids:
#             print(
#                 "no prediction for question {}.".format(qid),
#                 "Please add prediction for all questions.",
#             )
#             raise Exception("missing predictions")

# Attention maps are only read when grounding was requested; they are
# re-keyed by question id, like the predictions.
ATTENTIONS = None
if args.grounding:
    with open(args.attentions.format(tier=args.tier)) as attentionsFile:
        _attention_records = json.load(attentionsFile)
    ATTENTIONS = {
        record["questionId"]: record["attention"] for record in _attention_records
    }
# Scores data structures initialization
def to_score(condition):
    """Map a truthy condition to 1.0 and a falsy one to 0.0."""
    if condition:
        return 1.0
    return 0.0
def avg(lst):
    """Return the arithmetic mean of `lst`, or 0 when `lst` is empty."""
    count = len(lst)
    if count == 0:
        return 0
    total = float(sum(lst))
    return total / count
def wavg(lst, weights):
    """Return the weighted mean of `lst`, or None when the weights sum to 0.

    `weights[i]` is the weight of `lst[i]`; the result is the weighted sum
    divided by the sum of all weights.
    """
    totalWeight = sum(weights)
    if totalWeight == 0:
        return None
    weightedSum = sum(value * weights[idx] for idx, value in enumerate(lst))
    return float(weightedSum) / totalWeight
# Initialize data structure to track all metrics: e.g. accuracy, validity and
# plausibility, as well as accuracy per question type, length and number of
# reasoning steps.
# Per-question score lists (one entry per evaluated question).
_FLAT_METRICS = (
    "accuracy",  # 1 if correct else 0
    "binary",  # accuracy over binary questions (1 if correct else 0)
    "open",  # accuracy over open questions (1 if correct else 0)
    "validity",  # 1 if valid else 0
    "plausibility",  # 1 if plausible else 0
    "consistency",  # consistency scores for entailed questions
)
# Accuracy lists bucketed by a question property.
_GROUPED_METRICS = (
    "accuracyPerStructuralType",  # per structural type (e.g. compare, logic)
    "accuracyPerSemanticType",  # per semantic type (e.g. object, attribute)
    "accuracyPerLength",  # per question's word number
    "accuracyPerSteps",  # per question's reasoning length (steps number)
)

scores = {}
for _metric in _FLAT_METRICS:
    scores[_metric] = []
for _metric in _GROUPED_METRICS:
    scores[_metric] = defaultdict(list)
scores["grounding"] = []  # grounding scores for each question

# Gold and predicted answer histograms per question group
# (group -> answer -> count). Used to compute the distribution metric.
dist = {}
for _side in ("gold", "predicted"):
    dist[_side] = defaultdict(lambda: defaultdict(int))
# Question lengths - words numbers and reasoning steps number
def getWordsNum(question):
    """Return the number of whitespace-separated words in the question text."""
    words = question["question"].split()
    return len(words)
def getStepsNum(question):
    """Count the reasoning steps of a question.

    Each entry of question["semantic"] is rendered as
    "<operation>: <argument>". Entries whose rendering contains "exist",
    "query: name" or "choose name" are not counted: these are the final
    "querying" steps, which do not increase the effective reasoning length.
    """
    skipped = ("exist", "query: name", "choose name")
    count = 0
    for step in question["semantic"]:
        rendered = "{}: {}".format(step["operation"], step["argument"])
        if not any(marker in rendered for marker in skipped):
            count += 1
    return count
# Functions for validity and plausibility
def belongs(element, group, question):
    """Tell whether `element` is a member of the relevant answer group.

    When the question's detailed type contains "Common", the supplied `group`
    is ignored and membership is tested against the fixed list
    ["color", "material", "shape"] instead.
    """
    isCommon = "Common" in question["types"]["detailed"]
    candidates = ["color", "material", "shape"] if isCommon else group
    return element in candidates
# Functions for consistency scores (for entailed questions ("inferred"))
def updateConsistency(questionId, question, questions, exclude=None):
    """Record the consistency score of one question, if applicable.

    A score is appended to scores["consistency"] only when the question
    itself was answered correctly and it has at least one entailed question
    left after filtering. The score is the model's accuracy over those
    entailed questions.

    Args:
        questionId: id of the question being scored.
        question: the question record; reads its "entailed" and "answer"
            fields.
        questions: mapping from question id to question record, used to look
            up the gold answers of the entailed questions.
        exclude: optional container of question ids to leave out of the
            entailed set. None (the default) excludes nothing.

    Raises:
        KeyError: if an entailed question is missing from `questions` or has
            no entry in the module-level `predictions`.
    """
    # A None default avoids sharing one mutable list between calls.
    excluded = () if exclude is None else exclude
    inferredQuestions = [
        eid
        for eid in question["entailed"]
        if eid != questionId and eid not in excluded
    ]

    # Derive correctness from this question's own prediction rather than from
    # a `correct` variable that happens to be set by the calling loop.
    answeredCorrectly = (
        questionId in predictions
        and predictions[questionId] == question["answer"]
    )
    if not answeredCorrectly or len(inferredQuestions) == 0:
        return

    consistencyScores = []
    for eid in inferredQuestions:
        gold = questions[eid]["answer"]
        predicted = predictions[eid]
        consistencyScores.append(to_score(predicted == gold))
    scores["consistency"].append(avg(consistencyScores))
# Functions for grounding score (optional, only for attention models)
# Utility functions for working with bounding boxes.
# c = (x0, y0, x1, y1), r = (r0, r1)
def yrange(c):
    """Return the (y0, y1) bounds of a bbox c = (x0, y0, x1, y1)."""
    top, bottom = c[1], c[3]
    return (top, bottom)
def xrange(c):
    """Return the (x0, x1) bounds of a bbox c = (x0, y0, x1, y1)."""
    left, right = c[0], c[2]
    return (left, right)
def length(r):
    """Return the length of segment r = (r0, r1); a None segment has length 0."""
    return 0 if r is None else float(r[1] - r[0])
def size(c):
    """Return the area of a bbox c = (x0, y0, x1, y1)."""
    width = length(xrange(c))
    height = length(yrange(c))
    return width * height
def intersection(r1, r2):
    """Return the overlap of two segments, or None if they do not overlap.

    Segments that merely touch at an endpoint count as non-overlapping.
    """
    lo = max(r1[0], r2[0])
    hi = min(r1[1], r2[1])
    return (lo, hi) if hi > lo else None
def intersectionSize(c1, c2):
    """Return the area shared by two bboxes (0 when they do not overlap)."""
    sharedX = intersection(xrange(c1), xrange(c2))
    sharedY = intersection(yrange(c1), yrange(c2))
    return length(sharedX) * length(sharedY)
def intersectionRate(c1, c2):
    """Return the fraction of bbox c1's area that is covered by bbox c2."""
    shared = float(intersectionSize(c1, c2))
    return shared / size(c1)
def getCell(i, j):
    """Return the bbox of cell (i, j) in a mapSize x mapSize grid over the unit square.

    The grid size is read from args.mapSize. Index `i` selects the first
    (x) coordinate pair and `j` the second (y) one.
    """
    edge = float(1) / args.mapSize
    x0, y0 = edge * i, edge * j
    x1, y1 = edge * (i + 1), edge * (j + 1)
    return (x0, y0, x1, y1)
def getRegion(sceneGraph, objectId):
    """Return the bbox of an object, normalised by the image dimensions.

    Reads the object's "x", "y", "w", "h" fields and the scene graph's
    "width" and "height", and returns (x0, y0, x1, y1) with x divided by
    the width and y divided by the height.
    """
    obj = sceneGraph["objects"][objectId]
    imageWidth = sceneGraph["width"]
    imageHeight = sceneGraph["height"]
    left, top = obj["x"], obj["y"]
    return (
        float(left) / imageWidth,
        float(top) / imageHeight,
        float(left + obj["w"]) / imageWidth,
        float(top + obj["h"]) / imageHeight,
    )
def computeGroundingScore(question, sceneGraph, attentionMap):
    """Compute the grounding score of one question.

    Sums, over every gold region and every attention cell, the cell's
    attention multiplied by the fraction of the cell that lies inside the
    region. Gold regions are the objects referenced by the question and by
    the full answer annotations.

    Raises:
        Exception: when args.objectFeatures is set; object-based attention is
            not supported by this implementation.
    """
    annotations = question["annotations"]

    # Gold regions: question pointers first, then answer pointers.
    regions = [
        getRegion(sceneGraph, objectId)
        for objectId in annotations["question"].values()
    ]
    regions.extend(
        getRegion(sceneGraph, objectId)
        for objectId in annotations["fullAnswer"].values()
    )

    # Add the whole image if the question refers to the whole scene.
    # NOTE(review): if the semantic steps are dicts (as getStepsNum indexes
    # them), `"scene" in step` tests the dict's keys, not its argument text.
    # Confirm this is the intended check.
    if any("scene" in step for step in question["semantic"]):
        regions.append((0, 0, 1, 1))

    # Object-based attention was never wired up: the original code referenced
    # an undefined name here, so the script refuses to continue instead.
    if args.objectFeatures:
        raise Exception("Error in original eval code, refer to GQA paper.")

    # Spatial attention: pair every grid cell with its attention weight.
    cells = [
        (getCell(row, col), attentionMap[row][col])
        for row in range(args.mapSize)
        for col in range(args.mapSize)
    ]

    # Compare the attention map to the gold regions.
    return sum(
        attention * intersectionRate(cell, region)
        for region in regions
        for cell, attention in cells
    )
# Functions for distribution score
def chiSquare(goldDist, predictedDist):
    """Compute the chi-square statistic of predicted vs. gold answer counts.

    For each question group the statistic sum((o - e) ** 2 / e) is computed
    over the gold answers, where e is the gold count and o the predicted
    count of the same answer (0 if that answer was never predicted). The
    per-group statistics are then averaged, weighted by each group's total
    gold count.

    Args:
        goldDist: mapping group -> answer -> gold count.
        predictedDist: mapping group -> answer -> predicted count. A group
            that is missing here is treated as having no predictions.

    Returns:
        The weighted average statistic, or 0.0 when `goldDist` holds no
        counts at all (there is nothing to compare).

    Raises:
        ZeroDivisionError: if a gold answer is stored with a count of 0.
    """
    sumScore, sumOverall = 0, 0
    for group, goldCounts in goldDist.items():
        # .get() keeps plain dicts working and does not create empty entries
        # in a defaultdict as a side effect of reading it.
        predictedCounts = predictedDist.get(group, {})
        score, overall = 0, 0
        for ans, e in goldCounts.items():
            o = predictedCounts.get(ans, 0)
            score += (float(o - e) ** 2) / e
            overall += e
        sumScore += score * overall
        sumOverall += overall

    # Without any gold counts the weighted average is undefined; report 0.0
    # instead of dividing by zero.
    if sumOverall == 0:
        return 0.0
    return float(sumScore) / sumOverall
# Main score computation
print(len(questions))

# The id masks are consulted once per question (and once per entailed
# question), so build hash-based lookups a single time instead of scanning
# the loaded lists on every membership test.
_include_lookup = None if include_ids is None else frozenset(include_ids)
_exclude_lookup = frozenset() if exclude_ids is None else frozenset(exclude_ids)

# Loop over the questions and compute metrics
for qid, question in tqdm(questions.items()):
    # Compute scores over the balanced dataset
    # (more robust against cheating by making educated guesses)
    if not question["isBalanced"]:
        continue
    # Mask ids where appropriate to evaluate custom subsets of tiers
    if _include_lookup is not None and qid not in _include_lookup:
        continue

    gold = question["answer"]
    predicted = predictions[qid]
    # `correct` stays a module-level name: updateConsistency may read it.
    correct = predicted == gold
    score = to_score(correct)
    wordsNum = getWordsNum(question)
    stepsNum = getStepsNum(question)

    # Update accuracy, overall and per question property
    structuralType = question["types"]["structural"]
    scores["accuracy"].append(score)
    scores["accuracyPerLength"][wordsNum].append(score)
    scores["accuracyPerSteps"][stepsNum].append(score)
    scores["accuracyPerStructuralType"][structuralType].append(score)
    scores["accuracyPerSemanticType"][question["types"]["semantic"]].append(score)
    # "query" questions are open-ended; every other structural type is binary.
    answerType = "open" if structuralType == "query" else "binary"
    scores[answerType].append(score)

    # Update validity score
    valid = belongs(predicted, choices[qid]["valid"], question)
    scores["validity"].append(to_score(valid))

    # Update plausibility score
    plausible = belongs(predicted, choices[qid]["plausible"], question)
    scores["plausibility"].append(to_score(plausible))

    # Optionally compute grounding (attention) score
    if ATTENTIONS is not None:
        groundingScore = computeGroundingScore(
            question, scenes[question["imageId"]], ATTENTIONS[qid]
        )
        if groundingScore is not None:
            scores["grounding"].append(groundingScore)

    # Update histograms for gold and predicted answers
    globalGroup = question["groups"]["global"]
    if globalGroup is not None:
        dist["gold"][globalGroup][gold] += 1
        dist["predicted"][globalGroup][predicted] += 1

    if args.consistency:
        # Compute consistency (for entailed questions)
        updateConsistency(qid, question, questions, exclude=_exclude_lookup)
# Compute distribution score
# Distribution score: chi-square between gold and predicted histograms,
# scaled down by 100.
scores["distribution"] = chiSquare(dist["gold"], dist["predicted"]) / 100

# Headline metrics, in the order they are reported.
metrics = [
    "binary",
    "open",
    "accuracy",
    "consistency",
    "validity",
    "plausibility",
    "grounding",
    "distribution",
]

# Bucketed metrics: (key in `scores`, title used when printing).
detailedMetrics = [
    ("accuracyPerStructuralType", "Accuracy / structural type"),
    ("accuracyPerSemanticType", "Accuracy / semantic type"),
    ("accuracyPerSteps", "Accuracy / steps number"),
    ("accuracyPerLength", "Accuracy / words number"),
]

# Display names for abbreviated semantic types.
subMetrics = {
    "attr": "attribute",
    "cat": "category",
    "global": "scene",
    "obj": "object",
    "rel": "relation",
}

# Collapse every per-question list into a percentage. Entries that are not
# lists (the distribution score) are left as they are.
# NOTE: the loop variables `k`, `t` and `_` remain bound at module level after
# these loops, so their names are kept unchanged.
for k in metrics:
    perQuestion = scores[k]
    if isinstance(perQuestion, list):
        scores[k] = avg(perQuestion) * 100

# Replace each bucket's score list by a (percentage, sample count) pair.
for k, _ in detailedMetrics:
    buckets = scores[k]
    for t in buckets:
        samples = buckets[t]
        buckets[t] = (avg(samples) * 100, len(samples))
# print
# Headline metrics, one per line.
print("")
for m in metrics:
    # skip grounding and consistency scores if not requested
    if m == "grounding" and not args.grounding:
        continue
    if m == "consistency" and not args.consistency:
        continue

    print(
        "{title}: {score:.2f}{suffix}".format(
            title=m.capitalize(),
            score=scores[m],
            suffix=" (lower is better)" if m == "distribution" else "%",
        )
    )

# Bucketed metrics: one section per metric, one line per bucket.
for m, mPrintName in detailedMetrics:
    print("")
    # print metric title
    print("{}:".format(mPrintName))

    for t in sorted(scores[m]):
        # Set the sub-metric title. Type buckets are keyed by strings and get
        # their display name (e.g. "attr" -> "Attribute"); the length and
        # step buckets are keyed by numbers and are printed as they are.
        # The test is made on the key itself: the previous check inspected a
        # variable left over from an earlier loop and therefore never applied
        # the display names.
        tName = t
        if isinstance(t, str):
            tName = subMetrics.get(t, t).capitalize()

        # Each bucket holds a (percentage, question count) pair.
        bucketScore, bucketSize = scores[m][t][0], scores[m][t][1]
        print(
            " {title}: {score:.2f}{suffix} ({amount} questions)".format(
                title=tName, score=bucketScore, suffix="%", amount=bucketSize
            )
        )