eval.py

"""Evaluation code for GQA."""
# pylint: skip-file
# mypy: ignore-errors
# Computes a suite of metrics such as accuracy, consistency, plausibility and
# scores per question type and length.
# Visit https://gqadataset.org/ for all information about the dataset,
# including examples, visualizations, paper and slides.
#
#
# Metrics:
# - Accuracy: Standard accuracy, computed over the balanced version of the
#             dataset, which is more robust against cheating by making educated
#             guesses. For each question-answer pair (q,a), we give 1 point if the
#             predicted answer p matches a and 0 otherwise, and average over all
#             questions in the dataset.
#
# - Consistency: A metric for the level of model's consistency across different
#                questions. For each question-answer pair (q,a), we define a set
#                Eq={q1, q2, ..., qn} of entailed questions, the answers to which
#                can be unambiguously inferred given (q,a). Denote Q the set of
#                all questions the model answered correctly. For each question
#                q in Q, we measure the model's accuracy over the entailed
#                questions Eq to get the score sq and finally average these
#                results across all questions in Q.
#
# - Validity: Measures whether the model gives a "valid" answer - one that can
#             theoretically be an answer to the question (e.g. a color to a
#             color question, yes/no to a binary question etc.). We provide a
#             set of valid answers to each question over the final answer
#             vocabulary, in the choices file, and use it to compute average
#             validity across the dataset.
#
# - Plausibility: Measures whether the model answers are plausible, i.e. ones
#                 that make sense in the real world, e.g. not answering "purple"
#                 to a question about apple color (unless it's really purple).
#                 We provide a set of all plausible answers to each question,
#                 computed by looking at all attributes and relations that hold
#                 for various objects throughout the whole dataset scene graphs,
#                 and use it to compute average model plausibility across the data.
#
# - Grounding: Only for attention models. Measures whether the model looks at
#              the relevant regions in the image when answering a question.
#              Each question in the dataset is annotated with the visual regions
#              they refer to, which are then used to compute the level to which
#              the model has a correct visual attention, which allows us to
#              identify whether it really answers based on the image or by
#              language-based guesses. Supports both spatial features and
#              object-based features.
#
# - Distribution: Measures the overall match between the true answer distribution
#                 for different questions, vs the overall distribution predicted
#                 by the model through its answers for all the data. We use
#                 chi-square statistic to measure the degree of similarity between
#                 the distributions, giving indication to the level of overall
#                 world-knowledge of the model
#
# - Accuracy per type: accuracy per question structural types
#                      (logic, compare, choose), and semantic type
#                      (questions about attributes, relations, categories,
#                      objects or the whole scene).
#
# - Accuracy for length: accuracy as a function of the question length, in terms
#                        of (1) number of words, and (2) semantic
#                        complexity - number of reasoning steps.
#
# We may support additional metrics (e.g. coverage) in the future.
#
#
# Files format:
# - predictions file format: JSON array: [{"questionId": str, "prediction": str}]
# - attentions file format: JSON array:
#   Spatial attention: [{
#                          "questionId": str,
#                          "attention": [mapSize x mapSize: float]
#                      }]
#   Object-based attention:[{
#                              "questionId": str,
#                              "attention": [[x0, y0, x1, y1, float] x #regions]
#                          }]. 0 < x,y < 1.
# - questions and choices files are provided as part of the dataset.
#   see https://gqadataset.org/download.html for information about their format.
#
#
# If you have any questions or comments, please feel free to send an email,
# at dorarad@cs.stanford.edu. We hope you'll enjoy using the GQA dataset! :)
#
#

import argparse
import glob
import json
import os.path
from collections import defaultdict

from tqdm import tqdm

# Command-line interface

# Directory containing this script; default data locations are relative to it.
_SCRIPT_DIR = os.path.dirname(__file__)

parser = argparse.ArgumentParser()
parser.add_argument("--tier", type=str, default="val", help="Tier, e.g. train, val")

# Dataset files. Each default is a format string with a "{tier}" placeholder.
parser.add_argument(
    "--scenes",
    type=str,
    default=os.path.join(_SCRIPT_DIR, "data/gqa/sceneGraphs/{tier}_sceneGraphs.json"),
    help="Scene graphs file name format.",
)
parser.add_argument(
    "--questions",
    type=str,
    default=os.path.join(_SCRIPT_DIR, "data/gqa/questions/{tier}_all_questions.json"),
    help="Questions file name format.",
)
parser.add_argument(
    "--choices",
    type=str,
    default=os.path.join(_SCRIPT_DIR, "data/gqa/eval/{tier}_choices.json"),
    help="Choices file name format.",
)

# --include-ids / --exclude-ids restrict the metrics to a subset of a tier.
# In this project the model is trained on the full balanced training set,
# evaluated on the *first half* of the balanced val tier and tested on the
# second half of it.
# Test consistency needs predictions for every question of the tier (not only
# the balanced ones), as given by --questions. --include-ids then names the
# ids that count towards the test metrics (i.e. the second half of the val
# set), and --exclude-ids names the ids that were used for evaluation, so that
# they are left out when scoring consistency over entailed questions.
parser.add_argument(
    "--include-ids",
    type=str,
    default=None,
    help=(
        "A path to a list of question IDs on which to compute metrics. "
        "If None, compute metrics for all provided IDs"
    ),
)
parser.add_argument(
    "--exclude-ids",
    type=str,
    default=None,
    help=(
        "A path to a list of question IDs on which to exclude when computing "
        "metrics. If None, no IDs are excluded"
    ),
)

# Model outputs
parser.add_argument(
    "--predictions",
    type=str,
    default=None,
    required=True,
    help="Answers file name.",
)
parser.add_argument(
    "--attentions",
    type=str,
    default=os.path.join(_SCRIPT_DIR, "{tier}_attentions.json"),
    help="Attentions file name format.",
)

# Optional metrics
parser.add_argument(
    "--consistency",
    action="store_true",
    help=(
        "True to compute consistency score "
        "(Need to provide answers to questions in val_all_questions.json)."
    ),
)
parser.add_argument(
    "--grounding",
    action="store_true",
    help="True to compute grounding score (If model uses attention).",
)
parser.add_argument(
    "--objectFeatures",
    action="store_true",
    help="True for object-based attention (False for spatial).",
)
parser.add_argument(
    "--mapSize",
    type=int,
    default=7,
    help=(
        "Optional, only to get attention score. "
        "Images features map size, mapSize * mapSize"
    ),
)
args = parser.parse_args()

# Reminders about which visual features keep results comparable.
# Each tuple is one print() call; its parts are joined by single spaces.
for notice in (
    (
        "Please make sure to use our provided visual features as gqadataset.org for",
        "better comparability. We provide both spatial and object-based features",
        "trained on GQA train set.",
    ),
    (
        "In particular please avoid using features from",
        "https://github.com/peteanderson80/bottom-up-attention since they were",
        "trained on images contained in the GQA validation set and thus may give",
        "false scores improvement.\n",
    ),
):
    print(*notice)

# Suggest the optional consistency metric when it was not requested.
if not args.consistency:
    for notice in (
        (
            "Please consider using --consistency to compute consistency scores for",
            "entailed questions.",
        ),
        (
            "If you do so, please provide answers to all",
            "questions in val_all_questions.json.\n",
        ),
    ):
        print(*notice)

# Suggest the optional grounding metric when it was not requested.
if not args.grounding:
    for notice in (
        "Please consider using --grounding to compute attention scores.",
        "If you do so, please provide attention maps through --attentions.\n",
    ):
        print(notice)


# Files Loading
def load_file(name):
    """Load JSON data from ``name``, or from its chunk files when it is split.

    If ``name`` is an existing file, it is parsed and its content returned.
    Otherwise the data is looked up as chunks in a sibling directory named
    after the file stem, i.e. ``<dirname>/<stem>/<stem>_*<ext>``. Each chunk
    must contain a JSON object; all chunks are merged into a single dict.

    Args:
        name: Path of the JSON file (or the path it would have if not chunked).

    Returns:
        The decoded content of the file, or a dict merging all chunk files.

    Raises:
        FileNotFoundError: If neither the file nor any chunk file exists.
    """
    print(name)
    # load standard json file
    if os.path.isfile(name):
        # JSON is UTF-8 encoded; do not depend on the platform's locale.
        with open(name, encoding="utf-8") as file:
            return json.load(file)

    # load file chunks if too big
    stem, ext = os.path.splitext(os.path.basename(name))
    # A bare file name has an empty dirname: look in the current directory.
    parent = os.path.dirname(name) or os.curdir
    # glob returns matches in arbitrary order; sort them so the merge order
    # (and hence which chunk wins on a duplicated key) is reproducible.
    chunks = sorted(
        glob.glob(
            os.path.join(parent, "{dir}/{dir}_*{ext}".format(dir=stem, ext=ext))
        )
    )
    if not chunks:
        # Fail here rather than return an empty dict that only breaks later.
        raise FileNotFoundError("Can't find {}".format(name))

    print(chunks)
    data = {}
    for chunk in chunks:
        with open(chunk, encoding="utf-8") as file:
            data.update(json.load(file))
    return data


# Load scene graphs
print("Loading scene graphs...")
scenes = load_file(args.scenes.format(tier=args.tier))

# Load questions
print("Loading questions...")
questions = load_file(args.questions.format(tier=args.tier))

# Load choices
print("Loading choices...")
choices = load_file(args.choices.format(tier=args.tier))

# Load predictions and turn them into a dictionary
print("Loading predictions...")
predictions = load_file(args.predictions)
predictions = {p["questionId"]: p["prediction"] for p in predictions}

# Load masked ids
# The ids are only ever used for membership tests (once per question in the
# main loop and once per entailed question for consistency). Keeping them as
# sets makes each test O(1); with the raw JSON list every test is a linear
# scan, which is quadratic over a whole tier.
include_ids = None
if args.include_ids is not None:
    print("Loading include ids...")
    include_ids = set(load_file(args.include_ids))

exclude_ids = None
if args.exclude_ids is not None:
    print("Loading exclude ids...")
    exclude_ids = set(load_file(args.exclude_ids))

# Make sure all questions have predictions
# TODO check for include and exclude IDs?
# for qid in questions:
#     if (qid not in predictions) and \
#         (args.consistency or questions[qid]["isBalanced"]):
#         if include_ids is None or qid in include_ids:
#             print(
#                 "no prediction for question {}.".format(qid),
#                 "Please add prediction for all questions.",
#             )
#             raise Exception("missing predictions")

# Load attentions and turn them into a dictionary
ATTENTIONS = None
if args.grounding:
    with open(args.attentions.format(tier=args.tier)) as attentionsFile:
        ATTENTIONS = json.load(attentionsFile)
        ATTENTIONS = {a["questionId"]: a["attention"] for a in ATTENTIONS}

# Scores data structures initialization


def to_score(condition):
    """Map a truthy value to 1.0 and a falsy value to 0.0."""
    return 1.0 if condition else 0.0


def avg(lst):
    """Return the arithmetic mean of ``lst`` as a float, or 0 if it is empty."""
    count = len(lst)
    if count == 0:
        return 0
    total = float(sum(lst))
    return total / count


def wavg(lst, weights):
    """Return the weighted mean of ``lst``, or None when the weights sum to 0.

    ``weights[i]`` is the weight of ``lst[i]``.
    """
    if sum(weights) == 0:
        return None
    weighted = sum(value * weights[i] for i, value in enumerate(lst))
    return float(weighted) / sum(weights)


# Accumulators for every metric: accuracy, validity, plausibility, consistency
# and grounding, plus accuracy broken down by question type, length and number
# of reasoning steps.
scores = dict(
    # per-question accuracy (1 if correct else 0)
    accuracy=[],
    # per-question accuracy, binary questions only (1 if correct else 0)
    binary=[],
    # per-question accuracy, open questions only (1 if correct else 0)
    open=[],
    # per-question validity (1 if valid else 0)
    validity=[],
    # per-question plausibility (1 if plausible else 0)
    plausibility=[],
    # consistency scores over entailed questions
    consistency=[],
    # accuracies grouped by structural type (e.g. compare, logic questions)
    accuracyPerStructuralType=defaultdict(list),
    # accuracies grouped by semantic type (e.g. object, attribute, relation)
    accuracyPerSemanticType=defaultdict(list),
    # accuracies grouped by the question's number of words
    accuracyPerLength=defaultdict(list),
    # accuracies grouped by the question's number of reasoning steps
    accuracyPerSteps=defaultdict(list),
    # per-question grounding scores
    grounding=[],
)

# Gold and predicted answer histograms, one per question group:
# dist[kind][group][answer] -> count. Used for the distribution metric.
dist = {
    kind: defaultdict(lambda: defaultdict(int)) for kind in ("gold", "predicted")
}


# Question lengths - words numbers and reasoning steps number
def getWordsNum(question):
    """Return the number of whitespace-separated words in the question text."""
    words = question["question"].split()
    return len(words)


def getStepsNum(question):
    """Count the reasoning steps of a question.

    Every entry of ``question["semantic"]`` is rendered as
    "<operation>: <argument>". Entries whose rendering contains "exist",
    "query: name" or "choose name" are not counted: the final "querying" step
    doesn't increase the effective reasoning length.
    """
    ignored = ["exist", "query: name", "choose name"]
    count = 0
    for step in question["semantic"]:
        rendered = "{}: {}".format(step["operation"], step["argument"])
        if not any(marker in rendered for marker in ignored):
            count += 1
    return count


# Functions for validity and plausibility
def belongs(element, group, question):
    """Tell whether ``element`` is a member of ``group``.

    When the question's detailed type contains "Common", ``group`` is ignored
    and membership is tested against ["color", "material", "shape"] instead.
    """
    if "Common" in question["types"]["detailed"]:
        return element in ["color", "material", "shape"]
    return element in group


# Functions for consistency scores (for entailed questions ("inferred"))
def updateConsistency(questionId, question, questions, exclude=()):
    """Record the consistency score of one question, when it has one.

    The score is the model's accuracy over the questions entailed by
    ``question``, leaving out the question itself and any id in ``exclude``.
    It is recorded only if the question was answered correctly and at least
    one entailed question remains.

    This function depends on module-level state: it reads ``correct`` (set by
    the main loop for the question being processed) and ``predictions``, and
    appends the result to ``scores["consistency"]``.

    Args:
        questionId: Id of the question being scored.
        question: The question entry; its "entailed" field lists question ids.
        questions: Mapping from question id to question entry, used to look up
            the gold answers of the entailed questions.
        exclude: Container of question ids to ignore among the entailed ones.

    Raises:
        KeyError: If an entailed question is missing from ``questions`` or has
            no prediction.
    """
    inferredQuestions = [
        eid for eid in question["entailed"] if eid != questionId and eid not in exclude
    ]

    # Only correctly answered questions with entailed questions are scored.
    if not (correct and inferredQuestions):
        return

    consistencyScores = []
    for eid in inferredQuestions:
        gold = questions[eid]["answer"]
        predicted = predictions[eid]
        consistencyScores.append(to_score(predicted == gold))

    scores["consistency"].append(avg(consistencyScores))


# Functions for grounding score (optional, only for attention models)


# Utility functions for working with bounding boxes.
# c = (x0, y0, x1, y1), r = (r0, r1)
def yrange(c):
    """Return the vertical extent (y0, y1) of a bbox given as (x0, y0, x1, y1)."""
    top, bottom = c[1], c[3]
    return (top, bottom)


def xrange(c):
    """Return the horizontal extent (x0, x1) of a bbox given as (x0, y0, x1, y1)."""
    left, right = c[0], c[2]
    return (left, right)


def length(r):
    """Return the length of segment ``r = (r0, r1)`` as a float, or 0 for None."""
    if r is not None:
        start, end = r[0], r[1]
        return float(end - start)
    return 0


def size(c):
    """Return the area of bbox ``c``: its x extent times its y extent."""
    width = length(xrange(c))
    height = length(yrange(c))
    return width * height


def intersection(r1, r2):
    """Return the overlap of two segments, or None if it is empty.

    Segments that merely touch at an end point do not overlap.
    """
    lower = max(r1[0], r2[0])
    upper = min(r1[1], r2[1])
    return (lower, upper) if upper > lower else None


def intersectionSize(c1, c2):
    """Return the area shared by bboxes ``c1`` and ``c2`` (0 if disjoint)."""
    overlapX = intersection(xrange(c1), xrange(c2))
    overlapY = intersection(yrange(c1), yrange(c2))
    return length(overlapX) * length(overlapY)


def intersectionRate(c1, c2):
    """Return the fraction of bbox ``c1``'s area that is covered by ``c2``."""
    shared = intersectionSize(c1, c2)
    return float(shared) / size(c1)


def getCell(i, j):
    """Return cell (i, j) of the attention grid as a bbox tuple.

    The grid has ``args.mapSize`` cells per side; the first coordinate pair is
    scaled from ``i`` and ``j`` respectively.
    """
    edge = 1.0 / args.mapSize
    start0, start1 = edge * i, edge * j
    return (start0, start1, edge * (i + 1), edge * (j + 1))


def getRegion(sceneGraph, objectId):
    """Return the bbox (x0, y0, x1, y1) of an object of ``sceneGraph``.

    The object's x/y/w/h values are divided by the scene graph's width and
    height.
    """
    box = sceneGraph["objects"][objectId]
    width, height = sceneGraph["width"], sceneGraph["height"]
    left, top = box["x"], box["y"]
    return (
        float(left) / width,
        float(top) / height,
        float(left + box["w"]) / width,
        float(top + box["h"]) / height,
    )


def computeGroundingScore(question, sceneGraph, attentionMap):
    """Return the grounding score of one question.

    The score is the total attention (probability) that falls on the regions
    the question and its answer refer to: for every gold region and every
    attention cell, the cell's attention times the fraction of the cell
    covered by the region.
    """
    annotations = question["annotations"]

    # gold regions: objects referenced by the question, then by the full answer
    regions = [
        getRegion(sceneGraph, objectId)
        for objectId in annotations["question"].values()
    ]
    regions.extend(
        getRegion(sceneGraph, objectId)
        for objectId in annotations["fullAnswer"].values()
    )

    # the whole image is a gold region as well if the question refers to the
    # whole scene
    # NOTE(review): if every semantic step is a dict, `"scene" in step` tests
    # its keys rather than its argument - confirm this is intended.
    for step in question["semantic"]:
        if "scene" in step:
            regions.append((0, 0, 1, 1))
            break

    # attention cells as (bbox, attention) pairs
    if args.objectFeatures:
        # Object-based attention is not supported: the original evaluation
        # code referenced an undefined variable at this point.
        raise Exception("Error in original eval code, refer to GQA paper.")
    side = range(args.mapSize)
    cells = [(getCell(i, j), attentionMap[i][j]) for i in side for j in side]

    # attention mass overlapping each gold region, summed over all regions
    return sum(
        attention * intersectionRate(cell, region)
        for region in regions
        for cell, attention in cells
    )


# Functions for distribution score


def chiSquare(goldDist, predictedDist):
    """Compute the chi-square statistic of predicted vs gold answer counts.

    The statistic is computed per question group, with the gold counts as the
    expected values, and then averaged over the groups, each group weighted by
    its number of gold answers.

    Args:
        goldDist: Mapping group -> {answer: gold count}.
        predictedDist: Mapping group -> {answer: predicted count}. Groups and
            answers missing from it count as never predicted.

    Returns:
        The weighted average statistic as a float; 0.0 when ``goldDist`` holds
        no answers at all (there is nothing to compare).
    """
    sumScore, sumOverall = 0, 0

    for group, goldCounts in goldDist.items():
        # .get() keeps a plain-dict histogram with a missing group from raising
        predictedCounts = predictedDist.get(group, {})
        score, overall = 0, 0

        for ans, e in goldCounts.items():
            o = predictedCounts.get(ans, 0)
            score += (float(o - e) ** 2) / e
            overall += e

        sumScore += score * overall
        sumOverall += overall

    # No gold answers (e.g. an empty evaluation subset): avoid dividing by zero
    if sumOverall == 0:
        return 0.0

    return float(sumScore) / sumOverall


# Main score computation
print(len(questions))
# Walk over all questions and accumulate their metrics
for qid, question in tqdm(questions.items()):
    # Scores are computed over the balanced dataset only
    # (more robust against cheating by making educated guesses)
    if not question["isBalanced"]:
        continue
    # Restrict to the requested subset of the tier, when one is given
    if include_ids is not None and qid not in include_ids:
        continue

    gold = question["answer"]
    predicted = predictions[qid]

    # NOTE: updateConsistency() reads `correct` as a module-level name.
    correct = predicted == gold
    score = to_score(correct)

    wordsNum = getWordsNum(question)
    stepsNum = getStepsNum(question)

    # Accuracy: overall, per length, per steps, per type, and open vs binary
    questionTypes = question["types"]
    scores["accuracy"].append(score)
    scores["accuracyPerLength"][wordsNum].append(score)
    scores["accuracyPerSteps"][stepsNum].append(score)
    scores["accuracyPerStructuralType"][questionTypes["structural"]].append(score)
    scores["accuracyPerSemanticType"][questionTypes["semantic"]].append(score)
    if questionTypes["structural"] == "query":
        answerType = "open"
    else:
        answerType = "binary"
    scores[answerType].append(score)

    # Validity: is the prediction among the valid answers of this question?
    valid = belongs(predicted, choices[qid]["valid"], question)
    scores["validity"].append(to_score(valid))

    # Plausibility: is the prediction among the plausible answers?
    plausible = belongs(predicted, choices[qid]["plausible"], question)
    scores["plausibility"].append(to_score(plausible))

    # Grounding (attention) score, only when attention maps were loaded
    if ATTENTIONS is not None:
        sceneGraph = scenes[question["imageId"]]
        groundingScore = computeGroundingScore(question, sceneGraph, ATTENTIONS[qid])
        if groundingScore is not None:
            scores["grounding"].append(groundingScore)

    # Histograms of gold and predicted answers, per global question group
    globalGroup = question["groups"]["global"]
    if globalGroup is not None:
        dist["gold"][globalGroup][gold] += 1
        dist["predicted"][globalGroup][predicted] += 1

    # Consistency over the entailed questions, when requested
    if args.consistency:
        updateConsistency(
            qid,
            question,
            questions,
            exclude=[] if exclude_ids is None else exclude_ids,
        )

# Compute distribution score
scores["distribution"] = chiSquare(dist["gold"], dist["predicted"]) / 100

# Average scores over all questions (in the balanced dataset) and print scores

# Scalar metrics, in printing order.
metrics = [
    "binary",
    "open",
    "accuracy",
    "consistency",
    "validity",
    "plausibility",
    "grounding",
    "distribution",
]

# Breakdown metrics as (key in `scores`, printed title).
detailedMetrics = [
    ("accuracyPerStructuralType", "Accuracy / structural type"),
    ("accuracyPerSemanticType", "Accuracy / semantic type"),
    ("accuracyPerSteps", "Accuracy / steps number"),
    ("accuracyPerLength", "Accuracy / words number"),
]

# Display names for abbreviated type keys.
subMetrics = {
    "attr": "attribute",
    "cat": "category",
    "global": "scene",
    "obj": "object",
    "rel": "relation",
}
# average
# Lists of per-question scores become percentages; "distribution" is already
# a number and is left as is.
for k in metrics:
    if isinstance(scores[k], list):
        scores[k] = avg(scores[k]) * 100

# Each breakdown bucket becomes a (percentage, number of questions) pair.
for k, _ in detailedMetrics:
    for t in scores[k]:
        scores[k][t] = avg(scores[k][t]) * 100, len(scores[k][t])

# print
print("")
for m in metrics:
    # skip grounding and consistency scores if not requested
    if m == "grounding" and not args.grounding:
        continue
    if m == "consistency" and not args.consistency:
        continue

    # print score
    print(
        "{title}: {score:.2f}{suffix}".format(
            title=m.capitalize(),
            score=scores[m],
            suffix=" (lower is better)" if m == "distribution" else "%",
        )
    )

for m, mPrintName in detailedMetrics:
    print("")
    # print metric title
    print("{}:".format(mPrintName))

    for t in sorted(scores[m]):
        # set sub-metric title
        # Type keys are strings and get a readable, capitalized name; step and
        # word counts are ints and are printed unchanged. (This used to test a
        # stale loop variable, so the readable names were never applied.)
        tName = t
        if isinstance(t, str):
            tName = subMetrics.get(t, t).capitalize()

        # print score
        accuracy, amount = scores[m][t]
        print(
            "  {title}: {score:.2f}{suffix} ({amount} questions)".format(
                title=tName, score=accuracy, suffix="%", amount=amount
            )
        )