From a98edce9caf8e8481f4105cb26b57d5d0429f963 Mon Sep 17 00:00:00 2001 From: Sebastian Ebert <eberts@google.com> Date: Wed, 4 May 2022 08:24:12 -0700 Subject: [PATCH] Add TSV loader dataset. This is one of multiple commits bringing the input salience demo to life. With this demo we're showing the results of the paper '"Will You Find These Shortcuts?" A Protocol for Evaluating the Faithfulness of Input Salience Methods for Text Classification' [https://arxiv.org/abs/2111.07367]. In this commit we introduce a class that loads data from a 2 column TSV file. PiperOrigin-RevId: 446463349 --- lit_nlp/examples/is_eval/datasets.py | 36 ++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) create mode 100644 lit_nlp/examples/is_eval/datasets.py diff --git a/lit_nlp/examples/is_eval/datasets.py b/lit_nlp/examples/is_eval/datasets.py new file mode 100644 index 00000000..99e71ab4 --- /dev/null +++ b/lit_nlp/examples/is_eval/datasets.py @@ -0,0 +1,36 @@ +"""Text classification dataset for binary, single input data.""" +from lit_nlp.api import dataset as lit_dataset +from lit_nlp.api import types as lit_types +import pandas as pd + + +class SingleInputClassificationFromTSV(lit_dataset.Dataset): + """TSV data loader for files having a single input text and a label. + + Files must be in TSV format with 2 columns in this order: + 1. Input text. + 2. Numeric label. + + Exported examples have 2 output keys: "sentence" and "label". + """ + + LABELS = ["0", "1"] + + def __init__(self, path: str): + self._examples = self.load_datapoints(path) + + def load_datapoints(self, path: str): + with open(path) as fd: + df = pd.read_csv(fd, sep="\t", header=None, names=["sentence", "label"]) + # pylint: disable=g-complex-comprehension + return [{ + "sentence": row["sentence"], + "label": self.LABELS[row["label"]], + } for _, row in df.iterrows()] + # pylint: enable=g-complex-comprehension + + def spec(self) -> lit_types.Spec: + return { + "sentence": lit_types.TextSegment(), + "label": lit_types.CategoryLabel(vocab=self.LABELS), + }