Add TSV loader dataset.

This is one of several commits bringing the input salience demo to life. The demo shows the results of the paper '"Will You Find These Shortcuts?" A Protocol for Evaluating the Faithfulness of Input Salience Methods for Text Classification' [https://arxiv.org/abs/2111.07367]. This commit introduces a class that loads data from a two-column TSV file. PiperOrigin-RevId: 446463349
PAIR-code · May 4, 2022 · a98edce · a98edce
1 parent 085bb67
commit a98edce
Showing 1 changed file with 36 additions and 0 deletions.
diff --git a/lit_nlp/examples/is_eval/datasets.py b/lit_nlp/examples/is_eval/datasets.py
@@ -0,0 +1,36 @@
+"""Text classification dataset for binary, single input data."""
+from lit_nlp.api import dataset as lit_dataset
+from lit_nlp.api import types as lit_types
+import pandas as pd
+
+
+class SingleInputClassificationFromTSV(lit_dataset.Dataset):
+  """Loads (text, binary label) examples from a headerless 2-column TSV file.
+
+  Column 1 holds the input text, column 2 an integer index into LABELS (0 or
+  1). The file is read as UTF-8. Each example is exported as a dict with the
+  keys "sentence" (the text) and "label" (the matching LABELS entry).
+  """
+
+  LABELS = ["0", "1"]
+
+  def __init__(self, path: str):
+    """Loads all examples from the TSV file at `path`."""
+    self._examples = self.load_datapoints(path)
+
+  def load_datapoints(self, path: str):
+    """Parses `path` into example dicts; raises IndexError on a bad label."""
+    with open(path, encoding="utf-8") as fd:
+      # Keep sentences verbatim: no NaN for "NA"/"null"/"", no numeric casts.
+      df = pd.read_csv(fd, sep="\t", header=None, names=["sentence", "label"],
+                       dtype={"sentence": str}, keep_default_na=False)
+    # Reject negative labels too; they would silently wrap to the wrong class.
+    if not df["label"].between(0, len(self.LABELS) - 1).all():
+      raise IndexError(f"Labels in {path} must be indices into {self.LABELS}.")
+    return [{"sentence": sentence, "label": self.LABELS[label]}
+            for sentence, label in zip(df["sentence"], df["label"])]
+
+  def spec(self) -> lit_types.Spec:
+    """Returns the LIT types of the exported "sentence" and "label" fields."""
+    return {"sentence": lit_types.TextSegment(),
+            "label": lit_types.CategoryLabel(vocab=self.LABELS)}