Merge pull request #120 from allenai/fix-ci

run tests on Beaker with NFS cache

dirkgr authored Feb 20, 2023
2 parents e122c63 + 6e0ac0f commit e8b671e
Showing 11 changed files with 242 additions and 39 deletions.
90 changes: 80 additions & 10 deletions .github/workflows/main.yml
@@ -18,11 +18,15 @@ env:
  # Change this to invalidate existing cache.
  CACHE_PREFIX: v1
  PYTHON_PATH: ./
  DEFAULT_PYTHON: 3.9
  BEAKER_TOKEN: ${{ secrets.BEAKER_TOKEN }}
  BEAKER_WORKSPACE: ai2/catwalk-tests
  BEAKER_IMAGE: petew/catwalk-testing # to rebuild this image, run 'make docker-testing'

jobs:
checks:
name: Python ${{ matrix.python }} - ${{ matrix.task.name }}
runs-on: [self-hosted, CPU-only]
name: ${{ matrix.task.name }} (python ${{ matrix.python }})
runs-on: ubuntu-latest
timeout-minutes: 30
strategy:
fail-fast: false
@@ -32,15 +36,10 @@ jobs:
- name: Build
run: python setup.py check && python setup.py bdist_wheel sdist

- name: Test
run: pytest --forked -n4 -v --color=yes tests/
- name: Type check
run: mypy .

include:
- task:
name: Type check
run: mypy .
python: '3.10'

- task:
name: Docs
run: cd docs && make html SPHINXOPTS="-W --keep-going"
@@ -61,7 +60,7 @@ jobs:
${{ matrix.task.run }}
- name: Upload package distribution files
if: matrix.task.name == 'Build'
if: matrix.task.name == 'Build' && matrix.python == env.DEFAULT_PYTHON
uses: actions/upload-artifact@v3
with:
name: package
@@ -72,6 +71,77 @@
run: |
. .venv/bin/activate
pip uninstall -y ai2-catwalk
  tests:
    name: Test - suite ${{ matrix.test_suite.name }}
    runs-on: ubuntu-latest
    timeout-minutes: 30
    strategy:
      fail-fast: false
      matrix:
        test_suite:
          - name: A
            mark: "not suite_B and not suite_C and not suite_D"

          - name: B
            mark: "suite_B"

          - name: C
            mark: "suite_C"

          - name: D
            mark: "suite_D"
    steps:
      - name: Determine current commit SHA (pull request)
        if: github.event_name == 'pull_request'
        run: |
          echo "COMMIT_SHA=${{ github.event.pull_request.head.sha }}" >> $GITHUB_ENV
      - name: Determine current commit SHA (push)
        if: github.event_name != 'pull_request'
        run: |
          echo "COMMIT_SHA=$GITHUB_SHA" >> $GITHUB_ENV
      - name: Tests
        uses: allenai/beaker-run-action@v1.1
        with:
          spec: |
            version: v2
            description: Catwalk tests - suite ${{ matrix.test_suite.name }}
            tasks:
              - name: tests
                image:
                  beaker: ${{ env.BEAKER_IMAGE }}
                envVars:
                  - name: COMMIT_SHA
                    value: ${{ env.COMMIT_SHA }}
                command:
                  - "/entrypoint.sh"
                  - "pytest"
                  - "-v"
                  - "--forked"
                  - "-n4"
                  - "--durations=5"
                  - "--color=yes"
                  - "tests/"
                  - "-m"
                  - "${{ matrix.test_suite.mark }}"
                constraints:
                  cluster:
                    - ai2/general-cirrascale
                    - ai2/allennlp-cirrascale
                    - ai2/aristo-cirrascale
                    - ai2/mosaic-cirrascale
                    - ai2/s2-cirrascale
                context:
                  priority: preemptible
                datasets:
                  - mountPath: /root/.cache
                    source:
                      hostPath: /net/nfs/allennlp/catwalk-cache
                result:
                  path: /unused
          token: ${{ secrets.BEAKER_TOKEN }}
          workspace: ${{ env.BEAKER_WORKSPACE }}

  release:
    name: Release
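The new tests job fans the test run out across four Beaker tasks, each selected by the pytest mark expression in the matrix; tests with no suite marker fall into suite A via the "not suite_B and not suite_C and not suite_D" expression. A rough local equivalent of what one Beaker task executes (a sketch, assuming catwalk is installed with its dev extras so pytest-forked and pytest-xdist are available):

# Run suite A locally, mirroring the command in the Beaker spec above
pytest -v --forked -n4 --durations=5 --color=yes tests/ -m "not suite_B and not suite_C and not suite_D"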
16 changes: 16 additions & 0 deletions Dockerfile.test
@@ -0,0 +1,16 @@
# This Dockerfile is for building an image suitable for running catwalk's tests.
# There are no instruction lines in this Dockerfile that install catwalk. Instead, the entrypoint
# script handles installing catwalk from a particular commit at runtime, based on the environment
# variable "COMMIT_SHA". That way we don't need to rebuild and push the image each time we run
# tests, and we can be sure the dependencies are always up-to-date.
#
# To rebuild and push this image to Beaker, run 'make docker-testing'.

FROM ghcr.io/allenai/pytorch:1.13.0-cuda11.6-python3.9

COPY scripts/entrypoint.sh /entrypoint.sh
RUN chmod +x /entrypoint.sh

WORKDIR /testing

ENTRYPOINT ["/entrypoint.sh"]
8 changes: 8 additions & 0 deletions Makefile
@@ -7,3 +7,11 @@ docs :
run-checks :
	mypy .
	CUDA_VISIBLE_DEVICES='' pytest -v --color=yes --doctest-modules tests/ catwalk/


.PHONY : docker-testing
docker-testing :
	docker build -t catwalk-testing -f Dockerfile.test .
	beaker image create --workspace ai2/catwalk-tests --name catwalk-testing-tmp catwalk-testing
	beaker image delete petew/catwalk-testing || true
	beaker image rename petew/catwalk-testing-tmp catwalk-testing
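Publishing the rebuilt test image with this target requires a Beaker CLI that is logged in and has write access to the ai2/catwalk-tests workspace; a quick sanity check before running it might look like (a sketch, assuming the Beaker CLI is installed locally):

# Verify Beaker credentials, then rebuild and push the testing image
beaker account whoami && make docker-testing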
2 changes: 1 addition & 1 deletion catwalk/task.py
@@ -26,7 +26,7 @@


try:
    from functools import cache as memoize
    from functools import cache as memoize  # type: ignore
except ImportError:
    def memoize(user_function, /):  # type: ignore
        import functools
6 changes: 5 additions & 1 deletion pytest.ini
@@ -4,4 +4,8 @@ python_classes = Test* *Test
log_format = %(asctime)s - %(levelname)s - %(name)s - %(message)s
log_level = DEBUG
markers =
filterwarnings =
    suite_A: mark test to run with suite A
    suite_B: mark test to run with suite B
    suite_C: mark test to run with suite C
    suite_D: mark test to run with suite D
filterwarnings =
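With these markers registered, a single suite can also be inspected or run locally without going through Beaker; for example (assuming a local dev install of catwalk):

# List the tests that would run in suite C, without executing them
pytest --collect-only -q -m "suite_C" tests/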
26 changes: 26 additions & 0 deletions scripts/entrypoint.sh
@@ -0,0 +1,26 @@
#!/bin/bash

# Exit script if any commands fail.
set -e
set -o pipefail

# Check that the environment variable has been set correctly
if [ -z "$COMMIT_SHA" ]; then
echo >&2 'error: missing COMMIT_SHA environment variable'
exit 1
fi

# Upgrade pip
/opt/conda/bin/pip install --upgrade pip

# Clone and install catwalk.
git clone https://github.com/allenai/catwalk.git
cd catwalk
git checkout --quiet "$COMMIT_SHA"
/opt/conda/bin/pip install --no-cache-dir '.[dev]'

# Create directory for results.
mkdir -p /results

# Execute the arguments to this script as commands themselves, piping output into a log file.
exec "$@" 2>&1 | tee /results/out.log
47 changes: 41 additions & 6 deletions tests/test_all_tasks.py
@@ -3,21 +3,49 @@

import pytest

import catwalk.tasks
import catwalk.models
import catwalk.tasks
from catwalk.task import InstanceFormat
from catwalk.tasks.huggingface import HFMCInstance

from .util import suite_B, suite_C

# These tasks are known to fail for now due to an unreachable server.
known_failures = {
"lambada_mt_en",
"lambada_mt_fr",
"lambada_mt_de",
"lambada_mt_it",
"lambada_mt_es",
"triviaqa",
}

# There are too many P3 tasks, so we just pick one.
# MRQA dataset takes too long to load, so we skip it.
task_names = [task for task in catwalk.tasks.TASKS.keys() if not task.startswith("p3::") and not task.startswith("mrqa::")]
task_names = [
    pytest.param(
        task,
        id=task,
        marks=pytest.mark.xfail if task in known_failures else (),
    )
    for task in catwalk.tasks.TASKS.keys()
    if not task.startswith("p3::") and not task.startswith("mrqa::")
]
task_names.insert(0, "p3::wiki_qa_Is_This_True_")


@pytest.mark.parametrize("task_name", task_names)
@suite_B
def test_task(task_name: str):
    task = catwalk.tasks.TASKS[task_name]
    instances = next((task.get_split(split) for split in ["train", "validation", "test"] if task.has_split(split)), None)
    instances = next(
        (
            task.get_split(split)
            for split in ["train", "validation", "test"]
            if task.has_split(split)
        ),
        None,
    )
    if not instances:
        return
    for conversion in task.instance_conversions.values():
@@ -28,7 +56,9 @@ def test_task(task_name: str):
kwargs["num_fewshot"] = 0
try:
if "fewshot_instances" in signature.parameters:
kwargs["fewshot_instances"] = task.get_fewshot_instances(2, exceptions=instance)
kwargs["fewshot_instances"] = task.get_fewshot_instances(
2, exceptions=instance
)
except ValueError: # This task doesn't support fewshot for the chosen split.
kwargs = {}
assert conversion(instance, **kwargs) is not None
@@ -42,13 +72,18 @@ def test_task(task_name: str):


@pytest.mark.parametrize("task_name", mc_tasks)
@suite_C
def test_mc_tasks(task_name):
    task = catwalk.tasks.TASKS[task_name]
    for split in ["train", "validation", "test"]:
        if not task.has_split(split):
            continue
        for instance in task.get_split(split):
            mc_instance = cast(HFMCInstance, task.convert_instance(instance, InstanceFormat.HF_MC))
            mc_instance = cast(
                HFMCInstance, task.convert_instance(instance, InstanceFormat.HF_MC)
            )
            if mc_instance.correct_answer_index is not None:
                assert mc_instance.correct_answer_index >= 0
                assert mc_instance.correct_answer_index < len(mc_instance.answer_choices)
                assert mc_instance.correct_answer_index < len(
                    mc_instance.answer_choices
                )
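The xfail marks above mean the known-failure tasks are reported rather than breaking the suite. Once the upstream server is reachable again, an individual task can be re-checked by its parametrize id, for example (a sketch, using the task name as the id as defined above):

# Re-run a single known-failure task by its test id and report its real outcome
pytest -v --runxfail "tests/test_all_tasks.py::test_task[triviaqa]"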
28 changes: 18 additions & 10 deletions tests/test_spotchecks.py
@@ -1,25 +1,33 @@
import sys

import pytest

import catwalk.__main__
from catwalk.steps import PredictStep, CalculateMetricsStep
from catwalk.steps import CalculateMetricsStep, PredictStep

from .util import suite_C


@suite_C
def test_squad():
    args = catwalk.__main__._parser.parse_args([
        "--model", "bert-base-uncased",
        "--task", "squad",
        "--split", "validation",
        "--limit", "100"
    ])
    args = catwalk.__main__._parser.parse_args(
        [
            "--model",
            "bert-base-uncased",
            "--task",
            "squad",
            "--split",
            "validation",
            "--limit",
            "100",
        ]
    )
    catwalk.__main__.main(args)


@pytest.mark.parametrize("task", ["mnli", "cola", "rte"])
@suite_C
def test_gpt2_performance(task: str):
    model = "rc::gpt2"
    predictions = PredictStep(model=model, task=task, limit=100)
    metrics = CalculateMetricsStep(model=model, task=task, predictions=predictions)
    results = metrics.result()
    assert results['relative_improvement'] > 0
    assert results["relative_improvement"] > 0
3 changes: 3 additions & 0 deletions tests/test_steps.py
@@ -3,6 +3,8 @@
from catwalk import MODELS
from catwalk.steps import PredictStep, CalculateMetricsStep

from .util import suite_A

task_names = [
"arc_challenge",
"boolq",
@@ -40,6 +42,7 @@
params = params + generation_params

@pytest.mark.parametrize("task_name,model_name", params)
@suite_A
def test_task_eval(task_name: str, model_name: str):
    if MODELS[model_name].supports_fewshot:
        predict_kwargs = {"num_shots": 3}