Merge pull request #120 from allenai/fix-ci

run tests on Beaker with NFS cache

dirkgr authored Feb 20, 2023
2 parents e122c63 + 6e0ac0f commit e8b671e
Showing 11 changed files with 242 additions and 39 deletions.
90 changes: 80 additions & 10 deletions .github/workflows/main.yml
@@ -18,11 +18,15 @@ env:
  # Change this to invalidate existing cache.
  CACHE_PREFIX: v1
  PYTHON_PATH: ./
  DEFAULT_PYTHON: 3.9
  BEAKER_TOKEN: ${{ secrets.BEAKER_TOKEN }}
  BEAKER_WORKSPACE: ai2/catwalk-tests
  BEAKER_IMAGE: petew/catwalk-testing # to rebuild this image, run 'make docker-testing'

jobs:
checks:
name: Python ${{ matrix.python }} - ${{ matrix.task.name }}
runs-on: [self-hosted, CPU-only]
name: ${{ matrix.task.name }} (python ${{ matrix.python }})
runs-on: ubuntu-latest
timeout-minutes: 30
strategy:
fail-fast: false
@@ -32,15 +36,10 @@ jobs:
- name: Build
run: python setup.py check && python setup.py bdist_wheel sdist

- name: Test
run: pytest --forked -n4 -v --color=yes tests/
- name: Type check
run: mypy .

include:
- task:
name: Type check
run: mypy .
python: '3.10'

- task:
name: Docs
run: cd docs && make html SPHINXOPTS="-W --keep-going"
@@ -61,7 +60,7 @@ jobs:
${{ matrix.task.run }}
- name: Upload package distribution files
if: matrix.task.name == 'Build'
if: matrix.task.name == 'Build' && matrix.python == env.DEFAULT_PYTHON
uses: actions/upload-artifact@v3
with:
name: package
@@ -72,6 +71,77 @@
run: |
. .venv/bin/activate
pip uninstall -y ai2-catwalk
  tests:
    name: Test - suite ${{ matrix.test_suite.name }}
    runs-on: ubuntu-latest
    timeout-minutes: 30
    strategy:
      fail-fast: false
      matrix:
        test_suite:
          - name: A
            mark: "not suite_B and not suite_C and not suite_D"

          - name: B
            mark: "suite_B"

          - name: C
            mark: "suite_C"

          - name: D
            mark: "suite_D"
    steps:
      - name: Determine current commit SHA (pull request)
        if: github.event_name == 'pull_request'
        run: |
          echo "COMMIT_SHA=${{ github.event.pull_request.head.sha }}" >> $GITHUB_ENV
      - name: Determine current commit SHA (push)
        if: github.event_name != 'pull_request'
        run: |
          echo "COMMIT_SHA=$GITHUB_SHA" >> $GITHUB_ENV
      - name: Tests
        uses: allenai/beaker-run-action@v1.1
        with:
          spec: |
            version: v2
            description: Catwalk tests - suite ${{ matrix.test_suite.name }}
            tasks:
              - name: tests
                image:
                  beaker: ${{ env.BEAKER_IMAGE }}
                envVars:
                  - name: COMMIT_SHA
                    value: ${{ env.COMMIT_SHA }}
                command:
                  - "/entrypoint.sh"
                  - "pytest"
                  - "-v"
                  - "--forked"
                  - "-n4"
                  - "--durations=5"
                  - "--color=yes"
                  - "tests/"
                  - "-m"
                  - "${{ matrix.test_suite.mark }}"
                constraints:
                  cluster:
                    - ai2/general-cirrascale
                    - ai2/allennlp-cirrascale
                    - ai2/aristo-cirrascale
                    - ai2/mosaic-cirrascale
                    - ai2/s2-cirrascale
                context:
                  priority: preemptible
                datasets:
                  - mountPath: /root/.cache
                    source:
                      hostPath: /net/nfs/allennlp/catwalk-cache
                result:
                  path: /unused
          token: ${{ secrets.BEAKER_TOKEN }}
          workspace: ${{ env.BEAKER_WORKSPACE }}

  release:
    name: Release
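The new tests job fans the test run out across four Beaker tasks, each selected by the pytest mark expression in the matrix; tests with no suite marker fall into suite A via the "not suite_B and not suite_C and not suite_D" expression. A rough local equivalent of what one Beaker task executes (a sketch, assuming catwalk is installed with its dev extras so pytest-forked and pytest-xdist are available):

# Run suite A locally, mirroring the command in the Beaker spec above
pytest -v --forked -n4 --durations=5 --color=yes tests/ -m "not suite_B and not suite_C and not suite_D"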
16 changes: 16 additions & 0 deletions Dockerfile.test
@@ -0,0 +1,16 @@
# This Dockerfile is for building an image suitable for running catwalk's tests.
# There are no instruction lines in this Dockerfile that install catwalk. Instead, the entrypoint
# script handles installing catwalk from a particular commit at runtime, based on the environment
# variable "COMMIT_SHA". That way we don't need to rebuild and push the image each time we run
# tests, and we can be sure the dependencies are always up-to-date.
#
# To rebuild and push this image to Beaker, run 'make docker-testing'.

FROM ghcr.io/allenai/pytorch:1.13.0-cuda11.6-python3.9

COPY scripts/entrypoint.sh /entrypoint.sh
RUN chmod +x /entrypoint.sh

WORKDIR /testing

ENTRYPOINT ["/entrypoint.sh"]
8 changes: 8 additions & 0 deletions Makefile
@@ -7,3 +7,11 @@ docs :
run-checks :
	mypy .
	CUDA_VISIBLE_DEVICES='' pytest -v --color=yes --doctest-modules tests/ catwalk/


.PHONY : docker-testing
docker-testing :
	docker build -t catwalk-testing -f Dockerfile.test .
	beaker image create --workspace ai2/catwalk-tests --name catwalk-testing-tmp catwalk-testing
	beaker image delete petew/catwalk-testing || true
	beaker image rename petew/catwalk-testing-tmp catwalk-testing
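Publishing the rebuilt test image with this target requires a Beaker CLI that is logged in and has write access to the ai2/catwalk-tests workspace; a quick sanity check before running it might look like (a sketch, assuming the Beaker CLI is installed locally):

# Verify Beaker credentials, then rebuild and push the testing image
beaker account whoami && make docker-testing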
2 changes: 1 addition & 1 deletion catwalk/task.py
@@ -26,7 +26,7 @@


try:
    from functools import cache as memoize
    from functools import cache as memoize  # type: ignore
except ImportError:
    def memoize(user_function, /):  # type: ignore
        import functools
6 changes: 5 additions & 1 deletion pytest.ini
@@ -4,4 +4,8 @@ python_classes = Test* *Test
log_format = %(asctime)s - %(levelname)s - %(name)s - %(message)s
log_level = DEBUG
markers =
filterwarnings =
    suite_A: mark test to run with suite A
    suite_B: mark test to run with suite B
    suite_C: mark test to run with suite C
    suite_D: mark test to run with suite D
filterwarnings =
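With these markers registered, a single suite can also be inspected or run locally without going through Beaker; for example (assuming a local dev install of catwalk):

# List the tests that would run in suite C, without executing them
pytest --collect-only -q -m "suite_C" tests/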
26 changes: 26 additions & 0 deletions scripts/entrypoint.sh
@@ -0,0 +1,26 @@
#!/bin/bash

# Exit script if any commands fail.
set -e
set -o pipefail

# Check that the environment variable has been set correctly
if [ -z "$COMMIT_SHA" ]; then
echo >&2 'error: missing COMMIT_SHA environment variable'
exit 1
fi

# Upgrade pip
/opt/conda/bin/pip install --upgrade pip

# Clone and install catwalk.
git clone https://github.com/allenai/catwalk.git
cd catwalk
git checkout --quiet "$COMMIT_SHA"
/opt/conda/bin/pip install --no-cache-dir '.[dev]'

# Create directory for results.
mkdir -p /results

# Execute the arguments to this script as commands themselves, piping output into a log file.
exec "$@" 2>&1 | tee /results/out.log
47 changes: 41 additions & 6 deletions tests/test_all_tasks.py
@@ -3,21 +3,49 @@

import pytest

import catwalk.tasks
import catwalk.models
import catwalk.tasks
from catwalk.task import InstanceFormat
from catwalk.tasks.huggingface import HFMCInstance

from .util import suite_B, suite_C

# These tasks are known to fail for now due to an unreachable server.
known_failures = {
"lambada_mt_en",
"lambada_mt_fr",
"lambada_mt_de",
"lambada_mt_it",
"lambada_mt_es",
"triviaqa",
}

# There are too many P3 tasks, so we just pick one.
# MRQA dataset takes too long to load, so we skip it.
task_names = [task for task in catwalk.tasks.TASKS.keys() if not task.startswith("p3::") and not task.startswith("mrqa::")]
task_names = [
    pytest.param(
        task,
        id=task,
        marks=pytest.mark.xfail if task in known_failures else (),
    )
    for task in catwalk.tasks.TASKS.keys()
    if not task.startswith("p3::") and not task.startswith("mrqa::")
]
task_names.insert(0, "p3::wiki_qa_Is_This_True_")


@pytest.mark.parametrize("task_name", task_names)
@suite_B
def test_task(task_name: str):
    task = catwalk.tasks.TASKS[task_name]
    instances = next((task.get_split(split) for split in ["train", "validation", "test"] if task.has_split(split)), None)
    instances = next(
        (
            task.get_split(split)
            for split in ["train", "validation", "test"]
            if task.has_split(split)
        ),
        None,
    )
    if not instances:
        return
    for conversion in task.instance_conversions.values():
@@ -28,7 +56,9 @@ def test_task(task_name: str):
kwargs["num_fewshot"] = 0
try:
if "fewshot_instances" in signature.parameters:
kwargs["fewshot_instances"] = task.get_fewshot_instances(2, exceptions=instance)
kwargs["fewshot_instances"] = task.get_fewshot_instances(
2, exceptions=instance
)
except ValueError: # This task doesn't support fewshot for the chosen split.
kwargs = {}
assert conversion(instance, **kwargs) is not None
@@ -42,13 +72,18 @@ def test_task(task_name: str):


@pytest.mark.parametrize("task_name", mc_tasks)
@suite_C
def test_mc_tasks(task_name):
    task = catwalk.tasks.TASKS[task_name]
    for split in ["train", "validation", "test"]:
        if not task.has_split(split):
            continue
        for instance in task.get_split(split):
            mc_instance = cast(HFMCInstance, task.convert_instance(instance, InstanceFormat.HF_MC))
            mc_instance = cast(
                HFMCInstance, task.convert_instance(instance, InstanceFormat.HF_MC)
            )
            if mc_instance.correct_answer_index is not None:
                assert mc_instance.correct_answer_index >= 0
                assert mc_instance.correct_answer_index < len(mc_instance.answer_choices)
                assert mc_instance.correct_answer_index < len(
                    mc_instance.answer_choices
                )
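The xfail marks above mean the known-failure tasks are reported rather than breaking the suite. Once the upstream server is reachable again, an individual task can be re-checked by its parametrize id, for example (a sketch, using the task name as the id as defined above):

# Re-run a single known-failure task by its test id and report its real outcome
pytest -v --runxfail "tests/test_all_tasks.py::test_task[triviaqa]"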
28 changes: 18 additions & 10 deletions tests/test_spotchecks.py
@@ -1,25 +1,33 @@
import sys

import pytest

import catwalk.__main__
from catwalk.steps import PredictStep, CalculateMetricsStep
from catwalk.steps import CalculateMetricsStep, PredictStep

from .util import suite_C


@suite_C
def test_squad():
    args = catwalk.__main__._parser.parse_args([
        "--model", "bert-base-uncased",
        "--task", "squad",
        "--split", "validation",
        "--limit", "100"
    ])
    args = catwalk.__main__._parser.parse_args(
        [
            "--model",
            "bert-base-uncased",
            "--task",
            "squad",
            "--split",
            "validation",
            "--limit",
            "100",
        ]
    )
    catwalk.__main__.main(args)


@pytest.mark.parametrize("task", ["mnli", "cola", "rte"])
@suite_C
def test_gpt2_performance(task: str):
    model = "rc::gpt2"
    predictions = PredictStep(model=model, task=task, limit=100)
    metrics = CalculateMetricsStep(model=model, task=task, predictions=predictions)
    results = metrics.result()
    assert results['relative_improvement'] > 0
    assert results["relative_improvement"] > 0
3 changes: 3 additions & 0 deletions tests/test_steps.py
@@ -3,6 +3,8 @@
from catwalk import MODELS
from catwalk.steps import PredictStep, CalculateMetricsStep

from .util import suite_A

task_names = [
"arc_challenge",
"boolq",
@@ -40,6 +42,7 @@
params = params + generation_params

@pytest.mark.parametrize("task_name,model_name", params)
@suite_A
def test_task_eval(task_name: str, model_name: str):
    if MODELS[model_name].supports_fewshot:
        predict_kwargs = {"num_shots": 3}