From 3dbb4e573c8e1c76fc50183d6c974c8bfb37a354 Mon Sep 17 00:00:00 2001 From: Ivan Butygin Date: Tue, 7 Jan 2025 01:31:24 +0100 Subject: [PATCH] [TKW] Distribute gpu tests (#353) Signed-off-by: Ivan Butygin --- .github/workflows/ci-tk.yaml | 2 +- iree/turbine/kernel/wave/utils.py | 12 +++++++++++- tests/conftest.py | 26 ++++++++++++++++++++++++++ 3 files changed, 38 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci-tk.yaml b/.github/workflows/ci-tk.yaml index 97af29cf..d2ad0a03 100644 --- a/.github/workflows/ci-tk.yaml +++ b/.github/workflows/ci-tk.yaml @@ -76,7 +76,7 @@ jobs: run: | pip install --no-compile -r pytorch-rocm-requirements.txt export WAVE_RUN_E2E_TESTS=1 - WAVE_CACHE_ON=0 pytest -n 4 --capture=tee-sys -vv ./tests/kernel/wave/ + WAVE_CACHE_ON=0 pytest -n 8 --capture=tee-sys -vv --gpu-distribute 8 ./tests/kernel/wave/ - name: Run e2e tests on AMD GPU MI250 if: "contains(matrix.os, 'mi250') && !cancelled()" diff --git a/iree/turbine/kernel/wave/utils.py b/iree/turbine/kernel/wave/utils.py index 5fb4ff30..90c8b0ad 100644 --- a/iree/turbine/kernel/wave/utils.py +++ b/iree/turbine/kernel/wave/utils.py @@ -1053,8 +1053,18 @@ def all_equal(input_list: list[Any]) -> bool: return all(elem == input_list[0] for elem in input_list) +DEFAULT_GPU_DEVICE = None + + +def get_default_gpu_device_name() -> str: + if DEFAULT_GPU_DEVICE is None: + return "cuda" + + return f"cuda:{DEFAULT_GPU_DEVICE}" + + def get_default_device() -> str: - return "cuda" if torch.cuda.is_available() else "cpu" + return get_default_gpu_device_name() if torch.cuda.is_available() else "cpu" def to_default_device(tensor: torch.Tensor) -> torch.Tensor: diff --git a/tests/conftest.py b/tests/conftest.py index ccbd0088..22de80fa 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -17,6 +17,12 @@ def pytest_addoption(parser): default=None, help="save performance info into provided directory, filename based on current test name", ) + parser.addoption( + "--gpu-distribute", + type=int, + default=0, + help="Distribute over N gpu devices when running with pytest-xdist", + ) def pytest_configure(config): @@ -28,11 +34,31 @@ def pytest_configure(config): ) +def _set_default_device(config): + distribute = int(config.getoption("--gpu-distribute")) + if distribute < 1: + return + + if not hasattr(config, "workerinput"): + return + + worker_id = config.workerinput["workerid"] + if not worker_id.startswith("gw"): + return + + device_id = int(worker_id[2:]) % int(distribute) + + import iree.turbine.kernel.wave.utils as utils + + utils.DEFAULT_GPU_DEVICE = device_id + + def _has_marker(item, marker): return next(item.iter_markers(marker), None) is not None def pytest_collection_modifyitems(config, items): + _set_default_device(config) run_perf = config.getoption("--runperf") for item in items: is_validate_only = _has_marker(item, "validate_only")