From ace7f64234edb9f45c3557c7e969fb003c60b01f Mon Sep 17 00:00:00 2001 From: Songki Choi Date: Tue, 5 Mar 2024 10:05:18 +0900 Subject: [PATCH] Add perf benchmark tests for v2 (#3004) * Add perf tests for all tasks * Update perf GH workflow * Parameterize accuracy | efficiency tests * Log raw data csv * Subprocess run & csv summary * Align with v1 --- .github/workflows/perf_accuracy.yaml | 96 ++++++ .github/workflows/perf_efficiency.yaml | 80 +++++ .github/workflows/perf_test.yaml | 31 -- tests/perf/__init__.py | 4 + tests/perf/benchmark.py | 246 ++++++++++++++++ tests/perf/conftest.py | 357 +++++++++++++++++++++++ tests/perf/test_anomaly.py | 4 + tests/perf/test_classification.py | 296 +++++++++++++++++++ tests/perf/test_detection.py | 126 ++++++++ tests/perf/test_instance_segmentation.py | 217 ++++++++++++++ tests/perf/test_semantic_segmentation.py | 111 +++++++ tests/perf/test_visual_prompting.py | 178 +++++++++++ tox.ini | 4 +- 13 files changed, 1717 insertions(+), 33 deletions(-) create mode 100644 .github/workflows/perf_accuracy.yaml create mode 100644 .github/workflows/perf_efficiency.yaml delete mode 100644 .github/workflows/perf_test.yaml create mode 100644 tests/perf/__init__.py create mode 100644 tests/perf/benchmark.py create mode 100644 tests/perf/conftest.py create mode 100644 tests/perf/test_anomaly.py create mode 100644 tests/perf/test_classification.py create mode 100644 tests/perf/test_detection.py create mode 100644 tests/perf/test_instance_segmentation.py create mode 100644 tests/perf/test_semantic_segmentation.py create mode 100644 tests/perf/test_visual_prompting.py diff --git a/.github/workflows/perf_accuracy.yaml b/.github/workflows/perf_accuracy.yaml new file mode 100644 index 00000000000..68693179118 --- /dev/null +++ b/.github/workflows/perf_accuracy.yaml @@ -0,0 +1,96 @@ +name: Perf-Accuracy Benchmark + +on: + workflow_dispatch: # run on request (no need for PR) + inputs: + model-category: + type: choice + description: Model category to run benchmark + options: + - default # speed, balance, accuracy models only + - all # default + other models + default: default + data-size: + type: choice + description: Dataset size to run benchmark + options: + - small + - medium + - large + - all + default: all + num-repeat: + description: Overrides default per-data-size number of repeat setting + default: 0 + num-epoch: + description: Overrides default per-model number of epoch setting + default: 0 + eval-upto: + type: choice + description: The last operation to evaluate. 'optimize' means all. + options: + - train + - export + - optimize + default: optimize + pytest-args: + type: string + description: | + Additional perf-benchmark pytest arguments. + "-k detection" -> detection task only + "--dry-run" -> print command w/o execution. + data-root: + type: string + description: Root directory containing validation data in CI server. 
+ default: "/home/validation/data/v2/" + +jobs: + Perf-Accuracy-Benchmark: + strategy: + fail-fast: false + matrix: + include: + - task-short: "ano" + task: "anomaly" + - task-short: "cls" + task: "classification" + - task-short: "det" + task: "detection" + - task-short: "isg" + task: "instance_segmentation" + - task-short: "ssg" + task: "semantic_segmentation" + - task-short: "vsp" + task: "visual_prompting" + name: Perf-Accuracy-Benchmark-${{ matrix.task-short }} + runs-on: [self-hosted, linux, x64, dmount] + timeout-minutes: 8640 + steps: + - name: Checkout repository + uses: actions/checkout@v3 + - name: Install Python + uses: actions/setup-python@v4 + with: + python-version: "3.10" + - name: Install tox + run: python -m pip install tox + - name: Run Performance Test + run: > + tox -vv -e perf-benchmark -- tests/perf/test_${{ matrix.task }}.py ${{ inputs.pytest-args }} + --benchmark-type accuracy + --model-category ${{ inputs.model-category }} + --data-root ${{ inputs.data-root }} + --data-size ${{ inputs.data-size }} + --num-repeat ${{ inputs.num-repeat }} + --num-epoch ${{ inputs.num-epoch }} + --eval-upto ${{ inputs.eval-upto }} + --summary-csv .tox/perf-accuracy-benchmark-${{ matrix.task-short }}.csv + --mlflow-tracking-uri ${{ vars.MLFLOW_TRACKING_SERVER_URI }} + --user-name ${{ github.triggering_actor }} + - name: Upload test results + uses: actions/upload-artifact@v3 + with: + name: perf-accuracy-benchmark-${{ matrix.task-short }} + path: .tox/perf-*.csv + # Use always() to always run this step to publish test results when there are test failures + if: ${{ always() }} diff --git a/.github/workflows/perf_efficiency.yaml b/.github/workflows/perf_efficiency.yaml new file mode 100644 index 00000000000..070b3c0837b --- /dev/null +++ b/.github/workflows/perf_efficiency.yaml @@ -0,0 +1,80 @@ +name: Perf-Efficiency Benchmark + +on: + workflow_dispatch: # run on request (no need for PR) + inputs: + model-category: + type: choice + description: Model category to run benchmark + options: + - default # speed, balance, accuracy models only + - all # default + other models + default: default + data-size: + type: choice + description: Dataset size to run benchmark + options: + - small + - medium + - large + - all + default: medium + num-repeat: + description: Overrides default per-data-size number of repeat setting + default: 1 + num-epoch: + description: Overrides default per-model number of epoch setting + default: 2 + eval-upto: + type: choice + description: The last operation to evaluate. 'optimize' means all. + options: + - train + - export + - optimize + default: optimize + pytest-args: + type: string + description: | + Additional perf-benchmark pytest arguments. + "-k detection" -> detection task only + "--dry-run" -> print command w/o execution. + data-root: + type: string + description: Root directory containing validation data in CI server. 
+        default: "/home/validation/data/v2/"
+
+jobs:
+  Perf-Efficiency-Benchmark:
+    name: Perf-Efficiency-Benchmark-all
+    runs-on: [self-hosted, linux, x64, dmount]
+    timeout-minutes: 8640
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v3
+      - name: Install Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: "3.10"
+      - name: Install tox
+        run: python -m pip install tox
+      - name: Run Performance Test
+        run: >
+          tox -vv -e perf-benchmark -- tests/perf ${{ inputs.pytest-args }}
+          --benchmark-type efficiency
+          --model-category ${{ inputs.model-category }}
+          --data-root ${{ inputs.data-root }}
+          --data-size ${{ inputs.data-size }}
+          --num-repeat ${{ inputs.num-repeat }}
+          --num-epoch ${{ inputs.num-epoch }}
+          --eval-upto ${{ inputs.eval-upto }}
+          --summary-csv .tox/perf-efficiency-benchmark-all.csv
+          --mlflow-tracking-uri ${{ vars.MLFLOW_TRACKING_SERVER_URI }}
+          --user-name ${{ github.triggering_actor }}
+      - name: Upload test results
+        uses: actions/upload-artifact@v3
+        with:
+          name: perf-efficiency-benchmark-all
+          path: .tox/perf-*.csv
+        # Use always() to always run this step to publish test results when there are test failures
+        if: ${{ always() }}
diff --git a/.github/workflows/perf_test.yaml b/.github/workflows/perf_test.yaml
deleted file mode 100644
index 0a210f66dec..00000000000
--- a/.github/workflows/perf_test.yaml
+++ /dev/null
@@ -1,31 +0,0 @@
-name: V2 Performance Test
-
-on:
-  schedule:
-    - cron: "0 16 * * 1-5" # currently run daily on 16:00 UTC; [TODO]: run weekly
-  workflow_dispatch: # run on request
-
-jobs:
-  Performance-Test:
-    runs-on: [self-hosted, linux, x64, dmount]
-    strategy:
-      fail-fast: false
-      matrix:
-        include:
-          - python-version: "3.10"
-            tox-env: "py310"
-    name: Performance-Test-Py${{ matrix.python-version }}
-    concurrency:
-      group: ${{ github.workflow }}-Performance-${{ github.event.pull_request.number || github.ref }}
-      cancel-in-progress: true
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v3
-      - name: Install Python
-        uses: actions/setup-python@v4
-        with:
-          python-version: ${{ matrix.python-version }}
-      - name: Install tox
-        run: python -m pip install tox
-      - name: Run Performance Test
-        run: tox -vv -e performance-test -- --mlflow-tracking-uri ${{ vars.MLFLOW_TRACKING_SERVER_URI }} --user-name ${{ vars.USER_NAME }} --dataset-root-dir /home/validation/data/v2
diff --git a/tests/perf/__init__.py b/tests/perf/__init__.py
new file mode 100644
index 00000000000..d832bb41bf2
--- /dev/null
+++ b/tests/perf/__init__.py
@@ -0,0 +1,4 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+"""OTX performance benchmark tests."""
diff --git a/tests/perf/benchmark.py b/tests/perf/benchmark.py
new file mode 100644
index 00000000000..29974b1cb35
--- /dev/null
+++ b/tests/perf/benchmark.py
@@ -0,0 +1,246 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+"""OTX benchmark runner."""
+
+from __future__ import annotations
+
+import gc
+import logging
+import os
+import subprocess
+from dataclasses import dataclass
+from pathlib import Path
+
+import pandas as pd
+
+log = logging.getLogger(__name__)
+
+
+class Benchmark:
+    """Benchmark runner for OTX 2.x.
+
+    Args:
+        benchmark_type (str): 'accuracy' or 'efficiency'
+        data_root (str): Path to the root of dataset directories. Defaults to './data'.
+        output_root (str): Output root directory for logs and results. Defaults to './otx-benchmark'.
+        criteria (list[Criterion]): Benchmark criteria settings
+        num_epoch (int): Overrides the per-model default number of epoch settings.
+            Defaults to 0, which means no overriding.
+        num_repeat (int): Number of trials with different random seeds, which would be set
+            as range(0, num_repeat). Defaults to 1.
+        eval_upto (str): The last serial operation to evaluate. Choose one of ('train', 'export', 'optimize').
+            Operations include the preceding ones.
+            e.g. eval up to 'optimize': train -> eval -> export -> eval -> optimize -> eval
+            Defaults to 'train'.
+        tags (dict, optional): Key-value pair metadata for the experiment.
+        dry_run (bool): Whether to just print the OTX command without execution. Defaults to False.
+        deterministic (bool): Whether to turn on deterministic training mode. Defaults to False.
+        accelerator (str): Accelerator device on which to run the benchmark. Defaults to 'gpu'.
+    """
+
+    @dataclass
+    class Model:
+        """Benchmark model."""
+
+        task: str
+        name: str
+        category: str
+
+    @dataclass
+    class Dataset:
+        """Benchmark dataset."""
+
+        name: str
+        path: Path
+        size: str
+        data_format: str
+        num_classes: int
+        num_repeat: int = 1
+        extra_overrides: dict | None = None
+
+    @dataclass
+    class Criterion:
+        """Benchmark criterion."""
+
+        name: str
+        summary: str
+        compare: str
+        margin: float
+
+    def __init__(
+        self,
+        benchmark_type: str = "accuracy",
+        data_root: Path = Path("data"),
+        output_root: Path = Path("otx-benchmark"),
+        criteria: list[Criterion] | None = None,
+        num_epoch: int = 0,
+        num_repeat: int = 1,
+        eval_upto: str = "train",
+        tags: dict[str, str] | None = None,
+        dry_run: bool = False,
+        deterministic: bool = False,
+        accelerator: str = "gpu",
+    ):
+        self.benchmark_type = benchmark_type
+        self.data_root = data_root
+        self.output_root = output_root
+        self.criteria = criteria
+        self.num_epoch = num_epoch
+        self.num_repeat = num_repeat
+        self.eval_upto = eval_upto
+        self.tags = tags or {}
+        self.dry_run = dry_run
+        self.deterministic = deterministic
+        self.accelerator = accelerator
+
+        if num_epoch == 0 and benchmark_type == "efficiency":
+            self.num_epoch = 2
+
+    def run(
+        self,
+        model: Model,
+        dataset: Dataset,
+    ) -> pd.DataFrame | None:
+        """Run configured benchmark with given dataset and model and return the result.
+
+        Args:
+            model (Model): Target model settings
+            dataset (Dataset): Target dataset settings
+
+        Returns:
+            pd.DataFrame | None: Table with benchmark metrics
+        """
+
+        run_name = f"{self.benchmark_type}/{model.task}/{model.name}/{dataset.name}"
+        log.info(f"{run_name = }")
+        work_dir = self.output_root / run_name
+        data_root = self.data_root / dataset.path
+
+        tags = {
+            "benchmark": self.benchmark_type,
+            "task": model.task,
+            "data_size": dataset.size,
+            "model": model.name,
+            "dataset": dataset.name,
+            **self.tags,
+        }
+
+        num_repeat = dataset.num_repeat
+        if self.num_repeat > 0:
+            num_repeat = self.num_repeat  # Override by global setting
+
+        for seed in range(num_repeat):
+            sub_work_dir = work_dir / str(seed)
+            tags["seed"] = str(seed)
+
+            # Train & test
+            command = [
+                "otx",
+                "train",
+                "--config",
+                f"src/otx/recipe/{model.task}/{model.name}.yaml",
+                "--data_root",
+                str(data_root),
+                "--work_dir",
+                str(sub_work_dir),
+                "--model.num_classes",
+                str(dataset.num_classes),
+                "--data.config.data_format",
+                dataset.data_format,
+                "--engine.device",
+                self.accelerator,
+            ]
+            for key, value in dataset.extra_overrides.items():
+                command.append(f"--{key}")
+                command.append(str(value))
+            command.extend(["--seed", str(seed)])
+            command.extend(["--deterministic", str(self.deterministic)])
+            if self.num_epoch > 0:
+                command.extend(["--max_epochs", str(self.num_epoch)])
+            self._run_command(command)
+
+            command = [
+                "otx",
+                "test",
+                "--work_dir",
+                str(sub_work_dir),
+            ]
+            self._run_command(command)
+
+            # Export & test
+            # Optimize & test
+
+            self._log_metrics(work_dir=sub_work_dir, tags=tags)
+
+            # Force memory clean up
+            gc.collect()
+
+        return self.load_result(work_dir)
+
+    def _run_command(self, command: list[str]) -> None:
+        if self.dry_run:
+            print(" ".join(command))
+        else:
+            subprocess.run(command, check=True)  # noqa: S603
+
+    def _log_metrics(self, work_dir: Path, tags: dict[str, str]) -> None:
+        if not work_dir.exists():
+            return
+        # Load raw metrics
+        csv_files = work_dir.glob("**/metrics.csv")
+        raw_data = [pd.read_csv(csv_file) for csv_file in csv_files]
+        raw_data = pd.concat(raw_data, ignore_index=True)
+        # Summarize
+        metrics = []
+        for criterion in self.criteria:
+            if criterion.name not in raw_data:
+                continue
+            column = raw_data[criterion.name].dropna()
+            if len(column) == 0:
+                continue
+            if criterion.summary == "mean":
+                value = column[min(1, len(column) - 1) :].mean()  # Drop 1st epoch if possible
+            elif criterion.summary == "max":
+                value = column.max()
+            elif criterion.summary == "min":
+                value = column.min()
+            else:
+                value = 0.0
+            metrics.append(pd.Series([value], name=criterion.name))
+        if len(metrics) == 0:
+            return
+        metrics = pd.concat(metrics, axis=1)
+        # Write csv w/ tags
+        for k, v in tags.items():
+            metrics[k] = v
+        metrics.to_csv(work_dir / "benchmark.raw.csv", index=False)
+
+    @staticmethod
+    def load_result(result_path: Path) -> pd.DataFrame | None:
+        """Load benchmark results recursively and merge as pd.DataFrame.
+
+        Args:
+            result_path (Path): Result directory or specific file.
+ + Retruns: + pd.DataFrame: Table with benchmark metrics & options + """ + if not result_path.exists(): + return None + # Load csv data + csv_files = result_path.glob("**/benchmark.raw.csv") if result_path.is_dir() else [result_path] + results = [pd.read_csv(csv_file) for csv_file in csv_files] + if len(results) == 0: + return None + # Merge data + data = pd.concat(results, ignore_index=True) + # Average by unique group + grouped = data.groupby(["benchmark", "task", "data_size", "model"]) + aggregated = grouped.mean(numeric_only=True) + # Merge tag columns (non-numeric & non-index) + tag_columns = set(data.columns) - set(aggregated.columns) - set(grouped.keys) + for col in tag_columns: + # Take common string prefix such as: ["data/1", "data/2", "data/3"] -> "data/" + aggregated[col] = grouped[col].agg(lambda x: os.path.commonprefix(x.tolist())) + return aggregated diff --git a/tests/perf/conftest.py b/tests/perf/conftest.py new file mode 100644 index 00000000000..3c4fa356846 --- /dev/null +++ b/tests/perf/conftest.py @@ -0,0 +1,357 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +import logging +import os +import platform +import subprocess +from datetime import datetime, timedelta, timezone +from pathlib import Path +from urllib.parse import urlparse + +import pytest +from cpuinfo import get_cpu_info + +from .benchmark import Benchmark + +log = logging.getLogger(__name__) + + +def pytest_addoption(parser): + """Add custom options for perf tests.""" + parser.addoption( + "--benchmark-type", + action="store", + default="accuracy", + choices=("accuracy", "efficiency", "all"), + help="Choose accuracy|efficiency|all. Defaults to accuracy.", + ) + parser.addoption( + "--model-category", + action="store", + default="all", + choices=("default", "all"), + help="Choose default|all. Defaults to all.", + ) + parser.addoption( + "--data-size", + action="store", + default="all", + choices=("small", "medium", "large", "all"), + help="Choose small|medium|large|all. Defaults to all.", + ) + parser.addoption( + "--num-repeat", + action="store", + default=0, + help="Overrides default per-data-size number of repeat setting. " + "Random seeds are set to 0 ~ num_repeat-1 for the trials. " + "Defaults to 0 (small=3, medium=3, large=1).", + ) + parser.addoption( + "--num-epoch", + action="store", + default=0, + help="Overrides default per-model number of epoch setting. " + "Defaults to 0 (per-model epoch & early-stopping).", + ) + parser.addoption( + "--eval-upto", + action="store", + default="train", + choices=("train", "export", "optimize"), + help="Choose train|export|optimize. Defaults to train.", + ) + parser.addoption( + "--data-root", + action="store", + default="data", + help="Dataset root directory.", + ) + parser.addoption( + "--output-root", + action="store", + help="Output root directory. Defaults to temp directory.", + ) + parser.addoption( + "--summary-csv", + action="store", + help="Path to output summary cvs file. 
Defaults to {output-root}/benchmark-summary.csv", + ) + parser.addoption( + "--dry-run", + action="store_true", + default=False, + help="Print OTX commands without execution.", + ) + parser.addoption( + "--deterministic", + action="store_true", + default=False, + help="Turn on deterministic training.", + ) + parser.addoption( + "--user-name", + type=str, + default="anonymous", + help='Sign-off the user name who launched the regression tests this time, e.g., `--user-name "John Doe"`.', + ) + parser.addoption( + "--mlflow-tracking-uri", + type=str, + help="URI for MLFlow Tracking server to store the regression test results.", + ) + + +@pytest.fixture(scope="session") +def fxt_benchmark_type(request: pytest.FixtureRequest) -> str: + """Select benchmark type.""" + benchmark_type: str = request.config.getoption("--benchmark-type") + msg = f"{benchmark_type = }" + log.info(msg) + return benchmark_type + + +@pytest.fixture(scope="session") +def fxt_model_category(request: pytest.FixtureRequest) -> str: + """Model category to run the benchmark.""" + model_category = request.config.getoption("--model-category") + msg = f"{model_category = }" + log.info(msg) + return model_category + + +@pytest.fixture(scope="session") +def fxt_data_size(request: pytest.FixtureRequest) -> str: + """Data size to run the benchmark.""" + data_size = request.config.getoption("--data-size") + msg = f"{data_size = }" + log.info(msg) + return data_size + + +@pytest.fixture(scope="session") +def fxt_num_repeat(request: pytest.FixtureRequest) -> int: + """Number of repeated run with different random seed.""" + num_repeat = int(request.config.getoption("--num-repeat")) + msg = f"{num_repeat = }" + log.info(msg) + return num_repeat + + +@pytest.fixture(scope="session") +def fxt_num_epoch(request: pytest.FixtureRequest) -> int: + """Number of epochs to train models.""" + num_epoch = int(request.config.getoption("--num-epoch")) + msg = f"{num_epoch = }" + log.info(msg) + return num_epoch + + +@pytest.fixture(scope="session") +def fxt_eval_upto(request: pytest.FixtureRequest) -> str: + """Last operation to evaluate ~ train|export|optimize.""" + eval_upto = request.config.getoption("--eval-upto") + msg = f"{eval_upto = }" + log.info(msg) + return eval_upto + + +@pytest.fixture(scope="session") +def fxt_data_root(request: pytest.FixtureRequest) -> Path: + """Dataset root directory path.""" + data_root = Path(request.config.getoption("--data-root")) + msg = f"{data_root = }" + log.info(msg) + return data_root + + +@pytest.fixture(scope="session") +def fxt_output_root(request: pytest.FixtureRequest, tmp_path_factory: pytest.TempPathFactory) -> Path: + """Output root + date + short commit hash.""" + output_root = request.config.getoption("--output-root") + if output_root is None: + output_root = tmp_path_factory.mktemp("otx-benchmark") + tz = timezone(offset=timedelta(hours=9), name="Seoul") + date_str = datetime.now(tz=tz).strftime("%Y%m%d-%H%M%S") + output_root = Path(output_root) / date_str + msg = f"{output_root = }" + log.info(msg) + return output_root + + +@pytest.fixture(scope="session") +def fxt_version_tags() -> dict[str, str]: + """Version / branch / commit info.""" + import otx + + version_str = otx.__version__ + try: + branch_str = subprocess.check_output(["git", "rev-parse", "--abbrev-ref", "HEAD"]).decode("ascii").strip() # noqa: S603, S607 + except Exception: + branch_str = os.environ.get("GH_CTX_REF_NAME", "unknown") + try: + commit_str = subprocess.check_output(["git", "rev-parse", "--short", 
"HEAD"]).decode("ascii").strip() # noqa: S603, S607 + except Exception: + commit_str = os.environ.get("GH_CTX_SHA", "unknown") + version_tags = { + "version": version_str, + "branch": branch_str, + "commit": commit_str, + } + msg = f"{version_tags = }" + log.info(msg) + return version_tags + + +@pytest.fixture(scope="session") +def fxt_summary_csv(request: pytest.FixtureRequest, fxt_output_root: Path) -> Path: + """Path to benchmark result summary csv file.""" + summary_csv = request.config.getoption("--summary-csv") + if summary_csv is None: + summary_csv = fxt_output_root / "benchmark-summary.csv" + msg = f"{summary_csv = }" + log.info(msg) + return summary_csv + + +@pytest.fixture(scope="session") +def fxt_dry_run(request: pytest.FixtureRequest) -> str: + """Option to print OTX commands without execution.""" + dry_run = request.config.getoption("--dry-run") + msg = f"{dry_run = }" + log.info(msg) + return dry_run + + +@pytest.fixture(scope="session") +def fxt_deterministic(request: pytest.FixtureRequest) -> str: + """Option to turn on deterministic training.""" + deterministic = request.config.getoption("--deterministic") + msg = f"{deterministic = }" + log.info(msg) + return deterministic + + +@pytest.fixture(scope="session") +def fxt_user_name(request: pytest.FixtureRequest) -> str: + """User name to sign off the regression test execution.""" + user_name = request.config.getoption("--user-name") + msg = f"{user_name = }" + log.info(msg) + return user_name + + +@pytest.fixture(scope="session") +def fxt_mlflow_tracking_uri(request: pytest.FixtureRequest) -> str: + """MLFLow tracking server URI.""" + mlflow_tracking_uri = urlparse( + request.config.getoption("--mlflow-tracking-uri"), + ).geturl() + msg = f"{mlflow_tracking_uri = }" + log.info(msg) + return mlflow_tracking_uri + + +@pytest.fixture() +def fxt_model(request: pytest.FixtureRequest, fxt_model_category) -> Benchmark.Model: + """Skip models according to user options.""" + model: Benchmark.Model = request.param + if fxt_model_category == "default" and model.category == "other": + pytest.skip(f"{model.category} category model") + return model + + +@pytest.fixture() +def fxt_dataset(request: pytest.FixtureRequest, fxt_data_size) -> Benchmark.Data: + """Skip datasets according to user options.""" + dataset: Benchmark.Dataset = request.param + if fxt_data_size not in {"all", dataset.size}: + pytest.skip(f"{dataset.size} size dataset") + return dataset + + +@pytest.fixture(scope="session") +def fxt_tags(fxt_user_name: str, fxt_version_tags: dict[str, str]) -> dict[str, str]: + """Tag fields to record the machine and user executing this perf test.""" + tags = { + **fxt_version_tags, + "user_name": fxt_user_name, + "machine_name": platform.node(), + "cpu_info": get_cpu_info()["brand_raw"], + "accelerator_info": subprocess.check_output( + ["nvidia-smi", "-L"], # noqa: S603, S607 + ) + .decode() + .strip(), + } + msg = f"{tags = }" + log.info(msg) + return tags + + +@pytest.fixture() +def fxt_benchmark( + request: pytest.FixtureRequest, + fxt_benchmark_type: str, + fxt_data_root: Path, + fxt_output_root: Path, + fxt_num_epoch: int, + fxt_num_repeat: int, + fxt_eval_upto: str, + fxt_tags: dict[str, str], + fxt_dry_run: bool, + fxt_deterministic: bool, + fxt_accelerator: str, +) -> Benchmark: + """Configure benchmark.""" + benchmark_type: str = request.param["type"] + if fxt_benchmark_type not in {"all", benchmark_type}: + pytest.skip(f"{benchmark_type} benchmark") + + return Benchmark( + benchmark_type=benchmark_type, + 
data_root=fxt_data_root, + output_root=fxt_output_root, + criteria=request.param["criteria"], + num_epoch=fxt_num_epoch, + num_repeat=fxt_num_repeat, + eval_upto=fxt_eval_upto, + tags=fxt_tags, + dry_run=fxt_dry_run, + deterministic=fxt_deterministic, + accelerator=fxt_accelerator, + ) + + +@pytest.fixture(scope="session", autouse=True) +def fxt_benchmark_summary( + fxt_output_root: Path, + fxt_summary_csv: Path, +): + """Summarize all results at the end of test session.""" + yield + all_results = Benchmark.load_result(fxt_output_root) + if all_results is not None: + print("=" * 20, "[Benchmark summary]") + print(all_results) + all_results.to_csv(fxt_summary_csv) + print(f" -> Saved to {fxt_summary_csv}.") + + +class PerfTestBase: + """Base perf test structure.""" + + def _test_perf( + self, + model: Benchmark.Model, + dataset: Benchmark.Dataset, + benchmark: Benchmark, + ) -> None: + result = benchmark.run( + model=model, + dataset=dataset, + ) + print(result) diff --git a/tests/perf/test_anomaly.py b/tests/perf/test_anomaly.py new file mode 100644 index 00000000000..04c30c8a647 --- /dev/null +++ b/tests/perf/test_anomaly.py @@ -0,0 +1,4 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +"""OTX anomaly perfomance benchmark tests.""" diff --git a/tests/perf/test_classification.py b/tests/perf/test_classification.py new file mode 100644 index 00000000000..9b10b4772d9 --- /dev/null +++ b/tests/perf/test_classification.py @@ -0,0 +1,296 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +"""OTX classification perfomance benchmark tests.""" + +from __future__ import annotations + +from pathlib import Path + +import pytest + +from .benchmark import Benchmark +from .conftest import PerfTestBase + + +class TestPerfSingleLabelClassification(PerfTestBase): + """Benchmark single-label classification.""" + + MODEL_TEST_CASES = [ # noqa: RUF012 + Benchmark.Model(task="classification/multi_class_cls", name="efficientnet_b0_light", category="speed"), + Benchmark.Model(task="classification/multi_class_cls", name="efficientnet_v2_light", category="balance"), + Benchmark.Model(task="classification/multi_class_cls", name="mobilenet_v3_large_light", category="accuracy"), + Benchmark.Model(task="classification/multi_class_cls", name="otx_deit_tiny", category="other"), + ] + + DATASET_TEST_CASES = [ + Benchmark.Dataset( + name=f"multiclass_CUB_small_{idx}", + path=Path("multiclass_classification/multiclass_CUB_small") / f"{idx}", + size="small", + data_format="imagenet_with_subset_dirs", + num_classes=2, + num_repeat=3, + extra_overrides={}, + ) + for idx in (1, 2, 3) + ] + [ + Benchmark.Dataset( + name="multiclass_CUB_medium", + path=Path("multiclass_classification/multiclass_CUB_medium"), + size="medium", + data_format="imagenet_with_subset_dirs", + num_classes=67, + num_repeat=3, + extra_overrides={}, + ), + Benchmark.Dataset( + name="multiclass_food101_large", + path=Path("multiclass_classification/multiclass_food101_large"), + size="large", + data_format="imagenet_with_subset_dirs", + num_classes=20, + num_repeat=1, + extra_overrides={}, + ), + ] + + BENCHMARK_TEST_CASES = [ # noqa: RUF012 + { + "type": "accuracy", + "criteria": [ + Benchmark.Criterion(name="epoch", summary="max", compare="<", margin=0.1), + Benchmark.Criterion(name="val/accuracy", summary="max", compare=">", margin=0.1), + Benchmark.Criterion(name="test/accuracy", summary="max", compare=">", margin=0.1), + Benchmark.Criterion(name="export/accuracy", summary="max", 
compare=">", margin=0.1), + Benchmark.Criterion(name="optimize/accuracy", summary="max", compare=">", margin=0.1), + ], + }, + { + "type": "efficiency", + "criteria": [ + Benchmark.Criterion(name="train/iter_time", summary="mean", compare="<", margin=0.1), + Benchmark.Criterion(name="test/iter_time", summary="mean", compare="<", margin=0.1), + Benchmark.Criterion(name="export/iter_time", summary="mean", compare="<", margin=0.1), + Benchmark.Criterion(name="optimize/iter_time", summary="mean", compare="<", margin=0.1), + ], + }, + ] + + @pytest.mark.parametrize( + "fxt_model", + MODEL_TEST_CASES, + ids=lambda model: model.name, + indirect=True, + ) + @pytest.mark.parametrize( + "fxt_dataset", + DATASET_TEST_CASES, + ids=lambda dataset: dataset.name, + indirect=True, + ) + @pytest.mark.parametrize( + "fxt_benchmark", + BENCHMARK_TEST_CASES, + ids=lambda benchmark: benchmark["type"], + indirect=True, + ) + def test_perf( + self, + fxt_model: Benchmark.Model, + fxt_dataset: Benchmark.Dataset, + fxt_benchmark: Benchmark, + ): + self._test_perf( + model=fxt_model, + dataset=fxt_dataset, + benchmark=fxt_benchmark, + ) + + +class TestPerfMultiLabelClassification(PerfTestBase): + """Benchmark multi-label classification.""" + + MODEL_TEST_CASES = [ # noqa: RUF012 + Benchmark.Model(task="classification/multi_label_cls", name="efficientnet_b0_light", category="speed"), + Benchmark.Model(task="classification/multi_label_cls", name="efficientnet_v2_light", category="balance"), + Benchmark.Model(task="classification/multi_label_cls", name="mobilenet_v3_large_light", category="accuracy"), + Benchmark.Model(task="classification/multi_label_cls", name="otx_deit_tiny", category="other"), + ] + + DATASET_TEST_CASES = [ + Benchmark.Dataset( + name=f"multilabel_CUB_small_{idx}", + path=Path("multilabel_classification/multilabel_CUB_small") / f"{idx}", + size="small", + data_format="datumaro", + num_classes=3, + num_repeat=3, + extra_overrides={}, + ) + for idx in (1, 2, 3) + ] + [ + Benchmark.Dataset( + name="multilabel_CUB_medium", + path=Path("multilabel_classification/multilabel_CUB_medium"), + size="medium", + data_format="datumaro", + num_classes=68, + num_repeat=3, + extra_overrides={}, + ), + Benchmark.Dataset( + name="multilabel_food101_large", + path=Path("multilabel_classification/multilabel_food101_large"), + size="large", + data_format="datumaro", + num_classes=21, + num_repeat=1, + extra_overrides={}, + ), + ] + + BENCHMARK_TEST_CASES = [ # noqa: RUF012 + { + "type": "accuracy", + "criteria": [ + Benchmark.Criterion(name="epoch", summary="max", compare="<", margin=0.1), + Benchmark.Criterion(name="val/accuracy", summary="max", compare=">", margin=0.1), + Benchmark.Criterion(name="test/accuracy", summary="max", compare=">", margin=0.1), + Benchmark.Criterion(name="export/accuracy", summary="max", compare=">", margin=0.1), + Benchmark.Criterion(name="optimize/accuracy", summary="max", compare=">", margin=0.1), + ], + }, + { + "type": "efficiency", + "criteria": [ + Benchmark.Criterion(name="train/iter_time", summary="mean", compare="<", margin=0.1), + Benchmark.Criterion(name="test/iter_time", summary="mean", compare="<", margin=0.1), + Benchmark.Criterion(name="export/iter_time", summary="mean", compare="<", margin=0.1), + Benchmark.Criterion(name="optimize/iter_time", summary="mean", compare="<", margin=0.1), + ], + }, + ] + + @pytest.mark.parametrize( + "fxt_model", + MODEL_TEST_CASES, + ids=lambda model: model.name, + indirect=True, + ) + @pytest.mark.parametrize( + "fxt_dataset", + 
DATASET_TEST_CASES, + ids=lambda dataset: dataset.name, + indirect=True, + ) + @pytest.mark.parametrize( + "fxt_benchmark", + BENCHMARK_TEST_CASES, + ids=lambda benchmark: benchmark["type"], + indirect=True, + ) + def test_perf( + self, + fxt_model: Benchmark.Model, + fxt_dataset: Benchmark.Dataset, + fxt_benchmark: Benchmark, + ): + self._test_perf( + model=fxt_model, + dataset=fxt_dataset, + benchmark=fxt_benchmark, + ) + + +class TestPerfHierarchicalLabelClassification(PerfTestBase): + """Benchmark hierarchical-label classification.""" + + MODEL_TEST_CASES = [ # noqa: RUF012 + Benchmark.Model(task="classification/h_label_cls", name="efficientnet_b0_light", category="speed"), + Benchmark.Model(task="classification/h_label_cls", name="efficientnet_v2_light", category="balance"), + Benchmark.Model(task="classification/h_label_cls", name="mobilenet_v3_large_light", category="accuracy"), + Benchmark.Model(task="classification/h_label_cls", name="otx_deit_tiny", category="other"), + ] + + DATASET_TEST_CASES = [ + Benchmark.Dataset( + name=f"hlabel_CUB_small_{idx}", + path=Path("hlabel_classification/hlabel_CUB_small") / f"{idx}", + size="small", + data_format="datumaro", + num_classes=6, + num_repeat=3, + extra_overrides={ + "model.num_multiclass_heads": "3", + "model.num_multilabel_classes": "0", + }, + ) + for idx in (1, 2, 3) + ] + [ + Benchmark.Dataset( + name="hlabel_CUB_medium", + path=Path("hlabel_classification/hlabel_CUB_medium"), + size="medium", + data_format="datumaro", + num_classes=102, + num_repeat=3, + extra_overrides={ + "model.num_multiclass_heads": "23", + "model.num_multilabel_classes": "0", + }, + ), + # Add large dataset + ] + + BENCHMARK_TEST_CASES = [ # noqa: RUF012 + { + "type": "accuracy", + "criteria": [ + Benchmark.Criterion(name="epoch", summary="max", compare="<", margin=0.1), + Benchmark.Criterion(name="val/accuracy", summary="max", compare=">", margin=0.1), + Benchmark.Criterion(name="test/accuracy", summary="max", compare=">", margin=0.1), + Benchmark.Criterion(name="export/accuracy", summary="max", compare=">", margin=0.1), + Benchmark.Criterion(name="optimize/accuracy", summary="max", compare=">", margin=0.1), + ], + }, + { + "type": "efficiency", + "criteria": [ + Benchmark.Criterion(name="train/iter_time", summary="mean", compare="<", margin=0.1), + Benchmark.Criterion(name="test/iter_time", summary="mean", compare="<", margin=0.1), + Benchmark.Criterion(name="export/iter_time", summary="mean", compare="<", margin=0.1), + Benchmark.Criterion(name="optimize/iter_time", summary="mean", compare="<", margin=0.1), + ], + }, + ] + + @pytest.mark.parametrize( + "fxt_model", + MODEL_TEST_CASES, + ids=lambda model: model.name, + indirect=True, + ) + @pytest.mark.parametrize( + "fxt_dataset", + DATASET_TEST_CASES, + ids=lambda dataset: dataset.name, + indirect=True, + ) + @pytest.mark.parametrize( + "fxt_benchmark", + BENCHMARK_TEST_CASES, + ids=lambda benchmark: benchmark["type"], + indirect=True, + ) + def test_perf( + self, + fxt_model: Benchmark.Model, + fxt_dataset: Benchmark.Dataset, + fxt_benchmark: Benchmark, + ): + self._test_perf( + model=fxt_model, + dataset=fxt_dataset, + benchmark=fxt_benchmark, + ) diff --git a/tests/perf/test_detection.py b/tests/perf/test_detection.py new file mode 100644 index 00000000000..83d0dabd5a2 --- /dev/null +++ b/tests/perf/test_detection.py @@ -0,0 +1,126 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +"""OTX object detection perfomance benchmark tests.""" + +from __future__ import 
annotations + +from pathlib import Path + +import pytest + +from .benchmark import Benchmark +from .conftest import PerfTestBase + + +class TestPerfObjectDetection(PerfTestBase): + """Benchmark object detection.""" + + MODEL_TEST_CASES = [ # noqa: RUF012 + Benchmark.Model(task="detection", name="atss_mobilenetv2", category="accuracy"), + Benchmark.Model(task="detection", name="atss_resnext101", category="other"), + Benchmark.Model(task="detection", name="ssd_mobilenetv2", category="balance"), + Benchmark.Model(task="detection", name="yolox_tiny", category="speed"), + Benchmark.Model(task="detection", name="yolox_s", category="other"), + Benchmark.Model(task="detection", name="yolox_l", category="other"), + Benchmark.Model(task="detection", name="yolox_x", category="other"), + ] + + DATASET_TEST_CASES = [ + Benchmark.Dataset( + name=f"pothole_small_{idx}", + path=Path("detection/pothole_small") / f"{idx}", + size="small", + data_format="coco", + num_classes=1, + num_repeat=3, + extra_overrides={ + "deterministic": "True", + "metric": "otx.algo.metrices.fmeasure.FMeasure", + "callback_monitor": "val/f1-score", + "scheduler.monitor": "val/f1-score", + }, + ) + for idx in (1, 2, 3) + ] + [ + Benchmark.Dataset( + name="pothole_medium", + path=Path("detection/pothole_medium"), + size="medium", + data_format="coco", + num_classes=1, + num_repeat=3, + extra_overrides={ + "deterministic": "True", + "metric": "otx.algo.metrices.fmeasure.FMeasure", + "callback_monitor": "val/f1-score", + "scheduler.monitor": "val/f1-score", + }, + ), + Benchmark.Dataset( + name="vitens_large", + path=Path("detection/vitens_large"), + size="large", + data_format="coco", + num_classes=1, + num_repeat=1, + extra_overrides={ + "deterministic": "True", + "metric": "otx.algo.metrices.fmeasure.FMeasure", + "callback_monitor": "val/f1-score", + "scheduler.monitor": "val/f1-score", + }, + ), + ] + + BENCHMARK_TEST_CASES = [ # noqa: RUF012 + { + "type": "accuracy", + "criteria": [ + Benchmark.Criterion(name="epoch", summary="max", compare="<", margin=0.1), + Benchmark.Criterion(name="val/f1-score", summary="max", compare=">", margin=0.1), + Benchmark.Criterion(name="test/f1-score", summary="max", compare=">", margin=0.1), + Benchmark.Criterion(name="export/f1-score", summary="max", compare=">", margin=0.1), + Benchmark.Criterion(name="optimize/f1-score", summary="max", compare=">", margin=0.1), + ], + }, + { + "type": "efficiency", + "criteria": [ + Benchmark.Criterion(name="train/iter_time", summary="mean", compare="<", margin=0.1), + Benchmark.Criterion(name="test/iter_time", summary="mean", compare="<", margin=0.1), + Benchmark.Criterion(name="export/iter_time", summary="mean", compare="<", margin=0.1), + Benchmark.Criterion(name="optimize/iter_time", summary="mean", compare="<", margin=0.1), + ], + }, + ] + + @pytest.mark.parametrize( + "fxt_model", + MODEL_TEST_CASES, + ids=lambda model: model.name, + indirect=True, + ) + @pytest.mark.parametrize( + "fxt_dataset", + DATASET_TEST_CASES, + ids=lambda dataset: dataset.name, + indirect=True, + ) + @pytest.mark.parametrize( + "fxt_benchmark", + BENCHMARK_TEST_CASES, + ids=lambda benchmark: benchmark["type"], + indirect=True, + ) + def test_perf( + self, + fxt_model: Benchmark.Model, + fxt_dataset: Benchmark.Dataset, + fxt_benchmark: Benchmark, + ): + self._test_perf( + model=fxt_model, + dataset=fxt_dataset, + benchmark=fxt_benchmark, + ) diff --git a/tests/perf/test_instance_segmentation.py b/tests/perf/test_instance_segmentation.py new file mode 100644 index 
00000000000..c2fb2e36252 --- /dev/null +++ b/tests/perf/test_instance_segmentation.py @@ -0,0 +1,217 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +"""OTX instance segmentation perfomance benchmark tests.""" + +from __future__ import annotations + +from pathlib import Path + +import pytest + +from .benchmark import Benchmark +from .conftest import PerfTestBase + + +class TestPerfInstanceSegmentation(PerfTestBase): + """Benchmark instance segmentation.""" + + MODEL_TEST_CASES = [ # noqa: RUF012 + Benchmark.Model(task="instance_segmentation", name="maskrcnn_efficientnetb2b", category="speed"), + Benchmark.Model(task="instance_segmentation", name="maskrcnn_r50", category="accuracy"), + Benchmark.Model(task="instance_segmentation", name="maskrcnn_swint", category="other"), + ] + + DATASET_TEST_CASES = [ + Benchmark.Dataset( + name=f"wgisd_small_{idx}", + path=Path("instance_seg/wgisd_small") / f"{idx}", + size="small", + data_format="coco", + num_classes=5, + num_repeat=3, + extra_overrides={ + "deterministic": "True", + "metric": "otx.algo.metrices.fmeasure.FMeasure", + "callback_monitor": "val/f1-score", + "scheduler.monitor": "val/f1-score", + }, + ) + for idx in (1, 2, 3) + ] + [ + Benchmark.Dataset( + name="coco_car_person_medium", + path=Path("instance_seg/coco_car_person_medium"), + size="medium", + data_format="coco", + num_classes=2, + num_repeat=3, + extra_overrides={ + "deterministic": "True", + "metric": "otx.algo.metrices.fmeasure.FMeasure", + "callback_monitor": "val/f1-score", + "scheduler.monitor": "val/f1-score", + }, + ), + Benchmark.Dataset( + name="vitens_coliform", + path=Path("instance_seg/Vitens-Coliform-coco"), + size="large", + data_format="coco", + num_classes=1, + num_repeat=1, + extra_overrides={ + "deterministic": "True", + "metric": "otx.algo.metrices.fmeasure.FMeasure", + "callback_monitor": "val/f1-score", + "scheduler.monitor": "val/f1-score", + }, + ), + ] + + BENCHMARK_TEST_CASES = [ # noqa: RUF012 + { + "type": "accuracy", + "criteria": [ + Benchmark.Criterion(name="epoch", summary="max", compare="<", margin=0.1), + Benchmark.Criterion(name="val/f1-score", summary="max", compare=">", margin=0.1), + Benchmark.Criterion(name="test/f1-score", summary="max", compare=">", margin=0.1), + Benchmark.Criterion(name="export/f1-score", summary="max", compare=">", margin=0.1), + Benchmark.Criterion(name="optimize/f1-score", summary="max", compare=">", margin=0.1), + ], + }, + { + "type": "efficiency", + "criteria": [ + Benchmark.Criterion(name="train/iter_time", summary="mean", compare="<", margin=0.1), + Benchmark.Criterion(name="test/iter_time", summary="mean", compare="<", margin=0.1), + Benchmark.Criterion(name="export/iter_time", summary="mean", compare="<", margin=0.1), + Benchmark.Criterion(name="optimize/iter_time", summary="mean", compare="<", margin=0.1), + ], + }, + ] + + @pytest.mark.parametrize( + "fxt_model", + MODEL_TEST_CASES, + ids=lambda model: model.name, + indirect=True, + ) + @pytest.mark.parametrize( + "fxt_dataset", + DATASET_TEST_CASES, + ids=lambda dataset: dataset.name, + indirect=True, + ) + @pytest.mark.parametrize( + "fxt_benchmark", + BENCHMARK_TEST_CASES, + ids=lambda benchmark: benchmark["type"], + indirect=True, + ) + def test_perf( + self, + fxt_model: Benchmark.Model, + fxt_dataset: Benchmark.Dataset, + fxt_benchmark: Benchmark, + ): + self._test_perf( + model=fxt_model, + dataset=fxt_dataset, + benchmark=fxt_benchmark, + ) + + +class TestPerfTilingInstanceSegmentation(PerfTestBase): + """Benchmark 
tiling instance segmentation.""" + + MODEL_TEST_CASES = [ # noqa: RUF012 + Benchmark.Model(task="instance_segmentation", name="maskrcnn_efficientnetb2b_tile", category="speed"), + Benchmark.Model(task="instance_segmentation", name="maskrcnn_r50_tile", category="accuracy"), + ] + + DATASET_TEST_CASES = [ + Benchmark.Dataset( + name=f"vitens_aeromonas_small_{idx}", + path=Path("tiling_instance_seg/vitens_aeromonas_small") / f"{idx}", + size="small", + data_format="coco", + num_classes=1, + num_repeat=3, + extra_overrides={ + "deterministic": "True", + "metric": "otx.algo.metrices.fmeasure.FMeasure", + "callback_monitor": "val/f1-score", + "scheduler.monitor": "val/f1-score", + }, + ) + for idx in (1, 2, 3) + ] + [ + Benchmark.Dataset( + name="vitens_aeromonas_medium", + path=Path("tiling_instance_seg/vitens_aeromonas_medium"), + size="medium", + data_format="coco", + num_classes=1, + num_repeat=3, + extra_overrides={ + "deterministic": "True", + "metric": "otx.algo.metrices.fmeasure.FMeasure", + "callback_monitor": "val/f1-score", + "scheduler.monitor": "val/f1-score", + }, + ), + # Add large dataset + ] + + BENCHMARK_TEST_CASES = [ # noqa: RUF012 + { + "type": "accuracy", + "criteria": [ + Benchmark.Criterion(name="epoch", summary="max", compare="<", margin=0.1), + Benchmark.Criterion(name="val/f1-score", summary="max", compare=">", margin=0.1), + Benchmark.Criterion(name="test/f1-score", summary="max", compare=">", margin=0.1), + Benchmark.Criterion(name="export/f1-score", summary="max", compare=">", margin=0.1), + Benchmark.Criterion(name="optimize/f1-score", summary="max", compare=">", margin=0.1), + ], + }, + { + "type": "efficiency", + "criteria": [ + Benchmark.Criterion(name="train/iter_time", summary="mean", compare="<", margin=0.1), + Benchmark.Criterion(name="test/iter_time", summary="mean", compare="<", margin=0.1), + Benchmark.Criterion(name="export/iter_time", summary="mean", compare="<", margin=0.1), + Benchmark.Criterion(name="optimize/iter_time", summary="mean", compare="<", margin=0.1), + ], + }, + ] + + @pytest.mark.parametrize( + "fxt_model", + MODEL_TEST_CASES, + ids=lambda model: model.name, + indirect=True, + ) + @pytest.mark.parametrize( + "fxt_dataset", + DATASET_TEST_CASES, + ids=lambda dataset: dataset.name, + indirect=True, + ) + @pytest.mark.parametrize( + "fxt_benchmark", + BENCHMARK_TEST_CASES, + ids=lambda benchmark: benchmark["type"], + indirect=True, + ) + def test_perf( + self, + fxt_model: Benchmark.Model, + fxt_dataset: Benchmark.Dataset, + fxt_benchmark: Benchmark, + ): + self._test_perf( + model=fxt_model, + dataset=fxt_dataset, + benchmark=fxt_benchmark, + ) diff --git a/tests/perf/test_semantic_segmentation.py b/tests/perf/test_semantic_segmentation.py new file mode 100644 index 00000000000..dd88252b490 --- /dev/null +++ b/tests/perf/test_semantic_segmentation.py @@ -0,0 +1,111 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +"""OTX semantic segmentation perfomance benchmark tests.""" + +from __future__ import annotations + +from pathlib import Path + +import pytest + +from .benchmark import Benchmark +from .conftest import PerfTestBase + + +class TestPerfSemanticSegmentation(PerfTestBase): + """Benchmark semantic segmentation.""" + + MODEL_TEST_CASES = [ # noqa: RUF012 + Benchmark.Model(task="semantic_segmentation", name="litehrnet_18", category="balance"), + Benchmark.Model(task="semantic_segmentation", name="litehrnet_s", category="speed"), + Benchmark.Model(task="semantic_segmentation", name="litehrnet_x", 
category="accuracy"), + Benchmark.Model(task="semantic_segmentation", name="segnext_b", category="other"), + Benchmark.Model(task="semantic_segmentation", name="segnext_s", category="other"), + Benchmark.Model(task="semantic_segmentation", name="segnext_t", category="other"), + Benchmark.Model(task="semantic_segmentation", name="dino_v2", category="other"), + ] + + DATASET_TEST_CASES = [ + Benchmark.Dataset( + name=f"kvasir_small_{idx}", + path=Path("semantic_seg/kvasir_small") / f"{idx}", + size="small", + data_format="common_semantic_segmentation_with_subset_dirs", + num_classes=2, + num_repeat=3, + extra_overrides={}, + ) + for idx in (1, 2, 3) + ] + [ + Benchmark.Dataset( + name="kvasir_medium", + path=Path("semantic_seg/kvasir_medium"), + size="medium", + data_format="common_semantic_segmentation_with_subset_dirs", + num_classes=2, + num_repeat=3, + extra_overrides={}, + ), + Benchmark.Dataset( + name="kvasir_large", + path=Path("semantic_seg/kvasir_large"), + size="large", + data_format="common_semantic_segmentation_with_subset_dirs", + num_classes=2, + num_repeat=1, + extra_overrides={}, + ), + ] + + BENCHMARK_TEST_CASES = [ # noqa: RUF012 + { + "type": "accuracy", + "criteria": [ + Benchmark.Criterion(name="epoch", summary="max", compare="<", margin=0.1), + Benchmark.Criterion(name="val/Dice", summary="max", compare=">", margin=0.1), + Benchmark.Criterion(name="test/Dice", summary="max", compare=">", margin=0.1), + Benchmark.Criterion(name="export/Dice", summary="max", compare=">", margin=0.1), + Benchmark.Criterion(name="optimize/Dice", summary="max", compare=">", margin=0.1), + ], + }, + { + "type": "efficiency", + "criteria": [ + Benchmark.Criterion(name="train/iter_time", summary="mean", compare="<", margin=0.1), + Benchmark.Criterion(name="test/iter_time", summary="mean", compare="<", margin=0.1), + Benchmark.Criterion(name="export/iter_time", summary="mean", compare="<", margin=0.1), + Benchmark.Criterion(name="optimize/iter_time", summary="mean", compare="<", margin=0.1), + ], + }, + ] + + @pytest.mark.parametrize( + "fxt_model", + MODEL_TEST_CASES, + ids=lambda model: model.name, + indirect=True, + ) + @pytest.mark.parametrize( + "fxt_dataset", + DATASET_TEST_CASES, + ids=lambda dataset: dataset.name, + indirect=True, + ) + @pytest.mark.parametrize( + "fxt_benchmark", + BENCHMARK_TEST_CASES, + ids=lambda benchmark: benchmark["type"], + indirect=True, + ) + def test_perf( + self, + fxt_model: Benchmark.Model, + fxt_dataset: Benchmark.Dataset, + fxt_benchmark: Benchmark, + ): + self._test_perf( + model=fxt_model, + dataset=fxt_dataset, + benchmark=fxt_benchmark, + ) diff --git a/tests/perf/test_visual_prompting.py b/tests/perf/test_visual_prompting.py new file mode 100644 index 00000000000..c892ae7fdc2 --- /dev/null +++ b/tests/perf/test_visual_prompting.py @@ -0,0 +1,178 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +"""OTX visual prompting perfomance benchmark tests.""" + +from __future__ import annotations + +from pathlib import Path + +import pytest + +from .benchmark import Benchmark +from .conftest import PerfTestBase + + +class TestPerfVisualPrompting(PerfTestBase): + """Benchmark visual prompting.""" + + MODEL_TEST_CASES = [ # noqa: RUF012 + Benchmark.Model(task="visual_prompting", name="sam_tiny_vit", category="other"), + ] + + DATASET_TEST_CASES = [ + Benchmark.Dataset( + name=f"wgisd_small_{idx}", + path=Path("visual_prompting/wgisd_small") / f"{idx}", + size="small", + data_format="coco", + num_classes=5, + num_repeat=3, + 
extra_overrides={}, + ) + for idx in (1, 2, 3) + ] + [ + Benchmark.Dataset( + name="coco_car_person_medium", + path=Path("visual_prompting/coco_car_person_medium"), + size="medium", + data_format="coco", + num_classes=2, + num_repeat=3, + extra_overrides={}, + ), + Benchmark.Dataset( + name="vitens_coliform", + path=Path("visual_prompting/Vitens-Coliform-coco"), + size="large", + data_format="coco", + num_classes=1, + num_repeat=1, + extra_overrides={}, + ), + ] + + BENCHMARK_TEST_CASES = [ # noqa: RUF012 + { + "type": "accuracy", + "criteria": [ + Benchmark.Criterion(name="epoch", summary="max", compare="<", margin=0.1), + Benchmark.Criterion(name="val/Dice", summary="max", compare=">", margin=0.1), + Benchmark.Criterion(name="test/Dice", summary="max", compare=">", margin=0.1), + Benchmark.Criterion(name="export/Dice", summary="max", compare=">", margin=0.1), + Benchmark.Criterion(name="optimize/Dice", summary="max", compare=">", margin=0.1), + ], + }, + { + "type": "efficiency", + "criteria": [ + Benchmark.Criterion(name="train/iter_time", summary="mean", compare="<", margin=0.1), + Benchmark.Criterion(name="test/iter_time", summary="mean", compare="<", margin=0.1), + Benchmark.Criterion(name="export/iter_time", summary="mean", compare="<", margin=0.1), + Benchmark.Criterion(name="optimize/iter_time", summary="mean", compare="<", margin=0.1), + ], + }, + ] + + @pytest.mark.parametrize( + "fxt_model", + MODEL_TEST_CASES, + ids=lambda model: model.name, + indirect=True, + ) + @pytest.mark.parametrize( + "fxt_dataset", + DATASET_TEST_CASES, + ids=lambda dataset: dataset.name, + indirect=True, + ) + @pytest.mark.parametrize( + "fxt_benchmark", + BENCHMARK_TEST_CASES, + ids=lambda benchmark: benchmark["type"], + indirect=True, + ) + def test_perf( + self, + fxt_model: Benchmark.Model, + fxt_dataset: Benchmark.Dataset, + fxt_benchmark: Benchmark, + ): + self._test_perf( + model=fxt_model, + dataset=fxt_dataset, + benchmark=fxt_benchmark, + ) + + +class TestPerfZeroShotVisualPrompting(PerfTestBase): + """Benchmark zero-shot visual prompting.""" + + MODEL_TEST_CASES = [ # noqa: RUF012 + Benchmark.Model(task="zero_shot_visual_prompting", name="sam_tiny_vit", category="other"), + Benchmark.Model(task="zero_shot_visual_prompting", name="sam_vit_b", category="other"), + ] + + DATASET_TEST_CASES = [ # noqa: RUF012 + Benchmark.Dataset( + name="coco_car_person_medium_datumaro", + path=Path("zero_shot_visual_prompting/coco_car_person_medium_datumaro"), + size="medium", + data_format="datumaro", + num_classes=2, + num_repeat=3, + extra_overrides={"max_epochs": "1"}, + ), + ] + + BENCHMARK_TEST_CASES = [ # noqa: RUF012 + { + "type": "accuracy", + "criteria": [ + Benchmark.Criterion(name="epoch", summary="max", compare="<", margin=0.1), + Benchmark.Criterion(name="val/Dice", summary="max", compare=">", margin=0.1), + Benchmark.Criterion(name="test/Dice", summary="max", compare=">", margin=0.1), + Benchmark.Criterion(name="export/Dice", summary="max", compare=">", margin=0.1), + Benchmark.Criterion(name="optimize/Dice", summary="max", compare=">", margin=0.1), + ], + }, + { + "type": "efficiency", + "criteria": [ + Benchmark.Criterion(name="train/iter_time", summary="mean", compare="<", margin=0.1), + Benchmark.Criterion(name="test/iter_time", summary="mean", compare="<", margin=0.1), + Benchmark.Criterion(name="export/iter_time", summary="mean", compare="<", margin=0.1), + Benchmark.Criterion(name="optimize/iter_time", summary="mean", compare="<", margin=0.1), + ], + }, + ] + + @pytest.mark.parametrize( + 
"fxt_model", + MODEL_TEST_CASES, + ids=lambda model: model.name, + indirect=True, + ) + @pytest.mark.parametrize( + "fxt_dataset", + DATASET_TEST_CASES, + ids=lambda dataset: dataset.name, + indirect=True, + ) + @pytest.mark.parametrize( + "fxt_benchmark", + BENCHMARK_TEST_CASES, + ids=lambda benchmark: benchmark["type"], + indirect=True, + ) + def test_perf( + self, + fxt_model: Benchmark.Model, + fxt_dataset: Benchmark.Dataset, + fxt_benchmark: Benchmark, + ): + self._test_perf( + model=fxt_model, + dataset=fxt_dataset, + benchmark=fxt_benchmark, + ) diff --git a/tox.ini b/tox.ini index 20456491fbc..69169b06b4e 100644 --- a/tox.ini +++ b/tox.ini @@ -63,7 +63,7 @@ commands_pre = commands = python -m pytest tests/integration -ra --showlocals --csv={toxworkdir}/{envname}.csv --task {[testenv]task} --open-subprocess {posargs} -[testenv:performance-test] +[testenv:perf-benchmark] deps = .[dev] commands_pre = @@ -72,7 +72,7 @@ commands_pre = ; temporary as Anomalib v1 is not available on PyPI pip install git+https://github.com/openvinotoolkit/anomalib.git@cbb623e33876e446b7788375cc355e3a3dd44cef commands = - pytest tests/regression -ra --showlocals --csv={toxworkdir}/{envname}.csv {posargs} + pytest -ra --showlocals --csv={toxworkdir}/{envname}.csv {posargs:tests/perf} [testenv:build-doc]