diff --git a/.github/workflows/perf_benchmark.yaml b/.github/workflows/perf_benchmark.yaml
index 204fb4cf69f..d7d13bb7e68 100644
--- a/.github/workflows/perf_benchmark.yaml
+++ b/.github/workflows/perf_benchmark.yaml
@@ -7,6 +7,9 @@ on:
         type: choice
         description: Model category to run benchmark
         options:
+          - speed
+          - balance
+          - accuracy
           - default # speed, balance, accuracy models only
           - all # default + other models
         default: default
@@ -50,6 +53,45 @@ on:
           `pip install otx[full]@https://github.com/openvinotoolkit/training_extensions.git@{otx_ref}` will be executed before run,
           and reverted after run. Works only for v2.x assuming CLI compatibility.
         default: __CURRENT_BRANCH_COMMIT__
+  workflow_call:
+    inputs:
+      model-category:
+        type: string
+        description: Model category to run benchmark [speed, balance, accuracy, default, all]
+        default: default
+      data-group:
+        type: string
+        description: Data group to run benchmark [small, medium, large, all]
+        default: all
+      num-repeat:
+        type: number
+        description: Overrides default per-data-group number of repeat setting
+        default: 0
+      num-epoch:
+        type: number
+        description: Overrides default per-model number of epoch setting
+        default: 0
+      eval-upto:
+        type: string
+        description: The last operation to evaluate. 'optimize' means all. [train, export, optimize]
+        default: optimize
+      pytest-args:
+        type: string
+        description: |
+          Additional perf-benchmark pytest arguments.
+          "-k detection" -> detection task only
+          "--dry-run" -> print command w/o execution.
+      data-root:
+        type: string
+        description: Root directory containing validation data in CI server.
+        default: "/home/validation/data/v2/"
+      otx-ref:
+        type: string
+        description: |
+          Target OTX ref (tag / branch name / commit hash) on main repo to test. Defaults to the current branch.
+          `pip install otx[full]@https://github.com/openvinotoolkit/training_extensions.git@{otx_ref}` will be executed before run,
+          and reverted after run. Works only for v2.x assuming CLI compatibility.
+        default: __CURRENT_BRANCH_COMMIT__
 
 # Declare default permissions as read only.
 permissions: read-all
@@ -73,7 +115,7 @@ jobs:
           - task-short: "vsp"
             task: "visual_prompting"
     name: Perf-Benchmark-${{ matrix.task-short }}
-    runs-on: [self-hosted, linux, x64, dmount-v2, perf]
+    runs-on: [self-hosted, linux, x64, dmount-v2]
     timeout-minutes: 8640
     steps:
       - name: Checkout repository
@@ -85,6 +127,10 @@ jobs:
       - name: Install tox
        run: python -m pip install --require-hashes --no-deps -r .ci/tox-deps.txt
      - name: Run Performance Test
+        env:
+          BENCHMARK_RESULTS_CLEAR: ${{ vars.BENCHMARK_RESULTS_CLEAR }}
+          GH_CTX_REF_NAME: ${{ github.ref_name }}
+          GH_CTX_SHA: ${{ github.sha }}
        run: >
          tox -vv -e perf-benchmark -- tests/perf/test_${{ matrix.task }}.py ${{ inputs.pytest-args }}
          --model-category ${{ inputs.model-category }}
diff --git a/tests/perf/benchmark.py b/tests/perf/benchmark.py
index 95bb0e76de1..d2810f4bd4c 100644
--- a/tests/perf/benchmark.py
+++ b/tests/perf/benchmark.py
@@ -56,8 +56,6 @@ class Dataset:
         name: str
         path: Path
         group: str
-        data_format: str
-        num_classes: int
         num_repeat: int = 1
         extra_overrides: dict | None = None
 
@@ -155,10 +153,6 @@ def run(
                 str(data_root),
                 "--work_dir",
                 str(sub_work_dir),
-                "--model.num_classes",
-                str(dataset.num_classes),
-                "--data.config.data_format",
-                dataset.data_format,
                 "--engine.device",
                 self.accelerator,
             ]
@@ -172,7 +166,10 @@ def run(
             start_time = time()
             self._run_command(command)
             extra_metrics = {"train/e2e_time": time() - start_time}
-            self._rename_raw_data(work_dir=sub_work_dir / ".latest" / "train", replaces={"epoch": "train/epoch"})
+            self._rename_raw_data(
+                work_dir=sub_work_dir / ".latest" / "train",
+                replaces={"train_": "train/", "{pre}": "train/"},
+            )
             self._log_metrics(
                 work_dir=sub_work_dir / ".latest" / "train",
                 tags=tags,
@@ -187,6 +184,10 @@ def run(
                 str(sub_work_dir),
             ]
             self._run_command(command)
+            self._rename_raw_data(
+                work_dir=sub_work_dir / ".latest" / "test",
+                replaces={"test_": "test/", "{pre}": "test/"},
+            )
             self._log_metrics(work_dir=sub_work_dir / ".latest" / "test", tags=tags, criteria=criteria)
 
             # Export & test
@@ -215,7 +216,10 @@ def run(
             ]
             self._run_command(command)
 
-            self._rename_raw_data(work_dir=sub_work_dir / ".latest" / "test", replaces={"test": "export"})
+            self._rename_raw_data(
+                work_dir=sub_work_dir / ".latest" / "test",
+                replaces={"test": "export", "{pre}": "export/"},
+            )
             self._log_metrics(work_dir=sub_work_dir / ".latest" / "test", tags=tags, criteria=criteria)
 
             # Optimize & test
@@ -250,7 +254,10 @@ def run(
             ]
             self._run_command(command)
 
-            self._rename_raw_data(work_dir=sub_work_dir / ".latest" / "test", replaces={"test": "optimize"})
+            self._rename_raw_data(
+                work_dir=sub_work_dir / ".latest" / "test",
+                replaces={"test": "optimize", "{pre}": "optimize/"},
+            )
             self._log_metrics(work_dir=sub_work_dir / ".latest" / "test", tags=tags, criteria=criteria)
 
             # Force memory clean up
@@ -310,11 +317,25 @@ def _log_metrics(
         metrics.to_csv(work_dir / "benchmark.raw.csv", index=False)
 
     def _rename_raw_data(self, work_dir: Path, replaces: dict[str, str]) -> None:
+        replaces = {**self.NAME_MAPPING, **replaces}
+
+        def _rename_col(col_name: str) -> str:
+            for src_str, dst_str in replaces.items():
+                if src_str == "{pre}":
+                    if not col_name.startswith(dst_str):
+                        col_name = dst_str + col_name
+                elif src_str == "{post}":
+                    if not col_name.endswith(dst_str):
+                        col_name = col_name + dst_str
+                else:
+                    col_name = col_name.replace(src_str, dst_str)
+            return col_name
+
         csv_files = work_dir.glob("**/metrics.csv")
         for csv_file in csv_files:
             data = pd.read_csv(csv_file)
-            for src_str, dst_str in replaces.items():
-                data.columns = data.columns.str.replace(src_str, dst_str)
+            data = data.rename(columns=_rename_col)  # Column names
+            data = data.replace(replaces)  # Values
             data.to_csv(csv_file, index=False)
 
     @staticmethod
@@ -338,7 +359,7 @@ def load_result(result_path: Path) -> pd.DataFrame | None:
         return pd.concat(results, ignore_index=True).set_index(["task", "model", "data_group", "data"])
 
     @staticmethod
-    def average_result(data: pd.DataFrame, keys: list[str]) -> pd.DataFrame:
+    def average_result(data: pd.DataFrame, keys: list[str]) -> pd.DataFrame | None:
         """Average result w.r.t. given keys
 
         Args:
@@ -348,6 +369,9 @@ def average_result(data: pd.DataFrame, keys: list[str]) -> pd.DataFrame:
         Retruns:
             pd.DataFrame: Averaged result table
         """
+        if data is None:
+            return None
+
         # Flatten index
         index_names = data.index.names
         column_names = data.columns
@@ -391,3 +415,5 @@ def check(self, result: pd.DataFrame, criteria: list[Criterion]):
 
         for criterion in criteria:
             criterion(result_entry, target_entry)
+
+    NAME_MAPPING: dict[str, str] = {}  # noqa: RUF012
diff --git a/tests/perf/conftest.py b/tests/perf/conftest.py
index d9538e40400..2953a9f7d70 100644
--- a/tests/perf/conftest.py
+++ b/tests/perf/conftest.py
@@ -27,8 +27,8 @@ def pytest_addoption(parser):
         "--model-category",
         action="store",
         default="all",
-        choices=("default", "all"),
-        help="Choose default|all. Defaults to all.",
+        choices=("speed", "balance", "accuracy", "default", "other", "all"),
+        help="Choose speed|balance|accuracy|default|other|all. Defaults to all.",
     )
     parser.addoption(
         "--data-group",
@@ -290,7 +290,9 @@ def fxt_mlflow_client(request: pytest.FixtureRequest) -> MlflowClient:
 def fxt_model(request: pytest.FixtureRequest, fxt_model_category) -> Benchmark.Model:
     """Skip models according to user options."""
     model: Benchmark.Model = request.param
-    if fxt_model_category == "default" and model.category == "other":
+    if fxt_model_category == "all":
+        return model
+    if (fxt_model_category == "default" and model.category == "other") or fxt_model_category not in ("default", model.category):
         pytest.skip(f"{model.category} category model")
     return model
 
diff --git a/tests/perf/test_anomaly.py b/tests/perf/test_anomaly.py
index fcb06875d61..63883e068bf 100644
--- a/tests/perf/test_anomaly.py
+++ b/tests/perf/test_anomaly.py
@@ -5,11 +5,225 @@
 
 from __future__ import annotations
 
+from pathlib import Path
+
+import pytest
+
+from .benchmark import Benchmark
 from .conftest import PerfTestBase
 
 
 class TestPerfAnomalyClassification(PerfTestBase):
     """Benchmark anomaly classification."""
 
-    def test_dummay(self):
-        pass
+    MODEL_TEST_CASES = [  # noqa: RUF012
+        Benchmark.Model(task="anomaly_classification", name="padim", category="speed"),
+        Benchmark.Model(task="anomaly_classification", name="stfpm", category="accuracy"),
+    ]
+
+    DATASET_TEST_CASES = [
+        Benchmark.Dataset(
+            name=f"mvtec_bottle_small_{idx}",
+            path=Path("anomaly/mvtec/bottle_small") / f"{idx}",
+            group="small",
+            num_repeat=5,
+            extra_overrides={},
+        )
+        for idx in (1, 2, 3)
+    ] + [
+        Benchmark.Dataset(
+            name="mvtec_wood_medium",
+            path=Path("anomaly/mvtec/wood_medium"),
+            group="medium",
+            num_repeat=5,
+            extra_overrides={},
+        ),
+        Benchmark.Dataset(
+            name="mvtec_hazelnut_large",
+            path=Path("anomaly/mvtec/hazelnut_large"),
+            group="large",
+            num_repeat=5,
+            extra_overrides={},
+        ),
+    ]
+
+    BENCHMARK_CRITERIA = [  # noqa: RUF012
+        Benchmark.Criterion(name="train/epoch", summary="max", compare="<", margin=0.1),
+        Benchmark.Criterion(name="train/e2e_time", summary="max", compare="<", margin=0.1),
+        Benchmark.Criterion(name="test/image_F1Score", summary="max", compare=">", margin=0.1),
+        Benchmark.Criterion(name="export/image_F1Score", summary="max", compare=">", margin=0.1),
+        Benchmark.Criterion(name="optimize/image_F1Score", summary="max", compare=">", margin=0.1),
+        Benchmark.Criterion(name="train/iter_time", summary="mean", compare="<", margin=0.1),
+        Benchmark.Criterion(name="test/iter_time", summary="mean", compare="<", margin=0.1),
+        Benchmark.Criterion(name="export/iter_time", summary="mean", compare="<", margin=0.1),
+        Benchmark.Criterion(name="optimize/iter_time", summary="mean", compare="<", margin=0.1),
+    ]
+
+    @pytest.mark.parametrize(
+        "fxt_model",
+        MODEL_TEST_CASES,
+        ids=lambda model: model.name,
+        indirect=True,
+    )
+    @pytest.mark.parametrize(
+        "fxt_dataset",
+        DATASET_TEST_CASES,
+        ids=lambda dataset: dataset.name,
+        indirect=True,
+    )
+    def test_perf(
+        self,
+        fxt_model: Benchmark.Model,
+        fxt_dataset: Benchmark.Dataset,
+        fxt_benchmark: Benchmark,
+    ):
+        self._test_perf(
+            model=fxt_model,
+            dataset=fxt_dataset,
+            benchmark=fxt_benchmark,
+            criteria=self.BENCHMARK_CRITERIA,
+        )
+
+
+class TestPerfAnomalyDetection(PerfTestBase):
+    """Benchmark anomaly detection."""
+
+    MODEL_TEST_CASES = [  # noqa: RUF012
+        Benchmark.Model(task="anomaly_detection", name="padim", category="speed"),
+        Benchmark.Model(task="anomaly_detection", name="stfpm", category="accuracy"),
+    ]
+
+    DATASET_TEST_CASES = [
+        Benchmark.Dataset(
+            name=f"mvtec_bottle_small_{idx}",
+            path=Path("anomaly/mvtec/bottle_small") / f"{idx}",
+            group="small",
+            num_repeat=5,
+            extra_overrides={},
+        )
+        for idx in (1, 2, 3)
+    ] + [
+        Benchmark.Dataset(
+            name="mvtec_wood_medium",
+            path=Path("anomaly/mvtec/wood_medium"),
+            group="medium",
+            num_repeat=5,
+            extra_overrides={},
+        ),
+        Benchmark.Dataset(
+            name="mvtec_hazelnut_large",
+            path=Path("anomaly/mvtec/hazelnut_large"),
+            group="large",
+            num_repeat=5,
+            extra_overrides={},
+        ),
+    ]
+
+    BENCHMARK_CRITERIA = [  # noqa: RUF012
+        Benchmark.Criterion(name="train/epoch", summary="max", compare="<", margin=0.1),
+        Benchmark.Criterion(name="train/e2e_time", summary="max", compare="<", margin=0.1),
+        Benchmark.Criterion(name="test/image_F1Score", summary="max", compare=">", margin=0.1),
+        Benchmark.Criterion(name="export/image_F1Score", summary="max", compare=">", margin=0.1),
+        Benchmark.Criterion(name="optimize/image_F1Score", summary="max", compare=">", margin=0.1),
+        Benchmark.Criterion(name="train/iter_time", summary="mean", compare="<", margin=0.1),
+        Benchmark.Criterion(name="test/iter_time", summary="mean", compare="<", margin=0.1),
+        Benchmark.Criterion(name="export/iter_time", summary="mean", compare="<", margin=0.1),
+        Benchmark.Criterion(name="optimize/iter_time", summary="mean", compare="<", margin=0.1),
+    ]
+
+    @pytest.mark.parametrize(
+        "fxt_model",
+        MODEL_TEST_CASES,
+        ids=lambda model: model.name,
+        indirect=True,
+    )
+    @pytest.mark.parametrize(
+        "fxt_dataset",
+        DATASET_TEST_CASES,
+        ids=lambda dataset: dataset.name,
+        indirect=True,
+    )
+    def test_perf(
+        self,
+        fxt_model: Benchmark.Model,
+        fxt_dataset: Benchmark.Dataset,
+        fxt_benchmark: Benchmark,
+    ):
+        self._test_perf(
+            model=fxt_model,
+            dataset=fxt_dataset,
+            benchmark=fxt_benchmark,
+            criteria=self.BENCHMARK_CRITERIA,
+        )
+
+
+class TestPerfAnomalySegmentation(PerfTestBase):
+    """Benchmark anomaly segmentation."""
+
+    MODEL_TEST_CASES = [  # noqa: RUF012
+        Benchmark.Model(task="anomaly_segmentation", name="padim", category="speed"),
+        Benchmark.Model(task="anomaly_segmentation", name="stfpm", category="accuracy"),
+    ]
+
+    DATASET_TEST_CASES = [
+        Benchmark.Dataset(
+            name=f"mvtec_bottle_small_{idx}",
+            path=Path("anomaly/mvtec/bottle_small") / f"{idx}",
+            group="small",
+            num_repeat=5,
+            extra_overrides={},
+        )
+        for idx in (1, 2, 3)
+    ] + [
+        Benchmark.Dataset(
+            name="mvtec_wood_medium",
+            path=Path("anomaly/mvtec/wood_medium"),
+            group="medium",
+            num_repeat=5,
+            extra_overrides={},
+        ),
+        Benchmark.Dataset(
+            name="mvtec_hazelnut_large",
+            path=Path("anomaly/mvtec/hazelnut_large"),
+            group="large",
+            num_repeat=5,
+            extra_overrides={},
+        ),
+    ]
+
+    BENCHMARK_CRITERIA = [  # noqa: RUF012
+        Benchmark.Criterion(name="train/epoch", summary="max", compare="<", margin=0.1),
+        Benchmark.Criterion(name="train/e2e_time", summary="max", compare="<", margin=0.1),
+        Benchmark.Criterion(name="test/pixel_F1Score", summary="max", compare=">", margin=0.1),
+        Benchmark.Criterion(name="export/pixel_F1Score", summary="max", compare=">", margin=0.1),
+        Benchmark.Criterion(name="optimize/pixel_F1Score", summary="max", compare=">", margin=0.1),
+        Benchmark.Criterion(name="train/iter_time", summary="mean", compare="<", margin=0.1),
+        Benchmark.Criterion(name="test/iter_time", summary="mean", compare="<", margin=0.1),
+        Benchmark.Criterion(name="export/iter_time", summary="mean", compare="<", margin=0.1),
+        Benchmark.Criterion(name="optimize/iter_time", summary="mean", compare="<", margin=0.1),
+    ]
+
+    @pytest.mark.parametrize(
+        "fxt_model",
+        MODEL_TEST_CASES,
+        ids=lambda model: model.name,
+        indirect=True,
+    )
+    @pytest.mark.parametrize(
+        "fxt_dataset",
+        DATASET_TEST_CASES,
+        ids=lambda dataset: dataset.name,
+        indirect=True,
+    )
+    def test_perf(
+        self,
+        fxt_model: Benchmark.Model,
+        fxt_dataset: Benchmark.Dataset,
+        fxt_benchmark: Benchmark,
+    ):
+        self._test_perf(
+            model=fxt_model,
+            dataset=fxt_dataset,
+            benchmark=fxt_benchmark,
+            criteria=self.BENCHMARK_CRITERIA,
+        )
diff --git a/tests/perf/test_classification.py b/tests/perf/test_classification.py
index ceb5574002e..6b00715831d 100644
--- a/tests/perf/test_classification.py
+++ b/tests/perf/test_classification.py
@@ -29,8 +29,6 @@ class TestPerfSingleLabelClassification(PerfTestBase):
             name=f"multiclass_CUB_small_{idx}",
             path=Path("multiclass_classification/multiclass_CUB_small") / f"{idx}",
             group="small",
-            data_format="imagenet_with_subset_dirs",
-            num_classes=2,
             num_repeat=5,
             extra_overrides={},
         )
@@ -40,8 +38,6 @@ class TestPerfSingleLabelClassification(PerfTestBase):
             name="multiclass_CUB_medium",
             path=Path("multiclass_classification/multiclass_CUB_medium"),
             group="medium",
-            data_format="imagenet_with_subset_dirs",
-            num_classes=67,
             num_repeat=5,
             extra_overrides={},
         ),
@@ -49,8 +45,6 @@ class TestPerfSingleLabelClassification(PerfTestBase):
             name="multiclass_food101_large",
             path=Path("multiclass_classification/multiclass_food101_large"),
             group="large",
-            data_format="imagenet_with_subset_dirs",
-            num_classes=20,
             num_repeat=5,
             extra_overrides={},
         ),
@@ -110,8 +104,6 @@ class TestPerfMultiLabelClassification(PerfTestBase):
             name=f"multilabel_CUB_small_{idx}",
             path=Path("multilabel_classification/multilabel_CUB_small") / f"{idx}",
             group="small",
-            data_format="datumaro",
-            num_classes=3,
             num_repeat=5,
             extra_overrides={},
         )
@@ -121,8 +113,6 @@ class TestPerfMultiLabelClassification(PerfTestBase):
             name="multilabel_CUB_medium",
             path=Path("multilabel_classification/multilabel_CUB_medium"),
             group="medium",
-            data_format="datumaro",
-            num_classes=68,
             num_repeat=5,
             extra_overrides={},
         ),
@@ -130,8 +120,6 @@ class TestPerfMultiLabelClassification(PerfTestBase):
             name="multilabel_food101_large",
             path=Path("multilabel_classification/multilabel_food101_large"),
             group="large",
-            data_format="datumaro",
-            num_classes=21,
             num_repeat=5,
             extra_overrides={},
         ),
@@ -191,8 +179,6 @@ class TestPerfHierarchicalLabelClassification(PerfTestBase):
             name=f"hlabel_CUB_small_{idx}",
             path=Path("hlabel_classification/hlabel_CUB_small") / f"{idx}",
             group="small",
-            data_format="datumaro",
-            num_classes=6,
             num_repeat=5,
             extra_overrides={
                 "model.num_multiclass_heads": "3",
@@ -205,8 +191,6 @@ class TestPerfHierarchicalLabelClassification(PerfTestBase):
             name="hlabel_CUB_medium",
             path=Path("hlabel_classification/hlabel_CUB_medium"),
             group="medium",
-            data_format="datumaro",
-            num_classes=102,
             num_repeat=5,
             extra_overrides={
                 "model.num_multiclass_heads": "23",
diff --git a/tests/perf/test_detection.py b/tests/perf/test_detection.py
index ce282a29749..f048eb7c357 100644
--- a/tests/perf/test_detection.py
+++ b/tests/perf/test_detection.py
@@ -31,8 +31,6 @@ class TestPerfObjectDetection(PerfTestBase):
             name=f"pothole_small_{idx}",
             path=Path("detection/pothole_small") / f"{idx}",
             group="small",
-            data_format="coco",
-            num_classes=1,
             num_repeat=5,
             extra_overrides={
                 "deterministic": "True",
@@ -47,8 +45,6 @@ class TestPerfObjectDetection(PerfTestBase):
             name="pothole_medium",
             path=Path("detection/pothole_medium"),
             group="medium",
-            data_format="coco",
-            num_classes=1,
             num_repeat=5,
             extra_overrides={
                 "deterministic": "True",
@@ -61,8 +57,6 @@ class TestPerfObjectDetection(PerfTestBase):
             name="vitens_large",
             path=Path("detection/vitens_large"),
             group="large",
-            data_format="coco",
-            num_classes=1,
             num_repeat=5,
             extra_overrides={
                 "deterministic": "True",
diff --git a/tests/perf/test_instance_segmentation.py b/tests/perf/test_instance_segmentation.py
index 0ec701950d8..2711ea961f4 100644
--- a/tests/perf/test_instance_segmentation.py
+++ b/tests/perf/test_instance_segmentation.py
@@ -27,8 +27,6 @@ class TestPerfInstanceSegmentation(PerfTestBase):
             name=f"wgisd_small_{idx}",
             path=Path("instance_seg/wgisd_small") / f"{idx}",
             group="small",
-            data_format="coco",
-            num_classes=5,
             num_repeat=5,
             extra_overrides={
                 "deterministic": "True",
@@ -43,8 +41,6 @@ class TestPerfInstanceSegmentation(PerfTestBase):
             name="coco_car_person_medium",
             path=Path("instance_seg/coco_car_person_medium"),
             group="medium",
-            data_format="coco",
-            num_classes=2,
             num_repeat=5,
             extra_overrides={
                 "deterministic": "True",
@@ -57,8 +53,6 @@ class TestPerfInstanceSegmentation(PerfTestBase):
             name="vitens_coliform",
             path=Path("instance_seg/Vitens-Coliform-coco"),
             group="large",
-            data_format="coco",
-            num_classes=1,
             num_repeat=5,
             extra_overrides={
                 "deterministic": "True",
@@ -122,8 +116,6 @@ class TestPerfTilingInstanceSegmentation(PerfTestBase):
             name=f"vitens_aeromonas_small_{idx}",
             path=Path("tiling_instance_seg/vitens_aeromonas_small") / f"{idx}",
             group="small",
-            data_format="coco",
-            num_classes=1,
             num_repeat=5,
             extra_overrides={
                 "deterministic": "True",
@@ -138,8 +130,6 @@ class TestPerfTilingInstanceSegmentation(PerfTestBase):
             name="vitens_aeromonas_medium",
             path=Path("tiling_instance_seg/vitens_aeromonas_medium"),
             group="medium",
-            data_format="coco",
-            num_classes=1,
             num_repeat=5,
             extra_overrides={
                 "deterministic": "True",
diff --git a/tests/perf/test_semantic_segmentation.py b/tests/perf/test_semantic_segmentation.py
index d0614dbc498..1cd5fe7a968 100644
--- a/tests/perf/test_semantic_segmentation.py
+++ b/tests/perf/test_semantic_segmentation.py
@@ -31,8 +31,6 @@ class TestPerfSemanticSegmentation(PerfTestBase):
             name=f"kvasir_small_{idx}",
             path=Path("semantic_seg/kvasir_small") / f"{idx}",
             group="small",
-            data_format="common_semantic_segmentation_with_subset_dirs",
-            num_classes=2,
             num_repeat=5,
             extra_overrides={},
         )
@@ -42,8 +40,6 @@ class TestPerfSemanticSegmentation(PerfTestBase):
             name="kvasir_medium",
             path=Path("semantic_seg/kvasir_medium"),
             group="medium",
-            data_format="common_semantic_segmentation_with_subset_dirs",
-            num_classes=2,
             num_repeat=5,
             extra_overrides={},
         ),
@@ -51,8 +47,6 @@ class TestPerfSemanticSegmentation(PerfTestBase):
             name="kvasir_large",
             path=Path("semantic_seg/kvasir_large"),
             group="large",
-            data_format="common_semantic_segmentation_with_subset_dirs",
-            num_classes=2,
             num_repeat=5,
             extra_overrides={},
         ),
diff --git a/tests/perf/test_visual_prompting.py b/tests/perf/test_visual_prompting.py
index 71b3d00429d..64da6d6e5aa 100644
--- a/tests/perf/test_visual_prompting.py
+++ b/tests/perf/test_visual_prompting.py
@@ -26,8 +26,6 @@ class TestPerfVisualPrompting(PerfTestBase):
             name=f"wgisd_small_{idx}",
             path=Path("visual_prompting/wgisd_small") / f"{idx}",
             group="small",
-            data_format="coco",
-            num_classes=5,
             num_repeat=5,
             extra_overrides={},
         )
@@ -37,8 +35,6 @@ class TestPerfVisualPrompting(PerfTestBase):
             name="coco_car_person_medium",
             path=Path("visual_prompting/coco_car_person_medium"),
             group="medium",
-            data_format="coco",
-            num_classes=2,
             num_repeat=5,
             extra_overrides={},
         ),
@@ -46,8 +42,6 @@ class TestPerfVisualPrompting(PerfTestBase):
             name="vitens_coliform",
             path=Path("visual_prompting/Vitens-Coliform-coco"),
             group="large",
-            data_format="coco",
-            num_classes=1,
             num_repeat=5,
             extra_overrides={},
         ),
@@ -105,8 +99,6 @@ class TestPerfZeroShotVisualPrompting(PerfTestBase):
             name="coco_car_person_medium_datumaro",
             path=Path("zero_shot_visual_prompting/coco_car_person_medium_datumaro"),
             group="medium",
-            data_format="datumaro",
-            num_classes=2,
             num_repeat=5,
             extra_overrides={"max_epochs": "1"},
         ),
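
Note (illustration only, not part of the patch): the `_rename_raw_data` rework above treats ordinary keys in `replaces` as substring replacements on the raw metrics.csv column names, while the special `{pre}` key prefixes any column that is not already namespaced by the given stage. A minimal sketch of that behavior follows; the sample column names are made up for illustration, and the `{post}` branch is omitted.

    # Mirrors the _rename_col closure added in tests/perf/benchmark.py.
    replaces = {"test_": "test/", "{pre}": "test/"}  # same mapping the patch passes after the torch test stage

    def rename_col(col_name: str) -> str:
        for src_str, dst_str in replaces.items():
            if src_str == "{pre}":
                if not col_name.startswith(dst_str):
                    col_name = dst_str + col_name  # prefix columns that lack the stage namespace
            else:
                col_name = col_name.replace(src_str, dst_str)  # plain substring replacement
        return col_name

    print([rename_col(c) for c in ["test_f1-score", "iter_time", "test/epoch"]])
    # -> ['test/f1-score', 'test/iter_time', 'test/epoch']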