From ace7f64234edb9f45c3557c7e969fb003c60b01f Mon Sep 17 00:00:00 2001 From: Songki Choi Date: Tue, 5 Mar 2024 10:05:18 +0900 Subject: [PATCH] Add perf benchmark tests for v2 (#3004) * Add perf tests for all tasks * Update perf GH workflow * Parameterize accuracy | efficiency tests * Log raw data csv * Subprocess run & csv summary * Align with v1 --- .github/workflows/perf_accuracy.yaml | 96 ++++++ .github/workflows/perf_efficiency.yaml | 80 +++++ .github/workflows/perf_test.yaml | 31 -- tests/perf/__init__.py | 4 + tests/perf/benchmark.py | 246 ++++++++++++++++ tests/perf/conftest.py | 357 +++++++++++++++++++++++ tests/perf/test_anomaly.py | 4 + tests/perf/test_classification.py | 296 +++++++++++++++++++ tests/perf/test_detection.py | 126 ++++++++ tests/perf/test_instance_segmentation.py | 217 ++++++++++++++ tests/perf/test_semantic_segmentation.py | 111 +++++++ tests/perf/test_visual_prompting.py | 178 +++++++++++ tox.ini | 4 +- 13 files changed, 1717 insertions(+), 33 deletions(-) create mode 100644 .github/workflows/perf_accuracy.yaml create mode 100644 .github/workflows/perf_efficiency.yaml delete mode 100644 .github/workflows/perf_test.yaml create mode 100644 tests/perf/__init__.py create mode 100644 tests/perf/benchmark.py create mode 100644 tests/perf/conftest.py create mode 100644 tests/perf/test_anomaly.py create mode 100644 tests/perf/test_classification.py create mode 100644 tests/perf/test_detection.py create mode 100644 tests/perf/test_instance_segmentation.py create mode 100644 tests/perf/test_semantic_segmentation.py create mode 100644 tests/perf/test_visual_prompting.py diff --git a/.github/workflows/perf_accuracy.yaml b/.github/workflows/perf_accuracy.yaml new file mode 100644 index 00000000000..68693179118 --- /dev/null +++ b/.github/workflows/perf_accuracy.yaml @@ -0,0 +1,96 @@ +name: Perf-Accuracy Benchmark + +on: + workflow_dispatch: # run on request (no need for PR) + inputs: + model-category: + type: choice + description: Model category to run benchmark + options: + - default # speed, balance, accuracy models only + - all # default + other models + default: default + data-size: + type: choice + description: Dataset size to run benchmark + options: + - small + - medium + - large + - all + default: all + num-repeat: + description: Overrides default per-data-size number of repeat setting + default: 0 + num-epoch: + description: Overrides default per-model number of epoch setting + default: 0 + eval-upto: + type: choice + description: The last operation to evaluate. 'optimize' means all. + options: + - train + - export + - optimize + default: optimize + pytest-args: + type: string + description: | + Additional perf-benchmark pytest arguments. + "-k detection" -> detection task only + "--dry-run" -> print command w/o execution. + data-root: + type: string + description: Root directory containing validation data in CI server. 
+ default: "/home/validation/data/v2/" + +jobs: + Perf-Accuracy-Benchmark: + strategy: + fail-fast: false + matrix: + include: + - task-short: "ano" + task: "anomaly" + - task-short: "cls" + task: "classification" + - task-short: "det" + task: "detection" + - task-short: "isg" + task: "instance_segmentation" + - task-short: "ssg" + task: "semantic_segmentation" + - task-short: "vsp" + task: "visual_prompting" + name: Perf-Accuracy-Benchmark-${{ matrix.task-short }} + runs-on: [self-hosted, linux, x64, dmount] + timeout-minutes: 8640 + steps: + - name: Checkout repository + uses: actions/checkout@v3 + - name: Install Python + uses: actions/setup-python@v4 + with: + python-version: "3.10" + - name: Install tox + run: python -m pip install tox + - name: Run Performance Test + run: > + tox -vv -e perf-benchmark -- tests/perf/test_${{ matrix.task }}.py ${{ inputs.pytest-args }} + --benchmark-type accuracy + --model-category ${{ inputs.model-category }} + --data-root ${{ inputs.data-root }} + --data-size ${{ inputs.data-size }} + --num-repeat ${{ inputs.num-repeat }} + --num-epoch ${{ inputs.num-epoch }} + --eval-upto ${{ inputs.eval-upto }} + --summary-csv .tox/perf-accuracy-benchmark-${{ matrix.task-short }}.csv + --mlflow-tracking-uri ${{ vars.MLFLOW_TRACKING_SERVER_URI }} + --user-name ${{ github.triggering_actor }} + - name: Upload test results + uses: actions/upload-artifact@v3 + with: + name: perf-accuracy-benchmark-${{ matrix.task-short }} + path: .tox/perf-*.csv + # Use always() to always run this step to publish test results when there are test failures + if: ${{ always() }} diff --git a/.github/workflows/perf_efficiency.yaml b/.github/workflows/perf_efficiency.yaml new file mode 100644 index 00000000000..070b3c0837b --- /dev/null +++ b/.github/workflows/perf_efficiency.yaml @@ -0,0 +1,80 @@ +name: Perf-Efficiency Benchmark + +on: + workflow_dispatch: # run on request (no need for PR) + inputs: + model-category: + type: choice + description: Model category to run benchmark + options: + - default # speed, balance, accuracy models only + - all # default + other models + default: default + data-size: + type: choice + description: Dataset size to run benchmark + options: + - small + - medium + - large + - all + default: medium + num-repeat: + description: Overrides default per-data-size number of repeat setting + default: 1 + num-epoch: + description: Overrides default per-model number of epoch setting + default: 2 + eval-upto: + type: choice + description: The last operation to evaluate. 'optimize' means all. + options: + - train + - export + - optimize + default: optimize + pytest-args: + type: string + description: | + Additional perf-benchmark pytest arguments. + "-k detection" -> detection task only + "--dry-run" -> print command w/o execution. + data-root: + type: string + description: Root directory containing validation data in CI server. 
+        default: "/home/validation/data/v2/"
+
+jobs:
+  Perf-Efficiency-Benchmark:
+    name: Perf-Efficiency-Benchmark-all
+    runs-on: [self-hosted, linux, x64, dmount]
+    timeout-minutes: 8640
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v3
+      - name: Install Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: "3.10"
+      - name: Install tox
+        run: python -m pip install tox
+      - name: Run Performance Test
+        run: >
+          tox -vv -e perf-benchmark -- tests/perf ${{ inputs.pytest-args }}
+          --benchmark-type efficiency
+          --model-category ${{ inputs.model-category }}
+          --data-root ${{ inputs.data-root }}
+          --data-size ${{ inputs.data-size }}
+          --num-repeat ${{ inputs.num-repeat }}
+          --num-epoch ${{ inputs.num-epoch }}
+          --eval-upto ${{ inputs.eval-upto }}
+          --summary-csv .tox/perf-efficiency-benchmark-all.csv
+          --mlflow-tracking-uri ${{ vars.MLFLOW_TRACKING_SERVER_URI }}
+          --user-name ${{ github.triggering_actor }}
+      - name: Upload test results
+        uses: actions/upload-artifact@v3
+        with:
+          name: perf-efficiency-benchmark-all
+          path: .tox/perf-*.csv
+        # Use always() to always run this step to publish test results when there are test failures
+        if: ${{ always() }}
diff --git a/.github/workflows/perf_test.yaml b/.github/workflows/perf_test.yaml
deleted file mode 100644
index 0a210f66dec..00000000000
--- a/.github/workflows/perf_test.yaml
+++ /dev/null
@@ -1,31 +0,0 @@
-name: V2 Performance Test
-
-on:
-  schedule:
-    - cron: "0 16 * * 1-5" # currently run daily on 16:00 UTC; [TODO]: run weekly
-  workflow_dispatch: # run on request
-
-jobs:
-  Performance-Test:
-    runs-on: [self-hosted, linux, x64, dmount]
-    strategy:
-      fail-fast: false
-      matrix:
-        include:
-          - python-version: "3.10"
-            tox-env: "py310"
-    name: Performance-Test-Py${{ matrix.python-version }}
-    concurrency:
-      group: ${{ github.workflow }}-Performance-${{ github.event.pull_request.number || github.ref }}
-      cancel-in-progress: true
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v3
-      - name: Install Python
-        uses: actions/setup-python@v4
-        with:
-          python-version: ${{ matrix.python-version }}
-      - name: Install tox
-        run: python -m pip install tox
-      - name: Run Performance Test
-        run: tox -vv -e performance-test -- --mlflow-tracking-uri ${{ vars.MLFLOW_TRACKING_SERVER_URI }} --user-name ${{ vars.USER_NAME }} --dataset-root-dir /home/validation/data/v2
diff --git a/tests/perf/__init__.py b/tests/perf/__init__.py
new file mode 100644
index 00000000000..d832bb41bf2
--- /dev/null
+++ b/tests/perf/__init__.py
@@ -0,0 +1,4 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+"""OTX performance benchmark tests."""
diff --git a/tests/perf/benchmark.py b/tests/perf/benchmark.py
new file mode 100644
index 00000000000..29974b1cb35
--- /dev/null
+++ b/tests/perf/benchmark.py
@@ -0,0 +1,246 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+"""OTX benchmark runner."""
+
+from __future__ import annotations
+
+import gc
+import logging
+import os
+import subprocess
+from dataclasses import dataclass
+from pathlib import Path
+
+import pandas as pd
+
+log = logging.getLogger(__name__)
+
+
+class Benchmark:
+    """Benchmark runner for OTX 2.x.
+
+    Args:
+        benchmark_type (str): 'accuracy' or 'efficiency'
+        data_root (str): Path to the root of dataset directories. Defaults to './data'.
+        output_root (str): Output root directory for logs and results. Defaults to './otx-benchmark'.
+        criteria (list[Criterion]): Benchmark criteria settings
+        num_epoch (int): Overrides the per-model default number of epoch settings.
+            Defaults to 0, which means no overriding.
+        num_repeat (int): Number of trials with different random seeds, which would be set
+            as range(0, num_repeat). Defaults to 1.
+        eval_upto (str): The last serial operation to evaluate. Choose one of ('train', 'export', 'optimize').
+            Operations include the preceding ones.
+            e.g. eval up to 'optimize': train -> eval -> export -> eval -> optimize -> eval
+            Defaults to 'train'.
+        tags (dict, optional): Key-value pair metadata for the experiment.
+        dry_run (bool): Whether to just print the OTX command without execution. Defaults to False.
+        deterministic (bool): Whether to turn on deterministic training mode. Defaults to False.
+        accelerator (str): Accelerator device on which to run the benchmark. Defaults to 'gpu'.
+    """
+
+    @dataclass
+    class Model:
+        """Benchmark model."""
+
+        task: str
+        name: str
+        category: str
+
+    @dataclass
+    class Dataset:
+        """Benchmark dataset."""
+
+        name: str
+        path: Path
+        size: str
+        data_format: str
+        num_classes: int
+        num_repeat: int = 1
+        extra_overrides: dict | None = None
+
+    @dataclass
+    class Criterion:
+        """Benchmark criterion."""
+
+        name: str
+        summary: str
+        compare: str
+        margin: float
+
+    def __init__(
+        self,
+        benchmark_type: str = "accuracy",
+        data_root: Path = Path("data"),
+        output_root: Path = Path("otx-benchmark"),
+        criteria: list[Criterion] | None = None,
+        num_epoch: int = 0,
+        num_repeat: int = 1,
+        eval_upto: str = "train",
+        tags: dict[str, str] | None = None,
+        dry_run: bool = False,
+        deterministic: bool = False,
+        accelerator: str = "gpu",
+    ):
+        self.benchmark_type = benchmark_type
+        self.data_root = data_root
+        self.output_root = output_root
+        self.criteria = criteria
+        self.num_epoch = num_epoch
+        self.num_repeat = num_repeat
+        self.eval_upto = eval_upto
+        self.tags = tags or {}
+        self.dry_run = dry_run
+        self.deterministic = deterministic
+        self.accelerator = accelerator
+
+        if num_epoch == 0 and benchmark_type == "efficiency":
+            self.num_epoch = 2
+
+    def run(
+        self,
+        model: Model,
+        dataset: Dataset,
+    ) -> pd.DataFrame | None:
+        """Run configured benchmark with given dataset and model and return the result.
+
+        Args:
+            model (Model): Target model settings
+            dataset (Dataset): Target dataset settings
+
+        Returns:
+            pd.DataFrame | None: Table with benchmark metrics
+        """
+
+        run_name = f"{self.benchmark_type}/{model.task}/{model.name}/{dataset.name}"
+        log.info(f"{run_name = }")
+        work_dir = self.output_root / run_name
+        data_root = self.data_root / dataset.path
+
+        tags = {
+            "benchmark": self.benchmark_type,
+            "task": model.task,
+            "data_size": dataset.size,
+            "model": model.name,
+            "dataset": dataset.name,
+            **self.tags,
+        }
+
+        num_repeat = dataset.num_repeat
+        if self.num_repeat > 0:
+            num_repeat = self.num_repeat  # Override by global setting
+
+        for seed in range(num_repeat):
+            sub_work_dir = work_dir / str(seed)
+            tags["seed"] = str(seed)
+
+            # Train & test
+            command = [
+                "otx",
+                "train",
+                "--config",
+                f"src/otx/recipe/{model.task}/{model.name}.yaml",
+                "--data_root",
+                str(data_root),
+                "--work_dir",
+                str(sub_work_dir),
+                "--model.num_classes",
+                str(dataset.num_classes),
+                "--data.config.data_format",
+                dataset.data_format,
+                "--engine.device",
+                self.accelerator,
+            ]
+            for key, value in dataset.extra_overrides.items():
+                command.append(f"--{key}")
+                command.append(str(value))
+            command.extend(["--seed", str(seed)])
+            command.extend(["--deterministic", str(self.deterministic)])
+            if self.num_epoch > 0:
+                command.extend(["--max_epochs", str(self.num_epoch)])
+            self._run_command(command)
+
+            command = [
+                "otx",
+                "test",
+                "--work_dir",
+                str(sub_work_dir),
+            ]
+            self._run_command(command)
+
+            # Export & test
+            # Optimize & test
+
+            self._log_metrics(work_dir=sub_work_dir, tags=tags)
+
+            # Force memory clean up
+            gc.collect()
+
+        return self.load_result(work_dir)
+
+    def _run_command(self, command: list[str]) -> None:
+        if self.dry_run:
+            print(" ".join(command))
+        else:
+            subprocess.run(command, check=True)  # noqa: S603
+
+    def _log_metrics(self, work_dir: Path, tags: dict[str, str]) -> None:
+        if not work_dir.exists():
+            return
+        # Load raw metrics
+        csv_files = work_dir.glob("**/metrics.csv")
+        raw_data = [pd.read_csv(csv_file) for csv_file in csv_files]
+        raw_data = pd.concat(raw_data, ignore_index=True)
+        # Summarize
+        metrics = []
+        for criterion in self.criteria:
+            if criterion.name not in raw_data:
+                continue
+            column = raw_data[criterion.name].dropna()
+            if len(column) == 0:
+                continue
+            if criterion.summary == "mean":
+                value = column[min(1, len(column) - 1) :].mean()  # Drop 1st epoch if possible
+            elif criterion.summary == "max":
+                value = column.max()
+            elif criterion.summary == "min":
+                value = column.min()
+            else:
+                value = 0.0
+            metrics.append(pd.Series([value], name=criterion.name))
+        if len(metrics) == 0:
+            return
+        metrics = pd.concat(metrics, axis=1)
+        # Write csv w/ tags
+        for k, v in tags.items():
+            metrics[k] = v
+        metrics.to_csv(work_dir / "benchmark.raw.csv", index=False)
+
+    @staticmethod
+    def load_result(result_path: Path) -> pd.DataFrame | None:
+        """Load benchmark results recursively and merge as pd.DataFrame.
+
+        Args:
+            result_path (Path): Result directory or specific file.
+ + Retruns: + pd.DataFrame: Table with benchmark metrics & options + """ + if not result_path.exists(): + return None + # Load csv data + csv_files = result_path.glob("**/benchmark.raw.csv") if result_path.is_dir() else [result_path] + results = [pd.read_csv(csv_file) for csv_file in csv_files] + if len(results) == 0: + return None + # Merge data + data = pd.concat(results, ignore_index=True) + # Average by unique group + grouped = data.groupby(["benchmark", "task", "data_size", "model"]) + aggregated = grouped.mean(numeric_only=True) + # Merge tag columns (non-numeric & non-index) + tag_columns = set(data.columns) - set(aggregated.columns) - set(grouped.keys) + for col in tag_columns: + # Take common string prefix such as: ["data/1", "data/2", "data/3"] -> "data/" + aggregated[col] = grouped[col].agg(lambda x: os.path.commonprefix(x.tolist())) + return aggregated diff --git a/tests/perf/conftest.py b/tests/perf/conftest.py new file mode 100644 index 00000000000..3c4fa356846 --- /dev/null +++ b/tests/perf/conftest.py @@ -0,0 +1,357 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +import logging +import os +import platform +import subprocess +from datetime import datetime, timedelta, timezone +from pathlib import Path +from urllib.parse import urlparse + +import pytest +from cpuinfo import get_cpu_info + +from .benchmark import Benchmark + +log = logging.getLogger(__name__) + + +def pytest_addoption(parser): + """Add custom options for perf tests.""" + parser.addoption( + "--benchmark-type", + action="store", + default="accuracy", + choices=("accuracy", "efficiency", "all"), + help="Choose accuracy|efficiency|all. Defaults to accuracy.", + ) + parser.addoption( + "--model-category", + action="store", + default="all", + choices=("default", "all"), + help="Choose default|all. Defaults to all.", + ) + parser.addoption( + "--data-size", + action="store", + default="all", + choices=("small", "medium", "large", "all"), + help="Choose small|medium|large|all. Defaults to all.", + ) + parser.addoption( + "--num-repeat", + action="store", + default=0, + help="Overrides default per-data-size number of repeat setting. " + "Random seeds are set to 0 ~ num_repeat-1 for the trials. " + "Defaults to 0 (small=3, medium=3, large=1).", + ) + parser.addoption( + "--num-epoch", + action="store", + default=0, + help="Overrides default per-model number of epoch setting. " + "Defaults to 0 (per-model epoch & early-stopping).", + ) + parser.addoption( + "--eval-upto", + action="store", + default="train", + choices=("train", "export", "optimize"), + help="Choose train|export|optimize. Defaults to train.", + ) + parser.addoption( + "--data-root", + action="store", + default="data", + help="Dataset root directory.", + ) + parser.addoption( + "--output-root", + action="store", + help="Output root directory. Defaults to temp directory.", + ) + parser.addoption( + "--summary-csv", + action="store", + help="Path to output summary cvs file. 
Defaults to {output-root}/benchmark-summary.csv", + ) + parser.addoption( + "--dry-run", + action="store_true", + default=False, + help="Print OTX commands without execution.", + ) + parser.addoption( + "--deterministic", + action="store_true", + default=False, + help="Turn on deterministic training.", + ) + parser.addoption( + "--user-name", + type=str, + default="anonymous", + help='Sign-off the user name who launched the regression tests this time, e.g., `--user-name "John Doe"`.', + ) + parser.addoption( + "--mlflow-tracking-uri", + type=str, + help="URI for MLFlow Tracking server to store the regression test results.", + ) + + +@pytest.fixture(scope="session") +def fxt_benchmark_type(request: pytest.FixtureRequest) -> str: + """Select benchmark type.""" + benchmark_type: str = request.config.getoption("--benchmark-type") + msg = f"{benchmark_type = }" + log.info(msg) + return benchmark_type + + +@pytest.fixture(scope="session") +def fxt_model_category(request: pytest.FixtureRequest) -> str: + """Model category to run the benchmark.""" + model_category = request.config.getoption("--model-category") + msg = f"{model_category = }" + log.info(msg) + return model_category + + +@pytest.fixture(scope="session") +def fxt_data_size(request: pytest.FixtureRequest) -> str: + """Data size to run the benchmark.""" + data_size = request.config.getoption("--data-size") + msg = f"{data_size = }" + log.info(msg) + return data_size + + +@pytest.fixture(scope="session") +def fxt_num_repeat(request: pytest.FixtureRequest) -> int: + """Number of repeated run with different random seed.""" + num_repeat = int(request.config.getoption("--num-repeat")) + msg = f"{num_repeat = }" + log.info(msg) + return num_repeat + + +@pytest.fixture(scope="session") +def fxt_num_epoch(request: pytest.FixtureRequest) -> int: + """Number of epochs to train models.""" + num_epoch = int(request.config.getoption("--num-epoch")) + msg = f"{num_epoch = }" + log.info(msg) + return num_epoch + + +@pytest.fixture(scope="session") +def fxt_eval_upto(request: pytest.FixtureRequest) -> str: + """Last operation to evaluate ~ train|export|optimize.""" + eval_upto = request.config.getoption("--eval-upto") + msg = f"{eval_upto = }" + log.info(msg) + return eval_upto + + +@pytest.fixture(scope="session") +def fxt_data_root(request: pytest.FixtureRequest) -> Path: + """Dataset root directory path.""" + data_root = Path(request.config.getoption("--data-root")) + msg = f"{data_root = }" + log.info(msg) + return data_root + + +@pytest.fixture(scope="session") +def fxt_output_root(request: pytest.FixtureRequest, tmp_path_factory: pytest.TempPathFactory) -> Path: + """Output root + date + short commit hash.""" + output_root = request.config.getoption("--output-root") + if output_root is None: + output_root = tmp_path_factory.mktemp("otx-benchmark") + tz = timezone(offset=timedelta(hours=9), name="Seoul") + date_str = datetime.now(tz=tz).strftime("%Y%m%d-%H%M%S") + output_root = Path(output_root) / date_str + msg = f"{output_root = }" + log.info(msg) + return output_root + + +@pytest.fixture(scope="session") +def fxt_version_tags() -> dict[str, str]: + """Version / branch / commit info.""" + import otx + + version_str = otx.__version__ + try: + branch_str = subprocess.check_output(["git", "rev-parse", "--abbrev-ref", "HEAD"]).decode("ascii").strip() # noqa: S603, S607 + except Exception: + branch_str = os.environ.get("GH_CTX_REF_NAME", "unknown") + try: + commit_str = subprocess.check_output(["git", "rev-parse", "--short", 
"HEAD"]).decode("ascii").strip() # noqa: S603, S607 + except Exception: + commit_str = os.environ.get("GH_CTX_SHA", "unknown") + version_tags = { + "version": version_str, + "branch": branch_str, + "commit": commit_str, + } + msg = f"{version_tags = }" + log.info(msg) + return version_tags + + +@pytest.fixture(scope="session") +def fxt_summary_csv(request: pytest.FixtureRequest, fxt_output_root: Path) -> Path: + """Path to benchmark result summary csv file.""" + summary_csv = request.config.getoption("--summary-csv") + if summary_csv is None: + summary_csv = fxt_output_root / "benchmark-summary.csv" + msg = f"{summary_csv = }" + log.info(msg) + return summary_csv + + +@pytest.fixture(scope="session") +def fxt_dry_run(request: pytest.FixtureRequest) -> str: + """Option to print OTX commands without execution.""" + dry_run = request.config.getoption("--dry-run") + msg = f"{dry_run = }" + log.info(msg) + return dry_run + + +@pytest.fixture(scope="session") +def fxt_deterministic(request: pytest.FixtureRequest) -> str: + """Option to turn on deterministic training.""" + deterministic = request.config.getoption("--deterministic") + msg = f"{deterministic = }" + log.info(msg) + return deterministic + + +@pytest.fixture(scope="session") +def fxt_user_name(request: pytest.FixtureRequest) -> str: + """User name to sign off the regression test execution.""" + user_name = request.config.getoption("--user-name") + msg = f"{user_name = }" + log.info(msg) + return user_name + + +@pytest.fixture(scope="session") +def fxt_mlflow_tracking_uri(request: pytest.FixtureRequest) -> str: + """MLFLow tracking server URI.""" + mlflow_tracking_uri = urlparse( + request.config.getoption("--mlflow-tracking-uri"), + ).geturl() + msg = f"{mlflow_tracking_uri = }" + log.info(msg) + return mlflow_tracking_uri + + +@pytest.fixture() +def fxt_model(request: pytest.FixtureRequest, fxt_model_category) -> Benchmark.Model: + """Skip models according to user options.""" + model: Benchmark.Model = request.param + if fxt_model_category == "default" and model.category == "other": + pytest.skip(f"{model.category} category model") + return model + + +@pytest.fixture() +def fxt_dataset(request: pytest.FixtureRequest, fxt_data_size) -> Benchmark.Data: + """Skip datasets according to user options.""" + dataset: Benchmark.Dataset = request.param + if fxt_data_size not in {"all", dataset.size}: + pytest.skip(f"{dataset.size} size dataset") + return dataset + + +@pytest.fixture(scope="session") +def fxt_tags(fxt_user_name: str, fxt_version_tags: dict[str, str]) -> dict[str, str]: + """Tag fields to record the machine and user executing this perf test.""" + tags = { + **fxt_version_tags, + "user_name": fxt_user_name, + "machine_name": platform.node(), + "cpu_info": get_cpu_info()["brand_raw"], + "accelerator_info": subprocess.check_output( + ["nvidia-smi", "-L"], # noqa: S603, S607 + ) + .decode() + .strip(), + } + msg = f"{tags = }" + log.info(msg) + return tags + + +@pytest.fixture() +def fxt_benchmark( + request: pytest.FixtureRequest, + fxt_benchmark_type: str, + fxt_data_root: Path, + fxt_output_root: Path, + fxt_num_epoch: int, + fxt_num_repeat: int, + fxt_eval_upto: str, + fxt_tags: dict[str, str], + fxt_dry_run: bool, + fxt_deterministic: bool, + fxt_accelerator: str, +) -> Benchmark: + """Configure benchmark.""" + benchmark_type: str = request.param["type"] + if fxt_benchmark_type not in {"all", benchmark_type}: + pytest.skip(f"{benchmark_type} benchmark") + + return Benchmark( + benchmark_type=benchmark_type, + 
data_root=fxt_data_root, + output_root=fxt_output_root, + criteria=request.param["criteria"], + num_epoch=fxt_num_epoch, + num_repeat=fxt_num_repeat, + eval_upto=fxt_eval_upto, + tags=fxt_tags, + dry_run=fxt_dry_run, + deterministic=fxt_deterministic, + accelerator=fxt_accelerator, + ) + + +@pytest.fixture(scope="session", autouse=True) +def fxt_benchmark_summary( + fxt_output_root: Path, + fxt_summary_csv: Path, +): + """Summarize all results at the end of test session.""" + yield + all_results = Benchmark.load_result(fxt_output_root) + if all_results is not None: + print("=" * 20, "[Benchmark summary]") + print(all_results) + all_results.to_csv(fxt_summary_csv) + print(f" -> Saved to {fxt_summary_csv}.") + + +class PerfTestBase: + """Base perf test structure.""" + + def _test_perf( + self, + model: Benchmark.Model, + dataset: Benchmark.Dataset, + benchmark: Benchmark, + ) -> None: + result = benchmark.run( + model=model, + dataset=dataset, + ) + print(result) diff --git a/tests/perf/test_anomaly.py b/tests/perf/test_anomaly.py new file mode 100644 index 00000000000..04c30c8a647 --- /dev/null +++ b/tests/perf/test_anomaly.py @@ -0,0 +1,4 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +"""OTX anomaly perfomance benchmark tests.""" diff --git a/tests/perf/test_classification.py b/tests/perf/test_classification.py new file mode 100644 index 00000000000..9b10b4772d9 --- /dev/null +++ b/tests/perf/test_classification.py @@ -0,0 +1,296 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +"""OTX classification perfomance benchmark tests.""" + +from __future__ import annotations + +from pathlib import Path + +import pytest + +from .benchmark import Benchmark +from .conftest import PerfTestBase + + +class TestPerfSingleLabelClassification(PerfTestBase): + """Benchmark single-label classification.""" + + MODEL_TEST_CASES = [ # noqa: RUF012 + Benchmark.Model(task="classification/multi_class_cls", name="efficientnet_b0_light", category="speed"), + Benchmark.Model(task="classification/multi_class_cls", name="efficientnet_v2_light", category="balance"), + Benchmark.Model(task="classification/multi_class_cls", name="mobilenet_v3_large_light", category="accuracy"), + Benchmark.Model(task="classification/multi_class_cls", name="otx_deit_tiny", category="other"), + ] + + DATASET_TEST_CASES = [ + Benchmark.Dataset( + name=f"multiclass_CUB_small_{idx}", + path=Path("multiclass_classification/multiclass_CUB_small") / f"{idx}", + size="small", + data_format="imagenet_with_subset_dirs", + num_classes=2, + num_repeat=3, + extra_overrides={}, + ) + for idx in (1, 2, 3) + ] + [ + Benchmark.Dataset( + name="multiclass_CUB_medium", + path=Path("multiclass_classification/multiclass_CUB_medium"), + size="medium", + data_format="imagenet_with_subset_dirs", + num_classes=67, + num_repeat=3, + extra_overrides={}, + ), + Benchmark.Dataset( + name="multiclass_food101_large", + path=Path("multiclass_classification/multiclass_food101_large"), + size="large", + data_format="imagenet_with_subset_dirs", + num_classes=20, + num_repeat=1, + extra_overrides={}, + ), + ] + + BENCHMARK_TEST_CASES = [ # noqa: RUF012 + { + "type": "accuracy", + "criteria": [ + Benchmark.Criterion(name="epoch", summary="max", compare="<", margin=0.1), + Benchmark.Criterion(name="val/accuracy", summary="max", compare=">", margin=0.1), + Benchmark.Criterion(name="test/accuracy", summary="max", compare=">", margin=0.1), + Benchmark.Criterion(name="export/accuracy", summary="max", 
compare=">", margin=0.1), + Benchmark.Criterion(name="optimize/accuracy", summary="max", compare=">", margin=0.1), + ], + }, + { + "type": "efficiency", + "criteria": [ + Benchmark.Criterion(name="train/iter_time", summary="mean", compare="<", margin=0.1), + Benchmark.Criterion(name="test/iter_time", summary="mean", compare="<", margin=0.1), + Benchmark.Criterion(name="export/iter_time", summary="mean", compare="<", margin=0.1), + Benchmark.Criterion(name="optimize/iter_time", summary="mean", compare="<", margin=0.1), + ], + }, + ] + + @pytest.mark.parametrize( + "fxt_model", + MODEL_TEST_CASES, + ids=lambda model: model.name, + indirect=True, + ) + @pytest.mark.parametrize( + "fxt_dataset", + DATASET_TEST_CASES, + ids=lambda dataset: dataset.name, + indirect=True, + ) + @pytest.mark.parametrize( + "fxt_benchmark", + BENCHMARK_TEST_CASES, + ids=lambda benchmark: benchmark["type"], + indirect=True, + ) + def test_perf( + self, + fxt_model: Benchmark.Model, + fxt_dataset: Benchmark.Dataset, + fxt_benchmark: Benchmark, + ): + self._test_perf( + model=fxt_model, + dataset=fxt_dataset, + benchmark=fxt_benchmark, + ) + + +class TestPerfMultiLabelClassification(PerfTestBase): + """Benchmark multi-label classification.""" + + MODEL_TEST_CASES = [ # noqa: RUF012 + Benchmark.Model(task="classification/multi_label_cls", name="efficientnet_b0_light", category="speed"), + Benchmark.Model(task="classification/multi_label_cls", name="efficientnet_v2_light", category="balance"), + Benchmark.Model(task="classification/multi_label_cls", name="mobilenet_v3_large_light", category="accuracy"), + Benchmark.Model(task="classification/multi_label_cls", name="otx_deit_tiny", category="other"), + ] + + DATASET_TEST_CASES = [ + Benchmark.Dataset( + name=f"multilabel_CUB_small_{idx}", + path=Path("multilabel_classification/multilabel_CUB_small") / f"{idx}", + size="small", + data_format="datumaro", + num_classes=3, + num_repeat=3, + extra_overrides={}, + ) + for idx in (1, 2, 3) + ] + [ + Benchmark.Dataset( + name="multilabel_CUB_medium", + path=Path("multilabel_classification/multilabel_CUB_medium"), + size="medium", + data_format="datumaro", + num_classes=68, + num_repeat=3, + extra_overrides={}, + ), + Benchmark.Dataset( + name="multilabel_food101_large", + path=Path("multilabel_classification/multilabel_food101_large"), + size="large", + data_format="datumaro", + num_classes=21, + num_repeat=1, + extra_overrides={}, + ), + ] + + BENCHMARK_TEST_CASES = [ # noqa: RUF012 + { + "type": "accuracy", + "criteria": [ + Benchmark.Criterion(name="epoch", summary="max", compare="<", margin=0.1), + Benchmark.Criterion(name="val/accuracy", summary="max", compare=">", margin=0.1), + Benchmark.Criterion(name="test/accuracy", summary="max", compare=">", margin=0.1), + Benchmark.Criterion(name="export/accuracy", summary="max", compare=">", margin=0.1), + Benchmark.Criterion(name="optimize/accuracy", summary="max", compare=">", margin=0.1), + ], + }, + { + "type": "efficiency", + "criteria": [ + Benchmark.Criterion(name="train/iter_time", summary="mean", compare="<", margin=0.1), + Benchmark.Criterion(name="test/iter_time", summary="mean", compare="<", margin=0.1), + Benchmark.Criterion(name="export/iter_time", summary="mean", compare="<", margin=0.1), + Benchmark.Criterion(name="optimize/iter_time", summary="mean", compare="<", margin=0.1), + ], + }, + ] + + @pytest.mark.parametrize( + "fxt_model", + MODEL_TEST_CASES, + ids=lambda model: model.name, + indirect=True, + ) + @pytest.mark.parametrize( + "fxt_dataset", + 
DATASET_TEST_CASES, + ids=lambda dataset: dataset.name, + indirect=True, + ) + @pytest.mark.parametrize( + "fxt_benchmark", + BENCHMARK_TEST_CASES, + ids=lambda benchmark: benchmark["type"], + indirect=True, + ) + def test_perf( + self, + fxt_model: Benchmark.Model, + fxt_dataset: Benchmark.Dataset, + fxt_benchmark: Benchmark, + ): + self._test_perf( + model=fxt_model, + dataset=fxt_dataset, + benchmark=fxt_benchmark, + ) + + +class TestPerfHierarchicalLabelClassification(PerfTestBase): + """Benchmark hierarchical-label classification.""" + + MODEL_TEST_CASES = [ # noqa: RUF012 + Benchmark.Model(task="classification/h_label_cls", name="efficientnet_b0_light", category="speed"), + Benchmark.Model(task="classification/h_label_cls", name="efficientnet_v2_light", category="balance"), + Benchmark.Model(task="classification/h_label_cls", name="mobilenet_v3_large_light", category="accuracy"), + Benchmark.Model(task="classification/h_label_cls", name="otx_deit_tiny", category="other"), + ] + + DATASET_TEST_CASES = [ + Benchmark.Dataset( + name=f"hlabel_CUB_small_{idx}", + path=Path("hlabel_classification/hlabel_CUB_small") / f"{idx}", + size="small", + data_format="datumaro", + num_classes=6, + num_repeat=3, + extra_overrides={ + "model.num_multiclass_heads": "3", + "model.num_multilabel_classes": "0", + }, + ) + for idx in (1, 2, 3) + ] + [ + Benchmark.Dataset( + name="hlabel_CUB_medium", + path=Path("hlabel_classification/hlabel_CUB_medium"), + size="medium", + data_format="datumaro", + num_classes=102, + num_repeat=3, + extra_overrides={ + "model.num_multiclass_heads": "23", + "model.num_multilabel_classes": "0", + }, + ), + # Add large dataset + ] + + BENCHMARK_TEST_CASES = [ # noqa: RUF012 + { + "type": "accuracy", + "criteria": [ + Benchmark.Criterion(name="epoch", summary="max", compare="<", margin=0.1), + Benchmark.Criterion(name="val/accuracy", summary="max", compare=">", margin=0.1), + Benchmark.Criterion(name="test/accuracy", summary="max", compare=">", margin=0.1), + Benchmark.Criterion(name="export/accuracy", summary="max", compare=">", margin=0.1), + Benchmark.Criterion(name="optimize/accuracy", summary="max", compare=">", margin=0.1), + ], + }, + { + "type": "efficiency", + "criteria": [ + Benchmark.Criterion(name="train/iter_time", summary="mean", compare="<", margin=0.1), + Benchmark.Criterion(name="test/iter_time", summary="mean", compare="<", margin=0.1), + Benchmark.Criterion(name="export/iter_time", summary="mean", compare="<", margin=0.1), + Benchmark.Criterion(name="optimize/iter_time", summary="mean", compare="<", margin=0.1), + ], + }, + ] + + @pytest.mark.parametrize( + "fxt_model", + MODEL_TEST_CASES, + ids=lambda model: model.name, + indirect=True, + ) + @pytest.mark.parametrize( + "fxt_dataset", + DATASET_TEST_CASES, + ids=lambda dataset: dataset.name, + indirect=True, + ) + @pytest.mark.parametrize( + "fxt_benchmark", + BENCHMARK_TEST_CASES, + ids=lambda benchmark: benchmark["type"], + indirect=True, + ) + def test_perf( + self, + fxt_model: Benchmark.Model, + fxt_dataset: Benchmark.Dataset, + fxt_benchmark: Benchmark, + ): + self._test_perf( + model=fxt_model, + dataset=fxt_dataset, + benchmark=fxt_benchmark, + ) diff --git a/tests/perf/test_detection.py b/tests/perf/test_detection.py new file mode 100644 index 00000000000..83d0dabd5a2 --- /dev/null +++ b/tests/perf/test_detection.py @@ -0,0 +1,126 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +"""OTX object detection perfomance benchmark tests.""" + +from __future__ import 
annotations + +from pathlib import Path + +import pytest + +from .benchmark import Benchmark +from .conftest import PerfTestBase + + +class TestPerfObjectDetection(PerfTestBase): + """Benchmark object detection.""" + + MODEL_TEST_CASES = [ # noqa: RUF012 + Benchmark.Model(task="detection", name="atss_mobilenetv2", category="accuracy"), + Benchmark.Model(task="detection", name="atss_resnext101", category="other"), + Benchmark.Model(task="detection", name="ssd_mobilenetv2", category="balance"), + Benchmark.Model(task="detection", name="yolox_tiny", category="speed"), + Benchmark.Model(task="detection", name="yolox_s", category="other"), + Benchmark.Model(task="detection", name="yolox_l", category="other"), + Benchmark.Model(task="detection", name="yolox_x", category="other"), + ] + + DATASET_TEST_CASES = [ + Benchmark.Dataset( + name=f"pothole_small_{idx}", + path=Path("detection/pothole_small") / f"{idx}", + size="small", + data_format="coco", + num_classes=1, + num_repeat=3, + extra_overrides={ + "deterministic": "True", + "metric": "otx.algo.metrices.fmeasure.FMeasure", + "callback_monitor": "val/f1-score", + "scheduler.monitor": "val/f1-score", + }, + ) + for idx in (1, 2, 3) + ] + [ + Benchmark.Dataset( + name="pothole_medium", + path=Path("detection/pothole_medium"), + size="medium", + data_format="coco", + num_classes=1, + num_repeat=3, + extra_overrides={ + "deterministic": "True", + "metric": "otx.algo.metrices.fmeasure.FMeasure", + "callback_monitor": "val/f1-score", + "scheduler.monitor": "val/f1-score", + }, + ), + Benchmark.Dataset( + name="vitens_large", + path=Path("detection/vitens_large"), + size="large", + data_format="coco", + num_classes=1, + num_repeat=1, + extra_overrides={ + "deterministic": "True", + "metric": "otx.algo.metrices.fmeasure.FMeasure", + "callback_monitor": "val/f1-score", + "scheduler.monitor": "val/f1-score", + }, + ), + ] + + BENCHMARK_TEST_CASES = [ # noqa: RUF012 + { + "type": "accuracy", + "criteria": [ + Benchmark.Criterion(name="epoch", summary="max", compare="<", margin=0.1), + Benchmark.Criterion(name="val/f1-score", summary="max", compare=">", margin=0.1), + Benchmark.Criterion(name="test/f1-score", summary="max", compare=">", margin=0.1), + Benchmark.Criterion(name="export/f1-score", summary="max", compare=">", margin=0.1), + Benchmark.Criterion(name="optimize/f1-score", summary="max", compare=">", margin=0.1), + ], + }, + { + "type": "efficiency", + "criteria": [ + Benchmark.Criterion(name="train/iter_time", summary="mean", compare="<", margin=0.1), + Benchmark.Criterion(name="test/iter_time", summary="mean", compare="<", margin=0.1), + Benchmark.Criterion(name="export/iter_time", summary="mean", compare="<", margin=0.1), + Benchmark.Criterion(name="optimize/iter_time", summary="mean", compare="<", margin=0.1), + ], + }, + ] + + @pytest.mark.parametrize( + "fxt_model", + MODEL_TEST_CASES, + ids=lambda model: model.name, + indirect=True, + ) + @pytest.mark.parametrize( + "fxt_dataset", + DATASET_TEST_CASES, + ids=lambda dataset: dataset.name, + indirect=True, + ) + @pytest.mark.parametrize( + "fxt_benchmark", + BENCHMARK_TEST_CASES, + ids=lambda benchmark: benchmark["type"], + indirect=True, + ) + def test_perf( + self, + fxt_model: Benchmark.Model, + fxt_dataset: Benchmark.Dataset, + fxt_benchmark: Benchmark, + ): + self._test_perf( + model=fxt_model, + dataset=fxt_dataset, + benchmark=fxt_benchmark, + ) diff --git a/tests/perf/test_instance_segmentation.py b/tests/perf/test_instance_segmentation.py new file mode 100644 index 
00000000000..c2fb2e36252 --- /dev/null +++ b/tests/perf/test_instance_segmentation.py @@ -0,0 +1,217 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +"""OTX instance segmentation perfomance benchmark tests.""" + +from __future__ import annotations + +from pathlib import Path + +import pytest + +from .benchmark import Benchmark +from .conftest import PerfTestBase + + +class TestPerfInstanceSegmentation(PerfTestBase): + """Benchmark instance segmentation.""" + + MODEL_TEST_CASES = [ # noqa: RUF012 + Benchmark.Model(task="instance_segmentation", name="maskrcnn_efficientnetb2b", category="speed"), + Benchmark.Model(task="instance_segmentation", name="maskrcnn_r50", category="accuracy"), + Benchmark.Model(task="instance_segmentation", name="maskrcnn_swint", category="other"), + ] + + DATASET_TEST_CASES = [ + Benchmark.Dataset( + name=f"wgisd_small_{idx}", + path=Path("instance_seg/wgisd_small") / f"{idx}", + size="small", + data_format="coco", + num_classes=5, + num_repeat=3, + extra_overrides={ + "deterministic": "True", + "metric": "otx.algo.metrices.fmeasure.FMeasure", + "callback_monitor": "val/f1-score", + "scheduler.monitor": "val/f1-score", + }, + ) + for idx in (1, 2, 3) + ] + [ + Benchmark.Dataset( + name="coco_car_person_medium", + path=Path("instance_seg/coco_car_person_medium"), + size="medium", + data_format="coco", + num_classes=2, + num_repeat=3, + extra_overrides={ + "deterministic": "True", + "metric": "otx.algo.metrices.fmeasure.FMeasure", + "callback_monitor": "val/f1-score", + "scheduler.monitor": "val/f1-score", + }, + ), + Benchmark.Dataset( + name="vitens_coliform", + path=Path("instance_seg/Vitens-Coliform-coco"), + size="large", + data_format="coco", + num_classes=1, + num_repeat=1, + extra_overrides={ + "deterministic": "True", + "metric": "otx.algo.metrices.fmeasure.FMeasure", + "callback_monitor": "val/f1-score", + "scheduler.monitor": "val/f1-score", + }, + ), + ] + + BENCHMARK_TEST_CASES = [ # noqa: RUF012 + { + "type": "accuracy", + "criteria": [ + Benchmark.Criterion(name="epoch", summary="max", compare="<", margin=0.1), + Benchmark.Criterion(name="val/f1-score", summary="max", compare=">", margin=0.1), + Benchmark.Criterion(name="test/f1-score", summary="max", compare=">", margin=0.1), + Benchmark.Criterion(name="export/f1-score", summary="max", compare=">", margin=0.1), + Benchmark.Criterion(name="optimize/f1-score", summary="max", compare=">", margin=0.1), + ], + }, + { + "type": "efficiency", + "criteria": [ + Benchmark.Criterion(name="train/iter_time", summary="mean", compare="<", margin=0.1), + Benchmark.Criterion(name="test/iter_time", summary="mean", compare="<", margin=0.1), + Benchmark.Criterion(name="export/iter_time", summary="mean", compare="<", margin=0.1), + Benchmark.Criterion(name="optimize/iter_time", summary="mean", compare="<", margin=0.1), + ], + }, + ] + + @pytest.mark.parametrize( + "fxt_model", + MODEL_TEST_CASES, + ids=lambda model: model.name, + indirect=True, + ) + @pytest.mark.parametrize( + "fxt_dataset", + DATASET_TEST_CASES, + ids=lambda dataset: dataset.name, + indirect=True, + ) + @pytest.mark.parametrize( + "fxt_benchmark", + BENCHMARK_TEST_CASES, + ids=lambda benchmark: benchmark["type"], + indirect=True, + ) + def test_perf( + self, + fxt_model: Benchmark.Model, + fxt_dataset: Benchmark.Dataset, + fxt_benchmark: Benchmark, + ): + self._test_perf( + model=fxt_model, + dataset=fxt_dataset, + benchmark=fxt_benchmark, + ) + + +class TestPerfTilingInstanceSegmentation(PerfTestBase): + """Benchmark 
tiling instance segmentation.""" + + MODEL_TEST_CASES = [ # noqa: RUF012 + Benchmark.Model(task="instance_segmentation", name="maskrcnn_efficientnetb2b_tile", category="speed"), + Benchmark.Model(task="instance_segmentation", name="maskrcnn_r50_tile", category="accuracy"), + ] + + DATASET_TEST_CASES = [ + Benchmark.Dataset( + name=f"vitens_aeromonas_small_{idx}", + path=Path("tiling_instance_seg/vitens_aeromonas_small") / f"{idx}", + size="small", + data_format="coco", + num_classes=1, + num_repeat=3, + extra_overrides={ + "deterministic": "True", + "metric": "otx.algo.metrices.fmeasure.FMeasure", + "callback_monitor": "val/f1-score", + "scheduler.monitor": "val/f1-score", + }, + ) + for idx in (1, 2, 3) + ] + [ + Benchmark.Dataset( + name="vitens_aeromonas_medium", + path=Path("tiling_instance_seg/vitens_aeromonas_medium"), + size="medium", + data_format="coco", + num_classes=1, + num_repeat=3, + extra_overrides={ + "deterministic": "True", + "metric": "otx.algo.metrices.fmeasure.FMeasure", + "callback_monitor": "val/f1-score", + "scheduler.monitor": "val/f1-score", + }, + ), + # Add large dataset + ] + + BENCHMARK_TEST_CASES = [ # noqa: RUF012 + { + "type": "accuracy", + "criteria": [ + Benchmark.Criterion(name="epoch", summary="max", compare="<", margin=0.1), + Benchmark.Criterion(name="val/f1-score", summary="max", compare=">", margin=0.1), + Benchmark.Criterion(name="test/f1-score", summary="max", compare=">", margin=0.1), + Benchmark.Criterion(name="export/f1-score", summary="max", compare=">", margin=0.1), + Benchmark.Criterion(name="optimize/f1-score", summary="max", compare=">", margin=0.1), + ], + }, + { + "type": "efficiency", + "criteria": [ + Benchmark.Criterion(name="train/iter_time", summary="mean", compare="<", margin=0.1), + Benchmark.Criterion(name="test/iter_time", summary="mean", compare="<", margin=0.1), + Benchmark.Criterion(name="export/iter_time", summary="mean", compare="<", margin=0.1), + Benchmark.Criterion(name="optimize/iter_time", summary="mean", compare="<", margin=0.1), + ], + }, + ] + + @pytest.mark.parametrize( + "fxt_model", + MODEL_TEST_CASES, + ids=lambda model: model.name, + indirect=True, + ) + @pytest.mark.parametrize( + "fxt_dataset", + DATASET_TEST_CASES, + ids=lambda dataset: dataset.name, + indirect=True, + ) + @pytest.mark.parametrize( + "fxt_benchmark", + BENCHMARK_TEST_CASES, + ids=lambda benchmark: benchmark["type"], + indirect=True, + ) + def test_perf( + self, + fxt_model: Benchmark.Model, + fxt_dataset: Benchmark.Dataset, + fxt_benchmark: Benchmark, + ): + self._test_perf( + model=fxt_model, + dataset=fxt_dataset, + benchmark=fxt_benchmark, + ) diff --git a/tests/perf/test_semantic_segmentation.py b/tests/perf/test_semantic_segmentation.py new file mode 100644 index 00000000000..dd88252b490 --- /dev/null +++ b/tests/perf/test_semantic_segmentation.py @@ -0,0 +1,111 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +"""OTX semantic segmentation perfomance benchmark tests.""" + +from __future__ import annotations + +from pathlib import Path + +import pytest + +from .benchmark import Benchmark +from .conftest import PerfTestBase + + +class TestPerfSemanticSegmentation(PerfTestBase): + """Benchmark semantic segmentation.""" + + MODEL_TEST_CASES = [ # noqa: RUF012 + Benchmark.Model(task="semantic_segmentation", name="litehrnet_18", category="balance"), + Benchmark.Model(task="semantic_segmentation", name="litehrnet_s", category="speed"), + Benchmark.Model(task="semantic_segmentation", name="litehrnet_x", 
category="accuracy"), + Benchmark.Model(task="semantic_segmentation", name="segnext_b", category="other"), + Benchmark.Model(task="semantic_segmentation", name="segnext_s", category="other"), + Benchmark.Model(task="semantic_segmentation", name="segnext_t", category="other"), + Benchmark.Model(task="semantic_segmentation", name="dino_v2", category="other"), + ] + + DATASET_TEST_CASES = [ + Benchmark.Dataset( + name=f"kvasir_small_{idx}", + path=Path("semantic_seg/kvasir_small") / f"{idx}", + size="small", + data_format="common_semantic_segmentation_with_subset_dirs", + num_classes=2, + num_repeat=3, + extra_overrides={}, + ) + for idx in (1, 2, 3) + ] + [ + Benchmark.Dataset( + name="kvasir_medium", + path=Path("semantic_seg/kvasir_medium"), + size="medium", + data_format="common_semantic_segmentation_with_subset_dirs", + num_classes=2, + num_repeat=3, + extra_overrides={}, + ), + Benchmark.Dataset( + name="kvasir_large", + path=Path("semantic_seg/kvasir_large"), + size="large", + data_format="common_semantic_segmentation_with_subset_dirs", + num_classes=2, + num_repeat=1, + extra_overrides={}, + ), + ] + + BENCHMARK_TEST_CASES = [ # noqa: RUF012 + { + "type": "accuracy", + "criteria": [ + Benchmark.Criterion(name="epoch", summary="max", compare="<", margin=0.1), + Benchmark.Criterion(name="val/Dice", summary="max", compare=">", margin=0.1), + Benchmark.Criterion(name="test/Dice", summary="max", compare=">", margin=0.1), + Benchmark.Criterion(name="export/Dice", summary="max", compare=">", margin=0.1), + Benchmark.Criterion(name="optimize/Dice", summary="max", compare=">", margin=0.1), + ], + }, + { + "type": "efficiency", + "criteria": [ + Benchmark.Criterion(name="train/iter_time", summary="mean", compare="<", margin=0.1), + Benchmark.Criterion(name="test/iter_time", summary="mean", compare="<", margin=0.1), + Benchmark.Criterion(name="export/iter_time", summary="mean", compare="<", margin=0.1), + Benchmark.Criterion(name="optimize/iter_time", summary="mean", compare="<", margin=0.1), + ], + }, + ] + + @pytest.mark.parametrize( + "fxt_model", + MODEL_TEST_CASES, + ids=lambda model: model.name, + indirect=True, + ) + @pytest.mark.parametrize( + "fxt_dataset", + DATASET_TEST_CASES, + ids=lambda dataset: dataset.name, + indirect=True, + ) + @pytest.mark.parametrize( + "fxt_benchmark", + BENCHMARK_TEST_CASES, + ids=lambda benchmark: benchmark["type"], + indirect=True, + ) + def test_perf( + self, + fxt_model: Benchmark.Model, + fxt_dataset: Benchmark.Dataset, + fxt_benchmark: Benchmark, + ): + self._test_perf( + model=fxt_model, + dataset=fxt_dataset, + benchmark=fxt_benchmark, + ) diff --git a/tests/perf/test_visual_prompting.py b/tests/perf/test_visual_prompting.py new file mode 100644 index 00000000000..c892ae7fdc2 --- /dev/null +++ b/tests/perf/test_visual_prompting.py @@ -0,0 +1,178 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +"""OTX visual prompting perfomance benchmark tests.""" + +from __future__ import annotations + +from pathlib import Path + +import pytest + +from .benchmark import Benchmark +from .conftest import PerfTestBase + + +class TestPerfVisualPrompting(PerfTestBase): + """Benchmark visual prompting.""" + + MODEL_TEST_CASES = [ # noqa: RUF012 + Benchmark.Model(task="visual_prompting", name="sam_tiny_vit", category="other"), + ] + + DATASET_TEST_CASES = [ + Benchmark.Dataset( + name=f"wgisd_small_{idx}", + path=Path("visual_prompting/wgisd_small") / f"{idx}", + size="small", + data_format="coco", + num_classes=5, + num_repeat=3, + 
extra_overrides={}, + ) + for idx in (1, 2, 3) + ] + [ + Benchmark.Dataset( + name="coco_car_person_medium", + path=Path("visual_prompting/coco_car_person_medium"), + size="medium", + data_format="coco", + num_classes=2, + num_repeat=3, + extra_overrides={}, + ), + Benchmark.Dataset( + name="vitens_coliform", + path=Path("visual_prompting/Vitens-Coliform-coco"), + size="large", + data_format="coco", + num_classes=1, + num_repeat=1, + extra_overrides={}, + ), + ] + + BENCHMARK_TEST_CASES = [ # noqa: RUF012 + { + "type": "accuracy", + "criteria": [ + Benchmark.Criterion(name="epoch", summary="max", compare="<", margin=0.1), + Benchmark.Criterion(name="val/Dice", summary="max", compare=">", margin=0.1), + Benchmark.Criterion(name="test/Dice", summary="max", compare=">", margin=0.1), + Benchmark.Criterion(name="export/Dice", summary="max", compare=">", margin=0.1), + Benchmark.Criterion(name="optimize/Dice", summary="max", compare=">", margin=0.1), + ], + }, + { + "type": "efficiency", + "criteria": [ + Benchmark.Criterion(name="train/iter_time", summary="mean", compare="<", margin=0.1), + Benchmark.Criterion(name="test/iter_time", summary="mean", compare="<", margin=0.1), + Benchmark.Criterion(name="export/iter_time", summary="mean", compare="<", margin=0.1), + Benchmark.Criterion(name="optimize/iter_time", summary="mean", compare="<", margin=0.1), + ], + }, + ] + + @pytest.mark.parametrize( + "fxt_model", + MODEL_TEST_CASES, + ids=lambda model: model.name, + indirect=True, + ) + @pytest.mark.parametrize( + "fxt_dataset", + DATASET_TEST_CASES, + ids=lambda dataset: dataset.name, + indirect=True, + ) + @pytest.mark.parametrize( + "fxt_benchmark", + BENCHMARK_TEST_CASES, + ids=lambda benchmark: benchmark["type"], + indirect=True, + ) + def test_perf( + self, + fxt_model: Benchmark.Model, + fxt_dataset: Benchmark.Dataset, + fxt_benchmark: Benchmark, + ): + self._test_perf( + model=fxt_model, + dataset=fxt_dataset, + benchmark=fxt_benchmark, + ) + + +class TestPerfZeroShotVisualPrompting(PerfTestBase): + """Benchmark zero-shot visual prompting.""" + + MODEL_TEST_CASES = [ # noqa: RUF012 + Benchmark.Model(task="zero_shot_visual_prompting", name="sam_tiny_vit", category="other"), + Benchmark.Model(task="zero_shot_visual_prompting", name="sam_vit_b", category="other"), + ] + + DATASET_TEST_CASES = [ # noqa: RUF012 + Benchmark.Dataset( + name="coco_car_person_medium_datumaro", + path=Path("zero_shot_visual_prompting/coco_car_person_medium_datumaro"), + size="medium", + data_format="datumaro", + num_classes=2, + num_repeat=3, + extra_overrides={"max_epochs": "1"}, + ), + ] + + BENCHMARK_TEST_CASES = [ # noqa: RUF012 + { + "type": "accuracy", + "criteria": [ + Benchmark.Criterion(name="epoch", summary="max", compare="<", margin=0.1), + Benchmark.Criterion(name="val/Dice", summary="max", compare=">", margin=0.1), + Benchmark.Criterion(name="test/Dice", summary="max", compare=">", margin=0.1), + Benchmark.Criterion(name="export/Dice", summary="max", compare=">", margin=0.1), + Benchmark.Criterion(name="optimize/Dice", summary="max", compare=">", margin=0.1), + ], + }, + { + "type": "efficiency", + "criteria": [ + Benchmark.Criterion(name="train/iter_time", summary="mean", compare="<", margin=0.1), + Benchmark.Criterion(name="test/iter_time", summary="mean", compare="<", margin=0.1), + Benchmark.Criterion(name="export/iter_time", summary="mean", compare="<", margin=0.1), + Benchmark.Criterion(name="optimize/iter_time", summary="mean", compare="<", margin=0.1), + ], + }, + ] + + @pytest.mark.parametrize( + 
"fxt_model", + MODEL_TEST_CASES, + ids=lambda model: model.name, + indirect=True, + ) + @pytest.mark.parametrize( + "fxt_dataset", + DATASET_TEST_CASES, + ids=lambda dataset: dataset.name, + indirect=True, + ) + @pytest.mark.parametrize( + "fxt_benchmark", + BENCHMARK_TEST_CASES, + ids=lambda benchmark: benchmark["type"], + indirect=True, + ) + def test_perf( + self, + fxt_model: Benchmark.Model, + fxt_dataset: Benchmark.Dataset, + fxt_benchmark: Benchmark, + ): + self._test_perf( + model=fxt_model, + dataset=fxt_dataset, + benchmark=fxt_benchmark, + ) diff --git a/tox.ini b/tox.ini index 20456491fbc..69169b06b4e 100644 --- a/tox.ini +++ b/tox.ini @@ -63,7 +63,7 @@ commands_pre = commands = python -m pytest tests/integration -ra --showlocals --csv={toxworkdir}/{envname}.csv --task {[testenv]task} --open-subprocess {posargs} -[testenv:performance-test] +[testenv:perf-benchmark] deps = .[dev] commands_pre = @@ -72,7 +72,7 @@ commands_pre = ; temporary as Anomalib v1 is not available on PyPI pip install git+https://github.com/openvinotoolkit/anomalib.git@cbb623e33876e446b7788375cc355e3a3dd44cef commands = - pytest tests/regression -ra --showlocals --csv={toxworkdir}/{envname}.csv {posargs} + pytest -ra --showlocals --csv={toxworkdir}/{envname}.csv {posargs:tests/perf} [testenv:build-doc]