From 099635914bf27638661907dac37f62985166a5e5 Mon Sep 17 00:00:00 2001
From: Songki Choi
Date: Thu, 11 Apr 2024 11:00:27 +0900
Subject: [PATCH] Add perf benchmark test cases for action and visual prompting v1 (#3292)

* Run command w/ subprocess.run() for better stability
* Collect raw data to get seed info
* Fix model-category default to all
* Add action perf test cases
* Add visual prompting perf test cases
* Fix pre-commit
---
 .github/workflows/perf_benchmark.yaml |   4 +-
 tests/perf/benchmark.py               |   8 +-
 tests/perf/test_action.py             | 203 ++++++++++++++++++++++++++
 tests/perf/test_visual_prompting.py   | 186 +++++++++++++++++++++++
 tools/experiment.py                   |   6 +-
 5 files changed, 400 insertions(+), 7 deletions(-)
 create mode 100644 tests/perf/test_action.py

diff --git a/.github/workflows/perf_benchmark.yaml b/.github/workflows/perf_benchmark.yaml
index 6fbc3981be1..37065151fdf 100644
--- a/.github/workflows/perf_benchmark.yaml
+++ b/.github/workflows/perf_benchmark.yaml
@@ -9,7 +9,7 @@ on:
         options:
           - default # speed, balance, accuracy models only
           - all # default + other models
-        default: default
+        default: all
       data-group:
         type: choice
         description: Data group to run benchmark
@@ -98,6 +98,8 @@ jobs:
       fail-fast: false
       matrix:
         include:
+          - task-short: "act"
+            task: "action"
           - task-short: "ano"
             task: "anomaly"
           - task-short: "cls"
diff --git a/tests/perf/benchmark.py b/tests/perf/benchmark.py
index f0fa4d28477..5daf5e1e4d2 100644
--- a/tests/perf/benchmark.py
+++ b/tests/perf/benchmark.py
@@ -121,7 +121,7 @@ def load_result(result_path: str) -> pd.DataFrame | None:
         """
         # Search csv files
         if os.path.isdir(result_path):
-            csv_file_paths = glob.glob(f"{result_path}/**/exp_summary.csv", recursive=True)
+            csv_file_paths = glob.glob(f"{result_path}/**/all_exp_result.csv", recursive=True)
         else:
             csv_file_paths = [result_path]
         results = []
@@ -142,7 +142,9 @@
 
         # Merge experiments
         data = pd.concat(results, ignore_index=True)
-        data["train_e2e_time"] = pd.to_timedelta(data["train_e2e_time"]).dt.total_seconds()  # H:M:S str -> seconds
+        if "train_e2e_time" in data:
+            data["train_e2e_time"] = pd.to_timedelta(data["train_e2e_time"]).dt.total_seconds()  # H:M:S str -> seconds
+        data = data.rename(columns={"repeat": "seed"})
         return data.set_index(["task", "model", "data_group", "data"])
 
     @staticmethod
@@ -231,6 +233,8 @@ def _set_num_epoch(model_id: str, train_params: dict, num_epoch: int):
             return  # No configurable parameter for num_epoch
         elif "stfpm" in model_id:
             train_params["learning_parameters.max_epochs"] = num_epoch
+        elif "SAM" in model_id:
+            train_params["learning_parameters.trainer.max_epochs"] = num_epoch
         else:
             train_params["learning_parameters.num_iters"] = num_epoch
 
diff --git a/tests/perf/test_action.py b/tests/perf/test_action.py
new file mode 100644
index 00000000000..387bf5515c4
--- /dev/null
+++ b/tests/perf/test_action.py
@@ -0,0 +1,203 @@
+"""OTX Action performance tests."""
+
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+
+import pytest
+
+from otx.cli.registry import Registry
+from typing import Callable
+from .benchmark import Benchmark
+
+
+class TestPerfActionClassification:
+    """Benchmark action classification."""
+
+    MODEL_TEMPLATES = Registry(f"src/otx/algorithms").filter(task_type="ACTION_CLASSIFICATION").templates
+    MODEL_IDS = [template.model_template_id for template in MODEL_TEMPLATES]
+
+    BENCHMARK_CONFIGS = {
+        "small": {
+            "tags": {
+                "task": "action_classification",
+            },
+            "datasets": [
+                "action/action_classification/ucf_cvat_5percent",
+            ],
+            "num_repeat": 5,
+            "num_epoch": 10,
+        },
+        "medium": {
+            "tags": {
+                "task": "action_classification",
+            },
+            "datasets": [
+                "action/action_classification/ucf_cvat_30percent",
+            ],
+            "num_repeat": 5,
+            "num_epoch": 10,
+        },
+        "large": {
+            "tags": {
+                "task": "action_classification",
+            },
+            "datasets": [
+                "action/action_classification/ucf_cvat",
+            ],
+            "num_repeat": 5,
+            "num_epoch": 3,
+        },
+    }
+
+    @pytest.mark.parametrize("fxt_model_id", MODEL_TEMPLATES, ids=MODEL_IDS, indirect=True)
+    @pytest.mark.parametrize("fxt_benchmark", BENCHMARK_CONFIGS.items(), ids=BENCHMARK_CONFIGS.keys(), indirect=True)
+    def test_perf(self, fxt_model_id: str, fxt_benchmark: Benchmark):
+        """Benchmark performance metrics."""
+        result = fxt_benchmark.run(model_id=fxt_model_id)
+        fxt_benchmark.check(
+            result,
+            criteria=[
+                {
+                    "name": "Accuracy(train)",
+                    "op": ">",
+                    "margin": 0.1,
+                },
+                {
+                    "name": "Accuracy(export)",
+                    "op": ">",
+                    "margin": 0.1,
+                },
+                {
+                    "name": "Accuracy(optimize)",
+                    "op": ">",
+                    "margin": 0.1,
+                },
+                {
+                    "name": "epoch",
+                    "op": "<",
+                    "margin": 0.1,
+                },
+                {
+                    "name": "train_e2e_time",
+                    "op": "<",
+                    "margin": 0.1,
+                },
+                {
+                    "name": "avg_data_time",
+                    "op": "<",
+                    "margin": 0.1,
+                },
+                {
+                    "name": "avg_iter_time",
+                    "op": "<",
+                    "margin": 0.1,
+                },
+                {
+                    "name": "avg_time_per_image(export)",
+                    "op": "<",
+                    "margin": 0.1,
+                },
+                {
+                    "name": "avg_time_per_image(optimize)",
+                    "op": "<",
+                    "margin": 0.1,
+                },
+            ],
+        )
+
+
+class TestPerfActionDetection:
+    """Benchmark action detection."""
+
+    MODEL_TEMPLATES = Registry(f"src/otx/algorithms").filter(task_type="ACTION_DETECTION").templates
+    MODEL_IDS = [template.model_template_id for template in MODEL_TEMPLATES]
+
+    BENCHMARK_CONFIGS = {
+        "small": {
+            "tags": {
+                "task": "action_detection",
+            },
+            "datasets": [
+                "action/action_detection/UCF101_cvat_5percent",
+            ],
+            "num_repeat": 5,
+            "num_epoch": 3,
+        },
+        "medium": {
+            "tags": {
+                "task": "action_detection",
+            },
+            "datasets": [
+                "action/action_detection/UCF101_cvat_30percent",
+            ],
+            "num_repeat": 5,
+            "num_epoch": 3,
+        },
+        "large": {
+            "tags": {
+                "task": "action_detection",
+            },
+            "datasets": [
+                "action/action_detection/UCF101_cvat",
+            ],
+            "num_repeat": 5,
+            "num_epoch": 1,
+        },
+    }
+
+    @pytest.mark.parametrize("fxt_model_id", MODEL_TEMPLATES, ids=MODEL_IDS, indirect=True)
+    @pytest.mark.parametrize("fxt_benchmark", BENCHMARK_CONFIGS.items(), ids=BENCHMARK_CONFIGS.keys(), indirect=True)
+    def test_perf(self, fxt_model_id: str, fxt_benchmark: Benchmark):
+        """Benchmark performance metrics."""
+        result = fxt_benchmark.run(model_id=fxt_model_id)
+        fxt_benchmark.check(
+            result,
+            criteria=[
+                {
+                    "name": "f-measure(train)",
+                    "op": ">",
+                    "margin": 0.1,
+                },
+                {
+                    "name": "epoch",
+                    "op": "<",
+                    "margin": 0.1,
+                },
+                {
+                    "name": "f-measure(export)",
+                    "op": ">",
+                    "margin": 0.1,
+                },
+                {
+                    "name": "f-measure(optimize)",
+                    "op": ">",
+                    "margin": 0.1,
+                },
+                {
+                    "name": "train_e2e_time",
+                    "op": "<",
+                    "margin": 0.1,
+                },
+                {
+                    "name": "avg_data_time",
+                    "op": "<",
+                    "margin": 0.1,
+                },
+                {
+                    "name": "avg_iter_time",
+                    "op": "<",
+                    "margin": 0.1,
+                },
+                {
+                    "name": "avg_time_per_image(export)",
+                    "op": "<",
+                    "margin": 0.1,
+                },
+                {
+                    "name": "avg_time_per_image(optimize)",
+                    "op": "<",
+                    "margin": 0.1,
+                },
+            ],
+        )
diff --git a/tests/perf/test_visual_prompting.py b/tests/perf/test_visual_prompting.py
index 5d59f7ba09c..63377b52c35 100644
--- a/tests/perf/test_visual_prompting.py
+++ b/tests/perf/test_visual_prompting.py
@@ -2,3 +2,189 @@
 
 # Copyright (C) 2024 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
+
+
+import pytest
+
+from otx.cli.registry import Registry
+from typing import Callable
+from .benchmark import Benchmark
+
+
+class TestPerfVisualPrompting:
+    """Benchmark visual prompting."""
+
+    MODEL_TEMPLATES = [
+        template
+        for template in Registry("src/otx/algorithms/visual_prompting").filter(task_type="VISUAL_PROMPTING").templates
+        if "Zero_Shot" not in template.name
+    ]
+    MODEL_IDS = [template.model_template_id for template in MODEL_TEMPLATES]
+
+    BENCHMARK_CONFIGS = {
+        "small": {
+            "tags": {
+                "task": "visual_prompting",
+            },
+            "datasets": [
+                "visual_prompting/wgisd_small/1",
+                "visual_prompting/wgisd_small/2",
+                "visual_prompting/wgisd_small/3",
+            ],
+            "num_repeat": 5,
+        },
+        "medium": {
+            "tags": {
+                "task": "visual_prompting",
+            },
+            "datasets": [
+                "visual_prompting/coco_car_person_medium",
+            ],
+            "num_repeat": 5,
+        },
+        "large": {
+            "tags": {
+                "task": "visual_prompting",
+            },
+            "datasets": [
+                "visual_prompting/Vitens-Coliform-coco",
+            ],
+            "num_repeat": 5,
+        },
+    }
+
+    @pytest.mark.parametrize("fxt_model_id", MODEL_TEMPLATES, ids=MODEL_IDS, indirect=True)
+    @pytest.mark.parametrize("fxt_benchmark", BENCHMARK_CONFIGS.items(), ids=BENCHMARK_CONFIGS.keys(), indirect=True)
+    def test_perf(self, fxt_model_id: str, fxt_benchmark: Benchmark):
+        """Benchmark performance metrics."""
+        result = fxt_benchmark.run(model_id=fxt_model_id)
+        fxt_benchmark.check(
+            result,
+            criteria=[
+                {
+                    "name": "Dice Average(train)",
+                    "op": ">",
+                    "margin": 0.1,
+                },
+                {
+                    "name": "Dice Average(export)",
+                    "op": ">",
+                    "margin": 0.1,
+                },
+                {
+                    "name": "Dice Average(optimize)",
+                    "op": ">",
+                    "margin": 0.1,
+                },
+                {
+                    "name": "epoch",
+                    "op": "<",
+                    "margin": 0.1,
+                },
+                {
+                    "name": "train_e2e_time",
+                    "op": "<",
+                    "margin": 0.1,
+                },
+                {
+                    "name": "avg_data_time",
+                    "op": "<",
+                    "margin": 0.1,
+                },
+                {
+                    "name": "avg_iter_time",
+                    "op": "<",
+                    "margin": 0.1,
+                },
+                {
+                    "name": "avg_time_per_image(export)",
+                    "op": "<",
+                    "margin": 0.1,
+                },
+                {
+                    "name": "avg_time_per_image(optimize)",
+                    "op": "<",
+                    "margin": 0.1,
+                },
+            ],
+        )
+
+
+class TestPerfZeroShotVisualPrompting:
+    """Benchmark zero-shot visual prompting."""
+
+    MODEL_TEMPLATES = [
+        template
+        for template in Registry("src/otx/algorithms/visual_prompting").filter(task_type="VISUAL_PROMPTING").templates
+        if "Zero_Shot" in template.name
+    ]
+    MODEL_IDS = [template.model_template_id for template in MODEL_TEMPLATES]
+
+    BENCHMARK_CONFIGS = {
+        "medium": {
+            "tags": {
+                "task": "zero_shot_visual_prompting",
+            },
+            "datasets": [
+                "zero_shot_visual_prompting/coco_car_person_medium",
+            ],
+            "num_repeat": 5,
+            "num_epoch": 1,
+        },
+    }
+
+    @pytest.mark.parametrize("fxt_model_id", MODEL_TEMPLATES, ids=MODEL_IDS, indirect=True)
+    @pytest.mark.parametrize("fxt_benchmark", BENCHMARK_CONFIGS.items(), ids=BENCHMARK_CONFIGS.keys(), indirect=True)
+    def test_perf(self, fxt_model_id: str, fxt_benchmark: Benchmark):
+        """Benchmark performance metrics."""
+        result = fxt_benchmark.run(model_id=fxt_model_id)
+        fxt_benchmark.check(
+            result,
+            criteria=[
+                {
+                    "name": "Dice Average(train)",
+                    "op": ">",
+                    "margin": 0.1,
+                },
+                {
+                    "name": "epoch",
+                    "op": "<",
+                    "margin": 0.1,
+                },
+                {
+                    "name": "Dice Average(export)",
+                    "op": ">",
+                    "margin": 0.1,
+                },
+                {
+                    "name": "Dice Average(optimize)",
+                    "op": ">",
+                    "margin": 0.1,
+                },
+                {
+                    "name": "train_e2e_time",
+                    "op": "<",
+                    "margin": 0.1,
+                },
+                {
+                    "name": "avg_data_time",
+                    "op": "<",
+                    "margin": 0.1,
+                },
+                {
+                    "name": "avg_iter_time",
+                    "op": "<",
+                    "margin": 0.1,
+                },
+                {
+                    "name": "avg_time_per_image(export)",
+                    "op": "<",
+                    "margin": 0.1,
+                },
+                {
+                    "name": "avg_time_per_image(optimize)",
+                    "op": "<",
+                    "margin": 0.1,
+                },
+            ],
+        )
diff --git a/tools/experiment.py b/tools/experiment.py
index 6a79aae7537..d3ce63d3288 100644
--- a/tools/experiment.py
+++ b/tools/experiment.py
@@ -12,7 +12,7 @@
 import re
 import shutil
 import statistics
-import sys
+import subprocess
 from abc import ABC, abstractmethod
 from copy import copy, deepcopy
 from dataclasses import dataclass, field
@@ -22,7 +22,6 @@
 from typing import Any, Dict, List, Optional, Tuple, Union
 
 import yaml
-from otx.cli.tools.cli import main as otx_cli
 from rich.console import Console
 from rich.table import Table
 
@@ -919,9 +918,8 @@ def _prepare_run_command(self, command: List[str]) -> bool:
         return True
 
     def _run_otx_command(self, command: List[str]):
-        sys.argv = copy(command)
         try:
-            otx_cli()
+            subprocess.run(command, check=True)
         except Exception as e:
             self._fail_logs.append(CommandFailInfo(variable=self._command_var, exception=e, command=" ".join(command)))
 