From 099635914bf27638661907dac37f62985166a5e5 Mon Sep 17 00:00:00 2001
From: Songki Choi
Date: Thu, 11 Apr 2024 11:00:27 +0900
Subject: [PATCH] Add perf benchmark test cases for action and visual prompting v1 (#3292)

* Run command w/ subprocess.run() for better stability
* Collect raw data to get seed info
* Fix model-category default to all
* Add action perf test cases
* Add visual prompting perf test cases
* Fix pre-commit
---
 .github/workflows/perf_benchmark.yaml |   4 +-
 tests/perf/benchmark.py               |   8 +-
 tests/perf/test_action.py             | 203 ++++++++++++++++++++++++++
 tests/perf/test_visual_prompting.py   | 186 +++++++++++++++++++++++
 tools/experiment.py                   |   6 +-
 5 files changed, 400 insertions(+), 7 deletions(-)
 create mode 100644 tests/perf/test_action.py

diff --git a/.github/workflows/perf_benchmark.yaml b/.github/workflows/perf_benchmark.yaml
index 6fbc3981be1..37065151fdf 100644
--- a/.github/workflows/perf_benchmark.yaml
+++ b/.github/workflows/perf_benchmark.yaml
@@ -9,7 +9,7 @@ on:
         options:
           - default # speed, balance, accuracy models only
           - all # default + other models
-        default: default
+        default: all
       data-group:
         type: choice
         description: Data group to run benchmark
@@ -98,6 +98,8 @@ jobs:
       fail-fast: false
       matrix:
         include:
+          - task-short: "act"
+            task: "action"
           - task-short: "ano"
             task: "anomaly"
           - task-short: "cls"
diff --git a/tests/perf/benchmark.py b/tests/perf/benchmark.py
index f0fa4d28477..5daf5e1e4d2 100644
--- a/tests/perf/benchmark.py
+++ b/tests/perf/benchmark.py
@@ -121,7 +121,7 @@ def load_result(result_path: str) -> pd.DataFrame | None:
         """
         # Search csv files
         if os.path.isdir(result_path):
-            csv_file_paths = glob.glob(f"{result_path}/**/exp_summary.csv", recursive=True)
+            csv_file_paths = glob.glob(f"{result_path}/**/all_exp_result.csv", recursive=True)
         else:
             csv_file_paths = [result_path]
         results = []
@@ -142,7 +142,9 @@
 
         # Merge experiments
         data = pd.concat(results, ignore_index=True)
-        data["train_e2e_time"] = pd.to_timedelta(data["train_e2e_time"]).dt.total_seconds()  # H:M:S str -> seconds
+        if "train_e2e_time" in data:
+            data["train_e2e_time"] = pd.to_timedelta(data["train_e2e_time"]).dt.total_seconds()  # H:M:S str -> seconds
+        data = data.rename(columns={"repeat": "seed"})
         return data.set_index(["task", "model", "data_group", "data"])
 
     @staticmethod
@@ -231,6 +233,8 @@ def _set_num_epoch(model_id: str, train_params: dict, num_epoch: int):
             return  # No configurable parameter for num_epoch
         elif "stfpm" in model_id:
             train_params["learning_parameters.max_epochs"] = num_epoch
+        elif "SAM" in model_id:
+            train_params["learning_parameters.trainer.max_epochs"] = num_epoch
         else:
             train_params["learning_parameters.num_iters"] = num_epoch
 
diff --git a/tests/perf/test_action.py b/tests/perf/test_action.py
new file mode 100644
index 00000000000..387bf5515c4
--- /dev/null
+++ b/tests/perf/test_action.py
@@ -0,0 +1,203 @@
+"""OTX Action performance tests."""
+
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+
+import pytest
+
+from otx.cli.registry import Registry
+from typing import Callable
+from .benchmark import Benchmark
+
+
+class TestPerfActionClassification:
+    """Benchmark action classification."""
+
+    MODEL_TEMPLATES = Registry(f"src/otx/algorithms").filter(task_type="ACTION_CLASSIFICATION").templates
+    MODEL_IDS = [template.model_template_id for template in MODEL_TEMPLATES]
+
+    BENCHMARK_CONFIGS = {
+        "small": {
+            "tags": {
+                "task": "action_classification",
+            },
+            "datasets": [
+                "action/action_classification/ucf_cvat_5percent",
+            ],
+            "num_repeat": 5,
+            "num_epoch": 10,
+        },
+        "medium": {
+            "tags": {
+                "task": "action_classification",
+            },
+            "datasets": [
+                "action/action_classification/ucf_cvat_30percent",
+            ],
+            "num_repeat": 5,
+            "num_epoch": 10,
+        },
+        "large": {
+            "tags": {
+                "task": "action_classification",
+            },
+            "datasets": [
+                "action/action_classification/ucf_cvat",
+            ],
+            "num_repeat": 5,
+            "num_epoch": 3,
+        },
+    }
+
+    @pytest.mark.parametrize("fxt_model_id", MODEL_TEMPLATES, ids=MODEL_IDS, indirect=True)
+    @pytest.mark.parametrize("fxt_benchmark", BENCHMARK_CONFIGS.items(), ids=BENCHMARK_CONFIGS.keys(), indirect=True)
+    def test_perf(self, fxt_model_id: str, fxt_benchmark: Benchmark):
+        """Benchmark performance metrics."""
+        result = fxt_benchmark.run(model_id=fxt_model_id)
+        fxt_benchmark.check(
+            result,
+            criteria=[
+                {
+                    "name": "Accuracy(train)",
+                    "op": ">",
+                    "margin": 0.1,
+                },
+                {
+                    "name": "Accuracy(export)",
+                    "op": ">",
+                    "margin": 0.1,
+                },
+                {
+                    "name": "Accuracy(optimize)",
+                    "op": ">",
+                    "margin": 0.1,
+                },
+                {
+                    "name": "epoch",
+                    "op": "<",
+                    "margin": 0.1,
+                },
+                {
+                    "name": "train_e2e_time",
+                    "op": "<",
+                    "margin": 0.1,
+                },
+                {
+                    "name": "avg_data_time",
+                    "op": "<",
+                    "margin": 0.1,
+                },
+                {
+                    "name": "avg_iter_time",
+                    "op": "<",
+                    "margin": 0.1,
+                },
+                {
+                    "name": "avg_time_per_image(export)",
+                    "op": "<",
+                    "margin": 0.1,
+                },
+                {
+                    "name": "avg_time_per_image(optimize)",
+                    "op": "<",
+                    "margin": 0.1,
+                },
+            ],
+        )
+
+
+class TestPerfActionDetection:
+    """Benchmark action detection."""
+
+    MODEL_TEMPLATES = Registry(f"src/otx/algorithms").filter(task_type="ACTION_DETECTION").templates
+    MODEL_IDS = [template.model_template_id for template in MODEL_TEMPLATES]
+
+    BENCHMARK_CONFIGS = {
+        "small": {
+            "tags": {
+                "task": "action_detection",
+            },
+            "datasets": [
+                "action/action_detection/UCF101_cvat_5percent",
+            ],
+            "num_repeat": 5,
+            "num_epoch": 3,
+        },
+        "medium": {
+            "tags": {
+                "task": "action_detection",
+            },
+            "datasets": [
+                "action/action_detection/UCF101_cvat_30percent",
+            ],
+            "num_repeat": 5,
+            "num_epoch": 3,
+        },
+        "large": {
+            "tags": {
+                "task": "action_detection",
+            },
+            "datasets": [
+                "action/action_detection/UCF101_cvat",
+            ],
+            "num_repeat": 5,
+            "num_epoch": 1,
+        },
+    }
+
+    @pytest.mark.parametrize("fxt_model_id", MODEL_TEMPLATES, ids=MODEL_IDS, indirect=True)
+    @pytest.mark.parametrize("fxt_benchmark", BENCHMARK_CONFIGS.items(), ids=BENCHMARK_CONFIGS.keys(), indirect=True)
+    def test_perf(self, fxt_model_id: str, fxt_benchmark: Benchmark):
+        """Benchmark performance metrics."""
+        result = fxt_benchmark.run(model_id=fxt_model_id)
+        fxt_benchmark.check(
+            result,
+            criteria=[
+                {
+                    "name": "f-measure(train)",
+                    "op": ">",
+                    "margin": 0.1,
+                },
+                {
+                    "name": "epoch",
+                    "op": "<",
+                    "margin": 0.1,
+                },
+                {
+                    "name": "f-measure(export)",
+                    "op": ">",
+                    "margin": 0.1,
+                },
+                {
+                    "name": "f-measure(optimize)",
+                    "op": ">",
+                    "margin": 0.1,
+                },
+                {
+                    "name": "train_e2e_time",
+                    "op": "<",
+                    "margin": 0.1,
+                },
+                {
+                    "name": "avg_data_time",
+                    "op": "<",
+                    "margin": 0.1,
+                },
+                {
+                    "name": "avg_iter_time",
+                    "op": "<",
+                    "margin": 0.1,
+                },
+                {
+                    "name": "avg_time_per_image(export)",
+                    "op": "<",
+                    "margin": 0.1,
+                },
+                {
+                    "name": "avg_time_per_image(optimize)",
+                    "op": "<",
+                    "margin": 0.1,
+                },
+            ],
+        )
diff --git a/tests/perf/test_visual_prompting.py b/tests/perf/test_visual_prompting.py
index 5d59f7ba09c..63377b52c35 100644
--- a/tests/perf/test_visual_prompting.py
+++ b/tests/perf/test_visual_prompting.py
@@ -2,3 +2,189 @@
 
 # Copyright (C) 2024 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
+
+
+import pytest
+
+from otx.cli.registry import Registry
+from typing import Callable
+from .benchmark import Benchmark
+
+
+class TestPerfVisualPrompting:
+    """Benchmark visual prompting."""
+
+    MODEL_TEMPLATES = [
+        template
+        for template in Registry("src/otx/algorithms/visual_prompting").filter(task_type="VISUAL_PROMPTING").templates
+        if "Zero_Shot" not in template.name
+    ]
+    MODEL_IDS = [template.model_template_id for template in MODEL_TEMPLATES]
+
+    BENCHMARK_CONFIGS = {
+        "small": {
+            "tags": {
+                "task": "visual_prompting",
+            },
+            "datasets": [
+                "visual_prompting/wgisd_small/1",
+                "visual_prompting/wgisd_small/2",
+                "visual_prompting/wgisd_small/3",
+            ],
+            "num_repeat": 5,
+        },
+        "medium": {
+            "tags": {
+                "task": "visual_prompting",
+            },
+            "datasets": [
+                "visual_prompting/coco_car_person_medium",
+            ],
+            "num_repeat": 5,
+        },
+        "large": {
+            "tags": {
+                "task": "visual_prompting",
+            },
+            "datasets": [
+                "visual_prompting/Vitens-Coliform-coco",
+            ],
+            "num_repeat": 5,
+        },
+    }
+
+    @pytest.mark.parametrize("fxt_model_id", MODEL_TEMPLATES, ids=MODEL_IDS, indirect=True)
+    @pytest.mark.parametrize("fxt_benchmark", BENCHMARK_CONFIGS.items(), ids=BENCHMARK_CONFIGS.keys(), indirect=True)
+    def test_perf(self, fxt_model_id: str, fxt_benchmark: Benchmark):
+        """Benchmark performance metrics."""
+        result = fxt_benchmark.run(model_id=fxt_model_id)
+        fxt_benchmark.check(
+            result,
+            criteria=[
+                {
+                    "name": "Dice Average(train)",
+                    "op": ">",
+                    "margin": 0.1,
+                },
+                {
+                    "name": "Dice Average(export)",
+                    "op": ">",
+                    "margin": 0.1,
+                },
+                {
+                    "name": "Dice Average(optimize)",
+                    "op": ">",
+                    "margin": 0.1,
+                },
+                {
+                    "name": "epoch",
+                    "op": "<",
+                    "margin": 0.1,
+                },
+                {
+                    "name": "train_e2e_time",
+                    "op": "<",
+                    "margin": 0.1,
+                },
+                {
+                    "name": "avg_data_time",
+                    "op": "<",
+                    "margin": 0.1,
+                },
+                {
+                    "name": "avg_iter_time",
+                    "op": "<",
+                    "margin": 0.1,
+                },
+                {
+                    "name": "avg_time_per_image(export)",
+                    "op": "<",
+                    "margin": 0.1,
+                },
+                {
+                    "name": "avg_time_per_image(optimize)",
+                    "op": "<",
+                    "margin": 0.1,
+                },
+            ],
+        )
+
+
+class TestPerfZeroShotVisualPrompting:
+    """Benchmark zero-shot visual prompting."""
+
+    MODEL_TEMPLATES = [
+        template
+        for template in Registry("src/otx/algorithms/visual_prompting").filter(task_type="VISUAL_PROMPTING").templates
+        if "Zero_Shot" in template.name
+    ]
+    MODEL_IDS = [template.model_template_id for template in MODEL_TEMPLATES]
+
+    BENCHMARK_CONFIGS = {
+        "medium": {
+            "tags": {
+                "task": "zero_shot_visual_prompting",
+            },
+            "datasets": [
+                "zero_shot_visual_prompting/coco_car_person_medium",
+            ],
+            "num_repeat": 5,
+            "num_epoch": 1,
+        },
+    }
+
+    @pytest.mark.parametrize("fxt_model_id", MODEL_TEMPLATES, ids=MODEL_IDS, indirect=True)
+    @pytest.mark.parametrize("fxt_benchmark", BENCHMARK_CONFIGS.items(), ids=BENCHMARK_CONFIGS.keys(), indirect=True)
+    def test_perf(self, fxt_model_id: str, fxt_benchmark: Benchmark):
+        """Benchmark performance metrics."""
+        result = fxt_benchmark.run(model_id=fxt_model_id)
+        fxt_benchmark.check(
+            result,
+            criteria=[
+                {
+                    "name": "Dice Average(train)",
+                    "op": ">",
+                    "margin": 0.1,
+                },
+                {
+                    "name": "epoch",
+                    "op": "<",
+                    "margin": 0.1,
+                },
+                {
+                    "name": "Dice Average(export)",
+                    "op": ">",
+                    "margin": 0.1,
+                },
+                {
+                    "name": "Dice Average(optimize)",
+                    "op": ">",
+                    "margin": 0.1,
+                },
+                {
+                    "name": "train_e2e_time",
+                    "op": "<",
+                    "margin": 0.1,
+                },
+                {
+                    "name": "avg_data_time",
+                    "op": "<",
+                    "margin": 0.1,
+                },
+                {
+                    "name": "avg_iter_time",
+                    "op": "<",
+                    "margin": 0.1,
+                },
+                {
+                    "name": "avg_time_per_image(export)",
+                    "op": "<",
+                    "margin": 0.1,
+                },
+                {
+                    "name": "avg_time_per_image(optimize)",
+                    "op": "<",
+                    "margin": 0.1,
+                },
+            ],
+        )
diff --git a/tools/experiment.py b/tools/experiment.py
index 6a79aae7537..d3ce63d3288 100644
--- a/tools/experiment.py
+++ b/tools/experiment.py
@@ -12,7 +12,7 @@
 import re
 import shutil
 import statistics
-import sys
+import subprocess
 from abc import ABC, abstractmethod
 from copy import copy, deepcopy
 from dataclasses import dataclass, field
@@ -22,7 +22,6 @@
 from typing import Any, Dict, List, Optional, Tuple, Union
 
 import yaml
-from otx.cli.tools.cli import main as otx_cli
 from rich.console import Console
 from rich.table import Table
 
@@ -919,9 +918,8 @@ def _prepare_run_command(self, command: List[str]) -> bool:
         return True
 
     def _run_otx_command(self, command: List[str]):
-        sys.argv = copy(command)
         try:
-            otx_cli()
+            subprocess.run(command, check=True)
         except Exception as e:
             self._fail_logs.append(CommandFailInfo(variable=self._command_var, exception=e, command=" ".join(command)))
 