From ec0f90658ea6d817ceb9609241149429fcd2147c Mon Sep 17 00:00:00 2001
From: Eunwoo Shin
Date: Thu, 30 May 2024 21:07:52 +0900
Subject: [PATCH] Make Perf test available to load previous Perf test to skip training stage (#3556)

* symlink to relative path
* skip training if prev perf result exists
* implement missing part
* align with pre-commit
* update test code
* fix typo
* change arg name to resume-from
* revert checkpoint symlink
---
 src/otx/cli/cli.py                       |   2 +-
 tests/conftest.py                        |   6 +
 tests/perf/benchmark.py                  | 175 +++++++++++++++--------
 tests/perf/conftest.py                   |  12 ++
 tests/perf/test_action.py                |  10 ++
 tests/perf/test_anomaly.py               |  15 ++
 tests/perf/test_classification.py        |  15 ++
 tests/perf/test_detection.py             |   5 +
 tests/perf/test_instance_segmentation.py |  10 ++
 tests/perf/test_semantic_segmentation.py |   5 +
 tests/perf/test_visual_prompting.py      |  10 ++
 11 files changed, 205 insertions(+), 60 deletions(-)

diff --git a/src/otx/cli/cli.py b/src/otx/cli/cli.py
index 4694275c0cd..83da4f45023 100644
--- a/src/otx/cli/cli.py
+++ b/src/otx/cli/cli.py
@@ -492,7 +492,7 @@ def update_latest(self, work_dir: Path) -> None:
         cache_dir = latest_dir / self.subcommand
         if cache_dir.exists():
             cache_dir.unlink()
-        cache_dir.symlink_to(work_dir)
+        cache_dir.symlink_to(Path("..") / work_dir.relative_to(work_dir.parent))
 
     def set_seed(self) -> None:
         """Set the random seed for reproducibility.
diff --git a/tests/conftest.py b/tests/conftest.py
index 4ef8c4059fd..2d4c9d484b8 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -119,6 +119,12 @@ def pytest_addoption(parser: pytest.Parser):
         "`pip install otx[full]@https://github.com/openvinotoolkit/training_extensions.git@{otx_ref}` will be executed before run, "
         "and reverted after run. Works only for v2.x assuming CLI compatibility.",
     )
+    parser.addoption(
+        "--resume-from",
+        type=str,
+        help="Previous performance test directory which contains execution results. "
+        "If training was already done in the previous performance test, training is skipped and the previous result is reused.",
+    )
     parser.addoption(
         "--open-subprocess",
         action="store_true",
diff --git a/tests/perf/benchmark.py b/tests/perf/benchmark.py
index d590ba993a2..ac575529c9f 100644
--- a/tests/perf/benchmark.py
+++ b/tests/perf/benchmark.py
@@ -7,11 +7,12 @@
 
 import gc
 import logging
+import shutil
 import subprocess
 from dataclasses import dataclass
 from pathlib import Path
 from time import time
-from typing import Any
+from typing import Any, Literal
 
 import numpy as np
 import pandas as pd
@@ -132,6 +133,7 @@ def run(
         model: Model,
         dataset: Dataset,
         criteria: list[Criterion],
+        resume_from: Path | None = None,
     ) -> pd.DataFrame | None:
         """Run configured benchmark with given dataset and model and return the result.
 
@@ -139,6 +141,9 @@ def run(
             model (Model): Target model settings
             dataset (Dataset): Target dataset settings
            criteria (list[Criterion]): Target criteria settings
+            resume_from (Path | None, optional):
+                Previous performance directory to load. If training was already done in the previous
+                performance test, training is skipped and the previous result is reused.
 
         Returns:
             pd.DataFrame | None: Table with benchmark metrics
@@ -168,6 +173,13 @@ def run(
             tags["seed"] = str(seed)
 
             # Train & test
+            copied_train_dir = None
+            if (
+                resume_from is not None
+                and (prev_train_dir := self._find_corresponding_dir(resume_from, tags)) is not None
+            ):
+                copied_train_dir = self._copy_prev_train_dir(prev_train_dir, sub_work_dir)
+
             command = [
                 "otx",
                 "train",
@@ -189,13 +201,19 @@ def run(
                 command.extend(["--deterministic", str(self.deterministic)])
             if self.num_epoch > 0:
                 command.extend(["--max_epochs", str(self.num_epoch)])
-            start_time = time()
-            self._run_command(command)
-            extra_metrics = {"train/e2e_time": time() - start_time}
-            self._rename_raw_data(
-                work_dir=sub_work_dir / ".latest" / "train",
-                replaces={"train_": "train/", "{pre}": "train/"},
-            )
+            extra_metrics = {}
+            if copied_train_dir is not None:
+                command.append("--print_config")
+                with (copied_train_dir / "configs.yaml").open("w") as f:
+                    self._run_command(command, stdout=f)  # replace the previous configs.yaml with the new one
+            else:
+                start_time = time()
+                self._run_command(command)
+                extra_metrics["train/e2e_time"] = time() - start_time
+                self._rename_raw_data(
+                    work_dir=sub_work_dir / ".latest" / "train",
+                    replaces={"train_": "train/", "{pre}": "train/"},
+                )
             self._log_metrics(
                 work_dir=sub_work_dir / ".latest" / "train",
                 tags=tags,
@@ -203,21 +221,7 @@ def run(
                 extra_metrics=extra_metrics,
             )
 
-            command = [
-                "otx",
-                "test",
-                "--work_dir",
-                str(sub_work_dir),
-            ]
-            for key, value in dataset.extra_overrides.get("test", {}).items():
-                command.append(f"--{key}")
-                command.append(str(value))
-            self._run_command(command)
-            self._rename_raw_data(
-                work_dir=sub_work_dir / ".latest" / "test",
-                replaces={"test_": "test/", "{pre}": "test/"},
-            )
-            self._log_metrics(work_dir=sub_work_dir / ".latest" / "test", tags=tags, criteria=criteria)
+            self._run_test(sub_work_dir, dataset, tags, criteria, what2test="train")
 
             # Export & test
             if self.eval_upto in ["export", "optimize"]:
@@ -236,24 +240,14 @@ def run(
                 if not exported_model_path.exists():
                     exported_model_path = sub_work_dir / ".latest" / "export" / "exported_model_decoder.xml"
 
-                command = [  # NOTE: not working for h_label_cls. to be fixed
-                    "otx",
-                    "test",
-                    "--checkpoint",
-                    str(exported_model_path),
-                    "--work_dir",
-                    str(sub_work_dir),
-                ]
-                for key, value in dataset.extra_overrides.get("test", {}).items():
-                    command.append(f"--{key}")
-                    command.append(str(value))
-                self._run_command(command)
-
-                self._rename_raw_data(
-                    work_dir=sub_work_dir / ".latest" / "test",
-                    replaces={"test": "export", "{pre}": "export/"},
+                self._run_test(
+                    sub_work_dir,
+                    dataset,
+                    tags,
+                    criteria,
+                    checkpoint=exported_model_path,
+                    what2test="export",
                 )
-                self._log_metrics(work_dir=sub_work_dir / ".latest" / "test", tags=tags, criteria=criteria)
 
             # Optimize & test
             if self.eval_upto == "optimize":
@@ -274,24 +268,14 @@ def run(
                 if not optimized_model_path.exists():
                     optimized_model_path = sub_work_dir / ".latest" / "optimize" / "optimized_model_decoder.xml"
 
-                command = [
-                    "otx",
-                    "test",
-                    "--checkpoint",
-                    str(optimized_model_path),
-                    "--work_dir",
-                    str(sub_work_dir),
-                ]
-                for key, value in dataset.extra_overrides.get("test", {}).items():
-                    command.append(f"--{key}")
-                    command.append(str(value))
-                self._run_command(command)
-
-                self._rename_raw_data(
-                    work_dir=sub_work_dir / ".latest" / "test",
-                    replaces={"test": "optimize", "{pre}": "optimize/"},
+                self._run_test(
+                    sub_work_dir,
+                    dataset,
+                    tags,
+                    criteria,
+                    checkpoint=optimized_model_path,
+                    what2test="optimize",
                 )
-                self._log_metrics(work_dir=sub_work_dir / ".latest" / "test", tags=tags, criteria=criteria)
 
             # Force memory clean up
             gc.collect()
@@ -308,10 +292,83 @@ def run(
         result = summary.average(result, keys=["task", "model", "data_group", "data"])  # Average out seeds
         return result.set_index(["task", "model", "data_group", "data"])
 
-    def _run_command(self, command: list[str]) -> None:
+    def _find_corresponding_dir(self, resume_from: Path, tags: dict[str, str]) -> Path | None:
+        for csv_file in resume_from.rglob("benchmark.raw.csv"):
+            raw_data = pd.read_csv(csv_file)
+            if (
+                "train/epoch" in raw_data.columns  # check it's csv of train result
+                and all(  # check meta info is same
+                    str(raw_data.iloc[0].get(key, "NOT_IN_CSV")) == tags.get(key, "NOT_IN_TAG")
+                    for key in ["data_group", "data", "model", "task", "seed"]
+                )
+            ):
+                return csv_file.parent
+        return None
+
+    def _copy_prev_train_dir(self, prev_train_dir: Path, work_dir: Path) -> Path:
+        work_dir.mkdir(parents=True, exist_ok=True)
+        new_train_dir = work_dir / prev_train_dir.name
+        shutil.copytree(prev_train_dir, new_train_dir, ignore_dangling_symlinks=True)
+        cache_dir = work_dir / ".latest" / "train"
+        cache_dir.parent.mkdir(exist_ok=True)
+        cache_dir.symlink_to(Path("..") / new_train_dir.relative_to(work_dir))
+
+        return new_train_dir
+
+    def _run_test(
+        self,
+        work_dir: Path | str,
+        dataset: Dataset,
+        tags: dict[str, str],
+        criteria: list[Criterion],
+        checkpoint: Path | str | None = None,
+        what2test: Literal["train", "export", "optimize"] = "train",
+    ) -> None:
+        """Run otx test and update the result csv file to align its indices with the current task."""
+        replace_map = {
+            "train": {"test_": "test/", "{pre}": "export/"},
+            "export": {"test": "export", "{pre}": "export/"},
+            "optimize": {"test": "optimize", "{pre}": "optimize/"},
+        }
+
+        command = [
+            "otx",
+            "test",
+            "--work_dir",
+            str(work_dir),
+        ]
+        if checkpoint is not None:
+            command.extend(["--checkpoint", str(checkpoint)])
+        for key, value in dataset.extra_overrides.get("test", {}).items():
+            command.append(f"--{key}")
+            command.append(str(value))
+
+        start_time = time()
+        self._run_command(command)
+        extra_metrics = {f"test({what2test})/e2e_time": time() - start_time}
+
+        self._rename_raw_data(
+            work_dir=work_dir / ".latest" / "test",
+            replaces=replace_map[what2test],
+        )
+        self._log_metrics(
+            work_dir=work_dir / ".latest" / "test",
+            tags=tags,
+            criteria=criteria,
+            extra_metrics=extra_metrics,
+        )
+
+    def _run_command(self, command: list[str], **kwargs) -> None:
+        """Run command using 'subprocess.run'.
+
+        Args:
+            command (list[str]): command to execute.
+            kwargs: keyword arguments passed to 'subprocess.run'.
+        """
         print(" ".join(command))
+        kwargs["check"] = True
         if not self.dry_run:
-            subprocess.run(command, check=True)  # noqa: S603
+            subprocess.run(command, **kwargs)  # noqa: S603, PLW1510
 
     def _log_metrics(
         self,
diff --git a/tests/perf/conftest.py b/tests/perf/conftest.py
index e690f3d8d16..6a0904ac6d9 100644
--- a/tests/perf/conftest.py
+++ b/tests/perf/conftest.py
@@ -245,6 +245,16 @@ def fxt_tags(fxt_user_name: str, fxt_version_tags: dict[str, str]) -> dict[str,
     return tags
 
 
+@pytest.fixture(scope="session")
+def fxt_resume_from(request: pytest.FixtureRequest) -> Path | None:
+    resume_from = request.config.getoption("--resume-from")
+    if resume_from is not None:
+        resume_from = Path(resume_from)
+    msg = f"{resume_from = }"
+    log.info(msg)
+    return resume_from
+
+
 @pytest.fixture()
 def fxt_benchmark(
     fxt_data_root: Path,
@@ -356,11 +366,13 @@ def _test_perf(
         dataset: Benchmark.Dataset,
         benchmark: Benchmark,
         criteria: list[Benchmark.Criterion],
+        resume_from: Path | None,
     ) -> None:
         result = benchmark.run(
             model=model,
             dataset=dataset,
             criteria=criteria,
+            resume_from=resume_from,
         )
         benchmark.check(
             result=result,
diff --git a/tests/perf/test_action.py b/tests/perf/test_action.py
index 96c6595cbef..bba1ab52ea6 100644
--- a/tests/perf/test_action.py
+++ b/tests/perf/test_action.py
@@ -70,6 +70,9 @@ class TestPerfActionClassification(PerfTestBase):
         Benchmark.Criterion(name="test/iter_time", summary="mean", compare="<", margin=0.1),
         Benchmark.Criterion(name="export/iter_time", summary="mean", compare="<", margin=0.1),
         Benchmark.Criterion(name="optimize/iter_time", summary="mean", compare="<", margin=0.1),
+        Benchmark.Criterion(name="test(train)/e2e_time", summary="max", compare=">", margin=0.1),
+        Benchmark.Criterion(name="test(export)/e2e_time", summary="max", compare=">", margin=0.1),
+        Benchmark.Criterion(name="test(optimize)/e2e_time", summary="max", compare=">", margin=0.1),
     ]
 
     @pytest.mark.parametrize(
@@ -89,12 +92,14 @@ def test_perf(
         fxt_model: Benchmark.Model,
         fxt_dataset: Benchmark.Dataset,
         fxt_benchmark: Benchmark,
+        fxt_resume_from: Path | None,
     ):
         self._test_perf(
             model=fxt_model,
             dataset=fxt_dataset,
             benchmark=fxt_benchmark,
             criteria=self.BENCHMARK_CRITERIA,
+            resume_from=fxt_resume_from,
         )
 
 
@@ -154,6 +159,9 @@ class TestPerfActionDetection(PerfTestBase):
         Benchmark.Criterion(name="test/iter_time", summary="mean", compare="<", margin=0.1),
         Benchmark.Criterion(name="export/iter_time", summary="mean", compare="<", margin=0.1),
         Benchmark.Criterion(name="optimize/iter_time", summary="mean", compare="<", margin=0.1),
+        Benchmark.Criterion(name="test(train)/e2e_time", summary="max", compare=">", margin=0.1),
+        Benchmark.Criterion(name="test(export)/e2e_time", summary="max", compare=">", margin=0.1),
+        Benchmark.Criterion(name="test(optimize)/e2e_time", summary="max", compare=">", margin=0.1),
     ]
 
     @pytest.mark.parametrize(
@@ -173,10 +181,12 @@ def test_perf(
         fxt_model: Benchmark.Model,
         fxt_dataset: Benchmark.Dataset,
         fxt_benchmark: Benchmark,
+        fxt_resume_from: Path | None,
     ):
         self._test_perf(
             model=fxt_model,
             dataset=fxt_dataset,
             benchmark=fxt_benchmark,
             criteria=self.BENCHMARK_CRITERIA,
+            resume_from=fxt_resume_from,
         )
diff --git a/tests/perf/test_anomaly.py b/tests/perf/test_anomaly.py
index 63883e068bf..cea8ff3d804 100644
--- a/tests/perf/test_anomaly.py
+++ b/tests/perf/test_anomaly.py
@@ -57,6 +57,9 @@ class TestPerfAnomalyClassification(PerfTestBase):
         Benchmark.Criterion(name="test/iter_time", summary="mean", compare="<", margin=0.1),
         Benchmark.Criterion(name="export/iter_time", summary="mean", compare="<", margin=0.1),
         Benchmark.Criterion(name="optimize/iter_time", summary="mean", compare="<", margin=0.1),
+        Benchmark.Criterion(name="test(train)/e2e_time", summary="max", compare=">", margin=0.1),
+        Benchmark.Criterion(name="test(export)/e2e_time", summary="max", compare=">", margin=0.1),
+        Benchmark.Criterion(name="test(optimize)/e2e_time", summary="max", compare=">", margin=0.1),
     ]
 
     @pytest.mark.parametrize(
@@ -76,12 +79,14 @@ def test_perf(
         fxt_model: Benchmark.Model,
         fxt_dataset: Benchmark.Dataset,
         fxt_benchmark: Benchmark,
+        fxt_resume_from: Path | None,
     ):
         self._test_perf(
             model=fxt_model,
             dataset=fxt_dataset,
             benchmark=fxt_benchmark,
             criteria=self.BENCHMARK_CRITERIA,
+            resume_from=fxt_resume_from,
         )
 
 
@@ -129,6 +134,9 @@ class TestPerfAnomalyDetection(PerfTestBase):
         Benchmark.Criterion(name="test/iter_time", summary="mean", compare="<", margin=0.1),
         Benchmark.Criterion(name="export/iter_time", summary="mean", compare="<", margin=0.1),
         Benchmark.Criterion(name="optimize/iter_time", summary="mean", compare="<", margin=0.1),
+        Benchmark.Criterion(name="test(train)/e2e_time", summary="max", compare=">", margin=0.1),
+        Benchmark.Criterion(name="test(export)/e2e_time", summary="max", compare=">", margin=0.1),
+        Benchmark.Criterion(name="test(optimize)/e2e_time", summary="max", compare=">", margin=0.1),
     ]
 
     @pytest.mark.parametrize(
@@ -148,12 +156,14 @@ def test_perf(
         fxt_model: Benchmark.Model,
         fxt_dataset: Benchmark.Dataset,
         fxt_benchmark: Benchmark,
+        fxt_resume_from: Path | None,
     ):
         self._test_perf(
             model=fxt_model,
             dataset=fxt_dataset,
             benchmark=fxt_benchmark,
             criteria=self.BENCHMARK_CRITERIA,
+            resume_from=fxt_resume_from,
         )
 
 
@@ -201,6 +211,9 @@ class TestPerfAnomalySegmentation(PerfTestBase):
         Benchmark.Criterion(name="test/iter_time", summary="mean", compare="<", margin=0.1),
         Benchmark.Criterion(name="export/iter_time", summary="mean", compare="<", margin=0.1),
         Benchmark.Criterion(name="optimize/iter_time", summary="mean", compare="<", margin=0.1),
+        Benchmark.Criterion(name="test(train)/e2e_time", summary="max", compare=">", margin=0.1),
+        Benchmark.Criterion(name="test(export)/e2e_time", summary="max", compare=">", margin=0.1),
+        Benchmark.Criterion(name="test(optimize)/e2e_time", summary="max", compare=">", margin=0.1),
     ]
 
     @pytest.mark.parametrize(
@@ -220,10 +233,12 @@ def test_perf(
         fxt_model: Benchmark.Model,
         fxt_dataset: Benchmark.Dataset,
         fxt_benchmark: Benchmark,
+        fxt_resume_from: Path | None,
     ):
         self._test_perf(
             model=fxt_model,
             dataset=fxt_dataset,
             benchmark=fxt_benchmark,
             criteria=self.BENCHMARK_CRITERIA,
+            resume_from=fxt_resume_from,
         )
diff --git a/tests/perf/test_classification.py b/tests/perf/test_classification.py
index 3f2924d2db9..212c4bcbd65 100644
--- a/tests/perf/test_classification.py
+++ b/tests/perf/test_classification.py
@@ -61,6 +61,9 @@ class TestPerfSingleLabelClassification(PerfTestBase):
         Benchmark.Criterion(name="test/iter_time", summary="mean", compare="<", margin=0.1),
         Benchmark.Criterion(name="export/iter_time", summary="mean", compare="<", margin=0.1),
         Benchmark.Criterion(name="optimize/iter_time", summary="mean", compare="<", margin=0.1),
+        Benchmark.Criterion(name="test(train)/e2e_time", summary="max", compare=">", margin=0.1),
+        Benchmark.Criterion(name="test(export)/e2e_time", summary="max", compare=">", margin=0.1),
+        Benchmark.Criterion(name="test(optimize)/e2e_time", summary="max", compare=">", margin=0.1),
     ]
 
     @pytest.mark.parametrize(
@@ -80,12 +83,14 @@ def test_perf(
         fxt_model: Benchmark.Model,
         fxt_dataset: Benchmark.Dataset,
         fxt_benchmark: Benchmark,
+        fxt_resume_from: Path | None,
     ):
         self._test_perf(
             model=fxt_model,
             dataset=fxt_dataset,
             benchmark=fxt_benchmark,
             criteria=self.BENCHMARK_CRITERIA,
+            resume_from=fxt_resume_from,
         )
 
 
@@ -136,6 +141,9 @@ class TestPerfMultiLabelClassification(PerfTestBase):
         Benchmark.Criterion(name="test/iter_time", summary="mean", compare="<", margin=0.1),
         Benchmark.Criterion(name="export/iter_time", summary="mean", compare="<", margin=0.1),
         Benchmark.Criterion(name="optimize/iter_time", summary="mean", compare="<", margin=0.1),
+        Benchmark.Criterion(name="test(train)/e2e_time", summary="max", compare=">", margin=0.1),
+        Benchmark.Criterion(name="test(export)/e2e_time", summary="max", compare=">", margin=0.1),
+        Benchmark.Criterion(name="test(optimize)/e2e_time", summary="max", compare=">", margin=0.1),
     ]
 
     @pytest.mark.parametrize(
@@ -155,12 +163,14 @@ def test_perf(
         fxt_model: Benchmark.Model,
         fxt_dataset: Benchmark.Dataset,
         fxt_benchmark: Benchmark,
+        fxt_resume_from: Path | None,
     ):
         self._test_perf(
             model=fxt_model,
             dataset=fxt_dataset,
             benchmark=fxt_benchmark,
             criteria=self.BENCHMARK_CRITERIA,
+            resume_from=fxt_resume_from,
         )
 
 
@@ -205,6 +215,9 @@ class TestPerfHierarchicalLabelClassification(PerfTestBase):
         Benchmark.Criterion(name="test/iter_time", summary="mean", compare="<", margin=0.1),
         Benchmark.Criterion(name="export/iter_time", summary="mean", compare="<", margin=0.1),
         Benchmark.Criterion(name="optimize/iter_time", summary="mean", compare="<", margin=0.1),
+        Benchmark.Criterion(name="test(train)/e2e_time", summary="max", compare=">", margin=0.1),
+        Benchmark.Criterion(name="test(export)/e2e_time", summary="max", compare=">", margin=0.1),
+        Benchmark.Criterion(name="test(optimize)/e2e_time", summary="max", compare=">", margin=0.1),
     ]
 
     @pytest.mark.parametrize(
@@ -224,10 +237,12 @@ def test_perf(
         fxt_model: Benchmark.Model,
         fxt_dataset: Benchmark.Dataset,
         fxt_benchmark: Benchmark,
+        fxt_resume_from: Path | None,
     ):
         self._test_perf(
             model=fxt_model,
             dataset=fxt_dataset,
             benchmark=fxt_benchmark,
             criteria=self.BENCHMARK_CRITERIA,
+            resume_from=fxt_resume_from,
         )
diff --git a/tests/perf/test_detection.py b/tests/perf/test_detection.py
index 045badf4606..9ae73f7c932 100644
--- a/tests/perf/test_detection.py
+++ b/tests/perf/test_detection.py
@@ -84,6 +84,9 @@ class TestPerfObjectDetection(PerfTestBase):
         Benchmark.Criterion(name="test/iter_time", summary="mean", compare="<", margin=0.1),
         Benchmark.Criterion(name="export/iter_time", summary="mean", compare="<", margin=0.1),
         Benchmark.Criterion(name="optimize/iter_time", summary="mean", compare="<", margin=0.1),
+        Benchmark.Criterion(name="test(train)/e2e_time", summary="max", compare=">", margin=0.1),
+        Benchmark.Criterion(name="test(export)/e2e_time", summary="max", compare=">", margin=0.1),
+        Benchmark.Criterion(name="test(optimize)/e2e_time", summary="max", compare=">", margin=0.1),
     ]
 
     @pytest.mark.parametrize(
@@ -103,10 +106,12 @@ def test_perf(
         fxt_model: Benchmark.Model,
         fxt_dataset: Benchmark.Dataset,
         fxt_benchmark: Benchmark,
+        fxt_resume_from: Path | None,
     ):
         self._test_perf(
             model=fxt_model,
             dataset=fxt_dataset,
             benchmark=fxt_benchmark,
             criteria=self.BENCHMARK_CRITERIA,
+            resume_from=fxt_resume_from,
         )
diff --git a/tests/perf/test_instance_segmentation.py b/tests/perf/test_instance_segmentation.py
index 288b2a48732..1a2e6342118 100644
--- a/tests/perf/test_instance_segmentation.py
+++ b/tests/perf/test_instance_segmentation.py
@@ -89,6 +89,9 @@ class TestPerfInstanceSegmentation(PerfTestBase):
         Benchmark.Criterion(name="test/iter_time", summary="mean", compare="<", margin=0.1),
         Benchmark.Criterion(name="export/iter_time", summary="mean", compare="<", margin=0.1),
         Benchmark.Criterion(name="optimize/iter_time", summary="mean", compare="<", margin=0.1),
+        Benchmark.Criterion(name="test(train)/e2e_time", summary="max", compare=">", margin=0.1),
+        Benchmark.Criterion(name="test(export)/e2e_time", summary="max", compare=">", margin=0.1),
+        Benchmark.Criterion(name="test(optimize)/e2e_time", summary="max", compare=">", margin=0.1),
     ]
 
     @pytest.mark.parametrize(
@@ -108,12 +111,14 @@ def test_perf(
         fxt_model: Benchmark.Model,
         fxt_dataset: Benchmark.Dataset,
         fxt_benchmark: Benchmark,
+        fxt_resume_from: Path | None,
     ):
         self._test_perf(
             model=fxt_model,
             dataset=fxt_dataset,
             benchmark=fxt_benchmark,
             criteria=self.BENCHMARK_CRITERIA,
+            resume_from=fxt_resume_from,
         )
 
 
@@ -177,6 +182,9 @@ class TestPerfTilingInstanceSegmentation(PerfTestBase):
         Benchmark.Criterion(name="test/iter_time", summary="mean", compare="<", margin=0.1),
         Benchmark.Criterion(name="export/iter_time", summary="mean", compare="<", margin=0.1),
         Benchmark.Criterion(name="optimize/iter_time", summary="mean", compare="<", margin=0.1),
+        Benchmark.Criterion(name="test(train)/e2e_time", summary="max", compare=">", margin=0.1),
+        Benchmark.Criterion(name="test(export)/e2e_time", summary="max", compare=">", margin=0.1),
+        Benchmark.Criterion(name="test(optimize)/e2e_time", summary="max", compare=">", margin=0.1),
     ]
 
     @pytest.mark.parametrize(
@@ -196,10 +204,12 @@ def test_perf(
         fxt_model: Benchmark.Model,
         fxt_dataset: Benchmark.Dataset,
         fxt_benchmark: Benchmark,
+        fxt_resume_from: Path | None,
     ):
         self._test_perf(
             model=fxt_model,
             dataset=fxt_dataset,
             benchmark=fxt_benchmark,
             criteria=self.BENCHMARK_CRITERIA,
+            resume_from=fxt_resume_from,
         )
diff --git a/tests/perf/test_semantic_segmentation.py b/tests/perf/test_semantic_segmentation.py
index 1cd5fe7a968..bf644258d3c 100644
--- a/tests/perf/test_semantic_segmentation.py
+++ b/tests/perf/test_semantic_segmentation.py
@@ -63,6 +63,9 @@ class TestPerfSemanticSegmentation(PerfTestBase):
         Benchmark.Criterion(name="test/iter_time", summary="mean", compare="<", margin=0.1),
         Benchmark.Criterion(name="export/iter_time", summary="mean", compare="<", margin=0.1),
         Benchmark.Criterion(name="optimize/iter_time", summary="mean", compare="<", margin=0.1),
+        Benchmark.Criterion(name="test(train)/e2e_time", summary="max", compare=">", margin=0.1),
+        Benchmark.Criterion(name="test(export)/e2e_time", summary="max", compare=">", margin=0.1),
+        Benchmark.Criterion(name="test(optimize)/e2e_time", summary="max", compare=">", margin=0.1),
     ]
 
     @pytest.mark.parametrize(
@@ -82,10 +85,12 @@ def test_perf(
         fxt_model: Benchmark.Model,
         fxt_dataset: Benchmark.Dataset,
         fxt_benchmark: Benchmark,
+        fxt_resume_from: Path | None,
     ):
         self._test_perf(
             model=fxt_model,
             dataset=fxt_dataset,
             benchmark=fxt_benchmark,
             criteria=self.BENCHMARK_CRITERIA,
+            resume_from=fxt_resume_from,
         )
diff --git a/tests/perf/test_visual_prompting.py b/tests/perf/test_visual_prompting.py
index c2d0a2f4766..0aae2d8c29b 100644
--- a/tests/perf/test_visual_prompting.py
+++ b/tests/perf/test_visual_prompting.py
@@ -58,6 +58,9 @@ class TestPerfVisualPrompting(PerfTestBase):
         Benchmark.Criterion(name="test/iter_time", summary="mean", compare="<", margin=0.1),
         Benchmark.Criterion(name="export/iter_time", summary="mean", compare="<", margin=0.1),
         Benchmark.Criterion(name="optimize/iter_time", summary="mean", compare="<", margin=0.1),
+        Benchmark.Criterion(name="test(train)/e2e_time", summary="max", compare=">", margin=0.1),
+        Benchmark.Criterion(name="test(export)/e2e_time", summary="max", compare=">", margin=0.1),
+        Benchmark.Criterion(name="test(optimize)/e2e_time", summary="max", compare=">", margin=0.1),
     ]
 
     @pytest.mark.parametrize(
@@ -77,12 +80,14 @@ def test_perf(
         fxt_model: Benchmark.Model,
         fxt_dataset: Benchmark.Dataset,
         fxt_benchmark: Benchmark,
+        fxt_resume_from: Path | None,
     ):
         self._test_perf(
             model=fxt_model,
             dataset=fxt_dataset,
             benchmark=fxt_benchmark,
             criteria=self.BENCHMARK_CRITERIA,
+            resume_from=fxt_resume_from,
         )
 
 
@@ -119,6 +124,9 @@ class TestPerfZeroShotVisualPrompting(PerfTestBase):
         Benchmark.Criterion(name="test/iter_time", summary="mean", compare="<", margin=0.1),
         Benchmark.Criterion(name="export/iter_time", summary="mean", compare="<", margin=0.1),
         Benchmark.Criterion(name="optimize/iter_time", summary="mean", compare="<", margin=0.1),
+        Benchmark.Criterion(name="test(train)/e2e_time", summary="max", compare=">", margin=0.1),
+        Benchmark.Criterion(name="test(export)/e2e_time", summary="max", compare=">", margin=0.1),
+        Benchmark.Criterion(name="test(optimize)/e2e_time", summary="max", compare=">", margin=0.1),
     ]
 
     @pytest.mark.parametrize(
@@ -138,10 +146,12 @@ def test_perf(
         fxt_model: Benchmark.Model,
        fxt_dataset: Benchmark.Dataset,
        fxt_benchmark: Benchmark,
+        fxt_resume_from: Path | None,
     ):
         self._test_perf(
             model=fxt_model,
             dataset=fxt_dataset,
             benchmark=fxt_benchmark,
             criteria=self.BENCHMARK_CRITERIA,
+            resume_from=fxt_resume_from,
         )
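
Note (editorial, not part of the patch): a minimal usage sketch for the option this patch introduces. With `--resume-from` registered in `tests/conftest.py`, pointing the perf suite at an earlier run's output directory lets `Benchmark.run()` locate a matching `benchmark.raw.csv` (same task, model, data_group, data, and seed), copy that training directory, and skip straight to the test/export/optimize stages. The path below is a placeholder, and any other options your perf setup normally requires still need to be passed as usual:

    pytest tests/perf/test_detection.py --resume-from /path/to/previous/perf/output_dir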