Support benchmark history summary v1 (#3307)
* Add history data, notebook and script

* Add perf summary workflow

* Update xlsx summary

* Use history as check ref

* Fix normalize
goodsong81 authored Apr 15, 2024
1 parent 479f86b commit 7d0b894
Showing 10 changed files with 9,318 additions and 204 deletions.
35 changes: 32 additions & 3 deletions .github/workflows/perf_benchmark.yaml
@@ -88,12 +88,13 @@ on:
Target OTX ref (tag / branch name / commit hash) on main repo to test. Defaults to the current branch.
`pip install otx[full]@https://github.com/openvinotoolkit/training_extensions.git@{otx_ref}` will be executed before run,
and reverted after run. Works only for v1.x assuming CLI compatibility.
default: __CURRENT_BRANCH_COMMIT__

# Declare default permissions as read only.
permissions: read-all

jobs:
Perf-Benchmark:
Perf-Benchmark-Run:
strategy:
fail-fast: false
matrix:
@@ -142,14 +143,42 @@ jobs:
--num-repeat ${{ inputs.num-repeat }}
--num-epoch ${{ inputs.num-epoch }}
--eval-upto ${{ inputs.eval-upto }}
--summary-csv .tox/perf-benchmark-summary.csv
--summary-file .tox/perf-benchmark-summary.xlsx
--mlflow-tracking-uri ${{ vars.MLFLOW_TRACKING_SERVER_URI }}
--user-name ${{ github.triggering_actor }}
--otx-ref ${{ inputs.otx-ref }}
- name: Upload test results
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3 # v4.3.1
with:
name: perf-benchmark-${{ matrix.task-short }}
path: .tox/perf-*.csv
path: .tox/perf-benchmark-*.*
# Use always() to always run this step to publish test results when there are test failures
if: ${{ always() }}

Perf-Benchmark-Summary:
needs: Perf-Benchmark-Run
runs-on: ubuntu-latest
steps:
- name: Download benchmark results
uses: actions/download-artifact@v4
with:
path: tests/perf/history/latest
- name: Checkout repository
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
- name: Install Python
uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0
with:
python-version: "3.10"
- name: Install dependencies
run: python -m pip install --upgrade pip pandas matplotlib nbconvert ipython ipykernel openpyxl
    - name: Summarize benchmark results
run: |
python tests/perf/history/summary.py tests/perf/history ./perf-benchmark-summary --pattern "*raw*.csv"
jupyter nbconvert --execute --to html --no-input tests/perf/history/summary.ipynb --output-dir ./perf-benchmark-summary --output perf-benchmark-summary
- name: Upload benchmark summary
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3 # v4.3.1
with:
name: perf-benchmark-summary
path: perf-benchmark-summary
# Use always() to always run this step to publish test results when there are test failures
if: ${{ always() }}
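
The summary job above calls `tests/perf/history/summary.py` on the downloaded artifacts and then renders `summary.ipynb` to HTML with nbconvert; neither file's content is included in this excerpt. A minimal sketch of what such a CLI could look like, assuming it merely globs the raw per-task CSVs, averages the numeric metrics, and writes CSV/XLSX outputs (the positional arguments and the `--pattern` flag follow the invocation above; everything else is illustrative):

```python
# Hypothetical sketch of tests/perf/history/summary.py -- not the actual implementation.
import argparse
from pathlib import Path

import pandas as pd


def main() -> None:
    parser = argparse.ArgumentParser(description="Aggregate raw benchmark CSVs into a summary.")
    parser.add_argument("input_root", type=Path, help="Directory containing raw benchmark CSV files")
    parser.add_argument("output_dir", type=Path, help="Directory to write the aggregated summary")
    parser.add_argument("--pattern", default="*raw*.csv", help="Glob pattern for raw result files")
    args = parser.parse_args()

    frames = [pd.read_csv(path) for path in sorted(args.input_root.rglob(args.pattern))]
    if not frames:
        print(f"No files matching {args.pattern} under {args.input_root}")
        return

    data = pd.concat(frames, ignore_index=True)
    # Average numeric metrics per task/model/dataset; non-numeric columns are dropped here.
    summary = data.groupby(["task", "model", "data_group", "data"]).mean(numeric_only=True)

    args.output_dir.mkdir(parents=True, exist_ok=True)
    summary.to_csv(args.output_dir / "perf-benchmark-summary.csv")
    summary.to_excel(args.output_dir / "perf-benchmark-summary.xlsx")  # needs openpyxl (added to dev.txt below)


if __name__ == "__main__":
    main()
```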
1 change: 1 addition & 0 deletions requirements/dev.txt
@@ -12,3 +12,4 @@ pytest-csv==3.0.*
tox==4.11.*
mlflow==2.10.2
py-cpuinfo==9.0.0
openpyxl==3.1.2
143 changes: 0 additions & 143 deletions tests/perf/benchmark-reference.csv

This file was deleted.

54 changes: 21 additions & 33 deletions tests/perf/benchmark.py
@@ -11,6 +11,7 @@
import subprocess # nosec B404
import yaml
from pathlib import Path
from .history import summary


class Benchmark:
@@ -107,6 +108,7 @@ def run(
subprocess.run(cmd, check=True)
# Load result
result = self.load_result(cfg_dir)
result = summary.average(result, ["task", "model", "data_group", "data"])
return result

@staticmethod
@@ -145,35 +147,7 @@ def load_result(result_path: str) -> pd.DataFrame | None:
if "train_e2e_time" in data:
data["train_e2e_time"] = pd.to_timedelta(data["train_e2e_time"]).dt.total_seconds() # H:M:S str -> seconds
data = data.rename(columns={"repeat": "seed"})
return data.set_index(["task", "model", "data_group", "data"])

@staticmethod
def average_result(data: pd.DataFrame, keys: list[str]) -> pd.DataFrame:
"""Average result w.r.t. given keys
Args:
result (pd.DataFrame): Result data frame
keys (list[str]): Keys to summarize whole data
Returns:
pd.DataFrame: Averaged result table
"""
# Flatten index
index_names = data.index.names
column_names = data.columns
data = data.reset_index()
# Average by keys
grouped = data.groupby(keys)
aggregated = grouped.mean(numeric_only=True)
# Merge index columns
idx_columns = set(index_names) - set(keys)
for col in idx_columns:
aggregated[col] = "all"
# Merge tag columns (non-numeric & non-index)
tag_columns = set(column_names) - set(aggregated.columns) - set(keys)
for col in tag_columns:
# Take common string prefix such as: ["data/1", "data/2", "data/3"] -> "data/"
aggregated[col] = grouped[col].agg(lambda x: os.path.commonprefix(x.tolist()))
# Recover index
return aggregated.reset_index().set_index(index_names)
return data
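
The averaging previously done by `Benchmark.average_result()` (removed above) now appears to come from the new `history.summary` module, judging by the `summary.average(result, [...])` call added to `run()`. A sketch that mirrors the removed logic as a standalone helper, purely for illustration and not the actual module contents:

```python
# Illustrative re-creation of the removed averaging logic; the real
# tests/perf/history/summary.py may differ.
import os

import pandas as pd


def average(data: pd.DataFrame, keys: list[str]) -> pd.DataFrame:
    """Average numeric metrics over the given keys."""
    grouped = data.groupby(keys)
    aggregated = grouped.mean(numeric_only=True)
    # Non-numeric, non-key columns such as ["data/1", "data/2"] collapse to their
    # common string prefix ("data/"), as in the removed Benchmark.average_result().
    tag_columns = set(data.columns) - set(aggregated.columns) - set(keys)
    for col in tag_columns:
        aggregated[col] = grouped[col].agg(lambda x: os.path.commonprefix(x.tolist()))
    return aggregated.reset_index()
```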

def _build_config(
self,
@@ -189,7 +163,7 @@

cfg = {}
cfg["tags"] = all_tags # metadata
cfg["output_path"] = os.path.abspath(self.output_root)
cfg["output_path"] = os.path.abspath(f"{self.output_root}/{model_id}")
cfg["constants"] = {
"dataroot": os.path.abspath(self.data_root),
}
@@ -246,30 +220,44 @@ def check(self, result: pd.DataFrame, criteria: list[dict]):
criteria (list[dict]): Criteria to check results
"""
if result is None:
print("[Check] No results loaded. Skipping result checking.")
return

if self.reference_results is None:
print("No benchmark references loaded. Skipping result checking.")
print("[Check] No benchmark references loaded. Skipping result checking.")
return

result = result.set_index(["task", "model", "data_group", "data"])

for key, result_entry in result.iterrows():
if key not in self.reference_results.index:
print(f"No benchmark reference for {key} loaded. Skipping result checking.")
print(f"[Check] No benchmark reference for {key} loaded. Skipping result checking.")
continue
target_entry = self.reference_results.loc[key]
if isinstance(target_entry, pd.DataFrame):
target_entry = target_entry.iloc[0] # 1-row pd.DataFrame to pd.Series
# Match num_repeat & seeds of result and target
result_seed_average = result_entry["seed"]
result_num_repeat = 2 * result_seed_average + 1 # (0+1+2+3+4)/5 = 2.0 -> 2*2.0+1 = 5
target_entry = target_entry.query(f"seed < {result_num_repeat}")
target_entry = target_entry.mean(numeric_only=True) # N-row pd.DataFrame to pd.Series

def compare(name: str, op: str, margin: float):
if name not in result_entry or result_entry[name] is None or np.isnan(result_entry[name]):
print(f"[Check] {name} not in result")
return
if name not in target_entry or target_entry[name] is None or np.isnan(target_entry[name]):
print(f"[Check] {name} not in target")
return
if op == "==":
print(
f"[Check] abs({name}:{result_entry[name]} - {name}:{target_entry[name]}) < {name}:{target_entry[name]} * {margin}",
)
assert abs(result_entry[name] - target_entry[name]) < target_entry[name] * margin
elif op == "<":
print(f"[Check] {name}:{result_entry[name]} < {name}:{target_entry[name]} * (1.0 + {margin})")
assert result_entry[name] < target_entry[name] * (1.0 + margin)
elif op == ">":
print(f"[Check] {name}:{result_entry[name]} > {name}:{target_entry[name]} * (1.0 - {margin})")
assert result_entry[name] > target_entry[name] * (1.0 - margin)

for criterion in criteria:
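For context, `compare()` above takes a metric name, an operator, and a relative margin, and the loop that follows feeds it entries from `criteria`. The exact schema of those entries is not visible in this excerpt; an illustrative example, assuming each criterion is a dict with `name`, `op`, and `margin` keys (hypothetical field and metric names, apart from `train_e2e_time` which appears in `load_result()` above):

```python
# Hypothetical criteria entries; field names are assumed from the
# compare(name, op, margin) signature and may not match the real configuration.
criteria = [
    {"name": "avg_score", "op": ">", "margin": 0.05},       # score may drop at most 5% vs. reference
    {"name": "train_e2e_time", "op": "<", "margin": 0.10},  # end-to-end training time may grow at most 10%
    {"name": "avg_iter_time", "op": "==", "margin": 0.20},  # per-iteration time must stay within +/-20%
]

for criterion in criteria:
    compare(criterion["name"], criterion["op"], criterion["margin"])
```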
54 changes: 29 additions & 25 deletions tests/perf/conftest.py
@@ -20,6 +20,7 @@
import yaml

from .benchmark import Benchmark
from .history import summary


def pytest_addoption(parser):
@@ -72,9 +73,9 @@ def pytest_addoption(parser):
help="Output root directory. Defaults to temp directory.",
)
parser.addoption(
"--summary-csv",
"--summary-file",
action="store",
help="Path to output summary cvs file. Defaults to {output-root}/benchmark-summary.csv",
help="Path to output summary file. Defaults to {output-root}/benchmark-summary.csv",
)
parser.addoption(
"--dry-run",
@@ -248,12 +249,15 @@ def fxt_benchmark(


def _log_benchmark_results_to_mlflow(results: pd.DataFrame, tags: dict[str, str], client: MlflowClient):
results = summary.normalize(results) # Standardize names for comparison
results = summary.average(results, keys=["task", "model", "data_group", "data"]) # Average out seeds
results = results.set_index(["task", "data_group", "data"])
for index, result in results.iterrows():
task, model, data_group, data = index
exp_name = f"[Benchmark] {task} | {model} | {data_group} | {data}"
task, data_group, data = index
model = result["model"]
exp_name = f"[Benchmark] {task} | {data_group} | {data}"
exp_tags = {
"task": task,
"model": model,
"data_group": data_group,
"data": data,
}
@@ -264,7 +268,7 @@ def _log_benchmark_results_to_mlflow(results: pd.DataFrame, tags: dict[str, str]
exp_id = exp.experiment_id
if exp.lifecycle_stage != "active":
client.restore_experiment(exp_id)
run_name = f"[{tags['date']} | {tags['user_name']} | {tags['otx_version']} | {tags['test_branch']} | {tags['test_commit']}"
run_name = f"[{model}] {tags['date']} | {tags['user_name']} | {tags['otx_version']} | {tags['test_branch']} | {tags['test_commit']}"
run_tags = {k: v for k, v in result.items() if isinstance(v, str)}
run_tags.update(**exp_tags, **tags)
run = client.create_run(exp_id, run_name=run_name, tags=run_tags)
@@ -282,32 +286,32 @@ def fxt_benchmark_summary(
):
"""Summarize all results at the end of test session."""
yield

raw_results = Benchmark.load_result(fxt_output_root)
if raw_results is None:
if raw_results is None or len(raw_results) == 0:
print("No benchmark results loaded in ", fxt_output_root)
return

print("=" * 20, "[Benchmark summary]")
summary_results = [
Benchmark.average_result(raw_results, ["task", "model", "data_group", "data"]),
Benchmark.average_result(raw_results, ["task", "model", "data_group"]),
Benchmark.average_result(raw_results, ["task", "model"]),
Benchmark.average_result(raw_results, ["task"]),
]
summary_results = pd.concat(summary_results)
summary_results = summary.summarize(raw_results)

print("=" * 20, "[Benchmark summary]")
print(summary_results)

summary_csv = request.config.getoption("--summary-csv")
if not summary_csv:
summary_csv = fxt_output_root / "perf-benchmark-summary.csv"
summary_file = request.config.getoption("--summary-file")
if not summary_file:
summary_file = fxt_output_root / "perf-benchmark-summary.csv"
else:
summary_file = Path(summary_file)
summary_file.parent.mkdir(parents=True, exist_ok=True)
raw_results.to_csv(summary_file.parent / "perf-benchmark-raw.csv", index=False)
if summary_file.suffix == ".xlsx":
summary_results.to_excel(summary_file)
else:
summary_csv = Path(summary_csv)
summary_csv.parent.mkdir(parents=True, exist_ok=True)
summary_results.to_csv(summary_csv)
raw_results.to_csv(summary_csv.parent / "perf-benchmark-raw.csv")
print(f" -> Saved to {summary_csv}.")
if summary_file.suffix != ".csv":
print(f"{summary_file.suffix} output is not supported.")
summary_file = summary_file.with_suffix(".csv")
summary_results.to_csv(summary_file)
print(f" -> Saved to {summary_file}.")

if fxt_mlflow_client is None:
print(
Expand All @@ -320,7 +324,7 @@ def fxt_benchmark_summary(
# test_branch = fxt_tags["test_branch"]
# if test_branch == "develop" or bool(re.match("^releases/[0-9]+\.[0-9]+\.[0-9]+$", test_branch)):
try:
_log_benchmark_results_to_mlflow(summary_results, fxt_tags, fxt_mlflow_client)
_log_benchmark_results_to_mlflow(raw_results, fxt_tags, fxt_mlflow_client)
except Exception as e:
print("MLFlow loging failed: ", e)

@@ -331,7 +335,7 @@ def fxt_benchmark_summary(
@pytest.fixture(scope="session")
def fxt_benchmark_reference() -> pd.DataFrame | None:
"""Load reference benchmark results with index."""
ref = pd.read_csv(Path(__file__).parent.resolve() / "benchmark-reference.csv")
ref = summary.load(Path(__file__).parent.resolve() / "history/v1.5.2")
if ref is not None:
ref = ref.set_index(["task", "model", "data_group", "data"])
return ref
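
The reference fixture now loads a whole history directory (`history/v1.5.2`) via `summary.load()` instead of the single `benchmark-reference.csv` that this commit deletes. A minimal sketch of what such a loader might look like, assuming it simply concatenates every CSV found under the directory and returns `None` when nothing is there (illustrative, not the actual implementation):

```python
# Hypothetical sketch of a load() helper; the real tests/perf/history/summary.py may differ.
from __future__ import annotations

from pathlib import Path

import pandas as pd


def load(result_path: Path, pattern: str = "*.csv") -> pd.DataFrame | None:
    """Concatenate all CSV files under a history directory, or return None if none exist."""
    csv_files = sorted(Path(result_path).rglob(pattern))
    if not csv_files:
        return None
    return pd.concat([pd.read_csv(path) for path in csv_files], ignore_index=True)
```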
4 changes: 4 additions & 0 deletions tests/perf/history/__init__.py
@@ -0,0 +1,4 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

"""OTX perfomance benchmark history."""
