Support benchmark history summary v2 (#3298)
* Add perf benchmark history data

* Add summary notebook

* Support optional excel output

* Update mlflow logging

* Check with the right seed average
goodsong81 authored Apr 15, 2024
1 parent 3dea944 commit 98bb7f3
Showing 11 changed files with 9,449 additions and 216 deletions.
34 changes: 31 additions & 3 deletions .github/workflows/perf_benchmark.yaml
@@ -97,7 +97,7 @@ on:
permissions: read-all

jobs:
Perf-Benchmark:
Perf-Benchmark-Run:
strategy:
fail-fast: false
matrix:
@@ -141,14 +141,42 @@ jobs:
--num-repeat ${{ inputs.num-repeat }}
--num-epoch ${{ inputs.num-epoch }}
--eval-upto ${{ inputs.eval-upto }}
--summary-csv .tox/perf-benchmark-summary.csv
--summary-file .tox/perf-benchmark-summary.xlsx
--mlflow-tracking-uri ${{ vars.MLFLOW_TRACKING_SERVER_URI }}
--user-name ${{ github.triggering_actor }}
--otx-ref ${{ inputs.otx-ref }}
- name: Upload test results
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3 # v4.3.1
with:
name: perf-benchmark-${{ matrix.task-short }}
path: .tox/perf-*.csv
path: .tox/perf-benchmark-*.*
# Use always() to always run this step to publish test results when there are test failures
if: ${{ always() }}

Perf-Benchmark-Summary:
needs: Perf-Benchmark-Run
runs-on: ubuntu-latest
steps:
- name: Download benchmark results
uses: actions/download-artifact@v4
with:
path: tests/perf/history/latest
- name: Checkout repository
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
- name: Install Python
uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0
with:
python-version: "3.10"
- name: Install dependencies
run: python -m pip install --upgrade pip pandas matplotlib nbconvert ipython ipykernel openpyxl
- name: Summarize benchmark results
run: |
python tests/perf/history/summary.py tests/perf/history ./perf-benchmark-summary --pattern "*raw*.csv" --normalize
jupyter nbconvert --execute --to html --no-input tests/perf/history/summary.ipynb --output-dir ./perf-benchmark-summary --output perf-benchmark-summary
- name: Upload benchmark summary
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3 # v4.3.1
with:
name: perf-benchmark-summary
path: perf-benchmark-summary
# Use always() to always run this step to publish test results when there are test failures
if: ${{ always() }}
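The new Perf-Benchmark-Summary job above downloads every per-task artifact into tests/perf/history/latest and aggregates it through tests/perf/history/summary.py, which is added by this commit but not visible in this view. Only its command line is shown, so the following is a minimal sketch of what such an entry point might look like; the load_all helper and the behavior of --normalize are assumptions inferred from the invocation and from how conftest.py calls summary helpers further down.

# Hypothetical sketch of a summary.py entry point matching the workflow call:
#   python tests/perf/history/summary.py <input_dir> <output_dir> --pattern "*raw*.csv" --normalize
# The real script is part of this commit but is not shown in this excerpt.
import argparse
from pathlib import Path

import pandas as pd


def load_all(input_dir: Path, pattern: str) -> pd.DataFrame:
    """Concatenate every CSV under input_dir whose name matches the glob pattern."""
    frames = [pd.read_csv(path) for path in input_dir.rglob(pattern)]
    return pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Aggregate raw perf-benchmark CSVs.")
    parser.add_argument("input_dir", type=Path)
    parser.add_argument("output_dir", type=Path)
    parser.add_argument("--pattern", default="*raw*.csv")
    parser.add_argument("--normalize", action="store_true",
                        help="Map legacy column names onto the current schema (assumed behavior).")
    args = parser.parse_args()

    raw = load_all(args.input_dir, args.pattern)
    # The actual --normalize handling (legacy -> current column names) is omitted in this sketch.
    args.output_dir.mkdir(parents=True, exist_ok=True)
    # Average out seeds per task/model/dataset and write one combined table.
    averaged = raw.groupby(["task", "model", "data_group", "data"]).mean(numeric_only=True)
    averaged.reset_index().to_csv(args.output_dir / "perf-benchmark-all.csv", index=False)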
5 changes: 3 additions & 2 deletions pyproject.toml
@@ -50,8 +50,9 @@ dev = [
"pytest-mock",
"pytest-csv",
"pytest-cov",
"mlflow==2.11.1", # For regression test
"py-cpuinfo==9.0.0", # For regression test
"mlflow==2.11.1", # For perf benchmark
"py-cpuinfo==9.0.0", # For perf benchmark
"openpyxl", # For perf benchmark
]
docs = [
"furo",
2 changes: 1 addition & 1 deletion tests/perf/__init__.py
@@ -1,4 +1,4 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

"""OTX perfomance benchamrk tests."""
"""OTX perfomance benchmark tests."""
143 changes: 0 additions & 143 deletions tests/perf/benchmark-reference.csv

This file was deleted.

65 changes: 26 additions & 39 deletions tests/perf/benchmark.py
@@ -7,7 +7,6 @@

import gc
import logging
import os
import subprocess
from dataclasses import dataclass
from pathlib import Path
@@ -17,6 +16,8 @@
import numpy as np
import pandas as pd

from .history import summary

log = logging.getLogger(__name__)


@@ -71,14 +72,25 @@ class Criterion:
def __call__(self, result_entry: pd.Series, target_entry: pd.Series) -> None:
"""Check result against given target."""
if self.name not in result_entry or result_entry[self.name] is None or np.isnan(result_entry[self.name]):
print(f"[Check] {self.name} not in result")
return
if self.name not in target_entry or target_entry[self.name] is None or np.isnan(target_entry[self.name]):
print(f"[Check] {self.name} not in target")
return
if self.compare == "==":
print(
f"[Check] abs({self.name}:{result_entry[self.name]} - {self.name}:{target_entry[self.name]}) < {self.name}:{target_entry[self.name]} * {self.margin}",
)
assert abs(result_entry[self.name] - target_entry[self.name]) < target_entry[self.name] * self.margin
elif self.compare == "<":
print(
f"[Check] {self.name}:{result_entry[self.name]} < {self.name}:{target_entry[self.name]} * (1.0 + {self.margin})",
)
assert result_entry[self.name] < target_entry[self.name] * (1.0 + self.margin)
elif self.compare == ">":
print(
f"[Check] {self.name}:{result_entry[self.name]} > {self.name}:{target_entry[self.name]} * (1.0 - {self.margin})",
)
assert result_entry[self.name] > target_entry[self.name] * (1.0 - self.margin)
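As a quick illustration of the margin logic in the new log messages above: each criterion compares one metric of the result row against the same metric of the reference row with a relative tolerance. The Criterion constructor is truncated in this view, so the snippet below re-implements only the three comparisons on plain pandas Series; the metric names and numbers are made up.

# Standalone illustration of the three margin checks; values are illustrative only.
import pandas as pd

result = pd.Series({"epoch": 9.5, "train/e2e_time": 105.0, "val/accuracy": 0.81})
target = pd.Series({"epoch": 10.0, "train/e2e_time": 100.0, "val/accuracy": 0.80})
margin = 0.1

# "==": the result must stay within +/- 10% of the reference value.
assert abs(result["epoch"] - target["epoch"]) < target["epoch"] * margin
# "<": the result must not exceed the reference by more than 10% (e.g. training time).
assert result["train/e2e_time"] < target["train/e2e_time"] * (1.0 + margin)
# ">": the result must not drop more than 10% below the reference (e.g. accuracy).
assert result["val/accuracy"] > target["val/accuracy"] * (1.0 - margin)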

def __init__(
@@ -279,7 +291,10 @@ def run(
gc.collect()

result = self.load_result(work_dir)
return self.average_result(result, keys=["task", "model", "data_group", "data"])
if result is None:
return None
result = summary.average(result, keys=["task", "model", "data_group", "data"]) # Average out seeds
return result.set_index(["task", "model", "data_group", "data"])

def _run_command(self, command: list[str]) -> None:
print(" ".join(command))
@@ -370,40 +385,7 @@ def load_result(result_path: Path) -> pd.DataFrame | None:
if len(results) == 0:
return None

return pd.concat(results, ignore_index=True).set_index(["task", "model", "data_group", "data"])

@staticmethod
def average_result(data: pd.DataFrame, keys: list[str]) -> pd.DataFrame | None:
"""Average result w.r.t. given keys
Args:
result (pd.DataFrame): Result data frame
keys (list[str]): Keys to summarize whole data
Retruns:
pd.DataFrame: Averaged result table
"""
if data is None:
return None

# Flatten index
index_names = data.index.names
column_names = data.columns
data = data.reset_index()
# Average by keys
grouped = data.groupby(keys)
aggregated = grouped.mean(numeric_only=True)
# Merge index columns
idx_columns = set(index_names) - set(keys)
for col in idx_columns:
aggregated[col] = "all"
# Merge tag columns (non-numeric & non-index)
tag_columns = set(column_names) - set(aggregated.columns) - set(keys)
for col in tag_columns:
# Take common string prefix such as: ["data/1", "data/2", "data/3"] -> "data/"
aggregated[col] = grouped[col].agg(lambda x: os.path.commonprefix(x.tolist()))
# Recover index
return aggregated.reset_index().set_index(index_names)
return pd.concat(results, ignore_index=True)
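The Benchmark.average_result helper deleted above has effectively moved into the new tests/perf/history/summary.py module (imported at the top of this file as `from .history import summary`). That module is not visible in this excerpt; judging from the removed code and from the new call sites, which pass a flat frame and re-index it afterwards, its average helper presumably looks roughly like this sketch:

# Rough sketch of what summary.average likely does, inferred from the removed
# Benchmark.average_result above; the real implementation in
# tests/perf/history/summary.py may differ.
import os

import pandas as pd


def average(data: pd.DataFrame, keys: list[str]) -> pd.DataFrame:
    """Average numeric columns w.r.t. the given keys, merging tag columns by common prefix."""
    grouped = data.groupby(keys)
    aggregated = grouped.mean(numeric_only=True)
    # Non-numeric, non-key columns are reduced to their common string prefix,
    # e.g. ["data/1", "data/2", "data/3"] -> "data/".
    tag_columns = set(data.columns) - set(aggregated.columns) - set(keys)
    for col in tag_columns:
        aggregated[col] = grouped[col].agg(lambda x: os.path.commonprefix(x.tolist()))
    return aggregated.reset_index()

The run() change above then re-indexes the averaged frame by task/model/data_group/data, which is the key check() uses to look up reference entries.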

def check(self, result: pd.DataFrame, criteria: list[Criterion]):
"""Check result w.r.t. reference data.
@@ -413,19 +395,24 @@ def check(self, result: pd.DataFrame, criteria: list[Criterion]):
criteria (list[Criterion]): Criteria to check results
"""
if result is None:
print("[Check] No results loaded. Skipping result checking.")
return

if self.reference_results is None:
print("No benchmark references loaded. Skipping result checking.")
print("[Check] No benchmark references loaded. Skipping result checking.")
return

for key, result_entry in result.iterrows():
if key not in self.reference_results.index:
print(f"No benchmark reference for {key} loaded. Skipping result checking.")
print(f"[Check] No benchmark reference for {key} loaded. Skipping result checking.")
continue
target_entry = self.reference_results.loc[key]
if isinstance(target_entry, pd.DataFrame):
target_entry = target_entry.iloc[0] # 1-row pd.DataFrame to pd.Series
# Match num_repeat of result and target
result_seed_average = result_entry["seed"]
result_num_repeat = 2 * result_seed_average + 1 # (0+1+2+3+4)/5 = 2.0 -> 2*2.0+1 = 5
target_entry = target_entry.query(f"seed < {result_num_repeat}")
target_entry = target_entry.mean(numeric_only=True) # N-row pd.DataFrame to pd.Series

for criterion in criteria:
criterion(result_entry, target_entry)
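The seed-matching step above relies on seeds being consecutive integers starting at 0: after averaging, the seed column holds (0 + 1 + ... + (N-1)) / N = (N-1)/2, so the original repeat count is recovered as N = 2 * mean + 1 and the multi-seed reference is trimmed to the same number of repeats before being averaged. A tiny worked example with made-up numbers:

# Recovering num_repeat from an averaged seed column and matching the reference
# to the same repeats; the metric values here are illustrative only.
import pandas as pd

seeds = [0, 1, 2]                        # the result was averaged over 3 repeats
seed_average = sum(seeds) / len(seeds)   # (0 + 1 + 2) / 3 = 1.0
num_repeat = 2 * seed_average + 1        # 2 * 1.0 + 1 = 3

reference = pd.DataFrame(
    {"seed": [0, 1, 2, 3, 4], "val/accuracy": [0.80, 0.82, 0.78, 0.81, 0.79]},
)
matched = reference.query(f"seed < {num_repeat}")   # keep only seeds 0, 1, 2
print(matched.mean(numeric_only=True))              # reference averaged over the same repeats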
65 changes: 37 additions & 28 deletions tests/perf/conftest.py
@@ -9,14 +9,18 @@
import subprocess
from datetime import datetime, timedelta, timezone
from pathlib import Path
from typing import TYPE_CHECKING
from urllib.parse import urlparse

import pandas as pd
import pytest
from cpuinfo import get_cpu_info
from mlflow.client import MlflowClient

from .benchmark import Benchmark
from .history import summary

if TYPE_CHECKING:
import pandas as pd

log = logging.getLogger(__name__)

@@ -71,9 +75,9 @@ def pytest_addoption(parser):
help="Output root directory. Defaults to temp directory.",
)
parser.addoption(
"--summary-csv",
"--summary-file",
action="store",
help="Path to output summary cvs file. Defaults to {output-root}/benchmark-summary.csv",
help="Path to output summary file. Defaults to {output-root}/benchmark-summary.csv",
)
parser.addoption(
"--dry-run",
@@ -237,13 +241,13 @@ def fxt_version_tags(fxt_current_date: str, fxt_otx_ref: str) -> dict[str, str]:


@pytest.fixture(scope="session")
def fxt_summary_csv(request: pytest.FixtureRequest, fxt_output_root: Path) -> Path:
def fxt_summary_file(request: pytest.FixtureRequest, fxt_output_root: Path) -> Path:
"""Path to benchmark result summary csv file."""
summary_csv = request.config.getoption("--summary-csv")
summary_csv = fxt_output_root / "benchmark-summary.csv" if summary_csv is None else Path(summary_csv)
msg = f"{summary_csv = }"
summary_file = request.config.getoption("--summary-file")
summary_file = fxt_output_root / "benchmark-summary.csv" if summary_file is None else Path(summary_file)
msg = f"{summary_file = }"
log.info(msg)
return summary_csv
return summary_file


@pytest.fixture(scope="session")
@@ -292,7 +296,10 @@ def fxt_model(request: pytest.FixtureRequest, fxt_model_category) -> Benchmark.M
model: Benchmark.Model = request.param
if fxt_model_category == "all":
return model
if (fxt_model_category == "default" and model.category == "other") or fxt_model_category != model.category:
if fxt_model_category == "default":
if model.category == "other":
pytest.skip(f"{model.category} category model")
elif fxt_model_category != model.category:
pytest.skip(f"{model.category} category model")
return model
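The restructured condition above changes which models the "default" filter keeps: with the old single expression, any model whose category was not literally "default" fell into the `fxt_model_category != model.category` branch and was skipped even when the filter was "default". The snippet below contrasts the two predicates; only "other" appears in the surrounding code, and the remaining category names are assumptions used for illustration.

# Compare the old and new skip predicates for the model-category filter.
def old_skip(filter_cat: str, model_cat: str) -> bool:
    return (filter_cat == "default" and model_cat == "other") or filter_cat != model_cat


def new_skip(filter_cat: str, model_cat: str) -> bool:
    if filter_cat == "default":
        return model_cat == "other"
    return filter_cat != model_cat


for cat in ["speed", "balance", "accuracy", "other"]:  # names besides "other" are assumed
    print(cat, old_skip("default", cat), new_skip("default", cat))
# speed/balance/accuracy: old -> True (skipped), new -> False (kept)
# other:                  old -> True,           new -> True (still skipped)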

@@ -356,47 +363,49 @@ def fxt_benchmark(
@pytest.fixture(scope="session", autouse=True)
def fxt_benchmark_summary(
fxt_output_root: Path,
fxt_summary_csv: Path,
fxt_summary_file: Path,
fxt_mlflow_client: MlflowClient,
fxt_tags: dict[str, str],
):
"""Summarize all results at the end of test session."""
yield

raw_results = Benchmark.load_result(fxt_output_root)
if raw_results is None:
if raw_results is None or len(raw_results) == 0:
print("No benchmark results loaded in ", fxt_output_root)
return

summary_results = [
Benchmark.average_result(raw_results, ["task", "model", "data_group", "data"]),
Benchmark.average_result(raw_results, ["task", "model", "data_group"]),
Benchmark.average_result(raw_results, ["task", "model"]),
Benchmark.average_result(raw_results, ["task"]),
]
summary_results = pd.concat(summary_results)
summary_results = summary.summarize(raw_results)

print("=" * 20, "[Benchmark summary]")
print(summary_results)
fxt_summary_csv.parent.mkdir(parents=True, exist_ok=True)
summary_results.to_csv(fxt_summary_csv)
raw_results.to_csv(fxt_summary_csv.parent / "perf-benchmark-raw.csv")
print(f" -> Saved to {fxt_summary_csv}.")
fxt_summary_file.parent.mkdir(parents=True, exist_ok=True)
raw_results.to_csv(fxt_summary_file.parent / "perf-benchmark-raw.csv", index=False)
if fxt_summary_file.suffix == ".xlsx":
summary_results.to_excel(fxt_summary_file)
else:
if fxt_summary_file.suffix != ".csv":
print(f"{fxt_summary_file.suffix} output is not supported.")
fxt_summary_file = fxt_summary_file.with_suffix(".csv")
summary_results.to_csv(fxt_summary_file)
print(f" -> Saved to {fxt_summary_file}.")

if fxt_mlflow_client:
try:
_log_benchmark_results_to_mlflow(summary_results, fxt_mlflow_client, fxt_tags)
_log_benchmark_results_to_mlflow(raw_results, fxt_mlflow_client, fxt_tags)
except Exception as e:
print("MLFlow logging failed: ", e)


def _log_benchmark_results_to_mlflow(results: pd.DataFrame, client: MlflowClient, tags: dict[str, str]) -> None:
results = summary.average(results, keys=["task", "model", "data_group", "data"]) # Average out seeds
results = results.set_index(["task", "data_group", "data"])
for index, result in results.iterrows():
task, model, data_group, data = index
exp_name = f"[Benchmark] {task} | {model} | {data_group} | {data}"
task, data_group, data = index
model = result["model"]
exp_name = f"[Benchmark] {task} | {data_group} | {data}"
exp_tags = {
"task": task,
"model": model,
"data_group": data_group,
"data": data,
}
@@ -407,7 +416,7 @@ def _log_benchmark_results_to_mlflow(results: pd.DataFrame, client: MlflowClient
exp_id = exp.experiment_id
if exp.lifecycle_stage != "active":
client.restore_experiment(exp_id)
run_name = f"[{tags['date']} | {tags['user_name']} | {tags['otx_version']} | {tags['test_branch']} | {tags['test_commit']}"
run_name = f"[{model}] {tags['date']} | {tags['user_name']} | {tags['otx_version']} | {tags['test_branch']} | {tags['test_commit']}"
run_tags = {k: v for k, v in result.items() if isinstance(v, str)}
run_tags.update(**exp_tags, **tags)
run = client.create_run(exp_id, run_name=run_name, tags=run_tags)
@@ -419,7 +428,7 @@ def _log_benchmark_results_to_mlflow(results: pd.DataFrame, client: MlflowClient
@pytest.fixture(scope="session")
def fxt_benchmark_reference() -> pd.DataFrame | None:
"""Load reference benchmark results with index."""
ref = pd.read_csv(Path(__file__).parent.resolve() / "benchmark-reference.csv")
ref = summary.load(Path(__file__).parent.resolve() / "history/v1.5.2", need_normalize=True)
if ref is not None:
ref = ref.set_index(["task", "model", "data_group", "data"])
return ref
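The flat benchmark-reference.csv deleted earlier is replaced here by loading the v1.5.2 history folder through summary.load with need_normalize=True. That loader is also not shown in this excerpt; presumably it concatenates the CSV files in the folder and, when asked, renames legacy columns to the current schema before the fixture re-indexes the frame. A minimal sketch under those assumptions; the column mapping is entirely hypothetical.

# Hypothetical sketch of summary.load; the real helper ships in
# tests/perf/history/summary.py and its normalization rules are not shown here.
from pathlib import Path

import pandas as pd

# Example legacy -> current column mapping; the actual names are assumptions.
LEGACY_COLUMNS = {"avg_data_time": "train/iter_time", "val_score": "val/accuracy"}


def load(result_dir: Path, need_normalize: bool = False) -> pd.DataFrame | None:
    """Concatenate all CSVs under result_dir, optionally renaming legacy columns."""
    csv_files = sorted(Path(result_dir).glob("*.csv"))
    if not csv_files:
        return None
    data = pd.concat([pd.read_csv(path) for path in csv_files], ignore_index=True)
    if need_normalize:
        data = data.rename(columns=LEGACY_COLUMNS)
    return data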
4 changes: 4 additions & 0 deletions tests/perf/history/__init__.py
@@ -0,0 +1,4 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

"""OTX perfomance benchmark history."""