Support benchmark history summary v2 (#3298)
* Add perf benchmark history data

* Add summary notebook

* Support optional excel output

* Update mlflow logging

* Check with the right seed average
goodsong81 authored Apr 15, 2024
1 parent 3dea944 commit 98bb7f3
Showing 11 changed files with 9,449 additions and 216 deletions.
34 changes: 31 additions & 3 deletions .github/workflows/perf_benchmark.yaml
@@ -97,7 +97,7 @@ on:
permissions: read-all

jobs:
Perf-Benchmark:
Perf-Benchmark-Run:
strategy:
fail-fast: false
matrix:
@@ -141,14 +141,42 @@ jobs:
--num-repeat ${{ inputs.num-repeat }}
--num-epoch ${{ inputs.num-epoch }}
--eval-upto ${{ inputs.eval-upto }}
--summary-csv .tox/perf-benchmark-summary.csv
--summary-file .tox/perf-benchmark-summary.xlsx
--mlflow-tracking-uri ${{ vars.MLFLOW_TRACKING_SERVER_URI }}
--user-name ${{ github.triggering_actor }}
--otx-ref ${{ inputs.otx-ref }}
- name: Upload test results
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3 # v4.3.1
with:
name: perf-benchmark-${{ matrix.task-short }}
path: .tox/perf-*.csv
path: .tox/perf-benchmark-*.*
# Use always() to always run this step to publish test results when there are test failures
if: ${{ always() }}

Perf-Benchmark-Summary:
needs: Perf-Benchmark-Run
runs-on: ubuntu-latest
steps:
- name: Download benchmark results
uses: actions/download-artifact@v4
with:
path: tests/perf/history/latest
- name: Checkout repository
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
- name: Install Python
uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0
with:
python-version: "3.10"
- name: Install dependencies
run: python -m pip install --upgrade pip pandas matplotlib nbconvert ipython ipykernel openpyxl
- name: Summarize benchmark results
run: |
python tests/perf/history/summary.py tests/perf/history ./perf-benchmark-summary --pattern "*raw*.csv" --normalize
jupyter nbconvert --execute --to html --no-input tests/perf/history/summary.ipynb --output-dir ./perf-benchmark-summary --output perf-benchmark-summary
- name: Upload benchmark summary
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3 # v4.3.1
with:
name: perf-benchmark-summary
path: perf-benchmark-summary
# Use always() to always run this step to publish test results when there are test failures
if: ${{ always() }}
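The new Perf-Benchmark-Summary job above downloads every per-task artifact into tests/perf/history/latest and aggregates it through tests/perf/history/summary.py, which is added by this commit but not visible in this view. Only its command line is shown, so the following is a minimal sketch of what such an entry point might look like; the load_all helper and the behavior of --normalize are assumptions inferred from the invocation and from how conftest.py calls summary helpers further down.

# Hypothetical sketch of a summary.py entry point matching the workflow call:
#   python tests/perf/history/summary.py <input_dir> <output_dir> --pattern "*raw*.csv" --normalize
# The real script is part of this commit but is not shown in this excerpt.
import argparse
from pathlib import Path

import pandas as pd


def load_all(input_dir: Path, pattern: str) -> pd.DataFrame:
    """Concatenate every CSV under input_dir whose name matches the glob pattern."""
    frames = [pd.read_csv(path) for path in input_dir.rglob(pattern)]
    return pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Aggregate raw perf-benchmark CSVs.")
    parser.add_argument("input_dir", type=Path)
    parser.add_argument("output_dir", type=Path)
    parser.add_argument("--pattern", default="*raw*.csv")
    parser.add_argument("--normalize", action="store_true",
                        help="Map legacy column names onto the current schema (assumed behavior).")
    args = parser.parse_args()

    raw = load_all(args.input_dir, args.pattern)
    # The actual --normalize handling (legacy -> current column names) is omitted in this sketch.
    args.output_dir.mkdir(parents=True, exist_ok=True)
    # Average out seeds per task/model/dataset and write one combined table.
    averaged = raw.groupby(["task", "model", "data_group", "data"]).mean(numeric_only=True)
    averaged.reset_index().to_csv(args.output_dir / "perf-benchmark-all.csv", index=False)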
5 changes: 3 additions & 2 deletions pyproject.toml
@@ -50,8 +50,9 @@ dev = [
"pytest-mock",
"pytest-csv",
"pytest-cov",
"mlflow==2.11.1", # For regression test
"py-cpuinfo==9.0.0", # For regression test
"mlflow==2.11.1", # For perf benchmark
"py-cpuinfo==9.0.0", # For perf benchmark
"openpyxl", # For perf benchmark
]
docs = [
"furo",
2 changes: 1 addition & 1 deletion tests/perf/__init__.py
@@ -1,4 +1,4 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

"""OTX perfomance benchamrk tests."""
"""OTX perfomance benchmark tests."""
143 changes: 0 additions & 143 deletions tests/perf/benchmark-reference.csv

This file was deleted.

65 changes: 26 additions & 39 deletions tests/perf/benchmark.py
@@ -7,7 +7,6 @@

import gc
import logging
import os
import subprocess
from dataclasses import dataclass
from pathlib import Path
@@ -17,6 +16,8 @@
import numpy as np
import pandas as pd

from .history import summary

log = logging.getLogger(__name__)


@@ -71,14 +72,25 @@ class Criterion:
def __call__(self, result_entry: pd.Series, target_entry: pd.Series) -> None:
"""Check result against given target."""
if self.name not in result_entry or result_entry[self.name] is None or np.isnan(result_entry[self.name]):
print(f"[Check] {self.name} not in result")
return
if self.name not in target_entry or target_entry[self.name] is None or np.isnan(target_entry[self.name]):
print(f"[Check] {self.name} not in target")
return
if self.compare == "==":
print(
f"[Check] abs({self.name}:{result_entry[self.name]} - {self.name}:{target_entry[self.name]}) < {self.name}:{target_entry[self.name]} * {self.margin}",
)
assert abs(result_entry[self.name] - target_entry[self.name]) < target_entry[self.name] * self.margin
elif self.compare == "<":
print(
f"[Check] {self.name}:{result_entry[self.name]} < {self.name}:{target_entry[self.name]} * (1.0 + {self.margin})",
)
assert result_entry[self.name] < target_entry[self.name] * (1.0 + self.margin)
elif self.compare == ">":
print(
f"[Check] {self.name}:{result_entry[self.name]} > {self.name}:{target_entry[self.name]} * (1.0 - {self.margin})",
)
assert result_entry[self.name] > target_entry[self.name] * (1.0 - self.margin)
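As a quick illustration of the margin logic in the new log messages above: each criterion compares one metric of the result row against the same metric of the reference row with a relative tolerance. The Criterion constructor is truncated in this view, so the snippet below re-implements only the three comparisons on plain pandas Series; the metric names and numbers are made up.

# Standalone illustration of the three margin checks; values are illustrative only.
import pandas as pd

result = pd.Series({"epoch": 9.5, "train/e2e_time": 105.0, "val/accuracy": 0.81})
target = pd.Series({"epoch": 10.0, "train/e2e_time": 100.0, "val/accuracy": 0.80})
margin = 0.1

# "==": the result must stay within +/- 10% of the reference value.
assert abs(result["epoch"] - target["epoch"]) < target["epoch"] * margin
# "<": the result must not exceed the reference by more than 10% (e.g. training time).
assert result["train/e2e_time"] < target["train/e2e_time"] * (1.0 + margin)
# ">": the result must not drop more than 10% below the reference (e.g. accuracy).
assert result["val/accuracy"] > target["val/accuracy"] * (1.0 - margin)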

def __init__(
@@ -279,7 +291,10 @@ def run(
gc.collect()

result = self.load_result(work_dir)
return self.average_result(result, keys=["task", "model", "data_group", "data"])
if result is None:
return None
result = summary.average(result, keys=["task", "model", "data_group", "data"]) # Average out seeds
return result.set_index(["task", "model", "data_group", "data"])

def _run_command(self, command: list[str]) -> None:
print(" ".join(command))
@@ -370,40 +385,7 @@ def load_result(result_path: Path) -> pd.DataFrame | None:
if len(results) == 0:
return None

return pd.concat(results, ignore_index=True).set_index(["task", "model", "data_group", "data"])

@staticmethod
def average_result(data: pd.DataFrame, keys: list[str]) -> pd.DataFrame | None:
"""Average result w.r.t. given keys
Args:
result (pd.DataFrame): Result data frame
keys (list[str]): Keys to summarize whole data
Retruns:
pd.DataFrame: Averaged result table
"""
if data is None:
return None

# Flatten index
index_names = data.index.names
column_names = data.columns
data = data.reset_index()
# Average by keys
grouped = data.groupby(keys)
aggregated = grouped.mean(numeric_only=True)
# Merge index columns
idx_columns = set(index_names) - set(keys)
for col in idx_columns:
aggregated[col] = "all"
# Merge tag columns (non-numeric & non-index)
tag_columns = set(column_names) - set(aggregated.columns) - set(keys)
for col in tag_columns:
# Take common string prefix such as: ["data/1", "data/2", "data/3"] -> "data/"
aggregated[col] = grouped[col].agg(lambda x: os.path.commonprefix(x.tolist()))
# Recover index
return aggregated.reset_index().set_index(index_names)
return pd.concat(results, ignore_index=True)
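The Benchmark.average_result helper deleted above has effectively moved into the new tests/perf/history/summary.py module (imported at the top of this file as `from .history import summary`). That module is not visible in this excerpt; judging from the removed code and from the new call sites, which pass a flat frame and re-index it afterwards, its average helper presumably looks roughly like this sketch:

# Rough sketch of what summary.average likely does, inferred from the removed
# Benchmark.average_result above; the real implementation in
# tests/perf/history/summary.py may differ.
import os

import pandas as pd


def average(data: pd.DataFrame, keys: list[str]) -> pd.DataFrame:
    """Average numeric columns w.r.t. the given keys, merging tag columns by common prefix."""
    grouped = data.groupby(keys)
    aggregated = grouped.mean(numeric_only=True)
    # Non-numeric, non-key columns are reduced to their common string prefix,
    # e.g. ["data/1", "data/2", "data/3"] -> "data/".
    tag_columns = set(data.columns) - set(aggregated.columns) - set(keys)
    for col in tag_columns:
        aggregated[col] = grouped[col].agg(lambda x: os.path.commonprefix(x.tolist()))
    return aggregated.reset_index()

The run() change above then re-indexes the averaged frame by task/model/data_group/data, which is the key check() uses to look up reference entries.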

def check(self, result: pd.DataFrame, criteria: list[Criterion]):
"""Check result w.r.t. reference data.
@@ -413,19 +395,24 @@ def check(self, result: pd.DataFrame, criteria: list[Criterion]):
criteria (list[Criterion]): Criteria to check results
"""
if result is None:
print("[Check] No results loaded. Skipping result checking.")
return

if self.reference_results is None:
print("No benchmark references loaded. Skipping result checking.")
print("[Check] No benchmark references loaded. Skipping result checking.")
return

for key, result_entry in result.iterrows():
if key not in self.reference_results.index:
print(f"No benchmark reference for {key} loaded. Skipping result checking.")
print(f"[Check] No benchmark reference for {key} loaded. Skipping result checking.")
continue
target_entry = self.reference_results.loc[key]
if isinstance(target_entry, pd.DataFrame):
target_entry = target_entry.iloc[0] # 1-row pd.DataFrame to pd.Series
# Match num_repeat of result and target
result_seed_average = result_entry["seed"]
result_num_repeat = 2 * result_seed_average + 1 # (0+1+2+3+4)/5 = 2.0 -> 2*2.0+1 = 5
target_entry = target_entry.query(f"seed < {result_num_repeat}")
target_entry = target_entry.mean(numeric_only=True) # N-row pd.DataFrame to pd.Series

for criterion in criteria:
criterion(result_entry, target_entry)
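The seed-matching step above relies on seeds being consecutive integers starting at 0: after averaging, the seed column holds (0 + 1 + ... + (N-1)) / N = (N-1)/2, so the original repeat count is recovered as N = 2 * mean + 1 and the multi-seed reference is trimmed to the same number of repeats before being averaged. A tiny worked example with made-up numbers:

# Recovering num_repeat from an averaged seed column and matching the reference
# to the same repeats; the metric values here are illustrative only.
import pandas as pd

seeds = [0, 1, 2]                        # the result was averaged over 3 repeats
seed_average = sum(seeds) / len(seeds)   # (0 + 1 + 2) / 3 = 1.0
num_repeat = 2 * seed_average + 1        # 2 * 1.0 + 1 = 3

reference = pd.DataFrame(
    {"seed": [0, 1, 2, 3, 4], "val/accuracy": [0.80, 0.82, 0.78, 0.81, 0.79]},
)
matched = reference.query(f"seed < {num_repeat}")   # keep only seeds 0, 1, 2
print(matched.mean(numeric_only=True))              # reference averaged over the same repeats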
65 changes: 37 additions & 28 deletions tests/perf/conftest.py
@@ -9,14 +9,18 @@
import subprocess
from datetime import datetime, timedelta, timezone
from pathlib import Path
from typing import TYPE_CHECKING
from urllib.parse import urlparse

import pandas as pd
import pytest
from cpuinfo import get_cpu_info
from mlflow.client import MlflowClient

from .benchmark import Benchmark
from .history import summary

if TYPE_CHECKING:
import pandas as pd

log = logging.getLogger(__name__)

@@ -71,9 +75,9 @@ def pytest_addoption(parser):
help="Output root directory. Defaults to temp directory.",
)
parser.addoption(
"--summary-csv",
"--summary-file",
action="store",
help="Path to output summary cvs file. Defaults to {output-root}/benchmark-summary.csv",
help="Path to output summary file. Defaults to {output-root}/benchmark-summary.csv",
)
parser.addoption(
"--dry-run",
@@ -237,13 +241,13 @@ def fxt_version_tags(fxt_current_date: str, fxt_otx_ref: str) -> dict[str, str]:


@pytest.fixture(scope="session")
def fxt_summary_csv(request: pytest.FixtureRequest, fxt_output_root: Path) -> Path:
def fxt_summary_file(request: pytest.FixtureRequest, fxt_output_root: Path) -> Path:
"""Path to benchmark result summary csv file."""
summary_csv = request.config.getoption("--summary-csv")
summary_csv = fxt_output_root / "benchmark-summary.csv" if summary_csv is None else Path(summary_csv)
msg = f"{summary_csv = }"
summary_file = request.config.getoption("--summary-file")
summary_file = fxt_output_root / "benchmark-summary.csv" if summary_file is None else Path(summary_file)
msg = f"{summary_file = }"
log.info(msg)
return summary_csv
return summary_file


@pytest.fixture(scope="session")
@@ -292,7 +296,10 @@ def fxt_model(request: pytest.FixtureRequest, fxt_model_category) -> Benchmark.M
model: Benchmark.Model = request.param
if fxt_model_category == "all":
return model
if (fxt_model_category == "default" and model.category == "other") or fxt_model_category != model.category:
if fxt_model_category == "default":
if model.category == "other":
pytest.skip(f"{model.category} category model")
elif fxt_model_category != model.category:
pytest.skip(f"{model.category} category model")
return model
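The restructured condition above changes which models the "default" filter keeps: with the old single expression, any model whose category was not literally "default" fell into the `fxt_model_category != model.category` branch and was skipped even when the filter was "default". The snippet below contrasts the two predicates; only "other" appears in the surrounding code, and the remaining category names are assumptions used for illustration.

# Compare the old and new skip predicates for the model-category filter.
def old_skip(filter_cat: str, model_cat: str) -> bool:
    return (filter_cat == "default" and model_cat == "other") or filter_cat != model_cat


def new_skip(filter_cat: str, model_cat: str) -> bool:
    if filter_cat == "default":
        return model_cat == "other"
    return filter_cat != model_cat


for cat in ["speed", "balance", "accuracy", "other"]:  # names besides "other" are assumed
    print(cat, old_skip("default", cat), new_skip("default", cat))
# speed/balance/accuracy: old -> True (skipped), new -> False (kept)
# other:                  old -> True,           new -> True (still skipped)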

@@ -356,47 +363,49 @@ def fxt_benchmark(
@pytest.fixture(scope="session", autouse=True)
def fxt_benchmark_summary(
fxt_output_root: Path,
fxt_summary_csv: Path,
fxt_summary_file: Path,
fxt_mlflow_client: MlflowClient,
fxt_tags: dict[str, str],
):
"""Summarize all results at the end of test session."""
yield

raw_results = Benchmark.load_result(fxt_output_root)
if raw_results is None:
if raw_results is None or len(raw_results) == 0:
print("No benchmark results loaded in ", fxt_output_root)
return

summary_results = [
Benchmark.average_result(raw_results, ["task", "model", "data_group", "data"]),
Benchmark.average_result(raw_results, ["task", "model", "data_group"]),
Benchmark.average_result(raw_results, ["task", "model"]),
Benchmark.average_result(raw_results, ["task"]),
]
summary_results = pd.concat(summary_results)
summary_results = summary.summarize(raw_results)

print("=" * 20, "[Benchmark summary]")
print(summary_results)
fxt_summary_csv.parent.mkdir(parents=True, exist_ok=True)
summary_results.to_csv(fxt_summary_csv)
raw_results.to_csv(fxt_summary_csv.parent / "perf-benchmark-raw.csv")
print(f" -> Saved to {fxt_summary_csv}.")
fxt_summary_file.parent.mkdir(parents=True, exist_ok=True)
raw_results.to_csv(fxt_summary_file.parent / "perf-benchmark-raw.csv", index=False)
if fxt_summary_file.suffix == ".xlsx":
summary_results.to_excel(fxt_summary_file)
else:
if fxt_summary_file.suffix != ".csv":
print(f"{fxt_summary_file.suffix} output is not supported.")
fxt_summary_file = fxt_summary_file.with_suffix(".csv")
summary_results.to_csv(fxt_summary_file)
print(f" -> Saved to {fxt_summary_file}.")

if fxt_mlflow_client:
try:
_log_benchmark_results_to_mlflow(summary_results, fxt_mlflow_client, fxt_tags)
_log_benchmark_results_to_mlflow(raw_results, fxt_mlflow_client, fxt_tags)
except Exception as e:
print("MLFlow logging failed: ", e)


def _log_benchmark_results_to_mlflow(results: pd.DataFrame, client: MlflowClient, tags: dict[str, str]) -> None:
results = summary.average(results, keys=["task", "model", "data_group", "data"]) # Average out seeds
results = results.set_index(["task", "data_group", "data"])
for index, result in results.iterrows():
task, model, data_group, data = index
exp_name = f"[Benchmark] {task} | {model} | {data_group} | {data}"
task, data_group, data = index
model = result["model"]
exp_name = f"[Benchmark] {task} | {data_group} | {data}"
exp_tags = {
"task": task,
"model": model,
"data_group": data_group,
"data": data,
}
@@ -407,7 +416,7 @@ def _log_benchmark_results_to_mlflow(results: pd.DataFrame, client: MlflowClient
exp_id = exp.experiment_id
if exp.lifecycle_stage != "active":
client.restore_experiment(exp_id)
run_name = f"[{tags['date']} | {tags['user_name']} | {tags['otx_version']} | {tags['test_branch']} | {tags['test_commit']}"
run_name = f"[{model}] {tags['date']} | {tags['user_name']} | {tags['otx_version']} | {tags['test_branch']} | {tags['test_commit']}"
run_tags = {k: v for k, v in result.items() if isinstance(v, str)}
run_tags.update(**exp_tags, **tags)
run = client.create_run(exp_id, run_name=run_name, tags=run_tags)
@@ -419,7 +428,7 @@ def _log_benchmark_results_to_mlflow(results: pd.DataFrame, client: MlflowClient
@pytest.fixture(scope="session")
def fxt_benchmark_reference() -> pd.DataFrame | None:
"""Load reference benchmark results with index."""
ref = pd.read_csv(Path(__file__).parent.resolve() / "benchmark-reference.csv")
ref = summary.load(Path(__file__).parent.resolve() / "history/v1.5.2", need_normalize=True)
if ref is not None:
ref = ref.set_index(["task", "model", "data_group", "data"])
return ref
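The flat benchmark-reference.csv deleted earlier is replaced here by loading the v1.5.2 history folder through summary.load with need_normalize=True. That loader is also not shown in this excerpt; presumably it concatenates the CSV files in the folder and, when asked, renames legacy columns to the current schema before the fixture re-indexes the frame. A minimal sketch under those assumptions; the column mapping is entirely hypothetical.

# Hypothetical sketch of summary.load; the real helper ships in
# tests/perf/history/summary.py and its normalization rules are not shown here.
from pathlib import Path

import pandas as pd

# Example legacy -> current column mapping; the actual names are assumptions.
LEGACY_COLUMNS = {"avg_data_time": "train/iter_time", "val_score": "val/accuracy"}


def load(result_dir: Path, need_normalize: bool = False) -> pd.DataFrame | None:
    """Concatenate all CSVs under result_dir, optionally renaming legacy columns."""
    csv_files = sorted(Path(result_dir).glob("*.csv"))
    if not csv_files:
        return None
    data = pd.concat([pd.read_csv(path) for path in csv_files], ignore_index=True)
    if need_normalize:
        data = data.rename(columns=LEGACY_COLUMNS)
    return data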
4 changes: 4 additions & 0 deletions tests/perf/history/__init__.py
@@ -0,0 +1,4 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

"""OTX perfomance benchmark history."""