Support benchmark history summary v1 (#3307)
* Add history data, notebook and script

* Add perf summary workflow

* Update xlsx summary

* Use history as check ref

* Fix normalize
goodsong81 authored Apr 15, 2024
1 parent 479f86b commit 7d0b894
Showing 10 changed files with 9,318 additions and 204 deletions.
35 changes: 32 additions & 3 deletions .github/workflows/perf_benchmark.yaml
@@ -88,12 +88,13 @@ on:
Target OTX ref (tag / branch name / commit hash) on main repo to test. Defaults to the current branch.
`pip install otx[full]@https://github.com/openvinotoolkit/training_extensions.git@{otx_ref}` will be executed before run,
and reverted after run. Works only for v1.x assuming CLI compatibility.
default: __CURRENT_BRANCH_COMMIT__

# Declare default permissions as read only.
permissions: read-all

jobs:
Perf-Benchmark:
Perf-Benchmark-Run:
strategy:
fail-fast: false
matrix:
@@ -142,14 +143,42 @@ jobs:
--num-repeat ${{ inputs.num-repeat }}
--num-epoch ${{ inputs.num-epoch }}
--eval-upto ${{ inputs.eval-upto }}
--summary-csv .tox/perf-benchmark-summary.csv
--summary-file .tox/perf-benchmark-summary.xlsx
--mlflow-tracking-uri ${{ vars.MLFLOW_TRACKING_SERVER_URI }}
--user-name ${{ github.triggering_actor }}
--otx-ref ${{ inputs.otx-ref }}
- name: Upload test results
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3 # v4.3.1
with:
name: perf-benchmark-${{ matrix.task-short }}
path: .tox/perf-*.csv
path: .tox/perf-benchmark-*.*
# Use always() to always run this step to publish test results when there are test failures
if: ${{ always() }}

Perf-Benchmark-Summary:
needs: Perf-Benchmark-Run
runs-on: ubuntu-latest
steps:
- name: Download benchmark results
uses: actions/download-artifact@v4
with:
path: tests/perf/history/latest
- name: Checkout repository
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
- name: Install Python
uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0
with:
python-version: "3.10"
- name: Install dependencies
run: python -m pip install --upgrade pip pandas matplotlib nbconvert ipython ipykernel openpyxl
    - name: Summarize benchmark results
run: |
python tests/perf/history/summary.py tests/perf/history ./perf-benchmark-summary --pattern "*raw*.csv"
jupyter nbconvert --execute --to html --no-input tests/perf/history/summary.ipynb --output-dir ./perf-benchmark-summary --output perf-benchmark-summary
- name: Upload benchmark summary
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3 # v4.3.1
with:
name: perf-benchmark-summary
path: perf-benchmark-summary
# Use always() to always run this step to publish test results when there are test failures
if: ${{ always() }}
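
The summary job above calls `tests/perf/history/summary.py` on the downloaded artifacts and then renders `summary.ipynb` to HTML with nbconvert; neither file's content is included in this excerpt. A minimal sketch of what such a CLI could look like, assuming it merely globs the raw per-task CSVs, averages the numeric metrics, and writes CSV/XLSX outputs (the positional arguments and the `--pattern` flag follow the invocation above; everything else is illustrative):

```python
# Hypothetical sketch of tests/perf/history/summary.py -- not the actual implementation.
import argparse
from pathlib import Path

import pandas as pd


def main() -> None:
    parser = argparse.ArgumentParser(description="Aggregate raw benchmark CSVs into a summary.")
    parser.add_argument("input_root", type=Path, help="Directory containing raw benchmark CSV files")
    parser.add_argument("output_dir", type=Path, help="Directory to write the aggregated summary")
    parser.add_argument("--pattern", default="*raw*.csv", help="Glob pattern for raw result files")
    args = parser.parse_args()

    frames = [pd.read_csv(path) for path in sorted(args.input_root.rglob(args.pattern))]
    if not frames:
        print(f"No files matching {args.pattern} under {args.input_root}")
        return

    data = pd.concat(frames, ignore_index=True)
    # Average numeric metrics per task/model/dataset; non-numeric columns are dropped here.
    summary = data.groupby(["task", "model", "data_group", "data"]).mean(numeric_only=True)

    args.output_dir.mkdir(parents=True, exist_ok=True)
    summary.to_csv(args.output_dir / "perf-benchmark-summary.csv")
    summary.to_excel(args.output_dir / "perf-benchmark-summary.xlsx")  # needs openpyxl (added to dev.txt below)


if __name__ == "__main__":
    main()
```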
1 change: 1 addition & 0 deletions requirements/dev.txt
@@ -12,3 +12,4 @@ pytest-csv==3.0.*
tox==4.11.*
mlflow==2.10.2
py-cpuinfo==9.0.0
openpyxl==3.1.2
143 changes: 0 additions & 143 deletions tests/perf/benchmark-reference.csv

This file was deleted.

54 changes: 21 additions & 33 deletions tests/perf/benchmark.py
@@ -11,6 +11,7 @@
import subprocess # nosec B404
import yaml
from pathlib import Path
from .history import summary


class Benchmark:
@@ -107,6 +108,7 @@ def run(
subprocess.run(cmd, check=True)
# Load result
result = self.load_result(cfg_dir)
result = summary.average(result, ["task", "model", "data_group", "data"])
return result

@staticmethod
@@ -145,35 +147,7 @@ def load_result(result_path: str) -> pd.DataFrame | None:
if "train_e2e_time" in data:
data["train_e2e_time"] = pd.to_timedelta(data["train_e2e_time"]).dt.total_seconds() # H:M:S str -> seconds
data = data.rename(columns={"repeat": "seed"})
return data.set_index(["task", "model", "data_group", "data"])

@staticmethod
def average_result(data: pd.DataFrame, keys: list[str]) -> pd.DataFrame:
"""Average result w.r.t. given keys
Args:
result (pd.DataFrame): Result data frame
keys (list[str]): Keys to summarize whole data
Returns:
pd.DataFrame: Averaged result table
"""
# Flatten index
index_names = data.index.names
column_names = data.columns
data = data.reset_index()
# Average by keys
grouped = data.groupby(keys)
aggregated = grouped.mean(numeric_only=True)
# Merge index columns
idx_columns = set(index_names) - set(keys)
for col in idx_columns:
aggregated[col] = "all"
# Merge tag columns (non-numeric & non-index)
tag_columns = set(column_names) - set(aggregated.columns) - set(keys)
for col in tag_columns:
# Take common string prefix such as: ["data/1", "data/2", "data/3"] -> "data/"
aggregated[col] = grouped[col].agg(lambda x: os.path.commonprefix(x.tolist()))
# Recover index
return aggregated.reset_index().set_index(index_names)
return data
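
The averaging previously done by `Benchmark.average_result()` (removed above) now appears to come from the new `history.summary` module, judging by the `summary.average(result, [...])` call added to `run()`. A sketch that mirrors the removed logic as a standalone helper, purely for illustration and not the actual module contents:

```python
# Illustrative re-creation of the removed averaging logic; the real
# tests/perf/history/summary.py may differ.
import os

import pandas as pd


def average(data: pd.DataFrame, keys: list[str]) -> pd.DataFrame:
    """Average numeric metrics over the given keys."""
    grouped = data.groupby(keys)
    aggregated = grouped.mean(numeric_only=True)
    # Non-numeric, non-key columns such as ["data/1", "data/2"] collapse to their
    # common string prefix ("data/"), as in the removed Benchmark.average_result().
    tag_columns = set(data.columns) - set(aggregated.columns) - set(keys)
    for col in tag_columns:
        aggregated[col] = grouped[col].agg(lambda x: os.path.commonprefix(x.tolist()))
    return aggregated.reset_index()
```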

def _build_config(
self,
@@ -189,7 +163,7 @@

cfg = {}
cfg["tags"] = all_tags # metadata
cfg["output_path"] = os.path.abspath(self.output_root)
cfg["output_path"] = os.path.abspath(f"{self.output_root}/{model_id}")
cfg["constants"] = {
"dataroot": os.path.abspath(self.data_root),
}
@@ -246,30 +220,44 @@ def check(self, result: pd.DataFrame, criteria: list[dict]):
criteria (list[dict]): Criteria to check results
"""
if result is None:
print("[Check] No results loaded. Skipping result checking.")
return

if self.reference_results is None:
print("No benchmark references loaded. Skipping result checking.")
print("[Check] No benchmark references loaded. Skipping result checking.")
return

result = result.set_index(["task", "model", "data_group", "data"])

for key, result_entry in result.iterrows():
if key not in self.reference_results.index:
print(f"No benchmark reference for {key} loaded. Skipping result checking.")
print(f"[Check] No benchmark reference for {key} loaded. Skipping result checking.")
continue
target_entry = self.reference_results.loc[key]
if isinstance(target_entry, pd.DataFrame):
target_entry = target_entry.iloc[0] # 1-row pd.DataFrame to pd.Series
# Match num_repeat & seeds of result and target
result_seed_average = result_entry["seed"]
result_num_repeat = 2 * result_seed_average + 1 # (0+1+2+3+4)/5 = 2.0 -> 2*2.0+1 = 5
target_entry = target_entry.query(f"seed < {result_num_repeat}")
target_entry = target_entry.mean(numeric_only=True) # N-row pd.DataFrame to pd.Series

def compare(name: str, op: str, margin: float):
if name not in result_entry or result_entry[name] is None or np.isnan(result_entry[name]):
print(f"[Check] {name} not in result")
return
if name not in target_entry or target_entry[name] is None or np.isnan(target_entry[name]):
print(f"[Check] {name} not in target")
return
if op == "==":
print(
f"[Check] abs({name}:{result_entry[name]} - {name}:{target_entry[name]}) < {name}:{target_entry[name]} * {margin}",
)
assert abs(result_entry[name] - target_entry[name]) < target_entry[name] * margin
elif op == "<":
print(f"[Check] {name}:{result_entry[name]} < {name}:{target_entry[name]} * (1.0 + {margin})")
assert result_entry[name] < target_entry[name] * (1.0 + margin)
elif op == ">":
print(f"[Check] {name}:{result_entry[name]} > {name}:{target_entry[name]} * (1.0 - {margin})")
assert result_entry[name] > target_entry[name] * (1.0 - margin)

for criterion in criteria:
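For context, `compare()` above takes a metric name, an operator, and a relative margin, and the loop that follows feeds it entries from `criteria`. The exact schema of those entries is not visible in this excerpt; an illustrative example, assuming each criterion is a dict with `name`, `op`, and `margin` keys (hypothetical field and metric names, apart from `train_e2e_time` which appears in `load_result()` above):

```python
# Hypothetical criteria entries; field names are assumed from the
# compare(name, op, margin) signature and may not match the real configuration.
criteria = [
    {"name": "avg_score", "op": ">", "margin": 0.05},       # score may drop at most 5% vs. reference
    {"name": "train_e2e_time", "op": "<", "margin": 0.10},  # end-to-end training time may grow at most 10%
    {"name": "avg_iter_time", "op": "==", "margin": 0.20},  # per-iteration time must stay within +/-20%
]

for criterion in criteria:
    compare(criterion["name"], criterion["op"], criterion["margin"])
```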
54 changes: 29 additions & 25 deletions tests/perf/conftest.py
@@ -20,6 +20,7 @@
import yaml

from .benchmark import Benchmark
from .history import summary


def pytest_addoption(parser):
@@ -72,9 +73,9 @@ def pytest_addoption(parser):
help="Output root directory. Defaults to temp directory.",
)
parser.addoption(
"--summary-csv",
"--summary-file",
action="store",
help="Path to output summary cvs file. Defaults to {output-root}/benchmark-summary.csv",
help="Path to output summary file. Defaults to {output-root}/benchmark-summary.csv",
)
parser.addoption(
"--dry-run",
@@ -248,12 +249,15 @@ def fxt_benchmark(


def _log_benchmark_results_to_mlflow(results: pd.DataFrame, tags: dict[str, str], client: MlflowClient):
results = summary.normalize(results) # Standardize names for comparison
results = summary.average(results, keys=["task", "model", "data_group", "data"]) # Average out seeds
results = results.set_index(["task", "data_group", "data"])
for index, result in results.iterrows():
task, model, data_group, data = index
exp_name = f"[Benchmark] {task} | {model} | {data_group} | {data}"
task, data_group, data = index
model = result["model"]
exp_name = f"[Benchmark] {task} | {data_group} | {data}"
exp_tags = {
"task": task,
"model": model,
"data_group": data_group,
"data": data,
}
@@ -264,7 +268,7 @@ def _log_benchmark_results_to_mlflow(results: pd.DataFrame, tags: dict[str, str]
exp_id = exp.experiment_id
if exp.lifecycle_stage != "active":
client.restore_experiment(exp_id)
run_name = f"[{tags['date']} | {tags['user_name']} | {tags['otx_version']} | {tags['test_branch']} | {tags['test_commit']}"
run_name = f"[{model}] {tags['date']} | {tags['user_name']} | {tags['otx_version']} | {tags['test_branch']} | {tags['test_commit']}"
run_tags = {k: v for k, v in result.items() if isinstance(v, str)}
run_tags.update(**exp_tags, **tags)
run = client.create_run(exp_id, run_name=run_name, tags=run_tags)
@@ -282,32 +286,32 @@ def fxt_benchmark_summary(
):
"""Summarize all results at the end of test session."""
yield

raw_results = Benchmark.load_result(fxt_output_root)
if raw_results is None:
if raw_results is None or len(raw_results) == 0:
print("No benchmark results loaded in ", fxt_output_root)
return

print("=" * 20, "[Benchmark summary]")
summary_results = [
Benchmark.average_result(raw_results, ["task", "model", "data_group", "data"]),
Benchmark.average_result(raw_results, ["task", "model", "data_group"]),
Benchmark.average_result(raw_results, ["task", "model"]),
Benchmark.average_result(raw_results, ["task"]),
]
summary_results = pd.concat(summary_results)
summary_results = summary.summarize(raw_results)

print("=" * 20, "[Benchmark summary]")
print(summary_results)

summary_csv = request.config.getoption("--summary-csv")
if not summary_csv:
summary_csv = fxt_output_root / "perf-benchmark-summary.csv"
summary_file = request.config.getoption("--summary-file")
if not summary_file:
summary_file = fxt_output_root / "perf-benchmark-summary.csv"
else:
summary_file = Path(summary_file)
summary_file.parent.mkdir(parents=True, exist_ok=True)
raw_results.to_csv(summary_file.parent / "perf-benchmark-raw.csv", index=False)
if summary_file.suffix == ".xlsx":
summary_results.to_excel(summary_file)
else:
summary_csv = Path(summary_csv)
summary_csv.parent.mkdir(parents=True, exist_ok=True)
summary_results.to_csv(summary_csv)
raw_results.to_csv(summary_csv.parent / "perf-benchmark-raw.csv")
print(f" -> Saved to {summary_csv}.")
if summary_file.suffix != ".csv":
print(f"{summary_file.suffix} output is not supported.")
summary_file = summary_file.with_suffix(".csv")
summary_results.to_csv(summary_file)
print(f" -> Saved to {summary_file}.")

if fxt_mlflow_client is None:
print(
Expand All @@ -320,7 +324,7 @@ def fxt_benchmark_summary(
# test_branch = fxt_tags["test_branch"]
# if test_branch == "develop" or bool(re.match("^releases/[0-9]+\.[0-9]+\.[0-9]+$", test_branch)):
try:
_log_benchmark_results_to_mlflow(summary_results, fxt_tags, fxt_mlflow_client)
_log_benchmark_results_to_mlflow(raw_results, fxt_tags, fxt_mlflow_client)
except Exception as e:
print("MLFlow loging failed: ", e)

@@ -331,7 +335,7 @@ def fxt_benchmark_summary(
@pytest.fixture(scope="session")
def fxt_benchmark_reference() -> pd.DataFrame | None:
"""Load reference benchmark results with index."""
ref = pd.read_csv(Path(__file__).parent.resolve() / "benchmark-reference.csv")
ref = summary.load(Path(__file__).parent.resolve() / "history/v1.5.2")
if ref is not None:
ref = ref.set_index(["task", "model", "data_group", "data"])
return ref
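
The reference fixture now loads a whole history directory (`history/v1.5.2`) via `summary.load()` instead of the single `benchmark-reference.csv` that this commit deletes. A minimal sketch of what such a loader might look like, assuming it simply concatenates every CSV found under the directory and returns `None` when nothing is there (illustrative, not the actual implementation):

```python
# Hypothetical sketch of a load() helper; the real tests/perf/history/summary.py may differ.
from __future__ import annotations

from pathlib import Path

import pandas as pd


def load(result_path: Path, pattern: str = "*.csv") -> pd.DataFrame | None:
    """Concatenate all CSV files under a history directory, or return None if none exist."""
    csv_files = sorted(Path(result_path).rglob(pattern))
    if not csv_files:
        return None
    return pd.concat([pd.read_csv(path) for path in csv_files], ignore_index=True)
```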
4 changes: 4 additions & 0 deletions tests/perf/history/__init__.py
@@ -0,0 +1,4 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

"""OTX perfomance benchmark history."""
