Support benchmark history summary v2 #3298

Merged: 24 commits, Apr 15, 2024
34 changes: 31 additions & 3 deletions .github/workflows/perf_benchmark.yaml
@@ -97,7 +97,7 @@ on:
permissions: read-all

jobs:
Perf-Benchmark:
Perf-Benchmark-Run:
strategy:
fail-fast: false
matrix:
@@ -141,14 +141,42 @@ jobs:
--num-repeat ${{ inputs.num-repeat }}
--num-epoch ${{ inputs.num-epoch }}
--eval-upto ${{ inputs.eval-upto }}
--summary-csv .tox/perf-benchmark-summary.csv
--summary-file .tox/perf-benchmark-summary.xlsx
--mlflow-tracking-uri ${{ vars.MLFLOW_TRACKING_SERVER_URI }}
--user-name ${{ github.triggering_actor }}
--otx-ref ${{ inputs.otx-ref }}
- name: Upload test results
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3 # v4.3.1
with:
name: perf-benchmark-${{ matrix.task-short }}
path: .tox/perf-*.csv
path: .tox/perf-benchmark-*.*
# Use always() to always run this step to publish test results when there are test failures
if: ${{ always() }}

Perf-Benchmark-Summary:
needs: Perf-Benchmark-Run
runs-on: ubuntu-latest
steps:
- name: Download benchmark results
uses: actions/download-artifact@v4
with:
path: tests/perf/history/latest
- name: Checkout repository
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
- name: Install Python
uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0
with:
python-version: "3.10"
- name: Install dependencies
run: python -m pip install --upgrade pip pandas matplotlib nbconvert ipython ipykernel openpyxl
- name: Summarize benchmark results
run: |
python tests/perf/history/summary.py tests/perf/history ./perf-benchmark-summary --pattern "*raw*.csv" --normalize
jupyter nbconvert --execute --to html --no-input tests/perf/history/summary.ipynb --output-dir ./perf-benchmark-summary --output perf-benchmark-summary
- name: Upload benchmark summary
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3 # v4.3.1
with:
name: perf-benchmark-summary
path: perf-benchmark-summary
# Use always() to always run this step to publish test results when there are test failures
if: ${{ always() }}
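
The new Perf-Benchmark-Summary job downloads every per-task artifact into tests/perf/history/latest and then runs tests/perf/history/summary.py over CSVs matching "*raw*.csv". The script itself is not part of this excerpt, so the following is only a minimal sketch of the collection step it implies, assuming each artifact folder holds a perf-benchmark-raw.csv with task/model columns; the real CLI options (--pattern, --normalize) and internals may differ.

```python
# Minimal sketch (not the actual summary.py): gather per-task raw CSVs the way
# the workflow's glob pattern suggests. Directory layout and column names are assumed.
from pathlib import Path

import pandas as pd


def collect_raw_results(root: str, pattern: str = "*raw*.csv") -> pd.DataFrame:
    """Concatenate every raw benchmark CSV found under `root` (recursively)."""
    files = sorted(Path(root).rglob(pattern))
    if not files:
        msg = f"No files matching {pattern!r} under {root}"
        raise FileNotFoundError(msg)
    return pd.concat((pd.read_csv(f) for f in files), ignore_index=True)


if __name__ == "__main__":
    raw = collect_raw_results("tests/perf/history/latest")
    print(raw.groupby(["task", "model"]).mean(numeric_only=True))
```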
5 changes: 3 additions & 2 deletions pyproject.toml
@@ -50,8 +50,9 @@ dev = [
"pytest-mock",
"pytest-csv",
"pytest-cov",
"mlflow==2.11.1", # For regression test
"py-cpuinfo==9.0.0", # For regression test
"mlflow==2.11.1", # For perf benchmark
"py-cpuinfo==9.0.0", # For perf benchmark
"openpyxl", # For perf benchmark
]
docs = [
"furo",
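
pyproject.toml adds openpyxl because the summary fixture below can now write the report as .xlsx through pandas. A small hedged check, with an illustrative column name and file path (the real path comes from --summary-file):

```python
# Illustrative only: confirms pandas can round-trip .xlsx once openpyxl is installed.
import pandas as pd

df = pd.DataFrame({"task": ["classification"], "train/e2e_time": [123.4]})
df.to_excel("perf-benchmark-summary.xlsx")  # pandas uses the openpyxl engine for .xlsx
print(pd.read_excel("perf-benchmark-summary.xlsx", index_col=0))
```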
2 changes: 1 addition & 1 deletion tests/perf/__init__.py
@@ -1,4 +1,4 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

"""OTX perfomance benchamrk tests."""
"""OTX perfomance benchmark tests."""
143 changes: 0 additions & 143 deletions tests/perf/benchmark-reference.csv

This file was deleted.

65 changes: 26 additions & 39 deletions tests/perf/benchmark.py
@@ -7,7 +7,6 @@

import gc
import logging
import os
import subprocess
from dataclasses import dataclass
from pathlib import Path
@@ -17,6 +16,8 @@
import numpy as np
import pandas as pd

from .history import summary

log = logging.getLogger(__name__)


@@ -71,14 +72,25 @@ class Criterion:
def __call__(self, result_entry: pd.Series, target_entry: pd.Series) -> None:
"""Check result against given target."""
if self.name not in result_entry or result_entry[self.name] is None or np.isnan(result_entry[self.name]):
print(f"[Check] {self.name} not in result")
return
if self.name not in target_entry or target_entry[self.name] is None or np.isnan(target_entry[self.name]):
print(f"[Check] {self.name} not in target")
return
if self.compare == "==":
print(
f"[Check] abs({self.name}:{result_entry[self.name]} - {self.name}:{target_entry[self.name]}) < {self.name}:{target_entry[self.name]} * {self.margin}",
)
assert abs(result_entry[self.name] - target_entry[self.name]) < target_entry[self.name] * self.margin
elif self.compare == "<":
print(
f"[Check] {self.name}:{result_entry[self.name]} < {self.name}:{target_entry[self.name]} * (1.0 + {self.margin})",
)
assert result_entry[self.name] < target_entry[self.name] * (1.0 + self.margin)
elif self.compare == ">":
print(
f"[Check] {self.name}:{result_entry[self.name]} > {self.name}:{target_entry[self.name]} * (1.0 - {self.margin})",
)
assert result_entry[self.name] > target_entry[self.name] * (1.0 - self.margin)

def __init__(
@@ -279,7 +291,10 @@ def run(
gc.collect()

result = self.load_result(work_dir)
return self.average_result(result, keys=["task", "model", "data_group", "data"])
if result is None:
return None
result = summary.average(result, keys=["task", "model", "data_group", "data"]) # Average out seeds
return result.set_index(["task", "model", "data_group", "data"])

def _run_command(self, command: list[str]) -> None:
print(" ".join(command))
@@ -370,40 +385,7 @@ def load_result(result_path: Path) -> pd.DataFrame | None:
if len(results) == 0:
return None

return pd.concat(results, ignore_index=True).set_index(["task", "model", "data_group", "data"])

@staticmethod
def average_result(data: pd.DataFrame, keys: list[str]) -> pd.DataFrame | None:
"""Average result w.r.t. given keys

Args:
result (pd.DataFrame): Result data frame
keys (list[str]): Keys to summarize whole data

Retruns:
pd.DataFrame: Averaged result table
"""
if data is None:
return None

# Flatten index
index_names = data.index.names
column_names = data.columns
data = data.reset_index()
# Average by keys
grouped = data.groupby(keys)
aggregated = grouped.mean(numeric_only=True)
# Merge index columns
idx_columns = set(index_names) - set(keys)
for col in idx_columns:
aggregated[col] = "all"
# Merge tag columns (non-numeric & non-index)
tag_columns = set(column_names) - set(aggregated.columns) - set(keys)
for col in tag_columns:
# Take common string prefix such as: ["data/1", "data/2", "data/3"] -> "data/"
aggregated[col] = grouped[col].agg(lambda x: os.path.commonprefix(x.tolist()))
# Recover index
return aggregated.reset_index().set_index(index_names)
return pd.concat(results, ignore_index=True)

def check(self, result: pd.DataFrame, criteria: list[Criterion]):
"""Check result w.r.t. reference data.
@@ -413,19 +395,24 @@ def check(self, result: pd.DataFrame, criteria: list[Criterion]):
criteria (list[Criterion]): Criteria to check results
"""
if result is None:
print("[Check] No results loaded. Skipping result checking.")
return

if self.reference_results is None:
print("No benchmark references loaded. Skipping result checking.")
print("[Check] No benchmark references loaded. Skipping result checking.")
return

for key, result_entry in result.iterrows():
if key not in self.reference_results.index:
print(f"No benchmark reference for {key} loaded. Skipping result checking.")
print(f"[Check] No benchmark reference for {key} loaded. Skipping result checking.")
continue
target_entry = self.reference_results.loc[key]
if isinstance(target_entry, pd.DataFrame):
target_entry = target_entry.iloc[0] # 1-row pd.DataFrame to pd.Series
# Match num_repeat of result and target
result_seed_average = result_entry["seed"]
result_num_repeat = 2 * result_seed_average + 1 # (0+1+2+3+4)/5 = 2.0 -> 2*2.0+1 = 5
target_entry = target_entry.query(f"seed < {result_num_repeat}")
target_entry = target_entry.mean(numeric_only=True) # N-row pd.DataFrame to pd.Series

for criterion in criteria:
criterion(result_entry, target_entry)
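
Two behaviors in benchmark.py are easy to miss in the diff: Criterion.__call__ now prints what it compares before asserting, and Benchmark.check() reconstructs the result's num_repeat from the averaged seed column so the reference rows can be filtered to the same number of seeds. A hedged worked example of both; the metric name, values, and margin are illustrative, not taken from the suite.

```python
# Worked example mirroring Benchmark.check() seed matching and a ">" criterion check.
import pandas as pd

result_entry = pd.Series({"seed": 2.0, "test/accuracy": 0.82})  # seeds 0..4 averaged -> mean seed 2.0
result_num_repeat = 2 * result_entry["seed"] + 1                # (0+1+2+3+4)/5 = 2.0 -> 2*2.0+1 = 5

reference = pd.DataFrame({"seed": range(10), "test/accuracy": [0.80] * 10})
target_entry = reference.query(f"seed < {result_num_repeat}").mean(numeric_only=True)  # keep seeds 0..4

margin = 0.05
print(f"[Check] {result_entry['test/accuracy']} > {target_entry['test/accuracy']} * (1.0 - {margin})")
assert result_entry["test/accuracy"] > target_entry["test/accuracy"] * (1.0 - margin)
```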
65 changes: 37 additions & 28 deletions tests/perf/conftest.py
@@ -9,14 +9,18 @@
import subprocess
from datetime import datetime, timedelta, timezone
from pathlib import Path
from typing import TYPE_CHECKING
from urllib.parse import urlparse

import pandas as pd
import pytest
from cpuinfo import get_cpu_info
from mlflow.client import MlflowClient

from .benchmark import Benchmark
from .history import summary

if TYPE_CHECKING:
import pandas as pd

log = logging.getLogger(__name__)

@@ -71,9 +75,9 @@ def pytest_addoption(parser):
help="Output root directory. Defaults to temp directory.",
)
parser.addoption(
"--summary-csv",
"--summary-file",
action="store",
help="Path to output summary cvs file. Defaults to {output-root}/benchmark-summary.csv",
help="Path to output summary file. Defaults to {output-root}/benchmark-summary.csv",
)
parser.addoption(
"--dry-run",
@@ -237,13 +241,13 @@ def fxt_version_tags(fxt_current_date: str, fxt_otx_ref: str) -> dict[str, str]:


@pytest.fixture(scope="session")
def fxt_summary_csv(request: pytest.FixtureRequest, fxt_output_root: Path) -> Path:
def fxt_summary_file(request: pytest.FixtureRequest, fxt_output_root: Path) -> Path:
"""Path to benchmark result summary csv file."""
summary_csv = request.config.getoption("--summary-csv")
summary_csv = fxt_output_root / "benchmark-summary.csv" if summary_csv is None else Path(summary_csv)
msg = f"{summary_csv = }"
summary_file = request.config.getoption("--summary-file")
summary_file = fxt_output_root / "benchmark-summary.csv" if summary_file is None else Path(summary_file)
msg = f"{summary_file = }"
log.info(msg)
return summary_csv
return summary_file


@pytest.fixture(scope="session")
@@ -292,7 +296,10 @@ def fxt_model(request: pytest.FixtureRequest, fxt_model_category) -> Benchmark.M
model: Benchmark.Model = request.param
if fxt_model_category == "all":
return model
if (fxt_model_category == "default" and model.category == "other") or fxt_model_category != model.category:
if fxt_model_category == "default":
if model.category == "other":
pytest.skip(f"{model.category} category model")
elif fxt_model_category != model.category:
pytest.skip(f"{model.category} category model")
return model

@@ -356,47 +363,49 @@ def fxt_benchmark(
@pytest.fixture(scope="session", autouse=True)
def fxt_benchmark_summary(
fxt_output_root: Path,
fxt_summary_csv: Path,
fxt_summary_file: Path,
fxt_mlflow_client: MlflowClient,
fxt_tags: dict[str, str],
):
"""Summarize all results at the end of test session."""
yield

raw_results = Benchmark.load_result(fxt_output_root)
if raw_results is None:
if raw_results is None or len(raw_results) == 0:
print("No benchmark results loaded in ", fxt_output_root)
return

summary_results = [
Benchmark.average_result(raw_results, ["task", "model", "data_group", "data"]),
Benchmark.average_result(raw_results, ["task", "model", "data_group"]),
Benchmark.average_result(raw_results, ["task", "model"]),
Benchmark.average_result(raw_results, ["task"]),
]
summary_results = pd.concat(summary_results)
summary_results = summary.summarize(raw_results)

print("=" * 20, "[Benchmark summary]")
print(summary_results)
fxt_summary_csv.parent.mkdir(parents=True, exist_ok=True)
summary_results.to_csv(fxt_summary_csv)
raw_results.to_csv(fxt_summary_csv.parent / "perf-benchmark-raw.csv")
print(f" -> Saved to {fxt_summary_csv}.")
fxt_summary_file.parent.mkdir(parents=True, exist_ok=True)
raw_results.to_csv(fxt_summary_file.parent / "perf-benchmark-raw.csv", index=False)
if fxt_summary_file.suffix == ".xlsx":
summary_results.to_excel(fxt_summary_file)
else:
if fxt_summary_file.suffix != ".csv":
print(f"{fxt_summary_file.suffix} output is not supported.")
fxt_summary_file = fxt_summary_file.with_suffix(".csv")
summary_results.to_csv(fxt_summary_file)
print(f" -> Saved to {fxt_summary_file}.")

if fxt_mlflow_client:
try:
_log_benchmark_results_to_mlflow(summary_results, fxt_mlflow_client, fxt_tags)
_log_benchmark_results_to_mlflow(raw_results, fxt_mlflow_client, fxt_tags)
except Exception as e:
print("MLFlow logging failed: ", e)


def _log_benchmark_results_to_mlflow(results: pd.DataFrame, client: MlflowClient, tags: dict[str, str]) -> None:
results = summary.average(results, keys=["task", "model", "data_group", "data"]) # Average out seeds
results = results.set_index(["task", "data_group", "data"])
for index, result in results.iterrows():
task, model, data_group, data = index
exp_name = f"[Benchmark] {task} | {model} | {data_group} | {data}"
task, data_group, data = index
model = result["model"]
exp_name = f"[Benchmark] {task} | {data_group} | {data}"
exp_tags = {
"task": task,
"model": model,
"data_group": data_group,
"data": data,
}
@@ -407,7 +416,7 @@ def _log_benchmark_results_to_mlflow(results: pd.DataFrame, client: MlflowClient
exp_id = exp.experiment_id
if exp.lifecycle_stage != "active":
client.restore_experiment(exp_id)
run_name = f"[{tags['date']} | {tags['user_name']} | {tags['otx_version']} | {tags['test_branch']} | {tags['test_commit']}"
run_name = f"[{model}] {tags['date']} | {tags['user_name']} | {tags['otx_version']} | {tags['test_branch']} | {tags['test_commit']}"
run_tags = {k: v for k, v in result.items() if isinstance(v, str)}
run_tags.update(**exp_tags, **tags)
run = client.create_run(exp_id, run_name=run_name, tags=run_tags)
@@ -419,7 +428,7 @@ def _log_benchmark_results_to_mlflow(results: pd.DataFrame, client: MlflowClient
@pytest.fixture(scope="session")
def fxt_benchmark_reference() -> pd.DataFrame | None:
"""Load reference benchmark results with index."""
ref = pd.read_csv(Path(__file__).parent.resolve() / "benchmark-reference.csv")
ref = summary.load(Path(__file__).parent.resolve() / "history/v1.5.2", need_normalize=True)
if ref is not None:
ref = ref.set_index(["task", "model", "data_group", "data"])
return ref
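
Both benchmark.py and conftest.py now delegate averaging and summarization to the new tests/perf/history/summary module (summary.load, summary.average, summary.summarize), whose source is not shown in this diff. Based on the deleted Benchmark.average_result() and the old inline aggregation in the fixture, a plausible sketch of the two calls used here might look like the following; treat it as an inference, not the merged implementation.

```python
# Hedged sketch of the summary.average() / summary.summarize() contracts, inferred
# from the deleted Benchmark.average_result() and from the call sites in this PR.
# The real tests/perf/history/summary.py may differ.
from __future__ import annotations

import pandas as pd


def average(data: pd.DataFrame, keys: list[str]) -> pd.DataFrame:
    """Average numeric columns grouped by `keys`, e.g. to fold seeds together."""
    grouped = data.groupby(keys)
    aggregated = grouped.mean(numeric_only=True)
    return aggregated.reset_index()


def summarize(data: pd.DataFrame) -> pd.DataFrame:
    """Stack averages at several aggregation levels, as the old fixture did inline."""
    levels = [
        ["task", "model", "data_group", "data"],
        ["task", "model", "data_group"],
        ["task", "model"],
        ["task"],
    ]
    return pd.concat([average(data, keys) for keys in levels], ignore_index=True)
```

Note also that the deleted benchmark-reference.csv is replaced by summary.load(... / "history/v1.5.2", need_normalize=True), so reference results now come from versioned history folders rather than a single CSV.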
4 changes: 4 additions & 0 deletions tests/perf/history/__init__.py
@@ -0,0 +1,4 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

"""OTX perfomance benchmark history."""