change markdown output in benchmark PR comments
add an option for limiting markdown content size
calculate relative performance with different baselines
calculate relative performance using only already saved data
group results according to suite names and explicit groups
add multiple data columns if multiple --compare specified
EuphoricThinking committed Feb 16, 2025
1 parent f66751d commit 37a046c
Showing 6 changed files with 353 additions and 143 deletions.
3 changes: 2 additions & 1 deletion .github/workflows/benchmarks-reusable.yml
@@ -220,11 +220,12 @@ jobs:
--compute-runtime ${{ inputs.compute_runtime_commit }}
--build-igc
${{ inputs.upload_report && '--output-html' || '' }}
${{ inputs.pr_no != 0 && '--output-markdown' || '' }}
${{ inputs.bench_script_params }}
- name: Print benchmark results
run: |
cat ${{ github.workspace }}/ur-repo/benchmark_results.md
cat ${{ github.workspace }}/ur-repo/benchmark_results.md || true
- name: Add comment to PR
uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
9 changes: 9 additions & 0 deletions scripts/benchmarks/README.md
@@ -37,11 +37,20 @@ By default, the benchmark results are not stored. To store them, use the option

To compare a benchmark run with a previously stored result, use the option `--compare <name>`. You can compare with more than one result.

In the markdown output file (see below), listing more than two `--compare` options results in displaying the measured performance (time) for each run. If only one `--compare` option is specified, the relative performance of the provided results is calculated against the previously saved `baseline`. You can compare your data against results other than `baseline` by using:

`--compare <name> --relative-perf <name> --compare <new_baseline> --new-base-name <new_baseline>`.

If no `--compare` option is specified, the benchmark run is compared against a previously stored `baseline`.
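These flags only select which saved runs take part in the comparison; the relative-performance figure itself is a simple ratio. Below is a minimal sketch of such a calculation, assuming relative performance is reported as the baseline median divided by the measured median (so, for time-based metrics, values above 1.0 mean the compared run is faster); the helper name and convention are illustrative, not taken from the scripts:

```python
from statistics import median

def relative_performance(baseline_samples: list[float],
                         measured_samples: list[float]) -> float:
    """Ratio of the baseline median to the measured median.

    For time-like units, a result above 1.0 means the measured run
    completed faster than the chosen baseline.
    """
    measured = median(measured_samples)
    if measured == 0:
        raise ValueError("measured median must be non-zero")
    return median(baseline_samples) / measured

# Example: a run compared against a non-default baseline.
print(relative_performance([10.2, 10.4, 10.1], [9.8, 9.9, 10.0]))  # ~1.03
```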

Both `baseline` and `baseline-v2` (for the level-zero adapter v2) are updated automatically during a nightly job. The results are stored [here](https://oneapi-src.github.io/unified-runtime/benchmark_results.html).


## Output formats
You can display the results as an HTML file by using `--output-html` and as a markdown file by using `--output-markdown`. Due to character limits for posting PR comments, the final content of the markdown file might be reduced. To obtain the full markdown output, use `--output-markdown full`.
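The limit exists because GitHub rejects comment bodies longer than roughly 65,536 characters. A minimal sketch of how the reduction could work when assembling the PR comment; the constant, helper name, and truncation note are assumptions for illustration, not the script's actual implementation:

```python
MAX_COMMENT_CHARS = 65536  # assumed GitHub limit on comment body length
TRUNCATION_NOTE = (
    "\n\n_Table truncated to fit the PR comment size limit; "
    "run with `--output-markdown full` for the complete output._\n"
)

def fit_markdown(full_markdown: str, limit: int = MAX_COMMENT_CHARS) -> str:
    """Return the markdown unchanged if it fits; otherwise cut it at the
    last complete line below the limit and append a truncation note."""
    if len(full_markdown) <= limit:
        return full_markdown
    budget = limit - len(TRUNCATION_NOTE)
    cut = full_markdown.rfind("\n", 0, budget)
    return full_markdown[: cut if cut != -1 else budget] + TRUNCATION_NOTE
```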


## Requirements

### Python
2 changes: 1 addition & 1 deletion scripts/benchmarks/benches/result.py
@@ -18,7 +18,7 @@ class Result:
stdout: str
passed: bool = True
unit: str = ""
explicit_group: str = ""
explicit_group: str = "Ungrouped"
# stddev can be optionally set by the benchmark,
# if not set, it will be calculated automatically.
stddev: float = 0.0
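With this new default, results that never set `explicit_group` share an `Ungrouped` bucket instead of an empty string, which keeps grouping logic simple. A self-contained sketch of bucketing by that field (the `Result` class below is a simplified stand-in, not the real dataclass):

```python
from collections import defaultdict
from dataclasses import dataclass

@dataclass
class Result:  # simplified stand-in for the dataclass in benches/result.py
    label: str
    value: float
    explicit_group: str = "Ungrouped"

def group_results(results: list[Result]) -> dict[str, list[Result]]:
    """Bucket results by explicit_group; results without one share 'Ungrouped'."""
    groups: dict[str, list[Result]] = defaultdict(list)
    for r in results:
        groups[r.explicit_group].append(r)
    return dict(groups)

rows = [Result("submit_kernel", 1.2, "SubmitKernel"), Result("misc_bench", 3.4)]
print(sorted(group_results(rows)))  # ['SubmitKernel', 'Ungrouped']
```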
29 changes: 22 additions & 7 deletions scripts/benchmarks/main.py
@@ -189,9 +189,12 @@ def main(directory, additional_env_vars, save_name, compare_names, filter):
benchmark.teardown()
print("complete.")

this_name = "This PR"

chart_data = {this_name : results}
this_name = options.current_run_name
chart_data = {}

if not options.dry_run:
chart_data = {this_name : results}

history = BenchmarkHistory(directory)
# limit how many files we load.
Expand All @@ -207,7 +210,7 @@ def main(directory, additional_env_vars, save_name, compare_names, filter):
chart_data[name] = compare_result.results

if options.output_markdown:
markdown_content = generate_markdown(this_name, chart_data)
markdown_content = generate_markdown(this_name, chart_data, options.output_markdown)

with open('benchmark_results.md', 'w') as file:
file.write(markdown_content)
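The hunk above is what makes `--dry-run` usable together with the markdown output: the current run is only added to `chart_data` when it actually produced results, so a dry run charts nothing but previously saved data. A small sketch of that control flow in isolation (names and placeholder data are illustrative):

```python
def build_chart_data(this_name: str, results: list, saved_runs: dict[str, list],
                     dry_run: bool) -> dict[str, list]:
    """Assemble the name -> results mapping used to generate the output.

    On a dry run the current run contributes no column, so relative
    performance can be computed from already saved data alone.
    """
    chart_data: dict[str, list] = {}
    if not dry_run:
        chart_data[this_name] = results
    chart_data.update(saved_runs)
    return chart_data

# Dry run: only the saved baselines appear in the output.
print(build_chart_data("This PR", [], {"baseline": [1.0], "baseline-v2": [0.9]},
                       dry_run=True))
```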
@@ -241,6 +244,11 @@ def validate_and_parse_env_args(env_args):
env_vars[key] = value
return env_vars

def substitute_baseline(run_names_to_compare: list[str], new_baseline_name: str):
new_compare_names = [run_name if run_name != options.default_baseline else new_baseline_name for run_name in run_names_to_compare]

return new_compare_names

if __name__ == "__main__":
parser = argparse.ArgumentParser(description='Unified Runtime Benchmark Runner')
parser.add_argument('benchmark_directory', type=str, help='Working directory to setup benchmarks.')
@@ -251,7 +259,7 @@ def validate_and_parse_env_args(env_args):
parser.add_argument("--no-rebuild", help='Do not rebuild the benchmarks from scratch.', action="store_true")
parser.add_argument("--env", type=str, help='Use env variable for a benchmark run.', action="append", default=[])
parser.add_argument("--save", type=str, help='Save the results for comparison under a specified name.')
parser.add_argument("--compare", type=str, help='Compare results against previously saved data.', action="append", default=["baseline"])
parser.add_argument("--compare", type=str, help='Compare results against previously saved data.', action="append", default=[options.default_baseline])
parser.add_argument("--iterations", type=int, help='Number of times to run each benchmark to select a median value.', default=options.iterations)
parser.add_argument("--stddev-threshold", type=float, help='If stddev pct is above this threshold, rerun all iterations', default=options.stddev_threshold)
parser.add_argument("--timeout", type=int, help='Timeout for individual benchmarks in seconds.', default=options.timeout)
@@ -261,12 +269,14 @@ def validate_and_parse_env_args(env_args):
parser.add_argument("--exit-on-failure", help='Exit on first failure.', action="store_true")
parser.add_argument("--compare-type", type=str, choices=[e.value for e in Compare], help='Compare results against previously saved data.', default=Compare.LATEST.value)
parser.add_argument("--compare-max", type=int, help='How many results to read for comparisions', default=options.compare_max)
parser.add_argument("--output-markdown", nargs='?', const=options.output_markdown, help='Specify whether markdown output should fit the content size limit for request validation')
parser.add_argument("--output-html", help='Create HTML output', action="store_true", default=False)
parser.add_argument("--output-markdown", help='Create Markdown output', action="store_true", default=True)
parser.add_argument("--dry-run", help='Do not run any actual benchmarks', action="store_true", default=False)
parser.add_argument("--compute-runtime", nargs='?', const=options.compute_runtime_tag, help="Fetch and build compute runtime")
parser.add_argument("--iterations-stddev", type=int, help="Max number of iterations of the loop calculating stddev after completed benchmark runs", default=options.iterations_stddev)
parser.add_argument("--build-igc", help="Build IGC from source instead of using the OS-installed version", action="store_true", default=options.build_igc)
parser.add_argument("--relative-perf", type=str, help="The name of the results which should be used as a baseline for metrics calculation", default=options.current_run_name)
parser.add_argument("--new-base-name", help="New name of the default baseline to compare", type=str, default='')

args = parser.parse_args()
additional_env_vars = validate_and_parse_env_args(args.env)
@@ -283,12 +293,13 @@ def validate_and_parse_env_args(env_args):
options.exit_on_failure = args.exit_on_failure
options.compare = Compare(args.compare_type)
options.compare_max = args.compare_max
options.output_html = args.output_html
options.output_markdown = args.output_markdown
options.output_html = args.output_html
options.dry_run = args.dry_run
options.umf = args.umf
options.iterations_stddev = args.iterations_stddev
options.build_igc = args.build_igc
options.current_run_name = args.relative_perf

if args.build_igc and args.compute_runtime is None:
parser.error("--build-igc requires --compute-runtime to be set")
@@ -298,4 +309,8 @@ def validate_and_parse_env_args(env_args):

benchmark_filter = re.compile(args.filter) if args.filter else None

main(args.benchmark_directory, additional_env_vars, args.save, args.compare, benchmark_filter)
compare_names = args.compare
if args.new_base_name != '':
compare_names = substitute_baseline(run_names_to_compare=args.compare, new_baseline_name=args.new_base_name)

main(args.benchmark_directory, additional_env_vars, args.save, compare_names, benchmark_filter)
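Taken together, the new arguments behave as follows: `--compare` always starts from the default `baseline` entry, `--new-base-name` swaps that entry out via `substitute_baseline`, and `--relative-perf` names the run whose values serve as the reference for the metrics. A self-contained sketch that mirrors this wiring with a reduced parser (flag defaults and sample values are illustrative):

```python
import argparse

# Reduced parser covering only the flags involved in baseline substitution.
parser = argparse.ArgumentParser()
parser.add_argument("--compare", action="append", default=["baseline"])
parser.add_argument("--relative-perf", default="This PR")
parser.add_argument("--new-base-name", default="")

args = parser.parse_args([
    "--compare", "my_run",
    "--relative-perf", "my_run",
    "--compare", "baseline-v2",
    "--new-base-name", "baseline-v2",
])

compare_names = args.compare
if args.new_base_name != "":
    # Same substitution as substitute_baseline(): replace the default baseline.
    compare_names = [args.new_base_name if n == "baseline" else n
                     for n in compare_names]

print(compare_names)       # ['baseline-v2', 'my_run', 'baseline-v2']
print(args.relative_perf)  # my_run
```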
8 changes: 7 additions & 1 deletion scripts/benchmarks/options.py
@@ -6,6 +6,10 @@ class Compare(Enum):
AVERAGE = 'average'
MEDIAN = 'median'

class MarkdownSize(Enum):
SHORT = 'short'
FULL = 'full'

@dataclass
class Options:
workdir: str = None
@@ -20,8 +24,8 @@ class Options:
verbose: bool = False
compare: Compare = Compare.LATEST
compare_max: int = 10 # average/median over how many results
output_markdown: MarkdownSize = MarkdownSize.SHORT
output_html: bool = False
output_markdown: bool = True
dry_run: bool = False
# these two should probably be merged into one setting
stddev_threshold: float = 0.02
@@ -32,6 +36,8 @@ class Options:
extra_env_vars: dict = field(default_factory=dict)
compute_runtime_tag: str = '24.52.32224.10'
build_igc: bool = False
current_run_name: str = "This PR"
default_baseline: str = "baseline"

options = Options()
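Note that `output_markdown` is now a `MarkdownSize` rather than a bool, while the CLI may hand over either the enum (the argparse `const`) or a plain string such as `'full'`. A small normalization helper one might add for that case; it is an assumption for illustration, not part of this commit:

```python
from enum import Enum

class MarkdownSize(Enum):
    SHORT = 'short'
    FULL = 'full'

def to_markdown_size(value) -> MarkdownSize:
    """Accept either a MarkdownSize (the argparse const) or its string value."""
    return value if isinstance(value, MarkdownSize) else MarkdownSize(value)

print(to_markdown_size('full'))              # MarkdownSize.FULL
print(to_markdown_size(MarkdownSize.SHORT))  # MarkdownSize.SHORT
```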
