From bbd9097470d7ad35ecbaac22ffb2ef474dc4670a Mon Sep 17 00:00:00 2001 From: Agata Momot Date: Mon, 10 Feb 2025 12:54:03 +0100 Subject: [PATCH] change markdown output in benchmark PR comments add an option for limiting markdown content size calculate relative performance with different baselines calculate relative performance using only already saved data group results according to suite names and explicit groups add multiple data columns if multiple --compare specified --- .github/workflows/benchmarks-reusable.yml | 3 +- .github/workflows/benchmarks.yml | 2 +- scripts/benchmarks/README.md | 9 +- scripts/benchmarks/main.py | 19 +- scripts/benchmarks/options.py | 7 +- scripts/benchmarks/output_markdown.py | 436 +++++++++++++++------- 6 files changed, 335 insertions(+), 141 deletions(-) diff --git a/.github/workflows/benchmarks-reusable.yml b/.github/workflows/benchmarks-reusable.yml index bfd6064ba1..f8ef31fcc2 100644 --- a/.github/workflows/benchmarks-reusable.yml +++ b/.github/workflows/benchmarks-reusable.yml @@ -220,11 +220,12 @@ jobs: --compute-runtime ${{ inputs.compute_runtime_commit }} --build-igc ${{ inputs.upload_report && '--output-html' || '' }} + ${{ inputs.pr_no != 0 && '--output-markdown' || '' }} ${{ inputs.bench_script_params }} - name: Print benchmark results run: | - cat ${{ github.workspace }}/ur-repo/benchmark_results.md + cat ${{ github.workspace }}/ur-repo/benchmark_results.md || true - name: Add comment to PR uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1 diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml index 7de3926daf..edcb5c02f2 100644 --- a/.github/workflows/benchmarks.yml +++ b/.github/workflows/benchmarks.yml @@ -24,7 +24,7 @@ on: type: number required: true bench_script_params: - description: Parameters passed to script executing benchmark + description: Parameters passed to the script executing the benchmark (recommended `--compare baseline`) type: string required: false default: '' diff --git a/scripts/benchmarks/README.md b/scripts/benchmarks/README.md index 9cef0e52a3..4fe7f8b3c4 100644 --- a/scripts/benchmarks/README.md +++ b/scripts/benchmarks/README.md @@ -35,13 +35,16 @@ You must be a member of the `oneapi-src` organization to access these features. By default, the benchmark results are not stored. To store them, use the option `--save <name>`. This will make the results available for comparison during the next benchmark runs. -To compare a benchmark run with a previously stored result, use the option `--compare <name>`. You can compare with more than one result. - -If no `--compare` option is specified, the benchmark run is compared against a previously stored `baseline`. +You can compare benchmark results using the `--compare` option. To calculate the relative performance of new results against previously saved data, use `--compare <previously_saved_data>`. To compare only stored data, without generating new results, use `--dry-run --compare <name1> --compare <name2> --relative-perf <name1>`, where `name1` indicates the baseline for the relative performance calculation, `--dry-run` prevents the script from running benchmarks, and `--relative-perf` substitutes the name of the current run. Note that in the markdown output file (see below), listing more than two `--compare` options displays only execution times, without statistical analysis. Baseline, as well as baseline-v2 (for the level-zero adapter v2), is updated automatically during a nightly job.
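Editor's note: the sketch below is an editorial illustration of the two comparison workflows described in the README paragraph above; it is not part of the patch, and the script path, working directory, and result names (`name1`, `name2`) are placeholders.

```python
# Illustrative only: invoking the benchmark script the way the README describes.
import subprocess

# 1) Run the benchmarks and compare the fresh results against saved data.
subprocess.run(
    ["./main.py", "/path/to/benchmarks_workdir",
     "--compare", "baseline"],
    check=True,
)

# 2) Compare two already saved runs without running any benchmarks;
#    name1 is used as the baseline for the relative-performance column.
subprocess.run(
    ["./main.py", "/path/to/benchmarks_workdir", "--dry-run",
     "--compare", "name1", "--compare", "name2",
     "--relative-perf", "name1"],
    check=True,
)
```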
The results are stored [here](https://oneapi-src.github.io/unified-runtime/benchmark_results.html). + +## Output formats +You can display the results as an HTML file by using `--output-html`, or as a markdown file by using `--output-markdown`. Due to character limits for posting PR comments, the final content of the markdown file might be reduced. To obtain the full markdown output, use `--output-markdown full`. + + ## Requirements ### Python diff --git a/scripts/benchmarks/main.py b/scripts/benchmarks/main.py index 77524a6e02..a740c02672 100755 --- a/scripts/benchmarks/main.py +++ b/scripts/benchmarks/main.py @@ -189,9 +189,12 @@ def main(directory, additional_env_vars, save_name, compare_names, filter): benchmark.teardown() print("complete.") - this_name = "This PR" - chart_data = {this_name : results} + this_name = options.current_run_name + chart_data = {} + + if not options.dry_run: + chart_data = {this_name : results} history = BenchmarkHistory(directory) # limit how many files we load. @@ -199,7 +202,7 @@ def main(directory, additional_env_vars, save_name, compare_names, filter): history.load(1000) # remove duplicates. this can happen if e.g., --compare baseline is specified manually. - compare_names = list(dict.fromkeys(compare_names)) + compare_names = list(dict.fromkeys(compare_names)) if compare_names is not None else [] for name in compare_names: compare_result = history.get_compare(name) @@ -207,7 +210,7 @@ def main(directory, additional_env_vars, save_name, compare_names, filter): chart_data[name] = compare_result.results if options.output_markdown: - markdown_content = generate_markdown(this_name, chart_data) + markdown_content = generate_markdown(this_name, chart_data, options.output_markdown) with open('benchmark_results.md', 'w') as file: file.write(markdown_content) @@ -251,7 +254,7 @@ def validate_and_parse_env_args(env_args): parser.add_argument("--no-rebuild", help='Do not rebuild the benchmarks from scratch.', action="store_true") parser.add_argument("--env", type=str, help='Use env variable for a benchmark run.', action="append", default=[]) parser.add_argument("--save", type=str, help='Save the results for comparison under a specified name.') - parser.add_argument("--compare", type=str, help='Compare results against previously saved data.', action="append", default=["baseline"]) + parser.add_argument("--compare", type=str, help='Compare results against previously saved data.', action="append") parser.add_argument("--iterations", type=int, help='Number of times to run each benchmark to select a median value.', default=options.iterations) parser.add_argument("--stddev-threshold", type=float, help='If stddev pct is above this threshold, rerun all iterations', default=options.stddev_threshold) parser.add_argument("--timeout", type=int, help='Timeout for individual benchmarks in seconds.', default=options.timeout) @@ -261,12 +264,13 @@ def validate_and_parse_env_args(env_args): parser.add_argument("--exit-on-failure", help='Exit on first failure.', action="store_true") parser.add_argument("--compare-type", type=str, choices=[e.value for e in Compare], help='Compare results against previously saved data.', default=Compare.LATEST.value) parser.add_argument("--compare-max", type=int, help='How many results to read for comparisions', default=options.compare_max) + parser.add_argument("--output-markdown", nargs='?', const=options.output_markdown, help='Specify whether markdown output should fit the content size limit for request validation')
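Editor's note: a minimal, self-contained sketch (not part of the patch) of the size-limit rule the new markdown output is expected to follow, assuming the `MarkdownSize` enum added in `options.py` and the 65536-character limit used as `max_markdown_size` in `output_markdown.py`; the function and variable names below are invented for illustration.

```python
# Illustrative only: trim the PR-comment markdown unless 'full' was requested.
from enum import Enum


class MarkdownSize(Enum):
    SHORT = 'short'  # default: keep the comment within the GitHub size limit
    FULL = 'full'    # --output-markdown full: never trim


MAX_MARKDOWN_SIZE = 65536  # request-validation limit for PR comments


def fits(content: str, already_written: int, size: MarkdownSize) -> bool:
    """Return True if `content` can still be appended to the comment."""
    if size == MarkdownSize.FULL:
        return True
    budget = max(0, MAX_MARKDOWN_SIZE - already_written)
    return len(content) <= budget


# A 70 kB details section is dropped in SHORT mode but kept in FULL mode.
details = "x" * 70000
print(fits(details, already_written=0, size=MarkdownSize.SHORT))  # False
print(fits(details, already_written=0, size=MarkdownSize.FULL))   # True
```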
parser.add_argument("--output-html", help='Create HTML output', action="store_true", default=False) - parser.add_argument("--output-markdown", help='Create Markdown output', action="store_true", default=True) parser.add_argument("--dry-run", help='Do not run any actual benchmarks', action="store_true", default=False) parser.add_argument("--compute-runtime", nargs='?', const=options.compute_runtime_tag, help="Fetch and build compute runtime") parser.add_argument("--iterations-stddev", type=int, help="Max number of iterations of the loop calculating stddev after completed benchmark runs", default=options.iterations_stddev) parser.add_argument("--build-igc", help="Build IGC from source instead of using the OS-installed version", action="store_true", default=options.build_igc) + parser.add_argument("--relative-perf", type=str, help="The name of the results which should be used as a baseline for metrics calculation", default=options.current_run_name) args = parser.parse_args() additional_env_vars = validate_and_parse_env_args(args.env) @@ -283,12 +287,13 @@ def validate_and_parse_env_args(env_args): options.exit_on_failure = args.exit_on_failure options.compare = Compare(args.compare_type) options.compare_max = args.compare_max - options.output_html = args.output_html options.output_markdown = args.output_markdown + options.output_html = args.output_html options.dry_run = args.dry_run options.umf = args.umf options.iterations_stddev = args.iterations_stddev options.build_igc = args.build_igc + options.current_run_name = args.relative_perf if args.build_igc and args.compute_runtime is None: parser.error("--build-igc requires --compute-runtime to be set") diff --git a/scripts/benchmarks/options.py b/scripts/benchmarks/options.py index 1bd79f6878..772fee2e02 100644 --- a/scripts/benchmarks/options.py +++ b/scripts/benchmarks/options.py @@ -6,6 +6,10 @@ class Compare(Enum): AVERAGE = 'average' MEDIAN = 'median' +class MarkdownSize(Enum): + SHORT = 'short' + FULL = 'full' + @dataclass class Options: workdir: str = None @@ -20,8 +24,8 @@ class Options: verbose: bool = False compare: Compare = Compare.LATEST compare_max: int = 10 # average/median over how many results + output_markdown: MarkdownSize = MarkdownSize.SHORT output_html: bool = False - output_markdown: bool = True dry_run: bool = False # these two should probably be merged into one setting stddev_threshold: float = 0.02 @@ -32,6 +36,7 @@ class Options: extra_env_vars: dict = field(default_factory=dict) compute_runtime_tag: str = '24.52.32224.10' build_igc: bool = False + current_run_name: str = "This PR" options = Options() diff --git a/scripts/benchmarks/output_markdown.py b/scripts/benchmarks/output_markdown.py index fc3b65507b..daa01e34ff 100644 --- a/scripts/benchmarks/output_markdown.py +++ b/scripts/benchmarks/output_markdown.py @@ -1,12 +1,13 @@ -# Copyright (C) 2024 Intel Corporation -# Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. +# Copyright (C) 2024-2025 Intel Corporation +# Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM +# Exceptions. 
# See LICENSE.TXT # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -import collections, re +import collections from benches.result import Result -from options import options -import math +from options import options, MarkdownSize +import ast class OutputLine: def __init__(self, name): @@ -14,6 +15,8 @@ def __init__(self, name): self.diff = None self.bars = None self.row = "" + self.suite = "Unknown" + self.explicit_group = "" def __str__(self): return f"(Label:{self.label}, diff:{self.diff})" @@ -21,40 +24,167 @@ def __str__(self): def __repr__(self): return self.__str__() -# Function to generate the markdown collapsible sections for each variant -def generate_markdown_details(results: list[Result]): - markdown_sections = [] +# The number of the required columns in the markdown table, +# independent of the chart_data content. +# Required columns: +# - benchmark_name +# +# optional +1: relative performance +num_info_columns = 1 + +# Number of columns required for relative performance change calculation. +# In case of multiple provided saved baselines to compare, the relative +# performance is not calculated, since the base (hopefully) usage case +# for this script would be comparing the performance of PR with the main branch +num_baselines_required_for_rel_change = 2 + +# Maximum number of characters that is allowed in request validation +# for posting comments in GitHub PRs +max_markdown_size = 65536 + + +def is_relative_perf_comparison_to_be_performed(chart_data: + dict[str, list[Result]], + baseline_name: str): + return (len(chart_data) == num_baselines_required_for_rel_change) and \ + (baseline_name in chart_data.keys()) + + +def get_chart_markdown_header(chart_data: dict[str, list[Result]], + baseline_name: str): + summary_header = '' + final_num_columns = num_info_columns + + if is_relative_perf_comparison_to_be_performed(chart_data, baseline_name): + summary_header = "| Benchmark | " + " | ".join(chart_data.keys()) + \ + " | Change |\n" + final_num_columns += 1 + else: + summary_header = "| Benchmark | " + " | ".join(chart_data.keys()) + \ + " |\n" + + summary_header += "|---" * (len(chart_data) + final_num_columns) + "|\n" + + return summary_header + + +def get_improved_regressed_summary(is_improved: bool, rows_count: int): + title = "Improved" + if not is_improved: + title = "Regressed" + + summary = ( + "\n
\n" + "\n" + f"{title} {rows_count} " + f"(threshold {options.epsilon*100:.2f}%)\n" + "\n\n" + ) + + return summary + + +def get_relative_perf_summary(group_size: int, group_name: str): + summary = ( + "\n
\n" + f" Relative perf in group {group_name} " + f"({group_size})\n" + "\n\n" + ) + + return summary + - markdown_sections.append(f""" -
<details> -<summary>Benchmark details - environment, command...</summary> -""") +def get_main_branch_run_name(chart_data: dict[str, list[Result]], + baseline_name: str): + for key in chart_data.keys(): + if key != baseline_name: + return key + + return None - for res in results: - env_vars_str = '\n'.join(f"{key}={value}" for key, value in res.env.items()) - markdown_sections.append(f""" -
<details> -<summary>{res.label}</summary> -#### Environment Variables: -{env_vars_str} +def get_available_markdown_size(current_markdown_size: int): + return max(0, max_markdown_size - current_markdown_size) -#### Command: -{' '.join(res.command)} -
-""") - markdown_sections.append(f""" -
-""") - return "\n".join(markdown_sections) +def is_content_in_size_limit(content_size: int, current_markdown_size: int): + return content_size <= get_available_markdown_size(current_markdown_size) -def generate_summary_table_and_chart(chart_data: dict[str, list[Result]]): - summary_table = "| Benchmark | " + " | ".join(chart_data.keys()) + " | Relative perf | Change | - |\n" - summary_table += "|---" * (len(chart_data) + 4) + "|\n" + +def get_explicit_group_name(result: Result): + explicit_group_name = result.explicit_group + + if explicit_group_name != "": + return explicit_group_name + else: + return "Other" + + +# Function to generate the markdown collapsible sections for each variant +def generate_markdown_details(results: list[Result], + current_markdown_size: int, + markdown_size: MarkdownSize): + markdown_sections = [] + markdown_start = ("\n
\n" + "Benchmark details - environment, command..." + "\n") + markdown_sections.append(markdown_start) + + for res in results: + env_dict = res.env + command = res.command + + # If data is collected from already saved results, + # the content is parsed as strings + if isinstance(res.env, str): + # Since the scripts would be used solely on data prepared + # by our scripts, this should be safe + # However, maybe needs an additional blessing + # https://docs.python.org/3/library/ast.html#ast.literal_eval + env_dict = ast.literal_eval(res.env) + if isinstance(res.command, str): + command = ast.literal_eval(res.command) + + section = ("\n
\n" + f"{res.label}\n\n" + "#### Command:\n" + f"{' '.join(command)}\n\n") + + if env_dict: + env_vars_str = '\n'.join(f"{key}={value}" + for key, value in env_dict.items()) + section += (f"#### Environment Variables:\n {env_vars_str}\n") + + section += "\n
\n" + + markdown_sections.append(section) + + markdown_sections.append("\n
\n") + + full_markdown = "\n".join(markdown_sections) + + if markdown_size == MarkdownSize.FULL: + return full_markdown + else: + if is_content_in_size_limit(len(full_markdown), current_markdown_size): + return full_markdown + else: + return "\nBenchmark details contain too many chars to display\n" + +def generate_summary_table_and_chart(chart_data: dict[str, list[Result]], + baseline_name: str, + markdown_size: MarkdownSize): + summary_table = get_chart_markdown_header(chart_data=chart_data, + baseline_name=baseline_name) # Collect all benchmarks and their results + # key: benchmark name, + # value: dict(run_name : single_result_in_the_given_run) benchmark_results = collections.defaultdict(dict) + + # key: run name + # results: results from different benchmarks collected in the named run for key, results in chart_data.items(): for res in results: benchmark_results[res.name][key] = res @@ -62,159 +192,209 @@ def generate_summary_table_and_chart(chart_data: dict[str, list[Result]]): # Generate the table rows output_detailed_list = [] - - global_product = 1 - mean_cnt = 0 - improved = 0 - regressed = 0 - no_change = 0 - for bname, results in benchmark_results.items(): oln = OutputLine(bname) oln.row = f"| {bname} |" best_value = None best_key = None - # Determine the best value + are_suite_group_assigned = False + + # Determine the best value for the given benchmark, among the results + # from all saved runs specified by --compare + # key: run name, + # res: single result collected in the given run for key, res in results.items(): - if best_value is None or (res.lower_is_better and res.value < best_value) or (not res.lower_is_better and res.value > best_value): + if not are_suite_group_assigned: + oln.suite = res.suite + oln.explicit_group = get_explicit_group_name(res) + + are_suite_group_assigned = True + + if best_value is None or \ + (res.lower_is_better and res.value < best_value) or \ + (not res.lower_is_better and res.value > best_value): best_value = res.value best_key = key - # Generate the row with the best value highlighted + # Generate the row with all the results from saved runs specified by + # --compare, + # Highight the best value in the row with data if options.verbose: print(f"Results: {results}") for key in chart_data.keys(): if key in results: intv = results[key].value if key == best_key: - oln.row += f" {intv:3f} {results[key].unit} |" # Highlight the best value + # Highlight the best value + oln.row += f" {intv:3f} {results[key].unit} |" else: oln.row += f" {intv:.3f} {results[key].unit} |" else: oln.row += " - |" - if len(chart_data.keys()) == 2: - key0 = list(chart_data.keys())[0] - key1 = list(chart_data.keys())[1] - if (key0 in results) and (key1 in results): - v0 = results[key0].value - v1 = results[key1].value + if is_relative_perf_comparison_to_be_performed(chart_data, + baseline_name): + pr_key = baseline_name + main_key = get_main_branch_run_name(chart_data, baseline_name) + + if (pr_key in results) and (main_key in results): + pr_val = results[pr_key].value + main_val = results[main_key].value diff = None - if v0 != 0 and results[key0].lower_is_better: - diff = v1/v0 - elif v1 != 0 and not results[key0].lower_is_better: - diff = v0/v1 + if pr_val != 0 and results[pr_key].lower_is_better: + diff = main_val / pr_val + elif main_val != 0 and not results[pr_key].lower_is_better: + diff = pr_val / main_val if diff != None: - oln.row += f"{(diff * 100):.2f}%" oln.diff = diff output_detailed_list.append(oln) - sorted_detailed_list = sorted(output_detailed_list, 
key=lambda x: (x.diff is not None, x.diff), reverse=True) + sorted_detailed_list = sorted(output_detailed_list, key=lambda x: + (x.diff is not None, x.diff), reverse=True) - diff_values = [oln.diff for oln in sorted_detailed_list if oln.diff is not None] + diff_values = [oln.diff for oln in sorted_detailed_list + if oln.diff is not None] - if len(diff_values) > 0: - max_diff = max(max(diff_values) - 1, 1 - min(diff_values)) + improved_rows = [] + regressed_rows = [] + if len(diff_values) > 0: for oln in sorted_detailed_list: if oln.diff != None: - oln.row += f" | {(oln.diff - 1)*100:.2f}%" delta = oln.diff - 1 - oln.bars = round(10*(oln.diff - 1)/max_diff) if max_diff != 0.0 else 0 - if oln.bars == 0 or abs(delta) < options.epsilon: - oln.row += " | . |" - elif oln.bars > 0: - oln.row += f" | {'+' * oln.bars} |" - else: - oln.row += f" | {'-' * (-oln.bars)} |" + oln.row += f" {delta*100:.2f}%" - mean_cnt += 1 if abs(delta) > options.epsilon: if delta > 0: - improved+=1 + improved_rows.append(oln.row + " | \n") else: - regressed+=1 - else: - no_change+=1 - - global_product *= oln.diff - else: - oln.row += " | |" + regressed_rows.append(oln.row + " | \n") if options.verbose: print(oln.row) + summary_table += oln.row + "\n" else: for oln in sorted_detailed_list: - oln.row += " | |" - if options.verbose: print(oln.row) summary_table += oln.row + "\n" - grouped_objects = collections.defaultdict(list) - - for oln in output_detailed_list: - s = oln.label - prefix = re.match(r'^[^_\s]+', s)[0] - grouped_objects[prefix].append(oln) - - grouped_objects = dict(grouped_objects) - - if mean_cnt > 0: - global_mean = global_product ** (1/mean_cnt) - summary_line = f"Total {mean_cnt} benchmarks in mean. " - summary_line += "\n" + f"Geomean {global_mean*100:.3f}%. \nImproved {improved} Regressed {regressed} (threshold {options.epsilon*100:.2f}%)" - else: + regressed_rows.reverse() + + is_at_least_one_diff = False + summary_line = '' + + if len(improved_rows) > 0: + is_at_least_one_diff = True + summary_line += get_improved_regressed_summary( + is_improved=True, + rows_count=len(improved_rows) + ) + summary_line += get_chart_markdown_header( + chart_data=chart_data, + baseline_name=baseline_name + ) + + for row in improved_rows: + summary_line += row + + summary_line += "\n
" + + if len(regressed_rows) > 0: + is_at_least_one_diff = True + summary_line += get_improved_regressed_summary( + is_improved=False, + rows_count=len(regressed_rows) + ) + + summary_line += get_chart_markdown_header( + chart_data=chart_data, + baseline_name=baseline_name + ) + + for row in regressed_rows: + summary_line += row + + summary_line += "\n
" + + if not is_at_least_one_diff: summary_line = f"No diffs to calculate performance change" if options.verbose: print(summary_line) - summary_table = "\n## Performance change in benchmark groups\n" - for name, outgroup in grouped_objects.items(): - outgroup_s = sorted(outgroup, key=lambda x: (x.diff is not None, x.diff), reverse=True) - product = 1.0 - n = len(outgroup_s) - r = 0 - for oln in outgroup_s: - if oln.diff != None: - product *= oln.diff - r += 1 - if r > 0: - summary_table += f""" -
<details> -<summary> Relative perf in group {name} ({n}): {math.pow(product, 1/r)*100:.3f}% </summary> - -""" - else: - summary_table += f""" -
<details> -<summary> Relative perf in group {name} ({n}): cannot calculate </summary> + grouped_in_suites = collections.defaultdict(lambda: + collections.defaultdict(list)) + for oln in output_detailed_list: + grouped_in_suites[oln.suite][oln.explicit_group].append(oln) + + for suite_name, suite_groups in grouped_in_suites.items(): + summary_table += f"
{suite_name}\n\n" -""" - summary_table += "| Benchmark | " + " | ".join(chart_data.keys()) + " | Relative perf | Change | - |\n" - summary_table += "|---" * (len(chart_data) + 4) + "|\n" + for name, outgroup in suite_groups.items(): + outgroup_s = sorted(outgroup, key=lambda x: + (x.diff is not None, x.diff), reverse=True) - for oln in outgroup_s: - summary_table += f"{oln.row}\n" + summary_table += get_relative_perf_summary( + group_size=len(outgroup_s), + group_name=name + ) + summary_table += get_chart_markdown_header(chart_data, + baseline_name) - summary_table += f""" -
+ for oln in outgroup_s: + summary_table += f"{oln.row}\n" -""" + summary_table += "\n
\n\n" - return summary_line, summary_table + summary_table += "
" -def generate_markdown(name: str, chart_data: dict[str, list[Result]]): - (summary_line, summary_table) = generate_summary_table_and_chart(chart_data) + if markdown_size == MarkdownSize.FULL: + return summary_line, summary_table + else: + full_content_size = len(summary_table) + len(summary_line) - return f""" -# Summary -{summary_line}\n -(result is better)\n -{summary_table} -# Details -{generate_markdown_details(chart_data[name])} -""" + if is_content_in_size_limit(content_size=full_content_size, + current_markdown_size=0): + return summary_line, summary_table + else: + if is_content_in_size_limit(content_size=len(summary_line), + current_markdown_size=0): + return summary_line, '' + else: + return ( + "\n# Summary\n" + "Benchmark output is too large to display\n\n" + ) + + +def generate_markdown(name: str, + chart_data: dict[str, list[Result]], + markdown_size: MarkdownSize): + (summary_line, summary_table) = generate_summary_table_and_chart( + chart_data, + name, + markdown_size + ) + + current_markdown_size = len(summary_line) + len(summary_table) + + generated_markdown = ( + "\n# Summary\n" + "(Emphasized values are the best results)\n" + f"{summary_line}\n" + f"{summary_table}\n\n" + ) + + if name in chart_data.keys(): + markdown_details = generate_markdown_details(chart_data[name], + current_markdown_size, + markdown_size) + generated_markdown += ( + "\n# Details\n" + f"{markdown_details}\n" + ) + + return generated_markdown