change markdown output in benchmark PR comments
add an option for limiting markdown content size
calculate relative performance with different baselines
calculate relative performance using only already saved data
group results according to suite names and explicit groups
add multiple data columns if multiple --compare specified
EuphoricThinking committed Feb 17, 2025
1 parent f66751d commit bbd9097
Showing 6 changed files with 335 additions and 141 deletions.
3 changes: 2 additions & 1 deletion .github/workflows/benchmarks-reusable.yml
@@ -220,11 +220,12 @@ jobs:
--compute-runtime ${{ inputs.compute_runtime_commit }}
--build-igc
${{ inputs.upload_report && '--output-html' || '' }}
${{ inputs.pr_no != 0 && '--output-markdown' || '' }}
${{ inputs.bench_script_params }}
- name: Print benchmark results
run: |
cat ${{ github.workspace }}/ur-repo/benchmark_results.md
cat ${{ github.workspace }}/ur-repo/benchmark_results.md || true
- name: Add comment to PR
uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
2 changes: 1 addition & 1 deletion .github/workflows/benchmarks.yml
@@ -24,7 +24,7 @@ on:
type: number
required: true
bench_script_params:
description: Parameters passed to script executing benchmark
description: Parameters passed to the script executing the benchmarks (recommended: `--compare baseline`)
type: string
required: false
default: ''
9 changes: 6 additions & 3 deletions scripts/benchmarks/README.md
@@ -35,13 +35,16 @@ You must be a member of the `oneapi-src` organization to access these features.

By default, the benchmark results are not stored. To store them, use the option `--save <name>`. This will make the results available for comparison during the next benchmark runs.
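
A minimal sketch of such a run, assuming the script is invoked as `scripts/benchmarks/main.py` with a working-directory argument (the path and result name below are illustrative):

```bash
# Run the benchmarks and store the results under the name "my-results"
# so that later runs can compare against them.
python3 scripts/benchmarks/main.py ~/benchmarks_workdir --save my-results
```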

To compare a benchmark run with a previously stored result, use the option `--compare <name>`. You can compare with more than one result.

If no `--compare` option is specified, the benchmark run is compared against a previously stored `baseline`.
You can compare benchmark results using the `--compare` option. In the markdown output file (see below), listing more than two `--compare` options displays only execution times, without statistical analysis. To calculate the relative performance of new results against previously saved data, use `--compare <previously_saved_data>`. To compare only stored data, without generating new results, use `--dry-run --compare <name1> --compare <name2> --relative-perf <name1>`, where `<name1>` is the baseline for the relative performance calculation, `--dry-run` prevents the script from running benchmarks, and `--relative-perf` substitutes the name of the current run.
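
For example, a dry-run comparison of two stored result sets might look like this (the names and path are placeholders; `name1` becomes the baseline for the relative-performance columns):

```bash
# Compare two previously saved result sets without running any benchmarks;
# relative performance is calculated against "name1".
python3 scripts/benchmarks/main.py ~/benchmarks_workdir --dry-run \
    --compare name1 --compare name2 --relative-perf name1 --output-markdown
```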

Baseline, as well as baseline-v2 (for the level-zero adapter v2), is updated automatically during a nightly job. The results
are stored [here](https://oneapi-src.github.io/unified-runtime/benchmark_results.html).


## Output formats
You can display the results as an HTML file by using `--output-html` and as a markdown file by using `--output-markdown`. Due to the character limit for PR comments, the final content of the markdown file might be reduced. To obtain the full markdown output, use `--output-markdown full`.
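
For instance, to produce both outputs with the full, non-truncated markdown report, an invocation along these lines should work (the path is illustrative):

```bash
# Generate an HTML report and a markdown report without the PR-comment size limit.
python3 scripts/benchmarks/main.py ~/benchmarks_workdir --output-html --output-markdown full
```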


## Requirements

### Python
19 changes: 12 additions & 7 deletions scripts/benchmarks/main.py
@@ -189,25 +189,28 @@ def main(directory, additional_env_vars, save_name, compare_names, filter):
benchmark.teardown()
print("complete.")

this_name = "This PR"

chart_data = {this_name : results}
this_name = options.current_run_name
chart_data = {}

if not options.dry_run:
chart_data = {this_name : results}

history = BenchmarkHistory(directory)
# limit how many files we load.
# should this be configurable?
history.load(1000)

# remove duplicates. this can happen if e.g., --compare baseline is specified manually.
compare_names = list(dict.fromkeys(compare_names))
compare_names = list(dict.fromkeys(compare_names)) if compare_names is not None else []

for name in compare_names:
compare_result = history.get_compare(name)
if compare_result:
chart_data[name] = compare_result.results

if options.output_markdown:
markdown_content = generate_markdown(this_name, chart_data)
markdown_content = generate_markdown(this_name, chart_data, options.output_markdown)

with open('benchmark_results.md', 'w') as file:
file.write(markdown_content)
@@ -251,7 +254,7 @@ def validate_and_parse_env_args(env_args):
parser.add_argument("--no-rebuild", help='Do not rebuild the benchmarks from scratch.', action="store_true")
parser.add_argument("--env", type=str, help='Use env variable for a benchmark run.', action="append", default=[])
parser.add_argument("--save", type=str, help='Save the results for comparison under a specified name.')
parser.add_argument("--compare", type=str, help='Compare results against previously saved data.', action="append", default=["baseline"])
parser.add_argument("--compare", type=str, help='Compare results against previously saved data.', action="append")
parser.add_argument("--iterations", type=int, help='Number of times to run each benchmark to select a median value.', default=options.iterations)
parser.add_argument("--stddev-threshold", type=float, help='If stddev pct is above this threshold, rerun all iterations', default=options.stddev_threshold)
parser.add_argument("--timeout", type=int, help='Timeout for individual benchmarks in seconds.', default=options.timeout)
@@ -261,12 +264,13 @@ def validate_and_parse_env_args(env_args):
parser.add_argument("--exit-on-failure", help='Exit on first failure.', action="store_true")
parser.add_argument("--compare-type", type=str, choices=[e.value for e in Compare], help='Compare results against previously saved data.', default=Compare.LATEST.value)
parser.add_argument("--compare-max", type=int, help='How many results to read for comparisions', default=options.compare_max)
parser.add_argument("--output-markdown", nargs='?', const=options.output_markdown, help='Specify whether markdown output should fit the content size limit for request validation')
parser.add_argument("--output-html", help='Create HTML output', action="store_true", default=False)
parser.add_argument("--output-markdown", help='Create Markdown output', action="store_true", default=True)
parser.add_argument("--dry-run", help='Do not run any actual benchmarks', action="store_true", default=False)
parser.add_argument("--compute-runtime", nargs='?', const=options.compute_runtime_tag, help="Fetch and build compute runtime")
parser.add_argument("--iterations-stddev", type=int, help="Max number of iterations of the loop calculating stddev after completed benchmark runs", default=options.iterations_stddev)
parser.add_argument("--build-igc", help="Build IGC from source instead of using the OS-installed version", action="store_true", default=options.build_igc)
parser.add_argument("--relative-perf", type=str, help="The name of the results which should be used as a baseline for metrics calculation", default=options.current_run_name)

args = parser.parse_args()
additional_env_vars = validate_and_parse_env_args(args.env)
@@ -283,12 +287,13 @@ def validate_and_parse_env_args(env_args):
options.exit_on_failure = args.exit_on_failure
options.compare = Compare(args.compare_type)
options.compare_max = args.compare_max
options.output_html = args.output_html
options.output_markdown = args.output_markdown
options.output_html = args.output_html
options.dry_run = args.dry_run
options.umf = args.umf
options.iterations_stddev = args.iterations_stddev
options.build_igc = args.build_igc
options.current_run_name = args.relative_perf

if args.build_igc and args.compute_runtime is None:
parser.error("--build-igc requires --compute-runtime to be set")
7 changes: 6 additions & 1 deletion scripts/benchmarks/options.py
@@ -6,6 +6,10 @@ class Compare(Enum):
AVERAGE = 'average'
MEDIAN = 'median'

class MarkdownSize(Enum):
SHORT = 'short'
FULL = 'full'

@dataclass
class Options:
workdir: str = None
@@ -20,8 +24,8 @@ class Options:
verbose: bool = False
compare: Compare = Compare.LATEST
compare_max: int = 10 # average/median over how many results
output_markdown: MarkdownSize = MarkdownSize.SHORT
output_html: bool = False
output_markdown: bool = True
dry_run: bool = False
# these two should probably be merged into one setting
stddev_threshold: float = 0.02
@@ -32,6 +36,7 @@ class Options:
extra_env_vars: dict = field(default_factory=dict)
compute_runtime_tag: str = '24.52.32224.10'
build_igc: bool = False
current_run_name: str = "This PR"

options = Options()
