From bbd9097470d7ad35ecbaac22ffb2ef474dc4670a Mon Sep 17 00:00:00 2001 From: Agata Momot Date: Mon, 10 Feb 2025 12:54:03 +0100 Subject: [PATCH] change markdown output in benchmark PR comments add an option for limiting markdown content size calculate relative performance with different baselines calculate relative performance using only already saved data group results according to suite names and explicit groups add multiple data columns if multiple --compare specified --- .github/workflows/benchmarks-reusable.yml | 3 +- .github/workflows/benchmarks.yml | 2 +- scripts/benchmarks/README.md | 9 +- scripts/benchmarks/main.py | 19 +- scripts/benchmarks/options.py | 7 +- scripts/benchmarks/output_markdown.py | 436 +++++++++++++++------- 6 files changed, 335 insertions(+), 141 deletions(-) diff --git a/.github/workflows/benchmarks-reusable.yml b/.github/workflows/benchmarks-reusable.yml index bfd6064ba1..f8ef31fcc2 100644 --- a/.github/workflows/benchmarks-reusable.yml +++ b/.github/workflows/benchmarks-reusable.yml @@ -220,11 +220,12 @@ jobs: --compute-runtime ${{ inputs.compute_runtime_commit }} --build-igc ${{ inputs.upload_report && '--output-html' || '' }} + ${{ inputs.pr_no != 0 && '--output-markdown' || '' }} ${{ inputs.bench_script_params }} - name: Print benchmark results run: | - cat ${{ github.workspace }}/ur-repo/benchmark_results.md + cat ${{ github.workspace }}/ur-repo/benchmark_results.md || true - name: Add comment to PR uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1 diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml index 7de3926daf..edcb5c02f2 100644 --- a/.github/workflows/benchmarks.yml +++ b/.github/workflows/benchmarks.yml @@ -24,7 +24,7 @@ on: type: number required: true bench_script_params: - description: Parameters passed to script executing benchmark + description: Parameters passed to the script executing the benchmark (recommended `--compare baseline`) type: string required: false default: '' diff --git a/scripts/benchmarks/README.md b/scripts/benchmarks/README.md index 9cef0e52a3..4fe7f8b3c4 100644 --- a/scripts/benchmarks/README.md +++ b/scripts/benchmarks/README.md @@ -35,13 +35,16 @@ You must be a member of the `oneapi-src` organization to access these features. By default, the benchmark results are not stored. To store them, use the option `--save <name>`. This will make the results available for comparison during the next benchmark runs. -To compare a benchmark run with a previously stored result, use the option `--compare <name>`. You can compare with more than one result. - -If no `--compare` option is specified, the benchmark run is compared against a previously stored `baseline`. +You can compare benchmark results using the `--compare` option. To calculate the relative performance of new results against previously saved data, use `--compare <previously_saved_data>`. To compare only stored data, without generating new results, use `--dry-run --compare <name1> --compare <name2> --relative-perf <name1>`, where `name1` indicates the baseline for the relative performance calculation, `--dry-run` prevents the script from running benchmarks, and `--relative-perf` substitutes the name of the current run. Note that in the markdown output file (see below), listing more than two `--compare` options displays only execution times, without statistical analysis. Baseline, as well as baseline-v2 (for the level-zero adapter v2), is updated automatically during a nightly job.
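Editor's note: the sketch below is an editorial illustration of the two comparison workflows described in the README paragraph above; it is not part of the patch, and the script path, working directory, and result names (`name1`, `name2`) are placeholders.

```python
# Illustrative only: invoking the benchmark script the way the README describes.
import subprocess

# 1) Run the benchmarks and compare the fresh results against saved data.
subprocess.run(
    ["./main.py", "/path/to/benchmarks_workdir",
     "--compare", "baseline"],
    check=True,
)

# 2) Compare two already saved runs without running any benchmarks;
#    name1 is used as the baseline for the relative-performance column.
subprocess.run(
    ["./main.py", "/path/to/benchmarks_workdir", "--dry-run",
     "--compare", "name1", "--compare", "name2",
     "--relative-perf", "name1"],
    check=True,
)
```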
The results are stored [here](https://oneapi-src.github.io/unified-runtime/benchmark_results.html). + +## Output formats +You can display the results as an HTML file by using `--output-html`, or as a markdown file by using `--output-markdown`. Due to character limits for posting PR comments, the final content of the markdown file might be reduced. To obtain the full markdown output, use `--output-markdown full`. + + ## Requirements ### Python diff --git a/scripts/benchmarks/main.py b/scripts/benchmarks/main.py index 77524a6e02..a740c02672 100755 --- a/scripts/benchmarks/main.py +++ b/scripts/benchmarks/main.py @@ -189,9 +189,12 @@ def main(directory, additional_env_vars, save_name, compare_names, filter): benchmark.teardown() print("complete.") - this_name = "This PR" - chart_data = {this_name : results} + this_name = options.current_run_name + chart_data = {} + + if not options.dry_run: + chart_data = {this_name : results} history = BenchmarkHistory(directory) # limit how many files we load. @@ -199,7 +202,7 @@ def main(directory, additional_env_vars, save_name, compare_names, filter): history.load(1000) # remove duplicates. this can happen if e.g., --compare baseline is specified manually. - compare_names = list(dict.fromkeys(compare_names)) + compare_names = list(dict.fromkeys(compare_names)) if compare_names is not None else [] for name in compare_names: compare_result = history.get_compare(name) @@ -207,7 +210,7 @@ def main(directory, additional_env_vars, save_name, compare_names, filter): chart_data[name] = compare_result.results if options.output_markdown: - markdown_content = generate_markdown(this_name, chart_data) + markdown_content = generate_markdown(this_name, chart_data, options.output_markdown) with open('benchmark_results.md', 'w') as file: file.write(markdown_content) @@ -251,7 +254,7 @@ def validate_and_parse_env_args(env_args): parser.add_argument("--no-rebuild", help='Do not rebuild the benchmarks from scratch.', action="store_true") parser.add_argument("--env", type=str, help='Use env variable for a benchmark run.', action="append", default=[]) parser.add_argument("--save", type=str, help='Save the results for comparison under a specified name.') - parser.add_argument("--compare", type=str, help='Compare results against previously saved data.', action="append", default=["baseline"]) + parser.add_argument("--compare", type=str, help='Compare results against previously saved data.', action="append") parser.add_argument("--iterations", type=int, help='Number of times to run each benchmark to select a median value.', default=options.iterations) parser.add_argument("--stddev-threshold", type=float, help='If stddev pct is above this threshold, rerun all iterations', default=options.stddev_threshold) parser.add_argument("--timeout", type=int, help='Timeout for individual benchmarks in seconds.', default=options.timeout) @@ -261,12 +264,13 @@ def validate_and_parse_env_args(env_args): parser.add_argument("--exit-on-failure", help='Exit on first failure.', action="store_true") parser.add_argument("--compare-type", type=str, choices=[e.value for e in Compare], help='Compare results against previously saved data.', default=Compare.LATEST.value) parser.add_argument("--compare-max", type=int, help='How many results to read for comparisions', default=options.compare_max) + parser.add_argument("--output-markdown", nargs='?', const=options.output_markdown, help='Specify whether markdown output should fit the content size limit for request validation')
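Editor's note: a minimal, self-contained sketch (not part of the patch) of the size-limit rule the new markdown output is expected to follow, assuming the `MarkdownSize` enum added in `options.py` and the 65536-character limit used as `max_markdown_size` in `output_markdown.py`; the function and variable names below are invented for illustration.

```python
# Illustrative only: trim the PR-comment markdown unless 'full' was requested.
from enum import Enum


class MarkdownSize(Enum):
    SHORT = 'short'  # default: keep the comment within the GitHub size limit
    FULL = 'full'    # --output-markdown full: never trim


MAX_MARKDOWN_SIZE = 65536  # request-validation limit for PR comments


def fits(content: str, already_written: int, size: MarkdownSize) -> bool:
    """Return True if `content` can still be appended to the comment."""
    if size == MarkdownSize.FULL:
        return True
    budget = max(0, MAX_MARKDOWN_SIZE - already_written)
    return len(content) <= budget


# A 70 kB details section is dropped in SHORT mode but kept in FULL mode.
details = "x" * 70000
print(fits(details, already_written=0, size=MarkdownSize.SHORT))  # False
print(fits(details, already_written=0, size=MarkdownSize.FULL))   # True
```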
parser.add_argument("--output-html", help='Create HTML output', action="store_true", default=False) - parser.add_argument("--output-markdown", help='Create Markdown output', action="store_true", default=True) parser.add_argument("--dry-run", help='Do not run any actual benchmarks', action="store_true", default=False) parser.add_argument("--compute-runtime", nargs='?', const=options.compute_runtime_tag, help="Fetch and build compute runtime") parser.add_argument("--iterations-stddev", type=int, help="Max number of iterations of the loop calculating stddev after completed benchmark runs", default=options.iterations_stddev) parser.add_argument("--build-igc", help="Build IGC from source instead of using the OS-installed version", action="store_true", default=options.build_igc) + parser.add_argument("--relative-perf", type=str, help="The name of the results which should be used as a baseline for metrics calculation", default=options.current_run_name) args = parser.parse_args() additional_env_vars = validate_and_parse_env_args(args.env) @@ -283,12 +287,13 @@ def validate_and_parse_env_args(env_args): options.exit_on_failure = args.exit_on_failure options.compare = Compare(args.compare_type) options.compare_max = args.compare_max - options.output_html = args.output_html options.output_markdown = args.output_markdown + options.output_html = args.output_html options.dry_run = args.dry_run options.umf = args.umf options.iterations_stddev = args.iterations_stddev options.build_igc = args.build_igc + options.current_run_name = args.relative_perf if args.build_igc and args.compute_runtime is None: parser.error("--build-igc requires --compute-runtime to be set") diff --git a/scripts/benchmarks/options.py b/scripts/benchmarks/options.py index 1bd79f6878..772fee2e02 100644 --- a/scripts/benchmarks/options.py +++ b/scripts/benchmarks/options.py @@ -6,6 +6,10 @@ class Compare(Enum): AVERAGE = 'average' MEDIAN = 'median' +class MarkdownSize(Enum): + SHORT = 'short' + FULL = 'full' + @dataclass class Options: workdir: str = None @@ -20,8 +24,8 @@ class Options: verbose: bool = False compare: Compare = Compare.LATEST compare_max: int = 10 # average/median over how many results + output_markdown: MarkdownSize = MarkdownSize.SHORT output_html: bool = False - output_markdown: bool = True dry_run: bool = False # these two should probably be merged into one setting stddev_threshold: float = 0.02 @@ -32,6 +36,7 @@ class Options: extra_env_vars: dict = field(default_factory=dict) compute_runtime_tag: str = '24.52.32224.10' build_igc: bool = False + current_run_name: str = "This PR" options = Options() diff --git a/scripts/benchmarks/output_markdown.py b/scripts/benchmarks/output_markdown.py index fc3b65507b..daa01e34ff 100644 --- a/scripts/benchmarks/output_markdown.py +++ b/scripts/benchmarks/output_markdown.py @@ -1,12 +1,13 @@ -# Copyright (C) 2024 Intel Corporation -# Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. +# Copyright (C) 2024-2025 Intel Corporation +# Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM +# Exceptions. 
# See LICENSE.TXT # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -import collections, re +import collections from benches.result import Result -from options import options -import math +from options import options, MarkdownSize +import ast class OutputLine: def __init__(self, name): @@ -14,6 +15,8 @@ def __init__(self, name): self.diff = None self.bars = None self.row = "" + self.suite = "Unknown" + self.explicit_group = "" def __str__(self): return f"(Label:{self.label}, diff:{self.diff})" @@ -21,40 +24,167 @@ def __str__(self): def __repr__(self): return self.__str__() -# Function to generate the markdown collapsible sections for each variant -def generate_markdown_details(results: list[Result]): - markdown_sections = [] +# The number of the required columns in the markdown table, +# independent of the chart_data content. +# Required columns: +# - benchmark_name +# +# optional +1: relative performance +num_info_columns = 1 + +# Number of columns required for relative performance change calculation. +# In case of multiple provided saved baselines to compare, the relative +# performance is not calculated, since the base (hopefully) usage case +# for this script would be comparing the performance of PR with the main branch +num_baselines_required_for_rel_change = 2 + +# Maximum number of characters that is allowed in request validation +# for posting comments in GitHub PRs +max_markdown_size = 65536 + + +def is_relative_perf_comparison_to_be_performed(chart_data: + dict[str, list[Result]], + baseline_name: str): + return (len(chart_data) == num_baselines_required_for_rel_change) and \ + (baseline_name in chart_data.keys()) + + +def get_chart_markdown_header(chart_data: dict[str, list[Result]], + baseline_name: str): + summary_header = '' + final_num_columns = num_info_columns + + if is_relative_perf_comparison_to_be_performed(chart_data, baseline_name): + summary_header = "| Benchmark | " + " | ".join(chart_data.keys()) + \ + " | Change |\n" + final_num_columns += 1 + else: + summary_header = "| Benchmark | " + " | ".join(chart_data.keys()) + \ + " |\n" + + summary_header += "|---" * (len(chart_data) + final_num_columns) + "|\n" + + return summary_header + + +def get_improved_regressed_summary(is_improved: bool, rows_count: int): + title = "Improved" + if not is_improved: + title = "Regressed" + + summary = ( + "\n
\n" + "\n" + f"{title} {rows_count} " + f"(threshold {options.epsilon*100:.2f}%)\n" + "\n\n" + ) + + return summary + + +def get_relative_perf_summary(group_size: int, group_name: str): + summary = ( + "\n
\n" + f" Relative perf in group {group_name} " + f"({group_size})\n" + "\n\n" + ) + + return summary + - markdown_sections.append(f""" -
<details> -<summary>Benchmark details - environment, command...</summary> -""") +def get_main_branch_run_name(chart_data: dict[str, list[Result]], + baseline_name: str): + for key in chart_data.keys(): + if key != baseline_name: + return key + + return None - for res in results: - env_vars_str = '\n'.join(f"{key}={value}" for key, value in res.env.items()) - markdown_sections.append(f""" -
<details> -<summary>{res.label}</summary> -#### Environment Variables: -{env_vars_str} +def get_available_markdown_size(current_markdown_size: int): + return max(0, max_markdown_size - current_markdown_size) -#### Command: -{' '.join(res.command)} -
-""") - markdown_sections.append(f""" -
-""") - return "\n".join(markdown_sections) +def is_content_in_size_limit(content_size: int, current_markdown_size: int): + return content_size <= get_available_markdown_size(current_markdown_size) -def generate_summary_table_and_chart(chart_data: dict[str, list[Result]]): - summary_table = "| Benchmark | " + " | ".join(chart_data.keys()) + " | Relative perf | Change | - |\n" - summary_table += "|---" * (len(chart_data) + 4) + "|\n" + +def get_explicit_group_name(result: Result): + explicit_group_name = result.explicit_group + + if explicit_group_name != "": + return explicit_group_name + else: + return "Other" + + +# Function to generate the markdown collapsible sections for each variant +def generate_markdown_details(results: list[Result], + current_markdown_size: int, + markdown_size: MarkdownSize): + markdown_sections = [] + markdown_start = ("\n
\n" + "Benchmark details - environment, command..." + "\n") + markdown_sections.append(markdown_start) + + for res in results: + env_dict = res.env + command = res.command + + # If data is collected from already saved results, + # the content is parsed as strings + if isinstance(res.env, str): + # Since the scripts would be used solely on data prepared + # by our scripts, this should be safe + # However, maybe needs an additional blessing + # https://docs.python.org/3/library/ast.html#ast.literal_eval + env_dict = ast.literal_eval(res.env) + if isinstance(res.command, str): + command = ast.literal_eval(res.command) + + section = ("\n
\n" + f"{res.label}\n\n" + "#### Command:\n" + f"{' '.join(command)}\n\n") + + if env_dict: + env_vars_str = '\n'.join(f"{key}={value}" + for key, value in env_dict.items()) + section += (f"#### Environment Variables:\n {env_vars_str}\n") + + section += "\n
\n" + + markdown_sections.append(section) + + markdown_sections.append("\n
\n") + + full_markdown = "\n".join(markdown_sections) + + if markdown_size == MarkdownSize.FULL: + return full_markdown + else: + if is_content_in_size_limit(len(full_markdown), current_markdown_size): + return full_markdown + else: + return "\nBenchmark details contain too many chars to display\n" + +def generate_summary_table_and_chart(chart_data: dict[str, list[Result]], + baseline_name: str, + markdown_size: MarkdownSize): + summary_table = get_chart_markdown_header(chart_data=chart_data, + baseline_name=baseline_name) # Collect all benchmarks and their results + # key: benchmark name, + # value: dict(run_name : single_result_in_the_given_run) benchmark_results = collections.defaultdict(dict) + + # key: run name + # results: results from different benchmarks collected in the named run for key, results in chart_data.items(): for res in results: benchmark_results[res.name][key] = res @@ -62,159 +192,209 @@ def generate_summary_table_and_chart(chart_data: dict[str, list[Result]]): # Generate the table rows output_detailed_list = [] - - global_product = 1 - mean_cnt = 0 - improved = 0 - regressed = 0 - no_change = 0 - for bname, results in benchmark_results.items(): oln = OutputLine(bname) oln.row = f"| {bname} |" best_value = None best_key = None - # Determine the best value + are_suite_group_assigned = False + + # Determine the best value for the given benchmark, among the results + # from all saved runs specified by --compare + # key: run name, + # res: single result collected in the given run for key, res in results.items(): - if best_value is None or (res.lower_is_better and res.value < best_value) or (not res.lower_is_better and res.value > best_value): + if not are_suite_group_assigned: + oln.suite = res.suite + oln.explicit_group = get_explicit_group_name(res) + + are_suite_group_assigned = True + + if best_value is None or \ + (res.lower_is_better and res.value < best_value) or \ + (not res.lower_is_better and res.value > best_value): best_value = res.value best_key = key - # Generate the row with the best value highlighted + # Generate the row with all the results from saved runs specified by + # --compare, + # Highight the best value in the row with data if options.verbose: print(f"Results: {results}") for key in chart_data.keys(): if key in results: intv = results[key].value if key == best_key: - oln.row += f" {intv:3f} {results[key].unit} |" # Highlight the best value + # Highlight the best value + oln.row += f" {intv:3f} {results[key].unit} |" else: oln.row += f" {intv:.3f} {results[key].unit} |" else: oln.row += " - |" - if len(chart_data.keys()) == 2: - key0 = list(chart_data.keys())[0] - key1 = list(chart_data.keys())[1] - if (key0 in results) and (key1 in results): - v0 = results[key0].value - v1 = results[key1].value + if is_relative_perf_comparison_to_be_performed(chart_data, + baseline_name): + pr_key = baseline_name + main_key = get_main_branch_run_name(chart_data, baseline_name) + + if (pr_key in results) and (main_key in results): + pr_val = results[pr_key].value + main_val = results[main_key].value diff = None - if v0 != 0 and results[key0].lower_is_better: - diff = v1/v0 - elif v1 != 0 and not results[key0].lower_is_better: - diff = v0/v1 + if pr_val != 0 and results[pr_key].lower_is_better: + diff = main_val / pr_val + elif main_val != 0 and not results[pr_key].lower_is_better: + diff = pr_val / main_val if diff != None: - oln.row += f"{(diff * 100):.2f}%" oln.diff = diff output_detailed_list.append(oln) - sorted_detailed_list = sorted(output_detailed_list, 
key=lambda x: (x.diff is not None, x.diff), reverse=True) + sorted_detailed_list = sorted(output_detailed_list, key=lambda x: + (x.diff is not None, x.diff), reverse=True) - diff_values = [oln.diff for oln in sorted_detailed_list if oln.diff is not None] + diff_values = [oln.diff for oln in sorted_detailed_list + if oln.diff is not None] - if len(diff_values) > 0: - max_diff = max(max(diff_values) - 1, 1 - min(diff_values)) + improved_rows = [] + regressed_rows = [] + if len(diff_values) > 0: for oln in sorted_detailed_list: if oln.diff != None: - oln.row += f" | {(oln.diff - 1)*100:.2f}%" delta = oln.diff - 1 - oln.bars = round(10*(oln.diff - 1)/max_diff) if max_diff != 0.0 else 0 - if oln.bars == 0 or abs(delta) < options.epsilon: - oln.row += " | . |" - elif oln.bars > 0: - oln.row += f" | {'+' * oln.bars} |" - else: - oln.row += f" | {'-' * (-oln.bars)} |" + oln.row += f" {delta*100:.2f}%" - mean_cnt += 1 if abs(delta) > options.epsilon: if delta > 0: - improved+=1 + improved_rows.append(oln.row + " | \n") else: - regressed+=1 - else: - no_change+=1 - - global_product *= oln.diff - else: - oln.row += " | |" + regressed_rows.append(oln.row + " | \n") if options.verbose: print(oln.row) + summary_table += oln.row + "\n" else: for oln in sorted_detailed_list: - oln.row += " | |" - if options.verbose: print(oln.row) summary_table += oln.row + "\n" - grouped_objects = collections.defaultdict(list) - - for oln in output_detailed_list: - s = oln.label - prefix = re.match(r'^[^_\s]+', s)[0] - grouped_objects[prefix].append(oln) - - grouped_objects = dict(grouped_objects) - - if mean_cnt > 0: - global_mean = global_product ** (1/mean_cnt) - summary_line = f"Total {mean_cnt} benchmarks in mean. " - summary_line += "\n" + f"Geomean {global_mean*100:.3f}%. \nImproved {improved} Regressed {regressed} (threshold {options.epsilon*100:.2f}%)" - else: + regressed_rows.reverse() + + is_at_least_one_diff = False + summary_line = '' + + if len(improved_rows) > 0: + is_at_least_one_diff = True + summary_line += get_improved_regressed_summary( + is_improved=True, + rows_count=len(improved_rows) + ) + summary_line += get_chart_markdown_header( + chart_data=chart_data, + baseline_name=baseline_name + ) + + for row in improved_rows: + summary_line += row + + summary_line += "\n
" + + if len(regressed_rows) > 0: + is_at_least_one_diff = True + summary_line += get_improved_regressed_summary( + is_improved=False, + rows_count=len(regressed_rows) + ) + + summary_line += get_chart_markdown_header( + chart_data=chart_data, + baseline_name=baseline_name + ) + + for row in regressed_rows: + summary_line += row + + summary_line += "\n
" + + if not is_at_least_one_diff: summary_line = f"No diffs to calculate performance change" if options.verbose: print(summary_line) - summary_table = "\n## Performance change in benchmark groups\n" - for name, outgroup in grouped_objects.items(): - outgroup_s = sorted(outgroup, key=lambda x: (x.diff is not None, x.diff), reverse=True) - product = 1.0 - n = len(outgroup_s) - r = 0 - for oln in outgroup_s: - if oln.diff != None: - product *= oln.diff - r += 1 - if r > 0: - summary_table += f""" -
<details> -<summary> Relative perf in group {name} ({n}): {math.pow(product, 1/r)*100:.3f}% </summary> - -""" - else: - summary_table += f""" -
<details> -<summary> Relative perf in group {name} ({n}): cannot calculate </summary> + grouped_in_suites = collections.defaultdict(lambda: + collections.defaultdict(list)) + for oln in output_detailed_list: + grouped_in_suites[oln.suite][oln.explicit_group].append(oln) + + for suite_name, suite_groups in grouped_in_suites.items(): + summary_table += f"
{suite_name}\n\n" -""" - summary_table += "| Benchmark | " + " | ".join(chart_data.keys()) + " | Relative perf | Change | - |\n" - summary_table += "|---" * (len(chart_data) + 4) + "|\n" + for name, outgroup in suite_groups.items(): + outgroup_s = sorted(outgroup, key=lambda x: + (x.diff is not None, x.diff), reverse=True) - for oln in outgroup_s: - summary_table += f"{oln.row}\n" + summary_table += get_relative_perf_summary( + group_size=len(outgroup_s), + group_name=name + ) + summary_table += get_chart_markdown_header(chart_data, + baseline_name) - summary_table += f""" -
+ for oln in outgroup_s: + summary_table += f"{oln.row}\n" -""" + summary_table += "\n
\n\n" - return summary_line, summary_table + summary_table += "
" -def generate_markdown(name: str, chart_data: dict[str, list[Result]]): - (summary_line, summary_table) = generate_summary_table_and_chart(chart_data) + if markdown_size == MarkdownSize.FULL: + return summary_line, summary_table + else: + full_content_size = len(summary_table) + len(summary_line) - return f""" -# Summary -{summary_line}\n -(result is better)\n -{summary_table} -# Details -{generate_markdown_details(chart_data[name])} -""" + if is_content_in_size_limit(content_size=full_content_size, + current_markdown_size=0): + return summary_line, summary_table + else: + if is_content_in_size_limit(content_size=len(summary_line), + current_markdown_size=0): + return summary_line, '' + else: + return ( + "\n# Summary\n" + "Benchmark output is too large to display\n\n" + ) + + +def generate_markdown(name: str, + chart_data: dict[str, list[Result]], + markdown_size: MarkdownSize): + (summary_line, summary_table) = generate_summary_table_and_chart( + chart_data, + name, + markdown_size + ) + + current_markdown_size = len(summary_line) + len(summary_table) + + generated_markdown = ( + "\n# Summary\n" + "(Emphasized values are the best results)\n" + f"{summary_line}\n" + f"{summary_table}\n\n" + ) + + if name in chart_data.keys(): + markdown_details = generate_markdown_details(chart_data[name], + current_markdown_size, + markdown_size) + generated_markdown += ( + "\n# Details\n" + f"{markdown_details}\n" + ) + + return generated_markdown