diff --git a/tools/perf/perf_run.py b/tools/perf/perf_run.py
index 4475155221..ca37316ea8 100644
--- a/tools/perf/perf_run.py
+++ b/tools/perf/perf_run.py
@@ -175,11 +175,17 @@ def run_ts_trt(model, input_tensors, params, precision, batch_size):
         "inputs": input_tensors,
         "enabled_precisions": {precision_to_dtype(precision)},
         "truncate_long_and_double": params.get("truncate", False),
+        "use_python_runtime": params.get("use_python_runtime", False),
     }
 
     if precision == "int8":
         compile_settings.update({"calib": params.get("calibration_cache")})
 
+    if params.get("enable_cuda_graph", False):
+        logging.warning(
+            "TorchScript backend doesn't support CUDA Graphs. `--enable_cuda_graph` will be ignored."
+        )
+
     start_compile = timeit.default_timer()
     model = torchtrt.compile(model, ir="ts", **compile_settings)
     end_compile = timeit.default_timer()
@@ -217,19 +223,34 @@ def run_hf_dynamo(model, input_tensors, params, precision, batch_size):
         inputs=input_tensors,
         enabled_precisions={precision_to_dtype(precision)},
         truncate_double=params.get("truncate", False),
+        use_python_runtime=params.get("use_python_runtime", False),
     )
     end_compile = timeit.default_timer()
     compile_time_s = end_compile - start_compile
-    record_llm_perf(
-        trt_model,
-        "Dynamo",
-        input_tensors,
-        precision,
-        osl,
-        batch_size,
-        iters,
-        compile_time_s,
-    )
+
+    if params.get("enable_cuda_graph", False):
+        with torchtrt.runtime.enable_cudagraphs(trt_model) as cudagraphs_module:
+            record_llm_perf(
+                cudagraphs_module,
+                "Dynamo",
+                input_tensors,
+                precision,
+                osl,
+                batch_size,
+                iters,
+                compile_time_s,
+            )
+    else:
+        record_llm_perf(
+            trt_model,
+            "Dynamo",
+            input_tensors,
+            precision,
+            osl,
+            batch_size,
+            iters,
+            compile_time_s,
+        )
 
 
 @run_with_try_except
@@ -262,14 +283,27 @@ def run_dynamo(model, input_tensors, params, precision, batch_size):
         ),
         cache_built_engines=params.get("cache_built_engines", False),
         reuse_cached_engines=params.get("reuse_cached_engines", False),
+        use_python_runtime=params.get("use_python_runtime", False),
     )
     end_compile = timeit.default_timer()
     compile_time_s = end_compile - start_compile
     iters = params.get("iterations", 20)
 
-    record_perf(
-        model, "Dynamo", input_tensors, precision, iters, batch_size, compile_time_s
-    )
+    if params.get("enable_cuda_graph", False):
+        with torchtrt.runtime.enable_cudagraphs(model) as cudagraphs_module:
+            record_perf(
+                cudagraphs_module,
+                "Dynamo",
+                input_tensors,
+                precision,
+                iters,
+                batch_size,
+                compile_time_s,
+            )
+    else:
+        record_perf(
+            model, "Dynamo", input_tensors, precision, iters, batch_size, compile_time_s
+        )
 
 
 @run_with_try_except
@@ -292,6 +326,7 @@ def run_torch_compile(model, input_tensors, params, precision, batch_size):
         "enabled_precisions": {precision_to_dtype(precision)},
         "truncate": params.get("truncate", False),
         "min_block_size": params.get("min_block_size", 1),
+        "use_python_runtime": params.get("use_python_runtime", False),
     }
     start_compile = timeit.default_timer()
     model = torch.compile(model, backend="tensorrt", dynamic=None, options=compile_spec)
@@ -300,15 +335,27 @@
     compile_time_s = end_compile - start_compile
     iters = params.get("iterations", 20)
 
-    record_perf(
-        model,
-        "torch_compile",
-        input_tensors,
-        precision,
-        iters,
-        batch_size,
-        compile_time_s,
-    )
+    if params.get("enable_cuda_graph", False):
+        with torchtrt.runtime.enable_cudagraphs(model) as cudagraphs_module:
+            record_perf(
+                cudagraphs_module,
+                "torch_compile",
+                input_tensors,
+                precision,
+                iters,
+                batch_size,
+                compile_time_s,
+            )
+    else:
+        record_perf(
+            model,
+            "torch_compile",
+            input_tensors,
+            precision,
+            iters,
+            batch_size,
+            compile_time_s,
+        )
 
 
 @run_with_try_except
@@ -320,9 +367,13 @@ def run_hf_inductor(model, input_tensors, params, precision, batch_size):
     # Mark dynamic shapes for input sequence
     input_seq = input_tensors[0]
     torch._dynamo.mark_dynamic(input_seq, 1, min=1, max=osl)
+    mode = "max-autotune"
+    if params.get("enable_cuda_graph", False):
+        mode = "reduce-overhead"
+
     start_compile = timeit.default_timer()
     # Compile the model
-    model = torch.compile(model, backend="inductor", dynamic=None, mode="max-autotune")
+    model = torch.compile(model, backend="inductor", dynamic=None, mode=mode)
     model(input_seq)
     end_compile = timeit.default_timer()
     compile_time_s = end_compile - start_compile
@@ -356,15 +407,25 @@ def run_inductor(model, input_tensors, params, precision, batch_size):
     if params["is_text_llm"]:
         return run_hf_inductor(model, input_tensors, params, precision, batch_size)
 
+    mode = "max-autotune"
+    if params.get("enable_cuda_graph", False):
+        mode = "reduce-overhead"
+
     start_compile = timeit.default_timer()
-    model = torch.compile(model, backend="inductor", dynamic=None, mode="max-autotune")
+    model = torch.compile(model, backend="inductor", dynamic=None, mode=mode)
     model(*input_tensors)
     end_compile = timeit.default_timer()
     compile_time_s = end_compile - start_compile
     iters = params.get("iterations", 20)
 
     record_perf(
-        model, "inductor", input_tensors, precision, iters, batch_size, compile_time_s
+        model,
+        "inductor",
+        input_tensors,
+        precision,
+        iters,
+        batch_size,
+        compile_time_s,
     )
 
 
@@ -587,6 +648,16 @@ def run(
         action="store_true",
         help="Boolean flag to determine if the user provided model is a TRT engine or not",
     )
+    arg_parser.add_argument(
+        "--use_python_runtime",
+        action="store_true",
+        help="Whether to use the Python runtime. The C++ runtime is used by default",
+    )
+    arg_parser.add_argument(
+        "--enable_cuda_graph",
+        action="store_true",
+        help="Whether to enable CUDA Graphs. Disabled by default",
+    )
     arg_parser.add_argument(
         "--report",
         type=str,
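
For reference, a hypothetical invocation exercising the two new flags might look like the sketch below. Apart from --use_python_runtime, --enable_cuda_graph, and --report, which appear in this patch, the remaining flag names and values are assumptions about the existing perf_run.py interface and may differ from the actual script:

    python tools/perf/perf_run.py \
        --backends dynamo \
        --model <path_to_model> \
        --precision fp16 \
        --batch_size 1 \
        --iterations 20 \
        --use_python_runtime \
        --enable_cuda_graph \
        --report cuda_graph_report.json

With --enable_cuda_graph set, the Dynamo and torch_compile TensorRT paths wrap the compiled module in torchtrt.runtime.enable_cudagraphs(...) for the timed loop, the inductor paths switch torch.compile to mode="reduce-overhead", and the TorchScript path logs a warning and ignores the flag.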