feat: add --use_python_runtime and --enable_cuda_graph args to the perf run script #3397

Open · wants to merge 4 commits into base: main (changes from all commits)
121 changes: 96 additions & 25 deletions tools/perf/perf_run.py
@@ -175,11 +175,17 @@ def run_ts_trt(model, input_tensors, params, precision, batch_size):
"inputs": input_tensors,
"enabled_precisions": {precision_to_dtype(precision)},
"truncate_long_and_double": params.get("truncate", False),
"use_python_runtime": params.get("use_python_runtime", False),
}

if precision == "int8":
compile_settings.update({"calib": params.get("calibration_cache")})

if params.get("enable_cuda_graph", False):
logging.warning(
f"Torchscript backend doesn't support CUDA Graphs. `--enable_cuda_graph` will be ignored."
)

start_compile = timeit.default_timer()
model = torchtrt.compile(model, ir="ts", **compile_settings)
end_compile = timeit.default_timer()
@@ -217,19 +223,34 @@ def run_hf_dynamo(model, input_tensors, params, precision, batch_size):
        inputs=input_tensors,
        enabled_precisions={precision_to_dtype(precision)},
        truncate_double=params.get("truncate", False),
        use_python_runtime=params.get("use_python_runtime", False),
    )
    end_compile = timeit.default_timer()
    compile_time_s = end_compile - start_compile
    record_llm_perf(
        trt_model,
        "Dynamo",
        input_tensors,
        precision,
        osl,
        batch_size,
        iters,
        compile_time_s,
    )

    if params.get("enable_cuda_graph", False):
        with torchtrt.runtime.enable_cudagraphs(trt_model) as cudagraphs_module:
            record_llm_perf(
                cudagraphs_module,
                "Dynamo",
                input_tensors,
                precision,
                osl,
                batch_size,
                iters,
                compile_time_s,
            )
    else:
        record_llm_perf(
            trt_model,
            "Dynamo",
            input_tensors,
            precision,
            osl,
            batch_size,
            iters,
            compile_time_s,
        )
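For readers skimming the diff: `torchtrt.runtime.enable_cudagraphs` is a context manager that yields a CUDA Graphs-wrapped module, and the PR times that wrapped module instead of the plain one. Below is a minimal standalone sketch of the same pattern outside the perf harness; the model and inputs are hypothetical stand-ins, while the compile keyword and context manager are used exactly as in the hunk above.

    # Minimal sketch (assumed setup, not part of the PR): wrap a compiled
    # module in the CUDA Graphs context manager before running inference,
    # mirroring the pattern the PR applies in each Dynamo backend.
    import torch
    import torch_tensorrt as torchtrt

    model = torch.nn.Linear(64, 64).eval().cuda()   # hypothetical stand-in model
    inputs = [torch.randn(8, 64, device="cuda")]

    trt_model = torchtrt.compile(
        model,
        ir="dynamo",
        inputs=inputs,
        use_python_runtime=True,  # the new --use_python_runtime flag maps onto this kwarg
    )

    # enable_cudagraphs yields a CUDA Graphs-wrapped module, as in the diff above.
    with torchtrt.runtime.enable_cudagraphs(trt_model) as cudagraphs_module:
        out = cudagraphs_module(*inputs)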


@run_with_try_except
@@ -262,14 +283,27 @@ def run_dynamo(model, input_tensors, params, precision, batch_size):
        ),
        cache_built_engines=params.get("cache_built_engines", False),
        reuse_cached_engines=params.get("reuse_cached_engines", False),
        use_python_runtime=params.get("use_python_runtime", False),
    )
    end_compile = timeit.default_timer()
    compile_time_s = end_compile - start_compile
    iters = params.get("iterations", 20)

    record_perf(
        model, "Dynamo", input_tensors, precision, iters, batch_size, compile_time_s
    )
    if params.get("enable_cuda_graph", False):
        with torchtrt.runtime.enable_cudagraphs(model) as cudagraphs_module:
            record_perf(
                cudagraphs_module,
                "Dynamo",
                input_tensors,
                precision,
                iters,
                batch_size,
                compile_time_s,
            )
    else:
        record_perf(
            model, "Dynamo", input_tensors, precision, iters, batch_size, compile_time_s
        )


@run_with_try_except
@@ -292,6 +326,7 @@ def run_torch_compile(model, input_tensors, params, precision, batch_size):
"enabled_precisions": {precision_to_dtype(precision)},
"truncate": params.get("truncate", False),
"min_block_size": params.get("min_block_size", 1),
"use_python_runtime": params.get("use_python_runtime", False),
}
start_compile = timeit.default_timer()
model = torch.compile(model, backend="tensorrt", dynamic=None, options=compile_spec)
@@ -300,15 +335,27 @@ def run_torch_compile(model, input_tensors, params, precision, batch_size):
    compile_time_s = end_compile - start_compile
    iters = params.get("iterations", 20)

    record_perf(
        model,
        "torch_compile",
        input_tensors,
        precision,
        iters,
        batch_size,
        compile_time_s,
    )
    if params.get("enable_cuda_graph", False):
        with torchtrt.runtime.enable_cudagraphs(model) as cudagraphs_module:
            record_perf(
                cudagraphs_module,
                "torch_compile",
                input_tensors,
                precision,
                iters,
                batch_size,
                compile_time_s,
            )
    else:
        record_perf(
            model,
            "torch_compile",
            input_tensors,
            precision,
            iters,
            batch_size,
            compile_time_s,
        )
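In the torch.compile path above, the new runtime flag travels through the backend options dict rather than a compile keyword. A minimal sketch of that wiring follows; the model and input are hypothetical stand-ins, and the option names are taken from the hunk above.

    import torch
    import torch_tensorrt  # noqa: F401  (importing registers the "tensorrt" backend)

    model = torch.nn.Linear(64, 64).eval().cuda()   # hypothetical stand-in model
    x = torch.randn(8, 64, device="cuda")

    compile_spec = {
        "enabled_precisions": {torch.float32},
        "min_block_size": 1,
        "use_python_runtime": True,   # new option threaded through by this PR
    }
    compiled = torch.compile(model, backend="tensorrt", dynamic=None, options=compile_spec)
    out = compiled(x)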


@run_with_try_except
@@ -320,9 +367,13 @@ def run_hf_inductor(model, input_tensors, params, precision, batch_size):
    # Mark dynamic shapes for input sequence
    input_seq = input_tensors[0]
    torch._dynamo.mark_dynamic(input_seq, 1, min=1, max=osl)
    mode = "max-autotune"
    if params.get("enable_cuda_graph", False):
        mode = "reduce-overhead"

    start_compile = timeit.default_timer()
    # Compile the model
    model = torch.compile(model, backend="inductor", dynamic=None, mode="max-autotune")
    model = torch.compile(model, backend="inductor", dynamic=None, mode=mode)
    model(input_seq)
    end_compile = timeit.default_timer()
    compile_time_s = end_compile - start_compile
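For the Inductor backends, `--enable_cuda_graph` does not use torch_tensorrt's context manager; instead it switches torch.compile from "max-autotune" to "reduce-overhead" mode, which is PyTorch's built-in way of capturing CUDA Graphs to cut kernel-launch overhead. A standalone sketch with a hypothetical model and input:

    import torch

    model = torch.nn.Linear(64, 64).eval().cuda()   # hypothetical stand-in model
    x = torch.randn(8, 64, device="cuda")

    # "reduce-overhead" makes Inductor capture CUDA Graphs, which is why the
    # PR maps --enable_cuda_graph onto this mode instead of "max-autotune".
    compiled = torch.compile(model, backend="inductor", dynamic=None, mode="reduce-overhead")
    with torch.no_grad():
        out = compiled(x)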
@@ -356,15 +407,25 @@ def run_inductor(model, input_tensors, params, precision, batch_size):
if params["is_text_llm"]:
return run_hf_inductor(model, input_tensors, params, precision, batch_size)

mode = "max-autotune"
if params.get("enable_cuda_graph", False):
mode = "reduce-overhead"

start_compile = timeit.default_timer()
model = torch.compile(model, backend="inductor", dynamic=None, mode="max-autotune")
model = torch.compile(model, backend="inductor", dynamic=None, mode=mode)
model(*input_tensors)
end_compile = timeit.default_timer()
compile_time_s = end_compile - start_compile
iters = params.get("iterations", 20)

record_perf(
model, "inductor", input_tensors, precision, iters, batch_size, compile_time_s
model,
"inductor",
input_tensors,
precision,
iters,
batch_size,
compile_time_s,
)


@@ -587,6 +648,16 @@ def run(
action="store_true",
help="Boolean flag to determine if the user provided model is a TRT engine or not",
)
arg_parser.add_argument(
"--use_python_runtime",
action="store_true",
help="Whether to use Python runtime or not. Using C++ runtime by default",
)
arg_parser.add_argument(
"--enable_cuda_graph",
action="store_true",
help="Whether to enable CUDA Graph. It is not used by default",
)
arg_parser.add_argument(
"--report",
type=str,
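Both new flags use argparse's `store_true` action, so they default to False and only flip to True when passed on the command line; the backends then read them through `params.get(..., False)` as in the hunks above. A self-contained sketch of that behaviour, assuming (as is common, though not shown in this diff) that `params` is built from `vars(args)`:

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--use_python_runtime", action="store_true",
                        help="Use the Python runtime instead of the default C++ runtime")
    parser.add_argument("--enable_cuda_graph", action="store_true",
                        help="Enable CUDA Graphs (disabled by default)")

    args = parser.parse_args(["--enable_cuda_graph"])
    params = vars(args)  # assumption: perf_run.py derives params from the parsed args
    assert params.get("use_python_runtime", False) is False
    assert params.get("enable_cuda_graph", False) is True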