From fb6cdfd3d2bcdbe266d42556c3b73b87df0d302a Mon Sep 17 00:00:00 2001 From: Naren Dasan <1790613+narendasan@users.noreply.github.com> Date: Wed, 29 Jan 2025 17:27:16 -0700 Subject: [PATCH 1/5] =?UTF-8?q?fix(aten::instance=5Fnorm):=20Handle=20opti?= =?UTF-8?q?onal=20inputs=20in=20instance=20norm=20con=E2=80=A6=20(#3367)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Dheeraj Peri --- .../scripts/generate_binary_build_matrix.py | 1 - .../conversion/converters/impl/batch_norm.cpp | 15 +- core/util/prelude.h | 1 + .../custom_kernel_plugins.py | 4 - examples/dynamo/custom_kernel_plugins.py | 4 - notebooks/CitriNet-example.ipynb | 14 +- notebooks/EfficientNet-example.ipynb | 18 +- notebooks/Hugging-Face-BERT.ipynb | 12 +- notebooks/Resnet50-CPP.ipynb | 1 - notebooks/Resnet50-example.ipynb | 21 +- notebooks/dynamic-shapes.ipynb | 20 +- ...ng_started_with_fx_path_lower_to_trt.ipynb | 996 +++++++++--------- notebooks/lenet-getting-started.ipynb | 11 +- notebooks/qat-ptq-workflow.ipynb | 25 +- notebooks/ssd-object-detection-demo.ipynb | 14 +- notebooks/vgg-qat.ipynb | 10 +- py/torch_tensorrt/_Device.py | 3 +- py/torch_tensorrt/_compile.py | 2 - py/torch_tensorrt/_features.py | 4 +- py/torch_tensorrt/dynamo/_engine_cache.py | 2 - py/torch_tensorrt/dynamo/_exporter.py | 1 - py/torch_tensorrt/dynamo/_refit.py | 2 - .../dynamo/conversion/_TRTInterpreter.py | 1 - .../dynamo/conversion/_conversion.py | 2 - .../dynamo/conversion/converter_utils.py | 3 +- .../dynamo/conversion/impl/arange.py | 1 - .../dynamo/conversion/impl/elementwise/ops.py | 1 - .../dynamo/conversion/impl/matmul.py | 1 - .../conversion/impl/normalization/ops.py | 1 - .../dynamo/conversion/impl/pad.py | 1 - .../dynamo/conversion/impl/slice/ops.py | 1 - .../dynamo/conversion/impl/unary/ops.py | 1 - .../runtime/_MutableTorchTensorRTModule.py | 3 - .../runtime/_PythonTorchTensorRTModule.py | 2 - py/torch_tensorrt/dynamo/utils.py | 1 - py/torch_tensorrt/logging.py | 4 +- py/torch_tensorrt/runtime/_utils.py | 1 - setup.py | 2 - .../converters/test_instance_norm.cpp | 2 +- tests/py/dynamo/conversion/harness.py | 1 - .../py/dynamo/conversion/test_resize_aten.py | 1 - .../py/dynamo/conversion/test_sym_not_aten.py | 1 - .../py/dynamo/lowering/test_decompositions.py | 1 - tests/py/dynamo/models/test_dtype_support.py | 1 - tests/py/dynamo/models/test_engine_cache.py | 2 - .../dynamo/models/test_export_kwargs_serde.py | 1 - tests/py/dynamo/models/test_model_refit.py | 10 - tests/py/dynamo/runtime/test_001_streams.py | 1 - .../runtime/test_002_lazy_engine_init.py | 1 - .../test_003_cross_compile_for_windows.py | 1 - .../runtime/test_004_weight_streaming.py | 1 - .../runtime/test_mutable_torchtrt_module.py | 4 - 52 files changed, 581 insertions(+), 654 deletions(-) diff --git a/.github/scripts/generate_binary_build_matrix.py b/.github/scripts/generate_binary_build_matrix.py index 26bb447b4f..f56b45b33b 100644 --- a/.github/scripts/generate_binary_build_matrix.py +++ b/.github/scripts/generate_binary_build_matrix.py @@ -469,7 +469,6 @@ def generate_wheels_matrix( ret: List[Dict[str, Any]] = [] for python_version in python_versions: for arch_version in arches: - # TODO: Enable Python 3.13 support for ROCM if arch_version in ROCM_ARCHES and python_version == "3.13": continue diff --git a/core/conversion/converters/impl/batch_norm.cpp b/core/conversion/converters/impl/batch_norm.cpp index 07cf445f50..c8ec1977a7 100644 --- a/core/conversion/converters/impl/batch_norm.cpp +++ 
b/core/conversion/converters/impl/batch_norm.cpp
@@ -134,9 +134,14 @@ auto batch_norm_registrations TORCHTRT_UNUSED =
           auto eps = static_cast<float>(args[7].unwrapToDouble(1e-5f));

-          auto scales = args[1].unwrapToTensor(at::ones(shape[1], options)).cpu().contiguous();
-          auto bias = args[2].unwrapToTensor(at::zeros(shape[1], options)).cpu().contiguous();
-
+          auto scales = at::ones(shape[1], options);
+          if (!args[1].IValue()->isNone()) {
+            scales = args[1].unwrapToTensor(at::ones(shape[1], options)).cpu().contiguous();
+          }
+          auto bias = at::zeros(shape[1], options);
+          if (!args[2].IValue()->isNone()) {
+            bias = args[2].unwrapToTensor(at::zeros(shape[1], options)).cpu().contiguous();
+          }
           // track_running_stats=True
           if (!args[3].IValue()->isNone() || !args[4].IValue()->isNone()) {
             auto running_mean = args[3].unwrapToTensor();
@@ -154,6 +159,8 @@ auto batch_norm_registrations TORCHTRT_UNUSED =
             return true;
           }

+          // Not sure this actually does anything, since cudnn_enabled comes from the PyTorch context.
+          // We need cuDNN either way to run this converter.
           auto cudnn_enabled = static_cast<bool>(args[8].unwrapToBool(false));
           if (!cudnn_enabled) {
             LOG_DEBUG(
@@ -162,7 +169,7 @@ auto batch_norm_registrations TORCHTRT_UNUSED =
                 so for some functionalities, users need to install correct \
                 cuDNN version by themselves. Please see our support matrix \
                 here: https://docs.nvidia.com/deeplearning/tensorrt/support-matrix/index.html.");
-            return false;
+            // return false;
           }

           const int relu = 0;

diff --git a/core/util/prelude.h b/core/util/prelude.h
index 957562c3c5..d269a9347a 100644
--- a/core/util/prelude.h
+++ b/core/util/prelude.h
@@ -2,6 +2,7 @@
 // A collection of headers from util that will typically get included in most
 // files
+#include 
 #include "core/util/Exception.h"
 #include "core/util/build_info.h"
 #include "core/util/jit_util.h"

diff --git a/docs/_downloads/c0341280f3b022df00c4241c42d9ee8b/custom_kernel_plugins.py b/docs/_downloads/c0341280f3b022df00c4241c42d9ee8b/custom_kernel_plugins.py
index 73b06119ae..398c0a1ebe 100644
--- a/docs/_downloads/c0341280f3b022df00c4241c42d9ee8b/custom_kernel_plugins.py
+++ b/docs/_downloads/c0341280f3b022df00c4241c42d9ee8b/custom_kernel_plugins.py
@@ -316,7 +316,6 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
 import cupy as cp  # Needed to work around API gaps in PyTorch to build torch.Tensors around preallocated CUDA memory
 import numpy as np
-
 import tensorrt as trt
@@ -348,7 +347,6 @@ def get_output_dimensions(
     inputs: List[trt.DimsExprs],
     exprBuilder: trt.IExprBuilder,
 ) -> trt.DimsExprs:
-
     output_dims = trt.DimsExprs(inputs[0])

     for i in range(np.size(self.pads) // 2):
@@ -404,7 +402,6 @@ def enqueue(
     workspace: int,
     stream: int,
 ) -> None:
-
     # Host code is slightly different as this will be run as part of the TRT execution

     in_dtype = torchtrt.dtype.try_from(input_desc[0].type).to(np.dtype)
@@ -528,7 +525,6 @@ def circular_padding_converter(
     kwargs: Dict[str, Argument],
     name: str,
 ):
-
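For context on the batch_norm.cpp hunks above: `aten::instance_norm` (routed through this converter) receives `None` for its weight and bias arguments when the layer is built with `affine=False`, and the old unconditional `unwrapToTensor(...)` calls failed on those `None` IValues. A minimal sketch of the previously failing case, using the TorchScript frontend; the module and shapes are illustrative, not taken from this patch:

```python
import torch
import torch_tensorrt

class NonAffineInstanceNorm(torch.nn.Module):
    def __init__(self) -> None:
        super().__init__()
        # affine=False (and track_running_stats=False by default) means the
        # converter sees None for weight, bias, running_mean and running_var
        self.norm = torch.nn.InstanceNorm2d(3, affine=False)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.norm(x)

model = torch.jit.script(NonAffineInstanceNorm().eval().cuda())
# With the fix, the converter falls back to ones/zeros instead of unwrapping None
trt_mod = torch_tensorrt.ts.compile(model, inputs=[torch_tensorrt.Input((1, 3, 8, 8))])
print(trt_mod(torch.rand(1, 3, 8, 8, device="cuda")).shape)
```

    # How to retrieve a plugin if it is defined elsewhere (e.g. 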
linked library) plugin_registry = trt.get_plugin_registry() plugin_creator = plugin_registry.get_plugin_creator( diff --git a/examples/dynamo/custom_kernel_plugins.py b/examples/dynamo/custom_kernel_plugins.py index 73b06119ae..398c0a1ebe 100644 --- a/examples/dynamo/custom_kernel_plugins.py +++ b/examples/dynamo/custom_kernel_plugins.py @@ -316,7 +316,6 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: import cupy as cp # Needed to work around API gaps in PyTorch to build torch.Tensors around preallocated CUDA memory import numpy as np - import tensorrt as trt @@ -348,7 +347,6 @@ def get_output_dimensions( inputs: List[trt.DimsExprs], exprBuilder: trt.IExprBuilder, ) -> trt.DimsExprs: - output_dims = trt.DimsExprs(inputs[0]) for i in range(np.size(self.pads) // 2): @@ -404,7 +402,6 @@ def enqueue( workspace: int, stream: int, ) -> None: - # Host code is slightly different as this will be run as part of the TRT execution in_dtype = torchtrt.dtype.try_from(input_desc[0].type).to(np.dtype) @@ -528,7 +525,6 @@ def circular_padding_converter( kwargs: Dict[str, Argument], name: str, ): - # How to retrieve a plugin if it is defined elsewhere (e.g. linked library) plugin_registry = trt.get_plugin_registry() plugin_creator = plugin_registry.get_plugin_creator( diff --git a/notebooks/CitriNet-example.ipynb b/notebooks/CitriNet-example.ipynb index b9d615d5f1..88d59e3424 100644 --- a/notebooks/CitriNet-example.ipynb +++ b/notebooks/CitriNet-example.ipynb @@ -384,12 +384,11 @@ "metadata": {}, "outputs": [], "source": [ - "import nemo\n", "import torch\n", "\n", "import nemo.collections.asr as nemo_asr\n", "from nemo.core import typecheck\n", - "typecheck.set_typecheck_enabled(False) " + "typecheck.set_typecheck_enabled(False)" ] }, { @@ -572,11 +571,8 @@ "from __future__ import absolute_import\n", "from __future__ import division\n", "\n", - "import argparse\n", "import timeit\n", "import numpy as np\n", - "import torch\n", - "import torch_tensorrt as trtorch\n", "import torch.backends.cudnn as cudnn\n", "\n", "def benchmark(model, input_tensor, num_loops, model_name, batch_size):\n", @@ -632,7 +628,7 @@ " else:\n", " model_name = f\"{variant}.ts\"\n", "\n", - " print(f\"Loading model: {model_name}\") \n", + " print(f\"Loading model: {model_name}\")\n", " # Load traced model to CPU first\n", " model = torch.jit.load(model_name).cuda()\n", " cudnn.benchmark = True\n", @@ -727,9 +723,7 @@ ], "source": [ "import torch\n", - "import torch.nn as nn\n", "import torch_tensorrt as torchtrt\n", - "import argparse\n", "\n", "variant = \"stt_en_citrinet_256\"\n", "precisions = [torch.float, torch.half]\n", @@ -827,7 +821,7 @@ " else:\n", " model_name = f\"{variant}.ts\"\n", "\n", - " print(f\"Loading model: {model_name}\") \n", + " print(f\"Loading model: {model_name}\")\n", " # Load traced model to CPU first\n", " model = torch.jit.load(model_name).cuda()\n", " cudnn.benchmark = True\n", @@ -906,7 +900,7 @@ " else:\n", " model_name = f\"{variant}.ts\"\n", "\n", - " print(f\"Loading model: {model_name}\") \n", + " print(f\"Loading model: {model_name}\")\n", " # Load traced model to CPU first\n", " model = torch.jit.load(model_name).cuda()\n", " cudnn.benchmark = True\n", diff --git a/notebooks/EfficientNet-example.ipynb b/notebooks/EfficientNet-example.ipynb index cfb8e79232..bbbfe6f94e 100644 --- a/notebooks/EfficientNet-example.ipynb +++ b/notebooks/EfficientNet-example.ipynb @@ -167,7 +167,7 @@ "import torch.backends.cudnn as cudnn\n", "from timm.data import resolve_data_config\n", "from 
timm.data.transforms_factory import create_transform\n", - "import json \n", + "import json\n", "\n", "efficientnet_b0_model = timm.create_model('efficientnet_b0',pretrained=True)\n", "model = efficientnet_b0_model.eval().to(\"cuda\")" @@ -305,13 +305,13 @@ " transforms.ToTensor(),\n", " transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),\n", " ])\n", - " input_tensor = preprocess(img) \n", + " input_tensor = preprocess(img)\n", " plt.subplot(2,2,i+1)\n", " plt.imshow(img)\n", " plt.axis('off')\n", "\n", "# loading labels\n", - "with open(\"./data/imagenet_class_index.json\") as json_file: \n", + "with open(\"./data/imagenet_class_index.json\") as json_file:\n", " d = json.load(json_file)" ] }, @@ -341,7 +341,7 @@ " preprocess = efficientnet_preprocess()\n", " input_tensor = preprocess(img)\n", " input_batch = input_tensor.unsqueeze(0) # create a mini-batch as expected by the model\n", - " \n", + "\n", " # move the input and model to GPU for speed if available\n", " if torch.cuda.is_available():\n", " input_batch = input_batch.to('cuda')\n", @@ -351,7 +351,7 @@ " output = model(input_batch)\n", " # Tensor of shape 1000, with confidence scores over Imagenet's 1000 classes\n", " sm_output = torch.nn.functional.softmax(output[0], dim=0)\n", - " \n", + "\n", " ind = torch.argmax(sm_output)\n", " return d[str(ind.item())], sm_output[ind] #([predicted class, description], probability)\n", "\n", @@ -360,7 +360,7 @@ " input_data = input_data.to(\"cuda\")\n", " if dtype=='fp16':\n", " input_data = input_data.half()\n", - " \n", + "\n", " print(\"Warm up ...\")\n", " with torch.no_grad():\n", " for _ in range(nwarmup):\n", @@ -430,13 +430,13 @@ "for i in range(4):\n", " img_path = './data/img%d.JPG'%i\n", " img = Image.open(img_path)\n", - " \n", + "\n", " pred, prob = predict(img_path, efficientnet_b0_model)\n", " print('{} - Predicted: {}, Probablility: {}'.format(img_path, pred, prob))\n", "\n", " plt.subplot(2,2,i+1)\n", - " plt.imshow(img);\n", - " plt.axis('off');\n", + " plt.imshow(img)\n", + " plt.axis('off')\n", " plt.title(pred[1])" ] }, diff --git a/notebooks/Hugging-Face-BERT.ipynb b/notebooks/Hugging-Face-BERT.ipynb index 81034d8e38..36068dbd58 100644 --- a/notebooks/Hugging-Face-BERT.ipynb +++ b/notebooks/Hugging-Face-BERT.ipynb @@ -233,9 +233,9 @@ "metadata": {}, "outputs": [], "source": [ - "masked_sentences = ['Paris is the [MASK] of France.', \n", - " 'The primary [MASK] of the United States is English.', \n", - " 'A baseball game consists of at least nine [MASK].', \n", + "masked_sentences = ['Paris is the [MASK] of France.',\n", + " 'The primary [MASK] of the United States is English.',\n", + " 'A baseball game consists of at least nine [MASK].',\n", " 'Topology is a branch of [MASK] concerned with the properties of geometric objects that remain unchanged under continuous transformations.']\n", "pos_masks = [4, 3, 9, 6]" ] @@ -357,7 +357,7 @@ "metadata": {}, "outputs": [], "source": [ - "trt_model = torch_tensorrt.compile(traced_mlm_model, \n", + "trt_model = torch_tensorrt.compile(traced_mlm_model,\n", " inputs= [torch_tensorrt.Input(shape=[batch_size, 128], dtype=torch.int32), # input_ids\n", " torch_tensorrt.Input(shape=[batch_size, 128], dtype=torch.int32), # token_type_ids\n", " torch_tensorrt.Input(shape=[batch_size, 128], dtype=torch.int32)], # attention_mask\n", @@ -396,7 +396,7 @@ "enc_inputs = enc(masked_sentences, return_tensors='pt', padding='max_length', max_length=128)\n", "enc_inputs = {k: v.type(torch.int32).cuda() for k, v in 
enc_inputs.items()}\n", "output_trt = trt_model(enc_inputs['input_ids'], enc_inputs['token_type_ids'], enc_inputs['attention_mask'])\n", - "most_likely_token_ids_trt = [torch.argmax(output_trt[i, pos, :]) for i, pos in enumerate(pos_masks)] \n", + "most_likely_token_ids_trt = [torch.argmax(output_trt[i, pos, :]) for i, pos in enumerate(pos_masks)]\n", "unmasked_tokens_trt = enc.decode(most_likely_token_ids_trt).split(' ')\n", "unmasked_sentences_trt = [masked_sentences[i].replace('[MASK]', token) for i, token in enumerate(unmasked_tokens_trt)]\n", "for sentence in unmasked_sentences_trt:\n", @@ -418,7 +418,7 @@ "metadata": {}, "outputs": [], "source": [ - "trt_model_fp16 = torch_tensorrt.compile(traced_mlm_model, \n", + "trt_model_fp16 = torch_tensorrt.compile(traced_mlm_model,\n", " inputs= [torch_tensorrt.Input(shape=[batch_size, 128], dtype=torch.int32), # input_ids\n", " torch_tensorrt.Input(shape=[batch_size, 128], dtype=torch.int32), # token_type_ids\n", " torch_tensorrt.Input(shape=[batch_size, 128], dtype=torch.int32)], # attention_mask\n", diff --git a/notebooks/Resnet50-CPP.ipynb b/notebooks/Resnet50-CPP.ipynb index 198ebc9911..87800e0a24 100755 --- a/notebooks/Resnet50-CPP.ipynb +++ b/notebooks/Resnet50-CPP.ipynb @@ -70,7 +70,6 @@ "outputs": [], "source": [ "import torch\n", - "import torchvision\n", "\n", "torch.hub._validate_not_a_forked_repo=lambda a,b,c: True\n", "\n", diff --git a/notebooks/Resnet50-example.ipynb b/notebooks/Resnet50-example.ipynb index a7d3d4eddd..7b5944ea8d 100644 --- a/notebooks/Resnet50-example.ipynb +++ b/notebooks/Resnet50-example.ipynb @@ -428,7 +428,6 @@ ], "source": [ "import torch\n", - "import torchvision\n", "\n", "torch.hub._validate_not_a_forked_repo=lambda a,b,c: True\n", "\n", @@ -558,7 +557,7 @@ "from PIL import Image\n", "from torchvision import transforms\n", "import matplotlib.pyplot as plt\n", - "import json \n", + "import json\n", "\n", "fig, axes = plt.subplots(nrows=2, ncols=2)\n", "\n", @@ -571,13 +570,13 @@ " transforms.ToTensor(),\n", " transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),\n", " ])\n", - " input_tensor = preprocess(img) \n", + " input_tensor = preprocess(img)\n", " plt.subplot(2,2,i+1)\n", " plt.imshow(img)\n", " plt.axis('off')\n", "\n", - "# loading labels \n", - "with open(\"./data/imagenet_class_index.json\") as json_file: \n", + "# loading labels\n", + "with open(\"./data/imagenet_class_index.json\") as json_file:\n", " d = json.load(json_file)" ] }, @@ -614,7 +613,7 @@ " preprocess = rn50_preprocess()\n", " input_tensor = preprocess(img)\n", " input_batch = input_tensor.unsqueeze(0) # create a mini-batch as expected by the model\n", - " \n", + "\n", " # move the input and model to GPU for speed if available\n", " if torch.cuda.is_available():\n", " input_batch = input_batch.to('cuda')\n", @@ -624,7 +623,7 @@ " output = model(input_batch)\n", " # Tensor of shape 1000, with confidence scores over Imagenet's 1000 classes\n", " sm_output = torch.nn.functional.softmax(output[0], dim=0)\n", - " \n", + "\n", " ind = torch.argmax(sm_output)\n", " return d[str(ind.item())], sm_output[ind] #([predicted class, description], probability)\n", "\n", @@ -633,7 +632,7 @@ " input_data = input_data.to(\"cuda\")\n", " if dtype=='fp16':\n", " input_data = input_data.half()\n", - " \n", + "\n", " print(\"Warm up ...\")\n", " with torch.no_grad():\n", " for _ in range(nwarmup):\n", @@ -695,13 +694,13 @@ "for i in range(4):\n", " img_path = './data/img%d.JPG'%i\n", " img = Image.open(img_path)\n", - " \n", + 
"\n", " pred, prob = predict(img_path, resnet50_model)\n", " print('{} - Predicted: {}, Probablility: {}'.format(img_path, pred, prob))\n", "\n", " plt.subplot(2,2,i+1)\n", - " plt.imshow(img);\n", - " plt.axis('off');\n", + " plt.imshow(img)\n", + " plt.axis('off')\n", " plt.title(pred[1])" ] }, diff --git a/notebooks/dynamic-shapes.ipynb b/notebooks/dynamic-shapes.ipynb index 5738f13521..046f2bfe2d 100644 --- a/notebooks/dynamic-shapes.ipynb +++ b/notebooks/dynamic-shapes.ipynb @@ -313,7 +313,7 @@ "from PIL import Image\n", "from torchvision import transforms\n", "import matplotlib.pyplot as plt\n", - "import json \n", + "import json\n", "\n", "fig, axes = plt.subplots(nrows=2, ncols=2)\n", "\n", @@ -326,13 +326,13 @@ " transforms.ToTensor(),\n", " transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),\n", " ])\n", - " input_tensor = preprocess(img) \n", + " input_tensor = preprocess(img)\n", " plt.subplot(2,2,i+1)\n", " plt.imshow(img)\n", " plt.axis('off')\n", "\n", - "# loading labels \n", - "with open(\"./data/imagenet_class_index.json\") as json_file: \n", + "# loading labels\n", + "with open(\"./data/imagenet_class_index.json\") as json_file:\n", " d = json.load(json_file)" ] }, @@ -589,7 +589,7 @@ " preprocess = rn50_preprocess()\n", " input_tensor = preprocess(img)\n", " input_batch = input_tensor.unsqueeze(0) # create a mini-batch as expected by the model\n", - " \n", + "\n", " # move the input and model to GPU for speed if available\n", " if torch.cuda.is_available():\n", " input_batch = input_batch.to('cuda')\n", @@ -599,7 +599,7 @@ " output = model(input_batch)\n", " # Tensor of shape 1000, with confidence scores over Imagenet's 1000 classes\n", " sm_output = torch.nn.functional.softmax(output[0], dim=0)\n", - " \n", + "\n", " ind = torch.argmax(sm_output)\n", " return d[str(ind.item())], sm_output[ind] #([predicted class, description], probability)\n", "\n", @@ -609,7 +609,7 @@ " input_data = input_data.to(\"cuda\")\n", " if dtype=='fp16':\n", " input_data = input_data.half()\n", - " \n", + "\n", " print(\"Warm up ...\")\n", " with torch.no_grad():\n", " for _ in range(nwarmup):\n", @@ -673,13 +673,13 @@ "for i in range(4):\n", " img_path = './data/img%d.JPG'%i\n", " img = Image.open(img_path)\n", - " \n", + "\n", " pred, prob = predict(img_path, resnet50_model)\n", " print('{} - Predicted: {}, Probablility: {}'.format(img_path, pred, prob))\n", "\n", " plt.subplot(2,2,i+1)\n", - " plt.imshow(img);\n", - " plt.axis('off');\n", + " plt.imshow(img)\n", + " plt.axis('off')\n", " plt.title(pred[1])" ] }, diff --git a/notebooks/getting_started_with_fx_path_lower_to_trt.ipynb b/notebooks/getting_started_with_fx_path_lower_to_trt.ipynb index 8e480903ab..0b90e34bd6 100644 --- a/notebooks/getting_started_with_fx_path_lower_to_trt.ipynb +++ b/notebooks/getting_started_with_fx_path_lower_to_trt.ipynb @@ -1,517 +1,517 @@ { - "metadata": { - "dataExplorerConfig": {}, - "bento_stylesheets": { - "bento/extensions/flow/main.css": true, - "bento/extensions/kernel_selector/main.css": true, - "bento/extensions/kernel_ui/main.css": true, - "bento/extensions/new_kernel/main.css": true, - "bento/extensions/system_usage/main.css": true, - "bento/extensions/theme/main.css": true + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "code_folding": [], + "customInput": null, + "hidden_ranges": [], + "originalKey": "8ca7695d-8a19-454e-b32b-3d5c36d52faf", + "showInput": false + }, + "source": [ + "The purpose of this example is to demostrate the overall flow of lowering a 
PyTorch model\n", + "to TensorRT conveniently with lower.py. We integrated the transformation process including `TRTInterpreter`, `TRTModule`, pass optimization into the `lower_to_trt` API, users are encouraged to check the docstring of the API and tune it to meet your needs." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "collapsed": false, + "customInput": null, + "customOutput": null, + "executionStartTime": 1661189891682, + "executionStopTime": 1661189891856, + "originalKey": "7db2accc-9fa4-4a1e-8142-d887f2947bcd", + "requestMsgId": "b5d8efce-0963-4074-bc9d-e8e1a78fd424", + "showInput": true + }, + "outputs": [], + "source": [ + "import typing as t\n", + "from copy import deepcopy\n", + "from dataclasses import dataclass, field, replace\n", + "\n", + "import torch\n", + "import torchvision\n", + "from torch_tensorrt.fx.lower import compile\n", + "from torch_tensorrt.fx.utils import LowerPrecision" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "code_folding": [], + "customInput": null, + "hidden_ranges": [], + "originalKey": "e324a1ff-1bc2-4e78-932f-33534c3ac3f5", + "showInput": false + }, + "source": [ + "Specify the `configuration` class used for FX path lowering and benchmark. To extend, add a new configuration field to this class, and modify the lowering or benchmark behavior in `run_configuration_benchmark()` correspondingly. It automatically stores all its values to a `Result` dataclass. \n", + "`Result` is another dataclass that holds raw essential benchmark result values like Batch size, QPS, accuracy, etc..\n", + "" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "code_folding": [], + "collapsed": false, + "customInput": null, + "customOutput": null, + "executionStartTime": 1661189260550, + "executionStopTime": 1661189262039, + "hidden_ranges": [], + "originalKey": "2835fffa-cc50-479a-9080-c4f7002c0726", + "requestMsgId": "6ea72dbf-dbfe-451e-8613-15f87e34a1a5", + "showInput": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "I0822 102740.872 _utils_internal.py:179] NCCL_DEBUG env var is set to None\n" + ] }, - "kernelspec": { - "display_name": "dper3_pytorch (cinder)", - "language": "python", - "name": "bento_kernel_dper3_pytorch_cinder", - "metadata": { - "kernel_name": "bento_kernel_dper3_pytorch_cinder", - "nightly_builds": false, - "fbpkg_supported": true, - "cinder_runtime": true, - "is_prebuilt": true - } + { + "name": "stderr", + "output_type": "stream", + "text": [ + "I0822 102740.873 _utils_internal.py:188] NCCL_DEBUG is INFO from /etc/nccl.conf\n" + ] + } + ], + "source": [ + "@dataclass\n", + "class Configuration:\n", + " # number of inferences to run\n", + " batch_iter: int\n", + "\n", + " # Input batch size\n", + " batch_size: int\n", + "\n", + " # Friendly name of the configuration\n", + " name: str = \"\"\n", + "\n", + " # Whether to apply TRT lowering to the model before benchmarking\n", + " trt: bool = False\n", + "\n", + " # Whether to apply engine holder to the lowered model\n", + " jit: bool = False\n", + "\n", + " # Whether to enable FP16 mode for TRT lowering\n", + " fp16: bool = False\n", + "\n", + " # Relative tolerance for accuracy check after lowering. 
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "code_folding": [],
    "collapsed": false,
    "customInput": null,
    "customOutput": null,
    "executionStartTime": 1661189260550,
    "executionStopTime": 1661189262039,
    "hidden_ranges": [],
    "originalKey": "2835fffa-cc50-479a-9080-c4f7002c0726",
    "requestMsgId": "6ea72dbf-dbfe-451e-8613-15f87e34a1a5",
    "showInput": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "I0822 102740.872 _utils_internal.py:179] NCCL_DEBUG env var is set to None\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "I0822 102740.873 _utils_internal.py:188] NCCL_DEBUG is INFO from /etc/nccl.conf\n"
     ]
    }
   ],
   "source": [
    "@dataclass\n",
    "class Configuration:\n",
    "    # number of inferences to run\n",
    "    batch_iter: int\n",
    "\n",
    "    # Input batch size\n",
    "    batch_size: int\n",
    "\n",
    "    # Friendly name of the configuration\n",
    "    name: str = \"\"\n",
    "\n",
    "    # Whether to apply TRT lowering to the model before benchmarking\n",
    "    trt: bool = False\n",
    "\n",
    "    # Whether to apply engine holder to the lowered model\n",
    "    jit: bool = False\n",
    "\n",
    "    # Whether to enable FP16 mode for TRT lowering\n",
    "    fp16: bool = False\n",
    "\n",
    "    # Relative tolerance for accuracy check after lowering. -1 means do not\n",
    "    # check accuracy.\n",
    "    accuracy_rtol: float = -1  # disable\n",
    "\n",
    "@dataclass\n",
    "class Result:\n",
    "    module: torch.nn.Module = field(repr=False)\n",
    "    input: t.Any = field(repr=False)\n",
    "    conf: Configuration\n",
    "    time_sec: float\n",
    "    accuracy_res: t.Optional[bool] = None\n",
    "\n",
    "    @property\n",
    "    def time_per_iter_ms(self) -> float:\n",
    "        return self.time_sec * 1.0e3\n",
    "\n",
    "    @property\n",
    "    def qps(self) -> float:\n",
    "        return self.conf.batch_size / self.time_sec\n",
    "\n",
    "    def format(self) -> str:\n",
    "        return (\n",
    "            f\"== Benchmark Result for: {self.conf}\\n\"\n",
    "            f\"BS: {self.conf.batch_size}, \"\n",
    "            f\"Time per iter: {self.time_per_iter_ms:.2f}ms, \"\n",
    "            f\"QPS: {self.qps:.2f}, \"\n",
    "            f\"Accuracy: {self.accuracy_res} (rtol={self.conf.accuracy_rtol})\"\n",
    "        )"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "code_folding": [],
    "customInput": null,
    "hidden_ranges": [],
    "originalKey": "3e462cf6-d282-402d-955b-a3ecb400bf0b",
    "showInput": false
   },
   "source": [
    "Run FX path lowering and benchmark the given model according to the specified benchmark configuration. The benchmark result for each configuration is printed at the end of the run. `benchmark_torch_function` is the helper that times a fixed number of iterations of the function under test.\n",
    "The FX path lowering and TensorRT engine creation are integrated into the `compile()` API, which is defined in the `fx/lower.py` file.\n",
    "It is worth listing it out here to show its usage: it takes in the original module, inputs, and lowering settings, and runs the lowering workflow to turn the module into an executable TRT engine.\n",
    "```\n",
    "def compile(\n",
    "    module: nn.Module,\n",
    "    input: ,\n",
    "    max_batch_size: int = 2048,\n",
    "    max_workspace_size=1 << 25,\n",
    "    explicit_batch_dimension=False,\n",
    "    lower_precision=LowerPrecision.FP16,\n",
    "    verbose_log=False,\n",
    "    timing_cache_prefix=\"\",\n",
    "    save_timing_cache=False,\n",
    "    cuda_graph_batch_size=-1,\n",
    "    dynamic_batch=False,\n",
    ") -> nn.Module:\n",
    "```\n",
    "\n",
    "    Args:\n",
    "        module: Original module for lowering.\n",
    "        input: Input for module.\n",
    "        max_batch_size: Maximum batch size (must be >= 1 to be set, 0 means not set)\n",
    "        max_workspace_size: Maximum size of workspace given to TensorRT.\n",
    "        explicit_batch_dimension: Use explicit batch dimension in TensorRT if set to True, otherwise use implicit batch dimension.\n",
    "        lower_precision: lower_precision config given to TRTModule.\n",
    "        verbose_log: Enable verbose log for TensorRT if set to True.\n",
    "        timing_cache_prefix: Timing cache file name for the timing cache used by fx2trt.\n",
    "        save_timing_cache: Update the timing cache with current timing cache data if set to True.\n",
    "        cuda_graph_batch_size: CUDA graph batch size; defaults to -1.\n",
    "        dynamic_batch: batch dimension (dim=0) is dynamic.\n",
    "\n",
    "    Returns:\n",
    "        A torch.nn.Module lowered by TensorRT.\n",
    "We tested a resnet18 network with an input size of [128,3,224,224] for [Batch, Channel, Width, Height]."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "code_folding": [],
    "collapsed": false,
    "customInput": null,
    "customOutput": null,
    "executionStartTime": 1661189697773,
    "executionStopTime": 1661189753875,
    "hidden_ranges": [],
    "originalKey": "3002935b-b95a-4a08-a57f-f7a35485af5b",
    "requestMsgId": "dc73f2d0-427b-4f71-bec1-b118cc5642d0",
    "showInput": true
   },
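Given the `compile()` signature quoted above, a minimal invocation looks like the sketch below. It mirrors the resnet18 / [128, 3, 224, 224] setup this notebook benchmarks; treat it as illustrative rather than as part of the original notebook:

```python
import torch
import torchvision
from torch_tensorrt.fx.lower import compile
from torch_tensorrt.fx.utils import LowerPrecision

# Sketch: lower ResNet-18 with the compile() API documented above, then run it.
model = torchvision.models.resnet18(pretrained=True).cuda().eval()
inputs = [torch.rand(128, 3, 224, 224, device="cuda")]

trt_module = compile(
    model,
    inputs,
    max_batch_size=128,                   # matches the benchmark batch size
    lower_precision=LowerPrecision.FP32,  # the notebook calls .half() on model and inputs before FP16 runs
)
print(trt_module(*inputs).shape)  # torch.Size([128, 1000])
```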
"outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "I0822 103458.189 manifold.py:1435] URL manifold://torchvision/tree/models/resnet18-f37072fd.pth was already cached in /home/wwei6/.torch/iopath_cache/manifold_cache/tree/models/resnet18-f37072fd.pth\n" + ] }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3" + { + "name": "stdout", + "output_type": "stream", + "text": [ + "=== Running benchmark for: Configuration(batch_iter=50, batch_size=128, name='CUDA Eager', trt=False, jit=False, fp16=False, accuracy_rtol=-1) green\n== Start benchmark iterations\n" + ] }, - "last_server_session_id": "24a1a10c-29aa-4e2b-a11f-2b5108fc1e58", - "last_kernel_id": "5f014373-151c-4ee8-8939-4daab994d202", - "last_base_url": "https://devgpu005.ftw6.facebook.com:8091/", - "last_msg_id": "687e81e8-4414f32c89cd026dd1ea3fd9_139", - "outputWidgetContext": {} - }, - "nbformat": 4, - "nbformat_minor": 2, - "cells": [ { - "cell_type": "markdown", - "metadata": { - "originalKey": "8ca7695d-8a19-454e-b32b-3d5c36d52faf", - "showInput": false, - "customInput": null, - "code_folding": [], - "hidden_ranges": [] - }, - "source": [ - "The purpose of this example is to demostrate the overall flow of lowering a PyTorch model\n", - "to TensorRT conveniently with lower.py. We integrated the transformation process including `TRTInterpreter`, `TRTModule`, pass optimization into the `lower_to_trt` API, users are encouraged to check the docstring of the API and tune it to meet your needs." - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "== End benchmark iterations\n=== Running benchmark for: Configuration(batch_iter=50, batch_size=128, name='TRT FP32 Eager', trt=True, jit=False, fp16=False, accuracy_rtol=0.001) green\n" + ] }, { - "cell_type": "code", - "metadata": { - "originalKey": "7db2accc-9fa4-4a1e-8142-d887f2947bcd", - "showInput": true, - "customInput": null, - "collapsed": false, - "requestMsgId": "b5d8efce-0963-4074-bc9d-e8e1a78fd424", - "customOutput": null, - "executionStartTime": 1661189891682, - "executionStopTime": 1661189891856 - }, - "source": [ - "import typing as t\n", - "from copy import deepcopy\n", - "from dataclasses import dataclass, field, replace\n", - "\n", - "import torch\n", - "import torchvision\n", - "from torch_tensorrt.fx.lower import compile\n", - "from torch_tensorrt.fx.utils import LowerPrecision" - ], - "execution_count": 9, - "outputs": [] + "name": "stderr", + "output_type": "stream", + "text": [ + "I0822 103501.297 pass_utils.py:166] == Log pass before/after graph to /tmp/tmpe_7p37fq\n" + ] }, { - "cell_type": "markdown", - "metadata": { - "originalKey": "e324a1ff-1bc2-4e78-932f-33534c3ac3f5", - "showInput": false, - "customInput": null, - "code_folding": [], - "hidden_ranges": [] - }, - "source": [ - "Specify the `configuration` class used for FX path lowering and benchmark. To extend, add a new configuration field to this class, and modify the lowering or benchmark behavior in `run_configuration_benchmark()` correspondingly. It automatically stores all its values to a `Result` dataclass. 
\n", - "`Result` is another dataclass that holds raw essential benchmark result values like Batch size, QPS, accuracy, etc..\n", - "" - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "I0822 103501.390 pass_utils.py:166] == Log pass before/after graph to /tmp/tmpg_a347f0\n" + ] }, { - "cell_type": "code", - "metadata": { - "originalKey": "2835fffa-cc50-479a-9080-c4f7002c0726", - "showInput": true, - "customInput": null, - "code_folding": [], - "hidden_ranges": [], - "collapsed": false, - "requestMsgId": "6ea72dbf-dbfe-451e-8613-15f87e34a1a5", - "customOutput": null, - "executionStartTime": 1661189260550, - "executionStopTime": 1661189262039 - }, - "source": [ - "@dataclass\n", - "class Configuration:\n", - " # number of inferences to run\n", - " batch_iter: int\n", - "\n", - " # Input batch size\n", - " batch_size: int\n", - "\n", - " # Friendly name of the configuration\n", - " name: str = \"\"\n", - "\n", - " # Whether to apply TRT lowering to the model before benchmarking\n", - " trt: bool = False\n", - "\n", - " # Whether to apply engine holder to the lowered model\n", - " jit: bool = False\n", - "\n", - " # Whether to enable FP16 mode for TRT lowering\n", - " fp16: bool = False\n", - "\n", - " # Relative tolerance for accuracy check after lowering. -1 means do not\n", - " # check accuracy.\n", - " accuracy_rtol: float = -1 # disable\n", - " \n", - "@dataclass\n", - "class Result:\n", - " module: torch.nn.Module = field(repr=False)\n", - " input: t.Any = field(repr=False)\n", - " conf: Configuration\n", - " time_sec: float\n", - " accuracy_res: t.Optional[bool] = None\n", - "\n", - " @property\n", - " def time_per_iter_ms(self) -> float:\n", - " return self.time_sec * 1.0e3\n", - "\n", - " @property\n", - " def qps(self) -> float:\n", - " return self.conf.batch_size / self.time_sec\n", - "\n", - " def format(self) -> str:\n", - " return (\n", - " f\"== Benchmark Result for: {self.conf}\\n\"\n", - " f\"BS: {self.conf.batch_size}, \"\n", - " f\"Time per iter: {self.time_per_iter_ms:.2f}ms, \"\n", - " f\"QPS: {self.qps:.2f}, \"\n", - " f\"Accuracy: {self.accuracy_res} (rtol={self.conf.accuracy_rtol})\"\n", - " )" - ], - "execution_count": 2, - "outputs": [ - { - "output_type": "stream", - "name": "stderr", - "text": [ - "I0822 102740.872 _utils_internal.py:179] NCCL_DEBUG env var is set to None\n" - ] - }, - { - "output_type": "stream", - "name": "stderr", - "text": [ - "I0822 102740.873 _utils_internal.py:188] NCCL_DEBUG is INFO from /etc/nccl.conf\n" - ] - } - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "I0822 103501.509 lower_pass_manager_builder.py:151] Now lowering submodule _run_on_acc_0\n" + ] }, { - "cell_type": "markdown", - "metadata": { - "originalKey": "3e462cf6-d282-402d-955b-a3ecb400bf0b", - "showInput": false, - "customInput": null, - "code_folding": [], - "hidden_ranges": [] - }, - "source": [ - "Run FX path lowering and benchmark the given model according to the specified benchmark configuration. Prints the benchmark result for each configuration at the end of the run. `benchmark_torch_function` is the actual function that computes the fixed number of iterations of functions runs.\n", - "The FX path lowering and TensorRT engine creation is integrated into `compile()` API which is defined in `fx/lower.py` file.\n", - "It is good to list it out and show the usage of it. 
It takes in original module, input and lowering setting, run lowering workflow to turn module into a executable TRT engine \n", - "```\n", - "def compile(\n", - " module: nn.Module,\n", - " input: ,\n", - " max_batch_size: int = 2048,\n", - " max_workspace_size=1 << 25,\n", - " explicit_batch_dimension=False,\n", - " lower_precision=LowerPrecision.FP16,\n", - " verbose_log=False,\n", - " timing_cache_prefix=\"\",\n", - " save_timing_cache=False,\n", - " cuda_graph_batch_size=-1,\n", - " dynamic_batch=False,\n", - ") -> nn.Module:\n", - "``` \n", - "\n", - " Args:\n", - " module: Original module for lowering.\n", - " input: Input for module.\n", - " max_batch_size: Maximum batch size (must be >= 1 to be set, 0 means not set)\n", - " max_workspace_size: Maximum size of workspace given to TensorRT.\n", - " explicit_batch_dimension: Use explicit batch dimension in TensorRT if set True, otherwise use implicit batch dimension.\n", - " lower_precision: lower_precision config given to TRTModule.\n", - " verbose_log: Enable verbose log for TensorRT if set True.\n", - " timing_cache_prefix: Timing cache file name for timing cache used by fx2trt.\n", - " save_timing_cache: Update timing cache with current timing cache data if set to True.\n", - " cuda_graph_batch_size: Cuda graph batch size, default to be -1.\n", - " dynamic_batch: batch dimension (dim=0) is dynamic.\n", - "\n", - " Returns:\n", - " A torch.nn.Module lowered by TensorRT.\n", - "We testd a resnet18 network with input size of [128,3,224,224] for [Batch, Channel, Width, Height]" - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "I0822 103501.511 lower.py:89] split_name='_run_on_acc_0' self.lower_setting.input_specs=[InputTensorSpec(shape=torch.Size([128, 3, 224, 224]), dtype=torch.float32, device=device(type='cuda', index=0), shape_ranges=[], has_batch_dim=True)]\n" + ] }, { - "cell_type": "code", - "metadata": { - "originalKey": "3002935b-b95a-4a08-a57f-f7a35485af5b", - "showInput": true, - "customInput": null, - "code_folding": [], - "hidden_ranges": [], - "collapsed": false, - "requestMsgId": "dc73f2d0-427b-4f71-bec1-b118cc5642d0", - "customOutput": null, - "executionStartTime": 1661189697773, - "executionStopTime": 1661189753875 - }, - "source": [ - "def benchmark_torch_function(iters: int, f, *args) -> float:\n", - " \"\"\"Estimates the average time duration for a single inference call in second\n", - "\n", - " If the input is batched, then the estimation is for the batches inference call.\n", - " \"\"\"\n", - " with torch.inference_mode():\n", - " f(*args)\n", - " torch.cuda.synchronize()\n", - " start_event = torch.cuda.Event(enable_timing=True)\n", - " end_event = torch.cuda.Event(enable_timing=True)\n", - " print(\"== Start benchmark iterations\")\n", - " with torch.inference_mode():\n", - " start_event.record()\n", - " for _ in range(iters):\n", - " f(*args)\n", - " end_event.record()\n", - " torch.cuda.synchronize()\n", - " print(\"== End benchmark iterations\")\n", - " return (start_event.elapsed_time(end_event) * 1.0e-3) / iters\n", - "\n", - "\n", - "def run_configuration_benchmark(\n", - " module,\n", - " input,\n", - " conf: Configuration,\n", - ") -> Result:\n", - " print(f\"=== Running benchmark for: {conf}\", \"green\")\n", - " time = -1.0\n", - "\n", - " if conf.fp16:\n", - " module = module.half()\n", - " input = [i.half() for i in input]\n", - "\n", - " if not conf.trt:\n", - " # Run eager mode benchmark\n", - " time = benchmark_torch_function(conf.batch_iter, lambda: module(*input))\n", - " elif not 
conf.jit:\n", - " # Run lowering eager mode benchmark\n", - " lowered_module = compile(\n", - " module,\n", - " input,\n", - " max_batch_size=conf.batch_size,\n", - " lower_precision=LowerPrecision.FP16 if conf.fp16 else LowerPrecision.FP32,\n", - " )\n", - " time = benchmark_torch_function(conf.batch_iter, lambda: lowered_module(*input))\n", - " else:\n", - " print(\"Lowering with JIT is not available!\", \"red\")\n", - "\n", - " result = Result(module=module, input=input, conf=conf, time_sec=time)\n", - " return result\n", - "\n", - "\n", - "@torch.inference_mode()\n", - "def benchmark(\n", - " model,\n", - " inputs,\n", - " batch_iter: int,\n", - " batch_size: int,\n", - ") -> None:\n", - " model = model.cuda().eval()\n", - " inputs = [x.cuda() for x in inputs]\n", - "\n", - " # benchmark base configuration\n", - " conf = Configuration(batch_iter=batch_iter, batch_size=batch_size)\n", - "\n", - " configurations = [\n", - " # Baseline\n", - " replace(conf, name=\"CUDA Eager\", trt=False),\n", - " # FP32\n", - " replace(\n", - " conf,\n", - " name=\"TRT FP32 Eager\",\n", - " trt=True,\n", - " jit=False,\n", - " fp16=False,\n", - " accuracy_rtol=1e-3,\n", - " ),\n", - " # FP16\n", - " replace(\n", - " conf,\n", - " name=\"TRT FP16 Eager\",\n", - " trt=True,\n", - " jit=False,\n", - " fp16=True,\n", - " accuracy_rtol=1e-2,\n", - " ),\n", - " ]\n", - "\n", - " results = [run_configuration_benchmark(deepcopy(model), inputs, conf_) for conf_ in configurations]\n", - "\n", - " for res in results:\n", - " print(res.format())\n", - "\n", - "\n", - "test_model = torchvision.models.resnet18(pretrained=True)\n", - "input = [torch.rand(128, 3, 224, 224)]\n", - "benchmark(test_model, input, 50, 128)" - ], - "execution_count": 8, - "outputs": [ - { - "output_type": "stream", - "name": "stderr", - "text": [ - "I0822 103458.189 manifold.py:1435] URL manifold://torchvision/tree/models/resnet18-f37072fd.pth was already cached in /home/wwei6/.torch/iopath_cache/manifold_cache/tree/models/resnet18-f37072fd.pth\n" - ] - }, - { - "output_type": "stream", - "name": "stdout", - "text": [ - "=== Running benchmark for: Configuration(batch_iter=50, batch_size=128, name='CUDA Eager', trt=False, jit=False, fp16=False, accuracy_rtol=-1) green\n== Start benchmark iterations\n" - ] - }, - { - "output_type": "stream", - "name": "stdout", - "text": [ - "== End benchmark iterations\n=== Running benchmark for: Configuration(batch_iter=50, batch_size=128, name='TRT FP32 Eager', trt=True, jit=False, fp16=False, accuracy_rtol=0.001) green\n" - ] - }, - { - "output_type": "stream", - "name": "stderr", - "text": [ - "I0822 103501.297 pass_utils.py:166] == Log pass before/after graph to /tmp/tmpe_7p37fq\n" - ] - }, - { - "output_type": "stream", - "name": "stderr", - "text": [ - "I0822 103501.390 pass_utils.py:166] == Log pass before/after graph to /tmp/tmpg_a347f0\n" - ] - }, - { - "output_type": "stream", - "name": "stderr", - "text": [ - "I0822 103501.509 lower_pass_manager_builder.py:151] Now lowering submodule _run_on_acc_0\n" - ] - }, - { - "output_type": "stream", - "name": "stderr", - "text": [ - "I0822 103501.511 lower.py:89] split_name='_run_on_acc_0' self.lower_setting.input_specs=[InputTensorSpec(shape=torch.Size([128, 3, 224, 224]), dtype=torch.float32, device=device(type='cuda', index=0), shape_ranges=[], has_batch_dim=True)]\n" - ] - }, - { - "output_type": "stream", - "name": "stdout", - "text": [ - "\nSupported node types in the model:\nacc_ops.conv2d: ((), {'input': torch.float32, 'weight': 
torch.float32})\nacc_ops.batch_norm: ((), {'input': torch.float32, 'running_mean': torch.float32, 'running_var': torch.float32, 'weight': torch.float32, 'bias': torch.float32})\nacc_ops.relu: ((), {'input': torch.float32})\nacc_ops.max_pool2d: ((), {'input': torch.float32})\nacc_ops.add: ((), {'input': torch.float32, 'other': torch.float32})\nacc_ops.adaptive_avg_pool2d: ((), {'input': torch.float32})\nacc_ops.flatten: ((), {'input': torch.float32})\nacc_ops.linear: ((), {'input': torch.float32, 'weight': torch.float32, 'bias': torch.float32})\n\nUnsupported node types in the model:\n\nGot 1 acc subgraphs and 0 non-acc subgraphs\n" - ] - }, - { - "output_type": "stream", - "name": "stderr", - "text": [ - "I0822 103503.964 fx2trt.py:204] Run Module elapsed time: 0:00:00.435984\n" - ] - }, - { - "output_type": "stream", - "name": "stderr", - "text": [ - "I0822 103520.647 fx2trt.py:258] Build TRT engine elapsed time: 0:00:16.681226\n" - ] - }, - { - "output_type": "stream", - "name": "stderr", - "text": [ - "I0822 103520.658 lower_pass_manager_builder.py:168] Lowering submodule _run_on_acc_0 elapsed time 0:00:19.147071\n" - ] - }, - { - "output_type": "stream", - "name": "stdout", - "text": [ - "== Start benchmark iterations\n" - ] - }, - { - "output_type": "stream", - "name": "stdout", - "text": [ - "== End benchmark iterations\n=== Running benchmark for: Configuration(batch_iter=50, batch_size=128, name='TRT FP16 Eager', trt=True, jit=False, fp16=True, accuracy_rtol=0.01) green\n" - ] - }, - { - "output_type": "stream", - "name": "stderr", - "text": [ - "I0822 103523.067 pass_utils.py:166] == Log pass before/after graph to /tmp/tmpgphlicna\n" - ] - }, - { - "output_type": "stream", - "name": "stderr", - "text": [ - "I0822 103523.106 pass_utils.py:166] == Log pass before/after graph to /tmp/tmpy9cumddi\n" - ] - }, - { - "output_type": "stream", - "name": "stderr", - "text": [ - "I0822 103523.173 lower_pass_manager_builder.py:151] Now lowering submodule _run_on_acc_0\n" - ] - }, - { - "output_type": "stream", - "name": "stderr", - "text": [ - "I0822 103523.174 lower.py:89] split_name='_run_on_acc_0' self.lower_setting.input_specs=[InputTensorSpec(shape=torch.Size([128, 3, 224, 224]), dtype=torch.float16, device=device(type='cuda', index=0), shape_ranges=[], has_batch_dim=True)]\n" - ] - }, - { - "output_type": "stream", - "name": "stdout", - "text": [ - "\nSupported node types in the model:\nacc_ops.conv2d: ((), {'input': torch.float16, 'weight': torch.float16})\nacc_ops.batch_norm: ((), {'input': torch.float16, 'running_mean': torch.float16, 'running_var': torch.float16, 'weight': torch.float16, 'bias': torch.float16})\nacc_ops.relu: ((), {'input': torch.float16})\nacc_ops.max_pool2d: ((), {'input': torch.float16})\nacc_ops.add: ((), {'input': torch.float16, 'other': torch.float16})\nacc_ops.adaptive_avg_pool2d: ((), {'input': torch.float16})\nacc_ops.flatten: ((), {'input': torch.float16})\nacc_ops.linear: ((), {'input': torch.float16, 'weight': torch.float16, 'bias': torch.float16})\n\nUnsupported node types in the model:\n\nGot 1 acc subgraphs and 0 non-acc subgraphs\n" - ] - }, - { - "output_type": "stream", - "name": "stderr", - "text": [ - "I0822 103523.466 fx2trt.py:204] Run Module elapsed time: 0:00:00.288043\n" - ] - }, - { - "output_type": "stream", - "name": "stderr", - "text": [ - "I0822 103553.687 fx2trt.py:258] Build TRT engine elapsed time: 0:00:30.220316\n" - ] - }, - { - "output_type": "stream", - "name": "stderr", - "text": [ - "I0822 103553.698 
lower_pass_manager_builder.py:168] Lowering submodule _run_on_acc_0 elapsed time 0:00:30.523791\n" - ] - }, - { - "output_type": "stream", - "name": "stdout", - "text": [ - "== Start benchmark iterations\n" - ] - }, - { - "output_type": "stream", - "name": "stdout", - "text": [ - "== End benchmark iterations\n== Benchmark Result for: Configuration(batch_iter=50, batch_size=128, name='CUDA Eager', trt=False, jit=False, fp16=False, accuracy_rtol=-1)\nBS: 128, Time per iter: 14.66ms, QPS: 8732.53, Accuracy: None (rtol=-1)\n== Benchmark Result for: Configuration(batch_iter=50, batch_size=128, name='TRT FP32 Eager', trt=True, jit=False, fp16=False, accuracy_rtol=0.001)\nBS: 128, Time per iter: 7.27ms, QPS: 17595.70, Accuracy: None (rtol=0.001)\n== Benchmark Result for: Configuration(batch_iter=50, batch_size=128, name='TRT FP16 Eager', trt=True, jit=False, fp16=True, accuracy_rtol=0.01)\nBS: 128, Time per iter: 4.49ms, QPS: 28480.34, Accuracy: None (rtol=0.01)\n" - ] - } - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "\nSupported node types in the model:\nacc_ops.conv2d: ((), {'input': torch.float32, 'weight': torch.float32})\nacc_ops.batch_norm: ((), {'input': torch.float32, 'running_mean': torch.float32, 'running_var': torch.float32, 'weight': torch.float32, 'bias': torch.float32})\nacc_ops.relu: ((), {'input': torch.float32})\nacc_ops.max_pool2d: ((), {'input': torch.float32})\nacc_ops.add: ((), {'input': torch.float32, 'other': torch.float32})\nacc_ops.adaptive_avg_pool2d: ((), {'input': torch.float32})\nacc_ops.flatten: ((), {'input': torch.float32})\nacc_ops.linear: ((), {'input': torch.float32, 'weight': torch.float32, 'bias': torch.float32})\n\nUnsupported node types in the model:\n\nGot 1 acc subgraphs and 0 non-acc subgraphs\n" + ] }, { - "cell_type": "markdown", - "metadata": { - "originalKey": "80bbae99-41ff-4baa-94a5-12bf0c9938f3", - "showInput": true, - "customInput": null - }, - "source": [ - "" - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "I0822 103503.964 fx2trt.py:204] Run Module elapsed time: 0:00:00.435984\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "I0822 103520.647 fx2trt.py:258] Build TRT engine elapsed time: 0:00:16.681226\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "I0822 103520.658 lower_pass_manager_builder.py:168] Lowering submodule _run_on_acc_0 elapsed time 0:00:19.147071\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "== Start benchmark iterations\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "== End benchmark iterations\n=== Running benchmark for: Configuration(batch_iter=50, batch_size=128, name='TRT FP16 Eager', trt=True, jit=False, fp16=True, accuracy_rtol=0.01) green\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "I0822 103523.067 pass_utils.py:166] == Log pass before/after graph to /tmp/tmpgphlicna\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "I0822 103523.106 pass_utils.py:166] == Log pass before/after graph to /tmp/tmpy9cumddi\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "I0822 103523.173 lower_pass_manager_builder.py:151] Now lowering submodule _run_on_acc_0\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "I0822 103523.174 lower.py:89] split_name='_run_on_acc_0' self.lower_setting.input_specs=[InputTensorSpec(shape=torch.Size([128, 3, 224, 224]), dtype=torch.float16, 
device=device(type='cuda', index=0), shape_ranges=[], has_batch_dim=True)]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\nSupported node types in the model:\nacc_ops.conv2d: ((), {'input': torch.float16, 'weight': torch.float16})\nacc_ops.batch_norm: ((), {'input': torch.float16, 'running_mean': torch.float16, 'running_var': torch.float16, 'weight': torch.float16, 'bias': torch.float16})\nacc_ops.relu: ((), {'input': torch.float16})\nacc_ops.max_pool2d: ((), {'input': torch.float16})\nacc_ops.add: ((), {'input': torch.float16, 'other': torch.float16})\nacc_ops.adaptive_avg_pool2d: ((), {'input': torch.float16})\nacc_ops.flatten: ((), {'input': torch.float16})\nacc_ops.linear: ((), {'input': torch.float16, 'weight': torch.float16, 'bias': torch.float16})\n\nUnsupported node types in the model:\n\nGot 1 acc subgraphs and 0 non-acc subgraphs\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "I0822 103523.466 fx2trt.py:204] Run Module elapsed time: 0:00:00.288043\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "I0822 103553.687 fx2trt.py:258] Build TRT engine elapsed time: 0:00:30.220316\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "I0822 103553.698 lower_pass_manager_builder.py:168] Lowering submodule _run_on_acc_0 elapsed time 0:00:30.523791\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "== Start benchmark iterations\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "== End benchmark iterations\n== Benchmark Result for: Configuration(batch_iter=50, batch_size=128, name='CUDA Eager', trt=False, jit=False, fp16=False, accuracy_rtol=-1)\nBS: 128, Time per iter: 14.66ms, QPS: 8732.53, Accuracy: None (rtol=-1)\n== Benchmark Result for: Configuration(batch_iter=50, batch_size=128, name='TRT FP32 Eager', trt=True, jit=False, fp16=False, accuracy_rtol=0.001)\nBS: 128, Time per iter: 7.27ms, QPS: 17595.70, Accuracy: None (rtol=0.001)\n== Benchmark Result for: Configuration(batch_iter=50, batch_size=128, name='TRT FP16 Eager', trt=True, jit=False, fp16=True, accuracy_rtol=0.01)\nBS: 128, Time per iter: 4.49ms, QPS: 28480.34, Accuracy: None (rtol=0.01)\n" + ] } - ] + ], + "source": [ + "def benchmark_torch_function(iters: int, f, *args) -> float:\n", + " \"\"\"Estimates the average time duration for a single inference call in second\n", + "\n", + " If the input is batched, then the estimation is for the batches inference call.\n", + " \"\"\"\n", + " with torch.inference_mode():\n", + " f(*args)\n", + " torch.cuda.synchronize()\n", + " start_event = torch.cuda.Event(enable_timing=True)\n", + " end_event = torch.cuda.Event(enable_timing=True)\n", + " print(\"== Start benchmark iterations\")\n", + " with torch.inference_mode():\n", + " start_event.record()\n", + " for _ in range(iters):\n", + " f(*args)\n", + " end_event.record()\n", + " torch.cuda.synchronize()\n", + " print(\"== End benchmark iterations\")\n", + " return (start_event.elapsed_time(end_event) * 1.0e-3) / iters\n", + "\n", + "\n", + "def run_configuration_benchmark(\n", + " module,\n", + " input,\n", + " conf: Configuration,\n", + ") -> Result:\n", + " print(f\"=== Running benchmark for: {conf}\", \"green\")\n", + " time = -1.0\n", + "\n", + " if conf.fp16:\n", + " module = module.half()\n", + " input = [i.half() for i in input]\n", + "\n", + " if not conf.trt:\n", + " # Run eager mode benchmark\n", + " time = benchmark_torch_function(conf.batch_iter, lambda: 
module(*input))\n", + " elif not conf.jit:\n", + " # Run lowering eager mode benchmark\n", + " lowered_module = compile(\n", + " module,\n", + " input,\n", + " max_batch_size=conf.batch_size,\n", + " lower_precision=LowerPrecision.FP16 if conf.fp16 else LowerPrecision.FP32,\n", + " )\n", + " time = benchmark_torch_function(conf.batch_iter, lambda: lowered_module(*input))\n", + " else:\n", + " print(\"Lowering with JIT is not available!\", \"red\")\n", + "\n", + " result = Result(module=module, input=input, conf=conf, time_sec=time)\n", + " return result\n", + "\n", + "\n", + "@torch.inference_mode()\n", + "def benchmark(\n", + " model,\n", + " inputs,\n", + " batch_iter: int,\n", + " batch_size: int,\n", + ") -> None:\n", + " model = model.cuda().eval()\n", + " inputs = [x.cuda() for x in inputs]\n", + "\n", + " # benchmark base configuration\n", + " conf = Configuration(batch_iter=batch_iter, batch_size=batch_size)\n", + "\n", + " configurations = [\n", + " # Baseline\n", + " replace(conf, name=\"CUDA Eager\", trt=False),\n", + " # FP32\n", + " replace(\n", + " conf,\n", + " name=\"TRT FP32 Eager\",\n", + " trt=True,\n", + " jit=False,\n", + " fp16=False,\n", + " accuracy_rtol=1e-3,\n", + " ),\n", + " # FP16\n", + " replace(\n", + " conf,\n", + " name=\"TRT FP16 Eager\",\n", + " trt=True,\n", + " jit=False,\n", + " fp16=True,\n", + " accuracy_rtol=1e-2,\n", + " ),\n", + " ]\n", + "\n", + " results = [run_configuration_benchmark(deepcopy(model), inputs, conf_) for conf_ in configurations]\n", + "\n", + " for res in results:\n", + " print(res.format())\n", + "\n", + "\n", + "test_model = torchvision.models.resnet18(pretrained=True)\n", + "input = [torch.rand(128, 3, 224, 224)]\n", + "benchmark(test_model, input, 50, 128)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "customInput": null, + "originalKey": "80bbae99-41ff-4baa-94a5-12bf0c9938f3", + "showInput": true + }, + "source": [ + "" + ] + } + ], + "metadata": { + "bento_stylesheets": { + "bento/extensions/flow/main.css": true, + "bento/extensions/kernel_selector/main.css": true, + "bento/extensions/kernel_ui/main.css": true, + "bento/extensions/new_kernel/main.css": true, + "bento/extensions/system_usage/main.css": true, + "bento/extensions/theme/main.css": true + }, + "dataExplorerConfig": {}, + "kernelspec": { + "display_name": "dper3_pytorch (cinder)", + "language": "python", + "metadata": { + "cinder_runtime": true, + "fbpkg_supported": true, + "is_prebuilt": true, + "kernel_name": "bento_kernel_dper3_pytorch_cinder", + "nightly_builds": false + }, + "name": "bento_kernel_dper3_pytorch_cinder" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3" + }, + "last_base_url": "https://devgpu005.ftw6.facebook.com:8091/", + "last_kernel_id": "5f014373-151c-4ee8-8939-4daab994d202", + "last_msg_id": "687e81e8-4414f32c89cd026dd1ea3fd9_139", + "last_server_session_id": "24a1a10c-29aa-4e2b-a11f-2b5108fc1e58", + "outputWidgetContext": {} + }, + "nbformat": 4, + "nbformat_minor": 2 } diff --git a/notebooks/lenet-getting-started.ipynb b/notebooks/lenet-getting-started.ipynb index 144d47813b..2cf06d2c05 100644 --- a/notebooks/lenet-getting-started.ipynb +++ b/notebooks/lenet-getting-started.ipynb @@ -193,7 +193,7 @@ "metadata": {}, "outputs": [], "source": [ - "import torch \n", + "import torch\n", "from torch import nn\n", "import torch.nn.functional as F\n", 
"\n", @@ -258,7 +258,7 @@ " input_data = input_data.to(\"cuda\")\n", " if dtype=='fp16':\n", " input_data = input_data.half()\n", - " \n", + "\n", " print(\"Warm up ...\")\n", " with torch.no_grad():\n", " for _ in range(nwarmup):\n", @@ -278,9 +278,8 @@ "\n", " print(\"Input shape:\", input_data.size())\n", " print(\"Output features size:\", features.size())\n", - " \n", - " print('Average batch time: %.2f ms'%(np.mean(timings)*1000))\n", - " " + "\n", + " print('Average batch time: %.2f ms'%(np.mean(timings)*1000))\n" ] }, { @@ -559,7 +558,7 @@ " opt_shape=[1024, 1, 33, 33],\n", " max_shape=[1024, 1, 34, 34],\n", " dtype=torch.half\n", - " )], \n", + " )],\n", " enabled_precisions = {torch.half})\n", "\n", "input_data = torch.randn((1024, 1, 32, 32))\n", diff --git a/notebooks/qat-ptq-workflow.ipynb b/notebooks/qat-ptq-workflow.ipynb index c0e719b3b4..7b6bf6ef89 100644 --- a/notebooks/qat-ptq-workflow.ipynb +++ b/notebooks/qat-ptq-workflow.ipynb @@ -117,20 +117,17 @@ "import pytorch_quantization\n", "from pytorch_quantization import nn as quant_nn\n", "from pytorch_quantization import quant_modules\n", - "from pytorch_quantization.tensor_quant import QuantDescriptor\n", "from pytorch_quantization import calib\n", "from tqdm import tqdm\n", "\n", "print(pytorch_quantization.__version__)\n", "\n", "import os\n", - "import sys\n", "import warnings\n", "import time\n", "import numpy as np\n", "import wget\n", "import tarfile\n", - "import shutil\n", "warnings.simplefilter('ignore')" ] }, @@ -194,9 +191,9 @@ "outputs": [], "source": [ "# Define main data directory\n", - "DATA_DIR = './data/imagenette2-320' \n", + "DATA_DIR = './data/imagenette2-320'\n", "# Define training and validation data paths\n", - "TRAIN_DIR = os.path.join(DATA_DIR, 'train') \n", + "TRAIN_DIR = os.path.join(DATA_DIR, 'train')\n", "VAL_DIR = os.path.join(DATA_DIR, 'val')" ] }, @@ -286,14 +283,14 @@ "metadata": {}, "outputs": [], "source": [ - "#This function allows you to set the all the parameters to not have gradients, \n", - "#allowing you to freeze the model and not undergo training during the train step. \n", + "#This function allows you to set the all the parameters to not have gradients,\n", + "#allowing you to freeze the model and not undergo training during the train step.\n", "def set_parameter_requires_grad(model, feature_extracting):\n", " if feature_extracting:\n", " for param in model.parameters():\n", " param.requires_grad = False\n", "\n", - "feature_extract = True #This varaible can be set False if you want to finetune the model by updating all the parameters. 
\n", + "feature_extract = True #This varaible can be set False if you want to finetune the model by updating all the parameters.\n", "model = models.mobilenet_v2(pretrained=True)\n", "set_parameter_requires_grad(model, feature_extract)\n", "#Define a classification head for 10 classes.\n", @@ -338,7 +335,7 @@ " if batch % 100 == 99:\n", " print(\"Batch: [%5d | %5d] loss: %.3f\" % (batch + 1, len(dataloader), running_loss / 100))\n", " running_loss = 0.0\n", - " \n", + "\n", "def evaluate(model, dataloader, crit, epoch):\n", " total = 0\n", " correct = 0\n", @@ -365,7 +362,7 @@ "def save_checkpoint(state, ckpt_path=\"checkpoint.pth\"):\n", " torch.save(state, ckpt_path)\n", " print(\"Checkpoint saved\")\n", - " \n", + "\n", "cudnn.benchmark = True\n", "# Helper function to benchmark the model\n", "def benchmark(model, input_shape=(1024, 1, 32, 32), dtype='fp32', nwarmup=50, nruns=1000):\n", @@ -373,7 +370,7 @@ " input_data = input_data.to(\"cuda\")\n", " if dtype=='fp16':\n", " input_data = input_data.half()\n", - " \n", + "\n", " with torch.no_grad():\n", " for _ in range(nwarmup):\n", " features = model(input_data)\n", @@ -426,7 +423,7 @@ " test_loss, test_acc = evaluate(model, val_dataloader, criterion, epoch)\n", "\n", " print(\"Test Loss: {:.5f} Test Acc: {:.2f}%\".format(test_loss, 100 * test_acc))\n", - " \n", + "\n", "save_checkpoint({'epoch': epoch + 1,\n", " 'model_state_dict': model.state_dict(),\n", " 'acc': test_acc,\n", @@ -576,7 +573,7 @@ " \"enabled_precisions\": torch.int8,\n", " \"calibrator\": calibrator,\n", " \"truncate_long_and_double\": True\n", - " \n", + "\n", " }\n", "trt_ptq = torch_tensorrt.compile(baseline_model, **compile_spec)" ] @@ -772,7 +769,7 @@ " test_loss, test_acc = evaluate(q_model, val_dataloader, criterion, epoch)\n", "\n", " print(\"Test Loss: {:.5f} Test Acc: {:.2f}%\".format(test_loss, 100 * test_acc))\n", - " \n", + "\n", "save_checkpoint({'epoch': epoch + 1,\n", " 'model_state_dict': q_model.state_dict(),\n", " 'acc': test_acc,\n", diff --git a/notebooks/ssd-object-detection-demo.ipynb b/notebooks/ssd-object-detection-demo.ipynb index b7ae8dc2e8..f48fb2bccd 100644 --- a/notebooks/ssd-object-detection-demo.ipynb +++ b/notebooks/ssd-object-detection-demo.ipynb @@ -403,7 +403,7 @@ "tensor = utils.prepare_tensor(inputs, False)\n", "\n", "# The model was trained on COCO dataset, which we need to access in order to\n", - "# translate class IDs into object names. \n", + "# translate class IDs into object names.\n", "classes_to_labels = utils.get_coco_object_dictionary()" ] }, @@ -417,8 +417,8 @@ "model = ssd300.eval().to(\"cuda\")\n", "detections_batch = model(tensor)\n", "\n", - "# By default, raw output from SSD network per input image contains 8732 boxes with \n", - "# localization and class probability distribution. 
\n", + "# By default, raw output from SSD network per input image contains 8732 boxes with\n", + "# localization and class probability distribution.\n", "# Let’s filter this output to only get reasonable detections (confidence>40%) in a more comprehensive format.\n", "results_per_input = utils.decode_results(detections_batch)\n", "best_results_per_input = [utils.pick_best(results, 0.40) for results in results_per_input]" @@ -530,7 +530,7 @@ " input_data = input_data.to(\"cuda\")\n", " if dtype=='fp16':\n", " input_data = input_data.half()\n", - " \n", + "\n", " print(\"Warm up ...\")\n", " with torch.no_grad():\n", " for _ in range(nwarmup):\n", @@ -718,7 +718,7 @@ "\n", "# The compiled module will have precision as specified by \"op_precision\".\n", "# Here, it will have FP16 precision.\n", - "trt_model = torch_tensorrt.compile(traced_model, \n", + "trt_model = torch_tensorrt.compile(traced_model,\n", " inputs= [torch_tensorrt.Input((3, 3, 300, 300), dtype=torch.half)],\n", " enabled_precisions= {torch.half}, # Run with FP16\n", " workspace_size= 1 << 20\n", @@ -750,8 +750,8 @@ "# using a Torch-TensorRT module is exactly the same as how we usually do inference in PyTorch i.e. model(inputs)\n", "detections_batch = trt_model(tensor.to(torch.half)) # convert the input to half precision\n", "\n", - "# By default, raw output from SSD network per input image contains 8732 boxes with \n", - "# localization and class probability distribution. \n", + "# By default, raw output from SSD network per input image contains 8732 boxes with\n", + "# localization and class probability distribution.\n", "# Let’s filter this output to only get reasonable detections (confidence>40%) in a more comprehensive format.\n", "results_per_input = utils.decode_results(detections_batch)\n", "best_results_per_input_trt = [utils.pick_best(results, 0.40) for results in results_per_input]" diff --git a/notebooks/vgg-qat.ipynb b/notebooks/vgg-qat.ipynb index 5888950378..1232b03393 100644 --- a/notebooks/vgg-qat.ipynb +++ b/notebooks/vgg-qat.ipynb @@ -97,12 +97,10 @@ "import torchvision.datasets as datasets\n", "import torch_tensorrt\n", "\n", - "from torch.utils.tensorboard import SummaryWriter\n", "\n", "import pytorch_quantization\n", "from pytorch_quantization import nn as quant_nn\n", "from pytorch_quantization import quant_modules\n", - "from pytorch_quantization.tensor_quant import QuantDescriptor\n", "from pytorch_quantization import calib\n", "from tqdm import tqdm\n", "\n", @@ -209,7 +207,7 @@ " if batch % 500 == 499:\n", " print(\"Batch: [%5d | %5d] loss: %.3f\" % (batch + 1, len(dataloader), running_loss / 100))\n", " running_loss = 0.0\n", - " \n", + "\n", "def test(model, dataloader, crit, epoch):\n", " global writer\n", " global classes\n", @@ -440,7 +438,7 @@ " test_loss, test_acc = test(model, testing_dataloader, crit, epoch)\n", "\n", " print(\"Test Loss: {:.5f} Test Acc: {:.2f}%\".format(test_loss, 100 * test_acc))\n", - " \n", + "\n", "save_checkpoint({'epoch': epoch + 1,\n", " 'model_state_dict': model.state_dict(),\n", " 'acc': test_acc,\n", @@ -831,7 +829,7 @@ " test_loss, test_acc = test(qat_model, testing_dataloader, crit, epoch)\n", "\n", " print(\"Test Loss: {:.5f} Test Acc: {:.2f}%\".format(test_loss, 100 * test_acc))\n", - " \n", + "\n", "save_checkpoint({'epoch': epoch + 1,\n", " 'model_state_dict': qat_model.state_dict(),\n", " 'acc': test_acc,\n", @@ -1097,7 +1095,7 @@ " input_data = input_data.to(\"cuda\")\n", " if dtype=='fp16':\n", " input_data = input_data.half()\n", - " \n", + "\n", " 
print(\"Warm up ...\")\n", " with torch.no_grad():\n", " for _ in range(nwarmup):\n", diff --git a/py/torch_tensorrt/_Device.py b/py/torch_tensorrt/_Device.py index e92085d3a3..33941d1e7b 100644 --- a/py/torch_tensorrt/_Device.py +++ b/py/torch_tensorrt/_Device.py @@ -9,12 +9,11 @@ else: from typing_extensions import Self +import tensorrt as trt import torch from torch_tensorrt._enums import DeviceType from torch_tensorrt._features import needs_torch_tensorrt_runtime -import tensorrt as trt - class Device(object): """ diff --git a/py/torch_tensorrt/_compile.py b/py/torch_tensorrt/_compile.py index 302928a784..e9c5c3d622 100644 --- a/py/torch_tensorrt/_compile.py +++ b/py/torch_tensorrt/_compile.py @@ -653,7 +653,6 @@ def save( ) torch.export.save(module, file_path) elif module_type == _ModuleType.fx: - # The module type is torch.fx.GraphModule if output_format == "torchscript": module_ts = torch.jit.trace( @@ -671,7 +670,6 @@ def save( exp_program = export(module) torch.export.save(exp_program, file_path) else: - if arg_inputs is None: raise ValueError( "Provided model is a torch.fx.GraphModule and retrace is True, however the inputs or arg_inputs are empty. Please provide valid torch.tensors as inputs or arg_inputs to trace and save the model" diff --git a/py/torch_tensorrt/_features.py b/py/torch_tensorrt/_features.py index 5e95bacee0..8da7ac6fff 100644 --- a/py/torch_tensorrt/_features.py +++ b/py/torch_tensorrt/_features.py @@ -44,9 +44,7 @@ def _enabled_features_str() -> str: enabled = lambda x: "ENABLED" if x else "DISABLED" - out_str: str = ( - f"Enabled Features:\n - Dynamo Frontend: {enabled(_DYNAMO_FE_AVAIL)}\n - Torch-TensorRT Runtime: {enabled(_TORCHTRT_RT_AVAIL)}\n - FX Frontend: {enabled(_FX_FE_AVAIL)}\n - TorchScript Frontend: {enabled(_TS_FE_AVAIL)}\n" # type: ignore[no-untyped-call] - ) + out_str: str = f"Enabled Features:\n - Dynamo Frontend: {enabled(_DYNAMO_FE_AVAIL)}\n - Torch-TensorRT Runtime: {enabled(_TORCHTRT_RT_AVAIL)}\n - FX Frontend: {enabled(_FX_FE_AVAIL)}\n - TorchScript Frontend: {enabled(_TS_FE_AVAIL)}\n" # type: ignore[no-untyped-call] return out_str diff --git a/py/torch_tensorrt/dynamo/_engine_cache.py b/py/torch_tensorrt/dynamo/_engine_cache.py index 7835c419d0..83f75dc4e9 100644 --- a/py/torch_tensorrt/dynamo/_engine_cache.py +++ b/py/torch_tensorrt/dynamo/_engine_cache.py @@ -29,7 +29,6 @@ class BaseEngineCache(ABC): - @abstractmethod def __init__( self, @@ -224,7 +223,6 @@ def __init__( engine_cache_dir: str, engine_cache_size: int, ) -> None: - def get_dir_size(path: str) -> int: total = 0 with os.scandir(path) as it: diff --git a/py/torch_tensorrt/dynamo/_exporter.py b/py/torch_tensorrt/dynamo/_exporter.py index c7a063d675..f2d4cfee88 100644 --- a/py/torch_tensorrt/dynamo/_exporter.py +++ b/py/torch_tensorrt/dynamo/_exporter.py @@ -112,7 +112,6 @@ def lift( non_user_input_idx = 0 for node in gm.graph.nodes: if node.op == "get_attr": - lift_val = None input_kind = None diff --git a/py/torch_tensorrt/dynamo/_refit.py b/py/torch_tensorrt/dynamo/_refit.py index f1041682f8..64c2382582 100644 --- a/py/torch_tensorrt/dynamo/_refit.py +++ b/py/torch_tensorrt/dynamo/_refit.py @@ -251,7 +251,6 @@ def refit_module_weights( # Get the settings and check the setting to be uniform settings: Optional[CompilationSettings] = None if inline_module: - # Obtain the settings compiled_submodules = [ (name.replace("_engine", ""), engine) @@ -362,7 +361,6 @@ def refit_module_weights( # Generate the corresponding TRT Module for those for name, new_submodule in 
new_partitioned_module.named_children(): - # Refit each submodule # Extract engine from the submodule try: diff --git a/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py b/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py index d7c0ea449e..83fcfbff36 100644 --- a/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py +++ b/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py @@ -197,7 +197,6 @@ def _populate_trt_builder_config( algorithm_selector: Optional[trt.IAlgorithmSelector] = None, tactic_sources: Optional[int] = None, ) -> trt.IBuilderConfig: - builder_config = self.builder.create_builder_config() if self.compilation_settings.debug: diff --git a/py/torch_tensorrt/dynamo/conversion/_conversion.py b/py/torch_tensorrt/dynamo/conversion/_conversion.py index 6dad862892..1dad18989c 100644 --- a/py/torch_tensorrt/dynamo/conversion/_conversion.py +++ b/py/torch_tensorrt/dynamo/conversion/_conversion.py @@ -94,7 +94,6 @@ def convert_module( rt_cls = PythonTorchTensorRTModule if ENABLED_FEATURES.torch_tensorrt_runtime and not settings.use_python_runtime: - from torch_tensorrt.dynamo.runtime import TorchTensorRTModule rt_cls = TorchTensorRTModule @@ -102,7 +101,6 @@ def convert_module( elif ( not ENABLED_FEATURES.torch_tensorrt_runtime and not settings.use_python_runtime ): - logger.info( "Since Torch-TensorRT runtime is not available, using Python Runtime, some features may not be available" ) diff --git a/py/torch_tensorrt/dynamo/conversion/converter_utils.py b/py/torch_tensorrt/dynamo/conversion/converter_utils.py index f4bb4877cc..62526080c4 100644 --- a/py/torch_tensorrt/dynamo/conversion/converter_utils.py +++ b/py/torch_tensorrt/dynamo/conversion/converter_utils.py @@ -6,6 +6,7 @@ from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union, overload import numpy as np +import tensorrt as trt import torch import torch_tensorrt.dynamo.conversion.impl as impl from torch.fx.node import Argument, Target @@ -19,8 +20,6 @@ DynamoConverterImplSignature, ) -import tensorrt as trt - from ..types import Shape, TRTDataType, TRTLayer, TRTTensor _LOGGER: logging.Logger = logging.getLogger(__name__) diff --git a/py/torch_tensorrt/dynamo/conversion/impl/arange.py b/py/torch_tensorrt/dynamo/conversion/impl/arange.py index 72eda19733..5b24c641b6 100644 --- a/py/torch_tensorrt/dynamo/conversion/impl/arange.py +++ b/py/torch_tensorrt/dynamo/conversion/impl/arange.py @@ -22,7 +22,6 @@ def arange( end: Union[int, TRTTensor], step: Union[int, TRTTensor], ) -> TRTTensor: - if any(isinstance(tensor, TRTTensor) for tensor in (start, end, step)): start_rank_0 = get_trt_tensor(ctx, start, name + "_start_rank_0", min_rank=0) start_rank_1 = get_trt_tensor(ctx, start, name + "_start_rank_1", min_rank=1) diff --git a/py/torch_tensorrt/dynamo/conversion/impl/elementwise/ops.py b/py/torch_tensorrt/dynamo/conversion/impl/elementwise/ops.py index ec3cfcf28c..17e5042ce7 100644 --- a/py/torch_tensorrt/dynamo/conversion/impl/elementwise/ops.py +++ b/py/torch_tensorrt/dynamo/conversion/impl/elementwise/ops.py @@ -457,7 +457,6 @@ def add( lhs_val: Union[TRTTensor, int, float], rhs_val: Union[TRTTensor, int, float], ) -> TRTTensor: - return convert_binary_elementwise( ctx, target, source_ir, name, trt.ElementWiseOperation.SUM, lhs_val, rhs_val ) diff --git a/py/torch_tensorrt/dynamo/conversion/impl/matmul.py b/py/torch_tensorrt/dynamo/conversion/impl/matmul.py index 2480d15df2..83ea3dd99b 100644 --- a/py/torch_tensorrt/dynamo/conversion/impl/matmul.py +++ 
b/py/torch_tensorrt/dynamo/conversion/impl/matmul.py @@ -25,7 +25,6 @@ def matrix_multiply( input_matrix_op: trt.MatrixOperation = trt.MatrixOperation.NONE, other_matrix_op: trt.MatrixOperation = trt.MatrixOperation.NONE, ) -> TRTTensor: - if not isinstance(input, trt.ITensor): input = get_trt_tensor(ctx, input, f"{name}_input") if not isinstance(other, trt.ITensor): diff --git a/py/torch_tensorrt/dynamo/conversion/impl/normalization/ops.py b/py/torch_tensorrt/dynamo/conversion/impl/normalization/ops.py index a46a9319c4..7e5b03a87e 100644 --- a/py/torch_tensorrt/dynamo/conversion/impl/normalization/ops.py +++ b/py/torch_tensorrt/dynamo/conversion/impl/normalization/ops.py @@ -42,7 +42,6 @@ def batch_norm( cudnn_enabled: bool, return_mean_rstd: bool, ) -> Union[TRTTensor, Tuple[TRTTensor, torch.Tensor, torch.Tensor]]: - if has_dynamic_shape(input.shape): assert input.shape[1] != -1, "Channel dim can't be dynamic for batch norm." diff --git a/py/torch_tensorrt/dynamo/conversion/impl/pad.py b/py/torch_tensorrt/dynamo/conversion/impl/pad.py index 8cc6bd42c8..731058a122 100644 --- a/py/torch_tensorrt/dynamo/conversion/impl/pad.py +++ b/py/torch_tensorrt/dynamo/conversion/impl/pad.py @@ -112,7 +112,6 @@ def constant_padNd( pad: Sequence[Union[int, TRTTensor]], value: Union[int, float] = 0, ) -> TRTTensor: - rank = len(input.shape) start_indices_tensor, padded_shape_tensor = get_padded_shape_tensors( diff --git a/py/torch_tensorrt/dynamo/conversion/impl/slice/ops.py b/py/torch_tensorrt/dynamo/conversion/impl/slice/ops.py index 3274d78c2b..990b01eb70 100644 --- a/py/torch_tensorrt/dynamo/conversion/impl/slice/ops.py +++ b/py/torch_tensorrt/dynamo/conversion/impl/slice/ops.py @@ -332,7 +332,6 @@ def cumsum( input: TRTTensor, dim: int, ) -> TRTTensor: - input_shape = input.shape dim = get_positive_dim(dim, len(input_shape)) if input_shape[dim] < 0: diff --git a/py/torch_tensorrt/dynamo/conversion/impl/unary/ops.py b/py/torch_tensorrt/dynamo/conversion/impl/unary/ops.py index c900c51b8f..34b667acf1 100644 --- a/py/torch_tensorrt/dynamo/conversion/impl/unary/ops.py +++ b/py/torch_tensorrt/dynamo/conversion/impl/unary/ops.py @@ -478,7 +478,6 @@ def sign( name: str, input_val: TRTTensor, ) -> TRTTensor: - return convert_unary( ctx, target, source_ir, name, trt.UnaryOperation.SIGN, input_val ) diff --git a/py/torch_tensorrt/dynamo/runtime/_MutableTorchTensorRTModule.py b/py/torch_tensorrt/dynamo/runtime/_MutableTorchTensorRTModule.py index 134d84cf6d..a0e570e992 100644 --- a/py/torch_tensorrt/dynamo/runtime/_MutableTorchTensorRTModule.py +++ b/py/torch_tensorrt/dynamo/runtime/_MutableTorchTensorRTModule.py @@ -204,7 +204,6 @@ def __init__( self.init_finished = True def store_state_dict_metadata(self) -> None: - for k, v in self.original_model.state_dict().items(): self.state_dict_metadata[k] = v.shape @@ -400,7 +399,6 @@ def __call__(self, *args: Any, **kwargs: Any) -> Any: return self.forward(*args, **kwargs) def __getattr__(self, name: str) -> Any: - if name in self.__dict__: # this object has it return getattr(self, name) @@ -413,7 +411,6 @@ def __getattr__(self, name: str) -> Any: return getattr(self.pytorch_model, name) def __delattr__(self, name: str) -> Any: - if name in self.__dict__: # this object has it super().__delattr__(name) diff --git a/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py b/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py index f72d510a17..9086de657f 100644 --- a/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py +++ 
b/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py @@ -466,7 +466,6 @@ def forward(self, *inputs: torch.Tensor) -> torch.Tensor | Tuple[torch.Tensor, . outputs = self.create_output_tensors() for o, output_name in enumerate(self.output_names): - if need_cudagraphs_record: self._output_buffers[o] = outputs[o].clone() @@ -496,7 +495,6 @@ def forward(self, *inputs: torch.Tensor) -> torch.Tensor | Tuple[torch.Tensor, . self._engine_stream.wait_stream(self._caller_stream) with torch.cuda.stream(self._engine_stream): - if cudagraphs_enabled: if need_cudagraphs_record: self.cudagraph = torch.cuda.CUDAGraph() diff --git a/py/torch_tensorrt/dynamo/utils.py b/py/torch_tensorrt/dynamo/utils.py index 5d6807f33a..467811ef28 100644 --- a/py/torch_tensorrt/dynamo/utils.py +++ b/py/torch_tensorrt/dynamo/utils.py @@ -633,7 +633,6 @@ def check_output_equal( rtol: float = RTOL, atol: float = ATOL, ) -> bool: - if type(output1) != type(output2): logger.warning( "The output types are different. Check_output_equal will always return false." diff --git a/py/torch_tensorrt/logging.py b/py/torch_tensorrt/logging.py index 8447169cc2..0cba3bd510 100644 --- a/py/torch_tensorrt/logging.py +++ b/py/torch_tensorrt/logging.py @@ -1,17 +1,15 @@ import logging from typing import Any +import tensorrt as trt import torch from torch_tensorrt._features import ENABLED_FEATURES -import tensorrt as trt - logging.captureWarnings(True) _LOGGER = logging.getLogger("torch_tensorrt [TensorRT Conversion Context]") class _TRTLogger(trt.ILogger): # type: ignore[misc] - def __init__(self) -> None: trt.ILogger.__init__(self) diff --git a/py/torch_tensorrt/runtime/_utils.py b/py/torch_tensorrt/runtime/_utils.py index 90da7f69ad..c42a2b2a2b 100644 --- a/py/torch_tensorrt/runtime/_utils.py +++ b/py/torch_tensorrt/runtime/_utils.py @@ -145,7 +145,6 @@ def no_op_placeholder_for_execute_engine( serialized_metadata: str, serialized_target_platform: str, ) -> List[torch.Tensor]: - raise RuntimeError( "The saved model is cross compiled for windows in Linux, should only be loadded in Windows via torch_tensorrt.load_cross_compiled_exported_program() api." 
) diff --git a/setup.py b/setup.py index e426123e8b..17b3d33c75 100644 --- a/setup.py +++ b/setup.py @@ -285,7 +285,6 @@ def finalize_options(self): self.root_is_pure = False def run(self): - if not PY_ONLY: global CXX11_ABI build_libtorchtrt_pre_cxx11_abi( @@ -309,7 +308,6 @@ def finalize_options(self): self.root_is_pure = False def run(self): - if not PY_ONLY: global CXX11_ABI build_libtorchtrt_pre_cxx11_abi( diff --git a/tests/core/conversion/converters/test_instance_norm.cpp b/tests/core/conversion/converters/test_instance_norm.cpp index 2986d73cca..8f9904ef84 100644 --- a/tests/core/conversion/converters/test_instance_norm.cpp +++ b/tests/core/conversion/converters/test_instance_norm.cpp @@ -18,7 +18,7 @@ constexpr auto graph = R"IR( %running_mean.1 : Tensor?, %running_var.1 : Tensor?, %use_input_stats.1 : bool): - %cudnn_enabled.1 : bool = prim::Constant[value=0]() + %cudnn_enabled.1 : bool = prim::Constant[value=1]() %momentum.1 : float = prim::Constant[value=0.10000000000000001]() %eps.1 : float = prim::Constant[value=1.0000000000000001e-05]() %4 : Tensor = aten::instance_norm(%input.1, diff --git a/tests/py/dynamo/conversion/harness.py b/tests/py/dynamo/conversion/harness.py index 26818acd8a..9813548a10 100644 --- a/tests/py/dynamo/conversion/harness.py +++ b/tests/py/dynamo/conversion/harness.py @@ -500,7 +500,6 @@ def run_test_compare_tensor_attributes_only( enable_passes=False, immutable_weights=True, ): - # Previous instance of the interpreter auto-casted 64-bit inputs # We replicate this behavior here compilation_settings = CompilationSettings( diff --git a/tests/py/dynamo/conversion/test_resize_aten.py b/tests/py/dynamo/conversion/test_resize_aten.py index 8318035d86..2ca878479b 100644 --- a/tests/py/dynamo/conversion/test_resize_aten.py +++ b/tests/py/dynamo/conversion/test_resize_aten.py @@ -6,7 +6,6 @@ class TestResizeConverter(DispatchTestCase): - def compare_resized_tensors(self, tensor1, tensor2, input_shape, target_shape): # Check if the sizes match if tensor1.size() != tensor2.size(): diff --git a/tests/py/dynamo/conversion/test_sym_not_aten.py b/tests/py/dynamo/conversion/test_sym_not_aten.py index 3ba0889f9b..f6c1e4fa4f 100644 --- a/tests/py/dynamo/conversion/test_sym_not_aten.py +++ b/tests/py/dynamo/conversion/test_sym_not_aten.py @@ -7,7 +7,6 @@ class TestSymNotConverter(DispatchTestCase): - @parameterized.expand( [ (torch.tensor(True),), diff --git a/tests/py/dynamo/lowering/test_decompositions.py b/tests/py/dynamo/lowering/test_decompositions.py index 797d8d3263..95320b9996 100644 --- a/tests/py/dynamo/lowering/test_decompositions.py +++ b/tests/py/dynamo/lowering/test_decompositions.py @@ -1533,7 +1533,6 @@ def __init__(self): super().__init__() def forward(self, input): - return torch.ops.aten.scatter_reduce_.two( input, dim, index, src, reduce=reduce_op_str ) diff --git a/tests/py/dynamo/models/test_dtype_support.py b/tests/py/dynamo/models/test_dtype_support.py index b486784e52..146f7fdb7d 100644 --- a/tests/py/dynamo/models/test_dtype_support.py +++ b/tests/py/dynamo/models/test_dtype_support.py @@ -13,7 +13,6 @@ class Test64BitSupport(TestCase): - @unittest.skipIf( not torch_tensorrt.ENABLED_FEATURES.torch_tensorrt_runtime, "Torch-TensorRT Runtime is not available", diff --git a/tests/py/dynamo/models/test_engine_cache.py b/tests/py/dynamo/models/test_engine_cache.py index 68451674c5..36bf5edc95 100644 --- a/tests/py/dynamo/models/test_engine_cache.py +++ b/tests/py/dynamo/models/test_engine_cache.py @@ -57,7 +57,6 @@ def load(self, hash: str, prefix: 
str = "blob") -> Optional[bytes]: class TestHashFunction(TestCase): - def test_reexport_is_equal(self): pyt_model = models.resnet18(pretrained=True).eval().to("cuda") example_inputs = (torch.randn((100, 3, 224, 224)).to("cuda"),) @@ -177,7 +176,6 @@ def test_engine_settings_is_not_equal(self): class TestEngineCache(TestCase): - @pytest.mark.xfail def test_dynamo_compile_with_default_disk_engine_cache(self): model = models.resnet18(pretrained=True).eval().to("cuda") diff --git a/tests/py/dynamo/models/test_export_kwargs_serde.py b/tests/py/dynamo/models/test_export_kwargs_serde.py index aa4ea14cea..928d62e7ba 100644 --- a/tests/py/dynamo/models/test_export_kwargs_serde.py +++ b/tests/py/dynamo/models/test_export_kwargs_serde.py @@ -393,7 +393,6 @@ def forward(self, x, b=None, c=None, d=None, e=[]): @pytest.mark.unit def test_custom_model_with_dynamo_trace_kwarg_list_dynamic(): - class net(nn.Module): def __init__(self): super().__init__() diff --git a/tests/py/dynamo/models/test_model_refit.py b/tests/py/dynamo/models/test_model_refit.py index bb61ac2d43..a0b3292c29 100644 --- a/tests/py/dynamo/models/test_model_refit.py +++ b/tests/py/dynamo/models/test_model_refit.py @@ -32,7 +32,6 @@ ) @pytest.mark.unit def test_mapping(): - model = models.resnet18(pretrained=False).eval().to("cuda") model2 = models.resnet18(pretrained=True).eval().to("cuda") inputs = [torch.randn((1, 3, 224, 224)).to("cuda")] @@ -88,7 +87,6 @@ def test_mapping(): ) @pytest.mark.unit def test_refit_one_engine_with_weightmap(): - model = models.resnet18(pretrained=False).eval().to("cuda") model2 = models.resnet18(pretrained=True).eval().to("cuda") inputs = [torch.randn((1, 3, 224, 224)).to("cuda")] @@ -138,7 +136,6 @@ def test_refit_one_engine_with_weightmap(): ) @pytest.mark.unit def test_refit_one_engine_no_map_with_weightmap(): - model = models.resnet18(pretrained=False).eval().to("cuda") model2 = models.resnet18(pretrained=True).eval().to("cuda") inputs = [torch.randn((1, 3, 224, 224)).to("cuda")] @@ -189,7 +186,6 @@ def test_refit_one_engine_no_map_with_weightmap(): ) @pytest.mark.unit def test_refit_one_engine_with_wrong_weightmap(): - model = models.resnet18(pretrained=False).eval().to("cuda") model2 = models.resnet18(pretrained=True).eval().to("cuda") inputs = [torch.randn((1, 3, 224, 224)).to("cuda")] @@ -345,7 +341,6 @@ def test_refit_one_engine_inline_runtime__with_weightmap(): @pytest.mark.unit def test_refit_one_engine_python_runtime_with_weightmap(): - model = models.resnet18(pretrained=False).eval().to("cuda") model2 = models.resnet18(pretrained=True).eval().to("cuda") inputs = [torch.randn((1, 3, 224, 224)).to("cuda")] @@ -394,7 +389,6 @@ def test_refit_one_engine_python_runtime_with_weightmap(): ) @pytest.mark.unit def test_refit_multiple_engine_with_weightmap(): - class net(nn.Module): def __init__(self): super().__init__() @@ -466,7 +460,6 @@ def forward(self, x): ) @pytest.mark.unit def test_refit_one_engine_without_weightmap(): - model = models.resnet18(pretrained=True).eval().to("cuda") model2 = models.resnet18(pretrained=False).eval().to("cuda") inputs = [torch.randn((1, 3, 224, 224)).to("cuda")] @@ -616,7 +609,6 @@ def test_refit_one_engine_inline_runtime_without_weightmap(): @pytest.mark.unit def test_refit_one_engine_python_runtime_without_weightmap(): - model = models.resnet18(pretrained=True).eval().to("cuda") model2 = models.resnet18(pretrained=False).eval().to("cuda") inputs = [torch.randn((1, 3, 224, 224)).to("cuda")] @@ -665,7 +657,6 @@ def 
test_refit_one_engine_python_runtime_without_weightmap(): ) @pytest.mark.unit def test_refit_multiple_engine_without_weightmap(): - class net(nn.Module): def __init__(self): super().__init__() @@ -733,7 +724,6 @@ def forward(self, x): @pytest.mark.unit def test_refit_cumsum_fallback(): - class net(nn.Module): def __init__(self): super().__init__() diff --git a/tests/py/dynamo/runtime/test_001_streams.py b/tests/py/dynamo/runtime/test_001_streams.py index aaec9e3d41..e948107edf 100644 --- a/tests/py/dynamo/runtime/test_001_streams.py +++ b/tests/py/dynamo/runtime/test_001_streams.py @@ -12,7 +12,6 @@ class TestStreams(TestCase): - def test_non_default_stream_exec(self): class SampleModel(torch.nn.Module): def forward(self, x): diff --git a/tests/py/dynamo/runtime/test_002_lazy_engine_init.py b/tests/py/dynamo/runtime/test_002_lazy_engine_init.py index da0dce8f44..6a67bd4ea0 100644 --- a/tests/py/dynamo/runtime/test_002_lazy_engine_init.py +++ b/tests/py/dynamo/runtime/test_002_lazy_engine_init.py @@ -45,7 +45,6 @@ def assert_close(outputs, ref_outputs): class TestLazyEngineInit(TestCase): - def test_lazy_engine_init_py(self): class Test(torch.nn.Module): def forward(self, a, b): diff --git a/tests/py/dynamo/runtime/test_003_cross_compile_for_windows.py b/tests/py/dynamo/runtime/test_003_cross_compile_for_windows.py index acf2aa006f..867bf14bee 100644 --- a/tests/py/dynamo/runtime/test_003_cross_compile_for_windows.py +++ b/tests/py/dynamo/runtime/test_003_cross_compile_for_windows.py @@ -12,7 +12,6 @@ class TestCrossCompileSaveForWindows(TestCase): - @unittest.skipIf( platform.system() != "Linux" or platform.architecture()[0] != "64bit", "Cross compile for windows can only be enabled on linux x86-64 platform", diff --git a/tests/py/dynamo/runtime/test_004_weight_streaming.py b/tests/py/dynamo/runtime/test_004_weight_streaming.py index 10ff950823..78522388d1 100644 --- a/tests/py/dynamo/runtime/test_004_weight_streaming.py +++ b/tests/py/dynamo/runtime/test_004_weight_streaming.py @@ -31,7 +31,6 @@ def forward(self, x): class TestWeightStreamingPython(TestCase): - @parameterized.expand( [ ("python_runtime", True), diff --git a/tests/py/dynamo/runtime/test_mutable_torchtrt_module.py b/tests/py/dynamo/runtime/test_mutable_torchtrt_module.py index f2bcaf7ede..ab1137e2b3 100644 --- a/tests/py/dynamo/runtime/test_mutable_torchtrt_module.py +++ b/tests/py/dynamo/runtime/test_mutable_torchtrt_module.py @@ -42,7 +42,6 @@ def test_check_output_equal(): ) @pytest.mark.unit def test_resnet18(): - torch.manual_seed(0) inputs = [torch.rand((1, 3, 224, 224)).to("cuda")] @@ -79,7 +78,6 @@ def test_resnet18(): ) @pytest.mark.unit def test_save(): - torch.manual_seed(0) inputs = [torch.rand((1, 3, 224, 224)).to("cuda")] @@ -116,7 +114,6 @@ def test_save(): ) @pytest.mark.unit def test_resnet18_modify_attribute(): - torch.manual_seed(0) inputs = [torch.rand((1, 3, 224, 224)).to("cuda")] @@ -157,7 +154,6 @@ def test_resnet18_modify_attribute(): ) @pytest.mark.unit def test_resnet18_modify_attribute_no_refit(): - torch.manual_seed(0) inputs = [torch.rand((1, 3, 224, 224)).to("cuda")] From a57f267c98aa8d57a3bbcfd14eb9466cebe9d68d Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Wed, 29 Jan 2025 16:27:03 -0800 Subject: [PATCH 2/5] chore: address flaky test failures related to global partitioning (#3369) --- .../torch_compile_advanced_usage.py | 3 +- .../torch_export_gpt2.py | 3 +- .../torch_export_cudagraphs.py | 3 +- .../dynamo_compile_resnet_example.py | 3 +- .../torch_export_llama2.py | 3 +- 
.../torch_compile_resnet_example.py | 3 +- .../torch_compile_transformers_example.py | 3 +- .../dynamo_compile_advanced_usage.py | 3 +- .../dynamo_compile_transformers_example.py | 3 +- .../dynamo_compile_resnet_example.py | 3 +- .../dynamo_compile_advanced_usage.py | 3 +- .../dynamo_compile_transformers_example.py | 3 +- .../dynamo/torch_compile_advanced_usage.py | 3 +- .../dynamo/torch_compile_resnet_example.py | 3 +- .../torch_compile_transformers_example.py | 3 +- examples/dynamo/torch_export_cudagraphs.py | 3 +- examples/dynamo/torch_export_gpt2.py | 3 +- examples/dynamo/torch_export_llama2.py | 3 +- py/torch_tensorrt/_Input.py | 2 +- py/torch_tensorrt/_enums.py | 2 +- .../dynamo/conversion/_TRTBuilderMonitor.py | 6 +- .../dynamo/conversion/impl/activation/ops.py | 4 +- py/torch_tensorrt/dynamo/utils.py | 2 +- .../fx/test/converters/acc_op/test_where.py | 2 +- .../fx/tracer/acc_tracer/acc_tracer.py | 5 +- .../test_flaky_global_partitioning.py | 108 ++++++++++++++++++ .../partitioning/test_global_partitioning.py | 83 -------------- 27 files changed, 155 insertions(+), 113 deletions(-) create mode 100644 tests/py/dynamo/partitioning/test_flaky_global_partitioning.py diff --git a/docs/_downloads/0e30a6276601af7e5fc4d5166e2e3d37/torch_compile_advanced_usage.py b/docs/_downloads/0e30a6276601af7e5fc4d5166e2e3d37/torch_compile_advanced_usage.py index 8ebedab111..af7d4b212d 100644 --- a/docs/_downloads/0e30a6276601af7e5fc4d5166e2e3d37/torch_compile_advanced_usage.py +++ b/docs/_downloads/0e30a6276601af7e5fc4d5166e2e3d37/torch_compile_advanced_usage.py @@ -4,7 +4,8 @@ Torch Compile Advanced Usage ====================================================== -This interactive script is intended as an overview of the process by which `torch_tensorrt.compile(..., ir="torch_compile", ...)` works, and how it integrates with the `torch.compile` API.""" +This interactive script is intended as an overview of the process by which `torch_tensorrt.compile(..., ir="torch_compile", ...)` works, and how it integrates with the `torch.compile` API. +""" # %% # Imports and Model Definition diff --git a/docs/_downloads/2a9ac10f2667047a7f398d1593b7ca33/torch_export_gpt2.py b/docs/_downloads/2a9ac10f2667047a7f398d1593b7ca33/torch_export_gpt2.py index cea0f3adf2..4d34c58de4 100644 --- a/docs/_downloads/2a9ac10f2667047a7f398d1593b7ca33/torch_export_gpt2.py +++ b/docs/_downloads/2a9ac10f2667047a7f398d1593b7ca33/torch_export_gpt2.py @@ -4,7 +4,8 @@ Compiling GPT2 using the dynamo backend ========================================================== -This script illustrates Torch-TensorRT workflow with dynamo backend on popular GPT2 model.""" +This script illustrates Torch-TensorRT workflow with dynamo backend on popular GPT2 model. +""" # %% # Imports and Model Definition diff --git a/docs/_downloads/3d4d74f6636d986f33167154f6553961/torch_export_cudagraphs.py b/docs/_downloads/3d4d74f6636d986f33167154f6553961/torch_export_cudagraphs.py index 1671c7783d..fb31766b7c 100644 --- a/docs/_downloads/3d4d74f6636d986f33167154f6553961/torch_export_cudagraphs.py +++ b/docs/_downloads/3d4d74f6636d986f33167154f6553961/torch_export_cudagraphs.py @@ -4,7 +4,8 @@ Torch Export with Cudagraphs ====================================================== -This interactive script is intended as an overview of the process by which the Torch-TensorRT Cudagraphs integration can be used in the `ir="dynamo"` path. 
The functionality works similarly in the `torch.compile` path as well.""" +This interactive script is intended as an overview of the process by which the Torch-TensorRT Cudagraphs integration can be used in the `ir="dynamo"` path. The functionality works similarly in the `torch.compile` path as well. +""" # %% # Imports and Model Definition diff --git a/docs/_downloads/418941399c146271a7b7728ba3059960/dynamo_compile_resnet_example.py b/docs/_downloads/418941399c146271a7b7728ba3059960/dynamo_compile_resnet_example.py index 797e41f5fd..5826e28d1e 100644 --- a/docs/_downloads/418941399c146271a7b7728ba3059960/dynamo_compile_resnet_example.py +++ b/docs/_downloads/418941399c146271a7b7728ba3059960/dynamo_compile_resnet_example.py @@ -4,7 +4,8 @@ Compiling ResNet using the Torch-TensorRT Dyanmo Frontend ========================================================== -This interactive script is intended as a sample of the `torch_tensorrt.dynamo.compile` workflow on a ResNet model.""" +This interactive script is intended as a sample of the `torch_tensorrt.dynamo.compile` workflow on a ResNet model. +""" # %% # Imports and Model Definition diff --git a/docs/_downloads/7b7004dc2ea6f839be532665e16e0426/torch_export_llama2.py b/docs/_downloads/7b7004dc2ea6f839be532665e16e0426/torch_export_llama2.py index 5cfd1ed61c..2f3e3cba43 100644 --- a/docs/_downloads/7b7004dc2ea6f839be532665e16e0426/torch_export_llama2.py +++ b/docs/_downloads/7b7004dc2ea6f839be532665e16e0426/torch_export_llama2.py @@ -4,7 +4,8 @@ Compiling Llama2 using the dynamo backend ========================================================== -This script illustrates Torch-TensorRT workflow with dynamo backend on popular Llama2 model.""" +This script illustrates Torch-TensorRT workflow with dynamo backend on popular Llama2 model. +""" # %% # Imports and Model Definition diff --git a/docs/_downloads/d6e1bb6ec5f884994554d9d12e37a0f6/torch_compile_resnet_example.py b/docs/_downloads/d6e1bb6ec5f884994554d9d12e37a0f6/torch_compile_resnet_example.py index f852d60158..fb75986099 100644 --- a/docs/_downloads/d6e1bb6ec5f884994554d9d12e37a0f6/torch_compile_resnet_example.py +++ b/docs/_downloads/d6e1bb6ec5f884994554d9d12e37a0f6/torch_compile_resnet_example.py @@ -4,7 +4,8 @@ Compiling ResNet with dynamic shapes using the `torch.compile` backend ========================================================== -This interactive script is intended as a sample of the Torch-TensorRT workflow with `torch.compile` on a ResNet model.""" +This interactive script is intended as a sample of the Torch-TensorRT workflow with `torch.compile` on a ResNet model. +""" # %% # Imports and Model Definition diff --git a/docs/_downloads/dfa60e8f9850fd7761f3e7da81304d32/torch_compile_transformers_example.py b/docs/_downloads/dfa60e8f9850fd7761f3e7da81304d32/torch_compile_transformers_example.py index 221ecd4fd1..17cf46e8a3 100644 --- a/docs/_downloads/dfa60e8f9850fd7761f3e7da81304d32/torch_compile_transformers_example.py +++ b/docs/_downloads/dfa60e8f9850fd7761f3e7da81304d32/torch_compile_transformers_example.py @@ -4,7 +4,8 @@ Compiling BERT using the `torch.compile` backend ============================================================== -This interactive script is intended as a sample of the Torch-TensorRT workflow with `torch.compile` on a BERT model.""" +This interactive script is intended as a sample of the Torch-TensorRT workflow with `torch.compile` on a BERT model. 
+""" # %% # Imports and Model Definition diff --git a/docs/_downloads/e1ef5a42560a98a132f56a79d0b66f79/dynamo_compile_advanced_usage.py b/docs/_downloads/e1ef5a42560a98a132f56a79d0b66f79/dynamo_compile_advanced_usage.py index f73bd1e780..3fb63e8a32 100644 --- a/docs/_downloads/e1ef5a42560a98a132f56a79d0b66f79/dynamo_compile_advanced_usage.py +++ b/docs/_downloads/e1ef5a42560a98a132f56a79d0b66f79/dynamo_compile_advanced_usage.py @@ -4,7 +4,8 @@ Dynamo Compile Advanced Usage ====================================================== -This interactive script is intended as an overview of the process by which `torch_tensorrt.dynamo.compile` works, and how it integrates with the new `torch.compile` API.""" +This interactive script is intended as an overview of the process by which `torch_tensorrt.dynamo.compile` works, and how it integrates with the new `torch.compile` API. +""" # %% # Imports and Model Definition diff --git a/docs/_downloads/e550c5f53cc43e11aa6da8cfb79b54df/dynamo_compile_transformers_example.py b/docs/_downloads/e550c5f53cc43e11aa6da8cfb79b54df/dynamo_compile_transformers_example.py index dd7fe2e07a..59319078a4 100644 --- a/docs/_downloads/e550c5f53cc43e11aa6da8cfb79b54df/dynamo_compile_transformers_example.py +++ b/docs/_downloads/e550c5f53cc43e11aa6da8cfb79b54df/dynamo_compile_transformers_example.py @@ -4,7 +4,8 @@ Compiling a Transformer using torch.compile and TensorRT ============================================================== -This interactive script is intended as a sample of the `torch_tensorrt.dynamo.compile` workflow on a transformer-based model.""" +This interactive script is intended as a sample of the `torch_tensorrt.dynamo.compile` workflow on a transformer-based model. +""" # %% # Imports and Model Definition diff --git a/docs/v1.4.0/_downloads/418941399c146271a7b7728ba3059960/dynamo_compile_resnet_example.py b/docs/v1.4.0/_downloads/418941399c146271a7b7728ba3059960/dynamo_compile_resnet_example.py index 797e41f5fd..5826e28d1e 100644 --- a/docs/v1.4.0/_downloads/418941399c146271a7b7728ba3059960/dynamo_compile_resnet_example.py +++ b/docs/v1.4.0/_downloads/418941399c146271a7b7728ba3059960/dynamo_compile_resnet_example.py @@ -4,7 +4,8 @@ Compiling ResNet using the Torch-TensorRT Dyanmo Frontend ========================================================== -This interactive script is intended as a sample of the `torch_tensorrt.dynamo.compile` workflow on a ResNet model.""" +This interactive script is intended as a sample of the `torch_tensorrt.dynamo.compile` workflow on a ResNet model. +""" # %% # Imports and Model Definition diff --git a/docs/v1.4.0/_downloads/e1ef5a42560a98a132f56a79d0b66f79/dynamo_compile_advanced_usage.py b/docs/v1.4.0/_downloads/e1ef5a42560a98a132f56a79d0b66f79/dynamo_compile_advanced_usage.py index f73bd1e780..3fb63e8a32 100644 --- a/docs/v1.4.0/_downloads/e1ef5a42560a98a132f56a79d0b66f79/dynamo_compile_advanced_usage.py +++ b/docs/v1.4.0/_downloads/e1ef5a42560a98a132f56a79d0b66f79/dynamo_compile_advanced_usage.py @@ -4,7 +4,8 @@ Dynamo Compile Advanced Usage ====================================================== -This interactive script is intended as an overview of the process by which `torch_tensorrt.dynamo.compile` works, and how it integrates with the new `torch.compile` API.""" +This interactive script is intended as an overview of the process by which `torch_tensorrt.dynamo.compile` works, and how it integrates with the new `torch.compile` API. 
+""" # %% # Imports and Model Definition diff --git a/docs/v1.4.0/_downloads/e550c5f53cc43e11aa6da8cfb79b54df/dynamo_compile_transformers_example.py b/docs/v1.4.0/_downloads/e550c5f53cc43e11aa6da8cfb79b54df/dynamo_compile_transformers_example.py index dd7fe2e07a..59319078a4 100644 --- a/docs/v1.4.0/_downloads/e550c5f53cc43e11aa6da8cfb79b54df/dynamo_compile_transformers_example.py +++ b/docs/v1.4.0/_downloads/e550c5f53cc43e11aa6da8cfb79b54df/dynamo_compile_transformers_example.py @@ -4,7 +4,8 @@ Compiling a Transformer using torch.compile and TensorRT ============================================================== -This interactive script is intended as a sample of the `torch_tensorrt.dynamo.compile` workflow on a transformer-based model.""" +This interactive script is intended as a sample of the `torch_tensorrt.dynamo.compile` workflow on a transformer-based model. +""" # %% # Imports and Model Definition diff --git a/examples/dynamo/torch_compile_advanced_usage.py b/examples/dynamo/torch_compile_advanced_usage.py index 8ebedab111..af7d4b212d 100644 --- a/examples/dynamo/torch_compile_advanced_usage.py +++ b/examples/dynamo/torch_compile_advanced_usage.py @@ -4,7 +4,8 @@ Torch Compile Advanced Usage ====================================================== -This interactive script is intended as an overview of the process by which `torch_tensorrt.compile(..., ir="torch_compile", ...)` works, and how it integrates with the `torch.compile` API.""" +This interactive script is intended as an overview of the process by which `torch_tensorrt.compile(..., ir="torch_compile", ...)` works, and how it integrates with the `torch.compile` API. +""" # %% # Imports and Model Definition diff --git a/examples/dynamo/torch_compile_resnet_example.py b/examples/dynamo/torch_compile_resnet_example.py index f852d60158..fb75986099 100644 --- a/examples/dynamo/torch_compile_resnet_example.py +++ b/examples/dynamo/torch_compile_resnet_example.py @@ -4,7 +4,8 @@ Compiling ResNet with dynamic shapes using the `torch.compile` backend ========================================================== -This interactive script is intended as a sample of the Torch-TensorRT workflow with `torch.compile` on a ResNet model.""" +This interactive script is intended as a sample of the Torch-TensorRT workflow with `torch.compile` on a ResNet model. +""" # %% # Imports and Model Definition diff --git a/examples/dynamo/torch_compile_transformers_example.py b/examples/dynamo/torch_compile_transformers_example.py index 221ecd4fd1..17cf46e8a3 100644 --- a/examples/dynamo/torch_compile_transformers_example.py +++ b/examples/dynamo/torch_compile_transformers_example.py @@ -4,7 +4,8 @@ Compiling BERT using the `torch.compile` backend ============================================================== -This interactive script is intended as a sample of the Torch-TensorRT workflow with `torch.compile` on a BERT model.""" +This interactive script is intended as a sample of the Torch-TensorRT workflow with `torch.compile` on a BERT model. 
+""" # %% # Imports and Model Definition diff --git a/examples/dynamo/torch_export_cudagraphs.py b/examples/dynamo/torch_export_cudagraphs.py index 1671c7783d..fb31766b7c 100644 --- a/examples/dynamo/torch_export_cudagraphs.py +++ b/examples/dynamo/torch_export_cudagraphs.py @@ -4,7 +4,8 @@ Torch Export with Cudagraphs ====================================================== -This interactive script is intended as an overview of the process by which the Torch-TensorRT Cudagraphs integration can be used in the `ir="dynamo"` path. The functionality works similarly in the `torch.compile` path as well.""" +This interactive script is intended as an overview of the process by which the Torch-TensorRT Cudagraphs integration can be used in the `ir="dynamo"` path. The functionality works similarly in the `torch.compile` path as well. +""" # %% # Imports and Model Definition diff --git a/examples/dynamo/torch_export_gpt2.py b/examples/dynamo/torch_export_gpt2.py index cea0f3adf2..4d34c58de4 100644 --- a/examples/dynamo/torch_export_gpt2.py +++ b/examples/dynamo/torch_export_gpt2.py @@ -4,7 +4,8 @@ Compiling GPT2 using the dynamo backend ========================================================== -This script illustrates Torch-TensorRT workflow with dynamo backend on popular GPT2 model.""" +This script illustrates Torch-TensorRT workflow with dynamo backend on popular GPT2 model. +""" # %% # Imports and Model Definition diff --git a/examples/dynamo/torch_export_llama2.py b/examples/dynamo/torch_export_llama2.py index 5cfd1ed61c..2f3e3cba43 100644 --- a/examples/dynamo/torch_export_llama2.py +++ b/examples/dynamo/torch_export_llama2.py @@ -4,7 +4,8 @@ Compiling Llama2 using the dynamo backend ========================================================== -This script illustrates Torch-TensorRT workflow with dynamo backend on popular Llama2 model.""" +This script illustrates Torch-TensorRT workflow with dynamo backend on popular Llama2 model. +""" # %% # Imports and Model Definition diff --git a/py/torch_tensorrt/_Input.py b/py/torch_tensorrt/_Input.py index 126219ee8a..2f953094ca 100644 --- a/py/torch_tensorrt/_Input.py +++ b/py/torch_tensorrt/_Input.py @@ -261,7 +261,7 @@ def _supported_input_size_type(input_size: Any) -> bool: @staticmethod def _parse_tensor_domain( - domain: Optional[Tuple[float, float]] + domain: Optional[Tuple[float, float]], ) -> Tuple[float, float]: """ Produce a tuple of integers which specifies a tensor domain in the interval format: [lo, hi) diff --git a/py/torch_tensorrt/_enums.py b/py/torch_tensorrt/_enums.py index eaefb68ce5..c706c345d6 100644 --- a/py/torch_tensorrt/_enums.py +++ b/py/torch_tensorrt/_enums.py @@ -1200,7 +1200,7 @@ def _from( @classmethod def try_from( - c: Union[trt.EngineCapability, EngineCapability] + c: Union[trt.EngineCapability, EngineCapability], ) -> Optional[EngineCapability]: """Create a Torch-TensorRT engine capability enum from a TensorRT engine capability enum. 
diff --git a/py/torch_tensorrt/dynamo/conversion/_TRTBuilderMonitor.py b/py/torch_tensorrt/dynamo/conversion/_TRTBuilderMonitor.py index 9a1189e44a..9b2755f4c7 100644 --- a/py/torch_tensorrt/dynamo/conversion/_TRTBuilderMonitor.py +++ b/py/torch_tensorrt/dynamo/conversion/_TRTBuilderMonitor.py @@ -53,13 +53,13 @@ def _redraw(self, *, blank_lines: int = 0) -> None: if self._render: def clear_line() -> None: - print("\x1B[2K", end="") + print("\x1b[2K", end="") def move_to_start_of_line() -> None: - print("\x1B[0G", end="") + print("\x1b[0G", end="") def move_cursor_up(lines: int) -> None: - print("\x1B[{}A".format(lines), end="") + print("\x1b[{}A".format(lines), end="") def progress_bar(steps: int, num_steps: int) -> str: INNER_WIDTH = 10 diff --git a/py/torch_tensorrt/dynamo/conversion/impl/activation/ops.py b/py/torch_tensorrt/dynamo/conversion/impl/activation/ops.py index a563118526..eb981f2031 100644 --- a/py/torch_tensorrt/dynamo/conversion/impl/activation/ops.py +++ b/py/torch_tensorrt/dynamo/conversion/impl/activation/ops.py @@ -247,7 +247,7 @@ def hard_sigmoid( operation_type = trt.ActivationType.HARD_SIGMOID def hard_sigmoid_dyn_range_fn( - dyn_range: Tuple[float, float] + dyn_range: Tuple[float, float], ) -> Tuple[float, float]: def hard_sigmoid_fn(x: float) -> float: return max(0, min(1, alpha * x + beta)) @@ -310,7 +310,7 @@ def thresholded_relu( operation_type = trt.ActivationType.THRESHOLDED_RELU def thresholded_relu_dyn_range_fn( - dyn_range: Tuple[float, float] + dyn_range: Tuple[float, float], ) -> Tuple[float, float]: def thresholded_relu_fn(x: float) -> float: return x if x > alpha else 0 diff --git a/py/torch_tensorrt/dynamo/utils.py b/py/torch_tensorrt/dynamo/utils.py index 467811ef28..2d3cb2924d 100644 --- a/py/torch_tensorrt/dynamo/utils.py +++ b/py/torch_tensorrt/dynamo/utils.py @@ -465,7 +465,7 @@ def to_torch_device(device: Optional[Union[Device, torch.device, str]]) -> torch def to_torch_tensorrt_device( - device: Optional[Union[Device, torch.device, str]] + device: Optional[Union[Device, torch.device, str]], ) -> Device: """Cast a device-type to torch_tensorrt.Device diff --git a/py/torch_tensorrt/fx/test/converters/acc_op/test_where.py b/py/torch_tensorrt/fx/test/converters/acc_op/test_where.py index 72fea70265..1e14b50305 100644 --- a/py/torch_tensorrt/fx/test/converters/acc_op/test_where.py +++ b/py/torch_tensorrt/fx/test/converters/acc_op/test_where.py @@ -101,7 +101,7 @@ def __init__(self, x_shape, y_shape): def forward(self, condition): return torch.where(condition, self.x, self.y) - inputs = [(torch.randn(condition_shape) > 0)] + inputs = [torch.randn(condition_shape) > 0] self.run_test( Where(x_shape, y_shape), inputs, diff --git a/py/torch_tensorrt/fx/tracer/acc_tracer/acc_tracer.py b/py/torch_tensorrt/fx/tracer/acc_tracer/acc_tracer.py index 9d5576bd63..c8db1b62ef 100644 --- a/py/torch_tensorrt/fx/tracer/acc_tracer/acc_tracer.py +++ b/py/torch_tensorrt/fx/tracer/acc_tracer/acc_tracer.py @@ -10,7 +10,6 @@ from typing import ( Any, Callable, - cast, Dict, Iterable, Optional, @@ -19,6 +18,7 @@ Tuple, Type, Union, + cast, ) import torch @@ -32,7 +32,6 @@ from . 
import acc_normalizer, acc_ops, acc_shape_prop, acc_utils # noqa: F401 - _LOGGER = logging.getLogger(__name__) @@ -517,7 +516,7 @@ def _replace_transpose_last_dims_impl( changed = False def _calculate_dim( - transpose_dim: Union[torch.fx.Node, int] + transpose_dim: Union[torch.fx.Node, int], ) -> Union[torch.fx.Node, int]: nonlocal transpose_input_node nonlocal changed diff --git a/tests/py/dynamo/partitioning/test_flaky_global_partitioning.py b/tests/py/dynamo/partitioning/test_flaky_global_partitioning.py new file mode 100644 index 0000000000..2e2013d5e6 --- /dev/null +++ b/tests/py/dynamo/partitioning/test_flaky_global_partitioning.py @@ -0,0 +1,108 @@ +from copy import deepcopy + +import numpy as np +import pytest +import torch +import torch.nn.functional as F +import torch_tensorrt +from parameterized import parameterized +from torch.testing._internal.common_utils import TestCase, run_tests +from torch_tensorrt.dynamo import partitioning + +from ..testing_utilities import lower_graph_testing + +# Note: these tests were originally part of test_global_partitioning.py but were flaky +# when run with the rest of that file: the partitioned graphs differed depending on +# whether the tests ran alongside the full file or on their own. pytest does not use +# parallel execution by default, so the cause of this interaction is currently unclear. +# When run independently, the tests produce a structurally correct partition, similar +# to the one produced by fast partitioning, so they were moved into this separate file. + + +class TestGlobalPartitioning(TestCase): +    def test_partition_partially_supported_multi_op(self): +        class PartiallySupportedMultiOp(torch.nn.Module): +            def __init__(self, *args, **kwargs) -> None: +                super().__init__(*args, **kwargs) + +            def forward(self, x, y): +                sum_1 = torch.ops.aten.add.Tensor(x, y) +                sum_2 = torch.ops.aten.add.Tensor(x, sum_1) +                sum_ = np.sum(sum_1) + np.sum(sum_2) +                relu_ = torch.ops.aten.relu.default(sum_) +                pow_ = torch.ops.aten.pow.Tensor_Scalar(relu_, 2) +                return pow_ + +        fx_graph = torch.fx.symbolic_trace(PartiallySupportedMultiOp()) +        partitioned_graph, _ = partitioning.global_partition( +            deepcopy(fx_graph), min_block_size=2 +        ) +        self.assertEqual( +            len(list(partitioned_graph.named_children())), +            2, +            "Unsupported operators interleave supported ones, expected 2 segments", +        ) + +    def test_partition_partially_supported_with_torch_executed_ops(self): +        class PartiallySupportedMultiOp(torch.nn.Module): +            def __init__(self, *args, **kwargs) -> None: +                super().__init__(*args, **kwargs) + +            def forward(self, x, y): +                sum_1 = torch.ops.aten.add.Tensor(x, y) +                sum_2 = torch.ops.aten.add.Tensor(x, sum_1) +                sum_ = torch.ops.aten.add.Tensor(sum_1, sum_2) +                relu_ = torch.ops.aten.relu.default(sum_) +                pow_ = torch.ops.aten.pow.Tensor_Scalar(relu_, 2) +                return pow_ + +        unexpected_ops = {torch.ops.aten.add.Tensor} + +        inputs = [ +            torch.randint( +                1, +                10, +                (5,), +            ), +            torch.randint( +                1, +                10, +                (5,), +            ), +        ] + +        fx_graph = torch.fx.symbolic_trace(PartiallySupportedMultiOp()) +        ( +            unexpected_ops_seen, +            _, +            partitioned_graphs, +        ) = lower_graph_testing( +            fx_graph, +            inputs, +            unexpected_ops=unexpected_ops, +            min_block_size=2, +            torch_executed_ops={"torch.ops.aten.add.Tensor"}, +            testing_partitioning=True, +            use_fast_partitioner=False, +        ) + +        self.assertEqual( +            len(unexpected_ops_seen), +            0, +            f"The following unexpected ops were encountered: {unexpected_ops_seen}", +        ) + +        
self.assertEqual( + len(partitioned_graphs), + 1, + "Without control flow breaks, there should only be a single graph", + ) + self.assertEqual( + len(list(partitioned_graphs[0].named_children())), + 1, + "Certain operators are set to run in Torch, expected 1 segment", + ) + + +if __name__ == "__main__": + run_tests() diff --git a/tests/py/dynamo/partitioning/test_global_partitioning.py b/tests/py/dynamo/partitioning/test_global_partitioning.py index 80b6716d20..887fa35659 100644 --- a/tests/py/dynamo/partitioning/test_global_partitioning.py +++ b/tests/py/dynamo/partitioning/test_global_partitioning.py @@ -117,89 +117,6 @@ def forward(self, x, y): "All operators are supported, there should be one segment", ) - def test_partition_partially_supported_multi_op(self): - class PartiallySupportedMultiOp(torch.nn.Module): - def __init__(self, *args, **kwargs) -> None: - super().__init__(*args, **kwargs) - - def forward(self, x, y): - sum_1 = torch.ops.aten.add.Tensor(x, y) - sum_2 = torch.ops.aten.add.Tensor(x, sum_1) - sum_ = np.sum(sum_1) + np.sum(sum_2) - relu_ = torch.ops.aten.relu.default(sum_) - pow_ = torch.ops.aten.pow.Tensor_Scalar(relu_, 2) - return pow_ - - fx_graph = torch.fx.symbolic_trace(PartiallySupportedMultiOp()) - partitioned_graph, _ = partitioning.global_partition( - deepcopy(fx_graph), min_block_size=2 - ) - self.assertEqual( - len(list(partitioned_graph.named_children())), - 2, - "Unsupported operators interleave supported ones, expected 2 segments", - ) - - def test_partition_partially_supported_with_torch_executed_ops(self): - class PartiallySupportedMultiOp(torch.nn.Module): - def __init__(self, *args, **kwargs) -> None: - super().__init__(*args, **kwargs) - - def forward(self, x, y): - sum_1 = torch.ops.aten.add.Tensor(x, y) - sum_2 = torch.ops.aten.add.Tensor(x, sum_1) - sum_ = torch.ops.aten.add.Tensor(sum_1, sum_2) - relu_ = torch.ops.aten.relu.default(sum_) - pow_ = torch.ops.aten.pow.Tensor_Scalar(relu_, 2) - return pow_ - - unexpected_ops = {torch.ops.aten.add.Tensor} - - inputs = [ - torch.randint( - 1, - 10, - (5,), - ), - torch.randint( - 1, - 10, - (5,), - ), - ] - - fx_graph = torch.fx.symbolic_trace(PartiallySupportedMultiOp()) - ( - unexpected_ops_seen, - _, - partitioned_graphs, - ) = lower_graph_testing( - fx_graph, - inputs, - unexpected_ops=unexpected_ops, - min_block_size=2, - torch_executed_ops={"torch.ops.aten.add.Tensor"}, - testing_partitioning=True, - use_fast_partitioner=False, - ) - - self.assertEqual( - len(unexpected_ops_seen), - 0, - f"The following unexpected ops were encountered: {unexpected_ops_seen}", - ) - - self.assertEqual( - len(partitioned_graphs), - 1, - "Without control flow breaks, there should only be a single graph", - ) - self.assertEqual( - len(list(partitioned_graphs[0].named_children())), - 1, - "Certain operators are set to run in Torch, expected 1 segment", - ) - if __name__ == "__main__": run_tests() From ed6ef65b51ab6922788ba9e906b80101e8441aed Mon Sep 17 00:00:00 2001 From: "Zewen (Evan) Li" Date: Sat, 18 Jan 2025 08:59:51 +0800 Subject: [PATCH 3/5] fix: CI docker build error for release 2.6 (#3360) --- .github/workflows/docker_builder.yml | 2 +- docker/README.md | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/docker_builder.yml b/.github/workflows/docker_builder.yml index a978d82b6a..4aa228db95 100644 --- a/.github/workflows/docker_builder.yml +++ b/.github/workflows/docker_builder.yml @@ -54,7 +54,7 @@ jobs: TRT_VERSION=$(python3 -c "import versions; 
versions.tensorrt_version()") echo "TRT VERSION = ${TRT_VERSION}" - DOCKER_BUILDKIT=1 docker build --build-arg TENSORRT_VERSION=$TRT_VERSION -f docker/Dockerfile --tag $DOCKER_TAG . + DOCKER_BUILDKIT=1 docker build --build-arg TENSORRT_VERSION=$TRT_VERSION --build-arg USE_CXX11_ABI=1 -f docker/Dockerfile --tag $DOCKER_TAG . - name: Push Docker image env: diff --git a/docker/README.md b/docker/README.md index 7435973b1a..85be0d5791 100644 --- a/docker/README.md +++ b/docker/README.md @@ -1,13 +1,13 @@ # Building a Torch-TensorRT container -* Use `Dockerfile` to build a container which provides the exact development environment that our master branch is usually tested against. +* Use `Dockerfile` to build a container which provides the exact development environment that our main branch is usually tested against. * The `Dockerfile` currently uses Bazelisk to select the Bazel version, and uses the exact library versions of Torch and CUDA listed in dependencies. * The desired versions of TensorRT must be specified as build-args, with major and minor versions as in: `--build-arg TENSORRT_VERSION=a.b` - * [**Optional**] The desired base image be changed by explicitly setting a base image, as in `--build-arg BASE_IMG=nvidia/cuda:11.8.0-devel-ubuntu22.04`, though this is optional + * [**Optional**] The desired base image can be changed by explicitly setting a base image, as in `--build-arg BASE_IMG=nvidia/cuda:11.8.0-devel-ubuntu22.04`, though this is optional. * [**Optional**] Additionally, the desired Python version can be changed by explicitly setting a version, as in `--build-arg PYTHON_VERSION=3.10`, though this is optional as well. -* This `Dockerfile` installs `pre-cxx11-abi` versions of Pytorch and builds Torch-TRT using `pre-cxx11-abi` libtorch as well. +* This `Dockerfile` installs `pre-cxx11-abi` versions of PyTorch and builds Torch-TRT using `pre-cxx11-abi` libtorch as well. Update on 1/17/2025: In torch 2.6, `PRE_CXX11_ABI` is required for CUDA 11.8 and 12.4, while `USE_CXX11_ABI` is required for CUDA 12.6. As of torch 2.7, torch requires `USE_CXX11_ABI` for all of CUDA 11.8, 12.4, and 12.6. Note: By default the container uses the `pre-cxx11-abi` version of Torch + Torch-TRT. If you are using a workflow that requires a build of PyTorch on the CXX11 ABI (e.g. using the PyTorch NGC containers as a base image), add the Docker build argument: `--build-arg USE_CXX11_ABI=1` @@ -24,7 +24,7 @@ Note: By default the container uses the `pre-cxx11-abi` version of Torch + Torch Build: ``` -DOCKER_BUILDKIT=1 docker build --build-arg TENSORRT_VERSION=10.6.0 -f docker/Dockerfile -t torch_tensorrt:latest . +DOCKER_BUILDKIT=1 docker build --build-arg TENSORRT_VERSION=10.6.0 --build-arg USE_CXX11_ABI=1 -f docker/Dockerfile -t torch_tensorrt:latest . 
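+# USE_CXX11_ABI=1 selects the CXX11 ABI libtorch build; per the ABI note above it is
+# required for torch 2.6 with CUDA 12.6 and, as of torch 2.7, for CUDA 11.8 and 12.4 as well.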
``` Run: From 627b0cfff91e1c19737d3a96e69f8c75fc2b6d7c Mon Sep 17 00:00:00 2001 From: "Zewen (Evan) Li" Date: Fri, 17 Jan 2025 04:46:49 +0800 Subject: [PATCH 4/5] fix: CI errors on release 2.6 (#3358) --- .github/workflows/assigner.yml | 2 +- .github/workflows/build-tensorrt-linux.yml | 6 ++--- .github/workflows/build-tensorrt-windows.yml | 4 +-- .github/workflows/build-test-linux.yml | 26 ++++++++++++++----- .../workflows/build-test-tensorrt-linux.yml | 26 ++++++++++++++----- .../workflows/build-test-tensorrt-windows.yml | 26 ++++++++++++++----- .github/workflows/build-test-windows.yml | 24 ++++++++++++----- .github/workflows/docker_builder.yml | 2 +- .github/workflows/linter.yml | 4 +-- .github/workflows/linux-test.yml | 6 ++--- .github/workflows/nightlies.yml | 2 +- .github/workflows/release-linux.yml | 4 +-- .github/workflows/release-wheel-linux.yml | 10 +++---- .github/workflows/release-wheel-windows.yml | 4 +-- .github/workflows/release-windows.yml | 2 +- .github/workflows/windows-test.yml | 6 ++--- 16 files changed, 101 insertions(+), 53 deletions(-) diff --git a/.github/workflows/assigner.yml b/.github/workflows/assigner.yml index 2b65e554b1..b1056c50b3 100644 --- a/.github/workflows/assigner.yml +++ b/.github/workflows/assigner.yml @@ -22,7 +22,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Assign uses: ./.github/actions/assigner diff --git a/.github/workflows/build-tensorrt-linux.yml b/.github/workflows/build-tensorrt-linux.yml index 7581c38ae8..42fd32eb55 100644 --- a/.github/workflows/build-tensorrt-linux.yml +++ b/.github/workflows/build-tensorrt-linux.yml @@ -114,13 +114,13 @@ jobs: rm -rf "${RUNNER_TEMP}/*" fi echo "::endgroup::" - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: # Support the use case where we need to checkout someone's fork repository: ${{ inputs.test-infra-repository }} ref: ${{ inputs.test-infra-ref }} path: test-infra - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 if: ${{ env.ARCH == 'aarch64' }} with: # Support the use case where we need to checkout someone's fork @@ -212,7 +212,7 @@ jobs: # NB: Only upload to GitHub after passing smoke tests - name: Upload wheel to GitHub continue-on-error: true - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: ${{ env.UPLOAD_ARTIFACT_NAME }} path: ${{ inputs.repository }}/dist diff --git a/.github/workflows/build-tensorrt-windows.yml b/.github/workflows/build-tensorrt-windows.yml index 4b86910768..67639a3f02 100644 --- a/.github/workflows/build-tensorrt-windows.yml +++ b/.github/workflows/build-tensorrt-windows.yml @@ -100,7 +100,7 @@ jobs: # to have a conversation timeout-minutes: 120 steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: # Support the use case where we need to checkout someone's fork repository: ${{ inputs.test-infra-repository }} @@ -216,7 +216,7 @@ jobs: # NB: Only upload to GitHub after passing smoke tests - name: Upload wheel to GitHub continue-on-error: true - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: ${{ env.UPLOAD_ARTIFACT_NAME }} path: ${{ inputs.repository }}/dist/ diff --git a/.github/workflows/build-test-linux.yml b/.github/workflows/build-test-linux.yml index b0a487bb79..ecbe57036a 100644 --- a/.github/workflows/build-test-linux.yml +++ b/.github/workflows/build-test-linux.yml @@ -33,7 +33,7 @@ jobs: - uses: actions/setup-python@v4 with: python-version: '3.10' - - uses: actions/checkout@v3 + - uses: 
actions/checkout@v4 with: repository: pytorch/tensorrt - name: Generate release matrix @@ -136,7 +136,9 @@ jobs: export USE_HOST_DEPS=1 export CI_BUILD=1 pushd . - cd tests/py/dynamo + cd tests/py + python -m pip install -r requirements.txt + cd dynamo python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/dynamo_converters_test_results.xml -n 4 conversion/ popd @@ -165,7 +167,9 @@ jobs: export USE_HOST_DEPS=1 export CI_BUILD=1 pushd . - cd tests/py/dynamo + cd tests/py + python -m pip install -r requirements.txt + cd dynamo python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/dyn_models_export.xml --ir dynamo models/ popd @@ -194,7 +198,9 @@ jobs: export USE_HOST_DEPS=1 export CI_BUILD=1 pushd . - cd tests/py/dynamo + cd tests/py + python -m pip install -r requirements.txt + cd dynamo python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/export_serde_test_results.xml --ir dynamo models/test_export_serde.py python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/reexport_test_results.xml --ir dynamo models/test_reexport.py popd @@ -224,7 +230,9 @@ jobs: export USE_HOST_DEPS=1 export CI_BUILD=1 pushd . - cd tests/py/dynamo + cd tests/py + python -m pip install -r requirements.txt + cd dynamo python -m pytest -ra -n 10 --junitxml=${RUNNER_TEST_RESULTS_DIR}/torch_compile_be_test_results.xml backend/ python -m pytest -ra -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/torch_complete_be_e2e_test_results.xml --ir torch_compile models/test_models.py python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/torch_compile_dyn_models_export.xml --ir torch_compile models/test_dyn_models.py @@ -255,7 +263,9 @@ jobs: export USE_HOST_DEPS=1 export CI_BUILD=1 pushd . - cd tests/py/dynamo + cd tests/py + python -m pip install -r requirements.txt + cd dynamo python -m pytest -ra -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_runtime_test_results.xml --ignore runtime/test_002_cudagraphs_py.py --ignore runtime/test_002_cudagraphs_cpp.py runtime/ python -m pytest -ra -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_partitioning_test_results.xml partitioning/ python -m pytest -ra -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_lowering_test_results.xml lowering/ @@ -286,7 +296,9 @@ jobs: export USE_HOST_DEPS=1 export CI_BUILD=1 pushd . - cd tests/py/dynamo + cd tests/py + python -m pip install -r requirements.txt + cd dynamo nvidia-smi python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_runtime_cudagraphs_cpp_test_results.xml runtime/test_002_cudagraphs_cpp.py || true python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_runtime_cudagraphs_py_test_results.xml runtime/test_002_cudagraphs_py.py || true diff --git a/.github/workflows/build-test-tensorrt-linux.yml b/.github/workflows/build-test-tensorrt-linux.yml index dd83299fe7..9bf9b2c3de 100644 --- a/.github/workflows/build-test-tensorrt-linux.yml +++ b/.github/workflows/build-test-tensorrt-linux.yml @@ -30,7 +30,7 @@ jobs: - uses: actions/setup-python@v4 with: python-version: '3.10' - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: repository: pytorch/tensorrt - name: Generate tensorrt matrix @@ -132,7 +132,9 @@ jobs: export USE_HOST_DEPS=1 export CI_BUILD=1 pushd . 
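          # Hypothetical aside (not part of this patch): each test job below now performs the
          # equivalent of the following before invoking pytest, so that test-only Python deps
          # (pytest plugins, etc.) are present even when USE_HOST_DEPS=1 reuses the host torch install:
          #
          #   python -m pip install -r tests/py/requirements.txt && cd tests/py/dynamo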
- cd tests/py/dynamo + cd tests/py + python -m pip install -r requirements.txt + cd dynamo python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/dynamo_converters_test_results.xml -n 4 conversion/ popd @@ -161,7 +163,9 @@ jobs: export USE_HOST_DEPS=1 export CI_BUILD=1 pushd . - cd tests/py/dynamo + cd tests/py + python -m pip install -r requirements.txt + cd dynamo python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/dyn_models_export.xml --ir dynamo models/ popd @@ -190,7 +194,9 @@ jobs: export USE_HOST_DEPS=1 export CI_BUILD=1 pushd . - cd tests/py/dynamo + cd tests/py + python -m pip install -r requirements.txt + cd dynamo python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/export_serde_test_results.xml --ir dynamo models/test_export_serde.py popd @@ -219,7 +225,9 @@ jobs: export USE_HOST_DEPS=1 export CI_BUILD=1 pushd . - cd tests/py/dynamo + cd tests/py + python -m pip install -r requirements.txt + cd dynamo python -m pytest -ra -n 10 --junitxml=${RUNNER_TEST_RESULTS_DIR}/torch_compile_be_test_results.xml backend/ python -m pytest -ra -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/torch_complete_be_e2e_test_results.xml --ir torch_compile models/test_models.py python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/torch_compile_dyn_models_export.xml --ir torch_compile models/test_dyn_models.py @@ -250,7 +258,9 @@ jobs: export USE_HOST_DEPS=1 export CI_BUILD=1 pushd . - cd tests/py/dynamo + cd tests/py + python -m pip install -r requirements.txt + cd dynamo python -m pytest -ra -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_runtime_test_results.xml --ignore runtime/test_002_cudagraphs_py.py --ignore runtime/test_002_cudagraphs_cpp.py runtime/ python -m pytest -ra -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_partitioning_test_results.xml partitioning/ python -m pytest -ra -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_lowering_test_results.xml lowering/ @@ -281,7 +291,9 @@ jobs: export USE_HOST_DEPS=1 export CI_BUILD=1 pushd . - cd tests/py/dynamo + cd tests/py + python -m pip install -r requirements.txt + cd dynamo nvidia-smi python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_runtime_cudagraphs_cpp_test_results.xml runtime/test_002_cudagraphs_cpp.py || true python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_runtime_cudagraphs_py_test_results.xml runtime/test_002_cudagraphs_py.py || true diff --git a/.github/workflows/build-test-tensorrt-windows.yml b/.github/workflows/build-test-tensorrt-windows.yml index 883e7fe42a..cd73675407 100644 --- a/.github/workflows/build-test-tensorrt-windows.yml +++ b/.github/workflows/build-test-tensorrt-windows.yml @@ -30,7 +30,7 @@ jobs: - uses: actions/setup-python@v4 with: python-version: '3.10' - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: repository: pytorch/tensorrt - name: Generate tensorrt matrix @@ -135,7 +135,9 @@ jobs: export USE_HOST_DEPS=1 export CI_BUILD=1 pushd . - cd tests/py/dynamo + cd tests/py + python -m pip install -r requirements.txt + cd dynamo python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/dynamo_converters_test_results.xml -n 4 conversion/ popd @@ -161,7 +163,9 @@ jobs: export USE_HOST_DEPS=1 export CI_BUILD=1 pushd . 
- cd tests/py/dynamo + cd tests/py + python -m pip install -r requirements.txt + cd dynamo python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/dyn_models_export.xml --ir dynamo models/ popd @@ -187,7 +191,9 @@ jobs: export USE_HOST_DEPS=1 export CI_BUILD=1 pushd . - cd tests/py/dynamo + cd tests/py + python -m pip install -r requirements.txt + cd dynamo python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/export_serde_test_results.xml --ir dynamo models/test_export_serde.py popd @@ -213,7 +219,9 @@ jobs: export USE_HOST_DEPS=1 export CI_BUILD=1 pushd . - cd tests/py/dynamo + cd tests/py + python -m pip install -r requirements.txt + cd dynamo python -m pytest -ra -n 10 --junitxml=${RUNNER_TEST_RESULTS_DIR}/torch_compile_be_test_results.xml backend/ python -m pytest -ra -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/torch_complete_be_e2e_test_results.xml --ir torch_compile models/test_models.py python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/torch_compile_dyn_models_export.xml --ir torch_compile models/test_dyn_models.py @@ -241,7 +249,9 @@ jobs: export USE_HOST_DEPS=1 export CI_BUILD=1 pushd . - cd tests/py/dynamo + cd tests/py + python -m pip install -r requirements.txt + cd dynamo python -m pytest -ra -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_runtime_test_results.xml --ignore runtime/test_002_cudagraphs_py.py --ignore runtime/test_002_cudagraphs_cpp.py runtime/ python -m pytest -ra -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_partitioning_test_results.xml partitioning/ python -m pytest -ra -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_lowering_test_results.xml lowering/ @@ -269,7 +279,9 @@ jobs: export USE_HOST_DEPS=1 export CI_BUILD=1 pushd . - cd tests/py/dynamo + cd tests/py + python -m pip install -r requirements.txt + cd dynamo python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_runtime_cudagraphs_cpp_test_results.xml runtime/test_002_cudagraphs_cpp.py python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_runtime_cudagraphs_py_test_results.xml runtime/test_002_cudagraphs_py.py popd diff --git a/.github/workflows/build-test-windows.yml b/.github/workflows/build-test-windows.yml index c227d14a0f..2ee31b4b74 100644 --- a/.github/workflows/build-test-windows.yml +++ b/.github/workflows/build-test-windows.yml @@ -118,7 +118,9 @@ jobs: export USE_HOST_DEPS=1 export CI_BUILD=1 pushd . - cd tests/py/dynamo + cd tests/py + python -m pip install -r requirements.txt + cd dynamo python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/dynamo_converters_test_results.xml -n 4 conversion/ popd @@ -144,7 +146,9 @@ jobs: export USE_HOST_DEPS=1 export CI_BUILD=1 pushd . - cd tests/py/dynamo + cd tests/py + python -m pip install -r requirements.txt + cd dynamo python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/dyn_models_export.xml --ir dynamo models/ popd @@ -170,7 +174,9 @@ jobs: export USE_HOST_DEPS=1 export CI_BUILD=1 pushd . - cd tests/py/dynamo + cd tests/py + python -m pip install -r requirements.txt + cd dynamo python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/export_serde_test_results.xml --ir dynamo models/test_export_serde.py python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/reexport_test_results.xml --ir dynamo models/test_reexport.py popd @@ -197,7 +203,9 @@ jobs: export USE_HOST_DEPS=1 export CI_BUILD=1 pushd . 
- cd tests/py/dynamo + cd tests/py + python -m pip install -r requirements.txt + cd dynamo python -m pytest -ra -n 10 --junitxml=${RUNNER_TEST_RESULTS_DIR}/torch_compile_be_test_results.xml backend/ python -m pytest -ra -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/torch_complete_be_e2e_test_results.xml --ir torch_compile models/test_models.py python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/torch_compile_dyn_models_export.xml --ir torch_compile models/test_dyn_models.py @@ -225,7 +233,9 @@ jobs: export USE_HOST_DEPS=1 export CI_BUILD=1 pushd . - cd tests/py/dynamo + cd tests/py + python -m pip install -r requirements.txt + cd dynamo python -m pytest -ra -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_runtime_test_results.xml --ignore runtime/test_002_cudagraphs_py.py --ignore runtime/test_002_cudagraphs_cpp.py runtime/ python -m pytest -ra -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_partitioning_test_results.xml partitioning/ python -m pytest -ra -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_lowering_test_results.xml lowering/ @@ -253,7 +263,9 @@ jobs: export USE_HOST_DEPS=1 export CI_BUILD=1 pushd . - cd tests/py/dynamo + cd tests/py + python -m pip install -r requirements.txt + cd dynamo python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_runtime_cudagraphs_cpp_test_results.xml runtime/test_002_cudagraphs_cpp.py python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_runtime_cudagraphs_py_test_results.xml runtime/test_002_cudagraphs_py.py popd diff --git a/.github/workflows/docker_builder.yml b/.github/workflows/docker_builder.yml index 4aa228db95..771dc79f42 100644 --- a/.github/workflows/docker_builder.yml +++ b/.github/workflows/docker_builder.yml @@ -30,7 +30,7 @@ jobs: steps: - name: Checkout repository - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Fix Slashes Repo Name id: fix_slashes diff --git a/.github/workflows/linter.yml b/.github/workflows/linter.yml index 6428bef8c8..c05f45b6c7 100644 --- a/.github/workflows/linter.yml +++ b/.github/workflows/linter.yml @@ -26,7 +26,7 @@ jobs: with: ref: ${{ github.event.pull_request.head.sha }} - name: Set up Python 3.9 - uses: actions/setup-python@v3 + uses: actions/setup-python@v4 with: python-version: '3.9' - name: Setup env @@ -66,7 +66,7 @@ jobs: with: ref: ${{ github.event.pull_request.head.sha }} - name: Set up Python 3.9 - uses: actions/setup-python@v3 + uses: actions/setup-python@v4 with: python-version: '3.9' - name: Setup env diff --git a/.github/workflows/linux-test.yml b/.github/workflows/linux-test.yml index 6ddc601f2c..e4880f8ee8 100644 --- a/.github/workflows/linux-test.yml +++ b/.github/workflows/linux-test.yml @@ -85,7 +85,7 @@ jobs: rm -rfv "${GITHUB_WORKSPACE}" mkdir -p "${GITHUB_WORKSPACE}" echo "::endgroup::" - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: # Support the use case where we need to checkout someone's fork repository: ${{ inputs.test-infra-repository }} @@ -120,7 +120,7 @@ jobs: path: /opt/torch-tensorrt-builds/ - name: Download artifacts if: ${{ matrix.tensorrt != '' }} - uses: actions/download-artifact@v3 + uses: actions/download-artifact@v4 with: name: ${{ env.DOWNLOAD_ARTIFACT_NAME }} path: /opt/torch-tensorrt-builds/ @@ -184,7 +184,7 @@ jobs: echo "upload-docs=${upload_docs}" >> "${GITHUB_OUTPUT}" - name: Upload artifacts to GitHub (if any) - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 if: ${{ inputs.upload-artifact != '' }} with: 
name: ${{ inputs.upload-artifact }} diff --git a/.github/workflows/nightlies.yml b/.github/workflows/nightlies.yml index a0692cdafe..aac1c58f7f 100644 --- a/.github/workflows/nightlies.yml +++ b/.github/workflows/nightlies.yml @@ -11,7 +11,7 @@ jobs: environment: trigger-nightly timeout-minutes: 120 steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: ref: main token: ${{ secrets.GH_PYTORCHBOT_TOKEN }} diff --git a/.github/workflows/release-linux.yml b/.github/workflows/release-linux.yml index ca13b37443..8caf525e76 100644 --- a/.github/workflows/release-linux.yml +++ b/.github/workflows/release-linux.yml @@ -34,7 +34,7 @@ jobs: - uses: actions/setup-python@v4 with: python-version: '3.10' - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: repository: pytorch/tensorrt - name: Generate release matrix @@ -84,7 +84,7 @@ jobs: - uses: actions/setup-python@v4 with: python-version: '3.10' - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: repository: pytorch/tensorrt - name: Generate release matrix diff --git a/.github/workflows/release-wheel-linux.yml b/.github/workflows/release-wheel-linux.yml index 6ddd9e0306..54732378eb 100644 --- a/.github/workflows/release-wheel-linux.yml +++ b/.github/workflows/release-wheel-linux.yml @@ -114,13 +114,13 @@ jobs: rm -rf "${RUNNER_TEMP}/*" fi echo "::endgroup::" - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: # Support the use case where we need to checkout someone's fork repository: ${{ inputs.test-infra-repository }} ref: ${{ inputs.test-infra-ref }} path: test-infra - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 if: ${{ env.ARCH == 'aarch64' }} with: # Support the use case where we need to checkout someone's fork @@ -236,21 +236,21 @@ jobs: - name: Upload wheel to GitHub if: ${{ inputs.cxx11-tarball-release != 'true' }} continue-on-error: true - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: ${{ env.ARTIFACT_NAME }} path: ${{ inputs.repository }}/release/wheel/ - name: Upload pre-cxx11 tarball to GitHub if: ${{ inputs.cxx11-tarball-release != 'true' && env.PYTHON_VERSION == '3.10' }} continue-on-error: true - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: pre-cxx11-tarball-${{ env.PYTHON_VERSION }}-${{ env.CU_VERSION }} path: ${{ inputs.repository }}/release/tarball/ - name: Upload cxx11 tarball to GitHub if: ${{ inputs.cxx11-tarball-release == 'true' }} continue-on-error: true - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: cxx11-tarball-${{ env.PYTHON_VERSION }}-${{ env.CU_VERSION }} path: ${{ inputs.repository }}/release/tarball/ diff --git a/.github/workflows/release-wheel-windows.yml b/.github/workflows/release-wheel-windows.yml index 6a6c993502..2ea88bce9e 100644 --- a/.github/workflows/release-wheel-windows.yml +++ b/.github/workflows/release-wheel-windows.yml @@ -90,7 +90,7 @@ jobs: # to have a conversation timeout-minutes: 120 steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: # Support the use case where we need to checkout someone's fork repository: ${{ inputs.test-infra-repository }} @@ -199,7 +199,7 @@ jobs: # NB: Only upload to GitHub after passing smoke tests - name: Upload wheel to GitHub continue-on-error: true - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: ${{ env.ARTIFACT_NAME }} path: ${{ inputs.repository }}/dist/ diff --git a/.github/workflows/release-windows.yml 
b/.github/workflows/release-windows.yml index 271547cec3..489cc6ab30 100644 --- a/.github/workflows/release-windows.yml +++ b/.github/workflows/release-windows.yml @@ -34,7 +34,7 @@ jobs: - uses: actions/setup-python@v4 with: python-version: '3.10' - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: repository: pytorch/tensorrt - name: Generate release matrix diff --git a/.github/workflows/windows-test.yml b/.github/workflows/windows-test.yml index 13feedfa8c..a8b27c0aa9 100644 --- a/.github/workflows/windows-test.yml +++ b/.github/workflows/windows-test.yml @@ -70,7 +70,7 @@ jobs: mkdir -p "${GITHUB_WORKSPACE}" echo "::endgroup::" - name: Checkout repository (${{ inputs.test-infra-repository }}@${{ inputs.test-infra-ref }}) - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: # Support the use case where we need to checkout someone's fork repository: ${{ inputs.test-infra-repository }} @@ -105,13 +105,13 @@ jobs: is_windows: 'enabled' - name: Download artifacts if: ${{ matrix.tensorrt == '' }} - uses: actions/download-artifact@v3 + uses: actions/download-artifact@v4 with: name: ${{ env.ARTIFACT_NAME }} path: ${{ runner.temp }}/artifacts/ - name: Download artifacts if: ${{ matrix.tensorrt != '' }} - uses: actions/download-artifact@v3 + uses: actions/download-artifact@v4 with: name: ${{ env.DOWNLOAD_ARTIFACT_NAME }} path: ${{ runner.temp }}/artifacts/ From e3141ed122ccfc677c00ddebbbaf4460d0a7c11b Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Wed, 29 Jan 2025 16:57:06 -0800 Subject: [PATCH 5/5] chore: fix linting issues --- noxfile.py | 4 +++- setup.py | 3 +-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/noxfile.py b/noxfile.py index 6dcf1da60f..10c9b647fa 100644 --- a/noxfile.py +++ b/noxfile.py @@ -237,7 +237,9 @@ def run_dynamo_lower_tests(session): tests = ["lowering"] for test in tests: if USE_HOST_DEPS: - session.run_always("pytest", test, "-n", num_workers, env={"PYTHONPATH": PYT_PATH}) + session.run_always( + "pytest", test, "-n", num_workers, env={"PYTHONPATH": PYT_PATH} + ) else: session.run_always("pytest", test, "-n", num_workers) diff --git a/setup.py b/setup.py index 17b3d33c75..91648e57a1 100644 --- a/setup.py +++ b/setup.py @@ -33,7 +33,7 @@ def get_root_dir() -> Path: - dir_path = os.path.dirname(os.path.realpath(__file__)) + dir_path = os.path.dirname(os.path.realpath(__file__)) return dir_path @@ -119,7 +119,6 @@ def load_dep_info(): gpu_arch_version = f"cu{__cuda_version__.replace('.','')}" - __version__ = os.environ.get("BUILD_VERSION") if "--ci" in sys.argv:
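For context on the final lint fix: the reformatted noxfile.py block above is simply black-style line wrapping of a call that forwards PYTHONPATH to pytest. Below is a minimal, self-contained sketch of that session; the `PYT_PATH` default, the worker count, and the `python=False` session setting are illustrative assumptions, while the control flow and the `env={"PYTHONPATH": PYT_PATH}` forwarding come directly from the diff.

```
import os

import nox

# Illustrative defaults -- the real noxfile derives these from the environment/CI config
PYT_PATH = os.environ.get("PYT_PATH", "/opt/python/site-packages")
USE_HOST_DEPS = 0 < len(os.environ.get("USE_HOST_DEPS", ""))
num_workers = "4"


@nox.session(python=False)
def run_dynamo_lower_tests(session):
    """Run dynamo lowering tests, forwarding PYTHONPATH when host deps are reused."""
    tests = ["lowering"]
    for test in tests:
        if USE_HOST_DEPS:
            # Point pytest at the host-installed torch/torch_tensorrt packages
            session.run_always(
                "pytest", test, "-n", num_workers, env={"PYTHONPATH": PYT_PATH}
            )
        else:
            session.run_always("pytest", test, "-n", num_workers)
```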