From fb6cdfd3d2bcdbe266d42556c3b73b87df0d302a Mon Sep 17 00:00:00 2001 From: Naren Dasan <1790613+narendasan@users.noreply.github.com> Date: Wed, 29 Jan 2025 17:27:16 -0700 Subject: [PATCH 1/5] =?UTF-8?q?fix(aten::instance=5Fnorm):=20Handle=20opti?= =?UTF-8?q?onal=20inputs=20in=20instance=20norm=20con=E2=80=A6=20(#3367)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Dheeraj Peri --- .../scripts/generate_binary_build_matrix.py | 1 - .../conversion/converters/impl/batch_norm.cpp | 15 +- core/util/prelude.h | 1 + .../custom_kernel_plugins.py | 4 - examples/dynamo/custom_kernel_plugins.py | 4 - notebooks/CitriNet-example.ipynb | 14 +- notebooks/EfficientNet-example.ipynb | 18 +- notebooks/Hugging-Face-BERT.ipynb | 12 +- notebooks/Resnet50-CPP.ipynb | 1 - notebooks/Resnet50-example.ipynb | 21 +- notebooks/dynamic-shapes.ipynb | 20 +- ...ng_started_with_fx_path_lower_to_trt.ipynb | 996 +++++++++--------- notebooks/lenet-getting-started.ipynb | 11 +- notebooks/qat-ptq-workflow.ipynb | 25 +- notebooks/ssd-object-detection-demo.ipynb | 14 +- notebooks/vgg-qat.ipynb | 10 +- py/torch_tensorrt/_Device.py | 3 +- py/torch_tensorrt/_compile.py | 2 - py/torch_tensorrt/_features.py | 4 +- py/torch_tensorrt/dynamo/_engine_cache.py | 2 - py/torch_tensorrt/dynamo/_exporter.py | 1 - py/torch_tensorrt/dynamo/_refit.py | 2 - .../dynamo/conversion/_TRTInterpreter.py | 1 - .../dynamo/conversion/_conversion.py | 2 - .../dynamo/conversion/converter_utils.py | 3 +- .../dynamo/conversion/impl/arange.py | 1 - .../dynamo/conversion/impl/elementwise/ops.py | 1 - .../dynamo/conversion/impl/matmul.py | 1 - .../conversion/impl/normalization/ops.py | 1 - .../dynamo/conversion/impl/pad.py | 1 - .../dynamo/conversion/impl/slice/ops.py | 1 - .../dynamo/conversion/impl/unary/ops.py | 1 - .../runtime/_MutableTorchTensorRTModule.py | 3 - .../runtime/_PythonTorchTensorRTModule.py | 2 - py/torch_tensorrt/dynamo/utils.py | 1 - py/torch_tensorrt/logging.py | 4 +- py/torch_tensorrt/runtime/_utils.py | 1 - setup.py | 2 - .../converters/test_instance_norm.cpp | 2 +- tests/py/dynamo/conversion/harness.py | 1 - .../py/dynamo/conversion/test_resize_aten.py | 1 - .../py/dynamo/conversion/test_sym_not_aten.py | 1 - .../py/dynamo/lowering/test_decompositions.py | 1 - tests/py/dynamo/models/test_dtype_support.py | 1 - tests/py/dynamo/models/test_engine_cache.py | 2 - .../dynamo/models/test_export_kwargs_serde.py | 1 - tests/py/dynamo/models/test_model_refit.py | 10 - tests/py/dynamo/runtime/test_001_streams.py | 1 - .../runtime/test_002_lazy_engine_init.py | 1 - .../test_003_cross_compile_for_windows.py | 1 - .../runtime/test_004_weight_streaming.py | 1 - .../runtime/test_mutable_torchtrt_module.py | 4 - 52 files changed, 581 insertions(+), 654 deletions(-) diff --git a/.github/scripts/generate_binary_build_matrix.py b/.github/scripts/generate_binary_build_matrix.py index 26bb447b4f..f56b45b33b 100644 --- a/.github/scripts/generate_binary_build_matrix.py +++ b/.github/scripts/generate_binary_build_matrix.py @@ -469,7 +469,6 @@ def generate_wheels_matrix( ret: List[Dict[str, Any]] = [] for python_version in python_versions: for arch_version in arches: - # TODO: Enable Python 3.13 support for ROCM if arch_version in ROCM_ARCHES and python_version == "3.13": continue diff --git a/core/conversion/converters/impl/batch_norm.cpp b/core/conversion/converters/impl/batch_norm.cpp index 07cf445f50..c8ec1977a7 100644 --- a/core/conversion/converters/impl/batch_norm.cpp +++ 
b/core/conversion/converters/impl/batch_norm.cpp
@@ -134,9 +134,14 @@ auto batch_norm_registrations TORCHTRT_UNUSED =
           auto eps = static_cast<float>(args[7].unwrapToDouble(1e-5f));

-          auto scales = args[1].unwrapToTensor(at::ones(shape[1], options)).cpu().contiguous();
-          auto bias = args[2].unwrapToTensor(at::zeros(shape[1], options)).cpu().contiguous();
-
+          auto scales = at::ones(shape[1], options);
+          if (!args[1].IValue()->isNone()) {
+            scales = args[1].unwrapToTensor(at::ones(shape[1], options)).cpu().contiguous();
+          }
+          auto bias = at::zeros(shape[1], options);
+          if (!args[2].IValue()->isNone()) {
+            bias = args[2].unwrapToTensor(at::zeros(shape[1], options)).cpu().contiguous();
+          }
           // track_running_stats=True
           if (!args[3].IValue()->isNone() || !args[4].IValue()->isNone()) {
             auto running_mean = args[3].unwrapToTensor();
@@ -154,6 +159,8 @@ auto batch_norm_registrations TORCHTRT_UNUSED =
             return true;
           }

+          // Not sure this actually does anything, since cudnn_enabled comes from the PyTorch context.
+          // We need cuDNN either way to run this converter.
           auto cudnn_enabled = static_cast<bool>(args[8].unwrapToBool(false));
           if (!cudnn_enabled) {
             LOG_DEBUG(
@@ -162,7 +169,7 @@ auto batch_norm_registrations TORCHTRT_UNUSED =
                 so for some functionalities, users need to install correct \
                 cuDNN version by themselves. Please see our support matrix \
                 here: https://docs.nvidia.com/deeplearning/tensorrt/support-matrix/index.html.");
-            return false;
+            // return false;
           }

           const int relu = 0;

diff --git a/core/util/prelude.h b/core/util/prelude.h
index 957562c3c5..d269a9347a 100644
--- a/core/util/prelude.h
+++ b/core/util/prelude.h
@@ -2,6 +2,7 @@
 // A collection of headers from util that will typically get included in most
 // files
+#include 
 #include "core/util/Exception.h"
 #include "core/util/build_info.h"
 #include "core/util/jit_util.h"

diff --git a/docs/_downloads/c0341280f3b022df00c4241c42d9ee8b/custom_kernel_plugins.py b/docs/_downloads/c0341280f3b022df00c4241c42d9ee8b/custom_kernel_plugins.py
index 73b06119ae..398c0a1ebe 100644
--- a/docs/_downloads/c0341280f3b022df00c4241c42d9ee8b/custom_kernel_plugins.py
+++ b/docs/_downloads/c0341280f3b022df00c4241c42d9ee8b/custom_kernel_plugins.py
@@ -316,7 +316,6 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
 import cupy as cp  # Needed to work around API gaps in PyTorch to build torch.Tensors around preallocated CUDA memory
 import numpy as np
-
 import tensorrt as trt
@@ -348,7 +347,6 @@ def get_output_dimensions(
     inputs: List[trt.DimsExprs],
     exprBuilder: trt.IExprBuilder,
 ) -> trt.DimsExprs:
-
     output_dims = trt.DimsExprs(inputs[0])

     for i in range(np.size(self.pads) // 2):
@@ -404,7 +402,6 @@ def enqueue(
     workspace: int,
     stream: int,
 ) -> None:
-
     # Host code is slightly different as this will be run as part of the TRT execution

     in_dtype = torchtrt.dtype.try_from(input_desc[0].type).to(np.dtype)
@@ -528,7 +525,6 @@ def circular_padding_converter(
     kwargs: Dict[str, Argument],
     name: str,
 ):
-
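For context on the batch_norm.cpp hunks above: `aten::instance_norm` (routed through this converter) receives `None` for its weight and bias arguments when the layer is built with `affine=False`, and the old unconditional `unwrapToTensor(...)` calls failed on those `None` IValues. A minimal sketch of the previously failing case, using the TorchScript frontend; the module and shapes are illustrative, not taken from this patch:

```python
import torch
import torch_tensorrt

class NonAffineInstanceNorm(torch.nn.Module):
    def __init__(self) -> None:
        super().__init__()
        # affine=False (and track_running_stats=False by default) means the
        # converter sees None for weight, bias, running_mean and running_var
        self.norm = torch.nn.InstanceNorm2d(3, affine=False)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.norm(x)

model = torch.jit.script(NonAffineInstanceNorm().eval().cuda())
# With the fix, the converter falls back to ones/zeros instead of unwrapping None
trt_mod = torch_tensorrt.ts.compile(model, inputs=[torch_tensorrt.Input((1, 3, 8, 8))])
print(trt_mod(torch.rand(1, 3, 8, 8, device="cuda")).shape)
```

    # How to retrieve a plugin if it is defined elsewhere (e.g. 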
linked library) plugin_registry = trt.get_plugin_registry() plugin_creator = plugin_registry.get_plugin_creator( diff --git a/examples/dynamo/custom_kernel_plugins.py b/examples/dynamo/custom_kernel_plugins.py index 73b06119ae..398c0a1ebe 100644 --- a/examples/dynamo/custom_kernel_plugins.py +++ b/examples/dynamo/custom_kernel_plugins.py @@ -316,7 +316,6 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: import cupy as cp # Needed to work around API gaps in PyTorch to build torch.Tensors around preallocated CUDA memory import numpy as np - import tensorrt as trt @@ -348,7 +347,6 @@ def get_output_dimensions( inputs: List[trt.DimsExprs], exprBuilder: trt.IExprBuilder, ) -> trt.DimsExprs: - output_dims = trt.DimsExprs(inputs[0]) for i in range(np.size(self.pads) // 2): @@ -404,7 +402,6 @@ def enqueue( workspace: int, stream: int, ) -> None: - # Host code is slightly different as this will be run as part of the TRT execution in_dtype = torchtrt.dtype.try_from(input_desc[0].type).to(np.dtype) @@ -528,7 +525,6 @@ def circular_padding_converter( kwargs: Dict[str, Argument], name: str, ): - # How to retrieve a plugin if it is defined elsewhere (e.g. linked library) plugin_registry = trt.get_plugin_registry() plugin_creator = plugin_registry.get_plugin_creator( diff --git a/notebooks/CitriNet-example.ipynb b/notebooks/CitriNet-example.ipynb index b9d615d5f1..88d59e3424 100644 --- a/notebooks/CitriNet-example.ipynb +++ b/notebooks/CitriNet-example.ipynb @@ -384,12 +384,11 @@ "metadata": {}, "outputs": [], "source": [ - "import nemo\n", "import torch\n", "\n", "import nemo.collections.asr as nemo_asr\n", "from nemo.core import typecheck\n", - "typecheck.set_typecheck_enabled(False) " + "typecheck.set_typecheck_enabled(False)" ] }, { @@ -572,11 +571,8 @@ "from __future__ import absolute_import\n", "from __future__ import division\n", "\n", - "import argparse\n", "import timeit\n", "import numpy as np\n", - "import torch\n", - "import torch_tensorrt as trtorch\n", "import torch.backends.cudnn as cudnn\n", "\n", "def benchmark(model, input_tensor, num_loops, model_name, batch_size):\n", @@ -632,7 +628,7 @@ " else:\n", " model_name = f\"{variant}.ts\"\n", "\n", - " print(f\"Loading model: {model_name}\") \n", + " print(f\"Loading model: {model_name}\")\n", " # Load traced model to CPU first\n", " model = torch.jit.load(model_name).cuda()\n", " cudnn.benchmark = True\n", @@ -727,9 +723,7 @@ ], "source": [ "import torch\n", - "import torch.nn as nn\n", "import torch_tensorrt as torchtrt\n", - "import argparse\n", "\n", "variant = \"stt_en_citrinet_256\"\n", "precisions = [torch.float, torch.half]\n", @@ -827,7 +821,7 @@ " else:\n", " model_name = f\"{variant}.ts\"\n", "\n", - " print(f\"Loading model: {model_name}\") \n", + " print(f\"Loading model: {model_name}\")\n", " # Load traced model to CPU first\n", " model = torch.jit.load(model_name).cuda()\n", " cudnn.benchmark = True\n", @@ -906,7 +900,7 @@ " else:\n", " model_name = f\"{variant}.ts\"\n", "\n", - " print(f\"Loading model: {model_name}\") \n", + " print(f\"Loading model: {model_name}\")\n", " # Load traced model to CPU first\n", " model = torch.jit.load(model_name).cuda()\n", " cudnn.benchmark = True\n", diff --git a/notebooks/EfficientNet-example.ipynb b/notebooks/EfficientNet-example.ipynb index cfb8e79232..bbbfe6f94e 100644 --- a/notebooks/EfficientNet-example.ipynb +++ b/notebooks/EfficientNet-example.ipynb @@ -167,7 +167,7 @@ "import torch.backends.cudnn as cudnn\n", "from timm.data import resolve_data_config\n", "from 
timm.data.transforms_factory import create_transform\n", - "import json \n", + "import json\n", "\n", "efficientnet_b0_model = timm.create_model('efficientnet_b0',pretrained=True)\n", "model = efficientnet_b0_model.eval().to(\"cuda\")" @@ -305,13 +305,13 @@ " transforms.ToTensor(),\n", " transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),\n", " ])\n", - " input_tensor = preprocess(img) \n", + " input_tensor = preprocess(img)\n", " plt.subplot(2,2,i+1)\n", " plt.imshow(img)\n", " plt.axis('off')\n", "\n", "# loading labels\n", - "with open(\"./data/imagenet_class_index.json\") as json_file: \n", + "with open(\"./data/imagenet_class_index.json\") as json_file:\n", " d = json.load(json_file)" ] }, @@ -341,7 +341,7 @@ " preprocess = efficientnet_preprocess()\n", " input_tensor = preprocess(img)\n", " input_batch = input_tensor.unsqueeze(0) # create a mini-batch as expected by the model\n", - " \n", + "\n", " # move the input and model to GPU for speed if available\n", " if torch.cuda.is_available():\n", " input_batch = input_batch.to('cuda')\n", @@ -351,7 +351,7 @@ " output = model(input_batch)\n", " # Tensor of shape 1000, with confidence scores over Imagenet's 1000 classes\n", " sm_output = torch.nn.functional.softmax(output[0], dim=0)\n", - " \n", + "\n", " ind = torch.argmax(sm_output)\n", " return d[str(ind.item())], sm_output[ind] #([predicted class, description], probability)\n", "\n", @@ -360,7 +360,7 @@ " input_data = input_data.to(\"cuda\")\n", " if dtype=='fp16':\n", " input_data = input_data.half()\n", - " \n", + "\n", " print(\"Warm up ...\")\n", " with torch.no_grad():\n", " for _ in range(nwarmup):\n", @@ -430,13 +430,13 @@ "for i in range(4):\n", " img_path = './data/img%d.JPG'%i\n", " img = Image.open(img_path)\n", - " \n", + "\n", " pred, prob = predict(img_path, efficientnet_b0_model)\n", " print('{} - Predicted: {}, Probablility: {}'.format(img_path, pred, prob))\n", "\n", " plt.subplot(2,2,i+1)\n", - " plt.imshow(img);\n", - " plt.axis('off');\n", + " plt.imshow(img)\n", + " plt.axis('off')\n", " plt.title(pred[1])" ] }, diff --git a/notebooks/Hugging-Face-BERT.ipynb b/notebooks/Hugging-Face-BERT.ipynb index 81034d8e38..36068dbd58 100644 --- a/notebooks/Hugging-Face-BERT.ipynb +++ b/notebooks/Hugging-Face-BERT.ipynb @@ -233,9 +233,9 @@ "metadata": {}, "outputs": [], "source": [ - "masked_sentences = ['Paris is the [MASK] of France.', \n", - " 'The primary [MASK] of the United States is English.', \n", - " 'A baseball game consists of at least nine [MASK].', \n", + "masked_sentences = ['Paris is the [MASK] of France.',\n", + " 'The primary [MASK] of the United States is English.',\n", + " 'A baseball game consists of at least nine [MASK].',\n", " 'Topology is a branch of [MASK] concerned with the properties of geometric objects that remain unchanged under continuous transformations.']\n", "pos_masks = [4, 3, 9, 6]" ] @@ -357,7 +357,7 @@ "metadata": {}, "outputs": [], "source": [ - "trt_model = torch_tensorrt.compile(traced_mlm_model, \n", + "trt_model = torch_tensorrt.compile(traced_mlm_model,\n", " inputs= [torch_tensorrt.Input(shape=[batch_size, 128], dtype=torch.int32), # input_ids\n", " torch_tensorrt.Input(shape=[batch_size, 128], dtype=torch.int32), # token_type_ids\n", " torch_tensorrt.Input(shape=[batch_size, 128], dtype=torch.int32)], # attention_mask\n", @@ -396,7 +396,7 @@ "enc_inputs = enc(masked_sentences, return_tensors='pt', padding='max_length', max_length=128)\n", "enc_inputs = {k: v.type(torch.int32).cuda() for k, v in 
enc_inputs.items()}\n", "output_trt = trt_model(enc_inputs['input_ids'], enc_inputs['token_type_ids'], enc_inputs['attention_mask'])\n", - "most_likely_token_ids_trt = [torch.argmax(output_trt[i, pos, :]) for i, pos in enumerate(pos_masks)] \n", + "most_likely_token_ids_trt = [torch.argmax(output_trt[i, pos, :]) for i, pos in enumerate(pos_masks)]\n", "unmasked_tokens_trt = enc.decode(most_likely_token_ids_trt).split(' ')\n", "unmasked_sentences_trt = [masked_sentences[i].replace('[MASK]', token) for i, token in enumerate(unmasked_tokens_trt)]\n", "for sentence in unmasked_sentences_trt:\n", @@ -418,7 +418,7 @@ "metadata": {}, "outputs": [], "source": [ - "trt_model_fp16 = torch_tensorrt.compile(traced_mlm_model, \n", + "trt_model_fp16 = torch_tensorrt.compile(traced_mlm_model,\n", " inputs= [torch_tensorrt.Input(shape=[batch_size, 128], dtype=torch.int32), # input_ids\n", " torch_tensorrt.Input(shape=[batch_size, 128], dtype=torch.int32), # token_type_ids\n", " torch_tensorrt.Input(shape=[batch_size, 128], dtype=torch.int32)], # attention_mask\n", diff --git a/notebooks/Resnet50-CPP.ipynb b/notebooks/Resnet50-CPP.ipynb index 198ebc9911..87800e0a24 100755 --- a/notebooks/Resnet50-CPP.ipynb +++ b/notebooks/Resnet50-CPP.ipynb @@ -70,7 +70,6 @@ "outputs": [], "source": [ "import torch\n", - "import torchvision\n", "\n", "torch.hub._validate_not_a_forked_repo=lambda a,b,c: True\n", "\n", diff --git a/notebooks/Resnet50-example.ipynb b/notebooks/Resnet50-example.ipynb index a7d3d4eddd..7b5944ea8d 100644 --- a/notebooks/Resnet50-example.ipynb +++ b/notebooks/Resnet50-example.ipynb @@ -428,7 +428,6 @@ ], "source": [ "import torch\n", - "import torchvision\n", "\n", "torch.hub._validate_not_a_forked_repo=lambda a,b,c: True\n", "\n", @@ -558,7 +557,7 @@ "from PIL import Image\n", "from torchvision import transforms\n", "import matplotlib.pyplot as plt\n", - "import json \n", + "import json\n", "\n", "fig, axes = plt.subplots(nrows=2, ncols=2)\n", "\n", @@ -571,13 +570,13 @@ " transforms.ToTensor(),\n", " transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),\n", " ])\n", - " input_tensor = preprocess(img) \n", + " input_tensor = preprocess(img)\n", " plt.subplot(2,2,i+1)\n", " plt.imshow(img)\n", " plt.axis('off')\n", "\n", - "# loading labels \n", - "with open(\"./data/imagenet_class_index.json\") as json_file: \n", + "# loading labels\n", + "with open(\"./data/imagenet_class_index.json\") as json_file:\n", " d = json.load(json_file)" ] }, @@ -614,7 +613,7 @@ " preprocess = rn50_preprocess()\n", " input_tensor = preprocess(img)\n", " input_batch = input_tensor.unsqueeze(0) # create a mini-batch as expected by the model\n", - " \n", + "\n", " # move the input and model to GPU for speed if available\n", " if torch.cuda.is_available():\n", " input_batch = input_batch.to('cuda')\n", @@ -624,7 +623,7 @@ " output = model(input_batch)\n", " # Tensor of shape 1000, with confidence scores over Imagenet's 1000 classes\n", " sm_output = torch.nn.functional.softmax(output[0], dim=0)\n", - " \n", + "\n", " ind = torch.argmax(sm_output)\n", " return d[str(ind.item())], sm_output[ind] #([predicted class, description], probability)\n", "\n", @@ -633,7 +632,7 @@ " input_data = input_data.to(\"cuda\")\n", " if dtype=='fp16':\n", " input_data = input_data.half()\n", - " \n", + "\n", " print(\"Warm up ...\")\n", " with torch.no_grad():\n", " for _ in range(nwarmup):\n", @@ -695,13 +694,13 @@ "for i in range(4):\n", " img_path = './data/img%d.JPG'%i\n", " img = Image.open(img_path)\n", - " \n", + 
"\n", " pred, prob = predict(img_path, resnet50_model)\n", " print('{} - Predicted: {}, Probablility: {}'.format(img_path, pred, prob))\n", "\n", " plt.subplot(2,2,i+1)\n", - " plt.imshow(img);\n", - " plt.axis('off');\n", + " plt.imshow(img)\n", + " plt.axis('off')\n", " plt.title(pred[1])" ] }, diff --git a/notebooks/dynamic-shapes.ipynb b/notebooks/dynamic-shapes.ipynb index 5738f13521..046f2bfe2d 100644 --- a/notebooks/dynamic-shapes.ipynb +++ b/notebooks/dynamic-shapes.ipynb @@ -313,7 +313,7 @@ "from PIL import Image\n", "from torchvision import transforms\n", "import matplotlib.pyplot as plt\n", - "import json \n", + "import json\n", "\n", "fig, axes = plt.subplots(nrows=2, ncols=2)\n", "\n", @@ -326,13 +326,13 @@ " transforms.ToTensor(),\n", " transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),\n", " ])\n", - " input_tensor = preprocess(img) \n", + " input_tensor = preprocess(img)\n", " plt.subplot(2,2,i+1)\n", " plt.imshow(img)\n", " plt.axis('off')\n", "\n", - "# loading labels \n", - "with open(\"./data/imagenet_class_index.json\") as json_file: \n", + "# loading labels\n", + "with open(\"./data/imagenet_class_index.json\") as json_file:\n", " d = json.load(json_file)" ] }, @@ -589,7 +589,7 @@ " preprocess = rn50_preprocess()\n", " input_tensor = preprocess(img)\n", " input_batch = input_tensor.unsqueeze(0) # create a mini-batch as expected by the model\n", - " \n", + "\n", " # move the input and model to GPU for speed if available\n", " if torch.cuda.is_available():\n", " input_batch = input_batch.to('cuda')\n", @@ -599,7 +599,7 @@ " output = model(input_batch)\n", " # Tensor of shape 1000, with confidence scores over Imagenet's 1000 classes\n", " sm_output = torch.nn.functional.softmax(output[0], dim=0)\n", - " \n", + "\n", " ind = torch.argmax(sm_output)\n", " return d[str(ind.item())], sm_output[ind] #([predicted class, description], probability)\n", "\n", @@ -609,7 +609,7 @@ " input_data = input_data.to(\"cuda\")\n", " if dtype=='fp16':\n", " input_data = input_data.half()\n", - " \n", + "\n", " print(\"Warm up ...\")\n", " with torch.no_grad():\n", " for _ in range(nwarmup):\n", @@ -673,13 +673,13 @@ "for i in range(4):\n", " img_path = './data/img%d.JPG'%i\n", " img = Image.open(img_path)\n", - " \n", + "\n", " pred, prob = predict(img_path, resnet50_model)\n", " print('{} - Predicted: {}, Probablility: {}'.format(img_path, pred, prob))\n", "\n", " plt.subplot(2,2,i+1)\n", - " plt.imshow(img);\n", - " plt.axis('off');\n", + " plt.imshow(img)\n", + " plt.axis('off')\n", " plt.title(pred[1])" ] }, diff --git a/notebooks/getting_started_with_fx_path_lower_to_trt.ipynb b/notebooks/getting_started_with_fx_path_lower_to_trt.ipynb index 8e480903ab..0b90e34bd6 100644 --- a/notebooks/getting_started_with_fx_path_lower_to_trt.ipynb +++ b/notebooks/getting_started_with_fx_path_lower_to_trt.ipynb @@ -1,517 +1,517 @@ { - "metadata": { - "dataExplorerConfig": {}, - "bento_stylesheets": { - "bento/extensions/flow/main.css": true, - "bento/extensions/kernel_selector/main.css": true, - "bento/extensions/kernel_ui/main.css": true, - "bento/extensions/new_kernel/main.css": true, - "bento/extensions/system_usage/main.css": true, - "bento/extensions/theme/main.css": true + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "code_folding": [], + "customInput": null, + "hidden_ranges": [], + "originalKey": "8ca7695d-8a19-454e-b32b-3d5c36d52faf", + "showInput": false + }, + "source": [ + "The purpose of this example is to demostrate the overall flow of lowering a 
PyTorch model\n", + "to TensorRT conveniently with lower.py. We integrated the transformation process including `TRTInterpreter`, `TRTModule`, pass optimization into the `lower_to_trt` API, users are encouraged to check the docstring of the API and tune it to meet your needs." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "collapsed": false, + "customInput": null, + "customOutput": null, + "executionStartTime": 1661189891682, + "executionStopTime": 1661189891856, + "originalKey": "7db2accc-9fa4-4a1e-8142-d887f2947bcd", + "requestMsgId": "b5d8efce-0963-4074-bc9d-e8e1a78fd424", + "showInput": true + }, + "outputs": [], + "source": [ + "import typing as t\n", + "from copy import deepcopy\n", + "from dataclasses import dataclass, field, replace\n", + "\n", + "import torch\n", + "import torchvision\n", + "from torch_tensorrt.fx.lower import compile\n", + "from torch_tensorrt.fx.utils import LowerPrecision" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "code_folding": [], + "customInput": null, + "hidden_ranges": [], + "originalKey": "e324a1ff-1bc2-4e78-932f-33534c3ac3f5", + "showInput": false + }, + "source": [ + "Specify the `configuration` class used for FX path lowering and benchmark. To extend, add a new configuration field to this class, and modify the lowering or benchmark behavior in `run_configuration_benchmark()` correspondingly. It automatically stores all its values to a `Result` dataclass. \n", + "`Result` is another dataclass that holds raw essential benchmark result values like Batch size, QPS, accuracy, etc..\n", + "" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "code_folding": [], + "collapsed": false, + "customInput": null, + "customOutput": null, + "executionStartTime": 1661189260550, + "executionStopTime": 1661189262039, + "hidden_ranges": [], + "originalKey": "2835fffa-cc50-479a-9080-c4f7002c0726", + "requestMsgId": "6ea72dbf-dbfe-451e-8613-15f87e34a1a5", + "showInput": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "I0822 102740.872 _utils_internal.py:179] NCCL_DEBUG env var is set to None\n" + ] }, - "kernelspec": { - "display_name": "dper3_pytorch (cinder)", - "language": "python", - "name": "bento_kernel_dper3_pytorch_cinder", - "metadata": { - "kernel_name": "bento_kernel_dper3_pytorch_cinder", - "nightly_builds": false, - "fbpkg_supported": true, - "cinder_runtime": true, - "is_prebuilt": true - } + { + "name": "stderr", + "output_type": "stream", + "text": [ + "I0822 102740.873 _utils_internal.py:188] NCCL_DEBUG is INFO from /etc/nccl.conf\n" + ] + } + ], + "source": [ + "@dataclass\n", + "class Configuration:\n", + " # number of inferences to run\n", + " batch_iter: int\n", + "\n", + " # Input batch size\n", + " batch_size: int\n", + "\n", + " # Friendly name of the configuration\n", + " name: str = \"\"\n", + "\n", + " # Whether to apply TRT lowering to the model before benchmarking\n", + " trt: bool = False\n", + "\n", + " # Whether to apply engine holder to the lowered model\n", + " jit: bool = False\n", + "\n", + " # Whether to enable FP16 mode for TRT lowering\n", + " fp16: bool = False\n", + "\n", + " # Relative tolerance for accuracy check after lowering. 
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "code_folding": [],
    "collapsed": false,
    "customInput": null,
    "customOutput": null,
    "executionStartTime": 1661189260550,
    "executionStopTime": 1661189262039,
    "hidden_ranges": [],
    "originalKey": "2835fffa-cc50-479a-9080-c4f7002c0726",
    "requestMsgId": "6ea72dbf-dbfe-451e-8613-15f87e34a1a5",
    "showInput": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "I0822 102740.872 _utils_internal.py:179] NCCL_DEBUG env var is set to None\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "I0822 102740.873 _utils_internal.py:188] NCCL_DEBUG is INFO from /etc/nccl.conf\n"
     ]
    }
   ],
   "source": [
    "@dataclass\n",
    "class Configuration:\n",
    "    # number of inferences to run\n",
    "    batch_iter: int\n",
    "\n",
    "    # Input batch size\n",
    "    batch_size: int\n",
    "\n",
    "    # Friendly name of the configuration\n",
    "    name: str = \"\"\n",
    "\n",
    "    # Whether to apply TRT lowering to the model before benchmarking\n",
    "    trt: bool = False\n",
    "\n",
    "    # Whether to apply engine holder to the lowered model\n",
    "    jit: bool = False\n",
    "\n",
    "    # Whether to enable FP16 mode for TRT lowering\n",
    "    fp16: bool = False\n",
    "\n",
    "    # Relative tolerance for accuracy check after lowering. -1 means do not\n",
    "    # check accuracy.\n",
    "    accuracy_rtol: float = -1  # disable\n",
    "\n",
    "@dataclass\n",
    "class Result:\n",
    "    module: torch.nn.Module = field(repr=False)\n",
    "    input: t.Any = field(repr=False)\n",
    "    conf: Configuration\n",
    "    time_sec: float\n",
    "    accuracy_res: t.Optional[bool] = None\n",
    "\n",
    "    @property\n",
    "    def time_per_iter_ms(self) -> float:\n",
    "        return self.time_sec * 1.0e3\n",
    "\n",
    "    @property\n",
    "    def qps(self) -> float:\n",
    "        return self.conf.batch_size / self.time_sec\n",
    "\n",
    "    def format(self) -> str:\n",
    "        return (\n",
    "            f\"== Benchmark Result for: {self.conf}\\n\"\n",
    "            f\"BS: {self.conf.batch_size}, \"\n",
    "            f\"Time per iter: {self.time_per_iter_ms:.2f}ms, \"\n",
    "            f\"QPS: {self.qps:.2f}, \"\n",
    "            f\"Accuracy: {self.accuracy_res} (rtol={self.conf.accuracy_rtol})\"\n",
    "        )"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "code_folding": [],
    "customInput": null,
    "hidden_ranges": [],
    "originalKey": "3e462cf6-d282-402d-955b-a3ecb400bf0b",
    "showInput": false
   },
   "source": [
    "Run FX path lowering and benchmark the given model according to the specified benchmark configuration. The benchmark result for each configuration is printed at the end of the run. `benchmark_torch_function` is the helper that times a fixed number of iterations of the function under test.\n",
    "The FX path lowering and TensorRT engine creation are integrated into the `compile()` API, which is defined in the `fx/lower.py` file.\n",
    "It is worth listing it out here to show its usage: it takes in the original module, inputs, and lowering settings, and runs the lowering workflow to turn the module into an executable TRT engine.\n",
    "```\n",
    "def compile(\n",
    "    module: nn.Module,\n",
    "    input: ,\n",
    "    max_batch_size: int = 2048,\n",
    "    max_workspace_size=1 << 25,\n",
    "    explicit_batch_dimension=False,\n",
    "    lower_precision=LowerPrecision.FP16,\n",
    "    verbose_log=False,\n",
    "    timing_cache_prefix=\"\",\n",
    "    save_timing_cache=False,\n",
    "    cuda_graph_batch_size=-1,\n",
    "    dynamic_batch=False,\n",
    ") -> nn.Module:\n",
    "```\n",
    "\n",
    "    Args:\n",
    "        module: Original module for lowering.\n",
    "        input: Input for module.\n",
    "        max_batch_size: Maximum batch size (must be >= 1 to be set, 0 means not set)\n",
    "        max_workspace_size: Maximum size of workspace given to TensorRT.\n",
    "        explicit_batch_dimension: Use explicit batch dimension in TensorRT if set to True, otherwise use implicit batch dimension.\n",
    "        lower_precision: lower_precision config given to TRTModule.\n",
    "        verbose_log: Enable verbose log for TensorRT if set to True.\n",
    "        timing_cache_prefix: Timing cache file name for the timing cache used by fx2trt.\n",
    "        save_timing_cache: Update the timing cache with current timing cache data if set to True.\n",
    "        cuda_graph_batch_size: CUDA graph batch size; defaults to -1.\n",
    "        dynamic_batch: batch dimension (dim=0) is dynamic.\n",
    "\n",
    "    Returns:\n",
    "        A torch.nn.Module lowered by TensorRT.\n",
    "We tested a resnet18 network with an input size of [128,3,224,224] for [Batch, Channel, Width, Height]."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "code_folding": [],
    "collapsed": false,
    "customInput": null,
    "customOutput": null,
    "executionStartTime": 1661189697773,
    "executionStopTime": 1661189753875,
    "hidden_ranges": [],
    "originalKey": "3002935b-b95a-4a08-a57f-f7a35485af5b",
    "requestMsgId": "dc73f2d0-427b-4f71-bec1-b118cc5642d0",
    "showInput": true
   },
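Given the `compile()` signature quoted above, a minimal invocation looks like the sketch below. It mirrors the resnet18 / [128, 3, 224, 224] setup this notebook benchmarks; treat it as illustrative rather than as part of the original notebook:

```python
import torch
import torchvision
from torch_tensorrt.fx.lower import compile
from torch_tensorrt.fx.utils import LowerPrecision

# Sketch: lower ResNet-18 with the compile() API documented above, then run it.
model = torchvision.models.resnet18(pretrained=True).cuda().eval()
inputs = [torch.rand(128, 3, 224, 224, device="cuda")]

trt_module = compile(
    model,
    inputs,
    max_batch_size=128,                   # matches the benchmark batch size
    lower_precision=LowerPrecision.FP32,  # the notebook calls .half() on model and inputs before FP16 runs
)
print(trt_module(*inputs).shape)  # torch.Size([128, 1000])
```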
"outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "I0822 103458.189 manifold.py:1435] URL manifold://torchvision/tree/models/resnet18-f37072fd.pth was already cached in /home/wwei6/.torch/iopath_cache/manifold_cache/tree/models/resnet18-f37072fd.pth\n" + ] }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3" + { + "name": "stdout", + "output_type": "stream", + "text": [ + "=== Running benchmark for: Configuration(batch_iter=50, batch_size=128, name='CUDA Eager', trt=False, jit=False, fp16=False, accuracy_rtol=-1) green\n== Start benchmark iterations\n" + ] }, - "last_server_session_id": "24a1a10c-29aa-4e2b-a11f-2b5108fc1e58", - "last_kernel_id": "5f014373-151c-4ee8-8939-4daab994d202", - "last_base_url": "https://devgpu005.ftw6.facebook.com:8091/", - "last_msg_id": "687e81e8-4414f32c89cd026dd1ea3fd9_139", - "outputWidgetContext": {} - }, - "nbformat": 4, - "nbformat_minor": 2, - "cells": [ { - "cell_type": "markdown", - "metadata": { - "originalKey": "8ca7695d-8a19-454e-b32b-3d5c36d52faf", - "showInput": false, - "customInput": null, - "code_folding": [], - "hidden_ranges": [] - }, - "source": [ - "The purpose of this example is to demostrate the overall flow of lowering a PyTorch model\n", - "to TensorRT conveniently with lower.py. We integrated the transformation process including `TRTInterpreter`, `TRTModule`, pass optimization into the `lower_to_trt` API, users are encouraged to check the docstring of the API and tune it to meet your needs." - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "== End benchmark iterations\n=== Running benchmark for: Configuration(batch_iter=50, batch_size=128, name='TRT FP32 Eager', trt=True, jit=False, fp16=False, accuracy_rtol=0.001) green\n" + ] }, { - "cell_type": "code", - "metadata": { - "originalKey": "7db2accc-9fa4-4a1e-8142-d887f2947bcd", - "showInput": true, - "customInput": null, - "collapsed": false, - "requestMsgId": "b5d8efce-0963-4074-bc9d-e8e1a78fd424", - "customOutput": null, - "executionStartTime": 1661189891682, - "executionStopTime": 1661189891856 - }, - "source": [ - "import typing as t\n", - "from copy import deepcopy\n", - "from dataclasses import dataclass, field, replace\n", - "\n", - "import torch\n", - "import torchvision\n", - "from torch_tensorrt.fx.lower import compile\n", - "from torch_tensorrt.fx.utils import LowerPrecision" - ], - "execution_count": 9, - "outputs": [] + "name": "stderr", + "output_type": "stream", + "text": [ + "I0822 103501.297 pass_utils.py:166] == Log pass before/after graph to /tmp/tmpe_7p37fq\n" + ] }, { - "cell_type": "markdown", - "metadata": { - "originalKey": "e324a1ff-1bc2-4e78-932f-33534c3ac3f5", - "showInput": false, - "customInput": null, - "code_folding": [], - "hidden_ranges": [] - }, - "source": [ - "Specify the `configuration` class used for FX path lowering and benchmark. To extend, add a new configuration field to this class, and modify the lowering or benchmark behavior in `run_configuration_benchmark()` correspondingly. It automatically stores all its values to a `Result` dataclass. 
\n", - "`Result` is another dataclass that holds raw essential benchmark result values like Batch size, QPS, accuracy, etc..\n", - "" - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "I0822 103501.390 pass_utils.py:166] == Log pass before/after graph to /tmp/tmpg_a347f0\n" + ] }, { - "cell_type": "code", - "metadata": { - "originalKey": "2835fffa-cc50-479a-9080-c4f7002c0726", - "showInput": true, - "customInput": null, - "code_folding": [], - "hidden_ranges": [], - "collapsed": false, - "requestMsgId": "6ea72dbf-dbfe-451e-8613-15f87e34a1a5", - "customOutput": null, - "executionStartTime": 1661189260550, - "executionStopTime": 1661189262039 - }, - "source": [ - "@dataclass\n", - "class Configuration:\n", - " # number of inferences to run\n", - " batch_iter: int\n", - "\n", - " # Input batch size\n", - " batch_size: int\n", - "\n", - " # Friendly name of the configuration\n", - " name: str = \"\"\n", - "\n", - " # Whether to apply TRT lowering to the model before benchmarking\n", - " trt: bool = False\n", - "\n", - " # Whether to apply engine holder to the lowered model\n", - " jit: bool = False\n", - "\n", - " # Whether to enable FP16 mode for TRT lowering\n", - " fp16: bool = False\n", - "\n", - " # Relative tolerance for accuracy check after lowering. -1 means do not\n", - " # check accuracy.\n", - " accuracy_rtol: float = -1 # disable\n", - " \n", - "@dataclass\n", - "class Result:\n", - " module: torch.nn.Module = field(repr=False)\n", - " input: t.Any = field(repr=False)\n", - " conf: Configuration\n", - " time_sec: float\n", - " accuracy_res: t.Optional[bool] = None\n", - "\n", - " @property\n", - " def time_per_iter_ms(self) -> float:\n", - " return self.time_sec * 1.0e3\n", - "\n", - " @property\n", - " def qps(self) -> float:\n", - " return self.conf.batch_size / self.time_sec\n", - "\n", - " def format(self) -> str:\n", - " return (\n", - " f\"== Benchmark Result for: {self.conf}\\n\"\n", - " f\"BS: {self.conf.batch_size}, \"\n", - " f\"Time per iter: {self.time_per_iter_ms:.2f}ms, \"\n", - " f\"QPS: {self.qps:.2f}, \"\n", - " f\"Accuracy: {self.accuracy_res} (rtol={self.conf.accuracy_rtol})\"\n", - " )" - ], - "execution_count": 2, - "outputs": [ - { - "output_type": "stream", - "name": "stderr", - "text": [ - "I0822 102740.872 _utils_internal.py:179] NCCL_DEBUG env var is set to None\n" - ] - }, - { - "output_type": "stream", - "name": "stderr", - "text": [ - "I0822 102740.873 _utils_internal.py:188] NCCL_DEBUG is INFO from /etc/nccl.conf\n" - ] - } - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "I0822 103501.509 lower_pass_manager_builder.py:151] Now lowering submodule _run_on_acc_0\n" + ] }, { - "cell_type": "markdown", - "metadata": { - "originalKey": "3e462cf6-d282-402d-955b-a3ecb400bf0b", - "showInput": false, - "customInput": null, - "code_folding": [], - "hidden_ranges": [] - }, - "source": [ - "Run FX path lowering and benchmark the given model according to the specified benchmark configuration. Prints the benchmark result for each configuration at the end of the run. `benchmark_torch_function` is the actual function that computes the fixed number of iterations of functions runs.\n", - "The FX path lowering and TensorRT engine creation is integrated into `compile()` API which is defined in `fx/lower.py` file.\n", - "It is good to list it out and show the usage of it. 
It takes in original module, input and lowering setting, run lowering workflow to turn module into a executable TRT engine \n", - "```\n", - "def compile(\n", - " module: nn.Module,\n", - " input: ,\n", - " max_batch_size: int = 2048,\n", - " max_workspace_size=1 << 25,\n", - " explicit_batch_dimension=False,\n", - " lower_precision=LowerPrecision.FP16,\n", - " verbose_log=False,\n", - " timing_cache_prefix=\"\",\n", - " save_timing_cache=False,\n", - " cuda_graph_batch_size=-1,\n", - " dynamic_batch=False,\n", - ") -> nn.Module:\n", - "``` \n", - "\n", - " Args:\n", - " module: Original module for lowering.\n", - " input: Input for module.\n", - " max_batch_size: Maximum batch size (must be >= 1 to be set, 0 means not set)\n", - " max_workspace_size: Maximum size of workspace given to TensorRT.\n", - " explicit_batch_dimension: Use explicit batch dimension in TensorRT if set True, otherwise use implicit batch dimension.\n", - " lower_precision: lower_precision config given to TRTModule.\n", - " verbose_log: Enable verbose log for TensorRT if set True.\n", - " timing_cache_prefix: Timing cache file name for timing cache used by fx2trt.\n", - " save_timing_cache: Update timing cache with current timing cache data if set to True.\n", - " cuda_graph_batch_size: Cuda graph batch size, default to be -1.\n", - " dynamic_batch: batch dimension (dim=0) is dynamic.\n", - "\n", - " Returns:\n", - " A torch.nn.Module lowered by TensorRT.\n", - "We testd a resnet18 network with input size of [128,3,224,224] for [Batch, Channel, Width, Height]" - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "I0822 103501.511 lower.py:89] split_name='_run_on_acc_0' self.lower_setting.input_specs=[InputTensorSpec(shape=torch.Size([128, 3, 224, 224]), dtype=torch.float32, device=device(type='cuda', index=0), shape_ranges=[], has_batch_dim=True)]\n" + ] }, { - "cell_type": "code", - "metadata": { - "originalKey": "3002935b-b95a-4a08-a57f-f7a35485af5b", - "showInput": true, - "customInput": null, - "code_folding": [], - "hidden_ranges": [], - "collapsed": false, - "requestMsgId": "dc73f2d0-427b-4f71-bec1-b118cc5642d0", - "customOutput": null, - "executionStartTime": 1661189697773, - "executionStopTime": 1661189753875 - }, - "source": [ - "def benchmark_torch_function(iters: int, f, *args) -> float:\n", - " \"\"\"Estimates the average time duration for a single inference call in second\n", - "\n", - " If the input is batched, then the estimation is for the batches inference call.\n", - " \"\"\"\n", - " with torch.inference_mode():\n", - " f(*args)\n", - " torch.cuda.synchronize()\n", - " start_event = torch.cuda.Event(enable_timing=True)\n", - " end_event = torch.cuda.Event(enable_timing=True)\n", - " print(\"== Start benchmark iterations\")\n", - " with torch.inference_mode():\n", - " start_event.record()\n", - " for _ in range(iters):\n", - " f(*args)\n", - " end_event.record()\n", - " torch.cuda.synchronize()\n", - " print(\"== End benchmark iterations\")\n", - " return (start_event.elapsed_time(end_event) * 1.0e-3) / iters\n", - "\n", - "\n", - "def run_configuration_benchmark(\n", - " module,\n", - " input,\n", - " conf: Configuration,\n", - ") -> Result:\n", - " print(f\"=== Running benchmark for: {conf}\", \"green\")\n", - " time = -1.0\n", - "\n", - " if conf.fp16:\n", - " module = module.half()\n", - " input = [i.half() for i in input]\n", - "\n", - " if not conf.trt:\n", - " # Run eager mode benchmark\n", - " time = benchmark_torch_function(conf.batch_iter, lambda: module(*input))\n", - " elif not 
conf.jit:\n", - " # Run lowering eager mode benchmark\n", - " lowered_module = compile(\n", - " module,\n", - " input,\n", - " max_batch_size=conf.batch_size,\n", - " lower_precision=LowerPrecision.FP16 if conf.fp16 else LowerPrecision.FP32,\n", - " )\n", - " time = benchmark_torch_function(conf.batch_iter, lambda: lowered_module(*input))\n", - " else:\n", - " print(\"Lowering with JIT is not available!\", \"red\")\n", - "\n", - " result = Result(module=module, input=input, conf=conf, time_sec=time)\n", - " return result\n", - "\n", - "\n", - "@torch.inference_mode()\n", - "def benchmark(\n", - " model,\n", - " inputs,\n", - " batch_iter: int,\n", - " batch_size: int,\n", - ") -> None:\n", - " model = model.cuda().eval()\n", - " inputs = [x.cuda() for x in inputs]\n", - "\n", - " # benchmark base configuration\n", - " conf = Configuration(batch_iter=batch_iter, batch_size=batch_size)\n", - "\n", - " configurations = [\n", - " # Baseline\n", - " replace(conf, name=\"CUDA Eager\", trt=False),\n", - " # FP32\n", - " replace(\n", - " conf,\n", - " name=\"TRT FP32 Eager\",\n", - " trt=True,\n", - " jit=False,\n", - " fp16=False,\n", - " accuracy_rtol=1e-3,\n", - " ),\n", - " # FP16\n", - " replace(\n", - " conf,\n", - " name=\"TRT FP16 Eager\",\n", - " trt=True,\n", - " jit=False,\n", - " fp16=True,\n", - " accuracy_rtol=1e-2,\n", - " ),\n", - " ]\n", - "\n", - " results = [run_configuration_benchmark(deepcopy(model), inputs, conf_) for conf_ in configurations]\n", - "\n", - " for res in results:\n", - " print(res.format())\n", - "\n", - "\n", - "test_model = torchvision.models.resnet18(pretrained=True)\n", - "input = [torch.rand(128, 3, 224, 224)]\n", - "benchmark(test_model, input, 50, 128)" - ], - "execution_count": 8, - "outputs": [ - { - "output_type": "stream", - "name": "stderr", - "text": [ - "I0822 103458.189 manifold.py:1435] URL manifold://torchvision/tree/models/resnet18-f37072fd.pth was already cached in /home/wwei6/.torch/iopath_cache/manifold_cache/tree/models/resnet18-f37072fd.pth\n" - ] - }, - { - "output_type": "stream", - "name": "stdout", - "text": [ - "=== Running benchmark for: Configuration(batch_iter=50, batch_size=128, name='CUDA Eager', trt=False, jit=False, fp16=False, accuracy_rtol=-1) green\n== Start benchmark iterations\n" - ] - }, - { - "output_type": "stream", - "name": "stdout", - "text": [ - "== End benchmark iterations\n=== Running benchmark for: Configuration(batch_iter=50, batch_size=128, name='TRT FP32 Eager', trt=True, jit=False, fp16=False, accuracy_rtol=0.001) green\n" - ] - }, - { - "output_type": "stream", - "name": "stderr", - "text": [ - "I0822 103501.297 pass_utils.py:166] == Log pass before/after graph to /tmp/tmpe_7p37fq\n" - ] - }, - { - "output_type": "stream", - "name": "stderr", - "text": [ - "I0822 103501.390 pass_utils.py:166] == Log pass before/after graph to /tmp/tmpg_a347f0\n" - ] - }, - { - "output_type": "stream", - "name": "stderr", - "text": [ - "I0822 103501.509 lower_pass_manager_builder.py:151] Now lowering submodule _run_on_acc_0\n" - ] - }, - { - "output_type": "stream", - "name": "stderr", - "text": [ - "I0822 103501.511 lower.py:89] split_name='_run_on_acc_0' self.lower_setting.input_specs=[InputTensorSpec(shape=torch.Size([128, 3, 224, 224]), dtype=torch.float32, device=device(type='cuda', index=0), shape_ranges=[], has_batch_dim=True)]\n" - ] - }, - { - "output_type": "stream", - "name": "stdout", - "text": [ - "\nSupported node types in the model:\nacc_ops.conv2d: ((), {'input': torch.float32, 'weight': 
torch.float32})\nacc_ops.batch_norm: ((), {'input': torch.float32, 'running_mean': torch.float32, 'running_var': torch.float32, 'weight': torch.float32, 'bias': torch.float32})\nacc_ops.relu: ((), {'input': torch.float32})\nacc_ops.max_pool2d: ((), {'input': torch.float32})\nacc_ops.add: ((), {'input': torch.float32, 'other': torch.float32})\nacc_ops.adaptive_avg_pool2d: ((), {'input': torch.float32})\nacc_ops.flatten: ((), {'input': torch.float32})\nacc_ops.linear: ((), {'input': torch.float32, 'weight': torch.float32, 'bias': torch.float32})\n\nUnsupported node types in the model:\n\nGot 1 acc subgraphs and 0 non-acc subgraphs\n" - ] - }, - { - "output_type": "stream", - "name": "stderr", - "text": [ - "I0822 103503.964 fx2trt.py:204] Run Module elapsed time: 0:00:00.435984\n" - ] - }, - { - "output_type": "stream", - "name": "stderr", - "text": [ - "I0822 103520.647 fx2trt.py:258] Build TRT engine elapsed time: 0:00:16.681226\n" - ] - }, - { - "output_type": "stream", - "name": "stderr", - "text": [ - "I0822 103520.658 lower_pass_manager_builder.py:168] Lowering submodule _run_on_acc_0 elapsed time 0:00:19.147071\n" - ] - }, - { - "output_type": "stream", - "name": "stdout", - "text": [ - "== Start benchmark iterations\n" - ] - }, - { - "output_type": "stream", - "name": "stdout", - "text": [ - "== End benchmark iterations\n=== Running benchmark for: Configuration(batch_iter=50, batch_size=128, name='TRT FP16 Eager', trt=True, jit=False, fp16=True, accuracy_rtol=0.01) green\n" - ] - }, - { - "output_type": "stream", - "name": "stderr", - "text": [ - "I0822 103523.067 pass_utils.py:166] == Log pass before/after graph to /tmp/tmpgphlicna\n" - ] - }, - { - "output_type": "stream", - "name": "stderr", - "text": [ - "I0822 103523.106 pass_utils.py:166] == Log pass before/after graph to /tmp/tmpy9cumddi\n" - ] - }, - { - "output_type": "stream", - "name": "stderr", - "text": [ - "I0822 103523.173 lower_pass_manager_builder.py:151] Now lowering submodule _run_on_acc_0\n" - ] - }, - { - "output_type": "stream", - "name": "stderr", - "text": [ - "I0822 103523.174 lower.py:89] split_name='_run_on_acc_0' self.lower_setting.input_specs=[InputTensorSpec(shape=torch.Size([128, 3, 224, 224]), dtype=torch.float16, device=device(type='cuda', index=0), shape_ranges=[], has_batch_dim=True)]\n" - ] - }, - { - "output_type": "stream", - "name": "stdout", - "text": [ - "\nSupported node types in the model:\nacc_ops.conv2d: ((), {'input': torch.float16, 'weight': torch.float16})\nacc_ops.batch_norm: ((), {'input': torch.float16, 'running_mean': torch.float16, 'running_var': torch.float16, 'weight': torch.float16, 'bias': torch.float16})\nacc_ops.relu: ((), {'input': torch.float16})\nacc_ops.max_pool2d: ((), {'input': torch.float16})\nacc_ops.add: ((), {'input': torch.float16, 'other': torch.float16})\nacc_ops.adaptive_avg_pool2d: ((), {'input': torch.float16})\nacc_ops.flatten: ((), {'input': torch.float16})\nacc_ops.linear: ((), {'input': torch.float16, 'weight': torch.float16, 'bias': torch.float16})\n\nUnsupported node types in the model:\n\nGot 1 acc subgraphs and 0 non-acc subgraphs\n" - ] - }, - { - "output_type": "stream", - "name": "stderr", - "text": [ - "I0822 103523.466 fx2trt.py:204] Run Module elapsed time: 0:00:00.288043\n" - ] - }, - { - "output_type": "stream", - "name": "stderr", - "text": [ - "I0822 103553.687 fx2trt.py:258] Build TRT engine elapsed time: 0:00:30.220316\n" - ] - }, - { - "output_type": "stream", - "name": "stderr", - "text": [ - "I0822 103553.698 
lower_pass_manager_builder.py:168] Lowering submodule _run_on_acc_0 elapsed time 0:00:30.523791\n" - ] - }, - { - "output_type": "stream", - "name": "stdout", - "text": [ - "== Start benchmark iterations\n" - ] - }, - { - "output_type": "stream", - "name": "stdout", - "text": [ - "== End benchmark iterations\n== Benchmark Result for: Configuration(batch_iter=50, batch_size=128, name='CUDA Eager', trt=False, jit=False, fp16=False, accuracy_rtol=-1)\nBS: 128, Time per iter: 14.66ms, QPS: 8732.53, Accuracy: None (rtol=-1)\n== Benchmark Result for: Configuration(batch_iter=50, batch_size=128, name='TRT FP32 Eager', trt=True, jit=False, fp16=False, accuracy_rtol=0.001)\nBS: 128, Time per iter: 7.27ms, QPS: 17595.70, Accuracy: None (rtol=0.001)\n== Benchmark Result for: Configuration(batch_iter=50, batch_size=128, name='TRT FP16 Eager', trt=True, jit=False, fp16=True, accuracy_rtol=0.01)\nBS: 128, Time per iter: 4.49ms, QPS: 28480.34, Accuracy: None (rtol=0.01)\n" - ] - } - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "\nSupported node types in the model:\nacc_ops.conv2d: ((), {'input': torch.float32, 'weight': torch.float32})\nacc_ops.batch_norm: ((), {'input': torch.float32, 'running_mean': torch.float32, 'running_var': torch.float32, 'weight': torch.float32, 'bias': torch.float32})\nacc_ops.relu: ((), {'input': torch.float32})\nacc_ops.max_pool2d: ((), {'input': torch.float32})\nacc_ops.add: ((), {'input': torch.float32, 'other': torch.float32})\nacc_ops.adaptive_avg_pool2d: ((), {'input': torch.float32})\nacc_ops.flatten: ((), {'input': torch.float32})\nacc_ops.linear: ((), {'input': torch.float32, 'weight': torch.float32, 'bias': torch.float32})\n\nUnsupported node types in the model:\n\nGot 1 acc subgraphs and 0 non-acc subgraphs\n" + ] }, { - "cell_type": "markdown", - "metadata": { - "originalKey": "80bbae99-41ff-4baa-94a5-12bf0c9938f3", - "showInput": true, - "customInput": null - }, - "source": [ - "" - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "I0822 103503.964 fx2trt.py:204] Run Module elapsed time: 0:00:00.435984\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "I0822 103520.647 fx2trt.py:258] Build TRT engine elapsed time: 0:00:16.681226\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "I0822 103520.658 lower_pass_manager_builder.py:168] Lowering submodule _run_on_acc_0 elapsed time 0:00:19.147071\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "== Start benchmark iterations\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "== End benchmark iterations\n=== Running benchmark for: Configuration(batch_iter=50, batch_size=128, name='TRT FP16 Eager', trt=True, jit=False, fp16=True, accuracy_rtol=0.01) green\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "I0822 103523.067 pass_utils.py:166] == Log pass before/after graph to /tmp/tmpgphlicna\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "I0822 103523.106 pass_utils.py:166] == Log pass before/after graph to /tmp/tmpy9cumddi\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "I0822 103523.173 lower_pass_manager_builder.py:151] Now lowering submodule _run_on_acc_0\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "I0822 103523.174 lower.py:89] split_name='_run_on_acc_0' self.lower_setting.input_specs=[InputTensorSpec(shape=torch.Size([128, 3, 224, 224]), dtype=torch.float16, 
device=device(type='cuda', index=0), shape_ranges=[], has_batch_dim=True)]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\nSupported node types in the model:\nacc_ops.conv2d: ((), {'input': torch.float16, 'weight': torch.float16})\nacc_ops.batch_norm: ((), {'input': torch.float16, 'running_mean': torch.float16, 'running_var': torch.float16, 'weight': torch.float16, 'bias': torch.float16})\nacc_ops.relu: ((), {'input': torch.float16})\nacc_ops.max_pool2d: ((), {'input': torch.float16})\nacc_ops.add: ((), {'input': torch.float16, 'other': torch.float16})\nacc_ops.adaptive_avg_pool2d: ((), {'input': torch.float16})\nacc_ops.flatten: ((), {'input': torch.float16})\nacc_ops.linear: ((), {'input': torch.float16, 'weight': torch.float16, 'bias': torch.float16})\n\nUnsupported node types in the model:\n\nGot 1 acc subgraphs and 0 non-acc subgraphs\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "I0822 103523.466 fx2trt.py:204] Run Module elapsed time: 0:00:00.288043\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "I0822 103553.687 fx2trt.py:258] Build TRT engine elapsed time: 0:00:30.220316\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "I0822 103553.698 lower_pass_manager_builder.py:168] Lowering submodule _run_on_acc_0 elapsed time 0:00:30.523791\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "== Start benchmark iterations\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "== End benchmark iterations\n== Benchmark Result for: Configuration(batch_iter=50, batch_size=128, name='CUDA Eager', trt=False, jit=False, fp16=False, accuracy_rtol=-1)\nBS: 128, Time per iter: 14.66ms, QPS: 8732.53, Accuracy: None (rtol=-1)\n== Benchmark Result for: Configuration(batch_iter=50, batch_size=128, name='TRT FP32 Eager', trt=True, jit=False, fp16=False, accuracy_rtol=0.001)\nBS: 128, Time per iter: 7.27ms, QPS: 17595.70, Accuracy: None (rtol=0.001)\n== Benchmark Result for: Configuration(batch_iter=50, batch_size=128, name='TRT FP16 Eager', trt=True, jit=False, fp16=True, accuracy_rtol=0.01)\nBS: 128, Time per iter: 4.49ms, QPS: 28480.34, Accuracy: None (rtol=0.01)\n" + ] } - ] + ], + "source": [ + "def benchmark_torch_function(iters: int, f, *args) -> float:\n", + " \"\"\"Estimates the average time duration for a single inference call in second\n", + "\n", + " If the input is batched, then the estimation is for the batches inference call.\n", + " \"\"\"\n", + " with torch.inference_mode():\n", + " f(*args)\n", + " torch.cuda.synchronize()\n", + " start_event = torch.cuda.Event(enable_timing=True)\n", + " end_event = torch.cuda.Event(enable_timing=True)\n", + " print(\"== Start benchmark iterations\")\n", + " with torch.inference_mode():\n", + " start_event.record()\n", + " for _ in range(iters):\n", + " f(*args)\n", + " end_event.record()\n", + " torch.cuda.synchronize()\n", + " print(\"== End benchmark iterations\")\n", + " return (start_event.elapsed_time(end_event) * 1.0e-3) / iters\n", + "\n", + "\n", + "def run_configuration_benchmark(\n", + " module,\n", + " input,\n", + " conf: Configuration,\n", + ") -> Result:\n", + " print(f\"=== Running benchmark for: {conf}\", \"green\")\n", + " time = -1.0\n", + "\n", + " if conf.fp16:\n", + " module = module.half()\n", + " input = [i.half() for i in input]\n", + "\n", + " if not conf.trt:\n", + " # Run eager mode benchmark\n", + " time = benchmark_torch_function(conf.batch_iter, lambda: 
module(*input))\n", + " elif not conf.jit:\n", + " # Run lowering eager mode benchmark\n", + " lowered_module = compile(\n", + " module,\n", + " input,\n", + " max_batch_size=conf.batch_size,\n", + " lower_precision=LowerPrecision.FP16 if conf.fp16 else LowerPrecision.FP32,\n", + " )\n", + " time = benchmark_torch_function(conf.batch_iter, lambda: lowered_module(*input))\n", + " else:\n", + " print(\"Lowering with JIT is not available!\", \"red\")\n", + "\n", + " result = Result(module=module, input=input, conf=conf, time_sec=time)\n", + " return result\n", + "\n", + "\n", + "@torch.inference_mode()\n", + "def benchmark(\n", + " model,\n", + " inputs,\n", + " batch_iter: int,\n", + " batch_size: int,\n", + ") -> None:\n", + " model = model.cuda().eval()\n", + " inputs = [x.cuda() for x in inputs]\n", + "\n", + " # benchmark base configuration\n", + " conf = Configuration(batch_iter=batch_iter, batch_size=batch_size)\n", + "\n", + " configurations = [\n", + " # Baseline\n", + " replace(conf, name=\"CUDA Eager\", trt=False),\n", + " # FP32\n", + " replace(\n", + " conf,\n", + " name=\"TRT FP32 Eager\",\n", + " trt=True,\n", + " jit=False,\n", + " fp16=False,\n", + " accuracy_rtol=1e-3,\n", + " ),\n", + " # FP16\n", + " replace(\n", + " conf,\n", + " name=\"TRT FP16 Eager\",\n", + " trt=True,\n", + " jit=False,\n", + " fp16=True,\n", + " accuracy_rtol=1e-2,\n", + " ),\n", + " ]\n", + "\n", + " results = [run_configuration_benchmark(deepcopy(model), inputs, conf_) for conf_ in configurations]\n", + "\n", + " for res in results:\n", + " print(res.format())\n", + "\n", + "\n", + "test_model = torchvision.models.resnet18(pretrained=True)\n", + "input = [torch.rand(128, 3, 224, 224)]\n", + "benchmark(test_model, input, 50, 128)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "customInput": null, + "originalKey": "80bbae99-41ff-4baa-94a5-12bf0c9938f3", + "showInput": true + }, + "source": [ + "" + ] + } + ], + "metadata": { + "bento_stylesheets": { + "bento/extensions/flow/main.css": true, + "bento/extensions/kernel_selector/main.css": true, + "bento/extensions/kernel_ui/main.css": true, + "bento/extensions/new_kernel/main.css": true, + "bento/extensions/system_usage/main.css": true, + "bento/extensions/theme/main.css": true + }, + "dataExplorerConfig": {}, + "kernelspec": { + "display_name": "dper3_pytorch (cinder)", + "language": "python", + "metadata": { + "cinder_runtime": true, + "fbpkg_supported": true, + "is_prebuilt": true, + "kernel_name": "bento_kernel_dper3_pytorch_cinder", + "nightly_builds": false + }, + "name": "bento_kernel_dper3_pytorch_cinder" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3" + }, + "last_base_url": "https://devgpu005.ftw6.facebook.com:8091/", + "last_kernel_id": "5f014373-151c-4ee8-8939-4daab994d202", + "last_msg_id": "687e81e8-4414f32c89cd026dd1ea3fd9_139", + "last_server_session_id": "24a1a10c-29aa-4e2b-a11f-2b5108fc1e58", + "outputWidgetContext": {} + }, + "nbformat": 4, + "nbformat_minor": 2 } diff --git a/notebooks/lenet-getting-started.ipynb b/notebooks/lenet-getting-started.ipynb index 144d47813b..2cf06d2c05 100644 --- a/notebooks/lenet-getting-started.ipynb +++ b/notebooks/lenet-getting-started.ipynb @@ -193,7 +193,7 @@ "metadata": {}, "outputs": [], "source": [ - "import torch \n", + "import torch\n", "from torch import nn\n", "import torch.nn.functional as F\n", 
"\n", @@ -258,7 +258,7 @@ " input_data = input_data.to(\"cuda\")\n", " if dtype=='fp16':\n", " input_data = input_data.half()\n", - " \n", + "\n", " print(\"Warm up ...\")\n", " with torch.no_grad():\n", " for _ in range(nwarmup):\n", @@ -278,9 +278,8 @@ "\n", " print(\"Input shape:\", input_data.size())\n", " print(\"Output features size:\", features.size())\n", - " \n", - " print('Average batch time: %.2f ms'%(np.mean(timings)*1000))\n", - " " + "\n", + " print('Average batch time: %.2f ms'%(np.mean(timings)*1000))\n" ] }, { @@ -559,7 +558,7 @@ " opt_shape=[1024, 1, 33, 33],\n", " max_shape=[1024, 1, 34, 34],\n", " dtype=torch.half\n", - " )], \n", + " )],\n", " enabled_precisions = {torch.half})\n", "\n", "input_data = torch.randn((1024, 1, 32, 32))\n", diff --git a/notebooks/qat-ptq-workflow.ipynb b/notebooks/qat-ptq-workflow.ipynb index c0e719b3b4..7b6bf6ef89 100644 --- a/notebooks/qat-ptq-workflow.ipynb +++ b/notebooks/qat-ptq-workflow.ipynb @@ -117,20 +117,17 @@ "import pytorch_quantization\n", "from pytorch_quantization import nn as quant_nn\n", "from pytorch_quantization import quant_modules\n", - "from pytorch_quantization.tensor_quant import QuantDescriptor\n", "from pytorch_quantization import calib\n", "from tqdm import tqdm\n", "\n", "print(pytorch_quantization.__version__)\n", "\n", "import os\n", - "import sys\n", "import warnings\n", "import time\n", "import numpy as np\n", "import wget\n", "import tarfile\n", - "import shutil\n", "warnings.simplefilter('ignore')" ] }, @@ -194,9 +191,9 @@ "outputs": [], "source": [ "# Define main data directory\n", - "DATA_DIR = './data/imagenette2-320' \n", + "DATA_DIR = './data/imagenette2-320'\n", "# Define training and validation data paths\n", - "TRAIN_DIR = os.path.join(DATA_DIR, 'train') \n", + "TRAIN_DIR = os.path.join(DATA_DIR, 'train')\n", "VAL_DIR = os.path.join(DATA_DIR, 'val')" ] }, @@ -286,14 +283,14 @@ "metadata": {}, "outputs": [], "source": [ - "#This function allows you to set the all the parameters to not have gradients, \n", - "#allowing you to freeze the model and not undergo training during the train step. \n", + "#This function allows you to set the all the parameters to not have gradients,\n", + "#allowing you to freeze the model and not undergo training during the train step.\n", "def set_parameter_requires_grad(model, feature_extracting):\n", " if feature_extracting:\n", " for param in model.parameters():\n", " param.requires_grad = False\n", "\n", - "feature_extract = True #This varaible can be set False if you want to finetune the model by updating all the parameters. 
\n", + "feature_extract = True #This varaible can be set False if you want to finetune the model by updating all the parameters.\n", "model = models.mobilenet_v2(pretrained=True)\n", "set_parameter_requires_grad(model, feature_extract)\n", "#Define a classification head for 10 classes.\n", @@ -338,7 +335,7 @@ " if batch % 100 == 99:\n", " print(\"Batch: [%5d | %5d] loss: %.3f\" % (batch + 1, len(dataloader), running_loss / 100))\n", " running_loss = 0.0\n", - " \n", + "\n", "def evaluate(model, dataloader, crit, epoch):\n", " total = 0\n", " correct = 0\n", @@ -365,7 +362,7 @@ "def save_checkpoint(state, ckpt_path=\"checkpoint.pth\"):\n", " torch.save(state, ckpt_path)\n", " print(\"Checkpoint saved\")\n", - " \n", + "\n", "cudnn.benchmark = True\n", "# Helper function to benchmark the model\n", "def benchmark(model, input_shape=(1024, 1, 32, 32), dtype='fp32', nwarmup=50, nruns=1000):\n", @@ -373,7 +370,7 @@ " input_data = input_data.to(\"cuda\")\n", " if dtype=='fp16':\n", " input_data = input_data.half()\n", - " \n", + "\n", " with torch.no_grad():\n", " for _ in range(nwarmup):\n", " features = model(input_data)\n", @@ -426,7 +423,7 @@ " test_loss, test_acc = evaluate(model, val_dataloader, criterion, epoch)\n", "\n", " print(\"Test Loss: {:.5f} Test Acc: {:.2f}%\".format(test_loss, 100 * test_acc))\n", - " \n", + "\n", "save_checkpoint({'epoch': epoch + 1,\n", " 'model_state_dict': model.state_dict(),\n", " 'acc': test_acc,\n", @@ -576,7 +573,7 @@ " \"enabled_precisions\": torch.int8,\n", " \"calibrator\": calibrator,\n", " \"truncate_long_and_double\": True\n", - " \n", + "\n", " }\n", "trt_ptq = torch_tensorrt.compile(baseline_model, **compile_spec)" ] @@ -772,7 +769,7 @@ " test_loss, test_acc = evaluate(q_model, val_dataloader, criterion, epoch)\n", "\n", " print(\"Test Loss: {:.5f} Test Acc: {:.2f}%\".format(test_loss, 100 * test_acc))\n", - " \n", + "\n", "save_checkpoint({'epoch': epoch + 1,\n", " 'model_state_dict': q_model.state_dict(),\n", " 'acc': test_acc,\n", diff --git a/notebooks/ssd-object-detection-demo.ipynb b/notebooks/ssd-object-detection-demo.ipynb index b7ae8dc2e8..f48fb2bccd 100644 --- a/notebooks/ssd-object-detection-demo.ipynb +++ b/notebooks/ssd-object-detection-demo.ipynb @@ -403,7 +403,7 @@ "tensor = utils.prepare_tensor(inputs, False)\n", "\n", "# The model was trained on COCO dataset, which we need to access in order to\n", - "# translate class IDs into object names. \n", + "# translate class IDs into object names.\n", "classes_to_labels = utils.get_coco_object_dictionary()" ] }, @@ -417,8 +417,8 @@ "model = ssd300.eval().to(\"cuda\")\n", "detections_batch = model(tensor)\n", "\n", - "# By default, raw output from SSD network per input image contains 8732 boxes with \n", - "# localization and class probability distribution. 
\n", + "# By default, raw output from SSD network per input image contains 8732 boxes with\n", + "# localization and class probability distribution.\n", "# Let’s filter this output to only get reasonable detections (confidence>40%) in a more comprehensive format.\n", "results_per_input = utils.decode_results(detections_batch)\n", "best_results_per_input = [utils.pick_best(results, 0.40) for results in results_per_input]" @@ -530,7 +530,7 @@ " input_data = input_data.to(\"cuda\")\n", " if dtype=='fp16':\n", " input_data = input_data.half()\n", - " \n", + "\n", " print(\"Warm up ...\")\n", " with torch.no_grad():\n", " for _ in range(nwarmup):\n", @@ -718,7 +718,7 @@ "\n", "# The compiled module will have precision as specified by \"op_precision\".\n", "# Here, it will have FP16 precision.\n", - "trt_model = torch_tensorrt.compile(traced_model, \n", + "trt_model = torch_tensorrt.compile(traced_model,\n", " inputs= [torch_tensorrt.Input((3, 3, 300, 300), dtype=torch.half)],\n", " enabled_precisions= {torch.half}, # Run with FP16\n", " workspace_size= 1 << 20\n", @@ -750,8 +750,8 @@ "# using a Torch-TensorRT module is exactly the same as how we usually do inference in PyTorch i.e. model(inputs)\n", "detections_batch = trt_model(tensor.to(torch.half)) # convert the input to half precision\n", "\n", - "# By default, raw output from SSD network per input image contains 8732 boxes with \n", - "# localization and class probability distribution. \n", + "# By default, raw output from SSD network per input image contains 8732 boxes with\n", + "# localization and class probability distribution.\n", "# Let’s filter this output to only get reasonable detections (confidence>40%) in a more comprehensive format.\n", "results_per_input = utils.decode_results(detections_batch)\n", "best_results_per_input_trt = [utils.pick_best(results, 0.40) for results in results_per_input]" diff --git a/notebooks/vgg-qat.ipynb b/notebooks/vgg-qat.ipynb index 5888950378..1232b03393 100644 --- a/notebooks/vgg-qat.ipynb +++ b/notebooks/vgg-qat.ipynb @@ -97,12 +97,10 @@ "import torchvision.datasets as datasets\n", "import torch_tensorrt\n", "\n", - "from torch.utils.tensorboard import SummaryWriter\n", "\n", "import pytorch_quantization\n", "from pytorch_quantization import nn as quant_nn\n", "from pytorch_quantization import quant_modules\n", - "from pytorch_quantization.tensor_quant import QuantDescriptor\n", "from pytorch_quantization import calib\n", "from tqdm import tqdm\n", "\n", @@ -209,7 +207,7 @@ " if batch % 500 == 499:\n", " print(\"Batch: [%5d | %5d] loss: %.3f\" % (batch + 1, len(dataloader), running_loss / 100))\n", " running_loss = 0.0\n", - " \n", + "\n", "def test(model, dataloader, crit, epoch):\n", " global writer\n", " global classes\n", @@ -440,7 +438,7 @@ " test_loss, test_acc = test(model, testing_dataloader, crit, epoch)\n", "\n", " print(\"Test Loss: {:.5f} Test Acc: {:.2f}%\".format(test_loss, 100 * test_acc))\n", - " \n", + "\n", "save_checkpoint({'epoch': epoch + 1,\n", " 'model_state_dict': model.state_dict(),\n", " 'acc': test_acc,\n", @@ -831,7 +829,7 @@ " test_loss, test_acc = test(qat_model, testing_dataloader, crit, epoch)\n", "\n", " print(\"Test Loss: {:.5f} Test Acc: {:.2f}%\".format(test_loss, 100 * test_acc))\n", - " \n", + "\n", "save_checkpoint({'epoch': epoch + 1,\n", " 'model_state_dict': qat_model.state_dict(),\n", " 'acc': test_acc,\n", @@ -1097,7 +1095,7 @@ " input_data = input_data.to(\"cuda\")\n", " if dtype=='fp16':\n", " input_data = input_data.half()\n", - " \n", + "\n", " 
print(\"Warm up ...\")\n", " with torch.no_grad():\n", " for _ in range(nwarmup):\n", diff --git a/py/torch_tensorrt/_Device.py b/py/torch_tensorrt/_Device.py index e92085d3a3..33941d1e7b 100644 --- a/py/torch_tensorrt/_Device.py +++ b/py/torch_tensorrt/_Device.py @@ -9,12 +9,11 @@ else: from typing_extensions import Self +import tensorrt as trt import torch from torch_tensorrt._enums import DeviceType from torch_tensorrt._features import needs_torch_tensorrt_runtime -import tensorrt as trt - class Device(object): """ diff --git a/py/torch_tensorrt/_compile.py b/py/torch_tensorrt/_compile.py index 302928a784..e9c5c3d622 100644 --- a/py/torch_tensorrt/_compile.py +++ b/py/torch_tensorrt/_compile.py @@ -653,7 +653,6 @@ def save( ) torch.export.save(module, file_path) elif module_type == _ModuleType.fx: - # The module type is torch.fx.GraphModule if output_format == "torchscript": module_ts = torch.jit.trace( @@ -671,7 +670,6 @@ def save( exp_program = export(module) torch.export.save(exp_program, file_path) else: - if arg_inputs is None: raise ValueError( "Provided model is a torch.fx.GraphModule and retrace is True, however the inputs or arg_inputs are empty. Please provide valid torch.tensors as inputs or arg_inputs to trace and save the model" diff --git a/py/torch_tensorrt/_features.py b/py/torch_tensorrt/_features.py index 5e95bacee0..8da7ac6fff 100644 --- a/py/torch_tensorrt/_features.py +++ b/py/torch_tensorrt/_features.py @@ -44,9 +44,7 @@ def _enabled_features_str() -> str: enabled = lambda x: "ENABLED" if x else "DISABLED" - out_str: str = ( - f"Enabled Features:\n - Dynamo Frontend: {enabled(_DYNAMO_FE_AVAIL)}\n - Torch-TensorRT Runtime: {enabled(_TORCHTRT_RT_AVAIL)}\n - FX Frontend: {enabled(_FX_FE_AVAIL)}\n - TorchScript Frontend: {enabled(_TS_FE_AVAIL)}\n" # type: ignore[no-untyped-call] - ) + out_str: str = f"Enabled Features:\n - Dynamo Frontend: {enabled(_DYNAMO_FE_AVAIL)}\n - Torch-TensorRT Runtime: {enabled(_TORCHTRT_RT_AVAIL)}\n - FX Frontend: {enabled(_FX_FE_AVAIL)}\n - TorchScript Frontend: {enabled(_TS_FE_AVAIL)}\n" # type: ignore[no-untyped-call] return out_str diff --git a/py/torch_tensorrt/dynamo/_engine_cache.py b/py/torch_tensorrt/dynamo/_engine_cache.py index 7835c419d0..83f75dc4e9 100644 --- a/py/torch_tensorrt/dynamo/_engine_cache.py +++ b/py/torch_tensorrt/dynamo/_engine_cache.py @@ -29,7 +29,6 @@ class BaseEngineCache(ABC): - @abstractmethod def __init__( self, @@ -224,7 +223,6 @@ def __init__( engine_cache_dir: str, engine_cache_size: int, ) -> None: - def get_dir_size(path: str) -> int: total = 0 with os.scandir(path) as it: diff --git a/py/torch_tensorrt/dynamo/_exporter.py b/py/torch_tensorrt/dynamo/_exporter.py index c7a063d675..f2d4cfee88 100644 --- a/py/torch_tensorrt/dynamo/_exporter.py +++ b/py/torch_tensorrt/dynamo/_exporter.py @@ -112,7 +112,6 @@ def lift( non_user_input_idx = 0 for node in gm.graph.nodes: if node.op == "get_attr": - lift_val = None input_kind = None diff --git a/py/torch_tensorrt/dynamo/_refit.py b/py/torch_tensorrt/dynamo/_refit.py index f1041682f8..64c2382582 100644 --- a/py/torch_tensorrt/dynamo/_refit.py +++ b/py/torch_tensorrt/dynamo/_refit.py @@ -251,7 +251,6 @@ def refit_module_weights( # Get the settings and check the setting to be uniform settings: Optional[CompilationSettings] = None if inline_module: - # Obtain the settings compiled_submodules = [ (name.replace("_engine", ""), engine) @@ -362,7 +361,6 @@ def refit_module_weights( # Generate the corresponding TRT Module for those for name, new_submodule in 
new_partitioned_module.named_children(): - # Refit each submodule # Extract engine from the submodule try: diff --git a/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py b/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py index d7c0ea449e..83fcfbff36 100644 --- a/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py +++ b/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py @@ -197,7 +197,6 @@ def _populate_trt_builder_config( algorithm_selector: Optional[trt.IAlgorithmSelector] = None, tactic_sources: Optional[int] = None, ) -> trt.IBuilderConfig: - builder_config = self.builder.create_builder_config() if self.compilation_settings.debug: diff --git a/py/torch_tensorrt/dynamo/conversion/_conversion.py b/py/torch_tensorrt/dynamo/conversion/_conversion.py index 6dad862892..1dad18989c 100644 --- a/py/torch_tensorrt/dynamo/conversion/_conversion.py +++ b/py/torch_tensorrt/dynamo/conversion/_conversion.py @@ -94,7 +94,6 @@ def convert_module( rt_cls = PythonTorchTensorRTModule if ENABLED_FEATURES.torch_tensorrt_runtime and not settings.use_python_runtime: - from torch_tensorrt.dynamo.runtime import TorchTensorRTModule rt_cls = TorchTensorRTModule @@ -102,7 +101,6 @@ def convert_module( elif ( not ENABLED_FEATURES.torch_tensorrt_runtime and not settings.use_python_runtime ): - logger.info( "Since Torch-TensorRT runtime is not available, using Python Runtime, some features may not be available" ) diff --git a/py/torch_tensorrt/dynamo/conversion/converter_utils.py b/py/torch_tensorrt/dynamo/conversion/converter_utils.py index f4bb4877cc..62526080c4 100644 --- a/py/torch_tensorrt/dynamo/conversion/converter_utils.py +++ b/py/torch_tensorrt/dynamo/conversion/converter_utils.py @@ -6,6 +6,7 @@ from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union, overload import numpy as np +import tensorrt as trt import torch import torch_tensorrt.dynamo.conversion.impl as impl from torch.fx.node import Argument, Target @@ -19,8 +20,6 @@ DynamoConverterImplSignature, ) -import tensorrt as trt - from ..types import Shape, TRTDataType, TRTLayer, TRTTensor _LOGGER: logging.Logger = logging.getLogger(__name__) diff --git a/py/torch_tensorrt/dynamo/conversion/impl/arange.py b/py/torch_tensorrt/dynamo/conversion/impl/arange.py index 72eda19733..5b24c641b6 100644 --- a/py/torch_tensorrt/dynamo/conversion/impl/arange.py +++ b/py/torch_tensorrt/dynamo/conversion/impl/arange.py @@ -22,7 +22,6 @@ def arange( end: Union[int, TRTTensor], step: Union[int, TRTTensor], ) -> TRTTensor: - if any(isinstance(tensor, TRTTensor) for tensor in (start, end, step)): start_rank_0 = get_trt_tensor(ctx, start, name + "_start_rank_0", min_rank=0) start_rank_1 = get_trt_tensor(ctx, start, name + "_start_rank_1", min_rank=1) diff --git a/py/torch_tensorrt/dynamo/conversion/impl/elementwise/ops.py b/py/torch_tensorrt/dynamo/conversion/impl/elementwise/ops.py index ec3cfcf28c..17e5042ce7 100644 --- a/py/torch_tensorrt/dynamo/conversion/impl/elementwise/ops.py +++ b/py/torch_tensorrt/dynamo/conversion/impl/elementwise/ops.py @@ -457,7 +457,6 @@ def add( lhs_val: Union[TRTTensor, int, float], rhs_val: Union[TRTTensor, int, float], ) -> TRTTensor: - return convert_binary_elementwise( ctx, target, source_ir, name, trt.ElementWiseOperation.SUM, lhs_val, rhs_val ) diff --git a/py/torch_tensorrt/dynamo/conversion/impl/matmul.py b/py/torch_tensorrt/dynamo/conversion/impl/matmul.py index 2480d15df2..83ea3dd99b 100644 --- a/py/torch_tensorrt/dynamo/conversion/impl/matmul.py +++ 
b/py/torch_tensorrt/dynamo/conversion/impl/matmul.py @@ -25,7 +25,6 @@ def matrix_multiply( input_matrix_op: trt.MatrixOperation = trt.MatrixOperation.NONE, other_matrix_op: trt.MatrixOperation = trt.MatrixOperation.NONE, ) -> TRTTensor: - if not isinstance(input, trt.ITensor): input = get_trt_tensor(ctx, input, f"{name}_input") if not isinstance(other, trt.ITensor): diff --git a/py/torch_tensorrt/dynamo/conversion/impl/normalization/ops.py b/py/torch_tensorrt/dynamo/conversion/impl/normalization/ops.py index a46a9319c4..7e5b03a87e 100644 --- a/py/torch_tensorrt/dynamo/conversion/impl/normalization/ops.py +++ b/py/torch_tensorrt/dynamo/conversion/impl/normalization/ops.py @@ -42,7 +42,6 @@ def batch_norm( cudnn_enabled: bool, return_mean_rstd: bool, ) -> Union[TRTTensor, Tuple[TRTTensor, torch.Tensor, torch.Tensor]]: - if has_dynamic_shape(input.shape): assert input.shape[1] != -1, "Channel dim can't be dynamic for batch norm." diff --git a/py/torch_tensorrt/dynamo/conversion/impl/pad.py b/py/torch_tensorrt/dynamo/conversion/impl/pad.py index 8cc6bd42c8..731058a122 100644 --- a/py/torch_tensorrt/dynamo/conversion/impl/pad.py +++ b/py/torch_tensorrt/dynamo/conversion/impl/pad.py @@ -112,7 +112,6 @@ def constant_padNd( pad: Sequence[Union[int, TRTTensor]], value: Union[int, float] = 0, ) -> TRTTensor: - rank = len(input.shape) start_indices_tensor, padded_shape_tensor = get_padded_shape_tensors( diff --git a/py/torch_tensorrt/dynamo/conversion/impl/slice/ops.py b/py/torch_tensorrt/dynamo/conversion/impl/slice/ops.py index 3274d78c2b..990b01eb70 100644 --- a/py/torch_tensorrt/dynamo/conversion/impl/slice/ops.py +++ b/py/torch_tensorrt/dynamo/conversion/impl/slice/ops.py @@ -332,7 +332,6 @@ def cumsum( input: TRTTensor, dim: int, ) -> TRTTensor: - input_shape = input.shape dim = get_positive_dim(dim, len(input_shape)) if input_shape[dim] < 0: diff --git a/py/torch_tensorrt/dynamo/conversion/impl/unary/ops.py b/py/torch_tensorrt/dynamo/conversion/impl/unary/ops.py index c900c51b8f..34b667acf1 100644 --- a/py/torch_tensorrt/dynamo/conversion/impl/unary/ops.py +++ b/py/torch_tensorrt/dynamo/conversion/impl/unary/ops.py @@ -478,7 +478,6 @@ def sign( name: str, input_val: TRTTensor, ) -> TRTTensor: - return convert_unary( ctx, target, source_ir, name, trt.UnaryOperation.SIGN, input_val ) diff --git a/py/torch_tensorrt/dynamo/runtime/_MutableTorchTensorRTModule.py b/py/torch_tensorrt/dynamo/runtime/_MutableTorchTensorRTModule.py index 134d84cf6d..a0e570e992 100644 --- a/py/torch_tensorrt/dynamo/runtime/_MutableTorchTensorRTModule.py +++ b/py/torch_tensorrt/dynamo/runtime/_MutableTorchTensorRTModule.py @@ -204,7 +204,6 @@ def __init__( self.init_finished = True def store_state_dict_metadata(self) -> None: - for k, v in self.original_model.state_dict().items(): self.state_dict_metadata[k] = v.shape @@ -400,7 +399,6 @@ def __call__(self, *args: Any, **kwargs: Any) -> Any: return self.forward(*args, **kwargs) def __getattr__(self, name: str) -> Any: - if name in self.__dict__: # this object has it return getattr(self, name) @@ -413,7 +411,6 @@ def __getattr__(self, name: str) -> Any: return getattr(self.pytorch_model, name) def __delattr__(self, name: str) -> Any: - if name in self.__dict__: # this object has it super().__delattr__(name) diff --git a/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py b/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py index f72d510a17..9086de657f 100644 --- a/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py +++ 
b/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py @@ -466,7 +466,6 @@ def forward(self, *inputs: torch.Tensor) -> torch.Tensor | Tuple[torch.Tensor, . outputs = self.create_output_tensors() for o, output_name in enumerate(self.output_names): - if need_cudagraphs_record: self._output_buffers[o] = outputs[o].clone() @@ -496,7 +495,6 @@ def forward(self, *inputs: torch.Tensor) -> torch.Tensor | Tuple[torch.Tensor, . self._engine_stream.wait_stream(self._caller_stream) with torch.cuda.stream(self._engine_stream): - if cudagraphs_enabled: if need_cudagraphs_record: self.cudagraph = torch.cuda.CUDAGraph() diff --git a/py/torch_tensorrt/dynamo/utils.py b/py/torch_tensorrt/dynamo/utils.py index 5d6807f33a..467811ef28 100644 --- a/py/torch_tensorrt/dynamo/utils.py +++ b/py/torch_tensorrt/dynamo/utils.py @@ -633,7 +633,6 @@ def check_output_equal( rtol: float = RTOL, atol: float = ATOL, ) -> bool: - if type(output1) != type(output2): logger.warning( "The output types are different. Check_output_equal will always return false." diff --git a/py/torch_tensorrt/logging.py b/py/torch_tensorrt/logging.py index 8447169cc2..0cba3bd510 100644 --- a/py/torch_tensorrt/logging.py +++ b/py/torch_tensorrt/logging.py @@ -1,17 +1,15 @@ import logging from typing import Any +import tensorrt as trt import torch from torch_tensorrt._features import ENABLED_FEATURES -import tensorrt as trt - logging.captureWarnings(True) _LOGGER = logging.getLogger("torch_tensorrt [TensorRT Conversion Context]") class _TRTLogger(trt.ILogger): # type: ignore[misc] - def __init__(self) -> None: trt.ILogger.__init__(self) diff --git a/py/torch_tensorrt/runtime/_utils.py b/py/torch_tensorrt/runtime/_utils.py index 90da7f69ad..c42a2b2a2b 100644 --- a/py/torch_tensorrt/runtime/_utils.py +++ b/py/torch_tensorrt/runtime/_utils.py @@ -145,7 +145,6 @@ def no_op_placeholder_for_execute_engine( serialized_metadata: str, serialized_target_platform: str, ) -> List[torch.Tensor]: - raise RuntimeError( "The saved model is cross compiled for windows in Linux, should only be loadded in Windows via torch_tensorrt.load_cross_compiled_exported_program() api." 
) diff --git a/setup.py b/setup.py index e426123e8b..17b3d33c75 100644 --- a/setup.py +++ b/setup.py @@ -285,7 +285,6 @@ def finalize_options(self): self.root_is_pure = False def run(self): - if not PY_ONLY: global CXX11_ABI build_libtorchtrt_pre_cxx11_abi( @@ -309,7 +308,6 @@ def finalize_options(self): self.root_is_pure = False def run(self): - if not PY_ONLY: global CXX11_ABI build_libtorchtrt_pre_cxx11_abi( diff --git a/tests/core/conversion/converters/test_instance_norm.cpp b/tests/core/conversion/converters/test_instance_norm.cpp index 2986d73cca..8f9904ef84 100644 --- a/tests/core/conversion/converters/test_instance_norm.cpp +++ b/tests/core/conversion/converters/test_instance_norm.cpp @@ -18,7 +18,7 @@ constexpr auto graph = R"IR( %running_mean.1 : Tensor?, %running_var.1 : Tensor?, %use_input_stats.1 : bool): - %cudnn_enabled.1 : bool = prim::Constant[value=0]() + %cudnn_enabled.1 : bool = prim::Constant[value=1]() %momentum.1 : float = prim::Constant[value=0.10000000000000001]() %eps.1 : float = prim::Constant[value=1.0000000000000001e-05]() %4 : Tensor = aten::instance_norm(%input.1, diff --git a/tests/py/dynamo/conversion/harness.py b/tests/py/dynamo/conversion/harness.py index 26818acd8a..9813548a10 100644 --- a/tests/py/dynamo/conversion/harness.py +++ b/tests/py/dynamo/conversion/harness.py @@ -500,7 +500,6 @@ def run_test_compare_tensor_attributes_only( enable_passes=False, immutable_weights=True, ): - # Previous instance of the interpreter auto-casted 64-bit inputs # We replicate this behavior here compilation_settings = CompilationSettings( diff --git a/tests/py/dynamo/conversion/test_resize_aten.py b/tests/py/dynamo/conversion/test_resize_aten.py index 8318035d86..2ca878479b 100644 --- a/tests/py/dynamo/conversion/test_resize_aten.py +++ b/tests/py/dynamo/conversion/test_resize_aten.py @@ -6,7 +6,6 @@ class TestResizeConverter(DispatchTestCase): - def compare_resized_tensors(self, tensor1, tensor2, input_shape, target_shape): # Check if the sizes match if tensor1.size() != tensor2.size(): diff --git a/tests/py/dynamo/conversion/test_sym_not_aten.py b/tests/py/dynamo/conversion/test_sym_not_aten.py index 3ba0889f9b..f6c1e4fa4f 100644 --- a/tests/py/dynamo/conversion/test_sym_not_aten.py +++ b/tests/py/dynamo/conversion/test_sym_not_aten.py @@ -7,7 +7,6 @@ class TestSymNotConverter(DispatchTestCase): - @parameterized.expand( [ (torch.tensor(True),), diff --git a/tests/py/dynamo/lowering/test_decompositions.py b/tests/py/dynamo/lowering/test_decompositions.py index 797d8d3263..95320b9996 100644 --- a/tests/py/dynamo/lowering/test_decompositions.py +++ b/tests/py/dynamo/lowering/test_decompositions.py @@ -1533,7 +1533,6 @@ def __init__(self): super().__init__() def forward(self, input): - return torch.ops.aten.scatter_reduce_.two( input, dim, index, src, reduce=reduce_op_str ) diff --git a/tests/py/dynamo/models/test_dtype_support.py b/tests/py/dynamo/models/test_dtype_support.py index b486784e52..146f7fdb7d 100644 --- a/tests/py/dynamo/models/test_dtype_support.py +++ b/tests/py/dynamo/models/test_dtype_support.py @@ -13,7 +13,6 @@ class Test64BitSupport(TestCase): - @unittest.skipIf( not torch_tensorrt.ENABLED_FEATURES.torch_tensorrt_runtime, "Torch-TensorRT Runtime is not available", diff --git a/tests/py/dynamo/models/test_engine_cache.py b/tests/py/dynamo/models/test_engine_cache.py index 68451674c5..36bf5edc95 100644 --- a/tests/py/dynamo/models/test_engine_cache.py +++ b/tests/py/dynamo/models/test_engine_cache.py @@ -57,7 +57,6 @@ def load(self, hash: str, prefix: 
str = "blob") -> Optional[bytes]: class TestHashFunction(TestCase): - def test_reexport_is_equal(self): pyt_model = models.resnet18(pretrained=True).eval().to("cuda") example_inputs = (torch.randn((100, 3, 224, 224)).to("cuda"),) @@ -177,7 +176,6 @@ def test_engine_settings_is_not_equal(self): class TestEngineCache(TestCase): - @pytest.mark.xfail def test_dynamo_compile_with_default_disk_engine_cache(self): model = models.resnet18(pretrained=True).eval().to("cuda") diff --git a/tests/py/dynamo/models/test_export_kwargs_serde.py b/tests/py/dynamo/models/test_export_kwargs_serde.py index aa4ea14cea..928d62e7ba 100644 --- a/tests/py/dynamo/models/test_export_kwargs_serde.py +++ b/tests/py/dynamo/models/test_export_kwargs_serde.py @@ -393,7 +393,6 @@ def forward(self, x, b=None, c=None, d=None, e=[]): @pytest.mark.unit def test_custom_model_with_dynamo_trace_kwarg_list_dynamic(): - class net(nn.Module): def __init__(self): super().__init__() diff --git a/tests/py/dynamo/models/test_model_refit.py b/tests/py/dynamo/models/test_model_refit.py index bb61ac2d43..a0b3292c29 100644 --- a/tests/py/dynamo/models/test_model_refit.py +++ b/tests/py/dynamo/models/test_model_refit.py @@ -32,7 +32,6 @@ ) @pytest.mark.unit def test_mapping(): - model = models.resnet18(pretrained=False).eval().to("cuda") model2 = models.resnet18(pretrained=True).eval().to("cuda") inputs = [torch.randn((1, 3, 224, 224)).to("cuda")] @@ -88,7 +87,6 @@ def test_mapping(): ) @pytest.mark.unit def test_refit_one_engine_with_weightmap(): - model = models.resnet18(pretrained=False).eval().to("cuda") model2 = models.resnet18(pretrained=True).eval().to("cuda") inputs = [torch.randn((1, 3, 224, 224)).to("cuda")] @@ -138,7 +136,6 @@ def test_refit_one_engine_with_weightmap(): ) @pytest.mark.unit def test_refit_one_engine_no_map_with_weightmap(): - model = models.resnet18(pretrained=False).eval().to("cuda") model2 = models.resnet18(pretrained=True).eval().to("cuda") inputs = [torch.randn((1, 3, 224, 224)).to("cuda")] @@ -189,7 +186,6 @@ def test_refit_one_engine_no_map_with_weightmap(): ) @pytest.mark.unit def test_refit_one_engine_with_wrong_weightmap(): - model = models.resnet18(pretrained=False).eval().to("cuda") model2 = models.resnet18(pretrained=True).eval().to("cuda") inputs = [torch.randn((1, 3, 224, 224)).to("cuda")] @@ -345,7 +341,6 @@ def test_refit_one_engine_inline_runtime__with_weightmap(): @pytest.mark.unit def test_refit_one_engine_python_runtime_with_weightmap(): - model = models.resnet18(pretrained=False).eval().to("cuda") model2 = models.resnet18(pretrained=True).eval().to("cuda") inputs = [torch.randn((1, 3, 224, 224)).to("cuda")] @@ -394,7 +389,6 @@ def test_refit_one_engine_python_runtime_with_weightmap(): ) @pytest.mark.unit def test_refit_multiple_engine_with_weightmap(): - class net(nn.Module): def __init__(self): super().__init__() @@ -466,7 +460,6 @@ def forward(self, x): ) @pytest.mark.unit def test_refit_one_engine_without_weightmap(): - model = models.resnet18(pretrained=True).eval().to("cuda") model2 = models.resnet18(pretrained=False).eval().to("cuda") inputs = [torch.randn((1, 3, 224, 224)).to("cuda")] @@ -616,7 +609,6 @@ def test_refit_one_engine_inline_runtime_without_weightmap(): @pytest.mark.unit def test_refit_one_engine_python_runtime_without_weightmap(): - model = models.resnet18(pretrained=True).eval().to("cuda") model2 = models.resnet18(pretrained=False).eval().to("cuda") inputs = [torch.randn((1, 3, 224, 224)).to("cuda")] @@ -665,7 +657,6 @@ def 
test_refit_one_engine_python_runtime_without_weightmap(): ) @pytest.mark.unit def test_refit_multiple_engine_without_weightmap(): - class net(nn.Module): def __init__(self): super().__init__() @@ -733,7 +724,6 @@ def forward(self, x): @pytest.mark.unit def test_refit_cumsum_fallback(): - class net(nn.Module): def __init__(self): super().__init__() diff --git a/tests/py/dynamo/runtime/test_001_streams.py b/tests/py/dynamo/runtime/test_001_streams.py index aaec9e3d41..e948107edf 100644 --- a/tests/py/dynamo/runtime/test_001_streams.py +++ b/tests/py/dynamo/runtime/test_001_streams.py @@ -12,7 +12,6 @@ class TestStreams(TestCase): - def test_non_default_stream_exec(self): class SampleModel(torch.nn.Module): def forward(self, x): diff --git a/tests/py/dynamo/runtime/test_002_lazy_engine_init.py b/tests/py/dynamo/runtime/test_002_lazy_engine_init.py index da0dce8f44..6a67bd4ea0 100644 --- a/tests/py/dynamo/runtime/test_002_lazy_engine_init.py +++ b/tests/py/dynamo/runtime/test_002_lazy_engine_init.py @@ -45,7 +45,6 @@ def assert_close(outputs, ref_outputs): class TestLazyEngineInit(TestCase): - def test_lazy_engine_init_py(self): class Test(torch.nn.Module): def forward(self, a, b): diff --git a/tests/py/dynamo/runtime/test_003_cross_compile_for_windows.py b/tests/py/dynamo/runtime/test_003_cross_compile_for_windows.py index acf2aa006f..867bf14bee 100644 --- a/tests/py/dynamo/runtime/test_003_cross_compile_for_windows.py +++ b/tests/py/dynamo/runtime/test_003_cross_compile_for_windows.py @@ -12,7 +12,6 @@ class TestCrossCompileSaveForWindows(TestCase): - @unittest.skipIf( platform.system() != "Linux" or platform.architecture()[0] != "64bit", "Cross compile for windows can only be enabled on linux x86-64 platform", diff --git a/tests/py/dynamo/runtime/test_004_weight_streaming.py b/tests/py/dynamo/runtime/test_004_weight_streaming.py index 10ff950823..78522388d1 100644 --- a/tests/py/dynamo/runtime/test_004_weight_streaming.py +++ b/tests/py/dynamo/runtime/test_004_weight_streaming.py @@ -31,7 +31,6 @@ def forward(self, x): class TestWeightStreamingPython(TestCase): - @parameterized.expand( [ ("python_runtime", True), diff --git a/tests/py/dynamo/runtime/test_mutable_torchtrt_module.py b/tests/py/dynamo/runtime/test_mutable_torchtrt_module.py index f2bcaf7ede..ab1137e2b3 100644 --- a/tests/py/dynamo/runtime/test_mutable_torchtrt_module.py +++ b/tests/py/dynamo/runtime/test_mutable_torchtrt_module.py @@ -42,7 +42,6 @@ def test_check_output_equal(): ) @pytest.mark.unit def test_resnet18(): - torch.manual_seed(0) inputs = [torch.rand((1, 3, 224, 224)).to("cuda")] @@ -79,7 +78,6 @@ def test_resnet18(): ) @pytest.mark.unit def test_save(): - torch.manual_seed(0) inputs = [torch.rand((1, 3, 224, 224)).to("cuda")] @@ -116,7 +114,6 @@ def test_save(): ) @pytest.mark.unit def test_resnet18_modify_attribute(): - torch.manual_seed(0) inputs = [torch.rand((1, 3, 224, 224)).to("cuda")] @@ -157,7 +154,6 @@ def test_resnet18_modify_attribute(): ) @pytest.mark.unit def test_resnet18_modify_attribute_no_refit(): - torch.manual_seed(0) inputs = [torch.rand((1, 3, 224, 224)).to("cuda")] From a57f267c98aa8d57a3bbcfd14eb9466cebe9d68d Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Wed, 29 Jan 2025 16:27:03 -0800 Subject: [PATCH 2/5] chore: address flaky test failures related to global partitioning (#3369) --- .../torch_compile_advanced_usage.py | 3 +- .../torch_export_gpt2.py | 3 +- .../torch_export_cudagraphs.py | 3 +- .../dynamo_compile_resnet_example.py | 3 +- .../torch_export_llama2.py | 3 +- 
.../torch_compile_resnet_example.py | 3 +- .../torch_compile_transformers_example.py | 3 +- .../dynamo_compile_advanced_usage.py | 3 +- .../dynamo_compile_transformers_example.py | 3 +- .../dynamo_compile_resnet_example.py | 3 +- .../dynamo_compile_advanced_usage.py | 3 +- .../dynamo_compile_transformers_example.py | 3 +- .../dynamo/torch_compile_advanced_usage.py | 3 +- .../dynamo/torch_compile_resnet_example.py | 3 +- .../torch_compile_transformers_example.py | 3 +- examples/dynamo/torch_export_cudagraphs.py | 3 +- examples/dynamo/torch_export_gpt2.py | 3 +- examples/dynamo/torch_export_llama2.py | 3 +- py/torch_tensorrt/_Input.py | 2 +- py/torch_tensorrt/_enums.py | 2 +- .../dynamo/conversion/_TRTBuilderMonitor.py | 6 +- .../dynamo/conversion/impl/activation/ops.py | 4 +- py/torch_tensorrt/dynamo/utils.py | 2 +- .../fx/test/converters/acc_op/test_where.py | 2 +- .../fx/tracer/acc_tracer/acc_tracer.py | 5 +- .../test_flaky_global_partitioning.py | 108 ++++++++++++++++++ .../partitioning/test_global_partitioning.py | 83 -------------- 27 files changed, 155 insertions(+), 113 deletions(-) create mode 100644 tests/py/dynamo/partitioning/test_flaky_global_partitioning.py diff --git a/docs/_downloads/0e30a6276601af7e5fc4d5166e2e3d37/torch_compile_advanced_usage.py b/docs/_downloads/0e30a6276601af7e5fc4d5166e2e3d37/torch_compile_advanced_usage.py index 8ebedab111..af7d4b212d 100644 --- a/docs/_downloads/0e30a6276601af7e5fc4d5166e2e3d37/torch_compile_advanced_usage.py +++ b/docs/_downloads/0e30a6276601af7e5fc4d5166e2e3d37/torch_compile_advanced_usage.py @@ -4,7 +4,8 @@ Torch Compile Advanced Usage ====================================================== -This interactive script is intended as an overview of the process by which `torch_tensorrt.compile(..., ir="torch_compile", ...)` works, and how it integrates with the `torch.compile` API.""" +This interactive script is intended as an overview of the process by which `torch_tensorrt.compile(..., ir="torch_compile", ...)` works, and how it integrates with the `torch.compile` API. +""" # %% # Imports and Model Definition diff --git a/docs/_downloads/2a9ac10f2667047a7f398d1593b7ca33/torch_export_gpt2.py b/docs/_downloads/2a9ac10f2667047a7f398d1593b7ca33/torch_export_gpt2.py index cea0f3adf2..4d34c58de4 100644 --- a/docs/_downloads/2a9ac10f2667047a7f398d1593b7ca33/torch_export_gpt2.py +++ b/docs/_downloads/2a9ac10f2667047a7f398d1593b7ca33/torch_export_gpt2.py @@ -4,7 +4,8 @@ Compiling GPT2 using the dynamo backend ========================================================== -This script illustrates Torch-TensorRT workflow with dynamo backend on popular GPT2 model.""" +This script illustrates Torch-TensorRT workflow with dynamo backend on popular GPT2 model. +""" # %% # Imports and Model Definition diff --git a/docs/_downloads/3d4d74f6636d986f33167154f6553961/torch_export_cudagraphs.py b/docs/_downloads/3d4d74f6636d986f33167154f6553961/torch_export_cudagraphs.py index 1671c7783d..fb31766b7c 100644 --- a/docs/_downloads/3d4d74f6636d986f33167154f6553961/torch_export_cudagraphs.py +++ b/docs/_downloads/3d4d74f6636d986f33167154f6553961/torch_export_cudagraphs.py @@ -4,7 +4,8 @@ Torch Export with Cudagraphs ====================================================== -This interactive script is intended as an overview of the process by which the Torch-TensorRT Cudagraphs integration can be used in the `ir="dynamo"` path. 
The functionality works similarly in the `torch.compile` path as well.""" +This interactive script is intended as an overview of the process by which the Torch-TensorRT Cudagraphs integration can be used in the `ir="dynamo"` path. The functionality works similarly in the `torch.compile` path as well. +""" # %% # Imports and Model Definition diff --git a/docs/_downloads/418941399c146271a7b7728ba3059960/dynamo_compile_resnet_example.py b/docs/_downloads/418941399c146271a7b7728ba3059960/dynamo_compile_resnet_example.py index 797e41f5fd..5826e28d1e 100644 --- a/docs/_downloads/418941399c146271a7b7728ba3059960/dynamo_compile_resnet_example.py +++ b/docs/_downloads/418941399c146271a7b7728ba3059960/dynamo_compile_resnet_example.py @@ -4,7 +4,8 @@ Compiling ResNet using the Torch-TensorRT Dyanmo Frontend ========================================================== -This interactive script is intended as a sample of the `torch_tensorrt.dynamo.compile` workflow on a ResNet model.""" +This interactive script is intended as a sample of the `torch_tensorrt.dynamo.compile` workflow on a ResNet model. +""" # %% # Imports and Model Definition diff --git a/docs/_downloads/7b7004dc2ea6f839be532665e16e0426/torch_export_llama2.py b/docs/_downloads/7b7004dc2ea6f839be532665e16e0426/torch_export_llama2.py index 5cfd1ed61c..2f3e3cba43 100644 --- a/docs/_downloads/7b7004dc2ea6f839be532665e16e0426/torch_export_llama2.py +++ b/docs/_downloads/7b7004dc2ea6f839be532665e16e0426/torch_export_llama2.py @@ -4,7 +4,8 @@ Compiling Llama2 using the dynamo backend ========================================================== -This script illustrates Torch-TensorRT workflow with dynamo backend on popular Llama2 model.""" +This script illustrates Torch-TensorRT workflow with dynamo backend on popular Llama2 model. +""" # %% # Imports and Model Definition diff --git a/docs/_downloads/d6e1bb6ec5f884994554d9d12e37a0f6/torch_compile_resnet_example.py b/docs/_downloads/d6e1bb6ec5f884994554d9d12e37a0f6/torch_compile_resnet_example.py index f852d60158..fb75986099 100644 --- a/docs/_downloads/d6e1bb6ec5f884994554d9d12e37a0f6/torch_compile_resnet_example.py +++ b/docs/_downloads/d6e1bb6ec5f884994554d9d12e37a0f6/torch_compile_resnet_example.py @@ -4,7 +4,8 @@ Compiling ResNet with dynamic shapes using the `torch.compile` backend ========================================================== -This interactive script is intended as a sample of the Torch-TensorRT workflow with `torch.compile` on a ResNet model.""" +This interactive script is intended as a sample of the Torch-TensorRT workflow with `torch.compile` on a ResNet model. +""" # %% # Imports and Model Definition diff --git a/docs/_downloads/dfa60e8f9850fd7761f3e7da81304d32/torch_compile_transformers_example.py b/docs/_downloads/dfa60e8f9850fd7761f3e7da81304d32/torch_compile_transformers_example.py index 221ecd4fd1..17cf46e8a3 100644 --- a/docs/_downloads/dfa60e8f9850fd7761f3e7da81304d32/torch_compile_transformers_example.py +++ b/docs/_downloads/dfa60e8f9850fd7761f3e7da81304d32/torch_compile_transformers_example.py @@ -4,7 +4,8 @@ Compiling BERT using the `torch.compile` backend ============================================================== -This interactive script is intended as a sample of the Torch-TensorRT workflow with `torch.compile` on a BERT model.""" +This interactive script is intended as a sample of the Torch-TensorRT workflow with `torch.compile` on a BERT model. 
+""" # %% # Imports and Model Definition diff --git a/docs/_downloads/e1ef5a42560a98a132f56a79d0b66f79/dynamo_compile_advanced_usage.py b/docs/_downloads/e1ef5a42560a98a132f56a79d0b66f79/dynamo_compile_advanced_usage.py index f73bd1e780..3fb63e8a32 100644 --- a/docs/_downloads/e1ef5a42560a98a132f56a79d0b66f79/dynamo_compile_advanced_usage.py +++ b/docs/_downloads/e1ef5a42560a98a132f56a79d0b66f79/dynamo_compile_advanced_usage.py @@ -4,7 +4,8 @@ Dynamo Compile Advanced Usage ====================================================== -This interactive script is intended as an overview of the process by which `torch_tensorrt.dynamo.compile` works, and how it integrates with the new `torch.compile` API.""" +This interactive script is intended as an overview of the process by which `torch_tensorrt.dynamo.compile` works, and how it integrates with the new `torch.compile` API. +""" # %% # Imports and Model Definition diff --git a/docs/_downloads/e550c5f53cc43e11aa6da8cfb79b54df/dynamo_compile_transformers_example.py b/docs/_downloads/e550c5f53cc43e11aa6da8cfb79b54df/dynamo_compile_transformers_example.py index dd7fe2e07a..59319078a4 100644 --- a/docs/_downloads/e550c5f53cc43e11aa6da8cfb79b54df/dynamo_compile_transformers_example.py +++ b/docs/_downloads/e550c5f53cc43e11aa6da8cfb79b54df/dynamo_compile_transformers_example.py @@ -4,7 +4,8 @@ Compiling a Transformer using torch.compile and TensorRT ============================================================== -This interactive script is intended as a sample of the `torch_tensorrt.dynamo.compile` workflow on a transformer-based model.""" +This interactive script is intended as a sample of the `torch_tensorrt.dynamo.compile` workflow on a transformer-based model. +""" # %% # Imports and Model Definition diff --git a/docs/v1.4.0/_downloads/418941399c146271a7b7728ba3059960/dynamo_compile_resnet_example.py b/docs/v1.4.0/_downloads/418941399c146271a7b7728ba3059960/dynamo_compile_resnet_example.py index 797e41f5fd..5826e28d1e 100644 --- a/docs/v1.4.0/_downloads/418941399c146271a7b7728ba3059960/dynamo_compile_resnet_example.py +++ b/docs/v1.4.0/_downloads/418941399c146271a7b7728ba3059960/dynamo_compile_resnet_example.py @@ -4,7 +4,8 @@ Compiling ResNet using the Torch-TensorRT Dyanmo Frontend ========================================================== -This interactive script is intended as a sample of the `torch_tensorrt.dynamo.compile` workflow on a ResNet model.""" +This interactive script is intended as a sample of the `torch_tensorrt.dynamo.compile` workflow on a ResNet model. +""" # %% # Imports and Model Definition diff --git a/docs/v1.4.0/_downloads/e1ef5a42560a98a132f56a79d0b66f79/dynamo_compile_advanced_usage.py b/docs/v1.4.0/_downloads/e1ef5a42560a98a132f56a79d0b66f79/dynamo_compile_advanced_usage.py index f73bd1e780..3fb63e8a32 100644 --- a/docs/v1.4.0/_downloads/e1ef5a42560a98a132f56a79d0b66f79/dynamo_compile_advanced_usage.py +++ b/docs/v1.4.0/_downloads/e1ef5a42560a98a132f56a79d0b66f79/dynamo_compile_advanced_usage.py @@ -4,7 +4,8 @@ Dynamo Compile Advanced Usage ====================================================== -This interactive script is intended as an overview of the process by which `torch_tensorrt.dynamo.compile` works, and how it integrates with the new `torch.compile` API.""" +This interactive script is intended as an overview of the process by which `torch_tensorrt.dynamo.compile` works, and how it integrates with the new `torch.compile` API. 
+""" # %% # Imports and Model Definition diff --git a/docs/v1.4.0/_downloads/e550c5f53cc43e11aa6da8cfb79b54df/dynamo_compile_transformers_example.py b/docs/v1.4.0/_downloads/e550c5f53cc43e11aa6da8cfb79b54df/dynamo_compile_transformers_example.py index dd7fe2e07a..59319078a4 100644 --- a/docs/v1.4.0/_downloads/e550c5f53cc43e11aa6da8cfb79b54df/dynamo_compile_transformers_example.py +++ b/docs/v1.4.0/_downloads/e550c5f53cc43e11aa6da8cfb79b54df/dynamo_compile_transformers_example.py @@ -4,7 +4,8 @@ Compiling a Transformer using torch.compile and TensorRT ============================================================== -This interactive script is intended as a sample of the `torch_tensorrt.dynamo.compile` workflow on a transformer-based model.""" +This interactive script is intended as a sample of the `torch_tensorrt.dynamo.compile` workflow on a transformer-based model. +""" # %% # Imports and Model Definition diff --git a/examples/dynamo/torch_compile_advanced_usage.py b/examples/dynamo/torch_compile_advanced_usage.py index 8ebedab111..af7d4b212d 100644 --- a/examples/dynamo/torch_compile_advanced_usage.py +++ b/examples/dynamo/torch_compile_advanced_usage.py @@ -4,7 +4,8 @@ Torch Compile Advanced Usage ====================================================== -This interactive script is intended as an overview of the process by which `torch_tensorrt.compile(..., ir="torch_compile", ...)` works, and how it integrates with the `torch.compile` API.""" +This interactive script is intended as an overview of the process by which `torch_tensorrt.compile(..., ir="torch_compile", ...)` works, and how it integrates with the `torch.compile` API. +""" # %% # Imports and Model Definition diff --git a/examples/dynamo/torch_compile_resnet_example.py b/examples/dynamo/torch_compile_resnet_example.py index f852d60158..fb75986099 100644 --- a/examples/dynamo/torch_compile_resnet_example.py +++ b/examples/dynamo/torch_compile_resnet_example.py @@ -4,7 +4,8 @@ Compiling ResNet with dynamic shapes using the `torch.compile` backend ========================================================== -This interactive script is intended as a sample of the Torch-TensorRT workflow with `torch.compile` on a ResNet model.""" +This interactive script is intended as a sample of the Torch-TensorRT workflow with `torch.compile` on a ResNet model. +""" # %% # Imports and Model Definition diff --git a/examples/dynamo/torch_compile_transformers_example.py b/examples/dynamo/torch_compile_transformers_example.py index 221ecd4fd1..17cf46e8a3 100644 --- a/examples/dynamo/torch_compile_transformers_example.py +++ b/examples/dynamo/torch_compile_transformers_example.py @@ -4,7 +4,8 @@ Compiling BERT using the `torch.compile` backend ============================================================== -This interactive script is intended as a sample of the Torch-TensorRT workflow with `torch.compile` on a BERT model.""" +This interactive script is intended as a sample of the Torch-TensorRT workflow with `torch.compile` on a BERT model. 
+""" # %% # Imports and Model Definition diff --git a/examples/dynamo/torch_export_cudagraphs.py b/examples/dynamo/torch_export_cudagraphs.py index 1671c7783d..fb31766b7c 100644 --- a/examples/dynamo/torch_export_cudagraphs.py +++ b/examples/dynamo/torch_export_cudagraphs.py @@ -4,7 +4,8 @@ Torch Export with Cudagraphs ====================================================== -This interactive script is intended as an overview of the process by which the Torch-TensorRT Cudagraphs integration can be used in the `ir="dynamo"` path. The functionality works similarly in the `torch.compile` path as well.""" +This interactive script is intended as an overview of the process by which the Torch-TensorRT Cudagraphs integration can be used in the `ir="dynamo"` path. The functionality works similarly in the `torch.compile` path as well. +""" # %% # Imports and Model Definition diff --git a/examples/dynamo/torch_export_gpt2.py b/examples/dynamo/torch_export_gpt2.py index cea0f3adf2..4d34c58de4 100644 --- a/examples/dynamo/torch_export_gpt2.py +++ b/examples/dynamo/torch_export_gpt2.py @@ -4,7 +4,8 @@ Compiling GPT2 using the dynamo backend ========================================================== -This script illustrates Torch-TensorRT workflow with dynamo backend on popular GPT2 model.""" +This script illustrates Torch-TensorRT workflow with dynamo backend on popular GPT2 model. +""" # %% # Imports and Model Definition diff --git a/examples/dynamo/torch_export_llama2.py b/examples/dynamo/torch_export_llama2.py index 5cfd1ed61c..2f3e3cba43 100644 --- a/examples/dynamo/torch_export_llama2.py +++ b/examples/dynamo/torch_export_llama2.py @@ -4,7 +4,8 @@ Compiling Llama2 using the dynamo backend ========================================================== -This script illustrates Torch-TensorRT workflow with dynamo backend on popular Llama2 model.""" +This script illustrates Torch-TensorRT workflow with dynamo backend on popular Llama2 model. +""" # %% # Imports and Model Definition diff --git a/py/torch_tensorrt/_Input.py b/py/torch_tensorrt/_Input.py index 126219ee8a..2f953094ca 100644 --- a/py/torch_tensorrt/_Input.py +++ b/py/torch_tensorrt/_Input.py @@ -261,7 +261,7 @@ def _supported_input_size_type(input_size: Any) -> bool: @staticmethod def _parse_tensor_domain( - domain: Optional[Tuple[float, float]] + domain: Optional[Tuple[float, float]], ) -> Tuple[float, float]: """ Produce a tuple of integers which specifies a tensor domain in the interval format: [lo, hi) diff --git a/py/torch_tensorrt/_enums.py b/py/torch_tensorrt/_enums.py index eaefb68ce5..c706c345d6 100644 --- a/py/torch_tensorrt/_enums.py +++ b/py/torch_tensorrt/_enums.py @@ -1200,7 +1200,7 @@ def _from( @classmethod def try_from( - c: Union[trt.EngineCapability, EngineCapability] + c: Union[trt.EngineCapability, EngineCapability], ) -> Optional[EngineCapability]: """Create a Torch-TensorRT engine capability enum from a TensorRT engine capability enum. 
diff --git a/py/torch_tensorrt/dynamo/conversion/_TRTBuilderMonitor.py b/py/torch_tensorrt/dynamo/conversion/_TRTBuilderMonitor.py index 9a1189e44a..9b2755f4c7 100644 --- a/py/torch_tensorrt/dynamo/conversion/_TRTBuilderMonitor.py +++ b/py/torch_tensorrt/dynamo/conversion/_TRTBuilderMonitor.py @@ -53,13 +53,13 @@ def _redraw(self, *, blank_lines: int = 0) -> None: if self._render: def clear_line() -> None: - print("\x1B[2K", end="") + print("\x1b[2K", end="") def move_to_start_of_line() -> None: - print("\x1B[0G", end="") + print("\x1b[0G", end="") def move_cursor_up(lines: int) -> None: - print("\x1B[{}A".format(lines), end="") + print("\x1b[{}A".format(lines), end="") def progress_bar(steps: int, num_steps: int) -> str: INNER_WIDTH = 10 diff --git a/py/torch_tensorrt/dynamo/conversion/impl/activation/ops.py b/py/torch_tensorrt/dynamo/conversion/impl/activation/ops.py index a563118526..eb981f2031 100644 --- a/py/torch_tensorrt/dynamo/conversion/impl/activation/ops.py +++ b/py/torch_tensorrt/dynamo/conversion/impl/activation/ops.py @@ -247,7 +247,7 @@ def hard_sigmoid( operation_type = trt.ActivationType.HARD_SIGMOID def hard_sigmoid_dyn_range_fn( - dyn_range: Tuple[float, float] + dyn_range: Tuple[float, float], ) -> Tuple[float, float]: def hard_sigmoid_fn(x: float) -> float: return max(0, min(1, alpha * x + beta)) @@ -310,7 +310,7 @@ def thresholded_relu( operation_type = trt.ActivationType.THRESHOLDED_RELU def thresholded_relu_dyn_range_fn( - dyn_range: Tuple[float, float] + dyn_range: Tuple[float, float], ) -> Tuple[float, float]: def thresholded_relu_fn(x: float) -> float: return x if x > alpha else 0 diff --git a/py/torch_tensorrt/dynamo/utils.py b/py/torch_tensorrt/dynamo/utils.py index 467811ef28..2d3cb2924d 100644 --- a/py/torch_tensorrt/dynamo/utils.py +++ b/py/torch_tensorrt/dynamo/utils.py @@ -465,7 +465,7 @@ def to_torch_device(device: Optional[Union[Device, torch.device, str]]) -> torch def to_torch_tensorrt_device( - device: Optional[Union[Device, torch.device, str]] + device: Optional[Union[Device, torch.device, str]], ) -> Device: """Cast a device-type to torch_tensorrt.Device diff --git a/py/torch_tensorrt/fx/test/converters/acc_op/test_where.py b/py/torch_tensorrt/fx/test/converters/acc_op/test_where.py index 72fea70265..1e14b50305 100644 --- a/py/torch_tensorrt/fx/test/converters/acc_op/test_where.py +++ b/py/torch_tensorrt/fx/test/converters/acc_op/test_where.py @@ -101,7 +101,7 @@ def __init__(self, x_shape, y_shape): def forward(self, condition): return torch.where(condition, self.x, self.y) - inputs = [(torch.randn(condition_shape) > 0)] + inputs = [torch.randn(condition_shape) > 0] self.run_test( Where(x_shape, y_shape), inputs, diff --git a/py/torch_tensorrt/fx/tracer/acc_tracer/acc_tracer.py b/py/torch_tensorrt/fx/tracer/acc_tracer/acc_tracer.py index 9d5576bd63..c8db1b62ef 100644 --- a/py/torch_tensorrt/fx/tracer/acc_tracer/acc_tracer.py +++ b/py/torch_tensorrt/fx/tracer/acc_tracer/acc_tracer.py @@ -10,7 +10,6 @@ from typing import ( Any, Callable, - cast, Dict, Iterable, Optional, @@ -19,6 +18,7 @@ Tuple, Type, Union, + cast, ) import torch @@ -32,7 +32,6 @@ from . 
import acc_normalizer, acc_ops, acc_shape_prop, acc_utils # noqa: F401 - _LOGGER = logging.getLogger(__name__) @@ -517,7 +516,7 @@ def _replace_transpose_last_dims_impl( changed = False def _calculate_dim( - transpose_dim: Union[torch.fx.Node, int] + transpose_dim: Union[torch.fx.Node, int], ) -> Union[torch.fx.Node, int]: nonlocal transpose_input_node nonlocal changed diff --git a/tests/py/dynamo/partitioning/test_flaky_global_partitioning.py b/tests/py/dynamo/partitioning/test_flaky_global_partitioning.py new file mode 100644 index 0000000000..2e2013d5e6 --- /dev/null +++ b/tests/py/dynamo/partitioning/test_flaky_global_partitioning.py @@ -0,0 +1,108 @@ +from copy import deepcopy + +import numpy as np +import pytest +import torch +import torch.nn.functional as F +import torch_tensorrt +from parameterized import parameterized +from torch.testing._internal.common_utils import TestCase, run_tests +from torch_tensorrt.dynamo import partitioning + +from ..testing_utilities import lower_graph_testing + +# Note: these tests were originally part of test_global_partitioning.py but were flaky +# when run with the rest of that file: the partitioned graphs differed depending on +# whether the tests ran alongside the full file or on their own. pytest does not use +# parallel execution by default, so the cause of this interaction is currently unclear. +# When run independently, the tests produce a structurally correct partition, similar +# to the one produced by fast partitioning, so they were moved into this separate file. + + +class TestGlobalPartitioning(TestCase): +    def test_partition_partially_supported_multi_op(self): +        class PartiallySupportedMultiOp(torch.nn.Module): +            def __init__(self, *args, **kwargs) -> None: +                super().__init__(*args, **kwargs) + +            def forward(self, x, y): +                sum_1 = torch.ops.aten.add.Tensor(x, y) +                sum_2 = torch.ops.aten.add.Tensor(x, sum_1) +                sum_ = np.sum(sum_1) + np.sum(sum_2) +                relu_ = torch.ops.aten.relu.default(sum_) +                pow_ = torch.ops.aten.pow.Tensor_Scalar(relu_, 2) +                return pow_ + +        fx_graph = torch.fx.symbolic_trace(PartiallySupportedMultiOp()) +        partitioned_graph, _ = partitioning.global_partition( +            deepcopy(fx_graph), min_block_size=2 +        ) +        self.assertEqual( +            len(list(partitioned_graph.named_children())), +            2, +            "Unsupported operators interleave supported ones, expected 2 segments", +        ) + +    def test_partition_partially_supported_with_torch_executed_ops(self): +        class PartiallySupportedMultiOp(torch.nn.Module): +            def __init__(self, *args, **kwargs) -> None: +                super().__init__(*args, **kwargs) + +            def forward(self, x, y): +                sum_1 = torch.ops.aten.add.Tensor(x, y) +                sum_2 = torch.ops.aten.add.Tensor(x, sum_1) +                sum_ = torch.ops.aten.add.Tensor(sum_1, sum_2) +                relu_ = torch.ops.aten.relu.default(sum_) +                pow_ = torch.ops.aten.pow.Tensor_Scalar(relu_, 2) +                return pow_ + +        unexpected_ops = {torch.ops.aten.add.Tensor} + +        inputs = [ +            torch.randint( +                1, +                10, +                (5,), +            ), +            torch.randint( +                1, +                10, +                (5,), +            ), +        ] + +        fx_graph = torch.fx.symbolic_trace(PartiallySupportedMultiOp()) +        ( +            unexpected_ops_seen, +            _, +            partitioned_graphs, +        ) = lower_graph_testing( +            fx_graph, +            inputs, +            unexpected_ops=unexpected_ops, +            min_block_size=2, +            torch_executed_ops={"torch.ops.aten.add.Tensor"}, +            testing_partitioning=True, +            use_fast_partitioner=False, +        ) + +        self.assertEqual( +            len(unexpected_ops_seen), +            0, +            f"The following unexpected ops were encountered: {unexpected_ops_seen}", +        ) + +        
self.assertEqual( + len(partitioned_graphs), + 1, + "Without control flow breaks, there should only be a single graph", + ) + self.assertEqual( + len(list(partitioned_graphs[0].named_children())), + 1, + "Certain operators are set to run in Torch, expected 1 segment", + ) + + +if __name__ == "__main__": + run_tests() diff --git a/tests/py/dynamo/partitioning/test_global_partitioning.py b/tests/py/dynamo/partitioning/test_global_partitioning.py index 80b6716d20..887fa35659 100644 --- a/tests/py/dynamo/partitioning/test_global_partitioning.py +++ b/tests/py/dynamo/partitioning/test_global_partitioning.py @@ -117,89 +117,6 @@ def forward(self, x, y): "All operators are supported, there should be one segment", ) - def test_partition_partially_supported_multi_op(self): - class PartiallySupportedMultiOp(torch.nn.Module): - def __init__(self, *args, **kwargs) -> None: - super().__init__(*args, **kwargs) - - def forward(self, x, y): - sum_1 = torch.ops.aten.add.Tensor(x, y) - sum_2 = torch.ops.aten.add.Tensor(x, sum_1) - sum_ = np.sum(sum_1) + np.sum(sum_2) - relu_ = torch.ops.aten.relu.default(sum_) - pow_ = torch.ops.aten.pow.Tensor_Scalar(relu_, 2) - return pow_ - - fx_graph = torch.fx.symbolic_trace(PartiallySupportedMultiOp()) - partitioned_graph, _ = partitioning.global_partition( - deepcopy(fx_graph), min_block_size=2 - ) - self.assertEqual( - len(list(partitioned_graph.named_children())), - 2, - "Unsupported operators interleave supported ones, expected 2 segments", - ) - - def test_partition_partially_supported_with_torch_executed_ops(self): - class PartiallySupportedMultiOp(torch.nn.Module): - def __init__(self, *args, **kwargs) -> None: - super().__init__(*args, **kwargs) - - def forward(self, x, y): - sum_1 = torch.ops.aten.add.Tensor(x, y) - sum_2 = torch.ops.aten.add.Tensor(x, sum_1) - sum_ = torch.ops.aten.add.Tensor(sum_1, sum_2) - relu_ = torch.ops.aten.relu.default(sum_) - pow_ = torch.ops.aten.pow.Tensor_Scalar(relu_, 2) - return pow_ - - unexpected_ops = {torch.ops.aten.add.Tensor} - - inputs = [ - torch.randint( - 1, - 10, - (5,), - ), - torch.randint( - 1, - 10, - (5,), - ), - ] - - fx_graph = torch.fx.symbolic_trace(PartiallySupportedMultiOp()) - ( - unexpected_ops_seen, - _, - partitioned_graphs, - ) = lower_graph_testing( - fx_graph, - inputs, - unexpected_ops=unexpected_ops, - min_block_size=2, - torch_executed_ops={"torch.ops.aten.add.Tensor"}, - testing_partitioning=True, - use_fast_partitioner=False, - ) - - self.assertEqual( - len(unexpected_ops_seen), - 0, - f"The following unexpected ops were encountered: {unexpected_ops_seen}", - ) - - self.assertEqual( - len(partitioned_graphs), - 1, - "Without control flow breaks, there should only be a single graph", - ) - self.assertEqual( - len(list(partitioned_graphs[0].named_children())), - 1, - "Certain operators are set to run in Torch, expected 1 segment", - ) - if __name__ == "__main__": run_tests() From ed6ef65b51ab6922788ba9e906b80101e8441aed Mon Sep 17 00:00:00 2001 From: "Zewen (Evan) Li" Date: Sat, 18 Jan 2025 08:59:51 +0800 Subject: [PATCH 3/5] fix: CI docker build error for release 2.6 (#3360) --- .github/workflows/docker_builder.yml | 2 +- docker/README.md | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/docker_builder.yml b/.github/workflows/docker_builder.yml index a978d82b6a..4aa228db95 100644 --- a/.github/workflows/docker_builder.yml +++ b/.github/workflows/docker_builder.yml @@ -54,7 +54,7 @@ jobs: TRT_VERSION=$(python3 -c "import versions; 
versions.tensorrt_version()") echo "TRT VERSION = ${TRT_VERSION}" - DOCKER_BUILDKIT=1 docker build --build-arg TENSORRT_VERSION=$TRT_VERSION -f docker/Dockerfile --tag $DOCKER_TAG . + DOCKER_BUILDKIT=1 docker build --build-arg TENSORRT_VERSION=$TRT_VERSION --build-arg USE_CXX11_ABI=1 -f docker/Dockerfile --tag $DOCKER_TAG . - name: Push Docker image env: diff --git a/docker/README.md b/docker/README.md index 7435973b1a..85be0d5791 100644 --- a/docker/README.md +++ b/docker/README.md @@ -1,13 +1,13 @@ # Building a Torch-TensorRT container -* Use `Dockerfile` to build a container which provides the exact development environment that our master branch is usually tested against. +* Use `Dockerfile` to build a container which provides the exact development environment that our main branch is usually tested against. * The `Dockerfile` currently uses Bazelisk to select the Bazel version, and uses the exact library versions of Torch and CUDA listed in dependencies. * The desired versions of TensorRT must be specified as build-args, with major and minor versions as in: `--build-arg TENSORRT_VERSION=a.b` - * [**Optional**] The desired base image be changed by explicitly setting a base image, as in `--build-arg BASE_IMG=nvidia/cuda:11.8.0-devel-ubuntu22.04`, though this is optional + * [**Optional**] The desired base image can be changed by explicitly setting a base image, as in `--build-arg BASE_IMG=nvidia/cuda:11.8.0-devel-ubuntu22.04`, though this is optional. * [**Optional**] Additionally, the desired Python version can be changed by explicitly setting a version, as in `--build-arg PYTHON_VERSION=3.10`, though this is optional as well. -* This `Dockerfile` installs `pre-cxx11-abi` versions of Pytorch and builds Torch-TRT using `pre-cxx11-abi` libtorch as well. +* This `Dockerfile` installs `pre-cxx11-abi` versions of PyTorch and builds Torch-TRT using `pre-cxx11-abi` libtorch as well. Update on 1/17/2025: In torch 2.6, `PRE_CXX11_ABI` is required for CUDA 11.8 and 12.4, while `USE_CXX11_ABI` is required for CUDA 12.6. As of torch 2.7, torch requires `USE_CXX11_ABI` for all of CUDA 11.8, 12.4, and 12.6. Note: By default the container uses the `pre-cxx11-abi` version of Torch + Torch-TRT. If you are using a workflow that requires a build of PyTorch on the CXX11 ABI (e.g. using the PyTorch NGC containers as a base image), add the Docker build argument: `--build-arg USE_CXX11_ABI=1` @@ -24,7 +24,7 @@ Note: By default the container uses the `pre-cxx11-abi` version of Torch + Torch Build: ``` -DOCKER_BUILDKIT=1 docker build --build-arg TENSORRT_VERSION=10.6.0 -f docker/Dockerfile -t torch_tensorrt:latest . +DOCKER_BUILDKIT=1 docker build --build-arg TENSORRT_VERSION=10.6.0 --build-arg USE_CXX11_ABI=1 -f docker/Dockerfile -t torch_tensorrt:latest . 
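+# USE_CXX11_ABI=1 selects the CXX11 ABI libtorch build; per the ABI note above it is
+# required for torch 2.6 with CUDA 12.6 and, as of torch 2.7, for CUDA 11.8 and 12.4 as well.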
``` Run: From 627b0cfff91e1c19737d3a96e69f8c75fc2b6d7c Mon Sep 17 00:00:00 2001 From: "Zewen (Evan) Li" Date: Fri, 17 Jan 2025 04:46:49 +0800 Subject: [PATCH 4/5] fix: CI errors on release 2.6 (#3358) --- .github/workflows/assigner.yml | 2 +- .github/workflows/build-tensorrt-linux.yml | 6 ++--- .github/workflows/build-tensorrt-windows.yml | 4 +-- .github/workflows/build-test-linux.yml | 26 ++++++++++++++----- .../workflows/build-test-tensorrt-linux.yml | 26 ++++++++++++++----- .../workflows/build-test-tensorrt-windows.yml | 26 ++++++++++++++----- .github/workflows/build-test-windows.yml | 24 ++++++++++++----- .github/workflows/docker_builder.yml | 2 +- .github/workflows/linter.yml | 4 +-- .github/workflows/linux-test.yml | 6 ++--- .github/workflows/nightlies.yml | 2 +- .github/workflows/release-linux.yml | 4 +-- .github/workflows/release-wheel-linux.yml | 10 +++---- .github/workflows/release-wheel-windows.yml | 4 +-- .github/workflows/release-windows.yml | 2 +- .github/workflows/windows-test.yml | 6 ++--- 16 files changed, 101 insertions(+), 53 deletions(-) diff --git a/.github/workflows/assigner.yml b/.github/workflows/assigner.yml index 2b65e554b1..b1056c50b3 100644 --- a/.github/workflows/assigner.yml +++ b/.github/workflows/assigner.yml @@ -22,7 +22,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Assign uses: ./.github/actions/assigner diff --git a/.github/workflows/build-tensorrt-linux.yml b/.github/workflows/build-tensorrt-linux.yml index 7581c38ae8..42fd32eb55 100644 --- a/.github/workflows/build-tensorrt-linux.yml +++ b/.github/workflows/build-tensorrt-linux.yml @@ -114,13 +114,13 @@ jobs: rm -rf "${RUNNER_TEMP}/*" fi echo "::endgroup::" - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: # Support the use case where we need to checkout someone's fork repository: ${{ inputs.test-infra-repository }} ref: ${{ inputs.test-infra-ref }} path: test-infra - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 if: ${{ env.ARCH == 'aarch64' }} with: # Support the use case where we need to checkout someone's fork @@ -212,7 +212,7 @@ jobs: # NB: Only upload to GitHub after passing smoke tests - name: Upload wheel to GitHub continue-on-error: true - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: ${{ env.UPLOAD_ARTIFACT_NAME }} path: ${{ inputs.repository }}/dist diff --git a/.github/workflows/build-tensorrt-windows.yml b/.github/workflows/build-tensorrt-windows.yml index 4b86910768..67639a3f02 100644 --- a/.github/workflows/build-tensorrt-windows.yml +++ b/.github/workflows/build-tensorrt-windows.yml @@ -100,7 +100,7 @@ jobs: # to have a conversation timeout-minutes: 120 steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: # Support the use case where we need to checkout someone's fork repository: ${{ inputs.test-infra-repository }} @@ -216,7 +216,7 @@ jobs: # NB: Only upload to GitHub after passing smoke tests - name: Upload wheel to GitHub continue-on-error: true - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: ${{ env.UPLOAD_ARTIFACT_NAME }} path: ${{ inputs.repository }}/dist/ diff --git a/.github/workflows/build-test-linux.yml b/.github/workflows/build-test-linux.yml index b0a487bb79..ecbe57036a 100644 --- a/.github/workflows/build-test-linux.yml +++ b/.github/workflows/build-test-linux.yml @@ -33,7 +33,7 @@ jobs: - uses: actions/setup-python@v4 with: python-version: '3.10' - - uses: actions/checkout@v3 + - uses: 
actions/checkout@v4 with: repository: pytorch/tensorrt - name: Generate release matrix @@ -136,7 +136,9 @@ jobs: export USE_HOST_DEPS=1 export CI_BUILD=1 pushd . - cd tests/py/dynamo + cd tests/py + python -m pip install -r requirements.txt + cd dynamo python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/dynamo_converters_test_results.xml -n 4 conversion/ popd @@ -165,7 +167,9 @@ jobs: export USE_HOST_DEPS=1 export CI_BUILD=1 pushd . - cd tests/py/dynamo + cd tests/py + python -m pip install -r requirements.txt + cd dynamo python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/dyn_models_export.xml --ir dynamo models/ popd @@ -194,7 +198,9 @@ jobs: export USE_HOST_DEPS=1 export CI_BUILD=1 pushd . - cd tests/py/dynamo + cd tests/py + python -m pip install -r requirements.txt + cd dynamo python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/export_serde_test_results.xml --ir dynamo models/test_export_serde.py python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/reexport_test_results.xml --ir dynamo models/test_reexport.py popd @@ -224,7 +230,9 @@ jobs: export USE_HOST_DEPS=1 export CI_BUILD=1 pushd . - cd tests/py/dynamo + cd tests/py + python -m pip install -r requirements.txt + cd dynamo python -m pytest -ra -n 10 --junitxml=${RUNNER_TEST_RESULTS_DIR}/torch_compile_be_test_results.xml backend/ python -m pytest -ra -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/torch_complete_be_e2e_test_results.xml --ir torch_compile models/test_models.py python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/torch_compile_dyn_models_export.xml --ir torch_compile models/test_dyn_models.py @@ -255,7 +263,9 @@ jobs: export USE_HOST_DEPS=1 export CI_BUILD=1 pushd . - cd tests/py/dynamo + cd tests/py + python -m pip install -r requirements.txt + cd dynamo python -m pytest -ra -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_runtime_test_results.xml --ignore runtime/test_002_cudagraphs_py.py --ignore runtime/test_002_cudagraphs_cpp.py runtime/ python -m pytest -ra -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_partitioning_test_results.xml partitioning/ python -m pytest -ra -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_lowering_test_results.xml lowering/ @@ -286,7 +296,9 @@ jobs: export USE_HOST_DEPS=1 export CI_BUILD=1 pushd . - cd tests/py/dynamo + cd tests/py + python -m pip install -r requirements.txt + cd dynamo nvidia-smi python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_runtime_cudagraphs_cpp_test_results.xml runtime/test_002_cudagraphs_cpp.py || true python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_runtime_cudagraphs_py_test_results.xml runtime/test_002_cudagraphs_py.py || true diff --git a/.github/workflows/build-test-tensorrt-linux.yml b/.github/workflows/build-test-tensorrt-linux.yml index dd83299fe7..9bf9b2c3de 100644 --- a/.github/workflows/build-test-tensorrt-linux.yml +++ b/.github/workflows/build-test-tensorrt-linux.yml @@ -30,7 +30,7 @@ jobs: - uses: actions/setup-python@v4 with: python-version: '3.10' - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: repository: pytorch/tensorrt - name: Generate tensorrt matrix @@ -132,7 +132,9 @@ jobs: export USE_HOST_DEPS=1 export CI_BUILD=1 pushd . 
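          # Hypothetical aside (not part of this patch): each test job below now performs the
          # equivalent of the following before invoking pytest, so that test-only Python deps
          # (pytest plugins, etc.) are present even when USE_HOST_DEPS=1 reuses the host torch install:
          #
          #   python -m pip install -r tests/py/requirements.txt && cd tests/py/dynamo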
- cd tests/py/dynamo + cd tests/py + python -m pip install -r requirements.txt + cd dynamo python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/dynamo_converters_test_results.xml -n 4 conversion/ popd @@ -161,7 +163,9 @@ jobs: export USE_HOST_DEPS=1 export CI_BUILD=1 pushd . - cd tests/py/dynamo + cd tests/py + python -m pip install -r requirements.txt + cd dynamo python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/dyn_models_export.xml --ir dynamo models/ popd @@ -190,7 +194,9 @@ jobs: export USE_HOST_DEPS=1 export CI_BUILD=1 pushd . - cd tests/py/dynamo + cd tests/py + python -m pip install -r requirements.txt + cd dynamo python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/export_serde_test_results.xml --ir dynamo models/test_export_serde.py popd @@ -219,7 +225,9 @@ jobs: export USE_HOST_DEPS=1 export CI_BUILD=1 pushd . - cd tests/py/dynamo + cd tests/py + python -m pip install -r requirements.txt + cd dynamo python -m pytest -ra -n 10 --junitxml=${RUNNER_TEST_RESULTS_DIR}/torch_compile_be_test_results.xml backend/ python -m pytest -ra -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/torch_complete_be_e2e_test_results.xml --ir torch_compile models/test_models.py python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/torch_compile_dyn_models_export.xml --ir torch_compile models/test_dyn_models.py @@ -250,7 +258,9 @@ jobs: export USE_HOST_DEPS=1 export CI_BUILD=1 pushd . - cd tests/py/dynamo + cd tests/py + python -m pip install -r requirements.txt + cd dynamo python -m pytest -ra -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_runtime_test_results.xml --ignore runtime/test_002_cudagraphs_py.py --ignore runtime/test_002_cudagraphs_cpp.py runtime/ python -m pytest -ra -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_partitioning_test_results.xml partitioning/ python -m pytest -ra -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_lowering_test_results.xml lowering/ @@ -281,7 +291,9 @@ jobs: export USE_HOST_DEPS=1 export CI_BUILD=1 pushd . - cd tests/py/dynamo + cd tests/py + python -m pip install -r requirements.txt + cd dynamo nvidia-smi python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_runtime_cudagraphs_cpp_test_results.xml runtime/test_002_cudagraphs_cpp.py || true python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_runtime_cudagraphs_py_test_results.xml runtime/test_002_cudagraphs_py.py || true diff --git a/.github/workflows/build-test-tensorrt-windows.yml b/.github/workflows/build-test-tensorrt-windows.yml index 883e7fe42a..cd73675407 100644 --- a/.github/workflows/build-test-tensorrt-windows.yml +++ b/.github/workflows/build-test-tensorrt-windows.yml @@ -30,7 +30,7 @@ jobs: - uses: actions/setup-python@v4 with: python-version: '3.10' - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: repository: pytorch/tensorrt - name: Generate tensorrt matrix @@ -135,7 +135,9 @@ jobs: export USE_HOST_DEPS=1 export CI_BUILD=1 pushd . - cd tests/py/dynamo + cd tests/py + python -m pip install -r requirements.txt + cd dynamo python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/dynamo_converters_test_results.xml -n 4 conversion/ popd @@ -161,7 +163,9 @@ jobs: export USE_HOST_DEPS=1 export CI_BUILD=1 pushd . 
- cd tests/py/dynamo + cd tests/py + python -m pip install -r requirements.txt + cd dynamo python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/dyn_models_export.xml --ir dynamo models/ popd @@ -187,7 +191,9 @@ jobs: export USE_HOST_DEPS=1 export CI_BUILD=1 pushd . - cd tests/py/dynamo + cd tests/py + python -m pip install -r requirements.txt + cd dynamo python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/export_serde_test_results.xml --ir dynamo models/test_export_serde.py popd @@ -213,7 +219,9 @@ jobs: export USE_HOST_DEPS=1 export CI_BUILD=1 pushd . - cd tests/py/dynamo + cd tests/py + python -m pip install -r requirements.txt + cd dynamo python -m pytest -ra -n 10 --junitxml=${RUNNER_TEST_RESULTS_DIR}/torch_compile_be_test_results.xml backend/ python -m pytest -ra -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/torch_complete_be_e2e_test_results.xml --ir torch_compile models/test_models.py python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/torch_compile_dyn_models_export.xml --ir torch_compile models/test_dyn_models.py @@ -241,7 +249,9 @@ jobs: export USE_HOST_DEPS=1 export CI_BUILD=1 pushd . - cd tests/py/dynamo + cd tests/py + python -m pip install -r requirements.txt + cd dynamo python -m pytest -ra -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_runtime_test_results.xml --ignore runtime/test_002_cudagraphs_py.py --ignore runtime/test_002_cudagraphs_cpp.py runtime/ python -m pytest -ra -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_partitioning_test_results.xml partitioning/ python -m pytest -ra -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_lowering_test_results.xml lowering/ @@ -269,7 +279,9 @@ jobs: export USE_HOST_DEPS=1 export CI_BUILD=1 pushd . - cd tests/py/dynamo + cd tests/py + python -m pip install -r requirements.txt + cd dynamo python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_runtime_cudagraphs_cpp_test_results.xml runtime/test_002_cudagraphs_cpp.py python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_runtime_cudagraphs_py_test_results.xml runtime/test_002_cudagraphs_py.py popd diff --git a/.github/workflows/build-test-windows.yml b/.github/workflows/build-test-windows.yml index c227d14a0f..2ee31b4b74 100644 --- a/.github/workflows/build-test-windows.yml +++ b/.github/workflows/build-test-windows.yml @@ -118,7 +118,9 @@ jobs: export USE_HOST_DEPS=1 export CI_BUILD=1 pushd . - cd tests/py/dynamo + cd tests/py + python -m pip install -r requirements.txt + cd dynamo python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/dynamo_converters_test_results.xml -n 4 conversion/ popd @@ -144,7 +146,9 @@ jobs: export USE_HOST_DEPS=1 export CI_BUILD=1 pushd . - cd tests/py/dynamo + cd tests/py + python -m pip install -r requirements.txt + cd dynamo python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/dyn_models_export.xml --ir dynamo models/ popd @@ -170,7 +174,9 @@ jobs: export USE_HOST_DEPS=1 export CI_BUILD=1 pushd . - cd tests/py/dynamo + cd tests/py + python -m pip install -r requirements.txt + cd dynamo python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/export_serde_test_results.xml --ir dynamo models/test_export_serde.py python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/reexport_test_results.xml --ir dynamo models/test_reexport.py popd @@ -197,7 +203,9 @@ jobs: export USE_HOST_DEPS=1 export CI_BUILD=1 pushd . 
- cd tests/py/dynamo + cd tests/py + python -m pip install -r requirements.txt + cd dynamo python -m pytest -ra -n 10 --junitxml=${RUNNER_TEST_RESULTS_DIR}/torch_compile_be_test_results.xml backend/ python -m pytest -ra -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/torch_complete_be_e2e_test_results.xml --ir torch_compile models/test_models.py python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/torch_compile_dyn_models_export.xml --ir torch_compile models/test_dyn_models.py @@ -225,7 +233,9 @@ jobs: export USE_HOST_DEPS=1 export CI_BUILD=1 pushd . - cd tests/py/dynamo + cd tests/py + python -m pip install -r requirements.txt + cd dynamo python -m pytest -ra -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_runtime_test_results.xml --ignore runtime/test_002_cudagraphs_py.py --ignore runtime/test_002_cudagraphs_cpp.py runtime/ python -m pytest -ra -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_partitioning_test_results.xml partitioning/ python -m pytest -ra -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_lowering_test_results.xml lowering/ @@ -253,7 +263,9 @@ jobs: export USE_HOST_DEPS=1 export CI_BUILD=1 pushd . - cd tests/py/dynamo + cd tests/py + python -m pip install -r requirements.txt + cd dynamo python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_runtime_cudagraphs_cpp_test_results.xml runtime/test_002_cudagraphs_cpp.py python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_runtime_cudagraphs_py_test_results.xml runtime/test_002_cudagraphs_py.py popd diff --git a/.github/workflows/docker_builder.yml b/.github/workflows/docker_builder.yml index 4aa228db95..771dc79f42 100644 --- a/.github/workflows/docker_builder.yml +++ b/.github/workflows/docker_builder.yml @@ -30,7 +30,7 @@ jobs: steps: - name: Checkout repository - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Fix Slashes Repo Name id: fix_slashes diff --git a/.github/workflows/linter.yml b/.github/workflows/linter.yml index 6428bef8c8..c05f45b6c7 100644 --- a/.github/workflows/linter.yml +++ b/.github/workflows/linter.yml @@ -26,7 +26,7 @@ jobs: with: ref: ${{ github.event.pull_request.head.sha }} - name: Set up Python 3.9 - uses: actions/setup-python@v3 + uses: actions/setup-python@v4 with: python-version: '3.9' - name: Setup env @@ -66,7 +66,7 @@ jobs: with: ref: ${{ github.event.pull_request.head.sha }} - name: Set up Python 3.9 - uses: actions/setup-python@v3 + uses: actions/setup-python@v4 with: python-version: '3.9' - name: Setup env diff --git a/.github/workflows/linux-test.yml b/.github/workflows/linux-test.yml index 6ddc601f2c..e4880f8ee8 100644 --- a/.github/workflows/linux-test.yml +++ b/.github/workflows/linux-test.yml @@ -85,7 +85,7 @@ jobs: rm -rfv "${GITHUB_WORKSPACE}" mkdir -p "${GITHUB_WORKSPACE}" echo "::endgroup::" - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: # Support the use case where we need to checkout someone's fork repository: ${{ inputs.test-infra-repository }} @@ -120,7 +120,7 @@ jobs: path: /opt/torch-tensorrt-builds/ - name: Download artifacts if: ${{ matrix.tensorrt != '' }} - uses: actions/download-artifact@v3 + uses: actions/download-artifact@v4 with: name: ${{ env.DOWNLOAD_ARTIFACT_NAME }} path: /opt/torch-tensorrt-builds/ @@ -184,7 +184,7 @@ jobs: echo "upload-docs=${upload_docs}" >> "${GITHUB_OUTPUT}" - name: Upload artifacts to GitHub (if any) - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 if: ${{ inputs.upload-artifact != '' }} with: 
name: ${{ inputs.upload-artifact }} diff --git a/.github/workflows/nightlies.yml b/.github/workflows/nightlies.yml index a0692cdafe..aac1c58f7f 100644 --- a/.github/workflows/nightlies.yml +++ b/.github/workflows/nightlies.yml @@ -11,7 +11,7 @@ jobs: environment: trigger-nightly timeout-minutes: 120 steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: ref: main token: ${{ secrets.GH_PYTORCHBOT_TOKEN }} diff --git a/.github/workflows/release-linux.yml b/.github/workflows/release-linux.yml index ca13b37443..8caf525e76 100644 --- a/.github/workflows/release-linux.yml +++ b/.github/workflows/release-linux.yml @@ -34,7 +34,7 @@ jobs: - uses: actions/setup-python@v4 with: python-version: '3.10' - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: repository: pytorch/tensorrt - name: Generate release matrix @@ -84,7 +84,7 @@ jobs: - uses: actions/setup-python@v4 with: python-version: '3.10' - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: repository: pytorch/tensorrt - name: Generate release matrix diff --git a/.github/workflows/release-wheel-linux.yml b/.github/workflows/release-wheel-linux.yml index 6ddd9e0306..54732378eb 100644 --- a/.github/workflows/release-wheel-linux.yml +++ b/.github/workflows/release-wheel-linux.yml @@ -114,13 +114,13 @@ jobs: rm -rf "${RUNNER_TEMP}/*" fi echo "::endgroup::" - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: # Support the use case where we need to checkout someone's fork repository: ${{ inputs.test-infra-repository }} ref: ${{ inputs.test-infra-ref }} path: test-infra - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 if: ${{ env.ARCH == 'aarch64' }} with: # Support the use case where we need to checkout someone's fork @@ -236,21 +236,21 @@ jobs: - name: Upload wheel to GitHub if: ${{ inputs.cxx11-tarball-release != 'true' }} continue-on-error: true - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: ${{ env.ARTIFACT_NAME }} path: ${{ inputs.repository }}/release/wheel/ - name: Upload pre-cxx11 tarball to GitHub if: ${{ inputs.cxx11-tarball-release != 'true' && env.PYTHON_VERSION == '3.10' }} continue-on-error: true - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: pre-cxx11-tarball-${{ env.PYTHON_VERSION }}-${{ env.CU_VERSION }} path: ${{ inputs.repository }}/release/tarball/ - name: Upload cxx11 tarball to GitHub if: ${{ inputs.cxx11-tarball-release == 'true' }} continue-on-error: true - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: cxx11-tarball-${{ env.PYTHON_VERSION }}-${{ env.CU_VERSION }} path: ${{ inputs.repository }}/release/tarball/ diff --git a/.github/workflows/release-wheel-windows.yml b/.github/workflows/release-wheel-windows.yml index 6a6c993502..2ea88bce9e 100644 --- a/.github/workflows/release-wheel-windows.yml +++ b/.github/workflows/release-wheel-windows.yml @@ -90,7 +90,7 @@ jobs: # to have a conversation timeout-minutes: 120 steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: # Support the use case where we need to checkout someone's fork repository: ${{ inputs.test-infra-repository }} @@ -199,7 +199,7 @@ jobs: # NB: Only upload to GitHub after passing smoke tests - name: Upload wheel to GitHub continue-on-error: true - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: ${{ env.ARTIFACT_NAME }} path: ${{ inputs.repository }}/dist/ diff --git a/.github/workflows/release-windows.yml 
b/.github/workflows/release-windows.yml index 271547cec3..489cc6ab30 100644 --- a/.github/workflows/release-windows.yml +++ b/.github/workflows/release-windows.yml @@ -34,7 +34,7 @@ jobs: - uses: actions/setup-python@v4 with: python-version: '3.10' - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: repository: pytorch/tensorrt - name: Generate release matrix diff --git a/.github/workflows/windows-test.yml b/.github/workflows/windows-test.yml index 13feedfa8c..a8b27c0aa9 100644 --- a/.github/workflows/windows-test.yml +++ b/.github/workflows/windows-test.yml @@ -70,7 +70,7 @@ jobs: mkdir -p "${GITHUB_WORKSPACE}" echo "::endgroup::" - name: Checkout repository (${{ inputs.test-infra-repository }}@${{ inputs.test-infra-ref }}) - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: # Support the use case where we need to checkout someone's fork repository: ${{ inputs.test-infra-repository }} @@ -105,13 +105,13 @@ jobs: is_windows: 'enabled' - name: Download artifacts if: ${{ matrix.tensorrt == '' }} - uses: actions/download-artifact@v3 + uses: actions/download-artifact@v4 with: name: ${{ env.ARTIFACT_NAME }} path: ${{ runner.temp }}/artifacts/ - name: Download artifacts if: ${{ matrix.tensorrt != '' }} - uses: actions/download-artifact@v3 + uses: actions/download-artifact@v4 with: name: ${{ env.DOWNLOAD_ARTIFACT_NAME }} path: ${{ runner.temp }}/artifacts/ From e3141ed122ccfc677c00ddebbbaf4460d0a7c11b Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Wed, 29 Jan 2025 16:57:06 -0800 Subject: [PATCH 5/5] chore: fix linting issues --- noxfile.py | 4 +++- setup.py | 3 +-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/noxfile.py b/noxfile.py index 6dcf1da60f..10c9b647fa 100644 --- a/noxfile.py +++ b/noxfile.py @@ -237,7 +237,9 @@ def run_dynamo_lower_tests(session): tests = ["lowering"] for test in tests: if USE_HOST_DEPS: - session.run_always("pytest", test, "-n", num_workers, env={"PYTHONPATH": PYT_PATH}) + session.run_always( + "pytest", test, "-n", num_workers, env={"PYTHONPATH": PYT_PATH} + ) else: session.run_always("pytest", test, "-n", num_workers) diff --git a/setup.py b/setup.py index 17b3d33c75..91648e57a1 100644 --- a/setup.py +++ b/setup.py @@ -33,7 +33,7 @@ def get_root_dir() -> Path: - dir_path = os.path.dirname(os.path.realpath(__file__)) + dir_path = os.path.dirname(os.path.realpath(__file__)) return dir_path @@ -119,7 +119,6 @@ def load_dep_info(): gpu_arch_version = f"cu{__cuda_version__.replace('.','')}" - __version__ = os.environ.get("BUILD_VERSION") if "--ci" in sys.argv:
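For context on the final lint fix: the reformatted noxfile.py block above is simply black-style line wrapping of a call that forwards PYTHONPATH to pytest. Below is a minimal, self-contained sketch of that session; the `PYT_PATH` default, the worker count, and the `python=False` session setting are illustrative assumptions, while the control flow and the `env={"PYTHONPATH": PYT_PATH}` forwarding come directly from the diff.

```
import os

import nox

# Illustrative defaults -- the real noxfile derives these from the environment/CI config
PYT_PATH = os.environ.get("PYT_PATH", "/opt/python/site-packages")
USE_HOST_DEPS = 0 < len(os.environ.get("USE_HOST_DEPS", ""))
num_workers = "4"


@nox.session(python=False)
def run_dynamo_lower_tests(session):
    """Run dynamo lowering tests, forwarding PYTHONPATH when host deps are reused."""
    tests = ["lowering"]
    for test in tests:
        if USE_HOST_DEPS:
            # Point pytest at the host-installed torch/torch_tensorrt packages
            session.run_always(
                "pytest", test, "-n", num_workers, env={"PYTHONPATH": PYT_PATH}
            )
        else:
            session.run_always("pytest", test, "-n", num_workers)
```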