[TensorRT EP] Add new provider option to exclude ops from running on TRT (#23705)

This PR removes the implicit filtering-out of DDS ops from running on TRT.
In other words, by default, DDS nodes will run on TRT if TRT supports them.

Moreover, it adds a new provider option, `trt_op_types_to_exclude`:
- Users can provide a comma-separated list of op types to exclude from running on TRT
- e.g. `trt_op_types_to_exclude="NonMaxSuppression,NonZero,RoiAlign"`

(This PR essentially adds back the feature from #22681 that was previously held from merging.)


[Note]
There may be potential performance issues in TRT 10 when running models
that contain DDS operations such as NonMaxSuppression, NonZero, and
RoiAlign (e.g., Faster-RCNN).
If users encounter significant performance degradation, we suggest
excluding those DDS ops from running on TRT, i.e.
trt_op_types_to_exclude="NonMaxSuppression,NonZero,RoiAlign". Those
DDS nodes will then run on the CUDA EP or the CPU.
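For reference, a minimal C++ sketch (not part of this commit; the model path and excluded op types are illustrative) of setting the new option through the ORT C API:

```cpp
#include <onnxruntime_cxx_api.h>

int main() {
  Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "trt_exclude_dds");
  Ort::SessionOptions session_options;
  const OrtApi& api = Ort::GetApi();

  // Create the V2 TensorRT provider options and set the key added by this PR.
  OrtTensorRTProviderOptionsV2* trt_options = nullptr;
  Ort::ThrowOnError(api.CreateTensorRTProviderOptions(&trt_options));
  const char* keys[] = {"trt_op_types_to_exclude"};
  const char* values[] = {"NonMaxSuppression,NonZero,RoiAlign"};
  Ort::ThrowOnError(api.UpdateTensorRTProviderOptions(trt_options, keys, values, 1));

  // Register the TensorRT EP; the excluded DDS nodes fall back to the CUDA EP or CPU.
  Ort::ThrowOnError(
      api.SessionOptionsAppendExecutionProvider_TensorRT_V2(session_options, trt_options));
  Ort::Session session(env, ORT_TSTR("model.onnx"), session_options);

  api.ReleaseTensorRTProviderOptions(trt_options);
  return 0;
}
```

The same key is accepted by the Python bindings through the TensorrtExecutionProvider provider-options dict, as the onnxruntime_pybind_state.cc change below shows.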
chilo-ms authored Feb 21, 2025
1 parent 1b0a2ba commit 23f787e
Showing 12 changed files with 139 additions and 16 deletions.
@@ -88,4 +88,5 @@ struct OrtTensorRTProviderOptionsV2 {

const char* trt_engine_cache_prefix{nullptr}; // specify engine cache prefix
int trt_engine_hw_compatible{0}; // Enable hardware compatibility. Default 0 = false, nonzero = true
const char* trt_op_types_to_exclude{}; // Exclude specific ops from running on TRT.
};
32 changes: 20 additions & 12 deletions onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc
@@ -1379,6 +1379,7 @@ TensorrtExecutionProvider::TensorrtExecutionProvider(const TensorrtExecutionProv
profile_opt_shapes = info.profile_opt_shapes;
cuda_graph_enable_ = info.cuda_graph_enable;
engine_hw_compatible_ = info.engine_hw_compatible;
op_types_to_exclude_ = info.op_types_to_exclude;
} else {
try {
const std::string max_partition_iterations_env = onnxruntime::GetEnvironmentVar(tensorrt_env_vars::kMaxPartitionIterations);
@@ -1565,6 +1566,11 @@ TensorrtExecutionProvider::TensorrtExecutionProvider(const TensorrtExecutionProv
cuda_graph_enable_ = (std::stoi(cuda_graph_enable_env) == 0 ? false : true);
}

const std::string op_types_to_exclude_env = onnxruntime::GetEnvironmentVar(tensorrt_env_vars::kOpTypesToExclude);
if (!op_types_to_exclude_env.empty()) {
op_types_to_exclude_ = op_types_to_exclude_env;
}

} catch (const std::invalid_argument& ex) {
LOGS_DEFAULT(WARNING) << "[TensorRT EP] Invalid Argument (from environment variables): " << ex.what();
} catch (const std::out_of_range& ex) {
@@ -1768,7 +1774,8 @@ TensorrtExecutionProvider::TensorrtExecutionProvider(const TensorrtExecutionProv
<< ", trt_ep_context_embed_mode: " << ep_context_embed_mode_
<< ", trt_cache_prefix: " << cache_prefix_
<< ", trt_engine_hw_compatible: " << engine_hw_compatible_
<< ", trt_onnx_model_bytestream_size_: " << onnx_model_bytestream_size_;
<< ", trt_onnx_model_bytestream_size_: " << onnx_model_bytestream_size_
<< ", trt_op_types_to_exclude: " << op_types_to_exclude_;
}

TensorrtExecutionProvider::~TensorrtExecutionProvider() {
@@ -2482,18 +2489,19 @@ TensorrtExecutionProvider::GetCapability(const GraphViewer& graph,
std::vector<size_t> nodes_vector(number_of_ort_nodes);
std::iota(std::begin(nodes_vector), std::end(nodes_vector), 0);

std::set<std::string> exclude_ops_set;
auto get_exclude_ops_set = [&](std::string node_list_to_exclude) -> std::set<std::string> {
std::set<std::string> set;
if (!node_list_to_exclude.empty()) {
std::stringstream node_list(node_list_to_exclude);
std::string node;
while (std::getline(node_list, node, ',')) {
set.insert(node);
}
}
return set;
};

/*
* There is a known performance issue with the DDS ops (NonMaxSuppression, NonZero and RoiAlign) in TRT 10.
* TRT EP automatically excludes DDS ops from running on TRT.
*/
if (trt_version_ >= 100000 && trt_version_ < 110000) {
exclude_ops_set.insert("NonMaxSuppression");
exclude_ops_set.insert("NonZero");
exclude_ops_set.insert("RoiAlign");
LOGS_DEFAULT(VERBOSE) << "There is a known performance issue with the DDS ops (NonMaxSuppression, NonZero and RoiAlign) in TRT 10. TRT EP automatically excludes DDS ops from running on TRT, if applicable";
}
auto exclude_ops_set = get_exclude_ops_set(op_types_to_exclude_);

SubGraphCollection_t parser_nodes_vector, supported_nodes_vector;
const std::vector<NodeIndex>& node_index = graph.GetNodesInTopologicalOrder(1 /*priority-based topological sort*/);
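For illustration, a standalone sketch of the comma-splitting behavior implemented by the lambda above (`ParseExcludedOpTypes` is our name, not ORT's; like the EP code, it performs no whitespace trimming):

```cpp
#include <cassert>
#include <set>
#include <sstream>
#include <string>

// Split a comma-separated op-type list into a set, mirroring get_exclude_ops_set.
static std::set<std::string> ParseExcludedOpTypes(const std::string& csv) {
  std::set<std::string> result;
  std::stringstream node_list(csv);
  std::string node;
  while (std::getline(node_list, node, ',')) {
    result.insert(node);
  }
  return result;
}

int main() {
  auto excluded = ParseExcludedOpTypes("NonMaxSuppression,NonZero,RoiAlign");
  assert(excluded.size() == 3 && excluded.count("NonZero") == 1);
  assert(ParseExcludedOpTypes("").empty());  // empty option string excludes nothing
  return 0;
}
```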
@@ -57,6 +57,7 @@ static const std::string kDumpEpContextModel = "ORT_DUMP_EP_CONTEXT_MODEL";
static const std::string kEpContextEmbedMode = "ORT_EP_CONTEXT_EMBED_MODE";
static const std::string kEpContextComputeCapabilityEnable = "ORT_EP_CONTEXT_COMPUTE_CAPABILITY_ENABLE";
static const std::string kEngineCachePrefix = "ORT_TENSORRT_CACHE_PREFIX";
static const std::string kOpTypesToExclude = "ORT_TENSORRT_OP_TYPES_TO_EXCLUDE";
// Old env variable for backward compatibility
static const std::string kEngineCachePath = "ORT_TENSORRT_ENGINE_CACHE_PATH";
} // namespace tensorrt_env_vars
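A sketch of the environment-variable route (assuming a POSIX system; `_putenv_s` would be the Windows analogue). Per the constructor hunk above, the variable is read once at EP construction when no explicit provider options are supplied:

```cpp
#include <cstdlib>

int main() {
  // Must be set before the TensorRT EP is constructed, since the constructor
  // reads ORT_TENSORRT_OP_TYPES_TO_EXCLUDE once via GetEnvironmentVar.
  setenv("ORT_TENSORRT_OP_TYPES_TO_EXCLUDE",
         "NonMaxSuppression,NonZero,RoiAlign", /*overwrite=*/1);
  // ... create the ORT session with the TensorRT EP as usual ...
  return 0;
}
```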
@@ -56,6 +56,7 @@ constexpr const char* kDumpEpContextModel = "trt_dump_ep_context_model";
constexpr const char* kEngineHwCompatible = "trt_engine_hw_compatible";
constexpr const char* kONNXBytestream = "trt_onnx_bytestream";
constexpr const char* kONNXBytestreamSize = "trt_onnx_bytestream_size";
constexpr const char* kOpTypesToExclude = "trt_op_types_to_exclude";

} // namespace provider_option_names
} // namespace tensorrt
@@ -134,6 +135,7 @@ TensorrtExecutionProviderInfo TensorrtExecutionProviderInfo::FromProviderOptions
return Status::OK();
})
.AddAssignmentToReference(tensorrt::provider_option_names::kONNXBytestreamSize, info.onnx_bytestream_size)
.AddAssignmentToReference(tensorrt::provider_option_names::kOpTypesToExclude, info.op_types_to_exclude)
.Parse(options)); // add new provider option here.

info.user_compute_stream = user_compute_stream;
@@ -188,6 +190,7 @@ ProviderOptions TensorrtExecutionProviderInfo::ToProviderOptions(const TensorrtE
{tensorrt::provider_option_names::kEngineHwCompatible, MakeStringWithClassicLocale(info.engine_hw_compatible)},
{tensorrt::provider_option_names::kONNXBytestream, MakeStringWithClassicLocale(info.onnx_bytestream)},
{tensorrt::provider_option_names::kONNXBytestreamSize, MakeStringWithClassicLocale(info.onnx_bytestream_size)},
{tensorrt::provider_option_names::kOpTypesToExclude, MakeStringWithClassicLocale(info.op_types_to_exclude)},
};
return options;
}
@@ -206,6 +209,7 @@ ProviderOptions TensorrtExecutionProviderInfo::ToProviderOptions(const OrtTensor
const std::string kProfilesOptShapes_ = empty_if_null(info.trt_profile_opt_shapes);
const std::string kEpContextFilePath_ = empty_if_null(info.trt_ep_context_file_path);
const std::string kOnnxModelFolderPath_ = empty_if_null(info.trt_onnx_model_folder_path);
const std::string kOpTypesToExclude_ = empty_if_null(info.trt_op_types_to_exclude);

const ProviderOptions options{
{tensorrt::provider_option_names::kDeviceId, MakeStringWithClassicLocale(info.device_id)},
@@ -251,6 +255,7 @@ ProviderOptions TensorrtExecutionProviderInfo::ToProviderOptions(const OrtTensor
{tensorrt::provider_option_names::kEngineHwCompatible, MakeStringWithClassicLocale(info.trt_engine_hw_compatible)},
{tensorrt::provider_option_names::kONNXBytestream, MakeStringWithClassicLocale(reinterpret_cast<size_t>(info.trt_onnx_bytestream))},
{tensorrt::provider_option_names::kONNXBytestreamSize, MakeStringWithClassicLocale(info.trt_onnx_bytestream_size)},
{tensorrt::provider_option_names::kOpTypesToExclude, kOpTypesToExclude_},
};
return options;
}
@@ -355,5 +360,6 @@ void TensorrtExecutionProviderInfo::UpdateProviderOptions(void* provider_options
trt_provider_options_v2.trt_engine_hw_compatible = internal_options.engine_hw_compatible;
trt_provider_options_v2.trt_onnx_bytestream = internal_options.onnx_bytestream;
trt_provider_options_v2.trt_onnx_bytestream_size = internal_options.onnx_bytestream_size;
trt_provider_options_v2.trt_op_types_to_exclude = copy_string_if_needed(internal_options.op_types_to_exclude);
}
} // namespace onnxruntime
@@ -60,6 +60,7 @@ struct TensorrtExecutionProviderInfo {
int ep_context_embed_mode{0};
std::string engine_cache_prefix{""};
bool engine_hw_compatible{false};
std::string op_types_to_exclude{""};

static TensorrtExecutionProviderInfo FromProviderOptions(const ProviderOptions& options);
static ProviderOptions ToProviderOptions(const TensorrtExecutionProviderInfo& info);
@@ -118,6 +118,7 @@ struct Tensorrt_Provider : Provider {
info.engine_hw_compatible = options.trt_engine_hw_compatible != 0;
info.onnx_bytestream = options.trt_onnx_bytestream;
info.onnx_bytestream_size = options.trt_onnx_bytestream_size;
info.op_types_to_exclude = options.trt_op_types_to_exclude == nullptr ? "" : options.trt_op_types_to_exclude;

return std::make_shared<TensorrtProviderFactory>(info);
}
1 change: 1 addition & 0 deletions onnxruntime/core/session/provider_bridge_ort.cc
@@ -2583,6 +2583,7 @@ ORT_API(void, OrtApis::ReleaseTensorRTProviderOptions, _Frees_ptr_opt_ OrtTensor
delete[] ptr->trt_profile_opt_shapes;
delete[] ptr->trt_ep_context_file_path;
delete[] ptr->trt_onnx_model_folder_path;
delete[] ptr->trt_op_types_to_exclude;
}

std::unique_ptr<OrtTensorRTProviderOptionsV2> p(ptr);
5 changes: 4 additions & 1 deletion onnxruntime/python/onnxruntime_pybind_state.cc
@@ -526,7 +526,7 @@ std::unique_ptr<IExecutionProvider> CreateExecutionProviderInstance(
// and TRT EP instance, so it won't be released.)
std::string calibration_table, cache_path, cache_prefix, timing_cache_path, lib_path, trt_tactic_sources,
trt_extra_plugin_lib_paths, min_profile, max_profile, opt_profile, ep_context_file_path,
onnx_model_folder_path;
onnx_model_folder_path, trt_op_types_to_exclude;
auto it = provider_options_map.find(type);
if (it != provider_options_map.end()) {
OrtTensorRTProviderOptionsV2 params;
@@ -824,6 +824,9 @@ std::unique_ptr<IExecutionProvider> CreateExecutionProviderInstance(
} else {
ORT_THROW("[ERROR] [TensorRT] The value for the key 'trt_engine_hw_compatible' should be 'True' or 'False'. Default value is 'False'.\n");
}
} else if (option.first == "trt_op_types_to_exclude") {
trt_op_types_to_exclude = option.second;
params.trt_op_types_to_exclude = trt_op_types_to_exclude.c_str();
} else {
ORT_THROW("Invalid TensorRT EP option: ", option.first);
}
10 changes: 10 additions & 0 deletions onnxruntime/test/providers/base_tester.cc
@@ -317,6 +317,11 @@ void BaseTester::ExecuteModel(Model& model, SessionType& session,
ASSERT_EQ(expect_result, ExpectResult::kExpectFailure) << "Initialize failed but expected success: "
<< status.ErrorMessage();

// No need to check the expected failure string if an empty string is given.
if (expected_failure_string.empty()) {
return;
}

// Disable expected_failure_string checks for OpenVINO EP
if (provider_type != kOpenVINOExecutionProvider) {
EXPECT_THAT(status.ErrorMessage(), testing::HasSubstr(expected_failure_string));
@@ -337,6 +342,11 @@
ASSERT_EQ(expect_result, ExpectResult::kExpectFailure) << "Run failed but expected success: "
<< status.ErrorMessage();

// No need to check the expected failure string if an empty string is given.
if (expected_failure_string.empty()) {
return;
}

// Disable expected_failure_string checks for MKL-DNN and OpenVINO EP's
if (provider_type != kDnnlExecutionProvider &&
provider_type != kOpenVINOExecutionProvider) {
@@ -63,13 +63,21 @@ TEST(NonMaxSuppressionOpTest, TwoClasses) {
test.AddInput<int64_t>("max_output_boxes_per_class", {}, {6L});
test.AddInput<float>("iou_threshold", {}, {0.5f});
test.AddInput<float>("score_threshold", {}, {0.0f});
// The selected_indices in ORT are sorted by class, whereas in TRT they are sorted by score,
// so the output needs to be sorted to pass the output check when there is more than one class and the TRT EP is used.
#ifdef USE_TENSORRT
bool sort_output = true;
#else
bool sort_output = false; // default
#endif
test.AddOutput<int64_t>("selected_indices", {6, 3},
{0L, 0L, 3L,
0L, 0L, 0L,
0L, 0L, 5L,
0L, 1L, 3L,
0L, 1L, 0L,
0L, 1L, 5L});
0L, 1L, 5L},
sort_output);
test.Run();
}

@@ -125,6 +133,13 @@ TEST(NonMaxSuppressionOpTest, TwoBatches_TwoClasses) {
0.1f, 0.2f, 0.6f, 0.3f, 0.9f});
test.AddInput<int64_t>("max_output_boxes_per_class", {}, {2L});
test.AddInput<float>("iou_threshold", {}, {0.8f});
// The selected_indices in ORT are sorted by class, whereas in TRT they are sorted by score,
// so the output needs to be sorted to pass the output check when there is more than one class and the TRT EP is used.
#ifdef USE_TENSORRT
bool sort_output = true;
#else
bool sort_output = false; // default
#endif
test.AddOutput<int64_t>("selected_indices", {8, 3},
{0L, 0L, 4L,
0L, 0L, 2L,
@@ -134,7 +149,8 @@
1L, 0L, 4L,
1L, 0L, 1L,
1L, 1L, 4L,
1L, 1L, 1L});
1L, 1L, 1L},
sort_output);
test.Run();
}

@@ -302,7 +318,11 @@ TEST(NonMaxSuppressionOpTest, InconsistentBoxAndScoreShapes) {
test.AddInput<float>("iou_threshold", {}, {0.5f});
test.AddInput<float>("score_threshold", {}, {0.0f});
test.AddOutput<int64_t>("selected_indices", {0, 3}, {});
#ifdef USE_TENSORRT
test.Run(OpTester::ExpectResult::kExpectFailure, ""); // TensorRT EP emits a different failure message; providing an empty string simply skips the error-message check.
#else
test.Run(OpTester::ExpectResult::kExpectFailure, "boxes and scores should have same spatial_dimension.");
#endif
}

TEST(NonMaxSuppressionOpTest, InvalidIOUThreshold) {
@@ -313,7 +333,8 @@
test.AddInput<float>("iou_threshold", {}, {1.2f});
test.AddInput<float>("score_threshold", {}, {0.0f});
test.AddOutput<int64_t>("selected_indices", {0, 3}, {});
test.Run(OpTester::ExpectResult::kExpectFailure, "iou_threshold must be in range [0, 1]");
// TRT is missing a runtime check validating the iou_threshold value. Once the bug is fixed, this unit test will be added back for TRT.
test.Run(OpTester::ExpectResult::kExpectFailure, "iou_threshold must be in range [0, 1]", {kTensorrtExecutionProvider});
}

TEST(NonMaxSuppressionOpTest, EmptyInput) {
10 changes: 10 additions & 0 deletions onnxruntime/test/providers/cpu/tensor/nonzero_op_test.cc
@@ -77,13 +77,23 @@ TEST(NonZeroOpTest, Scalar) {
{
OpTester test{kOpName, kOpVersion};
test.AddInput<int32_t>("X", {}, {0});
#ifdef USE_TENSORRT
// TensorRT follows the ONNX spec, where NonZero produces output shape (0, N) rather than the (1, N) ORT produces for scalar input
test.AddOutput<int64_t>("Y", {0, 0}, {});
#else
test.AddOutput<int64_t>("Y", {1, 0}, {});
#endif
test.Run(so);
}
{
OpTester test{kOpName, kOpVersion};
test.AddInput<int32_t>("X", {}, {1});
#ifdef USE_TENSORRT
// TensorRT follows the ONNX spec, where NonZero produces output shape (0, N) rather than the (1, N) ORT produces for scalar input
test.AddOutput<int64_t>("Y", {0, 1}, {});
#else
test.AddOutput<int64_t>("Y", {1, 1}, {0});
#endif
test.Run(so);
}
}
60 changes: 60 additions & 0 deletions onnxruntime/test/providers/tensorrt/tensorrt_basic_test.cc
@@ -616,6 +616,66 @@ TEST(TensorrtExecutionProviderTest, EPContextNode) {
RunSession(session_object9, run_options, feeds, output_names, expected_dims_mul_m, expected_values_mul_m);
}

TEST(TensorrtExecutionProviderTest, ExcludeOpsTest) {
/* The mnist.onnx looks like this:
* Conv
* |
* Add
* .
* .
* |
* MaxPool
* |
* .
* .
* MaxPool
* |
* Reshape
* |
* MatMul
* .
* .
*
*/
PathString model_name = ORT_TSTR("testdata/mnist.onnx");
SessionOptions so;
so.session_logid = "TensorrtExecutionProviderExcludeOpsTest";
RunOptions run_options;
run_options.run_tag = so.session_logid;
InferenceSession session_object{so, GetEnvironment()};
auto cuda_provider = DefaultCudaExecutionProvider();
auto cpu_allocator = cuda_provider->CreatePreferredAllocators()[1];
std::vector<int64_t> dims_op_x = {1, 1, 28, 28};
std::vector<float> values_op_x(784, 1.0f); // 784=1*1*28*28
OrtValue ml_value_x;
CreateMLValue<float>(cpu_allocator, dims_op_x, values_op_x, &ml_value_x);
NameMLValMap feeds;
feeds.insert(std::make_pair("Input3", ml_value_x));

// prepare outputs
std::vector<std::string> output_names;
output_names.push_back("Plus214_Output_0");
std::vector<OrtValue> fetches;

RemoveCachesByType("./", ".engine");
OrtTensorRTProviderOptionsV2 params;
params.trt_engine_cache_enable = 1;
params.trt_op_types_to_exclude = "MaxPool";
std::unique_ptr<IExecutionProvider> execution_provider = TensorrtExecutionProviderWithOptions(&params);
EXPECT_TRUE(session_object.RegisterExecutionProvider(std::move(execution_provider)).IsOK());
auto status = session_object.Load(model_name);
ASSERT_TRUE(status.IsOK());
status = session_object.Initialize();
ASSERT_TRUE(status.IsOK());
status = session_object.Run(run_options, feeds, output_names, &fetches);
ASSERT_TRUE(status.IsOK());

std::vector<fs::path> engine_files;
engine_files = GetCachesByType("./", ".engine");
// The whole graph should be partitioned into 3 TRT subgraphs and 2 CPU nodes
ASSERT_EQ(engine_files.size(), 3);
}

TEST(TensorrtExecutionProviderTest, TRTPluginsCustomOpTest) {
PathString model_name = ORT_TSTR("testdata/trt_plugin_custom_op_test.onnx");
SessionOptions so;
