diff --git a/java/src/test/java/ai/onnxruntime/InferenceTest.java b/java/src/test/java/ai/onnxruntime/InferenceTest.java
index 15d89b536b39a..e11537492d3a7 100644
--- a/java/src/test/java/ai/onnxruntime/InferenceTest.java
+++ b/java/src/test/java/ai/onnxruntime/InferenceTest.java
@@ -737,7 +737,6 @@ public void testCoreML() throws OrtException {
     runProvider(OrtProvider.CORE_ML);
   }
 
-  @Disabled("DirectML Java API hasn't been supported yet")
   @Test
   @EnabledIfSystemProperty(named = "USE_DML", matches = "1")
   public void testDirectML() throws OrtException {
diff --git a/java/src/test/java/ai/onnxruntime/providers/ProviderOptionsTest.java b/java/src/test/java/ai/onnxruntime/providers/ProviderOptionsTest.java
index fa0b6fd0ef9d9..57c4eb3577fd0 100644
--- a/java/src/test/java/ai/onnxruntime/providers/ProviderOptionsTest.java
+++ b/java/src/test/java/ai/onnxruntime/providers/ProviderOptionsTest.java
@@ -27,7 +27,6 @@
 import java.util.HashMap;
 import java.util.Map;
 import org.junit.jupiter.api.Test;
-import org.junit.jupiter.api.condition.DisabledIfSystemProperty;
 import org.junit.jupiter.api.condition.EnabledIfSystemProperty;
 
 public class ProviderOptionsTest {
@@ -35,7 +34,6 @@ public class ProviderOptionsTest {
 
   @Test
   @EnabledIfSystemProperty(named = "USE_CUDA", matches = "1")
-  @DisabledIfSystemProperty(named = "NO_CUDA_TEST", matches = "1")
   public void testCUDAOptions() throws OrtException {
     // Test standard options
     OrtCUDAProviderOptions cudaOpts = new OrtCUDAProviderOptions(0);
@@ -63,7 +61,6 @@ public void testCUDAOptions() throws OrtException {
 
   @Test
   @EnabledIfSystemProperty(named = "USE_TENSORRT", matches = "1")
-  @DisabledIfSystemProperty(named = "NO_CUDA_TEST", matches = "1")
   public void testTensorRT() throws OrtException {
     // Test standard options
     OrtTensorRTProviderOptions rtOpts = new OrtTensorRTProviderOptions(0);
diff --git a/onnxruntime/test/common/cuda_op_test_utils.h b/onnxruntime/test/common/cuda_op_test_utils.h
index d3e069237217e..6f3e460628566 100644
--- a/onnxruntime/test/common/cuda_op_test_utils.h
+++ b/onnxruntime/test/common/cuda_op_test_utils.h
@@ -5,11 +5,6 @@
 
 #include "test/util/include/default_providers.h"
 
-#define SKIP_CUDA_TEST_WITH_DML                                            \
-  if (DefaultCudaExecutionProvider() == nullptr) {                         \
-    GTEST_SKIP() << "CUDA Tests are not supported while DML is enabled";   \
-  }
-
 namespace onnxruntime {
 namespace test {
 
@@ -18,10 +13,6 @@ namespace test {
 int GetCudaArchitecture();
 
 inline bool HasCudaEnvironment(int min_cuda_architecture) {
-  if (DefaultCudaExecutionProvider() == nullptr) {
-    return false;
-  }
-
   if (DefaultCudaExecutionProvider().get() == nullptr) {
     return false;
   }
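With the DML escape hatch removed, `HasCudaEnvironment()` is again a plain availability-plus-architecture check. A minimal usage sketch (my illustration, not part of this patch; the test name and the 530 threshold are assumed example values):

```cpp
// Hypothetical test showing the guard pattern HasCudaEnvironment() supports;
// 530 (sm_53, a common fp16 threshold in these tests) is an assumed example.
#include "test/common/cuda_op_test_utils.h"
#include "gtest/gtest.h"

TEST(ExampleCudaGuard, SkipsWhenNoCudaEnvironment) {
  if (!onnxruntime::test::HasCudaEnvironment(/*min_cuda_architecture=*/530)) {
    GTEST_SKIP() << "CUDA environment not available";
  }
  // ... CUDA-dependent assertions would go here ...
}
```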
diff --git a/onnxruntime/test/contrib_ops/beam_search_test.cc b/onnxruntime/test/contrib_ops/beam_search_test.cc
index 8c69e2d9810b8..9f4ee071925b4 100644
--- a/onnxruntime/test/contrib_ops/beam_search_test.cc
+++ b/onnxruntime/test/contrib_ops/beam_search_test.cc
@@ -75,9 +75,6 @@ TEST(BeamSearchTest, GptBeamSearchFp32) {
   const char* const output_names[] = {"sequences"};
 
   Ort::SessionOptions session_options;
-#if defined(USE_CUDA) && defined(USE_DML)
-  SKIP_CUDA_TEST_WITH_DML;
-#endif
 #ifdef USE_CUDA
   OrtCUDAProviderOptionsV2 cuda_options;
   cuda_options.use_tf32 = false;
@@ -171,9 +168,6 @@ TEST(BeamSearchTest, GptBeamSearchFp16) {
   bool enable_rocm = (nullptr != DefaultRocmExecutionProvider().get());
   if (enable_cuda || enable_rocm) {
     Ort::SessionOptions session_options;
-#if defined(USE_CUDA) && defined(USE_DML)
-    SKIP_CUDA_TEST_WITH_DML;
-#endif
 #ifdef USE_CUDA
     OrtCUDAProviderOptionsV2 cuda_options;
     cuda_options.use_tf32 = false;
diff --git a/onnxruntime/test/contrib_ops/bias_dropout_op_test.cc b/onnxruntime/test/contrib_ops/bias_dropout_op_test.cc
index 297629b015796..027d4b3fff1b0 100644
--- a/onnxruntime/test/contrib_ops/bias_dropout_op_test.cc
+++ b/onnxruntime/test/contrib_ops/bias_dropout_op_test.cc
@@ -181,9 +181,6 @@ void RunBiasDropoutTest(const bool use_mask, const std::vector<int64_t>& input_s
   t.SetCustomOutputVerifier(output_verifier);
   std::vector<std::unique_ptr<IExecutionProvider>> t_eps;
 #ifdef USE_CUDA
-  if (DefaultCudaExecutionProvider() == nullptr) {
-    return;
-  }
   t_eps.emplace_back(DefaultCudaExecutionProvider());
 #elif USE_ROCM
   t_eps.emplace_back(DefaultRocmExecutionProvider());
diff --git a/onnxruntime/test/contrib_ops/bitmask_dropout_op_test.cc b/onnxruntime/test/contrib_ops/bitmask_dropout_op_test.cc
index 26b0e3a4dd7a9..7ca4e1004066c 100644
--- a/onnxruntime/test/contrib_ops/bitmask_dropout_op_test.cc
+++ b/onnxruntime/test/contrib_ops/bitmask_dropout_op_test.cc
@@ -61,9 +61,7 @@ void RunTestForInference(const std::vector<int64_t>& input_dims, bool has_ratio
 
   std::vector<std::unique_ptr<IExecutionProvider>> test_eps;
 #ifdef USE_CUDA
-  if (DefaultCudaExecutionProvider() != nullptr) {
-    test_eps.emplace_back(DefaultCudaExecutionProvider());
-  }
+  test_eps.emplace_back(DefaultCudaExecutionProvider());
 #elif USE_ROCM
   test_eps.emplace_back(DefaultRocmExecutionProvider());
 #endif
@@ -124,9 +122,6 @@ void RunTestForTraining(const std::vector<int64_t>& input_dims) {
 
   std::vector<std::unique_ptr<IExecutionProvider>> dropout_eps;
 #ifdef USE_CUDA
-  if (DefaultCudaExecutionProvider() == nullptr) {
-    return;
-  }
   dropout_eps.emplace_back(DefaultCudaExecutionProvider());
 #elif USE_ROCM
   dropout_eps.emplace_back(DefaultRocmExecutionProvider());
diff --git a/onnxruntime/test/contrib_ops/layer_norm_test.cc b/onnxruntime/test/contrib_ops/layer_norm_test.cc
index b414a98c4e756..46082e1b0cd31 100644
--- a/onnxruntime/test/contrib_ops/layer_norm_test.cc
+++ b/onnxruntime/test/contrib_ops/layer_norm_test.cc
@@ -2,7 +2,6 @@
 // Licensed under the MIT License.
#include "test/providers/compare_provider_test_utils.h" -#include "test/util/include/default_providers.h" namespace onnxruntime { namespace test { @@ -80,20 +79,14 @@ static void TestLayerNorm(const std::vector& x_dims, #endif #ifdef USE_CUDA - if (DefaultCudaExecutionProvider() != nullptr) { - test.CompareWithCPU(kCudaExecutionProvider); - } + test.CompareWithCPU(kCudaExecutionProvider); #elif USE_ROCM test.CompareWithCPU(kRocmExecutionProvider); +#elif USE_DML + test.CompareWithCPU(kDmlExecutionProvider); #elif USE_WEBGPU test.CompareWithCPU(kWebGpuExecutionProvider); #endif - -#ifdef USE_DML - if (DefaultDmlExecutionProvider() != nullptr) { - test.CompareWithCPU(kDmlExecutionProvider); - } -#endif } TEST(CudaKernelTest, LayerNorm_NullInput) { diff --git a/onnxruntime/test/contrib_ops/matmul_4bits_test.cc b/onnxruntime/test/contrib_ops/matmul_4bits_test.cc index 6dedce24e7e07..eebe9197573c6 100644 --- a/onnxruntime/test/contrib_ops/matmul_4bits_test.cc +++ b/onnxruntime/test/contrib_ops/matmul_4bits_test.cc @@ -490,17 +490,13 @@ void RunTest(int64_t M, int64_t N, int64_t K, int64_t block_size, int64_t accura std::vector> execution_providers; if (use_float16) { #ifdef USE_CUDA - if (DefaultCudaExecutionProvider() != nullptr) { - execution_providers.push_back(DefaultCudaExecutionProvider()); - } + execution_providers.push_back(DefaultCudaExecutionProvider()); #endif #ifdef USE_ROCM execution_providers.push_back(DefaultRocmExecutionProvider()); #endif #ifdef USE_DML - if (DefaultDmlExecutionProvider() != nullptr) { - execution_providers.push_back(DefaultDmlExecutionProvider()); - } + execution_providers.push_back(DefaultDmlExecutionProvider()); #endif #ifdef USE_WEBGPU execution_providers.push_back(DefaultWebGpuExecutionProvider()); @@ -518,11 +514,8 @@ void RunTest(int64_t M, int64_t N, int64_t K, int64_t block_size, int64_t accura } // namespace TEST(MatMulNBits, Float16Cuda) { -#if defined(USE_CUDA) || defined(USE_ROCM) || defined(USE_DML) - std::vector has_gidx_options = {true, false}; - if (DefaultDmlExecutionProvider() != nullptr) { - has_gidx_options.assign(1, false); - } +#if defined(USE_CUDA) || defined(USE_ROCM) + auto has_gidx_options = {true, false}; #else auto has_gidx_options = {false}; #endif @@ -533,9 +526,7 @@ TEST(MatMulNBits, Float16Cuda) { for (auto block_size : {16, 32, 64, 128}) { for (auto has_gidx : has_gidx_options) { #ifdef USE_DML - if (DefaultDmlExecutionProvider() != nullptr) { - RunTest(M, N, K, block_size, 0, false, true, has_gidx, true, 0.04f); - } + RunTest(M, N, K, block_size, 0, false, true, has_gidx, true, 0.04f); #else RunTest(M, N, K, block_size, 0, false, true, has_gidx); RunTest(M, N, K, block_size, 0, true, true, has_gidx, false); @@ -548,16 +539,12 @@ TEST(MatMulNBits, Float16Cuda) { } TEST(MatMulNBits, Float16Large) { -#if defined(USE_CUDA) || defined(USE_DML) +#ifdef USE_DML // For some reason, the A10 machine that runs these tests during CI has a much bigger error than all retail // machines we tested on. All consumer-grade machines from Nvidia/AMD/Intel seem to pass these tests with an // absolute error of 0.08, but the A10 has errors going as high as 0.22. Ultimately, given the large number // of elements in this test, ULPs should probably be used instead of absolute/relative tolerances. 
-  float abs_error = 0.05f;
-  if (DefaultDmlExecutionProvider() != nullptr) {
-    // it means the ep is dml in runtime, the abs_error is changed to 0.3f
-    abs_error = 0.3f;
-  }
+  float abs_error = 0.3f;
 #elif USE_WEBGPU
   // See Intel A770 to pass these tests with an absolute error of 0.08.
   float abs_error = 0.08f;
@@ -573,6 +560,7 @@
     }
   }
 }
+
 #endif  // defined(USE_CUDA) || defined(USE_ROCM) || defined(USE_DML)
 }  // namespace test
 }  // namespace onnxruntime
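The Float16Large comment above floats replacing absolute tolerances with ULP distances; a self-contained sketch of that idea (my illustration, not code from this patch or the repository):

```cpp
// Sketch of a ULP-distance comparison for float32, as the comment suggests.
#include <cmath>
#include <cstdint>
#include <cstring>
#include <limits>

// Map float bits onto a monotonic integer line so adjacent representable
// floats differ by exactly 1 (+0.0f and -0.0f both map to 0).
static int64_t ToOrderedInt(float f) {
  int32_t i;
  std::memcpy(&i, &f, sizeof(i));
  return i >= 0 ? int64_t{i} : INT64_C(-2147483648) - int64_t{i};
}

// Number of representable float32 values between a and b.
static int64_t UlpDistance(float a, float b) {
  if (std::isnan(a) || std::isnan(b)) return std::numeric_limits<int64_t>::max();
  const int64_t d = ToOrderedInt(a) - ToOrderedInt(b);
  return d < 0 ? -d : d;
}
```

A tolerance expressed in ULPs scales with the magnitude of the values being compared, which is why it would behave more uniformly across consumer GPUs and the A10 than a single absolute bound.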
diff --git a/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc b/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc
index d88c3131a4ca5..8d7629b5fda1c 100644
--- a/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc
+++ b/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc
@@ -227,7 +227,7 @@ TEST(MatMulIntegerToFloat, HasZeroPoint_HasBias_test_U8S8) {
 }
 
 // DML EP supports Float16 output type and Signed A Matrix and Unsigned B Matric for Float32 output
-#if defined(USE_DML) && !defined(USE_CUDA)
+#if defined(USE_DML)
 
 TEST(MatMulIntegerToFloat, HasZeroPoint_NoBias_test_S8U8) {
   RunMatMulIntegerToFloatTest();
diff --git a/onnxruntime/test/contrib_ops/tensor_op_test.cc b/onnxruntime/test/contrib_ops/tensor_op_test.cc
index d5e2ddebfe67f..bc2ff5f4f724d 100644
--- a/onnxruntime/test/contrib_ops/tensor_op_test.cc
+++ b/onnxruntime/test/contrib_ops/tensor_op_test.cc
@@ -121,15 +121,7 @@ void MeanVarianceNormalizationAcrossChannels(bool across_channels, bool normaliz
   test.AddAttribute("normalize_variance", normalize_variance ? one : zero);
   test.AddInput<float>("input", {N, C, H, W}, X);
   test.AddOutput<float>("output", {N, C, H, W}, result);
-#if defined(USE_CUDA) && defined(USE_DML)
-  if (DefaultCudaExecutionProvider() == nullptr) {
-    test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kOpenVINOExecutionProvider, kCudaExecutionProvider, kTensorrtExecutionProvider});
-  } else if (DefaultDmlExecutionProvider() == nullptr) {
-    test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kOpenVINOExecutionProvider, kDmlExecutionProvider, kTensorrtExecutionProvider});
-  }
-#else
   test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kOpenVINOExecutionProvider, kTensorrtExecutionProvider});  // OpenVINO doesn't support MVN operator below opset 9. TensorRT doesn't support opset 8 of MVN operator.
-#endif
 }
 
 void MeanVarianceNormalizationPerChannel(bool across_channels, bool normalize_variance) {
@@ -196,15 +188,7 @@ void MeanVarianceNormalizationPerChannel(bool across_channels, bool normalize_va
   test.AddAttribute("normalize_variance", normalize_variance ? one : zero);
   test.AddInput<float>("input", {N, C, H, W}, X);
   test.AddOutput<float>("output", {N, C, H, W}, result);
-#if defined(USE_CUDA) && defined(USE_DML)
-  if (DefaultCudaExecutionProvider() == nullptr) {
-    test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kOpenVINOExecutionProvider, kCudaExecutionProvider, kTensorrtExecutionProvider});
-  } else if (DefaultDmlExecutionProvider() == nullptr) {
-    test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kOpenVINOExecutionProvider, kDmlExecutionProvider, kTensorrtExecutionProvider});
-  }
-#else
   test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kOpenVINOExecutionProvider, kTensorrtExecutionProvider});  // OpenVINO doesn't support MVN operator below opset 9. TensorRT doesn't support opset 8 of MVN operator.
-#endif
 }
 
 TEST(MVNContribOpTest, MeanVarianceNormalizationCPUTest_Version1_TO_8) {
@@ -246,9 +230,7 @@ TEST(UnfoldTensorOpTest, LastDim) {
 
   std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
 #ifdef USE_CUDA
-  if (DefaultCudaExecutionProvider() != nullptr) {
-    execution_providers.push_back(DefaultCudaExecutionProvider());
-  }
+  execution_providers.push_back(DefaultCudaExecutionProvider());
 #endif
   execution_providers.push_back(DefaultCpuExecutionProvider());
   tester.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers);
diff --git a/onnxruntime/test/framework/allocation_planner_test.cc b/onnxruntime/test/framework/allocation_planner_test.cc
index adab93908cdc4..eaebac177ca91 100644
--- a/onnxruntime/test/framework/allocation_planner_test.cc
+++ b/onnxruntime/test/framework/allocation_planner_test.cc
@@ -28,7 +28,6 @@ using json = nlohmann::json;
 #ifdef USE_CUDA
 #include "core/providers/cuda/cuda_execution_provider.h"
 #include "core/providers/cuda/cuda_provider_factory.h"
-#include "test/common/cuda_op_test_utils.h"
 #endif  // USE_CUDA
 #include "core/session/onnxruntime_session_options_config_keys.h"
 using namespace ONNX_NAMESPACE;
@@ -897,9 +896,6 @@ TEST_F(PlannerTest, LocationPlanningForPassThroughExplicitAndImplicitSubgraphInp
   SessionOptions so;
   InferenceSession sess{so, GetEnvironment()};
 
-  if (DefaultCudaExecutionProvider() == nullptr) {
-    return;
-  }
   auto status = sess.RegisterExecutionProvider(DefaultCudaExecutionProvider());
   ASSERT_TRUE(status.IsOK());
@@ -1042,9 +1038,6 @@ TEST_F(PlannerTest, LocationPlanningForInitializersOnlyUsedInANestedSubgraph) {
   SessionOptions so;
   InferenceSession sess{so, GetEnvironment()};
 
-  if (DefaultCudaExecutionProvider() == nullptr) {
-    return;
-  }
   auto status = sess.RegisterExecutionProvider(DefaultCudaExecutionProvider());
   ASSERT_TRUE(status.IsOK());
@@ -1152,9 +1145,6 @@ TEST_F(PlannerTest, LocationPlanningForInitializersUsedOnDifferentDevicesInMainG
   SessionOptions so;
   InferenceSession sess{so, GetEnvironment()};
 
-  if (DefaultCudaExecutionProvider() == nullptr) {
-    return;
-  }
   auto status = sess.RegisterExecutionProvider(DefaultCudaExecutionProvider());
   ASSERT_TRUE(status.IsOK());
@@ -1247,9 +1237,6 @@ TEST_F(PlannerTest, LocationPlanningForImplicitInputsWithoutExplicitConsumersInM
   SessionOptions so;
   InferenceSession sess{so, GetEnvironment()};
 
-  if (DefaultCudaExecutionProvider() == nullptr) {
-    return;
-  }
   auto status = sess.RegisterExecutionProvider(DefaultCudaExecutionProvider());
   ASSERT_TRUE(status.IsOK());
@@ -1282,10 +1269,6 @@ TEST_F(PlannerTest, LocationPlanningForImplicitInputsWithoutExplicitConsumersInM
 // Test MultiStream scenario for the graph:
 // node1(CPU ep)->node2(CPU ep)->node3(CUDA ep)->node4(CPU ep)
 TEST_F(PlannerTest, MultiStream) {
-#if defined(USE_CUDA) && defined(USE_DML)
-  SKIP_CUDA_TEST_WITH_DML;
-#endif
-
   ONNX_NAMESPACE::TensorProto tensor;
   tensor.add_dims(1);
   tensor.add_float_data(1.0f);
@@ -1304,7 +1287,6 @@ TEST_F(PlannerTest, MultiStream) {
   onnxruntime::ProviderInfo_CUDA& ep = onnxruntime::GetProviderInfo_CUDA();
   auto epFactory = ep.CreateExecutionProviderFactory(epi);
   std::unique_ptr<IExecutionProvider> execution_provider = epFactory->CreateProvider();
-
   ORT_THROW_IF_ERROR(GetExecutionProviders().Add("CUDAExecutionProvider", std::move(execution_provider)));
 
   CreatePlan({}, false);
@@ -1332,9 +1314,6 @@
 // node3
 // All 3 nodes are CUDA EP, node1 is in stream0, node2 is in stream1, node3 is in stream2
 TEST_F(PlannerTest, MultiStream1StreamWaitFor2Streams) {
-#if defined(USE_CUDA) && defined(USE_DML)
-  SKIP_CUDA_TEST_WITH_DML;
-#endif
   std::unique_ptr<::onnxruntime::KernelDef> cudaKernel = KernelDefBuilder().SetName("Transpose").Provider(kCudaExecutionProvider).SinceVersion(1, 10).Build();
   std::unique_ptr<::onnxruntime::KernelDef> cudaKernelAdd = KernelDefBuilder().SetName("Add").Provider(kCudaExecutionProvider).SinceVersion(1, 10).Build();
   std::string Graph_input("Graph_input"), Arg1("Arg1"), Arg2("Arg2"), Arg3("Arg3"), node1("node1"), node2("node2"), node3("node3");
@@ -1376,9 +1355,6 @@
 // stream 1: node2 (CPU EP)
 // node1's output, which is consumed by both node2 and node3, is in CPU.
 TEST_F(PlannerTest, MultiStreamCudaEPNodeCPUOutput) {
-#if defined(USE_CUDA) && defined(USE_DML)
-  SKIP_CUDA_TEST_WITH_DML;
-#endif
   MemcpyToHostInCuda_TransposeInCudaAndCpu("./testdata/multi_stream_models/memcpyToHost_same_stream_with_transpose.json");
   EXPECT_EQ(GetState().GetExecutionPlan()->execution_plan.size(), 2) << "2 logic streams";
   EXPECT_EQ(GetState().GetExecutionPlan()->execution_plan[0]->steps_.size(), 5) << "stream 0 has 5 steps";
@@ -1400,11 +1376,6 @@
 // TODO(leca): there is a bug in the corresponding graph that node2 will be visited twice when traversing node1's output nodes
 // (see: for (auto it = node->OutputNodesBegin(); it != node->OutputNodesEnd(); ++it) in BuildExecutionPlan()). We can just break the loop and don't need the extra variables once it is fixed
 TEST_F(PlannerTest, MultiStreamMultiOutput) {
-#if defined(USE_CUDA) && defined(USE_DML)
-  if (DefaultCudaExecutionProvider() == nullptr) {
-    return;
-  }
-#endif
   std::unique_ptr<::onnxruntime::KernelDef> cudaKernel = KernelDefBuilder().SetName("RNN").Provider(kCudaExecutionProvider).SinceVersion(7).Build();
   std::string Graph_input1("Graph_input1"), Graph_input2("Graph_input2"), Graph_input3("Graph_input3"), Arg1("Arg1"), Arg2("Arg2"), Arg3("Arg3"), node1("node1"), node2("node2");
   std::vector<onnxruntime::NodeArg*> input1{Arg(Graph_input1), Arg(Graph_input2), Arg(Graph_input3)}, output1{Arg(Arg1), Arg(Arg2)}, input2{Arg(Arg1), Arg(Arg2)}, output2{Arg(Arg3)};
@@ -1442,9 +1413,6 @@
 // TODO(leca): the ideal case is there is only 1 wait step before launching node3,
 // as there is a specific order between node1 and node2 if they are in the same stream, thus node3 will only need to wait the latter one
 TEST_F(PlannerTest, MultiStream2NodesSameStreamConsumedBy1NodeInDifferentStream) {
-#if defined(USE_CUDA) && defined(USE_DML)
-  SKIP_CUDA_TEST_WITH_DML;
-#endif
   std::unique_ptr<::onnxruntime::KernelDef> cudaKernel = KernelDefBuilder().SetName("Transpose").Provider(kCudaExecutionProvider).SinceVersion(1, 10).Build();
   std::string Graph_input1("Graph_input1"), Graph_input2("Graph_input2"), Graph_input3("Graph_input3"), Arg1("Arg1"), Arg2("Arg2"), Arg3("Arg3"), node1("node1"), node2("node2"), node3("node3");
   std::vector<onnxruntime::NodeArg*> input1{Arg(Graph_input1)}, input2{Arg(Graph_input2)}, output1{Arg(Arg1)}, output2{Arg(Arg2)}, input3{Arg(Arg1), Arg(Arg2)}, output3{Arg(Arg3)};
@@ -1482,9 +1450,6 @@
 
 #if !defined(__wasm__) && defined(ORT_ENABLE_STREAM)
 TEST_F(PlannerTest, ParaPlanCreation) {
-#if defined(USE_CUDA) && defined(USE_DML)
-  SKIP_CUDA_TEST_WITH_DML;
-#endif
   TypeProto graph_in_type;
   graph_in_type.mutable_tensor_type()->set_elem_type(TensorProto_DataType_FLOAT);
   auto* graph_in_shape = graph_in_type.mutable_tensor_type()->mutable_shape();
@@ -1926,10 +1891,6 @@
 }
 
 TEST_F(PlannerTest, TestMultiStreamConfig) {
-#if defined(USE_CUDA) && defined(USE_DML)
-  SKIP_CUDA_TEST_WITH_DML;
-#endif
-
   const char* type = "DeviceBasedPartitioner";
   constexpr size_t type_len = 22;
@@ -2003,10 +1964,6 @@ TEST_F(PlannerTest, TestMultiStreamSaveConfig) {
 
 // Load with partition config where a node is missing, session load expected to fail.
 TEST_F(PlannerTest, TestMultiStreamMissingNodeConfig) {
-#if defined(USE_CUDA) && defined(USE_DML)
-  SKIP_CUDA_TEST_WITH_DML;
-#endif
-
   const char* config_file_path = "./testdata/multi_stream_models/conv_add_relu_single_stream_missing_node.json";
   SessionOptions sess_opt;
   sess_opt.graph_optimization_level = TransformerLevel::Default;
@@ -2027,9 +1984,6 @@
 
 // Load with partition config where streams and devices has mismatch
 TEST_F(PlannerTest, TestMultiStreamMismatchDevice) {
-#if defined(USE_CUDA) && defined(USE_DML)
-  SKIP_CUDA_TEST_WITH_DML;
-#endif
   const char* config_file_path = "./testdata/multi_stream_models/conv_add_relu_single_stream_mismatch_device.json";
   SessionOptions sess_opt;
   sess_opt.graph_optimization_level = TransformerLevel::Default;
@@ -2055,9 +2009,6 @@ TEST_F(PlannerTest, TestCpuIf) {
   sess_opt.graph_optimization_level = TransformerLevel::Default;
   InferenceSession sess(sess_opt, GetEnvironment(), ORT_TSTR("./testdata/multi_stream_models/cpu_if.onnx"));
-  if (DefaultCudaExecutionProvider() == nullptr) {
-    return;
-  }
   ASSERT_STATUS_OK(sess.RegisterExecutionProvider(DefaultCudaExecutionProvider()));
   ASSERT_STATUS_OK(sess.Load());
   ASSERT_STATUS_OK(sess.Initialize());
@@ -2118,17 +2069,10 @@
 //  onnx.save(model, 'issue_19480.onnx')
 //
 TEST(AllocationPlannerTest, ReusedInputCrossDifferentStreams) {
-#if defined(USE_CUDA) && defined(USE_DML)
-  SKIP_CUDA_TEST_WITH_DML;
-#endif
-
   SessionOptions sess_opt;
   sess_opt.graph_optimization_level = TransformerLevel::Default;
   InferenceSession sess(sess_opt, GetEnvironment(), ORT_TSTR("./testdata/multi_stream_models/issue_19480.onnx"));
-  if (DefaultCudaExecutionProvider() == nullptr) {
-    return;
-  }
   auto status = sess.RegisterExecutionProvider(DefaultCudaExecutionProvider());
   status = sess.Load();
   status = sess.Initialize();
diff --git a/onnxruntime/test/framework/cuda/fence_cuda_test.cc b/onnxruntime/test/framework/cuda/fence_cuda_test.cc
index 3e5ef30e7ebef..e28327941dda4 100644
--- a/onnxruntime/test/framework/cuda/fence_cuda_test.cc
+++ b/onnxruntime/test/framework/cuda/fence_cuda_test.cc
@@ -115,9 +115,6 @@ TEST(CUDAFenceTests, DISABLED_PartOnCPU) {
   SessionOptions so;
   FenceCudaTestInferenceSession session(so, GetEnvironment());
   ASSERT_STATUS_OK(LoadInferenceSessionFromModel(session, *model));
-  if (DefaultCudaExecutionProvider() == nullptr) {
-    return;
-  }
   ASSERT_STATUS_OK(session.RegisterExecutionProvider(DefaultCudaExecutionProvider()));
   ASSERT_TRUE(session.Initialize().IsOK());
   ASSERT_TRUE(1 == CountCopyNodes(graph));
@@ -167,9 +164,6 @@ TEST(CUDAFenceTests, TileWithInitializer) {
   SessionOptions so;
   FenceCudaTestInferenceSession session(so, GetEnvironment());
   ASSERT_STATUS_OK(LoadInferenceSessionFromModel(session, *model));
-  if (DefaultCudaExecutionProvider() == nullptr) {
-    return;
-  }
   ASSERT_STATUS_OK(session.RegisterExecutionProvider(DefaultCudaExecutionProvider()));
   ASSERT_STATUS_OK(session.Initialize());
@@ -230,9 +224,6 @@ TEST(CUDAFenceTests, TileWithComputedInput) {
   SessionOptions so;
   FenceCudaTestInferenceSession session(so, GetEnvironment());
   ASSERT_STATUS_OK(LoadInferenceSessionFromModel(session, *model));
-  if (DefaultCudaExecutionProvider() == nullptr) {
-    return;
-  }
   ASSERT_STATUS_OK(session.RegisterExecutionProvider(DefaultCudaExecutionProvider()));
   ASSERT_TRUE(session.Initialize().IsOK());
diff --git a/onnxruntime/test/framework/inference_session_test.cc b/onnxruntime/test/framework/inference_session_test.cc
index 7f4616c964e33..740c566794f15 100644
--- a/onnxruntime/test/framework/inference_session_test.cc
+++ b/onnxruntime/test/framework/inference_session_test.cc
@@ -34,7 +34,6 @@
 #ifdef USE_CUDA
 #include "core/providers/cuda/cuda_provider_factory.h"
 #include "core/providers/cuda/gpu_data_transfer.h"
-#include "test/common/cuda_op_test_utils.h"
 #endif
 #ifdef USE_TENSORRT
 #include "core/providers/tensorrt/tensorrt_provider_options.h"
@@ -636,9 +635,6 @@ TEST(InferenceSessionTests, CheckRunProfilerWithSessionOptions) {
   InferenceSession session_object(so, GetEnvironment());
 #ifdef USE_CUDA
-  if (DefaultCudaExecutionProvider() == nullptr) {
-    return;
-  }
   ASSERT_STATUS_OK(session_object.RegisterExecutionProvider(DefaultCudaExecutionProvider()));
 #endif
 #ifdef USE_ROCM
@@ -693,9 +689,6 @@ TEST(InferenceSessionTests, CheckRunProfilerWithSessionOptions2) {
   InferenceSession session_object(so, GetEnvironment());
 #ifdef USE_CUDA
-  if (DefaultCudaExecutionProvider() == nullptr) {
-    return;
-  }
   ASSERT_STATUS_OK(session_object.RegisterExecutionProvider(DefaultCudaExecutionProvider()));
 #endif
 #ifdef USE_ROCM
@@ -1049,9 +1042,6 @@ static void TestBindHelper(const std::string& log_str,
   if (bind_provider_type == kCudaExecutionProvider || bind_provider_type == kRocmExecutionProvider) {
 #ifdef USE_CUDA
     auto provider = DefaultCudaExecutionProvider();
-    if (provider == nullptr) {
-      return;
-    }
     gpu_provider = provider.get();
     ASSERT_STATUS_OK(session_object.RegisterExecutionProvider(std::move(provider)));
 #endif
@@ -1647,9 +1637,6 @@ TEST(InferenceSessionTests, Test3LayerNestedSubgraph) {
 #if USE_TENSORRT
   ASSERT_STATUS_OK(session_object.RegisterExecutionProvider(DefaultTensorrtExecutionProvider()));
 #elif USE_CUDA
-  if (DefaultCudaExecutionProvider() == nullptr) {
-    return;
-  }
   ASSERT_STATUS_OK(session_object.RegisterExecutionProvider(DefaultCudaExecutionProvider()));
 #elif USE_ROCM
   ASSERT_STATUS_OK(session_object.RegisterExecutionProvider(DefaultRocmExecutionProvider()));
@@ -1802,9 +1789,6 @@ TEST(InferenceSessionTests, Test2LayerNestedSubgraph) {
 #if USE_TENSORRT
   ASSERT_STATUS_OK(session_object.RegisterExecutionProvider(DefaultTensorrtExecutionProvider()));
 #elif USE_CUDA
-  if (DefaultCudaExecutionProvider() == nullptr) {
-    return;
-  }
   ASSERT_STATUS_OK(session_object.RegisterExecutionProvider(DefaultCudaExecutionProvider()));
 #elif USE_ROCM
   ASSERT_STATUS_OK(session_object.RegisterExecutionProvider(DefaultRocmExecutionProvider()));
@@ -2160,9 +2144,6 @@ TEST(InferenceSessionTests, TestStrictShapeInference) {
 #ifdef USE_CUDA
 // disable it, since we are going to enable parallel execution with cuda ep
 TEST(InferenceSessionTests, DISABLED_TestParallelExecutionWithCudaProvider) {
-#if defined(USE_CUDA) && defined(USE_DML)
-  SKIP_CUDA_TEST_WITH_DML;
-#endif
   string model_uri = "testdata/transform/fusion/fuse-conv-bn-mul-add-unsqueeze.onnx";
 
   SessionOptions so;
@@ -2186,10 +2167,6 @@
 }
 
 TEST(InferenceSessionTests, TestArenaShrinkageAfterRun) {
-#if defined(USE_CUDA) && defined(USE_DML)
-  SKIP_CUDA_TEST_WITH_DML;
-#endif
-
   OrtArenaCfg arena_cfg;
   arena_cfg.arena_extend_strategy = 1;  // kSameAsRequested
diff --git a/onnxruntime/test/framework/memcpy_transformer_test.cc b/onnxruntime/test/framework/memcpy_transformer_test.cc
index 2313f00e4d123..6e86e5b58aead 100644
--- a/onnxruntime/test/framework/memcpy_transformer_test.cc
+++ b/onnxruntime/test/framework/memcpy_transformer_test.cc
@@ -9,9 +9,6 @@
 #include "default_providers.h"
 #include "gtest/gtest.h"
 #include "test_utils.h"
-#ifdef USE_CUDA
-#include "test/common/cuda_op_test_utils.h"
-#endif
 #include "test/test_environment.h"
 #include "asserts.h"
@@ -77,9 +74,6 @@ void ExpectCopy(const onnxruntime::Node& source, const std::string copy_op,
 
 #ifdef USE_CUDA
 TEST(TransformerTest, MemcpyTransformerTest) {
-#if defined(USE_CUDA) && defined(USE_DML)
-  SKIP_CUDA_TEST_WITH_DML;
-#endif
   std::unordered_map<std::string, int> domain_to_version;
   domain_to_version[kOnnxDomain] = 7;
   auto model = std::make_shared<Model>("test", false, ModelMetaData(), PathString(),
@@ -112,9 +106,7 @@ TEST(TransformerTest, MemcpyTransformerTest) {
   KernelRegistryManager kernel_registry_manager;
   ExecutionProviders execution_providers;
-#if defined(USE_CUDA)
   ASSERT_STATUS_OK(execution_providers.Add(onnxruntime::kCudaExecutionProvider, DefaultCudaExecutionProvider()));
-#endif
   ASSERT_STATUS_OK(execution_providers.Add(onnxruntime::kCpuExecutionProvider, std::make_unique<CPUExecutionProvider>(CPUExecutionProviderInfo())));
   KernelRegistryManager test_registry_manager;
@@ -137,9 +129,6 @@
 }
 
 TEST(TransformerTest, MemcpyTransformerTestCudaFirst) {
-#if defined(USE_CUDA) && defined(USE_DML)
-  SKIP_CUDA_TEST_WITH_DML;
-#endif
   std::unordered_map<std::string, int> domain_to_version;
   domain_to_version[kOnnxDomain] = 7;
   auto model = std::make_shared<Model>("test", false, ModelMetaData(), PathString(),
@@ -172,9 +161,7 @@
   KernelRegistryManager kernel_registry_manager;
   ExecutionProviders execution_providers;
-
   ASSERT_STATUS_OK(execution_providers.Add(onnxruntime::kCudaExecutionProvider, DefaultCudaExecutionProvider()));
-
   ASSERT_STATUS_OK(execution_providers.Add(onnxruntime::kCpuExecutionProvider, std::make_unique<CPUExecutionProvider>(CPUExecutionProviderInfo())));
   KernelRegistryManager test_registry_manager;
@@ -294,11 +281,7 @@ TEST(TransformerTest, TestInitializerDuplicationInSubgraph) {
   KernelRegistryManager kernel_registry_manager;
   ExecutionProviders execution_providers;
-#if defined(USE_CUDA) && defined(USE_DML)
-  SKIP_CUDA_TEST_WITH_DML;
-#endif
   ASSERT_STATUS_OK(execution_providers.Add(onnxruntime::kCudaExecutionProvider, DefaultCudaExecutionProvider()));
-
   ASSERT_STATUS_OK(execution_providers.Add(onnxruntime::kCpuExecutionProvider, std::make_unique<CPUExecutionProvider>(CPUExecutionProviderInfo())));
   KernelRegistryManager test_registry_manager;
@@ -340,11 +323,7 @@ TEST(TransformerTest, MemcpyTransformerTestGraphInputConsumedOnMultipleDevices)
   KernelRegistryManager kernel_registry_manager;
   ExecutionProviders execution_providers;
-#if defined(USE_CUDA) && defined(USE_DML)
-  SKIP_CUDA_TEST_WITH_DML;
-#endif
   ASSERT_STATUS_OK(execution_providers.Add(onnxruntime::kCudaExecutionProvider, DefaultCudaExecutionProvider()));
-
   ASSERT_STATUS_OK(execution_providers.Add(onnxruntime::kCpuExecutionProvider, std::make_unique<CPUExecutionProvider>(CPUExecutionProviderInfo())));
   KernelRegistryManager test_registry_manager;
@@ -446,11 +425,7 @@ TEST(TransformerTest, MemcpyTransformerTestImplicitInputConsumedOnMultipleDevice
   KernelRegistryManager kernel_registry_manager;
   ExecutionProviders execution_providers;
-#if defined(USE_CUDA) && defined(USE_DML)
-  SKIP_CUDA_TEST_WITH_DML;
-#endif
   ASSERT_STATUS_OK(execution_providers.Add(onnxruntime::kCudaExecutionProvider, DefaultCudaExecutionProvider()));
-
   ASSERT_STATUS_OK(execution_providers.Add(onnxruntime::kCpuExecutionProvider, std::make_unique<CPUExecutionProvider>(CPUExecutionProviderInfo())));
   KernelRegistryManager test_registry_manager;
diff --git a/onnxruntime/test/framework/sparse_kernels_test.cc b/onnxruntime/test/framework/sparse_kernels_test.cc
index db9592c293fd0..7bd6b47f52b7d 100644
--- a/onnxruntime/test/framework/sparse_kernels_test.cc
+++ b/onnxruntime/test/framework/sparse_kernels_test.cc
@@ -1457,9 +1457,6 @@ TEST(SparseTensorConversionTests, CsrConversion) {
 #ifdef USE_CUDA
   auto cuda_provider = DefaultCudaExecutionProvider();
-  if (cuda_provider == nullptr) {
-    return;
-  }
   auto cuda_allocator = cuda_provider->CreatePreferredAllocators()[0];
   {
     auto cuda_transfer = cuda_provider->GetDataTransfer();
@@ -1687,9 +1684,6 @@ TEST(SparseTensorConversionTests, CooConversion) {
 #ifdef USE_CUDA
   auto cuda_provider = DefaultCudaExecutionProvider();
-  if (cuda_provider == nullptr) {
-    return;
-  }
   auto cuda_allocator = cuda_provider->CreatePreferredAllocators()[0];
   {
     auto cuda_transfer = cuda_provider->GetDataTransfer();
diff --git a/onnxruntime/test/lora/lora_test.cc b/onnxruntime/test/lora/lora_test.cc
index 9d8febb453739..e8291a36447ca 100644
--- a/onnxruntime/test/lora/lora_test.cc
+++ b/onnxruntime/test/lora/lora_test.cc
@@ -201,16 +201,6 @@ TEST(LoraAdapterTest, Load) {
 
 #ifdef USE_CUDA
 TEST(LoraAdapterTest, VerifyDeviceCopy) {
-  // These checks for CUDA/DML combined Package, Be careful when you want to remove it!
-  if (DefaultCudaExecutionProvider() == nullptr) {
-    GTEST_SKIP() << "Skip This Test Due to this EP is null";
-  }
-#ifdef USE_DML
-  if (DefaultDmlExecutionProvider() != nullptr) {
-    GTEST_FAIL() << "It should not run with DML EP";
-  }
-#endif
-
   auto cpu_ep = DefaultCpuExecutionProvider();
   auto cpu_allocator = cpu_ep->CreatePreferredAllocators()[0];
   auto cuda_ep = DefaultCudaExecutionProvider();
diff --git a/onnxruntime/test/providers/base_tester.cc b/onnxruntime/test/providers/base_tester.cc
index b0958e05dc373..aa68f68f3e735 100644
--- a/onnxruntime/test/providers/base_tester.cc
+++ b/onnxruntime/test/providers/base_tester.cc
@@ -532,17 +532,6 @@ void BaseTester::Run(ExpectResult expect_result, const std::string& expected_fai
   so.use_deterministic_compute = use_determinism_;
   so.graph_optimization_level = TransformerLevel::Default;  // 'Default' == off
 
-  // remove nullptr in execution_providers.
-  // it's a little ugly but we need to do this because DefaultXXXExecutionProvider() can return nullptr in Runtime.
-  // And there're many places adding DefaultXXXExecutionProvider() to execution_providers directly.
-  if (execution_providers != nullptr) {
-    execution_providers->erase(std::remove(execution_providers->begin(), execution_providers->end(), nullptr), execution_providers->end());
-    if (execution_providers->size() == 0) {
-      // In fact, no ep is needed to run
-      return;
-    }
-  }
-
   Run(so, expect_result, expected_failure_string, excluded_provider_types, run_options, execution_providers, options);
 }
diff --git a/onnxruntime/test/providers/compare_provider_test_utils.cc b/onnxruntime/test/providers/compare_provider_test_utils.cc
index 9acb37c24ddd0..386a5656d8a01 100644
--- a/onnxruntime/test/providers/compare_provider_test_utils.cc
+++ b/onnxruntime/test/providers/compare_provider_test_utils.cc
@@ -53,11 +53,6 @@ void CompareOpTester::CompareWithCPU(const std::string& target_provider_type,
   SetTestFunctionCalled();
 
   std::unique_ptr<IExecutionProvider> target_execution_provider = GetExecutionProvider(target_provider_type);
-#if defined(USE_CUDA) && defined(USE_DML)
-  if (target_execution_provider == nullptr) {
-    return;
-  }
-#endif
   ASSERT_TRUE(target_execution_provider != nullptr) << "provider_type " << target_provider_type
                                                     << " is not supported.";
diff --git a/onnxruntime/test/providers/cpu/model_tests.cc b/onnxruntime/test/providers/cpu/model_tests.cc
index b46c253fb8ed9..e3c86a137484f 100644
--- a/onnxruntime/test/providers/cpu/model_tests.cc
+++ b/onnxruntime/test/providers/cpu/model_tests.cc
@@ -491,18 +491,6 @@ ::std::vector<::std::basic_string<ORTCHAR_T>> GetParameterStrings() {
   // the number of times these are run to reduce the CI time.
   provider_names.erase(provider_name_cpu);
 #endif
-
-#if defined(USE_CUDA) && defined(USE_DML)
-  const std::string no_cuda_ep_test = Env::Default().GetEnvironmentVar("NO_CUDA_TEST");
-  if (no_cuda_ep_test == "1") {
-    provider_names.erase(provider_name_cuda);
-  }
-  const std::string no_dml_ep_test = Env::Default().GetEnvironmentVar("NO_DML_TEST");
-  if (no_dml_ep_test == "1") {
-    provider_names.erase(provider_name_dml);
-  }
-#endif
-
   std::vector<std::basic_string<ORTCHAR_T>> v;
   // Permanently exclude following tests because ORT support only opset starting from 7,
   // Please make no more changes to the list
diff --git a/onnxruntime/test/providers/cpu/tensor/gather_op_test.cc b/onnxruntime/test/providers/cpu/tensor/gather_op_test.cc
index 0f23e4c39d7e2..be79a6d29d539 100644
--- a/onnxruntime/test/providers/cpu/tensor/gather_op_test.cc
+++ b/onnxruntime/test/providers/cpu/tensor/gather_op_test.cc
@@ -3,9 +3,6 @@
 #include "core/session/onnxruntime_session_options_config_keys.h"
 #include "gtest/gtest.h"
-#if USE_CUDA
-#include "test/common/cuda_op_test_utils.h"
-#endif
 #include "test/providers/provider_test_utils.h"
 #include "test/util/include/default_providers.h"
@@ -125,9 +122,6 @@ TEST(GatherOpTest, Gather_invalid_index_gpu) {
                          4.0f, 5.0f, 6.0f, 7.0f,
                          0.0f, 0.0f, 0.0f, 0.0f});
-#if defined(USE_CUDA) && defined(USE_DML)
-  SKIP_CUDA_TEST_WITH_DML;
-#endif
   // On GPU, just set the value to 0 instead of report error. exclude all other providers
   test
 #if defined(USE_CUDA)
diff --git a/onnxruntime/test/providers/cpu/tensor/grid_sample_test.cc b/onnxruntime/test/providers/cpu/tensor/grid_sample_test.cc
index 7e1a2384d7fc6..05cfb5c13d689 100644
--- a/onnxruntime/test/providers/cpu/tensor/grid_sample_test.cc
+++ b/onnxruntime/test/providers/cpu/tensor/grid_sample_test.cc
@@ -15,13 +15,11 @@ std::vector<std::unique_ptr<IExecutionProvider>> GetExecutionProviders(int opset
   execution_providers.emplace_back(DefaultCpuExecutionProvider());
 #ifdef USE_CUDA
-  if (DefaultCudaExecutionProvider() != nullptr) {
-    if (opset_version < 20) {
-      execution_providers.emplace_back(DefaultCudaExecutionProvider());
+  if (opset_version < 20) {
+    execution_providers.emplace_back(DefaultCudaExecutionProvider());
 #ifdef ENABLE_CUDA_NHWC_OPS
-      execution_providers.push_back(DefaultCudaNHWCExecutionProvider());
+    execution_providers.push_back(DefaultCudaNHWCExecutionProvider());
 #endif
-    }
   }
 #endif
diff --git a/onnxruntime/test/providers/cuda/cuda_provider_test.cc b/onnxruntime/test/providers/cuda/cuda_provider_test.cc
index e745e1bcb8171..e57cdd2350fab 100644
--- a/onnxruntime/test/providers/cuda/cuda_provider_test.cc
+++ b/onnxruntime/test/providers/cuda/cuda_provider_test.cc
@@ -11,7 +11,7 @@ ProviderInfo_CUDA& GetProviderInfo_CUDA_Test();
 namespace test {
 namespace cuda {
 
-TEST(CudaEpUnittest, All) {
+TEST(CUDA_EP_Unittest, All) {
   onnxruntime::ProviderInfo_CUDA& ep = onnxruntime::GetProviderInfo_CUDA_Test();
   ep.TestAll();
 }
diff --git a/onnxruntime/test/providers/cuda/test_cases/allocator_cuda_test.cc b/onnxruntime/test/providers/cuda/test_cases/allocator_cuda_test.cc
index ec7c6ec4e1605..b413d04fe81e8 100644
--- a/onnxruntime/test/providers/cuda/test_cases/allocator_cuda_test.cc
+++ b/onnxruntime/test/providers/cuda/test_cases/allocator_cuda_test.cc
@@ -11,7 +11,7 @@ namespace onnxruntime {
 namespace test {
 
-TEST(CudaEpAllocatorTest, CUDAAllocatorTest) {
+TEST(AllocatorTest, CUDAAllocatorTest) {
   OrtDevice::DeviceId cuda_device_id = 0;
 
   // ensure CUDA device is available.
@@ -77,7+77,7 @@ TEST(CudaEpAllocatorTest, CUDAAllocatorTest) {
 }
 
 // test that we fallback to smaller allocations if the growth of the arena exceeds the available memory
-TEST(CudaEpAllocatorTest, CUDAAllocatorFallbackTest) {
+TEST(AllocatorTest, CUDAAllocatorFallbackTest) {
   OrtDevice::DeviceId cuda_device_id = 0;
 
   size_t free = 0;
diff --git a/onnxruntime/test/providers/cuda/test_cases/attention_kernel_options_test.cc b/onnxruntime/test/providers/cuda/test_cases/attention_kernel_options_test.cc
index ccdc56de5937d..b2e986f680763 100644
--- a/onnxruntime/test/providers/cuda/test_cases/attention_kernel_options_test.cc
+++ b/onnxruntime/test/providers/cuda/test_cases/attention_kernel_options_test.cc
@@ -17,7 +17,7 @@ using onnxruntime::contrib::attention::AttentionBackend;
 namespace onnxruntime {
 namespace test {
 
-TEST(CudaEpAttentionKernelOptionsTest, NonZeroValue) {
+TEST(AttentionKernelOptionsTest, NonZeroValue) {
   {
     AttentionKernelOptions options;
     int value = static_cast<int>(AttentionBackend::FLASH_ATTENTION) | static_cast<int>(AttentionBackend::EFFICIENT_ATTENTION);
@@ -156,7 +156,7 @@ TEST(CudaEpAttentionKernelOptionsTest, NonZeroValue) {
 }
 
 // Test all environment variables take effect when option value is 0.
-TEST(CudaEpAttentionKernelOptionsTest, DefaultOptionWithEnvVar) {
+TEST(AttentionKernelOptionsTest, DefaultOptionWithEnvVar) {
   constexpr int value = 0;
   ScopedEnvironmentVariables scoped_env_vars{
       EnvVarMap{
@@ -186,7 +186,7 @@ TEST(CudaEpAttentionKernelOptionsTest, DefaultOptionWithEnvVar) {
 }
 
 // Test default min sequence lengths when environment variables are not set.
-TEST(CudaEpAttentionKernelOptionsTest, DefaultMinSeqLens) {
+TEST(AttentionKernelOptionsTest, DefaultMinSeqLens) {
   constexpr int value = 0;
   ScopedEnvironmentVariables scoped_env_vars{
       EnvVarMap{
diff --git a/onnxruntime/test/providers/cuda/test_cases/beam_search_topk.cc b/onnxruntime/test/providers/cuda/test_cases/beam_search_topk.cc
index 97d50398a5550..a0d115c41c14b 100644
--- a/onnxruntime/test/providers/cuda/test_cases/beam_search_topk.cc
+++ b/onnxruntime/test/providers/cuda/test_cases/beam_search_topk.cc
@@ -68,7 +68,7 @@ void ComputeTopKReference(const std::vector<float>& values,
   }
 }
 
-TEST(CudaEpTestBeamSearch, TopK) {
+TEST(TestBeamSearch, TopK) {
   int32_t batch_size = 4;
   int32_t beam_size = 4;
   int32_t vocab_size = 50257;
diff --git a/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80_test.cc b/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80_test.cc
index d8fb3c8256012..3fcb9045ee7e6 100644
--- a/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80_test.cc
+++ b/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80_test.cc
@@ -230,7 +230,7 @@ void testPrepack(int rows, int columns) {
 }
 
 // TODO: code runs on CPU, but this is for sm80 only, maybe enable only when test on sm80
-TEST(CudaEpBlkQ4_GEMM, PrepackSm80Test) {
+TEST(BlkQ4_GEMM, PrepackSm80Test) {
   Status status = onnxruntime::cuda::test::sm80_supported();
   if (!status.IsOK()) {
     // skip the test if sm80 is not supported
@@ -263,7 +263,7 @@ TEST(CudaEpBlkQ4_GEMM, PrepackSm80Test) {
   testPrepack(256, 256);
 }
 
-TEST(CudaEpBlkQ4_GEMM, Sm80RowBlockingTest) {
+TEST(BlkQ4_GEMM, Sm80RowBlockingTest) {
   Status status = onnxruntime::cuda::test::sm80_supported();
   if (!status.IsOK()) {
     // skip the test if sm80 is not supported
@@ -292,7 +292,7 @@ TEST(CudaEpBlkQ4_GEMM, Sm80RowBlockingTest) {
   onnxruntime::cuda::test::run_blkq4_gemm<64, false, false, true>(256, 1024, 576);
 }
 
-TEST(CudaEpBlkQ4_GEMM, Sm80ColBlockingTest) {
+TEST(BlkQ4_GEMM, Sm80ColBlockingTest) {
   Status status = onnxruntime::cuda::test::sm80_supported();
   if (!status.IsOK()) {
     // skip the test if sm80 is not supported
@@ -305,7 +305,7 @@ TEST(CudaEpBlkQ4_GEMM, Sm80ColBlockingTest) {
   onnxruntime::cuda::test::run_blkq4_gemm<64, true, false, true>(256, 1024, 576);
 }
 
-TEST(CudaEpBlkQ4_GEMM, Sm80SmallMTest) {
+TEST(BlkQ4_GEMM, Sm80SmallMTest) {
   Status status = onnxruntime::cuda::test::sm80_supported();
   if (!status.IsOK()) {
     // skip the test if sm80 is not supported
@@ -326,7 +326,7 @@ TEST(CudaEpBlkQ4_GEMM, Sm80SmallMTest) {
   onnxruntime::cuda::test::run_blkq4_gemm<64, true, true, true>(16, 1024, 576);
 }
 
-TEST(CudaEpBlkQ4_GEMM, Sm80SmallTileKernelTest) {
+TEST(BlkQ4_GEMM, Sm80SmallTileKernelTest) {
   Status status = onnxruntime::cuda::test::sm80_supported();
   if (!status.IsOK()) {
     // skip the test if sm80 is not supported
diff --git a/onnxruntime/test/providers/cuda/test_cases/cuda_execution_provider_test.cc b/onnxruntime/test/providers/cuda/test_cases/cuda_execution_provider_test.cc
index f3222c6f683b5..72357ec7e02d2 100644
--- a/onnxruntime/test/providers/cuda/test_cases/cuda_execution_provider_test.cc
+++ b/onnxruntime/test/providers/cuda/test_cases/cuda_execution_provider_test.cc
@@ -19,7 +19,7 @@ namespace cuda {
 namespace test {
 
 // TODO: Since the "DeferredRelease" has been migrated to CudaStream class,
 // we should migrate this test from CudaEP unit test to CudaStream unit test.
-TEST(CudaEpTestDeferredRelease, WithArena) {
+TEST(TestDeferredRelease, WithArena) {
   // Create CUDA EP.
   CUDAExecutionProviderInfo info;
   CUDAExecutionProvider ep(info);
@@ -52,7 +52,7 @@ TEST(CudaEpTestDeferredRelease, WithArena) {
   ORT_THROW_IF_ERROR(ep.OnRunEnd(true, run_opts));
 }
 
-TEST(CudaEpTestDeferredRelease, WithoutArena) {
+TEST(TestDeferredRelease, WithoutArena) {
   // Create CUDA EP.
   CUDAExecutionProviderInfo info;
   CUDAExecutionProvider ep(info);
diff --git a/onnxruntime/test/providers/cuda/test_cases/cuda_utils_test.cc b/onnxruntime/test/providers/cuda/test_cases/cuda_utils_test.cc
index 3538c7add94d0..7468a5718425e 100644
--- a/onnxruntime/test/providers/cuda/test_cases/cuda_utils_test.cc
+++ b/onnxruntime/test/providers/cuda/test_cases/cuda_utils_test.cc
@@ -40,7 +40,7 @@ void TestFillCorrectness(size_t num_elements, TElement value) {
 }
 }  // namespace
 
-TEST(CudaEpUnittest, FillCorrectness) {
+TEST(CudaUtilsTest, FillCorrectness) {
   TestFillCorrectness(1 << 20, 1);
   TestFillCorrectness(1 << 20, 2);
   TestFillCorrectness(1 << 20, 3);
diff --git a/onnxruntime/test/providers/cuda/test_cases/gemm_options_test.cc b/onnxruntime/test/providers/cuda/test_cases/gemm_options_test.cc
index 518fde5804b23..6636e15040393 100644
--- a/onnxruntime/test/providers/cuda/test_cases/gemm_options_test.cc
+++ b/onnxruntime/test/providers/cuda/test_cases/gemm_options_test.cc
@@ -10,7 +10,7 @@ namespace onnxruntime {
 namespace cuda {
 namespace test {
 
-TEST(CudaEpGemmOptions, TestDefaultOptions) {
+TEST(CudaGemmOptions, TestDefaultOptions) {
   HalfGemmOptions gemm_options;
   ASSERT_FALSE(gemm_options.IsCompute16F());
 #if defined(USE_CUDA)
@@ -22,7 +22,7 @@ TEST(CudaEpGemmOptions, TestDefaultOptions) {
 #endif
 }
 
-TEST(CudaEpGemmOptions, TestCompute16F) {
+TEST(CudaGemmOptions, TestCompute16F) {
   HalfGemmOptions gemm_options;
   gemm_options.Initialize(1);
   ASSERT_TRUE(gemm_options.IsCompute16F());
@@ -35,7 +35,7 @@ TEST(CudaEpGemmOptions, TestCompute16F) {
 #endif
 }
 
-TEST(CudaEpGemmOptions, NoReducedPrecision) {
+TEST(CudaGemmOptions, NoReducedPrecision) {
   HalfGemmOptions gemm_options;
   gemm_options.Initialize(2);
   ASSERT_FALSE(gemm_options.IsCompute16F());
@@ -48,7 +48,7 @@ TEST(CudaEpGemmOptions, NoReducedPrecision) {
 #endif
 }
 
-TEST(CudaEpGemmOptions, Pedantic) {
+TEST(CudaGemmOptions, Pedantic) {
   HalfGemmOptions gemm_options;
   gemm_options.Initialize(4);
   ASSERT_FALSE(gemm_options.IsCompute16F());
@@ -61,7 +61,7 @@ TEST(CudaEpGemmOptions, Pedantic) {
 #endif
 }
 
-TEST(CudaEpGemmOptions, Compute16F_Pedantic) {
+TEST(CudaGemmOptions, Compute16F_Pedantic) {
   HalfGemmOptions gemm_options;
   gemm_options.Initialize(5);
   ASSERT_TRUE(gemm_options.IsCompute16F());
@@ -74,7 +74,7 @@ TEST(CudaEpGemmOptions, Compute16F_Pedantic) {
 #endif
 }
 
-TEST(CudaEpGemmOptions, Compute16F_NoReducedPrecision) {
+TEST(CudaGemmOptions, Compute16F_NoReducedPrecision) {
   HalfGemmOptions gemm_options;
   gemm_options.Initialize(3);
   ASSERT_TRUE(gemm_options.IsCompute16F());
diff --git a/onnxruntime/test/providers/cuda/test_cases/greedy_search_top_one.cc b/onnxruntime/test/providers/cuda/test_cases/greedy_search_top_one.cc
index ba24cf858e80f..6b8cd68de0fca 100644
--- a/onnxruntime/test/providers/cuda/test_cases/greedy_search_top_one.cc
+++ b/onnxruntime/test/providers/cuda/test_cases/greedy_search_top_one.cc
@@ -41,7 +41,7 @@ void ComputeTop1Reference(const std::vector<float>& values,
   }
 }
 
-TEST(CudaEpTestGreedySearch, TopOne) {
+TEST(TestGreedySearch, TopOne) {
   int32_t batch_size = 4;
   int32_t vocab_size = 50257;
   int32_t batch_x_vocab = batch_size * vocab_size;
diff --git a/onnxruntime/test/providers/cuda/test_cases/reduction_functions_test.cc b/onnxruntime/test/providers/cuda/test_cases/reduction_functions_test.cc
index 09c9c1e5f8f6a..ec7e98528504e 100644
--- a/onnxruntime/test/providers/cuda/test_cases/reduction_functions_test.cc
+++ b/onnxruntime/test/providers/cuda/test_cases/reduction_functions_test.cc
@@ -179,7 +179,7 @@ void TestReduceColumnsToColumn(int m, int n, float relative_error_tolerance = 1e
 }
 }  // namespace
 
-TEST(CudaEpReductionFunctionsTest, ReduceRowToScalar) {
+TEST(ReductionFunctionsTest, ReduceRowToScalar) {
  TestReduceRowToScalarApis(3);
  TestReduceRowToScalarApis(19);
  TestReduceRowToScalarApis(123);
@@ -188,7 +188,7 @@ TEST(CudaEpReductionFunctionsTest, ReduceRowToScalar) {
   TestReduceRowToScalarApis(941736, 2e-4f);
 }
 
-TEST(CudaEpReductionFunctionsTest, ReduceRowsToRow) {
+TEST(ReductionFunctionsTest, ReduceRowsToRow) {
   for (int m : {3, 193, 2945}) {
     for (int n : {3, 193, 2945}) {
       TestReduceRowsToRow(m, n, true);
@@ -197,7 +197,7 @@ TEST(CudaEpReductionFunctionsTest, ReduceRowsToRow) {
     }
   }
 }
 
-TEST(CudaEpReductionFunctionsTest, ReduceColumnsToColumn) {
+TEST(ReductionFunctionsTest, ReduceColumnsToColumn) {
   for (int m : {3, 193, 2945}) {
     for (int n : {3, 193, 2945}) {
       TestReduceColumnsToColumn(m, n);
@@ -205,7 +205,7 @@ TEST(CudaEpReductionFunctionsTest, ReduceColumnsToColumn) {
   }
 }
 
-TEST(CudaEpReductionFunctionsTest, BufferOffsets) {
+TEST(ReductionFunctionsTest, BufferOffsets) {
   const int m = 2048;
   const int n = 1024;
   const TensorShape shape{m, n};
@@ -240,7 +240,7 @@ TEST(CudaEpReductionFunctionsTest, BufferOffsets) {
   }
 }
 
-TEST(CudaEpReductionFunctionsTest, InvalidBufferSize) {
+TEST(ReductionFunctionsTest, InvalidBufferSize) {
   const int m = 2048;
   const int n = 1024;
   const TensorShape shape{m, n};
@@ -262,7 +262,7 @@ TEST(CudaEpReductionFunctionsTest, InvalidBufferSize) {
   ASSERT_FALSE(status.IsOK());
 }
 
-TEST(CudaEpReductionFunctionsTest, GetApplicableMatrixReduction) {
+TEST(ReductionFunctionsTest, GetApplicableMatrixReduction) {
   auto test_get_applicable_matrix_reduction =
       [](cudnnReduceTensorOp_t cudnn_op,
          const std::vector<int64_t>& dims, const std::vector<int64_t>& axes,
diff --git a/onnxruntime/test/python/onnx_backend_test_series.py b/onnxruntime/test/python/onnx_backend_test_series.py
index a274b90dc042f..8fc76da3495a8 100644
--- a/onnxruntime/test/python/onnx_backend_test_series.py
+++ b/onnxruntime/test/python/onnx_backend_test_series.py
@@ -105,7 +105,7 @@ def load_jsonc(basename: str):
     return json.loads("\n".join(lines))
 
 
-def create_backend_test(devices: list[str], test_name=None):
+def create_backend_test(test_name=None):
     """Creates an OrtBackendTest and adds its TestCase's to global scope so unittest will find them."""
 
     overrides = load_jsonc("onnx_backend_test_series_overrides.jsonc")
@@ -126,29 +126,30 @@ def create_backend_test(devices: list[str], test_name=None):
     else:
         filters = load_jsonc("onnx_backend_test_series_filters.jsonc")
         current_failing_tests = apply_filters(filters, "current_failing_tests")
+
         if platform.architecture()[0] == "32bit":
             current_failing_tests += apply_filters(filters, "current_failing_tests_x86")
 
-        if backend.supports_device("DNNL") or "DNNL" in devices:
+        if backend.supports_device("DNNL"):
current_failing_tests += apply_filters(filters, "current_failing_tests_DNNL") - if backend.supports_device("NNAPI") or "NNAPI" in devices: + if backend.supports_device("NNAPI"): current_failing_tests += apply_filters(filters, "current_failing_tests_NNAPI") - if backend.supports_device("OPENVINO_GPU") or "OPENVINO_GPU" in devices: + if backend.supports_device("OPENVINO_GPU"): current_failing_tests += apply_filters(filters, "current_failing_tests_OPENVINO_GPU") - if backend.supports_device("OPENVINO_CPU") or "OPENVINO_CPU" in devices: + if backend.supports_device("OPENVINO_CPU"): current_failing_tests += apply_filters(filters, "current_failing_tests_OPENVINO_CPU_FP32") current_failing_tests += apply_filters(filters, "current_failing_tests_OPENVINO_CPU_FP16") - if backend.supports_device("OPENVINO_NPU") or "OPENVINO_NPU" in devices: + if backend.supports_device("OPENVINO_NPU"): current_failing_tests += apply_filters(filters, "current_failing_tests_OPENVINO_NPU") - if backend.supports_device("OPENVINO") or "OPENVINO" in devices: + if backend.supports_device("OPENVINO"): current_failing_tests += apply_filters(filters, "current_failing_tests_OPENVINO_opset18") - if backend.supports_device("MIGRAPHX") or "MIGRAPHX" in devices: + if backend.supports_device("MIGRAPHX"): current_failing_tests += apply_filters(filters, "current_failing_tests_MIGRAPHX") if backend.supports_device("WEBGPU"): @@ -157,16 +158,8 @@ def create_backend_test(devices: list[str], test_name=None): # Skip these tests for a "pure" DML onnxruntime python wheel. We keep these tests enabled for instances where both DML and CUDA # EPs are available (Windows GPU CI pipeline has this config) - these test will pass because CUDA has higher precedence than DML # and the nodes are assigned to only the CUDA EP (which supports these tests) - if (backend.supports_device("DML") and not backend.supports_device("GPU")) or "DML" in devices: + if backend.supports_device("DML") and not backend.supports_device("GPU"): current_failing_tests += apply_filters(filters, "current_failing_tests_pure_DML") - # exclude CUDA EP when DML test is running. - os.environ["ORT_ONNX_BACKEND_EXCLUDE_PROVIDERS"] = "TensorrtExecutionProvider,CUDAExecutionProvider" - elif backend.supports_device("DML") and "DML" not in devices: - # exclude DML EP when CUDA test is running. - os.environ["ORT_ONNX_BACKEND_EXCLUDE_PROVIDERS"] = "TensorrtExecutionProvider,DmlExecutionProvider" - else: - # exclude TRT EP temporarily and only test CUDA EP to retain previous behavior - os.environ["ORT_ONNX_BACKEND_EXCLUDE_PROVIDERS"] = "TensorrtExecutionProvider" filters = ( current_failing_tests @@ -179,6 +172,9 @@ def create_backend_test(devices: list[str], test_name=None): backend_test.exclude("(" + "|".join(filters) + ")") print("excluded tests:", filters) + # exclude TRT EP temporarily and only test CUDA EP to retain previous behavior + os.environ["ORT_ONNX_BACKEND_EXCLUDE_PROVIDERS"] = "TensorrtExecutionProvider" + # import all test cases at global scope to make # them visible to python.unittest. globals().update(backend_test.enable_report().test_cases) @@ -203,15 +199,6 @@ def parse_args(): help="Only run tests that match this value. 
         help="Only run tests that match this value. Matching is regex based, and '.*' is automatically appended",
     )
 
-    parser.add_argument(
-        "--devices",
-        type=str,
-        choices=["CPU", "CUDA", "MIGRAPHX", "DNNL", "DML", "OPENVINO_GPU", "OPENVINO_CPU", "OPENVINO_NPU", "OPENVINO"],
-        nargs="+",  # allows multiple values
-        default=["CPU"],  # default to ["CPU"] if no input is given
-        help="Select one or more devices CPU, CUDA, MIGRAPHX, DNNL, DML, OPENVINO_GPU, OPENVINO_CPU, OPENVINO_NPU, OPENVINO",
-    )
-
     # parse just our args. python unittest has its own args and arg parsing, and that runs inside unittest.main()
     parsed, unknown = parser.parse_known_args()
     sys.argv = sys.argv[:1] + unknown
@@ -222,5 +209,5 @@ def parse_args():
 
 if __name__ == "__main__":
     args = parse_args()
-    create_backend_test(args.devices, args.test_name)
+    create_backend_test(args.test_name)
     unittest.main()
diff --git a/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc b/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc
index 7ecaab6fedb02..f083ab14ad133 100644
--- a/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc
+++ b/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc
@@ -750,13 +750,6 @@
     "^test_reduce_log_sum_empty_set_cpu",
     "^test_reduce_log_sum_exp_empty_set_cpu",
     "^test_reduce_prod_empty_set_cpu",
-    // Bug: DML EP some how executes these CUDA tests and failed
-    // TODO: Remove these tests when DML EP is fixed
-    "^test_convtranspose_autopad_same_cuda",
-    "^test_asin_example_cuda",
-    "^test_dynamicquantizelinear_cuda",
-    "^test_dynamicquantizelinear_expanded_cuda",
-    "^test_reduce_min_empty_set_cuda",
     //Bug: DML EP does not execute operators with an empty input tensor
     //TODO: Resolve as a graph implementation that returns a constant inf tensor with appropriate strides
     "^test_reduce_min_empty_set_cpu"
diff --git a/onnxruntime/test/util/default_providers.cc b/onnxruntime/test/util/default_providers.cc
index 59926bbcd1c6f..c1564997c42b8 100644
--- a/onnxruntime/test/util/default_providers.cc
+++ b/onnxruntime/test/util/default_providers.cc
@@ -122,12 +122,6 @@ std::unique_ptr<IExecutionProvider> DefaultOpenVINOExecutionProvider() {
 
 std::unique_ptr<IExecutionProvider> DefaultCudaExecutionProvider() {
 #ifdef USE_CUDA
-#ifdef USE_DML
-  const std::string no_cuda_ep_test = Env::Default().GetEnvironmentVar("NO_CUDA_TEST");
-  if (no_cuda_ep_test == "1") {
-    return nullptr;
-  }
-#endif
   OrtCUDAProviderOptionsV2 provider_options{};
   provider_options.do_copy_in_default_stream = true;
   provider_options.use_tf32 = false;
@@ -140,12 +134,6 @@ std::unique_ptr<IExecutionProvider> DefaultCudaExecutionProvider() {
 #ifdef ENABLE_CUDA_NHWC_OPS
 std::unique_ptr<IExecutionProvider> DefaultCudaNHWCExecutionProvider() {
 #if defined(USE_CUDA)
-#ifdef USE_DML
-  const std::string no_cuda_ep_test = Env::Default().GetEnvironmentVar("NO_CUDA_TEST");
-  if (no_cuda_ep_test == "1") {
-    return nullptr;
-  }
-#endif
   OrtCUDAProviderOptionsV2 provider_options{};
   provider_options.do_copy_in_default_stream = true;
   provider_options.use_tf32 = false;
@@ -332,12 +320,6 @@ std::unique_ptr<IExecutionProvider> DefaultCannExecutionProvider() {
 
 std::unique_ptr<IExecutionProvider> DefaultDmlExecutionProvider() {
 #ifdef USE_DML
-#ifdef USE_CUDA
-  const std::string no_dml_ep_test = Env::Default().GetEnvironmentVar("NO_DML_TEST");
-  if (no_dml_ep_test == "1") {
-    return nullptr;
-  }
-#endif
   ConfigOptions config_options{};
   if (auto factory = DMLProviderFactoryCreator::CreateFromDeviceOptions(config_options, nullptr, false, false)) {
     return factory->CreateProvider();
   }
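With the `NO_CUDA_TEST`/`NO_DML_TEST` escape hatches gone, these factories no longer return nullptr because of an environment variable, so CUDA builds can register the provider unconditionally. A sketch of the call pattern the rest of this diff reverts to (illustrative only; the helper name is hypothetical):

```cpp
// Illustrative helper mirroring the pattern used throughout the tests above;
// not part of this patch.
#include <memory>
#include <vector>
#include "test/util/include/default_providers.h"

std::vector<std::unique_ptr<onnxruntime::IExecutionProvider>> CollectTestProviders() {
  std::vector<std::unique_ptr<onnxruntime::IExecutionProvider>> eps;
#ifdef USE_CUDA
  // The environment-variable nullptr check is no longer needed here.
  eps.push_back(onnxruntime::test::DefaultCudaExecutionProvider());
#endif
  eps.push_back(onnxruntime::test::DefaultCpuExecutionProvider());
  return eps;
}
```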
diff --git a/tools/ci_build/github/azure-pipelines/stages/jobs/steps/py_packaging_test_step.yml b/tools/ci_build/github/azure-pipelines/stages/jobs/steps/py_packaging_test_step.yml
deleted file mode 100644
index 9a721c65de332..0000000000000
--- a/tools/ci_build/github/azure-pipelines/stages/jobs/steps/py_packaging_test_step.yml
+++ /dev/null
@@ -1,21 +0,0 @@
-parameters:
-- name: EP_NAME
-  type: string
-  default: CPU
-
-- name: PYTHON_VERSION
-  type: string
-
-steps:
-- powershell: |
-    python -m pip uninstall -y onnxruntime onnxruntime-gpu -qq
-    Get-ChildItem -Path $(Build.ArtifactStagingDirectory)/*cp${{ replace(parameters.PYTHON_VERSION,'.','') }}*.whl | foreach {pip --disable-pip-version-check install --upgrade $_.fullname tabulate}
-    mkdir -p $(Agent.TempDirectory)\ort_test_data
-    Copy-Item -Path $(Build.sourcesDirectory)/onnxruntime/test/python/onnx_backend_test_series.py -Destination $(Agent.TempDirectory)\ort_test_data
-    Copy-Item -Recurse -Path $(Build.sourcesDirectory)/onnxruntime/test/testdata -Destination $(Agent.TempDirectory)\ort_test_data
-    cd $(Agent.TempDirectory)\ort_test_data
-    python onnx_backend_test_series.py --devices ${{ parameters.EP_NAME }} -v
-    cd $(Agent.TempDirectory)
-    Remove-Item -Path $(Agent.TempDirectory)\ort_test_data -Recurse -Force
-  workingDirectory: '$(Build.sourcesDirectory)'
-  displayName: 'Run Python Tests with ${{ parameters.EP_NAME }} EP'
\ No newline at end of file
diff --git a/tools/ci_build/github/azure-pipelines/stages/nuget-combine-cuda-stage.yml b/tools/ci_build/github/azure-pipelines/stages/nuget-combine-cuda-stage.yml
index 0b3eac0110abc..9c7fbc24ab1b6 100644
--- a/tools/ci_build/github/azure-pipelines/stages/nuget-combine-cuda-stage.yml
+++ b/tools/ci_build/github/azure-pipelines/stages/nuget-combine-cuda-stage.yml
@@ -50,8 +50,6 @@ stages:
       win_trt_home: ${{ parameters.win_trt_home }}
       win_cuda_home: ${{ parameters.win_cuda_home }}
       buildJava: ${{ parameters.buildJava }}
-      SpecificArtifact: ${{ parameters.SpecificArtifact }}
-      BuildId: ${{ parameters.BuildId }}
 
 - template: nuget-cuda-packaging-stage.yml
   parameters:
diff --git a/tools/ci_build/github/azure-pipelines/stages/nuget-win-cuda-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/stages/nuget-win-cuda-packaging-stage.yml
index d6b25c98936f0..445066f08995a 100644
--- a/tools/ci_build/github/azure-pipelines/stages/nuget-win-cuda-packaging-stage.yml
+++ b/tools/ci_build/github/azure-pipelines/stages/nuget-win-cuda-packaging-stage.yml
@@ -34,7 +34,7 @@
   displayName: Specific Artifact's BuildId
   type: string
   default: '0'
-
+
 - name: buildJava
   type: boolean
@@ -50,14 +50,13 @@ stages:
       msbuildPlatform: x64
       packageName: x64-cuda
       CudaVersion: ${{ parameters.CudaVersion }}
-      buildparameter: --use_cuda --cuda_home=${{ parameters.win_cuda_home }} --enable_onnx_tests --enable_wcos --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=60;61;70;75;80" --use_dml --build_csharp --parallel
+      buildparameter: --use_cuda --cuda_home=${{ parameters.win_cuda_home }} --enable_onnx_tests --enable_wcos --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=60;61;70;75;80"
       runTests: ${{ parameters.RunOnnxRuntimeTests }}
       buildJava: ${{ parameters.buildJava }}
       java_artifact_id: onnxruntime_gpu
       UseIncreasedTimeoutForTests: ${{ parameters.UseIncreasedTimeoutForTests }}
       SpecificArtifact: ${{ parameters.SpecificArtifact }}
       BuildId: ${{ parameters.BuildId }}
-      ComboTests: true
 # Windows CUDA with TensorRT Packaging
 - template: ../templates/win-ci.yml
   parameters:
@@ -69,7 +68,7 @@ stages:
       msbuildPlatform: x64
       CudaVersion: ${{ parameters.CudaVersion }}
       packageName: x64-tensorrt
diff --git a/tools/ci_build/github/azure-pipelines/stages/jobs/steps/py_packaging_test_step.yml b/tools/ci_build/github/azure-pipelines/stages/jobs/steps/py_packaging_test_step.yml
deleted file mode 100644
index 9a721c65de332..0000000000000
--- a/tools/ci_build/github/azure-pipelines/stages/jobs/steps/py_packaging_test_step.yml
+++ /dev/null
@@ -1,21 +0,0 @@
-parameters:
-- name: EP_NAME
-  type: string
-  default: CPU
-
-- name: PYTHON_VERSION
-  type: string
-
-steps:
-- powershell: |
-    python -m pip uninstall -y onnxruntime onnxruntime-gpu -qq
-    Get-ChildItem -Path $(Build.ArtifactStagingDirectory)/*cp${{ replace(parameters.PYTHON_VERSION,'.','') }}*.whl | foreach {pip --disable-pip-version-check install --upgrade $_.fullname tabulate}
-    mkdir -p $(Agent.TempDirectory)\ort_test_data
-    Copy-Item -Path $(Build.sourcesDirectory)/onnxruntime/test/python/onnx_backend_test_series.py -Destination $(Agent.TempDirectory)\ort_test_data
-    Copy-Item -Recurse -Path $(Build.sourcesDirectory)/onnxruntime/test/testdata -Destination $(Agent.TempDirectory)\ort_test_data
-    cd $(Agent.TempDirectory)\ort_test_data
-    python onnx_backend_test_series.py --devices ${{ parameters.EP_NAME }} -v
-    cd $(Agent.TempDirectory)
-    Remove-Item -Path $(Agent.TempDirectory)\ort_test_data -Recurse -Force
-  workingDirectory: '$(Build.sourcesDirectory)'
-  displayName: 'Run Python Tests with ${{ parameters.EP_NAME }} EP'
\ No newline at end of file
diff --git a/tools/ci_build/github/azure-pipelines/stages/nuget-combine-cuda-stage.yml b/tools/ci_build/github/azure-pipelines/stages/nuget-combine-cuda-stage.yml
index 0b3eac0110abc..9c7fbc24ab1b6 100644
--- a/tools/ci_build/github/azure-pipelines/stages/nuget-combine-cuda-stage.yml
+++ b/tools/ci_build/github/azure-pipelines/stages/nuget-combine-cuda-stage.yml
@@ -50,8 +50,6 @@ stages:
       win_trt_home: ${{ parameters.win_trt_home }}
       win_cuda_home: ${{ parameters.win_cuda_home }}
       buildJava: ${{ parameters.buildJava }}
-      SpecificArtifact: ${{ parameters.SpecificArtifact }}
-      BuildId: ${{ parameters.BuildId }}

 - template: nuget-cuda-packaging-stage.yml
   parameters:
diff --git a/tools/ci_build/github/azure-pipelines/stages/nuget-win-cuda-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/stages/nuget-win-cuda-packaging-stage.yml
index d6b25c98936f0..445066f08995a 100644
--- a/tools/ci_build/github/azure-pipelines/stages/nuget-win-cuda-packaging-stage.yml
+++ b/tools/ci_build/github/azure-pipelines/stages/nuget-win-cuda-packaging-stage.yml
@@ -34,7 +34,7 @@ parameters:
   displayName: Specific Artifact's BuildId
   type: string
   default: '0'
-  
+

 - name: buildJava
   type: boolean
@@ -50,14 +50,13 @@ stages:
       msbuildPlatform: x64
       packageName: x64-cuda
       CudaVersion: ${{ parameters.CudaVersion }}
-      buildparameter: --use_cuda --cuda_home=${{ parameters.win_cuda_home }} --enable_onnx_tests --enable_wcos --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=60;61;70;75;80" --use_dml --build_csharp --parallel
+      buildparameter: --use_cuda --cuda_home=${{ parameters.win_cuda_home }} --enable_onnx_tests --enable_wcos --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=60;61;70;75;80"
       runTests: ${{ parameters.RunOnnxRuntimeTests }}
       buildJava: ${{ parameters.buildJava }}
       java_artifact_id: onnxruntime_gpu
       UseIncreasedTimeoutForTests: ${{ parameters.UseIncreasedTimeoutForTests }}
       SpecificArtifact: ${{ parameters.SpecificArtifact }}
       BuildId: ${{ parameters.BuildId }}
-      ComboTests: true
 # Windows CUDA with TensorRT Packaging
 - template: ../templates/win-ci.yml
   parameters:
@@ -69,7 +68,7 @@ stages:
       msbuildPlatform: x64
       CudaVersion: ${{ parameters.CudaVersion }}
       packageName: x64-tensorrt
-      buildparameter: --use_tensorrt --tensorrt_home=${{ parameters.win_trt_home }} --cuda_home=${{ parameters.win_cuda_home }} --enable_onnx_tests --enable_wcos --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=60;61;70;75;80" --parallel
+      buildparameter: --use_tensorrt --tensorrt_home=${{ parameters.win_trt_home }} --cuda_home=${{ parameters.win_cuda_home }} --enable_onnx_tests --enable_wcos --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=60;61;70;75;80"
       runTests: ${{ parameters.RunOnnxRuntimeTests }}
       buildJava: ${{ parameters.buildJava }}
       java_artifact_id: onnxruntime_gpu
diff --git a/tools/ci_build/github/azure-pipelines/stages/py-gpu-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/stages/py-gpu-packaging-stage.yml
index f7235e3ad2076..947e4f99b984f 100644
--- a/tools/ci_build/github/azure-pipelines/stages/py-gpu-packaging-stage.yml
+++ b/tools/ci_build/github/azure-pipelines/stages/py-gpu-packaging-stage.yml
@@ -56,7 +56,7 @@ stages:
       PYTHON_VERSION: ${{ python_version }}
       EP_NAME: gpu
       CudaVersion: ${{ parameters.cuda_version }}
-      EP_BUILD_FLAGS: --use_dml --enable_lto --cuda_home=$(Agent.TempDirectory)\v${{ parameters.cuda_version }} --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=52;60;61;70;75;80"
+      EP_BUILD_FLAGS: --enable_lto --cuda_home=$(Agent.TempDirectory)\v${{ parameters.cuda_version }} --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=52;60;61;70;75;80"
       use_tensorrt: True

 - ${{ if eq(parameters.enable_linux_cuda, true) }}:
diff --git a/tools/ci_build/github/azure-pipelines/stages/py-win-gpu-stage.yml b/tools/ci_build/github/azure-pipelines/stages/py-win-gpu-stage.yml
index dd0539f751c89..aa7f2845fc0fa 100644
--- a/tools/ci_build/github/azure-pipelines/stages/py-win-gpu-stage.yml
+++ b/tools/ci_build/github/azure-pipelines/stages/py-win-gpu-stage.yml
@@ -33,7 +33,7 @@ parameters:
     - Release
     - RelWithDebInfo
     - MinSizeRel
-  
+

 - name: use_tensorrt
   type: boolean
   default: false
@@ -134,7 +134,7 @@ stages:
           --cmake_generator "$(VSGenerator)"
           --enable_pybind
           --enable_onnx_tests
-          --parallel 4 --use_binskim_compliant_compile_flags --update --build
+          --parallel --use_binskim_compliant_compile_flags --update --build
           $(TelemetryOption) ${{ parameters.BUILD_PY_PARAMETERS }} ${{ parameters.EP_BUILD_FLAGS }} ${{ variables.trt_build_flag }}
         workingDirectory: '$(Build.BinariesDirectory)'
@@ -206,20 +206,19 @@ stages:
           DownloadTRT: ${{ parameters.use_tensorrt }}

       - task: PowerShell@2
-        displayName: 'Install Third Party Dependencies'
+        displayName: 'Install ONNX'
         inputs:
           filePath: '$(Build.SourcesDirectory)/tools/ci_build/github/windows/install_third_party_deps.ps1'
           workingDirectory: '$(Build.BinariesDirectory)'
           arguments: -cpu_arch x64 -install_prefix $(Build.BinariesDirectory)\${{ parameters.cmake_build_type }}\installed -build_config ${{ parameters.cmake_build_type }}

-      - template: jobs/steps/py_packaging_test_step.yml
-        parameters:
-          EP_NAME: DML
-          PYTHON_VERSION: ${{ parameters.PYTHON_VERSION }}
-
-      - template: jobs/steps/py_packaging_test_step.yml
-        parameters:
-          EP_NAME: CUDA
-          PYTHON_VERSION: ${{ parameters.PYTHON_VERSION }}
-
-
+      - powershell: |
+          python -m pip uninstall -y onnxruntime onnxruntime-gpu -qq
+          Get-ChildItem -Path $(Build.ArtifactStagingDirectory)/*cp${{ replace(parameters.PYTHON_VERSION,'.','') }}*.whl | foreach {pip --disable-pip-version-check install --upgrade $_.fullname tabulate}
+          mkdir -p $(Agent.TempDirectory)\ort_test_data
+          Copy-Item -Path $(Build.sourcesDirectory)/onnxruntime/test/python/onnx_backend_test_series.py -Destination $(Agent.TempDirectory)\ort_test_data
+          Copy-Item -Recurse -Path $(Build.sourcesDirectory)/onnxruntime/test/testdata -Destination $(Agent.TempDirectory)\ort_test_data
+          cd $(Agent.TempDirectory)\ort_test_data
+          python onnx_backend_test_series.py
+        workingDirectory: '$(Build.sourcesDirectory)'
+        displayName: 'Run Python Tests'
diff --git a/tools/ci_build/github/azure-pipelines/templates/jobs/win-ci-vs-2022-job.yml b/tools/ci_build/github/azure-pipelines/templates/jobs/win-ci-vs-2022-job.yml
index 7bdd069de711b..e8f391a73fa7b 100644
--- a/tools/ci_build/github/azure-pipelines/templates/jobs/win-ci-vs-2022-job.yml
+++ b/tools/ci_build/github/azure-pipelines/templates/jobs/win-ci-vs-2022-job.yml
@@ -218,32 +218,16 @@ jobs:
   - powershell: |
      python3 -m pip uninstall -y onnxruntime onnxruntime-gpu onnxruntime-training onnxruntime-directml -qq
      Get-ChildItem -Path dist/*.whl | foreach {pip --disable-pip-version-check install --upgrade $_.fullname}
+    workingDirectory: '$(Build.BinariesDirectory)\${{ parameters.BuildConfig }}\${{ parameters.BuildConfig }}'
     displayName: 'Install onnxruntime wheel'

   - ${{ if eq(parameters.RunOnnxRuntimeTests, true) }}:
-    - ${{ if and(contains(parameters.additionalBuildFlags, 'use_cuda'), contains(parameters.additionalBuildFlags, 'use_dml')) }}:
-      - powershell: |
-          python $(Build.SourcesDirectory)\tools\ci_build\build.py --config ${{ parameters.BuildConfig }} --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --test --cmake_generator "Visual Studio 17 2022" --enable_onnx_tests ${{ parameters.additionalBuildFlags }}
-        workingDirectory: '$(Build.BinariesDirectory)\${{ parameters.BuildConfig }}\${{ parameters.BuildConfig }}'
-        displayName: 'Run tests excluding CUDA tests'
-        env:
-          NO_CUDA_TEST: '1'
-          GTEST_FILTER: '-CudaEp*:CudaNhwcTypedTest*:*cpu_*models*' # Exclude CUDA EP tests under providers/cuda/ and cpu models test
-          PATH: '$(Build.BinariesDirectory)\${{ parameters.BuildConfig }}\${{ parameters.BuildConfig }};$(PATH)' # For onnxruntime4j_test to find dependent dlls
-      - powershell: |
-          python $(Build.SourcesDirectory)\tools\ci_build\build.py --config ${{ parameters.BuildConfig }} --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --test --cmake_generator "Visual Studio 17 2022" --enable_onnx_tests ${{ parameters.additionalBuildFlags }}
-        workingDirectory: '$(Build.BinariesDirectory)\${{ parameters.BuildConfig }}\${{ parameters.BuildConfig }}'
-        displayName: 'Run tests excluding DML tests'
-        env:
-          NO_DML_TEST: '1'
-          GTEST_FILTER: '-*cpu_*models*'
-          PATH: '$(Build.BinariesDirectory)\${{ parameters.BuildConfig }}\${{ parameters.BuildConfig }};$(PATH)'
-    - ${{ else }}:
-      - powershell: |
-          python $(Build.SourcesDirectory)\tools\ci_build\build.py --config ${{ parameters.BuildConfig }} --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --test --cmake_generator "Visual Studio 17 2022" --enable_onnx_tests ${{ parameters.additionalBuildFlags }}
-        workingDirectory: '$(Build.BinariesDirectory)\${{ parameters.BuildConfig }}\${{ parameters.BuildConfig }}'
-        displayName: 'Run tests'
+    - powershell: |
+        python $(Build.SourcesDirectory)\tools\ci_build\build.py --config ${{ parameters.BuildConfig }} --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --test --cmake_generator "Visual Studio 17 2022" --build_shared_lib --enable_onnx_tests ${{ parameters.additionalBuildFlags }}
+
+      workingDirectory: '$(Build.BinariesDirectory)\${{ parameters.BuildConfig }}\${{ parameters.BuildConfig }}'
+      displayName: 'Run tests'

   - ${{ if eq(parameters.GenerateDocumentation, true) }}:
     - task: PythonScript@0
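The deleted steps above split a single test binary between EPs via `GTEST_FILTER`. Googletest filters are colon-separated glob patterns, and everything after a leading `-` is an exclusion. A small Python model of how the removed `-CudaEp*:CudaNhwcTypedTest*:*cpu_*models*` value behaves (`gtest_filter_match` is a hypothetical helper, and this ignores gtest corner cases):

```python
from fnmatch import fnmatchcase

def gtest_filter_match(test_name: str, filter_spec: str) -> bool:
    # GTEST_FILTER grammar (roughly): "pos1:pos2-neg1:neg2"; an empty positive
    # part means "match everything", then negative patterns are subtracted.
    positive, _, negative = filter_spec.partition("-")
    pos = positive.split(":") if positive else ["*"]
    neg = negative.split(":") if negative else []
    return any(fnmatchcase(test_name, p) for p in pos) and not any(
        fnmatchcase(test_name, p) for p in neg
    )

SPEC = "-CudaEp*:CudaNhwcTypedTest*:*cpu_*models*"
assert not gtest_filter_match("CudaEpUnittest.Basic", SPEC)  # excluded suite
assert gtest_filter_match("GemmOpTest.Float", SPEC)          # everything else still runs
```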
diff --git a/tools/ci_build/github/azure-pipelines/templates/win-ci.yml b/tools/ci_build/github/azure-pipelines/templates/win-ci.yml
index e046997b4f49a..59950433b3d40 100644
--- a/tools/ci_build/github/azure-pipelines/templates/win-ci.yml
+++ b/tools/ci_build/github/azure-pipelines/templates/win-ci.yml
@@ -25,7 +25,7 @@ parameters:

 - name: runTests
   type: boolean
-  default: false
+  default: true

 - name: buildJava
   type: boolean
@@ -71,10 +71,6 @@ parameters:
   - 11.8
   - 12.2

-- name: ComboTests
-  type: boolean
-  default: false
-
 - name: SpecificArtifact
   displayName: Use Specific Artifact
   type: boolean
@@ -226,7 +222,7 @@ stages:
         condition: and(succeeded(), eq('${{ parameters.runTests}}', true))
         inputs:
           scriptPath: '$(Build.SourcesDirectory)\tools\ci_build\build.py'
-          arguments: '--config RelWithDebInfo --use_binskim_compliant_compile_flags --enable_lto --disable_rtti --build_dir $(Build.BinariesDirectory) --test --skip_submodule_sync --build_shared_lib --cmake_generator "$(VSGenerator)" --enable_onnx_tests $(TelemetryOption) ${{ parameters.buildparameter }}'
+          arguments: '--config RelWithDebInfo --use_binskim_compliant_compile_flags --enable_lto --disable_rtti --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --test --cmake_generator "$(VSGenerator)" --enable_onnx_tests $(TelemetryOption) ${{ parameters.buildparameter }}'
         workingDirectory: '$(Build.BinariesDirectory)'
     - ${{ else }}:
       - powershell: |
@@ -338,10 +334,6 @@ stages:
         displayName: 'Clean Agent Directories'
         condition: always()

-      - script:
-          echo ${{ parameters.SpecificArtifact }}
-        displayName: 'Print Specific Artifact'
-
       - checkout: self
         clean: true
         submodules: none
@@ -407,35 +399,13 @@ stages:
         displayName: 'Append dotnet x86 Directory to PATH'
         condition: and(succeeded(), eq('${{ parameters.buildArch}}', 'x86'))

-      - ${{ if eq(parameters.ComboTests, 'true') }}:
-        - task: PythonScript@0
-          displayName: 'test excludes CUDA'
-          condition: and(succeeded(), eq('${{ parameters.runTests}}', true))
-          inputs:
-            scriptPath: '$(Build.SourcesDirectory)\tools\ci_build\build.py'
-            arguments: '--config RelWithDebInfo --use_binskim_compliant_compile_flags --enable_lto --disable_rtti --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --test --enable_onnx_tests $(TelemetryOption) '
-            workingDirectory: '$(Build.BinariesDirectory)'
-          env:
-            NO_CUDA_TEST: '1'
-            GTEST_FILTER: '-CudaEp*:CudaNhwcTypedTest*' # Exclude CUDA EP tests under providers/cuda/
-        - task: PythonScript@0
-          displayName: 'test excludes DML'
-          condition: and(succeeded(), eq('${{ parameters.runTests}}', true))
-          inputs:
-            scriptPath: '$(Build.SourcesDirectory)\tools\ci_build\build.py'
-            arguments: '--config RelWithDebInfo --use_binskim_compliant_compile_flags --enable_lto --disable_rtti --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --test --enable_onnx_tests $(TelemetryOption) '
-            workingDirectory: '$(Build.BinariesDirectory)'
-          env:
-            NO_DML_TEST: '1'
-      - ${{ else }}:
-        - task: PythonScript@0
-          displayName: 'test'
-          condition: and(succeeded(), eq('${{ parameters.runTests}}', true))
-          inputs:
-            scriptPath: '$(Build.SourcesDirectory)\tools\ci_build\build.py'
-            arguments: '--config RelWithDebInfo --use_binskim_compliant_compile_flags --enable_lto --disable_rtti --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --test --enable_onnx_tests $(TelemetryOption) '
-            workingDirectory: '$(Build.BinariesDirectory)'
-
+      - task: PythonScript@0
+        displayName: 'test'
+        condition: and(succeeded(), eq('${{ parameters.runTests}}', true))
+        inputs:
+          scriptPath: '$(Build.SourcesDirectory)\tools\ci_build\build.py'
+          arguments: '--config RelWithDebInfo --use_binskim_compliant_compile_flags --enable_lto --disable_rtti --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --test --enable_onnx_tests $(TelemetryOption) '
+          workingDirectory: '$(Build.BinariesDirectory)'

       # Previous stage only assembles the java binaries, testing will be done in this stage with GPU machine
       - ${{ if eq(parameters.buildJava, 'true') }}:
         - template: make_java_win_binaries.yml
diff --git a/tools/ci_build/github/azure-pipelines/win-gpu-cuda-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-gpu-cuda-ci-pipeline.yml
index 67fd47c3150af..47ece37e66e09 100644
--- a/tools/ci_build/github/azure-pipelines/win-gpu-cuda-ci-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/win-gpu-cuda-ci-pipeline.yml
@@ -62,28 +62,4 @@ stages:
       RunOnnxRuntimeTests: ${{ parameters.RunOnnxRuntimeTests }}
       ORT_EP_NAME: CUDA
       WITH_CACHE: true
-      MachinePool: onnxruntime-Win2022-GPU-A10
-
-- stage: cuda_dml
-  dependsOn: []
-  jobs:
-    - template: templates/jobs/win-ci-vs-2022-job.yml
-      parameters:
-        BuildConfig: 'RelWithDebInfo'
-        EnvSetupScript: setup_env_cuda.bat
-        buildArch: x64
-        additionalBuildFlags: >-
-          --build_java --build_nodejs --use_cuda --cuda_home="$(Agent.TempDirectory)\v${{ parameters.CudaVersion }}"
-          --enable_cuda_profiling --enable_transformers_tool_test
-          --use_dml
-          --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86
-          --cmake_extra_defines onnxruntime_BUILD_UNIT_TESTS=ON
-          --cmake_extra_defines onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON
-        msbuildPlatform: x64
-        isX86: false
-        job_name_suffix: x64_RelWithDebInfo
-        RunOnnxRuntimeTests: ${{ parameters.RunOnnxRuntimeTests }}
-        ORT_EP_NAME: CUDA
-        EnablePython: false
-        WITH_CACHE: true
-        MachinePool: onnxruntime-Win2022-GPU-A10
+      MachinePool: onnxruntime-Win2022-GPU-A10
\ No newline at end of file
diff --git a/tools/ci_build/github/azure-pipelines/win-gpu-dml-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-gpu-dml-ci-pipeline.yml
index 911d99cd2adf3..94b0aa680d54d 100644
--- a/tools/ci_build/github/azure-pipelines/win-gpu-dml-ci-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/win-gpu-dml-ci-pipeline.yml
@@ -43,11 +43,11 @@ stages:
       BuildConfig: 'RelWithDebInfo'
       EnvSetupScript: setup_env.bat
       buildArch: x64
-      additionalBuildFlags: --enable_pybind --use_dml --enable_wcos --use_winml 
+      additionalBuildFlags: --enable_pybind --use_dml --enable_wcos --use_winml
       msbuildPlatform: x64
       isX86: false
       job_name_suffix: x64_RelWithDebInfo
       RunOnnxRuntimeTests: ${{ parameters.RunOnnxRuntimeTests }}
       ORT_EP_NAME: DML
       WITH_CACHE: false
-      MachinePool: onnxruntime-Win2022-GPU-dml-A10
+      MachinePool: onnxruntime-Win2022-GPU-dml-A10
\ No newline at end of file
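End-to-end, the packaging test step now boils down to installing the built wheel and running the backend series with no device argument; which filters apply presumably follows from the installed build rather than from `--devices`. A hypothetical Python rendering of the step, for comparison with the deleted template (the function and its arguments are illustrative, not part of the repo):

```python
import subprocess
import sys

def run_backend_tests(test_data_dir, devices=None):
    """Hypothetical driver mirroring the CI step before/after this change."""
    cmd = [sys.executable, "onnx_backend_test_series.py"]
    if devices:
        # Pre-change behavior (deleted py_packaging_test_step.yml): one run per
        # EP, e.g. devices=["CUDA"] or ["DML"], passed as --devices ... -v.
        cmd += ["--devices", *devices, "-v"]
    subprocess.run(cmd, check=True, cwd=test_data_dir)

# Post-change, the inlined step is simply: python onnx_backend_test_series.py
# run_backend_tests(r"C:\agent\_temp\ort_test_data")
```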