diff --git a/java/src/test/java/ai/onnxruntime/InferenceTest.java b/java/src/test/java/ai/onnxruntime/InferenceTest.java
index 15d89b536b39a..e11537492d3a7 100644
--- a/java/src/test/java/ai/onnxruntime/InferenceTest.java
+++ b/java/src/test/java/ai/onnxruntime/InferenceTest.java
@@ -737,7 +737,6 @@ public void testCoreML() throws OrtException {
     runProvider(OrtProvider.CORE_ML);
   }
 
-  @Disabled("DirectML Java API hasn't been supported yet")
   @Test
   @EnabledIfSystemProperty(named = "USE_DML", matches = "1")
   public void testDirectML() throws OrtException {
diff --git a/java/src/test/java/ai/onnxruntime/providers/ProviderOptionsTest.java b/java/src/test/java/ai/onnxruntime/providers/ProviderOptionsTest.java
index fa0b6fd0ef9d9..57c4eb3577fd0 100644
--- a/java/src/test/java/ai/onnxruntime/providers/ProviderOptionsTest.java
+++ b/java/src/test/java/ai/onnxruntime/providers/ProviderOptionsTest.java
@@ -27,7 +27,6 @@
 import java.util.HashMap;
 import java.util.Map;
 import org.junit.jupiter.api.Test;
-import org.junit.jupiter.api.condition.DisabledIfSystemProperty;
 import org.junit.jupiter.api.condition.EnabledIfSystemProperty;
 
 public class ProviderOptionsTest {
@@ -35,7 +34,6 @@ public class ProviderOptionsTest {
 
   @Test
   @EnabledIfSystemProperty(named = "USE_CUDA", matches = "1")
-  @DisabledIfSystemProperty(named = "NO_CUDA_TEST", matches = "1")
   public void testCUDAOptions() throws OrtException {
     // Test standard options
     OrtCUDAProviderOptions cudaOpts = new OrtCUDAProviderOptions(0);
@@ -63,7 +61,6 @@ public void testCUDAOptions() throws OrtException {
 
   @Test
   @EnabledIfSystemProperty(named = "USE_TENSORRT", matches = "1")
-  @DisabledIfSystemProperty(named = "NO_CUDA_TEST", matches = "1")
   public void testTensorRT() throws OrtException {
     // Test standard options
     OrtTensorRTProviderOptions rtOpts = new OrtTensorRTProviderOptions(0);
diff --git a/onnxruntime/test/common/cuda_op_test_utils.h b/onnxruntime/test/common/cuda_op_test_utils.h
index d3e069237217e..6f3e460628566 100644
--- a/onnxruntime/test/common/cuda_op_test_utils.h
+++ b/onnxruntime/test/common/cuda_op_test_utils.h
@@ -5,11 +5,6 @@
 
 #include "test/util/include/default_providers.h"
 
-#define SKIP_CUDA_TEST_WITH_DML                                            \
-  if (DefaultCudaExecutionProvider() == nullptr) {                         \
-    GTEST_SKIP() << "CUDA Tests are not supported while DML is enabled";   \
-  }
-
 namespace onnxruntime {
 namespace test {
 
@@ -18,10 +13,6 @@ namespace test {
 int GetCudaArchitecture();
 
 inline bool HasCudaEnvironment(int min_cuda_architecture) {
-  if (DefaultCudaExecutionProvider() == nullptr) {
-    return false;
-  }
-
   if (DefaultCudaExecutionProvider().get() == nullptr) {
     return false;
   }
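With the DML escape hatch removed, `HasCudaEnvironment()` is again a plain availability-plus-architecture check. A minimal usage sketch (my illustration, not part of this patch; the test name and the 530 threshold are assumed example values):

```cpp
// Hypothetical test showing the guard pattern HasCudaEnvironment() supports;
// 530 (sm_53, a common fp16 threshold in these tests) is an assumed example.
#include "test/common/cuda_op_test_utils.h"
#include "gtest/gtest.h"

TEST(ExampleCudaGuard, SkipsWhenNoCudaEnvironment) {
  if (!onnxruntime::test::HasCudaEnvironment(/*min_cuda_architecture=*/530)) {
    GTEST_SKIP() << "CUDA environment not available";
  }
  // ... CUDA-dependent assertions would go here ...
}
```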
diff --git a/onnxruntime/test/contrib_ops/beam_search_test.cc b/onnxruntime/test/contrib_ops/beam_search_test.cc
index 8c69e2d9810b8..9f4ee071925b4 100644
--- a/onnxruntime/test/contrib_ops/beam_search_test.cc
+++ b/onnxruntime/test/contrib_ops/beam_search_test.cc
@@ -75,9 +75,6 @@ TEST(BeamSearchTest, GptBeamSearchFp32) {
   const char* const output_names[] = {"sequences"};
 
   Ort::SessionOptions session_options;
-#if defined(USE_CUDA) && defined(USE_DML)
-  SKIP_CUDA_TEST_WITH_DML;
-#endif
 #ifdef USE_CUDA
   OrtCUDAProviderOptionsV2 cuda_options;
   cuda_options.use_tf32 = false;
@@ -171,9 +168,6 @@ TEST(BeamSearchTest, GptBeamSearchFp16) {
   bool enable_rocm = (nullptr != DefaultRocmExecutionProvider().get());
   if (enable_cuda || enable_rocm) {
     Ort::SessionOptions session_options;
-#if defined(USE_CUDA) && defined(USE_DML)
-    SKIP_CUDA_TEST_WITH_DML;
-#endif
 #ifdef USE_CUDA
     OrtCUDAProviderOptionsV2 cuda_options;
     cuda_options.use_tf32 = false;
diff --git a/onnxruntime/test/contrib_ops/bias_dropout_op_test.cc b/onnxruntime/test/contrib_ops/bias_dropout_op_test.cc
index 297629b015796..027d4b3fff1b0 100644
--- a/onnxruntime/test/contrib_ops/bias_dropout_op_test.cc
+++ b/onnxruntime/test/contrib_ops/bias_dropout_op_test.cc
@@ -181,9 +181,6 @@ void RunBiasDropoutTest(const bool use_mask, const std::vector<int64_t>& input_s
   t.SetCustomOutputVerifier(output_verifier);
   std::vector<std::unique_ptr<IExecutionProvider>> t_eps;
 #ifdef USE_CUDA
-  if (DefaultCudaExecutionProvider() == nullptr) {
-    return;
-  }
   t_eps.emplace_back(DefaultCudaExecutionProvider());
 #elif USE_ROCM
   t_eps.emplace_back(DefaultRocmExecutionProvider());
diff --git a/onnxruntime/test/contrib_ops/bitmask_dropout_op_test.cc b/onnxruntime/test/contrib_ops/bitmask_dropout_op_test.cc
index 26b0e3a4dd7a9..7ca4e1004066c 100644
--- a/onnxruntime/test/contrib_ops/bitmask_dropout_op_test.cc
+++ b/onnxruntime/test/contrib_ops/bitmask_dropout_op_test.cc
@@ -61,9 +61,7 @@ void RunTestForInference(const std::vector<int64_t>& input_dims, bool has_ratio
 
   std::vector<std::unique_ptr<IExecutionProvider>> test_eps;
 #ifdef USE_CUDA
-  if (DefaultCudaExecutionProvider() != nullptr) {
-    test_eps.emplace_back(DefaultCudaExecutionProvider());
-  }
+  test_eps.emplace_back(DefaultCudaExecutionProvider());
 #elif USE_ROCM
   test_eps.emplace_back(DefaultRocmExecutionProvider());
 #endif
@@ -124,9 +122,6 @@ void RunTestForTraining(const std::vector<int64_t>& input_dims) {
 
   std::vector<std::unique_ptr<IExecutionProvider>> dropout_eps;
 #ifdef USE_CUDA
-  if (DefaultCudaExecutionProvider() == nullptr) {
-    return;
-  }
   dropout_eps.emplace_back(DefaultCudaExecutionProvider());
 #elif USE_ROCM
   dropout_eps.emplace_back(DefaultRocmExecutionProvider());
diff --git a/onnxruntime/test/contrib_ops/layer_norm_test.cc b/onnxruntime/test/contrib_ops/layer_norm_test.cc
index b414a98c4e756..46082e1b0cd31 100644
--- a/onnxruntime/test/contrib_ops/layer_norm_test.cc
+++ b/onnxruntime/test/contrib_ops/layer_norm_test.cc
@@ -2,7 +2,6 @@
 // Licensed under the MIT License.
#include "test/providers/compare_provider_test_utils.h" -#include "test/util/include/default_providers.h" namespace onnxruntime { namespace test { @@ -80,20 +79,14 @@ static void TestLayerNorm(const std::vector& x_dims, #endif #ifdef USE_CUDA - if (DefaultCudaExecutionProvider() != nullptr) { - test.CompareWithCPU(kCudaExecutionProvider); - } + test.CompareWithCPU(kCudaExecutionProvider); #elif USE_ROCM test.CompareWithCPU(kRocmExecutionProvider); +#elif USE_DML + test.CompareWithCPU(kDmlExecutionProvider); #elif USE_WEBGPU test.CompareWithCPU(kWebGpuExecutionProvider); #endif - -#ifdef USE_DML - if (DefaultDmlExecutionProvider() != nullptr) { - test.CompareWithCPU(kDmlExecutionProvider); - } -#endif } TEST(CudaKernelTest, LayerNorm_NullInput) { diff --git a/onnxruntime/test/contrib_ops/matmul_4bits_test.cc b/onnxruntime/test/contrib_ops/matmul_4bits_test.cc index 6dedce24e7e07..eebe9197573c6 100644 --- a/onnxruntime/test/contrib_ops/matmul_4bits_test.cc +++ b/onnxruntime/test/contrib_ops/matmul_4bits_test.cc @@ -490,17 +490,13 @@ void RunTest(int64_t M, int64_t N, int64_t K, int64_t block_size, int64_t accura std::vector> execution_providers; if (use_float16) { #ifdef USE_CUDA - if (DefaultCudaExecutionProvider() != nullptr) { - execution_providers.push_back(DefaultCudaExecutionProvider()); - } + execution_providers.push_back(DefaultCudaExecutionProvider()); #endif #ifdef USE_ROCM execution_providers.push_back(DefaultRocmExecutionProvider()); #endif #ifdef USE_DML - if (DefaultDmlExecutionProvider() != nullptr) { - execution_providers.push_back(DefaultDmlExecutionProvider()); - } + execution_providers.push_back(DefaultDmlExecutionProvider()); #endif #ifdef USE_WEBGPU execution_providers.push_back(DefaultWebGpuExecutionProvider()); @@ -518,11 +514,8 @@ void RunTest(int64_t M, int64_t N, int64_t K, int64_t block_size, int64_t accura } // namespace TEST(MatMulNBits, Float16Cuda) { -#if defined(USE_CUDA) || defined(USE_ROCM) || defined(USE_DML) - std::vector has_gidx_options = {true, false}; - if (DefaultDmlExecutionProvider() != nullptr) { - has_gidx_options.assign(1, false); - } +#if defined(USE_CUDA) || defined(USE_ROCM) + auto has_gidx_options = {true, false}; #else auto has_gidx_options = {false}; #endif @@ -533,9 +526,7 @@ TEST(MatMulNBits, Float16Cuda) { for (auto block_size : {16, 32, 64, 128}) { for (auto has_gidx : has_gidx_options) { #ifdef USE_DML - if (DefaultDmlExecutionProvider() != nullptr) { - RunTest(M, N, K, block_size, 0, false, true, has_gidx, true, 0.04f); - } + RunTest(M, N, K, block_size, 0, false, true, has_gidx, true, 0.04f); #else RunTest(M, N, K, block_size, 0, false, true, has_gidx); RunTest(M, N, K, block_size, 0, true, true, has_gidx, false); @@ -548,16 +539,12 @@ TEST(MatMulNBits, Float16Cuda) { } TEST(MatMulNBits, Float16Large) { -#if defined(USE_CUDA) || defined(USE_DML) +#ifdef USE_DML // For some reason, the A10 machine that runs these tests during CI has a much bigger error than all retail // machines we tested on. All consumer-grade machines from Nvidia/AMD/Intel seem to pass these tests with an // absolute error of 0.08, but the A10 has errors going as high as 0.22. Ultimately, given the large number // of elements in this test, ULPs should probably be used instead of absolute/relative tolerances. 
-  float abs_error = 0.05f;
-  if (DefaultDmlExecutionProvider() != nullptr) {
-    // it means the ep is dml in runtime, the abs_error is changed to 0.3f
-    abs_error = 0.3f;
-  }
+  float abs_error = 0.3f;
 #elif USE_WEBGPU
   // See Intel A770 to pass these tests with an absolute error of 0.08.
   float abs_error = 0.08f;
@@ -573,6 +560,7 @@
     }
   }
 }
+
 #endif  // defined(USE_CUDA) || defined(USE_ROCM) || defined(USE_DML)
 }  // namespace test
 }  // namespace onnxruntime
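The Float16Large comment above floats replacing absolute tolerances with ULP distances; a self-contained sketch of that idea (my illustration, not code from this patch or the repository):

```cpp
// Sketch of a ULP-distance comparison for float32, as the comment suggests.
#include <cmath>
#include <cstdint>
#include <cstring>
#include <limits>

// Map float bits onto a monotonic integer line so adjacent representable
// floats differ by exactly 1 (+0.0f and -0.0f both map to 0).
static int64_t ToOrderedInt(float f) {
  int32_t i;
  std::memcpy(&i, &f, sizeof(i));
  return i >= 0 ? int64_t{i} : INT64_C(-2147483648) - int64_t{i};
}

// Number of representable float32 values between a and b.
static int64_t UlpDistance(float a, float b) {
  if (std::isnan(a) || std::isnan(b)) return std::numeric_limits<int64_t>::max();
  const int64_t d = ToOrderedInt(a) - ToOrderedInt(b);
  return d < 0 ? -d : d;
}
```

A tolerance expressed in ULPs scales with the magnitude of the values being compared, which is why it would behave more uniformly across consumer GPUs and the A10 than a single absolute bound.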
diff --git a/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc b/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc
index d88c3131a4ca5..8d7629b5fda1c 100644
--- a/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc
+++ b/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc
@@ -227,7 +227,7 @@ TEST(MatMulIntegerToFloat, HasZeroPoint_HasBias_test_U8S8) {
 }
 
 // DML EP supports Float16 output type and Signed A Matrix and Unsigned B Matric for Float32 output
-#if defined(USE_DML) && !defined(USE_CUDA)
+#if defined(USE_DML)
 
 TEST(MatMulIntegerToFloat, HasZeroPoint_NoBias_test_S8U8) {
   RunMatMulIntegerToFloatTest();
diff --git a/onnxruntime/test/contrib_ops/tensor_op_test.cc b/onnxruntime/test/contrib_ops/tensor_op_test.cc
index d5e2ddebfe67f..bc2ff5f4f724d 100644
--- a/onnxruntime/test/contrib_ops/tensor_op_test.cc
+++ b/onnxruntime/test/contrib_ops/tensor_op_test.cc
@@ -121,15 +121,7 @@ void MeanVarianceNormalizationAcrossChannels(bool across_channels, bool normaliz
   test.AddAttribute("normalize_variance", normalize_variance ? one : zero);
   test.AddInput<float>("input", {N, C, H, W}, X);
   test.AddOutput<float>("output", {N, C, H, W}, result);
-#if defined(USE_CUDA) && defined(USE_DML)
-  if (DefaultCudaExecutionProvider() == nullptr) {
-    test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kOpenVINOExecutionProvider, kCudaExecutionProvider, kTensorrtExecutionProvider});
-  } else if (DefaultDmlExecutionProvider() == nullptr) {
-    test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kOpenVINOExecutionProvider, kDmlExecutionProvider, kTensorrtExecutionProvider});
-  }
-#else
   test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kOpenVINOExecutionProvider, kTensorrtExecutionProvider});  // OpenVINO doesn't support MVN operator below opset 9. TensorRT doesn't support opset 8 of MVN operator.
-#endif
 }
 
 void MeanVarianceNormalizationPerChannel(bool across_channels, bool normalize_variance) {
@@ -196,15 +188,7 @@ void MeanVarianceNormalizationPerChannel(bool across_channels, bool normalize_va
   test.AddAttribute("normalize_variance", normalize_variance ? one : zero);
   test.AddInput<float>("input", {N, C, H, W}, X);
   test.AddOutput<float>("output", {N, C, H, W}, result);
-#if defined(USE_CUDA) && defined(USE_DML)
-  if (DefaultCudaExecutionProvider() == nullptr) {
-    test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kOpenVINOExecutionProvider, kCudaExecutionProvider, kTensorrtExecutionProvider});
-  } else if (DefaultDmlExecutionProvider() == nullptr) {
-    test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kOpenVINOExecutionProvider, kDmlExecutionProvider, kTensorrtExecutionProvider});
-  }
-#else
   test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kOpenVINOExecutionProvider, kTensorrtExecutionProvider});  // OpenVINO doesn't support MVN operator below opset 9. TensorRT doesn't support opset 8 of MVN operator.
-#endif
 }
 
 TEST(MVNContribOpTest, MeanVarianceNormalizationCPUTest_Version1_TO_8) {
@@ -246,9 +230,7 @@ TEST(UnfoldTensorOpTest, LastDim) {
 
   std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
 #ifdef USE_CUDA
-  if (DefaultCudaExecutionProvider() != nullptr) {
-    execution_providers.push_back(DefaultCudaExecutionProvider());
-  }
+  execution_providers.push_back(DefaultCudaExecutionProvider());
 #endif
   execution_providers.push_back(DefaultCpuExecutionProvider());
   tester.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers);
diff --git a/onnxruntime/test/framework/allocation_planner_test.cc b/onnxruntime/test/framework/allocation_planner_test.cc
index adab93908cdc4..eaebac177ca91 100644
--- a/onnxruntime/test/framework/allocation_planner_test.cc
+++ b/onnxruntime/test/framework/allocation_planner_test.cc
@@ -28,7 +28,6 @@ using json = nlohmann::json;
 #ifdef USE_CUDA
 #include "core/providers/cuda/cuda_execution_provider.h"
 #include "core/providers/cuda/cuda_provider_factory.h"
-#include "test/common/cuda_op_test_utils.h"
 #endif  // USE_CUDA
 #include "core/session/onnxruntime_session_options_config_keys.h"
 using namespace ONNX_NAMESPACE;
@@ -897,9 +896,6 @@ TEST_F(PlannerTest, LocationPlanningForPassThroughExplicitAndImplicitSubgraphInp
   SessionOptions so;
   InferenceSession sess{so, GetEnvironment()};
 
-  if (DefaultCudaExecutionProvider() == nullptr) {
-    return;
-  }
   auto status = sess.RegisterExecutionProvider(DefaultCudaExecutionProvider());
   ASSERT_TRUE(status.IsOK());
@@ -1042,9 +1038,6 @@ TEST_F(PlannerTest, LocationPlanningForInitializersOnlyUsedInANestedSubgraph) {
   SessionOptions so;
   InferenceSession sess{so, GetEnvironment()};
 
-  if (DefaultCudaExecutionProvider() == nullptr) {
-    return;
-  }
   auto status = sess.RegisterExecutionProvider(DefaultCudaExecutionProvider());
   ASSERT_TRUE(status.IsOK());
@@ -1152,9 +1145,6 @@ TEST_F(PlannerTest, LocationPlanningForInitializersUsedOnDifferentDevicesInMainG
   SessionOptions so;
   InferenceSession sess{so, GetEnvironment()};
 
-  if (DefaultCudaExecutionProvider() == nullptr) {
-    return;
-  }
   auto status = sess.RegisterExecutionProvider(DefaultCudaExecutionProvider());
   ASSERT_TRUE(status.IsOK());
@@ -1247,9 +1237,6 @@ TEST_F(PlannerTest, LocationPlanningForImplicitInputsWithoutExplicitConsumersInM
   SessionOptions so;
   InferenceSession sess{so, GetEnvironment()};
 
-  if (DefaultCudaExecutionProvider() == nullptr) {
-    return;
-  }
   auto status = sess.RegisterExecutionProvider(DefaultCudaExecutionProvider());
   ASSERT_TRUE(status.IsOK());
@@ -1282,10 +1269,6 @@ TEST_F(PlannerTest, LocationPlanningForImplicitInputsWithoutExplicitConsumersInM
 // Test MultiStream scenario for the graph:
 // node1(CPU ep)->node2(CPU ep)->node3(CUDA ep)->node4(CPU ep)
 TEST_F(PlannerTest, MultiStream) {
-#if defined(USE_CUDA) && defined(USE_DML)
-  SKIP_CUDA_TEST_WITH_DML;
-#endif
-
   ONNX_NAMESPACE::TensorProto tensor;
   tensor.add_dims(1);
   tensor.add_float_data(1.0f);
@@ -1304,7 +1287,6 @@ TEST_F(PlannerTest, MultiStream) {
   onnxruntime::ProviderInfo_CUDA& ep = onnxruntime::GetProviderInfo_CUDA();
   auto epFactory = ep.CreateExecutionProviderFactory(epi);
   std::unique_ptr<IExecutionProvider> execution_provider = epFactory->CreateProvider();
-
   ORT_THROW_IF_ERROR(GetExecutionProviders().Add("CUDAExecutionProvider", std::move(execution_provider)));
 
   CreatePlan({}, false);
@@ -1332,9 +1314,6 @@
 // node3
 // All 3 nodes are CUDA EP, node1 is in stream0, node2 is in stream1, node3 is in stream2
 TEST_F(PlannerTest, MultiStream1StreamWaitFor2Streams) {
-#if defined(USE_CUDA) && defined(USE_DML)
-  SKIP_CUDA_TEST_WITH_DML;
-#endif
   std::unique_ptr<::onnxruntime::KernelDef> cudaKernel = KernelDefBuilder().SetName("Transpose").Provider(kCudaExecutionProvider).SinceVersion(1, 10).Build();
   std::unique_ptr<::onnxruntime::KernelDef> cudaKernelAdd = KernelDefBuilder().SetName("Add").Provider(kCudaExecutionProvider).SinceVersion(1, 10).Build();
   std::string Graph_input("Graph_input"), Arg1("Arg1"), Arg2("Arg2"), Arg3("Arg3"), node1("node1"), node2("node2"), node3("node3");
@@ -1376,9 +1355,6 @@
 // stream 1: node2 (CPU EP)
 // node1's output, which is consumed by both node2 and node3, is in CPU.
 TEST_F(PlannerTest, MultiStreamCudaEPNodeCPUOutput) {
-#if defined(USE_CUDA) && defined(USE_DML)
-  SKIP_CUDA_TEST_WITH_DML;
-#endif
   MemcpyToHostInCuda_TransposeInCudaAndCpu("./testdata/multi_stream_models/memcpyToHost_same_stream_with_transpose.json");
   EXPECT_EQ(GetState().GetExecutionPlan()->execution_plan.size(), 2) << "2 logic streams";
   EXPECT_EQ(GetState().GetExecutionPlan()->execution_plan[0]->steps_.size(), 5) << "stream 0 has 5 steps";
@@ -1400,11 +1376,6 @@
 // TODO(leca): there is a bug in the corresponding graph that node2 will be visited twice when traversing node1's output nodes
 // (see: for (auto it = node->OutputNodesBegin(); it != node->OutputNodesEnd(); ++it) in BuildExecutionPlan()). We can just break the loop and don't need the extra variables once it is fixed
 TEST_F(PlannerTest, MultiStreamMultiOutput) {
-#if defined(USE_CUDA) && defined(USE_DML)
-  if (DefaultCudaExecutionProvider() == nullptr) {
-    return;
-  }
-#endif
   std::unique_ptr<::onnxruntime::KernelDef> cudaKernel = KernelDefBuilder().SetName("RNN").Provider(kCudaExecutionProvider).SinceVersion(7).Build();
   std::string Graph_input1("Graph_input1"), Graph_input2("Graph_input2"), Graph_input3("Graph_input3"), Arg1("Arg1"), Arg2("Arg2"), Arg3("Arg3"), node1("node1"), node2("node2");
   std::vector<onnxruntime::NodeArg*> input1{Arg(Graph_input1), Arg(Graph_input2), Arg(Graph_input3)}, output1{Arg(Arg1), Arg(Arg2)}, input2{Arg(Arg1), Arg(Arg2)}, output2{Arg(Arg3)};
@@ -1442,9 +1413,6 @@
 // TODO(leca): the ideal case is there is only 1 wait step before launching node3,
 // as there is a specific order between node1 and node2 if they are in the same stream, thus node3 will only need to wait the latter one
 TEST_F(PlannerTest, MultiStream2NodesSameStreamConsumedBy1NodeInDifferentStream) {
-#if defined(USE_CUDA) && defined(USE_DML)
-  SKIP_CUDA_TEST_WITH_DML;
-#endif
   std::unique_ptr<::onnxruntime::KernelDef> cudaKernel = KernelDefBuilder().SetName("Transpose").Provider(kCudaExecutionProvider).SinceVersion(1, 10).Build();
   std::string Graph_input1("Graph_input1"), Graph_input2("Graph_input2"), Graph_input3("Graph_input3"), Arg1("Arg1"), Arg2("Arg2"), Arg3("Arg3"), node1("node1"), node2("node2"), node3("node3");
   std::vector<onnxruntime::NodeArg*> input1{Arg(Graph_input1)}, input2{Arg(Graph_input2)}, output1{Arg(Arg1)}, output2{Arg(Arg2)}, input3{Arg(Arg1), Arg(Arg2)}, output3{Arg(Arg3)};
@@ -1482,9 +1450,6 @@
 
 #if !defined(__wasm__) && defined(ORT_ENABLE_STREAM)
 TEST_F(PlannerTest, ParaPlanCreation) {
-#if defined(USE_CUDA) && defined(USE_DML)
-  SKIP_CUDA_TEST_WITH_DML;
-#endif
   TypeProto graph_in_type;
   graph_in_type.mutable_tensor_type()->set_elem_type(TensorProto_DataType_FLOAT);
   auto* graph_in_shape = graph_in_type.mutable_tensor_type()->mutable_shape();
@@ -1926,10 +1891,6 @@
 }
 
 TEST_F(PlannerTest, TestMultiStreamConfig) {
-#if defined(USE_CUDA) && defined(USE_DML)
-  SKIP_CUDA_TEST_WITH_DML;
-#endif
-
   const char* type = "DeviceBasedPartitioner";
   constexpr size_t type_len = 22;
@@ -2003,10 +1964,6 @@ TEST_F(PlannerTest, TestMultiStreamSaveConfig) {
 
 // Load with partition config where a node is missing, session load expected to fail.
 TEST_F(PlannerTest, TestMultiStreamMissingNodeConfig) {
-#if defined(USE_CUDA) && defined(USE_DML)
-  SKIP_CUDA_TEST_WITH_DML;
-#endif
-
   const char* config_file_path = "./testdata/multi_stream_models/conv_add_relu_single_stream_missing_node.json";
   SessionOptions sess_opt;
   sess_opt.graph_optimization_level = TransformerLevel::Default;
@@ -2027,9 +1984,6 @@
 
 // Load with partition config where streams and devices has mismatch
 TEST_F(PlannerTest, TestMultiStreamMismatchDevice) {
-#if defined(USE_CUDA) && defined(USE_DML)
-  SKIP_CUDA_TEST_WITH_DML;
-#endif
   const char* config_file_path = "./testdata/multi_stream_models/conv_add_relu_single_stream_mismatch_device.json";
   SessionOptions sess_opt;
   sess_opt.graph_optimization_level = TransformerLevel::Default;
@@ -2055,9 +2009,6 @@ TEST_F(PlannerTest, TestCpuIf) {
   sess_opt.graph_optimization_level = TransformerLevel::Default;
   InferenceSession sess(sess_opt, GetEnvironment(), ORT_TSTR("./testdata/multi_stream_models/cpu_if.onnx"));
-  if (DefaultCudaExecutionProvider() == nullptr) {
-    return;
-  }
   ASSERT_STATUS_OK(sess.RegisterExecutionProvider(DefaultCudaExecutionProvider()));
   ASSERT_STATUS_OK(sess.Load());
   ASSERT_STATUS_OK(sess.Initialize());
@@ -2118,17 +2069,10 @@
 //  onnx.save(model, 'issue_19480.onnx')
 //
 TEST(AllocationPlannerTest, ReusedInputCrossDifferentStreams) {
-#if defined(USE_CUDA) && defined(USE_DML)
-  SKIP_CUDA_TEST_WITH_DML;
-#endif
-
   SessionOptions sess_opt;
   sess_opt.graph_optimization_level = TransformerLevel::Default;
   InferenceSession sess(sess_opt, GetEnvironment(), ORT_TSTR("./testdata/multi_stream_models/issue_19480.onnx"));
-  if (DefaultCudaExecutionProvider() == nullptr) {
-    return;
-  }
   auto status = sess.RegisterExecutionProvider(DefaultCudaExecutionProvider());
   status = sess.Load();
   status = sess.Initialize();
diff --git a/onnxruntime/test/framework/cuda/fence_cuda_test.cc b/onnxruntime/test/framework/cuda/fence_cuda_test.cc
index 3e5ef30e7ebef..e28327941dda4 100644
--- a/onnxruntime/test/framework/cuda/fence_cuda_test.cc
+++ b/onnxruntime/test/framework/cuda/fence_cuda_test.cc
@@ -115,9 +115,6 @@ TEST(CUDAFenceTests, DISABLED_PartOnCPU) {
   SessionOptions so;
   FenceCudaTestInferenceSession session(so, GetEnvironment());
   ASSERT_STATUS_OK(LoadInferenceSessionFromModel(session, *model));
-  if (DefaultCudaExecutionProvider() == nullptr) {
-    return;
-  }
   ASSERT_STATUS_OK(session.RegisterExecutionProvider(DefaultCudaExecutionProvider()));
   ASSERT_TRUE(session.Initialize().IsOK());
   ASSERT_TRUE(1 == CountCopyNodes(graph));
@@ -167,9 +164,6 @@ TEST(CUDAFenceTests, TileWithInitializer) {
   SessionOptions so;
   FenceCudaTestInferenceSession session(so, GetEnvironment());
   ASSERT_STATUS_OK(LoadInferenceSessionFromModel(session, *model));
-  if (DefaultCudaExecutionProvider() == nullptr) {
-    return;
-  }
   ASSERT_STATUS_OK(session.RegisterExecutionProvider(DefaultCudaExecutionProvider()));
   ASSERT_STATUS_OK(session.Initialize());
@@ -230,9 +224,6 @@ TEST(CUDAFenceTests, TileWithComputedInput) {
   SessionOptions so;
   FenceCudaTestInferenceSession session(so, GetEnvironment());
   ASSERT_STATUS_OK(LoadInferenceSessionFromModel(session, *model));
-  if (DefaultCudaExecutionProvider() == nullptr) {
-    return;
-  }
   ASSERT_STATUS_OK(session.RegisterExecutionProvider(DefaultCudaExecutionProvider()));
   ASSERT_TRUE(session.Initialize().IsOK());
diff --git a/onnxruntime/test/framework/inference_session_test.cc b/onnxruntime/test/framework/inference_session_test.cc
index 7f4616c964e33..740c566794f15 100644
--- a/onnxruntime/test/framework/inference_session_test.cc
+++ b/onnxruntime/test/framework/inference_session_test.cc
@@ -34,7 +34,6 @@
 #ifdef USE_CUDA
 #include "core/providers/cuda/cuda_provider_factory.h"
 #include "core/providers/cuda/gpu_data_transfer.h"
-#include "test/common/cuda_op_test_utils.h"
 #endif
 #ifdef USE_TENSORRT
 #include "core/providers/tensorrt/tensorrt_provider_options.h"
@@ -636,9 +635,6 @@ TEST(InferenceSessionTests, CheckRunProfilerWithSessionOptions) {
   InferenceSession session_object(so, GetEnvironment());
 #ifdef USE_CUDA
-  if (DefaultCudaExecutionProvider() == nullptr) {
-    return;
-  }
   ASSERT_STATUS_OK(session_object.RegisterExecutionProvider(DefaultCudaExecutionProvider()));
 #endif
 #ifdef USE_ROCM
@@ -693,9 +689,6 @@ TEST(InferenceSessionTests, CheckRunProfilerWithSessionOptions2) {
   InferenceSession session_object(so, GetEnvironment());
 #ifdef USE_CUDA
-  if (DefaultCudaExecutionProvider() == nullptr) {
-    return;
-  }
   ASSERT_STATUS_OK(session_object.RegisterExecutionProvider(DefaultCudaExecutionProvider()));
 #endif
 #ifdef USE_ROCM
@@ -1049,9 +1042,6 @@ static void TestBindHelper(const std::string& log_str,
   if (bind_provider_type == kCudaExecutionProvider || bind_provider_type == kRocmExecutionProvider) {
 #ifdef USE_CUDA
     auto provider = DefaultCudaExecutionProvider();
-    if (provider == nullptr) {
-      return;
-    }
     gpu_provider = provider.get();
     ASSERT_STATUS_OK(session_object.RegisterExecutionProvider(std::move(provider)));
 #endif
@@ -1647,9 +1637,6 @@ TEST(InferenceSessionTests, Test3LayerNestedSubgraph) {
 #if USE_TENSORRT
   ASSERT_STATUS_OK(session_object.RegisterExecutionProvider(DefaultTensorrtExecutionProvider()));
 #elif USE_CUDA
-  if (DefaultCudaExecutionProvider() == nullptr) {
-    return;
-  }
   ASSERT_STATUS_OK(session_object.RegisterExecutionProvider(DefaultCudaExecutionProvider()));
 #elif USE_ROCM
   ASSERT_STATUS_OK(session_object.RegisterExecutionProvider(DefaultRocmExecutionProvider()));
@@ -1802,9 +1789,6 @@ TEST(InferenceSessionTests, Test2LayerNestedSubgraph) {
 #if USE_TENSORRT
   ASSERT_STATUS_OK(session_object.RegisterExecutionProvider(DefaultTensorrtExecutionProvider()));
 #elif USE_CUDA
-  if (DefaultCudaExecutionProvider() == nullptr) {
-    return;
-  }
   ASSERT_STATUS_OK(session_object.RegisterExecutionProvider(DefaultCudaExecutionProvider()));
 #elif USE_ROCM
   ASSERT_STATUS_OK(session_object.RegisterExecutionProvider(DefaultRocmExecutionProvider()));
@@ -2160,9 +2144,6 @@ TEST(InferenceSessionTests, TestStrictShapeInference) {
 #ifdef USE_CUDA
 // disable it, since we are going to enable parallel execution with cuda ep
 TEST(InferenceSessionTests, DISABLED_TestParallelExecutionWithCudaProvider) {
-#if defined(USE_CUDA) && defined(USE_DML)
-  SKIP_CUDA_TEST_WITH_DML;
-#endif
   string model_uri = "testdata/transform/fusion/fuse-conv-bn-mul-add-unsqueeze.onnx";
 
   SessionOptions so;
@@ -2186,10 +2167,6 @@
 }
 
 TEST(InferenceSessionTests, TestArenaShrinkageAfterRun) {
-#if defined(USE_CUDA) && defined(USE_DML)
-  SKIP_CUDA_TEST_WITH_DML;
-#endif
-
   OrtArenaCfg arena_cfg;
   arena_cfg.arena_extend_strategy = 1;  // kSameAsRequested
diff --git a/onnxruntime/test/framework/memcpy_transformer_test.cc b/onnxruntime/test/framework/memcpy_transformer_test.cc
index 2313f00e4d123..6e86e5b58aead 100644
--- a/onnxruntime/test/framework/memcpy_transformer_test.cc
+++ b/onnxruntime/test/framework/memcpy_transformer_test.cc
@@ -9,9 +9,6 @@
 #include "default_providers.h"
 #include "gtest/gtest.h"
 #include "test_utils.h"
-#ifdef USE_CUDA
-#include "test/common/cuda_op_test_utils.h"
-#endif
 #include "test/test_environment.h"
 #include "asserts.h"
@@ -77,9 +74,6 @@ void ExpectCopy(const onnxruntime::Node& source, const std::string copy_op,
 
 #ifdef USE_CUDA
 TEST(TransformerTest, MemcpyTransformerTest) {
-#if defined(USE_CUDA) && defined(USE_DML)
-  SKIP_CUDA_TEST_WITH_DML;
-#endif
   std::unordered_map<std::string, int> domain_to_version;
   domain_to_version[kOnnxDomain] = 7;
   auto model = std::make_shared<Model>("test", false, ModelMetaData(), PathString(),
@@ -112,9 +106,7 @@ TEST(TransformerTest, MemcpyTransformerTest) {
   KernelRegistryManager kernel_registry_manager;
   ExecutionProviders execution_providers;
-#if defined(USE_CUDA)
   ASSERT_STATUS_OK(execution_providers.Add(onnxruntime::kCudaExecutionProvider, DefaultCudaExecutionProvider()));
-#endif
   ASSERT_STATUS_OK(execution_providers.Add(onnxruntime::kCpuExecutionProvider, std::make_unique<CPUExecutionProvider>(CPUExecutionProviderInfo())));
   KernelRegistryManager test_registry_manager;
@@ -137,9 +129,6 @@
 }
 
 TEST(TransformerTest, MemcpyTransformerTestCudaFirst) {
-#if defined(USE_CUDA) && defined(USE_DML)
-  SKIP_CUDA_TEST_WITH_DML;
-#endif
   std::unordered_map<std::string, int> domain_to_version;
   domain_to_version[kOnnxDomain] = 7;
   auto model = std::make_shared<Model>("test", false, ModelMetaData(), PathString(),
@@ -172,9 +161,7 @@
   KernelRegistryManager kernel_registry_manager;
   ExecutionProviders execution_providers;
-
   ASSERT_STATUS_OK(execution_providers.Add(onnxruntime::kCudaExecutionProvider, DefaultCudaExecutionProvider()));
-
   ASSERT_STATUS_OK(execution_providers.Add(onnxruntime::kCpuExecutionProvider, std::make_unique<CPUExecutionProvider>(CPUExecutionProviderInfo())));
   KernelRegistryManager test_registry_manager;
@@ -294,11 +281,7 @@ TEST(TransformerTest, TestInitializerDuplicationInSubgraph) {
   KernelRegistryManager kernel_registry_manager;
   ExecutionProviders execution_providers;
-#if defined(USE_CUDA) && defined(USE_DML)
-  SKIP_CUDA_TEST_WITH_DML;
-#endif
   ASSERT_STATUS_OK(execution_providers.Add(onnxruntime::kCudaExecutionProvider, DefaultCudaExecutionProvider()));
-
   ASSERT_STATUS_OK(execution_providers.Add(onnxruntime::kCpuExecutionProvider, std::make_unique<CPUExecutionProvider>(CPUExecutionProviderInfo())));
   KernelRegistryManager test_registry_manager;
@@ -340,11 +323,7 @@ TEST(TransformerTest, MemcpyTransformerTestGraphInputConsumedOnMultipleDevices)
   KernelRegistryManager kernel_registry_manager;
   ExecutionProviders execution_providers;
-#if defined(USE_CUDA) && defined(USE_DML)
-  SKIP_CUDA_TEST_WITH_DML;
-#endif
   ASSERT_STATUS_OK(execution_providers.Add(onnxruntime::kCudaExecutionProvider, DefaultCudaExecutionProvider()));
-
   ASSERT_STATUS_OK(execution_providers.Add(onnxruntime::kCpuExecutionProvider, std::make_unique<CPUExecutionProvider>(CPUExecutionProviderInfo())));
   KernelRegistryManager test_registry_manager;
@@ -446,11 +425,7 @@ TEST(TransformerTest, MemcpyTransformerTestImplicitInputConsumedOnMultipleDevice
   KernelRegistryManager kernel_registry_manager;
   ExecutionProviders execution_providers;
-#if defined(USE_CUDA) && defined(USE_DML)
-  SKIP_CUDA_TEST_WITH_DML;
-#endif
   ASSERT_STATUS_OK(execution_providers.Add(onnxruntime::kCudaExecutionProvider, DefaultCudaExecutionProvider()));
-
   ASSERT_STATUS_OK(execution_providers.Add(onnxruntime::kCpuExecutionProvider, std::make_unique<CPUExecutionProvider>(CPUExecutionProviderInfo())));
   KernelRegistryManager test_registry_manager;
diff --git a/onnxruntime/test/framework/sparse_kernels_test.cc b/onnxruntime/test/framework/sparse_kernels_test.cc
index db9592c293fd0..7bd6b47f52b7d 100644
--- a/onnxruntime/test/framework/sparse_kernels_test.cc
+++ b/onnxruntime/test/framework/sparse_kernels_test.cc
@@ -1457,9 +1457,6 @@ TEST(SparseTensorConversionTests, CsrConversion) {
 #ifdef USE_CUDA
   auto cuda_provider = DefaultCudaExecutionProvider();
-  if (cuda_provider == nullptr) {
-    return;
-  }
   auto cuda_allocator = cuda_provider->CreatePreferredAllocators()[0];
   {
     auto cuda_transfer = cuda_provider->GetDataTransfer();
@@ -1687,9 +1684,6 @@ TEST(SparseTensorConversionTests, CooConversion) {
 #ifdef USE_CUDA
   auto cuda_provider = DefaultCudaExecutionProvider();
-  if (cuda_provider == nullptr) {
-    return;
-  }
   auto cuda_allocator = cuda_provider->CreatePreferredAllocators()[0];
   {
     auto cuda_transfer = cuda_provider->GetDataTransfer();
diff --git a/onnxruntime/test/lora/lora_test.cc b/onnxruntime/test/lora/lora_test.cc
index 9d8febb453739..e8291a36447ca 100644
--- a/onnxruntime/test/lora/lora_test.cc
+++ b/onnxruntime/test/lora/lora_test.cc
@@ -201,16 +201,6 @@ TEST(LoraAdapterTest, Load) {
 
 #ifdef USE_CUDA
 TEST(LoraAdapterTest, VerifyDeviceCopy) {
-  // These checks for CUDA/DML combined Package, Be careful when you want to remove it!
-  if (DefaultCudaExecutionProvider() == nullptr) {
-    GTEST_SKIP() << "Skip This Test Due to this EP is null";
-  }
-#ifdef USE_DML
-  if (DefaultDmlExecutionProvider() != nullptr) {
-    GTEST_FAIL() << "It should not run with DML EP";
-  }
-#endif
-
   auto cpu_ep = DefaultCpuExecutionProvider();
   auto cpu_allocator = cpu_ep->CreatePreferredAllocators()[0];
   auto cuda_ep = DefaultCudaExecutionProvider();
diff --git a/onnxruntime/test/providers/base_tester.cc b/onnxruntime/test/providers/base_tester.cc
index b0958e05dc373..aa68f68f3e735 100644
--- a/onnxruntime/test/providers/base_tester.cc
+++ b/onnxruntime/test/providers/base_tester.cc
@@ -532,17 +532,6 @@ void BaseTester::Run(ExpectResult expect_result, const std::string& expected_fai
   so.use_deterministic_compute = use_determinism_;
   so.graph_optimization_level = TransformerLevel::Default;  // 'Default' == off
 
-  // remove nullptr in execution_providers.
-  // it's a little ugly but we need to do this because DefaultXXXExecutionProvider() can return nullptr in Runtime.
-  // And there're many places adding DefaultXXXExecutionProvider() to execution_providers directly.
-  if (execution_providers != nullptr) {
-    execution_providers->erase(std::remove(execution_providers->begin(), execution_providers->end(), nullptr), execution_providers->end());
-    if (execution_providers->size() == 0) {
-      // In fact, no ep is needed to run
-      return;
-    }
-  }
-
   Run(so, expect_result, expected_failure_string, excluded_provider_types, run_options, execution_providers, options);
 }
diff --git a/onnxruntime/test/providers/compare_provider_test_utils.cc b/onnxruntime/test/providers/compare_provider_test_utils.cc
index 9acb37c24ddd0..386a5656d8a01 100644
--- a/onnxruntime/test/providers/compare_provider_test_utils.cc
+++ b/onnxruntime/test/providers/compare_provider_test_utils.cc
@@ -53,11 +53,6 @@ void CompareOpTester::CompareWithCPU(const std::string& target_provider_type,
   SetTestFunctionCalled();
 
   std::unique_ptr<IExecutionProvider> target_execution_provider = GetExecutionProvider(target_provider_type);
-#if defined(USE_CUDA) && defined(USE_DML)
-  if (target_execution_provider == nullptr) {
-    return;
-  }
-#endif
   ASSERT_TRUE(target_execution_provider != nullptr) << "provider_type " << target_provider_type
                                                     << " is not supported.";
diff --git a/onnxruntime/test/providers/cpu/model_tests.cc b/onnxruntime/test/providers/cpu/model_tests.cc
index b46c253fb8ed9..e3c86a137484f 100644
--- a/onnxruntime/test/providers/cpu/model_tests.cc
+++ b/onnxruntime/test/providers/cpu/model_tests.cc
@@ -491,18 +491,6 @@ ::std::vector<::std::basic_string<ORTCHAR_T>> GetParameterStrings() {
   // the number of times these are run to reduce the CI time.
   provider_names.erase(provider_name_cpu);
 #endif
-
-#if defined(USE_CUDA) && defined(USE_DML)
-  const std::string no_cuda_ep_test = Env::Default().GetEnvironmentVar("NO_CUDA_TEST");
-  if (no_cuda_ep_test == "1") {
-    provider_names.erase(provider_name_cuda);
-  }
-  const std::string no_dml_ep_test = Env::Default().GetEnvironmentVar("NO_DML_TEST");
-  if (no_dml_ep_test == "1") {
-    provider_names.erase(provider_name_dml);
-  }
-#endif
-
   std::vector<std::basic_string<ORTCHAR_T>> v;
   // Permanently exclude following tests because ORT support only opset starting from 7,
   // Please make no more changes to the list
diff --git a/onnxruntime/test/providers/cpu/tensor/gather_op_test.cc b/onnxruntime/test/providers/cpu/tensor/gather_op_test.cc
index 0f23e4c39d7e2..be79a6d29d539 100644
--- a/onnxruntime/test/providers/cpu/tensor/gather_op_test.cc
+++ b/onnxruntime/test/providers/cpu/tensor/gather_op_test.cc
@@ -3,9 +3,6 @@
 #include "core/session/onnxruntime_session_options_config_keys.h"
 #include "gtest/gtest.h"
-#if USE_CUDA
-#include "test/common/cuda_op_test_utils.h"
-#endif
 #include "test/providers/provider_test_utils.h"
 #include "test/util/include/default_providers.h"
@@ -125,9 +122,6 @@ TEST(GatherOpTest, Gather_invalid_index_gpu) {
                          4.0f, 5.0f, 6.0f, 7.0f,
                          0.0f, 0.0f, 0.0f, 0.0f});
-#if defined(USE_CUDA) && defined(USE_DML)
-  SKIP_CUDA_TEST_WITH_DML;
-#endif
   // On GPU, just set the value to 0 instead of report error. exclude all other providers
   test
 #if defined(USE_CUDA)
diff --git a/onnxruntime/test/providers/cpu/tensor/grid_sample_test.cc b/onnxruntime/test/providers/cpu/tensor/grid_sample_test.cc
index 7e1a2384d7fc6..05cfb5c13d689 100644
--- a/onnxruntime/test/providers/cpu/tensor/grid_sample_test.cc
+++ b/onnxruntime/test/providers/cpu/tensor/grid_sample_test.cc
@@ -15,13 +15,11 @@ std::vector<std::unique_ptr<IExecutionProvider>> GetExecutionProviders(int opset
   execution_providers.emplace_back(DefaultCpuExecutionProvider());
 #ifdef USE_CUDA
-  if (DefaultCudaExecutionProvider() != nullptr) {
-    if (opset_version < 20) {
-      execution_providers.emplace_back(DefaultCudaExecutionProvider());
+  if (opset_version < 20) {
+    execution_providers.emplace_back(DefaultCudaExecutionProvider());
 #ifdef ENABLE_CUDA_NHWC_OPS
-      execution_providers.push_back(DefaultCudaNHWCExecutionProvider());
+    execution_providers.push_back(DefaultCudaNHWCExecutionProvider());
 #endif
-    }
   }
 #endif
diff --git a/onnxruntime/test/providers/cuda/cuda_provider_test.cc b/onnxruntime/test/providers/cuda/cuda_provider_test.cc
index e745e1bcb8171..e57cdd2350fab 100644
--- a/onnxruntime/test/providers/cuda/cuda_provider_test.cc
+++ b/onnxruntime/test/providers/cuda/cuda_provider_test.cc
@@ -11,7 +11,7 @@ ProviderInfo_CUDA& GetProviderInfo_CUDA_Test();
 namespace test {
 namespace cuda {
 
-TEST(CudaEpUnittest, All) {
+TEST(CUDA_EP_Unittest, All) {
   onnxruntime::ProviderInfo_CUDA& ep = onnxruntime::GetProviderInfo_CUDA_Test();
   ep.TestAll();
 }
diff --git a/onnxruntime/test/providers/cuda/test_cases/allocator_cuda_test.cc b/onnxruntime/test/providers/cuda/test_cases/allocator_cuda_test.cc
index ec7c6ec4e1605..b413d04fe81e8 100644
--- a/onnxruntime/test/providers/cuda/test_cases/allocator_cuda_test.cc
+++ b/onnxruntime/test/providers/cuda/test_cases/allocator_cuda_test.cc
@@ -11,7 +11,7 @@ namespace onnxruntime {
 namespace test {
 
-TEST(CudaEpAllocatorTest, CUDAAllocatorTest) {
+TEST(AllocatorTest, CUDAAllocatorTest) {
   OrtDevice::DeviceId cuda_device_id = 0;
 
   // ensure CUDA device is available.
@@ -77,7+77,7 @@ TEST(CudaEpAllocatorTest, CUDAAllocatorTest) {
 }
 
 // test that we fallback to smaller allocations if the growth of the arena exceeds the available memory
-TEST(CudaEpAllocatorTest, CUDAAllocatorFallbackTest) {
+TEST(AllocatorTest, CUDAAllocatorFallbackTest) {
   OrtDevice::DeviceId cuda_device_id = 0;
 
   size_t free = 0;
diff --git a/onnxruntime/test/providers/cuda/test_cases/attention_kernel_options_test.cc b/onnxruntime/test/providers/cuda/test_cases/attention_kernel_options_test.cc
index ccdc56de5937d..b2e986f680763 100644
--- a/onnxruntime/test/providers/cuda/test_cases/attention_kernel_options_test.cc
+++ b/onnxruntime/test/providers/cuda/test_cases/attention_kernel_options_test.cc
@@ -17,7 +17,7 @@ using onnxruntime::contrib::attention::AttentionBackend;
 namespace onnxruntime {
 namespace test {
 
-TEST(CudaEpAttentionKernelOptionsTest, NonZeroValue) {
+TEST(AttentionKernelOptionsTest, NonZeroValue) {
   {
     AttentionKernelOptions options;
     int value = static_cast<int>(AttentionBackend::FLASH_ATTENTION) | static_cast<int>(AttentionBackend::EFFICIENT_ATTENTION);
@@ -156,7 +156,7 @@ TEST(CudaEpAttentionKernelOptionsTest, NonZeroValue) {
 }
 
 // Test all environment variables take effect when option value is 0.
-TEST(CudaEpAttentionKernelOptionsTest, DefaultOptionWithEnvVar) {
+TEST(AttentionKernelOptionsTest, DefaultOptionWithEnvVar) {
   constexpr int value = 0;
   ScopedEnvironmentVariables scoped_env_vars{
       EnvVarMap{
@@ -186,7 +186,7 @@ TEST(CudaEpAttentionKernelOptionsTest, DefaultOptionWithEnvVar) {
 }
 
 // Test default min sequence lengths when environment variables are not set.
-TEST(CudaEpAttentionKernelOptionsTest, DefaultMinSeqLens) {
+TEST(AttentionKernelOptionsTest, DefaultMinSeqLens) {
   constexpr int value = 0;
   ScopedEnvironmentVariables scoped_env_vars{
       EnvVarMap{
diff --git a/onnxruntime/test/providers/cuda/test_cases/beam_search_topk.cc b/onnxruntime/test/providers/cuda/test_cases/beam_search_topk.cc
index 97d50398a5550..a0d115c41c14b 100644
--- a/onnxruntime/test/providers/cuda/test_cases/beam_search_topk.cc
+++ b/onnxruntime/test/providers/cuda/test_cases/beam_search_topk.cc
@@ -68,7 +68,7 @@ void ComputeTopKReference(const std::vector<float>& values,
   }
 }
 
-TEST(CudaEpTestBeamSearch, TopK) {
+TEST(TestBeamSearch, TopK) {
   int32_t batch_size = 4;
   int32_t beam_size = 4;
   int32_t vocab_size = 50257;
diff --git a/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80_test.cc b/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80_test.cc
index d8fb3c8256012..3fcb9045ee7e6 100644
--- a/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80_test.cc
+++ b/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80_test.cc
@@ -230,7 +230,7 @@ void testPrepack(int rows, int columns) {
 }
 
 // TODO: code runs on CPU, but this is for sm80 only, maybe enable only when test on sm80
-TEST(CudaEpBlkQ4_GEMM, PrepackSm80Test) {
+TEST(BlkQ4_GEMM, PrepackSm80Test) {
   Status status = onnxruntime::cuda::test::sm80_supported();
   if (!status.IsOK()) {
     // skip the test if sm80 is not supported
@@ -263,7 +263,7 @@ TEST(CudaEpBlkQ4_GEMM, PrepackSm80Test) {
   testPrepack(256, 256);
 }
 
-TEST(CudaEpBlkQ4_GEMM, Sm80RowBlockingTest) {
+TEST(BlkQ4_GEMM, Sm80RowBlockingTest) {
   Status status = onnxruntime::cuda::test::sm80_supported();
   if (!status.IsOK()) {
     // skip the test if sm80 is not supported
@@ -292,7 +292,7 @@ TEST(CudaEpBlkQ4_GEMM, Sm80RowBlockingTest) {
   onnxruntime::cuda::test::run_blkq4_gemm<64, false, false, true>(256, 1024, 576);
 }
 
-TEST(CudaEpBlkQ4_GEMM, Sm80ColBlockingTest) {
+TEST(BlkQ4_GEMM, Sm80ColBlockingTest) {
   Status status = onnxruntime::cuda::test::sm80_supported();
   if (!status.IsOK()) {
     // skip the test if sm80 is not supported
@@ -305,7 +305,7 @@ TEST(CudaEpBlkQ4_GEMM, Sm80ColBlockingTest) {
   onnxruntime::cuda::test::run_blkq4_gemm<64, true, false, true>(256, 1024, 576);
 }
 
-TEST(CudaEpBlkQ4_GEMM, Sm80SmallMTest) {
+TEST(BlkQ4_GEMM, Sm80SmallMTest) {
   Status status = onnxruntime::cuda::test::sm80_supported();
   if (!status.IsOK()) {
     // skip the test if sm80 is not supported
@@ -326,7 +326,7 @@ TEST(CudaEpBlkQ4_GEMM, Sm80SmallMTest) {
   onnxruntime::cuda::test::run_blkq4_gemm<64, true, true, true>(16, 1024, 576);
 }
 
-TEST(CudaEpBlkQ4_GEMM, Sm80SmallTileKernelTest) {
+TEST(BlkQ4_GEMM, Sm80SmallTileKernelTest) {
   Status status = onnxruntime::cuda::test::sm80_supported();
   if (!status.IsOK()) {
     // skip the test if sm80 is not supported
diff --git a/onnxruntime/test/providers/cuda/test_cases/cuda_execution_provider_test.cc b/onnxruntime/test/providers/cuda/test_cases/cuda_execution_provider_test.cc
index f3222c6f683b5..72357ec7e02d2 100644
--- a/onnxruntime/test/providers/cuda/test_cases/cuda_execution_provider_test.cc
+++ b/onnxruntime/test/providers/cuda/test_cases/cuda_execution_provider_test.cc
@@ -19,7 +19,7 @@ namespace cuda {
 namespace test {
 
 // TODO: Since the "DeferredRelease" has been migrated to CudaStream class,
 // we should migrate this test from CudaEP unit test to CudaStream unit test.
-TEST(CudaEpTestDeferredRelease, WithArena) {
+TEST(TestDeferredRelease, WithArena) {
   // Create CUDA EP.
   CUDAExecutionProviderInfo info;
   CUDAExecutionProvider ep(info);
@@ -52,7 +52,7 @@ TEST(CudaEpTestDeferredRelease, WithArena) {
   ORT_THROW_IF_ERROR(ep.OnRunEnd(true, run_opts));
 }
 
-TEST(CudaEpTestDeferredRelease, WithoutArena) {
+TEST(TestDeferredRelease, WithoutArena) {
   // Create CUDA EP.
   CUDAExecutionProviderInfo info;
   CUDAExecutionProvider ep(info);
diff --git a/onnxruntime/test/providers/cuda/test_cases/cuda_utils_test.cc b/onnxruntime/test/providers/cuda/test_cases/cuda_utils_test.cc
index 3538c7add94d0..7468a5718425e 100644
--- a/onnxruntime/test/providers/cuda/test_cases/cuda_utils_test.cc
+++ b/onnxruntime/test/providers/cuda/test_cases/cuda_utils_test.cc
@@ -40,7 +40,7 @@ void TestFillCorrectness(size_t num_elements, TElement value) {
 }
 }  // namespace
 
-TEST(CudaEpUnittest, FillCorrectness) {
+TEST(CudaUtilsTest, FillCorrectness) {
   TestFillCorrectness(1 << 20, 1);
   TestFillCorrectness(1 << 20, 2);
   TestFillCorrectness(1 << 20, 3);
diff --git a/onnxruntime/test/providers/cuda/test_cases/gemm_options_test.cc b/onnxruntime/test/providers/cuda/test_cases/gemm_options_test.cc
index 518fde5804b23..6636e15040393 100644
--- a/onnxruntime/test/providers/cuda/test_cases/gemm_options_test.cc
+++ b/onnxruntime/test/providers/cuda/test_cases/gemm_options_test.cc
@@ -10,7 +10,7 @@ namespace onnxruntime {
 namespace cuda {
 namespace test {
 
-TEST(CudaEpGemmOptions, TestDefaultOptions) {
+TEST(CudaGemmOptions, TestDefaultOptions) {
   HalfGemmOptions gemm_options;
   ASSERT_FALSE(gemm_options.IsCompute16F());
 #if defined(USE_CUDA)
@@ -22,7 +22,7 @@ TEST(CudaEpGemmOptions, TestDefaultOptions) {
 #endif
 }
 
-TEST(CudaEpGemmOptions, TestCompute16F) {
+TEST(CudaGemmOptions, TestCompute16F) {
   HalfGemmOptions gemm_options;
   gemm_options.Initialize(1);
   ASSERT_TRUE(gemm_options.IsCompute16F());
@@ -35,7 +35,7 @@ TEST(CudaEpGemmOptions, TestCompute16F) {
 #endif
 }
 
-TEST(CudaEpGemmOptions, NoReducedPrecision) {
+TEST(CudaGemmOptions, NoReducedPrecision) {
   HalfGemmOptions gemm_options;
   gemm_options.Initialize(2);
   ASSERT_FALSE(gemm_options.IsCompute16F());
@@ -48,7 +48,7 @@ TEST(CudaEpGemmOptions, NoReducedPrecision) {
 #endif
 }
 
-TEST(CudaEpGemmOptions, Pedantic) {
+TEST(CudaGemmOptions, Pedantic) {
   HalfGemmOptions gemm_options;
   gemm_options.Initialize(4);
   ASSERT_FALSE(gemm_options.IsCompute16F());
@@ -61,7 +61,7 @@ TEST(CudaEpGemmOptions, Pedantic) {
 #endif
 }
 
-TEST(CudaEpGemmOptions, Compute16F_Pedantic) {
+TEST(CudaGemmOptions, Compute16F_Pedantic) {
   HalfGemmOptions gemm_options;
   gemm_options.Initialize(5);
   ASSERT_TRUE(gemm_options.IsCompute16F());
@@ -74,7 +74,7 @@ TEST(CudaEpGemmOptions, Compute16F_Pedantic) {
 #endif
 }
 
-TEST(CudaEpGemmOptions, Compute16F_NoReducedPrecision) {
+TEST(CudaGemmOptions, Compute16F_NoReducedPrecision) {
   HalfGemmOptions gemm_options;
   gemm_options.Initialize(3);
   ASSERT_TRUE(gemm_options.IsCompute16F());
diff --git a/onnxruntime/test/providers/cuda/test_cases/greedy_search_top_one.cc b/onnxruntime/test/providers/cuda/test_cases/greedy_search_top_one.cc
index ba24cf858e80f..6b8cd68de0fca 100644
--- a/onnxruntime/test/providers/cuda/test_cases/greedy_search_top_one.cc
+++ b/onnxruntime/test/providers/cuda/test_cases/greedy_search_top_one.cc
@@ -41,7 +41,7 @@ void ComputeTop1Reference(const std::vector<float>& values,
   }
 }
 
-TEST(CudaEpTestGreedySearch, TopOne) {
+TEST(TestGreedySearch, TopOne) {
   int32_t batch_size = 4;
   int32_t vocab_size = 50257;
   int32_t batch_x_vocab = batch_size * vocab_size;
diff --git a/onnxruntime/test/providers/cuda/test_cases/reduction_functions_test.cc b/onnxruntime/test/providers/cuda/test_cases/reduction_functions_test.cc
index 09c9c1e5f8f6a..ec7e98528504e 100644
--- a/onnxruntime/test/providers/cuda/test_cases/reduction_functions_test.cc
+++ b/onnxruntime/test/providers/cuda/test_cases/reduction_functions_test.cc
@@ -179,7 +179,7 @@ void TestReduceColumnsToColumn(int m, int n, float relative_error_tolerance = 1e
 }
 }  // namespace
 
-TEST(CudaEpReductionFunctionsTest, ReduceRowToScalar) {
+TEST(ReductionFunctionsTest, ReduceRowToScalar) {
  TestReduceRowToScalarApis(3);
  TestReduceRowToScalarApis(19);
  TestReduceRowToScalarApis(123);
@@ -188,7 +188,7 @@ TEST(CudaEpReductionFunctionsTest, ReduceRowToScalar) {
   TestReduceRowToScalarApis(941736, 2e-4f);
 }
 
-TEST(CudaEpReductionFunctionsTest, ReduceRowsToRow) {
+TEST(ReductionFunctionsTest, ReduceRowsToRow) {
   for (int m : {3, 193, 2945}) {
     for (int n : {3, 193, 2945}) {
       TestReduceRowsToRow(m, n, true);
@@ -197,7 +197,7 @@ TEST(CudaEpReductionFunctionsTest, ReduceRowsToRow) {
     }
   }
 }
 
-TEST(CudaEpReductionFunctionsTest, ReduceColumnsToColumn) {
+TEST(ReductionFunctionsTest, ReduceColumnsToColumn) {
   for (int m : {3, 193, 2945}) {
     for (int n : {3, 193, 2945}) {
       TestReduceColumnsToColumn(m, n);
@@ -205,7 +205,7 @@ TEST(CudaEpReductionFunctionsTest, ReduceColumnsToColumn) {
   }
 }
 
-TEST(CudaEpReductionFunctionsTest, BufferOffsets) {
+TEST(ReductionFunctionsTest, BufferOffsets) {
   const int m = 2048;
   const int n = 1024;
   const TensorShape shape{m, n};
@@ -240,7 +240,7 @@ TEST(CudaEpReductionFunctionsTest, BufferOffsets) {
   }
 }
 
-TEST(CudaEpReductionFunctionsTest, InvalidBufferSize) {
+TEST(ReductionFunctionsTest, InvalidBufferSize) {
   const int m = 2048;
   const int n = 1024;
   const TensorShape shape{m, n};
@@ -262,7 +262,7 @@ TEST(CudaEpReductionFunctionsTest, InvalidBufferSize) {
   ASSERT_FALSE(status.IsOK());
 }
 
-TEST(CudaEpReductionFunctionsTest, GetApplicableMatrixReduction) {
+TEST(ReductionFunctionsTest, GetApplicableMatrixReduction) {
   auto test_get_applicable_matrix_reduction =
       [](cudnnReduceTensorOp_t cudnn_op,
          const std::vector<int64_t>& dims, const std::vector<int64_t>& axes,
diff --git a/onnxruntime/test/python/onnx_backend_test_series.py b/onnxruntime/test/python/onnx_backend_test_series.py
index a274b90dc042f..8fc76da3495a8 100644
--- a/onnxruntime/test/python/onnx_backend_test_series.py
+++ b/onnxruntime/test/python/onnx_backend_test_series.py
@@ -105,7 +105,7 @@ def load_jsonc(basename: str):
     return json.loads("\n".join(lines))
 
 
-def create_backend_test(devices: list[str], test_name=None):
+def create_backend_test(test_name=None):
     """Creates an OrtBackendTest and adds its TestCase's to global scope so unittest will find them."""
 
     overrides = load_jsonc("onnx_backend_test_series_overrides.jsonc")
@@ -126,29 +126,30 @@ def create_backend_test(devices: list[str], test_name=None):
     else:
         filters = load_jsonc("onnx_backend_test_series_filters.jsonc")
         current_failing_tests = apply_filters(filters, "current_failing_tests")
+
         if platform.architecture()[0] == "32bit":
             current_failing_tests += apply_filters(filters, "current_failing_tests_x86")
 
-        if backend.supports_device("DNNL") or "DNNL" in devices:
+        if backend.supports_device("DNNL"):
current_failing_tests += apply_filters(filters, "current_failing_tests_DNNL") - if backend.supports_device("NNAPI") or "NNAPI" in devices: + if backend.supports_device("NNAPI"): current_failing_tests += apply_filters(filters, "current_failing_tests_NNAPI") - if backend.supports_device("OPENVINO_GPU") or "OPENVINO_GPU" in devices: + if backend.supports_device("OPENVINO_GPU"): current_failing_tests += apply_filters(filters, "current_failing_tests_OPENVINO_GPU") - if backend.supports_device("OPENVINO_CPU") or "OPENVINO_CPU" in devices: + if backend.supports_device("OPENVINO_CPU"): current_failing_tests += apply_filters(filters, "current_failing_tests_OPENVINO_CPU_FP32") current_failing_tests += apply_filters(filters, "current_failing_tests_OPENVINO_CPU_FP16") - if backend.supports_device("OPENVINO_NPU") or "OPENVINO_NPU" in devices: + if backend.supports_device("OPENVINO_NPU"): current_failing_tests += apply_filters(filters, "current_failing_tests_OPENVINO_NPU") - if backend.supports_device("OPENVINO") or "OPENVINO" in devices: + if backend.supports_device("OPENVINO"): current_failing_tests += apply_filters(filters, "current_failing_tests_OPENVINO_opset18") - if backend.supports_device("MIGRAPHX") or "MIGRAPHX" in devices: + if backend.supports_device("MIGRAPHX"): current_failing_tests += apply_filters(filters, "current_failing_tests_MIGRAPHX") if backend.supports_device("WEBGPU"): @@ -157,16 +158,8 @@ def create_backend_test(devices: list[str], test_name=None): # Skip these tests for a "pure" DML onnxruntime python wheel. We keep these tests enabled for instances where both DML and CUDA # EPs are available (Windows GPU CI pipeline has this config) - these test will pass because CUDA has higher precedence than DML # and the nodes are assigned to only the CUDA EP (which supports these tests) - if (backend.supports_device("DML") and not backend.supports_device("GPU")) or "DML" in devices: + if backend.supports_device("DML") and not backend.supports_device("GPU"): current_failing_tests += apply_filters(filters, "current_failing_tests_pure_DML") - # exclude CUDA EP when DML test is running. - os.environ["ORT_ONNX_BACKEND_EXCLUDE_PROVIDERS"] = "TensorrtExecutionProvider,CUDAExecutionProvider" - elif backend.supports_device("DML") and "DML" not in devices: - # exclude DML EP when CUDA test is running. - os.environ["ORT_ONNX_BACKEND_EXCLUDE_PROVIDERS"] = "TensorrtExecutionProvider,DmlExecutionProvider" - else: - # exclude TRT EP temporarily and only test CUDA EP to retain previous behavior - os.environ["ORT_ONNX_BACKEND_EXCLUDE_PROVIDERS"] = "TensorrtExecutionProvider" filters = ( current_failing_tests @@ -179,6 +172,9 @@ def create_backend_test(devices: list[str], test_name=None): backend_test.exclude("(" + "|".join(filters) + ")") print("excluded tests:", filters) + # exclude TRT EP temporarily and only test CUDA EP to retain previous behavior + os.environ["ORT_ONNX_BACKEND_EXCLUDE_PROVIDERS"] = "TensorrtExecutionProvider" + # import all test cases at global scope to make # them visible to python.unittest. globals().update(backend_test.enable_report().test_cases) @@ -203,15 +199,6 @@ def parse_args(): help="Only run tests that match this value. 
         help="Only run tests that match this value. Matching is regex based, and '.*' is automatically appended",
     )
 
-    parser.add_argument(
-        "--devices",
-        type=str,
-        choices=["CPU", "CUDA", "MIGRAPHX", "DNNL", "DML", "OPENVINO_GPU", "OPENVINO_CPU", "OPENVINO_NPU", "OPENVINO"],
-        nargs="+",  # allows multiple values
-        default=["CPU"],  # default to ["CPU"] if no input is given
-        help="Select one or more devices CPU, CUDA, MIGRAPHX, DNNL, DML, OPENVINO_GPU, OPENVINO_CPU, OPENVINO_NPU, OPENVINO",
-    )
-
     # parse just our args. python unittest has its own args and arg parsing, and that runs inside unittest.main()
     parsed, unknown = parser.parse_known_args()
     sys.argv = sys.argv[:1] + unknown
@@ -222,5 +209,5 @@ def parse_args():
 
 if __name__ == "__main__":
     args = parse_args()
-    create_backend_test(args.devices, args.test_name)
+    create_backend_test(args.test_name)
     unittest.main()
diff --git a/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc b/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc
index 7ecaab6fedb02..f083ab14ad133 100644
--- a/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc
+++ b/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc
@@ -750,13 +750,6 @@
     "^test_reduce_log_sum_empty_set_cpu",
     "^test_reduce_log_sum_exp_empty_set_cpu",
     "^test_reduce_prod_empty_set_cpu",
-    // Bug: DML EP some how executes these CUDA tests and failed
-    // TODO: Remove these tests when DML EP is fixed
-    "^test_convtranspose_autopad_same_cuda",
-    "^test_asin_example_cuda",
-    "^test_dynamicquantizelinear_cuda",
-    "^test_dynamicquantizelinear_expanded_cuda",
-    "^test_reduce_min_empty_set_cuda",
     //Bug: DML EP does not execute operators with an empty input tensor
     //TODO: Resolve as a graph implementation that returns a constant inf tensor with appropriate strides
     "^test_reduce_min_empty_set_cpu"
diff --git a/onnxruntime/test/util/default_providers.cc b/onnxruntime/test/util/default_providers.cc
index 59926bbcd1c6f..c1564997c42b8 100644
--- a/onnxruntime/test/util/default_providers.cc
+++ b/onnxruntime/test/util/default_providers.cc
@@ -122,12 +122,6 @@ std::unique_ptr<IExecutionProvider> DefaultOpenVINOExecutionProvider() {
 
 std::unique_ptr<IExecutionProvider> DefaultCudaExecutionProvider() {
 #ifdef USE_CUDA
-#ifdef USE_DML
-  const std::string no_cuda_ep_test = Env::Default().GetEnvironmentVar("NO_CUDA_TEST");
-  if (no_cuda_ep_test == "1") {
-    return nullptr;
-  }
-#endif
   OrtCUDAProviderOptionsV2 provider_options{};
   provider_options.do_copy_in_default_stream = true;
   provider_options.use_tf32 = false;
@@ -140,12 +134,6 @@ std::unique_ptr<IExecutionProvider> DefaultCudaExecutionProvider() {
 #ifdef ENABLE_CUDA_NHWC_OPS
 std::unique_ptr<IExecutionProvider> DefaultCudaNHWCExecutionProvider() {
 #if defined(USE_CUDA)
-#ifdef USE_DML
-  const std::string no_cuda_ep_test = Env::Default().GetEnvironmentVar("NO_CUDA_TEST");
-  if (no_cuda_ep_test == "1") {
-    return nullptr;
-  }
-#endif
   OrtCUDAProviderOptionsV2 provider_options{};
   provider_options.do_copy_in_default_stream = true;
   provider_options.use_tf32 = false;
@@ -332,12 +320,6 @@ std::unique_ptr<IExecutionProvider> DefaultCannExecutionProvider() {
 
 std::unique_ptr<IExecutionProvider> DefaultDmlExecutionProvider() {
 #ifdef USE_DML
-#ifdef USE_CUDA
-  const std::string no_dml_ep_test = Env::Default().GetEnvironmentVar("NO_DML_TEST");
-  if (no_dml_ep_test == "1") {
-    return nullptr;
-  }
-#endif
   ConfigOptions config_options{};
   if (auto factory = DMLProviderFactoryCreator::CreateFromDeviceOptions(config_options, nullptr, false, false)) {
     return factory->CreateProvider();
   }
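With the `NO_CUDA_TEST`/`NO_DML_TEST` escape hatches gone, these factories no longer return nullptr because of an environment variable, so CUDA builds can register the provider unconditionally. A sketch of the call pattern the rest of this diff reverts to (illustrative only; the helper name is hypothetical):

```cpp
// Illustrative helper mirroring the pattern used throughout the tests above;
// not part of this patch.
#include <memory>
#include <vector>
#include "test/util/include/default_providers.h"

std::vector<std::unique_ptr<onnxruntime::IExecutionProvider>> CollectTestProviders() {
  std::vector<std::unique_ptr<onnxruntime::IExecutionProvider>> eps;
#ifdef USE_CUDA
  // The environment-variable nullptr check is no longer needed here.
  eps.push_back(onnxruntime::test::DefaultCudaExecutionProvider());
#endif
  eps.push_back(onnxruntime::test::DefaultCpuExecutionProvider());
  return eps;
}
```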
diff --git a/tools/ci_build/github/azure-pipelines/stages/jobs/steps/py_packaging_test_step.yml b/tools/ci_build/github/azure-pipelines/stages/jobs/steps/py_packaging_test_step.yml
deleted file mode 100644
index 9a721c65de332..0000000000000
--- a/tools/ci_build/github/azure-pipelines/stages/jobs/steps/py_packaging_test_step.yml
+++ /dev/null
@@ -1,21 +0,0 @@
-parameters:
-- name: EP_NAME
-  type: string
-  default: CPU
-
-- name: PYTHON_VERSION
-  type: string
-
-steps:
-- powershell: |
-    python -m pip uninstall -y onnxruntime onnxruntime-gpu -qq
-    Get-ChildItem -Path $(Build.ArtifactStagingDirectory)/*cp${{ replace(parameters.PYTHON_VERSION,'.','') }}*.whl | foreach {pip --disable-pip-version-check install --upgrade $_.fullname tabulate}
-    mkdir -p $(Agent.TempDirectory)\ort_test_data
-    Copy-Item -Path $(Build.sourcesDirectory)/onnxruntime/test/python/onnx_backend_test_series.py -Destination $(Agent.TempDirectory)\ort_test_data
-    Copy-Item -Recurse -Path $(Build.sourcesDirectory)/onnxruntime/test/testdata -Destination $(Agent.TempDirectory)\ort_test_data
-    cd $(Agent.TempDirectory)\ort_test_data
-    python onnx_backend_test_series.py --devices ${{ parameters.EP_NAME }} -v
-    cd $(Agent.TempDirectory)
-    Remove-Item -Path $(Agent.TempDirectory)\ort_test_data -Recurse -Force
-  workingDirectory: '$(Build.sourcesDirectory)'
-  displayName: 'Run Python Tests with ${{ parameters.EP_NAME }} EP'
\ No newline at end of file
diff --git a/tools/ci_build/github/azure-pipelines/stages/nuget-combine-cuda-stage.yml b/tools/ci_build/github/azure-pipelines/stages/nuget-combine-cuda-stage.yml
index 0b3eac0110abc..9c7fbc24ab1b6 100644
--- a/tools/ci_build/github/azure-pipelines/stages/nuget-combine-cuda-stage.yml
+++ b/tools/ci_build/github/azure-pipelines/stages/nuget-combine-cuda-stage.yml
@@ -50,8 +50,6 @@ stages:
       win_trt_home: ${{ parameters.win_trt_home }}
       win_cuda_home: ${{ parameters.win_cuda_home }}
       buildJava: ${{ parameters.buildJava }}
-      SpecificArtifact: ${{ parameters.SpecificArtifact }}
-      BuildId: ${{ parameters.BuildId }}
 
 - template: nuget-cuda-packaging-stage.yml
   parameters:
diff --git a/tools/ci_build/github/azure-pipelines/stages/nuget-win-cuda-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/stages/nuget-win-cuda-packaging-stage.yml
index d6b25c98936f0..445066f08995a 100644
--- a/tools/ci_build/github/azure-pipelines/stages/nuget-win-cuda-packaging-stage.yml
+++ b/tools/ci_build/github/azure-pipelines/stages/nuget-win-cuda-packaging-stage.yml
@@ -34,7 +34,7 @@
   displayName: Specific Artifact's BuildId
   type: string
   default: '0'
-
+
 - name: buildJava
   type: boolean
@@ -50,14 +50,13 @@ stages:
       msbuildPlatform: x64
       packageName: x64-cuda
       CudaVersion: ${{ parameters.CudaVersion }}
-      buildparameter: --use_cuda --cuda_home=${{ parameters.win_cuda_home }} --enable_onnx_tests --enable_wcos --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=60;61;70;75;80" --use_dml --build_csharp --parallel
+      buildparameter: --use_cuda --cuda_home=${{ parameters.win_cuda_home }} --enable_onnx_tests --enable_wcos --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=60;61;70;75;80"
       runTests: ${{ parameters.RunOnnxRuntimeTests }}
       buildJava: ${{ parameters.buildJava }}
       java_artifact_id: onnxruntime_gpu
       UseIncreasedTimeoutForTests: ${{ parameters.UseIncreasedTimeoutForTests }}
       SpecificArtifact: ${{ parameters.SpecificArtifact }}
       BuildId: ${{ parameters.BuildId }}
-      ComboTests: true
 # Windows CUDA with TensorRT Packaging
 - template: ../templates/win-ci.yml
   parameters:
@@ -69,7 +68,7 @@ stages:
       msbuildPlatform: x64
       CudaVersion: ${{ parameters.CudaVersion }}
       packageName: x64-tensorrt
diff --git a/tools/ci_build/github/azure-pipelines/stages/jobs/steps/py_packaging_test_step.yml b/tools/ci_build/github/azure-pipelines/stages/jobs/steps/py_packaging_test_step.yml
deleted file mode 100644
index 9a721c65de332..0000000000000
--- a/tools/ci_build/github/azure-pipelines/stages/jobs/steps/py_packaging_test_step.yml
+++ /dev/null
@@ -1,21 +0,0 @@
-parameters:
-- name: EP_NAME
-  type: string
-  default: CPU
-
-- name: PYTHON_VERSION
-  type: string
-
-steps:
-- powershell: |
-    python -m pip uninstall -y onnxruntime onnxruntime-gpu -qq
-    Get-ChildItem -Path $(Build.ArtifactStagingDirectory)/*cp${{ replace(parameters.PYTHON_VERSION,'.','') }}*.whl | foreach {pip --disable-pip-version-check install --upgrade $_.fullname tabulate}
-    mkdir -p $(Agent.TempDirectory)\ort_test_data
-    Copy-Item -Path $(Build.sourcesDirectory)/onnxruntime/test/python/onnx_backend_test_series.py -Destination $(Agent.TempDirectory)\ort_test_data
-    Copy-Item -Recurse -Path $(Build.sourcesDirectory)/onnxruntime/test/testdata -Destination $(Agent.TempDirectory)\ort_test_data
-    cd $(Agent.TempDirectory)\ort_test_data
-    python onnx_backend_test_series.py --devices ${{ parameters.EP_NAME }} -v
-    cd $(Agent.TempDirectory)
-    Remove-Item -Path $(Agent.TempDirectory)\ort_test_data -Recurse -Force
-  workingDirectory: '$(Build.sourcesDirectory)'
-  displayName: 'Run Python Tests with ${{ parameters.EP_NAME }} EP'
\ No newline at end of file
diff --git a/tools/ci_build/github/azure-pipelines/stages/nuget-combine-cuda-stage.yml b/tools/ci_build/github/azure-pipelines/stages/nuget-combine-cuda-stage.yml
index 0b3eac0110abc..9c7fbc24ab1b6 100644
--- a/tools/ci_build/github/azure-pipelines/stages/nuget-combine-cuda-stage.yml
+++ b/tools/ci_build/github/azure-pipelines/stages/nuget-combine-cuda-stage.yml
@@ -50,8 +50,6 @@ stages:
       win_trt_home: ${{ parameters.win_trt_home }}
       win_cuda_home: ${{ parameters.win_cuda_home }}
       buildJava: ${{ parameters.buildJava }}
-      SpecificArtifact: ${{ parameters.SpecificArtifact }}
-      BuildId: ${{ parameters.BuildId }}

 - template: nuget-cuda-packaging-stage.yml
   parameters:
diff --git a/tools/ci_build/github/azure-pipelines/stages/nuget-win-cuda-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/stages/nuget-win-cuda-packaging-stage.yml
index d6b25c98936f0..445066f08995a 100644
--- a/tools/ci_build/github/azure-pipelines/stages/nuget-win-cuda-packaging-stage.yml
+++ b/tools/ci_build/github/azure-pipelines/stages/nuget-win-cuda-packaging-stage.yml
@@ -34,7 +34,7 @@ parameters:
   displayName: Specific Artifact's BuildId
   type: string
   default: '0'
-  
+

 - name: buildJava
   type: boolean
@@ -50,14 +50,13 @@ stages:
       msbuildPlatform: x64
       packageName: x64-cuda
       CudaVersion: ${{ parameters.CudaVersion }}
-      buildparameter: --use_cuda --cuda_home=${{ parameters.win_cuda_home }} --enable_onnx_tests --enable_wcos --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=60;61;70;75;80" --use_dml --build_csharp --parallel
+      buildparameter: --use_cuda --cuda_home=${{ parameters.win_cuda_home }} --enable_onnx_tests --enable_wcos --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=60;61;70;75;80"
       runTests: ${{ parameters.RunOnnxRuntimeTests }}
       buildJava: ${{ parameters.buildJava }}
       java_artifact_id: onnxruntime_gpu
       UseIncreasedTimeoutForTests: ${{ parameters.UseIncreasedTimeoutForTests }}
       SpecificArtifact: ${{ parameters.SpecificArtifact }}
       BuildId: ${{ parameters.BuildId }}
-      ComboTests: true
 # Windows CUDA with TensorRT Packaging
 - template: ../templates/win-ci.yml
   parameters:
@@ -69,7 +68,7 @@ stages:
       msbuildPlatform: x64
       CudaVersion: ${{ parameters.CudaVersion }}
       packageName: x64-tensorrt
-      buildparameter: --use_tensorrt --tensorrt_home=${{ parameters.win_trt_home }} --cuda_home=${{ parameters.win_cuda_home }} --enable_onnx_tests --enable_wcos --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=60;61;70;75;80" --parallel
+      buildparameter: --use_tensorrt --tensorrt_home=${{ parameters.win_trt_home }} --cuda_home=${{ parameters.win_cuda_home }} --enable_onnx_tests --enable_wcos --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=60;61;70;75;80"
       runTests: ${{ parameters.RunOnnxRuntimeTests }}
       buildJava: ${{ parameters.buildJava }}
       java_artifact_id: onnxruntime_gpu
diff --git a/tools/ci_build/github/azure-pipelines/stages/py-gpu-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/stages/py-gpu-packaging-stage.yml
index f7235e3ad2076..947e4f99b984f 100644
--- a/tools/ci_build/github/azure-pipelines/stages/py-gpu-packaging-stage.yml
+++ b/tools/ci_build/github/azure-pipelines/stages/py-gpu-packaging-stage.yml
@@ -56,7 +56,7 @@ stages:
       PYTHON_VERSION: ${{ python_version }}
       EP_NAME: gpu
       CudaVersion: ${{ parameters.cuda_version }}
-      EP_BUILD_FLAGS: --use_dml --enable_lto --cuda_home=$(Agent.TempDirectory)\v${{ parameters.cuda_version }} --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=52;60;61;70;75;80"
+      EP_BUILD_FLAGS: --enable_lto --cuda_home=$(Agent.TempDirectory)\v${{ parameters.cuda_version }} --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=52;60;61;70;75;80"
       use_tensorrt: True

 - ${{ if eq(parameters.enable_linux_cuda, true) }}:
diff --git a/tools/ci_build/github/azure-pipelines/stages/py-win-gpu-stage.yml b/tools/ci_build/github/azure-pipelines/stages/py-win-gpu-stage.yml
index dd0539f751c89..aa7f2845fc0fa 100644
--- a/tools/ci_build/github/azure-pipelines/stages/py-win-gpu-stage.yml
+++ b/tools/ci_build/github/azure-pipelines/stages/py-win-gpu-stage.yml
@@ -33,7 +33,7 @@ parameters:
     - Release
     - RelWithDebInfo
     - MinSizeRel
-  
+

 - name: use_tensorrt
   type: boolean
   default: false
@@ -134,7 +134,7 @@ stages:
           --cmake_generator "$(VSGenerator)"
           --enable_pybind
           --enable_onnx_tests
-          --parallel 4 --use_binskim_compliant_compile_flags --update --build
+          --parallel --use_binskim_compliant_compile_flags --update --build
           $(TelemetryOption) ${{ parameters.BUILD_PY_PARAMETERS }} ${{ parameters.EP_BUILD_FLAGS }} ${{ variables.trt_build_flag }}
         workingDirectory: '$(Build.BinariesDirectory)'
@@ -206,20 +206,19 @@ stages:
           DownloadTRT: ${{ parameters.use_tensorrt }}

       - task: PowerShell@2
-        displayName: 'Install Third Party Dependencies'
+        displayName: 'Install ONNX'
         inputs:
           filePath: '$(Build.SourcesDirectory)/tools/ci_build/github/windows/install_third_party_deps.ps1'
           workingDirectory: '$(Build.BinariesDirectory)'
           arguments: -cpu_arch x64 -install_prefix $(Build.BinariesDirectory)\${{ parameters.cmake_build_type }}\installed -build_config ${{ parameters.cmake_build_type }}

-      - template: jobs/steps/py_packaging_test_step.yml
-        parameters:
-          EP_NAME: DML
-          PYTHON_VERSION: ${{ parameters.PYTHON_VERSION }}
-
-      - template: jobs/steps/py_packaging_test_step.yml
-        parameters:
-          EP_NAME: CUDA
-          PYTHON_VERSION: ${{ parameters.PYTHON_VERSION }}
-
-
+      - powershell: |
+          python -m pip uninstall -y onnxruntime onnxruntime-gpu -qq
+          Get-ChildItem -Path $(Build.ArtifactStagingDirectory)/*cp${{ replace(parameters.PYTHON_VERSION,'.','') }}*.whl | foreach {pip --disable-pip-version-check install --upgrade $_.fullname tabulate}
+          mkdir -p $(Agent.TempDirectory)\ort_test_data
+          Copy-Item -Path $(Build.sourcesDirectory)/onnxruntime/test/python/onnx_backend_test_series.py -Destination $(Agent.TempDirectory)\ort_test_data
+          Copy-Item -Recurse -Path $(Build.sourcesDirectory)/onnxruntime/test/testdata -Destination $(Agent.TempDirectory)\ort_test_data
+          cd $(Agent.TempDirectory)\ort_test_data
+          python onnx_backend_test_series.py
+        workingDirectory: '$(Build.sourcesDirectory)'
+        displayName: 'Run Python Tests'
diff --git a/tools/ci_build/github/azure-pipelines/templates/jobs/win-ci-vs-2022-job.yml b/tools/ci_build/github/azure-pipelines/templates/jobs/win-ci-vs-2022-job.yml
index 7bdd069de711b..e8f391a73fa7b 100644
--- a/tools/ci_build/github/azure-pipelines/templates/jobs/win-ci-vs-2022-job.yml
+++ b/tools/ci_build/github/azure-pipelines/templates/jobs/win-ci-vs-2022-job.yml
@@ -218,32 +218,16 @@ jobs:
   - powershell: |
      python3 -m pip uninstall -y onnxruntime onnxruntime-gpu onnxruntime-training onnxruntime-directml -qq
      Get-ChildItem -Path dist/*.whl | foreach {pip --disable-pip-version-check install --upgrade $_.fullname}
+    workingDirectory: '$(Build.BinariesDirectory)\${{ parameters.BuildConfig }}\${{ parameters.BuildConfig }}'
     displayName: 'Install onnxruntime wheel'

   - ${{ if eq(parameters.RunOnnxRuntimeTests, true) }}:
-    - ${{ if and(contains(parameters.additionalBuildFlags, 'use_cuda'), contains(parameters.additionalBuildFlags, 'use_dml')) }}:
-      - powershell: |
-          python $(Build.SourcesDirectory)\tools\ci_build\build.py --config ${{ parameters.BuildConfig }} --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --test --cmake_generator "Visual Studio 17 2022" --enable_onnx_tests ${{ parameters.additionalBuildFlags }}
-        workingDirectory: '$(Build.BinariesDirectory)\${{ parameters.BuildConfig }}\${{ parameters.BuildConfig }}'
-        displayName: 'Run tests excluding CUDA tests'
-        env:
-          NO_CUDA_TEST: '1'
-          GTEST_FILTER: '-CudaEp*:CudaNhwcTypedTest*:*cpu_*models*' # Exclude CUDA EP tests under providers/cuda/ and cpu models test
-          PATH: '$(Build.BinariesDirectory)\${{ parameters.BuildConfig }}\${{ parameters.BuildConfig }};$(PATH)' # For onnxruntime4j_test to find dependent dlls
-      - powershell: |
-          python $(Build.SourcesDirectory)\tools\ci_build\build.py --config ${{ parameters.BuildConfig }} --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --test --cmake_generator "Visual Studio 17 2022" --enable_onnx_tests ${{ parameters.additionalBuildFlags }}
-        workingDirectory: '$(Build.BinariesDirectory)\${{ parameters.BuildConfig }}\${{ parameters.BuildConfig }}'
-        displayName: 'Run tests excluding DML tests'
-        env:
-          NO_DML_TEST: '1'
-          GTEST_FILTER: '-*cpu_*models*'
-          PATH: '$(Build.BinariesDirectory)\${{ parameters.BuildConfig }}\${{ parameters.BuildConfig }};$(PATH)'
-    - ${{ else }}:
-      - powershell: |
-          python $(Build.SourcesDirectory)\tools\ci_build\build.py --config ${{ parameters.BuildConfig }} --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --test --cmake_generator "Visual Studio 17 2022" --enable_onnx_tests ${{ parameters.additionalBuildFlags }}
-        workingDirectory: '$(Build.BinariesDirectory)\${{ parameters.BuildConfig }}\${{ parameters.BuildConfig }}'
-        displayName: 'Run tests'
+    - powershell: |
+        python $(Build.SourcesDirectory)\tools\ci_build\build.py --config ${{ parameters.BuildConfig }} --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --test --cmake_generator "Visual Studio 17 2022" --build_shared_lib --enable_onnx_tests ${{ parameters.additionalBuildFlags }}
+
+      workingDirectory: '$(Build.BinariesDirectory)\${{ parameters.BuildConfig }}\${{ parameters.BuildConfig }}'
+      displayName: 'Run tests'

   - ${{ if eq(parameters.GenerateDocumentation, true) }}:
     - task: PythonScript@0
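The deleted steps above split a single test binary between EPs via `GTEST_FILTER`. Googletest filters are colon-separated glob patterns, and everything after a leading `-` is an exclusion. A small Python model of how the removed `-CudaEp*:CudaNhwcTypedTest*:*cpu_*models*` value behaves (`gtest_filter_match` is a hypothetical helper, and this ignores gtest corner cases):

```python
from fnmatch import fnmatchcase

def gtest_filter_match(test_name: str, filter_spec: str) -> bool:
    # GTEST_FILTER grammar (roughly): "pos1:pos2-neg1:neg2"; an empty positive
    # part means "match everything", then negative patterns are subtracted.
    positive, _, negative = filter_spec.partition("-")
    pos = positive.split(":") if positive else ["*"]
    neg = negative.split(":") if negative else []
    return any(fnmatchcase(test_name, p) for p in pos) and not any(
        fnmatchcase(test_name, p) for p in neg
    )

SPEC = "-CudaEp*:CudaNhwcTypedTest*:*cpu_*models*"
assert not gtest_filter_match("CudaEpUnittest.Basic", SPEC)  # excluded suite
assert gtest_filter_match("GemmOpTest.Float", SPEC)          # everything else still runs
```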
diff --git a/tools/ci_build/github/azure-pipelines/templates/win-ci.yml b/tools/ci_build/github/azure-pipelines/templates/win-ci.yml
index e046997b4f49a..59950433b3d40 100644
--- a/tools/ci_build/github/azure-pipelines/templates/win-ci.yml
+++ b/tools/ci_build/github/azure-pipelines/templates/win-ci.yml
@@ -25,7 +25,7 @@ parameters:

 - name: runTests
   type: boolean
-  default: false
+  default: true

 - name: buildJava
   type: boolean
@@ -71,10 +71,6 @@ parameters:
   - 11.8
   - 12.2

-- name: ComboTests
-  type: boolean
-  default: false
-
 - name: SpecificArtifact
   displayName: Use Specific Artifact
   type: boolean
@@ -226,7 +222,7 @@ stages:
         condition: and(succeeded(), eq('${{ parameters.runTests}}', true))
         inputs:
           scriptPath: '$(Build.SourcesDirectory)\tools\ci_build\build.py'
-          arguments: '--config RelWithDebInfo --use_binskim_compliant_compile_flags --enable_lto --disable_rtti --build_dir $(Build.BinariesDirectory) --test --skip_submodule_sync --build_shared_lib --cmake_generator "$(VSGenerator)" --enable_onnx_tests $(TelemetryOption) ${{ parameters.buildparameter }}'
+          arguments: '--config RelWithDebInfo --use_binskim_compliant_compile_flags --enable_lto --disable_rtti --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --test --cmake_generator "$(VSGenerator)" --enable_onnx_tests $(TelemetryOption) ${{ parameters.buildparameter }}'
         workingDirectory: '$(Build.BinariesDirectory)'
     - ${{ else }}:
       - powershell: |
@@ -338,10 +334,6 @@ stages:
         displayName: 'Clean Agent Directories'
         condition: always()

-      - script:
-          echo ${{ parameters.SpecificArtifact }}
-        displayName: 'Print Specific Artifact'
-
       - checkout: self
         clean: true
         submodules: none
@@ -407,35 +399,13 @@ stages:
         displayName: 'Append dotnet x86 Directory to PATH'
         condition: and(succeeded(), eq('${{ parameters.buildArch}}', 'x86'))

-      - ${{ if eq(parameters.ComboTests, 'true') }}:
-        - task: PythonScript@0
-          displayName: 'test excludes CUDA'
-          condition: and(succeeded(), eq('${{ parameters.runTests}}', true))
-          inputs:
-            scriptPath: '$(Build.SourcesDirectory)\tools\ci_build\build.py'
-            arguments: '--config RelWithDebInfo --use_binskim_compliant_compile_flags --enable_lto --disable_rtti --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --test --enable_onnx_tests $(TelemetryOption) '
-            workingDirectory: '$(Build.BinariesDirectory)'
-          env:
-            NO_CUDA_TEST: '1'
-            GTEST_FILTER: '-CudaEp*:CudaNhwcTypedTest*' # Exclude CUDA EP tests under providers/cuda/
-        - task: PythonScript@0
-          displayName: 'test excludes DML'
-          condition: and(succeeded(), eq('${{ parameters.runTests}}', true))
-          inputs:
-            scriptPath: '$(Build.SourcesDirectory)\tools\ci_build\build.py'
-            arguments: '--config RelWithDebInfo --use_binskim_compliant_compile_flags --enable_lto --disable_rtti --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --test --enable_onnx_tests $(TelemetryOption) '
-            workingDirectory: '$(Build.BinariesDirectory)'
-          env:
-            NO_DML_TEST: '1'
-      - ${{ else }}:
-        - task: PythonScript@0
-          displayName: 'test'
-          condition: and(succeeded(), eq('${{ parameters.runTests}}', true))
-          inputs:
-            scriptPath: '$(Build.SourcesDirectory)\tools\ci_build\build.py'
-            arguments: '--config RelWithDebInfo --use_binskim_compliant_compile_flags --enable_lto --disable_rtti --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --test --enable_onnx_tests $(TelemetryOption) '
-            workingDirectory: '$(Build.BinariesDirectory)'
-
+      - task: PythonScript@0
+        displayName: 'test'
+        condition: and(succeeded(), eq('${{ parameters.runTests}}', true))
+        inputs:
+          scriptPath: '$(Build.SourcesDirectory)\tools\ci_build\build.py'
+          arguments: '--config RelWithDebInfo --use_binskim_compliant_compile_flags --enable_lto --disable_rtti --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --test --enable_onnx_tests $(TelemetryOption) '
+          workingDirectory: '$(Build.BinariesDirectory)'

       # Previous stage only assembles the java binaries, testing will be done in this stage with GPU machine
       - ${{ if eq(parameters.buildJava, 'true') }}:
         - template: make_java_win_binaries.yml
diff --git a/tools/ci_build/github/azure-pipelines/win-gpu-cuda-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-gpu-cuda-ci-pipeline.yml
index 67fd47c3150af..47ece37e66e09 100644
--- a/tools/ci_build/github/azure-pipelines/win-gpu-cuda-ci-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/win-gpu-cuda-ci-pipeline.yml
@@ -62,28 +62,4 @@ stages:
       RunOnnxRuntimeTests: ${{ parameters.RunOnnxRuntimeTests }}
       ORT_EP_NAME: CUDA
       WITH_CACHE: true
-      MachinePool: onnxruntime-Win2022-GPU-A10
-
-- stage: cuda_dml
-  dependsOn: []
-  jobs:
-    - template: templates/jobs/win-ci-vs-2022-job.yml
-      parameters:
-        BuildConfig: 'RelWithDebInfo'
-        EnvSetupScript: setup_env_cuda.bat
-        buildArch: x64
-        additionalBuildFlags: >-
-          --build_java --build_nodejs --use_cuda --cuda_home="$(Agent.TempDirectory)\v${{ parameters.CudaVersion }}"
-          --enable_cuda_profiling --enable_transformers_tool_test
-          --use_dml
-          --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86
-          --cmake_extra_defines onnxruntime_BUILD_UNIT_TESTS=ON
-          --cmake_extra_defines onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON
-        msbuildPlatform: x64
-        isX86: false
-        job_name_suffix: x64_RelWithDebInfo
-        RunOnnxRuntimeTests: ${{ parameters.RunOnnxRuntimeTests }}
-        ORT_EP_NAME: CUDA
-        EnablePython: false
-        WITH_CACHE: true
-        MachinePool: onnxruntime-Win2022-GPU-A10
+      MachinePool: onnxruntime-Win2022-GPU-A10
\ No newline at end of file
diff --git a/tools/ci_build/github/azure-pipelines/win-gpu-dml-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-gpu-dml-ci-pipeline.yml
index 911d99cd2adf3..94b0aa680d54d 100644
--- a/tools/ci_build/github/azure-pipelines/win-gpu-dml-ci-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/win-gpu-dml-ci-pipeline.yml
@@ -43,11 +43,11 @@ stages:
       BuildConfig: 'RelWithDebInfo'
       EnvSetupScript: setup_env.bat
       buildArch: x64
-      additionalBuildFlags: --enable_pybind --use_dml --enable_wcos --use_winml 
+      additionalBuildFlags: --enable_pybind --use_dml --enable_wcos --use_winml
       msbuildPlatform: x64
       isX86: false
       job_name_suffix: x64_RelWithDebInfo
       RunOnnxRuntimeTests: ${{ parameters.RunOnnxRuntimeTests }}
       ORT_EP_NAME: DML
       WITH_CACHE: false
-      MachinePool: onnxruntime-Win2022-GPU-dml-A10
+      MachinePool: onnxruntime-Win2022-GPU-dml-A10
\ No newline at end of file
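End-to-end, the packaging test step now boils down to installing the built wheel and running the backend series with no device argument; which filters apply presumably follows from the installed build rather than from `--devices`. A hypothetical Python rendering of the step, for comparison with the deleted template (the function and its arguments are illustrative, not part of the repo):

```python
import subprocess
import sys

def run_backend_tests(test_data_dir, devices=None):
    """Hypothetical driver mirroring the CI step before/after this change."""
    cmd = [sys.executable, "onnx_backend_test_series.py"]
    if devices:
        # Pre-change behavior (deleted py_packaging_test_step.yml): one run per
        # EP, e.g. devices=["CUDA"] or ["DML"], passed as --devices ... -v.
        cmd += ["--devices", *devices, "-v"]
    subprocess.run(cmd, check=True, cwd=test_data_dir)

# Post-change, the inlined step is simply: python onnx_backend_test_series.py
# run_backend_tests(r"C:\agent\_temp\ort_test_data")
```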