From 7a4afcee36bb8590502814b472cebe423c3ee40f Mon Sep 17 00:00:00 2001 From: Tianlei Wu Date: Mon, 4 Nov 2024 06:31:35 +0000 Subject: [PATCH 1/8] cuda ArgMin-12, ArgMin-13, ArgMax-12, ArgMax-13 --- .../providers/cuda/cuda_execution_provider.cc | 63 ++++++++++++++++++- .../providers/cuda/reduction/reduction_ops.cc | 29 +++++---- .../providers/cuda/reduction/reduction_ops.h | 20 +++++- .../cpu/reduction/reduction_ops_test.cc | 42 +++++++++++++ 4 files changed, 137 insertions(+), 17 deletions(-) diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc index 497d0014795ec..21b0b033e32c1 100644 --- a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc +++ b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc @@ -963,6 +963,13 @@ class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kO class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 12, 12, Dropout); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 12, Einsum); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 12, 12, float, ArgMax); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 12, 12, double, ArgMax); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 12, 12, MLFloat16, ArgMax); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 12, 12, float, ArgMin); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 12, 12, double, ArgMin); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 12, 12, MLFloat16, ArgMin); + // OpSet 13 class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, 14, Pow); class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, 
kOnnxDomain, 13, 13, int32_t, Add); @@ -1199,6 +1206,13 @@ class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kO class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, 18, int8_t, DequantizeLinear); class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, 18, uint8_t, DequantizeLinear); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, float, ArgMax); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, double, ArgMax); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, MLFloat16, ArgMax); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, float, ArgMin); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, double, ArgMin); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, MLFloat16, ArgMin); + // OpSet 14 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 14, CumSum); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 14, float, Relu); @@ -1640,6 +1654,9 @@ static Status RegisterCudaKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, @@ -1822,9 +1839,6 @@ static Status RegisterCudaKernels(KernelRegistry& kernel_registry) { 19, IsInf)>, // opset 11 - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, @@ -1916,6 +1930,13 @@ static Status RegisterCudaKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + 
BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + // OpSet 13 BuildKernelCreateInfo, BuildKernelCreateInfo, @@ -2150,6 +2171,13 @@ static Status RegisterCudaKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + // OpSet 14 BuildKernelCreateInfo, BuildKernelCreateInfo, @@ -2566,6 +2594,32 @@ static bool CastNeedFallbackToCPU(const onnxruntime::Node& node) { return false; } +static bool ArgMaxOrArgMinNeedFallbackToCPU(const onnxruntime::Node& node) { + // Opset 12 introduced the attribute "select_last_index" + if (node.SinceVersion() >= 12) { + const auto& node_attributes = node.GetAttributes(); + + for (auto& attr : node_attributes) { + auto& attr_name = attr.first; + auto& attr_value = attr.second; + + // CuDNN doesn't support picking the last index in case of encountering + // duplicate max values. + // CuDNN's API doc doesn't mention what happens in case duplicates are encountered, + // but based on testing, the results seem to indicate a "stable" implementation + // (i.e.) relative ordering is preserved which is the expected behavior when the + // attribute takes on the default value (most commong use-case for this operator). 
+ if ("select_last_index" == attr_name) { + if (attr_value.i() != 0) { + return true; + } + } + } + } + + return false; +} + std::unique_ptr CUDAExecutionProvider::GetDataTransfer() const { return std::make_unique(); } @@ -2615,6 +2669,9 @@ CUDAExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph, } else if ("ConvTranspose" == node.OpType()) { not_supported = ConvTransposeNeedFallbackToCPU(node, logger, graph, IsNHWCPreferred()); force_inside = !not_supported; + } else if ("ArgMax" == node.OpType() || "ArgMin" == node.OpType()) { + not_supported = ArgMaxOrArgMinNeedFallbackToCPU(node); + force_inside = !not_supported; } else if ("Cast" == node.OpType()) { not_supported = CastNeedFallbackToCPU(node); // cast is not compute heavy, and may be placed outside diff --git a/onnxruntime/core/providers/cuda/reduction/reduction_ops.cc b/onnxruntime/core/providers/cuda/reduction/reduction_ops.cc index 860bea67dc719..6eb1a1dc92cf1 100644 --- a/onnxruntime/core/providers/cuda/reduction/reduction_ops.cc +++ b/onnxruntime/core/providers/cuda/reduction/reduction_ops.cc @@ -16,17 +16,17 @@ using namespace onnxruntime::common; namespace onnxruntime { namespace cuda { -#define REGISTER_KERNEL_UNTIL_VERSIONED_TYPED(name, T, end) \ +#define REGISTER_KERNEL_VERSIONED_RANGE_TYPED(name, T, begin, end) \ ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( \ name, \ kOnnxDomain, \ - 1, end, \ + begin, end, \ T, \ kCudaExecutionProvider, \ (*KernelDefBuilder::Create()).TypeConstraint("T", DataTypeImpl::GetTensorType()), \ name); -#define REGISTER_KERNEL_TYPED_AXES_INPUT(name, T, version) \ +#define REGISTER_KERNEL_VERSIONED_SINCE_TYPED(name, T, version) \ ONNX_OPERATOR_TYPED_KERNEL_EX( \ name, \ kOnnxDomain, \ @@ -37,8 +37,13 @@ namespace cuda { name); #define REGISTER_KERNEL_TYPED_AXES_INPUT_WITH_VERSIONED(name, T, last, cur) \ - REGISTER_KERNEL_UNTIL_VERSIONED_TYPED(name, T, last) \ - REGISTER_KERNEL_TYPED_AXES_INPUT(name, T, cur) + REGISTER_KERNEL_VERSIONED_RANGE_TYPED(name, T, 1, 
last) \ + REGISTER_KERNEL_VERSIONED_SINCE_TYPED(name, T, cur) + +#define REGISTER_KERNEL_ARGMIN_OR_ARGMAX(name, T) \ + REGISTER_KERNEL_VERSIONED_RANGE_TYPED(name, T, 1, 11) \ + REGISTER_KERNEL_VERSIONED_RANGE_TYPED(name, T, 12, 12) \ + REGISTER_KERNEL_VERSIONED_SINCE_TYPED(name, T, 13) // TODO ReduceKernel::ReduceKernelShared() is still used by some other training classes though it's not used here - this should be refactored. template @@ -829,14 +834,14 @@ template std::unique_ptr ReduceCompute class ArgMax final : public ReduceKernel { public: - ArgMax(const OpKernelInfo& info) : ReduceKernel(info) {} + ArgMax(const OpKernelInfo& info) : ReduceKernel(info) { + // The following is just a safety check. + // The logic in ArgMaxOrArgMinNeedFallbackToCPU() makes sure to not assign ArgMax + // nodes with select_last_index == 1 to the CUDA EP. + int64_t select_last_index = 0; + if (info.GetAttr("select_last_index", &select_last_index).IsOK()) { + ORT_ENFORCE(select_last_index == 0, "select_last_index as 1 is not supported on CUDA"); + } + } Status ComputeInternal(OpKernelContext* ctx) const override { return ComputeImpl(ctx, CUDNN_REDUCE_TENSOR_MAX); @@ -98,7 +106,15 @@ class ArgMax final : public ReduceKernel { template class ArgMin final : public ReduceKernel { public: - ArgMin(const OpKernelInfo& info) : ReduceKernel(info) {} + ArgMin(const OpKernelInfo& info) : ReduceKernel(info) { + // The following is just a safety check. + // The logic in ArgMaxOrArgMinNeedFallbackToCPU() makes sure to not assign ArgMax + // nodes with select_last_index == 1 to the CUDA EP. 
+ int64_t select_last_index = 0; + if (info.GetAttr("select_last_index", &select_last_index).IsOK()) { + ORT_ENFORCE(select_last_index == 0, "select_last_index as 1 is not supported on CUDA"); + } + } Status ComputeInternal(OpKernelContext* ctx) const override { return ComputeImpl(ctx, CUDNN_REDUCE_TENSOR_MIN); diff --git a/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc b/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc index bb6d732fccb8f..5f467551234cd 100644 --- a/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc +++ b/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc @@ -3337,6 +3337,27 @@ TEST(ReductionOpTest, ArgMax_int32_last_index_dups) { test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); } +TEST(ReductionOpTest, ArgMax_float_last_index_dups) { + OpTester test("ArgMax", 12); + test.AddAttribute("axis", static_cast(0)); + test.AddAttribute("keepdims", static_cast(1)); + + // Since select_last_index is 0 by default, this test should run on both CPU and CUDA + test.AddAttribute("select_last_index", static_cast(0)); + + std::vector data_vec; + + size_t data_size = 10000; + data_vec.reserve(data_size); + for (size_t i = 0; i < data_size; ++i) { + data_vec.push_back(10.f); + } + test.AddInput("data", {10000}, data_vec); + test.AddOutput("reduced", {1}, {0}); + + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); +} + TEST(ReductionOpTest, ArgMax_int32_neg_axis) { OpTester test("ArgMax"); test.AddAttribute("axis", (int64_t)(-2)); @@ -3655,6 +3676,27 @@ TEST(ReductionOpTest, ArgMin_int32_neg_axis) { test.Run(); } +TEST(ReductionOpTest, ArgMin_float_last_index_dups) { + OpTester test("ArgMin", 13); + test.AddAttribute("axis", static_cast(0)); + test.AddAttribute("keepdims", static_cast(1)); + + // Since select_last_index is 0 by default, this test should run on both CPU and CUDA + test.AddAttribute("select_last_index", static_cast(0)); + + 
std::vector data_vec; + + size_t data_size = 10000; + data_vec.reserve(data_size); + for (size_t i = 0; i < data_size; ++i) { + data_vec.push_back(10.f); + } + test.AddInput("data", {10000}, data_vec); + test.AddOutput("reduced", {1}, {0}); + + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); +} + TEST(ReductionOpTest, OptimizeShapeForFastReduce_ReduceDimWithZero1) { FastReduceKind fast_kind; TensorShapeVector fast_shape, fast_output_shape, fast_axes; From bb5335a7110c9155f40fdf669a6a2d1d829397d4 Mon Sep 17 00:00:00 2001 From: Tianlei Wu Date: Mon, 4 Nov 2024 16:17:59 +0000 Subject: [PATCH 2/8] update doc --- docs/OperatorKernels.md | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/docs/OperatorKernels.md b/docs/OperatorKernels.md index bd886abc98a89..5fb1e54b38c2b 100644 --- a/docs/OperatorKernels.md +++ b/docs/OperatorKernels.md @@ -554,8 +554,12 @@ Do not modify directly.* |||[7, 12]|**T** = tensor(double), tensor(float), tensor(float16), tensor(int32), tensor(int64), tensor(uint32), tensor(uint64)| |Affine|*in* X:**T**
*out* Y:**T**|1+|**T** = tensor(double), tensor(float), tensor(float16)| |And|*in* A:**T**
*in* B:**T**
*out* C:**T1**|7+|**T** = tensor(bool)
**T1** = tensor(bool)| -|ArgMax|*in* data:**T**
*out* reduced:**tensor(int64)**|[1, 11]|**T** = tensor(double), tensor(float), tensor(float16)| -|ArgMin|*in* data:**T**
*out* reduced:**tensor(int64)**|[1, 11]|**T** = tensor(double), tensor(float), tensor(float16)| +|ArgMax|*in* data:**T**
*out* reduced:**tensor(int64)**|13+|**T** = tensor(double), tensor(float), tensor(float16)| +|||12|**T** = tensor(double), tensor(float), tensor(float16)| +|||[1, 11]|**T** = tensor(double), tensor(float), tensor(float16)| +|ArgMin|*in* data:**T**
*out* reduced:**tensor(int64)**|13+|**T** = tensor(double), tensor(float), tensor(float16)| +|||12|**T** = tensor(double), tensor(float), tensor(float16)| +|||[1, 11]|**T** = tensor(double), tensor(float), tensor(float16)| |AveragePool|*in* X:**T**
*out* Y:**T**|11+|**T** = tensor(double), tensor(float), tensor(float16)| |||10|**T** = tensor(double), tensor(float), tensor(float16)| |||[7, 9]|**T** = tensor(double), tensor(float), tensor(float16)| From 36da528d173c6000d36693d6d57bae5b6a2ed04f Mon Sep 17 00:00:00 2001 From: Tianlei Wu Date: Mon, 4 Nov 2024 16:44:46 +0000 Subject: [PATCH 3/8] update comments --- onnxruntime/core/providers/cuda/reduction/reduction_ops.cc | 1 - onnxruntime/core/providers/cuda/reduction/reduction_ops.h | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/onnxruntime/core/providers/cuda/reduction/reduction_ops.cc b/onnxruntime/core/providers/cuda/reduction/reduction_ops.cc index 6eb1a1dc92cf1..4f8e6605ce151 100644 --- a/onnxruntime/core/providers/cuda/reduction/reduction_ops.cc +++ b/onnxruntime/core/providers/cuda/reduction/reduction_ops.cc @@ -834,7 +834,6 @@ template std::unique_ptr ReduceCompute { public: ArgMin(const OpKernelInfo& info) : ReduceKernel(info) { // The following is just a safety check. - // The logic in ArgMaxOrArgMinNeedFallbackToCPU() makes sure to not assign ArgMax + // The logic in ArgMaxOrArgMinNeedFallbackToCPU() makes sure to not assign ArgMin // nodes with select_last_index == 1 to the CUDA EP. 
int64_t select_last_index = 0; if (info.GetAttr("select_last_index", &select_last_index).IsOK()) { From 1b7311aca11a2031f5810e6013ca6447d84156b6 Mon Sep 17 00:00:00 2001 From: Tianlei Wu Date: Mon, 4 Nov 2024 23:19:56 +0000 Subject: [PATCH 4/8] Test ArgMax/ArgMin first-index selection with random input data --- .../providers/cuda/cuda_execution_provider.cc | 2 +- .../cpu/reduction/reduction_ops_test.cc | 59 ++++++++++++++----- 2 files changed, 44 insertions(+), 17 deletions(-) diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc index 21b0b033e32c1..8396e2629d2bf 100644 --- a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc +++ b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc @@ -2608,7 +2608,7 @@ static bool ArgMaxOrArgMinNeedFallbackToCPU(const onnxruntime::Node& node) { // CuDNN's API doc doesn't mention what happens in case duplicates are encountered, // but based on testing, the results seem to indicate a "stable" implementation // (i.e.) relative ordering is preserved which is the expected behavior when the - // attribute takes on the default value (most commong use-case for this operator). + // attribute takes on the default value (most common use-case for this operator).
if ("select_last_index" == attr_name) { if (attr_value.i() != 0) { return true; diff --git a/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc b/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc index 5f467551234cd..e18ad150b3e4b 100644 --- a/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc +++ b/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc @@ -3,6 +3,7 @@ #include #include +#include #include #include "gtest/gtest.h" #include "test/common/dnnl_op_test_utils.h" @@ -3337,7 +3338,7 @@ TEST(ReductionOpTest, ArgMax_int32_last_index_dups) { test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); } -TEST(ReductionOpTest, ArgMax_float_last_index_dups) { +TEST(ReductionOpTest, ArgMax_float_first_index_random) { OpTester test("ArgMax", 12); test.AddAttribute("axis", static_cast(0)); test.AddAttribute("keepdims", static_cast(1)); @@ -3345,15 +3346,28 @@ TEST(ReductionOpTest, ArgMax_float_last_index_dups) { // Since select_last_index is 0 by default, this test should run on both CPU and CUDA test.AddAttribute("select_last_index", static_cast(0)); - std::vector data_vec; + constexpr size_t vector_size = 64 * 1024; + constexpr float max_value = std::numeric_limits::infinity(); - size_t data_size = 10000; - data_vec.reserve(data_size); - for (size_t i = 0; i < data_size; ++i) { - data_vec.push_back(10.f); + std::random_device rd; + std::mt19937 generator(rd()); + std::uniform_int_distribution distribution(0, static_cast(vector_size) - 1); + + std::vector data_vec(vector_size, 0.0f); + + int min_index = -1; + + // Try replace 8 elements with max_value. It is fine that some elements hit same index. 
+ for (int i = 0; i < 8; ++i) { + int index = distribution(generator); + data_vec[index] = max_value; + if (i == 0 || index < min_index) { + min_index = index; + } } - test.AddInput("data", {10000}, data_vec); - test.AddOutput("reduced", {1}, {0}); + + test.AddInput("data", {vector_size}, data_vec); + test.AddOutput("reduced", {1}, {min_index}); test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); } @@ -3676,7 +3690,7 @@ TEST(ReductionOpTest, ArgMin_int32_neg_axis) { test.Run(); } -TEST(ReductionOpTest, ArgMin_float_last_index_dups) { +TEST(ReductionOpTest, ArgMin_float_first_index_random) { OpTester test("ArgMin", 13); test.AddAttribute("axis", static_cast(0)); test.AddAttribute("keepdims", static_cast(1)); @@ -3684,15 +3698,28 @@ TEST(ReductionOpTest, ArgMin_float_last_index_dups) { // Since select_last_index is 0 by default, this test should run on both CPU and CUDA test.AddAttribute("select_last_index", static_cast(0)); - std::vector data_vec; + constexpr size_t vector_size = 64 * 1024; + constexpr float min_value = -std::numeric_limits::infinity(); - size_t data_size = 10000; - data_vec.reserve(data_size); - for (size_t i = 0; i < data_size; ++i) { - data_vec.push_back(10.f); + std::random_device rd; + std::mt19937 generator(rd()); + std::uniform_int_distribution distribution(0, static_cast(vector_size) - 1); + + std::vector data_vec(vector_size, 0.0f); + + int min_index = -1; + + // Try replace 8 elements with min_value. It is fine that some elements hit same index. 
+ for (int i = 0; i < 8; ++i) { + int index = distribution(generator); + data_vec[index] = min_value; + if (i == 0 || index < min_index) { + min_index = index; + } } - test.AddInput("data", {10000}, data_vec); - test.AddOutput("reduced", {1}, {0}); + + test.AddInput("data", {vector_size}, data_vec); + test.AddOutput("reduced", {1}, {min_index}); test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); } From cbaeebb8a741e8f579d8930f22d7148b59651e3e Mon Sep 17 00:00:00 2001 From: Tianlei Wu Date: Mon, 4 Nov 2024 23:48:49 +0000 Subject: [PATCH 5/8] ArgMax / ArgMin in ROCm --- .../providers/rocm/reduction/reduction_ops.cc | 28 +++++----- .../providers/rocm/rocm_execution_provider.cc | 53 +++++++++++++++++++ 2 files changed, 69 insertions(+), 12 deletions(-) diff --git a/onnxruntime/core/providers/rocm/reduction/reduction_ops.cc b/onnxruntime/core/providers/rocm/reduction/reduction_ops.cc index 1340c49c38ded..d8b7e26d17b65 100644 --- a/onnxruntime/core/providers/rocm/reduction/reduction_ops.cc +++ b/onnxruntime/core/providers/rocm/reduction/reduction_ops.cc @@ -16,17 +16,17 @@ using namespace onnxruntime::common; namespace onnxruntime { namespace rocm { -#define REGISTER_KERNEL_UNTIL_VERSIONED_TYPED(name, T, end) \ +#define REGISTER_KERNEL_VERSIONED_RANGE_TYPED(name, T, begin, end) \ ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( \ name, \ kOnnxDomain, \ - 1, end, \ + begin, end, \ T, \ kRocmExecutionProvider, \ (*KernelDefBuilder::Create()).TypeConstraint("T", DataTypeImpl::GetTensorType()), \ name); -#define REGISTER_KERNEL_TYPED_AXES_INPUT(name, T, version) \ +#define REGISTER_KERNEL_VERSIONED_SINCE_TYPED(name, T, version) \ ONNX_OPERATOR_TYPED_KERNEL_EX( \ name, \ kOnnxDomain, \ @@ -37,8 +37,13 @@ namespace rocm { name); #define REGISTER_KERNEL_TYPED_AXES_INPUT_WITH_VERSIONED(name, T, last, cur) \ - REGISTER_KERNEL_UNTIL_VERSIONED_TYPED(name, T, last) \ - REGISTER_KERNEL_TYPED_AXES_INPUT(name, T, cur) + 
REGISTER_KERNEL_VERSIONED_RANGE_TYPED(name, T, 1, last) \ + REGISTER_KERNEL_VERSIONED_SINCE_TYPED(name, T, cur) + +#define REGISTER_KERNEL_ARGMIN_OR_ARGMAX(name, T) \ + REGISTER_KERNEL_VERSIONED_RANGE_TYPED(name, T, 1, 11) \ + REGISTER_KERNEL_VERSIONED_RANGE_TYPED(name, T, 12, 12) \ + REGISTER_KERNEL_VERSIONED_SINCE_TYPED(name, T, 13) // TODO ReduceKernel::ReduceKernelShared() is still used by some other training classes though it's not used here - this should be refactored. template @@ -830,14 +835,13 @@ template std::unique_ptr ReduceCompute, // BuildKernelCreateInfo, BuildKernelCreateInfo, + BuildKernelCreateInfo, + // BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, // BuildKernelCreateInfo, BuildKernelCreateInfo, @@ -1879,6 +1896,13 @@ static Status RegisterRocmKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, BuildKernelCreateInfo, + BuildKernelCreateInfo, + // BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + // BuildKernelCreateInfo, + BuildKernelCreateInfo, + // OpSet 13 BuildKernelCreateInfo, BuildKernelCreateInfo, @@ -2112,6 +2136,12 @@ static Status RegisterRocmKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, + BuildKernelCreateInfo, + // BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + // BuildKernelCreateInfo, + BuildKernelCreateInfo, // OpSet 14 BuildKernelCreateInfo, @@ -2387,6 +2417,26 @@ static bool CastNeedFallbackToCPU(const onnxruntime::Node& node) { return false; } +static bool ArgMaxOrArgMinNeedFallbackToCPU(const onnxruntime::Node& node) { + // Opset 12 introduced the attribute "select_last_index" + if (node.SinceVersion() >= 12) { + const auto& node_attributes = node.GetAttributes(); + + for (auto& attr : node_attributes) { + auto& attr_name = attr.first; + auto& attr_value = attr.second; + + // It is not supported to pick the last index in case of encountering duplicate max values. 
+ if ("select_last_index" == attr_name) { + if (attr_value.i() != 0) { + return true; + } + } + } + + return false; +} std::unique_ptr ROCMExecutionProvider::GetDataTransfer() const { return std::make_unique(); } @@ -2425,6 +2475,9 @@ ROCMExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph, "GRU" == node.OpType()) { not_supported = true; force_inside = !not_supported; + } else if ("ArgMax" == node.OpType() || "ArgMin" == node.OpType()) { + not_supported = ArgMaxOrArgMinNeedFallbackToCPU(node); + force_inside = !not_supported; } else if ("Cast" == node.OpType()) { not_supported = CastNeedFallbackToCPU(node); // cast is not compute heavy, and may be placed outside diff --git a/onnxruntime/core/providers/rocm/rocm_execution_provider.cc b/onnxruntime/core/providers/rocm/rocm_execution_provider.cc index 6a6fb68d8cdf9..75b8ac7e134f3 100644 --- a/onnxruntime/core/providers/rocm/rocm_execution_provider.cc +++ b/onnxruntime/core/providers/rocm/rocm_execution_provider.cc @@ -1802,9 +1802,6 @@ static Status RegisterRocmKernels(KernelRegistry& kernel_registry) { 19, IsInf)>, // opset 11 - BuildKernelCreateInfo, - // BuildKernelCreateInfo, - BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, From 6da3dea432e13a4838e67c66438a00ca740f0903 Mon Sep 17 00:00:00 2001 From: Tianlei Wu Date: Tue, 5 Nov 2024 17:59:38 +0000 Subject: [PATCH 7/8] Fix OpenVINO CI --- onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc b/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc index e18ad150b3e4b..fffd4c5f56f01 100644 ---
a/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc +++ b/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc @@ -3721,7 +3721,8 @@ TEST(ReductionOpTest, ArgMin_float_first_index_random) { test.AddInput("data", {vector_size}, data_vec); test.AddOutput("reduced", {1}, {min_index}); - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); + // Exclude OpenVINO since it failed to handle this case. + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider, kOpenVINOExecutionProvider}); } TEST(ReductionOpTest, OptimizeShapeForFastReduce_ReduceDimWithZero1) { FastReduceKind fast_kind; TensorShapeVector fast_shape, fast_output_shape, fast_axes; From 193f70fde0c40411deddfd66ce4102cd2b86521b Mon Sep 17 00:00:00 2001 From: Tianlei Wu Date: Tue, 5 Nov 2024 23:05:23 +0000 Subject: [PATCH 8/8] Exclude OpenVINO in the ArgMax test --- onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc b/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc index fffd4c5f56f01..c1c049ae5f967 100644 --- a/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc +++ b/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc @@ -3369,7 +3369,8 @@ TEST(ReductionOpTest, ArgMax_float_first_index_random) { test.AddInput("data", {vector_size}, data_vec); test.AddOutput("reduced", {1}, {min_index}); - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); + // Exclude OpenVINO since it failed to handle this case. + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider, kOpenVINOExecutionProvider}); } TEST(ReductionOpTest, ArgMax_int32_neg_axis) {