diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc index 8b139c2d5514f..d12781252f553 100755 --- a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc +++ b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc @@ -840,6 +840,8 @@ class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kO class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, 17, MLFloat16, ReduceMax); class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, 17, int32_t, ReduceMax); class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, 17, int64_t, ReduceMax); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, 17, int8_t, ReduceMax); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, 17, uint8_t, ReduceMax); class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, 17, float, ReduceMean); class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, 17, double, ReduceMean); class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, 17, MLFloat16, ReduceMean); @@ -1420,13 +1422,13 @@ class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME( kCudaExecutionProvider, kOnnxDomain, 14, 14, double, BatchNormalization); class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME( kCudaExecutionProvider, kOnnxDomain, 14, 14, MLFloat16, BatchNormalization); -class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 18, float, ReduceMin); -class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 18, double, ReduceMin); -class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 18, MLFloat16, ReduceMin); -class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 18, int32_t, ReduceMin); -class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 18, int8_t, ReduceMin); -class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 18, uint8_t, ReduceMin); -class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 18, int64_t, ReduceMin); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 18, 19, float, ReduceMin); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 18, 19, double, ReduceMin); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 18, 19, MLFloat16, ReduceMin); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 18, 19, int32_t, ReduceMin); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 18, 19, int8_t, ReduceMin); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 18, 19, uint8_t, ReduceMin); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 18, 19, int64_t, ReduceMin); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 14, Trilu); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 14, BFloat16, Add); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 14, BFloat16, Sub); @@ -1486,11 +1488,13 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, // Opset 18 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 18, Split); -class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 18, float, ReduceMax); -class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 18, double, ReduceMax); -class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 18, MLFloat16, ReduceMax); -class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 18, int32_t, ReduceMax); -class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 18, int64_t, ReduceMax); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 18, 19, float, ReduceMax); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 18, 19, double, ReduceMax); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 18, 19, MLFloat16, ReduceMax); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 18, 19, int32_t, ReduceMax); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 18, 19, int64_t, ReduceMax); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 18, 19, int8_t, ReduceMax); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 18, 19, uint8_t, ReduceMax); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 18, ScatterElements); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 18, ScatterND); class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 18, 18, float, Pad); @@ -1577,6 +1581,21 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 20, IsInf); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 20, IsNaN); class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 20, 21, float, GridSample); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 20, float, ReduceMax); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 20, double, ReduceMax); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 20, MLFloat16, ReduceMax); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 20, int32_t, ReduceMax); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 20, int64_t, ReduceMax); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 20, int8_t, ReduceMax); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 20, uint8_t, ReduceMax); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 20, float, ReduceMin); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 20, double, ReduceMin); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 20, MLFloat16, ReduceMin); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 20, int32_t, ReduceMin); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 20, int8_t, ReduceMin); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 20, uint8_t, ReduceMin); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 20, int64_t, ReduceMin); + // Opset 21. class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, 22, float, Cast); class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, 22, double, Cast); @@ -2044,6 +2063,8 @@ static Status RegisterCudaKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, @@ -2681,18 +2702,20 @@ static Status RegisterCudaKernels(KernelRegistry& kernel_registry) { // Opset 18 BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, @@ -2781,6 +2804,21 @@ static Status RegisterCudaKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + // Opset 21 BuildKernelCreateInfo, BuildKernelCreateInfo, diff --git a/onnxruntime/core/providers/cuda/reduction/reduction_ops.cc b/onnxruntime/core/providers/cuda/reduction/reduction_ops.cc index 127cfcc557fd5..a0a2f377d0c80 100644 --- a/onnxruntime/core/providers/cuda/reduction/reduction_ops.cc +++ b/onnxruntime/core/providers/cuda/reduction/reduction_ops.cc @@ -36,6 +36,16 @@ namespace cuda { (*KernelDefBuilder::Create()).TypeConstraint("T", DataTypeImpl::GetTensorType()).InputMemoryType(OrtMemTypeCPUInput, 1), \ name); +#define REGISTER_KERNEL_VERSIONED_RANGE_AXES_INPUT_TYPED(name, T, begin, end) \ + ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( \ + name, \ + kOnnxDomain, \ + begin, end, \ + T, \ + kCudaExecutionProvider, \ + (*KernelDefBuilder::Create()).TypeConstraint("T", DataTypeImpl::GetTensorType()).InputMemoryType(OrtMemTypeCPUInput, 1), \ + name); + #define REGISTER_KERNEL_TYPED_AXES_INPUT_WITH_VERSIONED(name, T, last, cur) \ REGISTER_KERNEL_VERSIONED_RANGE_TYPED(name, T, 1, last) \ REGISTER_KERNEL_VERSIONED_SINCE_TYPED(name, T, cur) @@ -876,13 +886,27 @@ REGISTER_KERNEL_ARGMIN_OR_ARGMAX(ArgMin, MLFloat16) REGISTER_KERNEL_ARGMIN_OR_ARGMAX(ArgMin, float) REGISTER_KERNEL_ARGMIN_OR_ARGMAX(ArgMin, double) -REGISTER_KERNEL_TYPED_AXES_INPUT_WITH_VERSIONED(ReduceMax, MLFloat16, 17, 18) -REGISTER_KERNEL_TYPED_AXES_INPUT_WITH_VERSIONED(ReduceMax, float, 17, 18) -REGISTER_KERNEL_TYPED_AXES_INPUT_WITH_VERSIONED(ReduceMax, double, 17, 18) -REGISTER_KERNEL_TYPED_AXES_INPUT_WITH_VERSIONED(ReduceMax, int32_t, 17, 18) -REGISTER_KERNEL_TYPED_AXES_INPUT_WITH_VERSIONED(ReduceMax, int64_t, 17, 18) -REGISTER_KERNEL_TYPED_AXES_INPUT_WITH_VERSIONED(ReduceMax, int8_t, 17, 18) -REGISTER_KERNEL_TYPED_AXES_INPUT_WITH_VERSIONED(ReduceMax, uint8_t, 17, 18) +REGISTER_KERNEL_VERSIONED_RANGE_TYPED(ReduceMax, MLFloat16, 1, 17) +REGISTER_KERNEL_VERSIONED_RANGE_TYPED(ReduceMax, float, 1, 17) +REGISTER_KERNEL_VERSIONED_RANGE_TYPED(ReduceMax, double, 1, 17) +REGISTER_KERNEL_VERSIONED_RANGE_TYPED(ReduceMax, int32_t, 1, 17) +REGISTER_KERNEL_VERSIONED_RANGE_TYPED(ReduceMax, int64_t, 1, 17) +REGISTER_KERNEL_VERSIONED_RANGE_TYPED(ReduceMax, int8_t, 1, 17) +REGISTER_KERNEL_VERSIONED_RANGE_TYPED(ReduceMax, uint8_t, 1, 17) +REGISTER_KERNEL_VERSIONED_RANGE_AXES_INPUT_TYPED(ReduceMax, MLFloat16, 18, 19) +REGISTER_KERNEL_VERSIONED_RANGE_AXES_INPUT_TYPED(ReduceMax, float, 18, 19) +REGISTER_KERNEL_VERSIONED_RANGE_AXES_INPUT_TYPED(ReduceMax, double, 18, 19) +REGISTER_KERNEL_VERSIONED_RANGE_AXES_INPUT_TYPED(ReduceMax, int32_t, 18, 19) +REGISTER_KERNEL_VERSIONED_RANGE_AXES_INPUT_TYPED(ReduceMax, int64_t, 18, 19) +REGISTER_KERNEL_VERSIONED_RANGE_AXES_INPUT_TYPED(ReduceMax, int8_t, 18, 19) +REGISTER_KERNEL_VERSIONED_RANGE_AXES_INPUT_TYPED(ReduceMax, uint8_t, 18, 19) +REGISTER_KERNEL_VERSIONED_SINCE_TYPED(ReduceMax, MLFloat16, 20) +REGISTER_KERNEL_VERSIONED_SINCE_TYPED(ReduceMax, float, 20) +REGISTER_KERNEL_VERSIONED_SINCE_TYPED(ReduceMax, double, 20) +REGISTER_KERNEL_VERSIONED_SINCE_TYPED(ReduceMax, int32_t, 20) +REGISTER_KERNEL_VERSIONED_SINCE_TYPED(ReduceMax, int64_t, 20) +REGISTER_KERNEL_VERSIONED_SINCE_TYPED(ReduceMax, int8_t, 20) +REGISTER_KERNEL_VERSIONED_SINCE_TYPED(ReduceMax, uint8_t, 20) REGISTER_KERNEL_TYPED_AXES_INPUT_WITH_VERSIONED(ReduceMean, MLFloat16, 17, 18) REGISTER_KERNEL_TYPED_AXES_INPUT_WITH_VERSIONED(ReduceMean, float, 17, 18) @@ -890,13 +914,27 @@ REGISTER_KERNEL_TYPED_AXES_INPUT_WITH_VERSIONED(ReduceMean, double, 17, 18) REGISTER_KERNEL_TYPED_AXES_INPUT_WITH_VERSIONED(ReduceMean, BFloat16, 17, 18) REGISTER_KERNEL_TYPED_AXES_INPUT_WITH_VERSIONED(ReduceMean, int32_t, 17, 18) -REGISTER_KERNEL_TYPED_AXES_INPUT_WITH_VERSIONED(ReduceMin, MLFloat16, 17, 18) -REGISTER_KERNEL_TYPED_AXES_INPUT_WITH_VERSIONED(ReduceMin, float, 17, 18) -REGISTER_KERNEL_TYPED_AXES_INPUT_WITH_VERSIONED(ReduceMin, double, 17, 18) -REGISTER_KERNEL_TYPED_AXES_INPUT_WITH_VERSIONED(ReduceMin, int32_t, 17, 18) -REGISTER_KERNEL_TYPED_AXES_INPUT_WITH_VERSIONED(ReduceMin, int64_t, 17, 18) -REGISTER_KERNEL_TYPED_AXES_INPUT_WITH_VERSIONED(ReduceMin, int8_t, 17, 18) -REGISTER_KERNEL_TYPED_AXES_INPUT_WITH_VERSIONED(ReduceMin, uint8_t, 17, 18) +REGISTER_KERNEL_VERSIONED_RANGE_TYPED(ReduceMin, MLFloat16, 1, 17) +REGISTER_KERNEL_VERSIONED_RANGE_TYPED(ReduceMin, float, 1, 17) +REGISTER_KERNEL_VERSIONED_RANGE_TYPED(ReduceMin, double, 1, 17) +REGISTER_KERNEL_VERSIONED_RANGE_TYPED(ReduceMin, int32_t, 1, 17) +REGISTER_KERNEL_VERSIONED_RANGE_TYPED(ReduceMin, int64_t, 1, 17) +REGISTER_KERNEL_VERSIONED_RANGE_TYPED(ReduceMin, int8_t, 1, 17) +REGISTER_KERNEL_VERSIONED_RANGE_TYPED(ReduceMin, uint8_t, 1, 17) +REGISTER_KERNEL_VERSIONED_RANGE_AXES_INPUT_TYPED(ReduceMin, MLFloat16, 18, 19) +REGISTER_KERNEL_VERSIONED_RANGE_AXES_INPUT_TYPED(ReduceMin, float, 18, 19) +REGISTER_KERNEL_VERSIONED_RANGE_AXES_INPUT_TYPED(ReduceMin, double, 18, 19) +REGISTER_KERNEL_VERSIONED_RANGE_AXES_INPUT_TYPED(ReduceMin, int32_t, 18, 19) +REGISTER_KERNEL_VERSIONED_RANGE_AXES_INPUT_TYPED(ReduceMin, int64_t, 18, 19) +REGISTER_KERNEL_VERSIONED_RANGE_AXES_INPUT_TYPED(ReduceMin, int8_t, 18, 19) +REGISTER_KERNEL_VERSIONED_RANGE_AXES_INPUT_TYPED(ReduceMin, uint8_t, 18, 19) +REGISTER_KERNEL_VERSIONED_SINCE_TYPED(ReduceMin, MLFloat16, 20) +REGISTER_KERNEL_VERSIONED_SINCE_TYPED(ReduceMin, float, 20) +REGISTER_KERNEL_VERSIONED_SINCE_TYPED(ReduceMin, double, 20) +REGISTER_KERNEL_VERSIONED_SINCE_TYPED(ReduceMin, int32_t, 20) +REGISTER_KERNEL_VERSIONED_SINCE_TYPED(ReduceMin, int64_t, 20) +REGISTER_KERNEL_VERSIONED_SINCE_TYPED(ReduceMin, int8_t, 20) +REGISTER_KERNEL_VERSIONED_SINCE_TYPED(ReduceMin, uint8_t, 20) REGISTER_KERNEL_TYPED_AXES_INPUT_WITH_VERSIONED(ReduceProd, MLFloat16, 17, 18) REGISTER_KERNEL_TYPED_AXES_INPUT_WITH_VERSIONED(ReduceProd, float, 17, 18) diff --git a/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc b/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc index 52e8b55cb3b98..79617dc16e1f5 100644 --- a/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc +++ b/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc @@ -6318,5 +6318,209 @@ TEST(ReductionOpTest, ReduceSumSquare_NoopWithAxesNotProvided_ElementwiseSquare) test.ConfigEp(DefaultCpuExecutionProvider()).RunWithConfig(); } +// Opset 20 tests for ReduceMax and ReduceMin on CUDA. +// Verifies CUDA kernel registration at opset 20 works for all supported types. +#if defined(USE_CUDA) + +TEST(ReductionOpTest, ReduceMax_float_Opset20_Cuda) { + OpTester test("ReduceMax", 20); + test.AddAttribute("keepdims", (int64_t)1); + test.AddInput("data", {3, 2, 2}, + {1.0f, 2.0f, 3.0f, 4.0f, + 5.0f, 6.0f, 7.0f, 8.0f, + 9.0f, 10.0f, 11.0f, 12.0f}); + test.AddInput("axes", {2}, {1, 2}); + test.AddOutput("reduced", {3, 1, 1}, {4.0f, 8.0f, 12.0f}); + std::vector> execution_providers; + execution_providers.push_back(DefaultCudaExecutionProvider()); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); +} + +TEST(ReductionOpTest, ReduceMax_double_Opset20_Cuda) { + OpTester test("ReduceMax", 20); + test.AddAttribute("keepdims", (int64_t)1); + test.AddInput("data", {3, 2, 2}, + {1.0, 2.0, 3.0, 4.0, + 5.0, 6.0, 7.0, 8.0, + 9.0, 10.0, 11.0, 12.0}); + test.AddInput("axes", {2}, {1, 2}); + test.AddOutput("reduced", {3, 1, 1}, {4.0, 8.0, 12.0}); + std::vector> execution_providers; + execution_providers.push_back(DefaultCudaExecutionProvider()); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); +} + +TEST(ReductionOpTest, ReduceMax_half_Opset20_Cuda) { + OpTester test("ReduceMax", 20); + test.AddAttribute("keepdims", (int64_t)1); + test.AddInput("data", {3, 2, 2}, + FloatsToMLFloat16s({1.0f, 2.0f, 3.0f, 4.0f, + 5.0f, 6.0f, 7.0f, 8.0f, + 9.0f, 10.0f, 11.0f, 12.0f})); + test.AddInput("axes", {2}, {1, 2}); + test.AddOutput("reduced", {3, 1, 1}, FloatsToMLFloat16s({4.0f, 8.0f, 12.0f})); + std::vector> execution_providers; + execution_providers.push_back(DefaultCudaExecutionProvider()); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); +} + +TEST(ReductionOpTest, ReduceMax_int32_Opset20_Cuda) { + OpTester test("ReduceMax", 20); + test.AddAttribute("keepdims", (int64_t)1); + test.AddInput("data", {3, 2, 2}, + {1, 2, 3, 4, + 5, 6, 7, 8, + 9, 10, 11, 12}); + test.AddInput("axes", {2}, {1, 2}); + test.AddOutput("reduced", {3, 1, 1}, {4, 8, 12}); + std::vector> execution_providers; + execution_providers.push_back(DefaultCudaExecutionProvider()); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); +} + +TEST(ReductionOpTest, ReduceMax_int64_Opset20_Cuda) { + OpTester test("ReduceMax", 20); + test.AddAttribute("keepdims", (int64_t)1); + test.AddInput("data", {3, 2, 2}, + {1, 2, 3, 4, + 5, 6, 7, 8, + 9, 10, 11, 12}); + test.AddInput("axes", {2}, {1, 2}); + test.AddOutput("reduced", {3, 1, 1}, {4, 8, 12}); + std::vector> execution_providers; + execution_providers.push_back(DefaultCudaExecutionProvider()); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); +} + +TEST(ReductionOpTest, ReduceMin_float_Opset20_Cuda) { + OpTester test("ReduceMin", 20); + test.AddAttribute("keepdims", (int64_t)1); + test.AddInput("data", {3, 2, 2}, + {1.0f, 2.0f, 3.0f, 4.0f, + 5.0f, 6.0f, 7.0f, 8.0f, + 9.0f, 10.0f, 11.0f, 12.0f}); + test.AddInput("axes", {2}, {1, 2}); + test.AddOutput("reduced", {3, 1, 1}, {1.0f, 5.0f, 9.0f}); + std::vector> execution_providers; + execution_providers.push_back(DefaultCudaExecutionProvider()); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); +} + +TEST(ReductionOpTest, ReduceMin_double_Opset20_Cuda) { + OpTester test("ReduceMin", 20); + test.AddAttribute("keepdims", (int64_t)1); + test.AddInput("data", {3, 2, 2}, + {1.0, 2.0, 3.0, 4.0, + 5.0, 6.0, 7.0, 8.0, + 9.0, 10.0, 11.0, 12.0}); + test.AddInput("axes", {2}, {1, 2}); + test.AddOutput("reduced", {3, 1, 1}, {1.0, 5.0, 9.0}); + std::vector> execution_providers; + execution_providers.push_back(DefaultCudaExecutionProvider()); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); +} + +TEST(ReductionOpTest, ReduceMin_half_Opset20_Cuda) { + OpTester test("ReduceMin", 20); + test.AddAttribute("keepdims", (int64_t)1); + test.AddInput("data", {3, 2, 2}, + FloatsToMLFloat16s({1.0f, 2.0f, 3.0f, 4.0f, + 5.0f, 6.0f, 7.0f, 8.0f, + 9.0f, 10.0f, 11.0f, 12.0f})); + test.AddInput("axes", {2}, {1, 2}); + test.AddOutput("reduced", {3, 1, 1}, FloatsToMLFloat16s({1.0f, 5.0f, 9.0f})); + std::vector> execution_providers; + execution_providers.push_back(DefaultCudaExecutionProvider()); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); +} + +TEST(ReductionOpTest, ReduceMin_int32_Opset20_Cuda) { + OpTester test("ReduceMin", 20); + test.AddAttribute("keepdims", (int64_t)1); + test.AddInput("data", {3, 2, 2}, + {1, 2, 3, 4, + 5, 6, 7, 8, + 9, 10, 11, 12}); + test.AddInput("axes", {2}, {1, 2}); + test.AddOutput("reduced", {3, 1, 1}, {1, 5, 9}); + std::vector> execution_providers; + execution_providers.push_back(DefaultCudaExecutionProvider()); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); +} + +TEST(ReductionOpTest, ReduceMin_int64_Opset20_Cuda) { + OpTester test("ReduceMin", 20); + test.AddAttribute("keepdims", (int64_t)1); + test.AddInput("data", {3, 2, 2}, + {1, 2, 3, 4, + 5, 6, 7, 8, + 9, 10, 11, 12}); + test.AddInput("axes", {2}, {1, 2}); + test.AddOutput("reduced", {3, 1, 1}, {1, 5, 9}); + std::vector> execution_providers; + execution_providers.push_back(DefaultCudaExecutionProvider()); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); +} + +TEST(ReductionOpTest, ReduceMin_int8_Opset20_Cuda) { + OpTester test("ReduceMin", 20); + test.AddAttribute("keepdims", (int64_t)1); + test.AddInput("data", {3, 2, 2}, + {1, 2, 3, 4, + 5, 6, 7, 8, + 9, 10, 11, 12}); + test.AddInput("axes", {2}, {1, 2}); + test.AddOutput("reduced", {3, 1, 1}, {1, 5, 9}); + std::vector> execution_providers; + execution_providers.push_back(DefaultCudaExecutionProvider()); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); +} + +TEST(ReductionOpTest, ReduceMin_uint8_Opset20_Cuda) { + OpTester test("ReduceMin", 20); + test.AddAttribute("keepdims", (int64_t)1); + test.AddInput("data", {3, 2, 2}, + {1, 2, 3, 4, + 5, 6, 7, 8, + 9, 10, 11, 12}); + test.AddInput("axes", {2}, {1, 2}); + test.AddOutput("reduced", {3, 1, 1}, {1, 5, 9}); + std::vector> execution_providers; + execution_providers.push_back(DefaultCudaExecutionProvider()); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); +} + +// Test ReduceMax at opset 20 with keepdims=0 on CUDA +TEST(ReductionOpTest, ReduceMax_float_Opset20_NoKeepdims_Cuda) { + OpTester test("ReduceMax", 20); + test.AddAttribute("keepdims", (int64_t)0); + test.AddInput("data", {3, 2, 2}, + {1.0f, 2.0f, 3.0f, 4.0f, + 5.0f, 6.0f, 7.0f, 8.0f, + 9.0f, 10.0f, 11.0f, 12.0f}); + test.AddInput("axes", {2}, {1, 2}); + test.AddOutput("reduced", {3}, {4.0f, 8.0f, 12.0f}); + std::vector> execution_providers; + execution_providers.push_back(DefaultCudaExecutionProvider()); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); +} + +// Test ReduceMin at opset 20 with keepdims=0 on CUDA +TEST(ReductionOpTest, ReduceMin_float_Opset20_NoKeepdims_Cuda) { + OpTester test("ReduceMin", 20); + test.AddAttribute("keepdims", (int64_t)0); + test.AddInput("data", {3, 2, 2}, + {1.0f, 2.0f, 3.0f, 4.0f, + 5.0f, 6.0f, 7.0f, 8.0f, + 9.0f, 10.0f, 11.0f, 12.0f}); + test.AddInput("axes", {2}, {1, 2}); + test.AddOutput("reduced", {3}, {1.0f, 5.0f, 9.0f}); + std::vector> execution_providers; + execution_providers.push_back(DefaultCudaExecutionProvider()); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); +} + +#endif // defined(USE_CUDA) + } // namespace test } // namespace onnxruntime