diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc index 60ac16018f539..6d0c08b15d839 100755 --- a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc +++ b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc @@ -1441,10 +1441,10 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 18, int64_t, ReduceMax); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 18, ScatterElements); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 18, ScatterND); -class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 18, float, Pad); -class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 18, double, Pad); -class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 18, MLFloat16, Pad); -class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 18, bool, Pad); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 18, 23, float, Pad); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 18, 23, double, Pad); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 18, 23, MLFloat16, Pad); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 18, 23, bool, Pad); class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 18, 18, float, Resize); class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 18, 18, double, Resize); class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 18, 18, MLFloat16, Resize); @@ -2524,10 +2524,10 @@ static Status RegisterCudaKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, diff --git a/onnxruntime/core/providers/cuda/tensor/pad.cc b/onnxruntime/core/providers/cuda/tensor/pad.cc index 9f8ecbea1bd8a..8c3cbcc4eacbc 100644 --- a/onnxruntime/core/providers/cuda/tensor/pad.cc +++ b/onnxruntime/core/providers/cuda/tensor/pad.cc @@ -40,10 +40,10 @@ namespace cuda { .InputMemoryType(OrtMemTypeCPUInput, 2) \ .TypeConstraint("T", DataTypeImpl::GetTensorType()), \ Pad); \ - ONNX_OPERATOR_TYPED_KERNEL_EX( \ + ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( \ Pad, \ kOnnxDomain, \ - 18, \ + 18, 23, \ T, \ kCudaExecutionProvider, \ (*KernelDefBuilder::Create()) \ diff --git a/onnxruntime/test/providers/cpu/tensor/pad_test.cc b/onnxruntime/test/providers/cpu/tensor/pad_test.cc index a18ff6dabbae0..42b3696cb581d 100644 --- a/onnxruntime/test/providers/cpu/tensor/pad_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/pad_test.cc @@ -42,6 +42,10 @@ static void RunOnnxOpsetTypedTest( if constexpr (std::is_same_v) { provider_types.insert(kTensorrtExecutionProvider); } + // CUDA Pad does not implement wrap mode yet. + if (mode == "wrap") { + provider_types.insert(kCudaExecutionProvider); + } // Exclude QNN due to a few test failures with the CPU backend. provider_types.insert(kQnnExecutionProvider); SessionOptions so; @@ -199,6 +203,31 @@ TYPED_TEST(PadOpTest, Pad_Edge_1D) { "edge"); } +#ifdef USE_CUDA +TEST(PadOpTest, Pad_Edge_Opset19_CudaOnly_MLFloat16) { + if (DefaultCudaExecutionProvider() == nullptr) { + GTEST_SKIP() << "CUDA execution provider is not available"; + } + + OpTester test("Pad", 19); + test.AddAttribute("mode", "edge"); + test.AddInput("data", {3, 2}, + {MLFloat16(1.0f), MLFloat16(2.0f), + MLFloat16(3.0f), MLFloat16(4.0f), + MLFloat16(5.0f), MLFloat16(6.0f)}); + test.AddInput("pads", {4}, {0, 2, 0, 1}, true); + test.AddInput("value", {}, {MLFloat16(0.0f)}, true); + test.AddOutput("output", {3, 5}, + {MLFloat16(1.0f), MLFloat16(1.0f), MLFloat16(1.0f), MLFloat16(2.0f), MLFloat16(2.0f), + MLFloat16(3.0f), MLFloat16(3.0f), MLFloat16(3.0f), MLFloat16(4.0f), MLFloat16(4.0f), + MLFloat16(5.0f), MLFloat16(5.0f), MLFloat16(5.0f), MLFloat16(6.0f), MLFloat16(6.0f)}); + + std::vector> execution_providers; + execution_providers.emplace_back(DefaultCudaExecutionProvider()); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); +} +#endif + TYPED_TEST(PadOpTest, Pad_Constant_2D) { using T = TypeParam; RunAllOpsetAllDomainPadTests({2, 2}, @@ -1391,9 +1420,8 @@ TEST(PadOpTest, Pad_Wrap_NegativeFront_PositiveBack) { // Post-slice core: [4]; wrap 3 -> [4, 4, 4, 4] const std::vector expected_data = {4, 4, 4, 4}; - // CUDA registers only up to 18 and does not impl wrap mode - // so we force version to 19 to automatically exclude EPs that do not - // implement wrap mode similar to the above tests. + // Use opset 19 to exercise wrap mode while EPs without wrap support + // (including CUDA) are excluded by the helper or explicit EP exclusions. OpTester test("Pad", 19); test.AddInput("data", input_shape, input_data); test.AddInput("pads", {static_cast(pads.size())}, pads, true);