diff --git a/docs/OperatorKernels.md b/docs/OperatorKernels.md
index db30b74301db1..3522e70bceb25 100644
--- a/docs/OperatorKernels.md
+++ b/docs/OperatorKernels.md
@@ -844,7 +844,9 @@ Do not modify directly.*
|Relu|*in* X:**T**
*out* Y:**T**|14+|**T** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16)|
|||13|**T** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16)|
|||[6, 12]|**T** = tensor(double), tensor(float), tensor(float16)|
-|Reshape|*in* data:**T**
*in* shape:**tensor(int64)**
*out* reshaped:**T**
or
*in* data:**T**
*out* reshaped:**T**|19+|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**shape** = tensor(int64)|
+|Reshape|*in* data:**T**
*in* shape:**tensor(int64)**
*out* reshaped:**T**
or
*in* data:**T**
*out* reshaped:**T**|23+|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**shape** = tensor(int64)|
+|||[21, 22]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**shape** = tensor(int64)|
+|||[19, 20]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**shape** = tensor(int64)|
|||[14, 18]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**shape** = tensor(int64)|
|||13|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**shape** = tensor(int64)|
|||[5, 12]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**shape** = tensor(int64)|
@@ -879,7 +881,9 @@ Do not modify directly.*
|SequenceErase|*in* input_sequence:**S**
*in* position:**I**
*out* output_sequence:**S**|11+|**I** = tensor(int32), tensor(int64)
**S** = seq(tensor(bfloat16)), seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8))|
|SequenceInsert|*in* input_sequence:**S**
*in* tensor:**T**
*in* position:**I**
*out* output_sequence:**S**|11+|**I** = tensor(int32), tensor(int64)
**S** = seq(tensor(bfloat16)), seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8))|
|SequenceLength|*in* input_sequence:**S**
*out* length:**I**|11+|**I** = tensor(int64)
**S** = seq(tensor(bfloat16)), seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8))|
-|Shape|*in* data:**T**
*out* shape:**T1**|19+|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**T1** = tensor(int64)|
+|Shape|*in* data:**T**
*out* shape:**T1**|23+|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**T1** = tensor(int64)|
+|||[21, 22]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**T1** = tensor(int64)|
+|||[19, 20]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**T1** = tensor(int64)|
|||[15, 18]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**T1** = tensor(int64)|
|||[13, 14]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**T1** = tensor(int64)|
|||[1, 12]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**T1** = tensor(int64)|
@@ -908,7 +912,9 @@ Do not modify directly.*
|||[2, 10]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|Sqrt|*in* X:**T**
*out* Y:**T**|13+|**T** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16)|
|||[6, 12]|**T** = tensor(double), tensor(float), tensor(float16)|
-|Squeeze|*in* data:**T**
*in* axes:**tensor(int64)**
*out* squeezed:**T**
or
*in* data:**T**
*out* squeezed:**T**|13+|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
+|Squeeze|*in* data:**T**
*in* axes:**tensor(int64)**
*out* squeezed:**T**
or
*in* data:**T**
*out* squeezed:**T**|23+|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
+|||[21, 22]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
+|||[13, 20]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|||[11, 12]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|||[1, 10]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|Sub|*in* A:**T**
*in* B:**T**
*out* C:**T**|14+|**T** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
@@ -926,10 +932,14 @@ Do not modify directly.*
|TopK|*in* X:**T**
*in* K:**tensor(int64)**
*out* Values:**T**
*out* Indices:**I**
or
*in* X:**T**
*out* Values:**T**
*out* Indices:**I**|11+|**I** = tensor(int64)
**T** = tensor(double), tensor(float), tensor(float16), tensor(int32), tensor(int64)|
|||10|**I** = tensor(int64)
**T** = tensor(double), tensor(float), tensor(float16), tensor(int32), tensor(int64)|
|||[1, 9]|**T** = tensor(double), tensor(float), tensor(float16), tensor(int32), tensor(int64)|
-|Transpose|*in* data:**T**
*out* transposed:**T**|13+|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
+|Transpose|*in* data:**T**
*out* transposed:**T**|23+|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
+|||[21, 22]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
+|||[13, 20]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|||[1, 12]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|Trilu|*in* input:**T**
*in* k:**tensor(int64)**
*out* output:**T**|14+|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
-|Unsqueeze|*in* data:**T**
*in* axes:**tensor(int64)**
*out* expanded:**T**
or
*in* data:**T**
*out* expanded:**T**|13+|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
+|Unsqueeze|*in* data:**T**
*in* axes:**tensor(int64)**
*out* expanded:**T**
or
*in* data:**T**
*out* expanded:**T**|23+|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
+|||[21, 22]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
+|||[13, 20]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|||[11, 12]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|||[1, 10]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|Upsample|*in* X:**T**
*in* scales:**tensor(float)**
*out* Y:**T**
or
*in* X:**T**
*out* Y:**T**|9|**T** = tensor(double), tensor(float), tensor(float16), tensor(int32), tensor(uint8)|
diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc
index b4409ee751c38..2fbf96cb7ae4e 100644
--- a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc
+++ b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc
@@ -1127,7 +1127,7 @@ class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kO
class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, 13, Reshape);
class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, 14, Shape);
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, Size);
-class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, Transpose);
+class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, 20, Transpose);
class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, 15, ScatterElements);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, int32_t, Slice);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, int64_t, Slice);
@@ -1138,8 +1138,8 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain,
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, double, LogSoftmax);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, MLFloat16, LogSoftmax);
class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, 17, Split);
-class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, Squeeze);
-class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, Unsqueeze);
+class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, 20, Squeeze);
+class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, 20, Unsqueeze);
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, Concat);
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, Gather);
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, GatherElements);
@@ -1444,9 +1444,9 @@ class ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider
class ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, 20, Float8E4M3FN, MLFloat16, QuantizeLinear);
class ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, 20, Float8E5M2, MLFloat16, QuantizeLinear);
#endif
-class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, Reshape);
+class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, 20, Reshape);
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, Scan);
-class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, Shape);
+class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, 20, Shape);
// Opset 20
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 20, float, Gelu);
@@ -1505,6 +1505,11 @@ class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kO
class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, 22, Float8E4M3FN, Cast);
class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, 22, Float8E5M2, Cast);
#endif
+class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, 22, Shape);
+class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, 22, Reshape);
+class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, 22, Transpose);
+class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, 22, Squeeze);
+class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, 22, Unsqueeze);
// Opset 22.
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 22, float, HardSigmoid);
@@ -1543,6 +1548,11 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain,
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 23, Float8E4M3FN, Cast);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 23, Float8E5M2, Cast);
#endif
+class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 23, Shape);
+class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 23, Reshape);
+class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 23, Transpose);
+class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 23, Squeeze);
+class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 23, Unsqueeze);
#if !defined(DISABLE_FLOAT4_TYPES)
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 23, Float4E2M1x2, Cast);
@@ -2175,7 +2185,7 @@ static Status RegisterCudaKernels(KernelRegistry& kernel_registry) {
BuildKernelCreateInfo,
BuildKernelCreateInfo,
BuildKernelCreateInfo,
- BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, Transpose)>,
+ BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, 20, Transpose)>,
BuildKernelCreateInfo,
BuildKernelCreateInfo,
BuildKernelCreateInfo,
@@ -2186,8 +2196,8 @@ static Status RegisterCudaKernels(KernelRegistry& kernel_registry) {
BuildKernelCreateInfo,
BuildKernelCreateInfo,
BuildKernelCreateInfo,
- BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, Squeeze)>,
- BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, Unsqueeze)>,
+ BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, 20, Squeeze)>,
+ BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, 20, Unsqueeze)>,
BuildKernelCreateInfo,
BuildKernelCreateInfo,
BuildKernelCreateInfo,
@@ -2492,9 +2502,9 @@ static Status RegisterCudaKernels(KernelRegistry& kernel_registry) {
BuildKernelCreateInfo,
#endif
- BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, Reshape)>,
+ BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, 20, Reshape)>,
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, Scan)>,
- BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, Shape)>,
+ BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, 20, Shape)>,
// Opset 20
BuildKernelCreateInfo,
@@ -2552,6 +2562,12 @@ static Status RegisterCudaKernels(KernelRegistry& kernel_registry) {
BuildKernelCreateInfo,
BuildKernelCreateInfo,
#endif
+ BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, 22, Shape)>,
+ BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, 22, Reshape)>,
+ BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, 22, Transpose)>,
+ BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, 22, Squeeze)>,
+ BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, 22, Unsqueeze)>,
+
// Opset 22
BuildKernelCreateInfo,
BuildKernelCreateInfo,
@@ -2592,6 +2608,11 @@ static Status RegisterCudaKernels(KernelRegistry& kernel_registry) {
#if !defined(DISABLE_FLOAT4_TYPES)
BuildKernelCreateInfo,
#endif
+ BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 23, Shape)>,
+ BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 23, Reshape)>,
+ BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 23, Transpose)>,
+ BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 23, Squeeze)>,
+ BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 23, Unsqueeze)>,
#endif
};
diff --git a/onnxruntime/core/providers/cuda/tensor/reshape.cc b/onnxruntime/core/providers/cuda/tensor/reshape.cc
index 8ffcba9b716da..e30dae636bd5e 100644
--- a/onnxruntime/core/providers/cuda/tensor/reshape.cc
+++ b/onnxruntime/core/providers/cuda/tensor/reshape.cc
@@ -85,7 +85,31 @@ std::unique_ptr<Tensor> FuncReshape(
ONNX_OPERATOR_KERNEL_EX(
Reshape,
kOnnxDomain,
- 19,
+ 23,
+ kCudaExecutionProvider,
+ (*KernelDefBuilder::Create())
+ .TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypesIRv9())
+ .TypeConstraint("shape", DataTypeImpl::GetTensorType<int64_t>())
+ .Alias(0, 0)
+ .InputMemoryType(OrtMemTypeCPUInput, 1),
+ Reshape);
+
+ONNX_OPERATOR_VERSIONED_KERNEL_EX(
+ Reshape,
+ kOnnxDomain,
+ 21, 22,
+ kCudaExecutionProvider,
+ (*KernelDefBuilder::Create())
+ .TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypesIRv9())
+ .TypeConstraint("shape", DataTypeImpl::GetTensorType<int64_t>())
+ .Alias(0, 0)
+ .InputMemoryType(OrtMemTypeCPUInput, 1),
+ Reshape);
+
+ONNX_OPERATOR_VERSIONED_KERNEL_EX(
+ Reshape,
+ kOnnxDomain,
+ 19, 20,
kCudaExecutionProvider,
(*KernelDefBuilder::Create())
.TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypesIRv9())
diff --git a/onnxruntime/core/providers/cuda/tensor/shape_op.cc b/onnxruntime/core/providers/cuda/tensor/shape_op.cc
index 0d5da81fe256b..b1650c8d48945 100644
--- a/onnxruntime/core/providers/cuda/tensor/shape_op.cc
+++ b/onnxruntime/core/providers/cuda/tensor/shape_op.cc
@@ -44,10 +44,34 @@ ONNX_OPERATOR_VERSIONED_KERNEL_EX(
.TypeConstraint("T1", DataTypeImpl::GetTensorType<int64_t>()),
Shape);
+ONNX_OPERATOR_VERSIONED_KERNEL_EX(
+ Shape,
+ kOnnxDomain,
+ 19, 20,
+ kCudaExecutionProvider,
+ (*KernelDefBuilder::Create())
+ // properly force CPU/GPU synch inside the kernel
+ .OutputMemoryType(OrtMemTypeCPUInput, 0)
+ .TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypesIRv9())
+ .TypeConstraint("T1", DataTypeImpl::GetTensorType<int64_t>()),
+ Shape);
+
+ONNX_OPERATOR_VERSIONED_KERNEL_EX(
+ Shape,
+ kOnnxDomain,
+ 21, 22,
+ kCudaExecutionProvider,
+ (*KernelDefBuilder::Create())
+ // properly force CPU/GPU synch inside the kernel
+ .OutputMemoryType(OrtMemTypeCPUInput, 0)
+ .TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypesIRv9())
+ .TypeConstraint("T1", DataTypeImpl::GetTensorType<int64_t>()),
+ Shape);
+
ONNX_OPERATOR_KERNEL_EX(
Shape,
kOnnxDomain,
- 19,
+ 23,
kCudaExecutionProvider,
(*KernelDefBuilder::Create())
// properly force CPU/GPU synch inside the kernel
diff --git a/onnxruntime/core/providers/cuda/tensor/squeeze.cc b/onnxruntime/core/providers/cuda/tensor/squeeze.cc
index 4316700ce7060..3ccc57acf2252 100644
--- a/onnxruntime/core/providers/cuda/tensor/squeeze.cc
+++ b/onnxruntime/core/providers/cuda/tensor/squeeze.cc
@@ -28,10 +28,32 @@ ONNX_OPERATOR_VERSIONED_KERNEL_EX(
Squeeze);
// axes is input instead of attribute
+ONNX_OPERATOR_VERSIONED_KERNEL_EX(
+ Squeeze,
+ kOnnxDomain,
+ 13, 20,
+ kCudaExecutionProvider,
+ (*KernelDefBuilder::Create())
+ .Alias(0, 0)
+ .TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes())
+ .InputMemoryType(OrtMemTypeCPUInput, 1),
+ Squeeze);
+
+ONNX_OPERATOR_VERSIONED_KERNEL_EX(
+ Squeeze,
+ kOnnxDomain,
+ 21, 22,
+ kCudaExecutionProvider,
+ (*KernelDefBuilder::Create())
+ .Alias(0, 0)
+ .TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes())
+ .InputMemoryType(OrtMemTypeCPUInput, 1),
+ Squeeze);
+
ONNX_OPERATOR_KERNEL_EX(
Squeeze,
kOnnxDomain,
- 13,
+ 23,
kCudaExecutionProvider,
(*KernelDefBuilder::Create())
.Alias(0, 0)
diff --git a/onnxruntime/core/providers/cuda/tensor/transpose.cc b/onnxruntime/core/providers/cuda/tensor/transpose.cc
index 0e10ebcbef21d..753f9857556e2 100644
--- a/onnxruntime/core/providers/cuda/tensor/transpose.cc
+++ b/onnxruntime/core/providers/cuda/tensor/transpose.cc
@@ -19,10 +19,28 @@ ONNX_OPERATOR_VERSIONED_KERNEL_EX(
.TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes()),
Transpose);
+ONNX_OPERATOR_VERSIONED_KERNEL_EX(
+ Transpose,
+ kOnnxDomain,
+ 13, 20,
+ kCudaExecutionProvider,
+ (*KernelDefBuilder::Create())
+ .TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes()),
+ Transpose);
+
+ONNX_OPERATOR_VERSIONED_KERNEL_EX(
+ Transpose,
+ kOnnxDomain,
+ 21, 22,
+ kCudaExecutionProvider,
+ (*KernelDefBuilder::Create())
+ .TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes()),
+ Transpose);
+
ONNX_OPERATOR_KERNEL_EX(
Transpose,
kOnnxDomain,
- 13,
+ 23,
kCudaExecutionProvider,
(*KernelDefBuilder::Create())
.TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes()),
diff --git a/onnxruntime/core/providers/cuda/tensor/unsqueeze.cc b/onnxruntime/core/providers/cuda/tensor/unsqueeze.cc
index bc374c2f6bd5f..8d5a601aadd6e 100644
--- a/onnxruntime/core/providers/cuda/tensor/unsqueeze.cc
+++ b/onnxruntime/core/providers/cuda/tensor/unsqueeze.cc
@@ -28,10 +28,32 @@ ONNX_OPERATOR_VERSIONED_KERNEL_EX(
Unsqueeze);
// axes is input instead of attribute, support bfloat16
+ONNX_OPERATOR_VERSIONED_KERNEL_EX(
+ Unsqueeze,
+ kOnnxDomain,
+ 13, 20,
+ kCudaExecutionProvider,
+ (*KernelDefBuilder::Create())
+ .Alias(0, 0)
+ .TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes())
+ .InputMemoryType(OrtMemTypeCPUInput, 1),
+ Unsqueeze);
+
+ONNX_OPERATOR_VERSIONED_KERNEL_EX(
+ Unsqueeze,
+ kOnnxDomain,
+ 21, 22,
+ kCudaExecutionProvider,
+ (*KernelDefBuilder::Create())
+ .Alias(0, 0)
+ .TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes())
+ .InputMemoryType(OrtMemTypeCPUInput, 1),
+ Unsqueeze);
+
ONNX_OPERATOR_KERNEL_EX(
Unsqueeze,
kOnnxDomain,
- 13,
+ 23,
kCudaExecutionProvider,
(*KernelDefBuilder::Create())
.Alias(0, 0)
diff --git a/onnxruntime/test/optimizer/transpose_optimizer_test.cc b/onnxruntime/test/optimizer/transpose_optimizer_test.cc
index fbdd73617f53f..1730d51fad7bb 100644
--- a/onnxruntime/test/optimizer/transpose_optimizer_test.cc
+++ b/onnxruntime/test/optimizer/transpose_optimizer_test.cc
@@ -353,7 +353,7 @@ TEST(TransposeOptimizerTests, TestResize) {
// need the level 2 TransposeOptimizer as pushing a Transpose through a Resize requires it to be
// assigned to the CPU EP first
TransformerLevel::Level2,
- /*opset_version*/ {10, 18});
+ /*opset_version*/ {10, 18, 21, 23});
}
TEST(TransposeOptimizerTests, TestResizeOpset11) {
@@ -593,7 +593,7 @@ TEST(TransposeOptimizerTests, TestShape) {
check_optimized_graph_1,
TransformerLevel::Default,
TransformerLevel::Level1,
- /*opset_version*/ {7, 18});
+ /*opset_version*/ {7, 18, 21, 23});
}
TEST(TransposeOptimizerTests, TestShapeOpset15) {
diff --git a/onnxruntime/test/providers/cpu/tensor/shape_op_test.cc b/onnxruntime/test/providers/cpu/tensor/shape_op_test.cc
index 9fa3b6df7dc16..413aea021032a 100644
--- a/onnxruntime/test/providers/cpu/tensor/shape_op_test.cc
+++ b/onnxruntime/test/providers/cpu/tensor/shape_op_test.cc
@@ -32,6 +32,27 @@ TEST(ShapeOpTest, ShapeOpset15_Default) {
test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); // TensorRT parser: unsupported data types
}
+TEST(ShapeOpTest, ShapeOpset19_Default) {
+ OpTester test("Shape", 19);
+ test.AddInput<float>("data", {1, 2, 2}, {1, 2, 3, 4});
+ test.AddOutput<int64_t>("output", {3}, {1, 2, 2});
+ test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); // TensorRT parser: unsupported data types
+}
+
+TEST(ShapeOpTest, ShapeOpset21_Default) {
+ OpTester test("Shape", 21);
+ test.AddInput<float>("data", {1, 2, 2}, {1, 2, 3, 4});
+ test.AddOutput<int64_t>("output", {3}, {1, 2, 2});
+ test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); // TensorRT parser: unsupported data types
+}
+
+TEST(ShapeOpTest, ShapeOpset23_Default) {
+ OpTester test("Shape", 23);
+ test.AddInput<float>("data", {1, 2, 2}, {1, 2, 3, 4});
+ test.AddOutput<int64_t>("output", {3}, {1, 2, 2});
+ test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); // TensorRT parser: unsupported data types
+}
+
TEST(ShapeOpTest, ShapeOpset15_StartOnly) {
OpTester test("Shape", 15);
test.AddAttribute("start", 1);
diff --git a/onnxruntime/test/providers/cpu/tensor/squeeze_op_test.cc b/onnxruntime/test/providers/cpu/tensor/squeeze_op_test.cc
index cdd3491731720..ab09283cd6122 100644
--- a/onnxruntime/test/providers/cpu/tensor/squeeze_op_test.cc
+++ b/onnxruntime/test/providers/cpu/tensor/squeeze_op_test.cc
@@ -20,6 +20,22 @@ TEST(SqueezeOpTest, Squeeze_1) {
test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kOpenVINOExecutionProvider}); // Incorrect precision. Will be re-enabled after it's fixed
}
+TEST(SqueezeOpTest, Squeeze_21) {
+ OpTester test("Squeeze", 21);
+ test.AddInput<float>("data", {1, 3, 4, 5}, std::vector<float>(60, 1.0f));
+ test.AddInput<int64_t>("axes", {1}, std::vector<int64_t>{0});
+ test.AddOutput<float>("squeezed", {3, 4, 5}, std::vector<float>(60, 1.0f));
+ test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kOpenVINOExecutionProvider}); // Incorrect precision. Will be re-enabled after it's fixed
+}
+
+TEST(SqueezeOpTest, Squeeze_23) {
+ OpTester test("Squeeze", 23);
+ test.AddInput<float>("data", {1, 3, 4, 5}, std::vector<float>(60, 1.0f));
+ test.AddInput<int64_t>("axes", {1}, std::vector<int64_t>{0});
+ test.AddOutput<float>("squeezed", {3, 4, 5}, std::vector<float>(60, 1.0f));
+ test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kOpenVINOExecutionProvider}); // Incorrect precision. Will be re-enabled after it's fixed
+}
+
TEST(SqueezeOpTest, Squeeze_Empty_Axes_1) {
OpTester test("Squeeze");
test.AddInput("data", {1, 1, 4, 1}, std::vector(4, 1.0f));
diff --git a/onnxruntime/test/providers/cpu/tensor/unsqueeze_op_test.cc b/onnxruntime/test/providers/cpu/tensor/unsqueeze_op_test.cc
index d1910c89f76b7..e086ac0b29e14 100644
--- a/onnxruntime/test/providers/cpu/tensor/unsqueeze_op_test.cc
+++ b/onnxruntime/test/providers/cpu/tensor/unsqueeze_op_test.cc
@@ -47,6 +47,22 @@ TEST(UnsqueezeOpTest, Unsqueeze_3) {
test.Run();
}
+TEST(UnsqueezeOpTest, Unsqueeze_21) {
+ OpTester test("Unsqueeze", 21);
+ test.AddInput<float>("input", {}, std::vector<float>{1.0f});
+ test.AddInput<int64_t>("axes", {1}, std::vector<int64_t>{0}, true);
+ test.AddOutput<float>("output", {1}, std::vector<float>{1.0f});
+ test.Run();
+}
+
+TEST(UnsqueezeOpTest, Unsqueeze_23) {
+ OpTester test("Unsqueeze", 23);
+ test.AddInput<float>("input", {}, std::vector<float>{1.0f});
+ test.AddInput<int64_t>("axes", {1}, std::vector<int64_t>{0}, true);
+ test.AddOutput<float>("output", {1}, std::vector<float>{1.0f});
+ test.Run();
+}
+
TEST(UnsqueezeOpTest, Unsqueeze_scalar) {
{
OpTester test("Unsqueeze");