diff --git a/docs/OperatorKernels.md b/docs/OperatorKernels.md index 507c6722bc349..67e0c9f8620d8 100644 --- a/docs/OperatorKernels.md +++ b/docs/OperatorKernels.md @@ -697,7 +697,8 @@ Do not modify directly.* |DepthToSpace|*in* input:**T**
*out* output:**T**|13+|**T** = tensor(double), tensor(float), tensor(float16)| |||[11, 12]|**T** = tensor(double), tensor(float), tensor(float16)| |||[1, 10]|**T** = tensor(double), tensor(float), tensor(float16)| -|DequantizeLinear|*in* x:**T**
*in* x_scale:**tensor(float)**
*in* x_zero_point:**T**
*out* y:**tensor(float)**

or

*in* x:**T1**
*in* x_scale:**T2**
*in* x_zero_point:**T1**
*out* y:**T2**

or

*in* x:**T1**
*in* x_scale:**T2**
*in* x_zero_point:**T1**
*out* y:**T3**|21+|**T1** = tensor(float8e4m3fn), tensor(float8e5m2), tensor(int4), tensor(int8), tensor(uint4), tensor(uint8)
**T2** = tensor(float), tensor(float16)| +|DequantizeLinear|*in* x:**T**
*in* x_scale:**tensor(float)**
*in* x_zero_point:**T**
*out* y:**tensor(float)**

or

*in* x:**T1**
*in* x_scale:**T2**
*in* x_zero_point:**T1**
*out* y:**T2**

or

*in* x:**T1**
*in* x_scale:**T2**
*in* x_zero_point:**T1**
*out* y:**T3**|25+|**T1** = tensor(float8e4m3fn), tensor(float8e5m2), tensor(int4), tensor(int8), tensor(uint4), tensor(uint8)
**T2** = tensor(float), tensor(float16)| +|||[21, 24]|**T1** = tensor(float8e4m3fn), tensor(float8e5m2), tensor(int4), tensor(int8), tensor(uint4), tensor(uint8)
**T2** = tensor(float), tensor(float16)| |||[19, 20]|**T1** = tensor(float8e4m3fn), tensor(float8e5m2), tensor(int8), tensor(uint8)
**T2** = tensor(float), tensor(float16)| |||[13, 18]|**T** = tensor(int8), tensor(uint8)| |||[10, 12]|**T** = tensor(int8), tensor(uint8)| diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc index 60ac16018f539..29071e8230f5c 100755 --- a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc +++ b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc @@ -1521,19 +1521,19 @@ class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kO // Opset 21. // TODO(fajin): support other quantized types -class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, uint8_t, float, DequantizeLinear); -class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, int8_t, float, DequantizeLinear); -class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, uint8_t, MLFloat16, DequantizeLinear); -class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, int8_t, MLFloat16, DequantizeLinear); -class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, UInt4x2, float, DequantizeLinear); -class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, Int4x2, float, DequantizeLinear); -class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, UInt4x2, MLFloat16, DequantizeLinear); -class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, Int4x2, MLFloat16, DequantizeLinear); +class ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, 24, uint8_t, float, DequantizeLinear); +class ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, 24, int8_t, float, DequantizeLinear); +class ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, 24, uint8_t, MLFloat16, DequantizeLinear); +class ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, 24, int8_t, MLFloat16, DequantizeLinear); +class ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, 24, UInt4x2, float, DequantizeLinear); +class ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, 24, Int4x2, float, DequantizeLinear); +class ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, 24, UInt4x2, MLFloat16, DequantizeLinear); +class ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, 24, Int4x2, MLFloat16, DequantizeLinear); #if !defined(DISABLE_FLOAT8_TYPES) -class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, Float8E4M3FN, float, DequantizeLinear); -class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, Float8E5M2, float, DequantizeLinear); -class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, Float8E4M3FN, MLFloat16, DequantizeLinear); -class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, Float8E5M2, MLFloat16, DequantizeLinear); +class ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, 24, Float8E4M3FN, float, DequantizeLinear); +class ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, 24, Float8E5M2, float, DequantizeLinear); +class ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, 24, Float8E4M3FN, MLFloat16, DequantizeLinear); +class ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, 24, Float8E5M2, MLFloat16, DequantizeLinear); #endif class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, uint8_t, float, QuantizeLinear); @@ -1639,6 +1639,22 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 24, MLFloat16, Attention); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 24, BFloat16, Attention); +// Opset 25. +class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 25, uint8_t, float, DequantizeLinear); +class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 25, int8_t, float, DequantizeLinear); +class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 25, uint8_t, MLFloat16, DequantizeLinear); +class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 25, int8_t, MLFloat16, DequantizeLinear); +class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 25, UInt4x2, float, DequantizeLinear); +class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 25, Int4x2, float, DequantizeLinear); +class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 25, UInt4x2, MLFloat16, DequantizeLinear); +class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 25, Int4x2, MLFloat16, DequantizeLinear); +#if !defined(DISABLE_FLOAT8_TYPES) +class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 25, Float8E4M3FN, float, DequantizeLinear); +class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 25, Float8E5M2, float, DequantizeLinear); +class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 25, Float8E4M3FN, MLFloat16, DequantizeLinear); +class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 25, Float8E5M2, MLFloat16, DequantizeLinear); +#endif + #endif template <> @@ -2606,19 +2622,19 @@ static Status RegisterCudaKernels(KernelRegistry& kernel_registry) { // Opset 21 // TODO(fajin): support other quantized types - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, #if !defined(DISABLE_FLOAT8_TYPES) - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, #endif BuildKernelCreateInfo, BuildKernelCreateInfo, @@ -2721,6 +2737,22 @@ static Status RegisterCudaKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, + + // Opset 25 + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, +#if !defined(DISABLE_FLOAT8_TYPES) + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, +#endif #endif }; diff --git a/onnxruntime/core/providers/cuda/tensor/quantize_linear.cc b/onnxruntime/core/providers/cuda/tensor/quantize_linear.cc index 6a5dbc433fb1e..3c0583057fa29 100644 --- a/onnxruntime/core/providers/cuda/tensor/quantize_linear.cc +++ b/onnxruntime/core/providers/cuda/tensor/quantize_linear.cc @@ -590,11 +590,38 @@ REGISTER_DQ_KERNEL_TWO_TYPED_19_20(Float8E4M3FN, MLFloat16) REGISTER_DQ_KERNEL_TWO_TYPED_19_20(Float8E5M2, MLFloat16) #endif -#define REGISTER_DQ_KERNEL_TWO_TYPED_21(T, U) \ +#define REGISTER_DQ_KERNEL_TWO_TYPED_21_24(T, U) \ + ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_EX( \ + DequantizeLinear, \ + kOnnxDomain, \ + 21, 24, \ + T, U, \ + kCudaExecutionProvider, \ + (*KernelDefBuilder::Create()) \ + .TypeConstraint("T1", DataTypeImpl::GetTensorType()) \ + .TypeConstraint("T2", DataTypeImpl::GetTensorType()), \ + DequantizeLinear); + +REGISTER_DQ_KERNEL_TWO_TYPED_21_24(uint8_t, float) +REGISTER_DQ_KERNEL_TWO_TYPED_21_24(int8_t, float) +REGISTER_DQ_KERNEL_TWO_TYPED_21_24(uint8_t, MLFloat16) +REGISTER_DQ_KERNEL_TWO_TYPED_21_24(int8_t, MLFloat16) +REGISTER_DQ_KERNEL_TWO_TYPED_21_24(UInt4x2, float) +REGISTER_DQ_KERNEL_TWO_TYPED_21_24(Int4x2, float) +REGISTER_DQ_KERNEL_TWO_TYPED_21_24(UInt4x2, MLFloat16) +REGISTER_DQ_KERNEL_TWO_TYPED_21_24(Int4x2, MLFloat16) +#if !defined(DISABLE_FLOAT8_TYPES) +REGISTER_DQ_KERNEL_TWO_TYPED_21_24(Float8E4M3FN, float) +REGISTER_DQ_KERNEL_TWO_TYPED_21_24(Float8E5M2, float) +REGISTER_DQ_KERNEL_TWO_TYPED_21_24(Float8E4M3FN, MLFloat16) +REGISTER_DQ_KERNEL_TWO_TYPED_21_24(Float8E5M2, MLFloat16) +#endif + +#define REGISTER_DQ_KERNEL_TWO_TYPED_25(T, U) \ ONNX_OPERATOR_TWO_TYPED_KERNEL_EX( \ DequantizeLinear, \ kOnnxDomain, \ - 21, \ + 25, \ T, U, \ kCudaExecutionProvider, \ (*KernelDefBuilder::Create()) \ @@ -602,19 +629,19 @@ REGISTER_DQ_KERNEL_TWO_TYPED_19_20(Float8E5M2, MLFloat16) .TypeConstraint("T2", DataTypeImpl::GetTensorType()), \ DequantizeLinear); -REGISTER_DQ_KERNEL_TWO_TYPED_21(uint8_t, float) -REGISTER_DQ_KERNEL_TWO_TYPED_21(int8_t, float) -REGISTER_DQ_KERNEL_TWO_TYPED_21(uint8_t, MLFloat16) -REGISTER_DQ_KERNEL_TWO_TYPED_21(int8_t, MLFloat16) -REGISTER_DQ_KERNEL_TWO_TYPED_21(UInt4x2, float) -REGISTER_DQ_KERNEL_TWO_TYPED_21(Int4x2, float) -REGISTER_DQ_KERNEL_TWO_TYPED_21(UInt4x2, MLFloat16) -REGISTER_DQ_KERNEL_TWO_TYPED_21(Int4x2, MLFloat16) +REGISTER_DQ_KERNEL_TWO_TYPED_25(uint8_t, float) +REGISTER_DQ_KERNEL_TWO_TYPED_25(int8_t, float) +REGISTER_DQ_KERNEL_TWO_TYPED_25(uint8_t, MLFloat16) +REGISTER_DQ_KERNEL_TWO_TYPED_25(int8_t, MLFloat16) +REGISTER_DQ_KERNEL_TWO_TYPED_25(UInt4x2, float) +REGISTER_DQ_KERNEL_TWO_TYPED_25(Int4x2, float) +REGISTER_DQ_KERNEL_TWO_TYPED_25(UInt4x2, MLFloat16) +REGISTER_DQ_KERNEL_TWO_TYPED_25(Int4x2, MLFloat16) #if !defined(DISABLE_FLOAT8_TYPES) -REGISTER_DQ_KERNEL_TWO_TYPED_21(Float8E4M3FN, float) -REGISTER_DQ_KERNEL_TWO_TYPED_21(Float8E5M2, float) -REGISTER_DQ_KERNEL_TWO_TYPED_21(Float8E4M3FN, MLFloat16) -REGISTER_DQ_KERNEL_TWO_TYPED_21(Float8E5M2, MLFloat16) +REGISTER_DQ_KERNEL_TWO_TYPED_25(Float8E4M3FN, float) +REGISTER_DQ_KERNEL_TWO_TYPED_25(Float8E5M2, float) +REGISTER_DQ_KERNEL_TWO_TYPED_25(Float8E4M3FN, MLFloat16) +REGISTER_DQ_KERNEL_TWO_TYPED_25(Float8E5M2, MLFloat16) #endif // specialize QuantizeLinear::ComputeInternal and DequantizeLinear::ComputeInternal