diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index 1b230f9557984..f81a268d38dff 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -78,6 +78,7 @@ option(onnxruntime_USE_CUDA "Build with CUDA support" OFF) # use. If you hit any problem with that, please do not report it to GTest. Turn OFF the following build option instead. cmake_dependent_option(onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS "Build with CUDA unit tests" OFF "onnxruntime_USE_CUDA;onnxruntime_BUILD_UNIT_TESTS;LINUX" OFF) +option(onnxruntime_USE_CUDA_NHWC_OPS "Build CUDA with NHWC op support" OFF) option(onnxruntime_ENABLE_CUDA_LINE_NUMBER_INFO "When building with CUDA support, generate device code line number information." OFF) option(onnxruntime_USE_OPENVINO "Build with OpenVINO support" OFF) option(onnxruntime_USE_COREML "Build with CoreML support" OFF) @@ -671,6 +672,9 @@ set(ORT_PROVIDER_FLAGS) set(ORT_PROVIDER_CMAKE_FLAGS) if (onnxruntime_USE_CUDA) + if (onnxruntime_USE_CUDA_NHWC_OPS) + add_compile_definitions(ENABLE_CUDA_NHWC_OPS) + endif() enable_language(CUDA) message( STATUS "CMAKE_CUDA_COMPILER_VERSION: ${CMAKE_CUDA_COMPILER_VERSION}") diff --git a/cmake/onnxruntime_unittests.cmake b/cmake/onnxruntime_unittests.cmake index ac9770d7cedf8..f5f98066675fb 100644 --- a/cmake/onnxruntime_unittests.cmake +++ b/cmake/onnxruntime_unittests.cmake @@ -374,6 +374,13 @@ if (onnxruntime_USE_CUDA AND NOT onnxruntime_MINIMAL_BUILD AND NOT onnxruntime_R "${TEST_SRC_DIR}/providers/cuda/*" ) list(APPEND onnxruntime_test_providers_src ${onnxruntime_test_providers_cuda_src}) + + if (onnxruntime_USE_CUDA_NHWC_OPS) + file(GLOB onnxruntime_test_providers_cuda_nhwc_src CONFIGURE_DEPENDS + "${TEST_SRC_DIR}/providers/cuda/nhwc/*.cc" + ) + list(APPEND onnxruntime_test_providers_src ${onnxruntime_test_providers_cuda_nhwc_src}) + endif() endif() if (onnxruntime_USE_CANN) @@ -851,7 +858,7 @@ if (HAS_SHORTEN_64_TO_32 AND NOT CMAKE_SIZEOF_VOID_P EQUAL 8) endif() if (UNIX AND onnxruntime_USE_TENSORRT) - # The test_main.cc includes NvInfer.h where it has many deprecated declarations + # The test_main.cc includes NvInfer.h where it has many deprecated declarations # simply ignore them for TensorRT EP build set_property(TARGET onnxruntime_test_all APPEND_STRING PROPERTY COMPILE_FLAGS "-Wno-deprecated-declarations") endif() @@ -1294,7 +1301,7 @@ if (NOT onnxruntime_ENABLE_TRAINING_TORCH_INTEROP) endif() if (UNIX AND onnxruntime_USE_TENSORRT) - # The test_main.cc includes NvInfer.h where it has many deprecated declarations + # The test_main.cc includes NvInfer.h where it has many deprecated declarations # simply ignore them for TensorRT EP build set_property(TARGET onnxruntime_shared_lib_test APPEND_STRING PROPERTY COMPILE_FLAGS "-Wno-deprecated-declarations") endif() @@ -1583,7 +1590,7 @@ if (NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten") endif() if (UNIX AND onnxruntime_USE_TENSORRT) - # The test_main.cc includes NvInfer.h where it has many deprecated declarations + # The test_main.cc includes NvInfer.h where it has many deprecated declarations # simply ignore them for TensorRT EP build set_property(TARGET onnxruntime_customopregistration_test APPEND_STRING PROPERTY COMPILE_FLAGS "-Wno-deprecated-declarations") endif() diff --git a/include/onnxruntime/core/providers/cuda/cuda_provider_options.h b/include/onnxruntime/core/providers/cuda/cuda_provider_options.h index 5f266dd14d36d..82bb8ba83be4a 100644 --- a/include/onnxruntime/core/providers/cuda/cuda_provider_options.h +++ 
b/include/onnxruntime/core/providers/cuda/cuda_provider_options.h @@ -1,8 +1,11 @@ // Copyright (c) Microsoft Corporation. All rights reserved. +// Copyright (c) 2023 NVIDIA Corporation. // Licensed under the MIT License. #pragma once +#include + #include "onnxruntime_c_api.h" #include "core/framework/arena_extend_strategy.h" @@ -32,5 +35,6 @@ struct OrtCUDAProviderOptionsV2 { int tunable_op_max_tuning_duration_ms = 0; // Max tuning duration time limit for TunableOp. int enable_skip_layer_norm_strict_mode = 0; // flag specifying if SkipLayerNorm is in strict mode. If true, use LayerNormalization kernel. // The strict mode has better accuracy but lower performance. + int prefer_nhwc = 0; // make the CUDA EP NHWC preferred int use_ep_level_unified_stream = 0; // flag specifying if ep level stream is used or not }; diff --git a/onnxruntime/contrib_ops/cuda/conv_transpose_with_dynamic_pads.h b/onnxruntime/contrib_ops/cuda/conv_transpose_with_dynamic_pads.h index 6f7a04d059034..a768b2a7d8a24 100644 --- a/onnxruntime/contrib_ops/cuda/conv_transpose_with_dynamic_pads.h +++ b/onnxruntime/contrib_ops/cuda/conv_transpose_with_dynamic_pads.h @@ -1,4 +1,5 @@ // Copyright (c) Microsoft Corporation. All rights reserved. +// Copyright (c) 2023 NVIDIA Corporation. // Licensed under the MIT License. #pragma once @@ -10,12 +11,12 @@ namespace contrib { namespace cuda { template -class ConvTransposeWithDynamicPads : public ::onnxruntime::cuda::ConvTranspose { +class ConvTransposeWithDynamicPads : public ::onnxruntime::cuda::ConvTranspose { public: - ConvTransposeWithDynamicPads(const OpKernelInfo& info) : ::onnxruntime::cuda::ConvTranspose(info) {} + ConvTransposeWithDynamicPads(const OpKernelInfo& info) : ::onnxruntime::cuda::ConvTranspose(info) {} Status ComputeInternal(OpKernelContext* context) const override { - return ::onnxruntime::cuda::ConvTranspose::DoConvTranspose(context, true); + return ::onnxruntime::cuda::ConvTranspose::DoConvTranspose(context, true); } }; } // namespace cuda diff --git a/onnxruntime/core/optimizer/layout_transformation/layout_transformation.cc b/onnxruntime/core/optimizer/layout_transformation/layout_transformation.cc index 6c91949e467ae..290380cabb036 100644 --- a/onnxruntime/core/optimizer/layout_transformation/layout_transformation.cc +++ b/onnxruntime/core/optimizer/layout_transformation/layout_transformation.cc @@ -30,6 +30,23 @@ CostCheckResult PostLayoutTransformCostCheck(const api::GraphRef& graph, const a return OrtEPCostCheck(graph, node, perm, outputs_leading_to_transpose); } +#if defined(USE_CUDA) && ENABLE_CUDA_NHWC_OPS +const std::unordered_set& GetCUDALayoutSensitiveOps() { + static std::unordered_set cuda_nhwc_ops = []() { + return std::unordered_set{ + "BatchNormalization", + "Conv", + "ConvTranspose", + "GlobalMaxPool", + "MaxPool", + "GlobalAveragePool", + "AveragePool", + }; + }(); + return cuda_nhwc_ops; +} +#endif + /// /// Default function for checking if a node should have its layout changed. Allows EP specific adjustments to the /// default set of layout sensitive operators if required. 
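[Editor's note] For reference, a minimal sketch of how an application could opt into the new layout through the prefer_nhwc field added above, assuming a build configured with onnxruntime_USE_CUDA_NHWC_OPS=ON and the public C++ wrapper API; the function name is illustrative and the include path is the one used inside the ORT tree:

// Sketch only: enable the NHWC-preferring CUDA EP for a session.
// Assumes onnxruntime_USE_CUDA_NHWC_OPS=ON at build time; otherwise the EP
// constructor rejects prefer_nhwc (see the ORT_ENFORCE added further down).
#include <onnxruntime_cxx_api.h>
#include "core/providers/cuda/cuda_provider_options.h"

Ort::SessionOptions MakeNhwcCudaSessionOptions() {
  OrtCUDAProviderOptionsV2 cuda_options{};
  cuda_options.device_id = 0;
  cuda_options.prefer_nhwc = 1;  // new option introduced by this change
  Ort::SessionOptions session_options;
  session_options.AppendExecutionProvider_CUDA_V2(cuda_options);
  return session_options;
}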
@@ -71,11 +88,16 @@ bool ConvertNodeLayout(const api::NodeRef& node) { } #endif - // #if defined(USE_CUDA) - // if (node.GetExecutionProviderType() == kCudaExecutionProvider) { - // Update as per https://github.com/microsoft/onnxruntime/pull/17200 with CUDA ops that support NHWC - // } - // #endif +#if defined(USE_CUDA) && ENABLE_CUDA_NHWC_OPS + if (node.GetExecutionProviderType() == kCudaExecutionProvider) { + if (layout_sensitive_ops.count(node.OpType())) { + const auto& cuda_nhwc_ops = GetCUDALayoutSensitiveOps(); + if (!cuda_nhwc_ops.count(node.OpType())) { + return false; + } + } + } +#endif return layout_sensitive_ops.count(node.OpType()) != 0; } diff --git a/onnxruntime/core/providers/cpu/nn/batch_norm_helper.h b/onnxruntime/core/providers/cpu/nn/batch_norm_helper.h index 8507d87fd2442..a5d46aff83b50 100644 --- a/onnxruntime/core/providers/cpu/nn/batch_norm_helper.h +++ b/onnxruntime/core/providers/cpu/nn/batch_norm_helper.h @@ -1,4 +1,5 @@ // Copyright (c) Microsoft Corporation. All rights reserved. +// Copyright (c) 2023 NVIDIA Corporation. // Licensed under the MIT License. #pragma once @@ -22,11 +23,17 @@ class BatchNormHelper { const Tensor* B, const Tensor* mean, const Tensor* var, - bool is_spatial = true) { + bool is_spatial = true, + bool is_nhwc = false) { const auto& x_dims = X->Shape().GetDims(); // If x_dims size < 2, num_channels defaults to 1. - int64_t num_channels = x_dims.size() > 1 ? x_dims[1] : 1; + int64_t num_channels; + if (is_nhwc) { + num_channels = x_dims.size() > 1 ? x_dims[x_dims.size() - 1] : 1; + } else { + num_channels = x_dims.size() > 1 ? x_dims[1] : 1; + } // the first 2 are respectively - N and C. int num_feature_dims = x_dims.size() > 1 ? static_cast(x_dims.size() - 2) : 0; @@ -109,7 +116,7 @@ class BatchNormHelper { return common::Status::OK(); } - static void NormalizeDims(const TensorShape& x_shape, std::vector& new_dims) { + static void NormalizeDims(const TensorShape& x_shape, std::vector& new_dims, bool is_nhwc = false) { new_dims.clear(); auto orig_dims = x_shape.GetDims(); ORT_ENFORCE(orig_dims.size() < 6, @@ -122,13 +129,19 @@ class BatchNormHelper { auto rank = x_shape.NumDimensions(); auto num_samples = rank > 0 ? orig_dims[0] : 1; // NCHW - auto num_channels = rank > 1 ? orig_dims[1] : 1; - auto height = rank > 2 ? orig_dims[2] : 1; + const size_t channel_dim = is_nhwc ? rank - 1 : 1; + const size_t height_dim = is_nhwc ? 1 : 2; + auto num_channels = rank > 1 ? orig_dims[channel_dim] : 1; + auto height = rank > 2 ? orig_dims[height_dim] : 1; int64_t width = 1; - new_dims = {num_samples, num_channels, height, width}; + if (is_nhwc) { + new_dims = {num_samples, height, width, num_channels}; + } else { + new_dims = {num_samples, num_channels, height, width}; + } } }; } // namespace onnxruntime #if defined(_MSC_VER) && !defined(__clang__) #pragma warning(pop) -#endif \ No newline at end of file +#endif diff --git a/onnxruntime/core/providers/cpu/nn/conv_transpose_attributes.h b/onnxruntime/core/providers/cpu/nn/conv_transpose_attributes.h index a4d67ec63f0c2..4b3b934834ac8 100644 --- a/onnxruntime/core/providers/cpu/nn/conv_transpose_attributes.h +++ b/onnxruntime/core/providers/cpu/nn/conv_transpose_attributes.h @@ -14,6 +14,7 @@ * limitations under the License. */ /* Modifications Copyright (c) Microsoft. */ +// Copyright (c) 2023 NVIDIA Corporation. 
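[Editor's note] A distilled restatement of the indexing rule the is_nhwc branches above introduce (illustrative code, not part of ORT): channels sit at dim 1 in NCHW and at the last dim in NHWC, and the normalized 4-D shape becomes {N, H, W, C} rather than {N, C, H, W}.

// Illustrative only: channel-dim selection mirroring the is_nhwc logic above.
#include <cstddef>
#include <cstdint>
#include <vector>

inline size_t ChannelDim(size_t rank, bool is_nhwc) {
  return is_nhwc ? rank - 1 : 1;
}

int main() {
  const std::vector<int64_t> nchw{2, 3, 8, 8};  // {N, C, H, W}
  const std::vector<int64_t> nhwc{2, 8, 8, 3};  // {N, H, W, C}
  // Both describe the same tensor; each lookup yields 3 channels.
  return (nchw[ChannelDim(nchw.size(), false)] == 3 &&
          nhwc[ChannelDim(nhwc.size(), true)] == 3)
             ? 0
             : 1;
}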
#pragma once @@ -44,17 +45,19 @@ struct ConvTransposeAttributes : public ConvAttributes { }; Status PrepareForCompute(OpKernelContext* context, bool has_bias, Prepare& p, - bool dynamic_padding = false, const TensorShape* filter_shape = nullptr) const { + bool dynamic_padding = false, const TensorShape* filter_shape = nullptr, + bool is_nhwc = false) const { const Tensor* X = context->Input(0); const Tensor* F = (filter_shape != nullptr) ? nullptr : context->Input(1); const TensorShape& F_Shape = (filter_shape != nullptr) ? *filter_shape : F->Shape(); const Tensor* Pads = dynamic_padding ? context->Input(2) : nullptr; const Tensor* B = has_bias ? (dynamic_padding ? context->Input(3) : context->Input(2)) : nullptr; - TensorShape input_shape = X->Shape().Slice(2); - const int64_t num_input_channels = X->Shape()[1]; + const int rank = static_cast(X->Shape().NumDimensions()); + TensorShape input_shape = X->Shape().Slice(is_nhwc ? 1 : 2, is_nhwc ? rank - 1 : rank); + const int64_t num_input_channels = is_nhwc ? X->Shape()[rank - 1] : X->Shape()[1]; const int64_t N = X->Shape()[0]; - const int64_t num_output_channels_multiplier = F_Shape[1]; + const int64_t num_output_channels_multiplier = is_nhwc ? F_Shape[3] : F_Shape[1]; const int64_t num_output_channels = num_output_channels_multiplier * group; // input validations @@ -85,7 +88,7 @@ struct ConvTransposeAttributes : public ConvAttributes { } TensorShapeVector kernel_shape; - ORT_RETURN_IF_ERROR(ComputeKernelShape(F_Shape, kernel_shape)); + ORT_RETURN_IF_ERROR(ComputeKernelShape(F_Shape, kernel_shape, is_nhwc)); TensorShapeVector local_output_padding(output_padding); if (local_output_padding.empty()) { @@ -115,7 +118,7 @@ struct ConvTransposeAttributes : public ConvAttributes { TensorShapeVector Y_dims; ComputePadsAndOutputShape(input_shape, num_output_channels, kernel_shape, - local_strides, local_dilations, local_output_padding, N, &local_pads, &Y_dims); + local_strides, local_dilations, local_output_padding, N, &local_pads, &Y_dims, is_nhwc); TensorShape Yshape(Y_dims); Tensor* Y = context->Output(0, Yshape); @@ -137,9 +140,14 @@ struct ConvTransposeAttributes : public ConvAttributes { void ComputePadsAndOutputShape(TensorShape input_shape, int64_t output_channel, const TensorShapeVector& kernel_shape, const TensorShapeVector& p_strides, const TensorShapeVector& p_dilations, const TensorShapeVector& p_output_padding, const int64_t N, - ConvPadVector* p_pads, TensorShapeVector* output_shape_p) const { + ConvPadVector* p_pads, TensorShapeVector* output_shape_p, + bool is_nhwc = false) const { size_t output_shape_size = output_shape.size(); - output_shape_p->insert(output_shape_p->begin(), {N, output_channel}); + if (is_nhwc) { + output_shape_p->insert(output_shape_p->begin(), {N}); + } else { + output_shape_p->insert(output_shape_p->begin(), {N, output_channel}); + } size_t rank = input_shape.NumDimensions(); for (size_t dim = 0; dim < rank; ++dim) { @@ -163,6 +171,9 @@ struct ConvTransposeAttributes : public ConvAttributes { ORT_ENFORCE(dim_size > 0, "Invalid input shape: ", input_shape.ToString()); output_shape_p->push_back(dim_size); } + if (is_nhwc) { + output_shape_p->push_back(output_channel); + } } TensorShapeVector output_padding; diff --git a/onnxruntime/core/providers/cpu/nn/instance_norm_helper.h b/onnxruntime/core/providers/cpu/nn/instance_norm_helper.h index 48e54ac7eeefb..9a2a710fd291a 100644 --- a/onnxruntime/core/providers/cpu/nn/instance_norm_helper.h +++ b/onnxruntime/core/providers/cpu/nn/instance_norm_helper.h @@ -1,4 +1,5 
@@ // Copyright (c) Microsoft Corporation. All rights reserved. +// Copyright (c) 2023 NVIDIA Corporation. // Licensed under the MIT License. #pragma once @@ -8,13 +9,16 @@ #include "core/framework/tensor.h" #endif #include +#include namespace onnxruntime { class InstanceNormHelper { public: - static common::Status ValidateInputs(const Tensor* input, const Tensor* scale, const Tensor* B) { - if (input->Shape().NumDimensions() < 3) { + static common::Status ValidateInputs(const Tensor* input, const Tensor* scale, const Tensor* B, + bool is_nhwc = false) { + const auto rank = input->Shape().NumDimensions(); + if (rank < 3) { std::ostringstream ostr; ostr << "Invalid input data: number of dimensions is less than 3: " << input->Shape().NumDimensions(); return common::Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT, ostr.str()); @@ -24,10 +28,13 @@ class InstanceNormHelper { ostr << "Invalid input scale: number of dimensions is not 1: " << scale->Shape().NumDimensions(); return common::Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT, ostr.str()); } - if (scale->Shape().Size() != input->Shape().GetDims()[1]) { + auto in_dims = input->Shape().GetDims(); + auto in_channels = is_nhwc ? in_dims[rank - 1] : in_dims[1]; + + if (scale->Shape().Size() != in_channels) { std::ostringstream ostr; - ostr << "Mismatch between input data and scale: size of scale != input channel count " - << scale->Shape().Size() << " vs. " << input->Shape().GetDims()[1]; + ostr << "Mismatch between input data and scale: size of scale != input channel count " << scale->Shape().Size() + << " vs. " << in_channels; return common::Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT, ostr.str()); } @@ -37,10 +44,10 @@ class InstanceNormHelper { return common::Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT, ostr.str()); } - if (B->Shape().Size() != input->Shape().GetDims()[1]) { + if (B->Shape().Size() != in_channels) { std::ostringstream ostr; - ostr << "Mismatch between input data and B: size of B != input channel count " - << B->Shape().Size() << " vs. " << input->Shape().GetDims()[1]; + ostr << "Mismatch between input data and B: size of B != input channel count " << B->Shape().Size() << " vs. " + << in_channels; return common::Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT, ostr.str()); } diff --git a/onnxruntime/core/providers/cpu/nn/pool_attributes.h b/onnxruntime/core/providers/cpu/nn/pool_attributes.h index 54f41f09f4b24..118cb4a3ba4bd 100644 --- a/onnxruntime/core/providers/cpu/nn/pool_attributes.h +++ b/onnxruntime/core/providers/cpu/nn/pool_attributes.h @@ -1,4 +1,5 @@ // Copyright (c) Microsoft Corporation. All rights reserved. +// Copyright (c) 2023 NVIDIA Corporation. // Licensed under the MIT License. #pragma once @@ -98,28 +99,34 @@ struct PoolAttributes { TensorShapeVector SetOutputSize(const TensorShape& input_shape, int64_t output_channel, - TensorShapeVector* actual_pads) const { + TensorShapeVector* actual_pads, + bool is_nhwc = false) const { ORT_ENFORCE(input_shape.Size() > 0 || input_shape[0] == 0, "Invalid input shape. Only N can be zero. 
Got:", input_shape); TensorShapeVector output_dims; int64_t N = input_shape[0]; - InferOutputSize(input_shape.GetDims(), &output_dims, actual_pads); - - output_dims.insert(output_dims.begin(), {N, output_channel}); - + InferOutputSize(input_shape.GetDims(), &output_dims, actual_pads, is_nhwc); + if (is_nhwc) { + output_dims.insert(output_dims.begin(), N); + output_dims.push_back(output_channel); + } else { + output_dims.insert(output_dims.begin(), {N, output_channel}); + } return output_dims; } void InferOutputSize(gsl::span input_dims, TensorShapeVector* output_dims, - TensorShapeVector* actual_pads) const { + TensorShapeVector* actual_pads, + bool is_nhwc = false) const { ORT_ENFORCE(input_dims.size() >= 2); if (global_pooling) { output_dims->assign(input_dims.size() - 2, 1); } else { for (size_t dim = 0; dim < input_dims.size() - 2; ++dim) { int64_t dim_size = 0; - ComputeSizePadDilations(static_cast(input_dims[dim + 2]), + auto spatial_dim = is_nhwc ? input_dims[dim + 1] : input_dims[dim + 2]; + ComputeSizePadDilations(static_cast(spatial_dim), strides[dim], kernel_shape[dim], &actual_pads->at(dim), diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc index d90725971e393..93e18d2940fc2 100644 --- a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc +++ b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc @@ -1,4 +1,5 @@ // Copyright (c) Microsoft Corporation. All rights reserved. +// Copyright (c) 2023 NVIDIA Corporation. // Licensed under the MIT License. #include "core/common/inlined_containers.h" @@ -15,6 +16,10 @@ #include "contrib_ops/cuda/cuda_contrib_kernels.h" #endif +#ifdef ENABLE_CUDA_NHWC_OPS +#include "core/providers/cuda/cuda_nhwc_kernels.h" +#endif + #ifdef ENABLE_TRAINING_OPS #include "orttraining/training_ops/cuda/cuda_training_kernels.h" #endif @@ -233,6 +238,10 @@ CUDAExecutionProvider::CUDAExecutionProvider(const CUDAExecutionProviderInfo& in : IExecutionProvider{onnxruntime::kCudaExecutionProvider, OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, info.device_id)}, info_{info}, tuning_context_(this, &info_.tunable_op) { +#ifndef ENABLE_CUDA_NHWC_OPS + ORT_ENFORCE(info_.prefer_nhwc == 0, "This build does not support NHWC layout"); +#endif + CUDA_CALL_THROW(cudaSetDevice(info_.device_id)); // must wait GPU idle, otherwise cudaGetDeviceProperties might fail @@ -271,6 +280,10 @@ CUDAExecutionProvider::CUDAExecutionProvider(const CUDAExecutionProviderInfo& in #endif } +DataLayout CUDAExecutionProvider::GetPreferredLayout() const { + return this->IsNHWCPreferred() ? 
DataLayout::NHWC : DataLayout::NCHW; +} + CUDAExecutionProvider::~CUDAExecutionProvider() { // clean up thread local context caches { @@ -2330,6 +2343,10 @@ static Status RegisterCudaKernels(KernelRegistry& kernel_registry) { ORT_RETURN_IF_ERROR(::onnxruntime::contrib::cuda::RegisterCudaContribKernels(kernel_registry)); #endif +#ifdef ENABLE_CUDA_NHWC_OPS + ORT_RETURN_IF_ERROR(::onnxruntime::cuda::RegisterCudaNhwcKernels(kernel_registry)); +#endif + #ifdef ENABLE_TRAINING_OPS ORT_RETURN_IF_ERROR(::onnxruntime::cuda::RegisterCudaTrainingKernels(kernel_registry)); #endif diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider.h b/onnxruntime/core/providers/cuda/cuda_execution_provider.h index c9e510b7f472b..d0bb2321edf0a 100644 --- a/onnxruntime/core/providers/cuda/cuda_execution_provider.h +++ b/onnxruntime/core/providers/cuda/cuda_execution_provider.h @@ -1,4 +1,5 @@ // Copyright (c) Microsoft Corporation. All rights reserved. +// Copyright (c) 2023 NVIDIA Corporation. // Licensed under the MIT License. #pragma once @@ -32,6 +33,8 @@ class CUDAExecutionProvider : public IExecutionProvider { Status OnRunEnd(bool sync_stream) override; + DataLayout GetPreferredLayout() const override; + const void* GetExecutionHandle() const noexcept override { // The CUDA interface does not return anything interesting. return nullptr; @@ -49,6 +52,12 @@ class CUDAExecutionProvider : public IExecutionProvider { return GetPerThreadContext().CudnnHandle(); } + cudaStream_t ComputeStream() { + // this will return the CUDA EP level stream which can differ from the actual compute tasks stream + // the compute task stream is supplied within OpKernelContext during inference + return stream_; + } + template const T* GetConstOnes(size_t count, cudaStream_t stream) { return GetPerThreadContext().template GetConstOnes(count, stream); @@ -68,6 +77,7 @@ class CUDAExecutionProvider : public IExecutionProvider { bool GetCudnnConvUseMaxWorkspace() const { return info_.cudnn_conv_use_max_workspace; } bool GetCudnnConv1dPadToNc1d() const { return info_.cudnn_conv1d_pad_to_nc1d; } bool IsSkipLayerNormInStrictMode() const { return info_.enable_skip_layer_norm_strict_mode; } + bool IsNHWCPreferred() const { return info_.prefer_nhwc; } ProviderOptions GetProviderOptions() const override { return CUDAExecutionProviderInfo::ToProviderOptions(info_); diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider_info.cc b/onnxruntime/core/providers/cuda/cuda_execution_provider_info.cc index 966448051264d..daa3b5ff3d72f 100644 --- a/onnxruntime/core/providers/cuda/cuda_execution_provider_info.cc +++ b/onnxruntime/core/providers/cuda/cuda_execution_provider_info.cc @@ -1,4 +1,5 @@ // Copyright (c) Microsoft Corporation. All rights reserved. +// Copyright (c) 2023 NVIDIA Corporation. // Licensed under the MIT License. 
#include "core/providers/shared_library/provider_api.h" @@ -29,6 +30,7 @@ constexpr const char* kTunableOpEnable = "tunable_op_enable"; constexpr const char* kTunableOpTuningEnable = "tunable_op_tuning_enable"; constexpr const char* kTunableOpMaxTuningDurationMs = "tunable_op_max_tuning_duration_ms"; constexpr const char* kEnableSkipLayerNormStrictMode = "enable_skip_layer_norm_strict_mode"; +constexpr const char* kPreferNCHWMode = "prefer_nhwc"; constexpr const char* KUseEPLevelUnifiedStream = "use_ep_level_unified_stream"; } // namespace provider_option_names } // namespace cuda @@ -100,6 +102,7 @@ CUDAExecutionProviderInfo CUDAExecutionProviderInfo::FromProviderOptions(const P .AddAssignmentToReference(cuda::provider_option_names::kEnableCudaGraph, info.enable_cuda_graph) .AddAssignmentToReference(cuda::provider_option_names::kCudnnConv1dPadToNc1d, info.cudnn_conv1d_pad_to_nc1d) .AddAssignmentToReference(cuda::provider_option_names::kEnableSkipLayerNormStrictMode, info.enable_skip_layer_norm_strict_mode) + .AddAssignmentToReference(cuda::provider_option_names::kPreferNCHWMode, info.prefer_nhwc) .AddAssignmentToReference(cuda::provider_option_names::KUseEPLevelUnifiedStream, info.use_ep_level_unified_stream) .AddValueParser( cuda::provider_option_names::kTunableOpEnable, @@ -146,6 +149,7 @@ ProviderOptions CUDAExecutionProviderInfo::ToProviderOptions(const CUDAExecution {cuda::provider_option_names::kTunableOpTuningEnable, MakeStringWithClassicLocale(info.tunable_op.tuning_enable)}, {cuda::provider_option_names::kTunableOpMaxTuningDurationMs, MakeStringWithClassicLocale(info.tunable_op.max_tuning_duration_ms)}, {cuda::provider_option_names::kEnableSkipLayerNormStrictMode, MakeStringWithClassicLocale(info.enable_skip_layer_norm_strict_mode)}, + {cuda::provider_option_names::kPreferNCHWMode, MakeStringWithClassicLocale(info.prefer_nhwc)}, {cuda::provider_option_names::KUseEPLevelUnifiedStream, MakeStringWithClassicLocale(info.use_ep_level_unified_stream)}, }; @@ -165,6 +169,7 @@ ProviderOptions CUDAExecutionProviderInfo::ToProviderOptions(const OrtCUDAProvid {cuda::provider_option_names::kTunableOpEnable, MakeStringWithClassicLocale(info.tunable_op_enable)}, {cuda::provider_option_names::kTunableOpTuningEnable, MakeStringWithClassicLocale(info.tunable_op_tuning_enable)}, {cuda::provider_option_names::kTunableOpMaxTuningDurationMs, MakeStringWithClassicLocale(info.tunable_op_max_tuning_duration_ms)}, + {cuda::provider_option_names::kPreferNCHWMode, MakeStringWithClassicLocale(info.prefer_nhwc)}, {cuda::provider_option_names::KUseEPLevelUnifiedStream, MakeStringWithClassicLocale(info.use_ep_level_unified_stream)}, }; diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider_info.h b/onnxruntime/core/providers/cuda/cuda_execution_provider_info.h index 89b266f362e8d..b286f5a9161b0 100644 --- a/onnxruntime/core/providers/cuda/cuda_execution_provider_info.h +++ b/onnxruntime/core/providers/cuda/cuda_execution_provider_info.h @@ -1,4 +1,5 @@ // Copyright (c) Microsoft Corporation. All rights reserved. +// Copyright (c) 2023 NVIDIA Corporation. // Licensed under the MIT License. 
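[Editor's note] The same preference can also be set through the string-keyed provider-options API; the key matches the "prefer_nhwc" string registered above. Sketch only, with error handling trimmed:

// Sketch only: string-keyed route for enabling prefer_nhwc on the CUDA EP.
#include <onnxruntime_cxx_api.h>

void AppendNhwcCuda(Ort::SessionOptions& session_options) {
  const OrtApi& api = Ort::GetApi();
  OrtCUDAProviderOptionsV2* cuda_options = nullptr;
  Ort::ThrowOnError(api.CreateCUDAProviderOptions(&cuda_options));
  const char* keys[] = {"prefer_nhwc"};
  const char* values[] = {"1"};
  Ort::ThrowOnError(api.UpdateCUDAProviderOptions(cuda_options, keys, values, 1));
  session_options.AppendExecutionProvider_CUDA_V2(*cuda_options);
  api.ReleaseCUDAProviderOptions(cuda_options);
}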
#pragma once @@ -71,6 +72,7 @@ struct CUDAExecutionProviderInfo { cuda::TunableOpInfo tunable_op{}; bool enable_skip_layer_norm_strict_mode{false}; + bool prefer_nhwc{false}; bool use_ep_level_unified_stream{false}; diff --git a/onnxruntime/core/providers/cuda/cuda_kernel.h b/onnxruntime/core/providers/cuda/cuda_kernel.h index 58517c2850baf..f8b92eface52f 100644 --- a/onnxruntime/core/providers/cuda/cuda_kernel.h +++ b/onnxruntime/core/providers/cuda/cuda_kernel.h @@ -170,6 +170,12 @@ class CudaKernel : public OpKernel { return provider_->PerThreadDefaultCudnnHandle(); } + inline cudaStream_t DefaultCudaStream() const { + // this will return the CUDA EP level stream which can differ from the actual compute tasks stream + // the compute task stream is supplied within OpKernelContext during inference + return provider_->ComputeStream(); + } + protected: template inline const T* GetConstOnes(size_t count, cudaStream_t stream) const { diff --git a/onnxruntime/core/providers/cuda/cuda_nhwc_kernels.cc b/onnxruntime/core/providers/cuda/cuda_nhwc_kernels.cc new file mode 100644 index 0000000000000..f416caecd115f --- /dev/null +++ b/onnxruntime/core/providers/cuda/cuda_nhwc_kernels.cc @@ -0,0 +1,169 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Copyright (c) 2023 NVIDIA Corporation. +// Licensed under the MIT License. + +#ifdef ENABLE_CUDA_NHWC_OPS + +#include + +#include "core/providers/shared_library/provider_api.h" +#include "core/providers/cuda/cuda_fwd.h" + +#include "core/providers/cuda/cuda_nhwc_kernels.h" + +namespace onnxruntime::cuda { + +// When adding new supported NHWC operations make sure to also integrate them into: ConvertNodeLayout +// in onnxruntime/core/optimizer/layout_transformation/layout_transformation.cc + +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 7, 8, float, + BatchNormalization); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 7, 8, MLFloat16, + BatchNormalization); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 9, 13, float, + BatchNormalization); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 9, 13, MLFloat16, + BatchNormalization); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 1, 10, float, + Conv); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 1, 10, MLFloat16, + Conv); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 1, 10, float, + ConvTranspose); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 1, 10, MLFloat16, + ConvTranspose); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 7, 9, float, + AveragePool); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 7, 9, MLFloat16, + AveragePool); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 1, float, GlobalAveragePool); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 1, MLFloat16, + GlobalAveragePool); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 1, 7, float, + MaxPool); +class 
ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 1, 7, MLFloat16, + MaxPool); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 8, 9, float, + MaxPool); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 8, 9, MLFloat16, + MaxPool); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 1, float, GlobalMaxPool); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 1, MLFloat16, GlobalMaxPool); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 10, 10, float, + AveragePool); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 10, 10, MLFloat16, + AveragePool); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 10, 10, float, + MaxPool); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 10, 10, MLFloat16, + MaxPool); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 11, float, Conv); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 11, MLFloat16, Conv); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 11, float, ConvTranspose); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 11, MLFloat16, + ConvTranspose); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 11, float, AveragePool); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 11, MLFloat16, AveragePool); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 11, 11, float, + MaxPool); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 11, 11, MLFloat16, + MaxPool); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 12, float, MaxPool); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 12, MLFloat16, MaxPool); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 14, 14, float, + BatchNormalization); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 14, 14, MLFloat16, + BatchNormalization); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 15, float, + BatchNormalization); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 15, MLFloat16, + BatchNormalization); + +Status RegisterCudaNhwcKernels(KernelRegistry& kernel_registry) { + static const BuildKernelCreateInfoFn nhwc_function_table[] = { + BuildKernelCreateInfo, // default entry to avoid the list become empty after ops-reducing + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + 
BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + }; + + for (auto& function_table_entry : nhwc_function_table) { + KernelCreateInfo info = function_table_entry(); + if (info.kernel_def != nullptr) { // filter disabled entries where type is void + ORT_RETURN_IF_ERROR(kernel_registry.Register(std::move(info))); + } + } + return Status::OK(); +} +} // namespace onnxruntime::cuda +#endif diff --git a/onnxruntime/core/providers/cuda/cuda_nhwc_kernels.h b/onnxruntime/core/providers/cuda/cuda_nhwc_kernels.h new file mode 100644 index 0000000000000..0b3a6d5cff0c7 --- /dev/null +++ b/onnxruntime/core/providers/cuda/cuda_nhwc_kernels.h @@ -0,0 +1,13 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Copyright (c) 2023 NVIDIA Corporation. +// Licensed under the MIT License. + +#pragma once + +#include "core/common/status.h" + +namespace onnxruntime::cuda { + +onnxruntime::common::Status RegisterCudaNhwcKernels(onnxruntime::KernelRegistry& kernel_registry); + +} // namespace onnxruntime::cuda diff --git a/onnxruntime/core/providers/cuda/cuda_provider_factory.cc b/onnxruntime/core/providers/cuda/cuda_provider_factory.cc index 734a890c20cda..892e8d5329eba 100644 --- a/onnxruntime/core/providers/cuda/cuda_provider_factory.cc +++ b/onnxruntime/core/providers/cuda/cuda_provider_factory.cc @@ -1,4 +1,5 @@ // Copyright (c) Microsoft Corporation. All rights reserved. +// Copyright (c) 2023 NVIDIA Corporation. // Licensed under the MIT License. #include "core/providers/shared_library/provider_api.h" @@ -217,6 +218,7 @@ struct CUDA_Provider : Provider { info.default_memory_arena_cfg = params->default_memory_arena_cfg; info.cudnn_conv_use_max_workspace = params->cudnn_conv_use_max_workspace != 0; info.enable_cuda_graph = params->enable_cuda_graph != 0; + info.prefer_nhwc = params->prefer_nhwc; info.cudnn_conv1d_pad_to_nc1d = params->cudnn_conv1d_pad_to_nc1d != 0; info.tunable_op.enable = params->tunable_op_enable; info.tunable_op.tuning_enable = params->tunable_op_tuning_enable; @@ -254,6 +256,7 @@ struct CUDA_Provider : Provider { cuda_options.enable_cuda_graph = internal_options.enable_cuda_graph; cuda_options.cudnn_conv1d_pad_to_nc1d = internal_options.cudnn_conv1d_pad_to_nc1d; cuda_options.enable_skip_layer_norm_strict_mode = internal_options.enable_skip_layer_norm_strict_mode; + cuda_options.prefer_nhwc = internal_options.prefer_nhwc; cuda_options.use_ep_level_unified_stream = internal_options.use_ep_level_unified_stream; } diff --git a/onnxruntime/core/providers/cuda/cudnn_common.cc b/onnxruntime/core/providers/cuda/cudnn_common.cc index fc02a6509bf24..4df59a98b12e5 100644 --- a/onnxruntime/core/providers/cuda/cudnn_common.cc +++ b/onnxruntime/core/providers/cuda/cudnn_common.cc @@ -1,7 +1,10 @@ // Copyright (c) Microsoft Corporation. All rights reserved. +// Copyright (c) 2023 NVIDIA Corporation. // Licensed under the MIT License. 
-#include "cudnn_common.h" +#include + +#include "core/providers/cuda/cudnn_common.h" #include "core/common/inlined_containers.h" #include "core/common/gsl.h" #include "shared_inc/cuda_call.h" @@ -27,7 +30,7 @@ Status CudnnTensor::CreateTensorIfNeeded() { return Status::OK(); } -Status CudnnTensor::Set(gsl::span input_dims, cudnnDataType_t dataType) { +Status CudnnTensor::Set(gsl::span input_dims, cudnnDataType_t dataType, bool is_nhwc) { ORT_RETURN_IF_ERROR(CreateTensorIfNeeded()); int rank = gsl::narrow_cast(input_dims.size()); @@ -38,6 +41,10 @@ Status CudnnTensor::Set(gsl::span input_dims, cudnnDataType_t dat dims[i] = gsl::narrow_cast(input_dims[i]); strides[i] = gsl::narrow_cast(pitches[i]); } + if (is_nhwc) { + std::swap(dims[1], dims[rank - 1]); + std::swap(strides[1], strides[rank - 1]); + } CUDNN_RETURN_IF_ERROR(cudnnSetTensorNdDescriptor(tensor_, dataType, static_cast(rank), dims.data(), strides.data())); return Status::OK(); } diff --git a/onnxruntime/core/providers/cuda/cudnn_common.h b/onnxruntime/core/providers/cuda/cudnn_common.h index ba75ab4f2c029..8a94a334ee688 100644 --- a/onnxruntime/core/providers/cuda/cudnn_common.h +++ b/onnxruntime/core/providers/cuda/cudnn_common.h @@ -1,4 +1,5 @@ // Copyright (c) Microsoft Corporation. All rights reserved. +// Copyright (c) 2023 NVIDIA Corporation. // Licensed under the MIT License. #pragma once @@ -16,7 +17,7 @@ class CudnnTensor final { ~CudnnTensor(); ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(CudnnTensor); - Status Set(gsl::span input_dims, cudnnDataType_t dataType); + Status Set(gsl::span input_dims, cudnnDataType_t dataType, bool is_nhwc = false); Status Set(const CudnnTensor& x_desc, cudnnBatchNormMode_t mode); // Set 4D tensor format (for NHWC) Status Set(cudnnTensorFormat_t format, cudnnDataType_t dataType, int n, int c, int h, int w); diff --git a/onnxruntime/core/providers/cuda/nn/batch_norm.cc b/onnxruntime/core/providers/cuda/nn/batch_norm.cc index 4f22b5298a30a..c468971e1e426 100644 --- a/onnxruntime/core/providers/cuda/nn/batch_norm.cc +++ b/onnxruntime/core/providers/cuda/nn/batch_norm.cc @@ -1,4 +1,5 @@ // Copyright (c) Microsoft Corporation. All rights reserved. +// Copyright (c) 2023 NVIDIA Corporation. // Licensed under the MIT License. 
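[Editor's note] Since cudnnSetTensorNdDescriptor receives explicit strides, a channels-last layout is expressed by giving the channel dimension stride 1, which is what the dims/strides swap above achieves. As a standalone reference (not the ORT helper), a packed NHWC 4-D tensor can be described like this:

// Reference sketch: Nd descriptor for a packed NHWC buffer (assumes float data).
#include <cudnn.h>

cudnnStatus_t SetPackedNhwcDesc(cudnnTensorDescriptor_t desc, int n, int c, int h, int w) {
  const int dims[4] = {n, c, h, w};                 // logical N, C, H, W
  const int strides[4] = {h * w * c, 1, w * c, c};  // channel stride is 1
  return cudnnSetTensorNdDescriptor(desc, CUDNN_DATA_FLOAT, 4, dims, strides);
}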
#include "batch_norm.h" @@ -11,38 +12,38 @@ using namespace std; namespace onnxruntime { namespace cuda { -#define REGISTER_KERNEL_TYPED(T) \ +#define REGISTER_KERNEL_TYPED(T, DOMAIN, NHWC) \ ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( \ BatchNormalization, \ - kOnnxDomain, \ + DOMAIN, \ 7, 8, \ T, \ kCudaExecutionProvider, \ (*KernelDefBuilder::Create()) \ .TypeConstraint("T", DataTypeImpl::GetTensorType()), \ - BatchNorm); \ + BatchNorm); \ ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( \ BatchNormalization, \ - kOnnxDomain, \ + DOMAIN, \ 9, 13, \ T, \ kCudaExecutionProvider, \ (*KernelDefBuilder::Create()) \ .TypeConstraint("T", DataTypeImpl::GetTensorType()), \ - BatchNorm); \ + BatchNorm); \ ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( \ BatchNormalization, \ - kOnnxDomain, \ + DOMAIN, \ 14, 14, \ T, \ kCudaExecutionProvider, \ (*KernelDefBuilder::Create()) \ .TypeConstraint("T", DataTypeImpl::GetTensorType()) \ .TypeConstraint("U", DataTypeImpl::GetTensorType()), \ - BatchNorm); \ + BatchNorm); \ ONNX_OPERATOR_TYPED_KERNEL_EX( \ BatchNormalization, \ - kOnnxDomain, \ + DOMAIN, \ 15, \ T, \ kCudaExecutionProvider, \ @@ -50,10 +51,10 @@ namespace cuda { .TypeConstraint("T", DataTypeImpl::GetTensorType()) \ .TypeConstraint("T1", DataTypeImpl::GetTensorType()) \ .TypeConstraint("T2", DataTypeImpl::GetTensorType()), \ - BatchNorm); + BatchNorm); -template -Status BatchNorm::ComputeInternal(OpKernelContext* p_op_kernel_context) const { +template +Status BatchNorm::ComputeInternal(OpKernelContext* p_op_kernel_context) const { typedef typename ToCudaType::MappedType CudaT; const Tensor* X = p_op_kernel_context->Input(0); @@ -62,7 +63,7 @@ Status BatchNorm::ComputeInternal(OpKernelContext* p_op_kernel_context) const const Tensor* mean = p_op_kernel_context->Input(3); const Tensor* var = p_op_kernel_context->Input(4); - ORT_RETURN_IF_ERROR(BatchNormHelper::ValidateInputs(X, scale, B, mean, var, spatial_ == 1)); + ORT_RETURN_IF_ERROR(BatchNormHelper::ValidateInputs(X, scale, B, mean, var, spatial_ == 1, NHWC)); const TensorShape& x_shape = X->Shape(); const TensorShape& channel_shape = mean->Shape(); @@ -87,7 +88,7 @@ Status BatchNorm::ComputeInternal(OpKernelContext* p_op_kernel_context) const CudnnTensor data_desc; vector new_dims; BatchNormHelper::NormalizeDims(x_shape, new_dims); - ORT_RETURN_IF_ERROR(data_desc.Set(new_dims, CudnnTensor::GetDataType())); + ORT_RETURN_IF_ERROR(data_desc.Set(new_dims, CudnnTensor::GetDataType(), NHWC)); // For half data type, the alpha, beta, scale, B, mean, var need to be float type if (X->IsDataType()) { @@ -97,7 +98,7 @@ Status BatchNorm::ComputeInternal(OpKernelContext* p_op_kernel_context) const ORT_RETURN_IF_ERROR(bn_tensor_desc.Set(data_desc, cudnn_batch_norm_mode_)); // Convert the scale, B, mean, var to float - const int64_t C = x_shape.GetDims()[1]; + const int64_t C = x_shape.GetDims()[NHWC ? 
3 : 1]; auto f_scale = GetScratchBuffer(C, p_op_kernel_context->GetComputeStream()); auto f_B = GetScratchBuffer(C, p_op_kernel_context->GetComputeStream()); auto f_mean = GetScratchBuffer(C, p_op_kernel_context->GetComputeStream()); @@ -175,13 +176,17 @@ Status BatchNorm::ComputeInternal(OpKernelContext* p_op_kernel_context) const return Status::OK(); } -#define SPECIALIZED_COMPUTE(T) \ - REGISTER_KERNEL_TYPED(T) \ - template Status BatchNorm::ComputeInternal(OpKernelContext* ctx) const; +#define SPECIALIZED_COMPUTE(T, DOMAIN, NHWC) \ + REGISTER_KERNEL_TYPED(T, DOMAIN, NHWC) \ + template Status BatchNorm::ComputeInternal(OpKernelContext* ctx) const; -SPECIALIZED_COMPUTE(float) -SPECIALIZED_COMPUTE(double) -SPECIALIZED_COMPUTE(MLFloat16) +SPECIALIZED_COMPUTE(float, kOnnxDomain, false) +SPECIALIZED_COMPUTE(double, kOnnxDomain, false) +SPECIALIZED_COMPUTE(MLFloat16, kOnnxDomain, false) +#ifdef ENABLE_CUDA_NHWC_OPS +SPECIALIZED_COMPUTE(float, kMSInternalNHWCDomain, true) +SPECIALIZED_COMPUTE(MLFloat16, kMSInternalNHWCDomain, true) +#endif } // namespace cuda } // namespace onnxruntime diff --git a/onnxruntime/core/providers/cuda/nn/batch_norm.h b/onnxruntime/core/providers/cuda/nn/batch_norm.h index 99da7652a1d24..4eb9fb74d3761 100644 --- a/onnxruntime/core/providers/cuda/nn/batch_norm.h +++ b/onnxruntime/core/providers/cuda/nn/batch_norm.h @@ -1,4 +1,5 @@ // Copyright (c) Microsoft Corporation. All rights reserved. +// Copyright (c) 2023 NVIDIA Corporation. // Licensed under the MIT License. #pragma once @@ -9,7 +10,7 @@ namespace onnxruntime { namespace cuda { -template +template class BatchNorm final : public CudaKernel { public: BatchNorm(const OpKernelInfo& op_kernel_info) diff --git a/onnxruntime/core/providers/cuda/nn/conv.cc b/onnxruntime/core/providers/cuda/nn/conv.cc index 81db3c4186282..82f3503919237 100644 --- a/onnxruntime/core/providers/cuda/nn/conv.cc +++ b/onnxruntime/core/providers/cuda/nn/conv.cc @@ -1,38 +1,47 @@ // Copyright (c) Microsoft Corporation. All rights reserved. +// Copyright (c) 2023 NVIDIA Corporation. // Licensed under the MIT License. +#include + #include "core/providers/cuda/nn/conv.h" #include "core/common/span_utils.h" #include "core/providers/cuda/cuda_common.h" #include "core/providers/cuda/shared_inc/fpgeneric.h" #include "core/providers/cuda/tensor/slice.h" +#include "core/providers/cuda/tensor/transpose.h" namespace onnxruntime { namespace cuda { // Op Set 11 for Conv only update document to clearify default dilations and strides value. // which are already convered by op set 11 cpu versoin, so simply add declaration. 
-#define REGISTER_KERNEL_TYPED(T) \ +#define REGISTER_KERNEL_TYPED(T, DOMAIN, NHWC) \ ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( \ Conv, \ - kOnnxDomain, \ + DOMAIN, \ 1, 10, \ T, \ kCudaExecutionProvider, \ (*KernelDefBuilder::Create()).TypeConstraint("T", DataTypeImpl::GetTensorType()), \ - Conv); \ + Conv); \ ONNX_OPERATOR_TYPED_KERNEL_EX( \ Conv, \ - kOnnxDomain, \ + DOMAIN, \ 11, \ T, \ kCudaExecutionProvider, \ (*KernelDefBuilder::Create()).TypeConstraint("T", DataTypeImpl::GetTensorType()), \ - Conv); + Conv); + +REGISTER_KERNEL_TYPED(float, kOnnxDomain, false) +REGISTER_KERNEL_TYPED(double, kOnnxDomain, false) +REGISTER_KERNEL_TYPED(MLFloat16, kOnnxDomain, false) -REGISTER_KERNEL_TYPED(float) -REGISTER_KERNEL_TYPED(double) -REGISTER_KERNEL_TYPED(MLFloat16) +#ifdef ENABLE_CUDA_NHWC_OPS +REGISTER_KERNEL_TYPED(float, kMSInternalNHWCDomain, true) +REGISTER_KERNEL_TYPED(MLFloat16, kMSInternalNHWCDomain, true) +#endif template const cudnnConvolutionFwdAlgo_t Conv::kAllAlgos[] = { @@ -86,6 +95,39 @@ Status SliceOutUnwantedOutputSection(cudaStream_t stream, return SliceCuda::Impl(stream, input_data, input_dims, output_data, compute_metadata, element_size); } +template +Status Conv::PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc, + bool& is_packed, [[maybe_unused]] PrePackedWeights* prepacked_weights) { + is_packed = false; + // only layout of weight input is adjusted via PrePack + if (NHWC && is_nhwc_domain_) { // InputTensors::IN_W + if (input_idx == 1) { + // Transpose from {M, C/group, kH, kW} to {M, kH, kW, C/group} + auto orig_shape = tensor.Shape(); + + InlinedVector perm{0, 2, 3, 1}; + gsl::span permutation(perm.data(), 4); + TensorShapeVector new_dims{orig_shape[0], + orig_shape[2], + orig_shape[3], + orig_shape[1]}; + W_ = Tensor::Create(tensor.DataType(), TensorShape(new_dims), std::move(alloc)); + + auto status = cuda::Transpose::DoTranspose(GetDeviceProp(), + DefaultCudaStream(), + DefaultCublasHandle(), + permutation, tensor, *W_); + if (!status.IsOK()) { + return status; + } + CUDA_CALL_THROW(cudaStreamSynchronize(DefaultCudaStream())); + is_packed = true; + } + } + + return Status::OK(); +} + template Status Conv::UpdateState(OpKernelContext* context, bool bias_expected) const { // set X @@ -95,7 +137,12 @@ Status Conv::UpdateState(OpKernelContext* context, bool bias_expected) s_.x_data = reinterpret_cast(X->Data()); s_.element_size = X->DataType()->Size(); // set W - const Tensor* W = context->Input(1); + const Tensor* W; + if (!W_) { + W = context->Input(1); + } else { + W = W_.get(); + } const TensorShape& w_shape = W->Shape(); auto w_dims = w_shape.AsShapeVector(); s_.w_data = reinterpret_cast(W->Data()); diff --git a/onnxruntime/core/providers/cuda/nn/conv.h b/onnxruntime/core/providers/cuda/nn/conv.h index 07825b93204ca..bcaa4d855b81e 100644 --- a/onnxruntime/core/providers/cuda/nn/conv.h +++ b/onnxruntime/core/providers/cuda/nn/conv.h @@ -1,13 +1,16 @@ // Copyright (c) Microsoft Corporation. All rights reserved. +// Copyright (c) 2023 NVIDIA Corporation. // Licensed under the MIT License. 
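[Editor's note] Conv::PrePack above re-lays out the weight tensor once on the GPU via cuda::Transpose with perm {0, 2, 3, 1}. A plain-CPU reference of the index mapping it performs, with shapes and names that are illustrative only:

// Illustrative CPU equivalent of the {M, C/group, kH, kW} -> {M, kH, kW, C/group}
// weight re-layout; the PR does this with cuda::Transpose::DoTranspose instead.
#include <cstdint>
#include <vector>

std::vector<float> WeightsToMHWC(const std::vector<float>& w_mchw,
                                 int64_t M, int64_t C, int64_t kH, int64_t kW) {
  std::vector<float> w_mhwc(w_mchw.size());
  for (int64_t m = 0; m < M; ++m)
    for (int64_t c = 0; c < C; ++c)
      for (int64_t h = 0; h < kH; ++h)
        for (int64_t w = 0; w < kW; ++w)
          w_mhwc[((m * kH + h) * kW + w) * C + c] =     // destination: {M, kH, kW, C}
              w_mchw[((m * C + c) * kH + h) * kW + w];  // source:      {M, C, kH, kW}
  return w_mhwc;
}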
#pragma once +#include +#include + #include "core/platform/ort_mutex.h" #include "core/providers/cuda/cuda_kernel.h" #include "core/providers/cuda/cudnn_common.h" #include "core/providers/cpu/nn/conv_attributes.h" -#include namespace onnxruntime { @@ -187,8 +190,12 @@ class Conv : public CudaKernel { Conv(const OpKernelInfo& info) : CudaKernel(info), conv_attrs_(info) { auto pads_size = conv_attrs_.pads.size(); ORT_ENFORCE(pads_size % 2 == 0); + is_nhwc_domain_ = info.node().Domain() == kMSInternalNHWCDomain; } + Status PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc, + bool& is_packed, [[maybe_unused]] PrePackedWeights* prepacked_weights) override; + Status ComputeInternal(OpKernelContext* context) const override; protected: @@ -201,6 +208,8 @@ class Conv : public CudaKernel { mutable CudnnConvState s_; constexpr static auto kDefaultConvAlgo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM; static const cudnnConvolutionFwdAlgo_t kAllAlgos[]; + std::unique_ptr W_; + bool is_nhwc_domain_; // prepack is only needed for the Conv in kMSInternalNHWCDomain }; Status SliceOutUnwantedOutputSection(cudaStream_t stream, diff --git a/onnxruntime/core/providers/cuda/nn/conv_transpose.cc b/onnxruntime/core/providers/cuda/nn/conv_transpose.cc index 04f6bc46dcfcc..55dceaa2698e8 100644 --- a/onnxruntime/core/providers/cuda/nn/conv_transpose.cc +++ b/onnxruntime/core/providers/cuda/nn/conv_transpose.cc @@ -1,7 +1,11 @@ // Copyright (c) Microsoft Corporation. All rights reserved. +// Copyright (c) 2023 NVIDIA Corporation. // Licensed under the MIT License. +#include + #include "conv_transpose.h" +#include "core/providers/cuda/tensor/transpose.h" // To suppress FP static analyzer warnings: // https://msdata.visualstudio.com/Vienna/_workitems/edit/1944928 and @@ -17,35 +21,59 @@ namespace cuda { // Op Set 11 for ConvTranspose only update document to clarify default dilations and strides value. // which are already covered by op set 11 cpu version, so simply add declaration. 
-#define REGISTER_KERNEL_TYPED(T) \ - ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( \ - ConvTranspose, \ - kOnnxDomain, \ - 1, 10, \ - T, \ - kCudaExecutionProvider, \ - (*KernelDefBuilder::Create()).TypeConstraint("T", DataTypeImpl::GetTensorType()), \ - ConvTranspose); \ - ONNX_OPERATOR_TYPED_KERNEL_EX( \ - ConvTranspose, \ - kOnnxDomain, \ - 11, \ - T, \ - kCudaExecutionProvider, \ - (*KernelDefBuilder::Create()).TypeConstraint("T", DataTypeImpl::GetTensorType()), \ - ConvTranspose); - -REGISTER_KERNEL_TYPED(float) -REGISTER_KERNEL_TYPED(double) -REGISTER_KERNEL_TYPED(MLFloat16) - -template -Status ConvTranspose::ComputeInternal(OpKernelContext* context) const { +#define REGISTER_KERNEL_TYPED(T, DOMAIN, NHWC) \ + ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( \ + ConvTranspose, DOMAIN, 1, 10, T, kCudaExecutionProvider, \ + (*KernelDefBuilder::Create()).TypeConstraint("T", DataTypeImpl::GetTensorType()), ConvTranspose); \ + ONNX_OPERATOR_TYPED_KERNEL_EX(ConvTranspose, DOMAIN, 11, T, kCudaExecutionProvider, \ + (*KernelDefBuilder::Create()).TypeConstraint("T", DataTypeImpl::GetTensorType()), \ + ConvTranspose); + +REGISTER_KERNEL_TYPED(float, kOnnxDomain, false) +REGISTER_KERNEL_TYPED(double, kOnnxDomain, false) +REGISTER_KERNEL_TYPED(MLFloat16, kOnnxDomain, false) + +#ifdef ENABLE_CUDA_NHWC_OPS +REGISTER_KERNEL_TYPED(float, kMSInternalNHWCDomain, true) +REGISTER_KERNEL_TYPED(MLFloat16, kMSInternalNHWCDomain, true) +#endif + +template +Status ConvTranspose::ComputeInternal(OpKernelContext* context) const { return DoConvTranspose(context, false); } -template -Status ConvTranspose::DoConvTranspose(OpKernelContext* context, bool dynamic_padding) const { +template +Status ConvTranspose::PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc, bool& is_packed, + [[maybe_unused]] PrePackedWeights* prepacked_weights) { + is_packed = false; + // only layout of weight input is adjusted via PrePack + if (NHWC) { // InputTensors::IN_W + if (input_idx == 1) { + // Transpose from {M, C/group, kH, kW} to {M, kH, kW, C/group} + auto orig_shape = tensor.Shape(); + + InlinedVector perm{0, 2, 3, 1}; + gsl::span permutation(perm.data(), 4); + TensorShapeVector new_dims{orig_shape[0], orig_shape[2], orig_shape[3], orig_shape[1]}; + W_ = Tensor::Create(tensor.DataType(), TensorShape(new_dims), std::move(alloc)); + + auto status = cuda::Transpose::DoTranspose(GetDeviceProp(), DefaultCudaStream(), DefaultCublasHandle(), + permutation, tensor, *W_); + + if (!status.IsOK()) { + return status; + } + CUDA_CALL_THROW(cudaStreamSynchronize(DefaultCudaStream())); + is_packed = true; + } + } + + return Status::OK(); +} + +template +Status ConvTranspose::DoConvTranspose(OpKernelContext* context, bool dynamic_padding) const { typedef typename ToCudaType::MappedType CudaT; const Tensor* X = context->Input(0); @@ -59,7 +87,12 @@ Status ConvTranspose::DoConvTranspose(OpKernelContext* context, bool dynamic_ return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Input X must be 3-, 4- or 5-dimensional.", " X: ", X->Shape().ToString().c_str()); } - const Tensor* W = context->Input(1); + const Tensor* W; + if (!W_) { + W = context->Input(1); + } else { + W = W_.get(); + } const TensorShape& w_shape = W->Shape(); TensorShapeVector w_dims = w_shape.AsShapeVector(); auto w_data = reinterpret_cast(W->Data()); @@ -80,8 +113,7 @@ Status ConvTranspose::DoConvTranspose(OpKernelContext* context, bool dynamic_ bool input_dims_changed = (s_.last_x_dims.AsShapeVector() != x_dims); bool w_dims_changed = (s_.last_w_dims.AsShapeVector() != 
w_dims); if (input_dims_changed || w_dims_changed) { - if (input_dims_changed) - s_.last_x_dims = gsl::make_span(x_dims); + if (input_dims_changed) s_.last_x_dims = gsl::make_span(x_dims); if (w_dims_changed) { s_.last_w_dims = gsl::make_span(w_dims); @@ -89,7 +121,8 @@ Status ConvTranspose::DoConvTranspose(OpKernelContext* context, bool dynamic_ } ConvTransposeAttributes::Prepare p; - ORT_RETURN_IF_ERROR(conv_transpose_attrs_.PrepareForCompute(context, has_bias, p, dynamic_padding)); + ORT_RETURN_IF_ERROR( + conv_transpose_attrs_.PrepareForCompute(context, has_bias, p, dynamic_padding, &w_shape, NHWC)); auto y_dims = p.Y->Shape().AsShapeVector(); if (x_dimensions == 3) { @@ -103,7 +136,13 @@ Status ConvTranspose::DoConvTranspose(OpKernelContext* context, bool dynamic_ s_.y_dims = gsl::make_span(y_dims); if (w_dims_changed) { - ORT_RETURN_IF_ERROR(s_.w_desc.Set(w_dims, CudnnTensor::GetDataType())); + if (NHWC) { + ORT_RETURN_IF_ERROR(s_.w_desc.Set(CUDNN_TENSOR_NHWC, CudnnTensor::GetDataType(), + static_cast(w_dims[0]), static_cast(w_dims[3]), + static_cast(w_dims[1]), static_cast(w_dims[2]))); + } else { + ORT_RETURN_IF_ERROR(s_.w_desc.Set(w_dims, CudnnTensor::GetDataType())); + } } // Special case when there is a dim value of 0 in the shape. @@ -113,31 +152,39 @@ Status ConvTranspose::DoConvTranspose(OpKernelContext* context, bool dynamic_ if (p.Y->Shape().Size() == 0) { return Status::OK(); } - - ORT_RETURN_IF_ERROR(s_.x_tensor.Set(x_dims, CudnnTensor::GetDataType())); - ORT_RETURN_IF_ERROR(s_.y_tensor.Set(y_dims, CudnnTensor::GetDataType())); + if (NHWC) { + ORT_RETURN_IF_ERROR(s_.x_tensor.Set(CUDNN_TENSOR_NHWC, CudnnTensor::GetDataType(), + static_cast(x_dims[0]), static_cast(x_dims[3]), + static_cast(x_dims[1]), static_cast(x_dims[2]))); + ORT_RETURN_IF_ERROR(s_.y_tensor.Set(CUDNN_TENSOR_NHWC, CudnnTensor::GetDataType(), + static_cast(y_dims[0]), static_cast(y_dims[3]), + static_cast(y_dims[1]), static_cast(y_dims[2]))); + } else { + ORT_RETURN_IF_ERROR(s_.x_tensor.Set(x_dims, CudnnTensor::GetDataType())); + ORT_RETURN_IF_ERROR(s_.y_tensor.Set(y_dims, CudnnTensor::GetDataType())); + } cudnnConvolutionMode_t mode = CUDNN_CROSS_CORRELATION; ORT_RETURN_IF_ERROR(s_.conv_desc.Set(p.kernel_shape.size(), p.pads, p.strides, p.dilations, - gsl::narrow_cast(conv_transpose_attrs_.group), - mode, CudnnTensor::GetDataType())); + gsl::narrow_cast(conv_transpose_attrs_.group), mode, + CudnnTensor::GetDataType())); if (has_bias) { const auto& b_shape = p.B->Shape(); ORT_RETURN_IF_NOT(b_shape.NumDimensions() == 1, "bias should be 1D"); TensorShapeVector b_dims(2 + p.kernel_shape.size()); - b_dims[0] = 1; // N - b_dims[1] = b_shape[0]; // C - for (size_t i = 0; i < p.kernel_shape.size(); i++) - b_dims[2 + i] = 1; + b_dims[0] = 1; // N + b_dims[NHWC ? 3 : 1] = b_shape[0]; // C + for (size_t i = 0; i < p.kernel_shape.size(); i++) b_dims[(NHWC ? 
1 : 2) + i] = 1; - ORT_RETURN_IF_ERROR(s_.b_tensor.Set(b_dims, CudnnTensor::GetDataType())); + ORT_RETURN_IF_ERROR(s_.b_tensor.Set(b_dims, CudnnTensor::GetDataType(), NHWC)); } y_data = reinterpret_cast(p.Y->MutableData()); if (!s_.cached_benchmark_results.contains(x_dims)) { - IAllocatorUniquePtr algo_search_workspace = GetScratchBuffer(AlgoSearchWorkspaceSize, context->GetComputeStream()); + IAllocatorUniquePtr algo_search_workspace = + GetScratchBuffer(AlgoSearchWorkspaceSize, context->GetComputeStream()); // set math type to tensor core before algorithm search if constexpr (std::is_same::value) @@ -146,19 +193,8 @@ Status ConvTranspose::DoConvTranspose(OpKernelContext* context, bool dynamic_ cudnnConvolutionBwdDataAlgoPerf_t perf; int algo_count = 1; CUDNN_RETURN_IF_ERROR(cudnnFindConvolutionBackwardDataAlgorithmEx( - GetCudnnHandle(context), - s_.w_desc, - w_data, - s_.x_tensor, - x_data, - s_.conv_desc, - s_.y_tensor, - y_data, - 1, - &algo_count, - &perf, - algo_search_workspace.get(), - AlgoSearchWorkspaceSize)); + GetCudnnHandle(context), s_.w_desc, w_data, s_.x_tensor, x_data, s_.conv_desc, s_.y_tensor, y_data, 1, + &algo_count, &perf, algo_search_workspace.get(), AlgoSearchWorkspaceSize)); s_.cached_benchmark_results.insert(x_dims, {perf.algo, perf.memory, perf.mathType}); } @@ -189,26 +225,15 @@ Status ConvTranspose::DoConvTranspose(OpKernelContext* context, bool dynamic_ IAllocatorUniquePtr workspace = GetScratchBuffer(s_.workspace_bytes, context->GetComputeStream()); - CUDNN_RETURN_IF_ERROR( - cudnnConvolutionBackwardData( - GetCudnnHandle(context), - &alpha, - s_.w_desc, - w_data, - s_.x_tensor, - x_data, - s_.conv_desc, - s_.algo, - workspace.get(), - s_.workspace_bytes, - &beta, - s_.y_tensor, - y_data)); + CUDNN_RETURN_IF_ERROR(cudnnConvolutionBackwardData(GetCudnnHandle(context), &alpha, s_.w_desc, w_data, s_.x_tensor, + x_data, s_.conv_desc, s_.algo, workspace.get(), + s_.workspace_bytes, &beta, s_.y_tensor, y_data)); if (has_bias) { const Tensor* B = dynamic_padding ? context->Input(3) : context->Input(2); auto b_data = reinterpret_cast(B->Data()); - CUDNN_RETURN_IF_ERROR(cudnnAddTensor(GetCudnnHandle(context), &alpha, s_.b_tensor, b_data, &alpha, s_.y_tensor, y_data)); + CUDNN_RETURN_IF_ERROR( + cudnnAddTensor(GetCudnnHandle(context), &alpha, s_.b_tensor, b_data, &alpha, s_.y_tensor, y_data)); } } diff --git a/onnxruntime/core/providers/cuda/nn/conv_transpose.h b/onnxruntime/core/providers/cuda/nn/conv_transpose.h index 165d548d27fa2..77c9d94162b6b 100644 --- a/onnxruntime/core/providers/cuda/nn/conv_transpose.h +++ b/onnxruntime/core/providers/cuda/nn/conv_transpose.h @@ -1,8 +1,11 @@ // Copyright (c) Microsoft Corporation. All rights reserved. +// Copyright (c) 2023 NVIDIA Corporation. // Licensed under the MIT License. 
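[Editor's note] For the 4-D NHWC descriptors above, cudnnSetTensor4dDescriptor always takes logical N, C, H, W and the format flag declares the memory layout, which is why the NHWC-shaped dims {N, H, W, C} are passed as dims[0], dims[3], dims[1], dims[2]. A standalone sketch of that call, not ORT's CudnnTensor wrapper:

// Reference sketch: 4-D descriptor for NHWC data (assumes float).
#include <cudnn.h>

cudnnStatus_t SetNhwc4dDesc(cudnnTensorDescriptor_t desc, int n, int c, int h, int w) {
  return cudnnSetTensor4dDescriptor(desc, CUDNN_TENSOR_NHWC, CUDNN_DATA_FLOAT, n, c, h, w);
}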
#pragma once +#include + #include "core/providers/cuda/cuda_common.h" #include "core/providers/cuda/cuda_kernel.h" #include "core/providers/cuda/cudnn_common.h" @@ -12,10 +15,12 @@ namespace onnxruntime { namespace cuda { -template +template class ConvTranspose : public CudaKernel { public: ConvTranspose(const OpKernelInfo& info) : CudaKernel(info), conv_transpose_attrs_(info){}; + Status PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc, + bool& is_packed, [[maybe_unused]] PrePackedWeights* prepacked_weights) override; Status ComputeInternal(OpKernelContext* context) const override; Status DoConvTranspose(OpKernelContext* context, bool dynamic_padding) const; @@ -23,6 +28,7 @@ class ConvTranspose : public CudaKernel { ConvTransposeAttributes conv_transpose_attrs_; mutable CudnnConvState s_; + std::unique_ptr W_; }; } // namespace cuda diff --git a/onnxruntime/core/providers/cuda/nn/pool.cc b/onnxruntime/core/providers/cuda/nn/pool.cc index e632ef20bce43..8bc96958693bc 100644 --- a/onnxruntime/core/providers/cuda/nn/pool.cc +++ b/onnxruntime/core/providers/cuda/nn/pool.cc @@ -1,4 +1,5 @@ // Copyright (c) Microsoft Corporation. All rights reserved. +// Copyright (c) 2023 NVIDIA Corporation. // Licensed under the MIT License. #include "core/providers/shared_library/provider_api.h" @@ -11,92 +12,99 @@ using namespace onnxruntime::common; namespace onnxruntime { namespace cuda { -#define POOLING_KERNEL(op_name, data_type, pool_type, since_version) \ +#define POOLING_KERNEL(op_name, data_type, pool_type, since_version, op_domain, nhwc) \ ONNX_OPERATOR_TYPED_KERNEL_EX( \ - op_name, \ - kOnnxDomain, \ - since_version, \ - data_type, \ - kCudaExecutionProvider, \ + op_name, op_domain, since_version, data_type, kCudaExecutionProvider, \ (*KernelDefBuilder::Create()).TypeConstraint("T", DataTypeImpl::GetTensorType()), \ - Pool); - -#define POOLING_KERNEL_VERSIONED(op_name, data_type, pool_type, since_version, end_version) \ - ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( \ - op_name, \ - kOnnxDomain, \ - since_version, \ - end_version, \ - data_type, \ - kCudaExecutionProvider, \ - (*KernelDefBuilder::Create()) \ - .TypeConstraint("T", DataTypeImpl::GetTensorType()), \ - Pool); - -#define POOLING_KERNEL_WITH_INDICES(op_name, data_type, pool_type, since_version) \ - ONNX_OPERATOR_TYPED_KERNEL_EX( \ - op_name, \ - kOnnxDomain, \ - since_version, \ - data_type, \ - kCudaExecutionProvider, \ - (*KernelDefBuilder::Create()) \ - .TypeConstraint("T", DataTypeImpl::GetTensorType()) \ - .TypeConstraint("I", DataTypeImpl::GetTensorType()), \ - Pool); - -#define POOLING_KERNEL_VERSIONED_WITH_INDICES(op_name, data_type, pool_type, since_version, end_version) \ - ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( \ - op_name, \ - kOnnxDomain, \ - since_version, \ - end_version, \ - data_type, \ - kCudaExecutionProvider, \ - (*KernelDefBuilder::Create()) \ - .TypeConstraint("T", DataTypeImpl::GetTensorType()) \ - .TypeConstraint("I", DataTypeImpl::GetTensorType()), \ - Pool); - -POOLING_KERNEL_VERSIONED(AveragePool, float, AveragePool, 7, 9) -POOLING_KERNEL_VERSIONED(AveragePool, double, AveragePool, 7, 9) -POOLING_KERNEL_VERSIONED(AveragePool, MLFloat16, AveragePool, 7, 9) -POOLING_KERNEL_VERSIONED(AveragePool, float, AveragePool, 10, 10) -POOLING_KERNEL_VERSIONED(AveragePool, double, AveragePool, 10, 10) -POOLING_KERNEL_VERSIONED(AveragePool, MLFloat16, AveragePool, 10, 10) + Pool); + +#define POOLING_KERNEL_VERSIONED(op_name, data_type, pool_type, since_version, end_version, op_domain, nhwc) \ + 
ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( \ + op_name, op_domain, since_version, end_version, data_type, kCudaExecutionProvider, \ + (*KernelDefBuilder::Create()).TypeConstraint("T", DataTypeImpl::GetTensorType()), \ + Pool); + +#define POOLING_KERNEL_WITH_INDICES(op_name, data_type, pool_type, since_version, op_domain, nhwc) \ + ONNX_OPERATOR_TYPED_KERNEL_EX(op_name, op_domain, since_version, data_type, kCudaExecutionProvider, \ + (*KernelDefBuilder::Create()) \ + .TypeConstraint("T", DataTypeImpl::GetTensorType()) \ + .TypeConstraint("I", DataTypeImpl::GetTensorType()), \ + Pool); + +#define POOLING_KERNEL_VERSIONED_WITH_INDICES(op_name, data_type, pool_type, since_version, end_version, op_domain, \ + nhwc) \ + ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX(op_name, op_domain, since_version, end_version, data_type, \ + kCudaExecutionProvider, \ + (*KernelDefBuilder::Create()) \ + .TypeConstraint("T", DataTypeImpl::GetTensorType()) \ + .TypeConstraint("I", DataTypeImpl::GetTensorType()), \ + Pool); + +POOLING_KERNEL_VERSIONED(AveragePool, float, AveragePool, 7, 9, kOnnxDomain, false) +POOLING_KERNEL_VERSIONED(AveragePool, double, AveragePool, 7, 9, kOnnxDomain, false) +POOLING_KERNEL_VERSIONED(AveragePool, MLFloat16, AveragePool, 7, 9, kOnnxDomain, false) +POOLING_KERNEL_VERSIONED(AveragePool, float, AveragePool, 10, 10, kOnnxDomain, false) +POOLING_KERNEL_VERSIONED(AveragePool, double, AveragePool, 10, 10, kOnnxDomain, false) +POOLING_KERNEL_VERSIONED(AveragePool, MLFloat16, AveragePool, 10, 10, kOnnxDomain, false) // AveragePool and MaxPool op set 11 only update spec document on default value for dilations and strides. -POOLING_KERNEL(AveragePool, float, AveragePool, 11) -POOLING_KERNEL(AveragePool, double, AveragePool, 11) -POOLING_KERNEL(AveragePool, MLFloat16, AveragePool, 11) -POOLING_KERNEL(GlobalAveragePool, float, AveragePool, 1) -POOLING_KERNEL(GlobalAveragePool, double, AveragePool, 1) -POOLING_KERNEL(GlobalAveragePool, MLFloat16, AveragePool, 1) -POOLING_KERNEL_VERSIONED(MaxPool, float, MaxPool<1>, 1, 7) -POOLING_KERNEL_VERSIONED(MaxPool, double, MaxPool<1>, 1, 7) -POOLING_KERNEL_VERSIONED(MaxPool, MLFloat16, MaxPool<1>, 1, 7) -POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, float, MaxPool<8>, 8, 9) -POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, double, MaxPool<8>, 8, 9) -POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, MLFloat16, MaxPool<8>, 8, 9) -POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, float, MaxPool<8>, 10, 10) -POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, double, MaxPool<8>, 10, 10) -POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, MLFloat16, MaxPool<8>, 10, 10) -POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, float, MaxPool<8>, 11, 11) -POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, double, MaxPool<8>, 11, 11) -POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, MLFloat16, MaxPool<8>, 11, 11) -POOLING_KERNEL_WITH_INDICES(MaxPool, float, MaxPool<8>, 12) -POOLING_KERNEL_WITH_INDICES(MaxPool, double, MaxPool<8>, 12) -POOLING_KERNEL_WITH_INDICES(MaxPool, MLFloat16, MaxPool<8>, 12) -POOLING_KERNEL_WITH_INDICES(MaxPool, int8_t, MaxPool<8>, 12) -POOLING_KERNEL_WITH_INDICES(MaxPool, uint8_t, MaxPool<8>, 12) - -POOLING_KERNEL(GlobalMaxPool, float, MaxPool<1>, 1) -POOLING_KERNEL(GlobalMaxPool, double, MaxPool<1>, 1) -POOLING_KERNEL(GlobalMaxPool, MLFloat16, MaxPool<1>, 1) +POOLING_KERNEL(AveragePool, float, AveragePool, 11, kOnnxDomain, false) +POOLING_KERNEL(AveragePool, double, AveragePool, 11, kOnnxDomain, false) +POOLING_KERNEL(AveragePool, MLFloat16, AveragePool, 11, kOnnxDomain, 
false) +POOLING_KERNEL(GlobalAveragePool, float, AveragePool, 1, kOnnxDomain, false) +POOLING_KERNEL(GlobalAveragePool, double, AveragePool, 1, kOnnxDomain, false) +POOLING_KERNEL(GlobalAveragePool, MLFloat16, AveragePool, 1, kOnnxDomain, false) +POOLING_KERNEL_VERSIONED(MaxPool, float, MaxPool<1>, 1, 7, kOnnxDomain, false) +POOLING_KERNEL_VERSIONED(MaxPool, double, MaxPool<1>, 1, 7, kOnnxDomain, false) +POOLING_KERNEL_VERSIONED(MaxPool, MLFloat16, MaxPool<1>, 1, 7, kOnnxDomain, false) +POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, float, MaxPool<8>, 8, 9, kOnnxDomain, false) +POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, double, MaxPool<8>, 8, 9, kOnnxDomain, false) +POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, MLFloat16, MaxPool<8>, 8, 9, kOnnxDomain, false) +POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, float, MaxPool<8>, 10, 10, kOnnxDomain, false) +POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, double, MaxPool<8>, 10, 10, kOnnxDomain, false) +POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, MLFloat16, MaxPool<8>, 10, 10, kOnnxDomain, false) +POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, float, MaxPool<8>, 11, 11, kOnnxDomain, false) +POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, double, MaxPool<8>, 11, 11, kOnnxDomain, false) +POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, MLFloat16, MaxPool<8>, 11, 11, kOnnxDomain, false) +POOLING_KERNEL_WITH_INDICES(MaxPool, float, MaxPool<8>, 12, kOnnxDomain, false) +POOLING_KERNEL_WITH_INDICES(MaxPool, double, MaxPool<8>, 12, kOnnxDomain, false) +POOLING_KERNEL_WITH_INDICES(MaxPool, MLFloat16, MaxPool<8>, 12, kOnnxDomain, false) +POOLING_KERNEL_WITH_INDICES(MaxPool, int8_t, MaxPool<8>, 12, kOnnxDomain, false) +POOLING_KERNEL_WITH_INDICES(MaxPool, uint8_t, MaxPool<8>, 12, kOnnxDomain, false) + +POOLING_KERNEL(GlobalMaxPool, float, MaxPool<1>, 1, kOnnxDomain, false) +POOLING_KERNEL(GlobalMaxPool, double, MaxPool<1>, 1, kOnnxDomain, false) +POOLING_KERNEL(GlobalMaxPool, MLFloat16, MaxPool<1>, 1, kOnnxDomain, false) + +// NHWC variants +#ifdef ENABLE_CUDA_NHWC_OPS +POOLING_KERNEL_VERSIONED(MaxPool, float, MaxPool<1>, 1, 7, kMSInternalNHWCDomain, true) +POOLING_KERNEL_VERSIONED(MaxPool, MLFloat16, MaxPool<1>, 1, 7, kMSInternalNHWCDomain, true) +POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, float, MaxPool<8>, 8, 9, kMSInternalNHWCDomain, true) +POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, MLFloat16, MaxPool<8>, 8, 9, kMSInternalNHWCDomain, true) +POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, float, MaxPool<8>, 10, 10, kMSInternalNHWCDomain, true) +POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, MLFloat16, MaxPool<8>, 10, 10, kMSInternalNHWCDomain, true) +POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, float, MaxPool<8>, 11, 11, kMSInternalNHWCDomain, true) +POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, MLFloat16, MaxPool<8>, 11, 11, kMSInternalNHWCDomain, true) +POOLING_KERNEL_WITH_INDICES(MaxPool, float, MaxPool<8>, 12, kMSInternalNHWCDomain, true) +POOLING_KERNEL_WITH_INDICES(MaxPool, MLFloat16, MaxPool<8>, 12, kMSInternalNHWCDomain, true) + +POOLING_KERNEL(GlobalMaxPool, float, MaxPool<1>, 1, kMSInternalNHWCDomain, true) +POOLING_KERNEL(GlobalMaxPool, MLFloat16, MaxPool<1>, 1, kMSInternalNHWCDomain, true) + +POOLING_KERNEL_VERSIONED(AveragePool, float, AveragePool, 7, 9, kMSInternalNHWCDomain, true) +POOLING_KERNEL_VERSIONED(AveragePool, MLFloat16, AveragePool, 7, 9, kMSInternalNHWCDomain, true) +POOLING_KERNEL_VERSIONED(AveragePool, float, AveragePool, 10, 10, kMSInternalNHWCDomain, true) +POOLING_KERNEL_VERSIONED(AveragePool, MLFloat16, 
AveragePool, 10, 10, kMSInternalNHWCDomain, true) +// AveragePool and MaxPool op set 11 only update spec document on default value for dilations +POOLING_KERNEL(AveragePool, float, AveragePool, 11, kMSInternalNHWCDomain, true) +POOLING_KERNEL(AveragePool, MLFloat16, AveragePool, 11, kMSInternalNHWCDomain, true) +POOLING_KERNEL(GlobalAveragePool, float, AveragePool, 1, kMSInternalNHWCDomain, true) +POOLING_KERNEL(GlobalAveragePool, MLFloat16, AveragePool, 1, kMSInternalNHWCDomain, true) +#endif class CudnnPoolingDescriptor final { public: - CudnnPoolingDescriptor() : desc_(nullptr) { - } + CudnnPoolingDescriptor() : desc_(nullptr) {} ~CudnnPoolingDescriptor() { if (desc_ != nullptr) { @@ -108,12 +116,9 @@ class CudnnPoolingDescriptor final { CudnnPoolingDescriptor(const CudnnPoolingDescriptor&) = delete; CudnnPoolingDescriptor& operator=(const CudnnPoolingDescriptor&) = delete; - Status Set(cudnnPoolingMode_t mode, - const gsl::span& kernel_shape, - const gsl::span& pads, - const gsl::span& strides) { - if (!desc_) - CUDNN_RETURN_IF_ERROR(cudnnCreatePoolingDescriptor(&desc_)); + Status Set(cudnnPoolingMode_t mode, const gsl::span& kernel_shape, + const gsl::span& pads, const gsl::span& strides) { + if (!desc_) CUDNN_RETURN_IF_ERROR(cudnnCreatePoolingDescriptor(&desc_)); int rank = gsl::narrow_cast(kernel_shape.size()); InlinedVector window(rank); @@ -128,14 +133,8 @@ class CudnnPoolingDescriptor final { for (int i = 0; i < rank; i++) { stride[i] = gsl::narrow_cast(strides[i]); } - CUDNN_RETURN_IF_ERROR(SetPoolingNdDescriptorHelper( - desc_, - mode, - CUDNN_PROPAGATE_NAN, - rank, - window.data(), - padding.data(), - stride.data())); + CUDNN_RETURN_IF_ERROR(SetPoolingNdDescriptorHelper(desc_, mode, CUDNN_PROPAGATE_NAN, rank, window.data(), + padding.data(), stride.data())); return Status::OK(); } @@ -146,8 +145,8 @@ class CudnnPoolingDescriptor final { cudnnPoolingDescriptor_t desc_; }; -template -Status Pool::ComputeInternal(OpKernelContext* context) const { +template +Status Pool::ComputeInternal(OpKernelContext* context) const { typedef typename ToCudaType::MappedType CudaT; const Tensor* X = context->Input(0); const TensorShape& x_shape = X->Shape(); @@ -166,13 +165,12 @@ Status Pool::ComputeInternal(OpKernelContext* context) const { pads.assign(kernel_shape.size(), 0); strides.assign(kernel_shape.size(), 1); } - - auto y_dims = pool_attrs_.SetOutputSize(x_shape, x_shape[1], &pads); + auto out_channel = NHWC ? x_shape[3] : x_shape[1]; + auto y_dims = pool_attrs_.SetOutputSize(x_shape, out_channel, &pads, NHWC); TensorShape y_shape(y_dims); Tensor* Y = context->Output(0, y_shape); // special case when there is a dim value of 0 in the shape. 
- if (y_shape.Size() == 0) - return Status::OK(); + if (y_shape.Size() == 0) return Status::OK(); auto x_data = reinterpret_cast(X->Data()); auto y_data = reinterpret_cast(Y->MutableData()); @@ -181,12 +179,19 @@ Status Pool::ComputeInternal(OpKernelContext* context) const { TensorShapeVector y_dims_cudnn(y_dims); if (kernel_shape.size() < 2) { // cudnn only takes 4D or 5D input, so pad dimensions if needed - x_dims_cudnn.push_back(1); - y_dims_cudnn.push_back(1); + if (NHWC) { + x_dims_cudnn.insert(x_dims_cudnn.begin() + 1, 1); + y_dims_cudnn.insert(y_dims_cudnn.begin() + 1, 1); + kernel_shape.insert(kernel_shape.begin() + 1, 1); + strides.insert(strides.begin() + 1, 1); + } else { + x_dims_cudnn.push_back(1); + y_dims_cudnn.push_back(1); + kernel_shape.push_back(1); + strides.push_back(1); + } pads.insert(pads.begin() + kernel_shape.size(), 0); pads.insert(pads.end(), 0); - kernel_shape.push_back(1); - strides.push_back(1); } cudnnPoolingMode_t mode = CUDNN_POOLING_MAX; @@ -203,8 +208,8 @@ Status Pool::ComputeInternal(OpKernelContext* context) const { const auto beta = Consts::Zero; CudnnTensor x_tensor; CudnnTensor y_tensor; - ORT_RETURN_IF_ERROR(x_tensor.Set(x_dims_cudnn, CudnnTensor::GetDataType())); - ORT_RETURN_IF_ERROR(y_tensor.Set(y_dims_cudnn, CudnnTensor::GetDataType())); + ORT_RETURN_IF_ERROR(x_tensor.Set(x_dims_cudnn, CudnnTensor::GetDataType(), NHWC)); + ORT_RETURN_IF_ERROR(y_tensor.Set(y_dims_cudnn, CudnnTensor::GetDataType(), NHWC)); const auto input_count = x_shape.Size(); const auto output_count = y_shape.Size(); @@ -212,24 +217,26 @@ Status Pool::ComputeInternal(OpKernelContext* context) const { IAllocatorUniquePtr temp_X = GetScratchBuffer(input_count, context->GetComputeStream()); auto temp_Y = GetScratchBuffer(output_count, context->GetComputeStream()); Impl_Cast(Stream(context), reinterpret_cast(x_data), temp_X.get(), input_count); - CUDNN_RETURN_IF_ERROR(PoolingForwardHelper(GetCudnnHandle(context), pooling_desc, &alpha, x_tensor, temp_X.get(), &beta, y_tensor, temp_Y.get())); + CUDNN_RETURN_IF_ERROR(PoolingForwardHelper(GetCudnnHandle(context), pooling_desc, &alpha, x_tensor, temp_X.get(), + &beta, y_tensor, temp_Y.get())); Impl_Cast(Stream(context), temp_Y.get(), y_data, output_count); } else { const auto alpha = Consts::One; const auto beta = Consts::Zero; CudnnTensor x_tensor; CudnnTensor y_tensor; - ORT_RETURN_IF_ERROR(x_tensor.Set(x_dims_cudnn, CudnnTensor::GetDataType())); - ORT_RETURN_IF_ERROR(y_tensor.Set(y_dims_cudnn, CudnnTensor::GetDataType())); + ORT_RETURN_IF_ERROR(x_tensor.Set(x_dims_cudnn, CudnnTensor::GetDataType(), NHWC)); + ORT_RETURN_IF_ERROR(y_tensor.Set(y_dims_cudnn, CudnnTensor::GetDataType(), NHWC)); - CUDNN_RETURN_IF_ERROR(PoolingForwardHelper(GetCudnnHandle(context), pooling_desc, &alpha, x_tensor, x_data, &beta, y_tensor, y_data)); + CUDNN_RETURN_IF_ERROR( + PoolingForwardHelper(GetCudnnHandle(context), pooling_desc, &alpha, x_tensor, x_data, &beta, y_tensor, y_data)); } return Status::OK(); } -template -Status Pool>::ComputeInternal(OpKernelContext* context) const { +template +Status Pool, NHWC>::ComputeInternal(OpKernelContext* context) const { typedef typename ToCudaType::MappedType CudaT; const Tensor* X = context->Input(0); const TensorShape& x_shape = X->Shape(); @@ -248,13 +255,12 @@ Status Pool>::ComputeInternal(OpKernelContext* context) const { pads.assign(kernel_shape.size(), 0); strides.assign(kernel_shape.size(), 1); } - - auto y_dims = this->pool_attrs_.SetOutputSize(x_shape, x_shape[1], &pads); + auto out_channel = NHWC ? 
x_shape[3] : x_shape[1]; + auto y_dims = this->pool_attrs_.SetOutputSize(x_shape, out_channel, &pads, NHWC); Tensor* Y = context->Output(0, TensorShape(y_dims)); // special case when there is a dim value of 0 in the shape. - if (Y->Shape().Size() == 0) - return Status::OK(); + if (Y->Shape().Size() == 0) return Status::OK(); auto x_data = reinterpret_cast(X->Data()); auto y_data = reinterpret_cast(Y->MutableData()); @@ -262,20 +268,10 @@ Status Pool>::ComputeInternal(OpKernelContext* context) const { Tensor* I = context->Output(1, TensorShape(y_dims)); if (nullptr != I || !this->pool_attrs_.default_dilations) { auto i_data = nullptr == I ? nullptr : I->MutableData(); - MaxPoolWithIndex( - this->Stream(context), - x_shape, - TensorShape(y_dims), - kernel_shape, - strides, - pads, - this->pool_attrs_.dilations, - this->pool_attrs_.storage_order, - x_data, - y_data, - i_data); + MaxPoolWithIndex(this->Stream(context), x_shape, TensorShape(y_dims), kernel_shape, strides, pads, + this->pool_attrs_.dilations, this->pool_attrs_.storage_order, x_data, y_data, i_data); } else { - ORT_RETURN_IF_ERROR((Pool>::ComputeInternal(context))); + ORT_RETURN_IF_ERROR((Pool, NHWC>::ComputeInternal(context))); } return Status::OK(); } diff --git a/onnxruntime/core/providers/cuda/nn/pool.h b/onnxruntime/core/providers/cuda/nn/pool.h index fb223c18d2625..8b5152a1565a9 100644 --- a/onnxruntime/core/providers/cuda/nn/pool.h +++ b/onnxruntime/core/providers/cuda/nn/pool.h @@ -1,4 +1,5 @@ // Copyright (c) Microsoft Corporation. All rights reserved. +// Copyright (c) 2023 NVIDIA Corporation. // Licensed under the MIT License. #pragma once @@ -10,7 +11,7 @@ namespace onnxruntime { namespace cuda { -template +template class Pool : public CudaKernel, public PoolBase { public: Pool(const OpKernelInfo& info) : CudaKernel(info), PoolBase(info) {} @@ -18,10 +19,10 @@ class Pool : public CudaKernel, public PoolBase { Status ComputeInternal(OpKernelContext* context) const override; }; -template -class Pool> final : public Pool> { +template +class Pool, NHWC> final : public Pool, NHWC> { public: - Pool(const OpKernelInfo& info) : Pool>(info) {} + explicit Pool(const OpKernelInfo& info) : Pool, NHWC>(info) {} Status ComputeInternal(OpKernelContext* context) const override; }; diff --git a/onnxruntime/core/providers/shared_library/provider_api.h b/onnxruntime/core/providers/shared_library/provider_api.h index 85599fab808b3..76533a0061702 100644 --- a/onnxruntime/core/providers/shared_library/provider_api.h +++ b/onnxruntime/core/providers/shared_library/provider_api.h @@ -240,6 +240,7 @@ struct DeleteOnUnloadPtr { constexpr const char* kOnnxDomain = ""; constexpr const char* kMSDomain = "com.microsoft"; +constexpr const char* kMSInternalNHWCDomain = "com.ms.internal.nhwc"; constexpr const char* kPytorchAtenDomain = "org.pytorch.aten"; constexpr const char* kNGraphDomain = "com.intel.ai"; constexpr const char* kCudaExecutionProvider = "CUDAExecutionProvider"; diff --git a/onnxruntime/core/session/provider_bridge_ort.cc b/onnxruntime/core/session/provider_bridge_ort.cc index 8d30202faaeb3..d950223f2d108 100644 --- a/onnxruntime/core/session/provider_bridge_ort.cc +++ b/onnxruntime/core/session/provider_bridge_ort.cc @@ -1332,6 +1332,7 @@ OrtCUDAProviderOptionsV2 OrtCUDAProviderOptionsToOrtCUDAProviderOptionsV2(const // Use default value as this field is not available in OrtCUDAProviderOptions cuda_options_converted.cudnn_conv_use_max_workspace = 1; cuda_options_converted.enable_cuda_graph = 0; + cuda_options_converted.prefer_nhwc = 
0; cuda_options_converted.cudnn_conv1d_pad_to_nc1d = 0; cuda_options_converted.enable_skip_layer_norm_strict_mode = 0; cuda_options_converted.use_ep_level_unified_stream = 0; diff --git a/onnxruntime/test/perftest/command_args_parser.cc b/onnxruntime/test/perftest/command_args_parser.cc index 56f924ed351fb..b1a04a00e89b1 100644 --- a/onnxruntime/test/perftest/command_args_parser.cc +++ b/onnxruntime/test/perftest/command_args_parser.cc @@ -1,4 +1,5 @@ // Copyright (c) Microsoft Corporation. All rights reserved. +// Copyright (c) 2023 NVIDIA Corporation. // Licensed under the MIT License. #include "command_args_parser.h" @@ -53,8 +54,8 @@ namespace perftest { "\t-o [optimization level]: Default is 99 (all). Valid values are 0 (disable), 1 (basic), 2 (extended), 99 (all).\n" "\t\tPlease see onnxruntime_c_api.h (enum GraphOptimizationLevel) for the full list of all optimization levels.\n" "\t-u [optimized_model_path]: Specify the optimized model path for saving.\n" - "\t-d [cudnn_conv_algorithm]: Specify CUDNN convolution algorithms: 0(benchmark), 1(heuristic), 2(default). \n" - "\t-q: [CUDA only] use separate stream for copy. \n" + "\t-d [CUDA only][cudnn_conv_algorithm]: Specify CUDNN convolution algorithms: 0(benchmark), 1(heuristic), 2(default). \n" + "\t-q [CUDA only] use separate stream for copy. \n" "\t-z: Set denormal as zero. When turning on this option reduces latency dramatically, a model may have denormals.\n" "\t-i: Specify EP specific runtime options as key value pairs. Different runtime options available are: \n" "\t [OpenVINO only] [device_type]: Overrides the accelerator hardware type and precision with these values at runtime.\n" diff --git a/onnxruntime/test/perftest/ort_test_session.cc b/onnxruntime/test/perftest/ort_test_session.cc index 1111a92a385fd..a7f0b7584a211 100644 --- a/onnxruntime/test/perftest/ort_test_session.cc +++ b/onnxruntime/test/perftest/ort_test_session.cc @@ -1,3 +1,7 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Copyright (c) 2023 NVIDIA Corporation. +// Licensed under the MIT License. 
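With prefer_nhwc now part of the V2 CUDA provider options, an application can opt in through the string-based options API. A hedged sketch of what that looks like from user code, assuming the option is exposed under the string key "prefer_nhwc" (matching the struct field name, as with the other V2 options):

#include <onnxruntime_cxx_api.h>

int main() {
  Ort::Env env{ORT_LOGGING_LEVEL_WARNING, "nhwc_demo"};
  Ort::SessionOptions session_options;

  const auto& api = Ort::GetApi();
  OrtCUDAProviderOptionsV2* cuda_options = nullptr;
  Ort::ThrowOnError(api.CreateCUDAProviderOptions(&cuda_options));

  // Assumption: the NHWC preference is parsed from the key "prefer_nhwc".
  const char* keys[] = {"prefer_nhwc"};
  const char* values[] = {"1"};
  Ort::ThrowOnError(api.UpdateCUDAProviderOptions(cuda_options, keys, values, 1));

  session_options.AppendExecutionProvider_CUDA_V2(*cuda_options);
  api.ReleaseCUDAProviderOptions(cuda_options);

  Ort::Session session{env, ORT_TSTR("model.onnx"), session_options};
  return 0;
}

The perf-test changes below forward arbitrary "-i key|value" pairs to the same UpdateCUDAProviderOptions call, so the option should also be reachable from the command line without further parser changes.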
+ #include "ort_test_session.h" #include #include @@ -93,11 +97,66 @@ OnnxRuntimeTestSession::OnnxRuntimeTestSession(Ort::Env& env, std::random_device #endif } else if (provider_name == onnxruntime::kCudaExecutionProvider) { #ifdef USE_CUDA - OrtCUDAProviderOptions cuda_options; - cuda_options.cudnn_conv_algo_search = static_cast(performance_test_config.run_config.cudnn_conv_algo); - cuda_options.do_copy_in_default_stream = !performance_test_config.run_config.do_cuda_copy_in_separate_stream; - // TODO: Support arena configuration for users of perf test - session_options.AppendExecutionProvider_CUDA(cuda_options); + const auto& api = Ort::GetApi(); + OrtCUDAProviderOptionsV2* cuda_options; + Ort::ThrowOnError(api.CreateCUDAProviderOptions(&cuda_options)); + + const char* cudnn_conv_algo_search = "cudnn_conv_algo_search"; + const char* default_conv = "DEFAULT"; + const char* benchmarking = "EXHAUSTIVE"; + const char* heuristic = "HEURISTIC"; + switch (performance_test_config.run_config.cudnn_conv_algo) { + case 0: + Ort::ThrowOnError( + api.UpdateCUDAProviderOptions(cuda_options, &cudnn_conv_algo_search, &benchmarking, 1)); + break; + case 1: + Ort::ThrowOnError( + api.UpdateCUDAProviderOptions(cuda_options, &cudnn_conv_algo_search, &heuristic, 1)); + break; + default: + Ort::ThrowOnError( + api.UpdateCUDAProviderOptions(cuda_options, &cudnn_conv_algo_search, &default_conv, 1)); + break; + } + + const char* do_copy_in_default_stream = "do_copy_in_default_stream"; + if (performance_test_config.run_config.do_cuda_copy_in_separate_stream) { + const char* v = "1"; + Ort::ThrowOnError( + api.UpdateCUDAProviderOptions(cuda_options, &do_copy_in_default_stream, &v, 1)); + } else { + const char* v = "0"; + Ort::ThrowOnError( + api.UpdateCUDAProviderOptions(cuda_options, &do_copy_in_default_stream, &v, 1)); + } + +#ifdef _MSC_VER + std::string ov_string = ToUTF8String(performance_test_config.run_config.ep_runtime_config_string); +#else + std::string ov_string = performance_test_config.run_config.ep_runtime_config_string; +#endif + std::istringstream ss(ov_string); + std::string token; + while (ss >> token) { + if (token == "") { + continue; + } + auto pos = token.find("|"); + if (pos == std::string::npos || pos == 0 || pos == token.length()) { + ORT_THROW( + "[ERROR] [CUDA] Use a '|' to separate the key and value for the run-time option you are trying to use.\n"); + } + + auto key = token.substr(0, pos); + auto value = token.substr(pos + 1); + auto key_p = key.c_str(); + auto value_p = value.c_str(); + Ort::ThrowOnError( + api.UpdateCUDAProviderOptions(cuda_options, &key_p, &value_p, 1)); + } + + session_options.AppendExecutionProvider_CUDA_V2(*cuda_options); #else ORT_THROW("CUDA is not supported in this build\n"); #endif diff --git a/onnxruntime/test/providers/compare_provider_test_utils.cc b/onnxruntime/test/providers/compare_provider_test_utils.cc index 94fb03540e3f8..3ef74259e27b6 100644 --- a/onnxruntime/test/providers/compare_provider_test_utils.cc +++ b/onnxruntime/test/providers/compare_provider_test_utils.cc @@ -121,5 +121,83 @@ void CompareOpTester::CompareWithCPU(const std::string& target_provider_type, } } +void CompareOpTester::CompareEPs(const std::shared_ptr& source_execution_provider, + std::vector>& target_execution_providers, + double per_sample_tolerance, + double relative_per_sample_tolerance, + const bool need_cpu_cast, + const std::unordered_map& extra_domain_to_version) { + SetTestFunctionCalled(); + + auto& model = BuildModel(extra_domain_to_version); + auto& graph = 
model.MainGraph(); + + // In InferenceSession::Initialize(), the call to graph partitioner, which is responsible + // for Inlining function bodies for ops whose kernel is missing happens before the + // Cast Transformer. As a result, for MLFloat16 tests where the node is missing a CPU kernel, + // the function body is instead used for CPU pass. This option allows the comparison with + // the CPU kernel by adding the input/output casts before looking for a registered CPU kernel. + if (need_cpu_cast) { + InsertCastTransformer transformer("Test", GetExecutionProvider(kCpuExecutionProvider)->GetKernelRegistry().get()); + bool modified = false; + ASSERT_STATUS_OK(transformer.Apply(graph, modified, DefaultLoggingManager().DefaultLogger())); + } + + ASSERT_STATUS_OK(graph.Resolve()); + + // Hookup the inputs and outputs + std::unordered_map feeds; + std::vector output_names; + FillFeedsAndOutputNames(feeds, output_names); + + // Run the model + SessionOptions so; + so.session_logid = Op(); + + InferenceSession source_session_object{so, GetEnvironment()}; + ASSERT_STATUS_OK(source_session_object.RegisterExecutionProvider(source_execution_provider)); + + // first run with source provider + std::string s1; + model.ToProto().SerializeToString(&s1); + std::istringstream model_proto_str(s1); + + ASSERT_STATUS_OK(source_session_object.Load(model_proto_str)); + + ASSERT_STATUS_OK(source_session_object.Initialize()); + + std::vector source_fetches; + ASSERT_STATUS_OK(source_session_object.Run({}, feeds, output_names, &source_fetches)); + + for (auto& target_execution_provider : target_execution_providers) { + // run with target provider + // build the graph again as the other graphs may be with casts + auto& tp_model = BuildModel(extra_domain_to_version); + auto& tp_graph = tp_model.MainGraph(); + + ASSERT_STATUS_OK(tp_graph.Resolve()); + + InferenceSession target_session_object{so, GetEnvironment()}; + ASSERT_STATUS_OK(target_session_object.RegisterExecutionProvider(target_execution_provider)); + + std::string s2; + tp_model.ToProto().SerializeToString(&s2); + std::istringstream model_proto_str1(s2); + ASSERT_STATUS_OK(target_session_object.Load(model_proto_str1)); + + ASSERT_STATUS_OK(target_session_object.Initialize()); + + std::vector target_fetches; + ASSERT_STATUS_OK(target_session_object.Run({}, feeds, output_names, &target_fetches)); + + // compare + ASSERT_TRUE(source_fetches.size() == target_fetches.size()); + for (size_t i = 0; i < source_fetches.size(); i++) { + auto ret = CompareOrtValue(target_fetches[i], source_fetches[i], per_sample_tolerance, + relative_per_sample_tolerance, false); + EXPECT_EQ(ret.first, COMPARE_RESULT::SUCCESS) << ret.second; + } + } +} } // namespace test } // namespace onnxruntime diff --git a/onnxruntime/test/providers/compare_provider_test_utils.h b/onnxruntime/test/providers/compare_provider_test_utils.h index 924fe405ba8dd..155016d7e69a2 100644 --- a/onnxruntime/test/providers/compare_provider_test_utils.h +++ b/onnxruntime/test/providers/compare_provider_test_utils.h @@ -3,6 +3,11 @@ #pragma once +#include +#include +#include +#include + #include "core/graph/constants.h" #include "test/common/tensor_op_test_utils.h" #include "test/providers/provider_test_utils.h" @@ -22,6 +27,13 @@ class CompareOpTester : public OpTester { double relative_per_sample_tolerance = 1e-4, const bool need_cpu_cast = false, const std::unordered_map& extra_domain_to_version = {}); + + void CompareEPs(const std::shared_ptr& source_execution_provider, + std::vector>& 
target_execution_providers, + double per_sample_tolerance, + double relative_per_sample_tolerance = 1e-4, + const bool need_cpu_cast = false, + const std::unordered_map& extra_domain_to_version = {}); }; } // namespace test diff --git a/onnxruntime/test/providers/cuda/nhwc/conv_test.cc b/onnxruntime/test/providers/cuda/nhwc/conv_test.cc new file mode 100644 index 0000000000000..be0082f95feb8 --- /dev/null +++ b/onnxruntime/test/providers/cuda/nhwc/conv_test.cc @@ -0,0 +1,74 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Copyright (c) 2023 NVIDIA Corporation. +// Licensed under the MIT License. + +#include "test/providers/cuda/nhwc/nhwc_cuda_helper.h" + +namespace onnxruntime { +namespace test { + +template +struct ConvOp { + const std::vector input_dims; + const std::vector kernel_shape; + int64_t channels; + int64_t group = 1; + bool bias = false; + std::vector strides = {1, 1}; + std::vector padding = {0, 0, 0, 0}; + std::vector dilations = {1, 1}; + + std::unique_ptr get_test() { + RandomValueGenerator random{}; + + auto test = std::make_unique("Conv", 7); + std::vector input_data = random.Uniform(input_dims, 0.0f, 1.0f); + + std::vector weight_dims{channels, input_dims[1] / group, kernel_shape[0], kernel_shape[1]}; + std::vector weight_data = random.Uniform(weight_dims, -0.4f, 0.4f); + + test->AddInput("X", input_dims, input_data); + test->AddInput("W", weight_dims, weight_data, true); + if (bias) { + std::vector bias_dims{channels}; + std::vector bias_data = random.Uniform(bias_dims, 0.2f, 0.4f); + test->AddInput("B", bias_dims, bias_data, true); + } + test->AddAttribute("group", group); + test->AddAttribute("kernel_shape", kernel_shape); + test->AddAttribute("strides", strides); + test->AddAttribute("dilations", dilations); + test->AddAttribute("pads", padding); + + std::vector output_dims = { + input_dims[0], channels, + ComputeOutputShape(input_dims[2], strides[0], kernel_shape[0], dilations[0], padding[0], padding[1]), + ComputeOutputShape(input_dims[3], strides[1], kernel_shape[1], dilations[1], padding[2], padding[3])}; + std::vector output_data = FillZeros(output_dims); + + test->AddOutput("Y", output_dims, output_data); + return test; + } +}; + +TYPED_TEST(CudaNhwcTypedTest, ConvNhwcBias) { + auto op = ConvOp{.input_dims = {1, 16, 64, 64}, .kernel_shape = {3, 3}, .channels = 16, .bias = true}; + + MAKE_PROVIDERS_EPS_TYPE(TypeParam) +} + +TYPED_TEST(CudaNhwcTypedTest, ConvNhwcGroupNoBias) { + auto op = ConvOp{.input_dims = {1, 16, 64, 64}, .kernel_shape = {3, 3}, .channels = 16, .group = 4}; + + MAKE_PROVIDERS_EPS_TYPE(TypeParam) +} + +TYPED_TEST(CudaNhwcTypedTest, ConvNhwcPadding) { + auto op = + ConvOp{.input_dims = {2, 4, 64, 64}, .kernel_shape = {3, 3}, .channels = 4, .padding = {4, 4, 4, 4}}; + + MAKE_PROVIDERS_EPS_TYPE(TypeParam) +} + +} // namespace test +} // namespace onnxruntime diff --git a/onnxruntime/test/providers/cuda/nhwc/conv_transpose_test.cc b/onnxruntime/test/providers/cuda/nhwc/conv_transpose_test.cc new file mode 100644 index 0000000000000..d45323190c514 --- /dev/null +++ b/onnxruntime/test/providers/cuda/nhwc/conv_transpose_test.cc @@ -0,0 +1,93 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Copyright (c) 2023 NVIDIA Corporation. +// Licensed under the MIT License. 
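The expected output height and width built in ConvOp::get_test above follow the standard ONNX Conv output-size arithmetic; a stand-alone sketch of the floor-mode formula that the ComputeOutputShape helper used there presumably evaluates (hypothetical free function, not taken from the patch):

#include <cstdint>
#include <iostream>

// ONNX Conv output size for one spatial axis (floor mode).
int64_t ConvOutputDim(int64_t in, int64_t stride, int64_t kernel,
                      int64_t dilation, int64_t pad_begin, int64_t pad_end) {
  const int64_t effective_kernel = dilation * (kernel - 1) + 1;
  return (in + pad_begin + pad_end - effective_kernel) / stride + 1;
}

int main() {
  // Matches the ConvNhwcPadding case above: 64x64 input, 3x3 kernel,
  // stride 1, dilation 1, pads of 4 on every side.
  std::cout << ConvOutputDim(64, 1, 3, 1, 4, 4) << "\n";  // 70
  return 0;
}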
+ +#include "test/providers/cuda/nhwc/nhwc_cuda_helper.h" + +namespace onnxruntime { +namespace test { + +template +struct ConvTransposeOp { + const std::vector input_dims; + const std::vector kernel_shape; + int64_t channels; + int64_t group = 1; + bool bias = false; + std::vector strides = {1, 1}; + std::vector padding = {0, 0, 0, 0}; + std::vector output_padding = {0, 0, 0, 0}; + std::vector dilations = {1, 1}; + + std::unique_ptr get_test() { + RandomValueGenerator random{}; + + auto test = std::make_unique("ConvTranspose", 14); + std::vector input_data = random.Uniform(input_dims, 0.0f, 1.0f); + + std::vector weight_dims{input_dims[1], channels / group, kernel_shape[0], kernel_shape[1]}; + std::vector weight_data = random.Uniform(weight_dims, -0.4f, 0.4f); + + test->AddInput("X", input_dims, input_data); + test->AddInput("W", weight_dims, weight_data, true); + if (bias) { + std::vector bias_dims{channels}; + std::vector bias_data = random.Uniform(bias_dims, 0.2f, 0.4f); + test->AddInput("B", bias_dims, bias_data, true); + } + test->AddAttribute("group", group); + test->AddAttribute("kernel_shape", kernel_shape); + test->AddAttribute("strides", strides); + test->AddAttribute("dilations", dilations); + test->AddAttribute("pads", padding); + if (!output_padding.empty()) { + test->AddAttribute("output_padding", output_padding); + } + + std::vector output_dims = { + input_dims[0], channels, + (kernel_shape[1] - 1) * dilations[1] + (input_dims[2] - 1) * strides[1] - (padding[1] + padding[0]) + 1, + (kernel_shape[0] - 1) * dilations[0] + (input_dims[3] - 1) * strides[0] - (padding[3] + padding[2]) + 1}; + std::vector output_data = FillZeros(output_dims); + + test->AddOutput("Y", output_dims, output_data); + return test; + } +}; + +TYPED_TEST(CudaNhwcTypedTest, ConvTransposeNhwcGroupNoBias) { + auto op = + ConvTransposeOp{.input_dims = {8, 8, 32, 32}, .kernel_shape = {3, 3}, .channels = 16, .group = 4}; + + MAKE_PROVIDERS_EPS_TYPE(TypeParam) +} + +TYPED_TEST(CudaNhwcTypedTest, ConvTransposeNhwcBias) { + auto op = + ConvTransposeOp{.input_dims = {1, 8, 80, 80}, .kernel_shape = {5, 5}, .channels = 16, .bias = true}; + + MAKE_PROVIDERS_EPS_TYPE(TypeParam) +} + +TYPED_TEST(CudaNhwcTypedTest, ConvTransposeNhwcPad) { + auto op = ConvTransposeOp{.input_dims = {1, 16, 8, 8}, + .kernel_shape = {3, 3}, + .channels = 32, + .padding = {2, 2, 2, 2}, + .output_padding = {}}; + + MAKE_PROVIDERS_EPS_TYPE(TypeParam) +} + +TYPED_TEST(CudaNhwcTypedTest, ConvTransposeNhwcOutPad) { + auto op = ConvTransposeOp{.input_dims = {1, 32, 8, 8}, + .kernel_shape = {3, 3}, + .channels = 32, + .strides = {2, 2}, + .output_padding = {1, 1, 1, 1}}; + + MAKE_PROVIDERS_EPS_TYPE(TypeParam) +} + +} // namespace test +} // namespace onnxruntime diff --git a/onnxruntime/test/providers/cuda/nhwc/nhwc_cuda_helper.h b/onnxruntime/test/providers/cuda/nhwc/nhwc_cuda_helper.h new file mode 100644 index 0000000000000..2c942bb790096 --- /dev/null +++ b/onnxruntime/test/providers/cuda/nhwc/nhwc_cuda_helper.h @@ -0,0 +1,47 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Copyright (c) 2023 NVIDIA Corporation. +// Licensed under the MIT License. 
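For ConvTranspose the size computation runs the other way. For reference, the ONNX ConvTranspose output size per spatial axis is stride * (in - 1) + output_padding + dilation * (kernel - 1) + 1 - pad_begin - pad_end; a minimal stand-alone sketch, not taken from the patch:

#include <cstdint>
#include <iostream>

// ONNX ConvTranspose output size for one spatial axis.
int64_t ConvTransposeOutputDim(int64_t in, int64_t stride, int64_t kernel, int64_t dilation,
                               int64_t pad_begin, int64_t pad_end, int64_t output_padding) {
  return stride * (in - 1) + output_padding + dilation * (kernel - 1) + 1 - pad_begin - pad_end;
}

int main() {
  // ConvTransposeNhwcPad above: 8x8 input, 3x3 kernel, stride 1,
  // pads of 2 on every side, no output padding.
  std::cout << ConvTransposeOutputDim(8, 1, 3, 1, 2, 2, 0) << "\n";  // 6
  return 0;
}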
+ +#include +#include +#include + +#include "core/providers/cuda/cuda_provider_options.h" +#include "core/providers/common.h" + +#include "test/providers/compare_provider_test_utils.h" +#include "test/common/cuda_op_test_utils.h" + +#include "gtest/gtest.h" + +#define MAKE_PROVIDERS_EPS(eps) \ + std::vector> execution_providers; \ + OrtCUDAProviderOptionsV2 nhwc = {.prefer_nhwc = true}; \ + execution_providers.push_back(CudaExecutionProviderWithOptions(&nhwc)); \ + \ + double error_tolerance = eps; \ + OrtCUDAProviderOptionsV2 nchw = {.prefer_nhwc = false}; \ + auto source_ep = CudaExecutionProviderWithOptions(&nchw); \ + auto test = op.get_test(); \ + test->CompareEPs(std::move(source_ep), execution_providers, error_tolerance); + +#define MAKE_PROVIDERS() MAKE_PROVIDERS_EPS(1e-3) + +#define MAKE_PROVIDERS_EPS_TYPE(T) \ + if (std::is_same::value) { \ + MAKE_PROVIDERS_EPS(2e-2) \ + } else if (std::is_same::value) { \ + MAKE_PROVIDERS_EPS(2e-4) \ + } else { \ + MAKE_PROVIDERS_EPS(2e-3) \ + } +namespace onnxruntime { +namespace test { + +template +class CudaNhwcTypedTest : public ::testing::Test {}; + +using CudaNhwcTestTypes = ::testing::Types; // double, +TYPED_TEST_SUITE(CudaNhwcTypedTest, CudaNhwcTestTypes); +} // namespace test +} // namespace onnxruntime diff --git a/onnxruntime/test/providers/cuda/nhwc/norm_test.cc b/onnxruntime/test/providers/cuda/nhwc/norm_test.cc new file mode 100644 index 0000000000000..52da8ba557c2d --- /dev/null +++ b/onnxruntime/test/providers/cuda/nhwc/norm_test.cc @@ -0,0 +1,51 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Copyright (c) 2023 NVIDIA Corporation. +// Licensed under the MIT License. + +#include "test/providers/cuda/nhwc/nhwc_cuda_helper.h" + +namespace onnxruntime { +namespace test { + +template +struct BatchNormOp { + const std::vector input_dims; + + std::unique_ptr get_test() { + // create rand inputs + RandomValueGenerator random{}; + + auto test = std::make_unique("BatchNormalization", 14); + std::vector input_data = random.Uniform(input_dims, 0.0f, 0.3f); + auto channels = input_dims[1]; + test->AddInput("X", input_dims, input_data); + + std::vector bias_dims{channels}; + std::vector bias_data = random.Uniform(bias_dims, 0.2f, 1.0f); + test->AddInput("B", bias_dims, bias_data); + // we simply gonna reuse the bias data here. + test->AddInput("scale", bias_dims, bias_data); + + std::vector mean{channels}; + std::vector mean_data = random.Uniform(mean, 0.7f, 0.8f); + test->AddInput("input_mean", bias_dims, bias_data); + std::vector var{channels}; + std::vector var_data = random.Uniform(var, 0.0f, 0.1f); + test->AddInput("input_var", bias_dims, bias_data); + + std::vector output_data = FillZeros(input_dims); + test->AddOutput("Y", input_dims, output_data); + return test; + } +}; + +TYPED_TEST(CudaNhwcTypedTest, BatchNormNhwc) { + auto op = BatchNormOp{ + .input_dims = {4, 16, 64, 64}, + }; + + MAKE_PROVIDERS() +} + +} // namespace test +} // namespace onnxruntime diff --git a/onnxruntime/test/providers/cuda/nhwc/pool_test.cc b/onnxruntime/test/providers/cuda/nhwc/pool_test.cc new file mode 100644 index 0000000000000..3d1f81e6bc282 --- /dev/null +++ b/onnxruntime/test/providers/cuda/nhwc/pool_test.cc @@ -0,0 +1,95 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Copyright (c) 2023 NVIDIA Corporation. +// Licensed under the MIT License. 
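The BatchNormNhwc comparison above exercises the ONNX BatchNormalization inference formula, y = scale * (x - input_mean) / sqrt(input_var + epsilon) + B, applied per channel. A tiny per-element sketch (stand-alone, assuming the ONNX default epsilon of 1e-5):

#include <cmath>
#include <iostream>

// Per-element ONNX BatchNormalization (inference mode), applied per channel.
float BatchNormElement(float x, float scale, float bias, float mean, float var,
                       float epsilon = 1e-5f) {
  return scale * (x - mean) / std::sqrt(var + epsilon) + bias;
}

int main() {
  std::cout << BatchNormElement(0.25f, 1.0f, 0.0f, 0.2f, 0.04f) << "\n";  // ~0.25
  return 0;
}

Because the NHWC and NCHW kernels are expected to produce the same values, the typed tests above only compare the two CUDA EP configurations against each other within the MAKE_PROVIDERS tolerance rather than against a hand-computed reference.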
+ +#include "test/providers/cuda/nhwc/nhwc_cuda_helper.h" + +namespace onnxruntime { +namespace test { + +template +struct PoolOp { + const std::string pooling_type; + const std::vector input_dims; + const std::vector kernel_shape; + int64_t channels; + int64_t group = 1; + std::vector strides = {1, 1}; + std::vector padding = {0, 0, 0, 0}; + + std::unique_ptr get_test() { + RandomValueGenerator random{}; + + auto test = std::make_unique(pooling_type.c_str(), 14); + std::vector input_data = random.Uniform(input_dims, 0.0f, 0.3f); + + test->AddInput("X", input_dims, input_data); + + test->AddAttribute("kernel_shape", kernel_shape); + test->AddAttribute("strides", strides); + test->AddAttribute("pads", padding); + + std::vector output_dims = { + input_dims[0], channels, + (kernel_shape[1] - 1) + (input_dims[2] - 1) * strides[1] - (padding[1] + padding[0]) + 1, + (kernel_shape[0] - 1) + (input_dims[3] - 1) * strides[0] - (padding[3] + padding[2]) + 1}; + std::vector output_data = FillZeros(output_dims); + + test->AddOutput("Y", output_dims, output_data); + return test; + } +}; + +TYPED_TEST(CudaNhwcTypedTest, AveragePoolNhwc) { + auto op = PoolOp{ + .pooling_type = "AveragePool", + .input_dims = {1, 16, 64, 64}, + .kernel_shape = {3, 3}, + .channels = 16, + }; + MAKE_PROVIDERS() +} + +TYPED_TEST(CudaNhwcTypedTest, MaxPoolNhwc) { + auto op = PoolOp{ + .pooling_type = "MaxPool", + .input_dims = {1, 16, 64, 64}, + .kernel_shape = {3, 3}, + .channels = 16, + }; + MAKE_PROVIDERS() +} + +TYPED_TEST(CudaNhwcTypedTest, GlobalMaxPoolNhwc) { + RandomValueGenerator random{}; + auto test = std::make_unique("GlobalMaxPool", 14); + const std::vector input_dims = {4, 16, 4, 8}; + std::vector input_data = random.Uniform(input_dims, 0.5f, 1.3f); + test->AddInput("X", input_dims, input_data); + + std::vector output_dims = {input_dims[0], input_dims[1], 1, 1}; + std::vector output_data = FillZeros(output_dims); + test->AddOutput("Y", output_dims, output_data); + + std::vector> execution_providers; + OrtCUDAProviderOptionsV2 nhwc = {.prefer_nhwc = true}; + execution_providers.push_back(CudaExecutionProviderWithOptions(&nhwc)); + + double error_tolerance = 1e-3; + OrtCUDAProviderOptionsV2 nchw = {.prefer_nhwc = false}; + auto source_ep = CudaExecutionProviderWithOptions(&nchw); + test->CompareEPs(std::move(source_ep), execution_providers, error_tolerance); +} + +TYPED_TEST(CudaNhwcTypedTest, AveragePoolNhwcPad) { + auto op = PoolOp{.pooling_type = "AveragePool", + .input_dims = {1, 16, 64, 64}, + .kernel_shape = {3, 3}, + .channels = 16, + .padding = {2, 2, 2, 2}}; + + MAKE_PROVIDERS() +} + +} // namespace test +} // namespace onnxruntime
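For reference, ONNX defines the MaxPool/AveragePool output size in the default floor mode (dilation 1) as floor((in + pad_begin + pad_end - kernel) / stride) + 1; a stand-alone sketch of that formula, not derived from the test helper above:

#include <cstdint>
#include <iostream>

// ONNX MaxPool/AveragePool output size for one spatial axis
// (floor mode, dilation 1).
int64_t PoolOutputDim(int64_t in, int64_t kernel, int64_t stride,
                      int64_t pad_begin, int64_t pad_end) {
  return (in + pad_begin + pad_end - kernel) / stride + 1;
}

int main() {
  // 64x64 input, 3x3 window, stride 1, no padding -> 62x62 output.
  std::cout << PoolOutputDim(64, 3, 1, 0, 0) << "\n";  // 62
  return 0;
}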