diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index 1b230f9557984..f81a268d38dff 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -78,6 +78,7 @@ option(onnxruntime_USE_CUDA "Build with CUDA support" OFF) # use. If you hit any problem with that, please do not report it to GTest. Turn OFF the following build option instead. cmake_dependent_option(onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS "Build with CUDA unit tests" OFF "onnxruntime_USE_CUDA;onnxruntime_BUILD_UNIT_TESTS;LINUX" OFF) +option(onnxruntime_USE_CUDA_NHWC_OPS "Build CUDA with NHWC op support" OFF) option(onnxruntime_ENABLE_CUDA_LINE_NUMBER_INFO "When building with CUDA support, generate device code line number information." OFF) option(onnxruntime_USE_OPENVINO "Build with OpenVINO support" OFF) option(onnxruntime_USE_COREML "Build with CoreML support" OFF) @@ -671,6 +672,9 @@ set(ORT_PROVIDER_FLAGS) set(ORT_PROVIDER_CMAKE_FLAGS) if (onnxruntime_USE_CUDA) + if (onnxruntime_USE_CUDA_NHWC_OPS) + add_compile_definitions(ENABLE_CUDA_NHWC_OPS) + endif() enable_language(CUDA) message( STATUS "CMAKE_CUDA_COMPILER_VERSION: ${CMAKE_CUDA_COMPILER_VERSION}") diff --git a/cmake/onnxruntime_unittests.cmake b/cmake/onnxruntime_unittests.cmake index ac9770d7cedf8..f5f98066675fb 100644 --- a/cmake/onnxruntime_unittests.cmake +++ b/cmake/onnxruntime_unittests.cmake @@ -374,6 +374,13 @@ if (onnxruntime_USE_CUDA AND NOT onnxruntime_MINIMAL_BUILD AND NOT onnxruntime_R "${TEST_SRC_DIR}/providers/cuda/*" ) list(APPEND onnxruntime_test_providers_src ${onnxruntime_test_providers_cuda_src}) + + if (onnxruntime_USE_CUDA_NHWC_OPS) + file(GLOB onnxruntime_test_providers_cuda_nhwc_src CONFIGURE_DEPENDS + "${TEST_SRC_DIR}/providers/cuda/nhwc/*.cc" + ) + list(APPEND onnxruntime_test_providers_src ${onnxruntime_test_providers_cuda_nhwc_src}) + endif() endif() if (onnxruntime_USE_CANN) @@ -851,7 +858,7 @@ if (HAS_SHORTEN_64_TO_32 AND NOT CMAKE_SIZEOF_VOID_P EQUAL 8) endif() if (UNIX AND onnxruntime_USE_TENSORRT) - # The test_main.cc includes NvInfer.h where it has many deprecated declarations + # The test_main.cc includes NvInfer.h where it has many deprecated declarations # simply ignore them for TensorRT EP build set_property(TARGET onnxruntime_test_all APPEND_STRING PROPERTY COMPILE_FLAGS "-Wno-deprecated-declarations") endif() @@ -1294,7 +1301,7 @@ if (NOT onnxruntime_ENABLE_TRAINING_TORCH_INTEROP) endif() if (UNIX AND onnxruntime_USE_TENSORRT) - # The test_main.cc includes NvInfer.h where it has many deprecated declarations + # The test_main.cc includes NvInfer.h where it has many deprecated declarations # simply ignore them for TensorRT EP build set_property(TARGET onnxruntime_shared_lib_test APPEND_STRING PROPERTY COMPILE_FLAGS "-Wno-deprecated-declarations") endif() @@ -1583,7 +1590,7 @@ if (NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten") endif() if (UNIX AND onnxruntime_USE_TENSORRT) - # The test_main.cc includes NvInfer.h where it has many deprecated declarations + # The test_main.cc includes NvInfer.h where it has many deprecated declarations # simply ignore them for TensorRT EP build set_property(TARGET onnxruntime_customopregistration_test APPEND_STRING PROPERTY COMPILE_FLAGS "-Wno-deprecated-declarations") endif() diff --git a/include/onnxruntime/core/providers/cuda/cuda_provider_options.h b/include/onnxruntime/core/providers/cuda/cuda_provider_options.h index 5f266dd14d36d..82bb8ba83be4a 100644 --- a/include/onnxruntime/core/providers/cuda/cuda_provider_options.h +++ 
b/include/onnxruntime/core/providers/cuda/cuda_provider_options.h @@ -1,8 +1,11 @@ // Copyright (c) Microsoft Corporation. All rights reserved. +// Copyright (c) 2023 NVIDIA Corporation. // Licensed under the MIT License. #pragma once +#include + #include "onnxruntime_c_api.h" #include "core/framework/arena_extend_strategy.h" @@ -32,5 +35,6 @@ struct OrtCUDAProviderOptionsV2 { int tunable_op_max_tuning_duration_ms = 0; // Max tuning duration time limit for TunableOp. int enable_skip_layer_norm_strict_mode = 0; // flag specifying if SkipLayerNorm is in strict mode. If true, use LayerNormalization kernel. // The strict mode has better accuracy but lower performance. + int prefer_nhwc = 0; // make the CUDA EP NHWC preferred int use_ep_level_unified_stream = 0; // flag specifying if ep level stream is used or not }; diff --git a/onnxruntime/contrib_ops/cuda/conv_transpose_with_dynamic_pads.h b/onnxruntime/contrib_ops/cuda/conv_transpose_with_dynamic_pads.h index 6f7a04d059034..a768b2a7d8a24 100644 --- a/onnxruntime/contrib_ops/cuda/conv_transpose_with_dynamic_pads.h +++ b/onnxruntime/contrib_ops/cuda/conv_transpose_with_dynamic_pads.h @@ -1,4 +1,5 @@ // Copyright (c) Microsoft Corporation. All rights reserved. +// Copyright (c) 2023 NVIDIA Corporation. // Licensed under the MIT License. #pragma once @@ -10,12 +11,12 @@ namespace contrib { namespace cuda { template -class ConvTransposeWithDynamicPads : public ::onnxruntime::cuda::ConvTranspose { +class ConvTransposeWithDynamicPads : public ::onnxruntime::cuda::ConvTranspose { public: - ConvTransposeWithDynamicPads(const OpKernelInfo& info) : ::onnxruntime::cuda::ConvTranspose(info) {} + ConvTransposeWithDynamicPads(const OpKernelInfo& info) : ::onnxruntime::cuda::ConvTranspose(info) {} Status ComputeInternal(OpKernelContext* context) const override { - return ::onnxruntime::cuda::ConvTranspose::DoConvTranspose(context, true); + return ::onnxruntime::cuda::ConvTranspose::DoConvTranspose(context, true); } }; } // namespace cuda diff --git a/onnxruntime/core/optimizer/layout_transformation/layout_transformation.cc b/onnxruntime/core/optimizer/layout_transformation/layout_transformation.cc index 6c91949e467ae..290380cabb036 100644 --- a/onnxruntime/core/optimizer/layout_transformation/layout_transformation.cc +++ b/onnxruntime/core/optimizer/layout_transformation/layout_transformation.cc @@ -30,6 +30,23 @@ CostCheckResult PostLayoutTransformCostCheck(const api::GraphRef& graph, const a return OrtEPCostCheck(graph, node, perm, outputs_leading_to_transpose); } +#if defined(USE_CUDA) && ENABLE_CUDA_NHWC_OPS +const std::unordered_set& GetCUDALayoutSensitiveOps() { + static std::unordered_set cuda_nhwc_ops = []() { + return std::unordered_set{ + "BatchNormalization", + "Conv", + "ConvTranspose", + "GlobalMaxPool", + "MaxPool", + "GlobalAveragePool", + "AveragePool", + }; + }(); + return cuda_nhwc_ops; +} +#endif + /// /// Default function for checking if a node should have its layout changed. Allows EP specific adjustments to the /// default set of layout sensitive operators if required. 
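[Editor's note] For reference, a minimal sketch of how an application could opt into the new layout through the prefer_nhwc field added above, assuming a build configured with onnxruntime_USE_CUDA_NHWC_OPS=ON and the public C++ wrapper API; the function name is illustrative and the include path is the one used inside the ORT tree:

// Sketch only: enable the NHWC-preferring CUDA EP for a session.
// Assumes onnxruntime_USE_CUDA_NHWC_OPS=ON at build time; otherwise the EP
// constructor rejects prefer_nhwc (see the ORT_ENFORCE added further down).
#include <onnxruntime_cxx_api.h>
#include "core/providers/cuda/cuda_provider_options.h"

Ort::SessionOptions MakeNhwcCudaSessionOptions() {
  OrtCUDAProviderOptionsV2 cuda_options{};
  cuda_options.device_id = 0;
  cuda_options.prefer_nhwc = 1;  // new option introduced by this change
  Ort::SessionOptions session_options;
  session_options.AppendExecutionProvider_CUDA_V2(cuda_options);
  return session_options;
}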
@@ -71,11 +88,16 @@ bool ConvertNodeLayout(const api::NodeRef& node) { } #endif - // #if defined(USE_CUDA) - // if (node.GetExecutionProviderType() == kCudaExecutionProvider) { - // Update as per https://github.com/microsoft/onnxruntime/pull/17200 with CUDA ops that support NHWC - // } - // #endif +#if defined(USE_CUDA) && ENABLE_CUDA_NHWC_OPS + if (node.GetExecutionProviderType() == kCudaExecutionProvider) { + if (layout_sensitive_ops.count(node.OpType())) { + const auto& cuda_nhwc_ops = GetCUDALayoutSensitiveOps(); + if (!cuda_nhwc_ops.count(node.OpType())) { + return false; + } + } + } +#endif return layout_sensitive_ops.count(node.OpType()) != 0; } diff --git a/onnxruntime/core/providers/cpu/nn/batch_norm_helper.h b/onnxruntime/core/providers/cpu/nn/batch_norm_helper.h index 8507d87fd2442..a5d46aff83b50 100644 --- a/onnxruntime/core/providers/cpu/nn/batch_norm_helper.h +++ b/onnxruntime/core/providers/cpu/nn/batch_norm_helper.h @@ -1,4 +1,5 @@ // Copyright (c) Microsoft Corporation. All rights reserved. +// Copyright (c) 2023 NVIDIA Corporation. // Licensed under the MIT License. #pragma once @@ -22,11 +23,17 @@ class BatchNormHelper { const Tensor* B, const Tensor* mean, const Tensor* var, - bool is_spatial = true) { + bool is_spatial = true, + bool is_nhwc = false) { const auto& x_dims = X->Shape().GetDims(); // If x_dims size < 2, num_channels defaults to 1. - int64_t num_channels = x_dims.size() > 1 ? x_dims[1] : 1; + int64_t num_channels; + if (is_nhwc) { + num_channels = x_dims.size() > 1 ? x_dims[x_dims.size() - 1] : 1; + } else { + num_channels = x_dims.size() > 1 ? x_dims[1] : 1; + } // the first 2 are respectively - N and C. int num_feature_dims = x_dims.size() > 1 ? static_cast(x_dims.size() - 2) : 0; @@ -109,7 +116,7 @@ class BatchNormHelper { return common::Status::OK(); } - static void NormalizeDims(const TensorShape& x_shape, std::vector& new_dims) { + static void NormalizeDims(const TensorShape& x_shape, std::vector& new_dims, bool is_nhwc = false) { new_dims.clear(); auto orig_dims = x_shape.GetDims(); ORT_ENFORCE(orig_dims.size() < 6, @@ -122,13 +129,19 @@ class BatchNormHelper { auto rank = x_shape.NumDimensions(); auto num_samples = rank > 0 ? orig_dims[0] : 1; // NCHW - auto num_channels = rank > 1 ? orig_dims[1] : 1; - auto height = rank > 2 ? orig_dims[2] : 1; + const size_t channel_dim = is_nhwc ? rank - 1 : 1; + const size_t height_dim = is_nhwc ? 1 : 2; + auto num_channels = rank > 1 ? orig_dims[channel_dim] : 1; + auto height = rank > 2 ? orig_dims[height_dim] : 1; int64_t width = 1; - new_dims = {num_samples, num_channels, height, width}; + if (is_nhwc) { + new_dims = {num_samples, height, width, num_channels}; + } else { + new_dims = {num_samples, num_channels, height, width}; + } } }; } // namespace onnxruntime #if defined(_MSC_VER) && !defined(__clang__) #pragma warning(pop) -#endif \ No newline at end of file +#endif diff --git a/onnxruntime/core/providers/cpu/nn/conv_transpose_attributes.h b/onnxruntime/core/providers/cpu/nn/conv_transpose_attributes.h index a4d67ec63f0c2..4b3b934834ac8 100644 --- a/onnxruntime/core/providers/cpu/nn/conv_transpose_attributes.h +++ b/onnxruntime/core/providers/cpu/nn/conv_transpose_attributes.h @@ -14,6 +14,7 @@ * limitations under the License. */ /* Modifications Copyright (c) Microsoft. */ +// Copyright (c) 2023 NVIDIA Corporation. 
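[Editor's note] A distilled restatement of the indexing rule the is_nhwc branches above introduce (illustrative code, not part of ORT): channels sit at dim 1 in NCHW and at the last dim in NHWC, and the normalized 4-D shape becomes {N, H, W, C} rather than {N, C, H, W}.

// Illustrative only: channel-dim selection mirroring the is_nhwc logic above.
#include <cstddef>
#include <cstdint>
#include <vector>

inline size_t ChannelDim(size_t rank, bool is_nhwc) {
  return is_nhwc ? rank - 1 : 1;
}

int main() {
  const std::vector<int64_t> nchw{2, 3, 8, 8};  // {N, C, H, W}
  const std::vector<int64_t> nhwc{2, 8, 8, 3};  // {N, H, W, C}
  // Both describe the same tensor; each lookup yields 3 channels.
  return (nchw[ChannelDim(nchw.size(), false)] == 3 &&
          nhwc[ChannelDim(nhwc.size(), true)] == 3)
             ? 0
             : 1;
}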
#pragma once @@ -44,17 +45,19 @@ struct ConvTransposeAttributes : public ConvAttributes { }; Status PrepareForCompute(OpKernelContext* context, bool has_bias, Prepare& p, - bool dynamic_padding = false, const TensorShape* filter_shape = nullptr) const { + bool dynamic_padding = false, const TensorShape* filter_shape = nullptr, + bool is_nhwc = false) const { const Tensor* X = context->Input(0); const Tensor* F = (filter_shape != nullptr) ? nullptr : context->Input(1); const TensorShape& F_Shape = (filter_shape != nullptr) ? *filter_shape : F->Shape(); const Tensor* Pads = dynamic_padding ? context->Input(2) : nullptr; const Tensor* B = has_bias ? (dynamic_padding ? context->Input(3) : context->Input(2)) : nullptr; - TensorShape input_shape = X->Shape().Slice(2); - const int64_t num_input_channels = X->Shape()[1]; + const int rank = static_cast(X->Shape().NumDimensions()); + TensorShape input_shape = X->Shape().Slice(is_nhwc ? 1 : 2, is_nhwc ? rank - 1 : rank); + const int64_t num_input_channels = is_nhwc ? X->Shape()[rank - 1] : X->Shape()[1]; const int64_t N = X->Shape()[0]; - const int64_t num_output_channels_multiplier = F_Shape[1]; + const int64_t num_output_channels_multiplier = is_nhwc ? F_Shape[3] : F_Shape[1]; const int64_t num_output_channels = num_output_channels_multiplier * group; // input validations @@ -85,7 +88,7 @@ struct ConvTransposeAttributes : public ConvAttributes { } TensorShapeVector kernel_shape; - ORT_RETURN_IF_ERROR(ComputeKernelShape(F_Shape, kernel_shape)); + ORT_RETURN_IF_ERROR(ComputeKernelShape(F_Shape, kernel_shape, is_nhwc)); TensorShapeVector local_output_padding(output_padding); if (local_output_padding.empty()) { @@ -115,7 +118,7 @@ struct ConvTransposeAttributes : public ConvAttributes { TensorShapeVector Y_dims; ComputePadsAndOutputShape(input_shape, num_output_channels, kernel_shape, - local_strides, local_dilations, local_output_padding, N, &local_pads, &Y_dims); + local_strides, local_dilations, local_output_padding, N, &local_pads, &Y_dims, is_nhwc); TensorShape Yshape(Y_dims); Tensor* Y = context->Output(0, Yshape); @@ -137,9 +140,14 @@ struct ConvTransposeAttributes : public ConvAttributes { void ComputePadsAndOutputShape(TensorShape input_shape, int64_t output_channel, const TensorShapeVector& kernel_shape, const TensorShapeVector& p_strides, const TensorShapeVector& p_dilations, const TensorShapeVector& p_output_padding, const int64_t N, - ConvPadVector* p_pads, TensorShapeVector* output_shape_p) const { + ConvPadVector* p_pads, TensorShapeVector* output_shape_p, + bool is_nhwc = false) const { size_t output_shape_size = output_shape.size(); - output_shape_p->insert(output_shape_p->begin(), {N, output_channel}); + if (is_nhwc) { + output_shape_p->insert(output_shape_p->begin(), {N}); + } else { + output_shape_p->insert(output_shape_p->begin(), {N, output_channel}); + } size_t rank = input_shape.NumDimensions(); for (size_t dim = 0; dim < rank; ++dim) { @@ -163,6 +171,9 @@ struct ConvTransposeAttributes : public ConvAttributes { ORT_ENFORCE(dim_size > 0, "Invalid input shape: ", input_shape.ToString()); output_shape_p->push_back(dim_size); } + if (is_nhwc) { + output_shape_p->push_back(output_channel); + } } TensorShapeVector output_padding; diff --git a/onnxruntime/core/providers/cpu/nn/instance_norm_helper.h b/onnxruntime/core/providers/cpu/nn/instance_norm_helper.h index 48e54ac7eeefb..9a2a710fd291a 100644 --- a/onnxruntime/core/providers/cpu/nn/instance_norm_helper.h +++ b/onnxruntime/core/providers/cpu/nn/instance_norm_helper.h @@ -1,4 +1,5 
@@ // Copyright (c) Microsoft Corporation. All rights reserved. +// Copyright (c) 2023 NVIDIA Corporation. // Licensed under the MIT License. #pragma once @@ -8,13 +9,16 @@ #include "core/framework/tensor.h" #endif #include +#include namespace onnxruntime { class InstanceNormHelper { public: - static common::Status ValidateInputs(const Tensor* input, const Tensor* scale, const Tensor* B) { - if (input->Shape().NumDimensions() < 3) { + static common::Status ValidateInputs(const Tensor* input, const Tensor* scale, const Tensor* B, + bool is_nhwc = false) { + const auto rank = input->Shape().NumDimensions(); + if (rank < 3) { std::ostringstream ostr; ostr << "Invalid input data: number of dimensions is less than 3: " << input->Shape().NumDimensions(); return common::Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT, ostr.str()); @@ -24,10 +28,13 @@ class InstanceNormHelper { ostr << "Invalid input scale: number of dimensions is not 1: " << scale->Shape().NumDimensions(); return common::Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT, ostr.str()); } - if (scale->Shape().Size() != input->Shape().GetDims()[1]) { + auto in_dims = input->Shape().GetDims(); + auto in_channels = is_nhwc ? in_dims[rank - 1] : in_dims[1]; + + if (scale->Shape().Size() != in_channels) { std::ostringstream ostr; - ostr << "Mismatch between input data and scale: size of scale != input channel count " - << scale->Shape().Size() << " vs. " << input->Shape().GetDims()[1]; + ostr << "Mismatch between input data and scale: size of scale != input channel count " << scale->Shape().Size() + << " vs. " << in_channels; return common::Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT, ostr.str()); } @@ -37,10 +44,10 @@ class InstanceNormHelper { return common::Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT, ostr.str()); } - if (B->Shape().Size() != input->Shape().GetDims()[1]) { + if (B->Shape().Size() != in_channels) { std::ostringstream ostr; - ostr << "Mismatch between input data and B: size of B != input channel count " - << B->Shape().Size() << " vs. " << input->Shape().GetDims()[1]; + ostr << "Mismatch between input data and B: size of B != input channel count " << B->Shape().Size() << " vs. " + << in_channels; return common::Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT, ostr.str()); } diff --git a/onnxruntime/core/providers/cpu/nn/pool_attributes.h b/onnxruntime/core/providers/cpu/nn/pool_attributes.h index 54f41f09f4b24..118cb4a3ba4bd 100644 --- a/onnxruntime/core/providers/cpu/nn/pool_attributes.h +++ b/onnxruntime/core/providers/cpu/nn/pool_attributes.h @@ -1,4 +1,5 @@ // Copyright (c) Microsoft Corporation. All rights reserved. +// Copyright (c) 2023 NVIDIA Corporation. // Licensed under the MIT License. #pragma once @@ -98,28 +99,34 @@ struct PoolAttributes { TensorShapeVector SetOutputSize(const TensorShape& input_shape, int64_t output_channel, - TensorShapeVector* actual_pads) const { + TensorShapeVector* actual_pads, + bool is_nhwc = false) const { ORT_ENFORCE(input_shape.Size() > 0 || input_shape[0] == 0, "Invalid input shape. Only N can be zero. 
Got:", input_shape); TensorShapeVector output_dims; int64_t N = input_shape[0]; - InferOutputSize(input_shape.GetDims(), &output_dims, actual_pads); - - output_dims.insert(output_dims.begin(), {N, output_channel}); - + InferOutputSize(input_shape.GetDims(), &output_dims, actual_pads, is_nhwc); + if (is_nhwc) { + output_dims.insert(output_dims.begin(), N); + output_dims.push_back(output_channel); + } else { + output_dims.insert(output_dims.begin(), {N, output_channel}); + } return output_dims; } void InferOutputSize(gsl::span input_dims, TensorShapeVector* output_dims, - TensorShapeVector* actual_pads) const { + TensorShapeVector* actual_pads, + bool is_nhwc = false) const { ORT_ENFORCE(input_dims.size() >= 2); if (global_pooling) { output_dims->assign(input_dims.size() - 2, 1); } else { for (size_t dim = 0; dim < input_dims.size() - 2; ++dim) { int64_t dim_size = 0; - ComputeSizePadDilations(static_cast(input_dims[dim + 2]), + auto spatial_dim = is_nhwc ? input_dims[dim + 1] : input_dims[dim + 2]; + ComputeSizePadDilations(static_cast(spatial_dim), strides[dim], kernel_shape[dim], &actual_pads->at(dim), diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc index d90725971e393..93e18d2940fc2 100644 --- a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc +++ b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc @@ -1,4 +1,5 @@ // Copyright (c) Microsoft Corporation. All rights reserved. +// Copyright (c) 2023 NVIDIA Corporation. // Licensed under the MIT License. #include "core/common/inlined_containers.h" @@ -15,6 +16,10 @@ #include "contrib_ops/cuda/cuda_contrib_kernels.h" #endif +#ifdef ENABLE_CUDA_NHWC_OPS +#include "core/providers/cuda/cuda_nhwc_kernels.h" +#endif + #ifdef ENABLE_TRAINING_OPS #include "orttraining/training_ops/cuda/cuda_training_kernels.h" #endif @@ -233,6 +238,10 @@ CUDAExecutionProvider::CUDAExecutionProvider(const CUDAExecutionProviderInfo& in : IExecutionProvider{onnxruntime::kCudaExecutionProvider, OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, info.device_id)}, info_{info}, tuning_context_(this, &info_.tunable_op) { +#ifndef ENABLE_CUDA_NHWC_OPS + ORT_ENFORCE(info_.prefer_nhwc == 0, "This build does not support NHWC layout"); +#endif + CUDA_CALL_THROW(cudaSetDevice(info_.device_id)); // must wait GPU idle, otherwise cudaGetDeviceProperties might fail @@ -271,6 +280,10 @@ CUDAExecutionProvider::CUDAExecutionProvider(const CUDAExecutionProviderInfo& in #endif } +DataLayout CUDAExecutionProvider::GetPreferredLayout() const { + return this->IsNHWCPreferred() ? 
DataLayout::NHWC : DataLayout::NCHW; +} + CUDAExecutionProvider::~CUDAExecutionProvider() { // clean up thread local context caches { @@ -2330,6 +2343,10 @@ static Status RegisterCudaKernels(KernelRegistry& kernel_registry) { ORT_RETURN_IF_ERROR(::onnxruntime::contrib::cuda::RegisterCudaContribKernels(kernel_registry)); #endif +#ifdef ENABLE_CUDA_NHWC_OPS + ORT_RETURN_IF_ERROR(::onnxruntime::cuda::RegisterCudaNhwcKernels(kernel_registry)); +#endif + #ifdef ENABLE_TRAINING_OPS ORT_RETURN_IF_ERROR(::onnxruntime::cuda::RegisterCudaTrainingKernels(kernel_registry)); #endif diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider.h b/onnxruntime/core/providers/cuda/cuda_execution_provider.h index c9e510b7f472b..d0bb2321edf0a 100644 --- a/onnxruntime/core/providers/cuda/cuda_execution_provider.h +++ b/onnxruntime/core/providers/cuda/cuda_execution_provider.h @@ -1,4 +1,5 @@ // Copyright (c) Microsoft Corporation. All rights reserved. +// Copyright (c) 2023 NVIDIA Corporation. // Licensed under the MIT License. #pragma once @@ -32,6 +33,8 @@ class CUDAExecutionProvider : public IExecutionProvider { Status OnRunEnd(bool sync_stream) override; + DataLayout GetPreferredLayout() const override; + const void* GetExecutionHandle() const noexcept override { // The CUDA interface does not return anything interesting. return nullptr; @@ -49,6 +52,12 @@ class CUDAExecutionProvider : public IExecutionProvider { return GetPerThreadContext().CudnnHandle(); } + cudaStream_t ComputeStream() { + // this will return the CUDA EP level stream which can differ from the actual compute tasks stream + // the compute task stream is supplied within OpKernelContext during inference + return stream_; + } + template const T* GetConstOnes(size_t count, cudaStream_t stream) { return GetPerThreadContext().template GetConstOnes(count, stream); @@ -68,6 +77,7 @@ class CUDAExecutionProvider : public IExecutionProvider { bool GetCudnnConvUseMaxWorkspace() const { return info_.cudnn_conv_use_max_workspace; } bool GetCudnnConv1dPadToNc1d() const { return info_.cudnn_conv1d_pad_to_nc1d; } bool IsSkipLayerNormInStrictMode() const { return info_.enable_skip_layer_norm_strict_mode; } + bool IsNHWCPreferred() const { return info_.prefer_nhwc; } ProviderOptions GetProviderOptions() const override { return CUDAExecutionProviderInfo::ToProviderOptions(info_); diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider_info.cc b/onnxruntime/core/providers/cuda/cuda_execution_provider_info.cc index 966448051264d..daa3b5ff3d72f 100644 --- a/onnxruntime/core/providers/cuda/cuda_execution_provider_info.cc +++ b/onnxruntime/core/providers/cuda/cuda_execution_provider_info.cc @@ -1,4 +1,5 @@ // Copyright (c) Microsoft Corporation. All rights reserved. +// Copyright (c) 2023 NVIDIA Corporation. // Licensed under the MIT License. 
#include "core/providers/shared_library/provider_api.h" @@ -29,6 +30,7 @@ constexpr const char* kTunableOpEnable = "tunable_op_enable"; constexpr const char* kTunableOpTuningEnable = "tunable_op_tuning_enable"; constexpr const char* kTunableOpMaxTuningDurationMs = "tunable_op_max_tuning_duration_ms"; constexpr const char* kEnableSkipLayerNormStrictMode = "enable_skip_layer_norm_strict_mode"; +constexpr const char* kPreferNCHWMode = "prefer_nhwc"; constexpr const char* KUseEPLevelUnifiedStream = "use_ep_level_unified_stream"; } // namespace provider_option_names } // namespace cuda @@ -100,6 +102,7 @@ CUDAExecutionProviderInfo CUDAExecutionProviderInfo::FromProviderOptions(const P .AddAssignmentToReference(cuda::provider_option_names::kEnableCudaGraph, info.enable_cuda_graph) .AddAssignmentToReference(cuda::provider_option_names::kCudnnConv1dPadToNc1d, info.cudnn_conv1d_pad_to_nc1d) .AddAssignmentToReference(cuda::provider_option_names::kEnableSkipLayerNormStrictMode, info.enable_skip_layer_norm_strict_mode) + .AddAssignmentToReference(cuda::provider_option_names::kPreferNCHWMode, info.prefer_nhwc) .AddAssignmentToReference(cuda::provider_option_names::KUseEPLevelUnifiedStream, info.use_ep_level_unified_stream) .AddValueParser( cuda::provider_option_names::kTunableOpEnable, @@ -146,6 +149,7 @@ ProviderOptions CUDAExecutionProviderInfo::ToProviderOptions(const CUDAExecution {cuda::provider_option_names::kTunableOpTuningEnable, MakeStringWithClassicLocale(info.tunable_op.tuning_enable)}, {cuda::provider_option_names::kTunableOpMaxTuningDurationMs, MakeStringWithClassicLocale(info.tunable_op.max_tuning_duration_ms)}, {cuda::provider_option_names::kEnableSkipLayerNormStrictMode, MakeStringWithClassicLocale(info.enable_skip_layer_norm_strict_mode)}, + {cuda::provider_option_names::kPreferNCHWMode, MakeStringWithClassicLocale(info.prefer_nhwc)}, {cuda::provider_option_names::KUseEPLevelUnifiedStream, MakeStringWithClassicLocale(info.use_ep_level_unified_stream)}, }; @@ -165,6 +169,7 @@ ProviderOptions CUDAExecutionProviderInfo::ToProviderOptions(const OrtCUDAProvid {cuda::provider_option_names::kTunableOpEnable, MakeStringWithClassicLocale(info.tunable_op_enable)}, {cuda::provider_option_names::kTunableOpTuningEnable, MakeStringWithClassicLocale(info.tunable_op_tuning_enable)}, {cuda::provider_option_names::kTunableOpMaxTuningDurationMs, MakeStringWithClassicLocale(info.tunable_op_max_tuning_duration_ms)}, + {cuda::provider_option_names::kPreferNCHWMode, MakeStringWithClassicLocale(info.prefer_nhwc)}, {cuda::provider_option_names::KUseEPLevelUnifiedStream, MakeStringWithClassicLocale(info.use_ep_level_unified_stream)}, }; diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider_info.h b/onnxruntime/core/providers/cuda/cuda_execution_provider_info.h index 89b266f362e8d..b286f5a9161b0 100644 --- a/onnxruntime/core/providers/cuda/cuda_execution_provider_info.h +++ b/onnxruntime/core/providers/cuda/cuda_execution_provider_info.h @@ -1,4 +1,5 @@ // Copyright (c) Microsoft Corporation. All rights reserved. +// Copyright (c) 2023 NVIDIA Corporation. // Licensed under the MIT License. 
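[Editor's note] The same preference can also be set through the string-keyed provider-options API; the key matches the "prefer_nhwc" string registered above. Sketch only, with error handling trimmed:

// Sketch only: string-keyed route for enabling prefer_nhwc on the CUDA EP.
#include <onnxruntime_cxx_api.h>

void AppendNhwcCuda(Ort::SessionOptions& session_options) {
  const OrtApi& api = Ort::GetApi();
  OrtCUDAProviderOptionsV2* cuda_options = nullptr;
  Ort::ThrowOnError(api.CreateCUDAProviderOptions(&cuda_options));
  const char* keys[] = {"prefer_nhwc"};
  const char* values[] = {"1"};
  Ort::ThrowOnError(api.UpdateCUDAProviderOptions(cuda_options, keys, values, 1));
  session_options.AppendExecutionProvider_CUDA_V2(*cuda_options);
  api.ReleaseCUDAProviderOptions(cuda_options);
}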
#pragma once @@ -71,6 +72,7 @@ struct CUDAExecutionProviderInfo { cuda::TunableOpInfo tunable_op{}; bool enable_skip_layer_norm_strict_mode{false}; + bool prefer_nhwc{false}; bool use_ep_level_unified_stream{false}; diff --git a/onnxruntime/core/providers/cuda/cuda_kernel.h b/onnxruntime/core/providers/cuda/cuda_kernel.h index 58517c2850baf..f8b92eface52f 100644 --- a/onnxruntime/core/providers/cuda/cuda_kernel.h +++ b/onnxruntime/core/providers/cuda/cuda_kernel.h @@ -170,6 +170,12 @@ class CudaKernel : public OpKernel { return provider_->PerThreadDefaultCudnnHandle(); } + inline cudaStream_t DefaultCudaStream() const { + // this will return the CUDA EP level stream which can differ from the actual compute tasks stream + // the compute task stream is supplied within OpKernelContext during inference + return provider_->ComputeStream(); + } + protected: template inline const T* GetConstOnes(size_t count, cudaStream_t stream) const { diff --git a/onnxruntime/core/providers/cuda/cuda_nhwc_kernels.cc b/onnxruntime/core/providers/cuda/cuda_nhwc_kernels.cc new file mode 100644 index 0000000000000..f416caecd115f --- /dev/null +++ b/onnxruntime/core/providers/cuda/cuda_nhwc_kernels.cc @@ -0,0 +1,169 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Copyright (c) 2023 NVIDIA Corporation. +// Licensed under the MIT License. + +#ifdef ENABLE_CUDA_NHWC_OPS + +#include + +#include "core/providers/shared_library/provider_api.h" +#include "core/providers/cuda/cuda_fwd.h" + +#include "core/providers/cuda/cuda_nhwc_kernels.h" + +namespace onnxruntime::cuda { + +// When adding new supported NHWC operations make sure to also integrate them into: ConvertNodeLayout +// in onnxruntime/core/optimizer/layout_transformation/layout_transformation.cc + +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 7, 8, float, + BatchNormalization); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 7, 8, MLFloat16, + BatchNormalization); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 9, 13, float, + BatchNormalization); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 9, 13, MLFloat16, + BatchNormalization); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 1, 10, float, + Conv); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 1, 10, MLFloat16, + Conv); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 1, 10, float, + ConvTranspose); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 1, 10, MLFloat16, + ConvTranspose); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 7, 9, float, + AveragePool); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 7, 9, MLFloat16, + AveragePool); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 1, float, GlobalAveragePool); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 1, MLFloat16, + GlobalAveragePool); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 1, 7, float, + MaxPool); +class 
ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 1, 7, MLFloat16, + MaxPool); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 8, 9, float, + MaxPool); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 8, 9, MLFloat16, + MaxPool); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 1, float, GlobalMaxPool); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 1, MLFloat16, GlobalMaxPool); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 10, 10, float, + AveragePool); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 10, 10, MLFloat16, + AveragePool); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 10, 10, float, + MaxPool); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 10, 10, MLFloat16, + MaxPool); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 11, float, Conv); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 11, MLFloat16, Conv); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 11, float, ConvTranspose); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 11, MLFloat16, + ConvTranspose); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 11, float, AveragePool); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 11, MLFloat16, AveragePool); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 11, 11, float, + MaxPool); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 11, 11, MLFloat16, + MaxPool); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 12, float, MaxPool); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 12, MLFloat16, MaxPool); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 14, 14, float, + BatchNormalization); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 14, 14, MLFloat16, + BatchNormalization); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 15, float, + BatchNormalization); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 15, MLFloat16, + BatchNormalization); + +Status RegisterCudaNhwcKernels(KernelRegistry& kernel_registry) { + static const BuildKernelCreateInfoFn nhwc_function_table[] = { + BuildKernelCreateInfo, // default entry to avoid the list become empty after ops-reducing + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + 
BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + }; + + for (auto& function_table_entry : nhwc_function_table) { + KernelCreateInfo info = function_table_entry(); + if (info.kernel_def != nullptr) { // filter disabled entries where type is void + ORT_RETURN_IF_ERROR(kernel_registry.Register(std::move(info))); + } + } + return Status::OK(); +} +} // namespace onnxruntime::cuda +#endif diff --git a/onnxruntime/core/providers/cuda/cuda_nhwc_kernels.h b/onnxruntime/core/providers/cuda/cuda_nhwc_kernels.h new file mode 100644 index 0000000000000..0b3a6d5cff0c7 --- /dev/null +++ b/onnxruntime/core/providers/cuda/cuda_nhwc_kernels.h @@ -0,0 +1,13 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Copyright (c) 2023 NVIDIA Corporation. +// Licensed under the MIT License. + +#pragma once + +#include "core/common/status.h" + +namespace onnxruntime::cuda { + +onnxruntime::common::Status RegisterCudaNhwcKernels(onnxruntime::KernelRegistry& kernel_registry); + +} // namespace onnxruntime::cuda diff --git a/onnxruntime/core/providers/cuda/cuda_provider_factory.cc b/onnxruntime/core/providers/cuda/cuda_provider_factory.cc index 734a890c20cda..892e8d5329eba 100644 --- a/onnxruntime/core/providers/cuda/cuda_provider_factory.cc +++ b/onnxruntime/core/providers/cuda/cuda_provider_factory.cc @@ -1,4 +1,5 @@ // Copyright (c) Microsoft Corporation. All rights reserved. +// Copyright (c) 2023 NVIDIA Corporation. // Licensed under the MIT License. #include "core/providers/shared_library/provider_api.h" @@ -217,6 +218,7 @@ struct CUDA_Provider : Provider { info.default_memory_arena_cfg = params->default_memory_arena_cfg; info.cudnn_conv_use_max_workspace = params->cudnn_conv_use_max_workspace != 0; info.enable_cuda_graph = params->enable_cuda_graph != 0; + info.prefer_nhwc = params->prefer_nhwc; info.cudnn_conv1d_pad_to_nc1d = params->cudnn_conv1d_pad_to_nc1d != 0; info.tunable_op.enable = params->tunable_op_enable; info.tunable_op.tuning_enable = params->tunable_op_tuning_enable; @@ -254,6 +256,7 @@ struct CUDA_Provider : Provider { cuda_options.enable_cuda_graph = internal_options.enable_cuda_graph; cuda_options.cudnn_conv1d_pad_to_nc1d = internal_options.cudnn_conv1d_pad_to_nc1d; cuda_options.enable_skip_layer_norm_strict_mode = internal_options.enable_skip_layer_norm_strict_mode; + cuda_options.prefer_nhwc = internal_options.prefer_nhwc; cuda_options.use_ep_level_unified_stream = internal_options.use_ep_level_unified_stream; } diff --git a/onnxruntime/core/providers/cuda/cudnn_common.cc b/onnxruntime/core/providers/cuda/cudnn_common.cc index fc02a6509bf24..4df59a98b12e5 100644 --- a/onnxruntime/core/providers/cuda/cudnn_common.cc +++ b/onnxruntime/core/providers/cuda/cudnn_common.cc @@ -1,7 +1,10 @@ // Copyright (c) Microsoft Corporation. All rights reserved. +// Copyright (c) 2023 NVIDIA Corporation. // Licensed under the MIT License. 
-#include "cudnn_common.h" +#include + +#include "core/providers/cuda/cudnn_common.h" #include "core/common/inlined_containers.h" #include "core/common/gsl.h" #include "shared_inc/cuda_call.h" @@ -27,7 +30,7 @@ Status CudnnTensor::CreateTensorIfNeeded() { return Status::OK(); } -Status CudnnTensor::Set(gsl::span input_dims, cudnnDataType_t dataType) { +Status CudnnTensor::Set(gsl::span input_dims, cudnnDataType_t dataType, bool is_nhwc) { ORT_RETURN_IF_ERROR(CreateTensorIfNeeded()); int rank = gsl::narrow_cast(input_dims.size()); @@ -38,6 +41,10 @@ Status CudnnTensor::Set(gsl::span input_dims, cudnnDataType_t dat dims[i] = gsl::narrow_cast(input_dims[i]); strides[i] = gsl::narrow_cast(pitches[i]); } + if (is_nhwc) { + std::swap(dims[1], dims[rank - 1]); + std::swap(strides[1], strides[rank - 1]); + } CUDNN_RETURN_IF_ERROR(cudnnSetTensorNdDescriptor(tensor_, dataType, static_cast(rank), dims.data(), strides.data())); return Status::OK(); } diff --git a/onnxruntime/core/providers/cuda/cudnn_common.h b/onnxruntime/core/providers/cuda/cudnn_common.h index ba75ab4f2c029..8a94a334ee688 100644 --- a/onnxruntime/core/providers/cuda/cudnn_common.h +++ b/onnxruntime/core/providers/cuda/cudnn_common.h @@ -1,4 +1,5 @@ // Copyright (c) Microsoft Corporation. All rights reserved. +// Copyright (c) 2023 NVIDIA Corporation. // Licensed under the MIT License. #pragma once @@ -16,7 +17,7 @@ class CudnnTensor final { ~CudnnTensor(); ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(CudnnTensor); - Status Set(gsl::span input_dims, cudnnDataType_t dataType); + Status Set(gsl::span input_dims, cudnnDataType_t dataType, bool is_nhwc = false); Status Set(const CudnnTensor& x_desc, cudnnBatchNormMode_t mode); // Set 4D tensor format (for NHWC) Status Set(cudnnTensorFormat_t format, cudnnDataType_t dataType, int n, int c, int h, int w); diff --git a/onnxruntime/core/providers/cuda/nn/batch_norm.cc b/onnxruntime/core/providers/cuda/nn/batch_norm.cc index 4f22b5298a30a..c468971e1e426 100644 --- a/onnxruntime/core/providers/cuda/nn/batch_norm.cc +++ b/onnxruntime/core/providers/cuda/nn/batch_norm.cc @@ -1,4 +1,5 @@ // Copyright (c) Microsoft Corporation. All rights reserved. +// Copyright (c) 2023 NVIDIA Corporation. // Licensed under the MIT License. 
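[Editor's note] Since cudnnSetTensorNdDescriptor receives explicit strides, a channels-last layout is expressed by giving the channel dimension stride 1, which is what the dims/strides swap above achieves. As a standalone reference (not the ORT helper), a packed NHWC 4-D tensor can be described like this:

// Reference sketch: Nd descriptor for a packed NHWC buffer (assumes float data).
#include <cudnn.h>

cudnnStatus_t SetPackedNhwcDesc(cudnnTensorDescriptor_t desc, int n, int c, int h, int w) {
  const int dims[4] = {n, c, h, w};                 // logical N, C, H, W
  const int strides[4] = {h * w * c, 1, w * c, c};  // channel stride is 1
  return cudnnSetTensorNdDescriptor(desc, CUDNN_DATA_FLOAT, 4, dims, strides);
}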
#include "batch_norm.h" @@ -11,38 +12,38 @@ using namespace std; namespace onnxruntime { namespace cuda { -#define REGISTER_KERNEL_TYPED(T) \ +#define REGISTER_KERNEL_TYPED(T, DOMAIN, NHWC) \ ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( \ BatchNormalization, \ - kOnnxDomain, \ + DOMAIN, \ 7, 8, \ T, \ kCudaExecutionProvider, \ (*KernelDefBuilder::Create()) \ .TypeConstraint("T", DataTypeImpl::GetTensorType()), \ - BatchNorm); \ + BatchNorm); \ ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( \ BatchNormalization, \ - kOnnxDomain, \ + DOMAIN, \ 9, 13, \ T, \ kCudaExecutionProvider, \ (*KernelDefBuilder::Create()) \ .TypeConstraint("T", DataTypeImpl::GetTensorType()), \ - BatchNorm); \ + BatchNorm); \ ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( \ BatchNormalization, \ - kOnnxDomain, \ + DOMAIN, \ 14, 14, \ T, \ kCudaExecutionProvider, \ (*KernelDefBuilder::Create()) \ .TypeConstraint("T", DataTypeImpl::GetTensorType()) \ .TypeConstraint("U", DataTypeImpl::GetTensorType()), \ - BatchNorm); \ + BatchNorm); \ ONNX_OPERATOR_TYPED_KERNEL_EX( \ BatchNormalization, \ - kOnnxDomain, \ + DOMAIN, \ 15, \ T, \ kCudaExecutionProvider, \ @@ -50,10 +51,10 @@ namespace cuda { .TypeConstraint("T", DataTypeImpl::GetTensorType()) \ .TypeConstraint("T1", DataTypeImpl::GetTensorType()) \ .TypeConstraint("T2", DataTypeImpl::GetTensorType()), \ - BatchNorm); + BatchNorm); -template -Status BatchNorm::ComputeInternal(OpKernelContext* p_op_kernel_context) const { +template +Status BatchNorm::ComputeInternal(OpKernelContext* p_op_kernel_context) const { typedef typename ToCudaType::MappedType CudaT; const Tensor* X = p_op_kernel_context->Input(0); @@ -62,7 +63,7 @@ Status BatchNorm::ComputeInternal(OpKernelContext* p_op_kernel_context) const const Tensor* mean = p_op_kernel_context->Input(3); const Tensor* var = p_op_kernel_context->Input(4); - ORT_RETURN_IF_ERROR(BatchNormHelper::ValidateInputs(X, scale, B, mean, var, spatial_ == 1)); + ORT_RETURN_IF_ERROR(BatchNormHelper::ValidateInputs(X, scale, B, mean, var, spatial_ == 1, NHWC)); const TensorShape& x_shape = X->Shape(); const TensorShape& channel_shape = mean->Shape(); @@ -87,7 +88,7 @@ Status BatchNorm::ComputeInternal(OpKernelContext* p_op_kernel_context) const CudnnTensor data_desc; vector new_dims; BatchNormHelper::NormalizeDims(x_shape, new_dims); - ORT_RETURN_IF_ERROR(data_desc.Set(new_dims, CudnnTensor::GetDataType())); + ORT_RETURN_IF_ERROR(data_desc.Set(new_dims, CudnnTensor::GetDataType(), NHWC)); // For half data type, the alpha, beta, scale, B, mean, var need to be float type if (X->IsDataType()) { @@ -97,7 +98,7 @@ Status BatchNorm::ComputeInternal(OpKernelContext* p_op_kernel_context) const ORT_RETURN_IF_ERROR(bn_tensor_desc.Set(data_desc, cudnn_batch_norm_mode_)); // Convert the scale, B, mean, var to float - const int64_t C = x_shape.GetDims()[1]; + const int64_t C = x_shape.GetDims()[NHWC ? 
3 : 1]; auto f_scale = GetScratchBuffer(C, p_op_kernel_context->GetComputeStream()); auto f_B = GetScratchBuffer(C, p_op_kernel_context->GetComputeStream()); auto f_mean = GetScratchBuffer(C, p_op_kernel_context->GetComputeStream()); @@ -175,13 +176,17 @@ Status BatchNorm::ComputeInternal(OpKernelContext* p_op_kernel_context) const return Status::OK(); } -#define SPECIALIZED_COMPUTE(T) \ - REGISTER_KERNEL_TYPED(T) \ - template Status BatchNorm::ComputeInternal(OpKernelContext* ctx) const; +#define SPECIALIZED_COMPUTE(T, DOMAIN, NHWC) \ + REGISTER_KERNEL_TYPED(T, DOMAIN, NHWC) \ + template Status BatchNorm::ComputeInternal(OpKernelContext* ctx) const; -SPECIALIZED_COMPUTE(float) -SPECIALIZED_COMPUTE(double) -SPECIALIZED_COMPUTE(MLFloat16) +SPECIALIZED_COMPUTE(float, kOnnxDomain, false) +SPECIALIZED_COMPUTE(double, kOnnxDomain, false) +SPECIALIZED_COMPUTE(MLFloat16, kOnnxDomain, false) +#ifdef ENABLE_CUDA_NHWC_OPS +SPECIALIZED_COMPUTE(float, kMSInternalNHWCDomain, true) +SPECIALIZED_COMPUTE(MLFloat16, kMSInternalNHWCDomain, true) +#endif } // namespace cuda } // namespace onnxruntime diff --git a/onnxruntime/core/providers/cuda/nn/batch_norm.h b/onnxruntime/core/providers/cuda/nn/batch_norm.h index 99da7652a1d24..4eb9fb74d3761 100644 --- a/onnxruntime/core/providers/cuda/nn/batch_norm.h +++ b/onnxruntime/core/providers/cuda/nn/batch_norm.h @@ -1,4 +1,5 @@ // Copyright (c) Microsoft Corporation. All rights reserved. +// Copyright (c) 2023 NVIDIA Corporation. // Licensed under the MIT License. #pragma once @@ -9,7 +10,7 @@ namespace onnxruntime { namespace cuda { -template +template class BatchNorm final : public CudaKernel { public: BatchNorm(const OpKernelInfo& op_kernel_info) diff --git a/onnxruntime/core/providers/cuda/nn/conv.cc b/onnxruntime/core/providers/cuda/nn/conv.cc index 81db3c4186282..82f3503919237 100644 --- a/onnxruntime/core/providers/cuda/nn/conv.cc +++ b/onnxruntime/core/providers/cuda/nn/conv.cc @@ -1,38 +1,47 @@ // Copyright (c) Microsoft Corporation. All rights reserved. +// Copyright (c) 2023 NVIDIA Corporation. // Licensed under the MIT License. +#include + #include "core/providers/cuda/nn/conv.h" #include "core/common/span_utils.h" #include "core/providers/cuda/cuda_common.h" #include "core/providers/cuda/shared_inc/fpgeneric.h" #include "core/providers/cuda/tensor/slice.h" +#include "core/providers/cuda/tensor/transpose.h" namespace onnxruntime { namespace cuda { // Op Set 11 for Conv only update document to clearify default dilations and strides value. // which are already convered by op set 11 cpu versoin, so simply add declaration. 
-#define REGISTER_KERNEL_TYPED(T) \ +#define REGISTER_KERNEL_TYPED(T, DOMAIN, NHWC) \ ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( \ Conv, \ - kOnnxDomain, \ + DOMAIN, \ 1, 10, \ T, \ kCudaExecutionProvider, \ (*KernelDefBuilder::Create()).TypeConstraint("T", DataTypeImpl::GetTensorType()), \ - Conv); \ + Conv); \ ONNX_OPERATOR_TYPED_KERNEL_EX( \ Conv, \ - kOnnxDomain, \ + DOMAIN, \ 11, \ T, \ kCudaExecutionProvider, \ (*KernelDefBuilder::Create()).TypeConstraint("T", DataTypeImpl::GetTensorType()), \ - Conv); + Conv); + +REGISTER_KERNEL_TYPED(float, kOnnxDomain, false) +REGISTER_KERNEL_TYPED(double, kOnnxDomain, false) +REGISTER_KERNEL_TYPED(MLFloat16, kOnnxDomain, false) -REGISTER_KERNEL_TYPED(float) -REGISTER_KERNEL_TYPED(double) -REGISTER_KERNEL_TYPED(MLFloat16) +#ifdef ENABLE_CUDA_NHWC_OPS +REGISTER_KERNEL_TYPED(float, kMSInternalNHWCDomain, true) +REGISTER_KERNEL_TYPED(MLFloat16, kMSInternalNHWCDomain, true) +#endif template const cudnnConvolutionFwdAlgo_t Conv::kAllAlgos[] = { @@ -86,6 +95,39 @@ Status SliceOutUnwantedOutputSection(cudaStream_t stream, return SliceCuda::Impl(stream, input_data, input_dims, output_data, compute_metadata, element_size); } +template +Status Conv::PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc, + bool& is_packed, [[maybe_unused]] PrePackedWeights* prepacked_weights) { + is_packed = false; + // only layout of weight input is adjusted via PrePack + if (NHWC && is_nhwc_domain_) { // InputTensors::IN_W + if (input_idx == 1) { + // Transpose from {M, C/group, kH, kW} to {M, kH, kW, C/group} + auto orig_shape = tensor.Shape(); + + InlinedVector perm{0, 2, 3, 1}; + gsl::span permutation(perm.data(), 4); + TensorShapeVector new_dims{orig_shape[0], + orig_shape[2], + orig_shape[3], + orig_shape[1]}; + W_ = Tensor::Create(tensor.DataType(), TensorShape(new_dims), std::move(alloc)); + + auto status = cuda::Transpose::DoTranspose(GetDeviceProp(), + DefaultCudaStream(), + DefaultCublasHandle(), + permutation, tensor, *W_); + if (!status.IsOK()) { + return status; + } + CUDA_CALL_THROW(cudaStreamSynchronize(DefaultCudaStream())); + is_packed = true; + } + } + + return Status::OK(); +} + template Status Conv::UpdateState(OpKernelContext* context, bool bias_expected) const { // set X @@ -95,7 +137,12 @@ Status Conv::UpdateState(OpKernelContext* context, bool bias_expected) s_.x_data = reinterpret_cast(X->Data()); s_.element_size = X->DataType()->Size(); // set W - const Tensor* W = context->Input(1); + const Tensor* W; + if (!W_) { + W = context->Input(1); + } else { + W = W_.get(); + } const TensorShape& w_shape = W->Shape(); auto w_dims = w_shape.AsShapeVector(); s_.w_data = reinterpret_cast(W->Data()); diff --git a/onnxruntime/core/providers/cuda/nn/conv.h b/onnxruntime/core/providers/cuda/nn/conv.h index 07825b93204ca..bcaa4d855b81e 100644 --- a/onnxruntime/core/providers/cuda/nn/conv.h +++ b/onnxruntime/core/providers/cuda/nn/conv.h @@ -1,13 +1,16 @@ // Copyright (c) Microsoft Corporation. All rights reserved. +// Copyright (c) 2023 NVIDIA Corporation. // Licensed under the MIT License. 
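[Editor's note] Conv::PrePack above re-lays out the weight tensor once on the GPU via cuda::Transpose with perm {0, 2, 3, 1}. A plain-CPU reference of the index mapping it performs, with shapes and names that are illustrative only:

// Illustrative CPU equivalent of the {M, C/group, kH, kW} -> {M, kH, kW, C/group}
// weight re-layout; the PR does this with cuda::Transpose::DoTranspose instead.
#include <cstdint>
#include <vector>

std::vector<float> WeightsToMHWC(const std::vector<float>& w_mchw,
                                 int64_t M, int64_t C, int64_t kH, int64_t kW) {
  std::vector<float> w_mhwc(w_mchw.size());
  for (int64_t m = 0; m < M; ++m)
    for (int64_t c = 0; c < C; ++c)
      for (int64_t h = 0; h < kH; ++h)
        for (int64_t w = 0; w < kW; ++w)
          w_mhwc[((m * kH + h) * kW + w) * C + c] =     // destination: {M, kH, kW, C}
              w_mchw[((m * C + c) * kH + h) * kW + w];  // source:      {M, C, kH, kW}
  return w_mhwc;
}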
#pragma once +#include +#include + #include "core/platform/ort_mutex.h" #include "core/providers/cuda/cuda_kernel.h" #include "core/providers/cuda/cudnn_common.h" #include "core/providers/cpu/nn/conv_attributes.h" -#include namespace onnxruntime { @@ -187,8 +190,12 @@ class Conv : public CudaKernel { Conv(const OpKernelInfo& info) : CudaKernel(info), conv_attrs_(info) { auto pads_size = conv_attrs_.pads.size(); ORT_ENFORCE(pads_size % 2 == 0); + is_nhwc_domain_ = info.node().Domain() == kMSInternalNHWCDomain; } + Status PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc, + bool& is_packed, [[maybe_unused]] PrePackedWeights* prepacked_weights) override; + Status ComputeInternal(OpKernelContext* context) const override; protected: @@ -201,6 +208,8 @@ class Conv : public CudaKernel { mutable CudnnConvState s_; constexpr static auto kDefaultConvAlgo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM; static const cudnnConvolutionFwdAlgo_t kAllAlgos[]; + std::unique_ptr W_; + bool is_nhwc_domain_; // prepack is only needed for the Conv in kMSInternalNHWCDomain }; Status SliceOutUnwantedOutputSection(cudaStream_t stream, diff --git a/onnxruntime/core/providers/cuda/nn/conv_transpose.cc b/onnxruntime/core/providers/cuda/nn/conv_transpose.cc index 04f6bc46dcfcc..55dceaa2698e8 100644 --- a/onnxruntime/core/providers/cuda/nn/conv_transpose.cc +++ b/onnxruntime/core/providers/cuda/nn/conv_transpose.cc @@ -1,7 +1,11 @@ // Copyright (c) Microsoft Corporation. All rights reserved. +// Copyright (c) 2023 NVIDIA Corporation. // Licensed under the MIT License. +#include + #include "conv_transpose.h" +#include "core/providers/cuda/tensor/transpose.h" // To suppress FP static analyzer warnings: // https://msdata.visualstudio.com/Vienna/_workitems/edit/1944928 and @@ -17,35 +21,59 @@ namespace cuda { // Op Set 11 for ConvTranspose only update document to clarify default dilations and strides value. // which are already covered by op set 11 cpu version, so simply add declaration. 
-#define REGISTER_KERNEL_TYPED(T) \ - ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( \ - ConvTranspose, \ - kOnnxDomain, \ - 1, 10, \ - T, \ - kCudaExecutionProvider, \ - (*KernelDefBuilder::Create()).TypeConstraint("T", DataTypeImpl::GetTensorType()), \ - ConvTranspose); \ - ONNX_OPERATOR_TYPED_KERNEL_EX( \ - ConvTranspose, \ - kOnnxDomain, \ - 11, \ - T, \ - kCudaExecutionProvider, \ - (*KernelDefBuilder::Create()).TypeConstraint("T", DataTypeImpl::GetTensorType()), \ - ConvTranspose); - -REGISTER_KERNEL_TYPED(float) -REGISTER_KERNEL_TYPED(double) -REGISTER_KERNEL_TYPED(MLFloat16) - -template -Status ConvTranspose::ComputeInternal(OpKernelContext* context) const { +#define REGISTER_KERNEL_TYPED(T, DOMAIN, NHWC) \ + ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( \ + ConvTranspose, DOMAIN, 1, 10, T, kCudaExecutionProvider, \ + (*KernelDefBuilder::Create()).TypeConstraint("T", DataTypeImpl::GetTensorType()), ConvTranspose); \ + ONNX_OPERATOR_TYPED_KERNEL_EX(ConvTranspose, DOMAIN, 11, T, kCudaExecutionProvider, \ + (*KernelDefBuilder::Create()).TypeConstraint("T", DataTypeImpl::GetTensorType()), \ + ConvTranspose); + +REGISTER_KERNEL_TYPED(float, kOnnxDomain, false) +REGISTER_KERNEL_TYPED(double, kOnnxDomain, false) +REGISTER_KERNEL_TYPED(MLFloat16, kOnnxDomain, false) + +#ifdef ENABLE_CUDA_NHWC_OPS +REGISTER_KERNEL_TYPED(float, kMSInternalNHWCDomain, true) +REGISTER_KERNEL_TYPED(MLFloat16, kMSInternalNHWCDomain, true) +#endif + +template +Status ConvTranspose::ComputeInternal(OpKernelContext* context) const { return DoConvTranspose(context, false); } -template -Status ConvTranspose::DoConvTranspose(OpKernelContext* context, bool dynamic_padding) const { +template +Status ConvTranspose::PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc, bool& is_packed, + [[maybe_unused]] PrePackedWeights* prepacked_weights) { + is_packed = false; + // only layout of weight input is adjusted via PrePack + if (NHWC) { // InputTensors::IN_W + if (input_idx == 1) { + // Transpose from {M, C/group, kH, kW} to {M, kH, kW, C/group} + auto orig_shape = tensor.Shape(); + + InlinedVector perm{0, 2, 3, 1}; + gsl::span permutation(perm.data(), 4); + TensorShapeVector new_dims{orig_shape[0], orig_shape[2], orig_shape[3], orig_shape[1]}; + W_ = Tensor::Create(tensor.DataType(), TensorShape(new_dims), std::move(alloc)); + + auto status = cuda::Transpose::DoTranspose(GetDeviceProp(), DefaultCudaStream(), DefaultCublasHandle(), + permutation, tensor, *W_); + + if (!status.IsOK()) { + return status; + } + CUDA_CALL_THROW(cudaStreamSynchronize(DefaultCudaStream())); + is_packed = true; + } + } + + return Status::OK(); +} + +template +Status ConvTranspose::DoConvTranspose(OpKernelContext* context, bool dynamic_padding) const { typedef typename ToCudaType::MappedType CudaT; const Tensor* X = context->Input(0); @@ -59,7 +87,12 @@ Status ConvTranspose::DoConvTranspose(OpKernelContext* context, bool dynamic_ return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Input X must be 3-, 4- or 5-dimensional.", " X: ", X->Shape().ToString().c_str()); } - const Tensor* W = context->Input(1); + const Tensor* W; + if (!W_) { + W = context->Input(1); + } else { + W = W_.get(); + } const TensorShape& w_shape = W->Shape(); TensorShapeVector w_dims = w_shape.AsShapeVector(); auto w_data = reinterpret_cast(W->Data()); @@ -80,8 +113,7 @@ Status ConvTranspose::DoConvTranspose(OpKernelContext* context, bool dynamic_ bool input_dims_changed = (s_.last_x_dims.AsShapeVector() != x_dims); bool w_dims_changed = (s_.last_w_dims.AsShapeVector() != 
w_dims); if (input_dims_changed || w_dims_changed) { - if (input_dims_changed) - s_.last_x_dims = gsl::make_span(x_dims); + if (input_dims_changed) s_.last_x_dims = gsl::make_span(x_dims); if (w_dims_changed) { s_.last_w_dims = gsl::make_span(w_dims); @@ -89,7 +121,8 @@ Status ConvTranspose::DoConvTranspose(OpKernelContext* context, bool dynamic_ } ConvTransposeAttributes::Prepare p; - ORT_RETURN_IF_ERROR(conv_transpose_attrs_.PrepareForCompute(context, has_bias, p, dynamic_padding)); + ORT_RETURN_IF_ERROR( + conv_transpose_attrs_.PrepareForCompute(context, has_bias, p, dynamic_padding, &w_shape, NHWC)); auto y_dims = p.Y->Shape().AsShapeVector(); if (x_dimensions == 3) { @@ -103,7 +136,13 @@ Status ConvTranspose::DoConvTranspose(OpKernelContext* context, bool dynamic_ s_.y_dims = gsl::make_span(y_dims); if (w_dims_changed) { - ORT_RETURN_IF_ERROR(s_.w_desc.Set(w_dims, CudnnTensor::GetDataType())); + if (NHWC) { + ORT_RETURN_IF_ERROR(s_.w_desc.Set(CUDNN_TENSOR_NHWC, CudnnTensor::GetDataType(), + static_cast(w_dims[0]), static_cast(w_dims[3]), + static_cast(w_dims[1]), static_cast(w_dims[2]))); + } else { + ORT_RETURN_IF_ERROR(s_.w_desc.Set(w_dims, CudnnTensor::GetDataType())); + } } // Special case when there is a dim value of 0 in the shape. @@ -113,31 +152,39 @@ Status ConvTranspose::DoConvTranspose(OpKernelContext* context, bool dynamic_ if (p.Y->Shape().Size() == 0) { return Status::OK(); } - - ORT_RETURN_IF_ERROR(s_.x_tensor.Set(x_dims, CudnnTensor::GetDataType())); - ORT_RETURN_IF_ERROR(s_.y_tensor.Set(y_dims, CudnnTensor::GetDataType())); + if (NHWC) { + ORT_RETURN_IF_ERROR(s_.x_tensor.Set(CUDNN_TENSOR_NHWC, CudnnTensor::GetDataType(), + static_cast(x_dims[0]), static_cast(x_dims[3]), + static_cast(x_dims[1]), static_cast(x_dims[2]))); + ORT_RETURN_IF_ERROR(s_.y_tensor.Set(CUDNN_TENSOR_NHWC, CudnnTensor::GetDataType(), + static_cast(y_dims[0]), static_cast(y_dims[3]), + static_cast(y_dims[1]), static_cast(y_dims[2]))); + } else { + ORT_RETURN_IF_ERROR(s_.x_tensor.Set(x_dims, CudnnTensor::GetDataType())); + ORT_RETURN_IF_ERROR(s_.y_tensor.Set(y_dims, CudnnTensor::GetDataType())); + } cudnnConvolutionMode_t mode = CUDNN_CROSS_CORRELATION; ORT_RETURN_IF_ERROR(s_.conv_desc.Set(p.kernel_shape.size(), p.pads, p.strides, p.dilations, - gsl::narrow_cast(conv_transpose_attrs_.group), - mode, CudnnTensor::GetDataType())); + gsl::narrow_cast(conv_transpose_attrs_.group), mode, + CudnnTensor::GetDataType())); if (has_bias) { const auto& b_shape = p.B->Shape(); ORT_RETURN_IF_NOT(b_shape.NumDimensions() == 1, "bias should be 1D"); TensorShapeVector b_dims(2 + p.kernel_shape.size()); - b_dims[0] = 1; // N - b_dims[1] = b_shape[0]; // C - for (size_t i = 0; i < p.kernel_shape.size(); i++) - b_dims[2 + i] = 1; + b_dims[0] = 1; // N + b_dims[NHWC ? 3 : 1] = b_shape[0]; // C + for (size_t i = 0; i < p.kernel_shape.size(); i++) b_dims[(NHWC ? 
1 : 2) + i] = 1; - ORT_RETURN_IF_ERROR(s_.b_tensor.Set(b_dims, CudnnTensor::GetDataType())); + ORT_RETURN_IF_ERROR(s_.b_tensor.Set(b_dims, CudnnTensor::GetDataType(), NHWC)); } y_data = reinterpret_cast(p.Y->MutableData()); if (!s_.cached_benchmark_results.contains(x_dims)) { - IAllocatorUniquePtr algo_search_workspace = GetScratchBuffer(AlgoSearchWorkspaceSize, context->GetComputeStream()); + IAllocatorUniquePtr algo_search_workspace = + GetScratchBuffer(AlgoSearchWorkspaceSize, context->GetComputeStream()); // set math type to tensor core before algorithm search if constexpr (std::is_same::value) @@ -146,19 +193,8 @@ Status ConvTranspose::DoConvTranspose(OpKernelContext* context, bool dynamic_ cudnnConvolutionBwdDataAlgoPerf_t perf; int algo_count = 1; CUDNN_RETURN_IF_ERROR(cudnnFindConvolutionBackwardDataAlgorithmEx( - GetCudnnHandle(context), - s_.w_desc, - w_data, - s_.x_tensor, - x_data, - s_.conv_desc, - s_.y_tensor, - y_data, - 1, - &algo_count, - &perf, - algo_search_workspace.get(), - AlgoSearchWorkspaceSize)); + GetCudnnHandle(context), s_.w_desc, w_data, s_.x_tensor, x_data, s_.conv_desc, s_.y_tensor, y_data, 1, + &algo_count, &perf, algo_search_workspace.get(), AlgoSearchWorkspaceSize)); s_.cached_benchmark_results.insert(x_dims, {perf.algo, perf.memory, perf.mathType}); } @@ -189,26 +225,15 @@ Status ConvTranspose::DoConvTranspose(OpKernelContext* context, bool dynamic_ IAllocatorUniquePtr workspace = GetScratchBuffer(s_.workspace_bytes, context->GetComputeStream()); - CUDNN_RETURN_IF_ERROR( - cudnnConvolutionBackwardData( - GetCudnnHandle(context), - &alpha, - s_.w_desc, - w_data, - s_.x_tensor, - x_data, - s_.conv_desc, - s_.algo, - workspace.get(), - s_.workspace_bytes, - &beta, - s_.y_tensor, - y_data)); + CUDNN_RETURN_IF_ERROR(cudnnConvolutionBackwardData(GetCudnnHandle(context), &alpha, s_.w_desc, w_data, s_.x_tensor, + x_data, s_.conv_desc, s_.algo, workspace.get(), + s_.workspace_bytes, &beta, s_.y_tensor, y_data)); if (has_bias) { const Tensor* B = dynamic_padding ? context->Input(3) : context->Input(2); auto b_data = reinterpret_cast(B->Data()); - CUDNN_RETURN_IF_ERROR(cudnnAddTensor(GetCudnnHandle(context), &alpha, s_.b_tensor, b_data, &alpha, s_.y_tensor, y_data)); + CUDNN_RETURN_IF_ERROR( + cudnnAddTensor(GetCudnnHandle(context), &alpha, s_.b_tensor, b_data, &alpha, s_.y_tensor, y_data)); } } diff --git a/onnxruntime/core/providers/cuda/nn/conv_transpose.h b/onnxruntime/core/providers/cuda/nn/conv_transpose.h index 165d548d27fa2..77c9d94162b6b 100644 --- a/onnxruntime/core/providers/cuda/nn/conv_transpose.h +++ b/onnxruntime/core/providers/cuda/nn/conv_transpose.h @@ -1,8 +1,11 @@ // Copyright (c) Microsoft Corporation. All rights reserved. +// Copyright (c) 2023 NVIDIA Corporation. // Licensed under the MIT License. 
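[Editor's note] For the 4-D NHWC descriptors above, cudnnSetTensor4dDescriptor always takes logical N, C, H, W and the format flag declares the memory layout, which is why the NHWC-shaped dims {N, H, W, C} are passed as dims[0], dims[3], dims[1], dims[2]. A standalone sketch of that call, not ORT's CudnnTensor wrapper:

// Reference sketch: 4-D descriptor for NHWC data (assumes float).
#include <cudnn.h>

cudnnStatus_t SetNhwc4dDesc(cudnnTensorDescriptor_t desc, int n, int c, int h, int w) {
  return cudnnSetTensor4dDescriptor(desc, CUDNN_TENSOR_NHWC, CUDNN_DATA_FLOAT, n, c, h, w);
}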
#pragma once +#include + #include "core/providers/cuda/cuda_common.h" #include "core/providers/cuda/cuda_kernel.h" #include "core/providers/cuda/cudnn_common.h" @@ -12,10 +15,12 @@ namespace onnxruntime { namespace cuda { -template +template class ConvTranspose : public CudaKernel { public: ConvTranspose(const OpKernelInfo& info) : CudaKernel(info), conv_transpose_attrs_(info){}; + Status PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc, + bool& is_packed, [[maybe_unused]] PrePackedWeights* prepacked_weights) override; Status ComputeInternal(OpKernelContext* context) const override; Status DoConvTranspose(OpKernelContext* context, bool dynamic_padding) const; @@ -23,6 +28,7 @@ class ConvTranspose : public CudaKernel { ConvTransposeAttributes conv_transpose_attrs_; mutable CudnnConvState s_; + std::unique_ptr W_; }; } // namespace cuda diff --git a/onnxruntime/core/providers/cuda/nn/pool.cc b/onnxruntime/core/providers/cuda/nn/pool.cc index e632ef20bce43..8bc96958693bc 100644 --- a/onnxruntime/core/providers/cuda/nn/pool.cc +++ b/onnxruntime/core/providers/cuda/nn/pool.cc @@ -1,4 +1,5 @@ // Copyright (c) Microsoft Corporation. All rights reserved. +// Copyright (c) 2023 NVIDIA Corporation. // Licensed under the MIT License. #include "core/providers/shared_library/provider_api.h" @@ -11,92 +12,99 @@ using namespace onnxruntime::common; namespace onnxruntime { namespace cuda { -#define POOLING_KERNEL(op_name, data_type, pool_type, since_version) \ +#define POOLING_KERNEL(op_name, data_type, pool_type, since_version, op_domain, nhwc) \ ONNX_OPERATOR_TYPED_KERNEL_EX( \ - op_name, \ - kOnnxDomain, \ - since_version, \ - data_type, \ - kCudaExecutionProvider, \ + op_name, op_domain, since_version, data_type, kCudaExecutionProvider, \ (*KernelDefBuilder::Create()).TypeConstraint("T", DataTypeImpl::GetTensorType()), \ - Pool); - -#define POOLING_KERNEL_VERSIONED(op_name, data_type, pool_type, since_version, end_version) \ - ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( \ - op_name, \ - kOnnxDomain, \ - since_version, \ - end_version, \ - data_type, \ - kCudaExecutionProvider, \ - (*KernelDefBuilder::Create()) \ - .TypeConstraint("T", DataTypeImpl::GetTensorType()), \ - Pool); - -#define POOLING_KERNEL_WITH_INDICES(op_name, data_type, pool_type, since_version) \ - ONNX_OPERATOR_TYPED_KERNEL_EX( \ - op_name, \ - kOnnxDomain, \ - since_version, \ - data_type, \ - kCudaExecutionProvider, \ - (*KernelDefBuilder::Create()) \ - .TypeConstraint("T", DataTypeImpl::GetTensorType()) \ - .TypeConstraint("I", DataTypeImpl::GetTensorType()), \ - Pool); - -#define POOLING_KERNEL_VERSIONED_WITH_INDICES(op_name, data_type, pool_type, since_version, end_version) \ - ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( \ - op_name, \ - kOnnxDomain, \ - since_version, \ - end_version, \ - data_type, \ - kCudaExecutionProvider, \ - (*KernelDefBuilder::Create()) \ - .TypeConstraint("T", DataTypeImpl::GetTensorType()) \ - .TypeConstraint("I", DataTypeImpl::GetTensorType()), \ - Pool); - -POOLING_KERNEL_VERSIONED(AveragePool, float, AveragePool, 7, 9) -POOLING_KERNEL_VERSIONED(AveragePool, double, AveragePool, 7, 9) -POOLING_KERNEL_VERSIONED(AveragePool, MLFloat16, AveragePool, 7, 9) -POOLING_KERNEL_VERSIONED(AveragePool, float, AveragePool, 10, 10) -POOLING_KERNEL_VERSIONED(AveragePool, double, AveragePool, 10, 10) -POOLING_KERNEL_VERSIONED(AveragePool, MLFloat16, AveragePool, 10, 10) + Pool); + +#define POOLING_KERNEL_VERSIONED(op_name, data_type, pool_type, since_version, end_version, op_domain, nhwc) \ + 
ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( \ + op_name, op_domain, since_version, end_version, data_type, kCudaExecutionProvider, \ + (*KernelDefBuilder::Create()).TypeConstraint("T", DataTypeImpl::GetTensorType()), \ + Pool); + +#define POOLING_KERNEL_WITH_INDICES(op_name, data_type, pool_type, since_version, op_domain, nhwc) \ + ONNX_OPERATOR_TYPED_KERNEL_EX(op_name, op_domain, since_version, data_type, kCudaExecutionProvider, \ + (*KernelDefBuilder::Create()) \ + .TypeConstraint("T", DataTypeImpl::GetTensorType()) \ + .TypeConstraint("I", DataTypeImpl::GetTensorType()), \ + Pool); + +#define POOLING_KERNEL_VERSIONED_WITH_INDICES(op_name, data_type, pool_type, since_version, end_version, op_domain, \ + nhwc) \ + ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX(op_name, op_domain, since_version, end_version, data_type, \ + kCudaExecutionProvider, \ + (*KernelDefBuilder::Create()) \ + .TypeConstraint("T", DataTypeImpl::GetTensorType()) \ + .TypeConstraint("I", DataTypeImpl::GetTensorType()), \ + Pool); + +POOLING_KERNEL_VERSIONED(AveragePool, float, AveragePool, 7, 9, kOnnxDomain, false) +POOLING_KERNEL_VERSIONED(AveragePool, double, AveragePool, 7, 9, kOnnxDomain, false) +POOLING_KERNEL_VERSIONED(AveragePool, MLFloat16, AveragePool, 7, 9, kOnnxDomain, false) +POOLING_KERNEL_VERSIONED(AveragePool, float, AveragePool, 10, 10, kOnnxDomain, false) +POOLING_KERNEL_VERSIONED(AveragePool, double, AveragePool, 10, 10, kOnnxDomain, false) +POOLING_KERNEL_VERSIONED(AveragePool, MLFloat16, AveragePool, 10, 10, kOnnxDomain, false) // AveragePool and MaxPool op set 11 only update spec document on default value for dilations and strides. -POOLING_KERNEL(AveragePool, float, AveragePool, 11) -POOLING_KERNEL(AveragePool, double, AveragePool, 11) -POOLING_KERNEL(AveragePool, MLFloat16, AveragePool, 11) -POOLING_KERNEL(GlobalAveragePool, float, AveragePool, 1) -POOLING_KERNEL(GlobalAveragePool, double, AveragePool, 1) -POOLING_KERNEL(GlobalAveragePool, MLFloat16, AveragePool, 1) -POOLING_KERNEL_VERSIONED(MaxPool, float, MaxPool<1>, 1, 7) -POOLING_KERNEL_VERSIONED(MaxPool, double, MaxPool<1>, 1, 7) -POOLING_KERNEL_VERSIONED(MaxPool, MLFloat16, MaxPool<1>, 1, 7) -POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, float, MaxPool<8>, 8, 9) -POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, double, MaxPool<8>, 8, 9) -POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, MLFloat16, MaxPool<8>, 8, 9) -POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, float, MaxPool<8>, 10, 10) -POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, double, MaxPool<8>, 10, 10) -POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, MLFloat16, MaxPool<8>, 10, 10) -POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, float, MaxPool<8>, 11, 11) -POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, double, MaxPool<8>, 11, 11) -POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, MLFloat16, MaxPool<8>, 11, 11) -POOLING_KERNEL_WITH_INDICES(MaxPool, float, MaxPool<8>, 12) -POOLING_KERNEL_WITH_INDICES(MaxPool, double, MaxPool<8>, 12) -POOLING_KERNEL_WITH_INDICES(MaxPool, MLFloat16, MaxPool<8>, 12) -POOLING_KERNEL_WITH_INDICES(MaxPool, int8_t, MaxPool<8>, 12) -POOLING_KERNEL_WITH_INDICES(MaxPool, uint8_t, MaxPool<8>, 12) - -POOLING_KERNEL(GlobalMaxPool, float, MaxPool<1>, 1) -POOLING_KERNEL(GlobalMaxPool, double, MaxPool<1>, 1) -POOLING_KERNEL(GlobalMaxPool, MLFloat16, MaxPool<1>, 1) +POOLING_KERNEL(AveragePool, float, AveragePool, 11, kOnnxDomain, false) +POOLING_KERNEL(AveragePool, double, AveragePool, 11, kOnnxDomain, false) +POOLING_KERNEL(AveragePool, MLFloat16, AveragePool, 11, kOnnxDomain, 
false) +POOLING_KERNEL(GlobalAveragePool, float, AveragePool, 1, kOnnxDomain, false) +POOLING_KERNEL(GlobalAveragePool, double, AveragePool, 1, kOnnxDomain, false) +POOLING_KERNEL(GlobalAveragePool, MLFloat16, AveragePool, 1, kOnnxDomain, false) +POOLING_KERNEL_VERSIONED(MaxPool, float, MaxPool<1>, 1, 7, kOnnxDomain, false) +POOLING_KERNEL_VERSIONED(MaxPool, double, MaxPool<1>, 1, 7, kOnnxDomain, false) +POOLING_KERNEL_VERSIONED(MaxPool, MLFloat16, MaxPool<1>, 1, 7, kOnnxDomain, false) +POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, float, MaxPool<8>, 8, 9, kOnnxDomain, false) +POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, double, MaxPool<8>, 8, 9, kOnnxDomain, false) +POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, MLFloat16, MaxPool<8>, 8, 9, kOnnxDomain, false) +POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, float, MaxPool<8>, 10, 10, kOnnxDomain, false) +POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, double, MaxPool<8>, 10, 10, kOnnxDomain, false) +POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, MLFloat16, MaxPool<8>, 10, 10, kOnnxDomain, false) +POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, float, MaxPool<8>, 11, 11, kOnnxDomain, false) +POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, double, MaxPool<8>, 11, 11, kOnnxDomain, false) +POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, MLFloat16, MaxPool<8>, 11, 11, kOnnxDomain, false) +POOLING_KERNEL_WITH_INDICES(MaxPool, float, MaxPool<8>, 12, kOnnxDomain, false) +POOLING_KERNEL_WITH_INDICES(MaxPool, double, MaxPool<8>, 12, kOnnxDomain, false) +POOLING_KERNEL_WITH_INDICES(MaxPool, MLFloat16, MaxPool<8>, 12, kOnnxDomain, false) +POOLING_KERNEL_WITH_INDICES(MaxPool, int8_t, MaxPool<8>, 12, kOnnxDomain, false) +POOLING_KERNEL_WITH_INDICES(MaxPool, uint8_t, MaxPool<8>, 12, kOnnxDomain, false) + +POOLING_KERNEL(GlobalMaxPool, float, MaxPool<1>, 1, kOnnxDomain, false) +POOLING_KERNEL(GlobalMaxPool, double, MaxPool<1>, 1, kOnnxDomain, false) +POOLING_KERNEL(GlobalMaxPool, MLFloat16, MaxPool<1>, 1, kOnnxDomain, false) + +// NHWC variants +#ifdef ENABLE_CUDA_NHWC_OPS +POOLING_KERNEL_VERSIONED(MaxPool, float, MaxPool<1>, 1, 7, kMSInternalNHWCDomain, true) +POOLING_KERNEL_VERSIONED(MaxPool, MLFloat16, MaxPool<1>, 1, 7, kMSInternalNHWCDomain, true) +POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, float, MaxPool<8>, 8, 9, kMSInternalNHWCDomain, true) +POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, MLFloat16, MaxPool<8>, 8, 9, kMSInternalNHWCDomain, true) +POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, float, MaxPool<8>, 10, 10, kMSInternalNHWCDomain, true) +POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, MLFloat16, MaxPool<8>, 10, 10, kMSInternalNHWCDomain, true) +POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, float, MaxPool<8>, 11, 11, kMSInternalNHWCDomain, true) +POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, MLFloat16, MaxPool<8>, 11, 11, kMSInternalNHWCDomain, true) +POOLING_KERNEL_WITH_INDICES(MaxPool, float, MaxPool<8>, 12, kMSInternalNHWCDomain, true) +POOLING_KERNEL_WITH_INDICES(MaxPool, MLFloat16, MaxPool<8>, 12, kMSInternalNHWCDomain, true) + +POOLING_KERNEL(GlobalMaxPool, float, MaxPool<1>, 1, kMSInternalNHWCDomain, true) +POOLING_KERNEL(GlobalMaxPool, MLFloat16, MaxPool<1>, 1, kMSInternalNHWCDomain, true) + +POOLING_KERNEL_VERSIONED(AveragePool, float, AveragePool, 7, 9, kMSInternalNHWCDomain, true) +POOLING_KERNEL_VERSIONED(AveragePool, MLFloat16, AveragePool, 7, 9, kMSInternalNHWCDomain, true) +POOLING_KERNEL_VERSIONED(AveragePool, float, AveragePool, 10, 10, kMSInternalNHWCDomain, true) +POOLING_KERNEL_VERSIONED(AveragePool, MLFloat16, 
AveragePool, 10, 10, kMSInternalNHWCDomain, true) +// AveragePool and MaxPool op set 11 only update spec document on default value for dilations +POOLING_KERNEL(AveragePool, float, AveragePool, 11, kMSInternalNHWCDomain, true) +POOLING_KERNEL(AveragePool, MLFloat16, AveragePool, 11, kMSInternalNHWCDomain, true) +POOLING_KERNEL(GlobalAveragePool, float, AveragePool, 1, kMSInternalNHWCDomain, true) +POOLING_KERNEL(GlobalAveragePool, MLFloat16, AveragePool, 1, kMSInternalNHWCDomain, true) +#endif class CudnnPoolingDescriptor final { public: - CudnnPoolingDescriptor() : desc_(nullptr) { - } + CudnnPoolingDescriptor() : desc_(nullptr) {} ~CudnnPoolingDescriptor() { if (desc_ != nullptr) { @@ -108,12 +116,9 @@ class CudnnPoolingDescriptor final { CudnnPoolingDescriptor(const CudnnPoolingDescriptor&) = delete; CudnnPoolingDescriptor& operator=(const CudnnPoolingDescriptor&) = delete; - Status Set(cudnnPoolingMode_t mode, - const gsl::span& kernel_shape, - const gsl::span& pads, - const gsl::span& strides) { - if (!desc_) - CUDNN_RETURN_IF_ERROR(cudnnCreatePoolingDescriptor(&desc_)); + Status Set(cudnnPoolingMode_t mode, const gsl::span& kernel_shape, + const gsl::span& pads, const gsl::span& strides) { + if (!desc_) CUDNN_RETURN_IF_ERROR(cudnnCreatePoolingDescriptor(&desc_)); int rank = gsl::narrow_cast(kernel_shape.size()); InlinedVector window(rank); @@ -128,14 +133,8 @@ class CudnnPoolingDescriptor final { for (int i = 0; i < rank; i++) { stride[i] = gsl::narrow_cast(strides[i]); } - CUDNN_RETURN_IF_ERROR(SetPoolingNdDescriptorHelper( - desc_, - mode, - CUDNN_PROPAGATE_NAN, - rank, - window.data(), - padding.data(), - stride.data())); + CUDNN_RETURN_IF_ERROR(SetPoolingNdDescriptorHelper(desc_, mode, CUDNN_PROPAGATE_NAN, rank, window.data(), + padding.data(), stride.data())); return Status::OK(); } @@ -146,8 +145,8 @@ class CudnnPoolingDescriptor final { cudnnPoolingDescriptor_t desc_; }; -template -Status Pool::ComputeInternal(OpKernelContext* context) const { +template +Status Pool::ComputeInternal(OpKernelContext* context) const { typedef typename ToCudaType::MappedType CudaT; const Tensor* X = context->Input(0); const TensorShape& x_shape = X->Shape(); @@ -166,13 +165,12 @@ Status Pool::ComputeInternal(OpKernelContext* context) const { pads.assign(kernel_shape.size(), 0); strides.assign(kernel_shape.size(), 1); } - - auto y_dims = pool_attrs_.SetOutputSize(x_shape, x_shape[1], &pads); + auto out_channel = NHWC ? x_shape[3] : x_shape[1]; + auto y_dims = pool_attrs_.SetOutputSize(x_shape, out_channel, &pads, NHWC); TensorShape y_shape(y_dims); Tensor* Y = context->Output(0, y_shape); // special case when there is a dim value of 0 in the shape. 
- if (y_shape.Size() == 0) - return Status::OK(); + if (y_shape.Size() == 0) return Status::OK(); auto x_data = reinterpret_cast(X->Data()); auto y_data = reinterpret_cast(Y->MutableData()); @@ -181,12 +179,19 @@ Status Pool::ComputeInternal(OpKernelContext* context) const { TensorShapeVector y_dims_cudnn(y_dims); if (kernel_shape.size() < 2) { // cudnn only takes 4D or 5D input, so pad dimensions if needed - x_dims_cudnn.push_back(1); - y_dims_cudnn.push_back(1); + if (NHWC) { + x_dims_cudnn.insert(x_dims_cudnn.begin() + 1, 1); + y_dims_cudnn.insert(y_dims_cudnn.begin() + 1, 1); + kernel_shape.insert(kernel_shape.begin() + 1, 1); + strides.insert(strides.begin() + 1, 1); + } else { + x_dims_cudnn.push_back(1); + y_dims_cudnn.push_back(1); + kernel_shape.push_back(1); + strides.push_back(1); + } pads.insert(pads.begin() + kernel_shape.size(), 0); pads.insert(pads.end(), 0); - kernel_shape.push_back(1); - strides.push_back(1); } cudnnPoolingMode_t mode = CUDNN_POOLING_MAX; @@ -203,8 +208,8 @@ Status Pool::ComputeInternal(OpKernelContext* context) const { const auto beta = Consts::Zero; CudnnTensor x_tensor; CudnnTensor y_tensor; - ORT_RETURN_IF_ERROR(x_tensor.Set(x_dims_cudnn, CudnnTensor::GetDataType())); - ORT_RETURN_IF_ERROR(y_tensor.Set(y_dims_cudnn, CudnnTensor::GetDataType())); + ORT_RETURN_IF_ERROR(x_tensor.Set(x_dims_cudnn, CudnnTensor::GetDataType(), NHWC)); + ORT_RETURN_IF_ERROR(y_tensor.Set(y_dims_cudnn, CudnnTensor::GetDataType(), NHWC)); const auto input_count = x_shape.Size(); const auto output_count = y_shape.Size(); @@ -212,24 +217,26 @@ Status Pool::ComputeInternal(OpKernelContext* context) const { IAllocatorUniquePtr temp_X = GetScratchBuffer(input_count, context->GetComputeStream()); auto temp_Y = GetScratchBuffer(output_count, context->GetComputeStream()); Impl_Cast(Stream(context), reinterpret_cast(x_data), temp_X.get(), input_count); - CUDNN_RETURN_IF_ERROR(PoolingForwardHelper(GetCudnnHandle(context), pooling_desc, &alpha, x_tensor, temp_X.get(), &beta, y_tensor, temp_Y.get())); + CUDNN_RETURN_IF_ERROR(PoolingForwardHelper(GetCudnnHandle(context), pooling_desc, &alpha, x_tensor, temp_X.get(), + &beta, y_tensor, temp_Y.get())); Impl_Cast(Stream(context), temp_Y.get(), y_data, output_count); } else { const auto alpha = Consts::One; const auto beta = Consts::Zero; CudnnTensor x_tensor; CudnnTensor y_tensor; - ORT_RETURN_IF_ERROR(x_tensor.Set(x_dims_cudnn, CudnnTensor::GetDataType())); - ORT_RETURN_IF_ERROR(y_tensor.Set(y_dims_cudnn, CudnnTensor::GetDataType())); + ORT_RETURN_IF_ERROR(x_tensor.Set(x_dims_cudnn, CudnnTensor::GetDataType(), NHWC)); + ORT_RETURN_IF_ERROR(y_tensor.Set(y_dims_cudnn, CudnnTensor::GetDataType(), NHWC)); - CUDNN_RETURN_IF_ERROR(PoolingForwardHelper(GetCudnnHandle(context), pooling_desc, &alpha, x_tensor, x_data, &beta, y_tensor, y_data)); + CUDNN_RETURN_IF_ERROR( + PoolingForwardHelper(GetCudnnHandle(context), pooling_desc, &alpha, x_tensor, x_data, &beta, y_tensor, y_data)); } return Status::OK(); } -template -Status Pool>::ComputeInternal(OpKernelContext* context) const { +template +Status Pool, NHWC>::ComputeInternal(OpKernelContext* context) const { typedef typename ToCudaType::MappedType CudaT; const Tensor* X = context->Input(0); const TensorShape& x_shape = X->Shape(); @@ -248,13 +255,12 @@ Status Pool>::ComputeInternal(OpKernelContext* context) const { pads.assign(kernel_shape.size(), 0); strides.assign(kernel_shape.size(), 1); } - - auto y_dims = this->pool_attrs_.SetOutputSize(x_shape, x_shape[1], &pads); + auto out_channel = NHWC ? 
x_shape[3] : x_shape[1]; + auto y_dims = this->pool_attrs_.SetOutputSize(x_shape, out_channel, &pads, NHWC); Tensor* Y = context->Output(0, TensorShape(y_dims)); // special case when there is a dim value of 0 in the shape. - if (Y->Shape().Size() == 0) - return Status::OK(); + if (Y->Shape().Size() == 0) return Status::OK(); auto x_data = reinterpret_cast(X->Data()); auto y_data = reinterpret_cast(Y->MutableData()); @@ -262,20 +268,10 @@ Status Pool>::ComputeInternal(OpKernelContext* context) const { Tensor* I = context->Output(1, TensorShape(y_dims)); if (nullptr != I || !this->pool_attrs_.default_dilations) { auto i_data = nullptr == I ? nullptr : I->MutableData(); - MaxPoolWithIndex( - this->Stream(context), - x_shape, - TensorShape(y_dims), - kernel_shape, - strides, - pads, - this->pool_attrs_.dilations, - this->pool_attrs_.storage_order, - x_data, - y_data, - i_data); + MaxPoolWithIndex(this->Stream(context), x_shape, TensorShape(y_dims), kernel_shape, strides, pads, + this->pool_attrs_.dilations, this->pool_attrs_.storage_order, x_data, y_data, i_data); } else { - ORT_RETURN_IF_ERROR((Pool>::ComputeInternal(context))); + ORT_RETURN_IF_ERROR((Pool, NHWC>::ComputeInternal(context))); } return Status::OK(); } diff --git a/onnxruntime/core/providers/cuda/nn/pool.h b/onnxruntime/core/providers/cuda/nn/pool.h index fb223c18d2625..8b5152a1565a9 100644 --- a/onnxruntime/core/providers/cuda/nn/pool.h +++ b/onnxruntime/core/providers/cuda/nn/pool.h @@ -1,4 +1,5 @@ // Copyright (c) Microsoft Corporation. All rights reserved. +// Copyright (c) 2023 NVIDIA Corporation. // Licensed under the MIT License. #pragma once @@ -10,7 +11,7 @@ namespace onnxruntime { namespace cuda { -template +template class Pool : public CudaKernel, public PoolBase { public: Pool(const OpKernelInfo& info) : CudaKernel(info), PoolBase(info) {} @@ -18,10 +19,10 @@ class Pool : public CudaKernel, public PoolBase { Status ComputeInternal(OpKernelContext* context) const override; }; -template -class Pool> final : public Pool> { +template +class Pool, NHWC> final : public Pool, NHWC> { public: - Pool(const OpKernelInfo& info) : Pool>(info) {} + explicit Pool(const OpKernelInfo& info) : Pool, NHWC>(info) {} Status ComputeInternal(OpKernelContext* context) const override; }; diff --git a/onnxruntime/core/providers/shared_library/provider_api.h b/onnxruntime/core/providers/shared_library/provider_api.h index 85599fab808b3..76533a0061702 100644 --- a/onnxruntime/core/providers/shared_library/provider_api.h +++ b/onnxruntime/core/providers/shared_library/provider_api.h @@ -240,6 +240,7 @@ struct DeleteOnUnloadPtr { constexpr const char* kOnnxDomain = ""; constexpr const char* kMSDomain = "com.microsoft"; +constexpr const char* kMSInternalNHWCDomain = "com.ms.internal.nhwc"; constexpr const char* kPytorchAtenDomain = "org.pytorch.aten"; constexpr const char* kNGraphDomain = "com.intel.ai"; constexpr const char* kCudaExecutionProvider = "CUDAExecutionProvider"; diff --git a/onnxruntime/core/session/provider_bridge_ort.cc b/onnxruntime/core/session/provider_bridge_ort.cc index 8d30202faaeb3..d950223f2d108 100644 --- a/onnxruntime/core/session/provider_bridge_ort.cc +++ b/onnxruntime/core/session/provider_bridge_ort.cc @@ -1332,6 +1332,7 @@ OrtCUDAProviderOptionsV2 OrtCUDAProviderOptionsToOrtCUDAProviderOptionsV2(const // Use default value as this field is not available in OrtCUDAProviderOptions cuda_options_converted.cudnn_conv_use_max_workspace = 1; cuda_options_converted.enable_cuda_graph = 0; + cuda_options_converted.prefer_nhwc = 
0; cuda_options_converted.cudnn_conv1d_pad_to_nc1d = 0; cuda_options_converted.enable_skip_layer_norm_strict_mode = 0; cuda_options_converted.use_ep_level_unified_stream = 0; diff --git a/onnxruntime/test/perftest/command_args_parser.cc b/onnxruntime/test/perftest/command_args_parser.cc index 56f924ed351fb..b1a04a00e89b1 100644 --- a/onnxruntime/test/perftest/command_args_parser.cc +++ b/onnxruntime/test/perftest/command_args_parser.cc @@ -1,4 +1,5 @@ // Copyright (c) Microsoft Corporation. All rights reserved. +// Copyright (c) 2023 NVIDIA Corporation. // Licensed under the MIT License. #include "command_args_parser.h" @@ -53,8 +54,8 @@ namespace perftest { "\t-o [optimization level]: Default is 99 (all). Valid values are 0 (disable), 1 (basic), 2 (extended), 99 (all).\n" "\t\tPlease see onnxruntime_c_api.h (enum GraphOptimizationLevel) for the full list of all optimization levels.\n" "\t-u [optimized_model_path]: Specify the optimized model path for saving.\n" - "\t-d [cudnn_conv_algorithm]: Specify CUDNN convolution algorithms: 0(benchmark), 1(heuristic), 2(default). \n" - "\t-q: [CUDA only] use separate stream for copy. \n" + "\t-d [CUDA only][cudnn_conv_algorithm]: Specify CUDNN convolution algorithms: 0(benchmark), 1(heuristic), 2(default). \n" + "\t-q [CUDA only] use separate stream for copy. \n" "\t-z: Set denormal as zero. When turning on this option reduces latency dramatically, a model may have denormals.\n" "\t-i: Specify EP specific runtime options as key value pairs. Different runtime options available are: \n" "\t [OpenVINO only] [device_type]: Overrides the accelerator hardware type and precision with these values at runtime.\n" diff --git a/onnxruntime/test/perftest/ort_test_session.cc b/onnxruntime/test/perftest/ort_test_session.cc index 1111a92a385fd..a7f0b7584a211 100644 --- a/onnxruntime/test/perftest/ort_test_session.cc +++ b/onnxruntime/test/perftest/ort_test_session.cc @@ -1,3 +1,7 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Copyright (c) 2023 NVIDIA Corporation. +// Licensed under the MIT License. 
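With prefer_nhwc now part of the V2 CUDA provider options, an application can opt in through the string-based options API. A hedged sketch of what that looks like from user code, assuming the option is exposed under the string key "prefer_nhwc" (matching the struct field name, as with the other V2 options):

#include <onnxruntime_cxx_api.h>

int main() {
  Ort::Env env{ORT_LOGGING_LEVEL_WARNING, "nhwc_demo"};
  Ort::SessionOptions session_options;

  const auto& api = Ort::GetApi();
  OrtCUDAProviderOptionsV2* cuda_options = nullptr;
  Ort::ThrowOnError(api.CreateCUDAProviderOptions(&cuda_options));

  // Assumption: the NHWC preference is parsed from the key "prefer_nhwc".
  const char* keys[] = {"prefer_nhwc"};
  const char* values[] = {"1"};
  Ort::ThrowOnError(api.UpdateCUDAProviderOptions(cuda_options, keys, values, 1));

  session_options.AppendExecutionProvider_CUDA_V2(*cuda_options);
  api.ReleaseCUDAProviderOptions(cuda_options);

  Ort::Session session{env, ORT_TSTR("model.onnx"), session_options};
  return 0;
}

The perf-test changes below forward arbitrary "-i key|value" pairs to the same UpdateCUDAProviderOptions call, so the option should also be reachable from the command line without further parser changes.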
+ #include "ort_test_session.h" #include #include @@ -93,11 +97,66 @@ OnnxRuntimeTestSession::OnnxRuntimeTestSession(Ort::Env& env, std::random_device #endif } else if (provider_name == onnxruntime::kCudaExecutionProvider) { #ifdef USE_CUDA - OrtCUDAProviderOptions cuda_options; - cuda_options.cudnn_conv_algo_search = static_cast(performance_test_config.run_config.cudnn_conv_algo); - cuda_options.do_copy_in_default_stream = !performance_test_config.run_config.do_cuda_copy_in_separate_stream; - // TODO: Support arena configuration for users of perf test - session_options.AppendExecutionProvider_CUDA(cuda_options); + const auto& api = Ort::GetApi(); + OrtCUDAProviderOptionsV2* cuda_options; + Ort::ThrowOnError(api.CreateCUDAProviderOptions(&cuda_options)); + + const char* cudnn_conv_algo_search = "cudnn_conv_algo_search"; + const char* default_conv = "DEFAULT"; + const char* benchmarking = "EXHAUSTIVE"; + const char* heuristic = "HEURISTIC"; + switch (performance_test_config.run_config.cudnn_conv_algo) { + case 0: + Ort::ThrowOnError( + api.UpdateCUDAProviderOptions(cuda_options, &cudnn_conv_algo_search, &benchmarking, 1)); + break; + case 1: + Ort::ThrowOnError( + api.UpdateCUDAProviderOptions(cuda_options, &cudnn_conv_algo_search, &heuristic, 1)); + break; + default: + Ort::ThrowOnError( + api.UpdateCUDAProviderOptions(cuda_options, &cudnn_conv_algo_search, &default_conv, 1)); + break; + } + + const char* do_copy_in_default_stream = "do_copy_in_default_stream"; + if (performance_test_config.run_config.do_cuda_copy_in_separate_stream) { + const char* v = "1"; + Ort::ThrowOnError( + api.UpdateCUDAProviderOptions(cuda_options, &do_copy_in_default_stream, &v, 1)); + } else { + const char* v = "0"; + Ort::ThrowOnError( + api.UpdateCUDAProviderOptions(cuda_options, &do_copy_in_default_stream, &v, 1)); + } + +#ifdef _MSC_VER + std::string ov_string = ToUTF8String(performance_test_config.run_config.ep_runtime_config_string); +#else + std::string ov_string = performance_test_config.run_config.ep_runtime_config_string; +#endif + std::istringstream ss(ov_string); + std::string token; + while (ss >> token) { + if (token == "") { + continue; + } + auto pos = token.find("|"); + if (pos == std::string::npos || pos == 0 || pos == token.length()) { + ORT_THROW( + "[ERROR] [CUDA] Use a '|' to separate the key and value for the run-time option you are trying to use.\n"); + } + + auto key = token.substr(0, pos); + auto value = token.substr(pos + 1); + auto key_p = key.c_str(); + auto value_p = value.c_str(); + Ort::ThrowOnError( + api.UpdateCUDAProviderOptions(cuda_options, &key_p, &value_p, 1)); + } + + session_options.AppendExecutionProvider_CUDA_V2(*cuda_options); #else ORT_THROW("CUDA is not supported in this build\n"); #endif diff --git a/onnxruntime/test/providers/compare_provider_test_utils.cc b/onnxruntime/test/providers/compare_provider_test_utils.cc index 94fb03540e3f8..3ef74259e27b6 100644 --- a/onnxruntime/test/providers/compare_provider_test_utils.cc +++ b/onnxruntime/test/providers/compare_provider_test_utils.cc @@ -121,5 +121,83 @@ void CompareOpTester::CompareWithCPU(const std::string& target_provider_type, } } +void CompareOpTester::CompareEPs(const std::shared_ptr& source_execution_provider, + std::vector>& target_execution_providers, + double per_sample_tolerance, + double relative_per_sample_tolerance, + const bool need_cpu_cast, + const std::unordered_map& extra_domain_to_version) { + SetTestFunctionCalled(); + + auto& model = BuildModel(extra_domain_to_version); + auto& graph = 
model.MainGraph(); + + // In InferenceSession::Initialize(), the call to graph partitioner, which is responsible + // for Inlining function bodies for ops whose kernel is missing happens before the + // Cast Transformer. As a result, for MLFloat16 tests where the node is missing a CPU kernel, + // the function body is instead used for CPU pass. This option allows the comparison with + // the CPU kernel by adding the input/output casts before looking for a registered CPU kernel. + if (need_cpu_cast) { + InsertCastTransformer transformer("Test", GetExecutionProvider(kCpuExecutionProvider)->GetKernelRegistry().get()); + bool modified = false; + ASSERT_STATUS_OK(transformer.Apply(graph, modified, DefaultLoggingManager().DefaultLogger())); + } + + ASSERT_STATUS_OK(graph.Resolve()); + + // Hookup the inputs and outputs + std::unordered_map feeds; + std::vector output_names; + FillFeedsAndOutputNames(feeds, output_names); + + // Run the model + SessionOptions so; + so.session_logid = Op(); + + InferenceSession source_session_object{so, GetEnvironment()}; + ASSERT_STATUS_OK(source_session_object.RegisterExecutionProvider(source_execution_provider)); + + // first run with source provider + std::string s1; + model.ToProto().SerializeToString(&s1); + std::istringstream model_proto_str(s1); + + ASSERT_STATUS_OK(source_session_object.Load(model_proto_str)); + + ASSERT_STATUS_OK(source_session_object.Initialize()); + + std::vector source_fetches; + ASSERT_STATUS_OK(source_session_object.Run({}, feeds, output_names, &source_fetches)); + + for (auto& target_execution_provider : target_execution_providers) { + // run with target provider + // build the graph again as the other graphs may be with casts + auto& tp_model = BuildModel(extra_domain_to_version); + auto& tp_graph = tp_model.MainGraph(); + + ASSERT_STATUS_OK(tp_graph.Resolve()); + + InferenceSession target_session_object{so, GetEnvironment()}; + ASSERT_STATUS_OK(target_session_object.RegisterExecutionProvider(target_execution_provider)); + + std::string s2; + tp_model.ToProto().SerializeToString(&s2); + std::istringstream model_proto_str1(s2); + ASSERT_STATUS_OK(target_session_object.Load(model_proto_str1)); + + ASSERT_STATUS_OK(target_session_object.Initialize()); + + std::vector target_fetches; + ASSERT_STATUS_OK(target_session_object.Run({}, feeds, output_names, &target_fetches)); + + // compare + ASSERT_TRUE(source_fetches.size() == target_fetches.size()); + for (size_t i = 0; i < source_fetches.size(); i++) { + auto ret = CompareOrtValue(target_fetches[i], source_fetches[i], per_sample_tolerance, + relative_per_sample_tolerance, false); + EXPECT_EQ(ret.first, COMPARE_RESULT::SUCCESS) << ret.second; + } + } +} } // namespace test } // namespace onnxruntime diff --git a/onnxruntime/test/providers/compare_provider_test_utils.h b/onnxruntime/test/providers/compare_provider_test_utils.h index 924fe405ba8dd..155016d7e69a2 100644 --- a/onnxruntime/test/providers/compare_provider_test_utils.h +++ b/onnxruntime/test/providers/compare_provider_test_utils.h @@ -3,6 +3,11 @@ #pragma once +#include +#include +#include +#include + #include "core/graph/constants.h" #include "test/common/tensor_op_test_utils.h" #include "test/providers/provider_test_utils.h" @@ -22,6 +27,13 @@ class CompareOpTester : public OpTester { double relative_per_sample_tolerance = 1e-4, const bool need_cpu_cast = false, const std::unordered_map& extra_domain_to_version = {}); + + void CompareEPs(const std::shared_ptr& source_execution_provider, + std::vector>& 
target_execution_providers, + double per_sample_tolerance, + double relative_per_sample_tolerance = 1e-4, + const bool need_cpu_cast = false, + const std::unordered_map& extra_domain_to_version = {}); }; } // namespace test diff --git a/onnxruntime/test/providers/cuda/nhwc/conv_test.cc b/onnxruntime/test/providers/cuda/nhwc/conv_test.cc new file mode 100644 index 0000000000000..be0082f95feb8 --- /dev/null +++ b/onnxruntime/test/providers/cuda/nhwc/conv_test.cc @@ -0,0 +1,74 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Copyright (c) 2023 NVIDIA Corporation. +// Licensed under the MIT License. + +#include "test/providers/cuda/nhwc/nhwc_cuda_helper.h" + +namespace onnxruntime { +namespace test { + +template +struct ConvOp { + const std::vector input_dims; + const std::vector kernel_shape; + int64_t channels; + int64_t group = 1; + bool bias = false; + std::vector strides = {1, 1}; + std::vector padding = {0, 0, 0, 0}; + std::vector dilations = {1, 1}; + + std::unique_ptr get_test() { + RandomValueGenerator random{}; + + auto test = std::make_unique("Conv", 7); + std::vector input_data = random.Uniform(input_dims, 0.0f, 1.0f); + + std::vector weight_dims{channels, input_dims[1] / group, kernel_shape[0], kernel_shape[1]}; + std::vector weight_data = random.Uniform(weight_dims, -0.4f, 0.4f); + + test->AddInput("X", input_dims, input_data); + test->AddInput("W", weight_dims, weight_data, true); + if (bias) { + std::vector bias_dims{channels}; + std::vector bias_data = random.Uniform(bias_dims, 0.2f, 0.4f); + test->AddInput("B", bias_dims, bias_data, true); + } + test->AddAttribute("group", group); + test->AddAttribute("kernel_shape", kernel_shape); + test->AddAttribute("strides", strides); + test->AddAttribute("dilations", dilations); + test->AddAttribute("pads", padding); + + std::vector output_dims = { + input_dims[0], channels, + ComputeOutputShape(input_dims[2], strides[0], kernel_shape[0], dilations[0], padding[0], padding[1]), + ComputeOutputShape(input_dims[3], strides[1], kernel_shape[1], dilations[1], padding[2], padding[3])}; + std::vector output_data = FillZeros(output_dims); + + test->AddOutput("Y", output_dims, output_data); + return test; + } +}; + +TYPED_TEST(CudaNhwcTypedTest, ConvNhwcBias) { + auto op = ConvOp{.input_dims = {1, 16, 64, 64}, .kernel_shape = {3, 3}, .channels = 16, .bias = true}; + + MAKE_PROVIDERS_EPS_TYPE(TypeParam) +} + +TYPED_TEST(CudaNhwcTypedTest, ConvNhwcGroupNoBias) { + auto op = ConvOp{.input_dims = {1, 16, 64, 64}, .kernel_shape = {3, 3}, .channels = 16, .group = 4}; + + MAKE_PROVIDERS_EPS_TYPE(TypeParam) +} + +TYPED_TEST(CudaNhwcTypedTest, ConvNhwcPadding) { + auto op = + ConvOp{.input_dims = {2, 4, 64, 64}, .kernel_shape = {3, 3}, .channels = 4, .padding = {4, 4, 4, 4}}; + + MAKE_PROVIDERS_EPS_TYPE(TypeParam) +} + +} // namespace test +} // namespace onnxruntime diff --git a/onnxruntime/test/providers/cuda/nhwc/conv_transpose_test.cc b/onnxruntime/test/providers/cuda/nhwc/conv_transpose_test.cc new file mode 100644 index 0000000000000..d45323190c514 --- /dev/null +++ b/onnxruntime/test/providers/cuda/nhwc/conv_transpose_test.cc @@ -0,0 +1,93 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Copyright (c) 2023 NVIDIA Corporation. +// Licensed under the MIT License. 
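The expected output height and width built in ConvOp::get_test above follow the standard ONNX Conv output-size arithmetic; a stand-alone sketch of the floor-mode formula that the ComputeOutputShape helper used there presumably evaluates (hypothetical free function, not taken from the patch):

#include <cstdint>
#include <iostream>

// ONNX Conv output size for one spatial axis (floor mode).
int64_t ConvOutputDim(int64_t in, int64_t stride, int64_t kernel,
                      int64_t dilation, int64_t pad_begin, int64_t pad_end) {
  const int64_t effective_kernel = dilation * (kernel - 1) + 1;
  return (in + pad_begin + pad_end - effective_kernel) / stride + 1;
}

int main() {
  // Matches the ConvNhwcPadding case above: 64x64 input, 3x3 kernel,
  // stride 1, dilation 1, pads of 4 on every side.
  std::cout << ConvOutputDim(64, 1, 3, 1, 4, 4) << "\n";  // 70
  return 0;
}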
+ +#include "test/providers/cuda/nhwc/nhwc_cuda_helper.h" + +namespace onnxruntime { +namespace test { + +template +struct ConvTransposeOp { + const std::vector input_dims; + const std::vector kernel_shape; + int64_t channels; + int64_t group = 1; + bool bias = false; + std::vector strides = {1, 1}; + std::vector padding = {0, 0, 0, 0}; + std::vector output_padding = {0, 0, 0, 0}; + std::vector dilations = {1, 1}; + + std::unique_ptr get_test() { + RandomValueGenerator random{}; + + auto test = std::make_unique("ConvTranspose", 14); + std::vector input_data = random.Uniform(input_dims, 0.0f, 1.0f); + + std::vector weight_dims{input_dims[1], channels / group, kernel_shape[0], kernel_shape[1]}; + std::vector weight_data = random.Uniform(weight_dims, -0.4f, 0.4f); + + test->AddInput("X", input_dims, input_data); + test->AddInput("W", weight_dims, weight_data, true); + if (bias) { + std::vector bias_dims{channels}; + std::vector bias_data = random.Uniform(bias_dims, 0.2f, 0.4f); + test->AddInput("B", bias_dims, bias_data, true); + } + test->AddAttribute("group", group); + test->AddAttribute("kernel_shape", kernel_shape); + test->AddAttribute("strides", strides); + test->AddAttribute("dilations", dilations); + test->AddAttribute("pads", padding); + if (!output_padding.empty()) { + test->AddAttribute("output_padding", output_padding); + } + + std::vector output_dims = { + input_dims[0], channels, + (kernel_shape[1] - 1) * dilations[1] + (input_dims[2] - 1) * strides[1] - (padding[1] + padding[0]) + 1, + (kernel_shape[0] - 1) * dilations[0] + (input_dims[3] - 1) * strides[0] - (padding[3] + padding[2]) + 1}; + std::vector output_data = FillZeros(output_dims); + + test->AddOutput("Y", output_dims, output_data); + return test; + } +}; + +TYPED_TEST(CudaNhwcTypedTest, ConvTransposeNhwcGroupNoBias) { + auto op = + ConvTransposeOp{.input_dims = {8, 8, 32, 32}, .kernel_shape = {3, 3}, .channels = 16, .group = 4}; + + MAKE_PROVIDERS_EPS_TYPE(TypeParam) +} + +TYPED_TEST(CudaNhwcTypedTest, ConvTransposeNhwcBias) { + auto op = + ConvTransposeOp{.input_dims = {1, 8, 80, 80}, .kernel_shape = {5, 5}, .channels = 16, .bias = true}; + + MAKE_PROVIDERS_EPS_TYPE(TypeParam) +} + +TYPED_TEST(CudaNhwcTypedTest, ConvTransposeNhwcPad) { + auto op = ConvTransposeOp{.input_dims = {1, 16, 8, 8}, + .kernel_shape = {3, 3}, + .channels = 32, + .padding = {2, 2, 2, 2}, + .output_padding = {}}; + + MAKE_PROVIDERS_EPS_TYPE(TypeParam) +} + +TYPED_TEST(CudaNhwcTypedTest, ConvTransposeNhwcOutPad) { + auto op = ConvTransposeOp{.input_dims = {1, 32, 8, 8}, + .kernel_shape = {3, 3}, + .channels = 32, + .strides = {2, 2}, + .output_padding = {1, 1, 1, 1}}; + + MAKE_PROVIDERS_EPS_TYPE(TypeParam) +} + +} // namespace test +} // namespace onnxruntime diff --git a/onnxruntime/test/providers/cuda/nhwc/nhwc_cuda_helper.h b/onnxruntime/test/providers/cuda/nhwc/nhwc_cuda_helper.h new file mode 100644 index 0000000000000..2c942bb790096 --- /dev/null +++ b/onnxruntime/test/providers/cuda/nhwc/nhwc_cuda_helper.h @@ -0,0 +1,47 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Copyright (c) 2023 NVIDIA Corporation. +// Licensed under the MIT License. 
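For ConvTranspose the size computation runs the other way. For reference, the ONNX ConvTranspose output size per spatial axis is stride * (in - 1) + output_padding + dilation * (kernel - 1) + 1 - pad_begin - pad_end; a minimal stand-alone sketch, not taken from the patch:

#include <cstdint>
#include <iostream>

// ONNX ConvTranspose output size for one spatial axis.
int64_t ConvTransposeOutputDim(int64_t in, int64_t stride, int64_t kernel, int64_t dilation,
                               int64_t pad_begin, int64_t pad_end, int64_t output_padding) {
  return stride * (in - 1) + output_padding + dilation * (kernel - 1) + 1 - pad_begin - pad_end;
}

int main() {
  // ConvTransposeNhwcPad above: 8x8 input, 3x3 kernel, stride 1,
  // pads of 2 on every side, no output padding.
  std::cout << ConvTransposeOutputDim(8, 1, 3, 1, 2, 2, 0) << "\n";  // 6
  return 0;
}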
+ +#include +#include +#include + +#include "core/providers/cuda/cuda_provider_options.h" +#include "core/providers/common.h" + +#include "test/providers/compare_provider_test_utils.h" +#include "test/common/cuda_op_test_utils.h" + +#include "gtest/gtest.h" + +#define MAKE_PROVIDERS_EPS(eps) \ + std::vector> execution_providers; \ + OrtCUDAProviderOptionsV2 nhwc = {.prefer_nhwc = true}; \ + execution_providers.push_back(CudaExecutionProviderWithOptions(&nhwc)); \ + \ + double error_tolerance = eps; \ + OrtCUDAProviderOptionsV2 nchw = {.prefer_nhwc = false}; \ + auto source_ep = CudaExecutionProviderWithOptions(&nchw); \ + auto test = op.get_test(); \ + test->CompareEPs(std::move(source_ep), execution_providers, error_tolerance); + +#define MAKE_PROVIDERS() MAKE_PROVIDERS_EPS(1e-3) + +#define MAKE_PROVIDERS_EPS_TYPE(T) \ + if (std::is_same::value) { \ + MAKE_PROVIDERS_EPS(2e-2) \ + } else if (std::is_same::value) { \ + MAKE_PROVIDERS_EPS(2e-4) \ + } else { \ + MAKE_PROVIDERS_EPS(2e-3) \ + } +namespace onnxruntime { +namespace test { + +template +class CudaNhwcTypedTest : public ::testing::Test {}; + +using CudaNhwcTestTypes = ::testing::Types; // double, +TYPED_TEST_SUITE(CudaNhwcTypedTest, CudaNhwcTestTypes); +} // namespace test +} // namespace onnxruntime diff --git a/onnxruntime/test/providers/cuda/nhwc/norm_test.cc b/onnxruntime/test/providers/cuda/nhwc/norm_test.cc new file mode 100644 index 0000000000000..52da8ba557c2d --- /dev/null +++ b/onnxruntime/test/providers/cuda/nhwc/norm_test.cc @@ -0,0 +1,51 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Copyright (c) 2023 NVIDIA Corporation. +// Licensed under the MIT License. + +#include "test/providers/cuda/nhwc/nhwc_cuda_helper.h" + +namespace onnxruntime { +namespace test { + +template +struct BatchNormOp { + const std::vector input_dims; + + std::unique_ptr get_test() { + // create rand inputs + RandomValueGenerator random{}; + + auto test = std::make_unique("BatchNormalization", 14); + std::vector input_data = random.Uniform(input_dims, 0.0f, 0.3f); + auto channels = input_dims[1]; + test->AddInput("X", input_dims, input_data); + + std::vector bias_dims{channels}; + std::vector bias_data = random.Uniform(bias_dims, 0.2f, 1.0f); + test->AddInput("B", bias_dims, bias_data); + // we simply gonna reuse the bias data here. + test->AddInput("scale", bias_dims, bias_data); + + std::vector mean{channels}; + std::vector mean_data = random.Uniform(mean, 0.7f, 0.8f); + test->AddInput("input_mean", bias_dims, bias_data); + std::vector var{channels}; + std::vector var_data = random.Uniform(var, 0.0f, 0.1f); + test->AddInput("input_var", bias_dims, bias_data); + + std::vector output_data = FillZeros(input_dims); + test->AddOutput("Y", input_dims, output_data); + return test; + } +}; + +TYPED_TEST(CudaNhwcTypedTest, BatchNormNhwc) { + auto op = BatchNormOp{ + .input_dims = {4, 16, 64, 64}, + }; + + MAKE_PROVIDERS() +} + +} // namespace test +} // namespace onnxruntime diff --git a/onnxruntime/test/providers/cuda/nhwc/pool_test.cc b/onnxruntime/test/providers/cuda/nhwc/pool_test.cc new file mode 100644 index 0000000000000..3d1f81e6bc282 --- /dev/null +++ b/onnxruntime/test/providers/cuda/nhwc/pool_test.cc @@ -0,0 +1,95 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Copyright (c) 2023 NVIDIA Corporation. +// Licensed under the MIT License. 
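The BatchNormNhwc comparison above exercises the ONNX BatchNormalization inference formula, y = scale * (x - input_mean) / sqrt(input_var + epsilon) + B, applied per channel. A tiny per-element sketch (stand-alone, assuming the ONNX default epsilon of 1e-5):

#include <cmath>
#include <iostream>

// Per-element ONNX BatchNormalization (inference mode), applied per channel.
float BatchNormElement(float x, float scale, float bias, float mean, float var,
                       float epsilon = 1e-5f) {
  return scale * (x - mean) / std::sqrt(var + epsilon) + bias;
}

int main() {
  std::cout << BatchNormElement(0.25f, 1.0f, 0.0f, 0.2f, 0.04f) << "\n";  // ~0.25
  return 0;
}

Because the NHWC and NCHW kernels are expected to produce the same values, the typed tests above only compare the two CUDA EP configurations against each other within the MAKE_PROVIDERS tolerance rather than against a hand-computed reference.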
+ +#include "test/providers/cuda/nhwc/nhwc_cuda_helper.h" + +namespace onnxruntime { +namespace test { + +template +struct PoolOp { + const std::string pooling_type; + const std::vector input_dims; + const std::vector kernel_shape; + int64_t channels; + int64_t group = 1; + std::vector strides = {1, 1}; + std::vector padding = {0, 0, 0, 0}; + + std::unique_ptr get_test() { + RandomValueGenerator random{}; + + auto test = std::make_unique(pooling_type.c_str(), 14); + std::vector input_data = random.Uniform(input_dims, 0.0f, 0.3f); + + test->AddInput("X", input_dims, input_data); + + test->AddAttribute("kernel_shape", kernel_shape); + test->AddAttribute("strides", strides); + test->AddAttribute("pads", padding); + + std::vector output_dims = { + input_dims[0], channels, + (kernel_shape[1] - 1) + (input_dims[2] - 1) * strides[1] - (padding[1] + padding[0]) + 1, + (kernel_shape[0] - 1) + (input_dims[3] - 1) * strides[0] - (padding[3] + padding[2]) + 1}; + std::vector output_data = FillZeros(output_dims); + + test->AddOutput("Y", output_dims, output_data); + return test; + } +}; + +TYPED_TEST(CudaNhwcTypedTest, AveragePoolNhwc) { + auto op = PoolOp{ + .pooling_type = "AveragePool", + .input_dims = {1, 16, 64, 64}, + .kernel_shape = {3, 3}, + .channels = 16, + }; + MAKE_PROVIDERS() +} + +TYPED_TEST(CudaNhwcTypedTest, MaxPoolNhwc) { + auto op = PoolOp{ + .pooling_type = "MaxPool", + .input_dims = {1, 16, 64, 64}, + .kernel_shape = {3, 3}, + .channels = 16, + }; + MAKE_PROVIDERS() +} + +TYPED_TEST(CudaNhwcTypedTest, GlobalMaxPoolNhwc) { + RandomValueGenerator random{}; + auto test = std::make_unique("GlobalMaxPool", 14); + const std::vector input_dims = {4, 16, 4, 8}; + std::vector input_data = random.Uniform(input_dims, 0.5f, 1.3f); + test->AddInput("X", input_dims, input_data); + + std::vector output_dims = {input_dims[0], input_dims[1], 1, 1}; + std::vector output_data = FillZeros(output_dims); + test->AddOutput("Y", output_dims, output_data); + + std::vector> execution_providers; + OrtCUDAProviderOptionsV2 nhwc = {.prefer_nhwc = true}; + execution_providers.push_back(CudaExecutionProviderWithOptions(&nhwc)); + + double error_tolerance = 1e-3; + OrtCUDAProviderOptionsV2 nchw = {.prefer_nhwc = false}; + auto source_ep = CudaExecutionProviderWithOptions(&nchw); + test->CompareEPs(std::move(source_ep), execution_providers, error_tolerance); +} + +TYPED_TEST(CudaNhwcTypedTest, AveragePoolNhwcPad) { + auto op = PoolOp{.pooling_type = "AveragePool", + .input_dims = {1, 16, 64, 64}, + .kernel_shape = {3, 3}, + .channels = 16, + .padding = {2, 2, 2, 2}}; + + MAKE_PROVIDERS() +} + +} // namespace test +} // namespace onnxruntime
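For reference, ONNX defines the MaxPool/AveragePool output size in the default floor mode (dilation 1) as floor((in + pad_begin + pad_end - kernel) / stride) + 1; a stand-alone sketch of that formula, not derived from the test helper above:

#include <cstdint>
#include <iostream>

// ONNX MaxPool/AveragePool output size for one spatial axis
// (floor mode, dilation 1).
int64_t PoolOutputDim(int64_t in, int64_t kernel, int64_t stride,
                      int64_t pad_begin, int64_t pad_end) {
  return (in + pad_begin + pad_end - kernel) / stride + 1;
}

int main() {
  // 64x64 input, 3x3 window, stride 1, no padding -> 62x62 output.
  std::cout << PoolOutputDim(64, 3, 1, 0, 0) << "\n";  // 62
  return 0;
}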