Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions cmake/onnxruntime_rocm_hipify.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,8 @@ set(provider_excluded_files
"gpu_data_transfer.h"
"integer_gemm.cc"
"tunable/*"
"cuda_nhwc_kernels.cc"
"cuda_nhwc_kernels.h"
)

set(training_ops_excluded_files
Expand Down
7 changes: 6 additions & 1 deletion onnxruntime/core/providers/rocm/miopen_common.cc
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,12 @@ Status MiopenTensor::CreateTensorIfNeeded() {
return Status::OK();
}

Status MiopenTensor::Set(gsl::span<const int64_t> input_dims, miopenDataType_t dataType) {
Status MiopenTensor::Set(gsl::span<const int64_t> input_dims, miopenDataType_t dataType, bool is_nhwc) {
if (is_nhwc) {
return ORT_MAKE_STATUS(ONNXRUNTIME, NOT_IMPLEMENTED,
"NHWC Tensor usage is not supported in AMD builds for now");
}

ORT_RETURN_IF_ERROR(CreateTensorIfNeeded());

int rank = gsl::narrow_cast<int>(input_dims.size());
Expand Down
2 changes: 1 addition & 1 deletion onnxruntime/core/providers/rocm/miopen_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ class MiopenTensor final {
~MiopenTensor();
ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(MiopenTensor);

Status Set(gsl::span<const int64_t> input_dims, miopenDataType_t dataType);
Status Set(gsl::span<const int64_t> input_dims, miopenDataType_t dataType, bool is_nhwc = false);
Status Set(miopenDataType_t dataType, miopenTensorLayout_t tensor_layout, int n, int c, int h, int w);
Status Set(const MiopenTensor& x_desc, miopenBatchNormMode_t mode);

Expand Down
12 changes: 6 additions & 6 deletions onnxruntime/core/providers/rocm/nn/conv_transpose.cc
Original file line number Diff line number Diff line change
Expand Up @@ -16,28 +16,28 @@ namespace rocm {
T, \
kRocmExecutionProvider, \
(*KernelDefBuilder::Create()).TypeConstraint("T", DataTypeImpl::GetTensorType<T>()), \
ConvTranspose<T>); \
ConvTranspose<T, false>); \
ONNX_OPERATOR_TYPED_KERNEL_EX( \
ConvTranspose, \
kOnnxDomain, \
11, \
T, \
kRocmExecutionProvider, \
(*KernelDefBuilder::Create()).TypeConstraint("T", DataTypeImpl::GetTensorType<T>()), \
ConvTranspose<T>);
ConvTranspose<T, false>);

REGISTER_KERNEL_TYPED(float)
// not yet supported in MIOpen
// REGISTER_KERNEL_TYPED(double)
REGISTER_KERNEL_TYPED(MLFloat16)

template <typename T>
Status ConvTranspose<T>::ComputeInternal(OpKernelContext* context) const {
template <typename T, bool NHWC>
// Kernel entry point for the ROCm ConvTranspose op (registered above for
// opsets 1 and 11 with NHWC=false). Delegates all work to DoConvTranspose
// with dynamic_padding=false — presumably the pads come from static node
// attributes here, with `true` reserved for a dynamic-pads variant of the
// op; TODO(review) confirm against the CUDA provider's registration.
Status ConvTranspose<T, NHWC>::ComputeInternal(OpKernelContext* context) const {
  return DoConvTranspose(context, false);
}

template <typename T>
Status ConvTranspose<T>::DoConvTranspose(OpKernelContext* context, bool dynamic_padding) const {
template <typename T, bool NHWC>
Status ConvTranspose<T, NHWC>::DoConvTranspose(OpKernelContext* context, bool dynamic_padding) const {
typedef typename ToHipType<T>::MappedType HipT;

const Tensor* X = context->Input<Tensor>(0);
Expand Down
6 changes: 4 additions & 2 deletions onnxruntime/core/providers/rocm/nn/conv_transpose.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,12 @@
namespace onnxruntime {
namespace rocm {

template <typename T>
template <typename T, bool NHWC>
class ConvTranspose : public RocmKernel {
public:
ConvTranspose(const OpKernelInfo& info) : RocmKernel(info), conv_transpose_attrs_(info){};
ConvTranspose(const OpKernelInfo& info) : RocmKernel(info), conv_transpose_attrs_(info) {
static_assert(!NHWC, "AMD builds don't support usage of NHWC ops");
};
Status ComputeInternal(OpKernelContext* context) const override;
Status DoConvTranspose(OpKernelContext* context, bool dynamic_padding) const;

Expand Down
2 changes: 2 additions & 0 deletions tools/ci_build/build.py
Original file line number Diff line number Diff line change
Expand Up @@ -247,6 +247,7 @@ def convert_arg_line_to_args(self, arg_line):
"--cudnn_home is not specified.",
)
parser.add_argument("--enable_cuda_line_info", action="store_true", help="Enable CUDA line info.")
parser.add_argument("--enable_cuda_nhwc_ops", action="store_true", help="Enable CUDA NHWC ops in build.")

# Python bindings
parser.add_argument("--enable_pybind", action="store_true", help="Enable Python Bindings.")
Expand Down Expand Up @@ -1025,6 +1026,7 @@ def generate_build_tree(
"-Donnxruntime_USE_MPI=" + ("ON" if args.use_mpi else "OFF"),
"-Donnxruntime_ENABLE_MEMORY_PROFILE=" + ("ON" if args.enable_memory_profile else "OFF"),
"-Donnxruntime_ENABLE_CUDA_LINE_NUMBER_INFO=" + ("ON" if args.enable_cuda_line_info else "OFF"),
"-Donnxruntime_USE_CUDA_NHWC_OPS=" + ("ON" if args.enable_cuda_nhwc_ops else "OFF"),
"-Donnxruntime_BUILD_WEBASSEMBLY_STATIC_LIB=" + ("ON" if args.build_wasm_static_lib else "OFF"),
"-Donnxruntime_ENABLE_WEBASSEMBLY_EXCEPTION_CATCHING="
+ ("OFF" if args.disable_wasm_exception_catching else "ON"),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ jobs:
--parallel \
--build_wheel \
--enable_onnx_tests --use_cuda --cuda_version=${{variables.common_cuda_version}} --cuda_home=/usr/local/cuda-${{variables.common_cuda_version}} --cudnn_home=/usr/local/cuda-${{variables.common_cuda_version}} \
--enable_cuda_profiling \
--enable_cuda_profiling --enable_cuda_nhwc_ops \
--enable_pybind --build_java \
--use_cache \
--cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=75; \
Expand Down