From cf0dc093a7b3e7e737473f79108e0caa4b30dc40 Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Sun, 25 Feb 2018 18:27:04 -0800 Subject: [PATCH 01/32] Adds layout support: mx.sym.Pooling(..., layout='NHWC',...) with tests. --- python/mxnet/gluon/nn/conv_layers.py | 69 +++++--- src/operator/nn/cudnn/cudnn_pooling-inl.h | 73 ++++++--- src/operator/nn/pooling-inl.h | 21 ++- src/operator/nn/pooling.cc | 187 +++++++++++++--------- src/operator/nn/pooling.cu | 14 +- tests/python/gpu/test_operator_gpu.py | 98 ++++++++++-- 6 files changed, 320 insertions(+), 142 deletions(-) diff --git a/python/mxnet/gluon/nn/conv_layers.py b/python/mxnet/gluon/nn/conv_layers.py index 5f20d20c02ab..57241967a263 100644 --- a/python/mxnet/gluon/nn/conv_layers.py +++ b/python/mxnet/gluon/nn/conv_layers.py @@ -673,7 +673,7 @@ def __init__(self, channels, kernel_size, strides=(1, 1, 1), padding=(0, 0, 0), class _Pooling(HybridBlock): """Abstract class for different pooling layers.""" def __init__(self, pool_size, strides, padding, ceil_mode, global_pool, - pool_type, count_include_pad=None, **kwargs): + pool_type, layout, count_include_pad=None, **kwargs): super(_Pooling, self).__init__(**kwargs) if strides is None: strides = pool_size @@ -684,6 +684,7 @@ def __init__(self, pool_size, strides, padding, ceil_mode, global_pool, self._kwargs = { 'kernel': pool_size, 'stride': strides, 'pad': padding, 'global_pool': global_pool, 'pool_type': pool_type, + 'layout': layout, 'pooling_convention': 'full' if ceil_mode else 'valid'} if count_include_pad is not None: self._kwargs['count_include_pad'] = count_include_pad @@ -695,7 +696,8 @@ def hybrid_forward(self, F, x): return F.Pooling(x, name='fwd', **self._kwargs) def __repr__(self): - s = '{name}(size={kernel}, stride={stride}, padding={pad}, ceil_mode={ceil_mode})' + s = '{name}(size={kernel}, stride={stride}, padding={pad}, ceil_mode={ceil_mode}' + s += ', global_pool={global_pool}, pool_type={pool_type}, layout={layout})' return s.format(name=self.__class__.__name__, ceil_mode=self._kwargs['pooling_convention'] == 'full', **self._kwargs) @@ -738,12 +740,13 @@ class MaxPool1D(_Pooling): """ def __init__(self, pool_size=2, strides=None, padding=0, layout='NCW', ceil_mode=False, **kwargs): - assert layout == 'NCW', "Only supports 'NCW' layout for now" + assert layout == 'NCW' or layout == 'NWC',\ + "Only NCW and NWC layouts are valid for 1D" if isinstance(pool_size, numeric_types): pool_size = (pool_size,) assert len(pool_size) == 1, "pool_size must be a number or a list of 1 ints" super(MaxPool1D, self).__init__( - pool_size, strides, padding, ceil_mode, False, 'max', **kwargs) + pool_size, strides, padding, ceil_mode, False, 'max', layout, **kwargs) class MaxPool2D(_Pooling): @@ -786,12 +789,13 @@ class MaxPool2D(_Pooling): """ def __init__(self, pool_size=(2, 2), strides=None, padding=0, layout='NCHW', ceil_mode=False, **kwargs): - assert layout == 'NCHW', "Only supports 'NCHW' layout for now" + assert layout == 'NCHW' or layout == 'NHWC',\ + "Only NCHW and NHWC layouts are valid for 2D" if isinstance(pool_size, numeric_types): pool_size = (pool_size,)*2 assert len(pool_size) == 2, "pool_size must be a number or a list of 2 ints" super(MaxPool2D, self).__init__( - pool_size, strides, padding, ceil_mode, False, 'max', **kwargs) + pool_size, strides, padding, ceil_mode, False, 'max', layout, **kwargs) class MaxPool3D(_Pooling): @@ -836,12 +840,13 @@ class MaxPool3D(_Pooling): """ def __init__(self, pool_size=(2, 2, 2), strides=None, padding=0, ceil_mode=False, 
layout='NCDHW', **kwargs): - assert layout == 'NCDHW', "Only supports 'NCDHW' layout for now" + assert layout == 'NCDHW' or layout == 'NDHWC',\ + "Only NCDHW and NDHWC layouts are valid for 3D" if isinstance(pool_size, numeric_types): pool_size = (pool_size,)*3 assert len(pool_size) == 3, "pool_size must be a number or a list of 3 ints" super(MaxPool3D, self).__init__( - pool_size, strides, padding, ceil_mode, False, 'max', **kwargs) + pool_size, strides, padding, ceil_mode, False, 'max', layout, **kwargs) class AvgPool1D(_Pooling): @@ -882,12 +887,14 @@ class AvgPool1D(_Pooling): """ def __init__(self, pool_size=2, strides=None, padding=0, layout='NCW', ceil_mode=False, count_include_pad=True, **kwargs): - assert layout == 'NCW', "Only supports 'NCW' layout for now" + assert layout == 'NCW' or layout == 'NWC',\ + "Only NCW and NWC layouts are valid for 1D" if isinstance(pool_size, numeric_types): pool_size = (pool_size,) assert len(pool_size) == 1, "pool_size must be a number or a list of 1 ints" super(AvgPool1D, self).__init__( - pool_size, strides, padding, ceil_mode, False, 'avg', count_include_pad, **kwargs) + pool_size, strides, padding, ceil_mode, False, 'avg', layout, count_include_pad, + **kwargs) class AvgPool2D(_Pooling): @@ -931,12 +938,14 @@ class AvgPool2D(_Pooling): """ def __init__(self, pool_size=(2, 2), strides=None, padding=0, ceil_mode=False, layout='NCHW', count_include_pad=True, **kwargs): - assert layout == 'NCHW', "Only supports 'NCHW' layout for now" + assert layout == 'NCHW' or layout == 'NHWC',\ + "Only NCHW and NHWC layouts are valid for 2D" if isinstance(pool_size, numeric_types): pool_size = (pool_size,)*2 assert len(pool_size) == 2, "pool_size must be a number or a list of 2 ints" super(AvgPool2D, self).__init__( - pool_size, strides, padding, ceil_mode, False, 'avg', count_include_pad, **kwargs) + pool_size, strides, padding, ceil_mode, False, 'avg', layout, count_include_pad, + **kwargs) class AvgPool3D(_Pooling): @@ -982,12 +991,14 @@ class AvgPool3D(_Pooling): """ def __init__(self, pool_size=(2, 2, 2), strides=None, padding=0, ceil_mode=False, layout='NCDHW', count_include_pad=True, **kwargs): - assert layout == 'NCDHW', "Only supports 'NCDHW' layout for now" + assert layout == 'NCDHW' or layout == 'NDHWC',\ + "Only NCDHW and NDHWC layouts are valid for 3D" if isinstance(pool_size, numeric_types): pool_size = (pool_size,)*3 assert len(pool_size) == 3, "pool_size must be a number or a list of 3 ints" super(AvgPool3D, self).__init__( - pool_size, strides, padding, ceil_mode, False, 'avg', count_include_pad, **kwargs) + pool_size, strides, padding, ceil_mode, False, 'avg', layout, count_include_pad, + **kwargs) class GlobalMaxPool1D(_Pooling): @@ -1011,9 +1022,10 @@ class GlobalMaxPool1D(_Pooling): when `layout` is `NCW`. """ def __init__(self, layout='NCW', **kwargs): - assert layout == 'NCW', "Only supports 'NCW' layout for now" + assert layout == 'NCW' or layout == 'NWC',\ + "Only NCW and NWC layouts are valid for 1D" super(GlobalMaxPool1D, self).__init__( - (1,), None, 0, True, True, 'max', **kwargs) + (1,), None, 0, True, True, 'max', layout, **kwargs) class GlobalMaxPool2D(_Pooling): @@ -1038,9 +1050,10 @@ class GlobalMaxPool2D(_Pooling): `(batch_size, channels, 1, 1)` when `layout` is `NCHW`. 
""" def __init__(self, layout='NCHW', **kwargs): - assert layout == 'NCHW', "Only supports 'NCHW' layout for now" + assert layout == 'NCHW' or layout == 'NHWC',\ + "Only NCHW and NHWC layouts are valid for 2D" super(GlobalMaxPool2D, self).__init__( - (1, 1), None, 0, True, True, 'max', **kwargs) + (1, 1), None, 0, True, True, 'max', layout, **kwargs) class GlobalMaxPool3D(_Pooling): @@ -1066,9 +1079,10 @@ class GlobalMaxPool3D(_Pooling): `(batch_size, channels, 1, 1, 1)` when `layout` is `NCDHW`. """ def __init__(self, layout='NCDHW', **kwargs): - assert layout == 'NCDHW', "Only supports 'NCDHW' layout for now" + assert layout == 'NCDHW' or layout == 'NDHWC',\ + "Only NCDHW and NDHWC layouts are valid for 3D" super(GlobalMaxPool3D, self).__init__( - (1, 1, 1), None, 0, True, True, 'max', **kwargs) + (1, 1, 1), None, 0, True, True, 'max', layout, **kwargs) class GlobalAvgPool1D(_Pooling): @@ -1090,9 +1104,10 @@ class GlobalAvgPool1D(_Pooling): - **out**: 3D output tensor with shape `(batch_size, channels, 1)`. """ def __init__(self, layout='NCW', **kwargs): - assert layout == 'NCW', "Only supports 'NCW' layout for now" + assert layout == 'NCW' or layout == 'NWC',\ + "Only NCW and NWC layouts are valid for 1D" super(GlobalAvgPool1D, self).__init__( - (1,), None, 0, True, True, 'avg', **kwargs) + (1,), None, 0, True, True, 'avg', layout, **kwargs) class GlobalAvgPool2D(_Pooling): @@ -1116,9 +1131,10 @@ class GlobalAvgPool2D(_Pooling): `(batch_size, channels, 1, 1)` when `layout` is `NCHW`. """ def __init__(self, layout='NCHW', **kwargs): - assert layout == 'NCHW', "Only supports 'NCHW' layout for now" + assert layout == 'NCHW' or layout == 'NHWC',\ + "Only NCHW and NHWC layouts are valid for 2D" super(GlobalAvgPool2D, self).__init__( - (1, 1), None, 0, True, True, 'avg', **kwargs) + (1, 1), None, 0, True, True, 'avg', layout, **kwargs) class GlobalAvgPool3D(_Pooling): @@ -1143,9 +1159,10 @@ class GlobalAvgPool3D(_Pooling): `(batch_size, channels, 1, 1, 1)` when `layout` is `NCDHW`. 
""" def __init__(self, layout='NCDHW', **kwargs): - assert layout == 'NCDHW', "Only supports 'NCDHW' layout for now" + assert layout == 'NCDHW' or layout == 'NDHWC',\ + "Only NCDHW and NDHWC layouts are valid for 3D" super(GlobalAvgPool3D, self).__init__( - (1, 1, 1), None, 0, True, True, 'avg', **kwargs) + (1, 1, 1), None, 0, True, True, 'avg', layout, **kwargs) class ReflectionPad2D(HybridBlock): diff --git a/src/operator/nn/cudnn/cudnn_pooling-inl.h b/src/operator/nn/cudnn/cudnn_pooling-inl.h index 89fa73ef5471..0d0bd0a89913 100644 --- a/src/operator/nn/cudnn/cudnn_pooling-inl.h +++ b/src/operator/nn/cudnn/cudnn_pooling-inl.h @@ -73,7 +73,8 @@ class CuDNNPoolingOp { CUDNN_CALL(cudnnDestroyPoolingDescriptor(pooling_desc_)); } - void Forward(const OpContext &ctx, const TBlob &in_data, + // Return boolean saying whether pooling configuration is supported + bool Forward(const OpContext &ctx, const TBlob &in_data, const OpReqType &req, const TBlob &out_data) { using namespace mshadow; using namespace mshadow::expr; @@ -81,7 +82,8 @@ class CuDNNPoolingOp { CHECK_EQ(s->dnn_handle_ownership_, mshadow::Stream::OwnHandle); typename DataType::ScaleType alpha = 1.0f; typename DataType::ScaleType beta = 0.0f; - this->Init(s, in_data, out_data); + if (!this->Init(s, in_data, out_data)) + return false; if (param_.kernel.ndim() == 2) { // 2d pool Tensor data = in_data.get(s); @@ -113,9 +115,11 @@ class CuDNNPoolingOp { } else { LOG(FATAL) << "Only support 2D or 3D pooling"; } + return true; } - void Backward(const OpContext &ctx, const TBlob &out_grad, + // Return boolean saying whether pooling configuration is supported + bool Backward(const OpContext &ctx, const TBlob &out_grad, const TBlob &in_data, const TBlob &out_data, const OpReqType &req, const TBlob &in_grad) { using namespace mshadow; @@ -125,7 +129,8 @@ class CuDNNPoolingOp { CHECK_EQ(s->dnn_handle_ownership_, mshadow::Stream::OwnHandle); typename DataType::ScaleType alpha = 1.0f; typename DataType::ScaleType beta = 0.0f; - this->Init(s, in_data, out_data); + if (!this->Init(s, in_data, out_data)) + return false; if (param_.kernel.ndim() == 2) { // 2d pool Tensor m_out_grad = out_grad.get(s); @@ -165,55 +170,78 @@ class CuDNNPoolingOp { } else { LOG(FATAL) << "Only support 2D or 3D pooling"; } + return true; } private: - inline void Init(mshadow::Stream *s, const TBlob &in_data, + // Return boolean saying whether pooling configuration is supported + inline bool Init(mshadow::Stream *s, const TBlob &in_data, const TBlob &out_data) { using namespace mshadow; + bool is_supported = true; #if CUDNN_MAJOR >= 5 nan_prop_ = CUDNN_NOT_PROPAGATE_NAN; #endif if (param_.kernel.ndim() == 2) { // 2d conv + CHECK(param_.layout.value() == mshadow::kNCHW || + param_.layout.value() == mshadow::kNHWC) << "Need 2D layout"; + cudnnTensorFormat_t cudnn_layout = + (param_.layout.value() == mshadow::kNCHW) ? CUDNN_TENSOR_NCHW + : CUDNN_TENSOR_NHWC; Tensor data = in_data.get(s); Tensor out = out_data.get(s); - mshadow::Shape<4> dshape = data.shape_; + // Perform shape calculations in a standard (NCHW) layout space + mshadow::Shape<4> dshape_nchw = (param_.layout.value() == mshadow::kNHWC) ? + ConvertLayout(data.shape_, mshadow::kNHWC, mshadow::kNCHW) : + data.shape_; + mshadow::Shape<4> oshape_nchw = (param_.layout.value() == mshadow::kNHWC) ? 
+ ConvertLayout(out.shape_, mshadow::kNHWC, mshadow::kNCHW) : + out.shape_; CUDNN_CALL(cudnnSetTensor4dDescriptor(in_desc_, - CUDNN_TENSOR_NCHW, + cudnn_layout, dtype_, - data.shape_[0], - data.shape_[1], - data.shape_[2], - data.shape_[3])); + dshape_nchw[0], + dshape_nchw[1], + dshape_nchw[2], + dshape_nchw[3])); CUDNN_CALL(cudnnSetTensor4dDescriptor(out_desc_, - CUDNN_TENSOR_NCHW, + cudnn_layout, dtype_, - out.shape_[0], - out.shape_[1], - out.shape_[2], - out.shape_[3])); + oshape_nchw[0], + oshape_nchw[1], + oshape_nchw[2], + oshape_nchw[3])); + int window_height = param_.global_pool ? dshape_nchw[2] : param_.kernel[0]; + int window_width = param_.global_pool ? dshape_nchw[3] : param_.kernel[1]; + // CuDNN v7.1.4 backprop kernel doesn't support window sizes 9 and above. + #if CUDNN_VERSION == 7104 + is_supported = window_height <= 8 && window_width <= 8; + #endif #if CUDNN_MAJOR >= 5 CUDNN_CALL(cudnnSetPooling2dDescriptor(pooling_desc_, mode_, nan_prop_, - param_.global_pool ? dshape[2] : param_.kernel[0], - param_.global_pool ? dshape[3] : param_.kernel[1], + window_height, + window_width, param_.global_pool ? 0 : param_.pad[0], param_.global_pool ? 0 : param_.pad[1], param_.global_pool ? 1 : param_.stride[0], - param_.global_pool ? 1 :param_.stride[1])); + param_.global_pool ? 1 : param_.stride[1])); #else CUDNN_CALL(cudnnSetPooling2dDescriptor(pooling_desc_, mode_, - param_.global_pool ? dshape[2] : param_.kernel[0], - param_.global_pool ? dshape[3] : param_.kernel[1], + window_height, + window_width, param_.global_pool ? 0 : param_.pad[0], - param_.global_ppol ? 0 : param_.pad[1], + param_.global_pool ? 0 : param_.pad[1], param_.global_pool ? 1 : param_.stride[0], param_.global_pool ? 1 : param_.stride[1])); #endif } else { + CHECK(param_.layout.value() == mshadow::kNCDHW || + param_.layout.value() == mshadow::kNDHWC) << "Need 3D layout"; + CHECK(param_.layout.value() == mshadow::kNCDHW) << "Only the NCDHW layout is supported."; Tensor data = in_data.get(s); Tensor out = out_data.get(s); std::vector ishape = {static_cast(data.shape_[0]), @@ -275,6 +303,7 @@ class CuDNNPoolingOp { LOG(FATAL) << "3D pooling only support CUDNN v5 and above"; #endif } + return is_supported; } cudnnDataType_t dtype_; diff --git a/src/operator/nn/pooling-inl.h b/src/operator/nn/pooling-inl.h index 71d85da9ba52..7c721907ee89 100644 --- a/src/operator/nn/pooling-inl.h +++ b/src/operator/nn/pooling-inl.h @@ -53,6 +53,7 @@ struct PoolingParam : public dmlc::Parameter { bool cudnn_off; dmlc::optional p_value; dmlc::optional count_include_pad; + dmlc::optional layout; DMLC_DECLARE_PARAMETER(PoolingParam) { DMLC_DECLARE_FIELD(kernel).set_default(TShape()) // add default value here .enforce_nonzero() @@ -92,6 +93,16 @@ struct PoolingParam : public dmlc::Parameter { "calculation. For example, with a 5*5 kernel on a 3*3 corner of a image," "the sum of the 9 valid elements will be divided by 25 if this is set to true," "or it will be divided by 9 if this is set to false. Defaults to true."); + + DMLC_DECLARE_FIELD(layout) + .add_enum("NCW", mshadow::kNCW) + .add_enum("NCHW", mshadow::kNCHW) + .add_enum("NCDHW", mshadow::kNCDHW) + .add_enum("NHWC", mshadow::kNHWC) + .add_enum("NDHWC", mshadow::kNDHWC) + .set_default(dmlc::optional()) + .describe("Set layout for input and output. 
Empty for\n " + "default layout: NCW for 1d, NCHW for 2d and NCDHW for 3d."); } bool operator==(const PoolingParam& other) const { @@ -103,7 +114,8 @@ struct PoolingParam : public dmlc::Parameter { this->global_pool == other.global_pool && this->cudnn_off == other.cudnn_off && this->p_value == other.p_value && - this->count_include_pad == other.count_include_pad; + this->count_include_pad == other.count_include_pad && + this->layout == other.layout; } }; @@ -124,6 +136,7 @@ struct hash { ret = dmlc::HashCombine(ret, val.cudnn_off); ret = dmlc::HashCombine(ret, val.p_value); ret = dmlc::HashCombine(ret, val.count_include_pad); + ret = dmlc::HashCombine(ret, val.layout); return ret; } }; @@ -149,6 +162,9 @@ class PoolingOp { void Forward(const OpContext& ctx, const TBlob& in_data, const OpReqType& req, const TBlob& out_data) { using namespace mshadow; + CHECK(param_.layout.value() == kNCW || + param_.layout.value() == kNCHW || + param_.layout.value() == kNCDHW) << "Need CuDNN for layout support"; Stream *s = ctx.get_stream(); const TShape& ishape = in_data.shape_; TShape kernel = param_.kernel; @@ -198,6 +214,9 @@ class PoolingOp { const TBlob& in_data, const TBlob& out_data, const OpReqType& req, const TBlob& in_grad) { using namespace mshadow; + CHECK(param_.layout.value() == kNCW || + param_.layout.value() == kNCHW || + param_.layout.value() == kNCDHW) << "Need CuDNN for layout support"; Stream *s = ctx.get_stream(); const TShape& ishape = in_data.shape_; TShape kernel = param_.kernel; diff --git a/src/operator/nn/pooling.cc b/src/operator/nn/pooling.cc index 611568807a9a..fc536078b8a4 100644 --- a/src/operator/nn/pooling.cc +++ b/src/operator/nn/pooling.cc @@ -40,9 +40,11 @@ void PoolingParamParser(nnvm::NodeAttrs *attrs) { PoolingParam param; param.Init(attrs->dict); if (param.kernel.ndim() == 1) { + param.layout = param.layout ? param.layout.value() : mshadow::kNCW; if (param.stride.ndim() == 0) param.stride = Shape1(1); if (param.pad.ndim() == 0) param.pad = Shape1(0); } else if (param.kernel.ndim() == 2) { + param.layout = param.layout ? param.layout.value() : mshadow::kNCHW; if (param.stride.ndim() == 0) param.stride = Shape2(1, 1); if (param.pad.ndim() == 0) param.pad = Shape2(0, 0); } else { @@ -51,6 +53,7 @@ void PoolingParamParser(nnvm::NodeAttrs *attrs) { CHECK_EQ(param.kernel.ndim(), 3U) << param.kernel.ndim() << "D pooling not supported"; } + param.layout = param.layout ? 
param.layout.value(): mshadow::kNCDHW; if (param.stride.ndim() == 0) param.stride = Shape3(1, 1, 1); if (param.pad.ndim() == 0) param.pad = Shape3(0, 0, 0); } @@ -111,38 +114,65 @@ static bool PoolingShape(const nnvm::NodeAttrs &attrs, << "Pooling: Input data should be 3D in (batch, channel, x)" << " Or 4D in (batch, channel, y, x) " << " Or 5D in (batch, channel, d, y, x)"; - TShape oshape = dshape; if (dshape.ndim() == 0) return false; if (param.global_pool) { - for (size_t i{2}; i < dshape.ndim(); i++) - oshape[i] = 1; - out_shape->clear(); - out_shape->push_back(oshape); // save output shape + TShape oshape = dshape; + size_t c_index = 0; + switch (param.layout.value()) { + case mshadow::kNCW: + case mshadow::kNCHW: + case mshadow::kNCDHW: + c_index = 1; + break; + case mshadow::kNWC: + case mshadow::kNHWC: + case mshadow::kNDHWC: + c_index = dshape.ndim() - 1; + break; + default: + LOG(FATAL) << "Unsupported tensor layout " << param.layout.value(); + } + for (size_t i{1}; i < dshape.ndim(); i++) + if (i != c_index) + oshape[i] = 1; + out_shape->clear(); + out_shape->push_back(oshape); // save output shape #if MXNET_USE_MKLDNN == 1 - if (MKLDNNRequireWorkspace(param) && SupportMKLDNNPooling(param)) + if (MKLDNNRequireWorkspace(param) && SupportMKLDNNPooling(param)) out_shape->push_back(oshape); // for workspace #endif + } else if (param.kernel.ndim() == 0) { + return false; } else if (param.kernel.ndim() == 1) { - CHECK_EQ(dshape.ndim(), 3U) - << "Pooling: Input data should be 3D in (batch, channel, x)"; - CHECK(param.kernel[0] <= dshape[2] + 2 * param.pad[0]) - << "kernel size (" << param.kernel[0] << ") exceeds input (" - << dshape[2] << " padded to " << (dshape[2] + 2 * param.pad[0]) - << ")"; + CHECK_EQ(dshape.ndim(), 3U) << + "Pooling: Input data should be 3D in (batch, channel, x)"; + CHECK(param.layout.value() == mshadow::kNCW || + param.layout.value() == mshadow::kNWC) << "Need 1D layout"; + // Perform shape calculations in a standard (NCW) layout space + mshadow::Shape<3> dshape_ncw = (param.layout.value() == mshadow::kNWC) ? + ConvertLayout(dshape.get<3>(), mshadow::kNWC, mshadow::kNCW) : + dshape.get<3>(); + mshadow::Shape<3> oshape_ncw = dshape_ncw; + CHECK(param.kernel[0] <= dshape_ncw[2] + 2 * param.pad[0]) + << "kernel size (" << param.kernel[0] << ") exceeds input (" << dshape[2] + << " padded to " << (dshape_ncw[2] + 2*param.pad[0]) << ")"; if (param.pooling_convention == pool_enum::kValid) { - oshape[2] = 1 + - (dshape[2] + 2 * param.pad[0] - param.kernel[0]) / - param.stride[0]; + oshape_ncw[2] = 1 + + (dshape_ncw[2] + 2 * param.pad[0] - param.kernel[0]) / + param.stride[0]; } else if (param.pooling_convention == pool_enum::kFull) { - oshape[2] = 1 + static_cast(std::ceil( - static_cast(dshape[2] + 2 * param.pad[0] - - param.kernel[0]) / - param.stride[0])); + oshape_ncw[2] = 1 + static_cast(std::ceil( + static_cast(dshape_ncw[2] + 2 * param.pad[0] - + param.kernel[0]) / + param.stride[0])); } else { - oshape[2] = static_cast(std::ceil( - static_cast(dshape[2] + 2 * param.pad[0]) / + oshape_ncw[2] = static_cast(std::ceil( + static_cast(dshape_ncw[2] + 2 * param.pad[0]) / param.stride[0])); } + // Convert back from standard (NCW) layout space to the actual layout type + TShape oshape = (param.layout.value() == mshadow::kNWC) ? 
+ ConvertLayout(oshape_ncw, mshadow::kNCW, mshadow::kNWC) : oshape_ncw; out_shape->clear(); out_shape->push_back(oshape); // save output shape #if MXNET_USE_MKLDNN == 1 @@ -150,33 +180,38 @@ static bool PoolingShape(const nnvm::NodeAttrs &attrs, out_shape->push_back(oshape); // for workspace #endif } else if (param.kernel.ndim() == 2) { - CHECK_EQ(dshape.ndim(), 4U) - << "Pooling: Input data should be 4D in (batch, channel, y, x)"; - CHECK(param.kernel[0] <= dshape[2] + 2 * param.pad[0]) - << "kernel size (" << param.kernel[0] << ") exceeds input (" - << dshape[2] << " padded to " << (dshape[2] + 2 * param.pad[0]) - << ")"; - CHECK(param.kernel[1] <= dshape[3] + 2 * param.pad[1]) - << "kernel size (" << param.kernel[1] << ") exceeds input (" - << dshape[3] << " padded to " << (dshape[3] + 2 * param.pad[1]) - << ")"; + CHECK_EQ(dshape.ndim(), 4U) << "Pooling: Input data should be 4D in (batch, channel, y, x)"; + CHECK(param.layout.value() == mshadow::kNCHW || + param.layout.value() == mshadow::kNHWC) << "Need 2D layout"; + // Perform shape calculations in a standard (NCHW) layout space + mshadow::Shape<4> dshape_nchw = (param.layout.value() == mshadow::kNHWC) ? + ConvertLayout(dshape.get<4>(), mshadow::kNHWC, mshadow::kNCHW) : + dshape.get<4>(); + mshadow::Shape<4> oshape_nchw = dshape_nchw; + CHECK(param.kernel[0] <= dshape_nchw[2] + 2 * param.pad[0]) + << "kernel size (" << param.kernel[0] << ") exceeds input (" << dshape_nchw[2] + << " padded to " << (dshape_nchw[2] + 2*param.pad[0]) << ")"; + CHECK(param.kernel[1] <= dshape_nchw[3] + 2 * param.pad[1]) + << "kernel size (" << param.kernel[1] << ") exceeds input (" << dshape_nchw[3] + << " padded to " << (dshape_nchw[3] + 2*param.pad[1]) << ")"; if (param.pooling_convention == pool_enum::kValid) { - oshape[2] = 1 + - (dshape[2] + 2 * param.pad[0] - param.kernel[0]) / - param.stride[0]; - oshape[3] = 1 + - (dshape[3] + 2 * param.pad[1] - param.kernel[1]) / - param.stride[1]; + oshape_nchw[2] = 1 + (dshape_nchw[2] + 2 * param.pad[0] - param.kernel[0]) / + param.stride[0]; + oshape_nchw[3] = 1 + (dshape_nchw[3] + 2 * param.pad[1] - param.kernel[1]) / + param.stride[1]; } else { - oshape[2] = 1 + static_cast(std::ceil( - static_cast(dshape[2] + 2 * param.pad[0] - - param.kernel[0]) / - param.stride[0])); - oshape[3] = 1 + static_cast(std::ceil( - static_cast(dshape[3] + 2 * param.pad[1] - - param.kernel[1]) / - param.stride[1])); + oshape_nchw[2] = 1 + static_cast(ceil( + static_cast(dshape_nchw[2] + 2 * param.pad[0] - + param.kernel[0]) / + param.stride[0])); + oshape_nchw[3] = 1 + static_cast(ceil( + static_cast(dshape_nchw[3] + 2 * param.pad[1] - + param.kernel[1]) / + param.stride[1])); } + // Convert back from standard (NCHW) layout space to the actual layout type + TShape oshape = (param.layout.value() == mshadow::kNHWC) ? 
+ ConvertLayout(oshape_nchw, mshadow::kNCHW, mshadow::kNHWC) : oshape_nchw; out_shape->clear(); out_shape->push_back(oshape); // save output shape #if MXNET_USE_MKLDNN == 1 @@ -185,38 +220,41 @@ static bool PoolingShape(const nnvm::NodeAttrs &attrs, #endif } else if (param.kernel.ndim() == 3) { CHECK_EQ(dshape.ndim(), 5U) - << "Pooling: Input data should be 5D in (batch, channel, d, y, x)"; - CHECK_LE(param.kernel[0], dshape[2] + 2 * param.pad[0]) - << "kernel size exceeds input"; - CHECK_LE(param.kernel[1], dshape[3] + 2 * param.pad[1]) - << "kernel size exceeds input"; - CHECK_LE(param.kernel[2], dshape[4] + 2 * param.pad[2]) - << "kernel size exceeds input"; + << "Pooling: Input data should be 5D in (batch, channel, d, y, x)"; + CHECK(param.layout.value() == mshadow::kNCDHW || + param.layout.value() == mshadow::kNDHWC) << "Need 3D layout"; + // Perform shape calculations in a standard (NCDHW) layout space + mshadow::Shape<5> dshape_ncdhw = (param.layout.value() == mshadow::kNDHWC) ? + ConvertLayout(dshape.get<5>(), mshadow::kNDHWC, mshadow::kNCDHW) : + dshape.get<5>(); + mshadow::Shape<5> oshape_ncdhw = dshape_ncdhw; + CHECK_LE(param.kernel[0], dshape_ncdhw[2] + 2 * param.pad[0]) << "kernel size exceeds input"; + CHECK_LE(param.kernel[1], dshape_ncdhw[3] + 2 * param.pad[1]) << "kernel size exceeds input"; + CHECK_LE(param.kernel[2], dshape_ncdhw[4] + 2 * param.pad[2]) << "kernel size exceeds input"; if (param.pooling_convention == pool_enum::kValid) { - oshape[2] = 1 + - (dshape[2] + 2 * param.pad[0] - param.kernel[0]) / - param.stride[0]; - oshape[3] = 1 + - (dshape[3] + 2 * param.pad[1] - param.kernel[1]) / - param.stride[1]; - oshape[4] = 1 + - (dshape[4] + 2 * param.pad[2] - param.kernel[2]) / - param.stride[2]; + oshape_ncdhw[2] = 1 + (dshape_ncdhw[2] + 2 * param.pad[0] - param.kernel[0]) / + param.stride[0]; + oshape_ncdhw[3] = 1 + (dshape_ncdhw[3] + 2 * param.pad[1] - param.kernel[1]) / + param.stride[1]; + oshape_ncdhw[4] = 1 + (dshape_ncdhw[4] + 2 * param.pad[2] - param.kernel[2]) / + param.stride[2]; } else { - oshape[2] = 1 + static_cast(std::ceil( - static_cast(dshape[2] + 2 * param.pad[0] - - param.kernel[0]) / - param.stride[0])); - oshape[3] = 1 + static_cast(std::ceil( - static_cast(dshape[3] + 2 * param.pad[1] - - param.kernel[1]) / - param.stride[1])); - oshape[4] = 1 + static_cast(std::ceil( - static_cast(dshape[4] + 2 * param.pad[2] - - param.kernel[2]) / - param.stride[2])); + oshape_ncdhw[2] = 1 + static_cast(ceil( + static_cast(dshape_ncdhw[2] + 2 * param.pad[0] - + param.kernel[0]) / + param.stride[0])); + oshape_ncdhw[3] = 1 + static_cast(ceil( + static_cast(dshape_ncdhw[3] + 2 * param.pad[1] - + param.kernel[1]) / + param.stride[1])); + oshape_ncdhw[4] = 1 + static_cast(ceil( + static_cast(dshape_ncdhw[4] + 2 * param.pad[2] - + param.kernel[2]) / + param.stride[2])); } - + // Convert back from standard (NCDHW) layout space to the actual layout type + TShape oshape = (param.layout.value() == mshadow::kNDHWC) ? 
+ ConvertLayout(oshape_ncdhw, mshadow::kNCDHW, mshadow::kNDHWC) : oshape_ncdhw; out_shape->clear(); out_shape->push_back(oshape); // save output shape #if MXNET_USE_MKLDNN == 1 @@ -224,6 +262,7 @@ static bool PoolingShape(const nnvm::NodeAttrs &attrs, out_shape->push_back(oshape); // for workspace #endif } + return true; } diff --git a/src/operator/nn/pooling.cu b/src/operator/nn/pooling.cu index 997218620c3a..b996ebec9b42 100644 --- a/src/operator/nn/pooling.cu +++ b/src/operator/nn/pooling.cu @@ -61,8 +61,10 @@ void PoolingCompute(const nnvm::NodeAttrs& attrs, switch (param.pool_type) { case pool_enum::kMaxPooling: case pool_enum::kAvgPooling: - GetCuDNNPoolingOp(param).Forward(ctx, inputs[0], req[0], outputs[0]); - return; + if (GetCuDNNPoolingOp(param).Forward(ctx, inputs[0], req[0], outputs[0])) { + return; + } + break; case pool_enum::kSumPooling: LOG(WARNING) << "Sum pooling is not supported by cudnn, MXNet sum pooling is applied."; break; @@ -116,10 +118,12 @@ void PoolingGradCompute(const nnvm::NodeAttrs& attrs, switch (param.pool_type) { case pool_enum::kMaxPooling: case pool_enum::kAvgPooling: - GetCuDNNPoolingOp(param).Backward(ctx, inputs[ograd_idx], + if (GetCuDNNPoolingOp(param).Backward(ctx, inputs[ograd_idx], inputs[in_data_idx], inputs[out_data_idx], - req[0], outputs[0]); - return; + req[0], outputs[0])) { + return; + } + break; case pool_enum::kSumPooling: LOG(WARNING) << "Sum pooling is not supported by cudnn, MXNet sum pooling is applied."; break; diff --git a/tests/python/gpu/test_operator_gpu.py b/tests/python/gpu/test_operator_gpu.py index 8054937a84c6..7baf26c9642e 100644 --- a/tests/python/gpu/test_operator_gpu.py +++ b/tests/python/gpu/test_operator_gpu.py @@ -608,6 +608,51 @@ def test_convolution_versions(): @with_seed() +def test_pooling_with_convention(): + # While the float32 and float64 output is reliably consistent, float16 departs occasionally. + # We compare cpu and gpu results only within a given precision. + for data_type in [np.float64, np.float32, np.float16]: + ctx_list = [{'ctx': mx.gpu(0), 'pool_data': (2, 2, 10, 10), 'type_dict': {'pool_data': data_type}}, + {'ctx': mx.cpu(0), 'pool_data': (2, 2, 10, 10), 'type_dict': {'pool_data': data_type}}] + sym = mx.sym.Pooling(kernel=(3,3), pool_type='max', pooling_convention='valid', name='pool') + check_consistency(sym, ctx_list) + + sym = mx.sym.Pooling(kernel=(3,3), pool_type='max', pooling_convention='full', name='pool') + check_consistency(sym, ctx_list) + + sym = mx.sym.Pooling(kernel=(300,300), pool_type='max', global_pool=True, name='pool') + check_consistency(sym, ctx_list) + + +@with_seed() +def test_pooling_nhwc_with_convention(): + def make_pooling_syms(**kwargs): + # Conventional NCHW layout pooling + sym = mx.sym.Pooling(**kwargs) + # NHWC pooling + data = mx.sym.Variable('pool_data') + sym_nhwc = mx.sym.transpose(data, axes=(0,2,3,1)) + sym_nhwc = mx.sym.Pooling(sym_nhwc, layout='NHWC', **kwargs) + sym_nhwc = mx.sym.transpose(sym_nhwc, axes=(0,3,1,2), name='pool') + return [sym, sym_nhwc] + + # While the float32 and float64 output is reliably consistent, float16 departs occasionally. + # We compare nhwc and nchw results only within a given precision. 
+ for in_shape in [(3, 4, 8, 8), (2, 2, 10, 10)]: + for data_type in [np.float64, np.float32, np.float16]: + # NHWC pooling is only enabled on GPU with CUDNN + ctx_list = [{'ctx': mx.gpu(0), 'pool_data': in_shape, 'type_dict': {'pool_data': data_type}}] + symlist = make_pooling_syms(kernel=(3,3), pool_type='max', pooling_convention='valid', name='pool') + check_consistency_NxM(symlist, ctx_list) + + symlist = make_pooling_syms(kernel=(3,3), pool_type='max', pooling_convention='full', name='pool') + check_consistency_NxM(symlist, ctx_list) + # CUDNN v7.1.4 can't handle all cases, and there's no NHWC MXNet fallback impl yet + if in_shape[2] <= 8 and in_shape[3] <= 8: + symlist = make_pooling_syms(kernel=(300,300), pool_type='max', global_pool=True, name='pool') + check_consistency_NxM(symlist, ctx_list) + + def test_pooling_with_type(): ctx_list = [{'ctx': mx.gpu(0), 'pool_data': (2, 2, 10, 10), 'type_dict': {'pool_data': np.float64}}, {'ctx': mx.gpu(0), 'pool_data': (2, 2, 10, 10), 'type_dict': {'pool_data': np.float32}}, @@ -768,26 +813,51 @@ def test_spatial_transformer_with_type(): check_consistency(sym, ctx_list) check_consistency(sym, ctx_list, grad_req="add") - @with_seed() def test_pooling_with_type2(): - ctx_list = [{'ctx': mx.gpu(0), 'pool_data': (10, 2, 10, 10), 'type_dict': {'pool_data': np.float64}}, - {'ctx': mx.gpu(0), 'pool_data': (10, 2, 10, 10), 'type_dict': {'pool_data': np.float32}}, - {'ctx': mx.gpu(0), 'pool_data': (10, 2, 10, 10), 'type_dict': {'pool_data': np.float16}}, - {'ctx': mx.cpu(0), 'pool_data': (10, 2, 10, 10), 'type_dict': {'pool_data': np.float64}}, - {'ctx': mx.cpu(0), 'pool_data': (10, 2, 10, 10), 'type_dict': {'pool_data': np.float32}}] + # While the float32 and float64 output is reliably consistent, float16 departs occasionally. + # We compare cpu and gpu results only within a given precision. 
+ for data_type in [np.float64, np.float32, np.float16]: + ctx_list = [{'ctx': mx.gpu(0), 'pool_data': (10, 2, 10, 10), 'type_dict': {'pool_data': data_type}}, + {'ctx': mx.cpu(0), 'pool_data': (10, 2, 10, 10), 'type_dict': {'pool_data': data_type}}] - sym = mx.sym.Pooling(name='pool', kernel=(3,3), stride=(2,2), pool_type='max') - check_consistency(sym, ctx_list, rand_type=np.float16) + sym = mx.sym.Pooling(name='pool', kernel=(3,3), stride=(2,2), pool_type='max') + check_consistency(sym, ctx_list) - sym = mx.sym.Pooling(name='pool', kernel=(3,3), pad=(1,1), pool_type='avg') - check_consistency(sym, ctx_list) + sym = mx.sym.Pooling(name='pool', kernel=(3,3), pad=(1,1), pool_type='avg') + check_consistency(sym, ctx_list) - sym = mx.sym.Pooling(name='pool', kernel=(5,5), pad=(2,2), pool_type='max') - check_consistency(sym, ctx_list, rand_type=np.float16) + sym = mx.sym.Pooling(name='pool', kernel=(5,5), pad=(2,2), pool_type='max') + check_consistency(sym, ctx_list) - sym = mx.sym.Pooling(name='pool', kernel=(3,3), pad=(1,1), pool_type='sum') - check_consistency(sym, ctx_list) + sym = mx.sym.Pooling(name='pool', kernel=(3,3), pad=(1,1), pool_type='sum') + check_consistency(sym, ctx_list) + +@with_seed() +def test_pooling_nhwc_with_type(): + def make_pooling_syms(**kwargs): + # Conventional NCHW layout pooling + sym = mx.sym.Pooling(**kwargs) + # NHWC pooling + data = mx.sym.Variable('pool_data') + sym_nhwc = mx.sym.transpose(data, axes=(0,2,3,1)) + sym_nhwc = mx.sym.Pooling(sym_nhwc, layout='NHWC', **kwargs) + sym_nhwc = mx.sym.transpose(sym_nhwc, axes=(0,3,1,2), name='pool') + return [sym, sym_nhwc] + + # While the float32 and float64 output is reliably consistent, float16 departs occasionally. + # We compare nhwc and nchw results only within a given precision. + for data_type in [np.float64, np.float32, np.float16]: + # NHWC pooling only enabled on GPU with CUDNN + ctx_list = [{'ctx': mx.gpu(0), 'pool_data': (10, 2, 10, 10), 'type_dict': {'pool_data': data_type}}] + symlist = make_pooling_syms(name='pool', kernel=(3,3), stride=(2,2), pool_type='max') + check_consistency_NxM(symlist, ctx_list) + + symlist = make_pooling_syms(name='pool', kernel=(3,3), pad=(1,1), pool_type='avg') + check_consistency_NxM(symlist, ctx_list) + + symlist = make_pooling_syms(name='pool', kernel=(5,5), pad=(2,2), pool_type='max') + check_consistency_NxM(symlist, ctx_list) @unittest.skip("Flaky test https://github.com/apache/incubator-mxnet/issues/11517") @with_seed() From 55ebfb7318f103b4a0e14505fba7005880871787 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Wed, 21 Nov 2018 13:26:42 -0800 Subject: [PATCH 02/32] Docs changes --- python/mxnet/gluon/nn/conv_layers.py | 30 ++++++++++++++++++---------- src/operator/nn/pooling.cc | 15 +++++++++----- 2 files changed, 30 insertions(+), 15 deletions(-) diff --git a/python/mxnet/gluon/nn/conv_layers.py b/python/mxnet/gluon/nn/conv_layers.py index 57241967a263..0c39bb7749b1 100644 --- a/python/mxnet/gluon/nn/conv_layers.py +++ b/python/mxnet/gluon/nn/conv_layers.py @@ -718,7 +718,8 @@ class MaxPool1D(_Pooling): If padding is non-zero, then the input is implicitly zero-padded on both sides for padding number of points. layout : str, default 'NCW' - Dimension ordering of data and weight. Only supports 'NCW' layout for now. + Dimension ordering of data and weight. Only supports 'NCW' and 'NWC' + (only with cuDNN) layouts for now. 'N', 'C', 'W' stands for batch, channel, and width (time) dimensions respectively. Pooling is applied on the W dimension. 
ceil_mode : bool, default False @@ -764,7 +765,8 @@ class MaxPool2D(_Pooling): If padding is non-zero, then the input is implicitly zero-padded on both sides for padding number of points. layout : str, default 'NCHW' - Dimension ordering of data and weight. Only supports 'NCHW' layout for now. + Dimension ordering of data and weight. Only supports 'NCHW' and 'NHWC' + (only with cuDNN) layouts for now. 'N', 'C', 'H', 'W' stands for batch, channel, height, and width dimensions respectively. padding is applied on 'H' and 'W' dimension. ceil_mode : bool, default False @@ -813,7 +815,8 @@ class MaxPool3D(_Pooling): If padding is non-zero, then the input is implicitly zero-padded on both sides for padding number of points. layout : str, default 'NCDHW' - Dimension ordering of data and weight. Only supports 'NCDHW' layout for now. + Dimension ordering of data and weight. Only supports 'NCDHW' and 'NDHWC' + (only with cuDNN) layouts for now. 'N', 'C', 'H', 'W', 'D' stands for batch, channel, height, width and depth dimensions respectively. padding is applied on 'D', 'H' and 'W' dimension. @@ -863,7 +866,8 @@ class AvgPool1D(_Pooling): If padding is non-zero, then the input is implicitly zero-padded on both sides for padding number of points. layout : str, default 'NCW' - Dimension ordering of data and weight. Only supports 'NCW' layout for now. + Dimension ordering of data and weight. Only supports 'NCW' or 'NWC' + (only with cuDNN) layouts for now. 'N', 'C', 'W' stands for batch, channel, and width (time) dimensions respectively. padding is applied on 'W' dimension. ceil_mode : bool, default False @@ -911,7 +915,8 @@ class AvgPool2D(_Pooling): If padding is non-zero, then the input is implicitly zero-padded on both sides for padding number of points. layout : str, default 'NCHW' - Dimension ordering of data and weight. Only supports 'NCHW' layout for now. + Dimension ordering of data and weight. Only supports 'NCHW' or 'NHWC' + (only with cuDNN) layouts for now. 'N', 'C', 'H', 'W' stands for batch, channel, height, and width dimensions respectively. padding is applied on 'H' and 'W' dimension. ceil_mode : bool, default False @@ -1008,7 +1013,8 @@ class GlobalMaxPool1D(_Pooling): Parameters ---------- layout : str, default 'NCW' - Dimension ordering of data and weight. Only supports 'NCW' layout for now. + Dimension ordering of data and weight. Only supports 'NCW' or 'NWC' + (only with cuDNN) layouts for now. 'N', 'C', 'W' stands for batch, channel, and width (time) dimensions respectively. Pooling is applied on the W dimension. @@ -1035,7 +1041,8 @@ class GlobalMaxPool2D(_Pooling): Parameters ---------- layout : str, default 'NCHW' - Dimension ordering of data and weight. Only supports 'NCHW' layout for now. + Dimension ordering of data and weight. Only supports 'NCHW' or 'NHWC' + (only with cuDNN) layouts for now. 'N', 'C', 'H', 'W' stands for batch, channel, height, and width dimensions respectively. padding is applied on 'H' and 'W' dimension. @@ -1063,7 +1070,8 @@ class GlobalMaxPool3D(_Pooling): Parameters ---------- layout : str, default 'NCDHW' - Dimension ordering of data and weight. Only supports 'NCDHW' layout for now. + Dimension ordering of data and weight. Only supports 'NCDHW' or 'NDHWC' + (only with cuDNN) layouts for now. 'N', 'C', 'H', 'W', 'D' stands for batch, channel, height, width and depth dimensions respectively. padding is applied on 'D', 'H' and 'W' dimension. 
@@ -1091,7 +1099,8 @@ class GlobalAvgPool1D(_Pooling): Parameters ---------- layout : str, default 'NCW' - Dimension ordering of data and weight. Only supports 'NCW' layout for now. + Dimension ordering of data and weight. Only supports 'NCW' or 'NWC' + (only with cuDNN) layouts for now. 'N', 'C', 'W' stands for batch, channel, and width (time) dimensions respectively. padding is applied on 'W' dimension. @@ -1116,7 +1125,8 @@ class GlobalAvgPool2D(_Pooling): Parameters ---------- layout : str, default 'NCHW' - Dimension ordering of data and weight. Only supports 'NCHW' layout for now. + Dimension ordering of data and weight. Only supports 'NCHW' or 'NHWC' + (only with cuDNN) layouts for now. 'N', 'C', 'H', 'W' stands for batch, channel, height, and width dimensions respectively. diff --git a/src/operator/nn/pooling.cc b/src/operator/nn/pooling.cc index fc536078b8a4..1024f110492b 100644 --- a/src/operator/nn/pooling.cc +++ b/src/operator/nn/pooling.cc @@ -370,13 +370,17 @@ NNVM_REGISTER_OP(Pooling) The shapes for 1-D pooling are -- **data**: *(batch_size, channel, width)*, -- **out**: *(batch_size, num_filter, out_width)*. +- **data**: *(batch_size, channel, width)* (NCW layout) or + *(batch_size, width, channel)* (NWC layout, cuDNN only), +- **out**: *(batch_size, num_filter, out_width)* (NCW layout) or + *(batch_size, width, channel)* (NWC layout, cuDNN only). The shapes for 2-D pooling are -- **data**: *(batch_size, channel, height, width)* -- **out**: *(batch_size, num_filter, out_height, out_width)*, with:: +- **data**: *(batch_size, channel, height, width)* (NCHW layout) pr + *(batch_size, height, width, channel)* (NHWC layout, cuDNN only), +- **out**: *(batch_size, num_filter, out_height, out_width)* (NCHW layout) or + *(batch_size, out_height, out_width, num_filter)* (NHWC layout, cuDNN only), with:: out_height = f(height, kernel[0], pad[0], stride[0]) out_width = f(width, kernel[1], pad[1], stride[1]) @@ -403,7 +407,8 @@ Three pooling options are supported by ``pool_type``: For 3-D pooling, an additional *depth* dimension is added before *height*. Namely the input data will have shape *(batch_size, channel, depth, -height, width)*. +height, width)* (NCDHW) or *(batch_size, depth, height, width, channel)* (NDHWC, +cuDNN only). Notes on Lp pooling: From 79e07fd5ea51fb814c23bf58484868be9491d233 Mon Sep 17 00:00:00 2001 From: Marco de Abreu Date: Fri, 23 Nov 2018 22:32:30 +0100 Subject: [PATCH 03/32] Trigger --- src/operator/nn/cudnn/cudnn_pooling-inl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/operator/nn/cudnn/cudnn_pooling-inl.h b/src/operator/nn/cudnn/cudnn_pooling-inl.h index 0d0bd0a89913..a97466e82fa1 100644 --- a/src/operator/nn/cudnn/cudnn_pooling-inl.h +++ b/src/operator/nn/cudnn/cudnn_pooling-inl.h @@ -73,7 +73,7 @@ class CuDNNPoolingOp { CUDNN_CALL(cudnnDestroyPoolingDescriptor(pooling_desc_)); } - // Return boolean saying whether pooling configuration is supported + // Return boolean saying whether pooling configuration is supported. 
bool Forward(const OpContext &ctx, const TBlob &in_data, const OpReqType &req, const TBlob &out_data) { using namespace mshadow; From 4e0d0e9394966bbe67e6a5748c91c28056c32a86 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Mon, 26 Nov 2018 14:17:40 -0800 Subject: [PATCH 04/32] Skip NHWC pooling tests on non-cuDNN platforms --- tests/python/gpu/test_operator_gpu.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/python/gpu/test_operator_gpu.py b/tests/python/gpu/test_operator_gpu.py index 7baf26c9642e..ba81ead671ce 100644 --- a/tests/python/gpu/test_operator_gpu.py +++ b/tests/python/gpu/test_operator_gpu.py @@ -625,6 +625,7 @@ def test_pooling_with_convention(): @with_seed() +@assert_raises_cudnn_not_satisfied(min_version='7.0.1') def test_pooling_nhwc_with_convention(): def make_pooling_syms(**kwargs): # Conventional NCHW layout pooling @@ -834,6 +835,7 @@ def test_pooling_with_type2(): check_consistency(sym, ctx_list) @with_seed() +@assert_raises_cudnn_not_satisfied(min_version='7.0.1') def test_pooling_nhwc_with_type(): def make_pooling_syms(**kwargs): # Conventional NCHW layout pooling From c7e06a952f7f340fe89373b0176f09ced70adf9c Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Mon, 3 Dec 2018 14:53:07 -0800 Subject: [PATCH 05/32] Fix pylint NHWC pooling --- python/mxnet/gluon/nn/conv_layers.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/python/mxnet/gluon/nn/conv_layers.py b/python/mxnet/gluon/nn/conv_layers.py index 0c39bb7749b1..3d2221cf1488 100644 --- a/python/mxnet/gluon/nn/conv_layers.py +++ b/python/mxnet/gluon/nn/conv_layers.py @@ -741,7 +741,7 @@ class MaxPool1D(_Pooling): """ def __init__(self, pool_size=2, strides=None, padding=0, layout='NCW', ceil_mode=False, **kwargs): - assert layout == 'NCW' or layout == 'NWC',\ + assert layout in ('NCW', 'NWC'),\ "Only NCW and NWC layouts are valid for 1D" if isinstance(pool_size, numeric_types): pool_size = (pool_size,) @@ -791,7 +791,7 @@ class MaxPool2D(_Pooling): """ def __init__(self, pool_size=(2, 2), strides=None, padding=0, layout='NCHW', ceil_mode=False, **kwargs): - assert layout == 'NCHW' or layout == 'NHWC',\ + assert layout in ('NCHW', 'NHWC'),\ "Only NCHW and NHWC layouts are valid for 2D" if isinstance(pool_size, numeric_types): pool_size = (pool_size,)*2 @@ -843,7 +843,7 @@ class MaxPool3D(_Pooling): """ def __init__(self, pool_size=(2, 2, 2), strides=None, padding=0, ceil_mode=False, layout='NCDHW', **kwargs): - assert layout == 'NCDHW' or layout == 'NDHWC',\ + assert layout in ('NCDHW', 'NDHWC'),\ "Only NCDHW and NDHWC layouts are valid for 3D" if isinstance(pool_size, numeric_types): pool_size = (pool_size,)*3 @@ -891,7 +891,7 @@ class AvgPool1D(_Pooling): """ def __init__(self, pool_size=2, strides=None, padding=0, layout='NCW', ceil_mode=False, count_include_pad=True, **kwargs): - assert layout == 'NCW' or layout == 'NWC',\ + assert layout in ('NCW', 'NWC'),\ "Only NCW and NWC layouts are valid for 1D" if isinstance(pool_size, numeric_types): pool_size = (pool_size,) @@ -943,7 +943,7 @@ class AvgPool2D(_Pooling): """ def __init__(self, pool_size=(2, 2), strides=None, padding=0, ceil_mode=False, layout='NCHW', count_include_pad=True, **kwargs): - assert layout == 'NCHW' or layout == 'NHWC',\ + assert layout in ('NCHW', 'NHWC'),\ "Only NCHW and NHWC layouts are valid for 2D" if isinstance(pool_size, numeric_types): pool_size = (pool_size,)*2 @@ -996,7 +996,7 @@ class AvgPool3D(_Pooling): """ def __init__(self, pool_size=(2, 2, 2), strides=None, 
padding=0, ceil_mode=False, layout='NCDHW', count_include_pad=True, **kwargs): - assert layout == 'NCDHW' or layout == 'NDHWC',\ + assert layout in ('NCDHW', 'NDHWC'),\ "Only NCDHW and NDHWC layouts are valid for 3D" if isinstance(pool_size, numeric_types): pool_size = (pool_size,)*3 @@ -1028,7 +1028,7 @@ class GlobalMaxPool1D(_Pooling): when `layout` is `NCW`. """ def __init__(self, layout='NCW', **kwargs): - assert layout == 'NCW' or layout == 'NWC',\ + assert layout in ('NCW', 'NWC'),\ "Only NCW and NWC layouts are valid for 1D" super(GlobalMaxPool1D, self).__init__( (1,), None, 0, True, True, 'max', layout, **kwargs) @@ -1057,7 +1057,7 @@ class GlobalMaxPool2D(_Pooling): `(batch_size, channels, 1, 1)` when `layout` is `NCHW`. """ def __init__(self, layout='NCHW', **kwargs): - assert layout == 'NCHW' or layout == 'NHWC',\ + assert layout in ('NCHW', 'NHWC'),\ "Only NCHW and NHWC layouts are valid for 2D" super(GlobalMaxPool2D, self).__init__( (1, 1), None, 0, True, True, 'max', layout, **kwargs) @@ -1087,7 +1087,7 @@ class GlobalMaxPool3D(_Pooling): `(batch_size, channels, 1, 1, 1)` when `layout` is `NCDHW`. """ def __init__(self, layout='NCDHW', **kwargs): - assert layout == 'NCDHW' or layout == 'NDHWC',\ + assert layout in ('NCDHW', 'NDHWC'),\ "Only NCDHW and NDHWC layouts are valid for 3D" super(GlobalMaxPool3D, self).__init__( (1, 1, 1), None, 0, True, True, 'max', layout, **kwargs) @@ -1113,7 +1113,7 @@ class GlobalAvgPool1D(_Pooling): - **out**: 3D output tensor with shape `(batch_size, channels, 1)`. """ def __init__(self, layout='NCW', **kwargs): - assert layout == 'NCW' or layout == 'NWC',\ + assert layout in ('NCW', 'NWC'),\ "Only NCW and NWC layouts are valid for 1D" super(GlobalAvgPool1D, self).__init__( (1,), None, 0, True, True, 'avg', layout, **kwargs) @@ -1141,7 +1141,7 @@ class GlobalAvgPool2D(_Pooling): `(batch_size, channels, 1, 1)` when `layout` is `NCHW`. """ def __init__(self, layout='NCHW', **kwargs): - assert layout == 'NCHW' or layout == 'NHWC',\ + assert layout in ('NCHW', 'NHWC'),\ "Only NCHW and NHWC layouts are valid for 2D" super(GlobalAvgPool2D, self).__init__( (1, 1), None, 0, True, True, 'avg', layout, **kwargs) @@ -1169,7 +1169,7 @@ class GlobalAvgPool3D(_Pooling): `(batch_size, channels, 1, 1, 1)` when `layout` is `NCDHW`. 
""" def __init__(self, layout='NCDHW', **kwargs): - assert layout == 'NCDHW' or layout == 'NDHWC',\ + assert layout in ('NCDHW', 'NDHWC'),\ "Only NCDHW and NDHWC layouts are valid for 3D" super(GlobalAvgPool3D, self).__init__( (1, 1, 1), None, 0, True, True, 'avg', layout, **kwargs) From 7fcbdac1281b8ff819e2c173b124dec9a3365e20 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Wed, 5 Dec 2018 15:40:47 -0800 Subject: [PATCH 06/32] Fixes from review --- python/mxnet/gluon/nn/conv_layers.py | 24 +++++++++++------------ src/operator/nn/cudnn/cudnn_pooling-inl.h | 4 +++- 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/python/mxnet/gluon/nn/conv_layers.py b/python/mxnet/gluon/nn/conv_layers.py index 3d2221cf1488..427ceb3bbf9c 100644 --- a/python/mxnet/gluon/nn/conv_layers.py +++ b/python/mxnet/gluon/nn/conv_layers.py @@ -742,7 +742,7 @@ class MaxPool1D(_Pooling): def __init__(self, pool_size=2, strides=None, padding=0, layout='NCW', ceil_mode=False, **kwargs): assert layout in ('NCW', 'NWC'),\ - "Only NCW and NWC layouts are valid for 1D" + "Only NCW and NWC layouts are valid for 1D Pooling" if isinstance(pool_size, numeric_types): pool_size = (pool_size,) assert len(pool_size) == 1, "pool_size must be a number or a list of 1 ints" @@ -792,7 +792,7 @@ class MaxPool2D(_Pooling): def __init__(self, pool_size=(2, 2), strides=None, padding=0, layout='NCHW', ceil_mode=False, **kwargs): assert layout in ('NCHW', 'NHWC'),\ - "Only NCHW and NHWC layouts are valid for 2D" + "Only NCHW and NHWC layouts are valid for 2D Pooling" if isinstance(pool_size, numeric_types): pool_size = (pool_size,)*2 assert len(pool_size) == 2, "pool_size must be a number or a list of 2 ints" @@ -844,7 +844,7 @@ class MaxPool3D(_Pooling): def __init__(self, pool_size=(2, 2, 2), strides=None, padding=0, ceil_mode=False, layout='NCDHW', **kwargs): assert layout in ('NCDHW', 'NDHWC'),\ - "Only NCDHW and NDHWC layouts are valid for 3D" + "Only NCDHW and NDHWC layouts are valid for 3D Pooling" if isinstance(pool_size, numeric_types): pool_size = (pool_size,)*3 assert len(pool_size) == 3, "pool_size must be a number or a list of 3 ints" @@ -892,7 +892,7 @@ class AvgPool1D(_Pooling): def __init__(self, pool_size=2, strides=None, padding=0, layout='NCW', ceil_mode=False, count_include_pad=True, **kwargs): assert layout in ('NCW', 'NWC'),\ - "Only NCW and NWC layouts are valid for 1D" + "Only NCW and NWC layouts are valid for 1D Pooling" if isinstance(pool_size, numeric_types): pool_size = (pool_size,) assert len(pool_size) == 1, "pool_size must be a number or a list of 1 ints" @@ -944,7 +944,7 @@ class AvgPool2D(_Pooling): def __init__(self, pool_size=(2, 2), strides=None, padding=0, ceil_mode=False, layout='NCHW', count_include_pad=True, **kwargs): assert layout in ('NCHW', 'NHWC'),\ - "Only NCHW and NHWC layouts are valid for 2D" + "Only NCHW and NHWC layouts are valid for 2D Pooling" if isinstance(pool_size, numeric_types): pool_size = (pool_size,)*2 assert len(pool_size) == 2, "pool_size must be a number or a list of 2 ints" @@ -997,7 +997,7 @@ class AvgPool3D(_Pooling): def __init__(self, pool_size=(2, 2, 2), strides=None, padding=0, ceil_mode=False, layout='NCDHW', count_include_pad=True, **kwargs): assert layout in ('NCDHW', 'NDHWC'),\ - "Only NCDHW and NDHWC layouts are valid for 3D" + "Only NCDHW and NDHWC layouts are valid for 3D Pooling" if isinstance(pool_size, numeric_types): pool_size = (pool_size,)*3 assert len(pool_size) == 3, "pool_size must be a number or a list of 3 ints" @@ -1029,7 +1029,7 @@ class 
GlobalMaxPool1D(_Pooling): """ def __init__(self, layout='NCW', **kwargs): assert layout in ('NCW', 'NWC'),\ - "Only NCW and NWC layouts are valid for 1D" + "Only NCW and NWC layouts are valid for 1D Pooling" super(GlobalMaxPool1D, self).__init__( (1,), None, 0, True, True, 'max', layout, **kwargs) @@ -1058,7 +1058,7 @@ class GlobalMaxPool2D(_Pooling): """ def __init__(self, layout='NCHW', **kwargs): assert layout in ('NCHW', 'NHWC'),\ - "Only NCHW and NHWC layouts are valid for 2D" + "Only NCHW and NHWC layouts are valid for 2D Pooling" super(GlobalMaxPool2D, self).__init__( (1, 1), None, 0, True, True, 'max', layout, **kwargs) @@ -1088,7 +1088,7 @@ class GlobalMaxPool3D(_Pooling): """ def __init__(self, layout='NCDHW', **kwargs): assert layout in ('NCDHW', 'NDHWC'),\ - "Only NCDHW and NDHWC layouts are valid for 3D" + "Only NCDHW and NDHWC layouts are valid for 3D Pooling" super(GlobalMaxPool3D, self).__init__( (1, 1, 1), None, 0, True, True, 'max', layout, **kwargs) @@ -1114,7 +1114,7 @@ class GlobalAvgPool1D(_Pooling): """ def __init__(self, layout='NCW', **kwargs): assert layout in ('NCW', 'NWC'),\ - "Only NCW and NWC layouts are valid for 1D" + "Only NCW and NWC layouts are valid for 1D Pooling" super(GlobalAvgPool1D, self).__init__( (1,), None, 0, True, True, 'avg', layout, **kwargs) @@ -1142,7 +1142,7 @@ class GlobalAvgPool2D(_Pooling): """ def __init__(self, layout='NCHW', **kwargs): assert layout in ('NCHW', 'NHWC'),\ - "Only NCHW and NHWC layouts are valid for 2D" + "Only NCHW and NHWC layouts are valid for 2D Pooling" super(GlobalAvgPool2D, self).__init__( (1, 1), None, 0, True, True, 'avg', layout, **kwargs) @@ -1170,7 +1170,7 @@ class GlobalAvgPool3D(_Pooling): """ def __init__(self, layout='NCDHW', **kwargs): assert layout in ('NCDHW', 'NDHWC'),\ - "Only NCDHW and NDHWC layouts are valid for 3D" + "Only NCDHW and NDHWC layouts are valid for 3D Pooling" super(GlobalAvgPool3D, self).__init__( (1, 1, 1), None, 0, True, True, 'avg', layout, **kwargs) diff --git a/src/operator/nn/cudnn/cudnn_pooling-inl.h b/src/operator/nn/cudnn/cudnn_pooling-inl.h index a97466e82fa1..dcd9546c8079 100644 --- a/src/operator/nn/cudnn/cudnn_pooling-inl.h +++ b/src/operator/nn/cudnn/cudnn_pooling-inl.h @@ -63,7 +63,7 @@ class CuDNNPoolingOp { } break; default: - LOG(FATAL) << "Not implmented"; + LOG(FATAL) << "Not implemented"; } } @@ -215,6 +215,8 @@ class CuDNNPoolingOp { int window_height = param_.global_pool ? dshape_nchw[2] : param_.kernel[0]; int window_width = param_.global_pool ? dshape_nchw[3] : param_.kernel[1]; // CuDNN v7.1.4 backprop kernel doesn't support window sizes 9 and above. + // For reference see Fixed Issues section in + // https://docs.nvidia.com/deeplearning/sdk/cudnn-release-notes/rel_721.html#rel_721 #if CUDNN_VERSION == 7104 is_supported = window_height <= 8 && window_width <= 8; #endif From 1d19ae50516ede379f59aadc25fb8e28e111d7ed Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Thu, 6 Dec 2018 17:53:13 -0800 Subject: [PATCH 07/32] Add CuDNNPoolingOp::Supports() in place of Forward()/Backward() bool return. 
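The static Supports() predicate introduced below lets PoolingCompute/PoolingGradCompute decide up front whether a configuration can be handed to cuDNN, and otherwise fall through to the native MXNet kernels, instead of reporting failure from inside Forward()/Backward(). A rough Python restatement of the gating rules, for illustration only — the helper name and arguments are hypothetical, and the authoritative check is the C++ CuDNNPoolingOp::Supports() added in this patch:

    def cudnn_pooling_supported(pool_type, layout, window, kernel_ndim, cudnn_version):
        """Sketch of when pooling is dispatched to cuDNN.

        `window` is the effective pooling window (the input spatial extent when
        global_pool is set); `cudnn_version` follows cuDNN's integer scheme, e.g.
        7104 for v7.1.4.
        """
        if pool_type not in ('max', 'avg'):
            return False                  # sum/lp pooling always use the MXNet kernels
        if kernel_ndim == 2:
            if layout not in ('NCHW', 'NHWC'):
                return False
            # cuDNN 7.1.4's pooling backprop kernel only handles window sizes up to 8
            if cudnn_version == 7104 and max(window) > 8:
                return False
            return True
        if kernel_ndim == 3:
            # 3-D pooling needs cuDNN v5+ and, for now, the NCDHW layout
            return cudnn_version >= 5000 and layout == 'NCDHW'
        return False                      # other kernel ranks fall back to MXNet kernels
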
--- src/operator/nn/cudnn/cudnn_pooling-inl.h | 70 ++++++++++++++++++++--- src/operator/nn/pooling.cu | 38 +++--------- 2 files changed, 72 insertions(+), 36 deletions(-) diff --git a/src/operator/nn/cudnn/cudnn_pooling-inl.h b/src/operator/nn/cudnn/cudnn_pooling-inl.h index dcd9546c8079..7579c1b38204 100644 --- a/src/operator/nn/cudnn/cudnn_pooling-inl.h +++ b/src/operator/nn/cudnn/cudnn_pooling-inl.h @@ -73,8 +73,7 @@ class CuDNNPoolingOp { CUDNN_CALL(cudnnDestroyPoolingDescriptor(pooling_desc_)); } - // Return boolean saying whether pooling configuration is supported. - bool Forward(const OpContext &ctx, const TBlob &in_data, + void Forward(const OpContext &ctx, const TBlob &in_data, const OpReqType &req, const TBlob &out_data) { using namespace mshadow; using namespace mshadow::expr; @@ -83,7 +82,7 @@ class CuDNNPoolingOp { typename DataType::ScaleType alpha = 1.0f; typename DataType::ScaleType beta = 0.0f; if (!this->Init(s, in_data, out_data)) - return false; + LOG(FATAL) << "CuDNN Pooling invoked with unsupported parameters."; if (param_.kernel.ndim() == 2) { // 2d pool Tensor data = in_data.get(s); @@ -115,11 +114,9 @@ class CuDNNPoolingOp { } else { LOG(FATAL) << "Only support 2D or 3D pooling"; } - return true; } - // Return boolean saying whether pooling configuration is supported - bool Backward(const OpContext &ctx, const TBlob &out_grad, + void Backward(const OpContext &ctx, const TBlob &out_grad, const TBlob &in_data, const TBlob &out_data, const OpReqType &req, const TBlob &in_grad) { using namespace mshadow; @@ -130,7 +127,7 @@ class CuDNNPoolingOp { typename DataType::ScaleType alpha = 1.0f; typename DataType::ScaleType beta = 0.0f; if (!this->Init(s, in_data, out_data)) - return false; + LOG(FATAL) << "CuDNN Pooling invoked with unsupported parameters."; if (param_.kernel.ndim() == 2) { // 2d pool Tensor m_out_grad = out_grad.get(s); @@ -170,6 +167,65 @@ class CuDNNPoolingOp { } else { LOG(FATAL) << "Only support 2D or 3D pooling"; } + } + +/*! + * \brief Returns whether the cuDNN library version supports the pooling operation + * described by `param`: cuDNN v5 and earlier does not support 3D pooling for example. + * CuDNN v7.1.4 backprop kernel doesn't support window sizes 9 and above. + */ + static bool Supports(const PoolingParam ¶m, const TBlob& input) { + using namespace mshadow; + static bool sum_pooling_warning_issued = false; + static bool lp_pooling_warning_issued = false; + + switch (param.pool_type) { + case pool_enum::kMaxPooling: + case pool_enum::kAvgPooling: + break; + case pool_enum::kSumPooling: + if (!sum_pooling_warning_issued) { + sum_pooling_warning_issued = true; + LOG(WARNING) << "Sum pooling is not supported by cudnn, MXNet sum pooling is applied."; + } + return false; + case pool_enum::kLpPooling: + if (!lp_pooling_warning_issued) { + lp_pooling_warning_issued = true; + LOG(WARNING) << "Lp pooling is not supported by cudnn, MXNet Lp pooling is applied."; + } + return false; + default: + return false; + } + + if (param.kernel.ndim() == 2) { + // 2d conv + if (param.layout.value() != mshadow::kNCHW && param.layout.value() != mshadow::kNHWC) + return false; +#if CUDNN_VERSION == 7104 + // CuDNN v7.1.4 backprop kernel doesn't support window sizes 9 and above. + // Perform shape calculations in a standard (NCHW) layout space + mshadow::Shape<4> input_shape = input.shape_.get<4>(); + mshadow::Shape<4> dshape_nchw = (param.layout.value() == mshadow::kNHWC) ? 
+ ConvertLayout(input_shape, mshadow::kNHWC, mshadow::kNCHW) : + input_shape; + int window_height = param.global_pool ? dshape_nchw[2] : param.kernel[0]; + int window_width = param.global_pool ? dshape_nchw[3] : param.kernel[1]; + if (window_height > 8 || window_width > 8) + return false; +#endif + } else if (param.kernel.ndim() == 3) { +#if CUDNN_MAJOR < 5 + return false; +#endif + if (param.layout.value() != mshadow::kNCDHW) + return false; + } else { + // Unsupported kernel dim + return false; + } + return true; } diff --git a/src/operator/nn/pooling.cu b/src/operator/nn/pooling.cu index b996ebec9b42..84cacc15e239 100644 --- a/src/operator/nn/pooling.cu +++ b/src/operator/nn/pooling.cu @@ -56,21 +56,11 @@ void PoolingCompute(const nnvm::NodeAttrs& attrs, CHECK_EQ(outputs.size(), GetNumOutputs(param)); #if MXNET_USE_CUDNN == 1 - if (!param.cudnn_off && param.kernel.ndim() > 1) { + if (!param.cudnn_off) { MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, { - switch (param.pool_type) { - case pool_enum::kMaxPooling: - case pool_enum::kAvgPooling: - if (GetCuDNNPoolingOp(param).Forward(ctx, inputs[0], req[0], outputs[0])) { - return; - } - break; - case pool_enum::kSumPooling: - LOG(WARNING) << "Sum pooling is not supported by cudnn, MXNet sum pooling is applied."; - break; - case pool_enum::kLpPooling: - LOG(WARNING) << "Lp pooling is not supported by cudnn, MXNet lp pooling is applied."; - break; + if (CuDNNPoolingOp::Supports(param, inputs[0])) { + GetCuDNNPoolingOp(param).Forward(ctx, inputs[0], req[0], outputs[0]); + return; } }); } @@ -113,23 +103,13 @@ void PoolingGradCompute(const nnvm::NodeAttrs& attrs, } #if MXNET_USE_CUDNN == 1 - if (!param.cudnn_off && param.kernel.ndim() > 1) { + if (!param.cudnn_off) { MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, { - switch (param.pool_type) { - case pool_enum::kMaxPooling: - case pool_enum::kAvgPooling: - if (GetCuDNNPoolingOp(param).Backward(ctx, inputs[ograd_idx], + if (CuDNNPoolingOp::Supports(param, inputs[in_data_idx])) { + GetCuDNNPoolingOp(param).Backward(ctx, inputs[ograd_idx], inputs[in_data_idx], inputs[out_data_idx], - req[0], outputs[0])) { - return; - } - break; - case pool_enum::kSumPooling: - LOG(WARNING) << "Sum pooling is not supported by cudnn, MXNet sum pooling is applied."; - break; - case pool_enum::kLpPooling: - LOG(WARNING) << "Lp pooling is not supported by cudnn, MXNet Lp pooling is applied."; - break; + req[0], outputs[0]); + return; } }); } From 6d5d8cb5ba3151d047f138166bc0808895d051d0 Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Tue, 11 Dec 2018 18:56:06 -0800 Subject: [PATCH 08/32] Add layout support to cpu implementation of Pooling, with tests. 
--- src/operator/nn/pool.cuh | 16 +- src/operator/nn/pool.h | 968 +++++++++++++++++++++++++--- src/operator/nn/pooling-inl.h | 19 +- tests/python/unittest/test_gluon.py | 165 +++-- 4 files changed, 987 insertions(+), 181 deletions(-) diff --git a/src/operator/nn/pool.cuh b/src/operator/nn/pool.cuh index 976aacf63a55..f398c10c2c47 100644 --- a/src/operator/nn/pool.cuh +++ b/src/operator/nn/pool.cuh @@ -680,8 +680,14 @@ template inline void pool(mshadow::Stream* s, const DType* in_data, const TShape& ishape, const TShape& oshape, const TShape& kernel, const TShape& pad, const TShape& stride, const int pool_type, OpReqType req_type, - DType* out_data, const bool count_include_pad) { + DType* out_data, const bool count_include_pad, const dmlc::optional &layout) { CHECK_EQ(req_type, kWriteTo) << "Only support req=kWriteTo in pooling operations"; + CHECK(!layout.has_value() || + layout.value() == mshadow::kNCW || + layout.value() == mshadow::kNCHW || + layout.value() == mshadow::kNCDHW) << + "MXNet CUDA pooling expects layout NCW, NCHW or NCDHW. Need CuDNN for support of layout " << + layout.value(); using namespace mxnet_op; if (kernel.ndim() == 1) { if (pool_enum::kMaxPooling == pool_type) { @@ -817,11 +823,17 @@ inline void unpool(mshadow::Stream* s, const DType* out_grad, const DType* const DType* out_data, const TShape& ishape, const TShape& oshape, const TShape& kernel, const TShape& pad, const TShape& stride, const int pool_type, OpReqType req_type, DType* in_grad, - const bool count_include_pad) { + const bool count_include_pad, const dmlc::optional &layout) { if (mxnet::kNullOp == req_type) return; if (mxnet::kAddTo != req_type) { mxnet_op::Kernel::Launch(s, ishape.Size(), in_grad); } + CHECK(!layout.has_value() || + layout.value() == mshadow::kNCW || + layout.value() == mshadow::kNCHW || + layout.value() == mshadow::kNCDHW) << + "MXNet CUDA pooling expects layout NCW, NCHW or NCDHW. Need CuDNN for support of layout " << + layout.value(); using namespace mxnet_op; if (kernel.ndim() == 1) { if (pool_enum::kMaxPooling == pool_type) { diff --git a/src/operator/nn/pool.h b/src/operator/nn/pool.h index 33005c8e5f0f..4bb00fa9950e 100644 --- a/src/operator/nn/pool.h +++ b/src/operator/nn/pool.h @@ -61,6 +61,7 @@ #include #include +#include #include #include "./pool_utils.h" #include "../mxnet_op.h" @@ -77,13 +78,13 @@ enum PoolingOpPadConventionType {kValid, kFull, kSame}; } // namespace pool_enum /*! - * \brief max pooling cpu function for 1-D images. + * \brief max pooling cpu function for 1-D images in 'ncw' layout. * Do not call this kernel directly. Use the interface pool(). */ template -inline void pool_max_1d_cpu(const DType* in_data, const TShape& ishape, const TShape& oshape, - const TShape& kernel, const TShape& pad, const TShape& stride, - DType* out_data) { +inline void pool_max_1d_ncw_cpu(const DType *in_data, const TShape &ishape, const TShape &oshape, + const TShape &kernel, const TShape &pad, const TShape &stride, + DType *out_data) { using mshadow::red::limits::MinValue; const int width = ishape[2]; const int pooled_width = oshape[2]; @@ -113,14 +114,53 @@ inline void pool_max_1d_cpu(const DType* in_data, const TShape& ishape, const TS } /*! - * \brief max pooling cpu function for 2-D images. + * \brief max pooling cpu function for 1-D images in 'nwc' layout. * Do not call this kernel directly. Use the interface pool(). 
*/ template -inline void pool_max_2d_cpu(const DType* in_data, const TShape& ishape, const TShape& oshape, +inline void pool_max_1d_nwc_cpu(const DType* in_data, const TShape& ishape, const TShape& oshape, const TShape& kernel, const TShape& pad, const TShape& stride, DType* out_data) { using mshadow::red::limits::MinValue; + const int width = ishape[1]; + const int pooled_width = oshape[1]; + const int kernel_w = kernel[0]; + const int pad_w = pad[0]; + const int stride_w = stride[0]; + const int features = oshape[2]; + const index_t in_data_offset = ishape[1] * features; + const index_t out_data_offset = oshape[1] * features; + std::vector max_vals(features); + for (index_t n = 0; n < oshape[0]; ++n) { + for (int pw = 0; pw < pooled_width; ++pw) { + int wstart = pw * stride_w - pad_w; + int wend = std::min(wstart + kernel_w, width); + wstart = std::max(wstart, 0); + std::fill(max_vals.begin(), max_vals.end(), MinValue()); + for (int w = wstart; w < wend; ++w) { + for (index_t c = 0; c < features; ++c) { + if (in_data[w * features + c] > max_vals[c]) { + max_vals[c] = in_data[w * features + c]; + } + } + } + for (index_t c = 0; c < features; ++c) + out_data[pw * features + c] = max_vals[c]; + } + in_data += in_data_offset; + out_data += out_data_offset; + } +} + +/*! + * \brief max pooling cpu function for 2-D images in 'nchw' layout. + * Do not call this kernel directly. Use the interface pool(). + */ +template +inline void pool_max_2d_nchw_cpu(const DType *in_data, const TShape &ishape, const TShape &oshape, + const TShape &kernel, const TShape &pad, const TShape &stride, + DType *out_data) { + using mshadow::red::limits::MinValue; const int height = ishape[2], width = ishape[3]; const int pooled_height = oshape[2], pooled_width = oshape[3]; const int kernel_h = kernel[0], kernel_w = kernel[1]; @@ -158,14 +198,62 @@ inline void pool_max_2d_cpu(const DType* in_data, const TShape& ishape, const TS } /*! - * \brief max pooling cpu function for 3-D images. + * \brief max pooling cpu function for 2-D images in 'nhwc' layout. * Do not call this kernel directly. Use the interface pool(). 
*/ template -inline void pool_max_3d_cpu(const DType* in_data, const TShape& ishape, const TShape& oshape, +inline void pool_max_2d_nhwc_cpu(const DType* in_data, const TShape& ishape, const TShape& oshape, const TShape& kernel, const TShape& pad, const TShape& stride, DType* out_data) { using mshadow::red::limits::MinValue; + const int height = ishape[1], width = ishape[2]; + const int pooled_height = oshape[1], pooled_width = oshape[2]; + const int kernel_h = kernel[0], kernel_w = kernel[1]; + const int pad_h = pad[0], pad_w = pad[1]; + const int stride_h = stride[0], stride_w = stride[1]; + const int features = oshape[3]; + const index_t in_data_offset = ishape[1] * ishape[2] * features; + const index_t out_data_offset = oshape[1] * oshape[2] * features; + std::vector max_vals(features); + for (index_t n = 0; n < oshape[0]; ++n) { + for (int ph = 0; ph < pooled_height; ++ph) { + for (int pw = 0; pw < pooled_width; ++pw) { + int hstart = ph * stride_h - pad_h; + int wstart = pw * stride_w - pad_w; + int hend = std::min(hstart + kernel_h, height); + int wend = std::min(wstart + kernel_w, width); + hstart = std::max(hstart, 0); + wstart = std::max(wstart, 0); + const int pool_index = ph * pooled_width + pw; + std::fill(max_vals.begin(), max_vals.end(), MinValue()); + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + const int in_index = h * width + w; + for (index_t c = 0; c < features; ++c) { + if (in_data[in_index * features + c] > max_vals[c]) { + max_vals[c] = in_data[in_index * features + c]; + } + } + } + } + for (index_t c = 0; c < features; ++c) + out_data[pool_index * features + c] = max_vals[c]; + } + } + in_data += in_data_offset; + out_data += out_data_offset; + } +} + +/*! + * \brief max pooling cpu function for 3-D images in 'ncdhw' layout. + * Do not call this kernel directly. Use the interface pool(). + */ +template +inline void pool_max_3d_ncdhw_cpu(const DType *in_data, const TShape &ishape, const TShape &oshape, + const TShape &kernel, const TShape &pad, const TShape &stride, + DType *out_data) { + using mshadow::red::limits::MinValue; const int depth = ishape[2], height = ishape[3], width = ishape[4]; const int pooled_depth = oshape[2], pooled_height = oshape[3], pooled_width = oshape[4]; const int kernel_d = kernel[0], kernel_h = kernel[1], kernel_w = kernel[2]; @@ -210,14 +298,69 @@ inline void pool_max_3d_cpu(const DType* in_data, const TShape& ishape, const TS } /*! - * \brief avg/sum pooling cpu function for 1-D images. + * \brief max pooling cpu function for 3-D images in 'ndhwc' layout. * Do not call this kernel directly. Use the interface pool(). 
*/ -template -inline void pool_sum_1d_cpu(const DType* in_data, const TShape& ishape, const TShape& oshape, +template +inline void pool_max_3d_ndhwc_cpu(const DType* in_data, const TShape& ishape, const TShape& oshape, const TShape& kernel, const TShape& pad, const TShape& stride, - DType* out_data, - const bool get_avg = false, const bool count_include_pad = true) { + DType* out_data) { + using mshadow::red::limits::MinValue; + const int depth = ishape[1], height = ishape[2], width = ishape[3]; + const int pooled_depth = oshape[1], pooled_height = oshape[2], pooled_width = oshape[3]; + const int kernel_d = kernel[0], kernel_h = kernel[1], kernel_w = kernel[2]; + const int pad_d = pad[0], pad_h = pad[1], pad_w = pad[2]; + const int stride_d = stride[0], stride_h = stride[1], stride_w = stride[2]; + const int features = oshape[4]; + const index_t in_data_offset = ishape[1] * ishape[2] * ishape[3] * features; + const index_t out_data_offset = oshape[1] * oshape[2] * oshape[3] * features; + std::vector max_vals(features); + for (index_t n = 0; n < oshape[0]; ++n) { + for (int pd = 0; pd < pooled_depth; ++pd) { + for (int ph = 0; ph < pooled_height; ++ph) { + for (int pw = 0; pw < pooled_width; ++pw) { + int dstart = pd * stride_d - pad_d; + int hstart = ph * stride_h - pad_h; + int wstart = pw * stride_w - pad_w; + int dend = std::min(dstart + kernel_d, depth); + int hend = std::min(hstart + kernel_h, height); + int wend = std::min(wstart + kernel_w, width); + dstart = std::max(dstart, 0); + hstart = std::max(hstart, 0); + wstart = std::max(wstart, 0); + const int pool_index = (pd * pooled_height + ph) * pooled_width + pw; + std::fill(max_vals.begin(), max_vals.end(), MinValue()); + for (int d = dstart; d < dend; ++d) { + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + const int in_index = (d * height + h) * width + w; + for (index_t c = 0; c < features; ++c) { + if (in_data[in_index * features + c] > max_vals[c]) { + max_vals[c] = in_data[in_index * features + c]; + } + } + } + } + } + for (index_t c = 0; c < features; ++c) + out_data[pool_index * features + c] = max_vals[c]; + } + } + } + in_data += in_data_offset; + out_data += out_data_offset; + } +} + +/*! + * \brief avg/sum pooling cpu function for 1-D images in 'ncw' layout. + * Do not call this kernel directly. Use the interface pool(). + */ +template +inline void pool_sum_1d_ncw_cpu(const DType *in_data, const TShape &ishape, const TShape &oshape, + const TShape &kernel, const TShape &pad, const TShape &stride, + DType *out_data, + const bool get_avg = false, const bool count_include_pad = true) { const int width = ishape[2]; const int pooled_width = oshape[2]; const int kernel_w = kernel[0]; @@ -249,14 +392,56 @@ inline void pool_sum_1d_cpu(const DType* in_data, const TShape& ishape, const TS } /*! - * \brief avg/sum pooling cpu function for 2-D images. + * \brief avg/sum pooling cpu function for 1-D images in 'nwc' layout. * Do not call this kernel directly. Use the interface pool(). 
*/ template -inline void pool_sum_2d_cpu(const DType* in_data, const TShape& ishape, const TShape& oshape, +inline void pool_sum_1d_nwc_cpu(const DType* in_data, const TShape& ishape, const TShape& oshape, const TShape& kernel, const TShape& pad, const TShape& stride, DType* out_data, const bool get_avg = false, const bool count_include_pad = true) { + const int width = ishape[1]; + const int pooled_width = oshape[1]; + const int kernel_w = kernel[0]; + const int pad_w = pad[0]; + const int stride_w = stride[0]; + const int features = oshape[2]; + const index_t in_data_offset = ishape[1] * features; + const index_t out_data_offset = oshape[1 * features]; + std::vector sums(features); + for (index_t n = 0; n < oshape[0]; ++n) { + for (int pw = 0; pw < pooled_width; ++pw) { + int wstart = pw * stride_w - pad_w; + int wend = std::min(wstart + kernel_w, width + pad_w); + int pool_size = (get_avg ? (wend - wstart) : 1); + wstart = std::max(wstart, 0); + wend = std::min(wend, width); + if (get_avg && !count_include_pad) { + pool_size = (wend - wstart); + } + std::fill(sums.begin(), sums.end(), 0); + for (int w = wstart; w < wend; ++w) { + for (index_t c = 0; c < features; ++c) { + sums[c] += a_pow_p::Map(in_data[w * features + c]) / pool_size; + } + } + for (index_t c = 0; c < features; ++c) + out_data[pw * features + c] = a_root_p::Map(sums[c]); + } + in_data += in_data_offset; + out_data += out_data_offset; + } +} + +/*! + * \brief avg/sum pooling cpu function for 2-D images in 'nchw' layout. + * Do not call this kernel directly. Use the interface pool(). + */ +template +inline void pool_sum_2d_nchw_cpu(const DType *in_data, const TShape &ishape, const TShape &oshape, + const TShape &kernel, const TShape &pad, const TShape &stride, + DType *out_data, + const bool get_avg = false, const bool count_include_pad = true) { const int height = ishape[2], width = ishape[3]; const int pooled_height = oshape[2], pooled_width = oshape[3]; const int kernel_h = kernel[0], kernel_w = kernel[1]; @@ -296,14 +481,66 @@ inline void pool_sum_2d_cpu(const DType* in_data, const TShape& ishape, const TS } /*! - * \brief avg/sum pooling cpu function for 3-D images. + * \brief avg/sum pooling cpu function for 2-D images in 'nhwc' layout. * Do not call this kernel directly. Use the interface pool(). */ template -inline void pool_sum_3d_cpu(const DType* in_data, const TShape& ishape, const TShape& oshape, +inline void pool_sum_2d_nhwc_cpu(const DType* in_data, const TShape& ishape, const TShape& oshape, const TShape& kernel, const TShape& pad, const TShape& stride, DType* out_data, const bool get_avg = false, const bool count_include_pad = true) { + const int height = ishape[1], width = ishape[2]; + const int pooled_height = oshape[1], pooled_width = oshape[2]; + const int kernel_h = kernel[0], kernel_w = kernel[1]; + const int pad_h = pad[0], pad_w = pad[1]; + const int stride_h = stride[0], stride_w = stride[1]; + const int features = oshape[3]; + const index_t in_data_offset = ishape[1] * ishape[2] * features; + const index_t out_data_offset = oshape[1] * oshape[2] * features; + std::vector sums(features); + for (index_t n = 0; n < oshape[0]; ++n) { + for (int ph = 0; ph < pooled_height; ++ph) { + for (int pw = 0; pw < pooled_width; ++pw) { + int hstart = ph * stride_h - pad_h; + int wstart = pw * stride_w - pad_w; + int hend = std::min(hstart + kernel_h, height + pad_h); + int wend = std::min(wstart + kernel_w, width + pad_w); + int pool_size = (get_avg ? 
(hend - hstart) * (wend - wstart) : 1); + hstart = std::max(hstart, 0); + wstart = std::max(wstart, 0); + hend = std::min(hend, height); + wend = std::min(wend, width); + if (get_avg && !count_include_pad) { + pool_size = (hend - hstart) * (wend - wstart); + } + const int pool_index = ph * pooled_width + pw; + std::fill(sums.begin(), sums.end(), 0); + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + const int in_index = h * width + w; + for (index_t c = 0; c < features; ++c) { + sums[c] += a_pow_p::Map(in_data[in_index * features + c]) / pool_size; + } + } + } + for (index_t c = 0; c < features; ++c) + out_data[pool_index * features + c] = a_root_p::Map(sums[c]); + } + } + in_data += in_data_offset; + out_data += out_data_offset; + } +} + +/*! + * \brief avg/sum pooling cpu function for 3-D images in 'ncdhw' layout. + * Do not call this kernel directly. Use the interface pool(). + */ +template +inline void pool_sum_3d_ncdhw_cpu(const DType *in_data, const TShape &ishape, const TShape &oshape, + const TShape &kernel, const TShape &pad, const TShape &stride, + DType *out_data, + const bool get_avg = false, const bool count_include_pad = true) { const int depth = ishape[2], height = ishape[3], width = ishape[4]; const int pooled_depth = oshape[2], pooled_height = oshape[3], pooled_width = oshape[4]; const int kernel_d = kernel[0], kernel_h = kernel[1], kernel_w = kernel[2]; @@ -353,15 +590,77 @@ inline void pool_sum_3d_cpu(const DType* in_data, const TShape& ishape, const TS } /*! - * \brief max unpooling cpu function for 1-D images. + * \brief avg/sum pooling cpu function for 3-D images in 'ndhwc' layout. + * Do not call this kernel directly. Use the interface pool(). + */ +template +inline void pool_sum_3d_ndhwc_cpu(const DType* in_data, const TShape& ishape, const TShape& oshape, + const TShape& kernel, const TShape& pad, const TShape& stride, + DType* out_data, + const bool get_avg = false, const bool count_include_pad = true) { + const int depth = ishape[1], height = ishape[2], width = ishape[3]; + const int pooled_depth = oshape[1], pooled_height = oshape[2], pooled_width = oshape[3]; + const int kernel_d = kernel[0], kernel_h = kernel[1], kernel_w = kernel[2]; + const int pad_d = pad[0], pad_h = pad[1], pad_w = pad[2]; + const int stride_d = stride[0], stride_h = stride[1], stride_w = stride[2]; + const int features = oshape[4]; + const index_t in_data_offset = ishape[1] * ishape[2] * ishape[3] * features; + const index_t out_data_offset = oshape[1] * oshape[2] * oshape[3] * features; + std::vector sums(features); + for (index_t n = 0; n < oshape[0]; ++n) { + for (int pd = 0; pd < pooled_depth; ++pd) { + for (int ph = 0; ph < pooled_height; ++ph) { + for (int pw = 0; pw < pooled_width; ++pw) { + int dstart = pd * stride_d - pad_d; + int hstart = ph * stride_h - pad_h; + int wstart = pw * stride_w - pad_w; + int dend = std::min(dstart + kernel_d, depth + pad_d); + int hend = std::min(hstart + kernel_h, height + pad_h); + int wend = std::min(wstart + kernel_w, width + pad_w); + int pool_size = (get_avg ? 
(dend - dstart) * (hend - hstart) * (wend - wstart) : 1); + dstart = std::max(dstart, 0); + hstart = std::max(hstart, 0); + wstart = std::max(wstart, 0); + dend = std::min(dend, depth); + hend = std::min(hend, height); + wend = std::min(wend, width); + if (get_avg && !count_include_pad) { + pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart); + } + const int pool_index = (pd * pooled_height + ph) * pooled_width + pw; + std::fill(sums.begin(), sums.end(), 0); + for (int d = dstart; d < dend; ++d) { + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + const int in_index = (d * height + h) * width + w; + for (index_t c = 0; c < features; ++c) { + sums[c] += a_pow_p::Map(in_data[in_index * features + c]) / pool_size; + } + } + } + } + for (index_t c = 0; c < features; ++c) + out_data[pool_index * features + c] = (pool_size == 0) ? + DType(nanf("")) : + a_root_p::Map(sums[c]); + } + } + } + in_data += in_data_offset; + out_data += out_data_offset; + } +} + +/*! + * \brief max unpooling cpu function for 1-D images in 'ncw' layout. * Do not call this kernel directly. Use the interface unpool(). */ template -inline void unpool_max_1d_cpu(const DType* out_grad, const DType* in_data, - const DType* out_data, const TShape& ishape, - const TShape& oshape, const TShape& kernel, - const TShape& pad, const TShape& stride, - DType* in_grad) { +inline void unpool_max_1d_ncw_cpu(const DType *out_grad, const DType *in_data, + const DType *out_data, const TShape &ishape, + const TShape &oshape, const TShape &kernel, + const TShape &pad, const TShape &stride, + DType *in_grad) { const int width = ishape[2]; const int pooled_width = oshape[2]; const int kernel_w = kernel[0]; @@ -397,15 +696,63 @@ inline void unpool_max_1d_cpu(const DType* out_grad, const DType* in_data, } /*! - * \brief max unpooling cpu function for 2-D images. + * \brief max unpooling cpu function for 1-D images in 'nwc' layout. * Do not call this kernel directly. Use the interface unpool(). */ template -inline void unpool_max_2d_cpu(const DType* out_grad, const DType* in_data, +inline void unpool_max_1d_nwc_cpu(const DType* out_grad, const DType* in_data, const DType* out_data, const TShape& ishape, const TShape& oshape, const TShape& kernel, const TShape& pad, const TShape& stride, DType* in_grad) { + const int width = ishape[1]; + const int pooled_width = oshape[1]; + const int kernel_w = kernel[0]; + const int pad_w = pad[0]; + const int stride_w = stride[0]; + const int features = oshape[2]; + const index_t in_offset = ishape[1] * features; + const index_t out_offset = oshape[1] * features; + std::vector max_idxs(features); + for (index_t n = 0; n < oshape[0]; ++n) { + for (int pw = 0; pw < pooled_width; ++pw) { + int wstart = pw * stride_w - pad_w; + int wend = std::min(wstart + kernel_w, width); + wstart = std::max(wstart, 0); + std::fill(max_idxs.begin(), max_idxs.end(), -1); + for (index_t c = 0; c < features; ++c) { + for (int w = wstart; w < wend; ++w) { + if (in_data[w * features + c] == out_data[pw * features + c]) { + max_idxs[c] = w; + break; + } + } + } + // In the case where pad > 0 and kernel = 1, for example, + // max_idx can be -1 reaching this step. + for (index_t c = 0; c < features; ++c) { + if (max_idxs[c] >= 0) { + in_grad[max_idxs[c] * features + c] += out_grad[pw * features + c]; + } + } + } + in_data += in_offset; + in_grad += in_offset; + out_data += out_offset; + out_grad += out_offset; + } +} + +/*! + * \brief max unpooling cpu function for 2-D images in 'nchw' layout. 
+ * Do not call this kernel directly. Use the interface unpool(). + */ +template +inline void unpool_max_2d_nchw_cpu(const DType *out_grad, const DType *in_data, + const DType *out_data, const TShape &ishape, + const TShape &oshape, const TShape &kernel, + const TShape &pad, const TShape &stride, + DType *in_grad) { const int height = ishape[2], width = ishape[3]; const int pooled_height = oshape[2], pooled_width = oshape[3]; const int kernel_h = kernel[0], kernel_w = kernel[1]; @@ -453,15 +800,75 @@ inline void unpool_max_2d_cpu(const DType* out_grad, const DType* in_data, } /*! - * \brief max unpooling cpu function for 3-D images. + * \brief max unpooling cpu function for 2-D images in 'nhwc' layout. * Do not call this kernel directly. Use the interface unpool(). */ template -inline void unpool_max_3d_cpu(const DType* out_grad, const DType* in_data, +inline void unpool_max_2d_nhwc_cpu(const DType* out_grad, const DType* in_data, const DType* out_data, const TShape& ishape, const TShape& oshape, const TShape& kernel, const TShape& pad, const TShape& stride, DType* in_grad) { + const int height = ishape[1], width = ishape[2]; + const int pooled_height = oshape[1], pooled_width = oshape[2]; + const int kernel_h = kernel[0], kernel_w = kernel[1]; + const int pad_h = pad[0], pad_w = pad[1]; + const int stride_h = stride[0], stride_w = stride[1]; + const int features = oshape[3]; + const index_t in_offset = ishape[1] * ishape[2] * features; + const index_t out_offset = oshape[1] * oshape[2] * features; + std::vector max_idxs(features); + for (index_t n = 0; n < oshape[0]; ++n) { + for (int ph = 0; ph < pooled_height; ++ph) { + for (int pw = 0; pw < pooled_width; ++pw) { + int hstart = ph * stride_h - pad_h; + int wstart = pw * stride_w - pad_w; + int hend = std::min(hstart + kernel_h, height); + int wend = std::min(wstart + kernel_w, width); + hstart = std::max(hstart, 0); + wstart = std::max(wstart, 0); + const int pool_index = ph * pooled_width + pw; + std::fill(max_idxs.begin(), max_idxs.end(), -1); + for (index_t c = 0; c < features; ++c) { + bool found = false; + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + const int idx = h * width + w; + if (in_data[idx * features + c] == out_data[pool_index * features + c]) { + max_idxs[c] = idx; + found = true; + break; + } + } + if (found) break; + } + } + // In the case where pad > 0 and kernel = 1, for example, + // max_idx can be -1 reaching this step. + for (index_t c = 0; c < features; ++c) { + if (max_idxs[c] >= 0) { + in_grad[max_idxs[c] * features + c] += out_grad[pool_index * features + c]; + } + } + } + } + in_data += in_offset; + in_grad += in_offset; + out_data += out_offset; + out_grad += out_offset; + } +} + +/*! + * \brief max unpooling cpu function for 3-D images in 'ncdhw' layout. + * Do not call this kernel directly. Use the interface unpool(). + */ +template +inline void unpool_max_3d_ncdhw_cpu(const DType *out_grad, const DType *in_data, + const DType *out_data, const TShape &ishape, + const TShape &oshape, const TShape &kernel, + const TShape &pad, const TShape &stride, + DType *in_grad) { const int depth = ishape[2], height = ishape[3], width = ishape[4]; const int pooled_depth = oshape[2], pooled_height = oshape[3], pooled_width = oshape[4]; const int kernel_d = kernel[0], kernel_h = kernel[1], kernel_w = kernel[2]; @@ -517,14 +924,83 @@ inline void unpool_max_3d_cpu(const DType* out_grad, const DType* in_data, } /*! - * \brief avg/sum unpooling cpu function for 1-D images. 
+ * \brief max unpooling cpu function for 3-D images in 'ndhwc' layout. + * Do not call this kernel directly. Use the interface unpool(). + */ +template +inline void unpool_max_3d_ndhwc_cpu(const DType* out_grad, const DType* in_data, + const DType* out_data, const TShape& ishape, + const TShape& oshape, const TShape& kernel, + const TShape& pad, const TShape& stride, + DType* in_grad) { + const int depth = ishape[1], height = ishape[2], width = ishape[3]; + const int pooled_depth = oshape[1], pooled_height = oshape[2], pooled_width = oshape[3]; + const int kernel_d = kernel[0], kernel_h = kernel[1], kernel_w = kernel[2]; + const int pad_d = pad[0], pad_h = pad[1], pad_w = pad[2]; + const int stride_d = stride[0], stride_h = stride[1], stride_w = stride[2]; + const int features = oshape[4]; + const index_t in_offset = ishape[1] * ishape[2] * ishape[3] * features; + const index_t out_offset = oshape[1] * oshape[2] * oshape[3] * features; + std::vector max_idxs(features); + for (index_t n = 0; n < oshape[0]; ++n) { + for (int pd = 0; pd < pooled_depth; ++pd) { + for (int ph = 0; ph < pooled_height; ++ph) { + for (int pw = 0; pw < pooled_width; ++pw) { + int dstart = pd * stride_d - pad_d; + int hstart = ph * stride_h - pad_h; + int wstart = pw * stride_w - pad_w; + int dend = std::min(dstart + kernel_d, depth); + int hend = std::min(hstart + kernel_h, height); + int wend = std::min(wstart + kernel_w, width); + dstart = std::max(dstart, 0); + hstart = std::max(hstart, 0); + wstart = std::max(wstart, 0); + const int pool_index = (pd * pooled_height + ph) * pooled_width + pw; + std::fill(max_idxs.begin(), max_idxs.end(), -1); + for (index_t c = 0; c < features; ++c) { + bool found = false; + for (int d = dstart; d < dend; ++d) { + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + const int idx = (d * height + h) * width + w; + if (in_data[idx * features + c] == out_data[pool_index * features + c]) { + max_idxs[c] = idx; + found = true; + break; + } + } + if (found) break; + } + if (found) break; + } + } + // In the case where pad > 0 and kernel = 1, for example, + // max_idx can be -1 reaching this step. + for (index_t c = 0; c < features; ++c) { + if (max_idxs[c] >= 0) { + in_grad[max_idxs[c] * features + c] += out_grad[pool_index * features + c]; + } + } + } + } + } + in_data += in_offset; + in_grad += in_offset; + out_data += out_offset; + out_grad += out_offset; + } +} + +/*! + * \brief avg/sum unpooling cpu function for 1-D images in 'ncw' layout. * Do not call this kernel directly. Use the interface unpool(). */ template -inline void unpool_sum_1d_cpu(const DType* out_grad, const DType* in_data, const DType* out_data, - const TShape& ishape, const TShape& oshape, const TShape& kernel, - const TShape& pad, const TShape& stride, DType* in_grad, - const bool is_avg = false, const bool count_include_pad = true) { +inline void unpool_sum_1d_ncw_cpu(const DType *out_grad, const DType *in_data, + const DType *out_data, + const TShape &ishape, const TShape &oshape, const TShape &kernel, + const TShape &pad, const TShape &stride, DType *in_grad, + const bool is_avg = false, const bool count_include_pad = true) { const int width = ishape[2]; const int pooled_width = oshape[2]; const int kernel_w = kernel[0]; @@ -556,14 +1032,58 @@ inline void unpool_sum_1d_cpu(const DType* out_grad, const DType* in_data, const } /*! - * \brief avg/sum unpooling cpu function for 2-D images. + * \brief avg/sum unpooling cpu function for 1-D images in 'nwc' layout. 
* Do not call this kernel directly. Use the interface unpool(). */ template -inline void unpool_sum_2d_cpu(const DType* out_grad, const DType* in_data, const DType* out_data, +inline void unpool_sum_1d_nwc_cpu(const DType* out_grad, const DType* in_data, const DType* out_data, const TShape& ishape, const TShape& oshape, const TShape& kernel, const TShape& pad, const TShape& stride, DType* in_grad, const bool is_avg = false, const bool count_include_pad = true) { + const int width = ishape[1]; + const int pooled_width = oshape[1]; + const int kernel_w = kernel[0]; + const int pad_w = pad[0]; + const int stride_w = stride[0]; + const int features = oshape[2]; + const index_t in_grad_offset = ishape[1] * features; + const index_t out_grad_offset = oshape[1] * features; + for (index_t n = 0; n < oshape[0]; ++n) { + for (int pw = 0; pw < pooled_width; ++pw) { + int wstart = pw * stride_w - pad_w; + int wend = std::min(wstart + kernel_w, width + pad_w); + int pool_size = (is_avg ? (wend - wstart) : 1); + wstart = std::max(wstart, 0); + wend = std::min(wend, width); + if (is_avg && !count_include_pad) { + pool_size = (wend - wstart); + } + for (int w = wstart; w < wend; ++w) { + for (index_t c = 0; c < features; ++c) { + in_grad[w * features + c] += + lp_grad::Map(out_grad[pw * features + c], + in_data[w * features + c], + out_data[pw * features + c]) / pool_size; + } + } + } + in_grad += in_grad_offset; + in_data += in_grad_offset; + out_grad += out_grad_offset; + out_data += out_grad_offset; + } +} + +/*! + * \brief avg/sum unpooling cpu function for 2-D images in 'nchw' layout. + * Do not call this kernel directly. Use the interface unpool(). + */ +template +inline void unpool_sum_2d_nchw_cpu(const DType *out_grad, const DType *in_data, + const DType *out_data, + const TShape &ishape, const TShape &oshape, const TShape &kernel, + const TShape &pad, const TShape &stride, DType *in_grad, + const bool is_avg = false, const bool count_include_pad = true) { const int height = ishape[2], width = ishape[3]; const int pooled_height = oshape[2], pooled_width = oshape[3]; const int kernel_h = kernel[0], kernel_w = kernel[1]; @@ -607,14 +1127,69 @@ inline void unpool_sum_2d_cpu(const DType* out_grad, const DType* in_data, const } /*! - * \brief avg/sum unpooling cpu function for 3-D images. + * \brief avg/sum unpooling cpu function for 2-D images in 'nhwc' layout. * Do not call this kernel directly. Use the interface unpool(). 
*/ template -inline void unpool_sum_3d_cpu(const DType* out_grad, const DType* in_data, const DType* out_data, +inline void unpool_sum_2d_nhwc_cpu(const DType* out_grad, const DType* in_data, const DType* out_data, const TShape& ishape, const TShape& oshape, const TShape& kernel, const TShape& pad, const TShape& stride, DType* in_grad, const bool is_avg = false, const bool count_include_pad = true) { + const int height = ishape[1], width = ishape[2]; + const int pooled_height = oshape[1], pooled_width = oshape[2]; + const int kernel_h = kernel[0], kernel_w = kernel[1]; + const int pad_h = pad[0], pad_w = pad[1]; + const int features = oshape[3]; + const int stride_h = stride[0], stride_w = stride[1]; + const index_t in_grad_offset = ishape[1] * ishape[2] * features; + const index_t out_grad_offset = oshape[1] * oshape[2] * features; + for (index_t n = 0; n < oshape[0]; ++n) { + for (int ph = 0; ph < pooled_height; ++ph) { + for (int pw = 0; pw < pooled_width; ++pw) { + int hstart = ph * stride_h - pad_h; + int wstart = pw * stride_w - pad_w; + int hend = std::min(hstart + kernel_h, height + pad_h); + int wend = std::min(wstart + kernel_w, width + pad_w); + int pool_size = (is_avg ? (hend - hstart) * (wend - wstart) : 1); + hstart = std::max(hstart, 0); + wstart = std::max(wstart, 0); + hend = std::min(hend, height); + wend = std::min(wend, width); + if (is_avg && !count_include_pad) { + pool_size = (hend - hstart) * (wend - wstart); + } + const int pool_index = ph * pooled_width + pw; + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + const int in_index = h * width + w; + for (index_t c = 0; c < features; ++c) { + in_grad[in_index * features + c] += + lp_grad::Map(out_grad[pool_index * features + c], + in_data[in_index * features + c], + out_data[pool_index * features + c]) / pool_size; + } + } + } + } + } + in_grad += in_grad_offset; + in_data += in_grad_offset; + out_grad += out_grad_offset; + out_data += out_grad_offset; + } +} + +/*! + * \brief avg/sum unpooling cpu function for 3-D images in 'ncdhw' layout. + * Do not call this kernel directly. Use the interface unpool(). + */ +template +inline void unpool_sum_3d_ncdhw_cpu(const DType *out_grad, const DType *in_data, + const DType *out_data, + const TShape &ishape, const TShape &oshape, + const TShape &kernel, + const TShape &pad, const TShape &stride, DType *in_grad, + const bool is_avg = false, const bool count_include_pad = true) { const int depth = ishape[2], height = ishape[3], width = ishape[4]; const int pooled_depth = oshape[2], pooled_height = oshape[3], pooled_width = oshape[4]; const int kernel_d = kernel[0], kernel_h = kernel[1], kernel_w = kernel[2]; @@ -665,6 +1240,67 @@ inline void unpool_sum_3d_cpu(const DType* out_grad, const DType* in_data, const } } +/*! + * \brief avg/sum unpooling cpu function for 3-D images in 'ndhwc' layout. + * Do not call this kernel directly. Use the interface unpool(). 
+ */ +template +inline void unpool_sum_3d_ndhwc_cpu(const DType* out_grad, const DType* in_data, const DType* out_data, + const TShape& ishape, const TShape& oshape, const TShape& kernel, + const TShape& pad, const TShape& stride, DType* in_grad, + const bool is_avg = false, const bool count_include_pad = true) { + const int depth = ishape[1], height = ishape[2], width = ishape[3]; + const int pooled_depth = oshape[1], pooled_height = oshape[2], pooled_width = oshape[3]; + const int kernel_d = kernel[0], kernel_h = kernel[1], kernel_w = kernel[2]; + const int pad_d = pad[0], pad_h = pad[1], pad_w = pad[2]; + const int stride_d = stride[0], stride_h = stride[1], stride_w = stride[2]; + const int features = oshape[4]; + const index_t in_grad_offset = ishape[1] * ishape[2] * ishape[3] * features; + const index_t out_grad_offset = oshape[1] * oshape[2] * oshape[3] * features; + for (index_t n = 0; n < oshape[0]; ++n) { + for (int pd = 0; pd < pooled_depth; ++pd) { + for (int ph = 0; ph < pooled_height; ++ph) { + for (int pw = 0; pw < pooled_width; ++pw) { + int dstart = pd * stride_d - pad_d; + int hstart = ph * stride_h - pad_h; + int wstart = pw * stride_w - pad_w; + int dend = std::min(dstart + kernel_d, depth + pad_d); + int hend = std::min(hstart + kernel_h, height + pad_h); + int wend = std::min(wstart + kernel_w, width + pad_w); + int pool_size = (is_avg ? (dend - dstart) * (hend - hstart) * (wend - wstart) : 1); + dstart = std::max(dstart, 0); + hstart = std::max(hstart, 0); + wstart = std::max(wstart, 0); + dend = std::min(dend, depth); + hend = std::min(hend, height); + wend = std::min(wend, width); + if (is_avg && !count_include_pad) { + pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart); + } + const int pool_index = (pd * pooled_height + ph) * pooled_width + pw; + for (int d = dstart; d < dend; ++d) { + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + const int in_index = (d * height + h) * width + w; + for (index_t c = 0; c < features; ++c) { + in_grad[in_index * features + c] += + lp_grad::Map(out_grad[pool_index * features + c], + in_data[in_index * features + c], + out_data[pool_index * features + c]) / pool_size; + } + } + } + } + } + } + } + in_grad += in_grad_offset; + in_data += in_grad_offset; + out_grad += out_grad_offset; + out_data += out_grad_offset; + } +} + /*! * \brief This function serves as an interface for 1/2/3-D pooling operations. 
* \param s context stream defining the device in use is cpu @@ -683,46 +1319,97 @@ template inline void pool(mshadow::Stream* s, const DType* in_data, const TShape& ishape, const TShape& oshape, const TShape& kernel, const TShape& pad, const TShape& stride, const int pool_type, OpReqType req_type, - DType* out_data, const bool count_include_pad) { + DType* out_data, const bool count_include_pad, const dmlc::optional &layout) { CHECK_EQ(req_type, kWriteTo) << "Only support req=kWriteTo in pooling operations"; if (kernel.ndim() == 1) { - if (pool_enum::kMaxPooling == pool_type) { - pool_max_1d_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data); - } else if (pool_enum::kAvgPooling == pool_type) { - pool_sum_1d_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data, - true, count_include_pad); - } else if (pool_enum::kSumPooling == pool_type) { - pool_sum_1d_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data); - } else if (pool_enum::kLpPooling == pool_type) { - pool_sum_1d_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data); + if (layout.has_value() && layout.value() == mshadow::kNWC) { + if (pool_enum::kMaxPooling == pool_type) { + pool_max_1d_nwc_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data); + } else if (pool_enum::kAvgPooling == pool_type) { + pool_sum_1d_nwc_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data, + true, count_include_pad); + } else if (pool_enum::kSumPooling == pool_type) { + pool_sum_1d_nwc_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data); + } else if (pool_enum::kLpPooling == pool_type) { + pool_sum_1d_nwc_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data); + } else { + LOG(FATAL) << "Unknown pooling type " << pool_type; + } + } else if (!layout.has_value() || layout.value() == mshadow::kNCW) { + if (pool_enum::kMaxPooling == pool_type) { + pool_max_1d_ncw_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data); + } else if (pool_enum::kAvgPooling == pool_type) { + pool_sum_1d_ncw_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data, + true, count_include_pad); + } else if (pool_enum::kSumPooling == pool_type) { + pool_sum_1d_ncw_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data); + } else if (pool_enum::kLpPooling == pool_type) { + pool_sum_1d_ncw_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data); + } else { + LOG(FATAL) << "Unknown pooling type " << pool_type; + } } else { - LOG(FATAL) << "Unknown pooling type " << pool_type; + LOG(FATAL) << "Unsupported layout, expecting kNCW or kNWC, saw: " << layout.value(); } } else if (kernel.ndim() == 2) { - if (pool_enum::kMaxPooling == pool_type) { - pool_max_2d_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data); - } else if (pool_enum::kAvgPooling == pool_type) { - pool_sum_2d_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data, - true, count_include_pad); - } else if (pool_enum::kSumPooling == pool_type) { - pool_sum_2d_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data); - } else if (pool_enum::kLpPooling == pool_type) { - pool_sum_2d_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data); + if (layout.has_value() && layout.value() == mshadow::kNHWC) { + if (pool_enum::kMaxPooling == pool_type) { + pool_max_2d_nhwc_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data); + } else if (pool_enum::kAvgPooling == pool_type) { + pool_sum_2d_nhwc_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data, + true, count_include_pad); + } else if (pool_enum::kSumPooling == pool_type) { + 
pool_sum_2d_nhwc_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data); + } else if (pool_enum::kLpPooling == pool_type) { + pool_sum_2d_nhwc_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data); + } else { + LOG(FATAL) << "Unknown pooling type " << pool_type; + } + } else if (!layout.has_value() || layout.value() == mshadow::kNCHW) { + if (pool_enum::kMaxPooling == pool_type) { + pool_max_2d_nchw_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data); + } else if (pool_enum::kAvgPooling == pool_type) { + pool_sum_2d_nchw_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data, + true, count_include_pad); + } else if (pool_enum::kSumPooling == pool_type) { + pool_sum_2d_nchw_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data); + } else if (pool_enum::kLpPooling == pool_type) { + pool_sum_2d_nchw_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data); + } else { + LOG(FATAL) << "Unknown pooling type " << pool_type; + } } else { - LOG(FATAL) << "Unknown pooling type " << pool_type; + LOG(FATAL) << "Unsupported layout, expecting kNCHW or kNHWC, saw: " << layout.value(); } } else if (kernel.ndim() == 3) { - if (pool_enum::kMaxPooling == pool_type) { - pool_max_3d_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data); - } else if (pool_enum::kAvgPooling == pool_type) { - pool_sum_3d_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data, - true, count_include_pad); - } else if (pool_enum::kSumPooling == pool_type) { - pool_sum_3d_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data); - } else if (pool_enum::kLpPooling == pool_type) { - pool_sum_3d_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data); + if (layout.has_value() && layout.value() == mshadow::kNDHWC) { + if (pool_enum::kMaxPooling == pool_type) { + pool_max_3d_ndhwc_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data); + } else if (pool_enum::kAvgPooling == pool_type) { + pool_sum_3d_ndhwc_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data, + true, count_include_pad); + } else if (pool_enum::kSumPooling == pool_type) { + pool_sum_3d_ndhwc_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data); + } else if (pool_enum::kLpPooling == pool_type) { + pool_sum_3d_ndhwc_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data); + } else { + LOG(FATAL) << "Unknown pooling type " << pool_type; + } + } else if (!layout.has_value() || layout.value() == mshadow::kNCDHW) { + if (pool_enum::kMaxPooling == pool_type) { + pool_max_3d_ncdhw_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data); + } else if (pool_enum::kAvgPooling == pool_type) { + pool_sum_3d_ncdhw_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data, + true, count_include_pad); + } else if (pool_enum::kSumPooling == pool_type) { + pool_sum_3d_ncdhw_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data); + } else if (pool_enum::kLpPooling == pool_type) { + pool_sum_3d_ncdhw_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data); + } else { + LOG(FATAL) << "Unknown pooling type " << pool_type; + } } else { - LOG(FATAL) << "Unknown pooling type " << pool_type; + LOG(FATAL) << "Unsupported layout, expecting kNCDHW or kNDHWC, saw: " << layout.value(); } } else { LOG(FATAL) << "Unsupported " << kernel.ndim() << "-D pooling"; @@ -750,54 +1437,131 @@ inline void unpool(mshadow::Stream* s, const DType* out_grad, const DType* const DType* out_data, const TShape& ishape, const TShape& oshape, const TShape& kernel, const TShape& pad, const TShape& stride, const int pool_type, OpReqType 
req_type, DType* in_grad, - const bool count_include_pad) { + const bool count_include_pad, const dmlc::optional &layout) { if (mxnet::kNullOp == req_type) return; if (mxnet::kAddTo != req_type) { mxnet_op::Kernel::Launch(s, ishape.Size(), in_grad); } if (kernel.ndim() == 1) { - if (pool_enum::kMaxPooling == pool_type) { - unpool_max_1d_cpu(out_grad, in_data, out_data, ishape, oshape, kernel, pad, stride, in_grad); - } else if (pool_enum::kAvgPooling == pool_type) { - unpool_sum_1d_cpu(out_grad, in_data, out_data, ishape, oshape, kernel, pad, stride, in_grad, - true, count_include_pad); - } else if (pool_enum::kSumPooling == pool_type) { - unpool_sum_1d_cpu(out_grad, in_data, out_data, ishape, oshape, kernel, pad, stride, in_grad); - } else if (pool_enum::kLpPooling == pool_type) { - unpool_sum_1d_cpu(out_grad, in_data, out_data, ishape, oshape, kernel, pad, stride, - in_grad); + if (layout.has_value() && layout.value() == mshadow::kNWC) { + if (pool_enum::kMaxPooling == pool_type) { + unpool_max_1d_nwc_cpu(out_grad, in_data, out_data, ishape, oshape, kernel, pad, stride, + in_grad); + } else if (pool_enum::kAvgPooling == pool_type) { + unpool_sum_1d_nwc_cpu(out_grad, in_data, out_data, ishape, oshape, kernel, pad, stride, in_grad, + true, count_include_pad); + } else if (pool_enum::kSumPooling == pool_type) { + unpool_sum_1d_nwc_cpu(out_grad, in_data, out_data, ishape, oshape, kernel, pad, stride, + in_grad); + } else if (pool_enum::kLpPooling == pool_type) { + unpool_sum_1d_nwc_cpu(out_grad, in_data, out_data, ishape, oshape, kernel, pad, + stride, + in_grad); + } else { + LOG(FATAL) << "Unknown pooling type " << pool_type; + } + } else if (!layout.has_value() || layout.value() == mshadow::kNCW) { + if (pool_enum::kMaxPooling == pool_type) { + unpool_max_1d_ncw_cpu(out_grad, in_data, out_data, ishape, oshape, kernel, pad, stride, + in_grad); + } else if (pool_enum::kAvgPooling == pool_type) { + unpool_sum_1d_ncw_cpu(out_grad, in_data, out_data, ishape, oshape, kernel, pad, stride, + in_grad, + true, count_include_pad); + } else if (pool_enum::kSumPooling == pool_type) { + unpool_sum_1d_ncw_cpu(out_grad, in_data, out_data, ishape, oshape, kernel, pad, stride, + in_grad); + } else if (pool_enum::kLpPooling == pool_type) { + unpool_sum_1d_ncw_cpu(out_grad, in_data, out_data, ishape, oshape, kernel, pad, + stride, + in_grad); + } else { + LOG(FATAL) << "Unknown pooling type " << pool_type; + } } else { - LOG(FATAL) << "Unknown pooling type " << pool_type; + LOG(FATAL) << "Unsupported layout, expecting kNCW or kNWC, saw: " << layout.value(); } } else if (kernel.ndim() == 2) { - if (pool_enum::kMaxPooling == pool_type) { - unpool_max_2d_cpu(out_grad, in_data, out_data, ishape, oshape, kernel, pad, stride, in_grad); - } else if (pool_enum::kAvgPooling == pool_type) { - unpool_sum_2d_cpu(out_grad, in_data, out_data, ishape, oshape, kernel, pad, stride, in_grad, - true, count_include_pad); - } else if (pool_enum::kSumPooling == pool_type) { - unpool_sum_2d_cpu(out_grad, in_data, out_data, ishape, oshape, kernel, pad, stride, in_grad); - } else if (pool_enum::kLpPooling == pool_type) { - unpool_sum_2d_cpu(out_grad, in_data, out_data, ishape, oshape, kernel, pad, stride, - in_grad); + if (layout.has_value() && layout.value() == mshadow::kNHWC) { + if (pool_enum::kMaxPooling == pool_type) { + unpool_max_2d_nhwc_cpu(out_grad, in_data, out_data, ishape, oshape, kernel, pad, stride, + in_grad); + } else if (pool_enum::kAvgPooling == pool_type) { + unpool_sum_2d_nhwc_cpu(out_grad, in_data, out_data, 
ishape, oshape, kernel, pad, stride, + in_grad, + true, count_include_pad); + } else if (pool_enum::kSumPooling == pool_type) { + unpool_sum_2d_nhwc_cpu(out_grad, in_data, out_data, ishape, oshape, kernel, pad, stride, + in_grad); + } else if (pool_enum::kLpPooling == pool_type) { + unpool_sum_2d_nhwc_cpu(out_grad, in_data, out_data, ishape, oshape, kernel, pad, + stride, + in_grad); + } else { + LOG(FATAL) << "Unknown pooling type " << pool_type; + } + } else if (!layout.has_value() || layout.value() == mshadow::kNCHW) { + if (pool_enum::kMaxPooling == pool_type) { + unpool_max_2d_nchw_cpu(out_grad, in_data, out_data, ishape, oshape, kernel, pad, stride, + in_grad); + } else if (pool_enum::kAvgPooling == pool_type) { + unpool_sum_2d_nchw_cpu(out_grad, in_data, out_data, ishape, oshape, kernel, pad, stride, + in_grad, + true, count_include_pad); + } else if (pool_enum::kSumPooling == pool_type) { + unpool_sum_2d_nchw_cpu(out_grad, in_data, out_data, ishape, oshape, kernel, pad, stride, + in_grad); + } else if (pool_enum::kLpPooling == pool_type) { + unpool_sum_2d_nchw_cpu(out_grad, in_data, out_data, ishape, oshape, kernel, pad, + stride, + in_grad); + } else { + LOG(FATAL) << "Unknown pooling type " << pool_type; + } } else { - LOG(FATAL) << "Unknown pooling type " << pool_type; + LOG(FATAL) << "Unsupported layout, expecting kNCHW or kNHWC, saw: " << layout.value(); } } else if (kernel.ndim() == 3) { - if (pool_enum::kMaxPooling == pool_type) { - unpool_max_3d_cpu(out_grad, in_data, out_data, ishape, oshape, kernel, pad, stride, in_grad); - } else if (pool_enum::kAvgPooling == pool_type) { - unpool_sum_3d_cpu(out_grad, in_data, out_data, ishape, oshape, kernel, pad, stride, in_grad, - true, count_include_pad); - } else if (pool_enum::kSumPooling == pool_type) { - unpool_sum_3d_cpu(out_grad, in_data, out_data, ishape, oshape, kernel, pad, stride, in_grad); - } else if (pool_enum::kLpPooling == pool_type) { - unpool_sum_3d_cpu(out_grad, in_data, out_data, ishape, oshape, kernel, pad, stride, - in_grad); + if (layout.has_value() && layout.value() == mshadow::kNDHWC) { + if (pool_enum::kMaxPooling == pool_type) { + unpool_max_3d_ndhwc_cpu(out_grad, in_data, out_data, ishape, oshape, kernel, pad, stride, + in_grad); + } else if (pool_enum::kAvgPooling == pool_type) { + unpool_sum_3d_ndhwc_cpu(out_grad, in_data, out_data, ishape, oshape, kernel, pad, stride, in_grad, + true, count_include_pad); + } else if (pool_enum::kSumPooling == pool_type) { + unpool_sum_3d_ndhwc_cpu(out_grad, in_data, out_data, ishape, oshape, kernel, pad, stride, + in_grad); + } else if (pool_enum::kLpPooling == pool_type) { + unpool_sum_3d_ndhwc_cpu(out_grad, in_data, out_data, ishape, oshape, kernel, pad, + stride, + in_grad); + } else { + LOG(FATAL) << "Unknown pooling type " << pool_type; + } + } else if (!layout.has_value() || layout.value() == mshadow::kNCDHW) { + if (pool_enum::kMaxPooling == pool_type) { + unpool_max_3d_ncdhw_cpu(out_grad, in_data, out_data, ishape, oshape, kernel, pad, stride, + in_grad); + } else if (pool_enum::kAvgPooling == pool_type) { + unpool_sum_3d_ncdhw_cpu(out_grad, in_data, out_data, ishape, oshape, kernel, pad, stride, + in_grad, + true, count_include_pad); + } else if (pool_enum::kSumPooling == pool_type) { + unpool_sum_3d_ncdhw_cpu(out_grad, in_data, out_data, ishape, oshape, kernel, pad, stride, + in_grad); + } else if (pool_enum::kLpPooling == pool_type) { + unpool_sum_3d_ncdhw_cpu(out_grad, in_data, out_data, ishape, oshape, kernel, pad, + stride, + in_grad); + } else { + 
LOG(FATAL) << "Unknown pooling type " << pool_type; + } } else { - LOG(FATAL) << "Unknown pooling type " << pool_type; + LOG(FATAL) << "Unsupported layout, expecting kNCDHW or kNDHWC, saw: " << layout.value(); } - } else { + } + else { LOG(FATAL) << "Unsupported " << kernel.ndim() << "-D unpooling"; } } diff --git a/src/operator/nn/pooling-inl.h b/src/operator/nn/pooling-inl.h index 7c721907ee89..d48f1e38975c 100644 --- a/src/operator/nn/pooling-inl.h +++ b/src/operator/nn/pooling-inl.h @@ -98,6 +98,7 @@ struct PoolingParam : public dmlc::Parameter { .add_enum("NCW", mshadow::kNCW) .add_enum("NCHW", mshadow::kNCHW) .add_enum("NCDHW", mshadow::kNCDHW) + .add_enum("NWC", mshadow::kNWC) .add_enum("NHWC", mshadow::kNHWC) .add_enum("NDHWC", mshadow::kNDHWC) .set_default(dmlc::optional()) @@ -162,9 +163,6 @@ class PoolingOp { void Forward(const OpContext& ctx, const TBlob& in_data, const OpReqType& req, const TBlob& out_data) { using namespace mshadow; - CHECK(param_.layout.value() == kNCW || - param_.layout.value() == kNCHW || - param_.layout.value() == kNCDHW) << "Need CuDNN for layout support"; Stream *s = ctx.get_stream(); const TShape& ishape = in_data.shape_; TShape kernel = param_.kernel; @@ -189,21 +187,21 @@ class PoolingOp { kernel, padding, stride, - param_.pool_type, req, out_data.dptr(), count_include_pad); + param_.pool_type, req, out_data.dptr(), count_include_pad, param_.layout); break; case 2: pool(s, in_data.dptr(), in_data.shape_, out_data.shape_, kernel, padding, stride, - param_.pool_type, req, out_data.dptr(), count_include_pad); + param_.pool_type, req, out_data.dptr(), count_include_pad, param_.layout); break; case 3: pool(s, in_data.dptr(), in_data.shape_, out_data.shape_, kernel, padding, stride, - param_.pool_type, req, out_data.dptr(), count_include_pad); + param_.pool_type, req, out_data.dptr(), count_include_pad, param_.layout); break; default: LOG(FATAL) << "p value of " << p_value << " is not supported yet..."; @@ -214,9 +212,6 @@ class PoolingOp { const TBlob& in_data, const TBlob& out_data, const OpReqType& req, const TBlob& in_grad) { using namespace mshadow; - CHECK(param_.layout.value() == kNCW || - param_.layout.value() == kNCHW || - param_.layout.value() == kNCDHW) << "Need CuDNN for layout support"; Stream *s = ctx.get_stream(); const TShape& ishape = in_data.shape_; TShape kernel = param_.kernel; @@ -243,7 +238,7 @@ class PoolingOp { kernel, padding, stride, - param_.pool_type, req, in_grad.dptr(), count_include_pad); + param_.pool_type, req, in_grad.dptr(), count_include_pad, param_.layout); break; case 2: unpool(s, out_grad.dptr(), in_data.dptr(), out_data.dptr(), @@ -251,7 +246,7 @@ class PoolingOp { kernel, padding, stride, - param_.pool_type, req, in_grad.dptr(), count_include_pad); + param_.pool_type, req, in_grad.dptr(), count_include_pad, param_.layout); break; case 3: unpool(s, out_grad.dptr(), in_data.dptr(), out_data.dptr(), @@ -259,7 +254,7 @@ class PoolingOp { kernel, padding, stride, - param_.pool_type, req, in_grad.dptr(), count_include_pad); + param_.pool_type, req, in_grad.dptr(), count_include_pad, param_.layout); break; default: LOG(FATAL) << "p value of " << p_value << " is not supported yet..."; diff --git a/tests/python/unittest/test_gluon.py b/tests/python/unittest/test_gluon.py index 3049674821c9..962350d0f6b9 100644 --- a/tests/python/unittest/test_gluon.py +++ b/tests/python/unittest/test_gluon.py @@ -506,50 +506,75 @@ def test_deconv(): @with_seed() def test_pool(): - layers1d = [ - nn.MaxPool1D(), - nn.MaxPool1D(3), - 
nn.MaxPool1D(3, 2), - nn.AvgPool1D(), - nn.AvgPool1D(count_include_pad=False), - nn.GlobalAvgPool1D(), - ] - for layer in layers1d: - check_layer_forward(layer, (1, 2, 10)) - - - layers2d = [ - nn.MaxPool2D(), - nn.MaxPool2D((3, 3)), - nn.MaxPool2D(3, 2), - nn.AvgPool2D(), - nn.AvgPool2D(count_include_pad=False), - nn.GlobalAvgPool2D(), - ] - for layer in layers2d: - check_layer_forward(layer, (1, 2, 10, 10)) - - layers3d = [ - nn.MaxPool3D(), - nn.MaxPool3D((3, 3, 3)), - nn.MaxPool3D(3, 2), - nn.AvgPool3D(), - nn.AvgPool3D(count_include_pad=False), - nn.GlobalAvgPool3D(), - ] - for layer in layers3d: - check_layer_forward(layer, (1, 2, 10, 10, 10)) + # transpose shape to bring feature dimension 'c' from 2nd position to last + def transpose(shape): + return (shape[0],) + shape[2:] + (shape[1],) + + for layout in ['NCW', 'NWC']: + shape1d = (1, 2, 10) + if layout == 'NWC': + shape1d = transpose(shape1d) + layers1d = [ + nn.MaxPool1D(layout=layout), + nn.MaxPool1D(3, layout=layout), + nn.MaxPool1D(3, 2, layout=layout), + nn.AvgPool1D(layout=layout), + nn.AvgPool1D(count_include_pad=False, layout=layout), + nn.GlobalAvgPool1D(layout=layout), + ] + for layer in layers1d: + check_layer_forward(layer, shape1d) + + + for layout in ['NCHW', 'NHWC']: + shape2d = (1, 2, 10, 10) + if layout == 'NHWC': + shape2d = transpose(shape2d) + layers2d = [ + nn.MaxPool2D(layout=layout), + nn.MaxPool2D((3, 3), layout=layout), + nn.MaxPool2D(3, 2, layout=layout), + nn.AvgPool2D(layout=layout), + nn.AvgPool2D(count_include_pad=False, layout=layout), + nn.GlobalAvgPool2D(layout=layout), + ] + for layer in layers2d: + check_layer_forward(layer, shape2d) + + for layout in ['NCDHW', 'NDHWC']: + shape3d = (1, 2, 10, 10, 10) + if layout == 'NDHWC': + shape3d = transpose(shape3d) + layers3d = [ + nn.MaxPool3D(layout=layout), + nn.MaxPool3D((3, 3, 3), layout=layout), + nn.MaxPool3D(3, 2, layout=layout), + nn.AvgPool3D(layout=layout), + nn.AvgPool3D(count_include_pad=False, layout=layout), + nn.GlobalAvgPool3D(layout=layout), + ] + for layer in layers3d: + check_layer_forward(layer, shape3d) # test ceil_mode - x = mx.nd.zeros((2, 2, 10, 10)) + for layout in ['NCHW', 'NHWC']: + xshape = (2, 2, 10, 10) + noceil_out_shape = (2, 2, 3, 3) + ceil_out_shape = (2, 2, 4, 4) + if layout == 'NHWC': + xshape = transpose(xshape) + noceil_out_shape = transpose(noceil_out_shape) + ceil_out_shape = transpose(ceil_out_shape) - layer = nn.MaxPool2D(3, ceil_mode=False) - layer.collect_params().initialize() - assert (layer(x).shape==(2, 2, 3, 3)) + x = mx.nd.zeros(xshape) - layer = nn.MaxPool2D(3, ceil_mode=True) - layer.collect_params().initialize() - assert (layer(x).shape==(2, 2, 4, 4)) + layer = nn.MaxPool2D(3, ceil_mode=False, layout=layout) + layer.collect_params().initialize() + assert (layer(x).shape==noceil_out_shape) + + layer = nn.MaxPool2D(3, ceil_mode=True, layout=layout) + layer.collect_params().initialize() + assert (layer(x).shape==ceil_out_shape) @with_seed() @@ -2091,31 +2116,41 @@ def hybrid_forward(self, F, x): @with_seed() def test_slice_pooling2d(): - max_pooling = nn.MaxPool2D(strides=(2, 3), padding=(1, 1)) - avg_pooling = nn.AvgPool2D(strides=(2, 2), padding=(1, 1)) - global_maxpooling = nn.GlobalMaxPool2D() - global_avgpooling = nn.GlobalAvgPool2D() - pooling_layers = [max_pooling, avg_pooling, global_maxpooling, global_avgpooling] - class Net(gluon.HybridBlock): - def __init__(self, - slice, - pooling_layer, - **kwargs): - super(Net, self).__init__(**kwargs) - with self.name_scope(): - self.slice = slice - 
self.pool0 = pooling_layer - - def hybrid_forward(self, F, x): - x_slice = x.slice(begin=self.slice[0], end=self.slice[1]) - out = self.pool0(x_slice) - return out - - x = mx.nd.random.uniform(shape=(16, 128, 256, 256)) - slice = [(0, 0, 0, 0), (4, 16, 32, 64)] - for i in range(len(pooling_layers)): - net = Net(slice, pooling_layers[i]) - check_layer_forward_withinput(net, x) + # transpose shape to bring feature dimension 'c' from 2nd position to last + def transpose(shape): + return (shape[0],) + shape[2:] + (shape[1],) + + for layout in ['NCHW', 'NHWC']: + max_pooling = nn.MaxPool2D(strides=(2, 3), padding=(1, 1), layout=layout) + avg_pooling = nn.AvgPool2D(strides=(2, 2), padding=(1, 1), layout=layout) + global_maxpooling = nn.GlobalMaxPool2D(layout=layout) + global_avgpooling = nn.GlobalAvgPool2D(layout=layout) + pooling_layers = [max_pooling, avg_pooling, global_maxpooling, global_avgpooling] + class Net(gluon.HybridBlock): + def __init__(self, + slice, + pooling_layer, + **kwargs): + super(Net, self).__init__(**kwargs) + with self.name_scope(): + self.slice = slice + self.pool0 = pooling_layer + + def hybrid_forward(self, F, x): + x_slice = x.slice(begin=self.slice[0], end=self.slice[1]) + out = self.pool0(x_slice) + return out + + xshape = (16, 128, 256, 256) + slice_shape = (4, 16, 32, 64) + if layout == 'NHWC': + xshape = transpose(xshape) + slice_shape = transpose(slice_shape) + x = mx.nd.random.uniform(shape=xshape) + slice = [(0, 0, 0, 0), slice_shape] + for i in range(len(pooling_layers)): + net = Net(slice, pooling_layers[i]) + check_layer_forward_withinput(net, x) @with_seed() @unittest.skip('skippping temporarily, tracked by https://github.com/apache/incubator-mxnet/issues/11164') From 934f84c9a022e6888f41999318eafc8e502d9048 Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Wed, 12 Dec 2018 13:08:49 -0800 Subject: [PATCH 09/32] Fix cpplint. --- src/operator/nn/pool.h | 60 +++++++++++++++++++++++------------------- 1 file changed, 33 insertions(+), 27 deletions(-) diff --git a/src/operator/nn/pool.h b/src/operator/nn/pool.h index 4bb00fa9950e..883dc29aa6d2 100644 --- a/src/operator/nn/pool.h +++ b/src/operator/nn/pool.h @@ -1036,10 +1036,12 @@ inline void unpool_sum_1d_ncw_cpu(const DType *out_grad, const DType *in_data, * Do not call this kernel directly. Use the interface unpool(). 
*/ template -inline void unpool_sum_1d_nwc_cpu(const DType* out_grad, const DType* in_data, const DType* out_data, - const TShape& ishape, const TShape& oshape, const TShape& kernel, - const TShape& pad, const TShape& stride, DType* in_grad, - const bool is_avg = false, const bool count_include_pad = true) { +inline void unpool_sum_1d_nwc_cpu(const DType* out_grad, const DType* in_data, + const DType *out_data, const TShape &ishape, + const TShape &oshape, const TShape &kernel, + const TShape &pad, const TShape &stride, + DType *in_grad, const bool is_avg = false, + const bool count_include_pad = true) { const int width = ishape[1]; const int pooled_width = oshape[1]; const int kernel_w = kernel[0]; @@ -1080,10 +1082,11 @@ inline void unpool_sum_1d_nwc_cpu(const DType* out_grad, const DType* in_data, c */ template inline void unpool_sum_2d_nchw_cpu(const DType *out_grad, const DType *in_data, - const DType *out_data, - const TShape &ishape, const TShape &oshape, const TShape &kernel, - const TShape &pad, const TShape &stride, DType *in_grad, - const bool is_avg = false, const bool count_include_pad = true) { + const DType *out_data, const TShape &ishape, + const TShape &oshape, const TShape &kernel, + const TShape &pad, const TShape &stride, + DType *in_grad, const bool is_avg = false, + const bool count_include_pad = true) { const int height = ishape[2], width = ishape[3]; const int pooled_height = oshape[2], pooled_width = oshape[3]; const int kernel_h = kernel[0], kernel_w = kernel[1]; @@ -1131,10 +1134,12 @@ inline void unpool_sum_2d_nchw_cpu(const DType *out_grad, const DType *in_data, * Do not call this kernel directly. Use the interface unpool(). */ template -inline void unpool_sum_2d_nhwc_cpu(const DType* out_grad, const DType* in_data, const DType* out_data, - const TShape& ishape, const TShape& oshape, const TShape& kernel, - const TShape& pad, const TShape& stride, DType* in_grad, - const bool is_avg = false, const bool count_include_pad = true) { +inline void unpool_sum_2d_nhwc_cpu(const DType* out_grad, const DType* in_data, + const DType *out_data, const TShape &ishape, + const TShape &oshape, const TShape &kernel, + const TShape &pad, const TShape &stride, + DType *in_grad, const bool is_avg = false, + const bool count_include_pad = true) { const int height = ishape[1], width = ishape[2]; const int pooled_height = oshape[1], pooled_width = oshape[2]; const int kernel_h = kernel[0], kernel_w = kernel[1]; @@ -1185,11 +1190,11 @@ inline void unpool_sum_2d_nhwc_cpu(const DType* out_grad, const DType* in_data, */ template inline void unpool_sum_3d_ncdhw_cpu(const DType *out_grad, const DType *in_data, - const DType *out_data, - const TShape &ishape, const TShape &oshape, - const TShape &kernel, - const TShape &pad, const TShape &stride, DType *in_grad, - const bool is_avg = false, const bool count_include_pad = true) { + const DType *out_data, const TShape &ishape, + const TShape &oshape, const TShape &kernel, + const TShape &pad, const TShape &stride, + DType *in_grad, const bool is_avg = false, + const bool count_include_pad = true) { const int depth = ishape[2], height = ishape[3], width = ishape[4]; const int pooled_depth = oshape[2], pooled_height = oshape[3], pooled_width = oshape[4]; const int kernel_d = kernel[0], kernel_h = kernel[1], kernel_w = kernel[2]; @@ -1245,10 +1250,12 @@ inline void unpool_sum_3d_ncdhw_cpu(const DType *out_grad, const DType *in_data, * Do not call this kernel directly. Use the interface unpool(). 
*/ template -inline void unpool_sum_3d_ndhwc_cpu(const DType* out_grad, const DType* in_data, const DType* out_data, - const TShape& ishape, const TShape& oshape, const TShape& kernel, - const TShape& pad, const TShape& stride, DType* in_grad, - const bool is_avg = false, const bool count_include_pad = true) { +inline void unpool_sum_3d_ndhwc_cpu(const DType* out_grad, const DType* in_data, + const DType *out_data, const TShape &ishape, + const TShape &oshape, const TShape &kernel, + const TShape &pad, const TShape &stride, + DType *in_grad, const bool is_avg = false, + const bool count_include_pad = true) { const int depth = ishape[1], height = ishape[2], width = ishape[3]; const int pooled_depth = oshape[1], pooled_height = oshape[2], pooled_width = oshape[3]; const int kernel_d = kernel[0], kernel_h = kernel[1], kernel_w = kernel[2]; @@ -1448,8 +1455,8 @@ inline void unpool(mshadow::Stream* s, const DType* out_grad, const DType* unpool_max_1d_nwc_cpu(out_grad, in_data, out_data, ishape, oshape, kernel, pad, stride, in_grad); } else if (pool_enum::kAvgPooling == pool_type) { - unpool_sum_1d_nwc_cpu(out_grad, in_data, out_data, ishape, oshape, kernel, pad, stride, in_grad, - true, count_include_pad); + unpool_sum_1d_nwc_cpu(out_grad, in_data, out_data, ishape, oshape, kernel, pad, stride, + in_grad, true, count_include_pad); } else if (pool_enum::kSumPooling == pool_type) { unpool_sum_1d_nwc_cpu(out_grad, in_data, out_data, ishape, oshape, kernel, pad, stride, in_grad); @@ -1527,8 +1534,8 @@ inline void unpool(mshadow::Stream* s, const DType* out_grad, const DType* unpool_max_3d_ndhwc_cpu(out_grad, in_data, out_data, ishape, oshape, kernel, pad, stride, in_grad); } else if (pool_enum::kAvgPooling == pool_type) { - unpool_sum_3d_ndhwc_cpu(out_grad, in_data, out_data, ishape, oshape, kernel, pad, stride, in_grad, - true, count_include_pad); + unpool_sum_3d_ndhwc_cpu(out_grad, in_data, out_data, ishape, oshape, kernel, pad, stride, + in_grad, true, count_include_pad); } else if (pool_enum::kSumPooling == pool_type) { unpool_sum_3d_ndhwc_cpu(out_grad, in_data, out_data, ishape, oshape, kernel, pad, stride, in_grad); @@ -1560,8 +1567,7 @@ inline void unpool(mshadow::Stream* s, const DType* out_grad, const DType* } else { LOG(FATAL) << "Unsupported layout, expecting kNCDHW or kNDHWC, saw: " << layout.value(); } - } - else { + } else { LOG(FATAL) << "Unsupported " << kernel.ndim() << "-D unpooling"; } } From 73fbae9cb91585bc786238ef916dc956b8da84b7 Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Thu, 20 Dec 2018 16:31:47 -0800 Subject: [PATCH 10/32] Fix bug in cpu nhwc impl. --- src/operator/nn/pool.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/operator/nn/pool.h b/src/operator/nn/pool.h index 883dc29aa6d2..4a2f9b27d5ab 100644 --- a/src/operator/nn/pool.h +++ b/src/operator/nn/pool.h @@ -407,7 +407,7 @@ inline void pool_sum_1d_nwc_cpu(const DType* in_data, const TShape& ishape, cons const int stride_w = stride[0]; const int features = oshape[2]; const index_t in_data_offset = ishape[1] * features; - const index_t out_data_offset = oshape[1 * features]; + const index_t out_data_offset = oshape[1] * features; std::vector sums(features); for (index_t n = 0; n < oshape[0]; ++n) { for (int pw = 0; pw < pooled_width; ++pw) { From 8654d508a748a12019dab3b2710723e6f23ec510 Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Thu, 20 Dec 2018 18:59:51 -0800 Subject: [PATCH 11/32] Add MXNet CUDA pooling in NWC, NHWC and NDHWC. Turn on 3D cuDNN pooling. Tests. 
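The new layouts can be exercised end to end from Python. As a minimal, illustrative sketch only (the shapes and the GPU context are hypothetical and not taken from this patch), channels-last 2-D max pooling through the Gluon block looks roughly like:

    import mxnet as mx
    from mxnet.gluon import nn

    # NHWC input: (batch, height, width, channels)
    x = mx.nd.random.uniform(shape=(1, 8, 8, 3), ctx=mx.gpu(0))
    pool = nn.MaxPool2D(pool_size=2, strides=2, layout='NHWC')
    pool.collect_params().initialize(ctx=mx.gpu(0))
    y = pool(x)  # expected output shape: (1, 4, 4, 3)

Whether a given configuration runs through cuDNN or falls back to the native MXNet CUDA kernels is decided by the support checks changed below.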
--- src/operator/nn/cudnn/cudnn_pooling-inl.h | 119 ++++-- src/operator/nn/pool.cuh | 478 +++++++++++++++------- src/operator/nn/pool_utils.h | 6 +- src/operator/nn/pooling-inl.h | 26 +- tests/python/gpu/test_operator_gpu.py | 423 +++++++++++-------- 5 files changed, 682 insertions(+), 370 deletions(-) diff --git a/src/operator/nn/cudnn/cudnn_pooling-inl.h b/src/operator/nn/cudnn/cudnn_pooling-inl.h index 7579c1b38204..1892a51968cc 100644 --- a/src/operator/nn/cudnn/cudnn_pooling-inl.h +++ b/src/operator/nn/cudnn/cudnn_pooling-inl.h @@ -21,13 +21,13 @@ * Copyright (c) 2015 by Contributors * \file cudnn_pooling-inl.h * \brief - * \author Bing Xu + * \author Bing Xu, Dick Carter */ #ifndef MXNET_OPERATOR_NN_CUDNN_CUDNN_POOLING_INL_H_ #define MXNET_OPERATOR_NN_CUDNN_CUDNN_POOLING_INL_H_ #include -#include +#include #include "../pooling-inl.h" namespace mxnet { @@ -178,6 +178,8 @@ class CuDNNPoolingOp { using namespace mshadow; static bool sum_pooling_warning_issued = false; static bool lp_pooling_warning_issued = false; + static bool unsupported_dim_warning_issued = false; + int layout = param.layout.value(); switch (param.pool_type) { case pool_enum::kMaxPooling: @@ -200,14 +202,14 @@ class CuDNNPoolingOp { } if (param.kernel.ndim() == 2) { - // 2d conv - if (param.layout.value() != mshadow::kNCHW && param.layout.value() != mshadow::kNHWC) + // 2d pooling + if (!(layout == mshadow::kNCHW || layout == mshadow::kNHWC)) return false; #if CUDNN_VERSION == 7104 // CuDNN v7.1.4 backprop kernel doesn't support window sizes 9 and above. // Perform shape calculations in a standard (NCHW) layout space mshadow::Shape<4> input_shape = input.shape_.get<4>(); - mshadow::Shape<4> dshape_nchw = (param.layout.value() == mshadow::kNHWC) ? + mshadow::Shape<4> dshape_nchw = (layout == mshadow::kNHWC) ? ConvertLayout(input_shape, mshadow::kNHWC, mshadow::kNCHW) : input_shape; int window_height = param.global_pool ? dshape_nchw[2] : param.kernel[0]; @@ -215,14 +217,25 @@ class CuDNNPoolingOp { if (window_height > 8 || window_width > 8) return false; #endif + // Avoid strided NHWC max pooling for some configs, to be corrected in a future cudnn release. + if (layout == mshadow::kNHWC && + param.pool_type == pool_enum::kMaxPooling && !param.global_pool) { + if ((param.stride[0] >= 3 || param.stride[0] == 2 && param.kernel[0] % 2 == 0)) + return false; + if ((param.stride[1] >= 3 || param.stride[1] == 2 && param.kernel[1] % 2 == 0)) + return false; + } } else if (param.kernel.ndim() == 3) { + // 3d pooling #if CUDNN_MAJOR < 5 + LogUnsupportedDim(&unsupported_dim_warning_issued, param.kernel.ndim()); return false; #endif - if (param.layout.value() != mshadow::kNCDHW) + if (!(layout == mshadow::kNCDHW || layout == mshadow::kNDHWC)) return false; } else { // Unsupported kernel dim + LogUnsupportedDim(&unsupported_dim_warning_issued, param.kernel.ndim()); return false; } @@ -239,9 +252,9 @@ class CuDNNPoolingOp { nan_prop_ = CUDNN_NOT_PROPAGATE_NAN; #endif if (param_.kernel.ndim() == 2) { - // 2d conv + // 2d pooling CHECK(param_.layout.value() == mshadow::kNCHW || - param_.layout.value() == mshadow::kNHWC) << "Need 2D layout"; + param_.layout.value() == mshadow::kNHWC) << "Need 2D layout NCHW or NHWC."; cudnnTensorFormat_t cudnn_layout = (param_.layout.value() == mshadow::kNCHW) ? 
CUDNN_TENSOR_NCHW : CUDNN_TENSOR_NHWC; @@ -298,57 +311,70 @@ class CuDNNPoolingOp { #endif } else { CHECK(param_.layout.value() == mshadow::kNCDHW || - param_.layout.value() == mshadow::kNDHWC) << "Need 3D layout"; - CHECK(param_.layout.value() == mshadow::kNCDHW) << "Only the NCDHW layout is supported."; + param_.layout.value() == mshadow::kNDHWC) << "Need 3D layout NCDHW or NDHWC."; Tensor data = in_data.get(s); - Tensor out = out_data.get(s); - std::vector ishape = {static_cast(data.shape_[0]), - static_cast(data.shape_[1]), - static_cast(data.shape_[2]), - static_cast(data.shape_[3]), - static_cast(data.shape_[4])}; - - std::vector istride = {static_cast(ishape[1] * ishape[2] * ishape[3] * ishape[4]), - static_cast(ishape[2] * ishape[3] * ishape[4]), - static_cast(ishape[3] * ishape[4]), - static_cast(ishape[4]), 1}; + mshadow::Shape<5> dshape = data.shape_; + mshadow::Shape<5> dstride = mshadow::Shape5(dshape.ProdShape(1, 5), + dshape.ProdShape(2, 5), + dshape.ProdShape(3, 5), + dshape.ProdShape(4, 5), + dshape.ProdShape(5, 5)); - std::vector oshape = {static_cast(out.shape_[0]), - static_cast(out.shape_[1]), - static_cast(out.shape_[2]), - static_cast(out.shape_[3]), - static_cast(out.shape_[4])}; + Tensor out = out_data.get(s); + mshadow::Shape<5> oshape = out.shape_; + mshadow::Shape<5> ostride = mshadow::Shape5(oshape.ProdShape(1, 5), + oshape.ProdShape(2, 5), + oshape.ProdShape(3, 5), + oshape.ProdShape(4, 5), + oshape.ProdShape(5, 5)); + // Convert to a standard (NCDHW) layout space to create args for cuDNN - std::vector ostride = {static_cast(oshape[1] * oshape[2] * oshape[3] * oshape[4]), - static_cast(oshape[2] * oshape[3] * oshape[4]), - static_cast(oshape[3] * oshape[4]), - static_cast(oshape[4]), 1}; + mshadow::Shape<5> dshape_ncdhw = (param_.layout.value() == mshadow::kNDHWC) ? + ConvertLayout(dshape, mshadow::kNDHWC, mshadow::kNCDHW) : + dshape; + mshadow::Shape<5> dstride_ncdhw = (param_.layout.value() == mshadow::kNDHWC) ? + ConvertLayout(dstride, mshadow::kNDHWC, mshadow::kNCDHW) : + dstride; + mshadow::Shape<5> oshape_ncdhw = (param_.layout.value() == mshadow::kNDHWC) ? + ConvertLayout(oshape, mshadow::kNDHWC, mshadow::kNCDHW) : + oshape; + mshadow::Shape<5> ostride_ncdhw = (param_.layout.value() == mshadow::kNDHWC) ? + ConvertLayout(ostride, mshadow::kNDHWC, mshadow::kNCDHW) : + ostride; + // Create int arrays for passing into cuDNN + std::array dshape_ncdhw_int, dstride_ncdhw_int, oshape_ncdhw_int, ostride_ncdhw_int; + for (int i = 0; i < 5; ++i) { + dshape_ncdhw_int[i] = static_cast(dshape_ncdhw[i]); + dstride_ncdhw_int[i] = static_cast(dstride_ncdhw[i]); + oshape_ncdhw_int[i] = static_cast(oshape_ncdhw[i]); + ostride_ncdhw_int[i] = static_cast(ostride_ncdhw[i]); + } - std::vector kernel_vec = {param_.global_pool ? ishape[2] : + std::array kernel_vec = {param_.global_pool ? static_cast(dshape_ncdhw[2]) : static_cast(param_.kernel[0]), - param_.global_pool ? ishape[3] : + param_.global_pool ? static_cast(dshape_ncdhw[3]) : static_cast(param_.kernel[1]), - param_.global_pool ? ishape[4] : + param_.global_pool ? static_cast(dshape_ncdhw[4]) : static_cast(param_.kernel[2])}; - std::vector pad_vec = {param_.global_pool ? 0 : static_cast(param_.pad[0]), + std::array pad_vec = {param_.global_pool ? 0 : static_cast(param_.pad[0]), param_.global_pool ? 0 : static_cast(param_.pad[1]), param_.global_pool ? 0 : static_cast(param_.pad[2])}; - std::vector stride_vec = {param_.global_pool ? 1 : static_cast(param_.stride[0]), + std::array stride_vec = {param_.global_pool ? 
1 : static_cast(param_.stride[0]), param_.global_pool ? 1 : static_cast(param_.stride[1]), param_.global_pool ? 1 : static_cast(param_.stride[2])}; CUDNN_CALL(cudnnSetTensorNdDescriptor(in_desc_, dtype_, - static_cast(ishape.size()), - &ishape[0], - &istride[0])); + static_cast(dshape_ncdhw_int.size()), + &dshape_ncdhw_int[0], + &dstride_ncdhw_int[0])); CUDNN_CALL(cudnnSetTensorNdDescriptor(out_desc_, dtype_, - static_cast(oshape.size()), - &oshape[0], - &ostride[0])); + static_cast(oshape_ncdhw_int.size()), + &oshape_ncdhw_int[0], + &ostride_ncdhw_int[0])); #if CUDNN_MAJOR >= 5 CUDNN_CALL(cudnnSetPoolingNdDescriptor(pooling_desc_, mode_, @@ -358,12 +384,21 @@ class CuDNNPoolingOp { &(pad_vec[0]), &(stride_vec[0]))); #else - LOG(FATAL) << "3D pooling only support CUDNN v5 and above"; + LOG(FATAL) << "3D pooling is only supported by CUDNN v5 and above."; #endif } return is_supported; } + // Log once that the dimension of the pooling operation isn't supported + static void LogUnsupportedDim(bool *msg_logged, int ndim) { + if (!*msg_logged) { + *msg_logged = true; + LOG(WARNING) << ndim << "D pooling is not supported by cudnn, " + << "MXNet " << ndim << "D pooling is applied."; + } + } + cudnnDataType_t dtype_; cudnnHandle_t handle_; cudnnPoolingMode_t mode_; diff --git a/src/operator/nn/pool.cuh b/src/operator/nn/pool.cuh index f398c10c2c47..cbdf77d272a0 100644 --- a/src/operator/nn/pool.cuh +++ b/src/operator/nn/pool.cuh @@ -89,29 +89,32 @@ namespace mxnet { namespace op { /*! - * \brief max pooling gpu kernel for 1-D images. + * \brief max pooling gpu kernel for 1-D images, for both NCW and NWC layouts. * Do not call this kernel directly. Use the interface pool(). */ -template +template __global__ void pool_max_1d_gpu_kernel(const int nthreads, const DType* in_data, const int channels, const int width, const int pooled_width, const int kernel_w, const int stride_w, const int pad_w, DType* out_data) { using mshadow::red::limits::MinValue; - // index is the output image's pixel index in NCW + // index is the output image's pixel index CUDA_KERNEL_LOOP(index, nthreads) { - const int pw = index % pooled_width; - const int c = (index / pooled_width) % channels; + const bool nwc_layout = layout == mshadow::kNWC; + const int idx = nwc_layout ? (index / channels) : index; + const int pw = idx % pooled_width; + const int c = nwc_layout ? (index % channels) : (index / pooled_width) % channels; const int n = index / pooled_width / channels; int wstart = pw * stride_w - pad_w; const int wend = min(wstart + kernel_w, width); wstart = max(wstart, 0); - const DType* in_slice = - in_data + (n * channels + c) * width; + const DType* in_slice = nwc_layout ? in_data + n * channels * width + c + : in_data + (n * channels + c) * width; DType max_val = MinValue(); + const int multiplier = nwc_layout ? channels : 1; for (int w = wstart; w < wend; ++w) { - const DType in_val = in_slice[w]; + const DType in_val = in_slice[w * multiplier]; if (in_val > max_val) { max_val = in_val; } @@ -121,10 +124,10 @@ __global__ void pool_max_1d_gpu_kernel(const int nthreads, const DType* in_data, } /*! - * \brief max pooling gpu kernel for 2-D images. + * \brief max pooling gpu kernel for 2-D images, for both NCHW and NHWC layouts. * Do not call this kernel directly. Use the interface pool(). 
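 * For the channels-last (NHWC) case the flat output index decomposes as
 * index = ((n * pooled_height + ph) * pooled_width + pw) * channels + c, so two
 * horizontally adjacent samples of one channel sit `channels` elements apart in
 * memory; the kernel walks the pooling window with that stride (`multiplier`).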
*/ -template +template __global__ void pool_max_2d_gpu_kernel(const int nthreads, const DType* in_data, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, @@ -132,11 +135,14 @@ __global__ void pool_max_2d_gpu_kernel(const int nthreads, const DType* in_data, const int stride_w, const int pad_h, const int pad_w, DType* out_data) { using mshadow::red::limits::MinValue; - // index is the output image's pixel index in NCHW + // index is the output image's pixel index CUDA_KERNEL_LOOP(index, nthreads) { - const int pw = index % pooled_width; - const int ph = (index / pooled_width) % pooled_height; - const int c = (index / pooled_width / pooled_height) % channels; + const bool nhwc_layout = layout == mshadow::kNHWC; + const int idx = nhwc_layout ? (index / channels) : index; + const int pw = idx % pooled_width; + const int ph = (idx / pooled_width) % pooled_height; + const int c = nhwc_layout ? (index % channels) + : (index / pooled_width / pooled_height) % channels; const int n = index / pooled_width / pooled_height / channels; int hstart = ph * stride_h - pad_h; int wstart = pw * stride_w - pad_w; @@ -144,12 +150,13 @@ __global__ void pool_max_2d_gpu_kernel(const int nthreads, const DType* in_data, const int wend = min(wstart + kernel_w, width); hstart = max(hstart, 0); wstart = max(wstart, 0); - const DType* in_slice = - in_data + (n * channels + c) * height * width; + const DType* in_slice = nhwc_layout ? in_data + n * channels * height * width + c + : in_data + (n * channels + c) * height * width; DType max_val = MinValue(); + const int multiplier = nhwc_layout ? channels : 1; for (int h = hstart; h < hend; ++h) { for (int w = wstart; w < wend; ++w) { - const DType in_val = in_slice[h * width + w]; + const DType in_val = in_slice[(h * width + w) * multiplier]; if (in_val > max_val) { max_val = in_val; } @@ -160,10 +167,10 @@ __global__ void pool_max_2d_gpu_kernel(const int nthreads, const DType* in_data, } /*! - * \brief max pooling gpu kernel for 3-D images. + * \brief max pooling gpu kernel for 3-D images, for both NCDHW and NDHWC layouts. * Do not call this kernel directly. Use the interface pool(). */ -template +template __global__ void pool_max_3d_gpu_kernel(const int nthreads, const DType* in_data, const int channels, const int depth, const int height, const int width, const int pooled_depth, const int pooled_height, @@ -173,12 +180,15 @@ __global__ void pool_max_3d_gpu_kernel(const int nthreads, const DType* in_data, const int pad_h, const int pad_w, DType* out_data) { using mshadow::red::limits::MinValue; - // index is the output image's pixel index in NCDHW + // index is the output image's pixel index CUDA_KERNEL_LOOP(index, nthreads) { - const int pw = index % pooled_width; - const int ph = (index / pooled_width) % pooled_height; - const int pd = (index / pooled_width / pooled_height) % pooled_depth; - const int c = (index / pooled_width / pooled_height / pooled_depth) % channels; + const bool ndhwc_layout = layout == mshadow::kNDHWC; + const int idx = ndhwc_layout ? (index / channels) : index; + const int pw = idx % pooled_width; + const int ph = (idx / pooled_width) % pooled_height; + const int pd = (idx / pooled_width / pooled_height) % pooled_depth; + const int c = ndhwc_layout ? 
(index % channels) + : (index / pooled_width / pooled_height / pooled_depth) % channels; const int n = index / pooled_width / pooled_height / pooled_depth / channels; int dstart = pd * stride_d - pad_d; int hstart = ph * stride_h - pad_h; @@ -189,13 +199,14 @@ __global__ void pool_max_3d_gpu_kernel(const int nthreads, const DType* in_data, dstart = max(dstart, 0); hstart = max(hstart, 0); wstart = max(wstart, 0); - const DType* in_slice = - in_data + (n * channels + c) * depth * height * width; + const DType* in_slice = ndhwc_layout ? in_data + n * channels * depth * height * width + c + : in_data + (n * channels + c) * depth * height * width; DType max_val = MinValue(); + const int multiplier = ndhwc_layout ? channels : 1; for (int d = dstart; d < dend; ++d) { for (int h = hstart; h < hend; ++h) { for (int w = wstart; w < wend; ++w) { - const DType in_val = in_slice[(d * height + h) * width + w]; + const DType in_val = in_slice[((d * height + h) * width + w) * multiplier]; if (in_val > max_val) { max_val = in_val; } @@ -207,17 +218,20 @@ __global__ void pool_max_3d_gpu_kernel(const int nthreads, const DType* in_data, } /*! - * \brief avg/sum pooling gpu kernel for 1-D images. + * \brief avg/sum pooling gpu kernel for 1-D images, for both NCW and NWC layouts. * Do not call this kernel directly. Use the interface pool(). */ -template +template __global__ void pool_sum_1d_gpu_kernel(const int nthreads, const DType* in_data, const int channels, const int width, const int pooled_width, const int kernel_w, const int stride_w, const int pad_w, DType* out_data, - const bool get_avg = false, const bool count_include_pad = true) { + const bool get_avg = false, + const bool count_include_pad = true) { CUDA_KERNEL_LOOP(index, nthreads) { - const int pw = index % pooled_width; - const int c = (index / pooled_width) % channels; + const bool nwc_layout = layout == mshadow::kNWC; + const int idx = nwc_layout ? (index / channels) : index; + const int pw = idx % pooled_width; + const int c = nwc_layout ? (index % channels) : (index / pooled_width) % channels; const int n = index / pooled_width / channels; int wstart = pw * stride_w - pad_w; int wend = min(wstart + kernel_w, width + pad_w); @@ -228,19 +242,21 @@ __global__ void pool_sum_1d_gpu_kernel(const int nthreads, const DType* in_data, pool_size = (wend - wstart); } DType sum = 0; - const DType* out_slice = in_data + (n * channels + c) * width; + const DType* out_slice = nwc_layout ? in_data + n * channels * width + c + : in_data + (n * channels + c) * width; + const int multiplier = nwc_layout ? channels : 1; for (int w = wstart; w < wend; ++w) { - sum += a_pow_p::Map(out_slice[w]) / pool_size; + sum += a_pow_p::Map(out_slice[w * multiplier]) / pool_size; } out_data[index] = a_root_p::Map(sum); } } /*! - * \brief avg/sum pooling gpu kernel for 2-D images. + * \brief avg/sum pooling gpu kernel for 2-D images, for both NCHW and NHWC layouts. * Do not call this kernel directly. Use the interface pool(). 
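 * A single kernel serves avg, sum and Lp pooling: inputs are mapped through
 * a_pow_p on the way in and a_root_p on the way out, and for average pooling
 * each contribution is divided by the window size (which excludes the implicit
 * zero padding when count_include_pad is false).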
*/ -template +template __global__ void pool_sum_2d_gpu_kernel(const int nthreads, const DType* in_data, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, @@ -250,9 +266,12 @@ __global__ void pool_sum_2d_gpu_kernel(const int nthreads, const DType* in_data, const bool get_avg = false, const bool count_include_pad = true) { CUDA_KERNEL_LOOP(index, nthreads) { - const int pw = index % pooled_width; - const int ph = (index / pooled_width) % pooled_height; - const int c = (index / pooled_width / pooled_height) % channels; + const bool nhwc_layout = layout == mshadow::kNHWC; + const int idx = nhwc_layout ? (index / channels) : index; + const int pw = idx % pooled_width; + const int ph = (idx / pooled_width) % pooled_height; + const int c = nhwc_layout ? (index % channels) + : (index / pooled_width / pooled_height) % channels; const int n = index / pooled_width / pooled_height / channels; int hstart = ph * stride_h - pad_h; int wstart = pw * stride_w - pad_w; @@ -267,10 +286,12 @@ __global__ void pool_sum_2d_gpu_kernel(const int nthreads, const DType* in_data, pool_size = (hend - hstart) * (wend - wstart); } DType sum = 0; - const DType* out_slice = in_data + (n * channels + c) * height * width; + const DType* out_slice = nhwc_layout ? in_data + n * channels * height * width + c + : in_data + (n * channels + c) * height * width; + const int multiplier = nhwc_layout ? channels : 1; for (int h = hstart; h < hend; ++h) { for (int w = wstart; w < wend; ++w) { - sum += a_pow_p::Map(out_slice[h * width + w]) / pool_size; + sum += a_pow_p::Map(out_slice[(h * width + w) * multiplier]) / pool_size; } } out_data[index] = a_root_p::Map(sum); @@ -278,10 +299,10 @@ __global__ void pool_sum_2d_gpu_kernel(const int nthreads, const DType* in_data, } /*! - * \brief avg/sum pooling gpu kernel for 3-D images. + * \brief avg/sum pooling gpu kernel for 3-D images, for both NCDHW and NDHWC layouts. * Do not call this kernel directly. Use the interface pool(). */ -template +template __global__ void pool_sum_3d_gpu_kernel(const int nthreads, const DType* in_data, const int channels, const int depth, const int height, const int width, const int pooled_depth, const int pooled_height, @@ -292,10 +313,13 @@ __global__ void pool_sum_3d_gpu_kernel(const int nthreads, const DType* in_data, DType* out_data, const bool get_avg = false, const bool count_include_pad = true) { CUDA_KERNEL_LOOP(index, nthreads) { - const int pw = index % pooled_width; - const int ph = (index / pooled_width) % pooled_height; - const int pd = (index / pooled_width / pooled_height) % pooled_depth; - const int c = (index / pooled_width / pooled_height / pooled_depth) % channels; + const bool ndhwc_layout = layout == mshadow::kNDHWC; + const int idx = ndhwc_layout ? (index / channels) : index; + const int pw = idx % pooled_width; + const int ph = (idx / pooled_width) % pooled_height; + const int pd = (idx / pooled_width / pooled_height) % pooled_depth; + const int c = ndhwc_layout ? 
(index % channels) + : (index / pooled_width / pooled_height / pooled_depth) % channels; const int n = index / pooled_width / pooled_height / pooled_depth / channels; int dstart = pd * stride_d - pad_d; int hstart = ph * stride_h - pad_h; @@ -314,11 +338,14 @@ __global__ void pool_sum_3d_gpu_kernel(const int nthreads, const DType* in_data, pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart); } DType sum = 0; - const DType* out_slice = in_data + (n * channels + c) * depth * height * width; + const DType* out_slice = ndhwc_layout ? in_data + n * channels * depth * height * width + c + : in_data + (n * channels + c) * depth * height * width; + const int multiplier = ndhwc_layout ? channels : 1; for (int d = dstart; d < dend; ++d) { for (int h = hstart; h < hend; ++h) { for (int w = wstart; w < wend; ++w) { - sum += a_pow_p::Map(out_slice[(d * height + h) * width + w]) / pool_size; + sum += a_pow_p::Map(out_slice[((d * height + h) * width + w) * + multiplier]) / pool_size; } } } @@ -329,35 +356,38 @@ __global__ void pool_sum_3d_gpu_kernel(const int nthreads, const DType* in_data, } /*! - * \brief max unpooling gpu kernel for 1-D images. + * \brief max unpooling gpu kernel for 1-D images, for both NCW and NWC layouts. * Do not call this kernel directly. Use the interface unpool(). */ -template +template __global__ void unpool_max_1d_gpu_kernel(const int nthreads, const DType* out_grad, const DType* in_data, const DType* out_data, const int channels, const int width, const int pooled_width, const int kernel_w, const int stride_w, const int pad_w, DType* in_grad) { - // index is the output image's pixel index in NCHW + // index is the output image's pixel index // the order has to be consistent with pooling max // to avoid adding out_grad to the wrong in_grad // in the case where there are multiple max pixels // covered by a kernel window CUDA_KERNEL_LOOP(index, nthreads) { - const int pw = index % pooled_width; - const int c = (index / pooled_width) % channels; - const int n = index / pooled_width / channels; + const bool nwc_layout = layout == mshadow::kNWC; + const int idx = nwc_layout ? (index / channels) : index; + const int pw = idx % pooled_width; + const int c = nwc_layout ? index % channels : (index / pooled_width) % channels; + const int n = index / channels / pooled_width; int wstart = pw * stride_w - pad_w; const int wend = min(wstart + kernel_w, width); wstart = max(wstart, 0); // in data/grad offset batch and channel dims - int in_offset = (n * channels + c) * width; + const int in_offset = nwc_layout ? n * channels * width + c : (n * channels + c) * width; const DType* in_data_slice = in_data + in_offset; int max_idx = -1; DType max_val = out_data[index]; + const int multiplier = nwc_layout ? channels : 1; for (int w = wstart; w < wend; ++w) { - if (in_data_slice[w] == max_val) { + if (in_data_slice[w * multiplier] == max_val) { max_idx = w; break; } @@ -366,16 +396,16 @@ __global__ void unpool_max_1d_gpu_kernel(const int nthreads, const DType* out_gr // In the case where pad > 0 and kernel = 1, for example, // max_idx can be -1 reaching this step. if (max_idx >= 0) { - atomicAdd(&in_grad[in_offset+max_idx], out_grad[index]); + atomicAdd(&in_grad[in_offset + max_idx * multiplier], out_grad[index]); } } } /*! - * \brief max unpooling gpu kernel for 2-D images. + * \brief max unpooling gpu kernel for 2-D images, for both NCHW and NHWC layouts. * Do not call this kernel directly. Use the interface unpool(). 
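 * Overlapping windows can select the same input element as their maximum, so
 * the gradient is accumulated into in_grad with atomicAdd rather than a plain
 * store.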
*/ -template +template __global__ void unpool_max_2d_gpu_kernel(const int nthreads, const DType* out_grad, const DType* in_data, const DType* out_data, const int channels, const int height, const int width, @@ -384,15 +414,18 @@ __global__ void unpool_max_2d_gpu_kernel(const int nthreads, const DType* out_gr const int stride_h, const int stride_w, const int pad_h, const int pad_w, DType* in_grad) { - // index is the output image's pixel index in NCHW + // index is the output image's pixel index // the order has to be consistent with pooling max // to avoid adding out_grad to the wrong in_grad // in the case where there are multiple max pixels // covered by a kernel window CUDA_KERNEL_LOOP(index, nthreads) { - const int pw = index % pooled_width; - const int ph = (index / pooled_width) % pooled_height; - const int c = (index / pooled_width / pooled_height) % channels; + const bool nhwc_layout = layout == mshadow::kNHWC; + const int idx = nhwc_layout ? (index / channels) : index; + const int pw = idx % pooled_width; + const int ph = (idx / pooled_width) % pooled_height; + const int c = nhwc_layout ? (index % channels) + : (index / pooled_width / pooled_height) % channels; const int n = index / pooled_width / pooled_height / channels; int hstart = ph * stride_h - pad_h; int wstart = pw * stride_w - pad_w; @@ -401,15 +434,17 @@ __global__ void unpool_max_2d_gpu_kernel(const int nthreads, const DType* out_gr hstart = max(hstart, 0); wstart = max(wstart, 0); // in data/grad offset batch and channel dims - int in_offset = (n * channels + c) * height * width; + int in_offset = nhwc_layout ? n * channels * height * width + c + : (n * channels + c) * height * width; const DType* in_data_slice = in_data + in_offset; int max_idx = -1; DType max_val = out_data[index]; + const int multiplier = nhwc_layout ? channels : 1; bool found = false; for (int h = hstart; h < hend; ++h) { for (int w = wstart; w < wend; ++w) { const int idx = h * width + w; - if (in_data_slice[idx] == max_val) { + if (in_data_slice[idx * multiplier] == max_val) { max_idx = idx; found = true; break; @@ -421,16 +456,16 @@ __global__ void unpool_max_2d_gpu_kernel(const int nthreads, const DType* out_gr // In the case where pad > 0 and kernel = 1, for example, // max_idx can be -1 reaching this step. if (max_idx >= 0) { - atomicAdd(&in_grad[in_offset+max_idx], out_grad[index]); + atomicAdd(&in_grad[in_offset + max_idx * multiplier], out_grad[index]); } } } /*! - * \brief max unpooling gpu kernel for 3-D images. + * \brief max unpooling gpu kernel for 3-D images, for both NCDHW and NDHWC layouts. * Do not call this kernel directly. Use the interface unpool(). 
*/ -template +template __global__ void unpool_max_3d_gpu_kernel(const int nthreads, const DType* out_grad, const DType* in_data, const DType* out_data, const int channels, const int depth, const int height, @@ -441,16 +476,19 @@ __global__ void unpool_max_3d_gpu_kernel(const int nthreads, const DType* out_gr const int stride_h, const int stride_w, const int pad_d, const int pad_h, const int pad_w, DType* in_grad) { - // index is the output image's pixel index in NCDHW + // index is the output image's pixel index // the order has to be consistent with pooling max // to avoid adding out_grad to the wrong in_grad // in the case where there are multiple max pixels // covered by a kernel window CUDA_KERNEL_LOOP(index, nthreads) { - const int pw = index % pooled_width; - const int ph = (index / pooled_width) % pooled_height; - const int pd = (index / pooled_width / pooled_height) % pooled_depth; - const int c = (index / pooled_width / pooled_height / pooled_depth) % channels; + const bool ndhwc_layout = layout == mshadow::kNDHWC; + const int idx = ndhwc_layout ? (index / channels) : index; + const int pw = idx % pooled_width; + const int ph = (idx / pooled_width) % pooled_height; + const int pd = (idx / pooled_width / pooled_height) % pooled_depth; + const int c = ndhwc_layout ? (index % channels) + : (index / pooled_width / pooled_height / pooled_depth) % channels; const int n = index / pooled_width / pooled_height / pooled_depth / channels; int dstart = pd * stride_d - pad_d; int hstart = ph * stride_h - pad_h; @@ -462,16 +500,18 @@ __global__ void unpool_max_3d_gpu_kernel(const int nthreads, const DType* out_gr hstart = max(hstart, 0); wstart = max(wstart, 0); // in data/grad offset batch and channel dims - int in_offset = (n * channels + c) * depth * height * width; + int in_offset = ndhwc_layout ? n * channels * depth * height * width + c + : (n * channels + c) * depth * height * width; const DType* in_data_slice = in_data + in_offset; int max_idx = -1; DType max_val = out_data[index]; + const int multiplier = ndhwc_layout ? channels : 1; bool found = false; for (int d = dstart; d < dend; ++d) { for (int h = hstart; h < hend; ++h) { for (int w = wstart; w < wend; ++w) { const int idx = (d * height + h) * width + w; - if (in_data_slice[idx] == max_val) { + if (in_data_slice[idx * multiplier] == max_val) { max_idx = idx; found = true; break; @@ -485,16 +525,16 @@ __global__ void unpool_max_3d_gpu_kernel(const int nthreads, const DType* out_gr // In the case where pad > 0 and kernel = 1, for example, // max_idx can be -1 reaching this step. if (max_idx >= 0) { - atomicAdd(&in_grad[in_offset+max_idx], out_grad[index]); + atomicAdd(&in_grad[in_offset + max_idx * multiplier], out_grad[index]); } } } /*! - * \brief avg/sum unpooling gpu kernel for 1-D images. + * \brief avg/sum unpooling gpu kernel for 1-D images, for both NCW and NWC layouts. * Do not call this kernel directly. Use the interface unpool(). 
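 * Unlike the max variants above, this kernel is written as a gather: each
 * thread owns one input element and sums the (scaled) gradient of every output
 * window that covers it, so no atomic operations are needed.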
*/ -template +template __global__ void unpool_sum_1d_gpu_kernel(const int nthreads, const DType* out_grad, const DType* in_data, const DType* out_data, const int channels, const int width, @@ -502,20 +542,23 @@ __global__ void unpool_sum_1d_gpu_kernel(const int nthreads, const DType* out_gr const int stride_w, const int pad_w, DType* in_grad, const bool is_avg = false, const bool count_include_pad = true) { - // index is the input image index in NCW + // index is the input image index CUDA_KERNEL_LOOP(index, nthreads) { // find out the local index // find out the local offset - const int w = index % width + pad_w; - const int c = (index / width) % channels; + const bool nwc_layout = layout == mshadow::kNWC; + const int idx = nwc_layout ? (index / channels) : index; + const int w = idx % width + pad_w; + const int c = nwc_layout ? index % channels : (index / width) % channels; const int n = index / width / channels; const int pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1; const int pwend = min(w / stride_w + 1, pooled_width); DType gradient = 0; - const DType* out_grad_slice = - out_grad + (n * channels + c) * pooled_width; - const DType* out_data_slice = - out_data + (n * channels + c) * pooled_width; + const int slice_offset = nwc_layout ? n * channels * pooled_width + c + : (n * channels + c) * pooled_width; + const DType* out_grad_slice = out_grad + slice_offset; + const DType* out_data_slice = out_data + slice_offset; + const int multiplier = nwc_layout ? channels : 1; for (int pw = pwstart; pw < pwend; ++pw) { // figure out the pooling size int wstart = pw * stride_w - pad_w; @@ -527,7 +570,8 @@ __global__ void unpool_sum_1d_gpu_kernel(const int nthreads, const DType* out_gr pool_size = (wend - wstart); } gradient += - lp_grad::Map(out_grad_slice[pw], in_data[index], out_data_slice[pw]) / pool_size; + lp_grad::Map(out_grad_slice[pw * multiplier], in_data[index], + out_data_slice[pw * multiplier]) / pool_size; } // if req=kWriteTo, in_grad has already been assigned zero values in unpool() // use "+=" here instead of "=" to accommodate when req=kAddTo @@ -536,10 +580,10 @@ __global__ void unpool_sum_1d_gpu_kernel(const int nthreads, const DType* out_gr } /*! - * \brief avg/sum unpooling gpu kernel for 2-D images. + * \brief avg/sum unpooling gpu kernel for 2-D images, for both NCHW and NHWC layouts. * Do not call this kernel directly. Use the interface unpool(). */ -template +template __global__ void unpool_sum_2d_gpu_kernel(const int nthreads, const DType* out_grad, const DType* in_data, const DType* out_data, const int channels, const int height, const int width, @@ -549,23 +593,26 @@ __global__ void unpool_sum_2d_gpu_kernel(const int nthreads, const DType* out_gr const int pad_h, const int pad_w, DType* in_grad, const bool is_avg = false, const bool count_include_pad = true) { - // index is the input image index in NCHW + // index is the input image index CUDA_KERNEL_LOOP(index, nthreads) { // find out the local index // find out the local offset - const int w = index % width + pad_w; - const int h = (index / width) % height + pad_h; - const int c = (index / width / height) % channels; + const bool nhwc_layout = layout == mshadow::kNHWC; + const int idx = nhwc_layout ? (index / channels) : index; + const int w = idx % width + pad_w; + const int h = (idx / width) % height + pad_h; + const int c = nhwc_layout ? index % channels : (index / width / height) % channels; const int n = index / width / height / channels; const int phstart = (h < kernel_h) ? 
0 : (h - kernel_h) / stride_h + 1; const int phend = min(h / stride_h + 1, pooled_height); const int pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1; const int pwend = min(w / stride_w + 1, pooled_width); DType gradient = 0; - const DType* out_grad_slice = - out_grad + (n * channels + c) * pooled_height * pooled_width; - const DType* out_data_slice = - out_data + (n * channels + c) * pooled_height * pooled_width; + const int slice_offset = nhwc_layout ? n * channels * pooled_height * pooled_width + c + : (n * channels + c) * pooled_height * pooled_width; + const DType* out_grad_slice = out_grad + slice_offset; + const DType* out_data_slice = out_data + slice_offset; + const int multiplier = nhwc_layout ? channels : 1; for (int ph = phstart; ph < phend; ++ph) { for (int pw = pwstart; pw < pwend; ++pw) { // figure out the pooling size @@ -583,9 +630,9 @@ __global__ void unpool_sum_2d_gpu_kernel(const int nthreads, const DType* out_gr pool_size = (hend - hstart) * (wend - wstart); } gradient += - lp_grad::Map(out_grad_slice[out_index], + lp_grad::Map(out_grad_slice[out_index * multiplier], in_data[index], - out_data_slice[out_index]) / pool_size; + out_data_slice[out_index * multiplier]) / pool_size; } } // if req=kWriteTo, in_grad has already been assigned zero values in unpool() @@ -595,10 +642,10 @@ __global__ void unpool_sum_2d_gpu_kernel(const int nthreads, const DType* out_gr } /*! - * \brief avg/sum unpooling gpu kernel for 3-D images. + * \brief avg/sum unpooling gpu kernel for 3-D images, for both NCDHW and NDHWC layouts. * Do not call this kernel directly. Use the interface unpool(). */ -template +template __global__ void unpool_sum_3d_gpu_kernel(const int nthreads, const DType* out_grad, const DType* in_data, const DType* out_data, const int channels, const int depth, const int height, @@ -609,14 +656,16 @@ __global__ void unpool_sum_3d_gpu_kernel(const int nthreads, const DType* out_gr const int stride_w, const int pad_d, const int pad_h, const int pad_w, DType* in_grad, const bool is_avg = false, const bool count_include_pad = true) { - // index is the input image index in NCDHW + // index is the input image index CUDA_KERNEL_LOOP(index, nthreads) { // find out the local index // find out the local offset - const int w = index % width + pad_w; - const int h = (index / width) % height + pad_h; - const int d = (index / width / height) % depth + pad_d; - const int c = (index / width / height / depth) % channels; + const bool ndhwc_layout = layout == mshadow::kNDHWC; + const int idx = ndhwc_layout ? (index / channels) : index; + const int w = idx % width + pad_w; + const int h = (idx / width) % height + pad_h; + const int d = (idx / width / height) % depth + pad_d; + const int c = ndhwc_layout ? index % channels : (index / width / height / depth) % channels; const int n = index / width / height / depth / channels; const int pdstart = (d < kernel_d) ? 0 : (d - kernel_d) / stride_d + 1; const int pdend = min(d / stride_d + 1, pooled_depth); @@ -625,10 +674,12 @@ __global__ void unpool_sum_3d_gpu_kernel(const int nthreads, const DType* out_gr const int pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1; const int pwend = min(w / stride_w + 1, pooled_width); DType gradient = 0; - const DType* out_grad_slice = - out_grad + (n * channels + c) * pooled_depth * pooled_height * pooled_width; - const DType* out_data_slice = - out_data + (n * channels + c) * pooled_depth * pooled_height * pooled_width; + const int slice_offset = + ndhwc_layout ? 
n * channels * pooled_depth * pooled_height * pooled_width + c + : (n * channels + c) * pooled_depth * pooled_height * pooled_width; + const DType* out_grad_slice = out_grad + slice_offset; + const DType* out_data_slice = out_data + slice_offset; + const int multiplier = ndhwc_layout ? channels : 1; for (int pd = pdstart; pd < pdend; ++pd) { for (int ph = phstart; ph < phend; ++ph) { for (int pw = pwstart; pw < pwend; ++pw) { @@ -650,9 +701,9 @@ __global__ void unpool_sum_3d_gpu_kernel(const int nthreads, const DType* out_gr wend = min(wend, width); pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart); } - gradient += lp_grad::Map(out_grad_slice[out_index], + gradient += lp_grad::Map(out_grad_slice[out_index * multiplier], in_data[index], - out_data_slice[out_index]) / pool_size; + out_data_slice[out_index * multiplier]) / pool_size; } } } @@ -674,32 +725,26 @@ __global__ void unpool_sum_3d_gpu_kernel(const int nthreads, const DType* out_gr * \param pool_type supported pooling type: max, avg, sum * \param req_type operator request type, only support kWriteTo for now * \param out_data pointer of the output tensor data in the format of NCW, NCHW, or NCDHW - * \param p_value value of p for Lp pooling + * \param count_include_pad for avg pooling, should 0 pad values be averaged in the window */ -template +template inline void pool(mshadow::Stream* s, const DType* in_data, const TShape& ishape, const TShape& oshape, const TShape& kernel, const TShape& pad, const TShape& stride, const int pool_type, OpReqType req_type, - DType* out_data, const bool count_include_pad, const dmlc::optional &layout) { + DType* out_data, const bool count_include_pad) { CHECK_EQ(req_type, kWriteTo) << "Only support req=kWriteTo in pooling operations"; - CHECK(!layout.has_value() || - layout.value() == mshadow::kNCW || - layout.value() == mshadow::kNCHW || - layout.value() == mshadow::kNCDHW) << - "MXNet CUDA pooling expects layout NCW, NCHW or NCDHW. 
Need CuDNN for support of layout " << - layout.value(); using namespace mxnet_op; if (kernel.ndim() == 1) { if (pool_enum::kMaxPooling == pool_type) { // NOLINT_NEXT_LINE(whitespace/operators) - pool_max_1d_gpu_kernel<<<<::GetStream(s)>>>( oshape.Size(), in_data, ishape[1], ishape[2], oshape[2], kernel[0], stride[0], pad[0], out_data); MSHADOW_CUDA_POST_KERNEL_CHECK(pool_max_1d_gpu_kernel); } else if (pool_enum::kAvgPooling == pool_type) { // NOLINT_NEXT_LINE(whitespace/operators) - pool_sum_1d_gpu_kernel<<<<::GetStream(s)>>>( oshape.Size(), in_data, ishape[1], ishape[2], oshape[2], kernel[0], stride[0], pad[0], out_data, @@ -707,14 +752,14 @@ inline void pool(mshadow::Stream* s, const DType* in_data, const TShape& is MSHADOW_CUDA_POST_KERNEL_CHECK(pool_sum_1d_gpu_kernel); } else if (pool_enum::kSumPooling == pool_type) { // NOLINT_NEXT_LINE(whitespace/operators) - pool_sum_1d_gpu_kernel<<<<::GetStream(s)>>>( oshape.Size(), in_data, ishape[1], ishape[2], oshape[2], kernel[0], stride[0], pad[0], out_data); MSHADOW_CUDA_POST_KERNEL_CHECK(pool_sum_1d_gpu_kernel); } else if (pool_enum::kLpPooling == pool_type) { // NOLINT_NEXT_LINE(whitespace/operators) - pool_sum_1d_gpu_kernel<<<<::GetStream(s)>>>( oshape.Size(), in_data, ishape[1], ishape[2], oshape[2], kernel[0], stride[0], pad[0], out_data); @@ -725,7 +770,7 @@ inline void pool(mshadow::Stream* s, const DType* in_data, const TShape& is } else if (kernel.ndim() == 2) { if (pool_enum::kMaxPooling == pool_type) { // NOLINT_NEXT_LINE(whitespace/operators) - pool_max_2d_gpu_kernel<<<<::GetStream(s)>>>( oshape.Size(), in_data, ishape[1], ishape[2], ishape[3], oshape[2], oshape[3], kernel[0], kernel[1], @@ -733,7 +778,7 @@ inline void pool(mshadow::Stream* s, const DType* in_data, const TShape& is MSHADOW_CUDA_POST_KERNEL_CHECK(pool_max_2d_gpu_kernel); } else if (pool_enum::kAvgPooling == pool_type) { // NOLINT_NEXT_LINE(whitespace/operators) - pool_sum_2d_gpu_kernel<<<<::GetStream(s)>>>( oshape.Size(), in_data, ishape[1], ishape[2], ishape[3], oshape[2], oshape[3], kernel[0], kernel[1], @@ -742,7 +787,7 @@ inline void pool(mshadow::Stream* s, const DType* in_data, const TShape& is MSHADOW_CUDA_POST_KERNEL_CHECK(pool_sum_2d_gpu_kernel); } else if (pool_enum::kSumPooling == pool_type) { // NOLINT_NEXT_LINE(whitespace/operators) - pool_sum_2d_gpu_kernel<<<<::GetStream(s)>>>( oshape.Size(), in_data, ishape[1], ishape[2], ishape[3], oshape[2], oshape[3], kernel[0], kernel[1], @@ -750,7 +795,7 @@ inline void pool(mshadow::Stream* s, const DType* in_data, const TShape& is MSHADOW_CUDA_POST_KERNEL_CHECK(pool_sum_2d_gpu_kernel); } else if (pool_enum::kLpPooling == pool_type) { // NOLINT_NEXT_LINE(whitespace/operators) - pool_sum_2d_gpu_kernel<<<<::GetStream(s)>>>( oshape.Size(), in_data, ishape[1], ishape[2], ishape[3], oshape[2], oshape[3], kernel[0], kernel[1], @@ -762,7 +807,7 @@ inline void pool(mshadow::Stream* s, const DType* in_data, const TShape& is } else if (kernel.ndim() == 3) { if (pool_enum::kMaxPooling == pool_type) { // NOLINT_NEXT_LINE(whitespace/operators) - pool_max_3d_gpu_kernel<<<<::GetStream(s)>>>( oshape.Size(), in_data, ishape[1], ishape[2], ishape[3], ishape[4], oshape[2], oshape[3], oshape[4], @@ -771,7 +816,7 @@ inline void pool(mshadow::Stream* s, const DType* in_data, const TShape& is MSHADOW_CUDA_POST_KERNEL_CHECK(pool_max_3d_gpu_kernel); } else if (pool_enum::kAvgPooling == pool_type) { // NOLINT_NEXT_LINE(whitespace/operators) - pool_sum_3d_gpu_kernel<<<<::GetStream(s)>>>( oshape.Size(), in_data, ishape[1], ishape[2], 
ishape[3], ishape[4], oshape[2], oshape[3], oshape[4], kernel[0], @@ -780,7 +825,7 @@ inline void pool(mshadow::Stream* s, const DType* in_data, const TShape& is MSHADOW_CUDA_POST_KERNEL_CHECK(pool_sum_3d_gpu_kernel); } else if (pool_enum::kSumPooling == pool_type) { // NOLINT_NEXT_LINE(whitespace/operators) - pool_sum_3d_gpu_kernel<<<<::GetStream(s)>>>( oshape.Size(), in_data, ishape[1], ishape[2], ishape[3], ishape[4], oshape[2], oshape[3], oshape[4], kernel[0], @@ -789,7 +834,7 @@ inline void pool(mshadow::Stream* s, const DType* in_data, const TShape& is MSHADOW_CUDA_POST_KERNEL_CHECK(pool_sum_3d_gpu_kernel); } else if (pool_enum::kLpPooling == pool_type) { // NOLINT_NEXT_LINE(whitespace/operators) - pool_sum_3d_gpu_kernel<<<<::GetStream(s)>>>( oshape.Size(), in_data, ishape[1], ishape[2], ishape[3], ishape[4], oshape[2], oshape[3], oshape[4], kernel[0], @@ -802,6 +847,70 @@ inline void pool(mshadow::Stream* s, const DType* in_data, const TShape& is } } +/*! + * \brief This function serves as an interface for 1/2/3-D pooling operations. + * \param s context stream defining the device in use is gpu + * \param in_data pointer of the input tensor data + * \param ishape input tensor shape + * \param oshape output tensor shape + * \param kernel kernel shape + * \param pad pad shape + * \param stride stride shape + * \param pool_type supported pooling type: max, avg, sum + * \param req_type operator request type, only support kWriteTo for now + * \param out_data pointer of the output tensor data + * \param count_include_pad for avg pooling, should 0 pad values be averaged in the window + * \param layout I/O tensor layout, e.g. NCHW vs. NHWC + */ +template +inline void pool(mshadow::Stream* s, const DType* in_data, const TShape& ishape, + const TShape& oshape, const TShape& kernel, const TShape& pad, + const TShape& stride, const int pool_type, OpReqType req_type, + DType* out_data, const bool count_include_pad, const dmlc::optional &layout) { + if (kernel.ndim() == 1) { + if (layout.has_value() && layout.value() == mshadow::kNWC) { + // standardize shapes to NCW to aid templated kernel invocation + TShape ishape_ncw = ConvertLayout(ishape.get<3>(), mshadow::kNWC, mshadow::kNCW); + TShape oshape_ncw = ConvertLayout(oshape.get<3>(), mshadow::kNWC, mshadow::kNCW); + pool(s, in_data, ishape_ncw, oshape_ncw, kernel, + pad, stride, pool_type, req_type, out_data, count_include_pad); + } else if (!layout.has_value() || layout.value() == mshadow::kNCW) { + pool(s, in_data, ishape, oshape, kernel, + pad, stride, pool_type, req_type, out_data, count_include_pad); + } else { + LOG(FATAL) << "Unsupported layout, expecting kNCW or kNWC, saw: " << layout.value(); + } + } else if (kernel.ndim() == 2) { + if (layout.has_value() && layout.value() == mshadow::kNHWC) { + // standardize shapes to NCHW to aid templated kernel invocation + TShape ishape_nchw = ConvertLayout(ishape.get<4>(), mshadow::kNHWC, mshadow::kNCHW); + TShape oshape_nchw = ConvertLayout(oshape.get<4>(), mshadow::kNHWC, mshadow::kNCHW); + pool(s, in_data, ishape_nchw, oshape_nchw, kernel, + pad, stride, pool_type, req_type, out_data, count_include_pad); + } else if (!layout.has_value() || layout.value() == mshadow::kNCHW) { + pool(s, in_data, ishape, oshape, kernel, + pad, stride, pool_type, req_type, out_data, count_include_pad); + } else { + LOG(FATAL) << "Unsupported layout, expecting kNCHW or kNHWC, saw: " << layout.value(); + } + } else if (kernel.ndim() == 3) { + if (layout.has_value() && layout.value() == mshadow::kNDHWC) { + // 
standardize shapes to NCDHW to aid templated kernel invocation + TShape ishape_ncdhw = ConvertLayout(ishape.get<5>(), mshadow::kNDHWC, mshadow::kNCDHW); + TShape oshape_ncdhw = ConvertLayout(oshape.get<5>(), mshadow::kNDHWC, mshadow::kNCDHW); + pool(s, in_data, ishape_ncdhw, oshape_ncdhw, kernel, + pad, stride, pool_type, req_type, out_data, count_include_pad); + } else if (!layout.has_value() || layout.value() == mshadow::kNCDHW) { + pool(s, in_data, ishape, oshape, kernel, + pad, stride, pool_type, req_type, out_data, count_include_pad); + } else { + LOG(FATAL) << "Unsupported layout, expecting kNCDHW or kNDHWC, saw: " << layout.value(); + } + } else { + LOG(FATAL) << "Unsupported " << kernel.ndim() << "-D pooling"; + } +} + /*! * \brief This function serves as an interface for 1/2/3-D unpooling operations. * \param s context stream defining the device in use is gpu @@ -816,29 +925,23 @@ inline void pool(mshadow::Stream* s, const DType* in_data, const TShape& is * \param pool_type supported pooling type: max, avg, sum * \param req_type operator request type: kNullOp, kNullWriteInplace, kNullWriteTo, kNullAddTo * \param in_grad pointer of the gradient of the operator's input tensor - * \param p_value value of p for Lp pooling + * \param count_include_pad for avg pooling, should 0 pad values be averaged in the window */ -template +template inline void unpool(mshadow::Stream* s, const DType* out_grad, const DType* in_data, const DType* out_data, const TShape& ishape, const TShape& oshape, const TShape& kernel, const TShape& pad, const TShape& stride, const int pool_type, OpReqType req_type, DType* in_grad, - const bool count_include_pad, const dmlc::optional &layout) { + const bool count_include_pad) { if (mxnet::kNullOp == req_type) return; if (mxnet::kAddTo != req_type) { mxnet_op::Kernel::Launch(s, ishape.Size(), in_grad); } - CHECK(!layout.has_value() || - layout.value() == mshadow::kNCW || - layout.value() == mshadow::kNCHW || - layout.value() == mshadow::kNCDHW) << - "MXNet CUDA pooling expects layout NCW, NCHW or NCDHW. 
Need CuDNN for support of layout " << - layout.value(); using namespace mxnet_op; if (kernel.ndim() == 1) { if (pool_enum::kMaxPooling == pool_type) { // NOLINT_NEXT_LINE(whitespace/operators) - unpool_max_1d_gpu_kernel<<<<::GetStream(s)>>>( oshape.Size(), out_grad, in_data, out_data, ishape[1], ishape[2], oshape[2], kernel[0], stride[0], pad[0], @@ -846,7 +949,7 @@ inline void unpool(mshadow::Stream* s, const DType* out_grad, const DType* MSHADOW_CUDA_POST_KERNEL_CHECK(unpool_max_1d_gpu_kernel); } else if (pool_enum::kAvgPooling == pool_type) { // NOLINT_NEXT_LINE(whitespace/operators) - unpool_sum_1d_gpu_kernel<<<<::GetStream(s)>>>( ishape.Size(), out_grad, in_data, out_data, ishape[1], ishape[2], oshape[2], kernel[0], @@ -854,7 +957,7 @@ inline void unpool(mshadow::Stream* s, const DType* out_grad, const DType* MSHADOW_CUDA_POST_KERNEL_CHECK(unpool_sum_1d_gpu_kernel); } else if (pool_enum::kSumPooling == pool_type) { // NOLINT_NEXT_LINE(whitespace/operators) - unpool_sum_1d_gpu_kernel<<<<::GetStream(s)>>>( ishape.Size(), out_grad, in_data, out_data, ishape[1], ishape[2], oshape[2], kernel[0], @@ -862,7 +965,7 @@ inline void unpool(mshadow::Stream* s, const DType* out_grad, const DType* MSHADOW_CUDA_POST_KERNEL_CHECK(unpool_sum_1d_gpu_kernel); } else if (pool_enum::kLpPooling == pool_type) { // NOLINT_NEXT_LINE(whitespace/operators) - unpool_sum_1d_gpu_kernel<<<<::GetStream(s)>>>( ishape.Size(), out_grad, in_data, out_data, ishape[1], ishape[2], oshape[2], kernel[0], @@ -874,7 +977,7 @@ inline void unpool(mshadow::Stream* s, const DType* out_grad, const DType* } else if (kernel.ndim() == 2) { if (pool_enum::kMaxPooling == pool_type) { // NOLINT_NEXT_LINE(whitespace/operators) - unpool_max_2d_gpu_kernel<<<<::GetStream(s)>>>( oshape.Size(), out_grad, in_data, out_data, ishape[1], ishape[2], ishape[3], @@ -883,7 +986,7 @@ inline void unpool(mshadow::Stream* s, const DType* out_grad, const DType* MSHADOW_CUDA_POST_KERNEL_CHECK(unpool_max_2d_gpu_kernel); } else if (pool_enum::kAvgPooling == pool_type) { // NOLINT_NEXT_LINE(whitespace/operators) - unpool_sum_2d_gpu_kernel<<<<::GetStream(s)>>>( ishape.Size(), out_grad, in_data, out_data, ishape[1], ishape[2], ishape[3], @@ -893,7 +996,7 @@ inline void unpool(mshadow::Stream* s, const DType* out_grad, const DType* MSHADOW_CUDA_POST_KERNEL_CHECK(unpool_sum_2d_gpu_kernel); } else if (pool_enum::kSumPooling == pool_type) { // NOLINT_NEXT_LINE(whitespace/operators) - unpool_sum_2d_gpu_kernel<<<<::GetStream(s)>>>( ishape.Size(), out_grad, in_data, out_data, ishape[1], ishape[2], ishape[3], @@ -902,7 +1005,7 @@ inline void unpool(mshadow::Stream* s, const DType* out_grad, const DType* MSHADOW_CUDA_POST_KERNEL_CHECK(unpool_sum_2d_gpu_kernel); } else if (pool_enum::kLpPooling == pool_type) { // NOLINT_NEXT_LINE(whitespace/operators) - unpool_sum_2d_gpu_kernel<<<<::GetStream(s)>>>( ishape.Size(), out_grad, in_data, out_data, ishape[1], ishape[2], ishape[3], @@ -915,7 +1018,7 @@ inline void unpool(mshadow::Stream* s, const DType* out_grad, const DType* } else if (kernel.ndim() == 3) { if (pool_enum::kMaxPooling == pool_type) { // NOLINT_NEXT_LINE(whitespace/operators) - unpool_max_3d_gpu_kernel<<<<::GetStream(s)>>>( oshape.Size(), out_grad, in_data, out_data, ishape[1], ishape[2], ishape[3], ishape[4], @@ -925,7 +1028,7 @@ inline void unpool(mshadow::Stream* s, const DType* out_grad, const DType* MSHADOW_CUDA_POST_KERNEL_CHECK(unpool_max_3d_gpu_kernel); } else if (pool_enum::kAvgPooling == pool_type) { // NOLINT_NEXT_LINE(whitespace/operators) - 
unpool_sum_3d_gpu_kernel<<<<::GetStream(s)>>>( ishape.Size(), out_grad, in_data, out_data, ishape[1], ishape[2], ishape[3], ishape[4], @@ -935,7 +1038,7 @@ inline void unpool(mshadow::Stream* s, const DType* out_grad, const DType* MSHADOW_CUDA_POST_KERNEL_CHECK(unpool_sum_3d_gpu_kernel); } else if (pool_enum::kSumPooling == pool_type) { // NOLINT_NEXT_LINE(whitespace/operators) - unpool_sum_3d_gpu_kernel<<<<::GetStream(s)>>>( ishape.Size(), out_grad, in_data, out_data, ishape[1], ishape[2], ishape[3], ishape[4], @@ -945,7 +1048,7 @@ inline void unpool(mshadow::Stream* s, const DType* out_grad, const DType* MSHADOW_CUDA_POST_KERNEL_CHECK(unpool_sum_3d_gpu_kernel); } else if (pool_enum::kLpPooling == pool_type) { // NOLINT_NEXT_LINE(whitespace/operators) - unpool_sum_3d_gpu_kernel<<<<::GetStream(s)>>>( ishape.Size(), out_grad, in_data, out_data, ishape[1], ishape[2], ishape[3], ishape[4], @@ -961,6 +1064,73 @@ inline void unpool(mshadow::Stream* s, const DType* out_grad, const DType* } } +/*! + * \brief This function serves as an interface for 1/2/3-D unpooling operations. + * \param s context stream defining the device in use is gpu + * \param out_grad pointer of the gradient of operator's output tensor + * \param in_data pointer of the input tensor in the format of NCW, NCHW, or NCDHW + * \param out_data pointer of the output tensor in the format of NCW, NCHW, or NCDHW + * \param ishape input tensor shape + * \param oshape output tensor shape + * \param kernel kernel shape + * \param pad pad shape + * \param stride stride shape + * \param pool_type supported pooling type: max, avg, sum + * \param req_type operator request type: kNullOp, kNullWriteInplace, kNullWriteTo, kNullAddTo + * \param in_grad pointer of the gradient of the operator's input tensor + * \param count_include_pad for avg pooling, should 0 pad values be averaged in the window + * \param layout I/O tensor layout, e.g. NCHW vs. 
NHWC + */ +template +inline void unpool(mshadow::Stream* s, const DType* out_grad, const DType* in_data, + const DType* out_data, const TShape& ishape, const TShape& oshape, + const TShape& kernel, const TShape& pad, const TShape& stride, + const int pool_type, OpReqType req_type, DType* in_grad, + const bool count_include_pad, const dmlc::optional &layout) { + if (kernel.ndim() == 1) { + if (layout.has_value() && layout.value() == mshadow::kNWC) { + // standardize shapes to NCW to aid templated kernel invocation + TShape ishape_ncw = ConvertLayout(ishape.get<3>(), mshadow::kNWC, mshadow::kNCW); + TShape oshape_ncw = ConvertLayout(oshape.get<3>(), mshadow::kNWC, mshadow::kNCW); + unpool(s, out_grad, in_data, out_data, ishape_ncw, oshape_ncw, + kernel, pad, stride, pool_type, req_type, in_grad, count_include_pad); + } else if (!layout.has_value() || layout.value() == mshadow::kNCW) { + unpool(s, out_grad, in_data, out_data, ishape, oshape, kernel, + pad, stride, pool_type, req_type, in_grad, count_include_pad); + } else { + LOG(FATAL) << "Unsupported layout, expecting kNCW or kNWC, saw: " << layout.value(); + } + } else if (kernel.ndim() == 2) { + if (layout.has_value() && layout.value() == mshadow::kNHWC) { + // standardize shapes to NCHW to aid templated kernel invocation + TShape ishape_nchw = ConvertLayout(ishape.get<4>(), mshadow::kNHWC, mshadow::kNCHW); + TShape oshape_nchw = ConvertLayout(oshape.get<4>(), mshadow::kNHWC, mshadow::kNCHW); + unpool(s, out_grad, in_data, out_data, ishape_nchw, oshape_nchw, + kernel, pad, stride, pool_type, req_type, in_grad, count_include_pad); + } else if (!layout.has_value() || layout.value() == mshadow::kNCHW) { + unpool(s, out_grad, in_data, out_data, ishape, oshape, kernel, + pad, stride, pool_type, req_type, in_grad, count_include_pad); + } else { + LOG(FATAL) << "Unsupported layout, expecting kNCHW or kNHWC, saw: " << layout.value(); + } + } else if (kernel.ndim() == 3) { + if (layout.has_value() && layout.value() == mshadow::kNDHWC) { + // standardize shapes to NCDHW to aid templated kernel invocation + TShape ishape_ncdhw = ConvertLayout(ishape.get<5>(), mshadow::kNDHWC, mshadow::kNCDHW); + TShape oshape_ncdhw = ConvertLayout(oshape.get<5>(), mshadow::kNDHWC, mshadow::kNCDHW); + unpool(s, out_grad, in_data, out_data, ishape_ncdhw, oshape_ncdhw, + kernel, pad, stride, pool_type, req_type, in_grad, count_include_pad); + } else if (!layout.has_value() || layout.value() == mshadow::kNCDHW) { + unpool(s, out_grad, in_data, out_data, ishape, oshape, kernel, + pad, stride, pool_type, req_type, in_grad, count_include_pad); + } else { + LOG(FATAL) << "Unsupported layout, expecting kNCDHW or kNDHWC, saw: " << layout.value(); + } + } else { + LOG(FATAL) << "Unsupported " << kernel.ndim() << "-D unpooling"; + } +} + } // namespace op } // namespace mxnet diff --git a/src/operator/nn/pool_utils.h b/src/operator/nn/pool_utils.h index 641cc4a995ab..7b2657451ba7 100644 --- a/src/operator/nn/pool_utils.h +++ b/src/operator/nn/pool_utils.h @@ -98,14 +98,16 @@ struct lp_grad { template struct lp_grad { static MSHADOW_XINLINE DType Map(const DType grad, const DType in_data, const DType out_data) { - return grad * in_data / out_data; + // Avoid nan result if both grad and out_data are 0. + return (grad == DType(0.0)) ? 
DType(0.0) : grad * in_data / out_data; } }; template struct lp_grad { static MSHADOW_XINLINE DType Map(const DType grad, const DType in_data, const DType out_data) { - return grad * in_data * in_data / (out_data * out_data); + // Avoid nan result if both grad and out_data are 0. + return (grad == DType(0.0)) ? DType(0.0) : grad * in_data * in_data / (out_data * out_data); } }; diff --git a/src/operator/nn/pooling-inl.h b/src/operator/nn/pooling-inl.h index d48f1e38975c..5207f812282d 100644 --- a/src/operator/nn/pooling-inl.h +++ b/src/operator/nn/pooling-inl.h @@ -169,8 +169,17 @@ class PoolingOp { TShape padding = param_.pad; TShape stride = param_.stride; if (param_.global_pool) { - kernel = TShape(ishape.data() + 2, - ishape.data() + ishape.ndim()); + // with global pooling, kernel shape corresponds to input shape with 'N' and 'C' removed + if (param_.layout.value() == mshadow::kNWC || + param_.layout.value() == mshadow::kNHWC || + param_.layout.value() == mshadow::kNDHWC) { + kernel = TShape(ishape.data() + 1, + ishape.data() + ishape.ndim() - 1); + + } else { + kernel = TShape(ishape.data() + 2, + ishape.data() + ishape.ndim()); + } padding = TShape(ishape.ndim() - 2); for (index_t i = 0; i < ishape.ndim() - 2; i++) { padding[i] = 0; @@ -218,8 +227,17 @@ class PoolingOp { TShape padding = param_.pad; TShape stride = param_.stride; if (param_.global_pool) { - kernel = TShape(ishape.data() + 2, - ishape.data() + ishape.ndim()); + // with global pooling, kernel shape corresponds to input shape with 'N' and 'C' removed + if (param_.layout.value() == mshadow::kNWC || + param_.layout.value() == mshadow::kNHWC || + param_.layout.value() == mshadow::kNDHWC) { + kernel = TShape(ishape.data() + 1, + ishape.data() + ishape.ndim() - 1); + + } else { + kernel = TShape(ishape.data() + 2, + ishape.data() + ishape.ndim()); + } padding = TShape(ishape.ndim() - 2); for (index_t i = 0; i < ishape.ndim() - 2; i++) { padding[i] = 0; diff --git a/tests/python/gpu/test_operator_gpu.py b/tests/python/gpu/test_operator_gpu.py index ba81ead671ce..2eadeae5e063 100644 --- a/tests/python/gpu/test_operator_gpu.py +++ b/tests/python/gpu/test_operator_gpu.py @@ -609,19 +609,36 @@ def test_convolution_versions(): @with_seed() def test_pooling_with_convention(): + # transpose shape to bring feature dimension 'c' from 2nd position to last + def transpose(shape): + return (shape[0],) + shape[2:] + (shape[1],) # While the float32 and float64 output is reliably consistent, float16 departs occasionally. # We compare cpu and gpu results only within a given precision. 
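    # The rewritten loop below runs each case for both layouts; for 'NHWC' the input
    # shape is permuted with the transpose() helper above, e.g. transpose((2, 2, 10, 10))
    # gives (2, 10, 10, 2).  Each symbol is paired with a cudnn_off=True twin, so the
    # cuDNN path, the non-cuDNN GPU path and the CPU path are all checked against each
    # other by check_consistency within the given precision.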
- for data_type in [np.float64, np.float32, np.float16]: - ctx_list = [{'ctx': mx.gpu(0), 'pool_data': (2, 2, 10, 10), 'type_dict': {'pool_data': data_type}}, - {'ctx': mx.cpu(0), 'pool_data': (2, 2, 10, 10), 'type_dict': {'pool_data': data_type}}] - sym = mx.sym.Pooling(kernel=(3,3), pool_type='max', pooling_convention='valid', name='pool') - check_consistency(sym, ctx_list) - - sym = mx.sym.Pooling(kernel=(3,3), pool_type='max', pooling_convention='full', name='pool') - check_consistency(sym, ctx_list) - - sym = mx.sym.Pooling(kernel=(300,300), pool_type='max', global_pool=True, name='pool') - check_consistency(sym, ctx_list) + for layout in ['NCHW', 'NHWC']: + for data_type in [np.float64, np.float32, np.float16]: + logical_inshape = (2, 2, 10, 10) + inshape = logical_inshape if layout == 'NCHW' else transpose(logical_inshape) + gpu_ctx = {'ctx': mx.gpu(0), 'pool_data': inshape, + 'type_dict': {'pool_data': data_type}} + cpu_ctx = {'ctx': mx.cpu(0), 'pool_data': inshape, + 'type_dict': {'pool_data': data_type}} + sym = mx.sym.Pooling(kernel=(3,3), pool_type='max', pooling_convention='valid', + name='pool', layout = layout, cudnn_off = False) + sym_no_cudnn = mx.sym.Pooling(kernel=(3,3), pool_type='max', pooling_convention='valid', + name='pool', layout = layout, cudnn_off = True) + check_consistency([sym, sym_no_cudnn, sym], [gpu_ctx, gpu_ctx, cpu_ctx]) + + sym = mx.sym.Pooling(kernel=(3,3), pool_type='max', pooling_convention='full', + name='pool', layout = layout, cudnn_off = False) + sym_no_cudnn = mx.sym.Pooling(kernel=(3,3), pool_type='max', pooling_convention='full', + name='pool', layout = layout, cudnn_off = True) + check_consistency([sym, sym_no_cudnn, sym], [gpu_ctx, gpu_ctx, cpu_ctx]) + + sym = mx.sym.Pooling(kernel=(300,300), pool_type='max', global_pool=True, + name='pool', layout = layout, cudnn_off = False) + sym_no_cudnn = mx.sym.Pooling(kernel=(300,300), pool_type='max', global_pool=True, + name='pool', layout = layout, cudnn_off = True) + check_consistency([sym, sym_no_cudnn, sym], [gpu_ctx, gpu_ctx, cpu_ctx]) @with_seed() @@ -642,16 +659,19 @@ def make_pooling_syms(**kwargs): for in_shape in [(3, 4, 8, 8), (2, 2, 10, 10)]: for data_type in [np.float64, np.float32, np.float16]: # NHWC pooling is only enabled on GPU with CUDNN - ctx_list = [{'ctx': mx.gpu(0), 'pool_data': in_shape, 'type_dict': {'pool_data': data_type}}] - symlist = make_pooling_syms(kernel=(3,3), pool_type='max', pooling_convention='valid', name='pool') + ctx_list = [{'ctx': mx.gpu(0), 'pool_data': in_shape, + 'type_dict': {'pool_data': data_type}}] + symlist = make_pooling_syms(kernel=(3,3), pool_type='max', + pooling_convention='valid', name='pool') check_consistency_NxM(symlist, ctx_list) - symlist = make_pooling_syms(kernel=(3,3), pool_type='max', pooling_convention='full', name='pool') + symlist = make_pooling_syms(kernel=(3,3), pool_type='max', + pooling_convention='full', name='pool') + check_consistency_NxM(symlist, ctx_list) + + symlist = make_pooling_syms(kernel=(300,300), pool_type='max', + global_pool=True, name='pool') check_consistency_NxM(symlist, ctx_list) - # CUDNN v7.1.4 can't handle all cases, and there's no NHWC MXNet fallback impl yet - if in_shape[2] <= 8 and in_shape[3] <= 8: - symlist = make_pooling_syms(kernel=(300,300), pool_type='max', global_pool=True, name='pool') - check_consistency_NxM(symlist, ctx_list) def test_pooling_with_type(): @@ -861,211 +881,278 @@ def make_pooling_syms(**kwargs): symlist = make_pooling_syms(name='pool', kernel=(5,5), pad=(2,2), pool_type='max') 
check_consistency_NxM(symlist, ctx_list) -@unittest.skip("Flaky test https://github.com/apache/incubator-mxnet/issues/11517") + @with_seed() def test_pooling_versions(): - def test_pooling_versions_helper(pool_op_list, data, kernel, pool_type, pad, stride, pooling_convention='valid', - global_pool=False, p_value=2, count_include_pad=True, tol=None): + + def transposed_layout(ndim): + if ndim < 3 or ndim > 5: + raise RuntimeError("Invalid data dim, expecting 3, 4 or 5") + return ('NWC', 'NHWC', 'NDHWC')[ndim-3] + + def test_pooling_versions_helper(pool_op_list, data, kernel, pool_type, pad, stride, + pooling_convention='valid', global_pool=False, p_value=2, + count_include_pad=True, tol=None, dtype=np.float32): ctx_list = [] sym_list = [] - # PoolingV1 cpu - if 'pool_v1_cpu' in pool_op_list: - ctx_list.append({'ctx': mx.cpu(0), 'pool_data': data, 'type_dict': {'pool_data': np.float32}}) - if not global_pool: - sym_list.append(mx.sym.Pooling_v1(kernel=kernel, pad=pad, stride=stride, pool_type=pool_type, - pooling_convention=pooling_convention, name='pool')) - else: - sym_list.append(mx.sym.Pooling_v1(kernel=kernel, pool_type=pool_type, global_pool=True, name='pool')) - # PoolingV1 gpu - if 'pool_v1_gpu' in pool_op_list: - ctx_list.append({'ctx': mx.gpu(0), 'pool_data': data, 'type_dict': {'pool_data': np.float32}}) - if not global_pool: - sym_list.append(mx.sym.Pooling_v1(kernel=kernel, pad=pad, stride=stride, pool_type=pool_type, - pooling_convention=pooling_convention, name='pool')) - else: - sym_list.append(mx.sym.Pooling_v1(kernel=kernel, pool_type=pool_type, global_pool=True, name='pool')) - # Pooling cpu - if 'pool_cpu' in pool_op_list: - ctx_list.append({'ctx': mx.cpu(0), 'pool_data': data, 'type_dict': {'pool_data': np.float32}}) - if not global_pool: - sym_list.append(mx.sym.Pooling(kernel=kernel, pad=pad, stride=stride, pool_type=pool_type, - pooling_convention=pooling_convention, name='pool', - p_value=p_value, count_include_pad=count_include_pad)) - else: - sym_list.append(mx.sym.Pooling(kernel=kernel, pool_type=pool_type, global_pool=True, name='pool', - p_value=p_value, count_include_pad=count_include_pad)) - # Pooling gpu - if 'pool_gpu' in pool_op_list: - ctx_list.append({'ctx': mx.gpu(0), 'pool_data': data, 'type_dict': {'pool_data': np.float32}}) - if not global_pool: - sym_list.append(mx.sym.Pooling(kernel=kernel, pad=pad, stride=stride, pool_type=pool_type, - pooling_convention=pooling_convention, cudnn_off=True, name='pool', - p_value=p_value, count_include_pad=count_include_pad)) + for pool_ctx in pool_op_list: + (pool_op, ctx_type) = pool_ctx.rsplit('_', 1) + expected_ctxs = ['cpu', 'gpu', 'cudnn'] + if ctx_type not in expected_ctxs: + raise RuntimeError('Expected one of {}, saw {}.'.format(expected_ctxs, ctx_type)) + # print('Testing pool_op = {}, ctx_type = {}'.format(pool_op, ctx_type)) + ctx = mx.cpu(0) if ctx_type == 'cpu' else mx.gpu(0) + ctx_list.append({'ctx': ctx, 'pool_data': data, 'type_dict': {'pool_data': dtype}}) + # start with pool args present in all cases + pool_op_args = {'kernel': kernel, 'pool_type': pool_type, + 'pooling_convention' : pooling_convention, 'name' : 'pool'} + # add other args as needed + if global_pool: + pool_op_args['global_pool'] = True else: - sym_list.append(mx.sym.Pooling(kernel=kernel, pool_type=pool_type, global_pool=True, cudnn_off=True, - name='pool', p_value=p_value, count_include_pad=count_include_pad)) - # CuDNNPooling - if 'pool_cudnn' in pool_op_list: - ctx_list.append({'ctx': mx.gpu(0), 'pool_data': data, 'type_dict': 
{'pool_data': np.float32}}) - if not global_pool: - sym_list.append(mx.sym.Pooling(kernel=kernel, pad=pad, stride=stride, pool_type=pool_type, - pooling_convention=pooling_convention, p_value=p_value, cudnn_off=False, - name='pool', count_include_pad=count_include_pad)) + pool_op_args.update({'pad' : pad, 'stride' : stride}) + expected_pool_ops = ['pool', 'pool_transposed', 'pool_v1'] + if pool_op == 'pool_v1': + sym = mx.sym.Pooling_v1(**pool_op_args) else: - sym_list.append(mx.sym.Pooling(kernel=kernel, pool_type=pool_type, global_pool=True, p_value=p_value, - cudnn_off=False, name='pool', count_include_pad=count_include_pad)) + pool_op_args.update({'p_value' : p_value, 'count_include_pad' : count_include_pad}) + if ctx_type != 'cpu': + pool_op_args['cudnn_off'] = ctx_type == 'gpu' + if pool_op == 'pool': + sym = mx.sym.Pooling(**pool_op_args) + elif pool_op == 'pool_transposed': + ndim = len(data) + # NCW->NWC axes=(0,2,1) NCHW->NHWC axes=(0,2,3,1) NCDHW->NDHWC axes=(0,2,3,4,1); + axes = (0,) + tuple(range(2,ndim)) + (1,) + transposed = mx.sym.transpose(axes=axes, name='pool') + pooled = mx.sym.Pooling(data=transposed, layout=transposed_layout(ndim), + **pool_op_args) + # NWC->NCW axes=(0,2,1) NHWC->NCHW axes=(0,3,1,2) NDHWC->NCDHW axes=(0,4,1,2,3); + axes = (0, ndim-1) + tuple(range(1,ndim-1)) + sym = mx.sym.transpose(data=pooled, axes=axes, name='pool') + else: + raise RuntimeError('Expected one of {}, saw {}.'.format(expected_pool_ops, + pool_op)) + sym_list.append(sym) + check_consistency(sym_list, ctx_list, equal_nan=(not count_include_pad), tol=tol) - def test_1d_pooling(pool_type, p_value=2, count_include_pad=True): - data = (2, 3, 20) + def test_1d_pooling(pool_type, dtype, p_value=2, count_include_pad=True): + data = (2, 3, 10) kernel = (4,) pad = (0,) stride = (1,) - test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_gpu'], - data=data, kernel=kernel, pad=pad, stride=stride, pool_type=pool_type, - pooling_convention='valid', global_pool=False, p_value=p_value, - count_include_pad=count_include_pad) + test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_transposed_cpu', + 'pool_gpu', 'pool_transposed_gpu', + 'pool_cudnn', 'pool_transposed_cudnn'], + data=data, kernel=kernel, pad=pad, stride=stride, + pool_type=pool_type, pooling_convention='valid', + global_pool=False, p_value=p_value, + count_include_pad=count_include_pad, dtype=dtype) pad = (2,) stride = (2,) - test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_gpu'], - data=data, kernel=kernel, pad=pad, stride=stride, pool_type=pool_type, - pooling_convention='valid', global_pool=False, p_value=p_value, - count_include_pad=count_include_pad) + test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_transposed_cpu', + 'pool_gpu', 'pool_transposed_gpu', + 'pool_cudnn', 'pool_transposed_cudnn'], + data=data, kernel=kernel, pad=pad, stride=stride, + pool_type=pool_type, pooling_convention='valid', + global_pool=False, p_value=p_value, + count_include_pad=count_include_pad, dtype=dtype) pad = (0,) stride = (1,) - test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_gpu'], - data=data, kernel=kernel, pad=pad, stride=stride, pool_type=pool_type, - pooling_convention='full', global_pool=False, p_value=p_value, - count_include_pad=count_include_pad) + test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_transposed_cpu', + 'pool_gpu', 'pool_transposed_gpu', + 'pool_cudnn', 'pool_transposed_cudnn'], + data=data, kernel=kernel, pad=pad, stride=stride, + pool_type=pool_type, 
pooling_convention='full', + global_pool=False, p_value=p_value, + count_include_pad=count_include_pad, dtype=dtype) pad = (2,) stride = (2,) - test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_gpu'], - data=data, kernel=kernel, pad=pad, stride=stride, pool_type=pool_type, - pooling_convention='full', global_pool=False, p_value=p_value, - count_include_pad=count_include_pad) - - test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_gpu'], - data=data, kernel=kernel, pad=pad, stride=stride, pool_type=pool_type, - global_pool=True, p_value=p_value, count_include_pad=count_include_pad) - - def test_2d_pooling(pool_type, p_value=2, count_include_pad=True): + test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_transposed_cpu', + 'pool_gpu', 'pool_transposed_gpu', + 'pool_cudnn', 'pool_transposed_cudnn'], + data=data, kernel=kernel, pad=pad, stride=stride, + pool_type=pool_type, pooling_convention='full', + global_pool=False, p_value=p_value, + count_include_pad=count_include_pad, dtype=dtype) + + test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_transposed_cpu', + 'pool_gpu', 'pool_transposed_gpu', + 'pool_cudnn', 'pool_transposed_cudnn'], + data=data, kernel=kernel, pad=pad, stride=stride, + pool_type=pool_type, global_pool=True, p_value=p_value, + count_include_pad=count_include_pad, dtype=dtype) + + def test_2d_pooling(pool_type, dtype, p_value=2, count_include_pad=True): data = (2, 3, 20, 20) kernel = (4, 5) pad = (0, 0) stride = (1, 1) if pool_type == 'lp': - test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_gpu'], - data=data, kernel=kernel, pad=pad, stride=stride, pool_type=pool_type, - pooling_convention='valid', global_pool=False, p_value=p_value) + test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_transposed_cpu', + 'pool_gpu', 'pool_transposed_gpu', + 'pool_cudnn', 'pool_transposed_cudnn'], + data=data, kernel=kernel, pad=pad, stride=stride, + pool_type=pool_type, pooling_convention='valid', + global_pool=False, p_value=p_value, dtype=dtype) else: - test_pooling_versions_helper(pool_op_list=['pool_v1_cpu', 'pool_v1_gpu', 'pool_cpu', 'pool_gpu', 'pool_cudnn'], - data=data, kernel=kernel, pad=pad, stride=stride, pool_type=pool_type, - pooling_convention='valid', global_pool=False, count_include_pad=count_include_pad) + test_pooling_versions_helper(pool_op_list=['pool_v1_cpu', + 'pool_v1_gpu', + 'pool_cpu', 'pool_transposed_cpu', + 'pool_gpu', 'pool_transposed_gpu', + 'pool_cudnn', 'pool_transposed_cudnn'], + data=data, kernel=kernel, pad=pad, stride=stride, + pool_type=pool_type, pooling_convention='valid', + global_pool=False, count_include_pad=count_include_pad, + dtype=dtype) # pool_v1 has bugs when pad is not 0, do not test PoolingV1 here - pad = (2, 3) - stride = (2, 3) - test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_gpu', 'pool_cudnn'], - data=data, kernel=kernel, pad=pad, stride=stride, pool_type=pool_type, - pooling_convention='valid', global_pool=False, p_value=p_value, - count_include_pad=count_include_pad) + pad = (0, 0) + stride = (2, 1) + test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_transposed_cpu', + 'pool_gpu', 'pool_transposed_gpu', + 'pool_cudnn', 'pool_transposed_cudnn'], + data=data, kernel=kernel, pad=pad, stride=stride, + pool_type=pool_type, pooling_convention='valid', + global_pool=False, p_value=p_value, + count_include_pad=count_include_pad, dtype=dtype) pad = (0, 0) stride = (1, 1) if pool_type == 'lp': - test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_gpu', 
'pool_cudnn'], - data=data, kernel=kernel, pad=pad, stride=stride, pool_type=pool_type, - pooling_convention='full', global_pool=False, p_value=p_value) + test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_transposed_cpu', + 'pool_gpu', 'pool_transposed_gpu', + 'pool_cudnn', 'pool_transposed_cudnn'], + data=data, kernel=kernel, pad=pad, stride=stride, + pool_type=pool_type, pooling_convention='full', + global_pool=False, p_value=p_value, dtype=dtype) else: if count_include_pad: - test_pooling_versions_helper(pool_op_list=['pool_v1_cpu', 'pool_v1_gpu', 'pool_cpu', 'pool_gpu', 'pool_cudnn'], - data=data, kernel=kernel, pad=pad, stride=stride, pool_type=pool_type, - pooling_convention='full', global_pool=False, - count_include_pad=count_include_pad) + test_pooling_versions_helper(pool_op_list=['pool_v1_cpu', + 'pool_v1_gpu', + 'pool_cpu', 'pool_transposed_cpu', + 'pool_gpu', 'pool_transposed_gpu', + 'pool_cudnn', 'pool_transposed_cudnn'], + data=data, kernel=kernel, pad=pad, stride=stride, + pool_type=pool_type, pooling_convention='full', + global_pool=False, count_include_pad=count_include_pad, + dtype=dtype) else: - test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_gpu', 'pool_cudnn'], - data=data, kernel=kernel, pad=pad, stride=stride, pool_type=pool_type, - pooling_convention='full', global_pool=False, - count_include_pad=count_include_pad) + test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_transposed_cpu', + 'pool_gpu', 'pool_transposed_gpu', + 'pool_cudnn', 'pool_transposed_cudnn'], + data=data, kernel=kernel, pad=pad, stride=stride, + pool_type=pool_type, pooling_convention='full', + global_pool=False, count_include_pad=count_include_pad, + dtype=dtype) # pool_v1 has bugs when pad is not 0, do not test PoolingV1 here pad = (2, 3) stride = (2, 3) - test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_gpu', 'pool_cudnn'], - data=data, kernel=kernel, pad=pad, stride=stride, pool_type=pool_type, - pooling_convention='full', global_pool=False, p_value=p_value, - count_include_pad=count_include_pad) + test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_transposed_cpu', + 'pool_gpu', 'pool_transposed_gpu', + 'pool_cudnn', 'pool_transposed_cudnn'], + data=data, kernel=kernel, pad=pad, stride=stride, + pool_type=pool_type, pooling_convention='full', + global_pool=False, p_value=p_value, + count_include_pad=count_include_pad, dtype=dtype) if pool_type == 'lp': - test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_gpu', 'pool_cudnn'], - data=data, kernel=kernel, pad=pad, stride=stride, pool_type=pool_type, - global_pool=True, p_value=p_value) + test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_transposed_cpu', + 'pool_gpu', 'pool_transposed_gpu', + 'pool_cudnn', 'pool_transposed_cudnn'], + data=data, kernel=kernel, pad=pad, stride=stride, + pool_type=pool_type, global_pool=True, + p_value=p_value, dtype=dtype) else: - test_pooling_versions_helper(pool_op_list=['pool_v1_cpu', 'pool_v1_gpu', 'pool_cpu', 'pool_gpu', 'pool_cudnn'], - data=data, kernel=kernel, pad=pad, stride=stride, pool_type=pool_type, - global_pool=True, count_include_pad=count_include_pad) - - def test_3d_pooling(pool_type, p_value=2, count_include_pad=True): + test_pooling_versions_helper(pool_op_list=['pool_v1_cpu', + 'pool_v1_gpu', + 'pool_cpu', 'pool_transposed_cpu', + 'pool_gpu', 'pool_transposed_gpu', + 'pool_cudnn', 'pool_transposed_cudnn'], + data=data, kernel=kernel, pad=pad, stride=stride, + pool_type=pool_type, global_pool=True, + 
count_include_pad=count_include_pad, dtype=dtype) + + def test_3d_pooling(pool_type, dtype, p_value=2, count_include_pad=True): data = (2, 3, 20, 20, 20) kernel = (4, 5, 3) pad = (0, 0, 0) stride = (1, 1, 1) - test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_gpu', 'pool_cudnn'], - data=data, kernel=kernel, pad=pad, stride=stride, pool_type=pool_type, - pooling_convention='valid', global_pool=False, p_value=p_value, - count_include_pad=count_include_pad) + test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_transposed_cpu', + 'pool_gpu', 'pool_transposed_gpu', + 'pool_cudnn', 'pool_transposed_cudnn'], + data=data, kernel=kernel, pad=pad, stride=stride, + pool_type=pool_type, pooling_convention='valid', + global_pool=False, p_value=p_value, + count_include_pad=count_include_pad, dtype=dtype) pad = (2, 3, 3) stride = (2, 3, 1) - test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_gpu', 'pool_cudnn'], - data=data, kernel=kernel, pad=pad, stride=stride, pool_type=pool_type, - pooling_convention='valid', global_pool=False, p_value=p_value, - count_include_pad=count_include_pad) + test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_transposed_cpu', + 'pool_gpu', 'pool_transposed_gpu', + 'pool_cudnn', 'pool_transposed_cudnn'], + data=data, kernel=kernel, pad=pad, stride=stride, + pool_type=pool_type, pooling_convention='valid', + global_pool=False, p_value=p_value, + count_include_pad=count_include_pad, dtype=dtype) pad = (0, 0, 0) stride = (1, 1, 1) - test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_gpu', 'pool_cudnn'], - data=data, kernel=kernel, pad=pad, stride=stride, pool_type=pool_type, - pooling_convention='full', global_pool=False, p_value=p_value, - count_include_pad=count_include_pad) + test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_transposed_cpu', + 'pool_gpu', 'pool_transposed_gpu', + 'pool_cudnn', 'pool_transposed_cudnn'], + data=data, kernel=kernel, pad=pad, stride=stride, + pool_type=pool_type, pooling_convention='full', + global_pool=False, p_value=p_value, + count_include_pad=count_include_pad, dtype=dtype) pad = (2, 3, 3) stride = (2, 3, 1) - test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_gpu', 'pool_cudnn'], - data=data, kernel=kernel, pad=pad, stride=stride, pool_type=pool_type, - pooling_convention='full', global_pool=False, p_value=p_value, - count_include_pad=count_include_pad) - - test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_gpu', 'pool_cudnn'], - data=data, kernel=kernel, pad=pad, stride=stride, pool_type=pool_type, - global_pool=True, p_value=p_value, count_include_pad=count_include_pad) - - test_1d_pooling('max') - test_1d_pooling('avg', count_include_pad=True) - test_1d_pooling('avg', count_include_pad=False) - test_1d_pooling('sum') - test_1d_pooling('lp', p_value=1) - test_1d_pooling('lp', p_value=2) - test_1d_pooling('lp', p_value=3) - - test_2d_pooling('max') - test_2d_pooling('avg', count_include_pad=True) - test_2d_pooling('avg', count_include_pad=False) - test_2d_pooling('sum') - test_2d_pooling('lp', p_value=1) - test_2d_pooling('lp', p_value=2) - test_2d_pooling('lp', p_value=3) - - test_3d_pooling('max') - test_3d_pooling('avg', count_include_pad=True) - test_3d_pooling('avg', count_include_pad=False) - test_3d_pooling('sum') - test_3d_pooling('lp', p_value=1) - test_3d_pooling('lp', p_value=2) - test_3d_pooling('lp', p_value=3) + test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_transposed_cpu', + 'pool_gpu', 'pool_transposed_gpu', + 'pool_cudnn', 
'pool_transposed_cudnn'], + data=data, kernel=kernel, pad=pad, stride=stride, + pool_type=pool_type, pooling_convention='full', + global_pool=False, p_value=p_value, + count_include_pad=count_include_pad, dtype=dtype) + + test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_transposed_cpu', + 'pool_gpu', 'pool_transposed_gpu', + 'pool_cudnn', 'pool_transposed_cudnn'], + data=data, kernel=kernel, pad=pad, stride=stride, + pool_type=pool_type, global_pool=True, p_value=p_value, + count_include_pad=count_include_pad, dtype=dtype) + + # leave off np.float16 until pool_v1 can be made to conform to pool for this dtype + for dtype in [np.float32, np.float64]: + test_1d_pooling('max', dtype=dtype) + test_1d_pooling('avg', dtype=dtype, count_include_pad=True) + test_1d_pooling('avg', dtype=dtype, count_include_pad=False) + test_1d_pooling('sum', dtype=dtype) + test_1d_pooling('lp', dtype=dtype, p_value=1) + test_1d_pooling('lp', dtype=dtype, p_value=2) + test_1d_pooling('lp', dtype=dtype, p_value=3) + + test_2d_pooling('max', dtype=dtype) + test_2d_pooling('avg', dtype=dtype, count_include_pad=True) + test_2d_pooling('avg', dtype=dtype, count_include_pad=False) + test_2d_pooling('sum', dtype=dtype) + test_2d_pooling('lp', dtype=dtype, p_value=1) + test_2d_pooling('lp', dtype=dtype, p_value=2) + test_2d_pooling('lp', dtype=dtype, p_value=3) + + test_3d_pooling('max', dtype=dtype) + test_3d_pooling('avg', dtype=dtype, count_include_pad=True) + test_3d_pooling('avg', dtype=dtype, count_include_pad=False) + test_3d_pooling('sum', dtype=dtype) + test_3d_pooling('lp', dtype=dtype, p_value=1) + test_3d_pooling('lp', dtype=dtype, p_value=2) + test_3d_pooling('lp', dtype=dtype, p_value=3) @with_seed() From 78d08a127b43ac61494f8fff767a1194be1b4c4d Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Fri, 21 Dec 2018 14:13:04 -0800 Subject: [PATCH 12/32] Add PoolingParam::GetLayout() for better default layout handling. --- src/operator/nn/cudnn/cudnn_pooling-inl.h | 27 +++++++------ src/operator/nn/pool.cuh | 40 ++++++++++---------- src/operator/nn/pool.h | 40 ++++++++++---------- src/operator/nn/pooling-inl.h | 46 ++++++++++++++++------- src/operator/nn/pooling.cc | 30 +++++++-------- 5 files changed, 100 insertions(+), 83 deletions(-) diff --git a/src/operator/nn/cudnn/cudnn_pooling-inl.h b/src/operator/nn/cudnn/cudnn_pooling-inl.h index 1892a51968cc..1df1d920ffd7 100644 --- a/src/operator/nn/cudnn/cudnn_pooling-inl.h +++ b/src/operator/nn/cudnn/cudnn_pooling-inl.h @@ -179,7 +179,7 @@ class CuDNNPoolingOp { static bool sum_pooling_warning_issued = false; static bool lp_pooling_warning_issued = false; static bool unsupported_dim_warning_issued = false; - int layout = param.layout.value(); + int layout = param.GetLayout(input.ndim()); switch (param.pool_type) { case pool_enum::kMaxPooling: @@ -251,20 +251,19 @@ class CuDNNPoolingOp { #if CUDNN_MAJOR >= 5 nan_prop_ = CUDNN_NOT_PROPAGATE_NAN; #endif + int layout = param_.GetLayout(in_data.ndim()); if (param_.kernel.ndim() == 2) { // 2d pooling - CHECK(param_.layout.value() == mshadow::kNCHW || - param_.layout.value() == mshadow::kNHWC) << "Need 2D layout NCHW or NHWC."; - cudnnTensorFormat_t cudnn_layout = - (param_.layout.value() == mshadow::kNCHW) ? CUDNN_TENSOR_NCHW - : CUDNN_TENSOR_NHWC; + CHECK(layout == mshadow::kNCHW || layout == mshadow::kNHWC) << "Need 2D layout NCHW or NHWC."; + cudnnTensorFormat_t cudnn_layout = (layout == mshadow::kNCHW) ? 
CUDNN_TENSOR_NCHW + : CUDNN_TENSOR_NHWC; Tensor data = in_data.get(s); Tensor out = out_data.get(s); // Perform shape calculations in a standard (NCHW) layout space - mshadow::Shape<4> dshape_nchw = (param_.layout.value() == mshadow::kNHWC) ? + mshadow::Shape<4> dshape_nchw = (layout == mshadow::kNHWC) ? ConvertLayout(data.shape_, mshadow::kNHWC, mshadow::kNCHW) : data.shape_; - mshadow::Shape<4> oshape_nchw = (param_.layout.value() == mshadow::kNHWC) ? + mshadow::Shape<4> oshape_nchw = (layout == mshadow::kNHWC) ? ConvertLayout(out.shape_, mshadow::kNHWC, mshadow::kNCHW) : out.shape_; CUDNN_CALL(cudnnSetTensor4dDescriptor(in_desc_, @@ -310,8 +309,8 @@ class CuDNNPoolingOp { param_.global_pool ? 1 : param_.stride[1])); #endif } else { - CHECK(param_.layout.value() == mshadow::kNCDHW || - param_.layout.value() == mshadow::kNDHWC) << "Need 3D layout NCDHW or NDHWC."; + CHECK(layout == mshadow::kNCDHW || + layout == mshadow::kNDHWC) << "Need 3D layout NCDHW or NDHWC."; Tensor data = in_data.get(s); mshadow::Shape<5> dshape = data.shape_; mshadow::Shape<5> dstride = mshadow::Shape5(dshape.ProdShape(1, 5), @@ -329,16 +328,16 @@ class CuDNNPoolingOp { oshape.ProdShape(5, 5)); // Convert to a standard (NCDHW) layout space to create args for cuDNN - mshadow::Shape<5> dshape_ncdhw = (param_.layout.value() == mshadow::kNDHWC) ? + mshadow::Shape<5> dshape_ncdhw = (layout == mshadow::kNDHWC) ? ConvertLayout(dshape, mshadow::kNDHWC, mshadow::kNCDHW) : dshape; - mshadow::Shape<5> dstride_ncdhw = (param_.layout.value() == mshadow::kNDHWC) ? + mshadow::Shape<5> dstride_ncdhw = (layout == mshadow::kNDHWC) ? ConvertLayout(dstride, mshadow::kNDHWC, mshadow::kNCDHW) : dstride; - mshadow::Shape<5> oshape_ncdhw = (param_.layout.value() == mshadow::kNDHWC) ? + mshadow::Shape<5> oshape_ncdhw = (layout == mshadow::kNDHWC) ? ConvertLayout(oshape, mshadow::kNDHWC, mshadow::kNCDHW) : oshape; - mshadow::Shape<5> ostride_ncdhw = (param_.layout.value() == mshadow::kNDHWC) ? + mshadow::Shape<5> ostride_ncdhw = (layout == mshadow::kNDHWC) ? 
ConvertLayout(ostride, mshadow::kNDHWC, mshadow::kNCDHW) : ostride; // Create int arrays for passing into cuDNN diff --git a/src/operator/nn/pool.cuh b/src/operator/nn/pool.cuh index cbdf77d272a0..69c630508304 100644 --- a/src/operator/nn/pool.cuh +++ b/src/operator/nn/pool.cuh @@ -866,45 +866,45 @@ template inline void pool(mshadow::Stream* s, const DType* in_data, const TShape& ishape, const TShape& oshape, const TShape& kernel, const TShape& pad, const TShape& stride, const int pool_type, OpReqType req_type, - DType* out_data, const bool count_include_pad, const dmlc::optional &layout) { + DType* out_data, const bool count_include_pad, int layout) { if (kernel.ndim() == 1) { - if (layout.has_value() && layout.value() == mshadow::kNWC) { + if (layout == mshadow::kNWC) { // standardize shapes to NCW to aid templated kernel invocation TShape ishape_ncw = ConvertLayout(ishape.get<3>(), mshadow::kNWC, mshadow::kNCW); TShape oshape_ncw = ConvertLayout(oshape.get<3>(), mshadow::kNWC, mshadow::kNCW); pool(s, in_data, ishape_ncw, oshape_ncw, kernel, pad, stride, pool_type, req_type, out_data, count_include_pad); - } else if (!layout.has_value() || layout.value() == mshadow::kNCW) { + } else if (layout == mshadow::kNCW) { pool(s, in_data, ishape, oshape, kernel, pad, stride, pool_type, req_type, out_data, count_include_pad); } else { - LOG(FATAL) << "Unsupported layout, expecting kNCW or kNWC, saw: " << layout.value(); + LOG(FATAL) << "Unsupported layout, expecting kNCW or kNWC, saw: " << layout; } } else if (kernel.ndim() == 2) { - if (layout.has_value() && layout.value() == mshadow::kNHWC) { + if (layout == mshadow::kNHWC) { // standardize shapes to NCHW to aid templated kernel invocation TShape ishape_nchw = ConvertLayout(ishape.get<4>(), mshadow::kNHWC, mshadow::kNCHW); TShape oshape_nchw = ConvertLayout(oshape.get<4>(), mshadow::kNHWC, mshadow::kNCHW); pool(s, in_data, ishape_nchw, oshape_nchw, kernel, pad, stride, pool_type, req_type, out_data, count_include_pad); - } else if (!layout.has_value() || layout.value() == mshadow::kNCHW) { + } else if (layout == mshadow::kNCHW) { pool(s, in_data, ishape, oshape, kernel, pad, stride, pool_type, req_type, out_data, count_include_pad); } else { - LOG(FATAL) << "Unsupported layout, expecting kNCHW or kNHWC, saw: " << layout.value(); + LOG(FATAL) << "Unsupported layout, expecting kNCHW or kNHWC, saw: " << layout; } } else if (kernel.ndim() == 3) { - if (layout.has_value() && layout.value() == mshadow::kNDHWC) { + if (layout == mshadow::kNDHWC) { // standardize shapes to NCDHW to aid templated kernel invocation TShape ishape_ncdhw = ConvertLayout(ishape.get<5>(), mshadow::kNDHWC, mshadow::kNCDHW); TShape oshape_ncdhw = ConvertLayout(oshape.get<5>(), mshadow::kNDHWC, mshadow::kNCDHW); pool(s, in_data, ishape_ncdhw, oshape_ncdhw, kernel, pad, stride, pool_type, req_type, out_data, count_include_pad); - } else if (!layout.has_value() || layout.value() == mshadow::kNCDHW) { + } else if (layout == mshadow::kNCDHW) { pool(s, in_data, ishape, oshape, kernel, pad, stride, pool_type, req_type, out_data, count_include_pad); } else { - LOG(FATAL) << "Unsupported layout, expecting kNCDHW or kNDHWC, saw: " << layout.value(); + LOG(FATAL) << "Unsupported layout, expecting kNCDHW or kNDHWC, saw: " << layout; } } else { LOG(FATAL) << "Unsupported " << kernel.ndim() << "-D pooling"; @@ -1086,45 +1086,45 @@ inline void unpool(mshadow::Stream* s, const DType* out_grad, const DType* const DType* out_data, const TShape& ishape, const TShape& oshape, const TShape& kernel, 
const TShape& pad, const TShape& stride, const int pool_type, OpReqType req_type, DType* in_grad, - const bool count_include_pad, const dmlc::optional &layout) { + const bool count_include_pad, int layout) { if (kernel.ndim() == 1) { - if (layout.has_value() && layout.value() == mshadow::kNWC) { + if (layout == mshadow::kNWC) { // standardize shapes to NCW to aid templated kernel invocation TShape ishape_ncw = ConvertLayout(ishape.get<3>(), mshadow::kNWC, mshadow::kNCW); TShape oshape_ncw = ConvertLayout(oshape.get<3>(), mshadow::kNWC, mshadow::kNCW); unpool(s, out_grad, in_data, out_data, ishape_ncw, oshape_ncw, kernel, pad, stride, pool_type, req_type, in_grad, count_include_pad); - } else if (!layout.has_value() || layout.value() == mshadow::kNCW) { + } else if (layout == mshadow::kNCW) { unpool(s, out_grad, in_data, out_data, ishape, oshape, kernel, pad, stride, pool_type, req_type, in_grad, count_include_pad); } else { - LOG(FATAL) << "Unsupported layout, expecting kNCW or kNWC, saw: " << layout.value(); + LOG(FATAL) << "Unsupported layout, expecting kNCW or kNWC, saw: " << layout; } } else if (kernel.ndim() == 2) { - if (layout.has_value() && layout.value() == mshadow::kNHWC) { + if (layout == mshadow::kNHWC) { // standardize shapes to NCHW to aid templated kernel invocation TShape ishape_nchw = ConvertLayout(ishape.get<4>(), mshadow::kNHWC, mshadow::kNCHW); TShape oshape_nchw = ConvertLayout(oshape.get<4>(), mshadow::kNHWC, mshadow::kNCHW); unpool(s, out_grad, in_data, out_data, ishape_nchw, oshape_nchw, kernel, pad, stride, pool_type, req_type, in_grad, count_include_pad); - } else if (!layout.has_value() || layout.value() == mshadow::kNCHW) { + } else if (layout == mshadow::kNCHW) { unpool(s, out_grad, in_data, out_data, ishape, oshape, kernel, pad, stride, pool_type, req_type, in_grad, count_include_pad); } else { - LOG(FATAL) << "Unsupported layout, expecting kNCHW or kNHWC, saw: " << layout.value(); + LOG(FATAL) << "Unsupported layout, expecting kNCHW or kNHWC, saw: " << layout; } } else if (kernel.ndim() == 3) { - if (layout.has_value() && layout.value() == mshadow::kNDHWC) { + if (layout == mshadow::kNDHWC) { // standardize shapes to NCDHW to aid templated kernel invocation TShape ishape_ncdhw = ConvertLayout(ishape.get<5>(), mshadow::kNDHWC, mshadow::kNCDHW); TShape oshape_ncdhw = ConvertLayout(oshape.get<5>(), mshadow::kNDHWC, mshadow::kNCDHW); unpool(s, out_grad, in_data, out_data, ishape_ncdhw, oshape_ncdhw, kernel, pad, stride, pool_type, req_type, in_grad, count_include_pad); - } else if (!layout.has_value() || layout.value() == mshadow::kNCDHW) { + } else if (layout == mshadow::kNCDHW) { unpool(s, out_grad, in_data, out_data, ishape, oshape, kernel, pad, stride, pool_type, req_type, in_grad, count_include_pad); } else { - LOG(FATAL) << "Unsupported layout, expecting kNCDHW or kNDHWC, saw: " << layout.value(); + LOG(FATAL) << "Unsupported layout, expecting kNCDHW or kNDHWC, saw: " << layout; } } else { LOG(FATAL) << "Unsupported " << kernel.ndim() << "-D unpooling"; diff --git a/src/operator/nn/pool.h b/src/operator/nn/pool.h index 4a2f9b27d5ab..9cd779134228 100644 --- a/src/operator/nn/pool.h +++ b/src/operator/nn/pool.h @@ -1326,10 +1326,10 @@ template inline void pool(mshadow::Stream* s, const DType* in_data, const TShape& ishape, const TShape& oshape, const TShape& kernel, const TShape& pad, const TShape& stride, const int pool_type, OpReqType req_type, - DType* out_data, const bool count_include_pad, const dmlc::optional &layout) { + DType* out_data, const bool 
count_include_pad, int layout) { CHECK_EQ(req_type, kWriteTo) << "Only support req=kWriteTo in pooling operations"; if (kernel.ndim() == 1) { - if (layout.has_value() && layout.value() == mshadow::kNWC) { + if (layout == mshadow::kNWC) { if (pool_enum::kMaxPooling == pool_type) { pool_max_1d_nwc_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data); } else if (pool_enum::kAvgPooling == pool_type) { @@ -1342,7 +1342,7 @@ inline void pool(mshadow::Stream* s, const DType* in_data, const TShape& is } else { LOG(FATAL) << "Unknown pooling type " << pool_type; } - } else if (!layout.has_value() || layout.value() == mshadow::kNCW) { + } else if (layout == mshadow::kNCW) { if (pool_enum::kMaxPooling == pool_type) { pool_max_1d_ncw_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data); } else if (pool_enum::kAvgPooling == pool_type) { @@ -1356,10 +1356,10 @@ inline void pool(mshadow::Stream* s, const DType* in_data, const TShape& is LOG(FATAL) << "Unknown pooling type " << pool_type; } } else { - LOG(FATAL) << "Unsupported layout, expecting kNCW or kNWC, saw: " << layout.value(); + LOG(FATAL) << "Unsupported layout, expecting kNCW or kNWC, saw: " << layout; } } else if (kernel.ndim() == 2) { - if (layout.has_value() && layout.value() == mshadow::kNHWC) { + if (layout == mshadow::kNHWC) { if (pool_enum::kMaxPooling == pool_type) { pool_max_2d_nhwc_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data); } else if (pool_enum::kAvgPooling == pool_type) { @@ -1372,7 +1372,7 @@ inline void pool(mshadow::Stream* s, const DType* in_data, const TShape& is } else { LOG(FATAL) << "Unknown pooling type " << pool_type; } - } else if (!layout.has_value() || layout.value() == mshadow::kNCHW) { + } else if (layout == mshadow::kNCHW) { if (pool_enum::kMaxPooling == pool_type) { pool_max_2d_nchw_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data); } else if (pool_enum::kAvgPooling == pool_type) { @@ -1386,10 +1386,10 @@ inline void pool(mshadow::Stream* s, const DType* in_data, const TShape& is LOG(FATAL) << "Unknown pooling type " << pool_type; } } else { - LOG(FATAL) << "Unsupported layout, expecting kNCHW or kNHWC, saw: " << layout.value(); + LOG(FATAL) << "Unsupported layout, expecting kNCHW or kNHWC, saw: " << layout; } } else if (kernel.ndim() == 3) { - if (layout.has_value() && layout.value() == mshadow::kNDHWC) { + if (layout == mshadow::kNDHWC) { if (pool_enum::kMaxPooling == pool_type) { pool_max_3d_ndhwc_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data); } else if (pool_enum::kAvgPooling == pool_type) { @@ -1402,7 +1402,7 @@ inline void pool(mshadow::Stream* s, const DType* in_data, const TShape& is } else { LOG(FATAL) << "Unknown pooling type " << pool_type; } - } else if (!layout.has_value() || layout.value() == mshadow::kNCDHW) { + } else if (layout == mshadow::kNCDHW) { if (pool_enum::kMaxPooling == pool_type) { pool_max_3d_ncdhw_cpu(in_data, ishape, oshape, kernel, pad, stride, out_data); } else if (pool_enum::kAvgPooling == pool_type) { @@ -1416,7 +1416,7 @@ inline void pool(mshadow::Stream* s, const DType* in_data, const TShape& is LOG(FATAL) << "Unknown pooling type " << pool_type; } } else { - LOG(FATAL) << "Unsupported layout, expecting kNCDHW or kNDHWC, saw: " << layout.value(); + LOG(FATAL) << "Unsupported layout, expecting kNCDHW or kNDHWC, saw: " << layout; } } else { LOG(FATAL) << "Unsupported " << kernel.ndim() << "-D pooling"; @@ -1444,13 +1444,13 @@ inline void unpool(mshadow::Stream* s, const DType* out_grad, const DType* const DType* out_data, const 
TShape& ishape, const TShape& oshape, const TShape& kernel, const TShape& pad, const TShape& stride, const int pool_type, OpReqType req_type, DType* in_grad, - const bool count_include_pad, const dmlc::optional &layout) { + const bool count_include_pad, int layout) { if (mxnet::kNullOp == req_type) return; if (mxnet::kAddTo != req_type) { mxnet_op::Kernel::Launch(s, ishape.Size(), in_grad); } if (kernel.ndim() == 1) { - if (layout.has_value() && layout.value() == mshadow::kNWC) { + if (layout == mshadow::kNWC) { if (pool_enum::kMaxPooling == pool_type) { unpool_max_1d_nwc_cpu(out_grad, in_data, out_data, ishape, oshape, kernel, pad, stride, in_grad); @@ -1467,7 +1467,7 @@ inline void unpool(mshadow::Stream* s, const DType* out_grad, const DType* } else { LOG(FATAL) << "Unknown pooling type " << pool_type; } - } else if (!layout.has_value() || layout.value() == mshadow::kNCW) { + } else if (layout == mshadow::kNCW) { if (pool_enum::kMaxPooling == pool_type) { unpool_max_1d_ncw_cpu(out_grad, in_data, out_data, ishape, oshape, kernel, pad, stride, in_grad); @@ -1486,10 +1486,10 @@ inline void unpool(mshadow::Stream* s, const DType* out_grad, const DType* LOG(FATAL) << "Unknown pooling type " << pool_type; } } else { - LOG(FATAL) << "Unsupported layout, expecting kNCW or kNWC, saw: " << layout.value(); + LOG(FATAL) << "Unsupported layout, expecting kNCW or kNWC, saw: " << layout; } } else if (kernel.ndim() == 2) { - if (layout.has_value() && layout.value() == mshadow::kNHWC) { + if (layout == mshadow::kNHWC) { if (pool_enum::kMaxPooling == pool_type) { unpool_max_2d_nhwc_cpu(out_grad, in_data, out_data, ishape, oshape, kernel, pad, stride, in_grad); @@ -1507,7 +1507,7 @@ inline void unpool(mshadow::Stream* s, const DType* out_grad, const DType* } else { LOG(FATAL) << "Unknown pooling type " << pool_type; } - } else if (!layout.has_value() || layout.value() == mshadow::kNCHW) { + } else if (layout == mshadow::kNCHW) { if (pool_enum::kMaxPooling == pool_type) { unpool_max_2d_nchw_cpu(out_grad, in_data, out_data, ishape, oshape, kernel, pad, stride, in_grad); @@ -1526,10 +1526,10 @@ inline void unpool(mshadow::Stream* s, const DType* out_grad, const DType* LOG(FATAL) << "Unknown pooling type " << pool_type; } } else { - LOG(FATAL) << "Unsupported layout, expecting kNCHW or kNHWC, saw: " << layout.value(); + LOG(FATAL) << "Unsupported layout, expecting kNCHW or kNHWC, saw: " << layout; } } else if (kernel.ndim() == 3) { - if (layout.has_value() && layout.value() == mshadow::kNDHWC) { + if (layout == mshadow::kNDHWC) { if (pool_enum::kMaxPooling == pool_type) { unpool_max_3d_ndhwc_cpu(out_grad, in_data, out_data, ishape, oshape, kernel, pad, stride, in_grad); @@ -1546,7 +1546,7 @@ inline void unpool(mshadow::Stream* s, const DType* out_grad, const DType* } else { LOG(FATAL) << "Unknown pooling type " << pool_type; } - } else if (!layout.has_value() || layout.value() == mshadow::kNCDHW) { + } else if (layout == mshadow::kNCDHW) { if (pool_enum::kMaxPooling == pool_type) { unpool_max_3d_ncdhw_cpu(out_grad, in_data, out_data, ishape, oshape, kernel, pad, stride, in_grad); @@ -1565,7 +1565,7 @@ inline void unpool(mshadow::Stream* s, const DType* out_grad, const DType* LOG(FATAL) << "Unknown pooling type " << pool_type; } } else { - LOG(FATAL) << "Unsupported layout, expecting kNCDHW or kNDHWC, saw: " << layout.value(); + LOG(FATAL) << "Unsupported layout, expecting kNCDHW or kNDHWC, saw: " << layout; } } else { LOG(FATAL) << "Unsupported " << kernel.ndim() << "-D unpooling"; diff --git 
a/src/operator/nn/pooling-inl.h b/src/operator/nn/pooling-inl.h index 5207f812282d..c947c3265f2b 100644 --- a/src/operator/nn/pooling-inl.h +++ b/src/operator/nn/pooling-inl.h @@ -118,6 +118,27 @@ struct PoolingParam : public dmlc::Parameter { this->count_include_pad == other.count_include_pad && this->layout == other.layout; } + + // Extract layout from param, or supply default layout based on provided input dimension. + int GetLayout(int input_dim) const { + int ret_val = mshadow::kNCW; + if (layout.has_value()) { + ret_val = layout.value(); + } + else { + switch (input_dim) { + case 3U: ret_val = mshadow::kNCW; break; + case 4U: ret_val = mshadow::kNCHW; break; + case 5U: ret_val = mshadow::kNCDHW; break; + default: + LOG(FATAL) << "Unexpected input data dim " << input_dim << "\n" + << "Pooling: Input data should be 3D in (batch, channel, x), " + << " or 4D in (batch, channel, y, x), " + << " or 5D in (batch, channel, d, y, x)."; + } + } + return ret_val; + } }; } // namespace op @@ -137,7 +158,8 @@ struct hash { ret = dmlc::HashCombine(ret, val.cudnn_off); ret = dmlc::HashCombine(ret, val.p_value); ret = dmlc::HashCombine(ret, val.count_include_pad); - ret = dmlc::HashCombine(ret, val.layout); + int val_layout = val.layout.has_value() ? val.layout.value() : -1; + ret = dmlc::HashCombine(ret, val_layout); return ret; } }; @@ -168,11 +190,10 @@ class PoolingOp { TShape kernel = param_.kernel; TShape padding = param_.pad; TShape stride = param_.stride; + int layout = param_.GetLayout(ishape.ndim()); if (param_.global_pool) { // with global pooling, kernel shape corresponds to input shape with 'N' and 'C' removed - if (param_.layout.value() == mshadow::kNWC || - param_.layout.value() == mshadow::kNHWC || - param_.layout.value() == mshadow::kNDHWC) { + if (layout == mshadow::kNWC || layout == mshadow::kNHWC || layout == mshadow::kNDHWC) { kernel = TShape(ishape.data() + 1, ishape.data() + ishape.ndim() - 1); @@ -196,21 +217,21 @@ class PoolingOp { kernel, padding, stride, - param_.pool_type, req, out_data.dptr(), count_include_pad, param_.layout); + param_.pool_type, req, out_data.dptr(), count_include_pad, layout); break; case 2: pool(s, in_data.dptr(), in_data.shape_, out_data.shape_, kernel, padding, stride, - param_.pool_type, req, out_data.dptr(), count_include_pad, param_.layout); + param_.pool_type, req, out_data.dptr(), count_include_pad, layout); break; case 3: pool(s, in_data.dptr(), in_data.shape_, out_data.shape_, kernel, padding, stride, - param_.pool_type, req, out_data.dptr(), count_include_pad, param_.layout); + param_.pool_type, req, out_data.dptr(), count_include_pad, layout); break; default: LOG(FATAL) << "p value of " << p_value << " is not supported yet..."; @@ -226,11 +247,10 @@ class PoolingOp { TShape kernel = param_.kernel; TShape padding = param_.pad; TShape stride = param_.stride; + int layout = param_.GetLayout(ishape.ndim()); if (param_.global_pool) { // with global pooling, kernel shape corresponds to input shape with 'N' and 'C' removed - if (param_.layout.value() == mshadow::kNWC || - param_.layout.value() == mshadow::kNHWC || - param_.layout.value() == mshadow::kNDHWC) { + if (layout == mshadow::kNWC || layout == mshadow::kNHWC || layout == mshadow::kNDHWC) { kernel = TShape(ishape.data() + 1, ishape.data() + ishape.ndim() - 1); @@ -256,7 +276,7 @@ class PoolingOp { kernel, padding, stride, - param_.pool_type, req, in_grad.dptr(), count_include_pad, param_.layout); + param_.pool_type, req, in_grad.dptr(), count_include_pad, layout); break; case 2: unpool(s, 
out_grad.dptr(), in_data.dptr(), out_data.dptr(), @@ -264,7 +284,7 @@ class PoolingOp { kernel, padding, stride, - param_.pool_type, req, in_grad.dptr(), count_include_pad, param_.layout); + param_.pool_type, req, in_grad.dptr(), count_include_pad, layout); break; case 3: unpool(s, out_grad.dptr(), in_data.dptr(), out_data.dptr(), @@ -272,7 +292,7 @@ class PoolingOp { kernel, padding, stride, - param_.pool_type, req, in_grad.dptr(), count_include_pad, param_.layout); + param_.pool_type, req, in_grad.dptr(), count_include_pad, layout); break; default: LOG(FATAL) << "p value of " << p_value << " is not supported yet..."; diff --git a/src/operator/nn/pooling.cc b/src/operator/nn/pooling.cc index 1024f110492b..1dc641aa0f9c 100644 --- a/src/operator/nn/pooling.cc +++ b/src/operator/nn/pooling.cc @@ -39,12 +39,13 @@ void PoolingParamParser(nnvm::NodeAttrs *attrs) { using namespace mshadow; PoolingParam param; param.Init(attrs->dict); + // Set default layout if it can be inferred from kernel shape. + if (param.kernel.ndim() > 0) + param.layout = param.GetLayout(param.kernel.ndim() + 2); if (param.kernel.ndim() == 1) { - param.layout = param.layout ? param.layout.value() : mshadow::kNCW; if (param.stride.ndim() == 0) param.stride = Shape1(1); if (param.pad.ndim() == 0) param.pad = Shape1(0); } else if (param.kernel.ndim() == 2) { - param.layout = param.layout ? param.layout.value() : mshadow::kNCHW; if (param.stride.ndim() == 0) param.stride = Shape2(1, 1); if (param.pad.ndim() == 0) param.pad = Shape2(0, 0); } else { @@ -53,7 +54,6 @@ void PoolingParamParser(nnvm::NodeAttrs *attrs) { CHECK_EQ(param.kernel.ndim(), 3U) << param.kernel.ndim() << "D pooling not supported"; } - param.layout = param.layout ? param.layout.value(): mshadow::kNCDHW; if (param.stride.ndim() == 0) param.stride = Shape3(1, 1, 1); if (param.pad.ndim() == 0) param.pad = Shape3(0, 0, 0); } @@ -115,10 +115,11 @@ static bool PoolingShape(const nnvm::NodeAttrs &attrs, << " Or 4D in (batch, channel, y, x) " << " Or 5D in (batch, channel, d, y, x)"; if (dshape.ndim() == 0) return false; + int layout = param.GetLayout(dshape.ndim()); if (param.global_pool) { TShape oshape = dshape; size_t c_index = 0; - switch (param.layout.value()) { + switch (layout) { case mshadow::kNCW: case mshadow::kNCHW: case mshadow::kNCDHW: @@ -146,10 +147,9 @@ static bool PoolingShape(const nnvm::NodeAttrs &attrs, } else if (param.kernel.ndim() == 1) { CHECK_EQ(dshape.ndim(), 3U) << "Pooling: Input data should be 3D in (batch, channel, x)"; - CHECK(param.layout.value() == mshadow::kNCW || - param.layout.value() == mshadow::kNWC) << "Need 1D layout"; + CHECK(layout == mshadow::kNCW || layout == mshadow::kNWC) << "Need 1D layout"; // Perform shape calculations in a standard (NCW) layout space - mshadow::Shape<3> dshape_ncw = (param.layout.value() == mshadow::kNWC) ? + mshadow::Shape<3> dshape_ncw = (layout == mshadow::kNWC) ? ConvertLayout(dshape.get<3>(), mshadow::kNWC, mshadow::kNCW) : dshape.get<3>(); mshadow::Shape<3> oshape_ncw = dshape_ncw; @@ -171,7 +171,7 @@ static bool PoolingShape(const nnvm::NodeAttrs &attrs, param.stride[0])); } // Convert back from standard (NCW) layout space to the actual layout type - TShape oshape = (param.layout.value() == mshadow::kNWC) ? + TShape oshape = (layout == mshadow::kNWC) ? 
ConvertLayout(oshape_ncw, mshadow::kNCW, mshadow::kNWC) : oshape_ncw; out_shape->clear(); out_shape->push_back(oshape); // save output shape @@ -181,10 +181,9 @@ static bool PoolingShape(const nnvm::NodeAttrs &attrs, #endif } else if (param.kernel.ndim() == 2) { CHECK_EQ(dshape.ndim(), 4U) << "Pooling: Input data should be 4D in (batch, channel, y, x)"; - CHECK(param.layout.value() == mshadow::kNCHW || - param.layout.value() == mshadow::kNHWC) << "Need 2D layout"; + CHECK(layout == mshadow::kNCHW || layout == mshadow::kNHWC) << "Need 2D layout"; // Perform shape calculations in a standard (NCHW) layout space - mshadow::Shape<4> dshape_nchw = (param.layout.value() == mshadow::kNHWC) ? + mshadow::Shape<4> dshape_nchw = (layout == mshadow::kNHWC) ? ConvertLayout(dshape.get<4>(), mshadow::kNHWC, mshadow::kNCHW) : dshape.get<4>(); mshadow::Shape<4> oshape_nchw = dshape_nchw; @@ -210,7 +209,7 @@ static bool PoolingShape(const nnvm::NodeAttrs &attrs, param.stride[1])); } // Convert back from standard (NCHW) layout space to the actual layout type - TShape oshape = (param.layout.value() == mshadow::kNHWC) ? + TShape oshape = (layout == mshadow::kNHWC) ? ConvertLayout(oshape_nchw, mshadow::kNCHW, mshadow::kNHWC) : oshape_nchw; out_shape->clear(); out_shape->push_back(oshape); // save output shape @@ -221,10 +220,9 @@ static bool PoolingShape(const nnvm::NodeAttrs &attrs, } else if (param.kernel.ndim() == 3) { CHECK_EQ(dshape.ndim(), 5U) << "Pooling: Input data should be 5D in (batch, channel, d, y, x)"; - CHECK(param.layout.value() == mshadow::kNCDHW || - param.layout.value() == mshadow::kNDHWC) << "Need 3D layout"; + CHECK(layout == mshadow::kNCDHW || layout == mshadow::kNDHWC) << "Need 3D layout"; // Perform shape calculations in a standard (NCDHW) layout space - mshadow::Shape<5> dshape_ncdhw = (param.layout.value() == mshadow::kNDHWC) ? + mshadow::Shape<5> dshape_ncdhw = (layout == mshadow::kNDHWC) ? ConvertLayout(dshape.get<5>(), mshadow::kNDHWC, mshadow::kNCDHW) : dshape.get<5>(); mshadow::Shape<5> oshape_ncdhw = dshape_ncdhw; @@ -253,7 +251,7 @@ static bool PoolingShape(const nnvm::NodeAttrs &attrs, param.stride[2])); } // Convert back from standard (NCDHW) layout space to the actual layout type - TShape oshape = (param.layout.value() == mshadow::kNDHWC) ? + TShape oshape = (layout == mshadow::kNDHWC) ? ConvertLayout(oshape_ncdhw, mshadow::kNCDHW, mshadow::kNDHWC) : oshape_ncdhw; out_shape->clear(); out_shape->push_back(oshape); // save output shape From 226854b625fd2d081e5adbd6be3eeb31dac6e229 Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Fri, 21 Dec 2018 14:26:26 -0800 Subject: [PATCH 13/32] Fix cpplint. --- src/operator/nn/pooling-inl.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/operator/nn/pooling-inl.h b/src/operator/nn/pooling-inl.h index c947c3265f2b..af00fd5cfa3c 100644 --- a/src/operator/nn/pooling-inl.h +++ b/src/operator/nn/pooling-inl.h @@ -124,8 +124,7 @@ struct PoolingParam : public dmlc::Parameter { int ret_val = mshadow::kNCW; if (layout.has_value()) { ret_val = layout.value(); - } - else { + } else { switch (input_dim) { case 3U: ret_val = mshadow::kNCW; break; case 4U: ret_val = mshadow::kNCHW; break; From b08747bb1f5dbc4d7c3a1340e2b13ef56089e74a Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Fri, 21 Dec 2018 15:24:08 -0800 Subject: [PATCH 14/32] Throw exception for quantization pooling not NCHW. 
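The quantized shape function indexes the input shape with hard-coded NCHW positions (N=0, C=1, H=2, W=3), so any other layout would silently yield wrong output shapes; the added CHECK turns that into an immediate error at shape-inference time. A rough illustration of the intended behavior, assuming the operator is exposed as mx.sym.contrib.quantized_pooling with inputs named data/min_data/max_data (accessor and input names are assumptions, not taken from this patch):

    import mxnet as mx
    from mxnet.base import MXNetError

    # Assumed accessor for the _contrib_quantized_pooling operator.
    qpool = mx.sym.contrib.quantized_pooling(data=mx.sym.Variable('data'),
                                             min_data=mx.sym.Variable('min_data'),
                                             max_data=mx.sym.Variable('max_data'),
                                             kernel=(3, 3), pool_type='max', layout='NHWC')
    try:
        qpool.infer_shape(data=(1, 8, 8, 4), min_data=(1,), max_data=(1,))
    except MXNetError:
        print('NHWC rejected by quantized pooling, as intended')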
--- src/operator/quantization/quantized_pooling.cc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/operator/quantization/quantized_pooling.cc b/src/operator/quantization/quantized_pooling.cc index 477830a6353c..bb5ea570f6ac 100644 --- a/src/operator/quantization/quantized_pooling.cc +++ b/src/operator/quantization/quantized_pooling.cc @@ -40,6 +40,9 @@ bool QuantizedPoolingShape(const nnvm::NodeAttrs& attrs, CHECK_EQ(dshape.ndim(), 4U) << "quantized_pooling: Input data should be 4D in " << "(batch, channel, y, x)"; + int layout = param.GetLayout(dshape.ndim()); + CHECK_EQ(layout, mshadow::kNCHW) + << "QuantizedPoolingOp only supports NCHW layout for now, saw " << layout; // NCHW layout const int N = 0, H = 2, W = 3, C = 1; TShape oshape(4); From ed31a607ec26799d2722b39ff9e95d9f312dad5a Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Fri, 21 Dec 2018 16:19:29 -0800 Subject: [PATCH 15/32] Expand nhwc pooling test coverage. --- src/operator/nn/cudnn/cudnn_pooling-inl.h | 10 ++++--- tests/python/gpu/test_operator_gpu.py | 33 ++++++++++++----------- 2 files changed, 24 insertions(+), 19 deletions(-) diff --git a/src/operator/nn/cudnn/cudnn_pooling-inl.h b/src/operator/nn/cudnn/cudnn_pooling-inl.h index 1df1d920ffd7..8dad756f6c77 100644 --- a/src/operator/nn/cudnn/cudnn_pooling-inl.h +++ b/src/operator/nn/cudnn/cudnn_pooling-inl.h @@ -217,14 +217,18 @@ class CuDNNPoolingOp { if (window_height > 8 || window_width > 8) return false; #endif - // Avoid strided NHWC max pooling for some configs, to be corrected in a future cudnn release. +#if CUDNN_VERSION >= 7105 && CUDNN_VERSION < 7500 + // Avoid strided NHWC max pooling for some configs if (layout == mshadow::kNHWC && param.pool_type == pool_enum::kMaxPooling && !param.global_pool) { - if ((param.stride[0] >= 3 || param.stride[0] == 2 && param.kernel[0] % 2 == 0)) + if (param.stride[0] >= 3 || + param.stride[0] == 2 && param.kernel[0] % 2 == 0 && param.kernel[0] != 2) return false; - if ((param.stride[1] >= 3 || param.stride[1] == 2 && param.kernel[1] % 2 == 0)) + if (param.stride[1] >= 3 || + param.stride[1] == 2 && param.kernel[1] % 2 == 0 && param.kernel[1] != 2) return false; } +#endif } else if (param.kernel.ndim() == 3) { // 3d pooling #if CUDNN_MAJOR < 5 diff --git a/tests/python/gpu/test_operator_gpu.py b/tests/python/gpu/test_operator_gpu.py index 2eadeae5e063..3c6c2355f768 100644 --- a/tests/python/gpu/test_operator_gpu.py +++ b/tests/python/gpu/test_operator_gpu.py @@ -656,22 +656,23 @@ def make_pooling_syms(**kwargs): # While the float32 and float64 output is reliably consistent, float16 departs occasionally. # We compare nhwc and nchw results only within a given precision. 
- for in_shape in [(3, 4, 8, 8), (2, 2, 10, 10)]: - for data_type in [np.float64, np.float32, np.float16]: - # NHWC pooling is only enabled on GPU with CUDNN - ctx_list = [{'ctx': mx.gpu(0), 'pool_data': in_shape, - 'type_dict': {'pool_data': data_type}}] - symlist = make_pooling_syms(kernel=(3,3), pool_type='max', - pooling_convention='valid', name='pool') - check_consistency_NxM(symlist, ctx_list) - - symlist = make_pooling_syms(kernel=(3,3), pool_type='max', - pooling_convention='full', name='pool') - check_consistency_NxM(symlist, ctx_list) - - symlist = make_pooling_syms(kernel=(300,300), pool_type='max', - global_pool=True, name='pool') - check_consistency_NxM(symlist, ctx_list) + for in_shape in [(3, 4, 8, 8), (2, 2, 20, 20)]: + for kernel in [(2,2), (3,3), (4,4)]: + for stride in [(1,1), (1,2), (2,1), (2,2)]: + for data_type in [np.float64, np.float32, np.float16]: + ctx_list = [{'ctx': mx.gpu(0), 'pool_data': in_shape, + 'type_dict': {'pool_data': data_type}}] + symlist = make_pooling_syms(kernel=kernel, pool_type='max', stride=stride, + pooling_convention='valid', name='pool') + check_consistency_NxM(symlist, ctx_list) + + symlist = make_pooling_syms(kernel=kernel, pool_type='max', stride=stride, + pooling_convention='full', name='pool') + check_consistency_NxM(symlist, ctx_list) + + symlist = make_pooling_syms(kernel=(300,300), pool_type='max', + global_pool=True, name='pool') + check_consistency_NxM(symlist, ctx_list) def test_pooling_with_type(): From 0dd3fbad0f4ab160c6cf6e6d366d0b0b0aa01778 Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Fri, 21 Dec 2018 18:12:46 -0800 Subject: [PATCH 16/32] SupportMKLDNNPooling() to examine layout param. --- src/operator/nn/mkldnn/mkldnn_pooling-inl.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/operator/nn/mkldnn/mkldnn_pooling-inl.h b/src/operator/nn/mkldnn/mkldnn_pooling-inl.h index f548778c7615..de3d63e24f6c 100644 --- a/src/operator/nn/mkldnn/mkldnn_pooling-inl.h +++ b/src/operator/nn/mkldnn/mkldnn_pooling-inl.h @@ -104,7 +104,8 @@ class MKLDNNPoolingBwd { inline bool SupportMKLDNNPooling(const PoolingParam ¶m) { return param.kernel.ndim() == 2 && (param.pool_type == pool_enum::kMaxPooling || - param.pool_type == pool_enum::kAvgPooling); + param.pool_type == pool_enum::kAvgPooling) && + (!param.layout.has_value() || param.layout.value() == mshadow::kNCHW); } inline bool SupportMKLDNNPooling(const PoolingParam ¶m, From 062554233f497b5cad87346bd1b8f5a97f762f2d Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Fri, 28 Dec 2018 19:18:50 -0800 Subject: [PATCH 17/32] Compare 'std' and 'v1' pooling versions only when op definitions permit. 
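The comment added to the test file below spells out why the 'v1' and standard operators cannot always be compared. The first reason, gradient routing when a max-pooling window contains tied maxima, can be seen in a few lines of NumPy, independent of the real kernels (illustration only; the window values are arbitrary):

    import numpy as np

    window = np.array([5., 5., 3.])      # one pooling window with two tied maxima
    upstream = 1.0                       # gradient arriving at the pooled output

    grad_all_maxima = (window == window.max()) * upstream    # 'v1' style: credit every maximum
    grad_first_max = np.zeros_like(window)
    grad_first_max[window.argmax()] = upstream               # standard style: lowest index only

    print(grad_all_maxima)   # [1. 1. 0.]
    print(grad_first_max)    # [1. 0. 0.]

With random float64 inputs, exact ties inside a window are vanishingly unlikely, which is what makes the shared '2D_no_padding' max-pooling comparison meaningful.
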
--- tests/python/gpu/test_operator_gpu.py | 285 +++++++------------------- 1 file changed, 77 insertions(+), 208 deletions(-) diff --git a/tests/python/gpu/test_operator_gpu.py b/tests/python/gpu/test_operator_gpu.py index 3c6c2355f768..2895e32cc7c9 100644 --- a/tests/python/gpu/test_operator_gpu.py +++ b/tests/python/gpu/test_operator_gpu.py @@ -901,7 +901,6 @@ def test_pooling_versions_helper(pool_op_list, data, kernel, pool_type, pad, str expected_ctxs = ['cpu', 'gpu', 'cudnn'] if ctx_type not in expected_ctxs: raise RuntimeError('Expected one of {}, saw {}.'.format(expected_ctxs, ctx_type)) - # print('Testing pool_op = {}, ctx_type = {}'.format(pool_op, ctx_type)) ctx = mx.cpu(0) if ctx_type == 'cpu' else mx.gpu(0) ctx_list.append({'ctx': ctx, 'pool_data': data, 'type_dict': {'pool_data': dtype}}) # start with pool args present in all cases @@ -938,222 +937,92 @@ def test_pooling_versions_helper(pool_op_list, data, kernel, pool_type, pad, str check_consistency(sym_list, ctx_list, equal_nan=(not count_include_pad), tol=tol) - def test_1d_pooling(pool_type, dtype, p_value=2, count_include_pad=True): - data = (2, 3, 10) - kernel = (4,) - pad = (0,) - stride = (1,) - test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_transposed_cpu', - 'pool_gpu', 'pool_transposed_gpu', - 'pool_cudnn', 'pool_transposed_cudnn'], - data=data, kernel=kernel, pad=pad, stride=stride, - pool_type=pool_type, pooling_convention='valid', - global_pool=False, p_value=p_value, - count_include_pad=count_include_pad, dtype=dtype) - - pad = (2,) - stride = (2,) - test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_transposed_cpu', - 'pool_gpu', 'pool_transposed_gpu', - 'pool_cudnn', 'pool_transposed_cudnn'], - data=data, kernel=kernel, pad=pad, stride=stride, - pool_type=pool_type, pooling_convention='valid', - global_pool=False, p_value=p_value, - count_include_pad=count_include_pad, dtype=dtype) - - pad = (0,) - stride = (1,) - test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_transposed_cpu', - 'pool_gpu', 'pool_transposed_gpu', - 'pool_cudnn', 'pool_transposed_cudnn'], - data=data, kernel=kernel, pad=pad, stride=stride, - pool_type=pool_type, pooling_convention='full', - global_pool=False, p_value=p_value, - count_include_pad=count_include_pad, dtype=dtype) - - pad = (2,) - stride = (2,) - test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_transposed_cpu', - 'pool_gpu', 'pool_transposed_gpu', - 'pool_cudnn', 'pool_transposed_cudnn'], - data=data, kernel=kernel, pad=pad, stride=stride, - pool_type=pool_type, pooling_convention='full', - global_pool=False, p_value=p_value, - count_include_pad=count_include_pad, dtype=dtype) - - test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_transposed_cpu', - 'pool_gpu', 'pool_transposed_gpu', - 'pool_cudnn', 'pool_transposed_cudnn'], - data=data, kernel=kernel, pad=pad, stride=stride, - pool_type=pool_type, global_pool=True, p_value=p_value, - count_include_pad=count_include_pad, dtype=dtype) - - def test_2d_pooling(pool_type, dtype, p_value=2, count_include_pad=True): - data = (2, 3, 20, 20) - kernel = (4, 5) - pad = (0, 0) - stride = (1, 1) - if pool_type == 'lp': - test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_transposed_cpu', - 'pool_gpu', 'pool_transposed_gpu', - 'pool_cudnn', 'pool_transposed_cudnn'], - data=data, kernel=kernel, pad=pad, stride=stride, - pool_type=pool_type, pooling_convention='valid', - global_pool=False, p_value=p_value, dtype=dtype) + def test_pooling_dim(dim, pool_type, dtype, 
pool_op_list, p_value=2, count_include_pad=True): + if dim == '1D': + data = (3, 3, 10) + kernel = (4,) + pads = [(0,), (2,)] + strides = [(1,), (2,)] + elif dim == '2D_no_padding': + data = (3, 2, 20, 20) + kernel = (4, 5) + pads = [(0, 0), (0, 0)] + strides = [(1, 1), (2,1)] + elif dim == '2D': + data = (2, 2, 20, 20) + kernel = (4, 5) + pads = [(0, 0), (0, 0), (2, 3)] + strides = [(1, 1), (2,1), (1, 1)] + elif dim == '3D': + data = (2, 3, 20, 20, 20) + kernel = (4, 5, 3) + pads = [(0, 0, 0), (2, 3, 2)] + strides = [(1, 1, 1), (2, 3, 1)] else: - test_pooling_versions_helper(pool_op_list=['pool_v1_cpu', - 'pool_v1_gpu', - 'pool_cpu', 'pool_transposed_cpu', - 'pool_gpu', 'pool_transposed_gpu', - 'pool_cudnn', 'pool_transposed_cudnn'], - data=data, kernel=kernel, pad=pad, stride=stride, - pool_type=pool_type, pooling_convention='valid', - global_pool=False, count_include_pad=count_include_pad, - dtype=dtype) - - # pool_v1 has bugs when pad is not 0, do not test PoolingV1 here - pad = (0, 0) - stride = (2, 1) - test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_transposed_cpu', - 'pool_gpu', 'pool_transposed_gpu', - 'pool_cudnn', 'pool_transposed_cudnn'], - data=data, kernel=kernel, pad=pad, stride=stride, - pool_type=pool_type, pooling_convention='valid', - global_pool=False, p_value=p_value, - count_include_pad=count_include_pad, dtype=dtype) + raise RuntimeError('Unexpected pooling test class: {}.'.format(dim)) - pad = (0, 0) - stride = (1, 1) - if pool_type == 'lp': - test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_transposed_cpu', - 'pool_gpu', 'pool_transposed_gpu', - 'pool_cudnn', 'pool_transposed_cudnn'], - data=data, kernel=kernel, pad=pad, stride=stride, - pool_type=pool_type, pooling_convention='full', - global_pool=False, p_value=p_value, dtype=dtype) - else: - if count_include_pad: - test_pooling_versions_helper(pool_op_list=['pool_v1_cpu', - 'pool_v1_gpu', - 'pool_cpu', 'pool_transposed_cpu', - 'pool_gpu', 'pool_transposed_gpu', - 'pool_cudnn', 'pool_transposed_cudnn'], - data=data, kernel=kernel, pad=pad, stride=stride, - pool_type=pool_type, pooling_convention='full', - global_pool=False, count_include_pad=count_include_pad, - dtype=dtype) - else: - test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_transposed_cpu', - 'pool_gpu', 'pool_transposed_gpu', - 'pool_cudnn', 'pool_transposed_cudnn'], - data=data, kernel=kernel, pad=pad, stride=stride, - pool_type=pool_type, pooling_convention='full', - global_pool=False, count_include_pad=count_include_pad, - dtype=dtype) - - # pool_v1 has bugs when pad is not 0, do not test PoolingV1 here - pad = (2, 3) - stride = (2, 3) - test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_transposed_cpu', - 'pool_gpu', 'pool_transposed_gpu', - 'pool_cudnn', 'pool_transposed_cudnn'], + for pad, stride in zip(pads, strides): + for pooling_convention in ['valid', 'full']: + test_pooling_versions_helper(pool_op_list=pool_op_list, data=data, kernel=kernel, pad=pad, stride=stride, - pool_type=pool_type, pooling_convention='full', + pool_type=pool_type, pooling_convention=pooling_convention, global_pool=False, p_value=p_value, count_include_pad=count_include_pad, dtype=dtype) - if pool_type == 'lp': - test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_transposed_cpu', - 'pool_gpu', 'pool_transposed_gpu', - 'pool_cudnn', 'pool_transposed_cudnn'], - data=data, kernel=kernel, pad=pad, stride=stride, - pool_type=pool_type, global_pool=True, - p_value=p_value, dtype=dtype) - else: - 
test_pooling_versions_helper(pool_op_list=['pool_v1_cpu', - 'pool_v1_gpu', - 'pool_cpu', 'pool_transposed_cpu', - 'pool_gpu', 'pool_transposed_gpu', - 'pool_cudnn', 'pool_transposed_cudnn'], - data=data, kernel=kernel, pad=pad, stride=stride, - pool_type=pool_type, global_pool=True, - count_include_pad=count_include_pad, dtype=dtype) - - def test_3d_pooling(pool_type, dtype, p_value=2, count_include_pad=True): - data = (2, 3, 20, 20, 20) - kernel = (4, 5, 3) - pad = (0, 0, 0) - stride = (1, 1, 1) - test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_transposed_cpu', - 'pool_gpu', 'pool_transposed_gpu', - 'pool_cudnn', 'pool_transposed_cudnn'], - data=data, kernel=kernel, pad=pad, stride=stride, - pool_type=pool_type, pooling_convention='valid', - global_pool=False, p_value=p_value, - count_include_pad=count_include_pad, dtype=dtype) - - pad = (2, 3, 3) - stride = (2, 3, 1) - test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_transposed_cpu', - 'pool_gpu', 'pool_transposed_gpu', - 'pool_cudnn', 'pool_transposed_cudnn'], - data=data, kernel=kernel, pad=pad, stride=stride, - pool_type=pool_type, pooling_convention='valid', - global_pool=False, p_value=p_value, - count_include_pad=count_include_pad, dtype=dtype) - - pad = (0, 0, 0) - stride = (1, 1, 1) - test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_transposed_cpu', - 'pool_gpu', 'pool_transposed_gpu', - 'pool_cudnn', 'pool_transposed_cudnn'], - data=data, kernel=kernel, pad=pad, stride=stride, - pool_type=pool_type, pooling_convention='full', - global_pool=False, p_value=p_value, - count_include_pad=count_include_pad, dtype=dtype) - - pad = (2, 3, 3) - stride = (2, 3, 1) - test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_transposed_cpu', - 'pool_gpu', 'pool_transposed_gpu', - 'pool_cudnn', 'pool_transposed_cudnn'], - data=data, kernel=kernel, pad=pad, stride=stride, - pool_type=pool_type, pooling_convention='full', - global_pool=False, p_value=p_value, - count_include_pad=count_include_pad, dtype=dtype) - - test_pooling_versions_helper(pool_op_list=['pool_cpu', 'pool_transposed_cpu', - 'pool_gpu', 'pool_transposed_gpu', - 'pool_cudnn', 'pool_transposed_cudnn'], - data=data, kernel=kernel, pad=pad, stride=stride, + test_pooling_versions_helper(pool_op_list=pool_op_list, + data=data, kernel=kernel, pad=None, stride=None, pool_type=pool_type, global_pool=True, p_value=p_value, count_include_pad=count_include_pad, dtype=dtype) - # leave off np.float16 until pool_v1 can be made to conform to pool for this dtype - for dtype in [np.float32, np.float64]: - test_1d_pooling('max', dtype=dtype) - test_1d_pooling('avg', dtype=dtype, count_include_pad=True) - test_1d_pooling('avg', dtype=dtype, count_include_pad=False) - test_1d_pooling('sum', dtype=dtype) - test_1d_pooling('lp', dtype=dtype, p_value=1) - test_1d_pooling('lp', dtype=dtype, p_value=2) - test_1d_pooling('lp', dtype=dtype, p_value=3) - - test_2d_pooling('max', dtype=dtype) - test_2d_pooling('avg', dtype=dtype, count_include_pad=True) - test_2d_pooling('avg', dtype=dtype, count_include_pad=False) - test_2d_pooling('sum', dtype=dtype) - test_2d_pooling('lp', dtype=dtype, p_value=1) - test_2d_pooling('lp', dtype=dtype, p_value=2) - test_2d_pooling('lp', dtype=dtype, p_value=3) - - test_3d_pooling('max', dtype=dtype) - test_3d_pooling('avg', dtype=dtype, count_include_pad=True) - test_3d_pooling('avg', dtype=dtype, count_include_pad=False) - test_3d_pooling('sum', dtype=dtype) - test_3d_pooling('lp', dtype=dtype, p_value=1) - test_3d_pooling('lp', 
dtype=dtype, p_value=2) - test_3d_pooling('lp', dtype=dtype, p_value=3) + # The various implementations of the standard pooling operator + std_pool_op_list = ['pool_cpu', 'pool_transposed_cpu', + 'pool_gpu', 'pool_transposed_gpu', + 'pool_cudnn', 'pool_transposed_cudnn'] + # The implementations of the 'v1' pooling operator + v1_pool_op_list = ['pool_v1_cpu', 'pool_v1_gpu'] + # For those cases when all implementations should match- the combined implementation list. + combo_pool_op_list = std_pool_op_list + v1_pool_op_list + + for dtype in [np.float32, np.float64, np.float16]: + # Testing of the standard (not 'v1') pooling operator is universal across all + # data dimensions, implementations and layouts. + for dim in ['1D', '2D', '3D']: + test_pooling_dim(dim, 'max', dtype, std_pool_op_list) + test_pooling_dim(dim, 'avg', dtype, std_pool_op_list, count_include_pad=True) + test_pooling_dim(dim, 'avg', dtype, std_pool_op_list, count_include_pad=False) + test_pooling_dim(dim, 'sum', dtype, std_pool_op_list) + test_pooling_dim(dim, 'lp', dtype, std_pool_op_list, p_value=1) + test_pooling_dim(dim, 'lp', dtype, std_pool_op_list, p_value=2) + test_pooling_dim(dim, 'lp', dtype, std_pool_op_list, p_value=3) + + # Testing of the 'v1' pooling operator is over its restricted support domain of + # 2D data only and not with the 'lp' pooling type. The 'v1' cpu and gpu versions are + # always tested against each other, and sometimes against the standard operator versions. + # The slightly different 'v1' definition prevents this in the following cases: + # + # 1. In max pooling, when multiple input values are the maximum in the input window, + # the 'v1' implementation backprops the gradient to all maxima, whereas the standard + # pooling operator backprops the gradient to the lowest-indexed maximum only. + # 2. In max pooling, the 'v1' operator pads with 0's and this value can become the + # maximum output value in the case of an all-negative input. The standard pooling + # operator effectively considers the padding to be the largest negative value, so + # only input values should appear in the output. + # 3. In avg pooling, the 'v1' operator divides the sum by the same window size factor, + # even at the edges, and so does not support count_include_pad = False. + + # Exclude std implementations due to points 1 and 2 above. + test_pooling_dim('2D', 'max', dtype, v1_pool_op_list) + # The standard and 'v1' implementations match for this case. + test_pooling_dim('2D', 'avg', dtype, combo_pool_op_list, count_include_pad=True) + # Exclude std implementations due to point 3 above. + test_pooling_dim('2D', 'avg', dtype, v1_pool_op_list, count_include_pad=False) + # The standard and 'v1' implementations match for this case. + test_pooling_dim('2D', 'sum', dtype, combo_pool_op_list) + + # We can compare the standard and 'v1' max pooling implementations if we eliminate padding + # (see point 2 above) and use np.float64 data so that no two random input window values are + # likely to be the same (see point 1 above). + test_pooling_dim('2D_no_padding', 'max', np.float64, combo_pool_op_list) @with_seed() From 19d9453731b68983f9955bfc8b750d22bd064d3b Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Tue, 1 Jan 2019 15:23:50 -0800 Subject: [PATCH 18/32] Add pooling test diagnostic output. 
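When one of these cross-implementation comparisons fails, the parameters printed below are usually enough to reproduce it by hand. The padding behaviour described in point 2 of the comment above is the classic source of such a divergence, and it is easy to reproduce outside MXNet (plain NumPy sketch, not the operators themselves; the input values are arbitrary):

    import numpy as np

    row = np.array([-3., -1., -2., -4.])   # an all-negative input row
    window = row[2:4]                       # a window that also overlaps one padded column

    # 'v1'-style: padding contributes explicit zeros, which win the max.
    print(max(window.max(), 0.0))           # 0.0
    # standard-style: padding behaves like -inf, so only real inputs can appear.
    print(window.max())                     # -2.0
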
--- tests/python/gpu/test_operator_gpu.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/tests/python/gpu/test_operator_gpu.py b/tests/python/gpu/test_operator_gpu.py index 2895e32cc7c9..22fc1913ef0c 100644 --- a/tests/python/gpu/test_operator_gpu.py +++ b/tests/python/gpu/test_operator_gpu.py @@ -963,11 +963,21 @@ def test_pooling_dim(dim, pool_type, dtype, pool_op_list, p_value=2, count_inclu for pad, stride in zip(pads, strides): for pooling_convention in ['valid', 'full']: - test_pooling_versions_helper(pool_op_list=pool_op_list, + try: + test_pooling_versions_helper(pool_op_list=pool_op_list, data=data, kernel=kernel, pad=pad, stride=stride, pool_type=pool_type, pooling_convention=pooling_convention, global_pool=False, p_value=p_value, count_include_pad=count_include_pad, dtype=dtype) + except: + print('pool_op_list = {}'.format(pool_op_list) + print('kernel={}, pad={}, stride={}'.format(kernel, pad, stride)) + print('pool_type={}, pooling_convention={}, global_pool=False'.format(pool_type, + pooling_convention)) + print('p_value={}, count_include_pad={}, dtype={}'.format(p_value, + count_include_pad, dtype)) + print('environ = \n{}'.format(os.environ)) + raise test_pooling_versions_helper(pool_op_list=pool_op_list, data=data, kernel=kernel, pad=None, stride=None, From 086ba9e6dc12c3956c166ba401157c3f2ebfa4fc Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Tue, 1 Jan 2019 16:15:34 -0800 Subject: [PATCH 19/32] Fix syntax. --- tests/python/gpu/test_operator_gpu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/python/gpu/test_operator_gpu.py b/tests/python/gpu/test_operator_gpu.py index 22fc1913ef0c..2d72bd7204cb 100644 --- a/tests/python/gpu/test_operator_gpu.py +++ b/tests/python/gpu/test_operator_gpu.py @@ -970,7 +970,7 @@ def test_pooling_dim(dim, pool_type, dtype, pool_op_list, p_value=2, count_inclu global_pool=False, p_value=p_value, count_include_pad=count_include_pad, dtype=dtype) except: - print('pool_op_list = {}'.format(pool_op_list) + print('pool_op_list = {}'.format(pool_op_list)) print('kernel={}, pad={}, stride={}'.format(kernel, pad, stride)) print('pool_type={}, pooling_convention={}, global_pool=False'.format(pool_type, pooling_convention)) From e913e0f6ff7c52bc533db1ae4929c9e1da5e5f12 Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Thu, 3 Jan 2019 13:16:04 -0800 Subject: [PATCH 20/32] Fix pooling FInplaceOption so it can be shared by all implementations. --- src/operator/nn/pooling.cc | 11 ++++++++--- tests/python/gpu/test_operator_gpu.py | 2 -- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/src/operator/nn/pooling.cc b/src/operator/nn/pooling.cc index 1dc641aa0f9c..fd6a1b94d71d 100644 --- a/src/operator/nn/pooling.cc +++ b/src/operator/nn/pooling.cc @@ -463,10 +463,15 @@ NNVM_REGISTER_OP(_backward_Pooling) .set_attr( "FInplaceOption", [](const NodeAttrs &attrs) { -#if MXNET_USE_CUDNN == 1 - return std::vector >(); +#if MXNET_USE_MKLDNN == 1 && MXNET_USE_CUDA == 0 && MXNET_USE_CUDNN == 0 + // Since this routine is not provided the cpu/gpu context info, only in the case + // where CUDA and CUDNN implementations are not available can we be sure the MKLDNN + // implementation will be employed. The MKLDNN FInplaceOptions are not compatible + // with the other (i.e. cpu, cuda and cudnn) implementations. 
+ if (MKLDNNRequireWorkspace(param) && SupportMKLDNNPooling(param)) + return std::vector >{{1, 0}}; #else - return std::vector >{{1, 0}}; + return std::vector >(); #endif }) #if MXNET_USE_MKLDNN == 1 diff --git a/tests/python/gpu/test_operator_gpu.py b/tests/python/gpu/test_operator_gpu.py index 2d72bd7204cb..4fe5819310d1 100644 --- a/tests/python/gpu/test_operator_gpu.py +++ b/tests/python/gpu/test_operator_gpu.py @@ -642,7 +642,6 @@ def transpose(shape): @with_seed() -@assert_raises_cudnn_not_satisfied(min_version='7.0.1') def test_pooling_nhwc_with_convention(): def make_pooling_syms(**kwargs): # Conventional NCHW layout pooling @@ -856,7 +855,6 @@ def test_pooling_with_type2(): check_consistency(sym, ctx_list) @with_seed() -@assert_raises_cudnn_not_satisfied(min_version='7.0.1') def test_pooling_nhwc_with_type(): def make_pooling_syms(**kwargs): # Conventional NCHW layout pooling From 179a5bf595332e09a9bd003dc60081567b2c8975 Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Thu, 3 Jan 2019 13:56:28 -0800 Subject: [PATCH 21/32] Add missing param definition. --- src/operator/nn/pooling.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/src/operator/nn/pooling.cc b/src/operator/nn/pooling.cc index fd6a1b94d71d..bac11e9fa2d2 100644 --- a/src/operator/nn/pooling.cc +++ b/src/operator/nn/pooling.cc @@ -468,6 +468,7 @@ NNVM_REGISTER_OP(_backward_Pooling) // where CUDA and CUDNN implementations are not available can we be sure the MKLDNN // implementation will be employed. The MKLDNN FInplaceOptions are not compatible // with the other (i.e. cpu, cuda and cudnn) implementations. + const PoolingParam ¶m = nnvm::get(attrs.parsed); if (MKLDNNRequireWorkspace(param) && SupportMKLDNNPooling(param)) return std::vector >{{1, 0}}; #else From 21336d97796f6cdea4257ea0d79d931ccdfce848 Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Thu, 3 Jan 2019 14:28:47 -0800 Subject: [PATCH 22/32] Fix #if logic. --- src/operator/nn/pooling.cc | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/operator/nn/pooling.cc b/src/operator/nn/pooling.cc index bac11e9fa2d2..9983c75cf3d2 100644 --- a/src/operator/nn/pooling.cc +++ b/src/operator/nn/pooling.cc @@ -471,9 +471,8 @@ NNVM_REGISTER_OP(_backward_Pooling) const PoolingParam ¶m = nnvm::get(attrs.parsed); if (MKLDNNRequireWorkspace(param) && SupportMKLDNNPooling(param)) return std::vector >{{1, 0}}; -#else - return std::vector >(); #endif + return std::vector >(); }) #if MXNET_USE_MKLDNN == 1 .set_attr("FResourceRequest", [](const NodeAttrs& n) { From 403bd70f65c69817cebb6cfdfbfce0fc970574f0 Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Tue, 15 Jan 2019 11:35:59 -0800 Subject: [PATCH 23/32] Temp switch to DickJC123/mshadow: shows effect of half round-to-nearest on cpu. 
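The temporary submodule switch picks up an mshadow branch whose CPU float-to-half conversion rounds to nearest; the following commit moves back to dmlc/mshadow once that rounding change is upstream. Round-to-nearest (ties to even) itself is easy to see with NumPy, whose float16 conversion follows the IEEE default (illustration of the rounding rule only, independent of mshadow):

    import numpy as np

    # float16 spacing between 2048 and 4096 is 2.0, so odd integers in that range
    # sit exactly halfway between two representable values.
    print(np.float16(2049.0))   # 2048.0 -- ties round to the even neighbour
    print(np.float16(2051.0))   # 2052.0
    # A truncating conversion would give 2048.0 and 2050.0 instead, and that kind of
    # difference is what shows up when float16 results are compared across devices.
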
--- .gitmodules | 2 +- 3rdparty/mshadow | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.gitmodules b/.gitmodules index 836d824a6f5a..c6e929f317a6 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,6 +1,6 @@ [submodule "3rdparty/mshadow"] path = 3rdparty/mshadow - url = https://github.com/dmlc/mshadow.git + url = https://github.com/DickJC123/mshadow.git [submodule "3rdparty/dmlc-core"] path = 3rdparty/dmlc-core url = https://github.com/dmlc/dmlc-core.git diff --git a/3rdparty/mshadow b/3rdparty/mshadow index 696803bd7723..f607798892e2 160000 --- a/3rdparty/mshadow +++ b/3rdparty/mshadow @@ -1 +1 @@ -Subproject commit 696803bd7723ade8230af878460d96c68a550fbc +Subproject commit f607798892e25c0c6538adf8425d01115da21ba6 From 95469dbbf56598be2972688ef32958ebbafdd89c Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Mon, 28 Jan 2019 14:12:10 -0800 Subject: [PATCH 24/32] Move back to dmlc/mshadow.git, now with float->half rounding. --- .gitmodules | 2 +- 3rdparty/mshadow | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.gitmodules b/.gitmodules index c6e929f317a6..836d824a6f5a 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,6 +1,6 @@ [submodule "3rdparty/mshadow"] path = 3rdparty/mshadow - url = https://github.com/DickJC123/mshadow.git + url = https://github.com/dmlc/mshadow.git [submodule "3rdparty/dmlc-core"] path = 3rdparty/dmlc-core url = https://github.com/dmlc/dmlc-core.git diff --git a/3rdparty/mshadow b/3rdparty/mshadow index f607798892e2..3dc80815d965 160000 --- a/3rdparty/mshadow +++ b/3rdparty/mshadow @@ -1 +1 @@ -Subproject commit f607798892e25c0c6538adf8425d01115da21ba6 +Subproject commit 3dc80815d965b56b9a975dc27229361955bf66fe From 098bc49f1d288ea9f2b64453aefcc1537ca5254e Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Mon, 4 Feb 2019 16:56:14 -0800 Subject: [PATCH 25/32] Avoid underflow of lp pooling calc for dtype=float16. --- src/operator/nn/pool.cuh | 23 ++++++++------ src/operator/nn/pool.h | 46 +++++++++++++++------------ src/operator/nn/pool_utils.h | 20 +++++++++--- tests/python/gpu/test_operator_gpu.py | 23 +++++++++++--- 4 files changed, 73 insertions(+), 39 deletions(-) diff --git a/src/operator/nn/pool.cuh b/src/operator/nn/pool.cuh index 69c630508304..671bc7932ef9 100644 --- a/src/operator/nn/pool.cuh +++ b/src/operator/nn/pool.cuh @@ -227,6 +227,7 @@ __global__ void pool_sum_1d_gpu_kernel(const int nthreads, const DType* in_data, const int stride_w, const int pad_w, DType* out_data, const bool get_avg = false, const bool count_include_pad = true) { + using AccType = typename PoolingTypes::AccType; CUDA_KERNEL_LOOP(index, nthreads) { const bool nwc_layout = layout == mshadow::kNWC; const int idx = nwc_layout ? (index / channels) : index; @@ -241,14 +242,14 @@ __global__ void pool_sum_1d_gpu_kernel(const int nthreads, const DType* in_data, if (get_avg && !count_include_pad) { pool_size = (wend - wstart); } - DType sum = 0; + AccType sum = 0; const DType* out_slice = nwc_layout ? in_data + n * channels * width + c : in_data + (n * channels + c) * width; const int multiplier = nwc_layout ? 
channels : 1; for (int w = wstart; w < wend; ++w) { - sum += a_pow_p::Map(out_slice[w * multiplier]) / pool_size; + sum += a_pow_p::Map(out_slice[w * multiplier]) / pool_size; } - out_data[index] = a_root_p::Map(sum); + out_data[index] = a_root_p::Map(sum); } } @@ -265,6 +266,7 @@ __global__ void pool_sum_2d_gpu_kernel(const int nthreads, const DType* in_data, const int pad_h, const int pad_w, DType* out_data, const bool get_avg = false, const bool count_include_pad = true) { + using AccType = typename PoolingTypes::AccType; CUDA_KERNEL_LOOP(index, nthreads) { const bool nhwc_layout = layout == mshadow::kNHWC; const int idx = nhwc_layout ? (index / channels) : index; @@ -285,16 +287,16 @@ __global__ void pool_sum_2d_gpu_kernel(const int nthreads, const DType* in_data, if (get_avg && !count_include_pad) { pool_size = (hend - hstart) * (wend - wstart); } - DType sum = 0; + AccType sum = 0; const DType* out_slice = nhwc_layout ? in_data + n * channels * height * width + c : in_data + (n * channels + c) * height * width; const int multiplier = nhwc_layout ? channels : 1; for (int h = hstart; h < hend; ++h) { for (int w = wstart; w < wend; ++w) { - sum += a_pow_p::Map(out_slice[(h * width + w) * multiplier]) / pool_size; + sum += a_pow_p::Map(out_slice[(h * width + w) * multiplier]) / pool_size; } } - out_data[index] = a_root_p::Map(sum); + out_data[index] = a_root_p::Map(sum); } } @@ -312,6 +314,7 @@ __global__ void pool_sum_3d_gpu_kernel(const int nthreads, const DType* in_data, const int pad_d, const int pad_h, const int pad_w, DType* out_data, const bool get_avg = false, const bool count_include_pad = true) { + using AccType = typename PoolingTypes::AccType; CUDA_KERNEL_LOOP(index, nthreads) { const bool ndhwc_layout = layout == mshadow::kNDHWC; const int idx = ndhwc_layout ? (index / channels) : index; @@ -337,21 +340,21 @@ __global__ void pool_sum_3d_gpu_kernel(const int nthreads, const DType* in_data, if (get_avg && !count_include_pad) { pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart); } - DType sum = 0; + AccType sum = 0; const DType* out_slice = ndhwc_layout ? in_data + n * channels * depth * height * width + c : in_data + (n * channels + c) * depth * height * width; const int multiplier = ndhwc_layout ? channels : 1; for (int d = dstart; d < dend; ++d) { for (int h = hstart; h < hend; ++h) { for (int w = wstart; w < wend; ++w) { - sum += a_pow_p::Map(out_slice[((d * height + h) * width + w) * + sum += a_pow_p::Map(out_slice[((d * height + h) * width + w) * multiplier]) / pool_size; } } } out_data[index] = (pool_size == 0) ? 
- DType(nanf("")) : - a_root_p::Map(sum); + AccType(nanf("")) : + a_root_p::Map(sum); } } diff --git a/src/operator/nn/pool.h b/src/operator/nn/pool.h index 9cd779134228..3c8c19a02607 100644 --- a/src/operator/nn/pool.h +++ b/src/operator/nn/pool.h @@ -361,6 +361,7 @@ inline void pool_sum_1d_ncw_cpu(const DType *in_data, const TShape &ishape, cons const TShape &kernel, const TShape &pad, const TShape &stride, DType *out_data, const bool get_avg = false, const bool count_include_pad = true) { + using AccType = typename PoolingTypes::AccType; const int width = ishape[2]; const int pooled_width = oshape[2]; const int kernel_w = kernel[0]; @@ -379,11 +380,11 @@ inline void pool_sum_1d_ncw_cpu(const DType *in_data, const TShape &ishape, cons if (get_avg && !count_include_pad) { pool_size = (wend - wstart); } - DType sum = 0; + AccType sum = 0; for (int w = wstart; w < wend; ++w) { - sum += a_pow_p::Map(in_data[w]) / pool_size; + sum += a_pow_p::Map(in_data[w]) / pool_size; } - out_data[pw] = a_root_p::Map(sum); + out_data[pw] = a_root_p::Map(sum); } in_data += in_data_offset; out_data += out_data_offset; @@ -400,6 +401,7 @@ inline void pool_sum_1d_nwc_cpu(const DType* in_data, const TShape& ishape, cons const TShape& kernel, const TShape& pad, const TShape& stride, DType* out_data, const bool get_avg = false, const bool count_include_pad = true) { + using AccType = typename PoolingTypes::AccType; const int width = ishape[1]; const int pooled_width = oshape[1]; const int kernel_w = kernel[0]; @@ -408,7 +410,7 @@ inline void pool_sum_1d_nwc_cpu(const DType* in_data, const TShape& ishape, cons const int features = oshape[2]; const index_t in_data_offset = ishape[1] * features; const index_t out_data_offset = oshape[1] * features; - std::vector sums(features); + std::vector sums(features); for (index_t n = 0; n < oshape[0]; ++n) { for (int pw = 0; pw < pooled_width; ++pw) { int wstart = pw * stride_w - pad_w; @@ -422,11 +424,11 @@ inline void pool_sum_1d_nwc_cpu(const DType* in_data, const TShape& ishape, cons std::fill(sums.begin(), sums.end(), 0); for (int w = wstart; w < wend; ++w) { for (index_t c = 0; c < features; ++c) { - sums[c] += a_pow_p::Map(in_data[w * features + c]) / pool_size; + sums[c] += a_pow_p::Map(in_data[w * features + c]) / pool_size; } } for (index_t c = 0; c < features; ++c) - out_data[pw * features + c] = a_root_p::Map(sums[c]); + out_data[pw * features + c] = a_root_p::Map(sums[c]); } in_data += in_data_offset; out_data += out_data_offset; @@ -442,6 +444,7 @@ inline void pool_sum_2d_nchw_cpu(const DType *in_data, const TShape &ishape, con const TShape &kernel, const TShape &pad, const TShape &stride, DType *out_data, const bool get_avg = false, const bool count_include_pad = true) { + using AccType = typename PoolingTypes::AccType; const int height = ishape[2], width = ishape[3]; const int pooled_height = oshape[2], pooled_width = oshape[3]; const int kernel_h = kernel[0], kernel_w = kernel[1]; @@ -465,13 +468,13 @@ inline void pool_sum_2d_nchw_cpu(const DType *in_data, const TShape &ishape, con if (get_avg && !count_include_pad) { pool_size = (hend - hstart) * (wend - wstart); } - DType sum = 0; + AccType sum = 0; for (int h = hstart; h < hend; ++h) { for (int w = wstart; w < wend; ++w) { - sum += a_pow_p::Map(in_data[h*width+w]) / pool_size; + sum += a_pow_p::Map(in_data[h*width+w]) / pool_size; } } - out_data[ph*pooled_width+pw] = a_root_p::Map(sum); + out_data[ph*pooled_width+pw] = a_root_p::Map(sum); } } in_data += in_data_offset; @@ -489,6 +492,7 @@ inline void 
pool_sum_2d_nhwc_cpu(const DType* in_data, const TShape& ishape, con const TShape& kernel, const TShape& pad, const TShape& stride, DType* out_data, const bool get_avg = false, const bool count_include_pad = true) { + using AccType = typename PoolingTypes::AccType; const int height = ishape[1], width = ishape[2]; const int pooled_height = oshape[1], pooled_width = oshape[2]; const int kernel_h = kernel[0], kernel_w = kernel[1]; @@ -497,7 +501,7 @@ inline void pool_sum_2d_nhwc_cpu(const DType* in_data, const TShape& ishape, con const int features = oshape[3]; const index_t in_data_offset = ishape[1] * ishape[2] * features; const index_t out_data_offset = oshape[1] * oshape[2] * features; - std::vector sums(features); + std::vector sums(features); for (index_t n = 0; n < oshape[0]; ++n) { for (int ph = 0; ph < pooled_height; ++ph) { for (int pw = 0; pw < pooled_width; ++pw) { @@ -519,12 +523,12 @@ inline void pool_sum_2d_nhwc_cpu(const DType* in_data, const TShape& ishape, con for (int w = wstart; w < wend; ++w) { const int in_index = h * width + w; for (index_t c = 0; c < features; ++c) { - sums[c] += a_pow_p::Map(in_data[in_index * features + c]) / pool_size; + sums[c] += a_pow_p::Map(in_data[in_index * features + c]) / pool_size; } } } for (index_t c = 0; c < features; ++c) - out_data[pool_index * features + c] = a_root_p::Map(sums[c]); + out_data[pool_index * features + c] = a_root_p::Map(sums[c]); } } in_data += in_data_offset; @@ -541,6 +545,7 @@ inline void pool_sum_3d_ncdhw_cpu(const DType *in_data, const TShape &ishape, co const TShape &kernel, const TShape &pad, const TShape &stride, DType *out_data, const bool get_avg = false, const bool count_include_pad = true) { + using AccType = typename PoolingTypes::AccType; const int depth = ishape[2], height = ishape[3], width = ishape[4]; const int pooled_depth = oshape[2], pooled_height = oshape[3], pooled_width = oshape[4]; const int kernel_d = kernel[0], kernel_h = kernel[1], kernel_w = kernel[2]; @@ -569,17 +574,17 @@ inline void pool_sum_3d_ncdhw_cpu(const DType *in_data, const TShape &ishape, co if (get_avg && !count_include_pad) { pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart); } - DType sum = 0; + AccType sum = 0; for (int d = dstart; d < dend; ++d) { for (int h = hstart; h < hend; ++h) { for (int w = wstart; w < wend; ++w) { - sum += a_pow_p::Map(in_data[(d*height+h)*width+w]) / pool_size; + sum += a_pow_p::Map(in_data[(d*height+h)*width+w]) / pool_size; } } } out_data[(pd*pooled_height+ph)*pooled_width+pw] = (pool_size == 0) ? 
- DType(nanf("")) : - a_root_p::Map(sum); + AccType(nanf("")) : + a_root_p::Map(sum); } } } @@ -598,6 +603,7 @@ inline void pool_sum_3d_ndhwc_cpu(const DType* in_data, const TShape& ishape, co const TShape& kernel, const TShape& pad, const TShape& stride, DType* out_data, const bool get_avg = false, const bool count_include_pad = true) { + using AccType = typename PoolingTypes::AccType; const int depth = ishape[1], height = ishape[2], width = ishape[3]; const int pooled_depth = oshape[1], pooled_height = oshape[2], pooled_width = oshape[3]; const int kernel_d = kernel[0], kernel_h = kernel[1], kernel_w = kernel[2]; @@ -606,7 +612,7 @@ inline void pool_sum_3d_ndhwc_cpu(const DType* in_data, const TShape& ishape, co const int features = oshape[4]; const index_t in_data_offset = ishape[1] * ishape[2] * ishape[3] * features; const index_t out_data_offset = oshape[1] * oshape[2] * oshape[3] * features; - std::vector sums(features); + std::vector sums(features); for (index_t n = 0; n < oshape[0]; ++n) { for (int pd = 0; pd < pooled_depth; ++pd) { for (int ph = 0; ph < pooled_height; ++ph) { @@ -634,15 +640,15 @@ inline void pool_sum_3d_ndhwc_cpu(const DType* in_data, const TShape& ishape, co for (int w = wstart; w < wend; ++w) { const int in_index = (d * height + h) * width + w; for (index_t c = 0; c < features; ++c) { - sums[c] += a_pow_p::Map(in_data[in_index * features + c]) / pool_size; + sums[c] += a_pow_p::Map(in_data[in_index * features + c]) / pool_size; } } } } for (index_t c = 0; c < features; ++c) out_data[pool_index * features + c] = (pool_size == 0) ? - DType(nanf("")) : - a_root_p::Map(sums[c]); + AccType(nanf("")) : + a_root_p::Map(sums[c]); } } } diff --git a/src/operator/nn/pool_utils.h b/src/operator/nn/pool_utils.h index 7b2657451ba7..6bf7235048dc 100644 --- a/src/operator/nn/pool_utils.h +++ b/src/operator/nn/pool_utils.h @@ -25,6 +25,17 @@ namespace mxnet { namespace op { +// Define an accumulator type AccType to permit float16-I/O lp pooling to avoid underflow. +template +struct PoolingTypes { + typedef DType AccType; +}; + +template<> +struct PoolingTypes { + typedef float AccType; +}; + template struct a_pow_p { static MSHADOW_XINLINE DType Map(const DType a) { @@ -98,16 +109,17 @@ struct lp_grad { template struct lp_grad { static MSHADOW_XINLINE DType Map(const DType grad, const DType in_data, const DType out_data) { - // Avoid nan result if both grad and out_data are 0. - return (grad == DType(0.0)) ? DType(0.0) : grad * in_data / out_data; + // Avoid inf, if out_data has underflowed to 0 for a non-zero input, or nan if grad is also 0. + return (out_data == DType(0.0)) ? DType(0.0) : grad * (in_data / out_data); } }; template struct lp_grad { static MSHADOW_XINLINE DType Map(const DType grad, const DType in_data, const DType out_data) { - // Avoid nan result if both grad and out_data are 0. - return (grad == DType(0.0)) ? DType(0.0) : grad * in_data * in_data / (out_data * out_data); + // Avoid inf, if out_data has underflowed to 0 for a non-zero input, or nan if grad is also 0. + DType in_out_ratio = in_data / out_data; + return (out_data == DType(0.0)) ? 
DType(0.0) : grad * in_out_ratio * in_out_ratio; } }; diff --git a/tests/python/gpu/test_operator_gpu.py b/tests/python/gpu/test_operator_gpu.py index af1e15f3bfa0..ccd94263db15 100644 --- a/tests/python/gpu/test_operator_gpu.py +++ b/tests/python/gpu/test_operator_gpu.py @@ -935,7 +935,8 @@ def test_pooling_versions_helper(pool_op_list, data, kernel, pool_type, pad, str check_consistency(sym_list, ctx_list, equal_nan=(not count_include_pad), tol=tol) - def test_pooling_dim(dim, pool_type, dtype, pool_op_list, p_value=2, count_include_pad=True): + def test_pooling_dim(dim, pool_type, dtype, pool_op_list, p_value=2, count_include_pad=True, + tol=None): if dim == '1D': data = (3, 3, 10) kernel = (4,) @@ -966,7 +967,7 @@ def test_pooling_dim(dim, pool_type, dtype, pool_op_list, p_value=2, count_inclu data=data, kernel=kernel, pad=pad, stride=stride, pool_type=pool_type, pooling_convention=pooling_convention, global_pool=False, p_value=p_value, - count_include_pad=count_include_pad, dtype=dtype) + count_include_pad=count_include_pad, tol=tol, dtype=dtype) except: print('pool_op_list = {}'.format(pool_op_list)) print('kernel={}, pad={}, stride={}'.format(kernel, pad, stride)) @@ -980,7 +981,7 @@ def test_pooling_dim(dim, pool_type, dtype, pool_op_list, p_value=2, count_inclu test_pooling_versions_helper(pool_op_list=pool_op_list, data=data, kernel=kernel, pad=None, stride=None, pool_type=pool_type, global_pool=True, p_value=p_value, - count_include_pad=count_include_pad, dtype=dtype) + count_include_pad=count_include_pad, tol=tol, dtype=dtype) # The various implementations of the standard pooling operator std_pool_op_list = ['pool_cpu', 'pool_transposed_cpu', @@ -1017,15 +1018,27 @@ def test_pooling_dim(dim, pool_type, dtype, pool_op_list, p_value=2, count_inclu # only input values should appear in the output. # 3. In avg pooling, the 'v1' operator divides the sum by the same window size factor, # even at the edges, and so does not support count_include_pad = False. + # 4. The float16 'v1' pooling operator performs forward sums and averages in + # float16, whereas the std operators perform those calculations in float32, so + # greater float16 tolerances are needed when comparing across implementations. + + # Double the float16 tol when comparing v1 and non-v1 implemenations, per note 4 above. + relaxed_tol = {np.dtype(np.float16): 2e-1, + np.dtype(np.float32): 1e-3, + np.dtype(np.float64): 1e-5, + np.dtype(np.uint8): 0, + np.dtype(np.int32): 0, + np.dtype(np.int64): 0} # Exclude std implementations due to points 1 and 2 above. test_pooling_dim('2D', 'max', dtype, v1_pool_op_list) # The standard and 'v1' implementations match for this case. - test_pooling_dim('2D', 'avg', dtype, combo_pool_op_list, count_include_pad=True) + test_pooling_dim('2D', 'avg', dtype, combo_pool_op_list, count_include_pad=True, + tol=relaxed_tol) # Exclude std implementations due to point 3 above. test_pooling_dim('2D', 'avg', dtype, v1_pool_op_list, count_include_pad=False) # The standard and 'v1' implementations match for this case. 
- test_pooling_dim('2D', 'sum', dtype, combo_pool_op_list) + test_pooling_dim('2D', 'sum', dtype, combo_pool_op_list, tol=relaxed_tol) # We can compare the standard and 'v1' max pooling implementations if we eliminate padding # (see point 2 above) and use np.float64 data so that no two random input window values are From 0627df420002d1fa45f893e30c4a09bbaf97c63d Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Tue, 5 Feb 2019 15:04:01 -0800 Subject: [PATCH 26/32] Remove redundant pooling test. --- tests/python/gpu/test_operator_gpu.py | 70 ++++++++++++--------------- 1 file changed, 30 insertions(+), 40 deletions(-) diff --git a/tests/python/gpu/test_operator_gpu.py b/tests/python/gpu/test_operator_gpu.py index ccd94263db15..a78f34a4ed03 100644 --- a/tests/python/gpu/test_operator_gpu.py +++ b/tests/python/gpu/test_operator_gpu.py @@ -607,40 +607,7 @@ def test_convolution_versions(): check_consistency(syms, ctx_list) -@with_seed() -def test_pooling_with_convention(): - # transpose shape to bring feature dimension 'c' from 2nd position to last - def transpose(shape): - return (shape[0],) + shape[2:] + (shape[1],) - # While the float32 and float64 output is reliably consistent, float16 departs occasionally. - # We compare cpu and gpu results only within a given precision. - for layout in ['NCHW', 'NHWC']: - for data_type in [np.float64, np.float32, np.float16]: - logical_inshape = (2, 2, 10, 10) - inshape = logical_inshape if layout == 'NCHW' else transpose(logical_inshape) - gpu_ctx = {'ctx': mx.gpu(0), 'pool_data': inshape, - 'type_dict': {'pool_data': data_type}} - cpu_ctx = {'ctx': mx.cpu(0), 'pool_data': inshape, - 'type_dict': {'pool_data': data_type}} - sym = mx.sym.Pooling(kernel=(3,3), pool_type='max', pooling_convention='valid', - name='pool', layout = layout, cudnn_off = False) - sym_no_cudnn = mx.sym.Pooling(kernel=(3,3), pool_type='max', pooling_convention='valid', - name='pool', layout = layout, cudnn_off = True) - check_consistency([sym, sym_no_cudnn, sym], [gpu_ctx, gpu_ctx, cpu_ctx]) - - sym = mx.sym.Pooling(kernel=(3,3), pool_type='max', pooling_convention='full', - name='pool', layout = layout, cudnn_off = False) - sym_no_cudnn = mx.sym.Pooling(kernel=(3,3), pool_type='max', pooling_convention='full', - name='pool', layout = layout, cudnn_off = True) - check_consistency([sym, sym_no_cudnn, sym], [gpu_ctx, gpu_ctx, cpu_ctx]) - - sym = mx.sym.Pooling(kernel=(300,300), pool_type='max', global_pool=True, - name='pool', layout = layout, cudnn_off = False) - sym_no_cudnn = mx.sym.Pooling(kernel=(300,300), pool_type='max', global_pool=True, - name='pool', layout = layout, cudnn_off = True) - check_consistency([sym, sym_no_cudnn, sym], [gpu_ctx, gpu_ctx, cpu_ctx]) - - +# More max-pooling strides and pads to test cudnn pooling implementation code paths @with_seed() def test_pooling_nhwc_with_convention(): def make_pooling_syms(**kwargs): @@ -884,11 +851,24 @@ def make_pooling_syms(**kwargs): @with_seed() def test_pooling_versions(): + # Produce the name of the 'transposed' layout, given the dimension def transposed_layout(ndim): if ndim < 3 or ndim > 5: raise RuntimeError("Invalid data dim, expecting 3, 4 or 5") return ('NWC', 'NHWC', 'NDHWC')[ndim-3] + # default padding is all zeros + def is_default_pad(pad): + return pad == (0,) * len(pad) + + # default stride is all ones + def is_default_stride(stride): + return stride == (1,) * len(stride) + + # returns True/False randomly with equal probability + def random_choice(): + return np.random.random(1)[0] < 0.5 + def 
test_pooling_versions_helper(pool_op_list, data, kernel, pool_type, pad, stride, pooling_convention='valid', global_pool=False, p_value=2, count_include_pad=True, tol=None, dtype=np.float32): @@ -908,7 +888,12 @@ def test_pooling_versions_helper(pool_op_list, data, kernel, pool_type, pad, str if global_pool: pool_op_args['global_pool'] = True else: - pool_op_args.update({'pad' : pad, 'stride' : stride}) + # Add pad and stride param if needed, plus randomly when it matches the default + if not is_default_pad(pad) or random_choice(): + pool_op_args.update({'pad' : pad}) + if not is_default_stride(stride) or random_choice(): + pool_op_args.update({'stride' : stride}) + expected_pool_ops = ['pool', 'pool_transposed', 'pool_v1'] if pool_op == 'pool_v1': sym = mx.sym.Pooling_v1(**pool_op_args) @@ -939,28 +924,28 @@ def test_pooling_dim(dim, pool_type, dtype, pool_op_list, p_value=2, count_inclu tol=None): if dim == '1D': data = (3, 3, 10) - kernel = (4,) + kernels = [(4,), (4,)] pads = [(0,), (2,)] strides = [(1,), (2,)] elif dim == '2D_no_padding': data = (3, 2, 20, 20) - kernel = (4, 5) + kernels = [(3, 3), (4, 5)] pads = [(0, 0), (0, 0)] strides = [(1, 1), (2,1)] elif dim == '2D': data = (2, 2, 20, 20) - kernel = (4, 5) + kernels = [(3, 3), (4, 5), (4, 5)] pads = [(0, 0), (0, 0), (2, 3)] strides = [(1, 1), (2,1), (1, 1)] elif dim == '3D': data = (2, 3, 20, 20, 20) - kernel = (4, 5, 3) + kernels = [(4, 5, 3), (4, 5, 3)] pads = [(0, 0, 0), (2, 3, 2)] strides = [(1, 1, 1), (2, 3, 1)] else: raise RuntimeError('Unexpected pooling test class: {}.'.format(dim)) - for pad, stride in zip(pads, strides): + for kernel, pad, stride in zip(kernels, pads, strides): for pooling_convention in ['valid', 'full']: try: test_pooling_versions_helper(pool_op_list=pool_op_list, @@ -978,6 +963,11 @@ def test_pooling_dim(dim, pool_type, dtype, pool_op_list, p_value=2, count_inclu print('environ = \n{}'.format(os.environ)) raise + # Make sure kernel is ignored during global_pool by sometimes setting it to a crazy value + kernel = kernels[0] + if random_choice(): + kernel = (300,) * len(kernel) + test_pooling_versions_helper(pool_op_list=pool_op_list, data=data, kernel=kernel, pad=None, stride=None, pool_type=pool_type, global_pool=True, p_value=p_value, From 16bfef091623bafe78326eb52b8310d56e52a900 Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Wed, 6 Feb 2019 13:08:23 -0800 Subject: [PATCH 27/32] Minor variable naming fixes. 
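These are cosmetic renames in the cuDNN pooling path (window_height/window_width become kernel_height/kernel_width). For background, the float16 underflow that the earlier lp-pooling accumulator (AccType) change works around can be reproduced standalone (NumPy sketch of the numerical issue, not MXNet code; the input value is arbitrary):

    import numpy as np

    x = np.float16(1e-4)        # a small float16 activation value (normal, not subnormal)
    print(x * x)                # 0.0     -- squaring underflows in float16
    print(np.float32(x) ** 2)   # ~1e-08  -- survives when the power is taken in float32

    # If the forward lp-2 output underflows to 0.0 like this, a backward formula that
    # divides by it (grad * in / out) would produce inf or nan, hence the guard in
    # lp_grad and the float32 accumulator for float16 I/O.
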
--- src/operator/nn/cudnn/cudnn_pooling-inl.h | 30 +++++++++++------------ 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/src/operator/nn/cudnn/cudnn_pooling-inl.h b/src/operator/nn/cudnn/cudnn_pooling-inl.h index 8dad756f6c77..d37668de6bed 100644 --- a/src/operator/nn/cudnn/cudnn_pooling-inl.h +++ b/src/operator/nn/cudnn/cudnn_pooling-inl.h @@ -82,7 +82,7 @@ class CuDNNPoolingOp { typename DataType::ScaleType alpha = 1.0f; typename DataType::ScaleType beta = 0.0f; if (!this->Init(s, in_data, out_data)) - LOG(FATAL) << "CuDNN Pooling invoked with unsupported parameters."; + LOG(FATAL) << "cuDNN Pooling invoked with unsupported parameters."; if (param_.kernel.ndim() == 2) { // 2d pool Tensor data = in_data.get(s); @@ -127,7 +127,7 @@ class CuDNNPoolingOp { typename DataType::ScaleType alpha = 1.0f; typename DataType::ScaleType beta = 0.0f; if (!this->Init(s, in_data, out_data)) - LOG(FATAL) << "CuDNN Pooling invoked with unsupported parameters."; + LOG(FATAL) << "cuDNN Pooling invoked with unsupported parameters."; if (param_.kernel.ndim() == 2) { // 2d pool Tensor m_out_grad = out_grad.get(s); @@ -172,7 +172,7 @@ class CuDNNPoolingOp { /*! * \brief Returns whether the cuDNN library version supports the pooling operation * described by `param`: cuDNN v5 and earlier does not support 3D pooling for example. - * CuDNN v7.1.4 backprop kernel doesn't support window sizes 9 and above. + * CuDNN v7.1.4 backprop kernel doesn't support kernel sizes 9 and above. */ static bool Supports(const PoolingParam ¶m, const TBlob& input) { using namespace mshadow; @@ -206,15 +206,15 @@ class CuDNNPoolingOp { if (!(layout == mshadow::kNCHW || layout == mshadow::kNHWC)) return false; #if CUDNN_VERSION == 7104 - // CuDNN v7.1.4 backprop kernel doesn't support window sizes 9 and above. + // CuDNN v7.1.4 backprop kernel doesn't support kernel sizes 9 and above. // Perform shape calculations in a standard (NCHW) layout space mshadow::Shape<4> input_shape = input.shape_.get<4>(); mshadow::Shape<4> dshape_nchw = (layout == mshadow::kNHWC) ? ConvertLayout(input_shape, mshadow::kNHWC, mshadow::kNCHW) : input_shape; - int window_height = param.global_pool ? dshape_nchw[2] : param.kernel[0]; - int window_width = param.global_pool ? dshape_nchw[3] : param.kernel[1]; - if (window_height > 8 || window_width > 8) + int kernel_height = param.global_pool ? dshape_nchw[2] : param.kernel[0]; + int kernel_width = param.global_pool ? dshape_nchw[3] : param.kernel[1]; + if (kernel_height > 8 || kernel_width > 8) return false; #endif #if CUDNN_VERSION >= 7105 && CUDNN_VERSION < 7500 @@ -284,20 +284,20 @@ class CuDNNPoolingOp { oshape_nchw[1], oshape_nchw[2], oshape_nchw[3])); - int window_height = param_.global_pool ? dshape_nchw[2] : param_.kernel[0]; - int window_width = param_.global_pool ? dshape_nchw[3] : param_.kernel[1]; - // CuDNN v7.1.4 backprop kernel doesn't support window sizes 9 and above. + int kernel_height = param_.global_pool ? dshape_nchw[2] : param_.kernel[0]; + int kernel_width = param_.global_pool ? dshape_nchw[3] : param_.kernel[1]; + // CuDNN v7.1.4 backprop kernel doesn't support kernel sizes 9 and above. 
// For reference see Fixed Issues section in // https://docs.nvidia.com/deeplearning/sdk/cudnn-release-notes/rel_721.html#rel_721 #if CUDNN_VERSION == 7104 - is_supported = window_height <= 8 && window_width <= 8; + is_supported = kernel_height <= 8 && kernel_width <= 8; #endif #if CUDNN_MAJOR >= 5 CUDNN_CALL(cudnnSetPooling2dDescriptor(pooling_desc_, mode_, nan_prop_, - window_height, - window_width, + kernel_height, + kernel_width, param_.global_pool ? 0 : param_.pad[0], param_.global_pool ? 0 : param_.pad[1], param_.global_pool ? 1 : param_.stride[0], @@ -305,8 +305,8 @@ class CuDNNPoolingOp { #else CUDNN_CALL(cudnnSetPooling2dDescriptor(pooling_desc_, mode_, - window_height, - window_width, + kernel_height, + kernel_width, param_.global_pool ? 0 : param_.pad[0], param_.global_pool ? 0 : param_.pad[1], param_.global_pool ? 1 : param_.stride[0], From 7506c1b1fd19ae049423f9773f5569e61614b738 Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Tue, 12 Feb 2019 17:00:29 -0800 Subject: [PATCH 28/32] Modify FInplaceOption handling per reviewer comments. Expand testing. --- src/operator/nn/pooling.cc | 7 ++----- tests/python/gpu/test_operator_gpu.py | 24 +++++++++++++----------- 2 files changed, 15 insertions(+), 16 deletions(-) diff --git a/src/operator/nn/pooling.cc b/src/operator/nn/pooling.cc index 9983c75cf3d2..a351f93e7e0c 100644 --- a/src/operator/nn/pooling.cc +++ b/src/operator/nn/pooling.cc @@ -463,11 +463,8 @@ NNVM_REGISTER_OP(_backward_Pooling) .set_attr( "FInplaceOption", [](const NodeAttrs &attrs) { -#if MXNET_USE_MKLDNN == 1 && MXNET_USE_CUDA == 0 && MXNET_USE_CUDNN == 0 - // Since this routine is not provided the cpu/gpu context info, only in the case - // where CUDA and CUDNN implementations are not available can we be sure the MKLDNN - // implementation will be employed. The MKLDNN FInplaceOptions are not compatible - // with the other (i.e. cpu, cuda and cudnn) implementations. 
+// Different backend requires different FInplaceOption +#if MXNET_USE_MKLDNN == 1 const PoolingParam ¶m = nnvm::get(attrs.parsed); if (MKLDNNRequireWorkspace(param) && SupportMKLDNNPooling(param)) return std::vector >{{1, 0}}; diff --git a/tests/python/gpu/test_operator_gpu.py b/tests/python/gpu/test_operator_gpu.py index a78f34a4ed03..010cf504fe70 100644 --- a/tests/python/gpu/test_operator_gpu.py +++ b/tests/python/gpu/test_operator_gpu.py @@ -902,7 +902,9 @@ def test_pooling_versions_helper(pool_op_list, data, kernel, pool_type, pad, str if ctx_type != 'cpu': pool_op_args['cudnn_off'] = ctx_type == 'gpu' if pool_op == 'pool': - sym = mx.sym.Pooling(**pool_op_args) + # isolate pooling input from symbol input to test shared tensor optimizations + buffered_input = mx.sym.identity(name='pool') + sym = mx.sym.Pooling(buffered_input, **pool_op_args) elif pool_op == 'pool_transposed': ndim = len(data) # NCW->NWC axes=(0,2,1) NCHW->NHWC axes=(0,2,3,1) NCDHW->NDHWC axes=(0,2,3,4,1); @@ -924,24 +926,24 @@ def test_pooling_dim(dim, pool_type, dtype, pool_op_list, p_value=2, count_inclu tol=None): if dim == '1D': data = (3, 3, 10) - kernels = [(4,), (4,)] - pads = [(0,), (2,)] - strides = [(1,), (2,)] + kernels = [(4,), (4,), (5,)] + pads = [(0,), (2,), (2,)] + strides = [(1,), (2,), (1,)] elif dim == '2D_no_padding': data = (3, 2, 20, 20) kernels = [(3, 3), (4, 5)] pads = [(0, 0), (0, 0)] - strides = [(1, 1), (2,1)] + strides = [(1, 1), (2, 1)] elif dim == '2D': data = (2, 2, 20, 20) - kernels = [(3, 3), (4, 5), (4, 5)] - pads = [(0, 0), (0, 0), (2, 3)] - strides = [(1, 1), (2,1), (1, 1)] + kernels = [(3, 3), (3, 5), (4, 5), (4, 5)] + pads = [(0, 0), (1, 2), (0, 0), (2, 3)] + strides = [(1, 1), (1, 1), (2, 1), (1, 1)] elif dim == '3D': data = (2, 3, 20, 20, 20) - kernels = [(4, 5, 3), (4, 5, 3)] - pads = [(0, 0, 0), (2, 3, 2)] - strides = [(1, 1, 1), (2, 3, 1)] + kernels = [(4, 5, 3), (4, 5, 3), (3, 5, 7)] + pads = [(0, 0, 0), (2, 3, 2), (1, 2, 3)] + strides = [(1, 1, 1), (2, 3, 1), (1, 1, 1)] else: raise RuntimeError('Unexpected pooling test class: {}.'.format(dim)) From 7ac90ee87ef487d5cc59619a97a6716b95c2d001 Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Thu, 14 Feb 2019 18:16:17 -0800 Subject: [PATCH 29/32] Correct gluon Pooling layout param description. --- python/mxnet/gluon/nn/conv_layers.py | 34 ++++++++++------------------ 1 file changed, 12 insertions(+), 22 deletions(-) diff --git a/python/mxnet/gluon/nn/conv_layers.py b/python/mxnet/gluon/nn/conv_layers.py index 427ceb3bbf9c..dfac1c9dc372 100644 --- a/python/mxnet/gluon/nn/conv_layers.py +++ b/python/mxnet/gluon/nn/conv_layers.py @@ -718,8 +718,7 @@ class MaxPool1D(_Pooling): If padding is non-zero, then the input is implicitly zero-padded on both sides for padding number of points. layout : str, default 'NCW' - Dimension ordering of data and weight. Only supports 'NCW' and 'NWC' - (only with cuDNN) layouts for now. + Dimension ordering of data and weight ('NCW' or 'NWC'). 'N', 'C', 'W' stands for batch, channel, and width (time) dimensions respectively. Pooling is applied on the W dimension. ceil_mode : bool, default False @@ -765,8 +764,7 @@ class MaxPool2D(_Pooling): If padding is non-zero, then the input is implicitly zero-padded on both sides for padding number of points. layout : str, default 'NCHW' - Dimension ordering of data and weight. Only supports 'NCHW' and 'NHWC' - (only with cuDNN) layouts for now. + Dimension ordering of data and weight ('NCHW' or 'NHWC'). 
'N', 'C', 'H', 'W' stands for batch, channel, height, and width dimensions respectively. padding is applied on 'H' and 'W' dimension. ceil_mode : bool, default False @@ -815,8 +813,7 @@ class MaxPool3D(_Pooling): If padding is non-zero, then the input is implicitly zero-padded on both sides for padding number of points. layout : str, default 'NCDHW' - Dimension ordering of data and weight. Only supports 'NCDHW' and 'NDHWC' - (only with cuDNN) layouts for now. + Dimension ordering of data and weight ('NCDHW' or 'NDHWC'). 'N', 'C', 'H', 'W', 'D' stands for batch, channel, height, width and depth dimensions respectively. padding is applied on 'D', 'H' and 'W' dimension. @@ -866,8 +863,7 @@ class AvgPool1D(_Pooling): If padding is non-zero, then the input is implicitly zero-padded on both sides for padding number of points. layout : str, default 'NCW' - Dimension ordering of data and weight. Only supports 'NCW' or 'NWC' - (only with cuDNN) layouts for now. + Dimension ordering of data and weight ('NCW' or 'NWC'). 'N', 'C', 'W' stands for batch, channel, and width (time) dimensions respectively. padding is applied on 'W' dimension. ceil_mode : bool, default False @@ -915,8 +911,7 @@ class AvgPool2D(_Pooling): If padding is non-zero, then the input is implicitly zero-padded on both sides for padding number of points. layout : str, default 'NCHW' - Dimension ordering of data and weight. Only supports 'NCHW' or 'NHWC' - (only with cuDNN) layouts for now. + Dimension ordering of data and weight ('NCHW' or 'NHWC'). 'N', 'C', 'H', 'W' stands for batch, channel, height, and width dimensions respectively. padding is applied on 'H' and 'W' dimension. ceil_mode : bool, default False @@ -967,7 +962,7 @@ class AvgPool3D(_Pooling): If padding is non-zero, then the input is implicitly zero-padded on both sides for padding number of points. layout : str, default 'NCDHW' - Dimension ordering of data and weight. Can be 'NCDHW', 'NDHWC', etc. + Dimension ordering of data and weight ('NCDHW' or 'NDHWC'). 'N', 'C', 'H', 'W', 'D' stands for batch, channel, height, width and depth dimensions respectively. padding is applied on 'D', 'H' and 'W' dimension. @@ -1013,8 +1008,7 @@ class GlobalMaxPool1D(_Pooling): Parameters ---------- layout : str, default 'NCW' - Dimension ordering of data and weight. Only supports 'NCW' or 'NWC' - (only with cuDNN) layouts for now. + Dimension ordering of data and weight ('NCW' or 'NWC'). 'N', 'C', 'W' stands for batch, channel, and width (time) dimensions respectively. Pooling is applied on the W dimension. @@ -1041,8 +1035,7 @@ class GlobalMaxPool2D(_Pooling): Parameters ---------- layout : str, default 'NCHW' - Dimension ordering of data and weight. Only supports 'NCHW' or 'NHWC' - (only with cuDNN) layouts for now. + Dimension ordering of data and weight ('NCHW' or 'NHWC'). 'N', 'C', 'H', 'W' stands for batch, channel, height, and width dimensions respectively. padding is applied on 'H' and 'W' dimension. @@ -1070,8 +1063,7 @@ class GlobalMaxPool3D(_Pooling): Parameters ---------- layout : str, default 'NCDHW' - Dimension ordering of data and weight. Only supports 'NCDHW' or 'NDHWC' - (only with cuDNN) layouts for now. + Dimension ordering of data and weight ('NCDHW' or 'NDHWC'). 'N', 'C', 'H', 'W', 'D' stands for batch, channel, height, width and depth dimensions respectively. padding is applied on 'D', 'H' and 'W' dimension. @@ -1099,8 +1091,7 @@ class GlobalAvgPool1D(_Pooling): Parameters ---------- layout : str, default 'NCW' - Dimension ordering of data and weight. 
Only supports 'NCW' or 'NWC' - (only with cuDNN) layouts for now. + Dimension ordering of data and weight ('NCW' or 'NWC'). 'N', 'C', 'W' stands for batch, channel, and width (time) dimensions respectively. padding is applied on 'W' dimension. @@ -1125,8 +1116,7 @@ class GlobalAvgPool2D(_Pooling): Parameters ---------- layout : str, default 'NCHW' - Dimension ordering of data and weight. Only supports 'NCHW' or 'NHWC' - (only with cuDNN) layouts for now. + Dimension ordering of data and weight ('NCHW' or 'NHWC'). 'N', 'C', 'H', 'W' stands for batch, channel, height, and width dimensions respectively. @@ -1153,7 +1143,7 @@ class GlobalAvgPool3D(_Pooling): Parameters ---------- layout : str, default 'NCDHW' - Dimension ordering of data and weight. Can be 'NCDHW', 'NDHWC', etc. + Dimension ordering of data and weight ('NCDHW' or 'NDHWC'). 'N', 'C', 'H', 'W', 'D' stands for batch, channel, height, width and depth dimensions respectively. padding is applied on 'D', 'H' and 'W' dimension. From a2a21039e44967c89831e34f8b6d58e0c1f78a2f Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Thu, 14 Feb 2019 18:43:56 -0800 Subject: [PATCH 30/32] Correct Symbol Pooling description. --- src/operator/nn/pooling.cc | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/src/operator/nn/pooling.cc b/src/operator/nn/pooling.cc index a351f93e7e0c..9e9af4d97fd9 100644 --- a/src/operator/nn/pooling.cc +++ b/src/operator/nn/pooling.cc @@ -368,17 +368,13 @@ NNVM_REGISTER_OP(Pooling) The shapes for 1-D pooling are -- **data**: *(batch_size, channel, width)* (NCW layout) or - *(batch_size, width, channel)* (NWC layout, cuDNN only), -- **out**: *(batch_size, num_filter, out_width)* (NCW layout) or - *(batch_size, width, channel)* (NWC layout, cuDNN only). +- **data** and **out**: *(batch_size, channel, width)* (NCW layout) or + *(batch_size, width, channel)* (NWC layout), The shapes for 2-D pooling are -- **data**: *(batch_size, channel, height, width)* (NCHW layout) pr - *(batch_size, height, width, channel)* (NHWC layout, cuDNN only), -- **out**: *(batch_size, num_filter, out_height, out_width)* (NCHW layout) or - *(batch_size, out_height, out_width, num_filter)* (NHWC layout, cuDNN only), with:: +- **data** and **out**: *(batch_size, channel, height, width)* (NCHW layout) or + *(batch_size, height, width, channel)* (NHWC layout), out_height = f(height, kernel[0], pad[0], stride[0]) out_width = f(width, kernel[1], pad[1], stride[1]) @@ -404,9 +400,8 @@ Three pooling options are supported by ``pool_type``: - **lp**: Lp pooling For 3-D pooling, an additional *depth* dimension is added before -*height*. Namely the input data will have shape *(batch_size, channel, depth, -height, width)* (NCDHW) or *(batch_size, depth, height, width, channel)* (NDHWC, -cuDNN only). +*height*. Namely the input data and output will have shape *(batch_size, channel, depth, +height, width)* (NCDHW layout) or *(batch_size, depth, height, width, channel)* (NDHWC layout). Notes on Lp pooling: From a86be0787ed530a578dc86958ca1be444286f731 Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Fri, 15 Feb 2019 09:51:33 -0800 Subject: [PATCH 31/32] Use 'CHECK(x)' rather than 'if (x) LOG(FATAL)'. 
--- python/mxnet/gluon/nn/conv_layers.py | 24 +++++++++++------------ src/operator/nn/cudnn/cudnn_pooling-inl.h | 12 +++++------- 2 files changed, 17 insertions(+), 19 deletions(-) diff --git a/python/mxnet/gluon/nn/conv_layers.py b/python/mxnet/gluon/nn/conv_layers.py index dfac1c9dc372..c210081f6071 100644 --- a/python/mxnet/gluon/nn/conv_layers.py +++ b/python/mxnet/gluon/nn/conv_layers.py @@ -718,7 +718,7 @@ class MaxPool1D(_Pooling): If padding is non-zero, then the input is implicitly zero-padded on both sides for padding number of points. layout : str, default 'NCW' - Dimension ordering of data and weight ('NCW' or 'NWC'). + Dimension ordering of data and out ('NCW' or 'NWC'). 'N', 'C', 'W' stands for batch, channel, and width (time) dimensions respectively. Pooling is applied on the W dimension. ceil_mode : bool, default False @@ -764,7 +764,7 @@ class MaxPool2D(_Pooling): If padding is non-zero, then the input is implicitly zero-padded on both sides for padding number of points. layout : str, default 'NCHW' - Dimension ordering of data and weight ('NCHW' or 'NHWC'). + Dimension ordering of data and out ('NCHW' or 'NHWC'). 'N', 'C', 'H', 'W' stands for batch, channel, height, and width dimensions respectively. padding is applied on 'H' and 'W' dimension. ceil_mode : bool, default False @@ -813,7 +813,7 @@ class MaxPool3D(_Pooling): If padding is non-zero, then the input is implicitly zero-padded on both sides for padding number of points. layout : str, default 'NCDHW' - Dimension ordering of data and weight ('NCDHW' or 'NDHWC'). + Dimension ordering of data and out ('NCDHW' or 'NDHWC'). 'N', 'C', 'H', 'W', 'D' stands for batch, channel, height, width and depth dimensions respectively. padding is applied on 'D', 'H' and 'W' dimension. @@ -863,7 +863,7 @@ class AvgPool1D(_Pooling): If padding is non-zero, then the input is implicitly zero-padded on both sides for padding number of points. layout : str, default 'NCW' - Dimension ordering of data and weight ('NCW' or 'NWC'). + Dimension ordering of data and out ('NCW' or 'NWC'). 'N', 'C', 'W' stands for batch, channel, and width (time) dimensions respectively. padding is applied on 'W' dimension. ceil_mode : bool, default False @@ -911,7 +911,7 @@ class AvgPool2D(_Pooling): If padding is non-zero, then the input is implicitly zero-padded on both sides for padding number of points. layout : str, default 'NCHW' - Dimension ordering of data and weight ('NCHW' or 'NHWC'). + Dimension ordering of data and out ('NCHW' or 'NHWC'). 'N', 'C', 'H', 'W' stands for batch, channel, height, and width dimensions respectively. padding is applied on 'H' and 'W' dimension. ceil_mode : bool, default False @@ -962,7 +962,7 @@ class AvgPool3D(_Pooling): If padding is non-zero, then the input is implicitly zero-padded on both sides for padding number of points. layout : str, default 'NCDHW' - Dimension ordering of data and weight ('NCDHW' or 'NDHWC'). + Dimension ordering of data and out ('NCDHW' or 'NDHWC'). 'N', 'C', 'H', 'W', 'D' stands for batch, channel, height, width and depth dimensions respectively. padding is applied on 'D', 'H' and 'W' dimension. @@ -1008,7 +1008,7 @@ class GlobalMaxPool1D(_Pooling): Parameters ---------- layout : str, default 'NCW' - Dimension ordering of data and weight ('NCW' or 'NWC'). + Dimension ordering of data and out ('NCW' or 'NWC'). 'N', 'C', 'W' stands for batch, channel, and width (time) dimensions respectively. Pooling is applied on the W dimension. 
@@ -1035,7 +1035,7 @@ class GlobalMaxPool2D(_Pooling): Parameters ---------- layout : str, default 'NCHW' - Dimension ordering of data and weight ('NCHW' or 'NHWC'). + Dimension ordering of data and out ('NCHW' or 'NHWC'). 'N', 'C', 'H', 'W' stands for batch, channel, height, and width dimensions respectively. padding is applied on 'H' and 'W' dimension. @@ -1063,7 +1063,7 @@ class GlobalMaxPool3D(_Pooling): Parameters ---------- layout : str, default 'NCDHW' - Dimension ordering of data and weight ('NCDHW' or 'NDHWC'). + Dimension ordering of data and out ('NCDHW' or 'NDHWC'). 'N', 'C', 'H', 'W', 'D' stands for batch, channel, height, width and depth dimensions respectively. padding is applied on 'D', 'H' and 'W' dimension. @@ -1091,7 +1091,7 @@ class GlobalAvgPool1D(_Pooling): Parameters ---------- layout : str, default 'NCW' - Dimension ordering of data and weight ('NCW' or 'NWC'). + Dimension ordering of data and out ('NCW' or 'NWC'). 'N', 'C', 'W' stands for batch, channel, and width (time) dimensions respectively. padding is applied on 'W' dimension. @@ -1116,7 +1116,7 @@ class GlobalAvgPool2D(_Pooling): Parameters ---------- layout : str, default 'NCHW' - Dimension ordering of data and weight ('NCHW' or 'NHWC'). + Dimension ordering of data and out ('NCHW' or 'NHWC'). 'N', 'C', 'H', 'W' stands for batch, channel, height, and width dimensions respectively. @@ -1143,7 +1143,7 @@ class GlobalAvgPool3D(_Pooling): Parameters ---------- layout : str, default 'NCDHW' - Dimension ordering of data and weight ('NCDHW' or 'NDHWC'). + Dimension ordering of data and out ('NCDHW' or 'NDHWC'). 'N', 'C', 'H', 'W', 'D' stands for batch, channel, height, width and depth dimensions respectively. padding is applied on 'D', 'H' and 'W' dimension. diff --git a/src/operator/nn/cudnn/cudnn_pooling-inl.h b/src/operator/nn/cudnn/cudnn_pooling-inl.h index d37668de6bed..ada605db0ee9 100644 --- a/src/operator/nn/cudnn/cudnn_pooling-inl.h +++ b/src/operator/nn/cudnn/cudnn_pooling-inl.h @@ -63,7 +63,7 @@ class CuDNNPoolingOp { } break; default: - LOG(FATAL) << "Not implemented"; + LOG(FATAL) << "Pooling type not implemented by cuDNN."; } } @@ -81,8 +81,7 @@ class CuDNNPoolingOp { CHECK_EQ(s->dnn_handle_ownership_, mshadow::Stream::OwnHandle); typename DataType::ScaleType alpha = 1.0f; typename DataType::ScaleType beta = 0.0f; - if (!this->Init(s, in_data, out_data)) - LOG(FATAL) << "cuDNN Pooling invoked with unsupported parameters."; + CHECK(this->Init(s, in_data, out_data)) << "cuDNN Pooling invoked with unsupported parameters."; if (param_.kernel.ndim() == 2) { // 2d pool Tensor data = in_data.get(s); @@ -112,7 +111,7 @@ class CuDNNPoolingOp { out_desc_, out.dptr_)); } else { - LOG(FATAL) << "Only support 2D or 3D pooling"; + LOG(FATAL) << "cuDNN only supports 2D or 3D pooling."; } } @@ -126,8 +125,7 @@ class CuDNNPoolingOp { CHECK_EQ(s->dnn_handle_ownership_, mshadow::Stream::OwnHandle); typename DataType::ScaleType alpha = 1.0f; typename DataType::ScaleType beta = 0.0f; - if (!this->Init(s, in_data, out_data)) - LOG(FATAL) << "cuDNN Pooling invoked with unsupported parameters."; + CHECK(this->Init(s, in_data, out_data)) << "cuDNN Pooling invoked with unsupported parameters."; if (param_.kernel.ndim() == 2) { // 2d pool Tensor m_out_grad = out_grad.get(s); @@ -165,7 +163,7 @@ class CuDNNPoolingOp { in_desc_, m_in_grad.dptr_)); } else { - LOG(FATAL) << "Only support 2D or 3D pooling"; + LOG(FATAL) << "cuDNN only supports 2D or 3D pooling."; } } From f773a9016332933be88ed66063a3486313334475 Mon Sep 17 00:00:00 
2001 From: Dick Carter Date: Fri, 15 Feb 2019 12:08:48 -0800 Subject: [PATCH 32/32] Empty commit to trigger CI.
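For reference, a minimal usage sketch of the layout support added by this series (not part of any patch above), mirroring the 'pool_transposed' cross-check in test_operator_gpu.py: pooling NHWC data directly should agree with pooling the same data in NCHW layout and transposing the result. It assumes a build with these patches applied and a GPU/cuDNN context for the NHWC path; the input shape, kernel, strides, padding and tolerances are illustrative only.

import mxnet as mx
import numpy as np

ctx = mx.gpu(0)
x_nchw = mx.nd.random.uniform(shape=(2, 3, 20, 20), ctx=ctx)

# Reference result via the existing NCHW path.
y_nchw = mx.nd.Pooling(x_nchw, kernel=(3, 3), stride=(2, 2), pad=(1, 1),
                       pool_type='max', layout='NCHW')

# Same pooling on NHWC data (NCHW->NHWC is axes=(0, 2, 3, 1), as in the test helper).
x_nhwc = mx.nd.transpose(x_nchw, axes=(0, 2, 3, 1))
y_nhwc = mx.nd.Pooling(x_nhwc, kernel=(3, 3), stride=(2, 2), pad=(1, 1),
                       pool_type='max', layout='NHWC')

# Transpose the NHWC output back to NCHW and compare against the reference.
np.testing.assert_allclose(y_nchw.asnumpy(),
                           mx.nd.transpose(y_nhwc, axes=(0, 3, 1, 2)).asnumpy(),
                           rtol=1e-5, atol=1e-5)

# Gluon layer using the relaxed layout assertion from this series.
pool = mx.gluon.nn.MaxPool2D(pool_size=3, strides=2, padding=1, layout='NHWC')
y_gluon = pool(x_nhwc)

The transpose axes follow the test helper's comment (NCW->NWC is axes=(0,2,1), NCHW->NHWC is axes=(0,2,3,1), NCDHW->NDHWC is axes=(0,2,3,4,1)), so the same cross-check pattern extends to the 1D and 3D pooling cases exercised by the expanded tests.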