From 3a042857c1e9c2aeae327f90cf3377d1fc482201 Mon Sep 17 00:00:00 2001
From: Sandeep Krishnamurthy
Date: Fri, 1 Feb 2019 17:13:52 -0800
Subject: [PATCH 1/8] parallelize on channel forward pass

---
 src/operator/image/image_random-inl.h | 40 +++++++++++++--------------
 1 file changed, 20 insertions(+), 20 deletions(-)

diff --git a/src/operator/image/image_random-inl.h b/src/operator/image/image_random-inl.h
index c9dd85af616f..59d64a9ad96b 100644
--- a/src/operator/image/image_random-inl.h
+++ b/src/operator/image/image_random-inl.h
@@ -217,37 +217,37 @@ inline bool NormalizeOpType(const nnvm::NodeAttrs& attrs,
 
 template<int req>
 struct normalize_forward {
   template<typename DType>
-  MSHADOW_XINLINE static void Map(int j, DType* out_data, const DType* in_data,
-                                  const int i, const int length, const int step,
-                                  const DType mean, const DType std_dev) {
-    KERNEL_ASSIGN(out_data[step + i*length + j], req,
-                  (in_data[step + i*length + j] - mean) / std_dev);
+  MSHADOW_XINLINE static void Map(uint32_t c, DType* out_data, const DType* in_data,
+                                  const NormalizeParam &param, const int length,
+                                  const int step) {
+    DType mean = param.mean[param.mean.ndim() > c ? c : 0];
+    DType std_dev = param.std[param.std.ndim() > c ? c : 0];
+
+    #pragma omp parallel for
+    for (int i = 0; i < length; ++i) {
+      KERNEL_ASSIGN(out_data[step + c*length + i], req,
+                    (in_data[step + c*length + i] - mean) / std_dev);
+    }
   }
 };
 
 template<typename xpu>
 void NormalizeImpl(const OpContext &ctx,
-                   const std::vector<TBlob> &inputs,
-                   const std::vector<TBlob> &outputs,
-                   const std::vector<OpReqType> &req,
-                   const NormalizeParam &param,
-                   const int length,
-                   const uint32_t channel,
-                   const int step = 0) {
+                  const std::vector<TBlob> &inputs,
+                  const std::vector<TBlob> &outputs,
+                  const std::vector<OpReqType> &req,
+                  const NormalizeParam &param,
+                  const int length,
+                  const uint32_t channel,
+                  const int step = 0) {
   mshadow::Stream<xpu> *s = ctx.get_stream<xpu>();
 
   MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, {
     MXNET_ASSIGN_REQ_SWITCH(req[0], req_type, {
       DType* input = inputs[0].dptr<DType>();
       DType* output = outputs[0].dptr<DType>();
-
-      for (uint32_t i = 0; i < channel; ++i) {
-        DType mean = param.mean[param.mean.ndim() > i ? i : 0];
-        DType std_dev = param.std[param.std.ndim() > i ? i : 0];
-        mxnet_op::Kernel<normalize_forward<req_type>, xpu>::Launch(
-            s, length, output, input,
-            i, length, step, mean, std_dev);
-      }
+      mxnet_op::Kernel<normalize_forward<req_type>, xpu>::Launch(
+          s, channel, output, input, param, length, step);
     });
   });
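The change above inverts the launch structure: instead of launching one kernel per channel over all pixels, the kernel is launched once with one Map call per channel, and each Map call sweeps its channel's pixels with an OpenMP loop. A standalone sketch of that access pattern in plain C++ plus OpenMP (normalize_chw and the driver are illustrative names, not MXNet code):

    #include <cstdio>
    #include <vector>

    // One outer iteration per channel (the kernel's Map index); the per-channel
    // pixel loop is what the patch parallelizes with OpenMP.
    void normalize_chw(float* out, const float* in, int channels, int length,
                       const std::vector<float>& mean, const std::vector<float>& std_dev) {
      for (int c = 0; c < channels; ++c) {
        // Broadcast a single mean/std value when only a scalar was supplied.
        const float m = mean[mean.size() > static_cast<size_t>(c) ? c : 0];
        const float s = std_dev[std_dev.size() > static_cast<size_t>(c) ? c : 0];
        #pragma omp parallel for
        for (int i = 0; i < length; ++i) {
          out[c * length + i] = (in[c * length + i] - m) / s;
        }
      }
    }

    int main() {
      const int channels = 3, length = 4;
      std::vector<float> in(channels * length, 2.0f), out(channels * length);
      normalize_chw(out.data(), in.data(), channels, length,
                    {0.f, 1.f, 2.f}, {3.f, 2.f, 1.f});
      std::printf("%f %f %f\n", out[0], out[length], out[2 * length]);
      // expected: 0.666667 0.500000 0.000000
      return 0;
    }

Compiled with -fopenmp the inner loop runs across threads; without it the pragma is ignored and the result is identical.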
From 9fe52f0577d6ce9f905523b67eda6de7c461d9a5 Mon Sep 17 00:00:00 2001
From: Sandeep Krishnamurthy
Date: Fri, 1 Feb 2019 17:27:33 -0800
Subject: [PATCH 2/8] parallelize on channel normalize backward pass

---
 src/operator/image/image_random-inl.h | 27 ++++++++++++++-------------
 1 file changed, 14 insertions(+), 13 deletions(-)

diff --git a/src/operator/image/image_random-inl.h b/src/operator/image/image_random-inl.h
index 59d64a9ad96b..1b54ce094f9f 100644
--- a/src/operator/image/image_random-inl.h
+++ b/src/operator/image/image_random-inl.h
@@ -287,12 +287,16 @@ void NormalizeOpForward(const nnvm::NodeAttrs &attrs,
 
 template<int req>
 struct normalize_backward {
   template<typename DType>
-  MSHADOW_XINLINE static void Map(int j, DType* in_grad, const DType* out_grad,
-                                  const int i, const int length,
-                                  const int step, const DType std_dev) {
+  MSHADOW_XINLINE static void Map(uint32_t c, DType* in_grad, const DType* out_grad,
+                                  const NormalizeParam &param, const int length,
+                                  const int step) {
     // d/dx{(x - mean) / std_dev} => (1 / std_dev)
-    KERNEL_ASSIGN(in_grad[step + i*length + j], req,
-                  out_grad[step + i*length + j] * (1.0 / std_dev));
+    DType std_dev = param.std[param.std.ndim() > c ? c : 0];
+    #pragma omp parallel for
+    for (int i = 0; i < length; ++i) {
+      KERNEL_ASSIGN(in_grad[step + c*length + i], req,
+                    out_grad[step + c*length + i] * (1.0 / std_dev));
+    }
   }
 };
@@ -306,16 +310,13 @@ void NormalizeBackwardImpl(const OpContext &ctx,
                            const uint32_t channel,
                            const int step = 0) {
   mshadow::Stream<xpu> *s = ctx.get_stream<xpu>();
-  const TBlob& out_grad = inputs[0];
-  const TBlob& in_grad = outputs[0];
+
   MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, {
     MXNET_ASSIGN_REQ_SWITCH(req[0], req_type, {
-      for (uint32_t i = 0; i < channel; ++i) {
-        DType std_dev = param.std[param.std.ndim() > i ? i : 0];
-        mxnet_op::Kernel<normalize_backward<req_type>, xpu>::Launch(
-            s, length, in_grad.dptr<DType>(), out_grad.dptr<DType>(),
-            i, length, step, std_dev);
-      }
+      DType* out_grad = inputs[0].dptr<DType>();
+      DType* in_grad = outputs[0].dptr<DType>();
+      mxnet_op::Kernel<normalize_backward<req_type>, xpu>::Launch(
+          s, channel, in_grad, out_grad, param, length, step);
     });
   });
 }
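The derivative used in the backward kernel is worth a one-line check: for f(x) = (x - mean) / std_dev, df/dx = 1 / std_dev, so the backward pass only scales out_grad. A quick standalone verification against a central finite difference (plain C++, illustrative only, not MXNet code):

    #include <cmath>
    #include <cstdio>

    int main() {
      const float mean = 1.5f, std_dev = 2.0f, x = 0.7f, eps = 1e-3f;
      auto f = [&](float v) { return (v - mean) / std_dev; };
      const float numeric = (f(x + eps) - f(x - eps)) / (2 * eps);  // central difference
      const float analytic = 1.0f / std_dev;  // the factor normalize_backward applies
      std::printf("numeric=%f analytic=%f diff=%g\n",
                  numeric, analytic, std::fabs(numeric - analytic));
      return 0;
    }

From 3b6164faf617167a57de037b50749bcb2c5734df Mon Sep 17 00:00:00 2001
From: Sandeep Krishnamurthy
Date: Fri, 1 Feb 2019 18:57:14 -0800
Subject: [PATCH 3/8] Fix lint issues

---
 src/operator/image/image_random-inl.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/operator/image/image_random-inl.h b/src/operator/image/image_random-inl.h
index 1b54ce094f9f..e65b68667980 100644
--- a/src/operator/image/image_random-inl.h
+++ b/src/operator/image/image_random-inl.h
@@ -218,11 +218,11 @@ template<int req>
 struct normalize_forward {
   template<typename DType>
   MSHADOW_XINLINE static void Map(uint32_t c, DType* out_data, const DType* in_data,
-                                  const NormalizeParam &param, const int length, 
+                                  const NormalizeParam &param, const int length,
                                   const int step) {
     DType mean = param.mean[param.mean.ndim() > c ? c : 0];
     DType std_dev = param.std[param.std.ndim() > c ? c : 0];
-    
+
     #pragma omp parallel for
     for (int i = 0; i < length; ++i) {
       KERNEL_ASSIGN(out_data[step + c*length + i], req,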
From 67cde94486a61e83b673dd81101a274427681e50 Mon Sep 17 00:00:00 2001
From: Sandeep Krishnamurthy
Date: Mon, 4 Feb 2019 17:13:33 -0800
Subject: [PATCH 4/8] Trying to fix CI build failure on GPU

---
 src/operator/image/image_random-inl.h | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/src/operator/image/image_random-inl.h b/src/operator/image/image_random-inl.h
index e65b68667980..133d12fffad0 100644
--- a/src/operator/image/image_random-inl.h
+++ b/src/operator/image/image_random-inl.h
@@ -220,8 +220,20 @@ struct normalize_forward {
   MSHADOW_XINLINE static void Map(uint32_t c, DType* out_data, const DType* in_data,
                                   const NormalizeParam &param, const int length,
                                   const int step) {
-    DType mean = param.mean[param.mean.ndim() > c ? c : 0];
-    DType std_dev = param.std[param.std.ndim() > c ? c : 0];
+    int mean_idx, std_idx;
+    if (param.mean.ndim() > c) {
+      mean_idx = c;
+    } else {
+      mean_idx = 0;
+    }
+
+    if (param.std.ndim() > c) {
+      std_idx = c;
+    } else {
+      std_idx = 0;
+    }
+    DType mean = param.mean[mean_idx];
+    DType std_dev = param.std[std_idx];
 
     #pragma omp parallel for
     for (int i = 0; i < length; ++i) {

From cdc08ded37e33981ec65a9cee84fa14e24ba717b Mon Sep 17 00:00:00 2001
From: Sandeep Krishnamurthy
Date: Mon, 4 Feb 2019 23:06:33 -0800
Subject: [PATCH 5/8] Fix failing GPU test on CI

Do not pass normalize param as is to GPU kernel
---
 src/operator/image/image_random-inl.h | 38 +++++++++++----------------
 1 file changed, 15 insertions(+), 23 deletions(-)

diff --git a/src/operator/image/image_random-inl.h b/src/operator/image/image_random-inl.h
index 133d12fffad0..53f57e71f487 100644
--- a/src/operator/image/image_random-inl.h
+++ b/src/operator/image/image_random-inl.h
@@ -218,27 +218,16 @@ template<int req>
 struct normalize_forward {
   template<typename DType>
   MSHADOW_XINLINE static void Map(uint32_t c, DType* out_data, const DType* in_data,
-                                  const NormalizeParam &param, const int length,
-                                  const int step) {
-    int mean_idx, std_idx;
-    if (param.mean.ndim() > c) {
-      mean_idx = c;
-    } else {
-      mean_idx = 0;
-    }
-
-    if (param.std.ndim() > c) {
-      std_idx = c;
-    } else {
-      std_idx = 0;
-    }
-    DType mean = param.mean[mean_idx];
-    DType std_dev = param.std[std_idx];
+                                  const float* mean, const uint32_t mean_ndim,
+                                  const float* std_dev, const uint32_t std_dev_ndim,
+                                  const int length, const int step) {
+    DType m = mean[mean_ndim > c ? c : 0];
+    DType s = std_dev[std_dev_ndim > c ? c : 0];
 
     #pragma omp parallel for
     for (int i = 0; i < length; ++i) {
       KERNEL_ASSIGN(out_data[step + c*length + i], req,
-                    (in_data[step + c*length + i] - mean) / std_dev);
+                    (in_data[step + c*length + i] - m) / s);
     }
   }
 };
@@ -259,7 +248,8 @@ void NormalizeImpl(const OpContext &ctx,
       DType* input = inputs[0].dptr<DType>();
       DType* output = outputs[0].dptr<DType>();
       mxnet_op::Kernel<normalize_forward<req_type>, xpu>::Launch(
-          s, channel, output, input, param, length, step);
+          s, channel, output, input, param.mean.begin(), param.mean.ndim(),
+          param.std.begin(), param.std.ndim(), length, step);
     });
   });
 }
@@ -300,14 +290,15 @@ template<int req>
 struct normalize_backward {
   template<typename DType>
   MSHADOW_XINLINE static void Map(uint32_t c, DType* in_grad, const DType* out_grad,
-                                  const NormalizeParam &param, const int length,
-                                  const int step) {
+                                  const float* std_dev, const uint32_t std_dev_ndim,
+                                  const int length, const int step) {
     // d/dx{(x - mean) / std_dev} => (1 / std_dev)
-    DType std_dev = param.std[param.std.ndim() > c ? c : 0];
+    DType s = std_dev[std_dev_ndim > c ? c : 0];
+
     #pragma omp parallel for
     for (int i = 0; i < length; ++i) {
       KERNEL_ASSIGN(in_grad[step + c*length + i], req,
-                    out_grad[step + c*length + i] * (1.0 / std_dev));
+                    out_grad[step + c*length + i] * (1.0 / s));
     }
   }
 };
@@ -328,7 +319,8 @@ void NormalizeBackwardImpl(const OpContext &ctx,
       DType* out_grad = inputs[0].dptr<DType>();
      DType* in_grad = outputs[0].dptr<DType>();
       mxnet_op::Kernel<normalize_backward<req_type>, xpu>::Launch(
-          s, channel, in_grad, out_grad, param, length, step);
+          s, channel, in_grad, out_grad, param.std.begin(),
+          param.std.ndim(), length, step);
     });
   });
 }
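Patch 5 stops handing the kernel the whole NormalizeParam and instead passes a raw float pointer plus an element count, with the broadcast rule (fall back to element 0 when only a scalar was supplied) applied inside the kernel. A minimal sketch of just that selection rule (plain C++; select and the test values are illustrative, not MXNet code):

    #include <cstdio>

    // Per-channel lookup with scalar fallback: mirrors `values[ndim > c ? c : 0]`
    // as used by normalize_forward and normalize_backward above.
    float select(const float* values, unsigned ndim, unsigned c) {
      return values[ndim > c ? c : 0];
    }

    int main() {
      const float scalar_std[1] = {2.0f};
      const float per_channel_std[3] = {3.0f, 2.0f, 1.0f};
      // Channel 2: the scalar input broadcasts, the 3-element input indexes directly.
      std::printf("%f %f\n", select(scalar_std, 1, 2), select(per_channel_std, 3, 2));
      // expected: 2.000000 1.000000
      return 0;
    }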
From 21df212f258e8c969584801f4ea9f3c89171ec8d Mon Sep 17 00:00:00 2001
From: Sandeep Krishnamurthy
Date: Tue, 5 Feb 2019 10:29:10 -0800
Subject: [PATCH 6/8] Fix to_tensor tests

---
 tests/python/gpu/test_gluon_transforms.py | 33 ++++++-----------------
 1 file changed, 8 insertions(+), 25 deletions(-)

diff --git a/tests/python/gpu/test_gluon_transforms.py b/tests/python/gpu/test_gluon_transforms.py
index 3927d4c1f094..23b34d334888 100644
--- a/tests/python/gpu/test_gluon_transforms.py
+++ b/tests/python/gpu/test_gluon_transforms.py
@@ -80,32 +80,15 @@ def test_to_tensor():
         data_in.astype(dtype=np.float32) / 255.0, (2, 0, 1)))
 
     # 4D Input
-    data_in_4d = nd.random.uniform(0, 1, (2, 3, 300, 300))
-    out_nd_4d = transforms.Normalize(mean=(0, 1, 2), std=(3, 2, 1))(data_in_4d)
-    data_expected_4d = data_in_4d.asnumpy()
-    data_expected_4d[0][:][:][0] = data_expected_4d[0][:][:][0] / 3.0
-    data_expected_4d[0][:][:][1] = (data_expected_4d[0][:][:][1] - 1.0) / 2.0
-    data_expected_4d[0][:][:][2] = data_expected_4d[0][:][:][2] - 2.0
-    data_expected_4d[1][:][:][0] = data_expected_4d[1][:][:][0] / 3.0
-    data_expected_4d[1][:][:][1] = (data_expected_4d[1][:][:][1] - 1.0) / 2.0
-    data_expected_4d[1][:][:][2] = data_expected_4d[1][:][:][2] - 2.0
-    assert_almost_equal(data_expected_4d, out_nd_4d.asnumpy())
-
-    # Default normalize values i.e., mean=0, std=1
-    data_in_3d_def = nd.random.uniform(0, 1, (3, 300, 300))
-    out_nd_3d_def = transforms.Normalize()(data_in_3d_def)
-    data_expected_3d_def = data_in_3d_def.asnumpy()
-    assert_almost_equal(data_expected_3d_def, out_nd_3d_def.asnumpy())
-
-    # Invalid Input - Neither 3D or 4D input
-    invalid_data_in = nd.random.uniform(0, 1, (5, 5, 3, 300, 300))
-    normalize_transformer = transforms.Normalize(mean=(0, 1, 2), std=(3, 2, 1))
-    assertRaises(MXNetError, normalize_transformer, invalid_data_in)
+    data_in = np.random.uniform(0, 255, (5, 300, 300, 3)).astype(dtype=np.uint8)
+    out_nd = transforms.ToTensor()(nd.array(data_in, dtype='uint8'))
+    assert_almost_equal(out_nd.asnumpy(), np.transpose(
+        data_in.astype(dtype=np.float32) / 255.0, (0, 3, 1, 2)))
 
-    # Invalid Input - Channel neither 1 or 3
-    invalid_data_in = nd.random.uniform(0, 1, (5, 4, 300, 300))
-    normalize_transformer = transforms.Normalize(mean=(0, 1, 2), std=(3, 2, 1))
-    assertRaises(MXNetError, normalize_transformer, invalid_data_in)
+    # Invalid Input
+    invalid_data_in = nd.random.uniform(0, 255, (5, 5, 300, 300, 3)).astype(dtype=np.uint8)
+    transformer = transforms.ToTensor()
+    assertRaises(MXNetError, transformer, invalid_data_in)
 
 @with_seed()
 def test_resize():
From ba4c161166daeff5ad0be73cbdb42e15661e3c55 Mon Sep 17 00:00:00 2001
From: Sandeep Krishnamurthy
Date: Tue, 5 Feb 2019 11:35:07 -0800
Subject: [PATCH 7/8] Pass mean and std_dev as native types for kernel

---
 src/operator/image/image_random-inl.h | 48 ++++++++++++++++++---------
 1 file changed, 32 insertions(+), 16 deletions(-)

diff --git a/src/operator/image/image_random-inl.h b/src/operator/image/image_random-inl.h
index 53f57e71f487..b4c0fe5d8147 100644
--- a/src/operator/image/image_random-inl.h
+++ b/src/operator/image/image_random-inl.h
@@ -218,11 +218,10 @@ template<int req>
 struct normalize_forward {
   template<typename DType>
   MSHADOW_XINLINE static void Map(uint32_t c, DType* out_data, const DType* in_data,
-                                  const float* mean, const uint32_t mean_ndim,
-                                  const float* std_dev, const uint32_t std_dev_ndim,
+                                  const std::vector<float> &mean, const std::vector<float> &std_dev,
                                   const int length, const int step) {
-    DType m = mean[mean_ndim > c ? c : 0];
-    DType s = std_dev[std_dev_ndim > c ? c : 0];
+    DType m = mean[mean.size() > c ? c : 0];
+    DType s = std_dev[std_dev.size() > c ? c : 0];
 
     #pragma omp parallel for
     for (int i = 0; i < length; ++i) {
@@ -237,7 +236,8 @@ void NormalizeImpl(const OpContext &ctx,
                   const std::vector<TBlob> &inputs,
                   const std::vector<TBlob> &outputs,
                   const std::vector<OpReqType> &req,
-                  const NormalizeParam &param,
+                  const std::vector<float> &mean,
+                  const std::vector<float> &std_dev,
                   const int length,
                   const uint32_t channel,
                   const int step = 0) {
@@ -248,8 +248,7 @@ void NormalizeImpl(const OpContext &ctx,
       DType* input = inputs[0].dptr<DType>();
       DType* output = outputs[0].dptr<DType>();
       mxnet_op::Kernel<normalize_forward<req_type>, xpu>::Launch(
-          s, channel, output, input, param.mean.begin(), param.mean.ndim(),
-          param.std.begin(), param.std.ndim(), length, step);
+          s, channel, output, input, mean, std_dev, length, step);
     });
   });
 }
@@ -266,11 +265,23 @@ void NormalizeOpForward(const nnvm::NodeAttrs &attrs,
 
   const NormalizeParam &param = nnvm::get<NormalizeParam>(attrs.parsed);
 
+  // Prepare mean and std_dev vector
+  std::vector<float> mean(param.mean.ndim());
+  std::vector<float> std_dev(param.std.ndim());
+
+  for (uint32_t idx = 0; idx < param.mean.ndim(); ++idx) {
+    mean[idx] = param.mean[idx];
+  }
+
+  for (uint32_t idx = 0; idx < param.std.ndim(); ++idx) {
+    std_dev[idx] = param.std[idx];
+  }
+
   // 3D input (c, h, w)
   if (inputs[0].ndim() == 3) {
     const int length = inputs[0].shape_[1] * inputs[0].shape_[2];
     const uint32_t channel = inputs[0].shape_[0];
-    NormalizeImpl<xpu>(ctx, inputs, outputs, req, param, length, channel);
+    NormalizeImpl<xpu>(ctx, inputs, outputs, req, mean, std_dev, length, channel);
   } else if (inputs[0].ndim() == 4) {
     // 4D input (n, c, h, w)
     const int batch_size = inputs[0].shape_[0];
@@ -280,7 +291,7 @@ void NormalizeOpForward(const nnvm::NodeAttrs &attrs,
 
     #pragma omp parallel for
     for (auto n = 0; n < batch_size; ++n) {
-      NormalizeImpl<xpu>(ctx, inputs, outputs, req, param, length, channel, n*step);
+      NormalizeImpl<xpu>(ctx, inputs, outputs, req, mean, std_dev, length, channel, n*step);
     }
   }
 }
@@ -290,10 +301,10 @@ template<int req>
 struct normalize_backward {
   template<typename DType>
   MSHADOW_XINLINE static void Map(uint32_t c, DType* in_grad, const DType* out_grad,
-                                  const float* std_dev, const uint32_t std_dev_ndim,
+                                  const std::vector<float> &std_dev,
                                   const int length, const int step) {
     // d/dx{(x - mean) / std_dev} => (1 / std_dev)
-    DType s = std_dev[std_dev_ndim > c ? c : 0];
+    DType s = std_dev[std_dev.size() > c ? c : 0];
 
     #pragma omp parallel for
     for (int i = 0; i < length; ++i) {
@@ -308,7 +319,7 @@ void NormalizeBackwardImpl(const OpContext &ctx,
                            const std::vector<TBlob> &inputs,
                            const std::vector<TBlob> &outputs,
                            const std::vector<OpReqType> &req,
-                           const NormalizeParam &param,
+                           const std::vector<float> &std_dev,
                            const int length,
                            const uint32_t channel,
                            const int step = 0) {
@@ -319,8 +330,7 @@ void NormalizeBackwardImpl(const OpContext &ctx,
       DType* out_grad = inputs[0].dptr<DType>();
       DType* in_grad = outputs[0].dptr<DType>();
       mxnet_op::Kernel<normalize_backward<req_type>, xpu>::Launch(
-          s, channel, in_grad, out_grad, param.std.begin(),
-          param.std.ndim(), length, step);
+          s, channel, in_grad, out_grad, std_dev, length, step);
     });
   });
 }
@@ -335,7 +345,13 @@ void NormalizeOpBackward(const nnvm::NodeAttrs &attrs,
   CHECK_EQ(outputs.size(), 1U);
   CHECK_EQ(req.size(), 1U);
 
+  // Prepare std_dev vector
   const NormalizeParam &param = nnvm::get<NormalizeParam>(attrs.parsed);
+  std::vector<float> std_dev(param.std.ndim());
+
+  for (uint32_t idx = 0; idx < param.std.ndim(); ++idx) {
+    std_dev[idx] = param.std[idx];
+  }
 
   // Note: inputs[0] is out_grad
   const TBlob& in_data = inputs[1];
@@ -344,7 +360,7 @@ void NormalizeOpBackward(const nnvm::NodeAttrs &attrs,
   if (in_data.ndim() == 3) {
     const int length = in_data.shape_[1] * in_data.shape_[2];
     const uint32_t channel = in_data.shape_[0];
-    NormalizeBackwardImpl<xpu>(ctx, inputs, outputs, req, param, length, channel);
+    NormalizeBackwardImpl<xpu>(ctx, inputs, outputs, req, std_dev, length, channel);
   } else if (in_data.ndim() == 4) {
     // 4D input (n, c, h, w)
     const int batch_size = in_data.shape_[0];
@@ -354,7 +370,7 @@ void NormalizeOpBackward(const nnvm::NodeAttrs &attrs,
 
     #pragma omp parallel for
     for (auto n = 0; n < batch_size; ++n) {
-      NormalizeBackwardImpl<xpu>(ctx, inputs, outputs, req, param, length, channel, n*step);
+      NormalizeBackwardImpl<xpu>(ctx, inputs, outputs, req, std_dev, length, channel, n*step);
    }
  }
}
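Throughout these patches the 4D path dispatches one NormalizeImpl call per sample n with offset n*step, where step is the number of elements per sample; the kernels then address pixel i of channel c as step + c*length + i. A standalone sketch of that flat-index arithmetic for an NCHW tensor (plain C++, illustrative only):

    #include <cstdio>
    #include <vector>

    int main() {
      const int batch = 2, channels = 3, h = 2, w = 2;
      const int length = h * w;            // pixels per channel
      const int step = channels * length;  // elements per sample
      std::vector<float> img(batch * step);
      for (int idx = 0; idx < batch * step; ++idx) img[idx] = static_cast<float>(idx);

      // Flat address of pixel i in channel c of sample n, as the kernels compute it.
      const int n = 1, c = 2, i = 3;
      const int flat = n * step + c * length + i;  // 1*12 + 2*4 + 3 = 23
      std::printf("flat index = %d, value = %f\n", flat, img[flat]);
      return 0;
    }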
From c6c6829a9722047e3add0c04bf3f33630a1198df Mon Sep 17 00:00:00 2001
From: Sandeep Krishnamurthy
Date: Tue, 5 Feb 2019 15:02:25 -0800
Subject: [PATCH 8/8] Fix CI failure. Do not pass mean, std as vector to kernel

---
 src/operator/image/image_random-inl.h | 99 +++++++++++++++++++--------
 1 file changed, 69 insertions(+), 30 deletions(-)

diff --git a/src/operator/image/image_random-inl.h b/src/operator/image/image_random-inl.h
index b4c0fe5d8147..448016341f21 100644
--- a/src/operator/image/image_random-inl.h
+++ b/src/operator/image/image_random-inl.h
@@ -218,15 +218,25 @@ template<int req>
 struct normalize_forward {
   template<typename DType>
   MSHADOW_XINLINE static void Map(uint32_t c, DType* out_data, const DType* in_data,
-                                  const std::vector<float> &mean, const std::vector<float> &std_dev,
+                                  const float mean_d0, const float mean_d1, const float mean_d2,
+                                  const float std_d0, const float std_d1, const float std_d2,
                                   const int length, const int step) {
-    DType m = mean[mean.size() > c ? c : 0];
-    DType s = std_dev[std_dev.size() > c ? c : 0];
-
+    float mean, std;
+    switch (c) {
+      case 0 : mean = mean_d0;
+               std = std_d0;
+               break;
+      case 1 : mean = mean_d1;
+               std = std_d1;
+               break;
+      case 2 : mean = mean_d2;
+               std = std_d2;
+               break;
+    }
     #pragma omp parallel for
     for (int i = 0; i < length; ++i) {
       KERNEL_ASSIGN(out_data[step + c*length + i], req,
-                    (in_data[step + c*length + i] - m) / s);
+                    (in_data[step + c*length + i] - mean) / std);
     }
   }
 };
@@ -236,8 +246,9 @@ void NormalizeImpl(const OpContext &ctx,
                   const std::vector<TBlob> &inputs,
                   const std::vector<TBlob> &outputs,
                   const std::vector<OpReqType> &req,
-                  const std::vector<float> &mean,
-                  const std::vector<float> &std_dev,
+                  const float mean_d0, const float mean_d1,
+                  const float mean_d2, const float std_d0,
+                  const float std_d1, const float std_d2,
                   const int length,
                   const uint32_t channel,
                   const int step = 0) {
@@ -248,7 +259,8 @@ void NormalizeImpl(const OpContext &ctx,
       DType* input = inputs[0].dptr<DType>();
       DType* output = outputs[0].dptr<DType>();
       mxnet_op::Kernel<normalize_forward<req_type>, xpu>::Launch(
-          s, channel, output, input, mean, std_dev, length, step);
+          s, channel, output, input, mean_d0, mean_d1, mean_d2,
+          std_d0, std_d1, std_d2, length, step);
     });
   });
 }
@@ -265,23 +277,35 @@ void NormalizeOpForward(const nnvm::NodeAttrs &attrs,
 
   const NormalizeParam &param = nnvm::get<NormalizeParam>(attrs.parsed);
 
-  // Prepare mean and std_dev vector
-  std::vector<float> mean(param.mean.ndim());
-  std::vector<float> std_dev(param.std.ndim());
-
-  for (uint32_t idx = 0; idx < param.mean.ndim(); ++idx) {
-    mean[idx] = param.mean[idx];
+  // Note: We need mean and std_dev in the kernel.
+  // It is costly (device copy) to pass it as vector, for gpu kernel.
+  // Hence, passing it as below for performance.
+  float mean_d0, mean_d1, mean_d2;
+  float std_d0, std_d1, std_d2;
+
+  // Mean and Std can be 1 or 3 D only.
+  if (param.mean.ndim() == 1) {
+    mean_d0 = mean_d1 = mean_d2 = param.mean[0];
+  } else {
+    mean_d0 = param.mean[0];
+    mean_d1 = param.mean[1];
+    mean_d2 = param.mean[2];
   }
 
-  for (uint32_t idx = 0; idx < param.std.ndim(); ++idx) {
-    std_dev[idx] = param.std[idx];
+  if (param.std.ndim() == 1) {
+    std_d0 = std_d1 = std_d2 = param.std[0];
+  } else {
+    std_d0 = param.std[0];
+    std_d1 = param.std[1];
+    std_d2 = param.std[2];
   }
 
   // 3D input (c, h, w)
   if (inputs[0].ndim() == 3) {
     const int length = inputs[0].shape_[1] * inputs[0].shape_[2];
     const uint32_t channel = inputs[0].shape_[0];
-    NormalizeImpl<xpu>(ctx, inputs, outputs, req, mean, std_dev, length, channel);
+    NormalizeImpl<xpu>(ctx, inputs, outputs, req, mean_d0, mean_d1, mean_d2,
+                       std_d0, std_d1, std_d2, length, channel);
   } else if (inputs[0].ndim() == 4) {
     // 4D input (n, c, h, w)
     const int batch_size = inputs[0].shape_[0];
@@ -291,7 +315,8 @@ void NormalizeOpForward(const nnvm::NodeAttrs &attrs,
 
     #pragma omp parallel for
     for (auto n = 0; n < batch_size; ++n) {
-      NormalizeImpl<xpu>(ctx, inputs, outputs, req, mean, std_dev, length, channel, n*step);
+      NormalizeImpl<xpu>(ctx, inputs, outputs, req, mean_d0, mean_d1, mean_d2,
+                         std_d0, std_d1, std_d2, length, channel, n*step);
     }
   }
 }
@@ -301,15 +326,23 @@ template<int req>
 struct normalize_backward {
   template<typename DType>
   MSHADOW_XINLINE static void Map(uint32_t c, DType* in_grad, const DType* out_grad,
-                                  const std::vector<float> &std_dev,
+                                  const float std_d0, const float std_d1, const float std_d2,
                                   const int length, const int step) {
     // d/dx{(x - mean) / std_dev} => (1 / std_dev)
-    DType s = std_dev[std_dev.size() > c ? c : 0];
+    float std_dev;
+    switch (c) {
+      case 0 : std_dev = std_d0;
+               break;
+      case 1 : std_dev = std_d1;
+               break;
+      case 2 : std_dev = std_d2;
+               break;
+    }
 
     #pragma omp parallel for
     for (int i = 0; i < length; ++i) {
       KERNEL_ASSIGN(in_grad[step + c*length + i], req,
-                    out_grad[step + c*length + i] * (1.0 / s));
+                    out_grad[step + c*length + i] * (1.0 / std_dev));
     }
   }
 };
@@ -319,7 +352,7 @@ void NormalizeBackwardImpl(const OpContext &ctx,
                            const std::vector<TBlob> &inputs,
                            const std::vector<TBlob> &outputs,
                            const std::vector<OpReqType> &req,
-                           const std::vector<float> &std_dev,
+                           const float std_d0, const float std_d1, const float std_d2,
                            const int length,
                            const uint32_t channel,
                            const int step = 0) {
@@ -330,7 +363,7 @@ void NormalizeBackwardImpl(const OpContext &ctx,
       DType* out_grad = inputs[0].dptr<DType>();
       DType* in_grad = outputs[0].dptr<DType>();
       mxnet_op::Kernel<normalize_backward<req_type>, xpu>::Launch(
-          s, channel, in_grad, out_grad, std_dev, length, step);
+          s, channel, in_grad, out_grad, std_d0, std_d1, std_d2, length, step);
     });
   });
 }
@@ -345,13 +378,17 @@ void NormalizeOpBackward(const nnvm::NodeAttrs &attrs,
   CHECK_EQ(outputs.size(), 1U);
   CHECK_EQ(req.size(), 1U);
 
-  // Prepare std_dev vector
   const NormalizeParam &param = nnvm::get<NormalizeParam>(attrs.parsed);
-  std::vector<float> std_dev(param.std.ndim());
-
-  for (uint32_t idx = 0; idx < param.std.ndim(); ++idx) {
-    std_dev[idx] = param.std[idx];
+  float std_d0, std_d1, std_d2;
+
+  // Std can be 1 or 3 D only
+  if (param.std.ndim() == 1) {
+    std_d0 = std_d1 = std_d2 = param.std[0];
+  } else {
+    std_d0 = param.std[0];
+    std_d1 = param.std[1];
+    std_d2 = param.std[2];
   }
 
   // Note: inputs[0] is out_grad
   const TBlob& in_data = inputs[1];
@@ -360,7 +397,7 @@ void NormalizeOpBackward(const nnvm::NodeAttrs &attrs,
   if (in_data.ndim() == 3) {
     const int length = in_data.shape_[1] * in_data.shape_[2];
     const uint32_t channel = in_data.shape_[0];
-    NormalizeBackwardImpl<xpu>(ctx, inputs, outputs, req, std_dev, length, channel);
+    NormalizeBackwardImpl<xpu>(ctx, inputs, outputs, req, std_d0, std_d1, std_d2, length, channel);
   } else if (in_data.ndim() == 4) {
     // 4D input (n, c, h, w)
     const int batch_size = in_data.shape_[0];
@@ -370,7 +407,9 @@ void NormalizeOpBackward(const nnvm::NodeAttrs &attrs,
 
     #pragma omp parallel for
     for (auto n = 0; n < batch_size; ++n) {
-      NormalizeBackwardImpl<xpu>(ctx, inputs, outputs, req, std_dev, length, channel, n*step);
+      NormalizeBackwardImpl<xpu>(ctx, inputs, outputs, req,
+                                 std_d0, std_d1, std_d2, length,
+                                 channel, n*step);
     }
   }
 }
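The end state of the series, then: per-channel mean/std reach the kernel as six plain floats, selected by a switch on the channel index, so every kernel argument is trivially copyable by value into a GPU launch. A standalone recap of that final selection-plus-normalize shape (plain C++ with OpenMP; normalize_channel is an illustrative stand-in for normalize_forward::Map, not MXNet code):

    #include <cstdio>
    #include <vector>

    // Normalize channel c of a CHW image; mean_d0..2 / std_d0..2 mirror the
    // kernel arguments introduced in patch 8. The operator only admits 1 or 3
    // channels, so c is always 0, 1, or 2; the defaults below are just a guard.
    void normalize_channel(float* out, const float* in, int c, int length,
                           float mean_d0, float mean_d1, float mean_d2,
                           float std_d0, float std_d1, float std_d2) {
      float mean = 0.0f, std = 1.0f;
      switch (c) {
        case 0: mean = mean_d0; std = std_d0; break;
        case 1: mean = mean_d1; std = std_d1; break;
        case 2: mean = mean_d2; std = std_d2; break;
      }
      #pragma omp parallel for
      for (int i = 0; i < length; ++i) {
        out[c * length + i] = (in[c * length + i] - mean) / std;
      }
    }

    int main() {
      const int channels = 3, length = 4;
      std::vector<float> in(channels * length, 2.0f), out(channels * length);
      for (int c = 0; c < channels; ++c) {
        normalize_channel(out.data(), in.data(), c, length,
                          0.f, 1.f, 2.f, 3.f, 2.f, 1.f);
      }
      std::printf("%f %f %f\n", out[0], out[length], out[2 * length]);
      // expected: 0.666667 0.500000 0.000000
      return 0;
    }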