From 3a042857c1e9c2aeae327f90cf3377d1fc482201 Mon Sep 17 00:00:00 2001
From: Sandeep Krishnamurthy
Date: Fri, 1 Feb 2019 17:13:52 -0800
Subject: [PATCH 1/8] parallelize on channel forward pass

---
 src/operator/image/image_random-inl.h | 40 +++++++++++++--------------
 1 file changed, 20 insertions(+), 20 deletions(-)

diff --git a/src/operator/image/image_random-inl.h b/src/operator/image/image_random-inl.h
index c9dd85af616f..59d64a9ad96b 100644
--- a/src/operator/image/image_random-inl.h
+++ b/src/operator/image/image_random-inl.h
@@ -217,37 +217,37 @@ inline bool NormalizeOpType(const nnvm::NodeAttrs& attrs,
 
 template<int req>
 struct normalize_forward {
   template<typename DType>
-  MSHADOW_XINLINE static void Map(int j, DType* out_data, const DType* in_data,
-                                  const int i, const int length, const int step,
-                                  const DType mean, const DType std_dev) {
-    KERNEL_ASSIGN(out_data[step + i*length + j], req,
-                  (in_data[step + i*length + j] - mean) / std_dev);
+  MSHADOW_XINLINE static void Map(uint32_t c, DType* out_data, const DType* in_data,
+                                  const NormalizeParam &param, const int length,
+                                  const int step) {
+    DType mean = param.mean[param.mean.ndim() > c ? c : 0];
+    DType std_dev = param.std[param.std.ndim() > c ? c : 0];
+
+    #pragma omp parallel for
+    for (int i = 0; i < length; ++i) {
+      KERNEL_ASSIGN(out_data[step + c*length + i], req,
+                    (in_data[step + c*length + i] - mean) / std_dev);
+    }
   }
 };
 
 template<typename xpu>
 void NormalizeImpl(const OpContext &ctx,
-                   const std::vector<TBlob> &inputs,
-                   const std::vector<TBlob> &outputs,
-                   const std::vector<OpReqType> &req,
-                   const NormalizeParam &param,
-                   const int length,
-                   const uint32_t channel,
-                   const int step = 0) {
+                  const std::vector<TBlob> &inputs,
+                  const std::vector<TBlob> &outputs,
+                  const std::vector<OpReqType> &req,
+                  const NormalizeParam &param,
+                  const int length,
+                  const uint32_t channel,
+                  const int step = 0) {
   mshadow::Stream<xpu> *s = ctx.get_stream<xpu>();
 
   MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, {
     MXNET_ASSIGN_REQ_SWITCH(req[0], req_type, {
       DType* input = inputs[0].dptr<DType>();
       DType* output = outputs[0].dptr<DType>();
-
-      for (uint32_t i = 0; i < channel; ++i) {
-        DType mean = param.mean[param.mean.ndim() > i ? i : 0];
-        DType std_dev = param.std[param.std.ndim() > i ? i : 0];
-        mxnet_op::Kernel<normalize_forward<req_type>, xpu>::Launch(
-            s, length, output, input,
-            i, length, step, mean, std_dev);
-      }
+      mxnet_op::Kernel<normalize_forward<req_type>, xpu>::Launch(
+          s, channel, output, input, param, length, step);
     });
   });
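The change above inverts the launch structure: instead of launching one kernel per channel over all pixels, the kernel is launched once with one Map call per channel, and each Map call sweeps its channel's pixels with an OpenMP loop. A standalone sketch of that access pattern in plain C++ plus OpenMP (normalize_chw and the driver are illustrative names, not MXNet code):

    #include <cstdio>
    #include <vector>

    // One outer iteration per channel (the kernel's Map index); the per-channel
    // pixel loop is what the patch parallelizes with OpenMP.
    void normalize_chw(float* out, const float* in, int channels, int length,
                       const std::vector<float>& mean, const std::vector<float>& std_dev) {
      for (int c = 0; c < channels; ++c) {
        // Broadcast a single mean/std value when only a scalar was supplied.
        const float m = mean[mean.size() > static_cast<size_t>(c) ? c : 0];
        const float s = std_dev[std_dev.size() > static_cast<size_t>(c) ? c : 0];
        #pragma omp parallel for
        for (int i = 0; i < length; ++i) {
          out[c * length + i] = (in[c * length + i] - m) / s;
        }
      }
    }

    int main() {
      const int channels = 3, length = 4;
      std::vector<float> in(channels * length, 2.0f), out(channels * length);
      normalize_chw(out.data(), in.data(), channels, length,
                    {0.f, 1.f, 2.f}, {3.f, 2.f, 1.f});
      std::printf("%f %f %f\n", out[0], out[length], out[2 * length]);
      // expected: 0.666667 0.500000 0.000000
      return 0;
    }

Compiled with -fopenmp the inner loop runs across threads; without it the pragma is ignored and the result is identical.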
From 9fe52f0577d6ce9f905523b67eda6de7c461d9a5 Mon Sep 17 00:00:00 2001
From: Sandeep Krishnamurthy
Date: Fri, 1 Feb 2019 17:27:33 -0800
Subject: [PATCH 2/8] parallelize on channel normalize backward pass

---
 src/operator/image/image_random-inl.h | 27 ++++++++++++++-------------
 1 file changed, 14 insertions(+), 13 deletions(-)

diff --git a/src/operator/image/image_random-inl.h b/src/operator/image/image_random-inl.h
index 59d64a9ad96b..1b54ce094f9f 100644
--- a/src/operator/image/image_random-inl.h
+++ b/src/operator/image/image_random-inl.h
@@ -287,12 +287,16 @@ void NormalizeOpForward(const nnvm::NodeAttrs &attrs,
 
 template<int req>
 struct normalize_backward {
   template<typename DType>
-  MSHADOW_XINLINE static void Map(int j, DType* in_grad, const DType* out_grad,
-                                  const int i, const int length,
-                                  const int step, const DType std_dev) {
+  MSHADOW_XINLINE static void Map(uint32_t c, DType* in_grad, const DType* out_grad,
+                                  const NormalizeParam &param, const int length,
+                                  const int step) {
     // d/dx{(x - mean) / std_dev} => (1 / std_dev)
-    KERNEL_ASSIGN(in_grad[step + i*length + j], req,
-                  out_grad[step + i*length + j] * (1.0 / std_dev));
+    DType std_dev = param.std[param.std.ndim() > c ? c : 0];
+    #pragma omp parallel for
+    for (int i = 0; i < length; ++i) {
+      KERNEL_ASSIGN(in_grad[step + c*length + i], req,
+                    out_grad[step + c*length + i] * (1.0 / std_dev));
+    }
   }
 };
@@ -306,16 +310,13 @@ void NormalizeBackwardImpl(const OpContext &ctx,
                            const uint32_t channel,
                            const int step = 0) {
   mshadow::Stream<xpu> *s = ctx.get_stream<xpu>();
-  const TBlob& out_grad = inputs[0];
-  const TBlob& in_grad = outputs[0];
+
   MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, {
     MXNET_ASSIGN_REQ_SWITCH(req[0], req_type, {
-      for (uint32_t i = 0; i < channel; ++i) {
-        DType std_dev = param.std[param.std.ndim() > i ? i : 0];
-        mxnet_op::Kernel<normalize_backward<req_type>, xpu>::Launch(
-            s, length, in_grad.dptr<DType>(), out_grad.dptr<DType>(),
-            i, length, step, std_dev);
-      }
+      DType* out_grad = inputs[0].dptr<DType>();
+      DType* in_grad = outputs[0].dptr<DType>();
+      mxnet_op::Kernel<normalize_backward<req_type>, xpu>::Launch(
+          s, channel, in_grad, out_grad, param, length, step);
     });
   });
 }
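The derivative used in the backward kernel is worth a one-line check: for f(x) = (x - mean) / std_dev, df/dx = 1 / std_dev, so the backward pass only scales out_grad. A quick standalone verification against a central finite difference (plain C++, illustrative only, not MXNet code):

    #include <cmath>
    #include <cstdio>

    int main() {
      const float mean = 1.5f, std_dev = 2.0f, x = 0.7f, eps = 1e-3f;
      auto f = [&](float v) { return (v - mean) / std_dev; };
      const float numeric = (f(x + eps) - f(x - eps)) / (2 * eps);  // central difference
      const float analytic = 1.0f / std_dev;  // the factor normalize_backward applies
      std::printf("numeric=%f analytic=%f diff=%g\n",
                  numeric, analytic, std::fabs(numeric - analytic));
      return 0;
    }

From 3b6164faf617167a57de037b50749bcb2c5734df Mon Sep 17 00:00:00 2001
From: Sandeep Krishnamurthy
Date: Fri, 1 Feb 2019 18:57:14 -0800
Subject: [PATCH 3/8] Fix lint issues

---
 src/operator/image/image_random-inl.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/operator/image/image_random-inl.h b/src/operator/image/image_random-inl.h
index 1b54ce094f9f..e65b68667980 100644
--- a/src/operator/image/image_random-inl.h
+++ b/src/operator/image/image_random-inl.h
@@ -218,11 +218,11 @@ template<int req>
 struct normalize_forward {
   template<typename DType>
   MSHADOW_XINLINE static void Map(uint32_t c, DType* out_data, const DType* in_data,
-                                  const NormalizeParam &param, const int length, 
+                                  const NormalizeParam &param, const int length,
                                   const int step) {
     DType mean = param.mean[param.mean.ndim() > c ? c : 0];
     DType std_dev = param.std[param.std.ndim() > c ? c : 0];
-    
+
     #pragma omp parallel for
     for (int i = 0; i < length; ++i) {
       KERNEL_ASSIGN(out_data[step + c*length + i], req,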
From 67cde94486a61e83b673dd81101a274427681e50 Mon Sep 17 00:00:00 2001
From: Sandeep Krishnamurthy
Date: Mon, 4 Feb 2019 17:13:33 -0800
Subject: [PATCH 4/8] Trying to fix CI build failure on GPU

---
 src/operator/image/image_random-inl.h | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/src/operator/image/image_random-inl.h b/src/operator/image/image_random-inl.h
index e65b68667980..133d12fffad0 100644
--- a/src/operator/image/image_random-inl.h
+++ b/src/operator/image/image_random-inl.h
@@ -220,8 +220,20 @@ struct normalize_forward {
   MSHADOW_XINLINE static void Map(uint32_t c, DType* out_data, const DType* in_data,
                                   const NormalizeParam &param, const int length,
                                   const int step) {
-    DType mean = param.mean[param.mean.ndim() > c ? c : 0];
-    DType std_dev = param.std[param.std.ndim() > c ? c : 0];
+    int mean_idx, std_idx;
+    if (param.mean.ndim() > c) {
+      mean_idx = c;
+    } else {
+      mean_idx = 0;
+    }
+
+    if (param.std.ndim() > c) {
+      std_idx = c;
+    } else {
+      std_idx = 0;
+    }
+    DType mean = param.mean[mean_idx];
+    DType std_dev = param.std[std_idx];
 
     #pragma omp parallel for
     for (int i = 0; i < length; ++i) {

From cdc08ded37e33981ec65a9cee84fa14e24ba717b Mon Sep 17 00:00:00 2001
From: Sandeep Krishnamurthy
Date: Mon, 4 Feb 2019 23:06:33 -0800
Subject: [PATCH 5/8] Fix failing GPU test on CI

Do not pass normalize param as is to GPU kernel
---
 src/operator/image/image_random-inl.h | 38 +++++++++++----------------
 1 file changed, 15 insertions(+), 23 deletions(-)

diff --git a/src/operator/image/image_random-inl.h b/src/operator/image/image_random-inl.h
index 133d12fffad0..53f57e71f487 100644
--- a/src/operator/image/image_random-inl.h
+++ b/src/operator/image/image_random-inl.h
@@ -218,27 +218,16 @@ template<int req>
 struct normalize_forward {
   template<typename DType>
   MSHADOW_XINLINE static void Map(uint32_t c, DType* out_data, const DType* in_data,
-                                  const NormalizeParam &param, const int length,
-                                  const int step) {
-    int mean_idx, std_idx;
-    if (param.mean.ndim() > c) {
-      mean_idx = c;
-    } else {
-      mean_idx = 0;
-    }
-
-    if (param.std.ndim() > c) {
-      std_idx = c;
-    } else {
-      std_idx = 0;
-    }
-    DType mean = param.mean[mean_idx];
-    DType std_dev = param.std[std_idx];
+                                  const float* mean, const uint32_t mean_ndim,
+                                  const float* std_dev, const uint32_t std_dev_ndim,
+                                  const int length, const int step) {
+    DType m = mean[mean_ndim > c ? c : 0];
+    DType s = std_dev[std_dev_ndim > c ? c : 0];
 
     #pragma omp parallel for
     for (int i = 0; i < length; ++i) {
       KERNEL_ASSIGN(out_data[step + c*length + i], req,
-                    (in_data[step + c*length + i] - mean) / std_dev);
+                    (in_data[step + c*length + i] - m) / s);
     }
   }
 };
@@ -259,7 +248,8 @@ void NormalizeImpl(const OpContext &ctx,
       DType* input = inputs[0].dptr<DType>();
       DType* output = outputs[0].dptr<DType>();
       mxnet_op::Kernel<normalize_forward<req_type>, xpu>::Launch(
-          s, channel, output, input, param, length, step);
+          s, channel, output, input, param.mean.begin(), param.mean.ndim(),
+          param.std.begin(), param.std.ndim(), length, step);
     });
   });
 }
@@ -300,14 +290,15 @@ template<int req>
 struct normalize_backward {
   template<typename DType>
   MSHADOW_XINLINE static void Map(uint32_t c, DType* in_grad, const DType* out_grad,
-                                  const NormalizeParam &param, const int length,
-                                  const int step) {
+                                  const float* std_dev, const uint32_t std_dev_ndim,
+                                  const int length, const int step) {
     // d/dx{(x - mean) / std_dev} => (1 / std_dev)
-    DType std_dev = param.std[param.std.ndim() > c ? c : 0];
+    DType s = std_dev[std_dev_ndim > c ? c : 0];
+
     #pragma omp parallel for
     for (int i = 0; i < length; ++i) {
       KERNEL_ASSIGN(in_grad[step + c*length + i], req,
-                    out_grad[step + c*length + i] * (1.0 / std_dev));
+                    out_grad[step + c*length + i] * (1.0 / s));
     }
   }
 };
@@ -328,7 +319,8 @@ void NormalizeBackwardImpl(const OpContext &ctx,
       DType* out_grad = inputs[0].dptr<DType>();
      DType* in_grad = outputs[0].dptr<DType>();
       mxnet_op::Kernel<normalize_backward<req_type>, xpu>::Launch(
-          s, channel, in_grad, out_grad, param, length, step);
+          s, channel, in_grad, out_grad, param.std.begin(),
+          param.std.ndim(), length, step);
     });
   });
 }
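Patch 5 stops handing the kernel the whole NormalizeParam and instead passes a raw float pointer plus an element count, with the broadcast rule (fall back to element 0 when only a scalar was supplied) applied inside the kernel. A minimal sketch of just that selection rule (plain C++; select and the test values are illustrative, not MXNet code):

    #include <cstdio>

    // Per-channel lookup with scalar fallback: mirrors `values[ndim > c ? c : 0]`
    // as used by normalize_forward and normalize_backward above.
    float select(const float* values, unsigned ndim, unsigned c) {
      return values[ndim > c ? c : 0];
    }

    int main() {
      const float scalar_std[1] = {2.0f};
      const float per_channel_std[3] = {3.0f, 2.0f, 1.0f};
      // Channel 2: the scalar input broadcasts, the 3-element input indexes directly.
      std::printf("%f %f\n", select(scalar_std, 1, 2), select(per_channel_std, 3, 2));
      // expected: 2.000000 1.000000
      return 0;
    }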
From 21df212f258e8c969584801f4ea9f3c89171ec8d Mon Sep 17 00:00:00 2001
From: Sandeep Krishnamurthy
Date: Tue, 5 Feb 2019 10:29:10 -0800
Subject: [PATCH 6/8] Fix to_tensor tests

---
 tests/python/gpu/test_gluon_transforms.py | 33 ++++++-----------------
 1 file changed, 8 insertions(+), 25 deletions(-)

diff --git a/tests/python/gpu/test_gluon_transforms.py b/tests/python/gpu/test_gluon_transforms.py
index 3927d4c1f094..23b34d334888 100644
--- a/tests/python/gpu/test_gluon_transforms.py
+++ b/tests/python/gpu/test_gluon_transforms.py
@@ -80,32 +80,15 @@ def test_to_tensor():
         data_in.astype(dtype=np.float32) / 255.0, (2, 0, 1)))
 
     # 4D Input
-    data_in_4d = nd.random.uniform(0, 1, (2, 3, 300, 300))
-    out_nd_4d = transforms.Normalize(mean=(0, 1, 2), std=(3, 2, 1))(data_in_4d)
-    data_expected_4d = data_in_4d.asnumpy()
-    data_expected_4d[0][:][:][0] = data_expected_4d[0][:][:][0] / 3.0
-    data_expected_4d[0][:][:][1] = (data_expected_4d[0][:][:][1] - 1.0) / 2.0
-    data_expected_4d[0][:][:][2] = data_expected_4d[0][:][:][2] - 2.0
-    data_expected_4d[1][:][:][0] = data_expected_4d[1][:][:][0] / 3.0
-    data_expected_4d[1][:][:][1] = (data_expected_4d[1][:][:][1] - 1.0) / 2.0
-    data_expected_4d[1][:][:][2] = data_expected_4d[1][:][:][2] - 2.0
-    assert_almost_equal(data_expected_4d, out_nd_4d.asnumpy())
-
-    # Default normalize values i.e., mean=0, std=1
-    data_in_3d_def = nd.random.uniform(0, 1, (3, 300, 300))
-    out_nd_3d_def = transforms.Normalize()(data_in_3d_def)
-    data_expected_3d_def = data_in_3d_def.asnumpy()
-    assert_almost_equal(data_expected_3d_def, out_nd_3d_def.asnumpy())
-
-    # Invalid Input - Neither 3D or 4D input
-    invalid_data_in = nd.random.uniform(0, 1, (5, 5, 3, 300, 300))
-    normalize_transformer = transforms.Normalize(mean=(0, 1, 2), std=(3, 2, 1))
-    assertRaises(MXNetError, normalize_transformer, invalid_data_in)
+    data_in = np.random.uniform(0, 255, (5, 300, 300, 3)).astype(dtype=np.uint8)
+    out_nd = transforms.ToTensor()(nd.array(data_in, dtype='uint8'))
+    assert_almost_equal(out_nd.asnumpy(), np.transpose(
+        data_in.astype(dtype=np.float32) / 255.0, (0, 3, 1, 2)))
 
-    # Invalid Input - Channel neither 1 or 3
-    invalid_data_in = nd.random.uniform(0, 1, (5, 4, 300, 300))
-    normalize_transformer = transforms.Normalize(mean=(0, 1, 2), std=(3, 2, 1))
-    assertRaises(MXNetError, normalize_transformer, invalid_data_in)
+    # Invalid Input
+    invalid_data_in = nd.random.uniform(0, 255, (5, 5, 300, 300, 3)).astype(dtype=np.uint8)
+    transformer = transforms.ToTensor()
+    assertRaises(MXNetError, transformer, invalid_data_in)
 
 @with_seed()
 def test_resize():
From ba4c161166daeff5ad0be73cbdb42e15661e3c55 Mon Sep 17 00:00:00 2001
From: Sandeep Krishnamurthy
Date: Tue, 5 Feb 2019 11:35:07 -0800
Subject: [PATCH 7/8] Pass mean and std_dev as native types for kernel

---
 src/operator/image/image_random-inl.h | 48 ++++++++++++++++++---------
 1 file changed, 32 insertions(+), 16 deletions(-)

diff --git a/src/operator/image/image_random-inl.h b/src/operator/image/image_random-inl.h
index 53f57e71f487..b4c0fe5d8147 100644
--- a/src/operator/image/image_random-inl.h
+++ b/src/operator/image/image_random-inl.h
@@ -218,11 +218,10 @@ template<int req>
 struct normalize_forward {
   template<typename DType>
   MSHADOW_XINLINE static void Map(uint32_t c, DType* out_data, const DType* in_data,
-                                  const float* mean, const uint32_t mean_ndim,
-                                  const float* std_dev, const uint32_t std_dev_ndim,
+                                  const std::vector<float> &mean, const std::vector<float> &std_dev,
                                   const int length, const int step) {
-    DType m = mean[mean_ndim > c ? c : 0];
-    DType s = std_dev[std_dev_ndim > c ? c : 0];
+    DType m = mean[mean.size() > c ? c : 0];
+    DType s = std_dev[std_dev.size() > c ? c : 0];
 
     #pragma omp parallel for
     for (int i = 0; i < length; ++i) {
@@ -237,7 +236,8 @@ void NormalizeImpl(const OpContext &ctx,
                   const std::vector<TBlob> &inputs,
                   const std::vector<TBlob> &outputs,
                   const std::vector<OpReqType> &req,
-                  const NormalizeParam &param,
+                  const std::vector<float> &mean,
+                  const std::vector<float> &std_dev,
                   const int length,
                   const uint32_t channel,
                   const int step = 0) {
@@ -248,8 +248,7 @@ void NormalizeImpl(const OpContext &ctx,
       DType* input = inputs[0].dptr<DType>();
       DType* output = outputs[0].dptr<DType>();
       mxnet_op::Kernel<normalize_forward<req_type>, xpu>::Launch(
-          s, channel, output, input, param.mean.begin(), param.mean.ndim(),
-          param.std.begin(), param.std.ndim(), length, step);
+          s, channel, output, input, mean, std_dev, length, step);
     });
   });
 }
@@ -266,11 +265,23 @@ void NormalizeOpForward(const nnvm::NodeAttrs &attrs,
 
   const NormalizeParam &param = nnvm::get<NormalizeParam>(attrs.parsed);
 
+  // Prepare mean and std_dev vector
+  std::vector<float> mean(param.mean.ndim());
+  std::vector<float> std_dev(param.std.ndim());
+
+  for (uint32_t idx = 0; idx < param.mean.ndim(); ++idx) {
+    mean[idx] = param.mean[idx];
+  }
+
+  for (uint32_t idx = 0; idx < param.std.ndim(); ++idx) {
+    std_dev[idx] = param.std[idx];
+  }
+
   // 3D input (c, h, w)
   if (inputs[0].ndim() == 3) {
     const int length = inputs[0].shape_[1] * inputs[0].shape_[2];
     const uint32_t channel = inputs[0].shape_[0];
-    NormalizeImpl<xpu>(ctx, inputs, outputs, req, param, length, channel);
+    NormalizeImpl<xpu>(ctx, inputs, outputs, req, mean, std_dev, length, channel);
   } else if (inputs[0].ndim() == 4) {
     // 4D input (n, c, h, w)
     const int batch_size = inputs[0].shape_[0];
@@ -280,7 +291,7 @@ void NormalizeOpForward(const nnvm::NodeAttrs &attrs,
 
     #pragma omp parallel for
     for (auto n = 0; n < batch_size; ++n) {
-      NormalizeImpl<xpu>(ctx, inputs, outputs, req, param, length, channel, n*step);
+      NormalizeImpl<xpu>(ctx, inputs, outputs, req, mean, std_dev, length, channel, n*step);
     }
   }
 }
@@ -290,10 +301,10 @@ template<int req>
 struct normalize_backward {
   template<typename DType>
   MSHADOW_XINLINE static void Map(uint32_t c, DType* in_grad, const DType* out_grad,
-                                  const float* std_dev, const uint32_t std_dev_ndim,
+                                  const std::vector<float> &std_dev,
                                   const int length, const int step) {
     // d/dx{(x - mean) / std_dev} => (1 / std_dev)
-    DType s = std_dev[std_dev_ndim > c ? c : 0];
+    DType s = std_dev[std_dev.size() > c ? c : 0];
 
     #pragma omp parallel for
     for (int i = 0; i < length; ++i) {
@@ -308,7 +319,7 @@ void NormalizeBackwardImpl(const OpContext &ctx,
                            const std::vector<TBlob> &inputs,
                            const std::vector<TBlob> &outputs,
                            const std::vector<OpReqType> &req,
-                           const NormalizeParam &param,
+                           const std::vector<float> &std_dev,
                            const int length,
                            const uint32_t channel,
                            const int step = 0) {
@@ -319,8 +330,7 @@ void NormalizeBackwardImpl(const OpContext &ctx,
       DType* out_grad = inputs[0].dptr<DType>();
       DType* in_grad = outputs[0].dptr<DType>();
       mxnet_op::Kernel<normalize_backward<req_type>, xpu>::Launch(
-          s, channel, in_grad, out_grad, param.std.begin(),
-          param.std.ndim(), length, step);
+          s, channel, in_grad, out_grad, std_dev, length, step);
     });
   });
 }
@@ -335,7 +345,13 @@ void NormalizeOpBackward(const nnvm::NodeAttrs &attrs,
   CHECK_EQ(outputs.size(), 1U);
   CHECK_EQ(req.size(), 1U);
 
+  // Prepare std_dev vector
   const NormalizeParam &param = nnvm::get<NormalizeParam>(attrs.parsed);
+  std::vector<float> std_dev(param.std.ndim());
+
+  for (uint32_t idx = 0; idx < param.std.ndim(); ++idx) {
+    std_dev[idx] = param.std[idx];
+  }
 
   // Note: inputs[0] is out_grad
   const TBlob& in_data = inputs[1];
@@ -344,7 +360,7 @@ void NormalizeOpBackward(const nnvm::NodeAttrs &attrs,
   if (in_data.ndim() == 3) {
     const int length = in_data.shape_[1] * in_data.shape_[2];
     const uint32_t channel = in_data.shape_[0];
-    NormalizeBackwardImpl<xpu>(ctx, inputs, outputs, req, param, length, channel);
+    NormalizeBackwardImpl<xpu>(ctx, inputs, outputs, req, std_dev, length, channel);
   } else if (in_data.ndim() == 4) {
     // 4D input (n, c, h, w)
     const int batch_size = in_data.shape_[0];
@@ -354,7 +370,7 @@ void NormalizeOpBackward(const nnvm::NodeAttrs &attrs,
 
     #pragma omp parallel for
     for (auto n = 0; n < batch_size; ++n) {
-      NormalizeBackwardImpl<xpu>(ctx, inputs, outputs, req, param, length, channel, n*step);
+      NormalizeBackwardImpl<xpu>(ctx, inputs, outputs, req, std_dev, length, channel, n*step);
    }
  }
}
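Throughout these patches the 4D path dispatches one NormalizeImpl call per sample n with offset n*step, where step is the number of elements per sample; the kernels then address pixel i of channel c as step + c*length + i. A standalone sketch of that flat-index arithmetic for an NCHW tensor (plain C++, illustrative only):

    #include <cstdio>
    #include <vector>

    int main() {
      const int batch = 2, channels = 3, h = 2, w = 2;
      const int length = h * w;            // pixels per channel
      const int step = channels * length;  // elements per sample
      std::vector<float> img(batch * step);
      for (int idx = 0; idx < batch * step; ++idx) img[idx] = static_cast<float>(idx);

      // Flat address of pixel i in channel c of sample n, as the kernels compute it.
      const int n = 1, c = 2, i = 3;
      const int flat = n * step + c * length + i;  // 1*12 + 2*4 + 3 = 23
      std::printf("flat index = %d, value = %f\n", flat, img[flat]);
      return 0;
    }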
From c6c6829a9722047e3add0c04bf3f33630a1198df Mon Sep 17 00:00:00 2001
From: Sandeep Krishnamurthy
Date: Tue, 5 Feb 2019 15:02:25 -0800
Subject: [PATCH 8/8] Fix CI failure. Do not pass mean, std as vector to kernel

---
 src/operator/image/image_random-inl.h | 99 +++++++++++++++++++--------
 1 file changed, 69 insertions(+), 30 deletions(-)

diff --git a/src/operator/image/image_random-inl.h b/src/operator/image/image_random-inl.h
index b4c0fe5d8147..448016341f21 100644
--- a/src/operator/image/image_random-inl.h
+++ b/src/operator/image/image_random-inl.h
@@ -218,15 +218,25 @@ template<int req>
 struct normalize_forward {
   template<typename DType>
   MSHADOW_XINLINE static void Map(uint32_t c, DType* out_data, const DType* in_data,
-                                  const std::vector<float> &mean, const std::vector<float> &std_dev,
+                                  const float mean_d0, const float mean_d1, const float mean_d2,
+                                  const float std_d0, const float std_d1, const float std_d2,
                                   const int length, const int step) {
-    DType m = mean[mean.size() > c ? c : 0];
-    DType s = std_dev[std_dev.size() > c ? c : 0];
-
+    float mean, std;
+    switch (c) {
+      case 0 : mean = mean_d0;
+               std = std_d0;
+               break;
+      case 1 : mean = mean_d1;
+               std = std_d1;
+               break;
+      case 2 : mean = mean_d2;
+               std = std_d2;
+               break;
+    }
     #pragma omp parallel for
     for (int i = 0; i < length; ++i) {
       KERNEL_ASSIGN(out_data[step + c*length + i], req,
-                    (in_data[step + c*length + i] - m) / s);
+                    (in_data[step + c*length + i] - mean) / std);
     }
   }
 };
@@ -236,8 +246,9 @@ void NormalizeImpl(const OpContext &ctx,
                   const std::vector<TBlob> &inputs,
                   const std::vector<TBlob> &outputs,
                   const std::vector<OpReqType> &req,
-                  const std::vector<float> &mean,
-                  const std::vector<float> &std_dev,
+                  const float mean_d0, const float mean_d1,
+                  const float mean_d2, const float std_d0,
+                  const float std_d1, const float std_d2,
                   const int length,
                   const uint32_t channel,
                   const int step = 0) {
@@ -248,7 +259,8 @@ void NormalizeImpl(const OpContext &ctx,
       DType* input = inputs[0].dptr<DType>();
       DType* output = outputs[0].dptr<DType>();
       mxnet_op::Kernel<normalize_forward<req_type>, xpu>::Launch(
-          s, channel, output, input, mean, std_dev, length, step);
+          s, channel, output, input, mean_d0, mean_d1, mean_d2,
+          std_d0, std_d1, std_d2, length, step);
     });
   });
 }
@@ -265,23 +277,35 @@ void NormalizeOpForward(const nnvm::NodeAttrs &attrs,
 
   const NormalizeParam &param = nnvm::get<NormalizeParam>(attrs.parsed);
 
-  // Prepare mean and std_dev vector
-  std::vector<float> mean(param.mean.ndim());
-  std::vector<float> std_dev(param.std.ndim());
-
-  for (uint32_t idx = 0; idx < param.mean.ndim(); ++idx) {
-    mean[idx] = param.mean[idx];
+  // Note: We need mean and std_dev in the kernel.
+  // It is costly (device copy) to pass it as vector, for gpu kernel.
+  // Hence, passing it as below for performance.
+  float mean_d0, mean_d1, mean_d2;
+  float std_d0, std_d1, std_d2;
+
+  // Mean and Std can be 1 or 3 D only.
+  if (param.mean.ndim() == 1) {
+    mean_d0 = mean_d1 = mean_d2 = param.mean[0];
+  } else {
+    mean_d0 = param.mean[0];
+    mean_d1 = param.mean[1];
+    mean_d2 = param.mean[2];
   }
 
-  for (uint32_t idx = 0; idx < param.std.ndim(); ++idx) {
-    std_dev[idx] = param.std[idx];
+  if (param.std.ndim() == 1) {
+    std_d0 = std_d1 = std_d2 = param.std[0];
+  } else {
+    std_d0 = param.std[0];
+    std_d1 = param.std[1];
+    std_d2 = param.std[2];
   }
 
   // 3D input (c, h, w)
   if (inputs[0].ndim() == 3) {
     const int length = inputs[0].shape_[1] * inputs[0].shape_[2];
     const uint32_t channel = inputs[0].shape_[0];
-    NormalizeImpl<xpu>(ctx, inputs, outputs, req, mean, std_dev, length, channel);
+    NormalizeImpl<xpu>(ctx, inputs, outputs, req, mean_d0, mean_d1, mean_d2,
+                       std_d0, std_d1, std_d2, length, channel);
   } else if (inputs[0].ndim() == 4) {
     // 4D input (n, c, h, w)
     const int batch_size = inputs[0].shape_[0];
@@ -291,7 +315,8 @@ void NormalizeOpForward(const nnvm::NodeAttrs &attrs,
 
     #pragma omp parallel for
     for (auto n = 0; n < batch_size; ++n) {
-      NormalizeImpl<xpu>(ctx, inputs, outputs, req, mean, std_dev, length, channel, n*step);
+      NormalizeImpl<xpu>(ctx, inputs, outputs, req, mean_d0, mean_d1, mean_d2,
+                         std_d0, std_d1, std_d2, length, channel, n*step);
     }
   }
 }
@@ -301,15 +326,23 @@ template<int req>
 struct normalize_backward {
   template<typename DType>
   MSHADOW_XINLINE static void Map(uint32_t c, DType* in_grad, const DType* out_grad,
-                                  const std::vector<float> &std_dev,
+                                  const float std_d0, const float std_d1, const float std_d2,
                                   const int length, const int step) {
     // d/dx{(x - mean) / std_dev} => (1 / std_dev)
-    DType s = std_dev[std_dev.size() > c ? c : 0];
+    float std_dev;
+    switch (c) {
+      case 0 : std_dev = std_d0;
+               break;
+      case 1 : std_dev = std_d1;
+               break;
+      case 2 : std_dev = std_d2;
+               break;
+    }
 
     #pragma omp parallel for
     for (int i = 0; i < length; ++i) {
       KERNEL_ASSIGN(in_grad[step + c*length + i], req,
-                    out_grad[step + c*length + i] * (1.0 / s));
+                    out_grad[step + c*length + i] * (1.0 / std_dev));
     }
   }
 };
@@ -319,7 +352,7 @@ void NormalizeBackwardImpl(const OpContext &ctx,
                            const std::vector<TBlob> &inputs,
                            const std::vector<TBlob> &outputs,
                            const std::vector<OpReqType> &req,
-                           const std::vector<float> &std_dev,
+                           const float std_d0, const float std_d1, const float std_d2,
                            const int length,
                            const uint32_t channel,
                            const int step = 0) {
@@ -330,7 +363,7 @@ void NormalizeBackwardImpl(const OpContext &ctx,
       DType* out_grad = inputs[0].dptr<DType>();
       DType* in_grad = outputs[0].dptr<DType>();
       mxnet_op::Kernel<normalize_backward<req_type>, xpu>::Launch(
-          s, channel, in_grad, out_grad, std_dev, length, step);
+          s, channel, in_grad, out_grad, std_d0, std_d1, std_d2, length, step);
     });
   });
 }
@@ -345,13 +378,17 @@ void NormalizeOpBackward(const nnvm::NodeAttrs &attrs,
   CHECK_EQ(outputs.size(), 1U);
   CHECK_EQ(req.size(), 1U);
 
-  // Prepare std_dev vector
   const NormalizeParam &param = nnvm::get<NormalizeParam>(attrs.parsed);
-  std::vector<float> std_dev(param.std.ndim());
-
-  for (uint32_t idx = 0; idx < param.std.ndim(); ++idx) {
-    std_dev[idx] = param.std[idx];
+  float std_d0, std_d1, std_d2;
+
+  // Std can be 1 or 3 D only
+  if (param.std.ndim() == 1) {
+    std_d0 = std_d1 = std_d2 = param.std[0];
+  } else {
+    std_d0 = param.std[0];
+    std_d1 = param.std[1];
+    std_d2 = param.std[2];
   }
 
   // Note: inputs[0] is out_grad
   const TBlob& in_data = inputs[1];
@@ -360,7 +397,7 @@ void NormalizeOpBackward(const nnvm::NodeAttrs &attrs,
   if (in_data.ndim() == 3) {
     const int length = in_data.shape_[1] * in_data.shape_[2];
     const uint32_t channel = in_data.shape_[0];
-    NormalizeBackwardImpl<xpu>(ctx, inputs, outputs, req, std_dev, length, channel);
+    NormalizeBackwardImpl<xpu>(ctx, inputs, outputs, req, std_d0, std_d1, std_d2, length, channel);
   } else if (in_data.ndim() == 4) {
     // 4D input (n, c, h, w)
     const int batch_size = in_data.shape_[0];
@@ -370,7 +407,9 @@ void NormalizeOpBackward(const nnvm::NodeAttrs &attrs,
 
     #pragma omp parallel for
     for (auto n = 0; n < batch_size; ++n) {
-      NormalizeBackwardImpl<xpu>(ctx, inputs, outputs, req, std_dev, length, channel, n*step);
+      NormalizeBackwardImpl<xpu>(ctx, inputs, outputs, req,
+                                 std_d0, std_d1, std_d2, length,
+                                 channel, n*step);
     }
   }
 }
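The end state of the series, then: per-channel mean/std reach the kernel as six plain floats, selected by a switch on the channel index, so every kernel argument is trivially copyable by value into a GPU launch. A standalone recap of that final selection-plus-normalize shape (plain C++ with OpenMP; normalize_channel is an illustrative stand-in for normalize_forward::Map, not MXNet code):

    #include <cstdio>
    #include <vector>

    // Normalize channel c of a CHW image; mean_d0..2 / std_d0..2 mirror the
    // kernel arguments introduced in patch 8. The operator only admits 1 or 3
    // channels, so c is always 0, 1, or 2; the defaults below are just a guard.
    void normalize_channel(float* out, const float* in, int c, int length,
                           float mean_d0, float mean_d1, float mean_d2,
                           float std_d0, float std_d1, float std_d2) {
      float mean = 0.0f, std = 1.0f;
      switch (c) {
        case 0: mean = mean_d0; std = std_d0; break;
        case 1: mean = mean_d1; std = std_d1; break;
        case 2: mean = mean_d2; std = std_d2; break;
      }
      #pragma omp parallel for
      for (int i = 0; i < length; ++i) {
        out[c * length + i] = (in[c * length + i] - mean) / std;
      }
    }

    int main() {
      const int channels = 3, length = 4;
      std::vector<float> in(channels * length, 2.0f), out(channels * length);
      for (int c = 0; c < channels; ++c) {
        normalize_channel(out.data(), in.data(), c, length,
                          0.f, 1.f, 2.f, 3.f, 2.f, 1.f);
      }
      std::printf("%f %f %f\n", out[0], out[length], out[2 * length]);
      // expected: 0.666667 0.500000 0.000000
      return 0;
    }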