From 0107e3a9d54b2472266dbcc52f5a34b18fa86385 Mon Sep 17 00:00:00 2001
From: Wuxun Zhang
Date: Mon, 22 Jul 2019 07:25:30 +0800
Subject: [PATCH] Add omp parallel optimization for _contrib_BilinearResize2D
 (#15584)

* Add omp parallel optimization for bilinear_resize op

* retrigger CI

* retrigger CI

* trigger CI
---
 src/operator/contrib/bilinear_resize.cc | 146 ++++++++++++++----------
 tests/python/gpu/test_operator_gpu.py   |  16 +++
 tests/python/unittest/test_operator.py  |   2 +-
 3 files changed, 101 insertions(+), 63 deletions(-)

diff --git a/src/operator/contrib/bilinear_resize.cc b/src/operator/contrib/bilinear_resize.cc
index 441ea53ad9c6..346324753a26 100644
--- a/src/operator/contrib/bilinear_resize.cc
+++ b/src/operator/contrib/bilinear_resize.cc
@@ -23,7 +23,6 @@
  * \author Hang Zhang
 */
 #include "bilinear_resize-inl.h"
-// #include "elemwise_op_common.h"
 #include "../elemwise_op_common.h"
 
 namespace mxnet {
@@ -44,56 +43,66 @@ void SpatialUpSamplingBilinearUpdateOutput(mshadow::Stream<cpu> *s,
   int inputHeight = itensor.size(2);
   int inputWidth = itensor.size(3);
 
+  const auto nthreads = engine::OpenMP::Get()->GetRecommendedOMPThreadCount();
+
   DType *idata = itensor.dptr_;
   DType *odata = otensor.dptr_;
   channels = nbatch * channels;
+  const int input_elems_per_channel = inputWidth * inputHeight;
+  const int output_elems_per_channel = outputWidth * outputHeight;
+
   // special case: just copy
   if (inputHeight == outputHeight && inputWidth == outputWidth) {
-    for (int h2 = 0; h2 < outputHeight; ++h2) {
+#pragma omp parallel for num_threads(nthreads)
+    for (int index = 0; index < output_elems_per_channel; index++) {
+      const int h2 = index / outputWidth;
       const int h1 = h2;
-      for (int w2 = 0; w2 < outputWidth; ++w2) {
-        const int w1 = w2;
-        const DType* pos1 = &idata[h1 * inputWidth + w1];
-        DType* pos2 = &odata[h2 * outputWidth + w2];
-        for (int c = 0; c < channels; ++c) {
-          pos2[0] = pos1[0];
-          pos1 += inputWidth * inputHeight;
-          pos2 += outputWidth * outputHeight;
-        }
+      const int w2 = index % outputWidth;
+      const int w1 = w2;
+      const DType* pos1 = &idata[h1 * inputWidth + w1];
+      DType* pos2 = &odata[index];
+      for (int c = 0; c < channels; ++c) {
+        *pos2 = *pos1;
+        pos1 += input_elems_per_channel;
+        pos2 += output_elems_per_channel;
       }
     }
     return;
   }
+
   const float rheight =(outputHeight > 1) ? static_cast<float>(inputHeight - 1)/
                        (outputHeight - 1) : 0.f;
   const float rwidth = (outputWidth > 1) ? static_cast<float>(inputWidth - 1) /
                        (outputWidth - 1) : 0.f;
-  for (int h2 = 0; h2 < outputHeight; ++h2) {
+#pragma omp parallel for num_threads(nthreads)
+  for (int index = 0; index < output_elems_per_channel; index++) {
+    const int h2 = index / outputWidth;
+    const int w2 = index % outputWidth;
+
     const float h1r = rheight * h2;
     const int h1 = h1r;
     const int h1p = (h1 < inputHeight - 1) ? 1 : 0;
     const DType h1lambda = h1r - h1;
     const DType h0lambda = (DType)1. - h1lambda;
-    for (int w2 = 0; w2 < outputWidth; ++w2) {
-      const float w1r = rwidth * w2;
-      const int w1 = w1r;
-      const int w1p = (w1 < inputWidth - 1) ? 1 : 0;
-      const DType w1lambda = w1r - w1;
-      const DType w0lambda = (DType)1. - w1lambda;
-      const DType* pos1 = &idata[h1 * inputWidth + w1];
-      DType* pos2 = &odata[h2 * outputWidth + w2];
-      for (int c = 0; c < channels; ++c) {
-        pos2[0] = h0lambda * (w0lambda * pos1[0]+ w1lambda * pos1[w1p])
-                    + h1lambda * (w0lambda * pos1[h1p * inputWidth]
-                    + w1lambda * pos1[h1p * inputWidth + w1p]);
-        pos1 += inputWidth * inputHeight;
-        pos2 += outputWidth * outputHeight;
-      }
+
+    const float w1r = rwidth * w2;
+    const int w1 = w1r;
+    const int w1p = (w1 < inputWidth - 1) ? 1 : 0;
+    const DType w1lambda = w1r - w1;
+    const DType w0lambda = (DType)1. - w1lambda;
+    const DType* pos1 = &idata[h1 * inputWidth + w1];
+    DType* pos2 = &odata[index];
+
+    for (int c = 0; c < channels; ++c) {
+      *pos2 = h0lambda * (w0lambda * (*pos1) + w1lambda * *(pos1 + w1p))
+                + h1lambda * (w0lambda * *(pos1 + h1p * inputWidth)
+                + w1lambda * *(pos1 + h1p * inputWidth + w1p));
+      pos1 += input_elems_per_channel;
+      pos2 += output_elems_per_channel;
    }
   }
 }
-
 template<typename xpu, typename DType, typename AccReal>
 void SpatialUpSamplingBilinearUpdateGradInput(mshadow::Stream<cpu> *s,
                                               const std::vector<TBlob> &input,
@@ -109,23 +118,28 @@ void SpatialUpSamplingBilinearUpdateGradInput(mshadow::Stream<cpu> *s,
   int inputHeight = gradInput.size(2);
   int inputWidth = gradInput.size(3);
 
+  const auto nthreads = engine::OpenMP::Get()->GetRecommendedOMPThreadCount();
+
   DType *dataInput = gradInput.dptr_;
   DType *dataOutput = gradOutput.dptr_;
   channels = nbatch * channels;
+  const int input_elems_per_channel = inputWidth * inputHeight;
+  const int output_elems_per_channel = outputWidth * outputHeight;
 
   // special case: same-size matching grids
   if (inputHeight == outputHeight && inputWidth == outputWidth) {
-    for (int h2 = 0; h2 < outputHeight; ++h2) {
+#pragma omp parallel for num_threads(nthreads)
+    for (int index = 0; index < output_elems_per_channel; index++) {
+      const int h2 = index / outputWidth;
       const int h1 = h2;
-      for (int w2 = 0; w2 < outputWidth; ++w2) {
-        const int w1 = w2;
-        DType* pos1 = &dataInput[h1 * inputWidth + w1];
-        const DType* pos2 = &dataOutput[h2 * outputWidth + w2];
-        for (int c = 0; c < channels; ++c) {
-          pos1[0] += pos2[0];
-          pos1 += inputWidth * inputHeight;
-          pos2 += outputWidth * outputHeight;
-        }
+      const int w2 = index % outputWidth;
+      const int w1 = w2;
+      DType* pos1 = &dataInput[h1 * inputWidth + w1];
+      const DType* pos2 = &dataOutput[index];
+      for (int c = 0; c < channels; ++c) {
+        *pos1 += *pos2;
+        pos1 += input_elems_per_channel;
+        pos2 += output_elems_per_channel;
      }
     }
     return;
@@ -134,28 +148,36 @@ void SpatialUpSamplingBilinearUpdateGradInput(mshadow::Stream<cpu> *s,
                        (outputHeight - 1) : 0.f;
   const float rwidth = (outputWidth > 1) ? static_cast<float>(inputWidth - 1)/
                        (outputWidth - 1) : 0.f;
-  for (int h2 = 0; h2 < outputHeight; ++h2) {
+
+#pragma omp parallel for num_threads(nthreads)
+  for (int index = 0; index < output_elems_per_channel; index++) {
+    const int h2 = index / outputWidth;
+    const int w2 = index % outputWidth;
+
     const float h1r = rheight * h2;
     const int h1 = h1r;
     const int h1p = (h1 < inputHeight - 1) ? 1 : 0;
     const DType h1lambda = h1r - h1;
     const DType h0lambda = (DType)1. - h1lambda;
-    for (int w2 = 0; w2 < outputWidth; ++w2) {
-      const float w1r = rwidth * w2;
-      const int w1 = w1r;
-      const int w1p = (w1 < inputWidth - 1) ? 1 : 0;
-      const DType w1lambda = w1r - w1;
-      const DType w0lambda = (DType)1. - w1lambda;
-      DType* posInput = &dataInput[h1 * inputWidth + w1];
-      const DType* posOutput = &dataOutput[h2 * outputWidth + w2];
-      for (int c = 0; c < channels; ++c) {
-        posInput[0] += h0lambda * w0lambda * posOutput[0];
-        posInput[w1p] += h0lambda * w1lambda * posOutput[0];
-        posInput[h1p * inputWidth] += h1lambda * w0lambda * posOutput[0];
-        posInput[h1p * inputWidth + w1p] += h1lambda * w1lambda * posOutput[0];
-        posInput += inputWidth * inputHeight;
-        posOutput += outputWidth * outputHeight;
+
+    const float w1r = rwidth * w2;
+    const int w1 = w1r;
+    const int w1p = (w1 < inputWidth - 1) ? 1 : 0;
+    const DType w1lambda = w1r - w1;
+    const DType w0lambda = (DType)1. - w1lambda;
+
+    DType* posInput = &dataInput[h1 * inputWidth + w1];
+    const DType* posOutput = &dataOutput[index];
+    for (int c = 0; c < channels; ++c) {
+      #pragma omp critical
+      {
+        *posInput += h0lambda * w0lambda * (*posOutput);
+        *(posInput + w1p) += h0lambda * w1lambda * (*posOutput);
+        *(posInput + h1p * inputWidth) += h1lambda * w0lambda * (*posOutput);
+        *(posInput + h1p * inputWidth + w1p) += h1lambda * w1lambda * (*posOutput);
      }
+      posInput += input_elems_per_channel;
+      posOutput += output_elems_per_channel;
    }
  }
 
@@ -165,19 +187,19 @@ void SpatialUpSamplingBilinearUpdateGradInput(mshadow::Stream<cpu> *s,
     int inputWidthLike = gradInputLike.size(3);
     DType *dataInputLike = gradInputLike.dptr_;
     int channelsLike = nbatch * gradInputLike.size(1);
-    for (int h_like = 0; h_like < inputHeightLike; ++h_like) {
-      for (int w_like = 0; w_like < inputWidthLike; ++w_like) {
-        DType *posInput = &dataInputLike[h_like * inputWidthLike + w_like];
-        for (int c = 0; c < channelsLike; ++c) {
-          posInput[0] = 0;
-          posInput += inputWidthLike * inputHeightLike;
-        }
+
+    const int inputLike_elems_per_channel = inputHeightLike * inputWidthLike;
+#pragma omp parallel for num_threads(nthreads)
+    for (int index = 0; index < inputLike_elems_per_channel; index++) {
+      DType *posInput = &dataInputLike[index];
+      for (int c = 0; c < channelsLike; ++c) {
+        *posInput = 0;
+        posInput += inputLike_elems_per_channel;
      }
    }
  }
 }
-
 DMLC_REGISTER_PARAMETER(BilinearSampleParam);
 
 NNVM_REGISTER_OP(_contrib_BilinearResize2D)
diff --git a/tests/python/gpu/test_operator_gpu.py b/tests/python/gpu/test_operator_gpu.py
index 5b4f81d96065..f9814abc9c0b 100644
--- a/tests/python/gpu/test_operator_gpu.py
+++ b/tests/python/gpu/test_operator_gpu.py
@@ -1143,6 +1143,22 @@ def test_flatten_slice_after_conv():
     check_consistency(slice_sym, ctx_list)
 
 
+@with_seed()
+def test_bilinear_resize_op():
+    ctx_list = [{'ctx': mx.cpu(0), 'data': (2, 2, 20, 20), 'type_dict': {'data': np.float32}},
+                {'ctx': mx.gpu(0), 'data': (2, 2, 20, 20), 'type_dict': {'data': np.float32}}]
+
+    data = mx.sym.Variable('data')
+    sym = mx.sym.contrib.BilinearResize2D(data, height=10, width=5)
+    check_consistency(sym, ctx_list)
+
+    sym = mx.sym.contrib.BilinearResize2D(data, None, scale_height=2, scale_width=0.5, mode='odd_scale')
+    check_consistency(sym, ctx_list)
+
+    sym = mx.sym.contrib.BilinearResize2D(data, None, scale_height=0.5, scale_width=2, mode='to_even_up')
+    check_consistency(sym, ctx_list)
+
+
 @with_seed()
 def test_global_pooling():
     def test_1d_pooling(pool_type, p_value=2):
diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py
index 6f11a5d4764b..c2584314228b 100644
--- a/tests/python/unittest/test_operator.py
+++ b/tests/python/unittest/test_operator.py
@@ -7663,7 +7663,7 @@ def py_bilinear_resize(x, outputHeight, outputWidth):
             w1r = 1.0 * w2 * rwidth
             w1 = int(np.floor(w1r))
             w1lambda = w1r - w1
-            w1p = 1 if w1 < (inputHeight - 1) else 0
+            w1p = 1 if w1 < (inputWidth - 1) else 0
             for b in range(batch):
                 for c in range(channel):
                     y[b][c][h2][w2] = (1-h1lambda)*((1-w1lambda)*x[b][c][h1][w1] + \
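
For reference, a minimal Python sketch that exercises both kernels touched by this patch (the forward SpatialUpSamplingBilinearUpdateOutput and the backward SpatialUpSamplingBilinearUpdateGradInput), assuming a CPU build of MXNet with this change applied; the input shape and the height/width arguments mirror test_bilinear_resize_op above:

import mxnet as mx
import numpy as np

# Same 2x2x20x20 layout used by the consistency test above.
x = mx.nd.array(np.random.uniform(size=(2, 2, 20, 20)), ctx=mx.cpu(0))
x.attach_grad()

with mx.autograd.record():
    # Forward pass runs the OpenMP-parallelized output kernel.
    y = mx.nd.contrib.BilinearResize2D(x, height=10, width=5)

# Backward pass runs the parallelized gradient kernel.
y.backward()

print(y.shape)       # (2, 2, 10, 5)
print(x.grad.shape)  # (2, 2, 20, 20)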