
Commit

Add omp parallel optimization for _contrib_BilinearResize2D (#15584)
* Add omp parallel optimization for bilinear_resize op

* retrigger CI

* retrigger CI

* trigger CI
wuxun-zhang authored and marcoabreu committed Jul 21, 2019
1 parent 9d859c8 commit 9a1a102
Showing 3 changed files with 101 additions and 63 deletions.
146 changes: 84 additions & 62 deletions src/operator/contrib/bilinear_resize.cc
@@ -23,7 +23,6 @@
  * \author Hang Zhang
  */
 #include "bilinear_resize-inl.h"
-// #include "elemwise_op_common.h"
 #include "../elemwise_op_common.h"
 
 namespace mxnet {
@@ -44,56 +43,66 @@ void SpatialUpSamplingBilinearUpdateOutput(mshadow::Stream<cpu> *s,
   int inputHeight = itensor.size(2);
   int inputWidth = itensor.size(3);
 
+  const auto nthreads = engine::OpenMP::Get()->GetRecommendedOMPThreadCount();
+
   DType *idata = itensor.dptr_;
   DType *odata = otensor.dptr_;
   channels = nbatch * channels;
+  const int input_elems_per_channel = inputWidth * inputHeight;
+  const int output_elems_per_channel = outputWidth * outputHeight;
+
   // special case: just copy
   if (inputHeight == outputHeight && inputWidth == outputWidth) {
-    for (int h2 = 0; h2 < outputHeight; ++h2) {
+    #pragma omp parallel for num_threads(nthreads)
+    for (int index = 0; index < output_elems_per_channel; index++) {
+      const int h2 = index / outputWidth;
       const int h1 = h2;
-      for (int w2 = 0; w2 < outputWidth; ++w2) {
-        const int w1 = w2;
-        const DType* pos1 = &idata[h1 * inputWidth + w1];
-        DType* pos2 = &odata[h2 * outputWidth + w2];
-        for (int c = 0; c < channels; ++c) {
-          pos2[0] = pos1[0];
-          pos1 += inputWidth * inputHeight;
-          pos2 += outputWidth * outputHeight;
-        }
-      }
+      const int w2 = index % outputWidth;
+      const int w1 = w2;
+      const DType* pos1 = &idata[h1 * inputWidth + w1];
+      DType* pos2 = &odata[index];
+      for (int c = 0; c < channels; ++c) {
+        *pos2 = *pos1;
+        pos1 += input_elems_per_channel;
+        pos2 += output_elems_per_channel;
+      }
     }
     return;
   }
 
   const float rheight =(outputHeight > 1) ? static_cast<float>(inputHeight - 1)/
                        (outputHeight - 1) : 0.f;
   const float rwidth = (outputWidth > 1) ? static_cast<float>(inputWidth - 1) /
                        (outputWidth - 1) : 0.f;
-  for (int h2 = 0; h2 < outputHeight; ++h2) {
+  #pragma omp parallel for num_threads(nthreads)
+  for (int index = 0; index < output_elems_per_channel; index++) {
+    const int h2 = index / outputWidth;
+    const int w2 = index % outputWidth;
+
     const float h1r = rheight * h2;
     const int h1 = h1r;
     const int h1p = (h1 < inputHeight - 1) ? 1 : 0;
     const DType h1lambda = h1r - h1;
     const DType h0lambda = (DType)1. - h1lambda;
-    for (int w2 = 0; w2 < outputWidth; ++w2) {
-      const float w1r = rwidth * w2;
-      const int w1 = w1r;
-      const int w1p = (w1 < inputWidth - 1) ? 1 : 0;
-      const DType w1lambda = w1r - w1;
-      const DType w0lambda = (DType)1. - w1lambda;
-      const DType* pos1 = &idata[h1 * inputWidth + w1];
-      DType* pos2 = &odata[h2 * outputWidth + w2];
-      for (int c = 0; c < channels; ++c) {
-        pos2[0] = h0lambda * (w0lambda * pos1[0]+ w1lambda * pos1[w1p])
-                  + h1lambda * (w0lambda * pos1[h1p * inputWidth]
-                  + w1lambda * pos1[h1p * inputWidth + w1p]);
-        pos1 += inputWidth * inputHeight;
-        pos2 += outputWidth * outputHeight;
-      }
-    }
+
+    const float w1r = rwidth * w2;
+    const int w1 = w1r;
+    const int w1p = (w1 < inputWidth - 1) ? 1 : 0;
+    const DType w1lambda = w1r - w1;
+    const DType w0lambda = (DType)1. - w1lambda;
+    const DType* pos1 = &idata[h1 * inputWidth + w1];
+    DType* pos2 = &odata[index];
+
+    for (int c = 0; c < channels; ++c) {
+      *pos2 = h0lambda * (w0lambda * (*pos1) + w1lambda * *(pos1 + w1p))
+                + h1lambda * (w0lambda * *(pos1 + h1p * inputWidth)
+                + w1lambda * *(pos1 + h1p * inputWidth + w1p));
+      pos1 += input_elems_per_channel;
+      pos2 += output_elems_per_channel;
+    }
   }
 }


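The change above collapses the nested (h2, w2) loops into a single flat loop over output_elems_per_channel, so one #pragma omp parallel for can split the whole output plane across threads. A minimal standalone sketch of the index flattening, with copy_plane and its parameters as illustrative names rather than anything from the patch:

    #include <omp.h>

    // Flatten a 2-D (h, w) traversal into one index so a single parallel-for
    // covers the whole plane; each thread recovers its coordinates on the fly.
    void copy_plane(const float* src, float* dst, int height, int width) {
      const int elems_per_channel = height * width;
      #pragma omp parallel for
      for (int index = 0; index < elems_per_channel; index++) {
        const int h = index / width;   // recover the row
        const int w = index % width;   // recover the column
        dst[h * width + w] = src[h * width + w];
      }
    }

Flattening matters when one dimension is small: parallelizing only the row loop would cap the available parallelism at outputHeight iterations, while the flat loop exposes outputHeight * outputWidth of them.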
 template<typename xpu, typename DType, typename AccReal>
 void SpatialUpSamplingBilinearUpdateGradInput(mshadow::Stream<cpu> *s,
                                               const std::vector<TBlob> &input,
@@ -109,23 +118,28 @@ void SpatialUpSamplingBilinearUpdateGradInput(mshadow::Stream<cpu> *s,
   int inputHeight = gradInput.size(2);
   int inputWidth = gradInput.size(3);
 
+  const auto nthreads = engine::OpenMP::Get()->GetRecommendedOMPThreadCount();
+
   DType *dataInput = gradInput.dptr_;
   DType *dataOutput = gradOutput.dptr_;
   channels = nbatch * channels;
+  const int input_elems_per_channel = inputWidth * inputHeight;
+  const int output_elems_per_channel = outputWidth * outputHeight;
 
   // special case: same-size matching grids
   if (inputHeight == outputHeight && inputWidth == outputWidth) {
-    for (int h2 = 0; h2 < outputHeight; ++h2) {
+    #pragma omp parallel for num_threads(nthreads)
+    for (int index = 0; index < output_elems_per_channel; index++) {
+      const int h2 = index / outputWidth;
       const int h1 = h2;
-      for (int w2 = 0; w2 < outputWidth; ++w2) {
-        const int w1 = w2;
-        DType* pos1 = &dataInput[h1 * inputWidth + w1];
-        const DType* pos2 = &dataOutput[h2 * outputWidth + w2];
-        for (int c = 0; c < channels; ++c) {
-          pos1[0] += pos2[0];
-          pos1 += inputWidth * inputHeight;
-          pos2 += outputWidth * outputHeight;
-        }
-      }
+      const int w2 = index % outputWidth;
+      const int w1 = w2;
+      DType* pos1 = &dataInput[h1 * inputWidth + w1];
+      const DType* pos2 = &dataOutput[index];
+      for (int c = 0; c < channels; ++c) {
+        *pos1 += *pos2;
+        pos1 += input_elems_per_channel;
+        pos2 += output_elems_per_channel;
+      }
     }
     return;
@@ -134,28 +148,36 @@ void SpatialUpSamplingBilinearUpdateGradInput(mshadow::Stream<cpu> *s,
                        (outputHeight - 1) : 0.f;
   const float rwidth = (outputWidth > 1) ? static_cast<float>(inputWidth - 1)/
                        (outputWidth - 1) : 0.f;
-  for (int h2 = 0; h2 < outputHeight; ++h2) {
+
+  #pragma omp parallel for num_threads(nthreads)
+  for (int index = 0; index < output_elems_per_channel; index++) {
+    const int h2 = index / outputWidth;
+    const int w2 = index % outputWidth;
+
     const float h1r = rheight * h2;
     const int h1 = h1r;
     const int h1p = (h1 < inputHeight - 1) ? 1 : 0;
     const DType h1lambda = h1r - h1;
     const DType h0lambda = (DType)1. - h1lambda;
-    for (int w2 = 0; w2 < outputWidth; ++w2) {
-      const float w1r = rwidth * w2;
-      const int w1 = w1r;
-      const int w1p = (w1 < inputWidth - 1) ? 1 : 0;
-      const DType w1lambda = w1r - w1;
-      const DType w0lambda = (DType)1. - w1lambda;
-      DType* posInput = &dataInput[h1 * inputWidth + w1];
-      const DType* posOutput = &dataOutput[h2 * outputWidth + w2];
-      for (int c = 0; c < channels; ++c) {
-        posInput[0] += h0lambda * w0lambda * posOutput[0];
-        posInput[w1p] += h0lambda * w1lambda * posOutput[0];
-        posInput[h1p * inputWidth] += h1lambda * w0lambda * posOutput[0];
-        posInput[h1p * inputWidth + w1p] += h1lambda * w1lambda * posOutput[0];
-        posInput += inputWidth * inputHeight;
-        posOutput += outputWidth * outputHeight;
-      }
-    }
+
+    const float w1r = rwidth * w2;
+    const int w1 = w1r;
+    const int w1p = (w1 < inputWidth - 1) ? 1 : 0;
+    const DType w1lambda = w1r - w1;
+    const DType w0lambda = (DType)1. - w1lambda;
+
+    DType* posInput = &dataInput[h1 * inputWidth + w1];
+    const DType* posOutput = &dataOutput[index];
+    for (int c = 0; c < channels; ++c) {
+      #pragma omp critical
+      {
+        *posInput += h0lambda * w0lambda * (*posOutput);
+        *(posInput + w1p) += h0lambda * w1lambda * (*posOutput);
+        *(posInput + h1p * inputWidth) += h1lambda * w0lambda * (*posOutput);
+        *(posInput + h1p * inputWidth + w1p) += h1lambda * w1lambda * (*posOutput);
+      }
+      posInput += input_elems_per_channel;
+      posOutput += output_elems_per_channel;
+    }
   }

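In the backward hunk above, the parallel loop still runs over output pixels, but each pixel scatters into up to four input-gradient cells, and neighbouring output pixels share those cells, so concurrent += updates from different threads would race; that is what the #pragma omp critical block guards. A minimal sketch of the hazard and the guard, with scatter_add and its arguments as illustrative names only:

    #include <omp.h>

    // Several threads accumulate into overlapping cells of grad_in, so the
    // read-modify-write must be serialized; target[] may repeat across i.
    void scatter_add(float* grad_in, const float* grad_out,
                     const int* target, int n) {
      #pragma omp parallel for
      for (int i = 0; i < n; i++) {
        #pragma omp critical
        {
          grad_in[target[i]] += grad_out[i];
        }
      }
    }

A critical section is a coarse guard that serializes every update; per-element #pragma omp atomic, or parallelizing over input pixels so each thread owns its destination cells, are common lower-contention alternatives, at the cost of a larger change than this patch makes.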
@@ -165,19 +187,19 @@ void SpatialUpSamplingBilinearUpdateGradInput(mshadow::Stream<cpu> *s,
   int inputWidthLike = gradInputLike.size(3);
   DType *dataInputLike = gradInputLike.dptr_;
   int channelsLike = nbatch * gradInputLike.size(1);
-  for (int h_like = 0; h_like < inputHeightLike; ++h_like) {
-    for (int w_like = 0; w_like < inputWidthLike; ++w_like) {
-      DType *posInput = &dataInputLike[h_like * inputWidthLike + w_like];
-      for (int c = 0; c < channelsLike; ++c) {
-        posInput[0] = 0;
-        posInput += inputWidthLike * inputHeightLike;
-      }
-    }
+
+  const int inputLike_elems_per_channel = inputHeightLike * inputWidthLike;
+  #pragma omp parallel for num_threads(nthreads)
+  for (int index = 0; index < inputLike_elems_per_channel; index++) {
+    DType *posInput = &dataInputLike[index];
+    for (int c = 0; c < channelsLike; ++c) {
+      *posInput = 0;
+      posInput += inputLike_elems_per_channel;
+    }
   }
 }


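The final hunk applies the same flattening to the zero-fill of gradInputLike, and needs no critical section because each index owns a disjoint set of cells, one per channel at stride inputLike_elems_per_channel. A minimal sketch of that write pattern, with zero_like as an illustrative name:

    #include <omp.h>

    // Each index writes a disjoint column of cells, one per channel, so a
    // plain parallel-for is safe without any synchronization.
    void zero_like(float* data, int elems_per_channel, int channels) {
      #pragma omp parallel for
      for (int index = 0; index < elems_per_channel; index++) {
        float* pos = data + index;
        for (int c = 0; c < channels; c++) {
          *pos = 0.f;
          pos += elems_per_channel;
        }
      }
    }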
DMLC_REGISTER_PARAMETER(BilinearSampleParam);

NNVM_REGISTER_OP(_contrib_BilinearResize2D)
16 changes: 16 additions & 0 deletions tests/python/gpu/test_operator_gpu.py
@@ -1143,6 +1143,22 @@ def test_flatten_slice_after_conv():
     check_consistency(slice_sym, ctx_list)
 
 
+@with_seed()
+def test_bilinear_resize_op():
+    ctx_list = [{'ctx': mx.cpu(0), 'data': (2, 2, 20, 20), 'type_dict': {'data': np.float32}},
+                {'ctx': mx.gpu(0), 'data': (2, 2, 20, 20), 'type_dict': {'data': np.float32}}]
+
+    data = mx.sym.Variable('data')
+    sym = mx.sym.contrib.BilinearResize2D(data, height=10, width=5)
+    check_consistency(sym, ctx_list)
+
+    sym = mx.sym.contrib.BilinearResize2D(data, None, scale_height=2, scale_width=0.5, mode='odd_scale')
+    check_consistency(sym, ctx_list)
+
+    sym = mx.sym.contrib.BilinearResize2D(data, None, scale_height=0.5, scale_width=2, mode='to_even_up')
+    check_consistency(sym, ctx_list)
+
+
 @with_seed()
 def test_global_pooling():
     def test_1d_pooling(pool_type, p_value=2):
2 changes: 1 addition & 1 deletion tests/python/unittest/test_operator.py
@@ -7649,7 +7649,7 @@ def py_bilinear_resize(x, outputHeight, outputWidth):
             w1r = 1.0 * w2 * rwidth
             w1 = int(np.floor(w1r))
             w1lambda = w1r - w1
-            w1p = 1 if w1 < (inputHeight - 1) else 0
+            w1p = 1 if w1 < (inputWidth - 1) else 0
             for b in range(batch):
                 for c in range(channel):
                     y[b][c][h2][w2] = (1-h1lambda)*((1-w1lambda)*x[b][c][h1][w1] + \
