This repository has been archived by the owner on Nov 17, 2023. It is now read-only.

make ROIAlign support position-sensitive pooling #13088

Merged (10 commits) on Jan 4, 2019
8 changes: 6 additions & 2 deletions src/operator/contrib/roi_align-inl.h
@@ -20,7 +20,7 @@
* Copyright (c) 2018 by Contributors
* \file roi_align-inl.h
* \brief roi align operator and symbol
* \author Hang Zhang
* \author Hang Zhang, Shesung
* modified from Caffe2
*/
#ifndef MXNET_OPERATOR_CONTRIB_ROI_ALIGN_INL_H_
@@ -35,7 +35,6 @@
namespace mxnet {
namespace op {


// Declare enumeration of input order to make code more intuitive.
// These enums are only visible within this header
namespace roialign {
@@ -48,6 +47,7 @@ struct ROIAlignParam : public dmlc::Parameter<ROIAlignParam> {
TShape pooled_size;
float spatial_scale;
int sample_ratio;
bool position_sensitive;
DMLC_DECLARE_PARAMETER(ROIAlignParam) {
DMLC_DECLARE_FIELD(pooled_size)
.set_expect_ndim(2).enforce_nonzero()
@@ -57,6 +57,10 @@ struct ROIAlignParam : public dmlc::Parameter<ROIAlignParam> {
"Equals the reciprocal of total stride in convolutional layers");
DMLC_DECLARE_FIELD(sample_ratio).set_default(-1)
.describe("Optional sampling ratio of ROI align, using adaptive size by default.");
DMLC_DECLARE_FIELD(position_sensitive).set_default(false)
.describe("Whether to perform position-sensitive RoI pooling. PSRoIPooling is "
"first proposaled by R-FCN and it can reduce the input channels by ph*pw times, "
"where (ph, pw) is the pooled_size");
}
};

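Note: a minimal usage sketch of the new flag. The operator name and keyword arguments follow the test changes below; the shapes are illustrative assumptions, not part of this patch. With position_sensitive=True the input must carry pooled_h*pooled_w times as many channels as the pooled output:

import mxnet as mx

pooled_size = (3, 4)                                            # (pooled_h, pooled_w)
out_channels = 2
in_channels = out_channels * pooled_size[0] * pooled_size[1]    # 24

data = mx.nd.random.uniform(shape=(1, in_channels, 16, 16))     # NCHW feature map
rois = mx.nd.array([[0, 0, 0, 7, 7]])                           # [batch_index, x1, y1, x2, y2]

out = mx.nd.contrib.ROIAlign(data, rois, pooled_size=pooled_size,
                             spatial_scale=1.0, sample_ratio=2,
                             position_sensitive=True)
print(out.shape)                                                # (1, 2, 3, 4)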
59 changes: 42 additions & 17 deletions src/operator/contrib/roi_align.cc
@@ -20,7 +20,7 @@
* Copyright (c) 2018 by Contributors
* \file roi_align.cc
* \brief roi align operator
* \author Hang Zhang
* \author Hang Zhang, Shesung
* Adapted from Caffe2
*/
#include "./roi_align-inl.h"
@@ -142,6 +142,7 @@ void ROIAlignForward(
const int nthreads,
const T* bottom_data,
const T& spatial_scale,
const bool position_sensitive,
const int channels,
const int height,
const int width,
@@ -156,6 +157,8 @@
int n_rois = nthreads / channels / pooled_width / pooled_height;
// (n, c, ph, pw) is an element in the pooled output
// can be parallelized using omp
#pragma omp parallel for \
num_threads(engine::OpenMP::Get()->GetRecommendedOMPThreadCount())
for (int n = 0; n < n_rois; n++) {
int index_n = n * channels * pooled_width * pooled_height;

@@ -208,19 +211,23 @@
roi_bin_grid_w,
&pre_calc);

int c;
#pragma omp parallel for private(c) \
num_threads(engine::OpenMP::Get()->GetRecommendedOMPThreadCount())
for (c = 0; c < channels; c++) {
for (int c = 0; c < channels; c++) {
int index_n_c = index_n + c * pooled_width * pooled_height;
const T* offset_bottom_data =
bottom_data + (roi_batch_ind * channels + c) * height * width;
int pre_calc_index = 0;

for (int ph = 0; ph < pooled_height; ph++) {
for (int pw = 0; pw < pooled_width; pw++) {
int index = index_n_c + ph * pooled_width + pw;

int c_unpooled = c;
int channels_unpooled = channels;
if (position_sensitive) {
c_unpooled = c * pooled_height * pooled_width + ph * pooled_width + pw;
channels_unpooled = channels * pooled_height * pooled_width;
}
const T* offset_bottom_data =
bottom_data + (roi_batch_ind * channels_unpooled + c_unpooled)
* height * width;
T output_val = 0.;
for (int iy = 0; iy < roi_bin_grid_h; iy++) {
for (int ix = 0; ix < roi_bin_grid_w; ix++) {
@@ -310,6 +317,7 @@ void ROIAlignBackward(
const T* top_diff,
const int /*num_rois*/,
const T& spatial_scale,
const bool position_sensitive,
const int channels,
const int height,
const int width,
@@ -347,8 +355,15 @@
T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);

int c_unpooled = c;
int channels_unpooled = channels;
if (position_sensitive) {
c_unpooled = c * pooled_height * pooled_width + ph * pooled_width + pw;
channels_unpooled = channels * pooled_height * pooled_width;
}
T* offset_bottom_diff =
bottom_diff + (roi_batch_ind * channels + c) * height * width;
bottom_diff + (roi_batch_ind * channels_unpooled + c_unpooled)
* height * width;

int top_offset = (n * channels + c) * pooled_height * pooled_width;
const T* offset_top_diff = top_diff + top_offset;
@@ -426,7 +441,7 @@ void ROIAlignForwardCompute(const nnvm::NodeAttrs& attrs,

const int count = out_data[roialign::kOut].Size();
// const int num_rois = in_data[roialign::kBox].size(0);
const int channels = in_data[roialign::kData].size(1);
const int channels = out_data[roialign::kOut].size(1); // channels of pooled output
const int height = in_data[roialign::kData].size(2);
const int width = in_data[roialign::kData].size(3);
const int pooled_height = out_data[roialign::kOut].size(2);
@@ -439,9 +454,9 @@
const DType *bottom_rois = in_data[roialign::kBox].dptr<DType>();
DType *top_data = out_data[roialign::kOut].dptr<DType>();

ROIAlignForward<DType>(count, bottom_data, param.spatial_scale, channels,
height, width, pooled_height, pooled_width, param.sample_ratio,
bottom_rois, rois_cols, top_data);
ROIAlignForward<DType>(count, bottom_data, param.spatial_scale, param.position_sensitive,
channels, height, width, pooled_height, pooled_width,
param.sample_ratio, bottom_rois, rois_cols, top_data);
})
}

@@ -470,7 +485,7 @@ void ROIAlignBackwardCompute(const nnvm::NodeAttrs& attrs,

const int count = out_grad[0].Size();
const int num_rois = in_data[0].size(0);
const int channels = outputs[0].size(1);
const int channels = out_grad[0].size(1); // channels of pooled output
const int height = outputs[0].size(2);
const int width = outputs[0].size(3);
const int pooled_height = out_grad[0].size(2);
@@ -489,8 +504,9 @@
Fill<false>(s, outputs[0], kWriteTo, static_cast<DType>(0));
}
ROIAlignBackward<DType>(count, top_diff, num_rois, param.spatial_scale,
channels, height, width, pooled_height, pooled_width,
param.sample_ratio, grad_in, bottom_rois, rois_cols);
param.position_sensitive, channels, height, width,
pooled_height, pooled_width, param.sample_ratio, grad_in,
bottom_rois, rois_cols);
}
if (kWriteTo == req[roialign::kBox]) {
Fill<false>(s, outputs[1], kWriteTo, static_cast<DType>(0));
@@ -545,8 +561,17 @@ He, Kaiming, et al. "Mask R-CNN." ICCV, 2017
CHECK_EQ(bshape[1], 5) << "bbox should be a 2D tensor of shape [batch, 5]";
// out: [num_rois, c, pooled_h, pooled_w]
out_shape->clear();
out_shape->push_back(
Shape4(bshape[0], dshape[1], param.pooled_size[0], param.pooled_size[1]));
if (param.position_sensitive) {
CHECK_EQ(dshape[1] % (param.pooled_size[0]*param.pooled_size[1]), 0) <<
"Input channels should be divisible by pooled_size[0]*pooled_size[1] "
"when position_sensitive is true.";
out_shape->push_back(
Shape4(bshape[0], dshape[1]/param.pooled_size[0]/param.pooled_size[1],
param.pooled_size[0], param.pooled_size[1]));
} else {
out_shape->push_back(
Shape4(bshape[0], dshape[1], param.pooled_size[0], param.pooled_size[1]));
}
return true;
})
.set_attr<nnvm::FInferType>("FInferType", [](const nnvm::NodeAttrs& attrs,
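Note: the shape rule enforced by the updated ROIAlignShape above can be summarized with a small sketch (function and variable names here are illustrative, not from the patch): with position_sensitive=True the input channel count must be divisible by pooled_h*pooled_w, and only the quotient survives in the pooled output.

def infer_roialign_out_shape(data_shape, num_rois, pooled_size, position_sensitive):
    # data_shape is NCHW; only the channel count matters for this rule.
    n, c, h, w = data_shape
    ph, pw = pooled_size
    if position_sensitive:
        assert c % (ph * pw) == 0, \
            "Input channels should be divisible by pooled_size[0]*pooled_size[1]"
        c = c // (ph * pw)
    return (num_rois, c, ph, pw)

print(infer_roialign_out_shape((2, 24, 16, 16), 7, (3, 4), True))    # (7, 2, 3, 4)
print(infer_roialign_out_shape((2, 24, 16, 16), 7, (3, 4), False))   # (7, 24, 3, 4)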
28 changes: 23 additions & 5 deletions src/operator/contrib/roi_align.cu
@@ -20,7 +20,7 @@
* Copyright (c) 2018 by Contributors
* \file roi_align.cu
* \brief roi align operator
* \author Hang Zhang
* \author Hang Zhang, Shesung
* Adapted from Caffe2
*/
#include "./roi_align-inl.h"
@@ -111,6 +111,7 @@ __global__ void RoIAlignForwardKernel(
const int nthreads,
const T* bottom_data,
const T spatial_scale,
const bool position_sensitive,
const int channels,
const int height,
const int width,
@@ -145,8 +146,15 @@
T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);

int c_unpooled = c;
int channels_unpooled = channels;
if (position_sensitive) {
c_unpooled = c * pooled_height * pooled_width + ph * pooled_width + pw;
channels_unpooled = channels * pooled_height * pooled_width;
}
const T* offset_bottom_data =
bottom_data + (roi_batch_ind * channels + c) * height * width;
bottom_data + (roi_batch_ind * channels_unpooled + c_unpooled)
* height * width;

// We use roi_bin_grid to sample the grid and mimic integral
int roi_bin_grid_h = (sampling_ratio > 0)
@@ -242,6 +250,7 @@ __global__ void RoIAlignBackwardKernel(
const T* top_diff,
const int num_rois,
const T spatial_scale,
const bool position_sensitive,
const int channels,
const int height,
const int width,
@@ -276,8 +285,15 @@
T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);

int c_unpooled = c;
int channels_unpooled = channels;
if (position_sensitive) {
c_unpooled = c * pooled_height * pooled_width + ph * pooled_width + pw;
channels_unpooled = channels * pooled_height * pooled_width;
}
T* offset_bottom_diff =
bottom_diff + (roi_batch_ind * channels + c) * height * width;
bottom_diff + (roi_batch_ind * channels_unpooled + c_unpooled)
* height * width;

int top_offset = (n * channels + c) * pooled_height * pooled_width;
const T* offset_top_diff = top_diff + top_offset;
@@ -357,7 +373,7 @@ void ROIAlignForwardCompute(const nnvm::NodeAttrs& attrs,

const int count = out_data[roialign::kOut].Size();
const int num_rois = in_data[roialign::kBox].size(0);
const int channels = in_data[roialign::kData].size(1);
const int channels = out_data[roialign::kOut].size(1); // channels of pooled output
const int height = in_data[roialign::kData].size(2);
const int width = in_data[roialign::kData].size(3);
const int pooled_height = out_data[roialign::kOut].size(2);
@@ -377,6 +393,7 @@
count,
bottom_data,
param.spatial_scale,
param.position_sensitive,
channels,
height,
width,
@@ -414,7 +431,7 @@ void ROIAlignBackwardCompute(const nnvm::NodeAttrs& attrs,

const int count = out_grad[0].Size();
const int num_rois = in_data[0].size(0);
const int channels = outputs[0].size(1);
const int channels = out_grad[0].size(1); // channels of pooled output
const int height = outputs[0].size(2);
const int width = outputs[0].size(3);
const int pooled_height = out_grad[0].size(2);
@@ -445,6 +462,7 @@
top_diff,
num_rois,
param.spatial_scale,
param.position_sensitive,
channels,
height,
width,
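Note: the CPU and GPU kernels above share the same channel remapping; a small sketch of that index arithmetic (the helper name is hypothetical): for position-sensitive pooling, output channel c at bin (ph, pw) reads from, and back-propagates into, input channel c*pooled_h*pooled_w + ph*pooled_w + pw.

def unpooled_channel(c, ph, pw, pooled_h, pooled_w, position_sensitive):
    # Mirrors the c_unpooled computation added to ROIAlignForward/Backward.
    if position_sensitive:
        return c * pooled_h * pooled_w + ph * pooled_w + pw
    return c

# With pooled_size (3, 4): output channel 1 at bin (2, 3) maps to input
# channel 1*12 + 2*4 + 3 = 23; without the flag it stays at channel 1.
assert unpooled_channel(1, 2, 3, 3, 4, True) == 23
assert unpooled_channel(1, 2, 3, 3, 4, False) == 1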
29 changes: 19 additions & 10 deletions tests/python/unittest/test_operator.py
@@ -17,6 +17,7 @@

# pylint: skip-file
from __future__ import print_function
from __future__ import division
@wkcn (Member) commented on Dec 26, 2018:

The types of the variables are specified as float or int, but this import is useful for keeping division consistent between Python 2 and Python 3.
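A quick illustration of the division behaviour the comment refers to (illustrative only, not part of the patch):

# Python 2 without "from __future__ import division": 3 / 2 == 1 (floor division)
# Python 2 with the import, and Python 3 always:      3 / 2 == 1.5
print(3 / 2)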

import numpy as np
import mxnet as mx
import copy
@@ -6899,14 +6900,16 @@ def bilinear_interpolate(bottom, height, width, y, x):
]
return val, grad

def roialign_forward_backward(data, rois, pooled_size, spatial_scale, sampling_ratio, dy):
def roialign_forward_backward(data, rois, pooled_size, spatial_scale, sampling_ratio,
position_sensitive, dy):
N, C, H, W = data.shape
R = rois.shape[0]
PH, PW = pooled_size
assert len(rois.shape) == 2
assert rois.shape[1] == 5

out = np.zeros((R, C, PH, PW))
C_out = C // PH // PW if position_sensitive else C
out = np.zeros((R, C_out, PH, PW))
dx = np.zeros_like(data)
drois = np.zeros_like(rois)

@@ -6924,24 +6927,25 @@ def roialign_forward_backward(data, rois, pooled_size, spatial_scale, sampling_r
roi_bin_grid_h = int(np.ceil(roi_h * 1.0 / PH))
roi_bin_grid_w = int(np.ceil(roi_w * 1.0 / PW))
count = roi_bin_grid_h * roi_bin_grid_w
for c in range(C):
for c in range(C_out):
for ph in range(PH):
for pw in range(PW):
val = 0.0
c_in = c * PH * PW + ph * PW + pw if position_sensitive else c
for iy in range(roi_bin_grid_h):
y = sh + ph * bin_h + (iy + 0.5) * bin_h / roi_bin_grid_h
for ix in range(roi_bin_grid_w):
x = sw + pw * bin_w + (ix + 0.5) * bin_w / roi_bin_grid_w
v, g = bilinear_interpolate(bdata[c], H, W, y, x)
v, g = bilinear_interpolate(bdata[c_in], H, W, y, x)
val += v
# compute grad
for qy, qx, qw in g:
dx[batch_ind, c, qy, qx] += dy[r, c, ph, pw] * qw * 1.0 / count
dx[batch_ind, c_in, qy, qx] += dy[r, c, ph, pw] * qw * 1.0 / count

out[r, c, ph, pw] = val * 1.0 / count
return out, [dx, drois]

def test_roi_align_value(sampling_ratio=0):
def test_roi_align_value(sampling_ratio=0, position_sensitive=False):
ctx=default_context()
dtype = np.float32

@@ -6950,6 +6954,7 @@ def test_roi_align_value(sampling_ratio=0):
assert H == W
R = 7
pooled_size = (3, 4)
C = C * pooled_size[0] * pooled_size[1] if position_sensitive else C

spatial_scale = H * 1.0 / dlen
data = mx.nd.array(np.arange(N*C*W*H).reshape((N,C,H,W)), ctx=ctx, dtype = dtype)
@@ -6964,11 +6969,14 @@ def test_roi_align_value(sampling_ratio=0):
rois.attach_grad()
with mx.autograd.record():
output = mx.nd.contrib.ROIAlign(data, rois, pooled_size=pooled_size,
spatial_scale=spatial_scale, sample_ratio=sampling_ratio)
dy = mx.nd.random.uniform(-1, 1, (R, C) + pooled_size, ctx=ctx, dtype = dtype)
spatial_scale=spatial_scale, sample_ratio=sampling_ratio,
position_sensitive=position_sensitive)
C_out = C // pooled_size[0] // pooled_size[1] if position_sensitive else C
dy = mx.nd.random.uniform(-1, 1, (R, C_out) + pooled_size, ctx=ctx, dtype = dtype)
output.backward(dy)
real_output, [dx, drois] = roialign_forward_backward(data.asnumpy(), rois.asnumpy(), pooled_size,
spatial_scale, sampling_ratio, dy.asnumpy())
spatial_scale, sampling_ratio,
position_sensitive, dy.asnumpy())
assert np.allclose(output.asnumpy(), real_output)
# It seems that the precision between Cfloat and Pyfloat is different.
assert np.allclose(data.grad.asnumpy(), dx, atol = 1e-5), np.abs(data.grad.asnumpy() - dx).max()
@@ -6994,7 +7002,8 @@ def test_roi_align_autograd(sampling_ratio=0):
numeric_eps=1e-4, rtol=1e-1, atol=1e-4, ctx=ctx)

test_roi_align_value()
test_roi_align_value(2)
test_roi_align_value(sampling_ratio=2)
test_roi_align_value(position_sensitive=True)
test_roi_align_autograd()

@with_seed()