This repository has been archived by the owner on Nov 17, 2023. It is now read-only.

Fix launch bounds in spatial transformer (#13188)
* Fix launch bounds in spatial transformer

* Adding explanation in comment.
ptrendx authored and KellenSunderland committed Jan 14, 2019
1 parent bf0185c commit 0faa5b7
Showing 1 changed file with 32 additions and 11 deletions.
43 changes: 32 additions & 11 deletions src/operator/spatial_transformer.cu
@@ -35,12 +35,23 @@ template<typename DType>
 __device__ bool between(DType value, int lowerBound, int upperBound) {
   return (value >= lowerBound && value <= upperBound);
 }
+
 template<typename DType>
-__global__ void BilinearSamplingForwardKernel(const int i_c, const int i_h,
-                                              const int i_w, const DType* data,
-                                              const DType* grid, const int o_n,
-                                              const int o_c, const int o_h,
-                                              const int o_w, DType* out) {
+__global__ void
+/*
+ * In order to not generate the code that uses too many
+ * registers (resulting in too many resources requested
+ * error) we need to tell the compiler that we will be
+ * launching this kernel with cuda::kMaxThreadsPerBlock
+ * threads per block. Setting __launch_bounds__ ensures
+ * that such configuration can always be launched.
+ */
+__launch_bounds__(cuda::kMaxThreadsPerBlock, 1)
+BilinearSamplingForwardKernel(const int i_c, const int i_h,
+                              const int i_w, const DType* data,
+                              const DType* grid, const int o_n,
+                              const int o_c, const int o_h,
+                              const int o_w, DType* out) {
   for (int index = (blockIdx.x + blockIdx.y * gridDim.x) * blockDim.x + threadIdx.x;
        index < o_n * o_c * o_h * o_w;
        index += blockDim.x * gridDim.x * gridDim.y) {
@@ -77,13 +88,23 @@ __global__ void BilinearSamplingForwardKernel(const int i_c, const int i_h,
   }
 }
 
+/*
+ * In order to not generate the code that uses too many
+ * registers (resulting in too many resources requested
+ * error) we need to tell the compiler that we will be
+ * launching this kernel with cuda::kMaxThreadsPerBlock
+ * threads per block. Setting __launch_bounds__ ensures
+ * that such configuration can always be launched.
+ */
 template<typename DType>
-__global__ void BilinearSamplingBackwardKernel(const int i_c, const int i_h,
-                                               const int i_w, const DType* grad,
-                                               const DType* data, const int o_n,
-                                               const int o_c, const int o_h,
-                                               const int o_w, DType* g_input,
-                                               DType* grid_src) {
+__global__ void
+__launch_bounds__(cuda::kMaxThreadsPerBlock, 1)
+BilinearSamplingBackwardKernel(const int i_c, const int i_h,
+                               const int i_w, const DType* grad,
+                               const DType* data, const int o_n,
+                               const int o_c, const int o_h,
+                               const int o_w, DType* g_input,
+                               DType* grid_src) {
   for (int index = (blockIdx.x + blockIdx.y * gridDim.x) * blockDim.x + threadIdx.x;
        index < o_n * o_h * o_w;
        index += blockDim.x * gridDim.x * gridDim.y) {
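For context on what the change does: __launch_bounds__(maxThreadsPerBlock, minBlocksPerMultiprocessor) tells nvcc the largest block size a kernel will ever be launched with, so the compiler caps per-thread register usage and a launch with that block size cannot fail with "too many resources requested for launch". The sketch below is illustrative only, not MXNet code: the kernel name, the hard-coded kMaxThreadsPerBlock = 1024, and the register arithmetic in the comments are assumptions standing in for the cuda::kMaxThreadsPerBlock constant the diff references.

// launch_bounds_sketch.cu -- illustrative only, not part of MXNet.
#include <cstdio>
#include <cuda_runtime.h>

namespace cuda {
// Assumed stand-in for the cuda::kMaxThreadsPerBlock constant used in the diff.
constexpr int kMaxThreadsPerBlock = 1024;
}  // namespace cuda

// __launch_bounds__(maxThreadsPerBlock, minBlocksPerMultiprocessor): the
// compiler may assume the block never exceeds 1024 threads and limits
// registers per thread so at least one such block fits on an SM
// (e.g. at most 64 registers per thread on parts with a 64K-register file).
__global__ void
__launch_bounds__(cuda::kMaxThreadsPerBlock, 1)
ScaleKernel(const int n, const float alpha, float* data) {
  // Same 2-D grid-stride loop shape as the kernels in the diff.
  for (int index = (blockIdx.x + blockIdx.y * gridDim.x) * blockDim.x + threadIdx.x;
       index < n;
       index += blockDim.x * gridDim.x * gridDim.y) {
    data[index] *= alpha;
  }
}

int main() {
  const int n = 1 << 20;
  float* data = nullptr;
  cudaMalloc(reinterpret_cast<void**>(&data), n * sizeof(float));
  cudaMemset(data, 0, n * sizeof(float));

  // Launch with exactly the block size promised to the compiler; without
  // the bound, a high-register kernel could fail at launch time with this
  // configuration.
  const int threads = cuda::kMaxThreadsPerBlock;
  const int blocks = (n + threads - 1) / threads;
  ScaleKernel<<<blocks, threads>>>(n, 2.0f, data);
  printf("launch status: %s\n", cudaGetErrorString(cudaGetLastError()));
  cudaDeviceSynchronize();
  cudaFree(data);
  return 0;
}

The second argument of 1 mirrors the commit: only one resident block of the maximum size has to be guaranteed, which is the least restrictive promise that still prevents the launch failure.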
