From cb0d731c5d06d65136bc805545015bf42100f993 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Wed, 2 Jan 2019 15:20:31 -0800 Subject: [PATCH] Adding explanation in comment. --- src/operator/spatial_transformer.cu | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/src/operator/spatial_transformer.cu b/src/operator/spatial_transformer.cu index 1a16de3bab7c..fd330bd4ca87 100644 --- a/src/operator/spatial_transformer.cu +++ b/src/operator/spatial_transformer.cu @@ -35,8 +35,17 @@ template<typename DType> __device__ bool between(DType value, int lowerBound, int upperBound) { return (value >= lowerBound && value <= upperBound); } + template<typename DType> __global__ void +/* + * In order to not generate the code that uses too many + * registers (resulting in too many resources requested + * error) we need to tell the compiler that we will be + * launching this kernel with cuda::kMaxThreadsPerBlock + * threads per block. Setting __launch_bounds__ ensures + * that such configuration can always be launched. + */ __launch_bounds__(cuda::kMaxThreadsPerBlock, 1) BilinearSamplingForwardKernel(const int i_c, const int i_h, const int i_w, const DType* data, @@ -79,6 +88,14 @@ BilinearSamplingForwardKernel(const int i_c, const int i_h, } } +/* + * In order to not generate the code that uses too many + * registers (resulting in too many resources requested + * error) we need to tell the compiler that we will be + * launching this kernel with cuda::kMaxThreadsPerBlock + * threads per block. Setting __launch_bounds__ ensures + * that such configuration can always be launched. + */ template<typename DType> __global__ void __launch_bounds__(cuda::kMaxThreadsPerBlock, 1)