From dc5470cb9d0992ae2409ca17d85fcbbccda4ae86 Mon Sep 17 00:00:00 2001 From: Serge Panev Date: Fri, 27 Sep 2019 12:28:16 +0200 Subject: [PATCH] Factorize CUDA_KERNEL_LOOP used in CUDA kernels (#16197) * Factorize CUDA_KERNEL_LOOP used in CUDA kernels Signed-off-by: Serge Panev * Retrigger CI --- src/operator/contrib/count_sketch.cu | 4 ---- src/operator/contrib/deformable_psroi_pooling.cu | 4 ---- src/operator/contrib/psroi_pooling.cu | 4 ---- src/operator/contrib/roi_align.cu | 11 ++++------- src/operator/correlation.cu | 6 ++---- 5 files changed, 6 insertions(+), 23 deletions(-) diff --git a/src/operator/contrib/count_sketch.cu b/src/operator/contrib/count_sketch.cu index 68dede377fc1..b7113aed1dfe 100644 --- a/src/operator/contrib/count_sketch.cu +++ b/src/operator/contrib/count_sketch.cu @@ -33,10 +33,6 @@ #define WARPS_PER_BLOCK 1 #define THREADS_PER_BLOCK 512 -#define CUDA_KERNEL_LOOP(i, n) \ - for (int i = blockIdx.x * blockDim.x + threadIdx.x; \ - i < (n); \ - i += blockDim.x * gridDim.x) namespace mshadow { namespace cuda { // wrappers to deal with atomic add diff --git a/src/operator/contrib/deformable_psroi_pooling.cu b/src/operator/contrib/deformable_psroi_pooling.cu index 6c89746b43ab..ba8cfc865798 100644 --- a/src/operator/contrib/deformable_psroi_pooling.cu +++ b/src/operator/contrib/deformable_psroi_pooling.cu @@ -38,10 +38,6 @@ cudaError_t error = condition; \ CHECK_EQ(error, cudaSuccess) << " " << cudaGetErrorString(error); \ } while (0) -#define CUDA_KERNEL_LOOP(i, n) \ -for (int i = blockIdx.x * blockDim.x + threadIdx.x; \ - i < (n); \ - i += blockDim.x * gridDim.x) namespace mshadow { namespace cuda { diff --git a/src/operator/contrib/psroi_pooling.cu b/src/operator/contrib/psroi_pooling.cu index e4de9248dfbf..c5f229148aff 100644 --- a/src/operator/contrib/psroi_pooling.cu +++ b/src/operator/contrib/psroi_pooling.cu @@ -39,10 +39,6 @@ cudaError_t error = condition; \ CHECK_EQ(error, cudaSuccess) << " " << cudaGetErrorString(error); \ } while (0) -#define CUDA_KERNEL_LOOP(i, n) \ -for (int i = blockIdx.x * blockDim.x + threadIdx.x; \ - i < (n); \ - i += blockDim.x * gridDim.x) namespace mshadow { namespace cuda { diff --git a/src/operator/contrib/roi_align.cu b/src/operator/contrib/roi_align.cu index 38b461d5f58c..7099dd1a3991 100644 --- a/src/operator/contrib/roi_align.cu +++ b/src/operator/contrib/roi_align.cu @@ -24,15 +24,12 @@ * Adapted from Caffe2 */ #include "./roi_align-inl.h" +#include "../mxnet_op.h" namespace mxnet { namespace op { -#define CUDA_1D_KERNEL_LOOP(i, n) \ - for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ - i += blockDim.x * gridDim.x) - using namespace mshadow::cuda; // The maximum number of blocks to use in the default kernel call. @@ -120,7 +117,7 @@ __global__ void RoIAlignForwardKernel( const int sampling_ratio, const T* bottom_rois, T* top_data) { - CUDA_1D_KERNEL_LOOP(index, nthreads) { + CUDA_KERNEL_LOOP(index, nthreads) { // (n, c, ph, pw) is an element in the pooled output int pw = index % pooled_width; int ph = (index / pooled_width) % pooled_height; @@ -259,7 +256,7 @@ __global__ void RoIAlignBackwardKernel( const int sampling_ratio, T* bottom_diff, const T* bottom_rois) { - CUDA_1D_KERNEL_LOOP(index, nthreads) { + CUDA_KERNEL_LOOP(index, nthreads) { // (n, c, ph, pw) is an element in the pooled output int pw = index % pooled_width; int ph = (index / pooled_width) % pooled_height; @@ -353,7 +350,7 @@ __global__ void RoIAlignBackwardKernel( } // if } // ix } // iy - } // CUDA_1D_KERNEL_LOOP + } // CUDA_KERNEL_LOOP } // RoIAlignBackward template diff --git a/src/operator/correlation.cu b/src/operator/correlation.cu index 821b9007a8fe..117dc61af6bf 100644 --- a/src/operator/correlation.cu +++ b/src/operator/correlation.cu @@ -28,6 +28,7 @@ #include #include #include +#include "./mxnet_op.h" #define ROUND_OFF 50000 #define WARPS_PER_BLOCK 1 @@ -38,10 +39,7 @@ cudaError_t error = condition; \ CHECK_EQ(error, cudaSuccess) << " " << cudaGetErrorString(error); \ } while (0) -#define CUDA_KERNEL_LOOP(i, n) \ -for (int i = blockIdx.x * blockDim.x + threadIdx.x; \ - i < (n); \ - i += blockDim.x * gridDim.x) + namespace mshadow { namespace cuda { // == Correlation Kernel