diff --git a/src/chain/chain-kernels.cu b/src/chain/chain-kernels.cu index 1a1bc2f3bcf..f093f21a5a5 100644 --- a/src/chain/chain-kernels.cu +++ b/src/chain/chain-kernels.cu @@ -25,7 +25,14 @@ configure with --use-cuda=no (this will disable the use of GPU). #endif -#if __CUDA_ARCH__ < 600 + +#ifdef __CUDACC__ +#if ( __CUDACC_VER_MAJOR__ >= 8 ) && ( !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600 ) +// native implementation available +#else +#if __CUDA_ARCH__ >= 600 +#error using CAS implementation of double atomicAdd +#endif __device__ double atomicAdd(double* address, double val) { unsigned long long int* address_as_ull = (unsigned long long int*) address; unsigned long long int old = *address_as_ull, assumed; @@ -41,6 +48,8 @@ __device__ double atomicAdd(double* address, double val) { return __longlong_as_double(old); } #endif +#endif + template __device__ inline void atomic_add(Real* address, Real value) { @@ -278,4 +287,3 @@ void cuda_chain_hmm_backward(dim3 Gr, dim3 Bl, this_beta, log_prob_deriv, log_prob_deriv_stride); } - diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu index 97416750372..3c4bb7a660c 100644 --- a/src/cudamatrix/cu-kernels.cu +++ b/src/cudamatrix/cu-kernels.cu @@ -28,6 +28,16 @@ #include #include "cudamatrix/cu-kernels-ansi.h" + + +inline __device__ static float max_generic(float a, float b) { + return fmaxf(a, b); +} + +inline __device__ static double max_generic(double a, double b) { + return fmax(a, b); +} + /*********************************************************************** * Generic __device__ functions */ @@ -379,7 +389,7 @@ static void _max(Real* mat, const Real* A, MatrixDim dst_d, int src_stride) { int32_cuda dst_index = i + j * dst_d.stride, src_index = i + j * src_stride; if (i < dst_d.cols && j < dst_d.rows) { Real a = mat[dst_index], b = A[src_index]; - mat[dst_index] = (a > b ? a : b); + mat[dst_index] = max_generic(a, b); } } @@ -890,9 +900,8 @@ static void _add_diag_mat_mat_MNT(const Real alpha, const Real* M, // Tree reduce to 2x warpSize elements. # pragma unroll for (int shift = CU1DBLOCK / 2; shift > warpSize; shift >>= 1) { - if (tid < shift) { + if (tid < shift) ssum[tid] += ssum[tid + shift]; - } __syncthreads(); } @@ -1248,7 +1257,7 @@ struct TransReduceOp { } __forceinline__ __device__ Real Reduce(const Real& a, const Real& b) const { - return max(a, b); + return max_generic(a, b); } __forceinline__ __device__ Real PostReduce(const Real& x, const Real& output) const { @@ -1288,7 +1297,7 @@ struct TransReduceOp { } __forceinline__ __device__ Real Reduce(const Real& a, const Real& b) const { - return max(a, b); + return max_generic(a, b); } __forceinline__ __device__ Real PostReduce(const Real& x, const Real& output) const { @@ -2155,7 +2164,7 @@ static void _softmax_reduce(Real*y, const Real*x, MatrixDim d, int src_stride) { // reduce to CU1DBLOCK elements per row. Real tmax = sizeof(Real) == sizeof(float) ? -CUDART_INF_F : -CUDART_INF; for (int j = tid; j < d.cols; j += CU1DBLOCK) { - tmax = max(tmax, x[x_start + j]); + tmax = max_generic(tmax, x[x_start + j]); } smem[tid] = tmax; __syncthreads(); @@ -2164,7 +2173,7 @@ static void _softmax_reduce(Real*y, const Real*x, MatrixDim d, int src_stride) { # pragma unroll for (int shift = CU1DBLOCK / 2; shift > warpSize; shift >>= 1) { if (tid < shift) { - smem[tid] = max(smem[tid], smem[tid + shift]); + smem[tid] = max_generic(smem[tid], smem[tid + shift]); } __syncthreads(); } @@ -2173,7 +2182,7 @@ static void _softmax_reduce(Real*y, const Real*x, MatrixDim d, int src_stride) { if (tid < warpSize) { # pragma unroll for (int shift = warpSize; shift > 0; shift >>= 1) { - smem[tid] = max(smem[tid], smem[tid + shift]); + smem[tid] = max_generic(smem[tid], smem[tid + shift]); } } @@ -2251,10 +2260,9 @@ static void _normalize_per_row(Real *y, int y_stride, const Real *x, // Tree reduce to 2x warpSize elements per row # pragma unroll for (int shift = CU1DBLOCK / 2; shift > warpSize; shift >>= 1) { - if (tid < shift) { + if (tid < shift) ssum[tid] += ssum[tid + shift]; - __syncthreads(); - } + __syncthreads(); } // Reduce last warp to 1 element per row. @@ -2269,7 +2277,7 @@ static void _normalize_per_row(Real *y, int y_stride, const Real *x, const Real kSquaredNormFloor = 1.35525271560688e-20; // 2^-66 if (tid == 0) { ssum[0] = sqrt( - max(ssum[0] / (target_rms * target_rms * x_d.cols), kSquaredNormFloor)); + max_generic(ssum[0] / (target_rms * target_rms * x_d.cols), kSquaredNormFloor)); } // Broadcast floored stddev to all threads. @@ -2312,7 +2320,7 @@ static void _log_softmax_reduce(Real* y, const Real* x, MatrixDim y_dim, // reduce to CU1DBLOCK elements per row. Real tmax = -1e20; for (int j = tid; j < y_dim.cols; j += CU1DBLOCK) { - tmax = max(tmax, x[x_start + j]); + tmax = max_generic(tmax, x[x_start + j]); } smem[tid] = tmax; __syncthreads(); @@ -2321,7 +2329,7 @@ static void _log_softmax_reduce(Real* y, const Real* x, MatrixDim y_dim, # pragma unroll for (int shift = CU1DBLOCK / 2; shift > warpSize; shift >>= 1) { if (tid < shift) { - smem[tid] = max(smem[tid], smem[tid + shift]); + smem[tid] = max_generic(smem[tid], smem[tid + shift]); } __syncthreads(); } @@ -2329,7 +2337,7 @@ static void _log_softmax_reduce(Real* y, const Real* x, MatrixDim y_dim, // reduce to 1 element per row if (tid < warpSize) { for (int shift = warpSize; shift > 0; shift >>= 1) { - smem[tid] = max(smem[tid], smem[tid + shift]); + smem[tid] = max_generic(smem[tid], smem[tid + shift]); } } diff --git a/src/cudamatrix/cu-math.cc b/src/cudamatrix/cu-math.cc index b5d0c040e02..763c762424a 100644 --- a/src/cudamatrix/cu-math.cc +++ b/src/cudamatrix/cu-math.cc @@ -29,15 +29,15 @@ namespace kaldi { namespace cu { /* - * templated functions wrapping the ANSI-C CUDA kernel functions + * templated functions wrapping the ANSI-C CUDA kernel functions */ template void RegularizeL1(CuMatrixBase *weight, CuMatrixBase *grad, Real l1, Real lr) { KALDI_ASSERT(SameDim(*weight, *grad)); -#if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { +#if HAVE_CUDA == 1 + if (CuDevice::Instantiate().Enabled()) { Timer tim; dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); @@ -46,7 +46,7 @@ void RegularizeL1(CuMatrixBase *weight, CuMatrixBase *grad, Real l1, cuda_regularize_l1(dimGrid, dimBlock, weight->Data(), grad->Data(), l1, lr, weight->Dim(), grad->Stride()); CU_SAFE_CALL(cudaGetLastError()); - + CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else #endif @@ -55,11 +55,11 @@ void RegularizeL1(CuMatrixBase *weight, CuMatrixBase *grad, Real l1, MatrixBase &grad2 = grad->Mat(); for(MatrixIndexT r=0; r &src, #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Timer tim; - + /* - Note: default 16x16 block-size limits the --cachesize to matrix size 16*65535 x 16*65535 + Note: default 16x16 block-size limits the --cachesize to matrix size 16*65535 x 16*65535 dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); dim3 dimGrid(n_blocks(tgt->NumCols(), CU2DBLOCK), n_blocks(copy_from_idx.Dim(), CU2DBLOCK)); */ /* * Let's use blocksize 4 x 128 (512 threads/block) - * and extend the randomizable matrices to: col 4*65535, row 128*65535 + * and extend the randomizable matrices to: col 4*65535, row 128*65535 * (ie. max-cols:262140 (dim), max-rows:8388480 (datapoints)) */ dim3 dimBlock(4, 128); @@ -111,7 +111,7 @@ void Randomize(const CuMatrixBase &src, cuda_randomize(dimGrid, dimBlock, tgt->Data(), src.Data(), copy_from_idx.Data(), dimtgt, dimsrc); CU_SAFE_CALL(cudaGetLastError()); - + CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else #endif @@ -124,28 +124,28 @@ void Randomize(const CuMatrixBase &src, tgtmat.Row(i).CopyFromVec(srcmat.Row(copy_from_idxvec[i])); } } -} +} template void Splice(const CuMatrixBase &src, const CuArray &frame_offsets, CuMatrixBase *tgt) { - + KALDI_ASSERT(src.NumCols()*frame_offsets.Dim() == tgt->NumCols()); KALDI_ASSERT(src.NumRows() == tgt->NumRows()); #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Timer tim; - + dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); dim3 dimGrid(n_blocks(tgt->NumCols(), CU2DBLOCK), n_blocks(tgt->NumRows(), CU2DBLOCK)); - + cuda_splice(dimGrid, dimBlock, tgt->Data(), src.Data(), frame_offsets.Data(), tgt->Dim(), src.Dim()); CU_SAFE_CALL(cudaGetLastError()); - + CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else #endif @@ -171,7 +171,7 @@ void Splice(const CuMatrixBase &src, const CuArray &frame_offsets, template void Copy(const CuMatrixBase &src, const CuArray ©_from_indices, - CuMatrixBase *tgt) { + CuMatrixBase *tgt) { KALDI_ASSERT(copy_from_indices.Dim() == tgt->NumCols()); KALDI_ASSERT(src.NumRows() == tgt->NumRows()); @@ -179,14 +179,14 @@ void Copy(const CuMatrixBase &src, const CuArray ©_from_indices #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Timer tim; - + dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); dim3 dimGrid(n_blocks(tgt->NumCols(), CU2DBLOCK), n_blocks(tgt->NumRows(), CU2DBLOCK)); - + cuda_copy(dimGrid, dimBlock, tgt->Data(), src.Data(), copy_from_indices.Data(), tgt->Dim(), src.Dim()); CU_SAFE_CALL(cudaGetLastError()); - + CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else #endif @@ -246,6 +246,7 @@ template void NormalizePerRow(const CuMatrixBase& in, const Real target_rms, const bool add_log_stddev, CuMatrixBase* out) { const Real kSquaredNormFloor = 1.35525271560688e-20; // 2^-66 + KALDI_ASSERT(SameDim(in, *out)); #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { @@ -289,4 +290,3 @@ void NormalizePerRow(const CuMatrixBase& in, const double target_rms, } //namespace cu } //namespace kaldi -