Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 10 additions & 2 deletions src/chain/chain-kernels.cu
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,14 @@
configure with --use-cuda=no (this will disable the use of GPU).
#endif

#if __CUDA_ARCH__ < 600

#ifdef __CUDACC__
#if ( __CUDACC_VER_MAJOR__ >= 8 ) && ( !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600 )
// native implementation available
#else
#if __CUDA_ARCH__ >= 600
#error using CAS implementation of double atomicAdd
#endif
__device__ double atomicAdd(double* address, double val) {
unsigned long long int* address_as_ull = (unsigned long long int*) address;
unsigned long long int old = *address_as_ull, assumed;
Expand All @@ -41,6 +48,8 @@ __device__ double atomicAdd(double* address, double val) {
return __longlong_as_double(old);
}
#endif
#endif


template <typename Real>
__device__ inline void atomic_add(Real* address, Real value) {
Expand Down Expand Up @@ -278,4 +287,3 @@ void cuda_chain_hmm_backward(dim3 Gr, dim3 Bl,
this_beta, log_prob_deriv,
log_prob_deriv_stride);
}

38 changes: 23 additions & 15 deletions src/cudamatrix/cu-kernels.cu
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,16 @@
#include <math_constants.h>
#include "cudamatrix/cu-kernels-ansi.h"



// Single-precision overload of max_generic; forwards to the float
// intrinsic fmaxf() (NaN-aware per the CUDA math API: if one operand
// is NaN, the other operand is returned).
inline __device__ static float max_generic(float a, float b) {
return fmaxf(a, b);
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Math functions such as fmax() are already overloaded for both double and float as documented here. http://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH.html#group__CUDA__MATH

It seems that max() has been overloaded for all possible types in this header but this is not documented.
/usr/local/cuda/include/math_functions.h

}

// Double-precision overload of max_generic; forwards to the double
// fmax() overload (NaN-aware per the CUDA math API: if one operand is
// NaN, the other operand is returned).
static inline __device__ double max_generic(double a, double b) {
  return fmax(a, b);
}

/***********************************************************************
* Generic __device__ functions
*/
Expand Down Expand Up @@ -379,7 +389,7 @@ static void _max(Real* mat, const Real* A, MatrixDim dst_d, int src_stride) {
int32_cuda dst_index = i + j * dst_d.stride, src_index = i + j * src_stride;
if (i < dst_d.cols && j < dst_d.rows) {
Real a = mat[dst_index], b = A[src_index];
mat[dst_index] = (a > b ? a : b);
mat[dst_index] = max_generic(a, b);
}
}

Expand Down Expand Up @@ -890,9 +900,8 @@ static void _add_diag_mat_mat_MNT(const Real alpha, const Real* M,
// Tree reduce to 2x warpSize elements.
# pragma unroll
for (int shift = CU1DBLOCK / 2; shift > warpSize; shift >>= 1) {
if (tid < shift) {
if (tid < shift)
ssum[tid] += ssum[tid + shift];
}
__syncthreads();
}

Expand Down Expand Up @@ -1248,7 +1257,7 @@ struct TransReduceOp<MAX, Real> {
}
__forceinline__
__device__ Real Reduce(const Real& a, const Real& b) const {
return max(a, b);
return max_generic(a, b);
}
__forceinline__
__device__ Real PostReduce(const Real& x, const Real& output) const {
Expand Down Expand Up @@ -1288,7 +1297,7 @@ struct TransReduceOp<LINFNORM, Real> {
}
__forceinline__
__device__ Real Reduce(const Real& a, const Real& b) const {
return max(a, b);
return max_generic(a, b);
}
__forceinline__
__device__ Real PostReduce(const Real& x, const Real& output) const {
Expand Down Expand Up @@ -2155,7 +2164,7 @@ static void _softmax_reduce(Real*y, const Real*x, MatrixDim d, int src_stride) {
// reduce to CU1DBLOCK elements per row.
Real tmax = sizeof(Real) == sizeof(float) ? -CUDART_INF_F : -CUDART_INF;
for (int j = tid; j < d.cols; j += CU1DBLOCK) {
tmax = max(tmax, x[x_start + j]);
tmax = max_generic(tmax, x[x_start + j]);
}
smem[tid] = tmax;
__syncthreads();
Expand All @@ -2164,7 +2173,7 @@ static void _softmax_reduce(Real*y, const Real*x, MatrixDim d, int src_stride) {
# pragma unroll
for (int shift = CU1DBLOCK / 2; shift > warpSize; shift >>= 1) {
if (tid < shift) {
smem[tid] = max(smem[tid], smem[tid + shift]);
smem[tid] = max_generic(smem[tid], smem[tid + shift]);
}
__syncthreads();
}
Expand All @@ -2173,7 +2182,7 @@ static void _softmax_reduce(Real*y, const Real*x, MatrixDim d, int src_stride) {
if (tid < warpSize) {
# pragma unroll
for (int shift = warpSize; shift > 0; shift >>= 1) {
smem[tid] = max(smem[tid], smem[tid + shift]);
smem[tid] = max_generic(smem[tid], smem[tid + shift]);
}
}

Expand Down Expand Up @@ -2251,10 +2260,9 @@ static void _normalize_per_row(Real *y, int y_stride, const Real *x,
// Tree reduce to 2x warpSize elements per row
# pragma unroll
for (int shift = CU1DBLOCK / 2; shift > warpSize; shift >>= 1) {
if (tid < shift) {
if (tid < shift)
ssum[tid] += ssum[tid + shift];
__syncthreads();
}
__syncthreads();
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My bad — I missed this sync bug; the unit tests did not catch it.

}

// Reduce last warp to 1 element per row.
Expand All @@ -2269,7 +2277,7 @@ static void _normalize_per_row(Real *y, int y_stride, const Real *x,
const Real kSquaredNormFloor = 1.35525271560688e-20; // 2^-66
if (tid == 0) {
ssum[0] = sqrt(
max(ssum[0] / (target_rms * target_rms * x_d.cols), kSquaredNormFloor));
max_generic(ssum[0] / (target_rms * target_rms * x_d.cols), kSquaredNormFloor));
}

// Broadcast floored stddev to all threads.
Expand Down Expand Up @@ -2312,7 +2320,7 @@ static void _log_softmax_reduce(Real* y, const Real* x, MatrixDim y_dim,
// reduce to CU1DBLOCK elements per row.
Real tmax = -1e20;
for (int j = tid; j < y_dim.cols; j += CU1DBLOCK) {
tmax = max(tmax, x[x_start + j]);
tmax = max_generic(tmax, x[x_start + j]);
}
smem[tid] = tmax;
__syncthreads();
Expand All @@ -2321,15 +2329,15 @@ static void _log_softmax_reduce(Real* y, const Real* x, MatrixDim y_dim,
# pragma unroll
for (int shift = CU1DBLOCK / 2; shift > warpSize; shift >>= 1) {
if (tid < shift) {
smem[tid] = max(smem[tid], smem[tid + shift]);
smem[tid] = max_generic(smem[tid], smem[tid + shift]);
}
__syncthreads();
}

// reduce to 1 element per row
if (tid < warpSize) {
for (int shift = warpSize; shift > 0; shift >>= 1) {
smem[tid] = max(smem[tid], smem[tid + shift]);
smem[tid] = max_generic(smem[tid], smem[tid + shift]);
}
}

Expand Down
40 changes: 20 additions & 20 deletions src/cudamatrix/cu-math.cc
Original file line number Diff line number Diff line change
Expand Up @@ -29,15 +29,15 @@ namespace kaldi {
namespace cu {

/*
* templated functions wrapping the ANSI-C CUDA kernel functions
* templated functions wrapping the ANSI-C CUDA kernel functions
*/


template<typename Real>
void RegularizeL1(CuMatrixBase<Real> *weight, CuMatrixBase<Real> *grad, Real l1, Real lr) {
KALDI_ASSERT(SameDim(*weight, *grad));
#if HAVE_CUDA == 1
if (CuDevice::Instantiate().Enabled()) {
#if HAVE_CUDA == 1
if (CuDevice::Instantiate().Enabled()) {
Timer tim;

dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
Expand All @@ -46,7 +46,7 @@ void RegularizeL1(CuMatrixBase<Real> *weight, CuMatrixBase<Real> *grad, Real l1,
cuda_regularize_l1(dimGrid, dimBlock, weight->Data(), grad->Data(), l1, lr,
weight->Dim(), grad->Stride());
CU_SAFE_CALL(cudaGetLastError());

CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
} else
#endif
Expand All @@ -55,11 +55,11 @@ void RegularizeL1(CuMatrixBase<Real> *weight, CuMatrixBase<Real> *grad, Real l1,
MatrixBase<Real> &grad2 = grad->Mat();
for(MatrixIndexT r=0; r<weight2.NumRows(); r++) {
for(MatrixIndexT c=0; c<weight2.NumCols(); c++) {

if(weight2(r,c)==0.0) continue; // skip L1 if zero weight!

Real l1_signed = l1;
if (weight2(r, c) < 0.0)
if (weight2(r, c) < 0.0)
l1_signed = -l1;

Real before = weight2(r, c);
Expand Down Expand Up @@ -88,16 +88,16 @@ void Randomize(const CuMatrixBase<Real> &src,
#if HAVE_CUDA == 1
if (CuDevice::Instantiate().Enabled()) {
Timer tim;

/*
Note: default 16x16 block-size limits the --cachesize to matrix size 16*65535 x 16*65535
Note: default 16x16 block-size limits the --cachesize to matrix size 16*65535 x 16*65535
dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
dim3 dimGrid(n_blocks(tgt->NumCols(), CU2DBLOCK), n_blocks(copy_from_idx.Dim(), CU2DBLOCK));
*/

/*
* Let's use blocksize 4 x 128 (512 threads/block)
* and extend the randomizable matrices to: col 4*65535, row 128*65535
* and extend the randomizable matrices to: col 4*65535, row 128*65535
* (ie. max-cols:262140 (dim), max-rows:8388480 (datapoints))
*/
dim3 dimBlock(4, 128);
Expand All @@ -111,7 +111,7 @@ void Randomize(const CuMatrixBase<Real> &src,
cuda_randomize(dimGrid, dimBlock, tgt->Data(), src.Data(),
copy_from_idx.Data(), dimtgt, dimsrc);
CU_SAFE_CALL(cudaGetLastError());

CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
} else
#endif
Expand All @@ -124,28 +124,28 @@ void Randomize(const CuMatrixBase<Real> &src,
tgtmat.Row(i).CopyFromVec(srcmat.Row(copy_from_idxvec[i]));
}
}
}
}



template<typename Real>
void Splice(const CuMatrixBase<Real> &src, const CuArray<int32> &frame_offsets,
CuMatrixBase<Real> *tgt) {

KALDI_ASSERT(src.NumCols()*frame_offsets.Dim() == tgt->NumCols());
KALDI_ASSERT(src.NumRows() == tgt->NumRows());

#if HAVE_CUDA == 1
if (CuDevice::Instantiate().Enabled()) {
Timer tim;

dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
dim3 dimGrid(n_blocks(tgt->NumCols(), CU2DBLOCK), n_blocks(tgt->NumRows(), CU2DBLOCK));

cuda_splice(dimGrid, dimBlock, tgt->Data(), src.Data(),
frame_offsets.Data(), tgt->Dim(), src.Dim());
CU_SAFE_CALL(cudaGetLastError());

CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
} else
#endif
Expand All @@ -171,22 +171,22 @@ void Splice(const CuMatrixBase<Real> &src, const CuArray<int32> &frame_offsets,

template<typename Real>
void Copy(const CuMatrixBase<Real> &src, const CuArray<int32> &copy_from_indices,
CuMatrixBase<Real> *tgt) {
CuMatrixBase<Real> *tgt) {

KALDI_ASSERT(copy_from_indices.Dim() == tgt->NumCols());
KALDI_ASSERT(src.NumRows() == tgt->NumRows());

#if HAVE_CUDA == 1
if (CuDevice::Instantiate().Enabled()) {
Timer tim;

dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
dim3 dimGrid(n_blocks(tgt->NumCols(), CU2DBLOCK), n_blocks(tgt->NumRows(), CU2DBLOCK));

cuda_copy(dimGrid, dimBlock, tgt->Data(), src.Data(),
copy_from_indices.Data(), tgt->Dim(), src.Dim());
CU_SAFE_CALL(cudaGetLastError());

CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
} else
#endif
Expand Down Expand Up @@ -246,6 +246,7 @@ template<typename Real>
void NormalizePerRow(const CuMatrixBase<Real>& in, const Real target_rms,
const bool add_log_stddev, CuMatrixBase<Real>* out) {
const Real kSquaredNormFloor = 1.35525271560688e-20; // 2^-66
KALDI_ASSERT(SameDim(in, *out));
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This does not hold if add_log_stddev is true.


#if HAVE_CUDA == 1
if (CuDevice::Instantiate().Enabled()) {
Expand Down Expand Up @@ -289,4 +290,3 @@ void NormalizePerRow(const CuMatrixBase<double>& in, const double target_rms,
} //namespace cu

} //namespace kaldi