-
Notifications
You must be signed in to change notification settings - Fork 5.4k
Fixes to CUDA problems introduced by recent commits: normalize code c… #1228
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from all commits
Commits
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -28,6 +28,16 @@ | |
| #include <math_constants.h> | ||
| #include "cudamatrix/cu-kernels-ansi.h" | ||
|
|
||
|
|
||
|
|
||
| inline __device__ static float max_generic(float a, float b) { | ||
| return fmaxf(a, b); | ||
| } | ||
|
|
||
| inline __device__ static double max_generic(double a, double b) { | ||
| return fmax(a, b); | ||
| } | ||
|
|
||
| /*********************************************************************** | ||
| * Generic __device__ functions | ||
| */ | ||
|
|
@@ -379,7 +389,7 @@ static void _max(Real* mat, const Real* A, MatrixDim dst_d, int src_stride) { | |
| int32_cuda dst_index = i + j * dst_d.stride, src_index = i + j * src_stride; | ||
| if (i < dst_d.cols && j < dst_d.rows) { | ||
| Real a = mat[dst_index], b = A[src_index]; | ||
| mat[dst_index] = (a > b ? a : b); | ||
| mat[dst_index] = max_generic(a, b); | ||
| } | ||
| } | ||
|
|
||
|
|
@@ -890,9 +900,8 @@ static void _add_diag_mat_mat_MNT(const Real alpha, const Real* M, | |
| // Tree reduce to 2x warpSize elements. | ||
| # pragma unroll | ||
| for (int shift = CU1DBLOCK / 2; shift > warpSize; shift >>= 1) { | ||
| if (tid < shift) { | ||
| if (tid < shift) | ||
| ssum[tid] += ssum[tid + shift]; | ||
| } | ||
| __syncthreads(); | ||
| } | ||
|
|
||
|
|
@@ -1248,7 +1257,7 @@ struct TransReduceOp<MAX, Real> { | |
| } | ||
| __forceinline__ | ||
| __device__ Real Reduce(const Real& a, const Real& b) const { | ||
| return max(a, b); | ||
| return max_generic(a, b); | ||
| } | ||
| __forceinline__ | ||
| __device__ Real PostReduce(const Real& x, const Real& output) const { | ||
|
|
@@ -1288,7 +1297,7 @@ struct TransReduceOp<LINFNORM, Real> { | |
| } | ||
| __forceinline__ | ||
| __device__ Real Reduce(const Real& a, const Real& b) const { | ||
| return max(a, b); | ||
| return max_generic(a, b); | ||
| } | ||
| __forceinline__ | ||
| __device__ Real PostReduce(const Real& x, const Real& output) const { | ||
|
|
@@ -2155,7 +2164,7 @@ static void _softmax_reduce(Real*y, const Real*x, MatrixDim d, int src_stride) { | |
| // reduce to CU1DBLOCK elements per row. | ||
| Real tmax = sizeof(Real) == sizeof(float) ? -CUDART_INF_F : -CUDART_INF; | ||
| for (int j = tid; j < d.cols; j += CU1DBLOCK) { | ||
| tmax = max(tmax, x[x_start + j]); | ||
| tmax = max_generic(tmax, x[x_start + j]); | ||
| } | ||
| smem[tid] = tmax; | ||
| __syncthreads(); | ||
|
|
@@ -2164,7 +2173,7 @@ static void _softmax_reduce(Real*y, const Real*x, MatrixDim d, int src_stride) { | |
| # pragma unroll | ||
| for (int shift = CU1DBLOCK / 2; shift > warpSize; shift >>= 1) { | ||
| if (tid < shift) { | ||
| smem[tid] = max(smem[tid], smem[tid + shift]); | ||
| smem[tid] = max_generic(smem[tid], smem[tid + shift]); | ||
| } | ||
| __syncthreads(); | ||
| } | ||
|
|
@@ -2173,7 +2182,7 @@ static void _softmax_reduce(Real*y, const Real*x, MatrixDim d, int src_stride) { | |
| if (tid < warpSize) { | ||
| # pragma unroll | ||
| for (int shift = warpSize; shift > 0; shift >>= 1) { | ||
| smem[tid] = max(smem[tid], smem[tid + shift]); | ||
| smem[tid] = max_generic(smem[tid], smem[tid + shift]); | ||
| } | ||
| } | ||
|
|
||
|
|
@@ -2251,10 +2260,9 @@ static void _normalize_per_row(Real *y, int y_stride, const Real *x, | |
| // Tree reduce to 2x warpSize elements per row | ||
| # pragma unroll | ||
| for (int shift = CU1DBLOCK / 2; shift > warpSize; shift >>= 1) { | ||
| if (tid < shift) { | ||
| if (tid < shift) | ||
| ssum[tid] += ssum[tid + shift]; | ||
| __syncthreads(); | ||
| } | ||
| __syncthreads(); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. My bad… I missed this sync bug in the unit test. |
||
| } | ||
|
|
||
| // Reduce last warp to 1 element per row. | ||
|
|
@@ -2269,7 +2277,7 @@ static void _normalize_per_row(Real *y, int y_stride, const Real *x, | |
| const Real kSquaredNormFloor = 1.35525271560688e-20; // 2^-66 | ||
| if (tid == 0) { | ||
| ssum[0] = sqrt( | ||
| max(ssum[0] / (target_rms * target_rms * x_d.cols), kSquaredNormFloor)); | ||
| max_generic(ssum[0] / (target_rms * target_rms * x_d.cols), kSquaredNormFloor)); | ||
| } | ||
|
|
||
| // Broadcast floored stddev to all threads. | ||
|
|
@@ -2312,7 +2320,7 @@ static void _log_softmax_reduce(Real* y, const Real* x, MatrixDim y_dim, | |
| // reduce to CU1DBLOCK elements per row. | ||
| Real tmax = -1e20; | ||
| for (int j = tid; j < y_dim.cols; j += CU1DBLOCK) { | ||
| tmax = max(tmax, x[x_start + j]); | ||
| tmax = max_generic(tmax, x[x_start + j]); | ||
| } | ||
| smem[tid] = tmax; | ||
| __syncthreads(); | ||
|
|
@@ -2321,15 +2329,15 @@ static void _log_softmax_reduce(Real* y, const Real* x, MatrixDim y_dim, | |
| # pragma unroll | ||
| for (int shift = CU1DBLOCK / 2; shift > warpSize; shift >>= 1) { | ||
| if (tid < shift) { | ||
| smem[tid] = max(smem[tid], smem[tid + shift]); | ||
| smem[tid] = max_generic(smem[tid], smem[tid + shift]); | ||
| } | ||
| __syncthreads(); | ||
| } | ||
|
|
||
| // reduce to 1 element per row | ||
| if (tid < warpSize) { | ||
| for (int shift = warpSize; shift > 0; shift >>= 1) { | ||
| smem[tid] = max(smem[tid], smem[tid + shift]); | ||
| smem[tid] = max_generic(smem[tid], smem[tid + shift]); | ||
| } | ||
| } | ||
|
|
||
|
|
||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -29,15 +29,15 @@ namespace kaldi { | |
| namespace cu { | ||
|
|
||
| /* | ||
| * templated functions wrapping the ANSI-C CUDA kernel functions | ||
| * templated functions wrapping the ANSI-C CUDA kernel functions | ||
| */ | ||
|
|
||
|
|
||
| template<typename Real> | ||
| void RegularizeL1(CuMatrixBase<Real> *weight, CuMatrixBase<Real> *grad, Real l1, Real lr) { | ||
| KALDI_ASSERT(SameDim(*weight, *grad)); | ||
| #if HAVE_CUDA == 1 | ||
| if (CuDevice::Instantiate().Enabled()) { | ||
| #if HAVE_CUDA == 1 | ||
| if (CuDevice::Instantiate().Enabled()) { | ||
| Timer tim; | ||
|
|
||
| dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); | ||
|
|
@@ -46,7 +46,7 @@ void RegularizeL1(CuMatrixBase<Real> *weight, CuMatrixBase<Real> *grad, Real l1, | |
| cuda_regularize_l1(dimGrid, dimBlock, weight->Data(), grad->Data(), l1, lr, | ||
| weight->Dim(), grad->Stride()); | ||
| CU_SAFE_CALL(cudaGetLastError()); | ||
|
|
||
| CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); | ||
| } else | ||
| #endif | ||
|
|
@@ -55,11 +55,11 @@ void RegularizeL1(CuMatrixBase<Real> *weight, CuMatrixBase<Real> *grad, Real l1, | |
| MatrixBase<Real> &grad2 = grad->Mat(); | ||
| for(MatrixIndexT r=0; r<weight2.NumRows(); r++) { | ||
| for(MatrixIndexT c=0; c<weight2.NumCols(); c++) { | ||
|
|
||
| if(weight2(r,c)==0.0) continue; // skip L1 if zero weightght! | ||
|
|
||
| Real l1_signed = l1; | ||
| if (weight2(r, c) < 0.0) | ||
| if (weight2(r, c) < 0.0) | ||
| l1_signed = -l1; | ||
|
|
||
| Real before = weight2(r, c); | ||
|
|
@@ -88,16 +88,16 @@ void Randomize(const CuMatrixBase<Real> &src, | |
| #if HAVE_CUDA == 1 | ||
| if (CuDevice::Instantiate().Enabled()) { | ||
| Timer tim; | ||
|
|
||
| /* | ||
| Note: default 16x16 block-size limits the --cachesize to matrix size 16*65535 x 16*65535 | ||
| Note: default 16x16 block-size limits the --cachesize to matrix size 16*65535 x 16*65535 | ||
| dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); | ||
| dim3 dimGrid(n_blocks(tgt->NumCols(), CU2DBLOCK), n_blocks(copy_from_idx.Dim(), CU2DBLOCK)); | ||
| */ | ||
|
|
||
| /* | ||
| * Let's use blocksize 4 x 128 (512 threads/block) | ||
| * and extend the randomizable matrices to: col 4*65535, row 128*65535 | ||
| * and extend the randomizable matrices to: col 4*65535, row 128*65535 | ||
| * (ie. max-cols:262140 (dim), max-rows:8388480 (datapoints)) | ||
| */ | ||
| dim3 dimBlock(4, 128); | ||
|
|
@@ -111,7 +111,7 @@ void Randomize(const CuMatrixBase<Real> &src, | |
| cuda_randomize(dimGrid, dimBlock, tgt->Data(), src.Data(), | ||
| copy_from_idx.Data(), dimtgt, dimsrc); | ||
| CU_SAFE_CALL(cudaGetLastError()); | ||
|
|
||
| CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); | ||
| } else | ||
| #endif | ||
|
|
@@ -124,28 +124,28 @@ void Randomize(const CuMatrixBase<Real> &src, | |
| tgtmat.Row(i).CopyFromVec(srcmat.Row(copy_from_idxvec[i])); | ||
| } | ||
| } | ||
| } | ||
| } | ||
|
|
||
|
|
||
|
|
||
| template<typename Real> | ||
| void Splice(const CuMatrixBase<Real> &src, const CuArray<int32> &frame_offsets, | ||
| CuMatrixBase<Real> *tgt) { | ||
|
|
||
| KALDI_ASSERT(src.NumCols()*frame_offsets.Dim() == tgt->NumCols()); | ||
| KALDI_ASSERT(src.NumRows() == tgt->NumRows()); | ||
|
|
||
| #if HAVE_CUDA == 1 | ||
| if (CuDevice::Instantiate().Enabled()) { | ||
| Timer tim; | ||
|
|
||
| dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); | ||
| dim3 dimGrid(n_blocks(tgt->NumCols(), CU2DBLOCK), n_blocks(tgt->NumRows(), CU2DBLOCK)); | ||
|
|
||
| cuda_splice(dimGrid, dimBlock, tgt->Data(), src.Data(), | ||
| frame_offsets.Data(), tgt->Dim(), src.Dim()); | ||
| CU_SAFE_CALL(cudaGetLastError()); | ||
|
|
||
| CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); | ||
| } else | ||
| #endif | ||
|
|
@@ -171,22 +171,22 @@ void Splice(const CuMatrixBase<Real> &src, const CuArray<int32> &frame_offsets, | |
|
|
||
| template<typename Real> | ||
| void Copy(const CuMatrixBase<Real> &src, const CuArray<int32> ©_from_indices, | ||
| CuMatrixBase<Real> *tgt) { | ||
| CuMatrixBase<Real> *tgt) { | ||
|
|
||
| KALDI_ASSERT(copy_from_indices.Dim() == tgt->NumCols()); | ||
| KALDI_ASSERT(src.NumRows() == tgt->NumRows()); | ||
|
|
||
| #if HAVE_CUDA == 1 | ||
| if (CuDevice::Instantiate().Enabled()) { | ||
| Timer tim; | ||
|
|
||
| dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); | ||
| dim3 dimGrid(n_blocks(tgt->NumCols(), CU2DBLOCK), n_blocks(tgt->NumRows(), CU2DBLOCK)); | ||
|
|
||
| cuda_copy(dimGrid, dimBlock, tgt->Data(), src.Data(), | ||
| copy_from_indices.Data(), tgt->Dim(), src.Dim()); | ||
| CU_SAFE_CALL(cudaGetLastError()); | ||
|
|
||
| CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); | ||
| } else | ||
| #endif | ||
|
|
@@ -246,6 +246,7 @@ template<typename Real> | |
| void NormalizePerRow(const CuMatrixBase<Real>& in, const Real target_rms, | ||
| const bool add_log_stddev, CuMatrixBase<Real>* out) { | ||
| const Real kSquaredNormFloor = 1.35525271560688e-20; // 2^-66 | ||
| KALDI_ASSERT(SameDim(in, *out)); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This does not hold if add_log_stddev is true. |
||
|
|
||
| #if HAVE_CUDA == 1 | ||
| if (CuDevice::Instantiate().Enabled()) { | ||
|
|
@@ -289,4 +290,3 @@ void NormalizePerRow(const CuMatrixBase<double>& in, const double target_rms, | |
| } //namespace cu | ||
|
|
||
| } //namespace kaldi | ||
|
|
||
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Math functions such as `fmax()` are already overloaded for both `double` and `float`, as documented here: http://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH.html#group__CUDA__MATH

It seems that `max()` has been overloaded for all possible types in the header `/usr/local/cuda/include/math_functions.h`, but this is not documented.