diff --git a/src/chain/chain-kernels.cu b/src/chain/chain-kernels.cu
index 1a1bc2f3bcf..f093f21a5a5 100644
--- a/src/chain/chain-kernels.cu
+++ b/src/chain/chain-kernels.cu
@@ -25,7 +25,14 @@
          configure with --use-cuda=no (this will disable the use of GPU).
 #endif
 
-#if __CUDA_ARCH__ < 600
+
+#ifdef __CUDACC__
+#if ( __CUDACC_VER_MAJOR__ >= 8 ) && ( !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600 )
+// native implementation available
+#else
+#if __CUDA_ARCH__ >= 600
+#error using CAS implementation of double atomicAdd
+#endif
 __device__ double atomicAdd(double* address, double val) {
   unsigned long long int* address_as_ull = (unsigned long long int*) address;
   unsigned long long int old = *address_as_ull, assumed;
@@ -41,6 +48,8 @@ __device__ double atomicAdd(double* address, double val) {
   return __longlong_as_double(old);
 }
 #endif
+#endif
+
 
 template <typename Real>
 __device__ inline void atomic_add(Real* address, Real value) {
@@ -278,4 +287,3 @@ void cuda_chain_hmm_backward(dim3 Gr, dim3 Bl,
                                       this_beta, log_prob_deriv,
                                       log_prob_deriv_stride);
 }
-
diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu
index 97416750372..3c4bb7a660c 100644
--- a/src/cudamatrix/cu-kernels.cu
+++ b/src/cudamatrix/cu-kernels.cu
@@ -28,6 +28,16 @@
 #include <math_constants.h>
 #include "cudamatrix/cu-kernels-ansi.h"
 
+
+
+inline __device__ static float max_generic(float a, float b) {
+  return fmaxf(a, b);
+}
+
+inline __device__ static double max_generic(double a, double b) {
+  return fmax(a, b);
+}
+
 /***********************************************************************
  * Generic __device__ functions
  */
@@ -379,7 +389,7 @@ static void _max(Real* mat, const Real* A, MatrixDim dst_d, int src_stride) {
   int32_cuda dst_index = i + j * dst_d.stride, src_index = i + j * src_stride;
   if (i < dst_d.cols && j < dst_d.rows) {
     Real a = mat[dst_index], b = A[src_index];
-    mat[dst_index] = (a > b ? a : b);
+    mat[dst_index] = max_generic(a, b);
   }
 }
 
@@ -890,9 +900,8 @@ static void _add_diag_mat_mat_MNT(const Real alpha, const Real* M,
   // Tree reduce to 2x warpSize elements.
 # pragma unroll
   for (int shift = CU1DBLOCK / 2; shift > warpSize; shift >>= 1) {
-    if (tid < shift) {
+    if (tid < shift)
       ssum[tid] += ssum[tid + shift];
-    }
     __syncthreads();
   }
 
@@ -1248,7 +1257,7 @@ struct TransReduceOp<MAX, Real> {
   }
   __forceinline__
   __device__ Real Reduce(const Real& a, const Real& b) const {
-    return max(a, b);
+    return max_generic(a, b);
   }
   __forceinline__
   __device__ Real PostReduce(const Real& x, const Real& output) const {
@@ -1288,7 +1297,7 @@ struct TransReduceOp<LINFNORM, Real> {
   }
   __forceinline__
   __device__ Real Reduce(const Real& a, const Real& b) const {
-    return max(a, b);
+    return max_generic(a, b);
   }
   __forceinline__
   __device__ Real PostReduce(const Real& x, const Real& output) const {
@@ -2155,7 +2164,7 @@ static void _softmax_reduce(Real*y, const Real*x, MatrixDim d, int src_stride) {
   // reduce to CU1DBLOCK elements per row.
   Real tmax = sizeof(Real) == sizeof(float) ? -CUDART_INF_F : -CUDART_INF;
   for (int j = tid; j < d.cols; j += CU1DBLOCK) {
-    tmax = max(tmax, x[x_start + j]);
+    tmax = max_generic(tmax, x[x_start + j]);
   }
   smem[tid] = tmax;
   __syncthreads();
@@ -2164,7 +2173,7 @@ static void _softmax_reduce(Real*y, const Real*x, MatrixDim d, int src_stride) {
 # pragma unroll
   for (int shift = CU1DBLOCK / 2; shift > warpSize; shift >>= 1) {
     if (tid < shift) {
-      smem[tid] = max(smem[tid], smem[tid + shift]);
+      smem[tid] = max_generic(smem[tid], smem[tid + shift]);
     }
     __syncthreads();
   }
@@ -2173,7 +2182,7 @@ static void _softmax_reduce(Real*y, const Real*x, MatrixDim d, int src_stride) {
   if (tid < warpSize) {
 #   pragma unroll
     for (int shift = warpSize; shift > 0; shift >>= 1) {
-      smem[tid] = max(smem[tid], smem[tid + shift]);
+      smem[tid] = max_generic(smem[tid], smem[tid + shift]);
     }
   }
 
@@ -2251,10 +2260,9 @@ static void _normalize_per_row(Real *y, int y_stride, const Real *x,
   // Tree reduce to 2x warpSize elements per row
 # pragma unroll
   for (int shift = CU1DBLOCK / 2; shift > warpSize; shift >>= 1) {
-    if (tid < shift) {
+    if (tid < shift)
       ssum[tid] += ssum[tid + shift];
-      __syncthreads();
-    }
+    __syncthreads();
   }
 
   // Reduce last warp to 1 element per row.
@@ -2269,7 +2277,7 @@ static void _normalize_per_row(Real *y, int y_stride, const Real *x,
   const Real kSquaredNormFloor = 1.35525271560688e-20; // 2^-66
   if (tid == 0) {
     ssum[0] = sqrt(
-        max(ssum[0] / (target_rms * target_rms * x_d.cols), kSquaredNormFloor));
+        max_generic(ssum[0] / (target_rms * target_rms * x_d.cols), kSquaredNormFloor));
   }
 
   // Broadcast floored stddev to all threads.
@@ -2312,7 +2320,7 @@ static void _log_softmax_reduce(Real* y, const Real* x, MatrixDim y_dim,
   // reduce to CU1DBLOCK elements per row.
   Real tmax = -1e20;
   for (int j = tid; j < y_dim.cols; j += CU1DBLOCK) {
-    tmax = max(tmax, x[x_start + j]);
+    tmax = max_generic(tmax, x[x_start + j]);
   }
   smem[tid] = tmax;
   __syncthreads();
@@ -2321,7 +2329,7 @@ static void _log_softmax_reduce(Real* y, const Real* x, MatrixDim y_dim,
 # pragma unroll
   for (int shift = CU1DBLOCK / 2; shift > warpSize; shift >>= 1) {
     if (tid < shift) {
-      smem[tid] = max(smem[tid], smem[tid + shift]);
+      smem[tid] = max_generic(smem[tid], smem[tid + shift]);
     }
     __syncthreads();
   }
@@ -2329,7 +2337,7 @@ static void _log_softmax_reduce(Real* y, const Real* x, MatrixDim y_dim,
   // reduce to 1 element per row
   if (tid < warpSize) {
     for (int shift = warpSize; shift > 0; shift >>= 1) {
-      smem[tid] = max(smem[tid], smem[tid + shift]);
+      smem[tid] = max_generic(smem[tid], smem[tid + shift]);
     }
   }
 
diff --git a/src/cudamatrix/cu-math.cc b/src/cudamatrix/cu-math.cc
index b5d0c040e02..763c762424a 100644
--- a/src/cudamatrix/cu-math.cc
+++ b/src/cudamatrix/cu-math.cc
@@ -29,15 +29,15 @@ namespace kaldi {
 namespace cu {
 
 /*
- * templated functions wrapping the ANSI-C CUDA kernel functions 
+ * templated functions wrapping the ANSI-C CUDA kernel functions
  */
 
 
 template<typename Real>
 void RegularizeL1(CuMatrixBase<Real> *weight, CuMatrixBase<Real> *grad, Real l1, Real lr) {
   KALDI_ASSERT(SameDim(*weight, *grad));
-#if HAVE_CUDA == 1 
-  if (CuDevice::Instantiate().Enabled()) { 
+#if HAVE_CUDA == 1
+  if (CuDevice::Instantiate().Enabled()) {
     Timer tim;
 
     dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
@@ -46,7 +46,7 @@ void RegularizeL1(CuMatrixBase<Real> *weight, CuMatrixBase<Real> *grad, Real l1,
     cuda_regularize_l1(dimGrid, dimBlock, weight->Data(), grad->Data(), l1, lr,
                        weight->Dim(), grad->Stride());
     CU_SAFE_CALL(cudaGetLastError());
-    
+
     CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
   } else
   #endif
@@ -55,11 +55,11 @@ void RegularizeL1(CuMatrixBase<Real> *weight, CuMatrixBase<Real> *grad, Real l1,
     MatrixBase<Real> &grad2 = grad->Mat();
     for(MatrixIndexT r=0; r<weight2.NumRows(); r++) {
       for(MatrixIndexT c=0; c<weight2.NumCols(); c++) {
-        
+
         if(weight2(r,c)==0.0) continue; // skip L1 if zero weightght!
 
         Real l1_signed = l1;
-        if (weight2(r, c) < 0.0) 
+        if (weight2(r, c) < 0.0)
           l1_signed = -l1;
 
         Real before = weight2(r, c);
@@ -88,16 +88,16 @@ void Randomize(const CuMatrixBase<Real> &src,
   #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     Timer tim;
-    
+
     /*
-    Note: default 16x16 block-size limits the --cachesize to matrix size 16*65535 x 16*65535 
+    Note: default 16x16 block-size limits the --cachesize to matrix size 16*65535 x 16*65535
     dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
     dim3 dimGrid(n_blocks(tgt->NumCols(), CU2DBLOCK), n_blocks(copy_from_idx.Dim(), CU2DBLOCK));
     */
 
     /*
      * Let's use blocksize 4 x 128 (512 threads/block)
-     * and extend the randomizable matrices to: col 4*65535, row 128*65535 
+     * and extend the randomizable matrices to: col 4*65535, row 128*65535
      * (ie. max-cols:262140 (dim), max-rows:8388480 (datapoints))
      */
     dim3 dimBlock(4, 128);
@@ -111,7 +111,7 @@ void Randomize(const CuMatrixBase<Real> &src,
     cuda_randomize(dimGrid, dimBlock, tgt->Data(), src.Data(),
                    copy_from_idx.Data(), dimtgt, dimsrc);
     CU_SAFE_CALL(cudaGetLastError());
-    
+
     CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
   } else
   #endif
@@ -124,28 +124,28 @@ void Randomize(const CuMatrixBase<Real> &src,
       tgtmat.Row(i).CopyFromVec(srcmat.Row(copy_from_idxvec[i]));
     }
   }
-} 
+}
 
 
 
 template<typename Real>
 void Splice(const CuMatrixBase<Real> &src, const CuArray<int32> &frame_offsets,
             CuMatrixBase<Real> *tgt) {
-  
+
   KALDI_ASSERT(src.NumCols()*frame_offsets.Dim() == tgt->NumCols());
   KALDI_ASSERT(src.NumRows() == tgt->NumRows());
 
   #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     Timer tim;
-    
+
     dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
     dim3 dimGrid(n_blocks(tgt->NumCols(), CU2DBLOCK), n_blocks(tgt->NumRows(), CU2DBLOCK));
-    
+
     cuda_splice(dimGrid, dimBlock, tgt->Data(), src.Data(),
                 frame_offsets.Data(), tgt->Dim(), src.Dim());
     CU_SAFE_CALL(cudaGetLastError());
-    
+
     CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
   } else
   #endif
@@ -171,7 +171,7 @@ void Splice(const CuMatrixBase<Real> &src, const CuArray<int32> &frame_offsets,
 
 template<typename Real>
 void Copy(const CuMatrixBase<Real> &src, const CuArray<int32> &copy_from_indices,
-          CuMatrixBase<Real> *tgt) { 
+          CuMatrixBase<Real> *tgt) {
 
   KALDI_ASSERT(copy_from_indices.Dim() == tgt->NumCols());
   KALDI_ASSERT(src.NumRows() == tgt->NumRows());
@@ -179,14 +179,14 @@ void Copy(const CuMatrixBase<Real> &src, const CuArray<int32> &copy_from_indices
   #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     Timer tim;
-    
+
     dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
     dim3 dimGrid(n_blocks(tgt->NumCols(), CU2DBLOCK), n_blocks(tgt->NumRows(), CU2DBLOCK));
-    
+
     cuda_copy(dimGrid, dimBlock, tgt->Data(), src.Data(),
               copy_from_indices.Data(), tgt->Dim(), src.Dim());
     CU_SAFE_CALL(cudaGetLastError());
-    
+
     CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
   } else
   #endif
@@ -246,6 +246,7 @@ template<typename Real>
 void NormalizePerRow(const CuMatrixBase<Real>& in, const Real target_rms,
                      const bool add_log_stddev, CuMatrixBase<Real>* out) {
   const Real kSquaredNormFloor = 1.35525271560688e-20; // 2^-66
+  KALDI_ASSERT(SameDim(in, *out));
 
 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
@@ -289,4 +290,3 @@ void NormalizePerRow(const CuMatrixBase<double>& in, const double target_rms,
 } //namespace cu
 
 } //namespace kaldi
-