diff --git a/src/cudamatrix/cu-array-inl.h b/src/cudamatrix/cu-array-inl.h index 23b20501d4c..567cc0f6d18 100644 --- a/src/cudamatrix/cu-array-inl.h +++ b/src/cudamatrix/cu-array-inl.h @@ -105,8 +105,9 @@ void CuArrayBase::CopyFromVec(const std::vector &src) { if (CuDevice::Instantiate().Enabled()) { CuTimer tim; CU_SAFE_CALL( - cudaMemcpy(data_, &src.front(), src.size() * sizeof(T), - cudaMemcpyHostToDevice)); + cudaMemcpyAsync(data_, &src.front(), src.size() * sizeof(T), + cudaMemcpyHostToDevice, cudaStreamPerThread)); + CU_SAFE_CALL(cudaStreamSynchronize(cudaStreamPerThread)); CuDevice::Instantiate().AccuProfile(__func__, tim); } else #endif @@ -122,7 +123,9 @@ void CuArray::CopyFromVec(const std::vector &src) { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { CuTimer tim; - CU_SAFE_CALL(cudaMemcpy(this->data_, &src.front(), src.size()*sizeof(T), cudaMemcpyHostToDevice)); + CU_SAFE_CALL(cudaMemcpyAsync(this->data_, &src.front(), + src.size()*sizeof(T), cudaMemcpyHostToDevice, cudaStreamPerThread)); + CU_SAFE_CALL(cudaStreamSynchronize(cudaStreamPerThread)); CuDevice::Instantiate().AccuProfile(__func__, tim); } else #endif @@ -179,7 +182,9 @@ void CuArrayBase::CopyToVec(std::vector *dst) const { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { CuTimer tim; - CU_SAFE_CALL(cudaMemcpy(&dst->front(), Data(), this->dim_ * sizeof(T), cudaMemcpyDeviceToHost)); + CU_SAFE_CALL(cudaMemcpyAsync(&dst->front(), Data(), this->dim_ * sizeof(T), + cudaMemcpyDeviceToHost, cudaStreamPerThread)); + CU_SAFE_CALL(cudaStreamSynchronize(cudaStreamPerThread)); CuDevice::Instantiate().AccuProfile("CuArray::CopyToVecD2H", tim); } else #endif @@ -196,7 +201,9 @@ void CuArrayBase::CopyToHost(T *dst) const { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { CuTimer tim; - CU_SAFE_CALL(cudaMemcpy(dst, Data(), this->dim_ * sizeof(T), cudaMemcpyDeviceToHost)); + CU_SAFE_CALL(cudaMemcpyAsync(dst, Data(), this->dim_ * sizeof(T), + cudaMemcpyDeviceToHost, cudaStreamPerThread)); + CU_SAFE_CALL(cudaStreamSynchronize(cudaStreamPerThread)); CuDevice::Instantiate().AccuProfile("CuArray::CopyToVecD2H", tim); } else #endif @@ -212,7 +219,9 @@ void CuArrayBase::SetZero() { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { CuTimer tim; - CU_SAFE_CALL(cudaMemset(this->data_, 0, this->dim_ * sizeof(T))); + CU_SAFE_CALL(cudaMemsetAsync(this->data_, 0, this->dim_ * sizeof(T), + cudaStreamPerThread)); + CU_SAFE_CALL(cudaStreamSynchronize(cudaStreamPerThread)); CuDevice::Instantiate().AccuProfile("CuArray::SetZero", tim); } else #endif diff --git a/src/cudamatrix/cu-matrix.cc b/src/cudamatrix/cu-matrix.cc index 247c2236565..1f09ff278ce 100644 --- a/src/cudamatrix/cu-matrix.cc +++ b/src/cudamatrix/cu-matrix.cc @@ -321,8 +321,10 @@ void CuMatrixBase::CopyFromMat(const MatrixBase &src, MatrixIndexT dst_pitch = stride_*sizeof(Real); MatrixIndexT src_pitch = src.Stride()*sizeof(Real); MatrixIndexT width = src.NumCols()*sizeof(Real); - CU_SAFE_CALL(cudaMemcpy2D(data_, dst_pitch, src.Data(), src_pitch, - width, src.NumRows(), cudaMemcpyHostToDevice)); + CU_SAFE_CALL(cudaMemcpy2DAsync(data_, dst_pitch, src.Data(), src_pitch, + width, src.NumRows(), cudaMemcpyHostToDevice, + cudaStreamPerThread)); + cudaStreamSynchronize(cudaStreamPerThread); CuDevice::Instantiate().AccuProfile("CuMatrixBase::CopyFromMat(from CPU)", tim); } else { diff --git a/src/cudamatrix/cu-packed-matrix.cc b/src/cudamatrix/cu-packed-matrix.cc index d4dbdf12143..7581b043ae0 100644 --- a/src/cudamatrix/cu-packed-matrix.cc +++ b/src/cudamatrix/cu-packed-matrix.cc @@ -248,7 +248,9 @@ void CuPackedMatrix::SetZero() { size_t nr = static_cast(num_rows_), num_bytes = ((nr * (nr+1)) / 2) * sizeof(Real); - CU_SAFE_CALL(cudaMemset(reinterpret_cast(this->data_), 0, num_bytes)); + CU_SAFE_CALL(cudaMemsetAsync(reinterpret_cast(this->data_), 0, + num_bytes, cudaStreamPerThread)); + CU_SAFE_CALL(cudaStreamSynchronize(cudaStreamPerThread)); CuDevice::Instantiate().AccuProfile("CuPackedMatrix::SetZero", tim); } else #endif diff --git a/src/cudamatrix/cu-vector.cc b/src/cudamatrix/cu-vector.cc index 536e55d8a3b..7c968c6550d 100644 --- a/src/cudamatrix/cu-vector.cc +++ b/src/cudamatrix/cu-vector.cc @@ -1072,7 +1072,9 @@ void CuVectorBase::SetZero() { KALDI_ASSERT(dim_>=0); KALDI_ASSERT(data_!=NULL); CuTimer tim; - CU_SAFE_CALL(cudaMemset(data_, 0, dim_*sizeof(Real))); + CU_SAFE_CALL(cudaMemsetAsync(data_, 0, dim_*sizeof(Real), + cudaStreamPerThread)); + CU_SAFE_CALL(cudaStreamSynchronize(cudaStreamPerThread)); CuDevice::Instantiate().AccuProfile("CuVector::SetZero", tim); } else #endif