diff --git a/src/cudamatrix/cu-array-inl.h b/src/cudamatrix/cu-array-inl.h index ddae19b9a4e..23b20501d4c 100644 --- a/src/cudamatrix/cu-array-inl.h +++ b/src/cudamatrix/cu-array-inl.h @@ -139,8 +139,9 @@ void CuArray::CopyFromArray(const CuArrayBase &src) { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { CuTimer tim; - CU_SAFE_CALL(cudaMemcpy(this->data_, src.data_, this->dim_ * sizeof(T), - cudaMemcpyDeviceToDevice)); + CU_SAFE_CALL(cudaMemcpyAsync(this->data_, src.data_, this->dim_ * sizeof(T), + cudaMemcpyDeviceToDevice, + cudaStreamPerThread)); CuDevice::Instantiate().AccuProfile(__func__, tim); } else #endif @@ -158,8 +159,8 @@ void CuArrayBase::CopyFromArray(const CuArrayBase &src) { if (CuDevice::Instantiate().Enabled()) { CuTimer tim; CU_SAFE_CALL( - cudaMemcpy(this->data_, src.data_, dim_ * sizeof(T), - cudaMemcpyDeviceToDevice)); + cudaMemcpyAsync(this->data_, src.data_, dim_ * sizeof(T), + cudaMemcpyDeviceToDevice, cudaStreamPerThread)); CuDevice::Instantiate().AccuProfile(__func__, tim); } else #endif diff --git a/src/cudamatrix/cu-matrix.cc b/src/cudamatrix/cu-matrix.cc index beccd9dc4a5..247c2236565 100644 --- a/src/cudamatrix/cu-matrix.cc +++ b/src/cudamatrix/cu-matrix.cc @@ -229,8 +229,10 @@ void CuMatrixBase::CopyFromMat(const CuMatrixBase &M, MatrixIndexT dst_pitch = stride_ * sizeof(Real); MatrixIndexT src_pitch = M.Stride() * sizeof(Real); MatrixIndexT width = M.NumCols() * sizeof(Real); - CU_SAFE_CALL(cudaMemcpy2D(data_, dst_pitch, M.data_, src_pitch, - width, M.num_rows_, cudaMemcpyDeviceToDevice)); + CU_SAFE_CALL( + cudaMemcpy2DAsync(data_, dst_pitch, M.data_, src_pitch, + width, M.num_rows_, cudaMemcpyDeviceToDevice, + cudaStreamPerThread)); } else { if (trans == kNoTrans) { dim3 dimGrid, dimBlock; @@ -2286,14 +2288,15 @@ void CuMatrixBase::CopyRowsFromVec(const CuVectorBase &v) { if (v.Dim() == num_rows_*num_cols_) { if (stride_ == num_cols_) { const Real* v_data = v.Data(); - CU_SAFE_CALL(cudaMemcpy(data_, v_data, - sizeof(Real)*num_rows_*num_cols_, - cudaMemcpyDeviceToDevice)); + CU_SAFE_CALL( + cudaMemcpyAsync(data_, v_data, sizeof(Real)*num_rows_*num_cols_, + cudaMemcpyDeviceToDevice, cudaStreamPerThread)); } else { - CU_SAFE_CALL(cudaMemcpy2D(data_, stride_ * sizeof(Real), v.Data(), - num_cols_*sizeof(Real), num_cols_*sizeof(Real), - num_rows_, - cudaMemcpyDeviceToDevice)); + CU_SAFE_CALL( + cudaMemcpy2DAsync(data_, stride_ * sizeof(Real), v.Data(), + num_cols_*sizeof(Real), num_cols_*sizeof(Real), + num_rows_, cudaMemcpyDeviceToDevice, + cudaStreamPerThread)); } } else if (v.Dim() == num_cols_) { dim3 dimGrid, dimBlock; diff --git a/src/cudamatrix/cu-packed-matrix.cc b/src/cudamatrix/cu-packed-matrix.cc index 64f8afe0616..d4dbdf12143 100644 --- a/src/cudamatrix/cu-packed-matrix.cc +++ b/src/cudamatrix/cu-packed-matrix.cc @@ -143,8 +143,9 @@ void CuPackedMatrix::CopyFromPacked(const CuPackedMatrix &src) { size_t nr = static_cast(num_rows_), num_bytes = ((nr * (nr+1)) / 2) * sizeof(Real); - CU_SAFE_CALL(cudaMemcpy(data_, src.data_, num_bytes, - cudaMemcpyDeviceToDevice)); + CU_SAFE_CALL( + cudaMemcpyAsync(data_, src.data_, num_bytes, cudaMemcpyDeviceToDevice, + cudaStreamPerThread)); CuDevice::Instantiate().AccuProfile("CuPackedMatrix::CopyFromPacked1", tim); } else diff --git a/src/cudamatrix/cu-value.h b/src/cudamatrix/cu-value.h index b9b3035ccbd..cab0a3235d7 100644 --- a/src/cudamatrix/cu-value.h +++ b/src/cudamatrix/cu-value.h @@ -39,7 +39,9 @@ class CuValue { inline CuValue operator = (const CuValue &other) { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { - CU_SAFE_CALL(cudaMemcpy(data_, other.data_, sizeof(Real), cudaMemcpyDeviceToDevice)); + CU_SAFE_CALL( + cudaMemcpyAsync(data_, other.data_, sizeof(Real), + cudaMemcpyDeviceToDevice, cudaStreamPerThread)); return *this; } else #endif diff --git a/src/cudamatrix/cu-vector.cc b/src/cudamatrix/cu-vector.cc index dcca5a76cde..536e55d8a3b 100644 --- a/src/cudamatrix/cu-vector.cc +++ b/src/cudamatrix/cu-vector.cc @@ -167,14 +167,16 @@ void CuVectorBase::CopyRowsFromMat(const CuMatrixBase &mat) { if (dim_ == 0) return; CuTimer tim; if (mat.Stride() == mat.NumCols() && mat.NumRows() != 0) { - CU_SAFE_CALL(cudaMemcpy(data_, mat.Data(), sizeof(Real)*dim_, - cudaMemcpyDeviceToDevice)); + CU_SAFE_CALL( + cudaMemcpyAsync(data_, mat.Data(), sizeof(Real)*dim_, + cudaMemcpyDeviceToDevice, cudaStreamPerThread)); } else { Real* vec_data = data_; for (MatrixIndexT r = 0; r < mat.NumRows(); r++) { - CU_SAFE_CALL(cudaMemcpy(vec_data, mat.RowData(r), - sizeof(Real) * mat.NumCols(), - cudaMemcpyDeviceToDevice)); + CU_SAFE_CALL(cudaMemcpyAsync(vec_data, mat.RowData(r), + sizeof(Real) * mat.NumCols(), + cudaMemcpyDeviceToDevice, + cudaStreamPerThread)); vec_data += mat.NumCols(); } } @@ -1049,7 +1051,9 @@ void CuVectorBase::CopyFromVec(const CuVectorBase &src) { if (CuDevice::Instantiate().Enabled()) { if (dim_ == 0) return; CuTimer tim; - CU_SAFE_CALL(cudaMemcpy(data_, src.data_, src.dim_ * sizeof(Real), cudaMemcpyDeviceToDevice)); + CU_SAFE_CALL( + cudaMemcpyAsync(data_, src.data_, src.dim_ * sizeof(Real), + cudaMemcpyDeviceToDevice, cudaStreamPerThread)); CuDevice::Instantiate().AccuProfile(__func__, tim); } else #endif