diff --git a/src/cudamatrix/cu-kernels-ansi.h b/src/cudamatrix/cu-kernels-ansi.h
index a909a742885..164fc91f4a9 100644
--- a/src/cudamatrix/cu-kernels-ansi.h
+++ b/src/cudamatrix/cu-kernels-ansi.h
@@ -814,10 +814,10 @@ void cudaD_mat_copy_range_clamped(
 // the matrices are of size num_rows[i] x num_cols[i] and have a leading
 // dimension of ldo[i] for the output and ldi[i] for the input.
 void cudaF_batched_copy_mats(int32_t num_mats, int32_t *num_rows,
-    int32_t *num_cols, float **inputs, int32_t *ldi, float **outputs,
+    int32_t *num_cols, const float **inputs, int32_t *ldi, float **outputs,
     int32_t *ldo);
 void cudaD_batched_copy_mats(int32_t num_mats, int32_t *num_rows,
-    int32_t *num_cols, double **inputs, int32_t *ldi, double **outputs,
+    int32_t *num_cols, const double **inputs, int32_t *ldi, double **outputs,
     int32_t *ldo);
 
 // Launches a kernel that does nothing, explicitly using the legacy default stream;
diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu
index e22d9aa99dc..d8e3a1a27e6 100644
--- a/src/cudamatrix/cu-kernels.cu
+++ b/src/cudamatrix/cu-kernels.cu
@@ -3677,7 +3677,8 @@ void _cuda_mat_copy_range_clamped(
 
 template<typename Real>
 struct MatrixCopyDesc {
-  Real *input, *output;
+  const Real *input;
+  Real *output;
   int32_t ldi, ldo;
   int32_t num_rows, num_cols;
 };
@@ -3704,7 +3705,7 @@ void _cuda_batch_copy_mats(BatchedMatrixCopyDesc<Real> batch_desc) {
   MatrixCopyDesc<Real> desc = batch_desc.batch[bid];
   int32_t num_rows = desc.num_rows;
   int32_t num_cols = desc.num_cols;
-  Real *input = desc.input;
+  const Real *input = desc.input;
   Real *output = desc.output;
   int32_t ldi = desc.ldi;
   int32_t ldo = desc.ldo;
@@ -5530,7 +5531,7 @@ void cudaD_mat_copy_range_clamped(
 }
 
 void cudaF_batched_copy_mats(int32_t num_mats, int32_t *num_rows,
-    int32_t *num_cols, float **inputs, int32_t *ldi, float **outputs,
+    int32_t *num_cols, const float **inputs, int32_t *ldi, float **outputs,
     int32_t *ldo) {
 
   dim3 threads(32,32);
@@ -5595,7 +5596,7 @@ void cudaF_batched_copy_mats(int32_t num_mats, int32_t *num_rows,
 }
 
 void cudaD_batched_copy_mats(int32_t num_mats, int32_t *num_rows,
-    int32_t *num_cols, double **inputs, int32_t *ldi, double **outputs,
+    int32_t *num_cols, const double **inputs, int32_t *ldi, double **outputs,
     int32_t *ldo) {
 
   dim3 threads(32,32);
diff --git a/src/cudamatrix/cu-kernels.h b/src/cudamatrix/cu-kernels.h
index 528626b4ad1..b1112252d19 100644
--- a/src/cudamatrix/cu-kernels.h
+++ b/src/cudamatrix/cu-kernels.h
@@ -1580,14 +1580,14 @@ inline void cuda_mat_copy_range_clamped(
 }
 
 inline void cuda_batched_copy_mats(int32_t num_mats, int32_t *num_rows,
-    int32_t *num_cols, float **inputs, int32_t *ldi, float **outputs,
+    int32_t *num_cols, const float **inputs, int32_t *ldi, float **outputs,
     int32_t *ldo) {
   cudaF_batched_copy_mats(num_mats, num_rows, num_cols, inputs, ldi, outputs,
       ldo);
 }
 
 inline void cuda_batched_copy_mats(int32_t num_mats, int32_t *num_rows,
-    int32_t *num_cols, double **inputs, int32_t *ldi, double **outputs,
+    int32_t *num_cols, const double **inputs, int32_t *ldi, double **outputs,
     int32_t *ldo) {
   cudaD_batched_copy_mats(num_mats, num_rows, num_cols, inputs, ldi, outputs,
       ldo);
diff --git a/src/nnet3/nnet-batch-compute.cc b/src/nnet3/nnet-batch-compute.cc
index c485c94b412..9d71a021f05 100644
--- a/src/nnet3/nnet-batch-compute.cc
+++ b/src/nnet3/nnet-batch-compute.cc
@@ -401,13 +401,14 @@ void NnetBatchComputer::FormatInputs(
 
 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
-    std::vector<BaseFloat*> inputs(num_tasks), outputs(num_tasks);
+    std::vector<const BaseFloat*> inputs(num_tasks);
+    std::vector<BaseFloat*> outputs(num_tasks);
     std::vector<int32_t> ldi(num_tasks), ldo(num_tasks);
     std::vector<int32_t> num_rows(num_tasks), num_cols(num_tasks);
 
     // compute matrix descriptions for each copy
     for (int32 n = 0; n < num_tasks; n++) {
-      CuMatrix<BaseFloat> &input_mat = tasks[n]->input;
+      const CuMatrix<BaseFloat> &input_mat = tasks[n]->input;
       CuSubMatrix<BaseFloat> output_mat = input->RowRange(
           n * num_input_frames, num_input_frames);
@@ -421,11 +422,11 @@ void NnetBatchComputer::FormatInputs(
     }
 
     // execute batched copy
-    cuda_batched_copy_mats(num_tasks, &num_rows[0], &num_cols[0], &inputs[0], &ldi[0],
-        &outputs[0], &ldo[0]);
+    cuda_batched_copy_mats(num_tasks, &num_rows[0], &num_cols[0], &inputs[0],
+        &ldi[0], &outputs[0], &ldo[0]);
 
   } else
-#else
+#endif
   {
     for (int32 n = 0; n < num_tasks; n++) {
       CuSubMatrix<BaseFloat> input_part(*input,
@@ -434,7 +435,6 @@ void NnetBatchComputer::FormatInputs(
       input_part.CopyFromMat(tasks[n]->input);
     }
   }
-#endif
 
   if (GetVerboseLevel() >=2 ) {
     if (num_tasks < minibatch_size) {
@@ -455,13 +455,14 @@ void NnetBatchComputer::FormatInputs(
       // using the batched matrix copy routine for this.  This isn't
       // extremely efficient but the kernel takes a minimal amount of
      // time so making a batched vector copy is not worth the effort.
-      std::vector<BaseFloat*> inputs(num_tasks), outputs(num_tasks);
+      std::vector<const BaseFloat*> inputs(num_tasks);
+      std::vector<BaseFloat*> outputs(num_tasks);
       std::vector<int32_t> ldi(num_tasks), ldo(num_tasks);
       std::vector<int32_t> num_rows(num_tasks), num_cols(num_tasks);
 
       // compute source pointers for each input
       for (int32 n = 0; n < num_tasks; n++) {
-        CuVector<BaseFloat> &input_vec = tasks[n]->ivector;
+        const CuVector<BaseFloat> &input_vec = tasks[n]->ivector;
         CuSubVector<BaseFloat> output_vec = ivector->Row(n);
         // create matrix batch description arrays
         num_rows[n] = 1;
@@ -477,13 +478,12 @@ void NnetBatchComputer::FormatInputs(
           &outputs[0], &ldo[0]);
 
     } else
-#else
+#endif
     {
       for (int32 n = 0; n < num_tasks; n++) {
        ivector->Row(n).CopyFromVec(tasks[n]->ivector);
      }
    }
-#endif
 
    if (GetVerboseLevel() >= 2) {
      if (num_tasks < minibatch_size) {
@@ -512,7 +512,8 @@ void NnetBatchComputer::FormatOutputs(
 
 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
-    std::vector<BaseFloat*> inputs(num_tasks), outputs(num_tasks);
+    std::vector<const BaseFloat*> inputs(num_tasks);
+    std::vector<BaseFloat*> outputs(num_tasks);
     std::vector<int32_t> ldi(num_tasks), ldo(num_tasks);
     std::vector<int32_t> num_rows(num_tasks), num_cols(num_tasks);
 
@@ -545,7 +546,7 @@ void NnetBatchComputer::FormatOutputs(
         CuSubMatrix<BaseFloat> output_mat = task->output.RowRange(
             left_unused, used);
-        CuSubMatrix<BaseFloat> input_mat = output.RowRange(
+        const CuSubMatrix<BaseFloat> input_mat = output.RowRange(
             n * num_output_frames + left_unused, used);
 
         // create matrix batch description arrays
@@ -830,6 +831,7 @@ static void SplitInputToTasks(const NnetBatchComputerOptions &opts,
       opts.extra_right_context : opts.extra_right_context_final),
       num_tasks = tasks->size();
+
   for (int32 i = 0; i < num_tasks; i++) {
     NnetInferenceTask &task = (*tasks)[i];
     // begin_output_t and end_output_t are the subsampled frame indexes at
@@ -948,10 +950,47 @@ void NnetBatchComputer::SplitUtteranceIntoTasks(
   SplitInputToTasks(opts_, nnet_left_context_, nnet_right_context_,
                     input, tasks);
 
+
   if (ivector != NULL) {
     KALDI_ASSERT(online_ivectors == NULL);
-    for (size_t i = 0; i < tasks->size(); i++)
-      (*tasks)[i].ivector = *ivector;
+
+#if HAVE_CUDA == 1
+    if (CuDevice::Instantiate().Enabled()) {
+      int32_t num_tasks = tasks->size();
+
+      std::vector<const BaseFloat*> inputs(num_tasks);
+      std::vector<BaseFloat*> outputs(num_tasks);
+      std::vector<int32_t> ldi(num_tasks), ldo(num_tasks);
+      std::vector<int32_t> num_rows(num_tasks), num_cols(num_tasks);
+
+      int b=0;  // batch counter
+
+      for (size_t i = 0; i < tasks->size(); i++) {
+        CuVector<BaseFloat> &output_vec = (*tasks)[i].ivector;
+        const CuVector<BaseFloat> &input_vec = *ivector;
+
+        output_vec.Resize(input_vec.Dim(), kUndefined);
+
+        // create matrix batch description arrays
+        num_rows[b] = 1;
+        num_cols[b] = output_vec.Dim();
+        outputs[b] = output_vec.Data();
+        inputs[b] = input_vec.Data();
+        ldo[b] = 0;
+        ldi[b] = 0;
+        b++;  // increase batch count
+      }
+
+      // execute batched copy
+      cuda_batched_copy_mats(b, &num_rows[0], &num_cols[0], &inputs[0], &ldi[0],
+          &outputs[0], &ldo[0]);
+    } else
+#endif
+    {
+      for (size_t i = 0; i < tasks->size(); i++)
+        (*tasks)[i].ivector = *ivector;
+    }
+
   } else if (online_ivectors != NULL) {
     AddOnlineIvectorsToTasks(opts_, *online_ivectors,
                              online_ivector_period, tasks);
@@ -1018,6 +1057,49 @@ void MergeTaskOutput(
   KALDI_ASSERT(num_output_frames != 0 && output_dim != 0);
   int32 cur_output_frame = 0;
   output->Resize(num_output_frames, output_dim, kUndefined);
+
+#if HAVE_CUDA == 1
+  if (CuDevice::Instantiate().Enabled()) {
+
+    std::vector<const BaseFloat*> inputs(num_tasks);
+    std::vector<BaseFloat*> outputs(num_tasks);
+    std::vector<int32_t> ldi(num_tasks), ldo(num_tasks);
+    std::vector<int32_t> num_rows(num_tasks), num_cols(num_tasks);
+
+    int b=0;  // batch counter
+    for (int32 i = 0; i < num_tasks; i++) {
+      const NnetInferenceTask &task = tasks[i];
+      int32 skip = task.num_initial_unused_output_frames,
+          num_used = task.num_used_output_frames;
+      KALDI_ASSERT(cur_output_frame == task.first_used_output_frame_index);
+      if (task.output_to_cpu) {
+        output->RowRange(cur_output_frame, num_used).CopyFromMat(
+            task.output_cpu.RowRange(skip, num_used));
+      } else {
+        CuSubMatrix<BaseFloat> output_mat =
+            output->RowRange(cur_output_frame, num_used);
+        const CuSubMatrix<BaseFloat> input_mat =
+            task.output.RowRange(skip, num_used);
+
+        // create matrix batch description arrays
+        num_rows[b] = output_mat.NumRows();
+        num_cols[b] = output_mat.NumCols();
+        outputs[b] = output_mat.Data();
+        inputs[b] = input_mat.Data();
+        ldo[b] = output_mat.Stride();
+        ldi[b] = input_mat.Stride();
+        b++;  // increase batch count
+      }
+      cur_output_frame += num_used;
+    }
+
+    // execute batched copy
+    cuda_batched_copy_mats(b, &num_rows[0], &num_cols[0], &inputs[0], &ldi[0],
+        &outputs[0], &ldo[0]);
+
+  } else
+#endif
+  {
   for (int32 i = 0; i < num_tasks; i++) {
     const NnetInferenceTask &task = tasks[i];
     int32 skip = task.num_initial_unused_output_frames,
         num_used = task.num_used_output_frames;
     KALDI_ASSERT(cur_output_frame == task.first_used_output_frame_index);
     if (task.output_to_cpu) {
       output->RowRange(cur_output_frame, num_used).CopyFromMat(
           task.output_cpu.RowRange(skip, num_used));
     } else {
       output->RowRange(cur_output_frame, num_used).CopyFromMat(
           task.output.RowRange(skip, num_used));
@@ -1032,6 +1114,8 @@ void MergeTaskOutput(
     }
     cur_output_frame += num_used;
   }
+  }
+
   KALDI_ASSERT(cur_output_frame == num_output_frames);
 }