src/cudadecoder/cuda-decoder-common.h (7 changes: 6 additions & 1 deletion)
@@ -64,6 +64,11 @@
 // it has to be less than the number of 1D threads
 #define KALDI_CUDA_DECODER_HISTO_NBINS 255
 
+// Number of "heavy duty" process non emitting kernels
+// If more non emitting iterations are required, those will be done
+// in the one-CTA persistent kernel
+#define KALDI_CUDA_DECODER_N_NON_EMITTING_MAIN_ITERATIONS 2
+
 // Adaptive beam parameters
 // We will decrease the beam when we detect that we are generating too many
 // tokens
@@ -410,7 +415,7 @@ struct LaneCounters {
 // The histogram for max_active will be computed between min_histo_cost
 // and max_histo_cost. Set for each frame after emitting stage
 CostType min_histo_cost;
-CostType max_histo_cost;
+CostType max_histo_cost;
 CostType histo_bin_width;
 bool compute_max_active;
 // offsets used by concatenate_lanes_data_kernel
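
Background on these fields: the decoder histograms token costs between min_histo_cost and max_histo_cost, using KALDI_CUDA_DECODER_HISTO_NBINS bins of width histo_bin_width, and uses that histogram to pick a tighter cutoff whenever more than max_active tokens are alive. The sketch below only illustrates the binning arithmetic those fields imply; it is host-side illustration code, not the decoder's GPU implementation, and every name other than the struct fields above is hypothetical.

#include <algorithm>
#include <vector>

// Sketch only: linear bucketing of token costs between min_histo_cost and
// max_histo_cost, mirroring KALDI_CUDA_DECODER_HISTO_NBINS. The real decoder
// builds this histogram on the GPU after the emitting stage.
constexpr int kHistoNBins = 255;

int CostToBin(float cost, float min_histo_cost, float histo_bin_width) {
  int bin = static_cast<int>((cost - min_histo_cost) / histo_bin_width);
  return std::min(std::max(bin, 0), kHistoNBins - 1);  // clamp to a valid bin
}

// Walk the bins from best (lowest) cost upward and stop once max_active tokens
// are covered; the boundary reached is the cutoff used to tighten the beam.
float PickCutoffForMaxActive(const std::vector<float> &costs, float min_histo_cost,
                             float histo_bin_width, int max_active) {
  std::vector<int> histo(kHistoNBins, 0);
  for (float c : costs) ++histo[CostToBin(c, min_histo_cost, histo_bin_width)];
  int covered = 0;
  for (int b = 0; b < kHistoNBins; ++b) {
    covered += histo[b];
    if (covered >= max_active)
      return min_histo_cost + (b + 1) * histo_bin_width;
  }
  return min_histo_cost + kHistoNBins * histo_bin_width;  // everything fits
}
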
src/cudadecoder/cuda-decoder.cc (16 changes: 12 additions & 4 deletions)
@@ -299,6 +299,7 @@ CudaDecoder::~CudaDecoder() {
 // Stopping h2h tasks
 h2h_threads_running_ = false;
 n_h2h_main_task_todo_cv_.notify_all();
+for (std::thread &thread : cpu_dedicated_threads_) thread.join();
 cudaStreamDestroy(compute_st_);
 cudaStreamDestroy(copy_st_);

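The added join completes the usual shutdown sequence for the dedicated host threads: clear the running flag, wake every thread blocked on the condition variable, then join them before the streams are destroyed. A minimal, self-contained sketch of that flag / notify_all / join idiom with generic names (not the decoder's actual members):

#include <condition_variable>
#include <mutex>
#include <thread>
#include <vector>

// Sketch only: worker threads wait on a condition variable and exit once the
// owner clears the running flag; the owner must join them or ~thread() aborts.
class WorkerPool {
 public:
  explicit WorkerPool(int n) {
    for (int i = 0; i < n; ++i) workers_.emplace_back([this] { Loop(); });
  }
  ~WorkerPool() {
    {
      std::lock_guard<std::mutex> lk(m_);
      running_ = false;                        // tell workers to stop
    }
    cv_.notify_all();                          // wake everyone blocked in wait()
    for (std::thread &t : workers_) t.join();  // same role as the added join above
  }

 private:
  void Loop() {
    std::unique_lock<std::mutex> lk(m_);
    while (true) {
      cv_.wait(lk, [this] { return !running_ || has_work_; });
      if (!running_) break;
      has_work_ = false;
      // ... perform one unit of host-side work here ...
    }
  }
  std::vector<std::thread> workers_;
  std::mutex m_;
  std::condition_variable cv_;
  bool running_ = true;
  bool has_work_ = false;
};
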
@@ -595,7 +596,7 @@ void CudaDecoder::ExpandArcsEmitting() {
 *h_device_params_, *h_kernel_params_);
 }
 
-void CudaDecoder::ExpandArcsNonEmitting(bool *should_iterate) {
+void CudaDecoder::ExpandArcsNonEmitting() {
 // false is for non emitting
 ExpandArcsKernel<false>(KaldiCudaDecoderNumBlocks(nlanes_used_),
 KALDI_CUDA_DECODER_1D_BLOCK, compute_st_,
@@ -798,7 +799,8 @@ void CudaDecoder::AdvanceDecoding(
 ExpandArcsEmitting();
 // We'll loop until we have a small enough number of non-emitting arcs
 // in the token queue. We'll then break the loop
-for (int i = 0; i < 1; ++i) { // TODO const
+for (int i = 0; i < KALDI_CUDA_DECODER_N_NON_EMITTING_MAIN_ITERATIONS;
+     ++i) {
 // If one of the aux_q contains more than max_active_ tokens,
 // we'll reduce the beam to only keep max_active_ tokens
 ApplyMaxActiveAndReduceBeam(AUX_Q);
@@ -808,8 +810,10 @@
 // and do the preprocessing necessary for the next ExpandArcs
 PruneAndPreprocess();
 
-bool should_iterate;
-ExpandArcsNonEmitting(&should_iterate); // TODO remvoe should_iterate
+// "heavy duty" kernel for non-emitting. The long tail of small
+// non-emitting iterations will be done in
+// FinalizeProcessNonEmittingKernel
+ExpandArcsNonEmitting();
 }
 ApplyMaxActiveAndReduceBeam(AUX_Q);
 PruneAndPreprocess();
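
With the constant from cuda-decoder-common.h, the per-frame non-emitting stage is now: a fixed number of wide, multi-CTA passes (the loop above), then one more reduce/prune, and finally the one-CTA persistent kernel (FinalizeProcessNonEmittingKernel, per the comment in the diff) for the long tail of small iterations. The sketch below only illustrates that control-flow shape; the stub functions are placeholders, and the exact calls after the loop are inferred from the comments rather than shown in this diff.

#include <cstdio>

#define KALDI_CUDA_DECODER_N_NON_EMITTING_MAIN_ITERATIONS 2

// Sketch only: stand-ins for the decoder's kernel launches.
static void ApplyMaxActiveAndReduceBeamSketch() { std::puts("reduce beam if needed"); }
static void PruneAndPreprocessSketch() { std::puts("prune + preprocess"); }
static void ExpandArcsNonEmittingSketch() { std::puts("heavy-duty multi-CTA expand"); }
static void FinalizeProcessNonEmittingSketch() { std::puts("one-CTA persistent tail"); }

void NonEmittingStageSketch() {
  // A fixed number of wide passes consumes the bulk of the non-emitting arcs...
  for (int i = 0; i < KALDI_CUDA_DECODER_N_NON_EMITTING_MAIN_ITERATIONS; ++i) {
    ApplyMaxActiveAndReduceBeamSketch();
    PruneAndPreprocessSketch();
    ExpandArcsNonEmittingSketch();
  }
  // ...and whatever small tail remains is finished inside a single persistent
  // CTA, so the host stops paying one kernel launch per extra iteration.
  ApplyMaxActiveAndReduceBeamSketch();
  PruneAndPreprocessSketch();
  FinalizeProcessNonEmittingSketch();
}
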
@@ -1494,6 +1498,10 @@ void CudaDecoder::ConcurrentGetRawLatticeSingleChannel(const ChannelId ichannel,
 TokenId best_cost_idx;
 {
 std::lock_guard<std::mutex> channel_lk(channel_lock_[ichannel]);
+h_all_tokens_info_.shrink_to_fit();
+h_all_tokens_acoustic_cost_.shrink_to_fit();
+h_all_tokens_extra_prev_tokens_.shrink_to_fit();
+h_all_tokens_extra_prev_tokens_extra_and_acoustic_cost_.shrink_to_fit();
 best_cost_idx = h_all_argmin_cost_[ichannel].first;
 }
 KALDI_ASSERT(
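
The shrink_to_fit calls are a non-binding request for the host-side token vectors to release capacity they no longer need; taking channel_lock_[ichannel] first keeps any reallocation serialized with the other threads that touch these buffers. A small standalone illustration of the size-versus-capacity distinction (a generic vector, not the decoder's buffers):

#include <cstdio>
#include <mutex>
#include <vector>

int main() {
  std::vector<float> buf;
  std::mutex buf_mutex;

  buf.resize(1 << 20);  // grow: capacity is now at least 1M elements
  buf.clear();          // size drops to 0, capacity stays where it was
  std::printf("after clear:  size=%zu capacity=%zu\n", buf.size(), buf.capacity());

  {
    // As in the diff above, shrink while holding the lock that guards the buffer.
    std::lock_guard<std::mutex> lk(buf_mutex);
    buf.shrink_to_fit();  // non-binding request to give the unused storage back
  }
  std::printf("after shrink: size=%zu capacity=%zu\n", buf.size(), buf.capacity());
  return 0;
}
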
src/cudadecoder/cuda-decoder.h (4 changes: 1 addition & 3 deletions)
@@ -312,9 +312,7 @@ class CudaDecoder {
 // in PostProcessingMainQueue
 void ExpandArcsEmitting();
 // ExpandArcs, non-emitting stage. Must be called after PruneAndPreprocess.
-// if *should_iterate is true, we should do another iteration of the
-// PruneAndPreprocess/ExpandArcsNonEmitting pair
-void ExpandArcsNonEmitting(bool *should_iterate);
+void ExpandArcsNonEmitting();
 // If we have more than max_active_ tokens in the queue (either after an
 // expand, or at the end of the frame)
 // we will compute a new beam that will only keep a number of tokens as close