src/cudadecoder/cuda-decoder-common.h (7 changes: 6 additions & 1 deletion)
@@ -64,6 +64,11 @@
 // it has to be less than the number of 1D threads
 #define KALDI_CUDA_DECODER_HISTO_NBINS 255
 
+// Number of "heavy duty" process non emitting kernels
+// If more non emitting iterations are required, those will be done
+// in the one-CTA persistent kernel
+#define KALDI_CUDA_DECODER_N_NON_EMITTING_MAIN_ITERATIONS 2
+
 // Adaptive beam parameters
 // We will decrease the beam when we detect that we are generating too many
 // tokens
@@ -410,7 +415,7 @@ struct LaneCounters {
 // The histogram for max_active will be computed between min_histo_cost
 // and max_histo_cost. Set for each frame after emitting stage
 CostType min_histo_cost;
-CostType max_histo_cost;
+CostType max_histo_cost;
 CostType histo_bin_width;
 bool compute_max_active;
 // offsets used by concatenate_lanes_data_kernel
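
Background on these fields: the decoder histograms token costs between min_histo_cost and max_histo_cost, using KALDI_CUDA_DECODER_HISTO_NBINS bins of width histo_bin_width, and uses that histogram to pick a tighter cutoff whenever more than max_active tokens are alive. The sketch below only illustrates the binning arithmetic those fields imply; it is host-side illustration code, not the decoder's GPU implementation, and every name other than the struct fields above is hypothetical.

#include <algorithm>
#include <vector>

// Sketch only: linear bucketing of token costs between min_histo_cost and
// max_histo_cost, mirroring KALDI_CUDA_DECODER_HISTO_NBINS. The real decoder
// builds this histogram on the GPU after the emitting stage.
constexpr int kHistoNBins = 255;

int CostToBin(float cost, float min_histo_cost, float histo_bin_width) {
  int bin = static_cast<int>((cost - min_histo_cost) / histo_bin_width);
  return std::min(std::max(bin, 0), kHistoNBins - 1);  // clamp to a valid bin
}

// Walk the bins from best (lowest) cost upward and stop once max_active tokens
// are covered; the boundary reached is the cutoff used to tighten the beam.
float PickCutoffForMaxActive(const std::vector<float> &costs, float min_histo_cost,
                             float histo_bin_width, int max_active) {
  std::vector<int> histo(kHistoNBins, 0);
  for (float c : costs) ++histo[CostToBin(c, min_histo_cost, histo_bin_width)];
  int covered = 0;
  for (int b = 0; b < kHistoNBins; ++b) {
    covered += histo[b];
    if (covered >= max_active)
      return min_histo_cost + (b + 1) * histo_bin_width;
  }
  return min_histo_cost + kHistoNBins * histo_bin_width;  // everything fits
}
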
src/cudadecoder/cuda-decoder.cc (16 changes: 12 additions & 4 deletions)
@@ -299,6 +299,7 @@ CudaDecoder::~CudaDecoder() {
 // Stopping h2h tasks
 h2h_threads_running_ = false;
 n_h2h_main_task_todo_cv_.notify_all();
+for (std::thread &thread : cpu_dedicated_threads_) thread.join();
 cudaStreamDestroy(compute_st_);
 cudaStreamDestroy(copy_st_);

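The added join completes the usual shutdown sequence for the dedicated host threads: clear the running flag, wake every thread blocked on the condition variable, then join them before the streams are destroyed. A minimal, self-contained sketch of that flag / notify_all / join idiom with generic names (not the decoder's actual members):

#include <condition_variable>
#include <mutex>
#include <thread>
#include <vector>

// Sketch only: worker threads wait on a condition variable and exit once the
// owner clears the running flag; the owner must join them or ~thread() aborts.
class WorkerPool {
 public:
  explicit WorkerPool(int n) {
    for (int i = 0; i < n; ++i) workers_.emplace_back([this] { Loop(); });
  }
  ~WorkerPool() {
    {
      std::lock_guard<std::mutex> lk(m_);
      running_ = false;                        // tell workers to stop
    }
    cv_.notify_all();                          // wake everyone blocked in wait()
    for (std::thread &t : workers_) t.join();  // same role as the added join above
  }

 private:
  void Loop() {
    std::unique_lock<std::mutex> lk(m_);
    while (true) {
      cv_.wait(lk, [this] { return !running_ || has_work_; });
      if (!running_) break;
      has_work_ = false;
      // ... perform one unit of host-side work here ...
    }
  }
  std::vector<std::thread> workers_;
  std::mutex m_;
  std::condition_variable cv_;
  bool running_ = true;
  bool has_work_ = false;
};
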
@@ -595,7 +596,7 @@ void CudaDecoder::ExpandArcsEmitting() {
 *h_device_params_, *h_kernel_params_);
 }
 
-void CudaDecoder::ExpandArcsNonEmitting(bool *should_iterate) {
+void CudaDecoder::ExpandArcsNonEmitting() {
 // false is for non emitting
 ExpandArcsKernel<false>(KaldiCudaDecoderNumBlocks(nlanes_used_),
 KALDI_CUDA_DECODER_1D_BLOCK, compute_st_,
@@ -798,7 +799,8 @@ void CudaDecoder::AdvanceDecoding(
 ExpandArcsEmitting();
 // We'll loop until we have a small enough number of non-emitting arcs
 // in the token queue. We'll then break the loop
-for (int i = 0; i < 1; ++i) { // TODO const
+for (int i = 0; i < KALDI_CUDA_DECODER_N_NON_EMITTING_MAIN_ITERATIONS;
+     ++i) {
 // If one of the aux_q contains more than max_active_ tokens,
 // we'll reduce the beam to only keep max_active_ tokens
 ApplyMaxActiveAndReduceBeam(AUX_Q);
@@ -808,8 +810,10 @@
 // and do the preprocessing necessary for the next ExpandArcs
 PruneAndPreprocess();
 
-bool should_iterate;
-ExpandArcsNonEmitting(&should_iterate); // TODO remvoe should_iterate
+// "heavy duty" kernel for non-emitting. The long tail of small
+// non-emitting iterations will be done in
+// FinalizeProcessNonEmittingKernel
+ExpandArcsNonEmitting();
 }
 ApplyMaxActiveAndReduceBeam(AUX_Q);
 PruneAndPreprocess();
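
With the constant from cuda-decoder-common.h, the per-frame non-emitting stage is now: a fixed number of wide, multi-CTA passes (the loop above), then one more reduce/prune, and finally the one-CTA persistent kernel (FinalizeProcessNonEmittingKernel, per the comment in the diff) for the long tail of small iterations. The sketch below only illustrates that control-flow shape; the stub functions are placeholders, and the exact calls after the loop are inferred from the comments rather than shown in this diff.

#include <cstdio>

#define KALDI_CUDA_DECODER_N_NON_EMITTING_MAIN_ITERATIONS 2

// Sketch only: stand-ins for the decoder's kernel launches.
static void ApplyMaxActiveAndReduceBeamSketch() { std::puts("reduce beam if needed"); }
static void PruneAndPreprocessSketch() { std::puts("prune + preprocess"); }
static void ExpandArcsNonEmittingSketch() { std::puts("heavy-duty multi-CTA expand"); }
static void FinalizeProcessNonEmittingSketch() { std::puts("one-CTA persistent tail"); }

void NonEmittingStageSketch() {
  // A fixed number of wide passes consumes the bulk of the non-emitting arcs...
  for (int i = 0; i < KALDI_CUDA_DECODER_N_NON_EMITTING_MAIN_ITERATIONS; ++i) {
    ApplyMaxActiveAndReduceBeamSketch();
    PruneAndPreprocessSketch();
    ExpandArcsNonEmittingSketch();
  }
  // ...and whatever small tail remains is finished inside a single persistent
  // CTA, so the host stops paying one kernel launch per extra iteration.
  ApplyMaxActiveAndReduceBeamSketch();
  PruneAndPreprocessSketch();
  FinalizeProcessNonEmittingSketch();
}
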
@@ -1494,6 +1498,10 @@ void CudaDecoder::ConcurrentGetRawLatticeSingleChannel(const ChannelId ichannel,
 TokenId best_cost_idx;
 {
 std::lock_guard<std::mutex> channel_lk(channel_lock_[ichannel]);
+h_all_tokens_info_.shrink_to_fit();
+h_all_tokens_acoustic_cost_.shrink_to_fit();
+h_all_tokens_extra_prev_tokens_.shrink_to_fit();
+h_all_tokens_extra_prev_tokens_extra_and_acoustic_cost_.shrink_to_fit();
 best_cost_idx = h_all_argmin_cost_[ichannel].first;
 }
 KALDI_ASSERT(
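
The shrink_to_fit calls are a non-binding request for the host-side token vectors to release capacity they no longer need; taking channel_lock_[ichannel] first keeps any reallocation serialized with the other threads that touch these buffers. A small standalone illustration of the size-versus-capacity distinction (a generic vector, not the decoder's buffers):

#include <cstdio>
#include <mutex>
#include <vector>

int main() {
  std::vector<float> buf;
  std::mutex buf_mutex;

  buf.resize(1 << 20);  // grow: capacity is now at least 1M elements
  buf.clear();          // size drops to 0, capacity stays where it was
  std::printf("after clear:  size=%zu capacity=%zu\n", buf.size(), buf.capacity());

  {
    // As in the diff above, shrink while holding the lock that guards the buffer.
    std::lock_guard<std::mutex> lk(buf_mutex);
    buf.shrink_to_fit();  // non-binding request to give the unused storage back
  }
  std::printf("after shrink: size=%zu capacity=%zu\n", buf.size(), buf.capacity());
  return 0;
}
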
src/cudadecoder/cuda-decoder.h (4 changes: 1 addition & 3 deletions)
@@ -312,9 +312,7 @@ class CudaDecoder {
 // in PostProcessingMainQueue
 void ExpandArcsEmitting();
 // ExpandArcs, non-emitting stage. Must be called after PruneAndPreprocess.
-// if *should_iterate is true, we should do another iteration of the
-// PruneAndPreprocess/ExpandArcsNonEmitting pair
-void ExpandArcsNonEmitting(bool *should_iterate);
+void ExpandArcsNonEmitting();
 // If we have more than max_active_ tokens in the queue (either after an
 // expand, or at the end of the frame)
 // we will compute a new beam that will only keep a number of tokens as close