
Commit

Fixed a bug in the random att_context_size selection in the Conformer encoder.
Signed-off-by: Vahid <[email protected]>
VahidooX committed Jun 5, 2023
1 parent cbf3dd8 commit 29bda6a
Showing 3 changed files with 12 additions and 1 deletion.
@@ -8,6 +8,8 @@
# FastConformer-CTC's architecture config: NeMo/examples/asr/conf/fastconformer/fast-conformer_ctc_bpe.yaml
# FastConformer-Transducer's architecture config, along with the optimal batch size and precision: NeMo/examples/asr/conf/fastconformer/fast-conformer_transducer_bpe.yaml

# Note: if the training loss does not converge, you may increase the warm-up steps to 20K.

name: "FastConformer-Hybrid-Transducer-CTC-BPE-Streaming"

model:
@@ -7,6 +7,8 @@
# Cache-aware Conformer here: https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/asr/models.html#cache-aware-streaming-conformer
# FastConformer-CTC's architecture config: NeMo/examples/asr/conf/fastconformer/fast-conformer_ctc_bpe.yaml
# FastConformer-Transducer's architecture config, along with the optimal batch size and precision: NeMo/examples/asr/conf/fastconformer/fast-conformer_transducer_bpe.yaml

# Note: if the training loss does not converge, you may increase the warm-up steps to 20K.

name: "FastConformer-Hybrid-Transducer-CTC-Char-Streaming"

@@ -111,8 +113,15 @@ model:
# for att_context_style=regular, the right context is recommended to be a small number around 0 to 3, as stacking multiple layers may make the effective right context too large
# for att_context_style=chunked_limited, the left context needs to be divisible by the right context plus one
# look-ahead(secs) = att_context_size[1]*subsampling_factor*window_stride, example: 13*8*0.01=1.04s

# For adaptive lookahead, you may specify a list of context sizes. During training, a context size is sampled randomly per step with the distribution specified by att_context_probs.
# The first item in the list is the default used during test/validation/inference.
# An example of settings for multi-lookahead:
# att_context_size: [[70,13],[70,6],[70,1],[70,0]]
# att_context_probs: [0.25, 0.25, 0.25, 0.25]
att_context_size: [70, 13] # -1 means unlimited context
att_context_style: chunked_limited # regular or chunked_limited
att_context_probs: null

xscaling: true # scales up the input embeddings by sqrt(d_model)
pos_emb_max_len: 5000
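As a sanity check on the comments above, here is a small standalone Python sketch. The helpers lookahead_secs and check_chunked_limited are hypothetical illustrations, not NeMo APIs; the subsampling factor (8) and window stride (0.01) are taken from the example in the comment.

import math

# Hypothetical helper, not part of NeMo: computes the look-ahead implied by a
# chunked_limited attention context, per the formula in the config comments.
def lookahead_secs(att_context_size, subsampling_factor=8, window_stride=0.01):
    # att_context_size = [left, right]; only the right context adds look-ahead
    return att_context_size[1] * subsampling_factor * window_stride

# Hypothetical helper: for chunked_limited, the left context must be
# divisible by the right context plus one.
def check_chunked_limited(att_context_size):
    left, right = att_context_size
    assert left % (right + 1) == 0, f"{left} is not divisible by {right + 1}"

# The four multi-lookahead modes from the example above map to four latencies:
for ctx in [[70, 13], [70, 6], [70, 1], [70, 0]]:
    check_chunked_limited(ctx)
    print(ctx, f"{lookahead_secs(ctx):.2f}s")  # 1.04s, 0.48s, 0.08s, 0.00s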
2 changes: 1 addition & 1 deletion nemo/collections/asr/modules/conformer_encoder.py
@@ -502,7 +502,7 @@ def forward_internal(

# select a random att_context_size with the distribution specified by att_context_probs during training
# for non-training cases like test, validation, or inference, it uses the first mode in self.att_context_size
- if self.training and len(att_context_size_all) > 1:
+ if self.training and len(self.att_context_size_all) > 1:
cur_att_context_size = random.choices(self.att_context_size_all, weights=self.att_context_probs)[0]
else:
cur_att_context_size = self.att_context_size
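The one-line fix above adds the missing self. prefix: without it, att_context_size_all is an undefined name inside forward_internal, so training a multi-lookahead model would raise a NameError. Below is a minimal standalone sketch of the corrected selection logic; the EncoderSketch class is an illustration that mirrors the attribute names in the diff, not the actual NeMo ConformerEncoder.

import random

# Minimal sketch of the corrected selection logic; not the actual NeMo module.
class EncoderSketch:
    def __init__(self, att_context_size_all, att_context_probs, training=True):
        self.att_context_size_all = att_context_size_all  # e.g. [[70,13],[70,6],[70,1],[70,0]]
        self.att_context_probs = att_context_probs        # e.g. [0.25, 0.25, 0.25, 0.25]
        self.att_context_size = att_context_size_all[0]   # first mode is the default
        self.training = training

    def pick_context(self):
        # During training, sample one context size per forward pass with the
        # distribution given by att_context_probs; otherwise use the default mode.
        if self.training and len(self.att_context_size_all) > 1:
            return random.choices(self.att_context_size_all, weights=self.att_context_probs)[0]
        return self.att_context_size

enc = EncoderSketch([[70, 13], [70, 6], [70, 1], [70, 0]], [0.25, 0.25, 0.25, 0.25])
print(enc.pick_context())   # randomly one of the four context sizes
enc.training = False
print(enc.pick_context())   # always [70, 13], the default (first) mode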

