NVIDIA · yidong72 · Jul 13, 2023 · Jul 12, 2023
diff --git a/nemo/collections/nlp/modules/common/text_generation_utils.py b/nemo/collections/nlp/modules/common/text_generation_utils.py
@@ -393,6 +393,7 @@ def synced_generate(
             context_length_tensor,
             tokens_to_generate,
             all_probs,
+            compute_attention_mask=compute_attention_mask,
             temperature=temperature,
         )
     else:
@@ -822,6 +823,7 @@ def tab_sample_sequence_batch(
     context_lengths,
     tokens_to_generate,
     all_probs=True,
+    compute_attention_mask=True,
     type_ids=None,
     temperature=None,
 ):
@@ -845,7 +847,7 @@ def tab_sample_sequence_batch(
     # initialize the batch
     with torch.no_grad():
         context_length = context_lengths.min().item()
-        inference_strategy.init_batch(context_tokens, context_length)
+        inference_strategy.init_batch(context_tokens, context_length, compute_attention_mask)
         context = context_tokens[:, :context_length]
         # the context may start in the middle of the row,
         # calculate the offset according to the position of '\n' or '<|endoftext|>'
@@ -879,7 +881,7 @@ def tab_sample_sequence_batch(
 
         while context_length < maxlen:
             batch, tensor_shape = inference_strategy.prepare_batch_at_step(
-                tokens, maxlen, micro_batch_size, counter, context_length
+                tokens, maxlen, micro_batch_size, counter, context_length, compute_attention_mask
             )
             output = inference_strategy.forward_step(batch, tensor_shape)