NVIDIA · MaximumEntropy · Mar 4, 2023 · Feb 17, 2023
diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml
@@ -97,6 +97,7 @@ model:
   bias_activation_fusion: True # Use a kernel that fuses the bias addition from weight matrices with the subsequent activation function.
   bias_dropout_add_fusion: True # Use a kernel that fuses the bias addition, dropout and residual connection addition.
   masked_softmax_fusion: True # Use a kernel that fuses the attention softmax with it's mask.
+  get_attention_mask_from_fusion: False # When using fused softmax it will create the attention mask so we won't copy it to the pipeline stages.
 
 
   # Miscellaneous

diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py
@@ -149,6 +149,8 @@ def __init__(self, cfg: DictConfig, trainer: Trainer):
             self._nsys_profile_start_step *= grad_accum_steps
             self._nsys_profile_end_step *= grad_accum_steps
 
+        self.get_attention_mask_from_fusion = self.cfg.get('get_attention_mask_from_fusion', False)
+
     def set_inference_config(self, inference_config):
         self._inference_config = inference_config
 
@@ -507,25 +509,43 @@ def allreduce_first_last_embeddings(self):
 
     def get_forward_output_and_loss_func(self, validation_step=False):
         def fwd_output_and_loss_func(dataloader_iter, model, checkpoint_activations_all_layers=None):
-            # GPT3 uses only causal mask, which doesn't need attention mask
+            batch = next(dataloader_iter)
             if parallel_state.get_pipeline_model_parallel_world_size() == 1:
-                batch = next(dataloader_iter)
                 for k in batch.keys():
-                    batch[k] = batch[k].cuda(non_blocking=True) if k not in ['attention_mask'] else None
+                    if self.get_attention_mask_from_fusion:
+                        batch[k] = batch[k].cuda(non_blocking=True) if k not in ['attention_mask'] else None
+                    else:
+                        batch[k] = batch[k].cuda(non_blocking=True)
             else:
                 if parallel_state.is_pipeline_first_stage():
-                    batch = next(dataloader_iter)
-                    # First pipeline stage needs only the tokens and position_ids
+                    # First pipeline stage needs tokens, position_ids, and attention_mask
                     for k in batch.keys():
-                        batch[k] = batch[k].cuda(non_blocking=True) if k in ['tokens', 'position_ids'] else None
+                        if self.get_attention_mask_from_fusion:
+                            batch[k] = batch[k].cuda(non_blocking=True) if k in ['tokens', 'position_ids'] else None
+                        else:
+                            batch[k] = (
+                                batch[k].cuda(non_blocking=True)
+                                if k in ['tokens', 'position_ids', 'attention_mask']
+                                else None
+                            )
                 elif parallel_state.is_pipeline_last_stage():
-                    batch = next(dataloader_iter)
-                    # Last pipeline stage needs only the labels and loss_mask
+                    # Last pipeline stage needs the labels, loss_mask, and attention_mask
                     for k in batch.keys():
-                        batch[k] = batch[k].cuda(non_blocking=True) if k in ['labels', 'loss_mask'] else None
+                        if self.get_attention_mask_from_fusion:
+                            batch[k] = batch[k].cuda(non_blocking=True) if k in ['labels', 'loss_mask'] else None
+                        else:
+                            batch[k] = (
+                                batch[k].cuda(non_blocking=True)
+                                if k in ['labels', 'loss_mask', 'attention_mask']
+                                else None
+                            )
                 else:
-                    # Intermediate pipeline stage doesn't need any inputs
-                    batch = {k: None for k in ['tokens', 'position_ids', 'attention_mask', 'labels']}
+                    # Intermediate pipeline stage only needs attention_mask
+                    if self.get_attention_mask_from_fusion:
+                        batch = {k: None for k in ['tokens', 'position_ids', 'attention_mask', 'labels']}
+                    else:
+                        for k in batch.keys():
+                            batch[k] = batch[k].cuda(non_blocking=True) if k in ['attention_mask'] else None
 
             output_tensor = model(
                 batch['tokens'],