NVIDIA · markelsanz14 · Apr 13, 2023 · Mar 29, 2023 · Mar 29, 2023 · Mar 31, 2023
diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py
@@ -159,6 +159,15 @@ def __init__(self, cfg: DictConfig, trainer: Trainer):
 
         self.get_attention_mask_from_fusion = self.cfg.get('get_attention_mask_from_fusion', True)
 
+    @property
+    def model(self):
+        """Return the underlying module(s), replacing each Float16Module by its ``.module``."""
+        wrapped = self._model
+        if isinstance(wrapped, list):
+            return [m.module if isinstance(m, Float16Module) else m for m in wrapped]
+        # Single module: strip the Float16Module wrapper when present.
+        return wrapped.module if isinstance(wrapped, Float16Module) else wrapped
+
     def set_inference_config(self, inference_config):
         self._inference_config = inference_config
 
@@ -1071,3 +1080,70 @@ def on_train_batch_end(self, outputs, dataloader_iter: Any, batch_idx: int, unus
                     # Reset the optimizer update skipped to `None` - this is to prevent scheduler no-ops during
                     # accumulated gradient updates.
                     grad_scaler.optimizer_update_skipped = None
+
+    def _reset_activation_checkpointing_args(self):
+        """Disable activation checkpointing on the config and encoder(s), saving the old values.
+
+        Must always be called before _restore_activation_checkpointing_args, which reads them.
+        """
+        # Store values to restore them later.
+        self.last_activations_checkpoint_granularity = self.cfg.activations_checkpoint_granularity
+        self.last_activations_checkpoint_method = self.cfg.activations_checkpoint_method
+        self.last_activations_checkpoint_num_layers = self.cfg.activations_checkpoint_num_layers
+        self.last_activations_checkpoint_layers_per_pipeline = self.cfg.activations_checkpoint_layers_per_pipeline
+        # Reset config values. Needed for calling generate.
+        self.cfg.activations_checkpoint_granularity = None
+        self.cfg.activations_checkpoint_method = None
+        self.cfg.activations_checkpoint_num_layers = None
+        self.cfg.activations_checkpoint_layers_per_pipeline = None
+        # Reset model parameters; self.model may be a single module or a list of modules.
+        for module in (self.model if isinstance(self.model, list) else [self.model]):
+            encoder = module.language_model.encoder
+            encoder.activations_checkpoint_granularity = None
+            encoder.activations_checkpoint_method = None
+            encoder.activations_checkpoint_num_layers = None
+            encoder.activations_checkpoint_layers_per_pipeline = None
+
+    def _restore_activation_checkpointing_args(self):
+        """Restore the activation checkpointing settings saved by _reset_activation_checkpointing_args.
+
+        Must never be called before _reset_activation_checkpointing_args: the
+        ``self.last_activations_checkpoint_*`` attributes read here are only set by that method.
+        """
+        # Restore config values.
+        self.cfg.activations_checkpoint_granularity = self.last_activations_checkpoint_granularity
+        self.cfg.activations_checkpoint_method = self.last_activations_checkpoint_method
+        self.cfg.activations_checkpoint_num_layers = self.last_activations_checkpoint_num_layers
+        self.cfg.activations_checkpoint_layers_per_pipeline = self.last_activations_checkpoint_layers_per_pipeline
+        # Restore model parameters; self.model may be a single module or a list of modules.
+        for module in (self.model if isinstance(self.model, list) else [self.model]):
+            encoder = module.language_model.encoder
+            encoder.activations_checkpoint_granularity = self.last_activations_checkpoint_granularity
+            encoder.activations_checkpoint_method = self.last_activations_checkpoint_method
+            encoder.activations_checkpoint_num_layers = self.last_activations_checkpoint_num_layers
+            encoder.activations_checkpoint_layers_per_pipeline = self.last_activations_checkpoint_layers_per_pipeline
+
+    def _reset_sequence_parallelism_args(self):
+        """Disable sequence parallelism on the config and encoder(s), saving the old value.
+
+        Must always be called before _restore_sequence_parallelism_args, which reads the saved value.
+        """
+        # Store the value to restore it later.
+        self.last_sequence_parallel = self.cfg.sequence_parallel
+
+        # Reset the config value. Needed for calling generate.
+        self.cfg.sequence_parallel = None
+        # Reset model parameters; self.model may be a single module or a list of modules.
+        for module in (self.model if isinstance(self.model, list) else [self.model]):
+            module.language_model.encoder.sequence_parallel = None
+
+    def _restore_sequence_parallelism_args(self):
+        """Restore the sequence parallelism setting saved by _reset_sequence_parallelism_args.
+
+        Must never be called before _reset_sequence_parallelism_args, which sets ``self.last_sequence_parallel``.
+        """
+        # Restore the config value.
+        self.cfg.sequence_parallel = self.last_sequence_parallel
+        # Restore model parameters; self.model may be a single module or a list of modules.
+        for module in (self.model if isinstance(self.model, list) else [self.model]):
+            module.language_model.encoder.sequence_parallel = self.last_sequence_parallel