Manual garbage collection with an interval #6482

Merged
4 changes: 4 additions & 0 deletions examples/nlp/language_modeling/conf/megatron_gpt_config.yaml
@@ -216,3 +216,7 @@ model:
warmup_steps: 500
constant_steps: 50000
min_lr: 2e-5

gc_interval: 0
# Interval of host memory garbage collection. When it is zero, collection relies on the automatic garbage collector.
# If an integer value larger than zero is set, collection is done manually every `gc_interval` batch steps.
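To illustrate the pattern this option enables, here is a minimal standalone sketch of interval-based manual collection in a generic training loop. GC_INTERVAL, NUM_STEPS, and train_step are hypothetical stand-ins for illustration, not part of this PR:

import gc

GC_INTERVAL = 100  # hypothetical; 0 would keep the automatic collector enabled
NUM_STEPS = 1000   # hypothetical loop length

def train_step(step: int) -> None:
    """Stand-in for one batch of forward/backward/optimizer work."""
    pass

if GC_INTERVAL > 0:
    gc.disable()  # stop the automatic cyclic collector before training starts

for step in range(1, NUM_STEPS + 1):
    train_step(step)
    if GC_INTERVAL > 0 and step % GC_INTERVAL == 0:
        gc.collect()  # collect at a fixed, predictable batch interval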
@@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import gc
import os
import re
from typing import Any, Dict, Optional, Union
@@ -148,6 +149,13 @@ def __init__(self, cfg: DictConfig, trainer: Trainer, no_lm_init=True):
"default_on_epoch": False,
}

self.gc_interval = cfg.get('gc_interval', 0)
assert self.gc_interval >= 0, "gc_interval should be an integer value larger than or equal to 0."
# If gc_interval > 0, memory garbage collection is manually controlled.
# The automatic garbage collector should be disabled before training starts.
if self.gc_interval > 0:
    gc.disable()
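Worth noting: gc.disable() turns off only the cyclic garbage collector; CPython's reference counting still frees most objects immediately. A small sketch of the distinction (a hypothetical example, not from this PR):

import gc

gc.disable()

# Reference-counted objects are freed as soon as the last reference goes away.
big = [0] * 1_000_000
del big  # reclaimed immediately; no collector involvement

# Only reference cycles linger until an explicit collection.
a, b = [], []
a.append(b)
b.append(a)
del a, b             # the cycle is unreachable but not yet freed
print(gc.collect())  # frees the cycle; returns the number of unreachable objects found

gc.enable()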

def _enable_nvidia_optimizations(self):
"These optimizations are present in NVIDIA NGC PyTorch Containers"

@@ -351,6 +359,9 @@ def on_train_batch_end(self, outputs, dataloader_iter: Any, batch_idx: int, unus
# accumulated gradient updates.
grad_scaler.optimizer_update_skipped = None

if self.gc_interval > 0 and (self.trainer.global_step % self.gc_interval == 0):
    gc.collect()
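A quick way to see what a full collection costs at this point in the loop — a hypothetical micro-benchmark, not part of the change — is to time gc.collect() directly; the pause grows with the number of live tracked objects:

import gc
import time

# Keep many tracked objects alive so the collector has work to do.
live = [{"step": i} for i in range(500_000)]  # hypothetical allocation load

start = time.perf_counter()
gc.collect()  # full (generation-2) collection, same call as in the hook above
pause = time.perf_counter() - start
print(f"gc.collect() paused for {pause:.3f} s with {len(live)} live dicts")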

def setup_optimization(
self, optim_config: Optional[Union[DictConfig, Dict]] = None, optim_kwargs: Optional[Dict[str, Any]] = None,
):