Fix bug where GPT always enabled distopt overlapped param sync
Signed-off-by: Tim Moon <[email protected]>
timmoon10 committed Feb 11, 2023
1 parent bfd371d commit 0afb078
Showing 2 changed files with 16 additions and 3 deletions.
@@ -225,8 +225,10 @@ def setup_optimization(
         if self.with_distributed_adam:
 
             # Enable overlapped param sync by default
-            if 'overlap_param_sync' not in optim_kwargs:
-                optim_kwargs['overlap_param_sync'] = True
+            # Note: This is a hacky workaround since the user-provided
+            # config is loaded in the base class function and is not
+            # available at this point.
+            optim_kwargs['default_kwargs'] = {'overlap_param_sync': True}
 
         return super().setup_optimization(optim_config=optim_config, optim_kwargs=optim_kwargs)
 
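The note added above is the key to the bug: setup_optimization() runs before the user-provided optimizer config is merged in the base class, so any value written directly into optim_kwargs here silently overrides the user's setting. Below is a minimal sketch of that ordering with made-up helper names (old_setup/new_setup are illustrative, not the NeMo API):

def old_setup(optim_kwargs, user_config):
    # Old behavior: the default is decided before the user config is visible,
    # so a user's `overlap_param_sync: False` never takes effect.
    if 'overlap_param_sync' not in optim_kwargs:
        optim_kwargs['overlap_param_sync'] = True
    return {**user_config, **optim_kwargs}  # explicit kwargs override the config

def new_setup(optim_kwargs, user_config):
    # New behavior: only a *default* is forwarded; it is applied last and
    # only if neither the config nor the explicit kwargs set the key.
    optim_kwargs['default_kwargs'] = {'overlap_param_sync': True}
    merged = {**user_config, **optim_kwargs}
    for key, val in merged.pop('default_kwargs', {}).items():
        merged.setdefault(key, val)
    return merged

user_config = {'overlap_param_sync': False}
assert old_setup({}, user_config)['overlap_param_sync'] is True   # bug: user setting ignored
assert new_setup({}, user_config)['overlap_param_sync'] is False  # fix: user setting respected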
13 changes: 12 additions & 1 deletion nemo/core/optim/distributed_adam.py
@@ -20,7 +20,16 @@
 # Wrapper class that supports main_grad buffer
 # Note: main_grad buffer is used for O2-style optimizations
 class MegatronDistributedFusedAdam(DistributedFusedAdam):
-    def __init__(self, *args, disable_distributed_parameters=False, **kwargs):
+    def __init__(self, *args, disable_distributed_parameters=False, default_kwargs={}, **kwargs):
+
+        # Set default args
+        # Note: Hacky workaround that allows individual models to
+        # modify default behavior.
+        for key, val in default_kwargs.items():
+            if key not in kwargs:
+                kwargs[key] = val
+
+        # Initialize process groups
         if 'process_group' not in kwargs and not parallel_state.is_unitialized():
             kwargs['process_group'] = parallel_state.get_data_parallel_group()
         if disable_distributed_parameters:
@@ -29,6 +38,8 @@ def __init__(self, *args, disable_distributed_parameters=False, **kwargs):
             self_groups = [torch.distributed.new_group(ranks=[i]) for i in range(world_size)]
             kwargs['distributed_process_group'] = self_groups[rank]
             kwargs['redundant_process_group'] = kwargs['process_group']
+
+        # Construct distributed optimizer
         super().__init__(*args, **kwargs)
 
     def _make_post_backward_hook(self, param, param_group_id, param_id):
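For reference, a self-contained sketch of the default_kwargs merge performed in __init__ above; the _Fake* classes are stand-ins for illustration only, not NeMo or Apex classes:

class _FakeBaseOptimizer:
    def __init__(self, params, overlap_param_sync=False):
        self.overlap_param_sync = overlap_param_sync

class _FakeMegatronWrapper(_FakeBaseOptimizer):
    def __init__(self, *args, default_kwargs=None, **kwargs):
        # Per-model defaults only fill in keys the caller has not set explicitly.
        for key, val in (default_kwargs or {}).items():
            if key not in kwargs:
                kwargs[key] = val
        super().__init__(*args, **kwargs)

# The model-level default enables overlap when the caller is silent ...
opt = _FakeMegatronWrapper([], default_kwargs={'overlap_param_sync': True})
assert opt.overlap_param_sync is True

# ... but an explicit setting (e.g. from the user's optimizer config) wins,
# which is exactly the behavior the old hard-coded kwarg broke.
opt = _FakeMegatronWrapper([], default_kwargs={'overlap_param_sync': True},
                           overlap_param_sync=False)
assert opt.overlap_param_sync is False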
