
Commit e40e720

JimmyZhang12 authored and jiemingz committed
disable overlap_param_gather_with_optimizer_step (NVIDIA#11102)
* disable overlap_param_gather_with_optimizer_step
  Signed-off-by: Jimmy Zhang <[email protected]>
* fix comment
  Signed-off-by: Jieming Zhang <[email protected]>
* Apply isort and black reformatting
  Signed-off-by: JimmyZhang12 <[email protected]>
* fix typo again
  Signed-off-by: Jieming Zhang <[email protected]>
* Apply isort and black reformatting
  Signed-off-by: JimmyZhang12 <[email protected]>

---------

Signed-off-by: Jimmy Zhang <[email protected]>
Signed-off-by: Jieming Zhang <[email protected]>
Signed-off-by: JimmyZhang12 <[email protected]>
Co-authored-by: Jimmy Zhang <[email protected]>
Co-authored-by: JimmyZhang12 <[email protected]>
1 parent 4b7707c commit e40e720

File tree

8 files changed: +11 −8 lines


nemo/collections/llm/recipes/gpt3_175b.py

+1 −1

@@ -229,7 +229,7 @@ def pretrain_performance_optimizations(recipe: run.Partial) -> run.Partial:
             tp_comm_overlap_cfg=userbuffers_bf16_h100_h12288_tp4_mbs1_seqlen2048,
             defer_embedding_wgrad_compute=True,
             wgrad_deferral_limit=50,
-            overlap_param_gather_with_optimizer_step=True,
+            overlap_param_gather_with_optimizer_step=False,  # Currently disabled due to an issue with checkpointing
             align_param_gather=True,
         )
     )
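
For reference, a minimal sketch (not part of this commit) of the callback config the recipes above end up building after this change. It uses nemo_run's run.Config and the callback's module path from the file list on this page; the tp_comm_overlap_cfg userbuffers object and the exact attachment point into the recipe are omitted or assumed:

    import nemo_run as run
    from nemo.lightning.pytorch.callbacks.megatron_comm_overlap import MegatronCommOverlapCallback

    # Flags mirror the diff above. Keep the overlap flag False until the
    # checkpointing issue referenced in this commit is resolved.
    comm_overlap = run.Config(
        MegatronCommOverlapCallback,
        defer_embedding_wgrad_compute=True,
        wgrad_deferral_limit=50,
        overlap_param_gather_with_optimizer_step=False,  # checkpointing issue
        align_param_gather=True,
    )
    recipe.trainer.callbacks.append(comm_overlap)  # hypothetical attachment point

The remaining recipe diffs below apply the same one-line flip, so the same sketch applies with each recipe's own userbuffers config and wgrad_deferral_limit.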

nemo/collections/llm/recipes/llama31_405b.py

+1 −1

@@ -231,7 +231,7 @@ def pretrain_performance_optimizations(recipe: run.Partial) -> run.Partial:
             tp_comm_overlap_cfg=userbuffers_bf16_h100_h16384_tp8_cp2_mbs1_seqlen8192,
             defer_embedding_wgrad_compute=True,
             wgrad_deferral_limit=50,
-            overlap_param_gather_with_optimizer_step=True,
+            overlap_param_gather_with_optimizer_step=False,  # Currently disabled due to an issue with checkpointing
             align_param_gather=True,
         )
     )

nemo/collections/llm/recipes/llama3_70b.py

+1 −1

@@ -232,7 +232,7 @@ def pretrain_performance_optimizations(recipe: run.Partial) -> run.Partial:
             tp_comm_overlap_cfg=userbuffers_bf16_h100_h8192_tp4_mbs1_seqlen8192,
             defer_embedding_wgrad_compute=True,
             wgrad_deferral_limit=22,
-            overlap_param_gather_with_optimizer_step=True,
+            overlap_param_gather_with_optimizer_step=False,  # Currently disabled due to an issue with checkpointing.
             align_param_gather=True,
         )
     )

nemo/collections/llm/recipes/mixtral_8x22b.py

+3 −1

@@ -226,7 +226,9 @@ def pretrain_performance_optimizations(recipe: run.Partial) -> run.Partial:
                 MegatronTokenDropCallback,
             ),
             run.Config(
-                MegatronCommOverlapCallback, overlap_param_gather_with_optimizer_step=True, align_param_gather=True
+                MegatronCommOverlapCallback,
+                overlap_param_gather_with_optimizer_step=False,  # Currently disabled due to an issue with checkpointing
+                align_param_gather=True,
             ),
         ]
     )

nemo/collections/llm/recipes/mixtral_8x7b.py

+1 −1

@@ -222,7 +222,7 @@ def pretrain_performance_optimizations(recipe: run.Partial) -> run.Partial:
             run.Config(MegatronTokenDropCallback),
             run.Config(
                 MegatronCommOverlapCallback,
-                overlap_param_gather_with_optimizer_step=True,
+                overlap_param_gather_with_optimizer_step=False,  # Currently disabled due to an issue with checkpointing.
                 align_param_gather=True,
             ),
         ]

nemo/collections/llm/recipes/nemotron4_22b.py

+1 −1

@@ -209,7 +209,7 @@ def pretrain_performance_optimizations(recipe: run.Partial) -> run.Partial:
             tp_comm_overlap=True,
             defer_embedding_wgrad_compute=True,
             wgrad_deferral_limit=22,
-            overlap_param_gather_with_optimizer_step=True,
+            overlap_param_gather_with_optimizer_step=False,  # Currently disabled due to an issue with checkpointing
             align_param_gather=True,
         )
     )

nemo/collections/llm/recipes/nemotron4_340b.py

+1 −1

@@ -212,7 +212,7 @@ def pretrain_performance_optimizations(recipe: run.Partial) -> run.Partial:
             tp_comm_overlap=True,
             defer_embedding_wgrad_compute=True,
             wgrad_deferral_limit=22,
-            overlap_param_gather_with_optimizer_step=True,
+            overlap_param_gather_with_optimizer_step=False,  # Currently disabled due to an issue with checkpointing
             align_param_gather=True,
         )
     )

nemo/lightning/pytorch/callbacks/megatron_comm_overlap.py

+2 −1

@@ -181,7 +181,8 @@ def _get_optimizer_overlap_cfgs(self, parallelism_cfg: ParallelismConfig) -> _Co
         comm_overlap_cfg.overlap_grad_reduce = True
         comm_overlap_cfg.overlap_param_gather = True
         if parallelism_cfg.pipeline_model_parallel_size > 1 and vp_size > 1:
-            comm_overlap_cfg.overlap_param_gather_with_optimizer_step = True
+            # Currently disabled due to an issue with checkpointing
+            # comm_overlap_cfg.overlap_param_gather_with_optimizer_step = True
             comm_overlap_cfg.align_param_gather = True

         comm_overlap_cfg = self._override_user_cfgs(comm_overlap_cfg)
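
Note the final context line: the computed defaults still pass through self._override_user_cfgs, so user-supplied values win over these defaults. A hedged sketch (constructor kwargs inferred from the recipe diffs above; wiring into a trainer is assumed, not shown in this commit) of how a user could still opt back in while accepting the checkpointing caveat:

    from nemo.lightning.pytorch.callbacks.megatron_comm_overlap import MegatronCommOverlapCallback

    # Hypothetical explicit opt-in; this overrides the now-commented-out default
    # via _override_user_cfgs. Use only if the checkpointing issue is acceptable.
    callback = MegatronCommOverlapCallback(
        overlap_param_gather_with_optimizer_step=True,  # explicit user override
        align_param_gather=True,
    )
    # trainer = nl.Trainer(..., callbacks=[callback])  # attachment point assumed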
