Recipe changes for performance #11763
Status: Open. guyueh1 wants to merge 24 commits into NVIDIA:main from guyueh1:recipe_for_25.01.
Changes from 8 commits

Commits (24):
- f069a19  [Nemo2] allow setting CUDA_DEVICE_MAX_CONNECTIONS (guyueh1)
- 87f1d43  Add a tp2 ub config (guyueh1)
- 3bdda64  Recipe tuning for mixtral, nemotron4
- 43f45fa  Revert mixtral config change (guyueh1)
- 8420b22  Decide cuda device max connections based on torch.cuda.get_device_cap… (guyueh1)
- 633b903  Rename custom_cuda_device_max_connections to num_cuda_device_max_conn… (guyueh1)
- 88c16c3  Merge branch 'main' into recipe_for_25.01 (guyueh1)
- 5ca96db  Apply isort and black reformatting (guyueh1)
- 2b8114b  Remove explicit config of align_param_gather in mixtral recipe and us… (guyueh1)
- 93cb713  Merge branch 'recipe_for_25.01' of github.com:guyueh1/NeMo into recip… (guyueh1)
- 7c5530b  Revert "Remove explicit config of align_param_gather in mixtral recip… (guyueh1)
- e234588  Rename ub config; change proj to ring exchange for nemotron 340b (guyueh1)
- 43d6e12  Merge branch 'main' into recipe_for_25.01 (erhoo82)
- 9d5cb11  Update the logic to set cuda_device_max_connections (guyueh1)
- 0fd838e  Revert changes to PerfEnvPlugin (guyueh1)
- 441036c  Move setup of CUDA_DEVICE_MAX_CONNECTIONS to MegatronCommOverlapCallback (guyueh1)
- b18ac96  Apply isort and black reformatting (guyueh1)
- 1f2ff68  Add b200 tp overlap configs for gpt3 and llama3 models (guyueh1)
- 5bf3f74  Merge branch 'recipe_for_25.01' of github.com:guyueh1/NeMo into recip… (guyueh1)
- 6a218f1  Revert changes to nemotron recipe; will put those changes in performa… (guyueh1)
- c0d3777  Merge branch 'main' into recipe_for_25.01
- 544cd5a  Add two docstrings (guyueh1)
- 83d35d5  Merge branch 'main' into recipe_for_25.01 (erhoo82)
- 530719a  Fix os.environ.pop (guyueh1)
Diff of the `PerfEnvPlugin` changes:

```diff
@@ -19,6 +19,7 @@
 from typing import Callable, Optional

 import nemo_run as run
+import torch
 import yaml
 from lightning.pytorch import Callback
 from lightning.pytorch.loggers import WandbLogger
@@ -27,7 +28,6 @@
 from nemo.lightning.pytorch.callbacks import NsysCallback, PreemptionCallback
 from nemo.lightning.pytorch.strategies.megatron_strategy import MegatronStrategy
 from nemo.utils import logging
-
 from nemo.utils.import_utils import safe_import

 res_module, HAVE_RES = safe_import('nvidia_resiliency_ext.ptl_resiliency')
@@ -315,6 +315,7 @@ class PerfEnvPlugin(run.Plugin):
     layernorm_sm_margin: int = 16
     enable_vboost: bool = False
     nccl_pp_comm_chunksize: Optional[int] = None
+    num_cuda_device_max_connections: int = None

     def get_vboost_srun_cmd(self, nodes, job_dir):
         "Create the vboost `sudo nvidia-smi boost-slider --vboost 1` command"
@@ -341,11 +342,24 @@ def setup(self, task: run.Partial | run.Script, executor: run.Executor):
         """Enable the performance environment settings"""

         if task.trainer.strategy.__fn_or_cls__ == MegatronStrategy:
-            # Force program order kernel launch for TP, CP overlap
-            tp_size = task.trainer.strategy.tensor_model_parallel_size
-            cp_size = task.trainer.strategy.context_parallel_size
-            if tp_size > 1 or cp_size > 1:
-                executor.env_vars["CUDA_DEVICE_MAX_CONNECTIONS"] = "1"
+            if torch.cuda.is_available():
+                major, _ = torch.cuda.get_device_capability()
```
Review comment on the `torch.cuda.get_device_capability()` line: @erhoo82 This method won't work because it runs on the cluster frontend node, not after the Slurm allocation. We need to find another way.
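Later commits in this PR (0fd838e "Revert changes to PerfEnvPlugin" and 441036c "Move setup of CUDA_DEVICE_MAX_CONNECTIONS to MegatronCommOverlapCallback") address this by deferring the decision to code that runs inside the allocated job. The sketch below only illustrates that idea; the class name, constructor arguments, and the way the parallel sizes are passed in are assumptions, not NeMo's actual MegatronCommOverlapCallback implementation.

```python
# Illustrative sketch only: defer the CUDA_DEVICE_MAX_CONNECTIONS decision until the
# process runs on a compute node, where torch sees the real training GPUs.
import os
from typing import Optional

import torch
from lightning.pytorch import Callback, LightningModule, Trainer


class CudaMaxConnectionsCallback(Callback):
    """Hypothetical callback that applies the same capability-based rule as the diff above."""

    def __init__(self, tp_size: int, cp_size: int, user_override: Optional[int] = None):
        self.tp_size = tp_size
        self.cp_size = cp_size
        self.user_override = user_override

    def setup(self, trainer: Trainer, pl_module: LightningModule, stage: str) -> None:
        # Runs inside the Slurm allocation, so the capability query reflects the actual GPUs.
        # Note: the variable only takes effect if set before the CUDA context is created.
        major, _ = torch.cuda.get_device_capability()
        if major > 9:  # Blackwell and newer
            if self.user_override is not None:
                os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = str(self.user_override)
        elif self.tp_size > 1 or self.cp_size > 1:  # Hopper and older with TP/CP overlap
            os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "1"
```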
The hunk continues:

```diff
+                if major > 9:
+                    if self.num_cuda_device_max_connections is not None:
+                        executor.env_vars["CUDA_DEVICE_MAX_CONNECTIONS"] = str(self.num_cuda_device_max_connections)
+                else:
+                    # When TP or CP size is larger than 1, need to use a single cuda device connection to enforce
+                    # the kernel queuing order of the host to GPU for their execution. This is needed for the optimal
+                    # overlap between communication and computation kernels.
+                    tp_size = task.trainer.strategy.tensor_model_parallel_size
+                    cp_size = task.trainer.strategy.context_parallel_size
+                    if tp_size > 1 or cp_size > 1:
+                        executor.env_vars["CUDA_DEVICE_MAX_CONNECTIONS"] = "1"
+            else:
+                if self.num_cuda_device_max_connections is not None:
+                    executor.env_vars["CUDA_DEVICE_MAX_CONNECTIONS"] = str(
+                        self.num_cuda_device_max_connections
+                    )

         # Set LayerNorm SM margin to support the overlap with LayerNorm kernel
         if self.enable_layernorm_sm_margin:
```
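To summarize the new behavior in `setup()`: on GPUs with compute capability major version above 9 (Blackwell and newer), CUDA_DEVICE_MAX_CONNECTIONS is only exported when the user sets `num_cuda_device_max_connections`; on older GPUs it is forced to 1 whenever TP or CP is greater than 1; when no GPU is visible, the user override (if any) is used. The standalone sketch below restates that decision; the helper name and the way the TP/CP sizes are passed in are illustrative, not part of the PR.

```python
# Standalone restatement of the selection logic added to PerfEnvPlugin.setup().
# The function name and argument passing are illustrative only.
from typing import Optional

import torch


def resolve_cuda_device_max_connections(
    tp_size: int, cp_size: int, user_override: Optional[int] = None
) -> Optional[str]:
    """Return the value to export as CUDA_DEVICE_MAX_CONNECTIONS, or None to leave it unset."""
    if torch.cuda.is_available():
        major, _ = torch.cuda.get_device_capability()
        if major > 9:
            # Blackwell (compute capability 10.x) and newer: only honor an explicit user setting.
            return str(user_override) if user_override is not None else None
        # Hopper and older: a single connection enforces host-to-GPU kernel queuing order,
        # which the TP/CP communication-computation overlap relies on.
        if tp_size > 1 or cp_size > 1:
            return "1"
        return None
    # No GPU visible where this runs (e.g. a cluster frontend node): fall back to the override.
    return str(user_override) if user_override is not None else None


# Example: TP=2 on a Hopper node yields "1"; on a B200 node it yields the override or None.
print(resolve_cuda_device_max_connections(tp_size=2, cp_size=1, user_override=None))
```

The branch on `major > 9` encodes the PR's assumption that Blackwell-class GPUs no longer need the single-connection restriction for TP/CP overlap, so on those devices the variable is only set when explicitly requested.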
Review comment: Is this an overlap config for Hopper or Blackwell?
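Commit 1f2ff68 adds B200 TP-overlap configs alongside the existing Hopper ones, so the per-architecture distinction matters when a recipe picks a config. A hypothetical way to branch on the running GPU's generation is shown below; this is not NeMo's API, and the config arguments are opaque placeholders.

```python
# Hypothetical helper: choose a per-architecture TP-overlap config at runtime.
# The config arguments are opaque placeholders, not actual NeMo config objects.
import torch


def select_tp_overlap_cfg(hopper_cfg, blackwell_cfg):
    """Return blackwell_cfg on compute capability 10.x+ GPUs (e.g. B200), hopper_cfg otherwise."""
    major, _ = torch.cuda.get_device_capability()
    return blackwell_cfg if major > 9 else hopper_cfg
```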