Update the logic to set cuda_device_max_connections
Signed-off-by: Guyue Huang <[email protected]>
guyueh1 committed Jan 15, 2025
1 parent 43d6e12 commit 9d5cb11
Showing 1 changed file with 12 additions and 18 deletions.
30 changes: 12 additions & 18 deletions nemo/lightning/run/plugins.py
@@ -317,6 +317,7 @@ class PerfEnvPlugin(run.Plugin):
     layernorm_sm_margin: int = 16
     enable_vboost: bool = False
     nccl_pp_comm_chunksize: Optional[int] = None
+    override_default_cuda_device_max_connections: bool = False
     num_cuda_device_max_connections: int = None
 
     def get_vboost_srun_cmd(self, nodes, job_dir):
@@ -344,24 +345,17 @@ def setup(self, task: run.Partial | run.Script, executor: run.Executor):
         """Enable the performance environment settings"""
 
         if task.trainer.strategy.__fn_or_cls__ == MegatronStrategy:
-            if torch.cuda.is_available():
-                major, _ = torch.cuda.get_device_capability()
-                if major > 9:
-                    if self.num_cuda_device_max_connections is not None:
-                        executor.env_vars["CUDA_DEVICE_MAX_CONNECTIONS"] = str(self.num_cuda_device_max_connections)
-                else:
-                    # When TP or CP size is larger than 1, need to use a single cuda device connection to enforce
-                    # the kernel queuing order of the host to GPU for their execution. This is needed for the optimal
-                    # overlap between communication and computation kernels.
-                    tp_size = task.trainer.strategy.tensor_model_parallel_size
-                    cp_size = task.trainer.strategy.context_parallel_size
-                    if tp_size > 1 or cp_size > 1:
-                        executor.env_vars["CUDA_DEVICE_MAX_CONNECTIONS"] = "1"
-            else:
-                if self.num_cuda_device_max_connections is not None:
-                    executor.env_vars["CUDA_DEVICE_MAX_CONNECTIONS"] = str(
-                        self.num_cuda_device_max_connections
-                    )
+            if self.override_default_cuda_device_max_connections:
+                if self.num_cuda_device_max_connections is not None:
+                    executor.env_vars["CUDA_DEVICE_MAX_CONNECTIONS"] = str(self.num_cuda_device_max_connections)
+            else:
+                # When TP or CP size is larger than 1, need to use a single cuda device connection to enforce
+                # the kernel queuing order of the host to GPU for their execution. This is needed for the optimal
+                # overlap between communication and computation kernels.
+                tp_size = task.trainer.strategy.tensor_model_parallel_size
+                cp_size = task.trainer.strategy.context_parallel_size
+                if tp_size > 1 or cp_size > 1:
+                    executor.env_vars["CUDA_DEVICE_MAX_CONNECTIONS"] = "1"
 
             # Set LayerNorm SM margin to support the overlap with LayerNorm kernel
             if self.enable_layernorm_sm_margin:
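For reference, a minimal sketch of how the new fields could be set when constructing the plugin. This assumes PerfEnvPlugin is a dataclass-style run.Plugin with the defaults shown in the diff; the value 8 and the launch step in the final comment are illustrative, not taken from this commit.

from nemo.lightning.run.plugins import PerfEnvPlugin

# Default behavior: the override flag is False, so the plugin keeps the
# TP/CP heuristic and sets CUDA_DEVICE_MAX_CONNECTIONS=1 only when the
# tensor- or context-parallel size is larger than 1.
perf_env = PerfEnvPlugin()

# Explicit override: skip the heuristic and request a specific number of
# CUDA device connections instead (the value 8 is purely illustrative).
perf_env_override = PerfEnvPlugin(
    override_default_cuda_device_max_connections=True,
    num_cuda_device_max_connections=8,
)

# The plugin instance is then passed to the NeMo-Run launcher together with
# the task and executor; the exact launch call depends on the surrounding recipe.

Because the flag defaults to False, existing recipes keep the previous TP/CP-based behavior unless they opt in to the override.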
