Update the logic to set cuda_device_max_connections
Signed-off-by: Guyue Huang <[email protected]>
guyueh1 committed Jan 15, 2025
1 parent 43d6e12 commit 9d5cb11
Showing 1 changed file with 12 additions and 18 deletions.
30 changes: 12 additions & 18 deletions nemo/lightning/run/plugins.py
@@ -317,6 +317,7 @@ class PerfEnvPlugin(run.Plugin):
     layernorm_sm_margin: int = 16
     enable_vboost: bool = False
     nccl_pp_comm_chunksize: Optional[int] = None
+    override_default_cuda_device_max_connections: bool = False
     num_cuda_device_max_connections: int = None
 
     def get_vboost_srun_cmd(self, nodes, job_dir):
@@ -344,24 +345,17 @@ def setup(self, task: run.Partial | run.Script, executor: run.Executor):
         """Enable the performance environment settings"""
 
         if task.trainer.strategy.__fn_or_cls__ == MegatronStrategy:
-            if torch.cuda.is_available():
-                major, _ = torch.cuda.get_device_capability()
-                if major > 9:
-                    if self.num_cuda_device_max_connections is not None:
-                        executor.env_vars["CUDA_DEVICE_MAX_CONNECTIONS"] = str(self.num_cuda_device_max_connections)
-                else:
-                    # When TP or CP size is larger than 1, need to use a single cuda device connection to enforce
-                    # the kernel queuing order of the host to GPU for their execution. This is needed for the optimal
-                    # overlap between communication and computation kernels.
-                    tp_size = task.trainer.strategy.tensor_model_parallel_size
-                    cp_size = task.trainer.strategy.context_parallel_size
-                    if tp_size > 1 or cp_size > 1:
-                        executor.env_vars["CUDA_DEVICE_MAX_CONNECTIONS"] = "1"
-            else:
-                if self.num_cuda_device_max_connections is not None:
-                    executor.env_vars["CUDA_DEVICE_MAX_CONNECTIONS"] = str(
-                        self.num_cuda_device_max_connections
-                    )
+            if self.override_default_cuda_device_max_connections:
+                if self.num_cuda_device_max_connections is not None:
+                    executor.env_vars["CUDA_DEVICE_MAX_CONNECTIONS"] = str(self.num_cuda_device_max_connections)
+            else:
+                # When TP or CP size is larger than 1, need to use a single cuda device connection to enforce
+                # the kernel queuing order of the host to GPU for their execution. This is needed for the optimal
+                # overlap between communication and computation kernels.
+                tp_size = task.trainer.strategy.tensor_model_parallel_size
+                cp_size = task.trainer.strategy.context_parallel_size
+                if tp_size > 1 or cp_size > 1:
+                    executor.env_vars["CUDA_DEVICE_MAX_CONNECTIONS"] = "1"
 
             # Set LayerNorm SM margin to support the overlap with LayerNorm kernel
             if self.enable_layernorm_sm_margin:
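For reference, a minimal sketch of how the new fields could be set when constructing the plugin. This assumes PerfEnvPlugin is a dataclass-style run.Plugin with the defaults shown in the diff; the value 8 and the launch step in the final comment are illustrative, not taken from this commit.

from nemo.lightning.run.plugins import PerfEnvPlugin

# Default behavior: the override flag is False, so the plugin keeps the
# TP/CP heuristic and sets CUDA_DEVICE_MAX_CONNECTIONS=1 only when the
# tensor- or context-parallel size is larger than 1.
perf_env = PerfEnvPlugin()

# Explicit override: skip the heuristic and request a specific number of
# CUDA device connections instead (the value 8 is purely illustrative).
perf_env_override = PerfEnvPlugin(
    override_default_cuda_device_max_connections=True,
    num_cuda_device_max_connections=8,
)

# The plugin instance is then passed to the NeMo-Run launcher together with
# the task and executor; the exact launch call depends on the surrounding recipe.

Because the flag defaults to False, existing recipes keep the previous TP/CP-based behavior unless they opt in to the override.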
