NVIDIA-NeMo · parthchadha · Jul 11, 2025 · Jul 9, 2025 · Jul 10, 2025 · Jul 10, 2025
@@ -1329,11 +1329,11 @@ def __init__(
         )
 
         # It's necessary to set env_vars here to ensure that vllm non-leader workers also have these env_vars
-        # Disable NCCL SHM if training and generation are not co-located: https://github.com/NVIDIA-NeMo/RL/issues/564
+        # Explicitly set NCCL_CUMEM_ENABLE to 1 to avoid the P2P initialization error for PyNCCLCommunicator.
+        # See https://github.com/NVIDIA-NeMo/RL/issues/564 for more details.
         env_vars = {}
         if not self.cfg["colocated"]["enabled"]:
-            env_vars["NCCL_SHM_DISABLE"] = "1"
-            env_vars["NCCL_P2P_DISABLE"] = "1"
+            os.environ["NCCL_CUMEM_ENABLE"] = "1"
 
         # Check if we need parallelism-aware worker group creation
         if self.model_parallel_size > 1:

@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import os
 from typing import Any
 
 import torch
@@ -36,6 +37,10 @@ def init_collective(
         local_rank = torch.distributed.get_rank()
         rank = rank_prefix + local_rank + 1  # 1 is the head node of the train cluster
 
+        # Temporary fix for vllm==0.9.0 which overrides the NCCL_CUMEM_ENABLE to 0 and causes
+        # https://github.com/NVIDIA-NeMo/RL/issues/564. This can be removed after it is upgraded to vllm>=0.9.1rc1.
+        os.environ["NCCL_CUMEM_ENABLE"] = "1"
+
         pg = StatelessProcessGroup.create(
             host=ip, port=port, rank=rank, world_size=world_size
         )

@@ -138,14 +138,14 @@ def __init__(
         init_reference_model: bool = True,
         **kwargs: Any,
     ):
-        # Disable NCCL SHM if training and generation are not co-located: https://github.com/NVIDIA-NeMo/RL/issues/564
+        # Explicitly set NCCL_CUMEM_ENABLE to 1 to avoid the P2P initialization error for PyNCCLCommunicator.
+        # See https://github.com/NVIDIA-NeMo/RL/issues/564 for more details.
         if (
             "generation" in config
             and config["generation"] is not None
             and not config["generation"]["colocated"]["enabled"]
         ):
-            os.environ["NCCL_SHM_DISABLE"] = "1"
-            os.environ["NCCL_P2P_DISABLE"] = "1"
+            os.environ["NCCL_CUMEM_ENABLE"] = "1"
 
         self.cfg = config
         # torch distributed init. Envars for rank, world_size, and master_addr and master_port are set from the ray remote call
@@ -388,11 +388,6 @@ def init_collective(self, ip: str, port: int, world_size: int) -> None:
         from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator
         from vllm.distributed.utils import StatelessProcessGroup
 
-        # keep the same behavior as vllm
-        # see https://github.com/vllm-project/vllm/blob/v0.9.0/vllm/env_override.py#L25
-        if not os.path.exists("/dev/nvidia-caps-imex-channels"):
-            os.environ["NCCL_CUMEM_ENABLE"] = "0"
-
         if self.rank == 0:
             pg = StatelessProcessGroup.create(
                 host=ip, port=port, rank=0, world_size=world_size