diff --git a/nemo_rl/models/generation/vllm.py b/nemo_rl/models/generation/vllm.py
index 9125b67665..0c1e4ee40e 100644
--- a/nemo_rl/models/generation/vllm.py
+++ b/nemo_rl/models/generation/vllm.py
@@ -1329,11 +1329,11 @@ def __init__(
         )
 
         # It's necessary to set env_vars here to ensure that vllm non-leader workers also have these env_vars
-        # Disable NCCL SHM if training and generation are not co-located: https://github.com/NVIDIA-NeMo/RL/issues/564
+        # Explicitly set NCCL_CUMEM_ENABLE to 1 to avoid the P2P initialization error for PyNcclCommunicator.
+        # See https://github.com/NVIDIA-NeMo/RL/issues/564 for more details.
         env_vars = {}
         if not self.cfg["colocated"]["enabled"]:
-            env_vars["NCCL_SHM_DISABLE"] = "1"
-            env_vars["NCCL_P2P_DISABLE"] = "1"
+            env_vars["NCCL_CUMEM_ENABLE"] = "1"
 
         # Check if we need parallelism-aware worker group creation
         if self.model_parallel_size > 1:
diff --git a/nemo_rl/models/generation/vllm_backend.py b/nemo_rl/models/generation/vllm_backend.py
index 8aaa34cd15..01dc68146b 100644
--- a/nemo_rl/models/generation/vllm_backend.py
+++ b/nemo_rl/models/generation/vllm_backend.py
@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import os
 from typing import Any
 
 import torch
@@ -36,6 +37,10 @@ def init_collective(
         local_rank = torch.distributed.get_rank()
         rank = rank_prefix + local_rank + 1  # 1 is the head node of the train cluster
 
+        # Temporary fix for vllm==0.9.0, which overrides NCCL_CUMEM_ENABLE to 0 and causes
+        # https://github.com/NVIDIA-NeMo/RL/issues/564. This can be removed after upgrading to vllm>=0.9.1rc1.
+        os.environ["NCCL_CUMEM_ENABLE"] = "1"
+
         pg = StatelessProcessGroup.create(
             host=ip, port=port, rank=rank, world_size=world_size
         )
diff --git a/nemo_rl/models/policy/dtensor_policy_worker.py b/nemo_rl/models/policy/dtensor_policy_worker.py
index 2fd7032bdd..6e619885ae 100644
--- a/nemo_rl/models/policy/dtensor_policy_worker.py
+++ b/nemo_rl/models/policy/dtensor_policy_worker.py
@@ -138,14 +138,14 @@ def __init__(
         init_reference_model: bool = True,
         **kwargs: Any,
     ):
-        # Disable NCCL SHM if training and generation are not co-located: https://github.com/NVIDIA-NeMo/RL/issues/564
+        # Explicitly set NCCL_CUMEM_ENABLE to 1 to avoid the P2P initialization error for PyNcclCommunicator.
+        # See https://github.com/NVIDIA-NeMo/RL/issues/564 for more details.
         if (
             "generation" in config
             and config["generation"] is not None
             and not config["generation"]["colocated"]["enabled"]
         ):
-            os.environ["NCCL_SHM_DISABLE"] = "1"
-            os.environ["NCCL_P2P_DISABLE"] = "1"
+            os.environ["NCCL_CUMEM_ENABLE"] = "1"
         self.cfg = config
 
         # torch distributed init. Envars for rank, world_size, and master_addr and master_port are set from the ray remote call
@@ -388,11 +388,6 @@ def init_collective(self, ip: str, port: int, world_size: int) -> None:
         from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator
         from vllm.distributed.utils import StatelessProcessGroup
 
-        # keep the same behavior as vllm
-        # see https://github.com/vllm-project/vllm/blob/v0.9.0/vllm/env_override.py#L25
-        if not os.path.exists("/dev/nvidia-caps-imex-channels"):
-            os.environ["NCCL_CUMEM_ENABLE"] = "0"
-
         if self.rank == 0:
             pg = StatelessProcessGroup.create(
                 host=ip, port=port, rank=0, world_size=world_size
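The ordering is the subtle part of this fix: vllm==0.9.0's `env_override.py` clobbers `NCCL_CUMEM_ENABLE` at import time, and NCCL only reads the variable when a communicator is created, so the override must land after the vllm import but before `PyNcclCommunicator` is constructed. Below is a minimal sketch of that handshake, not code from this PR; `HOST`, `PORT`, `WORLD_SIZE`, and the `RANK` variable are illustrative placeholders for a hypothetical two-process setup.

```python
import os

import torch
from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator
from vllm.distributed.utils import StatelessProcessGroup

HOST = "10.0.0.1"   # hypothetical train-cluster head node address
PORT = 29500        # hypothetical rendezvous port
WORLD_SIZE = 2      # one trainer rank + one vllm rank, for illustration
rank = int(os.environ.get("RANK", "0"))  # 0 = train head, 1 = vllm worker

# Re-assert the value after the vllm import, mirroring the diff above:
# vllm==0.9.0 may have forced NCCL_CUMEM_ENABLE to "0" when it was imported,
# and NCCL reads the variable only when the communicator is created below.
os.environ["NCCL_CUMEM_ENABLE"] = "1"

# Rendezvous without touching torch.distributed's global state, then build the
# NCCL communicator whose P2P initialization issue #564 reported failing.
pg = StatelessProcessGroup.create(
    host=HOST, port=PORT, rank=rank, world_size=WORLD_SIZE
)
comm = PyNcclCommunicator(pg, device=torch.device("cuda:0"))
```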