diff --git a/nemo_rl/models/automodel/setup.py b/nemo_rl/models/automodel/setup.py index a017332dba..4347fca5a3 100644 --- a/nemo_rl/models/automodel/setup.py +++ b/nemo_rl/models/automodel/setup.py @@ -75,7 +75,8 @@ def validate_and_prepare_config( if "generation" in config and config["generation"] is not None: is_generation_colocated = config["generation"]["colocated"]["enabled"] - # Set NCCL environment variable + # Explicitly set NCCL_CUMEM_ENABLE to 1 to avoid the P2P initialization error for PyNCCLCommunicator. + # See https://github.com/NVIDIA-NeMo/RL/issues/564 for more details. if not is_generation_colocated: os.environ["NCCL_CUMEM_ENABLE"] = "1" @@ -282,6 +283,11 @@ def setup_distributed( # Force setup distributed for world size 1 as FSDP2Manager skips it if world_size == 1: + if cpu_offload: + raise NotImplementedError( + "CPUOffload doesn't work on single GPU for AutoModel. " + "If you need this feature, please file an issue on https://github.com/NVIDIA-NeMo/Automodel." + ) manager._setup_distributed() return manager