diff --git a/docs/advance/fsdp_extension.rst b/docs/advance/fsdp_extension.rst
index 3441aa5a93c..11e9d8a1337 100644
--- a/docs/advance/fsdp_extension.rst
+++ b/docs/advance/fsdp_extension.rst
@@ -85,7 +85,8 @@ vLLM, follow the guide of gemma model below:
             param = params_dict[name]
             weight_loader = getattr(param, "weight_loader", default_weight_loader)
-            weight_loader(param, loaded_weight)
+            - weight_loader(param, loaded_weight)
+            + weight_loader(param, local_loaded_weight.to(dtype=param.dtype))
             loaded_params.add(name)
         unloaded_params = params_dict.keys() - loaded_params
         if unloaded_params:
diff --git a/verl/third_party/vllm/vllm_v_0_4_2/llm_engine_sp.py b/verl/third_party/vllm/vllm_v_0_4_2/llm_engine_sp.py
index c002f49a2f6..75bf11ab319 100644
--- a/verl/third_party/vllm/vllm_v_0_4_2/llm_engine_sp.py
+++ b/verl/third_party/vllm/vllm_v_0_4_2/llm_engine_sp.py
@@ -260,12 +260,10 @@ def from_engine_args(
             "Currently, the vllm in verl only support running on GPU"
 
         if engine_config.parallel_config.world_size == 1:
-            # TODO: may also need to init process group
-            from vllm.executor.gpu_executor import GPUExecutor
-            executor_class = GPUExecutor
-        else:
-            from .spmd_gpu_executor import SPMDGPUExecutor
-            executor_class = SPMDGPUExecutor
+            engine_config.load_config.load_format = "dummy_hf"
+
+        from .spmd_gpu_executor import SPMDGPUExecutor
+        executor_class = SPMDGPUExecutor
 
         # Create the LLM engine.
         engine = cls(
diff --git a/verl/third_party/vllm/vllm_v_0_5_4/llm_engine_sp.py b/verl/third_party/vllm/vllm_v_0_5_4/llm_engine_sp.py
index 098bffd0679..8d161e74706 100644
--- a/verl/third_party/vllm/vllm_v_0_5_4/llm_engine_sp.py
+++ b/verl/third_party/vllm/vllm_v_0_5_4/llm_engine_sp.py
@@ -279,29 +279,14 @@ def free_cache_engine(self):
     # The GPUExecutor remove the Ray dependency
     @classmethod
     def _get_executor_cls(cls, engine_config: EngineConfig) -> Type[ExecutorBase]:
-        distributed_executor_backend = (engine_config.parallel_config.distributed_executor_backend)
-        # Initialize the cluster and specify the executor class.]
-        # Initialize the cluster and specify the executor class.
         assert engine_config.device_config.device_type == "cuda", \
             "Currently, the vllm in verl only support running on GPU"
 
         if engine_config.parallel_config.world_size == 1:
-            # TODO: may also need to init process group
-            from vllm.executor.gpu_executor import GPUExecutor
-            executor_class = GPUExecutor
-        else:
-            from .spmd_gpu_executor import SPMDGPUExecutor
-            executor_class = SPMDGPUExecutor
-        # elif distributed_executor_backend == "mp":
-        #     from vllm.executor.multiproc_gpu_executor import (
-        #         MultiprocessingGPUExecutor)
-        #     assert not envs.VLLM_USE_RAY_SPMD_WORKER, (
-        #         "multiprocessing distributed executor backend does not "
-        #         "support VLLM_USE_RAY_SPMD_WORKER=1")
-        #     executor_class = MultiprocessingGPUExecutor
-        # else:
-        #     from vllm.executor.gpu_executor import GPUExecutor
-        #     executor_class = GPUExecutor
+            engine_config.load_config.load_format = "dummy_hf"
+
+        from .spmd_gpu_executor import SPMDGPUExecutor
+        executor_class = SPMDGPUExecutor
         return executor_class
 
     @classmethod
@@ -321,13 +306,8 @@ def from_engine_args(
         assert engine_config.device_config.device_type == "cuda", \
             "Currently, the vllm in verl only support running on GPU"
 
-        if engine_config.parallel_config.world_size == 1:
-            # TODO: may also need to init process group
-            from vllm.executor.gpu_executor import GPUExecutor
-            executor_class = GPUExecutor
-        else:
-            from .spmd_gpu_executor import SPMDGPUExecutor
-            executor_class = SPMDGPUExecutor
+        from .spmd_gpu_executor import SPMDGPUExecutor
+        executor_class = SPMDGPUExecutor
 
         # Create the LLM engine.
         engine = cls(
diff --git a/verl/trainer/ppo/workers/fsdp_workers.py b/verl/trainer/ppo/workers/fsdp_workers.py
index aaf275ddf71..5e4cbd60c29 100644
--- a/verl/trainer/ppo/workers/fsdp_workers.py
+++ b/verl/trainer/ppo/workers/fsdp_workers.py
@@ -232,6 +232,8 @@ def _build_rollout(self):
                                   tokenizer=self.tokenizer,
                                   model_hf_config=self.actor_model_config)
             log_gpu_memory_usage('After building vllm rollout', logger=None)
+            if torch.distributed.get_world_size() == 1:
+                self.config.rollout.load_format = 'dummy_hf'
             sharding_manager = FSDPVLLMShardingManager(module=self.actor_module_fsdp,
                                                        inference_engine=rollout.inference_engine,
                                                        model_config=self.actor_model_config,