diff --git a/docs/advance/fsdp_extension.rst b/docs/advance/fsdp_extension.rst
index 3441aa5a93c..11e9d8a1337 100644
--- a/docs/advance/fsdp_extension.rst
+++ b/docs/advance/fsdp_extension.rst
@@ -85,7 +85,8 @@ vLLM, follow the guide of gemma model below:
             param = params_dict[name]
             weight_loader = getattr(param, "weight_loader", default_weight_loader)
-            weight_loader(param, loaded_weight)
+            - weight_loader(param, loaded_weight)
+            + weight_loader(param, local_loaded_weight.to(dtype=param.dtype))
             loaded_params.add(name)
         unloaded_params = params_dict.keys() - loaded_params
         if unloaded_params:
diff --git a/verl/third_party/vllm/vllm_v_0_4_2/llm_engine_sp.py b/verl/third_party/vllm/vllm_v_0_4_2/llm_engine_sp.py
index c002f49a2f6..75bf11ab319 100644
--- a/verl/third_party/vllm/vllm_v_0_4_2/llm_engine_sp.py
+++ b/verl/third_party/vllm/vllm_v_0_4_2/llm_engine_sp.py
@@ -260,12 +260,10 @@ def from_engine_args(
             "Currently, the vllm in verl only support running on GPU"
 
         if engine_config.parallel_config.world_size == 1:
-            # TODO: may also need to init process group
-            from vllm.executor.gpu_executor import GPUExecutor
-            executor_class = GPUExecutor
-        else:
-            from .spmd_gpu_executor import SPMDGPUExecutor
-            executor_class = SPMDGPUExecutor
+            engine_config.load_config.load_format = "dummy_hf"
+
+        from .spmd_gpu_executor import SPMDGPUExecutor
+        executor_class = SPMDGPUExecutor
 
         # Create the LLM engine.
         engine = cls(
diff --git a/verl/third_party/vllm/vllm_v_0_5_4/llm_engine_sp.py b/verl/third_party/vllm/vllm_v_0_5_4/llm_engine_sp.py
index 098bffd0679..8d161e74706 100644
--- a/verl/third_party/vllm/vllm_v_0_5_4/llm_engine_sp.py
+++ b/verl/third_party/vllm/vllm_v_0_5_4/llm_engine_sp.py
@@ -279,29 +279,14 @@ def free_cache_engine(self):
     # The GPUExecutor remove the Ray dependency
     @classmethod
     def _get_executor_cls(cls, engine_config: EngineConfig) -> Type[ExecutorBase]:
-        distributed_executor_backend = (engine_config.parallel_config.distributed_executor_backend)
-        # Initialize the cluster and specify the executor class.]
-        # Initialize the cluster and specify the executor class.
         assert engine_config.device_config.device_type == "cuda", \
             "Currently, the vllm in verl only support running on GPU"
 
         if engine_config.parallel_config.world_size == 1:
-            # TODO: may also need to init process group
-            from vllm.executor.gpu_executor import GPUExecutor
-            executor_class = GPUExecutor
-        else:
-            from .spmd_gpu_executor import SPMDGPUExecutor
-            executor_class = SPMDGPUExecutor
-        # elif distributed_executor_backend == "mp":
-        #     from vllm.executor.multiproc_gpu_executor import (
-        #         MultiprocessingGPUExecutor)
-        #     assert not envs.VLLM_USE_RAY_SPMD_WORKER, (
-        #         "multiprocessing distributed executor backend does not "
-        #         "support VLLM_USE_RAY_SPMD_WORKER=1")
-        #     executor_class = MultiprocessingGPUExecutor
-        # else:
-        #     from vllm.executor.gpu_executor import GPUExecutor
-        #     executor_class = GPUExecutor
+            engine_config.load_config.load_format = "dummy_hf"
+
+        from .spmd_gpu_executor import SPMDGPUExecutor
+        executor_class = SPMDGPUExecutor
         return executor_class
 
     @classmethod
@@ -321,13 +306,8 @@ def from_engine_args(
         assert engine_config.device_config.device_type == "cuda", \
             "Currently, the vllm in verl only support running on GPU"
 
-        if engine_config.parallel_config.world_size == 1:
-            # TODO: may also need to init process group
-            from vllm.executor.gpu_executor import GPUExecutor
-            executor_class = GPUExecutor
-        else:
-            from .spmd_gpu_executor import SPMDGPUExecutor
-            executor_class = SPMDGPUExecutor
+        from .spmd_gpu_executor import SPMDGPUExecutor
+        executor_class = SPMDGPUExecutor
 
         # Create the LLM engine.
         engine = cls(
diff --git a/verl/trainer/ppo/workers/fsdp_workers.py b/verl/trainer/ppo/workers/fsdp_workers.py
index aaf275ddf71..5e4cbd60c29 100644
--- a/verl/trainer/ppo/workers/fsdp_workers.py
+++ b/verl/trainer/ppo/workers/fsdp_workers.py
@@ -232,6 +232,8 @@ def _build_rollout(self):
                                   tokenizer=self.tokenizer,
                                   model_hf_config=self.actor_model_config)
             log_gpu_memory_usage('After building vllm rollout', logger=None)
+            if torch.distributed.get_world_size() == 1:
+                self.config.rollout.load_format = 'dummy_hf'
             sharding_manager = FSDPVLLMShardingManager(module=self.actor_module_fsdp,
                                                        inference_engine=rollout.inference_engine,
                                                        model_config=self.actor_model_config,