diff --git a/vllm/config/parallel.py b/vllm/config/parallel.py
index f7f952af66e1..d4048a4731ef 100644
--- a/vllm/config/parallel.py
+++ b/vllm/config/parallel.py
@@ -138,6 +138,13 @@ class ParallelConfig:
     """Whether the deployed model is MoE (if known)."""
     enable_expert_parallel: bool = False
     """Use expert parallelism instead of tensor parallelism for MoE layers."""
+    enable_ep_weight_filter: bool = False
+    """Skip non-local expert weights during model loading when expert
+    parallelism is active. Each rank only reads its own expert shard from
+    disk, which can drastically reduce storage I/O for MoE models with
+    per-expert weight tensors (e.g. DeepSeek, Mixtral, Kimi-K2.5). Has no
+    effect on 3D fused-expert checkpoints (e.g. GPT-OSS) or non-MoE
+    models."""
     enable_eplb: bool = False
     """Enable expert parallelism load balancing for MoE layers."""
     eplb_config: EPLBConfig = Field(default_factory=EPLBConfig)
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 2c04c06e7a68..d0bdd4916144 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -419,6 +419,7 @@ class EngineArgs:
     data_parallel_external_lb: bool = False
     data_parallel_backend: DataParallelBackend = ParallelConfig.data_parallel_backend
     enable_expert_parallel: bool = ParallelConfig.enable_expert_parallel
+    enable_ep_weight_filter: bool = ParallelConfig.enable_ep_weight_filter
     moe_backend: MoEBackend = KernelConfig.moe_backend
     all2all_backend: All2AllBackend = ParallelConfig.all2all_backend
     enable_elastic_ep: bool = ParallelConfig.enable_elastic_ep
@@ -902,6 +903,10 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
             "-ep",
             **parallel_kwargs["enable_expert_parallel"],
         )
+        parallel_group.add_argument(
+            "--enable-ep-weight-filter",
+            **parallel_kwargs["enable_ep_weight_filter"],
+        )
         parallel_group.add_argument(
             "--all2all-backend", **parallel_kwargs["all2all_backend"]
         )
@@ -1731,6 +1736,7 @@ def create_engine_config(
             data_parallel_hybrid_lb=self.data_parallel_hybrid_lb,
             is_moe_model=model_config.is_moe,
             enable_expert_parallel=self.enable_expert_parallel,
+            enable_ep_weight_filter=self.enable_ep_weight_filter,
             all2all_backend=self.all2all_backend,
             enable_elastic_ep=self.enable_elastic_ep,
             enable_dbo=self.enable_dbo,
diff --git a/vllm/model_executor/model_loader/default_loader.py b/vllm/model_executor/model_loader/default_loader.py
index 693bb2987d31..a8d81024421d 100644
--- a/vllm/model_executor/model_loader/default_loader.py
+++ b/vllm/model_executor/model_loader/default_loader.py
@@ -313,7 +313,11 @@ def _init_ep_weight_filter(self, model_config: ModelConfig) -> None:
         vllm_config = get_current_vllm_config()
         parallel_config = vllm_config.parallel_config

-        if not (model_config.is_moe and parallel_config.enable_expert_parallel):
+        if not (
+            model_config.is_moe
+            and parallel_config.enable_expert_parallel
+            and parallel_config.enable_ep_weight_filter
+        ):
             return

         num_experts = model_config.get_num_experts()
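
For context, the sketch below illustrates the kind of per-tensor predicate such a filter can apply; it is not the vLLM implementation. It assumes per-expert weights carry an explicit expert index in their name (something like "model.layers.<L>.mlp.experts.<E>.<proj>.weight") and that experts are assigned to EP ranks in contiguous blocks; the helper name make_ep_weight_filter is hypothetical.

# Illustrative sketch only -- not the vLLM code path added in this diff.
# Assumes a contiguous expert-to-rank layout and index-in-name expert tensors.

def make_ep_weight_filter(num_experts: int, ep_size: int, ep_rank: int):
    """Return a predicate that is True for weights this EP rank should load."""
    experts_per_rank = num_experts // ep_size
    local_lo = ep_rank * experts_per_rank
    local_hi = local_lo + experts_per_rank  # exclusive upper bound

    def is_local(weight_name: str) -> bool:
        marker = ".experts."
        pos = weight_name.find(marker)
        if pos == -1:
            return True  # not a per-expert tensor, always load it
        expert_id_str = weight_name[pos + len(marker):].split(".", 1)[0]
        if not expert_id_str.isdigit():
            return True  # fused/3D expert tensors have no per-expert index
        return local_lo <= int(expert_id_str) < local_hi

    return is_local


# Example: rank 1 of an 8-way EP deployment with 256 routed experts keeps
# experts 32..63 and skips the rest at load time.
keep = make_ep_weight_filter(num_experts=256, ep_size=8, ep_rank=1)
assert keep("model.layers.3.mlp.experts.40.down_proj.weight")
assert not keep("model.layers.3.mlp.experts.200.down_proj.weight")
assert keep("model.layers.3.self_attn.q_proj.weight")

This also shows why the flag, per its docstring, has no effect on 3D fused-expert checkpoints: when all experts of a layer live in one stacked tensor, there is no per-expert name to filter on, so the tensor must still be read in full.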