From f097a7da8550805919d6313308c3a528601d09fa Mon Sep 17 00:00:00 2001
From: wangyafeng
Date: Mon, 13 Oct 2025 14:44:27 +0800
Subject: [PATCH] [Model][Bugfix] Fix ernie45 load failure caused by the
 ernie45 EPLB code

Signed-off-by: wangyafeng
---
 vllm/model_executor/models/ernie45_moe.py | 34 +++++++++++++++--------
 1 file changed, 22 insertions(+), 12 deletions(-)

diff --git a/vllm/model_executor/models/ernie45_moe.py b/vllm/model_executor/models/ernie45_moe.py
index e01f26731cd9..607589e68ef3 100644
--- a/vllm/model_executor/models/ernie45_moe.py
+++ b/vllm/model_executor/models/ernie45_moe.py
@@ -23,7 +23,8 @@
 # limitations under the License.
 """Inference-only ErineMoE model compatible with HuggingFace weights."""
 
-from collections.abc import Iterable
+import typing
+from collections.abc import Callable, Iterable
 from itertools import islice
 from typing import Any
 
@@ -139,10 +140,10 @@ def __init__(
 
         # Load balancing settings.
         vllm_config = get_current_vllm_config()
-        parallel_config = vllm_config.parallel_config
+        eplb_config = vllm_config.parallel_config.eplb_config
         self.enable_eplb = enable_eplb
 
-        self.n_redundant_experts = parallel_config.num_redundant_experts
+        self.n_redundant_experts = eplb_config.num_redundant_experts
         self.n_logical_experts = self.n_routed_experts
         self.n_physical_experts = self.n_logical_experts + self.n_redundant_experts
         self.n_local_physical_experts = self.n_physical_experts // self.ep_size
@@ -426,8 +427,10 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         self.vocab_size = config.vocab_size
         self.config = config
         parallel_config = vllm_config.parallel_config
+        eplb_config = parallel_config.eplb_config
         enable_eplb = parallel_config.enable_eplb
-        self.num_redundant_experts = parallel_config.num_redundant_experts
+
+        self.num_redundant_experts = eplb_config.num_redundant_experts
 
         if get_pp_group().is_first_rank:
             self.embed_tokens = VocabParallelEmbedding(
@@ -570,20 +573,27 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
 
                     # Skip loading extra bias for GPTQ models.
                     if (
-                        name.endswith(".bias") or name.endswith("_bias")
-                    ) and name not in params_dict:
+                        name_mapped.endswith(".bias") or name_mapped.endswith("_bias")
+                    ) and name_mapped not in params_dict:
                         continue
-                    param = params_dict[name]
-
-                    weight_loader = param.weight_loader
-                    weight_loader(
+                    param = params_dict[name_mapped]
+                    # We should ask the weight loader to return success or not
+                    # here since otherwise we may skip experts with other
+                    # available replicas.
+                    weight_loader = typing.cast(
+                        Callable[..., bool], param.weight_loader
+                    )
+                    success = weight_loader(
                         param,
                         loaded_weight,
-                        name,
+                        name_mapped,
                         shard_id=shard_id,
                         expert_id=expert_id,
+                        return_success=True,
                     )
-                    break
+                    if success:
+                        name = name_mapped
+                        break
                 else:
                     if is_expert_weight:
                         # We've checked that this is an expert weight
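
The core of the fix is the for/else retry in load_weights: with EPLB enabled, a
logical expert can have several physical replicas, only some of which live on
the local rank, so one failed mapping must not end the search before the
remaining replicas are tried. Below is a minimal, self-contained Python sketch
of that pattern; make_loader, local_expert_ids, and the weight name are
hypothetical stand-ins rather than vLLM APIs, and only the control flow
mirrors the patch.

    import typing
    from collections.abc import Callable

    def make_loader(local_expert_ids: set[int]) -> Callable[..., bool]:
        # Hypothetical stand-in for param.weight_loader: with
        # return_success=True it reports whether this rank actually holds
        # the target expert instead of raising.
        def load(name: str, expert_id: int, *, return_success: bool = False) -> bool:
            ok = expert_id in local_expert_ids
            if return_success:
                return ok
            if not ok:
                raise ValueError(f"expert {expert_id} is not local for {name}")
            return True
        return load

    weight_loader = typing.cast(Callable[..., bool], make_loader({0, 2}))

    # Mirrors the patched loop: try each candidate physical replica and keep
    # the first one the loader accepts; the for-else runs only when none match.
    for expert_id in (1, 3, 2):
        success = weight_loader("experts.w13_weight", expert_id, return_success=True)
        if success:
            print(f"loaded as local expert {expert_id}")
            break
    else:
        print("no local replica for this weight; skipping it is correct under EPLB")

With local_expert_ids = {0, 2} and candidate ids (1, 3, 2), the first two
attempts report failure and the loop keeps searching until replica 2 succeeds;
an unconditional break after the first attempt, as in the pre-patch loop,
would have stopped the search too early.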