diff --git a/vllm/model_executor/models/bailing_moe_linear.py b/vllm/model_executor/models/bailing_moe_linear.py
index 55ea1bad44db..bcf1200bd1b4 100644
--- a/vllm/model_executor/models/bailing_moe_linear.py
+++ b/vllm/model_executor/models/bailing_moe_linear.py
@@ -205,13 +205,19 @@ def __init__(
             self.q_a_layernorm = None
             self.q_b_proj = None

-        rope_parameters = _build_rope_parameters(config)
+        rope_parameters = _build_rope_parameters(config) or {}
+        # MLA rotates the full qk_rope_head_dim; partial_rotary_factor
+        # applies to the linear-attention head only.
+        rope_parameters = {
+            k: v for k, v in rope_parameters.items() if k != "partial_rotary_factor"
+        }
+        rope_parameters["rope_dim"] = self.qk_rope_head_dim
         max_position = getattr(config, "max_position_embeddings", 8192)
         self.rotary_emb = get_rope(
             head_size=self.qk_rope_head_dim,
             max_position=max_position,
             is_neox_style=False,
-            rope_parameters=rope_parameters or None,
+            rope_parameters=rope_parameters,
         )

         # Build MLAModules for MultiHeadLatentAttentionWrapper
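
Note: below is a minimal, self-contained sketch of the parameter filtering this
hunk introduces, for sanity-checking the behaviour in isolation. The key names
("partial_rotary_factor", "rope_dim") come from the diff itself; the helper
function and the example inputs are hypothetical, not vLLM API.

    def filter_mla_rope_parameters(raw, qk_rope_head_dim):
        # Tolerate a None result from the rope-parameter builder,
        # mirroring the "or {}" normalization in the diff.
        params = dict(raw or {})
        # partial_rotary_factor only concerns the linear-attention head;
        # MLA rotates the full qk_rope_head_dim, so drop it here.
        params.pop("partial_rotary_factor", None)
        # Pin the rotary dimension explicitly for the MLA path.
        params["rope_dim"] = qk_rope_head_dim
        return params

    # A config carrying a partial factor meant for the linear head:
    assert filter_mla_rope_parameters(
        {"rope_theta": 10000.0, "partial_rotary_factor": 0.5}, 64
    ) == {"rope_theta": 10000.0, "rope_dim": 64}
    # No rope settings at all: the result is still a non-empty dict, which
    # is why the old "rope_parameters or None" guard became unnecessary.
    assert filter_mla_rope_parameters(None, 64) == {"rope_dim": 64}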