diff --git a/vllm/model_executor/models/minimax_m2.py b/vllm/model_executor/models/minimax_m2.py index 95b036ac218b..a47c2a7771bb 100644 --- a/vllm/model_executor/models/minimax_m2.py +++ b/vllm/model_executor/models/minimax_m2.py @@ -147,7 +147,6 @@ def __init__( num_kv_heads: int, rotary_dim: int, rope_parameters: dict[str, Any] | None = None, - attn_window_size: int | None = None, max_position_embeddings: int = 8192, head_dim: int | None = None, rms_norm_eps: float = 1e-06, @@ -211,7 +210,6 @@ def __init__( self.head_dim, self.scaling, num_kv_heads=self.num_kv_heads, - per_layer_sliding_window=attn_window_size, cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.attn", @@ -291,7 +289,7 @@ def forward( positions: torch.Tensor, hidden_states: torch.Tensor, residual: torch.Tensor | None, - ) -> torch.Tensor: + ) -> tuple[torch.Tensor, torch.Tensor]: # Self Attention if residual is None: residual = hidden_states @@ -330,7 +328,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.embed_tokens = VocabParallelEmbedding( config.vocab_size, config.hidden_size, - quant_config=None, + quant_config=quant_config, prefix=f"{prefix}.embed_tokens", ) else: @@ -437,16 +435,24 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: if is_pp_missing_parameter(name, self): continue + if name not in params_dict: + continue param = params_dict[name] weight_loader = param.weight_loader weight_loader(param, loaded_weight, shard_id) break else: + is_expert_weight = False for mapping in expert_params_mapping: param_name, weight_name, expert_id, shard_id = mapping if weight_name not in name: continue + + # Anyway, this is an expert weight and should not be + # attempted to load as other weights later + is_expert_weight = True + name = name.replace(weight_name, param_name) if is_pp_missing_parameter(name, self): @@ -463,6 +469,12 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: ) break else: + if is_expert_weight: + # We've checked that this is an expert weight + # However it's not mapped locally to this rank + # So we simply skip it + continue + # Skip loading extra bias for GPTQ models. if name.endswith(".bias") and name not in params_dict: continue @@ -506,11 +518,14 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): ) if get_pp_group().is_last_rank: self.lm_head = ParallelLMHead( - config.vocab_size, config.hidden_size, quant_config=None + config.vocab_size, + config.hidden_size, + quant_config=quant_config, + prefix=maybe_prefix(prefix, "lm_head"), ) + self.logits_processor = LogitsProcessor(config.vocab_size) else: self.lm_head = PPMissingLayer() - self.logits_processor = LogitsProcessor(config.vocab_size) self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors )