Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 21 additions & 6 deletions vllm/model_executor/models/minimax_m2.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,6 @@ def __init__(
num_kv_heads: int,
rotary_dim: int,
rope_parameters: dict[str, Any] | None = None,
attn_window_size: int | None = None,
max_position_embeddings: int = 8192,
head_dim: int | None = None,
rms_norm_eps: float = 1e-06,
Expand Down Expand Up @@ -211,7 +210,6 @@ def __init__(
self.head_dim,
self.scaling,
num_kv_heads=self.num_kv_heads,
per_layer_sliding_window=attn_window_size,
cache_config=cache_config,
quant_config=quant_config,
prefix=f"{prefix}.attn",
Expand Down Expand Up @@ -291,7 +289,7 @@ def forward(
positions: torch.Tensor,
hidden_states: torch.Tensor,
residual: torch.Tensor | None,
) -> torch.Tensor:
) -> tuple[torch.Tensor, torch.Tensor]:
# Self Attention
if residual is None:
residual = hidden_states
Expand Down Expand Up @@ -330,7 +328,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
self.embed_tokens = VocabParallelEmbedding(
config.vocab_size,
config.hidden_size,
quant_config=None,
quant_config=quant_config,
prefix=f"{prefix}.embed_tokens",
)
else:
Expand Down Expand Up @@ -437,16 +435,24 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:

if is_pp_missing_parameter(name, self):
continue
if name not in params_dict:
continue

param = params_dict[name]
weight_loader = param.weight_loader
weight_loader(param, loaded_weight, shard_id)
break
else:
is_expert_weight = False
for mapping in expert_params_mapping:
param_name, weight_name, expert_id, shard_id = mapping
if weight_name not in name:
continue

# Anyway, this is an expert weight and should not be
# attempted to load as other weights later
is_expert_weight = True

name = name.replace(weight_name, param_name)

if is_pp_missing_parameter(name, self):
Expand All @@ -463,6 +469,12 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
)
break
else:
if is_expert_weight:
# We've checked that this is an expert weight
# However it's not mapped locally to this rank
# So we simply skip it
continue

# Skip loading extra bias for GPTQ models.
if name.endswith(".bias") and name not in params_dict:
continue
Expand Down Expand Up @@ -506,11 +518,14 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
)
if get_pp_group().is_last_rank:
self.lm_head = ParallelLMHead(
config.vocab_size, config.hidden_size, quant_config=None
config.vocab_size,
config.hidden_size,
quant_config=quant_config,
prefix=maybe_prefix(prefix, "lm_head"),
)
self.logits_processor = LogitsProcessor(config.vocab_size)
else:
self.lm_head = PPMissingLayer()
Comment on lines 527 to 528
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

On non-last pipeline parallel ranks, self.logits_processor is not initialized. While compute_logits is guarded in ModelRunner, it's good practice to initialize all attributes in __init__ to avoid potential AttributeError in other code paths. Other models in vLLM initialize this to PPMissingLayer on non-last ranks.

        else:
            self.lm_head = PPMissingLayer()
            self.logits_processor = PPMissingLayer()

self.logits_processor = LogitsProcessor(config.vocab_size)
self.make_empty_intermediate_tensors = (
self.model.make_empty_intermediate_tensors
)
Expand Down