Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file modified docs/assets/contributing/dockerfile-stages-dependency.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
8 changes: 6 additions & 2 deletions vllm/model_executor/models/llama4.py
Original file line number Diff line number Diff line change
Expand Up @@ -801,10 +801,14 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
self,
skip_prefixes=(["lm_head."] if self.config.tie_word_embeddings else None),
)
weights = [
# Use a generator (not a list comprehension) so the weights iterator is
# consumed lazily by AutoWeightsLoader. Materializing it here would hold
# the entire language-model checkpoint in host memory at once, which can
# OOM loaders that return private copies rather than mmap views.
weights = (
self.permute_qk_weight_for_rotary(name, loaded_weight)
for name, loaded_weight in weights
]
)
return loader.load_weights(weights)

def permute_qk_weight_for_rotary(
Expand Down
128 changes: 58 additions & 70 deletions vllm/model_executor/models/mllama4.py
Original file line number Diff line number Diff line change
Expand Up @@ -983,66 +983,6 @@ def _rename_weight_for_modelopt_checkpoint(self, name: str) -> str:

return name

def _separate_and_rename_weights(
    self, weights: Iterable[tuple[str, torch.Tensor]]
) -> tuple[list[tuple[str, torch.Tensor]], list[tuple[str, torch.Tensor]]]:
    """Rename checkpoint weights and bucket them by destination.

    Every incoming name is passed through
    ``_rename_weight_for_modelopt_checkpoint``. Entries whose top-level
    attribute on ``self`` is a ``StageMissingLayer`` are dropped. The rest are
    split by whether the renamed key starts with ``"language_model."``.

    Returns:
        A ``(language_model_weights, other_weights)`` pair of lists holding
        ``(renamed_name, tensor)`` tuples in their original order.
    """
    lm_bucket = []
    rest_bucket = []

    for original_name, tensor in weights:
        new_name = self._rename_weight_for_modelopt_checkpoint(original_name)

        # The first dotted component names the owning sub-module on ``self``.
        top_level, _, _ = new_name.partition(".")
        if isinstance(getattr(self, top_level), StageMissingLayer):
            continue

        bucket = lm_bucket if new_name.startswith("language_model.") else rest_bucket
        bucket.append((new_name, tensor))

    return lm_bucket, rest_bucket

def _handle_expert_scale_broadcasting(
self, weights: list[tuple[str, torch.Tensor]], params_dict: dict
) -> tuple[list[tuple[str, torch.Tensor]], set[str]]:
"""Handle expert scale parameters that need broadcasting.

ModelOpt checkpoints use a single value tensor scalar for BMM style
experts, vLLM expects the scale to be broadcasted across all experts.
"""
regular_weights = []
expert_scale_weights = []
updated_params = set()

for name, weight in weights:
# Check if this is an expert scale parameter that needs broadcasting
if (
"feed_forward.experts." in name
and "scale" in name
and ".shared_expert" not in name
):
name = maybe_remap_moe_expert_param_name(name, params_dict)
if name in params_dict:
param = params_dict[name]
if (
hasattr(param, "data")
and param.data.numel() > 1
and weight.numel() == 1
):
# Broadcast single value to all experts
param.data.fill_(weight.item())
updated_params.add(name)
continue

expert_scale_weights.append((name, weight))
else:
regular_weights.append((name, weight))

return regular_weights, expert_scale_weights, updated_params

def _load_other_weights(
self,
other_weights: Iterable[tuple[str, torch.Tensor]],
Expand Down Expand Up @@ -1103,19 +1043,67 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
params_dict = dict(self.named_parameters())
updated_params: set[str] = set()

# Separate and rename weights
language_model_weights, other_weights = self._separate_and_rename_weights(
weights
)
# Stream the language-model weights straight into
# AutoWeightsLoader so each tensor is loaded and released as we iterate,
# instead of materializing the whole checkpoint in host memory first.
# Only the small vision/projector and scalar expert-scale groups are
# buffered.
other_weights: list[tuple[str, torch.Tensor]] = []
expert_scale_weights: list[tuple[str, torch.Tensor]] = []

def regular_language_model_weights() -> Iterable[tuple[str, torch.Tensor]]:
"""Rename weights and separate them into language_model and other
weights.

Yields the (large) language_model weights for streaming; the small
groups (vision/projector and scalar expert scales) are buffered into
the lists above.
"""
for name, weight in weights:
renamed = self._rename_weight_for_modelopt_checkpoint(name)

attr = renamed.split(".", 1)[0]
if isinstance(getattr(self, attr), StageMissingLayer):
continue

# Handle expert scale parameters
regular_weights, expert_scale_weights, updated_params_from_experts = (
self._handle_expert_scale_broadcasting(language_model_weights, params_dict)
)
updated_params.update(updated_params_from_experts)
if not renamed.startswith("language_model."):
other_weights.append((renamed, weight))
continue

# Handle expert scale parameters that need broadcasting.
# ModelOpt checkpoints store a single scalar tensor for BMM-style
# experts, whereas vLLM expects the scale to be broadcast across
# all experts.
if (
"feed_forward.experts." in renamed
and "scale" in renamed
and ".shared_expert" not in renamed
):
renamed = maybe_remap_moe_expert_param_name(renamed, params_dict)
if renamed in params_dict:
param = params_dict[renamed]
if (
hasattr(param, "data")
and param.data.numel() > 1
and weight.numel() == 1
):
# Broadcast single value to all experts
param.data.fill_(weight.item())
updated_params.add(renamed)
continue

expert_scale_weights.append((renamed, weight))
continue

yield renamed, weight

loader = AutoWeightsLoader(self)
loaded_language_model_params = loader.load_weights(regular_weights)
# AutoWeightsLoader consumes its input lazily and runs to exhaustion,
# so other_weights / expert_scale_weights are fully populated as a side
# effect by the time this returns.
loaded_language_model_params = loader.load_weights(
regular_language_model_weights()
)
assert loaded_language_model_params is not None
updated_params.update(loaded_language_model_params)

Expand Down
Loading