vllm/model_executor/layers/attention/mla_attention.py (21 changes: 18 additions & 3 deletions)

@@ -2359,11 +2359,26 @@ def _compute_prefill_context(
         kv_c_normed = workspace[:toks][..., : self.kv_lora_rank]
         # When FP8 weights are used without FP8 prefill, kv_b_proj expects
         # model dtype input and will quantize internally.
+        # Only cast when the weight is stored as a float dtype (BF16/FP16/FP8).
+        # Quantized layers with integer storage (e.g. Marlin int32 for NVFP4,
+        # or weight_packed for INT4) expect BF16 input and handle
+        # dequantization internally — do not cast in those cases.
+        _kv_b_w = getattr(self.kv_b_proj, "weight", None)
+        _kv_b_weight_dtype = _kv_b_w.dtype if _kv_b_w is not None else None
         if (
-            use_fp8_prefill
-            or self.kv_b_proj.weight.dtype != current_platform.fp8_dtype()
+            _kv_b_weight_dtype is not None
+            and (
+                use_fp8_prefill
+                or _kv_b_weight_dtype != current_platform.fp8_dtype()
+            )
+            and _kv_b_weight_dtype
+            in (
+                torch.float16,
+                torch.bfloat16,
+                current_platform.fp8_dtype(),
+            )
         ):
-            kv_c_normed = kv_c_normed.to(self.kv_b_proj.weight.dtype)
+            kv_c_normed = kv_c_normed.to(_kv_b_weight_dtype)

         k_pe = workspace[:toks][..., self.kv_lora_rank :].unsqueeze(1)
         kv_nope = self.kv_b_proj(kv_c_normed)[0].view(
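Read outside the diff, the new condition is a three-part gate: a `weight` tensor must exist, the original FP8-prefill rule must still hold, and the storage dtype must be a genuine float type. Below is a minimal standalone sketch of that gate; the FP8 stand-in dtype and the `maybe_cast`/`FLOAT_DTYPES` names are illustrative assumptions, not vLLM API.

```python
import torch

FP8 = torch.float8_e4m3fn  # stand-in for current_platform.fp8_dtype()
FLOAT_DTYPES = (torch.float16, torch.bfloat16, FP8)

def maybe_cast(
    x: torch.Tensor, layer: torch.nn.Module, use_fp8_prefill: bool
) -> torch.Tensor:
    """Cast x to the layer's weight dtype only when that is safe."""
    w = getattr(layer, "weight", None)
    if w is None or w.dtype not in FLOAT_DTYPES:
        # No float weight tensor: integer-packed quantized storage
        # (e.g. Marlin int32 tiles, INT4 weight_packed) dequantizes
        # internally and expects the BF16 input unchanged.
        return x
    if w.dtype == FP8 and not use_fp8_prefill:
        # FP8 weights without FP8 prefill: the layer quantizes the
        # model-dtype input internally, so do not pre-cast to FP8.
        return x
    return x.to(w.dtype)

# Example: a half-precision layer triggers the cast.
x = torch.randn(4, 8, dtype=torch.bfloat16)
fp16_layer = torch.nn.Linear(8, 8, dtype=torch.float16)
assert maybe_cast(x, fp16_layer, use_fp8_prefill=False).dtype == torch.float16
```

The key design point is checking the *storage* dtype rather than assuming a float weight: quantized backends that pack weights into integers (or store them under another attribute entirely) must see the activation untouched.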
vllm/model_executor/models/glm4_moe_lite.py (2 changes: 1 addition & 1 deletion)

@@ -506,7 +506,7 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
                 param, "weight_loader", default_weight_loader
             )
             weight_loader(param, loaded_weight)
-            if not is_fusion_moe_shared_experts_layer:
+            if not is_fusion_moe_shared_experts_layer and name is not None:
                 loaded_params.add(name)

         return loaded_params
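The second hunk keeps `None` out of the `set[str]` that `load_weights` returns, which callers compare against expected parameter names. A minimal sketch of the guarded behavior follows; the `record` helper and the parameter string are hypothetical, and it assumes an upstream path can leave `name` unset.

```python
loaded_params: set[str] = set()

def record(name: str | None, is_fusion_moe_shared_experts_layer: bool) -> None:
    # Mirrors the patched condition: skip fused shared-expert layers
    # and, after this change, any entry whose name has been cleared.
    if not is_fusion_moe_shared_experts_layer and name is not None:
        loaded_params.add(name)

record("model.layers.0.mlp.gate.weight", False)  # normal weight: recorded
record(None, False)  # name cleared upstream: skipped instead of adding None
assert None not in loaded_params
```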