vllm/model_executor/layers/attention/mla_attention.py (21 changes: 18 additions & 3 deletions)

@@ -2359,11 +2359,26 @@ def _compute_prefill_context(
         kv_c_normed = workspace[:toks][..., : self.kv_lora_rank]
         # When FP8 weights are used without FP8 prefill, kv_b_proj expects
         # model dtype input and will quantize internally.
+        # Only cast when the weight is stored as a float dtype (BF16/FP16/FP8).
+        # Quantized layers with integer storage (e.g. Marlin int32 for NVFP4,
+        # or weight_packed for INT4) expect BF16 input and handle
+        # dequantization internally — do not cast in those cases.
+        _kv_b_w = getattr(self.kv_b_proj, "weight", None)
+        _kv_b_weight_dtype = _kv_b_w.dtype if _kv_b_w is not None else None
         if (
-            use_fp8_prefill
-            or self.kv_b_proj.weight.dtype != current_platform.fp8_dtype()
+            _kv_b_weight_dtype is not None
+            and (
+                use_fp8_prefill
+                or _kv_b_weight_dtype != current_platform.fp8_dtype()
+            )
+            and _kv_b_weight_dtype
+            in (
+                torch.float16,
+                torch.bfloat16,
+                current_platform.fp8_dtype(),
+            )
         ):
-            kv_c_normed = kv_c_normed.to(self.kv_b_proj.weight.dtype)
+            kv_c_normed = kv_c_normed.to(_kv_b_weight_dtype)

         k_pe = workspace[:toks][..., self.kv_lora_rank :].unsqueeze(1)
         kv_nope = self.kv_b_proj(kv_c_normed)[0].view(
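Read outside the diff, the new condition is a three-part gate: a `weight` tensor must exist, the original FP8-prefill rule must still hold, and the storage dtype must be a genuine float type. Below is a minimal standalone sketch of that gate; the FP8 stand-in dtype and the `maybe_cast`/`FLOAT_DTYPES` names are illustrative assumptions, not vLLM API.

```python
import torch

FP8 = torch.float8_e4m3fn  # stand-in for current_platform.fp8_dtype()
FLOAT_DTYPES = (torch.float16, torch.bfloat16, FP8)

def maybe_cast(
    x: torch.Tensor, layer: torch.nn.Module, use_fp8_prefill: bool
) -> torch.Tensor:
    """Cast x to the layer's weight dtype only when that is safe."""
    w = getattr(layer, "weight", None)
    if w is None or w.dtype not in FLOAT_DTYPES:
        # No float weight tensor: integer-packed quantized storage
        # (e.g. Marlin int32 tiles, INT4 weight_packed) dequantizes
        # internally and expects the BF16 input unchanged.
        return x
    if w.dtype == FP8 and not use_fp8_prefill:
        # FP8 weights without FP8 prefill: the layer quantizes the
        # model-dtype input internally, so do not pre-cast to FP8.
        return x
    return x.to(w.dtype)

# Example: a half-precision layer triggers the cast.
x = torch.randn(4, 8, dtype=torch.bfloat16)
fp16_layer = torch.nn.Linear(8, 8, dtype=torch.float16)
assert maybe_cast(x, fp16_layer, use_fp8_prefill=False).dtype == torch.float16
```

The key design point is checking the *storage* dtype rather than assuming a float weight: quantized backends that pack weights into integers (or store them under another attribute entirely) must see the activation untouched.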
vllm/model_executor/models/glm4_moe_lite.py (2 changes: 1 addition & 1 deletion)

@@ -506,7 +506,7 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
                 param, "weight_loader", default_weight_loader
             )
             weight_loader(param, loaded_weight)
-            if not is_fusion_moe_shared_experts_layer:
+            if not is_fusion_moe_shared_experts_layer and name is not None:
                 loaded_params.add(name)

         return loaded_params
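The second hunk keeps `None` out of the `set[str]` that `load_weights` returns, which callers compare against expected parameter names. A minimal sketch of the guarded behavior follows; the `record` helper and the parameter string are hypothetical, and it assumes an upstream path can leave `name` unset.

```python
loaded_params: set[str] = set()

def record(name: str | None, is_fusion_moe_shared_experts_layer: bool) -> None:
    # Mirrors the patched condition: skip fused shared-expert layers
    # and, after this change, any entry whose name has been cleared.
    if not is_fusion_moe_shared_experts_layer and name is not None:
        loaded_params.add(name)

record("model.layers.0.mlp.gate.weight", False)  # normal weight: recorded
record(None, False)  # name cleared upstream: skipped instead of adding None
assert None not in loaded_params
```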