Merged
19 changes: 18 additions & 1 deletion vllm_ascend/ops/layernorm.py
@@ -37,11 +37,28 @@ def __init__(
        super().__init__(hidden_size, eps, var_hidden_size, has_weight, dtype)
        vllm_config = get_current_vllm_config()
        self.bias = None
        self.bias_loaded = False
Contributor (severity: high):

While adding the bias_loaded flag is a good optimization, it's not used consistently throughout the forward_oot method. The branch that handles a non-None residual (lines 70-79) still checks if self.bias is not None and passes self.bias to the custom op unconditionally. This can lead to unnecessary bias additions with a zero tensor, which this PR aims to prevent. To make the optimization effective in all cases, this logic should also be updated to use self.bias_loaded.
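The rationale for gating on the flag rather than on `self.bias is not None` can be illustrated in isolation. A minimal stand-in in plain Python, with no NPU ops; `NormBias`, `load_bias`, and `apply` are illustrative names, not vLLM APIs:

```python
class NormBias:
    """Minimal stand-in showing why bias_loaded, not `bias is not None`, is the gate."""

    def __init__(self, size, expects_bias):
        # The placeholder is allocated eagerly when quantization may produce a
        # norm bias, but it stays all-zero until a checkpoint actually loads one.
        self.bias = [0.0] * size if expects_bias else None
        self.bias_loaded = False

    def load_bias(self, values):
        assert len(values) == len(self.bias)
        self.bias = list(values)
        self.bias_loaded = True

    def apply(self, x):
        # Gating on bias_loaded skips the wasted add of an all-zero placeholder
        # when the checkpoint carried no norm bias.
        if self.bias_loaded:
            return [xi + bi for xi, bi in zip(x, self.bias)]
        return x
```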


        # quantization with anti_method m4 will generate a non-zero norm bias
        if vllm_config.quant_config is not None and any(
            "norm.bias" in name for name in vllm_config.quant_config.quant_description
        ):
            self.bias = torch.nn.Parameter(torch.zeros(hidden_size), requires_grad=False)
            self.bias.weight_loader = self._bias_weight_loader

    def _bias_weight_loader(self, param: torch.nn.Parameter, loaded_weight: torch.Tensor) -> None:
Contributor:

It would be better to wrap the original weight loader so we don't have to reimplement the details here.

Collaborator:

I think the custom op has no weight loader function? I don't get your point. If you still have any questions about this, please feel free to open a new PR. This PR is ready for merge now.

        if param.numel() == 1 and loaded_weight.numel() == 1:
            # Scalar values are sometimes not treated as tensors with shapes,
            # so if both param and loaded_weight are scalars,
            # "broadcast" (fill) instead of copying
            param.data.fill_(loaded_weight.item())
        else:
            assert param.size() == loaded_weight.size(), (
                f"Attempted to load weight ({loaded_weight.size()}) into parameter ({param.size()})"
            )
            param.data.copy_(loaded_weight)
        self.bias_loaded = True
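The loader's branch logic (a scalar-to-scalar load is "broadcast" via `fill_`, matching shapes are copied via `copy_`, and anything else is rejected) can be sketched without torch; `load_mode` is a hypothetical helper for illustration, not part of vLLM:

```python
def load_mode(param_shape, weight_shape):
    """Decide how a loaded weight maps onto a parameter.

    Mirrors the branches in _bias_weight_loader: a scalar loaded into a
    scalar parameter is "broadcast" (fill_), matching shapes are copied
    (copy_), and any other combination is a shape mismatch.
    """
    def numel(shape):
        n = 1
        for dim in shape:
            n *= dim
        return n

    if numel(param_shape) == 1 and numel(weight_shape) == 1:
        return "broadcast"
    if tuple(param_shape) != tuple(weight_shape):
        raise ValueError(
            f"Attempted to load weight ({tuple(weight_shape)}) "
            f"into parameter ({tuple(param_shape)})"
        )
    return "copy"
```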

    def forward_oot(
        self,
@@ -62,7 +79,7 @@ def forward_oot(
            return x, residual

        x, residual = torch_npu.npu_rms_norm(x, self.weight, self.variance_epsilon)
-        if self.bias is not None:
+        if self.bias_loaded:
            x.add_(self.bias)

        weight_prefetch_method = get_weight_prefetch_method()