vllm-project · tjtanaa · Apr 24, 2026 · Apr 16, 2026 · Apr 17, 2026 · Apr 17, 2026
@@ -150,9 +150,11 @@ def rocm_aiter_grouped_topk(
         topk_weights = torch.empty((token, topk), dtype=torch.float32, device=device)
 
     if e_score_correction_bias is not None:
+        if e_score_correction_bias.dtype != gating_output.dtype:
+            e_score_correction_bias = e_score_correction_bias.to(gating_output.dtype)
         rocm_aiter_ops.biased_grouped_topk(
             gating_output,
-            e_score_correction_bias.to(gating_output.dtype),
+            e_score_correction_bias,
             topk_weights,
             topk_ids,
             num_expert_group,

@@ -134,9 +134,13 @@ def fused_topk_bias(
                 dtype=torch.int32 if indices_type is None else indices_type,
                 device=hidden_states.device,
             )
+            if e_score_correction_bias.dtype != gating_output.dtype:
+                e_score_correction_bias = e_score_correction_bias.to(
+                    gating_output.dtype
+                )
             rocm_aiter_ops.biased_grouped_topk(
                 gating_output,
-                e_score_correction_bias.to(gating_output.dtype),
+                e_score_correction_bias,
                 topk_weights,
                 topk_ids,
                 num_expert_group=num_expert_group,

diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py
@@ -351,6 +351,16 @@ def __init__(
             else torch.bfloat16
         )
 
+        # Pre-cast the bias to the gate output dtype once, here in __init__,
+        # so the conversion is not repeated on every forward pass. All
+        # downstream references (FusedMoE, router) hold the same
+        # nn.Parameter object, so rebinding its .data to the converted
+        # tensor is visible through every one of them.
+        # Weight loading uses copy_(), which handles the dtype conversion.
+        if self.gate.e_score_correction_bias is not None:
+            self.gate.e_score_correction_bias.data = (
+                self.gate.e_score_correction_bias.data.to(self.gate.out_dtype)
+            )
+
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         num_tokens, hidden_dim = hidden_states.shape
         hidden_states = hidden_states.view(-1, hidden_dim)