vllm-project · whx-sjtu · Oct 20, 2025 · gemini-code-assist · Oct 20, 2025
@@ -75,7 +75,9 @@ def __init__(self, vllm_config: VllmConfig, prefix: str) -> None:
             topk_indices_buffer = None
 
         self.shared_head = SharedHead(
-            config=config, prefix=prefix, quant_config=quant_config
+            config=config,
+            prefix=maybe_prefix(prefix, "shared_head"),
+            quant_config=quant_config,
         )
         self.mtp_block = DeepseekV2DecoderLayer(
             vllm_config,

diff --git a/vllm/model_executor/models/glm4_moe_mtp.py b/vllm/model_executor/models/glm4_moe_mtp.py
@@ -79,7 +79,9 @@ def __init__(
         self.hnorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
         self.eh_proj = nn.Linear(config.hidden_size * 2, config.hidden_size, bias=False)
         self.shared_head = SharedHead(
-            config=config, prefix=prefix, quant_config=quant_config
+            config=config,
+            prefix=maybe_prefix(prefix, "shared_head"),
+            quant_config=quant_config,
         )
         self.mtp_block = Glm4MoeDecoderLayer(
             config=config,