From fb7144ad09375832788297063937408d7ec891dc Mon Sep 17 00:00:00 2001
From: Olga Miroshnichenko
Date: Thu, 4 Jun 2026 02:28:24 -0500
Subject: [PATCH] [ROCm][Perf] Enable torch.compile fusion passes on GLM-4 MTP draft

Glm4MoeMTP is not decorated with @support_torch_compile, so the MTP
draft forward executes as eager Python and misses every Inductor fusion
pass that the target forward benefits from, most notably the AITER
allreduce + RMSNorm fusion, the RMSNorm + quant fusion, and the
silu+mul+fp8-quant fusion.

Add the decorator to bring MTP in line with the canonical DeepSeekMTP
pattern (vllm/model_executor/models/deepseek_mtp.py, L185) and make the
draft eligible for the same compile-time fusions as the target.
dynamic_arg_dims is inferred from the existing forward annotations (the
four arguments annotated as Tensor | None / IntermediateTensors | None
are marked dynamic along dim 0), exactly as for DeepSeekMTP.

Measured on top of #44313 HEAD with GLM-4.7-FP8 TP=4 + EP + MTP
num_speculative_tokens=2 + ROCM_AITER_UNIFIED_ATTN:

- FSE=0 arm: +2.1% output throughput, -5.4% P99 TPOT, -8.5% mean TTFT
  (geomean across 9 cells). 7 of 9 cells improve.
- FSE=1 arm: flat throughput (within 0.3%), -4.0% P99 TPOT - a clean
  tail-latency improvement at no cost.
- gsm8k 5-shot accuracy unchanged within 1 sigma on both arms.
- Spec-decode acceptance length / rate unchanged within noise.

Signed-off-by: Olga Miroshnichenko --- vllm/model_executor/models/glm4_moe_mtp.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vllm/model_executor/models/glm4_moe_mtp.py b/vllm/model_executor/models/glm4_moe_mtp.py index 791ecabebebc..111c66d281a7 100644 --- a/vllm/model_executor/models/glm4_moe_mtp.py +++ b/vllm/model_executor/models/glm4_moe_mtp.py @@ -30,6 +30,7 @@ import torch.nn as nn from transformers import PretrainedConfig +from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, ParallelConfig, VllmConfig from vllm.model_executor.layers.fused_moe import ( FusedMoE, @@ -187,6 +188,7 @@ def compute_logits( return logits +@support_torch_compile class Glm4MoeMTP(nn.Module, Glm4MixtureOfExperts): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__()