From e6e1cdad1c5eb48bcbd7117289af4f52c5e95647 Mon Sep 17 00:00:00 2001 From: Yongye Zhu Date: Fri, 5 Jun 2026 22:35:06 +0000 Subject: [PATCH] [Bugfix][Kernel] Fix mHC fused-RMSNorm big-fuse miscompile for hidden_size != 4096 mhc_pre_big_fuse_with_norm_tilelang pipelined the fused-RMSNorm pass at num_stages=3. Combined with the loop-carried sumsq reduction and the persistent output_shared buffer, the tilelang software pipeliner miscompiles it: NaN when hidden_size // 1024 <= 3 (e.g. 2048, 3072) and finite-but-wrong, slightly non-deterministic output when hidden_size // 1024 > 4 (e.g. 7168, the DeepSeek-V4 size). Only hidden_size=4096 (trip count 4) was correct. Drop to num_stages=2, matching the correct no-norm sibling kernel mhc_pre_big_fuse_tilelang. Verified against an fp32 RMSNorm reference: all hidden sizes 2048-8192 now match to ~1.6e-3 (bf16 floor), deterministically, across token counts 1..16384. num_stages=3 without the loop-carried state (the no-norm kernel) compiles correctly, confirming the issue is the pipeliner's handling of that combination. Signed-off-by: Yongye Zhu Co-authored-by: Claude --- vllm/model_executor/kernels/mhc/tilelang_kernels.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/kernels/mhc/tilelang_kernels.py b/vllm/model_executor/kernels/mhc/tilelang_kernels.py index 5cc91a470a31..9fa13041b3f3 100644 --- a/vllm/model_executor/kernels/mhc/tilelang_kernels.py +++ b/vllm/model_executor/kernels/mhc/tilelang_kernels.py @@ -309,7 +309,7 @@ def mhc_pre_big_fuse_with_norm_tilelang( sumsq_per_pos = T.alloc_fragment(hidden_block, T.float32) T.clear(sumsq_per_pos) - for i0_h in T.Pipelined(hidden_size // hidden_block, num_stages=3): + for i0_h in T.Pipelined(hidden_size // hidden_block, num_stages=2): xs = T.alloc_shared((hc_mult, hidden_block), T.bfloat16) xl = T.alloc_fragment((hc_mult, hidden_block), T.float32) T.copy(residual[i, 0, i0_h * hidden_block], xs)