From ca053f08858efb87ef917aefa68f278222c0ffa7 Mon Sep 17 00:00:00 2001
From: alisonshao
Date: Wed, 26 Nov 2025 16:28:24 -0800
Subject: [PATCH] Fix flashinfer cutlass MoE output shape for non-FP4-packed inputs

Fix incorrect output tensor shape calculation in ModelOptNvFp4FusedMoEMethod
that caused ValueError during server startup for DeepSeek-V3-FP4 models.
---
 python/sglang/srt/layers/quantization/modelopt_quant.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/python/sglang/srt/layers/quantization/modelopt_quant.py b/python/sglang/srt/layers/quantization/modelopt_quant.py
index 2809c7a19226..8d44584e7f26 100755
--- a/python/sglang/srt/layers/quantization/modelopt_quant.py
+++ b/python/sglang/srt/layers/quantization/modelopt_quant.py
@@ -1654,11 +1654,15 @@ def apply(
 
             output_dtype = torch.bfloat16
 
+            # If x_sf is not None, x is FP4 packed (half size), so we need * 2
+            # If x_sf is None, x is not packed, so output_col = x.shape[1]
+            output_col = x.shape[1] * 2 if x_sf is not None else x.shape[1]
+
             with use_symmetric_memory(
                 get_tp_group(), disabled=not is_allocation_symmetric()
             ):
                 symm_output = torch.empty(
-                    x.shape[0], x.shape[1] * 2, dtype=output_dtype, device=x.device
+                    x.shape[0], output_col, dtype=output_dtype, device=x.device
                 )
 
             output = flashinfer_cutlass_fused_moe(