From ca053f08858efb87ef917aefa68f278222c0ffa7 Mon Sep 17 00:00:00 2001
From: alisonshao <a.shao@wustl.edu>
Date: Wed, 26 Nov 2025 16:28:24 -0800
Subject: [PATCH] Fix flashinfer cutlass MoE output shape for non-FP4-packed
 inputs

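Fix the incorrect output tensor shape calculation in
ModelOptNvFp4FusedMoEMethod that caused a ValueError during server startup
for DeepSeek-V3-FP4 models.

The flashinfer cutlass MoE output buffer was always allocated with
x.shape[1] * 2 columns, which is only correct when x is FP4-packed
(x_sf is not None). When x is not packed (x_sf is None), the buffer is now
allocated with x.shape[1] columns.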
---
 python/sglang/srt/layers/quantization/modelopt_quant.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/python/sglang/srt/layers/quantization/modelopt_quant.py b/python/sglang/srt/layers/quantization/modelopt_quant.py
index 2809c7a19226..8d44584e7f26 100755
--- a/python/sglang/srt/layers/quantization/modelopt_quant.py
+++ b/python/sglang/srt/layers/quantization/modelopt_quant.py
@@ -1654,11 +1654,15 @@ def apply(
 
             output_dtype = torch.bfloat16
 
+            # When x_sf is not None, x is FP4-packed (half the columns), so the output needs x.shape[1] * 2.
+            # When x_sf is None, x is not packed, so the output width is just x.shape[1].
+            output_col = x.shape[1] * 2 if x_sf is not None else x.shape[1]
+
             with use_symmetric_memory(
                 get_tp_group(), disabled=not is_allocation_symmetric()
             ):
                 symm_output = torch.empty(
-                    x.shape[0], x.shape[1] * 2, dtype=output_dtype, device=x.device
+                    x.shape[0], output_col, dtype=output_dtype, device=x.device
                 )
 
             output = flashinfer_cutlass_fused_moe(