From 7b325bf717b57a15f3ce3a0d370fcc632800705a Mon Sep 17 00:00:00 2001 From: Sam Date: Wed, 3 Dec 2025 08:47:26 +0000 Subject: [PATCH 1/2] Fix trtllm-moe-fp4-renorm for Qwen series models --- python/sglang/srt/layers/moe/fused_moe_triton/layer.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/layer.py b/python/sglang/srt/layers/moe/fused_moe_triton/layer.py index 6e0487a1ddc8..413c9c5ea6aa 100644 --- a/python/sglang/srt/layers/moe/fused_moe_triton/layer.py +++ b/python/sglang/srt/layers/moe/fused_moe_triton/layer.py @@ -1117,12 +1117,16 @@ def forward(self, hidden_states: torch.Tensor, topk_output: TopKOutput): topk_config = topk_output.topk_config hs_fp4, hs_scale_linear = self._quantize_hidden_states_fp4(hidden_states) - router_logits = router_logits.to(torch.float32) routing_method_type = self.routing_method_type assert ( routing_method_type is not None ), "flashinfer trtllm moe nvfp4 backend has not been adapted for the current moe layer, you can set routing_method_type (See definition of RoutingMethodType please) for the moe layer explicitly for a quick adaptation." + # DeepSeekV3 style routing requires float32 router logits, + # see this PR for details: https://github.com/flashinfer-ai/flashinfer/commit/d84e1d560da0a27961c19ca788d96c19cb9dcfb6 + if routing_method_type == RoutingMethodType.DeepSeekV3: + router_logits = router_logits.to(torch.float32) + correction_bias = ( None if topk_config.correction_bias is None From bf298d9462051e426218229c80bf9b6bba2cb246 Mon Sep 17 00:00:00 2001 From: Sam Date: Wed, 3 Dec 2025 13:43:33 +0000 Subject: [PATCH 2/2] Re-enable trtllm-mha --- python/sglang/srt/server_args.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py index 7a718795a449..cbb216253672 100644 --- a/python/sglang/srt/server_args.py +++ b/python/sglang/srt/server_args.py @@ -1220,7 +1220,6 @@ def _handle_model_specific_adjustments(self): ) self.disable_overlap_schedule = True if is_sm100_supported(): - self.attention_backend = "triton" quantization_config = getattr(hf_config, "quantization_config", None) quant_method = ( quantization_config.get("quant_method")