From 7b325bf717b57a15f3ce3a0d370fcc632800705a Mon Sep 17 00:00:00 2001
From: Sam <lsam@nvidia.com>
Date: Wed, 3 Dec 2025 08:47:26 +0000
Subject: [PATCH 1/2] Fix trtllm-moe-fp4-renorm for Qwen series models

---
 python/sglang/srt/layers/moe/fused_moe_triton/layer.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/layer.py b/python/sglang/srt/layers/moe/fused_moe_triton/layer.py
index 6e0487a1ddc8..413c9c5ea6aa 100644
--- a/python/sglang/srt/layers/moe/fused_moe_triton/layer.py
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/layer.py
@@ -1117,12 +1117,16 @@ def forward(self, hidden_states: torch.Tensor, topk_output: TopKOutput):
         topk_config = topk_output.topk_config
 
         hs_fp4, hs_scale_linear = self._quantize_hidden_states_fp4(hidden_states)
-        router_logits = router_logits.to(torch.float32)
         routing_method_type = self.routing_method_type
         assert (
             routing_method_type is not None
         ), "flashinfer trtllm moe nvfp4 backend has not been adapted for the current moe layer, you can set routing_method_type (See definition of RoutingMethodType please) for the moe layer explicitly for a quick adaptation."
 
+        # Only DeepSeekV3-style routing requires float32 router logits; for every other
+        # routing method the logits keep their original dtype. See this flashinfer commit for details: https://github.com/flashinfer-ai/flashinfer/commit/d84e1d560da0a27961c19ca788d96c19cb9dcfb6
+        if routing_method_type == RoutingMethodType.DeepSeekV3:
+            router_logits = router_logits.to(torch.float32)
+
         correction_bias = (
             None
             if topk_config.correction_bias is None

From bf298d9462051e426218229c80bf9b6bba2cb246 Mon Sep 17 00:00:00 2001
From: Sam <lsam@nvidia.com>
Date: Wed, 3 Dec 2025 13:43:33 +0000
Subject: [PATCH 2/2] Re-enable trtllm-mha

---
 python/sglang/srt/server_args.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py
index 7a718795a449..cbb216253672 100644
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -1220,7 +1220,6 @@ def _handle_model_specific_adjustments(self):
                 )
                 self.disable_overlap_schedule = True
             if is_sm100_supported():
-                self.attention_backend = "triton"
                 quantization_config = getattr(hf_config, "quantization_config", None)
                 quant_method = (
                     quantization_config.get("quant_method")