diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py index bc17db6f89e0..e8fdda3f3622 100644 --- a/python/sglang/srt/server_args.py +++ b/python/sglang/srt/server_args.py @@ -1815,6 +1815,21 @@ def _handle_speculative_decoding(self): ): self.speculative_draft_model_revision = "main" + # Avoid using flashinfer_trtllm for speculative MoE runner backend by default + # TODO: Remove this block after verifying no accuracy regression with flashinfer_trtllm speculative backend + from sglang.srt.layers.moe.utils import MoeRunnerBackend + + if self.speculative_moe_runner_backend is None: + self.speculative_moe_runner_backend = ( + "auto" + if self.moe_runner_backend == "flashinfer_trtllm" + else self.moe_runner_backend + ) + else: + assert not MoeRunnerBackend( + self.speculative_moe_runner_backend + ).is_flashinfer_trtllm(), "Currently speculative MoE runner backend cannot be flashinfer_trtllm for risk in some draft models." + if self.speculative_algorithm == "NEXTN": self.speculative_algorithm = "EAGLE" diff --git a/test/srt/test_deepseek_v3_fp4_4gpu.py b/test/srt/test_deepseek_v3_fp4_4gpu.py index 1f236f0953c8..2b4f94889cd3 100644 --- a/test/srt/test_deepseek_v3_fp4_4gpu.py +++ b/test/srt/test_deepseek_v3_fp4_4gpu.py @@ -172,7 +172,7 @@ def test_bs_1_speed(self): f"{speed=:.2f} token/s\n" ) - self.assertGreater(acc_length, 2.04) + self.assertGreater(acc_length, 2.65) self.assertGreater(speed, 150)