diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py index 9ec274d2a75c..aefedb3ad8f7 100644 --- a/python/sglang/srt/server_args.py +++ b/python/sglang/srt/server_args.py @@ -2188,19 +2188,6 @@ def _handle_mamba_radix_cache( not self.enable_mamba_extra_buffer() ), f"mamba extra_buffer is not supported for {model_arch} model" - # FlashInfer GDN decode is incompatible with no_buffer scheduling. - # See https://github.com/sgl-project/sglang/issues/20791 - if ( - self.linear_attn_decode_backend == "flashinfer" - and self.mamba_scheduler_strategy == "no_buffer" - ): - raise ValueError( - "FlashInfer GDN decode (--linear-attn-decode-backend flashinfer) is not " - "compatible with --mamba-scheduler-strategy no_buffer. " - "Please use --mamba-scheduler-strategy extra_buffer instead. " - "See https://github.com/sgl-project/sglang/issues/20791" - ) - if self.enable_mamba_extra_buffer(): # extra_buffer if self.disable_radix_cache: raise ValueError( diff --git a/test/registered/4-gpu-models/test_qwen35_models.py b/test/registered/4-gpu-models/test_qwen35_models.py index 828857b55c0f..0b76daaa1c30 100644 --- a/test/registered/4-gpu-models/test_qwen35_models.py +++ b/test/registered/4-gpu-models/test_qwen35_models.py @@ -29,15 +29,11 @@ class TestQwen35FP4(CustomTestCase): def test_gsm8k(self): - base_args = [ + common_args = [ "--tp-size", "4", "--chunked-prefill-size", "2048", - "--mamba-scheduler-strategy", - "extra_buffer", - "--mamba-track-interval", - "128", "--mamba-ssm-dtype", "bfloat16", "--max-running-requests", @@ -51,19 +47,35 @@ def test_gsm8k(self): "--model-loader-extra-config", '{"enable_multithread_load": true,"num_threads": 64}', ] + extra_buffer_args = common_args + [ + "--mamba-scheduler-strategy", + "extra_buffer", + "--mamba-track-interval", + "128", + ] + no_buffer_args = common_args + [ + "--mamba-scheduler-strategy", + "no_buffer", + ] variants = [ ModelLaunchSettings( QWEN35_FP4_MODEL, - extra_args=base_args, + extra_args=extra_buffer_args, variant="Triton", ), - # TODO: Fix this and re-enable it - # ModelLaunchSettings( - # QWEN35_FP4_MODEL, - # extra_args=base_args + ["--linear-attn-decode-backend", "flashinfer"], - # variant="FlashInfer", - # ), + ModelLaunchSettings( + QWEN35_FP4_MODEL, + extra_args=extra_buffer_args + + ["--linear-attn-decode-backend", "flashinfer"], + variant="FlashInfer", + ), + ModelLaunchSettings( + QWEN35_FP4_MODEL, + extra_args=no_buffer_args + + ["--linear-attn-decode-backend", "flashinfer"], + variant="FlashInfer-NoBuffer", + ), ] run_combined_tests(