Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 0 additions & 13 deletions python/sglang/srt/server_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -2188,19 +2188,6 @@ def _handle_mamba_radix_cache(
not self.enable_mamba_extra_buffer()
), f"mamba extra_buffer is not supported for {model_arch} model"

# FlashInfer GDN decode is incompatible with no_buffer scheduling.
# See https://github.com/sgl-project/sglang/issues/20791
if (
self.linear_attn_decode_backend == "flashinfer"
and self.mamba_scheduler_strategy == "no_buffer"
):
raise ValueError(
"FlashInfer GDN decode (--linear-attn-decode-backend flashinfer) is not "
"compatible with --mamba-scheduler-strategy no_buffer. "
"Please use --mamba-scheduler-strategy extra_buffer instead. "
"See https://github.com/sgl-project/sglang/issues/20791"
)

if self.enable_mamba_extra_buffer(): # extra_buffer
if self.disable_radix_cache:
raise ValueError(
Expand Down
36 changes: 24 additions & 12 deletions test/registered/4-gpu-models/test_qwen35_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,15 +29,11 @@

class TestQwen35FP4(CustomTestCase):
def test_gsm8k(self):
base_args = [
common_args = [
"--tp-size",
"4",
"--chunked-prefill-size",
"2048",
"--mamba-scheduler-strategy",
"extra_buffer",
"--mamba-track-interval",
"128",
"--mamba-ssm-dtype",
"bfloat16",
"--max-running-requests",
Expand All @@ -51,19 +47,35 @@ def test_gsm8k(self):
"--model-loader-extra-config",
'{"enable_multithread_load": true,"num_threads": 64}',
]
extra_buffer_args = common_args + [
"--mamba-scheduler-strategy",
"extra_buffer",
"--mamba-track-interval",
"128",
]
no_buffer_args = common_args + [
"--mamba-scheduler-strategy",
"no_buffer",
]

variants = [
ModelLaunchSettings(
QWEN35_FP4_MODEL,
extra_args=base_args,
extra_args=extra_buffer_args,
variant="Triton",
),
# TODO: Fix this and re-enable it
# ModelLaunchSettings(
# QWEN35_FP4_MODEL,
# extra_args=base_args + ["--linear-attn-decode-backend", "flashinfer"],
# variant="FlashInfer",
# ),
ModelLaunchSettings(
QWEN35_FP4_MODEL,
extra_args=extra_buffer_args
+ ["--linear-attn-decode-backend", "flashinfer"],
variant="FlashInfer",
),
ModelLaunchSettings(
QWEN35_FP4_MODEL,
extra_args=no_buffer_args
+ ["--linear-attn-decode-backend", "flashinfer"],
variant="FlashInfer-NoBuffer",
),
]

run_combined_tests(
Expand Down
Loading