From 533805d2885d0c9e830e10b8687935cef4469cc2 Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Tue, 2 Dec 2025 16:56:13 +0800 Subject: [PATCH 1/3] Default MHA attention backend to trtllm_mha on Blackwell --- python/sglang/srt/server_args.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py index 526465ee5886..df2cfe399fdc 100644 --- a/python/sglang/srt/server_args.py +++ b/python/sglang/srt/server_args.py @@ -40,6 +40,7 @@ get_device, get_device_memory_capacity, get_device_sm, + is_blackwell, is_blackwell_supported, is_cuda, is_fa3_default_architecture, @@ -1283,7 +1284,8 @@ def _handle_attention_backend_compatibility(self): 1. Models with MHA Architecture (e.g: Llama, QWen) 1.1 We will turn on FA3 on hopper unless user use spec decode with topk > 1 or page_size > 1. - 1.2 In other cases, we will use flashinfer if available, otherwise use triton. + 1.2 Use trtllm_mha for Blackwell excluding spec with topk > 1. + 1.3 In other cases, we will use flashinfer if available, otherwise use triton. 2. Models with MLA Architecture and using FA3 2.1 We will use FA3 backend on hopper. 2.2 We will use Flashinfer backend on blackwell. 
@@ -1298,6 +1300,8 @@ def _handle_attention_backend_compatibility(self): and is_fa3_default_architecture(self.model_config.hf_config) ): self.attention_backend = "fa3" + elif is_blackwell() and is_no_spec_infer_or_topk_one(): + self.attention_backend = "trtllm_mha" elif is_hip(): self.attention_backend = "aiter" elif is_npu(): From ed38f970adfcbab69de786cdba74ed9cb10a0d52 Mon Sep 17 00:00:00 2001 From: fzyzcjy <5236035+fzyzcjy@users.noreply.github.com> Date: Wed, 3 Dec 2025 09:39:04 +0800 Subject: [PATCH 2/3] Pass self to is_no_spec_infer_or_topk_one in server_args.py --- python/sglang/srt/server_args.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py index df2cfe399fdc..d10819c0dbe2 100644 --- a/python/sglang/srt/server_args.py +++ b/python/sglang/srt/server_args.py @@ -1300,7 +1300,7 @@ def _handle_attention_backend_compatibility(self): and is_fa3_default_architecture(self.model_config.hf_config) ): self.attention_backend = "fa3" - elif is_blackwell() and is_no_spec_infer_or_topk_one(): + elif is_blackwell() and is_no_spec_infer_or_topk_one(self): self.attention_backend = "trtllm_mha" elif is_hip(): self.attention_backend = "aiter" From e4babc23fb06cbb2acce29775209e76a88b60426 Mon Sep 17 00:00:00 2001 From: fzyzcjy <5236035+fzyzcjy@users.noreply.github.com> Date: Thu, 4 Dec 2025 09:02:12 +0800 Subject: [PATCH 3/3] Set decode attention backend to flashinfer in test_flash_attention_4.py --- test/srt/test_flash_attention_4.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/srt/test_flash_attention_4.py b/test/srt/test_flash_attention_4.py index 4322263c459e..44623a132c3a 100644 --- a/test/srt/test_flash_attention_4.py +++ b/test/srt/test_flash_attention_4.py @@ -22,6 +22,8 @@ def setUpClass(cls): "0.8", "--prefill-attention-backend", "fa4", + "--decode-attention-backend", + "flashinfer", ] cls.process = popen_launch_server( cls.model,