From 43fe1368453cbbbccbda2e60be2f30ae900cdad1 Mon Sep 17 00:00:00 2001 From: Brayden Zhong Date: Wed, 10 Dec 2025 11:02:54 -0800 Subject: [PATCH] server_args: select trtllm_mha via is_sm100_supported() instead of is_blackwell() --- python/sglang/srt/server_args.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py index 8fc9062ca6ea..3fcf2b7d4c3f 100644 --- a/python/sglang/srt/server_args.py +++ b/python/sglang/srt/server_args.py @@ -40,7 +40,6 @@ get_device, get_device_memory_capacity, get_device_sm, - is_blackwell, is_blackwell_supported, is_cuda, is_fa3_default_architecture, @@ -1350,7 +1349,8 @@ def _handle_attention_backend_compatibility(self): 1. Models with MHA Architecture (e.g: Llama, QWen) 1.1 We will turn on FA3 on hopper unless user use spec decode with topk > 1 or page_size > 1. - 1.2 Use trtllm_mha for Blackwell excluding spec with topk > 1. + 1.2 Use trtllm_mha for SM100/SM103 (Blackwell B200/GB200/B300) excluding spec with topk > 1. + Note: trtllm_mha does not support SM120, which will fall back to flashinfer. 1.3 In other cases, we will use flashinfer if available, otherwise use triton. 2. Models with MLA Architecture and using FA3 2.1 We will use FA3 backend on hopper. @@ -1366,7 +1366,7 @@ def _handle_attention_backend_compatibility(self): and is_fa3_default_architecture(self.model_config.hf_config) ): self.attention_backend = "fa3" - elif is_blackwell() and is_no_spec_infer_or_topk_one(self): + elif is_sm100_supported() and is_no_spec_infer_or_topk_one(self): self.attention_backend = "trtllm_mha" elif is_hip(): self.attention_backend = "aiter"