From c2ce9b4c52ab63280ba0e9083963827843796e21 Mon Sep 17 00:00:00 2001
From: Shu Wang
Date: Mon, 11 May 2026 14:39:09 -0500
Subject: [PATCH] Enable trtllm_mha as gemma4 default attn backend.

When no attention backend has been set for Gemma4ForConditionalGeneration,
default to "trtllm_mha" if is_sm100_supported() returns true and to
"triton" otherwise. An attention backend that has already been set is
left untouched, matching the guard used by the neighbouring
MossVLForConditionalGeneration branch.

---
Review notes (text between "---" and the diff is not part of the commit):
- The is_attention_backend_not_set() guard is kept; the earlier revision of
  this patch dropped it and overwrote an explicitly chosen backend.
- Leading whitespace of the diff lines was reconstructed by hand; verify it
  against python/sglang/srt/server_args.py before applying.
- The blob ids on the "index" line are carried over from the earlier
  revision and do not describe this hand-edited content.

 python/sglang/srt/server_args.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py
index 1660a24179be..a8bea66b9688 100644
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -2171,9 +2171,14 @@ def _handle_model_specific_adjustments(self):
             )
             self.disable_hybrid_swa_memory = True
         elif model_arch == "Gemma4ForConditionalGeneration":
             if self.is_attention_backend_not_set():
-                self.attention_backend = "triton"
-                logger.info("Use triton as default attention backend for Gemma4")
+                if is_sm100_supported():
+                    self.attention_backend = "trtllm_mha"
+                else:
+                    self.attention_backend = "triton"
+                logger.info(
+                    f"Use {self.attention_backend} as default attention backend for Gemma4"
+                )
         elif model_arch == "MossVLForConditionalGeneration":
             if self.is_attention_backend_not_set():
                 self.prefill_attention_backend = "flashinfer"