From 6f8beefac04bdc3f30afa7b36efc40727840dab2 Mon Sep 17 00:00:00 2001 From: "Shu Wang." Date: Tue, 7 Apr 2026 14:34:23 +0000 Subject: [PATCH 1/2] fix(triton): add sm100 block sizes to fix PTX register exhaustion for large head dims and default to trtllm_mha on sm100. --- .../srt/layers/attention/triton_ops/extend_attention.py | 8 ++++++++ python/sglang/srt/server_args.py | 9 +++++++-- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/python/sglang/srt/layers/attention/triton_ops/extend_attention.py b/python/sglang/srt/layers/attention/triton_ops/extend_attention.py index 8ce0e35ff792..fb57df6ed0c5 100644 --- a/python/sglang/srt/layers/attention/triton_ops/extend_attention.py +++ b/python/sglang/srt/layers/attention/triton_ops/extend_attention.py @@ -72,6 +72,14 @@ def _get_block_sizes_for_extend_attention(Lq: int, Lv: int): BLOCK_M, BLOCK_N = (64, 64) else: BLOCK_M, BLOCK_N = (32, 32) + elif _is_cuda and CUDA_CAPABILITY[0] == 10: + # Blackwell data-center architecture (GB200, B200, sm_100a) + # sm_100a has different register constraints from Hopper; Hopper block sizes + # cause PTX register exhaustion (>255 regs) for large head dims (Lq=512). + if Lq <= 256: + BLOCK_M, BLOCK_N = (64, 64) + else: + BLOCK_M, BLOCK_N = (16, 64) elif _is_cuda and CUDA_CAPABILITY[0] >= 9: # Hopper architecture (H100, etc.) 
if Lq <= 256: diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py index 74445c9cd1f2..5b5ac197726d 100644 --- a/python/sglang/srt/server_args.py +++ b/python/sglang/srt/server_args.py @@ -1889,8 +1889,13 @@ def _handle_model_specific_adjustments(self): self.disable_hybrid_swa_memory = True elif model_arch == "Gemma4ForConditionalGeneration": if self.is_attention_backend_not_set(): - self.attention_backend = "triton" - logger.info("Use triton as default attention backend for Gemma4") + if is_sm100_supported(): + self.attention_backend = "trtllm_mha" + else: + self.attention_backend = "triton" + logger.info( + f"Use {self.attention_backend} as default attention backend for Gemma4" + ) elif model_arch in ["Exaone4ForCausalLM", "ExaoneMoEForCausalLM"]: if hf_config.sliding_window_pattern is not None: logger.warning( From 64f4f2cbbe9a1ec0209830734e658229639fe77d Mon Sep 17 00:00:00 2001 From: Shu Wang Date: Tue, 7 Apr 2026 13:31:43 -0500 Subject: [PATCH 2/2] Fix for fp8 kvcache. --- python/sglang/srt/server_args.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py index 5b5ac197726d..74445c9cd1f2 100644 --- a/python/sglang/srt/server_args.py +++ b/python/sglang/srt/server_args.py @@ -1889,13 +1889,8 @@ def _handle_model_specific_adjustments(self): self.disable_hybrid_swa_memory = True elif model_arch == "Gemma4ForConditionalGeneration": if self.is_attention_backend_not_set(): - if is_sm100_supported(): - self.attention_backend = "trtllm_mha" - else: - self.attention_backend = "triton" - logger.info( - f"Use {self.attention_backend} as default attention backend for Gemma4" - ) + self.attention_backend = "triton" + logger.info("Use triton as default attention backend for Gemma4") elif model_arch in ["Exaone4ForCausalLM", "ExaoneMoEForCausalLM"]: if hf_config.sliding_window_pattern is not None: logger.warning(