From 6f8beefac04bdc3f30afa7b36efc40727840dab2 Mon Sep 17 00:00:00 2001
From: "Shu Wang." <shuw@nvidia.com>
Date: Tue, 7 Apr 2026 14:34:23 +0000
Subject: [PATCH 1/2] fix(triton): add sm100 block sizes to fix PTX register
 exhaustion for large head dims and default Gemma4 to trtllm_mha on sm100.

---
 .../srt/layers/attention/triton_ops/extend_attention.py  | 8 ++++++++
 python/sglang/srt/server_args.py                         | 9 +++++++--
 2 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/python/sglang/srt/layers/attention/triton_ops/extend_attention.py b/python/sglang/srt/layers/attention/triton_ops/extend_attention.py
index 8ce0e35ff792..fb57df6ed0c5 100644
--- a/python/sglang/srt/layers/attention/triton_ops/extend_attention.py
+++ b/python/sglang/srt/layers/attention/triton_ops/extend_attention.py
@@ -72,6 +72,14 @@ def _get_block_sizes_for_extend_attention(Lq: int, Lv: int):
                 BLOCK_M, BLOCK_N = (64, 64)
             else:
                 BLOCK_M, BLOCK_N = (32, 32)
+        elif _is_cuda and CUDA_CAPABILITY[0] == 10:
+            # Blackwell data-center architecture (GB200, B200, sm_100a)
+            # sm_100a has different register constraints from Hopper; Hopper block sizes
+            # cause PTX register exhaustion (>255 regs) for large head dims (Lq=512).
+            if Lq <= 256:
+                BLOCK_M, BLOCK_N = (64, 64)
+            else:
+                BLOCK_M, BLOCK_N = (16, 64)
         elif _is_cuda and CUDA_CAPABILITY[0] >= 9:
             # Hopper architecture (H100, etc.)
             if Lq <= 256:
diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py
index 74445c9cd1f2..5b5ac197726d 100644
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -1889,8 +1889,13 @@ def _handle_model_specific_adjustments(self):
             self.disable_hybrid_swa_memory = True
         elif model_arch == "Gemma4ForConditionalGeneration":
             if self.is_attention_backend_not_set():
-                self.attention_backend = "triton"
-                logger.info("Use triton as default attention backend for Gemma4")
+                if is_sm100_supported():
+                    self.attention_backend = "trtllm_mha"
+                else:
+                    self.attention_backend = "triton"
+                logger.info(
+                    f"Use {self.attention_backend} as default attention backend for Gemma4"
+                )
         elif model_arch in ["Exaone4ForCausalLM", "ExaoneMoEForCausalLM"]:
             if hf_config.sliding_window_pattern is not None:
                 logger.warning(

From 64f4f2cbbe9a1ec0209830734e658229639fe77d Mon Sep 17 00:00:00 2001
From: Shu Wang <shuw@nvidia.com>
Date: Tue, 7 Apr 2026 13:31:43 -0500
Subject: [PATCH 2/2] Fix for fp8 kvcache: revert Gemma4 default attention backend to triton.

---
 python/sglang/srt/server_args.py | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py
index 5b5ac197726d..74445c9cd1f2 100644
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -1889,13 +1889,8 @@ def _handle_model_specific_adjustments(self):
             self.disable_hybrid_swa_memory = True
         elif model_arch == "Gemma4ForConditionalGeneration":
             if self.is_attention_backend_not_set():
-                if is_sm100_supported():
-                    self.attention_backend = "trtllm_mha"
-                else:
-                    self.attention_backend = "triton"
-                logger.info(
-                    f"Use {self.attention_backend} as default attention backend for Gemma4"
-                )
+                self.attention_backend = "triton"
+                logger.info("Use triton as default attention backend for Gemma4")
         elif model_arch in ["Exaone4ForCausalLM", "ExaoneMoEForCausalLM"]:
             if hf_config.sliding_window_pattern is not None:
                 logger.warning(