diff --git a/python/sglang/srt/layers/attention/triton_ops/extend_attention.py b/python/sglang/srt/layers/attention/triton_ops/extend_attention.py index 8ce0e35ff792..fb57df6ed0c5 100644 --- a/python/sglang/srt/layers/attention/triton_ops/extend_attention.py +++ b/python/sglang/srt/layers/attention/triton_ops/extend_attention.py @@ -72,6 +72,14 @@ def _get_block_sizes_for_extend_attention(Lq: int, Lv: int): BLOCK_M, BLOCK_N = (64, 64) else: BLOCK_M, BLOCK_N = (32, 32) + elif _is_cuda and CUDA_CAPABILITY[0] == 10: + # Blackwell data-center architecture (GB200, B200, sm_100a) + # sm_100a has different register constraints from Hopper; Hopper block sizes + # cause PTX register exhaustion (>255 regs) for large head dims (Lq=512). + if Lq <= 256: + BLOCK_M, BLOCK_N = (64, 64) + else: + BLOCK_M, BLOCK_N = (16, 64) elif _is_cuda and CUDA_CAPABILITY[0] >= 9: # Hopper architecture (H100, etc.) if Lq <= 256: