diff --git a/python/sglang/srt/layers/attention/vision.py b/python/sglang/srt/layers/attention/vision.py index 087e76baf934..4c3f9e2f2721 100644 --- a/python/sglang/srt/layers/attention/vision.py +++ b/python/sglang/srt/layers/attention/vision.py @@ -888,7 +888,9 @@ def _determine_attention_backend(self, passed_backend: Optional[str]) -> str: Priority: server args override > constructor arg > platform default. Platform defaults: - - CUDA: "triton_attn" + - CUDA (Hopper SM90): "fa3" + - CUDA (Blackwell SM100): "fa4" + - CUDA (other): "triton_attn" - Non-CUDA: "sdpa" """ override_backend = get_global_server_args().mm_attention_backend @@ -900,6 +902,8 @@ def _determine_attention_backend(self, passed_backend: Optional[str]) -> str: major, minor = get_device_capability() if major == 9: backend = "fa3" + elif major == 10: + backend = "fa4" else: backend = "triton_attn" elif _is_hip: