diff --git a/docs_new/docs/sglang-diffusion/attention_backends.mdx b/docs_new/docs/sglang-diffusion/attention_backends.mdx index 4aaa735bbab8..53db2133fbfd 100644 --- a/docs_new/docs/sglang-diffusion/attention_backends.mdx +++ b/docs_new/docs/sglang-diffusion/attention_backends.mdx @@ -16,7 +16,7 @@ When using the diffusers backend, `--attention-backend` is passed through to dif - **CUDA**: prefers FlashAttention (FA3/FA4) when supported; otherwise falls back to PyTorch SDPA. - **ROCm**: uses FlashAttention when available; otherwise falls back to PyTorch SDPA. - **Intel XPU**: uses XPU Flash Attention backend (fp16/bf16, head sizes 64/96/128/192/256); otherwise falls back to PyTorch SDPA. -- **MUSA**: uses FlashAttention when available; otherwise falls back to PyTorch SDPA. +- **MUSA**: uses FlashAttention when available; also supports Sage Attention when installed; otherwise falls back to PyTorch SDPA. - **MPS**: always uses PyTorch SDPA. - **NPU**: for ring attention uses FA otherwise uses PyTorch SDPA. @@ -349,10 +349,10 @@ Some backends require additional configuration. You can pass these parameters vi Yes No No - No + Yes ❌ ❌ - CUDA-only (optional dependency). + Optional dependency on CUDA and MUSA. Falls back to FlashAttention if sageattention is not installed. `sage_attn_3` diff --git a/python/sglang/multimodal_gen/README.md b/python/sglang/multimodal_gen/README.md index 385383a7006f..3a87a85734b8 100644 --- a/python/sglang/multimodal_gen/README.md +++ b/python/sglang/multimodal_gen/README.md @@ -25,7 +25,7 @@ SGLang Diffusion supports AMD Instinct GPUs through ROCm. On AMD platforms, we u ### Moore Threads/MUSA Support -SGLang Diffusion supports Moore Threads GPUs (MTGPU) through the MUSA software stack. On MUSA platforms, we use the Torch SDPA backend for attention. See the [installation guide](https://github.com/sgl-project/sglang/tree/main/docs/diffusion/installation.md) for setup instructions. 
+SGLang Diffusion supports Moore Threads GPUs (MTGPU) through the MUSA software stack. On MUSA platforms, we use FlashAttention (FA3) when available and also support Sage Attention when it is installed; otherwise, we fall back to the Torch SDPA backend. See the [installation guide](https://github.com/sgl-project/sglang/tree/main/docs/diffusion/installation.md) for setup instructions. ### Apple MPS Support diff --git a/python/sglang/multimodal_gen/runtime/platforms/musa.py b/python/sglang/multimodal_gen/runtime/platforms/musa.py index 234cd47c52b3..2b98ed4c4444 100644 --- a/python/sglang/multimodal_gen/runtime/platforms/musa.py +++ b/python/sglang/multimodal_gen/runtime/platforms/musa.py @@ -160,6 +160,23 @@ def get_attn_backend_cls_str( if selected_backend == AttentionBackendEnum.TORCH_SDPA: logger.info("Using Torch SDPA backend") return "sglang.multimodal_gen.runtime.layers.attention.backends.sdpa.SDPABackend" + elif selected_backend == AttentionBackendEnum.SAGE_ATTN: + try: + from sageattention import sageattn # noqa: F401 + + from sglang.multimodal_gen.runtime.layers.attention.backends.sage_attn import ( # noqa: F401 + SageAttentionBackend, + ) + + logger.info("Using Sage Attention backend") + + return "sglang.multimodal_gen.runtime.layers.attention.backends.sage_attn.SageAttentionBackend" + except ImportError as e: + logger.info(e) + logger.info( + "Sage Attention backend is not installed (To install it, run `pip install sageattention>=0.1.0`). Falling back to Flash Attention." + ) + target_backend = AttentionBackendEnum.FA elif selected_backend in [ AttentionBackendEnum.FA, ]: @@ -208,7 +225,7 @@ def get_attn_backend_cls_str( logger.info("Using Torch SDPA backend") return "sglang.multimodal_gen.runtime.layers.attention.backends.sdpa.SDPABackend" - logger.info("Using FlashAttention (FA3) backend on MUSA") + logger.info("Using FlashAttention (FA3) backend") return "sglang.multimodal_gen.runtime.layers.attention.backends.flash_attn.FlashAttentionBackend" @classmethod