diff --git a/docs_new/docs/sglang-diffusion/attention_backends.mdx b/docs_new/docs/sglang-diffusion/attention_backends.mdx
index 4aaa735bbab8..53db2133fbfd 100644
--- a/docs_new/docs/sglang-diffusion/attention_backends.mdx
+++ b/docs_new/docs/sglang-diffusion/attention_backends.mdx
@@ -16,7 +16,7 @@ When using the diffusers backend, `--attention-backend` is passed through to dif
- **CUDA**: prefers FlashAttention (FA3/FA4) when supported; otherwise falls back to PyTorch SDPA.
- **ROCm**: uses FlashAttention when available; otherwise falls back to PyTorch SDPA.
- **Intel XPU**: uses XPU Flash Attention backend (fp16/bf16, head sizes 64/96/128/192/256); otherwise falls back to PyTorch SDPA.
-- **MUSA**: uses FlashAttention when available; otherwise falls back to PyTorch SDPA.
+- **MUSA**: uses FlashAttention when available; also supports Sage Attention when installed; otherwise falls back to PyTorch SDPA.
- **MPS**: always uses PyTorch SDPA.
- **NPU**: for ring attention uses FA otherwise uses PyTorch SDPA.
@@ -349,10 +349,10 @@ Some backends require additional configuration. You can pass these parameters vi
Yes |
No |
No |
- No |
+ Yes |
❌ |
❌ |
- CUDA-only (optional dependency). |
+ Optional dependency on CUDA and MUSA. Falls back to FlashAttention if sageattention is not installed. |
| `sage_attn_3` |
diff --git a/python/sglang/multimodal_gen/README.md b/python/sglang/multimodal_gen/README.md
index 385383a7006f..3a87a85734b8 100644
--- a/python/sglang/multimodal_gen/README.md
+++ b/python/sglang/multimodal_gen/README.md
@@ -25,7 +25,7 @@ SGLang Diffusion supports AMD Instinct GPUs through ROCm. On AMD platforms, we u
### Moore Threads/MUSA Support
-SGLang Diffusion supports Moore Threads GPUs (MTGPU) through the MUSA software stack. On MUSA platforms, we use the Torch SDPA backend for attention. See the [installation guide](https://github.com/sgl-project/sglang/tree/main/docs/diffusion/installation.md) for setup instructions.
+SGLang Diffusion supports Moore Threads GPUs (MTGPU) through the MUSA software stack. On MUSA platforms, we use FlashAttention (FA3) when available and also support Sage Attention when it is installed; otherwise, we fall back to the Torch SDPA backend. See the [installation guide](https://github.com/sgl-project/sglang/tree/main/docs/diffusion/installation.md) for setup instructions.
### Apple MPS Support
diff --git a/python/sglang/multimodal_gen/runtime/platforms/musa.py b/python/sglang/multimodal_gen/runtime/platforms/musa.py
index 234cd47c52b3..2b98ed4c4444 100644
--- a/python/sglang/multimodal_gen/runtime/platforms/musa.py
+++ b/python/sglang/multimodal_gen/runtime/platforms/musa.py
@@ -160,6 +160,23 @@ def get_attn_backend_cls_str(
if selected_backend == AttentionBackendEnum.TORCH_SDPA:
logger.info("Using Torch SDPA backend")
return "sglang.multimodal_gen.runtime.layers.attention.backends.sdpa.SDPABackend"
+ elif selected_backend == AttentionBackendEnum.SAGE_ATTN:
+ try:
+ from sageattention import sageattn # noqa: F401
+
+ from sglang.multimodal_gen.runtime.layers.attention.backends.sage_attn import ( # noqa: F401
+ SageAttentionBackend,
+ )
+
+ logger.info("Using Sage Attention backend")
+
+ return "sglang.multimodal_gen.runtime.layers.attention.backends.sage_attn.SageAttentionBackend"
+ except ImportError as e:
+ logger.info(e)
+ logger.info(
+ "Sage Attention backend is not installed (To install it, run `pip install sageattention>=0.1.0`). Falling back to Flash Attention."
+ )
+ target_backend = AttentionBackendEnum.FA
elif selected_backend in [
AttentionBackendEnum.FA,
]:
@@ -208,7 +225,7 @@ def get_attn_backend_cls_str(
logger.info("Using Torch SDPA backend")
return "sglang.multimodal_gen.runtime.layers.attention.backends.sdpa.SDPABackend"
- logger.info("Using FlashAttention (FA3) backend on MUSA")
+ logger.info("Using FlashAttention (FA3) backend")
return "sglang.multimodal_gen.runtime.layers.attention.backends.flash_attn.FlashAttentionBackend"
@classmethod