sgl-project · Kangyan-Zhou · May 12, 2026 · May 9, 2026 · May 9, 2026
@@ -16,7 +16,7 @@ When using the diffusers backend, `--attention-backend` is passed through to dif
 - **CUDA**: prefers FlashAttention (FA3/FA4) when supported; otherwise falls back to PyTorch SDPA.
 - **ROCm**: uses FlashAttention when available; otherwise falls back to PyTorch SDPA.
 - **Intel XPU**: uses XPU Flash Attention backend (fp16/bf16, head sizes 64/96/128/192/256); otherwise falls back to PyTorch SDPA.
-- **MUSA**: uses FlashAttention when available; otherwise falls back to PyTorch SDPA.
+- **MUSA**: uses FlashAttention when available, with optional Sage Attention support when `sageattention` is installed; otherwise falls back to PyTorch SDPA.
 - **MPS**: always uses PyTorch SDPA.
 - **NPU**: for ring attention uses FA otherwise uses PyTorch SDPA.
 
@@ -349,10 +349,10 @@ Some backends require additional configuration. You can pass these parameters vi
       <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.05)"}}>Yes</td>
       <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.02)"}}>No</td>
       <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.05)"}}>No</td>
-      <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.02)"}}>No</td>
+      <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.02)"}}>Yes</td>
       <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.05)"}}>❌</td>
       <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.05)"}}>❌</td>
-      <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.05)"}}>CUDA-only (optional dependency).</td>
+      <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.05)"}}>Available on CUDA and MUSA as an optional dependency. Falls back to FlashAttention if <code>sageattention</code> is not installed.</td>
     </tr>
     <tr>
       <td style={{padding: "9px 12px", fontWeight: 500, backgroundColor: "rgba(255,255,255,0.02)"}}>`sage_attn_3`</td>

@@ -25,7 +25,7 @@ SGLang Diffusion supports AMD Instinct GPUs through ROCm. On AMD platforms, we u
 
 ### Moore Threads/MUSA Support
 
-SGLang Diffusion supports Moore Threads GPUs (MTGPU) through the MUSA software stack. On MUSA platforms, we use the Torch SDPA backend for attention. See the [installation guide](https://github.com/sgl-project/sglang/tree/main/docs/diffusion/installation.md) for setup instructions.
+SGLang Diffusion supports Moore Threads GPUs (MTGPU) through the MUSA software stack. On MUSA platforms, we use FlashAttention (FA3) when available and fall back to the Torch SDPA backend otherwise; Sage Attention is also supported when the `sageattention` package is installed. See the [installation guide](https://github.com/sgl-project/sglang/tree/main/docs/diffusion/installation.md) for setup instructions.
 
 ### Apple MPS Support
 

@@ -160,6 +160,23 @@ def get_attn_backend_cls_str(
         if selected_backend == AttentionBackendEnum.TORCH_SDPA:
             logger.info("Using Torch SDPA backend")
             return "sglang.multimodal_gen.runtime.layers.attention.backends.sdpa.SDPABackend"
+        elif selected_backend == AttentionBackendEnum.SAGE_ATTN:
+            try:
+                from sageattention import sageattn  # noqa: F401
+
+                from sglang.multimodal_gen.runtime.layers.attention.backends.sage_attn import (  # noqa: F401
+                    SageAttentionBackend,
+                )
+
+                logger.info("Using Sage Attention backend")
+
+                return "sglang.multimodal_gen.runtime.layers.attention.backends.sage_attn.SageAttentionBackend"
+            except ImportError as e:
+                logger.info(e)
+                logger.info(
+                    "Sage Attention backend is not installed (To install it, run `pip install sageattention>=0.1.0`). Falling back to Flash Attention."
+                )
+                target_backend = AttentionBackendEnum.FA
         elif selected_backend in [
             AttentionBackendEnum.FA,
         ]:
@@ -208,7 +225,7 @@ def get_attn_backend_cls_str(
             logger.info("Using Torch SDPA backend")
             return "sglang.multimodal_gen.runtime.layers.attention.backends.sdpa.SDPABackend"
 
-        logger.info("Using FlashAttention (FA3) backend on MUSA")
+        logger.info("Using FlashAttention (FA3) backend")
         return "sglang.multimodal_gen.runtime.layers.attention.backends.flash_attn.FlashAttentionBackend"
 
     @classmethod