Merged
3 changes: 0 additions & 3 deletions python/pyproject_other.toml
@@ -116,9 +116,6 @@ srt_musa = [
     "torch_musa",
     "torchada>=0.1.45",
     "mthreads-ml-py",
-    "mate",
-    "mate-deep_gemm",
-    "mate-flash-attention",
     "numpy<2.0",
 ]

6 changes: 0 additions & 6 deletions python/sglang/srt/configs/model_config.py
@@ -563,12 +563,6 @@ def _derive_model_shapes(self):
         self.num_key_value_heads = getattr(
             self.hf_text_config, "num_key_value_heads", None
         )
-        self.first_k_dense_replace = getattr(
-            self.hf_text_config, "first_k_dense_replace", None
-        )
-        self.full_attention_interval = getattr(
-            self.hf_text_config, "full_attention_interval", None
-        )

         # for Dbrx and MPT models
         if self.hf_config.model_type in ["dbrx", "mpt"]:
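
Reviewer note: the two fields removed above were read with getattr, so configs that do not define them simply fall back to None. A minimal sketch of that pattern, using a stand-in namespace object rather than the real HF config classes (the field values here are hypothetical):

from types import SimpleNamespace

# Stand-in for self.hf_text_config; only num_key_value_heads is defined here.
hf_text_config = SimpleNamespace(num_key_value_heads=8)

# Present attribute -> its value; absent attribute -> the None default.
num_key_value_heads = getattr(hf_text_config, "num_key_value_heads", None)      # 8
first_k_dense_replace = getattr(hf_text_config, "first_k_dense_replace", None)  # None
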
3 changes: 0 additions & 3 deletions python/sglang/srt/environ.py
@@ -331,9 +331,6 @@ class Envs:
     SGLANG_USE_AG_AFTER_QLORA = EnvBool(False)
     SGLANG_NPU_FUSED_MOE_MODE = EnvInt(1)

-    # MTHREADS & MUSA
-    SGLANG_MUSA_FA3_FORCE_UPDATE_METADATA = EnvBool(False)
-
     # Quantization
     SGLANG_INT4_WEIGHT = EnvBool(False)
     SGLANG_CPU_QUANTIZATION = EnvBool(False)
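
Reviewer note: Envs attributes such as SGLANG_INT4_WEIGHT = EnvBool(False) wrap an environment variable with a typed default. The real sglang implementation is not reproduced here; a minimal, hypothetical sketch of the EnvBool idea is:

import os

class EnvBool:
    # Hypothetical stand-in, not sglang's actual EnvBool class.
    def __init__(self, default: bool):
        self.default = default

    def get(self, name: str) -> bool:
        raw = os.environ.get(name)
        if raw is None:
            return self.default
        return raw.strip().lower() in ("1", "true", "yes", "on")

SGLANG_INT4_WEIGHT = EnvBool(False)
print(SGLANG_INT4_WEIGHT.get("SGLANG_INT4_WEIGHT"))  # False unless the variable is set truthy
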
1 change: 0 additions & 1 deletion python/sglang/srt/hardware_backend/musa/__init__.py

This file was deleted.

14 changes: 0 additions & 14 deletions python/sglang/srt/hardware_backend/musa/attention/__init__.py

This file was deleted.

254 changes: 0 additions & 254 deletions python/sglang/srt/hardware_backend/musa/attention/flash_attention.py

This file was deleted.

23 changes: 7 additions & 16 deletions python/sglang/srt/layers/attention/attention_registry.py
@@ -1,10 +1,6 @@
 import logging
 from typing import TYPE_CHECKING

-from sglang.srt.utils import get_device_capability, is_musa
-
-_is_musa = is_musa()
-
 logger = logging.getLogger(__name__)


@@ -129,19 +125,14 @@ def create_flashmla_backend(runner):

 @register_attention_backend("fa3")
 def create_flashattention_v3_backend(runner):
+    import torch
-    major, minor = get_device_capability()
-    if not _is_musa:
-        assert (major == 8 and not runner.use_mla_backend) or major == 9, (
-            "FlashAttention v3 Backend requires SM>=80 and SM<=90. "
-            "Please use `--attention-backend flashinfer`."
-        )
-    else:
-        assert major >= 3 and minor >= 1, (
-            "FlashAttention v3 Backend requires MP>=31. "
-            "Please use `--attention-backend triton`."
-        )
-
+    assert (
+        torch.cuda.get_device_capability()[0] == 8 and not runner.use_mla_backend
+    ) or torch.cuda.get_device_capability()[0] == 9, (
+        "FlashAttention v3 Backend requires SM>=80 and SM<=90. "
+        "Please use `--attention-backend flashinfer`."
+    )
     from sglang.srt.layers.attention.flashattention_backend import FlashAttentionBackend

     return FlashAttentionBackend(runner)
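
Reviewer note: the replacement check queries CUDA compute capability directly; torch.cuda.get_device_capability() returns a (major, minor) tuple, so the assertion admits SM 8.x devices (only when MLA is not in use) and SM 9.x devices. A standalone sketch of the same gate, with use_mla_backend passed as a plain argument instead of read from the runner:

import torch

def check_fa3_supported(use_mla_backend: bool) -> None:
    # Mirrors the assertion in create_flashattention_v3_backend above.
    major, _minor = torch.cuda.get_device_capability()
    assert (major == 8 and not use_mla_backend) or major == 9, (
        "FlashAttention v3 Backend requires SM>=80 and SM<=90. "
        "Please use `--attention-backend flashinfer`."
    )

if torch.cuda.is_available():
    check_fa3_supported(use_mla_backend=False)

With the MUSA-specific fallback removed here (together with the deleted hardware_backend/musa package above), the fa3 backend is gated solely by this CUDA capability check.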