Commit b30dfa0

[Attention] Refactor CUDA attention backend selection logic (#24794)
Signed-off-by: Matthew Bonanni <[email protected]>
Signed-off-by: Matthew Bonanni <[email protected]>
Co-authored-by: Luka Govedič <[email protected]>
1 parent 2e78150 commit b30dfa0


61 files changed: +1333 −997 lines changed
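The common thread in the diffs below is a rename of the internal `_Backend` enum to `AttentionBackendEnum` (still exported from `vllm.attention.backends.registry`), plus tighter CUDA backend-selection checks in the attention selector test. A minimal before/after sketch of the rename as it appears in these hunks; the import path and member names are taken from this commit, the surrounding script is illustrative only:

# Old spelling, removed by this commit:
#   from vllm.attention.backends.registry import _Backend
#   backend = _Backend.TRITON_ATTN

# New spelling used throughout the updated tests:
from vllm.attention.backends.registry import AttentionBackendEnum

backend = AttentionBackendEnum.TRITON_ATTN
if backend == AttentionBackendEnum.FLASHINFER:
    print("FlashInfer selected")
else:
    print(f"{backend.name} selected")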

.buildkite/test-pipeline.yaml

Lines changed: 5 additions & 0 deletions
@@ -890,11 +890,16 @@ steps:
   - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
   - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
   - vllm/v1/attention/backends/flashinfer.py
+  - vllm/v1/attention/backends/mla/cutlass_mla.py
+  - vllm/v1/attention/backends/mla/flashinfer_mla.py
+  - vllm/platforms/cuda.py
+  - vllm/attention/selector.py
  commands:
  - nvidia-smi
  - python3 examples/offline_inference/basic/chat.py
  # Attention
  # num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353
+ - pytest -v -s tests/kernels/attention/test_attention_selector.py
  - pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2'
  - pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py
  - pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py

tests/compile/test_fusion_attn.py

Lines changed: 16 additions & 15 deletions
@@ -10,7 +10,7 @@
 from tests.v1.attention.utils import BatchSpec, create_common_attn_metadata
 from vllm._custom_ops import cutlass_scaled_fp4_mm, scaled_fp4_quant
 from vllm.attention import Attention, AttentionMetadata
-from vllm.attention.backends.registry import _Backend
+from vllm.attention.backends.registry import AttentionBackendEnum
 from vllm.attention.selector import global_force_attn_backend_context_manager
 from vllm.compilation.fusion_attn import ATTN_OP, AttnFusionPass
 from vllm.compilation.fx_utils import find_op_nodes
@@ -104,7 +104,7 @@ def build_attn_metadata(self, batch_size: int) -> AttentionMetadata:

         # TODO(luka) use get_kv_cache_stride_order
         # Create dummy KV cache for the selected backend
-        if backend == _Backend.ROCM_ATTN:
+        if backend == AttentionBackendEnum.ROCM_ATTN:
             # k/v as 1st dimention
             # HND: [num_blocks, num_kv_heads, block_size, head_size]
             kv_cache = torch.zeros(
@@ -116,7 +116,7 @@ def build_attn_metadata(self, batch_size: int) -> AttentionMetadata:
                 dtype=self.kv_cache_dtype,
                 device=self.device,
             )
-        elif backend == _Backend.ROCM_AITER_UNIFIED_ATTN:
+        elif backend == AttentionBackendEnum.ROCM_AITER_UNIFIED_ATTN:
             # k/v as 1st dimention
             # NHD: [num_blocks, block_size, num_kv_heads, head_size]
             kv_cache = torch.zeros(
@@ -128,7 +128,7 @@ def build_attn_metadata(self, batch_size: int) -> AttentionMetadata:
                 dtype=self.kv_cache_dtype,
                 device=self.device,
             )
-        elif backend == _Backend.TRITON_ATTN:
+        elif backend == AttentionBackendEnum.TRITON_ATTN:
             # k/v as 2nd dimention
             # NHD: [num_blocks, block_size, num_kv_heads, head_size]
             kv_cache = torch.zeros(
@@ -140,7 +140,7 @@ def build_attn_metadata(self, batch_size: int) -> AttentionMetadata:
                 dtype=self.kv_cache_dtype,
                 device=self.device,
             )
-        elif backend == _Backend.FLASHINFER:
+        elif backend == AttentionBackendEnum.FLASHINFER:
             kv_cache = torch.zeros(
                 num_blocks,
                 2,
@@ -244,8 +244,8 @@ def forward(self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor):
 MODELS_FP4: list[tuple[str, type]] = []
 HEADS: list[tuple[int, int]] = []
 SPLIT_ATTENTION: list[bool] = []
-BACKENDS_FP8: list[_Backend] = []
-BACKENDS_FP4: list[_Backend] = []
+BACKENDS_FP8: list[AttentionBackendEnum] = []
+BACKENDS_FP4: list[AttentionBackendEnum] = []

 if current_platform.is_cuda():
     HEADS = [(64, 8), (40, 8)]
@@ -261,18 +261,18 @@ def forward(self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor):
             TestAttentionNvfp4QuantPatternModel,
         )
     ]
-    BACKENDS_FP8 = [_Backend.TRITON_ATTN, _Backend.FLASHINFER]
-    BACKENDS_FP4 = [_Backend.FLASHINFER]
+    BACKENDS_FP8 = [AttentionBackendEnum.TRITON_ATTN, AttentionBackendEnum.FLASHINFER]
+    BACKENDS_FP4 = [AttentionBackendEnum.FLASHINFER]

 elif current_platform.is_rocm():
     HEADS = [(32, 8), (40, 8)]
     MODELS_FP8 = [
         ("amd/Llama-3.1-8B-Instruct-FP8-KV", TestAttentionFp8StaticQuantPatternModel)
     ]
     BACKENDS = [
-        _Backend.ROCM_AITER_UNIFIED_ATTN,
-        _Backend.ROCM_ATTN,
-        _Backend.TRITON_ATTN,
+        AttentionBackendEnum.ROCM_AITER_UNIFIED_ATTN,
+        AttentionBackendEnum.ROCM_ATTN,
+        AttentionBackendEnum.TRITON_ATTN,
     ]


@@ -302,18 +302,19 @@ def test_attention_quant_pattern(
     custom_ops: str,
     model_name: str,
     model_class: type[AttentionQuantPatternModel],
-    backend: _Backend,
+    backend: AttentionBackendEnum,
     dist_init,
 ):
     """Test AttentionStaticQuantPattern fusion pass"""
-    if backend == _Backend.FLASHINFER and (
+    if backend == AttentionBackendEnum.FLASHINFER and (
         not current_platform.is_device_capability((10, 0)) or not has_flashinfer()
     ):
         pytest.skip("FlashInfer attn fusion requires Blackwell and flashinfer")

     custom_ops_list = custom_ops.split(",") if custom_ops else []

     device = torch.device("cuda:0")
+    torch.set_default_dtype(dtype)
     torch.manual_seed(42)

     vllm_config = VllmConfig(
@@ -402,7 +403,7 @@ def test_attention_quant_pattern(

     result_fused_1 = model_compiled(q, k, v)

-    if backend == _Backend.FLASHINFER:
+    if backend == AttentionBackendEnum.FLASHINFER:
         # With the Flashinfer backend after the 1st round of the forward
         # pass, output quant scale should be loaded into the attn layer's
         # _o_scale_float, the 2nd round should reuse the loaded
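Beyond the rename, the hunks above keep the per-backend dummy KV cache layouts that their inline comments describe (HND vs. NHD, and whether the stacked k/v pair is the 1st or 2nd dimension). A stand-alone sketch of those layouts; the leading-dimension placement is inferred from the comments rather than copied verbatim from the truncated torch.zeros(...) calls:

import torch

num_blocks, block_size, num_kv_heads, head_size = 32, 16, 8, 64

# ROCM_ATTN: k/v stacked as the 1st dimension, HND layout
# [num_blocks, num_kv_heads, block_size, head_size]
rocm_attn_kv = torch.zeros(2, num_blocks, num_kv_heads, block_size, head_size)

# ROCM_AITER_UNIFIED_ATTN: k/v as the 1st dimension, NHD layout
# [num_blocks, block_size, num_kv_heads, head_size]
rocm_aiter_kv = torch.zeros(2, num_blocks, block_size, num_kv_heads, head_size)

# TRITON_ATTN: k/v as the 2nd dimension, NHD layout
triton_kv = torch.zeros(num_blocks, 2, block_size, num_kv_heads, head_size)

# FLASHINFER: the hunk shows torch.zeros(num_blocks, 2, ...), i.e. a paged
# cache with the k/v pair in the 2nd dimension as well
flashinfer_kv = torch.zeros(num_blocks, 2, block_size, num_kv_heads, head_size)

for label, kv in [("ROCM_ATTN", rocm_attn_kv), ("TRITON_ATTN", triton_kv)]:
    print(label, tuple(kv.shape))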

tests/compile/test_fusions_e2e.py

Lines changed: 12 additions & 12 deletions
@@ -11,7 +11,7 @@
 import pytest
 import regex as re

-from tests.v1.attention.utils import _Backend
+from tests.v1.attention.utils import AttentionBackendEnum
 from vllm import LLM, SamplingParams
 from vllm.config import CompilationConfig, CompilationMode, CUDAGraphMode, PassConfig
 from vllm.platforms import current_platform
@@ -24,7 +24,7 @@
 class ModelBackendTestCase(NamedTuple):
     model_name: str
     model_kwargs: dict[str, Any]
-    backend: _Backend
+    backend: AttentionBackendEnum
     attention_fusions: int
     allreduce_fusions: int | None = None

@@ -39,14 +39,14 @@ class ModelBackendTestCase(NamedTuple):
         # Use smaller model for L40s in CI
         model_name="RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8",
         model_kwargs=dict(max_model_len=1024),
-        backend=_Backend.TRITON_ATTN,
+        backend=AttentionBackendEnum.TRITON_ATTN,
         attention_fusions=32,
         allreduce_fusions=65,
     ),
     ModelBackendTestCase(
         model_name="nvidia/Llama-4-Scout-17B-16E-Instruct-FP8",
         model_kwargs=dict(max_model_len=1024, kv_cache_dtype="fp8"),
-        backend=_Backend.FLASHINFER,
+        backend=AttentionBackendEnum.FLASHINFER,
         attention_fusions=48,
         allreduce_fusions=96,
     ),
@@ -56,7 +56,7 @@ class ModelBackendTestCase(NamedTuple):
     ModelBackendTestCase(
         model_name="nvidia/Llama-3.1-8B-Instruct-FP4",
         model_kwargs=dict(max_model_len=1024, kv_cache_dtype="fp8"),
-        backend=_Backend.FLASHINFER,
+        backend=AttentionBackendEnum.FLASHINFER,
         attention_fusions=32,
         allreduce_fusions=65,
     ),
@@ -67,7 +67,7 @@ class ModelBackendTestCase(NamedTuple):
     ModelBackendTestCase(
         model_name="meta-llama/Llama-3.1-8B-Instruct",
         model_kwargs=dict(max_model_len=1024),
-        backend=_Backend.TRITON_ATTN,
+        backend=AttentionBackendEnum.TRITON_ATTN,
         attention_fusions=0,
         allreduce_fusions=65,
     ),
@@ -85,19 +85,19 @@ class ModelBackendTestCase(NamedTuple):
     ModelBackendTestCase(
         model_name="amd/Llama-3.1-8B-Instruct-FP8-KV",
         model_kwargs=dict(max_model_len=1024),
-        backend=_Backend.TRITON_ATTN,
+        backend=AttentionBackendEnum.TRITON_ATTN,
         attention_fusions=32,
     ),
     ModelBackendTestCase(
         model_name="amd/Llama-3.1-8B-Instruct-FP8-KV",
         model_kwargs=dict(max_model_len=1024),
-        backend=_Backend.ROCM_ATTN,
+        backend=AttentionBackendEnum.ROCM_ATTN,
         attention_fusions=32,
     ),
     ModelBackendTestCase(
         model_name="amd/Llama-3.1-8B-Instruct-FP8-KV",
         model_kwargs=dict(max_model_len=1024),
-        backend=_Backend.ROCM_AITER_UNIFIED_ATTN,
+        backend=AttentionBackendEnum.ROCM_AITER_UNIFIED_ATTN,
         attention_fusions=32,
     ),
 ]
@@ -117,15 +117,15 @@ class ModelBackendTestCase(NamedTuple):
 def test_attn_quant(
     model_name: str,
     model_kwargs: dict[str, Any],
-    backend: _Backend,
+    backend: AttentionBackendEnum,
     attention_fusions: int,
     allreduce_fusions: int,
     custom_ops: str,
     inductor_graph_partition: bool,
     caplog_mp_spawn,
     monkeypatch,
 ):
-    if backend == _Backend.FLASHINFER and (
+    if backend == AttentionBackendEnum.FLASHINFER and (
         not current_platform.is_device_capability((10, 0)) or not has_flashinfer()
     ):
         pytest.skip("FlashInfer attn fusion requires Blackwell and flashinfer")
@@ -208,7 +208,7 @@ def custom_ops_product(*custom_ops_lists: list[str]) -> Iterable[str]:
 def test_tp2_attn_quant_allreduce_rmsnorm(
     model_name: str,
     model_kwargs: dict,
-    backend: _Backend,
+    backend: AttentionBackendEnum,
     attention_fusions: int,
     allreduce_fusions: int,
     custom_ops: str,

tests/config/test_multimodal_config.py

Lines changed: 3 additions & 3 deletions
@@ -3,13 +3,13 @@

 import pytest

-from vllm.attention.backends.registry import _Backend
+from vllm.attention.backends.registry import AttentionBackendEnum
 from vllm.config.multimodal import MultiModalConfig


 def test_mm_encoder_attn_backend_str_conversion():
     config = MultiModalConfig(mm_encoder_attn_backend="FLASH_ATTN")
-    assert config.mm_encoder_attn_backend == _Backend.FLASH_ATTN
+    assert config.mm_encoder_attn_backend == AttentionBackendEnum.FLASH_ATTN


 def test_mm_encoder_attn_backend_invalid():
@@ -20,6 +20,6 @@ def test_mm_encoder_attn_backend_invalid():
 def test_mm_encoder_attn_backend_hash_updates():
     base_hash = MultiModalConfig().compute_hash()
     overridden_hash = MultiModalConfig(
-        mm_encoder_attn_backend=_Backend.FLASH_ATTN
+        mm_encoder_attn_backend=AttentionBackendEnum.FLASH_ATTN
     ).compute_hash()
     assert base_hash != overridden_hash
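These tests pin down two behaviours: a backend given as a string ("FLASH_ATTN") is converted to the corresponding `AttentionBackendEnum` member, and overriding the backend changes the config hash. A self-contained sketch of the string-to-enum conversion pattern being tested, using a stand-in enum rather than vLLM's real `AttentionBackendEnum`; only the member name `FLASH_ATTN` comes from the diff:

from enum import Enum


class BackendEnumStandIn(Enum):
    # Stand-in members; the real AttentionBackendEnum defines many more.
    FLASH_ATTN = "FLASH_ATTN"
    TRITON_ATTN = "TRITON_ATTN"


def resolve_backend(value):
    """Accept an enum member or its name, as the config test expects."""
    if value is None or isinstance(value, BackendEnumStandIn):
        return value
    try:
        return BackendEnumStandIn[value]  # name lookup, e.g. "FLASH_ATTN"
    except KeyError as exc:
        raise ValueError(f"Unknown attention backend: {value!r}") from exc


assert resolve_backend("FLASH_ATTN") is BackendEnumStandIn.FLASH_ATTN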

tests/kernels/attention/test_attention_selector.py

Lines changed: 45 additions & 30 deletions
@@ -120,12 +120,13 @@ def test_env(

     elif device == "cuda":
         with patch("vllm.platforms.current_platform", CudaPlatform()):
+            capability = torch.cuda.get_device_capability()
             if use_mla:
                 # CUDA MLA backend logic:
                 # - CUTLASS_MLA: only supported with block_size == 128
-                #   and Blackwell GPUs (SM 10.0), V1 only
+                #   and Blackwell GPUs (SM 10.x), V1 only
                 # - FLASHINFER_MLA: only supported on Blackwell GPUs
-                #   (SM 10.0+), V1 only
+                #   (SM 10.x), V1 only
                 # - FLASHMLA: only supported with block_size == 64
                 # - FLASH_ATTN_MLA: V1 only
                 # - TRITON_MLA: fallback for other cases
@@ -134,58 +135,72 @@ def test_env(
                     if block_size != 128:
                         # CUTLASS_MLA only supports block_size == 128
                         pytest.skip("CUTLASS_MLA only supports block_size 128")
-                    else:
-                        backend = get_attn_backend(
-                            16, torch.float16, None, block_size, use_mla=use_mla
-                        )
-                        expected = "CUTLASS_MLA"
-                        assert backend.get_name() == expected
+                    if capability[0] != 10:
+                        pytest.skip("CUTLASS MLA is not supported on this platform")
+                    backend = get_attn_backend(
+                        576, torch.float16, None, block_size, use_mla=use_mla
+                    )
+                    expected = "CUTLASS_MLA"
+                    assert backend.get_name() == expected
                 elif name == "FLASHINFER_MLA":
+                    if capability[0] != 10:
+                        pytest.skip(
+                            "FlashInfer MLA is not supported on this platform"
+                        )
                     if block_size not in [32, 64]:
                         # FlashInfer MLA only supports block_size 32 or 64
                         pytest.skip(
                             "FlashInfer MLA only supports block_size 32 or 64"
                         )
-                    else:
-                        backend = get_attn_backend(
-                            16, torch.float16, None, block_size, use_mla=use_mla
-                        )
-                        expected = "FLASHINFER_MLA"
-                        assert backend.get_name() == expected
+                    backend = get_attn_backend(
+                        576, torch.float16, None, block_size, use_mla=use_mla
+                    )
+                    expected = "FLASHINFER_MLA"
+                    assert backend.get_name() == expected
                 elif name == "FLASHMLA":
                     if block_size != 64:
                         # FlashMLA only supports block_size == 64
                         pytest.skip("FlashMLA only supports block_size 64")
-                    else:
-                        from vllm.v1.attention.backends.mla.flashmla import (
-                            is_flashmla_dense_supported,
-                        )
+                    from vllm.v1.attention.backends.mla.flashmla import (
+                        is_flashmla_dense_supported,
+                    )

-                        is_supported, _ = is_flashmla_dense_supported()
-                        if not is_supported:
-                            pytest.skip("FlashMLA not supported on this platform")
-                        else:
-                            backend = get_attn_backend(
-                                16, torch.float16, None, block_size, use_mla=use_mla
-                            )
-                            expected = name
-                            assert backend.get_name() == expected
+                    is_supported, _ = is_flashmla_dense_supported()
+                    if not is_supported:
+                        pytest.skip("FlashMLA not supported on this platform")
+                    backend = get_attn_backend(
+                        576,
+                        torch.float16,
+                        None,
+                        block_size,
+                        use_mla=use_mla,
+                    )
+                    expected = name
+                    assert backend.get_name() == expected
                 elif name == "FLASH_ATTN_MLA":
+                    from vllm.attention.utils.fa_utils import (
+                        flash_attn_supports_mla,
+                    )
+
+                    if not flash_attn_supports_mla():
+                        pytest.skip(
+                            "FlashAttention MLA not supported on this platform"
+                        )
                     backend = get_attn_backend(
-                        16, torch.float16, None, block_size, use_mla=use_mla
+                        576, torch.float16, None, block_size, use_mla=use_mla
                     )
                     expected = "FLASH_ATTN_MLA"
                     assert backend.get_name() == expected
                 else:
                     # TRITON_MLA or other fallback
                     backend = get_attn_backend(
-                        16, torch.float16, None, block_size, use_mla=use_mla
+                        576, torch.float16, None, block_size, use_mla=use_mla
                     )
                     expected = "TRITON_MLA"
                     assert backend.get_name() == expected
             elif name == "FLASHINFER":
                 backend = get_attn_backend(
-                    16, torch.float16, None, block_size, use_mla=use_mla
+                    64, torch.float16, None, block_size, use_mla=use_mla
                 )
                 expected = "FLASHINFER"
                 assert backend.get_name() == expected
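The updated test now queries the real device capability and passes MLA-shaped head sizes to `get_attn_backend` (576 for the MLA paths, 64 for plain FLASHINFER, instead of the old 16). Its skip conditions also spell out what each forced backend requires: CUTLASS_MLA needs SM 10.x and block_size 128, FLASHINFER_MLA needs SM 10.x and block_size 32 or 64, FLASHMLA needs block_size 64 plus `is_flashmla_dense_supported()`, FLASH_ATTN_MLA needs `flash_attn_supports_mla()`, and TRITON_MLA is the fallback. A small restatement of those conditions, written from this test rather than from the selector in `vllm/platforms/cuda.py`:

def mla_backend_constraints_ok(
    name: str,
    sm_major: int,
    block_size: int,
    flashmla_dense_supported: bool = True,
    flash_attn_mla_supported: bool = True,
) -> bool:
    """Mirror the skip conditions test_env applies to a forced MLA backend."""
    if name == "CUTLASS_MLA":
        return sm_major == 10 and block_size == 128
    if name == "FLASHINFER_MLA":
        return sm_major == 10 and block_size in (32, 64)
    if name == "FLASHMLA":
        return block_size == 64 and flashmla_dense_supported
    if name == "FLASH_ATTN_MLA":
        return flash_attn_mla_supported
    # TRITON_MLA (and anything else) is exercised with no extra constraints.
    return True


# Forcing CUTLASS_MLA on Hopper (SM 9.x) is skipped; FlashMLA at block_size 64 is not.
assert not mla_backend_constraints_ok("CUTLASS_MLA", sm_major=9, block_size=128)
assert mla_backend_constraints_ok("FLASHMLA", sm_major=9, block_size=64)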
