vllm-project · vllm-bot · Jan 9, 2026 · Jan 7, 2026 · Jan 7, 2026 · Jan 7, 2026
diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml
@@ -1105,8 +1105,8 @@ steps:
   - vllm/v1/attention/backends/flashinfer.py
   - vllm/v1/attention/backends/mla/cutlass_mla.py
   - vllm/v1/attention/backends/mla/flashinfer_mla.py
+  - vllm/v1/attention/selector.py
   - vllm/platforms/cuda.py
-  - vllm/attention/selector.py
   commands:
     - nvidia-smi
     - python3 examples/offline_inference/basic/chat.py

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
@@ -954,8 +954,8 @@ steps:
   - vllm/v1/attention/backends/flashinfer.py
   - vllm/v1/attention/backends/mla/cutlass_mla.py
   - vllm/v1/attention/backends/mla/flashinfer_mla.py
+  - vllm/v1/attention/selector.py
   - vllm/platforms/cuda.py
-  - vllm/attention/selector.py
   commands:
     - nvidia-smi
     - python3 examples/offline_inference/basic/chat.py

diff --git a/.buildkite/test_areas/kernels.yaml b/.buildkite/test_areas/kernels.yaml
@@ -90,8 +90,8 @@ steps:
   - vllm/v1/attention/backends/flashinfer.py
   - vllm/v1/attention/backends/mla/cutlass_mla.py
   - vllm/v1/attention/backends/mla/flashinfer_mla.py
+  - vllm/v1/attention/selector.py
   - vllm/platforms/cuda.py
-  - vllm/attention/selector.py
   commands:
     - nvidia-smi
     - python3 examples/offline_inference/basic/chat.py

diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
@@ -3,7 +3,6 @@
 
 # This lists cover the "core" components of vLLM that require careful review
 /vllm/attention @LucasWilkinson
-/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @njhill
 /vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @njhill @22quinn
 /vllm/model_executor/layers/fused_moe @mgoin @pavanimajety
 /vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256 @pavanimajety
@@ -27,6 +26,7 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
 
 # vLLM V1
 /vllm/v1/attention @LucasWilkinson
+/vllm/v1/attention/backend.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @njhill
 /vllm/v1/attention/backends/mla @pavanimajety
 /vllm/v1/attention/backends/flashinfer.py @mgoin @pavanimajety
 /vllm/v1/attention/backends/triton_attn.py @tdoublep
@@ -117,15 +117,15 @@ mkdocs.yaml @hmellor
 /vllm/transformers_utils/tokenizers/mistral.py @patrickvonplaten
 
 # Kernels
-/vllm/attention/ops/chunked_prefill_paged_decode.py @tdoublep
-/vllm/attention/ops/triton_unified_attention.py @tdoublep
+/vllm/v1/attention/ops/chunked_prefill_paged_decode.py @tdoublep
+/vllm/v1/attention/ops/triton_unified_attention.py @tdoublep
 
 # ROCm related: specify owner with write access to notify AMD folks for careful code review
 /vllm/**/*rocm* @tjtanaa
 /docker/Dockerfile.rocm* @gshtras @tjtanaa
 /vllm/v1/attention/backends/rocm*.py @gshtras @tjtanaa
 /vllm/v1/attention/backends/mla/rocm*.py @gshtras @tjtanaa
-/vllm/attention/ops/rocm*.py @gshtras @tjtanaa
+/vllm/v1/attention/ops/rocm*.py @gshtras @tjtanaa
 /vllm/model_executor/layers/fused_moe/rocm*.py @gshtras @tjtanaa
 /csrc/rocm @gshtras @tjtanaa
 /requirements/*rocm* @tjtanaa

diff --git a/.github/mergify.yml b/.github/mergify.yml
@@ -222,10 +222,10 @@ pull_request_rules:
       - files~=^csrc/rocm/
       - files~=^docker/Dockerfile.rocm
       - files~=^requirements/rocm.*\.txt
-      - files~=^vllm/attention/backends/rocm.*\.py
-      - files~=^vllm/attention/ops/rocm.*\.py
       - files~=^vllm/model_executor/layers/fused_moe/rocm.*\.py
+      - files~=^vllm/v1/attention/backends/rocm.*\.py
       - files~=^vllm/v1/attention/backends/mla/rocm.*\.py
+      - files~=^vllm/v1/attention/ops/rocm.*\.py
       - files~=^tests/kernels/.*_rocm.*\.py
       - files=vllm/platforms/rocm.py
       - title~=(?i)AMD

diff --git a/benchmarks/kernels/benchmark_reshape_and_cache_flash.py b/benchmarks/kernels/benchmark_reshape_and_cache_flash.py
@@ -7,16 +7,16 @@
 from tabulate import tabulate
 
 from vllm import _custom_ops as ops
-from vllm.attention.ops.triton_reshape_and_cache_flash import (
-    triton_reshape_and_cache_flash,
-)
 from vllm.logger import init_logger
 from vllm.utils.argparse_utils import FlexibleArgumentParser
 from vllm.utils.torch_utils import (
     STR_DTYPE_TO_TORCH_DTYPE,
     create_kv_caches_with_random_flash,
     set_random_seed,
 )
+from vllm.v1.attention.ops.triton_reshape_and_cache_flash import (
+    triton_reshape_and_cache_flash,
+)
 
 logger = init_logger(__name__)
 

diff --git a/docs/contributing/model/basic.md b/docs/contributing/model/basic.md
@@ -142,7 +142,7 @@ We use "mamba-like" to refer to layers that posses a state that is updated in-pl
 For implementing new custom mamba-like layers, one should inherit from `MambaBase` and implement the methods `get_state_dtype`, `get_state_shape` to calculate the data types and state shapes at runtime, as well as `mamba_type` and `get_attn_backend`.
 It is also necessary to implement the "attention meta-data" class which handles the meta-data that is common across all layers.
 Please see [`LinearAttentionMetadata`](../../../vllm/v1/attention/backends/linear_attn.py) or [`ShortConvAttentionMetadata`](../../../vllm/v1/attention/backends/short_conv_attn.py) for examples of this.
-It is also worth noting that we should update `MAMBA_TYPE_TO_BACKEND_MAP` and `MambaAttentionBackendEnum` in [`registry.py`](../../../vllm/attention/backends/registry.py) when adding a new mamba backend.
+It is also worth noting that we should update `MAMBA_TYPE_TO_BACKEND_MAP` and `MambaAttentionBackendEnum` in [`registry.py`](../../../vllm/v1/attention/backends/registry.py) when adding a new mamba backend.
 Finally, if one wants to support torch compile and CUDA graphs, it necessary to wrap the call to the mamba-like layer inside a custom op and register it.
 Please see the calls to `direct_register_custom_op` in [vllm/model_executor/models/minimax_text_01.py](../../../vllm/model_executor/models/minimax_text_01.py) or [vllm/model_executor/layers/mamba/short_conv.py](../../../vllm/model_executor/layers/mamba/short_conv.py) for examples of this.
 The new custom op should then be added to the list `_attention_ops` in [vllm/config/compilation.py](../../../vllm/config/compilation.py) to ensure that piecewise CUDA graphs works as intended.
diff --git a/docs/design/custom_op.md b/docs/design/custom_op.md
@@ -60,7 +60,7 @@ For example:
 **1. Attention:**
 
 ```python
---8<-- "vllm/attention/layers/mm_encoder_attention.py:mm_encoder_attn"
+--8<-- "vllm/model_executor/layers/attention/mm_encoder_attention.py:mm_encoder_attn"
 
 --8<-- "vllm/model_executor/layers/mla.py:multi_head_latent_attention"
 ```

diff --git a/docs/design/plugin_system.md b/docs/design/plugin_system.md
@@ -124,7 +124,7 @@ Every plugin has three parts:
 
     Please look at the worker base class [WorkerBase][vllm.v1.worker.worker_base.WorkerBase] for more functions that can be implemented.
 
-5. Implement the attention backend class `MyDummyAttention` in `my_dummy_attention.py`. The attention backend class should inherit from [AttentionBackend][vllm.attention.backends.abstract.AttentionBackend]. It's used to calculate attentions with your device. Take `vllm.v1.attention.backends` as examples, it contains many attention backend implementations.
+5. Implement the attention backend class `MyDummyAttention` in `my_dummy_attention.py`. The attention backend class should inherit from [AttentionBackend][vllm.v1.attention.backend.AttentionBackend]. It's used to calculate attentions with your device. Take `vllm.v1.attention.backends` as examples, it contains many attention backend implementations.
 
 6. Implement custom ops for high performance. Most ops can be ran by pytorch native implementation, while the performance may not be good. In this case, you can implement specific custom ops for your plugins. Currently, there are kinds of custom ops vLLM supports:
 
@@ -153,5 +153,5 @@ The interface for the model/module may change during vLLM's development. If you
 
 !!! warning "Deprecations"
     - `use_v1` parameter in `Platform.get_attn_backend_cls` is deprecated. It has been removed in v0.13.0.
-    - `_Backend` in `vllm.attention` is deprecated. It has been removed in v0.13.0. Please use `vllm.attention.backends.registry.register_backend` to add new attention backend to `AttentionBackendEnum` instead.
+    - `_Backend` in `vllm.attention` is deprecated. It has been removed in v0.13.0. Please use `vllm.v1.attention.backends.registry.register_backend` to add new attention backend to `AttentionBackendEnum` instead.
     - `seed_everything` platform interface is deprecated. It will be removed in v0.15.0 or later. Please use `vllm.utils.torch_utils.set_random_seed` instead.
diff --git a/examples/offline_inference/basic/embed.py b/examples/offline_inference/basic/embed.py
@@ -4,10 +4,10 @@
 from argparse import Namespace
 
 from vllm import LLM, EngineArgs
-from vllm.attention.backends.registry import AttentionBackendEnum
 from vllm.config import AttentionConfig
 from vllm.platforms import current_platform
 from vllm.utils.argparse_utils import FlexibleArgumentParser
+from vllm.v1.attention.backends.registry import AttentionBackendEnum
 
 
 def parse_args():

diff --git a/examples/offline_inference/basic/score.py b/examples/offline_inference/basic/score.py
@@ -4,10 +4,10 @@
 from argparse import Namespace
 
 from vllm import LLM, EngineArgs
-from vllm.attention.backends.registry import AttentionBackendEnum
 from vllm.config import AttentionConfig
 from vllm.platforms import current_platform
 from vllm.utils.argparse_utils import FlexibleArgumentParser
+from vllm.v1.attention.backends.registry import AttentionBackendEnum
 
 
 def parse_args():

diff --git a/tests/compile/fullgraph/test_full_cudagraph.py b/tests/compile/fullgraph/test_full_cudagraph.py
@@ -9,10 +9,10 @@
 from tests.utils import wait_for_gpu_memory_to_clear
 from tests.v1.attention.utils import full_cg_backend_configs as backend_configs
 from vllm import LLM, SamplingParams
-from vllm.attention.backends.registry import AttentionBackendEnum
 from vllm.config import CompilationConfig
 from vllm.platforms import current_platform
 from vllm.utils.torch_utils import is_torch_equal_or_newer
+from vllm.v1.attention.backends.registry import AttentionBackendEnum
 
 
 @contextlib.contextmanager

diff --git a/tests/compile/fullgraph/test_full_graph.py b/tests/compile/fullgraph/test_full_graph.py
@@ -10,10 +10,10 @@
 
 from tests.quantization.utils import is_quant_method_supported
 from vllm import LLM, SamplingParams
-from vllm.attention.backends.registry import AttentionBackendEnum
 from vllm.config import CompilationConfig, CompilationMode, CUDAGraphMode, PassConfig
 from vllm.platforms import current_platform
 from vllm.utils.torch_utils import is_torch_equal_or_newer
+from vllm.v1.attention.backends.registry import AttentionBackendEnum
 
 from ...utils import create_new_process_for_each_test
 

diff --git a/tests/compile/test_fusion_attn.py b/tests/compile/test_fusion_attn.py
@@ -9,8 +9,6 @@
 from tests.utils import flat_product
 from tests.v1.attention.utils import BatchSpec, create_common_attn_metadata
 from vllm._custom_ops import cutlass_scaled_fp4_mm, scaled_fp4_quant
-from vllm.attention.backends.abstract import AttentionMetadata
-from vllm.attention.backends.registry import AttentionBackendEnum
 from vllm.attention.layer import Attention
 from vllm.compilation.fusion_attn import ATTN_OP, AttnFusionPass
 from vllm.compilation.fx_utils import find_op_nodes
@@ -37,6 +35,8 @@
 from vllm.model_executor.layers.quantization.utils.w8a8_utils import Fp8LinearOp
 from vllm.platforms import current_platform
 from vllm.utils.flashinfer import has_flashinfer
+from vllm.v1.attention.backend import AttentionMetadata
+from vllm.v1.attention.backends.registry import AttentionBackendEnum
 from vllm.v1.kv_cache_interface import AttentionSpec
 
 FP8_DTYPE = current_platform.fp8_dtype()

diff --git a/tests/compile/test_qk_norm_rope_fusion.py b/tests/compile/test_qk_norm_rope_fusion.py
@@ -5,7 +5,6 @@
 import torch
 
 from tests.compile.backend import TestBackend
-from vllm.attention.backends.abstract import AttentionType
 from vllm.attention.layer import Attention
 from vllm.compilation.matcher_utils import FLASHINFER_ROTARY_OP, RMS_OP, ROTARY_OP
 from vllm.compilation.noop_elimination import NoOpEliminationPass
@@ -25,6 +24,7 @@
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding
 from vllm.platforms import current_platform
+from vllm.v1.attention.backend import AttentionType
 
 RSQRT_OP = torch.ops.aten.rsqrt.default
 INDEX_SELECT_OP = torch.ops.aten.index.Tensor

diff --git a/tests/config/test_multimodal_config.py b/tests/config/test_multimodal_config.py
@@ -3,8 +3,8 @@
 
 import pytest
 
-from vllm.attention.backends.registry import AttentionBackendEnum
 from vllm.config.multimodal import MultiModalConfig
+from vllm.v1.attention.backends.registry import AttentionBackendEnum
 
 
 def test_mm_encoder_attn_backend_str_conversion():

diff --git a/tests/engine/test_arg_utils.py b/tests/engine/test_arg_utils.py
@@ -299,7 +299,7 @@ def test_compilation_config():
 
 
 def test_attention_config():
-    from vllm.attention.backends.registry import AttentionBackendEnum
+    from vllm.v1.attention.backends.registry import AttentionBackendEnum
 
     parser = EngineArgs.add_cli_args(FlexibleArgumentParser())
 

@@ -6,9 +6,9 @@
 import torch
 
 import vllm.v1.attention.backends.rocm_aiter_fa  # noqa: F401
-from vllm.attention.utils.fa_utils import is_flash_attn_varlen_func_available
 from vllm.platforms import current_platform
 from vllm.utils.torch_utils import set_random_seed
+from vllm.v1.attention.backends.fa_utils import is_flash_attn_varlen_func_available
 
 NUM_HEADS = [(4, 4), (8, 2)]
 HEAD_SIZES = [128, 256]

@@ -10,7 +10,7 @@
 from tests.kernels.utils import opcheck
 from vllm import _custom_ops as ops
 from vllm.attention.layer import Attention
-from vllm.attention.layers.mm_encoder_attention import MMEncoderAttention
+from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
 from vllm.platforms import current_platform
 from vllm.utils.mem_utils import get_max_shared_memory_bytes
 from vllm.utils.torch_utils import set_random_seed
@@ -30,7 +30,7 @@
 NUM_HEADS = [(40, 40), (64, 8)]  # Arbitrary values for testing
 
 # This should be sync with get_supported_head_sizes() in
-# vllm.attention.ops.paged_attn.PagedAttention
+# vllm.v1.attention.ops.paged_attn.PagedAttention
 HEAD_SIZES = [32, 80, 128, 256]
 
 BLOCK_SIZES = [16, 32]

@@ -6,13 +6,13 @@
 import pytest
 import torch
 
-from vllm.attention.backends.registry import AttentionBackendEnum
-from vllm.attention.selector import _cached_get_attn_backend, get_attn_backend
 from vllm.config import AttentionConfig, VllmConfig, set_current_vllm_config
 from vllm.platforms import current_platform
 from vllm.platforms.cpu import CpuPlatform
 from vllm.platforms.cuda import CudaPlatform
 from vllm.platforms.rocm import RocmPlatform
+from vllm.v1.attention.backends.registry import AttentionBackendEnum
+from vllm.v1.attention.selector import _cached_get_attn_backend, get_attn_backend
 
 
 @pytest.fixture(autouse=True)
@@ -182,7 +182,7 @@ def test_backend_selection(
                         expected = name
                         assert backend.get_name() == expected
                     elif name == "FLASH_ATTN_MLA":
-                        from vllm.attention.utils.fa_utils import (
+                        from vllm.v1.attention.backends.fa_utils import (
                             flash_attn_supports_mla,
                         )
 

@@ -270,7 +270,7 @@ def permute_and_compact(x):
             v_scale,
         )
     elif implementation == "triton":
-        from vllm.attention.ops.triton_reshape_and_cache_flash import (
+        from vllm.v1.attention.ops.triton_reshape_and_cache_flash import (
             triton_reshape_and_cache_flash,
         )
 

@@ -7,12 +7,12 @@
 import pytest
 import torch
 
-from vllm.attention.ops.flashmla import (
+from vllm.triton_utils import triton
+from vllm.v1.attention.ops.flashmla import (
     flash_mla_with_kvcache,
     get_mla_metadata,
     is_flashmla_dense_supported,
 )
-from vllm.triton_utils import triton
 
 
 def cal_diff(

@@ -5,7 +5,7 @@
 
 
 def test_sparse_flashmla_metadata_smoke():
-    import vllm.attention.ops.flashmla as fm
+    import vllm.v1.attention.ops.flashmla as fm
 
     ok, reason = fm.is_flashmla_sparse_supported()
     if not ok:
@@ -34,7 +34,7 @@ def test_sparse_flashmla_metadata_smoke():
 
 
 def test_sparse_flashmla_decode_smoke():
-    import vllm.attention.ops.flashmla as fm
+    import vllm.v1.attention.ops.flashmla as fm
 
     ok, reason = fm.is_flashmla_sparse_supported()
     if not ok:
@@ -97,7 +97,7 @@ def test_sparse_flashmla_decode_smoke():
 
 
 def test_sparse_flashmla_prefill_smoke():
-    import vllm.attention.ops.flashmla as fm
+    import vllm.v1.attention.ops.flashmla as fm
 
     ok, reason = fm.is_flashmla_sparse_supported()
     if not ok:

@@ -5,10 +5,10 @@
 import torch
 
 from vllm._custom_ops import merge_attn_states as merge_attn_states_cuda
-from vllm.attention.ops.triton_merge_attn_states import (
+from vllm.platforms import current_platform
+from vllm.v1.attention.ops.triton_merge_attn_states import (
     merge_attn_states as merge_attn_states_triton,
 )
-from vllm.platforms import current_platform
 
 
 # Naive PyTorch Implements section 2.2 of https://www.arxiv.org/pdf/2501.01005

@@ -12,14 +12,14 @@
 import pytest
 import torch
 
-from vllm.attention.backends.registry import AttentionBackendEnum
-from vllm.attention.layers.mm_encoder_attention import MMEncoderAttention
-from vllm.attention.selector import _cached_get_attn_backend
+from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
 from vllm.platforms import current_platform
 from vllm.platforms.cpu import CpuPlatform
 from vllm.platforms.cuda import CudaPlatform
 from vllm.platforms.rocm import RocmPlatform
 from vllm.utils.torch_utils import set_random_seed
+from vllm.v1.attention.backends.registry import AttentionBackendEnum
+from vllm.v1.attention.selector import _cached_get_attn_backend
 
 
 @pytest.fixture(autouse=True)

@@ -4,7 +4,7 @@
 import torch
 from torch.testing import assert_close
 
-from vllm.attention.ops.common import pack_seq_triton, unpack_seq_triton
+from vllm.v1.attention.ops.common import pack_seq_triton, unpack_seq_triton
 
 
 def test_pack_seq_basic_fp8():

@@ -10,10 +10,12 @@
 import torch
 import torch.nn.functional as F
 
-from vllm.attention.ops.chunked_prefill_paged_decode import chunked_prefill_paged_decode
-from vllm.attention.ops.prefix_prefill import context_attention_fwd
 from vllm.platforms import current_platform
 from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE, set_random_seed
+from vllm.v1.attention.ops.chunked_prefill_paged_decode import (
+    chunked_prefill_paged_decode,
+)
+from vllm.v1.attention.ops.prefix_prefill import context_attention_fwd
 
 NUM_HEADS = [64]
 NUM_QUERIES_PER_KV = [1, 64]