diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml
index 7d322aeaf78f..ff5ce20d111b 100644
--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
@@ -1105,8 +1105,8 @@ steps:
   - vllm/v1/attention/backends/flashinfer.py
   - vllm/v1/attention/backends/mla/cutlass_mla.py
   - vllm/v1/attention/backends/mla/flashinfer_mla.py
+  - vllm/v1/attention/selector.py
   - vllm/platforms/cuda.py
-  - vllm/attention/selector.py
   commands:
   - nvidia-smi
  - python3 examples/offline_inference/basic/chat.py
diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index fceae96854a8..4e004e3479a1 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -954,8 +954,8 @@ steps:
   - vllm/v1/attention/backends/flashinfer.py
   - vllm/v1/attention/backends/mla/cutlass_mla.py
   - vllm/v1/attention/backends/mla/flashinfer_mla.py
+  - vllm/v1/attention/selector.py
   - vllm/platforms/cuda.py
-  - vllm/attention/selector.py
   commands:
   - nvidia-smi
  - python3 examples/offline_inference/basic/chat.py
diff --git a/.buildkite/test_areas/kernels.yaml b/.buildkite/test_areas/kernels.yaml
index 7ca099516d64..cf4b646f3495 100644
--- a/.buildkite/test_areas/kernels.yaml
+++ b/.buildkite/test_areas/kernels.yaml
@@ -90,8 +90,8 @@ steps:
   - vllm/v1/attention/backends/flashinfer.py
   - vllm/v1/attention/backends/mla/cutlass_mla.py
   - vllm/v1/attention/backends/mla/flashinfer_mla.py
+  - vllm/v1/attention/selector.py
   - vllm/platforms/cuda.py
-  - vllm/attention/selector.py
   commands:
   - nvidia-smi
  - python3 examples/offline_inference/basic/chat.py
diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index 8122c525f1f1..c963be4cb8f9 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -3,7 +3,6 @@
 # This lists cover the "core" components of vLLM that require careful review
 /vllm/attention @LucasWilkinson
-/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @njhill
 /vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @njhill @22quinn
 /vllm/model_executor/layers/fused_moe @mgoin @pavanimajety
 /vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256 @pavanimajety
@@ -27,6 +26,7 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
 # vLLM V1
 /vllm/v1/attention @LucasWilkinson
+/vllm/v1/attention/backend.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @njhill
 /vllm/v1/attention/backends/mla @pavanimajety
 /vllm/v1/attention/backends/flashinfer.py @mgoin @pavanimajety
 /vllm/v1/attention/backends/triton_attn.py @tdoublep
@@ -117,15 +117,15 @@ mkdocs.yaml @hmellor
 /vllm/transformers_utils/tokenizers/mistral.py @patrickvonplaten
 
 # Kernels
-/vllm/attention/ops/chunked_prefill_paged_decode.py @tdoublep
-/vllm/attention/ops/triton_unified_attention.py @tdoublep
+/vllm/v1/attention/ops/chunked_prefill_paged_decode.py @tdoublep
+/vllm/v1/attention/ops/triton_unified_attention.py @tdoublep
 
 # ROCm related: specify owner with write access to notify AMD folks for careful code review
 /vllm/**/*rocm* @tjtanaa
 /docker/Dockerfile.rocm* @gshtras @tjtanaa
 /vllm/v1/attention/backends/rocm*.py @gshtras @tjtanaa
 /vllm/v1/attention/backends/mla/rocm*.py @gshtras @tjtanaa
-/vllm/attention/ops/rocm*.py @gshtras @tjtanaa
+/vllm/v1/attention/ops/rocm*.py @gshtras @tjtanaa
 /vllm/model_executor/layers/fused_moe/rocm*.py @gshtras @tjtanaa
 /csrc/rocm @gshtras @tjtanaa
 /requirements/*rocm* @tjtanaa
diff --git a/.github/mergify.yml b/.github/mergify.yml
index 61a03135be39..a496dd302db5 100644
--- a/.github/mergify.yml
+++ b/.github/mergify.yml
@@ -222,10 +222,10 @@ pull_request_rules:
       - files~=^csrc/rocm/
       - files~=^docker/Dockerfile.rocm
       - files~=^requirements/rocm.*\.txt
-      - files~=^vllm/attention/backends/rocm.*\.py
-      - files~=^vllm/attention/ops/rocm.*\.py
       - files~=^vllm/model_executor/layers/fused_moe/rocm.*\.py
+      - files~=^vllm/v1/attention/backends/rocm.*\.py
       - files~=^vllm/v1/attention/backends/mla/rocm.*\.py
+      - files~=^vllm/v1/attention/ops/rocm.*\.py
       - files~=^tests/kernels/.*_rocm.*\.py
       - files=vllm/platforms/rocm.py
       - title~=(?i)AMD
diff --git a/benchmarks/kernels/benchmark_reshape_and_cache_flash.py b/benchmarks/kernels/benchmark_reshape_and_cache_flash.py
index bca66f301127..ef6be1f3c359 100644
--- a/benchmarks/kernels/benchmark_reshape_and_cache_flash.py
+++ b/benchmarks/kernels/benchmark_reshape_and_cache_flash.py
@@ -7,9 +7,6 @@
 from tabulate import tabulate
 
 from vllm import _custom_ops as ops
-from vllm.attention.ops.triton_reshape_and_cache_flash import (
-    triton_reshape_and_cache_flash,
-)
 from vllm.logger import init_logger
 from vllm.utils.argparse_utils import FlexibleArgumentParser
 from vllm.utils.torch_utils import (
@@ -17,6 +14,9 @@
     create_kv_caches_with_random_flash,
     set_random_seed,
 )
+from vllm.v1.attention.ops.triton_reshape_and_cache_flash import (
+    triton_reshape_and_cache_flash,
+)
 
 logger = init_logger(__name__)
diff --git a/docs/contributing/model/basic.md b/docs/contributing/model/basic.md
index d37501b86556..28f6f960ab04 100644
--- a/docs/contributing/model/basic.md
+++ b/docs/contributing/model/basic.md
@@ -142,7 +142,7 @@ We use "mamba-like" to refer to layers that posses a state that is updated in-pl
 For implementing new custom mamba-like layers, one should inherit from `MambaBase` and implement the methods `get_state_dtype`, `get_state_shape` to calculate the data types and state shapes at runtime, as well as `mamba_type` and `get_attn_backend`. It is also necessary to implement the "attention meta-data" class which handles the meta-data that is common across all layers. Please see [`LinearAttentionMetadata`](../../../vllm/v1/attention/backends/linear_attn.py) or [`ShortConvAttentionMetadata`](../../../vllm/v1/attention/backends/short_conv_attn.py) for examples of this.
 
-It is also worth noting that we should update `MAMBA_TYPE_TO_BACKEND_MAP` and `MambaAttentionBackendEnum` in [`registry.py`](../../../vllm/attention/backends/registry.py) when adding a new mamba backend.
+It is also worth noting that we should update `MAMBA_TYPE_TO_BACKEND_MAP` and `MambaAttentionBackendEnum` in [`registry.py`](../../../vllm/v1/attention/backends/registry.py) when adding a new mamba backend.
 
 Finally, if one wants to support torch compile and CUDA graphs, it necessary to wrap the call to the mamba-like layer inside a custom op and register it. Please see the calls to `direct_register_custom_op` in [vllm/model_executor/models/minimax_text_01.py](../../../vllm/model_executor/models/minimax_text_01.py) or [vllm/model_executor/layers/mamba/short_conv.py](../../../vllm/model_executor/layers/mamba/short_conv.py) for examples of this. The new custom op should then be added to the list `_attention_ops` in [vllm/config/compilation.py](../../../vllm/config/compilation.py) to ensure that piecewise CUDA graphs works as intended.
diff --git a/docs/design/custom_op.md b/docs/design/custom_op.md
index 6beb0ce0c0b8..fd298a149ab0 100644
--- a/docs/design/custom_op.md
+++ b/docs/design/custom_op.md
@@ -60,7 +60,7 @@ For example:
 **1. Attention:**
 
 ```python
---8<-- "vllm/attention/layers/mm_encoder_attention.py:mm_encoder_attn"
+--8<-- "vllm/model_executor/layers/attention/mm_encoder_attention.py:mm_encoder_attn"
 --8<-- "vllm/model_executor/layers/mla.py:multi_head_latent_attention"
 ```
diff --git a/docs/design/plugin_system.md b/docs/design/plugin_system.md
index 6a4b5fd6b882..9cebaed51eeb 100644
--- a/docs/design/plugin_system.md
+++ b/docs/design/plugin_system.md
@@ -124,7 +124,7 @@ Every plugin has three parts:
 
     Please look at the worker base class [WorkerBase][vllm.v1.worker.worker_base.WorkerBase] for more functions that can be implemented.
 
-5. Implement the attention backend class `MyDummyAttention` in `my_dummy_attention.py`. The attention backend class should inherit from [AttentionBackend][vllm.attention.backends.abstract.AttentionBackend]. It's used to calculate attentions with your device. Take `vllm.v1.attention.backends` as examples, it contains many attention backend implementations.
+5. Implement the attention backend class `MyDummyAttention` in `my_dummy_attention.py`. The attention backend class should inherit from [AttentionBackend][vllm.v1.attention.backend.AttentionBackend]. It's used to calculate attentions with your device. Take `vllm.v1.attention.backends` as examples, it contains many attention backend implementations.
 
 6. Implement custom ops for high performance. Most ops can be ran by pytorch native implementation, while the performance may not be good. In this case, you can implement specific custom ops for your plugins. Currently, there are kinds of custom ops vLLM supports:
@@ -153,5 +153,5 @@ The interface for the model/module may change during vLLM's development. If you
 
 !!! warning "Deprecations"
     - `use_v1` parameter in `Platform.get_attn_backend_cls` is deprecated. It has been removed in v0.13.0.
-    - `_Backend` in `vllm.attention` is deprecated. It has been removed in v0.13.0. Please use `vllm.attention.backends.registry.register_backend` to add new attention backend to `AttentionBackendEnum` instead.
+    - `_Backend` in `vllm.attention` is deprecated. It has been removed in v0.13.0. Please use `vllm.v1.attention.backends.registry.register_backend` to add new attention backend to `AttentionBackendEnum` instead.
     - `seed_everything` platform interface is deprecated. It will be removed in v0.15.0 or later. Please use `vllm.utils.torch_utils.set_random_seed` instead.
diff --git a/examples/offline_inference/basic/embed.py b/examples/offline_inference/basic/embed.py
index 17f727b33d32..90793fb61666 100644
--- a/examples/offline_inference/basic/embed.py
+++ b/examples/offline_inference/basic/embed.py
@@ -4,10 +4,10 @@
 from argparse import Namespace
 
 from vllm import LLM, EngineArgs
-from vllm.attention.backends.registry import AttentionBackendEnum
 from vllm.config import AttentionConfig
 from vllm.platforms import current_platform
 from vllm.utils.argparse_utils import FlexibleArgumentParser
+from vllm.v1.attention.backends.registry import AttentionBackendEnum
 
 
 def parse_args():
diff --git a/examples/offline_inference/basic/score.py b/examples/offline_inference/basic/score.py
index b2dadffd249f..abe827043d78 100644
--- a/examples/offline_inference/basic/score.py
+++ b/examples/offline_inference/basic/score.py
@@ -4,10 +4,10 @@
 from argparse import Namespace
 
 from vllm import LLM, EngineArgs
-from vllm.attention.backends.registry import AttentionBackendEnum
 from vllm.config import AttentionConfig
 from vllm.platforms import current_platform
 from vllm.utils.argparse_utils import FlexibleArgumentParser
+from vllm.v1.attention.backends.registry import AttentionBackendEnum
 
 
 def parse_args():
diff --git a/tests/compile/fullgraph/test_full_cudagraph.py b/tests/compile/fullgraph/test_full_cudagraph.py
index c1f6f95d5630..c7c737371fc3 100644
--- a/tests/compile/fullgraph/test_full_cudagraph.py
+++ b/tests/compile/fullgraph/test_full_cudagraph.py
@@ -9,10 +9,10 @@
 from tests.utils import wait_for_gpu_memory_to_clear
 from tests.v1.attention.utils import full_cg_backend_configs as backend_configs
 from vllm import LLM, SamplingParams
-from vllm.attention.backends.registry import AttentionBackendEnum
 from vllm.config import CompilationConfig
 from vllm.platforms import current_platform
 from vllm.utils.torch_utils import is_torch_equal_or_newer
+from vllm.v1.attention.backends.registry import AttentionBackendEnum
 
 
 @contextlib.contextmanager
diff --git a/tests/compile/fullgraph/test_full_graph.py b/tests/compile/fullgraph/test_full_graph.py
index 7fe90c881177..209a879bfb9d 100644
--- a/tests/compile/fullgraph/test_full_graph.py
+++ b/tests/compile/fullgraph/test_full_graph.py
@@ -10,10 +10,10 @@
 
 from tests.quantization.utils import is_quant_method_supported
 from vllm import LLM, SamplingParams
-from vllm.attention.backends.registry import AttentionBackendEnum
 from vllm.config import CompilationConfig, CompilationMode, CUDAGraphMode, PassConfig
 from vllm.platforms import current_platform
 from vllm.utils.torch_utils import is_torch_equal_or_newer
+from vllm.v1.attention.backends.registry import AttentionBackendEnum
 
 from ...utils import create_new_process_for_each_test
diff --git a/tests/compile/test_fusion_attn.py b/tests/compile/test_fusion_attn.py
index 9e52de5c219d..a1fd098aee5f 100644
--- a/tests/compile/test_fusion_attn.py
+++ b/tests/compile/test_fusion_attn.py
@@ -9,8 +9,6 @@
 from tests.utils import flat_product
 from tests.v1.attention.utils import BatchSpec, create_common_attn_metadata
 from vllm._custom_ops import cutlass_scaled_fp4_mm, scaled_fp4_quant
-from vllm.attention.backends.abstract import AttentionMetadata
-from vllm.attention.backends.registry import AttentionBackendEnum
 from vllm.attention.layer import Attention
 from vllm.compilation.fusion_attn import ATTN_OP, AttnFusionPass
 from vllm.compilation.fx_utils import find_op_nodes
@@ -37,6 +35,8 @@
 from vllm.model_executor.layers.quantization.utils.w8a8_utils import Fp8LinearOp
 from vllm.platforms import current_platform
 from vllm.utils.flashinfer import has_flashinfer
+from vllm.v1.attention.backend import AttentionMetadata
+from vllm.v1.attention.backends.registry import AttentionBackendEnum
 from vllm.v1.kv_cache_interface import AttentionSpec
 
 FP8_DTYPE = current_platform.fp8_dtype()
diff --git a/tests/compile/test_qk_norm_rope_fusion.py b/tests/compile/test_qk_norm_rope_fusion.py
index e0968ac79925..45a114679beb 100644
--- a/tests/compile/test_qk_norm_rope_fusion.py
+++ b/tests/compile/test_qk_norm_rope_fusion.py
@@ -5,7 +5,6 @@
 import torch
 
 from tests.compile.backend import TestBackend
-from vllm.attention.backends.abstract import AttentionType
 from vllm.attention.layer import Attention
 from vllm.compilation.matcher_utils import FLASHINFER_ROTARY_OP, RMS_OP, ROTARY_OP
 from vllm.compilation.noop_elimination import NoOpEliminationPass
@@ -25,6 +24,7 @@
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding
 from vllm.platforms import current_platform
+from vllm.v1.attention.backend import AttentionType
 
 RSQRT_OP = torch.ops.aten.rsqrt.default
 INDEX_SELECT_OP = torch.ops.aten.index.Tensor
diff --git a/tests/config/test_multimodal_config.py b/tests/config/test_multimodal_config.py
index 3d02893e52f1..51bf938785e5 100644
--- a/tests/config/test_multimodal_config.py
+++ b/tests/config/test_multimodal_config.py
@@ -3,8 +3,8 @@
 
 import pytest
 
-from vllm.attention.backends.registry import AttentionBackendEnum
 from vllm.config.multimodal import MultiModalConfig
+from vllm.v1.attention.backends.registry import AttentionBackendEnum
 
 
 def test_mm_encoder_attn_backend_str_conversion():
diff --git a/tests/engine/test_arg_utils.py b/tests/engine/test_arg_utils.py
index 5bb5fcea2a94..2acb38bc9a18 100644
--- a/tests/engine/test_arg_utils.py
+++ b/tests/engine/test_arg_utils.py
@@ -299,7 +299,7 @@ def test_compilation_config():
 
 
 def test_attention_config():
-    from vllm.attention.backends.registry import AttentionBackendEnum
+    from vllm.v1.attention.backends.registry import AttentionBackendEnum
 
     parser = EngineArgs.add_cli_args(FlexibleArgumentParser())
diff --git a/tests/kernels/attention/test_aiter_flash_attn.py b/tests/kernels/attention/test_aiter_flash_attn.py
index 68ffb1ee34ad..cf24630c509f 100644
--- a/tests/kernels/attention/test_aiter_flash_attn.py
+++ b/tests/kernels/attention/test_aiter_flash_attn.py
@@ -6,9 +6,9 @@
 import torch
 
 import vllm.v1.attention.backends.rocm_aiter_fa  # noqa: F401
-from vllm.attention.utils.fa_utils import is_flash_attn_varlen_func_available
 from vllm.platforms import current_platform
 from vllm.utils.torch_utils import set_random_seed
+from vllm.v1.attention.backends.fa_utils import is_flash_attn_varlen_func_available
 
 NUM_HEADS = [(4, 4), (8, 2)]
 HEAD_SIZES = [128, 256]
diff --git a/tests/kernels/attention/test_attention.py b/tests/kernels/attention/test_attention.py
index 24b058ed24fa..94d494613fe7 100644
--- a/tests/kernels/attention/test_attention.py
+++ b/tests/kernels/attention/test_attention.py
@@ -10,7 +10,7 @@
 from tests.kernels.utils import opcheck
 from vllm import _custom_ops as ops
 from vllm.attention.layer import Attention
-from vllm.attention.layers.mm_encoder_attention import MMEncoderAttention
+from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
 from vllm.platforms import current_platform
 from vllm.utils.mem_utils import get_max_shared_memory_bytes
 from vllm.utils.torch_utils import set_random_seed
@@ -30,7 +30,7 @@
 NUM_HEADS = [(40, 40), (64, 8)]  # Arbitrary values for testing
 
 # This should be sync with get_supported_head_sizes() in
-# vllm.attention.ops.paged_attn.PagedAttention
+# vllm.v1.attention.ops.paged_attn.PagedAttention
 HEAD_SIZES = [32, 80, 128, 256]
 BLOCK_SIZES = [16, 32]
diff --git a/tests/kernels/attention/test_attention_selector.py b/tests/kernels/attention/test_attention_selector.py
index d62acc2022d1..a63297c3579e 100644
--- a/tests/kernels/attention/test_attention_selector.py
+++ b/tests/kernels/attention/test_attention_selector.py
@@ -6,13 +6,13 @@
 import pytest
 import torch
 
-from vllm.attention.backends.registry import AttentionBackendEnum
-from vllm.attention.selector import _cached_get_attn_backend, get_attn_backend
 from vllm.config import AttentionConfig, VllmConfig, set_current_vllm_config
 from vllm.platforms import current_platform
 from vllm.platforms.cpu import CpuPlatform
 from vllm.platforms.cuda import CudaPlatform
 from vllm.platforms.rocm import RocmPlatform
+from vllm.v1.attention.backends.registry import AttentionBackendEnum
+from vllm.v1.attention.selector import _cached_get_attn_backend, get_attn_backend
 
 
 @pytest.fixture(autouse=True)
@@ -182,7 +182,7 @@ def test_backend_selection(
             expected = name
         assert backend.get_name() == expected
     elif name == "FLASH_ATTN_MLA":
-        from vllm.attention.utils.fa_utils import (
+        from vllm.v1.attention.backends.fa_utils import (
             flash_attn_supports_mla,
         )
diff --git a/tests/kernels/attention/test_cache.py b/tests/kernels/attention/test_cache.py
index 19892ce26b6b..367a986ab63b 100644
--- a/tests/kernels/attention/test_cache.py
+++ b/tests/kernels/attention/test_cache.py
@@ -270,7 +270,7 @@ def permute_and_compact(x):
             v_scale,
         )
     elif implementation == "triton":
-        from vllm.attention.ops.triton_reshape_and_cache_flash import (
+        from vllm.v1.attention.ops.triton_reshape_and_cache_flash import (
             triton_reshape_and_cache_flash,
         )
diff --git a/tests/kernels/attention/test_flashmla.py b/tests/kernels/attention/test_flashmla.py
index 2151933a610d..6b3d3485db1d 100644
--- a/tests/kernels/attention/test_flashmla.py
+++ b/tests/kernels/attention/test_flashmla.py
@@ -7,12 +7,12 @@
 import pytest
 import torch
 
-from vllm.attention.ops.flashmla import (
+from vllm.triton_utils import triton
+from vllm.v1.attention.ops.flashmla import (
     flash_mla_with_kvcache,
     get_mla_metadata,
     is_flashmla_dense_supported,
 )
-from vllm.triton_utils import triton
 
 
 def cal_diff(
diff --git a/tests/kernels/attention/test_flashmla_sparse.py b/tests/kernels/attention/test_flashmla_sparse.py
index 7ee6f4b07b4a..c1147ae9edb1 100644
--- a/tests/kernels/attention/test_flashmla_sparse.py
+++ b/tests/kernels/attention/test_flashmla_sparse.py
@@ -5,7 +5,7 @@
 
 
 def test_sparse_flashmla_metadata_smoke():
-    import vllm.attention.ops.flashmla as fm
+    import vllm.v1.attention.ops.flashmla as fm
 
     ok, reason = fm.is_flashmla_sparse_supported()
     if not ok:
@@ -34,7 +34,7 @@ def test_sparse_flashmla_metadata_smoke():
 
 
 def test_sparse_flashmla_decode_smoke():
-    import vllm.attention.ops.flashmla as fm
+    import vllm.v1.attention.ops.flashmla as fm
 
     ok, reason = fm.is_flashmla_sparse_supported()
     if not ok:
@@ -97,7 +97,7 @@
 
 
 def test_sparse_flashmla_prefill_smoke():
-    import vllm.attention.ops.flashmla as fm
+    import vllm.v1.attention.ops.flashmla as fm
 
     ok, reason = fm.is_flashmla_sparse_supported()
     if not ok:
diff --git a/tests/kernels/attention/test_merge_attn_states.py b/tests/kernels/attention/test_merge_attn_states.py
index c7662223e1ca..a9f525cdc3ce 100644
--- a/tests/kernels/attention/test_merge_attn_states.py
+++ b/tests/kernels/attention/test_merge_attn_states.py
@@ -5,10 +5,10 @@
 
 import torch
 
 from vllm._custom_ops import merge_attn_states as merge_attn_states_cuda
-from vllm.attention.ops.triton_merge_attn_states import (
+from vllm.platforms import current_platform
+from vllm.v1.attention.ops.triton_merge_attn_states import (
     merge_attn_states as merge_attn_states_triton,
 )
-from vllm.platforms import current_platform
 
 # Naive PyTorch Implements section 2.2 of https://www.arxiv.org/pdf/2501.01005
diff --git a/tests/kernels/attention/test_mha_attn.py b/tests/kernels/attention/test_mha_attn.py
index 56912c1458b2..ecaea88674c2 100644
--- a/tests/kernels/attention/test_mha_attn.py
+++ b/tests/kernels/attention/test_mha_attn.py
@@ -12,14 +12,14 @@
 import pytest
 import torch
 
-from vllm.attention.backends.registry import AttentionBackendEnum
-from vllm.attention.layers.mm_encoder_attention import MMEncoderAttention
-from vllm.attention.selector import _cached_get_attn_backend
+from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
 from vllm.platforms import current_platform
 from vllm.platforms.cpu import CpuPlatform
 from vllm.platforms.cuda import CudaPlatform
 from vllm.platforms.rocm import RocmPlatform
 from vllm.utils.torch_utils import set_random_seed
+from vllm.v1.attention.backends.registry import AttentionBackendEnum
+from vllm.v1.attention.selector import _cached_get_attn_backend
 
 
 @pytest.fixture(autouse=True)
diff --git a/tests/kernels/attention/test_pack_unpack_triton.py b/tests/kernels/attention/test_pack_unpack_triton.py
index d2aa14738d9d..158ae550ef03 100644
--- a/tests/kernels/attention/test_pack_unpack_triton.py
+++ b/tests/kernels/attention/test_pack_unpack_triton.py
@@ -4,7 +4,7 @@
 import torch
 from torch.testing import assert_close
 
-from vllm.attention.ops.common import pack_seq_triton, unpack_seq_triton
+from vllm.v1.attention.ops.common import pack_seq_triton, unpack_seq_triton
 
 
 def test_pack_seq_basic_fp8():
diff --git a/tests/kernels/attention/test_prefix_prefill.py b/tests/kernels/attention/test_prefix_prefill.py
index 45779636e0a8..2dc4a3cd2c14 100644
--- a/tests/kernels/attention/test_prefix_prefill.py
+++ b/tests/kernels/attention/test_prefix_prefill.py
@@ -10,10 +10,12 @@
 import torch
 import torch.nn.functional as F
 
-from vllm.attention.ops.chunked_prefill_paged_decode import chunked_prefill_paged_decode
-from vllm.attention.ops.prefix_prefill import context_attention_fwd
 from vllm.platforms import current_platform
 from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE, set_random_seed
+from vllm.v1.attention.ops.chunked_prefill_paged_decode import (
+    chunked_prefill_paged_decode,
+)
+from vllm.v1.attention.ops.prefix_prefill import context_attention_fwd
 
 NUM_HEADS = [64]
 NUM_QUERIES_PER_KV = [1, 64]
diff --git a/tests/kernels/attention/test_rocm_attention_selector.py b/tests/kernels/attention/test_rocm_attention_selector.py
index f97d475eb47d..2a684ed70d60 100644
--- a/tests/kernels/attention/test_rocm_attention_selector.py
+++ b/tests/kernels/attention/test_rocm_attention_selector.py
@@ -4,10 +4,10 @@
 import pytest
 import torch
 
-from vllm.attention.backends.registry import AttentionBackendEnum
-from vllm.attention.selector import _cached_get_attn_backend, get_attn_backend
 from vllm.config import AttentionConfig, VllmConfig, set_current_vllm_config
 from vllm.platforms.rocm import RocmPlatform
+from vllm.v1.attention.backends.registry import AttentionBackendEnum
+from vllm.v1.attention.selector import _cached_get_attn_backend, get_attn_backend
 
 
 @pytest.fixture(autouse=True)
@@ -19,7 +19,7 @@ def clear_cache():
 @pytest.mark.skip(reason="Skipped for now. Should be revisited.")
 def test_selector(monkeypatch: pytest.MonkeyPatch):
     # Set the current platform to ROCm using monkeypatch
-    monkeypatch.setattr("vllm.attention.selector.current_platform", RocmPlatform())
+    monkeypatch.setattr("vllm.v1.attention.selector.current_platform", RocmPlatform())
 
     # Test standard ROCm attention
     attention_config = AttentionConfig(backend=AttentionBackendEnum.ROCM_ATTN)
diff --git a/tests/kernels/attention/test_triton_decode_attention.py b/tests/kernels/attention/test_triton_decode_attention.py
index 04085fe5fa0f..f6b066a7bd1e 100644
--- a/tests/kernels/attention/test_triton_decode_attention.py
+++ b/tests/kernels/attention/test_triton_decode_attention.py
@@ -4,8 +4,8 @@
 import pytest
 import torch
 
-from vllm.attention.ops.triton_decode_attention import decode_attention_fwd
 from vllm.utils.math_utils import cdiv
+from vllm.v1.attention.ops.triton_decode_attention import decode_attention_fwd
 
 
 @pytest.mark.parametrize("B", [3, 5])
diff --git a/tests/kernels/attention/test_triton_prefill_attention.py b/tests/kernels/attention/test_triton_prefill_attention.py
index 67c52cbfd452..f4505d91f5f7 100644
--- a/tests/kernels/attention/test_triton_prefill_attention.py
+++ b/tests/kernels/attention/test_triton_prefill_attention.py
@@ -5,7 +5,7 @@
 import torch
 import torch.nn.functional as F
 
-from vllm.attention.ops.triton_prefill_attention import context_attention_fwd
+from vllm.v1.attention.ops.triton_prefill_attention import context_attention_fwd
 
 
 def ref_masked_attention(
diff --git a/tests/kernels/attention/test_triton_unified_attention.py b/tests/kernels/attention/test_triton_unified_attention.py
index 55e3593481cb..a28982250f9c 100644
--- a/tests/kernels/attention/test_triton_unified_attention.py
+++ b/tests/kernels/attention/test_triton_unified_attention.py
@@ -5,10 +5,10 @@
 import pytest
 import torch
 
-from vllm.attention.ops.triton_unified_attention import unified_attention
 from vllm.platforms import current_platform
 from vllm.utils.math_utils import next_power_of_2
 from vllm.utils.torch_utils import set_random_seed
+from vllm.v1.attention.ops.triton_unified_attention import unified_attention
 
 NUM_HEADS = [(4, 4), (8, 2)]
 HEAD_SIZES = [128, 256]
diff --git a/tests/kernels/utils.py b/tests/kernels/utils.py
index 72c79370d19c..ccdacf40c430 100644
--- a/tests/kernels/utils.py
+++ b/tests/kernels/utils.py
@@ -13,11 +13,11 @@
 from torch._prims_common import TensorLikeType
 
 from tests.kernels.quant_utils import native_w8a8_block_matmul
-from vllm.attention.backends.abstract import AttentionType
 from vllm.model_executor.custom_op import CustomOp
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.fused_moe.utils import moe_kernel_quantize_input
 from vllm.utils.torch_utils import make_tensor_with_pad
+from vllm.v1.attention.backend import AttentionType
 
 # For now, disable "test_aot_dispatch_dynamic" since there are some
 # bugs related to this test in PyTorch 2.4.
diff --git a/tests/models/multimodal/generation/test_vit_backend_functionality.py b/tests/models/multimodal/generation/test_vit_backend_functionality.py
index 8cea6135ba6a..8f141746e249 100644
--- a/tests/models/multimodal/generation/test_vit_backend_functionality.py
+++ b/tests/models/multimodal/generation/test_vit_backend_functionality.py
@@ -14,10 +14,10 @@
 from transformers import AutoProcessor
 
 from vllm import LLM, EngineArgs, SamplingParams
-from vllm.attention.backends.registry import AttentionBackendEnum
 from vllm.multimodal.utils import encode_image_url
 from vllm.multimodal.video import sample_frames_from_video
 from vllm.platforms import current_platform
+from vllm.v1.attention.backends.registry import AttentionBackendEnum
 
 from ....utils import create_new_process_for_each_test
 from ...utils import dummy_hf_overrides
diff --git a/tests/models/quantization/test_fp8.py b/tests/models/quantization/test_fp8.py
index f3b85ba0ee39..9be5fd33022f 100644
--- a/tests/models/quantization/test_fp8.py
+++ b/tests/models/quantization/test_fp8.py
@@ -9,7 +9,7 @@
 import pytest
 
 from tests.quantization.utils import is_quant_method_supported
-from vllm.attention.utils.fa_utils import flash_attn_supports_fp8
+from vllm.v1.attention.backends.fa_utils import flash_attn_supports_fp8
 from vllm.platforms import current_platform
 
 from ..utils import check_logprobs_close
diff --git a/tests/test_attention_backend_registry.py b/tests/test_attention_backend_registry.py
index 7b90b949aa45..034749874d7b 100644
--- a/tests/test_attention_backend_registry.py
+++ b/tests/test_attention_backend_registry.py
@@ -1,10 +1,10 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from vllm.attention.backends.abstract import (
+from vllm.v1.attention.backend import (
     AttentionBackend,
     AttentionImpl,
 )
-from vllm.attention.backends.registry import (
+from vllm.v1.attention.backends.registry import (
     AttentionBackendEnum,
     MambaAttentionBackendEnum,
     register_backend,
diff --git a/tests/v1/attention/test_attention_backends.py b/tests/v1/attention/test_attention_backends.py
index 80714ac5a34c..2068c30c0916 100644
--- a/tests/v1/attention/test_attention_backends.py
+++ b/tests/v1/attention/test_attention_backends.py
@@ -15,8 +15,6 @@
     create_vllm_config,
     try_get_attention_backend,
 )
-from vllm.attention.backends.abstract import AttentionType
-from vllm.attention.backends.registry import AttentionBackendEnum
 from vllm.config import ModelConfig
 from vllm.platforms import current_platform
 from vllm.utils.math_utils import cdiv
@@ -25,6 +23,8 @@
     is_torch_equal_or_newer,
     set_random_seed,
 )
+from vllm.v1.attention.backend import AttentionType
+from vllm.v1.attention.backends.registry import AttentionBackendEnum
 from vllm.v1.attention.backends.utils import (
     CommonAttentionMetadata,
     set_kv_cache_layout,
diff --git a/tests/v1/attention/test_mla_backends.py b/tests/v1/attention/test_mla_backends.py
index 514bd0526a9e..de80c556bcba 100644
--- a/tests/v1/attention/test_mla_backends.py
+++ b/tests/v1/attention/test_mla_backends.py
@@ -18,15 +18,15 @@
     try_get_attention_backend,
 )
 from vllm import _custom_ops as ops
-from vllm.attention.backends.registry import AttentionBackendEnum
-from vllm.attention.ops.flashmla import is_flashmla_dense_supported
-from vllm.attention.utils.fa_utils import flash_attn_supports_mla
 from vllm.config.vllm import set_current_vllm_config
 from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase
 from vllm.utils.math_utils import cdiv
 from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
+from vllm.v1.attention.backends.fa_utils import flash_attn_supports_mla
 from vllm.v1.attention.backends.mla.common import QueryLenSupport
+from vllm.v1.attention.backends.registry import AttentionBackendEnum
 from vllm.v1.attention.backends.utils import CommonAttentionMetadata
+from vllm.v1.attention.ops.flashmla import is_flashmla_dense_supported
 from vllm.v1.kv_cache_interface import FullAttentionSpec
 
 BACKENDS_TO_TEST = [
diff --git a/tests/v1/attention/test_rocm_attention_backends_selection.py b/tests/v1/attention/test_rocm_attention_backends_selection.py
index 77faeb93dfc1..a31c053aed21 100644
--- a/tests/v1/attention/test_rocm_attention_backends_selection.py
+++ b/tests/v1/attention/test_rocm_attention_backends_selection.py
@@ -7,9 +7,9 @@
 import pytest
 import torch
 
-from vllm.attention.backends.registry import AttentionBackendEnum
-from vllm.attention.selector import AttentionSelectorConfig
 from vllm.platforms import current_platform
+from vllm.v1.attention.backends.registry import AttentionBackendEnum
+from vllm.v1.attention.selector import AttentionSelectorConfig
 
 # ROCm-specific attention backend selection tests
 pytestmark = pytest.mark.skipif(
diff --git a/tests/v1/attention/test_sparse_mla_backends.py b/tests/v1/attention/test_sparse_mla_backends.py
index 2b63253b3601..1c24cd82f071 100644
--- a/tests/v1/attention/test_sparse_mla_backends.py
+++ b/tests/v1/attention/test_sparse_mla_backends.py
@@ -21,7 +21,6 @@
     create_vllm_config,
 )
 from vllm import _custom_ops as ops
-from vllm.attention.ops import flashmla
 from vllm.config import set_current_vllm_config
 from vllm.model_executor.layers.linear import ColumnParallelLinear
 from vllm.platforms import current_platform
@@ -31,6 +30,7 @@
     triton_convert_req_index_to_global_index,
 )
 from vllm.v1.attention.backends.utils import split_prefill_chunks
+from vllm.v1.attention.ops import flashmla
 
 SPARSE_BACKEND_BATCH_SPECS = {
     name: BATCH_SPECS[name]
diff --git a/tests/v1/attention/utils.py b/tests/v1/attention/utils.py
index 031436a03090..71e74f4d5af4 100644
--- a/tests/v1/attention/utils.py
+++ b/tests/v1/attention/utils.py
@@ -7,8 +7,6 @@
 import pytest
 import torch
 
-from vllm.attention.backends.abstract import AttentionImpl
-from vllm.attention.backends.registry import AttentionBackendEnum
 from vllm.config import (
     CacheConfig,
     CompilationConfig,
@@ -20,6 +18,8 @@
     VllmConfig,
 )
 from vllm.config.model import ModelDType
+from vllm.v1.attention.backend import AttentionImpl
+from vllm.v1.attention.backends.registry import AttentionBackendEnum
 from vllm.v1.attention.backends.utils import (
     AttentionMetadataBuilder,
     CommonAttentionMetadata,
diff --git a/tests/v1/determinism/utils.py b/tests/v1/determinism/utils.py
index a8013ed229cf..485eb26c7b9b 100644
--- a/tests/v1/determinism/utils.py
+++ b/tests/v1/determinism/utils.py
@@ -6,9 +6,9 @@
 import pytest
 import torch
 
-from vllm.attention.utils.fa_utils import flash_attn_supports_mla
 from vllm.platforms import current_platform
 from vllm.utils.flashinfer import has_flashinfer
+from vllm.v1.attention.backends.fa_utils import flash_attn_supports_mla
 
 skip_unsupported = pytest.mark.skipif(
     not (current_platform.is_cuda() and current_platform.has_device_capability(80)),
diff --git a/tests/v1/kv_connector/unit/test_backwards_compatibility.py b/tests/v1/kv_connector/unit/test_backwards_compatibility.py
index 0d29ca5fca5e..da6a5aadbc6d 100644
--- a/tests/v1/kv_connector/unit/test_backwards_compatibility.py
+++ b/tests/v1/kv_connector/unit/test_backwards_compatibility.py
@@ -14,12 +14,12 @@
 
 import pytest
 
-from vllm.attention.backends.abstract import AttentionMetadata
 from vllm.distributed.kv_transfer.kv_connector.factory import KVConnectorFactory
 from vllm.distributed.kv_transfer.kv_connector.v1 import (
     KVConnectorBase_V1,
     KVConnectorRole,
 )
+from vllm.v1.attention.backend import AttentionMetadata
 from vllm.v1.core.sched.output import SchedulerOutput
 
 from .utils import create_scheduler, create_vllm_config
diff --git a/tests/v1/spec_decode/test_eagle.py b/tests/v1/spec_decode/test_eagle.py
index a5e326e82c59..3158ff0bda95 100644
--- a/tests/v1/spec_decode/test_eagle.py
+++ b/tests/v1/spec_decode/test_eagle.py
@@ -13,7 +13,6 @@
     create_standard_kv_cache_spec,
     try_get_attention_backend,
 )
-from vllm.attention.backends.registry import AttentionBackendEnum
 from vllm.config import (
     AttentionConfig,
     CacheConfig,
@@ -27,6 +26,7 @@
 from vllm.config.load import LoadConfig
 from vllm.model_executor.models.llama import LlamaForCausalLM
 from vllm.platforms import current_platform
+from vllm.v1.attention.backends.registry import AttentionBackendEnum
 from vllm.v1.spec_decode.eagle import EagleProposer
 from vllm.v1.spec_decode.metadata import SpecDecodeMetadata
 from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch
diff --git a/tests/v1/spec_decode/test_mtp.py b/tests/v1/spec_decode/test_mtp.py
index 3b8813ceb818..b33dc58ffe3a 100644
--- a/tests/v1/spec_decode/test_mtp.py
+++ b/tests/v1/spec_decode/test_mtp.py
@@ -12,7 +12,6 @@
     create_standard_kv_cache_spec,
     try_get_attention_backend,
 )
-from vllm.attention.backends.registry import AttentionBackendEnum
 from vllm.config import (
     CacheConfig,
     DeviceConfig,
@@ -25,6 +24,7 @@
 from vllm.config.load import LoadConfig
 from vllm.model_executor.models.llama import LlamaForCausalLM
 from vllm.platforms import current_platform
+from vllm.v1.attention.backends.registry import AttentionBackendEnum
 from vllm.v1.spec_decode.eagle import EagleProposer
 
 mimo_7b_dir = "XiaomiMiMo/MiMo-7B-Base"
diff --git a/tests/v1/spec_decode/test_tree_attention.py b/tests/v1/spec_decode/test_tree_attention.py
index 0afeeb8914b8..a0f140ccac0f 100644
--- a/tests/v1/spec_decode/test_tree_attention.py
+++ b/tests/v1/spec_decode/test_tree_attention.py
@@ -11,9 +11,9 @@
     create_vllm_config,
     try_get_attention_backend,
 )
-from vllm.attention.backends.registry import AttentionBackendEnum
-from vllm.attention.utils.fa_utils import is_flash_attn_varlen_func_available
 from vllm.config import ParallelConfig, SpeculativeConfig
+from vllm.v1.attention.backends.fa_utils import is_flash_attn_varlen_func_available
+from vllm.v1.attention.backends.registry import AttentionBackendEnum
 from vllm.v1.attention.backends.utils import CommonAttentionMetadata
 
 if not is_flash_attn_varlen_func_available():
diff --git a/tests/v1/worker/test_gpu_model_runner.py b/tests/v1/worker/test_gpu_model_runner.py
index 5108729ae578..badbd3e9adff 100644
--- a/tests/v1/worker/test_gpu_model_runner.py
+++ b/tests/v1/worker/test_gpu_model_runner.py
@@ -5,8 +5,6 @@
 import pytest
 import torch
 
-from vllm.attention.backends.abstract import MultipleOf
-from vllm.attention.backends.registry import AttentionBackendEnum
 from vllm.attention.layer import Attention
 from vllm.config import (
     AttentionConfig,
@@ -27,6 +25,8 @@
 from vllm.utils.mem_constants import GiB_bytes
 from vllm.utils.system_utils import update_environment_variables
 from vllm.utils.torch_utils import set_random_seed
+from vllm.v1.attention.backend import MultipleOf
+from vllm.v1.attention.backends.registry import AttentionBackendEnum
 from vllm.v1.core.kv_cache_utils import estimate_max_model_len, get_kv_cache_configs
 from vllm.v1.core.sched.output import CachedRequestData, NewRequestData, SchedulerOutput
 from vllm.v1.kv_cache_interface import (
diff --git a/tools/pre_commit/mypy.py b/tools/pre_commit/mypy.py
index cb98a856c49f..4b7f85077962 100755
--- a/tools/pre_commit/mypy.py
+++ b/tools/pre_commit/mypy.py
@@ -73,7 +73,9 @@
     "vllm/model_executor/models",
     "vllm/model_executor/layers/fla/ops",
     # Ignore triton kernels in ops.
-    "vllm/attention/ops",
+    "vllm/v1/attention/ops",
+    # TODO(matt): remove.
+    "vllm/v1/attention/backends/fa_utils.py",
 ]
diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py
index a09666b65a99..411d11e5a23f 100644
--- a/vllm/attention/layer.py
+++ b/vllm/attention/layer.py
@@ -8,13 +8,6 @@
 import torch.nn as nn
 
 import vllm.envs as envs
-from vllm.attention.backends.abstract import (
-    AttentionBackend,
-    AttentionType,
-    MLAAttentionImpl,
-)
-from vllm.attention.backends.registry import AttentionBackendEnum
-from vllm.attention.selector import get_attn_backend
 from vllm.attention.utils.kv_sharing_utils import validate_kv_sharing_target
 from vllm.attention.utils.kv_transfer_utils import maybe_transfer_kv_layer
 from vllm.config import CacheConfig, get_current_vllm_config
@@ -37,6 +30,13 @@
     direct_register_custom_op,
     kv_cache_dtype_str_to_dtype,
 )
+from vllm.v1.attention.backend import (
+    AttentionBackend,
+    AttentionType,
+    MLAAttentionImpl,
+)
+from vllm.v1.attention.backends.registry import AttentionBackendEnum
+from vllm.v1.attention.selector import get_attn_backend
 from vllm.v1.kv_cache_interface import (
     FullAttentionSpec,
     KVCacheSpec,
diff --git a/vllm/attention/ops/__init__.py b/vllm/attention/ops/__init__.py
deleted file mode 100644
index e69de29bb2d1..000000000000
diff --git a/vllm/config/attention.py b/vllm/config/attention.py
index dd62d88826bd..293045787a1c 100644
--- a/vllm/config/attention.py
+++ b/vllm/config/attention.py
@@ -6,9 +6,9 @@
 from pydantic import field_validator
 from pydantic.dataclasses import dataclass
 
-from vllm.attention.backends.registry import AttentionBackendEnum
 from vllm.config.utils import config
 from vllm.logger import init_logger
+from vllm.v1.attention.backends.registry import AttentionBackendEnum
 
 logger = init_logger(__name__)
diff --git a/vllm/config/model.py b/vllm/config/model.py
index c8b677695473..bec1de55460c 100644
--- a/vllm/config/model.py
+++ b/vllm/config/model.py
@@ -12,7 +12,6 @@
 from pydantic.dataclasses import dataclass
 
 import vllm.envs as envs
-from vllm.attention.backends.registry import AttentionBackendEnum
 from vllm.config.model_arch import (
     ModelArchitectureConfig,
 )
@@ -50,6 +49,7 @@
 from vllm.transformers_utils.runai_utils import ObjectStorageModel, is_runai_obj_uri
 from vllm.transformers_utils.utils import maybe_model_redirect
 from vllm.utils.import_utils import LazyLoader
+from vllm.v1.attention.backends.registry import AttentionBackendEnum
 
 if TYPE_CHECKING:
     from transformers import PretrainedConfig
diff --git a/vllm/config/multimodal.py b/vllm/config/multimodal.py
index 8a2936de96d6..ecb346af8f3c 100644
--- a/vllm/config/multimodal.py
+++ b/vllm/config/multimodal.py
@@ -7,9 +7,9 @@
 from pydantic import ConfigDict, Field, field_validator, model_validator
 from pydantic.dataclasses import dataclass
 
-from vllm.attention.backends.registry import AttentionBackendEnum
 from vllm.config.utils import config
 from vllm.utils.hashing import safe_hash
+from vllm.v1.attention.backends.registry import AttentionBackendEnum
 
 
 @dataclass
@@ -124,7 +124,7 @@ class MultiModalConfig:
     mm_encoder_attn_backend: AttentionBackendEnum | None = None
     """Optional override for the multi-modal encoder attention backend when
    using vision transformers. Accepts any value from
-    `vllm.attention.backends.registry.AttentionBackendEnum` (e.g. `FLASH_ATTN`)."""
+    `vllm.v1.attention.backends.registry.AttentionBackendEnum` (e.g. `FLASH_ATTN`)."""
     interleave_mm_strings: bool = False
     """Enable fully interleaved support for multimodal prompts, while using
     --chat-template-content-format=string."""
diff --git a/vllm/distributed/kv_transfer/kv_connector/utils.py b/vllm/distributed/kv_transfer/kv_connector/utils.py
index 914ab91b1563..1a09f2e6b272 100644
--- a/vllm/distributed/kv_transfer/kv_connector/utils.py
+++ b/vllm/distributed/kv_transfer/kv_connector/utils.py
@@ -10,10 +10,10 @@
 
 import torch
 
-from vllm.attention.backends.abstract import AttentionBackend
 from vllm.config import get_current_vllm_config
 from vllm.distributed.kv_transfer.kv_connector.factory import KVConnectorFactory
 from vllm.logger import init_logger
+from vllm.v1.attention.backend import AttentionBackend
 from vllm.v1.outputs import KVConnectorOutput, ModelRunnerOutput
 
 if TYPE_CHECKING:
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/base.py b/vllm/distributed/kv_transfer/kv_connector/v1/base.py
index 0829336f0d50..fd997d67ef13 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/base.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/base.py
@@ -42,8 +42,8 @@
 
 import torch
 
-from vllm.attention.backends.abstract import AttentionBackend, AttentionMetadata
 from vllm.logger import init_logger
+from vllm.v1.attention.backend import AttentionBackend, AttentionMetadata
 from vllm.v1.core.sched.output import SchedulerOutput
 from vllm.v1.outputs import KVConnectorOutput
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/decode_bench_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/decode_bench_connector.py
index e9b2bd392b0e..525061fc0087 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/decode_bench_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/decode_bench_connector.py
@@ -36,7 +36,6 @@
 
 import torch
 
-from vllm.attention.backends.abstract import AttentionMetadata
 from vllm.distributed.kv_transfer.kv_connector.v1 import (
     KVConnectorBase_V1,
     KVConnectorRole,
@@ -44,6 +43,7 @@
 from vllm.distributed.kv_transfer.kv_connector.v1.base import KVConnectorMetadata
 from vllm.logger import init_logger
 from vllm.utils.math_utils import cdiv
+from vllm.v1.attention.backend import AttentionMetadata
 
 if TYPE_CHECKING:
     from vllm.config import VllmConfig
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/example_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/example_connector.py
index 41243fc866b5..ca2647194cec 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/example_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/example_connector.py
@@ -7,7 +7,6 @@
 import safetensors
 import torch
 
-from vllm.attention.backends.abstract import AttentionMetadata
 from vllm.config import VllmConfig
 from vllm.distributed.kv_transfer.kv_connector.v1.base import (
     KVConnectorBase_V1,
@@ -16,6 +15,7 @@
 )
 from vllm.logger import init_logger
 from vllm.utils.hashing import safe_hash
+from vllm.v1.attention.backend import AttentionMetadata
 from vllm.v1.attention.backends.mla.common import MLACommonMetadata
 from vllm.v1.core.sched.output import SchedulerOutput
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py
index 7869e08f19a1..ae2d7442d631 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py
@@ -5,7 +5,6 @@
 
 import torch
 
-from vllm.attention.backends.abstract import AttentionMetadata
 from vllm.config import VllmConfig
 from vllm.distributed.kv_events import (
     BlockStored,
@@ -19,6 +18,7 @@
     KVConnectorRole,
 )
 from vllm.logger import init_logger
+from vllm.v1.attention.backend import AttentionMetadata
 from vllm.v1.core.sched.output import SchedulerOutput
 from vllm.v1.outputs import KVConnectorOutput
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/vllm_v1_adapter.py b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/vllm_v1_adapter.py
index beeffd7c63db..8159832cc342 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/vllm_v1_adapter.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/vllm_v1_adapter.py
@@ -36,7 +36,6 @@
     PluginLauncher as RuntimePluginLauncher,
 )
 
-from vllm.attention.backends.abstract import AttentionMetadata
 from vllm.config import VllmConfig
 from vllm.distributed.kv_transfer.kv_connector.v1.base import (
     KVConnectorBase_V1,
@@ -54,6 +53,7 @@
 from vllm.sampling_params import SamplingParams
 from vllm.utils.math_utils import cdiv
 from vllm.utils.torch_utils import get_kv_cache_torch_dtype
+from vllm.v1.attention.backend import AttentionMetadata
 from vllm.v1.core.sched.output import SchedulerOutput
 from vllm.version import __version__ as VLLM_VERSION
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_mp_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_mp_connector.py
index 9ebd2b1a3155..629170615dd8 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_mp_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_mp_connector.py
@@ -10,13 +10,13 @@
 from lmcache.integration.vllm.utils import mla_enabled
 from lmcache.utils import init_logger as lmcache_init_logger
 
-from vllm.attention.backends.abstract import AttentionMetadata
 from vllm.config import VllmConfig
 from vllm.distributed.kv_transfer.kv_connector.v1.base import (
     KVConnectorBase_V1,
     KVConnectorMetadata,
     KVConnectorRole,
 )
+from vllm.v1.attention.backend import AttentionMetadata
 from vllm.v1.core.sched.output import SchedulerOutput
 from vllm.v1.outputs import KVConnectorOutput
 from vllm.v1.request import RequestStatus
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/mooncake_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/mooncake_connector.py
index 91f0c6d481f8..2c604617297c 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/mooncake_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/mooncake_connector.py
@@ -16,8 +16,6 @@
 import zmq.asyncio
 
 from vllm import envs
-from vllm.attention.backends.abstract import AttentionMetadata
-from vllm.attention.selector import get_attn_backend
 from vllm.config import VllmConfig
 from vllm.distributed.kv_transfer.kv_connector.utils import TpKVTopology
 from vllm.distributed.kv_transfer.kv_connector.v1.base import (
@@ -33,7 +31,9 @@
 from vllm.forward_context import ForwardContext
 from vllm.logger import init_logger
 from vllm.utils.network_utils import get_ip, make_zmq_path, make_zmq_socket
+from vllm.v1.attention.backend import AttentionMetadata
from vllm.v1.attention.backends.utils import get_kv_cache_layout +from vllm.v1.attention.selector import get_attn_backend from vllm.v1.core.sched.output import SchedulerOutput from vllm.v1.request import RequestStatus diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_connector.py index 4b6bd906d5d4..abdbeb9e416e 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_connector.py @@ -15,7 +15,6 @@ import torch import zmq -from vllm.attention.selector import get_attn_backend from vllm.config import VllmConfig from vllm.distributed.kv_transfer.kv_connector.v1.base import ( KVConnectorBase_V1, @@ -56,11 +55,12 @@ make_zmq_path, make_zmq_socket, ) +from vllm.v1.attention.selector import get_attn_backend from vllm.v1.core.sched.output import SchedulerOutput from vllm.v1.request import RequestStatus if TYPE_CHECKING: - from vllm.attention.backends.abstract import AttentionMetadata + from vllm.v1.attention.backend import AttentionMetadata from vllm.v1.core.kv_cache_manager import KVCacheBlocks from vllm.v1.kv_cache_interface import KVCacheConfig from vllm.v1.request import Request diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py index 3fa1cdc1e100..412e2c57133f 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py @@ -7,7 +7,6 @@ import torch -from vllm.attention.backends.abstract import AttentionBackend, AttentionMetadata from vllm.config import VllmConfig from vllm.config.kv_transfer import KVTransferConfig from vllm.distributed.kv_transfer.kv_connector.base import KVConnectorBaseType @@ -24,6 +23,7 @@ PromMetricT, ) from vllm.logger import init_logger +from vllm.v1.attention.backend import AttentionBackend, AttentionMetadata from vllm.v1.core.sched.output import SchedulerOutput from vllm.v1.outputs import KVConnectorOutput diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py index 8177a26a4c1f..dc50ea678327 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py @@ -20,8 +20,6 @@ import zmq from vllm import envs -from vllm.attention.backends.abstract import AttentionMetadata -from vllm.attention.selector import get_attn_backend from vllm.config import VllmConfig from vllm.distributed.kv_transfer.kv_connector.utils import ( EngineId, @@ -50,7 +48,9 @@ from vllm.logger import init_logger from vllm.platforms import current_platform from vllm.utils.network_utils import make_zmq_path, make_zmq_socket +from vllm.v1.attention.backend import AttentionMetadata from vllm.v1.attention.backends.utils import get_kv_cache_layout +from vllm.v1.attention.selector import get_attn_backend from vllm.v1.core.sched.output import SchedulerOutput from vllm.v1.worker.block_table import BlockTable diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py index 7f03e0d88b9d..67cf4b047025 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py @@ -8,7 +8,6 @@ import torch -from 
-from vllm.attention.backends.abstract import AttentionBackend, AttentionMetadata
 from vllm.attention.layer import Attention
 from vllm.config import VllmConfig, get_layers_from_vllm_config
 from vllm.distributed.kv_events import BlockRemoved, BlockStored, KVCacheEvent
@@ -20,6 +19,7 @@
 from vllm.distributed.kv_transfer.kv_connector.v1.base import KVConnectorMetadata
 from vllm.forward_context import ForwardContext
 from vllm.logger import init_logger
+from vllm.v1.attention.backend import AttentionBackend, AttentionMetadata
 from vllm.v1.core.kv_cache_manager import KVCacheBlocks
 from vllm.v1.core.kv_cache_utils import BlockHash
 from vllm.v1.core.sched.output import SchedulerOutput
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py
index 8f3a62d7bcdb..09e3b0333040 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py
@@ -7,7 +7,6 @@
 import regex as re
 import torch

-from vllm.attention.backends.abstract import AttentionMetadata
 from vllm.config import VllmConfig
 from vllm.distributed.kv_transfer.kv_connector.v1.base import (
     KVConnectorBase_V1,
@@ -19,6 +18,7 @@
 )
 from vllm.distributed.parallel_state import get_world_group
 from vllm.logger import init_logger
+from vllm.v1.attention.backend import AttentionMetadata
 from vllm.v1.attention.backends.mla.common import MLACommonMetadata
 from vllm.v1.core.sched.output import SchedulerOutput
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 55f4c280ab3b..94608b13dfd4 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -32,7 +32,6 @@
 from typing_extensions import TypeIs

 import vllm.envs as envs
-from vllm.attention.backends.registry import AttentionBackendEnum
 from vllm.config import (
     AttentionConfig,
     CacheConfig,
@@ -94,6 +93,7 @@
 from vllm.utils.mem_constants import GiB_bytes
 from vllm.utils.network_utils import get_ip
 from vllm.utils.torch_utils import resolve_kv_cache_dtype_string
+from vllm.v1.attention.backends.registry import AttentionBackendEnum
 from vllm.v1.sample.logits_processor import LogitsProcessor

 if TYPE_CHECKING:
diff --git a/vllm/envs.py b/vllm/envs.py
index 63b1954abac2..74f0b2177f86 100755
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -683,7 +683,7 @@ def get_vllm_port() -> int | None:
         None,
         lambda: list(
             __import__(
-                "vllm.attention.backends.registry", fromlist=["AttentionBackendEnum"]
+                "vllm.v1.attention.backends.registry", fromlist=["AttentionBackendEnum"]
             ).AttentionBackendEnum.__members__.keys()
         ),
     ),
diff --git a/vllm/forward_context.py b/vllm/forward_context.py
index 1ef8e5403e9e..9ef0569e85fe 100644
--- a/vllm/forward_context.py
+++ b/vllm/forward_context.py
@@ -10,10 +10,10 @@
 import torch

 import vllm.envs as envs
-from vllm.attention.backends.abstract import AttentionMetadata
 from vllm.config import CUDAGraphMode, ParallelConfig, VllmConfig
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
+from vllm.v1.attention.backend import AttentionMetadata
 from vllm.v1.worker.dp_utils import coordinate_batch_across_dp
 from vllm.v1.worker.ubatch_utils import UBatchSlices
diff --git a/vllm/attention/backends/__init__.py b/vllm/model_executor/layers/attention/__init__.py
similarity index 100%
rename from vllm/attention/backends/__init__.py
rename to vllm/model_executor/layers/attention/__init__.py
diff --git a/vllm/attention/layers/chunked_local_attention.py b/vllm/model_executor/layers/attention/chunked_local_attention.py
similarity index 97%
rename from vllm/attention/layers/chunked_local_attention.py
rename to vllm/model_executor/layers/attention/chunked_local_attention.py
index 7e3794d40833..a34506934bde 100644
--- a/vllm/attention/layers/chunked_local_attention.py
+++ b/vllm/model_executor/layers/attention/chunked_local_attention.py
@@ -4,12 +4,11 @@
 import torch

-from vllm.attention.backends.abstract import AttentionBackend
 from vllm.attention.layer import Attention
-from vllm.attention.selector import get_attn_backend
 from vllm.config import CacheConfig
 from vllm.config.vllm import VllmConfig
 from vllm.model_executor.layers.quantization import QuantizationConfig
+from vllm.v1.attention.backend import AttentionBackend
 from vllm.v1.attention.backends.utils import (
     AttentionCGSupport,
     AttentionMetadataBuilder,
@@ -17,6 +16,7 @@
     make_local_attention_virtual_batches,
     subclass_attention_backend,
 )
+from vllm.v1.attention.selector import get_attn_backend
 from vllm.v1.kv_cache_interface import (
     AttentionSpec,
     ChunkedLocalAttentionSpec,
diff --git a/vllm/attention/layers/cross_attention.py b/vllm/model_executor/layers/attention/cross_attention.py
similarity index 98%
rename from vllm/attention/layers/cross_attention.py
rename to vllm/model_executor/layers/attention/cross_attention.py
index f58c9d541775..9c3bc3403940 100644
--- a/vllm/attention/layers/cross_attention.py
+++ b/vllm/model_executor/layers/attention/cross_attention.py
@@ -6,20 +6,20 @@
 import numpy as np
 import torch

-from vllm.attention.backends.abstract import (
-    AttentionBackend,
-    AttentionMetadata,
-    AttentionType,
-)
 from vllm.attention.layer import Attention
-from vllm.attention.selector import get_attn_backend
 from vllm.config import CacheConfig, VllmConfig
 from vllm.logger import init_logger
 from vllm.utils.math_utils import cdiv
+from vllm.v1.attention.backend import (
+    AttentionBackend,
+    AttentionMetadata,
+    AttentionType,
+)
 from vllm.v1.attention.backends.utils import (
     CommonAttentionMetadata,
     subclass_attention_backend,
 )
+from vllm.v1.attention.selector import get_attn_backend
 from vllm.v1.kv_cache_interface import CrossAttentionSpec, KVCacheSpec

 logger = init_logger(__name__)
diff --git a/vllm/attention/layers/encoder_only_attention.py b/vllm/model_executor/layers/attention/encoder_only_attention.py
similarity index 96%
rename from vllm/attention/layers/encoder_only_attention.py
rename to vllm/model_executor/layers/attention/encoder_only_attention.py
index 5e99c9901003..c130fd095652 100644
--- a/vllm/attention/layers/encoder_only_attention.py
+++ b/vllm/model_executor/layers/attention/encoder_only_attention.py
@@ -5,19 +5,19 @@
 import torch

-from vllm.attention.backends.abstract import (
+from vllm.attention.layer import Attention
+from vllm.config import CacheConfig
+from vllm.config.vllm import VllmConfig
+from vllm.v1.attention.backend import (
     AttentionBackend,
     AttentionMetadata,
     AttentionType,
 )
-from vllm.attention.layer import Attention
-from vllm.attention.selector import get_attn_backend
-from vllm.config import CacheConfig
-from vllm.config.vllm import VllmConfig
 from vllm.v1.attention.backends.utils import (
     CommonAttentionMetadata,
     subclass_attention_backend,
 )
+from vllm.v1.attention.selector import get_attn_backend
 from vllm.v1.kv_cache_interface import KVCacheSpec
diff --git a/vllm/attention/layers/mm_encoder_attention.py b/vllm/model_executor/layers/attention/mm_encoder_attention.py
similarity index 97%
rename from vllm/attention/layers/mm_encoder_attention.py
rename to vllm/model_executor/layers/attention/mm_encoder_attention.py
index 411bdfa75861..099fe23914cc 100644
--- a/vllm/attention/layers/mm_encoder_attention.py
+++ b/vllm/model_executor/layers/attention/mm_encoder_attention.py
@@ -4,16 +4,16 @@
 import torch

-from vllm.attention.backends.registry import AttentionBackendEnum
-from vllm.attention.ops.vit_attn_wrappers import (
-    vit_flash_attn_wrapper,
-    vit_torch_sdpa_wrapper,
-)
-from vllm.attention.utils.fa_utils import get_flash_attn_version
 from vllm.config import MultiModalConfig
 from vllm.logger import init_logger
 from vllm.model_executor.custom_op import CustomOp
 from vllm.model_executor.models.vision import get_vit_attn_backend
+from vllm.v1.attention.backends.fa_utils import get_flash_attn_version
+from vllm.v1.attention.backends.registry import AttentionBackendEnum
+from vllm.v1.attention.ops.vit_attn_wrappers import (
+    vit_flash_attn_wrapper,
+    vit_torch_sdpa_wrapper,
+)

 logger = init_logger(__name__)
diff --git a/vllm/attention/layers/static_sink_attention.py b/vllm/model_executor/layers/attention/static_sink_attention.py
similarity index 98%
rename from vllm/attention/layers/static_sink_attention.py
rename to vllm/model_executor/layers/attention/static_sink_attention.py
index 13be65d8bed9..918dff560f1d 100644
--- a/vllm/attention/layers/static_sink_attention.py
+++ b/vllm/model_executor/layers/attention/static_sink_attention.py
@@ -4,26 +4,26 @@
 import torch

-from vllm.attention.backends.abstract import (
-    AttentionBackend,
-    AttentionMetadata,
-    AttentionType,
-)
 from vllm.attention.layer import Attention
-from vllm.attention.ops.triton_reshape_and_cache_flash import (
-    triton_reshape_and_cache_flash_diffkv,
-)
-from vllm.attention.selector import get_attn_backend
 from vllm.config import CacheConfig, VllmConfig
 from vllm.forward_context import ForwardContext, get_forward_context
 from vllm.logger import init_logger
 from vllm.model_executor.custom_op import CustomOp
 from vllm.utils.math_utils import cdiv
 from vllm.utils.torch_utils import direct_register_custom_op
+from vllm.v1.attention.backend import (
+    AttentionBackend,
+    AttentionMetadata,
+    AttentionType,
+)
 from vllm.v1.attention.backends.utils import (
     CommonAttentionMetadata,
     subclass_attention_backend,
 )
+from vllm.v1.attention.ops.triton_reshape_and_cache_flash import (
+    triton_reshape_and_cache_flash_diffkv,
+)
+from vllm.v1.attention.selector import get_attn_backend
 from vllm.v1.kv_cache_interface import (
     AttentionSpec,
     KVCacheSpec,
diff --git a/vllm/model_executor/layers/attention_layer_base.py b/vllm/model_executor/layers/attention_layer_base.py
index 24809ccb0ded..97395b641497 100644
--- a/vllm/model_executor/layers/attention_layer_base.py
+++ b/vllm/model_executor/layers/attention_layer_base.py
@@ -4,8 +4,8 @@
 from abc import ABC, abstractmethod

-from vllm.attention.backends.abstract import AttentionBackend, AttentionImpl
 from vllm.config import VllmConfig
+from vllm.v1.attention.backend import AttentionBackend, AttentionImpl
 from vllm.v1.kv_cache_interface import KVCacheSpec
diff --git a/vllm/model_executor/layers/batch_invariant.py b/vllm/model_executor/layers/batch_invariant.py
index 1058270889b2..d3cf9739fcf4 100644
--- a/vllm/model_executor/layers/batch_invariant.py
+++ b/vllm/model_executor/layers/batch_invariant.py
@@ -6,11 +6,11 @@
 import torch

-from vllm.attention.backends.registry import AttentionBackendEnum
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
 from vllm.triton_utils import tl, triton
 from vllm.utils.torch_utils import is_torch_equal_or_newer
+from vllm.v1.attention.backends.registry import AttentionBackendEnum

 logger = init_logger(__name__)
diff --git a/vllm/model_executor/layers/kda.py b/vllm/model_executor/layers/kda.py
index 27cc3884517f..fde9ad36bcd3 100644
--- a/vllm/model_executor/layers/kda.py
+++ b/vllm/model_executor/layers/kda.py
@@ -5,7 +5,6 @@
 from einops import rearrange
 from torch import nn

-from vllm.attention.backends.abstract import AttentionMetadata
 from vllm.config import CacheConfig, ModelConfig, get_current_vllm_config
 from vllm.distributed import (
     divide,
@@ -17,6 +16,7 @@
 from vllm.model_executor.model_loader.weight_utils import sharded_weight_loader
 from vllm.model_executor.utils import set_weight_attrs
 from vllm.utils.torch_utils import direct_register_custom_op
+from vllm.v1.attention.backend import AttentionMetadata
 from vllm.v1.attention.backends.gdn_attn import GDNAttentionMetadata

 from .fla.ops.kda import (
diff --git a/vllm/model_executor/layers/mamba/abstract.py b/vllm/model_executor/layers/mamba/abstract.py
index 74f4383e9c23..4f45dd6caf35 100644
--- a/vllm/model_executor/layers/mamba/abstract.py
+++ b/vllm/model_executor/layers/mamba/abstract.py
@@ -5,10 +5,10 @@
 import torch

-from vllm.attention.backends.abstract import AttentionBackend
-from vllm.attention.selector import get_mamba_attn_backend
 from vllm.config import VllmConfig
 from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase
+from vllm.v1.attention.backend import AttentionBackend
+from vllm.v1.attention.selector import get_mamba_attn_backend
 from vllm.v1.kv_cache_interface import KVCacheSpec, MambaSpec
diff --git a/vllm/model_executor/layers/mamba/linear_attn.py b/vllm/model_executor/layers/mamba/linear_attn.py
index 8020efbe3e74..8b5f80f54527 100644
--- a/vllm/model_executor/layers/mamba/linear_attn.py
+++ b/vllm/model_executor/layers/mamba/linear_attn.py
@@ -8,7 +8,6 @@
 from einops import rearrange
 from torch import nn

-from vllm.attention.backends.abstract import AttentionMetadata
 from vllm.config import CacheConfig, ModelConfig, get_current_vllm_config
 from vllm.distributed.communication_op import tensor_model_parallel_all_reduce
 from vllm.distributed.parallel_state import (
@@ -29,6 +28,7 @@
 )
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.utils.torch_utils import direct_register_custom_op
+from vllm.v1.attention.backend import AttentionMetadata
 from vllm.v1.attention.backends.linear_attn import LinearAttentionMetadata
diff --git a/vllm/model_executor/layers/mamba/mamba_mixer2.py b/vllm/model_executor/layers/mamba/mamba_mixer2.py
index 875bc9019fba..74e4a34b4ae0 100644
--- a/vllm/model_executor/layers/mamba/mamba_mixer2.py
+++ b/vllm/model_executor/layers/mamba/mamba_mixer2.py
@@ -5,7 +5,6 @@
 import torch
 from torch import nn

-from vllm.attention.backends.abstract import AttentionMetadata
 from vllm.config import CacheConfig, ModelConfig, get_current_vllm_config
 from vllm.distributed import (
     divide,
@@ -43,6 +42,7 @@
 )
 from vllm.model_executor.utils import set_weight_attrs
 from vllm.utils.torch_utils import direct_register_custom_op
+from vllm.v1.attention.backend import AttentionMetadata
 from vllm.v1.attention.backends.mamba2_attn import Mamba2AttentionMetadata

 # Added by the IBM Team, 2024
diff --git a/vllm/model_executor/layers/mamba/short_conv.py b/vllm/model_executor/layers/mamba/short_conv.py
index e6bfea3a2f59..14e00bce2b1d 100644
--- a/vllm/model_executor/layers/mamba/short_conv.py
+++ b/vllm/model_executor/layers/mamba/short_conv.py
@@ -4,7 +4,6 @@
 import torch

-from vllm.attention.backends.abstract import AttentionMetadata
 from vllm.config import CacheConfig, ModelConfig, get_current_vllm_config
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.forward_context import ForwardContext, get_forward_context
@@ -24,6 +23,7 @@
     causal_conv1d_update,
 )
 from vllm.utils.torch_utils import direct_register_custom_op
+from vllm.v1.attention.backend import AttentionMetadata
 from vllm.v1.attention.backends.short_conv_attn import ShortConvAttentionMetadata
diff --git a/vllm/model_executor/models/afmoe.py b/vllm/model_executor/models/afmoe.py
index f4248b67f734..ef6f59e447d2 100644
--- a/vllm/model_executor/models/afmoe.py
+++ b/vllm/model_executor/models/afmoe.py
@@ -9,7 +9,6 @@
 import torch
 from torch import nn

-from vllm.attention.backends.abstract import AttentionType
 from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig, get_current_vllm_config
@@ -50,6 +49,7 @@
     maybe_prefix,
 )
 from vllm.sequence import IntermediateTensors
+from vllm.v1.attention.backend import AttentionType

 logger = init_logger(__name__)
diff --git a/vllm/model_executor/models/aimv2.py b/vllm/model_executor/models/aimv2.py
index 96ca27ad0250..b802bb0ee35b 100644
--- a/vllm/model_executor/models/aimv2.py
+++ b/vllm/model_executor/models/aimv2.py
@@ -8,10 +8,10 @@
 import torch
 import torch.nn as nn

-from vllm.attention.layers.mm_encoder_attention import MMEncoderAttention
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.distributed.utils import divide
 from vllm.model_executor.layers.activation import SiluAndMul
+from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
 from vllm.model_executor.layers.conv import Conv2dLayer
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
diff --git a/vllm/model_executor/models/apertus.py b/vllm/model_executor/models/apertus.py
index e3f97a718b0f..7d43735c0053 100644
--- a/vllm/model_executor/models/apertus.py
+++ b/vllm/model_executor/models/apertus.py
@@ -32,13 +32,14 @@
 from torch import nn
 from transformers import ApertusConfig

-from vllm.attention.backends.abstract import AttentionType
 from vllm.attention.layer import Attention
-from vllm.attention.layers.encoder_only_attention import EncoderOnlyAttention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import XIELU
+from vllm.model_executor.layers.attention.encoder_only_attention import (
+    EncoderOnlyAttention,
+)
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
@@ -57,6 +58,7 @@
     maybe_remap_kv_scale_name,
 )
 from vllm.sequence import IntermediateTensors
+from vllm.v1.attention.backend import AttentionType

 from .interfaces import SupportsLoRA, SupportsPP
 from .utils import (
diff --git a/vllm/model_executor/models/bert.py b/vllm/model_executor/models/bert.py
index b52f6d2bf2a3..cce01ea50acd 100644
--- a/vllm/model_executor/models/bert.py
+++ b/vllm/model_executor/models/bert.py
@@ -7,11 +7,13 @@
 from torch import nn
 from transformers import BertConfig

-from vllm.attention.layers.encoder_only_attention import EncoderOnlyAttention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, PoolerConfig, VllmConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import get_act_fn
+from vllm.model_executor.layers.attention.encoder_only_attention import (
+    EncoderOnlyAttention,
+)
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
     QKVParallelLinear,
diff --git a/vllm/model_executor/models/bert_with_rope.py b/vllm/model_executor/models/bert_with_rope.py
index 14794fd6ae72..a5c43bbb301e 100644
--- a/vllm/model_executor/models/bert_with_rope.py
+++ b/vllm/model_executor/models/bert_with_rope.py
@@ -6,7 +6,6 @@
 from torch import nn
 from transformers import PretrainedConfig

-from vllm.attention.layers.encoder_only_attention import EncoderOnlyAttention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import (
@@ -16,6 +15,9 @@
     tensor_model_parallel_all_reduce,
 )
 from vllm.model_executor.layers.activation import get_act_and_mul_fn, get_act_fn
+from vllm.model_executor.layers.attention.encoder_only_attention import (
+    EncoderOnlyAttention,
+)
 from vllm.model_executor.layers.fused_moe import activation_without_mul, fused_topk
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
diff --git a/vllm/model_executor/models/blip.py b/vllm/model_executor/models/blip.py
index 7387830b32bd..9279cccd596d 100644
--- a/vllm/model_executor/models/blip.py
+++ b/vllm/model_executor/models/blip.py
@@ -9,9 +9,9 @@
 import torch.nn as nn
 from transformers import Blip2VisionConfig, BlipVisionConfig

-from vllm.attention.layers.mm_encoder_attention import MMEncoderAttention
 from vllm.distributed import divide, get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import get_act_fn
+from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
 from vllm.model_executor.layers.conv import Conv2dLayer
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py
index 1eae71f3a1d3..d18904fdf603 100644
--- a/vllm/model_executor/models/clip.py
+++ b/vllm/model_executor/models/clip.py
@@ -15,11 +15,11 @@
 )

 from vllm.attention.layer import Attention
-from vllm.attention.layers.mm_encoder_attention import MMEncoderAttention
 from vllm.config import VllmConfig
 from vllm.config.multimodal import BaseDummyOptions, MultiModalConfig
 from vllm.distributed import divide, get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import get_act_fn
+from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
 from vllm.model_executor.layers.conv import Conv2dLayer
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
diff --git a/vllm/model_executor/models/config.py b/vllm/model_executor/models/config.py
index 9ef038d84ac7..82e6df199020 100644
--- a/vllm/model_executor/models/config.py
+++ b/vllm/model_executor/models/config.py
@@ -4,12 +4,12 @@
 from math import lcm
 from typing import TYPE_CHECKING

-from vllm.attention.backends.registry import AttentionBackendEnum
 from vllm.logger import init_logger
 from vllm.model_executor.models import ModelRegistry
 from vllm.platforms import current_platform
 from vllm.utils.math_utils import cdiv, round_up
 from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
+from vllm.v1.attention.backends.registry import AttentionBackendEnum
 from vllm.v1.kv_cache_interface import FullAttentionSpec, MambaSpec, MLAAttentionSpec

 if TYPE_CHECKING:
diff --git a/vllm/model_executor/models/deepencoder.py b/vllm/model_executor/models/deepencoder.py
index 6b9d09e88aba..b3e5d920e03a 100644
--- a/vllm/model_executor/models/deepencoder.py
+++ b/vllm/model_executor/models/deepencoder.py
@@ -18,8 +18,8 @@
 import torch.nn.functional as F
 from transformers import CLIPVisionConfig

-from vllm.attention.layers.mm_encoder_attention import MMEncoderAttention
 from vllm.config import MultiModalConfig
+from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
 from vllm.model_executor.layers.conv import Conv2dLayer
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py
index 7f1880e44bd8..db0ccd6958b9 100644
--- a/vllm/model_executor/models/deepseek_v2.py
+++ b/vllm/model_executor/models/deepseek_v2.py
@@ -33,9 +33,7 @@
 from transformers import DeepseekV2Config, DeepseekV3Config

 from vllm._aiter_ops import rocm_aiter_ops
-from vllm.attention.backends.abstract import AttentionBackend
 from vllm.attention.layer import Attention
-from vllm.attention.ops.common import pack_seq_triton, unpack_seq_triton
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, ParallelConfig, VllmConfig, get_current_vllm_config
 from vllm.distributed import (
@@ -78,10 +76,12 @@
 from vllm.sequence import IntermediateTensors
 from vllm.utils.deep_gemm import fp8_mqa_logits, fp8_paged_mqa_logits
 from vllm.utils.torch_utils import direct_register_custom_op
+from vllm.v1.attention.backend import AttentionBackend
 from vllm.v1.attention.backends.mla.indexer import (
     DeepseekV32IndexerBackend,
     DeepseekV32IndexerMetadata,
 )
+from vllm.v1.attention.ops.common import pack_seq_triton, unpack_seq_triton
 from vllm.v1.kv_cache_interface import KVCacheSpec, MLAAttentionSpec
 from vllm.v1.worker.workspace import current_workspace_manager
@@ -679,7 +679,9 @@ def sparse_attn_indexer(
         )
         fp8_mqa_logits_func = fp8_mqa_logits
         if current_platform.is_rocm():
-            from vllm.attention.ops.rocm_aiter_mla_sparse import rocm_fp8_mqa_logits
+            from vllm.v1.attention.ops.rocm_aiter_mla_sparse import (
+                rocm_fp8_mqa_logits,
+            )

             fp8_mqa_logits_func = rocm_fp8_mqa_logits
         logits = fp8_mqa_logits_func(
@@ -729,7 +731,7 @@ def sparse_attn_indexer(
         num_padded_tokens = batch_size * next_n
         fp8_paged_mqa_logits_func = fp8_paged_mqa_logits
         if current_platform.is_rocm():
-            from vllm.attention.ops.rocm_aiter_mla_sparse import (
+            from vllm.v1.attention.ops.rocm_aiter_mla_sparse import (
                 rocm_fp8_paged_mqa_logits,
             )
diff --git a/vllm/model_executor/models/dots_ocr.py b/vllm/model_executor/models/dots_ocr.py
index c9e0dc8b9c5c..ac9ad3b67d65 100644
--- a/vllm/model_executor/models/dots_ocr.py
+++ b/vllm/model_executor/models/dots_ocr.py
@@ -8,10 +8,6 @@
 from torch.nn import LayerNorm
 from transformers.models.qwen2_vl import Qwen2VLProcessor

-from vllm.attention.backends.registry import AttentionBackendEnum
-from vllm.attention.layers.mm_encoder_attention import (
-    MMEncoderAttention,
-)
 from vllm.config import MultiModalConfig, VllmConfig
 from vllm.config.multimodal import BaseDummyOptions
 from vllm.distributed import utils as dist_utils
@@ -20,6 +16,9 @@
     get_tensor_model_parallel_world_size,
 )
 from vllm.model_executor.layers.activation import SiluAndMul
+from vllm.model_executor.layers.attention.mm_encoder_attention import (
+    MMEncoderAttention,
+)
 from vllm.model_executor.layers.conv import Conv2dLayer
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
@@ -59,6 +58,7 @@
 from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.configs.dotsocr import DotsOCRConfig, DotsVisionConfig
 from vllm.utils.tensor_schema import TensorSchema, TensorShape
+from vllm.v1.attention.backends.registry import AttentionBackendEnum

 from .vision import run_dp_sharded_mrope_vision_model
diff --git a/vllm/model_executor/models/ernie45_vl.py b/vllm/model_executor/models/ernie45_vl.py
index d47955ea3d2a..a382cb5b61fe 100644
--- a/vllm/model_executor/models/ernie45_vl.py
+++ b/vllm/model_executor/models/ernie45_vl.py
@@ -36,16 +36,15 @@
 from einops import rearrange
 from transformers import BatchFeature

-from vllm.attention.backends.registry import AttentionBackendEnum
-from vllm.attention.layers.mm_encoder_attention import (
-    MMEncoderAttention,
-)
 from vllm.config import MultiModalConfig, VllmConfig
 from vllm.config.multimodal import BaseDummyOptions, VideoDummyOptions
 from vllm.distributed import parallel_state
 from vllm.distributed import utils as dist_utils
 from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import QuickGELU
+from vllm.model_executor.layers.attention.mm_encoder_attention import (
+    MMEncoderAttention,
+)
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
@@ -74,6 +73,7 @@
 from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors
 from vllm.utils.tensor_schema import TensorSchema, TensorShape
+from vllm.v1.attention.backends.registry import AttentionBackendEnum

 from .ernie45_vl_moe import Ernie4_5_VLMoeForCausalLM
 from .interfaces import (
diff --git a/vllm/model_executor/models/gemma3.py b/vllm/model_executor/models/gemma3.py
index e6a201c669e9..c8a0ba8c9d3b 100644
--- a/vllm/model_executor/models/gemma3.py
+++ b/vllm/model_executor/models/gemma3.py
@@ -22,13 +22,15 @@
 from torch import nn
 from transformers import Gemma3TextConfig

-from vllm.attention.backends.abstract import AttentionType
 from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import GeluAndMul
+from vllm.model_executor.layers.attention.encoder_only_attention import (
+    EncoderOnlyAttention,
+)
 from vllm.model_executor.layers.layernorm import GemmaRMSNorm
 from vllm.model_executor.layers.linear import (
     MergedColumnParallelLinear,
@@ -47,8 +49,8 @@
     maybe_remap_kv_scale_name,
 )
 from vllm.sequence import IntermediateTensors
+from vllm.v1.attention.backend import AttentionType

-from ...attention.layers.encoder_only_attention import EncoderOnlyAttention
 from .interfaces import SupportsLoRA, SupportsPP
 from .utils import (
     AutoWeightsLoader,
diff --git a/vllm/model_executor/models/glm4.py b/vllm/model_executor/models/glm4.py
index 2cd11e66c752..06da2a8b3498 100644
--- a/vllm/model_executor/models/glm4.py
+++ b/vllm/model_executor/models/glm4.py
@@ -29,7 +29,6 @@
 from torch import nn
 from transformers import Glm4Config

-from vllm.attention.backends.abstract import AttentionType
 from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
@@ -41,6 +40,7 @@
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
 from vllm.sequence import IntermediateTensors
+from vllm.v1.attention.backend import AttentionType

 from .interfaces import SupportsLoRA, SupportsPP
 from .llama import LlamaMLP as Glm4MLP
diff --git a/vllm/model_executor/models/glm4_1v.py b/vllm/model_executor/models/glm4_1v.py
index 4c4347f5a70a..05257bd1ea01 100644
--- a/vllm/model_executor/models/glm4_1v.py
+++ b/vllm/model_executor/models/glm4_1v.py
@@ -46,15 +46,14 @@
 from transformers.models.glm4v.video_processing_glm4v import Glm4vVideoProcessor
 from transformers.video_utils import VideoMetadata

-from vllm.attention.backends.registry import AttentionBackendEnum
-from vllm.attention.layers.mm_encoder_attention import (
-    MMEncoderAttention,
-)
 from vllm.config import MultiModalConfig, VllmConfig
 from vllm.config.multimodal import BaseDummyOptions, VideoDummyOptions
 from vllm.distributed import get_tensor_model_parallel_world_size, parallel_state
 from vllm.distributed import utils as dist_utils
 from vllm.logger import init_logger
+from vllm.model_executor.layers.attention.mm_encoder_attention import (
+    MMEncoderAttention,
+)
 from vllm.model_executor.layers.conv import Conv2dLayer, Conv3dLayer
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
@@ -89,6 +88,7 @@
 from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors
 from vllm.utils.tensor_schema import TensorSchema, TensorShape
+from vllm.v1.attention.backends.registry import AttentionBackendEnum

 from ..layers.activation import SiluAndMul
 from .interfaces import (
diff --git a/vllm/model_executor/models/glm4v.py b/vllm/model_executor/models/glm4v.py
index 453a7812a174..297237fd196a 100644
--- a/vllm/model_executor/models/glm4v.py
+++ b/vllm/model_executor/models/glm4v.py
@@ -19,11 +19,11 @@
 from transformers.image_utils import ImageInput
 from transformers.tokenization_utils_base import TextInput

-from vllm.attention.layers.mm_encoder_attention import MMEncoderAttention
 from vllm.config import VllmConfig
 from vllm.config.multimodal import BaseDummyOptions
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import SiluAndMul, get_act_fn
+from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
 from vllm.model_executor.layers.conv import Conv2dLayer
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
diff --git a/vllm/model_executor/models/glmasr.py b/vllm/model_executor/models/glmasr.py
index 27d408afdce7..a3b9a1221934 100644
--- a/vllm/model_executor/models/glmasr.py
+++ b/vllm/model_executor/models/glmasr.py
@@ -11,12 +11,12 @@
 from transformers.models.glmasr import GlmAsrConfig, GlmAsrProcessor
 from transformers.models.whisper import WhisperFeatureExtractor

-from vllm.attention.layers.mm_encoder_attention import MMEncoderAttention
 from vllm.config import ModelConfig, SpeechToTextConfig, VllmConfig
 from vllm.config.multimodal import BaseDummyOptions
 from vllm.distributed.parallel_state import get_tensor_model_parallel_world_size
 from vllm.inputs.data import PromptType
 from vllm.model_executor.layers.activation import get_act_fn
+from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
     QKVParallelLinear,
diff --git a/vllm/model_executor/models/gpt_oss.py b/vllm/model_executor/models/gpt_oss.py
index 8a8df9f6ed95..69678188a619 100644
--- a/vllm/model_executor/models/gpt_oss.py
+++ b/vllm/model_executor/models/gpt_oss.py
@@ -7,7 +7,6 @@
 from torch import nn
 from transformers import GptOssConfig

-from vllm.attention.backends.abstract import AttentionType
 from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
@@ -37,6 +36,7 @@
 from vllm.platforms import current_platform
 from vllm.sequence import IntermediateTensors
 from vllm.utils.math_utils import cdiv
+from vllm.v1.attention.backend import AttentionType

 from .interfaces import SupportsEagle3, SupportsLoRA, SupportsPP
 from .utils import (
diff --git a/vllm/model_executor/models/hunyuan_v1.py b/vllm/model_executor/models/hunyuan_v1.py
index adb71e93bdb0..1cf6e824fa28 100644
--- a/vllm/model_executor/models/hunyuan_v1.py
+++ b/vllm/model_executor/models/hunyuan_v1.py
@@ -33,7 +33,6 @@
 from torch import nn
 from transformers import PretrainedConfig

-from vllm.attention.backends.abstract import AttentionType
 from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig, get_current_vllm_config
@@ -65,6 +64,7 @@
     maybe_remap_kv_scale_name,
 )
 from vllm.sequence import IntermediateTensors
+from vllm.v1.attention.backend import AttentionType

 from .interfaces import MixtureOfExperts, SupportsLoRA, SupportsPP
 from .utils import (
diff --git a/vllm/model_executor/models/hunyuan_vision.py b/vllm/model_executor/models/hunyuan_vision.py
index 6fc56094af65..9afb86a89f7d 100644
--- a/vllm/model_executor/models/hunyuan_vision.py
+++ b/vllm/model_executor/models/hunyuan_vision.py
@@ -33,14 +33,13 @@
 import torch.nn.functional as F
 from transformers import BatchFeature

-from vllm.attention.backends.registry import AttentionBackendEnum
-from vllm.attention.layers.mm_encoder_attention import MMEncoderAttention
 from vllm.config import MultiModalConfig, VllmConfig
 from vllm.config.multimodal import BaseDummyOptions
 from vllm.distributed import parallel_state
 from vllm.distributed import utils as dist_utils
 from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import get_act_fn
+from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
@@ -81,6 +80,7 @@
 from vllm.transformers_utils.processors.hunyuan_vl import HunYuanVLProcessor
 from vllm.transformers_utils.processors.hunyuan_vl_image import smart_resize
 from vllm.utils.tensor_schema import TensorSchema, TensorShape
+from vllm.v1.attention.backends.registry import AttentionBackendEnum

 from .interfaces import (
     MultiModalEmbeddings,
diff --git a/vllm/model_executor/models/idefics2_vision_model.py b/vllm/model_executor/models/idefics2_vision_model.py
index ee6ca5eacb17..c78ad64790e8 100644
--- a/vllm/model_executor/models/idefics2_vision_model.py
+++ b/vllm/model_executor/models/idefics2_vision_model.py
@@ -27,9 +27,9 @@
     Idefics2VisionConfig,
 )

-from vllm.attention.layers.mm_encoder_attention import MMEncoderAttention
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import get_act_fn
+from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
 from vllm.model_executor.layers.conv import Conv2dLayer
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
diff --git a/vllm/model_executor/models/intern_vit.py b/vllm/model_executor/models/intern_vit.py
index 5f7ba838aa3d..3e3d60ceaf93 100644
--- a/vllm/model_executor/models/intern_vit.py
+++ b/vllm/model_executor/models/intern_vit.py
@@ -15,7 +15,6 @@
 import torch.nn.functional as F
 from transformers import PretrainedConfig

-from vllm.attention.layers.mm_encoder_attention import MMEncoderAttention
 from vllm.distributed import (
     divide,
     get_tensor_model_parallel_rank,
@@ -24,6 +23,7 @@
     tensor_model_parallel_all_gather,
 )
 from vllm.model_executor.layers.activation import get_act_fn
+from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
 from vllm.model_executor.layers.conv import Conv2dLayer
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
diff --git a/vllm/model_executor/models/interns1_vit.py b/vllm/model_executor/models/interns1_vit.py
index a16857d61322..2b2866d678a8 100644
--- a/vllm/model_executor/models/interns1_vit.py
+++ b/vllm/model_executor/models/interns1_vit.py
@@ -14,8 +14,8 @@
 from transformers import PretrainedConfig
 from transformers.utils import torch_int

-from vllm.attention.layers.mm_encoder_attention import MMEncoderAttention
 from vllm.model_executor.layers.activation import get_act_fn
+from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
 from vllm.model_executor.layers.conv import Conv2dLayer
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import ColumnParallelLinear, RowParallelLinear
diff --git a/vllm/model_executor/models/iquest_loopcoder.py b/vllm/model_executor/models/iquest_loopcoder.py
index 47704aac8220..1901cc6e81c4 100644
--- a/vllm/model_executor/models/iquest_loopcoder.py
+++ b/vllm/model_executor/models/iquest_loopcoder.py
@@ -24,7 +24,6 @@
 from torch import nn
 from transformers import PretrainedConfig

-from vllm.attention.backends.abstract import AttentionType
 from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
@@ -48,6 +47,7 @@
 )
 from vllm.model_executor.models.llama import LlamaMLP
 from vllm.sequence import IntermediateTensors
+from vllm.v1.attention.backend import AttentionType

 from .utils import (
     AutoWeightsLoader,
diff --git a/vllm/model_executor/models/isaac.py b/vllm/model_executor/models/isaac.py
index e05df611fa94..ffcc2444627e 100644
--- a/vllm/model_executor/models/isaac.py
+++ b/vllm/model_executor/models/isaac.py
@@ -16,11 +16,11 @@
 from transformers.tokenization_utils import TensorType
 from typing_extensions import TypedDict, Unpack

-from vllm.attention.layers.mm_encoder_attention import MMEncoderAttention
 from vllm.config import MultiModalConfig, VllmConfig
 from vllm.config.model import ModelConfig
 from vllm.distributed import parallel_state
 from vllm.distributed import utils as dist_utils
+from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
     QKVParallelLinear,
diff --git a/vllm/model_executor/models/keye.py b/vllm/model_executor/models/keye.py
index 18a999ea0ead..8e6b6642591d 100644
--- a/vllm/model_executor/models/keye.py
+++ b/vllm/model_executor/models/keye.py
@@ -16,13 +16,13 @@
 from transformers.modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
 from transformers.utils import torch_int

-from vllm.attention.layers.mm_encoder_attention import (
-    MMEncoderAttention,
-)
 from vllm.config import MultiModalConfig, VllmConfig
 from vllm.config.multimodal import BaseDummyOptions
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.logger import init_logger
+from vllm.model_executor.layers.attention.mm_encoder_attention import (
+    MMEncoderAttention,
+)
 from vllm.model_executor.layers.conv import Conv2dLayer
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py
index 4332acc82120..95b5f0f5bf19 100644
--- a/vllm/model_executor/models/llama.py
+++ b/vllm/model_executor/models/llama.py
@@ -31,13 +31,14 @@
 from torch import nn
 from transformers import LlamaConfig

-from vllm.attention.backends.abstract import AttentionType
 from vllm.attention.layer import Attention
-from vllm.attention.layers.encoder_only_attention import EncoderOnlyAttention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import SiluAndMul
+from vllm.model_executor.layers.attention.encoder_only_attention import (
+    EncoderOnlyAttention,
+)
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
     MergedColumnParallelLinear,
@@ -56,6 +57,7 @@
     maybe_remap_kv_scale_name,
 )
 from vllm.sequence import IntermediateTensors
+from vllm.v1.attention.backend import AttentionType

 from .adapters import as_embedding_model, as_seq_cls_model
 from .interfaces import (
diff --git a/vllm/model_executor/models/llama4.py b/vllm/model_executor/models/llama4.py
index 9ed0741acba1..dde6db7c204b 100644
--- a/vllm/model_executor/models/llama4.py
+++ b/vllm/model_executor/models/llama4.py
@@ -25,7 +25,6 @@
 from transformers import Llama4TextConfig

 from vllm.attention.layer import Attention
-from vllm.attention.layers.chunked_local_attention import ChunkedLocalAttention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import (
@@ -34,6 +33,9 @@
     tensor_model_parallel_all_gather,
 )
 from vllm.logger import init_logger
+from vllm.model_executor.layers.attention.chunked_local_attention import (
+    ChunkedLocalAttention,
+)
 from vllm.model_executor.layers.fused_moe import SharedFusedMoE
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
diff --git a/vllm/model_executor/models/mimo_v2_flash.py b/vllm/model_executor/models/mimo_v2_flash.py
index 57aa4d91df5e..db85073b38a4 100644
--- a/vllm/model_executor/models/mimo_v2_flash.py
+++ b/vllm/model_executor/models/mimo_v2_flash.py
@@ -6,7 +6,6 @@
 import torch
 from torch import nn

-from vllm.attention.backends.abstract import AttentionType
 from vllm.attention.layer import Attention
 from vllm.config import (
     CacheConfig,
@@ -43,6 +42,7 @@
 )
 from vllm.model_executor.models.utils import sequence_parallel_chunk
 from vllm.sequence import IntermediateTensors
+from vllm.v1.attention.backend import AttentionType

 from .interfaces import MixtureOfExperts, SupportsPP
 from .utils import (
diff --git a/vllm/model_executor/models/minimax_text_01.py b/vllm/model_executor/models/minimax_text_01.py
index 18509882d1f0..955a73ff19ed 100644
--- a/vllm/model_executor/models/minimax_text_01.py
+++ b/vllm/model_executor/models/minimax_text_01.py
@@ -14,7 +14,6 @@
 from torch import nn
 from transformers import MiniMaxConfig

-from vllm.attention.backends.abstract import AttentionMetadata
 from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, ModelConfig, VllmConfig
@@ -48,6 +47,7 @@
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.models.utils import maybe_prefix
 from vllm.sequence import IntermediateTensors
+from vllm.v1.attention.backend import AttentionMetadata

 from .interfaces import HasInnerState, IsHybrid
 from .utils import PPMissingLayer, is_pp_missing_parameter, make_layers
diff --git a/vllm/model_executor/models/mllama4.py b/vllm/model_executor/models/mllama4.py
index aeea4a140465..fb66a03b8b22 100644
--- a/vllm/model_executor/models/mllama4.py
+++ b/vllm/model_executor/models/mllama4.py
@@ -31,10 +31,10 @@
     get_best_fit,
 )

-from vllm.attention.layers.mm_encoder_attention import MMEncoderAttention
 from vllm.config import VllmConfig
 from vllm.config.multimodal import BaseDummyOptions
 from vllm.distributed import get_tensor_model_parallel_world_size
+from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
 from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
diff --git a/vllm/model_executor/models/modernbert.py b/vllm/model_executor/models/modernbert.py
index d72b4800c098..77394803996d 100644
--- a/vllm/model_executor/models/modernbert.py
+++ b/vllm/model_executor/models/modernbert.py
@@ -7,10 +7,12 @@
 from transformers import ModernBertConfig
 from transformers.activations import ACT2FN

-from vllm.attention.layers.encoder_only_attention import EncoderOnlyAttention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import VllmConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
+from vllm.model_executor.layers.attention.encoder_only_attention import (
+    EncoderOnlyAttention,
+)
 from vllm.model_executor.layers.linear import QKVParallelLinear, RowParallelLinear
 from vllm.model_executor.layers.pooler import DispatchPooler
 from vllm.model_executor.layers.pooler.seqwise import (
diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py
index 5ccc5653ec8b..bdfa6178b4e3 100644
--- a/vllm/model_executor/models/molmo.py
+++ b/vllm/model_executor/models/molmo.py
@@ -18,7 +18,6 @@
 from transformers.tokenization_utils_base import TextInput

 from vllm.attention.layer import Attention
-from vllm.attention.layers.mm_encoder_attention import MMEncoderAttention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.config.multimodal import BaseDummyOptions
@@ -30,6 +29,7 @@
     tensor_model_parallel_all_gather,
 )
 from vllm.model_executor.layers.activation import MulAndSilu, QuickGELU, SiluAndMul
+from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
diff --git a/vllm/model_executor/models/moonvit.py b/vllm/model_executor/models/moonvit.py
index c785b9910093..c675b2cd6594 100644
--- a/vllm/model_executor/models/moonvit.py
+++ b/vllm/model_executor/models/moonvit.py
@@ -52,9 +52,9 @@
 from transformers.activations import ACT2FN
 from transformers.modeling_utils import PreTrainedModel

-from vllm.attention.layers.mm_encoder_attention import MMEncoderAttention
 from vllm.config import MultiModalConfig
 from vllm.distributed import divide, get_tensor_model_parallel_world_size
+from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
 from vllm.model_executor.layers.conv import Conv2dLayer
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
diff --git a/vllm/model_executor/models/nemotron_nas.py b/vllm/model_executor/models/nemotron_nas.py
index 83ef5e7e1282..da0688f71958 100644
--- a/vllm/model_executor/models/nemotron_nas.py
+++ b/vllm/model_executor/models/nemotron_nas.py
@@ -31,7 +31,6 @@
 from torch import nn
 from transformers import LlamaConfig

-from vllm.attention.backends.abstract import AttentionType
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group
@@ -49,6 +48,7 @@
 )
 from vllm.model_executor.models.llama import LlamaAttention, LlamaMLP
 from vllm.sequence import IntermediateTensors
+from vllm.v1.attention.backend import AttentionType

 from .interfaces import HasNoOps, SupportsLoRA, SupportsPP
 from .utils import (
diff --git a/vllm/model_executor/models/nemotron_parse.py b/vllm/model_executor/models/nemotron_parse.py
index 1e7bb0e4304b..a88e52b5585b 100644
--- a/vllm/model_executor/models/nemotron_parse.py
+++ b/vllm/model_executor/models/nemotron_parse.py
@@ -26,7 +26,6 @@
     TensorType,
 )

-from vllm.attention.backends.abstract import AttentionType
 from vllm.config import CacheConfig, VllmConfig
 from vllm.config.lora import LoRAConfig
 from vllm.config.multimodal import BaseDummyOptions
@@ -63,6 +62,7 @@
 from vllm.transformers_utils.configs.radio import RadioConfig
 from vllm.transformers_utils.tokenizer import AnyTokenizer
 from vllm.utils.tensor_schema import TensorSchema, TensorShape
+from vllm.v1.attention.backend import AttentionType

 logger = init_logger(__name__)

 DEFAULT_FINAL_IMAGE_SIZE = (2048, 1648)
diff --git a/vllm/model_executor/models/openpangu.py b/vllm/model_executor/models/openpangu.py
index 44e3baee0206..9f569bcc71cf 100644
--- a/vllm/model_executor/models/openpangu.py
+++ b/vllm/model_executor/models/openpangu.py
@@ -30,7 +30,6 @@
 from transformers import PretrainedConfig

 from vllm.attention.layer import Attention, AttentionType
-from vllm.attention.layers.static_sink_attention import StaticSinkAttention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, ParallelConfig, VllmConfig
 from vllm.distributed import (
@@ -42,6 +41,9 @@
     tensor_model_parallel_all_gather,
 )
 from vllm.model_executor.layers.activation import SiluAndMul
+from vllm.model_executor.layers.attention.static_sink_attention import (
+    StaticSinkAttention,
+)
 from vllm.model_executor.layers.fused_moe import SharedFusedMoE
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
diff --git a/vllm/model_executor/models/ouro.py b/vllm/model_executor/models/ouro.py
index 829148b4c1fb..f51c0f095072 100644
--- a/vllm/model_executor/models/ouro.py
+++ b/vllm/model_executor/models/ouro.py
@@ -33,7 +33,6 @@
 from torch import nn
 from transformers import PretrainedConfig

-from vllm.attention.backends.abstract import AttentionType
 from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
@@ -57,6 +56,7 @@
     maybe_remap_kv_scale_name,
 )
 from vllm.sequence import IntermediateTensors
+from vllm.v1.attention.backend import AttentionType

 from .interfaces import SupportsLoRA
 from .utils import (
diff --git a/vllm/model_executor/models/paddleocr_vl.py b/vllm/model_executor/models/paddleocr_vl.py
index 0e5537b86fa4..530974f7fa8b 100644
--- a/vllm/model_executor/models/paddleocr_vl.py
+++ b/vllm/model_executor/models/paddleocr_vl.py
@@ -30,14 +30,13 @@
 )
 from transformers.utils import torch_int

-from vllm.attention.backends.registry import AttentionBackendEnum
-from vllm.attention.layers.mm_encoder_attention import (
-    MMEncoderAttention,
-)
 from vllm.config import MultiModalConfig, VllmConfig
 from vllm.config.multimodal import BaseDummyOptions
 from vllm.distributed import parallel_state
 from vllm.distributed import utils as dist_utils
+from vllm.model_executor.layers.attention.mm_encoder_attention import (
+    MMEncoderAttention,
+)
 from vllm.model_executor.layers.conv import Conv2dLayer
 from vllm.model_executor.layers.linear import (
     QKVParallelLinear,
@@ -72,6 +71,7 @@
 from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors
 from vllm.utils.tensor_schema import TensorSchema, TensorShape
+from vllm.v1.attention.backends.registry import AttentionBackendEnum

 from .ernie45 import Ernie4_5ForCausalLM
 from .interfaces import MultiModalEmbeddings, SupportsMRoPE, SupportsMultiModal
diff --git a/vllm/model_executor/models/plamo2.py b/vllm/model_executor/models/plamo2.py
index 9e052ce0bd0a..225e131ec764 100644
--- a/vllm/model_executor/models/plamo2.py
+++ b/vllm/model_executor/models/plamo2.py
@@ -9,7 +9,6 @@
 from torch import nn
 from transformers import PretrainedConfig

-from vllm.attention.backends.abstract import AttentionMetadata
 from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import VllmConfig, get_current_vllm_config
@@ -66,6 +65,7 @@
 from vllm.model_executor.utils import set_weight_attrs
 from vllm.sequence import IntermediateTensors
 from vllm.utils.torch_utils import direct_register_custom_op
+from vllm.v1.attention.backend import AttentionMetadata
 from vllm.v1.attention.backends.mamba2_attn import Mamba2AttentionMetadata
diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py
index a91aa2cdf78d..ab9eac1a94fd 100644
--- a/vllm/model_executor/models/qwen2.py
+++ b/vllm/model_executor/models/qwen2.py
@@ -33,13 +33,14 @@
 from torch import nn
 from transformers import Qwen2Config

-from vllm.attention.backends.abstract import AttentionType
 from vllm.attention.layer import Attention
-from vllm.attention.layers.encoder_only_attention import EncoderOnlyAttention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import SiluAndMul
+from vllm.model_executor.layers.attention.encoder_only_attention import (
+    EncoderOnlyAttention,
+)
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
     MergedColumnParallelLinear,
@@ -59,6 +60,7 @@
 )
 from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.config import is_interleaved, set_default_rope_theta
+from vllm.v1.attention.backend import AttentionType

 from .interfaces import SupportsEagle3, SupportsLoRA, SupportsPP
 from .utils import (
diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py
index 221e7bb0661c..6e9e46368f26 100644
--- a/vllm/model_executor/models/qwen2_5_vl.py
+++ b/vllm/model_executor/models/qwen2_5_vl.py
@@ -41,8 +41,6 @@
     Qwen2_5_VLVisionConfig,
 )

-from vllm.attention.backends.registry import AttentionBackendEnum
-from vllm.attention.layers.mm_encoder_attention import MMEncoderAttention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import MultiModalConfig, VllmConfig
 from vllm.distributed import parallel_state
@@ -50,6 +48,7 @@
 from vllm.forward_context import set_forward_context
 from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import get_act_and_mul_fn
+from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
 from vllm.model_executor.layers.conv import Conv3dLayer
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
@@ -83,6 +82,7 @@
 from vllm.sequence import IntermediateTensors
 from vllm.utils.platform_utils import is_pin_memory_available
 from vllm.utils.tensor_schema import TensorSchema, TensorShape
+from vllm.v1.attention.backends.registry import AttentionBackendEnum

 from .interfaces import (
     MultiModalEmbeddings,
diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py
index ee2b6c22b2fc..3b0dce7fcd17 100644
--- a/vllm/model_executor/models/qwen2_vl.py
+++ b/vllm/model_executor/models/qwen2_vl.py
@@ -43,14 +43,13 @@
 from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
 from transformers.models.qwen2_vl.video_processing_qwen2_vl import Qwen2VLVideoProcessor

-from vllm.attention.backends.registry import AttentionBackendEnum
-from vllm.attention.layers.mm_encoder_attention import MMEncoderAttention
 from vllm.config import MultiModalConfig, VllmConfig
 from vllm.config.multimodal import BaseDummyOptions
 from vllm.distributed import parallel_state, tensor_model_parallel_all_gather
 from vllm.distributed import utils as dist_utils
 from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import QuickGELU
+from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
 from vllm.model_executor.layers.conv import Conv3dLayer
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
@@ -90,6 +89,7 @@
 from vllm.sequence import IntermediateTensors
 from vllm.tokenizers import TokenizerLike
 from vllm.utils.tensor_schema import TensorSchema, TensorShape
+from vllm.v1.attention.backends.registry import AttentionBackendEnum

 from .interfaces import (
     MultiModalEmbeddings,
diff --git a/vllm/model_executor/models/qwen3.py b/vllm/model_executor/models/qwen3.py
index 0d0da52ed738..707e0ccfd3c5 100644
--- a/vllm/model_executor/models/qwen3.py
+++ b/vllm/model_executor/models/qwen3.py
@@ -30,7 +30,6 @@
 from torch import nn
 from transformers import Qwen3Config

-from vllm.attention.backends.abstract import AttentionType
 from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
@@ -44,6 +43,7 @@
 from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
 from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.config import set_default_rope_theta
+from vllm.v1.attention.backend import AttentionType

 from .interfaces import SupportsEagle3, SupportsLoRA, SupportsPP
 from .qwen2 import Qwen2MLP as Qwen3MLP
diff --git a/vllm/model_executor/models/qwen3_next.py b/vllm/model_executor/models/qwen3_next.py
index 9fded8e6b587..c3e45de707c5 100644
--- a/vllm/model_executor/models/qwen3_next.py
+++ b/vllm/model_executor/models/qwen3_next.py
@@ -10,7 +10,6 @@
 from torch import nn
 from transformers.activations import ACT2FN

-from vllm.attention.backends.abstract import AttentionMetadata
 from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import (
@@ -75,6 +74,7 @@
 from vllm.transformers_utils.configs import Qwen3NextConfig
 from vllm.triton_utils import tl, triton
 from vllm.utils.torch_utils import direct_register_custom_op
+from vllm.v1.attention.backend import AttentionMetadata
 from vllm.v1.attention.backends.gdn_attn import GDNAttentionMetadata

 from .interfaces import (
diff --git a/vllm/model_executor/models/qwen3_omni_moe_thinker.py b/vllm/model_executor/models/qwen3_omni_moe_thinker.py
index de8027c434cc..d17ac6ce8cb3 100755
--- a/vllm/model_executor/models/qwen3_omni_moe_thinker.py
+++ b/vllm/model_executor/models/qwen3_omni_moe_thinker.py
@@ -46,7 +46,6 @@
 )
 from transformers.models.whisper import WhisperFeatureExtractor

-from vllm.attention.backends.registry import AttentionBackendEnum
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import MultiModalConfig, VllmConfig
 from vllm.distributed import get_pp_group
@@ -75,6 +74,7 @@
     PromptUpdateDetails,
 )
 from vllm.sequence import IntermediateTensors
+from vllm.v1.attention.backends.registry import AttentionBackendEnum

 from .interfaces import (
     MultiModalEmbeddings,
diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py
index f47062c101e3..279d280677bc 100644
--- a/vllm/model_executor/models/qwen3_vl.py
+++ b/vllm/model_executor/models/qwen3_vl.py
@@ -48,7 +48,6 @@
 )
 from transformers.video_utils import VideoMetadata

-from vllm.attention.backends.registry import AttentionBackendEnum
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import MultiModalConfig, VllmConfig
 from vllm.config.multimodal import BaseDummyOptions, VideoDummyOptions
@@ -92,6 +91,7 @@
 from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors
 from vllm.utils.collection_utils import is_list_of
+from vllm.v1.attention.backends.registry import AttentionBackendEnum

 from .interfaces import (
     MultiModalEmbeddings,
diff --git a/vllm/model_executor/models/seed_oss.py b/vllm/model_executor/models/seed_oss.py
index f25223c78255..91a60bfd1668 100644
--- a/vllm/model_executor/models/seed_oss.py
+++ b/vllm/model_executor/models/seed_oss.py
@@ -30,7 +30,6 @@
 from torch import nn
 from transformers import PretrainedConfig as SeedOssConfig

-from vllm.attention.backends.abstract import AttentionType
 from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
@@ -56,6 +55,7 @@
 )
 from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.config import set_default_rope_theta
+from vllm.v1.attention.backend import AttentionType

 from .interfaces import SupportsLoRA, SupportsPP
 from .utils import (
diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py
index e39ae4340fe4..c047415d4104 100644
--- a/vllm/model_executor/models/siglip.py
+++ b/vllm/model_executor/models/siglip.py
@@ -15,12 +15,14 @@
     SiglipVisionConfig,
 )

-from vllm.attention.layers.encoder_only_attention import EncoderOnlyAttention
-from vllm.attention.layers.mm_encoder_attention import MMEncoderAttention
 from vllm.config import VllmConfig
 from vllm.config.multimodal import BaseDummyOptions, MultiModalConfig
 from vllm.distributed import divide, get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import get_act_fn
+from vllm.model_executor.layers.attention.encoder_only_attention import (
+    EncoderOnlyAttention,
+)
+from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
 from vllm.model_executor.layers.conv import Conv2dLayer
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
diff --git a/vllm/model_executor/models/siglip2.py b/vllm/model_executor/models/siglip2.py
index f7c91aa28dcc..8fbc408ec23e 100644
--- a/vllm/model_executor/models/siglip2.py
+++ b/vllm/model_executor/models/siglip2.py
@@ -10,11 +10,11 @@
 from torch.nn import functional as F
 from transformers import Siglip2VisionConfig

-from vllm.attention.layers.mm_encoder_attention import MMEncoderAttention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import MultiModalConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import get_act_fn
+from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
     QKVParallelLinear,
diff --git a/vllm/model_executor/models/siglip2navit.py b/vllm/model_executor/models/siglip2navit.py
index b560710d978a..f4b79da5c3a4 100644
--- a/vllm/model_executor/models/siglip2navit.py
+++ b/vllm/model_executor/models/siglip2navit.py
@@ -11,10 +11,10 @@
 from transformers import Siglip2VisionConfig
 from transformers.configuration_utils import PretrainedConfig

-from vllm.attention.layers.mm_encoder_attention import MMEncoderAttention
 from vllm.config import MultiModalConfig
 from vllm.distributed import divide, get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import get_act_fn
+from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
 from vllm.model_executor.layers.conv import Conv2dLayer
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
diff --git a/vllm/model_executor/models/step3_vl.py b/vllm/model_executor/models/step3_vl.py
index 3c965721b9da..771e5974ae00 100644
--- a/vllm/model_executor/models/step3_vl.py
+++ b/vllm/model_executor/models/step3_vl.py
@@ -15,11 +15,11 @@
 from torchvision.transforms.functional import InterpolationMode
 from transformers import BatchFeature, PretrainedConfig, TensorType

-from vllm.attention.layers.mm_encoder_attention import MMEncoderAttention
 from vllm.config import VllmConfig
 from vllm.config.multimodal import BaseDummyOptions
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import get_act_fn
+from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
 from vllm.model_executor.layers.conv import Conv2dLayer
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
diff --git a/vllm/model_executor/models/transformers/base.py b/vllm/model_executor/models/transformers/base.py
index 2e79ace465a8..d094bb2895f2 100644
--- a/vllm/model_executor/models/transformers/base.py
+++ b/vllm/model_executor/models/transformers/base.py
@@ -27,13 +27,14 @@
 from transformers import AutoModel
 from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS

-from vllm.attention.backends.abstract import AttentionType
 from vllm.attention.layer import Attention
-from vllm.attention.layers.encoder_only_attention import EncoderOnlyAttention
 from vllm.config.utils import getattr_iter
 from vllm.distributed import get_pp_group, get_tp_group
 from vllm.distributed.utils import get_pp_indices
 from vllm.logger import init_logger
+from vllm.model_executor.layers.attention.encoder_only_attention import (
+    EncoderOnlyAttention,
+)
 from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding
 from vllm.model_executor.models.interfaces import (
     SupportsEagle,
@@ -59,6 +60,7 @@
     maybe_prefix,
 )
 from vllm.sequence import IntermediateTensors
+from vllm.v1.attention.backend import AttentionType

 if TYPE_CHECKING:
     from transformers import PreTrainedModel
diff --git a/vllm/model_executor/models/vision.py b/vllm/model_executor/models/vision.py
index 024c50f1207e..2a4bec774b09 100644
--- a/vllm/model_executor/models/vision.py
+++ b/vllm/model_executor/models/vision.py
@@ -10,7 +10,6 @@
 import torch
 from transformers import PretrainedConfig

-from vllm.attention.backends.registry import AttentionBackendEnum
 from vllm.config import VllmConfig
 from vllm.distributed import (
     get_tensor_model_parallel_rank,
@@ -19,6 +18,7 @@
 )
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
+from vllm.v1.attention.backends.registry import AttentionBackendEnum

 logger = init_logger(__name__)
diff --git a/vllm/model_executor/models/whisper.py b/vllm/model_executor/models/whisper.py
index d1dadc3d5198..2f92ba0ac683 100644
--- a/vllm/model_executor/models/whisper.py
+++ b/vllm/model_executor/models/whisper.py
@@ -18,18 +18,15 @@
 )
 from transformers.models.whisper.modeling_whisper import sinusoids

-from vllm.attention.backends.abstract import (
-    AttentionType,
-)
 from vllm.attention.layer import Attention
-from vllm.attention.layers.cross_attention import CrossAttention
-from vllm.attention.layers.mm_encoder_attention import MMEncoderAttention
 from vllm.config import CacheConfig, ModelConfig, SpeechToTextConfig, VllmConfig
 from vllm.config.multimodal import BaseDummyOptions
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.inputs.data import PromptType
 from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import get_act_fn
+from vllm.model_executor.layers.attention.cross_attention import CrossAttention
+from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
     QKVParallelLinear,
@@ -62,6 +59,9 @@
 from vllm.utils.jsontree import json_map_leaves
 from vllm.utils.tensor_schema import TensorSchema, TensorShape
 from vllm.utils.torch_utils import set_default_torch_dtype
+from vllm.v1.attention.backend import (
+    AttentionType,
+)

 from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsTranscription
 from .utils import (
diff --git a/vllm/model_executor/models/whisper_utils.py b/vllm/model_executor/models/whisper_utils.py
index 077b4aff6fec..0bd0db061541 100644
--- a/vllm/model_executor/models/whisper_utils.py
+++ b/vllm/model_executor/models/whisper_utils.py
@@ -9,20 +9,20 @@
 import torch.nn.functional as F
 from torch import nn

-from vllm.attention.backends.abstract import (
+from vllm.attention.layer import Attention
+from vllm.config import CacheConfig, VllmConfig
+from vllm.model_executor.layers.quantization import QuantizationConfig
+from vllm.v1.attention.backend import (
     AttentionBackend,
     AttentionMetadata,
     AttentionType,
 )
-from vllm.attention.layer import Attention
-from vllm.attention.selector import get_attn_backend
-from vllm.config import CacheConfig, VllmConfig
-from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.v1.attention.backends.flash_attn import FlashAttentionBackend
 from vllm.v1.attention.backends.utils import (
     CommonAttentionMetadata,
     subclass_attention_backend_with_overrides,
 )
+from vllm.v1.attention.selector import get_attn_backend
 from vllm.v1.kv_cache_interface import AttentionSpec

 # From https://platform.openai.com/docs/guides/speech-to-text/supported-languages
diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py
index c3adc0036863..949e9f41e39e 100644
--- a/vllm/platforms/cpu.py
+++ b/vllm/platforms/cpu.py
@@ -15,16 +15,16 @@
 import torch

 from vllm import envs
-from vllm.attention.backends.registry import AttentionBackendEnum
 from vllm.logger import init_logger
+from vllm.v1.attention.backends.registry import AttentionBackendEnum

 from .interface import CpuArchEnum, Platform, PlatformEnum

 logger = init_logger(__name__)

 if TYPE_CHECKING:
-    from vllm.attention.selector import AttentionSelectorConfig
     from vllm.config import VllmConfig
+    from vllm.v1.attention.selector import AttentionSelectorConfig
 else:
     VllmConfig = None
diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py
index 2dc4ba5d70ca..47d634416ae5 100644
--- a/vllm/platforms/cuda.py
+++ b/vllm/platforms/cuda.py
@@ -14,17 +14,17 @@
 # import custom ops, trigger op registration
 import vllm._C  # noqa
-from vllm.attention.backends.registry import AttentionBackendEnum
 from vllm.logger import init_logger
 from vllm.utils.import_utils import import_pynvml
 from vllm.utils.torch_utils import cuda_device_count_stateless
+from vllm.v1.attention.backends.registry import AttentionBackendEnum

 from .interface import DeviceCapability, Platform, PlatformEnum

 if TYPE_CHECKING:
-    from vllm.attention.selector import AttentionSelectorConfig
     from vllm.config import VllmConfig
     from vllm.config.cache import CacheDType
+    from vllm.v1.attention.selector import AttentionSelectorConfig
 else:
     VllmConfig = None
     CacheDType = None
@@ -148,7 +148,7 @@ def log_warnings(cls):

     @classmethod
     def check_and_update_config(cls, vllm_config: "VllmConfig") -> None:
-        from vllm.attention.backends.registry import AttentionBackendEnum
+        from vllm.v1.attention.backends.registry import AttentionBackendEnum

         parallel_config = vllm_config.parallel_config
         model_config = vllm_config.model_config
@@ -200,7 +200,7 @@ def check_and_update_config(cls, vllm_config: "VllmConfig") -> None:
         use_cutlass_mla = backend == AttentionBackendEnum.CUTLASS_MLA
         use_flashinfer_mla = backend == AttentionBackendEnum.FLASHINFER_MLA

-        from vllm.attention.ops.flashmla import is_flashmla_dense_supported
+        from vllm.v1.attention.ops.flashmla import is_flashmla_dense_supported

         if (
             use_flashmla
diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py
index 3bea498f1b87..f86abd712f6f 100644
--- a/vllm/platforms/interface.py
+++ b/vllm/platforms/interface.py
@@ -13,18 +13,18 @@
 import torch
 from typing_extensions import deprecated

-from vllm.attention.backends.registry import AttentionBackendEnum
 from vllm.logger import init_logger
+from vllm.v1.attention.backends.registry import AttentionBackendEnum

 if TYPE_CHECKING:
     from torch.distributed import PrefixStore, ProcessGroup

-    from vllm.attention.selector import AttentionSelectorConfig
     from vllm.config import VllmConfig
     from vllm.inputs import ProcessorInputs, PromptType
     from vllm.pooling_params import PoolingParams
     from vllm.sampling_params import SamplingParams
     from vllm.utils.argparse_utils import FlexibleArgumentParser
+    from vllm.v1.attention.selector import AttentionSelectorConfig
 else:
     FlexibleArgumentParser = object
diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py
index 278be5a71a40..3a55dd36d3dc 100644
--- a/vllm/platforms/rocm.py
+++ b/vllm/platforms/rocm.py
@@ -8,15 +8,15 @@
 import torch

 import vllm.envs as envs
-from vllm.attention.backends.registry import AttentionBackendEnum
 from vllm.logger import init_logger
 from vllm.utils.torch_utils import cuda_device_count_stateless
+from vllm.v1.attention.backends.registry import AttentionBackendEnum

 from .interface import DeviceCapability, Platform, PlatformEnum

 if TYPE_CHECKING:
-    from vllm.attention.selector import AttentionSelectorConfig
     from vllm.config import VllmConfig
+    from vllm.v1.attention.selector import AttentionSelectorConfig

 logger = init_logger(__name__)
diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py
index 60e4968abcc3..b2d7bf38dd48 100644
--- a/vllm/platforms/xpu.py
+++ b/vllm/platforms/xpu.py
@@ -7,14 +7,14 @@
 import torch

-from vllm.attention.backends.registry import AttentionBackendEnum
 from vllm.logger import init_logger
+from vllm.v1.attention.backends.registry import AttentionBackendEnum

 from .interface import DeviceCapability, Platform, PlatformEnum

 if TYPE_CHECKING:
-    from vllm.attention.selector import AttentionSelectorConfig
     from vllm.config import VllmConfig
+    from vllm.v1.attention.selector import AttentionSelectorConfig
 else:
     VllmConfig = None
diff --git a/vllm/attention/backends/abstract.py b/vllm/v1/attention/backend.py
similarity index 100%
rename from vllm/attention/backends/abstract.py
rename to vllm/v1/attention/backend.py
diff --git a/vllm/v1/attention/backends/cpu_attn.py b/vllm/v1/attention/backends/cpu_attn.py
index abbee244af3d..3fc53278a221 100644
--- a/vllm/v1/attention/backends/cpu_attn.py
+++ b/vllm/v1/attention/backends/cpu_attn.py
@@ -6,16 +6,16 @@
 import torch

 from vllm import _custom_ops as ops
-from vllm.attention.backends.abstract import (
+from vllm.config import VllmConfig
+from vllm.logger import init_logger
+from vllm.platforms import CpuArchEnum, current_platform
+from vllm.v1.attention.backend import (
     AttentionBackend,
     AttentionImpl,
     AttentionLayer,
     AttentionType,
     is_quantized_kv_cache,
 )
-from vllm.config import VllmConfig
-from vllm.logger import init_logger
-from vllm.platforms import CpuArchEnum, current_platform
 from vllm.v1.attention.backends.utils import (
     AttentionMetadataBuilder,
     CommonAttentionMetadata,
diff --git a/vllm/attention/utils/fa_utils.py b/vllm/v1/attention/backends/fa_utils.py
similarity index 100%
rename from vllm/attention/utils/fa_utils.py
rename to vllm/v1/attention/backends/fa_utils.py
diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py
index 24390605a940..aa51c1a4301f 100755
--- a/vllm/v1/attention/backends/flash_attn.py
+++ b/vllm/v1/attention/backends/flash_attn.py
@@ -9,24 +9,24 @@
 import numpy as np
 import torch

-from vllm.attention.backends.abstract import (
+from vllm.attention.layer import Attention
+from vllm.v1.attention.backend import (
     AttentionBackend,
     AttentionImpl,
     AttentionType,
     MultipleOf,
     is_quantized_kv_cache,
 )
-from vllm.attention.layer import Attention
-from vllm.attention.ops.common import cp_lse_ag_out_rs
-from vllm.attention.ops.merge_attn_states import merge_attn_states
-from vllm.attention.utils.fa_utils import (
+from vllm.v1.attention.backends.fa_utils import (
     flash_attn_supports_fp8,
     get_flash_attn_version,
     is_flash_attn_varlen_func_available,
 )
+from vllm.v1.attention.ops.common import cp_lse_ag_out_rs
+from vllm.v1.attention.ops.merge_attn_states import merge_attn_states
 
 if is_flash_attn_varlen_func_available():
-    from vllm.attention.utils.fa_utils import (
+    from vllm.v1.attention.backends.fa_utils import (
         flash_attn_supports_sinks,
         flash_attn_varlen_func,
         get_scheduler_metadata,
diff --git a/vllm/v1/attention/backends/flash_attn_diffkv.py b/vllm/v1/attention/backends/flash_attn_diffkv.py
index ebbc4a02c03b..5305cc1b8c12 100644
--- a/vllm/v1/attention/backends/flash_attn_diffkv.py
+++ b/vllm/v1/attention/backends/flash_attn_diffkv.py
@@ -4,14 +4,14 @@
 import torch
 
-from vllm.attention.backends.abstract import AttentionType
-from vllm.attention.ops.triton_reshape_and_cache_flash import (
+from vllm.v1.attention.backend import AttentionType
+from vllm.v1.attention.backends.fa_utils import is_flash_attn_varlen_func_available
+from vllm.v1.attention.ops.triton_reshape_and_cache_flash import (
     triton_reshape_and_cache_flash_diffkv,
 )
-from vllm.attention.utils.fa_utils import is_flash_attn_varlen_func_available
 
 if is_flash_attn_varlen_func_available():
-    from vllm.attention.utils.fa_utils import flash_attn_varlen_func
+    from vllm.v1.attention.backends.fa_utils import flash_attn_varlen_func
 
 from vllm.logger import init_logger
 from vllm.v1.attention.backends.utils import get_kv_cache_layout
diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py
index 0bdf396d8f4f..8dc2838d88a5 100755
--- a/vllm/v1/attention/backends/flashinfer.py
+++ b/vllm/v1/attention/backends/flashinfer.py
@@ -19,14 +19,6 @@
 from typing_extensions import override
 
 from vllm import envs
-from vllm.attention.backends.abstract import (
-    AttentionBackend,
-    AttentionImpl,
-    AttentionType,
-    MultipleOf,
-)
-from vllm.attention.ops.common import cp_lse_ag_out_rs
-from vllm.attention.ops.merge_attn_states import merge_attn_states
 from vllm.config import CUDAGraphMode, VllmConfig, get_current_vllm_config
 from vllm.config.cache import CacheDType
 from vllm.distributed.parallel_state import get_dcp_group
@@ -48,6 +40,12 @@
 )
 from vllm.utils.math_utils import cdiv
 from vllm.utils.platform_utils import is_pin_memory_available
+from vllm.v1.attention.backend import (
+    AttentionBackend,
+    AttentionImpl,
+    AttentionType,
+    MultipleOf,
+)
 from vllm.v1.attention.backends.utils import (
     AttentionCGSupport,
     AttentionMetadataBuilder,
@@ -59,6 +57,8 @@
     infer_global_hyperparameters,
     split_decodes_and_prefills,
 )
+from vllm.v1.attention.ops.common import cp_lse_ag_out_rs
+from vllm.v1.attention.ops.merge_attn_states import merge_attn_states
 from vllm.v1.kv_cache_interface import AttentionSpec
 from vllm.v1.utils import CpuGpuBuffer
diff --git a/vllm/v1/attention/backends/flex_attention.py b/vllm/v1/attention/backends/flex_attention.py
index ad99a6dadf44..994bbe3c98fa 100644
--- a/vllm/v1/attention/backends/flex_attention.py
+++ b/vllm/v1/attention/backends/flex_attention.py
@@ -20,12 +20,6 @@
     or_masks,
 )
 
-from vllm.attention.backends.abstract import (
-    AttentionBackend,
-    AttentionImpl,
-    AttentionType,
-    is_quantized_kv_cache,
-)
 from vllm.config import VllmConfig
 from vllm.config.cache import CacheDType
 from vllm.logger import init_logger
@@ -35,6 +29,12 @@
 from vllm.platforms import current_platform
 from vllm.utils.math_utils import cdiv
 from vllm.utils.torch_utils import is_torch_equal_or_newer
+from vllm.v1.attention.backend import (
+    AttentionBackend,
+    AttentionImpl,
+    AttentionType,
+    is_quantized_kv_cache,
+)
 from vllm.v1.attention.backends.utils import (
     AttentionMetadataBuilder,
     CommonAttentionMetadata,
diff --git a/vllm/v1/attention/backends/gdn_attn.py b/vllm/v1/attention/backends/gdn_attn.py
index 96f0d20ace3e..1d58ac683912 100644
--- a/vllm/v1/attention/backends/gdn_attn.py
+++ b/vllm/v1/attention/backends/gdn_attn.py
@@ -6,8 +6,8 @@
 import torch
 
-from vllm.attention.backends.abstract import AttentionBackend
 from vllm.config import VllmConfig
+from vllm.v1.attention.backend import AttentionBackend
 from vllm.v1.attention.backends.utils import (
     PAD_SLOT_ID,
     AttentionCGSupport,
diff --git a/vllm/v1/attention/backends/linear_attn.py b/vllm/v1/attention/backends/linear_attn.py
index 004baa2d09cd..b1aad30ee5ea 100644
--- a/vllm/v1/attention/backends/linear_attn.py
+++ b/vllm/v1/attention/backends/linear_attn.py
@@ -4,8 +4,8 @@
 import torch
 
-from vllm.attention.backends.abstract import AttentionBackend
 from vllm.config import VllmConfig
+from vllm.v1.attention.backend import AttentionBackend
 from vllm.v1.attention.backends.utils import (
     AttentionCGSupport,
     AttentionMetadataBuilder,
diff --git a/vllm/v1/attention/backends/mamba1_attn.py b/vllm/v1/attention/backends/mamba1_attn.py
index 47dd44601377..9d4a37576dd4 100644
--- a/vllm/v1/attention/backends/mamba1_attn.py
+++ b/vllm/v1/attention/backends/mamba1_attn.py
@@ -3,7 +3,7 @@
 from dataclasses import dataclass
 
-from vllm.attention.backends.abstract import AttentionBackend
+from vllm.v1.attention.backend import AttentionBackend
 from vllm.v1.attention.backends.mamba_attn import (
     BaseMambaAttentionMetadata,
     BaseMambaAttentionMetadataBuilder,
diff --git a/vllm/v1/attention/backends/mamba2_attn.py b/vllm/v1/attention/backends/mamba2_attn.py
index 74925a86e1d4..a5f661d5dc1b 100644
--- a/vllm/v1/attention/backends/mamba2_attn.py
+++ b/vllm/v1/attention/backends/mamba2_attn.py
@@ -5,9 +5,9 @@
 import torch
 
-from vllm.attention.backends.abstract import AttentionBackend
 from vllm.config import VllmConfig
 from vllm.utils.math_utils import cdiv
+from vllm.v1.attention.backend import AttentionBackend
 from vllm.v1.attention.backends.mamba_attn import (
     BaseMambaAttentionMetadata,
     BaseMambaAttentionMetadataBuilder,
diff --git a/vllm/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py
index 2ee2740a51ba..a5bd949e9bbe 100755
--- a/vllm/v1/attention/backends/mla/common.py
+++ b/vllm/v1/attention/backends/mla/common.py
@@ -199,15 +199,6 @@
 from vllm import _custom_ops as ops
 from vllm import envs
 from vllm._aiter_ops import rocm_aiter_ops
-from vllm.attention.backends.abstract import (
-    AttentionBackend,
-    AttentionLayer,
-    AttentionMetadata,
-    MLAAttentionImpl,
-)
-from vllm.attention.ops.common import cp_lse_ag_out_rs
-from vllm.attention.ops.merge_attn_states import merge_attn_states
-from vllm.attention.utils.fa_utils import get_flash_attn_version
 from vllm.config import ModelConfig, VllmConfig, get_current_vllm_config
 from vllm.distributed.parallel_state import get_dcp_group, is_global_first_rank
 from vllm.logger import init_logger
@@ -222,6 +213,13 @@
 from vllm.platforms import current_platform
 from vllm.utils.flashinfer import has_nvidia_artifactory
 from vllm.utils.math_utils import cdiv, round_down
+from vllm.v1.attention.backend import (
+    AttentionBackend,
+    AttentionLayer,
+    AttentionMetadata,
+    MLAAttentionImpl,
+)
+from vllm.v1.attention.backends.fa_utils import get_flash_attn_version
 from vllm.v1.attention.backends.utils import (
     AttentionMetadataBuilder,
     CommonAttentionMetadata,
@@ -230,6 +228,8 @@
     infer_global_hyperparameters,
     split_decodes_and_prefills,
 )
+from vllm.v1.attention.ops.common import cp_lse_ag_out_rs
+from vllm.v1.attention.ops.merge_attn_states import merge_attn_states
 from vllm.v1.kv_cache_interface import AttentionSpec
diff --git a/vllm/v1/attention/backends/mla/cutlass_mla.py b/vllm/v1/attention/backends/mla/cutlass_mla.py
index 5e3fbc0abf08..8cb8fa1f5b83 100644
--- a/vllm/v1/attention/backends/mla/cutlass_mla.py
+++ b/vllm/v1/attention/backends/mla/cutlass_mla.py
@@ -7,15 +7,15 @@
 import torch
 
 import vllm._custom_ops as ops
-from vllm.attention.backends.abstract import (
+from vllm.config.cache import CacheDType
+from vllm.logger import init_logger
+from vllm.platforms.interface import DeviceCapability
+from vllm.v1.attention.backend import (
     AttentionLayer,
     AttentionType,
     MultipleOf,
     is_quantized_kv_cache,
 )
-from vllm.config.cache import CacheDType
-from vllm.logger import init_logger
-from vllm.platforms.interface import DeviceCapability
 from vllm.v1.attention.backends.mla.common import (
     MLACommonBackend,
     MLACommonImpl,
diff --git a/vllm/v1/attention/backends/mla/flashattn_mla.py b/vllm/v1/attention/backends/mla/flashattn_mla.py
index 915b51c25828..2e0a19ac5a85 100644
--- a/vllm/v1/attention/backends/mla/flashattn_mla.py
+++ b/vllm/v1/attention/backends/mla/flashattn_mla.py
@@ -6,23 +6,23 @@
 import torch
 
-from vllm.attention.backends.abstract import (
+from vllm.config import VllmConfig
+from vllm.config.cache import CacheDType
+from vllm.logger import init_logger
+from vllm.model_executor.layers.batch_invariant import (
+    vllm_is_batch_invariant,
+)
+from vllm.platforms.interface import DeviceCapability
+from vllm.v1.attention.backend import (
     AttentionLayer,
     AttentionType,
     MultipleOf,
     is_quantized_kv_cache,
 )
-from vllm.attention.utils.fa_utils import (
+from vllm.v1.attention.backends.fa_utils import (
     flash_attn_supports_mla,
     get_flash_attn_version,
 )
-from vllm.config import VllmConfig
-from vllm.config.cache import CacheDType
-from vllm.logger import init_logger
-from vllm.model_executor.layers.batch_invariant import (
-    vllm_is_batch_invariant,
-)
-from vllm.platforms.interface import DeviceCapability
 from vllm.v1.attention.backends.mla.common import (
     MLACommonBackend,
     MLACommonDecodeMetadata,
diff --git a/vllm/v1/attention/backends/mla/flashinfer_mla.py b/vllm/v1/attention/backends/mla/flashinfer_mla.py
index f02a4bb1ef35..c0442b13fe52 100644
--- a/vllm/v1/attention/backends/mla/flashinfer_mla.py
+++ b/vllm/v1/attention/backends/mla/flashinfer_mla.py
@@ -6,14 +6,14 @@
 import torch
 from flashinfer.decode import trtllm_batch_decode_with_kv_cache_mla
 
-from vllm.attention.backends.abstract import (
+from vllm.config.cache import CacheDType
+from vllm.logger import init_logger
+from vllm.platforms.interface import DeviceCapability
+from vllm.v1.attention.backend import (
     AttentionLayer,
     AttentionType,
     MultipleOf,
 )
-from vllm.config.cache import CacheDType
-from vllm.logger import init_logger
-from vllm.platforms.interface import DeviceCapability
 from vllm.v1.attention.backends.mla.common import (
     MLACommonBackend,
     MLACommonImpl,
diff --git a/vllm/v1/attention/backends/mla/flashmla.py b/vllm/v1/attention/backends/mla/flashmla.py
index 913503ce4494..24ef6dd4d028 100644
--- a/vllm/v1/attention/backends/mla/flashmla.py
+++ b/vllm/v1/attention/backends/mla/flashmla.py
@@ -6,12 +6,6 @@
 import torch
 
-from vllm.attention.backends.abstract import AttentionLayer, AttentionType, MultipleOf
-from vllm.attention.ops.flashmla import (
-    flash_mla_with_kvcache,
-    get_mla_metadata,
-    is_flashmla_dense_supported,
-)
 from vllm.config import VllmConfig
 from vllm.config.cache import CacheDType
 from vllm.logger import init_logger
@@ -19,6 +13,7 @@
     vllm_is_batch_invariant,
 )
 from vllm.platforms.interface import DeviceCapability
+from vllm.v1.attention.backend import AttentionLayer, AttentionType, MultipleOf
 from vllm.v1.attention.backends.mla.common import (
     MLACommonBackend,
     MLACommonDecodeMetadata,
@@ -32,6 +27,11 @@
     reshape_attn_output_for_spec_decode,
     reshape_query_for_spec_decode,
 )
+from vllm.v1.attention.ops.flashmla import (
+    flash_mla_with_kvcache,
+    get_mla_metadata,
+    is_flashmla_dense_supported,
+)
 from vllm.v1.kv_cache_interface import AttentionSpec
 
 logger = init_logger(__name__)
@@ -78,11 +78,11 @@ def supports_combination(
         device_capability: DeviceCapability,
     ) -> str | None:
         if use_sparse:
-            from vllm.attention.ops.flashmla import is_flashmla_sparse_supported
+            from vllm.v1.attention.ops.flashmla import is_flashmla_sparse_supported
 
             return is_flashmla_sparse_supported()[1]
         else:
-            from vllm.attention.ops.flashmla import is_flashmla_dense_supported
+            from vllm.v1.attention.ops.flashmla import is_flashmla_dense_supported
 
             return is_flashmla_dense_supported()[1]
diff --git a/vllm/v1/attention/backends/mla/flashmla_sparse.py b/vllm/v1/attention/backends/mla/flashmla_sparse.py
index dec92d2d4d19..282880adfad6 100644
--- a/vllm/v1/attention/backends/mla/flashmla_sparse.py
+++ b/vllm/v1/attention/backends/mla/flashmla_sparse.py
@@ -7,17 +7,6 @@
 import torch
 
 from vllm import _custom_ops as ops
-from vllm.attention.backends.abstract import (
-    AttentionBackend,
-    AttentionLayer,
-    AttentionMetadata,
-    MultipleOf,
-)
-from vllm.attention.ops.flashmla import (
-    flash_mla_sparse_prefill,
-    flash_mla_with_kvcache,
-    get_mla_metadata,
-)
 from vllm.config import VllmConfig, get_current_vllm_config
 from vllm.config.cache import CacheDType
 from vllm.logger import init_logger
@@ -25,6 +14,12 @@
 from vllm.platforms.interface import DeviceCapability
 from vllm.triton_utils import tl, triton
 from vllm.utils.math_utils import cdiv
+from vllm.v1.attention.backend import (
+    AttentionBackend,
+    AttentionLayer,
+    AttentionMetadata,
+    MultipleOf,
+)
 from vllm.v1.attention.backends.mla.common import MLACommonBaseImpl, get_mla_dims
 from vllm.v1.attention.backends.utils import (
     AttentionCGSupport,
@@ -35,6 +30,11 @@
     split_decodes_and_prefills,
     split_prefill_chunks,
 )
+from vllm.v1.attention.ops.flashmla import (
+    flash_mla_sparse_prefill,
+    flash_mla_with_kvcache,
+    get_mla_metadata,
+)
 from vllm.v1.kv_cache_interface import AttentionSpec
 from vllm.v1.worker.workspace import current_workspace_manager
diff --git a/vllm/v1/attention/backends/mla/indexer.py b/vllm/v1/attention/backends/mla/indexer.py
index d0696f60a08c..351cbc8a6372 100644
--- a/vllm/v1/attention/backends/mla/indexer.py
+++ b/vllm/v1/attention/backends/mla/indexer.py
@@ -5,14 +5,14 @@
 import torch
 
-from vllm.attention.backends.abstract import (
-    AttentionBackend,
-    MultipleOf,
-)
 from vllm.config import VllmConfig
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
 from vllm.utils.deep_gemm import get_paged_mqa_logits_metadata, is_deep_gemm_supported
+from vllm.v1.attention.backend import (
+    AttentionBackend,
+    MultipleOf,
+)
 from vllm.v1.attention.backends.utils import (
     AttentionCGSupport,
     AttentionMetadataBuilder,
diff --git a/vllm/v1/attention/backends/mla/rocm_aiter_mla.py b/vllm/v1/attention/backends/mla/rocm_aiter_mla.py
index f79d58ca1fbe..d43516e551c2 100644
--- a/vllm/v1/attention/backends/mla/rocm_aiter_mla.py
+++ b/vllm/v1/attention/backends/mla/rocm_aiter_mla.py
@@ -7,8 +7,8 @@
 import torch
 
 from vllm._aiter_ops import rocm_aiter_ops
-from vllm.attention.backends.abstract import AttentionLayer, MultipleOf
 from vllm.config import VllmConfig
+from vllm.v1.attention.backend import AttentionLayer, MultipleOf
 from vllm.v1.attention.backends.mla.common import (
     MLACommonBackend,
     MLACommonDecodeMetadata,
diff --git a/vllm/v1/attention/backends/mla/rocm_aiter_mla_sparse.py b/vllm/v1/attention/backends/mla/rocm_aiter_mla_sparse.py
index e68e80e8611a..7d05879d9075 100644
--- a/vllm/v1/attention/backends/mla/rocm_aiter_mla_sparse.py
+++ b/vllm/v1/attention/backends/mla/rocm_aiter_mla_sparse.py
@@ -9,13 +9,13 @@
 from vllm import _custom_ops as ops
 from vllm._aiter_ops import rocm_aiter_ops
-from vllm.attention.backends.abstract import (
+from vllm.config import VllmConfig
+from vllm.logger import init_logger
+from vllm.v1.attention.backend import (
     AttentionBackend,
     AttentionLayer,
     AttentionMetadata,
 )
-from vllm.config import VllmConfig
-from vllm.logger import init_logger
 from vllm.v1.attention.backends.mla.common import MLACommonBaseImpl, get_mla_dims
 from vllm.v1.attention.backends.mla.flashmla_sparse import (
     triton_convert_req_index_to_global_index,
diff --git a/vllm/v1/attention/backends/mla/triton_mla.py b/vllm/v1/attention/backends/mla/triton_mla.py
index 54ad3acb93ed..32d3fa3b0320 100644
--- a/vllm/v1/attention/backends/mla/triton_mla.py
+++ b/vllm/v1/attention/backends/mla/triton_mla.py
@@ -5,23 +5,23 @@
 import torch
 
-from vllm.attention.backends.abstract import (
-    AttentionLayer,
-    AttentionType,
-    is_quantized_kv_cache,
-)
-from vllm.attention.ops.triton_decode_attention import decode_attention_fwd
 from vllm.config.cache import CacheDType
 from vllm.logger import init_logger
 from vllm.model_executor.layers.batch_invariant import (
     vllm_is_batch_invariant,
 )
 from vllm.platforms.interface import DeviceCapability
+from vllm.v1.attention.backend import (
+    AttentionLayer,
+    AttentionType,
+    is_quantized_kv_cache,
+)
 from vllm.v1.attention.backends.mla.common import (
     MLACommonBackend,
     MLACommonImpl,
     MLACommonMetadata,
 )
+from vllm.v1.attention.ops.triton_decode_attention import decode_attention_fwd
 
 logger = init_logger(__name__)
diff --git a/vllm/attention/backends/registry.py b/vllm/v1/attention/backends/registry.py
similarity index 99%
rename from vllm/attention/backends/registry.py
rename to vllm/v1/attention/backends/registry.py
index cc1d3bfb3905..bd45702fa587 100644
--- a/vllm/attention/backends/registry.py
+++ b/vllm/v1/attention/backends/registry.py
@@ -10,7 +10,7 @@
 from vllm.utils.import_utils import resolve_obj_by_qualname
 
 if TYPE_CHECKING:
-    from vllm.attention.backends.abstract import AttentionBackend
+    from vllm.v1.attention.backend import AttentionBackend
 
 logger = init_logger(__name__)
diff --git a/vllm/v1/attention/backends/rocm_aiter_fa.py b/vllm/v1/attention/backends/rocm_aiter_fa.py
index 501f197c57cb..da14a848447d 100644
--- a/vllm/v1/attention/backends/rocm_aiter_fa.py
+++ b/vllm/v1/attention/backends/rocm_aiter_fa.py
@@ -7,25 +7,25 @@
 import torch
 
-from vllm.attention.backends.abstract import (
-    AttentionBackend,
-    AttentionImpl,
-    AttentionType,
-    MultipleOf,
-)
 from vllm.attention.layer import Attention
-from vllm.attention.ops.merge_attn_states import merge_attn_states
 from vllm.config import VllmConfig, get_layers_from_vllm_config
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
 from vllm.utils.math_utils import cdiv
 from vllm.utils.platform_utils import get_cu_count
+from vllm.v1.attention.backend import (
+    AttentionBackend,
+    AttentionImpl,
+    AttentionType,
+    MultipleOf,
+)
 from vllm.v1.attention.backends.utils import (
     AttentionCGSupport,
     AttentionMetadataBuilder,
     CommonAttentionMetadata,
     split_decodes_prefills_and_extends,
 )
+from vllm.v1.attention.ops.merge_attn_states import merge_attn_states
 from vllm.v1.kv_cache_interface import AttentionSpec
 
 _PARTITION_SIZE_ROCM = 256
diff --git a/vllm/v1/attention/backends/rocm_aiter_unified_attn.py b/vllm/v1/attention/backends/rocm_aiter_unified_attn.py
index 16fb52ab501c..9589c3128f8e 100644
--- a/vllm/v1/attention/backends/rocm_aiter_unified_attn.py
+++ b/vllm/v1/attention/backends/rocm_aiter_unified_attn.py
@@ -5,12 +5,12 @@
 import torch
 
 from vllm import _custom_ops as ops
-from vllm.attention.backends.abstract import AttentionType
 from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
     QuantKey,
     kFp8StaticTensorSym,
 )
+from vllm.v1.attention.backend import AttentionType
 from vllm.v1.attention.backends.flash_attn import FlashAttentionMetadata
 from vllm.v1.attention.backends.rocm_attn import (
     RocmAttentionBackend,
diff --git a/vllm/v1/attention/backends/rocm_attn.py b/vllm/v1/attention/backends/rocm_attn.py
index 0b7a51434cd3..9d00d8fa6810 100644
--- a/vllm/v1/attention/backends/rocm_attn.py
+++ b/vllm/v1/attention/backends/rocm_attn.py
@@ -7,17 +7,6 @@
 import torch
 
-from vllm.attention.backends.abstract import (
-    AttentionBackend,
-    AttentionImpl,
-    AttentionType,
-    MultipleOf,
-)
-from vllm.attention.ops.chunked_prefill_paged_decode import chunked_prefill_paged_decode
-from vllm.attention.ops.paged_attn import PagedAttention
-from vllm.attention.ops.triton_reshape_and_cache_flash import (
-    triton_reshape_and_cache_flash,
-)
 from vllm.config import VllmConfig
 from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
@@ -25,12 +14,25 @@
     kFp8StaticTensorSym,
 )
 from vllm.platforms import current_platform
+from vllm.v1.attention.backend import (
+    AttentionBackend,
+    AttentionImpl,
+    AttentionType,
+    MultipleOf,
+)
 from vllm.v1.attention.backends.flash_attn import FlashAttentionMetadata
 from vllm.v1.attention.backends.utils import (
     AttentionCGSupport,
     AttentionMetadataBuilder,
     CommonAttentionMetadata,
 )
+from vllm.v1.attention.ops.chunked_prefill_paged_decode import (
+    chunked_prefill_paged_decode,
+)
+from vllm.v1.attention.ops.paged_attn import PagedAttention
+from vllm.v1.attention.ops.triton_reshape_and_cache_flash import (
+    triton_reshape_and_cache_flash,
+)
 from vllm.v1.kv_cache_interface import AttentionSpec
 
 logger = init_logger(__name__)
diff --git a/vllm/v1/attention/backends/short_conv_attn.py b/vllm/v1/attention/backends/short_conv_attn.py
index e2fae37f5619..dc6b425ce860 100644
--- a/vllm/v1/attention/backends/short_conv_attn.py
+++ b/vllm/v1/attention/backends/short_conv_attn.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from dataclasses import dataclass
 
-from vllm.attention.backends.abstract import AttentionBackend
+from vllm.v1.attention.backend import AttentionBackend
 from vllm.v1.attention.backends.mamba_attn import (
     BaseMambaAttentionMetadata,
     BaseMambaAttentionMetadataBuilder,
diff --git a/vllm/v1/attention/backends/tree_attn.py b/vllm/v1/attention/backends/tree_attn.py
index 5e3c436f8a95..b6e58a25f1fa 100644
--- a/vllm/v1/attention/backends/tree_attn.py
+++ b/vllm/v1/attention/backends/tree_attn.py
@@ -9,20 +9,20 @@
 import torch
 
 from vllm import _custom_ops as ops
-from vllm.attention.backends.abstract import (
+from vllm.config import VllmConfig
+from vllm.logger import init_logger
+from vllm.v1.attention.backend import (
     AttentionBackend,
     AttentionImpl,
     AttentionType,
     MultipleOf,
 )
-from vllm.attention.ops.triton_unified_attention import unified_attention
-from vllm.config import VllmConfig
-from vllm.logger import init_logger
 from vllm.v1.attention.backends.utils import (
     AttentionMetadataBuilder,
     CommonAttentionMetadata,
     split_decodes_and_prefills,
 )
+from vllm.v1.attention.ops.triton_unified_attention import unified_attention
 from vllm.v1.kv_cache_interface import AttentionSpec
 
 logger = init_logger(__name__)
diff --git a/vllm/v1/attention/backends/triton_attn.py b/vllm/v1/attention/backends/triton_attn.py
index 9bf440a04d06..ed2f9564e329 100644
--- a/vllm/v1/attention/backends/triton_attn.py
+++ b/vllm/v1/attention/backends/triton_attn.py
@@ -7,17 +7,6 @@
 import torch
 
-from vllm.attention.backends.abstract import (
-    AttentionBackend,
-    AttentionImpl,
-    AttentionType,
-    MultipleOf,
-)
-from vllm.attention.ops.triton_prefill_attention import context_attention_fwd
-from vllm.attention.ops.triton_reshape_and_cache_flash import (
-    triton_reshape_and_cache_flash,
-)
-from vllm.attention.ops.triton_unified_attention import unified_attention
 from vllm.config import CUDAGraphMode, VllmConfig
 from vllm.config.cache import CacheDType
 from vllm.logger import init_logger
@@ -28,11 +17,22 @@
 from vllm.platforms import current_platform
 from vllm.platforms.interface import DeviceCapability
 from vllm.utils.math_utils import next_power_of_2
+from vllm.v1.attention.backend import (
+    AttentionBackend,
+    AttentionImpl,
+    AttentionType,
+    MultipleOf,
+)
 from vllm.v1.attention.backends.utils import (
     AttentionCGSupport,
     AttentionMetadataBuilder,
     CommonAttentionMetadata,
 )
+from vllm.v1.attention.ops.triton_prefill_attention import context_attention_fwd
+from vllm.v1.attention.ops.triton_reshape_and_cache_flash import (
+    triton_reshape_and_cache_flash,
+)
+from vllm.v1.attention.ops.triton_unified_attention import unified_attention
 from vllm.v1.kv_cache_interface import AttentionSpec
 
 logger = init_logger(__name__)
diff --git a/vllm/v1/attention/backends/utils.py b/vllm/v1/attention/backends/utils.py
index cc33b3319712..eecd81a6961c 100644
--- a/vllm/v1/attention/backends/utils.py
+++ b/vllm/v1/attention/backends/utils.py
@@ -29,16 +29,16 @@
     from vllm.v1.worker.gpu_input_batch import InputBatch
 
 import vllm.envs as envs
-from vllm.attention.backends.abstract import (
-    AttentionBackend,
-    AttentionImpl,
-    AttentionMetadata,
-)
 from vllm.distributed.kv_transfer.kv_connector.utils import (
     get_kv_connector_cache_layout,
 )
 from vllm.logger import init_logger
 from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase
+from vllm.v1.attention.backend import (
+    AttentionBackend,
+    AttentionImpl,
+    AttentionMetadata,
+)
 from vllm.v1.kv_cache_interface import AttentionSpec
 from vllm.v1.worker.ubatch_utils import UBatchSlice
diff --git a/vllm/attention/layers/__init__.py b/vllm/v1/attention/ops/__init__.py
similarity index 100%
rename from vllm/attention/layers/__init__.py
rename to vllm/v1/attention/ops/__init__.py
diff --git a/vllm/attention/ops/chunked_prefill_paged_decode.py b/vllm/v1/attention/ops/chunked_prefill_paged_decode.py
similarity index 100%
rename from vllm/attention/ops/chunked_prefill_paged_decode.py
rename to vllm/v1/attention/ops/chunked_prefill_paged_decode.py
diff --git a/vllm/attention/ops/common.py b/vllm/v1/attention/ops/common.py
similarity index 100%
rename from vllm/attention/ops/common.py
rename to vllm/v1/attention/ops/common.py
diff --git a/vllm/attention/ops/flashmla.py b/vllm/v1/attention/ops/flashmla.py
similarity index 100%
rename from vllm/attention/ops/flashmla.py
rename to vllm/v1/attention/ops/flashmla.py
diff --git a/vllm/attention/ops/merge_attn_states.py b/vllm/v1/attention/ops/merge_attn_states.py
similarity index 94%
rename from vllm/attention/ops/merge_attn_states.py
rename to vllm/v1/attention/ops/merge_attn_states.py
index f347fb3fbba5..673d2d94790e 100644
--- a/vllm/attention/ops/merge_attn_states.py
+++ b/vllm/v1/attention/ops/merge_attn_states.py
@@ -40,7 +40,7 @@ def supported_headdim(o: torch.Tensor) -> bool:
             output, prefix_output, prefix_lse, suffix_output, suffix_lse, output_lse
         )
     else:
-        from vllm.attention.ops.triton_merge_attn_states import merge_attn_states
+        from vllm.v1.attention.ops.triton_merge_attn_states import merge_attn_states
 
         return merge_attn_states(
             output, prefix_output, prefix_lse, suffix_output, suffix_lse, output_lse
diff --git a/vllm/attention/ops/paged_attn.py b/vllm/v1/attention/ops/paged_attn.py
similarity index 100%
rename from vllm/attention/ops/paged_attn.py
rename to vllm/v1/attention/ops/paged_attn.py
diff --git a/vllm/attention/ops/pallas_kv_cache_update.py b/vllm/v1/attention/ops/pallas_kv_cache_update.py
similarity index 100%
rename from vllm/attention/ops/pallas_kv_cache_update.py
rename to vllm/v1/attention/ops/pallas_kv_cache_update.py
diff --git a/vllm/attention/ops/prefix_prefill.py b/vllm/v1/attention/ops/prefix_prefill.py
similarity index 100%
rename from vllm/attention/ops/prefix_prefill.py
rename to vllm/v1/attention/ops/prefix_prefill.py
diff --git a/vllm/attention/ops/rocm_aiter_mla_sparse.py b/vllm/v1/attention/ops/rocm_aiter_mla_sparse.py
similarity index 100%
rename from vllm/attention/ops/rocm_aiter_mla_sparse.py
rename to vllm/v1/attention/ops/rocm_aiter_mla_sparse.py
diff --git a/vllm/attention/ops/triton_decode_attention.py b/vllm/v1/attention/ops/triton_decode_attention.py
similarity index 100%
rename from vllm/attention/ops/triton_decode_attention.py
rename to vllm/v1/attention/ops/triton_decode_attention.py
diff --git a/vllm/attention/ops/triton_merge_attn_states.py b/vllm/v1/attention/ops/triton_merge_attn_states.py
similarity index 100%
rename from vllm/attention/ops/triton_merge_attn_states.py
rename to vllm/v1/attention/ops/triton_merge_attn_states.py
diff --git a/vllm/attention/ops/triton_prefill_attention.py b/vllm/v1/attention/ops/triton_prefill_attention.py
similarity index 100%
rename from vllm/attention/ops/triton_prefill_attention.py
rename to vllm/v1/attention/ops/triton_prefill_attention.py
diff --git a/vllm/attention/ops/triton_reshape_and_cache_flash.py b/vllm/v1/attention/ops/triton_reshape_and_cache_flash.py
similarity index 100%
rename from vllm/attention/ops/triton_reshape_and_cache_flash.py
rename to vllm/v1/attention/ops/triton_reshape_and_cache_flash.py
diff --git a/vllm/attention/ops/triton_unified_attention.py b/vllm/v1/attention/ops/triton_unified_attention.py
similarity index 100%
rename from vllm/attention/ops/triton_unified_attention.py
rename to vllm/v1/attention/ops/triton_unified_attention.py
diff --git a/vllm/attention/ops/vit_attn_wrappers.py b/vllm/v1/attention/ops/vit_attn_wrappers.py
similarity index 98%
rename from vllm/attention/ops/vit_attn_wrappers.py
rename to vllm/v1/attention/ops/vit_attn_wrappers.py
index 80c4f1491cb0..72c45571f89a 100644
--- a/vllm/attention/ops/vit_attn_wrappers.py
+++ b/vllm/v1/attention/ops/vit_attn_wrappers.py
@@ -35,7 +35,7 @@ def flash_attn_maxseqlen_wrapper(
     if is_rocm_aiter:
         from aiter import flash_attn_varlen_func
     else:
-        from vllm.attention.utils.fa_utils import flash_attn_varlen_func
+        from vllm.v1.attention.backends.fa_utils import flash_attn_varlen_func
 
     if not current_platform.is_rocm() and fa_version is not None:
         kwargs["fa_version"] = fa_version
diff --git a/vllm/attention/selector.py b/vllm/v1/attention/selector.py
similarity index 97%
rename from vllm/attention/selector.py
rename to vllm/v1/attention/selector.py
index e66f698add99..e364c3235cfe 100644
--- a/vllm/attention/selector.py
+++ b/vllm/v1/attention/selector.py
@@ -6,14 +6,14 @@
 import torch
 
-from vllm.attention.backends.abstract import AttentionBackend, AttentionType
-from vllm.attention.backends.registry import (
-    MAMBA_TYPE_TO_BACKEND_MAP,
-    MambaAttentionBackendEnum,
-)
 from vllm.config.cache import CacheDType
 from vllm.logger import init_logger
 from vllm.utils.import_utils import resolve_obj_by_qualname
+from vllm.v1.attention.backend import AttentionBackend, AttentionType
+from vllm.v1.attention.backends.registry import (
+    MAMBA_TYPE_TO_BACKEND_MAP,
+    MambaAttentionBackendEnum,
+)
 
 logger = init_logger(__name__)
diff --git a/vllm/v1/kv_offload/cpu.py b/vllm/v1/kv_offload/cpu.py
index e1cf7b14a785..061cf226798d 100644
--- a/vllm/v1/kv_offload/cpu.py
+++ b/vllm/v1/kv_offload/cpu.py
@@ -4,9 +4,9 @@
 import torch
 
-from vllm.attention.backends.abstract import AttentionBackend
 from vllm.config import VllmConfig
 from vllm.platforms import current_platform
+from vllm.v1.attention.backend import AttentionBackend
 from vllm.v1.kv_offload.abstract import LoadStoreSpec, OffloadingManager
 from vllm.v1.kv_offload.arc_manager import ARCOffloadingManager
 from vllm.v1.kv_offload.backends.cpu import CPUBackend
diff --git a/vllm/v1/kv_offload/spec.py b/vllm/v1/kv_offload/spec.py
index 2cdd5ba5ffe5..549a0fdbf95c 100644
--- a/vllm/v1/kv_offload/spec.py
+++ b/vllm/v1/kv_offload/spec.py
@@ -6,8 +6,8 @@
 import torch
 
-from vllm.attention.backends.abstract import AttentionBackend
 from vllm.logger import init_logger
+from vllm.v1.attention.backend import AttentionBackend
 from vllm.v1.kv_offload.abstract import LoadStoreSpec, OffloadingManager
 from vllm.v1.kv_offload.worker.worker import OffloadingHandler
diff --git a/vllm/v1/kv_offload/worker/cpu_gpu.py b/vllm/v1/kv_offload/worker/cpu_gpu.py
index 42ae4f1413ad..dcaecb09929b 100644
--- a/vllm/v1/kv_offload/worker/cpu_gpu.py
+++ b/vllm/v1/kv_offload/worker/cpu_gpu.py
@@ -6,9 +6,9 @@
 import torch
 
 from vllm import _custom_ops as ops
-from vllm.attention.backends.abstract import AttentionBackend
 from vllm.logger import init_logger
 from vllm.utils.platform_utils import is_pin_memory_available
+from vllm.v1.attention.backend import AttentionBackend
 from vllm.v1.kv_offload.mediums import BlockIDsLoadStoreSpec
 from vllm.v1.kv_offload.worker.worker import (
     OffloadingHandler,
diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py
index f6d198f63aff..cd4f55b792a9 100644
--- a/vllm/v1/spec_decode/eagle.py
+++ b/vllm/v1/spec_decode/eagle.py
@@ -8,7 +8,6 @@
 import torch
 import torch.nn as nn
 
-from vllm.attention.backends.registry import AttentionBackendEnum
 from vllm.config import (
     CompilationMode,
     CUDAGraphMode,
@@ -27,6 +26,7 @@
 from vllm.platforms import current_platform
 from vllm.triton_utils import triton
 from vllm.utils.platform_utils import is_pin_memory_available
+from vllm.v1.attention.backends.registry import AttentionBackendEnum
 from vllm.v1.attention.backends.tree_attn import (
     TreeAttentionMetadata,
     TreeAttentionMetadataBuilder,
diff --git a/vllm/v1/worker/gpu/attn_utils.py b/vllm/v1/worker/gpu/attn_utils.py
index 6386f1a08b44..312f0ab93b84 100644
--- a/vllm/v1/worker/gpu/attn_utils.py
+++ b/vllm/v1/worker/gpu/attn_utils.py
@@ -6,9 +6,9 @@
 import numpy as np
 import torch
 
-from vllm.attention.backends.abstract import AttentionBackend
 from vllm.config import VllmConfig, get_layers_from_vllm_config
 from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase
+from vllm.v1.attention.backend import AttentionBackend
 from vllm.v1.attention.backends.utils import (
     AttentionMetadataBuilder,
     CommonAttentionMetadata,
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 5228167ed3c6..40937caefa97 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -20,12 +20,6 @@
 from tqdm import tqdm
 
 import vllm.envs as envs
-from vllm.attention.backends.abstract import (
-    AttentionBackend,
-    AttentionMetadata,
-    AttentionType,
-    MultipleOf,
-)
 from vllm.attention.layer import Attention, MLAAttention
 from vllm.compilation.counter import compilation_counter
 from vllm.compilation.cuda_graph import CUDAGraphStat, CUDAGraphWrapper
@@ -101,6 +95,12 @@
     kv_cache_dtype_str_to_dtype,
     supports_dynamo,
 )
+from vllm.v1.attention.backend import (
+    AttentionBackend,
+    AttentionMetadata,
+    AttentionType,
+    MultipleOf,
+)
 from vllm.v1.attention.backends.gdn_attn import GDNAttentionMetadataBuilder
 from vllm.v1.attention.backends.utils import (
     AttentionCGSupport,
diff --git a/vllm/v1/worker/kv_connector_model_runner_mixin.py b/vllm/v1/worker/kv_connector_model_runner_mixin.py
index 7bb4ebe476ec..ca0868befd44 100644
--- a/vllm/v1/worker/kv_connector_model_runner_mixin.py
+++ b/vllm/v1/worker/kv_connector_model_runner_mixin.py
@@ -11,7 +11,6 @@
 import torch
 
-from vllm.attention.backends.abstract import AttentionBackend
 from vllm.config import VllmConfig
 from vllm.config.cache import CacheDType
 from vllm.distributed.kv_transfer import (
@@ -22,6 +21,7 @@
 from vllm.distributed.kv_transfer.kv_connector.base import KVConnectorBase
 from vllm.forward_context import get_forward_context, set_forward_context
 from vllm.logger import init_logger
+from vllm.v1.attention.backend import AttentionBackend
 from vllm.v1.kv_cache_interface import AttentionSpec, KVCacheConfig
 from vllm.v1.outputs import (
     EMPTY_MODEL_RUNNER_OUTPUT,
diff --git a/vllm/v1/worker/utils.py b/vllm/v1/worker/utils.py
index bfe90572e232..85acc16795e2 100644
--- a/vllm/v1/worker/utils.py
+++ b/vllm/v1/worker/utils.py
@@ -7,7 +7,6 @@
 import torch
 from typing_extensions import deprecated
 
-from vllm.attention.backends.abstract import AttentionBackend
 from vllm.attention.layer import Attention
 from vllm.config import CacheConfig, ModelConfig, SchedulerConfig, VllmConfig
 from vllm.logger import init_logger
@@ -17,6 +16,7 @@
 from vllm.multimodal.registry import MultiModalRegistry
 from vllm.platforms import current_platform
 from vllm.utils.mem_utils import MemorySnapshot, format_gib
+from vllm.v1.attention.backend import AttentionBackend
 from vllm.v1.attention.backends.utils import AttentionMetadataBuilder
 from vllm.v1.core.encoder_cache_manager import compute_mm_encoder_budget
 from vllm.v1.kv_cache_interface import KVCacheGroupSpec, KVCacheSpec
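
Note for out-of-tree users: every hunk above is a mechanical import-path update, so code that has to run against vLLM versions on both sides of this refactor can try the new `vllm.v1.attention` locations first and fall back to the pre-refactor paths. This is a minimal illustrative sketch, not something the PR itself adds; both sets of paths are read directly off the renames above.

```python
# Compatibility shim for code that must import these symbols across both
# layouts. The try/except pattern is an illustration, not part of this PR.
try:
    # New layout introduced by this refactor.
    from vllm.v1.attention.backend import AttentionBackend, AttentionType
    from vllm.v1.attention.backends.registry import AttentionBackendEnum
    from vllm.v1.attention.selector import get_attn_backend
except ImportError:
    # Pre-refactor layout.
    from vllm.attention.backends.abstract import AttentionBackend, AttentionType
    from vllm.attention.backends.registry import AttentionBackendEnum
    from vllm.attention.selector import get_attn_backend
```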
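
For larger downstream trees, the same mapping can be applied as a one-shot source rewrite. The helper below is hypothetical (`RENAMED_MODULES` and `rewrite_imports` are names invented here for illustration); the key/value pairs mirror the rename records in this diff.

```python
# Hypothetical migration helper (not part of the PR): rewrites the moved
# attention module paths in downstream source text. More specific paths are
# listed first so that e.g. "vllm.attention.backends.abstract" is rewritten
# before the broader "vllm.attention.ops" rule is applied.
RENAMED_MODULES = {
    "vllm.attention.backends.abstract": "vllm.v1.attention.backend",
    "vllm.attention.backends.registry": "vllm.v1.attention.backends.registry",
    "vllm.attention.utils.fa_utils": "vllm.v1.attention.backends.fa_utils",
    "vllm.attention.selector": "vllm.v1.attention.selector",
    "vllm.attention.ops": "vllm.v1.attention.ops",
}


def rewrite_imports(source: str) -> str:
    """Apply the old-path -> new-path mapping to a source file's text."""
    for old, new in RENAMED_MODULES.items():
        source = source.replace(old, new)
    return source


# Example: rewrite_imports("from vllm.attention.ops.paged_attn import PagedAttention")
# yields "from vllm.v1.attention.ops.paged_attn import PagedAttention".
```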