diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 0e7ab72558cc..a20b4662f0fc 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -640,8 +640,9 @@ steps: # grade: Blocking source_file_dependencies: - csrc/attention/ - - vllm/attention - vllm/v1/attention + # TODO: remove this dependency (https://github.com/vllm-project/vllm/issues/32267) + - vllm/model_executor/layers/attention - tests/kernels/attention commands: - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index c2587dd8173c..0640aea928ed 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -568,8 +568,9 @@ steps: mirror_hardwares: [amdexperimental] source_file_dependencies: - csrc/attention/ - - vllm/attention - vllm/v1/attention + # TODO: remove this dependency (https://github.com/vllm-project/vllm/issues/32267) + - vllm/model_executor/layers/attention - tests/kernels/attention commands: - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT diff --git a/.buildkite/test_areas/kernels.yaml b/.buildkite/test_areas/kernels.yaml index cf4b646f3495..06a94219e992 100644 --- a/.buildkite/test_areas/kernels.yaml +++ b/.buildkite/test_areas/kernels.yaml @@ -15,8 +15,9 @@ steps: timeout_in_minutes: 35 source_file_dependencies: - csrc/attention/ - - vllm/attention - vllm/v1/attention + # TODO: remove this dependency (https://github.com/vllm-project/vllm/issues/32267) + - vllm/model_executor/layers/attention - tests/kernels/attention commands: - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index c963be4cb8f9..772c62973973 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -2,8 +2,8 @@ # for more info about CODEOWNERS file # This lists cover the "core" components of vLLM that require careful review -/vllm/attention @LucasWilkinson /vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @njhill @22quinn +/vllm/model_executor/layers/attention @LucasWilkinson /vllm/model_executor/layers/fused_moe @mgoin @pavanimajety /vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256 @pavanimajety /vllm/model_executor/layers/mamba @tdoublep diff --git a/docs/contributing/model/basic.md b/docs/contributing/model/basic.md index e2f560815d54..624f13bf7937 100644 --- a/docs/contributing/model/basic.md +++ b/docs/contributing/model/basic.md @@ -29,7 +29,7 @@ The initialization code should look like this: ```python from torch import nn from vllm.config import VllmConfig - from vllm.attention.layer import Attention + from vllm.model_executor.layers.attention import Attention class MyAttention(nn.Module): def __init__(self, vllm_config: VllmConfig, prefix: str): diff --git a/docs/design/custom_op.md b/docs/design/custom_op.md index 3f4934b15699..487522389268 100644 --- a/docs/design/custom_op.md +++ b/docs/design/custom_op.md @@ -271,7 +271,7 @@ Taking `MMEncoderAttention` as an example: ??? code ```python - from vllm.attention.layers.mm_encoder_attention import MMEncoderAttention + from vllm.model_executor.layers.attention import MMEncoderAttention from vllm.model_executor.custom_op import CustomOp diff --git a/tests/compile/test_fusion_attn.py b/tests/compile/test_fusion_attn.py index 961d6873f070..50492a5693d9 100644 --- a/tests/compile/test_fusion_attn.py +++ b/tests/compile/test_fusion_attn.py @@ -21,7 +21,6 @@ from tests.utils import flat_product from tests.v1.attention.utils import BatchSpec, create_common_attn_metadata from vllm._custom_ops import cutlass_scaled_fp4_mm, scaled_fp4_quant -from vllm.attention.layer import Attention from vllm.compilation.fusion_attn import ATTN_OP, AttnFusionPass from vllm.compilation.fx_utils import find_op_nodes from vllm.compilation.matcher_utils import QUANT_OPS @@ -40,6 +39,7 @@ set_current_vllm_config, ) from vllm.forward_context import get_forward_context, set_forward_context +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.quantization.utils.quant_utils import ( QuantKey, kFp8StaticTensorSym, diff --git a/tests/compile/test_qk_norm_rope_fusion.py b/tests/compile/test_qk_norm_rope_fusion.py index 45a114679beb..19511b7877d3 100644 --- a/tests/compile/test_qk_norm_rope_fusion.py +++ b/tests/compile/test_qk_norm_rope_fusion.py @@ -5,7 +5,6 @@ import torch from tests.compile.backend import TestBackend -from vllm.attention.layer import Attention from vllm.compilation.matcher_utils import FLASHINFER_ROTARY_OP, RMS_OP, ROTARY_OP from vllm.compilation.noop_elimination import NoOpEliminationPass from vllm.compilation.post_cleanup import PostCleanupPass @@ -21,6 +20,7 @@ VllmConfig, set_current_vllm_config, ) +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding from vllm.platforms import current_platform diff --git a/tests/kernels/attention/test_attention.py b/tests/kernels/attention/test_attention.py index 94d494613fe7..e3b612123c0c 100644 --- a/tests/kernels/attention/test_attention.py +++ b/tests/kernels/attention/test_attention.py @@ -9,8 +9,7 @@ from tests.kernels.allclose_default import get_default_atol, get_default_rtol from tests.kernels.utils import opcheck from vllm import _custom_ops as ops -from vllm.attention.layer import Attention -from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention +from vllm.model_executor.layers.attention import Attention, MMEncoderAttention from vllm.platforms import current_platform from vllm.utils.mem_utils import get_max_shared_memory_bytes from vllm.utils.torch_utils import set_random_seed diff --git a/tests/kernels/attention/test_mha_attn.py b/tests/kernels/attention/test_mha_attn.py index ecaea88674c2..25fb5c926326 100644 --- a/tests/kernels/attention/test_mha_attn.py +++ b/tests/kernels/attention/test_mha_attn.py @@ -12,7 +12,7 @@ import pytest import torch -from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention +from vllm.model_executor.layers.attention import MMEncoderAttention from vllm.platforms import current_platform from vllm.platforms.cpu import CpuPlatform from vllm.platforms.cuda import CudaPlatform diff --git a/tests/v1/worker/test_gpu_model_runner.py b/tests/v1/worker/test_gpu_model_runner.py index badbd3e9adff..458c7a2e5a4e 100644 --- a/tests/v1/worker/test_gpu_model_runner.py +++ b/tests/v1/worker/test_gpu_model_runner.py @@ -5,7 +5,6 @@ import pytest import torch -from vllm.attention.layer import Attention from vllm.config import ( AttentionConfig, CacheConfig, @@ -19,6 +18,7 @@ init_distributed_environment, initialize_model_parallel, ) +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.mamba.mamba_mixer2 import MambaMixer2 from vllm.platforms import current_platform from vllm.sampling_params import SamplingParams diff --git a/tests/v1/worker/test_utils.py b/tests/v1/worker/test_utils.py index d223ad6e0d41..76f9a8f90f70 100644 --- a/tests/v1/worker/test_utils.py +++ b/tests/v1/worker/test_utils.py @@ -7,7 +7,7 @@ def test_bind_kv_cache(default_vllm_config): - from vllm.attention.layer import Attention + from vllm.model_executor.layers.attention import Attention ctx = { "layers.0.self_attn": Attention(32, 128, 0.1, prefix="layers.0.self_attn"), @@ -35,7 +35,7 @@ def test_bind_kv_cache(default_vllm_config): def test_bind_kv_cache_non_attention(default_vllm_config): - from vllm.attention.layer import Attention + from vllm.model_executor.layers.attention import Attention # example from Jamba PP=2 ctx = { @@ -58,7 +58,7 @@ def test_bind_kv_cache_non_attention(default_vllm_config): def test_bind_kv_cache_draft_model(default_vllm_config): - from vllm.attention.layer import Attention + from vllm.model_executor.layers.attention import Attention layer_names = [ "model.layers.0.attn", diff --git a/tools/pre_commit/mypy.py b/tools/pre_commit/mypy.py index 90e65d059321..3e4f92cdc7a1 100755 --- a/tools/pre_commit/mypy.py +++ b/tools/pre_commit/mypy.py @@ -58,7 +58,6 @@ SEPARATE_GROUPS = [ "tests", # v0 related - "vllm/attention", "vllm/compilation", "vllm/lora", "vllm/model_executor", diff --git a/vllm/attention/__init__.py b/vllm/attention/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/vllm/attention/utils/__init__.py b/vllm/attention/utils/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/vllm/attention/utils/kv_sharing_utils.py b/vllm/attention/utils/kv_sharing_utils.py deleted file mode 100644 index 93af5bf7e13f..000000000000 --- a/vllm/attention/utils/kv_sharing_utils.py +++ /dev/null @@ -1,33 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -def validate_kv_sharing_target( - current_layer_name, target_layer_name, static_forward_context -): - error_msg = ( - f"Specified KV sharing target layer for {current_layer_name} " - f"is not valid: target layer {target_layer_name} " - ) - - if current_layer_name == target_layer_name: - raise ValueError(error_msg + "cannot be the same as the current layer.") - - if target_layer_name not in static_forward_context: - from vllm.model_executor.models.utils import extract_layer_index - - # If target layer name is not in the static fwd context, it means either - # a) the target layer does not come BEFORE the current layer, or - # b) the target layer is not an Attention layer that exists in the model - current_layer_idx = extract_layer_index(current_layer_name) - target_layer_idx = extract_layer_index(target_layer_name) - if current_layer_idx <= target_layer_idx: - raise ValueError(error_msg + "must come before the current layer.") - else: - raise ValueError(error_msg + "is not a valid Attention layer in the model.") - - # Currently KV sharing is only supported between layers of the same type - target_layer_attn_type = static_forward_context[target_layer_name].attn_type - expected = static_forward_context[current_layer_name].attn_type - if target_layer_attn_type != expected: - raise ValueError( - error_msg + f"must be the same type as the current layer ({expected})." - ) diff --git a/vllm/compilation/fusion_attn.py b/vllm/compilation/fusion_attn.py index 618892ad382a..0dc4b1489997 100644 --- a/vllm/compilation/fusion_attn.py +++ b/vllm/compilation/fusion_attn.py @@ -11,9 +11,9 @@ from torch._higher_order_ops.auto_functionalize import auto_functionalized from torch._inductor.pattern_matcher import PatternMatcherPass -from vllm.attention.layer import Attention from vllm.config import VllmConfig, get_layers_from_vllm_config from vllm.logger import init_logger +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.quantization.utils.quant_utils import ( QuantKey, kNvfp4Dynamic, diff --git a/vllm/compilation/qk_norm_rope_fusion.py b/vllm/compilation/qk_norm_rope_fusion.py index bc95b7238af3..3ddd2b87f065 100644 --- a/vllm/compilation/qk_norm_rope_fusion.py +++ b/vllm/compilation/qk_norm_rope_fusion.py @@ -10,9 +10,9 @@ from torch._higher_order_ops.auto_functionalize import auto_functionalized from torch._inductor.pattern_matcher import PatternMatcherPass -from vllm.attention.layer import Attention from vllm.config import VllmConfig, get_layers_from_vllm_config from vllm.logger import init_logger +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding from .fusion import empty_bf16, empty_fp32, empty_i64 diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py index 5667f1d3f248..4cfe98a6f933 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py @@ -8,7 +8,6 @@ import torch -from vllm.attention.layer import Attention from vllm.config import VllmConfig, get_layers_from_vllm_config from vllm.distributed.kv_events import BlockRemoved, BlockStored, KVCacheEvent from vllm.distributed.kv_transfer.kv_connector.utils import yield_req_data @@ -19,6 +18,7 @@ from vllm.distributed.kv_transfer.kv_connector.v1.base import KVConnectorMetadata from vllm.forward_context import ForwardContext from vllm.logger import init_logger +from vllm.model_executor.layers.attention import Attention from vllm.v1.attention.backend import AttentionBackend, AttentionMetadata from vllm.v1.core.kv_cache_manager import KVCacheBlocks from vllm.v1.core.kv_cache_utils import BlockHash diff --git a/vllm/model_executor/layers/attention/__init__.py b/vllm/model_executor/layers/attention/__init__.py index e69de29bb2d1..1be9f77427d3 100644 --- a/vllm/model_executor/layers/attention/__init__.py +++ b/vllm/model_executor/layers/attention/__init__.py @@ -0,0 +1,26 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from vllm.model_executor.layers.attention.attention import Attention +from vllm.model_executor.layers.attention.chunked_local_attention import ( + ChunkedLocalAttention, +) +from vllm.model_executor.layers.attention.cross_attention import CrossAttention +from vllm.model_executor.layers.attention.encoder_only_attention import ( + EncoderOnlyAttention, +) +from vllm.model_executor.layers.attention.mla_attention import MLAAttention +from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention +from vllm.model_executor.layers.attention.static_sink_attention import ( + StaticSinkAttention, +) + +__all__ = [ + "Attention", + "ChunkedLocalAttention", + "CrossAttention", + "EncoderOnlyAttention", + "MLAAttention", + "MMEncoderAttention", + "StaticSinkAttention", +] diff --git a/vllm/attention/layer.py b/vllm/model_executor/layers/attention/attention.py similarity index 70% rename from vllm/attention/layer.py rename to vllm/model_executor/layers/attention/attention.py index 9a6945f7ad2d..25917294ab7e 100644 --- a/vllm/attention/layer.py +++ b/vllm/model_executor/layers/attention/attention.py @@ -1,23 +1,22 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Attention layer.""" -from typing import cast +from typing import TYPE_CHECKING import torch import torch.nn as nn import vllm.envs as envs -from vllm.attention.utils.kv_sharing_utils import validate_kv_sharing_target -from vllm.attention.utils.kv_transfer_utils import maybe_transfer_kv_layer from vllm.config import CacheConfig, get_current_vllm_config from vllm.config.vllm import VllmConfig from vllm.forward_context import ForwardContext, get_forward_context from vllm.logger import init_logger +from vllm.model_executor.layers.attention.kv_transfer_utils import ( + maybe_transfer_kv_layer, +) from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase from vllm.model_executor.layers.batch_invariant import vllm_is_batch_invariant from vllm.model_executor.layers.linear import ( - ColumnParallelLinear, UnquantizedLinearMethod, ) from vllm.model_executor.layers.quantization import QuantizationConfig @@ -33,20 +32,54 @@ from vllm.v1.attention.backend import ( AttentionBackend, AttentionType, - MLAAttentionImpl, ) from vllm.v1.attention.backends.registry import AttentionBackendEnum from vllm.v1.attention.selector import get_attn_backend from vllm.v1.kv_cache_interface import ( FullAttentionSpec, KVCacheSpec, - MLAAttentionSpec, SlidingWindowSpec, ) +if TYPE_CHECKING: + from vllm.model_executor.layers.attention import MLAAttention + logger = init_logger(__name__) +def validate_kv_sharing_target( + current_layer_name, target_layer_name, static_forward_context +): + error_msg = ( + f"Specified KV sharing target layer for {current_layer_name} " + f"is not valid: target layer {target_layer_name} " + ) + + if current_layer_name == target_layer_name: + raise ValueError(error_msg + "cannot be the same as the current layer.") + + if target_layer_name not in static_forward_context: + from vllm.model_executor.models.utils import extract_layer_index + + # If target layer name is not in the static fwd context, it means either + # a) the target layer does not come BEFORE the current layer, or + # b) the target layer is not an Attention layer that exists in the model + current_layer_idx = extract_layer_index(current_layer_name) + target_layer_idx = extract_layer_index(target_layer_name) + if current_layer_idx <= target_layer_idx: + raise ValueError(error_msg + "must come before the current layer.") + else: + raise ValueError(error_msg + "is not a valid Attention layer in the model.") + + # Currently KV sharing is only supported between layers of the same type + target_layer_attn_type = static_forward_context[target_layer_name].attn_type + expected = static_forward_context[current_layer_name].attn_type + if target_layer_attn_type != expected: + raise ValueError( + error_msg + f"must be the same type as the current layer ({expected})." + ) + + def should_load_quant_weights(quant_method: QuantizeMethodBase | None) -> bool: """Returns whether the quantization method should load quantized weights.""" return quant_method is not None and not isinstance( @@ -493,236 +526,6 @@ def get_kv_cache_spec(self, vllm_config: VllmConfig) -> KVCacheSpec: ) -class MLAAttention(nn.Module, AttentionLayerBase): - """Multi-Head Latent Attention layer. - - This class takes query, and compressed key/value tensors as input. - The class does the following: - - 1. Store the input key and value tensors in the KV cache. - 2. Perform (multi-head/multi-query/grouped-query) attention. - 3. Return the output tensor. - """ - - def __init__( - self, - num_heads: int, - scale: float, - qk_nope_head_dim: int, - qk_rope_head_dim: int, - v_head_dim: int, - q_lora_rank: int | None, - kv_lora_rank: int, - kv_b_proj: ColumnParallelLinear, - cache_config: CacheConfig | None = None, - quant_config: QuantizationConfig | None = None, - prefix: str = "", - use_sparse: bool = False, - indexer: object | None = None, - **extra_impl_args, - ): - super().__init__() - self.num_heads = num_heads - self.scale = scale - self.qk_nope_head_dim = qk_nope_head_dim - self.qk_rope_head_dim = qk_rope_head_dim - self.v_head_dim = v_head_dim - self.q_lora_rank = q_lora_rank - self.kv_lora_rank = kv_lora_rank - self.head_size = kv_lora_rank + qk_rope_head_dim - self.layer_name = prefix - - if cache_config is not None: - kv_cache_dtype = cache_config.cache_dtype - block_size = cache_config.block_size - calculate_kv_scales = cache_config.calculate_kv_scales - else: - kv_cache_dtype = "auto" - block_size = 16 - calculate_kv_scales = False - self.quant_config = quant_config - - # Initialize KV cache quantization attributes - self.kv_cache_dtype = kv_cache_dtype - self.calculate_kv_scales = calculate_kv_scales - _init_kv_cache_quant(self, quant_config, prefix) - - dtype = torch.get_default_dtype() - self.attn_backend = get_attn_backend( - self.head_size, - dtype, - kv_cache_dtype, - block_size, - use_mla=True, - use_sparse=use_sparse, - ) - - if ( - cache_config is not None - and cache_config.enable_prefix_caching - and vllm_is_batch_invariant() - and ( - self.attn_backend.get_name() == "TRITON_MLA" - or self.attn_backend.get_name() == "FLASHINFER" - ) - ): - logger.warning_once( - "Disabling prefix caching for TRITON_MLA / FLASHINFER " - "with batch invariance, as it is not yet supported.", - scope="local", - ) - cache_config.enable_prefix_caching = False - - impl_cls = cast(type[MLAAttentionImpl], self.attn_backend.get_impl_cls()) - self.impl = impl_cls( - num_heads=self.num_heads, - head_size=self.head_size, - scale=self.scale, - num_kv_heads=1, - alibi_slopes=None, - sliding_window=None, - kv_cache_dtype=self.kv_cache_dtype, - logits_soft_cap=None, - attn_type=AttentionType.DECODER, - kv_sharing_target_layer_name=None, - # MLA Args - q_lora_rank=self.q_lora_rank, - kv_lora_rank=self.kv_lora_rank, - qk_nope_head_dim=self.qk_nope_head_dim, - qk_rope_head_dim=self.qk_rope_head_dim, - qk_head_dim=self.qk_nope_head_dim + self.qk_rope_head_dim, - v_head_dim=self.v_head_dim, - kv_b_proj=kv_b_proj, - indexer=indexer, - **extra_impl_args, - ) - - self.use_direct_call = not current_platform.opaque_attention_op() - - compilation_config = get_current_vllm_config().compilation_config - if prefix in compilation_config.static_forward_context: - raise ValueError(f"Duplicate layer name: {prefix}") - compilation_config.static_forward_context[prefix] = self - - self.kv_cache = [ - torch.tensor([]) - for _ in range( - get_current_vllm_config().parallel_config.pipeline_parallel_size - ) - ] - - self.use_sparse = use_sparse - - # Initialize q/k/v range constants. - self.q_range = torch.tensor(envs.Q_SCALE_CONSTANT, dtype=torch.float32) - self.k_range = torch.tensor(envs.K_SCALE_CONSTANT, dtype=torch.float32) - self.v_range = torch.tensor(envs.V_SCALE_CONSTANT, dtype=torch.float32) - - def forward( - self, - q: torch.Tensor, - kv_c_normed: torch.Tensor, - k_pe: torch.Tensor, - output_shape: torch.Size | None = None, - ) -> torch.Tensor: - if self.calculate_kv_scales: - torch.ops.vllm.maybe_calc_kv_scales(q, kv_c_normed, k_pe, self.layer_name) - - if self.use_direct_call: - forward_context: ForwardContext = get_forward_context() - attn_metadata = forward_context.attn_metadata - if isinstance(attn_metadata, dict): - attn_metadata = attn_metadata[self.layer_name] - self_kv_cache = self.kv_cache[forward_context.virtual_engine] - - if self.attn_backend.accept_output_buffer: - output = torch.empty(output_shape, dtype=q.dtype, device=q.device) - self.impl.forward( - self, - q, - kv_c_normed, - k_pe, - self_kv_cache, - attn_metadata, - output=output, - ) - return output - else: - return self.impl.forward( - self, q, kv_c_normed, k_pe, self_kv_cache, attn_metadata - ) - else: - if self.attn_backend.accept_output_buffer: - output = torch.empty(output_shape, dtype=q.dtype, device=q.device) - torch.ops.vllm.unified_mla_attention_with_output( - q, - kv_c_normed, - k_pe, - output, - self.layer_name, - ) - return output - else: - return torch.ops.vllm.unified_mla_attention( - q, - kv_c_normed, - k_pe, - self.layer_name, - ) - - def process_weights_after_loading(self, act_dtype: torch.dtype): - if hasattr(self.impl, "process_weights_after_loading"): - self.impl.process_weights_after_loading(act_dtype) - - # If we should not load quant weights, we initialize the scales to 1.0 - # as the default value. See [Note: Register q/k/v/prob scales in state dict] - # for more details. - quant_method = ( - self.quant_config.get_quant_method(self, prefix=self.layer_name) - if self.quant_config - else None - ) - if not should_load_quant_weights(quant_method): - set_default_quant_scales(self, register_buffer=False) - - def calc_kv_scales( - self, q: torch.Tensor, kv_c_normed: torch.Tensor, k_pe: torch.Tensor - ) -> None: - """Optional scale calculation for MLA inputs. - - Mirrors Attention.calc_kv_scales. Not all MLA backends require this - """ - # Use safe defaults if ranges are not present - q_range = getattr(self, "q_range", torch.tensor(1.0)) - k_range = getattr(self, "k_range", torch.tensor(1.0)) - v_range = getattr(self, "v_range", torch.tensor(1.0)) - - self._q_scale.copy_(torch.abs(q).max() / q_range) - # kv_c_normed is the compressed KV representation; use it for k/v - kv_abs_max = torch.abs(kv_c_normed).max() - self._k_scale.copy_(kv_abs_max / k_range) - self._v_scale.copy_(kv_abs_max / v_range) - self._q_scale_float = self._q_scale.item() - self._k_scale_float = self._k_scale.item() - self._v_scale_float = self._v_scale.item() - self.calculate_kv_scales = False - - def get_attn_backend(self) -> type[AttentionBackend]: - return self.attn_backend - - def get_kv_cache_spec(self, vllm_config: VllmConfig) -> KVCacheSpec: - kv_cache_dtype = kv_cache_dtype_str_to_dtype( - self.kv_cache_dtype, vllm_config.model_config - ) - return MLAAttentionSpec( - block_size=vllm_config.cache_config.block_size, - num_kv_heads=1, - head_size=self.head_size, - dtype=kv_cache_dtype, - cache_dtype_str=vllm_config.cache_config.cache_dtype, - ) - - def maybe_calc_kv_scales( query: torch.Tensor, key: torch.Tensor, @@ -759,7 +562,7 @@ def maybe_calc_kv_scales_fake( def get_attention_context( layer_name: str, -) -> tuple[dict | object | None, Attention | MLAAttention, torch.Tensor]: +) -> tuple[dict | object | None, "Attention | MLAAttention", torch.Tensor]: """Extract attention context for a given layer. This helper function extracts the attention metadata, attention layer @@ -782,7 +585,7 @@ def get_attention_context( attn_metadata = forward_context.attn_metadata if isinstance(attn_metadata, dict): attn_metadata = attn_metadata[layer_name] - attn_layer: Attention | MLAAttention = forward_context.no_compile_layers[layer_name] + attn_layer = forward_context.no_compile_layers[layer_name] kv_cache = attn_layer.kv_cache[forward_context.virtual_engine] return attn_metadata, attn_layer, kv_cache @@ -914,79 +717,3 @@ def unified_attention_with_output_fake( mutates_args=["output", "output_block_scale"], fake_impl=unified_attention_with_output_fake, ) - - -@maybe_transfer_kv_layer -def unified_mla_attention( - q: torch.Tensor, - kv_c_normed: torch.Tensor, - k_pe: torch.Tensor, - layer_name: str, -) -> torch.Tensor: - attn_metadata, self, kv_cache = get_attention_context(layer_name) - output = self.impl.forward(self, q, kv_c_normed, k_pe, kv_cache, attn_metadata) - - return output - - -def unified_mla_attention_fake( - q: torch.Tensor, - kv_c_normed: torch.Tensor, - k_pe: torch.Tensor, - layer_name: str, -) -> torch.Tensor: - return torch.empty_like(q).contiguous() - - -direct_register_custom_op( - op_name="unified_mla_attention", - op_func=unified_mla_attention, - mutates_args=[], - fake_impl=unified_mla_attention_fake, - dispatch_key=current_platform.dispatch_key, -) - - -@maybe_transfer_kv_layer -def unified_mla_attention_with_output( - q: torch.Tensor, - kv_c_normed: torch.Tensor, - k_pe: torch.Tensor, - output: torch.Tensor, - layer_name: str, - output_scale: torch.Tensor | None = None, - output_block_scale: torch.Tensor | None = None, -) -> None: - attn_metadata, self, kv_cache = get_attention_context(layer_name) - self.impl.forward( - self, - q, - kv_c_normed, - k_pe, - kv_cache, - attn_metadata, - output=output, - output_scale=output_scale, - output_block_scale=output_block_scale, - ) - - -def unified_mla_attention_with_output_fake( - q: torch.Tensor, - kv_c_normed: torch.Tensor, - k_pe: torch.Tensor, - output: torch.Tensor, - layer_name: str, - output_scale: torch.Tensor | None = None, - output_block_scale: torch.Tensor | None = None, -) -> None: - return - - -direct_register_custom_op( - op_name="unified_mla_attention_with_output", - op_func=unified_mla_attention_with_output, - mutates_args=["output", "output_block_scale"], - fake_impl=unified_mla_attention_with_output_fake, - dispatch_key=current_platform.dispatch_key, -) diff --git a/vllm/model_executor/layers/attention/chunked_local_attention.py b/vllm/model_executor/layers/attention/chunked_local_attention.py index 0fae5144397f..e33733c0cc1f 100644 --- a/vllm/model_executor/layers/attention/chunked_local_attention.py +++ b/vllm/model_executor/layers/attention/chunked_local_attention.py @@ -4,9 +4,9 @@ import torch -from vllm.attention.layer import Attention from vllm.config import CacheConfig from vllm.config.vllm import VllmConfig +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.v1.attention.backend import ( AttentionBackend, diff --git a/vllm/model_executor/layers/attention/cross_attention.py b/vllm/model_executor/layers/attention/cross_attention.py index f47fa1148da6..6a829db26361 100644 --- a/vllm/model_executor/layers/attention/cross_attention.py +++ b/vllm/model_executor/layers/attention/cross_attention.py @@ -6,9 +6,9 @@ import numpy as np import torch -from vllm.attention.layer import Attention from vllm.config import CacheConfig, VllmConfig from vllm.logger import init_logger +from vllm.model_executor.layers.attention import Attention from vllm.utils.math_utils import cdiv from vllm.v1.attention.backend import ( AttentionBackend, diff --git a/vllm/model_executor/layers/attention/encoder_only_attention.py b/vllm/model_executor/layers/attention/encoder_only_attention.py index 89a92ca1bf38..941911028912 100644 --- a/vllm/model_executor/layers/attention/encoder_only_attention.py +++ b/vllm/model_executor/layers/attention/encoder_only_attention.py @@ -5,9 +5,9 @@ import torch -from vllm.attention.layer import Attention from vllm.config import CacheConfig from vllm.config.vllm import VllmConfig +from vllm.model_executor.layers.attention import Attention from vllm.v1.attention.backend import ( AttentionBackend, AttentionMetadata, diff --git a/vllm/attention/utils/kv_transfer_utils.py b/vllm/model_executor/layers/attention/kv_transfer_utils.py similarity index 95% rename from vllm/attention/utils/kv_transfer_utils.py rename to vllm/model_executor/layers/attention/kv_transfer_utils.py index 210be55feb2f..9ee6b4d0f5b8 100644 --- a/vllm/attention/utils/kv_transfer_utils.py +++ b/vllm/model_executor/layers/attention/kv_transfer_utils.py @@ -19,7 +19,7 @@ def maybe_transfer_kv_layer(func: Callable) -> Callable: On exit: saves the KV layer to the connector. """ # Import at runtime to avoid circular dependency - from vllm.attention.layer import get_attention_context + from vllm.model_executor.layers.attention.attention import get_attention_context # Inspect the signature ONCE when the decorator is applied. sig = inspect.signature(func) diff --git a/vllm/model_executor/layers/attention/mla_attention.py b/vllm/model_executor/layers/attention/mla_attention.py old mode 100755 new mode 100644 index 9371a977f96f..112c3a5a9523 --- a/vllm/model_executor/layers/attention/mla_attention.py +++ b/vllm/model_executor/layers/attention/mla_attention.py @@ -191,24 +191,38 @@ from abc import abstractmethod from dataclasses import dataclass, field from enum import Enum -from typing import ClassVar, Generic, TypeVar +from typing import TYPE_CHECKING, ClassVar, Generic, TypeVar, cast + +if TYPE_CHECKING: + from flashinfer import BatchPrefillWithRaggedKVCacheWrapper import torch +import torch.nn as nn from tqdm import tqdm +import vllm.envs as envs from vllm import _custom_ops as ops -from vllm import envs from vllm._aiter_ops import rocm_aiter_ops -from vllm.config import ModelConfig, VllmConfig, get_current_vllm_config +from vllm.config import CacheConfig, ModelConfig, VllmConfig, get_current_vllm_config from vllm.distributed.parallel_state import get_dcp_group, is_global_first_rank +from vllm.forward_context import ForwardContext, get_forward_context from vllm.logger import init_logger from vllm.model_executor.custom_op import CustomOp -from vllm.model_executor.layers.batch_invariant import ( - vllm_is_batch_invariant, +from vllm.model_executor.layers.attention.attention import ( + _init_kv_cache_quant, + get_attention_context, + set_default_quant_scales, + should_load_quant_weights, +) +from vllm.model_executor.layers.attention.kv_transfer_utils import ( + maybe_transfer_kv_layer, ) +from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase +from vllm.model_executor.layers.batch_invariant import vllm_is_batch_invariant from vllm.model_executor.layers.linear import ( ColumnParallelLinear, ) +from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8 from vllm.model_executor.layers.quantization.utils.quant_utils import ( GroupShape, @@ -217,11 +231,16 @@ from vllm.platforms import current_platform from vllm.utils.flashinfer import has_nvidia_artifactory from vllm.utils.math_utils import cdiv, round_down +from vllm.utils.torch_utils import ( + direct_register_custom_op, + kv_cache_dtype_str_to_dtype, +) from vllm.v1.attention.backend import ( AttentionBackend, AttentionLayer, AttentionMetadata, AttentionMetadataBuilder, + AttentionType, CommonAttentionMetadata, MLAAttentionImpl, ) @@ -234,7 +253,320 @@ ) from vllm.v1.attention.ops.common import cp_lse_ag_out_rs from vllm.v1.attention.ops.merge_attn_states import merge_attn_states -from vllm.v1.kv_cache_interface import AttentionSpec +from vllm.v1.attention.selector import get_attn_backend +from vllm.v1.kv_cache_interface import ( + AttentionSpec, + KVCacheSpec, + MLAAttentionSpec, +) + +logger = init_logger(__name__) + + +class MLAAttention(nn.Module, AttentionLayerBase): + """Multi-Head Latent Attention layer. + + This class takes query, and compressed key/value tensors as input. + The class does the following: + + 1. Store the input key and value tensors in the KV cache. + 2. Perform (multi-head/multi-query/grouped-query) attention. + 3. Return the output tensor. + """ + + def __init__( + self, + num_heads: int, + scale: float, + qk_nope_head_dim: int, + qk_rope_head_dim: int, + v_head_dim: int, + q_lora_rank: int | None, + kv_lora_rank: int, + kv_b_proj: ColumnParallelLinear, + cache_config: CacheConfig | None = None, + quant_config: QuantizationConfig | None = None, + prefix: str = "", + use_sparse: bool = False, + indexer: object | None = None, + **extra_impl_args, + ): + super().__init__() + self.num_heads = num_heads + self.scale = scale + self.qk_nope_head_dim = qk_nope_head_dim + self.qk_rope_head_dim = qk_rope_head_dim + self.v_head_dim = v_head_dim + self.q_lora_rank = q_lora_rank + self.kv_lora_rank = kv_lora_rank + self.head_size = kv_lora_rank + qk_rope_head_dim + self.layer_name = prefix + + if cache_config is not None: + kv_cache_dtype = cache_config.cache_dtype + block_size = cache_config.block_size + calculate_kv_scales = cache_config.calculate_kv_scales + else: + kv_cache_dtype = "auto" + block_size = 16 + calculate_kv_scales = False + self.quant_config = quant_config + + # Initialize KV cache quantization attributes + self.kv_cache_dtype = kv_cache_dtype + self.calculate_kv_scales = calculate_kv_scales + _init_kv_cache_quant(self, quant_config, prefix) + + dtype = torch.get_default_dtype() + self.attn_backend = get_attn_backend( + self.head_size, + dtype, + kv_cache_dtype, + block_size, + use_mla=True, + use_sparse=use_sparse, + ) + + if ( + cache_config is not None + and cache_config.enable_prefix_caching + and vllm_is_batch_invariant() + and ( + self.attn_backend.get_name() == "TRITON_MLA" + or self.attn_backend.get_name() == "FLASHINFER" + ) + ): + logger.warning_once( + "Disabling prefix caching for TRITON_MLA / FLASHINFER " + "with batch invariance, as it is not yet supported.", + scope="local", + ) + cache_config.enable_prefix_caching = False + + impl_cls = cast(type[MLAAttentionImpl], self.attn_backend.get_impl_cls()) + self.impl = impl_cls( + num_heads=self.num_heads, + head_size=self.head_size, + scale=self.scale, + num_kv_heads=1, + alibi_slopes=None, + sliding_window=None, + kv_cache_dtype=self.kv_cache_dtype, + logits_soft_cap=None, + attn_type=AttentionType.DECODER, + kv_sharing_target_layer_name=None, + # MLA Args + q_lora_rank=self.q_lora_rank, + kv_lora_rank=self.kv_lora_rank, + qk_nope_head_dim=self.qk_nope_head_dim, + qk_rope_head_dim=self.qk_rope_head_dim, + qk_head_dim=self.qk_nope_head_dim + self.qk_rope_head_dim, + v_head_dim=self.v_head_dim, + kv_b_proj=kv_b_proj, + indexer=indexer, + **extra_impl_args, + ) + + self.use_direct_call = not current_platform.opaque_attention_op() + + compilation_config = get_current_vllm_config().compilation_config + if prefix in compilation_config.static_forward_context: + raise ValueError(f"Duplicate layer name: {prefix}") + compilation_config.static_forward_context[prefix] = self + + self.kv_cache = [ + torch.tensor([]) + for _ in range( + get_current_vllm_config().parallel_config.pipeline_parallel_size + ) + ] + + self.use_sparse = use_sparse + + # Initialize q/k/v range constants. + self.q_range = torch.tensor(envs.Q_SCALE_CONSTANT, dtype=torch.float32) + self.k_range = torch.tensor(envs.K_SCALE_CONSTANT, dtype=torch.float32) + self.v_range = torch.tensor(envs.V_SCALE_CONSTANT, dtype=torch.float32) + + def forward( + self, + q: torch.Tensor, + kv_c_normed: torch.Tensor, + k_pe: torch.Tensor, + output_shape: torch.Size | None = None, + ) -> torch.Tensor: + if self.calculate_kv_scales: + torch.ops.vllm.maybe_calc_kv_scales(q, kv_c_normed, k_pe, self.layer_name) + + if self.use_direct_call: + forward_context: ForwardContext = get_forward_context() + attn_metadata = forward_context.attn_metadata + if isinstance(attn_metadata, dict): + attn_metadata = attn_metadata[self.layer_name] + self_kv_cache = self.kv_cache[forward_context.virtual_engine] + + if self.attn_backend.accept_output_buffer: + output = torch.empty(output_shape, dtype=q.dtype, device=q.device) + self.impl.forward( + self, + q, + kv_c_normed, + k_pe, + self_kv_cache, + attn_metadata, + output=output, + ) + return output + else: + return self.impl.forward( + self, q, kv_c_normed, k_pe, self_kv_cache, attn_metadata + ) + else: + if self.attn_backend.accept_output_buffer: + output = torch.empty(output_shape, dtype=q.dtype, device=q.device) + torch.ops.vllm.unified_mla_attention_with_output( + q, + kv_c_normed, + k_pe, + output, + self.layer_name, + ) + return output + else: + return torch.ops.vllm.unified_mla_attention( + q, + kv_c_normed, + k_pe, + self.layer_name, + ) + + def process_weights_after_loading(self, act_dtype: torch.dtype): + if hasattr(self.impl, "process_weights_after_loading"): + self.impl.process_weights_after_loading(act_dtype) + + # If we should not load quant weights, we initialize the scales to 1.0 + # as the default value. See [Note: Register q/k/v/prob scales in state dict] + # for more details. + quant_method = ( + self.quant_config.get_quant_method(self, prefix=self.layer_name) + if self.quant_config + else None + ) + if not should_load_quant_weights(quant_method): + set_default_quant_scales(self, register_buffer=False) + + def calc_kv_scales( + self, q: torch.Tensor, kv_c_normed: torch.Tensor, k_pe: torch.Tensor + ) -> None: + """Optional scale calculation for MLA inputs. + + Mirrors Attention.calc_kv_scales. Not all MLA backends require this + """ + # Use safe defaults if ranges are not present + q_range = getattr(self, "q_range", torch.tensor(1.0)) + k_range = getattr(self, "k_range", torch.tensor(1.0)) + v_range = getattr(self, "v_range", torch.tensor(1.0)) + + self._q_scale.copy_(torch.abs(q).max() / q_range) + # kv_c_normed is the compressed KV representation; use it for k/v + kv_abs_max = torch.abs(kv_c_normed).max() + self._k_scale.copy_(kv_abs_max / k_range) + self._v_scale.copy_(kv_abs_max / v_range) + self._q_scale_float = self._q_scale.item() + self._k_scale_float = self._k_scale.item() + self._v_scale_float = self._v_scale.item() + self.calculate_kv_scales = False + + def get_attn_backend(self) -> type[AttentionBackend]: + return self.attn_backend + + def get_kv_cache_spec(self, vllm_config: VllmConfig) -> KVCacheSpec: + kv_cache_dtype = kv_cache_dtype_str_to_dtype( + self.kv_cache_dtype, vllm_config.model_config + ) + return MLAAttentionSpec( + block_size=vllm_config.cache_config.block_size, + num_kv_heads=1, + head_size=self.head_size, + dtype=kv_cache_dtype, + cache_dtype_str=vllm_config.cache_config.cache_dtype, + ) + + +@maybe_transfer_kv_layer +def unified_mla_attention( + q: torch.Tensor, + kv_c_normed: torch.Tensor, + k_pe: torch.Tensor, + layer_name: str, +) -> torch.Tensor: + attn_metadata, self, kv_cache = get_attention_context(layer_name) + output = self.impl.forward(self, q, kv_c_normed, k_pe, kv_cache, attn_metadata) + + return output + + +def unified_mla_attention_fake( + q: torch.Tensor, + kv_c_normed: torch.Tensor, + k_pe: torch.Tensor, + layer_name: str, +) -> torch.Tensor: + return torch.empty_like(q).contiguous() + + +direct_register_custom_op( + op_name="unified_mla_attention", + op_func=unified_mla_attention, + mutates_args=[], + fake_impl=unified_mla_attention_fake, + dispatch_key=current_platform.dispatch_key, +) + + +@maybe_transfer_kv_layer +def unified_mla_attention_with_output( + q: torch.Tensor, + kv_c_normed: torch.Tensor, + k_pe: torch.Tensor, + output: torch.Tensor, + layer_name: str, + output_scale: torch.Tensor | None = None, + output_block_scale: torch.Tensor | None = None, +) -> None: + attn_metadata, self, kv_cache = get_attention_context(layer_name) + self.impl.forward( + self, + q, + kv_c_normed, + k_pe, + kv_cache, + attn_metadata, + output=output, + output_scale=output_scale, + output_block_scale=output_block_scale, + ) + + +def unified_mla_attention_with_output_fake( + q: torch.Tensor, + kv_c_normed: torch.Tensor, + k_pe: torch.Tensor, + output: torch.Tensor, + layer_name: str, + output_scale: torch.Tensor | None = None, + output_block_scale: torch.Tensor | None = None, +) -> None: + return + + +direct_register_custom_op( + op_name="unified_mla_attention_with_output", + op_func=unified_mla_attention_with_output, + mutates_args=["output", "output_block_scale"], + fake_impl=unified_mla_attention_with_output_fake, + dispatch_key=current_platform.dispatch_key, +) class QueryLenSupport(Enum): @@ -266,15 +598,12 @@ class QueryLenSupport(Enum): from flash_attn import flash_attn_varlen_func # type: ignore[no-redef] is_vllm_fa = False -try: - from flashinfer import BatchPrefillWithRaggedKVCacheWrapper - from flashinfer.prefill import cudnn_batch_prefill_with_kv_cache # noqa: F401 - flashinfer_available = True -except ImportError: - BatchPrefillWithRaggedKVCacheWrapper = object +@functools.cache +def flashinfer_available() -> bool: + import importlib.util - flashinfer_available = False + return importlib.util.find_spec("flashinfer") is not None def dynamic_per_batched_tensor_quant( @@ -398,8 +727,8 @@ class ChunkedContextMetadata: @dataclass class FlashInferPrefillMetadata(MLACommonPrefillMetadata): - prefill_main: BatchPrefillWithRaggedKVCacheWrapper | None = None - prefill_chunks: list[BatchPrefillWithRaggedKVCacheWrapper] = field( + prefill_main: "BatchPrefillWithRaggedKVCacheWrapper | None" = None + prefill_chunks: "list[BatchPrefillWithRaggedKVCacheWrapper]" = field( default_factory=list ) @@ -495,7 +824,7 @@ def use_flashinfer_prefill() -> bool: vllm_config = get_current_vllm_config() if not ( not vllm_config.attention_config.disable_flashinfer_prefill - and flashinfer_available + and flashinfer_available() and not vllm_config.attention_config.use_cudnn_prefill and current_platform.is_device_capability_family(100) ): @@ -509,7 +838,7 @@ def use_cudnn_prefill() -> bool: vllm_config = get_current_vllm_config() return ( - flashinfer_available + flashinfer_available() and vllm_config.attention_config.use_cudnn_prefill and current_platform.is_device_capability_family(100) and has_nvidia_artifactory() @@ -731,6 +1060,8 @@ def _build_fi_prefill_wrappers(self, prefill: FlashInferPrefillMetadata): has_context = True if self._fi_prefill_main is None: + from flashinfer import BatchPrefillWithRaggedKVCacheWrapper + self._fi_prefill_main = BatchPrefillWithRaggedKVCacheWrapper( self._workspace_buffer, "NHD", backend="cutlass" ) @@ -739,6 +1070,8 @@ def _build_fi_prefill_wrappers(self, prefill: FlashInferPrefillMetadata): num_chunks = chunked_context.cu_seq_lens.shape[0] # Allocate more prefill chunk wrappers if needed if len(self._fi_prefill_chunks) < num_chunks: + from flashinfer import BatchPrefillWithRaggedKVCacheWrapper + for _ in range(len(self._fi_prefill_chunks), num_chunks): self._fi_prefill_chunks.append( BatchPrefillWithRaggedKVCacheWrapper( @@ -1513,6 +1846,8 @@ def _run_prefill_new_tokens_cudnn( ): assert isinstance(prefill, CudnnPrefillMetadata) assert prefill.query_seq_lens is not None + from flashinfer.prefill import cudnn_batch_prefill_with_kv_cache + output, lse = cudnn_batch_prefill_with_kv_cache( q=q, k_cache=k, @@ -1572,6 +1907,8 @@ def _run_prefill_context_chunk_cudnn( assert prefill.chunked_context is not None assert prefill.chunked_context.seq_lens[chunk_idx] is not None assert prefill.query_seq_lens is not None + from flashinfer.prefill import cudnn_batch_prefill_with_kv_cache + return cudnn_batch_prefill_with_kv_cache( q=q, k_cache=k, diff --git a/vllm/model_executor/layers/attention/static_sink_attention.py b/vllm/model_executor/layers/attention/static_sink_attention.py index a869226ea182..49d83823b512 100644 --- a/vllm/model_executor/layers/attention/static_sink_attention.py +++ b/vllm/model_executor/layers/attention/static_sink_attention.py @@ -4,11 +4,11 @@ import torch -from vllm.attention.layer import Attention from vllm.config import CacheConfig, VllmConfig from vllm.forward_context import ForwardContext, get_forward_context from vllm.logger import init_logger from vllm.model_executor.custom_op import CustomOp +from vllm.model_executor.layers.attention import Attention from vllm.utils.math_utils import cdiv from vllm.utils.torch_utils import direct_register_custom_op from vllm.v1.attention.backend import ( diff --git a/vllm/model_executor/layers/mla.py b/vllm/model_executor/layers/mla.py index 2549f1221f36..9f10ca57c037 100644 --- a/vllm/model_executor/layers/mla.py +++ b/vllm/model_executor/layers/mla.py @@ -4,9 +4,9 @@ import torch -from vllm.attention.layer import MLAAttention from vllm.config import CacheConfig from vllm.model_executor.custom_op import PluggableLayer +from vllm.model_executor.layers.attention import MLAAttention from vllm.model_executor.layers.quantization import QuantizationConfig diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index 8b1d564e27a5..5745cb547a7e 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -19,12 +19,12 @@ from compressed_tensors.transform import TransformConfig import vllm.envs as envs -from vllm.attention.layer import Attention from vllm.distributed import ( get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, ) from vllm.logger import init_logger +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.linear import ( LinearBase, diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index fe59022cba19..b378dad8c8b9 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -11,9 +11,9 @@ import vllm.model_executor.layers.fused_moe.modular_kernel as mk from vllm import _custom_ops as ops from vllm._aiter_ops import rocm_aiter_ops -from vllm.attention.layer import Attention from vllm.distributed import get_tensor_model_parallel_world_size from vllm.logger import init_logger +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.batch_invariant import ( vllm_is_batch_invariant, ) diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index 476ad618eaa1..069567866c6d 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -11,8 +11,8 @@ import vllm.envs as envs import vllm.model_executor.layers.fused_moe.modular_kernel as mk from vllm._custom_ops import cutlass_scaled_fp4_mm, scaled_fp4_quant -from vllm.attention.layer import Attention from vllm.logger import init_logger +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.fused_moe.config import ( FusedMoEConfig, FusedMoEQuantConfig, diff --git a/vllm/model_executor/layers/quantization/mxfp4.py b/vllm/model_executor/layers/quantization/mxfp4.py index f2232b1db187..c47676d01610 100644 --- a/vllm/model_executor/layers/quantization/mxfp4.py +++ b/vllm/model_executor/layers/quantization/mxfp4.py @@ -7,9 +7,9 @@ from torch.nn.parameter import Parameter from vllm import envs -from vllm.attention.layer import Attention from vllm.config import get_current_vllm_config from vllm.logger import init_logger +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.fused_moe import ( FusedMoE, FusedMoEConfig, diff --git a/vllm/model_executor/layers/quantization/petit.py b/vllm/model_executor/layers/quantization/petit.py index 5ccc73166361..e97fac80fe5e 100644 --- a/vllm/model_executor/layers/quantization/petit.py +++ b/vllm/model_executor/layers/quantization/petit.py @@ -8,8 +8,8 @@ import torch from torch.nn.parameter import Parameter -from vllm.attention.layer import Attention from vllm.logger import init_logger +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.linear import ( LinearBase, LinearMethodBase, diff --git a/vllm/model_executor/layers/quantization/ptpc_fp8.py b/vllm/model_executor/layers/quantization/ptpc_fp8.py index b97eddaff04a..1f433e07e661 100644 --- a/vllm/model_executor/layers/quantization/ptpc_fp8.py +++ b/vllm/model_executor/layers/quantization/ptpc_fp8.py @@ -7,8 +7,8 @@ from torch.nn.parameter import Parameter from vllm import _custom_ops as ops -from vllm.attention.layer import Attention from vllm.logger import init_logger +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.linear import LinearBase, UnquantizedLinearMethod from vllm.model_executor.layers.quantization import QuantizationMethods from vllm.model_executor.layers.quantization.base_config import QuantizeMethodBase diff --git a/vllm/model_executor/layers/quantization/quark/quark.py b/vllm/model_executor/layers/quantization/quark/quark.py index 39bcd56bcd3d..8fd7b875fdc8 100644 --- a/vllm/model_executor/layers/quantization/quark/quark.py +++ b/vllm/model_executor/layers/quantization/quark/quark.py @@ -6,8 +6,8 @@ import torch -from vllm.attention.layer import Attention from vllm.logger import init_logger +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.linear import ( LinearBase, diff --git a/vllm/model_executor/model_loader/utils.py b/vllm/model_executor/model_loader/utils.py index 1d67cb835e93..f3c9729268de 100644 --- a/vllm/model_executor/model_loader/utils.py +++ b/vllm/model_executor/model_loader/utils.py @@ -11,9 +11,9 @@ from torch import nn from typing_extensions import assert_never -from vllm.attention.layer import Attention, MLAAttention from vllm.config import ModelConfig, VllmConfig, set_current_vllm_config from vllm.logger import init_logger +from vllm.model_executor.layers.attention import Attention, MLAAttention from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig, QuantizeMethodBase, diff --git a/vllm/model_executor/models/afmoe.py b/vllm/model_executor/models/afmoe.py index 00605fdc6018..a47fe4b7b9dc 100644 --- a/vllm/model_executor/models/afmoe.py +++ b/vllm/model_executor/models/afmoe.py @@ -9,7 +9,6 @@ import torch from torch import nn -from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig, get_current_vllm_config from vllm.distributed import ( @@ -18,6 +17,7 @@ get_tensor_model_parallel_world_size, ) from vllm.logger import init_logger +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.fused_moe.shared_fused_moe import SharedFusedMoE from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( diff --git a/vllm/model_executor/models/aimv2.py b/vllm/model_executor/models/aimv2.py index b802bb0ee35b..5b8ead4c7b7c 100644 --- a/vllm/model_executor/models/aimv2.py +++ b/vllm/model_executor/models/aimv2.py @@ -11,7 +11,7 @@ from vllm.distributed import get_tensor_model_parallel_world_size from vllm.distributed.utils import divide from vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention +from vllm.model_executor.layers.attention import MMEncoderAttention from vllm.model_executor.layers.conv import Conv2dLayer from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( diff --git a/vllm/model_executor/models/apertus.py b/vllm/model_executor/models/apertus.py index 3ae501610c26..921d0cd3bf0c 100644 --- a/vllm/model_executor/models/apertus.py +++ b/vllm/model_executor/models/apertus.py @@ -32,12 +32,12 @@ from torch import nn from transformers import ApertusConfig -from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import XIELU -from vllm.model_executor.layers.attention.encoder_only_attention import ( +from vllm.model_executor.layers.attention import ( + Attention, EncoderOnlyAttention, ) from vllm.model_executor.layers.layernorm import RMSNorm diff --git a/vllm/model_executor/models/arctic.py b/vllm/model_executor/models/arctic.py index cf93d2eb61a6..031b6534fb69 100644 --- a/vllm/model_executor/models/arctic.py +++ b/vllm/model_executor/models/arctic.py @@ -8,7 +8,6 @@ import torch from torch import nn -from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import ( @@ -19,6 +18,7 @@ ) from vllm.logger import init_logger from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.fused_moe import fused_experts, fused_topk from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( diff --git a/vllm/model_executor/models/baichuan.py b/vllm/model_executor/models/baichuan.py index 1e0f27ec7218..bc1cd2ed811b 100644 --- a/vllm/model_executor/models/baichuan.py +++ b/vllm/model_executor/models/baichuan.py @@ -29,7 +29,6 @@ from torch import nn from transformers import PretrainedConfig -from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import ( @@ -38,6 +37,7 @@ get_tensor_model_parallel_world_size, ) from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( MergedColumnParallelLinear, diff --git a/vllm/model_executor/models/bailing_moe.py b/vllm/model_executor/models/bailing_moe.py index a8ee14aa02d2..fc10f790e4d6 100644 --- a/vllm/model_executor/models/bailing_moe.py +++ b/vllm/model_executor/models/bailing_moe.py @@ -32,7 +32,6 @@ from torch import nn from transformers.configuration_utils import PretrainedConfig -from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import ( @@ -41,6 +40,7 @@ get_tensor_model_parallel_world_size, ) from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.fused_moe import SharedFusedMoE from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( diff --git a/vllm/model_executor/models/bamba.py b/vllm/model_executor/models/bamba.py index 77f49eb65505..d220b22ddae7 100644 --- a/vllm/model_executor/models/bamba.py +++ b/vllm/model_executor/models/bamba.py @@ -9,12 +9,12 @@ from torch import nn from transformers import BambaConfig -from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import get_tensor_model_parallel_world_size from vllm.distributed.parallel_state import get_pp_group from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( MergedColumnParallelLinear, diff --git a/vllm/model_executor/models/bert.py b/vllm/model_executor/models/bert.py index 532175e7294c..0cdf4f70e5bd 100644 --- a/vllm/model_executor/models/bert.py +++ b/vllm/model_executor/models/bert.py @@ -11,7 +11,7 @@ from vllm.config import CacheConfig, ModelConfig, PoolerConfig, VllmConfig from vllm.distributed import get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import get_act_fn -from vllm.model_executor.layers.attention.encoder_only_attention import ( +from vllm.model_executor.layers.attention import ( EncoderOnlyAttention, ) from vllm.model_executor.layers.linear import ( diff --git a/vllm/model_executor/models/bert_with_rope.py b/vllm/model_executor/models/bert_with_rope.py index f200f791c32a..22bcdeb453c4 100644 --- a/vllm/model_executor/models/bert_with_rope.py +++ b/vllm/model_executor/models/bert_with_rope.py @@ -15,7 +15,7 @@ tensor_model_parallel_all_reduce, ) from vllm.model_executor.layers.activation import get_act_and_mul_fn, get_act_fn -from vllm.model_executor.layers.attention.encoder_only_attention import ( +from vllm.model_executor.layers.attention import ( EncoderOnlyAttention, ) from vllm.model_executor.layers.fused_moe import activation_without_mul, fused_topk diff --git a/vllm/model_executor/models/blip.py b/vllm/model_executor/models/blip.py index 9279cccd596d..ac9ae49f03e1 100644 --- a/vllm/model_executor/models/blip.py +++ b/vllm/model_executor/models/blip.py @@ -11,7 +11,7 @@ from vllm.distributed import divide, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import get_act_fn -from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention +from vllm.model_executor.layers.attention import MMEncoderAttention from vllm.model_executor.layers.conv import Conv2dLayer from vllm.model_executor.layers.linear import ( ColumnParallelLinear, diff --git a/vllm/model_executor/models/bloom.py b/vllm/model_executor/models/bloom.py index c6056329f200..233028a905f6 100644 --- a/vllm/model_executor/models/bloom.py +++ b/vllm/model_executor/models/bloom.py @@ -27,7 +27,6 @@ from torch import nn from transformers import BloomConfig -from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import ( @@ -36,6 +35,7 @@ get_tensor_model_parallel_world_size, ) from vllm.model_executor.layers.activation import get_act_fn +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.linear import ( ColumnParallelLinear, QKVParallelLinear, diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py index c6c48a8214fc..c4b885cc9a0c 100644 --- a/vllm/model_executor/models/chameleon.py +++ b/vllm/model_executor/models/chameleon.py @@ -16,12 +16,12 @@ ChameleonVQVAEConfig, ) -from vllm.attention.layer import Attention from vllm.config import CacheConfig, VllmConfig from vllm.config.multimodal import BaseDummyOptions from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.logger import init_logger from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.conv import Conv2dLayer from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py index ea4f87d97ab0..f48e5dc1db62 100644 --- a/vllm/model_executor/models/chatglm.py +++ b/vllm/model_executor/models/chatglm.py @@ -12,11 +12,11 @@ from torch import nn from torch.nn import LayerNorm -from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( MergedColumnParallelLinear, diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py index 481f5ae6def7..a6a303348bb7 100644 --- a/vllm/model_executor/models/clip.py +++ b/vllm/model_executor/models/clip.py @@ -14,12 +14,11 @@ CLIPVisionConfig, ) -from vllm.attention.layer import Attention from vllm.config import VllmConfig from vllm.config.multimodal import BaseDummyOptions from vllm.distributed import divide, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import get_act_fn -from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention +from vllm.model_executor.layers.attention import Attention, MMEncoderAttention from vllm.model_executor.layers.conv import Conv2dLayer from vllm.model_executor.layers.linear import ( ColumnParallelLinear, diff --git a/vllm/model_executor/models/commandr.py b/vllm/model_executor/models/commandr.py index 6095120786e6..e73dfb1f01e3 100644 --- a/vllm/model_executor/models/commandr.py +++ b/vllm/model_executor/models/commandr.py @@ -30,11 +30,11 @@ from torch import nn from transformers import Cohere2Config, CohereConfig -from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.linear import ( MergedColumnParallelLinear, QKVParallelLinear, diff --git a/vllm/model_executor/models/dbrx.py b/vllm/model_executor/models/dbrx.py index 8cef4b42836a..ca6e6a49a98a 100644 --- a/vllm/model_executor/models/dbrx.py +++ b/vllm/model_executor/models/dbrx.py @@ -8,13 +8,13 @@ import torch.nn as nn from transformers import DbrxConfig -from vllm.attention.layer import Attention from vllm.config import CacheConfig, VllmConfig from vllm.distributed import ( get_pp_group, get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, ) +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.linear import ( QKVParallelLinear, diff --git a/vllm/model_executor/models/deepencoder.py b/vllm/model_executor/models/deepencoder.py index 46ce87b348db..651ced896b5d 100644 --- a/vllm/model_executor/models/deepencoder.py +++ b/vllm/model_executor/models/deepencoder.py @@ -18,7 +18,7 @@ import torch.nn.functional as F from transformers import CLIPVisionConfig -from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention +from vllm.model_executor.layers.attention import MMEncoderAttention from vllm.model_executor.layers.conv import Conv2dLayer from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.model_loader.weight_utils import default_weight_loader diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index 5649f8bd1dcb..f8907ed86efa 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -33,7 +33,6 @@ from transformers import DeepseekV2Config, DeepseekV3Config from vllm._aiter_ops import rocm_aiter_ops -from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, ParallelConfig, VllmConfig, get_current_vllm_config from vllm.distributed import ( @@ -45,6 +44,7 @@ ) from vllm.logger import init_logger from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase from vllm.model_executor.layers.fused_moe import SharedFusedMoE from vllm.model_executor.layers.layernorm import LayerNorm, RMSNorm diff --git a/vllm/model_executor/models/dots1.py b/vllm/model_executor/models/dots1.py index b69d87f1ed48..4e393145462a 100644 --- a/vllm/model_executor/models/dots1.py +++ b/vllm/model_executor/models/dots1.py @@ -32,7 +32,6 @@ from torch import nn from transformers import Dots1Config -from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import ( @@ -41,6 +40,7 @@ tensor_model_parallel_all_reduce, ) from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.fused_moe import SharedFusedMoE from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( diff --git a/vllm/model_executor/models/dots_ocr.py b/vllm/model_executor/models/dots_ocr.py index fa5a5cc7f149..d2f39553dac5 100644 --- a/vllm/model_executor/models/dots_ocr.py +++ b/vllm/model_executor/models/dots_ocr.py @@ -16,7 +16,7 @@ get_tensor_model_parallel_world_size, ) from vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.attention.mm_encoder_attention import ( +from vllm.model_executor.layers.attention import ( MMEncoderAttention, ) from vllm.model_executor.layers.conv import Conv2dLayer diff --git a/vllm/model_executor/models/ernie45_moe.py b/vllm/model_executor/models/ernie45_moe.py index be153dbcf080..452c7624dcc0 100644 --- a/vllm/model_executor/models/ernie45_moe.py +++ b/vllm/model_executor/models/ernie45_moe.py @@ -32,7 +32,6 @@ from torch import nn from transformers import PretrainedConfig -from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig, get_current_vllm_config from vllm.distributed import ( @@ -42,6 +41,7 @@ ) from vllm.logger import init_logger from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.fused_moe import SharedFusedMoE from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( diff --git a/vllm/model_executor/models/ernie45_vl.py b/vllm/model_executor/models/ernie45_vl.py index 85c447c9b7c7..db724d027026 100644 --- a/vllm/model_executor/models/ernie45_vl.py +++ b/vllm/model_executor/models/ernie45_vl.py @@ -42,7 +42,7 @@ from vllm.distributed import utils as dist_utils from vllm.logger import init_logger from vllm.model_executor.layers.activation import QuickGELU -from vllm.model_executor.layers.attention.mm_encoder_attention import ( +from vllm.model_executor.layers.attention import ( MMEncoderAttention, ) from vllm.model_executor.layers.layernorm import RMSNorm diff --git a/vllm/model_executor/models/ernie45_vl_moe.py b/vllm/model_executor/models/ernie45_vl_moe.py index 63da84d6950c..9d3cbbecff17 100644 --- a/vllm/model_executor/models/ernie45_vl_moe.py +++ b/vllm/model_executor/models/ernie45_vl_moe.py @@ -31,12 +31,11 @@ from torch import nn from transformers import PretrainedConfig -from vllm.attention.layer import Attention - # from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.logger import init_logger +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.fused_moe import SharedFusedMoE from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( diff --git a/vllm/model_executor/models/exaone.py b/vllm/model_executor/models/exaone.py index 6cafbfb5733b..b633fd285082 100644 --- a/vllm/model_executor/models/exaone.py +++ b/vllm/model_executor/models/exaone.py @@ -32,11 +32,11 @@ from torch import nn from transformers import PretrainedConfig -from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( MergedColumnParallelLinear, diff --git a/vllm/model_executor/models/exaone4.py b/vllm/model_executor/models/exaone4.py index 5b2ef90825c7..485b145b9cdf 100644 --- a/vllm/model_executor/models/exaone4.py +++ b/vllm/model_executor/models/exaone4.py @@ -28,11 +28,11 @@ from torch import nn from transformers import Exaone4Config -from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( MergedColumnParallelLinear, diff --git a/vllm/model_executor/models/falcon.py b/vllm/model_executor/models/falcon.py index 77cb68c4c3a1..dc636274a3fb 100644 --- a/vllm/model_executor/models/falcon.py +++ b/vllm/model_executor/models/falcon.py @@ -30,7 +30,6 @@ from torch.nn import LayerNorm from transformers import FalconConfig as HF_FalconConfig -from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import ( @@ -40,6 +39,7 @@ tensor_model_parallel_all_reduce, ) from vllm.model_executor.layers.activation import get_act_fn +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.linear import ( ColumnParallelLinear, QKVParallelLinear, diff --git a/vllm/model_executor/models/falcon_h1.py b/vllm/model_executor/models/falcon_h1.py index 582f1d2441f9..3d4d253c3907 100644 --- a/vllm/model_executor/models/falcon_h1.py +++ b/vllm/model_executor/models/falcon_h1.py @@ -9,12 +9,12 @@ from torch import nn from transformers import FalconH1Config -from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import get_tensor_model_parallel_world_size from vllm.distributed.parallel_state import get_pp_group from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( MergedColumnParallelLinear, diff --git a/vllm/model_executor/models/gemma.py b/vllm/model_executor/models/gemma.py index 4d016f286217..b3ae5f5acc8e 100644 --- a/vllm/model_executor/models/gemma.py +++ b/vllm/model_executor/models/gemma.py @@ -26,12 +26,12 @@ from torch import nn from transformers import GemmaConfig -from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.logger import init_logger from vllm.model_executor.layers.activation import GeluAndMul +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.layernorm import GemmaRMSNorm from vllm.model_executor.layers.linear import ( MergedColumnParallelLinear, diff --git a/vllm/model_executor/models/gemma2.py b/vllm/model_executor/models/gemma2.py index 6d946522f70b..303f04b64dcc 100644 --- a/vllm/model_executor/models/gemma2.py +++ b/vllm/model_executor/models/gemma2.py @@ -23,12 +23,12 @@ from torch import nn from transformers import Gemma2Config -from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.logger import init_logger from vllm.model_executor.layers.activation import GeluAndMul +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.layernorm import GemmaRMSNorm from vllm.model_executor.layers.linear import ( MergedColumnParallelLinear, diff --git a/vllm/model_executor/models/gemma3.py b/vllm/model_executor/models/gemma3.py index 502fe6b825e0..b2352a3c9268 100644 --- a/vllm/model_executor/models/gemma3.py +++ b/vllm/model_executor/models/gemma3.py @@ -22,13 +22,13 @@ from torch import nn from transformers import Gemma3TextConfig -from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.logger import init_logger from vllm.model_executor.layers.activation import GeluAndMul -from vllm.model_executor.layers.attention.encoder_only_attention import ( +from vllm.model_executor.layers.attention import ( + Attention, EncoderOnlyAttention, ) from vllm.model_executor.layers.layernorm import GemmaRMSNorm diff --git a/vllm/model_executor/models/gemma3n.py b/vllm/model_executor/models/gemma3n.py index bdbb3c91e83e..770424ba0fdf 100644 --- a/vllm/model_executor/models/gemma3n.py +++ b/vllm/model_executor/models/gemma3n.py @@ -21,7 +21,6 @@ from torch import nn from transformers.models.gemma3n.configuration_gemma3n import Gemma3nTextConfig -from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_tensor_model_parallel_world_size @@ -32,6 +31,7 @@ GeluAndMul, GeluAndMulSparse, ) +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( ColumnParallelLinear, diff --git a/vllm/model_executor/models/glm4.py b/vllm/model_executor/models/glm4.py index 3e55df4ffea2..89447927d5cd 100644 --- a/vllm/model_executor/models/glm4.py +++ b/vllm/model_executor/models/glm4.py @@ -29,10 +29,10 @@ from torch import nn from transformers import Glm4Config -from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import QKVParallelLinear, RowParallelLinear from vllm.model_executor.layers.logits_processor import LogitsProcessor diff --git a/vllm/model_executor/models/glm4_1v.py b/vllm/model_executor/models/glm4_1v.py index a120b106ff46..a081641be70f 100644 --- a/vllm/model_executor/models/glm4_1v.py +++ b/vllm/model_executor/models/glm4_1v.py @@ -52,7 +52,7 @@ from vllm.distributed import get_tensor_model_parallel_world_size, parallel_state from vllm.distributed import utils as dist_utils from vllm.logger import init_logger -from vllm.model_executor.layers.attention.mm_encoder_attention import ( +from vllm.model_executor.layers.attention import ( MMEncoderAttention, ) from vllm.model_executor.layers.conv import Conv2dLayer, Conv3dLayer diff --git a/vllm/model_executor/models/glm4_moe.py b/vllm/model_executor/models/glm4_moe.py index 4c60cd460ca8..d0e6cb6ada8b 100644 --- a/vllm/model_executor/models/glm4_moe.py +++ b/vllm/model_executor/models/glm4_moe.py @@ -32,7 +32,6 @@ from torch import nn from transformers.models.glm4_moe import Glm4MoeConfig -from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig, get_current_vllm_config from vllm.distributed import ( @@ -42,6 +41,7 @@ ) from vllm.logger import init_logger from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.fused_moe import SharedFusedMoE from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( diff --git a/vllm/model_executor/models/glm4v.py b/vllm/model_executor/models/glm4v.py index 59ba0fccc4fe..1ff346d0e8dd 100644 --- a/vllm/model_executor/models/glm4v.py +++ b/vllm/model_executor/models/glm4v.py @@ -23,7 +23,7 @@ from vllm.config.multimodal import BaseDummyOptions from vllm.distributed import get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import SiluAndMul, get_act_fn -from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention +from vllm.model_executor.layers.attention import MMEncoderAttention from vllm.model_executor.layers.conv import Conv2dLayer from vllm.model_executor.layers.linear import ( ColumnParallelLinear, diff --git a/vllm/model_executor/models/glmasr.py b/vllm/model_executor/models/glmasr.py index a14aa47ed769..bc70e0ad2d86 100644 --- a/vllm/model_executor/models/glmasr.py +++ b/vllm/model_executor/models/glmasr.py @@ -16,7 +16,7 @@ from vllm.distributed.parallel_state import get_tensor_model_parallel_world_size from vllm.inputs.data import PromptType from vllm.model_executor.layers.activation import get_act_fn -from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention +from vllm.model_executor.layers.attention import MMEncoderAttention from vllm.model_executor.layers.linear import ( ColumnParallelLinear, QKVParallelLinear, diff --git a/vllm/model_executor/models/gpt2.py b/vllm/model_executor/models/gpt2.py index 4026d69cd077..41a4ca174257 100644 --- a/vllm/model_executor/models/gpt2.py +++ b/vllm/model_executor/models/gpt2.py @@ -27,7 +27,6 @@ from torch import nn from transformers import GPT2Config -from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed.parallel_state import ( @@ -35,6 +34,7 @@ get_tensor_model_parallel_world_size, ) from vllm.model_executor.layers.activation import get_act_fn +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.linear import ( ColumnParallelLinear, QKVParallelLinear, diff --git a/vllm/model_executor/models/gpt_bigcode.py b/vllm/model_executor/models/gpt_bigcode.py index 8af41d00404f..c6629c937dc6 100644 --- a/vllm/model_executor/models/gpt_bigcode.py +++ b/vllm/model_executor/models/gpt_bigcode.py @@ -28,11 +28,11 @@ from torch import nn from transformers import GPTBigCodeConfig -from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import get_act_fn +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.linear import ( ColumnParallelLinear, QKVParallelLinear, diff --git a/vllm/model_executor/models/gpt_j.py b/vllm/model_executor/models/gpt_j.py index 366f0fd902f0..c29103c6d52c 100644 --- a/vllm/model_executor/models/gpt_j.py +++ b/vllm/model_executor/models/gpt_j.py @@ -26,11 +26,11 @@ from torch import nn from transformers import GPTJConfig -from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import get_act_fn +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.linear import ( ColumnParallelLinear, QKVParallelLinear, diff --git a/vllm/model_executor/models/gpt_neox.py b/vllm/model_executor/models/gpt_neox.py index 764a801db521..8d44d12fc212 100644 --- a/vllm/model_executor/models/gpt_neox.py +++ b/vllm/model_executor/models/gpt_neox.py @@ -26,11 +26,11 @@ from torch import nn from transformers import GPTNeoXConfig -from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import get_act_fn +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.linear import ( ColumnParallelLinear, QKVParallelLinear, diff --git a/vllm/model_executor/models/gpt_oss.py b/vllm/model_executor/models/gpt_oss.py index acaf099edb65..b273880ce0cf 100644 --- a/vllm/model_executor/models/gpt_oss.py +++ b/vllm/model_executor/models/gpt_oss.py @@ -7,7 +7,6 @@ from torch import nn from transformers import GptOssConfig -from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import ( @@ -19,6 +18,7 @@ get_tensor_model_parallel_world_size, tensor_model_parallel_all_gather, ) +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.fused_moe.config import FusedMoEParallelConfig from vllm.model_executor.layers.layernorm import RMSNorm diff --git a/vllm/model_executor/models/granite.py b/vllm/model_executor/models/granite.py index 53c3230b29a8..4b486ede4439 100644 --- a/vllm/model_executor/models/granite.py +++ b/vllm/model_executor/models/granite.py @@ -31,11 +31,11 @@ from torch import nn from transformers import GraniteConfig -from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( MergedColumnParallelLinear, diff --git a/vllm/model_executor/models/granitemoe.py b/vllm/model_executor/models/granitemoe.py index 33446e7444b7..171b2e0ec5a0 100644 --- a/vllm/model_executor/models/granitemoe.py +++ b/vllm/model_executor/models/granitemoe.py @@ -31,7 +31,6 @@ import torch from torch import nn -from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import ( @@ -39,6 +38,7 @@ get_tensor_model_parallel_world_size, tensor_model_parallel_all_gather, ) +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( diff --git a/vllm/model_executor/models/granitemoehybrid.py b/vllm/model_executor/models/granitemoehybrid.py index b6a3c4872a3e..500ef1a1d041 100644 --- a/vllm/model_executor/models/granitemoehybrid.py +++ b/vllm/model_executor/models/granitemoehybrid.py @@ -9,11 +9,11 @@ from torch import nn from transformers import GraniteMoeHybridConfig -from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import get_tensor_model_parallel_world_size from vllm.distributed.parallel_state import get_pp_group +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import QKVParallelLinear, RowParallelLinear from vllm.model_executor.layers.logits_processor import LogitsProcessor diff --git a/vllm/model_executor/models/grok1.py b/vllm/model_executor/models/grok1.py index 49bdc02414b8..e2943b7978b4 100644 --- a/vllm/model_executor/models/grok1.py +++ b/vllm/model_executor/models/grok1.py @@ -32,12 +32,12 @@ import torch.nn.functional as F from torch import nn -from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.logger import init_logger from vllm.model_executor.layers.activation import GeluAndMul +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( diff --git a/vllm/model_executor/models/hunyuan_v1.py b/vllm/model_executor/models/hunyuan_v1.py index df507a234887..d9362e1dd185 100644 --- a/vllm/model_executor/models/hunyuan_v1.py +++ b/vllm/model_executor/models/hunyuan_v1.py @@ -33,7 +33,6 @@ from torch import nn from transformers import PretrainedConfig -from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig, get_current_vllm_config from vllm.distributed import ( @@ -43,6 +42,7 @@ tensor_model_parallel_all_reduce, ) from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.fused_moe import SharedFusedMoE from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( diff --git a/vllm/model_executor/models/hunyuan_vision.py b/vllm/model_executor/models/hunyuan_vision.py index 9214f47c7acb..5381b08b0b7f 100644 --- a/vllm/model_executor/models/hunyuan_vision.py +++ b/vllm/model_executor/models/hunyuan_vision.py @@ -39,7 +39,7 @@ from vllm.distributed import utils as dist_utils from vllm.logger import init_logger from vllm.model_executor.layers.activation import get_act_fn -from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention +from vllm.model_executor.layers.attention import MMEncoderAttention from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( ColumnParallelLinear, diff --git a/vllm/model_executor/models/idefics2_vision_model.py b/vllm/model_executor/models/idefics2_vision_model.py index c78ad64790e8..441aabd7ec1e 100644 --- a/vllm/model_executor/models/idefics2_vision_model.py +++ b/vllm/model_executor/models/idefics2_vision_model.py @@ -29,7 +29,7 @@ from vllm.distributed import get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import get_act_fn -from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention +from vllm.model_executor.layers.attention import MMEncoderAttention from vllm.model_executor.layers.conv import Conv2dLayer from vllm.model_executor.layers.linear import ( ColumnParallelLinear, diff --git a/vllm/model_executor/models/intern_vit.py b/vllm/model_executor/models/intern_vit.py index 3e705defc3ed..fa5efc808e39 100644 --- a/vllm/model_executor/models/intern_vit.py +++ b/vllm/model_executor/models/intern_vit.py @@ -23,7 +23,7 @@ tensor_model_parallel_all_gather, ) from vllm.model_executor.layers.activation import get_act_fn -from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention +from vllm.model_executor.layers.attention import MMEncoderAttention from vllm.model_executor.layers.conv import Conv2dLayer from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( diff --git a/vllm/model_executor/models/internlm2.py b/vllm/model_executor/models/internlm2.py index 60db4c4c6c30..c00b9a0ee671 100644 --- a/vllm/model_executor/models/internlm2.py +++ b/vllm/model_executor/models/internlm2.py @@ -10,7 +10,6 @@ from torch import nn from transformers import PretrainedConfig -from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import ( @@ -21,6 +20,7 @@ tensor_model_parallel_all_gather, ) from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( MergedColumnParallelLinear, diff --git a/vllm/model_executor/models/interns1_vit.py b/vllm/model_executor/models/interns1_vit.py index 2b2866d678a8..195bb96817f4 100644 --- a/vllm/model_executor/models/interns1_vit.py +++ b/vllm/model_executor/models/interns1_vit.py @@ -15,7 +15,7 @@ from transformers.utils import torch_int from vllm.model_executor.layers.activation import get_act_fn -from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention +from vllm.model_executor.layers.attention import MMEncoderAttention from vllm.model_executor.layers.conv import Conv2dLayer from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ColumnParallelLinear, RowParallelLinear diff --git a/vllm/model_executor/models/iquest_loopcoder.py b/vllm/model_executor/models/iquest_loopcoder.py index 9dd6a08d1e90..24c004ff4c20 100644 --- a/vllm/model_executor/models/iquest_loopcoder.py +++ b/vllm/model_executor/models/iquest_loopcoder.py @@ -24,10 +24,10 @@ from torch import nn from transformers import PretrainedConfig -from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_tensor_model_parallel_world_size +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( ColumnParallelLinear, diff --git a/vllm/model_executor/models/isaac.py b/vllm/model_executor/models/isaac.py index 704ade320d7c..c0e4a1932254 100644 --- a/vllm/model_executor/models/isaac.py +++ b/vllm/model_executor/models/isaac.py @@ -20,7 +20,7 @@ from vllm.config.model import ModelConfig from vllm.distributed import parallel_state from vllm.distributed import utils as dist_utils -from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention +from vllm.model_executor.layers.attention import MMEncoderAttention from vllm.model_executor.layers.linear import ( ColumnParallelLinear, QKVParallelLinear, diff --git a/vllm/model_executor/models/jais.py b/vllm/model_executor/models/jais.py index 2173b7e4aad6..5685acd75502 100644 --- a/vllm/model_executor/models/jais.py +++ b/vllm/model_executor/models/jais.py @@ -28,7 +28,6 @@ import torch from torch import nn -from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import ( @@ -36,6 +35,7 @@ get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, ) +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.linear import ( ColumnParallelLinear, QKVParallelLinear, diff --git a/vllm/model_executor/models/jais2.py b/vllm/model_executor/models/jais2.py index 265a57db592e..ea06ee1b1c7a 100644 --- a/vllm/model_executor/models/jais2.py +++ b/vllm/model_executor/models/jais2.py @@ -31,7 +31,6 @@ from torch import nn from transformers import Jais2Config -from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import ( @@ -39,6 +38,7 @@ get_tensor_model_parallel_world_size, ) from vllm.model_executor.layers.activation import ReLUSquaredActivation +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.linear import ( ColumnParallelLinear, QKVParallelLinear, diff --git a/vllm/model_executor/models/jamba.py b/vllm/model_executor/models/jamba.py index 27f14374cbd0..980bcffb5f9b 100644 --- a/vllm/model_executor/models/jamba.py +++ b/vllm/model_executor/models/jamba.py @@ -9,11 +9,11 @@ from torch import nn from transformers import JambaConfig -from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import get_tensor_model_parallel_world_size from vllm.distributed.parallel_state import get_pp_group +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( diff --git a/vllm/model_executor/models/keye.py b/vllm/model_executor/models/keye.py index b32f2762eee1..d37b43102aa6 100644 --- a/vllm/model_executor/models/keye.py +++ b/vllm/model_executor/models/keye.py @@ -20,7 +20,7 @@ from vllm.config.multimodal import BaseDummyOptions from vllm.distributed import get_tensor_model_parallel_world_size from vllm.logger import init_logger -from vllm.model_executor.layers.attention.mm_encoder_attention import ( +from vllm.model_executor.layers.attention import ( MMEncoderAttention, ) from vllm.model_executor.layers.conv import Conv2dLayer diff --git a/vllm/model_executor/models/lfm2.py b/vllm/model_executor/models/lfm2.py index ba5e80ac7bc1..fa611ad504b4 100644 --- a/vllm/model_executor/models/lfm2.py +++ b/vllm/model_executor/models/lfm2.py @@ -7,11 +7,11 @@ import torch.nn as nn from transformers import Lfm2Config -from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( MergedColumnParallelLinear, diff --git a/vllm/model_executor/models/lfm2_moe.py b/vllm/model_executor/models/lfm2_moe.py index 6d786276a0e1..293471bba3ec 100644 --- a/vllm/model_executor/models/lfm2_moe.py +++ b/vllm/model_executor/models/lfm2_moe.py @@ -6,7 +6,6 @@ import torch import torch.nn as nn -from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, ModelConfig, VllmConfig, get_current_vllm_config from vllm.distributed import ( @@ -15,6 +14,7 @@ get_tensor_model_parallel_world_size, ) from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( diff --git a/vllm/model_executor/models/lfm2_siglip2.py b/vllm/model_executor/models/lfm2_siglip2.py index 960f119a5120..d58e2ad85e6e 100644 --- a/vllm/model_executor/models/lfm2_siglip2.py +++ b/vllm/model_executor/models/lfm2_siglip2.py @@ -13,7 +13,7 @@ from vllm.compilation.decorators import support_torch_compile from vllm.distributed import get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import get_act_fn -from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention +from vllm.model_executor.layers.attention import MMEncoderAttention from vllm.model_executor.layers.linear import ( ColumnParallelLinear, QKVParallelLinear, diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 29cbea187d26..16d3cf88a60b 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -31,12 +31,12 @@ from torch import nn from transformers import LlamaConfig -from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.attention.encoder_only_attention import ( +from vllm.model_executor.layers.attention import ( + Attention, EncoderOnlyAttention, ) from vllm.model_executor.layers.layernorm import RMSNorm diff --git a/vllm/model_executor/models/llama4.py b/vllm/model_executor/models/llama4.py index 477775514499..0cdb4989ec73 100644 --- a/vllm/model_executor/models/llama4.py +++ b/vllm/model_executor/models/llama4.py @@ -24,7 +24,6 @@ from torch import nn from transformers import Llama4TextConfig -from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import ( @@ -33,7 +32,8 @@ tensor_model_parallel_all_gather, ) from vllm.logger import init_logger -from vllm.model_executor.layers.attention.chunked_local_attention import ( +from vllm.model_executor.layers.attention import ( + Attention, ChunkedLocalAttention, ) from vllm.model_executor.layers.fused_moe import SharedFusedMoE diff --git a/vllm/model_executor/models/mimo_v2_flash.py b/vllm/model_executor/models/mimo_v2_flash.py index 9d8c26e465fd..f7640746aab6 100644 --- a/vllm/model_executor/models/mimo_v2_flash.py +++ b/vllm/model_executor/models/mimo_v2_flash.py @@ -6,7 +6,6 @@ import torch from torch import nn -from vllm.attention.layer import Attention from vllm.config import ( CacheConfig, VllmConfig, @@ -22,6 +21,7 @@ ) from vllm.logger import init_logger from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( diff --git a/vllm/model_executor/models/minicpm.py b/vllm/model_executor/models/minicpm.py index 023d08691a38..4217d119a188 100644 --- a/vllm/model_executor/models/minicpm.py +++ b/vllm/model_executor/models/minicpm.py @@ -33,7 +33,6 @@ from torch import nn from transformers import PretrainedConfig -from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import ( @@ -43,6 +42,7 @@ tensor_model_parallel_all_reduce, ) from vllm.model_executor.layers.activation import FatreluAndMul, SiluAndMul +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.fused_moe import fused_experts, fused_topk from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( diff --git a/vllm/model_executor/models/minicpm3.py b/vllm/model_executor/models/minicpm3.py index c7a54cea2154..e61e9d06103d 100644 --- a/vllm/model_executor/models/minicpm3.py +++ b/vllm/model_executor/models/minicpm3.py @@ -29,9 +29,9 @@ from torch import nn from transformers import PretrainedConfig -from vllm.attention.layer import Attention from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_tensor_model_parallel_world_size +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( ColumnParallelLinear, diff --git a/vllm/model_executor/models/minimax_m2.py b/vllm/model_executor/models/minimax_m2.py index be5f0b921000..7583be200ffc 100644 --- a/vllm/model_executor/models/minimax_m2.py +++ b/vllm/model_executor/models/minimax_m2.py @@ -30,7 +30,6 @@ from torch import nn from transformers import PretrainedConfig -from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import ( @@ -38,6 +37,7 @@ get_tensor_model_parallel_world_size, tensor_model_parallel_all_reduce, ) +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( diff --git a/vllm/model_executor/models/minimax_text_01.py b/vllm/model_executor/models/minimax_text_01.py index b91321aedc79..a7785bcfc3df 100644 --- a/vllm/model_executor/models/minimax_text_01.py +++ b/vllm/model_executor/models/minimax_text_01.py @@ -14,7 +14,6 @@ from torch import nn from transformers import MiniMaxConfig -from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed.parallel_state import ( @@ -24,6 +23,7 @@ ) from vllm.forward_context import get_forward_context from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index 8d1197800fd1..376fd7a1709d 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -32,7 +32,6 @@ from torch import nn from transformers import MixtralConfig -from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig, get_current_vllm_config from vllm.distributed import ( @@ -40,6 +39,7 @@ get_pp_group, get_tensor_model_parallel_world_size, ) +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( diff --git a/vllm/model_executor/models/mllama4.py b/vllm/model_executor/models/mllama4.py index be34e436a544..9de43f1e1f5c 100644 --- a/vllm/model_executor/models/mllama4.py +++ b/vllm/model_executor/models/mllama4.py @@ -36,7 +36,7 @@ from vllm.config.multimodal import BaseDummyOptions from vllm.distributed import get_tensor_model_parallel_world_size from vllm.forward_context import set_forward_context -from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention +from vllm.model_executor.layers.attention import MMEncoderAttention from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.linear import ( ColumnParallelLinear, diff --git a/vllm/model_executor/models/modernbert.py b/vllm/model_executor/models/modernbert.py index a8119b046150..a29b1a9fbfbb 100644 --- a/vllm/model_executor/models/modernbert.py +++ b/vllm/model_executor/models/modernbert.py @@ -10,7 +10,7 @@ from vllm.compilation.decorators import support_torch_compile from vllm.config import ModelConfig, VllmConfig from vllm.distributed import get_tensor_model_parallel_world_size -from vllm.model_executor.layers.attention.encoder_only_attention import ( +from vllm.model_executor.layers.attention import ( EncoderOnlyAttention, ) from vllm.model_executor.layers.linear import QKVParallelLinear, RowParallelLinear diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py index 7ba1382e39b2..7ea06fd85ae4 100644 --- a/vllm/model_executor/models/molmo.py +++ b/vllm/model_executor/models/molmo.py @@ -17,7 +17,6 @@ from transformers.image_utils import ImageInput from transformers.tokenization_utils_base import TextInput -from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.config.multimodal import BaseDummyOptions @@ -29,7 +28,7 @@ tensor_model_parallel_all_gather, ) from vllm.model_executor.layers.activation import MulAndSilu, QuickGELU, SiluAndMul -from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention +from vllm.model_executor.layers.attention import Attention, MMEncoderAttention from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( ColumnParallelLinear, diff --git a/vllm/model_executor/models/molmo2.py b/vllm/model_executor/models/molmo2.py index b0a75c18ba62..6ded8e08c02c 100644 --- a/vllm/model_executor/models/molmo2.py +++ b/vllm/model_executor/models/molmo2.py @@ -23,7 +23,6 @@ from transformers.tokenization_utils_base import TextInput from transformers.video_utils import VideoInput, VideoMetadata -from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.config.multimodal import BaseDummyOptions, VideoDummyOptions @@ -36,7 +35,7 @@ ) from vllm.logger import init_logger from vllm.model_executor.layers.activation import MulAndSilu, SiluAndMul, get_act_fn -from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention +from vllm.model_executor.layers.attention import Attention, MMEncoderAttention from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( ColumnParallelLinear, diff --git a/vllm/model_executor/models/moonvit.py b/vllm/model_executor/models/moonvit.py index 823a8c0e7d8c..8c699865618b 100644 --- a/vllm/model_executor/models/moonvit.py +++ b/vllm/model_executor/models/moonvit.py @@ -53,7 +53,7 @@ from transformers.modeling_utils import PreTrainedModel from vllm.distributed import divide, get_tensor_model_parallel_world_size -from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention +from vllm.model_executor.layers.attention import MMEncoderAttention from vllm.model_executor.layers.conv import Conv2dLayer from vllm.model_executor.layers.linear import ( ColumnParallelLinear, diff --git a/vllm/model_executor/models/mpt.py b/vllm/model_executor/models/mpt.py index 5d039f7b4590..85933626cd30 100644 --- a/vllm/model_executor/models/mpt.py +++ b/vllm/model_executor/models/mpt.py @@ -10,7 +10,6 @@ import torch.nn as nn from transformers import MptConfig -from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import ( @@ -19,6 +18,7 @@ get_tensor_model_parallel_world_size, ) from vllm.model_executor.layers.activation import get_act_fn +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.linear import ( ColumnParallelLinear, QKVParallelLinear, diff --git a/vllm/model_executor/models/nemotron.py b/vllm/model_executor/models/nemotron.py index c416cbb15132..7689e9c60098 100644 --- a/vllm/model_executor/models/nemotron.py +++ b/vllm/model_executor/models/nemotron.py @@ -30,11 +30,11 @@ import torch from torch import nn -from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import get_act_fn +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.linear import ( ColumnParallelLinear, QKVParallelLinear, diff --git a/vllm/model_executor/models/nemotron_h.py b/vllm/model_executor/models/nemotron_h.py index 999949fa1b8f..e33bbe9fa162 100644 --- a/vllm/model_executor/models/nemotron_h.py +++ b/vllm/model_executor/models/nemotron_h.py @@ -25,7 +25,6 @@ import torch from torch import nn -from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.config.parallel import ParallelConfig @@ -33,6 +32,7 @@ from vllm.distributed.communication_op import tensor_model_parallel_all_gather from vllm.distributed.parallel_state import get_pp_group from vllm.model_executor.layers.activation import ReLUSquaredActivation +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.fused_moe import FusedMoE, SharedFusedMoE from vllm.model_executor.layers.fused_moe.utils import activation_without_mul from vllm.model_executor.layers.layernorm import RMSNorm diff --git a/vllm/model_executor/models/olmo.py b/vllm/model_executor/models/olmo.py index 3ed316dbea2f..4491a6a3ea1b 100644 --- a/vllm/model_executor/models/olmo.py +++ b/vllm/model_executor/models/olmo.py @@ -31,11 +31,11 @@ from torch import nn from transformers import OlmoConfig -from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.linear import ( MergedColumnParallelLinear, QKVParallelLinear, diff --git a/vllm/model_executor/models/olmo2.py b/vllm/model_executor/models/olmo2.py index 5019cd787963..1de5a12fd43e 100644 --- a/vllm/model_executor/models/olmo2.py +++ b/vllm/model_executor/models/olmo2.py @@ -32,7 +32,6 @@ from torch import nn from transformers import Olmo2Config -from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size @@ -40,6 +39,7 @@ from vllm.distributed.parallel_state import get_tensor_model_parallel_rank from vllm.distributed.utils import split_tensor_along_last_dim from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( MergedColumnParallelLinear, diff --git a/vllm/model_executor/models/olmoe.py b/vllm/model_executor/models/olmoe.py index d9695f8f2049..f0afe0e997cc 100644 --- a/vllm/model_executor/models/olmoe.py +++ b/vllm/model_executor/models/olmoe.py @@ -21,7 +21,6 @@ import torch from torch import nn -from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import VllmConfig from vllm.distributed import ( @@ -32,6 +31,7 @@ ) from vllm.distributed.utils import split_tensor_along_last_dim from vllm.logger import init_logger +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( diff --git a/vllm/model_executor/models/openpangu.py b/vllm/model_executor/models/openpangu.py index 982c4e1fc049..5eba82e2cf54 100644 --- a/vllm/model_executor/models/openpangu.py +++ b/vllm/model_executor/models/openpangu.py @@ -29,7 +29,6 @@ from torch import nn from transformers import PretrainedConfig -from vllm.attention.layer import Attention, AttentionType from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, ParallelConfig, VllmConfig from vllm.distributed import ( @@ -41,7 +40,8 @@ tensor_model_parallel_all_gather, ) from vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.attention.static_sink_attention import ( +from vllm.model_executor.layers.attention import ( + Attention, StaticSinkAttention, ) from vllm.model_executor.layers.fused_moe import SharedFusedMoE @@ -84,6 +84,7 @@ from vllm.platforms import current_platform from vllm.sequence import IntermediateTensors from vllm.transformers_utils.config import set_default_rope_theta +from vllm.v1.attention.backend import AttentionType from vllm.v1.attention.backends.flash_attn_diffkv import FlashAttentionDiffKVBackend diff --git a/vllm/model_executor/models/opt.py b/vllm/model_executor/models/opt.py index 4c64b5771040..81653b9516ac 100644 --- a/vllm/model_executor/models/opt.py +++ b/vllm/model_executor/models/opt.py @@ -27,11 +27,11 @@ from torch import nn from transformers import OPTConfig -from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import get_act_fn +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.linear import ( ColumnParallelLinear, QKVParallelLinear, diff --git a/vllm/model_executor/models/orion.py b/vllm/model_executor/models/orion.py index 7d5a36a97ed4..3cacb9d61cd5 100644 --- a/vllm/model_executor/models/orion.py +++ b/vllm/model_executor/models/orion.py @@ -15,11 +15,11 @@ from torch import nn from transformers import PretrainedConfig -from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.linear import ( MergedColumnParallelLinear, QKVParallelLinear, diff --git a/vllm/model_executor/models/ouro.py b/vllm/model_executor/models/ouro.py index a9476645a035..56505ec7be20 100644 --- a/vllm/model_executor/models/ouro.py +++ b/vllm/model_executor/models/ouro.py @@ -33,11 +33,11 @@ from torch import nn from transformers import PretrainedConfig -from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( MergedColumnParallelLinear, diff --git a/vllm/model_executor/models/paddleocr_vl.py b/vllm/model_executor/models/paddleocr_vl.py index d8e66dc104cf..1728e8ef6e71 100644 --- a/vllm/model_executor/models/paddleocr_vl.py +++ b/vllm/model_executor/models/paddleocr_vl.py @@ -34,7 +34,7 @@ from vllm.config.multimodal import BaseDummyOptions from vllm.distributed import parallel_state from vllm.distributed import utils as dist_utils -from vllm.model_executor.layers.attention.mm_encoder_attention import ( +from vllm.model_executor.layers.attention import ( MMEncoderAttention, ) from vllm.model_executor.layers.conv import Conv2dLayer diff --git a/vllm/model_executor/models/persimmon.py b/vllm/model_executor/models/persimmon.py index c7829476e5a6..a03a785577ee 100644 --- a/vllm/model_executor/models/persimmon.py +++ b/vllm/model_executor/models/persimmon.py @@ -30,11 +30,11 @@ from torch import nn from transformers import PersimmonConfig -from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import get_act_fn +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.linear import ( ColumnParallelLinear, QKVParallelLinear, diff --git a/vllm/model_executor/models/phi.py b/vllm/model_executor/models/phi.py index bf1e13614369..75c42c0d3930 100644 --- a/vllm/model_executor/models/phi.py +++ b/vllm/model_executor/models/phi.py @@ -45,11 +45,11 @@ from torch import nn from transformers import PhiConfig -from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import get_act_fn +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.linear import ( ColumnParallelLinear, QKVParallelLinear, diff --git a/vllm/model_executor/models/phimoe.py b/vllm/model_executor/models/phimoe.py index 19c7cecda4f7..0b55b7ec8392 100644 --- a/vllm/model_executor/models/phimoe.py +++ b/vllm/model_executor/models/phimoe.py @@ -31,10 +31,10 @@ from torch import nn from transformers.configuration_utils import PretrainedConfig -from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.linear import ( QKVParallelLinear, diff --git a/vllm/model_executor/models/plamo2.py b/vllm/model_executor/models/plamo2.py index 82833dddcd93..2bc89cc232da 100644 --- a/vllm/model_executor/models/plamo2.py +++ b/vllm/model_executor/models/plamo2.py @@ -9,7 +9,6 @@ from torch import nn from transformers import PretrainedConfig -from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import VllmConfig, get_current_vllm_config from vllm.distributed import divide, get_tensor_model_parallel_world_size @@ -17,6 +16,7 @@ from vllm.forward_context import ForwardContext, get_forward_context from vllm.model_executor.custom_op import CustomOp from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( ColumnParallelLinear, diff --git a/vllm/model_executor/models/plamo3.py b/vllm/model_executor/models/plamo3.py index df1300ac1243..4ba51898d30c 100644 --- a/vllm/model_executor/models/plamo3.py +++ b/vllm/model_executor/models/plamo3.py @@ -10,12 +10,12 @@ from torch import nn from transformers import PretrainedConfig -from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import VllmConfig from vllm.distributed import get_tensor_model_parallel_world_size from vllm.distributed.parallel_state import get_pp_group from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( MergedColumnParallelLinear, diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py index 906395260852..b4526beac637 100644 --- a/vllm/model_executor/models/qwen.py +++ b/vllm/model_executor/models/qwen.py @@ -16,11 +16,11 @@ from torch import nn from transformers import PretrainedConfig -from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( MergedColumnParallelLinear, diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index 99bddeec24eb..ccddc6e811a1 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -33,12 +33,12 @@ from torch import nn from transformers import Qwen2Config -from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.attention.encoder_only_attention import ( +from vllm.model_executor.layers.attention import ( + Attention, EncoderOnlyAttention, ) from vllm.model_executor.layers.layernorm import RMSNorm diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index c0fb1f13b6fe..0310c5415dc9 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -49,7 +49,7 @@ from vllm.forward_context import set_forward_context from vllm.logger import init_logger from vllm.model_executor.layers.activation import get_act_and_mul_fn -from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention +from vllm.model_executor.layers.attention import MMEncoderAttention from vllm.model_executor.layers.conv import Conv3dLayer from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( diff --git a/vllm/model_executor/models/qwen2_moe.py b/vllm/model_executor/models/qwen2_moe.py index 04330a8682e9..4b0c756165a5 100644 --- a/vllm/model_executor/models/qwen2_moe.py +++ b/vllm/model_executor/models/qwen2_moe.py @@ -34,12 +34,12 @@ from torch import nn from transformers import Qwen2MoeConfig -from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.logger import init_logger from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.fused_moe import SharedFusedMoE from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 847501c01658..61ff54abd96c 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -49,7 +49,7 @@ from vllm.distributed import utils as dist_utils from vllm.logger import init_logger from vllm.model_executor.layers.activation import QuickGELU -from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention +from vllm.model_executor.layers.attention import MMEncoderAttention from vllm.model_executor.layers.conv import Conv3dLayer from vllm.model_executor.layers.linear import ( ColumnParallelLinear, diff --git a/vllm/model_executor/models/qwen3.py b/vllm/model_executor/models/qwen3.py index 5757739c11a8..06df051446ae 100644 --- a/vllm/model_executor/models/qwen3.py +++ b/vllm/model_executor/models/qwen3.py @@ -30,11 +30,11 @@ from torch import nn from transformers import Qwen3Config -from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.logger import init_logger +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import QKVParallelLinear, RowParallelLinear from vllm.model_executor.layers.logits_processor import LogitsProcessor diff --git a/vllm/model_executor/models/qwen3_moe.py b/vllm/model_executor/models/qwen3_moe.py index 690d5368a380..2f95f4141c03 100644 --- a/vllm/model_executor/models/qwen3_moe.py +++ b/vllm/model_executor/models/qwen3_moe.py @@ -32,7 +32,6 @@ import torch.nn.functional as F from torch import nn -from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig, get_current_vllm_config from vllm.distributed import ( @@ -43,6 +42,7 @@ ) from vllm.logger import init_logger from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.fused_moe import SharedFusedMoE from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( diff --git a/vllm/model_executor/models/qwen3_next.py b/vllm/model_executor/models/qwen3_next.py index 3e89d1972ba4..503b40702070 100644 --- a/vllm/model_executor/models/qwen3_next.py +++ b/vllm/model_executor/models/qwen3_next.py @@ -10,7 +10,6 @@ from torch import nn from transformers.activations import ACT2FN -from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import ( CacheConfig, @@ -29,6 +28,7 @@ ) from vllm.forward_context import ForwardContext, get_forward_context from vllm.logger import init_logger +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.fla.ops import ( chunk_gated_delta_rule, fused_recurrent_gated_delta_rule, diff --git a/vllm/model_executor/models/seed_oss.py b/vllm/model_executor/models/seed_oss.py index e38cf927706c..d90174911fb6 100644 --- a/vllm/model_executor/models/seed_oss.py +++ b/vllm/model_executor/models/seed_oss.py @@ -30,12 +30,12 @@ from torch import nn from transformers import PretrainedConfig as SeedOssConfig -from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.logger import init_logger from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( MergedColumnParallelLinear, diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py index 9d4e76f1cb9a..4e63521bc051 100644 --- a/vllm/model_executor/models/siglip.py +++ b/vllm/model_executor/models/siglip.py @@ -19,10 +19,10 @@ from vllm.config.multimodal import BaseDummyOptions from vllm.distributed import divide, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import get_act_fn -from vllm.model_executor.layers.attention.encoder_only_attention import ( +from vllm.model_executor.layers.attention import ( EncoderOnlyAttention, + MMEncoderAttention, ) -from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention from vllm.model_executor.layers.conv import Conv2dLayer from vllm.model_executor.layers.linear import ( ColumnParallelLinear, diff --git a/vllm/model_executor/models/siglip2navit.py b/vllm/model_executor/models/siglip2navit.py index 0b81d1b00729..ccda1d9c9896 100644 --- a/vllm/model_executor/models/siglip2navit.py +++ b/vllm/model_executor/models/siglip2navit.py @@ -13,7 +13,7 @@ from vllm.distributed import divide, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import get_act_fn -from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention +from vllm.model_executor.layers.attention import MMEncoderAttention from vllm.model_executor.layers.conv import Conv2dLayer from vllm.model_executor.layers.linear import ( ColumnParallelLinear, diff --git a/vllm/model_executor/models/solar.py b/vllm/model_executor/models/solar.py index 589727c6f980..bff866d0d0c2 100644 --- a/vllm/model_executor/models/solar.py +++ b/vllm/model_executor/models/solar.py @@ -30,11 +30,11 @@ from torch import nn from transformers import PretrainedConfig -from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( MergedColumnParallelLinear, diff --git a/vllm/model_executor/models/stablelm.py b/vllm/model_executor/models/stablelm.py index 211b57ddb450..034c9c18ff7b 100644 --- a/vllm/model_executor/models/stablelm.py +++ b/vllm/model_executor/models/stablelm.py @@ -29,10 +29,10 @@ from torch import nn from transformers import StableLmConfig -from vllm.attention.layer import Attention from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.linear import ( MergedColumnParallelLinear, QKVParallelLinear, diff --git a/vllm/model_executor/models/starcoder2.py b/vllm/model_executor/models/starcoder2.py index dd1e8e98facd..5f08a59e2364 100644 --- a/vllm/model_executor/models/starcoder2.py +++ b/vllm/model_executor/models/starcoder2.py @@ -28,11 +28,11 @@ from torch import nn from transformers import Starcoder2Config -from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import get_act_fn +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.linear import ( ColumnParallelLinear, QKVParallelLinear, diff --git a/vllm/model_executor/models/step1.py b/vllm/model_executor/models/step1.py index 8e655c691e0a..4173b9ebf31d 100644 --- a/vllm/model_executor/models/step1.py +++ b/vllm/model_executor/models/step1.py @@ -10,7 +10,6 @@ import torch from torch import nn -from vllm.attention.layer import Attention, AttentionType from vllm.config import CacheConfig, VllmConfig from vllm.distributed import ( get_pp_group, @@ -18,6 +17,7 @@ get_tensor_model_parallel_world_size, ) from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( MergedColumnParallelLinear, @@ -41,6 +41,7 @@ maybe_prefix, ) from vllm.sequence import IntermediateTensors +from vllm.v1.attention.backend import AttentionType STEP_PACKED_MODULES_MAPPING = { "qkv_proj": ["q_proj", "k_proj", "v_proj"], diff --git a/vllm/model_executor/models/step3_text.py b/vllm/model_executor/models/step3_text.py index 4855dffec35e..18b689166a5f 100644 --- a/vllm/model_executor/models/step3_text.py +++ b/vllm/model_executor/models/step3_text.py @@ -9,7 +9,6 @@ import torch from torch import nn -from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import ( @@ -19,6 +18,7 @@ ) from vllm.logger import init_logger from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( diff --git a/vllm/model_executor/models/step3_vl.py b/vllm/model_executor/models/step3_vl.py index 8f41e8c5b654..8b795ecea2c1 100644 --- a/vllm/model_executor/models/step3_vl.py +++ b/vllm/model_executor/models/step3_vl.py @@ -19,7 +19,7 @@ from vllm.config.multimodal import BaseDummyOptions from vllm.distributed import get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import get_act_fn -from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention +from vllm.model_executor.layers.attention import MMEncoderAttention from vllm.model_executor.layers.conv import Conv2dLayer from vllm.model_executor.layers.linear import ( ColumnParallelLinear, diff --git a/vllm/model_executor/models/transformers/base.py b/vllm/model_executor/models/transformers/base.py index d3f248f8bfcb..0bd7e4729597 100644 --- a/vllm/model_executor/models/transformers/base.py +++ b/vllm/model_executor/models/transformers/base.py @@ -27,12 +27,12 @@ from transformers import AutoModel from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS -from vllm.attention.layer import Attention from vllm.config.utils import getattr_iter from vllm.distributed import get_pp_group, get_tp_group from vllm.distributed.utils import get_pp_indices from vllm.logger import init_logger -from vllm.model_executor.layers.attention.encoder_only_attention import ( +from vllm.model_executor.layers.attention import ( + Attention, EncoderOnlyAttention, ) from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding diff --git a/vllm/model_executor/models/whisper.py b/vllm/model_executor/models/whisper.py index ec3e5818ef7d..b254a53081d5 100644 --- a/vllm/model_executor/models/whisper.py +++ b/vllm/model_executor/models/whisper.py @@ -17,7 +17,6 @@ ) from transformers.models.whisper.modeling_whisper import sinusoids -from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, ModelConfig, SpeechToTextConfig, VllmConfig from vllm.config.multimodal import BaseDummyOptions @@ -25,8 +24,11 @@ from vllm.inputs.data import PromptType from vllm.logger import init_logger from vllm.model_executor.layers.activation import get_act_fn -from vllm.model_executor.layers.attention.cross_attention import CrossAttention -from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention +from vllm.model_executor.layers.attention import ( + Attention, + CrossAttention, + MMEncoderAttention, +) from vllm.model_executor.layers.linear import ( ColumnParallelLinear, QKVParallelLinear, diff --git a/vllm/model_executor/models/whisper_causal.py b/vllm/model_executor/models/whisper_causal.py index 8438b460fc00..4f2d4c07c661 100644 --- a/vllm/model_executor/models/whisper_causal.py +++ b/vllm/model_executor/models/whisper_causal.py @@ -10,9 +10,9 @@ import torch.nn.functional as F from torch import nn -from vllm.attention.layer import Attention from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_tensor_model_parallel_world_size +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( QKVParallelLinear, diff --git a/vllm/model_executor/models/zamba2.py b/vllm/model_executor/models/zamba2.py index dafad457a3cf..b4d844ba6d76 100644 --- a/vllm/model_executor/models/zamba2.py +++ b/vllm/model_executor/models/zamba2.py @@ -16,11 +16,11 @@ from torch import nn from transformers import Zamba2Config -from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import GeluAndMul +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( ColumnParallelLinear, diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py index e88ee4de4333..8097dca3110a 100755 --- a/vllm/v1/attention/backends/flash_attn.py +++ b/vllm/v1/attention/backends/flash_attn.py @@ -9,7 +9,7 @@ import numpy as np import torch -from vllm.attention.layer import Attention +from vllm.model_executor.layers.attention import Attention from vllm.v1.attention.backend import ( AttentionBackend, AttentionImpl, diff --git a/vllm/v1/attention/backends/rocm_aiter_fa.py b/vllm/v1/attention/backends/rocm_aiter_fa.py index 3febbe57a66f..e122882c7bc3 100644 --- a/vllm/v1/attention/backends/rocm_aiter_fa.py +++ b/vllm/v1/attention/backends/rocm_aiter_fa.py @@ -8,9 +8,9 @@ import torch from vllm._aiter_ops import rocm_aiter_ops -from vllm.attention.layer import Attention from vllm.config import VllmConfig, get_layers_from_vllm_config from vllm.logger import init_logger +from vllm.model_executor.layers.attention import Attention from vllm.platforms import current_platform from vllm.utils.math_utils import cdiv from vllm.utils.platform_utils import get_cu_count diff --git a/vllm/v1/spec_decode/draft_model.py b/vllm/v1/spec_decode/draft_model.py index 5a54074dd756..7d631aa89e04 100644 --- a/vllm/v1/spec_decode/draft_model.py +++ b/vllm/v1/spec_decode/draft_model.py @@ -4,10 +4,10 @@ import torch -from vllm.attention.layer import Attention from vllm.config import VllmConfig, get_layers_from_vllm_config from vllm.config.speculative import SpeculativeConfig from vllm.logger import init_logger +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.model_loader import get_model from vllm.triton_utils import tl, triton from vllm.v1.attention.backends.utils import ( diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 64f6263cc038..adf64f749048 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -21,7 +21,6 @@ from tqdm import tqdm import vllm.envs as envs -from vllm.attention.layer import Attention, MLAAttention from vllm.compilation.counter import compilation_counter from vllm.compilation.cuda_graph import CUDAGraphStat, CUDAGraphWrapper from vllm.compilation.monitor import set_cudagraph_capturing_enabled @@ -50,6 +49,7 @@ ) from vllm.logger import init_logger from vllm.lora.layers import LoRAMapping, LoRAMappingType +from vllm.model_executor.layers.attention import Attention, MLAAttention from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase from vllm.model_executor.layers.fused_moe.routed_experts_capturer import ( RoutedExpertsCapturer, diff --git a/vllm/v1/worker/utils.py b/vllm/v1/worker/utils.py index ccfbc3c6b04d..8af17e270c4f 100644 --- a/vllm/v1/worker/utils.py +++ b/vllm/v1/worker/utils.py @@ -7,9 +7,9 @@ import torch from typing_extensions import deprecated -from vllm.attention.layer import Attention from vllm.config import CacheConfig, VllmConfig from vllm.logger import init_logger +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.models.interfaces import MultiModalEmbeddings from vllm.model_executor.models.utils import extract_layer_index from vllm.multimodal.registry import MultiModalRegistry