From 3f2691ad1452a3236198b8bbbd46d9e281ff6b96 Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Fri, 9 Jan 2026 18:01:06 -0500 Subject: [PATCH 01/18] Move kv_transfer_utils Signed-off-by: Matthew Bonanni --- .../layers/attention}/kv_transfer_utils.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename vllm/{attention/utils => model_executor/layers/attention}/kv_transfer_utils.py (100%) diff --git a/vllm/attention/utils/kv_transfer_utils.py b/vllm/model_executor/layers/attention/kv_transfer_utils.py similarity index 100% rename from vllm/attention/utils/kv_transfer_utils.py rename to vllm/model_executor/layers/attention/kv_transfer_utils.py From 1cb4ce33882571e10c5d54a2bbcede22847ea8ef Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Fri, 9 Jan 2026 18:11:50 -0500 Subject: [PATCH 02/18] Start splitting up layer.py Signed-off-by: Matthew Bonanni --- vllm/attention/layer.py | 733 ------------------ .../layers/attention/attention.py | 432 +++++++++++ .../layers/attention/mla_attention.py | 345 +++++++++ 3 files changed, 777 insertions(+), 733 deletions(-) create mode 100644 vllm/model_executor/layers/attention/attention.py create mode 100644 vllm/model_executor/layers/attention/mla_attention.py diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index 411d11e5a23f..b12c8c67fea8 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -2,46 +2,19 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Attention layer.""" -from typing import cast - import torch import torch.nn as nn -import vllm.envs as envs -from vllm.attention.utils.kv_sharing_utils import validate_kv_sharing_target -from vllm.attention.utils.kv_transfer_utils import maybe_transfer_kv_layer -from vllm.config import CacheConfig, get_current_vllm_config -from vllm.config.vllm import VllmConfig from vllm.forward_context import ForwardContext, get_forward_context from vllm.logger import init_logger -from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase -from vllm.model_executor.layers.batch_invariant import vllm_is_batch_invariant from vllm.model_executor.layers.linear import ( - ColumnParallelLinear, UnquantizedLinearMethod, ) from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.quantization.base_config import QuantizeMethodBase -from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8 from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod -from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape -from vllm.platforms import current_platform from vllm.utils.torch_utils import ( direct_register_custom_op, - kv_cache_dtype_str_to_dtype, -) -from vllm.v1.attention.backend import ( - AttentionBackend, - AttentionType, - MLAAttentionImpl, -) -from vllm.v1.attention.backends.registry import AttentionBackendEnum -from vllm.v1.attention.selector import get_attn_backend -from vllm.v1.kv_cache_interface import ( - FullAttentionSpec, - KVCacheSpec, - MLAAttentionSpec, - SlidingWindowSpec, ) logger = init_logger(__name__) @@ -143,562 +116,6 @@ def _init_kv_cache_quant( layer.quant_method.create_weights(layer) -class Attention(nn.Module, AttentionLayerBase): - """Attention layer. - - This class takes query, key, and value tensors as input. The input tensors - can either contain prompt tokens or generation tokens. - The class does the following: - - 1. Store the input key and value tensors in the KV cache. - 2. 
Perform (multi-head/multi-query/grouped-query) attention. - 3. Return the output tensor. - """ - - def __init__( - self, - num_heads: int, - head_size: int, - scale: float, - num_kv_heads: int | None = None, - alibi_slopes: list[float] | None = None, - cache_config: CacheConfig | None = None, - quant_config: QuantizationConfig | None = None, - logits_soft_cap: float | None = None, - per_layer_sliding_window: int | None = None, - prefix: str = "", - attn_type: str = AttentionType.DECODER, - kv_sharing_target_layer_name: str | None = None, - attn_backend: type[AttentionBackend] | None = None, - head_size_v: int | None = None, - **extra_impl_args, - ) -> None: - """ - The KV cache is stored inside this class and is accessed via - `self.kv_cache`. - """ - super().__init__() - if per_layer_sliding_window is not None: - # per-layer sliding window - sliding_window = per_layer_sliding_window - elif cache_config is not None: - # model-level sliding window - sliding_window = cache_config.sliding_window - else: - sliding_window = None - - vllm_config = get_current_vllm_config() - if cache_config is not None: - kv_cache_dtype = cache_config.cache_dtype - block_size = cache_config.block_size - calculate_kv_scales = cache_config.calculate_kv_scales - else: - kv_cache_dtype = "auto" - block_size = 16 - calculate_kv_scales = False - self.kv_cache_torch_dtype = kv_cache_dtype_str_to_dtype( - kv_cache_dtype, vllm_config.model_config - ) - if num_kv_heads is None: - num_kv_heads = num_heads - assert num_heads % num_kv_heads == 0, ( - f"num_heads ({num_heads}) is not divisible by num_kv_heads ({num_kv_heads})" - ) - self.quant_config = quant_config - self.layer_name = prefix - - # Initialize KV cache quantization attributes - _init_kv_cache_quant( - self, - self.quant_config, - self.layer_name, - kv_cache_dtype, - calculate_kv_scales, - ) - - self.num_heads = num_heads - self.head_size = head_size - self.head_size_v = self.head_size if head_size_v is None else head_size_v - self.num_kv_heads = num_kv_heads - self.sliding_window = sliding_window - self.has_sink = extra_impl_args.get("sinks") is not None - - # NOTE: model_config may be None during certain tests - model_config = vllm_config.model_config - self.use_mm_prefix = model_config is not None and model_config.is_mm_prefix_lm - - # During model initialization, the default dtype is set as the model - # weight and activation dtype. - dtype = torch.get_default_dtype() - if attn_backend is None: - self.attn_backend = get_attn_backend( - head_size, - dtype, - kv_cache_dtype, - block_size, - use_mla=False, - has_sink=self.has_sink, - use_mm_prefix=self.use_mm_prefix, - attn_type=attn_type, - ) - else: - self.attn_backend = attn_backend - - # prefix caching + batch invariance is currently not supported for - # FLASHINFER and TRITON_MLA. 
- if ( - cache_config is not None - and cache_config.enable_prefix_caching - and vllm_is_batch_invariant() - and ( - self.attn_backend.get_name() == "FLASHINFER" - or self.attn_backend.get_name() == "TRITON_MLA" - ) - ): - logger.warning_once( - "Disabling prefix caching for FLASHINFER/TRITON_MLA " - "with batch invariance, as it is not yet supported.", - scope="local", - ) - cache_config.enable_prefix_caching = False - - impl_cls = self.attn_backend.get_impl_cls() - self.impl = impl_cls( - num_heads, - head_size, - scale, - num_kv_heads, - alibi_slopes, - sliding_window, - kv_cache_dtype, - logits_soft_cap, - attn_type, - kv_sharing_target_layer_name, - **extra_impl_args, - ) - self.backend = AttentionBackendEnum[self.attn_backend.get_name()] - self.dtype = dtype - - # For cuda-alike (CUDA and ROCM) and cpu platforms, we control how - # torch.compile works by registering the attention as one giant - # opaque custom op. For other platforms, we directly call them - # and let torch.compile handle them. - self.use_direct_call = not current_platform.opaque_attention_op() - - self.use_output = self.attn_backend.accept_output_buffer - compilation_config = vllm_config.compilation_config - if prefix in compilation_config.static_forward_context: - raise ValueError(f"Duplicate layer name: {prefix}") - compilation_config.static_forward_context[prefix] = self - self.attn_type = attn_type - - if kv_sharing_target_layer_name is not None: - validate_kv_sharing_target( - prefix, - kv_sharing_target_layer_name, - compilation_config.static_forward_context, - ) - self.kv_sharing_target_layer_name = kv_sharing_target_layer_name - - # use a placeholder kv cache tensor during init, which will be replaced - # by bind_kv_cache - # this variable will not be accessed if use_direct_call is True - self.kv_cache = [ - torch.tensor([]) - for _ in range(vllm_config.parallel_config.pipeline_parallel_size) - ] - - # Initialize q/k/v range constants. - self.q_range = torch.tensor(envs.Q_SCALE_CONSTANT, dtype=torch.float32) - self.k_range = torch.tensor(envs.K_SCALE_CONSTANT, dtype=torch.float32) - self.v_range = torch.tensor(envs.V_SCALE_CONSTANT, dtype=torch.float32) - - # for attn backends supporting query quantization - self.query_quant = None - if ( - self.kv_cache_dtype.startswith("fp8") - and self.impl.supports_quant_query_input - ): - self.query_quant = QuantFP8(static=True, group_shape=GroupShape.PER_TENSOR) - - def forward( - self, - query: torch.Tensor, - key: torch.Tensor, - value: torch.Tensor, - # For some alternate attention backends like MLA the attention output - # shape does not match the query shape, so we optionally let the model - # definition specify the output tensor shape. - output_shape: torch.Size | None = None, - ) -> torch.Tensor: - """ - The KV cache is stored inside this class and is accessed via - `self.kv_cache`. - - Attention metadata (`attn_metadata`) is set using a context manager in - the model runner's `execute_model` method. It is accessed via forward - context using - `vllm.forward_context.get_forward_context().attn_metadata`. - """ - if self.calculate_kv_scales: - torch.ops.vllm.maybe_calc_kv_scales(query, key, value, self.layer_name) - output_dtype = query.dtype - if self.query_quant is not None: - # quantizing with a simple torch operation enables - # torch.compile to fuse this into previous ops - # which reduces overheads during decoding. 
- # Otherwise queries are quantized using custom ops - # which causes decoding overheads - assert self.kv_cache_dtype in {"fp8", "fp8_e4m3"} - - # check if query quantization is supported - if self.impl.supports_quant_query_input: - query, _ = self.query_quant(query, self._q_scale) - - if self.use_output: - if output_shape is None: - # Handle both 2D [num_tokens, hidden] and - # 3D [num_tokens, heads, head_dim] query - num_tokens = query.shape[0] - output_shape = torch.Size( - (num_tokens, self.num_heads * self.head_size_v) - ) - output_shape = output_shape if output_shape is not None else query.shape - output = torch.empty(output_shape, dtype=output_dtype, device=query.device) - hidden_size = output_shape[-1] - # Reshape the query, key, and value tensors. - # NOTE(woosuk): We do this outside the custom op to minimize the - # CPU overheads from the non-CUDA-graph regions. - query = query.view(-1, self.num_heads, self.head_size) - output = output.view(-1, self.num_heads, self.head_size_v) - if key is not None: - key = key.view(-1, self.num_kv_heads, self.head_size) - if value is not None: - value = value.view(-1, self.num_kv_heads, self.head_size_v) - if self.use_direct_call: - forward_context: ForwardContext = get_forward_context() - attn_metadata = forward_context.attn_metadata - if isinstance(attn_metadata, dict): - attn_metadata = attn_metadata[self.layer_name] - self_kv_cache = self.kv_cache[forward_context.virtual_engine] - self.impl.forward( - self, query, key, value, self_kv_cache, attn_metadata, output=output - ) - else: - torch.ops.vllm.unified_attention_with_output( - query, key, value, output, self.layer_name - ) - return output.view(-1, hidden_size) - else: - if self.use_direct_call: - forward_context = get_forward_context() - attn_metadata = forward_context.attn_metadata - if isinstance(attn_metadata, dict): - attn_metadata = attn_metadata[self.layer_name] - self_kv_cache = self.kv_cache[forward_context.virtual_engine] - return self.impl.forward( - self, query, key, value, self_kv_cache, attn_metadata - ) - else: - return torch.ops.vllm.unified_attention( - query, key, value, self.layer_name - ) - - def calc_kv_scales(self, query, key, value): - self._q_scale.copy_(torch.abs(query).max() / self.q_range) - self._k_scale.copy_(torch.abs(key).max() / self.k_range) - self._v_scale.copy_(torch.abs(value).max() / self.v_range) - self._q_scale_float = self._q_scale.item() - self._k_scale_float = self._k_scale.item() - self._v_scale_float = self._v_scale.item() - # We only calculate the scales once - self.calculate_kv_scales = False - - def extra_repr(self) -> str: - s = f"head_size={self.impl.head_size}" # type: ignore - s += f", num_heads={self.impl.num_heads}" # type: ignore - s += f", num_kv_heads={self.impl.num_kv_heads}" # type: ignore - s += f", scale={self.impl.scale}" # type: ignore - s += f", backend={self.impl.__class__.__name__}" - return s - - def process_weights_after_loading(self, act_dtype: torch.dtype): - self.impl.process_weights_after_loading(act_dtype) - - # If we should not load quant weights, we initialize the scales to 1.0 - # as the default value. See [Note: Register q/k/v/prob scales in state dict] - # for more details. 
- quant_method = ( - self.quant_config.get_quant_method(self, prefix=self.layer_name) - if self.quant_config - else None - ) - if not should_load_quant_weights(quant_method): - set_default_quant_scales(self, register_buffer=False) - - def get_attn_backend(self) -> type[AttentionBackend]: - return self.attn_backend - - def get_kv_cache_spec(self, vllm_config: VllmConfig) -> KVCacheSpec: - # Block size may get updated after model loading, refresh it - block_size = vllm_config.cache_config.block_size - # Should not be called for enc-dec or encoder-only attention. - assert self.attn_type == AttentionType.DECODER - if self.sliding_window is not None: - assert not vllm_config.model_config.use_mla, ( - "MLA is not supported for slidingwindow" - ) - return SlidingWindowSpec( - block_size=block_size, - num_kv_heads=self.num_kv_heads, - head_size=self.head_size, - dtype=self.kv_cache_torch_dtype, - sliding_window=self.sliding_window, - ) - else: - return FullAttentionSpec( - block_size=block_size, - num_kv_heads=self.num_kv_heads, - head_size=self.head_size, - head_size_v=self.head_size_v, - dtype=self.kv_cache_torch_dtype, - ) - - -class MLAAttention(nn.Module, AttentionLayerBase): - """Multi-Head Latent Attention layer. - - This class takes query, and compressed key/value tensors as input. - The class does the following: - - 1. Store the input key and value tensors in the KV cache. - 2. Perform (multi-head/multi-query/grouped-query) attention. - 3. Return the output tensor. - """ - - def __init__( - self, - num_heads: int, - scale: float, - qk_nope_head_dim: int, - qk_rope_head_dim: int, - v_head_dim: int, - q_lora_rank: int | None, - kv_lora_rank: int, - kv_b_proj: ColumnParallelLinear, - cache_config: CacheConfig | None = None, - quant_config: QuantizationConfig | None = None, - prefix: str = "", - use_sparse: bool = False, - indexer: object | None = None, - **extra_impl_args, - ): - super().__init__() - self.num_heads = num_heads - self.scale = scale - self.qk_nope_head_dim = qk_nope_head_dim - self.qk_rope_head_dim = qk_rope_head_dim - self.v_head_dim = v_head_dim - self.q_lora_rank = q_lora_rank - self.kv_lora_rank = kv_lora_rank - self.head_size = kv_lora_rank + qk_rope_head_dim - self.layer_name = prefix - - if cache_config is not None: - kv_cache_dtype = cache_config.cache_dtype - block_size = cache_config.block_size - calculate_kv_scales = cache_config.calculate_kv_scales - else: - kv_cache_dtype = "auto" - block_size = 16 - calculate_kv_scales = False - self.quant_config = quant_config - - # Initialize KV cache quantization attributes - _init_kv_cache_quant( - self, - self.quant_config, - self.layer_name, - kv_cache_dtype, - calculate_kv_scales, - ) - - dtype = torch.get_default_dtype() - self.attn_backend = get_attn_backend( - self.head_size, - dtype, - kv_cache_dtype, - block_size, - use_mla=True, - use_sparse=use_sparse, - ) - - if ( - cache_config is not None - and cache_config.enable_prefix_caching - and vllm_is_batch_invariant() - and ( - self.attn_backend.get_name() == "TRITON_MLA" - or self.attn_backend.get_name() == "FLASHINFER" - ) - ): - logger.warning_once( - "Disabling prefix caching for TRITON_MLA / FLASHINFER " - "with batch invariance, as it is not yet supported.", - scope="local", - ) - cache_config.enable_prefix_caching = False - - impl_cls = cast(type[MLAAttentionImpl], self.attn_backend.get_impl_cls()) - self.impl = impl_cls( - num_heads=self.num_heads, - head_size=self.head_size, - scale=self.scale, - num_kv_heads=1, - alibi_slopes=None, - sliding_window=None, - 
kv_cache_dtype=self.kv_cache_dtype, - logits_soft_cap=None, - attn_type=AttentionType.DECODER, - kv_sharing_target_layer_name=None, - # MLA Args - q_lora_rank=self.q_lora_rank, - kv_lora_rank=self.kv_lora_rank, - qk_nope_head_dim=self.qk_nope_head_dim, - qk_rope_head_dim=self.qk_rope_head_dim, - qk_head_dim=self.qk_nope_head_dim + self.qk_rope_head_dim, - v_head_dim=self.v_head_dim, - kv_b_proj=kv_b_proj, - indexer=indexer, - **extra_impl_args, - ) - - self.use_direct_call = not current_platform.opaque_attention_op() - - compilation_config = get_current_vllm_config().compilation_config - if prefix in compilation_config.static_forward_context: - raise ValueError(f"Duplicate layer name: {prefix}") - compilation_config.static_forward_context[prefix] = self - - self.kv_cache = [ - torch.tensor([]) - for _ in range( - get_current_vllm_config().parallel_config.pipeline_parallel_size - ) - ] - - self.use_sparse = use_sparse - - # Initialize q/k/v range constants. - self.q_range = torch.tensor(envs.Q_SCALE_CONSTANT, dtype=torch.float32) - self.k_range = torch.tensor(envs.K_SCALE_CONSTANT, dtype=torch.float32) - self.v_range = torch.tensor(envs.V_SCALE_CONSTANT, dtype=torch.float32) - - def forward( - self, - q: torch.Tensor, - kv_c_normed: torch.Tensor, - k_pe: torch.Tensor, - output_shape: torch.Size | None = None, - ) -> torch.Tensor: - if self.calculate_kv_scales: - torch.ops.vllm.maybe_calc_kv_scales(q, kv_c_normed, k_pe, self.layer_name) - - if self.use_direct_call: - forward_context: ForwardContext = get_forward_context() - attn_metadata = forward_context.attn_metadata - if isinstance(attn_metadata, dict): - attn_metadata = attn_metadata[self.layer_name] - self_kv_cache = self.kv_cache[forward_context.virtual_engine] - - if self.attn_backend.accept_output_buffer: - output = torch.empty(output_shape, dtype=q.dtype, device=q.device) - self.impl.forward( - self, - q, - kv_c_normed, - k_pe, - self_kv_cache, - attn_metadata, - output=output, - ) - return output - else: - return self.impl.forward( - self, q, kv_c_normed, k_pe, self_kv_cache, attn_metadata - ) - else: - if self.attn_backend.accept_output_buffer: - output = torch.empty(output_shape, dtype=q.dtype, device=q.device) - torch.ops.vllm.unified_mla_attention_with_output( - q, - kv_c_normed, - k_pe, - output, - self.layer_name, - ) - return output - else: - return torch.ops.vllm.unified_mla_attention( - q, - kv_c_normed, - k_pe, - self.layer_name, - ) - - def process_weights_after_loading(self, act_dtype: torch.dtype): - if hasattr(self.impl, "process_weights_after_loading"): - self.impl.process_weights_after_loading(act_dtype) - - # If we should not load quant weights, we initialize the scales to 1.0 - # as the default value. See [Note: Register q/k/v/prob scales in state dict] - # for more details. - quant_method = ( - self.quant_config.get_quant_method(self, prefix=self.layer_name) - if self.quant_config - else None - ) - if not should_load_quant_weights(quant_method): - set_default_quant_scales(self, register_buffer=False) - - def calc_kv_scales( - self, q: torch.Tensor, kv_c_normed: torch.Tensor, k_pe: torch.Tensor - ) -> None: - """Optional scale calculation for MLA inputs. - - Mirrors Attention.calc_kv_scales. 
Not all MLA backends require this - """ - # Use safe defaults if ranges are not present - q_range = getattr(self, "q_range", torch.tensor(1.0)) - k_range = getattr(self, "k_range", torch.tensor(1.0)) - v_range = getattr(self, "v_range", torch.tensor(1.0)) - - self._q_scale.copy_(torch.abs(q).max() / q_range) - # kv_c_normed is the compressed KV representation; use it for k/v - kv_abs_max = torch.abs(kv_c_normed).max() - self._k_scale.copy_(kv_abs_max / k_range) - self._v_scale.copy_(kv_abs_max / v_range) - self._q_scale_float = self._q_scale.item() - self._k_scale_float = self._k_scale.item() - self._v_scale_float = self._v_scale.item() - self.calculate_kv_scales = False - - def get_attn_backend(self) -> type[AttentionBackend]: - return self.attn_backend - - def get_kv_cache_spec(self, vllm_config: VllmConfig) -> KVCacheSpec: - kv_cache_dtype = kv_cache_dtype_str_to_dtype( - self.kv_cache_dtype, vllm_config.model_config - ) - return MLAAttentionSpec( - block_size=vllm_config.cache_config.block_size, - num_kv_heads=1, - head_size=self.head_size, - dtype=kv_cache_dtype, - cache_dtype_str=vllm_config.cache_config.cache_dtype, - ) - - def maybe_calc_kv_scales( query: torch.Tensor, key: torch.Tensor, @@ -761,153 +178,3 @@ def get_attention_context( attn_layer: Attention | MLAAttention = forward_context.no_compile_layers[layer_name] kv_cache = attn_layer.kv_cache[forward_context.virtual_engine] return attn_metadata, attn_layer, kv_cache - - -@maybe_transfer_kv_layer -def unified_attention( - query: torch.Tensor, - key: torch.Tensor, - value: torch.Tensor, - layer_name: str, -) -> torch.Tensor: - attn_metadata, self, kv_cache = get_attention_context(layer_name) - output = self.impl.forward(self, query, key, value, kv_cache, attn_metadata) - - return output - - -def unified_attention_fake( - query: torch.Tensor, - key: torch.Tensor, - value: torch.Tensor, - layer_name: str, -) -> torch.Tensor: - return torch.empty_like(query).contiguous() - - -direct_register_custom_op( - op_name="unified_attention", - op_func=unified_attention, - fake_impl=unified_attention_fake, -) - - -@maybe_transfer_kv_layer -def unified_attention_with_output( - query: torch.Tensor, - key: torch.Tensor, - value: torch.Tensor, - output: torch.Tensor, - layer_name: str, - output_scale: torch.Tensor | None = None, - output_block_scale: torch.Tensor | None = None, -) -> None: - attn_metadata, self, kv_cache = get_attention_context(layer_name) - - self.impl.forward( - self, - query, - key, - value, - kv_cache, - attn_metadata, - output=output, - output_scale=output_scale, - output_block_scale=output_block_scale, - ) - - -def unified_attention_with_output_fake( - query: torch.Tensor, - key: torch.Tensor, - value: torch.Tensor, - output: torch.Tensor, - layer_name: str, - output_scale: torch.Tensor | None = None, - output_block_scale: torch.Tensor | None = None, -) -> None: - return - - -direct_register_custom_op( - op_name="unified_attention_with_output", - op_func=unified_attention_with_output, - mutates_args=["output", "output_block_scale"], - fake_impl=unified_attention_with_output_fake, -) - - -@maybe_transfer_kv_layer -def unified_mla_attention( - q: torch.Tensor, - kv_c_normed: torch.Tensor, - k_pe: torch.Tensor, - layer_name: str, -) -> torch.Tensor: - attn_metadata, self, kv_cache = get_attention_context(layer_name) - output = self.impl.forward(self, q, kv_c_normed, k_pe, kv_cache, attn_metadata) - - return output - - -def unified_mla_attention_fake( - q: torch.Tensor, - kv_c_normed: torch.Tensor, - k_pe: torch.Tensor, - 
layer_name: str, -) -> torch.Tensor: - return torch.empty_like(q).contiguous() - - -direct_register_custom_op( - op_name="unified_mla_attention", - op_func=unified_mla_attention, - mutates_args=[], - fake_impl=unified_mla_attention_fake, - dispatch_key=current_platform.dispatch_key, -) - - -@maybe_transfer_kv_layer -def unified_mla_attention_with_output( - q: torch.Tensor, - kv_c_normed: torch.Tensor, - k_pe: torch.Tensor, - output: torch.Tensor, - layer_name: str, - output_scale: torch.Tensor | None = None, - output_block_scale: torch.Tensor | None = None, -) -> None: - attn_metadata, self, kv_cache = get_attention_context(layer_name) - self.impl.forward( - self, - q, - kv_c_normed, - k_pe, - kv_cache, - attn_metadata, - output=output, - output_scale=output_scale, - output_block_scale=output_block_scale, - ) - - -def unified_mla_attention_with_output_fake( - q: torch.Tensor, - kv_c_normed: torch.Tensor, - k_pe: torch.Tensor, - output: torch.Tensor, - layer_name: str, - output_scale: torch.Tensor | None = None, - output_block_scale: torch.Tensor | None = None, -) -> None: - return - - -direct_register_custom_op( - op_name="unified_mla_attention_with_output", - op_func=unified_mla_attention_with_output, - mutates_args=["output", "output_block_scale"], - fake_impl=unified_mla_attention_with_output_fake, - dispatch_key=current_platform.dispatch_key, -) diff --git a/vllm/model_executor/layers/attention/attention.py b/vllm/model_executor/layers/attention/attention.py new file mode 100644 index 000000000000..aa911b4ffcfb --- /dev/null +++ b/vllm/model_executor/layers/attention/attention.py @@ -0,0 +1,432 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import torch +import torch.nn as nn + +import vllm.envs as envs +from vllm.attention.utils.kv_sharing_utils import validate_kv_sharing_target +from vllm.attention.utils.kv_transfer_utils import maybe_transfer_kv_layer +from vllm.config import CacheConfig, get_current_vllm_config +from vllm.config.vllm import VllmConfig +from vllm.forward_context import ForwardContext, get_forward_context +from vllm.logger import init_logger +from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase +from vllm.model_executor.layers.batch_invariant import vllm_is_batch_invariant +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8 +from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape +from vllm.platforms import current_platform +from vllm.utils.torch_utils import ( + direct_register_custom_op, + kv_cache_dtype_str_to_dtype, +) +from vllm.v1.attention.backend import ( + AttentionBackend, + AttentionType, +) +from vllm.v1.attention.backends.registry import AttentionBackendEnum +from vllm.v1.attention.selector import get_attn_backend +from vllm.v1.kv_cache_interface import ( + FullAttentionSpec, + KVCacheSpec, + SlidingWindowSpec, +) + +logger = init_logger(__name__) + +class Attention(nn.Module, AttentionLayerBase): + """Attention layer. + + This class takes query, key, and value tensors as input. The input tensors + can either contain prompt tokens or generation tokens. + The class does the following: + + 1. Store the input key and value tensors in the KV cache. + 2. Perform (multi-head/multi-query/grouped-query) attention. + 3. Return the output tensor. 
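+
+    A minimal construction sketch (sizes and scale mirror the unit tests
+    and are illustrative only; an active vLLM config context is assumed,
+    and `prefix` must be unique per layer):
+
+        attn = Attention(32, 128, 0.1, prefix="layers.0.self_attn")
+        output = attn(query, key, value)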
+ """ + + def __init__( + self, + num_heads: int, + head_size: int, + scale: float, + num_kv_heads: int | None = None, + alibi_slopes: list[float] | None = None, + cache_config: CacheConfig | None = None, + quant_config: QuantizationConfig | None = None, + logits_soft_cap: float | None = None, + per_layer_sliding_window: int | None = None, + prefix: str = "", + attn_type: str = AttentionType.DECODER, + kv_sharing_target_layer_name: str | None = None, + attn_backend: type[AttentionBackend] | None = None, + head_size_v: int | None = None, + **extra_impl_args, + ) -> None: + """ + The KV cache is stored inside this class and is accessed via + `self.kv_cache`. + """ + super().__init__() + if per_layer_sliding_window is not None: + # per-layer sliding window + sliding_window = per_layer_sliding_window + elif cache_config is not None: + # model-level sliding window + sliding_window = cache_config.sliding_window + else: + sliding_window = None + + vllm_config = get_current_vllm_config() + if cache_config is not None: + kv_cache_dtype = cache_config.cache_dtype + block_size = cache_config.block_size + calculate_kv_scales = cache_config.calculate_kv_scales + else: + kv_cache_dtype = "auto" + block_size = 16 + calculate_kv_scales = False + self.kv_cache_torch_dtype = kv_cache_dtype_str_to_dtype( + kv_cache_dtype, vllm_config.model_config + ) + if num_kv_heads is None: + num_kv_heads = num_heads + assert num_heads % num_kv_heads == 0, ( + f"num_heads ({num_heads}) is not divisible by num_kv_heads ({num_kv_heads})" + ) + self.quant_config = quant_config + self.layer_name = prefix + + # Initialize KV cache quantization attributes + _init_kv_cache_quant( + self, + self.quant_config, + self.layer_name, + kv_cache_dtype, + calculate_kv_scales, + ) + + self.num_heads = num_heads + self.head_size = head_size + self.head_size_v = self.head_size if head_size_v is None else head_size_v + self.num_kv_heads = num_kv_heads + self.sliding_window = sliding_window + self.has_sink = extra_impl_args.get("sinks") is not None + + # NOTE: model_config may be None during certain tests + model_config = vllm_config.model_config + self.use_mm_prefix = model_config is not None and model_config.is_mm_prefix_lm + + # During model initialization, the default dtype is set as the model + # weight and activation dtype. + dtype = torch.get_default_dtype() + if attn_backend is None: + self.attn_backend = get_attn_backend( + head_size, + dtype, + kv_cache_dtype, + block_size, + use_mla=False, + has_sink=self.has_sink, + use_mm_prefix=self.use_mm_prefix, + attn_type=attn_type, + ) + else: + self.attn_backend = attn_backend + + # prefix caching + batch invariance is currently not supported for + # FLASHINFER and TRITON_MLA. 
+ if ( + cache_config is not None + and cache_config.enable_prefix_caching + and vllm_is_batch_invariant() + and ( + self.attn_backend.get_name() == "FLASHINFER" + or self.attn_backend.get_name() == "TRITON_MLA" + ) + ): + logger.warning_once( + "Disabling prefix caching for FLASHINFER/TRITON_MLA " + "with batch invariance, as it is not yet supported.", + scope="local", + ) + cache_config.enable_prefix_caching = False + + impl_cls = self.attn_backend.get_impl_cls() + self.impl = impl_cls( + num_heads, + head_size, + scale, + num_kv_heads, + alibi_slopes, + sliding_window, + kv_cache_dtype, + logits_soft_cap, + attn_type, + kv_sharing_target_layer_name, + **extra_impl_args, + ) + self.backend = AttentionBackendEnum[self.attn_backend.get_name()] + self.dtype = dtype + + # For cuda-alike (CUDA and ROCM) and cpu platforms, we control how + # torch.compile works by registering the attention as one giant + # opaque custom op. For other platforms, we directly call them + # and let torch.compile handle them. + self.use_direct_call = not current_platform.opaque_attention_op() + + self.use_output = self.attn_backend.accept_output_buffer + compilation_config = vllm_config.compilation_config + if prefix in compilation_config.static_forward_context: + raise ValueError(f"Duplicate layer name: {prefix}") + compilation_config.static_forward_context[prefix] = self + self.attn_type = attn_type + + if kv_sharing_target_layer_name is not None: + validate_kv_sharing_target( + prefix, + kv_sharing_target_layer_name, + compilation_config.static_forward_context, + ) + self.kv_sharing_target_layer_name = kv_sharing_target_layer_name + + # use a placeholder kv cache tensor during init, which will be replaced + # by bind_kv_cache + # this variable will not be accessed if use_direct_call is True + self.kv_cache = [ + torch.tensor([]) + for _ in range(vllm_config.parallel_config.pipeline_parallel_size) + ] + + # Initialize q/k/v range constants. + self.q_range = torch.tensor(envs.Q_SCALE_CONSTANT, dtype=torch.float32) + self.k_range = torch.tensor(envs.K_SCALE_CONSTANT, dtype=torch.float32) + self.v_range = torch.tensor(envs.V_SCALE_CONSTANT, dtype=torch.float32) + + # for attn backends supporting query quantization + self.query_quant = None + if ( + self.kv_cache_dtype.startswith("fp8") + and self.impl.supports_quant_query_input + ): + self.query_quant = QuantFP8(static=True, group_shape=GroupShape.PER_TENSOR) + + def forward( + self, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + # For some alternate attention backends like MLA the attention output + # shape does not match the query shape, so we optionally let the model + # definition specify the output tensor shape. + output_shape: torch.Size | None = None, + ) -> torch.Tensor: + """ + The KV cache is stored inside this class and is accessed via + `self.kv_cache`. + + Attention metadata (`attn_metadata`) is set using a context manager in + the model runner's `execute_model` method. It is accessed via forward + context using + `vllm.forward_context.get_forward_context().attn_metadata`. + """ + if self.calculate_kv_scales: + torch.ops.vllm.maybe_calc_kv_scales(query, key, value, self.layer_name) + output_dtype = query.dtype + if self.query_quant is not None: + # quantizing with a simple torch operation enables + # torch.compile to fuse this into previous ops + # which reduces overheads during decoding. 
+ # Otherwise queries are quantized using custom ops + # which causes decoding overheads + assert self.kv_cache_dtype in {"fp8", "fp8_e4m3"} + + # check if query quantization is supported + if self.impl.supports_quant_query_input: + query, _ = self.query_quant(query, self._q_scale) + + if self.use_output: + if output_shape is None: + # Handle both 2D [num_tokens, hidden] and + # 3D [num_tokens, heads, head_dim] query + num_tokens = query.shape[0] + output_shape = torch.Size( + (num_tokens, self.num_heads * self.head_size_v) + ) + output_shape = output_shape if output_shape is not None else query.shape + output = torch.empty(output_shape, dtype=output_dtype, device=query.device) + hidden_size = output_shape[-1] + # Reshape the query, key, and value tensors. + # NOTE(woosuk): We do this outside the custom op to minimize the + # CPU overheads from the non-CUDA-graph regions. + query = query.view(-1, self.num_heads, self.head_size) + output = output.view(-1, self.num_heads, self.head_size_v) + if key is not None: + key = key.view(-1, self.num_kv_heads, self.head_size) + if value is not None: + value = value.view(-1, self.num_kv_heads, self.head_size_v) + if self.use_direct_call: + forward_context: ForwardContext = get_forward_context() + attn_metadata = forward_context.attn_metadata + if isinstance(attn_metadata, dict): + attn_metadata = attn_metadata[self.layer_name] + self_kv_cache = self.kv_cache[forward_context.virtual_engine] + self.impl.forward( + self, query, key, value, self_kv_cache, attn_metadata, output=output + ) + else: + torch.ops.vllm.unified_attention_with_output( + query, key, value, output, self.layer_name + ) + return output.view(-1, hidden_size) + else: + if self.use_direct_call: + forward_context = get_forward_context() + attn_metadata = forward_context.attn_metadata + if isinstance(attn_metadata, dict): + attn_metadata = attn_metadata[self.layer_name] + self_kv_cache = self.kv_cache[forward_context.virtual_engine] + return self.impl.forward( + self, query, key, value, self_kv_cache, attn_metadata + ) + else: + return torch.ops.vllm.unified_attention( + query, key, value, self.layer_name + ) + + def calc_kv_scales(self, query, key, value): + self._q_scale.copy_(torch.abs(query).max() / self.q_range) + self._k_scale.copy_(torch.abs(key).max() / self.k_range) + self._v_scale.copy_(torch.abs(value).max() / self.v_range) + self._q_scale_float = self._q_scale.item() + self._k_scale_float = self._k_scale.item() + self._v_scale_float = self._v_scale.item() + # We only calculate the scales once + self.calculate_kv_scales = False + + def extra_repr(self) -> str: + s = f"head_size={self.impl.head_size}" # type: ignore + s += f", num_heads={self.impl.num_heads}" # type: ignore + s += f", num_kv_heads={self.impl.num_kv_heads}" # type: ignore + s += f", scale={self.impl.scale}" # type: ignore + s += f", backend={self.impl.__class__.__name__}" + return s + + def process_weights_after_loading(self, act_dtype: torch.dtype): + self.impl.process_weights_after_loading(act_dtype) + + # If we should not load quant weights, we initialize the scales to 1.0 + # as the default value. See [Note: Register q/k/v/prob scales in state dict] + # for more details. 
+        quant_method = (
+            self.quant_config.get_quant_method(self, prefix=self.layer_name)
+            if self.quant_config
+            else None
+        )
+        if not should_load_quant_weights(quant_method):
+            set_default_quant_scales(self, register_buffer=False)
+
+    def get_attn_backend(self) -> type[AttentionBackend]:
+        return self.attn_backend
+
+    def get_kv_cache_spec(self, vllm_config: VllmConfig) -> KVCacheSpec:
+        # Block size may get updated after model loading, refresh it
+        block_size = vllm_config.cache_config.block_size
+        # Should not be called for enc-dec or encoder-only attention.
+        assert self.attn_type == AttentionType.DECODER
+        if self.sliding_window is not None:
+            assert not vllm_config.model_config.use_mla, (
+                "MLA is not supported with sliding window"
+            )
+            return SlidingWindowSpec(
+                block_size=block_size,
+                num_kv_heads=self.num_kv_heads,
+                head_size=self.head_size,
+                dtype=self.kv_cache_torch_dtype,
+                sliding_window=self.sliding_window,
+            )
+        else:
+            return FullAttentionSpec(
+                block_size=block_size,
+                num_kv_heads=self.num_kv_heads,
+                head_size=self.head_size,
+                head_size_v=self.head_size_v,
+                dtype=self.kv_cache_torch_dtype,
+            )
+
+
+@maybe_transfer_kv_layer
+def unified_attention(
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    layer_name: str,
+) -> torch.Tensor:
+    attn_metadata, self, kv_cache = get_attention_context(layer_name)
+    output = self.impl.forward(self, query, key, value, kv_cache, attn_metadata)
+
+    return output
+
+
+def unified_attention_fake(
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    layer_name: str,
+) -> torch.Tensor:
+    return torch.empty_like(query).contiguous()
+
+
+direct_register_custom_op(
+    op_name="unified_attention",
+    op_func=unified_attention,
+    fake_impl=unified_attention_fake,
+)
+
+
+@maybe_transfer_kv_layer
+def unified_attention_with_output(
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    output: torch.Tensor,
+    layer_name: str,
+    output_scale: torch.Tensor | None = None,
+    output_block_scale: torch.Tensor | None = None,
+) -> None:
+    attn_metadata, self, kv_cache = get_attention_context(layer_name)
+
+    self.impl.forward(
+        self,
+        query,
+        key,
+        value,
+        kv_cache,
+        attn_metadata,
+        output=output,
+        output_scale=output_scale,
+        output_block_scale=output_block_scale,
+    )
+
+
+def unified_attention_with_output_fake(
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    output: torch.Tensor,
+    layer_name: str,
+    output_scale: torch.Tensor | None = None,
+    output_block_scale: torch.Tensor | None = None,
+) -> None:
+    return
+
+
+direct_register_custom_op(
+    op_name="unified_attention_with_output",
+    op_func=unified_attention_with_output,
+    mutates_args=["output", "output_block_scale"],
+    fake_impl=unified_attention_with_output_fake,
+)
+
diff --git a/vllm/model_executor/layers/attention/mla_attention.py b/vllm/model_executor/layers/attention/mla_attention.py
new file mode 100644
index 000000000000..330271da0b5f
--- /dev/null
+++ b/vllm/model_executor/layers/attention/mla_attention.py
@@ -0,0 +1,345 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from typing import cast
+
+import torch
+import torch.nn as nn
+
+import vllm.envs as envs
+from vllm.config import CacheConfig, get_current_vllm_config
+from vllm.config.vllm import VllmConfig
+from vllm.forward_context import ForwardContext, get_forward_context
+from vllm.logger import init_logger
+from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase
+from vllm.model_executor.layers.batch_invariant import vllm_is_batch_invariant +from vllm.model_executor.layers.linear import ( + ColumnParallelLinear, +) +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.platforms import current_platform +from vllm.utils.torch_utils import ( + kv_cache_dtype_str_to_dtype, +) +from vllm.v1.attention.backend import ( + AttentionBackend, + AttentionType, + MLAAttentionImpl, +) +from vllm.v1.attention.selector import get_attn_backend +from vllm.v1.kv_cache_interface import ( + KVCacheSpec, + MLAAttentionSpec, +) + +logger = init_logger(__name__) + + +class MLAAttention(nn.Module, AttentionLayerBase): + """Multi-Head Latent Attention layer. + + This class takes query, and compressed key/value tensors as input. + The class does the following: + + 1. Store the input key and value tensors in the KV cache. + 2. Perform (multi-head/multi-query/grouped-query) attention. + 3. Return the output tensor. + """ + + def __init__( + self, + num_heads: int, + scale: float, + qk_nope_head_dim: int, + qk_rope_head_dim: int, + v_head_dim: int, + q_lora_rank: int | None, + kv_lora_rank: int, + kv_b_proj: ColumnParallelLinear, + cache_config: CacheConfig | None = None, + quant_config: QuantizationConfig | None = None, + prefix: str = "", + use_sparse: bool = False, + indexer: object | None = None, + **extra_impl_args, + ): + super().__init__() + self.num_heads = num_heads + self.scale = scale + self.qk_nope_head_dim = qk_nope_head_dim + self.qk_rope_head_dim = qk_rope_head_dim + self.v_head_dim = v_head_dim + self.q_lora_rank = q_lora_rank + self.kv_lora_rank = kv_lora_rank + self.head_size = kv_lora_rank + qk_rope_head_dim + self.layer_name = prefix + + if cache_config is not None: + kv_cache_dtype = cache_config.cache_dtype + block_size = cache_config.block_size + calculate_kv_scales = cache_config.calculate_kv_scales + else: + kv_cache_dtype = "auto" + block_size = 16 + calculate_kv_scales = False + self.quant_config = quant_config + + # Initialize KV cache quantization attributes + _init_kv_cache_quant( + self, + self.quant_config, + self.layer_name, + kv_cache_dtype, + calculate_kv_scales, + ) + + dtype = torch.get_default_dtype() + self.attn_backend = get_attn_backend( + self.head_size, + dtype, + kv_cache_dtype, + block_size, + use_mla=True, + use_sparse=use_sparse, + ) + + if ( + cache_config is not None + and cache_config.enable_prefix_caching + and vllm_is_batch_invariant() + and ( + self.attn_backend.get_name() == "TRITON_MLA" + or self.attn_backend.get_name() == "FLASHINFER" + ) + ): + logger.warning_once( + "Disabling prefix caching for TRITON_MLA / FLASHINFER " + "with batch invariance, as it is not yet supported.", + scope="local", + ) + cache_config.enable_prefix_caching = False + + impl_cls = cast(type[MLAAttentionImpl], self.attn_backend.get_impl_cls()) + self.impl = impl_cls( + num_heads=self.num_heads, + head_size=self.head_size, + scale=self.scale, + num_kv_heads=1, + alibi_slopes=None, + sliding_window=None, + kv_cache_dtype=self.kv_cache_dtype, + logits_soft_cap=None, + attn_type=AttentionType.DECODER, + kv_sharing_target_layer_name=None, + # MLA Args + q_lora_rank=self.q_lora_rank, + kv_lora_rank=self.kv_lora_rank, + qk_nope_head_dim=self.qk_nope_head_dim, + qk_rope_head_dim=self.qk_rope_head_dim, + qk_head_dim=self.qk_nope_head_dim + self.qk_rope_head_dim, + v_head_dim=self.v_head_dim, + kv_b_proj=kv_b_proj, + indexer=indexer, + **extra_impl_args, + ) + + self.use_direct_call = not current_platform.opaque_attention_op() 
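+        # (Mirrors Attention above: platforms without an opaque attention
+        # custom op call the impl directly and let torch.compile trace it.)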
+ + compilation_config = get_current_vllm_config().compilation_config + if prefix in compilation_config.static_forward_context: + raise ValueError(f"Duplicate layer name: {prefix}") + compilation_config.static_forward_context[prefix] = self + + self.kv_cache = [ + torch.tensor([]) + for _ in range( + get_current_vllm_config().parallel_config.pipeline_parallel_size + ) + ] + + self.use_sparse = use_sparse + + # Initialize q/k/v range constants. + self.q_range = torch.tensor(envs.Q_SCALE_CONSTANT, dtype=torch.float32) + self.k_range = torch.tensor(envs.K_SCALE_CONSTANT, dtype=torch.float32) + self.v_range = torch.tensor(envs.V_SCALE_CONSTANT, dtype=torch.float32) + + def forward( + self, + q: torch.Tensor, + kv_c_normed: torch.Tensor, + k_pe: torch.Tensor, + output_shape: torch.Size | None = None, + ) -> torch.Tensor: + if self.calculate_kv_scales: + torch.ops.vllm.maybe_calc_kv_scales(q, kv_c_normed, k_pe, self.layer_name) + + if self.use_direct_call: + forward_context: ForwardContext = get_forward_context() + attn_metadata = forward_context.attn_metadata + if isinstance(attn_metadata, dict): + attn_metadata = attn_metadata[self.layer_name] + self_kv_cache = self.kv_cache[forward_context.virtual_engine] + + if self.attn_backend.accept_output_buffer: + output = torch.empty(output_shape, dtype=q.dtype, device=q.device) + self.impl.forward( + self, + q, + kv_c_normed, + k_pe, + self_kv_cache, + attn_metadata, + output=output, + ) + return output + else: + return self.impl.forward( + self, q, kv_c_normed, k_pe, self_kv_cache, attn_metadata + ) + else: + if self.attn_backend.accept_output_buffer: + output = torch.empty(output_shape, dtype=q.dtype, device=q.device) + torch.ops.vllm.unified_mla_attention_with_output( + q, + kv_c_normed, + k_pe, + output, + self.layer_name, + ) + return output + else: + return torch.ops.vllm.unified_mla_attention( + q, + kv_c_normed, + k_pe, + self.layer_name, + ) + + def process_weights_after_loading(self, act_dtype: torch.dtype): + if hasattr(self.impl, "process_weights_after_loading"): + self.impl.process_weights_after_loading(act_dtype) + + # If we should not load quant weights, we initialize the scales to 1.0 + # as the default value. See [Note: Register q/k/v/prob scales in state dict] + # for more details. + quant_method = ( + self.quant_config.get_quant_method(self, prefix=self.layer_name) + if self.quant_config + else None + ) + if not should_load_quant_weights(quant_method): + set_default_quant_scales(self, register_buffer=False) + + def calc_kv_scales( + self, q: torch.Tensor, kv_c_normed: torch.Tensor, k_pe: torch.Tensor + ) -> None: + """Optional scale calculation for MLA inputs. + + Mirrors Attention.calc_kv_scales. 
Not all MLA backends require this + """ + # Use safe defaults if ranges are not present + q_range = getattr(self, "q_range", torch.tensor(1.0)) + k_range = getattr(self, "k_range", torch.tensor(1.0)) + v_range = getattr(self, "v_range", torch.tensor(1.0)) + + self._q_scale.copy_(torch.abs(q).max() / q_range) + # kv_c_normed is the compressed KV representation; use it for k/v + kv_abs_max = torch.abs(kv_c_normed).max() + self._k_scale.copy_(kv_abs_max / k_range) + self._v_scale.copy_(kv_abs_max / v_range) + self._q_scale_float = self._q_scale.item() + self._k_scale_float = self._k_scale.item() + self._v_scale_float = self._v_scale.item() + self.calculate_kv_scales = False + + def get_attn_backend(self) -> type[AttentionBackend]: + return self.attn_backend + + def get_kv_cache_spec(self, vllm_config: VllmConfig) -> KVCacheSpec: + kv_cache_dtype = kv_cache_dtype_str_to_dtype( + self.kv_cache_dtype, vllm_config.model_config + ) + return MLAAttentionSpec( + block_size=vllm_config.cache_config.block_size, + num_kv_heads=1, + head_size=self.head_size, + dtype=kv_cache_dtype, + cache_dtype_str=vllm_config.cache_config.cache_dtype, + ) + + +@maybe_transfer_kv_layer +def unified_mla_attention( + q: torch.Tensor, + kv_c_normed: torch.Tensor, + k_pe: torch.Tensor, + layer_name: str, +) -> torch.Tensor: + attn_metadata, self, kv_cache = get_attention_context(layer_name) + output = self.impl.forward(self, q, kv_c_normed, k_pe, kv_cache, attn_metadata) + + return output + + +def unified_mla_attention_fake( + q: torch.Tensor, + kv_c_normed: torch.Tensor, + k_pe: torch.Tensor, + layer_name: str, +) -> torch.Tensor: + return torch.empty_like(q).contiguous() + + +direct_register_custom_op( + op_name="unified_mla_attention", + op_func=unified_mla_attention, + mutates_args=[], + fake_impl=unified_mla_attention_fake, + dispatch_key=current_platform.dispatch_key, +) + + +@maybe_transfer_kv_layer +def unified_mla_attention_with_output( + q: torch.Tensor, + kv_c_normed: torch.Tensor, + k_pe: torch.Tensor, + output: torch.Tensor, + layer_name: str, + output_scale: torch.Tensor | None = None, + output_block_scale: torch.Tensor | None = None, +) -> None: + attn_metadata, self, kv_cache = get_attention_context(layer_name) + self.impl.forward( + self, + q, + kv_c_normed, + k_pe, + kv_cache, + attn_metadata, + output=output, + output_scale=output_scale, + output_block_scale=output_block_scale, + ) + + +def unified_mla_attention_with_output_fake( + q: torch.Tensor, + kv_c_normed: torch.Tensor, + k_pe: torch.Tensor, + output: torch.Tensor, + layer_name: str, + output_scale: torch.Tensor | None = None, + output_block_scale: torch.Tensor | None = None, +) -> None: + return + + +direct_register_custom_op( + op_name="unified_mla_attention_with_output", + op_func=unified_mla_attention_with_output, + mutates_args=["output", "output_block_scale"], + fake_impl=unified_mla_attention_with_output_fake, + dispatch_key=current_platform.dispatch_key, +) From d55d99a8a9be7a03dedae2e22d014edeb092dbd1 Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Mon, 12 Jan 2026 10:20:09 -0500 Subject: [PATCH 03/18] Move content out of kv_sharing_utils.py and delete file Signed-off-by: Matthew Bonanni --- vllm/attention/utils/kv_sharing_utils.py | 33 ----------------- .../layers/attention/attention.py | 36 +++++++++++++++++-- 2 files changed, 34 insertions(+), 35 deletions(-) delete mode 100644 vllm/attention/utils/kv_sharing_utils.py diff --git a/vllm/attention/utils/kv_sharing_utils.py b/vllm/attention/utils/kv_sharing_utils.py deleted file mode 
100644 index 93af5bf7e13f..000000000000 --- a/vllm/attention/utils/kv_sharing_utils.py +++ /dev/null @@ -1,33 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -def validate_kv_sharing_target( - current_layer_name, target_layer_name, static_forward_context -): - error_msg = ( - f"Specified KV sharing target layer for {current_layer_name} " - f"is not valid: target layer {target_layer_name} " - ) - - if current_layer_name == target_layer_name: - raise ValueError(error_msg + "cannot be the same as the current layer.") - - if target_layer_name not in static_forward_context: - from vllm.model_executor.models.utils import extract_layer_index - - # If target layer name is not in the static fwd context, it means either - # a) the target layer does not come BEFORE the current layer, or - # b) the target layer is not an Attention layer that exists in the model - current_layer_idx = extract_layer_index(current_layer_name) - target_layer_idx = extract_layer_index(target_layer_name) - if current_layer_idx <= target_layer_idx: - raise ValueError(error_msg + "must come before the current layer.") - else: - raise ValueError(error_msg + "is not a valid Attention layer in the model.") - - # Currently KV sharing is only supported between layers of the same type - target_layer_attn_type = static_forward_context[target_layer_name].attn_type - expected = static_forward_context[current_layer_name].attn_type - if target_layer_attn_type != expected: - raise ValueError( - error_msg + f"must be the same type as the current layer ({expected})." - ) diff --git a/vllm/model_executor/layers/attention/attention.py b/vllm/model_executor/layers/attention/attention.py index aa911b4ffcfb..88ec63a809ee 100644 --- a/vllm/model_executor/layers/attention/attention.py +++ b/vllm/model_executor/layers/attention/attention.py @@ -5,7 +5,6 @@ import torch.nn as nn import vllm.envs as envs -from vllm.attention.utils.kv_sharing_utils import validate_kv_sharing_target from vllm.attention.utils.kv_transfer_utils import maybe_transfer_kv_layer from vllm.config import CacheConfig, get_current_vllm_config from vllm.config.vllm import VllmConfig @@ -35,6 +34,40 @@ logger = init_logger(__name__) + +def validate_kv_sharing_target( + current_layer_name, target_layer_name, static_forward_context +): + error_msg = ( + f"Specified KV sharing target layer for {current_layer_name} " + f"is not valid: target layer {target_layer_name} " + ) + + if current_layer_name == target_layer_name: + raise ValueError(error_msg + "cannot be the same as the current layer.") + + if target_layer_name not in static_forward_context: + from vllm.model_executor.models.utils import extract_layer_index + + # If target layer name is not in the static fwd context, it means either + # a) the target layer does not come BEFORE the current layer, or + # b) the target layer is not an Attention layer that exists in the model + current_layer_idx = extract_layer_index(current_layer_name) + target_layer_idx = extract_layer_index(target_layer_name) + if current_layer_idx <= target_layer_idx: + raise ValueError(error_msg + "must come before the current layer.") + else: + raise ValueError(error_msg + "is not a valid Attention layer in the model.") + + # Currently KV sharing is only supported between layers of the same type + target_layer_attn_type = static_forward_context[target_layer_name].attn_type + expected = static_forward_context[current_layer_name].attn_type + if target_layer_attn_type != expected: + raise 
ValueError( + error_msg + f"must be the same type as the current layer ({expected})." + ) + + class Attention(nn.Module, AttentionLayerBase): """Attention layer. @@ -429,4 +462,3 @@ def unified_attention_with_output_fake( mutates_args=["output", "output_block_scale"], fake_impl=unified_attention_with_output_fake, ) - From d1828949053783cc9e78f684f0732cf38252296e Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Mon, 12 Jan 2026 10:30:46 -0500 Subject: [PATCH 04/18] Update imports Signed-off-by: Matthew Bonanni --- docs/contributing/model/basic.md | 2 +- docs/design/custom_op.md | 2 +- tests/compile/test_fusion_attn.py | 2 +- tests/compile/test_qk_norm_rope_fusion.py | 2 +- tests/kernels/attention/test_attention.py | 2 +- tests/v1/worker/test_gpu_model_runner.py | 2 +- tests/v1/worker/test_utils.py | 4 ++-- vllm/compilation/fusion_attn.py | 2 +- vllm/compilation/qk_norm_rope_fusion.py | 2 +- .../kv_transfer/kv_connector/v1/offloading_connector.py | 2 +- .../layers/attention/chunked_local_attention.py | 2 +- vllm/model_executor/layers/attention/cross_attention.py | 2 +- .../model_executor/layers/attention/encoder_only_attention.py | 2 +- vllm/model_executor/layers/attention/static_sink_attention.py | 2 +- vllm/model_executor/layers/mla.py | 2 +- .../quantization/compressed_tensors/compressed_tensors.py | 2 +- vllm/model_executor/layers/quantization/fp8.py | 2 +- vllm/model_executor/layers/quantization/modelopt.py | 2 +- vllm/model_executor/layers/quantization/mxfp4.py | 2 +- vllm/model_executor/layers/quantization/petit.py | 2 +- vllm/model_executor/layers/quantization/ptpc_fp8.py | 2 +- vllm/model_executor/layers/quantization/quark/quark.py | 2 +- vllm/model_executor/model_loader/utils.py | 3 ++- vllm/model_executor/models/afmoe.py | 2 +- vllm/model_executor/models/apertus.py | 2 +- vllm/model_executor/models/arctic.py | 2 +- vllm/model_executor/models/baichuan.py | 2 +- vllm/model_executor/models/bailing_moe.py | 2 +- vllm/model_executor/models/bamba.py | 2 +- vllm/model_executor/models/bloom.py | 2 +- vllm/model_executor/models/chameleon.py | 2 +- vllm/model_executor/models/chatglm.py | 2 +- vllm/model_executor/models/clip.py | 2 +- vllm/model_executor/models/commandr.py | 2 +- vllm/model_executor/models/dbrx.py | 2 +- vllm/model_executor/models/deepseek_v2.py | 2 +- vllm/model_executor/models/dots1.py | 2 +- vllm/model_executor/models/ernie45_moe.py | 2 +- vllm/model_executor/models/ernie45_vl_moe.py | 3 +-- vllm/model_executor/models/exaone.py | 2 +- vllm/model_executor/models/exaone4.py | 2 +- vllm/model_executor/models/falcon.py | 2 +- vllm/model_executor/models/falcon_h1.py | 2 +- vllm/model_executor/models/gemma.py | 2 +- vllm/model_executor/models/gemma2.py | 2 +- vllm/model_executor/models/gemma3.py | 2 +- vllm/model_executor/models/gemma3n.py | 2 +- vllm/model_executor/models/glm4.py | 2 +- vllm/model_executor/models/glm4_moe.py | 2 +- vllm/model_executor/models/gpt2.py | 2 +- vllm/model_executor/models/gpt_bigcode.py | 2 +- vllm/model_executor/models/gpt_j.py | 2 +- vllm/model_executor/models/gpt_neox.py | 2 +- vllm/model_executor/models/gpt_oss.py | 2 +- vllm/model_executor/models/granite.py | 2 +- vllm/model_executor/models/granitemoe.py | 2 +- vllm/model_executor/models/granitemoehybrid.py | 2 +- vllm/model_executor/models/grok1.py | 2 +- vllm/model_executor/models/hunyuan_v1.py | 2 +- vllm/model_executor/models/internlm2.py | 2 +- vllm/model_executor/models/iquest_loopcoder.py | 2 +- vllm/model_executor/models/jais.py | 2 +- vllm/model_executor/models/jais2.py | 2 +- 
 vllm/model_executor/models/jamba.py | 2 +-
 vllm/model_executor/models/lfm2.py | 2 +-
 vllm/model_executor/models/lfm2_moe.py | 2 +-
 vllm/model_executor/models/llama.py | 2 +-
 vllm/model_executor/models/llama4.py | 2 +-
 vllm/model_executor/models/mimo_v2_flash.py | 2 +-
 vllm/model_executor/models/minicpm.py | 2 +-
 vllm/model_executor/models/minicpm3.py | 2 +-
 vllm/model_executor/models/minimax_m2.py | 2 +-
 vllm/model_executor/models/minimax_text_01.py | 2 +-
 vllm/model_executor/models/mixtral.py | 2 +-
 vllm/model_executor/models/molmo.py | 2 +-
 vllm/model_executor/models/mpt.py | 2 +-
 vllm/model_executor/models/nemotron.py | 2 +-
 vllm/model_executor/models/nemotron_h.py | 2 +-
 vllm/model_executor/models/olmo.py | 2 +-
 vllm/model_executor/models/olmo2.py | 2 +-
 vllm/model_executor/models/olmoe.py | 2 +-
 vllm/model_executor/models/openpangu.py | 3 ++-
 vllm/model_executor/models/opt.py | 2 +-
 vllm/model_executor/models/orion.py | 2 +-
 vllm/model_executor/models/ouro.py | 2 +-
 vllm/model_executor/models/persimmon.py | 2 +-
 vllm/model_executor/models/phi.py | 2 +-
 vllm/model_executor/models/phimoe.py | 2 +-
 vllm/model_executor/models/plamo2.py | 2 +-
 vllm/model_executor/models/plamo3.py | 2 +-
 vllm/model_executor/models/qwen.py | 2 +-
 vllm/model_executor/models/qwen2.py | 2 +-
 vllm/model_executor/models/qwen2_moe.py | 2 +-
 vllm/model_executor/models/qwen3.py | 2 +-
 vllm/model_executor/models/qwen3_moe.py | 2 +-
 vllm/model_executor/models/qwen3_next.py | 2 +-
 vllm/model_executor/models/seed_oss.py | 2 +-
 vllm/model_executor/models/solar.py | 2 +-
 vllm/model_executor/models/stablelm.py | 2 +-
 vllm/model_executor/models/starcoder2.py | 2 +-
 vllm/model_executor/models/step3_text.py | 2 +-
 vllm/model_executor/models/transformers/base.py | 2 +-
 vllm/model_executor/models/whisper.py | 2 +-
 vllm/model_executor/models/whisper_utils.py | 2 +-
 vllm/model_executor/models/zamba2.py | 2 +-
 vllm/v1/attention/backends/flash_attn.py | 2 +-
 vllm/v1/attention/backends/rocm_aiter_fa.py | 2 +-
 vllm/v1/worker/gpu_model_runner.py | 3 ++-
 vllm/v1/worker/utils.py | 2 +-
 109 files changed, 113 insertions(+), 111 deletions(-)

diff --git a/docs/contributing/model/basic.md b/docs/contributing/model/basic.md
index 28f6f960ab04..ea5c3e922739 100644
--- a/docs/contributing/model/basic.md
+++ b/docs/contributing/model/basic.md
@@ -29,7 +29,7 @@ The initialization code should look like this:
     ```python
     from torch import nn
     from vllm.config import VllmConfig
-    from vllm.attention.layer import Attention
+    from vllm.model_executor.layers.attention.attention import Attention
 
     class MyAttention(nn.Module):
         def __init__(self, vllm_config: VllmConfig, prefix: str):
diff --git a/docs/design/custom_op.md b/docs/design/custom_op.md
index fd298a149ab0..4e0922c4825b 100644
--- a/docs/design/custom_op.md
+++ b/docs/design/custom_op.md
@@ -280,7 +280,7 @@ Taking `MMEncoderAttention` as an example:
 ??? code
 
     ```python
-    from vllm.attention.layers.mm_encoder_attention import MMEncoderAttention
+    from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
 
     from vllm.model_executor.custom_op import CustomOp
 
diff --git a/tests/compile/test_fusion_attn.py b/tests/compile/test_fusion_attn.py
index a1fd098aee5f..aac59cb8cdb0 100644
--- a/tests/compile/test_fusion_attn.py
+++ b/tests/compile/test_fusion_attn.py
@@ -9,7 +9,6 @@
 from tests.utils import flat_product
 from tests.v1.attention.utils import BatchSpec, create_common_attn_metadata
 from vllm._custom_ops import cutlass_scaled_fp4_mm, scaled_fp4_quant
-from vllm.attention.layer import Attention
 from vllm.compilation.fusion_attn import ATTN_OP, AttnFusionPass
 from vllm.compilation.fx_utils import find_op_nodes
 from vllm.compilation.matcher_utils import QUANT_OPS
@@ -27,6 +26,7 @@
     set_current_vllm_config,
 )
 from vllm.forward_context import get_forward_context, set_forward_context
+from vllm.model_executor.layers.attention.attention import Attention
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
     QuantKey,
     kFp8StaticTensorSym,
diff --git a/tests/compile/test_qk_norm_rope_fusion.py b/tests/compile/test_qk_norm_rope_fusion.py
index 45a114679beb..9e4787f1f087 100644
--- a/tests/compile/test_qk_norm_rope_fusion.py
+++ b/tests/compile/test_qk_norm_rope_fusion.py
@@ -5,7 +5,6 @@
 import torch
 
 from tests.compile.backend import TestBackend
-from vllm.attention.layer import Attention
 from vllm.compilation.matcher_utils import FLASHINFER_ROTARY_OP, RMS_OP, ROTARY_OP
 from vllm.compilation.noop_elimination import NoOpEliminationPass
 from vllm.compilation.post_cleanup import PostCleanupPass
@@ -21,6 +20,7 @@
     VllmConfig,
     set_current_vllm_config,
 )
+from vllm.model_executor.layers.attention.attention import Attention
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding
 from vllm.platforms import current_platform
diff --git a/tests/kernels/attention/test_attention.py b/tests/kernels/attention/test_attention.py
index 94d494613fe7..4aa8f98d8798 100644
--- a/tests/kernels/attention/test_attention.py
+++ b/tests/kernels/attention/test_attention.py
@@ -9,7 +9,7 @@
 from tests.kernels.allclose_default import get_default_atol, get_default_rtol
 from tests.kernels.utils import opcheck
 from vllm import _custom_ops as ops
-from vllm.attention.layer import Attention
+from vllm.model_executor.layers.attention.attention import Attention
 from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
 from vllm.platforms import current_platform
 from vllm.utils.mem_utils import get_max_shared_memory_bytes
diff --git a/tests/v1/worker/test_gpu_model_runner.py b/tests/v1/worker/test_gpu_model_runner.py
index badbd3e9adff..0b6f446154fe 100644
--- a/tests/v1/worker/test_gpu_model_runner.py
+++ b/tests/v1/worker/test_gpu_model_runner.py
@@ -5,7 +5,6 @@
 import pytest
 import torch
 
-from vllm.attention.layer import Attention
 from vllm.config import (
     AttentionConfig,
     CacheConfig,
@@ -19,6 +18,7 @@
     init_distributed_environment,
     initialize_model_parallel,
 )
+from vllm.model_executor.layers.attention.attention import Attention
 from vllm.model_executor.layers.mamba.mamba_mixer2 import MambaMixer2
 from vllm.platforms import current_platform
 from vllm.sampling_params import SamplingParams
diff --git a/tests/v1/worker/test_utils.py b/tests/v1/worker/test_utils.py
index a13e11d7178e..9c5f753ed04a 100644
--- a/tests/v1/worker/test_utils.py
+++ b/tests/v1/worker/test_utils.py
@@ -7,7 +7,7 @@
 
 
 def test_bind_kv_cache(default_vllm_config):
-    from vllm.attention.layer import Attention
+    from vllm.model_executor.layers.attention.attention import Attention
 
     ctx = {
         "layers.0.self_attn": Attention(32, 128, 0.1, prefix="layers.0.self_attn"),
@@ -35,7 +35,7 @@ def test_bind_kv_cache(default_vllm_config):
 
 
 def test_bind_kv_cache_non_attention(default_vllm_config):
-    from vllm.attention.layer import Attention
+    from vllm.model_executor.layers.attention.attention import Attention
 
     # example from Jamba PP=2
     ctx = {
diff --git a/vllm/compilation/fusion_attn.py b/vllm/compilation/fusion_attn.py
index 6dcbbd85d703..cde2e5611690 100644
--- a/vllm/compilation/fusion_attn.py
+++ b/vllm/compilation/fusion_attn.py
@@ -10,9 +10,9 @@
 from torch._higher_order_ops.auto_functionalize import auto_functionalized
 from torch._inductor.pattern_matcher import PatternMatcherPass
 
-from vllm.attention.layer import Attention
 from vllm.config import VllmConfig, get_layers_from_vllm_config
 from vllm.logger import init_logger
+from vllm.model_executor.layers.attention.attention import Attention
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
     QuantKey,
     kNvfp4Quant,
diff --git a/vllm/compilation/qk_norm_rope_fusion.py b/vllm/compilation/qk_norm_rope_fusion.py
index 794cd8e3fce5..20f092faf8c0 100644
--- a/vllm/compilation/qk_norm_rope_fusion.py
+++ b/vllm/compilation/qk_norm_rope_fusion.py
@@ -9,9 +9,9 @@
 from torch._higher_order_ops.auto_functionalize import auto_functionalized
 from torch._inductor.pattern_matcher import PatternMatcherPass
 
-from vllm.attention.layer import Attention
 from vllm.config import VllmConfig, get_layers_from_vllm_config
 from vllm.logger import init_logger
+from vllm.model_executor.layers.attention.attention import Attention
 from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding
 
 from .fusion import empty_bf16, empty_fp32, empty_i64
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py
index 67cf4b047025..fcae13d9c427 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py
@@ -8,7 +8,6 @@
 
 import torch
 
-from vllm.attention.layer import Attention
 from vllm.config import VllmConfig, get_layers_from_vllm_config
 from vllm.distributed.kv_events import BlockRemoved, BlockStored, KVCacheEvent
 from vllm.distributed.kv_transfer.kv_connector.utils import yield_req_data
@@ -19,6 +18,7 @@
 from vllm.distributed.kv_transfer.kv_connector.v1.base import KVConnectorMetadata
 from vllm.forward_context import ForwardContext
 from vllm.logger import init_logger
+from vllm.model_executor.layers.attention.attention import Attention
 from vllm.v1.attention.backend import AttentionBackend, AttentionMetadata
 from vllm.v1.core.kv_cache_manager import KVCacheBlocks
 from vllm.v1.core.kv_cache_utils import BlockHash
diff --git a/vllm/model_executor/layers/attention/chunked_local_attention.py b/vllm/model_executor/layers/attention/chunked_local_attention.py
index a34506934bde..e251f4bf6397 100644
--- a/vllm/model_executor/layers/attention/chunked_local_attention.py
+++ b/vllm/model_executor/layers/attention/chunked_local_attention.py
@@ -4,9 +4,9 @@
 
 import torch
 
-from vllm.attention.layer import Attention
 from vllm.config import CacheConfig
 from vllm.config.vllm import VllmConfig
+from vllm.model_executor.layers.attention.attention import Attention
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.v1.attention.backend import AttentionBackend
 from vllm.v1.attention.backends.utils import (
diff --git a/vllm/model_executor/layers/attention/cross_attention.py b/vllm/model_executor/layers/attention/cross_attention.py
index 9c3bc3403940..1041f0519925 100644
--- a/vllm/model_executor/layers/attention/cross_attention.py
+++ b/vllm/model_executor/layers/attention/cross_attention.py
@@ -6,9 +6,9 @@
 import numpy as np
 import torch
 
-from vllm.attention.layer import Attention
 from vllm.config import CacheConfig, VllmConfig
 from vllm.logger import init_logger
+from vllm.model_executor.layers.attention.attention import Attention
 from vllm.utils.math_utils import cdiv
 from vllm.v1.attention.backend import (
     AttentionBackend,
diff --git a/vllm/model_executor/layers/attention/encoder_only_attention.py b/vllm/model_executor/layers/attention/encoder_only_attention.py
index c130fd095652..023959203c68 100644
--- a/vllm/model_executor/layers/attention/encoder_only_attention.py
+++ b/vllm/model_executor/layers/attention/encoder_only_attention.py
@@ -5,9 +5,9 @@
 
 import torch
 
-from vllm.attention.layer import Attention
 from vllm.config import CacheConfig
 from vllm.config.vllm import VllmConfig
+from vllm.model_executor.layers.attention.attention import Attention
 from vllm.v1.attention.backend import (
     AttentionBackend,
     AttentionMetadata,
diff --git a/vllm/model_executor/layers/attention/static_sink_attention.py b/vllm/model_executor/layers/attention/static_sink_attention.py
index 918dff560f1d..c2b20fc61ba5 100644
--- a/vllm/model_executor/layers/attention/static_sink_attention.py
+++ b/vllm/model_executor/layers/attention/static_sink_attention.py
@@ -4,11 +4,11 @@
 
 import torch
 
-from vllm.attention.layer import Attention
 from vllm.config import CacheConfig, VllmConfig
 from vllm.forward_context import ForwardContext, get_forward_context
 from vllm.logger import init_logger
 from vllm.model_executor.custom_op import CustomOp
+from vllm.model_executor.layers.attention.attention import Attention
 from vllm.utils.math_utils import cdiv
 from vllm.utils.torch_utils import direct_register_custom_op
 from vllm.v1.attention.backend import (
diff --git a/vllm/model_executor/layers/mla.py b/vllm/model_executor/layers/mla.py
index 65541d2a485a..a6236fbe3995 100644
--- a/vllm/model_executor/layers/mla.py
+++ b/vllm/model_executor/layers/mla.py
@@ -4,9 +4,9 @@
 
 import torch
 
-from vllm.attention.layer import MLAAttention
 from vllm.config import CacheConfig
 from vllm.model_executor.custom_op import CustomOp
+from vllm.model_executor.layers.attention.mla_attention import MLAAttention
 from vllm.model_executor.layers.quantization import QuantizationConfig
 
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
index bc4fdfdda771..b20cf50c88b1 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
@@ -18,8 +18,8 @@
 from compressed_tensors.transform import TransformConfig
 
 import vllm.envs as envs
-from vllm.attention.layer import Attention
 from vllm.logger import init_logger
+from vllm.model_executor.layers.attention.attention import Attention
 from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.linear import (
     LinearBase,
diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py
index 1c0c35bf6f41..9a1e708c480c 100644
--- a/vllm/model_executor/layers/quantization/fp8.py
+++ b/vllm/model_executor/layers/quantization/fp8.py
@@ -11,9 +11,9 @@
 import vllm.model_executor.layers.fused_moe.modular_kernel as mk
 from vllm import _custom_ops as ops
 from vllm._aiter_ops import rocm_aiter_ops
-from vllm.attention.layer import Attention
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.logger import init_logger
+from vllm.model_executor.layers.attention.attention import Attention
 from vllm.model_executor.layers.batch_invariant import (
     vllm_is_batch_invariant,
 )
diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py
index a646012ddd3a..19d103b20119 100644
--- a/vllm/model_executor/layers/quantization/modelopt.py
+++ b/vllm/model_executor/layers/quantization/modelopt.py
@@ -11,8 +11,8 @@
 import vllm.envs as envs
 import vllm.model_executor.layers.fused_moe.modular_kernel as mk
 from vllm._custom_ops import cutlass_scaled_fp4_mm, scaled_fp4_quant
-from vllm.attention.layer import Attention
 from vllm.logger import init_logger
+from vllm.model_executor.layers.attention.attention import Attention
 from vllm.model_executor.layers.fused_moe.config import (
     FusedMoEConfig,
     FusedMoEQuantConfig,
diff --git a/vllm/model_executor/layers/quantization/mxfp4.py b/vllm/model_executor/layers/quantization/mxfp4.py
index 8e050b795f94..5d66005801c4 100644
--- a/vllm/model_executor/layers/quantization/mxfp4.py
+++ b/vllm/model_executor/layers/quantization/mxfp4.py
@@ -7,9 +7,9 @@
 from torch.nn.parameter import Parameter
 
 from vllm import envs
-from vllm.attention.layer import Attention
 from vllm.config import get_current_vllm_config
 from vllm.logger import init_logger
+from vllm.model_executor.layers.attention.attention import Attention
 from vllm.model_executor.layers.fused_moe import (
     FusedMoE,
     FusedMoEConfig,
diff --git a/vllm/model_executor/layers/quantization/petit.py b/vllm/model_executor/layers/quantization/petit.py
index 5ccc73166361..ad7f453bee48 100644
--- a/vllm/model_executor/layers/quantization/petit.py
+++ b/vllm/model_executor/layers/quantization/petit.py
@@ -8,8 +8,8 @@
 import torch
 from torch.nn.parameter import Parameter
 
-from vllm.attention.layer import Attention
 from vllm.logger import init_logger
+from vllm.model_executor.layers.attention.attention import Attention
 from vllm.model_executor.layers.linear import (
     LinearBase,
     LinearMethodBase,
diff --git a/vllm/model_executor/layers/quantization/ptpc_fp8.py b/vllm/model_executor/layers/quantization/ptpc_fp8.py
index ed8a2c7fa084..f017b5821641 100644
--- a/vllm/model_executor/layers/quantization/ptpc_fp8.py
+++ b/vllm/model_executor/layers/quantization/ptpc_fp8.py
@@ -7,8 +7,8 @@
 from torch.nn.parameter import Parameter
 
 from vllm import _custom_ops as ops
-from vllm.attention.layer import Attention
 from vllm.logger import init_logger
+from vllm.model_executor.layers.attention.attention import Attention
 from vllm.model_executor.layers.linear import LinearBase, UnquantizedLinearMethod
 from vllm.model_executor.layers.quantization import QuantizationMethods
 from vllm.model_executor.layers.quantization.base_config import QuantizeMethodBase
diff --git a/vllm/model_executor/layers/quantization/quark/quark.py b/vllm/model_executor/layers/quantization/quark/quark.py
index 39bcd56bcd3d..1a3e97d395b4 100644
--- a/vllm/model_executor/layers/quantization/quark/quark.py
+++ b/vllm/model_executor/layers/quantization/quark/quark.py
@@ -6,8 +6,8 @@
 
 import torch
 
-from vllm.attention.layer import Attention
 from vllm.logger import init_logger
+from vllm.model_executor.layers.attention.attention import Attention
 from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.linear import (
     LinearBase,
diff --git a/vllm/model_executor/model_loader/utils.py b/vllm/model_executor/model_loader/utils.py
index 08d7a851ac9a..042add92bfaa 100644
--- a/vllm/model_executor/model_loader/utils.py
+++ b/vllm/model_executor/model_loader/utils.py
@@ -11,9 +11,10 @@
 from torch import nn
 from typing_extensions import assert_never
 
-from vllm.attention.layer import Attention, MLAAttention
 from vllm.config import ModelConfig, VllmConfig, set_current_vllm_config
 from vllm.logger import init_logger
+from vllm.model_executor.layers.attention.attention import Attention
+from vllm.model_executor.layers.attention.mla_attention import MLAAttention
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig,
     QuantizeMethodBase,
diff --git a/vllm/model_executor/models/afmoe.py b/vllm/model_executor/models/afmoe.py
index ef6f59e447d2..2360a37f9b0b 100644
--- a/vllm/model_executor/models/afmoe.py
+++ b/vllm/model_executor/models/afmoe.py
@@ -9,7 +9,6 @@
 import torch
 from torch import nn
 
-from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig, get_current_vllm_config
 from vllm.distributed import (
@@ -18,6 +17,7 @@
     get_tensor_model_parallel_world_size,
 )
 from vllm.logger import init_logger
+from vllm.model_executor.layers.attention.attention import Attention
 from vllm.model_executor.layers.fused_moe.shared_fused_moe import SharedFusedMoE
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
diff --git a/vllm/model_executor/models/apertus.py b/vllm/model_executor/models/apertus.py
index 7d43735c0053..3cf41608b844 100644
--- a/vllm/model_executor/models/apertus.py
+++ b/vllm/model_executor/models/apertus.py
@@ -32,11 +32,11 @@
 from torch import nn
 from transformers import ApertusConfig
 
-from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import XIELU
+from vllm.model_executor.layers.attention.attention import Attention
 from vllm.model_executor.layers.attention.encoder_only_attention import (
     EncoderOnlyAttention,
 )
diff --git a/vllm/model_executor/models/arctic.py b/vllm/model_executor/models/arctic.py
index 0200984c0ec8..5bc9b71a3d64 100644
--- a/vllm/model_executor/models/arctic.py
+++ b/vllm/model_executor/models/arctic.py
@@ -8,7 +8,6 @@
 import torch
 from torch import nn
 
-from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import (
@@ -19,6 +18,7 @@
 )
 from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import SiluAndMul
+from vllm.model_executor.layers.attention.attention import Attention
 from vllm.model_executor.layers.fused_moe import fused_experts, fused_topk
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
diff --git a/vllm/model_executor/models/baichuan.py b/vllm/model_executor/models/baichuan.py
index ee4a1dbd6df9..922819694d48 100644
--- a/vllm/model_executor/models/baichuan.py
+++ b/vllm/model_executor/models/baichuan.py
@@ -29,7 +29,6 @@
 from torch import nn
 from transformers import PretrainedConfig
 
-from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import (
@@ -38,6 +37,7 @@
     get_tensor_model_parallel_world_size,
 )
 from vllm.model_executor.layers.activation import SiluAndMul
+from vllm.model_executor.layers.attention.attention import Attention
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
     MergedColumnParallelLinear,
diff --git a/vllm/model_executor/models/bailing_moe.py b/vllm/model_executor/models/bailing_moe.py
index e1e675bd5a05..8b2ddf2bf0f0 100644
--- a/vllm/model_executor/models/bailing_moe.py
+++ b/vllm/model_executor/models/bailing_moe.py
@@ -32,7 +32,6 @@
 from torch import nn
 from transformers.configuration_utils import PretrainedConfig
 
-from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import (
@@ -41,6 +40,7 @@
     get_tensor_model_parallel_world_size,
 )
 from vllm.model_executor.layers.activation import SiluAndMul
+from vllm.model_executor.layers.attention.attention import Attention
 from vllm.model_executor.layers.fused_moe import SharedFusedMoE
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
diff --git a/vllm/model_executor/models/bamba.py b/vllm/model_executor/models/bamba.py
index 22631bbc5489..94761e6aa7fd 100644
--- a/vllm/model_executor/models/bamba.py
+++ b/vllm/model_executor/models/bamba.py
@@ -9,12 +9,12 @@
 from torch import nn
 from transformers import BambaConfig
 
-from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, ModelConfig, VllmConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.distributed.parallel_state import get_pp_group
 from vllm.model_executor.layers.activation import SiluAndMul
+from vllm.model_executor.layers.attention.attention import Attention
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
     MergedColumnParallelLinear,
diff --git a/vllm/model_executor/models/bloom.py b/vllm/model_executor/models/bloom.py
index 507fbf1fdd0a..12ddedd5220b 100644
--- a/vllm/model_executor/models/bloom.py
+++ b/vllm/model_executor/models/bloom.py
@@ -27,7 +27,6 @@
 from torch import nn
 from transformers import BloomConfig
 
-from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import (
@@ -36,6 +35,7 @@
     get_tensor_model_parallel_world_size,
 )
 from vllm.model_executor.layers.activation import get_act_fn
+from vllm.model_executor.layers.attention.attention import Attention
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
     QKVParallelLinear,
diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py
index 176c5cd14c6e..7db36f6f5fae 100644
--- a/vllm/model_executor/models/chameleon.py
+++ b/vllm/model_executor/models/chameleon.py
@@ -16,12 +16,12 @@
     ChameleonVQVAEConfig,
 )
 
-from vllm.attention.layer import Attention
 from vllm.config import CacheConfig, VllmConfig
 from vllm.config.multimodal import BaseDummyOptions
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import SiluAndMul
+from vllm.model_executor.layers.attention.attention import Attention
 from vllm.model_executor.layers.conv import Conv2dLayer
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py
index 26181d1c9bae..498b6d9afcc5 100644
--- a/vllm/model_executor/models/chatglm.py
+++ b/vllm/model_executor/models/chatglm.py
@@ -12,11 +12,11 @@
 from torch import nn
 from torch.nn import LayerNorm
 
-from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import SiluAndMul
+from vllm.model_executor.layers.attention.attention import Attention
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
     MergedColumnParallelLinear,
diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py
index d18904fdf603..d779cedc83fb 100644
--- a/vllm/model_executor/models/clip.py
+++ b/vllm/model_executor/models/clip.py
@@ -14,11 +14,11 @@
     CLIPVisionConfig,
 )
 
-from vllm.attention.layer import Attention
 from vllm.config import VllmConfig
 from vllm.config.multimodal import BaseDummyOptions, MultiModalConfig
 from vllm.distributed import divide, get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import get_act_fn
+from vllm.model_executor.layers.attention.attention import Attention
 from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
 from vllm.model_executor.layers.conv import Conv2dLayer
 from vllm.model_executor.layers.linear import (
diff --git a/vllm/model_executor/models/commandr.py b/vllm/model_executor/models/commandr.py
index 63a93eaa2d4f..43af3f15bd3b 100644
--- a/vllm/model_executor/models/commandr.py
+++ b/vllm/model_executor/models/commandr.py
@@ -30,11 +30,11 @@
 from torch import nn
 from transformers import Cohere2Config, CohereConfig
 
-from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import SiluAndMul
+from vllm.model_executor.layers.attention.attention import Attention
 from vllm.model_executor.layers.linear import (
     MergedColumnParallelLinear,
     QKVParallelLinear,
diff --git a/vllm/model_executor/models/dbrx.py b/vllm/model_executor/models/dbrx.py
index db4fe61b0d85..beadba5d25a0 100644
--- a/vllm/model_executor/models/dbrx.py
+++ b/vllm/model_executor/models/dbrx.py
@@ -8,13 +8,13 @@
 import torch.nn as nn
 from transformers import DbrxConfig
 
-from vllm.attention.layer import Attention
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import (
     get_pp_group,
     get_tensor_model_parallel_rank,
     get_tensor_model_parallel_world_size,
 )
+from vllm.model_executor.layers.attention.attention import Attention
 from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.linear import (
     QKVParallelLinear,
diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py
index db0ccd6958b9..47f338a4a5a8 100644
--- a/vllm/model_executor/models/deepseek_v2.py
+++ b/vllm/model_executor/models/deepseek_v2.py
@@ -33,7 +33,6 @@
 from transformers import DeepseekV2Config, DeepseekV3Config
 
 from vllm._aiter_ops import rocm_aiter_ops
-from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, ParallelConfig, VllmConfig, get_current_vllm_config
 from vllm.distributed import (
@@ -46,6 +45,7 @@
 from vllm.forward_context import get_forward_context
 from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import SiluAndMul
+from vllm.model_executor.layers.attention.attention import Attention
 from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase
 from vllm.model_executor.layers.fused_moe import SharedFusedMoE
 from vllm.model_executor.layers.layernorm import LayerNorm, RMSNorm
diff --git a/vllm/model_executor/models/dots1.py b/vllm/model_executor/models/dots1.py
index b64f163761c8..101815ca3f03 100644
--- a/vllm/model_executor/models/dots1.py
+++ b/vllm/model_executor/models/dots1.py
@@ -32,7 +32,6 @@
 from torch import nn
 from transformers import Dots1Config
 
-from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, ModelConfig, VllmConfig
 from vllm.distributed import (
@@ -41,6 +40,7 @@
     tensor_model_parallel_all_reduce,
 )
 from vllm.model_executor.layers.activation import SiluAndMul
+from vllm.model_executor.layers.attention.attention import Attention
 from vllm.model_executor.layers.fused_moe import SharedFusedMoE
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
diff --git a/vllm/model_executor/models/ernie45_moe.py b/vllm/model_executor/models/ernie45_moe.py
index 8c8cb73b8d6e..00a8ce37e6db 100644
--- a/vllm/model_executor/models/ernie45_moe.py
+++ b/vllm/model_executor/models/ernie45_moe.py
@@ -32,7 +32,6 @@
 from torch import nn
 from transformers import PretrainedConfig
 
-from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig, get_current_vllm_config
 from vllm.distributed import (
@@ -42,6 +41,7 @@
 )
 from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import SiluAndMul
+from vllm.model_executor.layers.attention.attention import Attention
 from vllm.model_executor.layers.fused_moe import SharedFusedMoE
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
diff --git a/vllm/model_executor/models/ernie45_vl_moe.py b/vllm/model_executor/models/ernie45_vl_moe.py
index 75be587eedb2..9c89744d0f2f 100644
--- a/vllm/model_executor/models/ernie45_vl_moe.py
+++ b/vllm/model_executor/models/ernie45_vl_moe.py
@@ -31,12 +31,11 @@
 from torch import nn
 from transformers import PretrainedConfig
 
-from vllm.attention.layer import Attention
-
 # from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.logger import init_logger
+from vllm.model_executor.layers.attention.attention import Attention
 from vllm.model_executor.layers.fused_moe import SharedFusedMoE
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
diff --git a/vllm/model_executor/models/exaone.py b/vllm/model_executor/models/exaone.py
index 039e7cf68e52..691da7644297 100644
--- a/vllm/model_executor/models/exaone.py
+++ b/vllm/model_executor/models/exaone.py
@@ -32,11 +32,11 @@
 from torch import nn
 from transformers import PretrainedConfig
 
-from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import SiluAndMul
+from vllm.model_executor.layers.attention.attention import Attention
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
     MergedColumnParallelLinear,
diff --git a/vllm/model_executor/models/exaone4.py b/vllm/model_executor/models/exaone4.py
index b4b7a798fd05..0faad7feb280 100644
--- a/vllm/model_executor/models/exaone4.py
+++ b/vllm/model_executor/models/exaone4.py
@@ -28,11 +28,11 @@
 from torch import nn
 from transformers import Exaone4Config
 
-from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import SiluAndMul
+from vllm.model_executor.layers.attention.attention import Attention
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
     MergedColumnParallelLinear,
diff --git a/vllm/model_executor/models/falcon.py b/vllm/model_executor/models/falcon.py
index 7cdfcae0e718..3c564012198d 100644
--- a/vllm/model_executor/models/falcon.py
+++ b/vllm/model_executor/models/falcon.py
@@ -30,7 +30,6 @@
 from torch.nn import LayerNorm
 from transformers import FalconConfig as HF_FalconConfig
 
-from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import (
@@ -40,6 +39,7 @@
     tensor_model_parallel_all_reduce,
 )
 from vllm.model_executor.layers.activation import get_act_fn
+from vllm.model_executor.layers.attention.attention import Attention
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
     QKVParallelLinear,
diff --git a/vllm/model_executor/models/falcon_h1.py b/vllm/model_executor/models/falcon_h1.py
index bfb6b1a1f160..285f4fa5ecb5 100644
--- a/vllm/model_executor/models/falcon_h1.py
+++ b/vllm/model_executor/models/falcon_h1.py
@@ -9,12 +9,12 @@
 from torch import nn
 from transformers import FalconH1Config
 
-from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, ModelConfig, VllmConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.distributed.parallel_state import get_pp_group
 from vllm.model_executor.layers.activation import SiluAndMul
+from vllm.model_executor.layers.attention.attention import Attention
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
     MergedColumnParallelLinear,
diff --git a/vllm/model_executor/models/gemma.py b/vllm/model_executor/models/gemma.py
index 7304a728067f..4ec6695d8f55 100644
--- a/vllm/model_executor/models/gemma.py
+++ b/vllm/model_executor/models/gemma.py
@@ -26,12 +26,12 @@
 from torch import nn
 from transformers import GemmaConfig
 
-from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import GeluAndMul
+from vllm.model_executor.layers.attention.attention import Attention
 from vllm.model_executor.layers.layernorm import GemmaRMSNorm
 from vllm.model_executor.layers.linear import (
     MergedColumnParallelLinear,
diff --git a/vllm/model_executor/models/gemma2.py b/vllm/model_executor/models/gemma2.py
index fe6ec5ff83de..3456a7ccb8b9 100644
--- a/vllm/model_executor/models/gemma2.py
+++ b/vllm/model_executor/models/gemma2.py
@@ -23,12 +23,12 @@
 from torch import nn
 from transformers import Gemma2Config
 
-from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import GeluAndMul
+from vllm.model_executor.layers.attention.attention import Attention
 from vllm.model_executor.layers.layernorm import GemmaRMSNorm
 from vllm.model_executor.layers.linear import (
     MergedColumnParallelLinear,
diff --git a/vllm/model_executor/models/gemma3.py b/vllm/model_executor/models/gemma3.py
index c8a0ba8c9d3b..c3c5c8b997e5 100644
--- a/vllm/model_executor/models/gemma3.py
+++ b/vllm/model_executor/models/gemma3.py
@@ -22,12 +22,12 @@
 from torch import nn
 from transformers import Gemma3TextConfig
 
-from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import GeluAndMul
+from vllm.model_executor.layers.attention.attention import Attention
 from vllm.model_executor.layers.attention.encoder_only_attention import (
     EncoderOnlyAttention,
 )
diff --git a/vllm/model_executor/models/gemma3n.py b/vllm/model_executor/models/gemma3n.py
index 4d446f51c2ec..3fe0b14a53ff 100644
--- a/vllm/model_executor/models/gemma3n.py
+++ b/vllm/model_executor/models/gemma3n.py
@@ -21,7 +21,6 @@
 from torch import nn
 from transformers.models.gemma3n.configuration_gemma3n import Gemma3nTextConfig
 
-from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
@@ -32,6 +31,7 @@
     GeluAndMul,
     GeluAndMulSparse,
 )
+from vllm.model_executor.layers.attention.attention import Attention
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
diff --git a/vllm/model_executor/models/glm4.py b/vllm/model_executor/models/glm4.py
index 06da2a8b3498..76a83d23b80a 100644
--- a/vllm/model_executor/models/glm4.py
+++ b/vllm/model_executor/models/glm4.py
@@ -29,10 +29,10 @@
 from torch import nn
 from transformers import Glm4Config
 
-from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
+from vllm.model_executor.layers.attention.attention import Attention
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import QKVParallelLinear, RowParallelLinear
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
diff --git a/vllm/model_executor/models/glm4_moe.py b/vllm/model_executor/models/glm4_moe.py
index efa6c1cfe93c..907924eda4f0 100644
--- a/vllm/model_executor/models/glm4_moe.py
+++ b/vllm/model_executor/models/glm4_moe.py
@@ -32,7 +32,6 @@
 from torch import nn
 from transformers.models.glm4_moe import Glm4MoeConfig
 
-from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig, get_current_vllm_config
 from vllm.distributed import (
@@ -42,6 +41,7 @@
 )
 from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import SiluAndMul
+from vllm.model_executor.layers.attention.attention import Attention
 from vllm.model_executor.layers.fused_moe import SharedFusedMoE
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
diff --git a/vllm/model_executor/models/gpt2.py b/vllm/model_executor/models/gpt2.py
index bacf30d12650..2c6b12f9a03a 100644
--- a/vllm/model_executor/models/gpt2.py
+++ b/vllm/model_executor/models/gpt2.py
@@ -27,7 +27,6 @@
 from torch import nn
 from transformers import GPT2Config
 
-from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed.parallel_state import (
@@ -35,6 +34,7 @@
     get_tensor_model_parallel_world_size,
 )
 from vllm.model_executor.layers.activation import get_act_fn
+from vllm.model_executor.layers.attention.attention import Attention
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
     QKVParallelLinear,
diff --git a/vllm/model_executor/models/gpt_bigcode.py b/vllm/model_executor/models/gpt_bigcode.py
index a405fd184513..db7bddae464f 100644
--- a/vllm/model_executor/models/gpt_bigcode.py
+++ b/vllm/model_executor/models/gpt_bigcode.py
@@ -28,11 +28,11 @@
 from torch import nn
 from transformers import GPTBigCodeConfig
 
-from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import get_act_fn
+from vllm.model_executor.layers.attention.attention import Attention
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
     QKVParallelLinear,
diff --git a/vllm/model_executor/models/gpt_j.py b/vllm/model_executor/models/gpt_j.py
index f32ac2639435..df921ce4b0a6 100644
--- a/vllm/model_executor/models/gpt_j.py
+++ b/vllm/model_executor/models/gpt_j.py
@@ -26,11 +26,11 @@
 from torch import nn
 from transformers import GPTJConfig
 
-from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import get_act_fn
+from vllm.model_executor.layers.attention.attention import Attention
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
     QKVParallelLinear,
diff --git a/vllm/model_executor/models/gpt_neox.py b/vllm/model_executor/models/gpt_neox.py
index d994e380dfef..f2e05632a6ac 100644
--- a/vllm/model_executor/models/gpt_neox.py
+++ b/vllm/model_executor/models/gpt_neox.py
@@ -26,11 +26,11 @@
 from torch import nn
 from transformers import GPTNeoXConfig
 
-from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import get_act_fn
+from vllm.model_executor.layers.attention.attention import Attention
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
     QKVParallelLinear,
diff --git a/vllm/model_executor/models/gpt_oss.py b/vllm/model_executor/models/gpt_oss.py
index 69678188a619..6c25cbb81be1 100644
--- a/vllm/model_executor/models/gpt_oss.py
+++ b/vllm/model_executor/models/gpt_oss.py
@@ -7,7 +7,6 @@
 from torch import nn
 from transformers import GptOssConfig
 
-from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import (
@@ -19,6 +18,7 @@
     get_tensor_model_parallel_world_size,
     tensor_model_parallel_all_gather,
 )
+from vllm.model_executor.layers.attention.attention import Attention
 from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.fused_moe.config import FusedMoEParallelConfig
 from vllm.model_executor.layers.layernorm import RMSNorm
diff --git a/vllm/model_executor/models/granite.py b/vllm/model_executor/models/granite.py
index 82c945f5ad5e..cf26930ddb1c 100644
--- a/vllm/model_executor/models/granite.py
+++ b/vllm/model_executor/models/granite.py
@@ -31,11 +31,11 @@
 from torch import nn
 from transformers import GraniteConfig
 
-from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import SiluAndMul
+from vllm.model_executor.layers.attention.attention import Attention
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
     MergedColumnParallelLinear,
diff --git a/vllm/model_executor/models/granitemoe.py b/vllm/model_executor/models/granitemoe.py
index 237fabff98f7..2ee145936c51 100644
--- a/vllm/model_executor/models/granitemoe.py
+++ b/vllm/model_executor/models/granitemoe.py
@@ -31,7 +31,6 @@
 import torch
 from torch import nn
 
-from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import (
@@ -39,6 +38,7 @@
     get_tensor_model_parallel_world_size,
     tensor_model_parallel_all_gather,
 )
+from vllm.model_executor.layers.attention.attention import Attention
 from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
diff --git a/vllm/model_executor/models/granitemoehybrid.py b/vllm/model_executor/models/granitemoehybrid.py
index 3434716b8378..b4837455d025 100644
--- a/vllm/model_executor/models/granitemoehybrid.py
+++ b/vllm/model_executor/models/granitemoehybrid.py
@@ -9,11 +9,11 @@
 from torch import nn
 from transformers import GraniteMoeHybridConfig
 
-from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, ModelConfig, VllmConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.distributed.parallel_state import get_pp_group
+from vllm.model_executor.layers.attention.attention import Attention
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import QKVParallelLinear, RowParallelLinear
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
diff --git a/vllm/model_executor/models/grok1.py b/vllm/model_executor/models/grok1.py
index 43c658a2c11e..a74464c11383 100644
--- a/vllm/model_executor/models/grok1.py
+++ b/vllm/model_executor/models/grok1.py
@@ -32,12 +32,12 @@
 import torch.nn.functional as F
 from torch import nn
 
-from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import GeluAndMul
+from vllm.model_executor.layers.attention.attention import Attention
 from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
diff --git a/vllm/model_executor/models/hunyuan_v1.py b/vllm/model_executor/models/hunyuan_v1.py
index 1cf6e824fa28..3e603e990cb0 100644
--- a/vllm/model_executor/models/hunyuan_v1.py
+++ b/vllm/model_executor/models/hunyuan_v1.py
@@ -33,7 +33,6 @@
 from torch import nn
 from transformers import PretrainedConfig
 
-from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig, get_current_vllm_config
 from vllm.distributed import (
@@ -43,6 +42,7 @@
     tensor_model_parallel_all_reduce,
 )
 from vllm.model_executor.layers.activation import SiluAndMul
+from vllm.model_executor.layers.attention.attention import Attention
 from vllm.model_executor.layers.fused_moe import SharedFusedMoE
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
diff --git a/vllm/model_executor/models/internlm2.py b/vllm/model_executor/models/internlm2.py
index 37309cd09106..670f6f76f254 100644
--- a/vllm/model_executor/models/internlm2.py
+++ b/vllm/model_executor/models/internlm2.py
@@ -10,7 +10,6 @@
 from torch import nn
 from transformers import PretrainedConfig
 
-from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import (
@@ -21,6 +20,7 @@
     tensor_model_parallel_all_gather,
 )
 from vllm.model_executor.layers.activation import SiluAndMul
+from vllm.model_executor.layers.attention.attention import Attention
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
     MergedColumnParallelLinear,
diff --git a/vllm/model_executor/models/iquest_loopcoder.py b/vllm/model_executor/models/iquest_loopcoder.py
index 1901cc6e81c4..7efe5092b38c 100644
--- a/vllm/model_executor/models/iquest_loopcoder.py
+++ b/vllm/model_executor/models/iquest_loopcoder.py
@@ -24,10 +24,10 @@
 from torch import nn
 from transformers import PretrainedConfig
 
-from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
+from vllm.model_executor.layers.attention.attention import Attention
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
diff --git a/vllm/model_executor/models/jais.py b/vllm/model_executor/models/jais.py
index 6012288814f1..648c71fa89a6 100644
--- a/vllm/model_executor/models/jais.py
+++ b/vllm/model_executor/models/jais.py
@@ -28,7 +28,6 @@
 import torch
 from torch import nn
 
-from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import (
@@ -36,6 +35,7 @@
     get_tensor_model_parallel_rank,
     get_tensor_model_parallel_world_size,
 )
+from vllm.model_executor.layers.attention.attention import Attention
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
     QKVParallelLinear,
diff --git a/vllm/model_executor/models/jais2.py b/vllm/model_executor/models/jais2.py
index aacc4abd43e6..448841444d19 100644
--- a/vllm/model_executor/models/jais2.py
+++ b/vllm/model_executor/models/jais2.py
@@ -31,7 +31,6 @@
 from torch import nn
 from transformers import Jais2Config
 
-from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import (
@@ -39,6 +38,7 @@
     get_tensor_model_parallel_world_size,
 )
 from vllm.model_executor.layers.activation import ReLUSquaredActivation
+from vllm.model_executor.layers.attention.attention import Attention
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
     QKVParallelLinear,
diff --git a/vllm/model_executor/models/jamba.py b/vllm/model_executor/models/jamba.py
index 91b58a83e09a..1beb04415a19 100644
--- a/vllm/model_executor/models/jamba.py
+++ b/vllm/model_executor/models/jamba.py
@@ -9,11 +9,11 @@
 from torch import nn
 from transformers import JambaConfig
 
-from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, ModelConfig, VllmConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.distributed.parallel_state import get_pp_group
+from vllm.model_executor.layers.attention.attention import Attention
 from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
diff --git a/vllm/model_executor/models/lfm2.py b/vllm/model_executor/models/lfm2.py
index 142ad3d6d1d1..5d8dfe0b46b3 100644
--- a/vllm/model_executor/models/lfm2.py
+++ b/vllm/model_executor/models/lfm2.py
@@ -7,11 +7,11 @@
 import torch.nn as nn
 from transformers import Lfm2Config
 
-from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, ModelConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import SiluAndMul
+from vllm.model_executor.layers.attention.attention import Attention
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
     MergedColumnParallelLinear,
diff --git a/vllm/model_executor/models/lfm2_moe.py b/vllm/model_executor/models/lfm2_moe.py
index 6677eb9f93e8..828da18822fd 100644
--- a/vllm/model_executor/models/lfm2_moe.py
+++ b/vllm/model_executor/models/lfm2_moe.py
@@ -6,7 +6,6 @@
 import torch
 import torch.nn as nn
 
-from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, ModelConfig, VllmConfig, get_current_vllm_config
 from vllm.distributed import (
@@ -15,6 +14,7 @@
     get_tensor_model_parallel_world_size,
 )
 from vllm.model_executor.layers.activation import SiluAndMul
+from vllm.model_executor.layers.attention.attention import Attention
 from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py
index 95b5f0f5bf19..72dddae5b988 100644
--- a/vllm/model_executor/models/llama.py
+++ b/vllm/model_executor/models/llama.py
@@ -31,11 +31,11 @@
 from torch import nn
 from transformers import LlamaConfig
 
-from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import SiluAndMul
+from vllm.model_executor.layers.attention.attention import Attention
 from vllm.model_executor.layers.attention.encoder_only_attention import (
     EncoderOnlyAttention,
 )
diff --git a/vllm/model_executor/models/llama4.py b/vllm/model_executor/models/llama4.py
index dde6db7c204b..331901a80261 100644
--- a/vllm/model_executor/models/llama4.py
+++ b/vllm/model_executor/models/llama4.py
@@ -24,7 +24,6 @@
 from torch import nn
 from transformers import Llama4TextConfig
 
-from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import (
@@ -33,6 +32,7 @@
     tensor_model_parallel_all_gather,
 )
 from vllm.logger import init_logger
+from vllm.model_executor.layers.attention.attention import Attention
 from vllm.model_executor.layers.attention.chunked_local_attention import (
     ChunkedLocalAttention,
 )
diff --git a/vllm/model_executor/models/mimo_v2_flash.py b/vllm/model_executor/models/mimo_v2_flash.py
index db85073b38a4..1d79f7ae9c10 100644
--- a/vllm/model_executor/models/mimo_v2_flash.py
+++ b/vllm/model_executor/models/mimo_v2_flash.py
@@ -6,7 +6,6 @@
 import torch
 from torch import nn
 
-from vllm.attention.layer import Attention
 from vllm.config import (
     CacheConfig,
     VllmConfig,
@@ -22,6 +21,7 @@
 )
 from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import SiluAndMul
+from vllm.model_executor.layers.attention.attention import Attention
 from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
diff --git a/vllm/model_executor/models/minicpm.py b/vllm/model_executor/models/minicpm.py
index a05be794a29c..8165ffcc10fb 100644
--- a/vllm/model_executor/models/minicpm.py
+++ b/vllm/model_executor/models/minicpm.py
@@ -33,7 +33,6 @@
 from torch import nn
 from transformers import PretrainedConfig
 
-from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import (
@@ -43,6 +42,7 @@
     tensor_model_parallel_all_reduce,
 )
 from vllm.model_executor.layers.activation import FatreluAndMul, SiluAndMul
+from vllm.model_executor.layers.attention.attention import Attention
 from vllm.model_executor.layers.fused_moe import fused_experts, fused_topk
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
diff --git a/vllm/model_executor/models/minicpm3.py b/vllm/model_executor/models/minicpm3.py
index c7a54cea2154..a8a6ac00774e 100644
--- a/vllm/model_executor/models/minicpm3.py
+++ b/vllm/model_executor/models/minicpm3.py
@@ -29,9 +29,9 @@
 from torch import nn
 from transformers import PretrainedConfig
 
-from vllm.attention.layer import Attention
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
+from vllm.model_executor.layers.attention.attention import Attention
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
diff --git a/vllm/model_executor/models/minimax_m2.py b/vllm/model_executor/models/minimax_m2.py
index 292969db6d03..ea170c370c90 100644
--- a/vllm/model_executor/models/minimax_m2.py
+++ b/vllm/model_executor/models/minimax_m2.py
@@ -30,7 +30,6 @@
 from torch import nn
 from transformers import PretrainedConfig
 
-from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, ModelConfig, VllmConfig
 from vllm.distributed import (
@@ -38,6 +37,7 @@
     get_tensor_model_parallel_world_size,
     tensor_model_parallel_all_reduce,
 )
+from vllm.model_executor.layers.attention.attention import Attention
 from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
diff --git a/vllm/model_executor/models/minimax_text_01.py b/vllm/model_executor/models/minimax_text_01.py
index 955a73ff19ed..36260b1af954 100644
--- a/vllm/model_executor/models/minimax_text_01.py
+++ b/vllm/model_executor/models/minimax_text_01.py
@@ -14,7 +14,6 @@
 from torch import nn
 from transformers import MiniMaxConfig
 
-from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, ModelConfig, VllmConfig
 from vllm.distributed.parallel_state import (
@@ -24,6 +23,7 @@
 )
 from vllm.forward_context import get_forward_context
 from vllm.model_executor.layers.activation import SiluAndMul
+from vllm.model_executor.layers.attention.attention import Attention
 from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py
index 89dab5f3cb8e..395dcc46b1b6 100644
--- a/vllm/model_executor/models/mixtral.py
+++ b/vllm/model_executor/models/mixtral.py
@@ -32,7 +32,6 @@
 from torch import nn
 from transformers import MixtralConfig
 
-from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig, get_current_vllm_config
 from vllm.distributed import (
@@ -40,6 +39,7 @@
     get_pp_group,
     get_tensor_model_parallel_world_size,
 )
+from vllm.model_executor.layers.attention.attention import Attention
 from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py
index bdfa6178b4e3..e189ca7643a9 100644
--- a/vllm/model_executor/models/molmo.py
+++ b/vllm/model_executor/models/molmo.py
@@ -17,7 +17,6 @@
 from transformers.image_utils import ImageInput
 from transformers.tokenization_utils_base import TextInput
 
-from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.config.multimodal import BaseDummyOptions
@@ -29,6 +28,7 @@
     tensor_model_parallel_all_gather,
 )
 from vllm.model_executor.layers.activation import MulAndSilu, QuickGELU, SiluAndMul
+from vllm.model_executor.layers.attention.attention import Attention
 from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
diff --git a/vllm/model_executor/models/mpt.py b/vllm/model_executor/models/mpt.py
index 1e285646b9ec..2148e9004b52 100644
--- a/vllm/model_executor/models/mpt.py
+++ b/vllm/model_executor/models/mpt.py
@@ -10,7 +10,6 @@
 import torch.nn as nn
 from transformers import MptConfig
 
-from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import (
@@ -19,6 +18,7 @@
     get_tensor_model_parallel_world_size,
 )
 from vllm.model_executor.layers.activation import get_act_fn
+from vllm.model_executor.layers.attention.attention import Attention
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
     QKVParallelLinear,
diff --git a/vllm/model_executor/models/nemotron.py b/vllm/model_executor/models/nemotron.py
index 21605015c470..52981cc52ce8 100644
--- a/vllm/model_executor/models/nemotron.py
+++ b/vllm/model_executor/models/nemotron.py
@@ -30,11 +30,11 @@
 import torch
 from torch import nn
 
-from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import get_act_fn
+from vllm.model_executor.layers.attention.attention import Attention
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
     QKVParallelLinear,
diff --git a/vllm/model_executor/models/nemotron_h.py b/vllm/model_executor/models/nemotron_h.py
index aff1d5fd4107..f05ae0f71320 100644
--- a/vllm/model_executor/models/nemotron_h.py
+++ b/vllm/model_executor/models/nemotron_h.py
@@ -25,7 +25,6 @@
 import torch
 from torch import nn
 
-from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, ModelConfig, VllmConfig
 from vllm.config.parallel import ParallelConfig
@@ -33,6 +32,7 @@
 from vllm.distributed.communication_op import tensor_model_parallel_all_gather
 from vllm.distributed.parallel_state import get_pp_group
 from vllm.model_executor.layers.activation import ReLUSquaredActivation
+from vllm.model_executor.layers.attention.attention import Attention
 from vllm.model_executor.layers.fused_moe import FusedMoE, SharedFusedMoE
 from vllm.model_executor.layers.fused_moe.utils import activation_without_mul
 from vllm.model_executor.layers.layernorm import RMSNorm
diff --git a/vllm/model_executor/models/olmo.py b/vllm/model_executor/models/olmo.py
index dd7c27f10c53..b22086e02d05 100644
--- a/vllm/model_executor/models/olmo.py
+++ b/vllm/model_executor/models/olmo.py
@@ -31,11 +31,11 @@
 from torch import nn
 from transformers import OlmoConfig
 
-from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import SiluAndMul
+from vllm.model_executor.layers.attention.attention import Attention
 from vllm.model_executor.layers.linear import (
     MergedColumnParallelLinear,
     QKVParallelLinear,
diff --git a/vllm/model_executor/models/olmo2.py b/vllm/model_executor/models/olmo2.py
index b030c94b54cd..0814d69e008c 100644
--- a/vllm/model_executor/models/olmo2.py
+++ b/vllm/model_executor/models/olmo2.py
@@ -32,7 +32,6 @@
 from torch import nn
 from transformers import Olmo2Config
 
-from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
@@ -40,6 +39,7 @@
 from vllm.distributed.parallel_state import get_tensor_model_parallel_rank
 from vllm.distributed.utils import split_tensor_along_last_dim
 from vllm.model_executor.layers.activation import SiluAndMul
+from vllm.model_executor.layers.attention.attention import Attention
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
     MergedColumnParallelLinear,
diff --git a/vllm/model_executor/models/olmoe.py b/vllm/model_executor/models/olmoe.py
index b4cf98de1810..3f8fe73b4abb 100644
--- a/vllm/model_executor/models/olmoe.py
+++ b/vllm/model_executor/models/olmoe.py
@@ -21,7 +21,6 @@
 import torch
 from torch import nn
 
-from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import VllmConfig
 from vllm.distributed import (
@@ -32,6 +31,7 @@
 )
 from vllm.distributed.utils import split_tensor_along_last_dim
 from vllm.logger import init_logger
+from vllm.model_executor.layers.attention.attention import Attention
 from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
diff --git a/vllm/model_executor/models/openpangu.py b/vllm/model_executor/models/openpangu.py
index 9f569bcc71cf..fe5886142d8b 100644
--- a/vllm/model_executor/models/openpangu.py
+++ b/vllm/model_executor/models/openpangu.py
@@ -29,7 +29,6 @@
 from torch import nn
 from transformers import PretrainedConfig
 
-from vllm.attention.layer import Attention, AttentionType
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, ParallelConfig, VllmConfig
 from vllm.distributed import (
@@ -41,6 +40,7 @@
     tensor_model_parallel_all_gather,
 )
 from vllm.model_executor.layers.activation import SiluAndMul
+from vllm.model_executor.layers.attention.attention import Attention
 from vllm.model_executor.layers.attention.static_sink_attention import (
     StaticSinkAttention,
 )
@@ -84,6 +84,7 @@
 from vllm.platforms import current_platform
 from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.config import set_default_rope_theta
+from vllm.v1.attention.backend import AttentionType
 from vllm.v1.attention.backends.flash_attn_diffkv import FlashAttentionDiffKVBackend
 
diff --git a/vllm/model_executor/models/opt.py b/vllm/model_executor/models/opt.py
index bba5291ea5ef..bb6c02344ec3 100644
--- a/vllm/model_executor/models/opt.py
+++ b/vllm/model_executor/models/opt.py
@@ -27,11 +27,11 @@
 from torch import nn
 from transformers import
OPTConfig -from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import get_act_fn +from vllm.model_executor.layers.attention.attention import Attention from vllm.model_executor.layers.linear import ( ColumnParallelLinear, QKVParallelLinear, diff --git a/vllm/model_executor/models/orion.py b/vllm/model_executor/models/orion.py index 9d9066c4ba61..376992806f54 100644 --- a/vllm/model_executor/models/orion.py +++ b/vllm/model_executor/models/orion.py @@ -15,11 +15,11 @@ from torch import nn from transformers import PretrainedConfig -from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.attention.attention import Attention from vllm.model_executor.layers.linear import ( MergedColumnParallelLinear, QKVParallelLinear, diff --git a/vllm/model_executor/models/ouro.py b/vllm/model_executor/models/ouro.py index f51c0f095072..cd6a61141112 100644 --- a/vllm/model_executor/models/ouro.py +++ b/vllm/model_executor/models/ouro.py @@ -33,11 +33,11 @@ from torch import nn from transformers import PretrainedConfig -from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.attention.attention import Attention from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( MergedColumnParallelLinear, diff --git a/vllm/model_executor/models/persimmon.py b/vllm/model_executor/models/persimmon.py index b644603c5baa..9212211e3f8e 100644 --- a/vllm/model_executor/models/persimmon.py +++ b/vllm/model_executor/models/persimmon.py @@ -30,11 +30,11 @@ from torch import nn from transformers import PersimmonConfig -from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import get_act_fn +from vllm.model_executor.layers.attention.attention import Attention from vllm.model_executor.layers.linear import ( ColumnParallelLinear, QKVParallelLinear, diff --git a/vllm/model_executor/models/phi.py b/vllm/model_executor/models/phi.py index e01e9d47c545..85fa02919d80 100644 --- a/vllm/model_executor/models/phi.py +++ b/vllm/model_executor/models/phi.py @@ -45,11 +45,11 @@ from torch import nn from transformers import PhiConfig -from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import get_act_fn +from vllm.model_executor.layers.attention.attention import Attention from vllm.model_executor.layers.linear import ( ColumnParallelLinear, QKVParallelLinear, diff --git a/vllm/model_executor/models/phimoe.py 
b/vllm/model_executor/models/phimoe.py index 835f360df058..ecd03c60a981 100644 --- a/vllm/model_executor/models/phimoe.py +++ b/vllm/model_executor/models/phimoe.py @@ -31,10 +31,10 @@ from torch import nn from transformers.configuration_utils import PretrainedConfig -from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size +from vllm.model_executor.layers.attention.attention import Attention from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.linear import ( QKVParallelLinear, diff --git a/vllm/model_executor/models/plamo2.py b/vllm/model_executor/models/plamo2.py index 225e131ec764..41b0daf0cfe6 100644 --- a/vllm/model_executor/models/plamo2.py +++ b/vllm/model_executor/models/plamo2.py @@ -9,7 +9,6 @@ from torch import nn from transformers import PretrainedConfig -from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import VllmConfig, get_current_vllm_config from vllm.distributed import divide, get_tensor_model_parallel_world_size @@ -17,6 +16,7 @@ from vllm.forward_context import ForwardContext, get_forward_context from vllm.model_executor.custom_op import CustomOp from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.attention.attention import Attention from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( ColumnParallelLinear, diff --git a/vllm/model_executor/models/plamo3.py b/vllm/model_executor/models/plamo3.py index 3550c9fa7f65..1c663e219e5e 100644 --- a/vllm/model_executor/models/plamo3.py +++ b/vllm/model_executor/models/plamo3.py @@ -10,12 +10,12 @@ from torch import nn from transformers import PretrainedConfig -from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import VllmConfig from vllm.distributed import get_tensor_model_parallel_world_size from vllm.distributed.parallel_state import get_pp_group from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.attention.attention import Attention from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( MergedColumnParallelLinear, diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py index 50b53a1ff039..ffead3127136 100644 --- a/vllm/model_executor/models/qwen.py +++ b/vllm/model_executor/models/qwen.py @@ -16,11 +16,11 @@ from torch import nn from transformers import PretrainedConfig -from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.attention.attention import Attention from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( MergedColumnParallelLinear, diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index ab9eac1a94fd..6be09fea50a9 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -33,11 +33,11 @@ from torch import nn from transformers import Qwen2Config -from 
vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.attention.attention import Attention from vllm.model_executor.layers.attention.encoder_only_attention import ( EncoderOnlyAttention, ) diff --git a/vllm/model_executor/models/qwen2_moe.py b/vllm/model_executor/models/qwen2_moe.py index fbfd681d59e5..e36a5f04e75f 100644 --- a/vllm/model_executor/models/qwen2_moe.py +++ b/vllm/model_executor/models/qwen2_moe.py @@ -34,12 +34,12 @@ from torch import nn from transformers import Qwen2MoeConfig -from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.logger import init_logger from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.attention.attention import Attention from vllm.model_executor.layers.fused_moe import SharedFusedMoE from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( diff --git a/vllm/model_executor/models/qwen3.py b/vllm/model_executor/models/qwen3.py index 707e0ccfd3c5..0c7f0dca1022 100644 --- a/vllm/model_executor/models/qwen3.py +++ b/vllm/model_executor/models/qwen3.py @@ -30,11 +30,11 @@ from torch import nn from transformers import Qwen3Config -from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.logger import init_logger +from vllm.model_executor.layers.attention.attention import Attention from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import QKVParallelLinear, RowParallelLinear from vllm.model_executor.layers.logits_processor import LogitsProcessor diff --git a/vllm/model_executor/models/qwen3_moe.py b/vllm/model_executor/models/qwen3_moe.py index f2f3546047aa..b63ee2c18c16 100644 --- a/vllm/model_executor/models/qwen3_moe.py +++ b/vllm/model_executor/models/qwen3_moe.py @@ -31,7 +31,6 @@ import torch from torch import nn -from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig, get_current_vllm_config from vllm.distributed import ( @@ -42,6 +41,7 @@ ) from vllm.logger import init_logger from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.attention.attention import Attention from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.fused_moe.config import RoutingMethodType from vllm.model_executor.layers.layernorm import RMSNorm diff --git a/vllm/model_executor/models/qwen3_next.py b/vllm/model_executor/models/qwen3_next.py index c3e45de707c5..d5b27796f33b 100644 --- a/vllm/model_executor/models/qwen3_next.py +++ b/vllm/model_executor/models/qwen3_next.py @@ -10,7 +10,6 @@ from torch import nn from transformers.activations import ACT2FN -from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import ( CacheConfig, @@ -29,6 +28,7 @@ ) from vllm.forward_context import 
ForwardContext, get_forward_context from vllm.logger import init_logger +from vllm.model_executor.layers.attention.attention import Attention from vllm.model_executor.layers.fla.ops import ( chunk_gated_delta_rule, fused_recurrent_gated_delta_rule, diff --git a/vllm/model_executor/models/seed_oss.py b/vllm/model_executor/models/seed_oss.py index 91a60bfd1668..4440f8fb64b5 100644 --- a/vllm/model_executor/models/seed_oss.py +++ b/vllm/model_executor/models/seed_oss.py @@ -30,12 +30,12 @@ from torch import nn from transformers import PretrainedConfig as SeedOssConfig -from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.logger import init_logger from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.attention.attention import Attention from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( MergedColumnParallelLinear, diff --git a/vllm/model_executor/models/solar.py b/vllm/model_executor/models/solar.py index 964aa902704b..0aa3fc25969d 100644 --- a/vllm/model_executor/models/solar.py +++ b/vllm/model_executor/models/solar.py @@ -30,11 +30,11 @@ from torch import nn from transformers import PretrainedConfig -from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.attention.attention import Attention from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( MergedColumnParallelLinear, diff --git a/vllm/model_executor/models/stablelm.py b/vllm/model_executor/models/stablelm.py index ea4342882feb..3e05f0adcac4 100644 --- a/vllm/model_executor/models/stablelm.py +++ b/vllm/model_executor/models/stablelm.py @@ -29,10 +29,10 @@ from torch import nn from transformers import StableLmConfig -from vllm.attention.layer import Attention from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.attention.attention import Attention from vllm.model_executor.layers.linear import ( MergedColumnParallelLinear, QKVParallelLinear, diff --git a/vllm/model_executor/models/starcoder2.py b/vllm/model_executor/models/starcoder2.py index 569ca9b082cf..da0d8ce21461 100644 --- a/vllm/model_executor/models/starcoder2.py +++ b/vllm/model_executor/models/starcoder2.py @@ -28,11 +28,11 @@ from torch import nn from transformers import Starcoder2Config -from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import get_act_fn +from vllm.model_executor.layers.attention.attention import Attention from vllm.model_executor.layers.linear import ( ColumnParallelLinear, QKVParallelLinear, diff --git a/vllm/model_executor/models/step3_text.py b/vllm/model_executor/models/step3_text.py index 7077f1a22e8d..40ebb6a522f3 100644 --- a/vllm/model_executor/models/step3_text.py 
+++ b/vllm/model_executor/models/step3_text.py @@ -9,7 +9,6 @@ import torch from torch import nn -from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import ( @@ -19,6 +18,7 @@ ) from vllm.logger import init_logger from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.attention.attention import Attention from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( diff --git a/vllm/model_executor/models/transformers/base.py b/vllm/model_executor/models/transformers/base.py index d094bb2895f2..e8460a76093f 100644 --- a/vllm/model_executor/models/transformers/base.py +++ b/vllm/model_executor/models/transformers/base.py @@ -27,11 +27,11 @@ from transformers import AutoModel from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS -from vllm.attention.layer import Attention from vllm.config.utils import getattr_iter from vllm.distributed import get_pp_group, get_tp_group from vllm.distributed.utils import get_pp_indices from vllm.logger import init_logger +from vllm.model_executor.layers.attention.attention import Attention from vllm.model_executor.layers.attention.encoder_only_attention import ( EncoderOnlyAttention, ) diff --git a/vllm/model_executor/models/whisper.py b/vllm/model_executor/models/whisper.py index 14d646f85876..b68e0e8f90c4 100644 --- a/vllm/model_executor/models/whisper.py +++ b/vllm/model_executor/models/whisper.py @@ -18,13 +18,13 @@ ) from transformers.models.whisper.modeling_whisper import sinusoids -from vllm.attention.layer import Attention from vllm.config import CacheConfig, ModelConfig, SpeechToTextConfig, VllmConfig from vllm.config.multimodal import BaseDummyOptions from vllm.distributed import get_tensor_model_parallel_world_size from vllm.inputs.data import PromptType from vllm.logger import init_logger from vllm.model_executor.layers.activation import get_act_fn +from vllm.model_executor.layers.attention.attention import Attention from vllm.model_executor.layers.attention.cross_attention import CrossAttention from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention from vllm.model_executor.layers.linear import ( diff --git a/vllm/model_executor/models/whisper_utils.py b/vllm/model_executor/models/whisper_utils.py index 0bd0db061541..db3144d2f075 100644 --- a/vllm/model_executor/models/whisper_utils.py +++ b/vllm/model_executor/models/whisper_utils.py @@ -9,8 +9,8 @@ import torch.nn.functional as F from torch import nn -from vllm.attention.layer import Attention from vllm.config import CacheConfig, VllmConfig +from vllm.model_executor.layers.attention.attention import Attention from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.v1.attention.backend import ( AttentionBackend, diff --git a/vllm/model_executor/models/zamba2.py b/vllm/model_executor/models/zamba2.py index b5132cd86024..f23421610273 100644 --- a/vllm/model_executor/models/zamba2.py +++ b/vllm/model_executor/models/zamba2.py @@ -16,11 +16,11 @@ from torch import nn from transformers import Zamba2Config -from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import get_tensor_model_parallel_world_size from 
vllm.model_executor.layers.activation import GeluAndMul +from vllm.model_executor.layers.attention.attention import Attention from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( ColumnParallelLinear, diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py index aa51c1a4301f..8139ca87c360 100755 --- a/vllm/v1/attention/backends/flash_attn.py +++ b/vllm/v1/attention/backends/flash_attn.py @@ -9,7 +9,7 @@ import numpy as np import torch -from vllm.attention.layer import Attention +from vllm.model_executor.layers.attention.attention import Attention from vllm.v1.attention.backend import ( AttentionBackend, AttentionImpl, diff --git a/vllm/v1/attention/backends/rocm_aiter_fa.py b/vllm/v1/attention/backends/rocm_aiter_fa.py index da14a848447d..834c492a2b07 100644 --- a/vllm/v1/attention/backends/rocm_aiter_fa.py +++ b/vllm/v1/attention/backends/rocm_aiter_fa.py @@ -7,9 +7,9 @@ import torch -from vllm.attention.layer import Attention from vllm.config import VllmConfig, get_layers_from_vllm_config from vllm.logger import init_logger +from vllm.model_executor.layers.attention.attention import Attention from vllm.platforms import current_platform from vllm.utils.math_utils import cdiv from vllm.utils.platform_utils import get_cu_count diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 1fca06b5f112..3996cc32ce9e 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -20,7 +20,6 @@ from tqdm import tqdm import vllm.envs as envs -from vllm.attention.layer import Attention, MLAAttention from vllm.compilation.counter import compilation_counter from vllm.compilation.cuda_graph import CUDAGraphStat, CUDAGraphWrapper from vllm.compilation.monitor import set_cudagraph_capturing_enabled @@ -49,6 +48,8 @@ ) from vllm.logger import init_logger from vllm.lora.layers import LoRAMapping, LoRAMappingType +from vllm.model_executor.layers.attention.attention import Attention +from vllm.model_executor.layers.attention.mla_attention import MLAAttention from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase from vllm.model_executor.layers.rotary_embedding import ( MRotaryEmbedding, diff --git a/vllm/v1/worker/utils.py b/vllm/v1/worker/utils.py index 85acc16795e2..93f901cf261a 100644 --- a/vllm/v1/worker/utils.py +++ b/vllm/v1/worker/utils.py @@ -7,9 +7,9 @@ import torch from typing_extensions import deprecated -from vllm.attention.layer import Attention from vllm.config import CacheConfig, ModelConfig, SchedulerConfig, VllmConfig from vllm.logger import init_logger +from vllm.model_executor.layers.attention.attention import Attention from vllm.model_executor.models.interfaces import MultiModalEmbeddings from vllm.model_executor.models.utils import extract_layer_index from vllm.multimodal.cache import processor_only_cache_from_config From 47f80e99ef3dd0a90cfff076f26814b6f42ed215 Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Mon, 12 Jan 2026 11:10:30 -0500 Subject: [PATCH 05/18] Move helpers, delete layer.py Signed-off-by: Matthew Bonanni --- vllm/attention/layer.py | 180 ------------------ .../layers/attention/attention.py | 174 ++++++++++++++++- .../layers/attention/mla_attention.py | 10 + 3 files changed, 183 insertions(+), 181 deletions(-) delete mode 100644 vllm/attention/layer.py diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py deleted file mode 100644 index b12c8c67fea8..000000000000 --- 
a/vllm/attention/layer.py +++ /dev/null @@ -1,180 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Attention layer.""" - -import torch -import torch.nn as nn - -from vllm.forward_context import ForwardContext, get_forward_context -from vllm.logger import init_logger -from vllm.model_executor.layers.linear import ( - UnquantizedLinearMethod, -) -from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.quantization.base_config import QuantizeMethodBase -from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod -from vllm.utils.torch_utils import ( - direct_register_custom_op, -) - -logger = init_logger(__name__) - - -def should_load_quant_weights(quant_method: QuantizeMethodBase | None) -> bool: - """Returns whether the quantization method should load quantized weights.""" - return quant_method is not None and not isinstance( - quant_method, UnquantizedLinearMethod - ) - - -def set_default_quant_scales(layer: nn.Module, register_buffer: bool = False) -> None: - """Sets default quantization scales for the layer.""" - if register_buffer: - layer.register_buffer("_k_scale", torch.tensor(1.0, dtype=torch.float32)) - layer.register_buffer("_v_scale", torch.tensor(1.0, dtype=torch.float32)) - layer.register_buffer("_q_scale", torch.tensor(1.0, dtype=torch.float32)) - layer.register_buffer("_prob_scale", torch.tensor(1.0, dtype=torch.float32)) - else: - layer._k_scale.fill_(1.0) - layer._v_scale.fill_(1.0) - layer._q_scale.fill_(1.0) - layer._prob_scale.fill_(1.0) - - # We also keep q/k/v_scale on host (cpu) memory for attention - # backends that require the scales to be on host instead of on device. - # e.g. Flashinfer - layer._q_scale_float = 1.0 - layer._k_scale_float = 1.0 - layer._v_scale_float = 1.0 - layer._prob_scale_float = 1.0 - - -def _init_kv_cache_quant( - layer: nn.Module, - quant_config: QuantizationConfig | None, - prefix: str, - kv_cache_dtype: str, - calculate_kv_scales: bool, -) -> None: - """Initializes KV cache scaling factors and quantization method. - - This helper function sets up the KV cache quantization attributes that are - shared between Attention and MLAAttention layers. It initializes scale - tensors for query, key, value, and probability, and configures the - quantization method if applicable. - - Args: - layer: The attention layer instance to initialize. - quant_config: Optional quantization configuration. - prefix: Layer name prefix for quantization method lookup. - kv_cache_dtype: The KV cache data type string. - calculate_kv_scales: Whether to calculate KV scales dynamically. - """ - # The default k/v_scale is set to 1.0. This is ignored - # when kv-cache is not fp8, and should be used with - # kv-cache in fp8_e5m2. For kv-cache in fp8_e4m3, we - # expect the pre-quantized k/v_scale to be loaded along - # with the model weights. - layer.kv_cache_dtype = kv_cache_dtype - layer.calculate_kv_scales = calculate_kv_scales - - # Note [Register q/k/v/prob scales in state dict] - # When calling model.to(device), only parameters/buffers in state dict are - # moved. If not registering q/k/v/prob scales in state dict, there would - # be an IMA error when a cuda kernel (e.g., quant_fp8) accesses the tensor - # on cpu. - # Registering in state dict means it interacts with weight loading. One edge - # case is when quant_method is None, or quant_method is UnquantizedLinearMethod - # (i.e., should_load_quant_weights(quant_method) == False). 
- # In this case, the checkpoint does not have the scales. We need to - # initialize the scales to 1.0 and update the scales after weight loading. - # This is espectially important when we load dummy weights first (providing - # wrong scales) and then load real weights (which misses scales and keeps the - # wrong scales from dummy load). - set_default_quant_scales(layer, register_buffer=True) - - # The output scale on host memory. This should be the input scale of - # the quant op after this attention layer. - layer._o_scale_float = None - - quant_method = ( - quant_config.get_quant_method(layer, prefix=prefix) if quant_config else None - ) - - # See [Note: Register q/k/v/prob scales in state dict] - if should_load_quant_weights(quant_method): - assert isinstance(quant_method, BaseKVCacheMethod) - # TODO (mgoin): kv cache dtype should be specified in the FP8 - # checkpoint config and become the "auto" behavior - if kv_cache_dtype == "fp8_e5m2": - raise ValueError("fp8_e5m2 kv-cache is not supported with fp8 checkpoints.") - # If quantization is enabled, we make "k_scale" and "v_scale" - # parameters so that it can be loaded from the model checkpoint. - # The k/v_scale will then be converted back to native float32 - # values after weight loading. - layer.quant_method = quant_method - layer.quant_method.create_weights(layer) - - -def maybe_calc_kv_scales( - query: torch.Tensor, - key: torch.Tensor, - value: torch.Tensor, - layer_name: str, -) -> None: - forward_context: ForwardContext = get_forward_context() - self = forward_context.no_compile_layers[layer_name] - - # Only calculate if the layer's calculate_kv_scales flag is True - # This flag gets set to False after the first forward pass - if not self.calculate_kv_scales: - return - - self.calc_kv_scales(query, key, value) - - -def maybe_calc_kv_scales_fake( - query: torch.Tensor, - key: torch.Tensor, - value: torch.Tensor, - layer_name: str, -) -> None: - return - - -direct_register_custom_op( - op_name="maybe_calc_kv_scales", - op_func=maybe_calc_kv_scales, - mutates_args=["query", "key", "value"], - fake_impl=maybe_calc_kv_scales_fake, -) - - -def get_attention_context( - layer_name: str, -) -> tuple[dict | object | None, Attention | MLAAttention, torch.Tensor]: - """Extract attention context for a given layer. - - This helper function extracts the attention metadata, attention layer - instance, and KV cache tensor for a specific layer. - - Args: - layer_name: The name/identifier of the attention layer. - - Returns: - A tuple containing: - - attn_metadata: Attention metadata for this specific layer, or None if - no metadata available - - attn_layer: The attention layer instance (Attention or MLAAttention) - - kv_cache: The KV cache tensor for current virtual engine - - Note: attn_metadata may be None, but attn_layer and kv_cache are always - extracted from the forward context. 
- """ - forward_context: ForwardContext = get_forward_context() - attn_metadata = forward_context.attn_metadata - if isinstance(attn_metadata, dict): - attn_metadata = attn_metadata[layer_name] - attn_layer: Attention | MLAAttention = forward_context.no_compile_layers[layer_name] - kv_cache = attn_layer.kv_cache[forward_context.virtual_engine] - return attn_metadata, attn_layer, kv_cache diff --git a/vllm/model_executor/layers/attention/attention.py b/vllm/model_executor/layers/attention/attention.py index 88ec63a809ee..5e751a95156c 100644 --- a/vllm/model_executor/layers/attention/attention.py +++ b/vllm/model_executor/layers/attention/attention.py @@ -1,19 +1,28 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from typing import TYPE_CHECKING + import torch import torch.nn as nn import vllm.envs as envs -from vllm.attention.utils.kv_transfer_utils import maybe_transfer_kv_layer from vllm.config import CacheConfig, get_current_vllm_config from vllm.config.vllm import VllmConfig from vllm.forward_context import ForwardContext, get_forward_context from vllm.logger import init_logger +from vllm.model_executor.layers.attention.kv_transfer_utils import ( + maybe_transfer_kv_layer, +) from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase from vllm.model_executor.layers.batch_invariant import vllm_is_batch_invariant +from vllm.model_executor.layers.linear import ( + UnquantizedLinearMethod, +) from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.layers.quantization.base_config import QuantizeMethodBase from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8 +from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape from vllm.platforms import current_platform from vllm.utils.torch_utils import ( @@ -32,6 +41,9 @@ SlidingWindowSpec, ) +if TYPE_CHECKING: + from vllm.model_executor.layers.attention.mla_attention import MLAAttention + logger = init_logger(__name__) @@ -68,6 +80,136 @@ def validate_kv_sharing_target( ) +def should_load_quant_weights(quant_method: QuantizeMethodBase | None) -> bool: + """Returns whether the quantization method should load quantized weights.""" + return quant_method is not None and not isinstance( + quant_method, UnquantizedLinearMethod + ) + + +def set_default_quant_scales(layer: nn.Module, register_buffer: bool = False) -> None: + """Sets default quantization scales for the layer.""" + if register_buffer: + layer.register_buffer("_k_scale", torch.tensor(1.0, dtype=torch.float32)) + layer.register_buffer("_v_scale", torch.tensor(1.0, dtype=torch.float32)) + layer.register_buffer("_q_scale", torch.tensor(1.0, dtype=torch.float32)) + layer.register_buffer("_prob_scale", torch.tensor(1.0, dtype=torch.float32)) + else: + layer._k_scale.fill_(1.0) + layer._v_scale.fill_(1.0) + layer._q_scale.fill_(1.0) + layer._prob_scale.fill_(1.0) + + # We also keep q/k/v_scale on host (cpu) memory for attention + # backends that require the scales to be on host instead of on device. + # e.g. 
Flashinfer
+ layer._q_scale_float = 1.0
+ layer._k_scale_float = 1.0
+ layer._v_scale_float = 1.0
+ layer._prob_scale_float = 1.0
+
+
+def _init_kv_cache_quant(
+ layer: nn.Module,
+ quant_config: QuantizationConfig | None,
+ prefix: str,
+ kv_cache_dtype: str,
+ calculate_kv_scales: bool,
+) -> None:
+ """Initializes KV cache scaling factors and quantization method.
+
+ This helper function sets up the KV cache quantization attributes that are
+ shared between Attention and MLAAttention layers. It initializes scale
+ tensors for query, key, value, and probability, and configures the
+ quantization method if applicable.
+
+ Args:
+ layer: The attention layer instance to initialize.
+ quant_config: Optional quantization configuration.
+ prefix: Layer name prefix for quantization method lookup.
+ kv_cache_dtype: The KV cache data type string.
+ calculate_kv_scales: Whether to calculate KV scales dynamically.
+ """
+ # The default k/v_scale is set to 1.0. This is ignored
+ # when kv-cache is not fp8, and should be used with
+ # kv-cache in fp8_e5m2. For kv-cache in fp8_e4m3, we
+ # expect the pre-quantized k/v_scale to be loaded along
+ # with the model weights.
+ layer.kv_cache_dtype = kv_cache_dtype
+ layer.calculate_kv_scales = calculate_kv_scales
+
+ # Note [Register q/k/v/prob scales in state dict]
+ # When calling model.to(device), only parameters/buffers in state dict are
+ # moved. If not registering q/k/v/prob scales in state dict, there would
+ # be an IMA error when a cuda kernel (e.g., quant_fp8) accesses the tensor
+ # on cpu.
+ # Registering in state dict means it interacts with weight loading. One edge
+ # case is when quant_method is None, or quant_method is UnquantizedLinearMethod
+ # (i.e., should_load_quant_weights(quant_method) == False).
+ # In this case, the checkpoint does not have the scales. We need to
+ # initialize the scales to 1.0 and update the scales after weight loading.
+ # This is especially important when we load dummy weights first (providing
+ # wrong scales) and then load real weights (which miss scales and keep the
+ # wrong scales from the dummy load).
+ set_default_quant_scales(layer, register_buffer=True)
+
+ # The output scale on host memory. This should be the input scale of
+ # the quant op after this attention layer.
+ layer._o_scale_float = None
+
+ quant_method = (
+ quant_config.get_quant_method(layer, prefix=prefix) if quant_config else None
+ )
+
+ # See [Note: Register q/k/v/prob scales in state dict]
+ if should_load_quant_weights(quant_method):
+ assert isinstance(quant_method, BaseKVCacheMethod)
+ # TODO (mgoin): kv cache dtype should be specified in the FP8
+ # checkpoint config and become the "auto" behavior
+ if kv_cache_dtype == "fp8_e5m2":
+ raise ValueError("fp8_e5m2 kv-cache is not supported with fp8 checkpoints.")
+ # If quantization is enabled, we make "k_scale" and "v_scale"
+ # parameters so that they can be loaded from the model checkpoint.
+ # The k/v_scale will then be converted back to native float32
+ # values after weight loading.
+ layer.quant_method = quant_method + layer.quant_method.create_weights(layer) + + +def maybe_calc_kv_scales( + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + layer_name: str, +) -> None: + forward_context: ForwardContext = get_forward_context() + self = forward_context.no_compile_layers[layer_name] + + # Only calculate if the layer's calculate_kv_scales flag is True + # This flag gets set to False after the first forward pass + if not self.calculate_kv_scales: + return + + self.calc_kv_scales(query, key, value) + + +def maybe_calc_kv_scales_fake( + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + layer_name: str, +) -> None: + return + + +direct_register_custom_op( + op_name="maybe_calc_kv_scales", + op_func=maybe_calc_kv_scales, + mutates_args=["query", "key", "value"], + fake_impl=maybe_calc_kv_scales_fake, +) + + class Attention(nn.Module, AttentionLayerBase): """Attention layer. @@ -390,6 +532,36 @@ def get_kv_cache_spec(self, vllm_config: VllmConfig) -> KVCacheSpec: ) +def get_attention_context( + layer_name: str, +) -> tuple[dict | object | None, "Attention | MLAAttention", torch.Tensor]: + """Extract attention context for a given layer. + + This helper function extracts the attention metadata, attention layer + instance, and KV cache tensor for a specific layer. + + Args: + layer_name: The name/identifier of the attention layer. + + Returns: + A tuple containing: + - attn_metadata: Attention metadata for this specific layer, or None if + no metadata available + - attn_layer: The attention layer instance (Attention or MLAAttention) + - kv_cache: The KV cache tensor for current virtual engine + + Note: attn_metadata may be None, but attn_layer and kv_cache are always + extracted from the forward context. + """ + forward_context: ForwardContext = get_forward_context() + attn_metadata = forward_context.attn_metadata + if isinstance(attn_metadata, dict): + attn_metadata = attn_metadata[layer_name] + attn_layer = forward_context.no_compile_layers[layer_name] + kv_cache = attn_layer.kv_cache[forward_context.virtual_engine] + return attn_metadata, attn_layer, kv_cache + + @maybe_transfer_kv_layer def unified_attention( query: torch.Tensor, diff --git a/vllm/model_executor/layers/attention/mla_attention.py b/vllm/model_executor/layers/attention/mla_attention.py index 330271da0b5f..3c5ee3773d7d 100644 --- a/vllm/model_executor/layers/attention/mla_attention.py +++ b/vllm/model_executor/layers/attention/mla_attention.py @@ -11,6 +11,15 @@ from vllm.config.vllm import VllmConfig from vllm.forward_context import ForwardContext, get_forward_context from vllm.logger import init_logger +from vllm.model_executor.layers.attention.attention import ( + _init_kv_cache_quant, + get_attention_context, + set_default_quant_scales, + should_load_quant_weights, +) +from vllm.model_executor.layers.attention.kv_transfer_utils import ( + maybe_transfer_kv_layer, +) from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase from vllm.model_executor.layers.batch_invariant import vllm_is_batch_invariant from vllm.model_executor.layers.linear import ( @@ -19,6 +28,7 @@ from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.platforms import current_platform from vllm.utils.torch_utils import ( + direct_register_custom_op, kv_cache_dtype_str_to_dtype, ) from vllm.v1.attention.backend import ( From ddda80cf9618c5c8da2740583963f4d42c35163e Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Mon, 12 Jan 2026 11:11:19 -0500 Subject: 
[PATCH 06/18] Delete vllm/attention Signed-off-by: Matthew Bonanni --- vllm/attention/__init__.py | 0 vllm/attention/utils/__init__.py | 0 2 files changed, 0 insertions(+), 0 deletions(-) delete mode 100644 vllm/attention/__init__.py delete mode 100644 vllm/attention/utils/__init__.py diff --git a/vllm/attention/__init__.py b/vllm/attention/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/vllm/attention/utils/__init__.py b/vllm/attention/utils/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 From 99e5293da82e63aaf07c94902c84f2c272d50a31 Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Mon, 12 Jan 2026 11:26:01 -0500 Subject: [PATCH 07/18] Update and remove old references to vllm/attention Signed-off-by: Matthew Bonanni --- .buildkite/test-amd.yaml | 1 - .buildkite/test-pipeline.yaml | 1 - .buildkite/test_areas/kernels.yaml | 1 - .github/CODEOWNERS | 2 +- tools/pre_commit/mypy.py | 1 - vllm/model_executor/layers/attention/kv_transfer_utils.py | 2 +- 6 files changed, 2 insertions(+), 6 deletions(-) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index ff5ce20d111b..fa4e1e9a86ca 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -620,7 +620,6 @@ steps: # grade: Blocking source_file_dependencies: - csrc/attention/ - - vllm/attention - vllm/v1/attention - tests/kernels/attention commands: diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 4e004e3479a1..64ec92d137dd 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -550,7 +550,6 @@ steps: mirror_hardwares: [amdexperimental] source_file_dependencies: - csrc/attention/ - - vllm/attention - vllm/v1/attention - tests/kernels/attention commands: diff --git a/.buildkite/test_areas/kernels.yaml b/.buildkite/test_areas/kernels.yaml index cf4b646f3495..8a38e258b40d 100644 --- a/.buildkite/test_areas/kernels.yaml +++ b/.buildkite/test_areas/kernels.yaml @@ -15,7 +15,6 @@ steps: timeout_in_minutes: 35 source_file_dependencies: - csrc/attention/ - - vllm/attention - vllm/v1/attention - tests/kernels/attention commands: diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index c963be4cb8f9..772c62973973 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -2,8 +2,8 @@ # for more info about CODEOWNERS file # This lists cover the "core" components of vLLM that require careful review -/vllm/attention @LucasWilkinson /vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @njhill @22quinn +/vllm/model_executor/layers/attention @LucasWilkinson /vllm/model_executor/layers/fused_moe @mgoin @pavanimajety /vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256 @pavanimajety /vllm/model_executor/layers/mamba @tdoublep diff --git a/tools/pre_commit/mypy.py b/tools/pre_commit/mypy.py index 4b7f85077962..492b0fee5a76 100755 --- a/tools/pre_commit/mypy.py +++ b/tools/pre_commit/mypy.py @@ -56,7 +56,6 @@ SEPARATE_GROUPS = [ "tests", # v0 related - "vllm/attention", "vllm/compilation", "vllm/lora", "vllm/model_executor", diff --git a/vllm/model_executor/layers/attention/kv_transfer_utils.py b/vllm/model_executor/layers/attention/kv_transfer_utils.py index 210be55feb2f..9ee6b4d0f5b8 100644 --- a/vllm/model_executor/layers/attention/kv_transfer_utils.py +++ b/vllm/model_executor/layers/attention/kv_transfer_utils.py @@ -19,7 +19,7 @@ def maybe_transfer_kv_layer(func: Callable) -> Callable: On exit: saves the KV layer to the connector. 
""" # Import at runtime to avoid circular dependency - from vllm.attention.layer import get_attention_context + from vllm.model_executor.layers.attention.attention import get_attention_context # Inspect the signature ONCE when the decorator is applied. sig = inspect.signature(func) From 9942dffe92c535c4e97f47ae951492e4d1b2c46f Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Mon, 12 Jan 2026 17:33:39 -0500 Subject: [PATCH 08/18] Add dependency Signed-off-by: Matthew Bonanni --- .buildkite/test_areas/kernels.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.buildkite/test_areas/kernels.yaml b/.buildkite/test_areas/kernels.yaml index 8a38e258b40d..29822d251af4 100644 --- a/.buildkite/test_areas/kernels.yaml +++ b/.buildkite/test_areas/kernels.yaml @@ -16,6 +16,7 @@ steps: source_file_dependencies: - csrc/attention/ - vllm/v1/attention + - vllm/model_executor/layers/attention - tests/kernels/attention commands: - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT From 3111dd0854c990ae634f94070a85c95cf3225b9d Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Mon, 12 Jan 2026 18:01:40 -0500 Subject: [PATCH 09/18] Add to AMD Signed-off-by: Matthew Bonanni --- .buildkite/test-amd.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index fa4e1e9a86ca..48a241229b93 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -621,6 +621,7 @@ steps: source_file_dependencies: - csrc/attention/ - vllm/v1/attention + - vllm/model_executor/layers/attention - tests/kernels/attention commands: - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT From 8b56809eded85e49e105943f6f20e3129f7dbbea Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Mon, 12 Jan 2026 20:00:02 -0500 Subject: [PATCH 10/18] Add to test-pipeline.yaml Signed-off-by: Matthew Bonanni --- .buildkite/test-pipeline.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 64ec92d137dd..f82f27be683b 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -551,6 +551,7 @@ steps: source_file_dependencies: - csrc/attention/ - vllm/v1/attention + - vllm/model_executor/layers/attention - tests/kernels/attention commands: - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT From 5eb3a4791f0fe546063b692ec52cedd71c2bbabe Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Tue, 13 Jan 2026 10:03:15 -0500 Subject: [PATCH 11/18] Add imports to __init__ Signed-off-by: Matthew Bonanni --- .../layers/attention/__init__.py | 26 +++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/vllm/model_executor/layers/attention/__init__.py b/vllm/model_executor/layers/attention/__init__.py index e69de29bb2d1..1be9f77427d3 100644 --- a/vllm/model_executor/layers/attention/__init__.py +++ b/vllm/model_executor/layers/attention/__init__.py @@ -0,0 +1,26 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from vllm.model_executor.layers.attention.attention import Attention +from vllm.model_executor.layers.attention.chunked_local_attention import ( + ChunkedLocalAttention, +) +from vllm.model_executor.layers.attention.cross_attention import CrossAttention +from vllm.model_executor.layers.attention.encoder_only_attention import ( + EncoderOnlyAttention, 
+) +from vllm.model_executor.layers.attention.mla_attention import MLAAttention +from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention +from vllm.model_executor.layers.attention.static_sink_attention import ( + StaticSinkAttention, +) + +__all__ = [ + "Attention", + "ChunkedLocalAttention", + "CrossAttention", + "EncoderOnlyAttention", + "MLAAttention", + "MMEncoderAttention", + "StaticSinkAttention", +] From 877c5e99b945631e89fe89d3baaedfd6872ddb00 Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Tue, 13 Jan 2026 10:06:06 -0500 Subject: [PATCH 12/18] Update imports Signed-off-by: Matthew Bonanni --- docs/contributing/model/basic.md | 2 +- docs/design/custom_op.md | 2 +- tests/compile/test_fusion_attn.py | 2 +- tests/compile/test_qk_norm_rope_fusion.py | 2 +- tests/kernels/attention/test_attention.py | 3 +-- tests/kernels/attention/test_mha_attn.py | 2 +- tests/v1/worker/test_gpu_model_runner.py | 2 +- tests/v1/worker/test_utils.py | 4 ++-- vllm/compilation/fusion_attn.py | 2 +- vllm/compilation/qk_norm_rope_fusion.py | 2 +- .../kv_transfer/kv_connector/v1/offloading_connector.py | 2 +- vllm/model_executor/layers/attention/attention.py | 2 +- .../layers/attention/chunked_local_attention.py | 2 +- vllm/model_executor/layers/attention/cross_attention.py | 2 +- .../layers/attention/encoder_only_attention.py | 2 +- vllm/model_executor/layers/attention/kv_transfer_utils.py | 2 +- vllm/model_executor/layers/attention/mla_attention.py | 2 +- .../layers/attention/static_sink_attention.py | 2 +- vllm/model_executor/layers/mla.py | 2 +- .../quantization/compressed_tensors/compressed_tensors.py | 2 +- vllm/model_executor/layers/quantization/fp8.py | 2 +- vllm/model_executor/layers/quantization/modelopt.py | 2 +- vllm/model_executor/layers/quantization/mxfp4.py | 2 +- vllm/model_executor/layers/quantization/petit.py | 2 +- vllm/model_executor/layers/quantization/ptpc_fp8.py | 2 +- vllm/model_executor/layers/quantization/quark/quark.py | 2 +- vllm/model_executor/model_loader/utils.py | 3 +-- vllm/model_executor/models/afmoe.py | 2 +- vllm/model_executor/models/aimv2.py | 2 +- vllm/model_executor/models/apertus.py | 4 ++-- vllm/model_executor/models/arctic.py | 2 +- vllm/model_executor/models/baichuan.py | 2 +- vllm/model_executor/models/bailing_moe.py | 2 +- vllm/model_executor/models/bamba.py | 2 +- vllm/model_executor/models/bert.py | 2 +- vllm/model_executor/models/bert_with_rope.py | 2 +- vllm/model_executor/models/blip.py | 2 +- vllm/model_executor/models/bloom.py | 2 +- vllm/model_executor/models/chameleon.py | 2 +- vllm/model_executor/models/chatglm.py | 2 +- vllm/model_executor/models/clip.py | 3 +-- vllm/model_executor/models/commandr.py | 2 +- vllm/model_executor/models/dbrx.py | 2 +- vllm/model_executor/models/deepencoder.py | 2 +- vllm/model_executor/models/deepseek_v2.py | 2 +- vllm/model_executor/models/dots1.py | 2 +- vllm/model_executor/models/dots_ocr.py | 2 +- vllm/model_executor/models/ernie45_moe.py | 2 +- vllm/model_executor/models/ernie45_vl.py | 2 +- vllm/model_executor/models/ernie45_vl_moe.py | 2 +- vllm/model_executor/models/exaone.py | 2 +- vllm/model_executor/models/exaone4.py | 2 +- vllm/model_executor/models/falcon.py | 2 +- vllm/model_executor/models/falcon_h1.py | 2 +- vllm/model_executor/models/gemma.py | 2 +- vllm/model_executor/models/gemma2.py | 2 +- vllm/model_executor/models/gemma3.py | 4 ++-- vllm/model_executor/models/gemma3n.py | 2 +- vllm/model_executor/models/glm4.py | 2 +- vllm/model_executor/models/glm4_1v.py | 2 +- 
vllm/model_executor/models/glm4_moe.py | 2 +- vllm/model_executor/models/glm4v.py | 2 +- vllm/model_executor/models/glmasr.py | 2 +- vllm/model_executor/models/gpt2.py | 2 +- vllm/model_executor/models/gpt_bigcode.py | 2 +- vllm/model_executor/models/gpt_j.py | 2 +- vllm/model_executor/models/gpt_neox.py | 2 +- vllm/model_executor/models/gpt_oss.py | 2 +- vllm/model_executor/models/granite.py | 2 +- vllm/model_executor/models/granitemoe.py | 2 +- vllm/model_executor/models/granitemoehybrid.py | 2 +- vllm/model_executor/models/grok1.py | 2 +- vllm/model_executor/models/hunyuan_v1.py | 2 +- vllm/model_executor/models/hunyuan_vision.py | 2 +- vllm/model_executor/models/idefics2_vision_model.py | 2 +- vllm/model_executor/models/intern_vit.py | 2 +- vllm/model_executor/models/internlm2.py | 2 +- vllm/model_executor/models/interns1_vit.py | 2 +- vllm/model_executor/models/iquest_loopcoder.py | 2 +- vllm/model_executor/models/isaac.py | 2 +- vllm/model_executor/models/jais.py | 2 +- vllm/model_executor/models/jais2.py | 2 +- vllm/model_executor/models/jamba.py | 2 +- vllm/model_executor/models/keye.py | 2 +- vllm/model_executor/models/lfm2.py | 2 +- vllm/model_executor/models/lfm2_moe.py | 2 +- vllm/model_executor/models/llama.py | 4 ++-- vllm/model_executor/models/llama4.py | 4 ++-- vllm/model_executor/models/mimo_v2_flash.py | 2 +- vllm/model_executor/models/minicpm.py | 2 +- vllm/model_executor/models/minicpm3.py | 2 +- vllm/model_executor/models/minimax_m2.py | 2 +- vllm/model_executor/models/minimax_text_01.py | 2 +- vllm/model_executor/models/mixtral.py | 2 +- vllm/model_executor/models/mllama4.py | 2 +- vllm/model_executor/models/modernbert.py | 2 +- vllm/model_executor/models/molmo.py | 3 +-- vllm/model_executor/models/moonvit.py | 2 +- vllm/model_executor/models/mpt.py | 2 +- vllm/model_executor/models/nemotron.py | 2 +- vllm/model_executor/models/nemotron_h.py | 2 +- vllm/model_executor/models/olmo.py | 2 +- vllm/model_executor/models/olmo2.py | 2 +- vllm/model_executor/models/olmoe.py | 2 +- vllm/model_executor/models/openpangu.py | 4 ++-- vllm/model_executor/models/opt.py | 2 +- vllm/model_executor/models/orion.py | 2 +- vllm/model_executor/models/ouro.py | 2 +- vllm/model_executor/models/paddleocr_vl.py | 2 +- vllm/model_executor/models/persimmon.py | 2 +- vllm/model_executor/models/phi.py | 2 +- vllm/model_executor/models/phimoe.py | 2 +- vllm/model_executor/models/plamo2.py | 2 +- vllm/model_executor/models/plamo3.py | 2 +- vllm/model_executor/models/qwen.py | 2 +- vllm/model_executor/models/qwen2.py | 4 ++-- vllm/model_executor/models/qwen2_5_vl.py | 2 +- vllm/model_executor/models/qwen2_moe.py | 2 +- vllm/model_executor/models/qwen2_vl.py | 2 +- vllm/model_executor/models/qwen3.py | 2 +- vllm/model_executor/models/qwen3_moe.py | 2 +- vllm/model_executor/models/qwen3_next.py | 2 +- vllm/model_executor/models/seed_oss.py | 2 +- vllm/model_executor/models/siglip.py | 4 ++-- vllm/model_executor/models/siglip2.py | 2 +- vllm/model_executor/models/siglip2navit.py | 2 +- vllm/model_executor/models/solar.py | 2 +- vllm/model_executor/models/stablelm.py | 2 +- vllm/model_executor/models/starcoder2.py | 2 +- vllm/model_executor/models/step3_text.py | 2 +- vllm/model_executor/models/step3_vl.py | 2 +- vllm/model_executor/models/transformers/base.py | 4 ++-- vllm/model_executor/models/whisper.py | 8 +++++--- vllm/model_executor/models/whisper_utils.py | 2 +- vllm/model_executor/models/zamba2.py | 2 +- vllm/v1/attention/backends/flash_attn.py | 2 +- vllm/v1/attention/backends/rocm_aiter_fa.py 
 vllm/v1/worker/gpu_model_runner.py | 3 +--
 vllm/v1/worker/utils.py | 2 +-
 139 files changed, 152 insertions(+), 155 deletions(-)

diff --git a/docs/contributing/model/basic.md b/docs/contributing/model/basic.md
index ea5c3e922739..f439eb79b0d7 100644
--- a/docs/contributing/model/basic.md
+++ b/docs/contributing/model/basic.md
@@ -29,7 +29,7 @@ The initialization code should look like this:
     ```python
     from torch import nn
     from vllm.config import VllmConfig
-    from vllm.model_executor.layers.attention.attention import Attention
+    from vllm.model_executor.layers.attention import Attention

     class MyAttention(nn.Module):
         def __init__(self, vllm_config: VllmConfig, prefix: str):
diff --git a/docs/design/custom_op.md b/docs/design/custom_op.md
index 4e0922c4825b..fd186d043591 100644
--- a/docs/design/custom_op.md
+++ b/docs/design/custom_op.md
@@ -280,7 +280,7 @@ Taking `MMEncoderAttention` as an example:
 ??? code

     ```python
-    from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
+    from vllm.model_executor.layers.attention import MMEncoderAttention

     from vllm.model_executor.custom_op import CustomOp

diff --git a/tests/compile/test_fusion_attn.py b/tests/compile/test_fusion_attn.py
index aac59cb8cdb0..4ac00ac01704 100644
--- a/tests/compile/test_fusion_attn.py
+++ b/tests/compile/test_fusion_attn.py
@@ -26,7 +26,7 @@
     set_current_vllm_config,
 )
 from vllm.forward_context import get_forward_context, set_forward_context
-from vllm.model_executor.layers.attention.attention import Attention
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
     QuantKey,
     kFp8StaticTensorSym,
diff --git a/tests/compile/test_qk_norm_rope_fusion.py b/tests/compile/test_qk_norm_rope_fusion.py
index 9e4787f1f087..19511b7877d3 100644
--- a/tests/compile/test_qk_norm_rope_fusion.py
+++ b/tests/compile/test_qk_norm_rope_fusion.py
@@ -20,7 +20,7 @@
     VllmConfig,
     set_current_vllm_config,
 )
-from vllm.model_executor.layers.attention.attention import Attention
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding
 from vllm.platforms import current_platform
diff --git a/tests/kernels/attention/test_attention.py b/tests/kernels/attention/test_attention.py
index 4aa8f98d8798..e3b612123c0c 100644
--- a/tests/kernels/attention/test_attention.py
+++ b/tests/kernels/attention/test_attention.py
@@ -9,8 +9,7 @@
 from tests.kernels.allclose_default import get_default_atol, get_default_rtol
 from tests.kernels.utils import opcheck
 from vllm import _custom_ops as ops
-from vllm.model_executor.layers.attention.attention import Attention
-from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
+from vllm.model_executor.layers.attention import Attention, MMEncoderAttention
 from vllm.platforms import current_platform
 from vllm.utils.mem_utils import get_max_shared_memory_bytes
 from vllm.utils.torch_utils import set_random_seed
diff --git a/tests/kernels/attention/test_mha_attn.py b/tests/kernels/attention/test_mha_attn.py
index ecaea88674c2..25fb5c926326 100644
--- a/tests/kernels/attention/test_mha_attn.py
+++ b/tests/kernels/attention/test_mha_attn.py
@@ -12,7 +12,7 @@
 import pytest
 import torch

-from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
+from vllm.model_executor.layers.attention import MMEncoderAttention
 from vllm.platforms import current_platform
 from vllm.platforms.cpu import CpuPlatform
 from vllm.platforms.cuda import CudaPlatform
diff --git a/tests/v1/worker/test_gpu_model_runner.py b/tests/v1/worker/test_gpu_model_runner.py
index 0b6f446154fe..458c7a2e5a4e 100644
--- a/tests/v1/worker/test_gpu_model_runner.py
+++ b/tests/v1/worker/test_gpu_model_runner.py
@@ -18,7 +18,7 @@
     init_distributed_environment,
     initialize_model_parallel,
 )
-from vllm.model_executor.layers.attention.attention import Attention
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.mamba.mamba_mixer2 import MambaMixer2
 from vllm.platforms import current_platform
 from vllm.sampling_params import SamplingParams
diff --git a/tests/v1/worker/test_utils.py b/tests/v1/worker/test_utils.py
index 9c5f753ed04a..19cc94a24a43 100644
--- a/tests/v1/worker/test_utils.py
+++ b/tests/v1/worker/test_utils.py
@@ -7,7 +7,7 @@


 def test_bind_kv_cache(default_vllm_config):
-    from vllm.model_executor.layers.attention.attention import Attention
+    from vllm.model_executor.layers.attention import Attention

     ctx = {
         "layers.0.self_attn": Attention(32, 128, 0.1, prefix="layers.0.self_attn"),
@@ -35,7 +35,7 @@ def test_bind_kv_cache(default_vllm_config):


 def test_bind_kv_cache_non_attention(default_vllm_config):
-    from vllm.model_executor.layers.attention.attention import Attention
+    from vllm.model_executor.layers.attention import Attention

     # example from Jamba PP=2
     ctx = {
diff --git a/vllm/compilation/fusion_attn.py b/vllm/compilation/fusion_attn.py
index cde2e5611690..6fd11b9edd53 100644
--- a/vllm/compilation/fusion_attn.py
+++ b/vllm/compilation/fusion_attn.py
@@ -12,7 +12,7 @@

 from vllm.config import VllmConfig, get_layers_from_vllm_config
 from vllm.logger import init_logger
-from vllm.model_executor.layers.attention.attention import Attention
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
     QuantKey,
     kNvfp4Quant,
diff --git a/vllm/compilation/qk_norm_rope_fusion.py b/vllm/compilation/qk_norm_rope_fusion.py
index 20f092faf8c0..69b0f15500aa 100644
--- a/vllm/compilation/qk_norm_rope_fusion.py
+++ b/vllm/compilation/qk_norm_rope_fusion.py
@@ -11,7 +11,7 @@

 from vllm.config import VllmConfig, get_layers_from_vllm_config
 from vllm.logger import init_logger
-from vllm.model_executor.layers.attention.attention import Attention
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding

 from .fusion import empty_bf16, empty_fp32, empty_i64
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py
index fcae13d9c427..069127588d38 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py
@@ -18,7 +18,7 @@
 from vllm.distributed.kv_transfer.kv_connector.v1.base import KVConnectorMetadata
 from vllm.forward_context import ForwardContext
 from vllm.logger import init_logger
-from vllm.model_executor.layers.attention.attention import Attention
+from vllm.model_executor.layers.attention import Attention
 from vllm.v1.attention.backend import AttentionBackend, AttentionMetadata
 from vllm.v1.core.kv_cache_manager import KVCacheBlocks
 from vllm.v1.core.kv_cache_utils import BlockHash
diff --git a/vllm/model_executor/layers/attention/attention.py b/vllm/model_executor/layers/attention/attention.py
index 5e751a95156c..b09ba6a2da34 100644
--- a/vllm/model_executor/layers/attention/attention.py
+++ b/vllm/model_executor/layers/attention/attention.py
@@ -42,7 +42,7 @@
 )

 if TYPE_CHECKING:
-    from vllm.model_executor.layers.attention.mla_attention import MLAAttention
+    from vllm.model_executor.layers.attention import MLAAttention

 logger = init_logger(__name__)

diff --git a/vllm/model_executor/layers/attention/chunked_local_attention.py b/vllm/model_executor/layers/attention/chunked_local_attention.py
index e251f4bf6397..40b77f8fc9be 100644
--- a/vllm/model_executor/layers/attention/chunked_local_attention.py
+++ b/vllm/model_executor/layers/attention/chunked_local_attention.py
@@ -6,7 +6,7 @@

 from vllm.config import CacheConfig
 from vllm.config.vllm import VllmConfig
-from vllm.model_executor.layers.attention.attention import Attention
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.v1.attention.backend import AttentionBackend
 from vllm.v1.attention.backends.utils import (
diff --git a/vllm/model_executor/layers/attention/cross_attention.py b/vllm/model_executor/layers/attention/cross_attention.py
index 1041f0519925..9114fed58e4d 100644
--- a/vllm/model_executor/layers/attention/cross_attention.py
+++ b/vllm/model_executor/layers/attention/cross_attention.py
@@ -8,7 +8,7 @@

 from vllm.config import CacheConfig, VllmConfig
 from vllm.logger import init_logger
-from vllm.model_executor.layers.attention.attention import Attention
+from vllm.model_executor.layers.attention import Attention
 from vllm.utils.math_utils import cdiv
 from vllm.v1.attention.backend import (
     AttentionBackend,
diff --git a/vllm/model_executor/layers/attention/encoder_only_attention.py b/vllm/model_executor/layers/attention/encoder_only_attention.py
index 023959203c68..fad1fc80b853 100644
--- a/vllm/model_executor/layers/attention/encoder_only_attention.py
+++ b/vllm/model_executor/layers/attention/encoder_only_attention.py
@@ -7,7 +7,7 @@

 from vllm.config import CacheConfig
 from vllm.config.vllm import VllmConfig
-from vllm.model_executor.layers.attention.attention import Attention
+from vllm.model_executor.layers.attention import Attention
 from vllm.v1.attention.backend import (
     AttentionBackend,
     AttentionMetadata,
diff --git a/vllm/model_executor/layers/attention/kv_transfer_utils.py b/vllm/model_executor/layers/attention/kv_transfer_utils.py
index 9ee6b4d0f5b8..e9bb27e512a4 100644
--- a/vllm/model_executor/layers/attention/kv_transfer_utils.py
+++ b/vllm/model_executor/layers/attention/kv_transfer_utils.py
@@ -19,7 +19,7 @@ def maybe_transfer_kv_layer(func: Callable) -> Callable:
     On exit: saves the KV layer to the connector.
     """
     # Import at runtime to avoid circular dependency
-    from vllm.model_executor.layers.attention.attention import get_attention_context
+    from vllm.model_executor.layers.attention import get_attention_context

     # Inspect the signature ONCE when the decorator is applied.
     sig = inspect.signature(func)
diff --git a/vllm/model_executor/layers/attention/mla_attention.py b/vllm/model_executor/layers/attention/mla_attention.py
index 3c5ee3773d7d..3be052cdc3b4 100644
--- a/vllm/model_executor/layers/attention/mla_attention.py
+++ b/vllm/model_executor/layers/attention/mla_attention.py
@@ -11,7 +11,7 @@
 from vllm.config.vllm import VllmConfig
 from vllm.forward_context import ForwardContext, get_forward_context
 from vllm.logger import init_logger
-from vllm.model_executor.layers.attention.attention import (
+from vllm.model_executor.layers.attention import (
     _init_kv_cache_quant,
     get_attention_context,
     set_default_quant_scales,
diff --git a/vllm/model_executor/layers/attention/static_sink_attention.py b/vllm/model_executor/layers/attention/static_sink_attention.py
index c2b20fc61ba5..a02f91784b65 100644
--- a/vllm/model_executor/layers/attention/static_sink_attention.py
+++ b/vllm/model_executor/layers/attention/static_sink_attention.py
@@ -8,7 +8,7 @@
 from vllm.forward_context import ForwardContext, get_forward_context
 from vllm.logger import init_logger
 from vllm.model_executor.custom_op import CustomOp
-from vllm.model_executor.layers.attention.attention import Attention
+from vllm.model_executor.layers.attention import Attention
 from vllm.utils.math_utils import cdiv
 from vllm.utils.torch_utils import direct_register_custom_op
 from vllm.v1.attention.backend import (
diff --git a/vllm/model_executor/layers/mla.py b/vllm/model_executor/layers/mla.py
index a6236fbe3995..0c9b41aef089 100644
--- a/vllm/model_executor/layers/mla.py
+++ b/vllm/model_executor/layers/mla.py
@@ -6,7 +6,7 @@

 from vllm.config import CacheConfig
 from vllm.model_executor.custom_op import CustomOp
-from vllm.model_executor.layers.attention.mla_attention import MLAAttention
+from vllm.model_executor.layers.attention import MLAAttention
 from vllm.model_executor.layers.quantization import QuantizationConfig


diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
index b20cf50c88b1..3547c8d5e713 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
@@ -19,7 +19,7 @@

 import vllm.envs as envs
 from vllm.logger import init_logger
-from vllm.model_executor.layers.attention.attention import Attention
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.linear import (
     LinearBase,
diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py
index 9a1e708c480c..857d05bd337f 100644
--- a/vllm/model_executor/layers/quantization/fp8.py
+++ b/vllm/model_executor/layers/quantization/fp8.py
@@ -13,7 +13,7 @@
 from vllm._aiter_ops import rocm_aiter_ops
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.logger import init_logger
-from vllm.model_executor.layers.attention.attention import Attention
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.batch_invariant import (
     vllm_is_batch_invariant,
 )
diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py
index 19d103b20119..4ea578b9d4d7 100644
--- a/vllm/model_executor/layers/quantization/modelopt.py
+++ b/vllm/model_executor/layers/quantization/modelopt.py
@@ -12,7 +12,7 @@
 import vllm.model_executor.layers.fused_moe.modular_kernel as mk
 from vllm._custom_ops import cutlass_scaled_fp4_mm, scaled_fp4_quant
 from vllm.logger import init_logger
-from vllm.model_executor.layers.attention.attention import Attention
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.fused_moe.config import (
     FusedMoEConfig,
     FusedMoEQuantConfig,
diff --git a/vllm/model_executor/layers/quantization/mxfp4.py b/vllm/model_executor/layers/quantization/mxfp4.py
index 5d66005801c4..1e1a01eebbb1 100644
--- a/vllm/model_executor/layers/quantization/mxfp4.py
+++ b/vllm/model_executor/layers/quantization/mxfp4.py
@@ -9,7 +9,7 @@
 from vllm import envs
 from vllm.config import get_current_vllm_config
 from vllm.logger import init_logger
-from vllm.model_executor.layers.attention.attention import Attention
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.fused_moe import (
     FusedMoE,
     FusedMoEConfig,
diff --git a/vllm/model_executor/layers/quantization/petit.py b/vllm/model_executor/layers/quantization/petit.py
index ad7f453bee48..e97fac80fe5e 100644
--- a/vllm/model_executor/layers/quantization/petit.py
+++ b/vllm/model_executor/layers/quantization/petit.py
@@ -9,7 +9,7 @@
 from torch.nn.parameter import Parameter

 from vllm.logger import init_logger
-from vllm.model_executor.layers.attention.attention import Attention
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.linear import (
     LinearBase,
     LinearMethodBase,
diff --git a/vllm/model_executor/layers/quantization/ptpc_fp8.py b/vllm/model_executor/layers/quantization/ptpc_fp8.py
index f017b5821641..b6e69da564d7 100644
--- a/vllm/model_executor/layers/quantization/ptpc_fp8.py
+++ b/vllm/model_executor/layers/quantization/ptpc_fp8.py
@@ -8,7 +8,7 @@

 from vllm import _custom_ops as ops
 from vllm.logger import init_logger
-from vllm.model_executor.layers.attention.attention import Attention
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.linear import LinearBase, UnquantizedLinearMethod
 from vllm.model_executor.layers.quantization import QuantizationMethods
 from vllm.model_executor.layers.quantization.base_config import QuantizeMethodBase
diff --git a/vllm/model_executor/layers/quantization/quark/quark.py b/vllm/model_executor/layers/quantization/quark/quark.py
index 1a3e97d395b4..8fd7b875fdc8 100644
--- a/vllm/model_executor/layers/quantization/quark/quark.py
+++ b/vllm/model_executor/layers/quantization/quark/quark.py
@@ -7,7 +7,7 @@
 import torch

 from vllm.logger import init_logger
-from vllm.model_executor.layers.attention.attention import Attention
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.linear import (
     LinearBase,
diff --git a/vllm/model_executor/model_loader/utils.py b/vllm/model_executor/model_loader/utils.py
index 042add92bfaa..94361a3cacb0 100644
--- a/vllm/model_executor/model_loader/utils.py
+++ b/vllm/model_executor/model_loader/utils.py
@@ -13,8 +13,7 @@

 from vllm.config import ModelConfig, VllmConfig, set_current_vllm_config
 from vllm.logger import init_logger
-from vllm.model_executor.layers.attention.attention import Attention
-from vllm.model_executor.layers.attention.mla_attention import MLAAttention
+from vllm.model_executor.layers.attention import Attention, MLAAttention
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig,
     QuantizeMethodBase,
diff --git a/vllm/model_executor/models/afmoe.py b/vllm/model_executor/models/afmoe.py
index 2360a37f9b0b..87d910e76616 100644
--- a/vllm/model_executor/models/afmoe.py
+++ b/vllm/model_executor/models/afmoe.py
@@ -17,7 +17,7 @@
     get_tensor_model_parallel_world_size,
 )
 from vllm.logger import init_logger
-from vllm.model_executor.layers.attention.attention import Attention
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.fused_moe.shared_fused_moe import SharedFusedMoE
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
diff --git a/vllm/model_executor/models/aimv2.py b/vllm/model_executor/models/aimv2.py
index b802bb0ee35b..5b8ead4c7b7c 100644
--- a/vllm/model_executor/models/aimv2.py
+++ b/vllm/model_executor/models/aimv2.py
@@ -11,7 +11,7 @@
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.distributed.utils import divide
 from vllm.model_executor.layers.activation import SiluAndMul
-from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
+from vllm.model_executor.layers.attention import MMEncoderAttention
 from vllm.model_executor.layers.conv import Conv2dLayer
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
diff --git a/vllm/model_executor/models/apertus.py b/vllm/model_executor/models/apertus.py
index 3cf41608b844..b35223a1c490 100644
--- a/vllm/model_executor/models/apertus.py
+++ b/vllm/model_executor/models/apertus.py
@@ -36,8 +36,8 @@
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import XIELU
-from vllm.model_executor.layers.attention.attention import Attention
-from vllm.model_executor.layers.attention.encoder_only_attention import (
+from vllm.model_executor.layers.attention import (
+    Attention,
     EncoderOnlyAttention,
 )
 from vllm.model_executor.layers.layernorm import RMSNorm
diff --git a/vllm/model_executor/models/arctic.py b/vllm/model_executor/models/arctic.py
index 5bc9b71a3d64..940860371110 100644
--- a/vllm/model_executor/models/arctic.py
+++ b/vllm/model_executor/models/arctic.py
@@ -18,7 +18,7 @@
 )
 from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import SiluAndMul
-from vllm.model_executor.layers.attention.attention import Attention
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.fused_moe import fused_experts, fused_topk
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
diff --git a/vllm/model_executor/models/baichuan.py b/vllm/model_executor/models/baichuan.py
index 922819694d48..e0b72444e383 100644
--- a/vllm/model_executor/models/baichuan.py
+++ b/vllm/model_executor/models/baichuan.py
@@ -37,7 +37,7 @@
     get_tensor_model_parallel_world_size,
 )
 from vllm.model_executor.layers.activation import SiluAndMul
-from vllm.model_executor.layers.attention.attention import Attention
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
     MergedColumnParallelLinear,
diff --git a/vllm/model_executor/models/bailing_moe.py b/vllm/model_executor/models/bailing_moe.py
index 8b2ddf2bf0f0..3dab2ae3a66a 100644
--- a/vllm/model_executor/models/bailing_moe.py
+++ b/vllm/model_executor/models/bailing_moe.py
@@ -40,7 +40,7 @@
     get_tensor_model_parallel_world_size,
 )
 from vllm.model_executor.layers.activation import SiluAndMul
-from vllm.model_executor.layers.attention.attention import Attention
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.fused_moe import SharedFusedMoE
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
diff --git a/vllm/model_executor/models/bamba.py b/vllm/model_executor/models/bamba.py
index 94761e6aa7fd..663f78ca9510 100644
--- a/vllm/model_executor/models/bamba.py
+++ b/vllm/model_executor/models/bamba.py
@@ -14,7 +14,7 @@
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.distributed.parallel_state import get_pp_group
 from vllm.model_executor.layers.activation import SiluAndMul
-from vllm.model_executor.layers.attention.attention import Attention
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
     MergedColumnParallelLinear,
diff --git a/vllm/model_executor/models/bert.py b/vllm/model_executor/models/bert.py
index cce01ea50acd..57d2c70cc565 100644
--- a/vllm/model_executor/models/bert.py
+++ b/vllm/model_executor/models/bert.py
@@ -11,7 +11,7 @@
 from vllm.config import CacheConfig, PoolerConfig, VllmConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import get_act_fn
-from vllm.model_executor.layers.attention.encoder_only_attention import (
+from vllm.model_executor.layers.attention import (
     EncoderOnlyAttention,
 )
 from vllm.model_executor.layers.linear import (
diff --git a/vllm/model_executor/models/bert_with_rope.py b/vllm/model_executor/models/bert_with_rope.py
index a5c43bbb301e..59ecf8a62fea 100644
--- a/vllm/model_executor/models/bert_with_rope.py
+++ b/vllm/model_executor/models/bert_with_rope.py
@@ -15,7 +15,7 @@
     tensor_model_parallel_all_reduce,
 )
 from vllm.model_executor.layers.activation import get_act_and_mul_fn, get_act_fn
-from vllm.model_executor.layers.attention.encoder_only_attention import (
+from vllm.model_executor.layers.attention import (
     EncoderOnlyAttention,
 )
 from vllm.model_executor.layers.fused_moe import activation_without_mul, fused_topk
diff --git a/vllm/model_executor/models/blip.py b/vllm/model_executor/models/blip.py
index 9279cccd596d..ac9ae49f03e1 100644
--- a/vllm/model_executor/models/blip.py
+++ b/vllm/model_executor/models/blip.py
@@ -11,7 +11,7 @@

 from vllm.distributed import divide, get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import get_act_fn
-from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
+from vllm.model_executor.layers.attention import MMEncoderAttention
 from vllm.model_executor.layers.conv import Conv2dLayer
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
diff --git a/vllm/model_executor/models/bloom.py b/vllm/model_executor/models/bloom.py
index 12ddedd5220b..f9ba44acba51 100644
--- a/vllm/model_executor/models/bloom.py
+++ b/vllm/model_executor/models/bloom.py
@@ -35,7 +35,7 @@
     get_tensor_model_parallel_world_size,
 )
 from vllm.model_executor.layers.activation import get_act_fn
-from vllm.model_executor.layers.attention.attention import Attention
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
     QKVParallelLinear,
diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py
index 7db36f6f5fae..132624a79355 100644
--- a/vllm/model_executor/models/chameleon.py
+++ b/vllm/model_executor/models/chameleon.py
@@ -21,7 +21,7 @@
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import SiluAndMul
-from vllm.model_executor.layers.attention.attention import Attention
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.conv import Conv2dLayer
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py
index 498b6d9afcc5..dac588085193 100644
--- a/vllm/model_executor/models/chatglm.py
+++ b/vllm/model_executor/models/chatglm.py
@@ -16,7 +16,7 @@
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import SiluAndMul
-from vllm.model_executor.layers.attention.attention import Attention
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
     MergedColumnParallelLinear,
diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py
index d779cedc83fb..9ede8ea8649d 100644
--- a/vllm/model_executor/models/clip.py
+++ b/vllm/model_executor/models/clip.py
@@ -18,8 +18,7 @@
 from vllm.config.multimodal import BaseDummyOptions, MultiModalConfig
 from vllm.distributed import divide, get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import get_act_fn
-from vllm.model_executor.layers.attention.attention import Attention
-from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
+from vllm.model_executor.layers.attention import Attention, MMEncoderAttention
 from vllm.model_executor.layers.conv import Conv2dLayer
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
diff --git a/vllm/model_executor/models/commandr.py b/vllm/model_executor/models/commandr.py
index 43af3f15bd3b..62f85d3151ed 100644
--- a/vllm/model_executor/models/commandr.py
+++ b/vllm/model_executor/models/commandr.py
@@ -34,7 +34,7 @@
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import SiluAndMul
-from vllm.model_executor.layers.attention.attention import Attention
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.linear import (
     MergedColumnParallelLinear,
     QKVParallelLinear,
diff --git a/vllm/model_executor/models/dbrx.py b/vllm/model_executor/models/dbrx.py
index beadba5d25a0..fd1c697c2928 100644
--- a/vllm/model_executor/models/dbrx.py
+++ b/vllm/model_executor/models/dbrx.py
@@ -14,7 +14,7 @@
     get_tensor_model_parallel_rank,
     get_tensor_model_parallel_world_size,
 )
-from vllm.model_executor.layers.attention.attention import Attention
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.linear import (
     QKVParallelLinear,
diff --git a/vllm/model_executor/models/deepencoder.py b/vllm/model_executor/models/deepencoder.py
index b3e5d920e03a..3591de309806 100644
--- a/vllm/model_executor/models/deepencoder.py
+++ b/vllm/model_executor/models/deepencoder.py
@@ -19,7 +19,7 @@
 from transformers import CLIPVisionConfig

 from vllm.config import MultiModalConfig
-from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
+from vllm.model_executor.layers.attention import MMEncoderAttention
 from vllm.model_executor.layers.conv import Conv2dLayer
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py
index 47f338a4a5a8..e3592780dfe9 100644
--- a/vllm/model_executor/models/deepseek_v2.py
+++ b/vllm/model_executor/models/deepseek_v2.py
@@ -45,7 +45,7 @@
 from vllm.forward_context import get_forward_context
 from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import SiluAndMul
-from vllm.model_executor.layers.attention.attention import Attention
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase
 from vllm.model_executor.layers.fused_moe import SharedFusedMoE
 from vllm.model_executor.layers.layernorm import LayerNorm, RMSNorm
diff --git a/vllm/model_executor/models/dots1.py b/vllm/model_executor/models/dots1.py
index 101815ca3f03..9efa5b1fd401 100644
--- a/vllm/model_executor/models/dots1.py
+++ b/vllm/model_executor/models/dots1.py
@@ -40,7 +40,7 @@
     tensor_model_parallel_all_reduce,
 )
 from vllm.model_executor.layers.activation import SiluAndMul
-from vllm.model_executor.layers.attention.attention import Attention
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.fused_moe import SharedFusedMoE
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
diff --git a/vllm/model_executor/models/dots_ocr.py b/vllm/model_executor/models/dots_ocr.py
index ac9ad3b67d65..53fdaee6e7df 100644
--- a/vllm/model_executor/models/dots_ocr.py
+++ b/vllm/model_executor/models/dots_ocr.py
@@ -16,7 +16,7 @@
     get_tensor_model_parallel_world_size,
 )
 from vllm.model_executor.layers.activation import SiluAndMul
-from vllm.model_executor.layers.attention.mm_encoder_attention import (
+from vllm.model_executor.layers.attention import (
     MMEncoderAttention,
 )
 from vllm.model_executor.layers.conv import Conv2dLayer
diff --git a/vllm/model_executor/models/ernie45_moe.py b/vllm/model_executor/models/ernie45_moe.py
index 00a8ce37e6db..dbf866c786fa 100644
--- a/vllm/model_executor/models/ernie45_moe.py
+++ b/vllm/model_executor/models/ernie45_moe.py
@@ -41,7 +41,7 @@
 )
 from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import SiluAndMul
-from vllm.model_executor.layers.attention.attention import Attention
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.fused_moe import SharedFusedMoE
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
diff --git a/vllm/model_executor/models/ernie45_vl.py b/vllm/model_executor/models/ernie45_vl.py
index a382cb5b61fe..5cda0747bd13 100644
--- a/vllm/model_executor/models/ernie45_vl.py
+++ b/vllm/model_executor/models/ernie45_vl.py
@@ -42,7 +42,7 @@
 from vllm.distributed import utils as dist_utils
 from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import QuickGELU
-from vllm.model_executor.layers.attention.mm_encoder_attention import (
+from vllm.model_executor.layers.attention import (
     MMEncoderAttention,
 )
 from vllm.model_executor.layers.layernorm import RMSNorm
diff --git a/vllm/model_executor/models/ernie45_vl_moe.py b/vllm/model_executor/models/ernie45_vl_moe.py
index 9c89744d0f2f..273483c4a0e3 100644
--- a/vllm/model_executor/models/ernie45_vl_moe.py
+++ b/vllm/model_executor/models/ernie45_vl_moe.py
@@ -35,7 +35,7 @@
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.logger import init_logger
-from vllm.model_executor.layers.attention.attention import Attention
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.fused_moe import SharedFusedMoE
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
diff --git a/vllm/model_executor/models/exaone.py b/vllm/model_executor/models/exaone.py
index 691da7644297..754d46afd2d6 100644
--- a/vllm/model_executor/models/exaone.py
+++ b/vllm/model_executor/models/exaone.py
@@ -36,7 +36,7 @@
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import SiluAndMul
-from vllm.model_executor.layers.attention.attention import Attention
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
     MergedColumnParallelLinear,
diff --git a/vllm/model_executor/models/exaone4.py b/vllm/model_executor/models/exaone4.py
index 0faad7feb280..33e1a12570d8 100644
--- a/vllm/model_executor/models/exaone4.py
+++ b/vllm/model_executor/models/exaone4.py
@@ -32,7 +32,7 @@
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import SiluAndMul
-from vllm.model_executor.layers.attention.attention import Attention
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
     MergedColumnParallelLinear,
diff --git a/vllm/model_executor/models/falcon.py b/vllm/model_executor/models/falcon.py
index 3c564012198d..ffb06426705a 100644
--- a/vllm/model_executor/models/falcon.py
+++ b/vllm/model_executor/models/falcon.py
@@ -39,7 +39,7 @@
     tensor_model_parallel_all_reduce,
 )
 from vllm.model_executor.layers.activation import get_act_fn
-from vllm.model_executor.layers.attention.attention import Attention
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
     QKVParallelLinear,
diff --git a/vllm/model_executor/models/falcon_h1.py b/vllm/model_executor/models/falcon_h1.py
index 285f4fa5ecb5..2e2182dfab92 100644
--- a/vllm/model_executor/models/falcon_h1.py
+++ b/vllm/model_executor/models/falcon_h1.py
@@ -14,7 +14,7 @@
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.distributed.parallel_state import get_pp_group
 from vllm.model_executor.layers.activation import SiluAndMul
-from vllm.model_executor.layers.attention.attention import Attention
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
     MergedColumnParallelLinear,
diff --git a/vllm/model_executor/models/gemma.py b/vllm/model_executor/models/gemma.py
index 4ec6695d8f55..31a05b3a33bd 100644
--- a/vllm/model_executor/models/gemma.py
+++ b/vllm/model_executor/models/gemma.py
@@ -31,7 +31,7 @@
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import GeluAndMul
-from vllm.model_executor.layers.attention.attention import Attention
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.layernorm import GemmaRMSNorm
 from vllm.model_executor.layers.linear import (
     MergedColumnParallelLinear,
diff --git a/vllm/model_executor/models/gemma2.py b/vllm/model_executor/models/gemma2.py
index 3456a7ccb8b9..fd31ba238842 100644
--- a/vllm/model_executor/models/gemma2.py
+++ b/vllm/model_executor/models/gemma2.py
@@ -28,7 +28,7 @@
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import GeluAndMul
-from vllm.model_executor.layers.attention.attention import Attention
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.layernorm import GemmaRMSNorm
 from vllm.model_executor.layers.linear import (
     MergedColumnParallelLinear,
diff --git a/vllm/model_executor/models/gemma3.py b/vllm/model_executor/models/gemma3.py
index c3c5c8b997e5..ea24a47953e6 100644
--- a/vllm/model_executor/models/gemma3.py
+++ b/vllm/model_executor/models/gemma3.py
@@ -27,8 +27,8 @@
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import GeluAndMul
-from vllm.model_executor.layers.attention.attention import Attention
-from vllm.model_executor.layers.attention.encoder_only_attention import (
+from vllm.model_executor.layers.attention import (
+    Attention,
     EncoderOnlyAttention,
 )
 from vllm.model_executor.layers.layernorm import GemmaRMSNorm
diff --git a/vllm/model_executor/models/gemma3n.py b/vllm/model_executor/models/gemma3n.py
index 3fe0b14a53ff..e645bd05f647 100644
--- a/vllm/model_executor/models/gemma3n.py
+++ b/vllm/model_executor/models/gemma3n.py
@@ -31,7 +31,7 @@
     GeluAndMul,
     GeluAndMulSparse,
 )
-from vllm.model_executor.layers.attention.attention import Attention
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
diff --git a/vllm/model_executor/models/glm4.py b/vllm/model_executor/models/glm4.py
index 76a83d23b80a..7ac0c002e436 100644
--- a/vllm/model_executor/models/glm4.py
+++ b/vllm/model_executor/models/glm4.py
@@ -32,7 +32,7 @@
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
-from vllm.model_executor.layers.attention.attention import Attention
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import QKVParallelLinear, RowParallelLinear
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
diff --git a/vllm/model_executor/models/glm4_1v.py b/vllm/model_executor/models/glm4_1v.py
index 05257bd1ea01..df6da8a7c082 100644
--- a/vllm/model_executor/models/glm4_1v.py
+++ b/vllm/model_executor/models/glm4_1v.py
@@ -51,7 +51,7 @@
 from vllm.distributed import get_tensor_model_parallel_world_size, parallel_state
 from vllm.distributed import utils as dist_utils
 from vllm.logger import init_logger
-from vllm.model_executor.layers.attention.mm_encoder_attention import (
+from vllm.model_executor.layers.attention import (
     MMEncoderAttention,
 )
 from vllm.model_executor.layers.conv import Conv2dLayer, Conv3dLayer
diff --git a/vllm/model_executor/models/glm4_moe.py b/vllm/model_executor/models/glm4_moe.py
index 907924eda4f0..1ebfaef9cab3 100644
--- a/vllm/model_executor/models/glm4_moe.py
+++ b/vllm/model_executor/models/glm4_moe.py
@@ -41,7 +41,7 @@
 )
 from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import SiluAndMul
-from vllm.model_executor.layers.attention.attention import Attention
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.fused_moe import SharedFusedMoE
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
diff --git a/vllm/model_executor/models/glm4v.py b/vllm/model_executor/models/glm4v.py
index 297237fd196a..ccb0903999fa 100644
--- a/vllm/model_executor/models/glm4v.py
+++ b/vllm/model_executor/models/glm4v.py
@@ -23,7 +23,7 @@
 from vllm.config.multimodal import BaseDummyOptions
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import SiluAndMul, get_act_fn
-from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
+from vllm.model_executor.layers.attention import MMEncoderAttention
 from vllm.model_executor.layers.conv import Conv2dLayer
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
diff --git a/vllm/model_executor/models/glmasr.py b/vllm/model_executor/models/glmasr.py
index a3b9a1221934..7f82730dcfbe 100644
--- a/vllm/model_executor/models/glmasr.py
+++ b/vllm/model_executor/models/glmasr.py
@@ -16,7 +16,7 @@
 from vllm.distributed.parallel_state import get_tensor_model_parallel_world_size
 from vllm.inputs.data import PromptType
 from vllm.model_executor.layers.activation import get_act_fn
-from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
+from vllm.model_executor.layers.attention import MMEncoderAttention
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
     QKVParallelLinear,
diff --git a/vllm/model_executor/models/gpt2.py b/vllm/model_executor/models/gpt2.py
index 2c6b12f9a03a..8fb1536f4a33 100644
--- a/vllm/model_executor/models/gpt2.py
+++ b/vllm/model_executor/models/gpt2.py
@@ -34,7 +34,7 @@
     get_tensor_model_parallel_world_size,
 )
 from vllm.model_executor.layers.activation import get_act_fn
-from vllm.model_executor.layers.attention.attention import Attention
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
     QKVParallelLinear,
diff --git a/vllm/model_executor/models/gpt_bigcode.py b/vllm/model_executor/models/gpt_bigcode.py
index db7bddae464f..2f611e019c4e 100644
--- a/vllm/model_executor/models/gpt_bigcode.py
+++ b/vllm/model_executor/models/gpt_bigcode.py
@@ -32,7 +32,7 @@
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import get_act_fn
-from vllm.model_executor.layers.attention.attention import Attention
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
     QKVParallelLinear,
diff --git a/vllm/model_executor/models/gpt_j.py b/vllm/model_executor/models/gpt_j.py
index df921ce4b0a6..7a9d2b711755 100644
--- a/vllm/model_executor/models/gpt_j.py
+++ b/vllm/model_executor/models/gpt_j.py
@@ -30,7 +30,7 @@
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import get_act_fn
-from vllm.model_executor.layers.attention.attention import Attention
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
     QKVParallelLinear,
diff --git a/vllm/model_executor/models/gpt_neox.py b/vllm/model_executor/models/gpt_neox.py
index f2e05632a6ac..91156c4e207c 100644
--- a/vllm/model_executor/models/gpt_neox.py
+++ b/vllm/model_executor/models/gpt_neox.py
@@ -30,7 +30,7 @@
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import get_act_fn
-from vllm.model_executor.layers.attention.attention import Attention
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
     QKVParallelLinear,
diff --git a/vllm/model_executor/models/gpt_oss.py b/vllm/model_executor/models/gpt_oss.py
index 6c25cbb81be1..d71e5196eec1 100644
--- a/vllm/model_executor/models/gpt_oss.py
+++ b/vllm/model_executor/models/gpt_oss.py
@@ -18,7 +18,7 @@
     get_tensor_model_parallel_world_size,
     tensor_model_parallel_all_gather,
 )
-from vllm.model_executor.layers.attention.attention import Attention
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.fused_moe.config import FusedMoEParallelConfig
 from vllm.model_executor.layers.layernorm import RMSNorm
diff --git a/vllm/model_executor/models/granite.py b/vllm/model_executor/models/granite.py
index cf26930ddb1c..cfb45444250b 100644
--- a/vllm/model_executor/models/granite.py
+++ b/vllm/model_executor/models/granite.py
@@ -35,7 +35,7 @@
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import SiluAndMul
-from vllm.model_executor.layers.attention.attention import Attention
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
     MergedColumnParallelLinear,
diff --git a/vllm/model_executor/models/granitemoe.py b/vllm/model_executor/models/granitemoe.py
index 2ee145936c51..16ca30ce3472 100644
--- a/vllm/model_executor/models/granitemoe.py
+++ b/vllm/model_executor/models/granitemoe.py
@@ -38,7 +38,7 @@
     get_tensor_model_parallel_world_size,
     tensor_model_parallel_all_gather,
 )
-from vllm.model_executor.layers.attention.attention import Attention
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
diff --git a/vllm/model_executor/models/granitemoehybrid.py b/vllm/model_executor/models/granitemoehybrid.py
index b4837455d025..66993e8ce1c9 100644
--- a/vllm/model_executor/models/granitemoehybrid.py
+++ b/vllm/model_executor/models/granitemoehybrid.py
@@ -13,7 +13,7 @@
 from vllm.config import CacheConfig, ModelConfig, VllmConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.distributed.parallel_state import get_pp_group
-from vllm.model_executor.layers.attention.attention import Attention
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import QKVParallelLinear, RowParallelLinear
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
diff --git a/vllm/model_executor/models/grok1.py b/vllm/model_executor/models/grok1.py
index a74464c11383..36889da414a2 100644
--- a/vllm/model_executor/models/grok1.py
+++ b/vllm/model_executor/models/grok1.py
@@ -37,7 +37,7 @@
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import GeluAndMul
-from vllm.model_executor.layers.attention.attention import Attention
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
diff --git a/vllm/model_executor/models/hunyuan_v1.py b/vllm/model_executor/models/hunyuan_v1.py
index 3e603e990cb0..4c2d021cba22 100644
--- a/vllm/model_executor/models/hunyuan_v1.py
+++ b/vllm/model_executor/models/hunyuan_v1.py
@@ -42,7 +42,7 @@
     tensor_model_parallel_all_reduce,
 )
 from vllm.model_executor.layers.activation import SiluAndMul
-from vllm.model_executor.layers.attention.attention import Attention
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.fused_moe import SharedFusedMoE
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
diff --git a/vllm/model_executor/models/hunyuan_vision.py b/vllm/model_executor/models/hunyuan_vision.py
index 9afb86a89f7d..bddf544e675b 100644
--- a/vllm/model_executor/models/hunyuan_vision.py
+++ b/vllm/model_executor/models/hunyuan_vision.py
@@ -39,7 +39,7 @@
 from vllm.distributed import utils as dist_utils
 from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import get_act_fn
-from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
+from vllm.model_executor.layers.attention import MMEncoderAttention
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
diff --git a/vllm/model_executor/models/idefics2_vision_model.py b/vllm/model_executor/models/idefics2_vision_model.py
index c78ad64790e8..441aabd7ec1e 100644
--- a/vllm/model_executor/models/idefics2_vision_model.py
+++ b/vllm/model_executor/models/idefics2_vision_model.py
@@ -29,7 +29,7 @@

 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import get_act_fn
-from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
+from vllm.model_executor.layers.attention import MMEncoderAttention
 from vllm.model_executor.layers.conv import Conv2dLayer
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
diff --git a/vllm/model_executor/models/intern_vit.py b/vllm/model_executor/models/intern_vit.py
index 3e3d60ceaf93..9688e12b61b3 100644
--- a/vllm/model_executor/models/intern_vit.py
+++ b/vllm/model_executor/models/intern_vit.py
@@ -23,7 +23,7 @@
     tensor_model_parallel_all_gather,
 )
 from vllm.model_executor.layers.activation import get_act_fn
-from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
+from vllm.model_executor.layers.attention import MMEncoderAttention
 from vllm.model_executor.layers.conv import Conv2dLayer
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
diff --git a/vllm/model_executor/models/internlm2.py b/vllm/model_executor/models/internlm2.py
index 670f6f76f254..e3702b371669 100644
--- a/vllm/model_executor/models/internlm2.py
+++ b/vllm/model_executor/models/internlm2.py
@@ -20,7 +20,7 @@
     tensor_model_parallel_all_gather,
 )
 from vllm.model_executor.layers.activation import SiluAndMul
-from vllm.model_executor.layers.attention.attention import Attention
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
     MergedColumnParallelLinear,
diff --git a/vllm/model_executor/models/interns1_vit.py b/vllm/model_executor/models/interns1_vit.py
index 2b2866d678a8..195bb96817f4 100644
--- a/vllm/model_executor/models/interns1_vit.py
+++ b/vllm/model_executor/models/interns1_vit.py
@@ -15,7 +15,7 @@
 from transformers.utils import torch_int

 from vllm.model_executor.layers.activation import get_act_fn
-from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
+from vllm.model_executor.layers.attention import MMEncoderAttention
 from vllm.model_executor.layers.conv import Conv2dLayer
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import ColumnParallelLinear, RowParallelLinear
diff --git a/vllm/model_executor/models/iquest_loopcoder.py b/vllm/model_executor/models/iquest_loopcoder.py
index 7efe5092b38c..644c0ab38112 100644
--- a/vllm/model_executor/models/iquest_loopcoder.py
+++ b/vllm/model_executor/models/iquest_loopcoder.py
@@ -27,7 +27,7 @@
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
-from vllm.model_executor.layers.attention.attention import Attention
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
diff --git a/vllm/model_executor/models/isaac.py b/vllm/model_executor/models/isaac.py
index ffcc2444627e..302078dea2a7 100644
--- a/vllm/model_executor/models/isaac.py
+++ b/vllm/model_executor/models/isaac.py
@@ -20,7 +20,7 @@
 from vllm.config.model import ModelConfig
 from vllm.distributed import parallel_state
 from vllm.distributed import utils as dist_utils
-from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
+from vllm.model_executor.layers.attention import MMEncoderAttention
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
     QKVParallelLinear,
diff --git a/vllm/model_executor/models/jais.py b/vllm/model_executor/models/jais.py
index 648c71fa89a6..3efa95a2b204 100644
--- a/vllm/model_executor/models/jais.py
+++ b/vllm/model_executor/models/jais.py
@@ -35,7 +35,7 @@
     get_tensor_model_parallel_rank,
     get_tensor_model_parallel_world_size,
 )
-from vllm.model_executor.layers.attention.attention import Attention
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
     QKVParallelLinear,
diff --git a/vllm/model_executor/models/jais2.py b/vllm/model_executor/models/jais2.py
index 448841444d19..52feabed2a79 100644
--- a/vllm/model_executor/models/jais2.py
+++ b/vllm/model_executor/models/jais2.py
@@ -38,7 +38,7 @@
     get_tensor_model_parallel_world_size,
 )
 from vllm.model_executor.layers.activation import ReLUSquaredActivation
-from vllm.model_executor.layers.attention.attention import Attention
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
     QKVParallelLinear,
diff --git a/vllm/model_executor/models/jamba.py b/vllm/model_executor/models/jamba.py
index 1beb04415a19..d202f351a0b4 100644
--- a/vllm/model_executor/models/jamba.py
+++ b/vllm/model_executor/models/jamba.py
@@ -13,7 +13,7 @@
 from vllm.config import CacheConfig, ModelConfig, VllmConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.distributed.parallel_state import get_pp_group
-from vllm.model_executor.layers.attention.attention import Attention
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
diff --git a/vllm/model_executor/models/keye.py b/vllm/model_executor/models/keye.py
index 8e6b6642591d..4ad34f817730 100644
--- a/vllm/model_executor/models/keye.py
+++ b/vllm/model_executor/models/keye.py
@@ -20,7 +20,7 @@
 from vllm.config.multimodal import BaseDummyOptions
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.logger import init_logger
-from vllm.model_executor.layers.attention.mm_encoder_attention import (
+from vllm.model_executor.layers.attention import (
     MMEncoderAttention,
 )
 from vllm.model_executor.layers.conv import Conv2dLayer
diff --git a/vllm/model_executor/models/lfm2.py b/vllm/model_executor/models/lfm2.py
index 5d8dfe0b46b3..642fb3a5e850 100644
--- a/vllm/model_executor/models/lfm2.py
+++ b/vllm/model_executor/models/lfm2.py
@@ -11,7 +11,7 @@
 from vllm.config import CacheConfig, ModelConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import SiluAndMul
-from vllm.model_executor.layers.attention.attention import Attention
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
     MergedColumnParallelLinear,
diff --git a/vllm/model_executor/models/lfm2_moe.py b/vllm/model_executor/models/lfm2_moe.py
index 828da18822fd..69e024974fea 100644
--- a/vllm/model_executor/models/lfm2_moe.py
+++ b/vllm/model_executor/models/lfm2_moe.py
@@ -14,7 +14,7 @@
     get_tensor_model_parallel_world_size,
 )
 from vllm.model_executor.layers.activation import SiluAndMul
-from vllm.model_executor.layers.attention.attention import Attention
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py
index 72dddae5b988..6a6eee38c23e 100644
--- a/vllm/model_executor/models/llama.py
+++ b/vllm/model_executor/models/llama.py
@@ -35,8 +35,8 @@
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import SiluAndMul
-from vllm.model_executor.layers.attention.attention import Attention
-from vllm.model_executor.layers.attention.encoder_only_attention import (
+from vllm.model_executor.layers.attention import (
+    Attention,
     EncoderOnlyAttention,
 )
 from vllm.model_executor.layers.layernorm import RMSNorm
diff --git a/vllm/model_executor/models/llama4.py b/vllm/model_executor/models/llama4.py
index 331901a80261..ecbdf6263bea 100644
--- a/vllm/model_executor/models/llama4.py
+++ b/vllm/model_executor/models/llama4.py
@@ -32,8 +32,8 @@
     tensor_model_parallel_all_gather,
 )
 from vllm.logger import init_logger
-from vllm.model_executor.layers.attention.attention import Attention
-from vllm.model_executor.layers.attention.chunked_local_attention import (
+from vllm.model_executor.layers.attention import (
+    Attention,
     ChunkedLocalAttention,
 )
 from vllm.model_executor.layers.fused_moe import SharedFusedMoE
diff --git a/vllm/model_executor/models/mimo_v2_flash.py b/vllm/model_executor/models/mimo_v2_flash.py
index 1d79f7ae9c10..eff3f1841590 100644
--- a/vllm/model_executor/models/mimo_v2_flash.py
+++ b/vllm/model_executor/models/mimo_v2_flash.py
@@ -21,7 +21,7 @@
 )
 from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import SiluAndMul
-from vllm.model_executor.layers.attention.attention import Attention
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
diff --git a/vllm/model_executor/models/minicpm.py b/vllm/model_executor/models/minicpm.py
index 8165ffcc10fb..35e5198bffd5 100644
--- a/vllm/model_executor/models/minicpm.py
+++ b/vllm/model_executor/models/minicpm.py
@@ -42,7 +42,7 @@
     tensor_model_parallel_all_reduce,
 )
 from vllm.model_executor.layers.activation import FatreluAndMul, SiluAndMul
-from vllm.model_executor.layers.attention.attention import Attention
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.fused_moe import fused_experts, fused_topk
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
diff --git a/vllm/model_executor/models/minicpm3.py b/vllm/model_executor/models/minicpm3.py
index a8a6ac00774e..e61e9d06103d 100644
--- a/vllm/model_executor/models/minicpm3.py
+++ b/vllm/model_executor/models/minicpm3.py
@@ -31,7 +31,7 @@

 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
-from vllm.model_executor.layers.attention.attention import Attention
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
diff --git a/vllm/model_executor/models/minimax_m2.py b/vllm/model_executor/models/minimax_m2.py
index ea170c370c90..e51ecacddea6 100644
--- a/vllm/model_executor/models/minimax_m2.py
+++ b/vllm/model_executor/models/minimax_m2.py
@@ -37,7 +37,7 @@
     get_tensor_model_parallel_world_size,
     tensor_model_parallel_all_reduce,
 )
-from vllm.model_executor.layers.attention.attention import Attention
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
diff --git a/vllm/model_executor/models/minimax_text_01.py b/vllm/model_executor/models/minimax_text_01.py
index 36260b1af954..c06cc492b9ee 100644
--- a/vllm/model_executor/models/minimax_text_01.py
+++ b/vllm/model_executor/models/minimax_text_01.py
@@ -23,7 +23,7 @@
 )
 from vllm.forward_context import get_forward_context
 from vllm.model_executor.layers.activation import SiluAndMul
-from vllm.model_executor.layers.attention.attention import Attention
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py
index 395dcc46b1b6..b82d3e38e3b1 100644
--- a/vllm/model_executor/models/mixtral.py
+++ b/vllm/model_executor/models/mixtral.py
@@ -39,7 +39,7 @@
     get_pp_group,
     get_tensor_model_parallel_world_size,
 )
-from vllm.model_executor.layers.attention.attention import Attention
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
diff --git a/vllm/model_executor/models/mllama4.py b/vllm/model_executor/models/mllama4.py
index fb66a03b8b22..682e44249d1c 100644
--- a/vllm/model_executor/models/mllama4.py
+++ b/vllm/model_executor/models/mllama4.py
@@ -34,7 +34,7 @@
 from vllm.config import VllmConfig
 from vllm.config.multimodal import BaseDummyOptions
 from vllm.distributed import get_tensor_model_parallel_world_size
-from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
+from vllm.model_executor.layers.attention import MMEncoderAttention
 from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
diff --git a/vllm/model_executor/models/modernbert.py b/vllm/model_executor/models/modernbert.py
index 77394803996d..eec0c6ed5a30 100644
--- a/vllm/model_executor/models/modernbert.py
+++ b/vllm/model_executor/models/modernbert.py
@@ -10,7 +10,7 @@
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import VllmConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
-from vllm.model_executor.layers.attention.encoder_only_attention import (
+from vllm.model_executor.layers.attention import (
     EncoderOnlyAttention,
 )
 from vllm.model_executor.layers.linear import QKVParallelLinear, RowParallelLinear
diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py
index e189ca7643a9..b729d490cfa9 100644
--- a/vllm/model_executor/models/molmo.py
+++ b/vllm/model_executor/models/molmo.py
@@ -28,8 +28,7 @@
     tensor_model_parallel_all_gather,
 )
 from vllm.model_executor.layers.activation import MulAndSilu, QuickGELU, SiluAndMul
-from vllm.model_executor.layers.attention.attention import Attention
-from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
+from vllm.model_executor.layers.attention import Attention, MMEncoderAttention
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
diff --git a/vllm/model_executor/models/moonvit.py b/vllm/model_executor/models/moonvit.py
index c675b2cd6594..0f84dbcf4ad1 100644
--- a/vllm/model_executor/models/moonvit.py
+++ b/vllm/model_executor/models/moonvit.py
@@ -54,7 +54,7 @@

 from vllm.config import MultiModalConfig
 from vllm.distributed import divide, get_tensor_model_parallel_world_size
-from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
+from vllm.model_executor.layers.attention import MMEncoderAttention
 from vllm.model_executor.layers.conv import Conv2dLayer
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
diff --git a/vllm/model_executor/models/mpt.py b/vllm/model_executor/models/mpt.py
index 2148e9004b52..9dbe78dee348 100644
--- a/vllm/model_executor/models/mpt.py
+++ b/vllm/model_executor/models/mpt.py
@@ -18,7 +18,7 @@
     get_tensor_model_parallel_world_size,
 )
 from vllm.model_executor.layers.activation import get_act_fn
-from vllm.model_executor.layers.attention.attention import Attention
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
     QKVParallelLinear,
diff --git a/vllm/model_executor/models/nemotron.py b/vllm/model_executor/models/nemotron.py
index 52981cc52ce8..cccbb9a9c9a4 100644
--- a/vllm/model_executor/models/nemotron.py
+++ b/vllm/model_executor/models/nemotron.py
@@ -34,7 +34,7 @@
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import get_act_fn
-from vllm.model_executor.layers.attention.attention import Attention
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.linear import (
     ColumnParallelLinear,
     QKVParallelLinear,
diff --git a/vllm/model_executor/models/nemotron_h.py b/vllm/model_executor/models/nemotron_h.py
index f05ae0f71320..d32975dd677f 100644
--- a/vllm/model_executor/models/nemotron_h.py
+++ b/vllm/model_executor/models/nemotron_h.py
@@ -32,7 +32,7 @@
 from vllm.distributed.communication_op import tensor_model_parallel_all_gather
 from vllm.distributed.parallel_state import get_pp_group
 from vllm.model_executor.layers.activation import ReLUSquaredActivation
-from vllm.model_executor.layers.attention.attention import Attention
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.fused_moe import FusedMoE, SharedFusedMoE
 from vllm.model_executor.layers.fused_moe.utils import activation_without_mul
 from vllm.model_executor.layers.layernorm import RMSNorm
diff --git a/vllm/model_executor/models/olmo.py b/vllm/model_executor/models/olmo.py
index b22086e02d05..3a2e45628994 100644
--- a/vllm/model_executor/models/olmo.py
+++ b/vllm/model_executor/models/olmo.py
@@ -35,7 +35,7 @@
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import SiluAndMul
-from vllm.model_executor.layers.attention.attention import Attention
+from vllm.model_executor.layers.attention import Attention
 from vllm.model_executor.layers.linear import (
     MergedColumnParallelLinear,
     QKVParallelLinear,
diff --git a/vllm/model_executor/models/olmo2.py b/vllm/model_executor/models/olmo2.py
index 0814d69e008c..7049032b9dd8 100644
--- a/vllm/model_executor/models/olmo2.py
+++ b/vllm/model_executor/models/olmo2.py
@@ -39,7 +39,7 @@
 from vllm.distributed.parallel_state import get_tensor_model_parallel_rank
 from vllm.distributed.utils import split_tensor_along_last_dim
 from vllm.model_executor.layers.activation import SiluAndMul
-from vllm.model_executor.layers.attention.attention import Attention +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( MergedColumnParallelLinear, diff --git a/vllm/model_executor/models/olmoe.py b/vllm/model_executor/models/olmoe.py index 3f8fe73b4abb..a9c4395f3173 100644 --- a/vllm/model_executor/models/olmoe.py +++ b/vllm/model_executor/models/olmoe.py @@ -31,7 +31,7 @@ ) from vllm.distributed.utils import split_tensor_along_last_dim from vllm.logger import init_logger -from vllm.model_executor.layers.attention.attention import Attention +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( diff --git a/vllm/model_executor/models/openpangu.py b/vllm/model_executor/models/openpangu.py index fe5886142d8b..1026827da588 100644 --- a/vllm/model_executor/models/openpangu.py +++ b/vllm/model_executor/models/openpangu.py @@ -40,8 +40,8 @@ tensor_model_parallel_all_gather, ) from vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.attention.attention import Attention -from vllm.model_executor.layers.attention.static_sink_attention import ( +from vllm.model_executor.layers.attention import ( + Attention, StaticSinkAttention, ) from vllm.model_executor.layers.fused_moe import SharedFusedMoE diff --git a/vllm/model_executor/models/opt.py b/vllm/model_executor/models/opt.py index bb6c02344ec3..8ab6ea4db0d4 100644 --- a/vllm/model_executor/models/opt.py +++ b/vllm/model_executor/models/opt.py @@ -31,7 +31,7 @@ from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import get_act_fn -from vllm.model_executor.layers.attention.attention import Attention +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.linear import ( ColumnParallelLinear, QKVParallelLinear, diff --git a/vllm/model_executor/models/orion.py b/vllm/model_executor/models/orion.py index 376992806f54..d16cfae8d9d8 100644 --- a/vllm/model_executor/models/orion.py +++ b/vllm/model_executor/models/orion.py @@ -19,7 +19,7 @@ from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.attention.attention import Attention +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.linear import ( MergedColumnParallelLinear, QKVParallelLinear, diff --git a/vllm/model_executor/models/ouro.py b/vllm/model_executor/models/ouro.py index cd6a61141112..f6b2ab3ef73c 100644 --- a/vllm/model_executor/models/ouro.py +++ b/vllm/model_executor/models/ouro.py @@ -37,7 +37,7 @@ from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.attention.attention import Attention +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( MergedColumnParallelLinear, diff --git a/vllm/model_executor/models/paddleocr_vl.py 
b/vllm/model_executor/models/paddleocr_vl.py index 530974f7fa8b..a6cd3e01df4d 100644 --- a/vllm/model_executor/models/paddleocr_vl.py +++ b/vllm/model_executor/models/paddleocr_vl.py @@ -34,7 +34,7 @@ from vllm.config.multimodal import BaseDummyOptions from vllm.distributed import parallel_state from vllm.distributed import utils as dist_utils -from vllm.model_executor.layers.attention.mm_encoder_attention import ( +from vllm.model_executor.layers.attention import ( MMEncoderAttention, ) from vllm.model_executor.layers.conv import Conv2dLayer diff --git a/vllm/model_executor/models/persimmon.py b/vllm/model_executor/models/persimmon.py index 9212211e3f8e..775e55635ede 100644 --- a/vllm/model_executor/models/persimmon.py +++ b/vllm/model_executor/models/persimmon.py @@ -34,7 +34,7 @@ from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import get_act_fn -from vllm.model_executor.layers.attention.attention import Attention +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.linear import ( ColumnParallelLinear, QKVParallelLinear, diff --git a/vllm/model_executor/models/phi.py b/vllm/model_executor/models/phi.py index 85fa02919d80..be8275d88214 100644 --- a/vllm/model_executor/models/phi.py +++ b/vllm/model_executor/models/phi.py @@ -49,7 +49,7 @@ from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import get_act_fn -from vllm.model_executor.layers.attention.attention import Attention +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.linear import ( ColumnParallelLinear, QKVParallelLinear, diff --git a/vllm/model_executor/models/phimoe.py b/vllm/model_executor/models/phimoe.py index ecd03c60a981..b6d4866f51a0 100644 --- a/vllm/model_executor/models/phimoe.py +++ b/vllm/model_executor/models/phimoe.py @@ -34,7 +34,7 @@ from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size -from vllm.model_executor.layers.attention.attention import Attention +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.linear import ( QKVParallelLinear, diff --git a/vllm/model_executor/models/plamo2.py b/vllm/model_executor/models/plamo2.py index 41b0daf0cfe6..c3982fb7aa8c 100644 --- a/vllm/model_executor/models/plamo2.py +++ b/vllm/model_executor/models/plamo2.py @@ -16,7 +16,7 @@ from vllm.forward_context import ForwardContext, get_forward_context from vllm.model_executor.custom_op import CustomOp from vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.attention.attention import Attention +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( ColumnParallelLinear, diff --git a/vllm/model_executor/models/plamo3.py b/vllm/model_executor/models/plamo3.py index 1c663e219e5e..ce3e015e7efe 100644 --- a/vllm/model_executor/models/plamo3.py +++ b/vllm/model_executor/models/plamo3.py @@ -15,7 +15,7 @@ from vllm.distributed import get_tensor_model_parallel_world_size from vllm.distributed.parallel_state import get_pp_group from 
vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.attention.attention import Attention +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( MergedColumnParallelLinear, diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py index ffead3127136..053032a817a9 100644 --- a/vllm/model_executor/models/qwen.py +++ b/vllm/model_executor/models/qwen.py @@ -20,7 +20,7 @@ from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.attention.attention import Attention +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( MergedColumnParallelLinear, diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index 6be09fea50a9..975f66770b1f 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -37,8 +37,8 @@ from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.attention.attention import Attention -from vllm.model_executor.layers.attention.encoder_only_attention import ( +from vllm.model_executor.layers.attention import ( + Attention, EncoderOnlyAttention, ) from vllm.model_executor.layers.layernorm import RMSNorm diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index 6e9e46368f26..616e16c03f35 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -48,7 +48,7 @@ from vllm.forward_context import set_forward_context from vllm.logger import init_logger from vllm.model_executor.layers.activation import get_act_and_mul_fn -from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention +from vllm.model_executor.layers.attention import MMEncoderAttention from vllm.model_executor.layers.conv import Conv3dLayer from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( diff --git a/vllm/model_executor/models/qwen2_moe.py b/vllm/model_executor/models/qwen2_moe.py index e36a5f04e75f..be9ca1d32d7f 100644 --- a/vllm/model_executor/models/qwen2_moe.py +++ b/vllm/model_executor/models/qwen2_moe.py @@ -39,7 +39,7 @@ from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.logger import init_logger from vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.attention.attention import Attention +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.fused_moe import SharedFusedMoE from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 3b0dce7fcd17..aa9f2492ccdf 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -49,7 +49,7 @@ from vllm.distributed import utils as dist_utils from vllm.logger import init_logger from vllm.model_executor.layers.activation import QuickGELU -from 
vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention +from vllm.model_executor.layers.attention import MMEncoderAttention from vllm.model_executor.layers.conv import Conv3dLayer from vllm.model_executor.layers.linear import ( ColumnParallelLinear, diff --git a/vllm/model_executor/models/qwen3.py b/vllm/model_executor/models/qwen3.py index 0c7f0dca1022..b570650726df 100644 --- a/vllm/model_executor/models/qwen3.py +++ b/vllm/model_executor/models/qwen3.py @@ -34,7 +34,7 @@ from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.logger import init_logger -from vllm.model_executor.layers.attention.attention import Attention +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import QKVParallelLinear, RowParallelLinear from vllm.model_executor.layers.logits_processor import LogitsProcessor diff --git a/vllm/model_executor/models/qwen3_moe.py b/vllm/model_executor/models/qwen3_moe.py index b63ee2c18c16..af16521030ec 100644 --- a/vllm/model_executor/models/qwen3_moe.py +++ b/vllm/model_executor/models/qwen3_moe.py @@ -41,7 +41,7 @@ ) from vllm.logger import init_logger from vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.attention.attention import Attention +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.fused_moe.config import RoutingMethodType from vllm.model_executor.layers.layernorm import RMSNorm diff --git a/vllm/model_executor/models/qwen3_next.py b/vllm/model_executor/models/qwen3_next.py index d5b27796f33b..f25c314490c4 100644 --- a/vllm/model_executor/models/qwen3_next.py +++ b/vllm/model_executor/models/qwen3_next.py @@ -28,7 +28,7 @@ ) from vllm.forward_context import ForwardContext, get_forward_context from vllm.logger import init_logger -from vllm.model_executor.layers.attention.attention import Attention +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.fla.ops import ( chunk_gated_delta_rule, fused_recurrent_gated_delta_rule, diff --git a/vllm/model_executor/models/seed_oss.py b/vllm/model_executor/models/seed_oss.py index 4440f8fb64b5..a923e17a0eaa 100644 --- a/vllm/model_executor/models/seed_oss.py +++ b/vllm/model_executor/models/seed_oss.py @@ -35,7 +35,7 @@ from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.logger import init_logger from vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.attention.attention import Attention +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( MergedColumnParallelLinear, diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py index c047415d4104..eecaad1b235d 100644 --- a/vllm/model_executor/models/siglip.py +++ b/vllm/model_executor/models/siglip.py @@ -19,10 +19,10 @@ from vllm.config.multimodal import BaseDummyOptions, MultiModalConfig from vllm.distributed import divide, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import get_act_fn -from vllm.model_executor.layers.attention.encoder_only_attention import ( +from vllm.model_executor.layers.attention import ( EncoderOnlyAttention, + 
MMEncoderAttention, ) -from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention from vllm.model_executor.layers.conv import Conv2dLayer from vllm.model_executor.layers.linear import ( ColumnParallelLinear, diff --git a/vllm/model_executor/models/siglip2.py b/vllm/model_executor/models/siglip2.py index 8fbc408ec23e..4d0ae5cbc3ab 100644 --- a/vllm/model_executor/models/siglip2.py +++ b/vllm/model_executor/models/siglip2.py @@ -14,7 +14,7 @@ from vllm.config import MultiModalConfig from vllm.distributed import get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import get_act_fn -from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention +from vllm.model_executor.layers.attention import MMEncoderAttention from vllm.model_executor.layers.linear import ( ColumnParallelLinear, QKVParallelLinear, diff --git a/vllm/model_executor/models/siglip2navit.py b/vllm/model_executor/models/siglip2navit.py index f4b79da5c3a4..97d25303eabb 100644 --- a/vllm/model_executor/models/siglip2navit.py +++ b/vllm/model_executor/models/siglip2navit.py @@ -14,7 +14,7 @@ from vllm.config import MultiModalConfig from vllm.distributed import divide, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import get_act_fn -from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention +from vllm.model_executor.layers.attention import MMEncoderAttention from vllm.model_executor.layers.conv import Conv2dLayer from vllm.model_executor.layers.linear import ( ColumnParallelLinear, diff --git a/vllm/model_executor/models/solar.py b/vllm/model_executor/models/solar.py index 0aa3fc25969d..3b51f7e3492e 100644 --- a/vllm/model_executor/models/solar.py +++ b/vllm/model_executor/models/solar.py @@ -34,7 +34,7 @@ from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.attention.attention import Attention +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( MergedColumnParallelLinear, diff --git a/vllm/model_executor/models/stablelm.py b/vllm/model_executor/models/stablelm.py index 3e05f0adcac4..37c0246ed875 100644 --- a/vllm/model_executor/models/stablelm.py +++ b/vllm/model_executor/models/stablelm.py @@ -32,7 +32,7 @@ from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.attention.attention import Attention +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.linear import ( MergedColumnParallelLinear, QKVParallelLinear, diff --git a/vllm/model_executor/models/starcoder2.py b/vllm/model_executor/models/starcoder2.py index da0d8ce21461..a120c12a2cdf 100644 --- a/vllm/model_executor/models/starcoder2.py +++ b/vllm/model_executor/models/starcoder2.py @@ -32,7 +32,7 @@ from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import get_act_fn -from vllm.model_executor.layers.attention.attention import Attention +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.linear 
import ( ColumnParallelLinear, QKVParallelLinear, diff --git a/vllm/model_executor/models/step3_text.py b/vllm/model_executor/models/step3_text.py index 40ebb6a522f3..01aea0cd9f8a 100644 --- a/vllm/model_executor/models/step3_text.py +++ b/vllm/model_executor/models/step3_text.py @@ -18,7 +18,7 @@ ) from vllm.logger import init_logger from vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.attention.attention import Attention +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( diff --git a/vllm/model_executor/models/step3_vl.py b/vllm/model_executor/models/step3_vl.py index 771e5974ae00..85eeeeee7b5f 100644 --- a/vllm/model_executor/models/step3_vl.py +++ b/vllm/model_executor/models/step3_vl.py @@ -19,7 +19,7 @@ from vllm.config.multimodal import BaseDummyOptions from vllm.distributed import get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import get_act_fn -from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention +from vllm.model_executor.layers.attention import MMEncoderAttention from vllm.model_executor.layers.conv import Conv2dLayer from vllm.model_executor.layers.linear import ( ColumnParallelLinear, diff --git a/vllm/model_executor/models/transformers/base.py b/vllm/model_executor/models/transformers/base.py index e8460a76093f..1fb4ca3f0237 100644 --- a/vllm/model_executor/models/transformers/base.py +++ b/vllm/model_executor/models/transformers/base.py @@ -31,8 +31,8 @@ from vllm.distributed import get_pp_group, get_tp_group from vllm.distributed.utils import get_pp_indices from vllm.logger import init_logger -from vllm.model_executor.layers.attention.attention import Attention -from vllm.model_executor.layers.attention.encoder_only_attention import ( +from vllm.model_executor.layers.attention import ( + Attention, EncoderOnlyAttention, ) from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding diff --git a/vllm/model_executor/models/whisper.py b/vllm/model_executor/models/whisper.py index b68e0e8f90c4..b0cf1754404a 100644 --- a/vllm/model_executor/models/whisper.py +++ b/vllm/model_executor/models/whisper.py @@ -24,9 +24,11 @@ from vllm.inputs.data import PromptType from vllm.logger import init_logger from vllm.model_executor.layers.activation import get_act_fn -from vllm.model_executor.layers.attention.attention import Attention -from vllm.model_executor.layers.attention.cross_attention import CrossAttention -from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention +from vllm.model_executor.layers.attention import ( + Attention, + CrossAttention, + MMEncoderAttention, +) from vllm.model_executor.layers.linear import ( ColumnParallelLinear, QKVParallelLinear, diff --git a/vllm/model_executor/models/whisper_utils.py b/vllm/model_executor/models/whisper_utils.py index db3144d2f075..356f7816e613 100644 --- a/vllm/model_executor/models/whisper_utils.py +++ b/vllm/model_executor/models/whisper_utils.py @@ -10,7 +10,7 @@ from torch import nn from vllm.config import CacheConfig, VllmConfig -from vllm.model_executor.layers.attention.attention import Attention +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.v1.attention.backend import ( AttentionBackend, diff --git 
a/vllm/model_executor/models/zamba2.py b/vllm/model_executor/models/zamba2.py index f23421610273..0508bda39eeb 100644 --- a/vllm/model_executor/models/zamba2.py +++ b/vllm/model_executor/models/zamba2.py @@ -20,7 +20,7 @@ from vllm.config import CacheConfig, ModelConfig, VllmConfig from vllm.distributed import get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import GeluAndMul -from vllm.model_executor.layers.attention.attention import Attention +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( ColumnParallelLinear, diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py index 8139ca87c360..f76acc4fc976 100755 --- a/vllm/v1/attention/backends/flash_attn.py +++ b/vllm/v1/attention/backends/flash_attn.py @@ -9,7 +9,7 @@ import numpy as np import torch -from vllm.model_executor.layers.attention.attention import Attention +from vllm.model_executor.layers.attention import Attention from vllm.v1.attention.backend import ( AttentionBackend, AttentionImpl, diff --git a/vllm/v1/attention/backends/rocm_aiter_fa.py b/vllm/v1/attention/backends/rocm_aiter_fa.py index 834c492a2b07..5b7894f22c84 100644 --- a/vllm/v1/attention/backends/rocm_aiter_fa.py +++ b/vllm/v1/attention/backends/rocm_aiter_fa.py @@ -9,7 +9,7 @@ from vllm.config import VllmConfig, get_layers_from_vllm_config from vllm.logger import init_logger -from vllm.model_executor.layers.attention.attention import Attention +from vllm.model_executor.layers.attention import Attention from vllm.platforms import current_platform from vllm.utils.math_utils import cdiv from vllm.utils.platform_utils import get_cu_count diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 3996cc32ce9e..8cd22f968e97 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -48,8 +48,7 @@ ) from vllm.logger import init_logger from vllm.lora.layers import LoRAMapping, LoRAMappingType -from vllm.model_executor.layers.attention.attention import Attention -from vllm.model_executor.layers.attention.mla_attention import MLAAttention +from vllm.model_executor.layers.attention import Attention, MLAAttention from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase from vllm.model_executor.layers.rotary_embedding import ( MRotaryEmbedding, diff --git a/vllm/v1/worker/utils.py b/vllm/v1/worker/utils.py index 93f901cf261a..a209a07e1dc7 100644 --- a/vllm/v1/worker/utils.py +++ b/vllm/v1/worker/utils.py @@ -9,7 +9,7 @@ from vllm.config import CacheConfig, ModelConfig, SchedulerConfig, VllmConfig from vllm.logger import init_logger -from vllm.model_executor.layers.attention.attention import Attention +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.models.interfaces import MultiModalEmbeddings from vllm.model_executor.models.utils import extract_layer_index from vllm.multimodal.cache import processor_only_cache_from_config From d5a9b0e9bfc668e6c43caa9d027e6147699ef211 Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Tue, 13 Jan 2026 10:26:57 -0500 Subject: [PATCH 13/18] Fix circular imports Signed-off-by: Matthew Bonanni --- vllm/model_executor/layers/attention/kv_transfer_utils.py | 2 +- vllm/model_executor/layers/attention/mla_attention.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/layers/attention/kv_transfer_utils.py 
b/vllm/model_executor/layers/attention/kv_transfer_utils.py index e9bb27e512a4..9ee6b4d0f5b8 100644 --- a/vllm/model_executor/layers/attention/kv_transfer_utils.py +++ b/vllm/model_executor/layers/attention/kv_transfer_utils.py @@ -19,7 +19,7 @@ def maybe_transfer_kv_layer(func: Callable) -> Callable: On exit: saves the KV layer to the connector. """ # Import at runtime to avoid circular dependency - from vllm.model_executor.layers.attention import get_attention_context + from vllm.model_executor.layers.attention.attention import get_attention_context # Inspect the signature ONCE when the decorator is applied. sig = inspect.signature(func) diff --git a/vllm/model_executor/layers/attention/mla_attention.py b/vllm/model_executor/layers/attention/mla_attention.py index 3be052cdc3b4..3c5ee3773d7d 100644 --- a/vllm/model_executor/layers/attention/mla_attention.py +++ b/vllm/model_executor/layers/attention/mla_attention.py @@ -11,7 +11,7 @@ from vllm.config.vllm import VllmConfig from vllm.forward_context import ForwardContext, get_forward_context from vllm.logger import init_logger -from vllm.model_executor.layers.attention import ( +from vllm.model_executor.layers.attention.attention import ( _init_kv_cache_quant, get_attention_context, set_default_quant_scales, From 3c914760c0cb3ab88a31cc830c665bc225de43fe Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Tue, 13 Jan 2026 10:29:49 -0500 Subject: [PATCH 14/18] Move maybe_calc_kv_scales below Attention Signed-off-by: Matthew Bonanni --- .../layers/attention/attention.py | 68 +++++++++---------- 1 file changed, 34 insertions(+), 34 deletions(-) diff --git a/vllm/model_executor/layers/attention/attention.py b/vllm/model_executor/layers/attention/attention.py index b09ba6a2da34..f1d4d328b5cd 100644 --- a/vllm/model_executor/layers/attention/attention.py +++ b/vllm/model_executor/layers/attention/attention.py @@ -176,40 +176,6 @@ def _init_kv_cache_quant( layer.quant_method.create_weights(layer) -def maybe_calc_kv_scales( - query: torch.Tensor, - key: torch.Tensor, - value: torch.Tensor, - layer_name: str, -) -> None: - forward_context: ForwardContext = get_forward_context() - self = forward_context.no_compile_layers[layer_name] - - # Only calculate if the layer's calculate_kv_scales flag is True - # This flag gets set to False after the first forward pass - if not self.calculate_kv_scales: - return - - self.calc_kv_scales(query, key, value) - - -def maybe_calc_kv_scales_fake( - query: torch.Tensor, - key: torch.Tensor, - value: torch.Tensor, - layer_name: str, -) -> None: - return - - -direct_register_custom_op( - op_name="maybe_calc_kv_scales", - op_func=maybe_calc_kv_scales, - mutates_args=["query", "key", "value"], - fake_impl=maybe_calc_kv_scales_fake, -) - - class Attention(nn.Module, AttentionLayerBase): """Attention layer. 
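Note on this move: the block removed above is re-added verbatim in the next hunk, below the Attention class definition. It is vLLM's custom-op idiom — an eager implementation that mutates layer state, a no-op fake implementation for tracing, and a direct_register_custom_op call exposing both to the dispatcher. Call sites dispatch through torch.ops.vllm rather than calling the Python function directly, which keeps the mutation opaque to torch.compile. A minimal sketch of such a call site (hypothetical — the forward path is not part of this diff):

    import torch
    import torch.nn as nn

    class AttentionSketch(nn.Module):
        # Hypothetical stand-in for the real Attention forward path.
        def __init__(self, layer_name: str) -> None:
            super().__init__()
            self.layer_name = layer_name

        def forward(self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor) -> None:
            # Dispatch through the op table that direct_register_custom_op
            # populates; the calculate_kv_scales check lives inside the op
            # body, so compiled graphs stay valid after the flag flips off.
            torch.ops.vllm.maybe_calc_kv_scales(q, k, v, self.layer_name)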
@@ -532,6 +498,40 @@ def get_kv_cache_spec(self, vllm_config: VllmConfig) -> KVCacheSpec: ) +def maybe_calc_kv_scales( + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + layer_name: str, +) -> None: + forward_context: ForwardContext = get_forward_context() + self = forward_context.no_compile_layers[layer_name] + + # Only calculate if the layer's calculate_kv_scales flag is True + # This flag gets set to False after the first forward pass + if not self.calculate_kv_scales: + return + + self.calc_kv_scales(query, key, value) + + +def maybe_calc_kv_scales_fake( + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + layer_name: str, +) -> None: + return + + +direct_register_custom_op( + op_name="maybe_calc_kv_scales", + op_func=maybe_calc_kv_scales, + mutates_args=["query", "key", "value"], + fake_impl=maybe_calc_kv_scales_fake, +) + + def get_attention_context( layer_name: str, ) -> tuple[dict | object | None, "Attention | MLAAttention", torch.Tensor]: From 1e6ebc3ee55efa4eeb8c09ab97d089561596abd4 Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Tue, 13 Jan 2026 10:38:03 -0500 Subject: [PATCH 15/18] Add comment Signed-off-by: Matthew Bonanni --- .buildkite/test-amd.yaml | 1 + .buildkite/test-pipeline.yaml | 1 + .buildkite/test_areas/kernels.yaml | 1 + 3 files changed, 3 insertions(+) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index f97987dc9e52..5c83ba020d97 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -635,6 +635,7 @@ steps: source_file_dependencies: - csrc/attention/ - vllm/v1/attention + # TODO: remove this dependency (https://github.com/vllm-project/vllm/issues/32267) - vllm/model_executor/layers/attention - tests/kernels/attention commands: diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index b82ed956b21d..2e17904de941 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -563,6 +563,7 @@ steps: source_file_dependencies: - csrc/attention/ - vllm/v1/attention + # TODO: remove this dependency (https://github.com/vllm-project/vllm/issues/32267) - vllm/model_executor/layers/attention - tests/kernels/attention commands: diff --git a/.buildkite/test_areas/kernels.yaml b/.buildkite/test_areas/kernels.yaml index 29822d251af4..06a94219e992 100644 --- a/.buildkite/test_areas/kernels.yaml +++ b/.buildkite/test_areas/kernels.yaml @@ -16,6 +16,7 @@ steps: source_file_dependencies: - csrc/attention/ - vllm/v1/attention + # TODO: remove this dependency (https://github.com/vllm-project/vllm/issues/32267) - vllm/model_executor/layers/attention - tests/kernels/attention commands: From 84a7c03dc800af77c9478e6d641466cfe76fcdab Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Wed, 14 Jan 2026 23:20:35 +0000 Subject: [PATCH 16/18] Lazy import flashinfer Signed-off-by: Matthew Bonanni --- .../layers/attention/mla_attention.py | 34 ++++++++++++------- 1 file changed, 21 insertions(+), 13 deletions(-) diff --git a/vllm/model_executor/layers/attention/mla_attention.py b/vllm/model_executor/layers/attention/mla_attention.py index 8a43c275837e..19ae8dc313d8 100644 --- a/vllm/model_executor/layers/attention/mla_attention.py +++ b/vllm/model_executor/layers/attention/mla_attention.py @@ -191,7 +191,10 @@ from abc import abstractmethod from dataclasses import dataclass, field from enum import Enum -from typing import ClassVar, Generic, TypeVar, cast +from typing import TYPE_CHECKING, ClassVar, Generic, TypeVar, cast + +if TYPE_CHECKING: + from flashinfer import 
BatchPrefillWithRaggedKVCacheWrapper import torch import torch.nn as nn @@ -596,15 +599,12 @@ class QueryLenSupport(Enum): from flash_attn import flash_attn_varlen_func # type: ignore[no-redef] is_vllm_fa = False -try: - from flashinfer import BatchPrefillWithRaggedKVCacheWrapper - from flashinfer.prefill import cudnn_batch_prefill_with_kv_cache # noqa: F401 - flashinfer_available = True -except ImportError: - BatchPrefillWithRaggedKVCacheWrapper = object +@functools.cache +def flashinfer_available() -> bool: + import importlib.util - flashinfer_available = False + return importlib.util.find_spec("flashinfer") is not None def dynamic_per_batched_tensor_quant( @@ -697,8 +697,8 @@ class ChunkedContextMetadata: @dataclass class FlashInferPrefillMetadata(MLACommonPrefillMetadata): - prefill_main: BatchPrefillWithRaggedKVCacheWrapper | None = None - prefill_chunks: list[BatchPrefillWithRaggedKVCacheWrapper] = field( + prefill_main: "BatchPrefillWithRaggedKVCacheWrapper | None" = None + prefill_chunks: "list[BatchPrefillWithRaggedKVCacheWrapper]" = field( default_factory=list ) @@ -781,7 +781,7 @@ def use_flashinfer_prefill() -> bool: vllm_config = get_current_vllm_config() return ( not vllm_config.attention_config.disable_flashinfer_prefill - and flashinfer_available + and flashinfer_available() and not vllm_config.attention_config.use_cudnn_prefill and not vllm_config.attention_config.use_trtllm_ragged_deepseek_prefill and current_platform.is_device_capability_family(100) @@ -793,7 +793,7 @@ def use_cudnn_prefill() -> bool: vllm_config = get_current_vllm_config() return ( - flashinfer_available + flashinfer_available() and vllm_config.attention_config.use_cudnn_prefill and current_platform.is_device_capability_family(100) and has_nvidia_artifactory() @@ -806,7 +806,7 @@ def use_trtllm_ragged_deepseek_prefill() -> bool: vllm_config = get_current_vllm_config() return ( - flashinfer_available + flashinfer_available() and vllm_config.attention_config.use_trtllm_ragged_deepseek_prefill and current_platform.is_device_capability_family(100) ) @@ -1012,6 +1012,8 @@ def _build_fi_prefill_wrappers(self, prefill: FlashInferPrefillMetadata): has_context = True if self._fi_prefill_main is None: + from flashinfer import BatchPrefillWithRaggedKVCacheWrapper + self._fi_prefill_main = BatchPrefillWithRaggedKVCacheWrapper( self._workspace_buffer, "NHD", backend="cutlass" ) @@ -1020,6 +1022,8 @@ def _build_fi_prefill_wrappers(self, prefill: FlashInferPrefillMetadata): num_chunks = chunked_context.cu_seq_lens.shape[0] # Allocate more prefill chunk wrappers if needed if len(self._fi_prefill_chunks) < num_chunks: + from flashinfer import BatchPrefillWithRaggedKVCacheWrapper + for _ in range(len(self._fi_prefill_chunks), num_chunks): self._fi_prefill_chunks.append( BatchPrefillWithRaggedKVCacheWrapper( @@ -1757,6 +1761,8 @@ def _run_prefill_new_tokens_cudnn( ): assert isinstance(prefill, CudnnPrefillMetadata) assert prefill.query_seq_lens is not None + from flashinfer.prefill import cudnn_batch_prefill_with_kv_cache + output, lse = cudnn_batch_prefill_with_kv_cache( q=q, k_cache=k, @@ -1816,6 +1822,8 @@ def _run_prefill_context_chunk_cudnn( assert prefill.chunked_context is not None assert prefill.chunked_context.seq_lens[chunk_idx] is not None assert prefill.query_seq_lens is not None + from flashinfer.prefill import cudnn_batch_prefill_with_kv_cache + return cudnn_batch_prefill_with_kv_cache( q=q, k_cache=k, From 8545dfd6e7ff1af2c8363a97adbc312becba9c69 Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Thu, 15 
Jan 2026 10:15:47 -0500 Subject: [PATCH 17/18] Fix molmo Signed-off-by: Matthew Bonanni --- vllm/model_executor/models/molmo2.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vllm/model_executor/models/molmo2.py b/vllm/model_executor/models/molmo2.py index bf4a298a75e4..a894d3931a78 100644 --- a/vllm/model_executor/models/molmo2.py +++ b/vllm/model_executor/models/molmo2.py @@ -23,7 +23,6 @@ from transformers.tokenization_utils_base import TextInput from transformers.video_utils import VideoInput, VideoMetadata -from vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.config.multimodal import BaseDummyOptions, VideoDummyOptions @@ -36,7 +35,7 @@ ) from vllm.logger import init_logger from vllm.model_executor.layers.activation import MulAndSilu, SiluAndMul, get_act_fn -from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention +from vllm.model_executor.layers.attention import Attention, MMEncoderAttention from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( ColumnParallelLinear, From cfb5220643db098ed879d552c43047e3f3624c99 Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Mon, 26 Jan 2026 10:25:48 -0500 Subject: [PATCH 18/18] Fix imports Signed-off-by: Matthew Bonanni --- tests/v1/worker/test_utils.py | 2 +- vllm/model_executor/models/step1.py | 3 ++- vllm/model_executor/models/whisper_causal.py | 2 +- vllm/v1/spec_decode/draft_model.py | 2 +- 4 files changed, 5 insertions(+), 4 deletions(-) diff --git a/tests/v1/worker/test_utils.py b/tests/v1/worker/test_utils.py index 91254fc23904..76f9a8f90f70 100644 --- a/tests/v1/worker/test_utils.py +++ b/tests/v1/worker/test_utils.py @@ -58,7 +58,7 @@ def test_bind_kv_cache_non_attention(default_vllm_config): def test_bind_kv_cache_draft_model(default_vllm_config): - from vllm.attention.layer import Attention + from vllm.model_executor.layers.attention import Attention layer_names = [ "model.layers.0.attn", diff --git a/vllm/model_executor/models/step1.py b/vllm/model_executor/models/step1.py index 8e655c691e0a..4173b9ebf31d 100644 --- a/vllm/model_executor/models/step1.py +++ b/vllm/model_executor/models/step1.py @@ -10,7 +10,6 @@ import torch from torch import nn -from vllm.attention.layer import Attention, AttentionType from vllm.config import CacheConfig, VllmConfig from vllm.distributed import ( get_pp_group, @@ -18,6 +17,7 @@ get_tensor_model_parallel_world_size, ) from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( MergedColumnParallelLinear, @@ -41,6 +41,7 @@ maybe_prefix, ) from vllm.sequence import IntermediateTensors +from vllm.v1.attention.backend import AttentionType STEP_PACKED_MODULES_MAPPING = { "qkv_proj": ["q_proj", "k_proj", "v_proj"], diff --git a/vllm/model_executor/models/whisper_causal.py b/vllm/model_executor/models/whisper_causal.py index c547d5d3f303..489bde33f11a 100644 --- a/vllm/model_executor/models/whisper_causal.py +++ b/vllm/model_executor/models/whisper_causal.py @@ -10,9 +10,9 @@ import torch.nn.functional as F from torch import nn -from vllm.attention.layer import Attention from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_tensor_model_parallel_world_size +from vllm.model_executor.layers.attention import 
Attention from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( QKVParallelLinear, diff --git a/vllm/v1/spec_decode/draft_model.py b/vllm/v1/spec_decode/draft_model.py index 5a54074dd756..7d631aa89e04 100644 --- a/vllm/v1/spec_decode/draft_model.py +++ b/vllm/v1/spec_decode/draft_model.py @@ -4,10 +4,10 @@ import torch -from vllm.attention.layer import Attention from vllm.config import VllmConfig, get_layers_from_vllm_config from vllm.config.speculative import SpeculativeConfig from vllm.logger import init_logger +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.model_loader import get_model from vllm.triton_utils import tl, triton from vllm.v1.attention.backends.utils import (
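Taken together, the "Lazy import flashinfer" changes in PATCH 16 follow the standard optional-dependency pattern: a type-only import under TYPE_CHECKING, a cached availability probe built on importlib.util.find_spec, and real imports deferred to first use. A condensed sketch of the same pattern — PrefillRunner is an illustrative class, not part of the patch; the flashinfer names and the "NHD"/cutlass arguments are the ones the patch uses:

    import functools
    from typing import TYPE_CHECKING

    import torch

    if TYPE_CHECKING:
        # Type-only import: seen by checkers, never executed at runtime, so
        # annotations can name the class without importing flashinfer.
        from flashinfer import BatchPrefillWithRaggedKVCacheWrapper

    @functools.cache
    def flashinfer_available() -> bool:
        # find_spec probes the import machinery without importing the
        # package; functools.cache makes repeated feature checks free.
        import importlib.util
        return importlib.util.find_spec("flashinfer") is not None

    class PrefillRunner:
        # Hypothetical owner of a lazily constructed prefill wrapper.
        def __init__(self, workspace_buffer: torch.Tensor) -> None:
            self._workspace_buffer = workspace_buffer
            self._wrapper: "BatchPrefillWithRaggedKVCacheWrapper | None" = None

        def ensure_wrapper(self) -> None:
            if self._wrapper is None:
                # Real import deferred to first use, mirroring
                # _build_fi_prefill_wrappers in the patch.
                from flashinfer import BatchPrefillWithRaggedKVCacheWrapper
                self._wrapper = BatchPrefillWithRaggedKVCacheWrapper(
                    self._workspace_buffer, "NHD", backend="cutlass"
                )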