Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
3f2691a
Move kv_transfer_utils
MatthewBonanni Jan 9, 2026
1cb4ce3
Start splitting up layer.py
MatthewBonanni Jan 9, 2026
d55d99a
Move content out of kv_sharing_utils.py and delete file
MatthewBonanni Jan 12, 2026
d182894
Update imports
MatthewBonanni Jan 12, 2026
47f80e9
Move helpers, delete layer.py
MatthewBonanni Jan 12, 2026
ddda80c
Delete vllm/attention
MatthewBonanni Jan 12, 2026
99e5293
Update and remove old references to vllm/attention
MatthewBonanni Jan 12, 2026
9942dff
Add dependency
MatthewBonanni Jan 12, 2026
3111dd0
Add to AMD
MatthewBonanni Jan 12, 2026
8b56809
Add to test-pipeline.yaml
MatthewBonanni Jan 13, 2026
5eb3a47
Add imports to __init__
MatthewBonanni Jan 13, 2026
877c5e9
Update imports
MatthewBonanni Jan 13, 2026
fc11d9d
Merge branch 'main' into attention_restructure_5
MatthewBonanni Jan 13, 2026
d5a9b0e
Fix circular imports
MatthewBonanni Jan 13, 2026
3c91476
Move maybe_calc_kv_scales below Attention
MatthewBonanni Jan 13, 2026
1e6ebc3
Add comment
MatthewBonanni Jan 13, 2026
4aee556
Merge branch 'main' into attention_restructure_5
MatthewBonanni Jan 13, 2026
84a7c03
Lazy import flashinfer
MatthewBonanni Jan 14, 2026
ed5bd4b
Merge branch 'main' into attention_restructure_5
MatthewBonanni Jan 15, 2026
8545dfd
Fix molmo
MatthewBonanni Jan 15, 2026
fb5b97e
Merge branch 'main' into attention_restructure_5
MatthewBonanni Jan 15, 2026
3b6eef3
Merge branch 'main' into attention_restructure_5
MatthewBonanni Jan 16, 2026
99b8a64
Merge branch 'main' into attention_restructure_5
MatthewBonanni Jan 26, 2026
cfb5220
Fix imports
MatthewBonanni Jan 26, 2026
3ed57e7
Merge branch 'main' into attention_restructure_5
MatthewBonanni Jan 26, 2026
e00ccbc
Merge branch 'main' into attention_restructure_5
MatthewBonanni Jan 26, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .buildkite/test-amd.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -640,8 +640,9 @@ steps:
# grade: Blocking
source_file_dependencies:
- csrc/attention/
- vllm/attention
- vllm/v1/attention
# TODO: remove this dependency (https://github.com/vllm-project/vllm/issues/32267)
- vllm/model_executor/layers/attention
- tests/kernels/attention
commands:
- pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
Expand Down
3 changes: 2 additions & 1 deletion .buildkite/test-pipeline.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -568,8 +568,9 @@ steps:
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- csrc/attention/
- vllm/attention
- vllm/v1/attention
# TODO: remove this dependency (https://github.com/vllm-project/vllm/issues/32267)
- vllm/model_executor/layers/attention
- tests/kernels/attention
commands:
- pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
Expand Down
3 changes: 2 additions & 1 deletion .buildkite/test_areas/kernels.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,9 @@ steps:
timeout_in_minutes: 35
source_file_dependencies:
- csrc/attention/
- vllm/attention
- vllm/v1/attention
# TODO: remove this dependency (https://github.com/vllm-project/vllm/issues/32267)
- vllm/model_executor/layers/attention
- tests/kernels/attention
commands:
- pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
Expand Down
2 changes: 1 addition & 1 deletion .github/CODEOWNERS
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
# for more info about CODEOWNERS file

# This list covers the "core" components of vLLM that require careful review
/vllm/attention @LucasWilkinson
/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @njhill @22quinn
/vllm/model_executor/layers/attention @LucasWilkinson
/vllm/model_executor/layers/fused_moe @mgoin @pavanimajety
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256 @pavanimajety
/vllm/model_executor/layers/mamba @tdoublep
Expand Down
2 changes: 1 addition & 1 deletion docs/contributing/model/basic.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ The initialization code should look like this:
```python
from torch import nn
from vllm.config import VllmConfig
from vllm.attention.layer import Attention
from vllm.model_executor.layers.attention import Attention

class MyAttention(nn.Module):
def __init__(self, vllm_config: VllmConfig, prefix: str):
Expand Down
2 changes: 1 addition & 1 deletion docs/design/custom_op.md
Original file line number Diff line number Diff line change
Expand Up @@ -271,7 +271,7 @@ Taking `MMEncoderAttention` as an example:
??? code

```python
from vllm.attention.layers.mm_encoder_attention import MMEncoderAttention
from vllm.model_executor.layers.attention import MMEncoderAttention
from vllm.model_executor.custom_op import CustomOp


Expand Down
2 changes: 1 addition & 1 deletion tests/compile/test_fusion_attn.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@
from tests.utils import flat_product
from tests.v1.attention.utils import BatchSpec, create_common_attn_metadata
from vllm._custom_ops import cutlass_scaled_fp4_mm, scaled_fp4_quant
from vllm.attention.layer import Attention
from vllm.compilation.fusion_attn import ATTN_OP, AttnFusionPass
from vllm.compilation.fx_utils import find_op_nodes
from vllm.compilation.matcher_utils import QUANT_OPS
Expand All @@ -40,6 +39,7 @@
set_current_vllm_config,
)
from vllm.forward_context import get_forward_context, set_forward_context
from vllm.model_executor.layers.attention import Attention
from vllm.model_executor.layers.quantization.utils.quant_utils import (
QuantKey,
kFp8StaticTensorSym,
Expand Down
2 changes: 1 addition & 1 deletion tests/compile/test_qk_norm_rope_fusion.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
import torch

from tests.compile.backend import TestBackend
from vllm.attention.layer import Attention
from vllm.compilation.matcher_utils import FLASHINFER_ROTARY_OP, RMS_OP, ROTARY_OP
from vllm.compilation.noop_elimination import NoOpEliminationPass
from vllm.compilation.post_cleanup import PostCleanupPass
Expand All @@ -21,6 +20,7 @@
VllmConfig,
set_current_vllm_config,
)
from vllm.model_executor.layers.attention import Attention
from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding
from vllm.platforms import current_platform
Expand Down
3 changes: 1 addition & 2 deletions tests/kernels/attention/test_attention.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,7 @@
from tests.kernels.allclose_default import get_default_atol, get_default_rtol
from tests.kernels.utils import opcheck
from vllm import _custom_ops as ops
from vllm.attention.layer import Attention
from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
from vllm.model_executor.layers.attention import Attention, MMEncoderAttention
from vllm.platforms import current_platform
from vllm.utils.mem_utils import get_max_shared_memory_bytes
from vllm.utils.torch_utils import set_random_seed
Expand Down
2 changes: 1 addition & 1 deletion tests/kernels/attention/test_mha_attn.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
import pytest
import torch

from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
from vllm.model_executor.layers.attention import MMEncoderAttention
from vllm.platforms import current_platform
from vllm.platforms.cpu import CpuPlatform
from vllm.platforms.cuda import CudaPlatform
Expand Down
2 changes: 1 addition & 1 deletion tests/v1/worker/test_gpu_model_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
import pytest
import torch

from vllm.attention.layer import Attention
from vllm.config import (
AttentionConfig,
CacheConfig,
Expand All @@ -19,6 +18,7 @@
init_distributed_environment,
initialize_model_parallel,
)
from vllm.model_executor.layers.attention import Attention
from vllm.model_executor.layers.mamba.mamba_mixer2 import MambaMixer2
from vllm.platforms import current_platform
from vllm.sampling_params import SamplingParams
Expand Down
6 changes: 3 additions & 3 deletions tests/v1/worker/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@


def test_bind_kv_cache(default_vllm_config):
from vllm.attention.layer import Attention
from vllm.model_executor.layers.attention import Attention

ctx = {
"layers.0.self_attn": Attention(32, 128, 0.1, prefix="layers.0.self_attn"),
Expand Down Expand Up @@ -35,7 +35,7 @@ def test_bind_kv_cache(default_vllm_config):


def test_bind_kv_cache_non_attention(default_vllm_config):
from vllm.attention.layer import Attention
from vllm.model_executor.layers.attention import Attention

# example from Jamba PP=2
ctx = {
Expand All @@ -58,7 +58,7 @@ def test_bind_kv_cache_non_attention(default_vllm_config):


def test_bind_kv_cache_draft_model(default_vllm_config):
from vllm.attention.layer import Attention
from vllm.model_executor.layers.attention import Attention

layer_names = [
"model.layers.0.attn",
Expand Down
1 change: 0 additions & 1 deletion tools/pre_commit/mypy.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,6 @@
SEPARATE_GROUPS = [
"tests",
# v0 related
"vllm/attention",
"vllm/compilation",
"vllm/lora",
"vllm/model_executor",
Expand Down
Empty file removed vllm/attention/__init__.py
Empty file.
Empty file removed vllm/attention/utils/__init__.py
Empty file.
33 changes: 0 additions & 33 deletions vllm/attention/utils/kv_sharing_utils.py

This file was deleted.

2 changes: 1 addition & 1 deletion vllm/compilation/fusion_attn.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,9 @@
from torch._higher_order_ops.auto_functionalize import auto_functionalized
from torch._inductor.pattern_matcher import PatternMatcherPass

from vllm.attention.layer import Attention
from vllm.config import VllmConfig, get_layers_from_vllm_config
from vllm.logger import init_logger
from vllm.model_executor.layers.attention import Attention
from vllm.model_executor.layers.quantization.utils.quant_utils import (
QuantKey,
kNvfp4Dynamic,
Expand Down
2 changes: 1 addition & 1 deletion vllm/compilation/qk_norm_rope_fusion.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,9 @@
from torch._higher_order_ops.auto_functionalize import auto_functionalized
from torch._inductor.pattern_matcher import PatternMatcherPass

from vllm.attention.layer import Attention
from vllm.config import VllmConfig, get_layers_from_vllm_config
from vllm.logger import init_logger
from vllm.model_executor.layers.attention import Attention
from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding

from .fusion import empty_bf16, empty_fp32, empty_i64
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@

import torch

from vllm.attention.layer import Attention
from vllm.config import VllmConfig, get_layers_from_vllm_config
from vllm.distributed.kv_events import BlockRemoved, BlockStored, KVCacheEvent
from vllm.distributed.kv_transfer.kv_connector.utils import yield_req_data
Expand All @@ -19,6 +18,7 @@
from vllm.distributed.kv_transfer.kv_connector.v1.base import KVConnectorMetadata
from vllm.forward_context import ForwardContext
from vllm.logger import init_logger
from vllm.model_executor.layers.attention import Attention
from vllm.v1.attention.backend import AttentionBackend, AttentionMetadata
from vllm.v1.core.kv_cache_manager import KVCacheBlocks
from vllm.v1.core.kv_cache_utils import BlockHash
Expand Down
26 changes: 26 additions & 0 deletions vllm/model_executor/layers/attention/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from vllm.model_executor.layers.attention.attention import Attention
from vllm.model_executor.layers.attention.chunked_local_attention import (
ChunkedLocalAttention,
)
from vllm.model_executor.layers.attention.cross_attention import CrossAttention
from vllm.model_executor.layers.attention.encoder_only_attention import (
EncoderOnlyAttention,
)
from vllm.model_executor.layers.attention.mla_attention import MLAAttention
from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
from vllm.model_executor.layers.attention.static_sink_attention import (
StaticSinkAttention,
)

__all__ = [
"Attention",
"ChunkedLocalAttention",
"CrossAttention",
"EncoderOnlyAttention",
"MLAAttention",
"MMEncoderAttention",
"StaticSinkAttention",
]
Loading