Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .buildkite/test-amd.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1105,8 +1105,8 @@ steps:
- vllm/v1/attention/backends/flashinfer.py
- vllm/v1/attention/backends/mla/cutlass_mla.py
- vllm/v1/attention/backends/mla/flashinfer_mla.py
- vllm/v1/attention/selector.py
- vllm/platforms/cuda.py
- vllm/attention/selector.py
commands:
- nvidia-smi
- python3 examples/offline_inference/basic/chat.py
Expand Down
2 changes: 1 addition & 1 deletion .buildkite/test-pipeline.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -954,8 +954,8 @@ steps:
- vllm/v1/attention/backends/flashinfer.py
- vllm/v1/attention/backends/mla/cutlass_mla.py
- vllm/v1/attention/backends/mla/flashinfer_mla.py
- vllm/v1/attention/selector.py
- vllm/platforms/cuda.py
- vllm/attention/selector.py
commands:
- nvidia-smi
- python3 examples/offline_inference/basic/chat.py
Expand Down
2 changes: 1 addition & 1 deletion .buildkite/test_areas/kernels.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -90,8 +90,8 @@ steps:
- vllm/v1/attention/backends/flashinfer.py
- vllm/v1/attention/backends/mla/cutlass_mla.py
- vllm/v1/attention/backends/mla/flashinfer_mla.py
- vllm/v1/attention/selector.py
- vllm/platforms/cuda.py
- vllm/attention/selector.py
commands:
- nvidia-smi
- python3 examples/offline_inference/basic/chat.py
Expand Down
8 changes: 4 additions & 4 deletions .github/CODEOWNERS
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@

# This list covers the "core" components of vLLM that require careful review
/vllm/attention @LucasWilkinson
/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @njhill
/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @njhill @22quinn
/vllm/model_executor/layers/fused_moe @mgoin @pavanimajety
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256 @pavanimajety
Expand All @@ -27,6 +26,7 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson

# vLLM V1
/vllm/v1/attention @LucasWilkinson
/vllm/v1/attention/backend.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @njhill
/vllm/v1/attention/backends/mla @pavanimajety
/vllm/v1/attention/backends/flashinfer.py @mgoin @pavanimajety
/vllm/v1/attention/backends/triton_attn.py @tdoublep
Expand Down Expand Up @@ -117,15 +117,15 @@ mkdocs.yaml @hmellor
/vllm/transformers_utils/tokenizers/mistral.py @patrickvonplaten

# Kernels
/vllm/attention/ops/chunked_prefill_paged_decode.py @tdoublep
/vllm/attention/ops/triton_unified_attention.py @tdoublep
/vllm/v1/attention/ops/chunked_prefill_paged_decode.py @tdoublep
/vllm/v1/attention/ops/triton_unified_attention.py @tdoublep

# ROCm related: specify owner with write access to notify AMD folks for careful code review
/vllm/**/*rocm* @tjtanaa
/docker/Dockerfile.rocm* @gshtras @tjtanaa
/vllm/v1/attention/backends/rocm*.py @gshtras @tjtanaa
/vllm/v1/attention/backends/mla/rocm*.py @gshtras @tjtanaa
/vllm/attention/ops/rocm*.py @gshtras @tjtanaa
/vllm/v1/attention/ops/rocm*.py @gshtras @tjtanaa
/vllm/model_executor/layers/fused_moe/rocm*.py @gshtras @tjtanaa
/csrc/rocm @gshtras @tjtanaa
/requirements/*rocm* @tjtanaa
Expand Down
4 changes: 2 additions & 2 deletions .github/mergify.yml
Original file line number Diff line number Diff line change
Expand Up @@ -222,10 +222,10 @@ pull_request_rules:
- files~=^csrc/rocm/
- files~=^docker/Dockerfile.rocm
- files~=^requirements/rocm.*\.txt
- files~=^vllm/attention/backends/rocm.*\.py
- files~=^vllm/attention/ops/rocm.*\.py
- files~=^vllm/model_executor/layers/fused_moe/rocm.*\.py
- files~=^vllm/v1/attention/backends/rocm.*\.py
- files~=^vllm/v1/attention/backends/mla/rocm.*\.py
- files~=^vllm/v1/attention/ops/rocm.*\.py
- files~=^tests/kernels/.*_rocm.*\.py
- files=vllm/platforms/rocm.py
- title~=(?i)AMD
Expand Down
6 changes: 3 additions & 3 deletions benchmarks/kernels/benchmark_reshape_and_cache_flash.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,16 +7,16 @@
from tabulate import tabulate

from vllm import _custom_ops as ops
from vllm.attention.ops.triton_reshape_and_cache_flash import (
triton_reshape_and_cache_flash,
)
from vllm.logger import init_logger
from vllm.utils.argparse_utils import FlexibleArgumentParser
from vllm.utils.torch_utils import (
STR_DTYPE_TO_TORCH_DTYPE,
create_kv_caches_with_random_flash,
set_random_seed,
)
from vllm.v1.attention.ops.triton_reshape_and_cache_flash import (
triton_reshape_and_cache_flash,
)

logger = init_logger(__name__)

Expand Down
2 changes: 1 addition & 1 deletion docs/contributing/model/basic.md
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,7 @@ We use "mamba-like" to refer to layers that posses a state that is updated in-pl
For implementing new custom mamba-like layers, one should inherit from `MambaBase` and implement the methods `get_state_dtype`, `get_state_shape` to calculate the data types and state shapes at runtime, as well as `mamba_type` and `get_attn_backend`.
It is also necessary to implement the "attention meta-data" class which handles the meta-data that is common across all layers.
Please see [`LinearAttentionMetadata`](../../../vllm/v1/attention/backends/linear_attn.py) or [`ShortConvAttentionMetadata`](../../../vllm/v1/attention/backends/short_conv_attn.py) for examples of this.
It is also worth noting that we should update `MAMBA_TYPE_TO_BACKEND_MAP` and `MambaAttentionBackendEnum` in [`registry.py`](../../../vllm/attention/backends/registry.py) when adding a new mamba backend.
It is also worth noting that we should update `MAMBA_TYPE_TO_BACKEND_MAP` and `MambaAttentionBackendEnum` in [`registry.py`](../../../vllm/v1/attention/backends/registry.py) when adding a new mamba backend.
Finally, if one wants to support torch compile and CUDA graphs, it is necessary to wrap the call to the mamba-like layer inside a custom op and register it.
Please see the calls to `direct_register_custom_op` in [vllm/model_executor/models/minimax_text_01.py](../../../vllm/model_executor/models/minimax_text_01.py) or [vllm/model_executor/layers/mamba/short_conv.py](../../../vllm/model_executor/layers/mamba/short_conv.py) for examples of this.
The new custom op should then be added to the list `_attention_ops` in [vllm/config/compilation.py](../../../vllm/config/compilation.py) to ensure that piecewise CUDA graphs work as intended.
2 changes: 1 addition & 1 deletion docs/design/custom_op.md
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ For example:
**1. Attention:**

```python
--8<-- "vllm/attention/layers/mm_encoder_attention.py:mm_encoder_attn"
--8<-- "vllm/model_executor/layers/attention/mm_encoder_attention.py:mm_encoder_attn"

--8<-- "vllm/model_executor/layers/mla.py:multi_head_latent_attention"
```
Expand Down
4 changes: 2 additions & 2 deletions docs/design/plugin_system.md
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ Every plugin has three parts:

Please look at the worker base class [WorkerBase][vllm.v1.worker.worker_base.WorkerBase] for more functions that can be implemented.

5. Implement the attention backend class `MyDummyAttention` in `my_dummy_attention.py`. The attention backend class should inherit from [AttentionBackend][vllm.attention.backends.abstract.AttentionBackend]. It's used to calculate attentions with your device. Take `vllm.v1.attention.backends` as examples, it contains many attention backend implementations.
5. Implement the attention backend class `MyDummyAttention` in `my_dummy_attention.py`. The attention backend class should inherit from [AttentionBackend][vllm.v1.attention.backend.AttentionBackend]. It's used to calculate attentions with your device. Take `vllm.v1.attention.backends` as examples, it contains many attention backend implementations.

6. Implement custom ops for high performance. Most ops can be run by the PyTorch native implementation, though the performance may not be good. In this case, you can implement specific custom ops for your plugins. Currently, vLLM supports the following kinds of custom ops:

Expand Down Expand Up @@ -153,5 +153,5 @@ The interface for the model/module may change during vLLM's development. If you

!!! warning "Deprecations"
- `use_v1` parameter in `Platform.get_attn_backend_cls` is deprecated. It has been removed in v0.13.0.
- `_Backend` in `vllm.attention` is deprecated. It has been removed in v0.13.0. Please use `vllm.attention.backends.registry.register_backend` to add new attention backend to `AttentionBackendEnum` instead.
- `_Backend` in `vllm.attention` is deprecated. It has been removed in v0.13.0. Please use `vllm.v1.attention.backends.registry.register_backend` to add new attention backend to `AttentionBackendEnum` instead.
- `seed_everything` platform interface is deprecated. It will be removed in v0.15.0 or later. Please use `vllm.utils.torch_utils.set_random_seed` instead.
2 changes: 1 addition & 1 deletion examples/offline_inference/basic/embed.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,10 @@
from argparse import Namespace

from vllm import LLM, EngineArgs
from vllm.attention.backends.registry import AttentionBackendEnum
from vllm.config import AttentionConfig
from vllm.platforms import current_platform
from vllm.utils.argparse_utils import FlexibleArgumentParser
from vllm.v1.attention.backends.registry import AttentionBackendEnum


def parse_args():
Expand Down
2 changes: 1 addition & 1 deletion examples/offline_inference/basic/score.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,10 @@
from argparse import Namespace

from vllm import LLM, EngineArgs
from vllm.attention.backends.registry import AttentionBackendEnum
from vllm.config import AttentionConfig
from vllm.platforms import current_platform
from vllm.utils.argparse_utils import FlexibleArgumentParser
from vllm.v1.attention.backends.registry import AttentionBackendEnum


def parse_args():
Expand Down
2 changes: 1 addition & 1 deletion tests/compile/fullgraph/test_full_cudagraph.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,10 @@
from tests.utils import wait_for_gpu_memory_to_clear
from tests.v1.attention.utils import full_cg_backend_configs as backend_configs
from vllm import LLM, SamplingParams
from vllm.attention.backends.registry import AttentionBackendEnum
from vllm.config import CompilationConfig
from vllm.platforms import current_platform
from vllm.utils.torch_utils import is_torch_equal_or_newer
from vllm.v1.attention.backends.registry import AttentionBackendEnum


@contextlib.contextmanager
Expand Down
2 changes: 1 addition & 1 deletion tests/compile/fullgraph/test_full_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,10 @@

from tests.quantization.utils import is_quant_method_supported
from vllm import LLM, SamplingParams
from vllm.attention.backends.registry import AttentionBackendEnum
from vllm.config import CompilationConfig, CompilationMode, CUDAGraphMode, PassConfig
from vllm.platforms import current_platform
from vllm.utils.torch_utils import is_torch_equal_or_newer
from vllm.v1.attention.backends.registry import AttentionBackendEnum

from ...utils import create_new_process_for_each_test

Expand Down
4 changes: 2 additions & 2 deletions tests/compile/test_fusion_attn.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,6 @@
from tests.utils import flat_product
from tests.v1.attention.utils import BatchSpec, create_common_attn_metadata
from vllm._custom_ops import cutlass_scaled_fp4_mm, scaled_fp4_quant
from vllm.attention.backends.abstract import AttentionMetadata
from vllm.attention.backends.registry import AttentionBackendEnum
from vllm.attention.layer import Attention
from vllm.compilation.fusion_attn import ATTN_OP, AttnFusionPass
from vllm.compilation.fx_utils import find_op_nodes
Expand All @@ -37,6 +35,8 @@
from vllm.model_executor.layers.quantization.utils.w8a8_utils import Fp8LinearOp
from vllm.platforms import current_platform
from vllm.utils.flashinfer import has_flashinfer
from vllm.v1.attention.backend import AttentionMetadata
from vllm.v1.attention.backends.registry import AttentionBackendEnum
from vllm.v1.kv_cache_interface import AttentionSpec

FP8_DTYPE = current_platform.fp8_dtype()
Expand Down
2 changes: 1 addition & 1 deletion tests/compile/test_qk_norm_rope_fusion.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
import torch

from tests.compile.backend import TestBackend
from vllm.attention.backends.abstract import AttentionType
from vllm.attention.layer import Attention
from vllm.compilation.matcher_utils import FLASHINFER_ROTARY_OP, RMS_OP, ROTARY_OP
from vllm.compilation.noop_elimination import NoOpEliminationPass
Expand All @@ -25,6 +24,7 @@
from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding
from vllm.platforms import current_platform
from vllm.v1.attention.backend import AttentionType

RSQRT_OP = torch.ops.aten.rsqrt.default
INDEX_SELECT_OP = torch.ops.aten.index.Tensor
Expand Down
2 changes: 1 addition & 1 deletion tests/config/test_multimodal_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@

import pytest

from vllm.attention.backends.registry import AttentionBackendEnum
from vllm.config.multimodal import MultiModalConfig
from vllm.v1.attention.backends.registry import AttentionBackendEnum


def test_mm_encoder_attn_backend_str_conversion():
Expand Down
2 changes: 1 addition & 1 deletion tests/engine/test_arg_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -299,7 +299,7 @@ def test_compilation_config():


def test_attention_config():
from vllm.attention.backends.registry import AttentionBackendEnum
from vllm.v1.attention.backends.registry import AttentionBackendEnum

parser = EngineArgs.add_cli_args(FlexibleArgumentParser())

Expand Down
2 changes: 1 addition & 1 deletion tests/kernels/attention/test_aiter_flash_attn.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,9 @@
import torch

import vllm.v1.attention.backends.rocm_aiter_fa # noqa: F401
from vllm.attention.utils.fa_utils import is_flash_attn_varlen_func_available
from vllm.platforms import current_platform
from vllm.utils.torch_utils import set_random_seed
from vllm.v1.attention.backends.fa_utils import is_flash_attn_varlen_func_available

NUM_HEADS = [(4, 4), (8, 2)]
HEAD_SIZES = [128, 256]
Expand Down
4 changes: 2 additions & 2 deletions tests/kernels/attention/test_attention.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from tests.kernels.utils import opcheck
from vllm import _custom_ops as ops
from vllm.attention.layer import Attention
from vllm.attention.layers.mm_encoder_attention import MMEncoderAttention
from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
from vllm.platforms import current_platform
from vllm.utils.mem_utils import get_max_shared_memory_bytes
from vllm.utils.torch_utils import set_random_seed
Expand All @@ -30,7 +30,7 @@
NUM_HEADS = [(40, 40), (64, 8)] # Arbitrary values for testing

# This should be sync with get_supported_head_sizes() in
# vllm.attention.ops.paged_attn.PagedAttention
# vllm.v1.attention.ops.paged_attn.PagedAttention
HEAD_SIZES = [32, 80, 128, 256]

BLOCK_SIZES = [16, 32]
Expand Down
6 changes: 3 additions & 3 deletions tests/kernels/attention/test_attention_selector.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,13 @@
import pytest
import torch

from vllm.attention.backends.registry import AttentionBackendEnum
from vllm.attention.selector import _cached_get_attn_backend, get_attn_backend
from vllm.config import AttentionConfig, VllmConfig, set_current_vllm_config
from vllm.platforms import current_platform
from vllm.platforms.cpu import CpuPlatform
from vllm.platforms.cuda import CudaPlatform
from vllm.platforms.rocm import RocmPlatform
from vllm.v1.attention.backends.registry import AttentionBackendEnum
from vllm.v1.attention.selector import _cached_get_attn_backend, get_attn_backend


@pytest.fixture(autouse=True)
Expand Down Expand Up @@ -182,7 +182,7 @@ def test_backend_selection(
expected = name
assert backend.get_name() == expected
elif name == "FLASH_ATTN_MLA":
from vllm.attention.utils.fa_utils import (
from vllm.v1.attention.backends.fa_utils import (
flash_attn_supports_mla,
)

Expand Down
2 changes: 1 addition & 1 deletion tests/kernels/attention/test_cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -270,7 +270,7 @@ def permute_and_compact(x):
v_scale,
)
elif implementation == "triton":
from vllm.attention.ops.triton_reshape_and_cache_flash import (
from vllm.v1.attention.ops.triton_reshape_and_cache_flash import (
triton_reshape_and_cache_flash,
)

Expand Down
4 changes: 2 additions & 2 deletions tests/kernels/attention/test_flashmla.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,12 @@
import pytest
import torch

from vllm.attention.ops.flashmla import (
from vllm.triton_utils import triton
from vllm.v1.attention.ops.flashmla import (
flash_mla_with_kvcache,
get_mla_metadata,
is_flashmla_dense_supported,
)
from vllm.triton_utils import triton


def cal_diff(
Expand Down
6 changes: 3 additions & 3 deletions tests/kernels/attention/test_flashmla_sparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@


def test_sparse_flashmla_metadata_smoke():
import vllm.attention.ops.flashmla as fm
import vllm.v1.attention.ops.flashmla as fm

ok, reason = fm.is_flashmla_sparse_supported()
if not ok:
Expand Down Expand Up @@ -34,7 +34,7 @@ def test_sparse_flashmla_metadata_smoke():


def test_sparse_flashmla_decode_smoke():
import vllm.attention.ops.flashmla as fm
import vllm.v1.attention.ops.flashmla as fm

ok, reason = fm.is_flashmla_sparse_supported()
if not ok:
Expand Down Expand Up @@ -97,7 +97,7 @@ def test_sparse_flashmla_decode_smoke():


def test_sparse_flashmla_prefill_smoke():
import vllm.attention.ops.flashmla as fm
import vllm.v1.attention.ops.flashmla as fm

ok, reason = fm.is_flashmla_sparse_supported()
if not ok:
Expand Down
4 changes: 2 additions & 2 deletions tests/kernels/attention/test_merge_attn_states.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,10 @@
import torch

from vllm._custom_ops import merge_attn_states as merge_attn_states_cuda
from vllm.attention.ops.triton_merge_attn_states import (
from vllm.platforms import current_platform
from vllm.v1.attention.ops.triton_merge_attn_states import (
merge_attn_states as merge_attn_states_triton,
)
from vllm.platforms import current_platform


# Naive PyTorch Implements section 2.2 of https://www.arxiv.org/pdf/2501.01005
Expand Down
6 changes: 3 additions & 3 deletions tests/kernels/attention/test_mha_attn.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,14 +12,14 @@
import pytest
import torch

from vllm.attention.backends.registry import AttentionBackendEnum
from vllm.attention.layers.mm_encoder_attention import MMEncoderAttention
from vllm.attention.selector import _cached_get_attn_backend
from vllm.model_executor.layers.attention.mm_encoder_attention import MMEncoderAttention
from vllm.platforms import current_platform
from vllm.platforms.cpu import CpuPlatform
from vllm.platforms.cuda import CudaPlatform
from vllm.platforms.rocm import RocmPlatform
from vllm.utils.torch_utils import set_random_seed
from vllm.v1.attention.backends.registry import AttentionBackendEnum
from vllm.v1.attention.selector import _cached_get_attn_backend


@pytest.fixture(autouse=True)
Expand Down
2 changes: 1 addition & 1 deletion tests/kernels/attention/test_pack_unpack_triton.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import torch
from torch.testing import assert_close

from vllm.attention.ops.common import pack_seq_triton, unpack_seq_triton
from vllm.v1.attention.ops.common import pack_seq_triton, unpack_seq_triton


def test_pack_seq_basic_fp8():
Expand Down
6 changes: 4 additions & 2 deletions tests/kernels/attention/test_prefix_prefill.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,12 @@
import torch
import torch.nn.functional as F

from vllm.attention.ops.chunked_prefill_paged_decode import chunked_prefill_paged_decode
from vllm.attention.ops.prefix_prefill import context_attention_fwd
from vllm.platforms import current_platform
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE, set_random_seed
from vllm.v1.attention.ops.chunked_prefill_paged_decode import (
chunked_prefill_paged_decode,
)
from vllm.v1.attention.ops.prefix_prefill import context_attention_fwd

NUM_HEADS = [64]
NUM_QUERIES_PER_KV = [1, 64]
Expand Down
Loading