diff --git a/docker/Dockerfile.nightly_torch b/docker/Dockerfile.nightly_torch index 6dfa56017838..81f63122efdc 100644 --- a/docker/Dockerfile.nightly_torch +++ b/docker/Dockerfile.nightly_torch @@ -76,33 +76,13 @@ RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \ uv pip install --system -r requirements/common.txt -# must put before installing xformers, so it can install the correct version of xfomrers. ARG torch_cuda_arch_list='8.0;8.6;8.9;9.0' ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list} -# Build xformers with cuda and torch nightly -# following official xformers guidance: https://github.com/facebookresearch/xformers#build -# todo(elainewy): cache xformers build result for faster build ARG max_jobs=16 ENV MAX_JOBS=${max_jobs} -ARG XFORMERS_COMMIT=f2de641ef670510cadab099ce6954031f52f191c ENV CCACHE_DIR=/root/.cache/ccache -RUN --mount=type=cache,target=/root/.cache/ccache \ - --mount=type=cache,target=/root/.cache/uv \ - echo 'git clone xformers...' \ - && git clone https://github.com/facebookresearch/xformers.git --recursive \ - && cd xformers \ - && git checkout ${XFORMERS_COMMIT} \ - && git submodule update --init --recursive \ - && echo 'finish git clone xformers...' \ - && rm -rf build \ - && python3 setup.py bdist_wheel --dist-dir=../xformers-dist --verbose \ - && cd .. \ - && rm -rf xformers - -RUN --mount=type=cache,target=/root/.cache/uv \ - uv pip install --system xformers-dist/*.whl --verbose # build can take a long time, and the torch nightly version fetched from url can be different in next docker stage. 
# track the nightly torch version used in the build, when we set up runtime environment we can make sure the version is the same @@ -233,10 +213,6 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/vllm --mount=type=cache,target=/root/.cache/uv \ uv pip install --system vllm-dist/*.whl --verbose -# install xformers again for the new environment -RUN --mount=type=bind,from=base,src=/workspace/xformers-dist,target=/vllm-workspace/xformers-dist \ - --mount=type=cache,target=/root/.cache/uv \ - uv pip install --system /vllm-workspace/xformers-dist/*.whl --verbose ARG torch_cuda_arch_list='8.0;8.6;8.9;9.0' @@ -307,7 +283,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \ uv pip install --system -r requirements/nightly_torch_test.txt # Logging to confirm the torch versions -RUN pip freeze | grep -E 'torch|xformers|vllm|flashinfer' +RUN pip freeze | grep -E 'torch|vllm|flashinfer' # Logging to confirm all the packages are installed RUN pip freeze diff --git a/docs/contributing/ci/update_pytorch_version.md b/docs/contributing/ci/update_pytorch_version.md index f983c25f26ee..e0ec85b57ab7 100644 --- a/docs/contributing/ci/update_pytorch_version.md +++ b/docs/contributing/ci/update_pytorch_version.md @@ -100,19 +100,11 @@ to warm it up so that future builds are faster. ## Update dependencies -Several vLLM dependencies like xFormers depend on PyTorch and need +Several vLLM dependencies depend on PyTorch and need to be updated accordingly. Rather than waiting for all of them to publish new releases (which would take too much time), they can be built from source to unblock the update process. 
-### xFormers - -```bash -export TORCH_CUDA_ARCH_LIST='7.5 8.0+PTX 9.0a' -MAX_JOBS=16 uv pip install --system \ - --no-build-isolation "git+https://github.com/facebookresearch/xformers@v0.0.32.post2" -``` - ## Update all the different vLLM platforms Rather than attempting to update all vLLM platforms in a single pull request, it's more manageable diff --git a/docs/getting_started/quickstart.md b/docs/getting_started/quickstart.md index cfc8b4d9838a..7fe5f1d6b025 100644 --- a/docs/getting_started/quickstart.md +++ b/docs/getting_started/quickstart.md @@ -283,7 +283,7 @@ Currently, vLLM supports multiple backends for efficient Attention computation a If desired, you can also manually set the backend of your choice by configuring the environment variable `VLLM_ATTENTION_BACKEND` to one of the following options: -- On NVIDIA CUDA: `FLASH_ATTN`, `FLASHINFER` or `XFORMERS`. +- On NVIDIA CUDA: `FLASH_ATTN` or `FLASHINFER`. - On AMD ROCm: `TRITON_ATTN`, `ROCM_ATTN`, `ROCM_AITER_FA` or `ROCM_AITER_UNIFIED_ATTN`. 
For AMD ROCm, you can futher control the specific Attention implementation using the following variables: diff --git a/examples/online_serving/openai_embedding_long_text/service.sh b/examples/online_serving/openai_embedding_long_text/service.sh index 1577de85f7ff..b5c92749466b 100644 --- a/examples/online_serving/openai_embedding_long_text/service.sh +++ b/examples/online_serving/openai_embedding_long_text/service.sh @@ -22,7 +22,6 @@ API_KEY=${API_KEY:-"your-api-key"} POOLING_TYPE=${POOLING_TYPE:-"auto"} # auto, MEAN, CLS, LAST export VLLM_ENABLE_CHUNKED_PROCESSING=true export CUDA_VISIBLE_DEVICES=2,3,4,5 -# export VLLM_ATTENTION_BACKEND=XFORMERS echo "🚀 Starting vLLM Embedding Server with Enhanced Chunked Processing" echo "==================================================================" diff --git a/requirements/cuda.txt b/requirements/cuda.txt index 5f7d520cd366..7283e7746953 100644 --- a/requirements/cuda.txt +++ b/requirements/cuda.txt @@ -9,7 +9,5 @@ torch==2.9.0 torchaudio==2.9.0 # These must be updated alongside torch torchvision==0.24.0 # Required for phi3v processor. 
See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version -# Build from https://github.com/facebookresearch/xformers/releases/tag/v0.0.32.post1 -xformers==0.0.33+5d4b92a5.d20251029; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch >= 2.9 # FlashInfer should be updated together with the Dockerfile flashinfer-python==0.4.1 diff --git a/tests/basic_correctness/test_basic_correctness.py b/tests/basic_correctness/test_basic_correctness.py index 0cf1e85d4e8e..521d6c33dd39 100644 --- a/tests/basic_correctness/test_basic_correctness.py +++ b/tests/basic_correctness/test_basic_correctness.py @@ -74,9 +74,6 @@ def test_models( model_executor: str, enable_prompt_embeds: bool, ) -> None: - if backend == "XFORMERS" and model == "google/gemma-2-2b-it": - pytest.skip(f"{backend} does not support gemma2 with full context length.") - with monkeypatch.context() as m: m.setenv("VLLM_ATTENTION_BACKEND", backend) diff --git a/tests/kernels/attention/test_attention.py b/tests/kernels/attention/test_attention.py index 9662e73321eb..1a7d5ce0ddc1 100644 --- a/tests/kernels/attention/test_attention.py +++ b/tests/kernels/attention/test_attention.py @@ -13,12 +13,6 @@ from vllm.platforms import current_platform from vllm.utils.mem_utils import get_max_shared_memory_bytes -if not current_platform.is_rocm(): - from xformers import ops as xops - from xformers.ops.fmha.attn_bias import BlockDiagonalCausalMask - - from tests.kernels.utils import make_alibi_bias - FLOAT32_BYTES = torch.finfo(torch.float).bits // 8 # This will change depending on the compute capability. 
# - 512 as a buffer @@ -448,129 +442,6 @@ def ref_multi_query_kv_attention( return torch.cat(ref_outputs, dim=0) -@pytest.mark.parametrize("num_seqs", NUM_PREFILL_SEQS) -@pytest.mark.parametrize("num_heads", NUM_HEADS) -@pytest.mark.parametrize("head_size", HEAD_SIZES) -@pytest.mark.parametrize("dtype", DTYPES) -@pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", CUDA_DEVICES) -@pytest.mark.skipif( - current_platform.is_rocm(), reason="Xformers backend is not supported on ROCm." -) -@torch.inference_mode() -def test_multi_query_kv_attention( - num_seqs: int, - num_heads: tuple[int, int], - head_size: int, - dtype: torch.dtype, - seed: int, - device: str, - use_alibi: bool = False, -) -> None: - current_platform.seed_everything(seed) - torch.set_default_device(device) - # MAX_SEQ_LEN sometimes causes OOM in the reference implementation. - # As the xformers library is already tested with its own tests, we can use - # a smaller MAX_SEQ_LEN here. - max_len = min(MAX_SEQ_LEN, 4096) - seq_lens = random.sample(range(1, max_len), num_seqs) - num_tokens = sum(seq_lens) - - scale = float(1.0 / (head_size**0.5)) - num_query_heads, num_kv_heads = num_heads - qkv = torch.empty( - num_tokens, num_query_heads + 2 * num_kv_heads, head_size, dtype=dtype - ) - qkv.uniform_(-scale, scale) - query, key, value = qkv.split([num_query_heads, num_kv_heads, num_kv_heads], dim=1) - - num_queries_per_kv = num_query_heads // num_kv_heads - if num_queries_per_kv > 1: - # Handle MQA and GQA - key = torch.repeat_interleave(key, num_queries_per_kv, dim=1) - value = torch.repeat_interleave(value, num_queries_per_kv, dim=1) - alibi_bias = None - if use_alibi: - alibi_slopes = torch.randn(num_query_heads, dtype=torch.float) - attn_bias = make_alibi_bias(alibi_slopes, num_kv_heads, dtype, seq_lens) - output = torch.empty_like(query) - start = 0 - # Dynamic sequence length not supported with custom attn_bias. 
- for i, seq_len in enumerate(seq_lens): - end = start + seq_len - out = xops.memory_efficient_attention_forward( - query[None, start:end], - key[None, start:end], - value[None, start:end], - attn_bias=attn_bias[i], - p=0.0, - scale=scale, - ) - output[start:end].copy_(out.view_as(query[start:end])) - start += seq_len - # xformers.AttentionBias to Tensor for use in reference impl. - alibi_bias = [ - b.materialize((1, num_query_heads, i, i), device=device).squeeze() - for b, i in zip(attn_bias, seq_lens) - ] - else: - attn_bias = BlockDiagonalCausalMask.from_seqlens(seq_lens) - output = xops.memory_efficient_attention_forward( - query.unsqueeze(0), - key.unsqueeze(0), - value.unsqueeze(0), - attn_bias=attn_bias, - p=0.0, - scale=scale, - ) - output = output.squeeze(0) - - cu_seq_lens = [0] - for seq_len in seq_lens: - cu_seq_lens.append(cu_seq_lens[-1] + seq_len) - ref_output = ref_multi_query_kv_attention( - cu_seq_lens, - query, - key, - value, - scale, - alibi_bias, - dtype, - ) - atol = get_default_atol(output) if current_platform.is_rocm() else 1e-3 - rtol = get_default_rtol(output) if current_platform.is_rocm() else 1e-5 - torch.testing.assert_close(output, ref_output, atol=atol, rtol=rtol) - - -@pytest.mark.parametrize("num_seqs", NUM_PREFILL_SEQS) -@pytest.mark.parametrize("num_heads", NUM_HEADS) -@pytest.mark.parametrize("head_size", [64]) -@pytest.mark.parametrize("dtype", DTYPES) -@pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", CUDA_DEVICES) -@pytest.mark.skipif( - current_platform.is_rocm(), reason="Xformers backend is not supported on ROCm." 
-) -@torch.inference_mode() -def test_multi_query_kv_attention_with_alibi( - num_seqs: int, - num_heads: tuple[int, int], - head_size: int, - dtype: torch.dtype, - seed: int, - device: str, -) -> None: - return test_multi_query_kv_attention( - num_seqs, - num_heads, - head_size, - dtype, - seed, - device, - use_alibi=True, - ) - - @pytest.mark.parametrize("attention_cls", [Attention, MultiHeadAttention]) def test_num_heads_not_divisble_by_num_kv_heads(attention_cls: type) -> None: head_size = 64 diff --git a/tests/kernels/attention/test_attention_selector.py b/tests/kernels/attention/test_attention_selector.py index 8149ce7672cd..e634432bd14a 100644 --- a/tests/kernels/attention/test_attention_selector.py +++ b/tests/kernels/attention/test_attention_selector.py @@ -33,7 +33,7 @@ def clear_cache(): } DEVICE_REGULAR_ATTN_BACKENDS = { - "cuda": ["XFORMERS", "FLASHINFER", "FLASH_ATTN"], + "cuda": ["FLASHINFER", "FLASH_ATTN"], "hip": ["ROCM_ATTN"], "cpu": ["TORCH_SDPA"], } @@ -189,12 +189,6 @@ def test_env( ) expected = "FLASHINFER" assert backend.get_name() == expected - elif name == "XFORMERS": - backend = get_attn_backend( - 32, torch.float16, None, block_size, use_mla=use_mla - ) - expected = "XFORMERS" - assert backend.get_name() == expected elif name == "FLASH_ATTN": backend = get_attn_backend( 32, torch.float16, None, block_size, use_mla=use_mla diff --git a/tests/kernels/attention/test_mha_attn.py b/tests/kernels/attention/test_mha_attn.py index 14d1618bca3c..c6058876e96d 100644 --- a/tests/kernels/attention/test_mha_attn.py +++ b/tests/kernels/attention/test_mha_attn.py @@ -24,10 +24,6 @@ def clear_cache(): """Clear lru cache to ensure each test case runs without caching.""" _cached_get_attn_backend.cache_clear() - # Clear xformers availability cache - import vllm.attention.layer as layer_module - - layer_module.USE_XFORMERS_OPS = None @pytest.mark.parametrize("device", ["cpu", "hip", "cuda"]) @@ -63,7 +59,7 @@ def test_mha_attn_platform(device: str): # Test 
CUDA with head_size=72 (not divisible by 32) # - with upstream FA not available - # - should use xformers + # - should fall back to TORCH_SDPA with ( patch("vllm.attention.layer.current_platform", CudaPlatform()), patch("vllm.model_executor.models.vision.current_platform", CudaPlatform()), @@ -73,7 +69,7 @@ def test_mha_attn_platform(device: str): ), ): attn = MultiHeadAttention(16, 72, scale=1) - assert attn.attn_backend == _Backend.XFORMERS + assert attn.attn_backend == _Backend.TORCH_SDPA # Test CUDA with head_size=72 (not divisible by 32) # - with upstream FA available diff --git a/tests/kernels/attention/test_prefix_prefill.py b/tests/kernels/attention/test_prefix_prefill.py deleted file mode 100644 index 65972d02f2f6..000000000000 --- a/tests/kernels/attention/test_prefix_prefill.py +++ /dev/null @@ -1,561 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import math -import random -import time -from collections.abc import Callable - -import pytest -import torch -from xformers import ops as xops -from xformers.ops.fmha.attn_bias import BlockDiagonalCausalFromBottomRightMask - -from tests.kernels.utils import make_alibi_bias -from vllm.attention.ops.chunked_prefill_paged_decode import chunked_prefill_paged_decode -from vllm.attention.ops.prefix_prefill import context_attention_fwd -from vllm.platforms import current_platform -from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE - -NUM_HEADS = [64] -NUM_QUERIES_PER_KV = [1, 64] -HEAD_SIZES = [24, 128] -DTYPES = [torch.float16] -CUDA_DEVICES = [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)] -SLIDING_WINDOW = [0, 16, 2048] -KV_CACHE_DTYPES = ["auto", "fp8", "fp8_e5m2"] - -OPS = [chunked_prefill_paged_decode, context_attention_fwd] - - -@pytest.mark.parametrize("num_heads", NUM_HEADS) -@pytest.mark.parametrize("num_queries_per_kv", NUM_QUERIES_PER_KV) -@pytest.mark.parametrize("head_size", HEAD_SIZES) 
-@pytest.mark.parametrize("dtype", DTYPES) -@pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPES) -@pytest.mark.parametrize("device", CUDA_DEVICES) -@pytest.mark.parametrize("sliding_window", SLIDING_WINDOW) -@pytest.mark.parametrize("op", OPS) -@torch.inference_mode() -def test_contexted_kv_attention( - num_heads: int, - num_queries_per_kv: int, - head_size: int, - sliding_window: int, - dtype: torch.dtype, - kv_cache_dtype: str, - device: str, - op: Callable, -) -> None: - if "fp8" in kv_cache_dtype and not current_platform.has_device_capability(89): - pytest.skip( - "Triton limitation: fp8e4nv data type is not supported on CUDA arch < 89" - ) - - current_platform.seed_everything(0) - torch.set_default_device(device) - - # Need this, otherwise when we capture the graph the process - # for GPU 1 would run on both GPU0 and GPU1 and things would hang - # - # see also similar issue: https://github.com/Dao-AILab/flash-attention/issues/523 - torch.cuda.set_device(device) - - MAX_SEQ_LEN = 1024 - MAX_CTX_LEN = 1024 - BS = 10 - cache_size = 640 - block_size = 32 - max_block_per_request = 64 - query_lens = [random.randint(16, MAX_SEQ_LEN) for _ in range(BS)] - # ensure one sequence in batch is a decode - query_lens[-1] = 1 - - ctx_lens = [random.randint(16, MAX_CTX_LEN) for _ in range(BS)] - seq_lens = [a + b for a, b in zip(query_lens, ctx_lens)] - num_kv_heads = num_heads // num_queries_per_kv - - num_tokens = sum(query_lens) - query = torch.empty(num_tokens, num_heads, head_size, dtype=dtype) - query.uniform_(-1e-3, 1e-3) - output = torch.empty(num_tokens, num_heads, head_size, dtype=dtype) - - kv = torch.empty(sum(seq_lens), 2, num_kv_heads, head_size, dtype=dtype) - kv.uniform_(-1e-3, 1e-3) - key, value = kv.unbind(dim=1) - - if kv_cache_dtype == "auto": - cache_dtype = dtype - else: - cache_dtype = STR_DTYPE_TO_TORCH_DTYPE[kv_cache_dtype] - k_cache = torch.zeros( - cache_size, block_size, num_kv_heads, head_size, dtype=cache_dtype - ) - v_cache = torch.zeros( 
- cache_size, block_size, num_kv_heads, head_size, dtype=cache_dtype - ) - k = torch.zeros(sum(query_lens), num_kv_heads, head_size, dtype=dtype) - v = torch.zeros(sum(query_lens), num_kv_heads, head_size, dtype=dtype) - values = torch.arange(0, cache_size, dtype=torch.long) - values = values[torch.randperm(cache_size)] - block_table = values[: BS * max_block_per_request].view(BS, max_block_per_request) - b_seq_len = torch.tensor(seq_lens, dtype=torch.long) - b_ctx_len = torch.tensor(ctx_lens, dtype=torch.long) - b_start_loc = torch.cumsum(torch.tensor([0] + query_lens, dtype=torch.long), dim=0) - max_input_len = MAX_SEQ_LEN - # copy kv to cache - b_seq_start_loc = torch.cumsum( - torch.tensor([0] + seq_lens[:-1], dtype=torch.long), dim=0 - ) - for i in range(BS): - for j in range(query_lens[i]): - k[b_start_loc[i] + j].copy_(key[b_seq_start_loc[i] + b_ctx_len[i] + j]) - v[b_start_loc[i] + j].copy_(value[b_seq_start_loc[i] + b_ctx_len[i] + j]) - cur_ctx = 0 - block_id = 0 - while cur_ctx < b_ctx_len[i]: - start_loc = b_seq_start_loc[i] + cur_ctx - if cur_ctx + block_size > b_ctx_len[i]: - end_loc = b_seq_start_loc[i] + b_ctx_len[i] - else: - end_loc = start_loc + block_size - start_slot = block_table[i, block_id] * block_size - end_slot = start_slot + end_loc - start_loc - k_cache.view(-1, num_kv_heads, head_size)[start_slot:end_slot].copy_( - key[start_loc:end_loc] - ) - v_cache.view(-1, num_kv_heads, head_size)[start_slot:end_slot].copy_( - value[start_loc:end_loc] - ) - cur_ctx += block_size - block_id += 1 - # transpose K_cache[num_blocks, block_size, num_kv_heads, head_size] - # to K_cache[num_blocks, num_kv_heads, head_size/8, block_size, 8] - k_cache = ( - k_cache.view(-1, block_size, num_kv_heads, head_size // 8, 8) - .permute(0, 2, 3, 1, 4) - .contiguous() - ) - # transpose V_cache[num_blocks, block_size, num_kv_heads, head_size] - # to V_cache[num_blocks, num_kv_heads, head_size, block_size] - v_cache = ( - v_cache.view(-1, block_size, num_kv_heads, 
head_size) - .permute(0, 2, 3, 1) - .contiguous() - ) - k_scale = v_scale = torch.tensor(1.0, dtype=torch.float32, device=device) - - # Warm up the Triton kernel by calling it once before actually measuring - # generation time - op( - query, - k, - v, - output, - kv_cache_dtype, - k_cache, - v_cache, - block_table, - b_start_loc, - b_seq_len, - MAX_CTX_LEN, - max_input_len, - k_scale, - v_scale, - sliding_window=sliding_window, - ) - torch.cuda.synchronize() - start_time = time.time() - op( - query, - k, - v, - output, - kv_cache_dtype, - k_cache, - v_cache, - block_table, - b_start_loc, - b_seq_len, - MAX_CTX_LEN, - max_input_len, - k_scale, - v_scale, - sliding_window=sliding_window, - ) - torch.cuda.synchronize() - end_time = time.time() - print(f"triton Time: {(end_time - start_time) * 1000:.2f} ms") - - scale = float(1.0 / (head_size**0.5)) - - attn_op = xops.fmha.cutlass.FwOp() - - if num_kv_heads != num_heads: - # As of Nov 2023, xformers only supports MHA. For MQA/GQA, - # project the key and value tensors to the desired number of - # heads. 
- # - # see also: vllm/model_executor/layers/attention.py - query = query.view( - query.shape[0], num_kv_heads, num_queries_per_kv, query.shape[-1] - ) - key = key[:, :, None, :].expand( - key.shape[0], num_kv_heads, num_queries_per_kv, key.shape[-1] - ) - value = value[:, :, None, :].expand( - value.shape[0], num_kv_heads, num_queries_per_kv, value.shape[-1] - ) - query = query.unsqueeze(0) - key = key.unsqueeze(0) - value = value.unsqueeze(0) - - attn_bias = BlockDiagonalCausalFromBottomRightMask.from_seqlens( - query_lens, seq_lens - ) - if sliding_window > 0: - attn_bias = attn_bias.make_local_attention_from_bottomright(sliding_window) - output_ref = xops.memory_efficient_attention_forward( - query, - key, - value, - attn_bias=attn_bias, - p=0.0, - scale=scale, - op=attn_op, - ) - torch.cuda.synchronize() - start_time = time.time() - output_ref = xops.memory_efficient_attention_forward( - query, - key, - value, - attn_bias=attn_bias, - p=0.0, - scale=scale, - op=attn_op, - ) - torch.cuda.synchronize() - end_time = time.time() - print(f"xformers Time: {(end_time - start_time) * 1000:.2f} ms") - output_ref = output_ref.reshape(output.shape) - atol = 1e-3 if "fp8" in kv_cache_dtype else 1e-4 - torch.testing.assert_close(output, output_ref, atol=atol, rtol=0) - - -@pytest.mark.parametrize("num_heads", NUM_HEADS) -@pytest.mark.parametrize("num_queries_per_kv", NUM_QUERIES_PER_KV) -@pytest.mark.parametrize("head_size", HEAD_SIZES) -@pytest.mark.parametrize("dtype", DTYPES) -@pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPES) -@pytest.mark.parametrize("device", CUDA_DEVICES) -@pytest.mark.parametrize("op", OPS) -@torch.inference_mode() -def test_contexted_kv_attention_alibi( - num_heads: int, - num_queries_per_kv: int, - head_size: int, - dtype: torch.dtype, - kv_cache_dtype: str, - device: str, - op: Callable, -) -> None: - if "fp8" in kv_cache_dtype and not current_platform.has_device_capability(89): - pytest.skip( - "Triton limitation: fp8e4nv data type is 
not supported on CUDA arch < 89" - ) - - current_platform.seed_everything(0) - torch.set_default_device(device) - - # Need this, otherwise when we capture the graph the process - # for GPU 1 would run on both GPU0 and GPU1 and things would hang - # - # see also similar issue: https://github.com/Dao-AILab/flash-attention/issues/523 - torch.cuda.set_device(device) - - def _get_alibi_slopes(total_num_heads: int) -> torch.Tensor: - # Fork from: vllm/vllm/model_executor/models/bloom.py#L44 - closest_power_of_2 = 2 ** math.floor(math.log2(total_num_heads)) - base = torch.tensor( - 2 ** (-(2 ** -(math.log2(closest_power_of_2) - 3))), - dtype=torch.float32, - ) - powers = torch.arange(1, 1 + closest_power_of_2, dtype=torch.int32) - slopes = torch.pow(base, powers) - - if closest_power_of_2 != total_num_heads: - extra_base = torch.tensor( - 2 ** (-(2 ** -(math.log2(2 * closest_power_of_2) - 3))), - dtype=torch.float32, - ) - num_remaining_heads = min( - closest_power_of_2, total_num_heads - closest_power_of_2 - ) - extra_powers = torch.arange( - start=1, end=1 + 2 * num_remaining_heads, step=2, dtype=torch.int32 - ) - slopes = torch.cat([slopes, torch.pow(extra_base, extra_powers)], dim=0) - return slopes - - alibi_slopes = _get_alibi_slopes(num_heads).to(device) - - MAX_SEQ_LEN = 1024 - MAX_CTX_LEN = 1024 - BS = 10 - cache_size = 640 - block_size = 32 - max_block_per_request = 64 - query_lens = [random.randint(16, MAX_SEQ_LEN) for _ in range(BS)] - ctx_lens = [random.randint(16, MAX_CTX_LEN) for _ in range(BS)] - seq_lens = [a + b for a, b in zip(query_lens, ctx_lens)] - num_kv_heads = num_heads // num_queries_per_kv - - num_tokens = sum(query_lens) - query = torch.empty(num_tokens, num_heads, head_size, dtype=dtype) - query.uniform_(-1e-3, 1e-3) - output = torch.empty(num_tokens, num_heads, head_size, dtype=dtype) - - kv = torch.empty(sum(seq_lens), 2, num_kv_heads, head_size, dtype=dtype) - kv.uniform_(-1e-3, 1e-3) - key, value = kv.unbind(dim=1) - if kv_cache_dtype == 
"auto": - cache_dtype = dtype - else: - cache_dtype = STR_DTYPE_TO_TORCH_DTYPE[kv_cache_dtype] - k_cache = torch.zeros( - cache_size, block_size, num_kv_heads, head_size, dtype=cache_dtype - ) - v_cache = torch.zeros( - cache_size, block_size, num_kv_heads, head_size, dtype=cache_dtype - ) - k = torch.zeros(sum(query_lens), num_kv_heads, head_size, dtype=dtype) - v = torch.zeros(sum(query_lens), num_kv_heads, head_size, dtype=dtype) - values = torch.arange(0, cache_size, dtype=torch.long) - values = values[torch.randperm(cache_size)] - block_table = values[: BS * max_block_per_request].view(BS, max_block_per_request) - b_seq_len = torch.tensor(seq_lens, dtype=torch.long) - b_ctx_len = torch.tensor(ctx_lens, dtype=torch.long) - b_start_loc = torch.cumsum(torch.tensor([0] + query_lens, dtype=torch.long), dim=0) - max_input_len = MAX_SEQ_LEN - # copy kv to cache - b_seq_start_loc = torch.cumsum( - torch.tensor([0] + seq_lens[:-1], dtype=torch.long), dim=0 - ) - for i in range(BS): - for j in range(query_lens[i]): - k[b_start_loc[i] + j].copy_(key[b_seq_start_loc[i] + b_ctx_len[i] + j]) - v[b_start_loc[i] + j].copy_(value[b_seq_start_loc[i] + b_ctx_len[i] + j]) - cur_ctx = 0 - block_id = 0 - while cur_ctx < b_ctx_len[i]: - start_loc = b_seq_start_loc[i] + cur_ctx - if cur_ctx + block_size > b_ctx_len[i]: - end_loc = b_seq_start_loc[i] + b_ctx_len[i] - else: - end_loc = start_loc + block_size - start_slot = block_table[i, block_id] * block_size - end_slot = start_slot + end_loc - start_loc - k_cache.view(-1, num_kv_heads, head_size)[start_slot:end_slot].copy_( - key[start_loc:end_loc] - ) - v_cache.view(-1, num_kv_heads, head_size)[start_slot:end_slot].copy_( - value[start_loc:end_loc] - ) - cur_ctx += block_size - block_id += 1 - # transpose K_cache[num_blocks, block_size, num_kv_heads, head_size] - # to K_cache[num_blocks, num_kv_heads, head_size/8, block_size, 8] - k_cache = ( - k_cache.view(-1, block_size, num_kv_heads, head_size // 8, 8) - .permute(0, 2, 3, 1, 4) - 
.contiguous() - ) - # transpose V_cache[num_blocks, block_size, num_kv_heads, head_size] - # to V_cache[num_blocks, num_kv_heads, head_size, block_size] - v_cache = ( - v_cache.view(-1, block_size, num_kv_heads, head_size) - .permute(0, 2, 3, 1) - .contiguous() - ) - k_scale = v_scale = torch.tensor(1.0, dtype=torch.float32, device=device) - - # Warm up the Triton kernel by calling it once before actually measuring - # generation time - op( - query, - k, - v, - output, - kv_cache_dtype, - k_cache, - v_cache, - block_table, - b_start_loc, - b_seq_len, - MAX_CTX_LEN, - max_input_len, - k_scale, - v_scale, - alibi_slopes=alibi_slopes, - ) - torch.cuda.synchronize() - start_time = time.time() - op( - query, - k, - v, - output, - kv_cache_dtype, - k_cache, - v_cache, - block_table, - b_start_loc, - b_seq_len, - MAX_CTX_LEN, - max_input_len, - k_scale, - v_scale, - alibi_slopes=alibi_slopes, - ) - torch.cuda.synchronize() - end_time = time.time() - print(f"triton Time: {(end_time - start_time) * 1000:.2f} ms") - scale = float(1.0 / (head_size**0.5)) - - # NOTE(DefTruth): In order to reuse _make_alibi_bias function, - # we have to pad query tensor before MQA/GQA expanding. - if query.shape[0] != key.shape[0]: - query_pad = torch.empty(sum(seq_lens), num_heads, head_size, dtype=dtype) - query_pad.uniform_(-1e-3, 1e-3) - seq_start = 0 - query_start = 0 - for i, (query_len, seq_len) in enumerate(zip(query_lens, seq_lens)): - seq_end = seq_start + seq_len - query_end = query_start + query_len - query_pad[seq_start:seq_end, ...] = torch.cat( - [ - torch.zeros(seq_len - query_len, num_heads, head_size, dtype=dtype), - query[query_start:query_end, ...], - ], - dim=0, - ) - seq_start += seq_len - query_start += query_len - query = query_pad - - if num_kv_heads != num_heads: - # As of Nov 2023, xformers only supports MHA. For MQA/GQA, - # project the key and value tensors to the desired number of - # heads. 
- # - # see also: vllm/model_executor/layers/attention.py - key = key[:, :, None, :].expand( - key.shape[0], num_kv_heads, num_queries_per_kv, key.shape[-1] - ) - value = value[:, :, None, :].expand( - value.shape[0], num_kv_heads, num_queries_per_kv, value.shape[-1] - ) - # [seq, num_kv_heads, num_queries_per_kv, dk]=> - # [seq, num_kv_heads*num_queries_per_kv, dk] to comply with rest of the - # codebase. We save some time reshaping alibi matrix at runtime. - key = key.reshape(key.shape[0], -1, key.shape[-1]) - value = value.reshape(value.shape[0], -1, value.shape[-1]) - query = query.unsqueeze(0) - key = key.unsqueeze(0) - value = value.unsqueeze(0) - - attn_bias = make_alibi_bias(alibi_slopes, num_kv_heads, dtype, seq_lens) - output_ref = torch.empty_like(output) - seq_start = 0 - query_start = 0 - start_time = time.time() - # Attention with alibi slopes. - # FIXME(DefTruth): Because xformers does not support dynamic sequence - # lengths with custom attention bias, we process each prompt one by - # one. This is inefficient, especially when we have many short prompts. 
- # modified from: vllm/v1/attention/backends/xformers.py#L343 - for i, (query_len, seq_len) in enumerate(zip(query_lens, seq_lens)): - seq_end = seq_start + seq_len - query_end = query_start + query_len - out = xops.memory_efficient_attention_forward( - query[:, seq_start:seq_end], - key[:, seq_start:seq_end], - value[:, seq_start:seq_end], - attn_bias=attn_bias[i], - p=0.0, - scale=scale, - ) - out = out.view_as(query[:, seq_start:seq_end]).view( - seq_len, num_heads, head_size - ) - output_ref[query_start:query_end, ...].copy_(out[seq_len - query_len :, ...]) - seq_start += seq_len - query_start += query_len - torch.cuda.synchronize() - end_time = time.time() - print(f"xformers Time: {(end_time - start_time) * 1000:.2f} ms") - atol = 1e-3 if "fp8" in kv_cache_dtype else 1e-6 - torch.testing.assert_close(output, output_ref, atol=atol, rtol=0) - - -# These tests are optional to only run when explicitly invoked -# -# pytest -v -s --optional \ -# tests/kernels/test_prefix_prefill.py::test_contexted_kv_attention_f32 -# -# These tests are useful to test model dtype float32 on Turing devices. 
-# We skip them to not increase the time when running tests on CI -@pytest.mark.optional -@pytest.mark.parametrize("num_heads", NUM_HEADS) -@pytest.mark.parametrize("num_queries_per_kv", NUM_QUERIES_PER_KV) -@pytest.mark.parametrize("head_size", HEAD_SIZES) -@pytest.mark.parametrize("dtype", [torch.float32]) -@pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPES) -@pytest.mark.parametrize("device", CUDA_DEVICES) -@pytest.mark.parametrize("sliding_window", SLIDING_WINDOW) -@pytest.mark.parametrize("op", OPS) -@torch.inference_mode() -def test_contexted_kv_attention_f32( - num_heads: int, - num_queries_per_kv: int, - head_size: int, - sliding_window: int, - dtype: torch.dtype, - kv_cache_dtype: str, - device: str, - op: Callable, -) -> None: - test_contexted_kv_attention( - num_heads, - num_queries_per_kv, - head_size, - sliding_window, - dtype, - kv_cache_dtype, - device, - op, - ) - - -@pytest.mark.optional -@pytest.mark.parametrize("num_heads", NUM_HEADS) -@pytest.mark.parametrize("num_queries_per_kv", NUM_QUERIES_PER_KV) -@pytest.mark.parametrize("head_size", HEAD_SIZES) -@pytest.mark.parametrize("dtype", [torch.float32]) -@pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPES) -@pytest.mark.parametrize("device", CUDA_DEVICES) -@pytest.mark.parametrize("op", OPS) -@torch.inference_mode() -def test_contexted_kv_attention_alibi_f32( - num_heads: int, - num_queries_per_kv: int, - head_size: int, - dtype: torch.dtype, - kv_cache_dtype: str, - device: str, - op: Callable, -) -> None: - test_contexted_kv_attention_alibi( - num_heads, num_queries_per_kv, head_size, dtype, kv_cache_dtype, device, op - ) diff --git a/tests/kernels/utils.py b/tests/kernels/utils.py index eb00bc72b4b0..f5bb56b46d60 100644 --- a/tests/kernels/utils.py +++ b/tests/kernels/utils.py @@ -21,7 +21,6 @@ from vllm.utils import ( STR_BACKEND_ENV_VAR, STR_FLASH_ATTN_VAL, - STR_XFORMERS_ATTN_VAL, ) from vllm.utils.torch_utils import make_tensor_with_pad @@ -528,10 +527,6 @@ def 
make_backend(backend_name: str) -> AttentionBackend: * Backend instance """ - if backend_name == STR_XFORMERS_ATTN_VAL: - from vllm.v1.attention.backends.xformers import XFormersAttentionBackend - - return XFormersAttentionBackend() if backend_name == STR_FLASH_ATTN_VAL: from vllm.v1.attention.backends.flash_attn import FlashAttentionBackend @@ -556,43 +551,6 @@ def make_backend(backend_name: str) -> AttentionBackend: raise AssertionError(f"Unrecognized backend_name {backend_name} for unit test") -def make_alibi_bias( - alibi_slopes: torch.Tensor, - num_kv_heads: int, - dtype: torch.dtype, - seq_lens: list[int], -) -> list[Any]: - """Create ALiBi biases compatible with xFormers attention tests.""" - from xformers.ops.fmha.attn_bias import LowerTriangularMaskWithTensorBias - - if alibi_slopes is None: - return [None for _ in seq_lens] - - attn_biases: list[Any] = [] - num_heads = alibi_slopes.shape[0] - assert num_heads >= num_kv_heads, ( - "ALiBi slopes expect at least as many heads as KV heads" - ) - - for seq_len in seq_lens: - bias = torch.arange(seq_len, dtype=dtype, device=alibi_slopes.device) - bias = bias[None, :] - bias[:, None] - - padded_len = (seq_len + 7) // 8 * 8 - bias_tensor = torch.empty( - 1, - num_heads, - seq_len, - padded_len, - device=alibi_slopes.device, - dtype=dtype, - )[:, :, :, :seq_len].copy_(bias) - bias_tensor.mul_(alibi_slopes[:, None, None]) - attn_biases.append(LowerTriangularMaskWithTensorBias(bias_tensor)) - - return attn_biases - - def _make_metadata_tensors( seq_lens: list[int] | None, context_lens: list[int] | None, @@ -693,26 +651,13 @@ def make_kv_cache( * block_size: number of offsets within a block * device: CPU or CUDA device * default_val: initialization value for KV cache elements - - Returns: - - * kv_cache: 2 x num_blocks x (block_size * num_heads * head_size) - * for backend 'XFORMERS' - * kv_cache: 2 x num_blocks x block_size x num_heads x head_size - * for backend 'FLASH_ATTN' """ - if backend == "XFORMERS": - 
kv_cache = torch.rand((2, num_blocks, block_size * num_heads * head_size)).to( - device - ) - elif backend == "FLASH_ATTN": + if backend == "FLASH_ATTN": kv_cache = torch.rand((2, num_blocks, block_size, num_heads, head_size)).to( device ) else: - raise ValueError( - f"Unknown backend value: '{backend}'. Expected 'XFORMERS' or 'FLASH_ATTN'." - ) + raise ValueError(f"Unknown backend value: '{backend}'. Expected 'FLASH_ATTN'.") if default_val is not None: kv_cache[:, :, :] = default_val return kv_cache @@ -1081,12 +1026,7 @@ def assert_actual_matches_ideal( * output_under_test: actually observed output value """ ideal_output = test_params.packed_qkvo.ideal_output - if backend == "XFORMERS": - torch.testing.assert_close( - ideal_output, output_under_test.view_as(ideal_output) - ) - - elif backend == "FLASH_ATTN": + if backend == "FLASH_ATTN": # For FlashAttention override the accuracy thresholds to non default # values since we notice a higher difference between the ideal and # actual output. @@ -1094,8 +1034,8 @@ def assert_actual_matches_ideal( ideal_output, output_under_test.view_as(ideal_output), atol=0.01, rtol=0.016 ) else: - raise ValueError( - f"Unknown backend value: '{backend}'. Expected 'XFORMERS' or 'FLASH_ATTN'." 
+ torch.testing.assert_close( + ideal_output, output_under_test.view_as(ideal_output) ) diff --git a/tests/lora/test_minicpmv_tp.py b/tests/lora/test_minicpmv_tp.py index 1cf8ed602b6a..e430826461a1 100644 --- a/tests/lora/test_minicpmv_tp.py +++ b/tests/lora/test_minicpmv_tp.py @@ -57,10 +57,6 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]: return generated_texts -@pytest.mark.xfail( - current_platform.is_rocm(), - reason="MiniCPM-V dependency xformers incompatible with ROCm", -) def test_minicpmv_lora(minicpmv_lora_files): llm = vllm.LLM( MODEL_PATH, @@ -84,10 +80,6 @@ def test_minicpmv_lora(minicpmv_lora_files): @pytest.mark.skipif( current_platform.is_cuda_alike(), reason="Skipping to avoid redundant model tests" ) -@pytest.mark.xfail( - current_platform.is_rocm(), - reason="MiniCPM-V dependency xformers incompatible with ROCm", -) @multi_gpu_test(num_gpus=4) def test_minicpmv_tp4_wo_fully_sharded_loras(minicpmv_lora_files): llm = vllm.LLM( @@ -108,10 +100,6 @@ def test_minicpmv_tp4_wo_fully_sharded_loras(minicpmv_lora_files): @pytest.mark.skipif( current_platform.is_cuda_alike(), reason="Skipping to avoid redundant model tests" ) -@pytest.mark.xfail( - current_platform.is_rocm(), - reason="MiniCPM-V dependency xformers incompatible with ROCm", -) @multi_gpu_test(num_gpus=4) def test_minicpmv_tp4_fully_sharded_loras(minicpmv_lora_files): llm = vllm.LLM( diff --git a/tests/lora/test_qwen2vl.py b/tests/lora/test_qwen2vl.py index 1800ca107a42..9f319202959f 100644 --- a/tests/lora/test_qwen2vl.py +++ b/tests/lora/test_qwen2vl.py @@ -142,10 +142,6 @@ def run_beam_search_test( QWEN25VL_MODEL_PATH = "Qwen/Qwen2.5-VL-3B-Instruct" -@pytest.mark.xfail( - current_platform.is_rocm(), - reason="Qwen2-VL dependency xformers incompatible with ROCm", -) def test_qwen2vl_lora(qwen2vl_lora_files): """Test Qwen 2.0 VL model with LoRA""" config = TestConfig(model_path=QWEN2VL_MODEL_PATH, lora_path=qwen2vl_lora_files) @@ -156,10 +152,6 @@ def 
test_qwen2vl_lora(qwen2vl_lora_files): tester.run_test(TEST_IMAGES, expected_outputs=EXPECTED_OUTPUTS, lora_id=lora_id) -@pytest.mark.xfail( - current_platform.is_rocm(), - reason="Qwen2-VL dependency xformers incompatible with ROCm", -) def test_qwen2vl_lora_beam_search(qwen2vl_lora_files): """Test Qwen 2.0 VL model with LoRA through beam search.""" config = TestConfig(model_path=QWEN2VL_MODEL_PATH, lora_path=qwen2vl_lora_files) @@ -180,7 +172,7 @@ def test_qwen2vl_lora_beam_search(qwen2vl_lora_files): @pytest.mark.xfail( current_platform.is_rocm(), - reason="Qwen2.5-VL dependency xformers incompatible with ROCm", + reason="Qwen2.5-VL currently incompatible with ROCm", ) def test_qwen25vl_lora(qwen25vl_lora_files): """Test Qwen 2.5 VL model with LoRA""" diff --git a/tests/model_executor/test_model_load_with_params.py b/tests/model_executor/test_model_load_with_params.py index 489ac1e6475b..b52c3f92f8db 100644 --- a/tests/model_executor/test_model_load_with_params.py +++ b/tests/model_executor/test_model_load_with_params.py @@ -23,9 +23,6 @@ REVISION_ROBERTA = os.environ.get("REVISION", "main") -@pytest.mark.skipif( - current_platform.is_rocm(), reason="Xformers backend is not supported on ROCm." -) def test_model_loading_with_params(vllm_runner, monkeypatch): """ Test parameter weight loading with tp>1. @@ -68,7 +65,8 @@ def check_model(model): @pytest.mark.skipif( - current_platform.is_rocm(), reason="Xformers backend is not supported on ROCm." + current_platform.is_rocm(), + reason="Embedding model loading test not supported on ROCm.", ) def test_roberta_model_loading_with_params(vllm_runner, monkeypatch): """ @@ -112,7 +110,8 @@ def check_model(model): @pytest.mark.skipif( - current_platform.is_rocm(), reason="Xformers backend is not supported on ROCm." 
+ current_platform.is_rocm(), + reason="Embedding model loading test not supported on ROCm.", ) def test_facebook_roberta_model_loading_with_params(vllm_runner, monkeypatch): """ diff --git a/tests/test_config.py b/tests/test_config.py index bba2fbec3db2..238574949b1d 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -197,9 +197,6 @@ def test_disable_sliding_window(model_id_expected): assert model_config.max_model_len == expected -@pytest.mark.skipif( - current_platform.is_rocm(), reason="Xformers backend is not supported on ROCm." -) def test_get_pooling_config(): model_id = "sentence-transformers/all-MiniLM-L12-v2" model_config = ModelConfig(model_id) @@ -209,9 +206,6 @@ def test_get_pooling_config(): assert model_config.pooler_config.pooling_type == PoolingType.MEAN.name -@pytest.mark.skipif( - current_platform.is_rocm(), reason="Xformers backend is not supported on ROCm." -) def test_get_pooling_config_from_args(): model_id = "sentence-transformers/all-MiniLM-L12-v2" pooler_config = PoolerConfig(pooling_type="CLS", normalize=True) @@ -235,9 +229,6 @@ def test_default_pooling_type(model_id, default_pooling_type, pooling_type): assert model_config.pooler_config.pooling_type == pooling_type -@pytest.mark.skipif( - current_platform.is_rocm(), reason="Xformers backend is not supported on ROCm." 
-) def test_get_bert_tokenization_sentence_transformer_config(): model_id = "BAAI/bge-base-en-v1.5" bge_model_config = ModelConfig(model_id) diff --git a/vllm/attention/backends/registry.py b/vllm/attention/backends/registry.py index 05d0159d0861..0e24f2f3cee3 100644 --- a/vllm/attention/backends/registry.py +++ b/vllm/attention/backends/registry.py @@ -10,7 +10,6 @@ class _Backend(enum.Enum): FLASH_ATTN = enum.auto() TRITON_ATTN = enum.auto() - XFORMERS = enum.auto() ROCM_ATTN = enum.auto() ROCM_AITER_MLA = enum.auto() ROCM_AITER_FA = enum.auto() # used for ViT attn backend @@ -33,7 +32,6 @@ class _Backend(enum.Enum): BACKEND_MAP = { _Backend.FLASH_ATTN: "vllm.v1.attention.backends.flash_attn.FlashAttentionBackend", # noqa: E501 _Backend.TRITON_ATTN: "vllm.v1.attention.backends.triton_attn.TritonAttentionBackend", # noqa: E501 - _Backend.XFORMERS: "vllm.v1.attention.backends.xformers.XFormersAttentionBackend", # noqa: E501 _Backend.ROCM_ATTN: "vllm.v1.attention.backends.rocm_attn.RocmAttentionBackend", # noqa: E501 _Backend.ROCM_AITER_MLA: "vllm.v1.attention.backends.mla.rocm_aiter_mla.AiterMLABackend", # noqa: E501 _Backend.ROCM_AITER_FA: "vllm.v1.attention.backends.rocm_aiter_fa.AiterFlashAttentionBackend", # noqa: E501 diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index 17e025155a43..d9b9af3e25a6 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -55,31 +55,6 @@ FP8_DTYPE = current_platform.fp8_dtype() logger = init_logger(__name__) -USE_XFORMERS_OPS = None - - -def check_xformers_availability(): - global USE_XFORMERS_OPS - if USE_XFORMERS_OPS is not None: - return USE_XFORMERS_OPS - - if current_platform.is_cuda() and current_platform.has_device_capability(100): - # Xformers FA is not compatible with B200 - USE_XFORMERS_OPS = False - else: - try: - from importlib.util import find_spec - - find_spec("xformers.ops") - USE_XFORMERS_OPS = True - except ImportError: - USE_XFORMERS_OPS = False - - # the warning only needs to be 
shown once - if not USE_XFORMERS_OPS: - logger.warning("Xformers is not available, falling back.") - - return USE_XFORMERS_OPS def check_upstream_fa_availability(dtype: torch.dtype): @@ -531,7 +506,6 @@ def __init__( if backend in { _Backend.TORCH_SDPA, - _Backend.XFORMERS, _Backend.PALLAS, _Backend.ROCM_AITER_FA, _Backend.FLASH_ATTN, @@ -547,9 +521,6 @@ def __init__( ) ) - if self.attn_backend == _Backend.XFORMERS and not check_xformers_availability(): - self.attn_backend = _Backend.TORCH_SDPA - self.is_flash_attn_backend = self.attn_backend in { _Backend.FLASH_ATTN, _Backend.ROCM_AITER_FA, @@ -606,12 +577,6 @@ def forward( max_seqlen_k=kv_len, softmax_scale=self.scale, ) - elif self.attn_backend == _Backend.XFORMERS: - from xformers import ops as xops - - out = xops.memory_efficient_attention_forward( - query, key, value, scale=self.scale - ) elif self.attn_backend == _Backend.TORCH_SDPA: query, key, value = (x.transpose(1, 2) for x in (query, key, value)) out = F.scaled_dot_product_attention(query, key, value, scale=self.scale) diff --git a/vllm/attention/ops/vit_attn_wrappers.py b/vllm/attention/ops/vit_attn_wrappers.py index 06a9f7cd8226..7c9d8fac2677 100644 --- a/vllm/attention/ops/vit_attn_wrappers.py +++ b/vllm/attention/ops/vit_attn_wrappers.py @@ -3,7 +3,7 @@ """ This file contains ops for ViT attention to be compatible with torch.compile as there are operations here not supported by torch.compile (for instance, -`to_list` in xformers attn, or `.item()` in flash attention) +using `to_list` in certain attention kernels, or `.item()` in flash attention) Using these ops and wrapping vision blocks with `torch.compile` can speed up throughput in vision models by ~5% relative on H100, and improve token @@ -19,42 +19,6 @@ from vllm.utils.torch_utils import direct_register_custom_op -def xformers_attn_seqlens_wrapper( - q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, seqlens: torch.Tensor -) -> torch.Tensor: - from xformers import ops as xops - from 
xformers.ops.fmha.attn_bias import BlockDiagonalMask - - attn_bias = BlockDiagonalMask.from_seqlens( - q_seqlen=seqlens.tolist(), kv_seqlen=None, device=q.device - ) - context_layer = xops.memory_efficient_attention_forward( - q, k, v, attn_bias=attn_bias, p=0, scale=None - ) - context_layer = einops.rearrange(context_layer, "b s h d -> s b (h d)").contiguous() - return context_layer - - -def xformers_attn_seqlens_wrapper_fake( - q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, seqlens: torch.Tensor -) -> torch.Tensor: - b, s, h, d = q.shape - return torch.empty((s, b, h * d), dtype=q.dtype, device=q.device) - - -direct_register_custom_op( - op_name="xformers_attn_seqlens_wrapper", - op_func=xformers_attn_seqlens_wrapper, - fake_impl=xformers_attn_seqlens_wrapper_fake, -) - - -def vit_xformers_attn_wrapper( - q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, seqlens: torch.Tensor -) -> torch.Tensor: - return torch.ops.vllm.xformers_attn_seqlens_wrapper(q, k, v, seqlens) - - def flash_attn_maxseqlen_wrapper( q: torch.Tensor, k: torch.Tensor, diff --git a/vllm/envs.py b/vllm/envs.py index eb50ea6e5dbe..ccbfb6aa9d8a 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -617,7 +617,6 @@ def get_vllm_port() -> int | None: # Example options: # - "TORCH_SDPA": use torch.nn.MultiheadAttention # - "FLASH_ATTN": use FlashAttention - # - "XFORMERS": use XFormers # - "FLASHINFER": use flashinfer # - "FLASHMLA": use FlashMLA # - "FLASH_ATTN_MLA": use FlashAttention for MLA diff --git a/vllm/model_executor/models/dots_ocr.py b/vllm/model_executor/models/dots_ocr.py index 6d462ad8ae62..eafe577ef888 100644 --- a/vllm/model_executor/models/dots_ocr.py +++ b/vllm/model_executor/models/dots_ocr.py @@ -305,7 +305,6 @@ def __init__( if self.attn_backend not in { _Backend.FLASH_ATTN, _Backend.TORCH_SDPA, - _Backend.XFORMERS, _Backend.ROCM_AITER_FA, }: raise RuntimeError( @@ -373,18 +372,10 @@ def forward( out_i = out_i.permute(0, 2, 1, 3) outputs.append(out_i) context_layer = 
torch.cat(outputs, dim=1) if outputs else q[:, :0] - elif self.attn_backend == _Backend.XFORMERS: - from xformers import ops as xops - from xformers.ops.fmha.attn_bias import BlockDiagonalMask - - attn_bias = BlockDiagonalMask.from_seqlens( - q_seqlen=seqlens, kv_seqlen=None, device=q.device - ) - context_layer = xops.memory_efficient_attention_forward( - q, k, v, attn_bias=attn_bias, p=0, scale=None - ) else: - raise RuntimeError("Unsupported attention backend") + raise RuntimeError( + f"DotsOCR vision attention does not support {self.attn_backend}." + ) # [B,S,H,D] -> [S,B,H*D] -> [S, C] context_layer = context_layer.permute(1, 0, 2, 3).contiguous() @@ -664,14 +655,13 @@ def rot_pos_emb(self, grid_thw: list[list[int]]) -> torch.Tensor: def compute_attn_mask_seqlen( self, cu_seqlens: torch.Tensor ) -> tuple[int | None, list[int] | None]: - max_seqlen, seqlens = None, None + seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist() + max_seqlen = None if ( self.attn_backend == _Backend.FLASH_ATTN or self.attn_backend == _Backend.ROCM_AITER_FA ): max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item() - elif self.attn_backend == _Backend.XFORMERS: - seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist() return max_seqlen, seqlens def forward( diff --git a/vllm/model_executor/models/ernie45_vl.py b/vllm/model_executor/models/ernie45_vl.py index 86536b21c33f..4ad34a72fa0f 100644 --- a/vllm/model_executor/models/ernie45_vl.py +++ b/vllm/model_executor/models/ernie45_vl.py @@ -213,7 +213,6 @@ def __init__( if self.attn_backend not in { _Backend.FLASH_ATTN, _Backend.TORCH_SDPA, - _Backend.XFORMERS, _Backend.ROCM_AITER_FA, }: raise RuntimeError( @@ -258,7 +257,7 @@ def forward( cu_seqlens: torch.Tensor, rotary_pos_emb: torch.Tensor, max_seqlen: int | None = None, # Only used for Flash Attention - seqlens: list[int] | None = None, # Only used for xFormers + seqlens: list[int] | None = None, ) -> torch.Tensor: # [s, b, c] --> [s, b, head * 3 * head_dim] x, _ = self.qkv(x) 
@@ -310,20 +309,10 @@ def forward( context_layer = rearrange( context_layer, "b s h d -> s b (h d)" ).contiguous() - elif self.attn_backend == _Backend.XFORMERS: - from xformers import ops as xops - from xformers.ops.fmha.attn_bias import BlockDiagonalMask - - attn_bias = BlockDiagonalMask.from_seqlens( - q_seqlen=seqlens, kv_seqlen=None, device=q.device - ) - - context_layer = xops.memory_efficient_attention_forward( - q, k, v, attn_bias=attn_bias, p=0, scale=None + else: + raise RuntimeError( + f"ERNIE 4.5-VL does not support {self.attn_backend} backend now." ) - context_layer = rearrange( - context_layer, "b s h d -> s b (h d)" - ).contiguous() output, _ = self.proj(context_layer) return output @@ -403,7 +392,7 @@ def forward( cu_seqlens: torch.Tensor, rotary_pos_emb: torch.Tensor, max_seqlen: int | None = None, # Only used for Flash Attention - seqlens: list[int] | None = None, # Only used for xFormers + seqlens: list[int] | None = None, ) -> torch.Tensor: hidden_states = hidden_states + self.attn( self.norm1(hidden_states), @@ -563,14 +552,13 @@ def rot_pos_emb(self, grid_thw: torch.Tensor) -> torch.Tensor: def compute_attn_mask_seqlen( self, cu_seqlens: torch.Tensor ) -> tuple[int | None, list[int] | None]: - max_seqlen, seqlens = None, None + seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist() + max_seqlen = None if ( self.attn_backend == _Backend.FLASH_ATTN or self.attn_backend == _Backend.ROCM_AITER_FA ): max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item() - elif self.attn_backend == _Backend.XFORMERS: - seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist() return max_seqlen, seqlens def forward( diff --git a/vllm/model_executor/models/glm4_1v.py b/vllm/model_executor/models/glm4_1v.py index 121e84469c52..2f773f4d8a75 100644 --- a/vllm/model_executor/models/glm4_1v.py +++ b/vllm/model_executor/models/glm4_1v.py @@ -308,7 +308,6 @@ def __init__( if self.attn_backend not in { _Backend.FLASH_ATTN, _Backend.TORCH_SDPA, - _Backend.XFORMERS, 
_Backend.ROCM_AITER_FA, }: raise RuntimeError( @@ -343,7 +342,7 @@ def forward( cu_seqlens: torch.Tensor, rotary_pos_emb: torch.Tensor, max_seqlen: int | None = None, # Only used for Flash Attention - seqlens: list[int] | None = None, # Only used for xFormers + seqlens: list[int] | None = None, ) -> torch.Tensor: # [s, b, c] --> [s, b, head * 3 * head_dim] x, _ = self.qkv(x) @@ -396,20 +395,10 @@ def forward( context_layer = rearrange( context_layer, "b s h d -> s b (h d)" ).contiguous() - elif self.attn_backend == _Backend.XFORMERS: - from xformers import ops as xops - from xformers.ops.fmha.attn_bias import BlockDiagonalMask - - attn_bias = BlockDiagonalMask.from_seqlens( - q_seqlen=seqlens, kv_seqlen=None, device=q.device - ) - - context_layer = xops.memory_efficient_attention_forward( - q, k, v, attn_bias=attn_bias, p=0, scale=None + else: + raise RuntimeError( + f"GLM-4V does not support {self.attn_backend} backend now." ) - context_layer = rearrange( - context_layer, "b s h d -> s b (h d)" - ).contiguous() output, _ = self.proj(context_layer) return output @@ -456,7 +445,7 @@ def forward( cu_seqlens: torch.Tensor, rotary_pos_emb: torch.Tensor, max_seqlen: int | None = None, # Only used for Flash Attention - seqlens: list[int] | None = None, # Only used for xFormers + seqlens: list[int] | None = None, ) -> torch.Tensor: x_attn = self.attn( self.norm1(x), diff --git a/vllm/model_executor/models/keye.py b/vllm/model_executor/models/keye.py index 5f8659a3064e..97873e004c57 100644 --- a/vllm/model_executor/models/keye.py +++ b/vllm/model_executor/models/keye.py @@ -415,7 +415,6 @@ def __init__( if self.attn_backend not in { _Backend.FLASH_ATTN, - _Backend.XFORMERS, _Backend.ROCM_AITER_FA, }: raise RuntimeError( @@ -442,7 +441,6 @@ def forward( ) max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item() - seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist() batch_size = q.shape[0] if rope_emb is None: @@ -489,16 +487,9 @@ def forward( softmax_scale=self.scale, 
) context_layer = rearrange(output, "(b s) ... -> b s ...", b=batch_size) - elif self.attn_backend == _Backend.XFORMERS: - from xformers import ops as xops - from xformers.ops.fmha.attn_bias import BlockDiagonalMask - - attn_bias = BlockDiagonalMask.from_seqlens( - q_seqlen=seqlens, kv_seqlen=None, device=q.device - ) - - context_layer = xops.memory_efficient_attention_forward( - q, k, v, attn_bias=attn_bias, p=0, scale=None + else: + raise RuntimeError( + f"Keye-VL does not support {self.attn_backend} backend now." ) context_layer = rearrange(context_layer, "b s h d -> b s (h d)").contiguous() diff --git a/vllm/model_executor/models/paddleocr_vl.py b/vllm/model_executor/models/paddleocr_vl.py index 377b41a35578..2239a664d452 100644 --- a/vllm/model_executor/models/paddleocr_vl.py +++ b/vllm/model_executor/models/paddleocr_vl.py @@ -38,7 +38,6 @@ ) from vllm.attention.ops.vit_attn_wrappers import ( vit_flash_attn_wrapper, - vit_xformers_attn_wrapper, ) from vllm.config import VllmConfig from vllm.config.multimodal import BaseDummyOptions @@ -696,10 +695,6 @@ def forward( context_layer = rearrange( context_layer, "b s h d -> s b (h d)" ).contiguous() - elif self.attn_backend == _Backend.XFORMERS: - if seqlens is None: - raise ValueError("xFormers attention backend requires seqlens tensor.") - context_layer = vit_xformers_attn_wrapper(q, k, v, seqlens) else: raise RuntimeError( f"PaddleOCR-VL does not support {self.attn_backend} backend now." 
@@ -863,7 +858,6 @@ def __init__( if self.attn_backend not in { _Backend.FLASH_ATTN, _Backend.TORCH_SDPA, - _Backend.XFORMERS, _Backend.ROCM_AITER_FA, }: raise RuntimeError( @@ -936,11 +930,9 @@ def forward( cu_seqlens = cu_seqlens.to(device=device) max_seqlen = None - seqlens = None + seqlens = cu_seqlens[1:] - cu_seqlens[:-1] if self.attn_backend in {_Backend.FLASH_ATTN, _Backend.ROCM_AITER_FA}: - max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max() - elif self.attn_backend == _Backend.XFORMERS: - seqlens = cu_seqlens[1:] - cu_seqlens[:-1] + max_seqlen = seqlens.max() hidden_states = inputs_embeds for encoder_layer in self.layers: diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py index 0555717017cd..cdb1dee4e0d6 100644 --- a/vllm/model_executor/models/pixtral.py +++ b/vllm/model_executor/models/pixtral.py @@ -23,6 +23,7 @@ from transformers.models.pixtral.modeling_pixtral import ( PixtralRotaryEmbedding, apply_rotary_pos_emb, + generate_block_attention_mask, position_ids_in_meshgrid, ) from transformers.tokenization_utils_base import TextInput @@ -56,7 +57,6 @@ PromptUpdateDetails, ) from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs -from vllm.platforms import current_platform from vllm.sequence import IntermediateTensors from vllm.transformers_utils.tokenizer import ( MistralTokenizer, @@ -72,17 +72,6 @@ resolve_visual_encoder_outputs, ) -try: - from xformers import ops as xops - - if current_platform.is_cuda() and current_platform.has_device_capability(100): - # Xformers FA is not compatible with B200 - USE_XFORMERS_OPS = False - else: - USE_XFORMERS_OPS = True -except ImportError: - USE_XFORMERS_OPS = False - PATCH_MERGE = "patch_merge" @@ -670,14 +659,11 @@ def forward( q, k = apply_rotary_emb_vit(q, k, freqs_cis=freqs_cis) - if USE_XFORMERS_OPS: - out = xops.memory_efficient_attention(q, k, v, attn_bias=mask) - else: - q = q.transpose(1, 2) - k = k.transpose(1, 2) - v = v.transpose(1, 2) - 
out = nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=mask) - out = out.transpose(1, 2) + q = q.transpose(1, 2) + k = k.transpose(1, 2) + v = v.transpose(1, 2) + out = nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=mask) + out = out.transpose(1, 2) out = out.reshape(batch, patches, self.n_heads * self.head_dim) return self.wo(out) @@ -817,18 +803,9 @@ def forward( freqs_cis = self.freqs_cis[positions[:, 0], positions[:, 1]] # pass through Transformer with a block diagonal mask delimiting images - if USE_XFORMERS_OPS: - mask = xops.fmha.attn_bias.BlockDiagonalMask.from_seqlens( - [p.shape[-2] * p.shape[-1] for p in patch_embeds_list], - ) - else: - from transformers.models.pixtral.modeling_pixtral import ( - generate_block_attention_mask, - ) - - mask = generate_block_attention_mask( - [p.shape[-2] * p.shape[-1] for p in patch_embeds_list], patch_embeds - ) + mask = generate_block_attention_mask( + [p.shape[-2] * p.shape[-1] for p in patch_embeds_list], patch_embeds + ) out = self.transformer(patch_embeds, mask=mask, freqs_cis=freqs_cis) # squeeze dim 0 and split into separate tensors for each image @@ -1097,17 +1074,11 @@ def forward( cos, sin = position_embeddings q, k = apply_rotary_pos_emb(q, k, cos, sin, unsqueeze_dim=0) - if USE_XFORMERS_OPS: - # Transpose q and k back for attention - q = q.transpose(1, 2).contiguous() - k = k.transpose(1, 2).contiguous() - out = xops.memory_efficient_attention(q, k, v, attn_bias=attention_mask) - else: - v = v.transpose(1, 2) - out = nn.functional.scaled_dot_product_attention( - q, k, v, attn_mask=attention_mask - ) - out = out.transpose(1, 2) + v = v.transpose(1, 2) + out = nn.functional.scaled_dot_product_attention( + q, k, v, attn_mask=attention_mask + ) + out = out.transpose(1, 2) out = out.view(batch, patches, self.n_heads * self.head_dim) attn_output, _ = self.o_proj(out) @@ -1283,18 +1254,9 @@ def forward( ).to(self.device) position_embedding = self.patch_positional_embedding(patch_embeds, 
position_ids) - if USE_XFORMERS_OPS: - attention_mask = xops.fmha.attn_bias.BlockDiagonalMask.from_seqlens( - [p.shape[-2] * p.shape[-1] for p in patch_embeds_list], - ) - else: - from transformers.models.pixtral.modeling_pixtral import ( - generate_block_attention_mask, - ) - - attention_mask = generate_block_attention_mask( - [p.shape[-2] * p.shape[-1] for p in patch_embeds_list], patch_embeds - ) + attention_mask = generate_block_attention_mask( + [p.shape[-2] * p.shape[-1] for p in patch_embeds_list], patch_embeds + ) out = self.transformer( patch_embeds, diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index a90cfe96414b..0cabe54e36b5 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -47,7 +47,6 @@ from vllm.attention.ops.vit_attn_wrappers import ( vit_flash_attn_wrapper, vit_torch_sdpa_wrapper, - vit_xformers_attn_wrapper, ) from vllm.compilation.decorators import support_torch_compile from vllm.config import VllmConfig @@ -407,7 +406,7 @@ def forward( cu_seqlens: torch.Tensor, rotary_pos_emb: torch.Tensor, max_seqlen: torch.Tensor, # Only used for Flash Attention - seqlens: torch.Tensor, # Only used for xFormers + seqlens: torch.Tensor, ) -> torch.Tensor: # [s, b, c] --> [s, b, head * 3 * head_dim] x, _ = self.qkv(x) @@ -450,8 +449,10 @@ def forward( v, cu_seqlens, ) - elif self.attn_backend == _Backend.XFORMERS: - context_layer = vit_xformers_attn_wrapper(q, k, v, seqlens) + else: + raise RuntimeError( + f"Unsupported attention backend {self.attn_backend} for Qwen2.5 vision." 
+ ) output, _ = self.proj(context_layer) return output @@ -514,7 +515,7 @@ def forward( cu_seqlens: torch.Tensor, rotary_pos_emb: torch.Tensor, max_seqlen: torch.Tensor, # Only used for Flash Attention - seqlens: torch.Tensor, # Only used for xFormers + seqlens: torch.Tensor, ) -> torch.Tensor: x_attn = self.attn( self.norm1(x), @@ -710,7 +711,6 @@ def __init__( if self.attn_backend not in { _Backend.FLASH_ATTN, _Backend.TORCH_SDPA, - _Backend.XFORMERS, _Backend.ROCM_AITER_FA, }: raise RuntimeError( @@ -849,11 +849,9 @@ def compute_attn_mask_seqlen( cu_seqlens: torch.Tensor, ) -> tuple[torch.Tensor, torch.Tensor]: max_seqlen = torch.zeros([], device=cu_seqlens.device) - seqlens = torch.zeros(1, device=cu_seqlens.device) + seqlens = cu_seqlens[1:] - cu_seqlens[:-1] if self.attn_backend in {_Backend.FLASH_ATTN, _Backend.ROCM_AITER_FA}: - max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max() - elif self.attn_backend == _Backend.XFORMERS: - seqlens = cu_seqlens[1:] - cu_seqlens[:-1] + max_seqlen = seqlens.max() return max_seqlen, seqlens @staticmethod diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 1ec12bdb55df..79135bd0f847 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -380,7 +380,6 @@ def __init__( if self.attn_backend not in { _Backend.FLASH_ATTN, _Backend.TORCH_SDPA, - _Backend.XFORMERS, _Backend.ROCM_AITER_FA, }: raise RuntimeError( @@ -426,7 +425,7 @@ def forward( cu_seqlens: torch.Tensor, rotary_pos_emb: torch.Tensor, max_seqlen: int | None = None, # Only used for Flash Attention - seqlens: list[int] | None = None, # Only used for xFormers + seqlens: list[int] | None = None, ) -> torch.Tensor: # [s, b, c] --> [s, b, 3 * head * head_dim] x, _ = self.qkv(x) @@ -485,20 +484,10 @@ def forward( context_layer = rearrange( context_layer, "b s h d -> s b (h d)" ).contiguous() - elif self.attn_backend == _Backend.XFORMERS: - from xformers import ops as xops - from 
xformers.ops.fmha.attn_bias import BlockDiagonalMask - - attn_bias = BlockDiagonalMask.from_seqlens( - q_seqlen=seqlens, kv_seqlen=None, device=q.device - ) - - context_layer = xops.memory_efficient_attention_forward( - q, k, v, attn_bias=attn_bias, p=0, scale=None + else: + raise RuntimeError( + f"Unsupported attention backend {self.attn_backend} for Qwen2 vision." ) - context_layer = rearrange( - context_layer, "b s h d -> s b (h d)" - ).contiguous() output, _ = self.proj(context_layer) return output @@ -548,7 +537,7 @@ def forward( cu_seqlens: torch.Tensor, rotary_pos_emb: torch.Tensor, max_seqlen: int | None = None, # Only used for Flash Attention - seqlens: list[int] | None = None, # Only used for xFormers + seqlens: list[int] | None = None, ) -> torch.Tensor: x = x + self.attn( self.norm1(x), @@ -788,11 +777,10 @@ def rot_pos_emb(self, grid_thw: list[list[int]]) -> torch.Tensor: def compute_attn_mask_seqlen( self, cu_seqlens: torch.Tensor ) -> tuple[int | None, list[int] | None]: - max_seqlen, seqlens = None, None + seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist() + max_seqlen = None if self.attn_backend in {_Backend.FLASH_ATTN, _Backend.ROCM_AITER_FA}: max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item() - elif self.attn_backend == _Backend.XFORMERS: - seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist() return max_seqlen, seqlens def forward( diff --git a/vllm/model_executor/models/qwen3_omni_moe_thinker.py b/vllm/model_executor/models/qwen3_omni_moe_thinker.py index f20e67902721..b75c3c0b986f 100755 --- a/vllm/model_executor/models/qwen3_omni_moe_thinker.py +++ b/vllm/model_executor/models/qwen3_omni_moe_thinker.py @@ -223,7 +223,7 @@ def forward( cu_seqlens: torch.Tensor, rotary_pos_emb: torch.Tensor, max_seqlen: torch.Tensor, # Only used for Flash Attention - seqlens: torch.Tensor, # Only used for xFormers + seqlens: torch.Tensor, ) -> torch.Tensor: x = x + self.attn( self.norm1(x), @@ -489,11 +489,9 @@ def compute_attn_mask_seqlen( 
cu_seqlens: torch.Tensor, ) -> tuple[torch.Tensor, torch.Tensor]: max_seqlen = torch.zeros([], device=cu_seqlens.device) - seqlens = torch.zeros(1, device=cu_seqlens.device) + seqlens = cu_seqlens[1:] - cu_seqlens[:-1] if self.attn_backend == _Backend.FLASH_ATTN: - max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max() - elif self.attn_backend == _Backend.XFORMERS: - seqlens = cu_seqlens[1:] - cu_seqlens[:-1] + max_seqlen = seqlens.max() return max_seqlen, seqlens def forward( diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index d611580c7182..f352137708a5 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py @@ -232,7 +232,7 @@ def forward( cu_seqlens: torch.Tensor, rotary_pos_emb: torch.Tensor, max_seqlen: torch.Tensor, # Only used for Flash Attention - seqlens: torch.Tensor, # Only used for xFormers + seqlens: torch.Tensor, ) -> torch.Tensor: x = x + self.attn( self.norm1(x), @@ -382,7 +382,6 @@ def __init__( if self.attn_backend not in { _Backend.FLASH_ATTN, _Backend.TORCH_SDPA, - _Backend.XFORMERS, _Backend.ROCM_AITER_FA, }: raise RuntimeError( @@ -514,14 +513,12 @@ def compute_attn_mask_seqlen( cu_seqlens: torch.Tensor, ) -> tuple[torch.Tensor, torch.Tensor]: max_seqlen = torch.zeros([], device=cu_seqlens.device) - seqlens = torch.zeros(1, device=cu_seqlens.device) + seqlens = cu_seqlens[1:] - cu_seqlens[:-1] if ( self.attn_backend == _Backend.FLASH_ATTN or self.attn_backend == _Backend.ROCM_AITER_FA ): - max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max() - elif self.attn_backend == _Backend.XFORMERS: - seqlens = cu_seqlens[1:] - cu_seqlens[:-1] + max_seqlen = seqlens.max() return max_seqlen, seqlens def forward( diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index 32734c3aba5e..1865dcd16e4b 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -220,12 +220,12 @@ def get_vit_attn_backend(cls, head_size: int, dtype: torch.dtype) -> "_Backend": 
from vllm.attention.backends.registry import _Backend # For Blackwell GPUs, force TORCH_SDPA for now. - # See https://github.com/facebookresearch/xformers/issues/1317#issuecomment-3199392579 # noqa: E501 + # See https://github.com/facebookresearch/xformers/issues/1317#issuecomment-3199392579 for additional context. # noqa: E501 if cls.has_device_capability(100): return _Backend.TORCH_SDPA if dtype not in (torch.float16, torch.bfloat16): - return _Backend.XFORMERS + return _Backend.TORCH_SDPA if cls.has_device_capability(80): FLASH_ATTN_V1 = ( @@ -239,11 +239,11 @@ def get_vit_attn_backend(cls, head_size: int, dtype: torch.dtype) -> "_Backend": if is_default_fa_supported: return _Backend.FLASH_ATTN else: - # Fallback to XFORMERS - return _Backend.XFORMERS + # Fallback to SDPA + return _Backend.TORCH_SDPA else: # Fallback for Volta/Turing GPUs or FA not supported - return _Backend.XFORMERS + return _Backend.TORCH_SDPA @classmethod def get_attn_backend_cls( @@ -268,7 +268,6 @@ def get_attn_backend_cls( _Backend.FLASH_ATTN, _Backend.TRITON_ATTN, _Backend.TREE_ATTN, - _Backend.XFORMERS, }: raise ValueError( f"Attention backend {selected_backend} incompatible with MLA. 
" @@ -344,7 +343,6 @@ def get_attn_backend_cls( TRITON_ATTN = "vllm.v1.attention.backends.triton_attn.TritonAttentionBackend" # noqa: E501 FLASH_ATTN_V1 = "vllm.v1.attention.backends.flash_attn.FlashAttentionBackend" # noqa: E501 TREE_ATTN_V1 = "vllm.v1.attention.backends.tree_attn.TreeAttentionBackend" # noqa: E501 - XFORMERS_V1 = "vllm.v1.attention.backends.xformers.XFormersAttentionBackend" # noqa: E501 use_fp8_kv_cache = kv_cache_dtype is not None and kv_cache_dtype.startswith( "fp8" @@ -369,9 +367,6 @@ def get_attn_backend_cls( elif selected_backend == _Backend.TREE_ATTN: logger.info_once("Using Tree Attention backend.") return TREE_ATTN_V1 - elif selected_backend == _Backend.XFORMERS: - logger.info_once("Using XFormers backend.") - return XFORMERS_V1 from vllm.attention.selector import is_attn_backend_supported diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py index b5a7fea2c357..833c1c532468 100644 --- a/vllm/utils/__init__.py +++ b/vllm/utils/__init__.py @@ -58,7 +58,6 @@ def __dir__() -> list[str]: # register, corresponding to possible backends STR_FLASHINFER_ATTN_VAL: str = "FLASHINFER" STR_TORCH_SDPA_ATTN_VAL: str = "TORCH_SDPA" -STR_XFORMERS_ATTN_VAL: str = "XFORMERS" STR_FLASH_ATTN_VAL: str = "FLASH_ATTN" STR_INVALID_VAL: str = "INVALID" diff --git a/vllm/v1/attention/backends/xformers.py b/vllm/v1/attention/backends/xformers.py deleted file mode 100644 index 457b15ebdd82..000000000000 --- a/vllm/v1/attention/backends/xformers.py +++ /dev/null @@ -1,442 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Attention layer with XFormersAttention.""" - -from dataclasses import dataclass -from typing import Optional - -import torch - -from vllm.attention.backends.abstract import ( - AttentionBackend, - AttentionImpl, - AttentionMetadata, - AttentionType, - MultipleOf, -) -from vllm.attention.ops.triton_unified_attention import unified_attention -from vllm.config import 
VllmConfig -from vllm.logger import init_logger -from vllm.v1.attention.backends.utils import ( - AttentionMetadataBuilder, - CommonAttentionMetadata, - split_decodes_and_prefills, -) -from vllm.v1.kv_cache_interface import AttentionSpec - -try: - from xformers import ops as xops - from xformers.ops.fmha.attn_bias import ( - AttentionBias, - PagedBlockDiagonalCausalWithOffsetPaddedKeysMask, - ) - - XFORMERS_AVAILABLE = True -except ImportError: - XFORMERS_AVAILABLE = False - -from vllm import _custom_ops as ops - -logger = init_logger(__name__) - - -class XFormersAttentionBackend(AttentionBackend): - accept_output_buffer: bool = True - - @classmethod - def get_supported_dtypes(cls) -> list[torch.dtype]: - return [torch.float16, torch.bfloat16] - - @classmethod - def get_supported_head_sizes(cls) -> list[int]: - return [ - 32, - 40, - 48, - 56, - 64, - 72, - 80, - 88, - 96, - 104, - 112, - 120, - 128, - 136, - 144, - 152, - 160, - 168, - 176, - 184, - 192, - 200, - 208, - 216, - 224, - 232, - 240, - 248, - 256, - ] - - @staticmethod - def get_supported_kernel_block_size() -> list[int | MultipleOf]: - return [MultipleOf(16)] - - @classmethod - def validate_head_size(cls, head_size: int) -> None: - supported_head_sizes = cls.get_supported_head_sizes() - if head_size not in supported_head_sizes: - attn_type = cls.__name__.removesuffix("Backend") - raise ValueError( - f"Head size {head_size} is not supported by {attn_type}. " - f"Supported head sizes are: {supported_head_sizes}. " - "Set VLLM_ATTENTION_BACKEND=FLEX_ATTENTION to use " - "FlexAttention backend which supports all head sizes." 
- ) - - @staticmethod - def get_name() -> str: - return "XFORMERS" - - @staticmethod - def get_impl_cls() -> type["XFormersAttentionImpl"]: - return XFormersAttentionImpl - - @staticmethod - def get_metadata_cls() -> type["AttentionMetadata"]: - return XFormersAttentionMetadata - - @staticmethod - def get_kv_cache_shape( - num_blocks: int, - block_size: int, - num_kv_heads: int, - head_size: int, - cache_dtype_str: str = "auto", - ) -> tuple[int, ...]: - if block_size % 16 != 0: - raise ValueError("Block size must be a multiple of 16.") - return (2, num_blocks, block_size, num_kv_heads, head_size) - - @staticmethod - def get_builder_cls() -> type["XFormersAttentionMetadataBuilder"]: - return XFormersAttentionMetadataBuilder - - @staticmethod - def use_cascade_attention(*args, **kwargs) -> bool: - return False - - -@dataclass -class XFormersAttentionMetadata: - num_actual_tokens: int # Number of tokens excluding padding. - max_query_len: int - query_start_loc: torch.Tensor - max_seq_len: int - seq_lens: torch.Tensor - block_table: torch.Tensor - slot_mapping: torch.Tensor - - num_prefill_tokens: int = 0 - num_decode_tokens: int = 0 - num_prefills: int = 0 - num_decodes: int = 0 - - # Biases for different attention types. 
- attn_bias: Optional["AttentionBias"] = None - - # Self-attention prefill/decode metadata cache - _cached_prefill_metadata: Optional["XFormersAttentionMetadata"] = None - _cached_decode_metadata: Optional["XFormersAttentionMetadata"] = None - - @property - def prefill_metadata(self) -> Optional["XFormersAttentionMetadata"]: - if self.num_prefills == 0: - return None - - if self._cached_prefill_metadata is not None: - # Recover cached prefill-phase attention - # metadata structure - return self._cached_prefill_metadata - - q_start_loc = self.query_start_loc[self.num_decodes :] - q_seqlens = torch.diff(q_start_loc) - kv_seqlens = self.seq_lens[self.num_decodes :] - # Construct & cache prefill-phase attention metadata structure - self._cached_prefill_metadata = XFormersAttentionMetadata( - num_actual_tokens=self.num_prefill_tokens, - max_query_len=int(q_seqlens.max().item()), - query_start_loc=q_start_loc - q_start_loc[0], - max_seq_len=int(kv_seqlens.max().item()), - seq_lens=kv_seqlens, - block_table=self.block_table[self.num_decodes :], - slot_mapping=self.slot_mapping[self.num_decode_tokens :], - ) - return self._cached_prefill_metadata - - @property - def decode_metadata(self) -> Optional["XFormersAttentionMetadata"]: - if self.num_decode_tokens == 0: - return None - - if self._cached_decode_metadata is not None: - # Recover cached decode-phase attention - # metadata structure - return self._cached_decode_metadata - - q_start_loc = self.query_start_loc - q_seqlens = torch.diff(q_start_loc) - decode_kv_seqlens = self.seq_lens[: self.num_decodes] - # Construct & cache decode-phase attention metadata structure - self._cached_decode_metadata = XFormersAttentionMetadata( - num_actual_tokens=self.num_decode_tokens, - max_query_len=int(q_seqlens[: self.num_decodes].max().item()), - query_start_loc=q_start_loc[: self.num_decodes + 1], - max_seq_len=int(decode_kv_seqlens.max().item()), - seq_lens=decode_kv_seqlens, - block_table=self.block_table[: self.num_decodes], - 
slot_mapping=self.slot_mapping[: self.num_decode_tokens], - attn_bias=self.attn_bias, - ) - return self._cached_decode_metadata - - -class XFormersAttentionMetadataBuilder( - AttentionMetadataBuilder[XFormersAttentionMetadata] -): - reorder_batch_threshold: int = 1 - - def __init__( - self, - kv_cache_spec: AttentionSpec, - layer_names: list[str], - vllm_config: VllmConfig, - device: torch.device, - ): - super().__init__(kv_cache_spec, layer_names, vllm_config, device) - - assert XFORMERS_AVAILABLE - self.block_size = kv_cache_spec.block_size - self._num_decodes = 0 - self._num_decode_tokens = 0 - - def build( - self, - common_prefix_len: int, - common_attn_metadata: CommonAttentionMetadata, - fast_build: bool = False, - ) -> XFormersAttentionMetadata: - num_decodes, num_prefills, num_decode_tokens, num_prefill_tokens = ( - split_decodes_and_prefills( - common_attn_metadata, decode_threshold=self.reorder_batch_threshold - ) - ) - - num_actual_tokens = common_attn_metadata.num_actual_tokens - q_start_loc = common_attn_metadata.query_start_loc - q_seqlens = torch.diff(q_start_loc) - max_query_len = common_attn_metadata.max_query_len - kv_seqlens = common_attn_metadata.seq_lens - max_seq_len = common_attn_metadata.max_seq_len - block_table = common_attn_metadata.block_table_tensor - slot_mapping = common_attn_metadata.slot_mapping - - bias = None - if num_decodes > 0: - # Construct the decoder bias. 
- decode_q_seqlens = q_seqlens[:num_decodes] - decode_kv_seqlens = kv_seqlens[:num_decodes] - bias = PagedBlockDiagonalCausalWithOffsetPaddedKeysMask.from_seqlens( - q_seqlen=decode_q_seqlens.tolist(), - kv_seqlen=decode_kv_seqlens.tolist(), - page_size=self.block_size, - block_tables=block_table[:num_decodes], - device=block_table.device, - ) - - return XFormersAttentionMetadata( - num_actual_tokens=num_actual_tokens, - num_prefill_tokens=num_prefill_tokens, - num_decode_tokens=num_decode_tokens, - num_prefills=num_prefills, - num_decodes=num_decodes, - max_query_len=max_query_len, - query_start_loc=q_start_loc, - max_seq_len=max_seq_len, - seq_lens=kv_seqlens, - block_table=block_table, - slot_mapping=slot_mapping, - attn_bias=bias, - ) - - -class XFormersAttentionImpl(AttentionImpl): - def __init__( - self, - num_heads: int, - head_size: int, - scale: float, - num_kv_heads: int, - alibi_slopes: list[float] | None, - sliding_window: int | None, - kv_cache_dtype: str, - logits_soft_cap: float | None = None, - attn_type: AttentionType = AttentionType.DECODER, - kv_sharing_target_layer_name: str | None = None, - ) -> None: - if kv_sharing_target_layer_name is not None: - raise NotImplementedError("KV sharing is not supported in V0.") - if alibi_slopes is not None: - raise NotImplementedError("XFormers does not support alibi slopes yet.") - self.num_heads = num_heads - self.head_size = head_size - self.scale = float(scale) - self.num_kv_heads = num_kv_heads - self.num_queries_per_kv = self.num_heads // self.num_kv_heads - self.kv_cache_dtype = kv_cache_dtype - self.kv_sharing_target_layer_name = kv_sharing_target_layer_name - if alibi_slopes is not None: - alibi_slopes = torch.tensor(alibi_slopes, dtype=torch.float32) - self.alibi_slopes = alibi_slopes - if sliding_window is None: - self.sliding_window = (-1, -1) - else: - self.sliding_window = (sliding_window - 1, 0) - if logits_soft_cap is None: - # Setting logits_soft_cap to 0 means no soft cap. 
- logits_soft_cap = 0 - self.logits_soft_cap = logits_soft_cap - - XFormersAttentionBackend.validate_head_size(head_size) - - if attn_type != AttentionType.DECODER: - raise NotImplementedError( - "Encoder self-attention and " - "encoder/decoder cross-attention " - "are not implemented for " - "XFormersAttentionImpl." - ) - - def forward( - self, - layer: torch.nn.Module, - query: torch.Tensor, - key: torch.Tensor, - value: torch.Tensor, - kv_cache: torch.Tensor, - attn_metadata: XFormersAttentionMetadata, - output: torch.Tensor | None = None, - output_scale: torch.Tensor | None = None, - output_block_scale: torch.Tensor | None = None, - ) -> torch.Tensor: - """Forward pass with XFormers. - - Args: - query: shape = [num_tokens, num_heads, head_size] - key: shape = [num_tokens, num_kv_heads, head_size] - value: shape = [num_tokens, num_kv_heads, head_size] - kv_cache: shape = - [2, num_blocks, block_size, num_kv_heads, head_size] - attn_metadata: Metadata for attention. - Returns: - shape = [num_tokens, num_heads * head_size] - """ - assert output is not None, "Output tensor must be provided." - - if output_scale is not None or output_block_scale is not None: - raise NotImplementedError( - "fused output quantization is not yet supported" - " for XFormersAttentionImpl" - ) - - if attn_metadata is None: - # Profiling run. - return output.fill_(0) - - # Cache the input KVs. - key_cache, value_cache = kv_cache.unbind(0) - if self.kv_sharing_target_layer_name is None: - # Reshape the input keys and values and store them in the cache. - # Skip this if sharing KV cache with an earlier attention layer. - # NOTE(woosuk): Here, key and value are padded while slot_mapping is - # not padded. However, we don't need to do key[:num_actual_tokens] - # and value[:num_actual_tokens] because the reshape_and_cache_flash - # op uses the slot_mapping's shape to determine the number of - # actual tokens. 
- ops.reshape_and_cache_flash( - key, - value, - key_cache, - value_cache, - attn_metadata.slot_mapping, - self.kv_cache_dtype, - layer._k_scale, - layer._v_scale, - ) - - num_actual_tokens = attn_metadata.num_actual_tokens - num_decode_tokens = attn_metadata.num_decode_tokens - if prefill_meta := attn_metadata.prefill_metadata: - descale_shape = (prefill_meta.query_start_loc.shape[0] - 1, key.shape[1]) - unified_attention( - q=query[num_decode_tokens:num_actual_tokens], - k=key_cache, - v=value_cache, - out=output[num_decode_tokens:num_actual_tokens], - cu_seqlens_q=prefill_meta.query_start_loc, - max_seqlen_q=prefill_meta.max_query_len, - seqused_k=prefill_meta.seq_lens, - max_seqlen_k=prefill_meta.max_seq_len, - softmax_scale=self.scale, - causal=True, - alibi_slopes=self.alibi_slopes, - window_size=self.sliding_window, - block_table=prefill_meta.block_table, - softcap=self.logits_soft_cap, - q_descale=None, # Not supported - k_descale=layer._k_scale.expand(descale_shape), - v_descale=layer._v_scale.expand(descale_shape), - ) - - if decode_meta := attn_metadata.decode_metadata: - # Query for decode. KV is not needed because it is already cached. - decode_query = query[:num_decode_tokens] - # Reshape query to [1, B_T, G, H, D]. - q = decode_query.view( - 1, -1, self.num_kv_heads, self.num_queries_per_kv, self.head_size - ) - # Reshape the k and v caches to [1, Bkv_T, G, H, D] - cache_k = key_cache.view( - 1, -1, self.num_kv_heads, 1, self.head_size - ).expand( - 1, - -1, - self.num_kv_heads, - self.num_queries_per_kv, - self.head_size, - ) - cache_v = value_cache.view( - 1, -1, self.num_kv_heads, 1, self.head_size - ).expand( - 1, - -1, - self.num_kv_heads, - self.num_queries_per_kv, - self.head_size, - ) - - attn_bias = decode_meta.attn_bias - output[:num_decode_tokens] = xops.memory_efficient_attention_forward( - q, - cache_k, - cache_v, - attn_bias=attn_bias, - p=0.0, - scale=self.scale, - ).view(decode_query.shape) - - # Reshape the output tensor. 
- return output