From ab76b2846ffc8bf88cd1228ae8fb70de4d6af824 Mon Sep 17 00:00:00 2001 From: zhandaz Date: Wed, 21 Jan 2026 11:00:19 -0800 Subject: [PATCH 1/9] [Docker][Dev] Fix libnccl-dev version for the CUDA 13.0.1 devel image [Docker][Dev] Fix libnccl-dev version conflict for the CUDA 13.0.1 devel image Further update --- docker/Dockerfile | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index d4ecf96b1485..e1c8f3318a97 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -442,7 +442,6 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \ python${PYTHON_VERSION}-dev \ python${PYTHON_VERSION}-venv \ libibverbs-dev \ - && rm -rf /var/lib/apt/lists/* \ && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \ && update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \ && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \ @@ -453,17 +452,13 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \ # (FlashInfer, DeepGEMM, EP kernels all require compilation at runtime) RUN CUDA_VERSION_DASH=$(echo $CUDA_VERSION | cut -d. -f1,2 | tr '.' '-') && \ apt-get update -y && \ - apt-get install -y --no-install-recommends \ + apt-get install -y --no-install-recommends --no-upgrade \ cuda-nvcc-${CUDA_VERSION_DASH} \ cuda-cudart-${CUDA_VERSION_DASH} \ cuda-nvrtc-${CUDA_VERSION_DASH} \ cuda-cuobjdump-${CUDA_VERSION_DASH} \ libcurand-dev-${CUDA_VERSION_DASH} \ - libcublas-${CUDA_VERSION_DASH} \ - # Fixes nccl_allocator requiring nccl.h at runtime - # https://github.com/vllm-project/vllm/blob/1336a1ea244fa8bfd7e72751cabbdb5b68a0c11a/vllm/distributed/device_communicators/pynccl_allocator.py#L22 - libnccl-dev && \ - rm -rf /var/lib/apt/lists/* + libcublas-${CUDA_VERSION_DASH} # Install uv for faster pip installs RUN python3 -m pip install uv @@ -671,12 +666,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \ fi; \ uv pip install --system -r /tmp/kv_connectors.txt --no-build || ( \ # if the above fails, install from source - apt-get update -y && \ - apt-get install -y --no-install-recommends ${BUILD_PKGS} && \ - uv pip install --system -r /tmp/kv_connectors.txt --no-build-isolation && \ - apt-get purge -y ${BUILD_PKGS} && \ - # clean up -dev packages, keep runtime libraries - rm -rf /var/lib/apt/lists/* \ + uv pip install --system -r /tmp/kv_connectors.txt --no-build-isolation \ ); \ fi From 2dc741957cc9f2135ca75fb9f0fd1eeeb3300f96 Mon Sep 17 00:00:00 2001 From: zhandaz Date: Thu, 22 Jan 2026 11:05:15 -0800 Subject: [PATCH 2/9] feat: Support FA4 for mm-encoder-attn-backend for qwen models --- vllm/config/attention.py | 12 +- .../layers/attention/mm_encoder_attention.py | 41 ++++++- vllm/model_executor/models/qwen2_5_vl.py | 2 + vllm/model_executor/models/qwen3_vl.py | 2 + vllm/platforms/cuda.py | 33 ++++++ vllm/v1/attention/backends/fa4_utils.py | 111 ++++++++++++++++++ vllm/v1/attention/backends/registry.py | 4 + vllm/v1/attention/ops/vit_attn_wrappers.py | 79 +++++++++++++ 8 files changed, 282 insertions(+), 2 deletions(-) create mode 100644 vllm/v1/attention/backends/fa4_utils.py diff --git a/vllm/config/attention.py b/vllm/config/attention.py index 293045787a1c..0fd4a67cb5b6 100644 --- a/vllm/config/attention.py +++ b/vllm/config/attention.py @@ -67,7 +67,17 @@ def compute_hash(self) -> str: def validate_backend_before(cls, value: Any) -> Any: """Enable parsing of the `backend` enum type from string.""" if isinstance(value, str): - 
return AttentionBackendEnum[value.upper()] + value = AttentionBackendEnum[value.upper()] + + # Disallow ViT-only attention tags in the KV-cache attention config. + if value == AttentionBackendEnum.FLASH_ATTN_CUTE: + raise ValueError( + "AttentionConfig.backend does not support FLASH_ATTN_CUTE " + "(FA4 / flash_attn.cute). This is a ViT/MM-encoder-only attention " + "tag. Use --mm-encoder-attn-backend / MultiModalConfig.mm_encoder_attn_backend " + "instead." + ) + return value def _set_from_env_if_set(self, field_name: str, env_var_name: str) -> None: diff --git a/vllm/model_executor/layers/attention/mm_encoder_attention.py b/vllm/model_executor/layers/attention/mm_encoder_attention.py index 44e990d29c16..33e120e7660e 100644 --- a/vllm/model_executor/layers/attention/mm_encoder_attention.py +++ b/vllm/model_executor/layers/attention/mm_encoder_attention.py @@ -11,6 +11,7 @@ from vllm.v1.attention.backends.fa_utils import get_flash_attn_version from vllm.v1.attention.backends.registry import AttentionBackendEnum from vllm.v1.attention.ops.vit_attn_wrappers import ( + vit_fa4_flash_attn_wrapper, vit_flash_attn_wrapper, vit_torch_sdpa_wrapper, ) @@ -79,6 +80,8 @@ def __init__( AttentionBackendEnum.ROCM_AITER_FA, } + self.is_fa4_backend = self.attn_backend == AttentionBackendEnum.FLASH_ATTN_CUTE + self._fa_version = ( get_flash_attn_version() if self.is_flash_attn_backend else None ) @@ -182,6 +185,40 @@ def _forward_fa( output = output.reshape(bsz, q_len, -1) return output + def _forward_fa4( + self, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + cu_seqlens: torch.Tensor | None = None, + max_seqlen: torch.Tensor | None = None, # Only used for Flash Attention + ) -> torch.Tensor: + """FA4 (flash_attn.cute) attention for multimodal encoder (no KV cache).""" + assert (cu_seqlens is not None and max_seqlen is not None) or ( + cu_seqlens is None and max_seqlen is None + ), "cu_seqlens and max_seqlen should be both set or both None." 
+ + bsz, q_len = query.size()[:2] + kv_len = key.size(1) + is_reshaped = query.dim() != 4 + + query, key, value = self.maybe_reshape_qkv_to_4d( + query, key, value, bsz, q_len, kv_len + ) + + output = vit_fa4_flash_attn_wrapper( + q=query, + k=key, + v=value, + batch_size=bsz, + scale=self.scale, + cu_seqlens=cu_seqlens, + max_seqlen=max_seqlen, + ) + if is_reshaped: + output = output.reshape(bsz, q_len, -1) + return output + def forward_native( self, query: torch.Tensor, @@ -200,7 +237,9 @@ def forward_cuda( cu_seqlens: torch.Tensor | None = None, max_seqlen: torch.Tensor | None = None, # Only used for Flash Attention ) -> torch.Tensor: - if self.is_flash_attn_backend: + if self.is_fa4_backend: + return self._forward_fa4(query, key, value, cu_seqlens, max_seqlen) + elif self.is_flash_attn_backend: return self._forward_fa(query, key, value, cu_seqlens, max_seqlen) elif self.attn_backend == AttentionBackendEnum.TORCH_SDPA: return self._forward_sdpa(query, key, value, cu_seqlens) diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index b8da164ee8e3..9cfd12a31903 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -632,6 +632,7 @@ def __init__( if self.attn_backend not in { AttentionBackendEnum.FLASH_ATTN, + AttentionBackendEnum.FLASH_ATTN_CUTE, AttentionBackendEnum.TORCH_SDPA, AttentionBackendEnum.ROCM_AITER_FA, }: @@ -785,6 +786,7 @@ def compute_attn_mask_seqlen( max_seqlen = torch.zeros([], device=cu_seqlens.device) if self.attn_backend in { AttentionBackendEnum.FLASH_ATTN, + AttentionBackendEnum.FLASH_ATTN_CUTE, AttentionBackendEnum.ROCM_AITER_FA, }: max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max() diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index 56c3db49ed77..44b45a08dc1e 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py @@ -396,6 +396,7 @@ def __init__( if self.attn_backend not in { AttentionBackendEnum.FLASH_ATTN, + AttentionBackendEnum.FLASH_ATTN_CUTE, AttentionBackendEnum.TORCH_SDPA, AttentionBackendEnum.ROCM_AITER_FA, }: @@ -538,6 +539,7 @@ def compute_attn_mask_seqlen( max_seqlen = torch.zeros([], device=cu_seqlens.device) if ( self.attn_backend == AttentionBackendEnum.FLASH_ATTN + or self.attn_backend == AttentionBackendEnum.FLASH_ATTN_CUTE or self.attn_backend == AttentionBackendEnum.ROCM_AITER_FA ): max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max() diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index 47d634416ae5..dc2bc5544275 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -361,6 +361,7 @@ def get_attn_backend_cls( def get_supported_vit_attn_backends(cls) -> list["AttentionBackendEnum"]: return [ AttentionBackendEnum.TORCH_SDPA, + AttentionBackendEnum.FLASH_ATTN_CUTE, AttentionBackendEnum.FLASH_ATTN, ] @@ -371,11 +372,43 @@ def get_vit_attn_backend( dtype: torch.dtype, backend: Optional["AttentionBackendEnum"] = None, ) -> "AttentionBackendEnum": + cc = cls.get_device_capability() + if backend is not None: assert backend in cls.get_supported_vit_attn_backends(), ( f"Backend {backend} is not supported for vit attention. " f"Supported backends are: {cls.get_supported_vit_attn_backends()}" ) + + # FA4 is Blackwell-only and opt-in (via --mm-encoder-attn-backend). 
+ if backend == AttentionBackendEnum.FLASH_ATTN_CUTE: + if cc is None or cc.major != 10: + raise ValueError( + "FLASH_ATTN_CUTE (FA4 / flash_attn.cute) is only supported on " + "Blackwell GPUs (compute capability 10.x)." + ) + + from vllm.v1.attention.backends.fa4_utils import ( + is_flash_attn_cute_available, + supports_dtype as fa4_supports_dtype, + warn_if_unoptimized_head_size, + ) + + if not fa4_supports_dtype(dtype): + raise ValueError( + "FLASH_ATTN_CUTE (FA4 / flash_attn.cute) only supports " + "float16/bfloat16 for ViT attention." + ) + + if not is_flash_attn_cute_available(): + raise ImportError( + "FLASH_ATTN_CUTE (FA4 / flash_attn.cute) selected, but " + "`flash_attn.cute.interface` is not available in this " + "environment." + ) + + warn_if_unoptimized_head_size(head_size) + logger.info_once(f"Using backend {backend} for vit attention") return backend diff --git a/vllm/v1/attention/backends/fa4_utils.py b/vllm/v1/attention/backends/fa4_utils.py new file mode 100644 index 000000000000..26e7262f4e08 --- /dev/null +++ b/vllm/v1/attention/backends/fa4_utils.py @@ -0,0 +1,111 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from __future__ import annotations + +from importlib.util import find_spec +from typing import Optional, Tuple + +import torch + +from vllm.logger import init_logger +from vllm.platforms import current_platform + +logger = init_logger(__name__) + +# flash_attn.cute.interface (Cute-DSL / FA4). +# +# NOTE: vLLM currently only enables this path for **Blackwell** GPUs +# (compute capability 10.x) and only for ViT/MM encoder attention. +# It is NOT a KV-cache attention backend. +_OPTIMIZED_HEAD_SIZES: tuple[int, ...] = (64, 96, 128, 192) + + +def warn_if_unoptimized_head_size(head_size: int) -> None: + """Warn if `head_size` is outside the known-optimized set. + + We intentionally don't hard-block on head_size here, since upstream support + may evolve and some shapes may still work (albeit slower). + """ + if head_size not in _OPTIMIZED_HEAD_SIZES: + logger.warning_once( + "FA4 (flash_attn.cute) selected for head_size=%d, which is not in the " + "known-optimized set %s. The kernel may be slower or unsupported.", + head_size, + _OPTIMIZED_HEAD_SIZES, + ) + + +def supports_dtype(dtype: torch.dtype) -> bool: + return dtype in (torch.float16, torch.bfloat16) + + +def supports_device() -> bool: + if not current_platform.is_cuda(): + return False + cc = current_platform.get_device_capability() + return cc is not None and cc.major == 10 + + +def is_flash_attn_cute_available() -> bool: + """Best-effort availability check for FA4 (flash_attn.cute). + + This intentionally avoids importing `flash_attn.cute.interface` because + that may pull in heavy deps (cutlass-dsl / cuda-python). The actual import + happens in `flash_attn_varlen_func`. 
+ """ + if not supports_device(): + return False + return find_spec("flash_attn.cute.interface") is not None + + +def flash_attn_varlen_func( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + *, + cu_seqlens_q: Optional[torch.Tensor] = None, + cu_seqlens_k: Optional[torch.Tensor] = None, + max_seqlen_q: Optional[int] = None, + max_seqlen_k: Optional[int] = None, + seqused_q: Optional[torch.Tensor] = None, + seqused_k: Optional[torch.Tensor] = None, + softmax_scale: Optional[float] = None, + causal: bool = False, + window_size: Tuple[Optional[int], Optional[int]] = (None, None), + deterministic: bool = False, +) -> torch.Tensor: + """FA4 (Cute-DSL) FlashAttention varlen forward. + + Wraps `flash_attn.cute.interface.flash_attn_varlen_func`, which returns + `(out, lse)`. vLLM only needs `out` for inference. + """ + if not current_platform.is_cuda(): + raise RuntimeError("FA4 (flash_attn.cute) is only supported on CUDA.") + + try: + from flash_attn.cute.interface import flash_attn_varlen_func as _fa4_varlen + except Exception as e: + raise ImportError( + "FA4 (flash_attn.cute) is not available. " + "Please ensure the Cute-DSL FlashAttention build is installed " + "(e.g. nvidia-cutlass-dsl) and cuda-python bindings are present." + ) from e + + out, _lse = _fa4_varlen( + q, + k, + v, + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_k=cu_seqlens_k, + max_seqlen_q=max_seqlen_q, + max_seqlen_k=max_seqlen_k, + seqused_q=seqused_q, + seqused_k=seqused_k, + softmax_scale=softmax_scale, + causal=causal, + window_size=window_size, + deterministic=deterministic, + ) + return out + diff --git a/vllm/v1/attention/backends/registry.py b/vllm/v1/attention/backends/registry.py index bd45702fa587..6bdf9691b402 100644 --- a/vllm/v1/attention/backends/registry.py +++ b/vllm/v1/attention/backends/registry.py @@ -42,6 +42,10 @@ class AttentionBackendEnum(Enum, metaclass=_AttentionBackendEnumMeta): """ FLASH_ATTN = "vllm.v1.attention.backends.flash_attn.FlashAttentionBackend" + # FA4 (Cute-DSL) - this tag is only used for ViT (MM encoder) attention. + # NOTE: This backend does not implement the KV-cache attention path and + # should not be used with `--attention-config.backend`. + FLASH_ATTN_CUTE = "flash_attn.cute" FLASH_ATTN_DIFFKV = ( "vllm.v1.attention.backends.flash_attn_diffkv.FlashAttentionDiffKVBackend" ) diff --git a/vllm/v1/attention/ops/vit_attn_wrappers.py b/vllm/v1/attention/ops/vit_attn_wrappers.py index f077a61c984f..8fa0a442c3a5 100644 --- a/vllm/v1/attention/ops/vit_attn_wrappers.py +++ b/vllm/v1/attention/ops/vit_attn_wrappers.py @@ -110,6 +110,85 @@ def vit_flash_attn_wrapper( ) +def fa4_flash_attn_maxseqlen_wrapper( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + batch_size: int, + scale: float | None = None, + cu_seqlens: torch.Tensor | None = None, + max_seqlen: torch.Tensor | None = None, +) -> torch.Tensor: + """FA4 (flash_attn.cute) wrapper for ViT attention. + + flash_attn.cute returns (out, lse); we only return out. + """ + from vllm.v1.attention.backends.fa4_utils import ( + flash_attn_varlen_func as fa4_flash_attn_varlen_func, + ) + + q_len = q.size(1) + if cu_seqlens is None: + cu_seqlens = torch.arange( + 0, (batch_size + 1) * q_len, step=q_len, dtype=torch.int32, device=q.device + ) + max_seqlen_int = q_len if max_seqlen is None else max_seqlen.item() + + q, k, v = (einops.rearrange(x, "b s ... 
-> (b s) ...") for x in [q, k, v]) + output = fa4_flash_attn_varlen_func( + q, + k, + v, + cu_seqlens_q=cu_seqlens, + cu_seqlens_k=cu_seqlens, + max_seqlen_q=max_seqlen_int, + max_seqlen_k=max_seqlen_int, + softmax_scale=scale, + causal=False, + ) + context_layer = einops.rearrange(output, "(b s) h d -> b s h d", b=batch_size) + return context_layer + + +def fa4_flash_attn_maxseqlen_wrapper_fake( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + batch_size: int, + scale: float | None = None, + cu_seqlens: torch.Tensor | None = None, + max_seqlen: torch.Tensor | None = None, +) -> torch.Tensor: + return torch.empty_like(q) + + +direct_register_custom_op( + op_name="fa4_flash_attn_maxseqlen_wrapper", + op_func=fa4_flash_attn_maxseqlen_wrapper, + fake_impl=fa4_flash_attn_maxseqlen_wrapper_fake, +) + + +def vit_fa4_flash_attn_wrapper( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + batch_size: int, + scale: float | None = None, + cu_seqlens: torch.Tensor | None = None, + max_seqlen: torch.Tensor | None = None, +) -> torch.Tensor: + return torch.ops.vllm.fa4_flash_attn_maxseqlen_wrapper( + q, + k, + v, + batch_size, + scale, + cu_seqlens, + max_seqlen, + ) + + def apply_sdpa( q: torch.Tensor, k: torch.Tensor, From c0de26056cd5dcd25507709932c69073359fa64b Mon Sep 17 00:00:00 2001 From: zhandaz Date: Thu, 22 Jan 2026 12:39:15 -0800 Subject: [PATCH 3/9] feat: Kernel warmup for vit fa4 --- vllm/model_executor/warmup/fa4_warmup.py | 151 ++++++++++++++++++++ vllm/model_executor/warmup/kernel_warmup.py | 5 + 2 files changed, 156 insertions(+) create mode 100644 vllm/model_executor/warmup/fa4_warmup.py diff --git a/vllm/model_executor/warmup/fa4_warmup.py b/vllm/model_executor/warmup/fa4_warmup.py new file mode 100644 index 000000000000..4bcb6881b7e6 --- /dev/null +++ b/vllm/model_executor/warmup/fa4_warmup.py @@ -0,0 +1,151 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Warmup FA4 (flash_attn.cute) kernels for ViT/MM encoder attention. + +We specifically warm up the FlashAttention Cute-DSL (FA4) compile cache by +running a few representative varlen attention calls that differ only in +sequence length. This helps avoid JIT compilation in the hot path. 
+ +This warmup is: +- Blackwell-only (compute capability 10.x) +- Opt-in (only when mm_encoder_attn_backend == FLASH_ATTN_CUTE) +- Scoped to Qwen3-VL / Qwen3-VL-MoE vision transformer workloads +""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +import torch + +from vllm.logger import init_logger +from vllm.platforms import current_platform +from vllm.v1.attention.backends.registry import AttentionBackendEnum + +if TYPE_CHECKING: + from vllm.v1.worker.gpu_worker import Worker + +logger = init_logger(__name__) + + +def _get_default_qwen3_vit_warmup_seqlens(max_positions: int | None) -> list[int]: + candidates = [ + 16**2, # 256 + 24**2, # 576 + 32**2, # 1024 + 48**2, # 2304 + 64**2, # 4096 + 96**2, # 9216 + 128**2, # 16384 + 192**2, # 36864 + 256**2, # 65536 + ] + if max_positions is None: + return candidates + return [s for s in candidates if s <= max_positions] + + +def should_fa4_vit_warmup(worker: "Worker") -> bool: + """Fast predicate used by `kernel_warmup` to gate FA4 warmup.""" + if not current_platform.is_cuda(): + return False + cc = current_platform.get_device_capability() + if cc is None or cc.major != 10: + return False + + mm_cfg = getattr(worker.model_config, "multimodal_config", None) + return ( + mm_cfg is not None + and mm_cfg.mm_encoder_attn_backend == AttentionBackendEnum.FLASH_ATTN_CUTE + ) + + +def fa4_vit_warmup(worker: "Worker") -> None: + """Warm up FA4 kernels for Qwen3-VL(-MoE) ViT attention.""" + + # Config gating: only warm up when explicitly selected for mm encoder. + if not should_fa4_vit_warmup(worker): + return + + # Dependency gating. + from vllm.v1.attention.backends.fa4_utils import ( + is_flash_attn_cute_available, + supports_dtype, + warn_if_unoptimized_head_size, + ) + + if not is_flash_attn_cute_available(): + logger.warning( + "Skipping FA4 warmup: `flash_attn.cute.interface` is not available." + ) + return + + model = worker.get_model() + visual = getattr(model, "visual", None) + if visual is None: + # Not a Qwen3-VL(-MoE) style model, or vision tower disabled. + logger.warning( + "Skipping FA4 warmup: not a Qwen3-VL(-MoE) style model, or vision tower disabled." + ) + return + + # Derive head shape and dtype from the actual vision attention module. + try: + first_attn = visual.blocks[0].attn # Qwen2_5_VisionAttention + head_size = int(first_attn.hidden_size_per_attention_head) + num_heads = int(first_attn.num_attention_heads_per_partition) + scale = float(first_attn.hidden_size_per_attention_head**-0.5) + dtype = visual.dtype + except Exception: + # If the model structure is unexpected, skip warmup. + return + + if not supports_dtype(dtype): + # If dtype is not supported, the FA4 backend should not have been selected. + logger.warning_once( + "Skipping FA4 warmup: dtype %s is not supported by flash_attn.cute.", + dtype, + ) + return + + warn_if_unoptimized_head_size(head_size) + + max_positions = getattr(visual, "num_position_embeddings", None) + seqlens = _get_default_qwen3_vit_warmup_seqlens( + int(max_positions) if max_positions is not None else None + ) + + logger.info_once( + "Warming up FA4 (flash_attn.cute) ViT kernels for seqlens=%s (head_size=%d, num_heads=%d, dtype=%s).", + seqlens, + head_size, + num_heads, + dtype, + ) + + # Run a small number of representative calls that only vary seqlen. + # Compilation key can be found under `flash_attn/cute/interface.py`. 
+ from vllm.v1.attention.backends.fa4_utils import flash_attn_varlen_func + + device = torch.device("cuda") + with torch.inference_mode(): + for seqlen in seqlens: + q = torch.empty((seqlen, num_heads, head_size), device=device, dtype=dtype) + k = torch.empty_like(q) + v = torch.empty_like(q) + cu = torch.tensor([0, seqlen], device=device, dtype=torch.int32) + + # This call will populate FA4's internal compile cache (Cute-DSL). + _ = flash_attn_varlen_func( + q, + k, + v, + cu_seqlens_q=cu, + cu_seqlens_k=cu, + max_seqlen_q=seqlen, + max_seqlen_k=seqlen, + softmax_scale=scale, + causal=False, + ) + diff --git a/vllm/model_executor/warmup/kernel_warmup.py b/vllm/model_executor/warmup/kernel_warmup.py index 98b28d3e5292..515b3b95e55c 100644 --- a/vllm/model_executor/warmup/kernel_warmup.py +++ b/vllm/model_executor/warmup/kernel_warmup.py @@ -13,6 +13,7 @@ import vllm.envs as envs from vllm.logger import init_logger from vllm.model_executor.warmup.deep_gemm_warmup import deep_gemm_warmup +from vllm.model_executor.warmup.fa4_warmup import fa4_vit_warmup, should_fa4_vit_warmup from vllm.platforms import current_platform from vllm.utils.deep_gemm import is_deep_gemm_supported from vllm.utils.flashinfer import has_flashinfer @@ -72,6 +73,10 @@ def _is_flashinfer_backend(backend): create_mixed_batch=True, ) + # FA4 (flash_attn.cute) warmup for ViT/MM encoder attention. + if should_fa4_vit_warmup(worker): + fa4_vit_warmup(worker) + def flashinfer_autotune(runner: "GPUModelRunner") -> None: """ From c87d6424ecfe56eb1696818a54269d3557b61d20 Mon Sep 17 00:00:00 2001 From: Zhanda Date: Fri, 23 Jan 2026 11:03:07 -0800 Subject: [PATCH 4/9] fix: Fix some minor conflicts due to the introduction of flash_attn.cute --- README.md | 31 +++++++++++++++++++ .../layers/rotary_embedding/common.py | 5 ++- vllm/model_executor/warmup/fa4_warmup.py | 7 ++--- 3 files changed, 37 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 705fbcb9150b..f209c4c6bd80 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,35 @@ + +# FA4 Integration + +### (1) Support fa4 in vllm. + +From low-level to high-level: +1. Add `FLASH_ATTN_CUTE` (FA4 / `flash_attn.cute`) to `vllm/v1/attention/backends/registry.py` (`AttentionBackendEnum`). +2. Create a new file `vllm/v1/attention/backends/fa4_utils.py`, for the utils / imports for fa4 (keep imports lazy). +3. Register the new backend in `vllm/platforms/cuda.py` (FA4 is **Blackwell-only (CC 10.x)** and **opt-in** via `--mm-encoder-attn-backend FLASH_ATTN_CUTE`; default remains FA2/3 or Torch SDPA). +4. Add the fa4 custom op under `vllm/v1/attention/ops/vit_attn_wrappers.py`. +5. Update `vllm/model_executor/layers/attention/mm_encoder_attention.py` to add another _forward_impl method for fa4 (`FLASH_ATTN_CUTE`). +6. Update `vllm/model_executor/models/qwen3_vl.py` and (optionally) `qwen2_5_vl.py` to accept `FLASH_ATTN_CUTE` and compute `max_seqlen` for it. + +Notes: +- FA4 (`flash_attn.cute`) is only considered on **Blackwell** (compute capability 10.x) in this vLLM fork. +- To force FA4 for ViT/MM encoder attention (Blackwell only): `--mm-encoder-attn-backend FLASH_ATTN_CUTE`. + +### (2) Do the kernel_warmup in vllm. + +- Add a FA4 ViT warmup in `vllm/model_executor/warmup/kernel_warmup.py` (see `vllm/model_executor/warmup/fa4_warmup.py`). +- Scope: **Qwen3-VL / Qwen3-VL-MoE** vision transformer only, **Blackwell-only**, and only when `--mm-encoder-attn-backend FLASH_ATTN_CUTE` is set. 
+- Candidate seqlens (only varying seqlen): `[64, 256, 576, 1024, 2304, 4096, 9216, 16384, 36864, 65536]` (filtered by `vision_config.num_position_embeddings` if smaller).
+
+### (3) Minor fixes for FA4 integration.
+
+- In `vllm/model_executor/layers/rotary_embedding/common.py`, there is a check `if find_spec("flash_attn") is not None:`.
+  However, only `flash_attn.cute` is installed here, not the full flash_attn package, so the unconditional import fails.
+  A minor fix (a guarded import) is therefore needed to avoid the import error.
+
+---
+
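To make the warmup described in (2) concrete, each iteration reduces to roughly the following (a sketch only; the real implementation is `vllm/model_executor/warmup/fa4_warmup.py` later in this series, and `num_heads`, `head_size`, the dtype, and the seqlen subset here are illustrative values):

```python
# Sketch of the per-seqlen warmup call from (2); the full version lives in
# vllm/model_executor/warmup/fa4_warmup.py. Shapes/dtype below are illustrative.
import torch

from vllm.v1.attention.backends.fa4_utils import flash_attn_varlen_func

num_heads, head_size, dtype = 16, 128, torch.bfloat16
with torch.inference_mode():
    for seqlen in (256, 1024, 4096, 16384):
        q = torch.empty((seqlen, num_heads, head_size), device="cuda", dtype=dtype)
        k, v = torch.empty_like(q), torch.empty_like(q)
        cu = torch.tensor([0, seqlen], device="cuda", dtype=torch.int32)
        # Each call populates FA4's Cute-DSL compile cache; the output is discarded.
        flash_attn_varlen_func(
            q, k, v,
            cu_seqlens_q=cu, cu_seqlens_k=cu,
            max_seqlen_q=seqlen, max_seqlen_k=seqlen,
            softmax_scale=head_size**-0.5,
            causal=False,
        )
```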

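The guarded import described in (3) amounts to the logic below; the actual change is the `common.py` diff that follows:

```python
# Same logic as the rotary_embedding/common.py diff below: flash_attn.cute makes
# find_spec("flash_attn") succeed, but the Triton rotary op from the full
# flash-attn build may be absent, so the import must be allowed to fail.
from importlib.util import find_spec

apply_rotary = None
if find_spec("flash_attn") is not None:
    try:
        from flash_attn.ops.triton.rotary import apply_rotary
    except ImportError:
        apply_rotary = None
```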
diff --git a/vllm/model_executor/layers/rotary_embedding/common.py b/vllm/model_executor/layers/rotary_embedding/common.py index 34de1da561f5..6836ba8d7566 100644 --- a/vllm/model_executor/layers/rotary_embedding/common.py +++ b/vllm/model_executor/layers/rotary_embedding/common.py @@ -135,7 +135,10 @@ def __init__( self.apply_rotary_emb_flash_attn = None if find_spec("flash_attn") is not None: - from flash_attn.ops.triton.rotary import apply_rotary + try: + from flash_attn.ops.triton.rotary import apply_rotary + except ImportError: + apply_rotary = None self.apply_rotary_emb_flash_attn = apply_rotary diff --git a/vllm/model_executor/warmup/fa4_warmup.py b/vllm/model_executor/warmup/fa4_warmup.py index 4bcb6881b7e6..ebdfabb66915 100644 --- a/vllm/model_executor/warmup/fa4_warmup.py +++ b/vllm/model_executor/warmup/fa4_warmup.py @@ -29,7 +29,7 @@ logger = init_logger(__name__) -def _get_default_qwen3_vit_warmup_seqlens(max_positions: int | None) -> list[int]: +def _get_default_qwen3_vit_warmup_seqlens(max_positions: int | None = None) -> list[int]: candidates = [ 16**2, # 256 24**2, # 576 @@ -111,10 +111,7 @@ def fa4_vit_warmup(worker: "Worker") -> None: warn_if_unoptimized_head_size(head_size) - max_positions = getattr(visual, "num_position_embeddings", None) - seqlens = _get_default_qwen3_vit_warmup_seqlens( - int(max_positions) if max_positions is not None else None - ) + seqlens = tuple(_get_default_qwen3_vit_warmup_seqlens()) logger.info_once( "Warming up FA4 (flash_attn.cute) ViT kernels for seqlens=%s (head_size=%d, num_heads=%d, dtype=%s).", From a7d29c094d579d53822aa43bc3a2686a3bf922da Mon Sep 17 00:00:00 2001 From: Zhanda Date: Fri, 23 Jan 2026 11:05:48 -0800 Subject: [PATCH 5/9] Revert "[Docker][Dev] Fix libnccl-dev version for the CUDA 13.0.1 devel image" This reverts commit ab76b2846ffc8bf88cd1228ae8fb70de4d6af824. --- docker/Dockerfile | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index e1c8f3318a97..d4ecf96b1485 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -442,6 +442,7 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \ python${PYTHON_VERSION}-dev \ python${PYTHON_VERSION}-venv \ libibverbs-dev \ + && rm -rf /var/lib/apt/lists/* \ && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \ && update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \ && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \ @@ -452,13 +453,17 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \ # (FlashInfer, DeepGEMM, EP kernels all require compilation at runtime) RUN CUDA_VERSION_DASH=$(echo $CUDA_VERSION | cut -d. -f1,2 | tr '.' 
'-') && \ apt-get update -y && \ - apt-get install -y --no-install-recommends --no-upgrade \ + apt-get install -y --no-install-recommends \ cuda-nvcc-${CUDA_VERSION_DASH} \ cuda-cudart-${CUDA_VERSION_DASH} \ cuda-nvrtc-${CUDA_VERSION_DASH} \ cuda-cuobjdump-${CUDA_VERSION_DASH} \ libcurand-dev-${CUDA_VERSION_DASH} \ - libcublas-${CUDA_VERSION_DASH} + libcublas-${CUDA_VERSION_DASH} \ + # Fixes nccl_allocator requiring nccl.h at runtime + # https://github.com/vllm-project/vllm/blob/1336a1ea244fa8bfd7e72751cabbdb5b68a0c11a/vllm/distributed/device_communicators/pynccl_allocator.py#L22 + libnccl-dev && \ + rm -rf /var/lib/apt/lists/* # Install uv for faster pip installs RUN python3 -m pip install uv @@ -666,7 +671,12 @@ RUN --mount=type=cache,target=/root/.cache/uv \ fi; \ uv pip install --system -r /tmp/kv_connectors.txt --no-build || ( \ # if the above fails, install from source - uv pip install --system -r /tmp/kv_connectors.txt --no-build-isolation \ + apt-get update -y && \ + apt-get install -y --no-install-recommends ${BUILD_PKGS} && \ + uv pip install --system -r /tmp/kv_connectors.txt --no-build-isolation && \ + apt-get purge -y ${BUILD_PKGS} && \ + # clean up -dev packages, keep runtime libraries + rm -rf /var/lib/apt/lists/* \ ); \ fi From 911292e59d4ca9ac336a4fc5219711c5dd7e3dde Mon Sep 17 00:00:00 2001 From: Zhanda Date: Fri, 23 Jan 2026 11:11:26 -0800 Subject: [PATCH 6/9] chore: Update requirements and revert README.md --- README.md | 31 ------------------------------- requirements/cuda.txt | 2 ++ 2 files changed, 2 insertions(+), 31 deletions(-) diff --git a/README.md b/README.md index f209c4c6bd80..705fbcb9150b 100644 --- a/README.md +++ b/README.md @@ -1,35 +1,4 @@ - -# FA4 Integration - -### (1) Support fa4 in vllm. - -From low-level to high-level: -1. Add `FLASH_ATTN_CUTE` (FA4 / `flash_attn.cute`) to `vllm/v1/attention/backends/registry.py` (`AttentionBackendEnum`). -2. Create a new file `vllm/v1/attention/backends/fa4_utils.py`, for the utils / imports for fa4 (keep imports lazy). -3. Register the new backend in `vllm/platforms/cuda.py` (FA4 is **Blackwell-only (CC 10.x)** and **opt-in** via `--mm-encoder-attn-backend FLASH_ATTN_CUTE`; default remains FA2/3 or Torch SDPA). -4. Add the fa4 custom op under `vllm/v1/attention/ops/vit_attn_wrappers.py`. -5. Update `vllm/model_executor/layers/attention/mm_encoder_attention.py` to add another _forward_impl method for fa4 (`FLASH_ATTN_CUTE`). -6. Update `vllm/model_executor/models/qwen3_vl.py` and (optionally) `qwen2_5_vl.py` to accept `FLASH_ATTN_CUTE` and compute `max_seqlen` for it. - -Notes: -- FA4 (`flash_attn.cute`) is only considered on **Blackwell** (compute capability 10.x) in this vLLM fork. -- To force FA4 for ViT/MM encoder attention (Blackwell only): `--mm-encoder-attn-backend FLASH_ATTN_CUTE`. - -### (2) Do the kernel_warmup in vllm. - -- Add a FA4 ViT warmup in `vllm/model_executor/warmup/kernel_warmup.py` (see `vllm/model_executor/warmup/fa4_warmup.py`). -- Scope: **Qwen3-VL / Qwen3-VL-MoE** vision transformer only, **Blackwell-only**, and only when `--mm-encoder-attn-backend FLASH_ATTN_CUTE` is set. -- Candidate seqlens (only varying seqlen): `[64, 256, 576, 1024, 2304, 4096, 9216, 16384, 36864, 65536]` (filtered by `vision_config.num_position_embeddings` if smaller). - -### (3) Minor fixes for FA4 integration. 
-
-- In `vllm/model_executor/layers/rotary_embedding/common.py`, there is a check `if find_spec("flash_attn") is not None:`.
-  However, only `flash_attn.cute` is installed here, not the full flash_attn package, so the unconditional import fails.
-  A minor fix (a guarded import) is therefore needed to avoid the import error.
-
----
-

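A rough post-install check for the requirement pinned in the `requirements/cuda.txt` diff below (a sketch; it assumes a Blackwell GPU and bf16 inputs, and uses the same `flash_attn.cute.interface` entry point that `fa4_utils.py` wraps, which returns `(out, lse)`):

```python
# Post-install smoke test for the flash-attn cute requirement pinned below
# (a sketch; assumes a Blackwell GPU).
import torch
from flash_attn.cute.interface import flash_attn_varlen_func

s, h, d = 256, 8, 128
q = torch.randn(s, h, d, device="cuda", dtype=torch.bfloat16)
k, v = torch.randn_like(q), torch.randn_like(q)
cu = torch.tensor([0, s], device="cuda", dtype=torch.int32)
out, _lse = flash_attn_varlen_func(
    q, k, v,
    cu_seqlens_q=cu, cu_seqlens_k=cu,
    max_seqlen_q=s, max_seqlen_k=s,
    causal=False,
)
assert out.shape == q.shape
```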
diff --git a/requirements/cuda.txt b/requirements/cuda.txt index 1417fb99120b..92d4cc59d118 100644 --- a/requirements/cuda.txt +++ b/requirements/cuda.txt @@ -11,3 +11,5 @@ torchaudio==2.9.1 torchvision==0.24.1 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version # FlashInfer should be updated together with the Dockerfile flashinfer-python==0.5.3 +# FA4 +git+https://github.com/Dao-AILab/flash-attention.git@2580b5a4882562640f3cfbffd2bb8d2de9268f9f#subdirectory=flash_attn/cute \ No newline at end of file From f6e3ae7366b98861179d46dfc1f05b548baa4d83 Mon Sep 17 00:00:00 2001 From: Zhanda Date: Fri, 23 Jan 2026 11:45:09 -0800 Subject: [PATCH 7/9] chore: Install git for flash_attn cute installation --- docker/Dockerfile | 1 + requirements/cuda.txt | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index d4ecf96b1485..0ba7b0934411 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -423,6 +423,7 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \ libsm6 \ libxext6 \ libgl1 \ + git \ && if [ ! -z ${DEADSNAKES_MIRROR_URL} ] ; then \ if [ ! -z "${DEADSNAKES_GPGKEY_URL}" ] ; then \ mkdir -p -m 0755 /etc/apt/keyrings ; \ diff --git a/requirements/cuda.txt b/requirements/cuda.txt index 92d4cc59d118..380bbc30e3d1 100644 --- a/requirements/cuda.txt +++ b/requirements/cuda.txt @@ -12,4 +12,4 @@ torchvision==0.24.1 # Required for phi3v processor. See https://github.com/pytor # FlashInfer should be updated together with the Dockerfile flashinfer-python==0.5.3 # FA4 -git+https://github.com/Dao-AILab/flash-attention.git@2580b5a4882562640f3cfbffd2bb8d2de9268f9f#subdirectory=flash_attn/cute \ No newline at end of file +flash-attn-cute @ git+https://github.com/Dao-AILab/flash-attention.git@2580b5a4882562640f3cfbffd2bb8d2de9268f9f#subdirectory=flash_attn/cute \ No newline at end of file From 034e03a7eb1b7a46cbe12904696cf1a660efd7c3 Mon Sep 17 00:00:00 2001 From: Zhanda Date: Sat, 24 Jan 2026 17:17:19 -0800 Subject: [PATCH 8/9] lint: Fix linting --- vllm/config/attention.py | 4 ++-- vllm/model_executor/warmup/fa4_warmup.py | 22 +++++++++++----------- vllm/platforms/cuda.py | 4 +++- vllm/v1/attention/backends/fa4_utils.py | 18 ++++++++---------- 4 files changed, 24 insertions(+), 24 deletions(-) diff --git a/vllm/config/attention.py b/vllm/config/attention.py index 0fd4a67cb5b6..8f7f20ef133a 100644 --- a/vllm/config/attention.py +++ b/vllm/config/attention.py @@ -74,8 +74,8 @@ def validate_backend_before(cls, value: Any) -> Any: raise ValueError( "AttentionConfig.backend does not support FLASH_ATTN_CUTE " "(FA4 / flash_attn.cute). This is a ViT/MM-encoder-only attention " - "tag. Use --mm-encoder-attn-backend / MultiModalConfig.mm_encoder_attn_backend " - "instead." + "tag. Use --mm-encoder-attn-backend / " + "MultiModalConfig.mm_encoder_attn_backend instead." 
) return value diff --git a/vllm/model_executor/warmup/fa4_warmup.py b/vllm/model_executor/warmup/fa4_warmup.py index ebdfabb66915..a9b0bcd61459 100644 --- a/vllm/model_executor/warmup/fa4_warmup.py +++ b/vllm/model_executor/warmup/fa4_warmup.py @@ -29,7 +29,9 @@ logger = init_logger(__name__) -def _get_default_qwen3_vit_warmup_seqlens(max_positions: int | None = None) -> list[int]: +def _get_default_qwen3_vit_warmup_seqlens( + max_positions: int | None = None, +) -> list[int]: candidates = [ 16**2, # 256 24**2, # 576 @@ -37,16 +39,16 @@ def _get_default_qwen3_vit_warmup_seqlens(max_positions: int | None = None) -> l 48**2, # 2304 64**2, # 4096 96**2, # 9216 - 128**2, # 16384 - 192**2, # 36864 - 256**2, # 65536 + 128**2, # 16384 + 192**2, # 36864 + 256**2, # 65536 ] if max_positions is None: return candidates return [s for s in candidates if s <= max_positions] -def should_fa4_vit_warmup(worker: "Worker") -> bool: +def should_fa4_vit_warmup(worker: Worker) -> bool: """Fast predicate used by `kernel_warmup` to gate FA4 warmup.""" if not current_platform.is_cuda(): return False @@ -61,7 +63,7 @@ def should_fa4_vit_warmup(worker: "Worker") -> bool: ) -def fa4_vit_warmup(worker: "Worker") -> None: +def fa4_vit_warmup(worker: Worker) -> None: """Warm up FA4 kernels for Qwen3-VL(-MoE) ViT attention.""" # Config gating: only warm up when explicitly selected for mm encoder. @@ -85,9 +87,7 @@ def fa4_vit_warmup(worker: "Worker") -> None: visual = getattr(model, "visual", None) if visual is None: # Not a Qwen3-VL(-MoE) style model, or vision tower disabled. - logger.warning( - "Skipping FA4 warmup: not a Qwen3-VL(-MoE) style model, or vision tower disabled." - ) + logger.warning("Skipping FA4 warmup: vision tower disabled or not found.") return # Derive head shape and dtype from the actual vision attention module. 
@@ -114,7 +114,8 @@ def fa4_vit_warmup(worker: "Worker") -> None: seqlens = tuple(_get_default_qwen3_vit_warmup_seqlens()) logger.info_once( - "Warming up FA4 (flash_attn.cute) ViT kernels for seqlens=%s (head_size=%d, num_heads=%d, dtype=%s).", + "Warming up FA4 (flash_attn.cute) ViT kernels for seqlens=%s " + "(head_size=%d, num_heads=%d, dtype=%s).", seqlens, head_size, num_heads, @@ -145,4 +146,3 @@ def fa4_vit_warmup(worker: "Worker") -> None: softmax_scale=scale, causal=False, ) - diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index dc2bc5544275..8f315881df45 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -390,9 +390,11 @@ def get_vit_attn_backend( from vllm.v1.attention.backends.fa4_utils import ( is_flash_attn_cute_available, - supports_dtype as fa4_supports_dtype, warn_if_unoptimized_head_size, ) + from vllm.v1.attention.backends.fa4_utils import ( + supports_dtype as fa4_supports_dtype, + ) if not fa4_supports_dtype(dtype): raise ValueError( diff --git a/vllm/v1/attention/backends/fa4_utils.py b/vllm/v1/attention/backends/fa4_utils.py index 26e7262f4e08..b1ce45e6bd4d 100644 --- a/vllm/v1/attention/backends/fa4_utils.py +++ b/vllm/v1/attention/backends/fa4_utils.py @@ -4,7 +4,6 @@ from __future__ import annotations from importlib.util import find_spec -from typing import Optional, Tuple import torch @@ -64,15 +63,15 @@ def flash_attn_varlen_func( k: torch.Tensor, v: torch.Tensor, *, - cu_seqlens_q: Optional[torch.Tensor] = None, - cu_seqlens_k: Optional[torch.Tensor] = None, - max_seqlen_q: Optional[int] = None, - max_seqlen_k: Optional[int] = None, - seqused_q: Optional[torch.Tensor] = None, - seqused_k: Optional[torch.Tensor] = None, - softmax_scale: Optional[float] = None, + cu_seqlens_q: torch.Tensor | None = None, + cu_seqlens_k: torch.Tensor | None = None, + max_seqlen_q: int | None = None, + max_seqlen_k: int | None = None, + seqused_q: torch.Tensor | None = None, + seqused_k: torch.Tensor | None = None, + softmax_scale: float | None = None, causal: bool = False, - window_size: Tuple[Optional[int], Optional[int]] = (None, None), + window_size: tuple[int | None, int | None] = (None, None), deterministic: bool = False, ) -> torch.Tensor: """FA4 (Cute-DSL) FlashAttention varlen forward. @@ -108,4 +107,3 @@ def flash_attn_varlen_func( deterministic=deterministic, ) return out - From f66762f3911b6266108f0e9b5ab4ca21b26ff419 Mon Sep 17 00:00:00 2001 From: Shang Wang Date: Sat, 24 Jan 2026 19:26:43 -0500 Subject: [PATCH 9/9] Revert "[Improvement] Persist CUDA compat libraries paths to prevent reset on `apt-get` (#30784)" (#31) This reverts commit 2a60ac91d0f5c24cdb2863b178d2f5405fae50b8. --- docker/Dockerfile | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 0ba7b0934411..227f4a3355c8 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -134,8 +134,8 @@ ENV UV_LINK_MODE=copy # Verify GCC version RUN gcc --version -# Ensure CUDA compatibility library is loaded -RUN echo "/usr/local/cuda-$(echo "$CUDA_VERSION" | cut -d. -f1,2)/compat/" > /etc/ld.so.conf.d/00-cuda-compat.conf && ldconfig +# Workaround for triton/pytorch issues +RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. 
-f1,2)/compat/ # ============================================================ # SLOW-CHANGING DEPENDENCIES BELOW @@ -474,8 +474,8 @@ ENV UV_HTTP_TIMEOUT=500 ENV UV_INDEX_STRATEGY="unsafe-best-match" ENV UV_LINK_MODE=copy -# Ensure CUDA compatibility library is loaded -RUN echo "/usr/local/cuda-$(echo "$CUDA_VERSION" | cut -d. -f1,2)/compat/" > /etc/ld.so.conf.d/00-cuda-compat.conf && ldconfig +# Workaround for triton/pytorch issues +RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/ # ============================================================ # SLOW-CHANGING DEPENDENCIES BELOW
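Taken together, the series is exercised roughly as follows (a sketch only: the checkpoint name is illustrative, a Blackwell GPU is assumed, and `mm_encoder_attn_backend` is assumed to map through `LLM(...)` the same way the `--mm-encoder-attn-backend` CLI flag does):

```python
# End-to-end sketch (assumptions: Blackwell GPU, image built from this
# Dockerfile with the FA4 requirement installed, illustrative checkpoint name).
from vllm import LLM

llm = LLM(
    model="Qwen/Qwen3-VL-30B-A3B-Instruct",      # illustrative checkpoint
    mm_encoder_attn_backend="FLASH_ATTN_CUTE",   # opt-in ViT/MM-encoder FA4
)
# KV-cache attention is unchanged: FLASH_ATTN_CUTE is rejected by
# AttentionConfig.backend and applies only to the vision encoder.
```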