def _pad_cumulative_seqlens_buffer(
    dst: torch.Tensor,
    src: torch.Tensor,
) -> None:
    """Copy cumulative sequence offsets into ``dst`` and pad the tail.

    The first ``src.shape[0]`` entries of ``dst`` receive ``src``; every
    remaining entry is set to the last value of ``src``. An empty ``src``
    leaves ``dst`` zero-filled.

    Args:
        dst: Destination buffer, updated in place.
        src: Offsets to copy. Must not be longer than ``dst`` along dim 0,
            otherwise the underlying ``copy_`` fails.
    """
    n = src.shape[0]
    if n == 0:
        # There is no last offset to replicate; fall back to an all-zero
        # buffer instead of failing on ``src[-1]``.
        dst.zero_()
        return

    # The prefix copy and the tail fill together overwrite every entry, so a
    # separate zero-fill pass is not needed.
    dst[:n].copy_(src)
    if n < dst.shape[0]:
        dst[n:] = src[-1]


def _pad_flashinfer_cu_seqlens_buffer(
    dst: torch.Tensor,
    src: torch.Tensor,
) -> None:
    """Pad a two-section cu_seqlens tensor into a larger two-section buffer.

    ``src`` and ``dst`` are each split at ``shape[0] // 2``. Each section of
    ``src`` is copied to the start of the matching section of ``dst`` and the
    rest of that section is filled with the last value of the source section.
    An empty ``src`` leaves ``dst`` zero-filled.

    Args:
        dst: Destination buffer, updated in place.
        src: Source tensor made of two equally sized sections.

    Raises:
        AssertionError: If a section of ``src`` is longer than a section of
            ``dst``.
        ValueError: If ``src`` has an odd number of entries and therefore
            cannot be split into two equal sections.
    """
    src_len = src.shape[0]
    src_mid = src_len // 2
    dst_mid = dst.shape[0] // 2
    assert src_mid <= dst_mid, (
        "FlashInfer cu_seqlens replay buffer is larger than capture buffer: "
        f"src_section={src_mid}, dst_section={dst_mid}"
    )
    if src_len % 2:
        # An odd length would otherwise surface as an opaque copy_ shape
        # error, or be silently broadcast when src has a single entry.
        raise ValueError(
            "FlashInfer cu_seqlens must contain two equally sized sections, "
            f"got {src_len} entries"
        )
    if src_mid == 0:
        # Nothing to replicate; avoid the wrap-around read of ``src[-1]``.
        dst.zero_()
        return

    # First section: copy, then repeat its last offset up to the midpoint.
    dst[:src_mid].copy_(src[:src_mid])
    if src_mid < dst_mid:
        dst[src_mid:dst_mid] = src[src_mid - 1]

    # Second section: copy, then repeat its last offset up to the end.
    second_end = dst_mid + src_mid
    dst[dst_mid:second_end].copy_(src[src_mid:])
    if second_end < dst.shape[0]:
        dst[second_end:] = src[-1]


def get_encoder_cudagraph_max_window_seqs(
    self,
    token_budget: int,
    max_batch_size: int,
    max_frames_per_batch: int,
) -> int:
    """Return an upper bound on the number of window-attention sequences.

    This is a method of the vision encoder object (it is invoked as
    ``self.visual.get_encoder_cudagraph_max_window_seqs(...)``); the enclosing
    class header lies outside this view.

    Args:
        token_budget: Upper bound on the total number of merged vision tokens
            replayed by the encoder CUDA graph.
        max_batch_size: Maximum number of items in a batch.
        max_frames_per_batch: Maximum number of video frames in a batch.

    Returns:
        ``min(token_budget, max(max_batch_size, max_frames_per_batch)
        + ceil(token_budget / window_side))``.
    """
    # token_budget is an upper bound on the total number of merged vision
    # tokens replayed by this encoder CUDA graph. cu_window_seqlens, however,
    # is sized by the number of window-attention sequences (non-empty local
    # windows), not by the number of tokens. Using max_num_batched_tokens as
    # this sequence count can over-pad cu_window_seqlens and make
    # FlashAttention launch thousands of empty CTAs during replay.
    #
    # Clamp the window side to at least one so that a window_size smaller than
    # spatial_merge_size * patch_size cannot cause a division by zero below;
    # with a side of one the result degrades to token_budget, which is still a
    # valid upper bound.
    vit_merger_window_size = max(
        1, self.window_size // self.spatial_merge_size // self.patch_size
    )
    max_sequence_units = max(max_batch_size, max_frames_per_batch)

    # Each local window covers vit_merger_window_size tokens along one merged
    # spatial axis. The largest number of non-empty windows for a fixed token
    # budget comes from a thin strip that advances along only one axis, so
    # ceil(token_budget / window_side) is a safe geometry-driven bound.
    # Multiple images or video frames can fragment that strip at item/frame
    # boundaries, so max_sequence_units is added below to cover one extra
    # partial window per sequence.
    max_strip_windows = (
        token_budget + vit_merger_window_size - 1
    ) // vit_merger_window_size

    # A non-empty window must contain at least one merged vision token, so the
    # number of window sequences can never exceed token_budget. This final
    # clamp keeps the bound tight for tiny budgets while remaining safe.
    return min(token_budget, max_sequence_units + max_strip_windows)
- self.vllm_config.scheduler_config.max_num_batched_tokens, - self.model_config.max_model_len, + max_window_seqs_per_batch = self.visual.get_encoder_cudagraph_max_window_seqs( + token_budget, max_batch_size, max_frames_per_batch ) # Use ceil here (not floor) so total captured capacity is never smaller # than token_budget when token_budget is not divisible by max_batch_size @@ -1888,23 +1961,21 @@ def prepare_encoder_cudagraph_replay_buffers( modality = self.get_input_modality(mm_kwargs) grid_thw_list = self._get_grid_thw_by_modality(mm_kwargs) + # Keep replay metadata sized to the actual batch. The captured buffers + # may be larger, but EncoderCudaGraphManager fills the remaining + # cu*_seqlens entries with the last cumulative offset to represent empty + # sequences. Padding cu_window_seqlens here would require a static upper + # bound and can over-pad window attention into many empty FlashAttention + # CTAs. if modality == "image": buffers = self.visual.prepare_encoder_metadata( grid_thw_list, max_batch_size=max_batch_size, - max_window_seqs_per_batch=min( - self.vllm_config.scheduler_config.max_num_batched_tokens, - self.model_config.max_model_len, - ), ) elif modality == "video": buffers = self.visual.prepare_encoder_metadata( grid_thw_list, max_frames_per_batch=max_frames_per_batch, - max_window_seqs_per_batch=min( - self.vllm_config.scheduler_config.max_num_batched_tokens, - self.model_config.max_model_len, - ), ) else: raise AssertionError("This line should be unreachable.") diff --git a/vllm/v1/worker/encoder_cudagraph.py b/vllm/v1/worker/encoder_cudagraph.py index 6d5aa6f41243..e39db0f23bf8 100644 --- a/vllm/v1/worker/encoder_cudagraph.py +++ b/vllm/v1/worker/encoder_cudagraph.py @@ -45,7 +45,9 @@ class BudgetGraphMetadata: # The input tensor updated before replay (e.g. pixel_values) input_buffer: torch.Tensor # Buffers recorded into the CUDA graph (e.g. embeddings, sequence metadata). 
@staticmethod
def _copy_padded_buffer(
    dst: torch.Tensor,
    src: torch.Tensor,
) -> None:
    """Default replay copy: clear ``dst``, then write ``src`` into its prefix.

    Args:
        dst: Buffer that is overwritten in place.
        src: Values placed at the start of ``dst`` along dim 0; entries of
            ``dst`` beyond ``src.shape[0]`` stay zero.
    """
    dst.zero_()
    prefix = dst[: src.size(0)]
    prefix.copy_(src)
Used for DP gather buffer allocation.""" + padding_logics: dict[str, EncoderCudaGraphPaddingLogic] = field( + default_factory=dict + ) + """Optional per-buffer replay padding/copy logic. + If absent for a key, the manager zeros the capture buffer and slice-copies + the replay buffer into it.""" + max_frames_per_video: int = 1 """Maximum number of frames per video. Only relevant when "video" is in ``modalities``.