Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
93 changes: 82 additions & 11 deletions vllm/model_executor/models/qwen2_5_vl.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,39 @@

logger = init_logger(__name__)


def _pad_cumulative_seqlens_buffer(
    dst: torch.Tensor,
    src: torch.Tensor,
) -> None:
    """Copy ``src`` into the front of ``dst`` and pad the tail in place.

    ``dst`` is first zeroed, then its first ``src.shape[0]`` entries are
    overwritten with ``src``. Any remaining entries are filled with the last
    element of ``src``; for a cumulative-offset buffer this repeats the final
    offset, so every padded slot describes a zero-length sequence.

    Args:
        dst: Destination buffer, modified in place. Expected to be at least
            as long as ``src`` along dim 0.
        src: Source values to copy. May be empty, in which case ``dst`` is
            left all-zero.
    """
    n = src.shape[0]
    dst.zero_()
    if n == 0:
        # There is no last element to repeat; indexing src[-1] would raise.
        # An all-zero buffer matches the default zero-and-copy behaviour.
        return
    dst[:n].copy_(src)
    if n < dst.shape[0]:
        # Repeat the final value so the padded tail adds no extra length.
        dst[n:] = src[-1]


def _pad_flashinfer_cu_seqlens_buffer(
    dst: torch.Tensor,
    src: torch.Tensor,
) -> None:
    """Copy a two-section ``src`` buffer into a larger two-section ``dst``.

    Both tensors are treated as two equal-length sections stored back to
    back (first half, second half). Each section of ``src`` is copied to the
    start of the matching section of ``dst``, and the unused tail of each
    ``dst`` section is filled with the last value of the corresponding
    ``src`` section. ``dst`` is modified in place.

    NOTE(review): what the two sections represent is not visible here;
    confirm against the FlashInfer metadata producer.

    Args:
        dst: Destination buffer, modified in place.
        src: Source buffer. May be empty, in which case ``dst`` is left
            all-zero.

    Raises:
        AssertionError: If a ``src`` section is longer than a ``dst`` section.
    """
    src_mid = src.shape[0] // 2
    dst_mid = dst.shape[0] // 2
    assert src_mid <= dst_mid, (
        f"FlashInfer cu_seqlens replay buffer is larger than capture buffer: "
        f"src_section={src_mid}, dst_section={dst_mid}"
    )

    dst.zero_()
    if src.shape[0] == 0:
        # No values to repeat; src[src_mid - 1] would index an empty tensor.
        return

    # First section: copy, then repeat its last value over the padding.
    dst[:src_mid].copy_(src[:src_mid])
    if src_mid < dst_mid:
        dst[src_mid:dst_mid] = src[src_mid - 1]

    # Second section: same treatment, starting at the midpoint of dst.
    dst[dst_mid : dst_mid + src_mid].copy_(src[src_mid:])
    if dst_mid + src_mid < dst.shape[0]:
        dst[dst_mid + src_mid :] = src[-1]


# === Vision Inputs === #


Expand Down Expand Up @@ -796,6 +829,38 @@ def invert_permutation(perm: torch.Tensor) -> torch.Tensor:
inv[perm] = torch.arange(perm.numel(), device=perm.device, dtype=perm.dtype)
return inv

def get_encoder_cudagraph_max_window_seqs(
    self,
    token_budget: int,
    max_batch_size: int,
    max_frames_per_batch: int,
) -> int:
    """Return an upper bound on window-attention sequences for one graph.

    ``cu_window_seqlens`` is sized by the count of non-empty local windows,
    not by the token count. Sizing it from a token-level limit over-pads the
    buffer and can make FlashAttention launch thousands of empty CTAs on
    replay, so the bound is derived from window geometry instead.

    Args:
        token_budget: Upper bound on the total merged vision tokens replayed
            by the encoder CUDA graph.
        max_batch_size: Maximum number of items in a batch.
        max_frames_per_batch: Maximum number of video frames in a batch.

    Returns:
        ``min(token_budget, max(max_batch_size, max_frames_per_batch) +
        ceil(token_budget / window_side))``, where ``window_side`` is
        ``window_size // spatial_merge_size // patch_size``.
    """
    # Side length of one local window, measured in merged vision tokens.
    window_side = self.window_size // self.spatial_merge_size // self.patch_size

    # Every image or video frame boundary can split a window, so allow one
    # extra partial window per sequence unit.
    boundary_slack = max(max_batch_size, max_frames_per_batch)

    # The worst case for a fixed budget is a thin strip that advances along a
    # single axis: ceil(token_budget / window_side) windows. This is the same
    # ceil-division as (a + b - 1) // b, written as (a - 1) // b + 1.
    strip_windows = (token_budget - 1) // window_side + 1

    geometric_bound = boundary_slack + strip_windows

    # A non-empty window holds at least one merged token, so the count can
    # never exceed the budget itself; this keeps tiny budgets tight.
    return min(token_budget, geometric_bound)

def prepare_encoder_metadata(
self,
grid_thw: list[list[int]],
Expand Down Expand Up @@ -1640,6 +1705,11 @@ def get_encoder_cudagraph_config(self):
modalities = [] if self.is_multimodal_pruning_enabled else ["image", "video"]

max_frames = self.get_max_frames_per_video() if "video" in modalities else 1
cu_seqlens_padding = (
_pad_flashinfer_cu_seqlens_buffer
if self.visual.attn_backend == AttentionBackendEnum.FLASHINFER
else _pad_cumulative_seqlens_buffer
)
return EncoderCudaGraphConfig(
modalities=modalities,
input_key_by_modality={
Expand All @@ -1658,6 +1728,10 @@ def get_encoder_cudagraph_config(self):
"sequence_lengths_full",
"sequence_lengths_window",
],
padding_logics={
"cu_seqlens": cu_seqlens_padding,
"cu_window_seqlens": cu_seqlens_padding,
},
out_hidden_size=self.visual.out_hidden_size,
max_frames_per_video=max_frames,
)
Expand Down Expand Up @@ -1791,9 +1865,8 @@ def prepare_encoder_cudagraph_capture_inputs(
)

spatial_merge_size = self.visual.spatial_merge_size
max_window_seqs_per_batch = min(
self.vllm_config.scheduler_config.max_num_batched_tokens,
self.model_config.max_model_len,
max_window_seqs_per_batch = self.visual.get_encoder_cudagraph_max_window_seqs(
token_budget, max_batch_size, max_frames_per_batch
)
# Use ceil here (not floor) so total captured capacity is never smaller
# than token_budget when token_budget is not divisible by max_batch_size
Expand Down Expand Up @@ -1888,23 +1961,21 @@ def prepare_encoder_cudagraph_replay_buffers(
modality = self.get_input_modality(mm_kwargs)
grid_thw_list = self._get_grid_thw_by_modality(mm_kwargs)

# Keep replay metadata sized to the actual batch. The captured buffers
# may be larger, but EncoderCudaGraphManager fills the remaining
# cu*_seqlens entries with the last cumulative offset to represent empty
# sequences. Padding cu_window_seqlens here would require a static upper
# bound and can over-pad window attention into many empty FlashAttention
# CTAs.
if modality == "image":
buffers = self.visual.prepare_encoder_metadata(
grid_thw_list,
max_batch_size=max_batch_size,
max_window_seqs_per_batch=min(
self.vllm_config.scheduler_config.max_num_batched_tokens,
self.model_config.max_model_len,
),
)
elif modality == "video":
buffers = self.visual.prepare_encoder_metadata(
grid_thw_list,
max_frames_per_batch=max_frames_per_batch,
max_window_seqs_per_batch=min(
self.vllm_config.scheduler_config.max_num_batched_tokens,
self.model_config.max_model_len,
),
)
else:
raise AssertionError("This line should be unreachable.")
Expand Down
19 changes: 15 additions & 4 deletions vllm/v1/worker/encoder_cudagraph.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,9 @@ class BudgetGraphMetadata:
# The input tensor updated before replay (e.g. pixel_values)
input_buffer: torch.Tensor
# Buffers recorded into the CUDA graph (e.g. embeddings, sequence metadata).
# Before replay the manager zeros then slice-copies new data into these.
# Before replay the manager updates these in-place. By default buffers are
# zeroed before slice-copying the actual values; model-specific padding
# behavior is provided by EncoderCudaGraphConfig.padding_logics.
metadata_buffers: dict[str, torch.Tensor]
# Output written by graph, read after replay
output_buffer: torch.Tensor
Expand Down Expand Up @@ -259,6 +261,14 @@ def _get_per_item_out_tokens(self, mm_kwargs: dict[str, Any]) -> list[int]:
"""Get per-item output token counts as plain ints."""
return [spec.output_tokens for spec in self._get_item_specs(mm_kwargs)]

@staticmethod
def _copy_padded_buffer(
    dst: torch.Tensor,
    src: torch.Tensor,
) -> None:
    """Default replay copy: zero ``dst``, then write ``src`` into its prefix.

    Args:
        dst: Destination buffer, modified in place.
        src: Values copied into the first ``src.shape[0]`` entries of ``dst``
            along dim 0; the remaining entries stay zero.
    """
    num_rows = src.shape[0]
    # Clear first so entries beyond the copied prefix hold no stale values.
    dst.zero_()
    prefix = dst[:num_rows]
    prefix.copy_(src)

def _run_budget_graph(
self,
mm_kwargs: dict[str, Any],
Expand Down Expand Up @@ -302,9 +312,10 @@ def _run_budget_graph(
if src.ndim == 0:
buf.copy_(src)
else:
n = src.shape[0]
buf.zero_()
buf[:n].copy_(src)
padding_logic = self.config.padding_logics.get(
key, self._copy_padded_buffer
)
padding_logic(buf, src)

graph_meta.graph.replay()

Expand Down
12 changes: 11 additions & 1 deletion vllm/v1/worker/encoder_cudagraph_defs.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,14 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Data transfer objects for encoder CUDA graph management."""

from dataclasses import dataclass
from collections.abc import Callable
from dataclasses import dataclass, field
from typing import Any

import torch

EncoderCudaGraphPaddingLogic = Callable[[torch.Tensor, torch.Tensor], None]


@dataclass
class EncoderItemSpec:
Expand Down Expand Up @@ -51,6 +54,13 @@ class EncoderCudaGraphConfig:
"""Output hidden dim of the vision encoder.
Used for DP gather buffer allocation."""

padding_logics: dict[str, EncoderCudaGraphPaddingLogic] = field(
default_factory=dict
)
"""Optional per-buffer replay padding/copy logic.
If absent for a key, the manager zeros the capture buffer and slice-copies
the replay buffer into it."""

max_frames_per_video: int = 1
"""Maximum number of frames per video.
Only relevant when "video" is in ``modalities``.
Expand Down
Loading