vllm-project · Isotr0py · May 28, 2026 · May 26, 2026 · May 26, 2026 · May 27, 2026
@@ -122,6 +122,39 @@
 
 logger = init_logger(__name__)
 
+
+def _pad_cumulative_seqlens_buffer(
+    dst: torch.Tensor,
+    src: torch.Tensor,
+) -> None:
+    """Copy ``src`` into the front of ``dst`` and pad the tail in place.
+
+    Entries of ``dst`` past ``src.shape[0]`` are set to the last element of
+    ``src`` rather than left at zero, so the padded tail repeats the final
+    value of ``src``.
+
+    Args:
+        dst: Destination buffer; modified in place.
+        src: Values to copy; sliced along its first dimension.
+
+    NOTE(review): ``src[-1]`` is read whenever ``dst`` is longer than
+    ``src``, so an empty ``src`` would raise here, and a ``src`` longer than
+    ``dst`` would fail inside ``copy_`` -- confirm callers rule out both.
+    """
+    n = src.shape[0]
+    # NOTE(review): the two writes below cover every index of ``dst`` along
+    # dim 0, so this zeroing looks redundant -- confirm before removing.
+    dst.zero_()
+    dst[:n].copy_(src)
+    if n < dst.shape[0]:
+        # Repeat the last value of ``src`` across the unused tail.
+        dst[n:] = src[-1]
+
+
+def _pad_flashinfer_cu_seqlens_buffer(
+    dst: torch.Tensor,
+    src: torch.Tensor,
+) -> None:
+    """Copy a two-section ``src`` buffer into a larger two-section ``dst``.
+
+    Both tensors are split at ``shape[0] // 2``. The first section of ``src``
+    is copied to the start of ``dst``'s first section and the second section
+    of ``src`` is copied to the start of ``dst``'s second section. The unused
+    tail of each ``dst`` section is filled with the last element of the
+    corresponding ``src`` section.
+
+    Args:
+        dst: Destination buffer; modified in place.
+        src: Source buffer; each of its sections must be no longer than the
+            matching section of ``dst``.
+
+    Raises:
+        AssertionError: If ``src.shape[0] // 2`` exceeds ``dst.shape[0] // 2``.
+
+    NOTE(review): the slicing assumes ``src`` has an even length (otherwise
+    ``src[src_mid:]`` is one element longer than its destination slice) and
+    is non-empty (``src[src_mid - 1]`` / ``src[-1]``) -- confirm against the
+    code that builds the FlashInfer cu_seqlens layout.
+    """
+    src_mid = src.shape[0] // 2
+    dst_mid = dst.shape[0] // 2
+    assert src_mid <= dst_mid, (
+        f"FlashInfer cu_seqlens replay buffer is larger than capture buffer: "
+        f"src_section={src_mid}, dst_section={dst_mid}"
+    )
+
+    dst.zero_()
+    # First section: copy, then pad with the section's last value.
+    dst[:src_mid].copy_(src[:src_mid])
+    if src_mid < dst_mid:
+        dst[src_mid:dst_mid] = src[src_mid - 1]
+
+    # Second section: copy starting at dst's midpoint, then pad with the
+    # last value of ``src``.
+    dst[dst_mid : dst_mid + src_mid].copy_(src[src_mid:])
+    if dst_mid + src_mid < dst.shape[0]:
+        dst[dst_mid + src_mid :] = src[-1]
+
+
 # === Vision Inputs === #
 
 
@@ -796,6 +829,38 @@ def invert_permutation(perm: torch.Tensor) -> torch.Tensor:
         inv[perm] = torch.arange(perm.numel(), device=perm.device, dtype=perm.dtype)
         return inv
 
+    def get_encoder_cudagraph_max_window_seqs(
+        self,
+        token_budget: int,
+        max_batch_size: int,
+        max_frames_per_batch: int,
+    ) -> int:
+        """Return an upper bound on window-attention sequences per batch.
+
+        The result is ``min(token_budget, max(max_batch_size,
+        max_frames_per_batch) + ceil(token_budget / w))`` where
+        ``w = window_size // spatial_merge_size // patch_size`` is read from
+        this module's attributes.
+
+        Args:
+            token_budget: Upper bound on the total number of merged vision
+                tokens handled by the encoder CUDA graph.
+            max_batch_size: Maximum number of items in a batch.
+            max_frames_per_batch: Maximum number of video frames in a batch.
+
+        Returns:
+            The bound on the number of window sequences, never larger than
+            ``token_budget``.
+        """
+        # token_budget is an upper bound on the total number of merged vision
+        # tokens replayed by this encoder CUDA graph. cu_window_seqlens, however,
+        # is sized by the number of window-attention sequences (non-empty local
+        # windows), not by the number of tokens. Using max_num_batched_tokens as
+        # this sequence count can over-pad cu_window_seqlens and make FlashAttention
+        # launch thousands of empty CTAs during replay.
+        vit_merger_window_size = (
+            self.window_size // self.spatial_merge_size // self.patch_size
+        )
+        max_sequence_units = max(max_batch_size, max_frames_per_batch)
+
+        # Each local window covers vit_merger_window_size tokens along one merged
+        # spatial axis. The largest number of non-empty windows for a fixed token
+        # budget comes from a thin strip that advances along only one axis, so
+        # ceil(token_budget / window_side) is a safe geometry-driven bound. Multiple
+        # images or video frames can fragment that strip at item/frame boundaries,
+        # so add max_sequence_units to cover one extra partial window per sequence.
+        # NOTE(review): this divides by vit_merger_window_size, which would be 0
+        # if window_size < spatial_merge_size * patch_size -- confirm the model
+        # config never produces that.
+        max_strip_windows = (
+            token_budget + vit_merger_window_size - 1
+        ) // vit_merger_window_size
+
+        # A non-empty window must contain at least one merged vision token, so the
+        # number of window sequences can never exceed token_budget. This final
+        # clamp keeps the bound tight for tiny budgets while remaining safe.
+        return min(token_budget, max_sequence_units + max_strip_windows)
+
     def prepare_encoder_metadata(
         self,
         grid_thw: list[list[int]],
@@ -1640,6 +1705,11 @@ def get_encoder_cudagraph_config(self):
         modalities = [] if self.is_multimodal_pruning_enabled else ["image", "video"]
 
         max_frames = self.get_max_frames_per_video() if "video" in modalities else 1
+        cu_seqlens_padding = (
+            _pad_flashinfer_cu_seqlens_buffer
+            if self.visual.attn_backend == AttentionBackendEnum.FLASHINFER
+            else _pad_cumulative_seqlens_buffer
+        )
         return EncoderCudaGraphConfig(
             modalities=modalities,
             input_key_by_modality={
@@ -1658,6 +1728,10 @@ def get_encoder_cudagraph_config(self):
                 "sequence_lengths_full",
                 "sequence_lengths_window",
             ],
+            padding_logics={
+                "cu_seqlens": cu_seqlens_padding,
+                "cu_window_seqlens": cu_seqlens_padding,
+            },
             out_hidden_size=self.visual.out_hidden_size,
             max_frames_per_video=max_frames,
         )
@@ -1791,9 +1865,8 @@ def prepare_encoder_cudagraph_capture_inputs(
         )
 
         spatial_merge_size = self.visual.spatial_merge_size
-        max_window_seqs_per_batch = min(
-            self.vllm_config.scheduler_config.max_num_batched_tokens,
-            self.model_config.max_model_len,
+        max_window_seqs_per_batch = self.visual.get_encoder_cudagraph_max_window_seqs(
+            token_budget, max_batch_size, max_frames_per_batch
         )
         # Use ceil here (not floor) so total captured capacity is never smaller
         # than token_budget when token_budget is not divisible by max_batch_size
@@ -1888,23 +1961,21 @@ def prepare_encoder_cudagraph_replay_buffers(
         modality = self.get_input_modality(mm_kwargs)
         grid_thw_list = self._get_grid_thw_by_modality(mm_kwargs)
 
+        # Keep replay metadata sized to the actual batch. The captured buffers
+        # may be larger, but EncoderCudaGraphManager fills the remaining
+        # cu*_seqlens entries with the last cumulative offset to represent empty
+        # sequences. Padding cu_window_seqlens here would require a static upper
+        # bound and can over-pad window attention into many empty FlashAttention
+        # CTAs.
         if modality == "image":
             buffers = self.visual.prepare_encoder_metadata(
                 grid_thw_list,
                 max_batch_size=max_batch_size,
-                max_window_seqs_per_batch=min(
-                    self.vllm_config.scheduler_config.max_num_batched_tokens,
-                    self.model_config.max_model_len,
-                ),
             )
         elif modality == "video":
             buffers = self.visual.prepare_encoder_metadata(
                 grid_thw_list,
                 max_frames_per_batch=max_frames_per_batch,
-                max_window_seqs_per_batch=min(
-                    self.vllm_config.scheduler_config.max_num_batched_tokens,
-                    self.model_config.max_model_len,
-                ),
             )
         else:
             raise AssertionError("This line should be unreachable.")

@@ -45,7 +45,9 @@ class BudgetGraphMetadata:
     # The input tensor updated before replay (e.g. pixel_values)
     input_buffer: torch.Tensor
     # Buffers recorded into the CUDA graph (e.g. embeddings, sequence metadata).
-    # Before replay the manager zeros then slice-copies new data into these.
+    # Before replay the manager updates these in-place. By default buffers are
+    # zeroed before slice-copying the actual values; model-specific padding
+    # behavior is provided by EncoderCudaGraphConfig.padding_logics.
     metadata_buffers: dict[str, torch.Tensor]
     # Output written by graph, read after replay
     output_buffer: torch.Tensor
@@ -259,6 +261,14 @@ def _get_per_item_out_tokens(self, mm_kwargs: dict[str, Any]) -> list[int]:
         """Get per-item output token counts as plain ints."""
         return [spec.output_tokens for spec in self._get_item_specs(mm_kwargs)]
 
+    @staticmethod
+    def _copy_padded_buffer(
+        dst: torch.Tensor,
+        src: torch.Tensor,
+    ) -> None:
+        """Zero ``dst`` and copy ``src`` into its leading slice, in place.
+
+        This is the fallback used when no per-key padding logic is configured:
+        entries of ``dst`` beyond ``src.shape[0]`` are left at zero.
+
+        Args:
+            dst: Destination buffer; modified in place.
+            src: Values to copy into ``dst[: src.shape[0]]``.
+        """
+        dst.zero_()
+        dst[: src.shape[0]].copy_(src)
+
     def _run_budget_graph(
         self,
         mm_kwargs: dict[str, Any],
@@ -302,9 +312,10 @@ def _run_budget_graph(
             if src.ndim == 0:
                 buf.copy_(src)
             else:
-                n = src.shape[0]
-                buf.zero_()
-                buf[:n].copy_(src)
+                padding_logic = self.config.padding_logics.get(
+                    key, self._copy_padded_buffer
+                )
+                padding_logic(buf, src)
 
         graph_meta.graph.replay()
 

@@ -2,11 +2,14 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Data transfer objects for encoder CUDA graph management."""
 
-from dataclasses import dataclass
+from collections.abc import Callable
+from dataclasses import dataclass, field
 from typing import Any
 
 import torch
 
+EncoderCudaGraphPaddingLogic = Callable[[torch.Tensor, torch.Tensor], None]
+
 
 @dataclass
 class EncoderItemSpec:
@@ -51,6 +54,13 @@ class EncoderCudaGraphConfig:
     """Output hidden dim of the vision encoder.
     Used for DP gather buffer allocation."""
 
+    padding_logics: dict[str, EncoderCudaGraphPaddingLogic] = field(
+        default_factory=dict
+    )
+    """Optional per-buffer replay padding/copy logic.
+    If absent for a key, the manager zeros the capture buffer and slice-copies
+    the replay buffer into it."""
+
     max_frames_per_video: int = 1
     """Maximum number of frames per video.
     Only relevant when "video" is in ``modalities``.