From 28fa0e9ba691143b2ee02e130063987d04b83624 Mon Sep 17 00:00:00 2001
From: Xingran Wang <wangxingran123456@outlook.com>
Date: Wed, 12 Nov 2025 10:52:21 +0800
Subject: [PATCH 01/35] multimodal compile & piecewise graph

Signed-off-by: Xingran Wang <wangxingran123456@outlook.com>
---
 vllm/compilation/cuda_graph.py           |  1 +
 vllm/compilation/monitor.py              |  4 ++
 vllm/config/vllm.py                      |  2 +
 vllm/model_executor/models/qwen2_5_vl.py | 10 +++-
 vllm/v1/worker/gpu_model_runner.py       | 60 ++++++++++++++++++++++++
 5 files changed, 76 insertions(+), 1 deletion(-)

diff --git a/vllm/compilation/cuda_graph.py b/vllm/compilation/cuda_graph.py
index 7ffa74d0d7e6..098dd095e9c3 100644
--- a/vllm/compilation/cuda_graph.py
+++ b/vllm/compilation/cuda_graph.py
@@ -288,6 +288,7 @@ def __call__(self, *args: Any, **kwargs: Any) -> Any | None:
             entry.cudagraph = cudagraph
 
             compilation_counter.num_cudagraph_captured += 1
+            logger.info(f"Compilation Counter: {compilation_counter.num_cudagraph_captured}")
 
             # important: we need to return the output, rather than
             # the weak ref of the output, so that pytorch can correctly
diff --git a/vllm/compilation/monitor.py b/vllm/compilation/monitor.py
index 2bad5f0a16fc..912e3d828abc 100644
--- a/vllm/compilation/monitor.py
+++ b/vllm/compilation/monitor.py
@@ -13,6 +13,8 @@
 
 
 def start_monitoring_torch_compile(vllm_config: VllmConfig) -> None:
+    vllm_config.is_in_compile = True
+
     global torch_compile_start_time
     torch_compile_start_time = time.time()
 
@@ -29,6 +31,8 @@ def start_monitoring_torch_compile(vllm_config: VllmConfig) -> None:
 
 
 def end_monitoring_torch_compile(vllm_config: VllmConfig) -> None:
+    vllm_config.is_in_compile = False
+
     compilation_config: CompilationConfig = vllm_config.compilation_config
     if compilation_config.mode == CompilationMode.VLLM_COMPILE:
         logger.info_once(
diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py
index ea133856360d..ef18ce03d88e 100644
--- a/vllm/config/vllm.py
+++ b/vllm/config/vllm.py
@@ -257,6 +257,8 @@ class VllmConfig:
     performance, with -O0 having the best startup time and -O3 having the best
     performance. -02 is used by defult. See  OptimizationLevel for full
     description."""
+    is_in_compile: bool = False
+    """For ViT Compile, Compile Status Flag"""
 
     def compute_hash(self) -> str:
         """
diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py
index 0310c5415dc9..d92ef7bdc312 100644
--- a/vllm/model_executor/models/qwen2_5_vl.py
+++ b/vllm/model_executor/models/qwen2_5_vl.py
@@ -1207,7 +1207,15 @@ def _process_image_input(
             image_embeds = image_input["image_embeds"].type(self.visual.dtype)
         else:
             pixel_values = image_input["pixel_values"]
-            with set_forward_context(None, self.vllm_config):
+            if self.vllm_config.is_in_compile:
+                with set_forward_context(None, self.vllm_config):
+                    if self.use_data_parallel:
+                        return run_dp_sharded_mrope_vision_model(
+                            self.visual, pixel_values, grid_thw_list, rope_type="rope_3d"
+                        )
+                    else:
+                        image_embeds = self.visual(pixel_values, grid_thw=grid_thw_list)
+            else:
                 if self.use_data_parallel:
                     return run_dp_sharded_mrope_vision_model(
                         self.visual, pixel_values, grid_thw_list, rope_type="rope_3d"
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 49211c6805ce..d3888ee79733 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -4569,6 +4569,59 @@ def rand_inputs_embeds() -> torch.Tensor:
             yield
             inputs_embeds.fill_(0)
 
+    def _get_dummy_vit_input(self, num_image_tokens: int) -> BatchedTensorInputs:
+        """
+        Generates dummy multimodal inputs for a single image, with a controllable
+        number of resulting image tokens for a Vision Transformer (ViT) like model,
+        ensuring a square-like aspect ratio for the patch grid.
+
+        This is useful for profiling or testing, allowing the creation of inputs
+        that result in a specific number of image tokens after vision encoding.
+
+        Args:
+            num_image_tokens: The desired number of image tokens after encoding.
+
+        Returns:
+            A BatchedTensorInputs dictionary containing `pixel_values` and
+            `image_grid_thw` that can be passed as kwargs to
+            `get_multimodal_embeddings`.
+        """
+        import math
+
+        def find_square_like_factors(n: int):
+            """Finds two factors of n that are closest to its square root."""
+            h = int(math.sqrt(n))
+            while h > 0:
+                if n % h == 0:
+                    return h, n // h
+                h -= 1
+            return 1, n
+
+        h_patches, w_patches = find_square_like_factors(num_image_tokens)
+
+        # The first dimension of pixel_values corresponds to the total number of
+        # tokens (patches).
+        #TODO 修改1176为vit feature dim.
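+        # (Re the hard-coded 1176:) derive the original image H/W from num_image_tokens and go through the original API? Or run once first and build this from the result's feature dim?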
+        pixel_values = torch.zeros(
+            (num_image_tokens, 1176),
+            dtype=self.dtype,
+            device=self.device
+        )
+
+        # image_grid_thw specifies the grid layout for a single image.
+        # Shape: (1, 3) for (t, h, w) patch counts.
+        image_grid_thw = torch.tensor(
+            [[1, h_patches, w_patches]],
+            dtype=torch.long,
+            device=self.device
+        )
+
+        return {
+            "pixel_values": pixel_values,
+            "image_grid_thw": image_grid_thw,
+        }
+
     def _get_mm_dummy_batch(
         self,
         modality: str,
@@ -4855,6 +4908,13 @@ def _dummy_run(
                     slot_mapping=slot_mappings,
                 ),
             ):
+                if cudagraph_runtime_mode == CUDAGraphMode.PIECEWISE and self.supports_mm_inputs:
+                    # TODO: This will be improved to support different shapes.
+                    dummy_mm_inputs = self._get_dummy_vit_input(1024)
+                    # logger.info("st.!!!!!!!!!!!!!!!!!!!")
+                    # self.model.visual(dummy_mm_inputs["pixel_values"], grid_thw=dummy_mm_inputs["image_grid_thw"])
+                    self.model.get_multimodal_embeddings(**dummy_mm_inputs)
+                    # logger.info("ed!!!!!!!!!!!!!!!!!!!")
                 outputs = self.model(
                     input_ids=input_ids,
                     positions=positions,

From 52da31267d1305f97178d8d2f3c90a20f0c76e4f Mon Sep 17 00:00:00 2001
From: Xingran Wang <wangxingran123456@outlook.com>
Date: Wed, 12 Nov 2025 18:22:43 +0800
Subject: [PATCH 02/35] hardcoded ViT piecewise cuda graph size without padding

Signed-off-by: Xingran Wang <wangxingran123456@outlook.com>
---
 vllm/compilation/cuda_graph.py           |  1 -
 vllm/model_executor/models/qwen2_5_vl.py | 45 +++++++++++---
 vllm/v1/worker/gpu_model_runner.py       | 75 ++++++++++++++++++------
 3 files changed, 92 insertions(+), 29 deletions(-)

diff --git a/vllm/compilation/cuda_graph.py b/vllm/compilation/cuda_graph.py
index 098dd095e9c3..7ffa74d0d7e6 100644
--- a/vllm/compilation/cuda_graph.py
+++ b/vllm/compilation/cuda_graph.py
@@ -288,7 +288,6 @@ def __call__(self, *args: Any, **kwargs: Any) -> Any | None:
             entry.cudagraph = cudagraph
 
             compilation_counter.num_cudagraph_captured += 1
-            logger.info(f"Compilation Counter: {compilation_counter.num_cudagraph_captured}")
 
             # important: we need to return the output, rather than
             # the weak ref of the output, so that pytorch can correctly
diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py
index d92ef7bdc312..bdafcdac5200 100644
--- a/vllm/model_executor/models/qwen2_5_vl.py
+++ b/vllm/model_executor/models/qwen2_5_vl.py
@@ -641,6 +641,9 @@ def __init__(
                 prefix=f"{prefix}.merger",
             )
 
+        self._persistent_hidden_states_buffer = torch.empty((4096, 1176), device=self.device, dtype=self.dtype)
+        self._persistent_rotary_pos_emb_buffer = torch.empty((4096, 40), device=self.device, dtype=self.dtype)
+
     @property
     def dtype(self) -> torch.dtype:
         return self.patch_embed.proj.weight.dtype
@@ -784,7 +787,13 @@ def forward(
         cu_window_seqlens: list = [torch.tensor([0], dtype=torch.int32)]
         cu_seqlens: list = []
 
-        hidden_states = x.to(device=self.device, dtype=self.dtype)
+        # logger.info(f"X Shape: {x.shape}")
+        if seq_len < 4096:
+            hidden_states = self._persistent_hidden_states_buffer[:seq_len]
+            hidden_states.copy_(x, non_blocking=True)
+        else:
+            hidden_states = x.to(device=self.device, dtype=self.dtype)
+
         hidden_states = self.patch_embed(hidden_states)
 
         window_index_id = 0
@@ -838,18 +847,36 @@ def forward(
         rotary_pos_emb_sin = rotary_pos_emb_sin.to(
             device=self.device, non_blocking=True
         )
+        rotary_pos_emb = rotary_pos_emb.to(device=self.device, non_blocking=True)
+        if seq_len < 4096:
+            rotary_pos_emb = self._persistent_rotary_pos_emb_buffer[:seq_len].copy_(rotary_pos_emb)
         window_index = window_index.to(device=hidden_states.device, non_blocking=True)
         reverse_indices = reverse_indices.to(
             device=hidden_states.device, non_blocking=True
         )
-
-        hidden_states = hidden_states.reshape(
-            seq_len // self.spatial_merge_unit, self.spatial_merge_unit, -1
-        )
-        hidden_states = hidden_states[window_index, :, :]
-        hidden_states = hidden_states.reshape(seq_len, -1)
-
-        hidden_states = hidden_states.unsqueeze(1)
+        original_hidden_states = hidden_states  # this is only a reference, not a copy
+        # logger.info(f"Before Copy, original address: {original_hidden_states.storage().data_ptr()}")
+        # logger.info(f"Original Numel: {original_hidden_states.numel()}")
+        # Step 2: apply the transforms (these create new tensors)
+        tmp = original_hidden_states.reshape(seq_len // self.spatial_merge_unit, self.spatial_merge_unit, -1)
+        tmp = tmp[window_index, :, :]
+        tmp = tmp.reshape(seq_len, -1)
+        tmp = tmp.unsqueeze(1)
+        # logger.info(f"Tmp Numel: {tmp.numel()}")
+        # Step 3: copy the result back into the original tensor's GPU memory (an in-place copy!)
+        original_storage = original_hidden_states.storage()
+        tmp_storage = tmp.storage()
+        original_storage.copy_(tmp_storage)
+
+        # Step 4: create a view with the new shape that reuses the original GPU memory
+        # Precondition: original numel must equal the total element count of the new shape
+        new_shape = tmp.shape  # (seq_len, 1, new_hidden_dim)
+        hidden_states = original_hidden_states.view(new_shape)
+        # Now hidden_states.shape == new_shape and it uses the same GPU memory as original
+        # logger.info(f"After Copy, original address: {original_hidden_states.storage().data_ptr()}")
+        # logger.info(f"After Copy, tmp address: {tmp.storage().data_ptr()}")
+
+        # logger.info(f"Before Input to Vision Block, Shape: {hidden_states.shape}")
 
         for layer_num, blk in enumerate(self.blocks):
             if layer_num in self.fullatt_block_indexes:
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index d3888ee79733..3d98ba7e4ed5 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -2420,11 +2420,20 @@ def _execute_mm_encoder(
                 # 2. A list or tuple (length: num_items) of tensors,
                 # each of shape (feature_size, hidden_size) in case the feature
                 # size is dynamic depending on the input multimodal items.
-
-                with self.timed_encoder_operation(
-                    should_time, mm_lora_refs, current_item_idx, num_items
-                ):
-                    curr_group_outputs = model.embed_multimodal(**mm_kwargs_group)
+                batch_descriptor = BatchDescriptor(
+                    num_tokens=mm_kwargs_group["pixel_values"].shape[0],
+                )
+                with set_forward_context(
+                        None,
+                        vllm_config=self.vllm_config,
+                        cudagraph_runtime_mode=CUDAGraphMode.PIECEWISE,
+                        batch_descriptor=batch_descriptor,
+                    ), self.timed_encoder_operation(
+                        should_time, mm_lora_refs, current_item_idx, num_items
+                    ):
+                    curr_group_outputs = model.get_multimodal_embeddings(**mm_kwargs_group)
+                    # logger.info("cuda graph mm embedding complete!")
+                    # logger.info(f"curr_group_outputs: {curr_group_outputs}")
 
             sanity_check_mm_encoder_outputs(
                 curr_group_outputs,
@@ -4592,13 +4601,12 @@ def find_square_like_factors(n: int):
             """Finds two factors of n that are closest to its square root."""
             h = int(math.sqrt(n))
             while h > 0:
+                h-=h&1
                 if n % h == 0:
                     return h, n // h
                 h -= 1
             return 1, n
 
-        h_patches, w_patches = find_square_like_factors(num_image_tokens)
-
         # The first dimension of pixel_values corresponds to the total number of
         # tokens (patches).
         #TODO 修改1176为vit feature dim.
@@ -4611,11 +4619,19 @@ def find_square_like_factors(n: int):
 
         # image_grid_thw specifies the grid layout for a single image.
         # Shape: (1, 3) for (t, h, w) patch counts.
-        image_grid_thw = torch.tensor(
-            [[1, h_patches, w_patches]],
-            dtype=torch.long,
-            device=self.device
-        )
+        if num_image_tokens == 3060:
+            image_grid_thw = torch.tensor(
+                [[1, 46, 34], [1, 44, 34]],
+                dtype=torch.long,
+                device=self.device
+            )
+        else:
+            h_patches, w_patches = find_square_like_factors(num_image_tokens)
+            image_grid_thw = torch.tensor(
+                [[1, h_patches, w_patches]],
+                dtype=torch.long,
+                device=self.device
+            )
 
         return {
             "pixel_values": pixel_values,
@@ -4908,13 +4924,6 @@ def _dummy_run(
                     slot_mapping=slot_mappings,
                 ),
             ):
-                if cudagraph_runtime_mode == CUDAGraphMode.PIECEWISE and self.supports_mm_inputs:
-                    # TODO: This will be improved to support different shapes.
-                    dummy_mm_inputs = self._get_dummy_vit_input(1024)
-                    # logger.info("st.!!!!!!!!!!!!!!!!!!!")
-                    # self.model.visual(dummy_mm_inputs["pixel_values"], grid_thw=dummy_mm_inputs["image_grid_thw"])
-                    self.model.get_multimodal_embeddings(**dummy_mm_inputs)
-                    # logger.info("ed!!!!!!!!!!!!!!!!!!!")
                 outputs = self.model(
                     input_ids=input_ids,
                     positions=positions,
@@ -5157,6 +5166,31 @@ def _dummy_pooler_run(
         max_task = max(output_size.items(), key=lambda x: x[1])[0]
         return self._dummy_pooler_run_task(hidden_states, max_task)
 
+    @torch.inference_mode()
+    def _dummy_mm_encoder_run(
+        self,
+        cudagraph_runtime_mode: CUDAGraphMode | None = None,
+    ) -> None:
+        logger.info("In _dummy_mm_encoder_run")
+        capture_sizes = [16, 32, 64, 128, 256, 512, 1024, 3060, 3128]
+        if cudagraph_runtime_mode == CUDAGraphMode.PIECEWISE and self.supports_mm_inputs:
+            # TODO: This will be improved to support different shapes.
+            for capture_size in capture_sizes:
+                logger.info(f"Capturing {capture_size}")
+                dummy_mm_inputs = self._get_dummy_vit_input(capture_size)
+                batch_descriptor = BatchDescriptor(
+                                    num_tokens=capture_size,
+                                )
+                with (
+                    set_forward_context(
+                        None,
+                        vllm_config=self.vllm_config,
+                        cudagraph_runtime_mode=cudagraph_runtime_mode,
+                        batch_descriptor=batch_descriptor,
+                    ),
+                ):
+                    self.model.get_multimodal_embeddings(**dummy_mm_inputs)
+
     def profile_run(self) -> None:
         # Profile with multimodal encoder & encoder cache.
         if self.supports_mm_inputs:
@@ -5368,6 +5402,9 @@ def _capture_cudagraphs(
                 num_active_loras=num_active_loras,
                 is_graph_capturing=True,
             )
+
+        self._dummy_mm_encoder_run(cudagraph_runtime_mode)
+
         self.maybe_remove_all_loras(self.lora_config)
 
     def initialize_attn_backend(self, kv_cache_config: KVCacheConfig) -> None:

From 438b8ebff324643c0a7cb6d820634f4a41d7f247 Mon Sep 17 00:00:00 2001
From: Hongjian Zhang <hirokenovo@gmail.com>
Date: Thu, 13 Nov 2025 16:35:32 +0800
Subject: [PATCH 03/35] feat: add vit padding

Signed-off-by: Hongjian Zhang <hirokenovo@gmail.com>
---
 vllm/v1/worker/gpu_model_runner.py | 108 ++++++++++++++++++++---------
 1 file changed, 77 insertions(+), 31 deletions(-)

diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 3d98ba7e4ed5..d25acf20df8a 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -599,6 +599,13 @@ def __init__(
             ]
             self.is_mm_embed_idx = 0
 
+            # START: Add persistent buffers for ViT inputs
+            # Use a large enough size for the CUDA graph
+            # The feature dimension is model-specific. We'll initialize
+            # the buffer lazily on the first run to get this dimension.
+            self.pixel_values_buffer: torch.Tensor | None = None
+            self.image_grid_thw_buffer: torch.Tensor | None = None
+
         # Only relevant for models using M-RoPE (e.g, Qwen2-VL)
         if self.uses_mrope:
             # NOTE: `mrope_positions` is implemented with one additional dummy
@@ -2420,6 +2427,46 @@ def _execute_mm_encoder(
                 # 2. A list or tuple (length: num_items) of tensors,
                 # each of shape (feature_size, hidden_size) in case the feature
                 # size is dynamic depending on the input multimodal items.
+                original_num_imgs = -1
+                if "pixel_values" in mm_kwargs_group:
+                    pixel_values = mm_kwargs_group["pixel_values"]
+                    num_tokens = pixel_values.shape[0]
+
+                    # Pad to the size expected by CUDA graph
+                    # TODO
+                    # padded_num_tokens = self.vllm_config.pad_for_mm_cudagraph(num_tokens)
+                    padded_num_tokens = 4096
+
+                    if padded_num_tokens > num_tokens:
+                        assert(self.pixel_values_buffer is not None and self.image_grid_thw_buffer is not None)
+                        
+                        self.pixel_values_buffer[:num_tokens].copy_(pixel_values) # type: ignore
+                        mm_kwargs_group["pixel_values"] = self.pixel_values_buffer[:padded_num_tokens]
+
+                        # Update image_grid_thw to account for padding
+                        if "image_grid_thw" in mm_kwargs_group:
+                            image_grid_thw = mm_kwargs_group["image_grid_thw"]
+                            num_images = image_grid_thw.shape[0]
+                            original_num_imgs = num_images
+                            padding_amount = padded_num_tokens - num_tokens
+                            
+                            # Treat padding as a new virtual image.
+                            # Assuming a fixed patch grid logic where height is merge_size.
+                            merge_size = getattr(self.model_config.hf_config.vision_config, "spatial_merge_size", 1)
+                            assert(padding_amount % (merge_size * merge_size) == 0)
+                            h_patches = merge_size
+                            w_patches = padding_amount // h_patches
+
+                            self.image_grid_thw_buffer[:num_images].copy_(image_grid_thw)
+                            self.image_grid_thw_buffer[num_images] = torch.tensor(
+                                [1, h_patches, w_patches],
+                                dtype=torch.long,
+                                device=self.device
+                            )
+                            mm_kwargs_group["image_grid_thw"] = self.image_grid_thw_buffer[:num_images + 1]
+                    # END: Added padding logic for ViT CUDA Graph
+
+                # TODO get batch_descriptor from dispatch
                 batch_descriptor = BatchDescriptor(
                     num_tokens=mm_kwargs_group["pixel_values"].shape[0],
                 )
@@ -2434,7 +2481,10 @@ def _execute_mm_encoder(
                     curr_group_outputs = model.get_multimodal_embeddings(**mm_kwargs_group)
                     # logger.info("cuda graph mm embedding complete!")
                     # logger.info(f"curr_group_outputs: {curr_group_outputs}")
-
+                # START: Added cropping logic for ViT CUDA Graph
+                if original_num_imgs != -1:
+                    curr_group_outputs = curr_group_outputs[:original_num_imgs]
+            # END: Added cropping logic for ViT CUDA Graph
             sanity_check_mm_encoder_outputs(
                 curr_group_outputs,
                 expected_num_items=num_items,
@@ -4595,17 +4645,11 @@ def _get_dummy_vit_input(self, num_image_tokens: int) -> BatchedTensorInputs:
             `image_grid_thw` that can be passed as kwargs to
             `get_multimodal_embeddings`.
         """
-        import math
-
-        def find_square_like_factors(n: int):
-            """Finds two factors of n that are closest to its square root."""
-            h = int(math.sqrt(n))
-            while h > 0:
-                h-=h&1
-                if n % h == 0:
-                    return h, n // h
-                h -= 1
-            return 1, n
+        def _get_dummy_h_w_patches(patches: int):
+            assert patches % 4 == 0, "Number of patches must be multiple of 4"
+            h_patches = 2
+            w_patches = patches // 2
+            return h_patches, w_patches
 
         # The first dimension of pixel_values corresponds to the total number of
         # tokens (patches).
@@ -4617,21 +4661,12 @@ def find_square_like_factors(n: int):
             device=self.device
         )
 
-        # image_grid_thw specifies the grid layout for a single image.
-        # Shape: (1, 3) for (t, h, w) patch counts.
-        if num_image_tokens == 3060:
-            image_grid_thw = torch.tensor(
-                [[1, 46, 34], [1, 44, 34]],
-                dtype=torch.long,
-                device=self.device
-            )
-        else:
-            h_patches, w_patches = find_square_like_factors(num_image_tokens)
-            image_grid_thw = torch.tensor(
-                [[1, h_patches, w_patches]],
-                dtype=torch.long,
-                device=self.device
-            )
+        h_patches, w_patches = _get_dummy_h_w_patches(num_image_tokens)
+        image_grid_thw = torch.tensor(
+            [[1, h_patches, w_patches]],
+            dtype=torch.long,
+            device=self.device
+        )
 
         return {
             "pixel_values": pixel_values,
@@ -5172,8 +5207,10 @@ def _dummy_mm_encoder_run(
         cudagraph_runtime_mode: CUDAGraphMode | None = None,
     ) -> None:
         logger.info("In _dummy_mm_encoder_run")
-        capture_sizes = [16, 32, 64, 128, 256, 512, 1024, 3060, 3128]
-        if cudagraph_runtime_mode == CUDAGraphMode.PIECEWISE and self.supports_mm_inputs:
+        # capture_sizes = [16, 32, 64, 128, 256, 512, 1024, 3060, 3128]
+        capture_sizes = [4096]
+        # Lazy initialization of the persistent buffer
+        if cudagraph_runtime_mode == CUDAGraphMode.PIECEWISE:
             # TODO: This will be improved to support different shapes.
             for capture_size in capture_sizes:
                 logger.info(f"Capturing {capture_size}")
@@ -5181,6 +5218,15 @@ def _dummy_mm_encoder_run(
                 batch_descriptor = BatchDescriptor(
                                     num_tokens=capture_size,
                                 )
+                if self.pixel_values_buffer is None:
+                    self.pixel_values_buffer = torch.zeros(
+                        (capture_sizes[-1], dummy_mm_inputs["pixel_values"].shape[1]), 
+                        dtype=self.dtype,
+                        device=self.device
+                    )
+                    self.image_grid_thw_buffer = torch.zeros((
+                        200, 3), dtype=torch.long, device=self.device
+                    )
                 with (
                     set_forward_context(
                         None,
@@ -5402,8 +5448,8 @@ def _capture_cudagraphs(
                 num_active_loras=num_active_loras,
                 is_graph_capturing=True,
             )
-
-        self._dummy_mm_encoder_run(cudagraph_runtime_mode)
+        if self.supports_mm_inputs:
+            self._dummy_mm_encoder_run(cudagraph_runtime_mode)
 
         self.maybe_remove_all_loras(self.lora_config)
 

From 7eaac5ce0b7f7d86788ff8dedaf349ec5f8a0196 Mon Sep 17 00:00:00 2001
From: Hongjian Zhang <hirokenovo@gmail.com>
Date: Thu, 13 Nov 2025 19:46:37 +0800
Subject: [PATCH 04/35] fix: fix vit cuda graph weak ref issue and first graph
 gc issue

Signed-off-by: Hongjian Zhang <hirokenovo@gmail.com>
---
 vllm/compilation/backends.py             | 41 ++++++++++++++++++++-
 vllm/model_executor/models/qwen2_5_vl.py | 47 +++++++++++++-----------
 2 files changed, 64 insertions(+), 24 deletions(-)

diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py
index 89981fc29963..ce2cacd0b7cd 100644
--- a/vllm/compilation/backends.py
+++ b/vllm/compilation/backends.py
@@ -48,6 +48,41 @@
 
 logger = init_logger(__name__)
 
+# A global flag to indicate if the current graph being compiled
+# is the last one in a sequence of graphs (e.g., a sequence of blocks).
+# This is a workaround to control CUDAGraph weak_ref_output behavior
+# in **vit** piecewise compilation.
+_is_last_graph_in_vit_sequence: bool = True
+
+@contextmanager
+def set_is_last_graph_in_sequence(is_last: bool):
+    """Mark whether the graph being compiled inside the `with` body is the last
+    one in a sequence of graphs; the previous flag value is restored on exit.
+    """
+    global _is_last_graph_in_vit_sequence
+    original_value = _is_last_graph_in_vit_sequence
+    _is_last_graph_in_vit_sequence = is_last
+    try:
+        yield
+    finally:
+        _is_last_graph_in_vit_sequence = original_value
+
+# A global flag to indicate if the current graph being compiled
+# is the first one in a sequence of graphs (e.g., a sequence of blocks).
+_is_first_graph_in_vit_sequence: bool = True
+
+@contextmanager
+def set_is_first_graph_in_sequence(is_first: bool):
+    """Mark whether the graph being compiled inside the `with` body is the first
+    one in a sequence of graphs; the previous flag value is restored on exit.
+    """
+    global _is_first_graph_in_vit_sequence
+    original_value = _is_first_graph_in_vit_sequence
+    _is_first_graph_in_vit_sequence = is_first
+    try:
+        yield
+    finally:
+        _is_first_graph_in_vit_sequence = original_value
 
 def make_copy_and_call(
     sym_tensor_indices: list[int],
@@ -449,8 +484,10 @@ def wrap_with_cudagraph_if_needed(
         runtime_mode=CUDAGraphMode.PIECEWISE,
         cudagraph_options=CUDAGraphOptions(
             debug_log_enable=is_first_graph,
-            gc_disable=not is_first_graph,
-            weak_ref_output=is_last_graph,
+            gc_disable=not is_first_graph
+                        or not _is_first_graph_in_vit_sequence,
+            weak_ref_output=is_last_graph
+                        and _is_last_graph_in_vit_sequence,
         ),
     )
 
diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py
index bdafcdac5200..8186ff244608 100644
--- a/vllm/model_executor/models/qwen2_5_vl.py
+++ b/vllm/model_executor/models/qwen2_5_vl.py
@@ -641,8 +641,8 @@ def __init__(
                 prefix=f"{prefix}.merger",
             )
 
-        self._persistent_hidden_states_buffer = torch.empty((4096, 1176), device=self.device, dtype=self.dtype)
-        self._persistent_rotary_pos_emb_buffer = torch.empty((4096, 40), device=self.device, dtype=self.dtype)
+        self._persistent_hidden_states_buffer = torch.empty((8192, 1176), device=self.device, dtype=self.dtype)
+        self._persistent_rotary_pos_emb_buffer = torch.empty((8192, 40), device=self.device, dtype=torch.float32)
 
     @property
     def dtype(self) -> torch.dtype:
@@ -788,13 +788,15 @@ def forward(
         cu_seqlens: list = []
 
         # logger.info(f"X Shape: {x.shape}")
-        if seq_len < 4096:
+        if seq_len < 8192:
             hidden_states = self._persistent_hidden_states_buffer[:seq_len]
             hidden_states.copy_(x, non_blocking=True)
         else:
             hidden_states = x.to(device=self.device, dtype=self.dtype)
 
-        hidden_states = self.patch_embed(hidden_states)
+        from vllm.compilation.backends import set_is_first_graph_in_sequence, set_is_last_graph_in_sequence
+        with set_is_first_graph_in_sequence(True), set_is_last_graph_in_sequence(False):
+            hidden_states = self.patch_embed(hidden_states)
 
         window_index_id = 0
         cu_window_seqlens_last = 0
@@ -848,7 +850,7 @@ def forward(
             device=self.device, non_blocking=True
         )
         rotary_pos_emb = rotary_pos_emb.to(device=self.device, non_blocking=True)
-        if seq_len < 4096:
+        if seq_len < 8192:
             rotary_pos_emb = self._persistent_rotary_pos_emb_buffer[:seq_len].copy_(rotary_pos_emb)
         window_index = window_index.to(device=hidden_states.device, non_blocking=True)
         reverse_indices = reverse_indices.to(
@@ -877,22 +879,22 @@ def forward(
         # logger.info(f"After Copy, tmp address: {tmp.storage().data_ptr()}")
 
         # logger.info(f"Before Input to Vision Block, Shape: {hidden_states.shape}")
-
-        for layer_num, blk in enumerate(self.blocks):
-            if layer_num in self.fullatt_block_indexes:
-                cu_seqlens_now = cu_seqlens
-                max_seqlen_now = max_seqlen_full
-            else:
-                cu_seqlens_now = cu_window_seqlens
-                max_seqlen_now = max_seqlen_window
-
-            hidden_states = blk(
-                hidden_states,
-                cu_seqlens=cu_seqlens_now,
-                rotary_pos_emb_cos=rotary_pos_emb_cos,
-                rotary_pos_emb_sin=rotary_pos_emb_sin,
-                max_seqlen=max_seqlen_now,
-            )
+        with set_is_first_graph_in_sequence(False), set_is_last_graph_in_sequence(False):
+            for layer_num, blk in enumerate(self.blocks):
+                if layer_num in self.fullatt_block_indexes:
+                    cu_seqlens_now = cu_seqlens
+                    max_seqlen_now = max_seqlen_full
+                else:
+                    cu_seqlens_now = cu_window_seqlens
+                    max_seqlen_now = max_seqlen_window
+
+                hidden_states = blk(
+                    hidden_states,
+                    cu_seqlens=cu_seqlens_now,
+                    rotary_pos_emb_cos=rotary_pos_emb_cos,
+                    rotary_pos_emb_sin=rotary_pos_emb_sin,
+                    max_seqlen=max_seqlen_now,
+                )
 
         # For Qwen2.5-VL-3B, float16 will overflow at last block
         # for long visual tokens sequences.
@@ -900,7 +902,8 @@ def forward(
             hidden_states = cast_overflow_tensors(hidden_states)
 
         # adapter
-        hidden_states = self.merger(hidden_states)
+        with set_is_first_graph_in_sequence(False), set_is_last_graph_in_sequence(True):
+            hidden_states = self.merger(hidden_states)
         hidden_states = hidden_states[reverse_indices, :]
         return hidden_states
 

From 330fe8605228d60d18b11af9d3dca53dc50d90ac Mon Sep 17 00:00:00 2001
From: Hongjian Zhang <hirokenovo@gmail.com>
Date: Tue, 18 Nov 2025 13:02:44 +0800
Subject: [PATCH 05/35] feat: add vit cudagraph capture sizes and related
 functionality

Signed-off-by: Hongjian Zhang <hirokenovo@gmail.com>
---
 vllm/config/compilation.py         |   4 ++
 vllm/config/vllm.py                |  53 +++++++++++++++
 vllm/engine/arg_utils.py           |  15 +++++
 vllm/v1/worker/gpu_model_runner.py | 103 ++++++++++++++++-------------
 4 files changed, 128 insertions(+), 47 deletions(-)

diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py
index 7a69629f707c..b74988b2a711 100644
--- a/vllm/config/compilation.py
+++ b/vllm/config/compilation.py
@@ -530,6 +530,10 @@ class CompilationConfig:
     """Sizes to capture cudagraph.
     - None (default): capture sizes are inferred from vllm config.
     - list[int]: capture sizes are specified as given."""
+    vit_cudagraph_capture_sizes: list[int] | None = None
+    """Sizes to capture vit cudagraph.
+    - None (default): capture sizes are inferred from vllm config.
+    - list[int]: capture sizes are specified as given."""
     cudagraph_copy_inputs: bool = False
     """Whether to copy input tensors for
     cudagraph. If the caller can guarantee that the same input buffers
diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py
index ef18ce03d88e..ad3c9ea80a88 100644
--- a/vllm/config/vllm.py
+++ b/vllm/config/vllm.py
@@ -15,6 +15,7 @@
 from functools import lru_cache
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, TypeVar, get_args
+import bisect
 
 import torch
 from pydantic import ConfigDict, Field, model_validator
@@ -360,6 +361,21 @@ def compute_hash(self) -> str:
         ]
         return hash_str
 
+    def pad_for_cudagraph(self, batch_size: int) -> int:
+        """Return compilation_config.bs_to_padded_graph_size[batch_size].
+
+        The caller must ensure batch_size <= max_cudagraph_capture_size;
+        a larger value is expected to raise IndexError (TODO confirm)."""
+        return self.compilation_config.bs_to_padded_graph_size[batch_size]
+
+    def pad_for_vit_cudagraph(self, batch_size: int) -> int:
+        """Return the smallest ViT capture size >= batch_size, else batch_size."""
+        # The sizes stay None when cudagraphs are disabled: apply no padding.
+        capture_sizes = self.compilation_config.vit_cudagraph_capture_sizes or []
+        # Relies on ascending order: index of the first element >= batch_size.
+        idx = bisect.bisect_left(capture_sizes, batch_size)
+        return capture_sizes[idx] if idx < len(capture_sizes) else batch_size
+
     @property
     def needs_dp_coordinator(self) -> bool:
         """
@@ -815,6 +831,7 @@ def has_blocked_weights():
                 self.compilation_config.cudagraph_num_of_warmups = 1
 
             self._set_cudagraph_sizes()
+            self._set_vit_cudagraph_sizes()
         else:
             self.compilation_config.cudagraph_mode = CUDAGraphMode.NONE
 
@@ -1333,6 +1350,42 @@ def _set_compile_ranges(self):
         compilation_config.compile_ranges_split_points = sorted(
             computed_compile_ranges_split_points
         )
+    def _set_vit_cudagraph_sizes(self):
+        if (
+            self.model_config is not None
+            and not self.model_config.enforce_eager
+            and self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE
+        ):
+            # determine the vit_cudagraph_capture_sizes
+            if self.compilation_config.vit_cudagraph_capture_sizes is not None:
+                assert len(self.compilation_config.vit_cudagraph_capture_sizes) > 0, (
+                    "vit_cudagraph_capture_sizes should contain at least one element "
+                    "when using cuda graph."
+                )
+                # sort to make sure the sizes are in ascending order
+                self.compilation_config.vit_cudagraph_capture_sizes.sort()
+                # de-duplicate the sizes provided by the config
+                dedup_sizes = list(set(self.compilation_config.vit_cudagraph_capture_sizes))
+                vit_cudagraph_capture_sizes = dedup_sizes
+            else:
+                max_vit_cudagraph_capture_size = 5120
+                vit_cudagraph_capture_sizes = [
+                    i for i in [16, 32, 64, 128, 256] if i <= max_vit_cudagraph_capture_size
+                ]
+                if max_vit_cudagraph_capture_size >= 1024:
+                    # Step size 64 for small batch sizes, up to 2048(not included)
+                    vit_cudagraph_capture_sizes += list(
+                        range(512, min(max_vit_cudagraph_capture_size + 1, 2048), 64)
+                    )
+                if max_vit_cudagraph_capture_size >= 2048:
+                    # Step size 128 for larger batch sizes
+                    vit_cudagraph_capture_sizes += list(
+                        range(2048, max_vit_cudagraph_capture_size + 1, 128)
+                    )
+            self.compilation_config.vit_cudagraph_capture_sizes = vit_cudagraph_capture_sizes
+        else:
+            # no cudagraph in use
+            self.compilation_config.vit_cudagraph_capture_sizes = []
 
     def try_verify_and_update_config(self):
         if self.model_config is None:
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index f3e7729f64e3..b6d901581f6c 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -378,6 +378,9 @@ class EngineArgs:
     max_cudagraph_capture_size: int | None = get_field(
         CompilationConfig, "max_cudagraph_capture_size"
     )
+    vit_cudagraph_capture_sizes: list[int] | None = (
+        CompilationConfig.vit_cudagraph_capture_sizes
+    )
     # Note: Specifying a custom executor backend by passing a class
     # is intended for expert use only. The API may change without
     # notice.
@@ -1148,6 +1151,9 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
         compilation_group.add_argument(
             "--cudagraph-capture-sizes", **compilation_kwargs["cudagraph_capture_sizes"]
         )
+        compilation_group.add_argument(
+            "--vit-cudagraph-capture-sizes", **compilation_kwargs["vit_cudagraph_capture_sizes"]
+        )
         compilation_group.add_argument(
             "--max-cudagraph-capture-size",
             **compilation_kwargs["max_cudagraph_capture_size"],
@@ -1737,6 +1743,15 @@ def create_engine_config(
                     "cudagraph_capture_sizes are mutually exclusive"
                 )
             compilation_config.cudagraph_capture_sizes = self.cudagraph_capture_sizes
+        
+        if self.vit_cudagraph_capture_sizes is not None:
+            if compilation_config.vit_cudagraph_capture_sizes is not None:
+                raise ValueError(
+                    "vit_cudagraph_capture_sizes and compilation_config."
+                    "vit_cudagraph_capture_sizes are mutually exclusive"
+                )
+            compilation_config.vit_cudagraph_capture_sizes = self.vit_cudagraph_capture_sizes
+            
         if self.max_cudagraph_capture_size is not None:
             if compilation_config.max_cudagraph_capture_size is not None:
                 raise ValueError(
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index d25acf20df8a..f3a2c4a5c0fe 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -552,6 +552,14 @@ def __init__(
             self.cudagraph_batch_sizes = sorted(
                 self.compilation_config.cudagraph_capture_sizes
             )
+        # self.vit_cudagraph_batch_sizes sorts in ascending order.
+        if (
+            self.compilation_config.vit_cudagraph_capture_sizes
+            and self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE
+        ):
+            self.vit_cudagraph_batch_sizes = sorted(
+                self.compilation_config.vit_cudagraph_capture_sizes
+            )
 
         # Cache the device properties.
         self._init_device_properties()
@@ -2433,9 +2441,7 @@ def _execute_mm_encoder(
                     num_tokens = pixel_values.shape[0]
 
                     # Pad to the size expected by CUDA graph
-                    # TODO
-                    # padded_num_tokens = self.vllm_config.pad_for_mm_cudagraph(num_tokens)
-                    padded_num_tokens = 4096
+                    padded_num_tokens = self.vllm_config.pad_for_vit_cudagraph(num_tokens)
 
                     if padded_num_tokens > num_tokens:
                         assert(self.pixel_values_buffer is not None and self.image_grid_thw_buffer is not None)
@@ -2464,7 +2470,6 @@ def _execute_mm_encoder(
                                 device=self.device
                             )
                             mm_kwargs_group["image_grid_thw"] = self.image_grid_thw_buffer[:num_images + 1]
-                    # END: Added padding logic for ViT CUDA Graph
 
                 # TODO get batch_descriptor from dispatch
                 batch_descriptor = BatchDescriptor(
@@ -2479,12 +2484,9 @@ def _execute_mm_encoder(
                         should_time, mm_lora_refs, current_item_idx, num_items
                     ):
                     curr_group_outputs = model.get_multimodal_embeddings(**mm_kwargs_group)
-                    # logger.info("cuda graph mm embedding complete!")
-                    # logger.info(f"curr_group_outputs: {curr_group_outputs}")
-                # START: Added cropping logic for ViT CUDA Graph
+                # Remove the padded items before sanity check
                 if original_num_imgs != -1:
                     curr_group_outputs = curr_group_outputs[:original_num_imgs]
-            # END: Added cropping logic for ViT CUDA Graph
             sanity_check_mm_encoder_outputs(
                 curr_group_outputs,
                 expected_num_items=num_items,
@@ -4628,7 +4630,7 @@ def rand_inputs_embeds() -> torch.Tensor:
             yield
             inputs_embeds.fill_(0)
 
-    def _get_dummy_vit_input(self, num_image_tokens: int) -> BatchedTensorInputs:
+    def _get_dummy_vit_input(self, num_image_tokens: int, img_feature_dim: int) -> BatchedTensorInputs:
         """
         Generates dummy multimodal inputs for a single image, with a controllable
         number of resulting image tokens for a Vision Transformer (ViT) like model,
@@ -4646,17 +4648,16 @@ def _get_dummy_vit_input(self, num_image_tokens: int) -> BatchedTensorInputs:
             `get_multimodal_embeddings`.
         """
         def _get_dummy_h_w_patches(patches: int):
-            assert patches % 4 == 0, "Number of patches must be multiple of 4"
-            h_patches = 2
-            w_patches = patches // 2
+            merge_size = getattr(self.model_config.hf_config.vision_config, "spatial_merge_size", 1)
+            assert(patches % (merge_size * merge_size) == 0, "Number of patches must be multiple of merge_size squared")
+            h_patches = merge_size
+            w_patches = patches // merge_size
             return h_patches, w_patches
 
         # The first dimension of pixel_values corresponds to the total number of
         # tokens (patches).
-        #TODO 修改1176为vit feature dim.
-        # 根据num_image_tokens反推原图片长宽利用原api跑一遍？还是先跑一遍得到结果后取其feature dim再构造
         pixel_values = torch.zeros(
-            (num_image_tokens, 1176),
+            (num_image_tokens, img_feature_dim),
             dtype=self.dtype,
             device=self.device
         )
@@ -5204,38 +5205,43 @@ def _dummy_pooler_run(
     @torch.inference_mode()
     def _dummy_mm_encoder_run(
         self,
-        cudagraph_runtime_mode: CUDAGraphMode | None = None,
+        compilation_cases: list[int],
     ) -> None:
-        logger.info("In _dummy_mm_encoder_run")
-        # capture_sizes = [16, 32, 64, 128, 256, 512, 1024, 3060, 3128]
-        capture_sizes = [4096]
-        # Lazy initialization of the persistent buffer
-        if cudagraph_runtime_mode == CUDAGraphMode.PIECEWISE:
-            # TODO: This will be improved to support different shapes.
-            for capture_size in capture_sizes:
-                logger.info(f"Capturing {capture_size}")
-                dummy_mm_inputs = self._get_dummy_vit_input(capture_size)
-                batch_descriptor = BatchDescriptor(
-                                    num_tokens=capture_size,
-                                )
-                if self.pixel_values_buffer is None:
-                    self.pixel_values_buffer = torch.zeros(
-                        (capture_sizes[-1], dummy_mm_inputs["pixel_values"].shape[1]), 
-                        dtype=self.dtype,
-                        device=self.device
+        if self.pixel_values_buffer is None:
+            tmp_dummy_mm_inputs = self._get_mm_dummy_batch(
+                        "image",
+                        1,
                     )
-                    self.image_grid_thw_buffer = torch.zeros((
-                        200, 3), dtype=torch.long, device=self.device
-                    )
-                with (
-                    set_forward_context(
-                        None,
-                        vllm_config=self.vllm_config,
-                        cudagraph_runtime_mode=cudagraph_runtime_mode,
-                        batch_descriptor=batch_descriptor,
-                    ),
-                ):
-                    self.model.get_multimodal_embeddings(**dummy_mm_inputs)
+            img_feature_dim = tmp_dummy_mm_inputs["pixel_values"].shape[1]
+            self.pixel_values_buffer = torch.zeros(
+                (compilation_cases[0], img_feature_dim),
+                dtype=self.dtype,
+                device=self.device
+            )
+            self.image_grid_thw_buffer = torch.zeros((
+                512, 3), dtype=torch.long, device=self.device
+            )
+        if is_global_first_rank():
+            compilation_cases = tqdm(
+                compilation_cases,
+                disable=not self.load_config.use_tqdm_on_load,
+                desc="Capturing Vit CUDA graphs (PIECEWISE)",
+            )
+        # Lazy initialization of the persistent buffer
+        for capture_size in compilation_cases:
+            dummy_mm_inputs = self._get_dummy_vit_input(capture_size, img_feature_dim)
+            batch_descriptor = BatchDescriptor(
+                                num_tokens=capture_size,
+                            )
+            with (
+                set_forward_context(
+                    None,
+                    vllm_config=self.vllm_config,
+                    cudagraph_runtime_mode=CUDAGraphMode.PIECEWISE,
+                    batch_descriptor=batch_descriptor,
+                ),
+            ):
+                self.model.get_multimodal_embeddings(**dummy_mm_inputs)
 
     def profile_run(self) -> None:
         # Profile with multimodal encoder & encoder cache.
@@ -5448,8 +5454,11 @@ def _capture_cudagraphs(
                 num_active_loras=num_active_loras,
                 is_graph_capturing=True,
             )
-        if self.supports_mm_inputs:
-            self._dummy_mm_encoder_run(cudagraph_runtime_mode)
+        if cudagraph_runtime_mode == CUDAGraphMode.PIECEWISE and self.supports_mm_inputs:
+            vit_capture_sizes = self.vit_cudagraph_batch_sizes
+            if vit_capture_sizes:
+                compilation_cases_vit = list(reversed(vit_capture_sizes))
+                self._dummy_mm_encoder_run(compilation_cases_vit)
 
         self.maybe_remove_all_loras(self.lora_config)
 

From fac98f96887edcefe3095a008a2acf5db9e32ab9 Mon Sep 17 00:00:00 2001
From: Xingran Wang <wangxingran123456@outlook.com>
Date: Tue, 18 Nov 2025 16:47:23 +0800
Subject: [PATCH 06/35] ViT cuda graph dispatcher

Signed-off-by: Xingran Wang <wangxingran123456@outlook.com>
---
 vllm/forward_context.py            |  5 +++++
 vllm/v1/cudagraph_dispatcher.py    |  8 ++++++++
 vllm/v1/worker/gpu_model_runner.py | 12 +++++++++---
 3 files changed, 22 insertions(+), 3 deletions(-)

diff --git a/vllm/forward_context.py b/vllm/forward_context.py
index e308c05bc669..1b89c04851f0 100644
--- a/vllm/forward_context.py
+++ b/vllm/forward_context.py
@@ -55,6 +55,10 @@ class BatchDescriptor(NamedTuple):
     (like fused_moe_lora) whose grid size depends on num_active_loras
     to be properly captured.
     """
+    is_vit: bool = False
+    """
+    ViT Piecewise CUDA Graph Flag
+    """
 
     def relax_for_mixed_batch_cudagraphs(self) -> "BatchDescriptor":
         """
@@ -67,6 +71,7 @@ def relax_for_mixed_batch_cudagraphs(self) -> "BatchDescriptor":
             uniform=False,
             has_lora=self.has_lora,
             num_active_loras=self.num_active_loras,
+            is_vit=self.is_vit,
         )
 
 
diff --git a/vllm/v1/cudagraph_dispatcher.py b/vllm/v1/cudagraph_dispatcher.py
index 6f3e029c793b..3368f97fe3b3 100644
--- a/vllm/v1/cudagraph_dispatcher.py
+++ b/vllm/v1/cudagraph_dispatcher.py
@@ -186,6 +186,14 @@ def initialize_cudagraph_keys(
                         bs, False, num_active_loras > 0, num_active_loras
                     ).relax_for_mixed_batch_cudagraphs(),
                 )
+            # ViT CUDAGraph Entry
+            for vit_patch_len in self.compilation_config.vit_cudagraph_capture_sizes:
+                self.add_cudagraph_key(
+                    cudagraph_mode.mixed_mode(),
+                    BatchDescriptor(
+                        num_tokens=vit_patch_len, uniform_decode=False, is_vit=True
+                    ),
+                )
 
         # if decode cudagraph mode is FULL, and we don't already have mixed
         # mode full cudagraphs then add them here.
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index f3a2c4a5c0fe..0128262de226 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -2436,6 +2436,7 @@ def _execute_mm_encoder(
                 # each of shape (feature_size, hidden_size) in case the feature
                 # size is dynamic depending on the input multimodal items.
                 original_num_imgs = -1
+                padded_num_tokens = -1
                 if "pixel_values" in mm_kwargs_group:
                     pixel_values = mm_kwargs_group["pixel_values"]
                     num_tokens = pixel_values.shape[0]
@@ -2471,14 +2472,18 @@ def _execute_mm_encoder(
                             )
                             mm_kwargs_group["image_grid_thw"] = self.image_grid_thw_buffer[:num_images + 1]
 
-                # TODO get batch_descriptor from dispatch
+                # get batch_descriptor from dispatcher
                 batch_descriptor = BatchDescriptor(
-                    num_tokens=mm_kwargs_group["pixel_values"].shape[0],
+                    num_tokens=padded_num_tokens,
+                    is_vit=True,
+                )
+                cudagraph_runtime_mode, batch_descriptor = (
+                    self.cudagraph_dispatcher.dispatch(batch_descriptor, False)
                 )
                 with set_forward_context(
                         None,
                         vllm_config=self.vllm_config,
-                        cudagraph_runtime_mode=CUDAGraphMode.PIECEWISE,
+                        cudagraph_runtime_mode=cudagraph_runtime_mode,
                         batch_descriptor=batch_descriptor,
                     ), self.timed_encoder_operation(
                         should_time, mm_lora_refs, current_item_idx, num_items
@@ -5232,6 +5237,7 @@ def _dummy_mm_encoder_run(
             dummy_mm_inputs = self._get_dummy_vit_input(capture_size, img_feature_dim)
             batch_descriptor = BatchDescriptor(
                                 num_tokens=capture_size,
+                                is_vit=True,
                             )
             with (
                 set_forward_context(

From 2762ba685a7bfa8130c7b50c8589d27c949a717e Mon Sep 17 00:00:00 2001
From: Hongjian Zhang <hirokenovo@gmail.com>
Date: Tue, 18 Nov 2025 19:44:46 +0800
Subject: [PATCH 07/35] feat: update Qwen2.5-VL model to support dynamic buffer
 sizes based on CUDA graph capture settings

Signed-off-by: Hongjian Zhang <hirokenovo@gmail.com>
---
 vllm/model_executor/models/qwen2_5_vl.py | 32 +++++++++++++-----------
 1 file changed, 18 insertions(+), 14 deletions(-)

diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py
index 8186ff244608..9d59671c6772 100644
--- a/vllm/model_executor/models/qwen2_5_vl.py
+++ b/vllm/model_executor/models/qwen2_5_vl.py
@@ -43,10 +43,10 @@
 )
 
 from vllm.compilation.decorators import support_torch_compile
-from vllm.config import VllmConfig
+from vllm.config import CUDAGraphMode, MultiModalConfig, VllmConfig, get_current_vllm_config
 from vllm.distributed import parallel_state
 from vllm.distributed import utils as dist_utils
-from vllm.forward_context import set_forward_context
+from vllm.forward_context import get_forward_context, set_forward_context
 from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import get_act_and_mul_fn
 from vllm.model_executor.layers.attention import MMEncoderAttention
@@ -640,9 +640,17 @@ def __init__(
                 quant_config=quant_config,
                 prefix=f"{prefix}.merger",
             )
-
-        self._persistent_hidden_states_buffer = torch.empty((8192, 1176), device=self.device, dtype=self.dtype)
-        self._persistent_rotary_pos_emb_buffer = torch.empty((8192, 40), device=self.device, dtype=torch.float32)
+        vllm_config: VllmConfig = get_current_vllm_config()
+        self._persistent_hidden_states_buffer = None
+        self._persistent_rotary_pos_emb_buffer = None
+        if vllm_config.compilation_config.vit_cudagraph_capture_sizes:
+            max_compile_size = vllm_config.compilation_config.vit_cudagraph_capture_sizes[-1]
+            self._persistent_hidden_states_buffer = torch.empty(
+                (max_compile_size, self.patch_embed.proj.input_size), device=self.device, dtype=self.dtype
+            )
+            self._persistent_rotary_pos_emb_buffer = torch.empty(
+                (max_compile_size, head_dim // 2), device=self.device, dtype=torch.float32
+            )
 
     @property
     def dtype(self) -> torch.dtype:
@@ -787,8 +795,9 @@ def forward(
         cu_window_seqlens: list = [torch.tensor([0], dtype=torch.int32)]
         cu_seqlens: list = []
 
-        # logger.info(f"X Shape: {x.shape}")
-        if seq_len < 8192:
+        fwd_ctx = get_forward_context()
+        if self._persistent_hidden_states_buffer is not None and \
+            fwd_ctx and fwd_ctx.cudagraph_runtime_mode == CUDAGraphMode.PIECEWISE:
             hidden_states = self._persistent_hidden_states_buffer[:seq_len]
             hidden_states.copy_(x, non_blocking=True)
         else:
@@ -850,21 +859,19 @@ def forward(
             device=self.device, non_blocking=True
         )
         rotary_pos_emb = rotary_pos_emb.to(device=self.device, non_blocking=True)
-        if seq_len < 8192:
+        if self._persistent_rotary_pos_emb_buffer is not None and \
+            fwd_ctx and fwd_ctx.cudagraph_runtime_mode == CUDAGraphMode.PIECEWISE:
             rotary_pos_emb = self._persistent_rotary_pos_emb_buffer[:seq_len].copy_(rotary_pos_emb)
         window_index = window_index.to(device=hidden_states.device, non_blocking=True)
         reverse_indices = reverse_indices.to(
             device=hidden_states.device, non_blocking=True
         )
         original_hidden_states = hidden_states  # 这只是引用，不是拷贝
-        # logger.info(f"Before Copy, original address: {original_hidden_states.storage().data_ptr()}")
-        # logger.info(f"Original Numel: {original_hidden_states.numel()}")
         # Step 2: 执行一些转换操作（这些会创建新张量）
         tmp = original_hidden_states.reshape(seq_len // self.spatial_merge_unit, self.spatial_merge_unit, -1)
         tmp = tmp[window_index, :, :]
         tmp = tmp.reshape(seq_len, -1)
         tmp = tmp.unsqueeze(1)
-        # logger.info(f"Tmp Numel: {tmp.numel()}")
         # Step 3: 将结果拷贝回原始张量的显存地址中（这是原地拷贝！）
         original_storage = original_hidden_states.storage()
         tmp_storage = tmp.storage()
@@ -875,10 +882,7 @@ def forward(
         new_shape = tmp.shape  # (seq_len, 1, new_hidden_dim)
         hidden_states = original_hidden_states.view(new_shape)
         # 现在 hidden_states.shape == new_shape，且使用和 original 相同的显存
-        # logger.info(f"After Copy, original address: {original_hidden_states.storage().data_ptr()}")
-        # logger.info(f"After Copy, tmp address: {tmp.storage().data_ptr()}")
 
-        # logger.info(f"Before Input to Vision Block, Shape: {hidden_states.shape}")
         with set_is_first_graph_in_sequence(False), set_is_last_graph_in_sequence(False):
             for layer_num, blk in enumerate(self.blocks):
                 if layer_num in self.fullatt_block_indexes:

From fb9225e50414c069bdaefed1c1685c4452b5629c Mon Sep 17 00:00:00 2001
From: Hongjian Zhang <hirokenovo@gmail.com>
Date: Wed, 19 Nov 2025 16:15:46 +0800
Subject: [PATCH 08/35] fix: Ordering vit_cudagraph capture sizes and disable
 vit dp mode

Signed-off-by: Hongjian Zhang <hirokenovo@gmail.com>
---
 vllm/config/vllm.py                      | 6 ++++--
 vllm/model_executor/models/qwen2_5_vl.py | 4 ++--
 vllm/v1/worker/gpu_model_runner.py       | 4 +++-
 3 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py
index ad3c9ea80a88..fc3a421a3d76 100644
--- a/vllm/config/vllm.py
+++ b/vllm/config/vllm.py
@@ -1355,6 +1355,8 @@ def _set_vit_cudagraph_sizes(self):
             self.model_config is not None
             and not self.model_config.enforce_eager
             and self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE
+            and self.model_config.multimodal_config is not None
+            and self.model_config.multimodal_config.mm_encoder_tp_mode != "data"
         ):
             # determine the vit_cudagraph_capture_sizes
             if self.compilation_config.vit_cudagraph_capture_sizes is not None:
@@ -1362,11 +1364,11 @@ def _set_vit_cudagraph_sizes(self):
                     "vit_cudagraph_capture_sizes should contain at least one element "
                     "when using cuda graph."
                 )
-                # sort to make sure the sizes are in ascending order
-                self.compilation_config.vit_cudagraph_capture_sizes.sort()
                 # de-duplicate the sizes provided by the config
                 dedup_sizes = list(set(self.compilation_config.vit_cudagraph_capture_sizes))
                 vit_cudagraph_capture_sizes = dedup_sizes
+                # sort to make sure the sizes are in ascending order
+                vit_cudagraph_capture_sizes.sort()
             else:
                 max_vit_cudagraph_capture_size = 5120
                 vit_cudagraph_capture_sizes = [
diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py
index 9d59671c6772..e4976fb2e416 100644
--- a/vllm/model_executor/models/qwen2_5_vl.py
+++ b/vllm/model_executor/models/qwen2_5_vl.py
@@ -641,8 +641,8 @@ def __init__(
                 prefix=f"{prefix}.merger",
             )
         vllm_config: VllmConfig = get_current_vllm_config()
-        self._persistent_hidden_states_buffer = None
-        self._persistent_rotary_pos_emb_buffer = None
+        self._persistent_hidden_states_buffer: torch.Tensor | None = None
+        self._persistent_rotary_pos_emb_buffer: torch.Tensor | None = None
         if vllm_config.compilation_config.vit_cudagraph_capture_sizes:
             max_compile_size = vllm_config.compilation_config.vit_cudagraph_capture_sizes[-1]
             self._persistent_hidden_states_buffer = torch.empty(
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 0128262de226..8fc731e245c9 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -560,6 +560,8 @@ def __init__(
             self.vit_cudagraph_batch_sizes = sorted(
                 self.compilation_config.vit_cudagraph_capture_sizes
             )
+        else:
+            self.vit_cudagraph_batch_sizes = None
 
         # Cache the device properties.
         self._init_device_properties()
@@ -4654,7 +4656,7 @@ def _get_dummy_vit_input(self, num_image_tokens: int, img_feature_dim: int) -> B
         """
         def _get_dummy_h_w_patches(patches: int):
             merge_size = getattr(self.model_config.hf_config.vision_config, "spatial_merge_size", 1)
-            assert(patches % (merge_size * merge_size) == 0, "Number of patches must be multiple of merge_size squared")
+            assert(patches % (merge_size * merge_size) == 0), "Number of patches must be multiple of merge_size squared"
             h_patches = merge_size
             w_patches = patches // merge_size
             return h_patches, w_patches

From c216a0c11e78277a2e061f171fbefa6846dd932a Mon Sep 17 00:00:00 2001
From: Hongjian Zhang <hirokenovo@gmail.com>
Date: Thu, 20 Nov 2025 11:14:05 +0800
Subject: [PATCH 09/35] chore: Optimize code structure and add documentation

Signed-off-by: Hongjian Zhang <hirokenovo@gmail.com>
---
 docs/design/torch_compile.md             |  6 +++++
 vllm/config/vllm.py                      |  4 ---
 vllm/model_executor/models/qwen2_5_vl.py | 32 ++++++++++++------------
 3 files changed, 22 insertions(+), 20 deletions(-)

diff --git a/docs/design/torch_compile.md b/docs/design/torch_compile.md
index 4dc0da0c7d65..3cef2165543e 100644
--- a/docs/design/torch_compile.md
+++ b/docs/design/torch_compile.md
@@ -253,6 +253,12 @@ By default, vLLM will try to determine a set of sizes to capture cudagraph. You
 vllm serve meta-llama/Llama-3.2-1B \
   --compilation-config '{"cudagraph_capture_sizes": [1, 2, 4, 8]}'
 ```
+Similarly, for `Qwen2.5-VL` series models, you can specify the capture sizes for the vision transformer (ViT) using `vit_cudagraph_capture_sizes`. Each capture size should be a multiple of the square of the vision config's `spatial_merge_size`. Note that ViT DP mode is **not supported**. You can use `--compilation-config '{"vit_cudagraph_capture_sizes": []}'` to disable CUDA graphs for the ViT part only, or use `--enforce-eager` to disable CUDA graphs entirely.
+
+```bash
+vllm serve Qwen/Qwen2.5-VL-3B-Instruct \
+  --compilation-config '{"vit_cudagraph_capture_sizes": [512, 1024]}'
+```
 
 Then it will only capture cudagraph for the specified sizes. It can be useful to have fine-grained control over the cudagraph capture.
 
diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py
index fc3a421a3d76..84a0c7686054 100644
--- a/vllm/config/vllm.py
+++ b/vllm/config/vllm.py
@@ -1360,10 +1360,6 @@ def _set_vit_cudagraph_sizes(self):
         ):
             # determine the vit_cudagraph_capture_sizes
             if self.compilation_config.vit_cudagraph_capture_sizes is not None:
-                assert len(self.compilation_config.vit_cudagraph_capture_sizes) > 0, (
-                    "vit_cudagraph_capture_sizes should contain at least one element "
-                    "when using cuda graph."
-                )
                 # de-duplicate the sizes provided by the config
                 dedup_sizes = list(set(self.compilation_config.vit_cudagraph_capture_sizes))
                 vit_cudagraph_capture_sizes = dedup_sizes
diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py
index e4976fb2e416..c6c7a1adb0ce 100644
--- a/vllm/model_executor/models/qwen2_5_vl.py
+++ b/vllm/model_executor/models/qwen2_5_vl.py
@@ -866,22 +866,22 @@ def forward(
         reverse_indices = reverse_indices.to(
             device=hidden_states.device, non_blocking=True
         )
-        original_hidden_states = hidden_states  # 这只是引用，不是拷贝
-        # Step 2: 执行一些转换操作（这些会创建新张量）
-        tmp = original_hidden_states.reshape(seq_len // self.spatial_merge_unit, self.spatial_merge_unit, -1)
-        tmp = tmp[window_index, :, :]
-        tmp = tmp.reshape(seq_len, -1)
-        tmp = tmp.unsqueeze(1)
-        # Step 3: 将结果拷贝回原始张量的显存地址中（这是原地拷贝！）
-        original_storage = original_hidden_states.storage()
-        tmp_storage = tmp.storage()
-        original_storage.copy_(tmp_storage)
-
-        # Step 4: 创建一个使用原始显存、具有新 shape 的 view
-        # 条件：original numel 必须等于新 shape 的总元素数
-        new_shape = tmp.shape  # (seq_len, 1, new_hidden_dim)
-        hidden_states = original_hidden_states.view(new_shape)
-        # 现在 hidden_states.shape == new_shape，且使用和 original 相同的显存
+
+        original_hidden_states = hidden_states
+        hidden_states = hidden_states.reshape(
+            seq_len // self.spatial_merge_unit, self.spatial_merge_unit, -1
+        )
+        hidden_states = hidden_states[window_index, :, :]
+        hidden_states = hidden_states.reshape(seq_len, -1)
+        hidden_states = hidden_states.unsqueeze(1)
+
+        if self._persistent_hidden_states_buffer is not None and \
+            fwd_ctx and fwd_ctx.cudagraph_runtime_mode == CUDAGraphMode.PIECEWISE:
+            # The above operations will produce temporary new tensors.
+            # That is not friendly to cudagraphs, so we need to copy them back to the persistent buffer
+            original_hidden_states = original_hidden_states.view(hidden_states.shape)
+            original_hidden_states.copy_(hidden_states)
+            hidden_states = original_hidden_states
 
         with set_is_first_graph_in_sequence(False), set_is_last_graph_in_sequence(False):
             for layer_num, blk in enumerate(self.blocks):

From c85b49b2fbf3a4048e883e02518bf3cd46ced105 Mon Sep 17 00:00:00 2001
From: Hongjian Zhang <hirokenovo@gmail.com>
Date: Thu, 20 Nov 2025 14:01:16 +0800
Subject: [PATCH 10/35] chore: rebase to v0.11.1

Signed-off-by: Hongjian Zhang <hirokenovo@gmail.com>
---
 docs/design/torch_compile.md       |  5 ++---
 vllm/config/vllm.py                | 34 ++++++++++++++++++++++++++++++
 vllm/v1/worker/gpu_model_runner.py |  6 +++---
 3 files changed, 39 insertions(+), 6 deletions(-)

diff --git a/docs/design/torch_compile.md b/docs/design/torch_compile.md
index 3cef2165543e..041f029294e4 100644
--- a/docs/design/torch_compile.md
+++ b/docs/design/torch_compile.md
@@ -253,11 +253,10 @@ By default, vLLM will try to determine a set of sizes to capture cudagraph. You
 vllm serve meta-llama/Llama-3.2-1B \
   --compilation-config '{"cudagraph_capture_sizes": [1, 2, 4, 8]}'
 ```
-Similarly, For `Qwen2.5-VL` series model, you can specify the capture sizes for the vision transformer (ViT) using `vit_cudagraph_capture_sizes`, the capture sizes should be multiples of the square of `merge_size`. Note that ViT DP mode is **not supported**. You can use `--compilation-config '{"vit_cudagraph_capture_sizes": []}'` to disable only the ViT part of the CUDA graph, or use `--enforce-eager` to disable the entire CUDA graph.
-
+Similarly, for `Qwen2.5-VL` series models, you can specify the capture sizes for the vision transformer (ViT) using `vit_cudagraph_capture_sizes`; the capture sizes should be multiples of the square of `merge_size`. Note that ViT DP mode is **not supported**. By default, this is disabled because `compile_mm_encoder` is `False`. To enable it and specify capture sizes, you can do the following:
 ```bash
 vllm serve Qwen/Qwen2.5-VL-3B-Instruct \
-  --compilation-config '{"vit_cudagraph_capture_sizes": [512, 1024]}'
+  --compilation-config '{"compile_mm_encoder": true, "vit_cudagraph_capture_sizes": [512, 1024]}'
 ```
 
 Then it will only capture cudagraph for the specified sizes. It can be useful to have fine-grained control over the cudagraph capture.
diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py
index 84a0c7686054..f2b522a714a3 100644
--- a/vllm/config/vllm.py
+++ b/vllm/config/vllm.py
@@ -1351,10 +1351,44 @@ def _set_compile_ranges(self):
             computed_compile_ranges_split_points
         )
     def _set_vit_cudagraph_sizes(self):
+        """Sets the CUDA graph capture sizes for the Vision Transformer (ViT).
+
+        This method determines the batch sizes for which ViT CUDA graphs will be
+        captured. CUDA graphs improve performance by reducing kernel launch
+        overhead for the vision encoder.
+
+        The logic is as follows:
+        1.  The feature is only enabled if all of the following conditions are met:
+            - Eager mode is not enforced.
+            - CUDA graph mode is enabled.
+            - The multimodal encoder compilation is enabled.
+            - A multimodal config is present.
+            - The multimodal encoder tensor parallelism mode is not "data".
+            If these conditions are not met, the list of capture sizes will be empty,
+            effectively disabling ViT CUDA graphs.
+
+        2.  If the user has explicitly provided `vit_cudagraph_capture_sizes` in the
+            compilation config, those sizes are used. The list is de-duplicated
+            and sorted in ascending order.
+
+        3.  If no sizes are provided by the user, a default list of sizes is
+            generated up to a maximum of 5120. The default sizes are:
+            [16, 32, 64, 128, 256] + list(range(512, 2048, 64)) + list(
+            range(2048, 5120 + 1, 128))
+
+        The final list of sizes is stored in
+        `self.compilation_config.vit_cudagraph_capture_sizes`.
+
+        - If a batch's size matches or is smaller than a captured size, the
+          closest captured graph is used.
+        - If a batch's size is larger than the largest captured size, a CUDA
+          graph will not be used for that batch.
+        """
         if (
             self.model_config is not None
             and not self.model_config.enforce_eager
             and self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE
+            and self.compilation_config.compile_mm_encoder
             and self.model_config.multimodal_config is not None
             and self.model_config.multimodal_config.mm_encoder_tp_mode != "data"
         ):
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 8fc731e245c9..ed87b8a2bc63 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -2490,7 +2490,7 @@ def _execute_mm_encoder(
                     ), self.timed_encoder_operation(
                         should_time, mm_lora_refs, current_item_idx, num_items
                     ):
-                    curr_group_outputs = model.get_multimodal_embeddings(**mm_kwargs_group)
+                    curr_group_outputs = model.embed_multimodal(**mm_kwargs_group)
                 # Remove the padded items before sanity check
                 if original_num_imgs != -1:
                     curr_group_outputs = curr_group_outputs[:original_num_imgs]
@@ -4652,7 +4652,7 @@ def _get_dummy_vit_input(self, num_image_tokens: int, img_feature_dim: int) -> B
         Returns:
             A BatchedTensorInputs dictionary containing `pixel_values` and
             `image_grid_thw` that can be passed as kwargs to
-            `get_multimodal_embeddings`.
+            `embed_multimodal`.
         """
         def _get_dummy_h_w_patches(patches: int):
             merge_size = getattr(self.model_config.hf_config.vision_config, "spatial_merge_size", 1)
@@ -5249,7 +5249,7 @@ def _dummy_mm_encoder_run(
                     batch_descriptor=batch_descriptor,
                 ),
             ):
-                self.model.get_multimodal_embeddings(**dummy_mm_inputs)
+                self.model.embed_multimodal(**dummy_mm_inputs)
 
     def profile_run(self) -> None:
         # Profile with multimodal encoder & encoder cache.

From f1f26d05558980eadf3b4acb9a259f108bcff1f1 Mon Sep 17 00:00:00 2001
From: Hongjian Zhang <hirokenovo@gmail.com>
Date: Thu, 20 Nov 2025 17:05:02 +0800
Subject: [PATCH 11/35] chore: ruff format

Signed-off-by: Hongjian Zhang <hirokenovo@gmail.com>
---
 vllm/compilation/backends.py             |  4 ++
 vllm/config/vllm.py                      | 14 ++--
 vllm/engine/arg_utils.py                 | 11 ++--
 vllm/model_executor/models/qwen2_5_vl.py | 56 ++++++++++++----
 vllm/v1/worker/gpu_model_runner.py       | 84 +++++++++++++++---------
 5 files changed, 116 insertions(+), 53 deletions(-)

diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py
index ce2cacd0b7cd..20e74d619adf 100644
--- a/vllm/compilation/backends.py
+++ b/vllm/compilation/backends.py
@@ -54,6 +54,7 @@
 # in **vit** piecewise compilation.
 _is_last_graph_in_vit_sequence: bool = True
 
+
 @contextmanager
 def set_is_last_graph_in_sequence(is_last: bool):
     """Context manager to indicate if the current graph being compiled
@@ -67,10 +68,12 @@ def set_is_last_graph_in_sequence(is_last: bool):
     finally:
         _is_last_graph_in_vit_sequence = original_value
 
+
 # A global flag to indicate if the current graph being compiled
 # is the first one in a sequence of graphs (e.g., a sequence of blocks).
 _is_first_graph_in_vit_sequence: bool = True
 
+
 @contextmanager
 def set_is_first_graph_in_sequence(is_first: bool):
     """Context manager to indicate if the current graph being compiled
@@ -121,6 +124,7 @@ def copy_and_call(*args: Any) -> Any:
     return copy_and_call
 
 
+
 def make_compiler(compilation_config: CompilationConfig) -> CompilerInterface:
     assert not envs.VLLM_USE_MEGA_AOT_ARTIFACT or envs.VLLM_USE_STANDALONE_COMPILE, (
         "VLLM_USE_MEGA_AOT_ARTIFACT=1 requires VLLM_USE_STANDALONE_COMPILE=1"
diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py
index f2b522a714a3..f96e9733aecd 100644
--- a/vllm/config/vllm.py
+++ b/vllm/config/vllm.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+import bisect
 import copy
 import getpass
 import json
@@ -15,7 +16,6 @@
 from functools import lru_cache
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, TypeVar, get_args
-import bisect
 
 import torch
 from pydantic import ConfigDict, Field, model_validator
@@ -1395,14 +1395,18 @@ def _set_vit_cudagraph_sizes(self):
             # determine the vit_cudagraph_capture_sizes
             if self.compilation_config.vit_cudagraph_capture_sizes is not None:
                 # de-duplicate the sizes provided by the config
-                dedup_sizes = list(set(self.compilation_config.vit_cudagraph_capture_sizes))
+                dedup_sizes = list(
+                    set(self.compilation_config.vit_cudagraph_capture_sizes)
+                )
                 vit_cudagraph_capture_sizes = dedup_sizes
                 # sort to make sure the sizes are in ascending order
                 vit_cudagraph_capture_sizes.sort()
             else:
                 max_vit_cudagraph_capture_size = 5120
                 vit_cudagraph_capture_sizes = [
-                    i for i in [16, 32, 64, 128, 256] if i <= max_vit_cudagraph_capture_size
+                    i
+                    for i in [16, 32, 64, 128, 256]
+                    if i <= max_vit_cudagraph_capture_size
                 ]
                 if max_vit_cudagraph_capture_size >= 1024:
                     # Step size 64 for small batch sizes, up to 2048(not included)
@@ -1414,7 +1418,9 @@ def _set_vit_cudagraph_sizes(self):
                     vit_cudagraph_capture_sizes += list(
                         range(2048, max_vit_cudagraph_capture_size + 1, 128)
                     )
-            self.compilation_config.vit_cudagraph_capture_sizes = vit_cudagraph_capture_sizes
+            self.compilation_config.vit_cudagraph_capture_sizes = (
+                vit_cudagraph_capture_sizes
+            )
         else:
             # no cudagraph in use
             self.compilation_config.vit_cudagraph_capture_sizes = []
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index b6d901581f6c..9733b0f26ec2 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -1152,7 +1152,8 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
             "--cudagraph-capture-sizes", **compilation_kwargs["cudagraph_capture_sizes"]
         )
         compilation_group.add_argument(
-            "--vit-cudagraph-capture-sizes", **compilation_kwargs["vit_cudagraph_capture_sizes"]
+            "--vit-cudagraph-capture-sizes",
+            **compilation_kwargs["vit_cudagraph_capture_sizes"],
         )
         compilation_group.add_argument(
             "--max-cudagraph-capture-size",
@@ -1743,15 +1744,17 @@ def create_engine_config(
                     "cudagraph_capture_sizes are mutually exclusive"
                 )
             compilation_config.cudagraph_capture_sizes = self.cudagraph_capture_sizes
-        
+
         if self.vit_cudagraph_capture_sizes is not None:
             if compilation_config.vit_cudagraph_capture_sizes is not None:
                 raise ValueError(
                     "vit_cudagraph_capture_sizes and compilation_config."
                     "vit_cudagraph_capture_sizes are mutually exclusive"
                 )
-            compilation_config.vit_cudagraph_capture_sizes = self.vit_cudagraph_capture_sizes
-            
+            compilation_config.vit_cudagraph_capture_sizes = (
+                self.vit_cudagraph_capture_sizes
+            )
+
         if self.max_cudagraph_capture_size is not None:
             if compilation_config.max_cudagraph_capture_size is not None:
                 raise ValueError(
diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py
index c6c7a1adb0ce..02e6af8334ac 100644
--- a/vllm/model_executor/models/qwen2_5_vl.py
+++ b/vllm/model_executor/models/qwen2_5_vl.py
@@ -644,12 +644,18 @@ def __init__(
         self._persistent_hidden_states_buffer: torch.Tensor | None = None
         self._persistent_rotary_pos_emb_buffer: torch.Tensor | None = None
         if vllm_config.compilation_config.vit_cudagraph_capture_sizes:
-            max_compile_size = vllm_config.compilation_config.vit_cudagraph_capture_sizes[-1]
+            max_compile_size = (
+                vllm_config.compilation_config.vit_cudagraph_capture_sizes[-1]
+            )
             self._persistent_hidden_states_buffer = torch.empty(
-                (max_compile_size, self.patch_embed.proj.input_size), device=self.device, dtype=self.dtype
+                (max_compile_size, self.patch_embed.proj.input_size),
+                device=self.device,
+                dtype=self.dtype,
             )
             self._persistent_rotary_pos_emb_buffer = torch.empty(
-                (max_compile_size, head_dim // 2), device=self.device, dtype=torch.float32
+                (max_compile_size, head_dim // 2),
+                device=self.device,
+                dtype=torch.float32,
             )
 
     @property
@@ -796,14 +802,21 @@ def forward(
         cu_seqlens: list = []
 
         fwd_ctx = get_forward_context()
-        if self._persistent_hidden_states_buffer is not None and \
-            fwd_ctx and fwd_ctx.cudagraph_runtime_mode == CUDAGraphMode.PIECEWISE:
+        if (
+            self._persistent_hidden_states_buffer is not None
+            and fwd_ctx
+            and fwd_ctx.cudagraph_runtime_mode == CUDAGraphMode.PIECEWISE
+        ):
             hidden_states = self._persistent_hidden_states_buffer[:seq_len]
             hidden_states.copy_(x, non_blocking=True)
         else:
             hidden_states = x.to(device=self.device, dtype=self.dtype)
 
-        from vllm.compilation.backends import set_is_first_graph_in_sequence, set_is_last_graph_in_sequence
+        from vllm.compilation.backends import (
+            set_is_first_graph_in_sequence,
+            set_is_last_graph_in_sequence,
+        )
+
         with set_is_first_graph_in_sequence(True), set_is_last_graph_in_sequence(False):
             hidden_states = self.patch_embed(hidden_states)
 
@@ -859,9 +872,14 @@ def forward(
             device=self.device, non_blocking=True
         )
         rotary_pos_emb = rotary_pos_emb.to(device=self.device, non_blocking=True)
-        if self._persistent_rotary_pos_emb_buffer is not None and \
-            fwd_ctx and fwd_ctx.cudagraph_runtime_mode == CUDAGraphMode.PIECEWISE:
-            rotary_pos_emb = self._persistent_rotary_pos_emb_buffer[:seq_len].copy_(rotary_pos_emb)
+        if (
+            self._persistent_rotary_pos_emb_buffer is not None
+            and fwd_ctx
+            and fwd_ctx.cudagraph_runtime_mode == CUDAGraphMode.PIECEWISE
+        ):
+            rotary_pos_emb = self._persistent_rotary_pos_emb_buffer[:seq_len].copy_(
+                rotary_pos_emb
+            )
         window_index = window_index.to(device=hidden_states.device, non_blocking=True)
         reverse_indices = reverse_indices.to(
             device=hidden_states.device, non_blocking=True
@@ -875,15 +893,22 @@ def forward(
         hidden_states = hidden_states.reshape(seq_len, -1)
         hidden_states = hidden_states.unsqueeze(1)
 
-        if self._persistent_hidden_states_buffer is not None and \
-            fwd_ctx and fwd_ctx.cudagraph_runtime_mode == CUDAGraphMode.PIECEWISE:
+        if (
+            self._persistent_hidden_states_buffer is not None
+            and fwd_ctx
+            and fwd_ctx.cudagraph_runtime_mode == CUDAGraphMode.PIECEWISE
+        ):
             # The above operations will produce temporary new tensors.
-            # That is not friendly to cudagraphs, so we need to copy them back to the persistent buffer
+            # That is not friendly to cudagraphs,
+            # so we need to copy them back to the persistent buffer
             original_hidden_states = original_hidden_states.view(hidden_states.shape)
             original_hidden_states.copy_(hidden_states)
             hidden_states = original_hidden_states
 
-        with set_is_first_graph_in_sequence(False), set_is_last_graph_in_sequence(False):
+        with (
+            set_is_first_graph_in_sequence(False),
+            set_is_last_graph_in_sequence(False),
+        ):
             for layer_num, blk in enumerate(self.blocks):
                 if layer_num in self.fullatt_block_indexes:
                     cu_seqlens_now = cu_seqlens
@@ -1245,7 +1270,10 @@ def _process_image_input(
                 with set_forward_context(None, self.vllm_config):
                     if self.use_data_parallel:
                         return run_dp_sharded_mrope_vision_model(
-                            self.visual, pixel_values, grid_thw_list, rope_type="rope_3d"
+                            self.visual,
+                            pixel_values,
+                            grid_thw_list,
+                            rope_type="rope_3d",
                         )
                     else:
                         image_embeds = self.visual(pixel_values, grid_thw=grid_thw_list)
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index ed87b8a2bc63..775c9523553f 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -553,6 +553,7 @@ def __init__(
                 self.compilation_config.cudagraph_capture_sizes
             )
         # self.vit_cudagraph_batch_sizes sorts in ascending order.
+        self.vit_cudagraph_batch_sizes: list[int] | None
         if (
             self.compilation_config.vit_cudagraph_capture_sizes
             and self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE
@@ -2444,13 +2445,20 @@ def _execute_mm_encoder(
                     num_tokens = pixel_values.shape[0]
 
                     # Pad to the size expected by CUDA graph
-                    padded_num_tokens = self.vllm_config.pad_for_vit_cudagraph(num_tokens)
+                    padded_num_tokens = self.vllm_config.pad_for_vit_cudagraph(
+                        num_tokens
+                    )
 
                     if padded_num_tokens > num_tokens:
-                        assert(self.pixel_values_buffer is not None and self.image_grid_thw_buffer is not None)
-                        
-                        self.pixel_values_buffer[:num_tokens].copy_(pixel_values) # type: ignore
-                        mm_kwargs_group["pixel_values"] = self.pixel_values_buffer[:padded_num_tokens]
+                        assert (
+                            self.pixel_values_buffer is not None
+                            and self.image_grid_thw_buffer is not None
+                        )
+
+                        self.pixel_values_buffer[:num_tokens].copy_(pixel_values)  # type: ignore
+                        mm_kwargs_group["pixel_values"] = self.pixel_values_buffer[
+                            :padded_num_tokens
+                        ]
 
                         # Update image_grid_thw to account for padding
                         if "image_grid_thw" in mm_kwargs_group:
@@ -2458,21 +2466,29 @@ def _execute_mm_encoder(
                             num_images = image_grid_thw.shape[0]
                             original_num_imgs = num_images
                             padding_amount = padded_num_tokens - num_tokens
-                            
+
                             # Treat padding as a new virtual image.
-                            # Assuming a fixed patch grid logic where height is merge_size.
-                            merge_size = getattr(self.model_config.hf_config.vision_config, "spatial_merge_size", 1)
-                            assert(padding_amount % (merge_size * merge_size) == 0)
+                            # Assuming a fixed patch size where height is merge_size.
+                            merge_size = getattr(
+                                self.model_config.hf_config.vision_config,
+                                "spatial_merge_size",
+                                1,
+                            )
+                            assert padding_amount % (merge_size * merge_size) == 0
                             h_patches = merge_size
                             w_patches = padding_amount // h_patches
 
-                            self.image_grid_thw_buffer[:num_images].copy_(image_grid_thw)
+                            self.image_grid_thw_buffer[:num_images].copy_(
+                                image_grid_thw
+                            )
                             self.image_grid_thw_buffer[num_images] = torch.tensor(
                                 [1, h_patches, w_patches],
                                 dtype=torch.long,
-                                device=self.device
+                                device=self.device,
+                            )
+                            mm_kwargs_group["image_grid_thw"] = (
+                                self.image_grid_thw_buffer[: num_images + 1]
                             )
-                            mm_kwargs_group["image_grid_thw"] = self.image_grid_thw_buffer[:num_images + 1]
 
                 # get batch_descriptor from dispatcher
                 batch_descriptor = BatchDescriptor(
@@ -4637,7 +4653,9 @@ def rand_inputs_embeds() -> torch.Tensor:
             yield
             inputs_embeds.fill_(0)
 
-    def _get_dummy_vit_input(self, num_image_tokens: int, img_feature_dim: int) -> BatchedTensorInputs:
+    def _get_dummy_vit_input(
+        self, num_image_tokens: int, img_feature_dim: int
+    ) -> BatchedTensorInputs:
         """
         Generates dummy multimodal inputs for a single image, with a controllable
         number of resulting image tokens for a Vision Transformer (ViT) like model,
@@ -4654,9 +4672,14 @@ def _get_dummy_vit_input(self, num_image_tokens: int, img_feature_dim: int) -> B
             `image_grid_thw` that can be passed as kwargs to
             `embed_multimodal`.
         """
+
         def _get_dummy_h_w_patches(patches: int):
-            merge_size = getattr(self.model_config.hf_config.vision_config, "spatial_merge_size", 1)
-            assert(patches % (merge_size * merge_size) == 0), "Number of patches must be multiple of merge_size squared"
+            merge_size = getattr(
+                self.model_config.hf_config.vision_config, "spatial_merge_size", 1
+            )
+            assert patches % (merge_size * merge_size) == 0, (
+                "Number of patches must be multiple of merge_size squared"
+            )
             h_patches = merge_size
             w_patches = patches // merge_size
             return h_patches, w_patches
@@ -4664,16 +4687,12 @@ def _get_dummy_h_w_patches(patches: int):
         # The first dimension of pixel_values corresponds to the total number of
         # tokens (patches).
         pixel_values = torch.zeros(
-            (num_image_tokens, img_feature_dim),
-            dtype=self.dtype,
-            device=self.device
+            (num_image_tokens, img_feature_dim), dtype=self.dtype, device=self.device
         )
 
         h_patches, w_patches = _get_dummy_h_w_patches(num_image_tokens)
         image_grid_thw = torch.tensor(
-            [[1, h_patches, w_patches]],
-            dtype=torch.long,
-            device=self.device
+            [[1, h_patches, w_patches]], dtype=torch.long, device=self.device
         )
 
         return {
@@ -5216,17 +5235,17 @@ def _dummy_mm_encoder_run(
     ) -> None:
         if self.pixel_values_buffer is None:
             tmp_dummy_mm_inputs = self._get_mm_dummy_batch(
-                        "image",
-                        1,
-                    )
+                "image",
+                1,
+            )
             img_feature_dim = tmp_dummy_mm_inputs["pixel_values"].shape[1]
             self.pixel_values_buffer = torch.zeros(
                 (compilation_cases[0], img_feature_dim),
                 dtype=self.dtype,
-                device=self.device
+                device=self.device,
             )
-            self.image_grid_thw_buffer = torch.zeros((
-                512, 3), dtype=torch.long, device=self.device
+            self.image_grid_thw_buffer = torch.zeros(
+                (512, 3), dtype=torch.long, device=self.device
             )
         if is_global_first_rank():
             compilation_cases = tqdm(
@@ -5238,9 +5257,9 @@ def _dummy_mm_encoder_run(
         for capture_size in compilation_cases:
             dummy_mm_inputs = self._get_dummy_vit_input(capture_size, img_feature_dim)
             batch_descriptor = BatchDescriptor(
-                                num_tokens=capture_size,
-                                is_vit=True,
-                            )
+                num_tokens=capture_size,
+                is_vit=True,
+            )
             with (
                 set_forward_context(
                     None,
@@ -5462,7 +5481,10 @@ def _capture_cudagraphs(
                 num_active_loras=num_active_loras,
                 is_graph_capturing=True,
             )
-        if cudagraph_runtime_mode == CUDAGraphMode.PIECEWISE and self.supports_mm_inputs:
+        if (
+            cudagraph_runtime_mode == CUDAGraphMode.PIECEWISE
+            and self.supports_mm_inputs
+        ):
             vit_capture_sizes = self.vit_cudagraph_batch_sizes
             if vit_capture_sizes:
                 compilation_cases_vit = list(reversed(vit_capture_sizes))

From ef269187bdce52f9706a494d7e643da729d52b71 Mon Sep 17 00:00:00 2001
From: Hongjian Zhang <hirokenovo@gmail.com>
Date: Sun, 23 Nov 2025 20:39:02 +0800
Subject: [PATCH 12/35] feat: Update vit_cudagraph capture size logic

Signed-off-by: Hongjian Zhang <hirokenovo@gmail.com>
---
 vllm/config/vllm.py                | 17 +++++++++++++++--
 vllm/v1/worker/gpu_model_runner.py | 14 ++++++++++++--
 2 files changed, 27 insertions(+), 4 deletions(-)

diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py
index f96e9733aecd..54aab49ff2ff 100644
--- a/vllm/config/vllm.py
+++ b/vllm/config/vllm.py
@@ -1402,7 +1402,15 @@ def _set_vit_cudagraph_sizes(self):
                 # sort to make sure the sizes are in ascending order
                 vit_cudagraph_capture_sizes.sort()
             else:
-                max_vit_cudagraph_capture_size = 5120
+                from vllm.multimodal import MULTIMODAL_REGISTRY
+                from vllm.v1.core.encoder_cache_manager import compute_encoder_budget
+
+                encoder_compute_budget, _ = compute_encoder_budget(
+                    model_config=self.model_config,
+                    scheduler_config=self.scheduler_config,
+                    mm_registry=MULTIMODAL_REGISTRY,
+                )
+                max_vit_cudagraph_capture_size = min(encoder_compute_budget, 32768)
                 vit_cudagraph_capture_sizes = [
                     i
                     for i in [16, 32, 64, 128, 256]
@@ -1416,7 +1424,12 @@ def _set_vit_cudagraph_sizes(self):
                 if max_vit_cudagraph_capture_size >= 2048:
                     # Step size 128 for larger batch sizes
                     vit_cudagraph_capture_sizes += list(
-                        range(2048, max_vit_cudagraph_capture_size + 1, 128)
+                        range(2048, min(max_vit_cudagraph_capture_size + 1, 4096), 128)
+                    )
+                if max_vit_cudagraph_capture_size >= 4096:
+                    # Step size 256 for largest batch sizes
+                    vit_cudagraph_capture_sizes += list(
+                        range(4096, max_vit_cudagraph_capture_size + 1, 256)
                     )
             self.compilation_config.vit_cudagraph_capture_sizes = (
                 vit_cudagraph_capture_sizes
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 775c9523553f..9a5ebe562dae 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -2440,7 +2440,7 @@ def _execute_mm_encoder(
                 # size is dynamic depending on the input multimodal items.
                 original_num_imgs = -1
                 padded_num_tokens = -1
-                if "pixel_values" in mm_kwargs_group:
+                if self.vit_cudagraph_batch_sizes and "pixel_values" in mm_kwargs_group:
                     pixel_values = mm_kwargs_group["pixel_values"]
                     num_tokens = pixel_values.shape[0]
 
@@ -2477,7 +2477,17 @@ def _execute_mm_encoder(
                             assert padding_amount % (merge_size * merge_size) == 0
                             h_patches = merge_size
                             w_patches = padding_amount // h_patches
-
+                            if num_images + 1 > self.image_grid_thw_buffer.shape[0]:
+                                new_size = max(
+                                    self.image_grid_thw_buffer.shape[0] * 2,
+                                    num_images + 1,
+                                )
+                                new_buffer = torch.zeros(
+                                    (new_size, 3),
+                                    dtype=torch.long,
+                                    device=self.device,
+                                )
+                                self.image_grid_thw_buffer = new_buffer
                             self.image_grid_thw_buffer[:num_images].copy_(
                                 image_grid_thw
                             )

From 2872257dbf4b2975d343f16099059e032db5b33d Mon Sep 17 00:00:00 2001
From: Lukas Geiger <lukas.geiger94@gmail.com>
Date: Wed, 29 Oct 2025 11:48:06 +0000
Subject: [PATCH 13/35] [Model][Qwen3VL] Add `torch.compile` support for
 Qwen3VL

Signed-off-by: Lukas Geiger <lukas.geiger94@gmail.com>

Signed-off-by: Hongjian Zhang <hirokenovo@gmail.com>
---
 vllm/model_executor/models/qwen3_vl.py | 142 +++++++++++++++----------
 1 file changed, 84 insertions(+), 58 deletions(-)

diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py
index 97754833953f..c6c818941128 100644
--- a/vllm/model_executor/models/qwen3_vl.py
+++ b/vllm/model_executor/models/qwen3_vl.py
@@ -52,6 +52,7 @@
 from vllm.config import VllmConfig
 from vllm.config.multimodal import BaseDummyOptions, VideoDummyOptions
 from vllm.distributed import get_pp_group
+from vllm.forward_context import set_forward_context
 from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import _ACTIVATION_REGISTRY
 from vllm.model_executor.layers.conv import Conv3dLayer
@@ -139,6 +140,7 @@
 DUMMY_VIDEO_NUM_FRAMES = 2048
 
 
+@support_torch_compile(dynamic_arg_dims={"x": 0})
 class Qwen3_VisionPatchEmbed(nn.Module):
     def __init__(
         self,
@@ -205,6 +207,10 @@ def forward(self, x: torch.Tensor):
         return mlp_output
 
 
+@support_torch_compile(
+    dynamic_arg_dims={"x": 0, "cu_seqlens": 0, "rotary_pos_emb": 0, "seqlens": 0},
+    mark_unbacked_dims={"seqlens": 0},
+)
 class Qwen3_VisionBlock(nn.Module):
     def __init__(
         self,
@@ -257,6 +263,7 @@ def forward(
         return x
 
 
+@support_torch_compile(dynamic_arg_dims={"x": 0})
 class Qwen3_VisionPatchMerger(nn.Module):
     def __init__(
         self,
@@ -286,6 +293,7 @@ def __init__(
             quant_config=quant_config,
             prefix=f"{prefix}.linear_fc1",
             disable_tp=use_data_parallel,
+            return_bias=False,
         )
         self.act_fn = nn.GELU()
         self.linear_fc2 = RowParallelLinear(
@@ -295,6 +303,7 @@ def __init__(
             quant_config=quant_config,
             prefix=f"{prefix}.linear_fc2",
             disable_tp=use_data_parallel,
+            return_bias=False,
         )
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
@@ -303,9 +312,9 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
         else:
             x = self.norm(x).view(-1, self.hidden_size)
 
-        x_parallel, _ = self.linear_fc1(x)
+        x_parallel = self.linear_fc1(x)
         x_parallel = self.act_fn(x_parallel)
-        out, _ = self.linear_fc2(x_parallel)
+        out = self.linear_fc2(x_parallel)
         return out
 
 
@@ -333,13 +342,18 @@ def __init__(
         self.out_hidden_size = vision_config.out_hidden_size * (
             1 + len(self.deepstack_visual_indexes)
         )
-
-        self.patch_embed = Qwen3_VisionPatchEmbed(
-            patch_size=self.patch_size,
-            temporal_patch_size=self.temporal_patch_size,
-            in_channels=vision_config.in_channels,
-            hidden_size=self.hidden_size,
-        )
+        # TODO[@lucaskabela]: Investigate fixing this usage
+        # see https://github.com/vllm-project/vllm/issues/27044
+        # DO NOT MOVE THIS IMPORT
+        from vllm.compilation.backends import set_model_tag
+
+        with set_model_tag("Qwen3_VisionPatchEmbed", is_encoder=True):
+            self.patch_embed = Qwen3_VisionPatchEmbed(
+                patch_size=self.patch_size,
+                temporal_patch_size=self.temporal_patch_size,
+                in_channels=vision_config.in_channels,
+                hidden_size=self.hidden_size,
+            )
 
         self.pos_embed = nn.Embedding(self.num_position_embeddings, self.hidden_size)
 
@@ -352,29 +366,31 @@ def __init__(
             rope_parameters={"partial_rotary_factor": 0.5},
         )
 
-        self.merger = Qwen3_VisionPatchMerger(
-            d_model=vision_config.out_hidden_size,
-            context_dim=self.hidden_size,
-            norm_layer=norm_layer,
-            spatial_merge_size=self.spatial_merge_size,
-            quant_config=quant_config,
-            prefix=f"{prefix}.merger",
-        )
+        with set_model_tag("Qwen3_VisionPatchMerger", is_encoder=True):
+            self.merger = Qwen3_VisionPatchMerger(
+                d_model=vision_config.out_hidden_size,
+                context_dim=self.hidden_size,
+                norm_layer=norm_layer,
+                spatial_merge_size=self.spatial_merge_size,
+                quant_config=quant_config,
+                prefix=f"{prefix}.merger",
+            )
 
-        self.deepstack_merger_list = nn.ModuleList(
-            [
-                Qwen3_VisionPatchMerger(
-                    d_model=vision_config.out_hidden_size,
-                    context_dim=self.hidden_size,
-                    spatial_merge_size=self.spatial_merge_size,
-                    use_postshuffle_norm=True,
-                    norm_layer=norm_layer,
-                    quant_config=quant_config,
-                    prefix=f"{prefix}.deepstack_merger_list.{layer_idx}",
-                )
-                for layer_idx in range(len(self.deepstack_visual_indexes))
-            ]
-        )
+        with set_model_tag("Qwen3_VisionPatchMerger_postshuffle_norm", is_encoder=True):
+            self.deepstack_merger_list = nn.ModuleList(
+                [
+                    Qwen3_VisionPatchMerger(
+                        d_model=vision_config.out_hidden_size,
+                        context_dim=self.hidden_size,
+                        spatial_merge_size=self.spatial_merge_size,
+                        use_postshuffle_norm=True,
+                        norm_layer=norm_layer,
+                        quant_config=quant_config,
+                        prefix=f"{prefix}.deepstack_merger_list.{layer_idx}",
+                    )
+                    for layer_idx in range(len(self.deepstack_visual_indexes))
+                ]
+            )
 
         self.attn_backend = get_vit_attn_backend(
             head_size=head_dim,
@@ -389,20 +405,21 @@ def __init__(
             raise RuntimeError(
                 f"Qwen3-VL does not support {self.attn_backend} backend now."
             )
-        self.blocks = nn.ModuleList(
-            [
-                Qwen3_VisionBlock(
-                    dim=self.hidden_size,
-                    num_heads=self.num_heads,
-                    mlp_hidden_dim=vision_config.intermediate_size,
-                    act_fn=_ACTIVATION_REGISTRY[vision_config.hidden_act],
-                    norm_layer=norm_layer,
-                    quant_config=quant_config,
-                    prefix=f"{prefix}.blocks.{layer_idx}",
-                )
-                for layer_idx in range(vision_config.depth)
-            ]
-        )
+        with set_model_tag("Qwen3_VisionBlock", is_encoder=True):
+            self.blocks = nn.ModuleList(
+                [
+                    Qwen3_VisionBlock(
+                        dim=self.hidden_size,
+                        num_heads=self.num_heads,
+                        mlp_hidden_dim=vision_config.intermediate_size,
+                        act_fn=_ACTIVATION_REGISTRY[vision_config.hidden_act],
+                        norm_layer=norm_layer,
+                        quant_config=quant_config,
+                        prefix=f"{prefix}.blocks.{layer_idx}",
+                    )
+                    for layer_idx in range(vision_config.depth)
+                ]
+            )
 
     @property
     def dtype(self) -> torch.dtype:
@@ -1257,6 +1274,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "model"):
         multimodal_config = vllm_config.model_config.multimodal_config
 
         self.config = config
+        self.vllm_config = vllm_config
         self.multimodal_config = multimodal_config
         self.use_data_parallel = multimodal_config.mm_encoder_tp_mode == "data"
         self.video_pruning_rate = multimodal_config.video_pruning_rate
@@ -1409,17 +1427,19 @@ def _process_image_input(
     ) -> tuple[torch.Tensor, ...]:
         grid_thw = image_input["image_grid_thw"]
         assert grid_thw.ndim == 2
+        grid_thw_list = grid_thw.tolist()
 
         if image_input["type"] == "image_embeds":
             image_embeds = image_input["image_embeds"].type(self.visual.dtype)
         else:
             pixel_values = image_input["pixel_values"].type(self.visual.dtype)
-            if self.use_data_parallel:
-                return run_dp_sharded_mrope_vision_model(
-                    self.visual, pixel_values, grid_thw.tolist(), rope_type="rope_3d"
-                )
-            else:
-                image_embeds = self.visual(pixel_values, grid_thw=grid_thw)
+            with set_forward_context(None, self.vllm_config):
+                if self.use_data_parallel:
+                    return run_dp_sharded_mrope_vision_model(
+                        self.visual, pixel_values, grid_thw_list, rope_type="rope_3d"
+                    )
+                else:
+                    image_embeds = self.visual(pixel_values, grid_thw=grid_thw_list)
 
         # Split concatenated embeddings for each image item.
         merge_size = self.visual.spatial_merge_size
@@ -1431,6 +1451,7 @@ def _process_video_input(
     ) -> tuple[torch.Tensor, ...]:
         grid_thw = video_input["video_grid_thw"]
         assert grid_thw.ndim == 2
+        grid_thw_list = grid_thw.tolist()
 
         if video_input["type"] == "video_embeds":
             video_embeds = video_input["video_embeds"].type(self.visual.dtype)
@@ -1438,13 +1459,18 @@ def _process_video_input(
             pixel_values_videos = video_input["pixel_values_videos"].type(
                 self.visual.dtype
             )
-            if self.use_data_parallel:
-                grid_thw_list = grid_thw.tolist()
-                return run_dp_sharded_mrope_vision_model(
-                    self.visual, pixel_values_videos, grid_thw_list, rope_type="rope_3d"
-                )
-            else:
-                video_embeds = self.visual(pixel_values_videos, grid_thw=grid_thw)
+            with set_forward_context(None, self.vllm_config):
+                if self.use_data_parallel:
+                    return run_dp_sharded_mrope_vision_model(
+                        self.visual,
+                        pixel_values_videos,
+                        grid_thw_list,
+                        rope_type="rope_3d",
+                    )
+                else:
+                    video_embeds = self.visual(
+                        pixel_values_videos, grid_thw=grid_thw_list
+                    )
 
         # Split concatenated embeddings for each video item.
         merge_size = self.visual.spatial_merge_size

From 8bff371b913b89247e66321f0bebb3ecc471ef64 Mon Sep 17 00:00:00 2001
From: Hongjian Zhang <hirokenovo@gmail.com>
Date: Fri, 26 Dec 2025 21:15:50 +0800
Subject: [PATCH 14/35] feat: Enhance Qwen3VL with ViT CUDAGraph support

Signed-off-by: Hongjian Zhang <hirokenovo@gmail.com>
---
 vllm/model_executor/models/qwen3_vl.py | 107 ++++++++++++++++++++-----
 1 file changed, 88 insertions(+), 19 deletions(-)

diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py
index c6c818941128..328a58361ed5 100644
--- a/vllm/model_executor/models/qwen3_vl.py
+++ b/vllm/model_executor/models/qwen3_vl.py
@@ -49,10 +49,10 @@
 from transformers.video_utils import VideoMetadata
 
 from vllm.compilation.decorators import support_torch_compile
-from vllm.config import VllmConfig
+from vllm.config import CUDAGraphMode, MultiModalConfig, VllmConfig, get_current_vllm_config
 from vllm.config.multimodal import BaseDummyOptions, VideoDummyOptions
 from vllm.distributed import get_pp_group
-from vllm.forward_context import set_forward_context
+from vllm.forward_context import get_forward_context, set_forward_context
 from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import _ACTIVATION_REGISTRY
 from vllm.model_executor.layers.conv import Conv3dLayer
@@ -66,6 +66,7 @@
 from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.models.module_mapping import MultiModelKeys
+from vllm.model_executor.models.vision import should_torch_compile_mm_vit
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.evs import (
     compute_mrope_for_media,
@@ -210,6 +211,7 @@ def forward(self, x: torch.Tensor):
 @support_torch_compile(
     dynamic_arg_dims={"x": 0, "cu_seqlens": 0, "rotary_pos_emb": 0, "seqlens": 0},
     mark_unbacked_dims={"seqlens": 0},
+    enable_if=should_torch_compile_mm_vit,
 )
 class Qwen3_VisionBlock(nn.Module):
     def __init__(
@@ -263,7 +265,8 @@ def forward(
         return x
 
 
-@support_torch_compile(dynamic_arg_dims={"x": 0})
+@support_torch_compile(dynamic_arg_dims={"x": 0},
+    enable_if=should_torch_compile_mm_vit)
 class Qwen3_VisionPatchMerger(nn.Module):
     def __init__(
         self,
@@ -420,6 +423,17 @@ def __init__(
                     for layer_idx in range(vision_config.depth)
                 ]
             )
+        vllm_config: VllmConfig = get_current_vllm_config()
+        self._persistent_hidden_states_buffer: torch.Tensor | None = None
+        self._persistent_rotary_pos_emb_buffer: torch.Tensor | None = None
+        if vllm_config.compilation_config.vit_cudagraph_capture_sizes:
+            max_compile_size = vllm_config.compilation_config.vit_cudagraph_capture_sizes[-1]
+            self._persistent_hidden_states_buffer = torch.empty(
+                (max_compile_size, self.patch_embed.proj.input_size), device=self.device, dtype=self.dtype
+            )
+            self._persistent_rotary_pos_emb_buffer = torch.empty(
+                (max_compile_size, head_dim // 2), device=self.device, dtype=torch.float32
+            )
 
     @property
     def dtype(self) -> torch.dtype:
@@ -551,8 +565,25 @@ def forward(
         x: torch.Tensor,
         grid_thw: torch.Tensor | list[list[int]],
     ) -> torch.Tensor:
-        hidden_states = x.to(device=self.device, dtype=self.dtype, non_blocking=True)
-        hidden_states = self.patch_embed(hidden_states)
+        seq_len, _ = x.size()
+        fwd_ctx = get_forward_context()
+        if (
+            self._persistent_hidden_states_buffer is not None
+            and fwd_ctx
+            and fwd_ctx.cudagraph_runtime_mode == CUDAGraphMode.PIECEWISE
+        ):
+            hidden_states = self._persistent_hidden_states_buffer[:seq_len]
+            hidden_states.copy_(x, non_blocking=True)
+        else:
+            hidden_states = x.to(device=self.device, dtype=self.dtype, non_blocking=True)
+
+        from vllm.compilation.backends import (
+            set_is_first_graph_in_sequence,
+            set_is_last_graph_in_sequence,
+        )
+
+        with set_is_first_graph_in_sequence(True), set_is_last_graph_in_sequence(False):
+            hidden_states = self.patch_embed(hidden_states)
 
         if isinstance(grid_thw, list):
             grid_thw_list = grid_thw
@@ -562,8 +593,19 @@ def forward(
             grid_thw = grid_thw.numpy()
 
         pos_embeds = self.fast_pos_embed_interpolate(grid_thw_list)
+        original_hidden_states = hidden_states
         hidden_states = hidden_states + pos_embeds
         rotary_pos_emb_cos, rotary_pos_emb_sin = self.rot_pos_emb(grid_thw_list)
+        rotary_pos_emb = self.rot_pos_emb(grid_thw_list)
+        rotary_pos_emb = rotary_pos_emb.to(hidden_states.device, non_blocking=True)
+        if (
+            self._persistent_rotary_pos_emb_buffer is not None
+            and fwd_ctx
+            and fwd_ctx.cudagraph_runtime_mode == CUDAGraphMode.PIECEWISE
+        ):
+            rotary_pos_emb = self._persistent_rotary_pos_emb_buffer[:seq_len].copy_(
+                rotary_pos_emb
+            )
 
         cu_seqlens = np.repeat(grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]).cumsum(
             axis=0, dtype=np.int32
@@ -575,21 +617,37 @@ def forward(
         max_seqlen = self.compute_attn_mask_seqlen(cu_seqlens)
         cu_seqlens = cu_seqlens.to(self.device, non_blocking=True)
 
+        if (
+            self._persistent_hidden_states_buffer is not None
+            and fwd_ctx
+            and fwd_ctx.cudagraph_runtime_mode == CUDAGraphMode.PIECEWISE
+        ):
+            # The above operations will produce temporary new tensors.
+            # That is not friendly to cudagraphs,
+            # so we need to copy them back to the persistent buffer
+            original_hidden_states = original_hidden_states.view(hidden_states.shape)
+            original_hidden_states.copy_(hidden_states)
+            hidden_states = original_hidden_states
+
         deepstack_feature_lists = []
-        for layer_num, blk in enumerate(self.blocks):
-            hidden_states = blk(
-                hidden_states,
-                cu_seqlens=cu_seqlens,
-                rotary_pos_emb_cos=rotary_pos_emb_cos,
-                rotary_pos_emb_sin=rotary_pos_emb_sin,
-                max_seqlen=max_seqlen,
-            )
-            if layer_num in self.deepstack_visual_indexes:
-                deepstack_merger_idx = self.deepstack_visual_indexes.index(layer_num)
-                deepstack_feature = self.deepstack_merger_list[deepstack_merger_idx](
-                    hidden_states
+        with (
+            set_is_first_graph_in_sequence(False),
+            set_is_last_graph_in_sequence(False),
+        ):
+            for layer_num, blk in enumerate(self.blocks):
+                hidden_states = blk(
+                    hidden_states,
+                    cu_seqlens=cu_seqlens,
+                    rotary_pos_emb_cos=rotary_pos_emb_cos,
+                    rotary_pos_emb_sin=rotary_pos_emb_sin,
+                    max_seqlen=max_seqlen,
+                )
+                if layer_num in self.deepstack_visual_indexes:
+                    deepstack_merger_idx = self.deepstack_visual_indexes.index(layer_num)
+                    deepstack_feature = self.deepstack_merger_list[deepstack_merger_idx](
+                        hidden_states
                 )
-                deepstack_feature_lists.append(deepstack_feature)
+                    deepstack_feature_lists.append(deepstack_feature)
         hidden_states = self.merger(hidden_states)
         hidden_states = torch.cat(
             [hidden_states] + deepstack_feature_lists, dim=1
@@ -1433,7 +1491,18 @@ def _process_image_input(
             image_embeds = image_input["image_embeds"].type(self.visual.dtype)
         else:
             pixel_values = image_input["pixel_values"].type(self.visual.dtype)
-            with set_forward_context(None, self.vllm_config):
+            if self.vllm_config.is_in_compile:
+                with set_forward_context(None, self.vllm_config):
+                    if self.use_data_parallel:
+                        return run_dp_sharded_mrope_vision_model(
+                            self.visual,
+                            pixel_values,
+                            grid_thw_list,
+                            rope_type="rope_3d",
+                        )
+                    else:
+                        image_embeds = self.visual(pixel_values, grid_thw=grid_thw_list)
+            else:
                 if self.use_data_parallel:
                     return run_dp_sharded_mrope_vision_model(
                         self.visual, pixel_values, grid_thw_list, rope_type="rope_3d"

From 7dc0fcfb460be7ba4c1190ec31d518b504939999 Mon Sep 17 00:00:00 2001
From: Hongjian Zhang <hirokenovo@gmail.com>
Date: Tue, 30 Dec 2025 15:39:20 +0800
Subject: [PATCH 15/35] feat: add CUDA graph support for ViT DP mode

Signed-off-by: Hongjian Zhang <hirokenovo@gmail.com>
---
 vllm/config/vllm.py                    |  13 +--
 vllm/model_executor/models/qwen3_vl.py |  30 +++--
 vllm/model_executor/models/vision.py   |  91 +++++++++++----
 vllm/v1/worker/gpu_model_runner.py     | 147 +++++++++++++------------
 4 files changed, 168 insertions(+), 113 deletions(-)

diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py
index 54aab49ff2ff..48c1e7be2d23 100644
--- a/vllm/config/vllm.py
+++ b/vllm/config/vllm.py
@@ -1389,8 +1389,6 @@ def _set_vit_cudagraph_sizes(self):
             and not self.model_config.enforce_eager
             and self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE
             and self.compilation_config.compile_mm_encoder
-            and self.model_config.multimodal_config is not None
-            and self.model_config.multimodal_config.mm_encoder_tp_mode != "data"
         ):
             # determine the vit_cudagraph_capture_sizes
             if self.compilation_config.vit_cudagraph_capture_sizes is not None:
@@ -1410,17 +1408,10 @@ def _set_vit_cudagraph_sizes(self):
                     scheduler_config=self.scheduler_config,
                     mm_registry=MULTIMODAL_REGISTRY,
                 )
-                max_vit_cudagraph_capture_size = min(encoder_compute_budget, 32768)
+                max_vit_cudagraph_capture_size = min(encoder_compute_budget, 8192)
                 vit_cudagraph_capture_sizes = [
-                    i
-                    for i in [16, 32, 64, 128, 256]
-                    if i <= max_vit_cudagraph_capture_size
+                    i for i in [512, 1024, 1536] if i <= max_vit_cudagraph_capture_size
                 ]
-                if max_vit_cudagraph_capture_size >= 1024:
-                    # Step size 64 for small batch sizes, up to 2048(not included)
-                    vit_cudagraph_capture_sizes += list(
-                        range(512, min(max_vit_cudagraph_capture_size + 1, 2048), 64)
-                    )
                 if max_vit_cudagraph_capture_size >= 2048:
                     # Step size 128 for larger batch sizes
                     vit_cudagraph_capture_sizes += list(
diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py
index 328a58361ed5..a4464d9868f4 100644
--- a/vllm/model_executor/models/qwen3_vl.py
+++ b/vllm/model_executor/models/qwen3_vl.py
@@ -49,10 +49,10 @@
 from transformers.video_utils import VideoMetadata
 
 from vllm.compilation.decorators import support_torch_compile
-from vllm.config import CUDAGraphMode, MultiModalConfig, VllmConfig, get_current_vllm_config
+from vllm.config import CUDAGraphMode, MultiModalConfig, VllmConfig, set_current_vllm_config, get_current_vllm_config
 from vllm.config.multimodal import BaseDummyOptions, VideoDummyOptions
 from vllm.distributed import get_pp_group
-from vllm.forward_context import get_forward_context, set_forward_context
+from vllm.forward_context import get_forward_context, set_forward_context, is_forward_context_available
 from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import _ACTIVATION_REGISTRY
 from vllm.model_executor.layers.conv import Conv3dLayer
@@ -1481,7 +1481,8 @@ def _parse_and_validate_video_input(
             )
 
     def _process_image_input(
-        self, image_input: Qwen2_5_VLImageInputs
+        self, image_input: Qwen2_5_VLImageInputs,
+        cudagraph_dispatcher: Any | None = None,
     ) -> tuple[torch.Tensor, ...]:
         grid_thw = image_input["image_grid_thw"]
         assert grid_thw.ndim == 2
@@ -1491,22 +1492,26 @@ def _process_image_input(
             image_embeds = image_input["image_embeds"].type(self.visual.dtype)
         else:
             pixel_values = image_input["pixel_values"].type(self.visual.dtype)
+            maybe_in_vit_cuda_graph_capture = is_forward_context_available()
             if self.vllm_config.is_in_compile:
                 with set_forward_context(None, self.vllm_config):
-                    if self.use_data_parallel:
+                    if self.use_data_parallel and not maybe_in_vit_cuda_graph_capture:
                         return run_dp_sharded_mrope_vision_model(
                             self.visual,
                             pixel_values,
                             grid_thw_list,
                             rope_type="rope_3d",
+                            cudagraph_dispatcher=cudagraph_dispatcher,
                         )
                     else:
                         image_embeds = self.visual(pixel_values, grid_thw=grid_thw_list)
             else:
-                if self.use_data_parallel:
-                    return run_dp_sharded_mrope_vision_model(
-                        self.visual, pixel_values, grid_thw_list, rope_type="rope_3d"
-                    )
+                if self.use_data_parallel and not maybe_in_vit_cuda_graph_capture:
+                    with set_current_vllm_config(self.vllm_config):
+                        return run_dp_sharded_mrope_vision_model(
+                            self.visual, pixel_values, grid_thw_list, rope_type="rope_3d",
+                            cudagraph_dispatcher=cudagraph_dispatcher,
+                        )
                 else:
                     image_embeds = self.visual(pixel_values, grid_thw=grid_thw_list)
 
@@ -1516,7 +1521,8 @@ def _process_image_input(
         return image_embeds.split(sizes)
 
     def _process_video_input(
-        self, video_input: Qwen2_5_VLVideoInputs
+        self, video_input: Qwen2_5_VLVideoInputs,
+        cudagraph_dispatcher: Any | None = None,
     ) -> tuple[torch.Tensor, ...]:
         grid_thw = video_input["video_grid_thw"]
         assert grid_thw.ndim == 2
@@ -1535,6 +1541,7 @@ def _process_video_input(
                         pixel_values_videos,
                         grid_thw_list,
                         rope_type="rope_3d",
+                        cudagraph_dispatcher=cudagraph_dispatcher,
                     )
                 else:
                     video_embeds = self.visual(
@@ -1983,6 +1990,7 @@ def get_mrope_input_positions(
         return torch.from_numpy(llm_positions), mrope_position_delta
 
     def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings | None:
+        cudagraph_dispatcher = kwargs.pop("cudagraph_dispatcher", None)
         mm_input_by_modality = self._parse_and_validate_multimodal_inputs(**kwargs)
         if not mm_input_by_modality:
             return None
@@ -1996,14 +2004,14 @@ def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings | None:
         for modality in mm_input_by_modality:
             multimodal_input = mm_input_by_modality[modality]
             if modality == "image":
-                image_embeddings = self._process_image_input(multimodal_input)
+                image_embeddings = self._process_image_input(multimodal_input, cudagraph_dispatcher=cudagraph_dispatcher)
                 if self.is_multimodal_pruning_enabled:
                     image_embeddings = self._postprocess_image_embeds_evs(
                         image_embeddings, multimodal_input
                     )
                 multimodal_embeddings += tuple(image_embeddings)
             if modality == "video":
-                video_embeddings = self._process_video_input(multimodal_input)
+                video_embeddings = self._process_video_input(multimodal_input, cudagraph_dispatcher=cudagraph_dispatcher)
                 if self.is_multimodal_pruning_enabled:
                     video_embeddings = self._postprocess_video_embeds_evs(
                         video_embeddings, multimodal_input
diff --git a/vllm/model_executor/models/vision.py b/vllm/model_executor/models/vision.py
index a2b78753a0c6..d4e9dd9fa159 100644
--- a/vllm/model_executor/models/vision.py
+++ b/vllm/model_executor/models/vision.py
@@ -16,6 +16,8 @@
     get_tensor_model_parallel_world_size,
     tensor_model_parallel_all_gather,
 )
+from vllm.forward_context import BatchDescriptor, set_forward_context
+from vllm.v1.cudagraph_dispatcher import CudagraphDispatcher
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
 from vllm.v1.attention.backends.registry import AttentionBackendEnum
@@ -387,6 +389,7 @@ def run_dp_sharded_mrope_vision_model(
     grid_thw_list: list[list[int]],
     *,
     rope_type: Literal["rope_3d", "rope_2d"],
+    cudagraph_dispatcher: CudagraphDispatcher | None = None,
 ) -> tuple[torch.Tensor, ...]:
     """Run a vision model with data parallelism (DP) sharding.
     The function will shard the input image tensor on the
@@ -462,43 +465,85 @@ def run_dp_sharded_mrope_vision_model(
         embed_dim_reduction_factor = (
             vision_model.merge_kernel_size[0] * vision_model.merge_kernel_size[1]
         )
+        merge_size = vision_model.merge_kernel_size[0]
     else:
         embed_dim_reduction_factor = (
             vision_model.spatial_merge_size * vision_model.spatial_merge_size
         )
+        merge_size = vision_model.spatial_merge_size
 
     # Find the max length across all ranks
     # The output embedding of every DP rank has to be
     # padded to this length for tensor_model_parallel_all_gather
     # to work
-    max_len_per_rank = max(grouped_pixel_values_len) // embed_dim_reduction_factor
+    vllm_config = get_current_vllm_config()
+    use_cudagraph = False
+
+    if (vllm_config and
+        vllm_config.compilation_config.vit_cudagraph_capture_sizes):
+        max_input_len = max(grouped_pixel_values_len) if grouped_pixel_values_len else 0
+        target_input_len = vllm_config.pad_for_vit_cudagraph(max_input_len)
+        max_len_per_rank = target_input_len // embed_dim_reduction_factor
+        use_cudagraph = True
+    else:
+        max_len_per_rank = (max(grouped_pixel_values_len) if grouped_pixel_values_len else 0) // embed_dim_reduction_factor
+
     local_grid_thw_list = [grid_thw_list[i] for i in image_idxs_local]
 
-    # Run the vision model on the local pixel_values_local
-    if rope_type == "rope_2d":
-        if pixel_values_local.shape[0] > 0:
-            image_embeds_local = vision_model(
-                pixel_values_local, torch.tensor(local_grid_thw_list)
-            )
-            if isinstance(image_embeds_local, list):
-                image_embeds_local = torch.cat(image_embeds_local, dim=0)
-        else:
-            out_dim = getattr(vision_model.config, "hidden_size", None)
-            image_embeds_local = torch.empty(
-                (0, embed_dim_reduction_factor, out_dim),
-                device=pixel_values.device,
-                dtype=pixel_values.dtype,
+    # Pad pixel_values_local for CUDA graph if needed
+    if use_cudagraph:
+        current_input_len = pixel_values_local.shape[0]
+        # target_input_len derived from max_len_per_rank for consistency
+        target_input_len = max_len_per_rank * embed_dim_reduction_factor
+        
+        if current_input_len < target_input_len:
+            padding_size = target_input_len - current_input_len
+            padding = torch.empty(
+                (padding_size, pixel_values_local.shape[1]),
+                device=pixel_values_local.device,
+                dtype=pixel_values_local.dtype,
             )
+            pixel_values_local = torch.cat([pixel_values_local, padding], dim=0)
+            local_grid_thw_list.append([1, merge_size, padding_size // merge_size])
+
+    # Context setup
+    if cudagraph_dispatcher is not None:
+        dispatcher = cudagraph_dispatcher
     else:
-        if pixel_values_local.shape[0] > 0:
-            image_embeds_local = vision_model(pixel_values_local, local_grid_thw_list)
+        dispatcher = CudagraphDispatcher(vllm_config)
+    batch_descriptor = BatchDescriptor(num_tokens=pixel_values_local.shape[0], is_vit=True)
+    cudagraph_runtime_mode, batch_descriptor = dispatcher.dispatch(batch_descriptor, False)
+    with set_forward_context(
+        None, 
+        vllm_config=vllm_config, 
+        cudagraph_runtime_mode=cudagraph_runtime_mode, 
+        batch_descriptor=batch_descriptor
+    ):
+        # Run the vision model on the local pixel_values_local
+        if rope_type == "rope_2d":
+            if pixel_values_local.shape[0] > 0:
+                image_embeds_local = vision_model(
+                    pixel_values_local, torch.tensor(local_grid_thw_list)
+                )
+                if isinstance(image_embeds_local, list):
+                    image_embeds_local = torch.cat(image_embeds_local, dim=0)
+            else:
+                out_dim = getattr(vision_model.config, "hidden_size", None)
+                image_embeds_local = torch.empty(
+                    (0, embed_dim_reduction_factor, out_dim),
+                    device=pixel_values.device,
+                    dtype=pixel_values.dtype,
+                )
         else:
-            # Handle empty case
-            image_embeds_local = torch.empty(
-                (0, vision_model.out_hidden_size),
-                device=pixel_values.device,
-                dtype=pixel_values.dtype,
-            )
+            if pixel_values_local.shape[0] > 0:
+                image_embeds_local = vision_model(pixel_values_local, local_grid_thw_list)
+            else:
+                # Handle empty case
+                image_embeds_local = torch.empty(
+                    (0, vision_model.out_hidden_size),
+                    device=pixel_values.device,
+                    dtype=pixel_values.dtype,
+                )
 
     # Pad the output based on max_len_per_rank
     # for tensor_model_parallel_all_gather to work
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 9a5ebe562dae..681c9d0f3563 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -2438,88 +2438,99 @@ def _execute_mm_encoder(
                 # 2. A list or tuple (length: num_items) of tensors,
                 # each of shape (feature_size, hidden_size) in case the feature
                 # size is dynamic depending on the input multimodal items.
-                original_num_imgs = -1
-                padded_num_tokens = -1
-                if self.vit_cudagraph_batch_sizes and "pixel_values" in mm_kwargs_group:
-                    pixel_values = mm_kwargs_group["pixel_values"]
-                    num_tokens = pixel_values.shape[0]
-
-                    # Pad to the size expected by CUDA graph
-                    padded_num_tokens = self.vllm_config.pad_for_vit_cudagraph(
-                        num_tokens
-                    )
-
-                    if padded_num_tokens > num_tokens:
-                        assert (
-                            self.pixel_values_buffer is not None
-                            and self.image_grid_thw_buffer is not None
+                is_vit_dp_mode = (
+                    getattr(self.model_config.multimodal_config, "mm_encoder_tp_mode", None) == "data"
+                    and self.parallel_config.tensor_parallel_size > 1
+                )
+                if not is_vit_dp_mode:
+                    original_num_imgs = -1
+                    padded_num_tokens = -1
+                    if self.vit_cudagraph_batch_sizes and "pixel_values" in mm_kwargs_group:
+                        pixel_values = mm_kwargs_group["pixel_values"]
+                        num_tokens = pixel_values.shape[0]
+
+                        # Pad to the size expected by CUDA graph
+                        padded_num_tokens = self.vllm_config.pad_for_vit_cudagraph(
+                            num_tokens
                         )
 
-                        self.pixel_values_buffer[:num_tokens].copy_(pixel_values)  # type: ignore
-                        mm_kwargs_group["pixel_values"] = self.pixel_values_buffer[
-                            :padded_num_tokens
-                        ]
-
-                        # Update image_grid_thw to account for padding
-                        if "image_grid_thw" in mm_kwargs_group:
-                            image_grid_thw = mm_kwargs_group["image_grid_thw"]
-                            num_images = image_grid_thw.shape[0]
-                            original_num_imgs = num_images
-                            padding_amount = padded_num_tokens - num_tokens
-
-                            # Treat padding as a new virtual image.
-                            # Assuming a fixed patch size where height is merge_size.
-                            merge_size = getattr(
-                                self.model_config.hf_config.vision_config,
-                                "spatial_merge_size",
-                                1,
+                        if padded_num_tokens > num_tokens:
+                            assert (
+                                self.pixel_values_buffer is not None
+                                and self.image_grid_thw_buffer is not None
                             )
-                            assert padding_amount % (merge_size * merge_size) == 0
-                            h_patches = merge_size
-                            w_patches = padding_amount // h_patches
-                            if num_images + 1 > self.image_grid_thw_buffer.shape[0]:
-                                new_size = max(
-                                    self.image_grid_thw_buffer.shape[0] * 2,
-                                    num_images + 1,
+
+                            self.pixel_values_buffer[:num_tokens].copy_(pixel_values)  # type: ignore
+                            mm_kwargs_group["pixel_values"] = self.pixel_values_buffer[
+                                :padded_num_tokens
+                            ]
+
+                            # Update image_grid_thw to account for padding
+                            if "image_grid_thw" in mm_kwargs_group:
+                                image_grid_thw = mm_kwargs_group["image_grid_thw"]
+                                num_images = image_grid_thw.shape[0]
+                                original_num_imgs = num_images
+                                padding_amount = padded_num_tokens - num_tokens
+
+                                # Treat padding as a new virtual image.
+                                # Assuming a fixed patch size where height is merge_size.
+                                merge_size = getattr(
+                                    self.model_config.hf_config.vision_config,
+                                    "spatial_merge_size",
+                                    1,
+                                )
+                                assert padding_amount % (merge_size * merge_size) == 0
+                                h_patches = merge_size
+                                w_patches = padding_amount // h_patches
+                                if num_images + 1 > self.image_grid_thw_buffer.shape[0]:
+                                    new_size = max(
+                                        self.image_grid_thw_buffer.shape[0] * 2,
+                                        num_images + 1,
+                                    )
+                                    new_buffer = torch.zeros(
+                                        (new_size, 3),
+                                        dtype=torch.long,
+                                        device=self.device,
+                                    )
+                                    self.image_grid_thw_buffer = new_buffer
+                                self.image_grid_thw_buffer[:num_images].copy_(
+                                    image_grid_thw
                                 )
-                                new_buffer = torch.zeros(
-                                    (new_size, 3),
+                                self.image_grid_thw_buffer[num_images] = torch.tensor(
+                                    [1, h_patches, w_patches],
                                     dtype=torch.long,
                                     device=self.device,
                                 )
-                                self.image_grid_thw_buffer = new_buffer
-                            self.image_grid_thw_buffer[:num_images].copy_(
-                                image_grid_thw
-                            )
-                            self.image_grid_thw_buffer[num_images] = torch.tensor(
-                                [1, h_patches, w_patches],
-                                dtype=torch.long,
-                                device=self.device,
-                            )
-                            mm_kwargs_group["image_grid_thw"] = (
-                                self.image_grid_thw_buffer[: num_images + 1]
-                            )
+                                mm_kwargs_group["image_grid_thw"] = (
+                                    self.image_grid_thw_buffer[: num_images + 1]
+                                )
 
-                # get batch_descriptor from dispatcher
-                batch_descriptor = BatchDescriptor(
-                    num_tokens=padded_num_tokens,
-                    is_vit=True,
-                )
-                cudagraph_runtime_mode, batch_descriptor = (
-                    self.cudagraph_dispatcher.dispatch(batch_descriptor, False)
-                )
-                with set_forward_context(
+                    # get batch_descriptor from dispatcher
+                    batch_descriptor = BatchDescriptor(
+                        num_tokens=padded_num_tokens,
+                        is_vit=True,
+                    )
+                    cudagraph_runtime_mode, batch_descriptor = (
+                        self.cudagraph_dispatcher.dispatch(batch_descriptor, False)
+                    )
+                    with set_forward_context(
                         None,
                         vllm_config=self.vllm_config,
                         cudagraph_runtime_mode=cudagraph_runtime_mode,
                         batch_descriptor=batch_descriptor,
-                    ), self.timed_encoder_operation(
+                        ), self.timed_encoder_operation(
+                        should_time, mm_lora_refs, current_item_idx, num_items
+                    ):
+                        curr_group_outputs = model.embed_multimodal(**mm_kwargs_group)
+                        # Remove the padded items before sanity check
+                        if original_num_imgs != -1:
+                            curr_group_outputs = curr_group_outputs[:original_num_imgs]
+                else:
+                    with self.timed_encoder_operation(
                         should_time, mm_lora_refs, current_item_idx, num_items
                     ):
-                    curr_group_outputs = model.embed_multimodal(**mm_kwargs_group)
-                # Remove the padded items before sanity check
-                if original_num_imgs != -1:
-                    curr_group_outputs = curr_group_outputs[:original_num_imgs]
+                        mm_kwargs_group["cudagraph_dispatcher"] = self.cudagraph_dispatcher
+                        curr_group_outputs = model.embed_multimodal(**mm_kwargs_group)
             sanity_check_mm_encoder_outputs(
                 curr_group_outputs,
                 expected_num_items=num_items,

From c0e8849e46d9c137ebc84cf0938bc003b2f0e892 Mon Sep 17 00:00:00 2001
From: Hongjian Zhang <hirokenovo@gmail.com>
Date: Tue, 30 Dec 2025 15:41:16 +0800
Subject: [PATCH 16/35] chore: remove ViT's useless persistent buffer at engine
 level

Signed-off-by: Hongjian Zhang <hirokenovo@gmail.com>
---
 vllm/v1/worker/gpu_model_runner.py | 73 +++++++++---------------------
 1 file changed, 21 insertions(+), 52 deletions(-)

diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 681c9d0f3563..7d57537ca1cc 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -610,13 +610,6 @@ def __init__(
             ]
             self.is_mm_embed_idx = 0
 
-            # START: Add persistent buffers for ViT inputs
-            # Use a large enough size for the CUDA graph
-            # The feature dimension is model-specific. We'll initialize
-            # the buffer lazily on the first run to get this dimension.
-            self.pixel_values_buffer: torch.Tensor | None = None
-            self.image_grid_thw_buffer: torch.Tensor | None = None
-
         # Only relevant for models using M-RoPE (e.g, Qwen2-VL)
         if self.uses_mrope:
             # NOTE: `mrope_positions` is implemented with one additional dummy
@@ -2455,22 +2448,20 @@ def _execute_mm_encoder(
                         )
 
                         if padded_num_tokens > num_tokens:
-                            assert (
-                                self.pixel_values_buffer is not None
-                                and self.image_grid_thw_buffer is not None
+                            padding_amount = padded_num_tokens - num_tokens
+                            padding_tensor = torch.zeros(
+                                (padding_amount, pixel_values.shape[1]),
+                                dtype=pixel_values.dtype,
+                                device=pixel_values.device,
+                            )
+                            mm_kwargs_group["pixel_values"] = torch.cat(
+                                [pixel_values, padding_tensor], dim=0
                             )
-
-                            self.pixel_values_buffer[:num_tokens].copy_(pixel_values)  # type: ignore
-                            mm_kwargs_group["pixel_values"] = self.pixel_values_buffer[
-                                :padded_num_tokens
-                            ]
 
                             # Update image_grid_thw to account for padding
                             if "image_grid_thw" in mm_kwargs_group:
                                 image_grid_thw = mm_kwargs_group["image_grid_thw"]
-                                num_images = image_grid_thw.shape[0]
-                                original_num_imgs = num_images
-                                padding_amount = padded_num_tokens - num_tokens
+                                original_num_imgs = image_grid_thw.shape[0]
 
                                 # Treat padding as a new virtual image.
                                 # Assuming a fixed patch size where height is merge_size.
@@ -2482,27 +2473,13 @@ def _execute_mm_encoder(
                                 assert padding_amount % (merge_size * merge_size) == 0
                                 h_patches = merge_size
                                 w_patches = padding_amount // h_patches
-                                if num_images + 1 > self.image_grid_thw_buffer.shape[0]:
-                                    new_size = max(
-                                        self.image_grid_thw_buffer.shape[0] * 2,
-                                        num_images + 1,
-                                    )
-                                    new_buffer = torch.zeros(
-                                        (new_size, 3),
-                                        dtype=torch.long,
-                                        device=self.device,
-                                    )
-                                    self.image_grid_thw_buffer = new_buffer
-                                self.image_grid_thw_buffer[:num_images].copy_(
-                                    image_grid_thw
+                                padding_grid_info = torch.tensor(
+                                    [[1, h_patches, w_patches]],
+                                    dtype=image_grid_thw.dtype,
+                                    device=image_grid_thw.device,
                                 )
-                                self.image_grid_thw_buffer[num_images] = torch.tensor(
-                                    [1, h_patches, w_patches],
-                                    dtype=torch.long,
-                                    device=self.device,
-                                )
-                                mm_kwargs_group["image_grid_thw"] = (
-                                    self.image_grid_thw_buffer[: num_images + 1]
+                                mm_kwargs_group["image_grid_thw"] = torch.cat(
+                                    [image_grid_thw, padding_grid_info], dim=0
                                 )
 
                     # get batch_descriptor from dispatcher
@@ -5254,20 +5231,12 @@ def _dummy_mm_encoder_run(
         self,
         compilation_cases: list[int],
     ) -> None:
-        if self.pixel_values_buffer is None:
-            tmp_dummy_mm_inputs = self._get_mm_dummy_batch(
-                "image",
-                1,
-            )
-            img_feature_dim = tmp_dummy_mm_inputs["pixel_values"].shape[1]
-            self.pixel_values_buffer = torch.zeros(
-                (compilation_cases[0], img_feature_dim),
-                dtype=self.dtype,
-                device=self.device,
-            )
-            self.image_grid_thw_buffer = torch.zeros(
-                (512, 3), dtype=torch.long, device=self.device
-            )
+        tmp_dummy_mm_inputs = self._get_mm_dummy_batch(
+            "image",
+            1,
+        )
+        img_feature_dim = tmp_dummy_mm_inputs["pixel_values"].shape[1]
+
         if is_global_first_rank():
             compilation_cases = tqdm(
                 compilation_cases,

From ef7e45d74ec5fac5240f5d4ac823ef7b45df93a3 Mon Sep 17 00:00:00 2001
From: Hongjian Zhang <hirokenovo@gmail.com>
Date: Tue, 30 Dec 2025 15:41:33 +0800
Subject: [PATCH 17/35] feat: add FA and sdpa wrappers to compilation config

Signed-off-by: Hongjian Zhang <hirokenovo@gmail.com>
---
 vllm/config/compilation.py                 |  2 ++
 vllm/v1/attention/ops/vit_attn_wrappers.py | 21 ++++++++++++++++-----
 2 files changed, 18 insertions(+), 5 deletions(-)

diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py
index b74988b2a711..327869bfd44e 100644
--- a/vllm/config/compilation.py
+++ b/vllm/config/compilation.py
@@ -652,6 +652,8 @@ class CompilationConfig:
         "vllm::kda_attention",
         "vllm::sparse_attn_indexer",
         "vllm::rocm_aiter_sparse_attn_indexer",
+        "vllm::flash_attn_maxseqlen_wrapper",
+        "vllm::torch_sdpa_wrapper",
     ]
 
     def compute_hash(self) -> str:
diff --git a/vllm/v1/attention/ops/vit_attn_wrappers.py b/vllm/v1/attention/ops/vit_attn_wrappers.py
index f077a61c984f..f6051e54713b 100644
--- a/vllm/v1/attention/ops/vit_attn_wrappers.py
+++ b/vllm/v1/attention/ops/vit_attn_wrappers.py
@@ -26,6 +26,7 @@ def flash_attn_maxseqlen_wrapper(
     v: torch.Tensor,
     batch_size: int,
     is_rocm_aiter: bool,
+    output: torch.Tensor,
     fa_version: int | None,
     scale: float | None = None,
     cu_seqlens: torch.Tensor | None = None,
@@ -48,7 +49,7 @@ def flash_attn_maxseqlen_wrapper(
     max_seqlen = q_len if max_seqlen is None else max_seqlen.item()
 
     q, k, v = (einops.rearrange(x, "b s ... -> (b s) ...") for x in [q, k, v])
-    output = flash_attn_varlen_func(
+    fa_output = flash_attn_varlen_func(
         q,
         k,
         v,
@@ -61,8 +62,9 @@ def flash_attn_maxseqlen_wrapper(
         softmax_scale=scale,
         **kwargs,
     )
-    context_layer = einops.rearrange(output, "(b s) h d -> b s h d", b=batch_size)
-    return context_layer
+    context_layer = einops.rearrange(fa_output, "(b s) h d -> b s h d", b=batch_size)
+    output.copy_(context_layer)
+    return output
 
 
 def flash_attn_maxseqlen_wrapper_fake(
@@ -71,6 +73,7 @@ def flash_attn_maxseqlen_wrapper_fake(
     v: torch.Tensor,
     batch_size: int,
     is_rocm_aiter: bool,
+    output: torch.Tensor,
     fa_version: int | None,
     scale: float | None = None,
     cu_seqlens: torch.Tensor | None = None,
@@ -97,12 +100,15 @@ def vit_flash_attn_wrapper(
     cu_seqlens: torch.Tensor | None = None,
     max_seqlen: torch.Tensor | None = None,
 ) -> torch.Tensor:
+    b, s, h, d = q.shape
+    output = torch.empty((b, s, h, d), dtype=q.dtype, device=q.device)
     return torch.ops.vllm.flash_attn_maxseqlen_wrapper(
         q,
         k,
         v,
         batch_size,
         is_rocm_aiter,
+        output,
         fa_version,
         scale,
         cu_seqlens,
@@ -132,6 +138,7 @@ def torch_sdpa_wrapper(
     q: torch.Tensor,
     k: torch.Tensor,
     v: torch.Tensor,
+    output: torch.Tensor,
     scale: float | None = None,
     cu_seqlens: torch.Tensor | None = None,
 ) -> torch.Tensor:
@@ -155,13 +162,15 @@ def torch_sdpa_wrapper(
         output_i = apply_sdpa(q_i, k_i, v_i, scale=scale)
         outputs.append(output_i)
     context_layer = torch.cat(outputs, dim=1)
-    return context_layer
+    output.copy_(context_layer)
+    return output
 
 
 def torch_sdpa_wrapper_fake(
     q: torch.Tensor,
     k: torch.Tensor,
     v: torch.Tensor,
+    output: torch.Tensor,
     scale: float | None,
     cu_seqlens: torch.Tensor | None,
 ) -> torch.Tensor:
@@ -182,4 +191,6 @@ def vit_torch_sdpa_wrapper(
     scale: float | None = None,
     cu_seqlens: torch.Tensor | None = None,
 ) -> torch.Tensor:
-    return torch.ops.vllm.torch_sdpa_wrapper(q, k, v, scale, cu_seqlens)
+    b, s, h, d = q.shape
+    output = torch.empty((b, s, h, d), dtype=q.dtype, device=q.device)
+    return torch.ops.vllm.torch_sdpa_wrapper(q, k, v, output, scale, cu_seqlens)

From e23899d454ab5400ffbac3e99b1410f7d1300fe3 Mon Sep 17 00:00:00 2001
From: Hongjian Zhang <hirokenovo@gmail.com>
Date: Wed, 31 Dec 2025 12:02:58 +0800
Subject: [PATCH 18/35] fix: update dummy input type from image to video to
 avoid preprocess_opt warmup problem

Signed-off-by: Hongjian Zhang <hirokenovo@gmail.com>
---
 vllm/v1/worker/gpu_model_runner.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 7d57537ca1cc..bda3d3f8999c 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -5232,10 +5232,10 @@ def _dummy_mm_encoder_run(
         compilation_cases: list[int],
     ) -> None:
         tmp_dummy_mm_inputs = self._get_mm_dummy_batch(
-            "image",
+            "video",
             1,
         )
-        img_feature_dim = tmp_dummy_mm_inputs["pixel_values"].shape[1]
+        img_feature_dim = tmp_dummy_mm_inputs["pixel_values_videos"].shape[1]
 
         if is_global_first_rank():
             compilation_cases = tqdm(

From 506f75bb743b3486fd374704ce2b2531c6e22152 Mon Sep 17 00:00:00 2001
From: Hongjian Zhang <hirokenovo@gmail.com>
Date: Mon, 5 Jan 2026 12:01:45 +0800
Subject: [PATCH 19/35] feat: add max_vit_cudagraph_capture_size and simplify
 code

Signed-off-by: Hongjian Zhang <hirokenovo@gmail.com>
---
 docs/design/torch_compile.md             |  7 ++-
 vllm/compilation/monitor.py              |  3 +-
 vllm/config/compilation.py               | 19 ++++++
 vllm/config/vllm.py                      | 77 ++++++++++++++++++------
 vllm/model_executor/models/qwen2_5_vl.py | 46 ++++++++------
 vllm/model_executor/models/qwen3_vl.py   | 43 ++++++-------
 vllm/v1/worker/gpu_model_runner.py       | 72 +++++++++-------------
 7 files changed, 159 insertions(+), 108 deletions(-)

diff --git a/docs/design/torch_compile.md b/docs/design/torch_compile.md
index 041f029294e4..8532c7020cbc 100644
--- a/docs/design/torch_compile.md
+++ b/docs/design/torch_compile.md
@@ -253,11 +253,16 @@ By default, vLLM will try to determine a set of sizes to capture cudagraph. You
 vllm serve meta-llama/Llama-3.2-1B \
   --compilation-config '{"cudagraph_capture_sizes": [1, 2, 4, 8]}'
 ```
-Similarly, For `Qwen2.5-VL` series model, you can specify the capture sizes for the vision transformer (ViT) using `vit_cudagraph_capture_sizes`, the capture sizes should be multiples of the square of `merge_size`. Note that ViT DP mode is **not supported**. By default, this is disabled as `compile_mm_encoder` is `False`. To enable it and specify capture sizes, you can do the following:
+Similarly, for the `Qwen2.5-VL` and `Qwen3-VL` model series, you can specify the capture sizes for the vision transformer (ViT) using `vit_cudagraph_capture_sizes`; each capture size should be a multiple of the square of `merge_size`. By default, this is disabled because `compile_mm_encoder` is `False`. To enable it and specify capture sizes, you can do the following:
 ```bash
 vllm serve Qwen/Qwen2.5-VL-3B-Instruct \
   --compilation-config '{"compile_mm_encoder": true, "vit_cudagraph_capture_sizes": [512, 1024]}'
 ```
+Alternatively, you can specify `max_vit_cudagraph_capture_size` to generate a default list of capture sizes up to the given value:
+```bash
+vllm serve Qwen/Qwen2.5-VL-3B-Instruct \
+  --compilation-config '{"compile_mm_encoder": true, "max_vit_cudagraph_capture_size": 2048}'
+```
 
 Then it will only capture cudagraph for the specified sizes. It can be useful to have fine-grained control over the cudagraph capture.
 
diff --git a/vllm/compilation/monitor.py b/vllm/compilation/monitor.py
index 912e3d828abc..ca56574cfb30 100644
--- a/vllm/compilation/monitor.py
+++ b/vllm/compilation/monitor.py
@@ -31,8 +31,6 @@ def start_monitoring_torch_compile(vllm_config: VllmConfig) -> None:
 
 
 def end_monitoring_torch_compile(vllm_config: VllmConfig) -> None:
-    vllm_config.is_in_compile = False
-
     compilation_config: CompilationConfig = vllm_config.compilation_config
     if compilation_config.mode == CompilationMode.VLLM_COMPILE:
         logger.info_once(
@@ -45,6 +43,7 @@ def end_monitoring_torch_compile(vllm_config: VllmConfig) -> None:
             context_manager.__exit__(None, None, None)
             context_manager = None
 
+    vllm_config.is_in_compile = False
 
 cudagraph_capturing_enabled: bool = True
 
diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py
index 327869bfd44e..99543929f2dc 100644
--- a/vllm/config/compilation.py
+++ b/vllm/config/compilation.py
@@ -534,6 +534,9 @@ class CompilationConfig:
     """Sizes to capture vit cudagraph.
     - None (default): capture sizes are inferred from vllm config.
     - list[int]: capture sizes are specified as given."""
+    max_vit_cudagraph_capture_size: int | None = field(default=None)
+    """The maximum ViT cudagraph capture size.
+    - None (default): the maximum is inferred from vllm config."""
     cudagraph_copy_inputs: bool = False
     """Whether to copy input tensors for
     cudagraph. If the caller can guarantee that the same input buffers
@@ -1166,3 +1169,19 @@ def get_compile_ranges(self) -> list[Range]:
             Range(start=s + 1, end=e)
             for s, e in zip([0] + split_points[:-1], split_points)
         ]
+
+    def compute_bs_to_padded_vit_graph_size(self):
+        # pre-compute the mapping from batch size to padded graph size
+        self.bs_to_padded_vit_graph_size = [
+            0 for i in range(self.max_vit_cudagraph_capture_size + 1)
+        ]
+        for end, start in zip(
+            self.vit_cudagraph_capture_sizes
+            + [self.max_vit_cudagraph_capture_size + 1],
+            [0] + self.vit_cudagraph_capture_sizes,
+        ):
+            for bs in range(start, end):
+                if bs == start:
+                    self.bs_to_padded_vit_graph_size[bs] = start
+                else:
+                    self.bs_to_padded_vit_graph_size[bs] = end
diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py
index 48c1e7be2d23..9f05046839c3 100644
--- a/vllm/config/vllm.py
+++ b/vllm/config/vllm.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-import bisect
 import copy
 import getpass
 import json
@@ -369,12 +368,16 @@ def pad_for_cudagraph(self, batch_size: int) -> int:
         return self.compilation_config.bs_to_padded_graph_size[batch_size]
 
     def pad_for_vit_cudagraph(self, batch_size: int) -> int:
-        capture_sizes = self.compilation_config.vit_cudagraph_capture_sizes
-        # Find the insertion point for batch_size to maintain order.
-        # This gives the index of the first element >= batch_size.
-        idx = bisect.bisect_left(capture_sizes, batch_size)
-
-        return capture_sizes[idx] if idx < len(capture_sizes) else batch_size
+        if (
+            self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE
+            and hasattr(self.compilation_config, "max_vit_cudagraph_capture_size")
+            and self.compilation_config.max_vit_cudagraph_capture_size
+            and batch_size <= self.compilation_config.max_vit_cudagraph_capture_size
+        ):
+            # Use CUDA graphs.
+            # Add padding to the batch size.
+            return self.compilation_config.bs_to_padded_vit_graph_size[batch_size]
+        return batch_size
     
     @property
     def needs_dp_coordinator(self) -> bool:
@@ -1362,8 +1365,6 @@ def _set_vit_cudagraph_sizes(self):
             - Eager mode is not enforced.
             - CUDA graph mode is enabled.
             - The multimodal encoder compilation is enabled.
-            - A multimodal config is present.
-            - The multimodal encoder tensor parallelism mode is not "data".
             If these conditions are not met, the list of capture sizes will be empty,
             effectively disabling ViT CUDA graphs.
 
@@ -1373,8 +1374,8 @@ def _set_vit_cudagraph_sizes(self):
 
         3.  If no sizes are provided by the user, a default list of sizes is
             generated up to a maximum of 5120. The default sizes are:
-            [16, 32, 64, 128, 256] + list(range(512, 2048, 64)) + list(
-            range(2048, 5120 + 1, 128))
+            [512, 1024, 1536] + list(range(2048, 4096, 128)) + list(
+            range(4096, 8192 + 1, 256))
 
         The final list of sizes is stored in
         `self.compilation_config.vit_cudagraph_capture_sizes`.
@@ -1390,6 +1391,21 @@ def _set_vit_cudagraph_sizes(self):
             and self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE
             and self.compilation_config.compile_mm_encoder
         ):
+            # determine the initial max_vit_cudagraph_capture_size
+            max_vit_cudagraph_capture_size = (
+                self.compilation_config.max_vit_cudagraph_capture_size
+            )
+            if max_vit_cudagraph_capture_size is None:
+                from vllm.multimodal import MULTIMODAL_REGISTRY
+                from vllm.v1.core.encoder_cache_manager import compute_encoder_budget
+
+                encoder_compute_budget, _ = compute_encoder_budget(
+                    model_config=self.model_config,
+                    scheduler_config=self.scheduler_config,
+                    mm_registry=MULTIMODAL_REGISTRY,
+                )
+                max_vit_cudagraph_capture_size = min(encoder_compute_budget, 8192)
+
             # determine the vit_cudagraph_capture_sizes
             if self.compilation_config.vit_cudagraph_capture_sizes is not None:
                 # de-duplicate the sizes provided by the config
@@ -1400,15 +1416,6 @@ def _set_vit_cudagraph_sizes(self):
                 # sort to make sure the sizes are in ascending order
                 vit_cudagraph_capture_sizes.sort()
             else:
-                from vllm.multimodal import MULTIMODAL_REGISTRY
-                from vllm.v1.core.encoder_cache_manager import compute_encoder_budget
-
-                encoder_compute_budget, _ = compute_encoder_budget(
-                    model_config=self.model_config,
-                    scheduler_config=self.scheduler_config,
-                    mm_registry=MULTIMODAL_REGISTRY,
-                )
-                max_vit_cudagraph_capture_size = min(encoder_compute_budget, 8192)
                 vit_cudagraph_capture_sizes = [
                     i for i in [512, 1024, 1536] if i <= max_vit_cudagraph_capture_size
                 ]
@@ -1422,13 +1429,43 @@ def _set_vit_cudagraph_sizes(self):
                     vit_cudagraph_capture_sizes += list(
                         range(4096, max_vit_cudagraph_capture_size + 1, 256)
                     )
+
+            # A user-specified compilation_config.max_vit_cudagraph_capture_size is
+            # truncated to valid_max_size when the two are inconsistent.
+            valid_max_size = (
+                vit_cudagraph_capture_sizes[-1] if vit_cudagraph_capture_sizes else 0
+            )
+            if (
+                self.compilation_config.max_vit_cudagraph_capture_size is not None
+                and self.compilation_config.max_vit_cudagraph_capture_size
+                != valid_max_size
+            ):
+                # raise an error only when both flags are user-specified
+                # and they are inconsistent with each other
+                if self.compilation_config.vit_cudagraph_capture_sizes is not None:
+                    raise ValueError(
+                        "customized max_vit_cudagraph_capture_size"
+                        f"(={self.compilation_config.max_vit_cudagraph_capture_size}) "
+                        "should be consistent with the max value of "
+                        f"vit_cudagraph_capture_sizes(={valid_max_size})"
+                    )
+
+                logger.warning(
+                    "Truncating max_vit_cudagraph_capture_size to %d",
+                    valid_max_size,
+                )
+            # always set the final max_vit_cudagraph_capture_size
+            self.compilation_config.max_vit_cudagraph_capture_size = valid_max_size
             self.compilation_config.vit_cudagraph_capture_sizes = (
                 vit_cudagraph_capture_sizes
             )
         else:
             # no cudagraph in use
+            self.compilation_config.max_vit_cudagraph_capture_size = 0
             self.compilation_config.vit_cudagraph_capture_sizes = []
 
+        self.compilation_config.compute_bs_to_padded_vit_graph_size()
+
     def try_verify_and_update_config(self):
         if self.model_config is None:
             return
diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py
index 02e6af8334ac..cd23032b0d0c 100644
--- a/vllm/model_executor/models/qwen2_5_vl.py
+++ b/vllm/model_executor/models/qwen2_5_vl.py
@@ -27,6 +27,7 @@
 """Inference-only Qwen2.5-VL model compatible with HuggingFace weights."""
 
 from collections.abc import Callable, Iterable, Iterator, Mapping, Sequence
+from contextlib import nullcontext
 from functools import lru_cache, partial
 from typing import Annotated, Any, Literal, TypeAlias
 
@@ -46,7 +47,7 @@
 from vllm.config import CUDAGraphMode, MultiModalConfig, VllmConfig, get_current_vllm_config
 from vllm.distributed import parallel_state
 from vllm.distributed import utils as dist_utils
-from vllm.forward_context import get_forward_context, set_forward_context
+from vllm.forward_context import get_forward_context, set_forward_context, is_forward_context_available
 from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import get_act_and_mul_fn
 from vllm.model_executor.layers.attention import MMEncoderAttention
@@ -1256,7 +1257,8 @@ def _parse_and_validate_video_input(
             )
 
     def _process_image_input(
-        self, image_input: Qwen2_5_VLImageInputs
+        self, image_input: Qwen2_5_VLImageInputs,
+        cudagraph_dispatcher: Any | None = None,
     ) -> tuple[torch.Tensor, ...]:
         grid_thw = image_input["image_grid_thw"]
         assert grid_thw.ndim == 2
@@ -1266,21 +1268,24 @@ def _process_image_input(
             image_embeds = image_input["image_embeds"].type(self.visual.dtype)
         else:
             pixel_values = image_input["pixel_values"]
-            if self.vllm_config.is_in_compile:
-                with set_forward_context(None, self.vllm_config):
-                    if self.use_data_parallel:
-                        return run_dp_sharded_mrope_vision_model(
-                            self.visual,
-                            pixel_values,
-                            grid_thw_list,
-                            rope_type="rope_3d",
-                        )
-                    else:
-                        image_embeds = self.visual(pixel_values, grid_thw=grid_thw_list)
-            else:
-                if self.use_data_parallel:
+            maybe_in_vit_cuda_graph_capture = False
+            if is_forward_context_available():
+                ctx = get_forward_context()
+                if ctx.cudagraph_runtime_mode != CUDAGraphMode.NONE:
+                    maybe_in_vit_cuda_graph_capture = True
+            context = (
+                set_forward_context(None, self.vllm_config)
+                if self.vllm_config.is_in_compile
+                else nullcontext()
+            )
+            with context:
+                if self.use_data_parallel and not maybe_in_vit_cuda_graph_capture:
                     return run_dp_sharded_mrope_vision_model(
-                        self.visual, pixel_values, grid_thw_list, rope_type="rope_3d"
+                        self.visual,
+                        pixel_values,
+                        grid_thw_list,
+                        rope_type="rope_3d",
+                        cudagraph_dispatcher=cudagraph_dispatcher,
                     )
                 else:
                     image_embeds = self.visual(pixel_values, grid_thw=grid_thw_list)
@@ -1322,7 +1327,8 @@ def _postprocess_image_embeds_evs(
         return tuple(image_embeds_split)
 
     def _process_video_input(
-        self, video_input: Qwen2_5_VLVideoInputs
+        self, video_input: Qwen2_5_VLVideoInputs,
+        cudagraph_dispatcher: Any | None = None,
     ) -> tuple[torch.Tensor, ...]:
         grid_thw = video_input["video_grid_thw"]
         assert grid_thw.ndim == 2
@@ -1339,6 +1345,7 @@ def _process_video_input(
                         pixel_values_videos,
                         grid_thw_list,
                         rope_type="rope_3d",
+                        cudagraph_dispatcher=cudagraph_dispatcher,
                     )
                 else:
                     video_embeds = self.visual(
@@ -1488,6 +1495,7 @@ def _parse_and_validate_multimodal_inputs(self, **kwargs: object) -> dict:
         return mm_input_by_modality
 
     def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings:
+        cudagraph_dispatcher = kwargs.pop("cudagraph_dispatcher", None)
         mm_input_by_modality = self._parse_and_validate_multimodal_inputs(**kwargs)
         if not mm_input_by_modality:
             return []
@@ -1501,14 +1509,14 @@ def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings:
         for modality in mm_input_by_modality:
             multimodal_input = mm_input_by_modality[modality]
             if modality == "image":
-                image_embeddings = self._process_image_input(multimodal_input)
+                image_embeddings = self._process_image_input(multimodal_input, cudagraph_dispatcher=cudagraph_dispatcher)
                 if self.is_multimodal_pruning_enabled:
                     image_embeddings = self._postprocess_image_embeds_evs(
                         image_embeddings, multimodal_input
                     )
                 multimodal_embeddings += tuple(image_embeddings)
             if modality == "video":
-                video_embeddings = self._process_video_input(multimodal_input)
+                video_embeddings = self._process_video_input(multimodal_input, cudagraph_dispatcher=cudagraph_dispatcher)
                 if self.is_multimodal_pruning_enabled:
                     video_embeddings = self._postprocess_video_embeds_evs(
                         video_embeddings, multimodal_input
diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py
index a4464d9868f4..4ddc0c85e627 100644
--- a/vllm/model_executor/models/qwen3_vl.py
+++ b/vllm/model_executor/models/qwen3_vl.py
@@ -25,6 +25,7 @@
 """Inference-only Qwen3VL model compatible with HuggingFace weights."""
 
 from collections.abc import Callable, Iterable, Iterator, Mapping, Sequence
+from contextlib import nullcontext
 from functools import lru_cache, partial
 from itertools import islice
 from typing import Any
@@ -49,7 +50,7 @@
 from transformers.video_utils import VideoMetadata
 
 from vllm.compilation.decorators import support_torch_compile
-from vllm.config import CUDAGraphMode, MultiModalConfig, VllmConfig, set_current_vllm_config, get_current_vllm_config
+from vllm.config import CUDAGraphMode, MultiModalConfig, VllmConfig, get_current_vllm_config
 from vllm.config.multimodal import BaseDummyOptions, VideoDummyOptions
 from vllm.distributed import get_pp_group
 from vllm.forward_context import get_forward_context, set_forward_context, is_forward_context_available
@@ -648,7 +649,8 @@ def forward(
                         hidden_states
                 )
                     deepstack_feature_lists.append(deepstack_feature)
-        hidden_states = self.merger(hidden_states)
+        with set_is_first_graph_in_sequence(False), set_is_last_graph_in_sequence(True):
+            hidden_states = self.merger(hidden_states)
         hidden_states = torch.cat(
             [hidden_states] + deepstack_feature_lists, dim=1
         )  # [seq_len, hidden_size * (1 + depth_of_deepstack)]
@@ -1492,26 +1494,25 @@ def _process_image_input(
             image_embeds = image_input["image_embeds"].type(self.visual.dtype)
         else:
             pixel_values = image_input["pixel_values"].type(self.visual.dtype)
-            maybe_in_vit_cuda_graph_capture = is_forward_context_available()
-            if self.vllm_config.is_in_compile:
-                with set_forward_context(None, self.vllm_config):
-                    if self.use_data_parallel and not maybe_in_vit_cuda_graph_capture:
-                        return run_dp_sharded_mrope_vision_model(
-                            self.visual,
-                            pixel_values,
-                            grid_thw_list,
-                            rope_type="rope_3d",
-                            cudagraph_dispatcher=cudagraph_dispatcher,
-                        )
-                    else:
-                        image_embeds = self.visual(pixel_values, grid_thw=grid_thw_list)
-            else:
+            maybe_in_vit_cuda_graph_capture = False
+            if is_forward_context_available():
+                ctx = get_forward_context()
+                if ctx.cudagraph_runtime_mode != CUDAGraphMode.NONE:
+                    maybe_in_vit_cuda_graph_capture = True
+            context = (
+                set_forward_context(None, self.vllm_config)
+                if self.vllm_config.is_in_compile
+                else nullcontext()
+            )
+            with context:
                 if self.use_data_parallel and not maybe_in_vit_cuda_graph_capture:
-                    with set_current_vllm_config(self.vllm_config):
-                        return run_dp_sharded_mrope_vision_model(
-                            self.visual, pixel_values, grid_thw_list, rope_type="rope_3d",
-                            cudagraph_dispatcher=cudagraph_dispatcher,
-                        )
+                    return run_dp_sharded_mrope_vision_model(
+                        self.visual,
+                        pixel_values,
+                        grid_thw_list,
+                        rope_type="rope_3d",
+                        cudagraph_dispatcher=cudagraph_dispatcher,
+                    )
                 else:
                     image_embeds = self.visual(pixel_values, grid_thw=grid_thw_list)
 
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index bda3d3f8999c..3db59c96cb30 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -28,6 +28,7 @@
     CompilationMode,
     CUDAGraphMode,
     VllmConfig,
+    set_current_vllm_config,
     get_layers_from_vllm_config,
     update_config,
 )
@@ -553,7 +554,7 @@ def __init__(
                 self.compilation_config.cudagraph_capture_sizes
             )
         # self.vit_cudagraph_batch_sizes sorts in ascending order.
-        self.vit_cudagraph_batch_sizes: list[int] | None
+        self.vit_cudagraph_batch_sizes: list[int] | None = None
         if (
             self.compilation_config.vit_cudagraph_capture_sizes
             and self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE
@@ -561,8 +562,6 @@ def __init__(
             self.vit_cudagraph_batch_sizes = sorted(
                 self.compilation_config.vit_cudagraph_capture_sizes
             )
-        else:
-            self.vit_cudagraph_batch_sizes = None
 
         # Cache the device properties.
         self._init_device_properties()
@@ -2465,14 +2464,7 @@ def _execute_mm_encoder(
 
                                 # Treat padding as a new virtual image.
                                 # Assuming a fixed patch size where height is merge_size.
-                                merge_size = getattr(
-                                    self.model_config.hf_config.vision_config,
-                                    "spatial_merge_size",
-                                    1,
-                                )
-                                assert padding_amount % (merge_size * merge_size) == 0
-                                h_patches = merge_size
-                                w_patches = padding_amount // h_patches
+                                h_patches, w_patches = self._get_dummy_h_w_patches(padding_amount)
                                 padding_grid_info = torch.tensor(
                                     [[1, h_patches, w_patches]],
                                     dtype=image_grid_thw.dtype,
@@ -2499,11 +2491,11 @@ def _execute_mm_encoder(
                         should_time, mm_lora_refs, current_item_idx, num_items
                     ):
                         curr_group_outputs = model.embed_multimodal(**mm_kwargs_group)
-                        # Remove the padded items before sanity check
-                        if original_num_imgs != -1:
-                            curr_group_outputs = curr_group_outputs[:original_num_imgs]
+                    # Remove the padded items before sanity check
+                    if original_num_imgs != -1:
+                        curr_group_outputs = curr_group_outputs[:original_num_imgs]
                 else:
-                    with self.timed_encoder_operation(
+                    with set_current_vllm_config(self.vllm_config), self.timed_encoder_operation(
                         should_time, mm_lora_refs, current_item_idx, num_items
                     ):
                         mm_kwargs_group["cudagraph_dispatcher"] = self.cudagraph_dispatcher
@@ -4651,44 +4643,34 @@ def rand_inputs_embeds() -> torch.Tensor:
             yield
             inputs_embeds.fill_(0)
 
+    def _get_dummy_h_w_patches(self, patches: int):  # -> (h_patches, w_patches)
+        vision_config = self.model_config.hf_config.vision_config
+        if hasattr(vision_config, "spatial_merge_size"):
+            merge_size = vision_config.spatial_merge_size
+        elif hasattr(vision_config, "merge_kernel_size"):
+            merge_size = vision_config.merge_kernel_size[0]  # first entry only
+        else:
+            merge_size = 1  # no merge attribute on the vision config
+        # Grid is merge_size rows tall; width takes the rest (h * w == patches).
+        assert patches % (merge_size * merge_size) == 0, (
+            "Number of patches must be multiple of merge_size squared"
+        )
+        h_patches = merge_size
+        w_patches = patches // merge_size
+        return h_patches, w_patches
+
     def _get_dummy_vit_input(
         self, num_image_tokens: int, img_feature_dim: int
     ) -> BatchedTensorInputs:
-        """
-        Generates dummy multimodal inputs for a single image, with a controllable
-        number of resulting image tokens for a Vision Transformer (ViT) like model,
-        ensuring a square-like aspect ratio for the patch grid.
-
-        This is useful for profiling or testing, allowing the creation of inputs
-        that result in a specific number of image tokens after vision encoding.
-
-        Args:
-            num_image_tokens: The desired number of image tokens after encoding.
-
-        Returns:
-            A BatchedTensorInputs dictionary containing `pixel_values` and
-            `image_grid_thw` that can be passed as kwargs to
-            `embed_multimodal`.
-        """
-
-        def _get_dummy_h_w_patches(patches: int):
-            merge_size = getattr(
-                self.model_config.hf_config.vision_config, "spatial_merge_size", 1
-            )
-            assert patches % (merge_size * merge_size) == 0, (
-                "Number of patches must be multiple of merge_size squared"
-            )
-            h_patches = merge_size
-            w_patches = patches // merge_size
-            return h_patches, w_patches
+        """Dummy data for profiling and precompiling ViT."""
 
-        # The first dimension of pixel_values corresponds to the total number of
-        # tokens (patches).
+        # The first dimension of pixel_values corresponds 
+        # to the total number of patches.
         pixel_values = torch.zeros(
             (num_image_tokens, img_feature_dim), dtype=self.dtype, device=self.device
         )
 
-        h_patches, w_patches = _get_dummy_h_w_patches(num_image_tokens)
+        h_patches, w_patches = self._get_dummy_h_w_patches(num_image_tokens)
         image_grid_thw = torch.tensor(
             [[1, h_patches, w_patches]], dtype=torch.long, device=self.device
         )

From ee801444ad6bccadccb6b1a811c0e72be9f26c70 Mon Sep 17 00:00:00 2001
From: Hongjian Zhang <hirokenovo@gmail.com>
Date: Sun, 11 Jan 2026 20:44:51 +0800
Subject: [PATCH 20/35] rebase to v0.13.0

Signed-off-by: Hongjian Zhang <hirokenovo@gmail.com>
---
 vllm/compilation/monitor.py              |  3 --
 vllm/config/compilation.py               |  2 +-
 vllm/config/vllm.py                      |  9 +++-
 vllm/model_executor/models/qwen2_5_vl.py | 50 ++++++++++-----------
 vllm/model_executor/models/qwen3_vl.py   | 56 ++++++++++++------------
 vllm/model_executor/models/vision.py     | 32 +++++++++-----
 vllm/v1/cudagraph_dispatcher.py          | 23 ++++++----
 vllm/v1/worker/gpu_model_runner.py       | 53 ++++++++++++----------
 8 files changed, 124 insertions(+), 104 deletions(-)

diff --git a/vllm/compilation/monitor.py b/vllm/compilation/monitor.py
index ca56574cfb30..2bad5f0a16fc 100644
--- a/vllm/compilation/monitor.py
+++ b/vllm/compilation/monitor.py
@@ -13,8 +13,6 @@
 
 
 def start_monitoring_torch_compile(vllm_config: VllmConfig) -> None:
-    vllm_config.is_in_compile = True
-
     global torch_compile_start_time
     torch_compile_start_time = time.time()
 
@@ -43,7 +41,6 @@ def end_monitoring_torch_compile(vllm_config: VllmConfig) -> None:
             context_manager.__exit__(None, None, None)
             context_manager = None
 
-    vllm_config.is_in_compile = False
 
 cudagraph_capturing_enabled: bool = True
 
diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py
index 99543929f2dc..c573f2636be4 100644
--- a/vllm/config/compilation.py
+++ b/vllm/config/compilation.py
@@ -534,7 +534,7 @@ class CompilationConfig:
     """Sizes to capture vit cudagraph.
     - None (default): capture sizes are inferred from vllm config.
     - list[int]: capture sizes are specified as given."""
-    max_vit_cudagraph_capture_size: int | None = field(default=None)
+    max_vit_cudagraph_capture_size: int = field(default=None)
     """The maximum vit cudagraph capture size.
     """
     cudagraph_copy_inputs: bool = False
diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py
index 9f05046839c3..975045d085f8 100644
--- a/vllm/config/vllm.py
+++ b/vllm/config/vllm.py
@@ -257,8 +257,13 @@ class VllmConfig:
     performance, with -O0 having the best startup time and -O3 having the best
     performance. -02 is used by defult. See  OptimizationLevel for full
     description."""
-    is_in_compile: bool = False
-    """For ViT Compile, Compile Status Flag"""
+    is_in_compile_or_vit_cuda_graph_capture: bool = False
+    """Flag for ViT compilation or ViT CUDA graph capture.
+    
+    If true, ViT in DP mode will execute the ViT model directly instead of
+    `run_dp_sharded_mrope_vision_model` to ensure correct memory profiling
+    and compilation for each rank.
+    """
 
     def compute_hash(self) -> str:
         """
diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py
index cd23032b0d0c..ce133ac02205 100644
--- a/vllm/model_executor/models/qwen2_5_vl.py
+++ b/vllm/model_executor/models/qwen2_5_vl.py
@@ -27,7 +27,6 @@
 """Inference-only Qwen2.5-VL model compatible with HuggingFace weights."""
 
 from collections.abc import Callable, Iterable, Iterator, Mapping, Sequence
-from contextlib import nullcontext
 from functools import lru_cache, partial
 from typing import Annotated, Any, Literal, TypeAlias
 
@@ -44,10 +43,10 @@
 )
 
 from vllm.compilation.decorators import support_torch_compile
-from vllm.config import CUDAGraphMode, MultiModalConfig, VllmConfig, get_current_vllm_config
+from vllm.config import CUDAGraphMode, MultiModalConfig, VllmConfig, set_current_vllm_config, get_current_vllm_config
 from vllm.distributed import parallel_state
 from vllm.distributed import utils as dist_utils
-from vllm.forward_context import get_forward_context, set_forward_context, is_forward_context_available
+from vllm.forward_context import get_forward_context, is_forward_context_available
 from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import get_act_and_mul_fn
 from vllm.model_executor.layers.attention import MMEncoderAttention
@@ -643,7 +642,8 @@ def __init__(
             )
         vllm_config: VllmConfig = get_current_vllm_config()
         self._persistent_hidden_states_buffer: torch.Tensor | None = None
-        self._persistent_rotary_pos_emb_buffer: torch.Tensor | None = None
+        self._persistent_rotary_pos_emb_cos_buffer: torch.Tensor | None = None
+        self._persistent_rotary_pos_emb_sin_buffer: torch.Tensor | None = None
         if vllm_config.compilation_config.vit_cudagraph_capture_sizes:
             max_compile_size = (
                 vllm_config.compilation_config.vit_cudagraph_capture_sizes[-1]
@@ -653,10 +653,11 @@ def __init__(
                 device=self.device,
                 dtype=self.dtype,
             )
-            self._persistent_rotary_pos_emb_buffer = torch.empty(
-                (max_compile_size, head_dim // 2),
-                device=self.device,
-                dtype=torch.float32,
+            self._persistent_rotary_pos_emb_cos_buffer = torch.empty(
+                (max_compile_size, head_dim // 2), device=self.device, dtype=torch.bfloat16
+            )
+            self._persistent_rotary_pos_emb_sin_buffer = torch.empty(
+                (max_compile_size, head_dim // 2), device=self.device, dtype=torch.bfloat16
             )
 
     @property
@@ -802,7 +803,9 @@ def forward(
         cu_window_seqlens: list = [torch.tensor([0], dtype=torch.int32)]
         cu_seqlens: list = []
 
-        fwd_ctx = get_forward_context()
+        fwd_ctx = None
+        if is_forward_context_available():  
+            fwd_ctx = get_forward_context()
         if (
             self._persistent_hidden_states_buffer is not None
             and fwd_ctx
@@ -872,14 +875,17 @@ def forward(
         rotary_pos_emb_sin = rotary_pos_emb_sin.to(
             device=self.device, non_blocking=True
         )
-        rotary_pos_emb = rotary_pos_emb.to(device=self.device, non_blocking=True)
         if (
-            self._persistent_rotary_pos_emb_buffer is not None
+            self._persistent_rotary_pos_emb_sin_buffer is not None
+            and self._persistent_rotary_pos_emb_cos_buffer is not None
             and fwd_ctx
             and fwd_ctx.cudagraph_runtime_mode == CUDAGraphMode.PIECEWISE
         ):
-            rotary_pos_emb = self._persistent_rotary_pos_emb_buffer[:seq_len].copy_(
-                rotary_pos_emb
+            rotary_pos_emb_sin = self._persistent_rotary_pos_emb_sin_buffer[:seq_len].copy_(
+                rotary_pos_emb_sin
+            )
+            rotary_pos_emb_cos = self._persistent_rotary_pos_emb_cos_buffer[:seq_len].copy_(
+                rotary_pos_emb_cos
             )
         window_index = window_index.to(device=hidden_states.device, non_blocking=True)
         reverse_indices = reverse_indices.to(
@@ -1268,18 +1274,8 @@ def _process_image_input(
             image_embeds = image_input["image_embeds"].type(self.visual.dtype)
         else:
             pixel_values = image_input["pixel_values"]
-            maybe_in_vit_cuda_graph_capture = False
-            if is_forward_context_available():
-                ctx = get_forward_context()
-                if ctx.cudagraph_runtime_mode != CUDAGraphMode.NONE:
-                    maybe_in_vit_cuda_graph_capture = True
-            context = (
-                set_forward_context(None, self.vllm_config)
-                if self.vllm_config.is_in_compile
-                else nullcontext()
-            )
-            with context:
-                if self.use_data_parallel and not maybe_in_vit_cuda_graph_capture:
+            with set_current_vllm_config(self.vllm_config):
+                if self.use_data_parallel and not self.vllm_config.is_in_compile_or_vit_cuda_graph_capture:
                     return run_dp_sharded_mrope_vision_model(
                         self.visual,
                         pixel_values,
@@ -1338,8 +1334,8 @@ def _process_video_input(
             video_embeds = video_input["video_embeds"].type(self.visual.dtype)
         else:
             pixel_values_videos = video_input["pixel_values_videos"]
-            with set_forward_context(None, self.vllm_config):
-                if self.use_data_parallel:
+            with set_current_vllm_config(self.vllm_config):
+                if self.use_data_parallel and not self.vllm_config.is_in_compile_or_vit_cuda_graph_capture:
                     return run_dp_sharded_mrope_vision_model(
                         self.visual,
                         pixel_values_videos,
diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py
index 4ddc0c85e627..9ee61fb4851c 100644
--- a/vllm/model_executor/models/qwen3_vl.py
+++ b/vllm/model_executor/models/qwen3_vl.py
@@ -25,7 +25,6 @@
 """Inference-only Qwen3VL model compatible with HuggingFace weights."""
 
 from collections.abc import Callable, Iterable, Iterator, Mapping, Sequence
-from contextlib import nullcontext
 from functools import lru_cache, partial
 from itertools import islice
 from typing import Any
@@ -50,10 +49,10 @@
 from transformers.video_utils import VideoMetadata
 
 from vllm.compilation.decorators import support_torch_compile
-from vllm.config import CUDAGraphMode, MultiModalConfig, VllmConfig, get_current_vllm_config
+from vllm.config import CUDAGraphMode, MultiModalConfig, VllmConfig, set_current_vllm_config, get_current_vllm_config
 from vllm.config.multimodal import BaseDummyOptions, VideoDummyOptions
 from vllm.distributed import get_pp_group
-from vllm.forward_context import get_forward_context, set_forward_context, is_forward_context_available
+from vllm.forward_context import get_forward_context, is_forward_context_available
 from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import _ACTIVATION_REGISTRY
 from vllm.model_executor.layers.conv import Conv3dLayer
@@ -142,7 +141,8 @@
 DUMMY_VIDEO_NUM_FRAMES = 2048
 
 
-@support_torch_compile(dynamic_arg_dims={"x": 0})
+@support_torch_compile(dynamic_arg_dims={"x": 0},
+    enable_if=should_torch_compile_mm_vit)
 class Qwen3_VisionPatchEmbed(nn.Module):
     def __init__(
         self,
@@ -210,8 +210,7 @@ def forward(self, x: torch.Tensor):
 
 
 @support_torch_compile(
-    dynamic_arg_dims={"x": 0, "cu_seqlens": 0, "rotary_pos_emb": 0, "seqlens": 0},
-    mark_unbacked_dims={"seqlens": 0},
+    dynamic_arg_dims={"x": 0, "cu_seqlens": 0, "rotary_pos_emb_cos": 0, "rotary_pos_emb_sin": 0},
     enable_if=should_torch_compile_mm_vit,
 )
 class Qwen3_VisionBlock(nn.Module):
@@ -426,14 +425,18 @@ def __init__(
             )
         vllm_config: VllmConfig = get_current_vllm_config()
         self._persistent_hidden_states_buffer: torch.Tensor | None = None
-        self._persistent_rotary_pos_emb_buffer: torch.Tensor | None = None
+        self._persistent_rotary_pos_emb_cos_buffer: torch.Tensor | None = None
+        self._persistent_rotary_pos_emb_sin_buffer: torch.Tensor | None = None
         if vllm_config.compilation_config.vit_cudagraph_capture_sizes:
             max_compile_size = vllm_config.compilation_config.vit_cudagraph_capture_sizes[-1]
             self._persistent_hidden_states_buffer = torch.empty(
                 (max_compile_size, self.patch_embed.proj.input_size), device=self.device, dtype=self.dtype
             )
-            self._persistent_rotary_pos_emb_buffer = torch.empty(
-                (max_compile_size, head_dim // 2), device=self.device, dtype=torch.float32
+            self._persistent_rotary_pos_emb_cos_buffer = torch.empty(
+                (max_compile_size, head_dim // 2), device=self.device, dtype=torch.bfloat16
+            )
+            self._persistent_rotary_pos_emb_sin_buffer = torch.empty(
+                (max_compile_size, head_dim // 2), device=self.device, dtype=torch.bfloat16
             )
 
     @property
@@ -567,7 +570,9 @@ def forward(
         grid_thw: torch.Tensor | list[list[int]],
     ) -> torch.Tensor:
         seq_len, _ = x.size()
-        fwd_ctx = get_forward_context()
+        fwd_ctx = None
+        if is_forward_context_available():  
+            fwd_ctx = get_forward_context()
         if (
             self._persistent_hidden_states_buffer is not None
             and fwd_ctx
@@ -597,15 +602,17 @@ def forward(
         original_hidden_states = hidden_states
         hidden_states = hidden_states + pos_embeds
         rotary_pos_emb_cos, rotary_pos_emb_sin = self.rot_pos_emb(grid_thw_list)
-        rotary_pos_emb = self.rot_pos_emb(grid_thw_list)
-        rotary_pos_emb = rotary_pos_emb.to(hidden_states.device, non_blocking=True)
         if (
-            self._persistent_rotary_pos_emb_buffer is not None
+            self._persistent_rotary_pos_emb_sin_buffer is not None
+            and self._persistent_rotary_pos_emb_cos_buffer is not None
             and fwd_ctx
             and fwd_ctx.cudagraph_runtime_mode == CUDAGraphMode.PIECEWISE
         ):
-            rotary_pos_emb = self._persistent_rotary_pos_emb_buffer[:seq_len].copy_(
-                rotary_pos_emb
+            rotary_pos_emb_sin = self._persistent_rotary_pos_emb_sin_buffer[:seq_len].copy_(
+                rotary_pos_emb_sin
+            )
+            rotary_pos_emb_cos = self._persistent_rotary_pos_emb_cos_buffer[:seq_len].copy_(
+                rotary_pos_emb_cos
             )
 
         cu_seqlens = np.repeat(grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]).cumsum(
@@ -1494,18 +1501,9 @@ def _process_image_input(
             image_embeds = image_input["image_embeds"].type(self.visual.dtype)
         else:
             pixel_values = image_input["pixel_values"].type(self.visual.dtype)
-            maybe_in_vit_cuda_graph_capture = False
-            if is_forward_context_available():
-                ctx = get_forward_context()
-                if ctx.cudagraph_runtime_mode != CUDAGraphMode.NONE:
-                    maybe_in_vit_cuda_graph_capture = True
-            context = (
-                set_forward_context(None, self.vllm_config)
-                if self.vllm_config.is_in_compile
-                else nullcontext()
-            )
-            with context:
-                if self.use_data_parallel and not maybe_in_vit_cuda_graph_capture:
+
+            with set_current_vllm_config(self.vllm_config):
+                if self.use_data_parallel and not self.vllm_config.is_in_compile_or_vit_cuda_graph_capture:
                     return run_dp_sharded_mrope_vision_model(
                         self.visual,
                         pixel_values,
@@ -1535,8 +1533,8 @@ def _process_video_input(
             pixel_values_videos = video_input["pixel_values_videos"].type(
                 self.visual.dtype
             )
-            with set_forward_context(None, self.vllm_config):
-                if self.use_data_parallel:
+            with set_current_vllm_config(self.vllm_config):
+                if self.use_data_parallel and not self.vllm_config.is_in_compile_or_vit_cuda_graph_capture:
                     return run_dp_sharded_mrope_vision_model(
                         self.visual,
                         pixel_values_videos,
diff --git a/vllm/model_executor/models/vision.py b/vllm/model_executor/models/vision.py
index d4e9dd9fa159..70e9a0011811 100644
--- a/vllm/model_executor/models/vision.py
+++ b/vllm/model_executor/models/vision.py
@@ -10,7 +10,7 @@
 import torch
 from transformers import PretrainedConfig
 
-from vllm.config import MultiModalConfig, VllmConfig, get_current_vllm_config
+from vllm.config import MultiModalConfig, VllmConfig, CUDAGraphMode, get_current_vllm_config
 from vllm.distributed import (
     get_tensor_model_parallel_rank,
     get_tensor_model_parallel_world_size,
@@ -479,10 +479,25 @@ def run_dp_sharded_mrope_vision_model(
     vllm_config = get_current_vllm_config()
     use_cudagraph = False
 
+    # Context setup
+    if cudagraph_dispatcher is not None:
+        dispatcher = cudagraph_dispatcher
+    else:
+        dispatcher = CudagraphDispatcher(vllm_config)
+    cudagraph_runtime_mode = CUDAGraphMode.NONE
+    batch_descriptor = None
+
     if (vllm_config and
         vllm_config.compilation_config.vit_cudagraph_capture_sizes):
         max_input_len = max(grouped_pixel_values_len) if grouped_pixel_values_len else 0
-        target_input_len = vllm_config.pad_for_vit_cudagraph(max_input_len)
+        cudagraph_runtime_mode, batch_descriptor = dispatcher.dispatch(
+            num_tokens=max_input_len,
+            uniform_decode=False,
+            has_lora=False,
+            disable_full=False,
+            is_vit=True,
+        )
+        target_input_len = batch_descriptor.num_tokens
         max_len_per_rank = target_input_len // embed_dim_reduction_factor
         use_cudagraph = True
     else:
@@ -506,17 +521,10 @@ def run_dp_sharded_mrope_vision_model(
             pixel_values_local = torch.cat([pixel_values_local, padding], dim=0)
             local_grid_thw_list.append([1, merge_size, padding_size // merge_size])
 
-    # Context setup
-    if cudagraph_dispatcher is not None:
-        dispatcher = cudagraph_dispatcher
-    else:
-        dispatcher = CudagraphDispatcher(vllm_config)
-    batch_descriptor = BatchDescriptor(num_tokens=pixel_values_local.shape[0], is_vit=True)
-    cudagraph_runtime_mode, batch_descriptor = dispatcher.dispatch(batch_descriptor, False)
     with set_forward_context(
-        None, 
-        vllm_config=vllm_config, 
-        cudagraph_runtime_mode=cudagraph_runtime_mode, 
+        None,
+        vllm_config=vllm_config,
+        cudagraph_runtime_mode=cudagraph_runtime_mode,
         batch_descriptor=batch_descriptor
     ):
         # Run the vision model on the local pixel_values_local
diff --git a/vllm/v1/cudagraph_dispatcher.py b/vllm/v1/cudagraph_dispatcher.py
index 3368f97fe3b3..71226d7c283d 100644
--- a/vllm/v1/cudagraph_dispatcher.py
+++ b/vllm/v1/cudagraph_dispatcher.py
@@ -125,10 +125,14 @@ def _create_padded_batch_descriptor(
         uniform_decode: bool,
         has_lora: bool,
         num_active_loras: int = 0,
+        is_vit: bool = False,
     ) -> BatchDescriptor:
         max_num_seqs = self.vllm_config.scheduler_config.max_num_seqs
         uniform_decode_query_len = self.uniform_decode_query_len
-        num_tokens_padded = self._bs_to_padded_graph_size[num_tokens]
+        if is_vit:
+            num_tokens_padded = self.vllm_config.pad_for_vit_cudagraph(num_tokens)
+        else:
+            num_tokens_padded = self._bs_to_padded_graph_size[num_tokens]
 
         if uniform_decode and self.cudagraph_mode.has_mode(CUDAGraphMode.FULL):
             num_reqs = num_tokens_padded // uniform_decode_query_len
@@ -143,6 +147,7 @@ def _create_padded_batch_descriptor(
             uniform=uniform_decode,
             has_lora=has_lora,
             num_active_loras=num_active_loras,
+            is_vit=is_vit
         )
 
     def add_cudagraph_key(
@@ -187,12 +192,12 @@ def initialize_cudagraph_keys(
                     ).relax_for_mixed_batch_cudagraphs(),
                 )
             # ViT CUDAGraph Entry
-            for vit_patch_len in self.compilation_config.vit_cudagraph_capture_sizes:
+            for patch_len in self.compilation_config.vit_cudagraph_capture_sizes:
                 self.add_cudagraph_key(
                     cudagraph_mode.mixed_mode(),
-                    BatchDescriptor(
-                        num_tokens=vit_patch_len, uniform_decode=False, is_vit=True
-                    ),
+                    self._create_padded_batch_descriptor(
+                        patch_len, False, False, is_vit=True
+                    ).relax_for_mixed_batch_cudagraphs(),
                 )
 
         # if decode cudagraph mode is FULL, and we don't already have mixed
@@ -229,6 +234,7 @@ def dispatch(
         has_lora: bool = False,
         disable_full: bool = False,
         num_active_loras: int = 0,
+        is_vit: bool = False,
     ) -> tuple[CUDAGraphMode, BatchDescriptor]:
         """
         Given conditions(e.g.,batch descriptor and if using piecewise only),
@@ -249,9 +255,10 @@ def dispatch(
         if (
             not self.keys_initialized
             or self.cudagraph_mode == CUDAGraphMode.NONE
-            or num_tokens > self.compilation_config.max_cudagraph_capture_size
+            or (not is_vit and num_tokens > self.compilation_config.max_cudagraph_capture_size)
+            or (is_vit and num_tokens > self.compilation_config.max_vit_cudagraph_capture_size)
         ):
-            return CUDAGraphMode.NONE, BatchDescriptor(num_tokens)
+            return CUDAGraphMode.NONE, BatchDescriptor(num_tokens, is_vit=is_vit)
 
         effective_num_active_loras = num_active_loras
         if has_lora and num_active_loras > 0:
@@ -270,7 +277,7 @@ def dispatch(
                 effective_num_active_loras = self.vllm_config.lora_config.max_loras + 1
 
         batch_desc = self._create_padded_batch_descriptor(
-            num_tokens, uniform_decode, has_lora, effective_num_active_loras
+            num_tokens, uniform_decode, has_lora, effective_num_active_loras, is_vit
         )
         relaxed_batch_desc = batch_desc.relax_for_mixed_batch_cudagraphs()
 
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 3db59c96cb30..5d88be3212d0 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -28,7 +28,6 @@
     CompilationMode,
     CUDAGraphMode,
     VllmConfig,
-    set_current_vllm_config,
     get_layers_from_vllm_config,
     update_config,
 )
@@ -2437,14 +2436,23 @@ def _execute_mm_encoder(
                 if not is_vit_dp_mode:
                     original_num_imgs = -1
                     padded_num_tokens = -1
+
+                    # Default values for non-ViT cudagraph case
+                    cudagraph_runtime_mode = CUDAGraphMode.NONE
+                    batch_descriptor = None
                     if self.vit_cudagraph_batch_sizes and "pixel_values" in mm_kwargs_group:
                         pixel_values = mm_kwargs_group["pixel_values"]
                         num_tokens = pixel_values.shape[0]
 
-                        # Pad to the size expected by CUDA graph
-                        padded_num_tokens = self.vllm_config.pad_for_vit_cudagraph(
-                            num_tokens
+                        # get batch_descriptor from dispatcher
+                        cudagraph_runtime_mode, batch_descriptor = self.cudagraph_dispatcher.dispatch(
+                            num_tokens=num_tokens,
+                            uniform_decode=False,
+                            has_lora=False,
+                            disable_full=False,
+                            is_vit=True,
                         )
+                        padded_num_tokens = batch_descriptor.num_tokens
 
                         if padded_num_tokens > num_tokens:
                             padding_amount = padded_num_tokens - num_tokens
@@ -2474,19 +2482,11 @@ def _execute_mm_encoder(
                                     [image_grid_thw, padding_grid_info], dim=0
                                 )
 
-                    # get batch_descriptor from dispatcher
-                    batch_descriptor = BatchDescriptor(
-                        num_tokens=padded_num_tokens,
-                        is_vit=True,
-                    )
-                    cudagraph_runtime_mode, batch_descriptor = (
-                        self.cudagraph_dispatcher.dispatch(batch_descriptor, False)
-                    )
                     with set_forward_context(
-                        None,
-                        vllm_config=self.vllm_config,
-                        cudagraph_runtime_mode=cudagraph_runtime_mode,
-                        batch_descriptor=batch_descriptor,
+                            None,
+                            vllm_config=self.vllm_config,
+                            cudagraph_runtime_mode=cudagraph_runtime_mode,
+                            batch_descriptor=batch_descriptor,
                         ), self.timed_encoder_operation(
                         should_time, mm_lora_refs, current_item_idx, num_items
                     ):
@@ -5213,6 +5213,7 @@ def _dummy_mm_encoder_run(
         self,
         compilation_cases: list[int],
     ) -> None:
+        self.vllm_config.is_in_compile_or_vit_cuda_graph_capture = True
         tmp_dummy_mm_inputs = self._get_mm_dummy_batch(
             "video",
             1,
@@ -5228,21 +5229,27 @@ def _dummy_mm_encoder_run(
         # Lazy initialization of the persistent buffer
         for capture_size in compilation_cases:
             dummy_mm_inputs = self._get_dummy_vit_input(capture_size, img_feature_dim)
-            batch_descriptor = BatchDescriptor(
+            cudagraph_mode, batch_descriptor = self.cudagraph_dispatcher.dispatch(
                 num_tokens=capture_size,
+                uniform_decode=False,
+                has_lora=False,
+                disable_full=False,
                 is_vit=True,
             )
             with (
                 set_forward_context(
                     None,
-                    vllm_config=self.vllm_config,
-                    cudagraph_runtime_mode=CUDAGraphMode.PIECEWISE,
+                    self.vllm_config,
+                    num_tokens=capture_size,
+                    cudagraph_runtime_mode=cudagraph_mode,
                     batch_descriptor=batch_descriptor,
                 ),
             ):
                 self.model.embed_multimodal(**dummy_mm_inputs)
+        self.vllm_config.is_in_compile_or_vit_cuda_graph_capture = False
 
     def profile_run(self) -> None:
+        self.vllm_config.is_in_compile_or_vit_cuda_graph_capture = True
         # Profile with multimodal encoder & encoder cache.
         if self.supports_mm_inputs:
             mm_config = self.model_config.multimodal_config
@@ -5280,9 +5287,10 @@ def profile_run(self) -> None:
                     )
 
                     # Run multimodal encoder.
-                    dummy_encoder_outputs = self.model.embed_multimodal(
-                        **batched_dummy_mm_inputs
-                    )
+                    with set_forward_context(None, self.vllm_config):
+                        dummy_encoder_outputs = self.model.embed_multimodal(
+                            **batched_dummy_mm_inputs
+                        )
 
                     sanity_check_mm_encoder_outputs(
                         dummy_encoder_outputs,
@@ -5306,6 +5314,7 @@ def profile_run(self) -> None:
         del hidden_states, output
         self.encoder_cache.clear()
         gc.collect()
+        self.vllm_config.is_in_compile_or_vit_cuda_graph_capture = False
 
     def capture_model(self) -> int:
         if self.compilation_config.cudagraph_mode == CUDAGraphMode.NONE:

From f8defd77f5ad37da20966186502887c9762b0ea5 Mon Sep 17 00:00:00 2001
From: Hongjian Zhang <hirokenovo@gmail.com>
Date: Sun, 11 Jan 2026 22:15:40 +0800
Subject: [PATCH 21/35] chore: Reduce unnecessary computations in ViT dp mode

Signed-off-by: Hongjian Zhang <hirokenovo@gmail.com>
---
 vllm/model_executor/models/vision.py | 23 +++++++----------------
 1 file changed, 7 insertions(+), 16 deletions(-)

diff --git a/vllm/model_executor/models/vision.py b/vllm/model_executor/models/vision.py
index 70e9a0011811..6fff76a44a25 100644
--- a/vllm/model_executor/models/vision.py
+++ b/vllm/model_executor/models/vision.py
@@ -476,8 +476,10 @@ def run_dp_sharded_mrope_vision_model(
     # The output embedding of every DP rank has to be
     # padded to this length for tensor_model_parallel_all_gather
     # to work
+    max_len_per_rank = max(grouped_pixel_values_len) // embed_dim_reduction_factor
+    local_grid_thw_list = [grid_thw_list[i] for i in image_idxs_local]
+
     vllm_config = get_current_vllm_config()
-    use_cudagraph = False
 
     # Context setup
     if cudagraph_dispatcher is not None:
@@ -489,28 +491,17 @@ def run_dp_sharded_mrope_vision_model(
 
     if (vllm_config and
         vllm_config.compilation_config.vit_cudagraph_capture_sizes):
-        max_input_len = max(grouped_pixel_values_len) if grouped_pixel_values_len else 0
+        current_input_len = pixel_values_local.shape[0]
         cudagraph_runtime_mode, batch_descriptor = dispatcher.dispatch(
-            num_tokens=max_input_len,
+            num_tokens=current_input_len,
             uniform_decode=False,
             has_lora=False,
             disable_full=False,
             is_vit=True,
         )
         target_input_len = batch_descriptor.num_tokens
-        max_len_per_rank = target_input_len // embed_dim_reduction_factor
-        use_cudagraph = True
-    else:
-        max_len_per_rank = (max(grouped_pixel_values_len) if grouped_pixel_values_len else 0) // embed_dim_reduction_factor
-
-    local_grid_thw_list = [grid_thw_list[i] for i in image_idxs_local]
-
-    # Pad pixel_values_local for CUDA graph if needed
-    if use_cudagraph:
-        current_input_len = pixel_values_local.shape[0]
-        # target_input_len derived from max_len_per_rank for consistency
-        target_input_len = max_len_per_rank * embed_dim_reduction_factor
-        
+    
+        # Pad pixel_values_local for CUDA graph if needed
         if current_input_len < target_input_len:
             padding_size = target_input_len - current_input_len
             padding = torch.empty(

From 602c69273dc32be501594112e258a49715004780 Mon Sep 17 00:00:00 2001
From: Hongjian Zhang <hirokenovo@gmail.com>
Date: Fri, 16 Jan 2026 14:22:52 +0800
Subject: [PATCH 22/35] fix: truncate padded output in CUDA graph execution to
 prevent all_gather hang

Signed-off-by: Hongjian Zhang <hirokenovo@gmail.com>
---
 vllm/model_executor/models/vision.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/vllm/model_executor/models/vision.py b/vllm/model_executor/models/vision.py
index 6fff76a44a25..0396408ae819 100644
--- a/vllm/model_executor/models/vision.py
+++ b/vllm/model_executor/models/vision.py
@@ -566,6 +566,9 @@ def run_dp_sharded_mrope_vision_model(
                 device=image_embeds_local.device,
             )
         image_embeds_local_padded = torch.cat([image_embeds_local, padding], dim=0)
+    # truncate the padded output from CUDA graph execution
+    elif current_len > max_len_per_rank:
+        image_embeds_local_padded = image_embeds_local[:max_len_per_rank]
     else:
         image_embeds_local_padded = image_embeds_local
 

From c1746c1af1815fa2dfe9dfec646b4e8a06de520d Mon Sep 17 00:00:00 2001
From: Hongjian Zhang <hirokenovo@gmail.com>
Date: Wed, 21 Jan 2026 16:53:45 +0800
Subject: [PATCH 23/35] fix: change padding init from empty to zeros to avoid
 FA3 issues

Signed-off-by: Hongjian Zhang <hirokenovo@gmail.com>
---
 vllm/model_executor/models/vision.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/model_executor/models/vision.py b/vllm/model_executor/models/vision.py
index 0396408ae819..22c9a30c23c6 100644
--- a/vllm/model_executor/models/vision.py
+++ b/vllm/model_executor/models/vision.py
@@ -504,7 +504,7 @@ def run_dp_sharded_mrope_vision_model(
         # Pad pixel_values_local for CUDA graph if needed
         if current_input_len < target_input_len:
             padding_size = target_input_len - current_input_len
-            padding = torch.empty(
+            padding = torch.zeros(
                 (padding_size, pixel_values_local.shape[1]),
                 device=pixel_values_local.device,
                 dtype=pixel_values_local.dtype,

From 99d8272915f67d35baaeab8eef73c82a8e78d9ff Mon Sep 17 00:00:00 2001
From: Hongjian Zhang <hirokenovo@gmail.com>
Date: Fri, 23 Jan 2026 15:39:44 +0800
Subject: [PATCH 24/35] rebase to main 7ef587

Signed-off-by: Hongjian Zhang <hirokenovo@gmail.com>
---
 vllm/compilation/backends.py             |  5 ++---
 vllm/config/compilation.py               | 16 ----------------
 vllm/config/vllm.py                      | 14 --------------
 vllm/model_executor/models/qwen2_5_vl.py | 12 ++++++------
 vllm/model_executor/models/qwen3_vl.py   | 12 ++++++------
 vllm/v1/cudagraph_dispatcher.py          | 19 ++++++++++++++++++-
 6 files changed, 32 insertions(+), 46 deletions(-)

diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py
index 20e74d619adf..615948072f72 100644
--- a/vllm/compilation/backends.py
+++ b/vllm/compilation/backends.py
@@ -56,7 +56,7 @@
 
 
 @contextmanager
-def set_is_last_graph_in_sequence(is_last: bool):
+def set_is_last_graph_in_vit_sequence(is_last: bool):
     """Context manager to indicate if the current graph being compiled
     is the last one in a sequence of graphs (e.g., a sequence of blocks).
     """
@@ -75,7 +75,7 @@ def set_is_last_graph_in_sequence(is_last: bool):
 
 
 @contextmanager
-def set_is_first_graph_in_sequence(is_first: bool):
+def set_is_first_graph_in_vit_sequence(is_first: bool):
     """Context manager to indicate if the current graph being compiled
     is the first one in a sequence of graphs (e.g., a sequence of blocks).
     """
@@ -124,7 +124,6 @@ def copy_and_call(*args: Any) -> Any:
     return copy_and_call
 
 
-
 def make_compiler(compilation_config: CompilationConfig) -> CompilerInterface:
     assert not envs.VLLM_USE_MEGA_AOT_ARTIFACT or envs.VLLM_USE_STANDALONE_COMPILE, (
         "VLLM_USE_MEGA_AOT_ARTIFACT=1 requires VLLM_USE_STANDALONE_COMPILE=1"
diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py
index c573f2636be4..2da96d938765 100644
--- a/vllm/config/compilation.py
+++ b/vllm/config/compilation.py
@@ -1169,19 +1169,3 @@ def get_compile_ranges(self) -> list[Range]:
             Range(start=s + 1, end=e)
             for s, e in zip([0] + split_points[:-1], split_points)
         ]
-
-    def compute_bs_to_padded_vit_graph_size(self):
-        # pre-compute the mapping from batch size to padded graph size
-        self.bs_to_padded_vit_graph_size = [
-            0 for i in range(self.max_vit_cudagraph_capture_size + 1)
-        ]
-        for end, start in zip(
-            self.vit_cudagraph_capture_sizes
-            + [self.max_vit_cudagraph_capture_size + 1],
-            [0] + self.vit_cudagraph_capture_sizes,
-        ):
-            for bs in range(start, end):
-                if bs == start:
-                    self.bs_to_padded_vit_graph_size[bs] = start
-                else:
-                    self.bs_to_padded_vit_graph_size[bs] = end
diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py
index 975045d085f8..90b54e9e2396 100644
--- a/vllm/config/vllm.py
+++ b/vllm/config/vllm.py
@@ -372,18 +372,6 @@ def pad_for_cudagraph(self, batch_size: int) -> int:
         # i.e., batch_size <= self.compilation_config.max_cudagraph_capture_size
         return self.compilation_config.bs_to_padded_graph_size[batch_size]
 
-    def pad_for_vit_cudagraph(self, batch_size: int) -> int:
-        if (
-            self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE
-            and hasattr(self.compilation_config, "max_vit_cudagraph_capture_size")
-            and self.compilation_config.max_vit_cudagraph_capture_size
-            and batch_size <= self.compilation_config.max_vit_cudagraph_capture_size
-        ):
-            # Use CUDA graphs.
-            # Add padding to the batch size.
-            return self.compilation_config.bs_to_padded_vit_graph_size[batch_size]
-        return batch_size
-    
     @property
     def needs_dp_coordinator(self) -> bool:
         """
@@ -1469,8 +1457,6 @@ def _set_vit_cudagraph_sizes(self):
             self.compilation_config.max_vit_cudagraph_capture_size = 0
             self.compilation_config.vit_cudagraph_capture_sizes = []
 
-        self.compilation_config.compute_bs_to_padded_vit_graph_size()
-
     def try_verify_and_update_config(self):
         if self.model_config is None:
             return
diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py
index ce133ac02205..a7c16401360a 100644
--- a/vllm/model_executor/models/qwen2_5_vl.py
+++ b/vllm/model_executor/models/qwen2_5_vl.py
@@ -817,11 +817,11 @@ def forward(
             hidden_states = x.to(device=self.device, dtype=self.dtype)
 
         from vllm.compilation.backends import (
-            set_is_first_graph_in_sequence,
-            set_is_last_graph_in_sequence,
+            set_is_first_graph_in_vit_sequence,
+            set_is_last_graph_in_vit_sequence,
         )
 
-        with set_is_first_graph_in_sequence(True), set_is_last_graph_in_sequence(False):
+        with set_is_first_graph_in_vit_sequence(True), set_is_last_graph_in_vit_sequence(False):
             hidden_states = self.patch_embed(hidden_states)
 
         window_index_id = 0
@@ -913,8 +913,8 @@ def forward(
             hidden_states = original_hidden_states
 
         with (
-            set_is_first_graph_in_sequence(False),
-            set_is_last_graph_in_sequence(False),
+            set_is_first_graph_in_vit_sequence(False),
+            set_is_last_graph_in_vit_sequence(False),
         ):
             for layer_num, blk in enumerate(self.blocks):
                 if layer_num in self.fullatt_block_indexes:
@@ -938,7 +938,7 @@ def forward(
             hidden_states = cast_overflow_tensors(hidden_states)
 
         # adapter
-        with set_is_first_graph_in_sequence(False), set_is_last_graph_in_sequence(True):
+        with set_is_first_graph_in_vit_sequence(False), set_is_last_graph_in_vit_sequence(True):
             hidden_states = self.merger(hidden_states)
         hidden_states = hidden_states[reverse_indices, :]
         return hidden_states
diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py
index 9ee61fb4851c..c012e17adfcb 100644
--- a/vllm/model_executor/models/qwen3_vl.py
+++ b/vllm/model_executor/models/qwen3_vl.py
@@ -584,11 +584,11 @@ def forward(
             hidden_states = x.to(device=self.device, dtype=self.dtype, non_blocking=True)
 
         from vllm.compilation.backends import (
-            set_is_first_graph_in_sequence,
-            set_is_last_graph_in_sequence,
+            set_is_first_graph_in_vit_sequence,
+            set_is_last_graph_in_vit_sequence,
         )
 
-        with set_is_first_graph_in_sequence(True), set_is_last_graph_in_sequence(False):
+        with set_is_first_graph_in_vit_sequence(True), set_is_last_graph_in_vit_sequence(False):
             hidden_states = self.patch_embed(hidden_states)
 
         if isinstance(grid_thw, list):
@@ -639,8 +639,8 @@ def forward(
 
         deepstack_feature_lists = []
         with (
-            set_is_first_graph_in_sequence(False),
-            set_is_last_graph_in_sequence(False),
+            set_is_first_graph_in_vit_sequence(False),
+            set_is_last_graph_in_vit_sequence(False),
         ):
             for layer_num, blk in enumerate(self.blocks):
                 hidden_states = blk(
@@ -656,7 +656,7 @@ def forward(
                         hidden_states
                 )
                     deepstack_feature_lists.append(deepstack_feature)
-        with set_is_first_graph_in_sequence(False), set_is_last_graph_in_sequence(True):
+        with set_is_first_graph_in_vit_sequence(False), set_is_last_graph_in_vit_sequence(True):
             hidden_states = self.merger(hidden_states)
         hidden_states = torch.cat(
             [hidden_states] + deepstack_feature_lists, dim=1
diff --git a/vllm/v1/cudagraph_dispatcher.py b/vllm/v1/cudagraph_dispatcher.py
index 71226d7c283d..90572805f750 100644
--- a/vllm/v1/cudagraph_dispatcher.py
+++ b/vllm/v1/cudagraph_dispatcher.py
@@ -119,6 +119,22 @@ def _get_lora_cases(self) -> list[int]:
             # No specialization: only capture graphs with LoRA active
             return [lora_config.max_loras + 1]
 
+    def _compute_bs_to_padded_vit_graph_size(self) -> None:
+        """pre-compute the mapping from batch size to ViT padded graph size."""
+        max_size = self.compilation_config.max_vit_cudagraph_capture_size
+        capture_sizes = self.compilation_config.vit_cudagraph_capture_sizes
+        self._bs_to_padded_vit_graph_size: list[int] = [0] * (max_size + 1)
+        for end, start in zip(
+            capture_sizes + [max_size + 1],
+            [0] + capture_sizes,
+        ):
+            for bs in range(start, end):
+                if bs == start:
+                    self._bs_to_padded_vit_graph_size[bs] = start
+                else:
+                    self._bs_to_padded_vit_graph_size[bs] = end
+        
+
     def _create_padded_batch_descriptor(
         self,
         num_tokens: int,
@@ -130,7 +146,7 @@ def _create_padded_batch_descriptor(
         max_num_seqs = self.vllm_config.scheduler_config.max_num_seqs
         uniform_decode_query_len = self.uniform_decode_query_len
         if is_vit:
-            num_tokens_padded = self.vllm_config.pad_for_vit_cudagraph(num_tokens)
+            num_tokens_padded = self._bs_to_padded_vit_graph_size[num_tokens]
         else:
             num_tokens_padded = self._bs_to_padded_graph_size[num_tokens]
 
@@ -171,6 +187,7 @@ def initialize_cudagraph_keys(
             return
 
         self._compute_bs_to_padded_graph_size()
+        self._compute_bs_to_padded_vit_graph_size()
 
         # Get LoRA cases to capture
         lora_cases = self._get_lora_cases()

From 79ea2407131d3dbdd7c90f4eab7f3491f8876539 Mon Sep 17 00:00:00 2001
From: Hongjian Zhang <hirokenovo@gmail.com>
Date: Mon, 26 Jan 2026 15:53:59 +0800
Subject: [PATCH 25/35] rebase to ff6c1d

Signed-off-by: Hongjian Zhang <hirokenovo@gmail.com>
---
 vllm/v1/cudagraph_dispatcher.py    | 8 +++++---
 vllm/v1/worker/gpu_model_runner.py | 1 +
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/vllm/v1/cudagraph_dispatcher.py b/vllm/v1/cudagraph_dispatcher.py
index 90572805f750..0048ef61c3c3 100644
--- a/vllm/v1/cudagraph_dispatcher.py
+++ b/vllm/v1/cudagraph_dispatcher.py
@@ -315,7 +315,7 @@ def dispatch(
         # finally, just return no cudagraphs and a trivial batch descriptor
         return CUDAGraphMode.NONE, BatchDescriptor(num_tokens)
 
-    def get_capture_descs(self) -> list[tuple[CUDAGraphMode, list[BatchDescriptor]]]:
+    def get_capture_descs(self, is_vit: bool = False) -> list[tuple[CUDAGraphMode, list[BatchDescriptor]]]:
         """
         Returns capture descriptors for cudagraph capturing.
 
@@ -333,7 +333,9 @@ def get_capture_descs(self) -> list[tuple[CUDAGraphMode, list[BatchDescriptor]]]
             descs = list(self.cudagraph_keys[mode])
             if descs:
                 # Sort by num_tokens descending (largest first)
-                descs.sort(key=lambda d: d.num_tokens, reverse=True)
-                result.append((mode, descs))
+                filter_descs = [d for d in descs if d.is_vit == is_vit]
+                if filter_descs:
+                    filter_descs.sort(key=lambda d: d.num_tokens, reverse=True)
+                    result.append((mode, filter_descs))
 
         return result
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 5d88be3212d0..9afe730a0641 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -29,6 +29,7 @@
     CUDAGraphMode,
     VllmConfig,
     get_layers_from_vllm_config,
+    set_current_vllm_config,
     update_config,
 )
 from vllm.distributed.ec_transfer import get_ec_transfer, has_ec_transfer

From 7be22e790ef389a8a31bc5d4eb4c8a1022bf63ea Mon Sep 17 00:00:00 2001
From: Hongjian Zhang <hirokenovo@gmail.com>
Date: Tue, 27 Jan 2026 16:23:01 +0800
Subject: [PATCH 26/35] test: add Qwen-VL ViT CUDA graph test

Signed-off-by: Hongjian Zhang <hirokenovo@gmail.com>
---
 .../piecewise/test_qwenvl_vit_cudagraph.py    | 261 ++++++++++++++++++
 1 file changed, 261 insertions(+)
 create mode 100644 tests/compile/piecewise/test_qwenvl_vit_cudagraph.py

diff --git a/tests/compile/piecewise/test_qwenvl_vit_cudagraph.py b/tests/compile/piecewise/test_qwenvl_vit_cudagraph.py
new file mode 100644
index 000000000000..cddf2147b137
--- /dev/null
+++ b/tests/compile/piecewise/test_qwenvl_vit_cudagraph.py
@@ -0,0 +1,261 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import os
+import weakref
+from functools import partial
+
+import pytest
+import torch
+
+from vllm import LLM
+from vllm.config import CompilationConfig, CUDAGraphMode
+from vllm.distributed import cleanup_dist_env_and_memory
+from vllm.forward_context import set_forward_context
+from vllm.v1.cudagraph_dispatcher import CudagraphDispatcher
+from vllm.v1.executor.multiproc_executor import MultiprocExecutor
+
+
+def _worker_embed_multimodal(
+    worker, vllm_config, cudagraph_runtime_mode, batch_descriptor, multi_modal_data
+):
+    """Compute multimodal embeddings on a single worker.
+
+    Runs the worker model's `embed_multimodal` under a forward context built
+    from the given CUDA graph runtime mode and batch descriptor, with
+    inference mode enabled, and synchronizes CUDA before returning.
+
+    Note: in data-parallel (DP) mode the forward context is typically created
+    and managed within the vision dispatcher, which would override the one
+    set here.
+
+    Args:
+        worker: The worker instance containing the model runner.
+        vllm_config: The vLLM engine configuration.
+        cudagraph_runtime_mode: The runtime mode for CUDA graph execution.
+        batch_descriptor: An object describing the current batch.
+        multi_modal_data: Keyword arguments forwarded to `embed_multimodal`;
+            tensor values are moved to the model's device first.
+
+    Returns:
+        The output from the model's `embed_multimodal` method.
+    """
+    # Reaches into internal attributes; assumes the V1 worker structure.
+    model = worker.model_runner.model
+    device = next(model.parameters()).device
+
+    # Tensors follow the model's device; other values pass through untouched.
+    mm_kwargs = {}
+    for key, val in multi_modal_data.items():
+        mm_kwargs[key] = val.to(device) if isinstance(val, torch.Tensor) else val
+
+    forward_ctx = set_forward_context(
+        None,
+        vllm_config=vllm_config,
+        cudagraph_runtime_mode=cudagraph_runtime_mode,
+        batch_descriptor=batch_descriptor,
+    )
+    with forward_ctx, torch.inference_mode():
+        embeddings = model.embed_multimodal(**mm_kwargs)
+        torch.cuda.synchronize()
+    return embeddings
+
+
+# Parametrization cases, each a (model_name, tp_size, mm_encoder_tp_mode) tuple.
+TEST_CONFIGS = [
+    ("Qwen/Qwen2.5-VL-3B-Instruct", 1, "weights"),
+    ("Qwen/Qwen3-VL-4B-Instruct", 1, "weights"),
+    # Two-GPU cases covering both encoder modes ("data" and "weights").
+    ("Qwen/Qwen2.5-VL-3B-Instruct", 2, "data"),
+    ("Qwen/Qwen2.5-VL-3B-Instruct", 2, "weights"),
+    ("Qwen/Qwen3-VL-4B-Instruct", 2, "data"),
+    ("Qwen/Qwen3-VL-4B-Instruct", 2, "weights"),
+]
+
+
+@pytest.fixture(
+    params=TEST_CONFIGS, ids=lambda x: f"{x[0].split('/')[-1]}-tp{x[1]}-{x[2]}"
+)
+def llm(request):
+    """Yield a proxy to an LLM with piecewise ViT CUDA graphs enabled.
+
+    Eager runs reuse this instance with cudagraph_runtime_mode forced to NONE.
+    """
+    model_name, tp_size, mm_mode = request.param
+    if torch.cuda.device_count() < tp_size:
+        pytest.skip(f"Not enough GPUs for tp_size={tp_size}")
+
+    # Disable V1 multiprocessing for this test only; the previous value is
+    # restored on teardown so the setting does not leak into other tests.
+    env_key = "VLLM_ENABLE_V1_MULTIPROCESSING"
+    prev_env = os.environ.get(env_key)
+    os.environ[env_key] = "0"
+    llm_instance = None
+    try:
+        llm_instance = LLM(
+            model=model_name,
+            trust_remote_code=True,
+            max_model_len=4096,
+            max_num_seqs=16,
+            gpu_memory_utilization=0.2,
+            tensor_parallel_size=tp_size,
+            mm_encoder_tp_mode=mm_mode,
+            compilation_config=CompilationConfig(
+                cudagraph_mode="PIECEWISE",
+                compile_mm_encoder=True,
+                vit_cudagraph_capture_sizes=[64, 128, 256],
+            ),
+        )
+        print(f"LLM initialized for {model_name} tp={tp_size} mode={mm_mode}")
+        yield weakref.proxy(llm_instance)
+    finally:
+        print("Cleaning up LLM after testing.")
+        if llm_instance is not None:
+            # Ensure the engine core (and its executor/workers) is shut down.
+            if hasattr(llm_instance.llm_engine, "engine_core"):
+                llm_instance.llm_engine.engine_core.shutdown()
+            del llm_instance
+        # Clean up distributed environment
+        cleanup_dist_env_and_memory()
+        if prev_env is None:
+            os.environ.pop(env_key, None)
+        else:
+            os.environ[env_key] = prev_env
+
+
+class TestQwenVLCUDAGraph:
+    """Checks that piecewise CUDA graph ViT outputs match eager ViT outputs."""
+
+    def _run_embed_multimodal(
+        self, llm, multi_modal_data, num_patches, force_eager=False
+    ):
+        """Run `embed_multimodal` on the worker(s), with CUDA graphs or eagerly.
+
+        A CudagraphDispatcher is constructed here because the one owned by the
+        GPU model runner is difficult to reach. It is dispatched on the number of
+        image patches to pick the runtime mode and batch descriptor, and the
+        embedding itself runs on the worker(s) through a collective RPC.
+
+        Args:
+            llm: The LLM object containing the model engine and configuration.
+            multi_modal_data: Keyword arguments for `embed_multimodal`. Never
+                modified; CUDA graph runs add the dispatcher to a copy.
+            num_patches: Number of image patches (the dispatcher's token count).
+            force_eager: If True, force runtime mode NONE (no CUDA graphs) and
+                do not pass the dispatcher to the model.
+
+        Returns:
+            The worker output; a single-element reply list is unwrapped.
+        """
+        vllm_config = llm.llm_engine.vllm_config
+
+        dispatcher = CudagraphDispatcher(vllm_config)
+        dispatcher.initialize_cudagraph_keys(
+            cudagraph_mode=vllm_config.compilation_config.cudagraph_mode,
+            uniform_decode_query_len=1,
+        )
+
+        # Dispatch to get runtime mode and batch descriptor
+        cudagraph_runtime_mode, batch_descriptor = dispatcher.dispatch(
+            num_tokens=num_patches, uniform_decode=False, has_lora=False, is_vit=True
+        )
+
+        if force_eager:
+            cudagraph_runtime_mode = CUDAGraphMode.NONE
+        else:
+            # Copy instead of mutating: the caller reuses this dict for the eager
+            # run, which must not receive the dispatcher.
+            multi_modal_data = {**multi_modal_data, "cudagraph_dispatcher": dispatcher}
+
+        model_executor = llm.llm_engine.model_executor
+        rpc_kwargs = {}
+        # Use collective_rpc to execute on driver worker (rank 0)
+        if isinstance(model_executor, MultiprocExecutor):
+            rpc_kwargs["unique_reply_rank"] = 0
+        outputs = model_executor.collective_rpc(
+            partial(
+                _worker_embed_multimodal,
+                vllm_config=vllm_config,
+                cudagraph_runtime_mode=cudagraph_runtime_mode,
+                batch_descriptor=batch_descriptor,
+                multi_modal_data=multi_modal_data,
+            ),
+            **rpc_kwargs,
+        )
+
+        if isinstance(outputs, list) and len(outputs) == 1:
+            outputs = outputs[0]
+        return outputs
+
+    @staticmethod
+    def _assert_outputs_close(actual, expected, label):
+        """Assert two tensors are close; the message reports max abs/rel diffs."""
+        # Diffs are computed up front rather than inside multi-line f-string
+        # fields, which only parse on Python >= 3.12 (PEP 701).
+        abs_diff = torch.abs(actual - expected)
+        rel_diff = abs_diff / (torch.abs(expected) + 1e-8)
+        assert torch.allclose(actual, expected, atol=1e-3, rtol=1e-5), (
+            f"{label} "
+            f"Max abs diff: {torch.max(abs_diff)}. "
+            f"Max rel diff: {torch.max(rel_diff)}"
+        )
+
+    def test_vit_cudagraph_consistency(self, llm):
+        """Piecewise CUDA graph and eager ViT outputs agree for 1/2/4 images."""
+        print("Starting test for ViT CUDA graph consistency.")
+
+        model_name = llm.llm_engine.vllm_config.model_config.model
+        # Qwen3-VL uses patch_size=16, temporal_patch_size=2 -> 16*16*3*2 = 1536
+        # Qwen2.5-VL uses patch_size=14, temporal_patch_size=2 -> 14*14*3*2 = 1176
+        input_dim = 1536 if "Qwen3-VL" in model_name else 1176
+
+        num_patches = 64
+        for num_imgs in [1, 2, 4]:
+            image_grid_thw = torch.tensor(
+                [[1, 2, num_patches // 2]] * num_imgs, dtype=torch.long, device="cpu"
+            )
+            pixel_values = torch.rand(
+                (num_patches * num_imgs, input_dim), dtype=torch.bfloat16, device="cpu"
+            )
+
+            multi_modal_data = {
+                "pixel_values": pixel_values,
+                "image_grid_thw": image_grid_thw,
+            }
+            print(
+                "Running inference with single LLM (Piecewise vs Eager via context). "
+                "num_imgs:",
+                num_imgs,
+            )
+
+            # Run with Piecewise CUDA Graph
+            piecewise_outputs = self._run_embed_multimodal(
+                llm, multi_modal_data, num_patches * num_imgs, force_eager=False
+            )
+
+            # Run with Eager Mode (simulated by setting runtime mode to NONE)
+            eager_outputs = self._run_embed_multimodal(
+                llm, multi_modal_data, num_patches * num_imgs, force_eager=True
+            )
+
+            prefix = f"num_imgs: {num_imgs}. "
+            if isinstance(piecewise_outputs, torch.Tensor):
+                self._assert_outputs_close(
+                    piecewise_outputs,
+                    eager_outputs,
+                    prefix + "Piecewise and Eager outputs do not match.",
+                )
+            elif isinstance(piecewise_outputs, tuple):
+                assert isinstance(eager_outputs, tuple), (
+                    "Output types mismatch, piecewise is tuple but eager is not."
+                )
+                assert len(piecewise_outputs) == len(eager_outputs), (
+                    "Output tuple lengths mismatch."
+                )
+                for i, (p_out, e_out) in enumerate(
+                    zip(piecewise_outputs, eager_outputs)
+                ):
+                    self._assert_outputs_close(
+                        p_out, e_out, prefix + f"Tuple element {i} does not match."
+                    )
+            else:
+                raise TypeError(f"Unsupported output type: {type(piecewise_outputs)}")

From eb91c31014b4f406e0c2650a198da2c6478d5030 Mon Sep 17 00:00:00 2001
From: Hongjian Zhang <hirokenovo@gmail.com>
Date: Tue, 27 Jan 2026 16:57:29 +0800
Subject: [PATCH 27/35] ruff

Signed-off-by: Hongjian Zhang <hirokenovo@gmail.com>
Signed-off-by: Xingran Wang <wangxingran123456@outlook.com>
Co-authored-by: Xingran Wang <wangxingran123456@outlook.com>
---
 vllm/compilation/backends.py             | 13 ++--
 vllm/config/vllm.py                      |  1 +
 vllm/model_executor/models/qwen2_5_vl.py | 63 ++++++++++-----
 vllm/model_executor/models/qwen3_vl.py   | 98 ++++++++++++++++--------
 vllm/model_executor/models/vision.py     | 22 ++++--
 vllm/v1/cudagraph_dispatcher.py          | 15 +++-
 vllm/v1/worker/gpu_model_runner.py       | 64 +++++++++++-----
 7 files changed, 189 insertions(+), 87 deletions(-)

diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py
index 615948072f72..0cff6ed5ef53 100644
--- a/vllm/compilation/backends.py
+++ b/vllm/compilation/backends.py
@@ -10,7 +10,7 @@
 import os
 import pprint
 import time
-from collections.abc import Callable, Generator, Sequence
+from collections.abc import Callable, Generator, Iterator, Sequence
 from contextlib import contextmanager
 from copy import deepcopy
 from functools import partial
@@ -56,7 +56,7 @@
 
 
 @contextmanager
-def set_is_last_graph_in_vit_sequence(is_last: bool):
+def set_is_last_graph_in_vit_sequence(is_last: bool) -> Iterator[None]:
     """Context manager to indicate if the current graph being compiled
     is the last one in a sequence of graphs (e.g., a sequence of blocks).
     """
@@ -75,7 +75,7 @@ def set_is_last_graph_in_vit_sequence(is_last: bool):
 
 
 @contextmanager
-def set_is_first_graph_in_vit_sequence(is_first: bool):
+def set_is_first_graph_in_vit_sequence(is_first: bool) -> Iterator[None]:
     """Context manager to indicate if the current graph being compiled
     is the first one in a sequence of graphs (e.g., a sequence of blocks).
     """
@@ -87,6 +87,7 @@ def set_is_first_graph_in_vit_sequence(is_first: bool):
     finally:
         _is_first_graph_in_vit_sequence = original_value
 
+
 def make_copy_and_call(
     sym_tensor_indices: list[int],
     input_buffers: list[torch.Tensor | None],
@@ -487,10 +488,8 @@ def wrap_with_cudagraph_if_needed(
         runtime_mode=CUDAGraphMode.PIECEWISE,
         cudagraph_options=CUDAGraphOptions(
             debug_log_enable=is_first_graph,
-            gc_disable=not is_first_graph
-                        or not _is_first_graph_in_vit_sequence,
-            weak_ref_output=is_last_graph
-                        and _is_last_graph_in_vit_sequence,
+            gc_disable=not is_first_graph or not _is_first_graph_in_vit_sequence,
+            weak_ref_output=is_last_graph and _is_last_graph_in_vit_sequence,
         ),
     )
 
diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py
index 90b54e9e2396..f43ee67f3fd1 100644
--- a/vllm/config/vllm.py
+++ b/vllm/config/vllm.py
@@ -1346,6 +1346,7 @@ def _set_compile_ranges(self):
         compilation_config.compile_ranges_split_points = sorted(
             computed_compile_ranges_split_points
         )
+
     def _set_vit_cudagraph_sizes(self):
         """Sets the CUDA graph capture sizes for the Vision Transformer (ViT).
 
diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py
index a7c16401360a..57b7b59fe28a 100644
--- a/vllm/model_executor/models/qwen2_5_vl.py
+++ b/vllm/model_executor/models/qwen2_5_vl.py
@@ -43,7 +43,12 @@
 )
 
 from vllm.compilation.decorators import support_torch_compile
-from vllm.config import CUDAGraphMode, MultiModalConfig, VllmConfig, set_current_vllm_config, get_current_vllm_config
+from vllm.config import (
+    CUDAGraphMode,
+    VllmConfig,
+    get_current_vllm_config,
+    set_current_vllm_config,
+)
 from vllm.distributed import parallel_state
 from vllm.distributed import utils as dist_utils
 from vllm.forward_context import get_forward_context, is_forward_context_available
@@ -654,10 +659,14 @@ def __init__(
                 dtype=self.dtype,
             )
             self._persistent_rotary_pos_emb_cos_buffer = torch.empty(
-                (max_compile_size, head_dim // 2), device=self.device, dtype=torch.bfloat16
+                (max_compile_size, head_dim // 2),
+                device=self.device,
+                dtype=torch.bfloat16,
             )
             self._persistent_rotary_pos_emb_sin_buffer = torch.empty(
-                (max_compile_size, head_dim // 2), device=self.device, dtype=torch.bfloat16
+                (max_compile_size, head_dim // 2),
+                device=self.device,
+                dtype=torch.bfloat16,
             )
 
     @property
@@ -804,7 +813,7 @@ def forward(
         cu_seqlens: list = []
 
         fwd_ctx = None
-        if is_forward_context_available():  
+        if is_forward_context_available():
             fwd_ctx = get_forward_context()
         if (
             self._persistent_hidden_states_buffer is not None
@@ -821,7 +830,10 @@ def forward(
             set_is_last_graph_in_vit_sequence,
         )
 
-        with set_is_first_graph_in_vit_sequence(True), set_is_last_graph_in_vit_sequence(False):
+        with (
+            set_is_first_graph_in_vit_sequence(True),
+            set_is_last_graph_in_vit_sequence(False),
+        ):
             hidden_states = self.patch_embed(hidden_states)
 
         window_index_id = 0
@@ -881,12 +893,12 @@ def forward(
             and fwd_ctx
             and fwd_ctx.cudagraph_runtime_mode == CUDAGraphMode.PIECEWISE
         ):
-            rotary_pos_emb_sin = self._persistent_rotary_pos_emb_sin_buffer[:seq_len].copy_(
-                rotary_pos_emb_sin
-            )
-            rotary_pos_emb_cos = self._persistent_rotary_pos_emb_cos_buffer[:seq_len].copy_(
-                rotary_pos_emb_cos
-            )
+            rotary_pos_emb_sin = self._persistent_rotary_pos_emb_sin_buffer[
+                :seq_len
+            ].copy_(rotary_pos_emb_sin)
+            rotary_pos_emb_cos = self._persistent_rotary_pos_emb_cos_buffer[
+                :seq_len
+            ].copy_(rotary_pos_emb_cos)
         window_index = window_index.to(device=hidden_states.device, non_blocking=True)
         reverse_indices = reverse_indices.to(
             device=hidden_states.device, non_blocking=True
@@ -938,7 +950,10 @@ def forward(
             hidden_states = cast_overflow_tensors(hidden_states)
 
         # adapter
-        with set_is_first_graph_in_vit_sequence(False), set_is_last_graph_in_vit_sequence(True):
+        with (
+            set_is_first_graph_in_vit_sequence(False),
+            set_is_last_graph_in_vit_sequence(True),
+        ):
             hidden_states = self.merger(hidden_states)
         hidden_states = hidden_states[reverse_indices, :]
         return hidden_states
@@ -1263,7 +1278,8 @@ def _parse_and_validate_video_input(
             )
 
     def _process_image_input(
-        self, image_input: Qwen2_5_VLImageInputs,
+        self,
+        image_input: Qwen2_5_VLImageInputs,
         cudagraph_dispatcher: Any | None = None,
     ) -> tuple[torch.Tensor, ...]:
         grid_thw = image_input["image_grid_thw"]
@@ -1275,7 +1291,10 @@ def _process_image_input(
         else:
             pixel_values = image_input["pixel_values"]
             with set_current_vllm_config(self.vllm_config):
-                if self.use_data_parallel and not self.vllm_config.is_in_compile_or_vit_cuda_graph_capture:
+                if (
+                    self.use_data_parallel
+                    and not self.vllm_config.is_in_compile_or_vit_cuda_graph_capture
+                ):
                     return run_dp_sharded_mrope_vision_model(
                         self.visual,
                         pixel_values,
@@ -1323,7 +1342,8 @@ def _postprocess_image_embeds_evs(
         return tuple(image_embeds_split)
 
     def _process_video_input(
-        self, video_input: Qwen2_5_VLVideoInputs,
+        self,
+        video_input: Qwen2_5_VLVideoInputs,
         cudagraph_dispatcher: Any | None = None,
     ) -> tuple[torch.Tensor, ...]:
         grid_thw = video_input["video_grid_thw"]
@@ -1335,7 +1355,10 @@ def _process_video_input(
         else:
             pixel_values_videos = video_input["pixel_values_videos"]
             with set_current_vllm_config(self.vllm_config):
-                if self.use_data_parallel and not self.vllm_config.is_in_compile_or_vit_cuda_graph_capture:
+                if (
+                    self.use_data_parallel
+                    and not self.vllm_config.is_in_compile_or_vit_cuda_graph_capture
+                ):
                     return run_dp_sharded_mrope_vision_model(
                         self.visual,
                         pixel_values_videos,
@@ -1505,14 +1528,18 @@ def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings:
         for modality in mm_input_by_modality:
             multimodal_input = mm_input_by_modality[modality]
             if modality == "image":
-                image_embeddings = self._process_image_input(multimodal_input, cudagraph_dispatcher=cudagraph_dispatcher)
+                image_embeddings = self._process_image_input(
+                    multimodal_input, cudagraph_dispatcher=cudagraph_dispatcher
+                )
                 if self.is_multimodal_pruning_enabled:
                     image_embeddings = self._postprocess_image_embeds_evs(
                         image_embeddings, multimodal_input
                     )
                 multimodal_embeddings += tuple(image_embeddings)
             if modality == "video":
-                video_embeddings = self._process_video_input(multimodal_input, cudagraph_dispatcher=cudagraph_dispatcher)
+                video_embeddings = self._process_video_input(
+                    multimodal_input, cudagraph_dispatcher=cudagraph_dispatcher
+                )
                 if self.is_multimodal_pruning_enabled:
                     video_embeddings = self._postprocess_video_embeds_evs(
                         video_embeddings, multimodal_input
diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py
index c012e17adfcb..9669809318c9 100644
--- a/vllm/model_executor/models/qwen3_vl.py
+++ b/vllm/model_executor/models/qwen3_vl.py
@@ -49,7 +49,12 @@
 from transformers.video_utils import VideoMetadata
 
 from vllm.compilation.decorators import support_torch_compile
-from vllm.config import CUDAGraphMode, MultiModalConfig, VllmConfig, set_current_vllm_config, get_current_vllm_config
+from vllm.config import (
+    CUDAGraphMode,
+    VllmConfig,
+    get_current_vllm_config,
+    set_current_vllm_config,
+)
 from vllm.config.multimodal import BaseDummyOptions, VideoDummyOptions
 from vllm.distributed import get_pp_group
 from vllm.forward_context import get_forward_context, is_forward_context_available
@@ -141,8 +146,7 @@
 DUMMY_VIDEO_NUM_FRAMES = 2048
 
 
-@support_torch_compile(dynamic_arg_dims={"x": 0},
-    enable_if=should_torch_compile_mm_vit)
+@support_torch_compile(dynamic_arg_dims={"x": 0}, enable_if=should_torch_compile_mm_vit)
 class Qwen3_VisionPatchEmbed(nn.Module):
     def __init__(
         self,
@@ -210,7 +214,12 @@ def forward(self, x: torch.Tensor):
 
 
 @support_torch_compile(
-    dynamic_arg_dims={"x": 0, "cu_seqlens": 0, "rotary_pos_emb_cos": 0, "rotary_pos_emb_sin": 0},
+    dynamic_arg_dims={
+        "x": 0,
+        "cu_seqlens": 0,
+        "rotary_pos_emb_cos": 0,
+        "rotary_pos_emb_sin": 0,
+    },
     enable_if=should_torch_compile_mm_vit,
 )
 class Qwen3_VisionBlock(nn.Module):
@@ -265,8 +274,7 @@ def forward(
         return x
 
 
-@support_torch_compile(dynamic_arg_dims={"x": 0},
-    enable_if=should_torch_compile_mm_vit)
+@support_torch_compile(dynamic_arg_dims={"x": 0}, enable_if=should_torch_compile_mm_vit)
 class Qwen3_VisionPatchMerger(nn.Module):
     def __init__(
         self,
@@ -428,15 +436,23 @@ def __init__(
         self._persistent_rotary_pos_emb_cos_buffer: torch.Tensor | None = None
         self._persistent_rotary_pos_emb_sin_buffer: torch.Tensor | None = None
         if vllm_config.compilation_config.vit_cudagraph_capture_sizes:
-            max_compile_size = vllm_config.compilation_config.vit_cudagraph_capture_sizes[-1]
+            max_compile_size = (
+                vllm_config.compilation_config.vit_cudagraph_capture_sizes[-1]
+            )
             self._persistent_hidden_states_buffer = torch.empty(
-                (max_compile_size, self.patch_embed.proj.input_size), device=self.device, dtype=self.dtype
+                (max_compile_size, self.patch_embed.proj.input_size),
+                device=self.device,
+                dtype=self.dtype,
             )
             self._persistent_rotary_pos_emb_cos_buffer = torch.empty(
-                (max_compile_size, head_dim // 2), device=self.device, dtype=torch.bfloat16
+                (max_compile_size, head_dim // 2),
+                device=self.device,
+                dtype=torch.bfloat16,
             )
             self._persistent_rotary_pos_emb_sin_buffer = torch.empty(
-                (max_compile_size, head_dim // 2), device=self.device, dtype=torch.bfloat16
+                (max_compile_size, head_dim // 2),
+                device=self.device,
+                dtype=torch.bfloat16,
             )
 
     @property
@@ -571,7 +587,7 @@ def forward(
     ) -> torch.Tensor:
         seq_len, _ = x.size()
         fwd_ctx = None
-        if is_forward_context_available():  
+        if is_forward_context_available():
             fwd_ctx = get_forward_context()
         if (
             self._persistent_hidden_states_buffer is not None
@@ -581,14 +597,19 @@ def forward(
             hidden_states = self._persistent_hidden_states_buffer[:seq_len]
             hidden_states.copy_(x, non_blocking=True)
         else:
-            hidden_states = x.to(device=self.device, dtype=self.dtype, non_blocking=True)
+            hidden_states = x.to(
+                device=self.device, dtype=self.dtype, non_blocking=True
+            )
 
         from vllm.compilation.backends import (
             set_is_first_graph_in_vit_sequence,
             set_is_last_graph_in_vit_sequence,
         )
 
-        with set_is_first_graph_in_vit_sequence(True), set_is_last_graph_in_vit_sequence(False):
+        with (
+            set_is_first_graph_in_vit_sequence(True),
+            set_is_last_graph_in_vit_sequence(False),
+        ):
             hidden_states = self.patch_embed(hidden_states)
 
         if isinstance(grid_thw, list):
@@ -608,12 +629,12 @@ def forward(
             and fwd_ctx
             and fwd_ctx.cudagraph_runtime_mode == CUDAGraphMode.PIECEWISE
         ):
-            rotary_pos_emb_sin = self._persistent_rotary_pos_emb_sin_buffer[:seq_len].copy_(
-                rotary_pos_emb_sin
-            )
-            rotary_pos_emb_cos = self._persistent_rotary_pos_emb_cos_buffer[:seq_len].copy_(
-                rotary_pos_emb_cos
-            )
+            rotary_pos_emb_sin = self._persistent_rotary_pos_emb_sin_buffer[
+                :seq_len
+            ].copy_(rotary_pos_emb_sin)
+            rotary_pos_emb_cos = self._persistent_rotary_pos_emb_cos_buffer[
+                :seq_len
+            ].copy_(rotary_pos_emb_cos)
 
         cu_seqlens = np.repeat(grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]).cumsum(
             axis=0, dtype=np.int32
@@ -651,12 +672,17 @@ def forward(
                     max_seqlen=max_seqlen,
                 )
                 if layer_num in self.deepstack_visual_indexes:
-                    deepstack_merger_idx = self.deepstack_visual_indexes.index(layer_num)
-                    deepstack_feature = self.deepstack_merger_list[deepstack_merger_idx](
-                        hidden_states
-                )
+                    deepstack_merger_idx = self.deepstack_visual_indexes.index(
+                        layer_num
+                    )
+                    deepstack_feature = self.deepstack_merger_list[
+                        deepstack_merger_idx
+                    ](hidden_states)
                     deepstack_feature_lists.append(deepstack_feature)
-        with set_is_first_graph_in_vit_sequence(False), set_is_last_graph_in_vit_sequence(True):
+        with (
+            set_is_first_graph_in_vit_sequence(False),
+            set_is_last_graph_in_vit_sequence(True),
+        ):
             hidden_states = self.merger(hidden_states)
         hidden_states = torch.cat(
             [hidden_states] + deepstack_feature_lists, dim=1
@@ -1490,7 +1516,8 @@ def _parse_and_validate_video_input(
             )
 
     def _process_image_input(
-        self, image_input: Qwen2_5_VLImageInputs,
+        self,
+        image_input: Qwen2_5_VLImageInputs,
         cudagraph_dispatcher: Any | None = None,
     ) -> tuple[torch.Tensor, ...]:
         grid_thw = image_input["image_grid_thw"]
@@ -1503,7 +1530,10 @@ def _process_image_input(
             pixel_values = image_input["pixel_values"].type(self.visual.dtype)
 
             with set_current_vllm_config(self.vllm_config):
-                if self.use_data_parallel and not self.vllm_config.is_in_compile_or_vit_cuda_graph_capture:
+                if (
+                    self.use_data_parallel
+                    and not self.vllm_config.is_in_compile_or_vit_cuda_graph_capture
+                ):
                     return run_dp_sharded_mrope_vision_model(
                         self.visual,
                         pixel_values,
@@ -1520,7 +1550,8 @@ def _process_image_input(
         return image_embeds.split(sizes)
 
     def _process_video_input(
-        self, video_input: Qwen2_5_VLVideoInputs,
+        self,
+        video_input: Qwen2_5_VLVideoInputs,
         cudagraph_dispatcher: Any | None = None,
     ) -> tuple[torch.Tensor, ...]:
         grid_thw = video_input["video_grid_thw"]
@@ -1534,7 +1565,10 @@ def _process_video_input(
                 self.visual.dtype
             )
             with set_current_vllm_config(self.vllm_config):
-                if self.use_data_parallel and not self.vllm_config.is_in_compile_or_vit_cuda_graph_capture:
+                if (
+                    self.use_data_parallel
+                    and not self.vllm_config.is_in_compile_or_vit_cuda_graph_capture
+                ):
                     return run_dp_sharded_mrope_vision_model(
                         self.visual,
                         pixel_values_videos,
@@ -2003,14 +2037,18 @@ def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings | None:
         for modality in mm_input_by_modality:
             multimodal_input = mm_input_by_modality[modality]
             if modality == "image":
-                image_embeddings = self._process_image_input(multimodal_input, cudagraph_dispatcher=cudagraph_dispatcher)
+                image_embeddings = self._process_image_input(
+                    multimodal_input, cudagraph_dispatcher=cudagraph_dispatcher
+                )
                 if self.is_multimodal_pruning_enabled:
                     image_embeddings = self._postprocess_image_embeds_evs(
                         image_embeddings, multimodal_input
                     )
                 multimodal_embeddings += tuple(image_embeddings)
             if modality == "video":
-                video_embeddings = self._process_video_input(multimodal_input, cudagraph_dispatcher=cudagraph_dispatcher)
+                video_embeddings = self._process_video_input(
+                    multimodal_input, cudagraph_dispatcher=cudagraph_dispatcher
+                )
                 if self.is_multimodal_pruning_enabled:
                     video_embeddings = self._postprocess_video_embeds_evs(
                         video_embeddings, multimodal_input
diff --git a/vllm/model_executor/models/vision.py b/vllm/model_executor/models/vision.py
index 22c9a30c23c6..1637b27209af 100644
--- a/vllm/model_executor/models/vision.py
+++ b/vllm/model_executor/models/vision.py
@@ -10,17 +10,22 @@
 import torch
 from transformers import PretrainedConfig
 
-from vllm.config import MultiModalConfig, VllmConfig, CUDAGraphMode, get_current_vllm_config
+from vllm.config import (
+    CUDAGraphMode,
+    MultiModalConfig,
+    VllmConfig,
+    get_current_vllm_config,
+)
 from vllm.distributed import (
     get_tensor_model_parallel_rank,
     get_tensor_model_parallel_world_size,
     tensor_model_parallel_all_gather,
 )
-from vllm.forward_context import BatchDescriptor, set_forward_context
-from vllm.v1.cudagraph_dispatcher import CudagraphDispatcher
+from vllm.forward_context import set_forward_context
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
 from vllm.v1.attention.backends.registry import AttentionBackendEnum
+from vllm.v1.cudagraph_dispatcher import CudagraphDispatcher
 
 logger = init_logger(__name__)
 
@@ -489,8 +494,7 @@ def run_dp_sharded_mrope_vision_model(
     cudagraph_runtime_mode = CUDAGraphMode.NONE
     batch_descriptor = None
 
-    if (vllm_config and
-        vllm_config.compilation_config.vit_cudagraph_capture_sizes):
+    if vllm_config and vllm_config.compilation_config.vit_cudagraph_capture_sizes:
         current_input_len = pixel_values_local.shape[0]
         cudagraph_runtime_mode, batch_descriptor = dispatcher.dispatch(
             num_tokens=current_input_len,
@@ -500,7 +504,7 @@ def run_dp_sharded_mrope_vision_model(
             is_vit=True,
         )
         target_input_len = batch_descriptor.num_tokens
-    
+
         # Pad pixel_values_local for CUDA graph if needed
         if current_input_len < target_input_len:
             padding_size = target_input_len - current_input_len
@@ -516,7 +520,7 @@ def run_dp_sharded_mrope_vision_model(
         None,
         vllm_config=vllm_config,
         cudagraph_runtime_mode=cudagraph_runtime_mode,
-        batch_descriptor=batch_descriptor
+        batch_descriptor=batch_descriptor,
     ):
         # Run the vision model on the local pixel_values_local
         if rope_type == "rope_2d":
@@ -535,7 +539,9 @@ def run_dp_sharded_mrope_vision_model(
                 )
         else:
             if pixel_values_local.shape[0] > 0:
-                image_embeds_local = vision_model(pixel_values_local, local_grid_thw_list)
+                image_embeds_local = vision_model(
+                    pixel_values_local, local_grid_thw_list
+                )
             else:
                 # Handle empty case
                 image_embeds_local = torch.empty(
diff --git a/vllm/v1/cudagraph_dispatcher.py b/vllm/v1/cudagraph_dispatcher.py
index 0048ef61c3c3..bc90d4044f71 100644
--- a/vllm/v1/cudagraph_dispatcher.py
+++ b/vllm/v1/cudagraph_dispatcher.py
@@ -133,7 +133,6 @@ def _compute_bs_to_padded_vit_graph_size(self) -> None:
                     self._bs_to_padded_vit_graph_size[bs] = start
                 else:
                     self._bs_to_padded_vit_graph_size[bs] = end
-        
 
     def _create_padded_batch_descriptor(
         self,
@@ -272,8 +271,14 @@ def dispatch(
         if (
             not self.keys_initialized
             or self.cudagraph_mode == CUDAGraphMode.NONE
-            or (not is_vit and num_tokens > self.compilation_config.max_cudagraph_capture_size)
-            or (is_vit and num_tokens > self.compilation_config.max_vit_cudagraph_capture_size)
+            or (
+                not is_vit
+                and num_tokens > self.compilation_config.max_cudagraph_capture_size
+            )
+            or (
+                is_vit
+                and num_tokens > self.compilation_config.max_vit_cudagraph_capture_size
+            )
         ):
             return CUDAGraphMode.NONE, BatchDescriptor(num_tokens, is_vit=is_vit)
 
@@ -315,7 +320,9 @@ def dispatch(
         # finally, just return no cudagraphs and a trivial batch descriptor
         return CUDAGraphMode.NONE, BatchDescriptor(num_tokens)
 
-    def get_capture_descs(self, is_vit: bool = False) -> list[tuple[CUDAGraphMode, list[BatchDescriptor]]]:
+    def get_capture_descs(
+        self, is_vit: bool = False
+    ) -> list[tuple[CUDAGraphMode, list[BatchDescriptor]]]:
         """
         Returns capture descriptors for cudagraph capturing.
 
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 9afe730a0641..b07874cd6a16 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -2431,7 +2431,10 @@ def _execute_mm_encoder(
                 # each of shape (feature_size, hidden_size) in case the feature
                 # size is dynamic depending on the input multimodal items.
                 is_vit_dp_mode = (
-                    getattr(self.model_config.multimodal_config, "mm_encoder_tp_mode", None) == "data"
+                    getattr(
+                        self.model_config.multimodal_config, "mm_encoder_tp_mode", None
+                    )
+                    == "data"
                     and self.parallel_config.tensor_parallel_size > 1
                 )
                 if not is_vit_dp_mode:
@@ -2441,17 +2444,24 @@ def _execute_mm_encoder(
                     # Default values for non-ViT cudagraph case
                     cudagraph_runtime_mode = CUDAGraphMode.NONE
                     batch_descriptor = None
-                    if self.vit_cudagraph_batch_sizes and "pixel_values" in mm_kwargs_group:
-                        pixel_values = mm_kwargs_group["pixel_values"]
+                    if (
+                        self.vit_cudagraph_batch_sizes
+                        and "pixel_values" in mm_kwargs_group
+                    ):
+                        pixel_values = cast(
+                            torch.Tensor, mm_kwargs_group["pixel_values"]
+                        )
                         num_tokens = pixel_values.shape[0]
 
                         # get batch_descriptor from dispatcher
-                        cudagraph_runtime_mode, batch_descriptor = self.cudagraph_dispatcher.dispatch(
-                            num_tokens=num_tokens,
-                            uniform_decode=False,
-                            has_lora=False,
-                            disable_full=False,
-                            is_vit=True,
+                        cudagraph_runtime_mode, batch_descriptor = (
+                            self.cudagraph_dispatcher.dispatch(
+                                num_tokens=num_tokens,
+                                uniform_decode=False,
+                                has_lora=False,
+                                disable_full=False,
+                                is_vit=True,
+                            )
                         )
                         padded_num_tokens = batch_descriptor.num_tokens
 
@@ -2468,12 +2478,16 @@ def _execute_mm_encoder(
 
                             # Update image_grid_thw to account for padding
                             if "image_grid_thw" in mm_kwargs_group:
-                                image_grid_thw = mm_kwargs_group["image_grid_thw"]
+                                image_grid_thw = cast(
+                                    torch.Tensor, mm_kwargs_group["image_grid_thw"]
+                                )
                                 original_num_imgs = image_grid_thw.shape[0]
 
                                 # Treat padding as a new virtual image.
-                                # Assuming a fixed patch size where height is merge_size.
-                                h_patches, w_patches = self._get_dummy_h_w_patches(padding_amount)
+                                # Assuming a fixed patch size where height = merge_size
+                                h_patches, w_patches = self._get_dummy_h_w_patches(
+                                    padding_amount
+                                )
                                 padding_grid_info = torch.tensor(
                                     [[1, h_patches, w_patches]],
                                     dtype=image_grid_thw.dtype,
@@ -2483,23 +2497,31 @@ def _execute_mm_encoder(
                                     [image_grid_thw, padding_grid_info], dim=0
                                 )
 
-                    with set_forward_context(
+                    with (
+                        set_forward_context(
                             None,
                             vllm_config=self.vllm_config,
                             cudagraph_runtime_mode=cudagraph_runtime_mode,
                             batch_descriptor=batch_descriptor,
-                        ), self.timed_encoder_operation(
-                        should_time, mm_lora_refs, current_item_idx, num_items
+                        ),
+                        self.timed_encoder_operation(
+                            should_time, mm_lora_refs, current_item_idx, num_items
+                        ),
                     ):
                         curr_group_outputs = model.embed_multimodal(**mm_kwargs_group)
                     # Remove the padded items before sanity check
                     if original_num_imgs != -1:
                         curr_group_outputs = curr_group_outputs[:original_num_imgs]
                 else:
-                    with set_current_vllm_config(self.vllm_config), self.timed_encoder_operation(
-                        should_time, mm_lora_refs, current_item_idx, num_items
+                    with (
+                        set_current_vllm_config(self.vllm_config),
+                        self.timed_encoder_operation(
+                            should_time, mm_lora_refs, current_item_idx, num_items
+                        ),
                     ):
-                        mm_kwargs_group["cudagraph_dispatcher"] = self.cudagraph_dispatcher
+                        mm_kwargs_group["cudagraph_dispatcher"] = (
+                            self.cudagraph_dispatcher
+                        )
                         curr_group_outputs = model.embed_multimodal(**mm_kwargs_group)
             sanity_check_mm_encoder_outputs(
                 curr_group_outputs,
@@ -4665,7 +4687,7 @@ def _get_dummy_vit_input(
     ) -> BatchedTensorInputs:
         """Dummy data for profiling and precompiling ViT."""
 
-        # The first dimension of pixel_values corresponds 
+        # The first dimension of pixel_values corresponds
         # to the total number of patches.
         pixel_values = torch.zeros(
             (num_image_tokens, img_feature_dim), dtype=self.dtype, device=self.device
@@ -5219,7 +5241,9 @@ def _dummy_mm_encoder_run(
             "video",
             1,
         )
-        img_feature_dim = tmp_dummy_mm_inputs["pixel_values_videos"].shape[1]
+        img_feature_dim = cast(
+            torch.Tensor, tmp_dummy_mm_inputs["pixel_values_videos"]
+        ).shape[1]
 
         if is_global_first_rank():
             compilation_cases = tqdm(

From f7e4ea9180443972122da62eeae198cc3b1a83b4 Mon Sep 17 00:00:00 2001
From: Hongjian Zhang <hirokenovo@gmail.com>
Date: Thu, 29 Jan 2026 16:16:27 +0800
Subject: [PATCH 28/35] address review suggestions

Signed-off-by: Hongjian Zhang <hirokenovo@gmail.com>
---
 docs/design/torch_compile.md               | 10 ---
 docs/design/torch_compile_multimodal.md    | 40 +++++++++++-
 vllm/compilation/backends.py               | 72 +++++++++++++---------
 vllm/config/vllm.py                        | 34 +++++-----
 vllm/forward_context.py                    |  4 ++
 vllm/model_executor/models/qwen2_5_vl.py   | 21 +++----
 vllm/model_executor/models/qwen3_vl.py     | 21 +++----
 vllm/v1/attention/ops/vit_attn_wrappers.py |  4 +-
 vllm/v1/cudagraph_dispatcher.py            | 40 +++++++-----
 9 files changed, 151 insertions(+), 95 deletions(-)

diff --git a/docs/design/torch_compile.md b/docs/design/torch_compile.md
index 8532c7020cbc..4dc0da0c7d65 100644
--- a/docs/design/torch_compile.md
+++ b/docs/design/torch_compile.md
@@ -253,16 +253,6 @@ By default, vLLM will try to determine a set of sizes to capture cudagraph. You
 vllm serve meta-llama/Llama-3.2-1B \
   --compilation-config '{"cudagraph_capture_sizes": [1, 2, 4, 8]}'
 ```
-Similarly, For `Qwen2.5-VL`,`Qwen3-VL` series model, you can specify the capture sizes for the vision transformer (ViT) using `vit_cudagraph_capture_sizes`, the capture sizes should be multiples of the square of `merge_size`. By default, this is disabled as `compile_mm_encoder` is `False`. To enable it and specify capture sizes, you can do the following:
-```bash
-vllm serve Qwen/Qwen2.5-VL-3B-Instruct \
-  --compilation-config '{"compile_mm_encoder": true, "vit_cudagraph_capture_sizes": [512, 1024]}'
-```
-Alternatively, you can specify `max_vit_cudagraph_capture_size` to generate a default list of capture sizes up to the given value:
-```bash
-vllm serve Qwen/Qwen2.5-VL-3B-Instruct \
-  --compilation-config '{"compile_mm_encoder": true, "max_vit_cudagraph_capture_size": 2048}'
-```
 
 Then it will only capture cudagraph for the specified sizes. It can be useful to have fine-grained control over the cudagraph capture.
 
diff --git a/docs/design/torch_compile_multimodal.md b/docs/design/torch_compile_multimodal.md
index 674ddd801d65..f3f3f3b433f5 100644
--- a/docs/design/torch_compile_multimodal.md
+++ b/docs/design/torch_compile_multimodal.md
@@ -68,7 +68,45 @@ to alert torch.compile to the fact that this range cannot be inferred, and we de
 
 ### Cudagraphs
 
-We have not yet explored compilation for multimodal encoders with CUDAGraph integration; behavior is currently unspecified.
+vLLM now supports Piecewise CUDA Graph integration for the Vision Transformer (ViT) encoder in Qwen2.5-VL and Qwen3-VL models. This feature captures CUDA graphs for a configured set of input sizes (measured in number of patches) to reduce kernel launch overhead and improve performance.
+
+#### Enabling ViT CUDA Graphs
+
+**Important**: This feature is **not enabled by default**. The Piecewise CUDA Graph implementation relies on `torch.compile` to trace the computation graph and separate the attention operators. Therefore, users must explicitly enable ViT compilation via the `--compilation-config` argument to activate this feature.
+
+To enable ViT CUDA graph compilation, use:
+
+```bash
+vllm serve <model> --compilation-config '{"compile_mm_encoder": true}'
+```
+
+#### Configuring Capture Sizes
+
+You can specify custom patch sizes for CUDA graph capture using `vit_cudagraph_capture_sizes`. For models like `Qwen2.5-VL` and `Qwen3-VL`, the capture sizes should be multiples of the square of `merge_size`:
+
+```bash
+vllm serve <model> --compilation-config '{"compile_mm_encoder": true, "vit_cudagraph_capture_sizes": [512, 1024]}'
+```
+
+Alternatively, you can specify `max_vit_cudagraph_capture_size` to generate a default list of capture sizes up to the given value:
+
+```bash
+vllm serve <model> --compilation-config '{"compile_mm_encoder": true, "max_vit_cudagraph_capture_size": 2048}'
+```
+
+#### Default Behavior
+
+Once enabled, if `vit_cudagraph_capture_sizes` is not specified, vLLM will use a default set of sizes for capture. Since `compile_mm_encoder` is `False` by default, this feature remains inactive unless configured.
+
+If you only want to enable `torch.compile` for ViT without using the CUDA Graph feature, you can explicitly set the capture sizes to empty:
+
+```bash
+vllm serve <model> --compilation-config '{"compile_mm_encoder": true, "vit_cudagraph_capture_sizes": []}'
+```
+
+#### Limitations & Notes
+
+- **Image Only**: This feature currently only supports image inference. Video inference is not supported yet.
 
 ## Troubleshooting
 
diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py
index 0cff6ed5ef53..63bf3690891a 100644
--- a/vllm/compilation/backends.py
+++ b/vllm/compilation/backends.py
@@ -10,8 +10,8 @@
 import os
 import pprint
 import time
-from collections.abc import Callable, Generator, Iterator, Sequence
-from contextlib import contextmanager
+from collections.abc import Callable, Generator, Sequence
+from contextlib import AbstractContextManager, contextmanager
 from copy import deepcopy
 from functools import partial
 from typing import Any
@@ -30,6 +30,7 @@
 from vllm.config import CompilationConfig, CUDAGraphMode, VllmConfig
 from vllm.config.compilation import DynamicShapesType
 from vllm.config.utils import Range, hash_factors
+from vllm.forward_context import get_forward_context
 from vllm.logger import init_logger
 from vllm.logging_utils import lazy
 from vllm.platforms import current_platform
@@ -48,44 +49,45 @@
 
 logger = init_logger(__name__)
 
-# A global flag to indicate if the current graph being compiled
-# is the last one in a sequence of graphs (e.g., a sequence of blocks).
-# This is a workaround to control CUDAGraph weak_ref_output behavior
-# in **vit** piecewise compilation.
-_is_last_graph_in_vit_sequence: bool = True
-
 
 @contextmanager
-def set_is_last_graph_in_vit_sequence(is_last: bool) -> Iterator[None]:
-    """Context manager to indicate if the current graph being compiled
-    is the last one in a sequence of graphs (e.g., a sequence of blocks).
-    """
-    global _is_last_graph_in_vit_sequence
-    original_value = _is_last_graph_in_vit_sequence
-    _is_last_graph_in_vit_sequence = is_last
+def _set_mm_encoder_sequence_flag(
+    attr_name: str, value: bool
+) -> Generator[None, None, None]:
+    try:
+        ctx = get_forward_context()
+        original_value = getattr(ctx, attr_name)
+        setattr(ctx, attr_name, value)
+    except Exception:
+        yield
+        return
+
     try:
         yield
     finally:
-        _is_last_graph_in_vit_sequence = original_value
+        setattr(ctx, attr_name, original_value)
 
 
-# A global flag to indicate if the current graph being compiled
-# is the first one in a sequence of graphs (e.g., a sequence of blocks).
-_is_first_graph_in_vit_sequence: bool = True
+def set_is_last_graph_in_mm_encoder_sequence(
+    is_last: bool,
+) -> AbstractContextManager[None]:
+    """Context manager to indicate if the current graph being compiled
+    is the last one in a sequence of graphs (e.g., a sequence of blocks).
+    """
+    return _set_mm_encoder_sequence_flag(
+        "is_last_graph_in_mm_encoder_sequence", is_last
+    )
 
 
-@contextmanager
-def set_is_first_graph_in_vit_sequence(is_first: bool) -> Iterator[None]:
+def set_is_first_graph_in_mm_encoder_sequence(
+    is_first: bool,
+) -> AbstractContextManager[None]:
     """Context manager to indicate if the current graph being compiled
     is the first one in a sequence of graphs (e.g., a sequence of blocks).
     """
-    global _is_first_graph_in_vit_sequence
-    original_value = _is_first_graph_in_vit_sequence
-    _is_first_graph_in_vit_sequence = is_first
-    try:
-        yield
-    finally:
-        _is_first_graph_in_vit_sequence = original_value
+    return _set_mm_encoder_sequence_flag(
+        "is_first_graph_in_mm_encoder_sequence", is_first
+    )
 
 
 def make_copy_and_call(
@@ -482,14 +484,24 @@ def wrap_with_cudagraph_if_needed(
     # CUDAGraphWrapper for piecewise_backend, to distinguish
     # it from the FULL cudagraph runtime mode, no matter it
     # is wrapped on a full or piecewise fx graph.
+
+    try:
+        fwd_ctx = get_forward_context()
+        is_first_graph_in_sequence = fwd_ctx.is_first_graph_in_mm_encoder_sequence
+        is_last_graph_in_sequence = fwd_ctx.is_last_graph_in_mm_encoder_sequence
+    except Exception:
+        # Fallback for when ForwardContext is not available
+        is_first_graph_in_sequence = True
+        is_last_graph_in_sequence = True
+
     return static_graph_wrapper_class(
         runnable=piecewise_backend,
         vllm_config=vllm_config,
         runtime_mode=CUDAGraphMode.PIECEWISE,
         cudagraph_options=CUDAGraphOptions(
             debug_log_enable=is_first_graph,
-            gc_disable=not is_first_graph or not _is_first_graph_in_vit_sequence,
-            weak_ref_output=is_last_graph and _is_last_graph_in_vit_sequence,
+            gc_disable=not is_first_graph or not is_first_graph_in_sequence,
+            weak_ref_output=is_last_graph and is_last_graph_in_sequence,
         ),
     )
 
diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py
index f43ee67f3fd1..919f214ce720 100644
--- a/vllm/config/vllm.py
+++ b/vllm/config/vllm.py
@@ -1350,15 +1350,16 @@ def _set_compile_ranges(self):
     def _set_vit_cudagraph_sizes(self):
         """Sets the CUDA graph capture sizes for the Vision Transformer (ViT).
 
-        This method determines the batch sizes for which ViT CUDA graphs will be
-        captured. CUDA graphs improve performance by reducing kernel launch
-        overhead for the vision encoder.
+        This method determines the batch sizes (in terms of number of patches)
+        for which ViT CUDA graphs will be captured. CUDA graphs improve
+        performance by reducing kernel launch overhead for the vision encoder.
 
         The logic is as follows:
         1.  The feature is only enabled if all of the following conditions are met:
-            - Eager mode is not enforced.
-            - CUDA graph mode is enabled.
-            - The multimodal encoder compilation is enabled.
+            - A model is configured (`model_config` is not None).
+            - Eager mode is not enforced (`enforce_eager` is False).
+            - CUDA graph mode is enabled (`cudagraph_mode` is not NONE).
+            - Multimodal encoder compilation is enabled (`compile_mm_encoder` is True).
             If these conditions are not met, the list of capture sizes will be empty,
             effectively disabling ViT CUDA graphs.
 
@@ -1367,17 +1368,22 @@ def _set_vit_cudagraph_sizes(self):
             and sorted in ascending order.
 
         3.  If no sizes are provided by the user, a default list of sizes is
-            generated up to a maximum of 5120. The default sizes are:
-            [512, 1024, 1536] + list(range(2048, 2048, 128)) + list(
-            range(4096, 8192 + 1, 256))
-
-        The final list of sizes is stored in
-        `self.compilation_config.vit_cudagraph_capture_sizes`.
-
+            generated. The maximum size for this list is determined automatically
+            by `compute_encoder_budget` (capped at 8192), or by the user-provided
+            `max_vit_cudagraph_capture_size`. The default sizes are:
+            [512, 1024, 1536] + list(range(2048, 4096, 128)) + list(
+            range(4096, max_size + 1, 256))
+
+        4.  The final list of sizes is stored in
+            `self.compilation_config.vit_cudagraph_capture_sizes`. The
+            `max_vit_cudagraph_capture_size` is also updated to be consistent
+            with the largest value in this final list.
+
+        At runtime:
         - If a batch's size matches or is smaller than a captured size, the
           closest captured graph is used.
         - If a batch's size is larger than the largest captured size, a CUDA
-          graph will not be used for that batch.
+          graph will not be used for that batch (fallback to eager execution).
         """
         if (
             self.model_config is not None
diff --git a/vllm/forward_context.py b/vllm/forward_context.py
index 1b89c04851f0..d7a7603f6b41 100644
--- a/vllm/forward_context.py
+++ b/vllm/forward_context.py
@@ -255,6 +255,10 @@ class ForwardContext:
     all_moe_layers: list[str] | None = None
     moe_layer_index: int = 0
 
+    # ViT Multi-Modal Encoder flags used by backend compiler
+    is_first_graph_in_mm_encoder_sequence: bool = True
+    is_last_graph_in_mm_encoder_sequence: bool = True
+
     additional_kwargs: dict[str, Any] = field(default_factory=dict)
 
     def __post_init__(self):
diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py
index 57b7b59fe28a..1127cfea1634 100644
--- a/vllm/model_executor/models/qwen2_5_vl.py
+++ b/vllm/model_executor/models/qwen2_5_vl.py
@@ -42,6 +42,10 @@
     Qwen2_5_VLVisionConfig,
 )
 
+from vllm.compilation.backends import (
+    set_is_first_graph_in_mm_encoder_sequence,
+    set_is_last_graph_in_mm_encoder_sequence,
+)
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import (
     CUDAGraphMode,
@@ -825,14 +829,9 @@ def forward(
         else:
             hidden_states = x.to(device=self.device, dtype=self.dtype)
 
-        from vllm.compilation.backends import (
-            set_is_first_graph_in_vit_sequence,
-            set_is_last_graph_in_vit_sequence,
-        )
-
         with (
-            set_is_first_graph_in_vit_sequence(True),
-            set_is_last_graph_in_vit_sequence(False),
+            set_is_first_graph_in_mm_encoder_sequence(True),
+            set_is_last_graph_in_mm_encoder_sequence(False),
         ):
             hidden_states = self.patch_embed(hidden_states)
 
@@ -925,8 +924,8 @@ def forward(
             hidden_states = original_hidden_states
 
         with (
-            set_is_first_graph_in_vit_sequence(False),
-            set_is_last_graph_in_vit_sequence(False),
+            set_is_first_graph_in_mm_encoder_sequence(False),
+            set_is_last_graph_in_mm_encoder_sequence(False),
         ):
             for layer_num, blk in enumerate(self.blocks):
                 if layer_num in self.fullatt_block_indexes:
@@ -951,8 +950,8 @@ def forward(
 
         # adapter
         with (
-            set_is_first_graph_in_vit_sequence(False),
-            set_is_last_graph_in_vit_sequence(True),
+            set_is_first_graph_in_mm_encoder_sequence(False),
+            set_is_last_graph_in_mm_encoder_sequence(True),
         ):
             hidden_states = self.merger(hidden_states)
         hidden_states = hidden_states[reverse_indices, :]
diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py
index 9669809318c9..6a8ef0c239ec 100644
--- a/vllm/model_executor/models/qwen3_vl.py
+++ b/vllm/model_executor/models/qwen3_vl.py
@@ -48,6 +48,10 @@
 )
 from transformers.video_utils import VideoMetadata
 
+from vllm.compilation.backends import (
+    set_is_first_graph_in_mm_encoder_sequence,
+    set_is_last_graph_in_mm_encoder_sequence,
+)
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import (
     CUDAGraphMode,
@@ -601,14 +605,9 @@ def forward(
                 device=self.device, dtype=self.dtype, non_blocking=True
             )
 
-        from vllm.compilation.backends import (
-            set_is_first_graph_in_vit_sequence,
-            set_is_last_graph_in_vit_sequence,
-        )
-
         with (
-            set_is_first_graph_in_vit_sequence(True),
-            set_is_last_graph_in_vit_sequence(False),
+            set_is_first_graph_in_mm_encoder_sequence(True),
+            set_is_last_graph_in_mm_encoder_sequence(False),
         ):
             hidden_states = self.patch_embed(hidden_states)
 
@@ -660,8 +659,8 @@ def forward(
 
         deepstack_feature_lists = []
         with (
-            set_is_first_graph_in_vit_sequence(False),
-            set_is_last_graph_in_vit_sequence(False),
+            set_is_first_graph_in_mm_encoder_sequence(False),
+            set_is_last_graph_in_mm_encoder_sequence(False),
         ):
             for layer_num, blk in enumerate(self.blocks):
                 hidden_states = blk(
@@ -680,8 +679,8 @@ def forward(
                     ](hidden_states)
                     deepstack_feature_lists.append(deepstack_feature)
         with (
-            set_is_first_graph_in_vit_sequence(False),
-            set_is_last_graph_in_vit_sequence(True),
+            set_is_first_graph_in_mm_encoder_sequence(False),
+            set_is_last_graph_in_mm_encoder_sequence(True),
         ):
             hidden_states = self.merger(hidden_states)
         hidden_states = torch.cat(
diff --git a/vllm/v1/attention/ops/vit_attn_wrappers.py b/vllm/v1/attention/ops/vit_attn_wrappers.py
index f6051e54713b..b226fb8c1134 100644
--- a/vllm/v1/attention/ops/vit_attn_wrappers.py
+++ b/vllm/v1/attention/ops/vit_attn_wrappers.py
@@ -150,7 +150,9 @@ def torch_sdpa_wrapper(
         v = v.contiguous()
 
     if cu_seqlens is None:
-        return apply_sdpa(q, k, v, scale=scale)
+        context_layer = apply_sdpa(q, k, v, scale=scale)
+        output.copy_(context_layer)
+        return output
 
     outputs = []
 
diff --git a/vllm/v1/cudagraph_dispatcher.py b/vllm/v1/cudagraph_dispatcher.py
index bc90d4044f71..0549a57597ce 100644
--- a/vllm/v1/cudagraph_dispatcher.py
+++ b/vllm/v1/cudagraph_dispatcher.py
@@ -68,18 +68,11 @@ def __init__(self, vllm_config: VllmConfig):
 
     def _compute_bs_to_padded_graph_size(self) -> None:
         """Pre-compute the mapping from batch size to padded graph size."""
-        max_size = self.compilation_config.max_cudagraph_capture_size
+        max_capture_size = self.compilation_config.max_cudagraph_capture_size
         capture_sizes = self.compilation_config.cudagraph_capture_sizes
-        self._bs_to_padded_graph_size: list[int] = [0] * (max_size + 1)
-        for end, start in zip(
-            capture_sizes + [max_size + 1],
-            [0] + capture_sizes,
-        ):
-            for bs in range(start, end):
-                if bs == start:
-                    self._bs_to_padded_graph_size[bs] = start
-                else:
-                    self._bs_to_padded_graph_size[bs] = end
+        self._bs_to_padded_graph_size = self._get_padded_size_map(
+            capture_sizes, max_capture_size
+        )
 
         # Validate that compile_sizes won't be changed by padding.
         # Only validate when cudagraphs are actually being used.
@@ -88,7 +81,7 @@ def _compute_bs_to_padded_graph_size(self) -> None:
             and self.cudagraph_mode != CUDAGraphMode.NONE
         ):
             for size in self.compilation_config.compile_sizes:
-                if size <= self.compilation_config.max_cudagraph_capture_size:
+                if size <= max_capture_size:
                     padded = self._bs_to_padded_graph_size[size]
                     if padded != size:
                         raise ValueError(
@@ -121,18 +114,31 @@ def _get_lora_cases(self) -> list[int]:
 
     def _compute_bs_to_padded_vit_graph_size(self) -> None:
         """pre-compute the mapping from batch size to ViT padded graph size."""
-        max_size = self.compilation_config.max_vit_cudagraph_capture_size
+        max_capture_size = self.compilation_config.max_vit_cudagraph_capture_size
         capture_sizes = self.compilation_config.vit_cudagraph_capture_sizes
-        self._bs_to_padded_vit_graph_size: list[int] = [0] * (max_size + 1)
+
+        self._bs_to_padded_vit_graph_size = self._get_padded_size_map(
+            capture_sizes, max_capture_size
+        )
+
+    def _get_padded_size_map(
+        self, capture_sizes: list[int] | None, max_size: int | None
+    ) -> list[int]:
+        if capture_sizes is None:
+            capture_sizes = []
+        if max_size is None:
+            max_size = 0
+        padded_size_map: list[int] = [0] * (max_size + 1)
         for end, start in zip(
             capture_sizes + [max_size + 1],
             [0] + capture_sizes,
         ):
             for bs in range(start, end):
                 if bs == start:
-                    self._bs_to_padded_vit_graph_size[bs] = start
+                    padded_size_map[bs] = start
                 else:
-                    self._bs_to_padded_vit_graph_size[bs] = end
+                    padded_size_map[bs] = end
+        return padded_size_map
 
     def _create_padded_batch_descriptor(
         self,
@@ -318,7 +324,7 @@ def dispatch(
             return CUDAGraphMode.PIECEWISE, relaxed_batch_desc
 
         # finally, just return no cudagraphs and a trivial batch descriptor
-        return CUDAGraphMode.NONE, BatchDescriptor(num_tokens)
+        return CUDAGraphMode.NONE, BatchDescriptor(num_tokens, is_vit=is_vit)
 
     def get_capture_descs(
         self, is_vit: bool = False

From 3f9950e9ba6d4883e995deb49bb10c7fef82203c Mon Sep 17 00:00:00 2001
From: Hongjian Zhang <hirokenovo@gmail.com>
Date: Fri, 30 Jan 2026 16:01:24 +0800
Subject: [PATCH 29/35] chore: rename vit to mm_encoder

Signed-off-by: Hongjian Zhang <hirokenovo@gmail.com>

Signed-off-by: Hongjian Zhang <zhanghongjian@xiaohongshu.com>
---
 docs/design/torch_compile_multimodal.md       |  14 +--
 .../piecewise/test_qwenvl_vit_cudagraph.py    |  12 +-
 vllm/config/compilation.py                    |   8 +-
 vllm/config/vllm.py                           | 116 ++++++++++--------
 vllm/engine/arg_utils.py                      |  20 +--
 vllm/forward_context.py                       |   8 +-
 vllm/model_executor/models/qwen2_5_vl.py      |   8 +-
 vllm/model_executor/models/qwen3_vl.py        |   8 +-
 vllm/model_executor/models/vision.py          |   7 +-
 vllm/v1/cudagraph_dispatcher.py               |  53 ++++----
 vllm/v1/worker/gpu_model_runner.py            |  46 +++----
 11 files changed, 166 insertions(+), 134 deletions(-)

diff --git a/docs/design/torch_compile_multimodal.md b/docs/design/torch_compile_multimodal.md
index f3f3f3b433f5..260f4e136a58 100644
--- a/docs/design/torch_compile_multimodal.md
+++ b/docs/design/torch_compile_multimodal.md
@@ -72,7 +72,7 @@ vLLM now supports Piecewise CUDA Graph integration for the Vision Transformer (V
 
 #### Enabling ViT CUDA Graphs
 
-**Important**: This feature is **not enabled by default**. The Piecewise CUDA Graph implementation relies on `torch.compile` to trace the computation graph and separate the attention operators. Therefore, users must explicitly enable ViT compilation via the `--compilation-config` argument to activate this feature.
+**Important**: This feature is **not enabled by default**. The Piecewise CUDA Graph implementation relies on `torch.compile` to trace the computation graph and separate the attention operators. Therefore, users must explicitly enable mm_encoder compilation via the `--compilation-config` argument to activate this feature.
 
 To enable ViT CUDA graph compilation, use:
 
@@ -82,26 +82,26 @@ vllm serve <model> --compilation-config '{"compile_mm_encoder": true}'
 
 #### Configuring Capture Sizes
 
-You can specify custom patch sizes for CUDA graph capture using `vit_cudagraph_capture_sizes`. For models like `Qwen2.5-VL` and `Qwen3-VL`, the capture sizes should be multiples of the square of `merge_size`:
+You can specify custom patch sizes for CUDA graph capture using `mm_encoder_cudagraph_capture_sizes`. For models like `Qwen2.5-VL` and `Qwen3-VL`, the capture sizes should be multiples of the square of `merge_size`:
 
 ```bash
-vllm serve <model> --compilation-config '{"compile_mm_encoder": true, "vit_cudagraph_capture_sizes": [512, 1024]}'
+vllm serve <model> --compilation-config '{"compile_mm_encoder": true, "mm_encoder_cudagraph_capture_sizes": [512, 1024]}'
 ```
 
-Alternatively, you can specify `max_vit_cudagraph_capture_size` to generate a default list of capture sizes up to the given value:
+Alternatively, you can specify `max_mm_encoder_cudagraph_capture_size` to generate a default list of capture sizes up to the given value:
 
 ```bash
-vllm serve <model> --compilation-config '{"compile_mm_encoder": true, "max_vit_cudagraph_capture_size": 2048}'
+vllm serve <model> --compilation-config '{"compile_mm_encoder": true, "max_mm_encoder_cudagraph_capture_size": 2048}'
 ```
 
 #### Default Behavior
 
-Once enabled, if `vit_cudagraph_capture_sizes` is not specified, vLLM will use a default set of sizes for capture. Since `compile_mm_encoder` is `False` by default, this feature remains inactive unless configured.
+Once enabled, if `mm_encoder_cudagraph_capture_sizes` is not specified, vLLM will use a default set of sizes for capture. Since `compile_mm_encoder` is `False` by default, this feature remains inactive unless configured.
 
 If you only want to enable `torch.compile` for ViT without using the CUDA Graph feature, you can explicitly set the capture sizes to empty:
 
 ```bash
-vllm serve <model> --compilation-config '{"compile_mm_encoder": true, "vit_cudagraph_capture_sizes": []}'
+vllm serve <model> --compilation-config '{"compile_mm_encoder": true, "mm_encoder_cudagraph_capture_sizes": []}'
 ```
 
 #### Limitations & Notes
diff --git a/tests/compile/piecewise/test_qwenvl_vit_cudagraph.py b/tests/compile/piecewise/test_qwenvl_vit_cudagraph.py
index cddf2147b137..ec3176885409 100644
--- a/tests/compile/piecewise/test_qwenvl_vit_cudagraph.py
+++ b/tests/compile/piecewise/test_qwenvl_vit_cudagraph.py
@@ -22,8 +22,9 @@ def _worker_embed_multimodal(
     This function sets up the necessary forward context for tensor-parallel (TP)
     execution and then calls the model's `embed_multimodal` method.
     Note: For data-parallel (DP) mode, the forward context is typically
-    created and managed within the vision dispatcher, which would override
-    the context set here.
+          created and managed inside run_dp_sharded_mrope_vision_model()
+          (defined in vision.py), which would override the
+          context set here.
     Args:
         worker: The worker instance containing the model runner.
         vllm_config: The vLLM engine configuration.
@@ -103,7 +104,7 @@ def llm(request):
             compilation_config=CompilationConfig(
                 cudagraph_mode="PIECEWISE",
                 compile_mm_encoder=True,
-                vit_cudagraph_capture_sizes=[64, 128, 256],
+                mm_encoder_cudagraph_capture_sizes=[64, 128, 256],
             ),
         )
         print(f"LLM initialized for {model_name} tp={tp_size} mode={mm_mode}")
@@ -154,7 +155,10 @@ def _run_embed_multimodal(
 
         # Dispatch to get runtime mode and batch descriptor
         cudagraph_runtime_mode, batch_descriptor = dispatcher.dispatch(
-            num_tokens=num_patches, uniform_decode=False, has_lora=False, is_vit=True
+            num_tokens=num_patches,
+            uniform_decode=False,
+            has_lora=False,
+            is_mm_encoder=True,
         )
 
         model_executor = llm.llm_engine.model_executor
diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py
index 2da96d938765..70ba6e68ec5d 100644
--- a/vllm/config/compilation.py
+++ b/vllm/config/compilation.py
@@ -530,12 +530,12 @@ class CompilationConfig:
     """Sizes to capture cudagraph.
     - None (default): capture sizes are inferred from vllm config.
     - list[int]: capture sizes are specified as given."""
-    vit_cudagraph_capture_sizes: list[int] | None = None
-    """Sizes to capture vit cudagraph.
+    mm_encoder_cudagraph_capture_sizes: list[int] | None = None
+    """Sizes to capture mm_encoder cudagraph.
     - None (default): capture sizes are inferred from vllm config.
     - list[int]: capture sizes are specified as given."""
-    max_vit_cudagraph_capture_size: int = field(default=None)
-    """The maximum vit cudagraph capture size.
+    max_mm_encoder_cudagraph_capture_size: int = field(default=None)
+    """The maximum mm_encoder cudagraph capture size.
     """
     cudagraph_copy_inputs: bool = False
     """Whether to copy input tensors for
diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py
index 919f214ce720..8a05b2533089 100644
--- a/vllm/config/vllm.py
+++ b/vllm/config/vllm.py
@@ -257,10 +257,10 @@ class VllmConfig:
     performance, with -O0 having the best startup time and -O3 having the best
     performance. -02 is used by defult. See  OptimizationLevel for full
     description."""
-    is_in_compile_or_vit_cuda_graph_capture: bool = False
-    """Flag for ViT compilation or ViT CUDA graph capture.
+    in_mm_encoder_tracing: bool = False
+    """Flag for mm_encoder compilation or mm_encoder CUDA graph capture.
     
-    If true, ViT in DP mode will execute the ViT model directly instead of
+    If true, the mm_encoder model is executed directly in DP mode instead of via
     `run_dp_sharded_mrope_vision_model` to ensure correct memory profiling
     and compilation for each rank.
     """
@@ -827,7 +827,7 @@ def has_blocked_weights():
                 self.compilation_config.cudagraph_num_of_warmups = 1
 
             self._set_cudagraph_sizes()
-            self._set_vit_cudagraph_sizes()
+            self._set_mm_encoder_cudagraph_sizes()
         else:
             self.compilation_config.cudagraph_mode = CUDAGraphMode.NONE
 
@@ -1347,12 +1347,12 @@ def _set_compile_ranges(self):
             computed_compile_ranges_split_points
         )
 
-    def _set_vit_cudagraph_sizes(self):
-        """Sets the CUDA graph capture sizes for the Vision Transformer (ViT).
+    def _set_mm_encoder_cudagraph_sizes(self):
+        """Sets the CUDA graph capture sizes for the multimodal encoder (MM Encoder).
 
         This method determines the batch sizes (in terms of number of patches)
-        for which ViT CUDA graphs will be captured. CUDA graphs improve
-        performance by reducing kernel launch overhead for the vision encoder.
+        for which MM Encoder CUDA graphs will be captured. CUDA graphs improve
+        performance by reducing kernel launch overhead for the multimodal encoder.
 
         The logic is as follows:
         1.  The feature is only enabled if all of the following conditions are met:
@@ -1361,22 +1361,22 @@ def _set_vit_cudagraph_sizes(self):
             - CUDA graph mode is enabled (`cudagraph_mode` is not NONE).
             - Multimodal encoder compilation is enabled (`compile_mm_encoder` is True).
             If these conditions are not met, the list of capture sizes will be empty,
-            effectively disabling ViT CUDA graphs.
+            effectively disabling mm_encoder CUDA graphs.
 
-        2.  If the user has explicitly provided `vit_cudagraph_capture_sizes` in the
-            compilation config, those sizes are used. The list is de-duplicated
-            and sorted in ascending order.
+        2.  If the user has explicitly provided `mm_encoder_cudagraph_capture_sizes`
+            in the compilation config, those sizes are used. The list is
+            de-duplicated and sorted in ascending order.
 
         3.  If no sizes are provided by the user, a default list of sizes is
             generated. The maximum size for this list is determined automatically
             by `compute_encoder_budget` (capped at 8192), or by the user-provided
-            `max_vit_cudagraph_capture_size`. The default sizes are:
+            `max_mm_encoder_cudagraph_capture_size`. The default sizes are:
             [512, 1024, 1536] + list(range(2048, 4096, 128)) + list(
             range(4096, max_size + 1, 256))
 
         4.  The final list of sizes is stored in
-            `self.compilation_config.vit_cudagraph_capture_sizes`. The
-            `max_vit_cudagraph_capture_size` is also updated to be consistent
+            `self.compilation_config.mm_encoder_cudagraph_capture_sizes`. The
+            `max_mm_encoder_cudagraph_capture_size` is also updated to be consistent
             with the largest value in this final list.
 
         At runtime:
@@ -1391,11 +1391,11 @@ def _set_vit_cudagraph_sizes(self):
             and self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE
             and self.compilation_config.compile_mm_encoder
         ):
-            # determine the initial max_vit_cudagraph_capture_size
-            max_vit_cudagraph_capture_size = (
-                self.compilation_config.max_vit_cudagraph_capture_size
+            # determine the initial max_mm_encoder_cudagraph_capture_size
+            max_mm_encoder_cudagraph_capture_size = (
+                self.compilation_config.max_mm_encoder_cudagraph_capture_size
             )
-            if max_vit_cudagraph_capture_size is None:
+            if max_mm_encoder_cudagraph_capture_size is None:
                 from vllm.multimodal import MULTIMODAL_REGISTRY
                 from vllm.v1.core.encoder_cache_manager import compute_encoder_budget
 
@@ -1404,65 +1404,83 @@ def _set_vit_cudagraph_sizes(self):
                     scheduler_config=self.scheduler_config,
                     mm_registry=MULTIMODAL_REGISTRY,
                 )
-                max_vit_cudagraph_capture_size = min(encoder_compute_budget, 8192)
+                max_mm_encoder_cudagraph_capture_size = min(
+                    encoder_compute_budget, 8192
+                )
 
-            # determine the vit_cudagraph_capture_sizes
-            if self.compilation_config.vit_cudagraph_capture_sizes is not None:
+            # determine the mm_encoder_cudagraph_capture_sizes
+            if self.compilation_config.mm_encoder_cudagraph_capture_sizes is not None:
                 # de-duplicate the sizes provided by the config
                 dedup_sizes = list(
-                    set(self.compilation_config.vit_cudagraph_capture_sizes)
+                    set(self.compilation_config.mm_encoder_cudagraph_capture_sizes)
                 )
-                vit_cudagraph_capture_sizes = dedup_sizes
+                mm_encoder_cudagraph_capture_sizes = dedup_sizes
                 # sort to make sure the sizes are in ascending order
-                vit_cudagraph_capture_sizes.sort()
+                mm_encoder_cudagraph_capture_sizes.sort()
             else:
-                vit_cudagraph_capture_sizes = [
-                    i for i in [512, 1024, 1536] if i <= max_vit_cudagraph_capture_size
+                mm_encoder_cudagraph_capture_sizes = [
+                    i
+                    for i in [512, 1024, 1536]
+                    if i <= max_mm_encoder_cudagraph_capture_size
                 ]
-                if max_vit_cudagraph_capture_size >= 2048:
+                if max_mm_encoder_cudagraph_capture_size >= 2048:
                     # Step size 128 for larger batch sizes
-                    vit_cudagraph_capture_sizes += list(
-                        range(2048, min(max_vit_cudagraph_capture_size + 1, 4096), 128)
+                    mm_encoder_cudagraph_capture_sizes += list(
+                        range(
+                            2048,
+                            min(max_mm_encoder_cudagraph_capture_size + 1, 4096),
+                            128,
+                        )
                     )
-                if max_vit_cudagraph_capture_size >= 4096:
+                if max_mm_encoder_cudagraph_capture_size >= 4096:
                     # Step size 256 for largest batch sizes
-                    vit_cudagraph_capture_sizes += list(
-                        range(4096, max_vit_cudagraph_capture_size + 1, 256)
+                    mm_encoder_cudagraph_capture_sizes += list(
+                        range(4096, max_mm_encoder_cudagraph_capture_size + 1, 256)
                     )
 
-            # user-specific compilation_config.max_vit_cudagraph_capture_size get
+            # user-specified compilation_config.max_mm_encoder_cudagraph_capture_size is
             # truncated to valid_max_size when they are inconsistent.
             valid_max_size = (
-                vit_cudagraph_capture_sizes[-1] if vit_cudagraph_capture_sizes else 0
+                mm_encoder_cudagraph_capture_sizes[-1]
+                if mm_encoder_cudagraph_capture_sizes
+                else 0
             )
             if (
-                self.compilation_config.max_vit_cudagraph_capture_size is not None
-                and self.compilation_config.max_vit_cudagraph_capture_size
+                self.compilation_config.max_mm_encoder_cudagraph_capture_size
+                is not None
+                and self.compilation_config.max_mm_encoder_cudagraph_capture_size
                 != valid_max_size
             ):
                 # raise error only when both two flags are user-specified
                 # and they are inconsistent with each other
-                if self.compilation_config.vit_cudagraph_capture_sizes is not None:
+                if (
+                    self.compilation_config.mm_encoder_cudagraph_capture_sizes
+                    is not None
+                ):
                     raise ValueError(
-                        "customized max_vit_cudagraph_capture_size"
-                        f"(={self.compilation_config.max_vit_cudagraph_capture_size}) "
-                        "should be consistent with the max value of "
-                        f"vit_cudagraph_capture_sizes(={valid_max_size})"
+                        "customized max_mm_encoder_cudagraph_capture_size(="
+                        f"{
+                            self.compilation_config.max_mm_encoder_cudagraph_capture_size
+                        }"
+                        ") should be consistent with the max value of "
+                        f"mm_encoder_cudagraph_capture_sizes(={valid_max_size})"
                     )
 
                 logger.warning(
-                    "Truncating max_vit_cudagraph_capture_size to %d",
+                    "Truncating max_mm_encoder_cudagraph_capture_size to %d",
                     valid_max_size,
                 )
-            # always set the final max_vit_cudagraph_capture_size
-            self.compilation_config.max_vit_cudagraph_capture_size = valid_max_size
-            self.compilation_config.vit_cudagraph_capture_sizes = (
-                vit_cudagraph_capture_sizes
+            # always set the final max_mm_encoder_cudagraph_capture_size
+            self.compilation_config.max_mm_encoder_cudagraph_capture_size = (
+                valid_max_size
+            )
+            self.compilation_config.mm_encoder_cudagraph_capture_sizes = (
+                mm_encoder_cudagraph_capture_sizes
             )
         else:
             # no cudagraph in use
-            self.compilation_config.max_vit_cudagraph_capture_size = 0
-            self.compilation_config.vit_cudagraph_capture_sizes = []
+            self.compilation_config.max_mm_encoder_cudagraph_capture_size = 0
+            self.compilation_config.mm_encoder_cudagraph_capture_sizes = []
 
     def try_verify_and_update_config(self):
         if self.model_config is None:
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 9733b0f26ec2..7a8599a29e43 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -378,8 +378,8 @@ class EngineArgs:
     max_cudagraph_capture_size: int | None = get_field(
         CompilationConfig, "max_cudagraph_capture_size"
     )
-    vit_cudagraph_capture_sizes: list[int] | None = (
-        CompilationConfig.vit_cudagraph_capture_sizes
+    mm_encoder_cudagraph_capture_sizes: list[int] | None = (
+        CompilationConfig.mm_encoder_cudagraph_capture_sizes
     )
     # Note: Specifying a custom executor backend by passing a class
     # is intended for expert use only. The API may change without
@@ -1152,8 +1152,8 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
             "--cudagraph-capture-sizes", **compilation_kwargs["cudagraph_capture_sizes"]
         )
         compilation_group.add_argument(
-            "--vit-cudagraph-capture-sizes",
-            **compilation_kwargs["vit_cudagraph_capture_sizes"],
+            "--mm_encoder-cudagraph-capture-sizes",
+            **compilation_kwargs["mm_encoder_cudagraph_capture_sizes"],
         )
         compilation_group.add_argument(
             "--max-cudagraph-capture-size",
@@ -1745,14 +1745,14 @@ def create_engine_config(
                 )
             compilation_config.cudagraph_capture_sizes = self.cudagraph_capture_sizes
 
-        if self.vit_cudagraph_capture_sizes is not None:
-            if compilation_config.vit_cudagraph_capture_sizes is not None:
+        if self.mm_encoder_cudagraph_capture_sizes is not None:
+            if compilation_config.mm_encoder_cudagraph_capture_sizes is not None:
                 raise ValueError(
-                    "vit_cudagraph_capture_sizes and compilation_config."
-                    "vit_cudagraph_capture_sizes are mutually exclusive"
+                    "mm_encoder_cudagraph_capture_sizes and compilation_config."
+                    "mm_encoder_cudagraph_capture_sizes are mutually exclusive"
                 )
-            compilation_config.vit_cudagraph_capture_sizes = (
-                self.vit_cudagraph_capture_sizes
+            compilation_config.mm_encoder_cudagraph_capture_sizes = (
+                self.mm_encoder_cudagraph_capture_sizes
             )
 
         if self.max_cudagraph_capture_size is not None:
diff --git a/vllm/forward_context.py b/vllm/forward_context.py
index d7a7603f6b41..be08e2d9a6bc 100644
--- a/vllm/forward_context.py
+++ b/vllm/forward_context.py
@@ -55,9 +55,9 @@ class BatchDescriptor(NamedTuple):
     (like fused_moe_lora) whose grid size depends on num_active_loras
     to be properly captured.
     """
-    is_vit: bool = False
+    is_mm_encoder: bool = False
     """
-    ViT Piecewise CUDA Graph Flag
+    mm_encoder Piecewise CUDA Graph Flag
     """
 
     def relax_for_mixed_batch_cudagraphs(self) -> "BatchDescriptor":
@@ -71,7 +71,7 @@ def relax_for_mixed_batch_cudagraphs(self) -> "BatchDescriptor":
             uniform=False,
             has_lora=self.has_lora,
             num_active_loras=self.num_active_loras,
-            is_vit=self.is_vit,
+            is_mm_encoder=self.is_mm_encoder,
         )
 
 
@@ -255,7 +255,7 @@ class ForwardContext:
     all_moe_layers: list[str] | None = None
     moe_layer_index: int = 0
 
-    # ViT Multi-Modal Encoder flags used by backend compiler
+    # Multi-modal encoder (mm_encoder) flags used by the backend compiler
     is_first_graph_in_mm_encoder_sequence: bool = True
     is_last_graph_in_mm_encoder_sequence: bool = True
 
diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py
index 1127cfea1634..e2793bae0b4b 100644
--- a/vllm/model_executor/models/qwen2_5_vl.py
+++ b/vllm/model_executor/models/qwen2_5_vl.py
@@ -653,9 +653,9 @@ def __init__(
         self._persistent_hidden_states_buffer: torch.Tensor | None = None
         self._persistent_rotary_pos_emb_cos_buffer: torch.Tensor | None = None
         self._persistent_rotary_pos_emb_sin_buffer: torch.Tensor | None = None
-        if vllm_config.compilation_config.vit_cudagraph_capture_sizes:
+        if vllm_config.compilation_config.mm_encoder_cudagraph_capture_sizes:
             max_compile_size = (
-                vllm_config.compilation_config.vit_cudagraph_capture_sizes[-1]
+                vllm_config.compilation_config.mm_encoder_cudagraph_capture_sizes[-1]
             )
             self._persistent_hidden_states_buffer = torch.empty(
                 (max_compile_size, self.patch_embed.proj.input_size),
@@ -1292,7 +1292,7 @@ def _process_image_input(
             with set_current_vllm_config(self.vllm_config):
                 if (
                     self.use_data_parallel
-                    and not self.vllm_config.is_in_compile_or_vit_cuda_graph_capture
+                    and not self.vllm_config.in_mm_encoder_tracing
                 ):
                     return run_dp_sharded_mrope_vision_model(
                         self.visual,
@@ -1356,7 +1356,7 @@ def _process_video_input(
             with set_current_vllm_config(self.vllm_config):
                 if (
                     self.use_data_parallel
-                    and not self.vllm_config.is_in_compile_or_vit_cuda_graph_capture
+                    and not self.vllm_config.in_mm_encoder_tracing
                 ):
                     return run_dp_sharded_mrope_vision_model(
                         self.visual,
diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py
index 6a8ef0c239ec..e87d3701c31a 100644
--- a/vllm/model_executor/models/qwen3_vl.py
+++ b/vllm/model_executor/models/qwen3_vl.py
@@ -439,9 +439,9 @@ def __init__(
         self._persistent_hidden_states_buffer: torch.Tensor | None = None
         self._persistent_rotary_pos_emb_cos_buffer: torch.Tensor | None = None
         self._persistent_rotary_pos_emb_sin_buffer: torch.Tensor | None = None
-        if vllm_config.compilation_config.vit_cudagraph_capture_sizes:
+        if vllm_config.compilation_config.mm_encoder_cudagraph_capture_sizes:
             max_compile_size = (
-                vllm_config.compilation_config.vit_cudagraph_capture_sizes[-1]
+                vllm_config.compilation_config.mm_encoder_cudagraph_capture_sizes[-1]
             )
             self._persistent_hidden_states_buffer = torch.empty(
                 (max_compile_size, self.patch_embed.proj.input_size),
@@ -1531,7 +1531,7 @@ def _process_image_input(
             with set_current_vllm_config(self.vllm_config):
                 if (
                     self.use_data_parallel
-                    and not self.vllm_config.is_in_compile_or_vit_cuda_graph_capture
+                    and not self.vllm_config.in_mm_encoder_tracing
                 ):
                     return run_dp_sharded_mrope_vision_model(
                         self.visual,
@@ -1566,7 +1566,7 @@ def _process_video_input(
             with set_current_vllm_config(self.vllm_config):
                 if (
                     self.use_data_parallel
-                    and not self.vllm_config.is_in_compile_or_vit_cuda_graph_capture
+                    and not self.vllm_config.in_mm_encoder_tracing
                 ):
                     return run_dp_sharded_mrope_vision_model(
                         self.visual,
diff --git a/vllm/model_executor/models/vision.py b/vllm/model_executor/models/vision.py
index 1637b27209af..67a50b0da054 100644
--- a/vllm/model_executor/models/vision.py
+++ b/vllm/model_executor/models/vision.py
@@ -494,14 +494,17 @@ def run_dp_sharded_mrope_vision_model(
     cudagraph_runtime_mode = CUDAGraphMode.NONE
     batch_descriptor = None
 
-    if vllm_config and vllm_config.compilation_config.vit_cudagraph_capture_sizes:
+    if (
+        vllm_config
+        and vllm_config.compilation_config.mm_encoder_cudagraph_capture_sizes
+    ):
         current_input_len = pixel_values_local.shape[0]
         cudagraph_runtime_mode, batch_descriptor = dispatcher.dispatch(
             num_tokens=current_input_len,
             uniform_decode=False,
             has_lora=False,
             disable_full=False,
-            is_vit=True,
+            is_mm_encoder=True,
         )
         target_input_len = batch_descriptor.num_tokens
 
diff --git a/vllm/v1/cudagraph_dispatcher.py b/vllm/v1/cudagraph_dispatcher.py
index 0549a57597ce..1dfe1d07f1d4 100644
--- a/vllm/v1/cudagraph_dispatcher.py
+++ b/vllm/v1/cudagraph_dispatcher.py
@@ -70,7 +70,7 @@ def _compute_bs_to_padded_graph_size(self) -> None:
         """Pre-compute the mapping from batch size to padded graph size."""
         max_capture_size = self.compilation_config.max_cudagraph_capture_size
         capture_sizes = self.compilation_config.cudagraph_capture_sizes
-        self._bs_to_padded_graph_size = self._get_padded_size_map(
+        self._bs_to_padded_graph_size: list[int] = self._get_padded_size_map(
             capture_sizes, max_capture_size
         )
 
@@ -112,12 +112,12 @@ def _get_lora_cases(self) -> list[int]:
             # No specialization: only capture graphs with LoRA active
             return [lora_config.max_loras + 1]
 
-    def _compute_bs_to_padded_vit_graph_size(self) -> None:
-        """pre-compute the mapping from batch size to ViT padded graph size."""
-        max_capture_size = self.compilation_config.max_vit_cudagraph_capture_size
-        capture_sizes = self.compilation_config.vit_cudagraph_capture_sizes
+    def _compute_bs_to_padded_mm_encoder_graph_size(self) -> None:
+        """pre-compute the mapping from batch size to mm_encoder padded graph size."""
+        max_capture_size = self.compilation_config.max_mm_encoder_cudagraph_capture_size
+        capture_sizes = self.compilation_config.mm_encoder_cudagraph_capture_sizes
 
-        self._bs_to_padded_vit_graph_size = self._get_padded_size_map(
+        self._bs_to_padded_mm_encoder_graph_size: list[int] = self._get_padded_size_map(
             capture_sizes, max_capture_size
         )
 
@@ -146,12 +146,12 @@ def _create_padded_batch_descriptor(
         uniform_decode: bool,
         has_lora: bool,
         num_active_loras: int = 0,
-        is_vit: bool = False,
+        is_mm_encoder: bool = False,
     ) -> BatchDescriptor:
         max_num_seqs = self.vllm_config.scheduler_config.max_num_seqs
         uniform_decode_query_len = self.uniform_decode_query_len
-        if is_vit:
-            num_tokens_padded = self._bs_to_padded_vit_graph_size[num_tokens]
+        if is_mm_encoder:
+            num_tokens_padded = self._bs_to_padded_mm_encoder_graph_size[num_tokens]
         else:
             num_tokens_padded = self._bs_to_padded_graph_size[num_tokens]
 
@@ -168,7 +168,7 @@ def _create_padded_batch_descriptor(
             uniform=uniform_decode,
             has_lora=has_lora,
             num_active_loras=num_active_loras,
-            is_vit=is_vit
+            is_mm_encoder=is_mm_encoder,
         )
 
     def add_cudagraph_key(
@@ -192,7 +192,7 @@ def initialize_cudagraph_keys(
             return
 
         self._compute_bs_to_padded_graph_size()
-        self._compute_bs_to_padded_vit_graph_size()
+        self._compute_bs_to_padded_mm_encoder_graph_size()
 
         # Get LoRA cases to capture
         lora_cases = self._get_lora_cases()
@@ -213,12 +213,12 @@ def initialize_cudagraph_keys(
                         bs, False, num_active_loras > 0, num_active_loras
                     ).relax_for_mixed_batch_cudagraphs(),
                 )
-            # ViT CUDAGraph Entry
-            for patch_len in self.compilation_config.vit_cudagraph_capture_sizes:
+            # mm_encoder CUDAGraph Entry
+            for patch_len in self.compilation_config.mm_encoder_cudagraph_capture_sizes:
                 self.add_cudagraph_key(
-                    cudagraph_mode.mixed_mode(),
+                    CUDAGraphMode.PIECEWISE,
                     self._create_padded_batch_descriptor(
-                        patch_len, False, False, is_vit=True
+                        patch_len, False, False, is_mm_encoder=True
                     ).relax_for_mixed_batch_cudagraphs(),
                 )
 
@@ -256,7 +256,7 @@ def dispatch(
         has_lora: bool = False,
         disable_full: bool = False,
         num_active_loras: int = 0,
-        is_vit: bool = False,
+        is_mm_encoder: bool = False,
     ) -> tuple[CUDAGraphMode, BatchDescriptor]:
         """
         Given conditions(e.g.,batch descriptor and if using piecewise only),
@@ -278,15 +278,18 @@ def dispatch(
             not self.keys_initialized
             or self.cudagraph_mode == CUDAGraphMode.NONE
             or (
-                not is_vit
+                not is_mm_encoder
                 and num_tokens > self.compilation_config.max_cudagraph_capture_size
             )
             or (
-                is_vit
-                and num_tokens > self.compilation_config.max_vit_cudagraph_capture_size
+                is_mm_encoder
+                and num_tokens
+                > self.compilation_config.max_mm_encoder_cudagraph_capture_size
             )
         ):
-            return CUDAGraphMode.NONE, BatchDescriptor(num_tokens, is_vit=is_vit)
+            return CUDAGraphMode.NONE, BatchDescriptor(
+                num_tokens, is_mm_encoder=is_mm_encoder
+            )
 
         effective_num_active_loras = num_active_loras
         if has_lora and num_active_loras > 0:
@@ -305,7 +308,7 @@ def dispatch(
                 effective_num_active_loras = self.vllm_config.lora_config.max_loras + 1
 
         batch_desc = self._create_padded_batch_descriptor(
-            num_tokens, uniform_decode, has_lora, effective_num_active_loras, is_vit
+            num_tokens, uniform_decode, has_lora, effective_num_active_loras, is_mm_encoder
         )
         relaxed_batch_desc = batch_desc.relax_for_mixed_batch_cudagraphs()
 
@@ -324,10 +327,12 @@ def dispatch(
             return CUDAGraphMode.PIECEWISE, relaxed_batch_desc
 
         # finally, just return no cudagraphs and a trivial batch descriptor
-        return CUDAGraphMode.NONE, BatchDescriptor(num_tokens, is_vit=is_vit)
+        return CUDAGraphMode.NONE, BatchDescriptor(
+            num_tokens, is_mm_encoder=is_mm_encoder
+        )
 
     def get_capture_descs(
-        self, is_vit: bool = False
+        self, is_mm_encoder: bool = False
     ) -> list[tuple[CUDAGraphMode, list[BatchDescriptor]]]:
         """
         Returns capture descriptors for cudagraph capturing.
@@ -346,7 +351,7 @@ def get_capture_descs(
             descs = list(self.cudagraph_keys[mode])
             if descs:
                 # Sort by num_tokens descending (largest first)
-                filter_descs = [d for d in descs if d.is_vit == is_vit]
+                filter_descs = [d for d in descs if d.is_mm_encoder == is_mm_encoder]
                 if filter_descs:
                     filter_descs.sort(key=lambda d: d.num_tokens, reverse=True)
                     result.append((mode, filter_descs))
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index b07874cd6a16..a8d7d74ea127 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -553,14 +553,14 @@ def __init__(
             self.cudagraph_batch_sizes = sorted(
                 self.compilation_config.cudagraph_capture_sizes
             )
-        # self.vit_cudagraph_batch_sizes sorts in ascending order.
-        self.vit_cudagraph_batch_sizes: list[int] | None = None
+        # self.mm_encoder_cudagraph_batch_sizes sorts in ascending order.
+        self.mm_encoder_cudagraph_batch_sizes: list[int] | None = None
         if (
-            self.compilation_config.vit_cudagraph_capture_sizes
+            self.compilation_config.mm_encoder_cudagraph_capture_sizes
             and self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE
         ):
-            self.vit_cudagraph_batch_sizes = sorted(
-                self.compilation_config.vit_cudagraph_capture_sizes
+            self.mm_encoder_cudagraph_batch_sizes = sorted(
+                self.compilation_config.mm_encoder_cudagraph_capture_sizes
             )
 
         # Cache the device properties.
@@ -2441,11 +2441,11 @@ def _execute_mm_encoder(
                     original_num_imgs = -1
                     padded_num_tokens = -1
 
-                    # Default values for non-ViT cudagraph case
+                    # Default values for non-mm_encoder cudagraph case
                     cudagraph_runtime_mode = CUDAGraphMode.NONE
                     batch_descriptor = None
                     if (
-                        self.vit_cudagraph_batch_sizes
+                        self.mm_encoder_cudagraph_batch_sizes
                         and "pixel_values" in mm_kwargs_group
                     ):
                         pixel_values = cast(
@@ -2460,7 +2460,7 @@ def _execute_mm_encoder(
                                 uniform_decode=False,
                                 has_lora=False,
                                 disable_full=False,
-                                is_vit=True,
+                                is_mm_encoder=True,
                             )
                         )
                         padded_num_tokens = batch_descriptor.num_tokens
@@ -4682,10 +4682,10 @@ def _get_dummy_h_w_patches(self, patches: int):
         w_patches = patches // merge_size
         return h_patches, w_patches
 
-    def _get_dummy_vit_input(
+    def _get_dummy_mm_encoder_input(
         self, num_image_tokens: int, img_feature_dim: int
     ) -> BatchedTensorInputs:
-        """Dummy data for profiling and precompiling ViT."""
+        """Dummy data for profiling and precompiling mm_encoder."""
 
         # The first dimension of pixel_values corresponds
         # to the total number of patches.
@@ -5236,7 +5236,7 @@ def _dummy_mm_encoder_run(
         self,
         compilation_cases: list[int],
     ) -> None:
-        self.vllm_config.is_in_compile_or_vit_cuda_graph_capture = True
+        self.vllm_config.in_mm_encoder_tracing = True
         tmp_dummy_mm_inputs = self._get_mm_dummy_batch(
             "video",
             1,
@@ -5249,17 +5249,19 @@ def _dummy_mm_encoder_run(
             compilation_cases = tqdm(
                 compilation_cases,
                 disable=not self.load_config.use_tqdm_on_load,
-                desc="Capturing Vit CUDA graphs (PIECEWISE)",
+                desc="Capturing mm_encoder CUDA graphs (PIECEWISE)",
             )
-        # Lazy initialization of the persistent buffer
+
         for capture_size in compilation_cases:
-            dummy_mm_inputs = self._get_dummy_vit_input(capture_size, img_feature_dim)
+            dummy_mm_inputs = self._get_dummy_mm_encoder_input(
+                capture_size, img_feature_dim
+            )
             cudagraph_mode, batch_descriptor = self.cudagraph_dispatcher.dispatch(
                 num_tokens=capture_size,
                 uniform_decode=False,
                 has_lora=False,
                 disable_full=False,
-                is_vit=True,
+                is_mm_encoder=True,
             )
             with (
                 set_forward_context(
@@ -5271,10 +5273,10 @@ def _dummy_mm_encoder_run(
                 ),
             ):
                 self.model.embed_multimodal(**dummy_mm_inputs)
-        self.vllm_config.is_in_compile_or_vit_cuda_graph_capture = False
+        self.vllm_config.in_mm_encoder_tracing = False
 
     def profile_run(self) -> None:
-        self.vllm_config.is_in_compile_or_vit_cuda_graph_capture = True
+        self.vllm_config.in_mm_encoder_tracing = True
         # Profile with multimodal encoder & encoder cache.
         if self.supports_mm_inputs:
             mm_config = self.model_config.multimodal_config
@@ -5339,7 +5341,7 @@ def profile_run(self) -> None:
         del hidden_states, output
         self.encoder_cache.clear()
         gc.collect()
-        self.vllm_config.is_in_compile_or_vit_cuda_graph_capture = False
+        self.vllm_config.in_mm_encoder_tracing = False
 
     def capture_model(self) -> int:
         if self.compilation_config.cudagraph_mode == CUDAGraphMode.NONE:
@@ -5491,10 +5493,10 @@ def _capture_cudagraphs(
             cudagraph_runtime_mode == CUDAGraphMode.PIECEWISE
             and self.supports_mm_inputs
         ):
-            vit_capture_sizes = self.vit_cudagraph_batch_sizes
-            if vit_capture_sizes:
-                compilation_cases_vit = list(reversed(vit_capture_sizes))
-                self._dummy_mm_encoder_run(compilation_cases_vit)
+            mm_encoder_capture_sizes = self.mm_encoder_cudagraph_batch_sizes
+            if mm_encoder_capture_sizes:
+                compilation_cases_mm_encoder = list(reversed(mm_encoder_capture_sizes))
+                self._dummy_mm_encoder_run(compilation_cases_mm_encoder)
 
         self.maybe_remove_all_loras(self.lora_config)
 

From 13c6422a591d505a3946ef399f5f8cd2bbed96dc Mon Sep 17 00:00:00 2001
From: Hongjian Zhang <hirokenovo@gmail.com>
Date: Tue, 3 Feb 2026 00:10:44 +0800
Subject: [PATCH 30/35] feat: add MMEncoderCudagraphManager and update related
 components for multimodal input handling

Signed-off-by: Hongjian Zhang <hirokenovo@gmail.com>
---
 vllm/model_executor/models/qwen2_5_vl.py   |  14 +-
 vllm/model_executor/models/qwen2_vl.py     |  19 ++
 vllm/model_executor/models/qwen3_vl.py     |  33 +++-
 vllm/model_executor/models/vision.py       |  50 ++----
 vllm/multimodal/processing/dummy_inputs.py |  50 ++++++
 vllm/v1/worker/gpu_model_runner.py         | 192 ++++-----------------
 vllm/v1/worker/mm_cudagraph.py             | 173 +++++++++++++++++++
 7 files changed, 324 insertions(+), 207 deletions(-)
 create mode 100644 vllm/v1/worker/mm_cudagraph.py

diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py
index e2793bae0b4b..270346801437 100644
--- a/vllm/model_executor/models/qwen2_5_vl.py
+++ b/vllm/model_executor/models/qwen2_5_vl.py
@@ -1279,7 +1279,7 @@ def _parse_and_validate_video_input(
     def _process_image_input(
         self,
         image_input: Qwen2_5_VLImageInputs,
-        cudagraph_dispatcher: Any | None = None,
+        mm_cudagraph_manager: Any | None = None,
     ) -> tuple[torch.Tensor, ...]:
         grid_thw = image_input["image_grid_thw"]
         assert grid_thw.ndim == 2
@@ -1299,7 +1299,7 @@ def _process_image_input(
                         pixel_values,
                         grid_thw_list,
                         rope_type="rope_3d",
-                        cudagraph_dispatcher=cudagraph_dispatcher,
+                        mm_cudagraph_manager=mm_cudagraph_manager,
                     )
                 else:
                     image_embeds = self.visual(pixel_values, grid_thw=grid_thw_list)
@@ -1343,7 +1343,7 @@ def _postprocess_image_embeds_evs(
     def _process_video_input(
         self,
         video_input: Qwen2_5_VLVideoInputs,
-        cudagraph_dispatcher: Any | None = None,
+        mm_cudagraph_manager: Any | None = None,
     ) -> tuple[torch.Tensor, ...]:
         grid_thw = video_input["video_grid_thw"]
         assert grid_thw.ndim == 2
@@ -1363,7 +1363,7 @@ def _process_video_input(
                         pixel_values_videos,
                         grid_thw_list,
                         rope_type="rope_3d",
-                        cudagraph_dispatcher=cudagraph_dispatcher,
+                        mm_cudagraph_manager=mm_cudagraph_manager,
                     )
                 else:
                     video_embeds = self.visual(
@@ -1513,7 +1513,7 @@ def _parse_and_validate_multimodal_inputs(self, **kwargs: object) -> dict:
         return mm_input_by_modality
 
     def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings:
-        cudagraph_dispatcher = kwargs.pop("cudagraph_dispatcher", None)
+        mm_cudagraph_manager = kwargs.pop("mm_cudagraph_manager", None)
         mm_input_by_modality = self._parse_and_validate_multimodal_inputs(**kwargs)
         if not mm_input_by_modality:
             return []
@@ -1528,7 +1528,7 @@ def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings:
             multimodal_input = mm_input_by_modality[modality]
             if modality == "image":
                 image_embeddings = self._process_image_input(
-                    multimodal_input, cudagraph_dispatcher=cudagraph_dispatcher
+                    multimodal_input, mm_cudagraph_manager=mm_cudagraph_manager
                 )
                 if self.is_multimodal_pruning_enabled:
                     image_embeddings = self._postprocess_image_embeds_evs(
@@ -1537,7 +1537,7 @@ def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings:
                 multimodal_embeddings += tuple(image_embeddings)
             if modality == "video":
                 video_embeddings = self._process_video_input(
-                    multimodal_input, cudagraph_dispatcher=cudagraph_dispatcher
+                    multimodal_input, mm_cudagraph_manager=mm_cudagraph_manager
                 )
                 if self.is_multimodal_pruning_enabled:
                     video_embeddings = self._postprocess_video_embeds_evs(
diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py
index c7c26c206726..94348a77b55a 100644
--- a/vllm/model_executor/models/qwen2_vl.py
+++ b/vllm/model_executor/models/qwen2_vl.py
@@ -1043,6 +1043,25 @@ def get_dummy_mm_data(
             ),
         }
 
+    def _calculate_patch_size(self, patches: int) -> tuple[int, int]:
+        vision_config = self.info.get_hf_config().vision_config
+        merge_size = vision_config.spatial_merge_size
+
+        assert patches % (merge_size * merge_size) == 0, (
+            f"Qwen2-VL: Number of patches ({patches}) must be multiple of "
+            f"merge_size squared ({merge_size}^2)"
+        )
+        h_patches = merge_size
+        w_patches = patches // merge_size
+        return h_patches, w_patches
+
+    def _get_img_feature_dim(self) -> int:
+        vision_config = self.info.get_hf_config().vision_config
+        in_channels = vision_config.in_channels
+        temporal_patch_size = vision_config.temporal_patch_size
+        patch_size = vision_config.patch_size
+        return in_channels * temporal_patch_size * patch_size * patch_size
+
 
 class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor[Qwen2VLProcessingInfo]):
     def _get_prompt_updates(
diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py
index e87d3701c31a..35d7986d13b6 100644
--- a/vllm/model_executor/models/qwen3_vl.py
+++ b/vllm/model_executor/models/qwen3_vl.py
@@ -1015,6 +1015,25 @@ def _get_dummy_videos(
             video_items.append(video_item)
         return video_items
 
+    def _calculate_patch_size(self, patches: int) -> tuple[int, int]:
+        vision_config = self.info.get_hf_config().vision_config
+        merge_size = vision_config.spatial_merge_size
+
+        assert patches % (merge_size * merge_size) == 0, (
+            f"Qwen3-VL: Number of patches ({patches}) must be multiple of "
+            f"merge_size squared ({merge_size}^2)"
+        )
+        h_patches = merge_size
+        w_patches = patches // merge_size
+        return h_patches, w_patches
+
+    def _get_img_feature_dim(self) -> int:
+        vision_config = self.info.get_hf_config().vision_config
+        in_channels = vision_config.in_channels
+        temporal_patch_size = vision_config.temporal_patch_size
+        patch_size = vision_config.patch_size
+        return in_channels * temporal_patch_size * patch_size * patch_size
+
 
 class Qwen3VLMultiModalProcessor(BaseMultiModalProcessor[Qwen3VLProcessingInfo]):
     def _call_hf_processor(
@@ -1517,7 +1536,7 @@ def _parse_and_validate_video_input(
     def _process_image_input(
         self,
         image_input: Qwen2_5_VLImageInputs,
-        cudagraph_dispatcher: Any | None = None,
+        mm_cudagraph_manager: Any | None = None,
     ) -> tuple[torch.Tensor, ...]:
         grid_thw = image_input["image_grid_thw"]
         assert grid_thw.ndim == 2
@@ -1538,7 +1557,7 @@ def _process_image_input(
                         pixel_values,
                         grid_thw_list,
                         rope_type="rope_3d",
-                        cudagraph_dispatcher=cudagraph_dispatcher,
+                        mm_cudagraph_manager=mm_cudagraph_manager,
                     )
                 else:
                     image_embeds = self.visual(pixel_values, grid_thw=grid_thw_list)
@@ -1551,7 +1570,7 @@ def _process_image_input(
     def _process_video_input(
         self,
         video_input: Qwen2_5_VLVideoInputs,
-        cudagraph_dispatcher: Any | None = None,
+        mm_cudagraph_manager: Any | None = None,
     ) -> tuple[torch.Tensor, ...]:
         grid_thw = video_input["video_grid_thw"]
         assert grid_thw.ndim == 2
@@ -1573,7 +1592,7 @@ def _process_video_input(
                         pixel_values_videos,
                         grid_thw_list,
                         rope_type="rope_3d",
-                        cudagraph_dispatcher=cudagraph_dispatcher,
+                        mm_cudagraph_manager=mm_cudagraph_manager,
                     )
                 else:
                     video_embeds = self.visual(
@@ -2022,7 +2041,7 @@ def get_mrope_input_positions(
         return torch.from_numpy(llm_positions), mrope_position_delta
 
     def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings | None:
-        cudagraph_dispatcher = kwargs.pop("cudagraph_dispatcher", None)
+        mm_cudagraph_manager = kwargs.pop("mm_cudagraph_manager", None)
         mm_input_by_modality = self._parse_and_validate_multimodal_inputs(**kwargs)
         if not mm_input_by_modality:
             return None
@@ -2037,7 +2056,7 @@ def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings | None:
             multimodal_input = mm_input_by_modality[modality]
             if modality == "image":
                 image_embeddings = self._process_image_input(
-                    multimodal_input, cudagraph_dispatcher=cudagraph_dispatcher
+                    multimodal_input, mm_cudagraph_manager=mm_cudagraph_manager
                 )
                 if self.is_multimodal_pruning_enabled:
                     image_embeddings = self._postprocess_image_embeds_evs(
@@ -2046,7 +2065,7 @@ def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings | None:
                 multimodal_embeddings += tuple(image_embeddings)
             if modality == "video":
                 video_embeddings = self._process_video_input(
-                    multimodal_input, cudagraph_dispatcher=cudagraph_dispatcher
+                    multimodal_input, mm_cudagraph_manager=mm_cudagraph_manager
                 )
                 if self.is_multimodal_pruning_enabled:
                     video_embeddings = self._postprocess_video_embeds_evs(
diff --git a/vllm/model_executor/models/vision.py b/vllm/model_executor/models/vision.py
index 67a50b0da054..538f1c98d64e 100644
--- a/vllm/model_executor/models/vision.py
+++ b/vllm/model_executor/models/vision.py
@@ -25,7 +25,7 @@
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
 from vllm.v1.attention.backends.registry import AttentionBackendEnum
-from vllm.v1.cudagraph_dispatcher import CudagraphDispatcher
+from vllm.v1.worker.mm_cudagraph import MMEncoderCudagraphManager
 
 logger = init_logger(__name__)
 
@@ -394,7 +394,7 @@ def run_dp_sharded_mrope_vision_model(
     grid_thw_list: list[list[int]],
     *,
     rope_type: Literal["rope_3d", "rope_2d"],
-    cudagraph_dispatcher: CudagraphDispatcher | None = None,
+    mm_cudagraph_manager: MMEncoderCudagraphManager | None = None,
 ) -> tuple[torch.Tensor, ...]:
     """Run a vision model with data parallelism (DP) sharding.
     The function will shard the input image tensor on the
@@ -470,12 +470,10 @@ def run_dp_sharded_mrope_vision_model(
         embed_dim_reduction_factor = (
             vision_model.merge_kernel_size[0] * vision_model.merge_kernel_size[1]
         )
-        merge_size = vision_model.merge_kernel_size[0]
     else:
         embed_dim_reduction_factor = (
             vision_model.spatial_merge_size * vision_model.spatial_merge_size
         )
-        merge_size = vision_model.spatial_merge_size
 
     # Find the max length across all ranks
     # The output embedding of every DP rank has to be
@@ -484,40 +482,24 @@ def run_dp_sharded_mrope_vision_model(
     max_len_per_rank = max(grouped_pixel_values_len) // embed_dim_reduction_factor
     local_grid_thw_list = [grid_thw_list[i] for i in image_idxs_local]
 
-    vllm_config = get_current_vllm_config()
-
     # Context setup
-    if cudagraph_dispatcher is not None:
-        dispatcher = cudagraph_dispatcher
-    else:
-        dispatcher = CudagraphDispatcher(vllm_config)
+    vllm_config = get_current_vllm_config()
     cudagraph_runtime_mode = CUDAGraphMode.NONE
     batch_descriptor = None
 
-    if (
-        vllm_config
-        and vllm_config.compilation_config.mm_encoder_cudagraph_capture_sizes
-    ):
-        current_input_len = pixel_values_local.shape[0]
-        cudagraph_runtime_mode, batch_descriptor = dispatcher.dispatch(
-            num_tokens=current_input_len,
-            uniform_decode=False,
-            has_lora=False,
-            disable_full=False,
-            is_mm_encoder=True,
-        )
-        target_input_len = batch_descriptor.num_tokens
-
-        # Pad pixel_values_local for CUDA graph if needed
-        if current_input_len < target_input_len:
-            padding_size = target_input_len - current_input_len
-            padding = torch.zeros(
-                (padding_size, pixel_values_local.shape[1]),
-                device=pixel_values_local.device,
-                dtype=pixel_values_local.dtype,
-            )
-            pixel_values_local = torch.cat([pixel_values_local, padding], dim=0)
-            local_grid_thw_list.append([1, merge_size, padding_size // merge_size])
+    if mm_cudagraph_manager is not None:
+        mm_groups: dict[str, torch.Tensor | list] = {
+            "pixel_values": pixel_values_local,
+            "image_grid_thw": local_grid_thw_list,
+        }
+        (
+            cudagraph_runtime_mode,
+            batch_descriptor,
+            _,
+            mm_groups,
+        ) = mm_cudagraph_manager.dispatch_and_pad_mm_input(mm_groups)
+        pixel_values_local = mm_groups["pixel_values"]
+        local_grid_thw_list = mm_groups["image_grid_thw"]
 
     with set_forward_context(
         None,
diff --git a/vllm/multimodal/processing/dummy_inputs.py b/vllm/multimodal/processing/dummy_inputs.py
index b23e2b86cc20..9eb1020db681 100644
--- a/vllm/multimodal/processing/dummy_inputs.py
+++ b/vllm/multimodal/processing/dummy_inputs.py
@@ -7,6 +7,7 @@
 
 import numpy as np
 import numpy.typing as npt
+import torch
 from PIL import Image
 
 from vllm.config.multimodal import (
@@ -199,3 +200,52 @@ def _get_dummy_videos(
                 height = min(height, overrides.height)
         video = np.full((num_frames, width, height, 3), 255, dtype=np.uint8)
         return [video] * num_videos
+
+    @abstractmethod
+    def _get_img_feature_dim(self) -> int:
+        """
+        Get the image feature dimension for MM encoder CUDA graph capture.
+
+        Returns:
+            The image feature dimension.
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def _calculate_patch_size(self, patches: int) -> tuple[int, int]:
+        """
+        Calculate the patch grid size (height, width) from the total number of
+        patches.
+        """
+        raise NotImplementedError
+
+    def get_dummy_mm_encoder_input(
+        self,
+        num_patches: int,
+    ) -> "dict[str, torch.Tensor]":
+        """
+        Get dummy MM encoder input for CUDA graph capture or padding.
+
+        Args:
+            num_patches: Number of patches (tokens) for the dummy input
+
+        Returns:
+            dict with pixel_values and image_grid_thw
+        """
+        img_feature_dim = self._get_img_feature_dim()
+
+        dtype = self.info.ctx.model_config.dtype
+
+        h_patches, w_patches = self._calculate_patch_size(num_patches)
+
+        pixel_values = torch.zeros(
+            (num_patches, img_feature_dim), dtype=dtype, device="cuda"
+        )
+        grid_thw_list = torch.tensor(
+            [[1, h_patches, w_patches]], dtype=torch.long, device="cpu"
+        )
+
+        return {
+            "pixel_values": pixel_values,
+            "image_grid_thw": grid_thw_list,
+        }
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index a8d7d74ea127..36f4e7f5bdcc 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -171,6 +171,7 @@
 from vllm.v1.worker.gpu_ubatch_wrapper import UBatchWrapper
 from vllm.v1.worker.kv_connector_model_runner_mixin import KVConnectorModelRunnerMixin
 from vllm.v1.worker.lora_model_runner_mixin import LoRAModelRunnerMixin
+from vllm.v1.worker.mm_cudagraph import MMEncoderCudagraphManager
 from vllm.v1.worker.ubatch_utils import (
     UBatchSlices,
     check_ubatch_thresholds,
@@ -553,15 +554,6 @@ def __init__(
             self.cudagraph_batch_sizes = sorted(
                 self.compilation_config.cudagraph_capture_sizes
             )
-        # self.mm_encoder_cudagraph_batch_sizes sorts in ascending order.
-        self.mm_encoder_cudagraph_batch_sizes: list[int] | None = None
-        if (
-            self.compilation_config.mm_encoder_cudagraph_capture_sizes
-            and self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE
-        ):
-            self.mm_encoder_cudagraph_batch_sizes = sorted(
-                self.compilation_config.mm_encoder_cudagraph_capture_sizes
-            )
 
         # Cache the device properties.
         self._init_device_properties()
@@ -660,6 +652,18 @@ def __init__(
         # Cudagraph dispatcher for runtime cudagraph dispatching.
         self.cudagraph_dispatcher = CudagraphDispatcher(self.vllm_config)
 
+        # MM encoder CUDA graph manager for ViT piecewise CUDA graph.
+        self.mm_cudagraph_manager: MMEncoderCudagraphManager | None = None
+        if self.supports_mm_inputs:
+            processor = self.mm_registry.create_processor(self.model_config)
+            dummy_inputs_builder = processor.dummy_inputs
+            self.mm_cudagraph_manager = MMEncoderCudagraphManager(
+                self.vllm_config,
+                self.cudagraph_dispatcher,
+                self.device,
+                dummy_inputs_builder,
+            )
+
         self.mm_budget = (
             MultiModalBudget(self.vllm_config, self.mm_registry)
             if self.supports_mm_inputs
@@ -2430,72 +2434,26 @@ def _execute_mm_encoder(
                 # 2. A list or tuple (length: num_items) of tensors,
                 # each of shape (feature_size, hidden_size) in case the feature
                 # size is dynamic depending on the input multimodal items.
-                is_vit_dp_mode = (
-                    getattr(
-                        self.model_config.multimodal_config, "mm_encoder_tp_mode", None
-                    )
-                    == "data"
-                    and self.parallel_config.tensor_parallel_size > 1
-                )
+                mm_mgr = self.mm_cudagraph_manager
+                is_vit_dp_mode = mm_mgr.is_vit_dp_mode if mm_mgr else False
+
                 if not is_vit_dp_mode:
                     original_num_imgs = -1
-                    padded_num_tokens = -1
 
                     # Default values for non-mm_encoder cudagraph case
                     cudagraph_runtime_mode = CUDAGraphMode.NONE
                     batch_descriptor = None
                     if (
-                        self.mm_encoder_cudagraph_batch_sizes
+                        mm_mgr is not None
+                        and mm_mgr.enabled
                         and "pixel_values" in mm_kwargs_group
                     ):
-                        pixel_values = cast(
-                            torch.Tensor, mm_kwargs_group["pixel_values"]
-                        )
-                        num_tokens = pixel_values.shape[0]
-
-                        # get batch_descriptor from dispatcher
-                        cudagraph_runtime_mode, batch_descriptor = (
-                            self.cudagraph_dispatcher.dispatch(
-                                num_tokens=num_tokens,
-                                uniform_decode=False,
-                                has_lora=False,
-                                disable_full=False,
-                                is_mm_encoder=True,
-                            )
-                        )
-                        padded_num_tokens = batch_descriptor.num_tokens
-
-                        if padded_num_tokens > num_tokens:
-                            padding_amount = padded_num_tokens - num_tokens
-                            padding_tensor = torch.zeros(
-                                (padding_amount, pixel_values.shape[1]),
-                                dtype=pixel_values.dtype,
-                                device=pixel_values.device,
-                            )
-                            mm_kwargs_group["pixel_values"] = torch.cat(
-                                [pixel_values, padding_tensor], dim=0
-                            )
-
-                            # Update image_grid_thw to account for padding
-                            if "image_grid_thw" in mm_kwargs_group:
-                                image_grid_thw = cast(
-                                    torch.Tensor, mm_kwargs_group["image_grid_thw"]
-                                )
-                                original_num_imgs = image_grid_thw.shape[0]
-
-                                # Treat padding as a new virtual image.
-                                # Assuming a fixed patch size where height = merge_size
-                                h_patches, w_patches = self._get_dummy_h_w_patches(
-                                    padding_amount
-                                )
-                                padding_grid_info = torch.tensor(
-                                    [[1, h_patches, w_patches]],
-                                    dtype=image_grid_thw.dtype,
-                                    device=image_grid_thw.device,
-                                )
-                                mm_kwargs_group["image_grid_thw"] = torch.cat(
-                                    [image_grid_thw, padding_grid_info], dim=0
-                                )
+                        (
+                            cudagraph_runtime_mode,
+                            batch_descriptor,
+                            original_num_imgs,
+                            mm_kwargs_group,
+                        ) = mm_mgr.dispatch_and_pad_mm_input(mm_kwargs_group)
 
                     with (
                         set_forward_context(
@@ -2519,9 +2477,7 @@ def _execute_mm_encoder(
                             should_time, mm_lora_refs, current_item_idx, num_items
                         ),
                     ):
-                        mm_kwargs_group["cudagraph_dispatcher"] = (
-                            self.cudagraph_dispatcher
-                        )
+                        mm_kwargs_group["mm_cudagraph_manager"] = mm_mgr
                         curr_group_outputs = model.embed_multimodal(**mm_kwargs_group)
             sanity_check_mm_encoder_outputs(
                 curr_group_outputs,
@@ -4666,43 +4622,6 @@ def rand_inputs_embeds() -> torch.Tensor:
             yield
             inputs_embeds.fill_(0)
 
-    def _get_dummy_h_w_patches(self, patches: int):
-        vision_config = self.model_config.hf_config.vision_config
-        if hasattr(vision_config, "spatial_merge_size"):
-            merge_size = vision_config.spatial_merge_size
-        elif hasattr(vision_config, "merge_kernel_size"):
-            merge_size = vision_config.merge_kernel_size[0]
-        else:
-            merge_size = 1
-
-        assert patches % (merge_size * merge_size) == 0, (
-            "Number of patches must be multiple of merge_size squared"
-        )
-        h_patches = merge_size
-        w_patches = patches // merge_size
-        return h_patches, w_patches
-
-    def _get_dummy_mm_encoder_input(
-        self, num_image_tokens: int, img_feature_dim: int
-    ) -> BatchedTensorInputs:
-        """Dummy data for profiling and precompiling mm_encoder."""
-
-        # The first dimension of pixel_values corresponds
-        # to the total number of patches.
-        pixel_values = torch.zeros(
-            (num_image_tokens, img_feature_dim), dtype=self.dtype, device=self.device
-        )
-
-        h_patches, w_patches = self._get_dummy_h_w_patches(num_image_tokens)
-        image_grid_thw = torch.tensor(
-            [[1, h_patches, w_patches]], dtype=torch.long, device=self.device
-        )
-
-        return {
-            "pixel_values": pixel_values,
-            "image_grid_thw": image_grid_thw,
-        }
-
     def _get_mm_dummy_batch(
         self,
         modality: str,
@@ -5231,50 +5150,6 @@ def _dummy_pooler_run(
         max_task = max(output_size.items(), key=lambda x: x[1])[0]
         return self._dummy_pooler_run_task(hidden_states, max_task)
 
-    @torch.inference_mode()
-    def _dummy_mm_encoder_run(
-        self,
-        compilation_cases: list[int],
-    ) -> None:
-        self.vllm_config.in_mm_encoder_tracing = True
-        tmp_dummy_mm_inputs = self._get_mm_dummy_batch(
-            "video",
-            1,
-        )
-        img_feature_dim = cast(
-            torch.Tensor, tmp_dummy_mm_inputs["pixel_values_videos"]
-        ).shape[1]
-
-        if is_global_first_rank():
-            compilation_cases = tqdm(
-                compilation_cases,
-                disable=not self.load_config.use_tqdm_on_load,
-                desc="Capturing mm_encoder CUDA graphs (PIECEWISE)",
-            )
-
-        for capture_size in compilation_cases:
-            dummy_mm_inputs = self._get_dummy_mm_encoder_input(
-                capture_size, img_feature_dim
-            )
-            cudagraph_mode, batch_descriptor = self.cudagraph_dispatcher.dispatch(
-                num_tokens=capture_size,
-                uniform_decode=False,
-                has_lora=False,
-                disable_full=False,
-                is_mm_encoder=True,
-            )
-            with (
-                set_forward_context(
-                    None,
-                    self.vllm_config,
-                    num_tokens=capture_size,
-                    cudagraph_runtime_mode=cudagraph_mode,
-                    batch_descriptor=batch_descriptor,
-                ),
-            ):
-                self.model.embed_multimodal(**dummy_mm_inputs)
-        self.vllm_config.in_mm_encoder_tracing = False
-
     def profile_run(self) -> None:
         self.vllm_config.in_mm_encoder_tracing = True
         # Profile with multimodal encoder & encoder cache.
@@ -5386,6 +5261,14 @@ def freeze_gc():
                     batch_descriptors=batch_descs,
                     cudagraph_runtime_mode=runtime_mode,
                 )
+            # Capture MM encoder CUDA graphs if enabled
+            if self.mm_cudagraph_manager is not None:
+                for runtime_mode, _ in self.cudagraph_dispatcher.get_capture_descs(
+                    is_mm_encoder=True
+                ):
+                    self.mm_cudagraph_manager.capture(
+                        model=self.model, cudagraph_mode=runtime_mode
+                    )
 
             torch.cuda.synchronize()
             end_free_gpu_memory = torch.cuda.mem_get_info()[0]
@@ -5489,15 +5372,6 @@ def _capture_cudagraphs(
                 num_active_loras=num_active_loras,
                 is_graph_capturing=True,
             )
-        if (
-            cudagraph_runtime_mode == CUDAGraphMode.PIECEWISE
-            and self.supports_mm_inputs
-        ):
-            mm_encoder_capture_sizes = self.mm_encoder_cudagraph_batch_sizes
-            if mm_encoder_capture_sizes:
-                compilation_cases_mm_encoder = list(reversed(mm_encoder_capture_sizes))
-                self._dummy_mm_encoder_run(compilation_cases_mm_encoder)
-
         self.maybe_remove_all_loras(self.lora_config)
 
     def initialize_attn_backend(self, kv_cache_config: KVCacheConfig) -> None:
diff --git a/vllm/v1/worker/mm_cudagraph.py b/vllm/v1/worker/mm_cudagraph.py
new file mode 100644
index 000000000000..fd572c40c46c
--- /dev/null
+++ b/vllm/v1/worker/mm_cudagraph.py
@@ -0,0 +1,173 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from typing import Any, cast
+
+import torch
+import torch.nn as nn
+from tqdm import tqdm
+
+from vllm.config import CUDAGraphMode, VllmConfig
+from vllm.distributed.parallel_state import is_global_first_rank
+from vllm.forward_context import (
+    BatchDescriptor,
+    set_forward_context,
+)
+from vllm.logger import init_logger
+from vllm.multimodal import BatchedTensorInputs
+from vllm.multimodal.processing import BaseDummyInputsBuilder
+from vllm.v1.cudagraph_dispatcher import CudagraphDispatcher
+
+logger = init_logger(__name__)
+
+
+class MMEncoderCudagraphManager:
+    def __init__(
+        self,
+        vllm_config: VllmConfig,
+        cudagraph_dispatcher: CudagraphDispatcher,
+        device: torch.device,
+        dummy_input_builder: BaseDummyInputsBuilder[Any],
+    ):
+        self.vllm_config = vllm_config
+        self.dispatcher = cudagraph_dispatcher
+        self.device = device
+        self.dummy_input_builder = dummy_input_builder
+
+        compilation_config = vllm_config.compilation_config
+        self.capture_sizes: list[int] = []
+        if compilation_config and compilation_config.mm_encoder_cudagraph_capture_sizes:
+            self.capture_sizes = sorted(
+                compilation_config.mm_encoder_cudagraph_capture_sizes
+            )
+
+        self.enabled = bool(
+            self.capture_sizes
+            and compilation_config
+            and compilation_config.cudagraph_mode != CUDAGraphMode.NONE
+        )
+
+        # Check if using data parallel mode for ViT
+        self.is_vit_dp_mode = self._check_vit_dp_mode(vllm_config)
+
+    def _check_vit_dp_mode(self, vllm_config: VllmConfig) -> bool:
+        """Check if ViT is running in data parallel mode."""
+        mm_config = getattr(vllm_config.model_config, "multimodal_config", None)
+        if mm_config is None:
+            return False
+
+        mm_encoder_tp_mode = mm_config.mm_encoder_tp_mode
+        tp_size = vllm_config.parallel_config.tensor_parallel_size
+
+        return mm_encoder_tp_mode == "data" and tp_size > 1
+
+    def dispatch_and_pad_mm_input(
+        self,
+        mm_kwargs_group: BatchedTensorInputs,
+    ) -> tuple[CUDAGraphMode, BatchDescriptor | None, int, BatchedTensorInputs]:
+        pixel_values = cast(torch.Tensor, mm_kwargs_group["pixel_values"])
+        num_tokens = pixel_values.shape[0]
+
+        image_grid_thw = mm_kwargs_group["image_grid_thw"]
+        if isinstance(image_grid_thw, torch.Tensor):
+            original_num_imgs = image_grid_thw.shape[0]
+        else:
+            original_num_imgs = len(image_grid_thw)
+
+        if not self.enabled:
+            return (
+                CUDAGraphMode.NONE,
+                BatchDescriptor(num_tokens, is_mm_encoder=True),
+                original_num_imgs,
+                mm_kwargs_group,
+            )
+
+        # Dispatch to get the target padded size
+        cudagraph_runtime_mode, batch_descriptor = self.dispatcher.dispatch(
+            num_tokens=num_tokens,
+            is_mm_encoder=True,
+        )
+        target_num_tokens = batch_descriptor.num_tokens
+
+        # Pad if necessary
+        if target_num_tokens > num_tokens:
+            # Pad pixel_values
+            padding_size = target_num_tokens - num_tokens
+            padding_mm_inputs = self.dummy_input_builder.get_dummy_mm_encoder_input(
+                padding_size,
+            )
+
+            mm_kwargs_group["pixel_values"] = torch.cat(
+                [pixel_values, padding_mm_inputs["pixel_values"]], dim=0
+            )
+
+            padding_image_grid_thw = padding_mm_inputs["image_grid_thw"]
+            if isinstance(image_grid_thw, torch.Tensor):
+                mm_kwargs_group["image_grid_thw"] = torch.cat(
+                    [image_grid_thw, padding_image_grid_thw], dim=0
+                )
+            else:
+                mm_kwargs_group["image_grid_thw"] = (
+                    image_grid_thw + padding_image_grid_thw.tolist()
+                )
+
+        return (
+            cudagraph_runtime_mode,
+            batch_descriptor,
+            original_num_imgs,
+            mm_kwargs_group,
+        )
+
+    def capture_graph(
+        self,
+        num_tokens: int,
+        model: nn.Module,
+        cudagraph_mode: CUDAGraphMode,
+    ) -> None:
+        dummy_mm_inputs = self.dummy_input_builder.get_dummy_mm_encoder_input(
+            num_tokens
+        )
+
+        batch_descriptor = BatchDescriptor(
+            num_tokens=num_tokens,
+            is_mm_encoder=True,
+        )
+
+        with set_forward_context(
+            None,
+            self.vllm_config,
+            num_tokens=num_tokens,
+            cudagraph_runtime_mode=cudagraph_mode,
+            batch_descriptor=batch_descriptor,
+        ):
+            model.embed_multimodal(**dummy_mm_inputs)
+
+    @torch.inference_mode()
+    def capture(
+        self,
+        model: nn.Module,
+        cudagraph_mode: CUDAGraphMode,
+    ) -> None:
+        if not self.enabled or not self.capture_sizes:
+            return
+
+        self.vllm_config.in_mm_encoder_tracing = True
+
+        capture_sizes_desc = list(reversed(self.capture_sizes))
+
+        if is_global_first_rank():
+            capture_sizes_iter: Any = tqdm(
+                capture_sizes_desc,
+                disable=not self.vllm_config.load_config.use_tqdm_on_load,
+                desc="Capturing MM_Encoder CUDA graphs (PIECEWISE)",
+            )
+        else:
+            capture_sizes_iter = capture_sizes_desc
+
+        for capture_size in capture_sizes_iter:
+            self.capture_graph(
+                capture_size,
+                model=model,
+                cudagraph_mode=cudagraph_mode,
+            )
+
+        self.vllm_config.in_mm_encoder_tracing = False

From 07316832308f02f7e10df42977ada75a72456c6d Mon Sep 17 00:00:00 2001
From: Hongjian Zhang <hirokenovo@gmail.com>
Date: Tue, 3 Feb 2026 15:39:12 +0800
Subject: [PATCH 31/35] simplify cuda graph conditional judgments

Signed-off-by: Hongjian Zhang <hirokenovo@gmail.com>
---
 vllm/model_executor/models/qwen2_5_vl.py | 56 +++++++++++------------
 vllm/model_executor/models/qwen3_vl.py   | 57 ++++++++++++------------
 vllm/v1/worker/gpu_model_runner.py       |  8 +---
 3 files changed, 57 insertions(+), 64 deletions(-)

diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py
index 270346801437..d6a352392e9c 100644
--- a/vllm/model_executor/models/qwen2_5_vl.py
+++ b/vllm/model_executor/models/qwen2_5_vl.py
@@ -662,16 +662,17 @@ def __init__(
                 device=self.device,
                 dtype=self.dtype,
             )
-            self._persistent_rotary_pos_emb_cos_buffer = torch.empty(
-                (max_compile_size, head_dim // 2),
-                device=self.device,
-                dtype=torch.bfloat16,
-            )
-            self._persistent_rotary_pos_emb_sin_buffer = torch.empty(
-                (max_compile_size, head_dim // 2),
-                device=self.device,
-                dtype=torch.bfloat16,
-            )
+            (
+                self._persistent_rotary_pos_emb_cos_buffer,
+                self._persistent_rotary_pos_emb_sin_buffer,
+            ) = [
+                torch.empty(
+                    (max_compile_size, head_dim // 2),
+                    device=self.device,
+                    dtype=torch.bfloat16,
+                )
+                for _ in range(2)
+            ]
 
     @property
     def dtype(self) -> torch.dtype:
@@ -803,6 +804,17 @@ def invert_permutation(perm: torch.Tensor) -> torch.Tensor:
         inv[perm] = torch.arange(perm.numel(), device=perm.device, dtype=perm.dtype)
         return inv
 
+    def _use_piecewise_cudagraph(self) -> bool:
+        if self._persistent_hidden_states_buffer is None:
+            return False
+        if not is_forward_context_available():
+            return False
+        fwd_ctx = get_forward_context()
+        return (
+            fwd_ctx is not None
+            and fwd_ctx.cudagraph_runtime_mode == CUDAGraphMode.PIECEWISE
+        )
+
     def forward(
         self,
         x: torch.Tensor,
@@ -816,14 +828,9 @@ def forward(
         cu_window_seqlens: list = [torch.tensor([0], dtype=torch.int32)]
         cu_seqlens: list = []
 
-        fwd_ctx = None
-        if is_forward_context_available():
-            fwd_ctx = get_forward_context()
-        if (
-            self._persistent_hidden_states_buffer is not None
-            and fwd_ctx
-            and fwd_ctx.cudagraph_runtime_mode == CUDAGraphMode.PIECEWISE
-        ):
+        is_cudagraph_mode = self._use_piecewise_cudagraph()
+
+        if is_cudagraph_mode:
             hidden_states = self._persistent_hidden_states_buffer[:seq_len]
             hidden_states.copy_(x, non_blocking=True)
         else:
@@ -886,12 +893,7 @@ def forward(
         rotary_pos_emb_sin = rotary_pos_emb_sin.to(
             device=self.device, non_blocking=True
         )
-        if (
-            self._persistent_rotary_pos_emb_sin_buffer is not None
-            and self._persistent_rotary_pos_emb_cos_buffer is not None
-            and fwd_ctx
-            and fwd_ctx.cudagraph_runtime_mode == CUDAGraphMode.PIECEWISE
-        ):
+        if is_cudagraph_mode:
             rotary_pos_emb_sin = self._persistent_rotary_pos_emb_sin_buffer[
                 :seq_len
             ].copy_(rotary_pos_emb_sin)
@@ -911,11 +913,7 @@ def forward(
         hidden_states = hidden_states.reshape(seq_len, -1)
         hidden_states = hidden_states.unsqueeze(1)
 
-        if (
-            self._persistent_hidden_states_buffer is not None
-            and fwd_ctx
-            and fwd_ctx.cudagraph_runtime_mode == CUDAGraphMode.PIECEWISE
-        ):
+        if is_cudagraph_mode:
             # The above operations will produce temporary new tensors.
             # That is not friendly to cudagraphs,
             # so we need to copy them back to the persistent buffer
diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py
index 35d7986d13b6..9e6001f474b0 100644
--- a/vllm/model_executor/models/qwen3_vl.py
+++ b/vllm/model_executor/models/qwen3_vl.py
@@ -448,16 +448,17 @@ def __init__(
                 device=self.device,
                 dtype=self.dtype,
             )
-            self._persistent_rotary_pos_emb_cos_buffer = torch.empty(
-                (max_compile_size, head_dim // 2),
-                device=self.device,
-                dtype=torch.bfloat16,
-            )
-            self._persistent_rotary_pos_emb_sin_buffer = torch.empty(
-                (max_compile_size, head_dim // 2),
-                device=self.device,
-                dtype=torch.bfloat16,
-            )
+            (
+                self._persistent_rotary_pos_emb_cos_buffer,
+                self._persistent_rotary_pos_emb_sin_buffer,
+            ) = [
+                torch.empty(
+                    (max_compile_size, head_dim // 2),
+                    device=self.device,
+                    dtype=torch.bfloat16,
+                )
+                for _ in range(2)
+            ]
 
     @property
     def dtype(self) -> torch.dtype:
@@ -584,20 +585,26 @@ def compute_attn_mask_seqlen(
             max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max()
         return max_seqlen
 
+    def _use_piecewise_cudagraph(self) -> bool:
+        if self._persistent_hidden_states_buffer is None:
+            return False
+        if not is_forward_context_available():
+            return False
+        fwd_ctx = get_forward_context()
+        return (
+            fwd_ctx is not None
+            and fwd_ctx.cudagraph_runtime_mode == CUDAGraphMode.PIECEWISE
+        )
+
     def forward(
         self,
         x: torch.Tensor,
         grid_thw: torch.Tensor | list[list[int]],
     ) -> torch.Tensor:
         seq_len, _ = x.size()
-        fwd_ctx = None
-        if is_forward_context_available():
-            fwd_ctx = get_forward_context()
-        if (
-            self._persistent_hidden_states_buffer is not None
-            and fwd_ctx
-            and fwd_ctx.cudagraph_runtime_mode == CUDAGraphMode.PIECEWISE
-        ):
+        is_cudagraph_mode = self._use_piecewise_cudagraph()
+
+        if is_cudagraph_mode:
             hidden_states = self._persistent_hidden_states_buffer[:seq_len]
             hidden_states.copy_(x, non_blocking=True)
         else:
@@ -622,12 +629,8 @@ def forward(
         original_hidden_states = hidden_states
         hidden_states = hidden_states + pos_embeds
         rotary_pos_emb_cos, rotary_pos_emb_sin = self.rot_pos_emb(grid_thw_list)
-        if (
-            self._persistent_rotary_pos_emb_sin_buffer is not None
-            and self._persistent_rotary_pos_emb_cos_buffer is not None
-            and fwd_ctx
-            and fwd_ctx.cudagraph_runtime_mode == CUDAGraphMode.PIECEWISE
-        ):
+
+        if is_cudagraph_mode:
             rotary_pos_emb_sin = self._persistent_rotary_pos_emb_sin_buffer[
                 :seq_len
             ].copy_(rotary_pos_emb_sin)
@@ -645,11 +648,7 @@ def forward(
         max_seqlen = self.compute_attn_mask_seqlen(cu_seqlens)
         cu_seqlens = cu_seqlens.to(self.device, non_blocking=True)
 
-        if (
-            self._persistent_hidden_states_buffer is not None
-            and fwd_ctx
-            and fwd_ctx.cudagraph_runtime_mode == CUDAGraphMode.PIECEWISE
-        ):
+        if is_cudagraph_mode:
             # The above operations will produce temporary new tensors.
             # That is not friendly to cudagraphs,
             # so we need to copy them back to the persistent buffer
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 36f4e7f5bdcc..f1b1438c81b6 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -29,7 +29,6 @@
     CUDAGraphMode,
     VllmConfig,
     get_layers_from_vllm_config,
-    set_current_vllm_config,
     update_config,
 )
 from vllm.distributed.ec_transfer import get_ec_transfer, has_ec_transfer
@@ -2471,11 +2470,8 @@ def _execute_mm_encoder(
                     if original_num_imgs != -1:
                         curr_group_outputs = curr_group_outputs[:original_num_imgs]
                 else:
-                    with (
-                        set_current_vllm_config(self.vllm_config),
-                        self.timed_encoder_operation(
-                            should_time, mm_lora_refs, current_item_idx, num_items
-                        ),
+                    with self.timed_encoder_operation(
+                        should_time, mm_lora_refs, current_item_idx, num_items
                     ):
                         mm_kwargs_group["mm_cudagraph_manager"] = mm_mgr
                         curr_group_outputs = model.embed_multimodal(**mm_kwargs_group)

From 53814ecac0d451d8f04747f1c4dafb86e5c89925 Mon Sep 17 00:00:00 2001
From: Hongjian Zhang <hirokenovo@gmail.com>
Date: Tue, 3 Feb 2026 16:54:22 +0800
Subject: [PATCH 32/35] rebase

Signed-off-by: Hongjian Zhang <hirokenovo@gmail.com>
---
 vllm/config/vllm.py             | 16 ++++------------
 vllm/v1/cudagraph_dispatcher.py | 10 +++++++---
 2 files changed, 11 insertions(+), 15 deletions(-)

diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py
index 8a05b2533089..137ab56b65b9 100644
--- a/vllm/config/vllm.py
+++ b/vllm/config/vllm.py
@@ -365,13 +365,6 @@ def compute_hash(self) -> str:
         ]
         return hash_str
 
-    def pad_for_cudagraph(self, batch_size: int) -> int:
-        # if batch_size > self.compilation_config.max_cudagraph_capture_size,
-        # it should raise an IndexError.
-        # the caller should make sure the batch_size is within the range,
-        # i.e., batch_size <= self.compilation_config.max_cudagraph_capture_size
-        return self.compilation_config.bs_to_padded_graph_size[batch_size]
-
     @property
     def needs_dp_coordinator(self) -> bool:
         """
@@ -1397,12 +1390,11 @@ def _set_mm_encoder_cudagraph_sizes(self):
             )
             if max_mm_encoder_cudagraph_capture_size is None:
                 from vllm.multimodal import MULTIMODAL_REGISTRY
-                from vllm.v1.core.encoder_cache_manager import compute_encoder_budget
+                from vllm.multimodal.budget import MultiModalBudget
 
-                encoder_compute_budget, _ = compute_encoder_budget(
-                    model_config=self.model_config,
-                    scheduler_config=self.scheduler_config,
-                    mm_registry=MULTIMODAL_REGISTRY,
+                mm_budget = MultiModalBudget(self, MULTIMODAL_REGISTRY)
+                encoder_compute_budget = (
+                    mm_budget.encoder_compute_budget if mm_budget else 0
                 )
                 max_mm_encoder_cudagraph_capture_size = min(
                     encoder_compute_budget, 8192
diff --git a/vllm/v1/cudagraph_dispatcher.py b/vllm/v1/cudagraph_dispatcher.py
index 1dfe1d07f1d4..af9c90f3b016 100644
--- a/vllm/v1/cudagraph_dispatcher.py
+++ b/vllm/v1/cudagraph_dispatcher.py
@@ -307,9 +307,13 @@ def dispatch(
                 # so we must use max_loras + 1 for dispatch to find a matching graph.
                 effective_num_active_loras = self.vllm_config.lora_config.max_loras + 1
 
-        batch_desc = self._create_padded_batch_descriptor(
-            num_tokens, uniform_decode, has_lora, effective_num_active_loras, is_mm_encoder
-        )
+                batch_desc = self._create_padded_batch_descriptor(
+                    num_tokens,
+                    uniform_decode,
+                    has_lora,
+                    effective_num_active_loras,
+                    is_mm_encoder,
+                )
         relaxed_batch_desc = batch_desc.relax_for_mixed_batch_cudagraphs()
 
         if not disable_full:

From ae2e8e62cea0f6c030f92234cb863acdc48658cb Mon Sep 17 00:00:00 2001
From: Hongjian Zhang <hirokenovo@gmail.com>
Date: Wed, 4 Feb 2026 17:16:49 +0800
Subject: [PATCH 33/35] add a dedicated dispatcher for mm encoder

Signed-off-by: Hongjian Zhang <hirokenovo@gmail.com>
---
 .../piecewise/test_qwenvl_vit_cudagraph.py    | 161 +++++++++---------
 vllm/forward_context.py                       |   5 -
 vllm/model_executor/models/vision.py          |  20 +--
 vllm/v1/cudagraph_dispatcher.py               | 121 ++++---------
 vllm/v1/worker/gpu_model_runner.py            |  23 +--
 vllm/v1/worker/mm_cudagraph.py                |  13 +-
 6 files changed, 140 insertions(+), 203 deletions(-)

diff --git a/tests/compile/piecewise/test_qwenvl_vit_cudagraph.py b/tests/compile/piecewise/test_qwenvl_vit_cudagraph.py
index ec3176885409..82cb10394720 100644
--- a/tests/compile/piecewise/test_qwenvl_vit_cudagraph.py
+++ b/tests/compile/piecewise/test_qwenvl_vit_cudagraph.py
@@ -11,55 +11,9 @@
 from vllm.config import CompilationConfig, CUDAGraphMode
 from vllm.distributed import cleanup_dist_env_and_memory
 from vllm.forward_context import set_forward_context
-from vllm.v1.cudagraph_dispatcher import CudagraphDispatcher
+from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.v1.executor.multiproc_executor import MultiprocExecutor
-
-
-def _worker_embed_multimodal(
-    worker, vllm_config, cudagraph_runtime_mode, batch_descriptor, multi_modal_data
-):
-    """Helper function to run multimodal embedding on a worker.
-    This function sets up the necessary forward context for tensor-parallel (TP)
-    execution and then calls the model's `embed_multimodal` method.
-    Note: For data-parallel (DP) mode, the forward context is typically
-          created and managed within the
-          vision.py:run_dp_sharded_mrope_vision_model(), which would override the
-          context set here.
-    Args:
-        worker: The worker instance containing the model runner.
-        vllm_config: The vLLM engine configuration.
-        cudagraph_runtime_mode: The runtime mode for CUDA graph execution.
-        batch_descriptor: An object describing the current batch.
-        multi_modal_data: A dictionary of keyword arguments to be passed to
-            the model's `embed_multimodal` method.
-    Returns:
-        The output from the model's `embed_multimodal` method.
-    """
-
-    # Access model via worker.model_runner.model
-    # Note: Accessing internal attributes. Assuming V1 worker structure.
-    model = worker.model_runner.model
-
-    # Move multi_modal_data to the model's device
-    target_device = next(model.parameters()).device
-    multi_modal_data = {
-        k: v.to(target_device) if isinstance(v, torch.Tensor) else v
-        for k, v in multi_modal_data.items()
-    }
-
-    with (
-        set_forward_context(
-            None,
-            vllm_config=vllm_config,
-            cudagraph_runtime_mode=cudagraph_runtime_mode,
-            batch_descriptor=batch_descriptor,
-        ),
-        torch.inference_mode(),
-    ):
-        ans = model.embed_multimodal(**multi_modal_data)
-        torch.cuda.synchronize()
-        return ans
-
+from vllm.v1.worker.mm_cudagraph import MMEncoderCudagraphManager
 
 # Format: (model_name, tp_size, mm_encoder_tp_mode)
 TEST_CONFIGS = [
@@ -123,62 +77,103 @@ def llm(request):
         cleanup_dist_env_and_memory()
 
 
-class TestQwenVLCUDAGraph:
-    def _run_embed_multimodal(
-        self, llm, multi_modal_data, num_patches, force_eager=False
+def _worker_embed_multimodal(
+    worker, vllm_config, multi_modal_data, enforce_eager=False
+):
+    """Helper function to run multimodal embedding on a worker.
+    This function sets up the necessary forward context for tensor-parallel (TP)
+    execution and then calls the model's `embed_multimodal` method.
+    Note: For data-parallel (DP) mode, the forward context is typically
+          created and managed within the
+          vision.py:run_dp_sharded_mrope_vision_model(), which would override the
+          context set here.
+    This method manually constructs a MMEncoderCudagraphManager because accessing the
+        one within the GPU model runner is difficult.
+    Args:
+        worker: The worker instance containing the model runner.
+        vllm_config: The vLLM engine configuration.
+        multi_modal_data: A dictionary of keyword arguments to be passed to
+            the model's `embed_multimodal` method.
+        enforce_eager: If True, forces the execution to run in eager mode
+    Returns:
+        The output from the model's `embed_multimodal` method.
+    """
+
+    # Access model via worker.model_runner.model
+    # Note: Accessing internal attributes. Assuming V1 worker structure.
+    model = worker.model_runner.model
+
+    # Move multi_modal_data to the model's device
+    target_device = next(model.parameters()).device
+    multi_modal_data = {
+        k: v.to(target_device) if isinstance(v, torch.Tensor) else v
+        for k, v in multi_modal_data.items()
+    }
+
+    processor = MULTIMODAL_REGISTRY.create_processor(vllm_config.model_config)
+    dummy_inputs_builder = processor.dummy_inputs
+    mm_cudagraph_manager = MMEncoderCudagraphManager(
+        vllm_config,
+        dummy_inputs_builder,
+    )
+    mm_cudagraph_manager.dispatcher.initialize_cudagraph_keys(
+        CUDAGraphMode.PIECEWISE,
+    )
+
+    # Dispatch to get runtime mode and batch descriptor
+    (
+        cudagraph_runtime_mode,
+        batch_descriptor,
+        _,
+        multi_modal_data,
+    ) = mm_cudagraph_manager.dispatch_and_pad_mm_input(multi_modal_data)
+    if enforce_eager:
+        cudagraph_runtime_mode = CUDAGraphMode.NONE
+    else:
+        multi_modal_data["mm_cudagraph_manager"] = mm_cudagraph_manager
+
+    with (
+        set_forward_context(
+            None,
+            vllm_config=vllm_config,
+            cudagraph_runtime_mode=cudagraph_runtime_mode,
+            batch_descriptor=batch_descriptor,
+        ),
+        torch.inference_mode(),
     ):
+        ans = model.embed_multimodal(**multi_modal_data)
+        torch.cuda.synchronize()
+        return ans
+
+
+class TestQwenVLCUDAGraph:
+    def _run_embed_multimodal(self, llm, multi_modal_data, enforce_eager=False):
         """Runs the multimodal embedding process, potentially with CUDA graphs.
-        This method manually constructs a CudagraphDispatcher because accessing the
-        one within the GPU model runner is difficult. It then dispatches based on
-        the number of image patches to determine the appropriate CUDA graph or
-        eager mode for execution. The actual embedding is performed on the
-        worker(s) via an RPC call.
+        The actual embedding is performed on the worker(s) via an RPC call.
         Args:
             llm: The LLM object containing the model engine and configuration.
             multi_modal_data: A dictionary containing the multimodal data to be
                 processed.
-            num_patches: The number of image patches, used to determine the
-                number of tokens for the dispatcher.
-            force_eager: If True, forces the execution to run in eager mode,
+            enforce_eager: If True, forces the execution to run in eager mode,
                 bypassing CUDA graphs.
         Returns:
             The outputs from the multimodal embedding process executed on the
             worker.
         """
         vllm_config = llm.llm_engine.vllm_config
-
-        dispatcher = CudagraphDispatcher(vllm_config)
-        dispatcher.initialize_cudagraph_keys(
-            cudagraph_mode=vllm_config.compilation_config.cudagraph_mode,
-            uniform_decode_query_len=1,
-        )
-
-        # Dispatch to get runtime mode and batch descriptor
-        cudagraph_runtime_mode, batch_descriptor = dispatcher.dispatch(
-            num_tokens=num_patches,
-            uniform_decode=False,
-            has_lora=False,
-            is_mm_encoder=True,
-        )
-
         model_executor = llm.llm_engine.model_executor
 
         rpc_kwargs = {}
         # Use collective_rpc to execute on driver worker (rank 0)
         if isinstance(model_executor, MultiprocExecutor):
             rpc_kwargs["unique_reply_rank"] = 0
-        # If force_eager is True, override the runtime mode to NONE
-        if force_eager:
-            cudagraph_runtime_mode = CUDAGraphMode.NONE
-        else:
-            multi_modal_data["cudagraph_dispatcher"] = dispatcher
+
         outputs = model_executor.collective_rpc(
             partial(
                 _worker_embed_multimodal,
                 vllm_config=vllm_config,
-                cudagraph_runtime_mode=cudagraph_runtime_mode,
-                batch_descriptor=batch_descriptor,
                 multi_modal_data=multi_modal_data,
+                enforce_eager=enforce_eager,
             ),
             **rpc_kwargs,
         )
@@ -216,12 +211,12 @@ def test_vit_cudagraph_consistency(self, llm):
 
             # Run with Piecewise CUDA Graph
             piecewise_outputs = self._run_embed_multimodal(
-                llm, multi_modal_data, num_patches * num_imgs, force_eager=False
+                llm, multi_modal_data, enforce_eager=False
             )
 
             # Run with Eager Mode (simulated by setting runtime mode to NONE)
             eager_outputs = self._run_embed_multimodal(
-                llm, multi_modal_data, num_patches * num_imgs, force_eager=True
+                llm, multi_modal_data, enforce_eager=True
             )
 
             if isinstance(piecewise_outputs, torch.Tensor):
diff --git a/vllm/forward_context.py b/vllm/forward_context.py
index be08e2d9a6bc..7d5c48a2e506 100644
--- a/vllm/forward_context.py
+++ b/vllm/forward_context.py
@@ -55,10 +55,6 @@ class BatchDescriptor(NamedTuple):
     (like fused_moe_lora) whose grid size depends on num_active_loras
     to be properly captured.
     """
-    is_mm_encoder: bool = False
-    """
-    mm_encoder Piecewise CUDA Graph Flag
-    """
 
     def relax_for_mixed_batch_cudagraphs(self) -> "BatchDescriptor":
         """
@@ -71,7 +67,6 @@ def relax_for_mixed_batch_cudagraphs(self) -> "BatchDescriptor":
             uniform=False,
             has_lora=self.has_lora,
             num_active_loras=self.num_active_loras,
-            is_mm_encoder=self.is_mm_encoder,
         )
 
 
diff --git a/vllm/model_executor/models/vision.py b/vllm/model_executor/models/vision.py
index 538f1c98d64e..adcfd7d3b370 100644
--- a/vllm/model_executor/models/vision.py
+++ b/vllm/model_executor/models/vision.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+import contextlib
 import itertools
 import math
 from abc import ABC, abstractmethod
@@ -11,7 +12,6 @@
 from transformers import PretrainedConfig
 
 from vllm.config import (
-    CUDAGraphMode,
     MultiModalConfig,
     VllmConfig,
     get_current_vllm_config,
@@ -483,9 +483,7 @@ def run_dp_sharded_mrope_vision_model(
     local_grid_thw_list = [grid_thw_list[i] for i in image_idxs_local]
 
     # Context setup
-    vllm_config = get_current_vllm_config()
-    cudagraph_runtime_mode = CUDAGraphMode.NONE
-    batch_descriptor = None
+    ctx = contextlib.nullcontext()
 
     if mm_cudagraph_manager is not None:
         mm_groups: dict[str, torch.Tensor | list] = {
@@ -501,12 +499,14 @@ def run_dp_sharded_mrope_vision_model(
         pixel_values_local = mm_groups["pixel_values"]
         local_grid_thw_list = mm_groups["image_grid_thw"]
 
-    with set_forward_context(
-        None,
-        vllm_config=vllm_config,
-        cudagraph_runtime_mode=cudagraph_runtime_mode,
-        batch_descriptor=batch_descriptor,
-    ):
+        ctx = set_forward_context(
+            None,
+            vllm_config=mm_cudagraph_manager.vllm_config,
+            cudagraph_runtime_mode=cudagraph_runtime_mode,
+            batch_descriptor=batch_descriptor,
+        )
+
+    with ctx:
         # Run the vision model on the local pixel_values_local
         if rope_type == "rope_2d":
             if pixel_values_local.shape[0] > 0:
diff --git a/vllm/v1/cudagraph_dispatcher.py b/vllm/v1/cudagraph_dispatcher.py
index af9c90f3b016..2716fce64ef5 100644
--- a/vllm/v1/cudagraph_dispatcher.py
+++ b/vllm/v1/cudagraph_dispatcher.py
@@ -29,9 +29,20 @@ class CudagraphDispatcher:
     runnable without cudagraph (if the mode does not match or mode is NONE).
     """
 
-    def __init__(self, vllm_config: VllmConfig):
+    def __init__(self, vllm_config: VllmConfig, is_mm_encoder: bool = False):
         self.vllm_config = vllm_config
         self.compilation_config = vllm_config.compilation_config
+        self.is_mm_encoder = is_mm_encoder
+        self.max_capture_size = (
+            self.compilation_config.max_cudagraph_capture_size
+            if not is_mm_encoder
+            else self.compilation_config.max_mm_encoder_cudagraph_capture_size
+        )
+        self.capture_sizes = (
+            self.compilation_config.cudagraph_capture_sizes
+            if not is_mm_encoder
+            else self.compilation_config.mm_encoder_cudagraph_capture_sizes
+        )
         self.uniform_decode_query_len = (
             1
             if not self.vllm_config.speculative_config
@@ -68,11 +79,16 @@ def __init__(self, vllm_config: VllmConfig):
 
     def _compute_bs_to_padded_graph_size(self) -> None:
         """Pre-compute the mapping from batch size to padded graph size."""
-        max_capture_size = self.compilation_config.max_cudagraph_capture_size
-        capture_sizes = self.compilation_config.cudagraph_capture_sizes
-        self._bs_to_padded_graph_size: list[int] = self._get_padded_size_map(
-            capture_sizes, max_capture_size
-        )
+        self._bs_to_padded_graph_size: list[int] = [0] * (self.max_capture_size + 1)
+        for end, start in zip(
+            self.capture_sizes + [self.max_capture_size + 1],
+            [0] + self.capture_sizes,
+        ):
+            for bs in range(start, end):
+                if bs == start:
+                    self._bs_to_padded_graph_size[bs] = start
+                else:
+                    self._bs_to_padded_graph_size[bs] = end
 
         # Validate that compile_sizes won't be changed by padding.
         # Only validate when cudagraphs are actually being used.
@@ -81,7 +97,7 @@ def _compute_bs_to_padded_graph_size(self) -> None:
             and self.cudagraph_mode != CUDAGraphMode.NONE
         ):
             for size in self.compilation_config.compile_sizes:
-                if size <= max_capture_size:
+                if size <= self.max_capture_size:
                     padded = self._bs_to_padded_graph_size[size]
                     if padded != size:
                         raise ValueError(
@@ -112,48 +128,16 @@ def _get_lora_cases(self) -> list[int]:
             # No specialization: only capture graphs with LoRA active
             return [lora_config.max_loras + 1]
 
-    def _compute_bs_to_padded_mm_encoder_graph_size(self) -> None:
-        """pre-compute the mapping from batch size to mm_encoder padded graph size."""
-        max_capture_size = self.compilation_config.max_mm_encoder_cudagraph_capture_size
-        capture_sizes = self.compilation_config.mm_encoder_cudagraph_capture_sizes
-
-        self._bs_to_padded_mm_encoder_graph_size: list[int] = self._get_padded_size_map(
-            capture_sizes, max_capture_size
-        )
-
-    def _get_padded_size_map(
-        self, capture_sizes: list[int] | None, max_size: int | None
-    ) -> list[int]:
-        if capture_sizes is None:
-            capture_sizes = []
-        if max_size is None:
-            max_size = 0
-        padded_size_map: list[int] = [0] * (max_size + 1)
-        for end, start in zip(
-            capture_sizes + [max_size + 1],
-            [0] + capture_sizes,
-        ):
-            for bs in range(start, end):
-                if bs == start:
-                    padded_size_map[bs] = start
-                else:
-                    padded_size_map[bs] = end
-        return padded_size_map
-
     def _create_padded_batch_descriptor(
         self,
         num_tokens: int,
         uniform_decode: bool,
         has_lora: bool,
         num_active_loras: int = 0,
-        is_mm_encoder: bool = False,
     ) -> BatchDescriptor:
         max_num_seqs = self.vllm_config.scheduler_config.max_num_seqs
         uniform_decode_query_len = self.uniform_decode_query_len
-        if is_mm_encoder:
-            num_tokens_padded = self._bs_to_padded_mm_encoder_graph_size[num_tokens]
-        else:
-            num_tokens_padded = self._bs_to_padded_graph_size[num_tokens]
+        num_tokens_padded = self._bs_to_padded_graph_size[num_tokens]
 
         if uniform_decode and self.cudagraph_mode.has_mode(CUDAGraphMode.FULL):
             num_reqs = num_tokens_padded // uniform_decode_query_len
@@ -168,7 +152,6 @@ def _create_padded_batch_descriptor(
             uniform=uniform_decode,
             has_lora=has_lora,
             num_active_loras=num_active_loras,
-            is_mm_encoder=is_mm_encoder,
         )
 
     def add_cudagraph_key(
@@ -192,10 +175,9 @@ def initialize_cudagraph_keys(
             return
 
         self._compute_bs_to_padded_graph_size()
-        self._compute_bs_to_padded_mm_encoder_graph_size()
 
         # Get LoRA cases to capture
-        lora_cases = self._get_lora_cases()
+        lora_cases = self._get_lora_cases() if not self.is_mm_encoder else [0]
         self.captured_lora_counts = [
             lora_count for lora_count in lora_cases if lora_count
         ]
@@ -204,23 +186,13 @@ def initialize_cudagraph_keys(
         # guarantee all keys would be used. For example, if we allow lazy
         # capturing in future PR, some keys may never be triggered.
         if cudagraph_mode.mixed_mode() != CUDAGraphMode.NONE:
-            for bs, num_active_loras in product(
-                self.compilation_config.cudagraph_capture_sizes, lora_cases
-            ):
+            for bs, num_active_loras in product(self.capture_sizes, lora_cases):
                 self.add_cudagraph_key(
                     cudagraph_mode.mixed_mode(),
                     self._create_padded_batch_descriptor(
                         bs, False, num_active_loras > 0, num_active_loras
                     ).relax_for_mixed_batch_cudagraphs(),
                 )
-            # mm_encoder CUDAGraph Entry
-            for patch_len in self.compilation_config.mm_encoder_cudagraph_capture_sizes:
-                self.add_cudagraph_key(
-                    CUDAGraphMode.PIECEWISE,
-                    self._create_padded_batch_descriptor(
-                        patch_len, False, False, is_mm_encoder=True
-                    ).relax_for_mixed_batch_cudagraphs(),
-                )
 
         # if decode cudagraph mode is FULL, and we don't already have mixed
         # mode full cudagraphs then add them here.
@@ -234,7 +206,7 @@ def initialize_cudagraph_keys(
             )
             cudagraph_capture_sizes_for_decode = [
                 x
-                for x in self.compilation_config.cudagraph_capture_sizes
+                for x in self.capture_sizes
                 if x <= max_num_tokens and x >= uniform_decode_query_len
             ]
             for bs, num_active_loras in product(
@@ -256,7 +228,6 @@ def dispatch(
         has_lora: bool = False,
         disable_full: bool = False,
         num_active_loras: int = 0,
-        is_mm_encoder: bool = False,
     ) -> tuple[CUDAGraphMode, BatchDescriptor]:
         """
         Given conditions(e.g.,batch descriptor and if using piecewise only),
@@ -277,19 +248,9 @@ def dispatch(
         if (
             not self.keys_initialized
             or self.cudagraph_mode == CUDAGraphMode.NONE
-            or (
-                not is_mm_encoder
-                and num_tokens > self.compilation_config.max_cudagraph_capture_size
-            )
-            or (
-                is_mm_encoder
-                and num_tokens
-                > self.compilation_config.max_mm_encoder_cudagraph_capture_size
-            )
+            or num_tokens > self.max_capture_size
         ):
-            return CUDAGraphMode.NONE, BatchDescriptor(
-                num_tokens, is_mm_encoder=is_mm_encoder
-            )
+            return CUDAGraphMode.NONE, BatchDescriptor(num_tokens)
 
         effective_num_active_loras = num_active_loras
         if has_lora and num_active_loras > 0:
@@ -307,13 +268,9 @@ def dispatch(
                 # so we must use max_loras + 1 for dispatch to find a matching graph.
                 effective_num_active_loras = self.vllm_config.lora_config.max_loras + 1
 
-                batch_desc = self._create_padded_batch_descriptor(
-                    num_tokens,
-                    uniform_decode,
-                    has_lora,
-                    effective_num_active_loras,
-                    is_mm_encoder,
-                )
+        batch_desc = self._create_padded_batch_descriptor(
+            num_tokens, uniform_decode, has_lora, effective_num_active_loras
+        )
         relaxed_batch_desc = batch_desc.relax_for_mixed_batch_cudagraphs()
 
         if not disable_full:
@@ -331,13 +288,9 @@ def dispatch(
             return CUDAGraphMode.PIECEWISE, relaxed_batch_desc
 
         # finally, just return no cudagraphs and a trivial batch descriptor
-        return CUDAGraphMode.NONE, BatchDescriptor(
-            num_tokens, is_mm_encoder=is_mm_encoder
-        )
+        return CUDAGraphMode.NONE, BatchDescriptor(num_tokens)
 
-    def get_capture_descs(
-        self, is_mm_encoder: bool = False
-    ) -> list[tuple[CUDAGraphMode, list[BatchDescriptor]]]:
+    def get_capture_descs(self) -> list[tuple[CUDAGraphMode, list[BatchDescriptor]]]:
         """
         Returns capture descriptors for cudagraph capturing.
 
@@ -355,9 +308,7 @@ def get_capture_descs(
             descs = list(self.cudagraph_keys[mode])
             if descs:
                 # Sort by num_tokens descending (largest first)
-                filter_descs = [d for d in descs if d.is_mm_encoder == is_mm_encoder]
-                if filter_descs:
-                    filter_descs.sort(key=lambda d: d.num_tokens, reverse=True)
-                    result.append((mode, filter_descs))
+                descs.sort(key=lambda d: d.num_tokens, reverse=True)
+                result.append((mode, descs))
 
         return result
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index f1b1438c81b6..b8c88e881a1a 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -658,8 +658,6 @@ def __init__(
             dummy_inputs_builder = processor.dummy_inputs
             self.mm_cudagraph_manager = MMEncoderCudagraphManager(
                 self.vllm_config,
-                self.cudagraph_dispatcher,
-                self.device,
                 dummy_inputs_builder,
             )
 
@@ -2442,11 +2440,7 @@ def _execute_mm_encoder(
                     # Default values for non-mm_encoder cudagraph case
                     cudagraph_runtime_mode = CUDAGraphMode.NONE
                     batch_descriptor = None
-                    if (
-                        mm_mgr is not None
-                        and mm_mgr.enabled
-                        and "pixel_values" in mm_kwargs_group
-                    ):
+                    if mm_mgr is not None and "pixel_values" in mm_kwargs_group:
                         (
                             cudagraph_runtime_mode,
                             batch_descriptor,
@@ -5259,9 +5253,10 @@ def freeze_gc():
                 )
             # Capture MM encoder CUDA graphs if enabled
             if self.mm_cudagraph_manager is not None:
-                for runtime_mode, _ in self.cudagraph_dispatcher.get_capture_descs(
-                    is_mm_encoder=True
-                ):
+                for (
+                    runtime_mode,
+                    _,
+                ) in self.mm_cudagraph_manager.dispatcher.get_capture_descs():
                     self.mm_cudagraph_manager.capture(
                         model=self.model, cudagraph_mode=runtime_mode
                     )
@@ -5629,6 +5624,14 @@ def _check_and_update_cudagraph_mode(
             cudagraph_mode, self.uniform_decode_query_len
         )
 
+        if (
+            self.mm_cudagraph_manager is not None
+            and cudagraph_mode.mixed_mode() != CUDAGraphMode.NONE
+        ):
+            self.mm_cudagraph_manager.dispatcher.initialize_cudagraph_keys(
+                CUDAGraphMode.PIECEWISE,
+            )
+
         # Initialize eagle's cudagraph dispatcher if using eagle spec decode.
         if self.speculative_config and self.speculative_config.use_eagle():
             assert isinstance(self.drafter, EagleProposer)
diff --git a/vllm/v1/worker/mm_cudagraph.py b/vllm/v1/worker/mm_cudagraph.py
index fd572c40c46c..4f2b1cf72e8e 100644
--- a/vllm/v1/worker/mm_cudagraph.py
+++ b/vllm/v1/worker/mm_cudagraph.py
@@ -24,13 +24,10 @@ class MMEncoderCudagraphManager:
     def __init__(
         self,
         vllm_config: VllmConfig,
-        cudagraph_dispatcher: CudagraphDispatcher,
-        device: torch.device,
         dummy_input_builder: BaseDummyInputsBuilder[Any],
     ):
         self.vllm_config = vllm_config
-        self.dispatcher = cudagraph_dispatcher
-        self.device = device
+        self.dispatcher = CudagraphDispatcher(self.vllm_config, is_mm_encoder=True)
         self.dummy_input_builder = dummy_input_builder
 
         compilation_config = vllm_config.compilation_config
@@ -76,7 +73,7 @@ def dispatch_and_pad_mm_input(
         if not self.enabled:
             return (
                 CUDAGraphMode.NONE,
-                BatchDescriptor(num_tokens, is_mm_encoder=True),
+                BatchDescriptor(num_tokens),
                 original_num_imgs,
                 mm_kwargs_group,
             )
@@ -84,7 +81,6 @@ def dispatch_and_pad_mm_input(
         # Dispatch to get the target padded size
         cudagraph_runtime_mode, batch_descriptor = self.dispatcher.dispatch(
             num_tokens=num_tokens,
-            is_mm_encoder=True,
         )
         target_num_tokens = batch_descriptor.num_tokens
 
@@ -127,10 +123,7 @@ def capture_graph(
             num_tokens
         )
 
-        batch_descriptor = BatchDescriptor(
-            num_tokens=num_tokens,
-            is_mm_encoder=True,
-        )
+        batch_descriptor = BatchDescriptor(num_tokens=num_tokens)
 
         with set_forward_context(
             None,

From 9be3fa64d2278682dd0c6148ad73a4ebd73d4599 Mon Sep 17 00:00:00 2001
From: Hongjian Zhang <hirokenovo@gmail.com>
Date: Wed, 4 Feb 2026 19:46:35 +0800
Subject: [PATCH 34/35] modify to be compatible with V1 design

Signed-off-by: Hongjian Zhang <hirokenovo@gmail.com>
---
 .../piecewise/test_qwenvl_vit_cudagraph.py    |  2 +-
 vllm/model_executor/models/qwen2_5_vl.py      | 33 +++++++------
 vllm/model_executor/models/qwen3_vl.py        | 33 +++++++------
 vllm/v1/worker/gpu_model_runner.py            | 15 +++---
 vllm/v1/worker/mm_cudagraph.py                | 48 +++++++------------
 5 files changed, 57 insertions(+), 74 deletions(-)

diff --git a/tests/compile/piecewise/test_qwenvl_vit_cudagraph.py b/tests/compile/piecewise/test_qwenvl_vit_cudagraph.py
index 82cb10394720..f59368fcbd1c 100644
--- a/tests/compile/piecewise/test_qwenvl_vit_cudagraph.py
+++ b/tests/compile/piecewise/test_qwenvl_vit_cudagraph.py
@@ -116,7 +116,7 @@ def _worker_embed_multimodal(
         vllm_config,
         dummy_inputs_builder,
     )
-    mm_cudagraph_manager.dispatcher.initialize_cudagraph_keys(
+    mm_cudagraph_manager.initialize_cudagraph_keys(
         CUDAGraphMode.PIECEWISE,
     )
 
diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py
index d6a352392e9c..bb7e44f9f30b 100644
--- a/vllm/model_executor/models/qwen2_5_vl.py
+++ b/vllm/model_executor/models/qwen2_5_vl.py
@@ -51,11 +51,14 @@
     CUDAGraphMode,
     VllmConfig,
     get_current_vllm_config,
-    set_current_vllm_config,
 )
 from vllm.distributed import parallel_state
 from vllm.distributed import utils as dist_utils
-from vllm.forward_context import get_forward_context, is_forward_context_available
+from vllm.forward_context import (
+    get_forward_context,
+    is_forward_context_available,
+    set_forward_context,
+)
 from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import get_act_and_mul_fn
 from vllm.model_executor.layers.attention import MMEncoderAttention
@@ -1287,20 +1290,16 @@ def _process_image_input(
             image_embeds = image_input["image_embeds"].type(self.visual.dtype)
         else:
             pixel_values = image_input["pixel_values"]
-            with set_current_vllm_config(self.vllm_config):
-                if (
-                    self.use_data_parallel
-                    and not self.vllm_config.in_mm_encoder_tracing
-                ):
-                    return run_dp_sharded_mrope_vision_model(
-                        self.visual,
-                        pixel_values,
-                        grid_thw_list,
-                        rope_type="rope_3d",
-                        mm_cudagraph_manager=mm_cudagraph_manager,
-                    )
-                else:
-                    image_embeds = self.visual(pixel_values, grid_thw=grid_thw_list)
+            if self.use_data_parallel and not self.vllm_config.in_mm_encoder_tracing:
+                return run_dp_sharded_mrope_vision_model(
+                    self.visual,
+                    pixel_values,
+                    grid_thw_list,
+                    rope_type="rope_3d",
+                    mm_cudagraph_manager=mm_cudagraph_manager,
+                )
+            else:
+                image_embeds = self.visual(pixel_values, grid_thw=grid_thw_list)
 
         # Split concatenated embeddings for each image item.
         merge_size = self.visual.spatial_merge_size
@@ -1351,7 +1350,7 @@ def _process_video_input(
             video_embeds = video_input["video_embeds"].type(self.visual.dtype)
         else:
             pixel_values_videos = video_input["pixel_values_videos"]
-            with set_current_vllm_config(self.vllm_config):
+            with set_forward_context(None, self.vllm_config):
                 if (
                     self.use_data_parallel
                     and not self.vllm_config.in_mm_encoder_tracing
diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py
index 9e6001f474b0..f1fb4a1d2ad4 100644
--- a/vllm/model_executor/models/qwen3_vl.py
+++ b/vllm/model_executor/models/qwen3_vl.py
@@ -57,11 +57,14 @@
     CUDAGraphMode,
     VllmConfig,
     get_current_vllm_config,
-    set_current_vllm_config,
 )
 from vllm.config.multimodal import BaseDummyOptions, VideoDummyOptions
 from vllm.distributed import get_pp_group
-from vllm.forward_context import get_forward_context, is_forward_context_available
+from vllm.forward_context import (
+    get_forward_context,
+    is_forward_context_available,
+    set_forward_context,
+)
 from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import _ACTIVATION_REGISTRY
 from vllm.model_executor.layers.conv import Conv3dLayer
@@ -1546,20 +1549,16 @@ def _process_image_input(
         else:
             pixel_values = image_input["pixel_values"].type(self.visual.dtype)
 
-            with set_current_vllm_config(self.vllm_config):
-                if (
-                    self.use_data_parallel
-                    and not self.vllm_config.in_mm_encoder_tracing
-                ):
-                    return run_dp_sharded_mrope_vision_model(
-                        self.visual,
-                        pixel_values,
-                        grid_thw_list,
-                        rope_type="rope_3d",
-                        mm_cudagraph_manager=mm_cudagraph_manager,
-                    )
-                else:
-                    image_embeds = self.visual(pixel_values, grid_thw=grid_thw_list)
+            if self.use_data_parallel and not self.vllm_config.in_mm_encoder_tracing:
+                return run_dp_sharded_mrope_vision_model(
+                    self.visual,
+                    pixel_values,
+                    grid_thw_list,
+                    rope_type="rope_3d",
+                    mm_cudagraph_manager=mm_cudagraph_manager,
+                )
+            else:
+                image_embeds = self.visual(pixel_values, grid_thw=grid_thw_list)
 
         # Split concatenated embeddings for each image item.
         merge_size = self.visual.spatial_merge_size
@@ -1581,7 +1580,7 @@ def _process_video_input(
             pixel_values_videos = video_input["pixel_values_videos"].type(
                 self.visual.dtype
             )
-            with set_current_vllm_config(self.vllm_config):
+            with set_forward_context(None, self.vllm_config):
                 if (
                     self.use_data_parallel
                     and not self.vllm_config.in_mm_encoder_tracing
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index b8c88e881a1a..bc48153ad060 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -5255,10 +5255,12 @@ def freeze_gc():
             if self.mm_cudagraph_manager is not None:
                 for (
                     runtime_mode,
-                    _,
+                    batch_descs,
                 ) in self.mm_cudagraph_manager.dispatcher.get_capture_descs():
                     self.mm_cudagraph_manager.capture(
-                        model=self.model, cudagraph_mode=runtime_mode
+                        model=self.model,
+                        batch_descs=batch_descs,
+                        cudagraph_mode=runtime_mode,
                     )
 
             torch.cuda.synchronize()
@@ -5624,13 +5626,8 @@ def _check_and_update_cudagraph_mode(
             cudagraph_mode, self.uniform_decode_query_len
         )
 
-        if (
-            self.mm_cudagraph_manager is not None
-            and cudagraph_mode.mixed_mode() != CUDAGraphMode.NONE
-        ):
-            self.mm_cudagraph_manager.dispatcher.initialize_cudagraph_keys(
-                CUDAGraphMode.PIECEWISE,
-            )
+        if self.mm_cudagraph_manager is not None:
+            self.mm_cudagraph_manager.initialize_cudagraph_keys(cudagraph_mode)
 
         # Initialize eagle's cudagraph dispatcher if using eagle spec decode.
         if self.speculative_config and self.speculative_config.use_eagle():
diff --git a/vllm/v1/worker/mm_cudagraph.py b/vllm/v1/worker/mm_cudagraph.py
index 4f2b1cf72e8e..c633d6689ef1 100644
--- a/vllm/v1/worker/mm_cudagraph.py
+++ b/vllm/v1/worker/mm_cudagraph.py
@@ -30,19 +30,6 @@ def __init__(
         self.dispatcher = CudagraphDispatcher(self.vllm_config, is_mm_encoder=True)
         self.dummy_input_builder = dummy_input_builder
 
-        compilation_config = vllm_config.compilation_config
-        self.capture_sizes: list[int] = []
-        if compilation_config and compilation_config.mm_encoder_cudagraph_capture_sizes:
-            self.capture_sizes = sorted(
-                compilation_config.mm_encoder_cudagraph_capture_sizes
-            )
-
-        self.enabled = bool(
-            self.capture_sizes
-            and compilation_config
-            and compilation_config.cudagraph_mode != CUDAGraphMode.NONE
-        )
-
         # Check if using data parallel mode for ViT
         self.is_vit_dp_mode = self._check_vit_dp_mode(vllm_config)
 
@@ -57,6 +44,18 @@ def _check_vit_dp_mode(self, vllm_config: VllmConfig) -> bool:
 
         return mm_encoder_tp_mode == "data" and tp_size > 1
 
+    def initialize_cudagraph_keys(self, cudagraph_mode: CUDAGraphMode) -> None:
+        """Initialize cudagraph dispatcher keys for MM Encoder.
+
+        MM Encoder only supports PIECEWISE cudagraphs.
+        """
+        if cudagraph_mode.mixed_mode() in [CUDAGraphMode.PIECEWISE, CUDAGraphMode.FULL]:
+            mm_cudagraph_mode = CUDAGraphMode.PIECEWISE
+        else:
+            mm_cudagraph_mode = CUDAGraphMode.NONE
+
+        self.dispatcher.initialize_cudagraph_keys(mm_cudagraph_mode)
+
     def dispatch_and_pad_mm_input(
         self,
         mm_kwargs_group: BatchedTensorInputs,
@@ -70,14 +69,6 @@ def dispatch_and_pad_mm_input(
         else:
             original_num_imgs = len(image_grid_thw)
 
-        if not self.enabled:
-            return (
-                CUDAGraphMode.NONE,
-                BatchDescriptor(num_tokens),
-                original_num_imgs,
-                mm_kwargs_group,
-            )
-
         # Dispatch to get the target padded size
         cudagraph_runtime_mode, batch_descriptor = self.dispatcher.dispatch(
             num_tokens=num_tokens,
@@ -138,25 +129,22 @@ def capture_graph(
     def capture(
         self,
         model: nn.Module,
+        batch_descs: "list[BatchDescriptor]",
         cudagraph_mode: CUDAGraphMode,
     ) -> None:
-        if not self.enabled or not self.capture_sizes:
-            return
-
         self.vllm_config.in_mm_encoder_tracing = True
 
-        capture_sizes_desc = list(reversed(self.capture_sizes))
-
         if is_global_first_rank():
-            capture_sizes_iter: Any = tqdm(
-                capture_sizes_desc,
+            batch_descriptors: Any = tqdm(
+                batch_descs,
                 disable=not self.vllm_config.load_config.use_tqdm_on_load,
                 desc="Capturing MM_Encoder CUDA graphs (PIECEWISE)",
             )
         else:
-            capture_sizes_iter = capture_sizes_desc
+            batch_descriptors = batch_descs
 
-        for capture_size in capture_sizes_iter:
+        for batch_desc in batch_descriptors:
+            capture_size = batch_desc.num_tokens
             self.capture_graph(
                 capture_size,
                 model=model,

From 6da90763761c3916f0ccaa58a943184b55c06822 Mon Sep 17 00:00:00 2001
From: Hongjian Zhang <hirokenovo@gmail.com>
Date: Thu, 5 Feb 2026 14:47:24 +0800
Subject: [PATCH 35/35] simplify CudagraphDispatcher init and restore video
 logic

Signed-off-by: Hongjian Zhang <hirokenovo@gmail.com>
---
 vllm/model_executor/models/qwen2_5_vl.py |  9 ++-----
 vllm/model_executor/models/qwen3_vl.py   | 15 +++---------
 vllm/model_executor/models/vision.py     |  6 +----
 vllm/v1/cudagraph_dispatcher.py          | 31 ++++++++++++------------
 vllm/v1/worker/mm_cudagraph.py           | 17 +++++++++++--
 5 files changed, 38 insertions(+), 40 deletions(-)

diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py
index bb7e44f9f30b..ca13b8c096b1 100644
--- a/vllm/model_executor/models/qwen2_5_vl.py
+++ b/vllm/model_executor/models/qwen2_5_vl.py
@@ -1338,9 +1338,7 @@ def _postprocess_image_embeds_evs(
         return tuple(image_embeds_split)
 
     def _process_video_input(
-        self,
-        video_input: Qwen2_5_VLVideoInputs,
-        mm_cudagraph_manager: Any | None = None,
+        self, video_input: Qwen2_5_VLVideoInputs
     ) -> tuple[torch.Tensor, ...]:
         grid_thw = video_input["video_grid_thw"]
         assert grid_thw.ndim == 2
@@ -1360,7 +1358,6 @@ def _process_video_input(
                         pixel_values_videos,
                         grid_thw_list,
                         rope_type="rope_3d",
-                        mm_cudagraph_manager=mm_cudagraph_manager,
                     )
                 else:
                     video_embeds = self.visual(
@@ -1533,9 +1530,7 @@ def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings:
                     )
                 multimodal_embeddings += tuple(image_embeddings)
             if modality == "video":
-                video_embeddings = self._process_video_input(
-                    multimodal_input, mm_cudagraph_manager=mm_cudagraph_manager
-                )
+                video_embeddings = self._process_video_input(multimodal_input)
                 if self.is_multimodal_pruning_enabled:
                     video_embeddings = self._postprocess_video_embeds_evs(
                         video_embeddings, multimodal_input
diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py
index f1fb4a1d2ad4..3f51ce90fbd2 100644
--- a/vllm/model_executor/models/qwen3_vl.py
+++ b/vllm/model_executor/models/qwen3_vl.py
@@ -1566,13 +1566,10 @@ def _process_image_input(
         return image_embeds.split(sizes)
 
     def _process_video_input(
-        self,
-        video_input: Qwen2_5_VLVideoInputs,
-        mm_cudagraph_manager: Any | None = None,
+        self, video_input: Qwen2_5_VLVideoInputs
     ) -> tuple[torch.Tensor, ...]:
         grid_thw = video_input["video_grid_thw"]
         assert grid_thw.ndim == 2
-        grid_thw_list = grid_thw.tolist()
 
         if video_input["type"] == "video_embeds":
             video_embeds = video_input["video_embeds"].type(self.visual.dtype)
@@ -1585,17 +1582,15 @@ def _process_video_input(
                     self.use_data_parallel
                     and not self.vllm_config.in_mm_encoder_tracing
                 ):
+                    grid_thw_list = grid_thw.tolist()
                     return run_dp_sharded_mrope_vision_model(
                         self.visual,
                         pixel_values_videos,
                         grid_thw_list,
                         rope_type="rope_3d",
-                        mm_cudagraph_manager=mm_cudagraph_manager,
                     )
                 else:
-                    video_embeds = self.visual(
-                        pixel_values_videos, grid_thw=grid_thw_list
-                    )
+                    video_embeds = self.visual(pixel_values_videos, grid_thw=grid_thw)
 
         # Split concatenated embeddings for each video item.
         merge_size = self.visual.spatial_merge_size
@@ -2062,9 +2057,7 @@ def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings | None:
                     )
                 multimodal_embeddings += tuple(image_embeddings)
             if modality == "video":
-                video_embeddings = self._process_video_input(
-                    multimodal_input, mm_cudagraph_manager=mm_cudagraph_manager
-                )
+                video_embeddings = self._process_video_input(multimodal_input)
                 if self.is_multimodal_pruning_enabled:
                     video_embeddings = self._postprocess_video_embeds_evs(
                         video_embeddings, multimodal_input
diff --git a/vllm/model_executor/models/vision.py b/vllm/model_executor/models/vision.py
index adcfd7d3b370..837ddf4a2534 100644
--- a/vllm/model_executor/models/vision.py
+++ b/vllm/model_executor/models/vision.py
@@ -11,11 +11,7 @@
 import torch
 from transformers import PretrainedConfig
 
-from vllm.config import (
-    MultiModalConfig,
-    VllmConfig,
-    get_current_vllm_config,
-)
+from vllm.config import MultiModalConfig, VllmConfig, get_current_vllm_config
 from vllm.distributed import (
     get_tensor_model_parallel_rank,
     get_tensor_model_parallel_world_size,
diff --git a/vllm/v1/cudagraph_dispatcher.py b/vllm/v1/cudagraph_dispatcher.py
index 2716fce64ef5..2fca21831c86 100644
--- a/vllm/v1/cudagraph_dispatcher.py
+++ b/vllm/v1/cudagraph_dispatcher.py
@@ -29,20 +29,9 @@ class CudagraphDispatcher:
     runnable without cudagraph (if the mode does not match or mode is NONE).
     """
 
-    def __init__(self, vllm_config: VllmConfig, is_mm_encoder: bool = False):
+    def __init__(self, vllm_config: VllmConfig):
         self.vllm_config = vllm_config
         self.compilation_config = vllm_config.compilation_config
-        self.is_mm_encoder = is_mm_encoder
-        self.max_capture_size = (
-            self.compilation_config.max_cudagraph_capture_size
-            if not is_mm_encoder
-            else self.compilation_config.max_mm_encoder_cudagraph_capture_size
-        )
-        self.capture_sizes = (
-            self.compilation_config.cudagraph_capture_sizes
-            if not is_mm_encoder
-            else self.compilation_config.mm_encoder_cudagraph_capture_sizes
-        )
         self.uniform_decode_query_len = (
             1
             if not self.vllm_config.speculative_config
@@ -76,6 +65,8 @@ def __init__(self, vllm_config: VllmConfig, is_mm_encoder: bool = False):
         )
         # Default cudagraph_mode to NONE until initialize_cudagraph_keys is called
         self.cudagraph_mode = CUDAGraphMode.NONE
+        self.capture_sizes: list[int] = []
+        self.max_capture_size: int = 0
 
     def _compute_bs_to_padded_graph_size(self) -> None:
         """Pre-compute the mapping from batch size to padded graph size."""
@@ -163,12 +154,22 @@ def add_cudagraph_key(
         self.cudagraph_keys[runtime_mode].add(batch_descriptor)
 
     def initialize_cudagraph_keys(
-        self, cudagraph_mode: CUDAGraphMode, uniform_decode_query_len: int = 1
+        self,
+        cudagraph_mode: CUDAGraphMode,
+        uniform_decode_query_len: int = 1,
+        capture_sizes: list[int] | None = None,
+        max_capture_size: int | None = None,
+        enable_lora: bool = True,
     ):
         # This should be called only after attention backend is initialized. So we can
         # get the correct cudagraph mode after backend support is resolved.
         self.cudagraph_mode = cudagraph_mode
-
+        self.capture_sizes = (
+            capture_sizes or self.compilation_config.cudagraph_capture_sizes
+        )
+        self.max_capture_size = (
+            max_capture_size or self.compilation_config.max_cudagraph_capture_size
+        )
         # Early exit if cudagraphs are disabled
         if cudagraph_mode == CUDAGraphMode.NONE:
             self.keys_initialized = True
@@ -177,7 +178,7 @@ def initialize_cudagraph_keys(
         self._compute_bs_to_padded_graph_size()
 
         # Get LoRA cases to capture
-        lora_cases = self._get_lora_cases() if not self.is_mm_encoder else [0]
+        lora_cases = self._get_lora_cases() if enable_lora else [0]
         self.captured_lora_counts = [
             lora_count for lora_count in lora_cases if lora_count
         ]
diff --git a/vllm/v1/worker/mm_cudagraph.py b/vllm/v1/worker/mm_cudagraph.py
index c633d6689ef1..6175d7d5c893 100644
--- a/vllm/v1/worker/mm_cudagraph.py
+++ b/vllm/v1/worker/mm_cudagraph.py
@@ -27,7 +27,7 @@ def __init__(
         dummy_input_builder: BaseDummyInputsBuilder[Any],
     ):
         self.vllm_config = vllm_config
-        self.dispatcher = CudagraphDispatcher(self.vllm_config, is_mm_encoder=True)
+        self.dispatcher = CudagraphDispatcher(self.vllm_config)
         self.dummy_input_builder = dummy_input_builder
 
         # Check if using data parallel mode for ViT
@@ -54,7 +54,20 @@ def initialize_cudagraph_keys(self, cudagraph_mode: CUDAGraphMode) -> None:
         else:
             mm_cudagraph_mode = CUDAGraphMode.NONE
 
-        self.dispatcher.initialize_cudagraph_keys(mm_cudagraph_mode)
+        max_capture_size = (
+            self.vllm_config.compilation_config.max_mm_encoder_cudagraph_capture_size
+        )
+
+        capture_sizes = (
+            self.vllm_config.compilation_config.mm_encoder_cudagraph_capture_sizes
+        )
+
+        self.dispatcher.initialize_cudagraph_keys(
+            mm_cudagraph_mode,
+            capture_sizes=capture_sizes,
+            max_capture_size=max_capture_size,
+            enable_lora=False,
+        )
 
     def dispatch_and_pad_mm_input(
         self,