From 28fa0e9ba691143b2ee02e130063987d04b83624 Mon Sep 17 00:00:00 2001 From: Xingran Wang Date: Wed, 12 Nov 2025 10:52:21 +0800 Subject: [PATCH 01/35] multimodal compile & piecewise graph Signed-off-by: Xingran Wang --- vllm/compilation/cuda_graph.py | 1 + vllm/compilation/monitor.py | 4 ++ vllm/config/vllm.py | 2 + vllm/model_executor/models/qwen2_5_vl.py | 10 +++- vllm/v1/worker/gpu_model_runner.py | 60 ++++++++++++++++++++++++ 5 files changed, 76 insertions(+), 1 deletion(-) diff --git a/vllm/compilation/cuda_graph.py b/vllm/compilation/cuda_graph.py index 7ffa74d0d7e6..098dd095e9c3 100644 --- a/vllm/compilation/cuda_graph.py +++ b/vllm/compilation/cuda_graph.py @@ -288,6 +288,7 @@ def __call__(self, *args: Any, **kwargs: Any) -> Any | None: entry.cudagraph = cudagraph compilation_counter.num_cudagraph_captured += 1 + logger.info(f"Compilation Counter: {compilation_counter.num_cudagraph_captured}") # important: we need to return the output, rather than # the weak ref of the output, so that pytorch can correctly diff --git a/vllm/compilation/monitor.py b/vllm/compilation/monitor.py index 2bad5f0a16fc..912e3d828abc 100644 --- a/vllm/compilation/monitor.py +++ b/vllm/compilation/monitor.py @@ -13,6 +13,8 @@ def start_monitoring_torch_compile(vllm_config: VllmConfig) -> None: + vllm_config.is_in_compile = True + global torch_compile_start_time torch_compile_start_time = time.time() @@ -29,6 +31,8 @@ def start_monitoring_torch_compile(vllm_config: VllmConfig) -> None: def end_monitoring_torch_compile(vllm_config: VllmConfig) -> None: + vllm_config.is_in_compile = False + compilation_config: CompilationConfig = vllm_config.compilation_config if compilation_config.mode == CompilationMode.VLLM_COMPILE: logger.info_once( diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py index ea133856360d..ef18ce03d88e 100644 --- a/vllm/config/vllm.py +++ b/vllm/config/vllm.py @@ -257,6 +257,8 @@ class VllmConfig: performance, with -O0 having the best startup time and -O3 having the 
best performance. -02 is used by defult. See OptimizationLevel for full description.""" + is_in_compile: bool = False + """For ViT Compile, Compile Status Flag""" def compute_hash(self) -> str: """ diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index 0310c5415dc9..d92ef7bdc312 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -1207,7 +1207,15 @@ def _process_image_input( image_embeds = image_input["image_embeds"].type(self.visual.dtype) else: pixel_values = image_input["pixel_values"] - with set_forward_context(None, self.vllm_config): + if self.vllm_config.is_in_compile: + with set_forward_context(None, self.vllm_config): + if self.use_data_parallel: + return run_dp_sharded_mrope_vision_model( + self.visual, pixel_values, grid_thw_list, rope_type="rope_3d" + ) + else: + image_embeds = self.visual(pixel_values, grid_thw=grid_thw_list) + else: if self.use_data_parallel: return run_dp_sharded_mrope_vision_model( self.visual, pixel_values, grid_thw_list, rope_type="rope_3d" diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 49211c6805ce..d3888ee79733 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -4569,6 +4569,59 @@ def rand_inputs_embeds() -> torch.Tensor: yield inputs_embeds.fill_(0) + def _get_dummy_vit_input(self, num_image_tokens: int) -> BatchedTensorInputs: + """ + Generates dummy multimodal inputs for a single image, with a controllable + number of resulting image tokens for a Vision Transformer (ViT) like model, + ensuring a square-like aspect ratio for the patch grid. + + This is useful for profiling or testing, allowing the creation of inputs + that result in a specific number of image tokens after vision encoding. + + Args: + num_image_tokens: The desired number of image tokens after encoding. 
+ + Returns: + A BatchedTensorInputs dictionary containing `pixel_values` and + `image_grid_thw` that can be passed as kwargs to + `get_multimodal_embeddings`. + """ + import math + + def find_square_like_factors(n: int): + """Finds two factors of n that are closest to its square root.""" + h = int(math.sqrt(n)) + while h > 0: + if n % h == 0: + return h, n // h + h -= 1 + return 1, n + + h_patches, w_patches = find_square_like_factors(num_image_tokens) + + # The first dimension of pixel_values corresponds to the total number of + # tokens (patches). + #TODO 修改1176为vit feature dim. + # 根据num_image_tokens反推原图片长宽利用原api跑一遍?还是先跑一遍得到结果后取其feature dim再构造 + pixel_values = torch.zeros( + (num_image_tokens, 1176), + dtype=self.dtype, + device=self.device + ) + + # image_grid_thw specifies the grid layout for a single image. + # Shape: (1, 3) for (t, h, w) patch counts. + image_grid_thw = torch.tensor( + [[1, h_patches, w_patches]], + dtype=torch.long, + device=self.device + ) + + return { + "pixel_values": pixel_values, + "image_grid_thw": image_grid_thw, + } + def _get_mm_dummy_batch( self, modality: str, @@ -4855,6 +4908,13 @@ def _dummy_run( slot_mapping=slot_mappings, ), ): + if cudagraph_runtime_mode == CUDAGraphMode.PIECEWISE and self.supports_mm_inputs: + # TODO: This will be improved to support different shapes. 
+ dummy_mm_inputs = self._get_dummy_vit_input(1024) + # logger.info("st.!!!!!!!!!!!!!!!!!!!") + # self.model.visual(dummy_mm_inputs["pixel_values"], grid_thw=dummy_mm_inputs["image_grid_thw"]) + self.model.get_multimodal_embeddings(**dummy_mm_inputs) + # logger.info("ed!!!!!!!!!!!!!!!!!!!") outputs = self.model( input_ids=input_ids, positions=positions, From 52da31267d1305f97178d8d2f3c90a20f0c76e4f Mon Sep 17 00:00:00 2001 From: Xingran Wang Date: Wed, 12 Nov 2025 18:22:43 +0800 Subject: [PATCH 02/35] hardcoded ViT piecewise cuda graph size without padding Signed-off-by: Xingran Wang --- vllm/compilation/cuda_graph.py | 1 - vllm/model_executor/models/qwen2_5_vl.py | 45 +++++++++++--- vllm/v1/worker/gpu_model_runner.py | 75 ++++++++++++++++++------ 3 files changed, 92 insertions(+), 29 deletions(-) diff --git a/vllm/compilation/cuda_graph.py b/vllm/compilation/cuda_graph.py index 098dd095e9c3..7ffa74d0d7e6 100644 --- a/vllm/compilation/cuda_graph.py +++ b/vllm/compilation/cuda_graph.py @@ -288,7 +288,6 @@ def __call__(self, *args: Any, **kwargs: Any) -> Any | None: entry.cudagraph = cudagraph compilation_counter.num_cudagraph_captured += 1 - logger.info(f"Compilation Counter: {compilation_counter.num_cudagraph_captured}") # important: we need to return the output, rather than # the weak ref of the output, so that pytorch can correctly diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index d92ef7bdc312..bdafcdac5200 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -641,6 +641,9 @@ def __init__( prefix=f"{prefix}.merger", ) + self._persistent_hidden_states_buffer = torch.empty((4096, 1176), device=self.device, dtype=self.dtype) + self._persistent_rotary_pos_emb_buffer = torch.empty((4096, 40), device=self.device, dtype=self.dtype) + @property def dtype(self) -> torch.dtype: return self.patch_embed.proj.weight.dtype @@ -784,7 +787,13 @@ def forward( cu_window_seqlens: 
list = [torch.tensor([0], dtype=torch.int32)] cu_seqlens: list = [] - hidden_states = x.to(device=self.device, dtype=self.dtype) + # logger.info(f"X Shape: {x.shape}") + if seq_len < 4096: + hidden_states = self._persistent_hidden_states_buffer[:seq_len] + hidden_states.copy_(x, non_blocking=True) + else: + hidden_states = x.to(device=self.device, dtype=self.dtype) + hidden_states = self.patch_embed(hidden_states) window_index_id = 0 @@ -838,18 +847,36 @@ def forward( rotary_pos_emb_sin = rotary_pos_emb_sin.to( device=self.device, non_blocking=True ) + rotary_pos_emb = rotary_pos_emb.to(device=self.device, non_blocking=True) + if seq_len < 4096: + rotary_pos_emb = self._persistent_rotary_pos_emb_buffer[:seq_len].copy_(rotary_pos_emb) window_index = window_index.to(device=hidden_states.device, non_blocking=True) reverse_indices = reverse_indices.to( device=hidden_states.device, non_blocking=True ) - - hidden_states = hidden_states.reshape( - seq_len // self.spatial_merge_unit, self.spatial_merge_unit, -1 - ) - hidden_states = hidden_states[window_index, :, :] - hidden_states = hidden_states.reshape(seq_len, -1) - - hidden_states = hidden_states.unsqueeze(1) + original_hidden_states = hidden_states # 这只是引用,不是拷贝 + # logger.info(f"Before Copy, original address: {original_hidden_states.storage().data_ptr()}") + # logger.info(f"Original Numel: {original_hidden_states.numel()}") + # Step 2: 执行一些转换操作(这些会创建新张量) + tmp = original_hidden_states.reshape(seq_len // self.spatial_merge_unit, self.spatial_merge_unit, -1) + tmp = tmp[window_index, :, :] + tmp = tmp.reshape(seq_len, -1) + tmp = tmp.unsqueeze(1) + # logger.info(f"Tmp Numel: {tmp.numel()}") + # Step 3: 将结果拷贝回原始张量的显存地址中(这是原地拷贝!) 
+ original_storage = original_hidden_states.storage() + tmp_storage = tmp.storage() + original_storage.copy_(tmp_storage) + + # Step 4: 创建一个使用原始显存、具有新 shape 的 view + # 条件:original numel 必须等于新 shape 的总元素数 + new_shape = tmp.shape # (seq_len, 1, new_hidden_dim) + hidden_states = original_hidden_states.view(new_shape) + # 现在 hidden_states.shape == new_shape,且使用和 original 相同的显存 + # logger.info(f"After Copy, original address: {original_hidden_states.storage().data_ptr()}") + # logger.info(f"After Copy, tmp address: {tmp.storage().data_ptr()}") + + # logger.info(f"Before Input to Vision Block, Shape: {hidden_states.shape}") for layer_num, blk in enumerate(self.blocks): if layer_num in self.fullatt_block_indexes: diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index d3888ee79733..3d98ba7e4ed5 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2420,11 +2420,20 @@ def _execute_mm_encoder( # 2. A list or tuple (length: num_items) of tensors, # each of shape (feature_size, hidden_size) in case the feature # size is dynamic depending on the input multimodal items. 
- - with self.timed_encoder_operation( - should_time, mm_lora_refs, current_item_idx, num_items - ): - curr_group_outputs = model.embed_multimodal(**mm_kwargs_group) + batch_descriptor = BatchDescriptor( + num_tokens=mm_kwargs_group["pixel_values"].shape[0], + ) + with set_forward_context( + None, + vllm_config=self.vllm_config, + cudagraph_runtime_mode=CUDAGraphMode.PIECEWISE, + batch_descriptor=batch_descriptor, + ), self.timed_encoder_operation( + should_time, mm_lora_refs, current_item_idx, num_items + ): + curr_group_outputs = model.get_multimodal_embeddings(**mm_kwargs_group) + # logger.info("cuda graph mm embedding complete!") + # logger.info(f"curr_group_outputs: {curr_group_outputs}") sanity_check_mm_encoder_outputs( curr_group_outputs, @@ -4592,13 +4601,12 @@ def find_square_like_factors(n: int): """Finds two factors of n that are closest to its square root.""" h = int(math.sqrt(n)) while h > 0: + h-=h&1 if n % h == 0: return h, n // h h -= 1 return 1, n - h_patches, w_patches = find_square_like_factors(num_image_tokens) - # The first dimension of pixel_values corresponds to the total number of # tokens (patches). #TODO 修改1176为vit feature dim. @@ -4611,11 +4619,19 @@ def find_square_like_factors(n: int): # image_grid_thw specifies the grid layout for a single image. # Shape: (1, 3) for (t, h, w) patch counts. 
- image_grid_thw = torch.tensor( - [[1, h_patches, w_patches]], - dtype=torch.long, - device=self.device - ) + if num_image_tokens == 3060: + image_grid_thw = torch.tensor( + [[1, 46, 34], [1, 44, 34]], + dtype=torch.long, + device=self.device + ) + else: + h_patches, w_patches = find_square_like_factors(num_image_tokens) + image_grid_thw = torch.tensor( + [[1, h_patches, w_patches]], + dtype=torch.long, + device=self.device + ) return { "pixel_values": pixel_values, @@ -4908,13 +4924,6 @@ def _dummy_run( slot_mapping=slot_mappings, ), ): - if cudagraph_runtime_mode == CUDAGraphMode.PIECEWISE and self.supports_mm_inputs: - # TODO: This will be improved to support different shapes. - dummy_mm_inputs = self._get_dummy_vit_input(1024) - # logger.info("st.!!!!!!!!!!!!!!!!!!!") - # self.model.visual(dummy_mm_inputs["pixel_values"], grid_thw=dummy_mm_inputs["image_grid_thw"]) - self.model.get_multimodal_embeddings(**dummy_mm_inputs) - # logger.info("ed!!!!!!!!!!!!!!!!!!!") outputs = self.model( input_ids=input_ids, positions=positions, @@ -5157,6 +5166,31 @@ def _dummy_pooler_run( max_task = max(output_size.items(), key=lambda x: x[1])[0] return self._dummy_pooler_run_task(hidden_states, max_task) + @torch.inference_mode() + def _dummy_mm_encoder_run( + self, + cudagraph_runtime_mode: CUDAGraphMode | None = None, + ) -> None: + logger.info("In _dummy_mm_encoder_run") + capture_sizes = [16, 32, 64, 128, 256, 512, 1024, 3060, 3128] + if cudagraph_runtime_mode == CUDAGraphMode.PIECEWISE and self.supports_mm_inputs: + # TODO: This will be improved to support different shapes. 
+ for capture_size in capture_sizes: + logger.info(f"Capturing {capture_size}") + dummy_mm_inputs = self._get_dummy_vit_input(capture_size) + batch_descriptor = BatchDescriptor( + num_tokens=capture_size, + ) + with ( + set_forward_context( + None, + vllm_config=self.vllm_config, + cudagraph_runtime_mode=cudagraph_runtime_mode, + batch_descriptor=batch_descriptor, + ), + ): + self.model.get_multimodal_embeddings(**dummy_mm_inputs) + def profile_run(self) -> None: # Profile with multimodal encoder & encoder cache. if self.supports_mm_inputs: @@ -5368,6 +5402,9 @@ def _capture_cudagraphs( num_active_loras=num_active_loras, is_graph_capturing=True, ) + + self._dummy_mm_encoder_run(cudagraph_runtime_mode) + self.maybe_remove_all_loras(self.lora_config) def initialize_attn_backend(self, kv_cache_config: KVCacheConfig) -> None: From 438b8ebff324643c0a7cb6d820634f4a41d7f247 Mon Sep 17 00:00:00 2001 From: Hongjian Zhang Date: Thu, 13 Nov 2025 16:35:32 +0800 Subject: [PATCH 03/35] feat: add vit padding Signed-off-by: Hongjian Zhang --- vllm/v1/worker/gpu_model_runner.py | 108 ++++++++++++++++++++--------- 1 file changed, 77 insertions(+), 31 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 3d98ba7e4ed5..d25acf20df8a 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -599,6 +599,13 @@ def __init__( ] self.is_mm_embed_idx = 0 + # START: Add persistent buffers for ViT inputs + # Use a large enough size for the CUDA graph + # The feature dimension is model-specific. We'll initialize + # the buffer lazily on the first run to get this dimension. + self.pixel_values_buffer: torch.Tensor | None = None + self.image_grid_thw_buffer: torch.Tensor | None = None + # Only relevant for models using M-RoPE (e.g, Qwen2-VL) if self.uses_mrope: # NOTE: `mrope_positions` is implemented with one additional dummy @@ -2420,6 +2427,46 @@ def _execute_mm_encoder( # 2. 
A list or tuple (length: num_items) of tensors, # each of shape (feature_size, hidden_size) in case the feature # size is dynamic depending on the input multimodal items. + original_num_imgs = -1 + if "pixel_values" in mm_kwargs_group: + pixel_values = mm_kwargs_group["pixel_values"] + num_tokens = pixel_values.shape[0] + + # Pad to the size expected by CUDA graph + # TODO + # padded_num_tokens = self.vllm_config.pad_for_mm_cudagraph(num_tokens) + padded_num_tokens = 4096 + + if padded_num_tokens > num_tokens: + assert(self.pixel_values_buffer is not None and self.image_grid_thw_buffer is not None) + + self.pixel_values_buffer[:num_tokens].copy_(pixel_values) # type: ignore + mm_kwargs_group["pixel_values"] = self.pixel_values_buffer[:padded_num_tokens] + + # Update image_grid_thw to account for padding + if "image_grid_thw" in mm_kwargs_group: + image_grid_thw = mm_kwargs_group["image_grid_thw"] + num_images = image_grid_thw.shape[0] + original_num_imgs = num_images + padding_amount = padded_num_tokens - num_tokens + + # Treat padding as a new virtual image. + # Assuming a fixed patch grid logic where height is merge_size. 
+ merge_size = getattr(self.model_config.hf_config.vision_config, "spatial_merge_size", 1) + assert(padding_amount % (merge_size * merge_size) == 0) + h_patches = merge_size + w_patches = padding_amount // h_patches + + self.image_grid_thw_buffer[:num_images].copy_(image_grid_thw) + self.image_grid_thw_buffer[num_images] = torch.tensor( + [1, h_patches, w_patches], + dtype=torch.long, + device=self.device + ) + mm_kwargs_group["image_grid_thw"] = self.image_grid_thw_buffer[:num_images + 1] + # END: Added padding logic for ViT CUDA Graph + + # TODO get batch_descriptor from dispatch batch_descriptor = BatchDescriptor( num_tokens=mm_kwargs_group["pixel_values"].shape[0], ) @@ -2434,7 +2481,10 @@ def _execute_mm_encoder( curr_group_outputs = model.get_multimodal_embeddings(**mm_kwargs_group) # logger.info("cuda graph mm embedding complete!") # logger.info(f"curr_group_outputs: {curr_group_outputs}") - + # START: Added cropping logic for ViT CUDA Graph + if original_num_imgs != -1: + curr_group_outputs = curr_group_outputs[:original_num_imgs] + # END: Added cropping logic for ViT CUDA Graph sanity_check_mm_encoder_outputs( curr_group_outputs, expected_num_items=num_items, @@ -4595,17 +4645,11 @@ def _get_dummy_vit_input(self, num_image_tokens: int) -> BatchedTensorInputs: `image_grid_thw` that can be passed as kwargs to `get_multimodal_embeddings`. """ - import math - - def find_square_like_factors(n: int): - """Finds two factors of n that are closest to its square root.""" - h = int(math.sqrt(n)) - while h > 0: - h-=h&1 - if n % h == 0: - return h, n // h - h -= 1 - return 1, n + def _get_dummy_h_w_patches(patches: int): + assert patches % 4 == 0, "Number of patches must be multiple of 4" + h_patches = 2 + w_patches = patches // 2 + return h_patches, w_patches # The first dimension of pixel_values corresponds to the total number of # tokens (patches). 
@@ -4617,21 +4661,12 @@ def find_square_like_factors(n: int): device=self.device ) - # image_grid_thw specifies the grid layout for a single image. - # Shape: (1, 3) for (t, h, w) patch counts. - if num_image_tokens == 3060: - image_grid_thw = torch.tensor( - [[1, 46, 34], [1, 44, 34]], - dtype=torch.long, - device=self.device - ) - else: - h_patches, w_patches = find_square_like_factors(num_image_tokens) - image_grid_thw = torch.tensor( - [[1, h_patches, w_patches]], - dtype=torch.long, - device=self.device - ) + h_patches, w_patches = _get_dummy_h_w_patches(num_image_tokens) + image_grid_thw = torch.tensor( + [[1, h_patches, w_patches]], + dtype=torch.long, + device=self.device + ) return { "pixel_values": pixel_values, @@ -5172,8 +5207,10 @@ def _dummy_mm_encoder_run( cudagraph_runtime_mode: CUDAGraphMode | None = None, ) -> None: logger.info("In _dummy_mm_encoder_run") - capture_sizes = [16, 32, 64, 128, 256, 512, 1024, 3060, 3128] - if cudagraph_runtime_mode == CUDAGraphMode.PIECEWISE and self.supports_mm_inputs: + # capture_sizes = [16, 32, 64, 128, 256, 512, 1024, 3060, 3128] + capture_sizes = [4096] + # Lazy initialization of the persistent buffer + if cudagraph_runtime_mode == CUDAGraphMode.PIECEWISE: # TODO: This will be improved to support different shapes. 
for capture_size in capture_sizes: logger.info(f"Capturing {capture_size}") @@ -5181,6 +5218,15 @@ def _dummy_mm_encoder_run( batch_descriptor = BatchDescriptor( num_tokens=capture_size, ) + if self.pixel_values_buffer is None: + self.pixel_values_buffer = torch.zeros( + (capture_sizes[-1], dummy_mm_inputs["pixel_values"].shape[1]), + dtype=self.dtype, + device=self.device + ) + self.image_grid_thw_buffer = torch.zeros(( + 200, 3), dtype=torch.long, device=self.device + ) with ( set_forward_context( None, @@ -5402,8 +5448,8 @@ def _capture_cudagraphs( num_active_loras=num_active_loras, is_graph_capturing=True, ) - - self._dummy_mm_encoder_run(cudagraph_runtime_mode) + if self.supports_mm_inputs: + self._dummy_mm_encoder_run(cudagraph_runtime_mode) self.maybe_remove_all_loras(self.lora_config) From 7eaac5ce0b7f7d86788ff8dedaf349ec5f8a0196 Mon Sep 17 00:00:00 2001 From: Hongjian Zhang Date: Thu, 13 Nov 2025 19:46:37 +0800 Subject: [PATCH 04/35] fix: fix vit cuda graph weak ref issue and first graph gc issue Signed-off-by: Hongjian Zhang --- vllm/compilation/backends.py | 41 ++++++++++++++++++++- vllm/model_executor/models/qwen2_5_vl.py | 47 +++++++++++++----------- 2 files changed, 64 insertions(+), 24 deletions(-) diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index 89981fc29963..ce2cacd0b7cd 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -48,6 +48,41 @@ logger = init_logger(__name__) +# A global flag to indicate if the current graph being compiled +# is the last one in a sequence of graphs (e.g., a sequence of blocks). +# This is a workaround to control CUDAGraph weak_ref_output behavior +# in **vit** piecewise compilation. +_is_last_graph_in_vit_sequence: bool = True + +@contextmanager +def set_is_last_graph_in_sequence(is_last: bool): + """Context manager to indicate if the current graph being compiled + is the last one in a sequence of graphs (e.g., a sequence of blocks). 
+ """ + global _is_last_graph_in_vit_sequence + original_value = _is_last_graph_in_vit_sequence + _is_last_graph_in_vit_sequence = is_last + try: + yield + finally: + _is_last_graph_in_vit_sequence = original_value + +# A global flag to indicate if the current graph being compiled +# is the first one in a sequence of graphs (e.g., a sequence of blocks). +_is_first_graph_in_vit_sequence: bool = True + +@contextmanager +def set_is_first_graph_in_sequence(is_first: bool): + """Context manager to indicate if the current graph being compiled + is the first one in a sequence of graphs (e.g., a sequence of blocks). + """ + global _is_first_graph_in_vit_sequence + original_value = _is_first_graph_in_vit_sequence + _is_first_graph_in_vit_sequence = is_first + try: + yield + finally: + _is_first_graph_in_vit_sequence = original_value def make_copy_and_call( sym_tensor_indices: list[int], @@ -449,8 +484,10 @@ def wrap_with_cudagraph_if_needed( runtime_mode=CUDAGraphMode.PIECEWISE, cudagraph_options=CUDAGraphOptions( debug_log_enable=is_first_graph, - gc_disable=not is_first_graph, - weak_ref_output=is_last_graph, + gc_disable=not is_first_graph + or not _is_first_graph_in_vit_sequence, + weak_ref_output=is_last_graph + and _is_last_graph_in_vit_sequence, ), ) diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index bdafcdac5200..8186ff244608 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -641,8 +641,8 @@ def __init__( prefix=f"{prefix}.merger", ) - self._persistent_hidden_states_buffer = torch.empty((4096, 1176), device=self.device, dtype=self.dtype) - self._persistent_rotary_pos_emb_buffer = torch.empty((4096, 40), device=self.device, dtype=self.dtype) + self._persistent_hidden_states_buffer = torch.empty((8192, 1176), device=self.device, dtype=self.dtype) + self._persistent_rotary_pos_emb_buffer = torch.empty((8192, 40), device=self.device, dtype=torch.float32) @property 
def dtype(self) -> torch.dtype: @@ -788,13 +788,15 @@ def forward( cu_seqlens: list = [] # logger.info(f"X Shape: {x.shape}") - if seq_len < 4096: + if seq_len < 8192: hidden_states = self._persistent_hidden_states_buffer[:seq_len] hidden_states.copy_(x, non_blocking=True) else: hidden_states = x.to(device=self.device, dtype=self.dtype) - hidden_states = self.patch_embed(hidden_states) + from vllm.compilation.backends import set_is_first_graph_in_sequence, set_is_last_graph_in_sequence + with set_is_first_graph_in_sequence(True), set_is_last_graph_in_sequence(False): + hidden_states = self.patch_embed(hidden_states) window_index_id = 0 cu_window_seqlens_last = 0 @@ -848,7 +850,7 @@ def forward( device=self.device, non_blocking=True ) rotary_pos_emb = rotary_pos_emb.to(device=self.device, non_blocking=True) - if seq_len < 4096: + if seq_len < 8192: rotary_pos_emb = self._persistent_rotary_pos_emb_buffer[:seq_len].copy_(rotary_pos_emb) window_index = window_index.to(device=hidden_states.device, non_blocking=True) reverse_indices = reverse_indices.to( @@ -877,22 +879,22 @@ def forward( # logger.info(f"After Copy, tmp address: {tmp.storage().data_ptr()}") # logger.info(f"Before Input to Vision Block, Shape: {hidden_states.shape}") - - for layer_num, blk in enumerate(self.blocks): - if layer_num in self.fullatt_block_indexes: - cu_seqlens_now = cu_seqlens - max_seqlen_now = max_seqlen_full - else: - cu_seqlens_now = cu_window_seqlens - max_seqlen_now = max_seqlen_window - - hidden_states = blk( - hidden_states, - cu_seqlens=cu_seqlens_now, - rotary_pos_emb_cos=rotary_pos_emb_cos, - rotary_pos_emb_sin=rotary_pos_emb_sin, - max_seqlen=max_seqlen_now, - ) + with set_is_first_graph_in_sequence(False), set_is_last_graph_in_sequence(False): + for layer_num, blk in enumerate(self.blocks): + if layer_num in self.fullatt_block_indexes: + cu_seqlens_now = cu_seqlens + max_seqlen_now = max_seqlen_full + else: + cu_seqlens_now = cu_window_seqlens + max_seqlen_now = 
max_seqlen_window + + hidden_states = blk( + hidden_states, + cu_seqlens=cu_seqlens_now, + rotary_pos_emb_cos=rotary_pos_emb_cos, + rotary_pos_emb_sin=rotary_pos_emb_sin, + max_seqlen=max_seqlen_now, + ) # For Qwen2.5-VL-3B, float16 will overflow at last block # for long visual tokens sequences. @@ -900,7 +902,8 @@ def forward( hidden_states = cast_overflow_tensors(hidden_states) # adapter - hidden_states = self.merger(hidden_states) + with set_is_first_graph_in_sequence(False), set_is_last_graph_in_sequence(True): + hidden_states = self.merger(hidden_states) hidden_states = hidden_states[reverse_indices, :] return hidden_states From 330fe8605228d60d18b11af9d3dca53dc50d90ac Mon Sep 17 00:00:00 2001 From: Hongjian Zhang Date: Tue, 18 Nov 2025 13:02:44 +0800 Subject: [PATCH 05/35] feat: add vit cudagraph capture sizes and related functionality Signed-off-by: Hongjian Zhang --- vllm/config/compilation.py | 4 ++ vllm/config/vllm.py | 53 +++++++++++++++ vllm/engine/arg_utils.py | 15 +++++ vllm/v1/worker/gpu_model_runner.py | 103 ++++++++++++++++------------- 4 files changed, 128 insertions(+), 47 deletions(-) diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py index 7a69629f707c..b74988b2a711 100644 --- a/vllm/config/compilation.py +++ b/vllm/config/compilation.py @@ -530,6 +530,10 @@ class CompilationConfig: """Sizes to capture cudagraph. - None (default): capture sizes are inferred from vllm config. - list[int]: capture sizes are specified as given.""" + vit_cudagraph_capture_sizes: list[int] | None = None + """Sizes to capture vit cudagraph. + - None (default): capture sizes are inferred from vllm config. + - list[int]: capture sizes are specified as given.""" cudagraph_copy_inputs: bool = False """Whether to copy input tensors for cudagraph. 
If the caller can guarantee that the same input buffers diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py index ef18ce03d88e..ad3c9ea80a88 100644 --- a/vllm/config/vllm.py +++ b/vllm/config/vllm.py @@ -15,6 +15,7 @@ from functools import lru_cache from pathlib import Path from typing import TYPE_CHECKING, Any, TypeVar, get_args +import bisect import torch from pydantic import ConfigDict, Field, model_validator @@ -360,6 +361,21 @@ def compute_hash(self) -> str: ] return hash_str + def pad_for_cudagraph(self, batch_size: int) -> int: + # if batch_size > self.compilation_config.max_cudagraph_capture_size, + # it should raise an IndexError. + # the caller should make sure the batch_size is within the range, + # i.e., batch_size <= self.compilation_config.max_cudagraph_capture_size + return self.compilation_config.bs_to_padded_graph_size[batch_size] + + def pad_for_vit_cudagraph(self, batch_size: int) -> int: + capture_sizes = self.compilation_config.vit_cudagraph_capture_sizes + # Find the insertion point for batch_size to maintain order. + # This gives the index of the first element >= batch_size. 
+ idx = bisect.bisect_left(capture_sizes, batch_size) + + return capture_sizes[idx] if idx < len(capture_sizes) else batch_size + @property def needs_dp_coordinator(self) -> bool: """ @@ -815,6 +831,7 @@ def has_blocked_weights(): self.compilation_config.cudagraph_num_of_warmups = 1 self._set_cudagraph_sizes() + self._set_vit_cudagraph_sizes() else: self.compilation_config.cudagraph_mode = CUDAGraphMode.NONE @@ -1333,6 +1350,42 @@ def _set_compile_ranges(self): compilation_config.compile_ranges_split_points = sorted( computed_compile_ranges_split_points ) + def _set_vit_cudagraph_sizes(self): + if ( + self.model_config is not None + and not self.model_config.enforce_eager + and self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE + ): + # determine the vit_cudagraph_capture_sizes + if self.compilation_config.vit_cudagraph_capture_sizes is not None: + assert len(self.compilation_config.vit_cudagraph_capture_sizes) > 0, ( + "vit_cudagraph_capture_sizes should contain at least one element " + "when using cuda graph." 
+ ) + # sort to make sure the sizes are in ascending order + self.compilation_config.vit_cudagraph_capture_sizes.sort() + # de-duplicate the sizes provided by the config + dedup_sizes = list(set(self.compilation_config.vit_cudagraph_capture_sizes)) + vit_cudagraph_capture_sizes = dedup_sizes + else: + max_vit_cudagraph_capture_size = 5120 + vit_cudagraph_capture_sizes = [ + i for i in [16, 32, 64, 128, 256] if i <= max_vit_cudagraph_capture_size + ] + if max_vit_cudagraph_capture_size >= 1024: + # Step size 64 for small batch sizes, up to 2048(not included) + vit_cudagraph_capture_sizes += list( + range(512, min(max_vit_cudagraph_capture_size + 1, 2048), 64) + ) + if max_vit_cudagraph_capture_size >= 2048: + # Step size 128 for larger batch sizes + vit_cudagraph_capture_sizes += list( + range(2048, max_vit_cudagraph_capture_size + 1, 128) + ) + self.compilation_config.vit_cudagraph_capture_sizes = vit_cudagraph_capture_sizes + else: + # no cudagraph in use + self.compilation_config.vit_cudagraph_capture_sizes = [] def try_verify_and_update_config(self): if self.model_config is None: diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index f3e7729f64e3..b6d901581f6c 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -378,6 +378,9 @@ class EngineArgs: max_cudagraph_capture_size: int | None = get_field( CompilationConfig, "max_cudagraph_capture_size" ) + vit_cudagraph_capture_sizes: list[int] | None = ( + CompilationConfig.vit_cudagraph_capture_sizes + ) # Note: Specifying a custom executor backend by passing a class # is intended for expert use only. The API may change without # notice. 
@@ -1148,6 +1151,9 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: compilation_group.add_argument( "--cudagraph-capture-sizes", **compilation_kwargs["cudagraph_capture_sizes"] ) + compilation_group.add_argument( + "--vit-cudagraph-capture-sizes", **compilation_kwargs["vit_cudagraph_capture_sizes"] + ) compilation_group.add_argument( "--max-cudagraph-capture-size", **compilation_kwargs["max_cudagraph_capture_size"], @@ -1737,6 +1743,15 @@ def create_engine_config( "cudagraph_capture_sizes are mutually exclusive" ) compilation_config.cudagraph_capture_sizes = self.cudagraph_capture_sizes + + if self.vit_cudagraph_capture_sizes is not None: + if compilation_config.vit_cudagraph_capture_sizes is not None: + raise ValueError( + "vit_cudagraph_capture_sizes and compilation_config." + "vit_cudagraph_capture_sizes are mutually exclusive" + ) + compilation_config.vit_cudagraph_capture_sizes = self.vit_cudagraph_capture_sizes + if self.max_cudagraph_capture_size is not None: if compilation_config.max_cudagraph_capture_size is not None: raise ValueError( diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index d25acf20df8a..f3a2c4a5c0fe 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -552,6 +552,14 @@ def __init__( self.cudagraph_batch_sizes = sorted( self.compilation_config.cudagraph_capture_sizes ) + # self.vit_cudagraph_batch_sizes sorts in ascending order. + if ( + self.compilation_config.vit_cudagraph_capture_sizes + and self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE + ): + self.vit_cudagraph_batch_sizes = sorted( + self.compilation_config.vit_cudagraph_capture_sizes + ) # Cache the device properties. 
self._init_device_properties() @@ -2433,9 +2441,7 @@ def _execute_mm_encoder( num_tokens = pixel_values.shape[0] # Pad to the size expected by CUDA graph - # TODO - # padded_num_tokens = self.vllm_config.pad_for_mm_cudagraph(num_tokens) - padded_num_tokens = 4096 + padded_num_tokens = self.vllm_config.pad_for_vit_cudagraph(num_tokens) if padded_num_tokens > num_tokens: assert(self.pixel_values_buffer is not None and self.image_grid_thw_buffer is not None) @@ -2464,7 +2470,6 @@ def _execute_mm_encoder( device=self.device ) mm_kwargs_group["image_grid_thw"] = self.image_grid_thw_buffer[:num_images + 1] - # END: Added padding logic for ViT CUDA Graph # TODO get batch_descriptor from dispatch batch_descriptor = BatchDescriptor( @@ -2479,12 +2484,9 @@ def _execute_mm_encoder( should_time, mm_lora_refs, current_item_idx, num_items ): curr_group_outputs = model.get_multimodal_embeddings(**mm_kwargs_group) - # logger.info("cuda graph mm embedding complete!") - # logger.info(f"curr_group_outputs: {curr_group_outputs}") - # START: Added cropping logic for ViT CUDA Graph + # Remove the padded items before sanity check if original_num_imgs != -1: curr_group_outputs = curr_group_outputs[:original_num_imgs] - # END: Added cropping logic for ViT CUDA Graph sanity_check_mm_encoder_outputs( curr_group_outputs, expected_num_items=num_items, @@ -4628,7 +4630,7 @@ def rand_inputs_embeds() -> torch.Tensor: yield inputs_embeds.fill_(0) - def _get_dummy_vit_input(self, num_image_tokens: int) -> BatchedTensorInputs: + def _get_dummy_vit_input(self, num_image_tokens: int, img_feature_dim: int) -> BatchedTensorInputs: """ Generates dummy multimodal inputs for a single image, with a controllable number of resulting image tokens for a Vision Transformer (ViT) like model, @@ -4646,17 +4648,16 @@ def _get_dummy_vit_input(self, num_image_tokens: int) -> BatchedTensorInputs: `get_multimodal_embeddings`. 
""" def _get_dummy_h_w_patches(patches: int): - assert patches % 4 == 0, "Number of patches must be multiple of 4" - h_patches = 2 - w_patches = patches // 2 + merge_size = getattr(self.model_config.hf_config.vision_config, "spatial_merge_size", 1) + assert(patches % (merge_size * merge_size) == 0, "Number of patches must be multiple of merge_size squared") + h_patches = merge_size + w_patches = patches // merge_size return h_patches, w_patches # The first dimension of pixel_values corresponds to the total number of # tokens (patches). - #TODO 修改1176为vit feature dim. - # 根据num_image_tokens反推原图片长宽利用原api跑一遍?还是先跑一遍得到结果后取其feature dim再构造 pixel_values = torch.zeros( - (num_image_tokens, 1176), + (num_image_tokens, img_feature_dim), dtype=self.dtype, device=self.device ) @@ -5204,38 +5205,43 @@ def _dummy_pooler_run( @torch.inference_mode() def _dummy_mm_encoder_run( self, - cudagraph_runtime_mode: CUDAGraphMode | None = None, + compilation_cases: list[int], ) -> None: - logger.info("In _dummy_mm_encoder_run") - # capture_sizes = [16, 32, 64, 128, 256, 512, 1024, 3060, 3128] - capture_sizes = [4096] - # Lazy initialization of the persistent buffer - if cudagraph_runtime_mode == CUDAGraphMode.PIECEWISE: - # TODO: This will be improved to support different shapes. 
- for capture_size in capture_sizes: - logger.info(f"Capturing {capture_size}") - dummy_mm_inputs = self._get_dummy_vit_input(capture_size) - batch_descriptor = BatchDescriptor( - num_tokens=capture_size, - ) - if self.pixel_values_buffer is None: - self.pixel_values_buffer = torch.zeros( - (capture_sizes[-1], dummy_mm_inputs["pixel_values"].shape[1]), - dtype=self.dtype, - device=self.device + if self.pixel_values_buffer is None: + tmp_dummy_mm_inputs = self._get_mm_dummy_batch( + "image", + 1, ) - self.image_grid_thw_buffer = torch.zeros(( - 200, 3), dtype=torch.long, device=self.device - ) - with ( - set_forward_context( - None, - vllm_config=self.vllm_config, - cudagraph_runtime_mode=cudagraph_runtime_mode, - batch_descriptor=batch_descriptor, - ), - ): - self.model.get_multimodal_embeddings(**dummy_mm_inputs) + img_feature_dim = tmp_dummy_mm_inputs["pixel_values"].shape[1] + self.pixel_values_buffer = torch.zeros( + (compilation_cases[0], img_feature_dim), + dtype=self.dtype, + device=self.device + ) + self.image_grid_thw_buffer = torch.zeros(( + 512, 3), dtype=torch.long, device=self.device + ) + if is_global_first_rank(): + compilation_cases = tqdm( + compilation_cases, + disable=not self.load_config.use_tqdm_on_load, + desc="Capturing Vit CUDA graphs (PIECEWISE)", + ) + # Lazy initialization of the persistent buffer + for capture_size in compilation_cases: + dummy_mm_inputs = self._get_dummy_vit_input(capture_size, img_feature_dim) + batch_descriptor = BatchDescriptor( + num_tokens=capture_size, + ) + with ( + set_forward_context( + None, + vllm_config=self.vllm_config, + cudagraph_runtime_mode=CUDAGraphMode.PIECEWISE, + batch_descriptor=batch_descriptor, + ), + ): + self.model.get_multimodal_embeddings(**dummy_mm_inputs) def profile_run(self) -> None: # Profile with multimodal encoder & encoder cache. 
@@ -5448,8 +5454,11 @@ def _capture_cudagraphs( num_active_loras=num_active_loras, is_graph_capturing=True, ) - if self.supports_mm_inputs: - self._dummy_mm_encoder_run(cudagraph_runtime_mode) + if cudagraph_runtime_mode == CUDAGraphMode.PIECEWISE and self.supports_mm_inputs: + vit_capture_sizes = self.vit_cudagraph_batch_sizes + if vit_capture_sizes: + compilation_cases_vit = list(reversed(vit_capture_sizes)) + self._dummy_mm_encoder_run(compilation_cases_vit) self.maybe_remove_all_loras(self.lora_config) From fac98f96887edcefe3095a008a2acf5db9e32ab9 Mon Sep 17 00:00:00 2001 From: Xingran Wang Date: Tue, 18 Nov 2025 16:47:23 +0800 Subject: [PATCH 06/35] ViT cuda graph dispatcher Signed-off-by: Xingran Wang --- vllm/forward_context.py | 5 +++++ vllm/v1/cudagraph_dispatcher.py | 8 ++++++++ vllm/v1/worker/gpu_model_runner.py | 12 +++++++++--- 3 files changed, 22 insertions(+), 3 deletions(-) diff --git a/vllm/forward_context.py b/vllm/forward_context.py index e308c05bc669..1b89c04851f0 100644 --- a/vllm/forward_context.py +++ b/vllm/forward_context.py @@ -55,6 +55,10 @@ class BatchDescriptor(NamedTuple): (like fused_moe_lora) whose grid size depends on num_active_loras to be properly captured. 
""" + is_vit: bool = False + """ + ViT Piecewise CUDA Graph Flag + """ def relax_for_mixed_batch_cudagraphs(self) -> "BatchDescriptor": """ @@ -67,6 +71,7 @@ def relax_for_mixed_batch_cudagraphs(self) -> "BatchDescriptor": uniform=False, has_lora=self.has_lora, num_active_loras=self.num_active_loras, + is_vit=self.is_vit, ) diff --git a/vllm/v1/cudagraph_dispatcher.py b/vllm/v1/cudagraph_dispatcher.py index 6f3e029c793b..3368f97fe3b3 100644 --- a/vllm/v1/cudagraph_dispatcher.py +++ b/vllm/v1/cudagraph_dispatcher.py @@ -186,6 +186,14 @@ def initialize_cudagraph_keys( bs, False, num_active_loras > 0, num_active_loras ).relax_for_mixed_batch_cudagraphs(), ) + # ViT CUDAGraph Entry + for vit_patch_len in self.compilation_config.vit_cudagraph_capture_sizes: + self.add_cudagraph_key( + cudagraph_mode.mixed_mode(), + BatchDescriptor( + num_tokens=vit_patch_len, uniform_decode=False, is_vit=True + ), + ) # if decode cudagraph mode is FULL, and we don't already have mixed # mode full cudagraphs then add them here. diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index f3a2c4a5c0fe..0128262de226 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2436,6 +2436,7 @@ def _execute_mm_encoder( # each of shape (feature_size, hidden_size) in case the feature # size is dynamic depending on the input multimodal items. 
original_num_imgs = -1 + padded_num_tokens = -1 if "pixel_values" in mm_kwargs_group: pixel_values = mm_kwargs_group["pixel_values"] num_tokens = pixel_values.shape[0] @@ -2471,14 +2472,18 @@ def _execute_mm_encoder( ) mm_kwargs_group["image_grid_thw"] = self.image_grid_thw_buffer[:num_images + 1] - # TODO get batch_descriptor from dispatch + # get batch_descriptor from dispatcher batch_descriptor = BatchDescriptor( - num_tokens=mm_kwargs_group["pixel_values"].shape[0], + num_tokens=padded_num_tokens, + is_vit=True, + ) + cudagraph_runtime_mode, batch_descriptor = ( + self.cudagraph_dispatcher.dispatch(batch_descriptor, False) ) with set_forward_context( None, vllm_config=self.vllm_config, - cudagraph_runtime_mode=CUDAGraphMode.PIECEWISE, + cudagraph_runtime_mode=cudagraph_runtime_mode, batch_descriptor=batch_descriptor, ), self.timed_encoder_operation( should_time, mm_lora_refs, current_item_idx, num_items @@ -5232,6 +5237,7 @@ def _dummy_mm_encoder_run( dummy_mm_inputs = self._get_dummy_vit_input(capture_size, img_feature_dim) batch_descriptor = BatchDescriptor( num_tokens=capture_size, + is_vit=True, ) with ( set_forward_context( From 2762ba685a7bfa8130c7b50c8589d27c949a717e Mon Sep 17 00:00:00 2001 From: Hongjian Zhang Date: Tue, 18 Nov 2025 19:44:46 +0800 Subject: [PATCH 07/35] feat: update Qwen2.5-VL model to support dynamic buffer sizes based on CUDA graph capture settings Signed-off-by: Hongjian Zhang --- vllm/model_executor/models/qwen2_5_vl.py | 32 +++++++++++++----------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index 8186ff244608..9d59671c6772 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -43,10 +43,10 @@ ) from vllm.compilation.decorators import support_torch_compile -from vllm.config import VllmConfig +from vllm.config import CUDAGraphMode, MultiModalConfig, VllmConfig, 
get_current_vllm_config from vllm.distributed import parallel_state from vllm.distributed import utils as dist_utils -from vllm.forward_context import set_forward_context +from vllm.forward_context import get_forward_context, set_forward_context from vllm.logger import init_logger from vllm.model_executor.layers.activation import get_act_and_mul_fn from vllm.model_executor.layers.attention import MMEncoderAttention @@ -640,9 +640,17 @@ def __init__( quant_config=quant_config, prefix=f"{prefix}.merger", ) - - self._persistent_hidden_states_buffer = torch.empty((8192, 1176), device=self.device, dtype=self.dtype) - self._persistent_rotary_pos_emb_buffer = torch.empty((8192, 40), device=self.device, dtype=torch.float32) + vllm_config: VllmConfig = get_current_vllm_config() + self._persistent_hidden_states_buffer = None + self._persistent_rotary_pos_emb_buffer = None + if vllm_config.compilation_config.vit_cudagraph_capture_sizes: + max_compile_size = vllm_config.compilation_config.vit_cudagraph_capture_sizes[-1] + self._persistent_hidden_states_buffer = torch.empty( + (max_compile_size, self.patch_embed.proj.input_size), device=self.device, dtype=self.dtype + ) + self._persistent_rotary_pos_emb_buffer = torch.empty( + (max_compile_size, head_dim // 2), device=self.device, dtype=torch.float32 + ) @property def dtype(self) -> torch.dtype: @@ -787,8 +795,9 @@ def forward( cu_window_seqlens: list = [torch.tensor([0], dtype=torch.int32)] cu_seqlens: list = [] - # logger.info(f"X Shape: {x.shape}") - if seq_len < 8192: + fwd_ctx = get_forward_context() + if self._persistent_hidden_states_buffer is not None and \ + fwd_ctx and fwd_ctx.cudagraph_runtime_mode == CUDAGraphMode.PIECEWISE: hidden_states = self._persistent_hidden_states_buffer[:seq_len] hidden_states.copy_(x, non_blocking=True) else: @@ -850,21 +859,19 @@ def forward( device=self.device, non_blocking=True ) rotary_pos_emb = rotary_pos_emb.to(device=self.device, non_blocking=True) - if seq_len < 8192: + if 
self._persistent_rotary_pos_emb_buffer is not None and \ + fwd_ctx and fwd_ctx.cudagraph_runtime_mode == CUDAGraphMode.PIECEWISE: rotary_pos_emb = self._persistent_rotary_pos_emb_buffer[:seq_len].copy_(rotary_pos_emb) window_index = window_index.to(device=hidden_states.device, non_blocking=True) reverse_indices = reverse_indices.to( device=hidden_states.device, non_blocking=True ) original_hidden_states = hidden_states # 这只是引用,不是拷贝 - # logger.info(f"Before Copy, original address: {original_hidden_states.storage().data_ptr()}") - # logger.info(f"Original Numel: {original_hidden_states.numel()}") # Step 2: 执行一些转换操作(这些会创建新张量) tmp = original_hidden_states.reshape(seq_len // self.spatial_merge_unit, self.spatial_merge_unit, -1) tmp = tmp[window_index, :, :] tmp = tmp.reshape(seq_len, -1) tmp = tmp.unsqueeze(1) - # logger.info(f"Tmp Numel: {tmp.numel()}") # Step 3: 将结果拷贝回原始张量的显存地址中(这是原地拷贝!) original_storage = original_hidden_states.storage() tmp_storage = tmp.storage() @@ -875,10 +882,7 @@ def forward( new_shape = tmp.shape # (seq_len, 1, new_hidden_dim) hidden_states = original_hidden_states.view(new_shape) # 现在 hidden_states.shape == new_shape,且使用和 original 相同的显存 - # logger.info(f"After Copy, original address: {original_hidden_states.storage().data_ptr()}") - # logger.info(f"After Copy, tmp address: {tmp.storage().data_ptr()}") - # logger.info(f"Before Input to Vision Block, Shape: {hidden_states.shape}") with set_is_first_graph_in_sequence(False), set_is_last_graph_in_sequence(False): for layer_num, blk in enumerate(self.blocks): if layer_num in self.fullatt_block_indexes: From fb9225e50414c069bdaefed1c1685c4452b5629c Mon Sep 17 00:00:00 2001 From: Hongjian Zhang Date: Wed, 19 Nov 2025 16:15:46 +0800 Subject: [PATCH 08/35] fix: Ordering vit_cudagraph capture sizes and disable vit dp mode Signed-off-by: Hongjian Zhang --- vllm/config/vllm.py | 6 ++++-- vllm/model_executor/models/qwen2_5_vl.py | 4 ++-- vllm/v1/worker/gpu_model_runner.py | 4 +++- 3 files changed, 9 
insertions(+), 5 deletions(-) diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py index ad3c9ea80a88..fc3a421a3d76 100644 --- a/vllm/config/vllm.py +++ b/vllm/config/vllm.py @@ -1355,6 +1355,8 @@ def _set_vit_cudagraph_sizes(self): self.model_config is not None and not self.model_config.enforce_eager and self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE + and self.model_config.multimodal_config is not None + and self.model_config.multimodal_config.mm_encoder_tp_mode != "data" ): # determine the vit_cudagraph_capture_sizes if self.compilation_config.vit_cudagraph_capture_sizes is not None: @@ -1362,11 +1364,11 @@ def _set_vit_cudagraph_sizes(self): "vit_cudagraph_capture_sizes should contain at least one element " "when using cuda graph." ) - # sort to make sure the sizes are in ascending order - self.compilation_config.vit_cudagraph_capture_sizes.sort() # de-duplicate the sizes provided by the config dedup_sizes = list(set(self.compilation_config.vit_cudagraph_capture_sizes)) vit_cudagraph_capture_sizes = dedup_sizes + # sort to make sure the sizes are in ascending order + vit_cudagraph_capture_sizes.sort() else: max_vit_cudagraph_capture_size = 5120 vit_cudagraph_capture_sizes = [ diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index 9d59671c6772..e4976fb2e416 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -641,8 +641,8 @@ def __init__( prefix=f"{prefix}.merger", ) vllm_config: VllmConfig = get_current_vllm_config() - self._persistent_hidden_states_buffer = None - self._persistent_rotary_pos_emb_buffer = None + self._persistent_hidden_states_buffer: torch.Tensor | None = None + self._persistent_rotary_pos_emb_buffer: torch.Tensor | None = None if vllm_config.compilation_config.vit_cudagraph_capture_sizes: max_compile_size = vllm_config.compilation_config.vit_cudagraph_capture_sizes[-1] self._persistent_hidden_states_buffer = torch.empty( diff 
--git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 0128262de226..8fc731e245c9 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -560,6 +560,8 @@ def __init__( self.vit_cudagraph_batch_sizes = sorted( self.compilation_config.vit_cudagraph_capture_sizes ) + else: + self.vit_cudagraph_batch_sizes = None # Cache the device properties. self._init_device_properties() @@ -4654,7 +4656,7 @@ def _get_dummy_vit_input(self, num_image_tokens: int, img_feature_dim: int) -> B """ def _get_dummy_h_w_patches(patches: int): merge_size = getattr(self.model_config.hf_config.vision_config, "spatial_merge_size", 1) - assert(patches % (merge_size * merge_size) == 0, "Number of patches must be multiple of merge_size squared") + assert(patches % (merge_size * merge_size) == 0), "Number of patches must be multiple of merge_size squared" h_patches = merge_size w_patches = patches // merge_size return h_patches, w_patches From c216a0c11e78277a2e061f171fbefa6846dd932a Mon Sep 17 00:00:00 2001 From: Hongjian Zhang Date: Thu, 20 Nov 2025 11:14:05 +0800 Subject: [PATCH 09/35] chore: Optimize code structure and add documentation Signed-off-by: Hongjian Zhang --- docs/design/torch_compile.md | 6 +++++ vllm/config/vllm.py | 4 --- vllm/model_executor/models/qwen2_5_vl.py | 32 ++++++++++++------------ 3 files changed, 22 insertions(+), 20 deletions(-) diff --git a/docs/design/torch_compile.md b/docs/design/torch_compile.md index 4dc0da0c7d65..3cef2165543e 100644 --- a/docs/design/torch_compile.md +++ b/docs/design/torch_compile.md @@ -253,6 +253,12 @@ By default, vLLM will try to determine a set of sizes to capture cudagraph. 
You vllm serve meta-llama/Llama-3.2-1B \ --compilation-config '{"cudagraph_capture_sizes": [1, 2, 4, 8]}' ``` +Similarly, For `Qwen2.5-VL` series model, you can specify the capture sizes for the vision transformer (ViT) using `vit_cudagraph_capture_sizes`, the capture sizes should be multiples of the square of `merge_size`. Note that ViT DP mode is **not supported**. You can use `--compilation-config '{"vit_cudagraph_capture_sizes": []}'` to disable only the ViT part of the CUDA graph, or use `--enforce-eager` to disable the entire CUDA graph. + +```bash +vllm serve Qwen/Qwen2.5-VL-3B-Instruct \ + --compilation-config '{"vit_cudagraph_capture_sizes": [512, 1024]}' +``` Then it will only capture cudagraph for the specified sizes. It can be useful to have fine-grained control over the cudagraph capture. diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py index fc3a421a3d76..84a0c7686054 100644 --- a/vllm/config/vllm.py +++ b/vllm/config/vllm.py @@ -1360,10 +1360,6 @@ def _set_vit_cudagraph_sizes(self): ): # determine the vit_cudagraph_capture_sizes if self.compilation_config.vit_cudagraph_capture_sizes is not None: - assert len(self.compilation_config.vit_cudagraph_capture_sizes) > 0, ( - "vit_cudagraph_capture_sizes should contain at least one element " - "when using cuda graph." 
- ) # de-duplicate the sizes provided by the config dedup_sizes = list(set(self.compilation_config.vit_cudagraph_capture_sizes)) vit_cudagraph_capture_sizes = dedup_sizes diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index e4976fb2e416..c6c7a1adb0ce 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -866,22 +866,22 @@ def forward( reverse_indices = reverse_indices.to( device=hidden_states.device, non_blocking=True ) - original_hidden_states = hidden_states # 这只是引用,不是拷贝 - # Step 2: 执行一些转换操作(这些会创建新张量) - tmp = original_hidden_states.reshape(seq_len // self.spatial_merge_unit, self.spatial_merge_unit, -1) - tmp = tmp[window_index, :, :] - tmp = tmp.reshape(seq_len, -1) - tmp = tmp.unsqueeze(1) - # Step 3: 将结果拷贝回原始张量的显存地址中(这是原地拷贝!) - original_storage = original_hidden_states.storage() - tmp_storage = tmp.storage() - original_storage.copy_(tmp_storage) - - # Step 4: 创建一个使用原始显存、具有新 shape 的 view - # 条件:original numel 必须等于新 shape 的总元素数 - new_shape = tmp.shape # (seq_len, 1, new_hidden_dim) - hidden_states = original_hidden_states.view(new_shape) - # 现在 hidden_states.shape == new_shape,且使用和 original 相同的显存 + + original_hidden_states = hidden_states + hidden_states = hidden_states.reshape( + seq_len // self.spatial_merge_unit, self.spatial_merge_unit, -1 + ) + hidden_states = hidden_states[window_index, :, :] + hidden_states = hidden_states.reshape(seq_len, -1) + hidden_states = hidden_states.unsqueeze(1) + + if self._persistent_hidden_states_buffer is not None and \ + fwd_ctx and fwd_ctx.cudagraph_runtime_mode == CUDAGraphMode.PIECEWISE: + # The above operations will produce temporary new tensors. 
+ # That is not friendly to cudagraphs, so we need to copy them back to the persistent buffer + original_hidden_states = original_hidden_states.view(hidden_states.shape) + original_hidden_states.copy_(hidden_states) + hidden_states = original_hidden_states with set_is_first_graph_in_sequence(False), set_is_last_graph_in_sequence(False): for layer_num, blk in enumerate(self.blocks): From c85b49b2fbf3a4048e883e02518bf3cd46ced105 Mon Sep 17 00:00:00 2001 From: Hongjian Zhang Date: Thu, 20 Nov 2025 14:01:16 +0800 Subject: [PATCH 10/35] chore: rebase to v0.11.1 Signed-off-by: Hongjian Zhang --- docs/design/torch_compile.md | 5 ++--- vllm/config/vllm.py | 34 ++++++++++++++++++++++++++++++ vllm/v1/worker/gpu_model_runner.py | 6 +++--- 3 files changed, 39 insertions(+), 6 deletions(-) diff --git a/docs/design/torch_compile.md b/docs/design/torch_compile.md index 3cef2165543e..041f029294e4 100644 --- a/docs/design/torch_compile.md +++ b/docs/design/torch_compile.md @@ -253,11 +253,10 @@ By default, vLLM will try to determine a set of sizes to capture cudagraph. You vllm serve meta-llama/Llama-3.2-1B \ --compilation-config '{"cudagraph_capture_sizes": [1, 2, 4, 8]}' ``` -Similarly, For `Qwen2.5-VL` series model, you can specify the capture sizes for the vision transformer (ViT) using `vit_cudagraph_capture_sizes`, the capture sizes should be multiples of the square of `merge_size`. Note that ViT DP mode is **not supported**. You can use `--compilation-config '{"vit_cudagraph_capture_sizes": []}'` to disable only the ViT part of the CUDA graph, or use `--enforce-eager` to disable the entire CUDA graph. - +Similarly, For `Qwen2.5-VL` series model, you can specify the capture sizes for the vision transformer (ViT) using `vit_cudagraph_capture_sizes`, the capture sizes should be multiples of the square of `merge_size`. Note that ViT DP mode is **not supported**. By default, this is disabled as `compile_mm_encoder` is `False`. 
To enable it and specify capture sizes, you can do the following: ```bash vllm serve Qwen/Qwen2.5-VL-3B-Instruct \ - --compilation-config '{"vit_cudagraph_capture_sizes": [512, 1024]}' + --compilation-config '{"compile_mm_encoder": true, "vit_cudagraph_capture_sizes": [512, 1024]}' ``` Then it will only capture cudagraph for the specified sizes. It can be useful to have fine-grained control over the cudagraph capture. diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py index 84a0c7686054..f2b522a714a3 100644 --- a/vllm/config/vllm.py +++ b/vllm/config/vllm.py @@ -1351,10 +1351,44 @@ def _set_compile_ranges(self): computed_compile_ranges_split_points ) def _set_vit_cudagraph_sizes(self): + """Sets the CUDA graph capture sizes for the Vision Transformer (ViT). + + This method determines the batch sizes for which ViT CUDA graphs will be + captured. CUDA graphs improve performance by reducing kernel launch + overhead for the vision encoder. + + The logic is as follows: + 1. The feature is only enabled if all of the following conditions are met: + - Eager mode is not enforced. + - CUDA graph mode is enabled. + - The multimodal encoder compilation is enabled. + - A multimodal config is present. + - The multimodal encoder tensor parallelism mode is not "data". + If these conditions are not met, the list of capture sizes will be empty, + effectively disabling ViT CUDA graphs. + + 2. If the user has explicitly provided `vit_cudagraph_capture_sizes` in the + compilation config, those sizes are used. The list is de-duplicated + and sorted in ascending order. + + 3. If no sizes are provided by the user, a default list of sizes is + generated up to a maximum of 5120. The default sizes are: + [16, 32, 64, 128, 256] + list(range(512, 2048, 64)) + list( + range(2048, 5120 + 1, 128)) + + The final list of sizes is stored in + `self.compilation_config.vit_cudagraph_capture_sizes`. 
+ + - If a batch's size matches or is smaller than a captured size, the + closest captured graph is used. + - If a batch's size is larger than the largest captured size, a CUDA + graph will not be used for that batch. + """ if ( self.model_config is not None and not self.model_config.enforce_eager and self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE + and self.compilation_config.compile_mm_encoder and self.model_config.multimodal_config is not None and self.model_config.multimodal_config.mm_encoder_tp_mode != "data" ): diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 8fc731e245c9..ed87b8a2bc63 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2490,7 +2490,7 @@ def _execute_mm_encoder( ), self.timed_encoder_operation( should_time, mm_lora_refs, current_item_idx, num_items ): - curr_group_outputs = model.get_multimodal_embeddings(**mm_kwargs_group) + curr_group_outputs = model.embed_multimodal(**mm_kwargs_group) # Remove the padded items before sanity check if original_num_imgs != -1: curr_group_outputs = curr_group_outputs[:original_num_imgs] @@ -4652,7 +4652,7 @@ def _get_dummy_vit_input(self, num_image_tokens: int, img_feature_dim: int) -> B Returns: A BatchedTensorInputs dictionary containing `pixel_values` and `image_grid_thw` that can be passed as kwargs to - `get_multimodal_embeddings`. + `embed_multimodal`. """ def _get_dummy_h_w_patches(patches: int): merge_size = getattr(self.model_config.hf_config.vision_config, "spatial_merge_size", 1) @@ -5249,7 +5249,7 @@ def _dummy_mm_encoder_run( batch_descriptor=batch_descriptor, ), ): - self.model.get_multimodal_embeddings(**dummy_mm_inputs) + self.model.embed_multimodal(**dummy_mm_inputs) def profile_run(self) -> None: # Profile with multimodal encoder & encoder cache. 
From f1f26d05558980eadf3b4acb9a259f108bcff1f1 Mon Sep 17 00:00:00 2001 From: Hongjian Zhang Date: Thu, 20 Nov 2025 17:05:02 +0800 Subject: [PATCH 11/35] chore: ruff format Signed-off-by: Hongjian Zhang --- vllm/compilation/backends.py | 4 ++ vllm/config/vllm.py | 14 ++-- vllm/engine/arg_utils.py | 11 ++-- vllm/model_executor/models/qwen2_5_vl.py | 56 ++++++++++++---- vllm/v1/worker/gpu_model_runner.py | 84 +++++++++++++++--------- 5 files changed, 116 insertions(+), 53 deletions(-) diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index ce2cacd0b7cd..20e74d619adf 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -54,6 +54,7 @@ # in **vit** piecewise compilation. _is_last_graph_in_vit_sequence: bool = True + @contextmanager def set_is_last_graph_in_sequence(is_last: bool): """Context manager to indicate if the current graph being compiled @@ -67,10 +68,12 @@ def set_is_last_graph_in_sequence(is_last: bool): finally: _is_last_graph_in_vit_sequence = original_value + # A global flag to indicate if the current graph being compiled # is the first one in a sequence of graphs (e.g., a sequence of blocks). 
_is_first_graph_in_vit_sequence: bool = True + @contextmanager def set_is_first_graph_in_sequence(is_first: bool): """Context manager to indicate if the current graph being compiled @@ -121,6 +124,7 @@ def copy_and_call(*args: Any) -> Any: return copy_and_call + def make_compiler(compilation_config: CompilationConfig) -> CompilerInterface: assert not envs.VLLM_USE_MEGA_AOT_ARTIFACT or envs.VLLM_USE_STANDALONE_COMPILE, ( "VLLM_USE_MEGA_AOT_ARTIFACT=1 requires VLLM_USE_STANDALONE_COMPILE=1" diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py index f2b522a714a3..f96e9733aecd 100644 --- a/vllm/config/vllm.py +++ b/vllm/config/vllm.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import bisect import copy import getpass import json @@ -15,7 +16,6 @@ from functools import lru_cache from pathlib import Path from typing import TYPE_CHECKING, Any, TypeVar, get_args -import bisect import torch from pydantic import ConfigDict, Field, model_validator @@ -1395,14 +1395,18 @@ def _set_vit_cudagraph_sizes(self): # determine the vit_cudagraph_capture_sizes if self.compilation_config.vit_cudagraph_capture_sizes is not None: # de-duplicate the sizes provided by the config - dedup_sizes = list(set(self.compilation_config.vit_cudagraph_capture_sizes)) + dedup_sizes = list( + set(self.compilation_config.vit_cudagraph_capture_sizes) + ) vit_cudagraph_capture_sizes = dedup_sizes # sort to make sure the sizes are in ascending order vit_cudagraph_capture_sizes.sort() else: max_vit_cudagraph_capture_size = 5120 vit_cudagraph_capture_sizes = [ - i for i in [16, 32, 64, 128, 256] if i <= max_vit_cudagraph_capture_size + i + for i in [16, 32, 64, 128, 256] + if i <= max_vit_cudagraph_capture_size ] if max_vit_cudagraph_capture_size >= 1024: # Step size 64 for small batch sizes, up to 2048(not included) @@ -1414,7 +1418,9 @@ def _set_vit_cudagraph_sizes(self): vit_cudagraph_capture_sizes += list( range(2048, 
max_vit_cudagraph_capture_size + 1, 128) ) - self.compilation_config.vit_cudagraph_capture_sizes = vit_cudagraph_capture_sizes + self.compilation_config.vit_cudagraph_capture_sizes = ( + vit_cudagraph_capture_sizes + ) else: # no cudagraph in use self.compilation_config.vit_cudagraph_capture_sizes = [] diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index b6d901581f6c..9733b0f26ec2 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1152,7 +1152,8 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: "--cudagraph-capture-sizes", **compilation_kwargs["cudagraph_capture_sizes"] ) compilation_group.add_argument( - "--vit-cudagraph-capture-sizes", **compilation_kwargs["vit_cudagraph_capture_sizes"] + "--vit-cudagraph-capture-sizes", + **compilation_kwargs["vit_cudagraph_capture_sizes"], ) compilation_group.add_argument( "--max-cudagraph-capture-size", @@ -1743,15 +1744,17 @@ def create_engine_config( "cudagraph_capture_sizes are mutually exclusive" ) compilation_config.cudagraph_capture_sizes = self.cudagraph_capture_sizes - + if self.vit_cudagraph_capture_sizes is not None: if compilation_config.vit_cudagraph_capture_sizes is not None: raise ValueError( "vit_cudagraph_capture_sizes and compilation_config." 
"vit_cudagraph_capture_sizes are mutually exclusive" ) - compilation_config.vit_cudagraph_capture_sizes = self.vit_cudagraph_capture_sizes - + compilation_config.vit_cudagraph_capture_sizes = ( + self.vit_cudagraph_capture_sizes + ) + if self.max_cudagraph_capture_size is not None: if compilation_config.max_cudagraph_capture_size is not None: raise ValueError( diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index c6c7a1adb0ce..02e6af8334ac 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -644,12 +644,18 @@ def __init__( self._persistent_hidden_states_buffer: torch.Tensor | None = None self._persistent_rotary_pos_emb_buffer: torch.Tensor | None = None if vllm_config.compilation_config.vit_cudagraph_capture_sizes: - max_compile_size = vllm_config.compilation_config.vit_cudagraph_capture_sizes[-1] + max_compile_size = ( + vllm_config.compilation_config.vit_cudagraph_capture_sizes[-1] + ) self._persistent_hidden_states_buffer = torch.empty( - (max_compile_size, self.patch_embed.proj.input_size), device=self.device, dtype=self.dtype + (max_compile_size, self.patch_embed.proj.input_size), + device=self.device, + dtype=self.dtype, ) self._persistent_rotary_pos_emb_buffer = torch.empty( - (max_compile_size, head_dim // 2), device=self.device, dtype=torch.float32 + (max_compile_size, head_dim // 2), + device=self.device, + dtype=torch.float32, ) @property @@ -796,14 +802,21 @@ def forward( cu_seqlens: list = [] fwd_ctx = get_forward_context() - if self._persistent_hidden_states_buffer is not None and \ - fwd_ctx and fwd_ctx.cudagraph_runtime_mode == CUDAGraphMode.PIECEWISE: + if ( + self._persistent_hidden_states_buffer is not None + and fwd_ctx + and fwd_ctx.cudagraph_runtime_mode == CUDAGraphMode.PIECEWISE + ): hidden_states = self._persistent_hidden_states_buffer[:seq_len] hidden_states.copy_(x, non_blocking=True) else: hidden_states = x.to(device=self.device, 
dtype=self.dtype) - from vllm.compilation.backends import set_is_first_graph_in_sequence, set_is_last_graph_in_sequence + from vllm.compilation.backends import ( + set_is_first_graph_in_sequence, + set_is_last_graph_in_sequence, + ) + with set_is_first_graph_in_sequence(True), set_is_last_graph_in_sequence(False): hidden_states = self.patch_embed(hidden_states) @@ -859,9 +872,14 @@ def forward( device=self.device, non_blocking=True ) rotary_pos_emb = rotary_pos_emb.to(device=self.device, non_blocking=True) - if self._persistent_rotary_pos_emb_buffer is not None and \ - fwd_ctx and fwd_ctx.cudagraph_runtime_mode == CUDAGraphMode.PIECEWISE: - rotary_pos_emb = self._persistent_rotary_pos_emb_buffer[:seq_len].copy_(rotary_pos_emb) + if ( + self._persistent_rotary_pos_emb_buffer is not None + and fwd_ctx + and fwd_ctx.cudagraph_runtime_mode == CUDAGraphMode.PIECEWISE + ): + rotary_pos_emb = self._persistent_rotary_pos_emb_buffer[:seq_len].copy_( + rotary_pos_emb + ) window_index = window_index.to(device=hidden_states.device, non_blocking=True) reverse_indices = reverse_indices.to( device=hidden_states.device, non_blocking=True @@ -875,15 +893,22 @@ def forward( hidden_states = hidden_states.reshape(seq_len, -1) hidden_states = hidden_states.unsqueeze(1) - if self._persistent_hidden_states_buffer is not None and \ - fwd_ctx and fwd_ctx.cudagraph_runtime_mode == CUDAGraphMode.PIECEWISE: + if ( + self._persistent_hidden_states_buffer is not None + and fwd_ctx + and fwd_ctx.cudagraph_runtime_mode == CUDAGraphMode.PIECEWISE + ): # The above operations will produce temporary new tensors. 
- # That is not friendly to cudagraphs, so we need to copy them back to the persistent buffer + # That is not friendly to cudagraphs, + # so we need to copy them back to the persistent buffer original_hidden_states = original_hidden_states.view(hidden_states.shape) original_hidden_states.copy_(hidden_states) hidden_states = original_hidden_states - with set_is_first_graph_in_sequence(False), set_is_last_graph_in_sequence(False): + with ( + set_is_first_graph_in_sequence(False), + set_is_last_graph_in_sequence(False), + ): for layer_num, blk in enumerate(self.blocks): if layer_num in self.fullatt_block_indexes: cu_seqlens_now = cu_seqlens @@ -1245,7 +1270,10 @@ def _process_image_input( with set_forward_context(None, self.vllm_config): if self.use_data_parallel: return run_dp_sharded_mrope_vision_model( - self.visual, pixel_values, grid_thw_list, rope_type="rope_3d" + self.visual, + pixel_values, + grid_thw_list, + rope_type="rope_3d", ) else: image_embeds = self.visual(pixel_values, grid_thw=grid_thw_list) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index ed87b8a2bc63..775c9523553f 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -553,6 +553,7 @@ def __init__( self.compilation_config.cudagraph_capture_sizes ) # self.vit_cudagraph_batch_sizes sorts in ascending order. 
+ self.vit_cudagraph_batch_sizes: list[int] | None if ( self.compilation_config.vit_cudagraph_capture_sizes and self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE @@ -2444,13 +2445,20 @@ def _execute_mm_encoder( num_tokens = pixel_values.shape[0] # Pad to the size expected by CUDA graph - padded_num_tokens = self.vllm_config.pad_for_vit_cudagraph(num_tokens) + padded_num_tokens = self.vllm_config.pad_for_vit_cudagraph( + num_tokens + ) if padded_num_tokens > num_tokens: - assert(self.pixel_values_buffer is not None and self.image_grid_thw_buffer is not None) - - self.pixel_values_buffer[:num_tokens].copy_(pixel_values) # type: ignore - mm_kwargs_group["pixel_values"] = self.pixel_values_buffer[:padded_num_tokens] + assert ( + self.pixel_values_buffer is not None + and self.image_grid_thw_buffer is not None + ) + + self.pixel_values_buffer[:num_tokens].copy_(pixel_values) # type: ignore + mm_kwargs_group["pixel_values"] = self.pixel_values_buffer[ + :padded_num_tokens + ] # Update image_grid_thw to account for padding if "image_grid_thw" in mm_kwargs_group: @@ -2458,21 +2466,29 @@ def _execute_mm_encoder( num_images = image_grid_thw.shape[0] original_num_imgs = num_images padding_amount = padded_num_tokens - num_tokens - + # Treat padding as a new virtual image. - # Assuming a fixed patch grid logic where height is merge_size. - merge_size = getattr(self.model_config.hf_config.vision_config, "spatial_merge_size", 1) - assert(padding_amount % (merge_size * merge_size) == 0) + # Assuming a fixed patch size where height is merge_size. 
+ merge_size = getattr( + self.model_config.hf_config.vision_config, + "spatial_merge_size", + 1, + ) + assert padding_amount % (merge_size * merge_size) == 0 h_patches = merge_size w_patches = padding_amount // h_patches - self.image_grid_thw_buffer[:num_images].copy_(image_grid_thw) + self.image_grid_thw_buffer[:num_images].copy_( + image_grid_thw + ) self.image_grid_thw_buffer[num_images] = torch.tensor( [1, h_patches, w_patches], dtype=torch.long, - device=self.device + device=self.device, + ) + mm_kwargs_group["image_grid_thw"] = ( + self.image_grid_thw_buffer[: num_images + 1] ) - mm_kwargs_group["image_grid_thw"] = self.image_grid_thw_buffer[:num_images + 1] # get batch_descriptor from dispatcher batch_descriptor = BatchDescriptor( @@ -4637,7 +4653,9 @@ def rand_inputs_embeds() -> torch.Tensor: yield inputs_embeds.fill_(0) - def _get_dummy_vit_input(self, num_image_tokens: int, img_feature_dim: int) -> BatchedTensorInputs: + def _get_dummy_vit_input( + self, num_image_tokens: int, img_feature_dim: int + ) -> BatchedTensorInputs: """ Generates dummy multimodal inputs for a single image, with a controllable number of resulting image tokens for a Vision Transformer (ViT) like model, @@ -4654,9 +4672,14 @@ def _get_dummy_vit_input(self, num_image_tokens: int, img_feature_dim: int) -> B `image_grid_thw` that can be passed as kwargs to `embed_multimodal`. 
""" + def _get_dummy_h_w_patches(patches: int): - merge_size = getattr(self.model_config.hf_config.vision_config, "spatial_merge_size", 1) - assert(patches % (merge_size * merge_size) == 0), "Number of patches must be multiple of merge_size squared" + merge_size = getattr( + self.model_config.hf_config.vision_config, "spatial_merge_size", 1 + ) + assert patches % (merge_size * merge_size) == 0, ( + "Number of patches must be multiple of merge_size squared" + ) h_patches = merge_size w_patches = patches // merge_size return h_patches, w_patches @@ -4664,16 +4687,12 @@ def _get_dummy_h_w_patches(patches: int): # The first dimension of pixel_values corresponds to the total number of # tokens (patches). pixel_values = torch.zeros( - (num_image_tokens, img_feature_dim), - dtype=self.dtype, - device=self.device + (num_image_tokens, img_feature_dim), dtype=self.dtype, device=self.device ) h_patches, w_patches = _get_dummy_h_w_patches(num_image_tokens) image_grid_thw = torch.tensor( - [[1, h_patches, w_patches]], - dtype=torch.long, - device=self.device + [[1, h_patches, w_patches]], dtype=torch.long, device=self.device ) return { @@ -5216,17 +5235,17 @@ def _dummy_mm_encoder_run( ) -> None: if self.pixel_values_buffer is None: tmp_dummy_mm_inputs = self._get_mm_dummy_batch( - "image", - 1, - ) + "image", + 1, + ) img_feature_dim = tmp_dummy_mm_inputs["pixel_values"].shape[1] self.pixel_values_buffer = torch.zeros( (compilation_cases[0], img_feature_dim), dtype=self.dtype, - device=self.device + device=self.device, ) - self.image_grid_thw_buffer = torch.zeros(( - 512, 3), dtype=torch.long, device=self.device + self.image_grid_thw_buffer = torch.zeros( + (512, 3), dtype=torch.long, device=self.device ) if is_global_first_rank(): compilation_cases = tqdm( @@ -5238,9 +5257,9 @@ def _dummy_mm_encoder_run( for capture_size in compilation_cases: dummy_mm_inputs = self._get_dummy_vit_input(capture_size, img_feature_dim) batch_descriptor = BatchDescriptor( - 
num_tokens=capture_size, - is_vit=True, - ) + num_tokens=capture_size, + is_vit=True, + ) with ( set_forward_context( None, @@ -5462,7 +5481,10 @@ def _capture_cudagraphs( num_active_loras=num_active_loras, is_graph_capturing=True, ) - if cudagraph_runtime_mode == CUDAGraphMode.PIECEWISE and self.supports_mm_inputs: + if ( + cudagraph_runtime_mode == CUDAGraphMode.PIECEWISE + and self.supports_mm_inputs + ): vit_capture_sizes = self.vit_cudagraph_batch_sizes if vit_capture_sizes: compilation_cases_vit = list(reversed(vit_capture_sizes)) From ef269187bdce52f9706a494d7e643da729d52b71 Mon Sep 17 00:00:00 2001 From: Hongjian Zhang Date: Sun, 23 Nov 2025 20:39:02 +0800 Subject: [PATCH 12/35] feat: Update vit_cudagraph capture size logic Signed-off-by: Hongjian Zhang --- vllm/config/vllm.py | 17 +++++++++++++++-- vllm/v1/worker/gpu_model_runner.py | 14 ++++++++++++-- 2 files changed, 27 insertions(+), 4 deletions(-) diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py index f96e9733aecd..54aab49ff2ff 100644 --- a/vllm/config/vllm.py +++ b/vllm/config/vllm.py @@ -1402,7 +1402,15 @@ def _set_vit_cudagraph_sizes(self): # sort to make sure the sizes are in ascending order vit_cudagraph_capture_sizes.sort() else: - max_vit_cudagraph_capture_size = 5120 + from vllm.multimodal import MULTIMODAL_REGISTRY + from vllm.v1.core.encoder_cache_manager import compute_encoder_budget + + encoder_compute_budget, _ = compute_encoder_budget( + model_config=self.model_config, + scheduler_config=self.scheduler_config, + mm_registry=MULTIMODAL_REGISTRY, + ) + max_vit_cudagraph_capture_size = min(encoder_compute_budget, 32768) vit_cudagraph_capture_sizes = [ i for i in [16, 32, 64, 128, 256] @@ -1416,7 +1424,12 @@ def _set_vit_cudagraph_sizes(self): if max_vit_cudagraph_capture_size >= 2048: # Step size 128 for larger batch sizes vit_cudagraph_capture_sizes += list( - range(2048, max_vit_cudagraph_capture_size + 1, 128) + range(2048, min(max_vit_cudagraph_capture_size + 1, 4096), 128) + ) + 
if max_vit_cudagraph_capture_size >= 4096: + # Step size 256 for largest batch sizes + vit_cudagraph_capture_sizes += list( + range(4096, max_vit_cudagraph_capture_size + 1, 256) ) self.compilation_config.vit_cudagraph_capture_sizes = ( vit_cudagraph_capture_sizes diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 775c9523553f..9a5ebe562dae 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2440,7 +2440,7 @@ def _execute_mm_encoder( # size is dynamic depending on the input multimodal items. original_num_imgs = -1 padded_num_tokens = -1 - if "pixel_values" in mm_kwargs_group: + if self.vit_cudagraph_batch_sizes and "pixel_values" in mm_kwargs_group: pixel_values = mm_kwargs_group["pixel_values"] num_tokens = pixel_values.shape[0] @@ -2477,7 +2477,17 @@ def _execute_mm_encoder( assert padding_amount % (merge_size * merge_size) == 0 h_patches = merge_size w_patches = padding_amount // h_patches - + if num_images + 1 > self.image_grid_thw_buffer.shape[0]: + new_size = max( + self.image_grid_thw_buffer.shape[0] * 2, + num_images + 1, + ) + new_buffer = torch.zeros( + (new_size, 3), + dtype=torch.long, + device=self.device, + ) + self.image_grid_thw_buffer = new_buffer self.image_grid_thw_buffer[:num_images].copy_( image_grid_thw ) From 2872257dbf4b2975d343f16099059e032db5b33d Mon Sep 17 00:00:00 2001 From: Lukas Geiger Date: Wed, 29 Oct 2025 11:48:06 +0000 Subject: [PATCH 13/35] [Model][Qwen3VL] Add `torch.compile` support for Qwen3VL Signed-off-by: Lukas Geiger Signed-off-by: Hongjian Zhang --- vllm/model_executor/models/qwen3_vl.py | 142 +++++++++++++++---------- 1 file changed, 84 insertions(+), 58 deletions(-) diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index 97754833953f..c6c818941128 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py @@ -52,6 +52,7 @@ from vllm.config import VllmConfig from 
vllm.config.multimodal import BaseDummyOptions, VideoDummyOptions from vllm.distributed import get_pp_group +from vllm.forward_context import set_forward_context from vllm.logger import init_logger from vllm.model_executor.layers.activation import _ACTIVATION_REGISTRY from vllm.model_executor.layers.conv import Conv3dLayer @@ -139,6 +140,7 @@ DUMMY_VIDEO_NUM_FRAMES = 2048 +@support_torch_compile(dynamic_arg_dims={"x": 0}) class Qwen3_VisionPatchEmbed(nn.Module): def __init__( self, @@ -205,6 +207,10 @@ def forward(self, x: torch.Tensor): return mlp_output +@support_torch_compile( + dynamic_arg_dims={"x": 0, "cu_seqlens": 0, "rotary_pos_emb": 0, "seqlens": 0}, + mark_unbacked_dims={"seqlens": 0}, +) class Qwen3_VisionBlock(nn.Module): def __init__( self, @@ -257,6 +263,7 @@ def forward( return x +@support_torch_compile(dynamic_arg_dims={"x": 0}) class Qwen3_VisionPatchMerger(nn.Module): def __init__( self, @@ -286,6 +293,7 @@ def __init__( quant_config=quant_config, prefix=f"{prefix}.linear_fc1", disable_tp=use_data_parallel, + return_bias=False, ) self.act_fn = nn.GELU() self.linear_fc2 = RowParallelLinear( @@ -295,6 +303,7 @@ def __init__( quant_config=quant_config, prefix=f"{prefix}.linear_fc2", disable_tp=use_data_parallel, + return_bias=False, ) def forward(self, x: torch.Tensor) -> torch.Tensor: @@ -303,9 +312,9 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: else: x = self.norm(x).view(-1, self.hidden_size) - x_parallel, _ = self.linear_fc1(x) + x_parallel = self.linear_fc1(x) x_parallel = self.act_fn(x_parallel) - out, _ = self.linear_fc2(x_parallel) + out = self.linear_fc2(x_parallel) return out @@ -333,13 +342,18 @@ def __init__( self.out_hidden_size = vision_config.out_hidden_size * ( 1 + len(self.deepstack_visual_indexes) ) - - self.patch_embed = Qwen3_VisionPatchEmbed( - patch_size=self.patch_size, - temporal_patch_size=self.temporal_patch_size, - in_channels=vision_config.in_channels, - hidden_size=self.hidden_size, - ) + # TODO[@lucaskabela]: 
Investigate fixing this usage + # see https://github.com/vllm-project/vllm/issues/27044 + # DO NOT MOVE THIS IMPORT + from vllm.compilation.backends import set_model_tag + + with set_model_tag("Qwen3_VisionPatchEmbed", is_encoder=True): + self.patch_embed = Qwen3_VisionPatchEmbed( + patch_size=self.patch_size, + temporal_patch_size=self.temporal_patch_size, + in_channels=vision_config.in_channels, + hidden_size=self.hidden_size, + ) self.pos_embed = nn.Embedding(self.num_position_embeddings, self.hidden_size) @@ -352,29 +366,31 @@ def __init__( rope_parameters={"partial_rotary_factor": 0.5}, ) - self.merger = Qwen3_VisionPatchMerger( - d_model=vision_config.out_hidden_size, - context_dim=self.hidden_size, - norm_layer=norm_layer, - spatial_merge_size=self.spatial_merge_size, - quant_config=quant_config, - prefix=f"{prefix}.merger", - ) + with set_model_tag("Qwen3_VisionPatchMerger", is_encoder=True): + self.merger = Qwen3_VisionPatchMerger( + d_model=vision_config.out_hidden_size, + context_dim=self.hidden_size, + norm_layer=norm_layer, + spatial_merge_size=self.spatial_merge_size, + quant_config=quant_config, + prefix=f"{prefix}.merger", + ) - self.deepstack_merger_list = nn.ModuleList( - [ - Qwen3_VisionPatchMerger( - d_model=vision_config.out_hidden_size, - context_dim=self.hidden_size, - spatial_merge_size=self.spatial_merge_size, - use_postshuffle_norm=True, - norm_layer=norm_layer, - quant_config=quant_config, - prefix=f"{prefix}.deepstack_merger_list.{layer_idx}", - ) - for layer_idx in range(len(self.deepstack_visual_indexes)) - ] - ) + with set_model_tag("Qwen3_VisionPatchMerger_postshuffle_norm", is_encoder=True): + self.deepstack_merger_list = nn.ModuleList( + [ + Qwen3_VisionPatchMerger( + d_model=vision_config.out_hidden_size, + context_dim=self.hidden_size, + spatial_merge_size=self.spatial_merge_size, + use_postshuffle_norm=True, + norm_layer=norm_layer, + quant_config=quant_config, + prefix=f"{prefix}.deepstack_merger_list.{layer_idx}", + ) + for 
layer_idx in range(len(self.deepstack_visual_indexes)) + ] + ) self.attn_backend = get_vit_attn_backend( head_size=head_dim, @@ -389,20 +405,21 @@ def __init__( raise RuntimeError( f"Qwen3-VL does not support {self.attn_backend} backend now." ) - self.blocks = nn.ModuleList( - [ - Qwen3_VisionBlock( - dim=self.hidden_size, - num_heads=self.num_heads, - mlp_hidden_dim=vision_config.intermediate_size, - act_fn=_ACTIVATION_REGISTRY[vision_config.hidden_act], - norm_layer=norm_layer, - quant_config=quant_config, - prefix=f"{prefix}.blocks.{layer_idx}", - ) - for layer_idx in range(vision_config.depth) - ] - ) + with set_model_tag("Qwen3_VisionBlock", is_encoder=True): + self.blocks = nn.ModuleList( + [ + Qwen3_VisionBlock( + dim=self.hidden_size, + num_heads=self.num_heads, + mlp_hidden_dim=vision_config.intermediate_size, + act_fn=_ACTIVATION_REGISTRY[vision_config.hidden_act], + norm_layer=norm_layer, + quant_config=quant_config, + prefix=f"{prefix}.blocks.{layer_idx}", + ) + for layer_idx in range(vision_config.depth) + ] + ) @property def dtype(self) -> torch.dtype: @@ -1257,6 +1274,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "model"): multimodal_config = vllm_config.model_config.multimodal_config self.config = config + self.vllm_config = vllm_config self.multimodal_config = multimodal_config self.use_data_parallel = multimodal_config.mm_encoder_tp_mode == "data" self.video_pruning_rate = multimodal_config.video_pruning_rate @@ -1409,17 +1427,19 @@ def _process_image_input( ) -> tuple[torch.Tensor, ...]: grid_thw = image_input["image_grid_thw"] assert grid_thw.ndim == 2 + grid_thw_list = grid_thw.tolist() if image_input["type"] == "image_embeds": image_embeds = image_input["image_embeds"].type(self.visual.dtype) else: pixel_values = image_input["pixel_values"].type(self.visual.dtype) - if self.use_data_parallel: - return run_dp_sharded_mrope_vision_model( - self.visual, pixel_values, grid_thw.tolist(), rope_type="rope_3d" - ) - else: - 
image_embeds = self.visual(pixel_values, grid_thw=grid_thw) + with set_forward_context(None, self.vllm_config): + if self.use_data_parallel: + return run_dp_sharded_mrope_vision_model( + self.visual, pixel_values, grid_thw_list, rope_type="rope_3d" + ) + else: + image_embeds = self.visual(pixel_values, grid_thw=grid_thw_list) # Split concatenated embeddings for each image item. merge_size = self.visual.spatial_merge_size @@ -1431,6 +1451,7 @@ def _process_video_input( ) -> tuple[torch.Tensor, ...]: grid_thw = video_input["video_grid_thw"] assert grid_thw.ndim == 2 + grid_thw_list = grid_thw.tolist() if video_input["type"] == "video_embeds": video_embeds = video_input["video_embeds"].type(self.visual.dtype) @@ -1438,13 +1459,18 @@ def _process_video_input( pixel_values_videos = video_input["pixel_values_videos"].type( self.visual.dtype ) - if self.use_data_parallel: - grid_thw_list = grid_thw.tolist() - return run_dp_sharded_mrope_vision_model( - self.visual, pixel_values_videos, grid_thw_list, rope_type="rope_3d" - ) - else: - video_embeds = self.visual(pixel_values_videos, grid_thw=grid_thw) + with set_forward_context(None, self.vllm_config): + if self.use_data_parallel: + return run_dp_sharded_mrope_vision_model( + self.visual, + pixel_values_videos, + grid_thw_list, + rope_type="rope_3d", + ) + else: + video_embeds = self.visual( + pixel_values_videos, grid_thw=grid_thw_list + ) # Split concatenated embeddings for each video item. 
merge_size = self.visual.spatial_merge_size From 8bff371b913b89247e66321f0bebb3ecc471ef64 Mon Sep 17 00:00:00 2001 From: Hongjian Zhang Date: Fri, 26 Dec 2025 21:15:50 +0800 Subject: [PATCH 14/35] feat: Enhance Qwen3VL with ViT CUDAGraph support Signed-off-by: Hongjian Zhang --- vllm/model_executor/models/qwen3_vl.py | 107 ++++++++++++++++++++----- 1 file changed, 88 insertions(+), 19 deletions(-) diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index c6c818941128..328a58361ed5 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py @@ -49,10 +49,10 @@ from transformers.video_utils import VideoMetadata from vllm.compilation.decorators import support_torch_compile -from vllm.config import VllmConfig +from vllm.config import CUDAGraphMode, MultiModalConfig, VllmConfig, get_current_vllm_config from vllm.config.multimodal import BaseDummyOptions, VideoDummyOptions from vllm.distributed import get_pp_group -from vllm.forward_context import set_forward_context +from vllm.forward_context import get_forward_context, set_forward_context from vllm.logger import init_logger from vllm.model_executor.layers.activation import _ACTIVATION_REGISTRY from vllm.model_executor.layers.conv import Conv3dLayer @@ -66,6 +66,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.module_mapping import MultiModelKeys +from vllm.model_executor.models.vision import should_torch_compile_mm_vit from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.evs import ( compute_mrope_for_media, @@ -210,6 +211,7 @@ def forward(self, x: torch.Tensor): @support_torch_compile( dynamic_arg_dims={"x": 0, "cu_seqlens": 0, "rotary_pos_emb": 0, "seqlens": 0}, mark_unbacked_dims={"seqlens": 0}, + enable_if=should_torch_compile_mm_vit, ) class Qwen3_VisionBlock(nn.Module): def 
__init__( @@ -263,7 +265,8 @@ def forward( return x -@support_torch_compile(dynamic_arg_dims={"x": 0}) +@support_torch_compile(dynamic_arg_dims={"x": 0}, + enable_if=should_torch_compile_mm_vit) class Qwen3_VisionPatchMerger(nn.Module): def __init__( self, @@ -420,6 +423,17 @@ def __init__( for layer_idx in range(vision_config.depth) ] ) + vllm_config: VllmConfig = get_current_vllm_config() + self._persistent_hidden_states_buffer: torch.Tensor | None = None + self._persistent_rotary_pos_emb_buffer: torch.Tensor | None = None + if vllm_config.compilation_config.vit_cudagraph_capture_sizes: + max_compile_size = vllm_config.compilation_config.vit_cudagraph_capture_sizes[-1] + self._persistent_hidden_states_buffer = torch.empty( + (max_compile_size, self.patch_embed.proj.input_size), device=self.device, dtype=self.dtype + ) + self._persistent_rotary_pos_emb_buffer = torch.empty( + (max_compile_size, head_dim // 2), device=self.device, dtype=torch.float32 + ) @property def dtype(self) -> torch.dtype: @@ -551,8 +565,25 @@ def forward( x: torch.Tensor, grid_thw: torch.Tensor | list[list[int]], ) -> torch.Tensor: - hidden_states = x.to(device=self.device, dtype=self.dtype, non_blocking=True) - hidden_states = self.patch_embed(hidden_states) + seq_len, _ = x.size() + fwd_ctx = get_forward_context() + if ( + self._persistent_hidden_states_buffer is not None + and fwd_ctx + and fwd_ctx.cudagraph_runtime_mode == CUDAGraphMode.PIECEWISE + ): + hidden_states = self._persistent_hidden_states_buffer[:seq_len] + hidden_states.copy_(x, non_blocking=True) + else: + hidden_states = x.to(device=self.device, dtype=self.dtype, non_blocking=True) + + from vllm.compilation.backends import ( + set_is_first_graph_in_sequence, + set_is_last_graph_in_sequence, + ) + + with set_is_first_graph_in_sequence(True), set_is_last_graph_in_sequence(False): + hidden_states = self.patch_embed(hidden_states) if isinstance(grid_thw, list): grid_thw_list = grid_thw @@ -562,8 +593,19 @@ def forward( grid_thw 
= grid_thw.numpy() pos_embeds = self.fast_pos_embed_interpolate(grid_thw_list) + original_hidden_states = hidden_states hidden_states = hidden_states + pos_embeds rotary_pos_emb_cos, rotary_pos_emb_sin = self.rot_pos_emb(grid_thw_list) + rotary_pos_emb = self.rot_pos_emb(grid_thw_list) + rotary_pos_emb = rotary_pos_emb.to(hidden_states.device, non_blocking=True) + if ( + self._persistent_rotary_pos_emb_buffer is not None + and fwd_ctx + and fwd_ctx.cudagraph_runtime_mode == CUDAGraphMode.PIECEWISE + ): + rotary_pos_emb = self._persistent_rotary_pos_emb_buffer[:seq_len].copy_( + rotary_pos_emb + ) cu_seqlens = np.repeat(grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]).cumsum( axis=0, dtype=np.int32 @@ -575,21 +617,37 @@ def forward( max_seqlen = self.compute_attn_mask_seqlen(cu_seqlens) cu_seqlens = cu_seqlens.to(self.device, non_blocking=True) + if ( + self._persistent_hidden_states_buffer is not None + and fwd_ctx + and fwd_ctx.cudagraph_runtime_mode == CUDAGraphMode.PIECEWISE + ): + # The above operations will produce temporary new tensors. 
+ # That is not friendly to cudagraphs, + # so we need to copy them back to the persistent buffer + original_hidden_states = original_hidden_states.view(hidden_states.shape) + original_hidden_states.copy_(hidden_states) + hidden_states = original_hidden_states + deepstack_feature_lists = [] - for layer_num, blk in enumerate(self.blocks): - hidden_states = blk( - hidden_states, - cu_seqlens=cu_seqlens, - rotary_pos_emb_cos=rotary_pos_emb_cos, - rotary_pos_emb_sin=rotary_pos_emb_sin, - max_seqlen=max_seqlen, - ) - if layer_num in self.deepstack_visual_indexes: - deepstack_merger_idx = self.deepstack_visual_indexes.index(layer_num) - deepstack_feature = self.deepstack_merger_list[deepstack_merger_idx]( - hidden_states + with ( + set_is_first_graph_in_sequence(False), + set_is_last_graph_in_sequence(False), + ): + for layer_num, blk in enumerate(self.blocks): + hidden_states = blk( + hidden_states, + cu_seqlens=cu_seqlens, + rotary_pos_emb_cos=rotary_pos_emb_cos, + rotary_pos_emb_sin=rotary_pos_emb_sin, + max_seqlen=max_seqlen, + ) + if layer_num in self.deepstack_visual_indexes: + deepstack_merger_idx = self.deepstack_visual_indexes.index(layer_num) + deepstack_feature = self.deepstack_merger_list[deepstack_merger_idx]( + hidden_states ) - deepstack_feature_lists.append(deepstack_feature) + deepstack_feature_lists.append(deepstack_feature) hidden_states = self.merger(hidden_states) hidden_states = torch.cat( [hidden_states] + deepstack_feature_lists, dim=1 @@ -1433,7 +1491,18 @@ def _process_image_input( image_embeds = image_input["image_embeds"].type(self.visual.dtype) else: pixel_values = image_input["pixel_values"].type(self.visual.dtype) - with set_forward_context(None, self.vllm_config): + if self.vllm_config.is_in_compile: + with set_forward_context(None, self.vllm_config): + if self.use_data_parallel: + return run_dp_sharded_mrope_vision_model( + self.visual, + pixel_values, + grid_thw_list, + rope_type="rope_3d", + ) + else: + image_embeds = 
self.visual(pixel_values, grid_thw=grid_thw_list) + else: if self.use_data_parallel: return run_dp_sharded_mrope_vision_model( self.visual, pixel_values, grid_thw_list, rope_type="rope_3d" From 7dc0fcfb460be7ba4c1190ec31d518b504939999 Mon Sep 17 00:00:00 2001 From: Hongjian Zhang Date: Tue, 30 Dec 2025 15:39:20 +0800 Subject: [PATCH 15/35] feat: add vit dp mode cuda graph Signed-off-by: Hongjian Zhang --- vllm/config/vllm.py | 13 +-- vllm/model_executor/models/qwen3_vl.py | 30 +++-- vllm/model_executor/models/vision.py | 91 +++++++++++---- vllm/v1/worker/gpu_model_runner.py | 147 +++++++++++++------------ 4 files changed, 168 insertions(+), 113 deletions(-) diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py index 54aab49ff2ff..48c1e7be2d23 100644 --- a/vllm/config/vllm.py +++ b/vllm/config/vllm.py @@ -1389,8 +1389,6 @@ def _set_vit_cudagraph_sizes(self): and not self.model_config.enforce_eager and self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE and self.compilation_config.compile_mm_encoder - and self.model_config.multimodal_config is not None - and self.model_config.multimodal_config.mm_encoder_tp_mode != "data" ): # determine the vit_cudagraph_capture_sizes if self.compilation_config.vit_cudagraph_capture_sizes is not None: @@ -1410,17 +1408,10 @@ def _set_vit_cudagraph_sizes(self): scheduler_config=self.scheduler_config, mm_registry=MULTIMODAL_REGISTRY, ) - max_vit_cudagraph_capture_size = min(encoder_compute_budget, 32768) + max_vit_cudagraph_capture_size = min(encoder_compute_budget, 8192) vit_cudagraph_capture_sizes = [ - i - for i in [16, 32, 64, 128, 256] - if i <= max_vit_cudagraph_capture_size + i for i in [512, 1024, 1536] if i <= max_vit_cudagraph_capture_size ] - if max_vit_cudagraph_capture_size >= 1024: - # Step size 64 for small batch sizes, up to 2048(not included) - vit_cudagraph_capture_sizes += list( - range(512, min(max_vit_cudagraph_capture_size + 1, 2048), 64) - ) if max_vit_cudagraph_capture_size >= 2048: # Step size 128 for 
larger batch sizes vit_cudagraph_capture_sizes += list( diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index 328a58361ed5..a4464d9868f4 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py @@ -49,10 +49,10 @@ from transformers.video_utils import VideoMetadata from vllm.compilation.decorators import support_torch_compile -from vllm.config import CUDAGraphMode, MultiModalConfig, VllmConfig, get_current_vllm_config +from vllm.config import CUDAGraphMode, MultiModalConfig, VllmConfig, set_current_vllm_config, get_current_vllm_config from vllm.config.multimodal import BaseDummyOptions, VideoDummyOptions from vllm.distributed import get_pp_group -from vllm.forward_context import get_forward_context, set_forward_context +from vllm.forward_context import get_forward_context, set_forward_context, is_forward_context_available from vllm.logger import init_logger from vllm.model_executor.layers.activation import _ACTIVATION_REGISTRY from vllm.model_executor.layers.conv import Conv3dLayer @@ -1481,7 +1481,8 @@ def _parse_and_validate_video_input( ) def _process_image_input( - self, image_input: Qwen2_5_VLImageInputs + self, image_input: Qwen2_5_VLImageInputs, + cudagraph_dispatcher: Any | None = None, ) -> tuple[torch.Tensor, ...]: grid_thw = image_input["image_grid_thw"] assert grid_thw.ndim == 2 @@ -1491,22 +1492,26 @@ def _process_image_input( image_embeds = image_input["image_embeds"].type(self.visual.dtype) else: pixel_values = image_input["pixel_values"].type(self.visual.dtype) + maybe_in_vit_cuda_graph_capture = is_forward_context_available() if self.vllm_config.is_in_compile: with set_forward_context(None, self.vllm_config): - if self.use_data_parallel: + if self.use_data_parallel and not maybe_in_vit_cuda_graph_capture: return run_dp_sharded_mrope_vision_model( self.visual, pixel_values, grid_thw_list, rope_type="rope_3d", + cudagraph_dispatcher=cudagraph_dispatcher, ) else: 
image_embeds = self.visual(pixel_values, grid_thw=grid_thw_list) else: - if self.use_data_parallel: - return run_dp_sharded_mrope_vision_model( - self.visual, pixel_values, grid_thw_list, rope_type="rope_3d" - ) + if self.use_data_parallel and not maybe_in_vit_cuda_graph_capture: + with set_current_vllm_config(self.vllm_config): + return run_dp_sharded_mrope_vision_model( + self.visual, pixel_values, grid_thw_list, rope_type="rope_3d", + cudagraph_dispatcher=cudagraph_dispatcher, + ) else: image_embeds = self.visual(pixel_values, grid_thw=grid_thw_list) @@ -1516,7 +1521,8 @@ def _process_image_input( return image_embeds.split(sizes) def _process_video_input( - self, video_input: Qwen2_5_VLVideoInputs + self, video_input: Qwen2_5_VLVideoInputs, + cudagraph_dispatcher: Any | None = None, ) -> tuple[torch.Tensor, ...]: grid_thw = video_input["video_grid_thw"] assert grid_thw.ndim == 2 @@ -1535,6 +1541,7 @@ def _process_video_input( pixel_values_videos, grid_thw_list, rope_type="rope_3d", + cudagraph_dispatcher=cudagraph_dispatcher, ) else: video_embeds = self.visual( @@ -1983,6 +1990,7 @@ def get_mrope_input_positions( return torch.from_numpy(llm_positions), mrope_position_delta def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings | None: + cudagraph_dispatcher = kwargs.pop("cudagraph_dispatcher", None) mm_input_by_modality = self._parse_and_validate_multimodal_inputs(**kwargs) if not mm_input_by_modality: return None @@ -1996,14 +2004,14 @@ def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings | None: for modality in mm_input_by_modality: multimodal_input = mm_input_by_modality[modality] if modality == "image": - image_embeddings = self._process_image_input(multimodal_input) + image_embeddings = self._process_image_input(multimodal_input, cudagraph_dispatcher=cudagraph_dispatcher) if self.is_multimodal_pruning_enabled: image_embeddings = self._postprocess_image_embeds_evs( image_embeddings, multimodal_input ) multimodal_embeddings += 
tuple(image_embeddings) if modality == "video": - video_embeddings = self._process_video_input(multimodal_input) + video_embeddings = self._process_video_input(multimodal_input, cudagraph_dispatcher=cudagraph_dispatcher) if self.is_multimodal_pruning_enabled: video_embeddings = self._postprocess_video_embeds_evs( video_embeddings, multimodal_input diff --git a/vllm/model_executor/models/vision.py b/vllm/model_executor/models/vision.py index a2b78753a0c6..d4e9dd9fa159 100644 --- a/vllm/model_executor/models/vision.py +++ b/vllm/model_executor/models/vision.py @@ -16,6 +16,8 @@ get_tensor_model_parallel_world_size, tensor_model_parallel_all_gather, ) +from vllm.forward_context import BatchDescriptor, set_forward_context +from vllm.v1.cudagraph_dispatcher import CudagraphDispatcher from vllm.logger import init_logger from vllm.platforms import current_platform from vllm.v1.attention.backends.registry import AttentionBackendEnum @@ -387,6 +389,7 @@ def run_dp_sharded_mrope_vision_model( grid_thw_list: list[list[int]], *, rope_type: Literal["rope_3d", "rope_2d"], + cudagraph_dispatcher: CudagraphDispatcher | None = None, ) -> tuple[torch.Tensor, ...]: """Run a vision model with data parallelism (DP) sharding. 
The function will shard the input image tensor on the @@ -462,43 +465,85 @@ def run_dp_sharded_mrope_vision_model( embed_dim_reduction_factor = ( vision_model.merge_kernel_size[0] * vision_model.merge_kernel_size[1] ) + merge_size = vision_model.merge_kernel_size[0] else: embed_dim_reduction_factor = ( vision_model.spatial_merge_size * vision_model.spatial_merge_size ) + merge_size = vision_model.spatial_merge_size # Find the max length across all ranks # The output embedding of every DP rank has to be # padded to this length for tensor_model_parallel_all_gather # to work - max_len_per_rank = max(grouped_pixel_values_len) // embed_dim_reduction_factor + vllm_config = get_current_vllm_config() + use_cudagraph = False + + if (vllm_config and + vllm_config.compilation_config.vit_cudagraph_capture_sizes): + max_input_len = max(grouped_pixel_values_len) if grouped_pixel_values_len else 0 + target_input_len = vllm_config.pad_for_vit_cudagraph(max_input_len) + max_len_per_rank = target_input_len // embed_dim_reduction_factor + use_cudagraph = True + else: + max_len_per_rank = (max(grouped_pixel_values_len) if grouped_pixel_values_len else 0) // embed_dim_reduction_factor + local_grid_thw_list = [grid_thw_list[i] for i in image_idxs_local] - # Run the vision model on the local pixel_values_local - if rope_type == "rope_2d": - if pixel_values_local.shape[0] > 0: - image_embeds_local = vision_model( - pixel_values_local, torch.tensor(local_grid_thw_list) - ) - if isinstance(image_embeds_local, list): - image_embeds_local = torch.cat(image_embeds_local, dim=0) - else: - out_dim = getattr(vision_model.config, "hidden_size", None) - image_embeds_local = torch.empty( - (0, embed_dim_reduction_factor, out_dim), - device=pixel_values.device, - dtype=pixel_values.dtype, + # Pad pixel_values_local for CUDA graph if needed + if use_cudagraph: + current_input_len = pixel_values_local.shape[0] + # target_input_len derived from max_len_per_rank for consistency + target_input_len = 
max_len_per_rank * embed_dim_reduction_factor + + if current_input_len < target_input_len: + padding_size = target_input_len - current_input_len + padding = torch.empty( + (padding_size, pixel_values_local.shape[1]), + device=pixel_values_local.device, + dtype=pixel_values_local.dtype, ) + pixel_values_local = torch.cat([pixel_values_local, padding], dim=0) + local_grid_thw_list.append([1, merge_size, padding_size // merge_size]) + + # Context setup + if cudagraph_dispatcher is not None: + dispatcher = cudagraph_dispatcher else: - if pixel_values_local.shape[0] > 0: - image_embeds_local = vision_model(pixel_values_local, local_grid_thw_list) + dispatcher = CudagraphDispatcher(vllm_config) + batch_descriptor = BatchDescriptor(num_tokens=pixel_values_local.shape[0], is_vit=True) + cudagraph_runtime_mode, batch_descriptor = dispatcher.dispatch(batch_descriptor, False) + with set_forward_context( + None, + vllm_config=vllm_config, + cudagraph_runtime_mode=cudagraph_runtime_mode, + batch_descriptor=batch_descriptor + ): + # Run the vision model on the local pixel_values_local + if rope_type == "rope_2d": + if pixel_values_local.shape[0] > 0: + image_embeds_local = vision_model( + pixel_values_local, torch.tensor(local_grid_thw_list) + ) + if isinstance(image_embeds_local, list): + image_embeds_local = torch.cat(image_embeds_local, dim=0) + else: + out_dim = getattr(vision_model.config, "hidden_size", None) + image_embeds_local = torch.empty( + (0, embed_dim_reduction_factor, out_dim), + device=pixel_values.device, + dtype=pixel_values.dtype, + ) else: - # Handle empty case - image_embeds_local = torch.empty( - (0, vision_model.out_hidden_size), - device=pixel_values.device, - dtype=pixel_values.dtype, - ) + if pixel_values_local.shape[0] > 0: + image_embeds_local = vision_model(pixel_values_local, local_grid_thw_list) + else: + # Handle empty case + image_embeds_local = torch.empty( + (0, vision_model.out_hidden_size), + device=pixel_values.device, + 
dtype=pixel_values.dtype, + ) # Pad the output based on max_len_per_rank # for tensor_model_parallel_all_gather to work diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 9a5ebe562dae..681c9d0f3563 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2438,88 +2438,99 @@ def _execute_mm_encoder( # 2. A list or tuple (length: num_items) of tensors, # each of shape (feature_size, hidden_size) in case the feature # size is dynamic depending on the input multimodal items. - original_num_imgs = -1 - padded_num_tokens = -1 - if self.vit_cudagraph_batch_sizes and "pixel_values" in mm_kwargs_group: - pixel_values = mm_kwargs_group["pixel_values"] - num_tokens = pixel_values.shape[0] - - # Pad to the size expected by CUDA graph - padded_num_tokens = self.vllm_config.pad_for_vit_cudagraph( - num_tokens - ) - - if padded_num_tokens > num_tokens: - assert ( - self.pixel_values_buffer is not None - and self.image_grid_thw_buffer is not None + is_vit_dp_mode = ( + getattr(self.model_config.multimodal_config, "mm_encoder_tp_mode", None) == "data" + and self.parallel_config.tensor_parallel_size > 1 + ) + if not is_vit_dp_mode: + original_num_imgs = -1 + padded_num_tokens = -1 + if self.vit_cudagraph_batch_sizes and "pixel_values" in mm_kwargs_group: + pixel_values = mm_kwargs_group["pixel_values"] + num_tokens = pixel_values.shape[0] + + # Pad to the size expected by CUDA graph + padded_num_tokens = self.vllm_config.pad_for_vit_cudagraph( + num_tokens ) - self.pixel_values_buffer[:num_tokens].copy_(pixel_values) # type: ignore - mm_kwargs_group["pixel_values"] = self.pixel_values_buffer[ - :padded_num_tokens - ] - - # Update image_grid_thw to account for padding - if "image_grid_thw" in mm_kwargs_group: - image_grid_thw = mm_kwargs_group["image_grid_thw"] - num_images = image_grid_thw.shape[0] - original_num_imgs = num_images - padding_amount = padded_num_tokens - num_tokens - - # Treat padding as a new 
virtual image. - # Assuming a fixed patch size where height is merge_size. - merge_size = getattr( - self.model_config.hf_config.vision_config, - "spatial_merge_size", - 1, + if padded_num_tokens > num_tokens: + assert ( + self.pixel_values_buffer is not None + and self.image_grid_thw_buffer is not None ) - assert padding_amount % (merge_size * merge_size) == 0 - h_patches = merge_size - w_patches = padding_amount // h_patches - if num_images + 1 > self.image_grid_thw_buffer.shape[0]: - new_size = max( - self.image_grid_thw_buffer.shape[0] * 2, - num_images + 1, + + self.pixel_values_buffer[:num_tokens].copy_(pixel_values) # type: ignore + mm_kwargs_group["pixel_values"] = self.pixel_values_buffer[ + :padded_num_tokens + ] + + # Update image_grid_thw to account for padding + if "image_grid_thw" in mm_kwargs_group: + image_grid_thw = mm_kwargs_group["image_grid_thw"] + num_images = image_grid_thw.shape[0] + original_num_imgs = num_images + padding_amount = padded_num_tokens - num_tokens + + # Treat padding as a new virtual image. + # Assuming a fixed patch size where height is merge_size. 
+ merge_size = getattr( + self.model_config.hf_config.vision_config, + "spatial_merge_size", + 1, + ) + assert padding_amount % (merge_size * merge_size) == 0 + h_patches = merge_size + w_patches = padding_amount // h_patches + if num_images + 1 > self.image_grid_thw_buffer.shape[0]: + new_size = max( + self.image_grid_thw_buffer.shape[0] * 2, + num_images + 1, + ) + new_buffer = torch.zeros( + (new_size, 3), + dtype=torch.long, + device=self.device, + ) + self.image_grid_thw_buffer = new_buffer + self.image_grid_thw_buffer[:num_images].copy_( + image_grid_thw ) - new_buffer = torch.zeros( - (new_size, 3), + self.image_grid_thw_buffer[num_images] = torch.tensor( + [1, h_patches, w_patches], dtype=torch.long, device=self.device, ) - self.image_grid_thw_buffer = new_buffer - self.image_grid_thw_buffer[:num_images].copy_( - image_grid_thw - ) - self.image_grid_thw_buffer[num_images] = torch.tensor( - [1, h_patches, w_patches], - dtype=torch.long, - device=self.device, - ) - mm_kwargs_group["image_grid_thw"] = ( - self.image_grid_thw_buffer[: num_images + 1] - ) + mm_kwargs_group["image_grid_thw"] = ( + self.image_grid_thw_buffer[: num_images + 1] + ) - # get batch_descriptor from dispatcher - batch_descriptor = BatchDescriptor( - num_tokens=padded_num_tokens, - is_vit=True, - ) - cudagraph_runtime_mode, batch_descriptor = ( - self.cudagraph_dispatcher.dispatch(batch_descriptor, False) - ) - with set_forward_context( + # get batch_descriptor from dispatcher + batch_descriptor = BatchDescriptor( + num_tokens=padded_num_tokens, + is_vit=True, + ) + cudagraph_runtime_mode, batch_descriptor = ( + self.cudagraph_dispatcher.dispatch(batch_descriptor, False) + ) + with set_forward_context( None, vllm_config=self.vllm_config, cudagraph_runtime_mode=cudagraph_runtime_mode, batch_descriptor=batch_descriptor, - ), self.timed_encoder_operation( + ), self.timed_encoder_operation( + should_time, mm_lora_refs, current_item_idx, num_items + ): + curr_group_outputs = 
model.embed_multimodal(**mm_kwargs_group) + # Remove the padded items before sanity check + if original_num_imgs != -1: + curr_group_outputs = curr_group_outputs[:original_num_imgs] + else: + with self.timed_encoder_operation( should_time, mm_lora_refs, current_item_idx, num_items ): - curr_group_outputs = model.embed_multimodal(**mm_kwargs_group) - # Remove the padded items before sanity check - if original_num_imgs != -1: - curr_group_outputs = curr_group_outputs[:original_num_imgs] + mm_kwargs_group["cudagraph_dispatcher"] = self.cudagraph_dispatcher + curr_group_outputs = model.embed_multimodal(**mm_kwargs_group) sanity_check_mm_encoder_outputs( curr_group_outputs, expected_num_items=num_items, From c0e8849e46d9c137ebc84cf0938bc003b2f0e892 Mon Sep 17 00:00:00 2001 From: Hongjian Zhang Date: Tue, 30 Dec 2025 15:41:16 +0800 Subject: [PATCH 16/35] chore: remove ViT's useless persistent buffer at engine level Signed-off-by: Hongjian Zhang --- vllm/v1/worker/gpu_model_runner.py | 73 +++++++++--------------------- 1 file changed, 21 insertions(+), 52 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 681c9d0f3563..7d57537ca1cc 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -610,13 +610,6 @@ def __init__( ] self.is_mm_embed_idx = 0 - # START: Add persistent buffers for ViT inputs - # Use a large enough size for the CUDA graph - # The feature dimension is model-specific. We'll initialize - # the buffer lazily on the first run to get this dimension. 
- self.pixel_values_buffer: torch.Tensor | None = None - self.image_grid_thw_buffer: torch.Tensor | None = None - # Only relevant for models using M-RoPE (e.g, Qwen2-VL) if self.uses_mrope: # NOTE: `mrope_positions` is implemented with one additional dummy @@ -2455,22 +2448,20 @@ def _execute_mm_encoder( ) if padded_num_tokens > num_tokens: - assert ( - self.pixel_values_buffer is not None - and self.image_grid_thw_buffer is not None + padding_amount = padded_num_tokens - num_tokens + padding_tensor = torch.zeros( + (padding_amount, pixel_values.shape[1]), + dtype=pixel_values.dtype, + device=pixel_values.device, + ) + mm_kwargs_group["pixel_values"] = torch.cat( + [pixel_values, padding_tensor], dim=0 ) - - self.pixel_values_buffer[:num_tokens].copy_(pixel_values) # type: ignore - mm_kwargs_group["pixel_values"] = self.pixel_values_buffer[ - :padded_num_tokens - ] # Update image_grid_thw to account for padding if "image_grid_thw" in mm_kwargs_group: image_grid_thw = mm_kwargs_group["image_grid_thw"] - num_images = image_grid_thw.shape[0] - original_num_imgs = num_images - padding_amount = padded_num_tokens - num_tokens + original_num_imgs = image_grid_thw.shape[0] # Treat padding as a new virtual image. # Assuming a fixed patch size where height is merge_size. 
@@ -2482,27 +2473,13 @@ def _execute_mm_encoder( assert padding_amount % (merge_size * merge_size) == 0 h_patches = merge_size w_patches = padding_amount // h_patches - if num_images + 1 > self.image_grid_thw_buffer.shape[0]: - new_size = max( - self.image_grid_thw_buffer.shape[0] * 2, - num_images + 1, - ) - new_buffer = torch.zeros( - (new_size, 3), - dtype=torch.long, - device=self.device, - ) - self.image_grid_thw_buffer = new_buffer - self.image_grid_thw_buffer[:num_images].copy_( - image_grid_thw + padding_grid_info = torch.tensor( + [[1, h_patches, w_patches]], + dtype=image_grid_thw.dtype, + device=image_grid_thw.device, ) - self.image_grid_thw_buffer[num_images] = torch.tensor( - [1, h_patches, w_patches], - dtype=torch.long, - device=self.device, - ) - mm_kwargs_group["image_grid_thw"] = ( - self.image_grid_thw_buffer[: num_images + 1] + mm_kwargs_group["image_grid_thw"] = torch.cat( + [image_grid_thw, padding_grid_info], dim=0 ) # get batch_descriptor from dispatcher @@ -5254,20 +5231,12 @@ def _dummy_mm_encoder_run( self, compilation_cases: list[int], ) -> None: - if self.pixel_values_buffer is None: - tmp_dummy_mm_inputs = self._get_mm_dummy_batch( - "image", - 1, - ) - img_feature_dim = tmp_dummy_mm_inputs["pixel_values"].shape[1] - self.pixel_values_buffer = torch.zeros( - (compilation_cases[0], img_feature_dim), - dtype=self.dtype, - device=self.device, - ) - self.image_grid_thw_buffer = torch.zeros( - (512, 3), dtype=torch.long, device=self.device - ) + tmp_dummy_mm_inputs = self._get_mm_dummy_batch( + "image", + 1, + ) + img_feature_dim = tmp_dummy_mm_inputs["pixel_values"].shape[1] + if is_global_first_rank(): compilation_cases = tqdm( compilation_cases, From ef7e45d74ec5fac5240f5d4ac823ef7b45df93a3 Mon Sep 17 00:00:00 2001 From: Hongjian Zhang Date: Tue, 30 Dec 2025 15:41:33 +0800 Subject: [PATCH 17/35] feat: add FA and sdpa wrappers to compilation config Signed-off-by: Hongjian Zhang --- vllm/config/compilation.py | 2 ++ 
vllm/v1/attention/ops/vit_attn_wrappers.py | 21 ++++++++++++++++----- 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py index b74988b2a711..327869bfd44e 100644 --- a/vllm/config/compilation.py +++ b/vllm/config/compilation.py @@ -652,6 +652,8 @@ class CompilationConfig: "vllm::kda_attention", "vllm::sparse_attn_indexer", "vllm::rocm_aiter_sparse_attn_indexer", + "vllm::flash_attn_maxseqlen_wrapper", + "vllm::torch_sdpa_wrapper", ] def compute_hash(self) -> str: diff --git a/vllm/v1/attention/ops/vit_attn_wrappers.py b/vllm/v1/attention/ops/vit_attn_wrappers.py index f077a61c984f..f6051e54713b 100644 --- a/vllm/v1/attention/ops/vit_attn_wrappers.py +++ b/vllm/v1/attention/ops/vit_attn_wrappers.py @@ -26,6 +26,7 @@ def flash_attn_maxseqlen_wrapper( v: torch.Tensor, batch_size: int, is_rocm_aiter: bool, + output: torch.Tensor, fa_version: int | None, scale: float | None = None, cu_seqlens: torch.Tensor | None = None, @@ -48,7 +49,7 @@ def flash_attn_maxseqlen_wrapper( max_seqlen = q_len if max_seqlen is None else max_seqlen.item() q, k, v = (einops.rearrange(x, "b s ... 
-> (b s) ...") for x in [q, k, v]) - output = flash_attn_varlen_func( + fa_output = flash_attn_varlen_func( q, k, v, @@ -61,8 +62,9 @@ def flash_attn_maxseqlen_wrapper( softmax_scale=scale, **kwargs, ) - context_layer = einops.rearrange(output, "(b s) h d -> b s h d", b=batch_size) - return context_layer + context_layer = einops.rearrange(fa_output, "(b s) h d -> b s h d", b=batch_size) + output.copy_(context_layer) + return output def flash_attn_maxseqlen_wrapper_fake( @@ -71,6 +73,7 @@ def flash_attn_maxseqlen_wrapper_fake( v: torch.Tensor, batch_size: int, is_rocm_aiter: bool, + output: torch.Tensor, fa_version: int | None, scale: float | None = None, cu_seqlens: torch.Tensor | None = None, @@ -97,12 +100,15 @@ def vit_flash_attn_wrapper( cu_seqlens: torch.Tensor | None = None, max_seqlen: torch.Tensor | None = None, ) -> torch.Tensor: + b, s, h, d = q.shape + output = torch.empty((b, s, h, d), dtype=q.dtype, device=q.device) return torch.ops.vllm.flash_attn_maxseqlen_wrapper( q, k, v, batch_size, is_rocm_aiter, + output, fa_version, scale, cu_seqlens, @@ -132,6 +138,7 @@ def torch_sdpa_wrapper( q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, + output: torch.Tensor, scale: float | None = None, cu_seqlens: torch.Tensor | None = None, ) -> torch.Tensor: @@ -155,13 +162,15 @@ def torch_sdpa_wrapper( output_i = apply_sdpa(q_i, k_i, v_i, scale=scale) outputs.append(output_i) context_layer = torch.cat(outputs, dim=1) - return context_layer + output.copy_(context_layer) + return output def torch_sdpa_wrapper_fake( q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, + output: torch.Tensor, scale: float | None, cu_seqlens: torch.Tensor | None, ) -> torch.Tensor: @@ -182,4 +191,6 @@ def vit_torch_sdpa_wrapper( scale: float | None = None, cu_seqlens: torch.Tensor | None = None, ) -> torch.Tensor: - return torch.ops.vllm.torch_sdpa_wrapper(q, k, v, scale, cu_seqlens) + b, s, h, d = q.shape + output = torch.empty((b, s, h, d), dtype=q.dtype, device=q.device) + return 
torch.ops.vllm.torch_sdpa_wrapper(q, k, v, output, scale, cu_seqlens) From e23899d454ab5400ffbac3e99b1410f7d1300fe3 Mon Sep 17 00:00:00 2001 From: Hongjian Zhang Date: Wed, 31 Dec 2025 12:02:58 +0800 Subject: [PATCH 18/35] fix: update dummy input type from image to video to avoid preprocess_opt warmup problem Signed-off-by: Hongjian Zhang --- vllm/v1/worker/gpu_model_runner.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 7d57537ca1cc..bda3d3f8999c 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -5232,10 +5232,10 @@ def _dummy_mm_encoder_run( compilation_cases: list[int], ) -> None: tmp_dummy_mm_inputs = self._get_mm_dummy_batch( - "image", + "video", 1, ) - img_feature_dim = tmp_dummy_mm_inputs["pixel_values"].shape[1] + img_feature_dim = tmp_dummy_mm_inputs["pixel_values_videos"].shape[1] if is_global_first_rank(): compilation_cases = tqdm( From 506f75bb743b3486fd374704ce2b2531c6e22152 Mon Sep 17 00:00:00 2001 From: Hongjian Zhang Date: Mon, 5 Jan 2026 12:01:45 +0800 Subject: [PATCH 19/35] feat: add max_vit_cudagraph_capture_size and simplify code Signed-off-by: Hongjian Zhang --- docs/design/torch_compile.md | 7 ++- vllm/compilation/monitor.py | 3 +- vllm/config/compilation.py | 19 ++++++ vllm/config/vllm.py | 77 ++++++++++++++++++------ vllm/model_executor/models/qwen2_5_vl.py | 46 ++++++++------ vllm/model_executor/models/qwen3_vl.py | 43 ++++++------- vllm/v1/worker/gpu_model_runner.py | 72 +++++++++------------- 7 files changed, 159 insertions(+), 108 deletions(-) diff --git a/docs/design/torch_compile.md b/docs/design/torch_compile.md index 041f029294e4..8532c7020cbc 100644 --- a/docs/design/torch_compile.md +++ b/docs/design/torch_compile.md @@ -253,11 +253,16 @@ By default, vLLM will try to determine a set of sizes to capture cudagraph. 
You vllm serve meta-llama/Llama-3.2-1B \ --compilation-config '{"cudagraph_capture_sizes": [1, 2, 4, 8]}' ``` -Similarly, For `Qwen2.5-VL` series model, you can specify the capture sizes for the vision transformer (ViT) using `vit_cudagraph_capture_sizes`, the capture sizes should be multiples of the square of `merge_size`. Note that ViT DP mode is **not supported**. By default, this is disabled as `compile_mm_encoder` is `False`. To enable it and specify capture sizes, you can do the following: +Similarly, for the `Qwen2.5-VL` and `Qwen3-VL` series models, you can specify the capture sizes for the vision transformer (ViT) using `vit_cudagraph_capture_sizes`; the capture sizes should be multiples of the square of `merge_size`. By default, this is disabled as `compile_mm_encoder` is `False`. To enable it and specify capture sizes, you can do the following: ```bash vllm serve Qwen/Qwen2.5-VL-3B-Instruct \ --compilation-config '{"compile_mm_encoder": true, "vit_cudagraph_capture_sizes": [512, 1024]}' ``` +Alternatively, you can specify `max_vit_cudagraph_capture_size` to generate a default list of capture sizes up to the given value: +```bash +vllm serve Qwen/Qwen2.5-VL-3B-Instruct \ + --compilation-config '{"compile_mm_encoder": true, "max_vit_cudagraph_capture_size": 2048}' +``` Then it will only capture cudagraph for the specified sizes. It can be useful to have fine-grained control over the cudagraph capture. 
diff --git a/vllm/compilation/monitor.py b/vllm/compilation/monitor.py index 912e3d828abc..ca56574cfb30 100644 --- a/vllm/compilation/monitor.py +++ b/vllm/compilation/monitor.py @@ -31,8 +31,6 @@ def start_monitoring_torch_compile(vllm_config: VllmConfig) -> None: def end_monitoring_torch_compile(vllm_config: VllmConfig) -> None: - vllm_config.is_in_compile = False - compilation_config: CompilationConfig = vllm_config.compilation_config if compilation_config.mode == CompilationMode.VLLM_COMPILE: logger.info_once( @@ -45,6 +43,7 @@ def end_monitoring_torch_compile(vllm_config: VllmConfig) -> None: context_manager.__exit__(None, None, None) context_manager = None + vllm_config.is_in_compile = False cudagraph_capturing_enabled: bool = True diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py index 327869bfd44e..99543929f2dc 100644 --- a/vllm/config/compilation.py +++ b/vllm/config/compilation.py @@ -534,6 +534,9 @@ class CompilationConfig: """Sizes to capture vit cudagraph. - None (default): capture sizes are inferred from vllm config. - list[int]: capture sizes are specified as given.""" + max_vit_cudagraph_capture_size: int | None = field(default=None) + """The maximum vit cudagraph capture size. + """ cudagraph_copy_inputs: bool = False """Whether to copy input tensors for cudagraph. 
If the caller can guarantee that the same input buffers @@ -1166,3 +1169,19 @@ def get_compile_ranges(self) -> list[Range]: Range(start=s + 1, end=e) for s, e in zip([0] + split_points[:-1], split_points) ] + + def compute_bs_to_padded_vit_graph_size(self): + # pre-compute the mapping from batch size to padded graph size + self.bs_to_padded_vit_graph_size = [ + 0 for i in range(self.max_vit_cudagraph_capture_size + 1) + ] + for end, start in zip( + self.vit_cudagraph_capture_sizes + + [self.max_vit_cudagraph_capture_size + 1], + [0] + self.vit_cudagraph_capture_sizes, + ): + for bs in range(start, end): + if bs == start: + self.bs_to_padded_vit_graph_size[bs] = start + else: + self.bs_to_padded_vit_graph_size[bs] = end diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py index 48c1e7be2d23..9f05046839c3 100644 --- a/vllm/config/vllm.py +++ b/vllm/config/vllm.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import bisect import copy import getpass import json @@ -369,12 +368,16 @@ def pad_for_cudagraph(self, batch_size: int) -> int: return self.compilation_config.bs_to_padded_graph_size[batch_size] def pad_for_vit_cudagraph(self, batch_size: int) -> int: - capture_sizes = self.compilation_config.vit_cudagraph_capture_sizes - # Find the insertion point for batch_size to maintain order. - # This gives the index of the first element >= batch_size. - idx = bisect.bisect_left(capture_sizes, batch_size) - - return capture_sizes[idx] if idx < len(capture_sizes) else batch_size + if ( + self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE + and hasattr(self.compilation_config, "max_vit_cudagraph_capture_size") + and self.compilation_config.max_vit_cudagraph_capture_size + and batch_size <= self.compilation_config.max_vit_cudagraph_capture_size + ): + # Use CUDA graphs. + # Add padding to the batch size. 
+ return self.compilation_config.bs_to_padded_vit_graph_size[batch_size] + return batch_size @property def needs_dp_coordinator(self) -> bool: @@ -1362,8 +1365,6 @@ def _set_vit_cudagraph_sizes(self): - Eager mode is not enforced. - CUDA graph mode is enabled. - The multimodal encoder compilation is enabled. - - A multimodal config is present. - - The multimodal encoder tensor parallelism mode is not "data". If these conditions are not met, the list of capture sizes will be empty, effectively disabling ViT CUDA graphs. @@ -1373,8 +1374,8 @@ def _set_vit_cudagraph_sizes(self): 3. If no sizes are provided by the user, a default list of sizes is generated up to a maximum of 5120. The default sizes are: - [16, 32, 64, 128, 256] + list(range(512, 2048, 64)) + list( - range(2048, 5120 + 1, 128)) + [512, 1024, 1536] + list(range(2048, 2048, 128)) + list( + range(4096, 8192 + 1, 256)) The final list of sizes is stored in `self.compilation_config.vit_cudagraph_capture_sizes`. @@ -1390,6 +1391,21 @@ def _set_vit_cudagraph_sizes(self): and self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE and self.compilation_config.compile_mm_encoder ): + # determine the initial max_vit_cudagraph_capture_size + max_vit_cudagraph_capture_size = ( + self.compilation_config.max_vit_cudagraph_capture_size + ) + if max_vit_cudagraph_capture_size is None: + from vllm.multimodal import MULTIMODAL_REGISTRY + from vllm.v1.core.encoder_cache_manager import compute_encoder_budget + + encoder_compute_budget, _ = compute_encoder_budget( + model_config=self.model_config, + scheduler_config=self.scheduler_config, + mm_registry=MULTIMODAL_REGISTRY, + ) + max_vit_cudagraph_capture_size = min(encoder_compute_budget, 8192) + # determine the vit_cudagraph_capture_sizes if self.compilation_config.vit_cudagraph_capture_sizes is not None: # de-duplicate the sizes provided by the config @@ -1400,15 +1416,6 @@ def _set_vit_cudagraph_sizes(self): # sort to make sure the sizes are in ascending order 
vit_cudagraph_capture_sizes.sort() else: - from vllm.multimodal import MULTIMODAL_REGISTRY - from vllm.v1.core.encoder_cache_manager import compute_encoder_budget - - encoder_compute_budget, _ = compute_encoder_budget( - model_config=self.model_config, - scheduler_config=self.scheduler_config, - mm_registry=MULTIMODAL_REGISTRY, - ) - max_vit_cudagraph_capture_size = min(encoder_compute_budget, 8192) vit_cudagraph_capture_sizes = [ i for i in [512, 1024, 1536] if i <= max_vit_cudagraph_capture_size ] @@ -1422,13 +1429,43 @@ def _set_vit_cudagraph_sizes(self): vit_cudagraph_capture_sizes += list( range(4096, max_vit_cudagraph_capture_size + 1, 256) ) + + # user-specific compilation_config.max_vit_cudagraph_capture_size get + # truncated to valid_max_size when they are inconsistent. + valid_max_size = ( + vit_cudagraph_capture_sizes[-1] if vit_cudagraph_capture_sizes else 0 + ) + if ( + self.compilation_config.max_vit_cudagraph_capture_size is not None + and self.compilation_config.max_vit_cudagraph_capture_size + != valid_max_size + ): + # raise error only when both two flags are user-specified + # and they are inconsistent with each other + if self.compilation_config.vit_cudagraph_capture_sizes is not None: + raise ValueError( + "customized max_vit_cudagraph_capture_size" + f"(={self.compilation_config.max_vit_cudagraph_capture_size}) " + "should be consistent with the max value of " + f"vit_cudagraph_capture_sizes(={valid_max_size})" + ) + + logger.warning( + "Truncating max_vit_cudagraph_capture_size to %d", + valid_max_size, + ) + # always set the final max_vit_cudagraph_capture_size + self.compilation_config.max_vit_cudagraph_capture_size = valid_max_size self.compilation_config.vit_cudagraph_capture_sizes = ( vit_cudagraph_capture_sizes ) else: # no cudagraph in use + self.compilation_config.max_vit_cudagraph_capture_size = 0 self.compilation_config.vit_cudagraph_capture_sizes = [] + self.compilation_config.compute_bs_to_padded_vit_graph_size() + def 
try_verify_and_update_config(self): if self.model_config is None: return diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index 02e6af8334ac..cd23032b0d0c 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -27,6 +27,7 @@ """Inference-only Qwen2.5-VL model compatible with HuggingFace weights.""" from collections.abc import Callable, Iterable, Iterator, Mapping, Sequence +from contextlib import nullcontext from functools import lru_cache, partial from typing import Annotated, Any, Literal, TypeAlias @@ -46,7 +47,7 @@ from vllm.config import CUDAGraphMode, MultiModalConfig, VllmConfig, get_current_vllm_config from vllm.distributed import parallel_state from vllm.distributed import utils as dist_utils -from vllm.forward_context import get_forward_context, set_forward_context +from vllm.forward_context import get_forward_context, set_forward_context, is_forward_context_available from vllm.logger import init_logger from vllm.model_executor.layers.activation import get_act_and_mul_fn from vllm.model_executor.layers.attention import MMEncoderAttention @@ -1256,7 +1257,8 @@ def _parse_and_validate_video_input( ) def _process_image_input( - self, image_input: Qwen2_5_VLImageInputs + self, image_input: Qwen2_5_VLImageInputs, + cudagraph_dispatcher: Any | None = None, ) -> tuple[torch.Tensor, ...]: grid_thw = image_input["image_grid_thw"] assert grid_thw.ndim == 2 @@ -1266,21 +1268,24 @@ def _process_image_input( image_embeds = image_input["image_embeds"].type(self.visual.dtype) else: pixel_values = image_input["pixel_values"] - if self.vllm_config.is_in_compile: - with set_forward_context(None, self.vllm_config): - if self.use_data_parallel: - return run_dp_sharded_mrope_vision_model( - self.visual, - pixel_values, - grid_thw_list, - rope_type="rope_3d", - ) - else: - image_embeds = self.visual(pixel_values, grid_thw=grid_thw_list) - else: - if self.use_data_parallel: + 
maybe_in_vit_cuda_graph_capture = False + if is_forward_context_available(): + ctx = get_forward_context() + if ctx.cudagraph_runtime_mode != CUDAGraphMode.NONE: + maybe_in_vit_cuda_graph_capture = True + context = ( + set_forward_context(None, self.vllm_config) + if self.vllm_config.is_in_compile + else nullcontext() + ) + with context: + if self.use_data_parallel and not maybe_in_vit_cuda_graph_capture: return run_dp_sharded_mrope_vision_model( - self.visual, pixel_values, grid_thw_list, rope_type="rope_3d" + self.visual, + pixel_values, + grid_thw_list, + rope_type="rope_3d", + cudagraph_dispatcher=cudagraph_dispatcher, ) else: image_embeds = self.visual(pixel_values, grid_thw=grid_thw_list) @@ -1322,7 +1327,8 @@ def _postprocess_image_embeds_evs( return tuple(image_embeds_split) def _process_video_input( - self, video_input: Qwen2_5_VLVideoInputs + self, video_input: Qwen2_5_VLVideoInputs, + cudagraph_dispatcher: Any | None = None, ) -> tuple[torch.Tensor, ...]: grid_thw = video_input["video_grid_thw"] assert grid_thw.ndim == 2 @@ -1339,6 +1345,7 @@ def _process_video_input( pixel_values_videos, grid_thw_list, rope_type="rope_3d", + cudagraph_dispatcher=cudagraph_dispatcher, ) else: video_embeds = self.visual( @@ -1488,6 +1495,7 @@ def _parse_and_validate_multimodal_inputs(self, **kwargs: object) -> dict: return mm_input_by_modality def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings: + cudagraph_dispatcher = kwargs.pop("cudagraph_dispatcher", None) mm_input_by_modality = self._parse_and_validate_multimodal_inputs(**kwargs) if not mm_input_by_modality: return [] @@ -1501,14 +1509,14 @@ def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings: for modality in mm_input_by_modality: multimodal_input = mm_input_by_modality[modality] if modality == "image": - image_embeddings = self._process_image_input(multimodal_input) + image_embeddings = self._process_image_input(multimodal_input, cudagraph_dispatcher=cudagraph_dispatcher) if 
self.is_multimodal_pruning_enabled: image_embeddings = self._postprocess_image_embeds_evs( image_embeddings, multimodal_input ) multimodal_embeddings += tuple(image_embeddings) if modality == "video": - video_embeddings = self._process_video_input(multimodal_input) + video_embeddings = self._process_video_input(multimodal_input, cudagraph_dispatcher=cudagraph_dispatcher) if self.is_multimodal_pruning_enabled: video_embeddings = self._postprocess_video_embeds_evs( video_embeddings, multimodal_input diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index a4464d9868f4..4ddc0c85e627 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py @@ -25,6 +25,7 @@ """Inference-only Qwen3VL model compatible with HuggingFace weights.""" from collections.abc import Callable, Iterable, Iterator, Mapping, Sequence +from contextlib import nullcontext from functools import lru_cache, partial from itertools import islice from typing import Any @@ -49,7 +50,7 @@ from transformers.video_utils import VideoMetadata from vllm.compilation.decorators import support_torch_compile -from vllm.config import CUDAGraphMode, MultiModalConfig, VllmConfig, set_current_vllm_config, get_current_vllm_config +from vllm.config import CUDAGraphMode, MultiModalConfig, VllmConfig, get_current_vllm_config from vllm.config.multimodal import BaseDummyOptions, VideoDummyOptions from vllm.distributed import get_pp_group from vllm.forward_context import get_forward_context, set_forward_context, is_forward_context_available @@ -648,7 +649,8 @@ def forward( hidden_states ) deepstack_feature_lists.append(deepstack_feature) - hidden_states = self.merger(hidden_states) + with set_is_first_graph_in_sequence(False), set_is_last_graph_in_sequence(True): + hidden_states = self.merger(hidden_states) hidden_states = torch.cat( [hidden_states] + deepstack_feature_lists, dim=1 ) # [seq_len, hidden_size * (1 + depth_of_deepstack)] @@ -1492,26 
+1494,25 @@ def _process_image_input( image_embeds = image_input["image_embeds"].type(self.visual.dtype) else: pixel_values = image_input["pixel_values"].type(self.visual.dtype) - maybe_in_vit_cuda_graph_capture = is_forward_context_available() - if self.vllm_config.is_in_compile: - with set_forward_context(None, self.vllm_config): - if self.use_data_parallel and not maybe_in_vit_cuda_graph_capture: - return run_dp_sharded_mrope_vision_model( - self.visual, - pixel_values, - grid_thw_list, - rope_type="rope_3d", - cudagraph_dispatcher=cudagraph_dispatcher, - ) - else: - image_embeds = self.visual(pixel_values, grid_thw=grid_thw_list) - else: + maybe_in_vit_cuda_graph_capture = False + if is_forward_context_available(): + ctx = get_forward_context() + if ctx.cudagraph_runtime_mode != CUDAGraphMode.NONE: + maybe_in_vit_cuda_graph_capture = True + context = ( + set_forward_context(None, self.vllm_config) + if self.vllm_config.is_in_compile + else nullcontext() + ) + with context: if self.use_data_parallel and not maybe_in_vit_cuda_graph_capture: - with set_current_vllm_config(self.vllm_config): - return run_dp_sharded_mrope_vision_model( - self.visual, pixel_values, grid_thw_list, rope_type="rope_3d", - cudagraph_dispatcher=cudagraph_dispatcher, - ) + return run_dp_sharded_mrope_vision_model( + self.visual, + pixel_values, + grid_thw_list, + rope_type="rope_3d", + cudagraph_dispatcher=cudagraph_dispatcher, + ) else: image_embeds = self.visual(pixel_values, grid_thw=grid_thw_list) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index bda3d3f8999c..3db59c96cb30 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -28,6 +28,7 @@ CompilationMode, CUDAGraphMode, VllmConfig, + set_current_vllm_config, get_layers_from_vllm_config, update_config, ) @@ -553,7 +554,7 @@ def __init__( self.compilation_config.cudagraph_capture_sizes ) # self.vit_cudagraph_batch_sizes sorts in ascending order. 
- self.vit_cudagraph_batch_sizes: list[int] | None + self.vit_cudagraph_batch_sizes: list[int] | None = None if ( self.compilation_config.vit_cudagraph_capture_sizes and self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE @@ -561,8 +562,6 @@ def __init__( self.vit_cudagraph_batch_sizes = sorted( self.compilation_config.vit_cudagraph_capture_sizes ) - else: - self.vit_cudagraph_batch_sizes = None # Cache the device properties. self._init_device_properties() @@ -2465,14 +2464,7 @@ def _execute_mm_encoder( # Treat padding as a new virtual image. # Assuming a fixed patch size where height is merge_size. - merge_size = getattr( - self.model_config.hf_config.vision_config, - "spatial_merge_size", - 1, - ) - assert padding_amount % (merge_size * merge_size) == 0 - h_patches = merge_size - w_patches = padding_amount // h_patches + h_patches, w_patches = self._get_dummy_h_w_patches(padding_amount) padding_grid_info = torch.tensor( [[1, h_patches, w_patches]], dtype=image_grid_thw.dtype, @@ -2499,11 +2491,11 @@ def _execute_mm_encoder( should_time, mm_lora_refs, current_item_idx, num_items ): curr_group_outputs = model.embed_multimodal(**mm_kwargs_group) - # Remove the padded items before sanity check - if original_num_imgs != -1: - curr_group_outputs = curr_group_outputs[:original_num_imgs] + # Remove the padded items before sanity check + if original_num_imgs != -1: + curr_group_outputs = curr_group_outputs[:original_num_imgs] else: - with self.timed_encoder_operation( + with set_current_vllm_config(self.vllm_config), self.timed_encoder_operation( should_time, mm_lora_refs, current_item_idx, num_items ): mm_kwargs_group["cudagraph_dispatcher"] = self.cudagraph_dispatcher @@ -4651,44 +4643,34 @@ def rand_inputs_embeds() -> torch.Tensor: yield inputs_embeds.fill_(0) + def _get_dummy_h_w_patches(self, patches: int): + vision_config = self.model_config.hf_config.vision_config + if hasattr(vision_config, "spatial_merge_size"): + merge_size = 
vision_config.spatial_merge_size + elif hasattr(vision_config, "merge_kernel_size"): + merge_size = vision_config.merge_kernel_size[0] + else: + merge_size = 1 + + assert patches % (merge_size * merge_size) == 0, ( + "Number of patches must be multiple of merge_size squared" + ) + h_patches = merge_size + w_patches = patches // merge_size + return h_patches, w_patches + def _get_dummy_vit_input( self, num_image_tokens: int, img_feature_dim: int ) -> BatchedTensorInputs: - """ - Generates dummy multimodal inputs for a single image, with a controllable - number of resulting image tokens for a Vision Transformer (ViT) like model, - ensuring a square-like aspect ratio for the patch grid. - - This is useful for profiling or testing, allowing the creation of inputs - that result in a specific number of image tokens after vision encoding. - - Args: - num_image_tokens: The desired number of image tokens after encoding. - - Returns: - A BatchedTensorInputs dictionary containing `pixel_values` and - `image_grid_thw` that can be passed as kwargs to - `embed_multimodal`. - """ - - def _get_dummy_h_w_patches(patches: int): - merge_size = getattr( - self.model_config.hf_config.vision_config, "spatial_merge_size", 1 - ) - assert patches % (merge_size * merge_size) == 0, ( - "Number of patches must be multiple of merge_size squared" - ) - h_patches = merge_size - w_patches = patches // merge_size - return h_patches, w_patches + """Dummy data for profiling and precompiling ViT.""" - # The first dimension of pixel_values corresponds to the total number of - # tokens (patches). + # The first dimension of pixel_values corresponds + # to the total number of patches. 
pixel_values = torch.zeros( (num_image_tokens, img_feature_dim), dtype=self.dtype, device=self.device ) - h_patches, w_patches = _get_dummy_h_w_patches(num_image_tokens) + h_patches, w_patches = self._get_dummy_h_w_patches(num_image_tokens) image_grid_thw = torch.tensor( [[1, h_patches, w_patches]], dtype=torch.long, device=self.device ) From ee801444ad6bccadccb6b1a811c0e72be9f26c70 Mon Sep 17 00:00:00 2001 From: Hongjian Zhang Date: Sun, 11 Jan 2026 20:44:51 +0800 Subject: [PATCH 20/35] rebase to v0.13.0 Signed-off-by: Hongjian Zhang --- vllm/compilation/monitor.py | 3 -- vllm/config/compilation.py | 2 +- vllm/config/vllm.py | 9 +++- vllm/model_executor/models/qwen2_5_vl.py | 50 ++++++++++----------- vllm/model_executor/models/qwen3_vl.py | 56 ++++++++++++------------ vllm/model_executor/models/vision.py | 32 +++++++++----- vllm/v1/cudagraph_dispatcher.py | 23 ++++++---- vllm/v1/worker/gpu_model_runner.py | 53 ++++++++++++---------- 8 files changed, 124 insertions(+), 104 deletions(-) diff --git a/vllm/compilation/monitor.py b/vllm/compilation/monitor.py index ca56574cfb30..2bad5f0a16fc 100644 --- a/vllm/compilation/monitor.py +++ b/vllm/compilation/monitor.py @@ -13,8 +13,6 @@ def start_monitoring_torch_compile(vllm_config: VllmConfig) -> None: - vllm_config.is_in_compile = True - global torch_compile_start_time torch_compile_start_time = time.time() @@ -43,7 +41,6 @@ def end_monitoring_torch_compile(vllm_config: VllmConfig) -> None: context_manager.__exit__(None, None, None) context_manager = None - vllm_config.is_in_compile = False cudagraph_capturing_enabled: bool = True diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py index 99543929f2dc..c573f2636be4 100644 --- a/vllm/config/compilation.py +++ b/vllm/config/compilation.py @@ -534,7 +534,7 @@ class CompilationConfig: """Sizes to capture vit cudagraph. - None (default): capture sizes are inferred from vllm config. 
- list[int]: capture sizes are specified as given.""" - max_vit_cudagraph_capture_size: int | None = field(default=None) + max_vit_cudagraph_capture_size: int = field(default=None) """The maximum vit cudagraph capture size. """ cudagraph_copy_inputs: bool = False diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py index 9f05046839c3..975045d085f8 100644 --- a/vllm/config/vllm.py +++ b/vllm/config/vllm.py @@ -257,8 +257,13 @@ class VllmConfig: performance, with -O0 having the best startup time and -O3 having the best performance. -02 is used by defult. See OptimizationLevel for full description.""" - is_in_compile: bool = False - """For ViT Compile, Compile Status Flag""" + is_in_compile_or_vit_cuda_graph_capture: bool = False + """Flag for ViT compilation or ViT CUDA graph capture. + + If true, ViT in DP mode will execute the ViT model directly instead of + `run_dp_sharded_mrope_vision_model` to ensure correct memory profiling + and compilation for each rank. + """ def compute_hash(self) -> str: """ diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index cd23032b0d0c..ce133ac02205 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -27,7 +27,6 @@ """Inference-only Qwen2.5-VL model compatible with HuggingFace weights.""" from collections.abc import Callable, Iterable, Iterator, Mapping, Sequence -from contextlib import nullcontext from functools import lru_cache, partial from typing import Annotated, Any, Literal, TypeAlias @@ -44,10 +43,10 @@ ) from vllm.compilation.decorators import support_torch_compile -from vllm.config import CUDAGraphMode, MultiModalConfig, VllmConfig, get_current_vllm_config +from vllm.config import CUDAGraphMode, MultiModalConfig, VllmConfig, set_current_vllm_config, get_current_vllm_config from vllm.distributed import parallel_state from vllm.distributed import utils as dist_utils -from vllm.forward_context import get_forward_context, 
set_forward_context, is_forward_context_available +from vllm.forward_context import get_forward_context, is_forward_context_available from vllm.logger import init_logger from vllm.model_executor.layers.activation import get_act_and_mul_fn from vllm.model_executor.layers.attention import MMEncoderAttention @@ -643,7 +642,8 @@ def __init__( ) vllm_config: VllmConfig = get_current_vllm_config() self._persistent_hidden_states_buffer: torch.Tensor | None = None - self._persistent_rotary_pos_emb_buffer: torch.Tensor | None = None + self._persistent_rotary_pos_emb_cos_buffer: torch.Tensor | None = None + self._persistent_rotary_pos_emb_sin_buffer: torch.Tensor | None = None if vllm_config.compilation_config.vit_cudagraph_capture_sizes: max_compile_size = ( vllm_config.compilation_config.vit_cudagraph_capture_sizes[-1] @@ -653,10 +653,11 @@ def __init__( device=self.device, dtype=self.dtype, ) - self._persistent_rotary_pos_emb_buffer = torch.empty( - (max_compile_size, head_dim // 2), - device=self.device, - dtype=torch.float32, + self._persistent_rotary_pos_emb_cos_buffer = torch.empty( + (max_compile_size, head_dim // 2), device=self.device, dtype=torch.bfloat16 + ) + self._persistent_rotary_pos_emb_sin_buffer = torch.empty( + (max_compile_size, head_dim // 2), device=self.device, dtype=torch.bfloat16 ) @property @@ -802,7 +803,9 @@ def forward( cu_window_seqlens: list = [torch.tensor([0], dtype=torch.int32)] cu_seqlens: list = [] - fwd_ctx = get_forward_context() + fwd_ctx = None + if is_forward_context_available(): + fwd_ctx = get_forward_context() if ( self._persistent_hidden_states_buffer is not None and fwd_ctx @@ -872,14 +875,17 @@ def forward( rotary_pos_emb_sin = rotary_pos_emb_sin.to( device=self.device, non_blocking=True ) - rotary_pos_emb = rotary_pos_emb.to(device=self.device, non_blocking=True) if ( - self._persistent_rotary_pos_emb_buffer is not None + self._persistent_rotary_pos_emb_sin_buffer is not None + and self._persistent_rotary_pos_emb_cos_buffer is 
not None and fwd_ctx and fwd_ctx.cudagraph_runtime_mode == CUDAGraphMode.PIECEWISE ): - rotary_pos_emb = self._persistent_rotary_pos_emb_buffer[:seq_len].copy_( - rotary_pos_emb + rotary_pos_emb_sin = self._persistent_rotary_pos_emb_sin_buffer[:seq_len].copy_( + rotary_pos_emb_sin + ) + rotary_pos_emb_cos = self._persistent_rotary_pos_emb_cos_buffer[:seq_len].copy_( + rotary_pos_emb_cos ) window_index = window_index.to(device=hidden_states.device, non_blocking=True) reverse_indices = reverse_indices.to( @@ -1268,18 +1274,8 @@ def _process_image_input( image_embeds = image_input["image_embeds"].type(self.visual.dtype) else: pixel_values = image_input["pixel_values"] - maybe_in_vit_cuda_graph_capture = False - if is_forward_context_available(): - ctx = get_forward_context() - if ctx.cudagraph_runtime_mode != CUDAGraphMode.NONE: - maybe_in_vit_cuda_graph_capture = True - context = ( - set_forward_context(None, self.vllm_config) - if self.vllm_config.is_in_compile - else nullcontext() - ) - with context: - if self.use_data_parallel and not maybe_in_vit_cuda_graph_capture: + with set_current_vllm_config(self.vllm_config): + if self.use_data_parallel and not self.vllm_config.is_in_compile_or_vit_cuda_graph_capture: return run_dp_sharded_mrope_vision_model( self.visual, pixel_values, @@ -1338,8 +1334,8 @@ def _process_video_input( video_embeds = video_input["video_embeds"].type(self.visual.dtype) else: pixel_values_videos = video_input["pixel_values_videos"] - with set_forward_context(None, self.vllm_config): - if self.use_data_parallel: + with set_current_vllm_config(self.vllm_config): + if self.use_data_parallel and not self.vllm_config.is_in_compile_or_vit_cuda_graph_capture: return run_dp_sharded_mrope_vision_model( self.visual, pixel_values_videos, diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index 4ddc0c85e627..9ee61fb4851c 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py 
@@ -25,7 +25,6 @@ """Inference-only Qwen3VL model compatible with HuggingFace weights.""" from collections.abc import Callable, Iterable, Iterator, Mapping, Sequence -from contextlib import nullcontext from functools import lru_cache, partial from itertools import islice from typing import Any @@ -50,10 +49,10 @@ from transformers.video_utils import VideoMetadata from vllm.compilation.decorators import support_torch_compile -from vllm.config import CUDAGraphMode, MultiModalConfig, VllmConfig, get_current_vllm_config +from vllm.config import CUDAGraphMode, MultiModalConfig, VllmConfig, set_current_vllm_config, get_current_vllm_config from vllm.config.multimodal import BaseDummyOptions, VideoDummyOptions from vllm.distributed import get_pp_group -from vllm.forward_context import get_forward_context, set_forward_context, is_forward_context_available +from vllm.forward_context import get_forward_context, is_forward_context_available from vllm.logger import init_logger from vllm.model_executor.layers.activation import _ACTIVATION_REGISTRY from vllm.model_executor.layers.conv import Conv3dLayer @@ -142,7 +141,8 @@ DUMMY_VIDEO_NUM_FRAMES = 2048 -@support_torch_compile(dynamic_arg_dims={"x": 0}) +@support_torch_compile(dynamic_arg_dims={"x": 0}, + enable_if=should_torch_compile_mm_vit) class Qwen3_VisionPatchEmbed(nn.Module): def __init__( self, @@ -210,8 +210,7 @@ def forward(self, x: torch.Tensor): @support_torch_compile( - dynamic_arg_dims={"x": 0, "cu_seqlens": 0, "rotary_pos_emb": 0, "seqlens": 0}, - mark_unbacked_dims={"seqlens": 0}, + dynamic_arg_dims={"x": 0, "cu_seqlens": 0, "rotary_pos_emb_cos": 0, "rotary_pos_emb_sin": 0}, enable_if=should_torch_compile_mm_vit, ) class Qwen3_VisionBlock(nn.Module): @@ -426,14 +425,18 @@ def __init__( ) vllm_config: VllmConfig = get_current_vllm_config() self._persistent_hidden_states_buffer: torch.Tensor | None = None - self._persistent_rotary_pos_emb_buffer: torch.Tensor | None = None + 
self._persistent_rotary_pos_emb_cos_buffer: torch.Tensor | None = None + self._persistent_rotary_pos_emb_sin_buffer: torch.Tensor | None = None if vllm_config.compilation_config.vit_cudagraph_capture_sizes: max_compile_size = vllm_config.compilation_config.vit_cudagraph_capture_sizes[-1] self._persistent_hidden_states_buffer = torch.empty( (max_compile_size, self.patch_embed.proj.input_size), device=self.device, dtype=self.dtype ) - self._persistent_rotary_pos_emb_buffer = torch.empty( - (max_compile_size, head_dim // 2), device=self.device, dtype=torch.float32 + self._persistent_rotary_pos_emb_cos_buffer = torch.empty( + (max_compile_size, head_dim // 2), device=self.device, dtype=torch.bfloat16 + ) + self._persistent_rotary_pos_emb_sin_buffer = torch.empty( + (max_compile_size, head_dim // 2), device=self.device, dtype=torch.bfloat16 ) @property @@ -567,7 +570,9 @@ def forward( grid_thw: torch.Tensor | list[list[int]], ) -> torch.Tensor: seq_len, _ = x.size() - fwd_ctx = get_forward_context() + fwd_ctx = None + if is_forward_context_available(): + fwd_ctx = get_forward_context() if ( self._persistent_hidden_states_buffer is not None and fwd_ctx @@ -597,15 +602,17 @@ def forward( original_hidden_states = hidden_states hidden_states = hidden_states + pos_embeds rotary_pos_emb_cos, rotary_pos_emb_sin = self.rot_pos_emb(grid_thw_list) - rotary_pos_emb = self.rot_pos_emb(grid_thw_list) - rotary_pos_emb = rotary_pos_emb.to(hidden_states.device, non_blocking=True) if ( - self._persistent_rotary_pos_emb_buffer is not None + self._persistent_rotary_pos_emb_sin_buffer is not None + and self._persistent_rotary_pos_emb_cos_buffer is not None and fwd_ctx and fwd_ctx.cudagraph_runtime_mode == CUDAGraphMode.PIECEWISE ): - rotary_pos_emb = self._persistent_rotary_pos_emb_buffer[:seq_len].copy_( - rotary_pos_emb + rotary_pos_emb_sin = self._persistent_rotary_pos_emb_sin_buffer[:seq_len].copy_( + rotary_pos_emb_sin + ) + rotary_pos_emb_cos = 
self._persistent_rotary_pos_emb_cos_buffer[:seq_len].copy_( + rotary_pos_emb_cos ) cu_seqlens = np.repeat(grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]).cumsum( @@ -1494,18 +1501,9 @@ def _process_image_input( image_embeds = image_input["image_embeds"].type(self.visual.dtype) else: pixel_values = image_input["pixel_values"].type(self.visual.dtype) - maybe_in_vit_cuda_graph_capture = False - if is_forward_context_available(): - ctx = get_forward_context() - if ctx.cudagraph_runtime_mode != CUDAGraphMode.NONE: - maybe_in_vit_cuda_graph_capture = True - context = ( - set_forward_context(None, self.vllm_config) - if self.vllm_config.is_in_compile - else nullcontext() - ) - with context: - if self.use_data_parallel and not maybe_in_vit_cuda_graph_capture: + + with set_current_vllm_config(self.vllm_config): + if self.use_data_parallel and not self.vllm_config.is_in_compile_or_vit_cuda_graph_capture: return run_dp_sharded_mrope_vision_model( self.visual, pixel_values, @@ -1535,8 +1533,8 @@ def _process_video_input( pixel_values_videos = video_input["pixel_values_videos"].type( self.visual.dtype ) - with set_forward_context(None, self.vllm_config): - if self.use_data_parallel: + with set_current_vllm_config(self.vllm_config): + if self.use_data_parallel and not self.vllm_config.is_in_compile_or_vit_cuda_graph_capture: return run_dp_sharded_mrope_vision_model( self.visual, pixel_values_videos, diff --git a/vllm/model_executor/models/vision.py b/vllm/model_executor/models/vision.py index d4e9dd9fa159..70e9a0011811 100644 --- a/vllm/model_executor/models/vision.py +++ b/vllm/model_executor/models/vision.py @@ -10,7 +10,7 @@ import torch from transformers import PretrainedConfig -from vllm.config import MultiModalConfig, VllmConfig, get_current_vllm_config +from vllm.config import MultiModalConfig, VllmConfig, CUDAGraphMode, get_current_vllm_config from vllm.distributed import ( get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, @@ -479,10 +479,25 @@ def 
run_dp_sharded_mrope_vision_model( vllm_config = get_current_vllm_config() use_cudagraph = False + # Context setup + if cudagraph_dispatcher is not None: + dispatcher = cudagraph_dispatcher + else: + dispatcher = CudagraphDispatcher(vllm_config) + cudagraph_runtime_mode = CUDAGraphMode.NONE + batch_descriptor = None + if (vllm_config and vllm_config.compilation_config.vit_cudagraph_capture_sizes): max_input_len = max(grouped_pixel_values_len) if grouped_pixel_values_len else 0 - target_input_len = vllm_config.pad_for_vit_cudagraph(max_input_len) + cudagraph_runtime_mode, batch_descriptor = dispatcher.dispatch( + num_tokens=max_input_len, + uniform_decode=False, + has_lora=False, + disable_full=False, + is_vit=True, + ) + target_input_len = batch_descriptor.num_tokens max_len_per_rank = target_input_len // embed_dim_reduction_factor use_cudagraph = True else: @@ -506,17 +521,10 @@ def run_dp_sharded_mrope_vision_model( pixel_values_local = torch.cat([pixel_values_local, padding], dim=0) local_grid_thw_list.append([1, merge_size, padding_size // merge_size]) - # Context setup - if cudagraph_dispatcher is not None: - dispatcher = cudagraph_dispatcher - else: - dispatcher = CudagraphDispatcher(vllm_config) - batch_descriptor = BatchDescriptor(num_tokens=pixel_values_local.shape[0], is_vit=True) - cudagraph_runtime_mode, batch_descriptor = dispatcher.dispatch(batch_descriptor, False) with set_forward_context( - None, - vllm_config=vllm_config, - cudagraph_runtime_mode=cudagraph_runtime_mode, + None, + vllm_config=vllm_config, + cudagraph_runtime_mode=cudagraph_runtime_mode, batch_descriptor=batch_descriptor ): # Run the vision model on the local pixel_values_local diff --git a/vllm/v1/cudagraph_dispatcher.py b/vllm/v1/cudagraph_dispatcher.py index 3368f97fe3b3..71226d7c283d 100644 --- a/vllm/v1/cudagraph_dispatcher.py +++ b/vllm/v1/cudagraph_dispatcher.py @@ -125,10 +125,14 @@ def _create_padded_batch_descriptor( uniform_decode: bool, has_lora: bool, num_active_loras: 
int = 0, + is_vit: bool = False, ) -> BatchDescriptor: max_num_seqs = self.vllm_config.scheduler_config.max_num_seqs uniform_decode_query_len = self.uniform_decode_query_len - num_tokens_padded = self._bs_to_padded_graph_size[num_tokens] + if is_vit: + num_tokens_padded = self.vllm_config.pad_for_vit_cudagraph(num_tokens) + else: + num_tokens_padded = self._bs_to_padded_graph_size[num_tokens] if uniform_decode and self.cudagraph_mode.has_mode(CUDAGraphMode.FULL): num_reqs = num_tokens_padded // uniform_decode_query_len @@ -143,6 +147,7 @@ def _create_padded_batch_descriptor( uniform=uniform_decode, has_lora=has_lora, num_active_loras=num_active_loras, + is_vit=is_vit ) def add_cudagraph_key( @@ -187,12 +192,12 @@ def initialize_cudagraph_keys( ).relax_for_mixed_batch_cudagraphs(), ) # ViT CUDAGraph Entry - for vit_patch_len in self.compilation_config.vit_cudagraph_capture_sizes: + for patch_len in self.compilation_config.vit_cudagraph_capture_sizes: self.add_cudagraph_key( cudagraph_mode.mixed_mode(), - BatchDescriptor( - num_tokens=vit_patch_len, uniform_decode=False, is_vit=True - ), + self._create_padded_batch_descriptor( + patch_len, False, False, is_vit=True + ).relax_for_mixed_batch_cudagraphs(), ) # if decode cudagraph mode is FULL, and we don't already have mixed @@ -229,6 +234,7 @@ def dispatch( has_lora: bool = False, disable_full: bool = False, num_active_loras: int = 0, + is_vit: bool = False, ) -> tuple[CUDAGraphMode, BatchDescriptor]: """ Given conditions(e.g.,batch descriptor and if using piecewise only), @@ -249,9 +255,10 @@ def dispatch( if ( not self.keys_initialized or self.cudagraph_mode == CUDAGraphMode.NONE - or num_tokens > self.compilation_config.max_cudagraph_capture_size + or (not is_vit and num_tokens > self.compilation_config.max_cudagraph_capture_size) + or (is_vit and num_tokens > self.compilation_config.max_vit_cudagraph_capture_size) ): - return CUDAGraphMode.NONE, BatchDescriptor(num_tokens) + return CUDAGraphMode.NONE, 
BatchDescriptor(num_tokens, is_vit=is_vit) effective_num_active_loras = num_active_loras if has_lora and num_active_loras > 0: @@ -270,7 +277,7 @@ def dispatch( effective_num_active_loras = self.vllm_config.lora_config.max_loras + 1 batch_desc = self._create_padded_batch_descriptor( - num_tokens, uniform_decode, has_lora, effective_num_active_loras + num_tokens, uniform_decode, has_lora, effective_num_active_loras, is_vit ) relaxed_batch_desc = batch_desc.relax_for_mixed_batch_cudagraphs() diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 3db59c96cb30..5d88be3212d0 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -28,7 +28,6 @@ CompilationMode, CUDAGraphMode, VllmConfig, - set_current_vllm_config, get_layers_from_vllm_config, update_config, ) @@ -2437,14 +2436,23 @@ def _execute_mm_encoder( if not is_vit_dp_mode: original_num_imgs = -1 padded_num_tokens = -1 + + # Default values for non-ViT cudagraph case + cudagraph_runtime_mode = CUDAGraphMode.NONE + batch_descriptor = None if self.vit_cudagraph_batch_sizes and "pixel_values" in mm_kwargs_group: pixel_values = mm_kwargs_group["pixel_values"] num_tokens = pixel_values.shape[0] - # Pad to the size expected by CUDA graph - padded_num_tokens = self.vllm_config.pad_for_vit_cudagraph( - num_tokens + # get batch_descriptor from dispatcher + cudagraph_runtime_mode, batch_descriptor = self.cudagraph_dispatcher.dispatch( + num_tokens=num_tokens, + uniform_decode=False, + has_lora=False, + disable_full=False, + is_vit=True, ) + padded_num_tokens = batch_descriptor.num_tokens if padded_num_tokens > num_tokens: padding_amount = padded_num_tokens - num_tokens @@ -2474,19 +2482,11 @@ def _execute_mm_encoder( [image_grid_thw, padding_grid_info], dim=0 ) - # get batch_descriptor from dispatcher - batch_descriptor = BatchDescriptor( - num_tokens=padded_num_tokens, - is_vit=True, - ) - cudagraph_runtime_mode, batch_descriptor = ( - 
self.cudagraph_dispatcher.dispatch(batch_descriptor, False) - ) with set_forward_context( - None, - vllm_config=self.vllm_config, - cudagraph_runtime_mode=cudagraph_runtime_mode, - batch_descriptor=batch_descriptor, + None, + vllm_config=self.vllm_config, + cudagraph_runtime_mode=cudagraph_runtime_mode, + batch_descriptor=batch_descriptor, ), self.timed_encoder_operation( should_time, mm_lora_refs, current_item_idx, num_items ): @@ -5213,6 +5213,7 @@ def _dummy_mm_encoder_run( self, compilation_cases: list[int], ) -> None: + self.vllm_config.is_in_compile_or_vit_cuda_graph_capture = True tmp_dummy_mm_inputs = self._get_mm_dummy_batch( "video", 1, @@ -5228,21 +5229,27 @@ def _dummy_mm_encoder_run( # Lazy initialization of the persistent buffer for capture_size in compilation_cases: dummy_mm_inputs = self._get_dummy_vit_input(capture_size, img_feature_dim) - batch_descriptor = BatchDescriptor( + cudagraph_mode, batch_descriptor = self.cudagraph_dispatcher.dispatch( num_tokens=capture_size, + uniform_decode=False, + has_lora=False, + disable_full=False, is_vit=True, ) with ( set_forward_context( None, - vllm_config=self.vllm_config, - cudagraph_runtime_mode=CUDAGraphMode.PIECEWISE, + self.vllm_config, + num_tokens=capture_size, + cudagraph_runtime_mode=cudagraph_mode, batch_descriptor=batch_descriptor, ), ): self.model.embed_multimodal(**dummy_mm_inputs) + self.vllm_config.is_in_compile_or_vit_cuda_graph_capture = False def profile_run(self) -> None: + self.vllm_config.is_in_compile_or_vit_cuda_graph_capture = True # Profile with multimodal encoder & encoder cache. if self.supports_mm_inputs: mm_config = self.model_config.multimodal_config @@ -5280,9 +5287,10 @@ def profile_run(self) -> None: ) # Run multimodal encoder. 
- dummy_encoder_outputs = self.model.embed_multimodal( - **batched_dummy_mm_inputs - ) + with set_forward_context(None, self.vllm_config): + dummy_encoder_outputs = self.model.embed_multimodal( + **batched_dummy_mm_inputs + ) sanity_check_mm_encoder_outputs( dummy_encoder_outputs, @@ -5306,6 +5314,7 @@ def profile_run(self) -> None: del hidden_states, output self.encoder_cache.clear() gc.collect() + self.vllm_config.is_in_compile_or_vit_cuda_graph_capture = False def capture_model(self) -> int: if self.compilation_config.cudagraph_mode == CUDAGraphMode.NONE: From f8defd77f5ad37da20966186502887c9762b0ea5 Mon Sep 17 00:00:00 2001 From: Hongjian Zhang Date: Sun, 11 Jan 2026 22:15:40 +0800 Subject: [PATCH 21/35] chore: Reduce unnecessary computations in ViT dp mode Signed-off-by: Hongjian Zhang --- vllm/model_executor/models/vision.py | 23 +++++++---------------- 1 file changed, 7 insertions(+), 16 deletions(-) diff --git a/vllm/model_executor/models/vision.py b/vllm/model_executor/models/vision.py index 70e9a0011811..6fff76a44a25 100644 --- a/vllm/model_executor/models/vision.py +++ b/vllm/model_executor/models/vision.py @@ -476,8 +476,10 @@ def run_dp_sharded_mrope_vision_model( # The output embedding of every DP rank has to be # padded to this length for tensor_model_parallel_all_gather # to work + max_len_per_rank = max(grouped_pixel_values_len) // embed_dim_reduction_factor + local_grid_thw_list = [grid_thw_list[i] for i in image_idxs_local] + vllm_config = get_current_vllm_config() - use_cudagraph = False # Context setup if cudagraph_dispatcher is not None: @@ -489,28 +491,17 @@ def run_dp_sharded_mrope_vision_model( if (vllm_config and vllm_config.compilation_config.vit_cudagraph_capture_sizes): - max_input_len = max(grouped_pixel_values_len) if grouped_pixel_values_len else 0 + current_input_len = pixel_values_local.shape[0] cudagraph_runtime_mode, batch_descriptor = dispatcher.dispatch( - num_tokens=max_input_len, + num_tokens=current_input_len, 
uniform_decode=False, has_lora=False, disable_full=False, is_vit=True, ) target_input_len = batch_descriptor.num_tokens - max_len_per_rank = target_input_len // embed_dim_reduction_factor - use_cudagraph = True - else: - max_len_per_rank = (max(grouped_pixel_values_len) if grouped_pixel_values_len else 0) // embed_dim_reduction_factor - - local_grid_thw_list = [grid_thw_list[i] for i in image_idxs_local] - - # Pad pixel_values_local for CUDA graph if needed - if use_cudagraph: - current_input_len = pixel_values_local.shape[0] - # target_input_len derived from max_len_per_rank for consistency - target_input_len = max_len_per_rank * embed_dim_reduction_factor - + + # Pad pixel_values_local for CUDA graph if needed if current_input_len < target_input_len: padding_size = target_input_len - current_input_len padding = torch.empty( From 602c69273dc32be501594112e258a49715004780 Mon Sep 17 00:00:00 2001 From: Hongjian Zhang Date: Fri, 16 Jan 2026 14:22:52 +0800 Subject: [PATCH 22/35] fix: truncate padded output in CUDA graph execution to prevent all_gather hang Signed-off-by: Hongjian Zhang --- vllm/model_executor/models/vision.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/vllm/model_executor/models/vision.py b/vllm/model_executor/models/vision.py index 6fff76a44a25..0396408ae819 100644 --- a/vllm/model_executor/models/vision.py +++ b/vllm/model_executor/models/vision.py @@ -566,6 +566,9 @@ def run_dp_sharded_mrope_vision_model( device=image_embeds_local.device, ) image_embeds_local_padded = torch.cat([image_embeds_local, padding], dim=0) + # truncate the padded output from CUDA graph execution + elif current_len > max_len_per_rank: + image_embeds_local_padded = image_embeds_local[:max_len_per_rank] else: image_embeds_local_padded = image_embeds_local From c1746c1af1815fa2dfe9dfec646b4e8a06de520d Mon Sep 17 00:00:00 2001 From: Hongjian Zhang Date: Wed, 21 Jan 2026 16:53:45 +0800 Subject: [PATCH 23/35] fix: change padding init from empty to zeros to avoid FA3 
issues Signed-off-by: Hongjian Zhang --- vllm/model_executor/models/vision.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/models/vision.py b/vllm/model_executor/models/vision.py index 0396408ae819..22c9a30c23c6 100644 --- a/vllm/model_executor/models/vision.py +++ b/vllm/model_executor/models/vision.py @@ -504,7 +504,7 @@ def run_dp_sharded_mrope_vision_model( # Pad pixel_values_local for CUDA graph if needed if current_input_len < target_input_len: padding_size = target_input_len - current_input_len - padding = torch.empty( + padding = torch.zeros( (padding_size, pixel_values_local.shape[1]), device=pixel_values_local.device, dtype=pixel_values_local.dtype, From 99d8272915f67d35baaeab8eef73c82a8e78d9ff Mon Sep 17 00:00:00 2001 From: Hongjian Zhang Date: Fri, 23 Jan 2026 15:39:44 +0800 Subject: [PATCH 24/35] rebase to main 7ef587 Signed-off-by: Hongjian Zhang --- vllm/compilation/backends.py | 5 ++--- vllm/config/compilation.py | 16 ---------------- vllm/config/vllm.py | 14 -------------- vllm/model_executor/models/qwen2_5_vl.py | 12 ++++++------ vllm/model_executor/models/qwen3_vl.py | 12 ++++++------ vllm/v1/cudagraph_dispatcher.py | 19 ++++++++++++++++++- 6 files changed, 32 insertions(+), 46 deletions(-) diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index 20e74d619adf..615948072f72 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -56,7 +56,7 @@ @contextmanager -def set_is_last_graph_in_sequence(is_last: bool): +def set_is_last_graph_in_vit_sequence(is_last: bool): """Context manager to indicate if the current graph being compiled is the last one in a sequence of graphs (e.g., a sequence of blocks). 
""" @@ -75,7 +75,7 @@ def set_is_last_graph_in_sequence(is_last: bool): @contextmanager -def set_is_first_graph_in_sequence(is_first: bool): +def set_is_first_graph_in_vit_sequence(is_first: bool): """Context manager to indicate if the current graph being compiled is the first one in a sequence of graphs (e.g., a sequence of blocks). """ @@ -124,7 +124,6 @@ def copy_and_call(*args: Any) -> Any: return copy_and_call - def make_compiler(compilation_config: CompilationConfig) -> CompilerInterface: assert not envs.VLLM_USE_MEGA_AOT_ARTIFACT or envs.VLLM_USE_STANDALONE_COMPILE, ( "VLLM_USE_MEGA_AOT_ARTIFACT=1 requires VLLM_USE_STANDALONE_COMPILE=1" diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py index c573f2636be4..2da96d938765 100644 --- a/vllm/config/compilation.py +++ b/vllm/config/compilation.py @@ -1169,19 +1169,3 @@ def get_compile_ranges(self) -> list[Range]: Range(start=s + 1, end=e) for s, e in zip([0] + split_points[:-1], split_points) ] - - def compute_bs_to_padded_vit_graph_size(self): - # pre-compute the mapping from batch size to padded graph size - self.bs_to_padded_vit_graph_size = [ - 0 for i in range(self.max_vit_cudagraph_capture_size + 1) - ] - for end, start in zip( - self.vit_cudagraph_capture_sizes - + [self.max_vit_cudagraph_capture_size + 1], - [0] + self.vit_cudagraph_capture_sizes, - ): - for bs in range(start, end): - if bs == start: - self.bs_to_padded_vit_graph_size[bs] = start - else: - self.bs_to_padded_vit_graph_size[bs] = end diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py index 975045d085f8..90b54e9e2396 100644 --- a/vllm/config/vllm.py +++ b/vllm/config/vllm.py @@ -372,18 +372,6 @@ def pad_for_cudagraph(self, batch_size: int) -> int: # i.e., batch_size <= self.compilation_config.max_cudagraph_capture_size return self.compilation_config.bs_to_padded_graph_size[batch_size] - def pad_for_vit_cudagraph(self, batch_size: int) -> int: - if ( - self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE - and 
hasattr(self.compilation_config, "max_vit_cudagraph_capture_size") - and self.compilation_config.max_vit_cudagraph_capture_size - and batch_size <= self.compilation_config.max_vit_cudagraph_capture_size - ): - # Use CUDA graphs. - # Add padding to the batch size. - return self.compilation_config.bs_to_padded_vit_graph_size[batch_size] - return batch_size - @property def needs_dp_coordinator(self) -> bool: """ @@ -1469,8 +1457,6 @@ def _set_vit_cudagraph_sizes(self): self.compilation_config.max_vit_cudagraph_capture_size = 0 self.compilation_config.vit_cudagraph_capture_sizes = [] - self.compilation_config.compute_bs_to_padded_vit_graph_size() - def try_verify_and_update_config(self): if self.model_config is None: return diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index ce133ac02205..a7c16401360a 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -817,11 +817,11 @@ def forward( hidden_states = x.to(device=self.device, dtype=self.dtype) from vllm.compilation.backends import ( - set_is_first_graph_in_sequence, - set_is_last_graph_in_sequence, + set_is_first_graph_in_vit_sequence, + set_is_last_graph_in_vit_sequence, ) - with set_is_first_graph_in_sequence(True), set_is_last_graph_in_sequence(False): + with set_is_first_graph_in_vit_sequence(True), set_is_last_graph_in_vit_sequence(False): hidden_states = self.patch_embed(hidden_states) window_index_id = 0 @@ -913,8 +913,8 @@ def forward( hidden_states = original_hidden_states with ( - set_is_first_graph_in_sequence(False), - set_is_last_graph_in_sequence(False), + set_is_first_graph_in_vit_sequence(False), + set_is_last_graph_in_vit_sequence(False), ): for layer_num, blk in enumerate(self.blocks): if layer_num in self.fullatt_block_indexes: @@ -938,7 +938,7 @@ def forward( hidden_states = cast_overflow_tensors(hidden_states) # adapter - with set_is_first_graph_in_sequence(False), 
set_is_last_graph_in_sequence(True): + with set_is_first_graph_in_vit_sequence(False), set_is_last_graph_in_vit_sequence(True): hidden_states = self.merger(hidden_states) hidden_states = hidden_states[reverse_indices, :] return hidden_states diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index 9ee61fb4851c..c012e17adfcb 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py @@ -584,11 +584,11 @@ def forward( hidden_states = x.to(device=self.device, dtype=self.dtype, non_blocking=True) from vllm.compilation.backends import ( - set_is_first_graph_in_sequence, - set_is_last_graph_in_sequence, + set_is_first_graph_in_vit_sequence, + set_is_last_graph_in_vit_sequence, ) - with set_is_first_graph_in_sequence(True), set_is_last_graph_in_sequence(False): + with set_is_first_graph_in_vit_sequence(True), set_is_last_graph_in_vit_sequence(False): hidden_states = self.patch_embed(hidden_states) if isinstance(grid_thw, list): @@ -639,8 +639,8 @@ def forward( deepstack_feature_lists = [] with ( - set_is_first_graph_in_sequence(False), - set_is_last_graph_in_sequence(False), + set_is_first_graph_in_vit_sequence(False), + set_is_last_graph_in_vit_sequence(False), ): for layer_num, blk in enumerate(self.blocks): hidden_states = blk( @@ -656,7 +656,7 @@ def forward( hidden_states ) deepstack_feature_lists.append(deepstack_feature) - with set_is_first_graph_in_sequence(False), set_is_last_graph_in_sequence(True): + with set_is_first_graph_in_vit_sequence(False), set_is_last_graph_in_vit_sequence(True): hidden_states = self.merger(hidden_states) hidden_states = torch.cat( [hidden_states] + deepstack_feature_lists, dim=1 diff --git a/vllm/v1/cudagraph_dispatcher.py b/vllm/v1/cudagraph_dispatcher.py index 71226d7c283d..90572805f750 100644 --- a/vllm/v1/cudagraph_dispatcher.py +++ b/vllm/v1/cudagraph_dispatcher.py @@ -119,6 +119,22 @@ def _get_lora_cases(self) -> list[int]: # No specialization: only 
capture graphs with LoRA active return [lora_config.max_loras + 1] + def _compute_bs_to_padded_vit_graph_size(self) -> None: + """pre-compute the mapping from batch size to ViT padded graph size.""" + max_size = self.compilation_config.max_vit_cudagraph_capture_size + capture_sizes = self.compilation_config.vit_cudagraph_capture_sizes + self._bs_to_padded_vit_graph_size: list[int] = [0] * (max_size + 1) + for end, start in zip( + capture_sizes + [max_size + 1], + [0] + capture_sizes, + ): + for bs in range(start, end): + if bs == start: + self._bs_to_padded_vit_graph_size[bs] = start + else: + self._bs_to_padded_vit_graph_size[bs] = end + + def _create_padded_batch_descriptor( self, num_tokens: int, @@ -130,7 +146,7 @@ def _create_padded_batch_descriptor( max_num_seqs = self.vllm_config.scheduler_config.max_num_seqs uniform_decode_query_len = self.uniform_decode_query_len if is_vit: - num_tokens_padded = self.vllm_config.pad_for_vit_cudagraph(num_tokens) + num_tokens_padded = self._bs_to_padded_vit_graph_size[num_tokens] else: num_tokens_padded = self._bs_to_padded_graph_size[num_tokens] @@ -171,6 +187,7 @@ def initialize_cudagraph_keys( return self._compute_bs_to_padded_graph_size() + self._compute_bs_to_padded_vit_graph_size() # Get LoRA cases to capture lora_cases = self._get_lora_cases() From 79ea2407131d3dbdd7c90f4eab7f3491f8876539 Mon Sep 17 00:00:00 2001 From: Hongjian Zhang Date: Mon, 26 Jan 2026 15:53:59 +0800 Subject: [PATCH 25/35] rebase to ff6c1d Signed-off-by: Hongjian Zhang --- vllm/v1/cudagraph_dispatcher.py | 8 +++++--- vllm/v1/worker/gpu_model_runner.py | 1 + 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/vllm/v1/cudagraph_dispatcher.py b/vllm/v1/cudagraph_dispatcher.py index 90572805f750..0048ef61c3c3 100644 --- a/vllm/v1/cudagraph_dispatcher.py +++ b/vllm/v1/cudagraph_dispatcher.py @@ -315,7 +315,7 @@ def dispatch( # finally, just return no cudagraphs and a trivial batch descriptor return CUDAGraphMode.NONE, 
BatchDescriptor(num_tokens) - def get_capture_descs(self) -> list[tuple[CUDAGraphMode, list[BatchDescriptor]]]: + def get_capture_descs(self, is_vit: bool = False) -> list[tuple[CUDAGraphMode, list[BatchDescriptor]]]: """ Returns capture descriptors for cudagraph capturing. @@ -333,7 +333,9 @@ def get_capture_descs(self) -> list[tuple[CUDAGraphMode, list[BatchDescriptor]]] descs = list(self.cudagraph_keys[mode]) if descs: # Sort by num_tokens descending (largest first) - descs.sort(key=lambda d: d.num_tokens, reverse=True) - result.append((mode, descs)) + filter_descs = [d for d in descs if d.is_vit == is_vit] + if filter_descs: + filter_descs.sort(key=lambda d: d.num_tokens, reverse=True) + result.append((mode, filter_descs)) return result diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 5d88be3212d0..9afe730a0641 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -29,6 +29,7 @@ CUDAGraphMode, VllmConfig, get_layers_from_vllm_config, + set_current_vllm_config, update_config, ) from vllm.distributed.ec_transfer import get_ec_transfer, has_ec_transfer From 7be22e790ef389a8a31bc5d4eb4c8a1022bf63ea Mon Sep 17 00:00:00 2001 From: Hongjian Zhang Date: Tue, 27 Jan 2026 16:23:01 +0800 Subject: [PATCH 26/35] feat: add test Signed-off-by: Hongjian Zhang --- .../piecewise/test_qwenvl_vit_cudagraph.py | 261 ++++++++++++++++++ 1 file changed, 261 insertions(+) create mode 100644 tests/compile/piecewise/test_qwenvl_vit_cudagraph.py diff --git a/tests/compile/piecewise/test_qwenvl_vit_cudagraph.py b/tests/compile/piecewise/test_qwenvl_vit_cudagraph.py new file mode 100644 index 000000000000..cddf2147b137 --- /dev/null +++ b/tests/compile/piecewise/test_qwenvl_vit_cudagraph.py @@ -0,0 +1,261 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import os +import weakref +from functools import partial + +import pytest +import torch + +from 
vllm import LLM +from vllm.config import CompilationConfig, CUDAGraphMode +from vllm.distributed import cleanup_dist_env_and_memory +from vllm.forward_context import set_forward_context +from vllm.v1.cudagraph_dispatcher import CudagraphDispatcher +from vllm.v1.executor.multiproc_executor import MultiprocExecutor + + +def _worker_embed_multimodal( + worker, vllm_config, cudagraph_runtime_mode, batch_descriptor, multi_modal_data +): + """Helper function to run multimodal embedding on a worker. + This function sets up the necessary forward context for tensor-parallel (TP) + execution and then calls the model's `embed_multimodal` method. + Note: For data-parallel (DP) mode, the forward context is typically + created and managed within the vision dispatcher, which would override + the context set here. + Args: + worker: The worker instance containing the model runner. + vllm_config: The vLLM engine configuration. + cudagraph_runtime_mode: The runtime mode for CUDA graph execution. + batch_descriptor: An object describing the current batch. + multi_modal_data: A dictionary of keyword arguments to be passed to + the model's `embed_multimodal` method. + Returns: + The output from the model's `embed_multimodal` method. + """ + + # Access model via worker.model_runner.model + # Note: Accessing internal attributes. Assuming V1 worker structure. 
+ model = worker.model_runner.model + + # Move multi_modal_data to the model's device + target_device = next(model.parameters()).device + multi_modal_data = { + k: v.to(target_device) if isinstance(v, torch.Tensor) else v + for k, v in multi_modal_data.items() + } + + with ( + set_forward_context( + None, + vllm_config=vllm_config, + cudagraph_runtime_mode=cudagraph_runtime_mode, + batch_descriptor=batch_descriptor, + ), + torch.inference_mode(), + ): + ans = model.embed_multimodal(**multi_modal_data) + torch.cuda.synchronize() + return ans + + +# Format: (model_name, tp_size, mm_encoder_tp_mode) +TEST_CONFIGS = [ + ("Qwen/Qwen2.5-VL-3B-Instruct", 1, "weights"), + ("Qwen/Qwen3-VL-4B-Instruct", 1, "weights"), + # TP/DP modes with 2 GPUs + ("Qwen/Qwen2.5-VL-3B-Instruct", 2, "data"), + ("Qwen/Qwen2.5-VL-3B-Instruct", 2, "weights"), + ("Qwen/Qwen3-VL-4B-Instruct", 2, "data"), + ("Qwen/Qwen3-VL-4B-Instruct", 2, "weights"), +] + + +@pytest.fixture( + params=TEST_CONFIGS, ids=lambda x: f"{x[0].split('/')[-1]}-tp{x[1]}-{x[2]}" +) +def llm(request): + model_name, tp_size, mm_mode = request.param + + if torch.cuda.device_count() < tp_size: + pytest.skip(f"Not enough GPUs for tp_size={tp_size}") + + os.environ["VLLM_ENABLE_V1_MULTIPROCESSING"] = "0" + # Common configuration + common_args = { + "model": model_name, + "trust_remote_code": True, + "max_model_len": 4096, + "max_num_seqs": 16, + "gpu_memory_utilization": 0.2, + "tensor_parallel_size": tp_size, + "mm_encoder_tp_mode": mm_mode, + } + + # Initialize LLM with ViT CUDA graph enabled (piecewise) + # We only need one LLM instance. For eager execution, we will force + # cudagraph_runtime_mode=NONE at runtime. 
+ llm_instance = None + try: + llm_instance = LLM( + **common_args, + compilation_config=CompilationConfig( + cudagraph_mode="PIECEWISE", + compile_mm_encoder=True, + vit_cudagraph_capture_sizes=[64, 128, 256], + ), + ) + print(f"LLM initialized for {model_name} tp={tp_size} mode={mm_mode}") + yield weakref.proxy(llm_instance) + finally: + print("Cleaning up LLM after testing.") + if llm_instance: + # Ensure model executor and workers are properly shut down + # llm_instance.llm_engine is vllm.v1.engine.llm_engine.LLMEngine + # which has engine_core (InprocClient). + if hasattr(llm_instance.llm_engine, "engine_core"): + llm_instance.llm_engine.engine_core.shutdown() + del llm_instance + + # Clean up distributed environment + cleanup_dist_env_and_memory() + + +class TestQwenVLCUDAGraph: + def _run_embed_multimodal( + self, llm, multi_modal_data, num_patches, force_eager=False + ): + """Runs the multimodal embedding process, potentially with CUDA graphs. + This method manually constructs a CudagraphDispatcher because accessing the + one within the GPU model runner is difficult. It then dispatches based on + the number of image patches to determine the appropriate CUDA graph or + eager mode for execution. The actual embedding is performed on the + worker(s) via an RPC call. + Args: + llm: The LLM object containing the model engine and configuration. + multi_modal_data: A dictionary containing the multimodal data to be + processed. + num_patches: The number of image patches, used to determine the + number of tokens for the dispatcher. + force_eager: If True, forces the execution to run in eager mode, + bypassing CUDA graphs. + Returns: + The outputs from the multimodal embedding process executed on the + worker. 
+ """ + vllm_config = llm.llm_engine.vllm_config + + dispatcher = CudagraphDispatcher(vllm_config) + dispatcher.initialize_cudagraph_keys( + cudagraph_mode=vllm_config.compilation_config.cudagraph_mode, + uniform_decode_query_len=1, + ) + + # Dispatch to get runtime mode and batch descriptor + cudagraph_runtime_mode, batch_descriptor = dispatcher.dispatch( + num_tokens=num_patches, uniform_decode=False, has_lora=False, is_vit=True + ) + + model_executor = llm.llm_engine.model_executor + + rpc_kwargs = {} + # Use collective_rpc to execute on driver worker (rank 0) + if isinstance(model_executor, MultiprocExecutor): + rpc_kwargs["unique_reply_rank"] = 0 + # If force_eager is True, override the runtime mode to NONE + if force_eager: + cudagraph_runtime_mode = CUDAGraphMode.NONE + else: + multi_modal_data["cudagraph_dispatcher"] = dispatcher + outputs = model_executor.collective_rpc( + partial( + _worker_embed_multimodal, + vllm_config=vllm_config, + cudagraph_runtime_mode=cudagraph_runtime_mode, + batch_descriptor=batch_descriptor, + multi_modal_data=multi_modal_data, + ), + **rpc_kwargs, + ) + + if isinstance(outputs, list) and len(outputs) == 1: + outputs = outputs[0] + return outputs + + def test_vit_cudagraph_consistency(self, llm): + print("Starting test for ViT CUDA graph consistency.") + + model_name = llm.llm_engine.vllm_config.model_config.model + # Qwen3-VL uses patch_size=16, temporal_patch_size=2 -> 16*16*3*2 = 1536 + # Qwen2.5-VL uses patch_size=14, temporal_patch_size=2 -> 14*14*3*2 = 1176 + input_dim = 1536 if "Qwen3-VL" in model_name else 1176 + + num_patches = 64 + for num_imgs in [1, 2, 4]: + image_grid_thw = torch.tensor( + [[1, 2, num_patches // 2]] * num_imgs, dtype=torch.long, device="cpu" + ) + pixel_values = torch.rand( + (num_patches * num_imgs, input_dim), dtype=torch.bfloat16, device="cpu" + ) + + multi_modal_data = { + "pixel_values": pixel_values, + "image_grid_thw": image_grid_thw, + } + print( + "Running inference with single LLM 
(Piecewise vs Eager via context)." + "num_imgs:", + num_imgs, + ) + + # Run with Piecewise CUDA Graph + piecewise_outputs = self._run_embed_multimodal( + llm, multi_modal_data, num_patches * num_imgs, force_eager=False + ) + + # Run with Eager Mode (simulated by setting runtime mode to NONE) + eager_outputs = self._run_embed_multimodal( + llm, multi_modal_data, num_patches * num_imgs, force_eager=True + ) + + if isinstance(piecewise_outputs, torch.Tensor): + assert torch.allclose( + piecewise_outputs, eager_outputs, atol=1e-3, rtol=1e-5 + ), ( + f"num_imgs: {num_imgs}. Piecewise and Eager outputs do not match. " + "Max abs diff: " + f"{torch.max(torch.abs(piecewise_outputs - eager_outputs))}. " + "Max rel diff: " + f"{ + torch.max( + torch.abs(piecewise_outputs - eager_outputs) + / (torch.abs(eager_outputs) + 1e-8) + ) + }" + ) + elif isinstance(piecewise_outputs, tuple): + assert isinstance(eager_outputs, tuple), ( + "Output types mismatch, piecewise is tuple but eager is not." + ) + assert len(piecewise_outputs) == len(eager_outputs), ( + "Output tuple lengths mismatch." + ) + for i, (p_out, e_out) in enumerate( + zip(piecewise_outputs, eager_outputs) + ): + assert torch.allclose(p_out, e_out, atol=1e-3, rtol=1e-5), ( + f"num_imgs: {num_imgs}. " + f"Tuple element {i} does not match. " + "Max abs diff: " + f"{torch.max(torch.abs(p_out - e_out))}. 
" + "Max rel diff: " + f"{ + torch.max( + torch.abs(p_out - e_out) / (torch.abs(e_out) + 1e-8) + ) + }" + ) + else: + raise TypeError(f"Unsupported output type: {type(piecewise_outputs)}") From eb91c31014b4f406e0c2650a198da2c6478d5030 Mon Sep 17 00:00:00 2001 From: Hongjian Zhang Date: Tue, 27 Jan 2026 16:57:29 +0800 Subject: [PATCH 27/35] ruff Signed-off-by: Hongjian Zhang Signed-off-by: Xingran Wang Co-authored-by: Xingran Wang --- vllm/compilation/backends.py | 13 ++-- vllm/config/vllm.py | 1 + vllm/model_executor/models/qwen2_5_vl.py | 63 ++++++++++----- vllm/model_executor/models/qwen3_vl.py | 98 ++++++++++++++++-------- vllm/model_executor/models/vision.py | 22 ++++-- vllm/v1/cudagraph_dispatcher.py | 15 +++- vllm/v1/worker/gpu_model_runner.py | 64 +++++++++++----- 7 files changed, 189 insertions(+), 87 deletions(-) diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index 615948072f72..0cff6ed5ef53 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -10,7 +10,7 @@ import os import pprint import time -from collections.abc import Callable, Generator, Sequence +from collections.abc import Callable, Generator, Iterator, Sequence from contextlib import contextmanager from copy import deepcopy from functools import partial @@ -56,7 +56,7 @@ @contextmanager -def set_is_last_graph_in_vit_sequence(is_last: bool): +def set_is_last_graph_in_vit_sequence(is_last: bool) -> Iterator[None]: """Context manager to indicate if the current graph being compiled is the last one in a sequence of graphs (e.g., a sequence of blocks). """ @@ -75,7 +75,7 @@ def set_is_last_graph_in_vit_sequence(is_last: bool): @contextmanager -def set_is_first_graph_in_vit_sequence(is_first: bool): +def set_is_first_graph_in_vit_sequence(is_first: bool) -> Iterator[None]: """Context manager to indicate if the current graph being compiled is the first one in a sequence of graphs (e.g., a sequence of blocks). 
""" @@ -87,6 +87,7 @@ def set_is_first_graph_in_vit_sequence(is_first: bool): finally: _is_first_graph_in_vit_sequence = original_value + def make_copy_and_call( sym_tensor_indices: list[int], input_buffers: list[torch.Tensor | None], @@ -487,10 +488,8 @@ def wrap_with_cudagraph_if_needed( runtime_mode=CUDAGraphMode.PIECEWISE, cudagraph_options=CUDAGraphOptions( debug_log_enable=is_first_graph, - gc_disable=not is_first_graph - or not _is_first_graph_in_vit_sequence, - weak_ref_output=is_last_graph - and _is_last_graph_in_vit_sequence, + gc_disable=not is_first_graph or not _is_first_graph_in_vit_sequence, + weak_ref_output=is_last_graph and _is_last_graph_in_vit_sequence, ), ) diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py index 90b54e9e2396..f43ee67f3fd1 100644 --- a/vllm/config/vllm.py +++ b/vllm/config/vllm.py @@ -1346,6 +1346,7 @@ def _set_compile_ranges(self): compilation_config.compile_ranges_split_points = sorted( computed_compile_ranges_split_points ) + def _set_vit_cudagraph_sizes(self): """Sets the CUDA graph capture sizes for the Vision Transformer (ViT). 
diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index a7c16401360a..57b7b59fe28a 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -43,7 +43,12 @@ ) from vllm.compilation.decorators import support_torch_compile -from vllm.config import CUDAGraphMode, MultiModalConfig, VllmConfig, set_current_vllm_config, get_current_vllm_config +from vllm.config import ( + CUDAGraphMode, + VllmConfig, + get_current_vllm_config, + set_current_vllm_config, +) from vllm.distributed import parallel_state from vllm.distributed import utils as dist_utils from vllm.forward_context import get_forward_context, is_forward_context_available @@ -654,10 +659,14 @@ def __init__( dtype=self.dtype, ) self._persistent_rotary_pos_emb_cos_buffer = torch.empty( - (max_compile_size, head_dim // 2), device=self.device, dtype=torch.bfloat16 + (max_compile_size, head_dim // 2), + device=self.device, + dtype=torch.bfloat16, ) self._persistent_rotary_pos_emb_sin_buffer = torch.empty( - (max_compile_size, head_dim // 2), device=self.device, dtype=torch.bfloat16 + (max_compile_size, head_dim // 2), + device=self.device, + dtype=torch.bfloat16, ) @property @@ -804,7 +813,7 @@ def forward( cu_seqlens: list = [] fwd_ctx = None - if is_forward_context_available(): + if is_forward_context_available(): fwd_ctx = get_forward_context() if ( self._persistent_hidden_states_buffer is not None @@ -821,7 +830,10 @@ def forward( set_is_last_graph_in_vit_sequence, ) - with set_is_first_graph_in_vit_sequence(True), set_is_last_graph_in_vit_sequence(False): + with ( + set_is_first_graph_in_vit_sequence(True), + set_is_last_graph_in_vit_sequence(False), + ): hidden_states = self.patch_embed(hidden_states) window_index_id = 0 @@ -881,12 +893,12 @@ def forward( and fwd_ctx and fwd_ctx.cudagraph_runtime_mode == CUDAGraphMode.PIECEWISE ): - rotary_pos_emb_sin = self._persistent_rotary_pos_emb_sin_buffer[:seq_len].copy_( - 
rotary_pos_emb_sin - ) - rotary_pos_emb_cos = self._persistent_rotary_pos_emb_cos_buffer[:seq_len].copy_( - rotary_pos_emb_cos - ) + rotary_pos_emb_sin = self._persistent_rotary_pos_emb_sin_buffer[ + :seq_len + ].copy_(rotary_pos_emb_sin) + rotary_pos_emb_cos = self._persistent_rotary_pos_emb_cos_buffer[ + :seq_len + ].copy_(rotary_pos_emb_cos) window_index = window_index.to(device=hidden_states.device, non_blocking=True) reverse_indices = reverse_indices.to( device=hidden_states.device, non_blocking=True @@ -938,7 +950,10 @@ def forward( hidden_states = cast_overflow_tensors(hidden_states) # adapter - with set_is_first_graph_in_vit_sequence(False), set_is_last_graph_in_vit_sequence(True): + with ( + set_is_first_graph_in_vit_sequence(False), + set_is_last_graph_in_vit_sequence(True), + ): hidden_states = self.merger(hidden_states) hidden_states = hidden_states[reverse_indices, :] return hidden_states @@ -1263,7 +1278,8 @@ def _parse_and_validate_video_input( ) def _process_image_input( - self, image_input: Qwen2_5_VLImageInputs, + self, + image_input: Qwen2_5_VLImageInputs, cudagraph_dispatcher: Any | None = None, ) -> tuple[torch.Tensor, ...]: grid_thw = image_input["image_grid_thw"] @@ -1275,7 +1291,10 @@ def _process_image_input( else: pixel_values = image_input["pixel_values"] with set_current_vllm_config(self.vllm_config): - if self.use_data_parallel and not self.vllm_config.is_in_compile_or_vit_cuda_graph_capture: + if ( + self.use_data_parallel + and not self.vllm_config.is_in_compile_or_vit_cuda_graph_capture + ): return run_dp_sharded_mrope_vision_model( self.visual, pixel_values, @@ -1323,7 +1342,8 @@ def _postprocess_image_embeds_evs( return tuple(image_embeds_split) def _process_video_input( - self, video_input: Qwen2_5_VLVideoInputs, + self, + video_input: Qwen2_5_VLVideoInputs, cudagraph_dispatcher: Any | None = None, ) -> tuple[torch.Tensor, ...]: grid_thw = video_input["video_grid_thw"] @@ -1335,7 +1355,10 @@ def _process_video_input( else: 
pixel_values_videos = video_input["pixel_values_videos"] with set_current_vllm_config(self.vllm_config): - if self.use_data_parallel and not self.vllm_config.is_in_compile_or_vit_cuda_graph_capture: + if ( + self.use_data_parallel + and not self.vllm_config.is_in_compile_or_vit_cuda_graph_capture + ): return run_dp_sharded_mrope_vision_model( self.visual, pixel_values_videos, @@ -1505,14 +1528,18 @@ def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings: for modality in mm_input_by_modality: multimodal_input = mm_input_by_modality[modality] if modality == "image": - image_embeddings = self._process_image_input(multimodal_input, cudagraph_dispatcher=cudagraph_dispatcher) + image_embeddings = self._process_image_input( + multimodal_input, cudagraph_dispatcher=cudagraph_dispatcher + ) if self.is_multimodal_pruning_enabled: image_embeddings = self._postprocess_image_embeds_evs( image_embeddings, multimodal_input ) multimodal_embeddings += tuple(image_embeddings) if modality == "video": - video_embeddings = self._process_video_input(multimodal_input, cudagraph_dispatcher=cudagraph_dispatcher) + video_embeddings = self._process_video_input( + multimodal_input, cudagraph_dispatcher=cudagraph_dispatcher + ) if self.is_multimodal_pruning_enabled: video_embeddings = self._postprocess_video_embeds_evs( video_embeddings, multimodal_input diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index c012e17adfcb..9669809318c9 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py @@ -49,7 +49,12 @@ from transformers.video_utils import VideoMetadata from vllm.compilation.decorators import support_torch_compile -from vllm.config import CUDAGraphMode, MultiModalConfig, VllmConfig, set_current_vllm_config, get_current_vllm_config +from vllm.config import ( + CUDAGraphMode, + VllmConfig, + get_current_vllm_config, + set_current_vllm_config, +) from vllm.config.multimodal import 
BaseDummyOptions, VideoDummyOptions from vllm.distributed import get_pp_group from vllm.forward_context import get_forward_context, is_forward_context_available @@ -141,8 +146,7 @@ DUMMY_VIDEO_NUM_FRAMES = 2048 -@support_torch_compile(dynamic_arg_dims={"x": 0}, - enable_if=should_torch_compile_mm_vit) +@support_torch_compile(dynamic_arg_dims={"x": 0}, enable_if=should_torch_compile_mm_vit) class Qwen3_VisionPatchEmbed(nn.Module): def __init__( self, @@ -210,7 +214,12 @@ def forward(self, x: torch.Tensor): @support_torch_compile( - dynamic_arg_dims={"x": 0, "cu_seqlens": 0, "rotary_pos_emb_cos": 0, "rotary_pos_emb_sin": 0}, + dynamic_arg_dims={ + "x": 0, + "cu_seqlens": 0, + "rotary_pos_emb_cos": 0, + "rotary_pos_emb_sin": 0, + }, enable_if=should_torch_compile_mm_vit, ) class Qwen3_VisionBlock(nn.Module): @@ -265,8 +274,7 @@ def forward( return x -@support_torch_compile(dynamic_arg_dims={"x": 0}, - enable_if=should_torch_compile_mm_vit) +@support_torch_compile(dynamic_arg_dims={"x": 0}, enable_if=should_torch_compile_mm_vit) class Qwen3_VisionPatchMerger(nn.Module): def __init__( self, @@ -428,15 +436,23 @@ def __init__( self._persistent_rotary_pos_emb_cos_buffer: torch.Tensor | None = None self._persistent_rotary_pos_emb_sin_buffer: torch.Tensor | None = None if vllm_config.compilation_config.vit_cudagraph_capture_sizes: - max_compile_size = vllm_config.compilation_config.vit_cudagraph_capture_sizes[-1] + max_compile_size = ( + vllm_config.compilation_config.vit_cudagraph_capture_sizes[-1] + ) self._persistent_hidden_states_buffer = torch.empty( - (max_compile_size, self.patch_embed.proj.input_size), device=self.device, dtype=self.dtype + (max_compile_size, self.patch_embed.proj.input_size), + device=self.device, + dtype=self.dtype, ) self._persistent_rotary_pos_emb_cos_buffer = torch.empty( - (max_compile_size, head_dim // 2), device=self.device, dtype=torch.bfloat16 + (max_compile_size, head_dim // 2), + device=self.device, + dtype=torch.bfloat16, ) 
self._persistent_rotary_pos_emb_sin_buffer = torch.empty( - (max_compile_size, head_dim // 2), device=self.device, dtype=torch.bfloat16 + (max_compile_size, head_dim // 2), + device=self.device, + dtype=torch.bfloat16, ) @property @@ -571,7 +587,7 @@ def forward( ) -> torch.Tensor: seq_len, _ = x.size() fwd_ctx = None - if is_forward_context_available(): + if is_forward_context_available(): fwd_ctx = get_forward_context() if ( self._persistent_hidden_states_buffer is not None @@ -581,14 +597,19 @@ def forward( hidden_states = self._persistent_hidden_states_buffer[:seq_len] hidden_states.copy_(x, non_blocking=True) else: - hidden_states = x.to(device=self.device, dtype=self.dtype, non_blocking=True) + hidden_states = x.to( + device=self.device, dtype=self.dtype, non_blocking=True + ) from vllm.compilation.backends import ( set_is_first_graph_in_vit_sequence, set_is_last_graph_in_vit_sequence, ) - with set_is_first_graph_in_vit_sequence(True), set_is_last_graph_in_vit_sequence(False): + with ( + set_is_first_graph_in_vit_sequence(True), + set_is_last_graph_in_vit_sequence(False), + ): hidden_states = self.patch_embed(hidden_states) if isinstance(grid_thw, list): @@ -608,12 +629,12 @@ def forward( and fwd_ctx and fwd_ctx.cudagraph_runtime_mode == CUDAGraphMode.PIECEWISE ): - rotary_pos_emb_sin = self._persistent_rotary_pos_emb_sin_buffer[:seq_len].copy_( - rotary_pos_emb_sin - ) - rotary_pos_emb_cos = self._persistent_rotary_pos_emb_cos_buffer[:seq_len].copy_( - rotary_pos_emb_cos - ) + rotary_pos_emb_sin = self._persistent_rotary_pos_emb_sin_buffer[ + :seq_len + ].copy_(rotary_pos_emb_sin) + rotary_pos_emb_cos = self._persistent_rotary_pos_emb_cos_buffer[ + :seq_len + ].copy_(rotary_pos_emb_cos) cu_seqlens = np.repeat(grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]).cumsum( axis=0, dtype=np.int32 @@ -651,12 +672,17 @@ def forward( max_seqlen=max_seqlen, ) if layer_num in self.deepstack_visual_indexes: - deepstack_merger_idx = 
self.deepstack_visual_indexes.index(layer_num) - deepstack_feature = self.deepstack_merger_list[deepstack_merger_idx]( - hidden_states - ) + deepstack_merger_idx = self.deepstack_visual_indexes.index( + layer_num + ) + deepstack_feature = self.deepstack_merger_list[ + deepstack_merger_idx + ](hidden_states) deepstack_feature_lists.append(deepstack_feature) - with set_is_first_graph_in_vit_sequence(False), set_is_last_graph_in_vit_sequence(True): + with ( + set_is_first_graph_in_vit_sequence(False), + set_is_last_graph_in_vit_sequence(True), + ): hidden_states = self.merger(hidden_states) hidden_states = torch.cat( [hidden_states] + deepstack_feature_lists, dim=1 @@ -1490,7 +1516,8 @@ def _parse_and_validate_video_input( ) def _process_image_input( - self, image_input: Qwen2_5_VLImageInputs, + self, + image_input: Qwen2_5_VLImageInputs, cudagraph_dispatcher: Any | None = None, ) -> tuple[torch.Tensor, ...]: grid_thw = image_input["image_grid_thw"] @@ -1503,7 +1530,10 @@ def _process_image_input( pixel_values = image_input["pixel_values"].type(self.visual.dtype) with set_current_vllm_config(self.vllm_config): - if self.use_data_parallel and not self.vllm_config.is_in_compile_or_vit_cuda_graph_capture: + if ( + self.use_data_parallel + and not self.vllm_config.is_in_compile_or_vit_cuda_graph_capture + ): return run_dp_sharded_mrope_vision_model( self.visual, pixel_values, @@ -1520,7 +1550,8 @@ def _process_image_input( return image_embeds.split(sizes) def _process_video_input( - self, video_input: Qwen2_5_VLVideoInputs, + self, + video_input: Qwen2_5_VLVideoInputs, cudagraph_dispatcher: Any | None = None, ) -> tuple[torch.Tensor, ...]: grid_thw = video_input["video_grid_thw"] @@ -1534,7 +1565,10 @@ def _process_video_input( self.visual.dtype ) with set_current_vllm_config(self.vllm_config): - if self.use_data_parallel and not self.vllm_config.is_in_compile_or_vit_cuda_graph_capture: + if ( + self.use_data_parallel + and not 
self.vllm_config.is_in_compile_or_vit_cuda_graph_capture + ): return run_dp_sharded_mrope_vision_model( self.visual, pixel_values_videos, @@ -2003,14 +2037,18 @@ def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings | None: for modality in mm_input_by_modality: multimodal_input = mm_input_by_modality[modality] if modality == "image": - image_embeddings = self._process_image_input(multimodal_input, cudagraph_dispatcher=cudagraph_dispatcher) + image_embeddings = self._process_image_input( + multimodal_input, cudagraph_dispatcher=cudagraph_dispatcher + ) if self.is_multimodal_pruning_enabled: image_embeddings = self._postprocess_image_embeds_evs( image_embeddings, multimodal_input ) multimodal_embeddings += tuple(image_embeddings) if modality == "video": - video_embeddings = self._process_video_input(multimodal_input, cudagraph_dispatcher=cudagraph_dispatcher) + video_embeddings = self._process_video_input( + multimodal_input, cudagraph_dispatcher=cudagraph_dispatcher + ) if self.is_multimodal_pruning_enabled: video_embeddings = self._postprocess_video_embeds_evs( video_embeddings, multimodal_input diff --git a/vllm/model_executor/models/vision.py b/vllm/model_executor/models/vision.py index 22c9a30c23c6..1637b27209af 100644 --- a/vllm/model_executor/models/vision.py +++ b/vllm/model_executor/models/vision.py @@ -10,17 +10,22 @@ import torch from transformers import PretrainedConfig -from vllm.config import MultiModalConfig, VllmConfig, CUDAGraphMode, get_current_vllm_config +from vllm.config import ( + CUDAGraphMode, + MultiModalConfig, + VllmConfig, + get_current_vllm_config, +) from vllm.distributed import ( get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, tensor_model_parallel_all_gather, ) -from vllm.forward_context import BatchDescriptor, set_forward_context -from vllm.v1.cudagraph_dispatcher import CudagraphDispatcher +from vllm.forward_context import set_forward_context from vllm.logger import init_logger from vllm.platforms 
import current_platform from vllm.v1.attention.backends.registry import AttentionBackendEnum +from vllm.v1.cudagraph_dispatcher import CudagraphDispatcher logger = init_logger(__name__) @@ -489,8 +494,7 @@ def run_dp_sharded_mrope_vision_model( cudagraph_runtime_mode = CUDAGraphMode.NONE batch_descriptor = None - if (vllm_config and - vllm_config.compilation_config.vit_cudagraph_capture_sizes): + if vllm_config and vllm_config.compilation_config.vit_cudagraph_capture_sizes: current_input_len = pixel_values_local.shape[0] cudagraph_runtime_mode, batch_descriptor = dispatcher.dispatch( num_tokens=current_input_len, @@ -500,7 +504,7 @@ def run_dp_sharded_mrope_vision_model( is_vit=True, ) target_input_len = batch_descriptor.num_tokens - + # Pad pixel_values_local for CUDA graph if needed if current_input_len < target_input_len: padding_size = target_input_len - current_input_len @@ -516,7 +520,7 @@ def run_dp_sharded_mrope_vision_model( None, vllm_config=vllm_config, cudagraph_runtime_mode=cudagraph_runtime_mode, - batch_descriptor=batch_descriptor + batch_descriptor=batch_descriptor, ): # Run the vision model on the local pixel_values_local if rope_type == "rope_2d": @@ -535,7 +539,9 @@ def run_dp_sharded_mrope_vision_model( ) else: if pixel_values_local.shape[0] > 0: - image_embeds_local = vision_model(pixel_values_local, local_grid_thw_list) + image_embeds_local = vision_model( + pixel_values_local, local_grid_thw_list + ) else: # Handle empty case image_embeds_local = torch.empty( diff --git a/vllm/v1/cudagraph_dispatcher.py b/vllm/v1/cudagraph_dispatcher.py index 0048ef61c3c3..bc90d4044f71 100644 --- a/vllm/v1/cudagraph_dispatcher.py +++ b/vllm/v1/cudagraph_dispatcher.py @@ -133,7 +133,6 @@ def _compute_bs_to_padded_vit_graph_size(self) -> None: self._bs_to_padded_vit_graph_size[bs] = start else: self._bs_to_padded_vit_graph_size[bs] = end - def _create_padded_batch_descriptor( self, @@ -272,8 +271,14 @@ def dispatch( if ( not self.keys_initialized or 
self.cudagraph_mode == CUDAGraphMode.NONE - or (not is_vit and num_tokens > self.compilation_config.max_cudagraph_capture_size) - or (is_vit and num_tokens > self.compilation_config.max_vit_cudagraph_capture_size) + or ( + not is_vit + and num_tokens > self.compilation_config.max_cudagraph_capture_size + ) + or ( + is_vit + and num_tokens > self.compilation_config.max_vit_cudagraph_capture_size + ) ): return CUDAGraphMode.NONE, BatchDescriptor(num_tokens, is_vit=is_vit) @@ -315,7 +320,9 @@ def dispatch( # finally, just return no cudagraphs and a trivial batch descriptor return CUDAGraphMode.NONE, BatchDescriptor(num_tokens) - def get_capture_descs(self, is_vit: bool = False) -> list[tuple[CUDAGraphMode, list[BatchDescriptor]]]: + def get_capture_descs( + self, is_vit: bool = False + ) -> list[tuple[CUDAGraphMode, list[BatchDescriptor]]]: """ Returns capture descriptors for cudagraph capturing. diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 9afe730a0641..b07874cd6a16 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2431,7 +2431,10 @@ def _execute_mm_encoder( # each of shape (feature_size, hidden_size) in case the feature # size is dynamic depending on the input multimodal items. 
is_vit_dp_mode = ( - getattr(self.model_config.multimodal_config, "mm_encoder_tp_mode", None) == "data" + getattr( + self.model_config.multimodal_config, "mm_encoder_tp_mode", None + ) + == "data" and self.parallel_config.tensor_parallel_size > 1 ) if not is_vit_dp_mode: @@ -2441,17 +2444,24 @@ def _execute_mm_encoder( # Default values for non-ViT cudagraph case cudagraph_runtime_mode = CUDAGraphMode.NONE batch_descriptor = None - if self.vit_cudagraph_batch_sizes and "pixel_values" in mm_kwargs_group: - pixel_values = mm_kwargs_group["pixel_values"] + if ( + self.vit_cudagraph_batch_sizes + and "pixel_values" in mm_kwargs_group + ): + pixel_values = cast( + torch.Tensor, mm_kwargs_group["pixel_values"] + ) num_tokens = pixel_values.shape[0] # get batch_descriptor from dispatcher - cudagraph_runtime_mode, batch_descriptor = self.cudagraph_dispatcher.dispatch( - num_tokens=num_tokens, - uniform_decode=False, - has_lora=False, - disable_full=False, - is_vit=True, + cudagraph_runtime_mode, batch_descriptor = ( + self.cudagraph_dispatcher.dispatch( + num_tokens=num_tokens, + uniform_decode=False, + has_lora=False, + disable_full=False, + is_vit=True, + ) ) padded_num_tokens = batch_descriptor.num_tokens @@ -2468,12 +2478,16 @@ def _execute_mm_encoder( # Update image_grid_thw to account for padding if "image_grid_thw" in mm_kwargs_group: - image_grid_thw = mm_kwargs_group["image_grid_thw"] + image_grid_thw = cast( + torch.Tensor, mm_kwargs_group["image_grid_thw"] + ) original_num_imgs = image_grid_thw.shape[0] # Treat padding as a new virtual image. - # Assuming a fixed patch size where height is merge_size. 
- h_patches, w_patches = self._get_dummy_h_w_patches(padding_amount) + # Assuming a fixed patch size where height = merge_size + h_patches, w_patches = self._get_dummy_h_w_patches( + padding_amount + ) padding_grid_info = torch.tensor( [[1, h_patches, w_patches]], dtype=image_grid_thw.dtype, @@ -2483,23 +2497,31 @@ def _execute_mm_encoder( [image_grid_thw, padding_grid_info], dim=0 ) - with set_forward_context( + with ( + set_forward_context( None, vllm_config=self.vllm_config, cudagraph_runtime_mode=cudagraph_runtime_mode, batch_descriptor=batch_descriptor, - ), self.timed_encoder_operation( - should_time, mm_lora_refs, current_item_idx, num_items + ), + self.timed_encoder_operation( + should_time, mm_lora_refs, current_item_idx, num_items + ), ): curr_group_outputs = model.embed_multimodal(**mm_kwargs_group) # Remove the padded items before sanity check if original_num_imgs != -1: curr_group_outputs = curr_group_outputs[:original_num_imgs] else: - with set_current_vllm_config(self.vllm_config), self.timed_encoder_operation( - should_time, mm_lora_refs, current_item_idx, num_items + with ( + set_current_vllm_config(self.vllm_config), + self.timed_encoder_operation( + should_time, mm_lora_refs, current_item_idx, num_items + ), ): - mm_kwargs_group["cudagraph_dispatcher"] = self.cudagraph_dispatcher + mm_kwargs_group["cudagraph_dispatcher"] = ( + self.cudagraph_dispatcher + ) curr_group_outputs = model.embed_multimodal(**mm_kwargs_group) sanity_check_mm_encoder_outputs( curr_group_outputs, @@ -4665,7 +4687,7 @@ def _get_dummy_vit_input( ) -> BatchedTensorInputs: """Dummy data for profiling and precompiling ViT.""" - # The first dimension of pixel_values corresponds + # The first dimension of pixel_values corresponds # to the total number of patches. 
pixel_values = torch.zeros( (num_image_tokens, img_feature_dim), dtype=self.dtype, device=self.device @@ -5219,7 +5241,9 @@ def _dummy_mm_encoder_run( "video", 1, ) - img_feature_dim = tmp_dummy_mm_inputs["pixel_values_videos"].shape[1] + img_feature_dim = cast( + torch.Tensor, tmp_dummy_mm_inputs["pixel_values_videos"] + ).shape[1] if is_global_first_rank(): compilation_cases = tqdm( From f7e4ea9180443972122da62eeae198cc3b1a83b4 Mon Sep 17 00:00:00 2001 From: Hongjian Zhang Date: Thu, 29 Jan 2026 16:16:27 +0800 Subject: [PATCH 28/35] fix review suggestion Signed-off-by: Hongjian Zhang --- docs/design/torch_compile.md | 10 --- docs/design/torch_compile_multimodal.md | 40 +++++++++++- vllm/compilation/backends.py | 72 +++++++++++++--------- vllm/config/vllm.py | 34 +++++----- vllm/forward_context.py | 4 ++ vllm/model_executor/models/qwen2_5_vl.py | 21 +++---- vllm/model_executor/models/qwen3_vl.py | 21 +++---- vllm/v1/attention/ops/vit_attn_wrappers.py | 4 +- vllm/v1/cudagraph_dispatcher.py | 40 +++++++----- 9 files changed, 151 insertions(+), 95 deletions(-) diff --git a/docs/design/torch_compile.md b/docs/design/torch_compile.md index 8532c7020cbc..4dc0da0c7d65 100644 --- a/docs/design/torch_compile.md +++ b/docs/design/torch_compile.md @@ -253,16 +253,6 @@ By default, vLLM will try to determine a set of sizes to capture cudagraph. You vllm serve meta-llama/Llama-3.2-1B \ --compilation-config '{"cudagraph_capture_sizes": [1, 2, 4, 8]}' ``` -Similarly, For `Qwen2.5-VL`,`Qwen3-VL` series model, you can specify the capture sizes for the vision transformer (ViT) using `vit_cudagraph_capture_sizes`, the capture sizes should be multiples of the square of `merge_size`. By default, this is disabled as `compile_mm_encoder` is `False`. 
To enable it and specify capture sizes, you can do the following: -```bash -vllm serve Qwen/Qwen2.5-VL-3B-Instruct \ - --compilation-config '{"compile_mm_encoder": true, "vit_cudagraph_capture_sizes": [512, 1024]}' -``` -Alternatively, you can specify `max_vit_cudagraph_capture_size` to generate a default list of capture sizes up to the given value: -```bash -vllm serve Qwen/Qwen2.5-VL-3B-Instruct \ - --compilation-config '{"compile_mm_encoder": true, "max_vit_cudagraph_capture_size": 2048}' -``` Then it will only capture cudagraph for the specified sizes. It can be useful to have fine-grained control over the cudagraph capture. diff --git a/docs/design/torch_compile_multimodal.md b/docs/design/torch_compile_multimodal.md index 674ddd801d65..f3f3f3b433f5 100644 --- a/docs/design/torch_compile_multimodal.md +++ b/docs/design/torch_compile_multimodal.md @@ -68,7 +68,45 @@ to alert torch.compile to the fact that this range cannot be inferred, and we de ### Cudagraphs -We have not yet explored compilation for multimodal encoders with CUDAGraph integration; behavior is currently unspecified. +vLLM now supports Piecewise CUDA Graph integration for the Vision Transformer (ViT) encoder in Qwen2.5-VL and Qwen3-VL models. This feature captures CUDA graphs at specified patch sizes to reduce kernel launch overhead and improve performance. + +#### Enabling ViT CUDA Graphs + +**Important**: This feature is **not enabled by default**. The Piecewise CUDA Graph implementation relies on `torch.compile` to trace the computation graph and separate the attention operators. Therefore, users must explicitly enable ViT compilation via the `--compilation-config` argument to activate this feature. + +To enable ViT CUDA graph compilation, use: + +```bash +vllm serve --compilation-config '{"compile_mm_encoder": true}' +``` + +#### Configuring Capture Sizes + +You can specify custom patch sizes for CUDA graph capture using `vit_cudagraph_capture_sizes`. 
For models like `Qwen2.5-VL` and `Qwen3-VL`, the capture sizes should be multiples of the square of `merge_size`: + +```bash +vllm serve --compilation-config '{"compile_mm_encoder": true, "vit_cudagraph_capture_sizes": [512, 1024]}' +``` + +Alternatively, you can specify `max_vit_cudagraph_capture_size` to generate a default list of capture sizes up to the given value: + +```bash +vllm serve --compilation-config '{"compile_mm_encoder": true, "max_vit_cudagraph_capture_size": 2048}' +``` + +#### Default Behavior + +Once enabled, if `vit_cudagraph_capture_sizes` is not specified, vLLM will use a default set of sizes for capture. Since `compile_mm_encoder` is `False` by default, this feature remains inactive unless configured. + +If you only want to enable `torch.compile` for ViT without using the CUDA Graph feature, you can explicitly set the capture sizes to empty: + +```bash +vllm serve --compilation-config '{"compile_mm_encoder": true, "vit_cudagraph_capture_sizes": []}' +``` + +#### Limitations & Notes + +- **Image Only**: This feature currently only supports image inference. Video inference is not supported yet. 
## Troubleshooting diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index 0cff6ed5ef53..63bf3690891a 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -10,8 +10,8 @@ import os import pprint import time -from collections.abc import Callable, Generator, Iterator, Sequence -from contextlib import contextmanager +from collections.abc import Callable, Generator, Sequence +from contextlib import AbstractContextManager, contextmanager from copy import deepcopy from functools import partial from typing import Any @@ -30,6 +30,7 @@ from vllm.config import CompilationConfig, CUDAGraphMode, VllmConfig from vllm.config.compilation import DynamicShapesType from vllm.config.utils import Range, hash_factors +from vllm.forward_context import get_forward_context from vllm.logger import init_logger from vllm.logging_utils import lazy from vllm.platforms import current_platform @@ -48,44 +49,45 @@ logger = init_logger(__name__) -# A global flag to indicate if the current graph being compiled -# is the last one in a sequence of graphs (e.g., a sequence of blocks). -# This is a workaround to control CUDAGraph weak_ref_output behavior -# in **vit** piecewise compilation. -_is_last_graph_in_vit_sequence: bool = True - @contextmanager -def set_is_last_graph_in_vit_sequence(is_last: bool) -> Iterator[None]: - """Context manager to indicate if the current graph being compiled - is the last one in a sequence of graphs (e.g., a sequence of blocks). 
- """ - global _is_last_graph_in_vit_sequence - original_value = _is_last_graph_in_vit_sequence - _is_last_graph_in_vit_sequence = is_last +def _set_mm_encoder_sequence_flag( + attr_name: str, value: bool +) -> Generator[None, None, None]: + try: + ctx = get_forward_context() + original_value = getattr(ctx, attr_name) + setattr(ctx, attr_name, value) + except Exception: + yield + return + try: yield finally: - _is_last_graph_in_vit_sequence = original_value + setattr(ctx, attr_name, original_value) -# A global flag to indicate if the current graph being compiled -# is the first one in a sequence of graphs (e.g., a sequence of blocks). -_is_first_graph_in_vit_sequence: bool = True +def set_is_last_graph_in_mm_encoder_sequence( + is_last: bool, +) -> AbstractContextManager[None]: + """Context manager to indicate if the current graph being compiled + is the last one in a sequence of graphs (e.g., a sequence of blocks). + """ + return _set_mm_encoder_sequence_flag( + "is_last_graph_in_mm_encoder_sequence", is_last + ) -@contextmanager -def set_is_first_graph_in_vit_sequence(is_first: bool) -> Iterator[None]: +def set_is_first_graph_in_mm_encoder_sequence( + is_first: bool, +) -> AbstractContextManager[None]: """Context manager to indicate if the current graph being compiled is the first one in a sequence of graphs (e.g., a sequence of blocks). """ - global _is_first_graph_in_vit_sequence - original_value = _is_first_graph_in_vit_sequence - _is_first_graph_in_vit_sequence = is_first - try: - yield - finally: - _is_first_graph_in_vit_sequence = original_value + return _set_mm_encoder_sequence_flag( + "is_first_graph_in_mm_encoder_sequence", is_first + ) def make_copy_and_call( @@ -482,14 +484,24 @@ def wrap_with_cudagraph_if_needed( # CUDAGraphWrapper for piecewise_backend, to distinguish # it from the FULL cudagraph runtime mode, no matter it # is wrapped on a full or piecewise fx graph. 
+ + try: + fwd_ctx = get_forward_context() + is_first_graph_in_sequence = fwd_ctx.is_first_graph_in_mm_encoder_sequence + is_last_graph_in_sequence = fwd_ctx.is_last_graph_in_mm_encoder_sequence + except Exception: + # Fallback for when ForwardContext is not available + is_first_graph_in_sequence = True + is_last_graph_in_sequence = True + return static_graph_wrapper_class( runnable=piecewise_backend, vllm_config=vllm_config, runtime_mode=CUDAGraphMode.PIECEWISE, cudagraph_options=CUDAGraphOptions( debug_log_enable=is_first_graph, - gc_disable=not is_first_graph or not _is_first_graph_in_vit_sequence, - weak_ref_output=is_last_graph and _is_last_graph_in_vit_sequence, + gc_disable=not is_first_graph or not is_first_graph_in_sequence, + weak_ref_output=is_last_graph and is_last_graph_in_sequence, ), ) diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py index f43ee67f3fd1..919f214ce720 100644 --- a/vllm/config/vllm.py +++ b/vllm/config/vllm.py @@ -1350,15 +1350,16 @@ def _set_compile_ranges(self): def _set_vit_cudagraph_sizes(self): """Sets the CUDA graph capture sizes for the Vision Transformer (ViT). - This method determines the batch sizes for which ViT CUDA graphs will be - captured. CUDA graphs improve performance by reducing kernel launch - overhead for the vision encoder. + This method determines the batch sizes (in terms of number of patches) + for which ViT CUDA graphs will be captured. CUDA graphs improve + performance by reducing kernel launch overhead for the vision encoder. The logic is as follows: 1. The feature is only enabled if all of the following conditions are met: - - Eager mode is not enforced. - - CUDA graph mode is enabled. - - The multimodal encoder compilation is enabled. + - A model is configured (`model_config` is not None). + - Eager mode is not enforced (`enforce_eager` is False). + - CUDA graph mode is enabled (`cudagraph_mode` is not NONE). + - Multimodal encoder compilation is enabled (`compile_mm_encoder` is True). 
If these conditions are not met, the list of capture sizes will be empty, effectively disabling ViT CUDA graphs. @@ -1367,17 +1368,22 @@ def _set_vit_cudagraph_sizes(self): and sorted in ascending order. 3. If no sizes are provided by the user, a default list of sizes is - generated up to a maximum of 5120. The default sizes are: - [512, 1024, 1536] + list(range(2048, 2048, 128)) + list( - range(4096, 8192 + 1, 256)) - - The final list of sizes is stored in - `self.compilation_config.vit_cudagraph_capture_sizes`. - + generated. The maximum size for this list is determined automatically + by `compute_encoder_budget` (capped at 8192), or by the user-provided + `max_vit_cudagraph_capture_size`. The default sizes are: + [512, 1024, 1536] + list(range(2048, 4096, 128)) + list( + range(4096, max_size + 1, 256)) + + 4. The final list of sizes is stored in + `self.compilation_config.vit_cudagraph_capture_sizes`. The + `max_vit_cudagraph_capture_size` is also updated to be consistent + with the largest value in this final list. + + At runtime: - If a batch's size matches or is smaller than a captured size, the closest captured graph is used. - If a batch's size is larger than the largest captured size, a CUDA - graph will not be used for that batch. + graph will not be used for that batch (fallback to eager execution). 
""" if ( self.model_config is not None diff --git a/vllm/forward_context.py b/vllm/forward_context.py index 1b89c04851f0..d7a7603f6b41 100644 --- a/vllm/forward_context.py +++ b/vllm/forward_context.py @@ -255,6 +255,10 @@ class ForwardContext: all_moe_layers: list[str] | None = None moe_layer_index: int = 0 + # ViT Multi-Modal Encoder flags used by backend compiler + is_first_graph_in_mm_encoder_sequence: bool = True + is_last_graph_in_mm_encoder_sequence: bool = True + additional_kwargs: dict[str, Any] = field(default_factory=dict) def __post_init__(self): diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index 57b7b59fe28a..1127cfea1634 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -42,6 +42,10 @@ Qwen2_5_VLVisionConfig, ) +from vllm.compilation.backends import ( + set_is_first_graph_in_mm_encoder_sequence, + set_is_last_graph_in_mm_encoder_sequence, +) from vllm.compilation.decorators import support_torch_compile from vllm.config import ( CUDAGraphMode, @@ -825,14 +829,9 @@ def forward( else: hidden_states = x.to(device=self.device, dtype=self.dtype) - from vllm.compilation.backends import ( - set_is_first_graph_in_vit_sequence, - set_is_last_graph_in_vit_sequence, - ) - with ( - set_is_first_graph_in_vit_sequence(True), - set_is_last_graph_in_vit_sequence(False), + set_is_first_graph_in_mm_encoder_sequence(True), + set_is_last_graph_in_mm_encoder_sequence(False), ): hidden_states = self.patch_embed(hidden_states) @@ -925,8 +924,8 @@ def forward( hidden_states = original_hidden_states with ( - set_is_first_graph_in_vit_sequence(False), - set_is_last_graph_in_vit_sequence(False), + set_is_first_graph_in_mm_encoder_sequence(False), + set_is_last_graph_in_mm_encoder_sequence(False), ): for layer_num, blk in enumerate(self.blocks): if layer_num in self.fullatt_block_indexes: @@ -951,8 +950,8 @@ def forward( # adapter with ( - 
set_is_first_graph_in_vit_sequence(False), - set_is_last_graph_in_vit_sequence(True), + set_is_first_graph_in_mm_encoder_sequence(False), + set_is_last_graph_in_mm_encoder_sequence(True), ): hidden_states = self.merger(hidden_states) hidden_states = hidden_states[reverse_indices, :] diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index 9669809318c9..6a8ef0c239ec 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py @@ -48,6 +48,10 @@ ) from transformers.video_utils import VideoMetadata +from vllm.compilation.backends import ( + set_is_first_graph_in_mm_encoder_sequence, + set_is_last_graph_in_mm_encoder_sequence, +) from vllm.compilation.decorators import support_torch_compile from vllm.config import ( CUDAGraphMode, @@ -601,14 +605,9 @@ def forward( device=self.device, dtype=self.dtype, non_blocking=True ) - from vllm.compilation.backends import ( - set_is_first_graph_in_vit_sequence, - set_is_last_graph_in_vit_sequence, - ) - with ( - set_is_first_graph_in_vit_sequence(True), - set_is_last_graph_in_vit_sequence(False), + set_is_first_graph_in_mm_encoder_sequence(True), + set_is_last_graph_in_mm_encoder_sequence(False), ): hidden_states = self.patch_embed(hidden_states) @@ -660,8 +659,8 @@ def forward( deepstack_feature_lists = [] with ( - set_is_first_graph_in_vit_sequence(False), - set_is_last_graph_in_vit_sequence(False), + set_is_first_graph_in_mm_encoder_sequence(False), + set_is_last_graph_in_mm_encoder_sequence(False), ): for layer_num, blk in enumerate(self.blocks): hidden_states = blk( @@ -680,8 +679,8 @@ def forward( ](hidden_states) deepstack_feature_lists.append(deepstack_feature) with ( - set_is_first_graph_in_vit_sequence(False), - set_is_last_graph_in_vit_sequence(True), + set_is_first_graph_in_mm_encoder_sequence(False), + set_is_last_graph_in_mm_encoder_sequence(True), ): hidden_states = self.merger(hidden_states) hidden_states = torch.cat( diff --git 
a/vllm/v1/attention/ops/vit_attn_wrappers.py b/vllm/v1/attention/ops/vit_attn_wrappers.py index f6051e54713b..b226fb8c1134 100644 --- a/vllm/v1/attention/ops/vit_attn_wrappers.py +++ b/vllm/v1/attention/ops/vit_attn_wrappers.py @@ -150,7 +150,9 @@ def torch_sdpa_wrapper( v = v.contiguous() if cu_seqlens is None: - return apply_sdpa(q, k, v, scale=scale) + context_layer = apply_sdpa(q, k, v, scale=scale) + output.copy_(context_layer) + return output outputs = [] diff --git a/vllm/v1/cudagraph_dispatcher.py b/vllm/v1/cudagraph_dispatcher.py index bc90d4044f71..0549a57597ce 100644 --- a/vllm/v1/cudagraph_dispatcher.py +++ b/vllm/v1/cudagraph_dispatcher.py @@ -68,18 +68,11 @@ def __init__(self, vllm_config: VllmConfig): def _compute_bs_to_padded_graph_size(self) -> None: """Pre-compute the mapping from batch size to padded graph size.""" - max_size = self.compilation_config.max_cudagraph_capture_size + max_capture_size = self.compilation_config.max_cudagraph_capture_size capture_sizes = self.compilation_config.cudagraph_capture_sizes - self._bs_to_padded_graph_size: list[int] = [0] * (max_size + 1) - for end, start in zip( - capture_sizes + [max_size + 1], - [0] + capture_sizes, - ): - for bs in range(start, end): - if bs == start: - self._bs_to_padded_graph_size[bs] = start - else: - self._bs_to_padded_graph_size[bs] = end + self._bs_to_padded_graph_size = self._get_padded_size_map( + capture_sizes, max_capture_size + ) # Validate that compile_sizes won't be changed by padding. # Only validate when cudagraphs are actually being used. 
@@ -88,7 +81,7 @@ def _compute_bs_to_padded_graph_size(self) -> None: and self.cudagraph_mode != CUDAGraphMode.NONE ): for size in self.compilation_config.compile_sizes: - if size <= self.compilation_config.max_cudagraph_capture_size: + if size <= max_capture_size: padded = self._bs_to_padded_graph_size[size] if padded != size: raise ValueError( @@ -121,18 +114,31 @@ def _get_lora_cases(self) -> list[int]: def _compute_bs_to_padded_vit_graph_size(self) -> None: """pre-compute the mapping from batch size to ViT padded graph size.""" - max_size = self.compilation_config.max_vit_cudagraph_capture_size + max_capture_size = self.compilation_config.max_vit_cudagraph_capture_size capture_sizes = self.compilation_config.vit_cudagraph_capture_sizes - self._bs_to_padded_vit_graph_size: list[int] = [0] * (max_size + 1) + + self._bs_to_padded_vit_graph_size = self._get_padded_size_map( + capture_sizes, max_capture_size + ) + + def _get_padded_size_map( + self, capture_sizes: list[int] | None, max_size: int | None + ) -> list[int]: + if capture_sizes is None: + capture_sizes = [] + if max_size is None: + max_size = 0 + padded_size_map: list[int] = [0] * (max_size + 1) for end, start in zip( capture_sizes + [max_size + 1], [0] + capture_sizes, ): for bs in range(start, end): if bs == start: - self._bs_to_padded_vit_graph_size[bs] = start + padded_size_map[bs] = start else: - self._bs_to_padded_vit_graph_size[bs] = end + padded_size_map[bs] = end + return padded_size_map def _create_padded_batch_descriptor( self, @@ -318,7 +324,7 @@ def dispatch( return CUDAGraphMode.PIECEWISE, relaxed_batch_desc # finally, just return no cudagraphs and a trivial batch descriptor - return CUDAGraphMode.NONE, BatchDescriptor(num_tokens) + return CUDAGraphMode.NONE, BatchDescriptor(num_tokens, is_vit=is_vit) def get_capture_descs( self, is_vit: bool = False From 3f9950e9ba6d4883e995deb49bb10c7fef82203c Mon Sep 17 00:00:00 2001 From: Hongjian Zhang Date: Fri, 30 Jan 2026 16:01:24 +0800 Subject: 
[PATCH 29/35] chore: rename vit to mm_encoder Signed-off-by: Hongjian Zhang Signed-off-by: Hongjian Zhang --- docs/design/torch_compile_multimodal.md | 14 +-- .../piecewise/test_qwenvl_vit_cudagraph.py | 12 +- vllm/config/compilation.py | 8 +- vllm/config/vllm.py | 116 ++++++++++-------- vllm/engine/arg_utils.py | 20 +-- vllm/forward_context.py | 8 +- vllm/model_executor/models/qwen2_5_vl.py | 8 +- vllm/model_executor/models/qwen3_vl.py | 8 +- vllm/model_executor/models/vision.py | 7 +- vllm/v1/cudagraph_dispatcher.py | 53 ++++---- vllm/v1/worker/gpu_model_runner.py | 46 +++---- 11 files changed, 166 insertions(+), 134 deletions(-) diff --git a/docs/design/torch_compile_multimodal.md b/docs/design/torch_compile_multimodal.md index f3f3f3b433f5..260f4e136a58 100644 --- a/docs/design/torch_compile_multimodal.md +++ b/docs/design/torch_compile_multimodal.md @@ -72,7 +72,7 @@ vLLM now supports Piecewise CUDA Graph integration for the Vision Transformer (V #### Enabling ViT CUDA Graphs -**Important**: This feature is **not enabled by default**. The Piecewise CUDA Graph implementation relies on `torch.compile` to trace the computation graph and separate the attention operators. Therefore, users must explicitly enable ViT compilation via the `--compilation-config` argument to activate this feature. +**Important**: This feature is **not enabled by default**. The Piecewise CUDA Graph implementation relies on `torch.compile` to trace the computation graph and separate the attention operators. Therefore, users must explicitly enable mm_encoder compilation via the `--compilation-config` argument to activate this feature. To enable ViT CUDA graph compilation, use: @@ -82,26 +82,26 @@ vllm serve --compilation-config '{"compile_mm_encoder": true}' #### Configuring Capture Sizes -You can specify custom patch sizes for CUDA graph capture using `vit_cudagraph_capture_sizes`. 
For models like `Qwen2.5-VL` and `Qwen3-VL`, the capture sizes should be multiples of the square of `merge_size`: +You can specify custom patch sizes for CUDA graph capture using `mm_encoder_cudagraph_capture_sizes`. For models like `Qwen2.5-VL` and `Qwen3-VL`, the capture sizes should be multiples of the square of `merge_size`: ```bash -vllm serve --compilation-config '{"compile_mm_encoder": true, "vit_cudagraph_capture_sizes": [512, 1024]}' +vllm serve --compilation-config '{"compile_mm_encoder": true, "mm_encoder_cudagraph_capture_sizes": [512, 1024]}' ``` -Alternatively, you can specify `max_vit_cudagraph_capture_size` to generate a default list of capture sizes up to the given value: +Alternatively, you can specify `max_mm_encoder_cudagraph_capture_size` to generate a default list of capture sizes up to the given value: ```bash -vllm serve --compilation-config '{"compile_mm_encoder": true, "max_vit_cudagraph_capture_size": 2048}' +vllm serve --compilation-config '{"compile_mm_encoder": true, "max_mm_encoder_cudagraph_capture_size": 2048}' ``` #### Default Behavior -Once enabled, if `vit_cudagraph_capture_sizes` is not specified, vLLM will use a default set of sizes for capture. Since `compile_mm_encoder` is `False` by default, this feature remains inactive unless configured. +Once enabled, if `mm_encoder_cudagraph_capture_sizes` is not specified, vLLM will use a default set of sizes for capture. Since `compile_mm_encoder` is `False` by default, this feature remains inactive unless configured. 
If you only want to enable `torch.compile` for ViT without using the CUDA Graph feature, you can explicitly set the capture sizes to empty: ```bash -vllm serve --compilation-config '{"compile_mm_encoder": true, "vit_cudagraph_capture_sizes": []}' +vllm serve --compilation-config '{"compile_mm_encoder": true, "mm_encoder_cudagraph_capture_sizes": []}' ``` #### Limitations & Notes diff --git a/tests/compile/piecewise/test_qwenvl_vit_cudagraph.py b/tests/compile/piecewise/test_qwenvl_vit_cudagraph.py index cddf2147b137..ec3176885409 100644 --- a/tests/compile/piecewise/test_qwenvl_vit_cudagraph.py +++ b/tests/compile/piecewise/test_qwenvl_vit_cudagraph.py @@ -22,8 +22,9 @@ def _worker_embed_multimodal( This function sets up the necessary forward context for tensor-parallel (TP) execution and then calls the model's `embed_multimodal` method. Note: For data-parallel (DP) mode, the forward context is typically - created and managed within the vision dispatcher, which would override - the context set here. + created and managed within the + vision.py:run_dp_sharded_mrope_vision_model(), which would override the + context set here. Args: worker: The worker instance containing the model runner. vllm_config: The vLLM engine configuration. 
@@ -103,7 +104,7 @@ def llm(request): compilation_config=CompilationConfig( cudagraph_mode="PIECEWISE", compile_mm_encoder=True, - vit_cudagraph_capture_sizes=[64, 128, 256], + mm_encoder_cudagraph_capture_sizes=[64, 128, 256], ), ) print(f"LLM initialized for {model_name} tp={tp_size} mode={mm_mode}") @@ -154,7 +155,10 @@ def _run_embed_multimodal( # Dispatch to get runtime mode and batch descriptor cudagraph_runtime_mode, batch_descriptor = dispatcher.dispatch( - num_tokens=num_patches, uniform_decode=False, has_lora=False, is_vit=True + num_tokens=num_patches, + uniform_decode=False, + has_lora=False, + is_mm_encoder=True, ) model_executor = llm.llm_engine.model_executor diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py index 2da96d938765..70ba6e68ec5d 100644 --- a/vllm/config/compilation.py +++ b/vllm/config/compilation.py @@ -530,12 +530,12 @@ class CompilationConfig: """Sizes to capture cudagraph. - None (default): capture sizes are inferred from vllm config. - list[int]: capture sizes are specified as given.""" - vit_cudagraph_capture_sizes: list[int] | None = None - """Sizes to capture vit cudagraph. + mm_encoder_cudagraph_capture_sizes: list[int] | None = None + """Sizes to capture mm_encoder cudagraph. - None (default): capture sizes are inferred from vllm config. - list[int]: capture sizes are specified as given.""" - max_vit_cudagraph_capture_size: int = field(default=None) - """The maximum vit cudagraph capture size. + max_mm_encoder_cudagraph_capture_size: int = field(default=None) + """The maximum mm_encoder cudagraph capture size. """ cudagraph_copy_inputs: bool = False """Whether to copy input tensors for diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py index 919f214ce720..8a05b2533089 100644 --- a/vllm/config/vllm.py +++ b/vllm/config/vllm.py @@ -257,10 +257,10 @@ class VllmConfig: performance, with -O0 having the best startup time and -O3 having the best performance. -02 is used by defult. 
See OptimizationLevel for full description.""" - is_in_compile_or_vit_cuda_graph_capture: bool = False - """Flag for ViT compilation or ViT CUDA graph capture. + in_mm_encoder_tracing: bool = False + """Flag for mm_encoder compilation or mm_encoder CUDA graph capture. - If true, ViT in DP mode will execute the ViT model directly instead of + If true, mm_encoder in DP mode will execute the mm_encoder model directly instead of `run_dp_sharded_mrope_vision_model` to ensure correct memory profiling and compilation for each rank. """ @@ -827,7 +827,7 @@ def has_blocked_weights(): self.compilation_config.cudagraph_num_of_warmups = 1 self._set_cudagraph_sizes() - self._set_vit_cudagraph_sizes() + self._set_mm_encoder_cudagraph_sizes() else: self.compilation_config.cudagraph_mode = CUDAGraphMode.NONE @@ -1347,12 +1347,12 @@ def _set_compile_ranges(self): computed_compile_ranges_split_points ) - def _set_vit_cudagraph_sizes(self): - """Sets the CUDA graph capture sizes for the Vision Transformer (ViT). + def _set_mm_encoder_cudagraph_sizes(self): + """Sets the CUDA graph capture sizes for the multimodal encoder (MM Encoder). This method determines the batch sizes (in terms of number of patches) - for which ViT CUDA graphs will be captured. CUDA graphs improve - performance by reducing kernel launch overhead for the vision encoder. + for which MM Encoder CUDA graphs will be captured. CUDA graphs improve + performance by reducing kernel launch overhead for the multimodal encoder. The logic is as follows: 1. The feature is only enabled if all of the following conditions are met: @@ -1361,22 +1361,22 @@ def _set_vit_cudagraph_sizes(self): - CUDA graph mode is enabled (`cudagraph_mode` is not NONE). - Multimodal encoder compilation is enabled (`compile_mm_encoder` is True). If these conditions are not met, the list of capture sizes will be empty, - effectively disabling ViT CUDA graphs. + effectively disabling mm_encoder CUDA graphs. - 2. 
If the user has explicitly provided `vit_cudagraph_capture_sizes` in the - compilation config, those sizes are used. The list is de-duplicated - and sorted in ascending order. + 2. If the user has explicitly provided `mm_encoder_cudagraph_capture_sizes` + in the compilation config, those sizes are used. The list is + de-duplicated and sorted in ascending order. 3. If no sizes are provided by the user, a default list of sizes is generated. The maximum size for this list is determined automatically by `compute_encoder_budget` (capped at 8192), or by the user-provided - `max_vit_cudagraph_capture_size`. The default sizes are: + `max_mm_encoder_cudagraph_capture_size`. The default sizes are: [512, 1024, 1536] + list(range(2048, 4096, 128)) + list( range(4096, max_size + 1, 256)) 4. The final list of sizes is stored in - `self.compilation_config.vit_cudagraph_capture_sizes`. The - `max_vit_cudagraph_capture_size` is also updated to be consistent + `self.compilation_config.mm_encoder_cudagraph_capture_sizes`. The + `max_mm_encoder_cudagraph_capture_size` is also updated to be consistent with the largest value in this final list. 
At runtime: @@ -1391,11 +1391,11 @@ def _set_vit_cudagraph_sizes(self): and self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE and self.compilation_config.compile_mm_encoder ): - # determine the initial max_vit_cudagraph_capture_size - max_vit_cudagraph_capture_size = ( - self.compilation_config.max_vit_cudagraph_capture_size + # determine the initial max_mm_encoder_cudagraph_capture_size + max_mm_encoder_cudagraph_capture_size = ( + self.compilation_config.max_mm_encoder_cudagraph_capture_size ) - if max_vit_cudagraph_capture_size is None: + if max_mm_encoder_cudagraph_capture_size is None: from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.v1.core.encoder_cache_manager import compute_encoder_budget @@ -1404,65 +1404,83 @@ def _set_vit_cudagraph_sizes(self): scheduler_config=self.scheduler_config, mm_registry=MULTIMODAL_REGISTRY, ) - max_vit_cudagraph_capture_size = min(encoder_compute_budget, 8192) + max_mm_encoder_cudagraph_capture_size = min( + encoder_compute_budget, 8192 + ) - # determine the vit_cudagraph_capture_sizes - if self.compilation_config.vit_cudagraph_capture_sizes is not None: + # determine the mm_encoder_cudagraph_capture_sizes + if self.compilation_config.mm_encoder_cudagraph_capture_sizes is not None: # de-duplicate the sizes provided by the config dedup_sizes = list( - set(self.compilation_config.vit_cudagraph_capture_sizes) + set(self.compilation_config.mm_encoder_cudagraph_capture_sizes) ) - vit_cudagraph_capture_sizes = dedup_sizes + mm_encoder_cudagraph_capture_sizes = dedup_sizes # sort to make sure the sizes are in ascending order - vit_cudagraph_capture_sizes.sort() + mm_encoder_cudagraph_capture_sizes.sort() else: - vit_cudagraph_capture_sizes = [ - i for i in [512, 1024, 1536] if i <= max_vit_cudagraph_capture_size + mm_encoder_cudagraph_capture_sizes = [ + i + for i in [512, 1024, 1536] + if i <= max_mm_encoder_cudagraph_capture_size ] - if max_vit_cudagraph_capture_size >= 2048: + if 
max_mm_encoder_cudagraph_capture_size >= 2048: # Step size 128 for larger batch sizes - vit_cudagraph_capture_sizes += list( - range(2048, min(max_vit_cudagraph_capture_size + 1, 4096), 128) + mm_encoder_cudagraph_capture_sizes += list( + range( + 2048, + min(max_mm_encoder_cudagraph_capture_size + 1, 4096), + 128, + ) ) - if max_vit_cudagraph_capture_size >= 4096: + if max_mm_encoder_cudagraph_capture_size >= 4096: # Step size 256 for largest batch sizes - vit_cudagraph_capture_sizes += list( - range(4096, max_vit_cudagraph_capture_size + 1, 256) + mm_encoder_cudagraph_capture_sizes += list( + range(4096, max_mm_encoder_cudagraph_capture_size + 1, 256) ) - # user-specific compilation_config.max_vit_cudagraph_capture_size get + # user-specific compilation_config.max_mm_encoder_cudagraph_capture_size get # truncated to valid_max_size when they are inconsistent. valid_max_size = ( - vit_cudagraph_capture_sizes[-1] if vit_cudagraph_capture_sizes else 0 + mm_encoder_cudagraph_capture_sizes[-1] + if mm_encoder_cudagraph_capture_sizes + else 0 ) if ( - self.compilation_config.max_vit_cudagraph_capture_size is not None - and self.compilation_config.max_vit_cudagraph_capture_size + self.compilation_config.max_mm_encoder_cudagraph_capture_size + is not None + and self.compilation_config.max_mm_encoder_cudagraph_capture_size != valid_max_size ): # raise error only when both two flags are user-specified # and they are inconsistent with each other - if self.compilation_config.vit_cudagraph_capture_sizes is not None: + if ( + self.compilation_config.mm_encoder_cudagraph_capture_sizes + is not None + ): raise ValueError( - "customized max_vit_cudagraph_capture_size" - f"(={self.compilation_config.max_vit_cudagraph_capture_size}) " - "should be consistent with the max value of " - f"vit_cudagraph_capture_sizes(={valid_max_size})" + "customized max_mm_encoder_cudagraph_capture_size(=" + f"{ + self.compilation_config.max_mm_encoder_cudagraph_capture_size + }" + ") should be 
consistent with the max value of " + f"mm_encoder_cudagraph_capture_sizes(={valid_max_size})" ) logger.warning( - "Truncating max_vit_cudagraph_capture_size to %d", + "Truncating max_mm_encoder_cudagraph_capture_size to %d", valid_max_size, ) - # always set the final max_vit_cudagraph_capture_size - self.compilation_config.max_vit_cudagraph_capture_size = valid_max_size - self.compilation_config.vit_cudagraph_capture_sizes = ( - vit_cudagraph_capture_sizes + # always set the final max_mm_encoder_cudagraph_capture_size + self.compilation_config.max_mm_encoder_cudagraph_capture_size = ( + valid_max_size + ) + self.compilation_config.mm_encoder_cudagraph_capture_sizes = ( + mm_encoder_cudagraph_capture_sizes ) else: # no cudagraph in use - self.compilation_config.max_vit_cudagraph_capture_size = 0 - self.compilation_config.vit_cudagraph_capture_sizes = [] + self.compilation_config.max_mm_encoder_cudagraph_capture_size = 0 + self.compilation_config.mm_encoder_cudagraph_capture_sizes = [] def try_verify_and_update_config(self): if self.model_config is None: diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 9733b0f26ec2..7a8599a29e43 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -378,8 +378,8 @@ class EngineArgs: max_cudagraph_capture_size: int | None = get_field( CompilationConfig, "max_cudagraph_capture_size" ) - vit_cudagraph_capture_sizes: list[int] | None = ( - CompilationConfig.vit_cudagraph_capture_sizes + mm_encoder_cudagraph_capture_sizes: list[int] | None = ( + CompilationConfig.mm_encoder_cudagraph_capture_sizes ) # Note: Specifying a custom executor backend by passing a class # is intended for expert use only. 
The API may change without @@ -1152,8 +1152,8 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: "--cudagraph-capture-sizes", **compilation_kwargs["cudagraph_capture_sizes"] ) compilation_group.add_argument( - "--vit-cudagraph-capture-sizes", - **compilation_kwargs["vit_cudagraph_capture_sizes"], + "--mm-encoder-cudagraph-capture-sizes", + **compilation_kwargs["mm_encoder_cudagraph_capture_sizes"], ) compilation_group.add_argument( "--max-cudagraph-capture-size", @@ -1745,14 +1745,14 @@ def create_engine_config( ) compilation_config.cudagraph_capture_sizes = self.cudagraph_capture_sizes - if self.vit_cudagraph_capture_sizes is not None: - if compilation_config.vit_cudagraph_capture_sizes is not None: + if self.mm_encoder_cudagraph_capture_sizes is not None: + if compilation_config.mm_encoder_cudagraph_capture_sizes is not None: raise ValueError( - "vit_cudagraph_capture_sizes and compilation_config." - "vit_cudagraph_capture_sizes are mutually exclusive" + "mm_encoder_cudagraph_capture_sizes and compilation_config." + "mm_encoder_cudagraph_capture_sizes are mutually exclusive" ) - compilation_config.vit_cudagraph_capture_sizes = ( - self.vit_cudagraph_capture_sizes + compilation_config.mm_encoder_cudagraph_capture_sizes = ( + self.mm_encoder_cudagraph_capture_sizes ) if self.max_cudagraph_capture_size is not None: diff --git a/vllm/forward_context.py b/vllm/forward_context.py index d7a7603f6b41..be08e2d9a6bc 100644 --- a/vllm/forward_context.py +++ b/vllm/forward_context.py @@ -55,9 +55,9 @@ class BatchDescriptor(NamedTuple): (like fused_moe_lora) whose grid size depends on num_active_loras to be properly captured. 
""" - is_vit: bool = False + is_mm_encoder: bool = False """ - ViT Piecewise CUDA Graph Flag + mm_encoder Piecewise CUDA Graph Flag """ def relax_for_mixed_batch_cudagraphs(self) -> "BatchDescriptor": @@ -71,7 +71,7 @@ def relax_for_mixed_batch_cudagraphs(self) -> "BatchDescriptor": uniform=False, has_lora=self.has_lora, num_active_loras=self.num_active_loras, - is_vit=self.is_vit, + is_mm_encoder=self.is_mm_encoder, ) @@ -255,7 +255,7 @@ class ForwardContext: all_moe_layers: list[str] | None = None moe_layer_index: int = 0 - # ViT Multi-Modal Encoder flags used by backend compiler + # Multi-modal encoder flags used by backend compiler is_first_graph_in_mm_encoder_sequence: bool = True is_last_graph_in_mm_encoder_sequence: bool = True diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index 1127cfea1634..e2793bae0b4b 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -653,9 +653,9 @@ def __init__( self._persistent_hidden_states_buffer: torch.Tensor | None = None self._persistent_rotary_pos_emb_cos_buffer: torch.Tensor | None = None self._persistent_rotary_pos_emb_sin_buffer: torch.Tensor | None = None - if vllm_config.compilation_config.vit_cudagraph_capture_sizes: + if vllm_config.compilation_config.mm_encoder_cudagraph_capture_sizes: max_compile_size = ( - vllm_config.compilation_config.vit_cudagraph_capture_sizes[-1] + vllm_config.compilation_config.mm_encoder_cudagraph_capture_sizes[-1] ) self._persistent_hidden_states_buffer = torch.empty( (max_compile_size, self.patch_embed.proj.input_size), @@ -1292,7 +1292,7 @@ def _process_image_input( with set_current_vllm_config(self.vllm_config): if ( self.use_data_parallel - and not self.vllm_config.is_in_compile_or_vit_cuda_graph_capture + and not self.vllm_config.in_mm_encoder_tracing ): return run_dp_sharded_mrope_vision_model( self.visual, @@ -1356,7 +1356,7 @@ def _process_video_input( with 
set_current_vllm_config(self.vllm_config): if ( self.use_data_parallel - and not self.vllm_config.is_in_compile_or_vit_cuda_graph_capture + and not self.vllm_config.in_mm_encoder_tracing ): return run_dp_sharded_mrope_vision_model( self.visual, diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index 6a8ef0c239ec..e87d3701c31a 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py @@ -439,9 +439,9 @@ def __init__( self._persistent_hidden_states_buffer: torch.Tensor | None = None self._persistent_rotary_pos_emb_cos_buffer: torch.Tensor | None = None self._persistent_rotary_pos_emb_sin_buffer: torch.Tensor | None = None - if vllm_config.compilation_config.vit_cudagraph_capture_sizes: + if vllm_config.compilation_config.mm_encoder_cudagraph_capture_sizes: max_compile_size = ( - vllm_config.compilation_config.vit_cudagraph_capture_sizes[-1] + vllm_config.compilation_config.mm_encoder_cudagraph_capture_sizes[-1] ) self._persistent_hidden_states_buffer = torch.empty( (max_compile_size, self.patch_embed.proj.input_size), @@ -1531,7 +1531,7 @@ def _process_image_input( with set_current_vllm_config(self.vllm_config): if ( self.use_data_parallel - and not self.vllm_config.is_in_compile_or_vit_cuda_graph_capture + and not self.vllm_config.in_mm_encoder_tracing ): return run_dp_sharded_mrope_vision_model( self.visual, @@ -1566,7 +1566,7 @@ def _process_video_input( with set_current_vllm_config(self.vllm_config): if ( self.use_data_parallel - and not self.vllm_config.is_in_compile_or_vit_cuda_graph_capture + and not self.vllm_config.in_mm_encoder_tracing ): return run_dp_sharded_mrope_vision_model( self.visual, diff --git a/vllm/model_executor/models/vision.py b/vllm/model_executor/models/vision.py index 1637b27209af..67a50b0da054 100644 --- a/vllm/model_executor/models/vision.py +++ b/vllm/model_executor/models/vision.py @@ -494,14 +494,17 @@ def run_dp_sharded_mrope_vision_model( 
cudagraph_runtime_mode = CUDAGraphMode.NONE batch_descriptor = None - if vllm_config and vllm_config.compilation_config.vit_cudagraph_capture_sizes: + if ( + vllm_config + and vllm_config.compilation_config.mm_encoder_cudagraph_capture_sizes + ): current_input_len = pixel_values_local.shape[0] cudagraph_runtime_mode, batch_descriptor = dispatcher.dispatch( num_tokens=current_input_len, uniform_decode=False, has_lora=False, disable_full=False, - is_vit=True, + is_mm_encoder=True, ) target_input_len = batch_descriptor.num_tokens diff --git a/vllm/v1/cudagraph_dispatcher.py b/vllm/v1/cudagraph_dispatcher.py index 0549a57597ce..1dfe1d07f1d4 100644 --- a/vllm/v1/cudagraph_dispatcher.py +++ b/vllm/v1/cudagraph_dispatcher.py @@ -70,7 +70,7 @@ def _compute_bs_to_padded_graph_size(self) -> None: """Pre-compute the mapping from batch size to padded graph size.""" max_capture_size = self.compilation_config.max_cudagraph_capture_size capture_sizes = self.compilation_config.cudagraph_capture_sizes - self._bs_to_padded_graph_size = self._get_padded_size_map( + self._bs_to_padded_graph_size: list[int] = self._get_padded_size_map( capture_sizes, max_capture_size ) @@ -112,12 +112,12 @@ def _get_lora_cases(self) -> list[int]: # No specialization: only capture graphs with LoRA active return [lora_config.max_loras + 1] - def _compute_bs_to_padded_vit_graph_size(self) -> None: - """pre-compute the mapping from batch size to ViT padded graph size.""" - max_capture_size = self.compilation_config.max_vit_cudagraph_capture_size - capture_sizes = self.compilation_config.vit_cudagraph_capture_sizes + def _compute_bs_to_padded_mm_encoder_graph_size(self) -> None: + """pre-compute the mapping from batch size to mm_encoder padded graph size.""" + max_capture_size = self.compilation_config.max_mm_encoder_cudagraph_capture_size + capture_sizes = self.compilation_config.mm_encoder_cudagraph_capture_sizes - self._bs_to_padded_vit_graph_size = self._get_padded_size_map( + 
self._bs_to_padded_mm_encoder_graph_size: list[int] = self._get_padded_size_map( capture_sizes, max_capture_size ) @@ -146,12 +146,12 @@ def _create_padded_batch_descriptor( uniform_decode: bool, has_lora: bool, num_active_loras: int = 0, - is_vit: bool = False, + is_mm_encoder: bool = False, ) -> BatchDescriptor: max_num_seqs = self.vllm_config.scheduler_config.max_num_seqs uniform_decode_query_len = self.uniform_decode_query_len - if is_vit: - num_tokens_padded = self._bs_to_padded_vit_graph_size[num_tokens] + if is_mm_encoder: + num_tokens_padded = self._bs_to_padded_mm_encoder_graph_size[num_tokens] else: num_tokens_padded = self._bs_to_padded_graph_size[num_tokens] @@ -168,7 +168,7 @@ def _create_padded_batch_descriptor( uniform=uniform_decode, has_lora=has_lora, num_active_loras=num_active_loras, - is_vit=is_vit + is_mm_encoder=is_mm_encoder, ) def add_cudagraph_key( @@ -192,7 +192,7 @@ def initialize_cudagraph_keys( return self._compute_bs_to_padded_graph_size() - self._compute_bs_to_padded_vit_graph_size() + self._compute_bs_to_padded_mm_encoder_graph_size() # Get LoRA cases to capture lora_cases = self._get_lora_cases() @@ -213,12 +213,12 @@ def initialize_cudagraph_keys( bs, False, num_active_loras > 0, num_active_loras ).relax_for_mixed_batch_cudagraphs(), ) - # ViT CUDAGraph Entry - for patch_len in self.compilation_config.vit_cudagraph_capture_sizes: + # mm_encoder CUDAGraph Entry + for patch_len in self.compilation_config.mm_encoder_cudagraph_capture_sizes: self.add_cudagraph_key( - cudagraph_mode.mixed_mode(), + CUDAGraphMode.PIECEWISE, self._create_padded_batch_descriptor( - patch_len, False, False, is_vit=True + patch_len, False, False, is_mm_encoder=True ).relax_for_mixed_batch_cudagraphs(), ) @@ -256,7 +256,7 @@ def dispatch( has_lora: bool = False, disable_full: bool = False, num_active_loras: int = 0, - is_vit: bool = False, + is_mm_encoder: bool = False, ) -> tuple[CUDAGraphMode, BatchDescriptor]: """ Given conditions(e.g.,batch descriptor and 
if using piecewise only), @@ -278,15 +278,18 @@ def dispatch( not self.keys_initialized or self.cudagraph_mode == CUDAGraphMode.NONE or ( - not is_vit + not is_mm_encoder and num_tokens > self.compilation_config.max_cudagraph_capture_size ) or ( - is_vit - and num_tokens > self.compilation_config.max_vit_cudagraph_capture_size + is_mm_encoder + and num_tokens + > self.compilation_config.max_mm_encoder_cudagraph_capture_size ) ): - return CUDAGraphMode.NONE, BatchDescriptor(num_tokens, is_vit=is_vit) + return CUDAGraphMode.NONE, BatchDescriptor( + num_tokens, is_mm_encoder=is_mm_encoder + ) effective_num_active_loras = num_active_loras if has_lora and num_active_loras > 0: @@ -305,7 +308,7 @@ def dispatch( effective_num_active_loras = self.vllm_config.lora_config.max_loras + 1 batch_desc = self._create_padded_batch_descriptor( - num_tokens, uniform_decode, has_lora, effective_num_active_loras, is_vit + num_tokens, uniform_decode, has_lora, effective_num_active_loras, is_mm_encoder ) relaxed_batch_desc = batch_desc.relax_for_mixed_batch_cudagraphs() @@ -324,10 +327,12 @@ def dispatch( return CUDAGraphMode.PIECEWISE, relaxed_batch_desc # finally, just return no cudagraphs and a trivial batch descriptor - return CUDAGraphMode.NONE, BatchDescriptor(num_tokens, is_vit=is_vit) + return CUDAGraphMode.NONE, BatchDescriptor( + num_tokens, is_mm_encoder=is_mm_encoder + ) def get_capture_descs( - self, is_vit: bool = False + self, is_mm_encoder: bool = False ) -> list[tuple[CUDAGraphMode, list[BatchDescriptor]]]: """ Returns capture descriptors for cudagraph capturing. 
@@ -346,7 +351,7 @@ def get_capture_descs( descs = list(self.cudagraph_keys[mode]) if descs: # Sort by num_tokens descending (largest first) - filter_descs = [d for d in descs if d.is_vit == is_vit] + filter_descs = [d for d in descs if d.is_mm_encoder == is_mm_encoder] if filter_descs: filter_descs.sort(key=lambda d: d.num_tokens, reverse=True) result.append((mode, filter_descs)) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index b07874cd6a16..a8d7d74ea127 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -553,14 +553,14 @@ def __init__( self.cudagraph_batch_sizes = sorted( self.compilation_config.cudagraph_capture_sizes ) - # self.vit_cudagraph_batch_sizes sorts in ascending order. - self.vit_cudagraph_batch_sizes: list[int] | None = None + # self.mm_encoder_cudagraph_batch_sizes sorts in ascending order. + self.mm_encoder_cudagraph_batch_sizes: list[int] | None = None if ( - self.compilation_config.vit_cudagraph_capture_sizes + self.compilation_config.mm_encoder_cudagraph_capture_sizes and self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE ): - self.vit_cudagraph_batch_sizes = sorted( - self.compilation_config.vit_cudagraph_capture_sizes + self.mm_encoder_cudagraph_batch_sizes = sorted( + self.compilation_config.mm_encoder_cudagraph_capture_sizes ) # Cache the device properties. 
@@ -2441,11 +2441,11 @@ def _execute_mm_encoder( original_num_imgs = -1 padded_num_tokens = -1 - # Default values for non-ViT cudagraph case + # Default values for non-mm_encoder cudagraph case cudagraph_runtime_mode = CUDAGraphMode.NONE batch_descriptor = None if ( - self.vit_cudagraph_batch_sizes + self.mm_encoder_cudagraph_batch_sizes and "pixel_values" in mm_kwargs_group ): pixel_values = cast( @@ -2460,7 +2460,7 @@ def _execute_mm_encoder( uniform_decode=False, has_lora=False, disable_full=False, - is_vit=True, + is_mm_encoder=True, ) ) padded_num_tokens = batch_descriptor.num_tokens @@ -4682,10 +4682,10 @@ def _get_dummy_h_w_patches(self, patches: int): w_patches = patches // merge_size return h_patches, w_patches - def _get_dummy_vit_input( + def _get_dummy_mm_encoder_input( self, num_image_tokens: int, img_feature_dim: int ) -> BatchedTensorInputs: - """Dummy data for profiling and precompiling ViT.""" + """Dummy data for profiling and precompiling mm_encoder.""" # The first dimension of pixel_values corresponds # to the total number of patches. 
@@ -5236,7 +5236,7 @@ def _dummy_mm_encoder_run( self, compilation_cases: list[int], ) -> None: - self.vllm_config.is_in_compile_or_vit_cuda_graph_capture = True + self.vllm_config.in_mm_encoder_tracing = True tmp_dummy_mm_inputs = self._get_mm_dummy_batch( "video", 1, @@ -5249,17 +5249,19 @@ def _dummy_mm_encoder_run( compilation_cases = tqdm( compilation_cases, disable=not self.load_config.use_tqdm_on_load, - desc="Capturing Vit CUDA graphs (PIECEWISE)", + desc="Capturing mm_encoder CUDA graphs (PIECEWISE)", ) - # Lazy initialization of the persistent buffer + for capture_size in compilation_cases: - dummy_mm_inputs = self._get_dummy_vit_input(capture_size, img_feature_dim) + dummy_mm_inputs = self._get_dummy_mm_encoder_input( + capture_size, img_feature_dim + ) cudagraph_mode, batch_descriptor = self.cudagraph_dispatcher.dispatch( num_tokens=capture_size, uniform_decode=False, has_lora=False, disable_full=False, - is_vit=True, + is_mm_encoder=True, ) with ( set_forward_context( @@ -5271,10 +5273,10 @@ def _dummy_mm_encoder_run( ), ): self.model.embed_multimodal(**dummy_mm_inputs) - self.vllm_config.is_in_compile_or_vit_cuda_graph_capture = False + self.vllm_config.in_mm_encoder_tracing = False def profile_run(self) -> None: - self.vllm_config.is_in_compile_or_vit_cuda_graph_capture = True + self.vllm_config.in_mm_encoder_tracing = True # Profile with multimodal encoder & encoder cache. 
if self.supports_mm_inputs: mm_config = self.model_config.multimodal_config @@ -5339,7 +5341,7 @@ def profile_run(self) -> None: del hidden_states, output self.encoder_cache.clear() gc.collect() - self.vllm_config.is_in_compile_or_vit_cuda_graph_capture = False + self.vllm_config.in_mm_encoder_tracing = False def capture_model(self) -> int: if self.compilation_config.cudagraph_mode == CUDAGraphMode.NONE: @@ -5491,10 +5493,10 @@ def _capture_cudagraphs( cudagraph_runtime_mode == CUDAGraphMode.PIECEWISE and self.supports_mm_inputs ): - vit_capture_sizes = self.vit_cudagraph_batch_sizes - if vit_capture_sizes: - compilation_cases_vit = list(reversed(vit_capture_sizes)) - self._dummy_mm_encoder_run(compilation_cases_vit) + mm_encoder_capture_sizes = self.mm_encoder_cudagraph_batch_sizes + if mm_encoder_capture_sizes: + compilation_cases_mm_encoder = list(reversed(mm_encoder_capture_sizes)) + self._dummy_mm_encoder_run(compilation_cases_mm_encoder) self.maybe_remove_all_loras(self.lora_config) From 13c6422a591d505a3946ef399f5f8cd2bbed96dc Mon Sep 17 00:00:00 2001 From: Hongjian Zhang Date: Tue, 3 Feb 2026 00:10:44 +0800 Subject: [PATCH 30/35] feat: add MMEncoderCudagraphManager and update related components for multimodal input handling Signed-off-by: Hongjian Zhang --- vllm/model_executor/models/qwen2_5_vl.py | 14 +- vllm/model_executor/models/qwen2_vl.py | 19 ++ vllm/model_executor/models/qwen3_vl.py | 33 +++- vllm/model_executor/models/vision.py | 50 ++---- vllm/multimodal/processing/dummy_inputs.py | 50 ++++++ vllm/v1/worker/gpu_model_runner.py | 192 ++++----------------- vllm/v1/worker/mm_cudagraph.py | 173 +++++++++++++++++++ 7 files changed, 324 insertions(+), 207 deletions(-) create mode 100644 vllm/v1/worker/mm_cudagraph.py diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index e2793bae0b4b..270346801437 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -1279,7 
+1279,7 @@ def _parse_and_validate_video_input( def _process_image_input( self, image_input: Qwen2_5_VLImageInputs, - cudagraph_dispatcher: Any | None = None, + mm_cudagraph_manager: Any | None = None, ) -> tuple[torch.Tensor, ...]: grid_thw = image_input["image_grid_thw"] assert grid_thw.ndim == 2 @@ -1299,7 +1299,7 @@ def _process_image_input( pixel_values, grid_thw_list, rope_type="rope_3d", - cudagraph_dispatcher=cudagraph_dispatcher, + mm_cudagraph_manager=mm_cudagraph_manager, ) else: image_embeds = self.visual(pixel_values, grid_thw=grid_thw_list) @@ -1343,7 +1343,7 @@ def _postprocess_image_embeds_evs( def _process_video_input( self, video_input: Qwen2_5_VLVideoInputs, - cudagraph_dispatcher: Any | None = None, + mm_cudagraph_manager: Any | None = None, ) -> tuple[torch.Tensor, ...]: grid_thw = video_input["video_grid_thw"] assert grid_thw.ndim == 2 @@ -1363,7 +1363,7 @@ def _process_video_input( pixel_values_videos, grid_thw_list, rope_type="rope_3d", - cudagraph_dispatcher=cudagraph_dispatcher, + mm_cudagraph_manager=mm_cudagraph_manager, ) else: video_embeds = self.visual( @@ -1513,7 +1513,7 @@ def _parse_and_validate_multimodal_inputs(self, **kwargs: object) -> dict: return mm_input_by_modality def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings: - cudagraph_dispatcher = kwargs.pop("cudagraph_dispatcher", None) + mm_cudagraph_manager = kwargs.pop("mm_cudagraph_manager", None) mm_input_by_modality = self._parse_and_validate_multimodal_inputs(**kwargs) if not mm_input_by_modality: return [] @@ -1528,7 +1528,7 @@ def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings: multimodal_input = mm_input_by_modality[modality] if modality == "image": image_embeddings = self._process_image_input( - multimodal_input, cudagraph_dispatcher=cudagraph_dispatcher + multimodal_input, mm_cudagraph_manager=mm_cudagraph_manager ) if self.is_multimodal_pruning_enabled: image_embeddings = self._postprocess_image_embeds_evs( @@ -1537,7 +1537,7 @@ 
def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings: multimodal_embeddings += tuple(image_embeddings) if modality == "video": video_embeddings = self._process_video_input( - multimodal_input, cudagraph_dispatcher=cudagraph_dispatcher + multimodal_input, mm_cudagraph_manager=mm_cudagraph_manager ) if self.is_multimodal_pruning_enabled: video_embeddings = self._postprocess_video_embeds_evs( diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index c7c26c206726..94348a77b55a 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -1043,6 +1043,25 @@ def get_dummy_mm_data( ), } + def _calculate_patch_size(self, patches: int) -> tuple[int, int]: + vision_config = self.info.get_hf_config().vision_config + merge_size = vision_config.spatial_merge_size + + assert patches % (merge_size * merge_size) == 0, ( + f"Qwen2-VL: Number of patches ({patches}) must be multiple of " + f"merge_size squared ({merge_size}^2)" + ) + h_patches = merge_size + w_patches = patches // merge_size + return h_patches, w_patches + + def _get_img_feature_dim(self) -> int: + vision_config = self.info.get_hf_config().vision_config + in_channels = vision_config.in_channels + temporal_patch_size = vision_config.temporal_patch_size + patch_size = vision_config.patch_size + return in_channels * temporal_patch_size * patch_size * patch_size + class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor[Qwen2VLProcessingInfo]): def _get_prompt_updates( diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index e87d3701c31a..35d7986d13b6 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py @@ -1015,6 +1015,25 @@ def _get_dummy_videos( video_items.append(video_item) return video_items + def _calculate_patch_size(self, patches: int) -> tuple[int, int]: + vision_config = self.info.get_hf_config().vision_config + merge_size = 
vision_config.spatial_merge_size + + assert patches % (merge_size * merge_size) == 0, ( + f"Qwen3-VL: Number of patches ({patches}) must be multiple of " + f"merge_size squared ({merge_size}^2)" + ) + h_patches = merge_size + w_patches = patches // merge_size + return h_patches, w_patches + + def _get_img_feature_dim(self) -> int: + vision_config = self.info.get_hf_config().vision_config + in_channels = vision_config.in_channels + temporal_patch_size = vision_config.temporal_patch_size + patch_size = vision_config.patch_size + return in_channels * temporal_patch_size * patch_size * patch_size + class Qwen3VLMultiModalProcessor(BaseMultiModalProcessor[Qwen3VLProcessingInfo]): def _call_hf_processor( @@ -1517,7 +1536,7 @@ def _parse_and_validate_video_input( def _process_image_input( self, image_input: Qwen2_5_VLImageInputs, - cudagraph_dispatcher: Any | None = None, + mm_cudagraph_manager: Any | None = None, ) -> tuple[torch.Tensor, ...]: grid_thw = image_input["image_grid_thw"] assert grid_thw.ndim == 2 @@ -1538,7 +1557,7 @@ def _process_image_input( pixel_values, grid_thw_list, rope_type="rope_3d", - cudagraph_dispatcher=cudagraph_dispatcher, + mm_cudagraph_manager=mm_cudagraph_manager, ) else: image_embeds = self.visual(pixel_values, grid_thw=grid_thw_list) @@ -1551,7 +1570,7 @@ def _process_image_input( def _process_video_input( self, video_input: Qwen2_5_VLVideoInputs, - cudagraph_dispatcher: Any | None = None, + mm_cudagraph_manager: Any | None = None, ) -> tuple[torch.Tensor, ...]: grid_thw = video_input["video_grid_thw"] assert grid_thw.ndim == 2 @@ -1573,7 +1592,7 @@ def _process_video_input( pixel_values_videos, grid_thw_list, rope_type="rope_3d", - cudagraph_dispatcher=cudagraph_dispatcher, + mm_cudagraph_manager=mm_cudagraph_manager, ) else: video_embeds = self.visual( @@ -2022,7 +2041,7 @@ def get_mrope_input_positions( return torch.from_numpy(llm_positions), mrope_position_delta def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings | 
None: - cudagraph_dispatcher = kwargs.pop("cudagraph_dispatcher", None) + mm_cudagraph_manager = kwargs.pop("mm_cudagraph_manager", None) mm_input_by_modality = self._parse_and_validate_multimodal_inputs(**kwargs) if not mm_input_by_modality: return None @@ -2037,7 +2056,7 @@ def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings | None: multimodal_input = mm_input_by_modality[modality] if modality == "image": image_embeddings = self._process_image_input( - multimodal_input, cudagraph_dispatcher=cudagraph_dispatcher + multimodal_input, mm_cudagraph_manager=mm_cudagraph_manager ) if self.is_multimodal_pruning_enabled: image_embeddings = self._postprocess_image_embeds_evs( @@ -2046,7 +2065,7 @@ def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings | None: multimodal_embeddings += tuple(image_embeddings) if modality == "video": video_embeddings = self._process_video_input( - multimodal_input, cudagraph_dispatcher=cudagraph_dispatcher + multimodal_input, mm_cudagraph_manager=mm_cudagraph_manager ) if self.is_multimodal_pruning_enabled: video_embeddings = self._postprocess_video_embeds_evs( diff --git a/vllm/model_executor/models/vision.py b/vllm/model_executor/models/vision.py index 67a50b0da054..538f1c98d64e 100644 --- a/vllm/model_executor/models/vision.py +++ b/vllm/model_executor/models/vision.py @@ -25,7 +25,7 @@ from vllm.logger import init_logger from vllm.platforms import current_platform from vllm.v1.attention.backends.registry import AttentionBackendEnum -from vllm.v1.cudagraph_dispatcher import CudagraphDispatcher +from vllm.v1.worker.mm_cudagraph import MMEncoderCudagraphManager logger = init_logger(__name__) @@ -394,7 +394,7 @@ def run_dp_sharded_mrope_vision_model( grid_thw_list: list[list[int]], *, rope_type: Literal["rope_3d", "rope_2d"], - cudagraph_dispatcher: CudagraphDispatcher | None = None, + mm_cudagraph_manager: MMEncoderCudagraphManager | None = None, ) -> tuple[torch.Tensor, ...]: """Run a vision model with data 
parallelism (DP) sharding. The function will shard the input image tensor on the @@ -470,12 +470,10 @@ def run_dp_sharded_mrope_vision_model( embed_dim_reduction_factor = ( vision_model.merge_kernel_size[0] * vision_model.merge_kernel_size[1] ) - merge_size = vision_model.merge_kernel_size[0] else: embed_dim_reduction_factor = ( vision_model.spatial_merge_size * vision_model.spatial_merge_size ) - merge_size = vision_model.spatial_merge_size # Find the max length across all ranks # The output embedding of every DP rank has to be @@ -484,40 +482,24 @@ def run_dp_sharded_mrope_vision_model( max_len_per_rank = max(grouped_pixel_values_len) // embed_dim_reduction_factor local_grid_thw_list = [grid_thw_list[i] for i in image_idxs_local] - vllm_config = get_current_vllm_config() - # Context setup - if cudagraph_dispatcher is not None: - dispatcher = cudagraph_dispatcher - else: - dispatcher = CudagraphDispatcher(vllm_config) + vllm_config = get_current_vllm_config() cudagraph_runtime_mode = CUDAGraphMode.NONE batch_descriptor = None - if ( - vllm_config - and vllm_config.compilation_config.mm_encoder_cudagraph_capture_sizes - ): - current_input_len = pixel_values_local.shape[0] - cudagraph_runtime_mode, batch_descriptor = dispatcher.dispatch( - num_tokens=current_input_len, - uniform_decode=False, - has_lora=False, - disable_full=False, - is_mm_encoder=True, - ) - target_input_len = batch_descriptor.num_tokens - - # Pad pixel_values_local for CUDA graph if needed - if current_input_len < target_input_len: - padding_size = target_input_len - current_input_len - padding = torch.zeros( - (padding_size, pixel_values_local.shape[1]), - device=pixel_values_local.device, - dtype=pixel_values_local.dtype, - ) - pixel_values_local = torch.cat([pixel_values_local, padding], dim=0) - local_grid_thw_list.append([1, merge_size, padding_size // merge_size]) + if mm_cudagraph_manager is not None: + mm_groups: dict[str, torch.Tensor | list] = { + "pixel_values": pixel_values_local, + 
"image_grid_thw": local_grid_thw_list, + } + ( + cudagraph_runtime_mode, + batch_descriptor, + _, + mm_groups, + ) = mm_cudagraph_manager.dispatch_and_pad_mm_input(mm_groups) + pixel_values_local = mm_groups["pixel_values"] + local_grid_thw_list = mm_groups["image_grid_thw"] with set_forward_context( None, diff --git a/vllm/multimodal/processing/dummy_inputs.py b/vllm/multimodal/processing/dummy_inputs.py index b23e2b86cc20..9eb1020db681 100644 --- a/vllm/multimodal/processing/dummy_inputs.py +++ b/vllm/multimodal/processing/dummy_inputs.py @@ -7,6 +7,7 @@ import numpy as np import numpy.typing as npt +import torch from PIL import Image from vllm.config.multimodal import ( @@ -199,3 +200,52 @@ def _get_dummy_videos( height = min(height, overrides.height) video = np.full((num_frames, width, height, 3), 255, dtype=np.uint8) return [video] * num_videos + + @abstractmethod + def _get_img_feature_dim(self) -> int: + """ + Get the image feature dimension for MM encoder CUDA graph capture. + + Returns: + The image feature dimension. + """ + raise NotImplementedError + + @abstractmethod + def _calculate_patch_size(self, patches: int) -> tuple[int, int]: + """ + Calculate the patch grid size (height, width) from the total number of + patches. + """ + raise NotImplementedError + + def get_dummy_mm_encoder_input( + self, + num_patches: int, + ) -> "dict[str, torch.Tensor]": + """ + Get dummy MM encoder input for CUDA graph capture or padding. 
+ + Args: + num_patches: Number of patches (tokens) for the dummy input + + Returns: + dict with pixel_values and image_grid_thw + """ + img_feature_dim = self._get_img_feature_dim() + + dtype = self.info.ctx.model_config.dtype + + h_patches, w_patches = self._calculate_patch_size(num_patches) + + pixel_values = torch.zeros( + (num_patches, img_feature_dim), dtype=dtype, device="cuda" + ) + grid_thw_list = torch.tensor( + [[1, h_patches, w_patches]], dtype=torch.long, device="cpu" + ) + + return { + "pixel_values": pixel_values, + "image_grid_thw": grid_thw_list, + } diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index a8d7d74ea127..36f4e7f5bdcc 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -171,6 +171,7 @@ from vllm.v1.worker.gpu_ubatch_wrapper import UBatchWrapper from vllm.v1.worker.kv_connector_model_runner_mixin import KVConnectorModelRunnerMixin from vllm.v1.worker.lora_model_runner_mixin import LoRAModelRunnerMixin +from vllm.v1.worker.mm_cudagraph import MMEncoderCudagraphManager from vllm.v1.worker.ubatch_utils import ( UBatchSlices, check_ubatch_thresholds, @@ -553,15 +554,6 @@ def __init__( self.cudagraph_batch_sizes = sorted( self.compilation_config.cudagraph_capture_sizes ) - # self.mm_encoder_cudagraph_batch_sizes sorts in ascending order. - self.mm_encoder_cudagraph_batch_sizes: list[int] | None = None - if ( - self.compilation_config.mm_encoder_cudagraph_capture_sizes - and self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE - ): - self.mm_encoder_cudagraph_batch_sizes = sorted( - self.compilation_config.mm_encoder_cudagraph_capture_sizes - ) # Cache the device properties. self._init_device_properties() @@ -660,6 +652,18 @@ def __init__( # Cudagraph dispatcher for runtime cudagraph dispatching. self.cudagraph_dispatcher = CudagraphDispatcher(self.vllm_config) + # MM encoder CUDA graph manager for ViT piecewise CUDA graph. 
+ self.mm_cudagraph_manager: MMEncoderCudagraphManager | None = None + if self.supports_mm_inputs: + processor = self.mm_registry.create_processor(self.model_config) + dummy_inputs_builder = processor.dummy_inputs + self.mm_cudagraph_manager = MMEncoderCudagraphManager( + self.vllm_config, + self.cudagraph_dispatcher, + self.device, + dummy_inputs_builder, + ) + self.mm_budget = ( MultiModalBudget(self.vllm_config, self.mm_registry) if self.supports_mm_inputs @@ -2430,72 +2434,26 @@ def _execute_mm_encoder( # 2. A list or tuple (length: num_items) of tensors, # each of shape (feature_size, hidden_size) in case the feature # size is dynamic depending on the input multimodal items. - is_vit_dp_mode = ( - getattr( - self.model_config.multimodal_config, "mm_encoder_tp_mode", None - ) - == "data" - and self.parallel_config.tensor_parallel_size > 1 - ) + mm_mgr = self.mm_cudagraph_manager + is_vit_dp_mode = mm_mgr.is_vit_dp_mode if mm_mgr else False + if not is_vit_dp_mode: original_num_imgs = -1 - padded_num_tokens = -1 # Default values for non-mm_encoder cudagraph case cudagraph_runtime_mode = CUDAGraphMode.NONE batch_descriptor = None if ( - self.mm_encoder_cudagraph_batch_sizes + mm_mgr is not None + and mm_mgr.enabled and "pixel_values" in mm_kwargs_group ): - pixel_values = cast( - torch.Tensor, mm_kwargs_group["pixel_values"] - ) - num_tokens = pixel_values.shape[0] - - # get batch_descriptor from dispatcher - cudagraph_runtime_mode, batch_descriptor = ( - self.cudagraph_dispatcher.dispatch( - num_tokens=num_tokens, - uniform_decode=False, - has_lora=False, - disable_full=False, - is_mm_encoder=True, - ) - ) - padded_num_tokens = batch_descriptor.num_tokens - - if padded_num_tokens > num_tokens: - padding_amount = padded_num_tokens - num_tokens - padding_tensor = torch.zeros( - (padding_amount, pixel_values.shape[1]), - dtype=pixel_values.dtype, - device=pixel_values.device, - ) - mm_kwargs_group["pixel_values"] = torch.cat( - [pixel_values, padding_tensor], dim=0 
- ) - - # Update image_grid_thw to account for padding - if "image_grid_thw" in mm_kwargs_group: - image_grid_thw = cast( - torch.Tensor, mm_kwargs_group["image_grid_thw"] - ) - original_num_imgs = image_grid_thw.shape[0] - - # Treat padding as a new virtual image. - # Assuming a fixed patch size where height = merge_size - h_patches, w_patches = self._get_dummy_h_w_patches( - padding_amount - ) - padding_grid_info = torch.tensor( - [[1, h_patches, w_patches]], - dtype=image_grid_thw.dtype, - device=image_grid_thw.device, - ) - mm_kwargs_group["image_grid_thw"] = torch.cat( - [image_grid_thw, padding_grid_info], dim=0 - ) + ( + cudagraph_runtime_mode, + batch_descriptor, + original_num_imgs, + mm_kwargs_group, + ) = mm_mgr.dispatch_and_pad_mm_input(mm_kwargs_group) with ( set_forward_context( @@ -2519,9 +2477,7 @@ def _execute_mm_encoder( should_time, mm_lora_refs, current_item_idx, num_items ), ): - mm_kwargs_group["cudagraph_dispatcher"] = ( - self.cudagraph_dispatcher - ) + mm_kwargs_group["mm_cudagraph_manager"] = mm_mgr curr_group_outputs = model.embed_multimodal(**mm_kwargs_group) sanity_check_mm_encoder_outputs( curr_group_outputs, @@ -4666,43 +4622,6 @@ def rand_inputs_embeds() -> torch.Tensor: yield inputs_embeds.fill_(0) - def _get_dummy_h_w_patches(self, patches: int): - vision_config = self.model_config.hf_config.vision_config - if hasattr(vision_config, "spatial_merge_size"): - merge_size = vision_config.spatial_merge_size - elif hasattr(vision_config, "merge_kernel_size"): - merge_size = vision_config.merge_kernel_size[0] - else: - merge_size = 1 - - assert patches % (merge_size * merge_size) == 0, ( - "Number of patches must be multiple of merge_size squared" - ) - h_patches = merge_size - w_patches = patches // merge_size - return h_patches, w_patches - - def _get_dummy_mm_encoder_input( - self, num_image_tokens: int, img_feature_dim: int - ) -> BatchedTensorInputs: - """Dummy data for profiling and precompiling mm_encoder.""" - - # The first 
dimension of pixel_values corresponds - # to the total number of patches. - pixel_values = torch.zeros( - (num_image_tokens, img_feature_dim), dtype=self.dtype, device=self.device - ) - - h_patches, w_patches = self._get_dummy_h_w_patches(num_image_tokens) - image_grid_thw = torch.tensor( - [[1, h_patches, w_patches]], dtype=torch.long, device=self.device - ) - - return { - "pixel_values": pixel_values, - "image_grid_thw": image_grid_thw, - } - def _get_mm_dummy_batch( self, modality: str, @@ -5231,50 +5150,6 @@ def _dummy_pooler_run( max_task = max(output_size.items(), key=lambda x: x[1])[0] return self._dummy_pooler_run_task(hidden_states, max_task) - @torch.inference_mode() - def _dummy_mm_encoder_run( - self, - compilation_cases: list[int], - ) -> None: - self.vllm_config.in_mm_encoder_tracing = True - tmp_dummy_mm_inputs = self._get_mm_dummy_batch( - "video", - 1, - ) - img_feature_dim = cast( - torch.Tensor, tmp_dummy_mm_inputs["pixel_values_videos"] - ).shape[1] - - if is_global_first_rank(): - compilation_cases = tqdm( - compilation_cases, - disable=not self.load_config.use_tqdm_on_load, - desc="Capturing mm_encoder CUDA graphs (PIECEWISE)", - ) - - for capture_size in compilation_cases: - dummy_mm_inputs = self._get_dummy_mm_encoder_input( - capture_size, img_feature_dim - ) - cudagraph_mode, batch_descriptor = self.cudagraph_dispatcher.dispatch( - num_tokens=capture_size, - uniform_decode=False, - has_lora=False, - disable_full=False, - is_mm_encoder=True, - ) - with ( - set_forward_context( - None, - self.vllm_config, - num_tokens=capture_size, - cudagraph_runtime_mode=cudagraph_mode, - batch_descriptor=batch_descriptor, - ), - ): - self.model.embed_multimodal(**dummy_mm_inputs) - self.vllm_config.in_mm_encoder_tracing = False - def profile_run(self) -> None: self.vllm_config.in_mm_encoder_tracing = True # Profile with multimodal encoder & encoder cache. 
@@ -5386,6 +5261,14 @@ def freeze_gc(): batch_descriptors=batch_descs, cudagraph_runtime_mode=runtime_mode, ) + # Capture MM encoder CUDA graphs if enabled + if self.mm_cudagraph_manager is not None: + for runtime_mode, _ in self.cudagraph_dispatcher.get_capture_descs( + is_mm_encoder=True + ): + self.mm_cudagraph_manager.capture( + model=self.model, cudagraph_mode=runtime_mode + ) torch.cuda.synchronize() end_free_gpu_memory = torch.cuda.mem_get_info()[0] @@ -5489,15 +5372,6 @@ def _capture_cudagraphs( num_active_loras=num_active_loras, is_graph_capturing=True, ) - if ( - cudagraph_runtime_mode == CUDAGraphMode.PIECEWISE - and self.supports_mm_inputs - ): - mm_encoder_capture_sizes = self.mm_encoder_cudagraph_batch_sizes - if mm_encoder_capture_sizes: - compilation_cases_mm_encoder = list(reversed(mm_encoder_capture_sizes)) - self._dummy_mm_encoder_run(compilation_cases_mm_encoder) - self.maybe_remove_all_loras(self.lora_config) def initialize_attn_backend(self, kv_cache_config: KVCacheConfig) -> None: diff --git a/vllm/v1/worker/mm_cudagraph.py b/vllm/v1/worker/mm_cudagraph.py new file mode 100644 index 000000000000..fd572c40c46c --- /dev/null +++ b/vllm/v1/worker/mm_cudagraph.py @@ -0,0 +1,173 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from typing import Any, cast + +import torch +import torch.nn as nn +from tqdm import tqdm + +from vllm.config import CUDAGraphMode, VllmConfig +from vllm.distributed.parallel_state import is_global_first_rank +from vllm.forward_context import ( + BatchDescriptor, + set_forward_context, +) +from vllm.logger import init_logger +from vllm.multimodal import BatchedTensorInputs +from vllm.multimodal.processing import BaseDummyInputsBuilder +from vllm.v1.cudagraph_dispatcher import CudagraphDispatcher + +logger = init_logger(__name__) + + +class MMEncoderCudagraphManager: + def __init__( + self, + vllm_config: VllmConfig, + cudagraph_dispatcher: CudagraphDispatcher, 
+ device: torch.device, + dummy_input_builder: BaseDummyInputsBuilder[Any], + ): + self.vllm_config = vllm_config + self.dispatcher = cudagraph_dispatcher + self.device = device + self.dummy_input_builder = dummy_input_builder + + compilation_config = vllm_config.compilation_config + self.capture_sizes: list[int] = [] + if compilation_config and compilation_config.mm_encoder_cudagraph_capture_sizes: + self.capture_sizes = sorted( + compilation_config.mm_encoder_cudagraph_capture_sizes + ) + + self.enabled = bool( + self.capture_sizes + and compilation_config + and compilation_config.cudagraph_mode != CUDAGraphMode.NONE + ) + + # Check if using data parallel mode for ViT + self.is_vit_dp_mode = self._check_vit_dp_mode(vllm_config) + + def _check_vit_dp_mode(self, vllm_config: VllmConfig) -> bool: + """Check if ViT is running in data parallel mode.""" + mm_config = getattr(vllm_config.model_config, "multimodal_config", None) + if mm_config is None: + return False + + mm_encoder_tp_mode = mm_config.mm_encoder_tp_mode + tp_size = vllm_config.parallel_config.tensor_parallel_size + + return mm_encoder_tp_mode == "data" and tp_size > 1 + + def dispatch_and_pad_mm_input( + self, + mm_kwargs_group: BatchedTensorInputs, + ) -> tuple[CUDAGraphMode, BatchDescriptor | None, int, BatchedTensorInputs]: + pixel_values = cast(torch.Tensor, mm_kwargs_group["pixel_values"]) + num_tokens = pixel_values.shape[0] + + image_grid_thw = mm_kwargs_group["image_grid_thw"] + if isinstance(image_grid_thw, torch.Tensor): + original_num_imgs = image_grid_thw.shape[0] + else: + original_num_imgs = len(image_grid_thw) + + if not self.enabled: + return ( + CUDAGraphMode.NONE, + BatchDescriptor(num_tokens, is_mm_encoder=True), + original_num_imgs, + mm_kwargs_group, + ) + + # Dispatch to get the target padded size + cudagraph_runtime_mode, batch_descriptor = self.dispatcher.dispatch( + num_tokens=num_tokens, + is_mm_encoder=True, + ) + target_num_tokens = batch_descriptor.num_tokens + + # Pad if 
necessary + if target_num_tokens > num_tokens: + # Pad pixel_values + padding_size = target_num_tokens - num_tokens + padding_mm_inputs = self.dummy_input_builder.get_dummy_mm_encoder_input( + padding_size, + ) + + mm_kwargs_group["pixel_values"] = torch.cat( + [pixel_values, padding_mm_inputs["pixel_values"]], dim=0 + ) + + padding_image_grid_thw = padding_mm_inputs["image_grid_thw"] + if isinstance(image_grid_thw, torch.Tensor): + mm_kwargs_group["image_grid_thw"] = torch.cat( + [image_grid_thw, padding_image_grid_thw], dim=0 + ) + else: + mm_kwargs_group["image_grid_thw"] = ( + image_grid_thw + padding_image_grid_thw.tolist() + ) + + return ( + cudagraph_runtime_mode, + batch_descriptor, + original_num_imgs, + mm_kwargs_group, + ) + + def capture_graph( + self, + num_tokens: int, + model: nn.Module, + cudagraph_mode: CUDAGraphMode, + ) -> None: + dummy_mm_inputs = self.dummy_input_builder.get_dummy_mm_encoder_input( + num_tokens + ) + + batch_descriptor = BatchDescriptor( + num_tokens=num_tokens, + is_mm_encoder=True, + ) + + with set_forward_context( + None, + self.vllm_config, + num_tokens=num_tokens, + cudagraph_runtime_mode=cudagraph_mode, + batch_descriptor=batch_descriptor, + ): + model.embed_multimodal(**dummy_mm_inputs) + + @torch.inference_mode() + def capture( + self, + model: nn.Module, + cudagraph_mode: CUDAGraphMode, + ) -> None: + if not self.enabled or not self.capture_sizes: + return + + self.vllm_config.in_mm_encoder_tracing = True + + capture_sizes_desc = list(reversed(self.capture_sizes)) + + if is_global_first_rank(): + capture_sizes_iter: Any = tqdm( + capture_sizes_desc, + disable=not self.vllm_config.load_config.use_tqdm_on_load, + desc="Capturing MM_Encoder CUDA graphs (PIECEWISE)", + ) + else: + capture_sizes_iter = capture_sizes_desc + + for capture_size in capture_sizes_iter: + self.capture_graph( + capture_size, + model=model, + cudagraph_mode=cudagraph_mode, + ) + + self.vllm_config.in_mm_encoder_tracing = False From 
07316832308f02f7e10df42977ada75a72456c6d Mon Sep 17 00:00:00 2001 From: Hongjian Zhang Date: Tue, 3 Feb 2026 15:39:12 +0800 Subject: [PATCH 31/35] simplify cuda graph conditional judgments Signed-off-by: Hongjian Zhang --- vllm/model_executor/models/qwen2_5_vl.py | 56 +++++++++++------------ vllm/model_executor/models/qwen3_vl.py | 57 ++++++++++++------------ vllm/v1/worker/gpu_model_runner.py | 8 +--- 3 files changed, 57 insertions(+), 64 deletions(-) diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index 270346801437..d6a352392e9c 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -662,16 +662,17 @@ def __init__( device=self.device, dtype=self.dtype, ) - self._persistent_rotary_pos_emb_cos_buffer = torch.empty( - (max_compile_size, head_dim // 2), - device=self.device, - dtype=torch.bfloat16, - ) - self._persistent_rotary_pos_emb_sin_buffer = torch.empty( - (max_compile_size, head_dim // 2), - device=self.device, - dtype=torch.bfloat16, - ) + ( + self._persistent_rotary_pos_emb_cos_buffer, + self._persistent_rotary_pos_emb_sin_buffer, + ) = [ + torch.empty( + (max_compile_size, head_dim // 2), + device=self.device, + dtype=torch.bfloat16, + ) + for _ in range(2) + ] @property def dtype(self) -> torch.dtype: @@ -803,6 +804,17 @@ def invert_permutation(perm: torch.Tensor) -> torch.Tensor: inv[perm] = torch.arange(perm.numel(), device=perm.device, dtype=perm.dtype) return inv + def _use_piecewise_cudagraph(self) -> bool: + if self._persistent_hidden_states_buffer is None: + return False + if not is_forward_context_available(): + return False + fwd_ctx = get_forward_context() + return ( + fwd_ctx is not None + and fwd_ctx.cudagraph_runtime_mode == CUDAGraphMode.PIECEWISE + ) + def forward( self, x: torch.Tensor, @@ -816,14 +828,9 @@ def forward( cu_window_seqlens: list = [torch.tensor([0], dtype=torch.int32)] cu_seqlens: list = [] - fwd_ctx = None - if 
is_forward_context_available(): - fwd_ctx = get_forward_context() - if ( - self._persistent_hidden_states_buffer is not None - and fwd_ctx - and fwd_ctx.cudagraph_runtime_mode == CUDAGraphMode.PIECEWISE - ): + is_cudagraph_mode = self._use_piecewise_cudagraph() + + if is_cudagraph_mode: hidden_states = self._persistent_hidden_states_buffer[:seq_len] hidden_states.copy_(x, non_blocking=True) else: @@ -886,12 +893,7 @@ def forward( rotary_pos_emb_sin = rotary_pos_emb_sin.to( device=self.device, non_blocking=True ) - if ( - self._persistent_rotary_pos_emb_sin_buffer is not None - and self._persistent_rotary_pos_emb_cos_buffer is not None - and fwd_ctx - and fwd_ctx.cudagraph_runtime_mode == CUDAGraphMode.PIECEWISE - ): + if is_cudagraph_mode: rotary_pos_emb_sin = self._persistent_rotary_pos_emb_sin_buffer[ :seq_len ].copy_(rotary_pos_emb_sin) @@ -911,11 +913,7 @@ def forward( hidden_states = hidden_states.reshape(seq_len, -1) hidden_states = hidden_states.unsqueeze(1) - if ( - self._persistent_hidden_states_buffer is not None - and fwd_ctx - and fwd_ctx.cudagraph_runtime_mode == CUDAGraphMode.PIECEWISE - ): + if is_cudagraph_mode: # The above operations will produce temporary new tensors. 
# That is not friendly to cudagraphs, # so we need to copy them back to the persistent buffer diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index 35d7986d13b6..9e6001f474b0 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py @@ -448,16 +448,17 @@ def __init__( device=self.device, dtype=self.dtype, ) - self._persistent_rotary_pos_emb_cos_buffer = torch.empty( - (max_compile_size, head_dim // 2), - device=self.device, - dtype=torch.bfloat16, - ) - self._persistent_rotary_pos_emb_sin_buffer = torch.empty( - (max_compile_size, head_dim // 2), - device=self.device, - dtype=torch.bfloat16, - ) + ( + self._persistent_rotary_pos_emb_cos_buffer, + self._persistent_rotary_pos_emb_sin_buffer, + ) = [ + torch.empty( + (max_compile_size, head_dim // 2), + device=self.device, + dtype=torch.bfloat16, + ) + for _ in range(2) + ] @property def dtype(self) -> torch.dtype: @@ -584,20 +585,26 @@ def compute_attn_mask_seqlen( max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max() return max_seqlen + def _use_piecewise_cudagraph(self) -> bool: + if self._persistent_hidden_states_buffer is None: + return False + if not is_forward_context_available(): + return False + fwd_ctx = get_forward_context() + return ( + fwd_ctx is not None + and fwd_ctx.cudagraph_runtime_mode == CUDAGraphMode.PIECEWISE + ) + def forward( self, x: torch.Tensor, grid_thw: torch.Tensor | list[list[int]], ) -> torch.Tensor: seq_len, _ = x.size() - fwd_ctx = None - if is_forward_context_available(): - fwd_ctx = get_forward_context() - if ( - self._persistent_hidden_states_buffer is not None - and fwd_ctx - and fwd_ctx.cudagraph_runtime_mode == CUDAGraphMode.PIECEWISE - ): + is_cudagraph_mode = self._use_piecewise_cudagraph() + + if is_cudagraph_mode: hidden_states = self._persistent_hidden_states_buffer[:seq_len] hidden_states.copy_(x, non_blocking=True) else: @@ -622,12 +629,8 @@ def forward( original_hidden_states = 
hidden_states hidden_states = hidden_states + pos_embeds rotary_pos_emb_cos, rotary_pos_emb_sin = self.rot_pos_emb(grid_thw_list) - if ( - self._persistent_rotary_pos_emb_sin_buffer is not None - and self._persistent_rotary_pos_emb_cos_buffer is not None - and fwd_ctx - and fwd_ctx.cudagraph_runtime_mode == CUDAGraphMode.PIECEWISE - ): + + if is_cudagraph_mode: rotary_pos_emb_sin = self._persistent_rotary_pos_emb_sin_buffer[ :seq_len ].copy_(rotary_pos_emb_sin) @@ -645,11 +648,7 @@ def forward( max_seqlen = self.compute_attn_mask_seqlen(cu_seqlens) cu_seqlens = cu_seqlens.to(self.device, non_blocking=True) - if ( - self._persistent_hidden_states_buffer is not None - and fwd_ctx - and fwd_ctx.cudagraph_runtime_mode == CUDAGraphMode.PIECEWISE - ): + if is_cudagraph_mode: # The above operations will produce temporary new tensors. # That is not friendly to cudagraphs, # so we need to copy them back to the persistent buffer diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 36f4e7f5bdcc..f1b1438c81b6 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -29,7 +29,6 @@ CUDAGraphMode, VllmConfig, get_layers_from_vllm_config, - set_current_vllm_config, update_config, ) from vllm.distributed.ec_transfer import get_ec_transfer, has_ec_transfer @@ -2471,11 +2470,8 @@ def _execute_mm_encoder( if original_num_imgs != -1: curr_group_outputs = curr_group_outputs[:original_num_imgs] else: - with ( - set_current_vllm_config(self.vllm_config), - self.timed_encoder_operation( - should_time, mm_lora_refs, current_item_idx, num_items - ), + with self.timed_encoder_operation( + should_time, mm_lora_refs, current_item_idx, num_items ): mm_kwargs_group["mm_cudagraph_manager"] = mm_mgr curr_group_outputs = model.embed_multimodal(**mm_kwargs_group) From 53814ecac0d451d8f04747f1c4dafb86e5c89925 Mon Sep 17 00:00:00 2001 From: Hongjian Zhang Date: Tue, 3 Feb 2026 16:54:22 +0800 Subject: [PATCH 32/35] rebase 
Signed-off-by: Hongjian Zhang --- vllm/config/vllm.py | 16 ++++------------ vllm/v1/cudagraph_dispatcher.py | 10 +++++++--- 2 files changed, 11 insertions(+), 15 deletions(-) diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py index 8a05b2533089..137ab56b65b9 100644 --- a/vllm/config/vllm.py +++ b/vllm/config/vllm.py @@ -365,13 +365,6 @@ def compute_hash(self) -> str: ] return hash_str - def pad_for_cudagraph(self, batch_size: int) -> int: - # if batch_size > self.compilation_config.max_cudagraph_capture_size, - # it should raise an IndexError. - # the caller should make sure the batch_size is within the range, - # i.e., batch_size <= self.compilation_config.max_cudagraph_capture_size - return self.compilation_config.bs_to_padded_graph_size[batch_size] - @property def needs_dp_coordinator(self) -> bool: """ @@ -1397,12 +1390,11 @@ def _set_mm_encoder_cudagraph_sizes(self): ) if max_mm_encoder_cudagraph_capture_size is None: from vllm.multimodal import MULTIMODAL_REGISTRY - from vllm.v1.core.encoder_cache_manager import compute_encoder_budget + from vllm.multimodal.budget import MultiModalBudget - encoder_compute_budget, _ = compute_encoder_budget( - model_config=self.model_config, - scheduler_config=self.scheduler_config, - mm_registry=MULTIMODAL_REGISTRY, + mm_budget = MultiModalBudget(self, MULTIMODAL_REGISTRY) + encoder_compute_budget = ( + mm_budget.encoder_compute_budget if mm_budget else 0 ) max_mm_encoder_cudagraph_capture_size = min( encoder_compute_budget, 8192 diff --git a/vllm/v1/cudagraph_dispatcher.py b/vllm/v1/cudagraph_dispatcher.py index 1dfe1d07f1d4..af9c90f3b016 100644 --- a/vllm/v1/cudagraph_dispatcher.py +++ b/vllm/v1/cudagraph_dispatcher.py @@ -307,9 +307,13 @@ def dispatch( # so we must use max_loras + 1 for dispatch to find a matching graph. 
effective_num_active_loras = self.vllm_config.lora_config.max_loras + 1 - batch_desc = self._create_padded_batch_descriptor( - num_tokens, uniform_decode, has_lora, effective_num_active_loras, is_mm_encoder - ) + batch_desc = self._create_padded_batch_descriptor( + num_tokens, + uniform_decode, + has_lora, + effective_num_active_loras, + is_mm_encoder, + ) relaxed_batch_desc = batch_desc.relax_for_mixed_batch_cudagraphs() if not disable_full: From ae2e8e62cea0f6c030f92234cb863acdc48658cb Mon Sep 17 00:00:00 2001 From: Hongjian Zhang Date: Wed, 4 Feb 2026 17:16:49 +0800 Subject: [PATCH 33/35] add a dedicated dispatcher for mm encoder Signed-off-by: Hongjian Zhang --- .../piecewise/test_qwenvl_vit_cudagraph.py | 161 +++++++++--------- vllm/forward_context.py | 5 - vllm/model_executor/models/vision.py | 20 +-- vllm/v1/cudagraph_dispatcher.py | 121 ++++--------- vllm/v1/worker/gpu_model_runner.py | 23 +-- vllm/v1/worker/mm_cudagraph.py | 13 +- 6 files changed, 140 insertions(+), 203 deletions(-) diff --git a/tests/compile/piecewise/test_qwenvl_vit_cudagraph.py b/tests/compile/piecewise/test_qwenvl_vit_cudagraph.py index ec3176885409..82cb10394720 100644 --- a/tests/compile/piecewise/test_qwenvl_vit_cudagraph.py +++ b/tests/compile/piecewise/test_qwenvl_vit_cudagraph.py @@ -11,55 +11,9 @@ from vllm.config import CompilationConfig, CUDAGraphMode from vllm.distributed import cleanup_dist_env_and_memory from vllm.forward_context import set_forward_context -from vllm.v1.cudagraph_dispatcher import CudagraphDispatcher +from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.v1.executor.multiproc_executor import MultiprocExecutor - - -def _worker_embed_multimodal( - worker, vllm_config, cudagraph_runtime_mode, batch_descriptor, multi_modal_data -): - """Helper function to run multimodal embedding on a worker. - This function sets up the necessary forward context for tensor-parallel (TP) - execution and then calls the model's `embed_multimodal` method. 
- Note: For data-parallel (DP) mode, the forward context is typically - created and managed within the - vision.py:run_dp_sharded_mrope_vision_model(), which would override the - context set here. - Args: - worker: The worker instance containing the model runner. - vllm_config: The vLLM engine configuration. - cudagraph_runtime_mode: The runtime mode for CUDA graph execution. - batch_descriptor: An object describing the current batch. - multi_modal_data: A dictionary of keyword arguments to be passed to - the model's `embed_multimodal` method. - Returns: - The output from the model's `embed_multimodal` method. - """ - - # Access model via worker.model_runner.model - # Note: Accessing internal attributes. Assuming V1 worker structure. - model = worker.model_runner.model - - # Move multi_modal_data to the model's device - target_device = next(model.parameters()).device - multi_modal_data = { - k: v.to(target_device) if isinstance(v, torch.Tensor) else v - for k, v in multi_modal_data.items() - } - - with ( - set_forward_context( - None, - vllm_config=vllm_config, - cudagraph_runtime_mode=cudagraph_runtime_mode, - batch_descriptor=batch_descriptor, - ), - torch.inference_mode(), - ): - ans = model.embed_multimodal(**multi_modal_data) - torch.cuda.synchronize() - return ans - +from vllm.v1.worker.mm_cudagraph import MMEncoderCudagraphManager # Format: (model_name, tp_size, mm_encoder_tp_mode) TEST_CONFIGS = [ @@ -123,62 +77,103 @@ def llm(request): cleanup_dist_env_and_memory() -class TestQwenVLCUDAGraph: - def _run_embed_multimodal( - self, llm, multi_modal_data, num_patches, force_eager=False +def _worker_embed_multimodal( + worker, vllm_config, multi_modal_data, enforce_eager=False +): + """Helper function to run multimodal embedding on a worker. + This function sets up the necessary forward context for tensor-parallel (TP) + execution and then calls the model's `embed_multimodal` method. 
+ Note: For data-parallel (DP) mode, the forward context is typically + created and managed within the + vision.py:run_dp_sharded_mrope_vision_model(), which would override the + context set here. + This method manually constructs a MMEncoderCudagraphManager because accessing the + one within the GPU model runner is difficult. + Args: + worker: The worker instance containing the model runner. + vllm_config: The vLLM engine configuration. + multi_modal_data: A dictionary of keyword arguments to be passed to + the model's `embed_multimodal` method. + enforce_eager: If True, forces the execution to run in eager mode + Returns: + The output from the model's `embed_multimodal` method. + """ + + # Access model via worker.model_runner.model + # Note: Accessing internal attributes. Assuming V1 worker structure. + model = worker.model_runner.model + + # Move multi_modal_data to the model's device + target_device = next(model.parameters()).device + multi_modal_data = { + k: v.to(target_device) if isinstance(v, torch.Tensor) else v + for k, v in multi_modal_data.items() + } + + processor = MULTIMODAL_REGISTRY.create_processor(vllm_config.model_config) + dummy_inputs_builder = processor.dummy_inputs + mm_cudagraph_manager = MMEncoderCudagraphManager( + vllm_config, + dummy_inputs_builder, + ) + mm_cudagraph_manager.dispatcher.initialize_cudagraph_keys( + CUDAGraphMode.PIECEWISE, + ) + + # Dispatch to get runtime mode and batch descriptor + ( + cudagraph_runtime_mode, + batch_descriptor, + _, + multi_modal_data, + ) = mm_cudagraph_manager.dispatch_and_pad_mm_input(multi_modal_data) + if enforce_eager: + cudagraph_runtime_mode = CUDAGraphMode.NONE + else: + multi_modal_data["mm_cudagraph_manager"] = mm_cudagraph_manager + + with ( + set_forward_context( + None, + vllm_config=vllm_config, + cudagraph_runtime_mode=cudagraph_runtime_mode, + batch_descriptor=batch_descriptor, + ), + torch.inference_mode(), ): + ans = model.embed_multimodal(**multi_modal_data) + 
torch.cuda.synchronize() + return ans + + +class TestQwenVLCUDAGraph: + def _run_embed_multimodal(self, llm, multi_modal_data, enforce_eager=False): """Runs the multimodal embedding process, potentially with CUDA graphs. - This method manually constructs a CudagraphDispatcher because accessing the - one within the GPU model runner is difficult. It then dispatches based on - the number of image patches to determine the appropriate CUDA graph or - eager mode for execution. The actual embedding is performed on the - worker(s) via an RPC call. + The actual embedding is performed on the worker(s) via an RPC call. Args: llm: The LLM object containing the model engine and configuration. multi_modal_data: A dictionary containing the multimodal data to be processed. - num_patches: The number of image patches, used to determine the - number of tokens for the dispatcher. - force_eager: If True, forces the execution to run in eager mode, + enforce_eager: If True, forces the execution to run in eager mode, bypassing CUDA graphs. Returns: The outputs from the multimodal embedding process executed on the worker. 
""" vllm_config = llm.llm_engine.vllm_config - - dispatcher = CudagraphDispatcher(vllm_config) - dispatcher.initialize_cudagraph_keys( - cudagraph_mode=vllm_config.compilation_config.cudagraph_mode, - uniform_decode_query_len=1, - ) - - # Dispatch to get runtime mode and batch descriptor - cudagraph_runtime_mode, batch_descriptor = dispatcher.dispatch( - num_tokens=num_patches, - uniform_decode=False, - has_lora=False, - is_mm_encoder=True, - ) - model_executor = llm.llm_engine.model_executor rpc_kwargs = {} # Use collective_rpc to execute on driver worker (rank 0) if isinstance(model_executor, MultiprocExecutor): rpc_kwargs["unique_reply_rank"] = 0 - # If force_eager is True, override the runtime mode to NONE - if force_eager: - cudagraph_runtime_mode = CUDAGraphMode.NONE - else: - multi_modal_data["cudagraph_dispatcher"] = dispatcher + outputs = model_executor.collective_rpc( partial( _worker_embed_multimodal, vllm_config=vllm_config, - cudagraph_runtime_mode=cudagraph_runtime_mode, - batch_descriptor=batch_descriptor, multi_modal_data=multi_modal_data, + enforce_eager=enforce_eager, ), **rpc_kwargs, ) @@ -216,12 +211,12 @@ def test_vit_cudagraph_consistency(self, llm): # Run with Piecewise CUDA Graph piecewise_outputs = self._run_embed_multimodal( - llm, multi_modal_data, num_patches * num_imgs, force_eager=False + llm, multi_modal_data, enforce_eager=False ) # Run with Eager Mode (simulated by setting runtime mode to NONE) eager_outputs = self._run_embed_multimodal( - llm, multi_modal_data, num_patches * num_imgs, force_eager=True + llm, multi_modal_data, enforce_eager=True ) if isinstance(piecewise_outputs, torch.Tensor): diff --git a/vllm/forward_context.py b/vllm/forward_context.py index be08e2d9a6bc..7d5c48a2e506 100644 --- a/vllm/forward_context.py +++ b/vllm/forward_context.py @@ -55,10 +55,6 @@ class BatchDescriptor(NamedTuple): (like fused_moe_lora) whose grid size depends on num_active_loras to be properly captured. 
""" - is_mm_encoder: bool = False - """ - mm_encoder Piecewise CUDA Graph Flag - """ def relax_for_mixed_batch_cudagraphs(self) -> "BatchDescriptor": """ @@ -71,7 +67,6 @@ def relax_for_mixed_batch_cudagraphs(self) -> "BatchDescriptor": uniform=False, has_lora=self.has_lora, num_active_loras=self.num_active_loras, - is_mm_encoder=self.is_mm_encoder, ) diff --git a/vllm/model_executor/models/vision.py b/vllm/model_executor/models/vision.py index 538f1c98d64e..adcfd7d3b370 100644 --- a/vllm/model_executor/models/vision.py +++ b/vllm/model_executor/models/vision.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import contextlib import itertools import math from abc import ABC, abstractmethod @@ -11,7 +12,6 @@ from transformers import PretrainedConfig from vllm.config import ( - CUDAGraphMode, MultiModalConfig, VllmConfig, get_current_vllm_config, @@ -483,9 +483,7 @@ def run_dp_sharded_mrope_vision_model( local_grid_thw_list = [grid_thw_list[i] for i in image_idxs_local] # Context setup - vllm_config = get_current_vllm_config() - cudagraph_runtime_mode = CUDAGraphMode.NONE - batch_descriptor = None + ctx = contextlib.nullcontext() if mm_cudagraph_manager is not None: mm_groups: dict[str, torch.Tensor | list] = { @@ -501,12 +499,14 @@ def run_dp_sharded_mrope_vision_model( pixel_values_local = mm_groups["pixel_values"] local_grid_thw_list = mm_groups["image_grid_thw"] - with set_forward_context( - None, - vllm_config=vllm_config, - cudagraph_runtime_mode=cudagraph_runtime_mode, - batch_descriptor=batch_descriptor, - ): + ctx = set_forward_context( + None, + vllm_config=mm_cudagraph_manager.vllm_config, + cudagraph_runtime_mode=cudagraph_runtime_mode, + batch_descriptor=batch_descriptor, + ) + + with ctx: # Run the vision model on the local pixel_values_local if rope_type == "rope_2d": if pixel_values_local.shape[0] > 0: diff --git a/vllm/v1/cudagraph_dispatcher.py 
b/vllm/v1/cudagraph_dispatcher.py index af9c90f3b016..2716fce64ef5 100644 --- a/vllm/v1/cudagraph_dispatcher.py +++ b/vllm/v1/cudagraph_dispatcher.py @@ -29,9 +29,20 @@ class CudagraphDispatcher: runnable without cudagraph (if the mode does not match or mode is NONE). """ - def __init__(self, vllm_config: VllmConfig): + def __init__(self, vllm_config: VllmConfig, is_mm_encoder: bool = False): self.vllm_config = vllm_config self.compilation_config = vllm_config.compilation_config + self.is_mm_encoder = is_mm_encoder + self.max_capture_size = ( + self.compilation_config.max_cudagraph_capture_size + if not is_mm_encoder + else self.compilation_config.max_mm_encoder_cudagraph_capture_size + ) + self.capture_sizes = ( + self.compilation_config.cudagraph_capture_sizes + if not is_mm_encoder + else self.compilation_config.mm_encoder_cudagraph_capture_sizes + ) self.uniform_decode_query_len = ( 1 if not self.vllm_config.speculative_config @@ -68,11 +79,16 @@ def __init__(self, vllm_config: VllmConfig): def _compute_bs_to_padded_graph_size(self) -> None: """Pre-compute the mapping from batch size to padded graph size.""" - max_capture_size = self.compilation_config.max_cudagraph_capture_size - capture_sizes = self.compilation_config.cudagraph_capture_sizes - self._bs_to_padded_graph_size: list[int] = self._get_padded_size_map( - capture_sizes, max_capture_size - ) + self._bs_to_padded_graph_size: list[int] = [0] * (self.max_capture_size + 1) + for end, start in zip( + self.capture_sizes + [self.max_capture_size + 1], + [0] + self.capture_sizes, + ): + for bs in range(start, end): + if bs == start: + self._bs_to_padded_graph_size[bs] = start + else: + self._bs_to_padded_graph_size[bs] = end # Validate that compile_sizes won't be changed by padding. # Only validate when cudagraphs are actually being used. 
@@ -81,7 +97,7 @@ def _compute_bs_to_padded_graph_size(self) -> None: and self.cudagraph_mode != CUDAGraphMode.NONE ): for size in self.compilation_config.compile_sizes: - if size <= max_capture_size: + if size <= self.max_capture_size: padded = self._bs_to_padded_graph_size[size] if padded != size: raise ValueError( @@ -112,48 +128,16 @@ def _get_lora_cases(self) -> list[int]: # No specialization: only capture graphs with LoRA active return [lora_config.max_loras + 1] - def _compute_bs_to_padded_mm_encoder_graph_size(self) -> None: - """pre-compute the mapping from batch size to mm_encoder padded graph size.""" - max_capture_size = self.compilation_config.max_mm_encoder_cudagraph_capture_size - capture_sizes = self.compilation_config.mm_encoder_cudagraph_capture_sizes - - self._bs_to_padded_mm_encoder_graph_size: list[int] = self._get_padded_size_map( - capture_sizes, max_capture_size - ) - - def _get_padded_size_map( - self, capture_sizes: list[int] | None, max_size: int | None - ) -> list[int]: - if capture_sizes is None: - capture_sizes = [] - if max_size is None: - max_size = 0 - padded_size_map: list[int] = [0] * (max_size + 1) - for end, start in zip( - capture_sizes + [max_size + 1], - [0] + capture_sizes, - ): - for bs in range(start, end): - if bs == start: - padded_size_map[bs] = start - else: - padded_size_map[bs] = end - return padded_size_map - def _create_padded_batch_descriptor( self, num_tokens: int, uniform_decode: bool, has_lora: bool, num_active_loras: int = 0, - is_mm_encoder: bool = False, ) -> BatchDescriptor: max_num_seqs = self.vllm_config.scheduler_config.max_num_seqs uniform_decode_query_len = self.uniform_decode_query_len - if is_mm_encoder: - num_tokens_padded = self._bs_to_padded_mm_encoder_graph_size[num_tokens] - else: - num_tokens_padded = self._bs_to_padded_graph_size[num_tokens] + num_tokens_padded = self._bs_to_padded_graph_size[num_tokens] if uniform_decode and self.cudagraph_mode.has_mode(CUDAGraphMode.FULL): num_reqs = 
num_tokens_padded // uniform_decode_query_len @@ -168,7 +152,6 @@ def _create_padded_batch_descriptor( uniform=uniform_decode, has_lora=has_lora, num_active_loras=num_active_loras, - is_mm_encoder=is_mm_encoder, ) def add_cudagraph_key( @@ -192,10 +175,9 @@ def initialize_cudagraph_keys( return self._compute_bs_to_padded_graph_size() - self._compute_bs_to_padded_mm_encoder_graph_size() # Get LoRA cases to capture - lora_cases = self._get_lora_cases() + lora_cases = self._get_lora_cases() if not self.is_mm_encoder else [0] self.captured_lora_counts = [ lora_count for lora_count in lora_cases if lora_count ] @@ -204,23 +186,13 @@ def initialize_cudagraph_keys( # guarantee all keys would be used. For example, if we allow lazy # capturing in future PR, some keys may never be triggered. if cudagraph_mode.mixed_mode() != CUDAGraphMode.NONE: - for bs, num_active_loras in product( - self.compilation_config.cudagraph_capture_sizes, lora_cases - ): + for bs, num_active_loras in product(self.capture_sizes, lora_cases): self.add_cudagraph_key( cudagraph_mode.mixed_mode(), self._create_padded_batch_descriptor( bs, False, num_active_loras > 0, num_active_loras ).relax_for_mixed_batch_cudagraphs(), ) - # mm_encoder CUDAGraph Entry - for patch_len in self.compilation_config.mm_encoder_cudagraph_capture_sizes: - self.add_cudagraph_key( - CUDAGraphMode.PIECEWISE, - self._create_padded_batch_descriptor( - patch_len, False, False, is_mm_encoder=True - ).relax_for_mixed_batch_cudagraphs(), - ) # if decode cudagraph mode is FULL, and we don't already have mixed # mode full cudagraphs then add them here. 
@@ -234,7 +206,7 @@ def initialize_cudagraph_keys( ) cudagraph_capture_sizes_for_decode = [ x - for x in self.compilation_config.cudagraph_capture_sizes + for x in self.capture_sizes if x <= max_num_tokens and x >= uniform_decode_query_len ] for bs, num_active_loras in product( @@ -256,7 +228,6 @@ def dispatch( has_lora: bool = False, disable_full: bool = False, num_active_loras: int = 0, - is_mm_encoder: bool = False, ) -> tuple[CUDAGraphMode, BatchDescriptor]: """ Given conditions(e.g.,batch descriptor and if using piecewise only), @@ -277,19 +248,9 @@ def dispatch( if ( not self.keys_initialized or self.cudagraph_mode == CUDAGraphMode.NONE - or ( - not is_mm_encoder - and num_tokens > self.compilation_config.max_cudagraph_capture_size - ) - or ( - is_mm_encoder - and num_tokens - > self.compilation_config.max_mm_encoder_cudagraph_capture_size - ) + or num_tokens > self.max_capture_size ): - return CUDAGraphMode.NONE, BatchDescriptor( - num_tokens, is_mm_encoder=is_mm_encoder - ) + return CUDAGraphMode.NONE, BatchDescriptor(num_tokens) effective_num_active_loras = num_active_loras if has_lora and num_active_loras > 0: @@ -307,13 +268,9 @@ def dispatch( # so we must use max_loras + 1 for dispatch to find a matching graph. 
effective_num_active_loras = self.vllm_config.lora_config.max_loras + 1 - batch_desc = self._create_padded_batch_descriptor( - num_tokens, - uniform_decode, - has_lora, - effective_num_active_loras, - is_mm_encoder, - ) + batch_desc = self._create_padded_batch_descriptor( + num_tokens, uniform_decode, has_lora, effective_num_active_loras + ) relaxed_batch_desc = batch_desc.relax_for_mixed_batch_cudagraphs() if not disable_full: @@ -331,13 +288,9 @@ def dispatch( return CUDAGraphMode.PIECEWISE, relaxed_batch_desc # finally, just return no cudagraphs and a trivial batch descriptor - return CUDAGraphMode.NONE, BatchDescriptor( - num_tokens, is_mm_encoder=is_mm_encoder - ) + return CUDAGraphMode.NONE, BatchDescriptor(num_tokens) - def get_capture_descs( - self, is_mm_encoder: bool = False - ) -> list[tuple[CUDAGraphMode, list[BatchDescriptor]]]: + def get_capture_descs(self) -> list[tuple[CUDAGraphMode, list[BatchDescriptor]]]: """ Returns capture descriptors for cudagraph capturing. @@ -355,9 +308,7 @@ def get_capture_descs( descs = list(self.cudagraph_keys[mode]) if descs: # Sort by num_tokens descending (largest first) - filter_descs = [d for d in descs if d.is_mm_encoder == is_mm_encoder] - if filter_descs: - filter_descs.sort(key=lambda d: d.num_tokens, reverse=True) - result.append((mode, filter_descs)) + descs.sort(key=lambda d: d.num_tokens, reverse=True) + result.append((mode, descs)) return result diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index f1b1438c81b6..b8c88e881a1a 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -658,8 +658,6 @@ def __init__( dummy_inputs_builder = processor.dummy_inputs self.mm_cudagraph_manager = MMEncoderCudagraphManager( self.vllm_config, - self.cudagraph_dispatcher, - self.device, dummy_inputs_builder, ) @@ -2442,11 +2440,7 @@ def _execute_mm_encoder( # Default values for non-mm_encoder cudagraph case cudagraph_runtime_mode = CUDAGraphMode.NONE 
batch_descriptor = None - if ( - mm_mgr is not None - and mm_mgr.enabled - and "pixel_values" in mm_kwargs_group - ): + if mm_mgr is not None and "pixel_values" in mm_kwargs_group: ( cudagraph_runtime_mode, batch_descriptor, @@ -5259,9 +5253,10 @@ def freeze_gc(): ) # Capture MM encoder CUDA graphs if enabled if self.mm_cudagraph_manager is not None: - for runtime_mode, _ in self.cudagraph_dispatcher.get_capture_descs( - is_mm_encoder=True - ): + for ( + runtime_mode, + _, + ) in self.mm_cudagraph_manager.dispatcher.get_capture_descs(): self.mm_cudagraph_manager.capture( model=self.model, cudagraph_mode=runtime_mode ) @@ -5629,6 +5624,14 @@ def _check_and_update_cudagraph_mode( cudagraph_mode, self.uniform_decode_query_len ) + if ( + self.mm_cudagraph_manager is not None + and cudagraph_mode.mixed_mode() != CUDAGraphMode.NONE + ): + self.mm_cudagraph_manager.dispatcher.initialize_cudagraph_keys( + CUDAGraphMode.PIECEWISE, + ) + # Initialize eagle's cudagraph dispatcher if using eagle spec decode. 
if self.speculative_config and self.speculative_config.use_eagle(): assert isinstance(self.drafter, EagleProposer) diff --git a/vllm/v1/worker/mm_cudagraph.py b/vllm/v1/worker/mm_cudagraph.py index fd572c40c46c..4f2b1cf72e8e 100644 --- a/vllm/v1/worker/mm_cudagraph.py +++ b/vllm/v1/worker/mm_cudagraph.py @@ -24,13 +24,10 @@ class MMEncoderCudagraphManager: def __init__( self, vllm_config: VllmConfig, - cudagraph_dispatcher: CudagraphDispatcher, - device: torch.device, dummy_input_builder: BaseDummyInputsBuilder[Any], ): self.vllm_config = vllm_config - self.dispatcher = cudagraph_dispatcher - self.device = device + self.dispatcher = CudagraphDispatcher(self.vllm_config, is_mm_encoder=True) self.dummy_input_builder = dummy_input_builder compilation_config = vllm_config.compilation_config @@ -76,7 +73,7 @@ def dispatch_and_pad_mm_input( if not self.enabled: return ( CUDAGraphMode.NONE, - BatchDescriptor(num_tokens, is_mm_encoder=True), + BatchDescriptor(num_tokens), original_num_imgs, mm_kwargs_group, ) @@ -84,7 +81,6 @@ def dispatch_and_pad_mm_input( # Dispatch to get the target padded size cudagraph_runtime_mode, batch_descriptor = self.dispatcher.dispatch( num_tokens=num_tokens, - is_mm_encoder=True, ) target_num_tokens = batch_descriptor.num_tokens @@ -127,10 +123,7 @@ def capture_graph( num_tokens ) - batch_descriptor = BatchDescriptor( - num_tokens=num_tokens, - is_mm_encoder=True, - ) + batch_descriptor = BatchDescriptor(num_tokens=num_tokens) with set_forward_context( None, From 9be3fa64d2278682dd0c6148ad73a4ebd73d4599 Mon Sep 17 00:00:00 2001 From: Hongjian Zhang Date: Wed, 4 Feb 2026 19:46:35 +0800 Subject: [PATCH 34/35] modify to be compatible with V1 design Signed-off-by: Hongjian Zhang --- .../piecewise/test_qwenvl_vit_cudagraph.py | 2 +- vllm/model_executor/models/qwen2_5_vl.py | 33 +++++++------ vllm/model_executor/models/qwen3_vl.py | 33 +++++++------ vllm/v1/worker/gpu_model_runner.py | 15 +++--- vllm/v1/worker/mm_cudagraph.py | 48 
+++++++------------ 5 files changed, 57 insertions(+), 74 deletions(-) diff --git a/tests/compile/piecewise/test_qwenvl_vit_cudagraph.py b/tests/compile/piecewise/test_qwenvl_vit_cudagraph.py index 82cb10394720..f59368fcbd1c 100644 --- a/tests/compile/piecewise/test_qwenvl_vit_cudagraph.py +++ b/tests/compile/piecewise/test_qwenvl_vit_cudagraph.py @@ -116,7 +116,7 @@ def _worker_embed_multimodal( vllm_config, dummy_inputs_builder, ) - mm_cudagraph_manager.dispatcher.initialize_cudagraph_keys( + mm_cudagraph_manager.initialize_cudagraph_keys( CUDAGraphMode.PIECEWISE, ) diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index d6a352392e9c..bb7e44f9f30b 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -51,11 +51,14 @@ CUDAGraphMode, VllmConfig, get_current_vllm_config, - set_current_vllm_config, ) from vllm.distributed import parallel_state from vllm.distributed import utils as dist_utils -from vllm.forward_context import get_forward_context, is_forward_context_available +from vllm.forward_context import ( + get_forward_context, + is_forward_context_available, + set_forward_context, +) from vllm.logger import init_logger from vllm.model_executor.layers.activation import get_act_and_mul_fn from vllm.model_executor.layers.attention import MMEncoderAttention @@ -1287,20 +1290,16 @@ def _process_image_input( image_embeds = image_input["image_embeds"].type(self.visual.dtype) else: pixel_values = image_input["pixel_values"] - with set_current_vllm_config(self.vllm_config): - if ( - self.use_data_parallel - and not self.vllm_config.in_mm_encoder_tracing - ): - return run_dp_sharded_mrope_vision_model( - self.visual, - pixel_values, - grid_thw_list, - rope_type="rope_3d", - mm_cudagraph_manager=mm_cudagraph_manager, - ) - else: - image_embeds = self.visual(pixel_values, grid_thw=grid_thw_list) + if self.use_data_parallel and not self.vllm_config.in_mm_encoder_tracing: + return 
run_dp_sharded_mrope_vision_model( + self.visual, + pixel_values, + grid_thw_list, + rope_type="rope_3d", + mm_cudagraph_manager=mm_cudagraph_manager, + ) + else: + image_embeds = self.visual(pixel_values, grid_thw=grid_thw_list) # Split concatenated embeddings for each image item. merge_size = self.visual.spatial_merge_size @@ -1351,7 +1350,7 @@ def _process_video_input( video_embeds = video_input["video_embeds"].type(self.visual.dtype) else: pixel_values_videos = video_input["pixel_values_videos"] - with set_current_vllm_config(self.vllm_config): + with set_forward_context(None, self.vllm_config): if ( self.use_data_parallel and not self.vllm_config.in_mm_encoder_tracing diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index 9e6001f474b0..f1fb4a1d2ad4 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py @@ -57,11 +57,14 @@ CUDAGraphMode, VllmConfig, get_current_vllm_config, - set_current_vllm_config, ) from vllm.config.multimodal import BaseDummyOptions, VideoDummyOptions from vllm.distributed import get_pp_group -from vllm.forward_context import get_forward_context, is_forward_context_available +from vllm.forward_context import ( + get_forward_context, + is_forward_context_available, + set_forward_context, +) from vllm.logger import init_logger from vllm.model_executor.layers.activation import _ACTIVATION_REGISTRY from vllm.model_executor.layers.conv import Conv3dLayer @@ -1546,20 +1549,16 @@ def _process_image_input( else: pixel_values = image_input["pixel_values"].type(self.visual.dtype) - with set_current_vllm_config(self.vllm_config): - if ( - self.use_data_parallel - and not self.vllm_config.in_mm_encoder_tracing - ): - return run_dp_sharded_mrope_vision_model( - self.visual, - pixel_values, - grid_thw_list, - rope_type="rope_3d", - mm_cudagraph_manager=mm_cudagraph_manager, - ) - else: - image_embeds = self.visual(pixel_values, grid_thw=grid_thw_list) + if 
self.use_data_parallel and not self.vllm_config.in_mm_encoder_tracing: + return run_dp_sharded_mrope_vision_model( + self.visual, + pixel_values, + grid_thw_list, + rope_type="rope_3d", + mm_cudagraph_manager=mm_cudagraph_manager, + ) + else: + image_embeds = self.visual(pixel_values, grid_thw=grid_thw_list) # Split concatenated embeddings for each image item. merge_size = self.visual.spatial_merge_size @@ -1581,7 +1580,7 @@ def _process_video_input( pixel_values_videos = video_input["pixel_values_videos"].type( self.visual.dtype ) - with set_current_vllm_config(self.vllm_config): + with set_forward_context(None, self.vllm_config): if ( self.use_data_parallel and not self.vllm_config.in_mm_encoder_tracing diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index b8c88e881a1a..bc48153ad060 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -5255,10 +5255,12 @@ def freeze_gc(): if self.mm_cudagraph_manager is not None: for ( runtime_mode, - _, + batch_descs, ) in self.mm_cudagraph_manager.dispatcher.get_capture_descs(): self.mm_cudagraph_manager.capture( - model=self.model, cudagraph_mode=runtime_mode + model=self.model, + batch_descs=batch_descs, + cudagraph_mode=runtime_mode, ) torch.cuda.synchronize() @@ -5624,13 +5626,8 @@ def _check_and_update_cudagraph_mode( cudagraph_mode, self.uniform_decode_query_len ) - if ( - self.mm_cudagraph_manager is not None - and cudagraph_mode.mixed_mode() != CUDAGraphMode.NONE - ): - self.mm_cudagraph_manager.dispatcher.initialize_cudagraph_keys( - CUDAGraphMode.PIECEWISE, - ) + if self.mm_cudagraph_manager is not None: + self.mm_cudagraph_manager.initialize_cudagraph_keys(cudagraph_mode) # Initialize eagle's cudagraph dispatcher if using eagle spec decode. 
if self.speculative_config and self.speculative_config.use_eagle(): diff --git a/vllm/v1/worker/mm_cudagraph.py b/vllm/v1/worker/mm_cudagraph.py index 4f2b1cf72e8e..c633d6689ef1 100644 --- a/vllm/v1/worker/mm_cudagraph.py +++ b/vllm/v1/worker/mm_cudagraph.py @@ -30,19 +30,6 @@ def __init__( self.dispatcher = CudagraphDispatcher(self.vllm_config, is_mm_encoder=True) self.dummy_input_builder = dummy_input_builder - compilation_config = vllm_config.compilation_config - self.capture_sizes: list[int] = [] - if compilation_config and compilation_config.mm_encoder_cudagraph_capture_sizes: - self.capture_sizes = sorted( - compilation_config.mm_encoder_cudagraph_capture_sizes - ) - - self.enabled = bool( - self.capture_sizes - and compilation_config - and compilation_config.cudagraph_mode != CUDAGraphMode.NONE - ) - # Check if using data parallel mode for ViT self.is_vit_dp_mode = self._check_vit_dp_mode(vllm_config) @@ -57,6 +44,18 @@ def _check_vit_dp_mode(self, vllm_config: VllmConfig) -> bool: return mm_encoder_tp_mode == "data" and tp_size > 1 + def initialize_cudagraph_keys(self, cudagraph_mode: CUDAGraphMode) -> None: + """Initialize cudagraph dispatcher keys for MM Encoder. + + MM Encoder only supports PIECEWISE cudagraphs. 
+ """ + if cudagraph_mode.mixed_mode() in [CUDAGraphMode.PIECEWISE, CUDAGraphMode.FULL]: + mm_cudagraph_mode = CUDAGraphMode.PIECEWISE + else: + mm_cudagraph_mode = CUDAGraphMode.NONE + + self.dispatcher.initialize_cudagraph_keys(mm_cudagraph_mode) + def dispatch_and_pad_mm_input( self, mm_kwargs_group: BatchedTensorInputs, @@ -70,14 +69,6 @@ def dispatch_and_pad_mm_input( else: original_num_imgs = len(image_grid_thw) - if not self.enabled: - return ( - CUDAGraphMode.NONE, - BatchDescriptor(num_tokens), - original_num_imgs, - mm_kwargs_group, - ) - # Dispatch to get the target padded size cudagraph_runtime_mode, batch_descriptor = self.dispatcher.dispatch( num_tokens=num_tokens, @@ -138,25 +129,22 @@ def capture_graph( def capture( self, model: nn.Module, + batch_descs: "list[BatchDescriptor]", cudagraph_mode: CUDAGraphMode, ) -> None: - if not self.enabled or not self.capture_sizes: - return - self.vllm_config.in_mm_encoder_tracing = True - capture_sizes_desc = list(reversed(self.capture_sizes)) - if is_global_first_rank(): - capture_sizes_iter: Any = tqdm( - capture_sizes_desc, + batch_descriptors: Any = tqdm( + batch_descs, disable=not self.vllm_config.load_config.use_tqdm_on_load, desc="Capturing MM_Encoder CUDA graphs (PIECEWISE)", ) else: - capture_sizes_iter = capture_sizes_desc + batch_descriptors = batch_descs - for capture_size in capture_sizes_iter: + for batch_desc in batch_descriptors: + capture_size = batch_desc.num_tokens self.capture_graph( capture_size, model=model, From 6da90763761c3916f0ccaa58a943184b55c06822 Mon Sep 17 00:00:00 2001 From: Hongjian Zhang Date: Thu, 5 Feb 2026 14:47:24 +0800 Subject: [PATCH 35/35] simplify CudagraphDispatcher init and restore video logic Signed-off-by: Hongjian Zhang --- vllm/model_executor/models/qwen2_5_vl.py | 9 ++----- vllm/model_executor/models/qwen3_vl.py | 15 +++--------- vllm/model_executor/models/vision.py | 6 +---- vllm/v1/cudagraph_dispatcher.py | 31 ++++++++++++------------ 
vllm/v1/worker/mm_cudagraph.py | 17 +++++++++++-- 5 files changed, 38 insertions(+), 40 deletions(-) diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index bb7e44f9f30b..ca13b8c096b1 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -1338,9 +1338,7 @@ def _postprocess_image_embeds_evs( return tuple(image_embeds_split) def _process_video_input( - self, - video_input: Qwen2_5_VLVideoInputs, - mm_cudagraph_manager: Any | None = None, + self, video_input: Qwen2_5_VLVideoInputs ) -> tuple[torch.Tensor, ...]: grid_thw = video_input["video_grid_thw"] assert grid_thw.ndim == 2 @@ -1360,7 +1358,6 @@ def _process_video_input( pixel_values_videos, grid_thw_list, rope_type="rope_3d", - mm_cudagraph_manager=mm_cudagraph_manager, ) else: video_embeds = self.visual( @@ -1533,9 +1530,7 @@ def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings: ) multimodal_embeddings += tuple(image_embeddings) if modality == "video": - video_embeddings = self._process_video_input( - multimodal_input, mm_cudagraph_manager=mm_cudagraph_manager - ) + video_embeddings = self._process_video_input(multimodal_input) if self.is_multimodal_pruning_enabled: video_embeddings = self._postprocess_video_embeds_evs( video_embeddings, multimodal_input diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index f1fb4a1d2ad4..3f51ce90fbd2 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py @@ -1566,13 +1566,10 @@ def _process_image_input( return image_embeds.split(sizes) def _process_video_input( - self, - video_input: Qwen2_5_VLVideoInputs, - mm_cudagraph_manager: Any | None = None, + self, video_input: Qwen2_5_VLVideoInputs ) -> tuple[torch.Tensor, ...]: grid_thw = video_input["video_grid_thw"] assert grid_thw.ndim == 2 - grid_thw_list = grid_thw.tolist() if video_input["type"] == "video_embeds": video_embeds = 
video_input["video_embeds"].type(self.visual.dtype) @@ -1585,17 +1582,15 @@ def _process_video_input( self.use_data_parallel and not self.vllm_config.in_mm_encoder_tracing ): + grid_thw_list = grid_thw.tolist() return run_dp_sharded_mrope_vision_model( self.visual, pixel_values_videos, grid_thw_list, rope_type="rope_3d", - mm_cudagraph_manager=mm_cudagraph_manager, ) else: - video_embeds = self.visual( - pixel_values_videos, grid_thw=grid_thw_list - ) + video_embeds = self.visual(pixel_values_videos, grid_thw=grid_thw) # Split concatenated embeddings for each video item. merge_size = self.visual.spatial_merge_size @@ -2062,9 +2057,7 @@ def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings | None: ) multimodal_embeddings += tuple(image_embeddings) if modality == "video": - video_embeddings = self._process_video_input( - multimodal_input, mm_cudagraph_manager=mm_cudagraph_manager - ) + video_embeddings = self._process_video_input(multimodal_input) if self.is_multimodal_pruning_enabled: video_embeddings = self._postprocess_video_embeds_evs( video_embeddings, multimodal_input diff --git a/vllm/model_executor/models/vision.py b/vllm/model_executor/models/vision.py index adcfd7d3b370..837ddf4a2534 100644 --- a/vllm/model_executor/models/vision.py +++ b/vllm/model_executor/models/vision.py @@ -11,11 +11,7 @@ import torch from transformers import PretrainedConfig -from vllm.config import ( - MultiModalConfig, - VllmConfig, - get_current_vllm_config, -) +from vllm.config import MultiModalConfig, VllmConfig, get_current_vllm_config from vllm.distributed import ( get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, diff --git a/vllm/v1/cudagraph_dispatcher.py b/vllm/v1/cudagraph_dispatcher.py index 2716fce64ef5..2fca21831c86 100644 --- a/vllm/v1/cudagraph_dispatcher.py +++ b/vllm/v1/cudagraph_dispatcher.py @@ -29,20 +29,9 @@ class CudagraphDispatcher: runnable without cudagraph (if the mode does not match or mode is NONE). 
""" - def __init__(self, vllm_config: VllmConfig, is_mm_encoder: bool = False): + def __init__(self, vllm_config: VllmConfig): self.vllm_config = vllm_config self.compilation_config = vllm_config.compilation_config - self.is_mm_encoder = is_mm_encoder - self.max_capture_size = ( - self.compilation_config.max_cudagraph_capture_size - if not is_mm_encoder - else self.compilation_config.max_mm_encoder_cudagraph_capture_size - ) - self.capture_sizes = ( - self.compilation_config.cudagraph_capture_sizes - if not is_mm_encoder - else self.compilation_config.mm_encoder_cudagraph_capture_sizes - ) self.uniform_decode_query_len = ( 1 if not self.vllm_config.speculative_config @@ -76,6 +65,8 @@ def __init__(self, vllm_config: VllmConfig, is_mm_encoder: bool = False): ) # Default cudagraph_mode to NONE until initialize_cudagraph_keys is called self.cudagraph_mode = CUDAGraphMode.NONE + self.capture_sizes: list[int] = [] + self.max_capture_size: int = 0 def _compute_bs_to_padded_graph_size(self) -> None: """Pre-compute the mapping from batch size to padded graph size.""" @@ -163,12 +154,22 @@ def add_cudagraph_key( self.cudagraph_keys[runtime_mode].add(batch_descriptor) def initialize_cudagraph_keys( - self, cudagraph_mode: CUDAGraphMode, uniform_decode_query_len: int = 1 + self, + cudagraph_mode: CUDAGraphMode, + uniform_decode_query_len: int = 1, + capture_sizes: list[int] | None = None, + max_capture_size: int | None = None, + enable_lora: bool = True, ): # This should be called only after attention backend is initialized. So we can # get the correct cudagraph mode after backend support is resolved. 
self.cudagraph_mode = cudagraph_mode - + self.capture_sizes = ( + capture_sizes or self.compilation_config.cudagraph_capture_sizes + ) + self.max_capture_size = ( + max_capture_size or self.compilation_config.max_cudagraph_capture_size + ) # Early exit if cudagraphs are disabled if cudagraph_mode == CUDAGraphMode.NONE: self.keys_initialized = True @@ -177,7 +178,7 @@ def initialize_cudagraph_keys( self._compute_bs_to_padded_graph_size() # Get LoRA cases to capture - lora_cases = self._get_lora_cases() if not self.is_mm_encoder else [0] + lora_cases = self._get_lora_cases() if enable_lora else [0] self.captured_lora_counts = [ lora_count for lora_count in lora_cases if lora_count ] diff --git a/vllm/v1/worker/mm_cudagraph.py b/vllm/v1/worker/mm_cudagraph.py index c633d6689ef1..6175d7d5c893 100644 --- a/vllm/v1/worker/mm_cudagraph.py +++ b/vllm/v1/worker/mm_cudagraph.py @@ -27,7 +27,7 @@ def __init__( dummy_input_builder: BaseDummyInputsBuilder[Any], ): self.vllm_config = vllm_config - self.dispatcher = CudagraphDispatcher(self.vllm_config, is_mm_encoder=True) + self.dispatcher = CudagraphDispatcher(self.vllm_config) self.dummy_input_builder = dummy_input_builder # Check if using data parallel mode for ViT @@ -54,7 +54,20 @@ def initialize_cudagraph_keys(self, cudagraph_mode: CUDAGraphMode) -> None: else: mm_cudagraph_mode = CUDAGraphMode.NONE - self.dispatcher.initialize_cudagraph_keys(mm_cudagraph_mode) + max_capture_size = ( + self.vllm_config.compilation_config.max_mm_encoder_cudagraph_capture_size + ) + + capture_sizes = ( + self.vllm_config.compilation_config.mm_encoder_cudagraph_capture_sizes + ) + + self.dispatcher.initialize_cudagraph_keys( + mm_cudagraph_mode, + capture_sizes=capture_sizes, + max_capture_size=max_capture_size, + enable_lora=False, + ) def dispatch_and_pad_mm_input( self,