
Commit 00d6692

Enable chunked prefill for qwen2vl
Signed-off-by: Chang Liu (Enterprise Products) <[email protected]>
1 parent 250f92a commit 00d6692

File tree

4 files changed: +383 -17 lines changed

tensorrt_llm/_torch/models/modeling_multimodal_utils.py

Lines changed: 113 additions & 8 deletions
@@ -17,7 +17,7 @@
 # and s2wrapper: https://github.com/bfshi/scaling_on_scales
 
 import math
-from typing import List, Optional, Tuple
+from typing import List, Optional, Tuple, Callable, Dict, Any, Union
 
 import torch
 import torch.nn.functional as F
@@ -30,6 +30,116 @@
 from tensorrt_llm.logger import logger
 
 
+def _get_active_multimodal_params(
+    multimodal_params: List[MultimodalParams],
+) -> List[MultimodalParams]:
+    """
+    Get active multimodal params that need encoder processing for chunked prefill.
+    """
+    params_to_run = []
+
+    for param in multimodal_params:
+        # Skip if no multimodal content
+        if not param.has_content():
+            continue
+
+        # Check if embeddings are already cached
+        if (param.multimodal_data and
+                "multimodal_embedding" in param.multimodal_data and
+                param.multimodal_data["multimodal_embedding"] is not None):
+            logger.debug(f"Skipping encoder forward for param with cached multimodal_embedding")
+            continue
+
+        # This param needs encoder processing
+        params_to_run.append(param)
+
+    return params_to_run
+
+
+def _cache_multimodal_embeddings(
+    multimodal_params: List[MultimodalParams],
+    embeddings: List[torch.Tensor],
+) -> None:
+    """
+    Cache computed multimodal embeddings back to multimodal_data to avoid recomputation.
+    Uses torch.split for efficient tensor splitting without manual indexing.
+    """
+    # TODO: support multiple multimodal modalities per request
+    assert len(embeddings) == 1, "Currently only support single mm_embeds (single modality) per request"
+    mm_embed = embeddings[0]
+
+    # Collect embedding lengths for each parameter
+    embed_lengths = [param.multimodal_runtime.total_mm_tokens for param in multimodal_params]
+
+    # Validate total length matches
+    total_expected = sum(embed_lengths)
+    assert len(mm_embed) == total_expected, \
+        f"Number of mm_embeds ({len(mm_embed)}) does not match expected total ({total_expected})"
+
+    # Use torch.split for efficient tensor splitting
+    split_embeddings = torch.split(mm_embed, embed_lengths, dim=0)
+
+    # Cache split embeddings to each parameter
+    for param, embed_chunk in zip(multimodal_params, split_embeddings):
+        param.multimodal_data["multimodal_embedding"] = embed_chunk
+
+    logger.debug(f"Cached {len(split_embeddings)} multimodal embedding chunks in this iteration")
+
+
+def get_multimodal_embeddings(
+    encoder_forward_fn,
+    multimodal_params: List[MultimodalParams],
+) -> List[torch.Tensor]:
+    """
+    High-level utility to get multimodal embeddings from encoder or cached embeddings.
+
+    This function will:
+    1. Identify which parameters need encoder processing
+    2. Run encoder forward only on uncached parameters
+    3. Cache newly computed embeddings (if enabled)
+    4. Gather all embeddings for the batch
+
+    Args:
+        encoder_forward_fn: Callable that performs encoder forward pass
+            Should accept List[MultimodalParams] and return List[torch.Tensor]
+        multimodal_params: All multimodal parameters in the batch
+
+    Returns:
+        List of multimodal embeddings for all multimodal params in the batch
+    """
+    if not multimodal_params:
+        return []
+
+    # Step 1: Find active multimodal params that need encoder processing
+    active_multimodal_params = _get_active_multimodal_params(
+        multimodal_params
+    )
+
+    # Step 2: Run encoder forward only on uncached parameters
+    if active_multimodal_params:
+        encoder_outputs = encoder_forward_fn(active_multimodal_params)
+
+        # TODO: support multiple multimodal modalities per request
+        if len(encoder_outputs) > 1:
+            return encoder_outputs
+
+        # Validate that multimodal_runtime has required attributes for caching
+        if (not hasattr(active_multimodal_params[0], 'multimodal_runtime') or
+                active_multimodal_params[0].multimodal_runtime is None or
+                active_multimodal_params[0].multimodal_runtime.total_mm_tokens is None):
+            logger.warning("Multimodal runtime data missing or incomplete - recomputing all embeddings")
+            return encoder_outputs
+
+        # Step 3: Cache the computed embeddings to multimodal_data["multimodal_embedding"]
+        _cache_multimodal_embeddings(
+            active_multimodal_params, encoder_outputs
+        )
+
+    # Step 4: Gather all embeddings for the batch
+    all_embeddings = torch.cat([param.multimodal_data["multimodal_embedding"] for param in multimodal_params], dim=0)
+    return [all_embeddings]
+
+
 def find_input_mm_embeds(
     mm_embeds: List[torch.Tensor],
     multimodal_params: List[MultimodalParams]) -> List[torch.Tensor]:
@@ -66,10 +176,6 @@ def find_input_mm_embeds
         return mm_embeds
 
     # Calculate total tokens that need processing (both cached and current chunk)
-    total_unseen_mm_tokens = sum([
-        param.multimodal_runtime.num_unseen_mm_tokens
-        for param in multimodal_params
-    ])
     total_mm_tokens = sum([
         param.multimodal_runtime.num_mm_tokens
        for param in multimodal_params
@@ -92,12 +198,11 @@ def find_input_mm_embeds
         slices.append((current_pos + runtime.num_unseen_mm_tokens,
                        current_pos + runtime.num_unseen_mm_tokens + runtime.num_mm_tokens))
         if len(mm_embeds) == 1:  # pre-concatenated mm_embeds, need global offset
-            current_pos += sum(runtime.mm_token_lengths)
+            current_pos += runtime.total_mm_tokens
 
     sliced_mm_embeds = []
     if len(mm_embeds) == 1:
-        for start, end in slices:
-            sliced_mm_embeds.append(mm_embeds[0][start:end])
+        sliced_mm_embeds = [mm_embeds[0][start:end] for start, end in slices]
     else:  # slice each mm_embeds individually
         for i, (start, end) in enumerate(slices):
             sliced_mm_embeds.append(mm_embeds[i][start:end])
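
A rough usage sketch of the new caching helper (not part of the commit): it assumes get_multimodal_embeddings is in scope, e.g. imported from modeling_multimodal_utils, and uses hypothetical stub classes in place of MultimodalParams and its multimodal_runtime. The first call runs the stub encoder and caches per-request embedding chunks; a later call for the same requests reuses the cache and skips the encoder.

import torch
from dataclasses import dataclass, field
from typing import Dict

# Hypothetical stand-ins, not the real TensorRT-LLM classes; they expose only
# the attributes get_multimodal_embeddings relies on.
@dataclass
class StubRuntime:
    total_mm_tokens: int

@dataclass
class StubParam:
    multimodal_runtime: StubRuntime
    multimodal_data: Dict[str, torch.Tensor] = field(default_factory=dict)

    def has_content(self) -> bool:
        return True

def stub_encoder(params):
    # Pretend vision encoder: one 8-dim embedding per multimodal token, concatenated.
    total = sum(p.multimodal_runtime.total_mm_tokens for p in params)
    return [torch.zeros(total, 8)]

params = [StubParam(StubRuntime(total_mm_tokens=4)),
          StubParam(StubRuntime(total_mm_tokens=6))]

# First prefill chunk: encoder runs, embeddings are split and cached per request.
first = get_multimodal_embeddings(stub_encoder, params)
# Later chunk of the same requests: cached embeddings are reused, encoder is skipped.
second = get_multimodal_embeddings(stub_encoder, params)
assert first[0].shape == second[0].shape == (10, 8)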

tensorrt_llm/_torch/models/modeling_qwen2vl.py

Lines changed: 8 additions & 7 deletions
@@ -19,8 +19,11 @@
 from ..attention_backend import AttentionMetadata
 from ..model_config import ModelConfig
 from .modeling_auto import AutoModelForCausalLM
-from .modeling_multimodal_utils import (find_input_mm_embeds,
-                                        fuse_input_embeds)
+from .modeling_multimodal_utils import (
+    find_input_mm_embeds,
+    fuse_input_embeds,
+    get_multimodal_embeddings
+)
 from .modeling_utils import register_auto_model
 
 DISAGG = os.getenv('TLLM_MULTIMODAL_DISAGGREGATED', '0') == '1'
@@ -613,11 +616,9 @@ def forward(
 
         if len(multimodal_params) > 0:
             if not DISAGG:
-                #mm_embeds = self.mm_encoder.forward(
-                #    multimodal_params[:num_context_requests])
-                # Get the full mm embeds (from cache or compute)
-                mm_embeds = self._get_or_compute_mm_embeds(
-                    multimodal_params[:num_context_requests]
+                mm_embeds = get_multimodal_embeddings(
+                    encoder_forward_fn=self.mm_encoder.forward,
+                    multimodal_params=multimodal_params[:num_context_requests]
                 )
             else:
                 # TODO: this is a dead path for now

tensorrt_llm/inputs/multimodal.py

Lines changed: 4 additions & 0 deletions
@@ -96,6 +96,7 @@ class MultimodalRuntimeData:
         chunk_end_pos: End position of the current chunk for chunked prefill
         num_unseen_mm_tokens: Number of multimodal tokens that are cached (computed)
         num_mm_tokens: Number of multimodal tokens in the current chunk (computed)
+        total_mm_tokens: Total number of multimodal tokens in the request sequence (computed)
     """
     past_seen_token_num: int  # == num_cached_tokens
     mm_token_lengths: List[int]
@@ -104,10 +105,13 @@
 
     num_unseen_mm_tokens: Optional[int] = None
     num_mm_tokens: Optional[int] = None
+    total_mm_tokens: Optional[int] = None
     # TODO: fine-grained control of encoder runner/cache to each mm_item
 
     def __post_init__(self):
         # Validate input data
+        if self.total_mm_tokens is None:
+            self.total_mm_tokens = sum(self.mm_token_lengths)
         if len(self.mm_token_positions) != len(self.mm_token_lengths):
             raise ValueError(
                 f"mm_token_positions ({len(self.mm_token_positions)}) and mm_token_lengths ({len(self.mm_token_lengths)}) must have the same length"

Comments (0)