
Commit 183d3f0

Format and add e2e tests
Signed-off-by: Chang Liu (Enterprise Products) <[email protected]>
1 parent 00d6692 commit 183d3f0

5 files changed: +231 −150 lines

tensorrt_llm/_torch/models/modeling_multimodal_utils.py

Lines changed: 39 additions & 30 deletions
@@ -17,7 +17,7 @@
 # and s2wrapper: https://github.com/bfshi/scaling_on_scales

 import math
-from typing import List, Optional, Tuple, Callable, Dict, Any, Union
+from typing import List, Optional, Tuple

 import torch
 import torch.nn.functional as F
@@ -31,8 +31,7 @@


 def _get_active_multimodal_params(
-        multimodal_params: List[MultimodalParams],
-) -> List[MultimodalParams]:
+        multimodal_params: List[MultimodalParams], ) -> List[MultimodalParams]:
     """
     Get active multimodal params that need encoder processing for chunk prefill.
     """
@@ -44,10 +43,12 @@ def _get_active_multimodal_params(
             continue

         # Check if embeddings are already cached
-        if (param.multimodal_data and
-                "multimodal_embedding" in param.multimodal_data and
-                param.multimodal_data["multimodal_embedding"] is not None):
-            logger.debug(f"Skipping encoder forward for param with cached multimodal_embedding")
+        if (param.multimodal_data
+                and "multimodal_embedding" in param.multimodal_data
+                and param.multimodal_data["multimodal_embedding"] is not None):
+            logger.debug(
+                f"Skipping encoder forward for param with cached multimodal_embedding"
+            )
             continue

         # This param needs encoder processing
@@ -65,11 +66,15 @@ def _cache_multimodal_embeddings(
     Uses torch.split for efficient tensor splitting without manual indexing.
     """
     # TODO: support multiple multimodal modalities per request
-    assert len(embeddings) == 1, "Currently only support single mm_embeds (single modality) per request"
+    assert len(
+        embeddings
+    ) == 1, "Currently only support single mm_embeds (single modality) per request"
     mm_embed = embeddings[0]

     # Collect embedding lengths for each parameter
-    embed_lengths = [param.multimodal_runtime.total_mm_tokens for param in multimodal_params]
+    embed_lengths = [
+        param.multimodal_runtime.total_mm_tokens for param in multimodal_params
+    ]

     # Validate total length matches
     total_expected = sum(embed_lengths)
@@ -83,7 +88,9 @@ def _cache_multimodal_embeddings(
     for param, embed_chunk in zip(multimodal_params, split_embeddings):
         param.multimodal_data["multimodal_embedding"] = embed_chunk

-    logger.debug(f"Cached {len(split_embeddings)} multimodal embedding chunks in this iteration")
+    logger.debug(
+        f"Cached {len(split_embeddings)} multimodal embedding chunks in this iteration"
+    )


 def get_multimodal_embeddings(
@@ -111,9 +118,7 @@ def get_multimodal_embeddings(
         return []

     # Step 1: Find active multimodal params that need encoder processing
-    active_multimodal_params = _get_active_multimodal_params(
-        multimodal_params
-    )
+    active_multimodal_params = _get_active_multimodal_params(multimodal_params)

     # Step 2: Run encoder forward only on uncached parameters
     if active_multimodal_params:
@@ -124,19 +129,24 @@ def get_multimodal_embeddings(
         return encoder_outputs

     # Validate that multimodal_runtime has required attributes for caching
-    if (not hasattr(active_multimodal_params[0], 'multimodal_runtime') or
-            active_multimodal_params[0].multimodal_runtime is None or
-            active_multimodal_params[0].multimodal_runtime.total_mm_tokens is None):
-        logger.warning("Multimodal runtime data missing or incomplete - recomputed all embeddings")
+    if (not hasattr(active_multimodal_params[0], 'multimodal_runtime')
+            or active_multimodal_params[0].multimodal_runtime is None or
+            active_multimodal_params[0].multimodal_runtime.total_mm_tokens
+            is None):
+        logger.warning(
+            "Multimodal runtime data missing or incomplete - recomputed all embeddings"
+        )
        return encoder_outputs

     # Step 3: Cache the computed embeddings to multimodal_data["multimodal_embedding"]
-    _cache_multimodal_embeddings(
-        active_multimodal_params, encoder_outputs
-    )
+    _cache_multimodal_embeddings(active_multimodal_params, encoder_outputs)

     # Step 4: Gather all embeddings for the batch
-    all_embeddings = torch.cat([param.multimodal_data["multimodal_embedding"] for param in multimodal_params], dim=0)
+    all_embeddings = torch.cat([
+        param.multimodal_data["multimodal_embedding"]
+        for param in multimodal_params
+    ],
+                               dim=0)
     return [all_embeddings]


@@ -176,28 +186,27 @@ def find_input_mm_embeds(
         return mm_embeds

     # Calculate total tokens that need processing (both cached and current chunk)
-    total_mm_tokens = sum([
-        param.multimodal_runtime.num_mm_tokens
-        for param in multimodal_params
-    ])
+    total_mm_tokens = sum(
+        [param.multimodal_runtime.num_mm_tokens for param in multimodal_params])

     if total_mm_tokens == 0:
         # No tokens need processing, return empty list
         logger.debug(
-            "All multimodal tokens are cached or beyond current chunk, skipping vision encoder forward")
+            "All multimodal tokens are cached or beyond current chunk, skipping vision encoder forward"
+        )
         return []

     if total_mm_tokens == sum(mm_embed.shape[0] for mm_embed in mm_embeds):
         return mm_embeds

-
     current_pos = 0
     slices = []
     for param in multimodal_params:
         runtime = param.multimodal_runtime
-        slices.append((current_pos + runtime.num_unseen_mm_tokens,
-                       current_pos + runtime.num_unseen_mm_tokens + runtime.num_mm_tokens))
-        if len(mm_embeds) == 1:  # pre-concatenated mm_embeds, need global offset
+        slices.append((current_pos + runtime.num_unseen_mm_tokens, current_pos +
+                       runtime.num_unseen_mm_tokens + runtime.num_mm_tokens))
+        if len(mm_embeds
+               ) == 1:  # pre-concatenated mm_embeds, need global offset
             current_pos += runtime.total_mm_tokens

     sliced_mm_embeds = []
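
For readers skimming the caching change above: _cache_multimodal_embeddings leans on torch.split to hand each request its own slice of the batched encoder output. A minimal, self-contained sketch of that pattern follows; the lengths and hidden size are made up for illustration, not taken from the code.

import torch

# Per-request multimodal token counts, playing the role of
# param.multimodal_runtime.total_mm_tokens for a 3-request batch (illustrative).
embed_lengths = [3, 5, 2]

# Concatenated encoder output for the whole batch (hidden size is arbitrary here).
mm_embed = torch.randn(sum(embed_lengths), 4096)

# torch.split slices along dim 0 without manual index arithmetic; each chunk
# can then be cached per request under "multimodal_embedding".
split_embeddings = torch.split(mm_embed, embed_lengths, dim=0)

for length, chunk in zip(embed_lengths, split_embeddings):
    assert chunk.shape[0] == length  # every request gets exactly its own tokens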

tensorrt_llm/_torch/models/modeling_qwen2vl.py

Lines changed: 5 additions & 8 deletions
@@ -19,11 +19,8 @@
 from ..attention_backend import AttentionMetadata
 from ..model_config import ModelConfig
 from .modeling_auto import AutoModelForCausalLM
-from .modeling_multimodal_utils import (
-    find_input_mm_embeds,
-    fuse_input_embeds,
-    get_multimodal_embeddings
-)
+from .modeling_multimodal_utils import (find_input_mm_embeds, fuse_input_embeds,
+                                        get_multimodal_embeddings)
 from .modeling_utils import register_auto_model

 DISAGG = os.getenv('TLLM_MULTIMODAL_DISAGGREGATED', '0') == '1'
@@ -618,8 +615,7 @@ def forward(
         if not DISAGG:
             mm_embeds = get_multimodal_embeddings(
                 encoder_forward_fn=self.mm_encoder.forward,
-                multimodal_params=multimodal_params[:num_context_requests]
-            )
+                multimodal_params=multimodal_params[:num_context_requests])
         else:
             # TODO: this is a dead path for now
             mm_embeds = [
@@ -630,7 +626,8 @@ def forward(
                 multimodal_params, num_context_requests,
                 num_generation_requests)

-        mm_embeds = find_input_mm_embeds(mm_embeds, multimodal_params[:num_context_requests])
+        mm_embeds = find_input_mm_embeds(
+            mm_embeds, multimodal_params[:num_context_requests])

         if 'mrope_position_deltas' in kwargs:
             mrope_config['mrope_position_deltas'] = kwargs[
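
The qwen2vl hunks only reflow code, but the reflow makes the context-phase call order easy to miss, so here is a hedged sketch of that order as it appears in the diff. The argument names come from the hunks; the wrapper function and its name are illustrative only, not part of the model code.

# Illustrative wrapper; not part of the diff. It restates the call order
# visible above for context requests when DISAGG is disabled.
def _context_phase_mm_embeds(self, multimodal_params, num_context_requests):
    # 1) Run (or skip, if already cached) the vision encoder for context requests only.
    mm_embeds = get_multimodal_embeddings(
        encoder_forward_fn=self.mm_encoder.forward,
        multimodal_params=multimodal_params[:num_context_requests])
    # 2) Keep only the embedding slices that belong to the current prefill chunk.
    return find_input_mm_embeds(mm_embeds,
                                multimodal_params[:num_context_requests])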

tensorrt_llm/inputs/multimodal.py

Lines changed: 5 additions & 3 deletions
@@ -98,14 +98,15 @@ class MultimodalRuntimeData:
         num_mm_tokens: Number of multimodal tokens in the current chunk (computed)
         total_mm_tokens: Total number of multimodal tokens in the request sequence (computed)
     """
-    past_seen_token_num: int  # == num_cached_tokens
+    past_seen_token_num: int
     mm_token_lengths: List[int]
     mm_token_positions: List[int]
-    chunk_end_pos: int  # == end_pos
+    chunk_end_pos: int

     num_unseen_mm_tokens: Optional[int] = None
     num_mm_tokens: Optional[int] = None
     total_mm_tokens: Optional[int] = None
+
     # TODO: fine-grained control of encoder runner/cache to each mm_item

     def __post_init__(self):
@@ -156,7 +157,8 @@ def __post_init__(self):
                 # Full overlap - count the entire mm item chunk
                 self.num_mm_tokens += length

-        if self.num_unseen_mm_tokens + self.num_mm_tokens > sum(self.mm_token_lengths):
+        if self.num_unseen_mm_tokens + self.num_mm_tokens > sum(
+                self.mm_token_lengths):
             raise ValueError(
                 f"num_unseen_mm_tokens ({self.num_unseen_mm_tokens}) + num_mm_tokens ({self.num_mm_tokens}) must be less than or equal to sum of mm_token_lengths ({sum(self.mm_token_lengths)})"
             )
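
The reflowed validation above guards the per-item bookkeeping that __post_init__ performs for chunked prefill. As a toy illustration of that arithmetic, a hypothetical helper is sketched below: it only approximates the idea that one mm item's tokens fall before the current chunk, inside it, or beyond it, and that the first two counts can never exceed the item's length (the invariant the ValueError enforces).

def partition_mm_item(position: int, length: int, past_seen_token_num: int,
                      chunk_end_pos: int):
    """Hypothetical helper: split one mm item's tokens around a prefill chunk."""
    start, end = position, position + length
    before_chunk = max(0, min(end, past_seen_token_num) - start)
    in_chunk = max(0, min(end, chunk_end_pos) - max(start, past_seen_token_num))
    beyond_chunk = length - before_chunk - in_chunk
    # Mirrors the invariant the ValueError above enforces across all items.
    assert before_chunk + in_chunk <= length
    return before_chunk, in_chunk, beyond_chunk


# Example: a 10-token image at position 8, with 12 prompt tokens already cached
# and the current chunk ending at token 20 -> 4 cached, 6 in this chunk, 0 deferred.
print(partition_mm_item(position=8, length=10, past_seen_token_num=12,
                        chunk_end_pos=20))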

tests/integration/defs/test_e2e.py

Lines changed: 14 additions & 1 deletion
@@ -2121,6 +2121,7 @@ def test_ptp_quickstart_advanced_mixed_precision(llm_root, llm_venv):

 @pytest.mark.parametrize("use_cuda_graph", [False, True])
 @pytest.mark.parametrize("modality", ["image", "video", "mixture_text_image"])
+@pytest.mark.parametrize("enable_chunked_prefill", [False, True])
 @pytest.mark.parametrize("model_name,model_path", [
     ("NVILA-8B-FP16", "vila/NVILA-8B"),
     ("NVILA-15B-FP16", "NVILA-15B"),
@@ -2135,9 +2136,16 @@ def test_ptp_quickstart_advanced_mixed_precision(llm_root, llm_venv):
          marks=pytest.mark.skip_less_device_memory(80000)),
 ])
 def test_ptp_quickstart_multimodal(llm_root, llm_venv, model_name, model_path,
-                                   modality, use_cuda_graph):
+                                   modality, use_cuda_graph,
+                                   enable_chunked_prefill):
     # NOTE: individual tests need to be enabled in
     # tests/integration/test_lists/qa/examples_test_list.txt
+    if model_name not in ["qwen2-vl-7b-instruct", "qwen2.5-vl-7b-instruct"
+                          ] and enable_chunked_prefill:
+        pytest.skip(
+            "Only Qwen2-VL and Qwen2-5-VL support chunked prefill for now")
+    if modality != "image" and enable_chunked_prefill:
+        pytest.skip("Chunked prefill is only supported for image modality")

     example_root = Path(os.path.join(llm_root, "examples", "llm-api"))
     test_data_root = Path(
@@ -2262,6 +2270,11 @@ def test_ptp_quickstart_multimodal(llm_root, llm_venv, model_name, model_path,
     if model_name in ["qwen2-vl-7b-instruct", "qwen2.5-vl-7b-instruct"
                       ] and modality == "video":
         cmd.append("--max_num_tokens=16384")
+    else:
+        if enable_chunked_prefill:
+            cmd.append("--enable_chunked_prefill")
+            cmd.append("--max_num_tokens=256")
+
     if use_cuda_graph:
         cmd.append("--use_cuda_graph")
     # Gemma3 VLM needs a custom mask which is only supported by flashinfer backend currently.
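
To summarize which new parametrizations actually run: the two pytest.skip guards above restrict chunked prefill to the Qwen2-VL family with image inputs. A hedged, standalone restatement of that gating (helper name and asserts are illustrative, not test code from the diff):

def runs_with_chunked_prefill(model_name: str, modality: str) -> bool:
    # Mirrors the skip conditions added in the test above.
    qwen_vl_models = {"qwen2-vl-7b-instruct", "qwen2.5-vl-7b-instruct"}
    return model_name in qwen_vl_models and modality == "image"


assert runs_with_chunked_prefill("qwen2.5-vl-7b-instruct", "image")
assert not runs_with_chunked_prefill("NVILA-8B-FP16", "image")         # skipped: model not supported
assert not runs_with_chunked_prefill("qwen2-vl-7b-instruct", "video")  # skipped: non-image modality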
