
Commit a6befdc

Address comments
Signed-off-by: Chang Liu (Enterprise Products) <[email protected]>
1 parent 3caea5f commit a6befdc

File tree: 7 files changed, +80 −67 lines


tensorrt_llm/_torch/models/modeling_multimodal_utils.py

Lines changed: 25 additions & 18 deletions
@@ -30,10 +30,10 @@
 from tensorrt_llm.logger import logger
 
 
-def _get_active_multimodal_params(
+def _get_uncached_multimodal_params(
         multimodal_params: List[MultimodalParams], ) -> List[MultimodalParams]:
     """
-    Get active multimodal params that need encoder processing for chunk prefill.
+    Get uncached multimodal params that need encoder processing for chunk prefill.
     """
     params_to_run = []
 
@@ -63,7 +63,8 @@ def _cache_multimodal_embeddings(
 ) -> None:
     """
     Cache computed multimodal embeddings back to multimodal_data to avoid recomputation.
-    Uses torch.split for efficient tensor splitting without manual indexing.
+    Note this function only caches multimodal embeddings within the current request context,
+    mostly for chunked prefill. It does not persist embeddings across different requests or sessions.
     """
     # TODO: support multiple multimodal modalities per request
     assert len(
@@ -73,7 +74,8 @@ def _cache_multimodal_embeddings(
 
     # Collect embedding lengths for each parameter
     embed_lengths = [
-        param.multimodal_runtime.total_mm_tokens for param in multimodal_params
+        param.multimodal_runtime.total_mm_tokens_in_request
+        for param in multimodal_params
     ]
 
     # Validate total length matches
@@ -117,29 +119,31 @@ def get_multimodal_embeddings(
     if not multimodal_params:
         return []
 
-    # Step 1: Find active multimodal params that need encoder processing
-    active_multimodal_params = _get_active_multimodal_params(multimodal_params)
+    # Step 1: Find uncached multimodal params that need encoder processing
+    uncached_multimodal_params = _get_uncached_multimodal_params(
+        multimodal_params)
 
     # Step 2: Run encoder forward only on uncached parameters
-    if active_multimodal_params:
-        encoder_outputs = encoder_forward_fn(active_multimodal_params)
+    if uncached_multimodal_params:
+        encoder_outputs = encoder_forward_fn(uncached_multimodal_params)
 
         # TODO: support multiple multimodal modalities per request
         if len(encoder_outputs) > 1:
             return encoder_outputs
 
         # Validate that multimodal_runtime has required attributes for caching
-        if (not hasattr(active_multimodal_params[0], 'multimodal_runtime')
-                or active_multimodal_params[0].multimodal_runtime is None or
-                active_multimodal_params[0].multimodal_runtime.total_mm_tokens
-                is None):
+        if (not hasattr(uncached_multimodal_params[0], 'multimodal_runtime')
+                or uncached_multimodal_params[0].multimodal_runtime is None
+                or uncached_multimodal_params[0].multimodal_runtime.
+                total_mm_tokens_in_request is None):
             logger.warning(
                 "Multimodal runtime data missing or incomplete - recomputed all embeddings"
             )
             return encoder_outputs
 
         # Step 3: Cache the computed embeddings to multimodal_data["multimodal_embedding"]
-        _cache_multimodal_embeddings(active_multimodal_params, encoder_outputs)
+        _cache_multimodal_embeddings(uncached_multimodal_params,
+                                     encoder_outputs)
 
     # Step 4: Gather all embeddings for the batch
     all_embeddings = torch.cat([
@@ -186,8 +190,10 @@ def find_input_mm_embeds(
         return mm_embeds
 
     # Calculate total tokens that need processing (both cached and current chunk)
-    total_mm_tokens = sum(
-        [param.multimodal_runtime.num_mm_tokens for param in multimodal_params])
+    total_mm_tokens = sum([
+        param.multimodal_runtime.num_mm_tokens_in_chunk
+        for param in multimodal_params
+    ])
 
     if total_mm_tokens == 0:
         # No tokens need processing, return empty list
@@ -203,11 +209,12 @@ def find_input_mm_embeds(
     slices = []
     for param in multimodal_params:
         runtime = param.multimodal_runtime
-        slices.append((current_pos + runtime.num_unseen_mm_tokens, current_pos +
-                       runtime.num_unseen_mm_tokens + runtime.num_mm_tokens))
+        slices.append(
+            (current_pos + runtime.num_unseen_mm_tokens, current_pos +
+             runtime.num_unseen_mm_tokens + runtime.num_mm_tokens_in_chunk))
         if len(mm_embeds
                ) == 1:  # pre-concatenated mm_embeds, need global offset
-            current_pos += runtime.total_mm_tokens
+            current_pos += runtime.total_mm_tokens_in_request
 
     sliced_mm_embeds = []
     if len(mm_embeds) == 1:
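For readers following the renamed counters, here is a small self-contained sketch of the slicing scheme that find_input_mm_embeds applies to a pre-concatenated embedding tensor. It is not the library code; the tensor shape and counter values below are invented for illustration, and only the counter names mirror MultimodalRuntimeData.

```python
# Hypothetical illustration of the per-chunk slicing; values are made up.
import torch

# One pre-concatenated embedding tensor covering two requests:
# request 0 has 6 mm tokens in total, request 1 has 4.
mm_embeds = torch.arange(10).unsqueeze(-1).float()  # shape [10, hidden=1]

runtimes = [
    # (num_unseen_mm_tokens, num_mm_tokens_in_chunk, total_mm_tokens_in_request)
    (2, 3, 6),  # request 0: 2 tokens already cached, 3 fall in this chunk
    (0, 4, 4),  # request 1: all 4 tokens fall in this chunk
]

slices, current_pos = [], 0
for unseen, in_chunk, total in runtimes:
    # Skip the already-seen prefix, keep only the tokens in the current chunk.
    slices.append((current_pos + unseen, current_pos + unseen + in_chunk))
    current_pos += total  # global offset into the concatenated tensor

sliced = torch.cat([mm_embeds[s:e] for s, e in slices], dim=0)
print(slices)        # [(2, 5), (6, 10)]
print(sliced.shape)  # torch.Size([7, 1]) -> 3 + 4 tokens for this chunk
```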

tensorrt_llm/_torch/models/modeling_qwen2vl.py

Lines changed: 1 addition & 2 deletions
@@ -309,7 +309,7 @@ def __call__(
             mm_processor_kwargs)
         if not mm_data:
             fused_input_ids = processed_inputs['input_ids']
-            return fused_input_ids.to(torch.int32).tolist(), {}
+            return fused_input_ids.flatten().to(torch.int32).tolist(), {}
 
         pixel_values = processed_inputs.get('pixel_values', None)
         pixel_values_videos = processed_inputs.get('pixel_values_videos', None)
@@ -619,7 +619,6 @@ def forward(
                 encoder_forward_fn=self.mm_encoder.forward,
                 multimodal_params=multimodal_params[:num_context_requests])
         else:
-            # TODO: this is a dead path for now
            mm_embeds = [
                multimodal_param.multimodal_data["multimodal_embedding"]
                for multimodal_param in multimodal_params
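The flatten() change in __call__ is easy to illustrate: tokenizer/processor outputs commonly carry a leading batch dimension, so calling tolist() directly produces a nested list. A minimal sketch (the tensor shape and token ids are assumptions for illustration, not taken from the actual processor output):

```python
import torch

# Assumed shape: processors commonly return input_ids as [1, seq_len].
fused_input_ids = torch.tensor([[101, 2023, 2003, 102]])

print(fused_input_ids.to(torch.int32).tolist())            # [[101, 2023, 2003, 102]] (nested)
print(fused_input_ids.flatten().to(torch.int32).tolist())  # [101, 2023, 2003, 102]  (flat)
```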

tensorrt_llm/_torch/pyexecutor/model_engine.py

Lines changed: 0 additions & 1 deletion
@@ -1241,7 +1241,6 @@ def _prepare_tp_inputs(
             num_cached_tokens_per_seq.append(past_seen_token_num)
 
             # Multimodal
-            # TODO: enable chunk prefill for multimodal (maybe need to pass prompt_tokens to MultimodalRuntimeData)
             py_multimodal_runtime = MultimodalRuntimeData(
                 mm_token_lengths=request.multimodal_lengths,
                 mm_token_positions=request.multimodal_positions,

tensorrt_llm/inputs/multimodal.py

Lines changed: 18 additions & 16 deletions
@@ -95,24 +95,24 @@ class MultimodalRuntimeData:
         mm_token_positions: Starting positions of each multimodal token chunk
         chunk_end_pos: End position of the current chunk for chunked prefill
         num_unseen_mm_tokens: Number of multimodal tokens that are cached (computed)
-        num_mm_tokens: Number of multimodal tokens in the current chunk (computed)
-        total_mm_tokens: Total number of multimodal tokens in the request sequence (computed)
+        num_mm_tokens_in_chunk: Number of multimodal tokens in the current chunk (computed)
+        total_mm_tokens_in_request: Total number of multimodal tokens in the request sequence (computed)
     """
     past_seen_token_num: int
     mm_token_lengths: List[int]
     mm_token_positions: List[int]
     chunk_end_pos: int
 
     num_unseen_mm_tokens: Optional[int] = None
-    num_mm_tokens: Optional[int] = None
-    total_mm_tokens: Optional[int] = None
+    num_mm_tokens_in_chunk: Optional[int] = None
+    total_mm_tokens_in_request: Optional[int] = None
 
     # TODO: fine-grained control of encoder runner/cache to each mm_item
 
     def __post_init__(self):
         # Validate input data
-        if self.total_mm_tokens is None:
-            self.total_mm_tokens = sum(self.mm_token_lengths)
+        if self.total_mm_tokens_in_request is None:
+            self.total_mm_tokens_in_request = sum(self.mm_token_lengths)
         if len(self.mm_token_positions) != len(self.mm_token_lengths):
             raise ValueError(
                 f"mm_token_positions ({len(self.mm_token_positions)}) and mm_token_lengths ({len(self.mm_token_lengths)}) must have the same length"
@@ -133,34 +133,36 @@ def __post_init__(self):
                 f"All mm_token_positions must be non-negative, got {self.mm_token_positions}"
             )
 
-        if self.num_unseen_mm_tokens is None or self.num_mm_tokens is None:
+        if self.num_unseen_mm_tokens is None or self.num_mm_tokens_in_chunk is None:
             # Compute cached multimodal tokens based on positions and cached tokens
             self.num_unseen_mm_tokens = 0
-            self.num_mm_tokens = 0
+            self.num_mm_tokens_in_chunk = 0
+            remainder = 0
             for pos, length in zip(self.mm_token_positions,
                                    self.mm_token_lengths):
                 if pos + length <= self.past_seen_token_num:
                     self.num_unseen_mm_tokens += length
                 elif pos < self.past_seen_token_num:
                     # Partial overlap - only count the cached portion
                     self.num_unseen_mm_tokens += self.past_seen_token_num - pos
-                    if pos + length > self.chunk_end_pos:
-                        self.num_mm_tokens += self.chunk_end_pos - self.past_seen_token_num
-                    else:
-                        self.num_mm_tokens += pos + length - self.past_seen_token_num
+                    self.num_mm_tokens_in_chunk += min(
+                        self.chunk_end_pos,
+                        pos + length) - self.past_seen_token_num
                 else:
                     if pos + length > self.chunk_end_pos:
                         # Partial overlap - only count the cached portion
                         if pos < self.chunk_end_pos:
-                            self.num_mm_tokens += self.chunk_end_pos - pos
+                            self.num_mm_tokens_in_chunk += self.chunk_end_pos - pos
+                        else:
+                            remainder += length
                     else:
                         # Full overlap - count the entire mm item chunk
-                        self.num_mm_tokens += length
+                        self.num_mm_tokens_in_chunk += length
 
-            if self.num_unseen_mm_tokens + self.num_mm_tokens > sum(
+            if self.num_unseen_mm_tokens + self.num_mm_tokens_in_chunk + remainder > sum(
                     self.mm_token_lengths):
                 raise ValueError(
-                    f"num_unseen_mm_tokens ({self.num_unseen_mm_tokens}) + num_mm_tokens ({self.num_mm_tokens}) must be less than or equal to sum of mm_token_lengths ({sum(self.mm_token_lengths)})"
+                    f"num_unseen_mm_tokens ({self.num_unseen_mm_tokens}) + num_mm_tokens_in_chunk ({self.num_mm_tokens_in_chunk}) + remainder ({remainder}) must be less than or equal to sum of mm_token_lengths ({sum(self.mm_token_lengths)})"
                )
 
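The reworked bookkeeping in __post_init__ is easiest to follow with concrete numbers. Below is a standalone sketch that mirrors the counter arithmetic from the diff; it is not the MultimodalRuntimeData class itself, and the token positions, lengths, and chunk boundaries in the example are invented.

```python
def split_mm_tokens(past_seen_token_num, chunk_end_pos,
                    mm_token_positions, mm_token_lengths):
    """Mirror of the counter logic in MultimodalRuntimeData.__post_init__ (sketch)."""
    num_unseen = 0    # mm tokens already covered by previous chunks
    num_in_chunk = 0  # mm tokens that fall inside the current chunk
    remainder = 0     # mm items that start beyond the current chunk
    for pos, length in zip(mm_token_positions, mm_token_lengths):
        if pos + length <= past_seen_token_num:
            num_unseen += length
        elif pos < past_seen_token_num:
            # Item straddles the already-seen boundary.
            num_unseen += past_seen_token_num - pos
            num_in_chunk += min(chunk_end_pos, pos + length) - past_seen_token_num
        else:
            if pos + length > chunk_end_pos:
                if pos < chunk_end_pos:
                    num_in_chunk += chunk_end_pos - pos
                else:
                    remainder += length  # entirely in a future chunk
            else:
                num_in_chunk += length
    return num_unseen, num_in_chunk, remainder

# Invented example: a 10-token image at position 5, current chunk covers tokens [8, 12).
print(split_mm_tokens(past_seen_token_num=8, chunk_end_pos=12,
                      mm_token_positions=[5], mm_token_lengths=[10]))
# (3, 4, 0): 3 image tokens were seen in earlier chunks, 4 fall in this chunk,
# and the remaining 3 will be handled by later chunks.
```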
tests/integration/test_lists/test-db/l0_h100.yml

Lines changed: 3 additions & 3 deletions
@@ -92,7 +92,7 @@ l0_h100:
   - test_e2e.py::test_trtllm_bench_request_rate_and_concurrency[enable_concurrency-]
   - test_e2e.py::test_trtllm_bench_request_rate_and_concurrency[enable_concurrency-enable_request_rate] # negative test
   - test_e2e.py::test_trtllm_bench_help_sanity[meta-llama/Llama-3.1-8B]
-  - test_e2e.py::test_ptp_quickstart_multimodal[gemma-3-27b-it-gemma/gemma-3-27b-it-image-True]
+  - test_e2e.py::test_ptp_quickstart_multimodal[gemma-3-27b-it-gemma/gemma-3-27b-it-False-image-True]
 - condition:
     ranges:
       system_gpu_count:
@@ -217,8 +217,8 @@ l0_h100:
   - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding_with_eagle3[llguidance]
   - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding_with_ngram[xgrammar]
   - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding_with_ngram[llguidance]
-  - test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-image-True]
-  - test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-mixture_text_image-True]
+  - test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-False-image-True]
+  - test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-False-mixture_text_image-True]
 - condition:
     ranges:
       system_gpu_count:

tests/integration/test_lists/test-db/l0_l40s.yml

Lines changed: 9 additions & 9 deletions
@@ -19,15 +19,15 @@ l0_l40s:
   - unittest/_torch/modeling -k "modeling_vila"
   - unittest/_torch/modeling -k "modeling_siglip"
   - test_e2e.py::test_ptp_scaffolding[DeepSeek-R1-Distill-Qwen-7B-DeepSeek-R1/DeepSeek-R1-Distill-Qwen-7B]
-  - test_e2e.py::test_ptp_quickstart_multimodal[NVILA-8B-FP16-vila/NVILA-8B-image-False]
-  - test_e2e.py::test_ptp_quickstart_multimodal[NVILA-8B-FP16-vila/NVILA-8B-video-False]
-  - test_e2e.py::test_ptp_quickstart_multimodal[llava-v1.6-mistral-7b-llava-v1.6-mistral-7b-hf-image-False]
-  - test_e2e.py::test_ptp_quickstart_multimodal[qwen2-vl-7b-instruct-Qwen2-VL-7B-Instruct-image-False]
-  - test_e2e.py::test_ptp_quickstart_multimodal[qwen2-vl-7b-instruct-Qwen2-VL-7B-Instruct-video-False]
-  - test_e2e.py::test_ptp_quickstart_multimodal[qwen2.5-vl-7b-instruct-Qwen2.5-VL-7B-Instruct-image-False]
-  - test_e2e.py::test_ptp_quickstart_multimodal[qwen2.5-vl-7b-instruct-Qwen2.5-VL-7B-Instruct-image-True]
-  - test_e2e.py::test_ptp_quickstart_multimodal[qwen2.5-vl-7b-instruct-Qwen2.5-VL-7B-Instruct-video-False]
-  - test_e2e.py::test_ptp_quickstart_multimodal[qwen2.5-vl-7b-instruct-Qwen2.5-VL-7B-Instruct-video-True]
+  - test_e2e.py::test_ptp_quickstart_multimodal[NVILA-8B-FP16-vila/NVILA-8B-False-image-False]
+  - test_e2e.py::test_ptp_quickstart_multimodal[NVILA-8B-FP16-vila/NVILA-8B-False-video-False]
+  - test_e2e.py::test_ptp_quickstart_multimodal[llava-v1.6-mistral-7b-llava-v1.6-mistral-7b-hf-False-image-False]
+  - test_e2e.py::test_ptp_quickstart_multimodal[qwen2-vl-7b-instruct-Qwen2-VL-7B-Instruct-True-image-True]
+  - test_e2e.py::test_ptp_quickstart_multimodal[qwen2-vl-7b-instruct-Qwen2-VL-7B-Instruct-False-video-False]
+  - test_e2e.py::test_ptp_quickstart_multimodal[qwen2.5-vl-7b-instruct-Qwen2.5-VL-7B-Instruct-True-image-False]
+  - test_e2e.py::test_ptp_quickstart_multimodal[qwen2.5-vl-7b-instruct-Qwen2.5-VL-7B-Instruct-False-image-True]
+  - test_e2e.py::test_ptp_quickstart_multimodal[qwen2.5-vl-7b-instruct-Qwen2.5-VL-7B-Instruct-False-video-False]
+  - test_e2e.py::test_ptp_quickstart_multimodal[qwen2.5-vl-7b-instruct-Qwen2.5-VL-7B-Instruct-False-video-True]
   - test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[audio]
   - test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[image]
   - test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[image_audio]
