Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
106 changes: 106 additions & 0 deletions tests/models/multimodal/processing/test_audio_in_video.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Regression tests for Qwen2.5-Omni and Qwen3-Omni audio-in-video processor
caching.

Tests the use_audio_in_video feature where audio is extracted from video and
processed together with video frames in an interleaved manner.

Regression test: when use_audio_in_video=True and the multimodal processor
cache is warm, the second request goes through MultiModalProcessorSenderCache
which sets mm_kwargs["video"] items to None on a cache hit. The processor
must still detect use_audio_in_video=True (via token-count heuristic) and
produce the same prompt_token_ids as the first (cache-miss) request.

Without the fix the cache-hit path left use_audio_in_video=False, causing
audio placeholder tokens to be inserted separately instead of being derived
from the interleaved video placeholders – yielding a different (wrong) token
sequence on every subsequent request for the same video.
"""

import numpy as np
import pytest

from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.cache import MultiModalProcessorSenderCache

from ....multimodal.utils import random_audio, random_video
from ...utils import build_model_context

# HF checkpoint ids of the Omni models exercised by this regression test;
# both support the use_audio_in_video interleaved audio/video feature.
MODELS = [
    "Qwen/Qwen2.5-Omni-3B",
    "Qwen/Qwen3-Omni-30B-A3B-Instruct",
]


@pytest.mark.parametrize("model_id", MODELS)
def test_audio_in_video_cache_correctness(model_id: str) -> None:
    """
    Regression test for https://github.com/vllm-project/vllm/pull/36800

    MultiModalProcessorSenderCache.get_and_update_item returns (None, updates)
    on a cache hit, so mm_kwargs["video"] items become None on the second call.
    The Qwen processor override of _maybe_apply_prompt_updates must detect
    use_audio_in_video=True via token-count heuristics and re-derive the audio
    placeholders correctly.
    """
    model_ctx = build_model_context(
        model_id,
        limit_mm_per_prompt={"audio": 1, "image": 0, "video": 1},
        mm_processor_cache_gb=1,
    )

    # The baseline processor carries no cache, so every call processes the
    # multimodal inputs from scratch.
    baseline_processor = MULTIMODAL_REGISTRY.create_processor(
        model_ctx.model_config, cache=None
    )
    # A sender cache returns (None, prompt_updates) per item on a cache hit,
    # turning mm_kwargs["video"] into [None] -- exactly the condition that
    # triggered the original bug.
    cached_processor = MULTIMODAL_REGISTRY.create_processor(
        model_ctx.model_config,
        cache=MultiModalProcessorSenderCache(model_ctx.model_config),
    )

    video_token_id = baseline_processor.info.get_hf_config().video_token_id

    # Keep the fixtures tiny (8 frames at 64x64 plus ~0.5 s of 16 kHz audio)
    # so the test stays fast even without a GPU.
    rand_state = np.random.RandomState(0)
    video = random_video(
        rand_state, min_frames=8, max_frames=9, min_wh=64, max_wh=65
    )
    audio, sample_rate = random_audio(
        rand_state, min_len=8000, max_len=8001, sr=16000
    )
    mm_data = {"video": [video], "audio": [(audio, sample_rate)]}
    processor_kwargs = {"use_audio_in_video": True}

    def run(processor):
        # One end-to-end processing pass; only the token ids matter here.
        outputs = processor(
            [video_token_id],
            mm_items=baseline_processor.info.parse_mm_data(mm_data),
            hf_processor_mm_kwargs=processor_kwargs,
        )
        return outputs["prompt_token_ids"]

    baseline_ids = run(baseline_processor)

    # Cache miss: mm_kwargs["video"] items are real tensors, so
    # use_audio_in_video is detected normally from the item data.
    first_ids = run(cached_processor)
    assert first_ids == baseline_ids, (
        "Cache-miss call produced different prompt_token_ids than baseline.\n"
        f" baseline : {baseline_ids}\n"
        f" cache-miss: {first_ids}"
    )

    # Cache hit: MultiModalProcessorSenderCache.get_and_update_item returns
    # (None, ...), so mm_kwargs["video"] = [None]. Before the fix,
    # use_audio_in_video was not detected, yielding wrong token ids.
    second_ids = run(cached_processor)
    assert second_ids == baseline_ids, (
        "Cache-hit call produced different prompt_token_ids than baseline.\n"
        "This is the regression introduced when use_audio_in_video detection\n"
        "fails for None mm_kwargs items on a cache hit.\n"
        f" baseline : {baseline_ids}\n"
        f" cache-hit: {second_ids}"
    )
23 changes: 11 additions & 12 deletions vllm/model_executor/models/qwen2_5_omni_thinker.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,8 +80,6 @@
)
from vllm.multimodal.processing import (
BaseDummyInputsBuilder,
ProcessorInputs,
TimingContext,
)
from vllm.multimodal.processing.processor import (
BaseMultiModalProcessor,
Expand Down Expand Up @@ -609,6 +607,17 @@ def _maybe_apply_prompt_updates(
if use_audio_in_video_tensor.numel() > 0:
use_audio_in_video = bool(use_audio_in_video_tensor.item())
break
# Handle the multimodal processor cache: on a cache hit the video items
# arrive as None, so use_audio_in_video must be re-detected heuristically.
if any(item is None for item in mm_kwargs["video"]):
video_token_id = self.info.get_hf_config().video_token_id
audio_token_id = self.info.get_hf_config().audio_token_id
video_audio_item_num = sum(
id in (video_token_id, audio_token_id) for id in prompt_ids
)
audio_updates_num = len(mm_prompt_updates.get("audio", []))
video_updates_num = len(mm_prompt_updates.get("video", []))
if video_audio_item_num != video_updates_num + audio_updates_num:
use_audio_in_video = True

if is_update_applied:
mm_placeholders = self._find_mm_placeholders(
Expand Down Expand Up @@ -815,16 +824,6 @@ def get_replacement_qwen2_use_audio_in_video(item_idx: int):
),
]

def _cached_apply_hf_processor(
self,
inputs: ProcessorInputs,
timing_ctx: TimingContext,
):
mm_processor_kwargs = inputs.hf_processor_mm_kwargs
if mm_processor_kwargs.get("use_audio_in_video", False):
return self._apply_hf_processor(inputs, timing_ctx)
return super()._cached_apply_hf_processor(inputs, timing_ctx)

def _apply_hf_processor_main(
self,
prompt: str | list[int],
Expand Down
11 changes: 11 additions & 0 deletions vllm/model_executor/models/qwen3_omni_moe_thinker.py
Original file line number Diff line number Diff line change
Expand Up @@ -1326,6 +1326,17 @@ def _maybe_apply_prompt_updates(
use_audio_in_video = True
else:
use_audio_in_video = False
# Handle the multimodal processor cache: on a cache hit the video items
# arrive as None, so use_audio_in_video must be re-detected heuristically.
if any(item is None for item in mm_kwargs["video"]):
video_token_id = self.info.get_hf_config().video_token_id
audio_token_id = self.info.get_hf_config().audio_token_id
video_audio_item_num = sum(
id in (video_token_id, audio_token_id) for id in prompt_ids
)
audio_updates_num = len(mm_prompt_updates.get("audio", []))
video_updates_num = len(mm_prompt_updates.get("video", []))
if video_audio_item_num != video_updates_num + audio_updates_num:
use_audio_in_video = True

# normal case with `use_audio_in_video=False`
if is_update_applied:
Expand Down
Loading