Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
106 changes: 106 additions & 0 deletions tests/models/multimodal/processing/test_audio_in_video.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Regression tests for Qwen2.5-Omni and Qwen3-Omni audio-in-video processor
caching.

Tests the use_audio_in_video feature where audio is extracted from video and
processed together with video frames in an interleaved manner.

Regression test: when use_audio_in_video=True and the multimodal processor
cache is warm, the second request goes through MultiModalProcessorSenderCache
which sets mm_kwargs["video"] items to None on a cache hit. The processor
must still detect use_audio_in_video=True (via token-count heuristic) and
produce the same prompt_token_ids as the first (cache-miss) request.

Without the fix the cache-hit path left use_audio_in_video=False, causing
audio placeholder tokens to be inserted separately instead of being derived
from the interleaved video placeholders – yielding a different (wrong) token
sequence on every subsequent request for the same video.
"""

import numpy as np
import pytest

from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.cache import MultiModalProcessorSenderCache

from ....multimodal.utils import random_audio, random_video
from ...utils import build_model_context

# HF checkpoint ids of the Omni models exercised by this regression test;
# both support the use_audio_in_video interleaved audio/video feature.
MODELS = [
    "Qwen/Qwen2.5-Omni-3B",
    "Qwen/Qwen3-Omni-30B-A3B-Instruct",
]


@pytest.mark.parametrize("model_id", MODELS)
def test_audio_in_video_cache_correctness(model_id: str) -> None:
    """
    Regression test for https://github.com/vllm-project/vllm/pull/36800

    MultiModalProcessorSenderCache.get_and_update_item returns (None, updates)
    on a cache hit, so mm_kwargs["video"] items become None on the second call.
    The Qwen processor override of _maybe_apply_prompt_updates must detect
    use_audio_in_video=True via token-count heuristics and re-derive the audio
    placeholders correctly.
    """
    model_ctx = build_model_context(
        model_id,
        limit_mm_per_prompt={"audio": 1, "image": 0, "video": 1},
        mm_processor_cache_gb=1,
    )

    # The baseline processor carries no cache, so every call processes the
    # multimodal inputs from scratch.
    baseline_processor = MULTIMODAL_REGISTRY.create_processor(
        model_ctx.model_config, cache=None
    )
    # A sender cache returns (None, prompt_updates) per item on a cache hit,
    # turning mm_kwargs["video"] into [None] -- exactly the condition that
    # triggered the original bug.
    cached_processor = MULTIMODAL_REGISTRY.create_processor(
        model_ctx.model_config,
        cache=MultiModalProcessorSenderCache(model_ctx.model_config),
    )

    video_token_id = baseline_processor.info.get_hf_config().video_token_id

    # Keep the fixtures tiny (8 frames at 64x64 plus ~0.5 s of 16 kHz audio)
    # so the test stays fast even without a GPU.
    rand_state = np.random.RandomState(0)
    video = random_video(
        rand_state, min_frames=8, max_frames=9, min_wh=64, max_wh=65
    )
    audio, sample_rate = random_audio(
        rand_state, min_len=8000, max_len=8001, sr=16000
    )
    mm_data = {"video": [video], "audio": [(audio, sample_rate)]}
    processor_kwargs = {"use_audio_in_video": True}

    def run(processor):
        # One end-to-end processing pass; only the token ids matter here.
        outputs = processor(
            [video_token_id],
            mm_items=baseline_processor.info.parse_mm_data(mm_data),
            hf_processor_mm_kwargs=processor_kwargs,
        )
        return outputs["prompt_token_ids"]

    baseline_ids = run(baseline_processor)

    # Cache miss: mm_kwargs["video"] items are real tensors, so
    # use_audio_in_video is detected normally from the item data.
    first_ids = run(cached_processor)
    assert first_ids == baseline_ids, (
        "Cache-miss call produced different prompt_token_ids than baseline.\n"
        f" baseline : {baseline_ids}\n"
        f" cache-miss: {first_ids}"
    )

    # Cache hit: MultiModalProcessorSenderCache.get_and_update_item returns
    # (None, ...), so mm_kwargs["video"] = [None]. Before the fix,
    # use_audio_in_video was not detected, yielding wrong token ids.
    second_ids = run(cached_processor)
    assert second_ids == baseline_ids, (
        "Cache-hit call produced different prompt_token_ids than baseline.\n"
        "This is the regression introduced when use_audio_in_video detection\n"
        "fails for None mm_kwargs items on a cache hit.\n"
        f" baseline : {baseline_ids}\n"
        f" cache-hit: {second_ids}"
    )
23 changes: 11 additions & 12 deletions vllm/model_executor/models/qwen2_5_omni_thinker.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,8 +80,6 @@
)
from vllm.multimodal.processing import (
BaseDummyInputsBuilder,
ProcessorInputs,
TimingContext,
)
from vllm.multimodal.processing.processor import (
BaseMultiModalProcessor,
Expand Down Expand Up @@ -609,6 +607,17 @@ def _maybe_apply_prompt_updates(
if use_audio_in_video_tensor.numel() > 0:
use_audio_in_video = bool(use_audio_in_video_tensor.item())
break
# Handle the multimodal processor cache: on a cache hit the video items
# arrive as None, so use_audio_in_video must be re-detected heuristically.
if any(item is None for item in mm_kwargs["video"]):
video_token_id = self.info.get_hf_config().video_token_id
audio_token_id = self.info.get_hf_config().audio_token_id
video_audio_item_num = sum(
id in (video_token_id, audio_token_id) for id in prompt_ids
)
audio_updates_num = len(mm_prompt_updates.get("audio", []))
video_updates_num = len(mm_prompt_updates.get("video", []))
if video_audio_item_num != video_updates_num + audio_updates_num:
use_audio_in_video = True

if is_update_applied:
mm_placeholders = self._find_mm_placeholders(
Expand Down Expand Up @@ -815,16 +824,6 @@ def get_replacement_qwen2_use_audio_in_video(item_idx: int):
),
]

def _cached_apply_hf_processor(
self,
inputs: ProcessorInputs,
timing_ctx: TimingContext,
):
mm_processor_kwargs = inputs.hf_processor_mm_kwargs
if mm_processor_kwargs.get("use_audio_in_video", False):
return self._apply_hf_processor(inputs, timing_ctx)
return super()._cached_apply_hf_processor(inputs, timing_ctx)

def _apply_hf_processor_main(
self,
prompt: str | list[int],
Expand Down
11 changes: 11 additions & 0 deletions vllm/model_executor/models/qwen3_omni_moe_thinker.py
Original file line number Diff line number Diff line change
Expand Up @@ -1326,6 +1326,17 @@ def _maybe_apply_prompt_updates(
use_audio_in_video = True
else:
use_audio_in_video = False
# Handle the multimodal processor cache: on a cache hit the video items
# arrive as None, so use_audio_in_video must be re-detected heuristically.
if any(item is None for item in mm_kwargs["video"]):
video_token_id = self.info.get_hf_config().video_token_id
audio_token_id = self.info.get_hf_config().audio_token_id
video_audio_item_num = sum(
id in (video_token_id, audio_token_id) for id in prompt_ids
)
audio_updates_num = len(mm_prompt_updates.get("audio", []))
video_updates_num = len(mm_prompt_updates.get("video", []))
if video_audio_item_num != video_updates_num + audio_updates_num:
use_audio_in_video = True

# normal case with `use_audio_in_video=False`
if is_update_applied:
Expand Down
Loading