Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
170 changes: 170 additions & 0 deletions examples/offline_inference/qwen3_omni/only_thinker.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,170 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This example shows how to use vLLM for running offline inference
with the correct prompt format on Qwen3-Omni (thinker only).
"""

from typing import NamedTuple

from vllm import LLM, SamplingParams
from vllm.assets.audio import AudioAsset
from vllm.assets.image import ImageAsset
from vllm.assets.video import VideoAsset
from vllm.multimodal.image import convert_image_mode
from vllm.utils.argparse_utils import FlexibleArgumentParser


class QueryResult(NamedTuple):
    """Everything one example query needs to be run.

    ``main`` passes ``inputs`` to ``LLM.generate`` and
    ``limit_mm_per_prompt`` to the ``LLM`` constructor.
    """

    # Request payload; the builders in this file populate "prompt",
    # "multi_modal_data" and, for one query, "mm_processor_kwargs".
    inputs: dict
    # Number of items per modality name ("audio", "image", "video")
    # that the query places in a single prompt.
    limit_mm_per_prompt: dict[str, int]


# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
# lower-end GPUs.
# Unless specified, these settings have been tested to work on a single L4.

# System message shared by every example prompt in this file.
default_system = (
    "You are Qwen, a virtual human developed by the Qwen Team, "
    "Alibaba Group, capable of perceiving auditory and visual inputs, "
    "as well as generating text and speech."
)


def get_mixed_modalities_query() -> QueryResult:
    """Build a query that mixes one audio clip, one image and one video.

    Returns:
        A ``QueryResult`` whose prompt carries one placeholder per modality
        (audio, then image, then video) followed by the question, together
        with a limit of one item for each of the three modalities.
    """
    # One placeholder group per attached item, in the order they appear
    # in the user turn.
    audio_slot = "<|audio_start|><|audio_pad|><|audio_end|>"
    image_slot = "<|vision_start|><|image_pad|><|vision_end|>"
    video_slot = "<|vision_start|><|video_pad|><|vision_end|>"

    question = (
        "What is recited in the audio? "
        "What is the content of this image? Why is this video funny?"
    )
    prompt = "".join(
        [
            f"<|im_start|>system\n{default_system}<|im_end|>\n",
            "<|im_start|>user\n",
            audio_slot,
            image_slot,
            video_slot,
            f"{question}<|im_end|>\n",
            "<|im_start|>assistant\n",
        ]
    )

    # Load the assets in the same order as the placeholders above.
    audio = AudioAsset("mary_had_lamb").audio_and_sample_rate
    image = convert_image_mode(ImageAsset("cherry_blossom").pil_image, "RGB")
    video = VideoAsset(name="baby_reading", num_frames=16).np_ndarrays

    inputs = {
        "prompt": prompt,
        "multi_modal_data": {"audio": audio, "image": image, "video": video},
    }
    return QueryResult(
        inputs=inputs,
        limit_mm_per_prompt={"audio": 1, "image": 1, "video": 1},
    )


def get_use_audio_in_video_query() -> QueryResult:
    """Build a query that sends a video together with its own audio track.

    The prompt contains only a video placeholder; the audio extracted from
    the same asset is attached alongside the frames and the processor is
    asked for ``use_audio_in_video=True``.

    Returns:
        A ``QueryResult`` limited to one audio item and one video item.
    """
    question = (
        "Describe the content of the video in details, then convert what the "
        "baby say into text."
    )
    system_turn = f"<|im_start|>system\n{default_system}<|im_end|>\n"
    user_turn = (
        "<|im_start|>user\n<|vision_start|><|video_pad|><|vision_end|>"
        f"{question}<|im_end|>\n"
    )
    prompt = system_turn + user_turn + "<|im_start|>assistant\n"

    # Both modalities come from the same clip: audio is extracted at 16 kHz
    # first, then the sampled frames are read.
    video_asset = VideoAsset(name="baby_reading", num_frames=16)
    audio_track = video_asset.get_audio(sampling_rate=16000)
    mm_data = {
        "video": video_asset.np_ndarrays,
        "audio": audio_track,
    }

    return QueryResult(
        inputs={
            "prompt": prompt,
            "multi_modal_data": mm_data,
            "mm_processor_kwargs": {"use_audio_in_video": True},
        },
        limit_mm_per_prompt={"audio": 1, "video": 1},
    )


def get_multi_audios_query() -> QueryResult:
    """Build a query that asks the model to compare two audio clips.

    Returns:
        A ``QueryResult`` whose prompt holds two audio placeholders and
        whose audio limit is raised to two items.
    """
    audio_slot = "<|audio_start|><|audio_pad|><|audio_end|>"
    question = "Are these two audio clips the same?"

    prompt = "".join(
        [
            f"<|im_start|>system\n{default_system}<|im_end|>\n",
            "<|im_start|>user\n",
            # One placeholder per clip, in the order the clips are attached.
            audio_slot,
            audio_slot,
            f"{question}<|im_end|>\n",
            "<|im_start|>assistant\n",
        ]
    )

    clips = [
        AudioAsset(asset_name).audio_and_sample_rate
        for asset_name in ("winning_call", "mary_had_lamb")
    ]
    return QueryResult(
        inputs={"prompt": prompt, "multi_modal_data": {"audio": clips}},
        limit_mm_per_prompt={"audio": 2},
    )


# Maps each `--query-type` choice to the zero-argument builder that
# produces its `QueryResult`; `parse_args` uses the keys as the allowed
# choices and `main` calls the selected builder.
query_map = {
    "mixed_modalities": get_mixed_modalities_query,
    "use_audio_in_video": get_use_audio_in_video_query,
    "multi_audios": get_multi_audios_query,
}


def main(args):
    """Run the selected example query through the model and print the text.

    Args:
        args: Parsed command-line namespace. ``args.query_type`` picks the
            builder from ``query_map`` and ``args.seed`` is forwarded to
            the ``LLM`` constructor.
    """
    query = query_map[args.query_type]()

    engine_kwargs = {
        "model": "Qwen/Qwen3-Omni-30B-A3B-Instruct",
        "max_model_len": 12800,
        "max_num_seqs": 5,
        # Each query declares how many items per modality it sends.
        "limit_mm_per_prompt": query.limit_mm_per_prompt,
        "seed": args.seed,
    }
    llm = LLM(**engine_kwargs)

    # We set temperature to 0.2 so that outputs can be different
    # even when all prompts are identical when running batch inference.
    sampling_params = SamplingParams(temperature=0.2, max_tokens=256)

    request_outputs = llm.generate(query.inputs, sampling_params=sampling_params)
    for request_output in request_outputs:
        # Only the first candidate completion of each request is shown.
        print(request_output.outputs[0].text)


def parse_args():
    """Define this example's command-line options and parse them.

    Options:
        --query-type / -q: one of the keys of ``query_map``
            (default ``"mixed_modalities"``).
        --seed: optional integer seed (default ``None``).

    Returns:
        The namespace produced by the parser's ``parse_args()``.
    """
    description = (
        "Demo on using vLLM for offline inference with audio language models"
    )
    parser = FlexibleArgumentParser(description=description)

    query_type_options = {
        "type": str,
        "default": "mixed_modalities",
        "choices": query_map.keys(),
        "help": "Query type.",
    }
    parser.add_argument("--query-type", "-q", **query_type_options)

    seed_options = {
        "type": int,
        "default": None,
        "help": "Set the seed when initializing `vllm.LLM`.",
    }
    parser.add_argument("--seed", **seed_options)

    return parser.parse_args()


if __name__ == "__main__":
    # Script entry point: parse the CLI options and run the example.
    main(parse_args())
221 changes: 221 additions & 0 deletions tests/model_executor/test_qwen3_omni.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,221 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from unittest.mock import Mock

import pytest
from transformers import PretrainedConfig

from vllm.multimodal.processing import InputProcessingContext


# Helper function to print input IDs with coalesced audio/video tokens.
def print_input_ids(input_ids):
    """
    Print token IDs on one line, collapsing runs of pad tokens.

    A run of consecutive identical pad tokens is printed as
    ``<name> * <count>``; every other ID is printed as-is, one entry per
    token. An empty input prints ``[]``.

    Pad tokens recognised:
    - 151675: <|audio_pad|>
    - 151656: <|video_pad|>
    """
    if not input_ids:
        print("[]")
        return

    audio_pad_id = 151675
    video_pad_id = 151656

    pieces = []
    total = len(input_ids)
    pos = 0
    while pos < total:
        token = input_ids[pos]

        # Ordinary token: emit it verbatim and move on.
        if token not in (audio_pad_id, video_pad_id):
            pieces.append(str(token))
            pos += 1
            continue

        # Pad token: scan forward to the end of the run of identical IDs.
        run_end = pos + 1
        while run_end < total and input_ids[run_end] == token:
            run_end += 1

        if token == audio_pad_id:
            label = "<|audio_pad|>"
        else:
            label = "<|video_pad|>"
        pieces.append(f"{label} * {run_end - pos}")
        pos = run_end

    print(", ".join(pieces))


@pytest.fixture
def mock_qwen3_omni_config():
    """Provide a mock config carrying the special-token IDs the test needs."""
    # Token IDs from https://huggingface.co/Qwen/Qwen3-Omni-30B-A3B-Instruct/blob/main/tokenizer_config.json
    special_token_ids = {
        "audio_token_id": 151675,  # <|audio_pad|>
        "video_token_id": 151656,  # <|video_pad|>
        "image_token_id": 151655,  # <|image_pad|>
        "audio_start_token_id": 151669,  # <|audio_start|>
        "audio_end_token_id": 151670,  # <|audio_end|>
        "vision_start_token_id": 151652,  # <|vision_start|>
    }

    thinker_config = Mock(spec=PretrainedConfig)
    for attr_name, token_id in special_token_ids.items():
        setattr(thinker_config, attr_name, token_id)
    thinker_config.position_id_per_seconds = 12.5

    # Vision sub-config: only the spatial merge size is read by the test.
    thinker_config.vision_config = Mock(spatial_merge_size=2)

    return thinker_config


@pytest.fixture
def mock_processor():
    """Provide a mock HF processor exposing pad-token strings.

    The ``feature_extractor`` attribute is a real, default-constructed
    ``WhisperFeatureExtractor`` rather than a mock.
    """
    from transformers.models.whisper import WhisperFeatureExtractor

    return Mock(
        audio_token="<|audio_pad|>",
        image_token="<|image_pad|>",
        video_token="<|video_pad|>",
        feature_extractor=WhisperFeatureExtractor(),
    )


@pytest.fixture
def mock_tokenizer():
    """Provide a mock tokenizer that knows only the multimodal special tokens.

    ``get_vocab()`` returns the special-token table; ``encode(text)`` returns
    a one-element list with the token's ID, or ``[0]`` for any other text.
    """
    # Token IDs from https://huggingface.co/Qwen/Qwen3-Omni-30B-A3B-Instruct/blob/main/tokenizer_config.json
    special_tokens = {
        "<|audio_pad|>": 151675,
        "<|video_pad|>": 151656,
        "<|image_pad|>": 151655,
        "<|audio_start|>": 151669,
        "<|audio_end|>": 151670,
        "<|vision_start|>": 151652,
        "<|vision_end|>": 151653,
    }

    def encode_special(text):
        # A fresh list per call; unknown text falls back to token 0.
        if text in special_tokens:
            return [special_tokens[text]]
        return [0]

    tokenizer = Mock()
    # Hand out a copy so the lookup table used by `encode` stays private.
    tokenizer.get_vocab = Mock(return_value=dict(special_tokens))
    tokenizer.encode = Mock(side_effect=encode_special)

    tokenizer.vision_bos_token = "<|vision_start|>"
    tokenizer.vision_eos_token = "<|vision_end|>"
    tokenizer.audio_bos_token = "<|audio_start|>"
    tokenizer.audio_eos_token = "<|audio_end|>"
    return tokenizer


@pytest.fixture
def mock_image_processor():
    """Provide a mock image processor whose ``merge_size`` is 2."""
    return Mock(merge_size=2)


def test_qwen3_omni_get_updates_use_audio_in_video(
    mock_qwen3_omni_config,
    mock_processor,
    mock_tokenizer,
    mock_image_processor,
):
    """Call ``get_updates_use_audio_in_video`` directly and check its output.

    The returned token list must start with the audio-start token, end with
    the audio-end token, and contain exactly ``audio_len`` audio pads plus
    one video pad per merged video grid cell.
    """

    from vllm.model_executor.models.qwen3_omni_moe_thinker import (
        Qwen3OmniMoeThinkerMultiModalProcessor,
        Qwen3OmniMoeThinkerProcessingInfo,
    )

    thinker_config = mock_qwen3_omni_config

    # Processing info built on a mocked context, with every accessor
    # replaced by a stub that returns the matching fixture.
    info = Qwen3OmniMoeThinkerProcessingInfo(Mock(spec=InputProcessingContext))
    info.get_hf_config = Mock(return_value=thinker_config)
    info.get_hf_processor = Mock(return_value=mock_processor)
    info.get_tokenizer = Mock(return_value=mock_tokenizer)
    info.get_image_processor = Mock(return_value=mock_image_processor)

    # The dummy-inputs builder is mocked as well.
    processor = Qwen3OmniMoeThinkerMultiModalProcessor(info, Mock())

    # Test parameters from reference video
    # https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-Omni/demo/draw.mp4
    audio_len = 85
    video_grid_thw = [6, 36, 64]
    video_second_per_grid_t = 2.0

    updates = processor.get_updates_use_audio_in_video(
        thinker_config=thinker_config,
        audio_len=audio_len,
        video_grid_thw=video_grid_thw,
        video_second_per_grid_t=video_second_per_grid_t,
    )

    # Updated input ids should align with HF implementation.
    # 151669,
    # <|video_pad|> * 576, <|audio_pad|> * 25,
    # <|video_pad|> * 576, <|audio_pad|> * 25,
    # <|video_pad|> * 576, <|audio_pad|> * 25,
    # <|video_pad|> * 576, <|audio_pad|> * 10,
    # <|video_pad|> * 1152,
    # 151670
    print_input_ids(updates)

    # Basic shape of the result.
    assert isinstance(updates, list)
    assert len(updates) > 0

    # The sequence is wrapped in the audio start/end markers.
    assert updates[0] == thinker_config.audio_start_token_id
    assert updates[-1] == thinker_config.audio_end_token_id

    # Tally the pad tokens of each modality.
    audio_count = updates.count(thinker_config.audio_token_id)
    video_count = updates.count(thinker_config.video_token_id)

    assert audio_count == audio_len, (
        f"Expected {audio_len} audio tokens, got {audio_count}"
    )

    # One video token per (t, h // merge, w // merge) grid cell.
    merge = thinker_config.vision_config.spatial_merge_size
    grid_t, grid_h, grid_w = video_grid_thw
    expected_video_count = grid_t * (grid_h // merge) * (grid_w // merge)

    assert video_count == expected_video_count, (
        f"Expected {expected_video_count} video tokens, got {video_count}"
    )

    # Total tokens should be: 1 (start) + audio_len + video_count + 1 (end)
    expected_total = 1 + audio_len + expected_video_count + 1
    assert len(updates) == expected_total, (
        f"Expected {expected_total} total tokens, got {len(updates)}"
    )


if __name__ == "__main__":
    # Propagate pytest's exit code so that running this file directly
    # finishes with a non-zero status when a test fails, instead of
    # always exiting 0.
    raise SystemExit(pytest.main([__file__, "-v"]))
Loading