From dec7eaf1999d360376f7c4130765cc5487394fb0 Mon Sep 17 00:00:00 2001
From: DarkLight1337
Date: Wed, 17 Dec 2025 04:22:54 +0000
Subject: [PATCH 1/2] [Doc] Show that `use_audio_in_video` is supported in docs

Signed-off-by: DarkLight1337
---
 docs/models/supported_models.md                   | 3 ---
 examples/offline_inference/qwen2_5_omni/README.md | 1 -
 2 files changed, 4 deletions(-)

diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md
index 9ea0588bd417..39e965c1d099 100644
--- a/docs/models/supported_models.md
+++ b/docs/models/supported_models.md
@@ -767,9 +767,6 @@ Some models are supported only via the [Transformers modeling backend](#transfor
     The official `openbmb/MiniCPM-V-2` doesn't work yet, so we need to use a fork (`HwwwH/MiniCPM-V-2`) for now. For more details, please see:
 
-!!! note
-    For Qwen2.5-Omni and Qwen3-Omni, reading audio from video pre-processing (`--mm-processor-kwargs '{"use_audio_in_video": true}'`) is currently work in progress and not yet supported.
-
 #### Transcription
 
 Speech2Text models trained specifically for Automatic Speech Recognition.

diff --git a/examples/offline_inference/qwen2_5_omni/README.md b/examples/offline_inference/qwen2_5_omni/README.md
index d8fb50d7fe55..409ac0223b55 100644
--- a/examples/offline_inference/qwen2_5_omni/README.md
+++ b/examples/offline_inference/qwen2_5_omni/README.md
@@ -10,7 +10,6 @@ python examples/offline_inference/qwen2_5_omni/only_thinker.py \
     -q mixed_modalities
 
 # Read vision and audio inputs from a single video file
-# NOTE: V1 engine does not support interleaved modalities yet.
 python examples/offline_inference/qwen2_5_omni/only_thinker.py \
     -q use_audio_in_video

From 44316ba7d07747bde30dc817484e169a3f720e1e Mon Sep 17 00:00:00 2001
From: DarkLight1337
Date: Wed, 17 Dec 2025 04:29:29 +0000
Subject: [PATCH 2/2] Remove comments

Signed-off-by: DarkLight1337
---
 vllm/model_executor/models/qwen2_5_omni_thinker.py   | 2 --
 vllm/model_executor/models/qwen3_omni_moe_thinker.py | 2 --
 2 files changed, 4 deletions(-)

diff --git a/vllm/model_executor/models/qwen2_5_omni_thinker.py b/vllm/model_executor/models/qwen2_5_omni_thinker.py
index 94deeb867c9f..bc9c8b73cff3 100644
--- a/vllm/model_executor/models/qwen2_5_omni_thinker.py
+++ b/vllm/model_executor/models/qwen2_5_omni_thinker.py
@@ -1128,8 +1128,6 @@ def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings:
             multimodal_embeddings += tuple(audio_embeddings)
         return multimodal_embeddings
 
-    # TODO (ywang96): support overlapping modality embeddings so that
-    # `use_audio_in_video` will work on V1.
     def embed_input_ids(
         self,
         input_ids: torch.Tensor,

diff --git a/vllm/model_executor/models/qwen3_omni_moe_thinker.py b/vllm/model_executor/models/qwen3_omni_moe_thinker.py
index 3f29a14a2d1c..b8b5e893d154 100755
--- a/vllm/model_executor/models/qwen3_omni_moe_thinker.py
+++ b/vllm/model_executor/models/qwen3_omni_moe_thinker.py
@@ -1371,8 +1371,6 @@ def embed_input_ids(
             return inputs_embeds
 
         deepstack_input_embeds = None
-        # TODO (ywang96): support overlapping modalitiy embeddings so that
-        # `use_audio_in_video` will work on V1.
         # split the feat dim to obtain multi-scale visual feature
         has_vision_embeddings = [
             embeddings.shape[-1] != self.config.text_config.hidden_size
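
As a quick sketch of the usage these patches document (assuming only the example script path and the `--mm-processor-kwargs` flag quoted in the diffs above; the model name is illustrative), `use_audio_in_video` can be exercised either through the offline example or as a processor kwarg when serving:

```bash
# Offline example from examples/offline_inference/qwen2_5_omni/README.md:
# read both the vision and audio inputs from a single video file.
python examples/offline_inference/qwen2_5_omni/only_thinker.py \
    -q use_audio_in_video

# Online serving: enable the same behaviour via the processor kwarg
# referenced in the docs note above (model name is illustrative).
vllm serve Qwen/Qwen2.5-Omni-7B \
    --mm-processor-kwargs '{"use_audio_in_video": true}'
```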