Merged
3 changes: 0 additions & 3 deletions docs/models/supported_models.md
@@ -767,9 +767,6 @@ Some models are supported only via the [Transformers modeling backend](#transfor
The official `openbmb/MiniCPM-V-2` doesn't work yet, so we need to use a fork (`HwwwH/MiniCPM-V-2`) for now.
For more details, please see: <https://github.com/vllm-project/vllm/pull/4087#issuecomment-2250397630>

-!!! note
-    For Qwen2.5-Omni and Qwen3-Omni, reading audio from video pre-processing (`--mm-processor-kwargs '{"use_audio_in_video": true}'`) is currently work in progress and not yet supported.
-
#### Transcription

Speech2Text models trained specifically for Automatic Speech Recognition.
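The note deleted from this file pointed at the `--mm-processor-kwargs` flag. As a hedged sketch of how that flag string is assembled (the `mm_processor_flag` helper is hypothetical; only the flag name and its JSON payload come from the deleted note):

```python
import json

# Hypothetical helper (not part of vLLM): builds the CLI fragment from the
# deleted note that enables reading audio from video pre-processing.
def mm_processor_flag(use_audio_in_video: bool) -> str:
    kwargs = {"use_audio_in_video": use_audio_in_video}
    return f"--mm-processor-kwargs '{json.dumps(kwargs)}'"

print(mm_processor_flag(True))
# → --mm-processor-kwargs '{"use_audio_in_video": true}'
```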
1 change: 0 additions & 1 deletion examples/offline_inference/qwen2_5_omni/README.md
@@ -10,7 +10,6 @@ python examples/offline_inference/qwen2_5_omni/only_thinker.py \
-q mixed_modalities

# Read vision and audio inputs from a single video file
-# NOTE: V1 engine does not support interleaved modalities yet.
python examples/offline_inference/qwen2_5_omni/only_thinker.py \
-q use_audio_in_video

2 changes: 0 additions & 2 deletions vllm/model_executor/models/qwen2_5_omni_thinker.py
@@ -1128,8 +1128,6 @@ def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings:
multimodal_embeddings += tuple(audio_embeddings)
return multimodal_embeddings

-# TODO (ywang96): support overlapping modality embeddings so that
-# `use_audio_in_video` will work on V1.
def embed_input_ids(
self,
input_ids: torch.Tensor,
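The TODO removed from `embed_input_ids` concerned merging overlapping modality embeddings into the token sequence, which is what `use_audio_in_video` needs on V1. A minimal sketch of the placeholder-scatter step such a method has to perform; the single-placeholder-id simplification and all names and shapes here are illustrative, not vLLM's actual implementation:

```python
import torch

# Illustrative sketch (not vLLM code): rows of `inputs_embeds` whose token
# id is a multimodal placeholder are overwritten, in order of appearance,
# with the corresponding precomputed multimodal embeddings.
def merge_multimodal(
    inputs_embeds: torch.Tensor,  # [seq_len, hidden]
    input_ids: torch.Tensor,      # [seq_len]
    mm_embeds: torch.Tensor,      # [num_placeholders, hidden]
    placeholder_id: int,
) -> torch.Tensor:
    mask = input_ids == placeholder_id
    out = inputs_embeds.clone()
    out[mask] = mm_embeds.to(out.dtype)
    return out
```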
2 changes: 0 additions & 2 deletions vllm/model_executor/models/qwen3_omni_moe_thinker.py
@@ -1371,8 +1371,6 @@ def embed_input_ids(
return inputs_embeds

deepstack_input_embeds = None
-# TODO (ywang96): support overlapping modalitiy embeddings so that
-# `use_audio_in_video` will work on V1.
# split the feat dim to obtain multi-scale visual feature
has_vision_embeddings = [
embeddings.shape[-1] != self.config.text_config.hidden_size
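The comment kept in this hunk, `# split the feat dim to obtain multi-scale visual feature`, refers to deepstack-style vision features arriving with several scales concatenated along the hidden dimension. A hedged sketch of that split, with illustrative names and sizes (not vLLM's exact code):

```python
import torch

# Illustrative sketch: a feature tensor of width k * hidden_size holds the
# main visual feature plus (k - 1) multi-scale "deepstack" features, all
# concatenated on the last dim; split it back into per-scale chunks.
def split_deepstack(features: torch.Tensor, hidden_size: int):
    assert features.shape[-1] % hidden_size == 0
    chunks = torch.split(features, hidden_size, dim=-1)
    main, multiscale = chunks[0], chunks[1:]
    return main, multiscale
```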