From 4325675d557c6866105ac205e102aa4e4c7fed2f Mon Sep 17 00:00:00 2001 From: yaoyu-33 Date: Mon, 23 Mar 2026 14:50:58 -0700 Subject: [PATCH 1/4] [examples] docs: add qwen2.5-omni conversion and inference examples Signed-off-by: yaoyu-33 --- examples/models/vlm/qwen25_omni/README.md | 79 +++++++++++++++++++ examples/models/vlm/qwen25_omni/conversion.sh | 40 ++++++++++ examples/models/vlm/qwen25_omni/inference.sh | 54 +++++++++++++ 3 files changed, 173 insertions(+) create mode 100644 examples/models/vlm/qwen25_omni/README.md create mode 100644 examples/models/vlm/qwen25_omni/conversion.sh create mode 100644 examples/models/vlm/qwen25_omni/inference.sh diff --git a/examples/models/vlm/qwen25_omni/README.md b/examples/models/vlm/qwen25_omni/README.md new file mode 100644 index 0000000000..14d0c405d7 --- /dev/null +++ b/examples/models/vlm/qwen25_omni/README.md @@ -0,0 +1,79 @@ +# Qwen2.5-Omni Examples + +This directory contains example scripts for Qwen2.5-Omni multimodal models. + +Qwen2.5-Omni supports simultaneous processing of images, video, audio, and text using a dense Qwen2 language backbone with multimodal RoPE (mrope). + +| Model | HF ID | Architecture | Params | +|---|---|---|---| +| Qwen2.5-Omni-7B | `Qwen/Qwen2.5-Omni-7B` | Dense (Qwen2) + Vision + Audio | 7B | + +## Prerequisites + +```bash +pip install qwen-omni-utils[decord] +``` + +## Workspace Configuration + +All scripts use a `WORKSPACE` environment variable for the base directory. Default: `/workspace`. + +```bash +export WORKSPACE=/your/custom/path +``` + +## Checkpoint Conversion + +See [conversion.sh](conversion.sh) for checkpoint conversion examples. 
+ +### Import HF → Megatron + +```bash +python examples/conversion/convert_checkpoints.py import \ + --hf-model Qwen/Qwen2.5-Omni-7B \ + --megatron-path ${WORKSPACE}/models/Qwen2.5-Omni-7B +``` + +### Export Megatron → HF + +```bash +python examples/conversion/convert_checkpoints.py export \ + --hf-model Qwen/Qwen2.5-Omni-7B \ + --megatron-path ${WORKSPACE}/models/Qwen2.5-Omni-7B/iter_0000000 \ + --hf-path ${WORKSPACE}/models/Qwen2.5-Omni-7B-hf-export +``` + +### Round-trip Validation + +```bash +python -m torch.distributed.run --nproc_per_node=2 \ + examples/conversion/hf_megatron_roundtrip_multi_gpu.py \ + --hf-model-id Qwen/Qwen2.5-Omni-7B \ + --megatron-load-path ${WORKSPACE}/models/Qwen2.5-Omni-7B/iter_0000000 \ + --tp 2 --pp 1 +``` + +## Inference + +See [inference.sh](inference.sh) for multimodal generation with: +- Hugging Face checkpoint +- Imported Megatron checkpoint (after [conversion.sh](conversion.sh) import) +- Exported HF checkpoint + +The default parallelism for 7B is `--tp 2` (2 GPUs). For larger variants scale TP accordingly. + +### Example: Video + Audio + +```bash +uv run --no-sync python -m torch.distributed.run --nproc_per_node=2 \ + examples/conversion/hf_to_megatron_generate_omni_lm.py \ + --hf_model_path Qwen/Qwen2.5-Omni-7B \ + --video_url "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-Omni/cookbook/audio_visual.mp4" \ + --prompt "What was the first sentence the boy said when he met the girl?" \ + --use_audio_in_video \ + --max_new_tokens 64 \ + --tp 2 +``` + +Pass `--use_audio_in_video` to include the audio track from the video in the model input. +Use `--video_path` instead of `--video_url` for a local file. diff --git a/examples/models/vlm/qwen25_omni/conversion.sh b/examples/models/vlm/qwen25_omni/conversion.sh new file mode 100644 index 0000000000..011d6d5bb4 --- /dev/null +++ b/examples/models/vlm/qwen25_omni/conversion.sh @@ -0,0 +1,40 @@ +#!/usr/bin/env bash +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -xeuo pipefail + +# Workspace directory for checkpoints and results +WORKSPACE=${WORKSPACE:-/workspace} + +MODEL_NAME=Qwen2.5-Omni-7B +HF_MODEL_ID=Qwen/${MODEL_NAME} + +# Import HF → Megatron +uv run python examples/conversion/convert_checkpoints.py import \ + --hf-model "$HF_MODEL_ID" \ + --megatron-path "${WORKSPACE}/models/${MODEL_NAME}" + +# Export Megatron → HF +uv run python examples/conversion/convert_checkpoints.py export \ + --hf-model "$HF_MODEL_ID" \ + --megatron-path "${WORKSPACE}/models/${MODEL_NAME}/iter_0000000" \ + --hf-path "${WORKSPACE}/models/${MODEL_NAME}-hf-export" + +# Round-trip validation +uv run python -m torch.distributed.run --nproc_per_node=2 \ + examples/conversion/hf_megatron_roundtrip_multi_gpu.py \ + --hf-model-id "$HF_MODEL_ID" \ + --megatron-load-path "${WORKSPACE}/models/${MODEL_NAME}/iter_0000000" \ + --tp 2 --pp 1 diff --git a/examples/models/vlm/qwen25_omni/inference.sh b/examples/models/vlm/qwen25_omni/inference.sh new file mode 100644 index 0000000000..f3c81124c7 --- /dev/null +++ b/examples/models/vlm/qwen25_omni/inference.sh @@ -0,0 +1,54 @@ +#!/usr/bin/env bash +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Workspace directory for checkpoints and results +WORKSPACE=${WORKSPACE:-/workspace} + +MODEL_NAME=Qwen2.5-Omni-7B +VIDEO_URL="https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-Omni/cookbook/audio_visual.mp4" +PROMPT="What was the first sentence the boy said when he met the girl?" + +# Requires: pip install qwen-omni-utils[decord] + +# Inference with Hugging Face checkpoints (video + audio) +uv run --no-sync python -m torch.distributed.run --nproc_per_node=2 \ + examples/conversion/hf_to_megatron_generate_omni_lm.py \ + --hf_model_path Qwen/${MODEL_NAME} \ + --video_url "${VIDEO_URL}" \ + --prompt "${PROMPT}" \ + --use_audio_in_video \ + --max_new_tokens 64 \ + --tp 2 + +# Inference with imported Megatron checkpoint (video + audio) +uv run --no-sync python -m torch.distributed.run --nproc_per_node=2 \ + examples/conversion/hf_to_megatron_generate_omni_lm.py \ + --hf_model_path Qwen/${MODEL_NAME} \ + --megatron_model_path ${WORKSPACE}/models/${MODEL_NAME}/iter_0000000 \ + --video_url "${VIDEO_URL}" \ + --prompt "${PROMPT}" \ + --use_audio_in_video \ + --max_new_tokens 64 \ + --tp 2 + +# Inference with exported HF checkpoint +uv run --no-sync python -m torch.distributed.run --nproc_per_node=2 \ + examples/conversion/hf_to_megatron_generate_omni_lm.py \ + --hf_model_path ${WORKSPACE}/models/${MODEL_NAME}-hf-export \ + --video_url "${VIDEO_URL}" \ + --prompt "${PROMPT}" \ + --use_audio_in_video \ + --max_new_tokens 64 \ + --tp 2 From 9d6b505d04e32f980c29c279820009f7f34d1c5c Mon Sep 17 00:00:00 2001 From: yaoyu-33 Date: Mon, 23 Mar 2026 15:08:00 -0700 
Subject: [PATCH 2/4] [model] fix: extract pooler_output from vision encoder in Qwen2.5-Omni thinker model Qwen2_5OmniVisionEncoder.forward() returns BaseModelOutputWithPooling, not a plain tensor. Extract .pooler_output (the merger-projected features) before assigning to combined_embeddings to fix inference crash: TypeError: can't assign a BaseModelOutputWithPooling to a BFloat16Tensor Also update README to clarify qwen-omni-utils install command and note that --use_audio_in_video requires ffmpeg on the system. Signed-off-by: Yu Yao Signed-off-by: yaoyu-33 --- examples/models/vlm/qwen25_omni/README.md | 6 +++++- .../models/qwen_omni/modeling_qwen25_omni/thinker_model.py | 4 ++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/examples/models/vlm/qwen25_omni/README.md b/examples/models/vlm/qwen25_omni/README.md index 14d0c405d7..5105036463 100644 --- a/examples/models/vlm/qwen25_omni/README.md +++ b/examples/models/vlm/qwen25_omni/README.md @@ -10,10 +10,14 @@ Qwen2.5-Omni supports simultaneous processing of images, video, audio, and text ## Prerequisites +Audio and video processing requires `qwen-omni-utils` with `decord`. Install it into the project environment: + ```bash -pip install qwen-omni-utils[decord] +uv pip install qwen-omni-utils[decord] ``` +> **Note:** Audio extraction from video (`--use_audio_in_video`) additionally requires `ffmpeg` to be installed on the system (`apt-get install ffmpeg` or equivalent). Without it, audio input is skipped and the model falls back to video-only mode. + ## Workspace Configuration All scripts use a `WORKSPACE` environment variable for the base directory. Default: `/workspace`. 
diff --git a/src/megatron/bridge/models/qwen_omni/modeling_qwen25_omni/thinker_model.py b/src/megatron/bridge/models/qwen_omni/modeling_qwen25_omni/thinker_model.py index c52b1396a5..abc754f979 100644 --- a/src/megatron/bridge/models/qwen_omni/modeling_qwen25_omni/thinker_model.py +++ b/src/megatron/bridge/models/qwen_omni/modeling_qwen25_omni/thinker_model.py @@ -242,13 +242,13 @@ def forward( # Process images through vision encoder if pixel_values is not None and image_grid_thw is not None: - image_embeds = self.visual(pixel_values, grid_thw=image_grid_thw) + image_embeds = self.visual(pixel_values, grid_thw=image_grid_thw).pooler_output else: image_embeds = None # Process videos through vision encoder if pixel_values_videos is not None and video_grid_thw is not None: - video_embeds = self.visual(pixel_values_videos, grid_thw=video_grid_thw) + video_embeds = self.visual(pixel_values_videos, grid_thw=video_grid_thw).pooler_output else: video_embeds = None From aba5e9c7c55642631ba86b5f471c460be4182130 Mon Sep 17 00:00:00 2001 From: yaoyu-33 Date: Mon, 23 Mar 2026 15:44:07 -0700 Subject: [PATCH 3/4] [doc] fix: update Qwen2.5-Omni README with ffmpeg install and audio usage notes Signed-off-by: yaoyu-33 --- examples/models/vlm/qwen25_omni/README.md | 29 +++++++++++++++++++---- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/examples/models/vlm/qwen25_omni/README.md b/examples/models/vlm/qwen25_omni/README.md index 5105036463..9b637e08fe 100644 --- a/examples/models/vlm/qwen25_omni/README.md +++ b/examples/models/vlm/qwen25_omni/README.md @@ -16,7 +16,14 @@ Audio and video processing requires `qwen-omni-utils` with `decord`. Install it uv pip install qwen-omni-utils[decord] ``` -> **Note:** Audio extraction from video (`--use_audio_in_video`) additionally requires `ffmpeg` to be installed on the system (`apt-get install ffmpeg` or equivalent). Without it, audio input is skipped and the model falls back to video-only mode. 
+Audio extraction from video (`--use_audio_in_video`) additionally requires `ffmpeg`. If `apt-get install ffmpeg` is unavailable (e.g. in a container), install it via `imageio-ffmpeg`: + +```bash +uv pip install imageio-ffmpeg +ln -sf $(uv run python -c "import imageio_ffmpeg; print(imageio_ffmpeg.get_ffmpeg_exe())") /usr/local/bin/ffmpeg +``` + +> **Note:** `--use_audio_in_video` requires a **local file** passed via `--video_path`. Audio extraction does not work with `--video_url` because `audioread` cannot stream audio directly from a URL. ## Workspace Configuration @@ -66,7 +73,7 @@ See [inference.sh](inference.sh) for multimodal generation with: The default parallelism for 7B is `--tp 2` (2 GPUs). For larger variants scale TP accordingly. -### Example: Video + Audio +### Example: Video only ```bash uv run --no-sync python -m torch.distributed.run --nproc_per_node=2 \ @@ -74,10 +81,22 @@ uv run --no-sync python -m torch.distributed.run --nproc_per_node=2 \ --hf_model_path Qwen/Qwen2.5-Omni-7B \ --video_url "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-Omni/cookbook/audio_visual.mp4" \ --prompt "What was the first sentence the boy said when he met the girl?" \ - --use_audio_in_video \ --max_new_tokens 64 \ --tp 2 ``` -Pass `--use_audio_in_video` to include the audio track from the video in the model input. -Use `--video_path` instead of `--video_url` for a local file. +### Example: Video + Audio (requires ffmpeg and a local file) + +```bash +# Download the video first +wget -O /path/to/video.mp4 "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-Omni/cookbook/audio_visual.mp4" + +uv run --no-sync python -m torch.distributed.run --nproc_per_node=2 \ + examples/conversion/hf_to_megatron_generate_omni_lm.py \ + --hf_model_path Qwen/Qwen2.5-Omni-7B \ + --video_path /path/to/video.mp4 \ + --prompt "What was the first sentence the boy said when he met the girl?" 
 \
+    --use_audio_in_video \
+    --max_new_tokens 64 \
+    --tp 2
+```

From a08769af407280386ba1f665d141ad3828dec99b Mon Sep 17 00:00:00 2001
From: yaoyu-33
Date: Mon, 23 Mar 2026 16:23:57 -0700
Subject: [PATCH 4/4] [doc] fix: use --video_path in inference.sh for
 audio+video mode

Signed-off-by: yaoyu-33
---
 examples/models/vlm/qwen25_omni/inference.sh | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/examples/models/vlm/qwen25_omni/inference.sh b/examples/models/vlm/qwen25_omni/inference.sh
index f3c81124c7..96daef9e6d 100644
--- a/examples/models/vlm/qwen25_omni/inference.sh
+++ b/examples/models/vlm/qwen25_omni/inference.sh
@@ -17,16 +17,19 @@ WORKSPACE=${WORKSPACE:-/workspace}
 
 MODEL_NAME=Qwen2.5-Omni-7B
-VIDEO_URL="https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-Omni/cookbook/audio_visual.mp4"
+# For --use_audio_in_video, audio extraction requires a local file (URL streaming not supported).
+# Download the video first: wget -O /path/to/video.mp4 "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-Omni/cookbook/audio_visual.mp4"
+VIDEO_PATH=${VIDEO_PATH:-/path/to/video.mp4}
 PROMPT="What was the first sentence the boy said when he met the girl?"
-# Requires: pip install qwen-omni-utils[decord] +# Requires: uv pip install qwen-omni-utils[decord] +# Requires ffmpeg for audio: uv pip install imageio-ffmpeg && ln -sf $(uv run python -c "import imageio_ffmpeg; print(imageio_ffmpeg.get_ffmpeg_exe())") /usr/local/bin/ffmpeg # Inference with Hugging Face checkpoints (video + audio) uv run --no-sync python -m torch.distributed.run --nproc_per_node=2 \ examples/conversion/hf_to_megatron_generate_omni_lm.py \ --hf_model_path Qwen/${MODEL_NAME} \ - --video_url "${VIDEO_URL}" \ + --video_path "${VIDEO_PATH}" \ --prompt "${PROMPT}" \ --use_audio_in_video \ --max_new_tokens 64 \ @@ -37,7 +40,7 @@ uv run --no-sync python -m torch.distributed.run --nproc_per_node=2 \ examples/conversion/hf_to_megatron_generate_omni_lm.py \ --hf_model_path Qwen/${MODEL_NAME} \ --megatron_model_path ${WORKSPACE}/models/${MODEL_NAME}/iter_0000000 \ - --video_url "${VIDEO_URL}" \ + --video_path "${VIDEO_PATH}" \ --prompt "${PROMPT}" \ --use_audio_in_video \ --max_new_tokens 64 \ @@ -47,7 +50,7 @@ uv run --no-sync python -m torch.distributed.run --nproc_per_node=2 \ uv run --no-sync python -m torch.distributed.run --nproc_per_node=2 \ examples/conversion/hf_to_megatron_generate_omni_lm.py \ --hf_model_path ${WORKSPACE}/models/${MODEL_NAME}-hf-export \ - --video_url "${VIDEO_URL}" \ + --video_path "${VIDEO_PATH}" \ --prompt "${PROMPT}" \ --use_audio_in_video \ --max_new_tokens 64 \