examples/models/vlm/qwen25_omni/README.md (102 additions, 0 deletions)
# Qwen2.5-Omni Examples

This directory contains example scripts for Qwen2.5-Omni multimodal models.

Qwen2.5-Omni supports simultaneous processing of images, video, audio, and text using a dense Qwen2 language backbone with multimodal RoPE (mrope).

| Model | HF ID | Architecture | Params |
|---|---|---|---|
| Qwen2.5-Omni-7B | `Qwen/Qwen2.5-Omni-7B` | Dense (Qwen2) + Vision + Audio | 7B |

## Prerequisites

Audio and video processing requires `qwen-omni-utils` with `decord`. Install it into the project environment:

```bash
uv pip install qwen-omni-utils[decord]
```

Audio extraction from video (`--use_audio_in_video`) additionally requires `ffmpeg`. If `apt-get install ffmpeg` is unavailable (e.g. in a container), install it via `imageio-ffmpeg`:

```bash
uv pip install imageio-ffmpeg
ln -sf $(uv run python -c "import imageio_ffmpeg; print(imageio_ffmpeg.get_ffmpeg_exe())") /usr/local/bin/ffmpeg
```
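After installing, you can sanity-check that an `ffmpeg` binary resolves on `PATH` before attempting audio extraction. A minimal sketch (the `/usr/local/bin/ffmpeg` symlink location matches the command above):

```shell
# Verify ffmpeg is resolvable before using --use_audio_in_video
if command -v ffmpeg >/dev/null 2>&1; then
    echo "ffmpeg found: $(command -v ffmpeg)"
else
    echo "ffmpeg not found on PATH; install it before using --use_audio_in_video" >&2
fi
```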

> **Note:** `--use_audio_in_video` requires a **local file** passed via `--video_path`. Audio extraction does not work with `--video_url` because `audioread` cannot stream audio directly from a URL.

## Workspace Configuration

All scripts use a `WORKSPACE` environment variable for the base directory. Default: `/workspace`.

```bash
export WORKSPACE=/your/custom/path
```
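Exporting the variable is optional: the scripts resolve it with a shell default, falling back to `/workspace` when `WORKSPACE` is unset or empty. A sketch of the pattern the scripts use:

```shell
# Fall back to /workspace when WORKSPACE is unset or empty
WORKSPACE=${WORKSPACE:-/workspace}
echo "Checkpoints and results will go under: ${WORKSPACE}/models"
```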

## Checkpoint Conversion

See [conversion.sh](conversion.sh) for checkpoint conversion examples.

### Import HF → Megatron

```bash
python examples/conversion/convert_checkpoints.py import \
--hf-model Qwen/Qwen2.5-Omni-7B \
--megatron-path ${WORKSPACE}/models/Qwen2.5-Omni-7B
```
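Import writes the Megatron checkpoint under an `iter_0000000` subdirectory, which is the path the export and validation commands below load from. A quick check, assuming the import above has completed:

```shell
# Confirm the imported checkpoint landed where later steps expect it
CKPT="${WORKSPACE:-/workspace}/models/Qwen2.5-Omni-7B/iter_0000000"
if [ -d "${CKPT}" ]; then
    echo "Checkpoint ready: ${CKPT}"
else
    echo "Checkpoint not found at ${CKPT}" >&2
fi
```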

### Export Megatron → HF

```bash
python examples/conversion/convert_checkpoints.py export \
--hf-model Qwen/Qwen2.5-Omni-7B \
--megatron-path ${WORKSPACE}/models/Qwen2.5-Omni-7B/iter_0000000 \
--hf-path ${WORKSPACE}/models/Qwen2.5-Omni-7B-hf-export
```

### Round-trip Validation

```bash
python -m torch.distributed.run --nproc_per_node=2 \
examples/conversion/hf_megatron_roundtrip_multi_gpu.py \
--hf-model-id Qwen/Qwen2.5-Omni-7B \
--megatron-load-path ${WORKSPACE}/models/Qwen2.5-Omni-7B/iter_0000000 \
--tp 2 --pp 1
```

## Inference

See [inference.sh](inference.sh) for multimodal generation with:
- Hugging Face checkpoint
- Imported Megatron checkpoint (after [conversion.sh](conversion.sh) import)
- Exported HF checkpoint

The default parallelism for the 7B model is `--tp 2` (2 GPUs). For larger variants, scale TP accordingly.
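Before launching, it can help to confirm that enough GPUs are visible for the chosen tensor parallel size. A hedged sketch (assumes `nvidia-smi` is available on machines that have GPUs):

```shell
# Warn if fewer GPUs are visible than the requested tensor parallel size
TP=${TP:-2}
NUM_GPUS=$(command -v nvidia-smi >/dev/null 2>&1 && nvidia-smi -L | wc -l || echo 0)
if [ "${NUM_GPUS}" -lt "${TP}" ]; then
    echo "Warning: --tp ${TP} requested but only ${NUM_GPUS} GPU(s) visible" >&2
fi
```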

### Example: Video only

```bash
uv run --no-sync python -m torch.distributed.run --nproc_per_node=2 \
examples/conversion/hf_to_megatron_generate_omni_lm.py \
--hf_model_path Qwen/Qwen2.5-Omni-7B \
--video_url "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-Omni/cookbook/audio_visual.mp4" \
--prompt "What was the first sentence the boy said when he met the girl?" \
--max_new_tokens 64 \
--tp 2
```

### Example: Video + Audio (requires ffmpeg and a local file)

```bash
# Download the video first
wget -O /path/to/video.mp4 "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-Omni/cookbook/audio_visual.mp4"

uv run --no-sync python -m torch.distributed.run --nproc_per_node=2 \
examples/conversion/hf_to_megatron_generate_omni_lm.py \
--hf_model_path Qwen/Qwen2.5-Omni-7B \
--video_path /path/to/video.mp4 \
--prompt "What was the first sentence the boy said when he met the girl?" \
--use_audio_in_video \
--max_new_tokens 64 \
--tp 2
```
examples/models/vlm/qwen25_omni/conversion.sh (40 additions, 0 deletions)
#!/usr/bin/env bash
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

set -xeuo pipefail

# Workspace directory for checkpoints and results
WORKSPACE=${WORKSPACE:-/workspace}

MODEL_NAME=Qwen2.5-Omni-7B
HF_MODEL_ID=Qwen/${MODEL_NAME}

# Import HF → Megatron
uv run python examples/conversion/convert_checkpoints.py import \
--hf-model "$HF_MODEL_ID" \
--megatron-path "${WORKSPACE}/models/${MODEL_NAME}"

# Export Megatron → HF
uv run python examples/conversion/convert_checkpoints.py export \
--hf-model "$HF_MODEL_ID" \
--megatron-path "${WORKSPACE}/models/${MODEL_NAME}/iter_0000000" \
--hf-path "${WORKSPACE}/models/${MODEL_NAME}-hf-export"

# Round-trip validation
uv run python -m torch.distributed.run --nproc_per_node=2 \
examples/conversion/hf_megatron_roundtrip_multi_gpu.py \
--hf-model-id "$HF_MODEL_ID" \
--megatron-load-path "${WORKSPACE}/models/${MODEL_NAME}/iter_0000000" \
--tp 2 --pp 1
examples/models/vlm/qwen25_omni/inference.sh (57 additions, 0 deletions)
#!/usr/bin/env bash
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

set -xeuo pipefail

# Workspace directory for checkpoints and results
WORKSPACE=${WORKSPACE:-/workspace}

MODEL_NAME=Qwen2.5-Omni-7B
# For --use_audio_in_video, audio extraction requires a local file (URL streaming not supported).
# Download the video first: wget -O /path/to/video.mp4 <url>
VIDEO_PATH=${VIDEO_PATH:-/path/to/video.mp4}
PROMPT="What was the first sentence the boy said when he met the girl?"

# Requires: uv pip install qwen-omni-utils[decord]
# Requires ffmpeg for audio: uv pip install imageio-ffmpeg && ln -sf $(uv run python -c "import imageio_ffmpeg; print(imageio_ffmpeg.get_ffmpeg_exe())") /usr/local/bin/ffmpeg

# Inference with Hugging Face checkpoints (video + audio)
uv run --no-sync python -m torch.distributed.run --nproc_per_node=2 \
examples/conversion/hf_to_megatron_generate_omni_lm.py \
--hf_model_path Qwen/${MODEL_NAME} \
--video_path "${VIDEO_PATH}" \
--prompt "${PROMPT}" \
--use_audio_in_video \
--max_new_tokens 64 \
--tp 2

# Inference with imported Megatron checkpoint (video + audio)
uv run --no-sync python -m torch.distributed.run --nproc_per_node=2 \
examples/conversion/hf_to_megatron_generate_omni_lm.py \
--hf_model_path Qwen/${MODEL_NAME} \
--megatron_model_path "${WORKSPACE}/models/${MODEL_NAME}/iter_0000000" \
--video_path "${VIDEO_PATH}" \
--prompt "${PROMPT}" \
--use_audio_in_video \
--max_new_tokens 64 \
--tp 2

# Inference with exported HF checkpoint
uv run --no-sync python -m torch.distributed.run --nproc_per_node=2 \
examples/conversion/hf_to_megatron_generate_omni_lm.py \
--hf_model_path "${WORKSPACE}/models/${MODEL_NAME}-hf-export" \
--video_path "${VIDEO_PATH}" \
--prompt "${PROMPT}" \
--use_audio_in_video \
--max_new_tokens 64 \
--tp 2
```diff
@@ -242,13 +242,13 @@ def forward(

         # Process images through vision encoder
         if pixel_values is not None and image_grid_thw is not None:
-            image_embeds = self.visual(pixel_values, grid_thw=image_grid_thw)
+            image_embeds = self.visual(pixel_values, grid_thw=image_grid_thw).pooler_output
         else:
             image_embeds = None

         # Process videos through vision encoder
         if pixel_values_videos is not None and video_grid_thw is not None:
-            video_embeds = self.visual(pixel_values_videos, grid_thw=video_grid_thw)
+            video_embeds = self.visual(pixel_values_videos, grid_thw=video_grid_thw).pooler_output
         else:
             video_embeds = None
```