From 4325675d557c6866105ac205e102aa4e4c7fed2f Mon Sep 17 00:00:00 2001 From: yaoyu-33 Date: Mon, 23 Mar 2026 14:50:58 -0700 Subject: [PATCH 1/4] [examples] docs: add qwen2.5-omni conversion and inference examples Signed-off-by: yaoyu-33 --- examples/models/vlm/qwen25_omni/README.md | 79 +++++++++++++++++++ examples/models/vlm/qwen25_omni/conversion.sh | 40 ++++++++++ examples/models/vlm/qwen25_omni/inference.sh | 54 +++++++++++++ 3 files changed, 173 insertions(+) create mode 100644 examples/models/vlm/qwen25_omni/README.md create mode 100644 examples/models/vlm/qwen25_omni/conversion.sh create mode 100644 examples/models/vlm/qwen25_omni/inference.sh diff --git a/examples/models/vlm/qwen25_omni/README.md b/examples/models/vlm/qwen25_omni/README.md new file mode 100644 index 0000000000..14d0c405d7 --- /dev/null +++ b/examples/models/vlm/qwen25_omni/README.md @@ -0,0 +1,79 @@ +# Qwen2.5-Omni Examples + +This directory contains example scripts for Qwen2.5-Omni multimodal models. + +Qwen2.5-Omni supports simultaneous processing of images, video, audio, and text using a dense Qwen2 language backbone with multimodal RoPE (mrope). + +| Model | HF ID | Architecture | Params | +|---|---|---|---| +| Qwen2.5-Omni-7B | `Qwen/Qwen2.5-Omni-7B` | Dense (Qwen2) + Vision + Audio | 7B | + +## Prerequisites + +```bash +pip install qwen-omni-utils[decord] +``` + +## Workspace Configuration + +All scripts use a `WORKSPACE` environment variable for the base directory. Default: `/workspace`. + +```bash +export WORKSPACE=/your/custom/path +``` + +## Checkpoint Conversion + +See [conversion.sh](conversion.sh) for checkpoint conversion examples. 
+ +### Import HF → Megatron + +```bash +python examples/conversion/convert_checkpoints.py import \ + --hf-model Qwen/Qwen2.5-Omni-7B \ + --megatron-path ${WORKSPACE}/models/Qwen2.5-Omni-7B +``` + +### Export Megatron → HF + +```bash +python examples/conversion/convert_checkpoints.py export \ + --hf-model Qwen/Qwen2.5-Omni-7B \ + --megatron-path ${WORKSPACE}/models/Qwen2.5-Omni-7B/iter_0000000 \ + --hf-path ${WORKSPACE}/models/Qwen2.5-Omni-7B-hf-export +``` + +### Round-trip Validation + +```bash +python -m torch.distributed.run --nproc_per_node=2 \ + examples/conversion/hf_megatron_roundtrip_multi_gpu.py \ + --hf-model-id Qwen/Qwen2.5-Omni-7B \ + --megatron-load-path ${WORKSPACE}/models/Qwen2.5-Omni-7B/iter_0000000 \ + --tp 2 --pp 1 +``` + +## Inference + +See [inference.sh](inference.sh) for multimodal generation with: +- Hugging Face checkpoint +- Imported Megatron checkpoint (after [conversion.sh](conversion.sh) import) +- Exported HF checkpoint + +The default parallelism for 7B is `--tp 2` (2 GPUs). For larger variants scale TP accordingly. + +### Example: Video + Audio + +```bash +uv run --no-sync python -m torch.distributed.run --nproc_per_node=2 \ + examples/conversion/hf_to_megatron_generate_omni_lm.py \ + --hf_model_path Qwen/Qwen2.5-Omni-7B \ + --video_url "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-Omni/cookbook/audio_visual.mp4" \ + --prompt "What was the first sentence the boy said when he met the girl?" \ + --use_audio_in_video \ + --max_new_tokens 64 \ + --tp 2 +``` + +Pass `--use_audio_in_video` to include the audio track from the video in the model input. +Use `--video_path` instead of `--video_url` for a local file. diff --git a/examples/models/vlm/qwen25_omni/conversion.sh b/examples/models/vlm/qwen25_omni/conversion.sh new file mode 100644 index 0000000000..011d6d5bb4 --- /dev/null +++ b/examples/models/vlm/qwen25_omni/conversion.sh @@ -0,0 +1,40 @@ +#!/usr/bin/env bash +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -xeuo pipefail + +# Workspace directory for checkpoints and results +WORKSPACE=${WORKSPACE:-/workspace} + +MODEL_NAME=Qwen2.5-Omni-7B +HF_MODEL_ID=Qwen/${MODEL_NAME} + +# Import HF → Megatron +uv run python examples/conversion/convert_checkpoints.py import \ + --hf-model "$HF_MODEL_ID" \ + --megatron-path "${WORKSPACE}/models/${MODEL_NAME}" + +# Export Megatron → HF +uv run python examples/conversion/convert_checkpoints.py export \ + --hf-model "$HF_MODEL_ID" \ + --megatron-path "${WORKSPACE}/models/${MODEL_NAME}/iter_0000000" \ + --hf-path "${WORKSPACE}/models/${MODEL_NAME}-hf-export" + +# Round-trip validation +uv run python -m torch.distributed.run --nproc_per_node=2 \ + examples/conversion/hf_megatron_roundtrip_multi_gpu.py \ + --hf-model-id "$HF_MODEL_ID" \ + --megatron-load-path "${WORKSPACE}/models/${MODEL_NAME}/iter_0000000" \ + --tp 2 --pp 1 diff --git a/examples/models/vlm/qwen25_omni/inference.sh b/examples/models/vlm/qwen25_omni/inference.sh new file mode 100644 index 0000000000..f3c81124c7 --- /dev/null +++ b/examples/models/vlm/qwen25_omni/inference.sh @@ -0,0 +1,54 @@ +#!/usr/bin/env bash +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Workspace directory for checkpoints and results +WORKSPACE=${WORKSPACE:-/workspace} + +MODEL_NAME=Qwen2.5-Omni-7B +VIDEO_URL="https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-Omni/cookbook/audio_visual.mp4" +PROMPT="What was the first sentence the boy said when he met the girl?" + +# Requires: pip install qwen-omni-utils[decord] + +# Inference with Hugging Face checkpoints (video + audio) +uv run --no-sync python -m torch.distributed.run --nproc_per_node=2 \ + examples/conversion/hf_to_megatron_generate_omni_lm.py \ + --hf_model_path Qwen/${MODEL_NAME} \ + --video_url "${VIDEO_URL}" \ + --prompt "${PROMPT}" \ + --use_audio_in_video \ + --max_new_tokens 64 \ + --tp 2 + +# Inference with imported Megatron checkpoint (video + audio) +uv run --no-sync python -m torch.distributed.run --nproc_per_node=2 \ + examples/conversion/hf_to_megatron_generate_omni_lm.py \ + --hf_model_path Qwen/${MODEL_NAME} \ + --megatron_model_path ${WORKSPACE}/models/${MODEL_NAME}/iter_0000000 \ + --video_url "${VIDEO_URL}" \ + --prompt "${PROMPT}" \ + --use_audio_in_video \ + --max_new_tokens 64 \ + --tp 2 + +# Inference with exported HF checkpoint +uv run --no-sync python -m torch.distributed.run --nproc_per_node=2 \ + examples/conversion/hf_to_megatron_generate_omni_lm.py \ + --hf_model_path ${WORKSPACE}/models/${MODEL_NAME}-hf-export \ + --video_url "${VIDEO_URL}" \ + --prompt "${PROMPT}" \ + --use_audio_in_video \ + --max_new_tokens 64 \ + --tp 2 From 9d6b505d04e32f980c29c279820009f7f34d1c5c Mon Sep 17 00:00:00 2001 From: yaoyu-33 Date: Mon, 23 Mar 2026 15:08:00 -0700 
Subject: [PATCH 2/4] [model] fix: extract pooler_output from vision encoder in Qwen2.5-Omni thinker model Qwen2_5OmniVisionEncoder.forward() returns BaseModelOutputWithPooling, not a plain tensor. Extract .pooler_output (the merger-projected features) before assigning to combined_embeddings to fix inference crash: TypeError: can't assign a BaseModelOutputWithPooling to a BFloat16Tensor Also update README to clarify qwen-omni-utils install command and note that --use_audio_in_video requires ffmpeg on the system. Signed-off-by: Yu Yao Signed-off-by: yaoyu-33 --- examples/models/vlm/qwen25_omni/README.md | 6 +++++- .../models/qwen_omni/modeling_qwen25_omni/thinker_model.py | 4 ++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/examples/models/vlm/qwen25_omni/README.md b/examples/models/vlm/qwen25_omni/README.md index 14d0c405d7..5105036463 100644 --- a/examples/models/vlm/qwen25_omni/README.md +++ b/examples/models/vlm/qwen25_omni/README.md @@ -10,10 +10,14 @@ Qwen2.5-Omni supports simultaneous processing of images, video, audio, and text ## Prerequisites +Audio and video processing requires `qwen-omni-utils` with `decord`. Install it into the project environment: + ```bash -pip install qwen-omni-utils[decord] +uv pip install qwen-omni-utils[decord] ``` +> **Note:** Audio extraction from video (`--use_audio_in_video`) additionally requires `ffmpeg` to be installed on the system (`apt-get install ffmpeg` or equivalent). Without it, audio input is skipped and the model falls back to video-only mode. + ## Workspace Configuration All scripts use a `WORKSPACE` environment variable for the base directory. Default: `/workspace`. 
diff --git a/src/megatron/bridge/models/qwen_omni/modeling_qwen25_omni/thinker_model.py b/src/megatron/bridge/models/qwen_omni/modeling_qwen25_omni/thinker_model.py index c52b1396a5..abc754f979 100644 --- a/src/megatron/bridge/models/qwen_omni/modeling_qwen25_omni/thinker_model.py +++ b/src/megatron/bridge/models/qwen_omni/modeling_qwen25_omni/thinker_model.py @@ -242,13 +242,13 @@ def forward( # Process images through vision encoder if pixel_values is not None and image_grid_thw is not None: - image_embeds = self.visual(pixel_values, grid_thw=image_grid_thw) + image_embeds = self.visual(pixel_values, grid_thw=image_grid_thw).pooler_output else: image_embeds = None # Process videos through vision encoder if pixel_values_videos is not None and video_grid_thw is not None: - video_embeds = self.visual(pixel_values_videos, grid_thw=video_grid_thw) + video_embeds = self.visual(pixel_values_videos, grid_thw=video_grid_thw).pooler_output else: video_embeds = None From aba5e9c7c55642631ba86b5f471c460be4182130 Mon Sep 17 00:00:00 2001 From: yaoyu-33 Date: Mon, 23 Mar 2026 15:44:07 -0700 Subject: [PATCH 3/4] [doc] fix: update Qwen2.5-Omni README with ffmpeg install and audio usage notes Signed-off-by: yaoyu-33 --- examples/models/vlm/qwen25_omni/README.md | 29 +++++++++++++++++++---- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/examples/models/vlm/qwen25_omni/README.md b/examples/models/vlm/qwen25_omni/README.md index 5105036463..9b637e08fe 100644 --- a/examples/models/vlm/qwen25_omni/README.md +++ b/examples/models/vlm/qwen25_omni/README.md @@ -16,7 +16,14 @@ Audio and video processing requires `qwen-omni-utils` with `decord`. Install it uv pip install qwen-omni-utils[decord] ``` -> **Note:** Audio extraction from video (`--use_audio_in_video`) additionally requires `ffmpeg` to be installed on the system (`apt-get install ffmpeg` or equivalent). Without it, audio input is skipped and the model falls back to video-only mode. 
+Audio extraction from video (`--use_audio_in_video`) additionally requires `ffmpeg`. If `apt-get install ffmpeg` is unavailable (e.g. in a container), install it via `imageio-ffmpeg`: + +```bash +uv pip install imageio-ffmpeg +ln -sf $(uv run python -c "import imageio_ffmpeg; print(imageio_ffmpeg.get_ffmpeg_exe())") /usr/local/bin/ffmpeg +``` + +> **Note:** `--use_audio_in_video` requires a **local file** passed via `--video_path`. Audio extraction does not work with `--video_url` because `audioread` cannot stream audio directly from a URL. ## Workspace Configuration @@ -66,7 +73,7 @@ See [inference.sh](inference.sh) for multimodal generation with: The default parallelism for 7B is `--tp 2` (2 GPUs). For larger variants scale TP accordingly. -### Example: Video + Audio +### Example: Video only ```bash uv run --no-sync python -m torch.distributed.run --nproc_per_node=2 \ @@ -74,10 +81,22 @@ uv run --no-sync python -m torch.distributed.run --nproc_per_node=2 \ --hf_model_path Qwen/Qwen2.5-Omni-7B \ --video_url "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-Omni/cookbook/audio_visual.mp4" \ --prompt "What was the first sentence the boy said when he met the girl?" \ - --use_audio_in_video \ --max_new_tokens 64 \ --tp 2 ``` -Pass `--use_audio_in_video` to include the audio track from the video in the model input. -Use `--video_path` instead of `--video_url` for a local file. +### Example: Video + Audio (requires ffmpeg and a local file) + +```bash +# Download the video first +wget -O /path/to/video.mp4 "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-Omni/cookbook/audio_visual.mp4" + +uv run --no-sync python -m torch.distributed.run --nproc_per_node=2 \ + examples/conversion/hf_to_megatron_generate_omni_lm.py \ + --hf_model_path Qwen/Qwen2.5-Omni-7B \ + --video_path /path/to/video.mp4 \ + --prompt "What was the first sentence the boy said when he met the girl?" 
 \
+    --use_audio_in_video \
+    --max_new_tokens 64 \
+    --tp 2
+```

From a08769af407280386ba1f665d141ad3828dec99b Mon Sep 17 00:00:00 2001
From: yaoyu-33
Date: Mon, 23 Mar 2026 16:23:57 -0700
Subject: [PATCH 4/4] [doc] fix: use --video_path in inference.sh for
 audio+video mode

Signed-off-by: yaoyu-33
---
 examples/models/vlm/qwen25_omni/inference.sh | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/examples/models/vlm/qwen25_omni/inference.sh b/examples/models/vlm/qwen25_omni/inference.sh
index f3c81124c7..96daef9e6d 100644
--- a/examples/models/vlm/qwen25_omni/inference.sh
+++ b/examples/models/vlm/qwen25_omni/inference.sh
@@ -17,16 +17,19 @@ WORKSPACE=${WORKSPACE:-/workspace}
 
 MODEL_NAME=Qwen2.5-Omni-7B
-VIDEO_URL="https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-Omni/cookbook/audio_visual.mp4"
+# For --use_audio_in_video, audio extraction requires a local file (URL streaming not supported).
+# Download the video first: wget -O /path/to/video.mp4 "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-Omni/cookbook/audio_visual.mp4"
+VIDEO_PATH=${VIDEO_PATH:-/path/to/video.mp4}
 PROMPT="What was the first sentence the boy said when he met the girl?"
-# Requires: pip install qwen-omni-utils[decord] +# Requires: uv pip install qwen-omni-utils[decord] +# Requires ffmpeg for audio: uv pip install imageio-ffmpeg && ln -sf $(uv run python -c "import imageio_ffmpeg; print(imageio_ffmpeg.get_ffmpeg_exe())") /usr/local/bin/ffmpeg # Inference with Hugging Face checkpoints (video + audio) uv run --no-sync python -m torch.distributed.run --nproc_per_node=2 \ examples/conversion/hf_to_megatron_generate_omni_lm.py \ --hf_model_path Qwen/${MODEL_NAME} \ - --video_url "${VIDEO_URL}" \ + --video_path "${VIDEO_PATH}" \ --prompt "${PROMPT}" \ --use_audio_in_video \ --max_new_tokens 64 \ @@ -37,7 +40,7 @@ uv run --no-sync python -m torch.distributed.run --nproc_per_node=2 \ examples/conversion/hf_to_megatron_generate_omni_lm.py \ --hf_model_path Qwen/${MODEL_NAME} \ --megatron_model_path ${WORKSPACE}/models/${MODEL_NAME}/iter_0000000 \ - --video_url "${VIDEO_URL}" \ + --video_path "${VIDEO_PATH}" \ --prompt "${PROMPT}" \ --use_audio_in_video \ --max_new_tokens 64 \ @@ -47,7 +50,7 @@ uv run --no-sync python -m torch.distributed.run --nproc_per_node=2 \ uv run --no-sync python -m torch.distributed.run --nproc_per_node=2 \ examples/conversion/hf_to_megatron_generate_omni_lm.py \ --hf_model_path ${WORKSPACE}/models/${MODEL_NAME}-hf-export \ - --video_url "${VIDEO_URL}" \ + --video_path "${VIDEO_PATH}" \ --prompt "${PROMPT}" \ --use_audio_in_video \ --max_new_tokens 64 \