From bebb84929186cf627e35b1a2288cb4cd5aaa18ce Mon Sep 17 00:00:00 2001 From: yuanheng Date: Wed, 15 Apr 2026 03:27:15 +0000 Subject: [PATCH 1/2] upd dreamid modules Signed-off-by: yuanheng --- .../x_to_video_audio/x_to_video_audio.py | 38 +++++++++++++++---- .../dreamid_omni/pipeline_dreamid_omni.py | 15 ++++++++ vllm_omni/diffusion/registry.py | 1 + 3 files changed, 46 insertions(+), 8 deletions(-) diff --git a/examples/offline_inference/x_to_video_audio/x_to_video_audio.py b/examples/offline_inference/x_to_video_audio/x_to_video_audio.py index 49a0f496f81..322b184e520 100644 --- a/examples/offline_inference/x_to_video_audio/x_to_video_audio.py +++ b/examples/offline_inference/x_to_video_audio/x_to_video_audio.py @@ -5,10 +5,12 @@ import re import time +import numpy as np from PIL import Image from vllm.multimodal.media.audio import load_audio from vllm_omni.diffusion.data import DiffusionParallelConfig +from vllm_omni.diffusion.utils.media_utils import mux_video_audio_bytes from vllm_omni.entrypoints.omni import Omni from vllm_omni.inputs.data import OmniDiffusionSamplingParams @@ -131,15 +133,35 @@ def main() -> None: if not outputs: raise RuntimeError("No output returned from DreamID-Omni.") - output = outputs[0].request_output - generated_video = output.images[0][0] - generated_audio = output.images[0][1] - try: - from dreamid_omni.utils.io_utils import save_video - except Exception as e: - raise RuntimeError(f"Failed to extract video and audio from DreamID-Omni output. Error: {e}") + result = outputs[0] + if not result.images: + raise RuntimeError("No video frames found in DreamID-Omni output.") + generated_video = result.images[0] + mm = result.multimodal_output or {} + generated_audio = mm.get("audio") + fps = int(mm.get("fps", 24)) + sample_rate = int(mm.get("audio_sample_rate", 16000)) + + # DreamID-Omni returns video as (C, F, H, W) float32 in [-1, 1]. + # mux_video_audio_bytes expects (F, H, W, C) uint8. + if not isinstance(generated_video, np.ndarray) or generated_video.ndim != 4: + raise RuntimeError(f"Unexpected video shape: {getattr(generated_video, 'shape', None)}") + frames = generated_video.transpose(1, 2, 3, 0) + frames = (np.clip((frames + 1.0) / 2.0, 0.0, 1.0) * 255.0).round().astype(np.uint8) + + audio_np = None + if generated_audio is not None: + audio_np = np.squeeze(np.asarray(generated_audio)).astype(np.float32) + output_path = args.output - save_video(output_path, generated_video, generated_audio, fps=24, sample_rate=16000) + video_bytes = mux_video_audio_bytes( + frames, + audio_np, + fps=float(fps), + audio_sample_rate=sample_rate, + ) + with open(output_path, "wb") as f: + f.write(video_bytes) print(f"Saved generated video to {output_path}") print(f"Total time: {elapsed:.2f}s") diff --git a/vllm_omni/diffusion/models/dreamid_omni/pipeline_dreamid_omni.py b/vllm_omni/diffusion/models/dreamid_omni/pipeline_dreamid_omni.py index 974cc582f1d..c7ab4662d14 100644 --- a/vllm_omni/diffusion/models/dreamid_omni/pipeline_dreamid_omni.py +++ b/vllm_omni/diffusion/models/dreamid_omni/pipeline_dreamid_omni.py @@ -38,6 +38,21 @@ logger = logging.getLogger(__name__) +def get_dreamid_omni_post_process_func(*args, **kwargs): + def post_process(output): + if isinstance(output, tuple) and len(output) == 2: + video, audio = output + return { + "video": video, + "audio": audio, + "audio_sample_rate": 16000, + "fps": 24, + } + return output + + return post_process + + AUDIO_CONFIG = { "patch_size": [1], "model_type": "t2a", diff --git a/vllm_omni/diffusion/registry.py b/vllm_omni/diffusion/registry.py index 97bc7fa2925..f7e32f44727 100644 --- a/vllm_omni/diffusion/registry.py +++ b/vllm_omni/diffusion/registry.py @@ -375,6 +375,7 @@ def _apply_sequence_parallel_if_enabled(model, od_config: OmniDiffusionConfig) - "HunyuanVideo15ImageToVideoPipeline": "get_hunyuan_video_15_i2v_post_process_func", "MagiHumanPipeline": "get_magi_human_post_process_func", "OmniVoicePipeline": "get_omnivoice_post_process_func", + "DreamIDOmniPipeline": "get_dreamid_omni_post_process_func", } _DIFFUSION_PRE_PROCESS_FUNCS = { From e619bb63d6aac1d018739669cb7c927c8be22eb1 Mon Sep 17 00:00:00 2001 From: yuanheng Date: Wed, 15 Apr 2026 03:42:56 +0000 Subject: [PATCH 2/2] upd doc usage oneip Signed-off-by: yuanheng --- .../offline_inference/x_to_video_audio.md | 28 +++++++++++++++++-- .../x_to_video_audio/x_to_video_audio.md | 28 +++++++++++++++++-- 2 files changed, 50 insertions(+), 6 deletions(-) diff --git a/docs/user_guide/examples/offline_inference/x_to_video_audio.md b/docs/user_guide/examples/offline_inference/x_to_video_audio.md index 8ea39d81156..cec8d47c591 100644 --- a/docs/user_guide/examples/offline_inference/x_to_video_audio.md +++ b/docs/user_guide/examples/offline_inference/x_to_video_audio.md @@ -31,9 +31,9 @@ dreamid_omni/ ``` ### Run the Inference -``` +```python python x_to_video_audio.py \ - --model /xx/dreamid_omni \ + --model /path/to/dreamid_omni \ --prompt "Two people walking together and singing happily" \ --image-path ./example0.png ./example1.png \ --audio-path ./example0.wav ./example1.wav \ @@ -43,11 +43,33 @@ python x_to_video_audio.py \ --num-inference-steps 45 \ --height 704 \ --width 1280 \ - --output dreamid_omni.mp4 + --output out_dreamid_omni_twoip.mp4 ``` In the current test scenario (2 images + 2 audio inputs), the VRAM requirement is 72GB, regardless of whether cfg-parallel is enabled or disabled. The VRAM usage can be reduced by enabling CPU offload via --enable-cpu-offload. + +You could take reference images/audios from the test cases in the official repo: https://github.com/Guoxu1233/DreamID-Omni + +For example, single IP ref resources can be found under https://github.com/Guoxu1233/DreamID-Omni/tree/main/test_case/oneip, you could download them correspondingly to your local and use them for testing. + +```python +# Example usage for oneip, ref media from the official repo DreamID-Omni +python x_to_video_audio.py \ + --model /path/to/dreamid_omni \ + --prompt ": In the frame, a woman with black long hair is identified as .\n**Overall Environment/Scene**: A lively open-kitchen café at night; stove flames flare, steam rises, and warm pendant lights swing slightly as staff move behind her. The shot is an upper-body close-up.\n**Main Characters/Subjects Appearance**: is a young woman with thick dark wavy hair and a side part. She wears a fitted black top under a light apron, a thin gold chain necklace, and small stud earrings.\n**Main Characters/Subjects Actions**: tastes the sauce with a spoon, then turns her face toward the camera while still holding the spoon, her expression shifting from focused to conflicted.\n maintains eye contact, swallows as if choosing her words, and says, I keep telling myself I’m fine,but some nights it feels like I’m just performing calm." \ + --image-path 9.png \ + --audio-path 9.wav \ + --video-negative-prompt "jitter, bad hands, blur, distortion" \ + --audio-negative-prompt "robotic, muffled, echo, distorted" \ + --cfg-parallel-size 2 \ + --num-inference-steps 45 \ + --height 704 \ + --width 1280 \ + --output out_dreamid_omni_oneip.mp4 +``` + + Key arguments: - `--prompt`: text description (string). - `--model`: path to the model local directory. diff --git a/examples/offline_inference/x_to_video_audio/x_to_video_audio.md b/examples/offline_inference/x_to_video_audio/x_to_video_audio.md index 4b5188f41b2..13f2cfe7c0a 100644 --- a/examples/offline_inference/x_to_video_audio/x_to_video_audio.md +++ b/examples/offline_inference/x_to_video_audio/x_to_video_audio.md @@ -30,9 +30,9 @@ dreamid_omni/ ``` ### Run the Inference -``` +```python python x_to_video_audio.py \ - --model /xx/dreamid_omni \ + --model /path/to/dreamid_omni \ --prompt "Two people walking together and singing happily" \ --image-path ./example0.png ./example1.png \ --audio-path ./example0.wav ./example1.wav \ @@ -42,11 +42,33 @@ python x_to_video_audio.py \ --num-inference-steps 45 \ --height 704 \ --width 1280 \ - --output dreamid_omni.mp4 + --output out_dreamid_omni_twoip.mp4 ``` In the current test scenario (2 images + 2 audio inputs), the VRAM requirement is 72GB, regardless of whether cfg-parallel is enabled or disabled. The VRAM usage can be reduced by enabling CPU offload via --enable-cpu-offload. + +You could take reference images/audios from the test cases in the official repo: https://github.com/Guoxu1233/DreamID-Omni + +For example, single IP ref resources can be found under https://github.com/Guoxu1233/DreamID-Omni/tree/main/test_case/oneip, you could download them correspondingly to your local and use them for testing. + +```python +# Example usage for oneip, ref media from the official repo DreamID-Omni +python x_to_video_audio.py \ + --model /path/to/dreamid_omni \ + --prompt ": In the frame, a woman with black long hair is identified as .\n**Overall Environment/Scene**: A lively open-kitchen café at night; stove flames flare, steam rises, and warm pendant lights swing slightly as staff move behind her. The shot is an upper-body close-up.\n**Main Characters/Subjects Appearance**: is a young woman with thick dark wavy hair and a side part. She wears a fitted black top under a light apron, a thin gold chain necklace, and small stud earrings.\n**Main Characters/Subjects Actions**: tastes the sauce with a spoon, then turns her face toward the camera while still holding the spoon, her expression shifting from focused to conflicted.\n maintains eye contact, swallows as if choosing her words, and says, I keep telling myself I’m fine,but some nights it feels like I’m just performing calm." \ + --image-path 9.png \ + --audio-path 9.wav \ + --video-negative-prompt "jitter, bad hands, blur, distortion" \ + --audio-negative-prompt "robotic, muffled, echo, distorted" \ + --cfg-parallel-size 2 \ + --num-inference-steps 45 \ + --height 704 \ + --width 1280 \ + --output out_dreamid_omni_oneip.mp4 +``` + + Key arguments: - `--prompt`: text description (string). - `--model`: path to the model local directory.