vllm-project · princepride · Apr 16, 2026 · Apr 15, 2026 · Apr 15, 2026
@@ -31,9 +31,9 @@ dreamid_omni/
 ```
 
 ### Run the Inference
-```
+```bash
 python x_to_video_audio.py \
-  --model /xx/dreamid_omni \
+  --model /path/to/dreamid_omni \
   --prompt "Two people walking together and singing happily" \
   --image-path ./example0.png ./example1.png \
   --audio-path ./example0.wav ./example1.wav \
@@ -43,11 +43,33 @@ python x_to_video_audio.py \
   --num-inference-steps 45 \
   --height 704 \
   --width 1280 \
-  --output dreamid_omni.mp4
+  --output out_dreamid_omni_twoip.mp4
 ```
 In the current test scenario (2 images + 2 audio inputs), the VRAM requirement is 72GB, regardless of whether cfg-parallel is enabled or disabled.
 The VRAM usage can be reduced by enabling CPU offload via --enable-cpu-offload.
 
+
+You can take reference images and audio clips from the test cases in the official repo: https://github.com/Guoxu1233/DreamID-Omni
+
+For example, the single-IP reference resources can be found under https://github.com/Guoxu1233/DreamID-Omni/tree/main/test_case/oneip. Download them to your local machine and use them for testing.
+
+```bash
+# Example usage for oneip, ref media from the official repo DreamID-Omni
+python x_to_video_audio.py \
+  --model /path/to/dreamid_omni \
+  --prompt "<img1>: In the frame, a woman with black long hair is identified as <sub1>.\n**Overall Environment/Scene**: A lively open-kitchen café at night; stove flames flare, steam rises, and warm pendant lights swing slightly as staff move behind her. The shot is an upper-body close-up.\n**Main Characters/Subjects Appearance**: <sub1> is a young woman with thick dark wavy hair and a side part. She wears a fitted black top under a light apron, a thin gold chain necklace, and small stud earrings.\n**Main Characters/Subjects Actions**: <sub1> tastes the sauce with a spoon, then turns her face toward the camera while still holding the spoon, her expression shifting from focused to conflicted.\n<sub1> maintains eye contact, swallows as if choosing her words, and says, <S>I keep telling myself I’m fine,but some nights it feels like I’m just performing calm.<E>" \
+  --image-path 9.png \
+  --audio-path 9.wav \
+  --video-negative-prompt "jitter, bad hands, blur, distortion" \
+  --audio-negative-prompt "robotic, muffled, echo, distorted" \
+  --cfg-parallel-size 2 \
+  --num-inference-steps 45 \
+  --height 704 \
+  --width 1280 \
+  --output out_dreamid_omni_oneip.mp4
+```
+
+
 Key arguments:
 - `--prompt`: text description (string).
 - `--model`: path to the model local directory.

@@ -30,9 +30,9 @@ dreamid_omni/
 ```
 
 ### Run the Inference
-```
+```bash
 python x_to_video_audio.py \
-  --model /xx/dreamid_omni \
+  --model /path/to/dreamid_omni \
   --prompt "Two people walking together and singing happily" \
   --image-path ./example0.png ./example1.png \
   --audio-path ./example0.wav ./example1.wav \
@@ -42,11 +42,33 @@ python x_to_video_audio.py \
   --num-inference-steps 45 \
   --height 704 \
   --width 1280 \
-  --output dreamid_omni.mp4
+  --output out_dreamid_omni_twoip.mp4
 ```
 In the current test scenario (2 images + 2 audio inputs), the VRAM requirement is 72GB, regardless of whether cfg-parallel is enabled or disabled.
 The VRAM usage can be reduced by enabling CPU offload via --enable-cpu-offload.
 
+
+You can take reference images and audio clips from the test cases in the official repo: https://github.com/Guoxu1233/DreamID-Omni
+
+For example, the single-IP reference resources can be found under https://github.com/Guoxu1233/DreamID-Omni/tree/main/test_case/oneip. Download them to your local machine and use them for testing.
+
+```bash
+# Example usage for oneip, ref media from the official repo DreamID-Omni
+python x_to_video_audio.py \
+  --model /path/to/dreamid_omni \
+  --prompt "<img1>: In the frame, a woman with black long hair is identified as <sub1>.\n**Overall Environment/Scene**: A lively open-kitchen café at night; stove flames flare, steam rises, and warm pendant lights swing slightly as staff move behind her. The shot is an upper-body close-up.\n**Main Characters/Subjects Appearance**: <sub1> is a young woman with thick dark wavy hair and a side part. She wears a fitted black top under a light apron, a thin gold chain necklace, and small stud earrings.\n**Main Characters/Subjects Actions**: <sub1> tastes the sauce with a spoon, then turns her face toward the camera while still holding the spoon, her expression shifting from focused to conflicted.\n<sub1> maintains eye contact, swallows as if choosing her words, and says, <S>I keep telling myself I’m fine,but some nights it feels like I’m just performing calm.<E>" \
+  --image-path 9.png \
+  --audio-path 9.wav \
+  --video-negative-prompt "jitter, bad hands, blur, distortion" \
+  --audio-negative-prompt "robotic, muffled, echo, distorted" \
+  --cfg-parallel-size 2 \
+  --num-inference-steps 45 \
+  --height 704 \
+  --width 1280 \
+  --output out_dreamid_omni_oneip.mp4
+```
+
+
 Key arguments:
 - `--prompt`: text description (string).
 - `--model`: path to the model local directory.

@@ -5,10 +5,12 @@
 import re
 import time
 
+import numpy as np
 from PIL import Image
 from vllm.multimodal.media.audio import load_audio
 
 from vllm_omni.diffusion.data import DiffusionParallelConfig
+from vllm_omni.diffusion.utils.media_utils import mux_video_audio_bytes
 from vllm_omni.entrypoints.omni import Omni
 from vllm_omni.inputs.data import OmniDiffusionSamplingParams
 
@@ -131,15 +133,35 @@ def main() -> None:
 
     if not outputs:
         raise RuntimeError("No output returned from DreamID-Omni.")
-    output = outputs[0].request_output
-    generated_video = output.images[0][0]
-    generated_audio = output.images[0][1]
-    try:
-        from dreamid_omni.utils.io_utils import save_video
-    except Exception as e:
-        raise RuntimeError(f"Failed to extract video and audio from DreamID-Omni output. Error: {e}")
+    result = outputs[0]
+    if not result.images:
+        raise RuntimeError("No video frames found in DreamID-Omni output.")
+    generated_video = result.images[0]
+    mm = result.multimodal_output or {}
+    generated_audio = mm.get("audio")
+    fps = int(mm.get("fps", 24))
+    sample_rate = int(mm.get("audio_sample_rate", 16000))
+
+    # DreamID-Omni returns video as (C, F, H, W) float32 in [-1, 1].
+    # mux_video_audio_bytes expects (F, H, W, C) uint8.
+    if not isinstance(generated_video, np.ndarray) or generated_video.ndim != 4:
+        raise RuntimeError(f"Unexpected video shape: {getattr(generated_video, 'shape', None)}")
+    frames = generated_video.transpose(1, 2, 3, 0)
+    frames = (np.clip((frames + 1.0) / 2.0, 0.0, 1.0) * 255.0).round().astype(np.uint8)
+
+    audio_np = None
+    if generated_audio is not None:
+        audio_np = np.squeeze(np.asarray(generated_audio)).astype(np.float32)
+
     output_path = args.output
-    save_video(output_path, generated_video, generated_audio, fps=24, sample_rate=16000)
+    video_bytes = mux_video_audio_bytes(
+        frames,
+        audio_np,
+        fps=float(fps),
+        audio_sample_rate=sample_rate,
+    )
+    with open(output_path, "wb") as f:
+        f.write(video_bytes)
     print(f"Saved generated video to {output_path}")
     print(f"Total time: {elapsed:.2f}s")
 

@@ -38,6 +38,21 @@
 logger = logging.getLogger(__name__)
 
 
+def get_dreamid_omni_post_process_func(*args, **kwargs):  # Factory returning the post-process callable; *args/**kwargs are accepted but never read.
+    def post_process(output):  # Wraps a 2-tuple output in a dict; any other value is passed through.
+        if isinstance(output, tuple) and len(output) == 2:  # Only an exact 2-tuple is treated as a (video, audio) pair.
+            video, audio = output
+            return {
+                "video": video,
+                "audio": audio,
+                "audio_sample_rate": 16000,  # Constant attached to every wrapped result.
+                "fps": 24,  # Constant attached to every wrapped result.
+            }
+        return output  # Not a 2-tuple: returned unchanged.
+
+    return post_process
+
+
 AUDIO_CONFIG = {
     "patch_size": [1],
     "model_type": "t2a",

@@ -375,6 +375,7 @@ def _apply_sequence_parallel_if_enabled(model, od_config: OmniDiffusionConfig) -
     "HunyuanVideo15ImageToVideoPipeline": "get_hunyuan_video_15_i2v_post_process_func",
     "MagiHumanPipeline": "get_magi_human_post_process_func",
     "OmniVoicePipeline": "get_omnivoice_post_process_func",
+    "DreamIDOmniPipeline": "get_dreamid_omni_post_process_func",
 }
 
 _DIFFUSION_PRE_PROCESS_FUNCS = {