Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ repos:
- id: ruff-format

- repo: https://github.com/crate-ci/typos
rev: v1.35.5
rev: v1.38.1
hooks:
- id: typos
# only for staged files
Expand Down
2 changes: 1 addition & 1 deletion docs/user_guide/cache_dit_acceleration.md
Original file line number Diff line number Diff line change
Expand Up @@ -201,7 +201,7 @@ omni = Omni(

You can customize the configuration by modifying the `cache_config` dictionary to use only specific methods (e.g., DBCache only, DBCache + SCM, etc.) based on your quality and speed requirements.

To test another model, you can modify `--model` with the target model identifier like `Tongyi-MAI/Z-Image-Turbo` and update `cache_confg` according the model architecture (e.g., number of transformer blocks).
To test another model, you can modify `--model` with the target model identifier like `Tongyi-MAI/Z-Image-Turbo` and update `cache_config` according to the model architecture (e.g., number of transformer blocks).


## Additional Resources
Expand Down
18 changes: 9 additions & 9 deletions examples/offline_inference/qwen2_5_omni/end2end.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
"""

import os
from typing import NamedTuple, Optional
from typing import NamedTuple

import librosa
import numpy as np
Expand Down Expand Up @@ -58,9 +58,9 @@ def get_text_query(question: str = None) -> QueryResult:


def get_mixed_modalities_query(
video_path: Optional[str] = None,
image_path: Optional[str] = None,
audio_path: Optional[str] = None,
video_path: str | None = None,
image_path: str | None = None,
audio_path: str | None = None,
num_frames: int = 16,
sampling_rate: int = 16000,
) -> QueryResult:
Expand Down Expand Up @@ -114,7 +114,7 @@ def get_mixed_modalities_query(


def get_use_audio_in_video_query(
video_path: Optional[str] = None, num_frames: int = 16, sampling_rate: int = 16000
video_path: str | None = None, num_frames: int = 16, sampling_rate: int = 16000
) -> QueryResult:
question = "Describe the content of the video, then convert what the baby say into text."
prompt = (
Expand Down Expand Up @@ -151,7 +151,7 @@ def get_use_audio_in_video_query(
)


def get_multi_audios_query(audio_path: Optional[str] = None, sampling_rate: int = 16000) -> QueryResult:
def get_multi_audios_query(audio_path: str | None = None, sampling_rate: int = 16000) -> QueryResult:
question = "Are these two audio clips the same?"
prompt = (
f"<|im_start|>system\n{default_system}<|im_end|>\n"
Expand Down Expand Up @@ -190,7 +190,7 @@ def get_multi_audios_query(audio_path: Optional[str] = None, sampling_rate: int
)


def get_image_query(question: str = None, image_path: Optional[str] = None) -> QueryResult:
def get_image_query(question: str = None, image_path: str | None = None) -> QueryResult:
if question is None:
question = "What is the content of this image?"
prompt = (
Expand Down Expand Up @@ -219,7 +219,7 @@ def get_image_query(question: str = None, image_path: Optional[str] = None) -> Q
)


def get_video_query(question: str = None, video_path: Optional[str] = None, num_frames: int = 16) -> QueryResult:
def get_video_query(question: str = None, video_path: str | None = None, num_frames: int = 16) -> QueryResult:
if question is None:
question = "Why is this video funny?"
prompt = (
Expand Down Expand Up @@ -247,7 +247,7 @@ def get_video_query(question: str = None, video_path: Optional[str] = None, num_
)


def get_audio_query(question: str = None, audio_path: Optional[str] = None, sampling_rate: int = 16000) -> QueryResult:
def get_audio_query(question: str = None, audio_path: str | None = None, sampling_rate: int = 16000) -> QueryResult:
if question is None:
question = "What is the content of this audio?"
prompt = (
Expand Down
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@
#!/usr/bin/env python3
import argparse
from typing import Optional


def extract_prompt(line: str) -> Optional[str]:
def extract_prompt(line: str) -> str | None:
# Extract the content between the first '|' and the second '|'
i = line.find("|")
if i == -1:
Expand Down
8 changes: 4 additions & 4 deletions examples/offline_inference/qwen3_omni/end2end.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
"""

import os
from typing import NamedTuple, Optional
from typing import NamedTuple

import librosa
import numpy as np
Expand Down Expand Up @@ -57,7 +57,7 @@ def get_text_query(question: str = None) -> QueryResult:
)


def get_video_query(question: str = None, video_path: Optional[str] = None, num_frames: int = 16) -> QueryResult:
def get_video_query(question: str = None, video_path: str | None = None, num_frames: int = 16) -> QueryResult:
if question is None:
question = "Why is this video funny?"
prompt = (
Expand Down Expand Up @@ -85,7 +85,7 @@ def get_video_query(question: str = None, video_path: Optional[str] = None, num_
)


def get_image_query(question: str = None, image_path: Optional[str] = None) -> QueryResult:
def get_image_query(question: str = None, image_path: str | None = None) -> QueryResult:
if question is None:
question = "What is the content of this image?"
prompt = (
Expand Down Expand Up @@ -114,7 +114,7 @@ def get_image_query(question: str = None, image_path: Optional[str] = None) -> Q
)


def get_audio_query(question: str = None, audio_path: Optional[str] = None, sampling_rate: int = 16000) -> QueryResult:
def get_audio_query(question: str = None, audio_path: str | None = None, sampling_rate: int = 16000) -> QueryResult:
if question is None:
question = "What is the content of this audio?"
prompt = (
Expand Down
32 changes: 16 additions & 16 deletions examples/online_serving/qwen2_5_omni/gradio_demo.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import sys
from pathlib import Path
from types import SimpleNamespace
from typing import Any, Optional
from typing import Any

import gradio as gr
import numpy as np
Expand Down Expand Up @@ -175,16 +175,16 @@ def create_prompt_args(base_args: argparse.Namespace) -> SimpleNamespace:


def process_audio_file(
audio_file: Optional[Any],
) -> Optional[tuple[np.ndarray, int]]:
audio_file: Any | None,
) -> tuple[np.ndarray, int] | None:
"""Normalize Gradio audio input to (np.ndarray, sample_rate)."""
if audio_file is None:
return None

sample_rate: Optional[int] = None
audio_np: Optional[np.ndarray] = None
sample_rate: int | None = None
audio_np: np.ndarray | None = None

def _load_from_path(path_str: str) -> Optional[tuple[np.ndarray, int]]:
def _load_from_path(path_str: str) -> tuple[np.ndarray, int] | None:
if not path_str:
return None
path = Path(path_str)
Expand Down Expand Up @@ -237,7 +237,7 @@ def _load_from_path(path_str: str) -> Optional[tuple[np.ndarray, int]]:
return audio_np.astype(np.float32), sample_rate


def process_image_file(image_file: Optional[Image.Image]) -> Optional[Image.Image]:
def process_image_file(image_file: Image.Image | None) -> Image.Image | None:
"""Process image file from Gradio input.

Returns:
Expand All @@ -252,10 +252,10 @@ def process_image_file(image_file: Optional[Image.Image]) -> Optional[Image.Imag


def process_video_file(
video_file: Optional[str],
video_file: str | None,
enable_audio_in_video: bool = False,
max_frames: int = 32,
) -> Optional[tuple[np.ndarray, dict[str, Any], Optional[tuple[np.ndarray, int]]]]:
) -> tuple[np.ndarray, dict[str, Any], tuple[np.ndarray, int] | None] | None:
"""Process video file and optionally extract audio track."""
if video_file is None:
return None
Expand All @@ -272,7 +272,7 @@ def process_video_file(
print(f"Failed to decode video {video_path}: {exc}")
return None

audio_tuple: Optional[tuple[np.ndarray, int]] = None
audio_tuple: tuple[np.ndarray, int] | None = None
if enable_audio_in_video:
try:
import librosa # type: ignore import
Expand All @@ -290,9 +290,9 @@ async def run_inference_async_omni(
sampling_params: list[SamplingParams],
prompt_args_template: SimpleNamespace,
user_prompt: str,
audio_file: Optional[tuple[str, tuple[int, np.ndarray]]] = None,
image_file: Optional[Image.Image] = None,
video_file: Optional[str] = None,
audio_file: tuple[str, tuple[int, np.ndarray]] | None = None,
image_file: Image.Image | None = None,
video_file: str | None = None,
use_audio_in_video: bool = False,
):
"""Run inference using AsyncOmni directly with multimodal support."""
Expand Down Expand Up @@ -420,9 +420,9 @@ def build_interface(

async def run_inference(
user_prompt: str,
audio_file: Optional[tuple[str, tuple[int, np.ndarray]]],
image_file: Optional[Image.Image],
video_file: Optional[str],
audio_file: tuple[str, tuple[int, np.ndarray]] | None,
image_file: Image.Image | None,
video_file: str | None,
use_audio_in_video: bool,
):
return await run_inference_async_omni(
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import base64
import os
from typing import Optional

import requests
from openai import OpenAI
Expand Down Expand Up @@ -38,7 +37,7 @@ def encode_base64_content_from_file(file_path: str) -> str:
return result


def get_video_url_from_path(video_path: Optional[str]) -> str:
def get_video_url_from_path(video_path: str | None) -> str:
"""Convert a video path (local file or URL) to a video URL format for the API.

If video_path is None or empty, returns the default URL.
Expand Down Expand Up @@ -77,7 +76,7 @@ def get_video_url_from_path(video_path: Optional[str]) -> str:
return f"data:{mime_type};base64,{video_base64}"


def get_image_url_from_path(image_path: Optional[str]) -> str:
def get_image_url_from_path(image_path: str | None) -> str:
"""Convert an image path (local file or URL) to an image URL format for the API.

If image_path is None or empty, returns the default URL.
Expand Down Expand Up @@ -114,7 +113,7 @@ def get_image_url_from_path(image_path: Optional[str]) -> str:
return f"data:{mime_type};base64,{image_base64}"


def get_audio_url_from_path(audio_path: Optional[str]) -> str:
def get_audio_url_from_path(audio_path: str | None) -> str:
"""Convert an audio path (local file or URL) to an audio URL format for the API.

If audio_path is None or empty, returns the default URL.
Expand Down Expand Up @@ -169,7 +168,7 @@ def get_system_prompt():
}


def get_text_query(custom_prompt: Optional[str] = None):
def get_text_query(custom_prompt: str | None = None):
question = (
custom_prompt or "Explain the system architecture for a scalable audio generation pipeline. Answer in 15 words."
)
Expand All @@ -186,10 +185,10 @@ def get_text_query(custom_prompt: Optional[str] = None):


def get_mixed_modalities_query(
video_path: Optional[str] = None,
image_path: Optional[str] = None,
audio_path: Optional[str] = None,
custom_prompt: Optional[str] = None,
video_path: str | None = None,
image_path: str | None = None,
audio_path: str | None = None,
custom_prompt: str | None = None,
):
question = (
custom_prompt or "What is recited in the audio? What is the content of this image? Why is this video funny?"
Expand Down Expand Up @@ -222,7 +221,7 @@ def get_mixed_modalities_query(
return prompt


def get_use_audio_in_video_query(video_path: Optional[str] = None, custom_prompt: Optional[str] = None):
def get_use_audio_in_video_query(video_path: str | None = None, custom_prompt: str | None = None):
question = custom_prompt or "Describe the content of the video, then convert what the baby say into text."
video_url = get_video_url_from_path(video_path)

Expand All @@ -246,7 +245,7 @@ def get_use_audio_in_video_query(video_path: Optional[str] = None, custom_prompt
return prompt


def get_multi_audios_query(audio_path: Optional[str] = None, custom_prompt: Optional[str] = None):
def get_multi_audios_query(audio_path: str | None = None, custom_prompt: str | None = None):
question = custom_prompt or "Are these two audio clips the same?"
audio_url = get_audio_url_from_path(audio_path)
prompt = {
Expand Down
32 changes: 16 additions & 16 deletions examples/online_serving/qwen3_omni/gradio_demo.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import sys
from pathlib import Path
from types import SimpleNamespace
from typing import Any, Optional
from typing import Any

import gradio as gr
import numpy as np
Expand Down Expand Up @@ -178,16 +178,16 @@ def create_prompt_args(base_args: argparse.Namespace) -> SimpleNamespace:


def process_audio_file(
audio_file: Optional[Any],
) -> Optional[tuple[np.ndarray, int]]:
audio_file: Any | None,
) -> tuple[np.ndarray, int] | None:
"""Normalize Gradio audio input to (np.ndarray, sample_rate)."""
if audio_file is None:
return None

sample_rate: Optional[int] = None
audio_np: Optional[np.ndarray] = None
sample_rate: int | None = None
audio_np: np.ndarray | None = None

def _load_from_path(path_str: str) -> Optional[tuple[np.ndarray, int]]:
def _load_from_path(path_str: str) -> tuple[np.ndarray, int] | None:
if not path_str:
return None
path = Path(path_str)
Expand Down Expand Up @@ -240,7 +240,7 @@ def _load_from_path(path_str: str) -> Optional[tuple[np.ndarray, int]]:
return audio_np.astype(np.float32), sample_rate


def process_image_file(image_file: Optional[Image.Image]) -> Optional[Image.Image]:
def process_image_file(image_file: Image.Image | None) -> Image.Image | None:
"""Process image file from Gradio input.

Returns:
Expand All @@ -255,10 +255,10 @@ def process_image_file(image_file: Optional[Image.Image]) -> Optional[Image.Imag


def process_video_file(
video_file: Optional[str],
video_file: str | None,
enable_audio_in_video: bool = False,
max_frames: int = 32,
) -> Optional[tuple[np.ndarray, dict[str, Any], Optional[tuple[np.ndarray, int]]]]:
) -> tuple[np.ndarray, dict[str, Any], tuple[np.ndarray, int] | None] | None:
"""Process video file and optionally extract audio track."""
if video_file is None:
return None
Expand All @@ -275,7 +275,7 @@ def process_video_file(
print(f"Failed to decode video {video_path}: {exc}")
return None

audio_tuple: Optional[tuple[np.ndarray, int]] = None
audio_tuple: tuple[np.ndarray, int] | None = None
if enable_audio_in_video:
try:
import librosa # type: ignore import
Expand All @@ -293,9 +293,9 @@ async def run_inference_async_omni(
sampling_params: list[SamplingParams],
prompt_args_template: SimpleNamespace,
user_prompt: str,
audio_file: Optional[tuple[str, tuple[int, np.ndarray]]] = None,
image_file: Optional[Image.Image] = None,
video_file: Optional[str] = None,
audio_file: tuple[str, tuple[int, np.ndarray]] | None = None,
image_file: Image.Image | None = None,
video_file: str | None = None,
use_audio_in_video: bool = False,
):
"""Run inference using AsyncOmni directly with multimodal support."""
Expand Down Expand Up @@ -426,9 +426,9 @@ def build_interface(

async def run_inference(
user_prompt: str,
audio_file: Optional[tuple[str, tuple[int, np.ndarray]]],
image_file: Optional[Image.Image],
video_file: Optional[str],
audio_file: tuple[str, tuple[int, np.ndarray]] | None,
image_file: Image.Image | None,
video_file: str | None,
use_audio_in_video: bool,
):
return await run_inference_async_omni(
Expand Down
Loading