Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ repos:
- id: ruff-format

- repo: https://github.com/crate-ci/typos
rev: v1.35.5
rev: v1.38.1
hooks:
- id: typos
# only for staged files
Expand Down
2 changes: 1 addition & 1 deletion docs/user_guide/cache_dit_acceleration.md
Original file line number Diff line number Diff line change
Expand Up @@ -201,7 +201,7 @@ omni = Omni(

You can customize the configuration by modifying the `cache_config` dictionary to use only specific methods (e.g., DBCache only, DBCache + SCM, etc.) based on your quality and speed requirements.

To test another model, you can modify `--model` with the target model identifier like `Tongyi-MAI/Z-Image-Turbo` and update `cache_confg` according the model architecture (e.g., number of transformer blocks).
To test another model, you can modify `--model` with the target model identifier like `Tongyi-MAI/Z-Image-Turbo` and update `cache_config` according to the model architecture (e.g., number of transformer blocks).


## Additional Resources
Expand Down
18 changes: 9 additions & 9 deletions examples/offline_inference/qwen2_5_omni/end2end.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
"""

import os
from typing import NamedTuple, Optional
from typing import NamedTuple

import librosa
import numpy as np
Expand Down Expand Up @@ -58,9 +58,9 @@ def get_text_query(question: str = None) -> QueryResult:


def get_mixed_modalities_query(
video_path: Optional[str] = None,
image_path: Optional[str] = None,
audio_path: Optional[str] = None,
video_path: str | None = None,
image_path: str | None = None,
audio_path: str | None = None,
num_frames: int = 16,
sampling_rate: int = 16000,
) -> QueryResult:
Expand Down Expand Up @@ -114,7 +114,7 @@ def get_mixed_modalities_query(


def get_use_audio_in_video_query(
video_path: Optional[str] = None, num_frames: int = 16, sampling_rate: int = 16000
video_path: str | None = None, num_frames: int = 16, sampling_rate: int = 16000
) -> QueryResult:
question = "Describe the content of the video, then convert what the baby say into text."
prompt = (
Expand Down Expand Up @@ -151,7 +151,7 @@ def get_use_audio_in_video_query(
)


def get_multi_audios_query(audio_path: Optional[str] = None, sampling_rate: int = 16000) -> QueryResult:
def get_multi_audios_query(audio_path: str | None = None, sampling_rate: int = 16000) -> QueryResult:
question = "Are these two audio clips the same?"
prompt = (
f"<|im_start|>system\n{default_system}<|im_end|>\n"
Expand Down Expand Up @@ -190,7 +190,7 @@ def get_multi_audios_query(audio_path: Optional[str] = None, sampling_rate: int
)


def get_image_query(question: str = None, image_path: Optional[str] = None) -> QueryResult:
def get_image_query(question: str = None, image_path: str | None = None) -> QueryResult:
if question is None:
question = "What is the content of this image?"
prompt = (
Expand Down Expand Up @@ -219,7 +219,7 @@ def get_image_query(question: str = None, image_path: Optional[str] = None) -> Q
)


def get_video_query(question: str = None, video_path: Optional[str] = None, num_frames: int = 16) -> QueryResult:
def get_video_query(question: str = None, video_path: str | None = None, num_frames: int = 16) -> QueryResult:
if question is None:
question = "Why is this video funny?"
prompt = (
Expand Down Expand Up @@ -247,7 +247,7 @@ def get_video_query(question: str = None, video_path: Optional[str] = None, num_
)


def get_audio_query(question: str = None, audio_path: Optional[str] = None, sampling_rate: int = 16000) -> QueryResult:
def get_audio_query(question: str = None, audio_path: str | None = None, sampling_rate: int = 16000) -> QueryResult:
if question is None:
question = "What is the content of this audio?"
prompt = (
Expand Down
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@
#!/usr/bin/env python3
import argparse
from typing import Optional


def extract_prompt(line: str) -> Optional[str]:
def extract_prompt(line: str) -> str | None:
# Extract the content between the first '|' and the second '|'
i = line.find("|")
if i == -1:
Expand Down
8 changes: 4 additions & 4 deletions examples/offline_inference/qwen3_omni/end2end.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
"""

import os
from typing import NamedTuple, Optional
from typing import NamedTuple

import librosa
import numpy as np
Expand Down Expand Up @@ -57,7 +57,7 @@ def get_text_query(question: str = None) -> QueryResult:
)


def get_video_query(question: str = None, video_path: Optional[str] = None, num_frames: int = 16) -> QueryResult:
def get_video_query(question: str = None, video_path: str | None = None, num_frames: int = 16) -> QueryResult:
if question is None:
question = "Why is this video funny?"
prompt = (
Expand Down Expand Up @@ -85,7 +85,7 @@ def get_video_query(question: str = None, video_path: Optional[str] = None, num_
)


def get_image_query(question: str = None, image_path: Optional[str] = None) -> QueryResult:
def get_image_query(question: str = None, image_path: str | None = None) -> QueryResult:
if question is None:
question = "What is the content of this image?"
prompt = (
Expand Down Expand Up @@ -114,7 +114,7 @@ def get_image_query(question: str = None, image_path: Optional[str] = None) -> Q
)


def get_audio_query(question: str = None, audio_path: Optional[str] = None, sampling_rate: int = 16000) -> QueryResult:
def get_audio_query(question: str = None, audio_path: str | None = None, sampling_rate: int = 16000) -> QueryResult:
if question is None:
question = "What is the content of this audio?"
prompt = (
Expand Down
32 changes: 16 additions & 16 deletions examples/online_serving/qwen2_5_omni/gradio_demo.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import sys
from pathlib import Path
from types import SimpleNamespace
from typing import Any, Optional
from typing import Any

import gradio as gr
import numpy as np
Expand Down Expand Up @@ -175,16 +175,16 @@ def create_prompt_args(base_args: argparse.Namespace) -> SimpleNamespace:


def process_audio_file(
audio_file: Optional[Any],
) -> Optional[tuple[np.ndarray, int]]:
audio_file: Any | None,
) -> tuple[np.ndarray, int] | None:
"""Normalize Gradio audio input to (np.ndarray, sample_rate)."""
if audio_file is None:
return None

sample_rate: Optional[int] = None
audio_np: Optional[np.ndarray] = None
sample_rate: int | None = None
audio_np: np.ndarray | None = None

def _load_from_path(path_str: str) -> Optional[tuple[np.ndarray, int]]:
def _load_from_path(path_str: str) -> tuple[np.ndarray, int] | None:
if not path_str:
return None
path = Path(path_str)
Expand Down Expand Up @@ -237,7 +237,7 @@ def _load_from_path(path_str: str) -> Optional[tuple[np.ndarray, int]]:
return audio_np.astype(np.float32), sample_rate


def process_image_file(image_file: Optional[Image.Image]) -> Optional[Image.Image]:
def process_image_file(image_file: Image.Image | None) -> Image.Image | None:
"""Process image file from Gradio input.

Returns:
Expand All @@ -252,10 +252,10 @@ def process_image_file(image_file: Optional[Image.Image]) -> Optional[Image.Imag


def process_video_file(
video_file: Optional[str],
video_file: str | None,
enable_audio_in_video: bool = False,
max_frames: int = 32,
) -> Optional[tuple[np.ndarray, dict[str, Any], Optional[tuple[np.ndarray, int]]]]:
) -> tuple[np.ndarray, dict[str, Any], tuple[np.ndarray, int] | None] | None:
"""Process video file and optionally extract audio track."""
if video_file is None:
return None
Expand All @@ -272,7 +272,7 @@ def process_video_file(
print(f"Failed to decode video {video_path}: {exc}")
return None

audio_tuple: Optional[tuple[np.ndarray, int]] = None
audio_tuple: tuple[np.ndarray, int] | None = None
if enable_audio_in_video:
try:
import librosa # type: ignore import
Expand All @@ -290,9 +290,9 @@ async def run_inference_async_omni(
sampling_params: list[SamplingParams],
prompt_args_template: SimpleNamespace,
user_prompt: str,
audio_file: Optional[tuple[str, tuple[int, np.ndarray]]] = None,
image_file: Optional[Image.Image] = None,
video_file: Optional[str] = None,
audio_file: tuple[str, tuple[int, np.ndarray]] | None = None,
image_file: Image.Image | None = None,
video_file: str | None = None,
use_audio_in_video: bool = False,
):
"""Run inference using AsyncOmni directly with multimodal support."""
Expand Down Expand Up @@ -420,9 +420,9 @@ def build_interface(

async def run_inference(
user_prompt: str,
audio_file: Optional[tuple[str, tuple[int, np.ndarray]]],
image_file: Optional[Image.Image],
video_file: Optional[str],
audio_file: tuple[str, tuple[int, np.ndarray]] | None,
image_file: Image.Image | None,
video_file: str | None,
use_audio_in_video: bool,
):
return await run_inference_async_omni(
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import base64
import os
from typing import Optional

import requests
from openai import OpenAI
Expand Down Expand Up @@ -38,7 +37,7 @@ def encode_base64_content_from_file(file_path: str) -> str:
return result


def get_video_url_from_path(video_path: Optional[str]) -> str:
def get_video_url_from_path(video_path: str | None) -> str:
"""Convert a video path (local file or URL) to a video URL format for the API.

If video_path is None or empty, returns the default URL.
Expand Down Expand Up @@ -77,7 +76,7 @@ def get_video_url_from_path(video_path: Optional[str]) -> str:
return f"data:{mime_type};base64,{video_base64}"


def get_image_url_from_path(image_path: Optional[str]) -> str:
def get_image_url_from_path(image_path: str | None) -> str:
"""Convert an image path (local file or URL) to an image URL format for the API.

If image_path is None or empty, returns the default URL.
Expand Down Expand Up @@ -114,7 +113,7 @@ def get_image_url_from_path(image_path: Optional[str]) -> str:
return f"data:{mime_type};base64,{image_base64}"


def get_audio_url_from_path(audio_path: Optional[str]) -> str:
def get_audio_url_from_path(audio_path: str | None) -> str:
"""Convert an audio path (local file or URL) to an audio URL format for the API.

If audio_path is None or empty, returns the default URL.
Expand Down Expand Up @@ -169,7 +168,7 @@ def get_system_prompt():
}


def get_text_query(custom_prompt: Optional[str] = None):
def get_text_query(custom_prompt: str | None = None):
question = (
custom_prompt or "Explain the system architecture for a scalable audio generation pipeline. Answer in 15 words."
)
Expand All @@ -186,10 +185,10 @@ def get_text_query(custom_prompt: Optional[str] = None):


def get_mixed_modalities_query(
video_path: Optional[str] = None,
image_path: Optional[str] = None,
audio_path: Optional[str] = None,
custom_prompt: Optional[str] = None,
video_path: str | None = None,
image_path: str | None = None,
audio_path: str | None = None,
custom_prompt: str | None = None,
):
question = (
custom_prompt or "What is recited in the audio? What is the content of this image? Why is this video funny?"
Expand Down Expand Up @@ -222,7 +221,7 @@ def get_mixed_modalities_query(
return prompt


def get_use_audio_in_video_query(video_path: Optional[str] = None, custom_prompt: Optional[str] = None):
def get_use_audio_in_video_query(video_path: str | None = None, custom_prompt: str | None = None):
question = custom_prompt or "Describe the content of the video, then convert what the baby say into text."
video_url = get_video_url_from_path(video_path)

Expand All @@ -246,7 +245,7 @@ def get_use_audio_in_video_query(video_path: Optional[str] = None, custom_prompt
return prompt


def get_multi_audios_query(audio_path: Optional[str] = None, custom_prompt: Optional[str] = None):
def get_multi_audios_query(audio_path: str | None = None, custom_prompt: str | None = None):
question = custom_prompt or "Are these two audio clips the same?"
audio_url = get_audio_url_from_path(audio_path)
prompt = {
Expand Down
32 changes: 16 additions & 16 deletions examples/online_serving/qwen3_omni/gradio_demo.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import sys
from pathlib import Path
from types import SimpleNamespace
from typing import Any, Optional
from typing import Any

import gradio as gr
import numpy as np
Expand Down Expand Up @@ -178,16 +178,16 @@ def create_prompt_args(base_args: argparse.Namespace) -> SimpleNamespace:


def process_audio_file(
audio_file: Optional[Any],
) -> Optional[tuple[np.ndarray, int]]:
audio_file: Any | None,
) -> tuple[np.ndarray, int] | None:
"""Normalize Gradio audio input to (np.ndarray, sample_rate)."""
if audio_file is None:
return None

sample_rate: Optional[int] = None
audio_np: Optional[np.ndarray] = None
sample_rate: int | None = None
audio_np: np.ndarray | None = None

def _load_from_path(path_str: str) -> Optional[tuple[np.ndarray, int]]:
def _load_from_path(path_str: str) -> tuple[np.ndarray, int] | None:
if not path_str:
return None
path = Path(path_str)
Expand Down Expand Up @@ -240,7 +240,7 @@ def _load_from_path(path_str: str) -> Optional[tuple[np.ndarray, int]]:
return audio_np.astype(np.float32), sample_rate


def process_image_file(image_file: Optional[Image.Image]) -> Optional[Image.Image]:
def process_image_file(image_file: Image.Image | None) -> Image.Image | None:
"""Process image file from Gradio input.

Returns:
Expand All @@ -255,10 +255,10 @@ def process_image_file(image_file: Optional[Image.Image]) -> Optional[Image.Imag


def process_video_file(
video_file: Optional[str],
video_file: str | None,
enable_audio_in_video: bool = False,
max_frames: int = 32,
) -> Optional[tuple[np.ndarray, dict[str, Any], Optional[tuple[np.ndarray, int]]]]:
) -> tuple[np.ndarray, dict[str, Any], tuple[np.ndarray, int] | None] | None:
"""Process video file and optionally extract audio track."""
if video_file is None:
return None
Expand All @@ -275,7 +275,7 @@ def process_video_file(
print(f"Failed to decode video {video_path}: {exc}")
return None

audio_tuple: Optional[tuple[np.ndarray, int]] = None
audio_tuple: tuple[np.ndarray, int] | None = None
if enable_audio_in_video:
try:
import librosa # type: ignore import
Expand All @@ -293,9 +293,9 @@ async def run_inference_async_omni(
sampling_params: list[SamplingParams],
prompt_args_template: SimpleNamespace,
user_prompt: str,
audio_file: Optional[tuple[str, tuple[int, np.ndarray]]] = None,
image_file: Optional[Image.Image] = None,
video_file: Optional[str] = None,
audio_file: tuple[str, tuple[int, np.ndarray]] | None = None,
image_file: Image.Image | None = None,
video_file: str | None = None,
use_audio_in_video: bool = False,
):
"""Run inference using AsyncOmni directly with multimodal support."""
Expand Down Expand Up @@ -426,9 +426,9 @@ def build_interface(

async def run_inference(
user_prompt: str,
audio_file: Optional[tuple[str, tuple[int, np.ndarray]]],
image_file: Optional[Image.Image],
video_file: Optional[str],
audio_file: tuple[str, tuple[int, np.ndarray]] | None,
image_file: Image.Image | None,
video_file: str | None,
use_audio_in_video: bool,
):
return await run_inference_async_omni(
Expand Down
Loading