diff --git a/docs/getting_started/quickstart.md b/docs/getting_started/quickstart.md index c65dd32fb8..695545fa65 100644 --- a/docs/getting_started/quickstart.md +++ b/docs/getting_started/quickstart.md @@ -39,6 +39,8 @@ Text-to-image generation quickstart with vLLM-Omni: from vllm_omni.entrypoints.omni import Omni if __name__ == "__main__": + # Use Z-Image-Turbo for fast inference (8 steps) + # or Z-Image (Base) for higher quality (28-50 steps with CFG support) omni = Omni(model="Tongyi-MAI/Z-Image-Turbo") prompt = "a cup of coffee on the table" outputs = omni.generate(prompt) @@ -59,8 +61,9 @@ You can pass a list of prompts and wait for them to process altogether, shown be from vllm_omni.entrypoints.omni import Omni if __name__ == "__main__": + # For batch inference with Z-Image models omni = Omni( - model="Tongyi-MAI/Z-Image-Turbo", + model="Tongyi-MAI/Z-Image-Turbo", # or "Tongyi-MAI/Z-Image" for Base # stage_configs_path="./stage-config.yaml", # See below ) prompts = [ @@ -93,7 +96,11 @@ For more usages, please refer to [offline inference](../user_guide/examples/offl Text-to-image generation quickstart with vLLM-Omni: ```bash +# Fast inference with Turbo (8 steps, no CFG) vllm serve Tongyi-MAI/Z-Image-Turbo --omni --port 8091 + +# Or use Base model for higher quality (28-50 steps, CFG support) +# vllm serve Tongyi-MAI/Z-Image --omni --port 8091 ``` ```bash diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 0ba642c923..ed63bf3247 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -26,7 +26,7 @@ th { | `QwenImageEditPlusPipeline` | Qwen-Image-Edit-2509 | `Qwen/Qwen-Image-Edit-2509` | | `QwenImageLayeredPipeline` | Qwen-Image-Layered | `Qwen/Qwen-Image-Layered` | | `GlmImagePipeline` | GLM-Image | `zai-org/GLM-Image` | -|`ZImagePipeline` | Z-Image | `Tongyi-MAI/Z-Image-Turbo` | +|`ZImagePipeline` | Z-Image | `Tongyi-MAI/Z-Image` (Base), `Tongyi-MAI/Z-Image-Turbo` | | `WanPipeline` | Wan2.2-T2V, 
Wan2.2-TI2V | `Wan-AI/Wan2.2-T2V-A14B-Diffusers`, `Wan-AI/Wan2.2-TI2V-5B-Diffusers` | | `WanImageToVideoPipeline` | Wan2.2-I2V | `Wan-AI/Wan2.2-I2V-A14B-Diffusers` | | `OvisImagePipeline` | Ovis-Image | `OvisAI/Ovis-Image` | @@ -60,7 +60,7 @@ th { | `QwenImageEditPlusPipeline` | Qwen-Image-Edit-2509 | `Qwen/Qwen-Image-Edit-2509` | | `QwenImageLayeredPipeline` | Qwen-Image-Layered | `Qwen/Qwen-Image-Layered` | | `QwenImageEditPlusPipeline` | Qwen-Image-Edit-2511 | `Qwen/Qwen-Image-Edit-2511` | -|`ZImagePipeline` | Z-Image | `Tongyi-MAI/Z-Image-Turbo` | +|`ZImagePipeline` | Z-Image | `Tongyi-MAI/Z-Image` (Base), `Tongyi-MAI/Z-Image-Turbo` | |`LongcatImagePipeline` | LongCat-Image | `meituan-longcat/LongCat-Image` | |`Flux2KleinPipeline` | FLUX.2-klein | `black-forest-labs/FLUX.2-klein-4B`, `black-forest-labs/FLUX.2-klein-9B` | |`Qwen3TTSForConditionalGeneration` | Qwen3-TTS-12Hz-1.7B-CustomVoice | `Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice` | diff --git a/examples/offline_inference/text_to_image/README.md b/examples/offline_inference/text_to_image/README.md index 9c57a621cf..6f544b074b 100644 --- a/examples/offline_inference/text_to_image/README.md +++ b/examples/offline_inference/text_to_image/README.md @@ -1,8 +1,9 @@ # Text-To-Image -This folder provides several entrypoints for experimenting with `Qwen/Qwen-Image` `Qwen/Qwen-Image-2512` `Tongyi-MAI/Z-Image-Turbo` using vLLM-Omni: +This folder provides several entrypoints for experimenting with `Qwen/Qwen-Image`, `Qwen/Qwen-Image-2512`, `Tongyi-MAI/Z-Image` (Base), and `Tongyi-MAI/Z-Image-Turbo` using vLLM-Omni: - `text_to_image.py`: command-line script for single image generation with advanced options. +- `z_image_examples.py`: comparison examples showing Z-Image Base vs Turbo usage. - `web_demo.py`: lightweight Gradio UI for interactive prompt/seed/CFG exploration. Note that when you pass in multiple independent prompts, they will be processed sequentially. Batching requests is currently not supported. 
@@ -74,17 +75,33 @@ if __name__ == "__main__": ## Local CLI Usage +### Z-Image Turbo (Fast Inference) ```bash python text_to_image.py \ --model Tongyi-MAI/Z-Image-Turbo \ --prompt "a cup of coffee on the table" \ --seed 42 \ - --cfg_scale 4.0 \ --num_images_per_prompt 1 \ - --num_inference_steps 50 \ + --num_inference_steps 8 \ + --guidance_scale 0.0 \ --height 1024 \ --width 1024 \ - --output outputs/coffee.png + --output outputs/coffee_turbo.png +``` + +### Z-Image Base (High Quality with CFG) +```bash +python text_to_image.py \ + --model Tongyi-MAI/Z-Image \ + --prompt "a cup of coffee on the table" \ + --negative_prompt "blurry, low quality, distorted" \ + --seed 42 \ + --num_images_per_prompt 1 \ + --num_inference_steps 50 \ + --guidance_scale 4.0 \ + --height 1280 \ + --width 720 \ + --output outputs/coffee_base.png ``` Key arguments: @@ -103,6 +120,26 @@ Key arguments: > ℹ️ If you encounter OOM errors, try using `--vae_use_slicing` and `--vae_use_tiling` to reduce memory usage. +## Z-Image Base vs Turbo Comparison + +For detailed comparison and usage examples of both Z-Image variants, see: + +```bash +python z_image_examples.py --example all +``` + +Key differences: + +| Feature | Z-Image Base | Z-Image Turbo | +|---------|--------------|---------------| +| Model | `Tongyi-MAI/Z-Image` | `Tongyi-MAI/Z-Image-Turbo` | +| Inference Steps | 28-50 (default: 50) | 8 | +| CFG Support | ✅ Yes (guidance_scale 3.0-5.0) | ❌ Must use 0.0 | +| Negative Prompts | ✅ Supported | ❌ Not supported | +| Fine-tunable | ✅ Yes | ❌ No (distilled) | +| Scheduler Shift | 6.0 | 3.0 | +| Best For | High quality, fine-tuning | Fast iteration, speed | + > ℹ️ Qwen-Image currently publishes best-effort presets at `1328x1328`, `1664x928`, `928x1664`, `1472x1140`, `1140x1472`, `1584x1056`, and `1056x1584`. Adjust `--height/--width` accordingly for the most reliable outcomes. 
## Web UI Demo diff --git a/examples/offline_inference/text_to_image/text_to_image.py b/examples/offline_inference/text_to_image/text_to_image.py index a79e5d640d..4f60b0b165 100644 --- a/examples/offline_inference/text_to_image/text_to_image.py +++ b/examples/offline_inference/text_to_image/text_to_image.py @@ -21,7 +21,7 @@ def parse_args() -> argparse.Namespace: "--model", default="Qwen/Qwen-Image", help="Diffusion model name or local path. Supported models: " - "Qwen/Qwen-Image, Tongyi-MAI/Z-Image-Turbo, Qwen/Qwen-Image-2512", + "Qwen/Qwen-Image, Tongyi-MAI/Z-Image (Base), Tongyi-MAI/Z-Image-Turbo, Qwen/Qwen-Image-2512", ) parser.add_argument("--prompt", default="a cup of coffee on the table", help="Text prompt for image generation.") parser.add_argument( diff --git a/examples/offline_inference/text_to_image/z_image_examples.py b/examples/offline_inference/text_to_image/z_image_examples.py new file mode 100755 index 0000000000..d01b1c8ff6 --- /dev/null +++ b/examples/offline_inference/text_to_image/z_image_examples.py @@ -0,0 +1,167 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +""" +Z-Image Base vs Turbo Comparison Examples + +This script demonstrates the differences between Z-Image Base and Z-Image Turbo models. 
+ +Key Differences: +- Z-Image Base: Foundation model with full CFG support, fine-tunable, 28-50 steps +- Z-Image Turbo: Distilled model optimized for speed, 8 steps, guidance_scale must be 0.0 +""" + +from vllm_omni.entrypoints.omni import Omni +from vllm_omni.inputs.data import OmniDiffusionSamplingParams + + +def z_image_base_example(): + """ + Z-Image Base - High Quality Generation + + Features: + - Full CFG support (guidance_scale 3.0-5.0) + - Negative prompts work + - Fine-tunable + - 28-50 inference steps (default 50) + - Scheduler shift: 6.0 + """ + print("\n=== Z-Image Base (High Quality) ===") + + omni_base = Omni(model="Tongyi-MAI/Z-Image") + + outputs_base = omni_base.generate( + { + "prompt": "a majestic mountain landscape at sunset, detailed, photorealistic", + "negative_prompt": "blurry, low quality, distorted, oversaturated", + }, + OmniDiffusionSamplingParams( + height=1280, + width=720, + num_inference_steps=50, + guidance_scale=4.0, + seed=42, + ), + ) + + images = outputs_base[0].request_output[0].images + images[0].save("z_image_base_output.png") + print("Saved to: z_image_base_output.png") + print(f"Generated {len(images)} image(s) with 50 steps and CFG=4.0") + + +def z_image_turbo_example(): + """ + Z-Image Turbo - Fast Inference + + Features: + - Optimized for speed + - guidance_scale MUST be 0.0 (no CFG) + - Negative prompts not supported + - 8 inference steps + - Scheduler shift: 3.0 + """ + print("\n=== Z-Image Turbo (Fast) ===") + + omni_turbo = Omni(model="Tongyi-MAI/Z-Image-Turbo") + + outputs_turbo = omni_turbo.generate( + "a majestic mountain landscape at sunset, detailed, photorealistic", + OmniDiffusionSamplingParams( + height=1024, + width=1024, + num_inference_steps=8, + guidance_scale=0.0, # MUST be 0.0 for Turbo! 
+ seed=42, + ), + ) + + images = outputs_turbo[0].request_output[0].images + images[0].save("z_image_turbo_output.png") + print("Saved to: z_image_turbo_output.png") + print(f"Generated {len(images)} image(s) with 8 steps (no CFG)") + + +def batch_inference_example(): + """ + Batch inference with Z-Image Base + + Note: Batch processing depends on max_batch_size in stage configs. + By default, diffusion models process one prompt at a time. + """ + print("\n=== Batch Inference Example ===") + + omni = Omni(model="Tongyi-MAI/Z-Image") + + prompts = [ + {"prompt": "a cup of coffee on a wooden table", "negative_prompt": "blurry, low quality"}, + {"prompt": "a cat sleeping on a cozy blanket", "negative_prompt": "blurry, low quality"}, + {"prompt": "a futuristic city skyline at night", "negative_prompt": "blurry, low quality"}, + ] + + # Note: These will be processed sequentially unless max_batch_size > 1 + outputs = omni.generate( + prompts, + OmniDiffusionSamplingParams( + height=1024, + width=1024, + num_inference_steps=40, + guidance_scale=4.0, + seed=42, + ), + ) + + for i, output in enumerate(outputs): + image = output.request_output[0].images[0] + image.save(f"batch_output_{i}.png") + print(f"Saved to: batch_output_{i}.png") + + +def recommended_settings(): + """ + Print recommended settings for both models + """ + print("\n=== Recommended Settings ===\n") + + print("Z-Image Base (Tongyi-MAI/Z-Image):") + print(" - num_inference_steps: 28-50 (default: 50)") + print(" - guidance_scale: 3.0-5.0 (default: 4.0)") + print(" - negative_prompt: Supported and recommended") + print(" - resolution: 1280x720 or 720x1280") + print(" - cfg_normalization: False (default)") + print(" - Use when: Quality is priority, fine-tuning needed") + + print("\nZ-Image Turbo (Tongyi-MAI/Z-Image-Turbo):") + print(" - num_inference_steps: 8") + print(" - guidance_scale: 0.0 (REQUIRED)") + print(" - negative_prompt: Not supported") + print(" - resolution: 1024x1024") + print(" - Use when: Speed 
is priority, quick iterations") + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser(description="Z-Image Base vs Turbo comparison examples") + parser.add_argument( + "--example", + choices=["base", "turbo", "batch", "all"], + default="all", + help="Which example to run (default: all)", + ) + + args = parser.parse_args() + + recommended_settings() + + if args.example in ("base", "all"): + z_image_base_example() + + if args.example in ("turbo", "all"): + z_image_turbo_example() + + if args.example in ("batch", "all"): + batch_inference_example() + + print("\nDone!")