From 22296c2ef4933c8aaa329d20c5aae34b50964056 Mon Sep 17 00:00:00 2001 From: "Huang, Zeyu" <11222265+fhfuih@users.noreply.github.com> Date: Thu, 12 Mar 2026 16:07:39 +0800 Subject: [PATCH] L4 test for t2i doc examples (rebased til 0319 fix DCO) Signed-off-by: Huang, Zeyu <11222265+fhfuih@users.noreply.github.com> --- .buildkite/test-nightly.yml | 44 ++- docs/contributing/ci/.nav.yaml | 6 + docs/contributing/ci/CI_5levels.md | 155 ++++---- .../ci/test_examples/doc_example_tests.inc.md | 49 +++ docs/contributing/ci/tests_style.md | 6 + .../offline_inference/text_to_image/README.md | 253 ++++++++----- pyproject.toml | 4 +- tests/examples/conftest.py | 338 ++++++++++++++++++ tests/examples/offline_inference/__init__.py | 0 .../offline_inference/test_text_to_image.py | 38 ++ tests/examples/online_serving/__init__.py | 0 .../online_serving/test_qwen2_5_omni.py | 32 +- .../online_serving/test_qwen3_omni.py | 32 +- .../online_serving/test_text_to_image.py | 136 +++++++ 14 files changed, 872 insertions(+), 221 deletions(-) create mode 100644 docs/contributing/ci/.nav.yaml create mode 100644 docs/contributing/ci/test_examples/doc_example_tests.inc.md create mode 100644 tests/examples/conftest.py create mode 100644 tests/examples/offline_inference/__init__.py create mode 100644 tests/examples/offline_inference/test_text_to_image.py create mode 100644 tests/examples/online_serving/__init__.py create mode 100644 tests/examples/online_serving/test_text_to_image.py diff --git a/.buildkite/test-nightly.yml b/.buildkite/test-nightly.yml index 53a4f180c59..f9bce484bec 100644 --- a/.buildkite/test-nightly.yml +++ b/.buildkite/test-nightly.yml @@ -70,7 +70,7 @@ steps: - label: ":full_moon: Diffusion Model Test with H100" timeout_in_minutes: 60 depends_on: upload-nightly-pipeline - # if: build.env("NIGHTLY") == "1" + if: build.env("NIGHTLY") == "1" commands: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - pytest -s -v tests/e2e/online_serving/test_*_expansion.py -m "advanced_model and 
diffusion and H100" --run-level "advanced_model" @@ -108,6 +108,48 @@ steps: path: /mnt/hf-cache type: DirectoryOrCreate + - label: ":full_moon: Documentation Example Code Test with H100" + timeout_in_minutes: 60 + depends_on: upload-nightly-pipeline + if: build.env("NIGHTLY") == "1" + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - export VLLM_TEST_CLEAN_GPU_MEMORY="1" + - pytest -s -v tests/examples/online_serving/test_text_to_image.py tests/examples/offline_inference/test_text_to_image.py -m "advanced_model and example and H100" --run-level "advanced_model" + agents: + queue: "mithril-h100-pool" + plugins: + - kubernetes: + podSpec: + containers: + - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT + resources: + limits: + nvidia.com/gpu: 2 + volumeMounts: + - name: devshm + mountPath: /dev/shm + - name: hf-cache + mountPath: /root/.cache/huggingface + env: + - name: HF_HOME + value: /root/.cache/huggingface + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + node.kubernetes.io/instance-type: gpu-h100-sxm + volumes: + - name: devshm + emptyDir: + medium: Memory + - name: hf-cache + hostPath: + path: /mnt/hf-cache + type: DirectoryOrCreate + - label: ":full_moon: Qwen3-TTS Non-Async-Chunk E2E Test" timeout_in_minutes: 30 depends_on: upload-nightly-pipeline diff --git a/docs/contributing/ci/.nav.yaml b/docs/contributing/ci/.nav.yaml new file mode 100644 index 00000000000..0f187f3a15d --- /dev/null +++ b/docs/contributing/ci/.nav.yaml @@ -0,0 +1,6 @@ +nav: + - CI_5levels.md + - failures.md + - test_guide.md + - test_markers.md + - tests_style.md diff --git a/docs/contributing/ci/CI_5levels.md b/docs/contributing/ci/CI_5levels.md index 03b907f3239..1f9e6a1882f 100644 --- a/docs/contributing/ci/CI_5levels.md +++ b/docs/contributing/ci/CI_5levels.md @@ -545,97 +545,104 @@ L4 level testing is a comprehensive quality audit before a 
version release. It e - ***Trigger Timing***: **`Nightly`**, automatically executed every night. - ***Execution Environment***: ***GPU*** server clusters to meet the resource demands of performance testing. - ***Script Example***: -
- Test Examples -When you want to add L4-level performance test cases, you can refer to the following format for case addition in tests/perf/tests/test.json: - -```JSON -{ - "test_name": "test_qwen3_omni", - "server_params": { - "model": "Qwen/Qwen3-Omni-30B-A3B-Instruct", - "stage_config_name": "qwen3_omni.yaml" - }, - "benchmark_params": [ - { - "dataset_name": "random", - "num_prompts": [10, 20], - "request_rate": [0.5, 1], - "random_input_len": 2500, - "random_output_len": 900, - "ignore_eos": true, - "percentile-metrics": "ttft,tpot,itl,e2el,audio_rtf,audio_ttfp,audio_duration", - "baseline": { - "mean_ttft_ms": 100000, - "mean_audio_ttfp_ms": 100000, - "mean_audio_rtf": 100000 + +???+ example "Test Examples" + + When adding L4-level ***documentation example Tests***, please pay attention to the following guides. + + --8<-- "docs/contributing/ci/test_examples/doc_example_tests.inc.md" + + When you want to add L4-level ***performance test*** cases, you can refer to the following format for case addition in tests/perf/tests/test.json: + + ```JSON + { + "test_name": "test_qwen3_omni", + "server_params": { + "model": "Qwen/Qwen3-Omni-30B-A3B-Instruct", + "stage_config_name": "qwen3_omni.yaml" + }, + "benchmark_params": [ + { + "dataset_name": "random", + "num_prompts": [10, 20], + "request_rate": [0.5, 1], + "random_input_len": 2500, + "random_output_len": 900, + "ignore_eos": true, + "percentile-metrics": "ttft,tpot,itl,e2el,audio_rtf,audio_ttfp,audio_duration", + "baseline": { + "mean_ttft_ms": 100000, + "mean_audio_ttfp_ms": 100000, + "mean_audio_rtf": 100000 + } } - } - ] -} -``` + ] + } + ``` -#### Parameter Explanation + **Parameter Explanation** -***Overview*** + *Overview* -| Field | Required | Description | -| ---------------- | -------- | --------------------------------------------------------------- | -| test_name | Yes | Unique identifier for the test case | -| server_params | Yes | Server-side configuration parameters | -| benchmark_params | Yes | 
Benchmark running parameters (supports multiple configurations) | + | Field | Required | Description | + | ---------------- | -------- | --------------------------------------------------------------- | + | test_name | Yes | Unique identifier for the test case | + | server_params | Yes | Server-side configuration parameters | + | benchmark_params | Yes | Benchmark running parameters (supports multiple configurations) | -#### server_params Configuration + **server_params Configuration** -##### Basic Parameters + *Basic Parameters* -| Parameter | Required | Example | Description | -| ----------------- | -------- | ---------------------------------- | ----------------------------- | -| model | Yes | "Qwen/Qwen3-Omni-30B-A3B-Instruct" | Model name or path | -| stage_config_name | Yes | "qwen3_omni.yaml" | Stage configuration file name | + | Parameter | Required | Example | Description | + | ----------------- | -------- | ---------------------------------- | ----------------------------- | + | model | Yes | "Qwen/Qwen3-Omni-30B-A3B-Instruct" | Model name or path | + | stage_config_name | Yes | "qwen3_omni.yaml" | Stage configuration file name | -##### Dynamic Configuration (update/delete) + *Dynamic Configuration (update/delete)* -Supports incremental modifications based on the basic configuration: + Supports incremental modifications based on the basic configuration: -| Operation | Description | -| --------- | ------------------------------------ | -| update | Update or add configuration items | -| delete | Delete specified configuration items | + | Operation | Description | + | --------- | ------------------------------------ | + | update | Update or add configuration items | + | delete | Delete specified configuration items | -***Example***: -``` -"update": { - "async_chunk": true, // Enable asynchronous chunk processing - "stage_args": { - "0": { - "engine_args.custom_process_next_stage_input_func": 
"vllm_omni.model_executor.stage_input_processors.qwen3_omni.thinker2talker_async_chunk" + ***Example***: + + ``` + "update": { + "async_chunk": true, // Enable asynchronous chunk processing + "stage_args": { + "0": { + "engine_args.custom_process_next_stage_input_func": "vllm_omni.model_executor.stage_input_processors.qwen3_omni.thinker2talker_async_chunk" + } + } + }, + "delete": { + "stage_args": { + "2": ["custom_process_input_func"] // Delete this configuration for stage 2 } } -}, -"delete": { - "stage_args": { - "2": ["custom_process_input_func"] // Delete this configuration for stage 2 - } -} -``` -#### benchmark_params Configuration + ``` -You can add any benchmark running parameters you need here. For all optional parameters, refer to the [benchmark documentation](https://github.com/vllm-project/vllm-omni/blob/main/docs/cli/bench/serve.md). General modifications are as follows: + **benchmark_params Configuration** -1. Change the ---xxx-xx-xx running parameters to xxx_xx_xx format and fill them as keys in the JSON file. -2. For boolean variables in the running parameters, modify them to forms such as ignore_eos: true/false and fill them into the JSON file. -3. Add the baseline parameter to specify the required validation values, ensuring the validation metric names match those in the result.json generated by the benchmark. -4. The qps and concurrency modes are mutually exclusive. For detailed explanations, see the table below: + You can add any benchmark running parameters you need here. For all optional parameters, refer to the [benchmark documentation](https://github.com/vllm-project/vllm-omni/blob/main/docs/cli/bench/serve.md). 
General modifications are as follows: -| Parameter | Type | Required | Example/Values | Description | -| --------------- | ----------- | -------- | --------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| num_prompts | int / array | Yes | 10,[10, 20, 30] | Number of requests. Supports single values or arrays. If a single value is used, it will be automatically expanded to match the number of qps or max_concurrency, e.g., [10,10,10]. If an array is used, its length must match the number of qps or max_concurrency. | -| request_rate | int / array | No | 1, [1, 2, 3] | Queries per second. Supports single values or arrays. If a single value is used, it will be automatically expanded to match the number of num_prompts, e.g., [1,1,1]. If an array is used, its length must match the number of num_prompts. | -| max_concurrency | int / array | No | 1, [1, 2, 3] | Queries per second. Supports single values or arrays. If a single value is used, it will be automatically expanded to match the number of num_prompts, e.g., [1,1,1]. If an array is used, its length must match the number of num_prompts. | -
+ 1. Change the ---xxx-xx-xx running parameters to xxx_xx_xx format and fill them as keys in the JSON file. + 2. For boolean variables in the running parameters, modify them to forms such as ignore_eos: true/false and fill them into the JSON file. + 3. Add the baseline parameter to specify the required validation values, ensuring the validation metric names match those in the result.json generated by the benchmark. + 4. The qps and concurrency modes are mutually exclusive. For detailed explanations, see the table below: + + | Parameter | Type | Required | Example/Values | Description | + | --------------- | ----------- | -------- | --------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | + | num_prompts | int / array | Yes | 10,[10, 20, 30] | Number of requests. Supports single values or arrays. If a single value is used, it will be automatically expanded to match the number of qps or max_concurrency, e.g., [10,10,10]. If an array is used, its length must match the number of qps or max_concurrency. | + | request_rate | int / array | No | 1, [1, 2, 3] | Queries per second. Supports single values or arrays. If a single value is used, it will be automatically expanded to match the number of num_prompts, e.g., [1,1,1]. If an array is used, its length must match the number of num_prompts. | + | max_concurrency | int / array | No | 1, [1, 2, 3] | Maximum number of concurrent requests. Supports single values or arrays. If a single value is used, it will be automatically expanded to match the number of num_prompts, e.g., [1,1,1]. If an array is used, its length must match the number of num_prompts. | + -- - ***Run Command***: (Specific commands would depend on the performance testing tool and configuration defined in `nightly.json`). 
+ - - ***Run Command***: (Specific commands would depend on the performance testing tool and configuration defined in `nightly.json`). ## Chapter 4: L5 Level Testing - Stability and Reliability Testing diff --git a/docs/contributing/ci/test_examples/doc_example_tests.inc.md b/docs/contributing/ci/test_examples/doc_example_tests.inc.md new file mode 100644 index 00000000000..13dd032e275 --- /dev/null +++ b/docs/contributing/ci/test_examples/doc_example_tests.inc.md @@ -0,0 +1,49 @@ +**Preferred Test Strategy** + +Use one of the following patterns depending on page type: + +- **Dynamic code-block extraction (preferred for offline docs)** + - Extract Python/Bash code blocks with a markdown AST analyzer, then execute them directly in tests. + - Benefit: test logic stays automatically aligned with docs. + - Basic idea: Use `ReadmeSnippet.extract_readme_snippets` to extract a list of code blocks as a global variable in the test file, + use this list as `pytest.mark.parametrize` parameters, and pass each snippet item to `example_runner.run` inside the parametrized test. + Additionally pass an `output_subfolder` argument for the 2nd-level output folder explained in **Output Directory Structure** below. + If any extra environment variable is needed for a test (e.g., the example script reads it), `example_runner.run` also accepts a 3rd `env` parameter. + - See [tests/examples/offline_inference/test_text_to_image.py](https://github.com/vllm-project/vllm-omni/blob/main/tests/examples/offline_inference/test_text_to_image.py) for a reference implementation. + +- **Explicit copied scripts (used by online docs for now until further update)** + - For online serving pages, it is acceptable to copy code from docs into dedicated test functions, because only client-side, request-sending scripts are tested. + - Benefit: avoids dynamic extraction, which is overly complex here because it would need to tell server-launch scripts apart from client-request scripts. + - Requirement: copied test code must be kept in sync with doc updates. 
+ +**Test Case Naming Convention** + +- Dynamic code extraction (auto-generated internally): + - `test_{single_function_name_matching_file_name}[h2_heading_00X]` + - Example: `test_text_to_image[basic_usage_001]` +- Explicit copied scripts: + - `test_{h2_heading_00X}[{dummy_param_id_for_omni_server}]` + - Example: `test_api_calls_001[omni_server0]` + +**Runtime Configuration** + +In the example code tests, do **not** reduce `num_inference_steps` just to speed up the tests unless there is a strong CI reliability reason to do otherwise. + +**Skipping Rules** + +You may skip examples falling in the following categories using `pytest.mark.skip` or `pytest.skip`: + +- Gradio UI scripts +- Scenarios that significantly overlap with existing tests and add little new coverage. + +**Output Directory Structure** + +Use a three-layer output structure to store output artifacts: + +1. Root output directory + - Taken from the `OUTPUT_DIR` env var (relative paths are resolved against the repository root) or auto-generated under the system temp directory (e.g., `/tmp`). +2. Doc-page directory + - Define and use a clear page-level folder name in each `test_*.py` yourself (abbreviations are acceptable, e.g., `example_offline_t2i`). +3. Test-case directory + - Must match the case identifier (e.g., `basic_usage_001`). + - Auto-generated for dynamically extracted tests. diff --git a/docs/contributing/ci/tests_style.md b/docs/contributing/ci/tests_style.md index 5d642fdb95e..53775520315 100644 --- a/docs/contributing/ci/tests_style.md +++ b/docs/contributing/ci/tests_style.md @@ -157,6 +157,12 @@ vllm_omni/ tests/ ├── qwen3_omni_ci.yaml ├── bagel_*.yaml └── npu/, rocm/, etc. 
+examples/ tests +│ └── examples +├── online_serving/ → ├── online_serving/ +│ └── {doc_page_title}/README.md │ └── test_{doc_page_title}.py ⬜ +└── offline_inference/ → └── offline_inference/ + └── {doc_page_title}/README.md └── test_{doc_page_title}.py ⬜ ``` diff --git a/examples/offline_inference/text_to_image/README.md b/examples/offline_inference/text_to_image/README.md index 0de89c753c4..7019bd47bd7 100644 --- a/examples/offline_inference/text_to_image/README.md +++ b/examples/offline_inference/text_to_image/README.md @@ -1,97 +1,129 @@ # Text-To-Image -This folder provides several entrypoints for experimenting with `Qwen/Qwen-Image` `Qwen/Qwen-Image-2512` `Tongyi-MAI/Z-Image-Turbo` `stepfun-ai/NextStep-1.1` using vLLM-Omni, note that NextStep-1.1 has different architecture so we treat it differently regarding running arguments and pipeline. +Generate images from text prompts using vLLM-Omni's diffusion pipeline entrypoints. - `text_to_image.py`: command-line script for single image generation with advanced options. -- `web_demo.py`: lightweight Gradio UI for interactive prompt/seed/CFG exploration. +- `gradio_demo.py`: lightweight Gradio UI for interactive prompt/seed/CFG exploration. -Note that when you pass in multiple independent prompts, they will be processed sequentially. Batching requests is currently not supported. +## Table of Contents -## Basic Usage +- [Overview](#overview) +- [Quick Start](#quick-start) +- [Key Arguments](#key-arguments) +- [More CLI Examples](#more-cli-examples) +- [Web UI Demo](#web-ui-demo) -```python -from vllm_omni.entrypoints.omni import Omni +## Overview -if __name__ == "__main__": - omni = Omni(model="Qwen/Qwen-Image") - prompt = "a cup of coffee on the table" - outputs = omni.generate(prompt) - images = outputs[0].request_output.images - images[0].save("coffee.png") -``` +This folder provides several entrypoints for experimenting with text-to-image diffusion models using vLLM-Omni. 
Note that `NextStep-1.1` has a different architecture, so it is treated differently regarding running arguments and pipeline. -Or put more than one prompt in a request. +### Supported Models -```python -from vllm_omni.entrypoints.omni import Omni - -if __name__ == "__main__": - omni = Omni(model="Qwen/Qwen-Image") - prompts = [ - "a cup of coffee on a table", - "a toy dinosaur on a sandy beach", - "a fox waking up in bed and yawning", - ] - outputs = omni.generate(prompts) - for i, output in enumerate(outputs): - image = output.request_output.images[0].save(f"{i}.jpg") -``` +| Model | Image Shape | Peak VRAM (GiB) * | Model Weights (GiB) | +| ----- | ----------- | ----------- | ----------------- | +| `Qwen/Qwen-Image` | 1024 x 1024 | 60.0 | 53.7 | +| `Qwen/Qwen-Image-2512` |1024 x 1024 | 60.0 | 53.7 | +| `Tongyi-MAI/Z-Image-Turbo` | 1024 x 1024 | 24.8 | 19.2 | +| `stepfun-ai/NextStep-1.1` | 512 x 512 | 71.8 | 28.1 | +| `meituan-longcat/LongCat-Image` | 1024 x 1024 | 71.2 | 27.3 | +| `AIDC-AI/Ovis-Image-7B` | 1024 x 1024 | 71.8 | 17.1 | +| `OmniGen2/OmniGen2` | 1024 x 1024 | 20.1 | 14.7 | +| `stabilityai/stable-diffusion-3.5-medium` | 1024 x 1024 | 20.1 | 15.6 | +| `black-forest-labs/FLUX.1-dev` | 1024 x 1024 | 77.6 | 31.4 | +| `black-forest-labs/FLUX.2-klein-4B` | 1024 x 1024 | 72.7 | 14.9 | +| `black-forest-labs/FLUX.2-klein-9B` | 1024 x 1024 | 37.1 | 32.3 | +| `black-forest-labs/FLUX.2-dev` | 1024 x 1024 | 65.7 | >80 (CPU offload required) | !!! info +*Peak VRAM: based on basic single-card usage, batch size =1, without any acceleration/optimization features. FLUX.2-dev requires `--enable-cpu-offload` on a single 80 GiB GPU. - However, it is not currently recommended to do so - because not all models support batch inference, - and batch requesting mostly does not provide significant performance improvement (despite the impression that it does). - This feature is primarily for the sake of interface compatibility with vLLM and to allow for future improvements. 
+Default model: `Qwen/Qwen-Image` -!!! info +## Quick Start - For diffusion pipelines, the stage config field `stage_args.[].runtime.max_batch_size` is 1 by default, and the input - list is sliced into single-item requests before feeding into the diffusion pipeline. For models that do internally support - batched inputs, you can [modify this configuration](../../../configuration/stage_configs.md) to let the model accept a longer batch of prompts. +### Python API -Apart from string prompt, vLLM-Omni also supports dictionary prompts in the same style as vLLM. -This is useful for models that support negative prompts. +Single-prompt generation: ```python from vllm_omni.entrypoints.omni import Omni if __name__ == "__main__": omni = Omni(model="Qwen/Qwen-Image") - outputs = omni.generate([ - { - "prompt": "a cup of coffee on a table", - "negative_prompt": "low resolution" - }, - { - "prompt": "a toy dinosaur on a sandy beach", - "negative_prompt": "cinematic, realistic" - } - ]) - for i, output in enumerate(outputs): - image = output.request_output.images[0].save(f"{i}.jpg") + prompt = "a cup of coffee on the table" + outputs = omni.generate(prompt) + images = outputs[0].request_output.images + images[0].save("coffee.png") ``` -## Local CLI Usage +### Local CLI Usage -### Qwen/Tongyi Models +```bash +python text_to_image.py \ + --model Qwen/Qwen-Image \ + --prompt "a cup of coffee on the table" \ + --output coffee.png +``` + +## Key Arguments + +**Common arguments:** + +| Argument | Type | Default | Description | +| -------- | ---- | ------- | ----------- | +| `--prompt` | str | `"a cup of coffee on the table"` | Text description for image generation | +| `--seed` | int | `142` | Integer seed for deterministic sampling | +| `--negative-prompt` | str | `None` | Negative prompt for classifier-free conditional guidance | +| `--cfg-scale` | float | `4.0` | True CFG scale (model-specific guidance strength) | +| `--guidance-scale` | float | `1.0` | Classifier-free guidance 
scale | +| `--num-images-per-prompt` | int | `1` | Number of images per prompt (saved as `output`, `output_1`, ...) | +| `--num-inference-steps` | int | `50` | Diffusion sampling steps (more steps = higher quality, slower) | +| `--height` | int | `1024` | Output image height in pixels | +| `--width` | int | `1024` | Output image width in pixels | +| `--output` | str | `"qwen_image_output.png"` | Path to save the generated image | +| `--vae-use-slicing` | flag | off | Enable VAE slicing for memory optimization | +| `--vae-use-tiling` | flag | off | Enable VAE tiling for memory optimization | +| `--cfg-parallel-size` | int | `1` | Set to `2` to enable CFG Parallel | +| `--enable-cpu-offload` | flag | off | Enable CPU offloading for diffusion models | +| `--lora-path` | str | — | Path to PEFT LoRA adapter folder | +| `--lora-scale` | float | `1.0` | Scale factor for LoRA weights | + +**NextStep-1.1 specific arguments:** + +| Argument | Type | Default | Description | +| -------- | ---- | ------- | ----------- | +| `--guidance-scale-2` | float | `1.0` | Secondary guidance scale (e.g. image-level CFG) | +| `--timesteps-shift` | float | `1.0` | Timesteps shift parameter for sampling | +| `--cfg-schedule` | str | `"constant"` | CFG schedule type: `"constant"` or `"linear"` | +| `--use-norm` | flag | off | Apply layer normalization to sampled tokens | + +> If you encounter OOM errors, try using `--vae-use-slicing` and `--vae-use-tiling` to reduce memory usage. + +> Qwen-Image currently publishes best-effort presets at `1328x1328`, `1664x928`, `928x1664`, `1472x1140`, `1140x1472`, `1584x1056`, and `1056x1584`. Adjust `--height/--width` accordingly for the most reliable outcomes. 
+ +## More CLI Examples + +### Tongyi Models ```bash python text_to_image.py \ --model Tongyi-MAI/Z-Image-Turbo \ --prompt "a cup of coffee on the table" \ --seed 42 \ - --cfg-scale 4.0 \ + --guidance-scale 0.0 \ --num-images-per-prompt 1 \ - --num-inference-steps 50 \ + --num-inference-steps 9 \ --height 1024 \ --width 1024 \ --output outputs/coffee.png ``` +`Tongyi-MAI/Z-Image-Turbo` is a distilled version of Z-Image. Distilled diffusion models usually require fewer inference steps (4~9), and Classifier-Free Guidance (CFG) is usually NOT applied. Similar distilled models are `black-forest-labs/FLUX.2-klein-4B` and `black-forest-labs/FLUX.2-klein-9B`. + ### NextStep Models -NextStep-1.1 can have extra arguments +NextStep-1.1 supports extra arguments for dual-level CFG control: + ```bash python text_to_image.py \ --model stepfun-ai/NextStep-1.1 \ @@ -106,8 +138,10 @@ python text_to_image.py \ --seed 42 ``` -### Flux.2-dev Models -To start Flux.2-dev with a single GPU, cpu-offload must be enabled because the total size of its weights exceeds the 80GB memory capacity of the GPU. +### FLUX.2-dev Models + +To run FLUX.2-dev on a single GPU, `--enable-cpu-offload` is required because the model weights exceed 80 GiB: + ```bash python examples/offline_inference/text_to_image/text_to_image.py \ --model black-forest-labs/FLUX.2-dev \ @@ -123,38 +157,82 @@ python examples/offline_inference/text_to_image/text_to_image.py \ --output flux2-dev.png ``` -### Key Arguments +### Batch Requests (Multiple Prompts) -**Common arguments:** +You can pass multiple prompts in a single `generate` call. 
+ +```python +from vllm_omni.entrypoints.omni import Omni + +if __name__ == "__main__": + omni = Omni(model="Qwen/Qwen-Image") + prompts = [ + "a cup of coffee on a table", + "a toy dinosaur on a sandy beach", + "a fox waking up in bed and yawning", + ] + outputs = omni.generate(prompts) + for i, output in enumerate(outputs): + output.request_output.images[0].save(f"{i}.jpg") +``` + +!!! info + + Not all models support batch inference, and batch requesting mostly does not provide significant + performance improvement. This feature is primarily for interface compatibility with vLLM and to + allow for future improvements. + +!!! info -- `--prompt`: text description (string). -- `--seed`: integer seed for deterministic sampling. -- `--cfg-scale`: true CFG scale (model-specific guidance strength). -- `--num-images-per-prompt`: number of images to generate per prompt (saves as `output`, `output_1`, ...). -- `--num-inference-steps`: diffusion sampling steps (more steps = higher quality, slower). -- `--height/--width`: output resolution (defaults 1024x1024). -- `--output`: path to save the generated PNG. -- `--vae-use-slicing`: enable VAE slicing for memory optimization. -- `--vae-use-tiling`: enable VAE tiling for memory optimization. -- `--cfg-parallel-size`: set it to 2 to enable CFG Parallel. See more examples in [`user_guide`](../../../docs/user_guide/diffusion_acceleration.md#using-cfg-parallel). -- `--enable-cpu-offload`: enable CPU offloading for diffusion models. -- `--guidance-scale`: classifier-free guidance scale. + For diffusion pipelines, the stage config field `stage_args.[].runtime.max_batch_size` is 1 by + default, and the input list is sliced into single-item requests before feeding into the diffusion + pipeline. For models that do internally support batched inputs, you can + [modify this configuration](../../../configuration/stage_configs.md) to let the model accept a + longer batch of prompts. 
-**NextStep-1.1 specific:** -- `--guidance-scale-2`: secondary guidance scale, e.g. image-level CFG (default: 1.0). -- `--timesteps-shift`: timesteps shift parameter for sampling (default: 1.0). -- `--cfg-schedule`: CFG schedule type, "constant" or "linear" (default: "constant"). -- `--use-norm`: apply layer normalization to sampled tokens. +### Negative Prompts -> ℹ️ If you encounter OOM errors, try using `--vae-use-slicing` and `--vae-use-tiling` to reduce memory usage. +vLLM-Omni supports dictionary prompts for models that accept negative prompts: -> ℹ️ Qwen-Image currently publishes best-effort presets at `1328x1328`, `1664x928`, `928x1664`, `1472x1140`, `1140x1472`, `1584x1056`, and `1056x1584`. Adjust `--height/--width` accordingly for the most reliable outcomes. +```python +from vllm_omni.entrypoints.omni import Omni -## LoRA +if __name__ == "__main__": + omni = Omni(model="Qwen/Qwen-Image") + outputs = omni.generate([ + { + "prompt": "a cup of coffee on a table", + "negative_prompt": "low resolution" + }, + { + "prompt": "a toy dinosaur on a sandy beach", + "negative_prompt": "cinematic, realistic" + } + ]) + for i, output in enumerate(outputs): + output.request_output.images[0].save(f"{i}.jpg") +``` -This example supports Peft-compatible LoRA (Low-Rank Adaptation) adapters for diffusion models. Pass `--lora-path` to use a LoRA adapter and optionally `--lora-scale` (default 1.0); omit it to use the base model only. +You can also pass a negative prompt via the CLI argument `--negative-prompt`: -### Basic usage with LoRA +```bash +python examples/offline_inference/text_to_image/text_to_image.py \ + --model Qwen/Qwen-Image \ + --prompt "a cup of coffee on a table" \ + --negative-prompt "low resolution, blurry" \ + --output coffee.png +``` + +### Advanced Features + +#### CFG Parallel + +Set `--cfg-parallel-size 2` to enable CFG Parallel for faster inference on multi-GPU setups. 
+See more examples in the [diffusion acceleration user guide](../../../docs/user_guide/diffusion_acceleration.md#using-cfg-parallel). + +#### LoRA + +This example supports PEFT-compatible LoRA (Low-Rank Adaptation) adapters for diffusion models. Pass `--lora-path` to use a LoRA adapter and optionally `--lora-scale` (default `1.0`); omit it to use the base model only. ```bash python text_to_image.py \ @@ -165,14 +243,7 @@ python text_to_image.py \ --output output.png ``` -### LoRA parameters - -- `--lora-path`: Path to LoRA adapter folder (PEFT format). Loaded at initialization and used for generation. -- `--lora-scale`: Scale factor for LoRA weights (default: 1.0). Higher values increase the influence of the LoRA adapter. - -### LoRA adapter format - -LoRA adapters must be in PEFT (Parameter-Efficient Fine-Tuning) format. A typical LoRA adapter directory structure: +LoRA adapters must be in PEFT format. A typical adapter directory structure: ``` lora_adapter/ @@ -182,10 +253,10 @@ lora_adapter/ ## Web UI Demo -Launch the gradio demo: +Launch the Gradio demo: ```bash python gradio_demo.py --port 7862 ``` -Then open `http://localhost:7862/` on your local browser to interact with the web UI. +Then open `http://localhost:7862/` in your local browser to interact with the web UI. 
diff --git a/pyproject.toml b/pyproject.toml index 15e408a9661..3fd987d99e5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -51,7 +51,8 @@ dev = [ "mooncake-transfer-engine==0.3.8.post1", "av", # for ComfyUI tests "openpyxl>=3.0.0", # for nightly CI - "pyttsx3>=2.99" + "pyttsx3>=2.99", + "mistune>=3.2.0", # for example tests ] docs = [ @@ -162,6 +163,7 @@ markers = [ "omni: Omni model tests", "cache: Cache backend tests", "parallel: Parallelism/distributed tests", + "example: Doc example code tests", # platform markers "cpu: Tests that run on CPU", "gpu: Tests that run on GPU (auto-added)", diff --git a/tests/examples/conftest.py b/tests/examples/conftest.py new file mode 100644 index 00000000000..8a7705d9297 --- /dev/null +++ b/tests/examples/conftest.py @@ -0,0 +1,338 @@ +""" +Shared fixtures, helpers, and path constants for tests/examples/. +""" + +import json +import os +import re +import shlex +import subprocess +import sys +import tempfile +from collections import defaultdict +from collections.abc import Callable +from pathlib import Path +from typing import Any, NamedTuple, cast + +import mistune +import pytest +import torch +from safetensors.torch import save_file + +# --------------------------------------------------------------------------- +# Path constants and fixtures +# --------------------------------------------------------------------------- + +REPO_ROOT = Path(__file__).resolve().parents[2] +EXAMPLES = REPO_ROOT / "examples" + +# Use Python tempfile instead of pytest's tmp_path_factory because +# OUTPUT_DIR is needed in test collection time, but tmp_path_factory is only available in test running time. +# It is needed during test collection because extract_readme_snippets replaces LoRA path with a generated one under OUTPUT_DIR, +# and extract_readme_snippets is called at collection time to generate separate test cases for each README code block. 
+OUTPUT_DIR = ( + REPO_ROOT / prefix + if (prefix := os.environ.get("OUTPUT_DIR")) + else Path(tempfile.mkdtemp(prefix="vllm_omni_test_examples_")) +) + + +# --------------------------------------------------------------------------- +# Code snippet extraction and asset file helpers +# --------------------------------------------------------------------------- + +# parameters: language, code, h2_title +ReadmeSnippetExtractionSkipPredicate = Callable[[str, str, str], tuple[bool, str]] + + +class ReadmeSnippet(NamedTuple): + language: str + code: str + h2_title: str + index_in_section: int + output_file_path: Path | None = None + skip: tuple[bool, str] = (False, "") + + @property + def test_id(self) -> str: + return f"{ReadmeSnippet._slug(self.h2_title)}_{self.index_in_section:03d}" + + @staticmethod + def extract_readme_snippets( + readme_path: Path, + skipif: ReadmeSnippetExtractionSkipPredicate | None = None, + ) -> list["ReadmeSnippet"]: + markdown = mistune.create_markdown(renderer="ast") + tokens = markdown(readme_path.read_text(encoding="utf-8")) + tokens = cast(list[dict[str, Any]], tokens) # mistune's AST renderer always produces a list, not a str + + h2_title = "" + section_counts: defaultdict[str, int] = defaultdict(int) + snippets: list[ReadmeSnippet] = [] + + for token in tokens: + token_type = token.get("type") + + if token_type == "heading": + level = (token.get("attrs") or {}).get("level") + title = ReadmeSnippet._heading_text(token) + if level == 2: + h2_title = title + continue + + if token_type != "block_code": + continue + + try: + info = token.get("attrs").get("info") # type: ignore[reportOptionalMemberAccess] + language = info.strip().split()[0].lower() # type: ignore[reportOptionalMemberAccess] + + # Common shell aliases to "bash" in several markdown renderers. 
+ if language in {"shell", "sh", "ksh", "zsh"}: + language = "bash" + + if language not in {"bash", "python"}: + continue + except AttributeError: + # The fence is missing explicit language info; skip it. + continue + + key = h2_title + section_counts[key] += 1 + code = token.get("raw", "") + output_file_path = None + if language == "bash": + argv = ReadmeSnippet._normalize_bash_command(code, Path(readme_path.parent)) + code = shlex.join(argv) + output_file_path = ReadmeSnippet._output_file_path_from_argv(argv) + if skipif is not None: + skip_config = skipif(language, code, h2_title) + else: + skip_config = (False, "") + snippet = ReadmeSnippet( + language=language, + code=code, + h2_title=h2_title, + index_in_section=section_counts[key], + output_file_path=output_file_path, + skip=skip_config, + ) + snippets.append(snippet) + + return snippets + + @staticmethod + def _normalize_bash_command(command: str, readme_dir: Path) -> list[str]: + line_joined_command = re.sub(r"\\\s*\n", " ", command).strip() + argv = shlex.split(line_joined_command, comments=True) + assert argv, "README bash fence produced an empty command" + + # Normalize python directory and example script location + if argv[0] in {"python", "python3"}: + argv[0] = sys.executable + if len(argv) > 1 and argv[1].endswith(".py"): + script_arg = argv[1] + script_path = Path(script_arg) + if script_path.is_absolute(): + resolved_script = script_path + else: + # Take the file name only, and append script_dir to its front + resolved_script = readme_dir / script_path.name + assert resolved_script.exists(), ( + f"README bash snippet references a script that does not exist: {script_arg} (resolved to {resolved_script})" + ) + argv[1] = str(resolved_script) + + # Normalize LoRA adapter path and ensure README LoRA assets exist. 
+ try: + lora_arg_idx = argv.index("--lora-path") # raises ValueError if the flag is absent + assert len(argv) > lora_arg_idx + 1, "README bash snippet uses --lora-path without a following value" + + lora_dir = OUTPUT_DIR / "lora" + adapter_model = lora_dir / "adapter_model.safetensors" + adapter_config = lora_dir / "adapter_config.json" + if not adapter_model.exists() or not adapter_config.exists(): + write_zimage_lora(lora_dir, v_scale=8.0) + + argv[lora_arg_idx + 1] = str(lora_dir) + except ValueError: + pass + + return argv + + @staticmethod + def _output_file_path_from_argv(argv: list[str]) -> Path | None: + if "--output" not in argv: + return None + output_param_idx = argv.index("--output") + assert len(argv) > output_param_idx + 1, "README bash snippet uses --output without a following value" + output_arg = argv[output_param_idx + 1] + return Path(output_arg) + + @staticmethod + def _slug(text: str) -> str: + return "".join(ch.lower() if ch.isalnum() else "_" for ch in text).strip("_") + + @staticmethod + def _heading_text(token: dict) -> str: + return "".join(child.get("raw", "") for child in token.get("children", [])).strip() + + +# [TODO] Duplicate of `_write_zimage_lora` in tests/e2e/online_serving/test_images_generations_lora.py. Combine these helpers and tests/e2e/offline_inference/test_diffusion_lora.py into tests/utils later +def write_zimage_lora(adapter_dir: Path, *, q_scale: float = 0.0, k_scale: float = 0.0, v_scale: float = 0.0): + adapter_dir.mkdir(parents=True, exist_ok=True) + + # Z-Image transformer uses dim=3840 by default. + dim = 3840 + module_name = "transformer.layers.0.attention.to_qkv" + rank = 1 + + lora_a = torch.zeros((rank, dim), dtype=torch.float32) + lora_a[0, 0] = 1.0 + + # QKVParallelLinear packs (Q, K, V) => out dim is 3 * dim (tp=1). 
+ lora_b = torch.zeros((3 * dim, rank), dtype=torch.float32) + if q_scale: + lora_b[:dim, 0] = q_scale + if k_scale: + lora_b[dim : 2 * dim, 0] = k_scale + if v_scale: + lora_b[2 * dim :, 0] = v_scale + + save_file( + { + f"base_model.model.{module_name}.lora_A.weight": lora_a, + f"base_model.model.{module_name}.lora_B.weight": lora_b, + }, + str(adapter_dir / "adapter_model.safetensors"), + ) + (adapter_dir / "adapter_config.json").write_text( + json.dumps( + { + "r": rank, + "lora_alpha": rank, + "target_modules": [module_name], + } + ), + encoding="utf-8", + ) + + +# --------------------------------------------------------------------------- +# Code runner and subprocess helpers +# --------------------------------------------------------------------------- + + +class ExampleRunResult(NamedTuple): + run_dir: Path + assets: list[Path] + + +class ExampleRunner: + """Run extracted README snippets and return generated assets. + + The output materials are organized in a three-level directory structure: + - Set at init: `self.output_root` for all tests (from env OUTPUT_DIR) + - Set at `self.run(...)`: `output_subfolder` for a specific example page (e.g., `example_offline_t2i`) + - Generated by `extract_readme_snippets`: `snippet.test_id` for a specific code block (matching H2 titles, e.g., `basic_usage_001`) + """ + + IMAGE_SUFFIXES = {".png", ".jpg", ".jpeg", ".webp"} + + def __init__(self, output_root: Path) -> None: + self.output_root = output_root + + def run( + self, snippet: ReadmeSnippet, *, output_subfolder: Path = Path("."), env: dict[str, str] | None = None + ) -> ExampleRunResult: + run_dir = self.output_root / output_subfolder / snippet.test_id + run_dir.mkdir(parents=True, exist_ok=True) + + if snippet.language == "python": + assets = self._run_python_snippet(snippet, run_dir, env) + return ExampleRunResult(run_dir=run_dir, assets=assets) + + if snippet.language == "bash": + asset = self._run_bash_snippet(snippet, run_dir, env) + return 
ExampleRunResult(run_dir=run_dir, assets=[asset]) + + raise AssertionError(f"Unsupported snippet language: {snippet.language}") + + def _run_python_snippet( + self, snippet: ReadmeSnippet, run_dir: Path, env: dict[str, str] | None = None + ) -> list[Path]: + # Save the snippet as `snippet.py` inside `run_dir` and execute it with `run_cmd`. + # Not using `exec(snippet.code)` because the output is lost. + script_path = run_dir / "snippet.py" + script_path.write_text(snippet.code, encoding="utf-8") + + before = self._collect_images(run_dir) + run_cmd([sys.executable, str(script_path)], cwd=run_dir, env=env) + after = self._collect_images(run_dir) + + assets = sorted(after - before) + return assets + + def _run_bash_snippet(self, snippet: ReadmeSnippet, run_dir: Path, env: dict[str, str] | None = None) -> Path: + run_cmd(snippet.code, shell=True, cwd=run_dir, env=env) + + assert snippet.output_file_path is not None, ( + f"README bash snippet is missing --output argument: {snippet.test_id}. " + "The test script cannot guess the output file path." + ) + + # If the code snippet declares a relative path for the output file, it is resolved against `run_dir` (the snippet's working directory). + # If the code snippet declares an absolute path (not likely but just in case), `run_dir / <absolute path>` evaluates to the absolute path itself (pathlib discards `run_dir`), so the result still points to this file. + return run_dir / snippet.output_file_path + + def _collect_images(self, root: Path) -> set[Path]: + return {path for path in root.rglob("*") if path.suffix.lower() in self.IMAGE_SUFFIXES} + + +@pytest.fixture +def example_runner() -> ExampleRunner: + return ExampleRunner(output_root=OUTPUT_DIR) + + +def run_cmd( + command: list[str] | str, + *, + shell: bool = False, + env: dict[str, str] | None = None, + cwd: Path | str | None = None, +) -> str: + """Run a command as a subprocess; assert zero exit code and return stdout. + + Output is fully captured and returned as a string so callers can parse it + (e.g. with :func:`extract_content_after_keyword`). 
+ Use this for scripts whose printed output is part of the test assertion. + """ + if env is not None: + env = {**os.environ.copy(), **env} + result = subprocess.run(command, capture_output=True, text=True, shell=shell, env=env, cwd=cwd) + + if result.returncode != 0: + print(f"STDERR: {result.stderr}") + raise subprocess.CalledProcessError(result.returncode, command) + + all_output = result.stdout + print(f"All output:\n{all_output}") + return all_output + + +# --------------------------------------------------------------------------- +# Output validation helpers +# --------------------------------------------------------------------------- + + +def extract_content_after_keyword(keywords: str, text: str) -> str: + """Return the text that follows *keywords* in *text* (regex match). + + Raises ``AssertionError`` if the keyword is not found, so test failures + produce a clear message pointing at the missing keyword. + """ + matches = re.findall(rf"{keywords}\s*(.+)", text, re.DOTALL) + + if not matches: + raise AssertionError(f"Keywords {keywords} not found in provided text output") + return matches[0] diff --git a/tests/examples/offline_inference/__init__.py b/tests/examples/offline_inference/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/examples/offline_inference/test_text_to_image.py b/tests/examples/offline_inference/test_text_to_image.py new file mode 100644 index 00000000000..a08d16f1614 --- /dev/null +++ b/tests/examples/offline_inference/test_text_to_image.py @@ -0,0 +1,38 @@ +""" +Offline inference tests: text-to-image. 
+See examples/offline_inference/text_to_image/README.md +""" + +from pathlib import Path + +import pytest + +from tests.conftest import assert_image_valid +from tests.examples.conftest import EXAMPLES, ExampleRunner, ReadmeSnippet +from tests.utils import hardware_marks + +pytestmark = [pytest.mark.advanced_model, pytest.mark.example, *hardware_marks(res={"cuda": "H100"})] + +T2I_SCRIPT = EXAMPLES / "offline_inference" / "text_to_image" / "text_to_image.py" +README_PATH = T2I_SCRIPT.with_name("README.md") +EXAMPLE_OUTPUT_SUBFOLDER = "example_offline_t2i" + + +def _skip_readme_snippet(language: str, code: str, h2_title: str) -> tuple[bool, str]: + if h2_title == "Web UI Demo": + return True, f"README section '{h2_title}' is intentionally excluded for examples tests" + return False, "" + + +README_SNIPPETS = ReadmeSnippet.extract_readme_snippets(README_PATH, skipif=_skip_readme_snippet) + + +@pytest.mark.parametrize("snippet", README_SNIPPETS, ids=lambda snippet: snippet.test_id) +def test_text_to_image(snippet: ReadmeSnippet, example_runner: ExampleRunner): + should_skip, reason = snippet.skip + if should_skip: + pytest.skip(reason) + + result = example_runner.run(snippet, output_subfolder=Path(EXAMPLE_OUTPUT_SUBFOLDER)) + for asset in result.assets: + assert_image_valid(asset) diff --git a/tests/examples/online_serving/__init__.py b/tests/examples/online_serving/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/examples/online_serving/test_qwen2_5_omni.py b/tests/examples/online_serving/test_qwen2_5_omni.py index 6d6ebd318c5..370f2251f54 100644 --- a/tests/examples/online_serving/test_qwen2_5_omni.py +++ b/tests/examples/online_serving/test_qwen2_5_omni.py @@ -1,5 +1,6 @@ """ -Example online tests for Qwen2.5-Omni-7B model. +Online serving tests: Qwen2.5-Omni-7B. 
+See examples/online_serving/qwen2_5_omni/README.md """ import os @@ -8,15 +9,16 @@ os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" -import re -import subprocess from pathlib import Path import pytest from tests.conftest import OmniServerParams, convert_audio_file_to_text, cosine_similarity_text +from tests.examples.conftest import extract_content_after_keyword, run_cmd from tests.utils import hardware_test +pytestmark = [pytest.mark.advanced_model, pytest.mark.example] + models = ["Qwen/Qwen2.5-Omni-7B"] @@ -36,30 +38,6 @@ ] -def run_cmd(command): - result = subprocess.run( - command, - capture_output=True, - text=True, - ) - - if result.returncode != 0: - print(f"STDERR: {result.stderr}") - raise subprocess.CalledProcessError(result.returncode, command) - - all_output = result.stdout - print(f"All output:\n{all_output}") - return all_output - - -def extract_content_after_keyword(keywords, text): - matches = re.findall(rf"{keywords}\s*(.+)", text, re.DOTALL) - - if not matches: - raise AssertionError(f"Keywords {keywords} not found in provided text output") - return matches[0] - - @pytest.mark.advanced_model @pytest.mark.omni @hardware_test(res={"cuda": "L4", "rocm": "MI325"}, num_cards={"cuda": 4, "rocm": 2}) diff --git a/tests/examples/online_serving/test_qwen3_omni.py b/tests/examples/online_serving/test_qwen3_omni.py index abe5322dd13..7743ae55205 100644 --- a/tests/examples/online_serving/test_qwen3_omni.py +++ b/tests/examples/online_serving/test_qwen3_omni.py @@ -1,5 +1,6 @@ """ -Example Online tests for Qwen3-Omni model. +Online serving tests: Qwen3-Omni. 
+See examples/online_serving/qwen3_omni/README.md """ import os @@ -8,15 +9,16 @@ os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" -import re -import subprocess from pathlib import Path import pytest from tests.conftest import OmniServerParams, convert_audio_file_to_text, cosine_similarity_text +from tests.examples.conftest import extract_content_after_keyword, run_cmd from tests.utils import hardware_test +pytestmark = [pytest.mark.advanced_model, pytest.mark.example] + models = ["Qwen/Qwen3-Omni-30B-A3B-Instruct"] @@ -35,30 +37,6 @@ ] -def run_cmd(command): - result = subprocess.run( - command, - capture_output=True, - text=True, - ) - - if result.returncode != 0: - print(f"STDERR: {result.stderr}") - raise subprocess.CalledProcessError(result.returncode, command) - - all_output = result.stdout - print(f"All output:\n{all_output}") - return all_output - - -def extract_content_after_keyword(keywords, text): - matches = re.findall(rf"{keywords}\s*(.+)", text, re.DOTALL) - - if not matches: - raise AssertionError(f"Keywords {keywords} not found in provided text output") - return matches[0] - - @pytest.mark.advanced_model @pytest.mark.omni @hardware_test(res={"cuda": "H100", "rocm": "MI325"}, num_cards=2) diff --git a/tests/examples/online_serving/test_text_to_image.py b/tests/examples/online_serving/test_text_to_image.py new file mode 100644 index 00000000000..51b7ff61bc9 --- /dev/null +++ b/tests/examples/online_serving/test_text_to_image.py @@ -0,0 +1,136 @@ +""" +Online serving tests: text-to-image. 
+See examples/online_serving/text_to_image/README.md + +The output materials are organized in a three-level directory structure: +- `OUTPUT_DIR` (imported from `tests/examples/conftest.py`) is the root for all example tests +- `EXAMPLE_OUTPUT_SUBFOLDER` (`example_online_t2i`) holds the outputs of this example page (see the `example_output_dir` fixture) +- Each test function creates its own case directory for one README code block (e.g., `api_calls-001`) +""" + +import sys +from pathlib import Path + +import pytest + +from tests.conftest import OmniServer, OmniServerParams, assert_image_valid +from tests.examples.conftest import EXAMPLES, OUTPUT_DIR, run_cmd, write_zimage_lora +from tests.utils import hardware_marks + +pytestmark = [pytest.mark.advanced_model, pytest.mark.example, *hardware_marks(res={"cuda": "H100"})] + +T2I_ONLINE_CLIENT = EXAMPLES / "online_serving" / "text_to_image" / "openai_chat_client.py" +EXAMPLE_OUTPUT_SUBFOLDER = "example_online_t2i" + + +@pytest.fixture(scope="module") +def example_output_dir() -> Path: + d = OUTPUT_DIR / EXAMPLE_OUTPUT_SUBFOLDER + d.mkdir(parents=True, exist_ok=True) + return d + + +# Ensure that indirect parametrizations meant to share a server receive exactly the same parameter list. +# Then, only one omni_server instance is started for all test functions using this param set. 
+# Two additional pre-requisites: +# - omni_server must (and indeed is) defined as a module-scoped fixture +# - test functions sharing the same param set must be adjacent to each other +qwen_image_server_params = [OmniServerParams(model="Qwen/Qwen-Image")] +z_image_turbo_server_params = [OmniServerParams(model="Tongyi-MAI/Z-Image-Turbo")] + + +# --- ### Method 1: Using curl --- + + +@pytest.mark.parametrize("omni_server", qwen_image_server_params, indirect=True) +def test_api_calls_001(omni_server: OmniServer, example_output_dir: Path): + url = f"http://{omni_server.host}:{omni_server.port}/v1/chat/completions" + case_dir = example_output_dir / "api_calls-001" + case_dir.mkdir(parents=True, exist_ok=True) + out = case_dir / "api_calls_001.png" + json_str = """{ + "messages": [ + {"role": "user", "content": "A beautiful landscape painting"} + ], + "extra_body": { + "height": 1024, + "width": 1024, + "num_inference_steps": 50, + "true_cfg_scale": 4.0, + "seed": 42 + } +}""" + run_cmd( + f"curl -s '{url}'" + " -H 'Content-Type: application/json'" + f" -d '{json_str}'" + " | jq -r '.choices[0].message.content[0].image_url.url'" + f" | cut -d',' -f2- | base64 -d > '{out}'", + shell=True, + ) + assert_image_valid(out, width=1024, height=1024) + + +# --- ### Method 2: Using Python Client --- + + +@pytest.mark.parametrize("omni_server", qwen_image_server_params, indirect=True) +def test_api_calls_002(omni_server: OmniServer, example_output_dir: Path): + case_dir = example_output_dir / "api_calls-002" + case_dir.mkdir(parents=True, exist_ok=True) + out = case_dir / "api_calls_002.png" + run_cmd( + [ + sys.executable, + str(T2I_ONLINE_CLIENT), + "--prompt", + "A beautiful landscape painting", + "--output", + str(out), + "--server", + f"http://{omni_server.host}:{omni_server.port}", + ] + ) + assert_image_valid(out) + + +@pytest.mark.skip("README section 'Method 3: Using Gradio Demo' is intentionally excluded for examples tests") +def test_api_calls_003(): ... 
+ + +# --- ### Using Python Client with LoRA --- + + +@pytest.mark.parametrize("omni_server", z_image_turbo_server_params, indirect=True) +def test_lora_001(omni_server: OmniServer, example_output_dir: Path, tmp_path: Path): + lora_dir = tmp_path / "zimage_lora_a" + write_zimage_lora(lora_dir, v_scale=8.0) + case_dir = example_output_dir / "lora-001" + case_dir.mkdir(parents=True, exist_ok=True) + out = case_dir / "lora_001.png" + run_cmd( + [ + sys.executable, + str(T2I_ONLINE_CLIENT), + "--prompt", + "A piece of cheesecake", + "--lora-path", + str(lora_dir), + "--lora-name", + "a", + "--lora-scale", + "1.0", + "--output", + str(out), + "--server", + f"http://{omni_server.host}:{omni_server.port}", + ] + ) + assert_image_valid(out) + + +# --- ### Using curl with LoRA (Images API) --- + + +@pytest.mark.skip(reason="Covered by tests/e2e/online_serving/test_images_generations_lora.py") +def test_lora_002(): ...