From 22296c2ef4933c8aaa329d20c5aae34b50964056 Mon Sep 17 00:00:00 2001
From: "Huang, Zeyu" <11222265+fhfuih@users.noreply.github.com>
Date: Thu, 12 Mar 2026 16:07:39 +0800
Subject: [PATCH] L4 test for t2i doc examples (rebased til 0319 fix DCO)

Signed-off-by: Huang, Zeyu <11222265+fhfuih@users.noreply.github.com>
---
 .buildkite/test-nightly.yml                   |  44 ++-
 docs/contributing/ci/.nav.yaml                |   6 +
 docs/contributing/ci/CI_5levels.md            | 155 ++++----
 .../ci/test_examples/doc_example_tests.inc.md |  49 +++
 docs/contributing/ci/tests_style.md           |   6 +
 .../offline_inference/text_to_image/README.md | 253 ++++++++-----
 pyproject.toml                                |   4 +-
 tests/examples/conftest.py                    | 338 ++++++++++++++++++
 tests/examples/offline_inference/__init__.py  |   0
 .../offline_inference/test_text_to_image.py   |  38 ++
 tests/examples/online_serving/__init__.py     |   0
 .../online_serving/test_qwen2_5_omni.py       |  32 +-
 .../online_serving/test_qwen3_omni.py         |  32 +-
 .../online_serving/test_text_to_image.py      | 136 +++++++
 14 files changed, 872 insertions(+), 221 deletions(-)
 create mode 100644 docs/contributing/ci/.nav.yaml
 create mode 100644 docs/contributing/ci/test_examples/doc_example_tests.inc.md
 create mode 100644 tests/examples/conftest.py
 create mode 100644 tests/examples/offline_inference/__init__.py
 create mode 100644 tests/examples/offline_inference/test_text_to_image.py
 create mode 100644 tests/examples/online_serving/__init__.py
 create mode 100644 tests/examples/online_serving/test_text_to_image.py

diff --git a/.buildkite/test-nightly.yml b/.buildkite/test-nightly.yml
index 53a4f180c59..f9bce484bec 100644
--- a/.buildkite/test-nightly.yml
+++ b/.buildkite/test-nightly.yml
@@ -70,7 +70,7 @@ steps:
   - label: ":full_moon: Diffusion Model Test with H100"
     timeout_in_minutes: 60
     depends_on: upload-nightly-pipeline
-    # if: build.env("NIGHTLY") == "1"
+    if: build.env("NIGHTLY") == "1"
     commands:
       - export VLLM_WORKER_MULTIPROC_METHOD=spawn
       - pytest -s -v tests/e2e/online_serving/test_*_expansion.py -m "advanced_model and diffusion and H100" --run-level "advanced_model"
@@ -108,6 +108,48 @@ steps:
                   path: /mnt/hf-cache
                   type: DirectoryOrCreate
 
+  - label: ":full_moon: Documentation Example Code Test with H100"
+    timeout_in_minutes: 60
+    depends_on: upload-nightly-pipeline
+    if: build.env("NIGHTLY") == "1"
+    commands:
+      - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+      - export VLLM_TEST_CLEAN_GPU_MEMORY="1"
+      - pytest -s -v tests/examples/online_serving/test_text_to_image.py tests/examples/offline_inference/test_text_to_image.py -m "advanced_model and example and H100" --run-level "advanced_model"
+    agents:
+      queue: "mithril-h100-pool"
+    plugins:
+      - kubernetes:
+          podSpec:
+            containers:
+              - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
+                resources:
+                  limits:
+                    nvidia.com/gpu: 2
+                volumeMounts:
+                  - name: devshm
+                    mountPath: /dev/shm
+                  - name: hf-cache
+                    mountPath: /root/.cache/huggingface
+                env:
+                  - name: HF_HOME
+                    value: /root/.cache/huggingface
+                  - name: HF_TOKEN
+                    valueFrom:
+                      secretKeyRef:
+                        name: hf-token-secret
+                        key: token
+            nodeSelector:
+              node.kubernetes.io/instance-type: gpu-h100-sxm
+            volumes:
+              - name: devshm
+                emptyDir:
+                  medium: Memory
+              - name: hf-cache
+                hostPath:
+                  path: /mnt/hf-cache
+                  type: DirectoryOrCreate
+
   - label: ":full_moon: Qwen3-TTS Non-Async-Chunk E2E Test"
     timeout_in_minutes: 30
     depends_on: upload-nightly-pipeline
diff --git a/docs/contributing/ci/.nav.yaml b/docs/contributing/ci/.nav.yaml
new file mode 100644
index 00000000000..0f187f3a15d
--- /dev/null
+++ b/docs/contributing/ci/.nav.yaml
@@ -0,0 +1,6 @@
+nav:
+  - CI_5levels.md
+  - failures.md
+  - test_guide.md
+  - test_markers.md
+  - tests_style.md
diff --git a/docs/contributing/ci/CI_5levels.md b/docs/contributing/ci/CI_5levels.md
index 03b907f3239..1f9e6a1882f 100644
--- a/docs/contributing/ci/CI_5levels.md
+++ b/docs/contributing/ci/CI_5levels.md
@@ -545,97 +545,104 @@ L4 level testing is a comprehensive quality audit before a version release. It e
 -   ***Trigger Timing***: **`Nightly`**, automatically executed every night.
 -   ***Execution Environment***: ***GPU*** server clusters to meet the resource demands of performance testing.
 -   ***Script Example***:
-<details>
-<summary> Test Examples</summary>
-When you want to add L4-level performance test cases, you can refer to the following format for case addition in tests/perf/tests/test.json:
-
-```JSON
-{
-    "test_name": "test_qwen3_omni",
-    "server_params": {
-        "model": "Qwen/Qwen3-Omni-30B-A3B-Instruct",
-        "stage_config_name": "qwen3_omni.yaml"
-    },
-    "benchmark_params": [
-        {
-            "dataset_name": "random",
-            "num_prompts": [10, 20],
-            "request_rate": [0.5, 1],
-            "random_input_len": 2500,
-            "random_output_len": 900,
-            "ignore_eos": true,
-            "percentile-metrics": "ttft,tpot,itl,e2el,audio_rtf,audio_ttfp,audio_duration",
-            "baseline": {
-                "mean_ttft_ms": 100000,
-                "mean_audio_ttfp_ms": 100000,
-                "mean_audio_rtf": 100000
+
+???+ example "Test Examples"
+
+    When adding L4-level ***documentation example tests***, please follow the guidelines below.
+
+    --8<-- "docs/contributing/ci/test_examples/doc_example_tests.inc.md"
+
+    When you want to add L4-level ***performance test*** cases, you can refer to the following format for case addition in tests/perf/tests/test.json:
+
+    ```JSON
+    {
+        "test_name": "test_qwen3_omni",
+        "server_params": {
+            "model": "Qwen/Qwen3-Omni-30B-A3B-Instruct",
+            "stage_config_name": "qwen3_omni.yaml"
+        },
+        "benchmark_params": [
+            {
+                "dataset_name": "random",
+                "num_prompts": [10, 20],
+                "request_rate": [0.5, 1],
+                "random_input_len": 2500,
+                "random_output_len": 900,
+                "ignore_eos": true,
+                "percentile-metrics": "ttft,tpot,itl,e2el,audio_rtf,audio_ttfp,audio_duration",
+                "baseline": {
+                    "mean_ttft_ms": 100000,
+                    "mean_audio_ttfp_ms": 100000,
+                    "mean_audio_rtf": 100000
+                }
             }
-        }
-    ]
-}
-```
+        ]
+    }
+    ```
 
-#### Parameter Explanation
+    **Parameter Explanation**
 
-***Overview***
+    *Overview*
 
-| Field            | Required | Description                                                     |
-| ---------------- | -------- | --------------------------------------------------------------- |
-| test_name        | Yes      | Unique identifier for the test case                             |
-| server_params    | Yes      | Server-side configuration parameters                            |
-| benchmark_params | Yes      | Benchmark running parameters (supports multiple configurations) |
+    | Field            | Required | Description                                                     |
+    | ---------------- | -------- | --------------------------------------------------------------- |
+    | test_name        | Yes      | Unique identifier for the test case                             |
+    | server_params    | Yes      | Server-side configuration parameters                            |
+    | benchmark_params | Yes      | Benchmark running parameters (supports multiple configurations) |
 
-#### server_params Configuration
+    **server_params Configuration**
 
-##### Basic Parameters
+    *Basic Parameters*
 
-| Parameter         | Required | Example                            | Description                   |
-| ----------------- | -------- | ---------------------------------- | ----------------------------- |
-| model             | Yes      | "Qwen/Qwen3-Omni-30B-A3B-Instruct" | Model name or path            |
-| stage_config_name | Yes      | "qwen3_omni.yaml"                  | Stage configuration file name |
+    | Parameter         | Required | Example                            | Description                   |
+    | ----------------- | -------- | ---------------------------------- | ----------------------------- |
+    | model             | Yes      | "Qwen/Qwen3-Omni-30B-A3B-Instruct" | Model name or path            |
+    | stage_config_name | Yes      | "qwen3_omni.yaml"                  | Stage configuration file name |
 
-##### Dynamic Configuration (update/delete)
+    *Dynamic Configuration (update/delete)*
 
-Supports incremental modifications based on the basic configuration:
+    Supports incremental modifications based on the basic configuration:
 
-| Operation | Description                          |
-| --------- | ------------------------------------ |
-| update    | Update or add configuration items    |
-| delete    | Delete specified configuration items |
+    | Operation | Description                          |
+    | --------- | ------------------------------------ |
+    | update    | Update or add configuration items    |
+    | delete    | Delete specified configuration items |
 
-***Example***:
-```
-"update": {
-    "async_chunk": true,  // Enable asynchronous chunk processing
-    "stage_args": {
-        "0": {
-            "engine_args.custom_process_next_stage_input_func": "vllm_omni.model_executor.stage_input_processors.qwen3_omni.thinker2talker_async_chunk"
+    ***Example***:
+
+    ```
+    "update": {
+        "async_chunk": true,  // Enable asynchronous chunk processing
+        "stage_args": {
+            "0": {
+                "engine_args.custom_process_next_stage_input_func": "vllm_omni.model_executor.stage_input_processors.qwen3_omni.thinker2talker_async_chunk"
+            }
+        }
+    },
+    "delete": {
+        "stage_args": {
+            "2": ["custom_process_input_func"]  // Delete this configuration for stage 2
         }
     }
-},
-"delete": {
-    "stage_args": {
-        "2": ["custom_process_input_func"]  // Delete this configuration for stage 2
-    }
-}
-```
-#### benchmark_params Configuration
+    ```
 
-You can add any benchmark running parameters you need here. For all optional parameters, refer to the [benchmark documentation](https://github.com/vllm-project/vllm-omni/blob/main/docs/cli/bench/serve.md). General modifications are as follows:
+    **benchmark_params Configuration**
 
-1.  Change the ---xxx-xx-xx running parameters to xxx_xx_xx format and fill them as keys in the JSON file.
-2.  For boolean variables in the running parameters, modify them to forms such as ignore_eos: true/false and fill them into the JSON file.
-3.  Add the baseline parameter to specify the required validation values, ensuring the validation metric names match those in the result.json generated by the benchmark.
-4.  The qps and concurrency modes are mutually exclusive. For detailed explanations, see the table below:
+    You can add any benchmark running parameters you need here. For all optional parameters, refer to the [benchmark documentation](https://github.com/vllm-project/vllm-omni/blob/main/docs/cli/bench/serve.md). General modifications are as follows:
 
-| Parameter       | Type        | Required | Example/Values  | Description                                                                                                                                                                                                                                                          |
-| --------------- | ----------- | -------- | --------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| num_prompts     | int / array | Yes      | 10,[10, 20, 30] | Number of requests. Supports single values or arrays. If a single value is used, it will be automatically expanded to match the number of qps or max_concurrency, e.g., [10,10,10]. If an array is used, its length must match the number of qps or max_concurrency. |
-| request_rate    | int / array | No       | 1, [1, 2, 3]    | Queries per second. Supports single values or arrays. If a single value is used, it will be automatically expanded to match the number of num_prompts, e.g., [1,1,1]. If an array is used, its length must match the number of num_prompts.                          |
-| max_concurrency | int / array | No       | 1, [1, 2, 3]    | Queries per second. Supports single values or arrays. If a single value is used, it will be automatically expanded to match the number of num_prompts, e.g., [1,1,1]. If an array is used, its length must match the number of num_prompts.                          |
-</details>
+    1.  Change the `--xxx-xx-xx` running parameters to `xxx_xx_xx` format and fill them as keys in the JSON file.
+    2.  For boolean variables in the running parameters, modify them to forms such as ignore_eos: true/false and fill them into the JSON file.
+    3.  Add the baseline parameter to specify the required validation values, ensuring the validation metric names match those in the result.json generated by the benchmark.
+    4.  The qps and concurrency modes are mutually exclusive. For detailed explanations, see the table below:
+
+    | Parameter       | Type        | Required | Example/Values  | Description                                                                                                                                                                                                                                                          |
+    | --------------- | ----------- | -------- | --------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+    | num_prompts     | int / array | Yes      | 10,[10, 20, 30] | Number of requests. Supports single values or arrays. If a single value is used, it will be automatically expanded to match the number of qps or max_concurrency, e.g., [10,10,10]. If an array is used, its length must match the number of qps or max_concurrency. |
+    | request_rate    | int / array | No       | 1, [1, 2, 3]    | Queries per second. Supports single values or arrays. If a single value is used, it will be automatically expanded to match the number of num_prompts, e.g., [1,1,1]. If an array is used, its length must match the number of num_prompts.                          |
+    | max_concurrency | int / array | No       | 1, [1, 2, 3]    | Maximum number of concurrent requests. Supports single values or arrays. If a single value is used, it will be automatically expanded to match the number of num_prompts, e.g., [1,1,1]. If an array is used, its length must match the number of num_prompts.       |
+
 
--   -   ***Run Command***: (Specific commands would depend on the performance testing tool and configuration defined in `nightly.json`).
+-   ***Run Command***: (Specific commands would depend on the performance testing tool and configuration defined in `nightly.json`).
 
 ## Chapter 4: L5 Level Testing - Stability and Reliability Testing
 
diff --git a/docs/contributing/ci/test_examples/doc_example_tests.inc.md b/docs/contributing/ci/test_examples/doc_example_tests.inc.md
new file mode 100644
index 00000000000..13dd032e275
--- /dev/null
+++ b/docs/contributing/ci/test_examples/doc_example_tests.inc.md
@@ -0,0 +1,49 @@
+**Preferred Test Strategy**
+
+Use one of the following patterns depending on page type:
+
+- **Dynamic code-block extraction (preferred for offline docs)**
+    - Extract Python/Bash code blocks from the markdown page with an AST analyzer, then execute them directly in tests.
+    - Benefit: test logic stays automatically aligned with docs.
+    - Basic idea: Use `ReadmeSnippet.extract_readme_snippets` to extract a list of code blocks into a global variable in the test file,
+    use this list as `pytest.mark.parametrize` parameters, and pass each snippet item to `example_runner.run` inside the parametrized test.
+    Additionally pass an `output_subfolder` argument for the 2nd-level output folder explained in **Output Directory Structure** below.
+    If any extra environment variable is needed for a test (e.g., the example script reads it), `example_runner.run` also accepts a 3rd `env` parameter.
+    - See [tests/examples/offline_inference/test_text_to_image.py](https://github.com/vllm-project/vllm-omni/blob/main/tests/examples/offline_inference/test_text_to_image.py) for reference implementation.
+
+- **Explicit copied scripts (used by online docs for now until further update)**
+    - For online serving pages, it is acceptable to copy code from docs into dedicated test functions, because only client-side, request-sending scripts are tested.
+    - Rationale: dynamic extraction is overly complex here, because it would need to tell server-launch scripts apart from client-request scripts.
+    - Requirement: copied test code must be kept in sync with doc updates.
+
+**Test Case Naming Convention**
+
+- Dynamic code extraction (auto-generated internally):
+    - `test_{single_function_name_matching_file_name}[h2_heading_00X]`
+    - Example: `test_text_to_image[basic_usage_001]`
+- Explicit copied scripts:
+    - `test_{h2_heading_00X}[{dummy_param_id_for_omni_server}]`
+    - Example: `test_api_calls_001[omni_server0]`
+
+**Runtime Configuration**
+
+In the example code tests, do **not** reduce `num_inference_steps` just to speed up the tests unless there is a strong CI reliability reason to do so.
+
+**Skipping Rules**
+
+You may skip examples falling in the following categories using `pytest.mark.skip` or `pytest.skip`:
+
+- Gradio UI scripts
+- Scenarios that significantly overlap with existing tests and add little new coverage.
+
+**Output Directory Structure**
+
+Use a three-layer output structure to store output artifacts:
+
+1. Root output directory
+    - Auto-detected from `OUTPUT_DIR` env var or auto-generated under `/tmp`.
+2. Doc-page directory
+    - Define and use a clear page-level folder name in each `test_*.py` yourself (abbreviations are acceptable, e.g., `example_offline_t2i`).
+3. Test-case directory
+    - Must match the case identifier (e.g., `basic_usage_001`).
+    - Auto-generated for dynamically extracted tests.
diff --git a/docs/contributing/ci/tests_style.md b/docs/contributing/ci/tests_style.md
index 5d642fdb95e..53775520315 100644
--- a/docs/contributing/ci/tests_style.md
+++ b/docs/contributing/ci/tests_style.md
@@ -157,6 +157,12 @@ vllm_omni/                                    tests/
                                                        ├── qwen3_omni_ci.yaml
                                                        ├── bagel_*.yaml
                                                        └── npu/, rocm/, etc.
+examples/                                     tests
+│                                             └── examples
+├── online_serving/                     →         ├── online_serving/
+│   └── {doc_page_title}/README.md                │   └── test_{doc_page_title}.py  ⬜
+└── offline_inference/                  →         └── offline_inference/
+    └── {doc_page_title}/README.md                    └── test_{doc_page_title}.py  ⬜
 ```
 
 
diff --git a/examples/offline_inference/text_to_image/README.md b/examples/offline_inference/text_to_image/README.md
index 0de89c753c4..7019bd47bd7 100644
--- a/examples/offline_inference/text_to_image/README.md
+++ b/examples/offline_inference/text_to_image/README.md
@@ -1,97 +1,129 @@
 # Text-To-Image
 
-This folder provides several entrypoints for experimenting with `Qwen/Qwen-Image` `Qwen/Qwen-Image-2512` `Tongyi-MAI/Z-Image-Turbo` `stepfun-ai/NextStep-1.1` using vLLM-Omni, note that NextStep-1.1 has different architecture so we treat it differently regarding running arguments and pipeline.
+Generate images from text prompts using vLLM-Omni's diffusion pipeline entrypoints.
 
 - `text_to_image.py`: command-line script for single image generation with advanced options.
-- `web_demo.py`: lightweight Gradio UI for interactive prompt/seed/CFG exploration.
+- `gradio_demo.py`: lightweight Gradio UI for interactive prompt/seed/CFG exploration.
 
-Note that when you pass in multiple independent prompts, they will be processed sequentially. Batching requests is currently not supported.
+## Table of Contents
 
-## Basic Usage
+- [Overview](#overview)
+- [Quick Start](#quick-start)
+- [Key Arguments](#key-arguments)
+- [More CLI Examples](#more-cli-examples)
+- [Web UI Demo](#web-ui-demo)
 
-```python
-from vllm_omni.entrypoints.omni import Omni
+## Overview
 
-if __name__ == "__main__":
-    omni = Omni(model="Qwen/Qwen-Image")
-    prompt = "a cup of coffee on the table"
-    outputs = omni.generate(prompt)
-    images = outputs[0].request_output.images
-    images[0].save("coffee.png")
-```
+This folder provides several entrypoints for experimenting with text-to-image diffusion models using vLLM-Omni. Note that `NextStep-1.1` has a different architecture, so it is treated differently regarding running arguments and pipeline.
 
-Or put more than one prompt in a request.
+### Supported Models
 
-```python
-from vllm_omni.entrypoints.omni import Omni
-
-if __name__ == "__main__":
-    omni = Omni(model="Qwen/Qwen-Image")
-    prompts = [
-      "a cup of coffee on a table",
-      "a toy dinosaur on a sandy beach",
-      "a fox waking up in bed and yawning",
-    ]
-    outputs = omni.generate(prompts)
-    for i, output in enumerate(outputs):
-      image = output.request_output.images[0].save(f"{i}.jpg")
-```
+| Model | Image Shape  | Peak VRAM (GiB) * | Model Weights (GiB) |
+| ----- | ----------- | ----------- | ----------------- |
+| `Qwen/Qwen-Image` | 1024 x 1024 | 60.0 | 53.7 |
+| `Qwen/Qwen-Image-2512` | 1024 x 1024 | 60.0 | 53.7 |
+| `Tongyi-MAI/Z-Image-Turbo` | 1024 x 1024 | 24.8 | 19.2 |
+| `stepfun-ai/NextStep-1.1` | 512 x 512 | 71.8 | 28.1 |
+| `meituan-longcat/LongCat-Image` | 1024 x 1024 | 71.2 | 27.3 |
+| `AIDC-AI/Ovis-Image-7B` | 1024 x 1024 | 71.8 | 17.1 |
+| `OmniGen2/OmniGen2` |  1024 x 1024 | 20.1 | 14.7 |
+| `stabilityai/stable-diffusion-3.5-medium` | 1024 x 1024 | 20.1 | 15.6 |
+| `black-forest-labs/FLUX.1-dev` | 1024 x 1024 | 77.6 | 31.4 |
+| `black-forest-labs/FLUX.2-klein-4B` | 1024 x 1024 | 72.7 | 14.9 |
+| `black-forest-labs/FLUX.2-klein-9B` | 1024 x 1024 | 37.1 | 32.3 |
+| `black-forest-labs/FLUX.2-dev` | 1024 x 1024 | 65.7 | >80 (CPU offload required) |
 
 !!! info
+    *Peak VRAM: based on basic single-card usage, batch size = 1, without any acceleration/optimization features. FLUX.2-dev requires `--enable-cpu-offload` on a single 80 GiB GPU.
 
-    However, it is not currently recommended to do so
-    because not all models support batch inference,
-    and batch requesting mostly does not provide significant performance improvement (despite the impression that it does).
-    This feature is primarily for the sake of interface compatibility with vLLM and to allow for future improvements.
+Default model: `Qwen/Qwen-Image`
 
-!!! info
+## Quick Start
 
-    For diffusion pipelines, the stage config field `stage_args.[].runtime.max_batch_size` is 1 by default, and the input
-    list is sliced into single-item requests before feeding into the diffusion pipeline. For models that do internally support
-    batched inputs, you can [modify this configuration](../../../configuration/stage_configs.md) to let the model accept a longer batch of prompts.
+### Python API
 
-Apart from string prompt, vLLM-Omni also supports dictionary prompts in the same style as vLLM.
-This is useful for models that support negative prompts.
+Single-prompt generation:
 
 ```python
 from vllm_omni.entrypoints.omni import Omni
 
 if __name__ == "__main__":
     omni = Omni(model="Qwen/Qwen-Image")
-    outputs = omni.generate([
-      {
-        "prompt": "a cup of coffee on a table"，
-        "negative_prompt": "low resolution"
-      },
-      {
-        "prompt": "a toy dinosaur on a sandy beach"，
-        "negative_prompt": "cinematic, realistic"
-      }
-    ])
-    for i, output in enumerate(outputs):
-      image = output.request_output.images[0].save(f"{i}.jpg")
+    prompt = "a cup of coffee on the table"
+    outputs = omni.generate(prompt)
+    images = outputs[0].request_output.images
+    images[0].save("coffee.png")
 ```
 
-## Local CLI Usage
+### Local CLI Usage
 
-### Qwen/Tongyi Models
+```bash
+python text_to_image.py \
+  --model Qwen/Qwen-Image \
+  --prompt "a cup of coffee on the table" \
+  --output coffee.png
+```
+
+## Key Arguments
+
+**Common arguments:**
+
+| Argument | Type | Default | Description |
+| -------- | ---- | ------- | ----------- |
+| `--prompt` | str | `"a cup of coffee on the table"` | Text description for image generation |
+| `--seed` | int | `142` | Integer seed for deterministic sampling |
+| `--negative-prompt` | str | `None` | Negative prompt for classifier-free conditional guidance |
+| `--cfg-scale` | float | `4.0` | True CFG scale (model-specific guidance strength) |
+| `--guidance-scale` | float | `1.0` | Classifier-free guidance scale |
+| `--num-images-per-prompt` | int | `1` | Number of images per prompt (saved as `output`, `output_1`, ...) |
+| `--num-inference-steps` | int | `50` | Diffusion sampling steps (more steps = higher quality, slower) |
+| `--height` | int | `1024` | Output image height in pixels |
+| `--width` | int | `1024` | Output image width in pixels |
+| `--output` | str | `"qwen_image_output.png"` | Path to save the generated image |
+| `--vae-use-slicing` | flag | off | Enable VAE slicing for memory optimization |
+| `--vae-use-tiling` | flag | off | Enable VAE tiling for memory optimization |
+| `--cfg-parallel-size` | int | `1` | Set to `2` to enable CFG Parallel |
+| `--enable-cpu-offload` | flag | off | Enable CPU offloading for diffusion models |
+| `--lora-path` | str | — | Path to PEFT LoRA adapter folder |
+| `--lora-scale` | float | `1.0` | Scale factor for LoRA weights |
+
+**NextStep-1.1 specific arguments:**
+
+| Argument | Type | Default | Description |
+| -------- | ---- | ------- | ----------- |
+| `--guidance-scale-2` | float | `1.0` | Secondary guidance scale (e.g. image-level CFG) |
+| `--timesteps-shift` | float | `1.0` | Timesteps shift parameter for sampling |
+| `--cfg-schedule` | str | `"constant"` | CFG schedule type: `"constant"` or `"linear"` |
+| `--use-norm` | flag | off | Apply layer normalization to sampled tokens |
+
+> If you encounter OOM errors, try using `--vae-use-slicing` and `--vae-use-tiling` to reduce memory usage.
+
+> Qwen-Image currently publishes best-effort presets at `1328x1328`, `1664x928`, `928x1664`, `1472x1140`, `1140x1472`, `1584x1056`, and `1056x1584`. Adjust `--height/--width` accordingly for the most reliable outcomes.
+
+## More CLI Examples
+
+### Tongyi Models
 
 ```bash
 python text_to_image.py \
   --model Tongyi-MAI/Z-Image-Turbo \
   --prompt "a cup of coffee on the table" \
   --seed 42 \
-  --cfg-scale 4.0 \
+  --guidance-scale 0.0 \
   --num-images-per-prompt 1 \
-  --num-inference-steps 50 \
+  --num-inference-steps 9 \
   --height 1024 \
   --width 1024 \
   --output outputs/coffee.png
 ```
 
+`Tongyi-MAI/Z-Image-Turbo` is a distilled version of Z-Image. Distilled diffusion models usually require fewer inference steps (4–9), and Classifier-Free Guidance (CFG) is usually NOT applied. Similar distilled models are `black-forest-labs/FLUX.2-klein-4B` and `black-forest-labs/FLUX.2-klein-9B`.
+
 ### NextStep Models
 
-NextStep-1.1 can have extra arguments
+NextStep-1.1 supports extra arguments for dual-level CFG control:
+
 ```bash
 python text_to_image.py \
   --model stepfun-ai/NextStep-1.1 \
@@ -106,8 +138,10 @@ python text_to_image.py \
   --seed 42
 ```
 
-### Flux.2-dev Models
-To start Flux.2-dev with a single GPU, cpu-offload must be enabled because the total size of its weights exceeds the 80GB memory capacity of the GPU.
+### FLUX.2-dev Models
+
+To run FLUX.2-dev on a single GPU, `--enable-cpu-offload` is required because the model weights exceed 80 GiB:
+
 ```bash
 python examples/offline_inference/text_to_image/text_to_image.py \
   --model black-forest-labs/FLUX.2-dev \
@@ -123,38 +157,82 @@ python examples/offline_inference/text_to_image/text_to_image.py \
   --output flux2-dev.png
 ```
 
-### Key Arguments
+### Batch Requests (Multiple Prompts)
 
-**Common arguments:**
+You can pass multiple prompts in a single `generate` call.
+
+```python
+from vllm_omni.entrypoints.omni import Omni
+
+if __name__ == "__main__":
+    omni = Omni(model="Qwen/Qwen-Image")
+    prompts = [
+        "a cup of coffee on a table",
+        "a toy dinosaur on a sandy beach",
+        "a fox waking up in bed and yawning",
+    ]
+    outputs = omni.generate(prompts)
+    for i, output in enumerate(outputs):
+        output.request_output.images[0].save(f"{i}.jpg")
+```
+
+!!! info
+
+    Not all models support batch inference, and batch requesting mostly does not provide significant
+    performance improvement. This feature is primarily for interface compatibility with vLLM and to
+    allow for future improvements.
+
+!!! info
 
-- `--prompt`: text description (string).
-- `--seed`: integer seed for deterministic sampling.
-- `--cfg-scale`: true CFG scale (model-specific guidance strength).
-- `--num-images-per-prompt`: number of images to generate per prompt (saves as `output`, `output_1`, ...).
-- `--num-inference-steps`: diffusion sampling steps (more steps = higher quality, slower).
-- `--height/--width`: output resolution (defaults 1024x1024).
-- `--output`: path to save the generated PNG.
-- `--vae-use-slicing`: enable VAE slicing for memory optimization.
-- `--vae-use-tiling`: enable VAE tiling for memory optimization.
-- `--cfg-parallel-size`: set it to 2 to enable CFG Parallel. See more examples in [`user_guide`](../../../docs/user_guide/diffusion_acceleration.md#using-cfg-parallel).
-- `--enable-cpu-offload`: enable CPU offloading for diffusion models.
-- `--guidance-scale`: classifier-free guidance scale.
+    For diffusion pipelines, the stage config field `stage_args.[].runtime.max_batch_size` is 1 by
+    default, and the input list is sliced into single-item requests before feeding into the diffusion
+    pipeline. For models that do internally support batched inputs, you can
+    [modify this configuration](../../../configuration/stage_configs.md) to let the model accept a
+    longer batch of prompts.
 
-**NextStep-1.1 specific:**
-- `--guidance-scale-2`: secondary guidance scale, e.g. image-level CFG (default: 1.0).
-- `--timesteps-shift`: timesteps shift parameter for sampling (default: 1.0).
-- `--cfg-schedule`: CFG schedule type, "constant" or "linear" (default: "constant").
-- `--use-norm`: apply layer normalization to sampled tokens.
+### Negative Prompts
 
-> ℹ️ If you encounter OOM errors, try using `--vae-use-slicing` and `--vae-use-tiling` to reduce memory usage.
+vLLM-Omni supports dictionary prompts for models that accept negative prompts:
 
-> ℹ️ Qwen-Image currently publishes best-effort presets at `1328x1328`, `1664x928`, `928x1664`, `1472x1140`, `1140x1472`, `1584x1056`, and `1056x1584`. Adjust `--height/--width` accordingly for the most reliable outcomes.
+```python
+from vllm_omni.entrypoints.omni import Omni
 
-## LoRA
+if __name__ == "__main__":
+    omni = Omni(model="Qwen/Qwen-Image")
+    outputs = omni.generate([
+        {
+            "prompt": "a cup of coffee on a table",
+            "negative_prompt": "low resolution"
+        },
+        {
+            "prompt": "a toy dinosaur on a sandy beach",
+            "negative_prompt": "cinematic, realistic"
+        }
+    ])
+    for i, output in enumerate(outputs):
+        output.request_output.images[0].save(f"{i}.jpg")
+```
 
-This example supports Peft-compatible LoRA (Low-Rank Adaptation) adapters for diffusion models. Pass `--lora-path` to use a LoRA adapter and optionally `--lora-scale` (default 1.0); omit it to use the base model only.
+You can also pass a negative prompt via the CLI argument `--negative-prompt`:
 
-### Basic usage with LoRA
+```bash
+python examples/offline_inference/text_to_image/text_to_image.py \
+  --model Qwen/Qwen-Image \
+  --prompt "a cup of coffee on a table" \
+  --negative-prompt "low resolution, blurry" \
+  --output coffee.png
+```
+
+### Advanced Features
+
+#### CFG Parallel
+
+Set `--cfg-parallel-size 2` to enable CFG Parallel for faster inference on multi-GPU setups.
+See more examples in the [diffusion acceleration user guide](../../../docs/user_guide/diffusion_acceleration.md#using-cfg-parallel).
+
+#### LoRA
+
+This example supports PEFT-compatible LoRA (Low-Rank Adaptation) adapters for diffusion models. Pass `--lora-path` to use a LoRA adapter and optionally `--lora-scale` (default `1.0`); omit it to use the base model only.
 
 ```bash
 python text_to_image.py \
@@ -165,14 +243,7 @@ python text_to_image.py \
   --output output.png
 ```
 
-### LoRA parameters
-
-- `--lora-path`: Path to LoRA adapter folder (PEFT format). Loaded at initialization and used for generation.
-- `--lora-scale`: Scale factor for LoRA weights (default: 1.0). Higher values increase the influence of the LoRA adapter.
-
-### LoRA adapter format
-
-LoRA adapters must be in PEFT (Parameter-Efficient Fine-Tuning) format. A typical LoRA adapter directory structure:
+LoRA adapters must be in PEFT format. A typical adapter directory structure:
 
 ```
 lora_adapter/
@@ -182,10 +253,10 @@ lora_adapter/
 
 ## Web UI Demo
 
-Launch the gradio demo:
+Launch the Gradio demo:
 
 ```bash
 python gradio_demo.py --port 7862
 ```
 
-Then open `http://localhost:7862/` on your local browser to interact with the web UI.
+Then open `http://localhost:7862/` in your local browser to interact with the web UI.
diff --git a/pyproject.toml b/pyproject.toml
index 15e408a9661..3fd987d99e5 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -51,7 +51,8 @@ dev = [
     "mooncake-transfer-engine==0.3.8.post1",
     "av", # for ComfyUI tests
     "openpyxl>=3.0.0", # for nightly CI
-    "pyttsx3>=2.99"
+    "pyttsx3>=2.99",
+    "mistune>=3.2.0", # for example tests
 ]
 
 docs = [
@@ -162,6 +163,7 @@ markers = [
     "omni: Omni model tests",
     "cache: Cache backend tests",
     "parallel: Parallelism/distributed tests",
+    "example: Doc example code tests",
     # platform markers
     "cpu: Tests that run on CPU",
     "gpu: Tests that run on GPU (auto-added)",
diff --git a/tests/examples/conftest.py b/tests/examples/conftest.py
new file mode 100644
index 00000000000..8a7705d9297
--- /dev/null
+++ b/tests/examples/conftest.py
@@ -0,0 +1,338 @@
+"""
+Shared fixtures, helpers, and path constants for tests/examples/.
+"""
+
+import json
+import os
+import re
+import shlex
+import subprocess
+import sys
+import tempfile
+from collections import defaultdict
+from collections.abc import Callable
+from pathlib import Path
+from typing import Any, NamedTuple, cast
+
+import mistune
+import pytest
+import torch
+from safetensors.torch import save_file
+
+# ---------------------------------------------------------------------------
+# Path constants and fixtures
+# ---------------------------------------------------------------------------
+
+REPO_ROOT = Path(__file__).resolve().parents[2]
+EXAMPLES = REPO_ROOT / "examples"
+
+# Use Python's tempfile instead of pytest's tmp_path_factory because
+# OUTPUT_DIR is needed at test collection time, but tmp_path_factory is only available at test run time.
+# It is needed during collection because extract_readme_snippets replaces the LoRA path with a generated one under OUTPUT_DIR,
+# and extract_readme_snippets is called at collection time to generate a separate test case for each README code block.
+OUTPUT_DIR = (
+    REPO_ROOT / prefix
+    if (prefix := os.environ.get("OUTPUT_DIR"))
+    else Path(tempfile.mkdtemp(prefix="vllm_omni_test_examples_"))
+)
+
+
+# ---------------------------------------------------------------------------
+# Code snippet extraction and asset file helpers
+# ---------------------------------------------------------------------------
+
+# Parameters: (language, code, h2_title). Returns: (should_skip, reason).
+ReadmeSnippetExtractionSkipPredicate = Callable[[str, str, str], tuple[bool, str]]
+
+
+class ReadmeSnippet(NamedTuple):
+    language: str
+    code: str
+    h2_title: str
+    index_in_section: int
+    output_file_path: Path | None = None
+    skip: tuple[bool, str] = (False, "")
+
+    @property
+    def test_id(self) -> str:
+        return f"{ReadmeSnippet._slug(self.h2_title)}_{self.index_in_section:03d}"
+
+    @staticmethod
+    def extract_readme_snippets(
+        readme_path: Path,
+        skipif: ReadmeSnippetExtractionSkipPredicate | None = None,
+    ) -> list["ReadmeSnippet"]:
+        markdown = mistune.create_markdown(renderer="ast")
+        tokens = markdown(readme_path.read_text(encoding="utf-8"))
+        tokens = cast(list[dict[str, Any]], tokens)  # mistune's AST renderer always produces a list, not a str
+
+        h2_title = ""
+        section_counts: defaultdict[str, int] = defaultdict(int)
+        snippets: list[ReadmeSnippet] = []
+
+        for token in tokens:
+            token_type = token.get("type")
+
+            if token_type == "heading":
+                level = (token.get("attrs") or {}).get("level")
+                title = ReadmeSnippet._heading_text(token)
+                if level == 2:
+                    h2_title = title
+                continue
+
+            if token_type != "block_code":
+                continue
+
+            try:
+                info = token.get("attrs").get("info")  # type: ignore[reportOptionalMemberAccess]
+                language = info.strip().split()[0].lower()  # type: ignore[reportOptionalMemberAccess]
+
+                # Several Markdown renderers treat these shell names as aliases of "bash"; normalize them.
+                if language in {"shell", "sh", "ksh", "zsh"}:
+                    language = "bash"
+
+                if language not in {"bash", "python"}:
+                    continue
+            except AttributeError:
+                # The fence is missing explicit language info; skip it.
+                continue
+
+            key = h2_title
+            section_counts[key] += 1
+            code = token.get("raw", "")
+            output_file_path = None
+            if language == "bash":
+                argv = ReadmeSnippet._normalize_bash_command(code, Path(readme_path.parent))
+                code = shlex.join(argv)
+                output_file_path = ReadmeSnippet._output_file_path_from_argv(argv)
+            if skipif is not None:
+                skip_config = skipif(language, code, h2_title)
+            else:
+                skip_config = (False, "")
+            snippet = ReadmeSnippet(
+                language=language,
+                code=code,
+                h2_title=h2_title,
+                index_in_section=section_counts[key],
+                output_file_path=output_file_path,
+                skip=skip_config,
+            )
+            snippets.append(snippet)
+
+        return snippets
+
+    @staticmethod
+    def _normalize_bash_command(command: str, readme_dir: Path) -> list[str]:
+        line_joined_command = re.sub(r"\\\s*\n", " ", command).strip()
+        argv = shlex.split(line_joined_command, comments=True)
+        assert argv, "README bash fence produced an empty command"
+
+        # Normalize the Python interpreter and the example script location
+        if argv[0] in {"python", "python3"}:
+            argv[0] = sys.executable
+            if len(argv) > 1 and argv[1].endswith(".py"):
+                script_arg = argv[1]
+                script_path = Path(script_arg)
+                if script_path.is_absolute():
+                    resolved_script = script_path
+                else:
+                    # Keep only the file name and resolve it against readme_dir
+                    resolved_script = readme_dir / script_path.name
+                assert resolved_script.exists(), (
+                    f"README bash snippet references a script that does not exist: {script_arg} (resolved to {resolved_script})"
+                )
+                argv[1] = str(resolved_script)
+
+        # Normalize LoRA adapter path and ensure README LoRA assets exist.
+        try:
+            lora_arg_idx = argv.index("--lora-path")  # Raise ValueError if not found
+            assert len(argv) > lora_arg_idx + 1, "README bash snippet uses --lora-path without a following value"
+
+            lora_dir = OUTPUT_DIR / "lora"
+            adapter_model = lora_dir / "adapter_model.safetensors"
+            adapter_config = lora_dir / "adapter_config.json"
+            if not adapter_model.exists() or not adapter_config.exists():
+                write_zimage_lora(lora_dir, v_scale=8.0)
+
+            argv[lora_arg_idx + 1] = str(lora_dir)
+        except ValueError:
+            pass
+
+        return argv
+
+    @staticmethod
+    def _output_file_path_from_argv(argv: list[str]) -> Path | None:
+        if "--output" not in argv:
+            return None
+        output_param_idx = argv.index("--output")
+        assert len(argv) > output_param_idx + 1, "README bash snippet uses --output without a following value"
+        output_arg = argv[output_param_idx + 1]
+        return Path(output_arg)
+
+    @staticmethod
+    def _slug(text: str) -> str:
+        return "".join(ch.lower() if ch.isalnum() else "_" for ch in text).strip("_")
+
+    @staticmethod
+    def _heading_text(token: dict) -> str:
+        return "".join(child.get("raw", "") for child in token.get("children", [])).strip()
+
+
+# [TODO] Duplicates `_write_zimage_lora` in tests/e2e/online_serving/test_images_generations_lora.py. Combine these helpers and tests/e2e/offline_inference/test_diffusion_lora.py into tests/utils later
+def write_zimage_lora(adapter_dir: Path, *, q_scale: float = 0.0, k_scale: float = 0.0, v_scale: float = 0.0):
+    adapter_dir.mkdir(parents=True, exist_ok=True)
+
+    # Z-Image transformer uses dim=3840 by default.
+    dim = 3840
+    module_name = "transformer.layers.0.attention.to_qkv"
+    rank = 1
+
+    lora_a = torch.zeros((rank, dim), dtype=torch.float32)
+    lora_a[0, 0] = 1.0
+
+    # QKVParallelLinear packs (Q, K, V) => out dim is 3 * dim (tp=1).
+    lora_b = torch.zeros((3 * dim, rank), dtype=torch.float32)
+    if q_scale:
+        lora_b[:dim, 0] = q_scale
+    if k_scale:
+        lora_b[dim : 2 * dim, 0] = k_scale
+    if v_scale:
+        lora_b[2 * dim :, 0] = v_scale
+
+    save_file(
+        {
+            f"base_model.model.{module_name}.lora_A.weight": lora_a,
+            f"base_model.model.{module_name}.lora_B.weight": lora_b,
+        },
+        str(adapter_dir / "adapter_model.safetensors"),
+    )
+    (adapter_dir / "adapter_config.json").write_text(
+        json.dumps(
+            {
+                "r": rank,
+                "lora_alpha": rank,
+                "target_modules": [module_name],
+            }
+        ),
+        encoding="utf-8",
+    )
+
+
+# ---------------------------------------------------------------------------
+# Code runner and subprocess helpers
+# ---------------------------------------------------------------------------
+
+
+class ExampleRunResult(NamedTuple):
+    run_dir: Path
+    assets: list[Path]
+
+
+class ExampleRunner:
+    """Run extracted README snippets and return generated assets.
+
+    The output materials are organized in a three-level directory structure:
+    - Set at init: `self.output_root` for all tests (from env OUTPUT_DIR)
+    - Set at `self.run(...)`: `output_subfolder` for a specific example page (e.g., `example_offline_t2i`)
+    - Generated by `extract_readme_snippets`: `snippet.test_id` for a specific code block (matching H2 titles, e.g., `basic_usage_001`)
+    """
+
+    IMAGE_SUFFIXES = {".png", ".jpg", ".jpeg", ".webp"}
+
+    def __init__(self, output_root: Path) -> None:
+        self.output_root = output_root
+
+    def run(
+        self, snippet: ReadmeSnippet, *, output_subfolder: Path = Path("."), env: dict[str, str] | None = None
+    ) -> ExampleRunResult:
+        run_dir = self.output_root / output_subfolder / snippet.test_id
+        run_dir.mkdir(parents=True, exist_ok=True)
+
+        if snippet.language == "python":
+            assets = self._run_python_snippet(snippet, run_dir, env)
+            return ExampleRunResult(run_dir=run_dir, assets=assets)
+
+        if snippet.language == "bash":
+            asset = self._run_bash_snippet(snippet, run_dir, env)
+            return ExampleRunResult(run_dir=run_dir, assets=[asset])
+
+        raise AssertionError(f"Unsupported snippet language: {snippet.language}")
+
+    def _run_python_snippet(
+        self, snippet: ReadmeSnippet, run_dir: Path, env: dict[str, str] | None = None
+    ) -> list[Path]:
+        # Save the snippet to a script file under run_dir and execute it with `run_cmd`.
+        # `exec(snippet.code)` is not used because the output would be lost.
+        script_path = run_dir / "snippet.py"
+        script_path.write_text(snippet.code, encoding="utf-8")
+
+        before = self._collect_images(run_dir)
+        run_cmd([sys.executable, str(script_path)], cwd=run_dir, env=env)
+        after = self._collect_images(run_dir)
+
+        assets = sorted(after - before)
+        return assets
+
+    def _run_bash_snippet(self, snippet: ReadmeSnippet, run_dir: Path, env: dict[str, str] | None = None) -> Path:
+        run_cmd(snippet.code, shell=True, cwd=run_dir, env=env)
+
+        assert snippet.output_file_path is not None, (
+            f"README bash snippet is missing --output argument: {snippet.test_id}. "
+            "The test script cannot guess the output file path."
+        )
+
+        # If the snippet declares a relative output path, it is resolved against `run_dir` (the snippet's working directory).
+        # If the snippet declares an absolute path (unlikely, but just in case), the `/` operator discards `run_dir` and returns that absolute path.
+        return run_dir / snippet.output_file_path
+
+    def _collect_images(self, root: Path) -> set[Path]:
+        return {path for path in root.rglob("*") if path.suffix.lower() in self.IMAGE_SUFFIXES}
+
+
+@pytest.fixture
+def example_runner() -> ExampleRunner:
+    return ExampleRunner(output_root=OUTPUT_DIR)
+
+
+def run_cmd(
+    command: list[str] | str,
+    *,
+    shell: bool = False,
+    env: dict[str, str] | None = None,
+    cwd: Path | str | None = None,
+) -> str:
+    """Run a command as a subprocess; raise on a non-zero exit code and return stdout.
+
+    Stdout is fully captured and returned as a string so callers can parse it
+    (e.g. with :func:`extract_content_after_keyword`).
+    Use this for scripts whose printed output is part of the test assertion.
+    """
+    if env is not None:
+        env = {**os.environ.copy(), **env}
+    result = subprocess.run(command, capture_output=True, text=True, shell=shell, env=env, cwd=cwd)
+
+    if result.returncode != 0:
+        print(f"STDERR: {result.stderr}")
+        raise subprocess.CalledProcessError(result.returncode, command)
+
+    all_output = result.stdout
+    print(f"All output:\n{all_output}")
+    return all_output
+
+
+# ---------------------------------------------------------------------------
+# Output validation helpers
+# ---------------------------------------------------------------------------
+
+
+def extract_content_after_keyword(keywords: str, text: str) -> str:
+    """Return the text that follows *keywords* in *text* (regex match).
+
+    Raises ``AssertionError`` if the keyword is not found, so test failures
+    produce a clear message pointing at the missing keyword.
+    """
+    matches = re.findall(rf"{keywords}\s*(.+)", text, re.DOTALL)
+
+    if not matches:
+        raise AssertionError(f"Keywords {keywords} not found in provided text output")
+    return matches[0]
diff --git a/tests/examples/offline_inference/__init__.py b/tests/examples/offline_inference/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/tests/examples/offline_inference/test_text_to_image.py b/tests/examples/offline_inference/test_text_to_image.py
new file mode 100644
index 00000000000..a08d16f1614
--- /dev/null
+++ b/tests/examples/offline_inference/test_text_to_image.py
@@ -0,0 +1,38 @@
+"""
+Offline inference tests: text-to-image.
+See examples/offline_inference/text_to_image/README.md
+"""
+
+from pathlib import Path
+
+import pytest
+
+from tests.conftest import assert_image_valid
+from tests.examples.conftest import EXAMPLES, ExampleRunner, ReadmeSnippet
+from tests.utils import hardware_marks
+
+pytestmark = [pytest.mark.advanced_model, pytest.mark.example, *hardware_marks(res={"cuda": "H100"})]
+
+T2I_SCRIPT = EXAMPLES / "offline_inference" / "text_to_image" / "text_to_image.py"
+README_PATH = T2I_SCRIPT.with_name("README.md")
+EXAMPLE_OUTPUT_SUBFOLDER = "example_offline_t2i"
+
+
+def _skip_readme_snippet(language: str, code: str, h2_title: str) -> tuple[bool, str]:
+    if h2_title == "Web UI Demo":
+        return True, f"README section '{h2_title}' is intentionally excluded for examples tests"
+    return False, ""
+
+
+README_SNIPPETS = ReadmeSnippet.extract_readme_snippets(README_PATH, skipif=_skip_readme_snippet)
+
+
+@pytest.mark.parametrize("snippet", README_SNIPPETS, ids=lambda snippet: snippet.test_id)
+def test_text_to_image(snippet: ReadmeSnippet, example_runner: ExampleRunner):
+    should_skip, reason = snippet.skip
+    if should_skip:
+        pytest.skip(reason)
+
+    result = example_runner.run(snippet, output_subfolder=Path(EXAMPLE_OUTPUT_SUBFOLDER))
+    for asset in result.assets:
+        assert_image_valid(asset)
diff --git a/tests/examples/online_serving/__init__.py b/tests/examples/online_serving/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/tests/examples/online_serving/test_qwen2_5_omni.py b/tests/examples/online_serving/test_qwen2_5_omni.py
index 6d6ebd318c5..370f2251f54 100644
--- a/tests/examples/online_serving/test_qwen2_5_omni.py
+++ b/tests/examples/online_serving/test_qwen2_5_omni.py
@@ -1,5 +1,6 @@
 """
-Example online tests for Qwen2.5-Omni-7B model.
+Online serving tests: Qwen2.5-Omni-7B.
+See examples/online_serving/qwen2_5_omni/README.md
 """
 
 import os
@@ -8,15 +9,16 @@
 
 os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
 
-import re
-import subprocess
 from pathlib import Path
 
 import pytest
 
 from tests.conftest import OmniServerParams, convert_audio_file_to_text, cosine_similarity_text
+from tests.examples.conftest import extract_content_after_keyword, run_cmd
 from tests.utils import hardware_test
 
+pytestmark = [pytest.mark.advanced_model, pytest.mark.example]
+
 models = ["Qwen/Qwen2.5-Omni-7B"]
 
 
@@ -36,30 +38,6 @@
 ]
 
 
-def run_cmd(command):
-    result = subprocess.run(
-        command,
-        capture_output=True,
-        text=True,
-    )
-
-    if result.returncode != 0:
-        print(f"STDERR: {result.stderr}")
-        raise subprocess.CalledProcessError(result.returncode, command)
-
-    all_output = result.stdout
-    print(f"All output:\n{all_output}")
-    return all_output
-
-
-def extract_content_after_keyword(keywords, text):
-    matches = re.findall(rf"{keywords}\s*(.+)", text, re.DOTALL)
-
-    if not matches:
-        raise AssertionError(f"Keywords {keywords} not found in provided text output")
-    return matches[0]
-
-
 @pytest.mark.advanced_model
 @pytest.mark.omni
 @hardware_test(res={"cuda": "L4", "rocm": "MI325"}, num_cards={"cuda": 4, "rocm": 2})
diff --git a/tests/examples/online_serving/test_qwen3_omni.py b/tests/examples/online_serving/test_qwen3_omni.py
index abe5322dd13..7743ae55205 100644
--- a/tests/examples/online_serving/test_qwen3_omni.py
+++ b/tests/examples/online_serving/test_qwen3_omni.py
@@ -1,5 +1,6 @@
 """
-Example Online tests for Qwen3-Omni model.
+Online serving tests: Qwen3-Omni.
+See examples/online_serving/qwen3_omni/README.md
 """
 
 import os
@@ -8,15 +9,16 @@
 
 os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
 
-import re
-import subprocess
 from pathlib import Path
 
 import pytest
 
 from tests.conftest import OmniServerParams, convert_audio_file_to_text, cosine_similarity_text
+from tests.examples.conftest import extract_content_after_keyword, run_cmd
 from tests.utils import hardware_test
 
+pytestmark = [pytest.mark.advanced_model, pytest.mark.example]
+
 models = ["Qwen/Qwen3-Omni-30B-A3B-Instruct"]
 
 
@@ -35,30 +37,6 @@
 ]
 
 
-def run_cmd(command):
-    result = subprocess.run(
-        command,
-        capture_output=True,
-        text=True,
-    )
-
-    if result.returncode != 0:
-        print(f"STDERR: {result.stderr}")
-        raise subprocess.CalledProcessError(result.returncode, command)
-
-    all_output = result.stdout
-    print(f"All output:\n{all_output}")
-    return all_output
-
-
-def extract_content_after_keyword(keywords, text):
-    matches = re.findall(rf"{keywords}\s*(.+)", text, re.DOTALL)
-
-    if not matches:
-        raise AssertionError(f"Keywords {keywords} not found in provided text output")
-    return matches[0]
-
-
 @pytest.mark.advanced_model
 @pytest.mark.omni
 @hardware_test(res={"cuda": "H100", "rocm": "MI325"}, num_cards=2)
diff --git a/tests/examples/online_serving/test_text_to_image.py b/tests/examples/online_serving/test_text_to_image.py
new file mode 100644
index 00000000000..51b7ff61bc9
--- /dev/null
+++ b/tests/examples/online_serving/test_text_to_image.py
@@ -0,0 +1,136 @@
+"""
+Online serving tests: text-to-image.
+See examples/online_serving/text_to_image/README.md
+
+The output materials are organized in a three-level directory structure:
+- `OUTPUT_DIR` (imported from tests/examples/conftest.py) for all example tests
+- `EXAMPLE_OUTPUT_SUBFOLDER` (`example_online_t2i`) for this example page
+- A per-test case directory created inside each test function (e.g., `api_calls-001`)
+"""
+
+import sys
+from pathlib import Path
+
+import pytest
+
+from tests.conftest import OmniServer, OmniServerParams, assert_image_valid
+from tests.examples.conftest import EXAMPLES, OUTPUT_DIR, run_cmd, write_zimage_lora
+from tests.utils import hardware_marks
+
+pytestmark = [pytest.mark.advanced_model, pytest.mark.example, *hardware_marks(res={"cuda": "H100"})]
+
+T2I_ONLINE_CLIENT = EXAMPLES / "online_serving" / "text_to_image" / "openai_chat_client.py"
+EXAMPLE_OUTPUT_SUBFOLDER = "example_online_t2i"
+
+
+@pytest.fixture(scope="module")
+def example_output_dir() -> Path:
+    d = OUTPUT_DIR / EXAMPLE_OUTPUT_SUBFOLDER
+    d.mkdir(parents=True, exist_ok=True)
+    return d
+
+
+# Share one parameter list so that every indirect parametrization receives exactly the same parameters.
+# Then, only one omni_server instance is started for all test functions using this param set.
+# Two additional prerequisites:
+# - omni_server must be (and indeed is) defined as a module-scoped fixture
+# - test functions sharing the same param set must be adjacent to each other
+qwen_image_server_params = [OmniServerParams(model="Qwen/Qwen-Image")]
+z_image_turbo_server_params = [OmniServerParams(model="Tongyi-MAI/Z-Image-Turbo")]
+
+
+# --- ### Method 1: Using curl ---
+
+
+@pytest.mark.parametrize("omni_server", qwen_image_server_params, indirect=True)
+def test_api_calls_001(omni_server: OmniServer, example_output_dir: Path):
+    url = f"http://{omni_server.host}:{omni_server.port}/v1/chat/completions"
+    case_dir = example_output_dir / "api_calls-001"
+    case_dir.mkdir(parents=True, exist_ok=True)
+    out = case_dir / "api_calls_001.png"
+    json_str = """{
+    "messages": [
+      {"role": "user", "content": "A beautiful landscape painting"}
+    ],
+    "extra_body": {
+      "height": 1024,
+      "width": 1024,
+      "num_inference_steps": 50,
+      "true_cfg_scale": 4.0,
+      "seed": 42
+    }
+}"""
+    run_cmd(
+        f"curl -s '{url}'"
+        " -H 'Content-Type: application/json'"
+        f" -d '{json_str}'"
+        " | jq -r '.choices[0].message.content[0].image_url.url'"
+        f" | cut -d',' -f2- | base64 -d > '{out}'",
+        shell=True,
+    )
+    assert_image_valid(out, width=1024, height=1024)
+
+
+# --- ### Method 2: Using Python Client ---
+
+
+@pytest.mark.parametrize("omni_server", qwen_image_server_params, indirect=True)
+def test_api_calls_002(omni_server: OmniServer, example_output_dir: Path):
+    case_dir = example_output_dir / "api_calls-002"
+    case_dir.mkdir(parents=True, exist_ok=True)
+    out = case_dir / "api_calls_002.png"
+    run_cmd(
+        [
+            sys.executable,
+            str(T2I_ONLINE_CLIENT),
+            "--prompt",
+            "A beautiful landscape painting",
+            "--output",
+            str(out),
+            "--server",
+            f"http://{omni_server.host}:{omni_server.port}",
+        ]
+    )
+    assert_image_valid(out)
+
+
+@pytest.mark.skip("README section 'Method 3: Using Gradio Demo' is intentionally excluded for examples tests")
+def test_api_calls_003(): ...
+
+
+# --- ### Using Python Client with LoRA ---
+
+
+@pytest.mark.parametrize("omni_server", z_image_turbo_server_params, indirect=True)
+def test_lora_001(omni_server: OmniServer, example_output_dir: Path, tmp_path: Path):
+    lora_dir = tmp_path / "zimage_lora_a"
+    write_zimage_lora(lora_dir, v_scale=8.0)
+    case_dir = example_output_dir / "lora-001"
+    case_dir.mkdir(parents=True, exist_ok=True)
+    out = case_dir / "lora_001.png"
+    run_cmd(
+        [
+            sys.executable,
+            str(T2I_ONLINE_CLIENT),
+            "--prompt",
+            "A piece of cheesecake",
+            "--lora-path",
+            str(lora_dir),
+            "--lora-name",
+            "a",
+            "--lora-scale",
+            "1.0",
+            "--output",
+            str(out),
+            "--server",
+            f"http://{omni_server.host}:{omni_server.port}",
+        ]
+    )
+    assert_image_valid(out)
+
+
+# --- ### Using curl with LoRA (Images API) ---
+
+
+@pytest.mark.skip(reason="Covered by tests/e2e/online_serving/test_images_generations_lora.py")
+def test_lora_002(): ...