From 556881c0eb777c76de3d1de4d6e50fe43fbd2040 Mon Sep 17 00:00:00 2001
From: Yadan Wei <yadanwei@amazon.com>
Date: Sun, 26 Apr 2026 21:42:10 -0700
Subject: [PATCH 01/10] vLLM-Omni release docs

Signed-off-by: Yadan Wei <yadanwei@amazon.com>
---
 docs/.nav.yml                                 |   2 +
 docs/src/data/vllm-omni/1.0.0-gpu-ec2.yml     |  28 +++
 .../data/vllm-omni/1.0.0-gpu-sagemaker.yml    |  28 +++
 docs/src/global.yml                           |   3 +
 docs/src/macros.py                            |   2 +
 docs/src/tables/vllm-omni.yml                 |  14 ++
 docs/vllm-omni/index.md                       | 184 ++++++++++++++++++
 examples/vllm-omni/image/invoke_image.sh      |   5 +
 examples/vllm-omni/sagemaker/deploy_tts.py    |  32 +++
 .../vllm-omni/sagemaker/deploy_video_async.py |  35 ++++
 examples/vllm-omni/tts/invoke_tts.sh          |   6 +
 examples/vllm-omni/video/invoke_video.sh      |  22 +++
 12 files changed, 361 insertions(+)
 create mode 100644 docs/src/data/vllm-omni/1.0.0-gpu-ec2.yml
 create mode 100644 docs/src/data/vllm-omni/1.0.0-gpu-sagemaker.yml
 create mode 100644 docs/src/tables/vllm-omni.yml
 create mode 100644 docs/vllm-omni/index.md
 create mode 100755 examples/vllm-omni/image/invoke_image.sh
 create mode 100644 examples/vllm-omni/sagemaker/deploy_tts.py
 create mode 100644 examples/vllm-omni/sagemaker/deploy_video_async.py
 create mode 100755 examples/vllm-omni/tts/invoke_tts.sh
 create mode 100755 examples/vllm-omni/video/invoke_video.sh

diff --git a/docs/.nav.yml b/docs/.nav.yml
index a94963d53480..4c2a53cdf728 100644
--- a/docs/.nav.yml
+++ b/docs/.nav.yml
@@ -5,12 +5,14 @@ nav:
     - Using Deep Learning Containers: get_started/using_dlcs.md
     - Release Notifications: get_started/release_notifications.md
     - Ray: ray/index.md
+    - vLLM-Omni: vllm-omni/index.md
   - Release Notes:
     - releasenotes/index.md
     - Base: releasenotes/base/index.md
     - Ray: releasenotes/ray/index.md
     - SGLang: releasenotes/sglang/index.md
     - vLLM: releasenotes/vllm/index.md
+    - vLLM-Omni: releasenotes/vllm-omni/index.md
     - PyTorch: releasenotes/pytorch/index.md
     - Tensorflow: releasenotes/tensorflow/index.md
   - Tutorials: tutorials
diff --git a/docs/src/data/vllm-omni/1.0.0-gpu-ec2.yml b/docs/src/data/vllm-omni/1.0.0-gpu-ec2.yml
new file mode 100644
index 000000000000..1d5aa65228f6
--- /dev/null
+++ b/docs/src/data/vllm-omni/1.0.0-gpu-ec2.yml
@@ -0,0 +1,28 @@
+framework: vLLM-Omni
+version: "1.0.0"
+accelerator: gpu
+python: py312
+cuda: cu129
+os: amzn2023
+platform: default
+public_registry: true
+
+tags:
+  - "omni-cuda-v1.0.0"
+  - "omni-cuda-v1.0"
+  - "omni-cuda-v1"
+
+announcements:
+  - "Initial release of vLLM-Omni containers for EC2, ECS, EKS"
+  - "Serves omni-modality models: TTS, image generation, video generation, multimodal chat"
+  - "Built on Amazon Linux 2023 with Python 3.12 and CUDA 12.9"
+
+packages:
+  vllm: "0.18.0"
+  vllm_omni: "0.18.0"
+  pytorch: "2.10.0"
+  torchvision: "0.25.0"
+  torchaudio: "2.10.0"
+  cuda: "12.9.1"
+  flashinfer: "0.6.6"
+  efa: "1.47.0"
diff --git a/docs/src/data/vllm-omni/1.0.0-gpu-sagemaker.yml b/docs/src/data/vllm-omni/1.0.0-gpu-sagemaker.yml
new file mode 100644
index 000000000000..588fb7e8fd01
--- /dev/null
+++ b/docs/src/data/vllm-omni/1.0.0-gpu-sagemaker.yml
@@ -0,0 +1,28 @@
+framework: vLLM-Omni
+version: "1.0.0"
+accelerator: gpu
+python: py312
+cuda: cu129
+os: amzn2023
+platform: sagemaker
+public_registry: true
+
+tags:
+  - "omni-cuda-sagemaker-v1.0.0"
+  - "omni-cuda-sagemaker-v1.0"
+  - "omni-cuda-sagemaker-v1"
+
+announcements:
+  - "Initial release of vLLM-Omni containers for SageMaker"
+  - "Includes ASGI routing middleware for /invocations dispatch via CustomAttributes"
+  - "Built on Amazon Linux 2023 with Python 3.12 and CUDA 12.9"
+
+packages:
+  vllm: "0.18.0"
+  vllm_omni: "0.18.0"
+  pytorch: "2.10.0"
+  torchvision: "0.25.0"
+  torchaudio: "2.10.0"
+  cuda: "12.9.1"
+  flashinfer: "0.6.6"
+  efa: "1.47.0"
diff --git a/docs/src/global.yml b/docs/src/global.yml
index 4f5133c38911..fb7ed4c95d23 100644
--- a/docs/src/global.yml
+++ b/docs/src/global.yml
@@ -67,7 +67,9 @@ display_names:
   sagemaker-xgboost: "XGBoost"
   sglang: "SGLang"
   vllm: "vLLM"
+  vllm_omni: "vLLM-Omni"
   vllm-arm64: "vLLM ARM64"
+  vllm-omni: "vLLM-Omni"
   pytorch-training: "PyTorch Training"
   pytorch-training-arm64: "PyTorch Training ARM64"
   pytorch-inference: "PyTorch Inference"
@@ -167,6 +169,7 @@ table_order:
   - sglang
   - vllm
   - vllm-arm64
+  - vllm-omni
   - pytorch-training
   - pytorch-inference
   - pytorch-training-arm64
diff --git a/docs/src/macros.py b/docs/src/macros.py
index 8eab4e930138..cf5ce81be274 100644
--- a/docs/src/macros.py
+++ b/docs/src/macros.py
@@ -42,4 +42,6 @@ def define_env(env):
         "latest_ray_default_cpu": _get_latest_ray_uri("default", "cpu"),
         "latest_ray_sagemaker_gpu": _get_latest_ray_uri("sagemaker", "gpu"),
         "latest_ray_sagemaker_cpu": _get_latest_ray_uri("sagemaker", "cpu"),
+        "latest_vllm_omni_ec2": get_latest_image_uri("vllm-omni", "default"),
+        "latest_vllm_omni_sagemaker": get_latest_image_uri("vllm-omni", "sagemaker"),
     }
diff --git a/docs/src/tables/vllm-omni.yml b/docs/src/tables/vllm-omni.yml
new file mode 100644
index 000000000000..4c4ffa203fa2
--- /dev/null
+++ b/docs/src/tables/vllm-omni.yml
@@ -0,0 +1,14 @@
+# Table Configuration - vLLM-Omni
+columns:
+  - field: framework_version
+    header: "Framework"
+  - field: python
+    header: "Python"
+  - field: cuda
+    header: "CUDA"
+  - field: accelerator
+    header: "Accelerator"
+  - field: platform
+    header: "Platform"
+  - field: example_url
+    header: "Example URL"
diff --git a/docs/vllm-omni/index.md b/docs/vllm-omni/index.md
new file mode 100644
index 000000000000..eb1edabcadb8
--- /dev/null
+++ b/docs/vllm-omni/index.md
@@ -0,0 +1,184 @@
+# vLLM-Omni Inference
+
+Pre-built Docker images for serving omni-modality models (text-to-speech, image generation, video generation, and multimodal chat) with
+[vLLM-Omni](https://github.com/vllm-project/vllm-omni). Built on Amazon Linux 2023 with CUDA 12.9 and Python 3.12.
+
+## Latest Announcements
+
+**vLLM-Omni 1.0.0** — Initial release. Serves TTS, image, video, and omni-chat models through OpenAI-compatible APIs. Includes a SageMaker routing
+middleware for dispatching `/invocations` to any omni endpoint via `CustomAttributes`.
+
+## Pull Commands
+
+**EC2:**
+
+```bash
+docker pull {{ images.latest_vllm_omni_ec2 }}
+```
+
+**SageMaker:**
+
+```bash
+docker pull {{ images.latest_vllm_omni_sagemaker }}
+```
+
+See [Available Images](../reference/available_images.md) for all image URIs and [Getting Started](../get_started/index.md) for authentication
+instructions.
+
+## Packages
+
+For package versions included in each release, see the [Release Notes](../releasenotes/vllm-omni/index.md).
+
+## Supported Modalities
+
+| Modality | Route | Example Model |
+| --- | --- | --- |
+| Text-to-Speech | `/v1/audio/speech` | `Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice` |
+| Image Generation | `/v1/images/generations` | `black-forest-labs/FLUX.2-klein-4B` |
+| Video Generation | `/v1/videos` | `Wan-AI/Wan2.1-T2V-1.3B-Diffusers` |
+| Multimodal Chat | `/v1/chat/completions` | `bytedance-research/BAGEL-7B-MoT`, `Qwen/Qwen2.5-Omni-3B` |
+
+## Model Compatibility
+
+- Models must have a standard HuggingFace `config.json` with a recognized `model_type`, or be diffusers pipeline models with `model_index.json`.
+- Models requiring `--stage-configs-path` (e.g., CosyVoice3, Fish Speech) are not supported in v1.0.0 — the engine subprocess cannot resolve custom
+  model types.
+- Multi-stage omni models (thinker + talker + decoder) like Qwen2.5-Omni need significantly more VRAM than the model size suggests. Refer to the
+  individual model cards for minimum GPU requirements.
+
+## EC2 Deployment
+
+The container runs `vllm serve --omni` and exposes the OpenAI-compatible API on port 8080.
+
+### Start the Server
+
+```bash
+docker run -d --gpus all \
+  --shm-size=2g \
+  -p 8080:8080 \
+  {{ images.latest_vllm_omni_ec2 }} \
+  --model Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice
+
+until curl -sf http://localhost:8080/health > /dev/null; do sleep 5; done
+```
+
+Any `vllm serve` flag may be appended after the image name (e.g., `--tensor-parallel-size 2`, `--max-model-len 2048`, `--enforce-eager`).
+
+### Text-to-Speech
+
+Returns raw audio bytes (WAV).
+
+```bash
+--8<-- "examples/vllm-omni/tts/invoke_tts.sh"
+```
+
+### Image Generation
+
+Returns a JSON response with a base64-encoded image in `data[0].b64_json`.
+
+```bash
+--8<-- "examples/vllm-omni/image/invoke_image.sh"
+```
+
+### Video Generation
+
+The `/v1/videos` endpoint is asynchronous by design — it returns a job ID immediately and generates the video in the background. The request must use
+`multipart/form-data`.
+
+```bash
+--8<-- "examples/vllm-omni/video/invoke_video.sh"
+```
+
+### Multimodal Chat
+
+Use the standard OpenAI chat-completions API. Multimodal inputs (images, audio) are supplied as URL or base64 content parts in the message list.
+
+```bash
+curl -X POST http://localhost:8080/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -d '{"messages": [{"role": "user", "content": "Say hello in one sentence."}], "max_tokens": 64}'
+```
+
+## SageMaker Deployment
+
+### Prerequisites
+
+- AWS CLI configured with appropriate permissions
+- An IAM execution role with SageMaker and ECR permissions (see the [Ray tutorial](../ray/index.md#prerequisites) for an example setup)
+- SageMaker Python SDK v2:
+
+```bash
+pip install 'sagemaker>=2,<3'
+```
+
+### Routing Middleware
+
+The SageMaker image includes an ASGI middleware that dispatches `/invocations` to the correct vllm-omni endpoint based on the `CustomAttributes`
+header:
+
+| `CustomAttributes` | Dispatched to |
+| --- | --- |
+| `route=/v1/audio/speech` | TTS |
+| `route=/v1/images/generations` | Image generation |
+| `route=/v1/videos` | Video generation (JSON auto-converted to form-data) |
+| `route=/v1/chat/completions` | Multimodal chat |
+| *(no route)* | vLLM default `/invocations` (chat/completion/embed) |
+
+### Environment Variables
+
+Any `SM_VLLM_*` env var is converted to a `--<name>` CLI argument (e.g., `SM_VLLM_MAX_MODEL_LEN=2048` → `--max-model-len 2048`).
+
+| Variable | Description | Example |
+| --- | --- | --- |
+| `SM_VLLM_MODEL` | Model ID (HuggingFace or local path) | `Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice` |
+| `SM_VLLM_MAX_MODEL_LEN` | Max sequence length | `2048` |
+| `SM_VLLM_ENFORCE_EAGER` | Disable CUDA graphs | `true` |
+| `SM_VLLM_TENSOR_PARALLEL_SIZE` | Number of GPUs for TP | `2` |
+| `HF_TOKEN` | HuggingFace token for gated models | `hf_...` |
+
+### Deploy a TTS Endpoint
+
+!!! warning "SageMaker endpoint deployment takes several minutes and incurs costs. Remember to delete endpoints when done."
+
+```python
+--8<-- "examples/vllm-omni/sagemaker/deploy_tts.py"
+```
+
+GPU deploys require `inference_ami_version` — the default SageMaker host AMI has incompatible NVIDIA drivers for CUDA 12.9 images. See
+[ProductionVariant API reference](https://docs.aws.amazon.com/sagemaker/latest/APIReference/API_ProductionVariant.html) for valid values.
+
+When done, delete the endpoint:
+
+```python
+predictor.delete_endpoint()
+```
+
+### Async Inference for Video and Long-Running Generation
+
+SageMaker real-time inference has a 60-second timeout. First requests to TTS models may exceed this due to `torch.compile` warmup (~67s); async
+inference avoids the limit, as does retrying after warmup completes.
+
+For `/v1/videos`, async inference is required because the endpoint returns a job ID rather than the final MP4. SageMaker async inference only
+captures that initial JSON response; the MP4 must be retrieved by polling the container directly (see [Known Limitations](#known-limitations)).
+
+```python
+--8<-- "examples/vllm-omni/sagemaker/deploy_video_async.py"
+```
+
+## Known Limitations
+
+- **Video generation on SageMaker returns a job ID only.** The `/v1/videos` endpoint in v1.0.0 is async by design and `POST /v1/videos/sync` (which
+  blocks and returns raw MP4 bytes) is not available. Direct container access (EC2) supports the full video workflow — create job, poll status,
+  download MP4. A sync endpoint has been added in newer vllm-omni versions and will be supported in a future release.
+- **First-request latency on SageMaker real-time endpoints.** TTS models can exceed the 60s invoke timeout on the first request due to `torch.compile`
+  warmup. Use async inference or retry after warmup.
+
+## Release Notes
+
+See [vLLM-Omni Release Notes](../releasenotes/vllm-omni/index.md) for version history and changelogs.
+
+## Resources
+
+- [vLLM-Omni Documentation](https://github.com/vllm-project/vllm-omni)
+- [GitHub Repository](https://github.com/aws/deep-learning-containers)
+- [Available Images](../reference/available_images.md)
diff --git a/examples/vllm-omni/image/invoke_image.sh b/examples/vllm-omni/image/invoke_image.sh
new file mode 100755
index 000000000000..8830334d2512
--- /dev/null
+++ b/examples/vllm-omni/image/invoke_image.sh
@@ -0,0 +1,5 @@
+#!/bin/bash
+# Image generation via OpenAI-compatible /v1/images/generations endpoint
+curl -X POST http://localhost:8080/v1/images/generations \
+  -H "Content-Type: application/json" \
+  -d '{"prompt": "a red apple on a white table", "size": "512x512", "n": 1}'
diff --git a/examples/vllm-omni/sagemaker/deploy_tts.py b/examples/vllm-omni/sagemaker/deploy_tts.py
new file mode 100644
index 000000000000..a4e2d8a91a9a
--- /dev/null
+++ b/examples/vllm-omni/sagemaker/deploy_tts.py
@@ -0,0 +1,32 @@
+"""Deploy a vLLM-Omni TTS model to a real-time SageMaker endpoint."""
+
+from sagemaker.model import Model
+from sagemaker.predictor import Predictor
+from sagemaker.serializers import JSONSerializer
+
+model = Model(
+    image_uri="763104351884.dkr.ecr.us-west-2.amazonaws.com/vllm-omni:omni-cuda-sagemaker-v1.0.0",
+    role="arn:aws:iam::<ACCOUNT>:role/SageMakerExecutionRole",
+    env={"SM_VLLM_MODEL": "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice"},
+    predictor_cls=Predictor,
+)
+
+predictor = model.deploy(
+    instance_type="ml.g5.xlarge",
+    initial_instance_count=1,
+    endpoint_name="vllm-omni-tts",
+    inference_ami_version="al2-ami-sagemaker-inference-gpu-3-1",
+    serializer=JSONSerializer(),
+    wait=True,
+)
+
+# Invoke — route /invocations to /v1/audio/speech via CustomAttributes
+sm_runtime = predictor.sagemaker_session.sagemaker_runtime_client
+response = sm_runtime.invoke_endpoint(
+    EndpointName=predictor.endpoint_name,
+    ContentType="application/json",
+    Body='{"input": "Hello world", "voice": "vivian", "language": "English"}',
+    CustomAttributes="route=/v1/audio/speech",
+)
+with open("speech.wav", "wb") as f:
+    f.write(response["Body"].read())
diff --git a/examples/vllm-omni/sagemaker/deploy_video_async.py b/examples/vllm-omni/sagemaker/deploy_video_async.py
new file mode 100644
index 000000000000..d1ac7c807354
--- /dev/null
+++ b/examples/vllm-omni/sagemaker/deploy_video_async.py
@@ -0,0 +1,35 @@
+"""Deploy a vLLM-Omni video model to a SageMaker async inference endpoint.
+
+Video generation is async by design — /v1/videos returns a job ID immediately,
+so only the job metadata JSON is written to S3, not the MP4 file. To retrieve
+the MP4, poll /v1/videos/<id>/content directly against the container.
+"""
+
+from sagemaker.async_inference import AsyncInferenceConfig
+from sagemaker.model import Model
+from sagemaker.predictor import Predictor
+from sagemaker.serializers import JSONSerializer
+
+model = Model(
+    image_uri="763104351884.dkr.ecr.us-west-2.amazonaws.com/vllm-omni:omni-cuda-sagemaker-v1.0.0",
+    role="arn:aws:iam::<ACCOUNT>:role/SageMakerExecutionRole",
+    env={"SM_VLLM_MODEL": "Wan-AI/Wan2.1-T2V-1.3B-Diffusers"},
+    predictor_cls=Predictor,
+)
+
+predictor = model.deploy(
+    instance_type="ml.g6e.xlarge",
+    initial_instance_count=1,
+    endpoint_name="vllm-omni-video-async",
+    inference_ami_version="al2-ami-sagemaker-inference-gpu-3-1",
+    serializer=JSONSerializer(),
+    async_inference_config=AsyncInferenceConfig(
+        output_path="s3://<BUCKET>/vllm-omni-async-output/",
+        max_concurrent_invocations_per_instance=1,
+    ),
+    wait=True,
+)
+
+# The middleware converts the JSON payload to multipart/form-data for /v1/videos.
+# Response contains the job ID; use the /v1/videos/<id>/content endpoint to
+# retrieve the MP4 bytes directly from the container.
diff --git a/examples/vllm-omni/tts/invoke_tts.sh b/examples/vllm-omni/tts/invoke_tts.sh
new file mode 100755
index 000000000000..935f318492ce
--- /dev/null
+++ b/examples/vllm-omni/tts/invoke_tts.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+# Text-to-speech via OpenAI-compatible /v1/audio/speech endpoint
+curl -X POST http://localhost:8080/v1/audio/speech \
+  -H "Content-Type: application/json" \
+  -d '{"input": "Hello, how are you?", "voice": "vivian", "language": "English"}' \
+  --output speech.wav
diff --git a/examples/vllm-omni/video/invoke_video.sh b/examples/vllm-omni/video/invoke_video.sh
new file mode 100755
index 000000000000..3e6c4ab36d68
--- /dev/null
+++ b/examples/vllm-omni/video/invoke_video.sh
@@ -0,0 +1,22 @@
+#!/bin/bash
+# Video generation via /v1/videos endpoint (async — returns a job ID)
+# The /v1/videos API requires multipart/form-data.
+JOB=$(curl -sf -X POST http://localhost:8080/v1/videos \
+  -F "prompt=a dog running on a beach" \
+  -F "num_frames=17" \
+  -F "num_inference_steps=4" \
+  -F "size=480x320" \
+  -F "seed=42")
+
+JOB_ID=$(echo "$JOB" | python3 -c "import json,sys;print(json.load(sys.stdin)['id'])")
+echo "Job: $JOB_ID"
+
+# Poll until complete, then download
+while true; do
+  STATUS=$(curl -sf "http://localhost:8080/v1/videos/$JOB_ID" | python3 -c "import json,sys;print(json.load(sys.stdin)['status'])")
+  [ "$STATUS" = "succeeded" ] && break
+  [ "$STATUS" = "failed" ] && { echo "Job failed"; exit 1; }
+  sleep 5
+done
+
+curl -sf "http://localhost:8080/v1/videos/$JOB_ID/content" --output video.mp4

From a54efab526532ebf35f3bbe5b49dc35cbb887cc8 Mon Sep 17 00:00:00 2001
From: Yadan Wei <yadanwei@amazon.com>
Date: Sun, 26 Apr 2026 22:18:10 -0700
Subject: [PATCH 02/10] Add Qwen2.5-Omni-3B EC2 tutorial

Signed-off-by: Yadan Wei <yadanwei@amazon.com>
---
 docs/.nav.yml                                 |   1 +
 docs/vllm-omni/index.md                       |   4 +
 docs/vllm-omni/qwen2.5-omni.md                | 140 ++++++++++++++++++
 .../qwen2.5-omni/offline_inference.py         |  83 +++++++++++
 .../qwen2.5-omni/online_inference.py          |  25 ++++
 5 files changed, 253 insertions(+)
 create mode 100644 docs/vllm-omni/qwen2.5-omni.md
 create mode 100644 examples/vllm-omni/qwen2.5-omni/offline_inference.py
 create mode 100644 examples/vllm-omni/qwen2.5-omni/online_inference.py

diff --git a/docs/.nav.yml b/docs/.nav.yml
index 4c2a53cdf728..35fd502aba22 100644
--- a/docs/.nav.yml
+++ b/docs/.nav.yml
@@ -6,6 +6,7 @@ nav:
     - Release Notifications: get_started/release_notifications.md
     - Ray: ray/index.md
     - vLLM-Omni: vllm-omni/index.md
+    - Qwen2.5-Omni on EC2: vllm-omni/qwen2.5-omni.md
   - Release Notes:
     - releasenotes/index.md
     - Base: releasenotes/base/index.md
diff --git a/docs/vllm-omni/index.md b/docs/vllm-omni/index.md
index eb1edabcadb8..013ac5954fc2 100644
--- a/docs/vllm-omni/index.md
+++ b/docs/vllm-omni/index.md
@@ -177,6 +177,10 @@ the container directly — SageMaker async inference only captures the initial J
 
 See [vLLM-Omni Release Notes](../releasenotes/vllm-omni/index.md) for version history and changelogs.
 
+## Model Tutorials
+
+- [Qwen2.5-Omni-3B on EC2 GPU](qwen2.5-omni.md) — multi-GPU setup, audio output gotchas, offline + online inference
+
 ## Resources
 
 - [vLLM-Omni Documentation](https://github.com/vllm-project/vllm-omni)
diff --git a/docs/vllm-omni/qwen2.5-omni.md b/docs/vllm-omni/qwen2.5-omni.md
new file mode 100644
index 000000000000..dda730ec1c49
--- /dev/null
+++ b/docs/vllm-omni/qwen2.5-omni.md
@@ -0,0 +1,140 @@
+# Qwen2.5-Omni-3B on EC2 GPU
+
+Run [Qwen/Qwen2.5-Omni-3B](https://huggingface.co/Qwen/Qwen2.5-Omni-3B) (multimodal-in / text + speech-out) using the vLLM-Omni container — both as a
+local (offline) server and as a remote (online) endpoint.
+
+## Requirements
+
+- **EC2 GPU instance with ≥ 4 GPUs**:
+  - `g5.12xlarge` / `g6.12xlarge` (4× A10G, 24 GB each) — tested
+  - `g6e.12xlarge` (4× L40S, 48 GB each) — preferred when available
+- Amazon Linux 2023 with NVIDIA driver, Docker, and `nvidia-container-toolkit` (AWS Deep Learning AMIs include these)
+- AWS credentials with ECR pull permission for `763104351884`
+- Outbound internet to HuggingFace (first run downloads ~6 GB)
+
+!!! note "Single-GPU note" Qwen2.5-Omni-3B's default stage layout puts the talker on GPU 1. On a single-GPU instance it fails or produces distorted
+audio. Use a 4-GPU instance.
+
+## One-time setup
+
+```bash
+# ECR login
+aws ecr get-login-password --region us-west-2 | \
+  docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-west-2.amazonaws.com
+
+docker pull {{ images.latest_vllm_omni_ec2 }}
+
+mkdir -p ~/hf-cache
+```
+
+## Start the server
+
+```bash
+docker run -d --name omni3b \
+  --gpus all --shm-size=16g -p 8080:8080 \
+  -v ~/hf-cache:/root/.cache/huggingface \
+  -e HF_HUB_ENABLE_HF_TRANSFER=1 \
+  {{ images.latest_vllm_omni_ec2 }} \
+    Qwen/Qwen2.5-Omni-3B \
+    --host 0.0.0.0 --port 8080 \
+    --max-model-len 16384 --dtype bfloat16
+```
+
+First start takes ~8 minutes (weight download + 3-stage model load). Wait for ready:
+
+```bash
+until curl -sf http://localhost:8080/health >/dev/null; do sleep 10; done
+echo ready
+```
+
+Stop and remove:
+
+```bash
+docker stop omni3b && docker rm omni3b
+```
+
+## Getting clean audio out
+
+Three things are **required** on `/v1/chat/completions` to produce usable speech from Qwen2.5-Omni-3B:
+
+1. `"modalities": ["audio"]`
+2. `"sampling_params_list"` — a 3-element list (thinker, talker, code2wav). The image's built-in per-stage defaults are wrong and produce noise. Use
+   the values shown below (from the official Qwen docs).
+3. The exact Qwen system prompt.
+
+!!! warning "Omitting `sampling_params_list` produces noise even though HTTP returns 200 with valid WAV bytes."
+
+### Working curl
+
+```bash
+curl -s http://localhost:8080/v1/chat/completions \
+  -H 'Content-Type: application/json' \
+  -d '{
+    "model": "Qwen/Qwen2.5-Omni-3B",
+    "modalities": ["audio"],
+    "sampling_params_list": [
+      {"temperature":0.0,"top_p":1.0,"top_k":-1,"max_tokens":2048,"seed":42,"detokenize":true,"repetition_penalty":1.1},
+      {"temperature":0.9,"top_p":0.8,"top_k":40,"max_tokens":2048,"seed":42,"detokenize":true,"repetition_penalty":1.05,"stop_token_ids":[8294]},
+      {"temperature":0.0,"top_p":1.0,"top_k":-1,"max_tokens":2048,"seed":42,"detokenize":true,"repetition_penalty":1.1}
+    ],
+    "messages": [
+      {"role":"system","content":[{"type":"text","text":"You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech."}]},
+      {"role":"user","content":[{"type":"text","text":"Tell me a short, calming bedtime lullaby story for a 6-year-old girl."}]}
+    ]
+  }' | jq -r '.choices[0].message.audio.data' | base64 -d > out.wav
+```
+
+## Offline inference (on the GPU instance)
+
+```python
+--8<-- "examples/vllm-omni/qwen2.5-omni/offline_inference.py"
+```
+
+Run it:
+
+```bash
+python3 offline_inference.py
+aplay out/lullaby.wav   # afplay on macOS
+```
+
+## Online inference (from a remote client)
+
+Open TCP 8080 in the EC2 security group to your client IP, then:
+
+```bash
+export OMNI_ENDPOINT=http://ec2-xx-xx-xx-xx.us-west-2.compute.amazonaws.com:8080
+python3 online_inference.py
+```
+
+```python
+--8<-- "examples/vllm-omni/qwen2.5-omni/online_inference.py"
+```
+
+## API overview
+
+OpenAI-compatible endpoints exposed by the container:
+
+| Endpoint | Purpose |
+| --- | --- |
+| `POST /v1/chat/completions` | Text / multimodal in → text or audio out (see above for audio) |
+| `POST /v1/audio/speech` | Direct text-to-speech shortcut (voices: `Chelsie`, `Ethan`). ⚠️ In v1.0.0 the shortcut bypasses the thinker and does not apply the correct sampling params, producing noisy output. Prefer the chat route. |
+| `GET /v1/audio/voices` | List voices |
+| `GET /v1/models` | Show served model id |
+| `GET /health` | Liveness |
+
+## Troubleshooting
+
+| Symptom | Fix |
+| --- | --- |
+| `NVMLError_InvalidArgument` in stage 1 during startup | Single-GPU instance — use a 4-GPU instance. |
+| Audio sounds like noise/gibberish | Missing `sampling_params_list` — add it per above. |
+| `message.audio: {}` empty on chat completions | Using `"modalities": ["text","audio"]`. Use `["audio"]` only. |
+| `Cannot perform interactive login from non-TTY device` | AWS creds expired. Refresh `~/.aws/credentials` and re-run ECR login. |
+| Health never goes 200 | Inspect `docker logs omni3b`. Weight download or OOM — need ≥4 GPUs with ≥24 GB each. |
+
+## Costs (us-west-2, on-demand, April 2026)
+
+- `g5.12xlarge` ≈ $5.67 / hour
+- `g6e.12xlarge` ≈ $10.49 / hour
+
+Stop the instance when idle; terminate to free EBS.
diff --git a/examples/vllm-omni/qwen2.5-omni/offline_inference.py b/examples/vllm-omni/qwen2.5-omni/offline_inference.py
new file mode 100644
index 000000000000..c71d3c57d3a2
--- /dev/null
+++ b/examples/vllm-omni/qwen2.5-omni/offline_inference.py
@@ -0,0 +1,83 @@
+"""Offline inference against a local vLLM-Omni server running Qwen2.5-Omni-3B.
+
+Assumes the server is already running on http://localhost:8080 (see the
+Qwen2.5-Omni tutorial for `docker run` instructions).
+"""
+
+import base64
+import json
+import os
+import pathlib
+
+import requests
+
+ENDPOINT = os.environ.get("OMNI_ENDPOINT", "http://localhost:8080")
+MODEL = "Qwen/Qwen2.5-Omni-3B"
+SYSTEM = (
+    "You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, "
+    "capable of perceiving auditory and visual inputs, as well as generating "
+    "text and speech."
+)
+
+# Three per-stage sampling params (thinker, talker, code2wav) are REQUIRED for
+# clean audio. The built-in defaults produce noise. Do not omit.
+SAMPLING_PARAMS_LIST = [
+    {
+        "temperature": 0.0,
+        "top_p": 1.0,
+        "top_k": -1,
+        "max_tokens": 2048,
+        "seed": 42,
+        "detokenize": True,
+        "repetition_penalty": 1.1,
+    },
+    {
+        "temperature": 0.9,
+        "top_p": 0.8,
+        "top_k": 40,
+        "max_tokens": 2048,
+        "seed": 42,
+        "detokenize": True,
+        "repetition_penalty": 1.05,
+        "stop_token_ids": [8294],
+    },
+    {
+        "temperature": 0.0,
+        "top_p": 1.0,
+        "top_k": -1,
+        "max_tokens": 2048,
+        "seed": 42,
+        "detokenize": True,
+        "repetition_penalty": 1.1,
+    },
+]
+
+
+def generate_audio(prompt: str, out_path: pathlib.Path) -> None:
+    payload = {
+        "model": MODEL,
+        "modalities": ["audio"],
+        "sampling_params_list": SAMPLING_PARAMS_LIST,
+        "messages": [
+            {"role": "system", "content": [{"type": "text", "text": SYSTEM}]},
+            {"role": "user", "content": [{"type": "text", "text": prompt}]},
+        ],
+    }
+    response = requests.post(
+        f"{ENDPOINT}/v1/chat/completions",
+        headers={"Content-Type": "application/json"},
+        data=json.dumps(payload),
+        timeout=600,
+    )
+    response.raise_for_status()
+    audio_b64 = response.json()["choices"][0]["message"]["audio"]["data"]
+    out_path.parent.mkdir(parents=True, exist_ok=True)
+    out_path.write_bytes(base64.b64decode(audio_b64))
+    print(f"wrote {out_path} ({out_path.stat().st_size} bytes)")
+
+
+if __name__ == "__main__":
+    generate_audio(
+        "Tell me a short, calming bedtime lullaby story for a 6-year-old girl.",
+        pathlib.Path("out/lullaby.wav"),
+    )
diff --git a/examples/vllm-omni/qwen2.5-omni/online_inference.py b/examples/vllm-omni/qwen2.5-omni/online_inference.py
new file mode 100644
index 000000000000..798f87124fea
--- /dev/null
+++ b/examples/vllm-omni/qwen2.5-omni/online_inference.py
@@ -0,0 +1,25 @@
+"""Online inference against a remote vLLM-Omni server running Qwen2.5-Omni-3B.
+
+Set OMNI_ENDPOINT to the public URL of your EC2 instance, e.g.:
+  export OMNI_ENDPOINT=http://ec2-xx-xx-xx-xx.us-west-2.compute.amazonaws.com:8080
+
+See offline_inference.py for the local-server variant — the only difference is
+the default endpoint.
+"""
+
+import os
+import pathlib
+
+from offline_inference import generate_audio
+
+if __name__ == "__main__":
+    endpoint = os.environ.get("OMNI_ENDPOINT")
+    if not endpoint or endpoint.startswith("http://localhost"):
+        raise SystemExit(
+            "Set OMNI_ENDPOINT to the remote server URL, e.g. "
+            "export OMNI_ENDPOINT=http://<ec2-host>:8080"
+        )
+    generate_audio(
+        "Briefly describe the weather on Mars today.",
+        pathlib.Path("out/mars.wav"),
+    )

From 2f42c05e7bb6a76ef7d6ad329c59575a42997feb Mon Sep 17 00:00:00 2001
From: Yadan Wei <yadanwei@amazon.com>
Date: Sun, 26 Apr 2026 22:19:41 -0700
Subject: [PATCH 03/10] Inline Qwen2.5-Omni example into vllm-omni tutorial

Signed-off-by: Yadan Wei <yadanwei@amazon.com>
---
 docs/.nav.yml                                 |   1 -
 docs/vllm-omni/index.md                       |   4 -
 docs/vllm-omni/qwen2.5-omni.md                | 140 ------------------
 .../qwen2.5-omni/online_inference.py          |  25 ----
 4 files changed, 170 deletions(-)
 delete mode 100644 docs/vllm-omni/qwen2.5-omni.md
 delete mode 100644 examples/vllm-omni/qwen2.5-omni/online_inference.py

diff --git a/docs/.nav.yml b/docs/.nav.yml
index 35fd502aba22..4c2a53cdf728 100644
--- a/docs/.nav.yml
+++ b/docs/.nav.yml
@@ -6,7 +6,6 @@ nav:
     - Release Notifications: get_started/release_notifications.md
     - Ray: ray/index.md
     - vLLM-Omni: vllm-omni/index.md
-    - Qwen2.5-Omni on EC2: vllm-omni/qwen2.5-omni.md
   - Release Notes:
     - releasenotes/index.md
     - Base: releasenotes/base/index.md
diff --git a/docs/vllm-omni/index.md b/docs/vllm-omni/index.md
index 013ac5954fc2..eb1edabcadb8 100644
--- a/docs/vllm-omni/index.md
+++ b/docs/vllm-omni/index.md
@@ -177,10 +177,6 @@ the container directly — SageMaker async inference only captures the initial J
 
 See [vLLM-Omni Release Notes](../releasenotes/vllm-omni/index.md) for version history and changelogs.
 
-## Model Tutorials
-
-- [Qwen2.5-Omni-3B on EC2 GPU](qwen2.5-omni.md) — multi-GPU setup, audio output gotchas, offline + online inference
-
 ## Resources
 
 - [vLLM-Omni Documentation](https://github.com/vllm-project/vllm-omni)
diff --git a/docs/vllm-omni/qwen2.5-omni.md b/docs/vllm-omni/qwen2.5-omni.md
deleted file mode 100644
index dda730ec1c49..000000000000
--- a/docs/vllm-omni/qwen2.5-omni.md
+++ /dev/null
@@ -1,140 +0,0 @@
-# Qwen2.5-Omni-3B on EC2 GPU
-
-Run [Qwen/Qwen2.5-Omni-3B](https://huggingface.co/Qwen/Qwen2.5-Omni-3B) (multimodal-in / text + speech-out) using the vLLM-Omni container — both as a
-local (offline) server and as a remote (online) endpoint.
-
-## Requirements
-
-- **EC2 GPU instance with ≥ 4 GPUs**:
-  - `g5.12xlarge` / `g6.12xlarge` (4× A10G, 24 GB each) — tested
-  - `g6e.12xlarge` (4× L40S, 48 GB each) — preferred when available
-- Amazon Linux 2023 with NVIDIA driver, Docker, and `nvidia-container-toolkit` (AWS Deep Learning AMIs include these)
-- AWS credentials with ECR pull permission for `763104351884`
-- Outbound internet to HuggingFace (first run downloads ~6 GB)
-
-!!! note "Single-GPU note" Qwen2.5-Omni-3B's default stage layout puts the talker on GPU 1. On a single-GPU instance it fails or produces distorted
-audio. Use a 4-GPU instance.
-
-## One-time setup
-
-```bash
-# ECR login
-aws ecr get-login-password --region us-west-2 | \
-  docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-west-2.amazonaws.com
-
-docker pull {{ images.latest_vllm_omni_ec2 }}
-
-mkdir -p ~/hf-cache
-```
-
-## Start the server
-
-```bash
-docker run -d --name omni3b \
-  --gpus all --shm-size=16g -p 8080:8080 \
-  -v ~/hf-cache:/root/.cache/huggingface \
-  -e HF_HUB_ENABLE_HF_TRANSFER=1 \
-  {{ images.latest_vllm_omni_ec2 }} \
-    Qwen/Qwen2.5-Omni-3B \
-    --host 0.0.0.0 --port 8080 \
-    --max-model-len 16384 --dtype bfloat16
-```
-
-First start takes ~8 minutes (weight download + 3-stage model load). Wait for ready:
-
-```bash
-until curl -sf http://localhost:8080/health >/dev/null; do sleep 10; done
-echo ready
-```
-
-Stop and remove:
-
-```bash
-docker stop omni3b && docker rm omni3b
-```
-
-## Getting clean audio out
-
-Three things are **required** on `/v1/chat/completions` to produce usable speech from Qwen2.5-Omni-3B:
-
-1. `"modalities": ["audio"]`
-2. `"sampling_params_list"` — a 3-element list (thinker, talker, code2wav). The image's built-in per-stage defaults are wrong and produce noise. Use
-   the values shown below (from the official Qwen docs).
-3. The exact Qwen system prompt.
-
-!!! warning "Omitting `sampling_params_list` produces noise even though HTTP returns 200 with valid WAV bytes."
-
-### Working curl
-
-```bash
-curl -s http://localhost:8080/v1/chat/completions \
-  -H 'Content-Type: application/json' \
-  -d '{
-    "model": "Qwen/Qwen2.5-Omni-3B",
-    "modalities": ["audio"],
-    "sampling_params_list": [
-      {"temperature":0.0,"top_p":1.0,"top_k":-1,"max_tokens":2048,"seed":42,"detokenize":true,"repetition_penalty":1.1},
-      {"temperature":0.9,"top_p":0.8,"top_k":40,"max_tokens":2048,"seed":42,"detokenize":true,"repetition_penalty":1.05,"stop_token_ids":[8294]},
-      {"temperature":0.0,"top_p":1.0,"top_k":-1,"max_tokens":2048,"seed":42,"detokenize":true,"repetition_penalty":1.1}
-    ],
-    "messages": [
-      {"role":"system","content":[{"type":"text","text":"You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech."}]},
-      {"role":"user","content":[{"type":"text","text":"Tell me a short, calming bedtime lullaby story for a 6-year-old girl."}]}
-    ]
-  }' | jq -r '.choices[0].message.audio.data' | base64 -d > out.wav
-```
-
-## Offline inference (on the GPU instance)
-
-```python
---8<-- "examples/vllm-omni/qwen2.5-omni/offline_inference.py"
-```
-
-Run it:
-
-```bash
-python3 offline_inference.py
-aplay out/lullaby.wav   # afplay on macOS
-```
-
-## Online inference (from a remote client)
-
-Open TCP 8080 in the EC2 security group to your client IP, then:
-
-```bash
-export OMNI_ENDPOINT=http://ec2-xx-xx-xx-xx.us-west-2.compute.amazonaws.com:8080
-python3 online_inference.py
-```
-
-```python
---8<-- "examples/vllm-omni/qwen2.5-omni/online_inference.py"
-```
-
-## API overview
-
-OpenAI-compatible endpoints exposed by the container:
-
-| Endpoint | Purpose |
-| --- | --- |
-| `POST /v1/chat/completions` | Text / multimodal in → text or audio out (see above for audio) |
-| `POST /v1/audio/speech` | Direct text-to-speech shortcut (voices: `Chelsie`, `Ethan`). ⚠️ In v1.0.0 the shortcut bypasses the thinker and does not apply the correct sampling params, producing noisy output. Prefer the chat route. |
-| `GET /v1/audio/voices` | List voices |
-| `GET /v1/models` | Show served model id |
-| `GET /health` | Liveness |
-
-## Troubleshooting
-
-| Symptom | Fix |
-| --- | --- |
-| `NVMLError_InvalidArgument` in stage 1 during startup | Single-GPU instance — use a 4-GPU instance. |
-| Audio sounds like noise/gibberish | Missing `sampling_params_list` — add it per above. |
-| `message.audio: {}` empty on chat completions | Using `"modalities": ["text","audio"]`. Use `["audio"]` only. |
-| `Cannot perform interactive login from non-TTY device` | AWS creds expired. Refresh `~/.aws/credentials` and re-run ECR login. |
-| Health never goes 200 | Inspect `docker logs omni3b`. Weight download or OOM — need ≥4 GPUs with ≥24 GB each. |
-
-## Costs (us-west-2, on-demand, April 2026)
-
-- `g5.12xlarge` ≈ $5.67 / hour
-- `g6e.12xlarge` ≈ $10.49 / hour
-
-Stop the instance when idle; terminate to free EBS.
diff --git a/examples/vllm-omni/qwen2.5-omni/online_inference.py b/examples/vllm-omni/qwen2.5-omni/online_inference.py
deleted file mode 100644
index 798f87124fea..000000000000
--- a/examples/vllm-omni/qwen2.5-omni/online_inference.py
+++ /dev/null
@@ -1,25 +0,0 @@
-"""Online inference against a remote vLLM-Omni server running Qwen2.5-Omni-3B.
-
-Set OMNI_ENDPOINT to the public URL of your EC2 instance, e.g.:
-  export OMNI_ENDPOINT=http://ec2-xx-xx-xx-xx.us-west-2.compute.amazonaws.com:8080
-
-See offline_inference.py for the local-server variant — the only difference is
-the default endpoint.
-"""
-
-import os
-import pathlib
-
-from offline_inference import generate_audio
-
-if __name__ == "__main__":
-    endpoint = os.environ.get("OMNI_ENDPOINT")
-    if not endpoint or endpoint.startswith("http://localhost"):
-        raise SystemExit(
-            "Set OMNI_ENDPOINT to the remote server URL, e.g. "
-            "export OMNI_ENDPOINT=http://<ec2-host>:8080"
-        )
-    generate_audio(
-        "Briefly describe the weather on Mars today.",
-        pathlib.Path("out/mars.wav"),
-    )

From ebc5b220725c16789f11578103d6ee3cd7ef1303 Mon Sep 17 00:00:00 2001
From: Yadan Wei <yadanwei@amazon.com>
Date: Sun, 26 Apr 2026 22:22:20 -0700
Subject: [PATCH 04/10] Replace EC2 curl examples with end-to-end Python
 clients

Signed-off-by: Yadan Wei <yadanwei@amazon.com>
---
 docs/vllm-omni/index.md                  | 58 +++++++++++++++------
 examples/vllm-omni/image/inference.py    | 32 ++++++++++++
 examples/vllm-omni/image/invoke_image.sh |  5 --
 examples/vllm-omni/tts/inference.py      | 31 +++++++++++
 examples/vllm-omni/tts/invoke_tts.sh     |  6 ---
 examples/vllm-omni/video/inference.py    | 65 ++++++++++++++++++++++++
 examples/vllm-omni/video/invoke_video.sh | 22 --------
 7 files changed, 171 insertions(+), 48 deletions(-)
 create mode 100644 examples/vllm-omni/image/inference.py
 delete mode 100755 examples/vllm-omni/image/invoke_image.sh
 create mode 100644 examples/vllm-omni/tts/inference.py
 delete mode 100755 examples/vllm-omni/tts/invoke_tts.sh
 create mode 100644 examples/vllm-omni/video/inference.py
 delete mode 100755 examples/vllm-omni/video/invoke_video.sh

diff --git a/docs/vllm-omni/index.md b/docs/vllm-omni/index.md
index eb1edabcadb8..9d3c1fd1c6c7 100644
--- a/docs/vllm-omni/index.md
+++ b/docs/vllm-omni/index.md
@@ -48,45 +48,73 @@ For package versions included in each release, see the [Release Notes](../releas
 
 ## EC2 Deployment
 
-The container runs `vllm serve --omni` and exposes the OpenAI-compatible API on port 8080.
+The container runs `vllm serve --omni` and exposes the OpenAI-compatible API on port 8080. Each example below picks a representative model for its
+modality — any `vllm serve` flag may be appended to `docker run` (e.g., `--tensor-parallel-size 2`, `--max-model-len 2048`, `--enforce-eager`).
 
-### Start the Server
+All three examples below use the same Python client pattern. Install the one dependency:
 
 ```bash
-docker run -d --gpus all \
-  --shm-size=2g \
-  -p 8080:8080 \
+pip install requests
+```
+
+### Text-to-Speech
+
+Start the server with a TTS model:
+
+```bash
+docker run -d --gpus all --shm-size=2g -p 8080:8080 \
   {{ images.latest_vllm_omni_ec2 }} \
   --model Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice
 
-until curl -sf http://localhost:8080/health > /dev/null; do sleep 5; done
+until curl -sf http://localhost:8080/health >/dev/null; do sleep 5; done
 ```
 
-Any flag accepted by `vllm serve` may be appended to `docker run` (e.g., `--tensor-parallel-size 2`, `--max-model-len 2048`, `--enforce-eager`).
+Submit a request and write the returned WAV bytes to disk:
 
-### Text-to-Speech
-
-Returns raw audio bytes (WAV).
+```python
+--8<-- "examples/vllm-omni/tts/inference.py"
+```
 
 ```bash
---8<-- "examples/vllm-omni/tts/invoke_tts.sh"
+python3 inference.py
+aplay out/speech.wav   # afplay on macOS
 ```
 
 ### Image Generation
 
-Returns a JSON response with a base64-encoded image in `data[0].b64_json`.
+Start the server with an image-generation model:
 
 ```bash
---8<-- "examples/vllm-omni/image/invoke_image.sh"
+docker run -d --gpus all --shm-size=2g -p 8080:8080 \
+  {{ images.latest_vllm_omni_ec2 }} \
+  --model black-forest-labs/FLUX.2-klein-4B
+
+until curl -sf http://localhost:8080/health >/dev/null; do sleep 5; done
+```
+
+The response JSON contains a base64-encoded PNG in `data[0].b64_json`:
+
+```python
+--8<-- "examples/vllm-omni/image/inference.py"
 ```
 
 ### Video Generation
 
 The `/v1/videos` endpoint is asynchronous by design — it returns a job ID immediately and generates the video in the background. The request must use
-`multipart/form-data`.
+`multipart/form-data`. The client below submits the job, polls until it completes, then downloads the MP4.
+
+Start the server with a video-generation model:
 
 ```bash
---8<-- "examples/vllm-omni/video/invoke_video.sh"
+docker run -d --gpus all --shm-size=8g -p 8080:8080 \
+  {{ images.latest_vllm_omni_ec2 }} \
+  --model Wan-AI/Wan2.1-T2V-1.3B-Diffusers
+
+until curl -sf http://localhost:8080/health >/dev/null; do sleep 5; done
+```
+
+```python
+--8<-- "examples/vllm-omni/video/inference.py"
 ```
 
 ### Multimodal Chat
diff --git a/examples/vllm-omni/image/inference.py b/examples/vllm-omni/image/inference.py
new file mode 100644
index 000000000000..70d2f6ccf33a
--- /dev/null
+++ b/examples/vllm-omni/image/inference.py
@@ -0,0 +1,32 @@
+"""End-to-end image generation example against a local vLLM-Omni server.
+
+Prereq: start the server with an image-generation model, e.g.
+  docker run -d --gpus all -p 8080:8080 <vllm-omni-image> \
+    --model black-forest-labs/FLUX.2-klein-4B
+"""
+
+import base64
+import os
+import pathlib
+
+import requests
+
+ENDPOINT = os.environ.get("OMNI_ENDPOINT", "http://localhost:8080")
+OUT_PATH = pathlib.Path("out/image.png")
+
+
+def generate(prompt: str, size: str = "512x512") -> bytes:
+    response = requests.post(
+        f"{ENDPOINT}/v1/images/generations",
+        json={"prompt": prompt, "size": size, "n": 1},
+        timeout=300,
+    )
+    response.raise_for_status()
+    return base64.b64decode(response.json()["data"][0]["b64_json"])
+
+
+if __name__ == "__main__":
+    image = generate("a red apple on a white table, studio lighting")
+    OUT_PATH.parent.mkdir(parents=True, exist_ok=True)
+    OUT_PATH.write_bytes(image)
+    print(f"wrote {OUT_PATH} ({len(image)} bytes)")
diff --git a/examples/vllm-omni/image/invoke_image.sh b/examples/vllm-omni/image/invoke_image.sh
deleted file mode 100755
index 8830334d2512..000000000000
--- a/examples/vllm-omni/image/invoke_image.sh
+++ /dev/null
@@ -1,5 +0,0 @@
-#!/bin/bash
-# Image generation via OpenAI-compatible /v1/images/generations endpoint
-curl -X POST http://localhost:8080/v1/images/generations \
-  -H "Content-Type: application/json" \
-  -d '{"prompt": "a red apple on a white table", "size": "512x512", "n": 1}'
diff --git a/examples/vllm-omni/tts/inference.py b/examples/vllm-omni/tts/inference.py
new file mode 100644
index 000000000000..3192b5071ffd
--- /dev/null
+++ b/examples/vllm-omni/tts/inference.py
@@ -0,0 +1,31 @@
+"""End-to-end TTS example against a local vLLM-Omni server.
+
+Prereq: start the server with a TTS model, e.g.
+  docker run -d --gpus all -p 8080:8080 <vllm-omni-image> \
+    --model Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice
+"""
+
+import os
+import pathlib
+
+import requests
+
+ENDPOINT = os.environ.get("OMNI_ENDPOINT", "http://localhost:8080")
+OUT_PATH = pathlib.Path("out/speech.wav")
+
+
+def synthesize(text: str, voice: str = "vivian", language: str = "English") -> bytes:
+    response = requests.post(
+        f"{ENDPOINT}/v1/audio/speech",
+        json={"input": text, "voice": voice, "language": language},
+        timeout=300,
+    )
+    response.raise_for_status()
+    return response.content
+
+
+if __name__ == "__main__":
+    audio = synthesize("Hello from vLLM-Omni. This is a text to speech demo.")
+    OUT_PATH.parent.mkdir(parents=True, exist_ok=True)
+    OUT_PATH.write_bytes(audio)
+    print(f"wrote {OUT_PATH} ({len(audio)} bytes)")
diff --git a/examples/vllm-omni/tts/invoke_tts.sh b/examples/vllm-omni/tts/invoke_tts.sh
deleted file mode 100755
index 935f318492ce..000000000000
--- a/examples/vllm-omni/tts/invoke_tts.sh
+++ /dev/null
@@ -1,6 +0,0 @@
-#!/bin/bash
-# Text-to-speech via OpenAI-compatible /v1/audio/speech endpoint
-curl -X POST http://localhost:8080/v1/audio/speech \
-  -H "Content-Type: application/json" \
-  -d '{"input": "Hello, how are you?", "voice": "vivian", "language": "English"}' \
-  --output speech.wav
diff --git a/examples/vllm-omni/video/inference.py b/examples/vllm-omni/video/inference.py
new file mode 100644
index 000000000000..9a982fcedeb3
--- /dev/null
+++ b/examples/vllm-omni/video/inference.py
@@ -0,0 +1,65 @@
+"""End-to-end video generation example against a local vLLM-Omni server.
+
+The /v1/videos endpoint is async — it returns a job ID immediately, and the
+video is generated in the background. This script submits the job, polls
+until it completes, then downloads the MP4.
+
+Prereq: start the server with a video-generation model, e.g.
+  docker run -d --gpus all -p 8080:8080 <vllm-omni-image> \
+    --model Wan-AI/Wan2.1-T2V-1.3B-Diffusers
+"""
+
+import os
+import pathlib
+import time
+
+import requests
+
+ENDPOINT = os.environ.get("OMNI_ENDPOINT", "http://localhost:8080")
+OUT_PATH = pathlib.Path("out/video.mp4")
+POLL_INTERVAL_S = 5
+POLL_TIMEOUT_S = 600
+
+
+def submit_job(prompt: str) -> str:
+    # /v1/videos requires multipart/form-data
+    response = requests.post(
+        f"{ENDPOINT}/v1/videos",
+        files={
+            "prompt": (None, prompt),
+            "num_frames": (None, "17"),
+            "num_inference_steps": (None, "4"),
+            "size": (None, "480x320"),
+            "seed": (None, "42"),
+        },
+        timeout=60,
+    )
+    response.raise_for_status()
+    return response.json()["id"]
+
+
+def wait_for_completion(job_id: str) -> None:
+    deadline = time.time() + POLL_TIMEOUT_S
+    while time.time() < deadline:
+        status = requests.get(f"{ENDPOINT}/v1/videos/{job_id}", timeout=30).json()["status"]
+        if status == "succeeded":
+            return
+        if status == "failed":
+            raise RuntimeError(f"Job {job_id} failed")
+        time.sleep(POLL_INTERVAL_S)
+    raise TimeoutError(f"Job {job_id} did not complete within {POLL_TIMEOUT_S}s")
+
+
+def download(job_id: str, out_path: pathlib.Path) -> None:
+    response = requests.get(f"{ENDPOINT}/v1/videos/{job_id}/content", timeout=60)
+    response.raise_for_status()
+    out_path.parent.mkdir(parents=True, exist_ok=True)
+    out_path.write_bytes(response.content)
+
+
+if __name__ == "__main__":
+    job_id = submit_job("a dog running on a beach at sunset")
+    print(f"submitted job {job_id}")
+    wait_for_completion(job_id)
+    download(job_id, OUT_PATH)
+    print(f"wrote {OUT_PATH} ({OUT_PATH.stat().st_size} bytes)")
diff --git a/examples/vllm-omni/video/invoke_video.sh b/examples/vllm-omni/video/invoke_video.sh
deleted file mode 100755
index 3e6c4ab36d68..000000000000
--- a/examples/vllm-omni/video/invoke_video.sh
+++ /dev/null
@@ -1,22 +0,0 @@
-#!/bin/bash
-# Video generation via /v1/videos endpoint (async — returns a job ID)
-# The /v1/videos API requires multipart/form-data.
-JOB=$(curl -sf -X POST http://localhost:8080/v1/videos \
-  -F "prompt=a dog running on a beach" \
-  -F "num_frames=17" \
-  -F "num_inference_steps=4" \
-  -F "size=480x320" \
-  -F "seed=42")
-
-JOB_ID=$(echo "$JOB" | python3 -c "import json,sys;print(json.load(sys.stdin)['id'])")
-echo "Job: $JOB_ID"
-
-# Poll until complete, then download
-while true; do
-  STATUS=$(curl -sf "http://localhost:8080/v1/videos/$JOB_ID" | python3 -c "import json,sys;print(json.load(sys.stdin)['status'])")
-  [ "$STATUS" = "succeeded" ] && break
-  [ "$STATUS" = "failed" ] && { echo "Job failed"; exit 1; }
-  sleep 5
-done
-
-curl -sf "http://localhost:8080/v1/videos/$JOB_ID/content" --output video.mp4

From 54fd2ee7bb31a17de588fc4a2edb74e2ab46a4b2 Mon Sep 17 00:00:00 2001
From: Yadan Wei <yadanwei@amazon.com>
Date: Sun, 26 Apr 2026 22:23:52 -0700
Subject: [PATCH 05/10] Switch EC2 examples back to end-to-end shell scripts

Signed-off-by: Yadan Wei <yadanwei@amazon.com>
---
 docs/vllm-omni/index.md               | 60 ++++---------------------
 examples/vllm-omni/image/inference.py | 32 -------------
 examples/vllm-omni/image/run.sh       | 22 +++++++++
 examples/vllm-omni/tts/inference.py   | 31 -------------
 examples/vllm-omni/tts/run.sh         | 22 +++++++++
 examples/vllm-omni/video/inference.py | 65 ---------------------------
 examples/vllm-omni/video/run.sh       | 36 +++++++++++++++
 7 files changed, 88 insertions(+), 180 deletions(-)
 delete mode 100644 examples/vllm-omni/image/inference.py
 create mode 100755 examples/vllm-omni/image/run.sh
 delete mode 100644 examples/vllm-omni/tts/inference.py
 create mode 100755 examples/vllm-omni/tts/run.sh
 delete mode 100644 examples/vllm-omni/video/inference.py
 create mode 100755 examples/vllm-omni/video/run.sh

diff --git a/docs/vllm-omni/index.md b/docs/vllm-omni/index.md
index 9d3c1fd1c6c7..8711d4094f91 100644
--- a/docs/vllm-omni/index.md
+++ b/docs/vllm-omni/index.md
@@ -48,73 +48,29 @@ For package versions included in each release, see the [Release Notes](../releas
 
 ## EC2 Deployment
 
-The container runs `vllm serve --omni` and exposes the OpenAI-compatible API on port 8080. Each example below picks a representative model for its
-modality — any `vllm serve` flag may be appended to `docker run` (e.g., `--tensor-parallel-size 2`, `--max-model-len 2048`, `--enforce-eager`).
-
-All three examples below use the same Python client pattern. Install the one dependency:
-
-```bash
-pip install requests
-```
+The container runs `vllm serve --omni` and exposes the OpenAI-compatible API on port 8080. Each example below is a self-contained shell script that
+starts the container, waits for readiness, submits a request, and writes the output to disk. Any `vllm serve` flag may be appended to `docker run`
+(e.g., `--tensor-parallel-size 2`, `--max-model-len 2048`, `--enforce-eager`).
 
 ### Text-to-Speech
 
-Start the server with a TTS model:
-
-```bash
-docker run -d --gpus all --shm-size=2g -p 8080:8080 \
-  {{ images.latest_vllm_omni_ec2 }} \
-  --model Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice
-
-until curl -sf http://localhost:8080/health >/dev/null; do sleep 5; done
-```
-
-Submit a request and write the returned WAV bytes to disk:
-
-```python
---8<-- "examples/vllm-omni/tts/inference.py"
-```
-
 ```bash
-python3 inference.py
-aplay out/speech.wav   # afplay on macOS
+--8<-- "examples/vllm-omni/tts/run.sh"
 ```
 
 ### Image Generation
 
-Start the server with an image-generation model:
-
 ```bash
-docker run -d --gpus all --shm-size=2g -p 8080:8080 \
-  {{ images.latest_vllm_omni_ec2 }} \
-  --model black-forest-labs/FLUX.2-klein-4B
-
-until curl -sf http://localhost:8080/health >/dev/null; do sleep 5; done
-```
-
-The response JSON contains a base64-encoded PNG in `data[0].b64_json`:
-
-```python
---8<-- "examples/vllm-omni/image/inference.py"
+--8<-- "examples/vllm-omni/image/run.sh"
 ```
 
 ### Video Generation
 
-The `/v1/videos` endpoint is asynchronous by design — it returns a job ID immediately and generates the video in the background. The request must use
-`multipart/form-data`. The client below submits the job, polls until it completes, then downloads the MP4.
-
-Start the server with a video-generation model:
+The `/v1/videos` endpoint is asynchronous — it returns a job ID immediately and generates the video in the background. The script below submits the
+job, polls until it completes, then downloads the MP4.
 
 ```bash
-docker run -d --gpus all --shm-size=8g -p 8080:8080 \
-  {{ images.latest_vllm_omni_ec2 }} \
-  --model Wan-AI/Wan2.1-T2V-1.3B-Diffusers
-
-until curl -sf http://localhost:8080/health >/dev/null; do sleep 5; done
-```
-
-```python
---8<-- "examples/vllm-omni/video/inference.py"
+--8<-- "examples/vllm-omni/video/run.sh"
 ```
 
 ### Multimodal Chat
diff --git a/examples/vllm-omni/image/inference.py b/examples/vllm-omni/image/inference.py
deleted file mode 100644
index 70d2f6ccf33a..000000000000
--- a/examples/vllm-omni/image/inference.py
+++ /dev/null
@@ -1,32 +0,0 @@
-"""End-to-end image generation example against a local vLLM-Omni server.
-
-Prereq: start the server with an image-generation model, e.g.
-  docker run -d --gpus all -p 8080:8080 <vllm-omni-image> \
-    --model black-forest-labs/FLUX.2-klein-4B
-"""
-
-import base64
-import os
-import pathlib
-
-import requests
-
-ENDPOINT = os.environ.get("OMNI_ENDPOINT", "http://localhost:8080")
-OUT_PATH = pathlib.Path("out/image.png")
-
-
-def generate(prompt: str, size: str = "512x512") -> bytes:
-    response = requests.post(
-        f"{ENDPOINT}/v1/images/generations",
-        json={"prompt": prompt, "size": size, "n": 1},
-        timeout=300,
-    )
-    response.raise_for_status()
-    return base64.b64decode(response.json()["data"][0]["b64_json"])
-
-
-if __name__ == "__main__":
-    image = generate("a red apple on a white table, studio lighting")
-    OUT_PATH.parent.mkdir(parents=True, exist_ok=True)
-    OUT_PATH.write_bytes(image)
-    print(f"wrote {OUT_PATH} ({len(image)} bytes)")
diff --git a/examples/vllm-omni/image/run.sh b/examples/vllm-omni/image/run.sh
new file mode 100755
index 000000000000..1f017f939f7c
--- /dev/null
+++ b/examples/vllm-omni/image/run.sh
@@ -0,0 +1,22 @@
+#!/usr/bin/env bash
+# End-to-end image-generation example: start server, wait for ready, generate.
+set -euo pipefail
+
+IMAGE="${IMAGE:-763104351884.dkr.ecr.us-west-2.amazonaws.com/vllm-omni:omni-cuda-v1}"
+MODEL="${MODEL:-black-forest-labs/FLUX.2-klein-4B}"
+NAME="${NAME:-omni-image}"
+
+docker run -d --name "${NAME}" --gpus all --shm-size=2g -p 8080:8080 \
+  -v "${HOME}/hf-cache:/root/.cache/huggingface" \
+  "${IMAGE}" --model "${MODEL}"
+
+until curl -sf http://localhost:8080/health >/dev/null; do sleep 5; done
+
+# Response JSON has data[0].b64_json — decode to PNG.
+curl -sf -X POST http://localhost:8080/v1/images/generations \
+  -H "Content-Type: application/json" \
+  -d '{"prompt": "a red apple on a white table, studio lighting", "size": "512x512", "n": 1}' \
+  | python3 -c "import base64,json,sys;open('image.png','wb').write(base64.b64decode(json.load(sys.stdin)['data'][0]['b64_json']))"
+
+echo "wrote image.png ($(stat -f%z image.png 2>/dev/null || stat -c%s image.png) bytes)"
+# Cleanup:  docker stop "${NAME}" && docker rm "${NAME}"
diff --git a/examples/vllm-omni/tts/inference.py b/examples/vllm-omni/tts/inference.py
deleted file mode 100644
index 3192b5071ffd..000000000000
--- a/examples/vllm-omni/tts/inference.py
+++ /dev/null
@@ -1,31 +0,0 @@
-"""End-to-end TTS example against a local vLLM-Omni server.
-
-Prereq: start the server with a TTS model, e.g.
-  docker run -d --gpus all -p 8080:8080 <vllm-omni-image> \
-    --model Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice
-"""
-
-import os
-import pathlib
-
-import requests
-
-ENDPOINT = os.environ.get("OMNI_ENDPOINT", "http://localhost:8080")
-OUT_PATH = pathlib.Path("out/speech.wav")
-
-
-def synthesize(text: str, voice: str = "vivian", language: str = "English") -> bytes:
-    response = requests.post(
-        f"{ENDPOINT}/v1/audio/speech",
-        json={"input": text, "voice": voice, "language": language},
-        timeout=300,
-    )
-    response.raise_for_status()
-    return response.content
-
-
-if __name__ == "__main__":
-    audio = synthesize("Hello from vLLM-Omni. This is a text to speech demo.")
-    OUT_PATH.parent.mkdir(parents=True, exist_ok=True)
-    OUT_PATH.write_bytes(audio)
-    print(f"wrote {OUT_PATH} ({len(audio)} bytes)")
diff --git a/examples/vllm-omni/tts/run.sh b/examples/vllm-omni/tts/run.sh
new file mode 100755
index 000000000000..cc526a23f4d6
--- /dev/null
+++ b/examples/vllm-omni/tts/run.sh
@@ -0,0 +1,22 @@
+#!/usr/bin/env bash
+# End-to-end TTS example: start server, wait for ready, synthesize speech.
+# Requires: docker (with NVIDIA runtime), curl, an authenticated ECR pull.
+set -euo pipefail
+
+IMAGE="${IMAGE:-763104351884.dkr.ecr.us-west-2.amazonaws.com/vllm-omni:omni-cuda-v1}"
+MODEL="${MODEL:-Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice}"
+NAME="${NAME:-omni-tts}"
+
+docker run -d --name "${NAME}" --gpus all --shm-size=2g -p 8080:8080 \
+  -v "${HOME}/hf-cache:/root/.cache/huggingface" \
+  "${IMAGE}" --model "${MODEL}"
+
+until curl -sf http://localhost:8080/health >/dev/null; do sleep 5; done
+
+curl -sf -X POST http://localhost:8080/v1/audio/speech \
+  -H "Content-Type: application/json" \
+  -d '{"input": "Hello from vLLM-Omni.", "voice": "vivian", "language": "English"}' \
+  --output speech.wav
+
+echo "wrote speech.wav ($(stat -f%z speech.wav 2>/dev/null || stat -c%s speech.wav) bytes)"
+# Cleanup:  docker stop "${NAME}" && docker rm "${NAME}"
diff --git a/examples/vllm-omni/video/inference.py b/examples/vllm-omni/video/inference.py
deleted file mode 100644
index 9a982fcedeb3..000000000000
--- a/examples/vllm-omni/video/inference.py
+++ /dev/null
@@ -1,65 +0,0 @@
-"""End-to-end video generation example against a local vLLM-Omni server.
-
-The /v1/videos endpoint is async — it returns a job ID immediately, and the
-video is generated in the background. This script submits the job, polls
-until it completes, then downloads the MP4.
-
-Prereq: start the server with a video-generation model, e.g.
-  docker run -d --gpus all -p 8080:8080 <vllm-omni-image> \
-    --model Wan-AI/Wan2.1-T2V-1.3B-Diffusers
-"""
-
-import os
-import pathlib
-import time
-
-import requests
-
-ENDPOINT = os.environ.get("OMNI_ENDPOINT", "http://localhost:8080")
-OUT_PATH = pathlib.Path("out/video.mp4")
-POLL_INTERVAL_S = 5
-POLL_TIMEOUT_S = 600
-
-
-def submit_job(prompt: str) -> str:
-    # /v1/videos requires multipart/form-data
-    response = requests.post(
-        f"{ENDPOINT}/v1/videos",
-        files={
-            "prompt": (None, prompt),
-            "num_frames": (None, "17"),
-            "num_inference_steps": (None, "4"),
-            "size": (None, "480x320"),
-            "seed": (None, "42"),
-        },
-        timeout=60,
-    )
-    response.raise_for_status()
-    return response.json()["id"]
-
-
-def wait_for_completion(job_id: str) -> None:
-    deadline = time.time() + POLL_TIMEOUT_S
-    while time.time() < deadline:
-        status = requests.get(f"{ENDPOINT}/v1/videos/{job_id}", timeout=30).json()["status"]
-        if status == "succeeded":
-            return
-        if status == "failed":
-            raise RuntimeError(f"Job {job_id} failed")
-        time.sleep(POLL_INTERVAL_S)
-    raise TimeoutError(f"Job {job_id} did not complete within {POLL_TIMEOUT_S}s")
-
-
-def download(job_id: str, out_path: pathlib.Path) -> None:
-    response = requests.get(f"{ENDPOINT}/v1/videos/{job_id}/content", timeout=60)
-    response.raise_for_status()
-    out_path.parent.mkdir(parents=True, exist_ok=True)
-    out_path.write_bytes(response.content)
-
-
-if __name__ == "__main__":
-    job_id = submit_job("a dog running on a beach at sunset")
-    print(f"submitted job {job_id}")
-    wait_for_completion(job_id)
-    download(job_id, OUT_PATH)
-    print(f"wrote {OUT_PATH} ({OUT_PATH.stat().st_size} bytes)")
diff --git a/examples/vllm-omni/video/run.sh b/examples/vllm-omni/video/run.sh
new file mode 100755
index 000000000000..c382ff827ea2
--- /dev/null
+++ b/examples/vllm-omni/video/run.sh
@@ -0,0 +1,36 @@
+#!/usr/bin/env bash
+# End-to-end video-generation example: start server, submit job, poll, download.
+# /v1/videos is async — it returns a job ID; the MP4 is produced in the background.
+set -euo pipefail
+
+IMAGE="${IMAGE:-763104351884.dkr.ecr.us-west-2.amazonaws.com/vllm-omni:omni-cuda-v1}"
+MODEL="${MODEL:-Wan-AI/Wan2.1-T2V-1.3B-Diffusers}"
+NAME="${NAME:-omni-video}"
+
+docker run -d --name "${NAME}" --gpus all --shm-size=8g -p 8080:8080 \
+  -v "${HOME}/hf-cache:/root/.cache/huggingface" \
+  "${IMAGE}" --model "${MODEL}"
+
+until curl -sf http://localhost:8080/health >/dev/null; do sleep 5; done
+
+# /v1/videos requires multipart/form-data.
+JOB_ID=$(curl -sf -X POST http://localhost:8080/v1/videos \
+  -F "prompt=a dog running on a beach at sunset" \
+  -F "num_frames=17" -F "num_inference_steps=4" \
+  -F "size=480x320" -F "seed=42" \
+  | python3 -c "import json,sys;print(json.load(sys.stdin)['id'])")
+
+echo "submitted job ${JOB_ID}"
+
+# Poll every 5s for up to 10 min (120 tries); stop early on success or failure.
+for _ in $(seq 1 120); do
+  STATUS=$(curl -sf "http://localhost:8080/v1/videos/${JOB_ID}" \
+    | python3 -c "import json,sys;print(json.load(sys.stdin)['status'])")
+  [ "${STATUS}" = "succeeded" ] && break
+  [ "${STATUS}" = "failed" ] && { echo "job failed"; exit 1; }
+  sleep 5
+done
+[ "${STATUS}" = "succeeded" ] || { echo "job ${JOB_ID} did not complete within 600s" >&2; exit 1; }
+curl -sf "http://localhost:8080/v1/videos/${JOB_ID}/content" --output video.mp4
+echo "wrote video.mp4 ($(stat -f%z video.mp4 2>/dev/null || stat -c%s video.mp4) bytes)"
+# Cleanup:  docker stop "${NAME}" && docker rm "${NAME}"

From 09a04f5402f35486ba6f39ccf3a5e308ea8f7050 Mon Sep 17 00:00:00 2001
From: Yadan Wei <yadanwei@amazon.com>
Date: Sun, 26 Apr 2026 22:28:33 -0700
Subject: [PATCH 06/10] Add model intros and HuggingFace links to EC2 examples

Signed-off-by: Yadan Wei <yadanwei@amazon.com>
---
 docs/vllm-omni/index.md | 50 ++++++++++++++++++++++++++++++++++++++---
 1 file changed, 47 insertions(+), 3 deletions(-)

diff --git a/docs/vllm-omni/index.md b/docs/vllm-omni/index.md
index 8711d4094f91..764d2c61e222 100644
--- a/docs/vllm-omni/index.md
+++ b/docs/vllm-omni/index.md
@@ -54,18 +54,27 @@ starts the container, waits for readiness, submits a request, and writes the out
 
 ### Text-to-Speech
 
+**Model:** [Qwen3-TTS-12Hz-1.7B-CustomVoice](https://huggingface.co/Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice) — a 1.7B-parameter Qwen3 text-to-speech
+model supporting multiple voices and languages. It runs on a single 24 GB GPU (A10G / L4).
+
 ```bash
 --8<-- "examples/vllm-omni/tts/run.sh"
 ```
 
 ### Image Generation
 
+**Model:** [FLUX.2-klein-4B](https://huggingface.co/black-forest-labs/FLUX.2-klein-4B) — a 4B-parameter rectified-flow transformer from Black Forest
+Labs that produces high-quality 512×512 images from text prompts. It runs on a single 24 GB GPU.
+
 ```bash
 --8<-- "examples/vllm-omni/image/run.sh"
 ```
 
 ### Video Generation
 
+**Model:** [Wan2.1-T2V-1.3B](https://huggingface.co/Wan-AI/Wan2.1-T2V-1.3B-Diffusers) — a 1.3B-parameter text-to-video diffusion model from the Wan
+team, generates short clips at up to 480×832 resolution. Needs a 48 GB GPU (L40S) or 2× 24 GB GPUs with `--tensor-parallel-size 2`.
+
 The `/v1/videos` endpoint is asynchronous — it returns a job ID immediately and generates the video in the background. The script below submits the
 job, polls until it completes, then downloads the MP4.
 
@@ -77,12 +86,47 @@ job, polls until it completes, then downloads the MP4.
 
 Use the standard OpenAI chat-completions API. Multimodal inputs (images, audio) are supplied as URL or base64 content parts in the message list.
 
+**Example model:** [Qwen2.5-Omni-3B](https://huggingface.co/Qwen/Qwen2.5-Omni-3B) — a 3B-parameter omni model accepting text, image, and audio inputs
+and generating text or speech outputs. Multi-stage architecture (thinker + talker + code2wav) requires **≥ 4 GPUs**: `g5.12xlarge` / `g6.12xlarge` (4×
+A10G) or `g6e.12xlarge` (4× L40S).
+
+Start the server:
+
+```bash
+docker run -d --name omni3b --gpus all --shm-size=16g -p 8080:8080 \
+  -v ~/hf-cache:/root/.cache/huggingface \
+  -e HF_HUB_ENABLE_HF_TRANSFER=1 \
+  {{ images.latest_vllm_omni_ec2 }} \
+  --model Qwen/Qwen2.5-Omni-3B \
+  --host 0.0.0.0 --port 8080 \
+  --max-model-len 16384 --dtype bfloat16
+
+until curl -sf http://localhost:8080/health >/dev/null; do sleep 10; done
+```
+
+Three things are **required** on `/v1/chat/completions` to produce clean audio from Qwen2.5-Omni:
+
+1. `"modalities": ["audio"]` — not `["text","audio"]` (that returns empty audio).
+2. `"sampling_params_list"` — a 3-element list (thinker, talker, code2wav). The image's built-in per-stage defaults produce noise; use the values from
+   the official Qwen docs.
+3. The exact Qwen system prompt.
+
+!!! warning "Omitting `sampling_params_list` returns 200 with valid WAV bytes that sound like noise — the single most common footgun."
+
+Run the included client (supports local and remote via `OMNI_ENDPOINT`):
+
+```python
+--8<-- "examples/vllm-omni/qwen2.5-omni/offline_inference.py"
+```
+
 ```bash
-curl -X POST http://localhost:8080/v1/chat/completions \
-  -H "Content-Type: application/json" \
-  -d '{"messages": [{"role": "user", "content": "Say hello in one sentence."}], "max_tokens": 64}'
+python3 offline_inference.py
+aplay out/lullaby.wav   # afplay on macOS
 ```
 
+The `/v1/audio/speech` shortcut (voices: `Chelsie`, `Ethan`) bypasses the thinker and does not apply the correct sampling params in v1.0.0, so it
+produces noisy output for Qwen2.5-Omni. Prefer `/v1/chat/completions` for this model.
+
 ## SageMaker Deployment
 
 ### Prerequisites

From 357b2613447e035ed28c6083c8ac76422ccf0a3a Mon Sep 17 00:00:00 2001
From: Yadan Wei <yadanwei@amazon.com>
Date: Sun, 26 Apr 2026 22:33:49 -0700
Subject: [PATCH 07/10] Convert Qwen2.5-Omni example to shell script

Signed-off-by: Yadan Wei <yadanwei@amazon.com>
---
 docs/vllm-omni/index.md                       | 25 +-----
 .../qwen2.5-omni/offline_inference.py         | 83 -------------------
 examples/vllm-omni/qwen2.5-omni/run.sh        | 46 ++++++++++
 3 files changed, 48 insertions(+), 106 deletions(-)
 delete mode 100644 examples/vllm-omni/qwen2.5-omni/offline_inference.py
 create mode 100755 examples/vllm-omni/qwen2.5-omni/run.sh

diff --git a/docs/vllm-omni/index.md b/docs/vllm-omni/index.md
index 764d2c61e222..9984c9d588cd 100644
--- a/docs/vllm-omni/index.md
+++ b/docs/vllm-omni/index.md
@@ -90,21 +90,7 @@ Use the standard OpenAI chat-completions API. Multimodal inputs (images, audio)
 and generating text or speech outputs. Multi-stage architecture (thinker + talker + code2wav) requires **≥ 4 GPUs**: `g5.12xlarge` / `g6.12xlarge` (4×
 A10G) or `g6e.12xlarge` (4× L40S).
 
-Start the server:
-
-```bash
-docker run -d --name omni3b --gpus all --shm-size=16g -p 8080:8080 \
-  -v ~/hf-cache:/root/.cache/huggingface \
-  -e HF_HUB_ENABLE_HF_TRANSFER=1 \
-  {{ images.latest_vllm_omni_ec2 }} \
-  --model Qwen/Qwen2.5-Omni-3B \
-  --host 0.0.0.0 --port 8080 \
-  --max-model-len 16384 --dtype bfloat16
-
-until curl -sf http://localhost:8080/health >/dev/null; do sleep 10; done
-```
-
-Three things are **required** on `/v1/chat/completions` to produce clean audio from Qwen2.5-Omni:
+Start the server, then submit a request. Three things are **required** on `/v1/chat/completions` to produce clean audio from Qwen2.5-Omni:
 
 1. `"modalities": ["audio"]` — not `["text","audio"]` (that returns empty audio).
 2. `"sampling_params_list"` — a 3-element list (thinker, talker, code2wav). The image's built-in per-stage defaults produce noise; use the values from
@@ -113,15 +99,8 @@ Three things are **required** on `/v1/chat/completions` to produce clean audio f
 
 !!! warning "Omitting `sampling_params_list` returns 200 with valid WAV bytes that sound like noise — the single most common footgun."
 
-Run the included client (supports local and remote via `OMNI_ENDPOINT`):
-
-```python
---8<-- "examples/vllm-omni/qwen2.5-omni/offline_inference.py"
-```
-
 ```bash
-python3 offline_inference.py
-aplay out/lullaby.wav   # afplay on macOS
+--8<-- "examples/vllm-omni/qwen2.5-omni/run.sh"
 ```
 
 The `/v1/audio/speech` shortcut (voices: `Chelsie`, `Ethan`) bypasses the thinker and does not apply the correct sampling params in v1.0.0, so it
diff --git a/examples/vllm-omni/qwen2.5-omni/offline_inference.py b/examples/vllm-omni/qwen2.5-omni/offline_inference.py
deleted file mode 100644
index c71d3c57d3a2..000000000000
--- a/examples/vllm-omni/qwen2.5-omni/offline_inference.py
+++ /dev/null
@@ -1,83 +0,0 @@
-"""Offline inference against a local vLLM-Omni server running Qwen2.5-Omni-3B.
-
-Assumes the server is already running on http://localhost:8080 (see the
-Qwen2.5-Omni tutorial for `docker run` instructions).
-"""
-
-import base64
-import json
-import os
-import pathlib
-
-import requests
-
-ENDPOINT = os.environ.get("OMNI_ENDPOINT", "http://localhost:8080")
-MODEL = "Qwen/Qwen2.5-Omni-3B"
-SYSTEM = (
-    "You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, "
-    "capable of perceiving auditory and visual inputs, as well as generating "
-    "text and speech."
-)
-
-# Three per-stage sampling params (thinker, talker, code2wav) are REQUIRED for
-# clean audio. The built-in defaults produce noise. Do not omit.
-SAMPLING_PARAMS_LIST = [
-    {
-        "temperature": 0.0,
-        "top_p": 1.0,
-        "top_k": -1,
-        "max_tokens": 2048,
-        "seed": 42,
-        "detokenize": True,
-        "repetition_penalty": 1.1,
-    },
-    {
-        "temperature": 0.9,
-        "top_p": 0.8,
-        "top_k": 40,
-        "max_tokens": 2048,
-        "seed": 42,
-        "detokenize": True,
-        "repetition_penalty": 1.05,
-        "stop_token_ids": [8294],
-    },
-    {
-        "temperature": 0.0,
-        "top_p": 1.0,
-        "top_k": -1,
-        "max_tokens": 2048,
-        "seed": 42,
-        "detokenize": True,
-        "repetition_penalty": 1.1,
-    },
-]
-
-
-def generate_audio(prompt: str, out_path: pathlib.Path) -> None:
-    payload = {
-        "model": MODEL,
-        "modalities": ["audio"],
-        "sampling_params_list": SAMPLING_PARAMS_LIST,
-        "messages": [
-            {"role": "system", "content": [{"type": "text", "text": SYSTEM}]},
-            {"role": "user", "content": [{"type": "text", "text": prompt}]},
-        ],
-    }
-    response = requests.post(
-        f"{ENDPOINT}/v1/chat/completions",
-        headers={"Content-Type": "application/json"},
-        data=json.dumps(payload),
-        timeout=600,
-    )
-    response.raise_for_status()
-    audio_b64 = response.json()["choices"][0]["message"]["audio"]["data"]
-    out_path.parent.mkdir(parents=True, exist_ok=True)
-    out_path.write_bytes(base64.b64decode(audio_b64))
-    print(f"wrote {out_path} ({out_path.stat().st_size} bytes)")
-
-
-if __name__ == "__main__":
-    generate_audio(
-        "Tell me a short, calming bedtime lullaby story for a 6-year-old girl.",
-        pathlib.Path("out/lullaby.wav"),
-    )
diff --git a/examples/vllm-omni/qwen2.5-omni/run.sh b/examples/vllm-omni/qwen2.5-omni/run.sh
new file mode 100755
index 000000000000..a04624bdf99d
--- /dev/null
+++ b/examples/vllm-omni/qwen2.5-omni/run.sh
@@ -0,0 +1,46 @@
+#!/usr/bin/env bash
+# End-to-end Qwen2.5-Omni-3B example: start server, wait for ready,
+# generate speech via /v1/chat/completions (needs docker, curl, jq, base64).
+# IMAGE, MODEL and NAME may be overridden from the environment.
+# REQUIRES ≥ 4 GPUs (e.g., g5.12xlarge / g6.12xlarge / g6e.12xlarge).
+# On single-GPU hosts the model's talker stage fails to load on GPU 1.
+set -euo pipefail
+
+IMAGE="${IMAGE:-763104351884.dkr.ecr.us-west-2.amazonaws.com/vllm:omni-cuda-v1}"
+MODEL="${MODEL:-Qwen/Qwen2.5-Omni-3B}"
+NAME="${NAME:-omni3b}"
+
+docker run -d --name "${NAME}" --gpus all --shm-size=16g -p 8080:8080 \
+  -v "${HOME}/hf-cache:/root/.cache/huggingface" \
+  -e HF_HUB_ENABLE_HF_TRANSFER=1 \
+  "${IMAGE}" --model "${MODEL}" \
+  --host 0.0.0.0 --port 8080 \
+  --max-model-len 16384 --dtype bfloat16
+
+# First start takes ~8 min (weight download + 3-stage load).
+until curl -sf http://localhost:8080/health >/dev/null; do sleep 10; done
+
+# Three things are REQUIRED for clean audio:
+#   1. "modalities": ["audio"]  (NOT ["text","audio"] — returns empty audio)
+#   2. "sampling_params_list"   (3-element list: thinker, talker, code2wav;
+#                                built-in defaults produce noise)
+#   3. The exact Qwen system prompt below.
+# Omitting #2 returns 200 OK with valid WAV bytes that sound like noise.
+curl -sf -X POST http://localhost:8080/v1/chat/completions \
+  -H 'Content-Type: application/json' \
+  -d '{
+    "model": "'"${MODEL}"'",
+    "modalities": ["audio"],
+    "sampling_params_list": [
+      {"temperature":0.0,"top_p":1.0,"top_k":-1,"max_tokens":2048,"seed":42,"detokenize":true,"repetition_penalty":1.1},
+      {"temperature":0.9,"top_p":0.8,"top_k":40,"max_tokens":2048,"seed":42,"detokenize":true,"repetition_penalty":1.05,"stop_token_ids":[8294]},
+      {"temperature":0.0,"top_p":1.0,"top_k":-1,"max_tokens":2048,"seed":42,"detokenize":true,"repetition_penalty":1.1}
+    ],
+    "messages": [
+      {"role":"system","content":[{"type":"text","text":"You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech."}]},
+      {"role":"user","content":[{"type":"text","text":"Tell me a short, calming bedtime lullaby story for a 6-year-old girl."}]}
+    ]
+  }' | jq -r '.choices[0].message.audio.data' | base64 -d > lullaby.wav
+
+echo "wrote lullaby.wav ($(stat -f%z lullaby.wav 2>/dev/null || stat -c%s lullaby.wav) bytes)"
+# Cleanup:  docker stop "${NAME}" && docker rm "${NAME}"

From 35e50549d89f700e6ca4218690c1457a275a93c6 Mon Sep 17 00:00:00 2001
From: Yadan Wei <yadanwei@amazon.com>
Date: Sun, 26 Apr 2026 23:21:35 -0700
Subject: [PATCH 08/10] Fix port mapping: container listens on 8000, map to
 host 8080

Signed-off-by: Yadan Wei <yadanwei@amazon.com>
---
 examples/vllm-omni/image/run.sh | 2 +-
 examples/vllm-omni/tts/run.sh   | 2 +-
 examples/vllm-omni/video/run.sh | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/examples/vllm-omni/image/run.sh b/examples/vllm-omni/image/run.sh
index 1f017f939f7c..d6dc1615ad95 100755
--- a/examples/vllm-omni/image/run.sh
+++ b/examples/vllm-omni/image/run.sh
@@ -6,7 +6,7 @@ IMAGE="${IMAGE:-763104351884.dkr.ecr.us-west-2.amazonaws.com/vllm-omni:omni-cuda
 MODEL="${MODEL:-black-forest-labs/FLUX.2-klein-4B}"
 NAME="${NAME:-omni-image}"
 
-docker run -d --name "${NAME}" --gpus all --shm-size=2g -p 8080:8080 \
+docker run -d --name "${NAME}" --gpus all --shm-size=2g -p 8080:8000 \
   -v "${HOME}/hf-cache:/root/.cache/huggingface" \
   "${IMAGE}" --model "${MODEL}"
 
diff --git a/examples/vllm-omni/tts/run.sh b/examples/vllm-omni/tts/run.sh
index cc526a23f4d6..9f4f185a2139 100755
--- a/examples/vllm-omni/tts/run.sh
+++ b/examples/vllm-omni/tts/run.sh
@@ -7,7 +7,7 @@ IMAGE="${IMAGE:-763104351884.dkr.ecr.us-west-2.amazonaws.com/vllm-omni:omni-cuda
 MODEL="${MODEL:-Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice}"
 NAME="${NAME:-omni-tts}"
 
-docker run -d --name "${NAME}" --gpus all --shm-size=2g -p 8080:8080 \
+docker run -d --name "${NAME}" --gpus all --shm-size=2g -p 8080:8000 \
   -v "${HOME}/hf-cache:/root/.cache/huggingface" \
   "${IMAGE}" --model "${MODEL}"
 
diff --git a/examples/vllm-omni/video/run.sh b/examples/vllm-omni/video/run.sh
index c382ff827ea2..2c443695a4f7 100755
--- a/examples/vllm-omni/video/run.sh
+++ b/examples/vllm-omni/video/run.sh
@@ -7,7 +7,7 @@ IMAGE="${IMAGE:-763104351884.dkr.ecr.us-west-2.amazonaws.com/vllm-omni:omni-cuda
 MODEL="${MODEL:-Wan-AI/Wan2.1-T2V-1.3B-Diffusers}"
 NAME="${NAME:-omni-video}"
 
-docker run -d --name "${NAME}" --gpus all --shm-size=8g -p 8080:8080 \
+docker run -d --name "${NAME}" --gpus all --shm-size=8g -p 8080:8000 \
   -v "${HOME}/hf-cache:/root/.cache/huggingface" \
   "${IMAGE}" --model "${MODEL}"
 

From c7ef1c14dd031b615ba34d13d4c5a28bfcc38175 Mon Sep 17 00:00:00 2001
From: Yadan Wei <yadanwei@amazon.com>
Date: Mon, 27 Apr 2026 15:25:57 -0700
Subject: [PATCH 09/10] docs(vllm-omni): correct ECR repo, image tag, and
 version labeling

- Align version labeling with Ray convention: YAML 'version' now reflects
  the embedded framework version (0.18.0) instead of a DLC release number.
- Add optional 'ecr_repository' field so the data-dir key can differ from
  the actual ECR repo name. vllm-omni images live under the 'vllm' repo,
  not 'vllm-omni'.
- Fix SageMaker image tag: 'omni-sagemaker-cuda-v1' (verified against
  763104351884.dkr.ecr.us-west-2.amazonaws.com/vllm), not the previous
  'omni-cuda-sagemaker-v1'.
- Rewrite the SageMaker async example to deploy TTS (works end-to-end)
  instead of video. The /v1/videos endpoint in 0.18.0 returns a job-ID
  JSON, which is what SageMaker async writes to S3; the MP4 itself is
  never written to S3 and cannot be retrieved via SageMaker in 0.18.0.
- Clarify Known Limitations: video generation is not supported on
  SageMaker in 0.18.0 (use EC2 for the full video workflow).
- Minor fix to EC2 video example (tensor-parallel-size 2, bumped steps,
  status value 'completed').

Signed-off-by: Yadan Wei <yadanwei@amazon.com>
---
 .../{1.0.0-gpu-ec2.yml => 0.18.0-gpu-ec2.yml} |  5 ++-
 ...sagemaker.yml => 0.18.0-gpu-sagemaker.yml} |  7 ++--
 docs/src/generate.py                          |  7 ++--
 docs/src/image_config.py                      | 15 +++++---
 docs/vllm-omni/index.md                       | 33 ++++++++++-------
 examples/vllm-omni/sagemaker/deploy_tts.py    |  2 +-
 .../vllm-omni/sagemaker/deploy_tts_async.py   | 36 +++++++++++++++++++
 .../vllm-omni/sagemaker/deploy_video_async.py | 35 ------------------
 examples/vllm-omni/video/run.sh               |  8 ++---
 9 files changed, 82 insertions(+), 66 deletions(-)
 rename docs/src/data/vllm-omni/{1.0.0-gpu-ec2.yml => 0.18.0-gpu-ec2.yml} (90%)
 rename docs/src/data/vllm-omni/{1.0.0-gpu-sagemaker.yml => 0.18.0-gpu-sagemaker.yml} (82%)
 create mode 100644 examples/vllm-omni/sagemaker/deploy_tts_async.py
 delete mode 100644 examples/vllm-omni/sagemaker/deploy_video_async.py

diff --git a/docs/src/data/vllm-omni/1.0.0-gpu-ec2.yml b/docs/src/data/vllm-omni/0.18.0-gpu-ec2.yml
similarity index 90%
rename from docs/src/data/vllm-omni/1.0.0-gpu-ec2.yml
rename to docs/src/data/vllm-omni/0.18.0-gpu-ec2.yml
index 1d5aa65228f6..ada5c1dec88b 100644
--- a/docs/src/data/vllm-omni/1.0.0-gpu-ec2.yml
+++ b/docs/src/data/vllm-omni/0.18.0-gpu-ec2.yml
@@ -1,5 +1,6 @@
 framework: vLLM-Omni
-version: "1.0.0"
+version: "0.18.0"
+ecr_repository: vllm
 accelerator: gpu
 python: py312
 cuda: cu129
@@ -8,8 +9,6 @@ platform: default
 public_registry: true
 
 tags:
-  - "omni-cuda-v1.0.0"
-  - "omni-cuda-v1.0"
   - "omni-cuda-v1"
 
 announcements:
diff --git a/docs/src/data/vllm-omni/1.0.0-gpu-sagemaker.yml b/docs/src/data/vllm-omni/0.18.0-gpu-sagemaker.yml
similarity index 82%
rename from docs/src/data/vllm-omni/1.0.0-gpu-sagemaker.yml
rename to docs/src/data/vllm-omni/0.18.0-gpu-sagemaker.yml
index 588fb7e8fd01..252e3552ead2 100644
--- a/docs/src/data/vllm-omni/1.0.0-gpu-sagemaker.yml
+++ b/docs/src/data/vllm-omni/0.18.0-gpu-sagemaker.yml
@@ -1,5 +1,6 @@
 framework: vLLM-Omni
-version: "1.0.0"
+version: "0.18.0"
+ecr_repository: vllm
 accelerator: gpu
 python: py312
 cuda: cu129
@@ -8,9 +9,7 @@ platform: sagemaker
 public_registry: true
 
 tags:
-  - "omni-cuda-sagemaker-v1.0.0"
-  - "omni-cuda-sagemaker-v1.0"
-  - "omni-cuda-sagemaker-v1"
+  - "omni-sagemaker-cuda-v1"
 
 announcements:
   - "Initial release of vLLM-Omni containers for SageMaker"
diff --git a/docs/src/generate.py b/docs/src/generate.py
index 6189cbc5b926..43dca381d322 100644
--- a/docs/src/generate.py
+++ b/docs/src/generate.py
@@ -376,9 +376,12 @@ def generate_available_images(dry_run: bool = False) -> str:
 
         section = f"{AVAILABLE_IMAGES_TABLE_HEADER} {display_name}\n"
         if has_public_registry:
-            url = f"{PUBLIC_GALLERY_URL}/{repository}"
+            # Use ecr_repository from images (falls back to data-dir key when unset) so display
+            # reflects the actual ECR repo when the data-dir key differs (e.g., vllm-omni -> vllm).
+            ecr_repo = images[0].ecr_repository if images else repository
+            url = f"{PUBLIC_GALLERY_URL}/{ecr_repo}"
             section += (
-                f"\nThese images are also available in ECR Public Gallery: [{repository}]({url})\n"
+                f"\nThese images are also available in ECR Public Gallery: [{ecr_repo}]({url})\n"
             )
         if table_config.get("note"):
             section += f"\n{table_config['note']}\n"
diff --git a/docs/src/image_config.py b/docs/src/image_config.py
index f5c02e52837a..cc36a15a4572 100644
--- a/docs/src/image_config.py
+++ b/docs/src/image_config.py
@@ -45,6 +45,13 @@ def repository(self) -> str:
         """Repository name for this image."""
         return self._repository
 
+    @property
+    def ecr_repository(self) -> str:
+        """ECR repository name for image URIs. Defaults to repository, but can be overridden
+        via the optional 'ecr_repository' YAML field when the data-directory key differs from
+        the actual ECR repo name (e.g., data dir 'vllm-omni' -> ECR repo 'vllm')."""
+        return self._data.get("ecr_repository") or self._repository
+
     @property
     def framework_group(self) -> str:
         """Framework group key (or repository if not in a group)."""
@@ -91,11 +98,11 @@ def get_image_uris(self) -> list[str]:
 
         uris = []
         for tag in tags:
-            uris.append(build_ecr_uri(account, self._repository, tag, region))
+            uris.append(build_ecr_uri(account, self.ecr_repository, tag, region))
 
         if self.get("public_registry"):
             for tag in tags:
-                uris.append(build_public_ecr_uri(self._repository, tag))
+                uris.append(build_public_ecr_uri(self.ecr_repository, tag))
 
         return uris
 
@@ -126,7 +133,7 @@ def display_framework_version(self) -> str:
     def display_example_url(self) -> str:
         """Example ECR URL for table display."""
         account = self.get("example_ecr_account", GLOBAL_CONFIG["example_ecr_account"])
-        return f"`{build_ecr_uri(account, self._repository, self.display_tag)}`"
+        return f"`{build_ecr_uri(account, self.ecr_repository, self.display_tag)}`"
 
     @property
     def display_platform(self) -> str:
@@ -277,4 +284,4 @@ def get_latest_image_uri(repo: str, platform: str) -> str:
 
     latest = sort_by_version(matching)[0]
     account = latest.get("example_ecr_account", GLOBAL_CONFIG["example_ecr_account"])
-    return build_ecr_uri(account, repo, latest.display_tag, "us-west-2")
+    return build_ecr_uri(account, latest.ecr_repository, latest.display_tag, "us-west-2")
diff --git a/docs/vllm-omni/index.md b/docs/vllm-omni/index.md
index 9984c9d588cd..ba7e5ffb4689 100644
--- a/docs/vllm-omni/index.md
+++ b/docs/vllm-omni/index.md
@@ -5,8 +5,8 @@ Pre-built Docker images for serving omni-modality models (text-to-speech, image
 
 ## Latest Announcements
 
-**vLLM-Omni 1.0.0** — Initial release. Serves TTS, image, video, and omni-chat models through OpenAI-compatible APIs. Includes a SageMaker routing
-middleware for dispatching `/invocations` to any omni endpoint via `CustomAttributes`.
+**April 24, 2026** — vLLM-Omni 0.18.0 initial release. Serves TTS, image, video, and omni-chat models through OpenAI-compatible APIs. Includes a
+SageMaker routing middleware for dispatching `/invocations` to any omni endpoint via `CustomAttributes`.
 
 ## Pull Commands
 
@@ -41,8 +41,10 @@ For package versions included in each release, see the [Release Notes](../releas
 ## Model Compatibility
 
 - Models must have a standard HuggingFace `config.json` with a recognized `model_type`, or be diffusers pipeline models with `model_index.json`.
-- Models requiring `--stage-configs-path` (e.g., CosyVoice3, Fish Speech) are not supported in v1.0.0 — the engine subprocess cannot resolve custom
-  model types.
+- Some HuggingFace repos ship a `config.json` without a `model_type` field; vllm-omni's config resolver will reject these. Patching the local snapshot
+  with a minimal `config.json` (`{"model_type": "...", "architectures": ["..."]}`) is a common workaround, but the container's pinned `transformers`
+  version must also register the model type — models newer than that pin will fail at engine startup. Upgrading `transformers` in-place risks breaking
+  the supported models; wait for a future vllm-omni release with an updated pin.
 - Multi-stage omni models (thinker + talker + decoder) like Qwen2.5-Omni need significantly more VRAM than the model size suggests. Refer to the
   individual model cards for minimum GPU requirements.
 
@@ -103,7 +105,7 @@ Start the server, then submit a request. Three things are **required** on `/v1/c
 --8<-- "examples/vllm-omni/qwen2.5-omni/run.sh"
 ```
 
-The `/v1/audio/speech` shortcut (voices: `Chelsie`, `Ethan`) bypasses the thinker and does not apply the correct sampling params in v1.0.0, so it
+The `/v1/audio/speech` shortcut (voices: `Chelsie`, `Ethan`) bypasses the thinker and does not apply the correct sampling params in 0.18.0, so it
 produces noisy output for Qwen2.5-Omni. Prefer `/v1/chat/completions` for this model.
 
 ## SageMaker Deployment
@@ -127,7 +129,7 @@ header:
 | --- | --- |
 | `route=/v1/audio/speech` | TTS |
 | `route=/v1/images/generations` | Image generation |
-| `route=/v1/videos` | Video generation (JSON auto-converted to form-data) |
+| `route=/v1/videos` | Video generation (JSON auto-converted to form-data) — returns job-ID only in 0.18.0, MP4 not retrievable via SageMaker |
 | `route=/v1/chat/completions` | Multimodal chat |
 | *(no route)* | vLLM default `/invocations` (chat/completion/embed) |
 
@@ -160,23 +162,28 @@ When done, delete the endpoint:
 predictor.delete_endpoint()
 ```
 
-### Async Inference for Video and Long-Running Generation
+### Async Inference for Long-Running TTS Generation
 
 SageMaker real-time inference has a 60-second timeout. First requests to TTS models may exceed this due to `torch.compile` warmup (~67s); async
 inference avoids the limit, as does retrying after warmup completes.
 
-For `/v1/videos`, async inference is required because the endpoint returns a job ID rather than the final MP4. The MP4 must be retrieved by polling
-the container directly — SageMaker async inference only captures the initial JSON response.
+!!! warning "Video generation is not supported on SageMaker in 0.18.0 — see [Known Limitations](#known-limitations) below. Use EC2 for video."
 
 ```python
---8<-- "examples/vllm-omni/sagemaker/deploy_video_async.py"
+--8<-- "examples/vllm-omni/sagemaker/deploy_tts_async.py"
 ```
 
+For async inference, upload the JSON input payload to S3 first, then call `invoke_endpoint_async` with `InputLocation=<s3-uri>` and
+`CustomAttributes="route=/v1/audio/speech"`. The resulting `.out` object in the configured S3 output path is the raw WAV audio — no polling or
+additional retrieval step required.
+
 ## Known Limitations
 
-- **Video generation on SageMaker returns a job ID only.** The `/v1/videos` endpoint in v1.0.0 is async by design and `POST /v1/videos/sync` (which
-  blocks and returns raw MP4 bytes) is not available. Direct container access (EC2) supports the full video workflow — create job, poll status,
-  download MP4. A sync endpoint has been added in newer vllm-omni versions and will be supported in a future release.
+- **Video generation is not supported on SageMaker in 0.18.0.** The `/v1/videos` endpoint is async by design — it returns a job-ID JSON immediately
+  and generates the MP4 in the background. Through SageMaker async inference, only that job-ID JSON is written to S3; the MP4 itself never lands in S3
+  and cannot be retrieved through `invoke_endpoint` or `invoke_endpoint_async`. Use EC2 for video generation — direct container access supports the
+  full workflow (create job, poll status, download MP4). SageMaker support is expected once `POST /v1/videos/sync` (which blocks and returns raw MP4
+  bytes) is available in a future vllm-omni release.
 - **First-request latency on SageMaker real-time endpoints.** TTS models can exceed the 60s invoke timeout on the first request due to `torch.compile`
   warmup. Use async inference or retry after warmup.
 
diff --git a/examples/vllm-omni/sagemaker/deploy_tts.py b/examples/vllm-omni/sagemaker/deploy_tts.py
index a4e2d8a91a9a..a701bc90548e 100644
--- a/examples/vllm-omni/sagemaker/deploy_tts.py
+++ b/examples/vllm-omni/sagemaker/deploy_tts.py
@@ -5,7 +5,7 @@
 from sagemaker.serializers import JSONSerializer
 
 model = Model(
-    image_uri="763104351884.dkr.ecr.us-west-2.amazonaws.com/vllm-omni:omni-cuda-sagemaker-v1.0.0",
+    image_uri="763104351884.dkr.ecr.us-west-2.amazonaws.com/vllm:omni-sagemaker-cuda-v1",
     role="arn:aws:iam::<ACCOUNT>:role/SageMakerExecutionRole",
     env={"SM_VLLM_MODEL": "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice"},
     predictor_cls=Predictor,
diff --git a/examples/vllm-omni/sagemaker/deploy_tts_async.py b/examples/vllm-omni/sagemaker/deploy_tts_async.py
new file mode 100644
index 000000000000..9c793f33d5b2
--- /dev/null
+++ b/examples/vllm-omni/sagemaker/deploy_tts_async.py
@@ -0,0 +1,36 @@
+"""Deploy a vLLM-Omni TTS model to a SageMaker async inference endpoint.
+
+Async inference avoids the 60-second real-time invoke timeout, which the first
+TTS request can exceed due to torch.compile warmup (~67s). The /v1/audio/speech
+endpoint returns raw WAV bytes, so the async output written to S3 is the usable
+audio file — no polling or extra retrieval step needed.
+"""
+
+from sagemaker.async_inference import AsyncInferenceConfig
+from sagemaker.model import Model
+from sagemaker.predictor import Predictor
+from sagemaker.serializers import JSONSerializer
+
+model = Model(
+    image_uri="763104351884.dkr.ecr.us-west-2.amazonaws.com/vllm:omni-sagemaker-cuda-v1",
+    role="arn:aws:iam::<ACCOUNT>:role/SageMakerExecutionRole",
+    env={"SM_VLLM_MODEL": "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice"},
+    predictor_cls=Predictor,
+)
+
+predictor = model.deploy(
+    instance_type="ml.g5.xlarge",
+    initial_instance_count=1,
+    endpoint_name="vllm-omni-tts-async",
+    inference_ami_version="al2-ami-sagemaker-inference-gpu-3-1",
+    serializer=JSONSerializer(),
+    async_inference_config=AsyncInferenceConfig(
+        output_path="s3://<BUCKET>/vllm-omni-async-output/",
+        max_concurrent_invocations_per_instance=1,
+    ),
+    wait=True,
+)
+
+# Invoke async — upload the JSON input to S3, then call invoke_endpoint_async.
+# The resulting .out object in S3 is the raw WAV audio bytes (content-type audio/wav).
+# Use CustomAttributes to route /invocations → /v1/audio/speech.
diff --git a/examples/vllm-omni/sagemaker/deploy_video_async.py b/examples/vllm-omni/sagemaker/deploy_video_async.py
deleted file mode 100644
index d1ac7c807354..000000000000
--- a/examples/vllm-omni/sagemaker/deploy_video_async.py
+++ /dev/null
@@ -1,35 +0,0 @@
-"""Deploy a vLLM-Omni video model to a SageMaker async inference endpoint.
-
-Video generation is async by design — /v1/videos returns a job ID immediately,
-so only the job metadata JSON is written to S3, not the MP4 file. To retrieve
-the MP4, poll /v1/videos/<id>/content directly against the endpoint.
-"""
-
-from sagemaker.async_inference import AsyncInferenceConfig
-from sagemaker.model import Model
-from sagemaker.predictor import Predictor
-from sagemaker.serializers import JSONSerializer
-
-model = Model(
-    image_uri="763104351884.dkr.ecr.us-west-2.amazonaws.com/vllm-omni:omni-cuda-sagemaker-v1.0.0",
-    role="arn:aws:iam::<ACCOUNT>:role/SageMakerExecutionRole",
-    env={"SM_VLLM_MODEL": "Wan-AI/Wan2.1-T2V-1.3B-Diffusers"},
-    predictor_cls=Predictor,
-)
-
-predictor = model.deploy(
-    instance_type="ml.g6e.xlarge",
-    initial_instance_count=1,
-    endpoint_name="vllm-omni-video-async",
-    inference_ami_version="al2-ami-sagemaker-inference-gpu-3-1",
-    serializer=JSONSerializer(),
-    async_inference_config=AsyncInferenceConfig(
-        output_path="s3://<BUCKET>/vllm-omni-async-output/",
-        max_concurrent_invocations_per_instance=1,
-    ),
-    wait=True,
-)
-
-# The middleware converts the JSON payload to multipart/form-data for /v1/videos.
-# Response contains the job ID; use the /v1/videos/<id>/content endpoint to
-# retrieve the MP4 bytes directly from the container.
diff --git a/examples/vllm-omni/video/run.sh b/examples/vllm-omni/video/run.sh
index 2c443695a4f7..36db972d82f3 100755
--- a/examples/vllm-omni/video/run.sh
+++ b/examples/vllm-omni/video/run.sh
@@ -9,24 +9,24 @@ NAME="${NAME:-omni-video}"
 
 docker run -d --name "${NAME}" --gpus all --shm-size=8g -p 8080:8000 \
   -v "${HOME}/hf-cache:/root/.cache/huggingface" \
-  "${IMAGE}" --model "${MODEL}"
+  "${IMAGE}" --model "${MODEL}" --tensor-parallel-size 2
 
 until curl -sf http://localhost:8080/health >/dev/null; do sleep 5; done
 
 # /v1/videos requires multipart/form-data.
 JOB_ID=$(curl -sf -X POST http://localhost:8080/v1/videos \
   -F "prompt=a dog running on a beach at sunset" \
-  -F "num_frames=17" -F "num_inference_steps=4" \
+  -F "num_frames=17" -F "num_inference_steps=30" \
   -F "size=480x320" -F "seed=42" \
   | python3 -c "import json,sys;print(json.load(sys.stdin)['id'])")
 
 echo "submitted job ${JOB_ID}"
 
-# Poll until succeeded (5s interval, 10 min timeout).
+# Poll until completed (5s interval, 10 min timeout).
 for _ in $(seq 1 120); do
   STATUS=$(curl -sf "http://localhost:8080/v1/videos/${JOB_ID}" \
     | python3 -c "import json,sys;print(json.load(sys.stdin)['status'])")
-  [ "${STATUS}" = "succeeded" ] && break
+  [ "${STATUS}" = "completed" ] && break
   [ "${STATUS}" = "failed" ] && { echo "job failed"; exit 1; }
   sleep 5
 done

From d9fd989133ceac0fa902e85975d94e23b764acd8 Mon Sep 17 00:00:00 2001
From: Yadan Wei <yadanwei@amazon.com>
Date: Mon, 27 Apr 2026 15:41:04 -0700
Subject: [PATCH 10/10] docs(vllm-omni): use hyphenated package key matching
 PyPI name

The vllm-omni package on PyPI is named with a hyphen (pip install vllm-omni),
not an underscore. Align the YAML package key with the PyPI project name and
drop the redundant underscore display_names entry in global.yml.

Signed-off-by: Yadan Wei <yadanwei@amazon.com>
---
 docs/src/data/vllm-omni/0.18.0-gpu-ec2.yml       | 2 +-
 docs/src/data/vllm-omni/0.18.0-gpu-sagemaker.yml | 2 +-
 docs/src/global.yml                              | 6 +++++-
 3 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/docs/src/data/vllm-omni/0.18.0-gpu-ec2.yml b/docs/src/data/vllm-omni/0.18.0-gpu-ec2.yml
index ada5c1dec88b..a6bc7ec8b859 100644
--- a/docs/src/data/vllm-omni/0.18.0-gpu-ec2.yml
+++ b/docs/src/data/vllm-omni/0.18.0-gpu-ec2.yml
@@ -18,7 +18,7 @@ announcements:
 
 packages:
   vllm: "0.18.0"
-  vllm_omni: "0.18.0"
+  vllm-omni: "0.18.0"
   pytorch: "2.10.0"
   torchvision: "0.25.0"
   torchaudio: "2.10.0"
diff --git a/docs/src/data/vllm-omni/0.18.0-gpu-sagemaker.yml b/docs/src/data/vllm-omni/0.18.0-gpu-sagemaker.yml
index 252e3552ead2..bb61f8a78299 100644
--- a/docs/src/data/vllm-omni/0.18.0-gpu-sagemaker.yml
+++ b/docs/src/data/vllm-omni/0.18.0-gpu-sagemaker.yml
@@ -18,7 +18,7 @@ announcements:
 
 packages:
   vllm: "0.18.0"
-  vllm_omni: "0.18.0"
+  vllm-omni: "0.18.0"
   pytorch: "2.10.0"
   torchvision: "0.25.0"
   torchaudio: "2.10.0"
diff --git a/docs/src/global.yml b/docs/src/global.yml
index fb7ed4c95d23..e76cde854d3d 100644
--- a/docs/src/global.yml
+++ b/docs/src/global.yml
@@ -67,7 +67,6 @@ display_names:
   sagemaker-xgboost: "XGBoost"
   sglang: "SGLang"
   vllm: "vLLM"
-  vllm_omni: "vLLM-Omni"
   vllm-arm64: "vLLM ARM64"
   vllm-omni: "vLLM-Omni"
   pytorch-training: "PyTorch Training"
@@ -101,6 +100,11 @@ display_names:
   known_issues: "Known Issues"
 
   # Packages
+  # Package keys use the same string as the YAML `packages:` field. Some of
+  # them coincide with repository keys from the section above: `vllm-omni`
+  # (hyphenated, matching the PyPI project name) is both the data-dir/repo
+  # key and the package key, so its single "vLLM-Omni" entry above also
+  # labels the package in release notes package tables.
   python: "Python"
   cuda: "CUDA"
   cudnn: "cuDNN"