diff --git a/README.md b/README.md
index 8f9bae83c8d7..4d7579cdecb6 100644
--- a/README.md
+++ b/README.md
@@ -28,11 +28,13 @@ ______________________________________________________________________
 
 ### 🚀 Release Highlights
 
+- **[2026/05/13]** [vLLM-Omni v0.20.0](https://gallery.ecr.aws/deep-learning-containers/vllm) — EC2: `omni-cuda-v1.1` · SageMaker: `omni-sagemaker-cuda-v1.1` · Adds `/v1/audio/generate` (stable-audio-open) and `/v1/videos/sync` (unblocks video on SageMaker); supports CosyVoice3, ERNIE-Image-Turbo, Wan2.1-VACE-1.3B; CUDA 13.0 + PyTorch 2.11.0.
 - **[2026/05/11]** [vLLM v0.20.2](https://gallery.ecr.aws/deep-learning-containers/vllm) — EC2: `0.20.2-gpu-py312-ec2` · SageMaker: `0.20.2-gpu-py312` · Bug fixes for DeepSeek V4.
 - **[2026/05/06]** [SGLang v0.5.11](https://gallery.ecr.aws/deep-learning-containers/sglang) — EC2: `0.5.11-gpu-py312-ec2` · SageMaker: `0.5.11-gpu-py312` · Model support for Gemma 4, GLM-5.1, Qwen 3.4, and more
 - **[2026/05/05]** [vLLM v0.20.1](https://gallery.ecr.aws/deep-learning-containers/vllm) — EC2: `0.20.1-gpu-py312-ec2` · SageMaker: `0.20.1-gpu-py312` · Bug fixes for DeepSeek V4.
 - **[2026/04/30]** [PyTorch v2.11.0](https://gallery.ecr.aws/deep-learning-containers/pytorch) — EC2: `2.11.0-cu130-amzn2023` · SageMaker: `2.11.0-cu130-amzn2023-sagemaker` · Amazon Linux 2023 with EFA, flash-attn, and transformer-engine.
 - **[2026/04/28]** [vLLM v0.20.0](https://gallery.ecr.aws/deep-learning-containers/vllm) — EC2: `0.20.0-gpu-py312-ec2` · SageMaker: `0.20.0-gpu-py312` · Introduces support for DeepSeek V4.
+- **[2026/04/24]** [vLLM-Omni v0.18.0](https://gallery.ecr.aws/deep-learning-containers/vllm) — EC2: `omni-cuda-v1.0` · SageMaker: `omni-sagemaker-cuda-v1.0` · Initial release. Serves omni-modality models (TTS, image, video, multimodal chat) through OpenAI-compatible APIs; SageMaker routing middleware via `CustomAttributes`.
 - **[2026/04/20]** [vLLM v0.19.1](https://gallery.ecr.aws/deep-learning-containers/vllm) — EC2: `0.19-gpu-py312-ec2` · SageMaker: `0.19-gpu-py312` · This upgrades Transformers to 5.5.4, enabling Gemma 4 support.
 
 ### 📢 Support Updates
diff --git a/docs/src/data/vllm-omni/0.18.0-gpu-ec2.yml b/docs/src/data/vllm-omni/0.18.0-gpu-ec2.yml
index a6bc7ec8b859..696881d9ad22 100644
--- a/docs/src/data/vllm-omni/0.18.0-gpu-ec2.yml
+++ b/docs/src/data/vllm-omni/0.18.0-gpu-ec2.yml
@@ -9,7 +9,7 @@ platform: default
 public_registry: true
 
 tags:
-  - "omni-cuda-v1"
+  - "omni-cuda-v1.0"
 
 announcements:
   - "Initial release of vLLM-Omni containers for EC2, ECS, EKS"
diff --git a/docs/src/data/vllm-omni/0.18.0-gpu-sagemaker.yml b/docs/src/data/vllm-omni/0.18.0-gpu-sagemaker.yml
index bb61f8a78299..9953790bf81f 100644
--- a/docs/src/data/vllm-omni/0.18.0-gpu-sagemaker.yml
+++ b/docs/src/data/vllm-omni/0.18.0-gpu-sagemaker.yml
@@ -9,7 +9,7 @@ platform: sagemaker
 public_registry: true
 
 tags:
-  - "omni-sagemaker-cuda-v1"
+  - "omni-sagemaker-cuda-v1.0"
 
 announcements:
   - "Initial release of vLLM-Omni containers for SageMaker"
diff --git a/docs/src/data/vllm-omni/0.20.0-gpu-ec2.yml b/docs/src/data/vllm-omni/0.20.0-gpu-ec2.yml
new file mode 100644
index 000000000000..2f5827d20928
--- /dev/null
+++ b/docs/src/data/vllm-omni/0.20.0-gpu-ec2.yml
@@ -0,0 +1,35 @@
+framework: vLLM-Omni
+version: "0.20.0"
+ecr_repository: vllm
+accelerator: gpu
+python: py312
+cuda: cu130
+os: amzn2023
+platform: default
+public_registry: true
+
+tags:
+  # Only this release's patch-floating tag is listed. The minor-floating
+  # `omni-cuda-v1` tag is documented in docs/vllm-omni/index.md (Pin a Version)
+  # but is deliberately left out here: it is not a per-release identifier, it
+  # points at whichever release is currently the v1-line target. Because each
+  # release yaml lists only its own patch-floating tag, nothing in these files
+  # goes stale when the v1 floater advances; no yaml edits are needed.
+  - "omni-cuda-v1.1"    # floats across DLC patches in the v1.1 line (auto-accepts security patches)
+
+announcements:
+  - "Bumps vLLM-Omni to 0.20.0 and aligns with upstream vLLM v0.20.0"
+  - "CUDA 12.9 → 13.0 base image; PyTorch 2.10.0 → 2.11.0"
+  - "New `/v1/audio/generate` endpoint for diffusion-based audio generation (e.g., stable-audio-open)"
+  - "New `/v1/videos/sync` endpoint — blocking variant of `/v1/videos` that returns the MP4 directly"
+  - "Adds support for CosyVoice3, ERNIE-Image-Turbo, Wan2.1-VACE-1.3B, and Stable-Audio-Open-1.0"
+
+packages:
+  vllm: "0.20.0"
+  vllm-omni: "0.20.0"
+  pytorch: "2.11.0"
+  torchvision: "0.26.0"
+  torchaudio: "2.11.0"
+  cuda: "13.0.2"
+  flashinfer: "0.6.8.post1"
+  efa: "1.47.0"
diff --git a/docs/src/data/vllm-omni/0.20.0-gpu-sagemaker.yml b/docs/src/data/vllm-omni/0.20.0-gpu-sagemaker.yml
new file mode 100644
index 000000000000..7511478b14b6
--- /dev/null
+++ b/docs/src/data/vllm-omni/0.20.0-gpu-sagemaker.yml
@@ -0,0 +1,35 @@
+framework: vLLM-Omni
+version: "0.20.0"
+ecr_repository: vllm
+accelerator: gpu
+python: py312
+cuda: cu130
+os: amzn2023
+platform: sagemaker
+public_registry: true
+
+tags:
+  # Only this release's patch-floating tag is listed. The minor-floating
+  # `omni-sagemaker-cuda-v1` tag is documented in docs/vllm-omni/index.md
+  # (Pin a Version) but is deliberately left out here: it is not a per-release
+  # identifier, it points at whichever release is currently the v1-line target.
+  # Because each release yaml lists only its own patch-floating tag, nothing in
+  # these files goes stale when the v1 floater advances; no yaml edits are needed.
+  - "omni-sagemaker-cuda-v1.1"    # floats across DLC patches in the v1.1 line (auto-accepts security patches)
+
+announcements:
+  - "Bumps vLLM-Omni to 0.20.0 and aligns with upstream vLLM v0.20.0"
+  - "CUDA 12.9 → 13.0 base image; PyTorch 2.10.0 → 2.11.0"
+  - "Video generation now supported on SageMaker via the new `/v1/videos/sync` endpoint"
+  - "Adds `/v1/audio/generate` and `/v1/videos/sync` to the routing middleware"
+  - "Adds support for CosyVoice3, ERNIE-Image-Turbo, Wan2.1-VACE-1.3B, and Stable-Audio-Open-1.0"
+
+packages:
+  vllm: "0.20.0"
+  vllm-omni: "0.20.0"
+  pytorch: "2.11.0"
+  torchvision: "0.26.0"
+  torchaudio: "2.11.0"
+  cuda: "13.0.2"
+  flashinfer: "0.6.8.post1"
+  efa: "1.47.0"
diff --git a/docs/vllm-omni/index.md b/docs/vllm-omni/index.md
index ba7e5ffb4689..d4cdbab3f83f 100644
--- a/docs/vllm-omni/index.md
+++ b/docs/vllm-omni/index.md
@@ -1,29 +1,69 @@
 # vLLM-Omni Inference
 
-Pre-built Docker images for serving omni-modality models (text-to-speech, image generation, video generation, and multimodal chat) with
-[vLLM-Omni](https://github.com/vllm-project/vllm-omni). Built on Amazon Linux 2023 with CUDA 12.9 and Python 3.12.
+Pre-built Docker images for serving omni-modality models (text-to-speech, audio generation, image generation, video generation, and multimodal chat)
+with [vLLM-Omni](https://github.com/vllm-project/vllm-omni). Built on Amazon Linux 2023 with CUDA 13.0 and Python 3.12.
 
 ## Latest Announcements
 
+**May 12, 2026** — vLLM-Omni 0.20.0 release. Aligns with upstream vLLM v0.20.0; bumps CUDA to 13.0 and PyTorch to 2.11.0. Adds two new endpoints:
+`/v1/audio/generate` for diffusion-based audio generation (e.g., stable-audio-open) and `/v1/videos/sync` — a blocking variant of `/v1/videos` that
+returns the MP4 directly and unblocks video generation on SageMaker. New supported models: CosyVoice3, ERNIE-Image-Turbo, Wan2.1-VACE-1.3B,
+Stable-Audio-Open-1.0.
+
 **April 24, 2026** — vLLM-Omni 0.18.0 initial release. Serves TTS, image, video, and omni-chat models through OpenAI-compatible APIs. Includes a
 SageMaker routing middleware for dispatching `/invocations` to any omni endpoint via `CustomAttributes`.
 
 ## Pull Commands
 
-**EC2:**
+Images are published to both the public ECR gallery (no AWS credentials required) and the private DLC ECR repository (requires
+`aws ecr get-login-password`, see [Getting Started](../get_started/index.md)).
+
+**Multimodal (TTS, image/video/audio generation, omni chat) on EC2 / EKS:**
 
 ```bash
-docker pull {{ images.latest_vllm_omni_ec2 }}
+# Public ECR (anonymous pull):
+docker pull public.ecr.aws/deep-learning-containers/vllm:omni-cuda
+
+# Private ECR (authenticated; substitute your region):
+docker pull 763104351884.dkr.ecr.<region>.amazonaws.com/vllm:omni-cuda
 ```
 
-**SageMaker:**
+**Multimodal on Amazon SageMaker AI:**
+
+```bash
+# Public ECR (anonymous pull):
+docker pull public.ecr.aws/deep-learning-containers/vllm:omni-sagemaker-cuda
+
+# Private ECR (authenticated; substitute your region):
+docker pull 763104351884.dkr.ecr.<region>.amazonaws.com/vllm:omni-sagemaker-cuda
+```
+
+See [Available Images](../reference/available_images.md) for the full per-region URI table.
+
+## Pin a Version
+
+Append a version suffix to the base tag to control update behavior:
+
+| Suffix | Example | Updates when |
+| --- | --- | --- |
+| (none) | `omni-cuda` | Any release, including breaking changes |
+| `-v<MAJOR>` | `omni-cuda-v1` | New features and fixes, no breaking changes |
+| `-v<MAJOR>.<MINOR>` | `omni-cuda-v1.1` | Security patches and bug fixes only |
+| `-v<MAJOR>.<MINOR>.<PATCH>` | `omni-cuda-v1.1.0` | Never — immutable snapshot |
+
+The same suffixes apply to the SageMaker base tag (`omni-sagemaker-cuda`).
+
+**Recommended for production:** pin to `-v<MAJOR>.<MINOR>` (e.g., `omni-cuda-v1.1`). It auto-accepts security patches and bug fixes within the
+0.20-line release while declining new minor releases that could change behavior — for example, customers pinned to `omni-cuda-v1.0` were insulated
+from the Code2Wav un-batching regression that landed with the v1.1 minor bump (see [Known Limitations](#known-limitations)) until ready to evaluate it.
+
+For byte-identical reproducibility, pull by digest:
 
 ```bash
-docker pull {{ images.latest_vllm_omni_sagemaker }}
+docker pull public.ecr.aws/deep-learning-containers/vllm@sha256:<digest>
 ```
 
-See [Available Images](../reference/available_images.md) for all image URIs and [Getting Started](../get_started/index.md) for authentication
-instructions.
+`docker inspect <image>` shows the digest of the image you have under `RepoDigests`. An image pulled by digest never changes.
 
 ## Packages
 
@@ -31,11 +71,13 @@ For package versions included in each release, see the [Release Notes](../releas
 
 ## Supported Modalities
 
-| Modality | Route | Example Model |
+| Modality | Route | Example Models |
 | --- | --- | --- |
-| Text-to-Speech | `/v1/audio/speech` | `Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice` |
-| Image Generation | `/v1/images/generations` | `black-forest-labs/FLUX.2-klein-4B` |
-| Video Generation | `/v1/videos` | `Wan-AI/Wan2.1-T2V-1.3B-Diffusers` |
+| Text-to-Speech | `/v1/audio/speech` | `Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice`, `Qwen/Qwen3-TTS-12Hz-1.7B-Base`, `FunAudioLLM/CosyVoice3-0.5B` |
+| Audio Generation | `/v1/audio/generate` (new in 0.20.0) | `stabilityai/stable-audio-open-1.0` |
+| Image Generation | `/v1/images/generations` | `black-forest-labs/FLUX.2-klein-4B`, `baidu/ERNIE-Image-Turbo` |
+| Video Generation (async) | `/v1/videos` | `Wan-AI/Wan2.1-T2V-1.3B-Diffusers`, `Wan-AI/Wan2.1-VACE-1.3B-diffusers` |
+| Video Generation (sync) | `/v1/videos/sync` (new in 0.20.0) | `Wan-AI/Wan2.1-T2V-1.3B-Diffusers`, `Wan-AI/Wan2.1-VACE-1.3B-diffusers` |
 | Multimodal Chat | `/v1/chat/completions` | `bytedance-research/BAGEL-7B-MoT`, `Qwen/Qwen2.5-Omni-3B` |
 
 ## Model Compatibility
@@ -59,15 +101,35 @@ starts the container, waits for readiness, submits a request, and writes the out
 **Model:** [Qwen3-TTS-12Hz-1.7B-CustomVoice](https://huggingface.co/Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice) — a 1.7B-parameter Qwen3 text-to-speech
 model supporting multiple voices and languages, runs on a single 24 GB GPU (A10G / L4).
 
+For voice cloning, use [Qwen3-TTS-12Hz-1.7B-Base](https://huggingface.co/Qwen/Qwen3-TTS-12Hz-1.7B-Base) or
+[CosyVoice3-0.5B](https://huggingface.co/FunAudioLLM/CosyVoice3-0.5B) — both accept a reference audio clip plus its transcript and synthesize new
+speech in the reference speaker's voice. CosyVoice3 is zero-shot voice-clone only (no preset voices) and requires `--trust-remote-code`.
+
 ```bash
 --8<-- "examples/vllm-omni/tts/run.sh"
 ```
 
+### Audio Generation
+
+**Model:** [Stable-Audio-Open-1.0](https://huggingface.co/stabilityai/stable-audio-open-1.0) — a diffusion model for text-to-audio (sound effects,
+ambience, short music clips), distinct from TTS. Generates up to ~47 seconds of audio per request, runs on a single 24 GB GPU.
+
+The `/v1/audio/generate` endpoint (new in 0.20.0) takes a text prompt plus diffusion knobs (`audio_length`, `guidance_scale`, `num_inference_steps`,
+`seed`) and returns a single binary WAV blob — no streaming. See the
+[upstream API spec](https://github.com/vllm-project/vllm-omni/blob/main/docs/serving/audio_generate_api.md) for the full request shape.
+
+```bash
+--8<-- "examples/vllm-omni/audio-generate/run.sh"
+```
+
 ### Image Generation
 
 **Model:** [FLUX.2-klein-4B](https://huggingface.co/black-forest-labs/FLUX.2-klein-4B) — a 4B-parameter rectified-flow transformer from Black Forest
 Labs, produces high-quality 512×512 images from text prompts, runs on a single 24 GB GPU.
 
+[ERNIE-Image-Turbo](https://huggingface.co/baidu/ERNIE-Image-Turbo) is also supported as of 0.20.0 — an 8-step distilled DiT for fast inference with a
+matching request shape.
+
 ```bash
 --8<-- "examples/vllm-omni/image/run.sh"
 ```
@@ -76,14 +138,24 @@ Labs, produces high-quality 512×512 images from text prompts, runs on a single
 
 **Model:** [Wan2.1-T2V-1.3B](https://huggingface.co/Wan-AI/Wan2.1-T2V-1.3B-Diffusers) — a 1.3B-parameter text-to-video diffusion model from the Wan
 team, generates short clips at up to 480×832 resolution. Needs a 48 GB GPU (L40S) or 2× 24 GB GPUs with `--tensor-parallel-size 2`.
+[Wan2.1-VACE-1.3B](https://huggingface.co/Wan-AI/Wan2.1-VACE-1.3B-diffusers) (added in 0.20.0) is a unified video creation/editing pipeline that
+accepts text plus optional video, mask, or reference image inputs.
 
-The `/v1/videos` endpoint is asynchronous — it returns a job ID immediately and generates the video in the background. The script below submits the
-job, polls until it completes, then downloads the MP4.
+Two route options:
+
+- **Async** (`POST /v1/videos`) — returns a job ID immediately; poll `GET /v1/videos/{id}` until status is `completed`, then download the MP4 from
+  `GET /v1/videos/{id}/content`. Best for long-running batch jobs and the only option in 0.18.0.
+- **Sync** (`POST /v1/videos/sync`, new in 0.20.0) — blocks until generation completes and returns the raw MP4 in the response body. Simpler client
+  code, and crucially the only video path that returns the MP4 on SageMaker, via async inference (see [SageMaker Deployment](#sagemaker-deployment)).
 
 ```bash
 --8<-- "examples/vllm-omni/video/run.sh"
 ```
 
+```bash
+--8<-- "examples/vllm-omni/video-sync/run.sh"
+```
+
 ### Multimodal Chat
 
 Use the standard OpenAI chat-completions API. Multimodal inputs (images, audio) are supplied as URL or base64 content parts in the message list.
@@ -128,8 +200,10 @@ header:
 | `CustomAttributes` | Dispatched to |
 | --- | --- |
 | `route=/v1/audio/speech` | TTS |
+| `route=/v1/audio/generate` | Audio generation (new in 0.20.0) |
 | `route=/v1/images/generations` | Image generation |
-| `route=/v1/videos` | Video generation (JSON auto-converted to form-data) — returns job-ID only in 0.18.0, MP4 not retrievable via SageMaker |
+| `route=/v1/videos` | Video generation, async (JSON auto-converted to form-data) — returns job-ID only; MP4 not retrievable via SageMaker. Prefer `/v1/videos/sync` below. |
+| `route=/v1/videos/sync` | Video generation, sync (new in 0.20.0) — blocks server-side and returns raw MP4 bytes; deploy behind SageMaker async inference (first-request `torch.compile` warmup exceeds the 60s real-time invoke timeout) |
 | `route=/v1/chat/completions` | Multimodal chat |
 | *(no route)* | vLLM default `/invocations` (chat/completion/embed) |
 
@@ -153,7 +227,7 @@ Any `SM_VLLM_*` env var is converted to a `--<name>` CLI argument (e.g., `SM_VLL
 --8<-- "examples/vllm-omni/sagemaker/deploy_tts.py"
 ```
 
-GPU deploys require `inference_ami_version` — the default SageMaker host AMI has incompatible NVIDIA drivers for CUDA 12.9 images. See
+GPU deploys require `inference_ami_version` — the default SageMaker host AMI has incompatible NVIDIA drivers for CUDA 13.0 images. See
 [ProductionVariant API reference](https://docs.aws.amazon.com/sagemaker/latest/APIReference/API_ProductionVariant.html) for valid values.
 
 When done, delete the endpoint:
@@ -167,8 +241,6 @@ predictor.delete_endpoint()
 SageMaker real-time inference has a 60-second timeout. First requests to TTS models may exceed this due to `torch.compile` warmup (~67s); async
 inference avoids the limit, as does retrying after warmup completes.
 
-!!! warning "Video generation is not supported on SageMaker in 0.18.0 — see [Known Limitations](#known-limitations) below. Use EC2 for video."
-
 ```python
 --8<-- "examples/vllm-omni/sagemaker/deploy_tts_async.py"
 ```
@@ -177,15 +249,39 @@ For async inference, upload the JSON input payload to S3 first, then call `invok
 `CustomAttributes="route=/v1/audio/speech"`. The resulting `.out` object in the configured S3 output path is the raw WAV audio — no polling or
 additional retrieval step required.
 
+### Deploy a Video Endpoint
+
+The `/v1/videos/sync` endpoint (new in 0.20.0) is the supported path for video on SageMaker. Unlike the async `/v1/videos` route — which writes a
+job-ID JSON to S3 but never the MP4 — `/v1/videos/sync` blocks server-side until generation completes and writes the raw MP4 bytes to the configured
+S3 output path.
+
+Deploy behind **SageMaker async inference** (`AsyncInferenceConfig`), not real-time inference: first-request latency on video models is dominated by
+model load + `torch.compile` warmup (3–4 minutes for Wan2.1-VACE-1.3B), which exceeds the 60-second real-time invoke timeout. Async inference allows
+up to 1 hour and writes the response body verbatim to S3, so the `.out` object *is* the MP4 — no polling on a job ID.
+
+```python
+--8<-- "examples/vllm-omni/sagemaker/deploy_video_sync.py"
+```
+
+Validated 2026-05-11 on `ml.g5.2xlarge` (A10G 24 GB VRAM, 32 GB host RAM): 45 KB MP4 in ~10s after warmup. If requests run long, reduce
+`num_inference_steps` and `num_frames` to keep them under the 1-hour async inference limit.
+
 ## Known Limitations
 
-- **Video generation is not supported on SageMaker in 0.18.0.** The `/v1/videos` endpoint is async by design — it returns a job-ID JSON immediately
-  and generates the MP4 in the background. Through SageMaker async inference, only that job-ID JSON is written to S3; the MP4 itself never lands in S3
-  and cannot be retrieved through `invoke_endpoint` or `invoke_endpoint_async`. Use EC2 for video generation — direct container access supports the
-  full workflow (create job, poll status, download MP4). SageMaker support is expected once `POST /v1/videos/sync` (which blocks and returns raw MP4
-  bytes) is available in a future vllm-omni release.
-- **First-request latency on SageMaker real-time endpoints.** TTS models can exceed the 60s invoke timeout on the first request due to `torch.compile`
-  warmup. Use async inference or retry after warmup.
+- **`/v1/videos` (async) on SageMaker writes only the job-ID JSON to S3, not the MP4.** This is unchanged from 0.18.0 — the async route generates the
+  MP4 in the background and the bytes never land in S3. Use the new `/v1/videos/sync` route on SageMaker (see
+  [Deploy a Video Endpoint](#deploy-a-video-endpoint)) or stay on EC2 for the async workflow with status polling.
+- **First-request latency on SageMaker real-time endpoints.** TTS, audio-generate, and video models can exceed the 60s invoke timeout on the first
+  request due to `torch.compile` warmup. Use async inference or retry after warmup.
+- **Voice-clone TTS (Qwen3-TTS-Base) is slower in 0.20.0 than 0.18.0 due to an upstream Code2Wav decode-chunk un-batching regression**
+  ([vllm-omni#3203](https://github.com/vllm-project/vllm-omni/pull/3203)). Observed on `g6.xlarge` with `qwen3-tts-12hz-1.7b-base`, concurrency 4, 20
+  prompts: requests/s **0.4 → 0.281**, audio RTF multiplier **1.6 → 1.109**, p95 E2E **11s → 15.9s**. TTS quality is unchanged. The fix is merged
+  upstream as [vllm-omni#3485](https://github.com/vllm-project/vllm-omni/pull/3485) post-0.20.0 and will land in the next omni point release.
+  Preset-voice TTS (Qwen3-TTS-CustomVoice) is unaffected.
+- **CosyVoice3 requires `--trust-remote-code` and ~32 GB host RAM during model load.** A 16 GB host can SIGKILL the process during HuggingFace cache
+  hydration. Prefer `g6e.xlarge` or larger for both EC2 and SageMaker instance types.
+- **Stable-Audio-Open output is capped at ~47 seconds per request** by the model itself. For longer clips, run multiple requests with adjusted
+  `audio_start` and concatenate client-side.
 
 ## Release Notes
 
diff --git a/examples/vllm-omni/audio-generate/run.sh b/examples/vllm-omni/audio-generate/run.sh
new file mode 100755
index 000000000000..4252e7b9cdc7
--- /dev/null
+++ b/examples/vllm-omni/audio-generate/run.sh
@@ -0,0 +1,23 @@
+#!/usr/bin/env bash
+# End-to-end audio-generation example: start server, wait for ready, generate a 5-second clip.
+# /v1/audio/generate (new in 0.20.0) is diffusion-based text-to-audio, distinct from TTS at /v1/audio/speech.
+# Requires: docker (with NVIDIA runtime), curl, an authenticated ECR pull.
+set -euo pipefail
+
+IMAGE="${IMAGE:-763104351884.dkr.ecr.us-west-2.amazonaws.com/vllm:omni-cuda-v1}"
+MODEL="${MODEL:-stabilityai/stable-audio-open-1.0}"
+NAME="${NAME:-omni-audio-generate}"
+
+docker run -d --name "${NAME}" --gpus all --shm-size=2g -p 8080:8000 \
+  -v "${HOME}/hf-cache:/root/.cache/huggingface" \
+  "${IMAGE}" --model "${MODEL}" --trust-remote-code --enforce-eager
+
+until curl -sf http://localhost:8080/health >/dev/null; do sleep 5; done  # poll /health every 5s until it succeeds
+
+curl -sf -X POST http://localhost:8080/v1/audio/generate \
+  -H "Content-Type: application/json" \
+  -d '{"input": "A jazz piano improvisation", "audio_length": 5.0, "guidance_scale": 7.0, "num_inference_steps": 50, "seed": 42}' \
+  --output sound.wav
+
+echo "wrote sound.wav ($(stat -f%z sound.wav 2>/dev/null || stat -c%s sound.wav) bytes)"  # BSD/macOS stat first, GNU stat as fallback
+# Cleanup:  docker stop "${NAME}" && docker rm "${NAME}"
diff --git a/examples/vllm-omni/image/run.sh b/examples/vllm-omni/image/run.sh
index d6dc1615ad95..28008fb19d3a 100755
--- a/examples/vllm-omni/image/run.sh
+++ b/examples/vllm-omni/image/run.sh
@@ -2,7 +2,7 @@
 # End-to-end image-generation example: start server, wait for ready, generate.
 set -euo pipefail
 
-IMAGE="${IMAGE:-763104351884.dkr.ecr.us-west-2.amazonaws.com/vllm-omni:omni-cuda-v1}"
+IMAGE="${IMAGE:-763104351884.dkr.ecr.us-west-2.amazonaws.com/vllm:omni-cuda-v1}"
 MODEL="${MODEL:-black-forest-labs/FLUX.2-klein-4B}"
 NAME="${NAME:-omni-image}"
 
diff --git a/examples/vllm-omni/qwen2.5-omni/run.sh b/examples/vllm-omni/qwen2.5-omni/run.sh
index a04624bdf99d..98e020cb6ff6 100755
--- a/examples/vllm-omni/qwen2.5-omni/run.sh
+++ b/examples/vllm-omni/qwen2.5-omni/run.sh
@@ -6,7 +6,7 @@
 # On single-GPU hosts the model's talker stage fails to load on GPU 1.
 set -euo pipefail
 
-IMAGE="${IMAGE:-763104351884.dkr.ecr.us-west-2.amazonaws.com/vllm-omni:omni-cuda-v1}"
+IMAGE="${IMAGE:-763104351884.dkr.ecr.us-west-2.amazonaws.com/vllm:omni-cuda-v1}"
 MODEL="${MODEL:-Qwen/Qwen2.5-Omni-3B}"
 NAME="${NAME:-omni3b}"
 
diff --git a/examples/vllm-omni/sagemaker/deploy_video_sync.py b/examples/vllm-omni/sagemaker/deploy_video_sync.py
new file mode 100644
index 000000000000..0717099aebbd
--- /dev/null
+++ b/examples/vllm-omni/sagemaker/deploy_video_sync.py
@@ -0,0 +1,95 @@
+"""Deploy a vLLM-Omni video model to a SageMaker async inference endpoint
+and invoke the new /v1/videos/sync endpoint, which blocks server-side until
+generation completes and returns raw MP4 bytes.
+
+Async inference is required for video — first-request latency includes model
+load + torch.compile warmup (3-4 min for Wan2.1-VACE-1.3B), well past the
+60s real-time invoke timeout. Async inference allows up to 1 hour and
+deposits the response body verbatim at the configured S3 output path, so the
+.out object is the raw MP4.
+
+Available since vLLM-Omni 0.20.0; supersedes the 0.18.0 limitation that
+SageMaker async inference could only retrieve the job-ID JSON, not the MP4.
+The routing middleware (`CustomAttributes="route=/v1/videos/sync"`) auto-
+converts the JSON request body to multipart/form-data for the underlying
+endpoint; values must therefore be JSON strings.
+
+Validated 2026-05-11 on ml.g5.2xlarge (A10G 24 GB VRAM, 32 GB host RAM):
+45 KB MP4 returned in ~10s after warmup.
+"""
+
+import time
+
+import boto3
+from sagemaker.async_inference import AsyncInferenceConfig
+from sagemaker.model import Model
+from sagemaker.predictor import Predictor
+from sagemaker.serializers import JSONSerializer
+
+BUCKET = "<BUCKET>"  # replace with an S3 bucket your role can read/write
+ROLE_ARN = "arn:aws:iam::<ACCOUNT>:role/SageMakerExecutionRole"
+ENDPOINT_NAME = "vllm-omni-video-sync"
+
+model = Model(
+    image_uri="763104351884.dkr.ecr.us-west-2.amazonaws.com/vllm:omni-sagemaker-cuda-v1",
+    role=ROLE_ARN,
+    env={"SM_VLLM_MODEL": "Wan-AI/Wan2.1-VACE-1.3B-diffusers"},
+    predictor_cls=Predictor,
+)
+
+predictor = model.deploy(
+    instance_type="ml.g5.2xlarge",
+    initial_instance_count=1,
+    endpoint_name=ENDPOINT_NAME,
+    inference_ami_version="al2-ami-sagemaker-inference-gpu-3-1",
+    serializer=JSONSerializer(),
+    async_inference_config=AsyncInferenceConfig(
+        output_path=f"s3://{BUCKET}/vllm-omni-async-output/",
+        max_concurrent_invocations_per_instance=1,
+    ),
+    wait=True,
+)
+
+# Upload the input payload to S3, then call invoke_endpoint_async with
+# CustomAttributes routing to /v1/videos/sync. Values are strings because
+# the middleware converts JSON to multipart/form-data.
+s3 = boto3.client("s3")
+s3.put_object(
+    Bucket=BUCKET,
+    Key="vllm-omni-async-input/request.json",
+    Body=(
+        '{"prompt": "a dog running on a beach", '
+        '"num_frames": "17", "num_inference_steps": "4", '
+        '"size": "480x320", "seed": "42"}'
+    ),
+    ContentType="application/json",
+)
+
+runtime = boto3.client("sagemaker-runtime")
+result = runtime.invoke_endpoint_async(
+    EndpointName=ENDPOINT_NAME,
+    InputLocation=f"s3://{BUCKET}/vllm-omni-async-input/request.json",
+    ContentType="application/json",
+    CustomAttributes="route=/v1/videos/sync",
+)
+output_location = result["OutputLocation"]  # s3://.../<id>.out
+print(f"Output will be written to {output_location}")
+
+# Poll for the .out object (raw MP4 bytes): ~3-4 min cold (model load + torch.compile), ~3-10s warm.
+# NOTE(review): a failed invocation presumably writes no .out, so it would surface only as the timeout below — TODO confirm.
+bucket = output_location.split("/", 3)[2]  # "s3://<bucket>/<key>" -> <bucket>
+key = output_location.split("/", 3)[3]  # "s3://<bucket>/<key>" -> <key>
+for _ in range(120):  # 120 polls x 5 s sleep = 10 min timeout
+    try:
+        obj = s3.get_object(Bucket=bucket, Key=key)
+        with open("video.mp4", "wb") as f:
+            f.write(obj["Body"].read())
+        print(f"wrote video.mp4 (Content-Type: {obj.get('ContentType', '?')})")
+        break
+    except s3.exceptions.NoSuchKey:  # output object not there yet; keep polling
+        time.sleep(5)
+else:  # loop exhausted without reaching the break above
+    raise RuntimeError("timed out waiting for async output")
+
+# When done:
+# predictor.delete_endpoint()
diff --git a/examples/vllm-omni/tts/run.sh b/examples/vllm-omni/tts/run.sh
index 9f4f185a2139..3af3837ed3e4 100755
--- a/examples/vllm-omni/tts/run.sh
+++ b/examples/vllm-omni/tts/run.sh
@@ -3,7 +3,7 @@
 # Requires: docker (with NVIDIA runtime), curl, an authenticated ECR pull.
 set -euo pipefail
 
-IMAGE="${IMAGE:-763104351884.dkr.ecr.us-west-2.amazonaws.com/vllm-omni:omni-cuda-v1}"
+IMAGE="${IMAGE:-763104351884.dkr.ecr.us-west-2.amazonaws.com/vllm:omni-cuda-v1}"
 MODEL="${MODEL:-Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice}"
 NAME="${NAME:-omni-tts}"
 
diff --git a/examples/vllm-omni/video-sync/run.sh b/examples/vllm-omni/video-sync/run.sh
new file mode 100755
index 000000000000..e3200a0e2a24
--- /dev/null
+++ b/examples/vllm-omni/video-sync/run.sh
@@ -0,0 +1,25 @@
+#!/usr/bin/env bash
+# End-to-end sync video-generation example: start server, submit, get MP4 back.
+# /v1/videos/sync (new in 0.20.0) blocks until the video is ready and returns raw MP4
+# bytes — no job-ID polling, unlike async /v1/videos. Needs 2 GPUs (--tensor-parallel-size 2).
+set -euo pipefail
+
+IMAGE="${IMAGE:-763104351884.dkr.ecr.us-west-2.amazonaws.com/vllm:omni-cuda-v1}"
+MODEL="${MODEL:-Wan-AI/Wan2.1-T2V-1.3B-Diffusers}"
+NAME="${NAME:-omni-video-sync}"
+
+docker run -d --name "${NAME}" --gpus all --shm-size=8g -p 8080:8000 \
+  -v "${HOME}/hf-cache:/root/.cache/huggingface" \
+  "${IMAGE}" --model "${MODEL}" --tensor-parallel-size 2
+
+until curl -sf http://localhost:8080/health >/dev/null; do sleep 5; done  # poll /health every 5s until it succeeds
+
+# /v1/videos/sync requires multipart/form-data and blocks until the MP4 is ready.
+curl -sf -X POST http://localhost:8080/v1/videos/sync \
+  -F "prompt=a dog running on a beach at sunset" \
+  -F "num_frames=17" -F "num_inference_steps=30" \
+  -F "size=480x320" -F "seed=42" \
+  --output video.mp4
+
+echo "wrote video.mp4 ($(stat -f%z video.mp4 2>/dev/null || stat -c%s video.mp4) bytes)"  # BSD/macOS stat first, GNU stat as fallback
+# Cleanup:  docker stop "${NAME}" && docker rm "${NAME}"
diff --git a/examples/vllm-omni/video/run.sh b/examples/vllm-omni/video/run.sh
index 36db972d82f3..7c716dd2e0fc 100755
--- a/examples/vllm-omni/video/run.sh
+++ b/examples/vllm-omni/video/run.sh
@@ -3,7 +3,7 @@
 # /v1/videos is async — it returns a job ID; the MP4 is produced in the background.
 set -euo pipefail
 
-IMAGE="${IMAGE:-763104351884.dkr.ecr.us-west-2.amazonaws.com/vllm-omni:omni-cuda-v1}"
+IMAGE="${IMAGE:-763104351884.dkr.ecr.us-west-2.amazonaws.com/vllm:omni-cuda-v1}"
 MODEL="${MODEL:-Wan-AI/Wan2.1-T2V-1.3B-Diffusers}"
 NAME="${NAME:-omni-video}"