From d7500edbdd7fb9a8eb5d7e467a7be122dc88a218 Mon Sep 17 00:00:00 2001
From: Yadan Wei <yadanwei@amazon.com>
Date: Tue, 12 May 2026 15:16:37 -0700
Subject: [PATCH 01/10] docs(vllm-omni): add 0.20.0 release notes, new
 endpoints, and known limitations
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

New version data files (auto-feed available_images.md and release-notes pages)
------------------------------------------------------------------------------
  docs/src/data/vllm-omni/0.20.0-gpu-ec2.yml
  docs/src/data/vllm-omni/0.20.0-gpu-sagemaker.yml

Pinned package versions match upstream vllm v0.20.0 requirements/cuda.txt:
PyTorch 2.11.0, torchvision 0.26.0, torchaudio 2.11.0, flashinfer 0.6.8.post1,
CUDA 13.0.2. Same omni-cuda-v1 / omni-sagemaker-cuda-v1 tags are reused for
the new image (both v1 tags now point at 0.20.0).

docs/vllm-omni/index.md
-----------------------
- May 12, 2026 announcement covering the 0.20.0 alignment, CUDA 13.0 bump,
  new /v1/audio/generate and /v1/videos/sync endpoints, and the four new
  supported models (CosyVoice3, ERNIE-Image-Turbo, Wan2.1-VACE-1.3B,
  Stable-Audio-Open-1.0).
- Header CUDA reference 12.9 -> 13.0.
- Supported Modalities table grows two rows (Audio Generation, Video sync)
  and the example-model lists are expanded for TTS / image / video.
- New EC2 sections: Audio Generation (stable-audio-open) and Video sync.
- SageMaker routing-middleware table: adds /v1/audio/generate and
  /v1/videos/sync rows; the existing async /v1/videos row now points at
  the sync route as the recommended SageMaker path.
- New SageMaker section: Deploy a Video Endpoint (sync) — replaces the
  previous "video not supported on SageMaker" warning since that was the
  exact gap /v1/videos/sync closes.
- Known Limitations refreshed: drops the SageMaker-video-not-supported
  item, keeps torch.compile warmup, adds usage.completion_tokens=0 caveat
  for omni-chat, CosyVoice3 host-RAM requirement, and stable-audio-open's
  ~47s per-request cap.

New endpoint examples
---------------------
  examples/vllm-omni/audio-generate/run.sh        — stable-audio-open EC2
  examples/vllm-omni/video-sync/run.sh            — sync video EC2
  examples/vllm-omni/sagemaker/deploy_video_sync.py — sync video on SageMaker

All three follow the existing examples' shape (single-shot docker run,
health check, single curl/invoke, exit) so the index.md --8<-- includes
work without further changes.

Auto-generated release notes (docs/releasenotes/vllm-omni/0.20.0-*.md)
and the available_images.md table row are emitted by docs/src/main.py
from the YAMLs above; both are gitignored.

Verified locally with `python docs/src/main.py && mkdocs serve`:
  /deep-learning-containers/vllm-omni/                       (HTTP 200)
  /deep-learning-containers/releasenotes/vllm-omni/0.20.0-*  (rendered)
  /deep-learning-containers/reference/available_images/      (0.20.0 row above 0.18.0)

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

Signed-off-by: Yadan Wei <yadanwei@amazon.com>
---
 docs/src/data/vllm-omni/0.20.0-gpu-ec2.yml    | 29 ++++++
 .../data/vllm-omni/0.20.0-gpu-sagemaker.yml   | 29 ++++++
 docs/vllm-omni/index.md                       | 94 +++++++++++++++----
 examples/vllm-omni/audio-generate/run.sh      | 23 +++++
 .../vllm-omni/sagemaker/deploy_video_sync.py  | 50 ++++++++++
 examples/vllm-omni/video-sync/run.sh          | 25 +++++
 6 files changed, 231 insertions(+), 19 deletions(-)
 create mode 100644 docs/src/data/vllm-omni/0.20.0-gpu-ec2.yml
 create mode 100644 docs/src/data/vllm-omni/0.20.0-gpu-sagemaker.yml
 create mode 100755 examples/vllm-omni/audio-generate/run.sh
 create mode 100644 examples/vllm-omni/sagemaker/deploy_video_sync.py
 create mode 100755 examples/vllm-omni/video-sync/run.sh

diff --git a/docs/src/data/vllm-omni/0.20.0-gpu-ec2.yml b/docs/src/data/vllm-omni/0.20.0-gpu-ec2.yml
new file mode 100644
index 000000000000..6daab83318ba
--- /dev/null
+++ b/docs/src/data/vllm-omni/0.20.0-gpu-ec2.yml
@@ -0,0 +1,29 @@
+framework: vLLM-Omni
+version: "0.20.0"
+ecr_repository: vllm
+accelerator: gpu
+python: py312
+cuda: cu130
+os: amzn2023
+platform: default
+public_registry: true
+
+tags:
+  - "omni-cuda-v1"
+
+announcements:
+  - "Bumps vLLM-Omni to 0.20.0 and aligns with upstream vLLM v0.20.0"
+  - "CUDA 12.9 → 13.0 base image; PyTorch 2.10.0 → 2.11.0"
+  - "New `/v1/audio/generate` endpoint for diffusion-based audio generation (e.g., stable-audio-open)"
+  - "New `/v1/videos/sync` endpoint — blocking variant of `/v1/videos` that returns the MP4 directly"
+  - "Adds support for CosyVoice3, ERNIE-Image-Turbo, Wan2.1-VACE-1.3B, and Stable-Audio-Open-1.0"
+
+packages:
+  vllm: "0.20.0"
+  vllm-omni: "0.20.0"
+  pytorch: "2.11.0"
+  torchvision: "0.26.0"
+  torchaudio: "2.11.0"
+  cuda: "13.0.2"
+  flashinfer: "0.6.8.post1"
+  efa: "1.47.0"
diff --git a/docs/src/data/vllm-omni/0.20.0-gpu-sagemaker.yml b/docs/src/data/vllm-omni/0.20.0-gpu-sagemaker.yml
new file mode 100644
index 000000000000..c261b5f24dd9
--- /dev/null
+++ b/docs/src/data/vllm-omni/0.20.0-gpu-sagemaker.yml
@@ -0,0 +1,29 @@
+framework: vLLM-Omni
+version: "0.20.0"
+ecr_repository: vllm
+accelerator: gpu
+python: py312
+cuda: cu130
+os: amzn2023
+platform: sagemaker
+public_registry: true
+
+tags:
+  - "omni-sagemaker-cuda-v1"
+
+announcements:
+  - "Bumps vLLM-Omni to 0.20.0 and aligns with upstream vLLM v0.20.0"
+  - "CUDA 12.9 → 13.0 base image; PyTorch 2.10.0 → 2.11.0"
+  - "Video generation now supported on SageMaker via the new `/v1/videos/sync` endpoint"
+  - "Adds `/v1/audio/generate` and `/v1/videos/sync` to the routing middleware"
+  - "Adds support for CosyVoice3, ERNIE-Image-Turbo, Wan2.1-VACE-1.3B, and Stable-Audio-Open-1.0"
+
+packages:
+  vllm: "0.20.0"
+  vllm-omni: "0.20.0"
+  pytorch: "2.11.0"
+  torchvision: "0.26.0"
+  torchaudio: "2.11.0"
+  cuda: "13.0.2"
+  flashinfer: "0.6.8.post1"
+  efa: "1.47.0"
diff --git a/docs/vllm-omni/index.md b/docs/vllm-omni/index.md
index ba7e5ffb4689..97271b3510d5 100644
--- a/docs/vllm-omni/index.md
+++ b/docs/vllm-omni/index.md
@@ -1,10 +1,15 @@
 # vLLM-Omni Inference
 
-Pre-built Docker images for serving omni-modality models (text-to-speech, image generation, video generation, and multimodal chat) with
-[vLLM-Omni](https://github.com/vllm-project/vllm-omni). Built on Amazon Linux 2023 with CUDA 12.9 and Python 3.12.
+Pre-built Docker images for serving omni-modality models (text-to-speech, audio generation, image generation, video generation, and multimodal chat)
+with [vLLM-Omni](https://github.com/vllm-project/vllm-omni). Built on Amazon Linux 2023 with CUDA 13.0 and Python 3.12.
 
 ## Latest Announcements
 
+**May 12, 2026** — vLLM-Omni 0.20.0 release. Aligns with upstream vLLM v0.20.0; bumps CUDA to 13.0 and PyTorch to 2.11.0. Adds two new endpoints:
+`/v1/audio/generate` for diffusion-based audio generation (e.g., stable-audio-open) and `/v1/videos/sync` — a blocking variant of `/v1/videos` that
+returns the MP4 directly and unblocks video generation on SageMaker. New supported models: CosyVoice3, ERNIE-Image-Turbo, Wan2.1-VACE-1.3B,
+Stable-Audio-Open-1.0.
+
 **April 24, 2026** — vLLM-Omni 0.18.0 initial release. Serves TTS, image, video, and omni-chat models through OpenAI-compatible APIs. Includes a
 SageMaker routing middleware for dispatching `/invocations` to any omni endpoint via `CustomAttributes`.
 
@@ -31,11 +36,13 @@ For package versions included in each release, see the [Release Notes](../releas
 
 ## Supported Modalities
 
-| Modality | Route | Example Model |
+| Modality | Route | Example Models |
 | --- | --- | --- |
-| Text-to-Speech | `/v1/audio/speech` | `Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice` |
-| Image Generation | `/v1/images/generations` | `black-forest-labs/FLUX.2-klein-4B` |
-| Video Generation | `/v1/videos` | `Wan-AI/Wan2.1-T2V-1.3B-Diffusers` |
+| Text-to-Speech | `/v1/audio/speech` | `Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice`, `Qwen/Qwen3-TTS-12Hz-1.7B-Base`, `FunAudioLLM/CosyVoice3-0.5B` |
+| Audio Generation | `/v1/audio/generate` (new in 0.20.0) | `stabilityai/stable-audio-open-1.0` |
+| Image Generation | `/v1/images/generations` | `black-forest-labs/FLUX.2-klein-4B`, `baidu/ERNIE-Image-Turbo` |
+| Video Generation (async) | `/v1/videos` | `Wan-AI/Wan2.1-T2V-1.3B-Diffusers`, `Wan-AI/Wan2.1-VACE-1.3B-Diffusers` |
+| Video Generation (sync) | `/v1/videos/sync` (new in 0.20.0) | `Wan-AI/Wan2.1-T2V-1.3B-Diffusers`, `Wan-AI/Wan2.1-VACE-1.3B-Diffusers` |
 | Multimodal Chat | `/v1/chat/completions` | `bytedance-research/BAGEL-7B-MoT`, `Qwen/Qwen2.5-Omni-3B` |
 
 ## Model Compatibility
@@ -59,15 +66,35 @@ starts the container, waits for readiness, submits a request, and writes the out
 **Model:** [Qwen3-TTS-12Hz-1.7B-CustomVoice](https://huggingface.co/Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice) — a 1.7B-parameter Qwen3 text-to-speech
 model supporting multiple voices and languages, runs on a single 24 GB GPU (A10G / L4).
 
+For voice cloning, use [Qwen3-TTS-12Hz-1.7B-Base](https://huggingface.co/Qwen/Qwen3-TTS-12Hz-1.7B-Base) or
+[CosyVoice3-0.5B](https://huggingface.co/FunAudioLLM/CosyVoice3-0.5B) — both accept a reference audio clip plus its transcript and synthesize new
+speech in the reference speaker's voice. CosyVoice3 is zero-shot voice-clone only (no preset voices) and requires `--trust-remote-code`.
+
 ```bash
 --8<-- "examples/vllm-omni/tts/run.sh"
 ```
 
+### Audio Generation
+
+**Model:** [Stable-Audio-Open-1.0](https://huggingface.co/stabilityai/stable-audio-open-1.0) — a diffusion model for text-to-audio (sound effects,
+ambience, short music clips), distinct from TTS. Generates up to ~47 seconds of audio per request, runs on a single 24 GB GPU.
+
+The `/v1/audio/generate` endpoint (new in 0.20.0) takes a text prompt plus diffusion knobs (`audio_length`, `guidance_scale`, `num_inference_steps`,
+`seed`) and returns a single binary WAV blob — no streaming. See the
+[upstream API spec](https://github.com/vllm-project/vllm-omni/blob/main/docs/serving/audio_generate_api.md) for the full request shape.
+
+```bash
+--8<-- "examples/vllm-omni/audio-generate/run.sh"
+```
+
 ### Image Generation
 
 **Model:** [FLUX.2-klein-4B](https://huggingface.co/black-forest-labs/FLUX.2-klein-4B) — a 4B-parameter rectified-flow transformer from Black Forest
 Labs, produces high-quality 512×512 images from text prompts, runs on a single 24 GB GPU.
 
+[ERNIE-Image-Turbo](https://huggingface.co/baidu/ERNIE-Image-Turbo) is also supported as of 0.20.0 — an 8-step distilled DiT for fast inference with a
+matching request shape.
+
 ```bash
 --8<-- "examples/vllm-omni/image/run.sh"
 ```
@@ -76,14 +103,24 @@ Labs, produces high-quality 512×512 images from text prompts, runs on a single
 
 **Model:** [Wan2.1-T2V-1.3B](https://huggingface.co/Wan-AI/Wan2.1-T2V-1.3B-Diffusers) — a 1.3B-parameter text-to-video diffusion model from the Wan
 team, generates short clips at up to 480×832 resolution. Needs a 48 GB GPU (L40S) or 2× 24 GB GPUs with `--tensor-parallel-size 2`.
+[Wan2.1-VACE-1.3B](https://huggingface.co/Wan-AI/Wan2.1-VACE-1.3B-Diffusers) (added in 0.20.0) is a unified video creation/editing pipeline that
+accepts text plus optional video, mask, or reference image inputs.
+
+Two route options:
 
-The `/v1/videos` endpoint is asynchronous — it returns a job ID immediately and generates the video in the background. The script below submits the
-job, polls until it completes, then downloads the MP4.
+- **Async** (`POST /v1/videos`) — returns a job ID immediately; poll `GET /v1/videos/{id}` until status is `completed`, then download the MP4 from
+  `GET /v1/videos/{id}/content`. Best for long-running batch jobs and the only option in 0.18.0.
+- **Sync** (`POST /v1/videos/sync`, new in 0.20.0) — blocks until generation completes and returns the raw MP4 in the response body. Simpler client
+  code, and crucially the only video path that works through SageMaker real-time endpoints (see [SageMaker Deployment](#sagemaker-deployment)).
 
 ```bash
 --8<-- "examples/vllm-omni/video/run.sh"
 ```
 
+```bash
+--8<-- "examples/vllm-omni/video-sync/run.sh"
+```
+
 ### Multimodal Chat
 
 Use the standard OpenAI chat-completions API. Multimodal inputs (images, audio) are supplied as URL or base64 content parts in the message list.
@@ -128,8 +165,10 @@ header:
 | `CustomAttributes` | Dispatched to |
 | --- | --- |
 | `route=/v1/audio/speech` | TTS |
+| `route=/v1/audio/generate` | Audio generation (new in 0.20.0) |
 | `route=/v1/images/generations` | Image generation |
-| `route=/v1/videos` | Video generation (JSON auto-converted to form-data) — returns job-ID only in 0.18.0, MP4 not retrievable via SageMaker |
+| `route=/v1/videos` | Video generation, async (JSON auto-converted to form-data) — returns job-ID only; MP4 not retrievable via SageMaker. Prefer `/v1/videos/sync` below. |
+| `route=/v1/videos/sync` | Video generation, sync (new in 0.20.0) — blocks and returns raw MP4 bytes; works through SageMaker real-time endpoints |
 | `route=/v1/chat/completions` | Multimodal chat |
 | *(no route)* | vLLM default `/invocations` (chat/completion/embed) |
 
@@ -153,7 +192,7 @@ Any `SM_VLLM_*` env var is converted to a `--<name>` CLI argument (e.g., `SM_VLL
 --8<-- "examples/vllm-omni/sagemaker/deploy_tts.py"
 ```
 
-GPU deploys require `inference_ami_version` — the default SageMaker host AMI has incompatible NVIDIA drivers for CUDA 12.9 images. See
+GPU deploys require `inference_ami_version` — the default SageMaker host AMI has incompatible NVIDIA drivers for CUDA 13.0 images. See
 [ProductionVariant API reference](https://docs.aws.amazon.com/sagemaker/latest/APIReference/API_ProductionVariant.html) for valid values.
 
 When done, delete the endpoint:
@@ -167,8 +206,6 @@ predictor.delete_endpoint()
 SageMaker real-time inference has a 60-second timeout. First requests to TTS models may exceed this due to `torch.compile` warmup (~67s); async
 inference avoids the limit, as does retrying after warmup completes.
 
-!!! warning "Video generation is not supported on SageMaker in 0.18.0 — see [Known Limitations](#known-limitations) below. Use EC2 for video."
-
 ```python
 --8<-- "examples/vllm-omni/sagemaker/deploy_tts_async.py"
 ```
@@ -177,15 +214,34 @@ For async inference, upload the JSON input payload to S3 first, then call `invok
 `CustomAttributes="route=/v1/audio/speech"`. The resulting `.out` object in the configured S3 output path is the raw WAV audio — no polling or
 additional retrieval step required.
 
+### Deploy a Video Endpoint
+
+The `/v1/videos/sync` endpoint (new in 0.20.0) is the supported path for video on SageMaker. Unlike the async `/v1/videos` route — which writes a
+job-ID JSON to S3 but never the MP4 — `/v1/videos/sync` blocks until generation completes and returns raw MP4 bytes that SageMaker hands back to the
+client directly.
+
+```python
+--8<-- "examples/vllm-omni/sagemaker/deploy_video_sync.py"
+```
+
+Sync video generation can take 30–120 seconds depending on `num_inference_steps` and `num_frames`. If a request approaches the 60s real-time invoke
+timeout, either reduce `num_inference_steps` or use `invoke_endpoint_async` (its 60-minute ceiling accommodates long jobs, and the response body — the
+MP4 — is written verbatim to the S3 output path).
+
 ## Known Limitations
 
-- **Video generation is not supported on SageMaker in 0.18.0.** The `/v1/videos` endpoint is async by design — it returns a job-ID JSON immediately
-  and generates the MP4 in the background. Through SageMaker async inference, only that job-ID JSON is written to S3; the MP4 itself never lands in S3
-  and cannot be retrieved through `invoke_endpoint` or `invoke_endpoint_async`. Use EC2 for video generation — direct container access supports the
-  full workflow (create job, poll status, download MP4). SageMaker support is expected once `POST /v1/videos/sync` (which blocks and returns raw MP4
-  bytes) is available in a future vllm-omni release.
-- **First-request latency on SageMaker real-time endpoints.** TTS models can exceed the 60s invoke timeout on the first request due to `torch.compile`
-  warmup. Use async inference or retry after warmup.
+- **`/v1/videos` (async) on SageMaker writes only the job-ID JSON to S3, not the MP4.** This is unchanged from 0.18.0 — the async route generates the
+  MP4 in the background and the bytes never land in S3. Use the new `/v1/videos/sync` route on SageMaker (see
+  [Deploy a Video Endpoint](#deploy-a-video-endpoint)) or stay on EC2 for the async workflow with status polling.
+- **First-request latency on SageMaker real-time endpoints.** TTS, audio-generate, and video models can exceed the 60s invoke timeout on the first
+  request due to `torch.compile` warmup. Use async inference or retry after warmup.
+- **`usage.completion_tokens` is reported as `0` for omni-chat models.** The `/v1/chat/completions` SSE stream emits `usage.completion_tokens=0` in
+  the terminal block, even when audio and text were generated. Use the per-chunk `metrics.num_tokens_out` field for an accurate engine-side token
+  count (see upstream `vllm_omni/benchmarks/patch/patch.py`).
+- **CosyVoice3 requires `--trust-remote-code` and ~32 GB host RAM during model load.** A 16 GB host can SIGKILL the process during HuggingFace cache
+  hydration. Prefer `g6e.xlarge` or larger for both EC2 and SageMaker instance types.
+- **Stable-Audio-Open output is capped at ~47 seconds per request** by the model itself. For longer clips, run multiple requests with adjusted
+  `audio_start` and concatenate client-side.
 
 ## Release Notes
 
diff --git a/examples/vllm-omni/audio-generate/run.sh b/examples/vllm-omni/audio-generate/run.sh
new file mode 100755
index 000000000000..9d0eb25f592e
--- /dev/null
+++ b/examples/vllm-omni/audio-generate/run.sh
@@ -0,0 +1,23 @@
+#!/usr/bin/env bash
+# End-to-end audio-generation example: start server, generate a 5-second clip.
+# /v1/audio/generate is a diffusion-based text-to-audio endpoint (new in 0.20.0).
+# Distinct from /v1/audio/speech (which is TTS — a voice reading words).
+set -euo pipefail
+
+IMAGE="${IMAGE:-763104351884.dkr.ecr.us-west-2.amazonaws.com/vllm:omni-cuda-v1}"
+MODEL="${MODEL:-stabilityai/stable-audio-open-1.0}"
+NAME="${NAME:-omni-audio-generate}"
+
+docker run -d --name "${NAME}" --gpus all --shm-size=2g -p 8080:8000 \
+  -v "${HOME}/hf-cache:/root/.cache/huggingface" \
+  "${IMAGE}" --model "${MODEL}" --trust-remote-code --enforce-eager
+
+until curl -sf http://localhost:8080/health >/dev/null; do sleep 5; done
+
+curl -sf -X POST http://localhost:8080/v1/audio/generate \
+  -H "Content-Type: application/json" \
+  -d '{"input": "A jazz piano improvisation", "audio_length": 5.0, "guidance_scale": 7.0, "num_inference_steps": 50, "seed": 42}' \
+  --output sound.wav
+
+echo "wrote sound.wav ($(stat -f%z sound.wav 2>/dev/null || stat -c%s sound.wav) bytes)"
+# Cleanup:  docker stop "${NAME}" && docker rm "${NAME}"
diff --git a/examples/vllm-omni/sagemaker/deploy_video_sync.py b/examples/vllm-omni/sagemaker/deploy_video_sync.py
new file mode 100644
index 000000000000..4cf64622d480
--- /dev/null
+++ b/examples/vllm-omni/sagemaker/deploy_video_sync.py
@@ -0,0 +1,50 @@
+"""Deploy a vLLM-Omni video model to a SageMaker real-time endpoint and invoke
+the new /v1/videos/sync endpoint, which blocks until generation completes and
+returns raw MP4 bytes.
+
+Available since vLLM-Omni 0.20.0; supersedes the 0.18.0 limitation that
+SageMaker async inference could only retrieve the job-ID JSON, not the MP4.
+
+Use the routing middleware via `CustomAttributes="route=/v1/videos/sync"`,
+which auto-converts JSON request bodies to multipart/form-data for the
+underlying endpoint.
+"""
+
+import boto3
+from sagemaker.model import Model
+from sagemaker.predictor import Predictor
+from sagemaker.serializers import JSONSerializer
+
+model = Model(
+    image_uri="763104351884.dkr.ecr.us-west-2.amazonaws.com/vllm:omni-sagemaker-cuda-v1",
+    role="arn:aws:iam::<ACCOUNT>:role/SageMakerExecutionRole",
+    env={"SM_VLLM_MODEL": "Wan-AI/Wan2.1-T2V-1.3B-Diffusers"},
+    predictor_cls=Predictor,
+)
+
+predictor = model.deploy(
+    instance_type="ml.g5.2xlarge",
+    initial_instance_count=1,
+    endpoint_name="vllm-omni-video-sync",
+    inference_ami_version="al2-ami-sagemaker-inference-gpu-3-1",
+    serializer=JSONSerializer(),
+    wait=True,
+)
+
+# Invoke /v1/videos/sync via CustomAttributes; response body is the MP4 bytes
+# (Content-Type: video/mp4). Prefer invoke_endpoint over invoke_endpoint_async
+# because sync video can take 30–120s and the real-time path's binary response
+# is what we want — async would write base64-encoded JSON to S3.
+runtime = boto3.client("sagemaker-runtime")
+response = runtime.invoke_endpoint(
+    EndpointName="vllm-omni-video-sync",
+    Body='{"prompt": "a dog running on a beach", "num_frames": 17, '
+    '"num_inference_steps": 30, "size": "480x320", "seed": 42}',
+    ContentType="application/json",
+    CustomAttributes="route=/v1/videos/sync",
+)
+with open("video.mp4", "wb") as f:
+    f.write(response["Body"].read())
+
+# When done:
+# predictor.delete_endpoint()
diff --git a/examples/vllm-omni/video-sync/run.sh b/examples/vllm-omni/video-sync/run.sh
new file mode 100755
index 000000000000..2b1456724e9d
--- /dev/null
+++ b/examples/vllm-omni/video-sync/run.sh
@@ -0,0 +1,25 @@
+#!/usr/bin/env bash
+# End-to-end sync video-generation example: start server, submit, get MP4 back.
+# /v1/videos/sync (new in 0.20.0) blocks until the video is ready and returns
+# raw MP4 bytes — no job-ID polling needed, unlike async /v1/videos.
+set -euo pipefail
+
+IMAGE="${IMAGE:-763104351884.dkr.ecr.us-west-2.amazonaws.com/vllm:omni-cuda-v1}"
+MODEL="${MODEL:-Wan-AI/Wan2.1-T2V-1.3B-Diffusers}"
+NAME="${NAME:-omni-video-sync}"
+
+docker run -d --name "${NAME}" --gpus all --shm-size=8g -p 8080:8000 \
+  -v "${HOME}/hf-cache:/root/.cache/huggingface" \
+  "${IMAGE}" --model "${MODEL}" --tensor-parallel-size 2
+
+until curl -sf http://localhost:8080/health >/dev/null; do sleep 5; done
+
+# /v1/videos/sync requires multipart/form-data and blocks until the MP4 is ready.
+curl -sf -X POST http://localhost:8080/v1/videos/sync \
+  -F "prompt=a dog running on a beach at sunset" \
+  -F "num_frames=17" -F "num_inference_steps=30" \
+  -F "size=480x320" -F "seed=42" \
+  --output video.mp4
+
+echo "wrote video.mp4 ($(stat -f%z video.mp4 2>/dev/null || stat -c%s video.mp4) bytes)"
+# Cleanup:  docker stop "${NAME}" && docker rm "${NAME}"

From 975f50e60b43a431a339a6fa538a3d44b6fac711 Mon Sep 17 00:00:00 2001
From: Yadan Wei <yadanwei@amazon.com>
Date: Tue, 12 May 2026 15:35:08 -0700
Subject: [PATCH 02/10] docs(vllm-omni): note Code2Wav un-batching TTS
 regression in known limitations

Adds a Known Limitations entry documenting the upstream Code2Wav decode-chunk
un-batching regression in vllm-omni#3203 that ships in 0.20.0 and slows
voice-clone TTS (Qwen3-TTS-Base). Observed on g6.xlarge:

  rps           0.4   -> 0.281
  audio rtf     1.6   -> 1.109
  p95 e2e       11s   -> 15.9s

Quality is unchanged. Preset-voice TTS (Qwen3-TTS-CustomVoice) is unaffected.
The fix is already merged upstream as vllm-omni#3485 (post-0.20.0) and will
land in the next omni point release.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

Signed-off-by: Yadan Wei <yadanwei@amazon.com>
---
 docs/vllm-omni/index.md | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/docs/vllm-omni/index.md b/docs/vllm-omni/index.md
index 97271b3510d5..11c092284f94 100644
--- a/docs/vllm-omni/index.md
+++ b/docs/vllm-omni/index.md
@@ -238,6 +238,11 @@ MP4 — is written verbatim to the S3 output path).
 - **`usage.completion_tokens` is reported as `0` for omni-chat models.** The `/v1/chat/completions` SSE stream emits `usage.completion_tokens=0` in
   the terminal block, even when audio and text were generated. Use the per-chunk `metrics.num_tokens_out` field for an accurate engine-side token
   count (see upstream `vllm_omni/benchmarks/patch/patch.py`).
+- **Voice-clone TTS (Qwen3-TTS-Base) is slower in 0.20.0 than 0.18.0 due to an upstream Code2Wav decode-chunk un-batching regression**
+  ([vllm-omni#3203](https://github.com/vllm-project/vllm-omni/pull/3203)). Observed on `g6.xlarge` with `qwen3-tts-12hz-1.7b-base`, concurrency 4, 20
+  prompts: requests/s **0.4 → 0.281**, audio RTF multiplier **1.6 → 1.109**, p95 E2E **11s → 15.9s**. TTS quality is unchanged. The fix is merged
+  upstream as [vllm-omni#3485](https://github.com/vllm-project/vllm-omni/pull/3485) post-0.20.0 and will land in the next omni point release.
+  Preset-voice TTS (Qwen3-TTS-CustomVoice) is unaffected.
 - **CosyVoice3 requires `--trust-remote-code` and ~32 GB host RAM during model load.** A 16 GB host can SIGKILL the process during HuggingFace cache
   hydration. Prefer `g6e.xlarge` or larger for both EC2 and SageMaker instance types.
 - **Stable-Audio-Open output is capped at ~47 seconds per request** by the model itself. For longer clips, run multiple requests with adjusted

From 8121da0e90519578cace58820af34cddd6c56342 Mon Sep 17 00:00:00 2001
From: Yadan Wei <yadanwei@amazon.com>
Date: Tue, 12 May 2026 15:51:22 -0700
Subject: [PATCH 03/10] docs(vllm-omni): pin 0.18.0 to immutable v1.0 tag,
 leave v1 floating on 0.20.0

The `omni-cuda-v1` and `omni-sagemaker-cuda-v1` tags are now reused for
0.20.0 (per the image config files in main). Switch the 0.18.0 docs to
the immutable `omni-cuda-v1.0` / `omni-sagemaker-cuda-v1.0` tags so users
who want to reproduce the 0.18.0 image have a frozen URI; `v1` continues
to float to the latest release in the v1 line (0.20.0 today).

The v1.0 tags already exist in both 763... and public.ecr.aws.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

Signed-off-by: Yadan Wei <yadanwei@amazon.com>
---
 docs/src/data/vllm-omni/0.18.0-gpu-ec2.yml       | 2 +-
 docs/src/data/vllm-omni/0.18.0-gpu-sagemaker.yml | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/src/data/vllm-omni/0.18.0-gpu-ec2.yml b/docs/src/data/vllm-omni/0.18.0-gpu-ec2.yml
index a6bc7ec8b859..696881d9ad22 100644
--- a/docs/src/data/vllm-omni/0.18.0-gpu-ec2.yml
+++ b/docs/src/data/vllm-omni/0.18.0-gpu-ec2.yml
@@ -9,7 +9,7 @@ platform: default
 public_registry: true
 
 tags:
-  - "omni-cuda-v1"
+  - "omni-cuda-v1.0"
 
 announcements:
   - "Initial release of vLLM-Omni containers for EC2, ECS, EKS"
diff --git a/docs/src/data/vllm-omni/0.18.0-gpu-sagemaker.yml b/docs/src/data/vllm-omni/0.18.0-gpu-sagemaker.yml
index bb61f8a78299..9953790bf81f 100644
--- a/docs/src/data/vllm-omni/0.18.0-gpu-sagemaker.yml
+++ b/docs/src/data/vllm-omni/0.18.0-gpu-sagemaker.yml
@@ -9,7 +9,7 @@ platform: sagemaker
 public_registry: true
 
 tags:
-  - "omni-sagemaker-cuda-v1"
+  - "omni-sagemaker-cuda-v1.0"
 
 announcements:
   - "Initial release of vLLM-Omni containers for SageMaker"

From a566e9a33210cebc4a86b948437bac36db8f71db Mon Sep 17 00:00:00 2001
From: Yadan Wei <yadanwei@amazon.com>
Date: Wed, 13 May 2026 13:47:03 -0700
Subject: [PATCH 04/10] docs(vllm-omni): tag versioning convention, sync-video
 example fix, What's New entries
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Tag versioning (DLC-level)
--------------------------
Document the two-tier tag convention so customers can choose the right
trade-off between freshness and stability:

- omni-cuda-v1 / omni-sagemaker-cuda-v1 — float across DLC minor + patch
  (auto-upgrade on docker pull). Best for dev, quick-starts.
- omni-cuda-v1.1 / omni-sagemaker-cuda-v1.1 — float across DLC patches only
  in the v1.1 line (auto-accept security fixes, decline new minor releases).
  Recommended for production.
- @sha256:<digest> — escape hatch for byte-identical reproducibility.

The semantic versioning tier is at the DLC level (v1, v1.1, v1.1.x), not
the bundled vllm-omni upstream version (which can advance independently
of DLC patches). Customers pinned to v1.0 would have been insulated from
the Code2Wav un-batching regression that landed with the DLC v1.1 minor
bump until they were ready to evaluate it.

Reflected in:
- docs/src/data/vllm-omni/0.20.0-gpu-{ec2,sagemaker}.yml — list both v1 and
  v1.1 tags with comments explaining the floating semantics
- docs/vllm-omni/index.md — new Versioning and Tags section + expanded
  Pull Commands showing both tiers + digest pin

Sync-video SageMaker example fix
--------------------------------
The previous example used real-time invoke_endpoint, which has a hard
60-second timeout. First-request latency on Wan2.1-VACE-1.3B includes
model load + torch.compile warmup (3-4 min), so the example would always
fail on first invoke.

Rewrote to mirror the pattern proven by test_vllm_omni_video_async_endpoint
(last green 2026-05-11):
- AsyncInferenceConfig with output_path + max_concurrent_invocations=1
- s3.put_object to upload the request payload
- invoke_endpoint_async with InputLocation + CustomAttributes
- Poll the .out object for raw MP4 bytes
- Form-data values as strings (the middleware converts JSON to
  multipart/form-data; numeric values must be JSON strings)
- Wan2.1-VACE-1.3B-diffusers + ml.g5.2xlarge (validated combination)

End-to-end validated 2026-05-13 in account 897880167187:
endpoint deployed, async invoke succeeded, 45 KB MP4 returned with
Content-Type video/mp4 (valid ISO Media MP4 header), endpoint cleaned
up after.

docs/vllm-omni/index.md prose updated to recommend async inference as
the default for video on SageMaker (it's required, not optional, given
the warmup time).

What's New entries
------------------
README.md (which generates docs/index.md): two new vLLM-Omni entries
under Release Highlights:
- 2026/05/13 vLLM-Omni v0.20.0
- 2026/04/24 vLLM-Omni v0.18.0 (initial release)

The 0.20.0 entry references the floating tags (omni-cuda-v1 /
omni-sagemaker-cuda-v1); the 0.18.0 entry references the v1.0 tags.

Removed
-------
The "usage.completion_tokens=0 for omni-chat models" Known Limitations
item — internal benchmark-tooling concern, not user-facing.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

Signed-off-by: Yadan Wei <yadanwei@amazon.com>
---
 README.md                                     |  2 +
 docs/src/data/vllm-omni/0.20.0-gpu-ec2.yml    |  3 +-
 .../data/vllm-omni/0.20.0-gpu-sagemaker.yml   |  3 +-
 docs/vllm-omni/index.md                       | 72 +++++++++++++---
 .../vllm-omni/sagemaker/deploy_video_sync.py  | 83 ++++++++++++++-----
 5 files changed, 129 insertions(+), 34 deletions(-)

diff --git a/README.md b/README.md
index 8f9bae83c8d7..ea9601d493c7 100644
--- a/README.md
+++ b/README.md
@@ -28,11 +28,13 @@ ______________________________________________________________________
 
 ### 🚀 Release Highlights
 
+- **[2026/05/13]** [vLLM-Omni v0.20.0](https://gallery.ecr.aws/deep-learning-containers/vllm) — EC2: `omni-cuda-v1` · SageMaker: `omni-sagemaker-cuda-v1` · Adds `/v1/audio/generate` (stable-audio-open) and `/v1/videos/sync` (unblocks video on SageMaker); supports CosyVoice3, ERNIE-Image-Turbo, Wan2.1-VACE-1.3B; CUDA 13.0 + PyTorch 2.11.0.
 - **[2026/05/11]** [vLLM v0.20.2](https://gallery.ecr.aws/deep-learning-containers/vllm) — EC2: `0.20.2-gpu-py312-ec2` · SageMaker: `0.20.2-gpu-py312` · Bug fixes for DeepSeek V4.
 - **[2026/05/06]** [SGLang v0.5.11](https://gallery.ecr.aws/deep-learning-containers/sglang) — EC2: `0.5.11-gpu-py312-ec2` · SageMaker: `0.5.11-gpu-py312` · Model support for Gemma 4, GLM-5.1, Qwen 3.4, and more
 - **[2026/05/05]** [vLLM v0.20.1](https://gallery.ecr.aws/deep-learning-containers/vllm) — EC2: `0.20.1-gpu-py312-ec2` · SageMaker: `0.20.1-gpu-py312` · Bug fixes for DeepSeek V4.
 - **[2026/04/30]** [PyTorch v2.11.0](https://gallery.ecr.aws/deep-learning-containers/pytorch) — EC2: `2.11.0-cu130-amzn2023` · SageMaker: `2.11.0-cu130-amzn2023-sagemaker` · Amazon Linux 2023 with EFA, flash-attn, and transformer-engine.
 - **[2026/04/28]** [vLLM v0.20.0](https://gallery.ecr.aws/deep-learning-containers/vllm) — EC2: `0.20.0-gpu-py312-ec2` · SageMaker: `0.20.0-gpu-py312` · Introduces support for DeepSeek V4.
+- **[2026/04/24]** [vLLM-Omni v0.18.0](https://gallery.ecr.aws/deep-learning-containers/vllm) — EC2: `omni-cuda-v1.0` · SageMaker: `omni-sagemaker-cuda-v1.0` · Initial release. Serves omni-modality models (TTS, image, video, multimodal chat) through OpenAI-compatible APIs; SageMaker routing middleware via `CustomAttributes`.
 - **[2026/04/20]** [vLLM v0.19.1](https://gallery.ecr.aws/deep-learning-containers/vllm) — EC2: `0.19-gpu-py312-ec2` · SageMaker: `0.19-gpu-py312` · This upgrades Transformers to 5.5.4, enabling Gemma 4 support.
 
 ### 📢 Support Updates
diff --git a/docs/src/data/vllm-omni/0.20.0-gpu-ec2.yml b/docs/src/data/vllm-omni/0.20.0-gpu-ec2.yml
index 6daab83318ba..ab829dddb3b1 100644
--- a/docs/src/data/vllm-omni/0.20.0-gpu-ec2.yml
+++ b/docs/src/data/vllm-omni/0.20.0-gpu-ec2.yml
@@ -9,7 +9,8 @@ platform: default
 public_registry: true
 
 tags:
-  - "omni-cuda-v1"
+  - "omni-cuda-v1"      # floats across DLC minor versions (currently → DLC v1.1, vllm-omni 0.20.0)
+  - "omni-cuda-v1.1"    # floats across DLC patches in the v1.1 line (auto-accepts security patches)
 
 announcements:
   - "Bumps vLLM-Omni to 0.20.0 and aligns with upstream vLLM v0.20.0"
diff --git a/docs/src/data/vllm-omni/0.20.0-gpu-sagemaker.yml b/docs/src/data/vllm-omni/0.20.0-gpu-sagemaker.yml
index c261b5f24dd9..3b91ae670c9d 100644
--- a/docs/src/data/vllm-omni/0.20.0-gpu-sagemaker.yml
+++ b/docs/src/data/vllm-omni/0.20.0-gpu-sagemaker.yml
@@ -9,7 +9,8 @@ platform: sagemaker
 public_registry: true
 
 tags:
-  - "omni-sagemaker-cuda-v1"
+  - "omni-sagemaker-cuda-v1"      # floats across DLC minor versions (currently → DLC v1.1, vllm-omni 0.20.0)
+  - "omni-sagemaker-cuda-v1.1"    # floats across DLC patches in the v1.1 line (auto-accepts security patches)
 
 announcements:
   - "Bumps vLLM-Omni to 0.20.0 and aligns with upstream vLLM v0.20.0"
diff --git a/docs/vllm-omni/index.md b/docs/vllm-omni/index.md
index 11c092284f94..677ca43725b3 100644
--- a/docs/vllm-omni/index.md
+++ b/docs/vllm-omni/index.md
@@ -15,20 +15,66 @@ SageMaker routing middleware for dispatching `/invocations` to any omni endpoint
 
 ## Pull Commands
 
-**EC2:**
+**EC2** — latest supported (floats across DLC minor versions; auto-upgrades on next pull):
 
 ```bash
 docker pull {{ images.latest_vllm_omni_ec2 }}
 ```
 
-**SageMaker:**
+**EC2** — patch-stable (recommended for production; auto-accepts DLC security patches in the v1.1 line, declines new DLC minor releases):
+
+```bash
+docker pull 763104351884.dkr.ecr.us-west-2.amazonaws.com/vllm:omni-cuda-v1.1
+```
+
+**SageMaker** — latest supported:
 
 ```bash
 docker pull {{ images.latest_vllm_omni_sagemaker }}
 ```
 
-See [Available Images](../reference/available_images.md) for all image URIs and [Getting Started](../get_started/index.md) for authentication
-instructions.
+**SageMaker** — patch-stable:
+
+```bash
+docker pull 763104351884.dkr.ecr.us-west-2.amazonaws.com/vllm:omni-sagemaker-cuda-v1.1
+```
+
+See [Available Images](../reference/available_images.md) for all image URIs, [Versioning and Tags](#versioning-and-tags) below for the convention, and
+[Getting Started](../get_started/index.md) for authentication instructions.
+
+## Versioning and Tags
+
+vLLM-Omni image tags follow a **DLC-level** semantic versioning convention (independent of the bundled vllm-omni upstream version):
+
+- **DLC major (`v1`, `v2`, …)** — incompatible/breaking changes in the DLC itself: image API, entrypoint, removed routes, pinned framework majors.
+  Customer code may need updating when the DLC major bumps.
+- **DLC minor (`v1.0`, `v1.1`, …)** — DLC release tracking new upstream vllm-omni features (e.g., a new endpoint), still API-compatible at the DLC
+  level. May introduce behavioral changes in the bundled engine.
+- **DLC patch** — security patches and bug fixes layered on top of an existing release without bumping the bundled vllm-omni version. Same tag, new
+  image digest.
+
+Two tag tiers, both floating, are exposed to customers:
+
+- **Minor-floating tags** (`omni-cuda-v1`, `omni-sagemaker-cuda-v1`) — track the latest DLC release within a major line. Auto-upgrade across DLC minor
+  *and* patch updates on `docker pull`. Best for development, quick-starts, and "give me whatever is supported right now".
+- **Patch-floating tags** (`omni-cuda-v1.1`, `omni-sagemaker-cuda-v1.1`) — follow only the DLC patch stream within one minor release. They auto-accept
+  security patches and bug fixes, but decline new DLC minor releases that could change behavior. Recommended for production: customers pinned here
+  would have been insulated from the Code2Wav un-batching regression that landed with the DLC `v1.1` minor bump (see
+  [Known Limitations](#known-limitations) below) until they were ready to evaluate it.
+
+If your workload requires byte-identical reproducibility — i.e., declining even DLC patches — pull by digest instead of tag:
+
+```bash
+docker pull 763104351884.dkr.ecr.us-west-2.amazonaws.com/vllm@sha256:<digest>
+```
+
+`docker inspect <image>` or `docker pull` output prints the digest of the image you currently have. Pulls by digest never change.
+
+| Tag | Tracks | Currently points at |
+| --- | --- | --- |
+| `omni-cuda-v1` / `omni-sagemaker-cuda-v1` | latest DLC release in v1 line (minor + patch) | DLC `v1.1` (vllm-omni 0.20.0) |
+| `omni-cuda-v1.0` / `omni-sagemaker-cuda-v1.0` | DLC v1.0 patch stream (vllm-omni 0.18.0 + DLC patches) | latest v1.0.x DLC patch |
+| `omni-cuda-v1.1` / `omni-sagemaker-cuda-v1.1` | DLC v1.1 patch stream (vllm-omni 0.20.0 + DLC patches) | latest v1.1.x DLC patch |
 
 ## Packages
 
@@ -168,7 +214,7 @@ header:
 | `route=/v1/audio/generate` | Audio generation (new in 0.20.0) |
 | `route=/v1/images/generations` | Image generation |
 | `route=/v1/videos` | Video generation, async (JSON auto-converted to form-data) — returns job-ID only; MP4 not retrievable via SageMaker. Prefer `/v1/videos/sync` below. |
-| `route=/v1/videos/sync` | Video generation, sync (new in 0.20.0) — blocks and returns raw MP4 bytes; works through SageMaker real-time endpoints |
+| `route=/v1/videos/sync` | Video generation, sync (new in 0.20.0) — blocks server-side and returns raw MP4 bytes; deploy behind SageMaker async inference (first-request `torch.compile` warmup exceeds the 60s real-time invoke timeout) |
 | `route=/v1/chat/completions` | Multimodal chat |
 | *(no route)* | vLLM default `/invocations` (chat/completion/embed) |
 
@@ -217,16 +263,19 @@ additional retrieval step required.
 ### Deploy a Video Endpoint
 
 The `/v1/videos/sync` endpoint (new in 0.20.0) is the supported path for video on SageMaker. Unlike the async `/v1/videos` route — which writes a
-job-ID JSON to S3 but never the MP4 — `/v1/videos/sync` blocks until generation completes and returns raw MP4 bytes that SageMaker hands back to the
-client directly.
+job-ID JSON to S3 but never the MP4 — `/v1/videos/sync` blocks server-side until generation completes and writes the raw MP4 bytes to the configured
+S3 output path.
+
+Deploy behind **SageMaker async inference** (`AsyncInferenceConfig`), not real-time inference: first-request latency on video models is dominated by
+model load + `torch.compile` warmup (3–4 minutes for Wan2.1-VACE-1.3B), which exceeds the 60-second real-time invoke timeout. Async inference allows
+up to 1 hour and writes the response body verbatim to S3, so the `.out` object *is* the MP4 — no polling on a job ID.
 
 ```python
 --8<-- "examples/vllm-omni/sagemaker/deploy_video_sync.py"
 ```
 
-Sync video generation can take 30–120 seconds depending on `num_inference_steps` and `num_frames`. If a request approaches the 60s real-time invoke
-timeout, either reduce `num_inference_steps` or use `invoke_endpoint_async` (its 60-minute ceiling accommodates long jobs, and the response body — the
-MP4 — is written verbatim to the S3 output path).
+Validated 2026-05-11 on `ml.g5.2xlarge` (A10G 24 GB VRAM, 32 GB host RAM): 45 KB MP4 in ~10s after warmup. Reduce `num_inference_steps` and
+`num_frames` to stay under the async ceiling for warm requests.
 
 ## Known Limitations
 
@@ -235,9 +284,6 @@ MP4 — is written verbatim to the S3 output path).
   [Deploy a Video Endpoint](#deploy-a-video-endpoint)) or stay on EC2 for the async workflow with status polling.
 - **First-request latency on SageMaker real-time endpoints.** TTS, audio-generate, and video models can exceed the 60s invoke timeout on the first
   request due to `torch.compile` warmup. Use async inference or retry after warmup.
-- **`usage.completion_tokens` is reported as `0` for omni-chat models.** The `/v1/chat/completions` SSE stream emits `usage.completion_tokens=0` in
-  the terminal block, even when audio and text were generated. Use the per-chunk `metrics.num_tokens_out` field for an accurate engine-side token
-  count (see upstream `vllm_omni/benchmarks/patch/patch.py`).
 - **Voice-clone TTS (Qwen3-TTS-Base) is slower in 0.20.0 than 0.18.0 due to an upstream Code2Wav decode-chunk un-batching regression**
   ([vllm-omni#3203](https://github.com/vllm-project/vllm-omni/pull/3203)). Observed on `g6.xlarge` with `qwen3-tts-12hz-1.7b-base`, concurrency 4, 20
   prompts: requests/s **0.4 → 0.281**, audio RTF multiplier **1.6 → 1.109**, p95 E2E **11s → 15.9s**. TTS quality is unchanged. The fix is merged
diff --git a/examples/vllm-omni/sagemaker/deploy_video_sync.py b/examples/vllm-omni/sagemaker/deploy_video_sync.py
index 4cf64622d480..0717099aebbd 100644
--- a/examples/vllm-omni/sagemaker/deploy_video_sync.py
+++ b/examples/vllm-omni/sagemaker/deploy_video_sync.py
@@ -1,50 +1,95 @@
-"""Deploy a vLLM-Omni video model to a SageMaker real-time endpoint and invoke
-the new /v1/videos/sync endpoint, which blocks until generation completes and
-returns raw MP4 bytes.
+"""Deploy a vLLM-Omni video model to a SageMaker async inference endpoint
+and invoke the new /v1/videos/sync endpoint, which blocks server-side until
+generation completes and returns raw MP4 bytes.
+
+Async inference is required for video — first-request latency includes model
+load + torch.compile warmup (3-4 min for Wan2.1-VACE-1.3B), well past the
+60s real-time invoke timeout. Async inference allows up to 1 hour and
+deposits the response body verbatim at the configured S3 output path, so the
+.out object is the raw MP4.
 
 Available since vLLM-Omni 0.20.0; supersedes the 0.18.0 limitation that
 SageMaker async inference could only retrieve the job-ID JSON, not the MP4.
+The routing middleware (`CustomAttributes="route=/v1/videos/sync"`) auto-
+converts the JSON request body to multipart/form-data for the underlying
+endpoint; values must therefore be JSON strings.
 
-Use the routing middleware via `CustomAttributes="route=/v1/videos/sync"`,
-which auto-converts JSON request bodies to multipart/form-data for the
-underlying endpoint.
+Validated 2026-05-11 on ml.g5.2xlarge (A10G 24 GB VRAM, 32 GB host RAM):
+45 KB MP4 returned in ~10s after warmup.
 """
 
+import time
+
 import boto3
+from sagemaker.async_inference import AsyncInferenceConfig
 from sagemaker.model import Model
 from sagemaker.predictor import Predictor
 from sagemaker.serializers import JSONSerializer
 
+BUCKET = "<BUCKET>"  # replace with an S3 bucket your role can read/write
+ROLE_ARN = "arn:aws:iam::<ACCOUNT>:role/SageMakerExecutionRole"
+ENDPOINT_NAME = "vllm-omni-video-sync"
+
 model = Model(
     image_uri="763104351884.dkr.ecr.us-west-2.amazonaws.com/vllm:omni-sagemaker-cuda-v1",
-    role="arn:aws:iam::<ACCOUNT>:role/SageMakerExecutionRole",
-    env={"SM_VLLM_MODEL": "Wan-AI/Wan2.1-T2V-1.3B-Diffusers"},
+    role=ROLE_ARN,
+    env={"SM_VLLM_MODEL": "Wan-AI/Wan2.1-VACE-1.3B-diffusers"},
     predictor_cls=Predictor,
 )
 
 predictor = model.deploy(
     instance_type="ml.g5.2xlarge",
     initial_instance_count=1,
-    endpoint_name="vllm-omni-video-sync",
+    endpoint_name=ENDPOINT_NAME,
     inference_ami_version="al2-ami-sagemaker-inference-gpu-3-1",
     serializer=JSONSerializer(),
+    async_inference_config=AsyncInferenceConfig(
+        output_path=f"s3://{BUCKET}/vllm-omni-async-output/",
+        max_concurrent_invocations_per_instance=1,
+    ),
     wait=True,
 )
 
-# Invoke /v1/videos/sync via CustomAttributes; response body is the MP4 bytes
-# (Content-Type: video/mp4). Prefer invoke_endpoint over invoke_endpoint_async
-# because sync video can take 30–120s and the real-time path's binary response
-# is what we want — async would write base64-encoded JSON to S3.
+# Upload the input payload to S3, then call invoke_endpoint_async with
+# CustomAttributes routing to /v1/videos/sync. Values are strings because
+# the middleware converts JSON to multipart/form-data.
+s3 = boto3.client("s3")
+s3.put_object(
+    Bucket=BUCKET,
+    Key="vllm-omni-async-input/request.json",
+    Body=(
+        '{"prompt": "a dog running on a beach", '
+        '"num_frames": "17", "num_inference_steps": "4", '
+        '"size": "480x320", "seed": "42"}'
+    ),
+    ContentType="application/json",
+)
+
 runtime = boto3.client("sagemaker-runtime")
-response = runtime.invoke_endpoint(
-    EndpointName="vllm-omni-video-sync",
-    Body='{"prompt": "a dog running on a beach", "num_frames": 17, '
-    '"num_inference_steps": 30, "size": "480x320", "seed": 42}',
+result = runtime.invoke_endpoint_async(
+    EndpointName=ENDPOINT_NAME,
+    InputLocation=f"s3://{BUCKET}/vllm-omni-async-input/request.json",
     ContentType="application/json",
     CustomAttributes="route=/v1/videos/sync",
 )
-with open("video.mp4", "wb") as f:
-    f.write(response["Body"].read())
+output_location = result["OutputLocation"]  # s3://.../<id>.out
+print(f"Output will be written to {output_location}")
+
+# Poll for the .out object (raw MP4 bytes). First request takes ~3-4 min
+# due to model load + torch.compile; warm requests are ~3-10s.
+bucket = output_location.split("/", 3)[2]
+key = output_location.split("/", 3)[3]
+for _ in range(120):  # 10 min timeout
+    try:
+        obj = s3.get_object(Bucket=bucket, Key=key)
+        with open("video.mp4", "wb") as f:
+            f.write(obj["Body"].read())
+        print(f"wrote video.mp4 (Content-Type: {obj.get('ContentType', '?')})")
+        break
+    except s3.exceptions.NoSuchKey:
+        time.sleep(5)
+else:
+    raise RuntimeError("timed out waiting for async output")
 
 # When done:
 # predictor.delete_endpoint()

From 25b7b2274bb141cce0db63adc4c58f22ed166aa0 Mon Sep 17 00:00:00 2001
From: Yadan Wei <yadanwei@amazon.com>
Date: Wed, 13 May 2026 13:52:17 -0700
Subject: [PATCH 05/10] docs(vllm-omni): show v1.1 in available_images table
 for 0.20.0 (patch-floating)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reorder the 0.20.0 yamls so omni-cuda-v1.1 / omni-sagemaker-cuda-v1.1
come first; the docs generator uses tags[0] for the per-row table cell
in available_images.md.

Before: 0.18.0 row showed `omni-cuda-v1.0`, 0.20.0 row showed
`omni-cuda-v1` — inconsistent (one patch-floating, one minor-floating).
After: both rows show their patch-floating tag, which uniquely identifies
the release line and won't drift when the minor-floating v1 advances to
a future image.

Also bumps the README.md "What's New" entry for v0.20.0 to reference
omni-cuda-v1.1 / omni-sagemaker-cuda-v1.1 for the same durability reason.
Release-notes pages still print all four URIs (private + public ECR ×
both tag tiers).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

Signed-off-by: Yadan Wei <yadanwei@amazon.com>
---
 README.md                                        | 2 +-
 docs/src/data/vllm-omni/0.20.0-gpu-ec2.yml       | 5 ++++-
 docs/src/data/vllm-omni/0.20.0-gpu-sagemaker.yml | 5 ++++-
 3 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index ea9601d493c7..4d7579cdecb6 100644
--- a/README.md
+++ b/README.md
@@ -28,7 +28,7 @@ ______________________________________________________________________
 
 ### 🚀 Release Highlights
 
-- **[2026/05/13]** [vLLM-Omni v0.20.0](https://gallery.ecr.aws/deep-learning-containers/vllm) — EC2: `omni-cuda-v1` · SageMaker: `omni-sagemaker-cuda-v1` · Adds `/v1/audio/generate` (stable-audio-open) and `/v1/videos/sync` (unblocks video on SageMaker); supports CosyVoice3, ERNIE-Image-Turbo, Wan2.1-VACE-1.3B; CUDA 13.0 + PyTorch 2.11.0.
+- **[2026/05/13]** [vLLM-Omni v0.20.0](https://gallery.ecr.aws/deep-learning-containers/vllm) — EC2: `omni-cuda-v1.1` · SageMaker: `omni-sagemaker-cuda-v1.1` · Adds `/v1/audio/generate` (stable-audio-open) and `/v1/videos/sync` (unblocks video on SageMaker); supports CosyVoice3, ERNIE-Image-Turbo, Wan2.1-VACE-1.3B; CUDA 13.0 + PyTorch 2.11.0.
 - **[2026/05/11]** [vLLM v0.20.2](https://gallery.ecr.aws/deep-learning-containers/vllm) — EC2: `0.20.2-gpu-py312-ec2` · SageMaker: `0.20.2-gpu-py312` · Bug fixes for DeepSeek V4.
 - **[2026/05/06]** [SGLang v0.5.11](https://gallery.ecr.aws/deep-learning-containers/sglang) — EC2: `0.5.11-gpu-py312-ec2` · SageMaker: `0.5.11-gpu-py312` · Model support for Gemma 4, GLM-5.1, Qwen 3.4, and more
 - **[2026/05/05]** [vLLM v0.20.1](https://gallery.ecr.aws/deep-learning-containers/vllm) — EC2: `0.20.1-gpu-py312-ec2` · SageMaker: `0.20.1-gpu-py312` · Bug fixes for DeepSeek V4.
diff --git a/docs/src/data/vllm-omni/0.20.0-gpu-ec2.yml b/docs/src/data/vllm-omni/0.20.0-gpu-ec2.yml
index ab829dddb3b1..f12f04fc1f0c 100644
--- a/docs/src/data/vllm-omni/0.20.0-gpu-ec2.yml
+++ b/docs/src/data/vllm-omni/0.20.0-gpu-ec2.yml
@@ -9,8 +9,11 @@ platform: default
 public_registry: true
 
 tags:
-  - "omni-cuda-v1"      # floats across DLC minor versions (currently → DLC v1.1, vllm-omni 0.20.0)
+  # v1.1 is listed first so the available_images.md table renders the
+  # patch-floating tag — uniquely identifies "the 0.20.x line" even after
+  # the minor-floating v1 advances to a future release.
   - "omni-cuda-v1.1"    # floats across DLC patches in the v1.1 line (auto-accepts security patches)
+  - "omni-cuda-v1"      # floats across DLC minor versions (currently → DLC v1.1, vllm-omni 0.20.0)
 
 announcements:
   - "Bumps vLLM-Omni to 0.20.0 and aligns with upstream vLLM v0.20.0"
diff --git a/docs/src/data/vllm-omni/0.20.0-gpu-sagemaker.yml b/docs/src/data/vllm-omni/0.20.0-gpu-sagemaker.yml
index 3b91ae670c9d..f884ae7cbc60 100644
--- a/docs/src/data/vllm-omni/0.20.0-gpu-sagemaker.yml
+++ b/docs/src/data/vllm-omni/0.20.0-gpu-sagemaker.yml
@@ -9,8 +9,11 @@ platform: sagemaker
 public_registry: true
 
 tags:
-  - "omni-sagemaker-cuda-v1"      # floats across DLC minor versions (currently → DLC v1.1, vllm-omni 0.20.0)
+  # v1.1 is listed first so the available_images.md table renders the
+  # patch-floating tag — uniquely identifies "the 0.20.x line" even after
+  # the minor-floating v1 advances to a future release.
   - "omni-sagemaker-cuda-v1.1"    # floats across DLC patches in the v1.1 line (auto-accepts security patches)
+  - "omni-sagemaker-cuda-v1"      # floats across DLC minor versions (currently → DLC v1.1, vllm-omni 0.20.0)
 
 announcements:
   - "Bumps vLLM-Omni to 0.20.0 and aligns with upstream vLLM v0.20.0"

From 01dd04c47bc148afa1e5afd463c79552af3fa131 Mon Sep 17 00:00:00 2001
From: Yadan Wei <yadanwei@amazon.com>
Date: Wed, 13 May 2026 13:55:38 -0700
Subject: [PATCH 06/10] =?UTF-8?q?docs(vllm-omni):=20revert=20tag=20reorder?=
 =?UTF-8?q?=20=E2=80=94=20`tags[0]`=20feeds=20the=20latest=5F*=5Fec2=20mac?=
 =?UTF-8?q?ro?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The previous commit (25b7b227) reordered tags to put `omni-cuda-v1.1`
first, intending to fix a perceived asymmetry in available_images.md
(0.18.0 → v1.0, 0.20.0 → v1). That broke the Pull Commands section:
both pull commands ended up pointing at v1.1, defeating the two-tier
story.

Root cause: docs/src/macros.py uses `latest.display_tag` (which returns
`tags[0]`) to render `{{ images.latest_vllm_omni_ec2 }}`. That macro is
the "latest supported" pull command in docs/vllm-omni/index.md.

The original asymmetry was actually the convention working as intended:
- 0.20.0 is the current floating-v1 release, so its yaml lists v1 first
- 0.18.0 is no longer the floating-v1 target, so its yaml only lists v1.0

The maintenance pattern when a new release ships: remove v1 from the
*previous* release's yaml. The 0.18.0 yaml already reflects this.

Restore tags[0] = "omni-cuda-v1" on the 0.20.0 yamls and the README
What's New entry; add a comment in each yaml documenting the convention
so the next maintainer doesn't make the same mistake.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

Signed-off-by: Yadan Wei <yadanwei@amazon.com>
---
 README.md                                        |  2 +-
 docs/src/data/vllm-omni/0.20.0-gpu-ec2.yml       | 10 ++++++----
 docs/src/data/vllm-omni/0.20.0-gpu-sagemaker.yml | 10 ++++++----
 3 files changed, 13 insertions(+), 9 deletions(-)

diff --git a/README.md b/README.md
index 4d7579cdecb6..ea9601d493c7 100644
--- a/README.md
+++ b/README.md
@@ -28,7 +28,7 @@ ______________________________________________________________________
 
 ### 🚀 Release Highlights
 
-- **[2026/05/13]** [vLLM-Omni v0.20.0](https://gallery.ecr.aws/deep-learning-containers/vllm) — EC2: `omni-cuda-v1.1` · SageMaker: `omni-sagemaker-cuda-v1.1` · Adds `/v1/audio/generate` (stable-audio-open) and `/v1/videos/sync` (unblocks video on SageMaker); supports CosyVoice3, ERNIE-Image-Turbo, Wan2.1-VACE-1.3B; CUDA 13.0 + PyTorch 2.11.0.
+- **[2026/05/13]** [vLLM-Omni v0.20.0](https://gallery.ecr.aws/deep-learning-containers/vllm) — EC2: `omni-cuda-v1` · SageMaker: `omni-sagemaker-cuda-v1` · Adds `/v1/audio/generate` (stable-audio-open) and `/v1/videos/sync` (unblocks video on SageMaker); supports CosyVoice3, ERNIE-Image-Turbo, Wan2.1-VACE-1.3B; CUDA 13.0 + PyTorch 2.11.0.
 - **[2026/05/11]** [vLLM v0.20.2](https://gallery.ecr.aws/deep-learning-containers/vllm) — EC2: `0.20.2-gpu-py312-ec2` · SageMaker: `0.20.2-gpu-py312` · Bug fixes for DeepSeek V4.
 - **[2026/05/06]** [SGLang v0.5.11](https://gallery.ecr.aws/deep-learning-containers/sglang) — EC2: `0.5.11-gpu-py312-ec2` · SageMaker: `0.5.11-gpu-py312` · Model support for Gemma 4, GLM-5.1, Qwen 3.4, and more
 - **[2026/05/05]** [vLLM v0.20.1](https://gallery.ecr.aws/deep-learning-containers/vllm) — EC2: `0.20.1-gpu-py312-ec2` · SageMaker: `0.20.1-gpu-py312` · Bug fixes for DeepSeek V4.
diff --git a/docs/src/data/vllm-omni/0.20.0-gpu-ec2.yml b/docs/src/data/vllm-omni/0.20.0-gpu-ec2.yml
index f12f04fc1f0c..3789513fe954 100644
--- a/docs/src/data/vllm-omni/0.20.0-gpu-ec2.yml
+++ b/docs/src/data/vllm-omni/0.20.0-gpu-ec2.yml
@@ -9,11 +9,13 @@ platform: default
 public_registry: true
 
 tags:
-  # v1.1 is listed first so the available_images.md table renders the
-  # patch-floating tag — uniquely identifies "the 0.20.x line" even after
-  # the minor-floating v1 advances to a future release.
-  - "omni-cuda-v1.1"    # floats across DLC patches in the v1.1 line (auto-accepts security patches)
+  # v1 is listed first because tags[0] feeds the {{ images.latest_vllm_omni_* }}
+  # macros and the available_images.md "primary URL" cell. When a future DLC
+  # release ships, drop `omni-cuda-v1` from this yaml (it'll move to the new
+  # release's yaml) and keep only `omni-cuda-v1.1` here — same pattern as the
+  # 0.18.0 yaml currently uses.
   - "omni-cuda-v1"      # floats across DLC minor versions (currently → DLC v1.1, vllm-omni 0.20.0)
+  - "omni-cuda-v1.1"    # floats across DLC patches in the v1.1 line (auto-accepts security patches)
 
 announcements:
   - "Bumps vLLM-Omni to 0.20.0 and aligns with upstream vLLM v0.20.0"
diff --git a/docs/src/data/vllm-omni/0.20.0-gpu-sagemaker.yml b/docs/src/data/vllm-omni/0.20.0-gpu-sagemaker.yml
index f884ae7cbc60..f2ddcec54260 100644
--- a/docs/src/data/vllm-omni/0.20.0-gpu-sagemaker.yml
+++ b/docs/src/data/vllm-omni/0.20.0-gpu-sagemaker.yml
@@ -9,11 +9,13 @@ platform: sagemaker
 public_registry: true
 
 tags:
-  # v1.1 is listed first so the available_images.md table renders the
-  # patch-floating tag — uniquely identifies "the 0.20.x line" even after
-  # the minor-floating v1 advances to a future release.
-  - "omni-sagemaker-cuda-v1.1"    # floats across DLC patches in the v1.1 line (auto-accepts security patches)
+  # v1 is listed first because tags[0] feeds the {{ images.latest_vllm_omni_* }}
+  # macros and the available_images.md "primary URL" cell. When a future DLC
+  # release ships, drop `omni-sagemaker-cuda-v1` from this yaml (it'll move to
+  # the new release's yaml) and keep only `omni-sagemaker-cuda-v1.1` here —
+  # same pattern as the 0.18.0 yaml currently uses.
   - "omni-sagemaker-cuda-v1"      # floats across DLC minor versions (currently → DLC v1.1, vllm-omni 0.20.0)
+  - "omni-sagemaker-cuda-v1.1"    # floats across DLC patches in the v1.1 line (auto-accepts security patches)
 
 announcements:
   - "Bumps vLLM-Omni to 0.20.0 and aligns with upstream vLLM v0.20.0"

From dd625d6ffa361bf343e6562e79908956c9134093 Mon Sep 17 00:00:00 2001
From: Yadan Wei <yadanwei@amazon.com>
Date: Wed, 13 May 2026 14:12:03 -0700
Subject: [PATCH 07/10] docs(vllm-omni): decouple per-release tags from
 minor-floating v1 tag
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Stop listing `omni-cuda-v1` / `omni-sagemaker-cuda-v1` in per-release
docs/src/data/vllm-omni/0.20.0-gpu-{ec2,sagemaker}.yml. Each release's
yaml now lists only its patch-floating tag (`v1.1` for 0.20.0; `v1.0`
already this way in 0.18.0).

The minor-floating `v1` tag is still documented prominently in
docs/vllm-omni/index.md (Pull Commands "latest supported" + Versioning
and Tags section), but it isn't a per-release identifier — it points at
whichever release is currently the v1-line target. Hardcoding the v1
pull URL in index.md (instead of using the
`{{ images.latest_vllm_omni_* }}` macro, which reads `tags[0]`) makes
the prose the source of truth for the floater, decoupled from
per-release yaml metadata.

Why this is better:
- available_images.md table is now self-consistent — every row shows the
  release's patch-floating tag, no asymmetry between current and previous
  releases.
- Self-correcting: when a future DLC release ships, no edits to the
  0.20.0 yaml are required to remove `v1` (since it was never there).
  The previous convention required "drop v1 from old yaml on next
  release", which is easy to forget.
- Decoupled concerns: yamls own per-release metadata, prose owns the
  floating-tag story.

Verified locally with `python docs/src/main.py && mkdocs serve`:
  - reference/available_images table: 0.20.0 → v1.1, 0.18.0 → v1.0
  - releasenotes/vllm-omni-0.20.0-*: only v1.1 URIs (no longer v1)
  - vllm-omni/index.md Pull Commands: both v1 (latest) and v1.1
    (patch-stable) tags shown for EC2 + SageMaker
  - vllm-omni/index.md Versioning section table: unchanged

README.md What's New entry for 0.20.0: bumped from v1 to v1.1 to match
the 0.18.0 entry's pattern (per-release rows always show the
patch-floating, durable identifier).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

Signed-off-by: Yadan Wei <yadanwei@amazon.com>
---
 README.md                                        |  2 +-
 docs/src/data/vllm-omni/0.20.0-gpu-ec2.yml       | 12 ++++++------
 docs/src/data/vllm-omni/0.20.0-gpu-sagemaker.yml | 12 ++++++------
 docs/vllm-omni/index.md                          |  4 ++--
 4 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/README.md b/README.md
index ea9601d493c7..4d7579cdecb6 100644
--- a/README.md
+++ b/README.md
@@ -28,7 +28,7 @@ ______________________________________________________________________
 
 ### 🚀 Release Highlights
 
-- **[2026/05/13]** [vLLM-Omni v0.20.0](https://gallery.ecr.aws/deep-learning-containers/vllm) — EC2: `omni-cuda-v1` · SageMaker: `omni-sagemaker-cuda-v1` · Adds `/v1/audio/generate` (stable-audio-open) and `/v1/videos/sync` (unblocks video on SageMaker); supports CosyVoice3, ERNIE-Image-Turbo, Wan2.1-VACE-1.3B; CUDA 13.0 + PyTorch 2.11.0.
+- **[2026/05/13]** [vLLM-Omni v0.20.0](https://gallery.ecr.aws/deep-learning-containers/vllm) — EC2: `omni-cuda-v1.1` · SageMaker: `omni-sagemaker-cuda-v1.1` · Adds `/v1/audio/generate` (stable-audio-open) and `/v1/videos/sync` (unblocks video on SageMaker); supports CosyVoice3, ERNIE-Image-Turbo, Wan2.1-VACE-1.3B; CUDA 13.0 + PyTorch 2.11.0.
 - **[2026/05/11]** [vLLM v0.20.2](https://gallery.ecr.aws/deep-learning-containers/vllm) — EC2: `0.20.2-gpu-py312-ec2` · SageMaker: `0.20.2-gpu-py312` · Bug fixes for DeepSeek V4.
 - **[2026/05/06]** [SGLang v0.5.11](https://gallery.ecr.aws/deep-learning-containers/sglang) — EC2: `0.5.11-gpu-py312-ec2` · SageMaker: `0.5.11-gpu-py312` · Model support for Gemma 4, GLM-5.1, Qwen 3.4, and more
 - **[2026/05/05]** [vLLM v0.20.1](https://gallery.ecr.aws/deep-learning-containers/vllm) — EC2: `0.20.1-gpu-py312-ec2` · SageMaker: `0.20.1-gpu-py312` · Bug fixes for DeepSeek V4.
diff --git a/docs/src/data/vllm-omni/0.20.0-gpu-ec2.yml b/docs/src/data/vllm-omni/0.20.0-gpu-ec2.yml
index 3789513fe954..2f5827d20928 100644
--- a/docs/src/data/vllm-omni/0.20.0-gpu-ec2.yml
+++ b/docs/src/data/vllm-omni/0.20.0-gpu-ec2.yml
@@ -9,12 +9,12 @@ platform: default
 public_registry: true
 
 tags:
-  # v1 is listed first because tags[0] feeds the {{ images.latest_vllm_omni_* }}
-  # macros and the available_images.md "primary URL" cell. When a future DLC
-  # release ships, drop `omni-cuda-v1` from this yaml (it'll move to the new
-  # release's yaml) and keep only `omni-cuda-v1.1` here — same pattern as the
-  # 0.18.0 yaml currently uses.
-  - "omni-cuda-v1"      # floats across DLC minor versions (currently → DLC v1.1, vllm-omni 0.20.0)
+  # Only the patch-floating tag is listed per release. The minor-floating
+  # `omni-cuda-v1` tag is documented in docs/vllm-omni/index.md (Pull Commands +
+  # Versioning and Tags) but isn't a per-release identifier — it points at
+  # whichever release is currently the v1-line target. Releases that hold
+  # only their patch-floating tag in this yaml (this convention) auto-correct
+  # when the v1 floater advances; no yaml edits needed.
   - "omni-cuda-v1.1"    # floats across DLC patches in the v1.1 line (auto-accepts security patches)
 
 announcements:
diff --git a/docs/src/data/vllm-omni/0.20.0-gpu-sagemaker.yml b/docs/src/data/vllm-omni/0.20.0-gpu-sagemaker.yml
index f2ddcec54260..7511478b14b6 100644
--- a/docs/src/data/vllm-omni/0.20.0-gpu-sagemaker.yml
+++ b/docs/src/data/vllm-omni/0.20.0-gpu-sagemaker.yml
@@ -9,12 +9,12 @@ platform: sagemaker
 public_registry: true
 
 tags:
-  # v1 is listed first because tags[0] feeds the {{ images.latest_vllm_omni_* }}
-  # macros and the available_images.md "primary URL" cell. When a future DLC
-  # release ships, drop `omni-sagemaker-cuda-v1` from this yaml (it'll move to
-  # the new release's yaml) and keep only `omni-sagemaker-cuda-v1.1` here —
-  # same pattern as the 0.18.0 yaml currently uses.
-  - "omni-sagemaker-cuda-v1"      # floats across DLC minor versions (currently → DLC v1.1, vllm-omni 0.20.0)
+  # Only the patch-floating tag is listed per release. The minor-floating
+  # `omni-sagemaker-cuda-v1` tag is documented in docs/vllm-omni/index.md
+  # (Pull Commands + Versioning and Tags) but isn't a per-release identifier —
+  # it points at whichever release is currently the v1-line target. Because
+  # each release yaml lists only its own patch-floating tag, no yaml edits
+  # are needed when the v1 floater advances to a newer release.
   - "omni-sagemaker-cuda-v1.1"    # floats across DLC patches in the v1.1 line (auto-accepts security patches)
 
 announcements:
diff --git a/docs/vllm-omni/index.md b/docs/vllm-omni/index.md
index 677ca43725b3..33d8673f3474 100644
--- a/docs/vllm-omni/index.md
+++ b/docs/vllm-omni/index.md
@@ -18,7 +18,7 @@ SageMaker routing middleware for dispatching `/invocations` to any omni endpoint
 **EC2** — latest supported (floats across DLC minor versions; auto-upgrades on next pull):
 
 ```bash
-docker pull {{ images.latest_vllm_omni_ec2 }}
+docker pull 763104351884.dkr.ecr.us-west-2.amazonaws.com/vllm:omni-cuda-v1
 ```
 
 **EC2** — patch-stable (recommended for production; auto-accepts DLC security patches in the v1.1 line, declines new DLC minor releases):
@@ -30,7 +30,7 @@ docker pull 763104351884.dkr.ecr.us-west-2.amazonaws.com/vllm:omni-cuda-v1.1
 **SageMaker** — latest supported:
 
 ```bash
-docker pull {{ images.latest_vllm_omni_sagemaker }}
+docker pull 763104351884.dkr.ecr.us-west-2.amazonaws.com/vllm:omni-sagemaker-cuda-v1
 ```
 
 **SageMaker** — patch-stable:

From e78696d016067169b0a83bef7a2a0c516eb301d0 Mon Sep 17 00:00:00 2001
From: Yadan Wei <yadanwei@amazon.com>
Date: Wed, 13 May 2026 14:44:09 -0700
Subject: [PATCH 08/10] docs(vllm-omni): adopt vLLM-server's four-tier Pin a
 Version table
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replace the bespoke two-section "Pull Commands + Versioning and Tags"
prose with the cleaner four-tier convention used by the public vLLM-server
docs. The new table maps directly to the suffix structure customers
already see in vLLM-server pull commands.

Pull Commands now show only the bare base tags (omni-cuda /
omni-sagemaker-cuda) — "give me whatever ships". The Pin a Version
section enumerates the four tiers in one table:

  | Suffix                    | Example          | Updates when                                  |
  |---------------------------|------------------|-----------------------------------------------|
  | (none)                    | omni-cuda        | Any release, including breaking changes      |
  | -v<MAJOR>                 | omni-cuda-v1     | New features and fixes, no breaking changes  |
  | -v<MAJOR>.<MINOR>         | omni-cuda-v1.1   | Security patches and bug fixes only          |
  | -v<MAJOR>.<MINOR>.<PATCH> | omni-cuda-v1.1.0 | Never — immutable snapshot                   |

Production recommendation (pin to -v<MAJOR>.<MINOR>) calls out the
Code2Wav un-batching regression as the concrete example of why
patch-stable insulates production from feature-release surprises.

Switches both Pull Commands URIs from the private 763... ECR to
public.ecr.aws/deep-learning-containers/vllm to match the vLLM-server
docs convention (private ECR is in the per-region table on
available_images.md).

Removes the now-obsolete tag-history table — Pin a Version handles the
same information through suffix semantics.

Verified locally with `python docs/src/main.py && mkdocs serve`:
  - Pull Commands: bare omni-cuda and omni-sagemaker-cuda URIs
  - Pin a Version: 4-row suffix table with examples + update semantics
  - Section order: Latest Announcements -> Pull Commands -> Pin a Version
    -> Packages -> Supported Modalities -> ...

Existing example scripts (deploy_tts.py, deploy_tts_async.py,
deploy_video_sync.py) keep their -v1 URIs unchanged — examples document
behavior validated at v1 and don't need to chase the latest tag.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
Signed-off-by: Yadan Wei <yadanwei@amazon.com>
---
 docs/vllm-omni/index.md | 63 ++++++++++++++---------------------------
 1 file changed, 21 insertions(+), 42 deletions(-)

diff --git a/docs/vllm-omni/index.md b/docs/vllm-omni/index.md
index 33d8673f3474..095ea6fc8c51 100644
--- a/docs/vllm-omni/index.md
+++ b/docs/vllm-omni/index.md
@@ -15,66 +15,45 @@ SageMaker routing middleware for dispatching `/invocations` to any omni endpoint
 
 ## Pull Commands
 
-**EC2** — latest supported (floats across DLC minor versions; auto-upgrades on next pull):
+**Multimodal (TTS, image/video/audio generation, omni chat) on EC2 / EKS:**
 
 ```bash
-docker pull 763104351884.dkr.ecr.us-west-2.amazonaws.com/vllm:omni-cuda-v1
+docker pull public.ecr.aws/deep-learning-containers/vllm:omni-cuda
 ```
 
-**EC2** — patch-stable (recommended for production; auto-accepts DLC security patches in the v1.1 line, declines new DLC minor releases):
+**Multimodal on Amazon SageMaker AI:**
 
 ```bash
-docker pull 763104351884.dkr.ecr.us-west-2.amazonaws.com/vllm:omni-cuda-v1.1
+docker pull public.ecr.aws/deep-learning-containers/vllm:omni-sagemaker-cuda
 ```
 
-**SageMaker** — latest supported:
+See [Available Images](../reference/available_images.md) for all image URIs and [Getting Started](../get_started/index.md) for authentication
+instructions.
 
-```bash
-docker pull 763104351884.dkr.ecr.us-west-2.amazonaws.com/vllm:omni-sagemaker-cuda-v1
-```
-
-**SageMaker** — patch-stable:
-
-```bash
-docker pull 763104351884.dkr.ecr.us-west-2.amazonaws.com/vllm:omni-sagemaker-cuda-v1.1
-```
+## Pin a Version
 
-See [Available Images](../reference/available_images.md) for all image URIs, [Versioning and Tags](#versioning-and-tags) below for the convention, and
-[Getting Started](../get_started/index.md) for authentication instructions.
+Append a version suffix to the base tag to control update behavior:
 
-## Versioning and Tags
-
-vLLM-Omni image tags follow a **DLC-level** semantic versioning convention (independent of the bundled vllm-omni upstream version):
-
-- **DLC major (`v1`, `v2`, …)** — incompatible/breaking changes in the DLC itself: image API, entrypoint, removed routes, pinned framework majors.
-  Customer code may need updating when the DLC major bumps.
-- **DLC minor (`v1.0`, `v1.1`, …)** — DLC release tracking new upstream vllm-omni features (e.g., a new endpoint), still API-compatible at the DLC
-  level. May introduce behavioral changes in the bundled engine.
-- **DLC patch** — security patches and bug fixes layered on top of an existing release without bumping the bundled vllm-omni version. Same tag, new
-  image digest.
+| Suffix | Example | Updates when |
+| --- | --- | --- |
+| (none) | `omni-cuda` | Any release, including breaking changes |
+| `-v<MAJOR>` | `omni-cuda-v1` | New features and fixes, no breaking changes |
+| `-v<MAJOR>.<MINOR>` | `omni-cuda-v1.1` | Security patches and bug fixes only |
+| `-v<MAJOR>.<MINOR>.<PATCH>` | `omni-cuda-v1.1.0` | Never — immutable snapshot |
 
-Two tag tiers, both floating, are exposed to customers:
+The same suffixes apply to the SageMaker base tag (`omni-sagemaker-cuda`).
 
-- **Minor-floating tags** (`omni-cuda-v1`, `omni-sagemaker-cuda-v1`) — track the latest DLC release within a major line. Auto-upgrade across DLC minor
-  *and* patch updates on `docker pull`. Best for development, quick-starts, and "give me whatever is supported right now".
-- **Patch-floating tags** (`omni-cuda-v1.1`, `omni-sagemaker-cuda-v1.1`) — follow only the DLC patch stream within one minor release. They auto-accept
-  security patches and bug fixes, but decline new DLC minor releases that could change behavior. Recommended for production: customers pinned here
-  would have been insulated from the Code2Wav un-batching regression that landed with the DLC `v1.1` minor bump (see
-  [Known Limitations](#known-limitations) below) until they were ready to evaluate it.
+**Recommended for production:** pin to `-v<MAJOR>.<MINOR>` (e.g., `omni-cuda-v1.1`). A minor-level pin auto-accepts security patches and bug fixes
+within that minor line while declining new minor releases that could change behavior — customers pinned to `omni-cuda-v1.0` would have been insulated
+from the Code2Wav un-batching regression that landed with the v1.1 minor bump (see [Known Limitations](#known-limitations) below) until they were ready to evaluate it.
 
-If your workload requires byte-identical reproducibility — i.e., declining even DLC patches — pull by digest instead of tag:
+For byte-identical reproducibility, pull by digest:
 
 ```bash
-docker pull 763104351884.dkr.ecr.us-west-2.amazonaws.com/vllm@sha256:<digest>
+docker pull public.ecr.aws/deep-learning-containers/vllm@sha256:<digest>
 ```
 
-`docker inspect <image>` or `docker pull` output prints the digest of the image you currently have. Pulls by digest never change.
-
-| Tag | Tracks | Currently points at |
-| --- | --- | --- |
-| `omni-cuda-v1` / `omni-sagemaker-cuda-v1` | latest DLC release in v1 line (minor + patch) | DLC `v1.1` (vllm-omni 0.20.0) |
-| `omni-cuda-v1.0` / `omni-sagemaker-cuda-v1.0` | DLC v1.0 patch stream (vllm-omni 0.18.0 + DLC patches) | latest v1.0.x DLC patch |
-| `omni-cuda-v1.1` / `omni-sagemaker-cuda-v1.1` | DLC v1.1 patch stream (vllm-omni 0.20.0 + DLC patches) | latest v1.1.x DLC patch |
+`docker inspect <image>` prints the digest of the image you have. Pulls by digest never change.
 
 ## Packages
 

From 7f76bf7dd90dc76d94715cd27e54b16fc5fd4137 Mon Sep 17 00:00:00 2001
From: Yadan Wei <yadanwei@amazon.com>
Date: Wed, 13 May 2026 14:46:43 -0700
Subject: [PATCH 09/10] docs(vllm-omni): add private ECR pull commands
 alongside public ECR

Pull Commands section now shows both registry options for each
deployment target:

- Public ECR (anonymous pull): public.ecr.aws/deep-learning-containers/vllm
- Private DLC ECR (authenticated): 763104351884.dkr.ecr.<region>.amazonaws.com/vllm

Customers running on AWS infrastructure (EC2/EKS/SageMaker) typically
prefer the private ECR for better network locality and IAM-controlled
access; public ECR is the right path for local development or workloads
outside AWS.

A short prologue paragraph explains the auth difference and links to
Getting Started for credentials. Per-region URI table still lives in
available_images.md.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
Signed-off-by: Yadan Wei <yadanwei@amazon.com>
---
 docs/vllm-omni/index.md | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/docs/vllm-omni/index.md b/docs/vllm-omni/index.md
index 095ea6fc8c51..d4cdbab3f83f 100644
--- a/docs/vllm-omni/index.md
+++ b/docs/vllm-omni/index.md
@@ -15,20 +15,30 @@ SageMaker routing middleware for dispatching `/invocations` to any omni endpoint
 
 ## Pull Commands
 
+Images are published to both the public ECR gallery (no AWS credentials required) and the private DLC ECR repository (requires
+`aws ecr get-login-password`, see [Getting Started](../get_started/index.md)).
+
 **Multimodal (TTS, image/video/audio generation, omni chat) on EC2 / EKS:**
 
 ```bash
+# Public ECR (anonymous pull):
 docker pull public.ecr.aws/deep-learning-containers/vllm:omni-cuda
+
+# Private ECR (authenticated; substitute your region):
+docker pull 763104351884.dkr.ecr.<region>.amazonaws.com/vllm:omni-cuda
 ```
 
 **Multimodal on Amazon SageMaker AI:**
 
 ```bash
+# Public ECR (anonymous pull):
 docker pull public.ecr.aws/deep-learning-containers/vllm:omni-sagemaker-cuda
+
+# Private ECR (authenticated; substitute your region):
+docker pull 763104351884.dkr.ecr.<region>.amazonaws.com/vllm:omni-sagemaker-cuda
 ```
 
-See [Available Images](../reference/available_images.md) for all image URIs and [Getting Started](../get_started/index.md) for authentication
-instructions.
+See [Available Images](../reference/available_images.md) for the full per-region URI table.
 
 ## Pin a Version
 

From 6f09cb5472ab8bf1dbb9e3803b3a44675086dcf0 Mon Sep 17 00:00:00 2001
From: Yadan Wei <yadanwei@amazon.com>
Date: Wed, 13 May 2026 14:50:11 -0700
Subject: [PATCH 10/10] fix(vllm-omni): correct ECR repo in EC2 example scripts
 (vllm-omni -> vllm)

All six EC2 example shell scripts hardcoded the legacy image reference
`vllm-omni:omni-cuda-v1`, but the actual ECR repo for these images is
`vllm` (since the repo unification in #6007, also reflected in the docs
generator's `ecr_repository: vllm` field and the prod_image config
`vllm:omni-cuda-v1`).

Customers running these scripts as-is would have hit a "repo does not
exist" error from `docker pull`. Fix the IMAGE default in each script:

  examples/vllm-omni/audio-generate/run.sh
  examples/vllm-omni/image/run.sh
  examples/vllm-omni/qwen2.5-omni/run.sh
  examples/vllm-omni/tts/run.sh
  examples/vllm-omni/video-sync/run.sh
  examples/vllm-omni/video/run.sh

The three SageMaker python examples (deploy_tts.py, deploy_tts_async.py,
deploy_video_sync.py) already used the correct `vllm:omni-sagemaker-cuda-v1`
repo path.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
Signed-off-by: Yadan Wei <yadanwei@amazon.com>
---
 examples/vllm-omni/audio-generate/run.sh | 2 +-
 examples/vllm-omni/image/run.sh          | 2 +-
 examples/vllm-omni/qwen2.5-omni/run.sh   | 2 +-
 examples/vllm-omni/tts/run.sh            | 2 +-
 examples/vllm-omni/video-sync/run.sh     | 2 +-
 examples/vllm-omni/video/run.sh          | 2 +-
 6 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/examples/vllm-omni/audio-generate/run.sh b/examples/vllm-omni/audio-generate/run.sh
index 9d0eb25f592e..4252e7b9cdc7 100755
--- a/examples/vllm-omni/audio-generate/run.sh
+++ b/examples/vllm-omni/audio-generate/run.sh
@@ -4,7 +4,7 @@
 # Distinct from /v1/audio/speech (which is TTS — a voice reading words).
 set -euo pipefail
 
-IMAGE="${IMAGE:-763104351884.dkr.ecr.us-west-2.amazonaws.com/vllm-omni:omni-cuda-v1}"
+IMAGE="${IMAGE:-763104351884.dkr.ecr.us-west-2.amazonaws.com/vllm:omni-cuda-v1}"
 MODEL="${MODEL:-stabilityai/stable-audio-open-1.0}"
 NAME="${NAME:-omni-audio-generate}"
 
diff --git a/examples/vllm-omni/image/run.sh b/examples/vllm-omni/image/run.sh
index d6dc1615ad95..28008fb19d3a 100755
--- a/examples/vllm-omni/image/run.sh
+++ b/examples/vllm-omni/image/run.sh
@@ -2,7 +2,7 @@
 # End-to-end image-generation example: start server, wait for ready, generate.
 set -euo pipefail
 
-IMAGE="${IMAGE:-763104351884.dkr.ecr.us-west-2.amazonaws.com/vllm-omni:omni-cuda-v1}"
+IMAGE="${IMAGE:-763104351884.dkr.ecr.us-west-2.amazonaws.com/vllm:omni-cuda-v1}"
 MODEL="${MODEL:-black-forest-labs/FLUX.2-klein-4B}"
 NAME="${NAME:-omni-image}"
 
diff --git a/examples/vllm-omni/qwen2.5-omni/run.sh b/examples/vllm-omni/qwen2.5-omni/run.sh
index a04624bdf99d..98e020cb6ff6 100755
--- a/examples/vllm-omni/qwen2.5-omni/run.sh
+++ b/examples/vllm-omni/qwen2.5-omni/run.sh
@@ -6,7 +6,7 @@
 # On single-GPU hosts the model's talker stage fails to load on GPU 1.
 set -euo pipefail
 
-IMAGE="${IMAGE:-763104351884.dkr.ecr.us-west-2.amazonaws.com/vllm-omni:omni-cuda-v1}"
+IMAGE="${IMAGE:-763104351884.dkr.ecr.us-west-2.amazonaws.com/vllm:omni-cuda-v1}"
 MODEL="${MODEL:-Qwen/Qwen2.5-Omni-3B}"
 NAME="${NAME:-omni3b}"
 
diff --git a/examples/vllm-omni/tts/run.sh b/examples/vllm-omni/tts/run.sh
index 9f4f185a2139..3af3837ed3e4 100755
--- a/examples/vllm-omni/tts/run.sh
+++ b/examples/vllm-omni/tts/run.sh
@@ -3,7 +3,7 @@
 # Requires: docker (with NVIDIA runtime), curl, an authenticated ECR pull.
 set -euo pipefail
 
-IMAGE="${IMAGE:-763104351884.dkr.ecr.us-west-2.amazonaws.com/vllm-omni:omni-cuda-v1}"
+IMAGE="${IMAGE:-763104351884.dkr.ecr.us-west-2.amazonaws.com/vllm:omni-cuda-v1}"
 MODEL="${MODEL:-Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice}"
 NAME="${NAME:-omni-tts}"
 
diff --git a/examples/vllm-omni/video-sync/run.sh b/examples/vllm-omni/video-sync/run.sh
index 2b1456724e9d..e3200a0e2a24 100755
--- a/examples/vllm-omni/video-sync/run.sh
+++ b/examples/vllm-omni/video-sync/run.sh
@@ -4,7 +4,7 @@
 # raw MP4 bytes — no job-ID polling needed, unlike async /v1/videos.
 set -euo pipefail
 
-IMAGE="${IMAGE:-763104351884.dkr.ecr.us-west-2.amazonaws.com/vllm-omni:omni-cuda-v1}"
+IMAGE="${IMAGE:-763104351884.dkr.ecr.us-west-2.amazonaws.com/vllm:omni-cuda-v1}"
 MODEL="${MODEL:-Wan-AI/Wan2.1-T2V-1.3B-Diffusers}"
 NAME="${NAME:-omni-video-sync}"
 
diff --git a/examples/vllm-omni/video/run.sh b/examples/vllm-omni/video/run.sh
index 36db972d82f3..7c716dd2e0fc 100755
--- a/examples/vllm-omni/video/run.sh
+++ b/examples/vllm-omni/video/run.sh
@@ -3,7 +3,7 @@
 # /v1/videos is async — it returns a job ID; the MP4 is produced in the background.
 set -euo pipefail
 
-IMAGE="${IMAGE:-763104351884.dkr.ecr.us-west-2.amazonaws.com/vllm-omni:omni-cuda-v1}"
+IMAGE="${IMAGE:-763104351884.dkr.ecr.us-west-2.amazonaws.com/vllm:omni-cuda-v1}"
 MODEL="${MODEL:-Wan-AI/Wan2.1-T2V-1.3B-Diffusers}"
 NAME="${NAME:-omni-video}"