From d7500edbdd7fb9a8eb5d7e467a7be122dc88a218 Mon Sep 17 00:00:00 2001 From: Yadan Wei Date: Tue, 12 May 2026 15:16:37 -0700 Subject: [PATCH 01/10] docs(vllm-omni): add 0.20.0 release notes, new endpoints, and known limitations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit New version data files (auto-feed available_images.md and release-notes pages) ------------------------------------------------------------------------------ docs/src/data/vllm-omni/0.20.0-gpu-ec2.yml docs/src/data/vllm-omni/0.20.0-gpu-sagemaker.yml Pinned package versions match upstream vllm v0.20.0 requirements/cuda.txt: PyTorch 2.11.0, torchvision 0.26.0, torchaudio 2.11.0, flashinfer 0.6.8.post1, CUDA 13.0.2. Same omni-cuda-v1 / omni-sagemaker-cuda-v1 tags are reused for the new image (both v1 tags now point at 0.20.0). docs/vllm-omni/index.md ----------------------- - May 12, 2026 announcement covering the 0.20.0 alignment, CUDA 13.0 bump, new /v1/audio/generate and /v1/videos/sync endpoints, and the four new supported models (CosyVoice3, ERNIE-Image-Turbo, Wan2.1-VACE-1.3B, Stable-Audio-Open-1.0). - Header CUDA reference 12.9 -> 13.0. - Supported Modalities table grows two rows (Audio Generation, Video sync) and the example-model lists are expanded for TTS / image / video. - New EC2 sections: Audio Generation (stable-audio-open) and Video sync. - SageMaker routing-middleware table: adds /v1/audio/generate and /v1/videos/sync rows; the existing async /v1/videos row now points at the sync route as the recommended SageMaker path. - New SageMaker section: Deploy a Video Endpoint (sync) — replaces the previous "video not supported on SageMaker" warning since that was the exact gap /v1/videos/sync closes. - Known Limitations refreshed: drops the SageMaker-video-not-supported item, keeps torch.compile warmup, adds usage.completion_tokens=0 caveat for omni-chat, CosyVoice3 host-RAM requirement, and stable-audio-open's ~47s per-request cap. 
New endpoint examples --------------------- examples/vllm-omni/audio-generate/run.sh — stable-audio-open EC2 examples/vllm-omni/video-sync/run.sh — sync video EC2 examples/vllm-omni/sagemaker/deploy_video_sync.py — sync video on SageMaker All three follow the existing examples' shape (single-shot docker run, health check, single curl/invoke, exit) so the index.md --8<-- includes work without further changes. Auto-generated release notes (docs/releasenotes/vllm-omni/0.20.0-*.md) and the available_images.md table row are emitted by docs/src/main.py from the YAMLs above; both are gitignored. Verified locally with `python docs/src/main.py && mkdocs serve`: /deep-learning-containers/vllm-omni/ (HTTP 200) /deep-learning-containers/releasenotes/vllm-omni/0.20.0-* (rendered) /deep-learning-containers/reference/available_images/ (0.20.0 row above 0.18.0) Co-Authored-By: Claude Opus 4.7 (1M context) Signed-off-by: Yadan Wei --- docs/src/data/vllm-omni/0.20.0-gpu-ec2.yml | 29 ++++++ .../data/vllm-omni/0.20.0-gpu-sagemaker.yml | 29 ++++++ docs/vllm-omni/index.md | 94 +++++++++++++++---- examples/vllm-omni/audio-generate/run.sh | 23 +++++ .../vllm-omni/sagemaker/deploy_video_sync.py | 50 ++++++++++ examples/vllm-omni/video-sync/run.sh | 25 +++++ 6 files changed, 231 insertions(+), 19 deletions(-) create mode 100644 docs/src/data/vllm-omni/0.20.0-gpu-ec2.yml create mode 100644 docs/src/data/vllm-omni/0.20.0-gpu-sagemaker.yml create mode 100755 examples/vllm-omni/audio-generate/run.sh create mode 100644 examples/vllm-omni/sagemaker/deploy_video_sync.py create mode 100755 examples/vllm-omni/video-sync/run.sh diff --git a/docs/src/data/vllm-omni/0.20.0-gpu-ec2.yml b/docs/src/data/vllm-omni/0.20.0-gpu-ec2.yml new file mode 100644 index 000000000000..6daab83318ba --- /dev/null +++ b/docs/src/data/vllm-omni/0.20.0-gpu-ec2.yml @@ -0,0 +1,29 @@ +framework: vLLM-Omni +version: "0.20.0" +ecr_repository: vllm +accelerator: gpu +python: py312 +cuda: cu130 +os: amzn2023 +platform: default 
+public_registry: true + +tags: + - "omni-cuda-v1" + +announcements: + - "Bumps vLLM-Omni to 0.20.0 and aligns with upstream vLLM v0.20.0" + - "CUDA 12.9 → 13.0 base image; PyTorch 2.10.0 → 2.11.0" + - "New `/v1/audio/generate` endpoint for diffusion-based audio generation (e.g., stable-audio-open)" + - "New `/v1/videos/sync` endpoint — blocking variant of `/v1/videos` that returns the MP4 directly" + - "Adds support for CosyVoice3, ERNIE-Image-Turbo, Wan2.1-VACE-1.3B, and Stable-Audio-Open-1.0" + +packages: + vllm: "0.20.0" + vllm-omni: "0.20.0" + pytorch: "2.11.0" + torchvision: "0.26.0" + torchaudio: "2.11.0" + cuda: "13.0.2" + flashinfer: "0.6.8.post1" + efa: "1.47.0" diff --git a/docs/src/data/vllm-omni/0.20.0-gpu-sagemaker.yml b/docs/src/data/vllm-omni/0.20.0-gpu-sagemaker.yml new file mode 100644 index 000000000000..c261b5f24dd9 --- /dev/null +++ b/docs/src/data/vllm-omni/0.20.0-gpu-sagemaker.yml @@ -0,0 +1,29 @@ +framework: vLLM-Omni +version: "0.20.0" +ecr_repository: vllm +accelerator: gpu +python: py312 +cuda: cu130 +os: amzn2023 +platform: sagemaker +public_registry: true + +tags: + - "omni-sagemaker-cuda-v1" + +announcements: + - "Bumps vLLM-Omni to 0.20.0 and aligns with upstream vLLM v0.20.0" + - "CUDA 12.9 → 13.0 base image; PyTorch 2.10.0 → 2.11.0" + - "Video generation now supported on SageMaker via the new `/v1/videos/sync` endpoint" + - "Adds `/v1/audio/generate` and `/v1/videos/sync` to the routing middleware" + - "Adds support for CosyVoice3, ERNIE-Image-Turbo, Wan2.1-VACE-1.3B, and Stable-Audio-Open-1.0" + +packages: + vllm: "0.20.0" + vllm-omni: "0.20.0" + pytorch: "2.11.0" + torchvision: "0.26.0" + torchaudio: "2.11.0" + cuda: "13.0.2" + flashinfer: "0.6.8.post1" + efa: "1.47.0" diff --git a/docs/vllm-omni/index.md b/docs/vllm-omni/index.md index ba7e5ffb4689..97271b3510d5 100644 --- a/docs/vllm-omni/index.md +++ b/docs/vllm-omni/index.md @@ -1,10 +1,15 @@ # vLLM-Omni Inference -Pre-built Docker images for serving omni-modality models 
(text-to-speech, image generation, video generation, and multimodal chat) with -[vLLM-Omni](https://github.com/vllm-project/vllm-omni). Built on Amazon Linux 2023 with CUDA 12.9 and Python 3.12. +Pre-built Docker images for serving omni-modality models (text-to-speech, audio generation, image generation, video generation, and multimodal chat) +with [vLLM-Omni](https://github.com/vllm-project/vllm-omni). Built on Amazon Linux 2023 with CUDA 13.0 and Python 3.12. ## Latest Announcements +**May 12, 2026** — vLLM-Omni 0.20.0 release. Aligns with upstream vLLM v0.20.0; bumps CUDA to 13.0 and PyTorch to 2.11.0. Adds two new endpoints: +`/v1/audio/generate` for diffusion-based audio generation (e.g., stable-audio-open) and `/v1/videos/sync` — a blocking variant of `/v1/videos` that +returns the MP4 directly and unblocks video generation on SageMaker. New supported models: CosyVoice3, ERNIE-Image-Turbo, Wan2.1-VACE-1.3B, +Stable-Audio-Open-1.0. + **April 24, 2026** — vLLM-Omni 0.18.0 initial release. Serves TTS, image, video, and omni-chat models through OpenAI-compatible APIs. Includes a SageMaker routing middleware for dispatching `/invocations` to any omni endpoint via `CustomAttributes`. 
@@ -31,11 +36,13 @@ For package versions included in each release, see the [Release Notes](../releas ## Supported Modalities -| Modality | Route | Example Model | +| Modality | Route | Example Models | | --- | --- | --- | -| Text-to-Speech | `/v1/audio/speech` | `Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice` | -| Image Generation | `/v1/images/generations` | `black-forest-labs/FLUX.2-klein-4B` | -| Video Generation | `/v1/videos` | `Wan-AI/Wan2.1-T2V-1.3B-Diffusers` | +| Text-to-Speech | `/v1/audio/speech` | `Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice`, `Qwen/Qwen3-TTS-12Hz-1.7B-Base`, `FunAudioLLM/CosyVoice3-0.5B` | +| Audio Generation | `/v1/audio/generate` (new in 0.20.0) | `stabilityai/stable-audio-open-1.0` | +| Image Generation | `/v1/images/generations` | `black-forest-labs/FLUX.2-klein-4B`, `baidu/ERNIE-Image-Turbo` | +| Video Generation (async) | `/v1/videos` | `Wan-AI/Wan2.1-T2V-1.3B-Diffusers`, `Wan-AI/Wan2.1-VACE-1.3B-Diffusers` | +| Video Generation (sync) | `/v1/videos/sync` (new in 0.20.0) | `Wan-AI/Wan2.1-T2V-1.3B-Diffusers`, `Wan-AI/Wan2.1-VACE-1.3B-Diffusers` | | Multimodal Chat | `/v1/chat/completions` | `bytedance-research/BAGEL-7B-MoT`, `Qwen/Qwen2.5-Omni-3B` | ## Model Compatibility @@ -59,15 +66,35 @@ starts the container, waits for readiness, submits a request, and writes the out **Model:** [Qwen3-TTS-12Hz-1.7B-CustomVoice](https://huggingface.co/Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice) — a 1.7B-parameter Qwen3 text-to-speech model supporting multiple voices and languages, runs on a single 24 GB GPU (A10G / L4). +For voice cloning, use [Qwen3-TTS-12Hz-1.7B-Base](https://huggingface.co/Qwen/Qwen3-TTS-12Hz-1.7B-Base) or +[CosyVoice3-0.5B](https://huggingface.co/FunAudioLLM/CosyVoice3-0.5B) — both accept a reference audio clip plus its transcript and synthesize new +speech in the reference speaker's voice. CosyVoice3 is zero-shot voice-clone only (no preset voices) and requires `--trust-remote-code`. 
+ ```bash --8<-- "examples/vllm-omni/tts/run.sh" ``` +### Audio Generation + +**Model:** [Stable-Audio-Open-1.0](https://huggingface.co/stabilityai/stable-audio-open-1.0) — a diffusion model for text-to-audio (sound effects, +ambience, short music clips), distinct from TTS. Generates up to ~47 seconds of audio per request, runs on a single 24 GB GPU. + +The `/v1/audio/generate` endpoint (new in 0.20.0) takes a text prompt plus diffusion knobs (`audio_length`, `guidance_scale`, `num_inference_steps`, +`seed`) and returns a single binary WAV blob — no streaming. See the +[upstream API spec](https://github.com/vllm-project/vllm-omni/blob/main/docs/serving/audio_generate_api.md) for the full request shape. + +```bash +--8<-- "examples/vllm-omni/audio-generate/run.sh" +``` + ### Image Generation **Model:** [FLUX.2-klein-4B](https://huggingface.co/black-forest-labs/FLUX.2-klein-4B) — a 4B-parameter rectified-flow transformer from Black Forest Labs, produces high-quality 512×512 images from text prompts, runs on a single 24 GB GPU. +[ERNIE-Image-Turbo](https://huggingface.co/baidu/ERNIE-Image-Turbo) is also supported as of 0.20.0 — an 8-step distilled DiT for fast inference with a +matching request shape. + ```bash --8<-- "examples/vllm-omni/image/run.sh" ``` @@ -76,14 +103,24 @@ Labs, produces high-quality 512×512 images from text prompts, runs on a single **Model:** [Wan2.1-T2V-1.3B](https://huggingface.co/Wan-AI/Wan2.1-T2V-1.3B-Diffusers) — a 1.3B-parameter text-to-video diffusion model from the Wan team, generates short clips at up to 480×832 resolution. Needs a 48 GB GPU (L40S) or 2× 24 GB GPUs with `--tensor-parallel-size 2`. +[Wan2.1-VACE-1.3B](https://huggingface.co/Wan-AI/Wan2.1-VACE-1.3B-Diffusers) (added in 0.20.0) is a unified video creation/editing pipeline that +accepts text plus optional video, mask, or reference image inputs. 
+ +Two route options: -The `/v1/videos` endpoint is asynchronous — it returns a job ID immediately and generates the video in the background. The script below submits the -job, polls until it completes, then downloads the MP4. +- **Async** (`POST /v1/videos`) — returns a job ID immediately; poll `GET /v1/videos/{id}` until status is `completed`, then download the MP4 from + `GET /v1/videos/{id}/content`. Best for long-running batch jobs and the only option in 0.18.0. +- **Sync** (`POST /v1/videos/sync`, new in 0.20.0) — blocks until generation completes and returns the raw MP4 in the response body. Simpler client + code, and crucially the only video path that works through SageMaker real-time endpoints (see [SageMaker Deployment](#sagemaker-deployment)). ```bash --8<-- "examples/vllm-omni/video/run.sh" ``` +```bash +--8<-- "examples/vllm-omni/video-sync/run.sh" +``` + ### Multimodal Chat Use the standard OpenAI chat-completions API. Multimodal inputs (images, audio) are supplied as URL or base64 content parts in the message list. @@ -128,8 +165,10 @@ header: | `CustomAttributes` | Dispatched to | | --- | --- | | `route=/v1/audio/speech` | TTS | +| `route=/v1/audio/generate` | Audio generation (new in 0.20.0) | | `route=/v1/images/generations` | Image generation | -| `route=/v1/videos` | Video generation (JSON auto-converted to form-data) — returns job-ID only in 0.18.0, MP4 not retrievable via SageMaker | +| `route=/v1/videos` | Video generation, async (JSON auto-converted to form-data) — returns job-ID only; MP4 not retrievable via SageMaker. Prefer `/v1/videos/sync` below. 
| +| `route=/v1/videos/sync` | Video generation, sync (new in 0.20.0) — blocks and returns raw MP4 bytes; works through SageMaker real-time endpoints | | `route=/v1/chat/completions` | Multimodal chat | | *(no route)* | vLLM default `/invocations` (chat/completion/embed) | @@ -153,7 +192,7 @@ Any `SM_VLLM_*` env var is converted to a `--` CLI argument (e.g., `SM_VLL --8<-- "examples/vllm-omni/sagemaker/deploy_tts.py" ``` -GPU deploys require `inference_ami_version` — the default SageMaker host AMI has incompatible NVIDIA drivers for CUDA 12.9 images. See +GPU deploys require `inference_ami_version` — the default SageMaker host AMI has incompatible NVIDIA drivers for CUDA 13.0 images. See [ProductionVariant API reference](https://docs.aws.amazon.com/sagemaker/latest/APIReference/API_ProductionVariant.html) for valid values. When done, delete the endpoint: @@ -167,8 +206,6 @@ predictor.delete_endpoint() SageMaker real-time inference has a 60-second timeout. First requests to TTS models may exceed this due to `torch.compile` warmup (~67s); async inference avoids the limit, as does retrying after warmup completes. -!!! warning "Video generation is not supported on SageMaker in 0.18.0 — see [Known Limitations](#known-limitations) below. Use EC2 for video." - ```python --8<-- "examples/vllm-omni/sagemaker/deploy_tts_async.py" ``` @@ -177,15 +214,34 @@ For async inference, upload the JSON input payload to S3 first, then call `invok `CustomAttributes="route=/v1/audio/speech"`. The resulting `.out` object in the configured S3 output path is the raw WAV audio — no polling or additional retrieval step required. +### Deploy a Video Endpoint + +The `/v1/videos/sync` endpoint (new in 0.20.0) is the supported path for video on SageMaker. Unlike the async `/v1/videos` route — which writes a +job-ID JSON to S3 but never the MP4 — `/v1/videos/sync` blocks until generation completes and returns raw MP4 bytes that SageMaker hands back to the +client directly. 
+ +```python +--8<-- "examples/vllm-omni/sagemaker/deploy_video_sync.py" +``` + +Sync video generation can take 30–120 seconds depending on `num_inference_steps` and `num_frames`. If a request approaches the 60s real-time invoke +timeout, either reduce `num_inference_steps` or use `invoke_endpoint_async` (its 60-minute ceiling accommodates long jobs, and the response body — the +MP4 — is written verbatim to the S3 output path). + ## Known Limitations -- **Video generation is not supported on SageMaker in 0.18.0.** The `/v1/videos` endpoint is async by design — it returns a job-ID JSON immediately - and generates the MP4 in the background. Through SageMaker async inference, only that job-ID JSON is written to S3; the MP4 itself never lands in S3 - and cannot be retrieved through `invoke_endpoint` or `invoke_endpoint_async`. Use EC2 for video generation — direct container access supports the - full workflow (create job, poll status, download MP4). SageMaker support is expected once `POST /v1/videos/sync` (which blocks and returns raw MP4 - bytes) is available in a future vllm-omni release. -- **First-request latency on SageMaker real-time endpoints.** TTS models can exceed the 60s invoke timeout on the first request due to `torch.compile` - warmup. Use async inference or retry after warmup. +- **`/v1/videos` (async) on SageMaker writes only the job-ID JSON to S3, not the MP4.** This is unchanged from 0.18.0 — the async route generates the + MP4 in the background and the bytes never land in S3. Use the new `/v1/videos/sync` route on SageMaker (see + [Deploy a Video Endpoint](#deploy-a-video-endpoint)) or stay on EC2 for the async workflow with status polling. +- **First-request latency on SageMaker real-time endpoints.** TTS, audio-generate, and video models can exceed the 60s invoke timeout on the first + request due to `torch.compile` warmup. Use async inference or retry after warmup. 
+- **`usage.completion_tokens` is reported as `0` for omni-chat models.** The `/v1/chat/completions` SSE stream emits `usage.completion_tokens=0` in + the terminal block, even when audio and text were generated. Use the per-chunk `metrics.num_tokens_out` field for an accurate engine-side token + count (see upstream `vllm_omni/benchmarks/patch/patch.py`). +- **CosyVoice3 requires `--trust-remote-code` and ~32 GB host RAM during model load.** A 16 GB host can SIGKILL the process during HuggingFace cache + hydration. Prefer `g6e.xlarge` or larger for both EC2 and SageMaker instance types. +- **Stable-Audio-Open output is capped at ~47 seconds per request** by the model itself. For longer clips, run multiple requests with adjusted + `audio_start` and concatenate client-side. ## Release Notes diff --git a/examples/vllm-omni/audio-generate/run.sh b/examples/vllm-omni/audio-generate/run.sh new file mode 100755 index 000000000000..9d0eb25f592e --- /dev/null +++ b/examples/vllm-omni/audio-generate/run.sh @@ -0,0 +1,23 @@ +#!/usr/bin/env bash +# End-to-end audio-generation example: start server, generate a 5-second clip. +# /v1/audio/generate is a diffusion-based text-to-audio endpoint (new in 0.20.0). +# Distinct from /v1/audio/speech (which is TTS — a voice reading words). 
+set -euo pipefail + +IMAGE="${IMAGE:-763104351884.dkr.ecr.us-west-2.amazonaws.com/vllm-omni:omni-cuda-v1}" +MODEL="${MODEL:-stabilityai/stable-audio-open-1.0}" +NAME="${NAME:-omni-audio-generate}" + +docker run -d --name "${NAME}" --gpus all --shm-size=2g -p 8080:8000 \ + -v "${HOME}/hf-cache:/root/.cache/huggingface" \ + "${IMAGE}" --model "${MODEL}" --trust-remote-code --enforce-eager + +until curl -sf http://localhost:8080/health >/dev/null; do sleep 5; done + +curl -sf -X POST http://localhost:8080/v1/audio/generate \ + -H "Content-Type: application/json" \ + -d '{"input": "A jazz piano improvisation", "audio_length": 5.0, "guidance_scale": 7.0, "num_inference_steps": 50, "seed": 42}' \ + --output sound.wav + +echo "wrote sound.wav ($(stat -f%z sound.wav 2>/dev/null || stat -c%s sound.wav) bytes)" +# Cleanup: docker stop "${NAME}" && docker rm "${NAME}" diff --git a/examples/vllm-omni/sagemaker/deploy_video_sync.py b/examples/vllm-omni/sagemaker/deploy_video_sync.py new file mode 100644 index 000000000000..4cf64622d480 --- /dev/null +++ b/examples/vllm-omni/sagemaker/deploy_video_sync.py @@ -0,0 +1,50 @@ +"""Deploy a vLLM-Omni video model to a SageMaker real-time endpoint and invoke +the new /v1/videos/sync endpoint, which blocks until generation completes and +returns raw MP4 bytes. + +Available since vLLM-Omni 0.20.0; supersedes the 0.18.0 limitation that +SageMaker async inference could only retrieve the job-ID JSON, not the MP4. + +Use the routing middleware via `CustomAttributes="route=/v1/videos/sync"`, +which auto-converts JSON request bodies to multipart/form-data for the +underlying endpoint. 
+""" + +import boto3 +from sagemaker.model import Model +from sagemaker.predictor import Predictor +from sagemaker.serializers import JSONSerializer + +model = Model( + image_uri="763104351884.dkr.ecr.us-west-2.amazonaws.com/vllm:omni-sagemaker-cuda-v1", + role="arn:aws:iam:::role/SageMakerExecutionRole", + env={"SM_VLLM_MODEL": "Wan-AI/Wan2.1-T2V-1.3B-Diffusers"}, + predictor_cls=Predictor, +) + +predictor = model.deploy( + instance_type="ml.g5.2xlarge", + initial_instance_count=1, + endpoint_name="vllm-omni-video-sync", + inference_ami_version="al2-ami-sagemaker-inference-gpu-3-1", + serializer=JSONSerializer(), + wait=True, +) + +# Invoke /v1/videos/sync via CustomAttributes; response body is the MP4 bytes +# (Content-Type: video/mp4). Prefer invoke_endpoint over invoke_endpoint_async +# because sync video can take 30–120s and the real-time path's binary response +# is what we want — async would write base64-encoded JSON to S3. +runtime = boto3.client("sagemaker-runtime") +response = runtime.invoke_endpoint( + EndpointName="vllm-omni-video-sync", + Body='{"prompt": "a dog running on a beach", "num_frames": 17, ' + '"num_inference_steps": 30, "size": "480x320", "seed": 42}', + ContentType="application/json", + CustomAttributes="route=/v1/videos/sync", +) +with open("video.mp4", "wb") as f: + f.write(response["Body"].read()) + +# When done: +# predictor.delete_endpoint() diff --git a/examples/vllm-omni/video-sync/run.sh b/examples/vllm-omni/video-sync/run.sh new file mode 100755 index 000000000000..2b1456724e9d --- /dev/null +++ b/examples/vllm-omni/video-sync/run.sh @@ -0,0 +1,25 @@ +#!/usr/bin/env bash +# End-to-end sync video-generation example: start server, submit, get MP4 back. +# /v1/videos/sync (new in 0.20.0) blocks until the video is ready and returns +# raw MP4 bytes — no job-ID polling needed, unlike async /v1/videos. 
+set -euo pipefail + +IMAGE="${IMAGE:-763104351884.dkr.ecr.us-west-2.amazonaws.com/vllm-omni:omni-cuda-v1}" +MODEL="${MODEL:-Wan-AI/Wan2.1-T2V-1.3B-Diffusers}" +NAME="${NAME:-omni-video-sync}" + +docker run -d --name "${NAME}" --gpus all --shm-size=8g -p 8080:8000 \ + -v "${HOME}/hf-cache:/root/.cache/huggingface" \ + "${IMAGE}" --model "${MODEL}" --tensor-parallel-size 2 + +until curl -sf http://localhost:8080/health >/dev/null; do sleep 5; done + +# /v1/videos/sync requires multipart/form-data and blocks until the MP4 is ready. +curl -sf -X POST http://localhost:8080/v1/videos/sync \ + -F "prompt=a dog running on a beach at sunset" \ + -F "num_frames=17" -F "num_inference_steps=30" \ + -F "size=480x320" -F "seed=42" \ + --output video.mp4 + +echo "wrote video.mp4 ($(stat -f%z video.mp4 2>/dev/null || stat -c%s video.mp4) bytes)" +# Cleanup: docker stop "${NAME}" && docker rm "${NAME}" From 975f50e60b43a431a339a6fa538a3d44b6fac711 Mon Sep 17 00:00:00 2001 From: Yadan Wei Date: Tue, 12 May 2026 15:35:08 -0700 Subject: [PATCH 02/10] docs(vllm-omni): note Code2Wav un-batching TTS regression in known limitations Adds a Known Limitations entry documenting the upstream Code2Wav decode-chunk un-batching regression in vllm-omni#3203 that ships in 0.20.0 and slows voice-clone TTS (Qwen3-TTS-Base). Observed on g6.xlarge: rps 0.4 -> 0.281 audio rtf 1.6 -> 1.109 p95 e2e 11s -> 15.9s Quality is unchanged. Preset-voice TTS (Qwen3-TTS-CustomVoice) is unaffected. The fix is already merged upstream as vllm-omni#3485 (post-0.20.0) and will land in the next omni point release. Co-Authored-By: Claude Opus 4.7 (1M context) Signed-off-by: Yadan Wei --- docs/vllm-omni/index.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/docs/vllm-omni/index.md b/docs/vllm-omni/index.md index 97271b3510d5..11c092284f94 100644 --- a/docs/vllm-omni/index.md +++ b/docs/vllm-omni/index.md @@ -238,6 +238,11 @@ MP4 — is written verbatim to the S3 output path). 
- **`usage.completion_tokens` is reported as `0` for omni-chat models.** The `/v1/chat/completions` SSE stream emits `usage.completion_tokens=0` in the terminal block, even when audio and text were generated. Use the per-chunk `metrics.num_tokens_out` field for an accurate engine-side token count (see upstream `vllm_omni/benchmarks/patch/patch.py`). +- **Voice-clone TTS (Qwen3-TTS-Base) is slower in 0.20.0 than 0.18.0 due to an upstream Code2Wav decode-chunk un-batching regression** + ([vllm-omni#3203](https://github.com/vllm-project/vllm-omni/pull/3203)). Observed on `g6.xlarge` with `qwen3-tts-12hz-1.7b-base`, concurrency 4, 20 + prompts: requests/s **0.4 → 0.281**, audio RTF multiplier **1.6 → 1.109**, p95 E2E **11s → 15.9s**. TTS quality is unchanged. The fix is merged + upstream as [vllm-omni#3485](https://github.com/vllm-project/vllm-omni/pull/3485) post-0.20.0 and will land in the next omni point release. + Preset-voice TTS (Qwen3-TTS-CustomVoice) is unaffected. - **CosyVoice3 requires `--trust-remote-code` and ~32 GB host RAM during model load.** A 16 GB host can SIGKILL the process during HuggingFace cache hydration. Prefer `g6e.xlarge` or larger for both EC2 and SageMaker instance types. - **Stable-Audio-Open output is capped at ~47 seconds per request** by the model itself. For longer clips, run multiple requests with adjusted From 8121da0e90519578cace58820af34cddd6c56342 Mon Sep 17 00:00:00 2001 From: Yadan Wei Date: Tue, 12 May 2026 15:51:22 -0700 Subject: [PATCH 03/10] docs(vllm-omni): pin 0.18.0 to immutable v1.0 tag, leave v1 floating on 0.20.0 The `omni-cuda-v1` and `omni-sagemaker-cuda-v1` tags are now reused for 0.20.0 (per the image config files in main). Switch the 0.18.0 docs to the immutable `omni-cuda-v1.0` / `omni-sagemaker-cuda-v1.0` tags so users who want to reproduce the 0.18.0 image have a frozen URI; `v1` continues to float to the latest release in the v1 line (0.20.0 today). The v1.0 tags already exist in both 763... 
and public.ecr.aws. Co-Authored-By: Claude Opus 4.7 (1M context) Signed-off-by: Yadan Wei --- docs/src/data/vllm-omni/0.18.0-gpu-ec2.yml | 2 +- docs/src/data/vllm-omni/0.18.0-gpu-sagemaker.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/src/data/vllm-omni/0.18.0-gpu-ec2.yml b/docs/src/data/vllm-omni/0.18.0-gpu-ec2.yml index a6bc7ec8b859..696881d9ad22 100644 --- a/docs/src/data/vllm-omni/0.18.0-gpu-ec2.yml +++ b/docs/src/data/vllm-omni/0.18.0-gpu-ec2.yml @@ -9,7 +9,7 @@ platform: default public_registry: true tags: - - "omni-cuda-v1" + - "omni-cuda-v1.0" announcements: - "Initial release of vLLM-Omni containers for EC2, ECS, EKS" diff --git a/docs/src/data/vllm-omni/0.18.0-gpu-sagemaker.yml b/docs/src/data/vllm-omni/0.18.0-gpu-sagemaker.yml index bb61f8a78299..9953790bf81f 100644 --- a/docs/src/data/vllm-omni/0.18.0-gpu-sagemaker.yml +++ b/docs/src/data/vllm-omni/0.18.0-gpu-sagemaker.yml @@ -9,7 +9,7 @@ platform: sagemaker public_registry: true tags: - - "omni-sagemaker-cuda-v1" + - "omni-sagemaker-cuda-v1.0" announcements: - "Initial release of vLLM-Omni containers for SageMaker" From a566e9a33210cebc4a86b948437bac36db8f71db Mon Sep 17 00:00:00 2001 From: Yadan Wei Date: Wed, 13 May 2026 13:47:03 -0700 Subject: [PATCH 04/10] docs(vllm-omni): tag versioning convention, sync-video example fix, What's New entries MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Tag versioning (DLC-level) -------------------------- Document the two-tier tag convention so customers can choose the right trade-off between freshness and stability: - omni-cuda-v1 / omni-sagemaker-cuda-v1 — float across DLC minor + patch (auto-upgrade on docker pull). Best for dev, quick-starts. - omni-cuda-v1.1 / omni-sagemaker-cuda-v1.1 — float across DLC patches only in the v1.1 line (auto-accept security fixes, decline new minor releases). Recommended for production. - @sha256: — escape hatch for byte-identical reproducibility. 
The semantic versioning tier is at the DLC level (v1, v1.1, v1.1.x), not the bundled vllm-omni upstream version (which can advance independently of DLC patches). Customers pinned to v1.1 would have been insulated from the Code2Wav un-batching regression that landed with the DLC v1.1 minor bump until they were ready to evaluate it. Reflected in: - docs/src/data/vllm-omni/0.20.0-gpu-{ec2,sagemaker}.yml — list both v1 and v1.1 tags with comments explaining the floating semantics - docs/vllm-omni/index.md — new Versioning and Tags section + expanded Pull Commands showing both tiers + digest pin Sync-video SageMaker example fix -------------------------------- The previous example used real-time invoke_endpoint, which has a hard 60-second timeout. First-request latency on Wan2.1-VACE-1.3B includes model load + torch.compile warmup (3-4 min), so the example would always fail on first invoke. Rewrote to mirror the pattern proven by test_vllm_omni_video_async_endpoint (last green 2026-05-11): - AsyncInferenceConfig with output_path + max_concurrent_invocations=1 - s3.put_object to upload the request payload - invoke_endpoint_async with InputLocation + CustomAttributes - Poll the .out object for raw MP4 bytes - Form-data values as strings (the middleware converts JSON to multipart/form-data; numeric values must be JSON strings) - Wan2.1-VACE-1.3B-diffusers + ml.g5.2xlarge (validated combination) End-to-end validated 2026-05-13 in account 897880167187: endpoint deployed, async invoke succeeded, 45 KB MP4 returned with Content-Type video/mp4 (valid ISO Media MP4 header), endpoint cleaned up after. docs/vllm-omni/index.md prose updated to recommend async inference as the default for video on SageMaker (it's required, not optional, given the warmup time). 
What's New entries ------------------ README.md (which generates docs/index.md): two new vLLM-Omni entries under Release Highlights: - 2026/05/13 vLLM-Omni v0.20.0 - 2026/04/24 vLLM-Omni v0.18.0 (initial release) Both reference the floating tag (omni-cuda-v1 / omni-sagemaker-cuda-v1) and v1.0 for the 0.18.0 entry. Removed ------- The "usage.completion_tokens=0 for omni-chat models" Known Limitations item — internal benchmark-tooling concern, not user-facing. Co-Authored-By: Claude Opus 4.7 (1M context) Signed-off-by: Yadan Wei --- README.md | 2 + docs/src/data/vllm-omni/0.20.0-gpu-ec2.yml | 3 +- .../data/vllm-omni/0.20.0-gpu-sagemaker.yml | 3 +- docs/vllm-omni/index.md | 72 +++++++++++++--- .../vllm-omni/sagemaker/deploy_video_sync.py | 83 ++++++++++++++----- 5 files changed, 129 insertions(+), 34 deletions(-) diff --git a/README.md b/README.md index 8f9bae83c8d7..ea9601d493c7 100644 --- a/README.md +++ b/README.md @@ -28,11 +28,13 @@ ______________________________________________________________________ ### 🚀 Release Highlights +- **[2026/05/13]** [vLLM-Omni v0.20.0](https://gallery.ecr.aws/deep-learning-containers/vllm) — EC2: `omni-cuda-v1` · SageMaker: `omni-sagemaker-cuda-v1` · Adds `/v1/audio/generate` (stable-audio-open) and `/v1/videos/sync` (unblocks video on SageMaker); supports CosyVoice3, ERNIE-Image-Turbo, Wan2.1-VACE-1.3B; CUDA 13.0 + PyTorch 2.11.0. - **[2026/05/11]** [vLLM v0.20.2](https://gallery.ecr.aws/deep-learning-containers/vllm) — EC2: `0.20.2-gpu-py312-ec2` · SageMaker: `0.20.2-gpu-py312` · Bug fixes for DeepSeek V4. - **[2026/05/06]** [SGLang v0.5.11](https://gallery.ecr.aws/deep-learning-containers/sglang) — EC2: `0.5.11-gpu-py312-ec2` · SageMaker: `0.5.11-gpu-py312` · Model support for Gemma 4, GLM-5.1, Qwen 3.4, and more - **[2026/05/05]** [vLLM v0.20.1](https://gallery.ecr.aws/deep-learning-containers/vllm) — EC2: `0.20.1-gpu-py312-ec2` · SageMaker: `0.20.1-gpu-py312` · Bug fixes for DeepSeek V4. 
- **[2026/04/30]** [PyTorch v2.11.0](https://gallery.ecr.aws/deep-learning-containers/pytorch) — EC2: `2.11.0-cu130-amzn2023` · SageMaker: `2.11.0-cu130-amzn2023-sagemaker` · Amazon Linux 2023 with EFA, flash-attn, and transformer-engine. - **[2026/04/28]** [vLLM v0.20.0](https://gallery.ecr.aws/deep-learning-containers/vllm) — EC2: `0.20.0-gpu-py312-ec2` · SageMaker: `0.20.0-gpu-py312` · Introduces support for DeepSeek V4. +- **[2026/04/24]** [vLLM-Omni v0.18.0](https://gallery.ecr.aws/deep-learning-containers/vllm) — EC2: `omni-cuda-v1.0` · SageMaker: `omni-sagemaker-cuda-v1.0` · Initial release. Serves omni-modality models (TTS, image, video, multimodal chat) through OpenAI-compatible APIs; SageMaker routing middleware via `CustomAttributes`. - **[2026/04/20]** [vLLM v0.19.1](https://gallery.ecr.aws/deep-learning-containers/vllm) — EC2: `0.19-gpu-py312-ec2` · SageMaker: `0.19-gpu-py312` · This upgrades Transformers to 5.5.4, enabling Gemma 4 support. ### 📢 Support Updates diff --git a/docs/src/data/vllm-omni/0.20.0-gpu-ec2.yml b/docs/src/data/vllm-omni/0.20.0-gpu-ec2.yml index 6daab83318ba..ab829dddb3b1 100644 --- a/docs/src/data/vllm-omni/0.20.0-gpu-ec2.yml +++ b/docs/src/data/vllm-omni/0.20.0-gpu-ec2.yml @@ -9,7 +9,8 @@ platform: default public_registry: true tags: - - "omni-cuda-v1" + - "omni-cuda-v1" # floats across DLC minor versions (currently → DLC v1.1, vllm-omni 0.20.0) + - "omni-cuda-v1.1" # floats across DLC patches in the v1.1 line (auto-accepts security patches) announcements: - "Bumps vLLM-Omni to 0.20.0 and aligns with upstream vLLM v0.20.0" diff --git a/docs/src/data/vllm-omni/0.20.0-gpu-sagemaker.yml b/docs/src/data/vllm-omni/0.20.0-gpu-sagemaker.yml index c261b5f24dd9..3b91ae670c9d 100644 --- a/docs/src/data/vllm-omni/0.20.0-gpu-sagemaker.yml +++ b/docs/src/data/vllm-omni/0.20.0-gpu-sagemaker.yml @@ -9,7 +9,8 @@ platform: sagemaker public_registry: true tags: - - "omni-sagemaker-cuda-v1" + - "omni-sagemaker-cuda-v1" # floats across DLC minor 
versions (currently → DLC v1.1, vllm-omni 0.20.0) + - "omni-sagemaker-cuda-v1.1" # floats across DLC patches in the v1.1 line (auto-accepts security patches) announcements: - "Bumps vLLM-Omni to 0.20.0 and aligns with upstream vLLM v0.20.0" diff --git a/docs/vllm-omni/index.md b/docs/vllm-omni/index.md index 11c092284f94..677ca43725b3 100644 --- a/docs/vllm-omni/index.md +++ b/docs/vllm-omni/index.md @@ -15,20 +15,66 @@ SageMaker routing middleware for dispatching `/invocations` to any omni endpoint ## Pull Commands -**EC2:** +**EC2** — latest supported (floats across DLC minor versions; auto-upgrades on next pull): ```bash docker pull {{ images.latest_vllm_omni_ec2 }} ``` -**SageMaker:** +**EC2** — patch-stable (recommended for production; auto-accepts DLC security patches in the v1.1 line, declines new DLC minor releases): + +```bash +docker pull 763104351884.dkr.ecr.us-west-2.amazonaws.com/vllm:omni-cuda-v1.1 +``` + +**SageMaker** — latest supported: ```bash docker pull {{ images.latest_vllm_omni_sagemaker }} ``` -See [Available Images](../reference/available_images.md) for all image URIs and [Getting Started](../get_started/index.md) for authentication -instructions. +**SageMaker** — patch-stable: + +```bash +docker pull 763104351884.dkr.ecr.us-west-2.amazonaws.com/vllm:omni-sagemaker-cuda-v1.1 +``` + +See [Available Images](../reference/available_images.md) for all image URIs, [Versioning and Tags](#versioning-and-tags) below for the convention, and +[Getting Started](../get_started/index.md) for authentication instructions. + +## Versioning and Tags + +vLLM-Omni image tags follow a **DLC-level** semantic versioning convention (independent of the bundled vllm-omni upstream version): + +- **DLC major (`v1`, `v2`, …)** — incompatible/breaking changes in the DLC itself: image API, entrypoint, removed routes, pinned framework majors. + Customer code may need updating when the DLC major bumps. 
+- **DLC minor (`v1.0`, `v1.1`, …)** — DLC release tracking new upstream vllm-omni features (e.g., a new endpoint), still API-compatible at the DLC + level. May introduce behavioral changes in the bundled engine. +- **DLC patch** — security patches and bug fixes layered on top of an existing release without bumping the bundled vllm-omni version. Same tag, new + image digest. + +Two tag tiers, both floating, are exposed to customers: + +- **Minor-floating tags** (`omni-cuda-v1`, `omni-sagemaker-cuda-v1`) — track the latest DLC release within a major line. Auto-upgrade across DLC minor + *and* patch updates on `docker pull`. Best for development, quick-starts, and "give me whatever is supported right now". +- **Patch-floating tags** (`omni-cuda-v1.1`, `omni-sagemaker-cuda-v1.1`) — follow only the DLC patch stream within one minor release. They auto-accept + security patches and bug fixes, but decline new DLC minor releases that could change behavior. Recommended for production: customers pinned here + would have been insulated from the Code2Wav un-batching regression that landed with the DLC `v1.1` minor bump (see + [Known Limitations](#known-limitations) below) until they were ready to evaluate it. + +If your workload requires byte-identical reproducibility — i.e., declining even DLC patches — pull by digest instead of tag: + +```bash +docker pull 763104351884.dkr.ecr.us-west-2.amazonaws.com/vllm@sha256: +``` + +`docker inspect ` or `docker pull` output prints the digest of the image you currently have. Pulls by digest never change. 
+ +| Tag | Tracks | Currently points at | +| --- | --- | --- | +| `omni-cuda-v1` / `omni-sagemaker-cuda-v1` | latest DLC release in v1 line (minor + patch) | DLC `v1.1` (vllm-omni 0.20.0) | +| `omni-cuda-v1.0` / `omni-sagemaker-cuda-v1.0` | DLC v1.0 patch stream (vllm-omni 0.18.0 + DLC patches) | latest v1.0.x DLC patch | +| `omni-cuda-v1.1` / `omni-sagemaker-cuda-v1.1` | DLC v1.1 patch stream (vllm-omni 0.20.0 + DLC patches) | latest v1.1.x DLC patch | ## Packages @@ -168,7 +214,7 @@ header: | `route=/v1/audio/generate` | Audio generation (new in 0.20.0) | | `route=/v1/images/generations` | Image generation | | `route=/v1/videos` | Video generation, async (JSON auto-converted to form-data) — returns job-ID only; MP4 not retrievable via SageMaker. Prefer `/v1/videos/sync` below. | -| `route=/v1/videos/sync` | Video generation, sync (new in 0.20.0) — blocks and returns raw MP4 bytes; works through SageMaker real-time endpoints | +| `route=/v1/videos/sync` | Video generation, sync (new in 0.20.0) — blocks server-side and returns raw MP4 bytes; deploy behind SageMaker async inference (first-request `torch.compile` warmup exceeds the 60s real-time invoke timeout) | | `route=/v1/chat/completions` | Multimodal chat | | *(no route)* | vLLM default `/invocations` (chat/completion/embed) | @@ -217,16 +263,19 @@ additional retrieval step required. ### Deploy a Video Endpoint The `/v1/videos/sync` endpoint (new in 0.20.0) is the supported path for video on SageMaker. Unlike the async `/v1/videos` route — which writes a -job-ID JSON to S3 but never the MP4 — `/v1/videos/sync` blocks until generation completes and returns raw MP4 bytes that SageMaker hands back to the -client directly. +job-ID JSON to S3 but never the MP4 — `/v1/videos/sync` blocks server-side until generation completes and writes the raw MP4 bytes to the configured +S3 output path. 
+ +Deploy behind **SageMaker async inference** (`AsyncInferenceConfig`), not real-time inference: first-request latency on video models is dominated by +model load + `torch.compile` warmup (3–4 minutes for Wan2.1-VACE-1.3B), which exceeds the 60-second real-time invoke timeout. Async inference allows +up to 1 hour and writes the response body verbatim to S3, so the `.out` object *is* the MP4 — no polling on a job ID. ```python --8<-- "examples/vllm-omni/sagemaker/deploy_video_sync.py" ``` -Sync video generation can take 30–120 seconds depending on `num_inference_steps` and `num_frames`. If a request approaches the 60s real-time invoke -timeout, either reduce `num_inference_steps` or use `invoke_endpoint_async` (its 60-minute ceiling accommodates long jobs, and the response body — the -MP4 — is written verbatim to the S3 output path). +Validated 2026-05-11 on `ml.g5.2xlarge` (A10G 24 GB VRAM, 32 GB host RAM): 45 KB MP4 in ~10s after warmup. Reduce `num_inference_steps` and +`num_frames` to stay under the async ceiling for warm requests. ## Known Limitations @@ -235,9 +284,6 @@ MP4 — is written verbatim to the S3 output path). [Deploy a Video Endpoint](#deploy-a-video-endpoint)) or stay on EC2 for the async workflow with status polling. - **First-request latency on SageMaker real-time endpoints.** TTS, audio-generate, and video models can exceed the 60s invoke timeout on the first request due to `torch.compile` warmup. Use async inference or retry after warmup. -- **`usage.completion_tokens` is reported as `0` for omni-chat models.** The `/v1/chat/completions` SSE stream emits `usage.completion_tokens=0` in - the terminal block, even when audio and text were generated. Use the per-chunk `metrics.num_tokens_out` field for an accurate engine-side token - count (see upstream `vllm_omni/benchmarks/patch/patch.py`). 
- **Voice-clone TTS (Qwen3-TTS-Base) is slower in 0.20.0 than 0.18.0 due to an upstream Code2Wav decode-chunk un-batching regression** ([vllm-omni#3203](https://github.com/vllm-project/vllm-omni/pull/3203)). Observed on `g6.xlarge` with `qwen3-tts-12hz-1.7b-base`, concurrency 4, 20 prompts: requests/s **0.4 → 0.281**, audio RTF multiplier **1.6 → 1.109**, p95 E2E **11s → 15.9s**. TTS quality is unchanged. The fix is merged diff --git a/examples/vllm-omni/sagemaker/deploy_video_sync.py b/examples/vllm-omni/sagemaker/deploy_video_sync.py index 4cf64622d480..0717099aebbd 100644 --- a/examples/vllm-omni/sagemaker/deploy_video_sync.py +++ b/examples/vllm-omni/sagemaker/deploy_video_sync.py @@ -1,50 +1,95 @@ -"""Deploy a vLLM-Omni video model to a SageMaker real-time endpoint and invoke -the new /v1/videos/sync endpoint, which blocks until generation completes and -returns raw MP4 bytes. +"""Deploy a vLLM-Omni video model to a SageMaker async inference endpoint +and invoke the new /v1/videos/sync endpoint, which blocks server-side until +generation completes and returns raw MP4 bytes. + +Async inference is required for video — first-request latency includes model +load + torch.compile warmup (3-4 min for Wan2.1-VACE-1.3B), well past the +60s real-time invoke timeout. Async inference allows up to 1 hour and +deposits the response body verbatim at the configured S3 output path, so the +.out object is the raw MP4. Available since vLLM-Omni 0.20.0; supersedes the 0.18.0 limitation that SageMaker async inference could only retrieve the job-ID JSON, not the MP4. +The routing middleware (`CustomAttributes="route=/v1/videos/sync"`) auto- +converts the JSON request body to multipart/form-data for the underlying +endpoint; values must therefore be JSON strings. -Use the routing middleware via `CustomAttributes="route=/v1/videos/sync"`, -which auto-converts JSON request bodies to multipart/form-data for the -underlying endpoint. 
+Validated 2026-05-11 on ml.g5.2xlarge (A10G 24 GB VRAM, 32 GB host RAM): +45 KB MP4 returned in ~10s after warmup. """ +import time + import boto3 +from sagemaker.async_inference import AsyncInferenceConfig from sagemaker.model import Model from sagemaker.predictor import Predictor from sagemaker.serializers import JSONSerializer +BUCKET = "" # replace with an S3 bucket your role can read/write +ROLE_ARN = "arn:aws:iam:::role/SageMakerExecutionRole" +ENDPOINT_NAME = "vllm-omni-video-sync" + model = Model( image_uri="763104351884.dkr.ecr.us-west-2.amazonaws.com/vllm:omni-sagemaker-cuda-v1", - role="arn:aws:iam:::role/SageMakerExecutionRole", - env={"SM_VLLM_MODEL": "Wan-AI/Wan2.1-T2V-1.3B-Diffusers"}, + role=ROLE_ARN, + env={"SM_VLLM_MODEL": "Wan-AI/Wan2.1-VACE-1.3B-diffusers"}, predictor_cls=Predictor, ) predictor = model.deploy( instance_type="ml.g5.2xlarge", initial_instance_count=1, - endpoint_name="vllm-omni-video-sync", + endpoint_name=ENDPOINT_NAME, inference_ami_version="al2-ami-sagemaker-inference-gpu-3-1", serializer=JSONSerializer(), + async_inference_config=AsyncInferenceConfig( + output_path=f"s3://{BUCKET}/vllm-omni-async-output/", + max_concurrent_invocations_per_instance=1, + ), wait=True, ) -# Invoke /v1/videos/sync via CustomAttributes; response body is the MP4 bytes -# (Content-Type: video/mp4). Prefer invoke_endpoint over invoke_endpoint_async -# because sync video can take 30–120s and the real-time path's binary response -# is what we want — async would write base64-encoded JSON to S3. +# Upload the input payload to S3, then call invoke_endpoint_async with +# CustomAttributes routing to /v1/videos/sync. Values are strings because +# the middleware converts JSON to multipart/form-data. 
+s3 = boto3.client("s3") +s3.put_object( + Bucket=BUCKET, + Key="vllm-omni-async-input/request.json", + Body=( + '{"prompt": "a dog running on a beach", ' + '"num_frames": "17", "num_inference_steps": "4", ' + '"size": "480x320", "seed": "42"}' + ), + ContentType="application/json", +) + runtime = boto3.client("sagemaker-runtime") -response = runtime.invoke_endpoint( - EndpointName="vllm-omni-video-sync", - Body='{"prompt": "a dog running on a beach", "num_frames": 17, ' - '"num_inference_steps": 30, "size": "480x320", "seed": 42}', +result = runtime.invoke_endpoint_async( + EndpointName=ENDPOINT_NAME, + InputLocation=f"s3://{BUCKET}/vllm-omni-async-input/request.json", ContentType="application/json", CustomAttributes="route=/v1/videos/sync", ) -with open("video.mp4", "wb") as f: - f.write(response["Body"].read()) +output_location = result["OutputLocation"] # s3://.../.out +print(f"Output will be written to {output_location}") + +# Poll for the .out object (raw MP4 bytes). First request takes ~3-4 min +# due to model load + torch.compile; warm requests are ~3-10s. 
+bucket = output_location.split("/", 3)[2] +key = output_location.split("/", 3)[3] +for _ in range(120): # 10 min timeout + try: + obj = s3.get_object(Bucket=bucket, Key=key) + with open("video.mp4", "wb") as f: + f.write(obj["Body"].read()) + print(f"wrote video.mp4 (Content-Type: {obj.get('ContentType', '?')})") + break + except s3.exceptions.NoSuchKey: + time.sleep(5) +else: + raise RuntimeError("timed out waiting for async output") # When done: # predictor.delete_endpoint() From 25b7b2274bb141cce0db63adc4c58f22ed166aa0 Mon Sep 17 00:00:00 2001 From: Yadan Wei Date: Wed, 13 May 2026 13:52:17 -0700 Subject: [PATCH 05/10] docs(vllm-omni): show v1.1 in available_images table for 0.20.0 (patch-floating) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reorder the 0.20.0 yamls so omni-cuda-v1.1 / omni-sagemaker-cuda-v1.1 come first; the docs generator uses tags[0] for the per-row table cell in available_images.md. Before: 0.18.0 row showed `omni-cuda-v1.0`, 0.20.0 row showed `omni-cuda-v1` — inconsistent (one patch-floating, one minor-floating). After: both rows show their patch-floating tag, which uniquely identifies the release line and won't drift when the minor-floating v1 advances to a future image. Also bumps the README.md "What's New" entry for v0.20.0 to reference omni-cuda-v1.1 / omni-sagemaker-cuda-v1.1 for the same durability reason. Release-notes pages still print all four URIs (private + public ECR × both tag tiers). 
Co-Authored-By: Claude Opus 4.7 (1M context) Signed-off-by: Yadan Wei --- README.md | 2 +- docs/src/data/vllm-omni/0.20.0-gpu-ec2.yml | 5 ++++- docs/src/data/vllm-omni/0.20.0-gpu-sagemaker.yml | 5 ++++- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index ea9601d493c7..4d7579cdecb6 100644 --- a/README.md +++ b/README.md @@ -28,7 +28,7 @@ ______________________________________________________________________ ### 🚀 Release Highlights -- **[2026/05/13]** [vLLM-Omni v0.20.0](https://gallery.ecr.aws/deep-learning-containers/vllm) — EC2: `omni-cuda-v1` · SageMaker: `omni-sagemaker-cuda-v1` · Adds `/v1/audio/generate` (stable-audio-open) and `/v1/videos/sync` (unblocks video on SageMaker); supports CosyVoice3, ERNIE-Image-Turbo, Wan2.1-VACE-1.3B; CUDA 13.0 + PyTorch 2.11.0. +- **[2026/05/13]** [vLLM-Omni v0.20.0](https://gallery.ecr.aws/deep-learning-containers/vllm) — EC2: `omni-cuda-v1.1` · SageMaker: `omni-sagemaker-cuda-v1.1` · Adds `/v1/audio/generate` (stable-audio-open) and `/v1/videos/sync` (unblocks video on SageMaker); supports CosyVoice3, ERNIE-Image-Turbo, Wan2.1-VACE-1.3B; CUDA 13.0 + PyTorch 2.11.0. - **[2026/05/11]** [vLLM v0.20.2](https://gallery.ecr.aws/deep-learning-containers/vllm) — EC2: `0.20.2-gpu-py312-ec2` · SageMaker: `0.20.2-gpu-py312` · Bug fixes for DeepSeek V4. - **[2026/05/06]** [SGLang v0.5.11](https://gallery.ecr.aws/deep-learning-containers/sglang) — EC2: `0.5.11-gpu-py312-ec2` · SageMaker: `0.5.11-gpu-py312` · Model support for Gemma 4, GLM-5.1, Qwen 3.4, and more - **[2026/05/05]** [vLLM v0.20.1](https://gallery.ecr.aws/deep-learning-containers/vllm) — EC2: `0.20.1-gpu-py312-ec2` · SageMaker: `0.20.1-gpu-py312` · Bug fixes for DeepSeek V4. 
diff --git a/docs/src/data/vllm-omni/0.20.0-gpu-ec2.yml b/docs/src/data/vllm-omni/0.20.0-gpu-ec2.yml index ab829dddb3b1..f12f04fc1f0c 100644 --- a/docs/src/data/vllm-omni/0.20.0-gpu-ec2.yml +++ b/docs/src/data/vllm-omni/0.20.0-gpu-ec2.yml @@ -9,8 +9,11 @@ platform: default public_registry: true tags: - - "omni-cuda-v1" # floats across DLC minor versions (currently → DLC v1.1, vllm-omni 0.20.0) + # v1.1 is listed first so the available_images.md table renders the + # patch-floating tag — uniquely identifies "the 0.20.x line" even after + # the minor-floating v1 advances to a future release. - "omni-cuda-v1.1" # floats across DLC patches in the v1.1 line (auto-accepts security patches) + - "omni-cuda-v1" # floats across DLC minor versions (currently → DLC v1.1, vllm-omni 0.20.0) announcements: - "Bumps vLLM-Omni to 0.20.0 and aligns with upstream vLLM v0.20.0" diff --git a/docs/src/data/vllm-omni/0.20.0-gpu-sagemaker.yml b/docs/src/data/vllm-omni/0.20.0-gpu-sagemaker.yml index 3b91ae670c9d..f884ae7cbc60 100644 --- a/docs/src/data/vllm-omni/0.20.0-gpu-sagemaker.yml +++ b/docs/src/data/vllm-omni/0.20.0-gpu-sagemaker.yml @@ -9,8 +9,11 @@ platform: sagemaker public_registry: true tags: - - "omni-sagemaker-cuda-v1" # floats across DLC minor versions (currently → DLC v1.1, vllm-omni 0.20.0) + # v1.1 is listed first so the available_images.md table renders the + # patch-floating tag — uniquely identifies "the 0.20.x line" even after + # the minor-floating v1 advances to a future release. 
- "omni-sagemaker-cuda-v1.1" # floats across DLC patches in the v1.1 line (auto-accepts security patches) + - "omni-sagemaker-cuda-v1" # floats across DLC minor versions (currently → DLC v1.1, vllm-omni 0.20.0) announcements: - "Bumps vLLM-Omni to 0.20.0 and aligns with upstream vLLM v0.20.0" From 01dd04c47bc148afa1e5afd463c79552af3fa131 Mon Sep 17 00:00:00 2001 From: Yadan Wei Date: Wed, 13 May 2026 13:55:38 -0700 Subject: [PATCH 06/10] =?UTF-8?q?docs(vllm-omni):=20revert=20tag=20reorder?= =?UTF-8?q?=20=E2=80=94=20`tags[0]`=20feeds=20the=20latest=5F*=5Fec2=20mac?= =?UTF-8?q?ro?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous commit (25b7b227) reordered tags to put `omni-cuda-v1.1` first, intending to fix a perceived asymmetry in available_images.md (0.18.0 → v1.0, 0.20.0 → v1). That broke the Pull Commands section: both pull commands ended up pointing at v1.1, defeating the two-tier story. Root cause: docs/src/macros.py uses `latest.display_tag` (which returns `tags[0]`) to render `{{ images.latest_vllm_omni_ec2 }}`. That macro is the "latest supported" pull command in docs/vllm-omni/index.md. The original asymmetry was actually the convention working as intended: - 0.20.0 is the current floating-v1 release, so its yaml lists v1 first - 0.18.0 is no longer the floating-v1 target, so its yaml only lists v1.0 The maintenance pattern when a new release ships: remove v1 from the *previous* release's yaml. The 0.18.0 yaml already reflects this. Restore tags[0] = "omni-cuda-v1" on the 0.20.0 yamls and the README What's New entry; add a comment in each yaml documenting the convention so the next maintainer doesn't make the same mistake. 
Co-Authored-By: Claude Opus 4.7 (1M context) Signed-off-by: Yadan Wei --- README.md | 2 +- docs/src/data/vllm-omni/0.20.0-gpu-ec2.yml | 10 ++++++---- docs/src/data/vllm-omni/0.20.0-gpu-sagemaker.yml | 10 ++++++---- 3 files changed, 13 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 4d7579cdecb6..ea9601d493c7 100644 --- a/README.md +++ b/README.md @@ -28,7 +28,7 @@ ______________________________________________________________________ ### 🚀 Release Highlights -- **[2026/05/13]** [vLLM-Omni v0.20.0](https://gallery.ecr.aws/deep-learning-containers/vllm) — EC2: `omni-cuda-v1.1` · SageMaker: `omni-sagemaker-cuda-v1.1` · Adds `/v1/audio/generate` (stable-audio-open) and `/v1/videos/sync` (unblocks video on SageMaker); supports CosyVoice3, ERNIE-Image-Turbo, Wan2.1-VACE-1.3B; CUDA 13.0 + PyTorch 2.11.0. +- **[2026/05/13]** [vLLM-Omni v0.20.0](https://gallery.ecr.aws/deep-learning-containers/vllm) — EC2: `omni-cuda-v1` · SageMaker: `omni-sagemaker-cuda-v1` · Adds `/v1/audio/generate` (stable-audio-open) and `/v1/videos/sync` (unblocks video on SageMaker); supports CosyVoice3, ERNIE-Image-Turbo, Wan2.1-VACE-1.3B; CUDA 13.0 + PyTorch 2.11.0. - **[2026/05/11]** [vLLM v0.20.2](https://gallery.ecr.aws/deep-learning-containers/vllm) — EC2: `0.20.2-gpu-py312-ec2` · SageMaker: `0.20.2-gpu-py312` · Bug fixes for DeepSeek V4. - **[2026/05/06]** [SGLang v0.5.11](https://gallery.ecr.aws/deep-learning-containers/sglang) — EC2: `0.5.11-gpu-py312-ec2` · SageMaker: `0.5.11-gpu-py312` · Model support for Gemma 4, GLM-5.1, Qwen 3.4, and more - **[2026/05/05]** [vLLM v0.20.1](https://gallery.ecr.aws/deep-learning-containers/vllm) — EC2: `0.20.1-gpu-py312-ec2` · SageMaker: `0.20.1-gpu-py312` · Bug fixes for DeepSeek V4. 
diff --git a/docs/src/data/vllm-omni/0.20.0-gpu-ec2.yml b/docs/src/data/vllm-omni/0.20.0-gpu-ec2.yml index f12f04fc1f0c..3789513fe954 100644 --- a/docs/src/data/vllm-omni/0.20.0-gpu-ec2.yml +++ b/docs/src/data/vllm-omni/0.20.0-gpu-ec2.yml @@ -9,11 +9,13 @@ platform: default public_registry: true tags: - # v1.1 is listed first so the available_images.md table renders the - # patch-floating tag — uniquely identifies "the 0.20.x line" even after - # the minor-floating v1 advances to a future release. - - "omni-cuda-v1.1" # floats across DLC patches in the v1.1 line (auto-accepts security patches) + # v1 is listed first because tags[0] feeds the {{ images.latest_vllm_omni_* }} + # macros and the available_images.md "primary URL" cell. When a future DLC + # release ships, drop `omni-cuda-v1` from this yaml (it'll move to the new + # release's yaml) and keep only `omni-cuda-v1.1` here — same pattern as the + # 0.18.0 yaml currently uses. - "omni-cuda-v1" # floats across DLC minor versions (currently → DLC v1.1, vllm-omni 0.20.0) + - "omni-cuda-v1.1" # floats across DLC patches in the v1.1 line (auto-accepts security patches) announcements: - "Bumps vLLM-Omni to 0.20.0 and aligns with upstream vLLM v0.20.0" diff --git a/docs/src/data/vllm-omni/0.20.0-gpu-sagemaker.yml b/docs/src/data/vllm-omni/0.20.0-gpu-sagemaker.yml index f884ae7cbc60..f2ddcec54260 100644 --- a/docs/src/data/vllm-omni/0.20.0-gpu-sagemaker.yml +++ b/docs/src/data/vllm-omni/0.20.0-gpu-sagemaker.yml @@ -9,11 +9,13 @@ platform: sagemaker public_registry: true tags: - # v1.1 is listed first so the available_images.md table renders the - # patch-floating tag — uniquely identifies "the 0.20.x line" even after - # the minor-floating v1 advances to a future release. - - "omni-sagemaker-cuda-v1.1" # floats across DLC patches in the v1.1 line (auto-accepts security patches) + # v1 is listed first because tags[0] feeds the {{ images.latest_vllm_omni_* }} + # macros and the available_images.md "primary URL" cell. 
When a future DLC + # release ships, drop `omni-sagemaker-cuda-v1` from this yaml (it'll move to + # the new release's yaml) and keep only `omni-sagemaker-cuda-v1.1` here — + # same pattern as the 0.18.0 yaml currently uses. - "omni-sagemaker-cuda-v1" # floats across DLC minor versions (currently → DLC v1.1, vllm-omni 0.20.0) + - "omni-sagemaker-cuda-v1.1" # floats across DLC patches in the v1.1 line (auto-accepts security patches) announcements: - "Bumps vLLM-Omni to 0.20.0 and aligns with upstream vLLM v0.20.0" From dd625d6ffa361bf343e6562e79908956c9134093 Mon Sep 17 00:00:00 2001 From: Yadan Wei Date: Wed, 13 May 2026 14:12:03 -0700 Subject: [PATCH 07/10] docs(vllm-omni): decouple per-release tags from minor-floating v1 tag MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Stop listing `omni-cuda-v1` / `omni-sagemaker-cuda-v1` in per-release docs/src/data/vllm-omni/0.20.0-gpu-{ec2,sagemaker}.yml. Each release's yaml now lists only its patch-floating tag (`v1.1` for 0.20.0; `v1.0` already this way in 0.18.0). The minor-floating `v1` tag is still documented prominently in docs/vllm-omni/index.md (Pull Commands "latest supported" + Versioning and Tags section), but it isn't a per-release identifier — it points at whichever release is currently the v1-line target. Hardcoding the v1 pull URL in index.md (instead of using the `{{ images.latest_vllm_omni_* }}` macro that reads `tags[0]`) makes the prose source-of-truth for the floater, decoupled from per-release yaml metadata. Why this is better: - available_images.md table is now self-consistent — every row shows the release's patch-floating tag, no asymmetry between current and previous releases. - Self-correcting: when a future DLC release ships, no edits to the 0.20.0 yaml are required to remove `v1` (since it was never there). Today's convention required "drop v1 from old yaml on next release", easy to forget. 
- Decoupled concerns: yamls own per-release metadata, prose owns the floating-tag story. Verified locally with `python docs/src/main.py && mkdocs serve`: - reference/available_images table: 0.20.0 → v1.1, 0.18.0 → v1.0 - releasenotes/vllm-omni-0.20.0-*: only v1.1 URIs (no longer v1) - vllm-omni/index.md Pull Commands: both v1 (latest) and v1.1 (patch-stable) tags shown for EC2 + SageMaker - vllm-omni/index.md Versioning section table: unchanged README.md What's New entry for 0.20.0: bumped from v1 to v1.1 to match the 0.18.0 entry's pattern (per-release rows always show the patch-floating, durable identifier). Co-Authored-By: Claude Opus 4.7 (1M context) Signed-off-by: Yadan Wei --- README.md | 2 +- docs/src/data/vllm-omni/0.20.0-gpu-ec2.yml | 12 ++++++------ docs/src/data/vllm-omni/0.20.0-gpu-sagemaker.yml | 12 ++++++------ docs/vllm-omni/index.md | 4 ++-- 4 files changed, 15 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index ea9601d493c7..4d7579cdecb6 100644 --- a/README.md +++ b/README.md @@ -28,7 +28,7 @@ ______________________________________________________________________ ### 🚀 Release Highlights -- **[2026/05/13]** [vLLM-Omni v0.20.0](https://gallery.ecr.aws/deep-learning-containers/vllm) — EC2: `omni-cuda-v1` · SageMaker: `omni-sagemaker-cuda-v1` · Adds `/v1/audio/generate` (stable-audio-open) and `/v1/videos/sync` (unblocks video on SageMaker); supports CosyVoice3, ERNIE-Image-Turbo, Wan2.1-VACE-1.3B; CUDA 13.0 + PyTorch 2.11.0. +- **[2026/05/13]** [vLLM-Omni v0.20.0](https://gallery.ecr.aws/deep-learning-containers/vllm) — EC2: `omni-cuda-v1.1` · SageMaker: `omni-sagemaker-cuda-v1.1` · Adds `/v1/audio/generate` (stable-audio-open) and `/v1/videos/sync` (unblocks video on SageMaker); supports CosyVoice3, ERNIE-Image-Turbo, Wan2.1-VACE-1.3B; CUDA 13.0 + PyTorch 2.11.0. 
- **[2026/05/11]** [vLLM v0.20.2](https://gallery.ecr.aws/deep-learning-containers/vllm) — EC2: `0.20.2-gpu-py312-ec2` · SageMaker: `0.20.2-gpu-py312` · Bug fixes for DeepSeek V4. - **[2026/05/06]** [SGLang v0.5.11](https://gallery.ecr.aws/deep-learning-containers/sglang) — EC2: `0.5.11-gpu-py312-ec2` · SageMaker: `0.5.11-gpu-py312` · Model support for Gemma 4, GLM-5.1, Qwen 3.4, and more - **[2026/05/05]** [vLLM v0.20.1](https://gallery.ecr.aws/deep-learning-containers/vllm) — EC2: `0.20.1-gpu-py312-ec2` · SageMaker: `0.20.1-gpu-py312` · Bug fixes for DeepSeek V4. diff --git a/docs/src/data/vllm-omni/0.20.0-gpu-ec2.yml b/docs/src/data/vllm-omni/0.20.0-gpu-ec2.yml index 3789513fe954..2f5827d20928 100644 --- a/docs/src/data/vllm-omni/0.20.0-gpu-ec2.yml +++ b/docs/src/data/vllm-omni/0.20.0-gpu-ec2.yml @@ -9,12 +9,12 @@ platform: default public_registry: true tags: - # v1 is listed first because tags[0] feeds the {{ images.latest_vllm_omni_* }} - # macros and the available_images.md "primary URL" cell. When a future DLC - # release ships, drop `omni-cuda-v1` from this yaml (it'll move to the new - # release's yaml) and keep only `omni-cuda-v1.1` here — same pattern as the - # 0.18.0 yaml currently uses. - - "omni-cuda-v1" # floats across DLC minor versions (currently → DLC v1.1, vllm-omni 0.20.0) + # Only the patch-floating tag is listed per release. The minor-floating + # `omni-cuda-v1` tag is documented in docs/vllm-omni/index.md (Pull Commands + + # Versioning and Tags) but isn't a per-release identifier — it points at + # whichever release is currently the v1-line target. Releases that hold + # only their patch-floating tag in this yaml (this convention) auto-correct + # when the v1 floater advances; no yaml edits needed. 
- "omni-cuda-v1.1" # floats across DLC patches in the v1.1 line (auto-accepts security patches) announcements: diff --git a/docs/src/data/vllm-omni/0.20.0-gpu-sagemaker.yml b/docs/src/data/vllm-omni/0.20.0-gpu-sagemaker.yml index f2ddcec54260..7511478b14b6 100644 --- a/docs/src/data/vllm-omni/0.20.0-gpu-sagemaker.yml +++ b/docs/src/data/vllm-omni/0.20.0-gpu-sagemaker.yml @@ -9,12 +9,12 @@ platform: sagemaker public_registry: true tags: - # v1 is listed first because tags[0] feeds the {{ images.latest_vllm_omni_* }} - # macros and the available_images.md "primary URL" cell. When a future DLC - # release ships, drop `omni-sagemaker-cuda-v1` from this yaml (it'll move to - # the new release's yaml) and keep only `omni-sagemaker-cuda-v1.1` here — - # same pattern as the 0.18.0 yaml currently uses. - - "omni-sagemaker-cuda-v1" # floats across DLC minor versions (currently → DLC v1.1, vllm-omni 0.20.0) + # Only the patch-floating tag is listed per release. The minor-floating + # `omni-sagemaker-cuda-v1` tag is documented in docs/vllm-omni/index.md + # (Pull Commands + Versioning and Tags) but isn't a per-release identifier — + # it points at whichever release is currently the v1-line target. Releases + # that hold only their patch-floating tag in this yaml (this convention) + # auto-correct when the v1 floater advances; no yaml edits needed. 
- "omni-sagemaker-cuda-v1.1" # floats across DLC patches in the v1.1 line (auto-accepts security patches) announcements: diff --git a/docs/vllm-omni/index.md b/docs/vllm-omni/index.md index 677ca43725b3..33d8673f3474 100644 --- a/docs/vllm-omni/index.md +++ b/docs/vllm-omni/index.md @@ -18,7 +18,7 @@ SageMaker routing middleware for dispatching `/invocations` to any omni endpoint **EC2** — latest supported (floats across DLC minor versions; auto-upgrades on next pull): ```bash -docker pull {{ images.latest_vllm_omni_ec2 }} +docker pull 763104351884.dkr.ecr.us-west-2.amazonaws.com/vllm:omni-cuda-v1 ``` **EC2** — patch-stable (recommended for production; auto-accepts DLC security patches in the v1.1 line, declines new DLC minor releases): @@ -30,7 +30,7 @@ docker pull 763104351884.dkr.ecr.us-west-2.amazonaws.com/vllm:omni-cuda-v1.1 **SageMaker** — latest supported: ```bash -docker pull {{ images.latest_vllm_omni_sagemaker }} +docker pull 763104351884.dkr.ecr.us-west-2.amazonaws.com/vllm:omni-sagemaker-cuda-v1 ``` **SageMaker** — patch-stable: From e78696d016067169b0a83bef7a2a0c516eb301d0 Mon Sep 17 00:00:00 2001 From: Yadan Wei Date: Wed, 13 May 2026 14:44:09 -0700 Subject: [PATCH 08/10] docs(vllm-omni): adopt vLLM-server's four-tier Pin a Version table MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the bespoke two-section "Pull Commands + Versioning and Tags" prose with the cleaner four-tier convention used by the public vLLM-server docs. Maps directly to the suffix structure customers already see in vllm-server pull commands. Pull Commands now show only the bare base tags (omni-cuda / omni-sagemaker-cuda) — "give me whatever ships". 
The Pin a Version section enumerates the four tiers in one table: | Suffix | Example | Updates when | |---------------------------|------------------|-----------------------------------------------| | (none) | omni-cuda | Any release, including breaking changes | | -v<major> | omni-cuda-v1 | New features and fixes, no breaking changes | | -v<major>.<minor> | omni-cuda-v1.1 | Security patches and bug fixes only | | -v<major>.<minor>.<patch> | omni-cuda-v1.1.0 | Never — immutable snapshot | Production recommendation (pin to -v<major>.<minor>) calls out the Code2Wav un-batching regression as the concrete example of why patch-stable insulates production from feature-release surprises. Switches both Pull Commands URIs from the private 763... ECR to public.ecr.aws/deep-learning-containers/vllm to match the vLLM-server docs convention (private ECR is in the per-region table on available_images.md). Removes the now-obsolete tag-history table — Pin a Version handles the same information through suffix semantics. Verified locally with `python docs/src/main.py && mkdocs serve`: - Pull Commands: bare omni-cuda and omni-sagemaker-cuda URIs - Pin a Version: 4-row suffix table with examples + update semantics - Section order: Latest Announcements -> Pull Commands -> Pin a Version -> Packages -> Supported Modalities -> ... Existing example scripts (deploy_tts.py, deploy_tts_async.py, deploy_video_sync.py) keep their -v1 URIs unchanged — examples document behavior validated at v1 and don't need to chase the latest tag. 
Co-Authored-By: Claude Opus 4.7 (1M context) Signed-off-by: Yadan Wei --- docs/vllm-omni/index.md | 63 ++++++++++++++--------------------------- 1 file changed, 21 insertions(+), 42 deletions(-) diff --git a/docs/vllm-omni/index.md b/docs/vllm-omni/index.md index 33d8673f3474..095ea6fc8c51 100644 --- a/docs/vllm-omni/index.md +++ b/docs/vllm-omni/index.md @@ -15,66 +15,45 @@ SageMaker routing middleware for dispatching `/invocations` to any omni endpoint ## Pull Commands -**EC2** — latest supported (floats across DLC minor versions; auto-upgrades on next pull): +**Multimodal (TTS, image/video/audio generation, omni chat) on EC2 / EKS:** ```bash -docker pull 763104351884.dkr.ecr.us-west-2.amazonaws.com/vllm:omni-cuda-v1 +docker pull public.ecr.aws/deep-learning-containers/vllm:omni-cuda ``` -**EC2** — patch-stable (recommended for production; auto-accepts DLC security patches in the v1.1 line, declines new DLC minor releases): +**Multimodal on Amazon SageMaker AI:** ```bash -docker pull 763104351884.dkr.ecr.us-west-2.amazonaws.com/vllm:omni-cuda-v1.1 +docker pull public.ecr.aws/deep-learning-containers/vllm:omni-sagemaker-cuda ``` -**SageMaker** — latest supported: +See [Available Images](../reference/available_images.md) for all image URIs and [Getting Started](../get_started/index.md) for authentication +instructions. -```bash -docker pull 763104351884.dkr.ecr.us-west-2.amazonaws.com/vllm:omni-sagemaker-cuda-v1 -``` - -**SageMaker** — patch-stable: - -```bash -docker pull 763104351884.dkr.ecr.us-west-2.amazonaws.com/vllm:omni-sagemaker-cuda-v1.1 -``` +## Pin a Version -See [Available Images](../reference/available_images.md) for all image URIs, [Versioning and Tags](#versioning-and-tags) below for the convention, and -[Getting Started](../get_started/index.md) for authentication instructions. 
+Append a version suffix to the base tag to control update behavior: -## Versioning and Tags - -vLLM-Omni image tags follow a **DLC-level** semantic versioning convention (independent of the bundled vllm-omni upstream version): - -- **DLC major (`v1`, `v2`, …)** — incompatible/breaking changes in the DLC itself: image API, entrypoint, removed routes, pinned framework majors. - Customer code may need updating when the DLC major bumps. -- **DLC minor (`v1.0`, `v1.1`, …)** — DLC release tracking new upstream vllm-omni features (e.g., a new endpoint), still API-compatible at the DLC - level. May introduce behavioral changes in the bundled engine. -- **DLC patch** — security patches and bug fixes layered on top of an existing release without bumping the bundled vllm-omni version. Same tag, new - image digest. +| Suffix | Example | Updates when | +| --- | --- | --- | +| (none) | `omni-cuda` | Any release, including breaking changes | +| `-v<major>` | `omni-cuda-v1` | New features and fixes, no breaking changes | +| `-v<major>.<minor>` | `omni-cuda-v1.1` | Security patches and bug fixes only | +| `-v<major>.<minor>.<patch>` | `omni-cuda-v1.1.0` | Never — immutable snapshot | -Two tag tiers, both floating, are exposed to customers: +The same suffixes apply to the SageMaker base tag (`omni-sagemaker-cuda`). -- **Minor-floating tags** (`omni-cuda-v1`, `omni-sagemaker-cuda-v1`) — track the latest DLC release within a major line. Auto-upgrade across DLC minor - *and* patch updates on `docker pull`. Best for development, quick-starts, and "give me whatever is supported right now". -- **Patch-floating tags** (`omni-cuda-v1.1`, `omni-sagemaker-cuda-v1.1`) — follow only the DLC patch stream within one minor release. They auto-accept - security patches and bug fixes, but decline new DLC minor releases that could change behavior.
Recommended for production: customers pinned here - would have been insulated from the Code2Wav un-batching regression that landed with the DLC `v1.1` minor bump (see - [Known Limitations](#known-limitations) below) until they were ready to evaluate it. +**Recommended for production:** pin to `-v<major>.<minor>` (e.g., `omni-cuda-v1.1`). It auto-accepts security patches and bug fixes within the +0.20-line release while declining new minor releases that could change behavior — customers pinned here would have been insulated from the Code2Wav +un-batching regression that landed with the v1.1 minor bump (see [Known Limitations](#known-limitations) below) until they were ready to evaluate it. -If your workload requires byte-identical reproducibility — i.e., declining even DLC patches — pull by digest instead of tag: +For byte-identical reproducibility, pull by digest: ```bash -docker pull 763104351884.dkr.ecr.us-west-2.amazonaws.com/vllm@sha256: +docker pull public.ecr.aws/deep-learning-containers/vllm@sha256:<digest> ``` -`docker inspect ` or `docker pull` output prints the digest of the image you currently have. Pulls by digest never change. - -| Tag | Tracks | Currently points at | -| --- | --- | --- | -| `omni-cuda-v1` / `omni-sagemaker-cuda-v1` | latest DLC release in v1 line (minor + patch) | DLC `v1.1` (vllm-omni 0.20.0) | -| `omni-cuda-v1.0` / `omni-sagemaker-cuda-v1.0` | DLC v1.0 patch stream (vllm-omni 0.18.0 + DLC patches) | latest v1.0.x DLC patch | -| `omni-cuda-v1.1` / `omni-sagemaker-cuda-v1.1` | DLC v1.1 patch stream (vllm-omni 0.20.0 + DLC patches) | latest v1.1.x DLC patch | +`docker inspect <image>` prints the digest of the image you have. Pulls by digest never change.
## Packages From 7f76bf7dd90dc76d94715cd27e54b16fc5fd4137 Mon Sep 17 00:00:00 2001 From: Yadan Wei Date: Wed, 13 May 2026 14:46:43 -0700 Subject: [PATCH 09/10] docs(vllm-omni): add private ECR pull commands alongside public ECR Pull Commands section now shows both registry options for each deployment target: - Public ECR (anonymous pull): public.ecr.aws/deep-learning-containers/vllm - Private DLC ECR (authenticated): 763104351884.dkr.ecr.<region>.amazonaws.com/vllm Customers running on AWS infrastructure (EC2/EKS/SageMaker) typically prefer the private ECR for better network locality and IAM-controlled access; public ECR is the right path for local development or workloads outside AWS. A short prologue paragraph explains the auth difference and links to Getting Started for credentials. Per-region URI table still lives in available_images.md. Co-Authored-By: Claude Opus 4.7 (1M context) Signed-off-by: Yadan Wei --- docs/vllm-omni/index.md | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/docs/vllm-omni/index.md b/docs/vllm-omni/index.md index 095ea6fc8c51..d4cdbab3f83f 100644 --- a/docs/vllm-omni/index.md +++ b/docs/vllm-omni/index.md @@ -15,20 +15,30 @@ SageMaker routing middleware for dispatching `/invocations` to any omni endpoint ## Pull Commands +Images are published to both the public ECR gallery (no AWS credentials required) and the private DLC ECR repository (requires +`aws ecr get-login-password`, see [Getting Started](../get_started/index.md)).
+ **Multimodal (TTS, image/video/audio generation, omni chat) on EC2 / EKS:** ```bash +# Public ECR (anonymous pull): docker pull public.ecr.aws/deep-learning-containers/vllm:omni-cuda + +# Private ECR (authenticated; substitute your region): +docker pull 763104351884.dkr.ecr.<region>.amazonaws.com/vllm:omni-cuda ``` **Multimodal on Amazon SageMaker AI:** ```bash +# Public ECR (anonymous pull): docker pull public.ecr.aws/deep-learning-containers/vllm:omni-sagemaker-cuda + +# Private ECR (authenticated; substitute your region): +docker pull 763104351884.dkr.ecr.<region>.amazonaws.com/vllm:omni-sagemaker-cuda ``` -See [Available Images](../reference/available_images.md) for all image URIs and [Getting Started](../get_started/index.md) for authentication -instructions. +See [Available Images](../reference/available_images.md) for the full per-region URI table. ## Pin a Version From 6f09cb5472ab8bf1dbb9e3803b3a44675086dcf0 Mon Sep 17 00:00:00 2001 From: Yadan Wei Date: Wed, 13 May 2026 14:50:11 -0700 Subject: [PATCH 10/10] fix(vllm-omni): correct ECR repo in EC2 example scripts (vllm-omni -> vllm) All six EC2 example shell scripts hardcoded the legacy repo name `vllm-omni:omni-cuda-v1`, but the actual ECR repo for these images is `vllm` (post-#6007's repo unification, also reflected in the docs generator's `ecr_repository: vllm` field and the prod_image config `vllm:omni-cuda-v1`). Customers running these scripts as-is would have hit a "repo does not exist" error from `docker pull`. Fix the IMAGE default in each script: examples/vllm-omni/audio-generate/run.sh examples/vllm-omni/image/run.sh examples/vllm-omni/qwen2.5-omni/run.sh examples/vllm-omni/tts/run.sh examples/vllm-omni/video-sync/run.sh examples/vllm-omni/video/run.sh The three SageMaker python examples (deploy_tts.py, deploy_tts_async.py, deploy_video_sync.py) already used the correct `vllm:omni-sagemaker-cuda-v1` repo path.
Co-Authored-By: Claude Opus 4.7 (1M context) Signed-off-by: Yadan Wei --- examples/vllm-omni/audio-generate/run.sh | 2 +- examples/vllm-omni/image/run.sh | 2 +- examples/vllm-omni/qwen2.5-omni/run.sh | 2 +- examples/vllm-omni/tts/run.sh | 2 +- examples/vllm-omni/video-sync/run.sh | 2 +- examples/vllm-omni/video/run.sh | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/examples/vllm-omni/audio-generate/run.sh b/examples/vllm-omni/audio-generate/run.sh index 9d0eb25f592e..4252e7b9cdc7 100755 --- a/examples/vllm-omni/audio-generate/run.sh +++ b/examples/vllm-omni/audio-generate/run.sh @@ -4,7 +4,7 @@ # Distinct from /v1/audio/speech (which is TTS — a voice reading words). set -euo pipefail -IMAGE="${IMAGE:-763104351884.dkr.ecr.us-west-2.amazonaws.com/vllm-omni:omni-cuda-v1}" +IMAGE="${IMAGE:-763104351884.dkr.ecr.us-west-2.amazonaws.com/vllm:omni-cuda-v1}" MODEL="${MODEL:-stabilityai/stable-audio-open-1.0}" NAME="${NAME:-omni-audio-generate}" diff --git a/examples/vllm-omni/image/run.sh b/examples/vllm-omni/image/run.sh index d6dc1615ad95..28008fb19d3a 100755 --- a/examples/vllm-omni/image/run.sh +++ b/examples/vllm-omni/image/run.sh @@ -2,7 +2,7 @@ # End-to-end image-generation example: start server, wait for ready, generate. set -euo pipefail -IMAGE="${IMAGE:-763104351884.dkr.ecr.us-west-2.amazonaws.com/vllm-omni:omni-cuda-v1}" +IMAGE="${IMAGE:-763104351884.dkr.ecr.us-west-2.amazonaws.com/vllm:omni-cuda-v1}" MODEL="${MODEL:-black-forest-labs/FLUX.2-klein-4B}" NAME="${NAME:-omni-image}" diff --git a/examples/vllm-omni/qwen2.5-omni/run.sh b/examples/vllm-omni/qwen2.5-omni/run.sh index a04624bdf99d..98e020cb6ff6 100755 --- a/examples/vllm-omni/qwen2.5-omni/run.sh +++ b/examples/vllm-omni/qwen2.5-omni/run.sh @@ -6,7 +6,7 @@ # On single-GPU hosts the model's talker stage fails to load on GPU 1. 
set -euo pipefail -IMAGE="${IMAGE:-763104351884.dkr.ecr.us-west-2.amazonaws.com/vllm-omni:omni-cuda-v1}" +IMAGE="${IMAGE:-763104351884.dkr.ecr.us-west-2.amazonaws.com/vllm:omni-cuda-v1}" MODEL="${MODEL:-Qwen/Qwen2.5-Omni-3B}" NAME="${NAME:-omni3b}" diff --git a/examples/vllm-omni/tts/run.sh b/examples/vllm-omni/tts/run.sh index 9f4f185a2139..3af3837ed3e4 100755 --- a/examples/vllm-omni/tts/run.sh +++ b/examples/vllm-omni/tts/run.sh @@ -3,7 +3,7 @@ # Requires: docker (with NVIDIA runtime), curl, an authenticated ECR pull. set -euo pipefail -IMAGE="${IMAGE:-763104351884.dkr.ecr.us-west-2.amazonaws.com/vllm-omni:omni-cuda-v1}" +IMAGE="${IMAGE:-763104351884.dkr.ecr.us-west-2.amazonaws.com/vllm:omni-cuda-v1}" MODEL="${MODEL:-Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice}" NAME="${NAME:-omni-tts}" diff --git a/examples/vllm-omni/video-sync/run.sh b/examples/vllm-omni/video-sync/run.sh index 2b1456724e9d..e3200a0e2a24 100755 --- a/examples/vllm-omni/video-sync/run.sh +++ b/examples/vllm-omni/video-sync/run.sh @@ -4,7 +4,7 @@ # raw MP4 bytes — no job-ID polling needed, unlike async /v1/videos. set -euo pipefail -IMAGE="${IMAGE:-763104351884.dkr.ecr.us-west-2.amazonaws.com/vllm-omni:omni-cuda-v1}" +IMAGE="${IMAGE:-763104351884.dkr.ecr.us-west-2.amazonaws.com/vllm:omni-cuda-v1}" MODEL="${MODEL:-Wan-AI/Wan2.1-T2V-1.3B-Diffusers}" NAME="${NAME:-omni-video-sync}" diff --git a/examples/vllm-omni/video/run.sh b/examples/vllm-omni/video/run.sh index 36db972d82f3..7c716dd2e0fc 100755 --- a/examples/vllm-omni/video/run.sh +++ b/examples/vllm-omni/video/run.sh @@ -3,7 +3,7 @@ # /v1/videos is async — it returns a job ID; the MP4 is produced in the background. set -euo pipefail -IMAGE="${IMAGE:-763104351884.dkr.ecr.us-west-2.amazonaws.com/vllm-omni:omni-cuda-v1}" +IMAGE="${IMAGE:-763104351884.dkr.ecr.us-west-2.amazonaws.com/vllm:omni-cuda-v1}" MODEL="${MODEL:-Wan-AI/Wan2.1-T2V-1.3B-Diffusers}" NAME="${NAME:-omni-video}"