diff --git a/README.md b/README.md index 8f9bae83c8d7..4d7579cdecb6 100644 --- a/README.md +++ b/README.md @@ -28,11 +28,13 @@ ______________________________________________________________________ ### ๐Ÿš€ Release Highlights +- **[2026/05/13]** [vLLM-Omni v0.20.0](https://gallery.ecr.aws/deep-learning-containers/vllm) โ€” EC2: `omni-cuda-v1.1` ยท SageMaker: `omni-sagemaker-cuda-v1.1` ยท Adds `/v1/audio/generate` (stable-audio-open) and `/v1/videos/sync` (unblocks video on SageMaker); supports CosyVoice3, ERNIE-Image-Turbo, Wan2.1-VACE-1.3B; CUDA 13.0 + PyTorch 2.11.0. - **[2026/05/11]** [vLLM v0.20.2](https://gallery.ecr.aws/deep-learning-containers/vllm) โ€” EC2: `0.20.2-gpu-py312-ec2` ยท SageMaker: `0.20.2-gpu-py312` ยท Bug fixes for DeepSeek V4. - **[2026/05/06]** [SGLang v0.5.11](https://gallery.ecr.aws/deep-learning-containers/sglang) โ€” EC2: `0.5.11-gpu-py312-ec2` ยท SageMaker: `0.5.11-gpu-py312` ยท Model support for Gemma 4, GLM-5.1, Qwen 3.4, and more - **[2026/05/05]** [vLLM v0.20.1](https://gallery.ecr.aws/deep-learning-containers/vllm) โ€” EC2: `0.20.1-gpu-py312-ec2` ยท SageMaker: `0.20.1-gpu-py312` ยท Bug fixes for DeepSeek V4. - **[2026/04/30]** [PyTorch v2.11.0](https://gallery.ecr.aws/deep-learning-containers/pytorch) โ€” EC2: `2.11.0-cu130-amzn2023` ยท SageMaker: `2.11.0-cu130-amzn2023-sagemaker` ยท Amazon Linux 2023 with EFA, flash-attn, and transformer-engine. - **[2026/04/28]** [vLLM v0.20.0](https://gallery.ecr.aws/deep-learning-containers/vllm) โ€” EC2: `0.20.0-gpu-py312-ec2` ยท SageMaker: `0.20.0-gpu-py312` ยท Introduces support for DeepSeek V4. +- **[2026/04/24]** [vLLM-Omni v0.18.0](https://gallery.ecr.aws/deep-learning-containers/vllm) โ€” EC2: `omni-cuda-v1.0` ยท SageMaker: `omni-sagemaker-cuda-v1.0` ยท Initial release. Serves omni-modality models (TTS, image, video, multimodal chat) through OpenAI-compatible APIs; SageMaker routing middleware via `CustomAttributes`. 
- **[2026/04/20]** [vLLM v0.19.1](https://gallery.ecr.aws/deep-learning-containers/vllm) โ€” EC2: `0.19-gpu-py312-ec2` ยท SageMaker: `0.19-gpu-py312` ยท This upgrades Transformers to 5.5.4, enabling Gemma 4 support. ### ๐Ÿ“ข Support Updates diff --git a/docs/src/data/vllm-omni/0.18.0-gpu-ec2.yml b/docs/src/data/vllm-omni/0.18.0-gpu-ec2.yml index a6bc7ec8b859..696881d9ad22 100644 --- a/docs/src/data/vllm-omni/0.18.0-gpu-ec2.yml +++ b/docs/src/data/vllm-omni/0.18.0-gpu-ec2.yml @@ -9,7 +9,7 @@ platform: default public_registry: true tags: - - "omni-cuda-v1" + - "omni-cuda-v1.0" announcements: - "Initial release of vLLM-Omni containers for EC2, ECS, EKS" diff --git a/docs/src/data/vllm-omni/0.18.0-gpu-sagemaker.yml b/docs/src/data/vllm-omni/0.18.0-gpu-sagemaker.yml index bb61f8a78299..9953790bf81f 100644 --- a/docs/src/data/vllm-omni/0.18.0-gpu-sagemaker.yml +++ b/docs/src/data/vllm-omni/0.18.0-gpu-sagemaker.yml @@ -9,7 +9,7 @@ platform: sagemaker public_registry: true tags: - - "omni-sagemaker-cuda-v1" + - "omni-sagemaker-cuda-v1.0" announcements: - "Initial release of vLLM-Omni containers for SageMaker" diff --git a/docs/src/data/vllm-omni/0.20.0-gpu-ec2.yml b/docs/src/data/vllm-omni/0.20.0-gpu-ec2.yml new file mode 100644 index 000000000000..2f5827d20928 --- /dev/null +++ b/docs/src/data/vllm-omni/0.20.0-gpu-ec2.yml @@ -0,0 +1,35 @@ +framework: vLLM-Omni +version: "0.20.0" +ecr_repository: vllm +accelerator: gpu +python: py312 +cuda: cu130 +os: amzn2023 +platform: default +public_registry: true + +tags: + # Only the patch-floating tag is listed per release. The minor-floating + # `omni-cuda-v1` tag is documented in docs/vllm-omni/index.md (Pull Commands + + # Versioning and Tags) but isn't a per-release identifier โ€” it points at + # whichever release is currently the v1-line target. Releases that hold + # only their patch-floating tag in this yaml (this convention) auto-correct + # when the v1 floater advances; no yaml edits needed. 
+ - "omni-cuda-v1.1" # floats across DLC patches in the v1.1 line (auto-accepts security patches) + +announcements: + - "Bumps vLLM-Omni to 0.20.0 and aligns with upstream vLLM v0.20.0" + - "CUDA 12.9 โ†’ 13.0 base image; PyTorch 2.10.0 โ†’ 2.11.0" + - "New `/v1/audio/generate` endpoint for diffusion-based audio generation (e.g., stable-audio-open)" + - "New `/v1/videos/sync` endpoint โ€” blocking variant of `/v1/videos` that returns the MP4 directly" + - "Adds support for CosyVoice3, ERNIE-Image-Turbo, Wan2.1-VACE-1.3B, and Stable-Audio-Open-1.0" + +packages: + vllm: "0.20.0" + vllm-omni: "0.20.0" + pytorch: "2.11.0" + torchvision: "0.26.0" + torchaudio: "2.11.0" + cuda: "13.0.2" + flashinfer: "0.6.8.post1" + efa: "1.47.0" diff --git a/docs/src/data/vllm-omni/0.20.0-gpu-sagemaker.yml b/docs/src/data/vllm-omni/0.20.0-gpu-sagemaker.yml new file mode 100644 index 000000000000..7511478b14b6 --- /dev/null +++ b/docs/src/data/vllm-omni/0.20.0-gpu-sagemaker.yml @@ -0,0 +1,35 @@ +framework: vLLM-Omni +version: "0.20.0" +ecr_repository: vllm +accelerator: gpu +python: py312 +cuda: cu130 +os: amzn2023 +platform: sagemaker +public_registry: true + +tags: + # Only the patch-floating tag is listed per release. The minor-floating + # `omni-sagemaker-cuda-v1` tag is documented in docs/vllm-omni/index.md + # (Pull Commands + Versioning and Tags) but isn't a per-release identifier โ€” + # it points at whichever release is currently the v1-line target. Releases + # that hold only their patch-floating tag in this yaml (this convention) + # auto-correct when the v1 floater advances; no yaml edits needed. 
+ - "omni-sagemaker-cuda-v1.1" # floats across DLC patches in the v1.1 line (auto-accepts security patches) + +announcements: + - "Bumps vLLM-Omni to 0.20.0 and aligns with upstream vLLM v0.20.0" + - "CUDA 12.9 โ†’ 13.0 base image; PyTorch 2.10.0 โ†’ 2.11.0" + - "Video generation now supported on SageMaker via the new `/v1/videos/sync` endpoint" + - "Adds `/v1/audio/generate` and `/v1/videos/sync` to the routing middleware" + - "Adds support for CosyVoice3, ERNIE-Image-Turbo, Wan2.1-VACE-1.3B, and Stable-Audio-Open-1.0" + +packages: + vllm: "0.20.0" + vllm-omni: "0.20.0" + pytorch: "2.11.0" + torchvision: "0.26.0" + torchaudio: "2.11.0" + cuda: "13.0.2" + flashinfer: "0.6.8.post1" + efa: "1.47.0" diff --git a/docs/vllm-omni/index.md b/docs/vllm-omni/index.md index ba7e5ffb4689..d4cdbab3f83f 100644 --- a/docs/vllm-omni/index.md +++ b/docs/vllm-omni/index.md @@ -1,29 +1,69 @@ # vLLM-Omni Inference -Pre-built Docker images for serving omni-modality models (text-to-speech, image generation, video generation, and multimodal chat) with -[vLLM-Omni](https://github.com/vllm-project/vllm-omni). Built on Amazon Linux 2023 with CUDA 12.9 and Python 3.12. +Pre-built Docker images for serving omni-modality models (text-to-speech, audio generation, image generation, video generation, and multimodal chat) +with [vLLM-Omni](https://github.com/vllm-project/vllm-omni). Built on Amazon Linux 2023 with CUDA 13.0 and Python 3.12. ## Latest Announcements +**May 12, 2026** โ€” vLLM-Omni 0.20.0 release. Aligns with upstream vLLM v0.20.0; bumps CUDA to 13.0 and PyTorch to 2.11.0. Adds two new endpoints: +`/v1/audio/generate` for diffusion-based audio generation (e.g., stable-audio-open) and `/v1/videos/sync` โ€” a blocking variant of `/v1/videos` that +returns the MP4 directly and unblocks video generation on SageMaker. New supported models: CosyVoice3, ERNIE-Image-Turbo, Wan2.1-VACE-1.3B, +Stable-Audio-Open-1.0. + **April 24, 2026** โ€” vLLM-Omni 0.18.0 initial release. 
Serves TTS, image, video, and omni-chat models through OpenAI-compatible APIs. Includes a SageMaker routing middleware for dispatching `/invocations` to any omni endpoint via `CustomAttributes`. ## Pull Commands -**EC2:** +Images are published to both the public ECR gallery (no AWS credentials required) and the private DLC ECR repository (requires +`aws ecr get-login-password`, see [Getting Started](../get_started/index.md)). + +**Multimodal (TTS, image/video/audio generation, omni chat) on EC2 / EKS:** ```bash -docker pull {{ images.latest_vllm_omni_ec2 }} +# Public ECR (anonymous pull): +docker pull public.ecr.aws/deep-learning-containers/vllm:omni-cuda + +# Private ECR (authenticated; substitute your region): +docker pull 763104351884.dkr.ecr.<region>.amazonaws.com/vllm:omni-cuda ``` -**SageMaker:** +**Multimodal on Amazon SageMaker AI:** + +```bash +# Public ECR (anonymous pull): +docker pull public.ecr.aws/deep-learning-containers/vllm:omni-sagemaker-cuda + +# Private ECR (authenticated; substitute your region): +docker pull 763104351884.dkr.ecr.<region>.amazonaws.com/vllm:omni-sagemaker-cuda +``` + +See [Available Images](../reference/available_images.md) for the full per-region URI table. + +## Pin a Version + +Append a version suffix to the base tag to control update behavior: + +| Suffix | Example | Updates when | +| --- | --- | --- | +| (none) | `omni-cuda` | Any release, including breaking changes | +| `-v<major>` | `omni-cuda-v1` | New features and fixes, no breaking changes | +| `-v<major>.<minor>` | `omni-cuda-v1.1` | Security patches and bug fixes only | +| `-v<major>.<minor>.<patch>` | `omni-cuda-v1.1.0` | Never — immutable snapshot | + +The same suffixes apply to the SageMaker base tag (`omni-sagemaker-cuda`). + +**Recommended for production:** pin to `-v<major>.<minor>` (e.g., `omni-cuda-v1.1`). 
It auto-accepts security patches and bug fixes within the +0.20-line release while declining new minor releases that could change behavior — customers pinned to `omni-cuda-v1.0` would have been insulated from the Code2Wav +un-batching regression that landed with the v1.1 minor bump (see [Known Limitations](#known-limitations) below) until they were ready to evaluate it. + +For byte-identical reproducibility, pull by digest: ```bash -docker pull {{ images.latest_vllm_omni_sagemaker }} +docker pull public.ecr.aws/deep-learning-containers/vllm@sha256:<digest> ``` -See [Available Images](../reference/available_images.md) for all image URIs and [Getting Started](../get_started/index.md) for authentication -instructions. +`docker inspect <image>` prints the digest of the image you have. Pulls by digest never change. ## Packages @@ -31,11 +71,13 @@ For package versions included in each release, see the [Release Notes](../releas ## Supported Modalities -| Modality | Route | Example Model | +| Modality | Route | Example Models | | --- | --- | --- | -| Text-to-Speech | `/v1/audio/speech` | `Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice` | -| Image Generation | `/v1/images/generations` | `black-forest-labs/FLUX.2-klein-4B` | -| Video Generation | `/v1/videos` | `Wan-AI/Wan2.1-T2V-1.3B-Diffusers` | +| Text-to-Speech | `/v1/audio/speech` | `Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice`, `Qwen/Qwen3-TTS-12Hz-1.7B-Base`, `FunAudioLLM/CosyVoice3-0.5B` | +| Audio Generation | `/v1/audio/generate` (new in 0.20.0) | `stabilityai/stable-audio-open-1.0` | +| Image Generation | `/v1/images/generations` | `black-forest-labs/FLUX.2-klein-4B`, `baidu/ERNIE-Image-Turbo` | +| Video Generation (async) | `/v1/videos` | `Wan-AI/Wan2.1-T2V-1.3B-Diffusers`, `Wan-AI/Wan2.1-VACE-1.3B-Diffusers` | +| Video Generation (sync) | `/v1/videos/sync` (new in 0.20.0) | `Wan-AI/Wan2.1-T2V-1.3B-Diffusers`, `Wan-AI/Wan2.1-VACE-1.3B-Diffusers` | | Multimodal Chat | `/v1/chat/completions` | `bytedance-research/BAGEL-7B-MoT`, `Qwen/Qwen2.5-Omni-3B` | ## 
Model Compatibility @@ -59,15 +101,35 @@ starts the container, waits for readiness, submits a request, and writes the out **Model:** [Qwen3-TTS-12Hz-1.7B-CustomVoice](https://huggingface.co/Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice) โ€” a 1.7B-parameter Qwen3 text-to-speech model supporting multiple voices and languages, runs on a single 24 GB GPU (A10G / L4). +For voice cloning, use [Qwen3-TTS-12Hz-1.7B-Base](https://huggingface.co/Qwen/Qwen3-TTS-12Hz-1.7B-Base) or +[CosyVoice3-0.5B](https://huggingface.co/FunAudioLLM/CosyVoice3-0.5B) โ€” both accept a reference audio clip plus its transcript and synthesize new +speech in the reference speaker's voice. CosyVoice3 is zero-shot voice-clone only (no preset voices) and requires `--trust-remote-code`. + ```bash --8<-- "examples/vllm-omni/tts/run.sh" ``` +### Audio Generation + +**Model:** [Stable-Audio-Open-1.0](https://huggingface.co/stabilityai/stable-audio-open-1.0) โ€” a diffusion model for text-to-audio (sound effects, +ambience, short music clips), distinct from TTS. Generates up to ~47 seconds of audio per request, runs on a single 24 GB GPU. + +The `/v1/audio/generate` endpoint (new in 0.20.0) takes a text prompt plus diffusion knobs (`audio_length`, `guidance_scale`, `num_inference_steps`, +`seed`) and returns a single binary WAV blob โ€” no streaming. See the +[upstream API spec](https://github.com/vllm-project/vllm-omni/blob/main/docs/serving/audio_generate_api.md) for the full request shape. + +```bash +--8<-- "examples/vllm-omni/audio-generate/run.sh" +``` + ### Image Generation **Model:** [FLUX.2-klein-4B](https://huggingface.co/black-forest-labs/FLUX.2-klein-4B) โ€” a 4B-parameter rectified-flow transformer from Black Forest Labs, produces high-quality 512ร—512 images from text prompts, runs on a single 24 GB GPU. +[ERNIE-Image-Turbo](https://huggingface.co/baidu/ERNIE-Image-Turbo) is also supported as of 0.20.0 โ€” an 8-step distilled DiT for fast inference with a +matching request shape. 
+ ```bash --8<-- "examples/vllm-omni/image/run.sh" ``` @@ -76,14 +138,24 @@ Labs, produces high-quality 512×512 images from text prompts, runs on a single **Model:** [Wan2.1-T2V-1.3B](https://huggingface.co/Wan-AI/Wan2.1-T2V-1.3B-Diffusers) — a 1.3B-parameter text-to-video diffusion model from the Wan team, generates short clips at up to 480×832 resolution. Needs a 48 GB GPU (L40S) or 2× 24 GB GPUs with `--tensor-parallel-size 2`. +[Wan2.1-VACE-1.3B](https://huggingface.co/Wan-AI/Wan2.1-VACE-1.3B-Diffusers) (added in 0.20.0) is a unified video creation/editing pipeline that +accepts text plus optional video, mask, or reference image inputs. -The `/v1/videos` endpoint is asynchronous — it returns a job ID immediately and generates the video in the background. The script below submits the -job, polls until it completes, then downloads the MP4. +Two route options: + +- **Async** (`POST /v1/videos`) — returns a job ID immediately; poll `GET /v1/videos/{id}` until status is `completed`, then download the MP4 from + `GET /v1/videos/{id}/content`. Best for long-running batch jobs and the only option in 0.18.0. +- **Sync** (`POST /v1/videos/sync`, new in 0.20.0) — blocks until generation completes and returns the raw MP4 in the response body. Simpler client + code, and crucially the only video path that works through SageMaker async inference (see [SageMaker Deployment](#sagemaker-deployment)). ```bash --8<-- "examples/vllm-omni/video/run.sh" ``` +```bash +--8<-- "examples/vllm-omni/video-sync/run.sh" +``` + ### Multimodal Chat Use the standard OpenAI chat-completions API. Multimodal inputs (images, audio) are supplied as URL or base64 content parts in the message list. 
@@ -128,8 +200,10 @@ header: | `CustomAttributes` | Dispatched to | | --- | --- | | `route=/v1/audio/speech` | TTS | +| `route=/v1/audio/generate` | Audio generation (new in 0.20.0) | | `route=/v1/images/generations` | Image generation | -| `route=/v1/videos` | Video generation (JSON auto-converted to form-data) โ€” returns job-ID only in 0.18.0, MP4 not retrievable via SageMaker | +| `route=/v1/videos` | Video generation, async (JSON auto-converted to form-data) โ€” returns job-ID only; MP4 not retrievable via SageMaker. Prefer `/v1/videos/sync` below. | +| `route=/v1/videos/sync` | Video generation, sync (new in 0.20.0) โ€” blocks server-side and returns raw MP4 bytes; deploy behind SageMaker async inference (first-request `torch.compile` warmup exceeds the 60s real-time invoke timeout) | | `route=/v1/chat/completions` | Multimodal chat | | *(no route)* | vLLM default `/invocations` (chat/completion/embed) | @@ -153,7 +227,7 @@ Any `SM_VLLM_*` env var is converted to a `--` CLI argument (e.g., `SM_VLL --8<-- "examples/vllm-omni/sagemaker/deploy_tts.py" ``` -GPU deploys require `inference_ami_version` โ€” the default SageMaker host AMI has incompatible NVIDIA drivers for CUDA 12.9 images. See +GPU deploys require `inference_ami_version` โ€” the default SageMaker host AMI has incompatible NVIDIA drivers for CUDA 13.0 images. See [ProductionVariant API reference](https://docs.aws.amazon.com/sagemaker/latest/APIReference/API_ProductionVariant.html) for valid values. When done, delete the endpoint: @@ -167,8 +241,6 @@ predictor.delete_endpoint() SageMaker real-time inference has a 60-second timeout. First requests to TTS models may exceed this due to `torch.compile` warmup (~67s); async inference avoids the limit, as does retrying after warmup completes. -!!! warning "Video generation is not supported on SageMaker in 0.18.0 โ€” see [Known Limitations](#known-limitations) below. Use EC2 for video." 
- ```python --8<-- "examples/vllm-omni/sagemaker/deploy_tts_async.py" ``` @@ -177,15 +249,39 @@ For async inference, upload the JSON input payload to S3 first, then call `invok `CustomAttributes="route=/v1/audio/speech"`. The resulting `.out` object in the configured S3 output path is the raw WAV audio โ€” no polling or additional retrieval step required. +### Deploy a Video Endpoint + +The `/v1/videos/sync` endpoint (new in 0.20.0) is the supported path for video on SageMaker. Unlike the async `/v1/videos` route โ€” which writes a +job-ID JSON to S3 but never the MP4 โ€” `/v1/videos/sync` blocks server-side until generation completes and writes the raw MP4 bytes to the configured +S3 output path. + +Deploy behind **SageMaker async inference** (`AsyncInferenceConfig`), not real-time inference: first-request latency on video models is dominated by +model load + `torch.compile` warmup (3โ€“4 minutes for Wan2.1-VACE-1.3B), which exceeds the 60-second real-time invoke timeout. Async inference allows +up to 1 hour and writes the response body verbatim to S3, so the `.out` object *is* the MP4 โ€” no polling on a job ID. + +```python +--8<-- "examples/vllm-omni/sagemaker/deploy_video_sync.py" +``` + +Validated 2026-05-11 on `ml.g5.2xlarge` (A10G 24 GB VRAM, 32 GB host RAM): 45 KB MP4 in ~10s after warmup. Reduce `num_inference_steps` and +`num_frames` to stay under the async ceiling for warm requests. + ## Known Limitations -- **Video generation is not supported on SageMaker in 0.18.0.** The `/v1/videos` endpoint is async by design โ€” it returns a job-ID JSON immediately - and generates the MP4 in the background. Through SageMaker async inference, only that job-ID JSON is written to S3; the MP4 itself never lands in S3 - and cannot be retrieved through `invoke_endpoint` or `invoke_endpoint_async`. Use EC2 for video generation โ€” direct container access supports the - full workflow (create job, poll status, download MP4). 
SageMaker support is expected once `POST /v1/videos/sync` (which blocks and returns raw MP4 - bytes) is available in a future vllm-omni release. -- **First-request latency on SageMaker real-time endpoints.** TTS models can exceed the 60s invoke timeout on the first request due to `torch.compile` - warmup. Use async inference or retry after warmup. +- **`/v1/videos` (async) on SageMaker writes only the job-ID JSON to S3, not the MP4.** This is unchanged from 0.18.0 โ€” the async route generates the + MP4 in the background and the bytes never land in S3. Use the new `/v1/videos/sync` route on SageMaker (see + [Deploy a Video Endpoint](#deploy-a-video-endpoint)) or stay on EC2 for the async workflow with status polling. +- **First-request latency on SageMaker real-time endpoints.** TTS, audio-generate, and video models can exceed the 60s invoke timeout on the first + request due to `torch.compile` warmup. Use async inference or retry after warmup. +- **Voice-clone TTS (Qwen3-TTS-Base) is slower in 0.20.0 than 0.18.0 due to an upstream Code2Wav decode-chunk un-batching regression** + ([vllm-omni#3203](https://github.com/vllm-project/vllm-omni/pull/3203)). Observed on `g6.xlarge` with `qwen3-tts-12hz-1.7b-base`, concurrency 4, 20 + prompts: requests/s **0.4 โ†’ 0.281**, audio RTF multiplier **1.6 โ†’ 1.109**, p95 E2E **11s โ†’ 15.9s**. TTS quality is unchanged. The fix is merged + upstream as [vllm-omni#3485](https://github.com/vllm-project/vllm-omni/pull/3485) post-0.20.0 and will land in the next omni point release. + Preset-voice TTS (Qwen3-TTS-CustomVoice) is unaffected. +- **CosyVoice3 requires `--trust-remote-code` and ~32 GB host RAM during model load.** A 16 GB host can SIGKILL the process during HuggingFace cache + hydration. Prefer `g6e.xlarge` or larger for both EC2 and SageMaker instance types. +- **Stable-Audio-Open output is capped at ~47 seconds per request** by the model itself. 
For longer clips, run multiple requests with adjusted + `audio_start` and concatenate client-side. ## Release Notes diff --git a/examples/vllm-omni/audio-generate/run.sh b/examples/vllm-omni/audio-generate/run.sh new file mode 100755 index 000000000000..4252e7b9cdc7 --- /dev/null +++ b/examples/vllm-omni/audio-generate/run.sh @@ -0,0 +1,23 @@ +#!/usr/bin/env bash +# End-to-end audio-generation example: start server, generate a 5-second clip. +# /v1/audio/generate is a diffusion-based text-to-audio endpoint (new in 0.20.0). +# Distinct from /v1/audio/speech (which is TTS โ€” a voice reading words). +set -euo pipefail + +IMAGE="${IMAGE:-763104351884.dkr.ecr.us-west-2.amazonaws.com/vllm:omni-cuda-v1}" +MODEL="${MODEL:-stabilityai/stable-audio-open-1.0}" +NAME="${NAME:-omni-audio-generate}" + +docker run -d --name "${NAME}" --gpus all --shm-size=2g -p 8080:8000 \ + -v "${HOME}/hf-cache:/root/.cache/huggingface" \ + "${IMAGE}" --model "${MODEL}" --trust-remote-code --enforce-eager + +until curl -sf http://localhost:8080/health >/dev/null; do sleep 5; done + +curl -sf -X POST http://localhost:8080/v1/audio/generate \ + -H "Content-Type: application/json" \ + -d '{"input": "A jazz piano improvisation", "audio_length": 5.0, "guidance_scale": 7.0, "num_inference_steps": 50, "seed": 42}' \ + --output sound.wav + +echo "wrote sound.wav ($(stat -f%z sound.wav 2>/dev/null || stat -c%s sound.wav) bytes)" +# Cleanup: docker stop "${NAME}" && docker rm "${NAME}" diff --git a/examples/vllm-omni/image/run.sh b/examples/vllm-omni/image/run.sh index d6dc1615ad95..28008fb19d3a 100755 --- a/examples/vllm-omni/image/run.sh +++ b/examples/vllm-omni/image/run.sh @@ -2,7 +2,7 @@ # End-to-end image-generation example: start server, wait for ready, generate. 
set -euo pipefail -IMAGE="${IMAGE:-763104351884.dkr.ecr.us-west-2.amazonaws.com/vllm-omni:omni-cuda-v1}" +IMAGE="${IMAGE:-763104351884.dkr.ecr.us-west-2.amazonaws.com/vllm:omni-cuda-v1}" MODEL="${MODEL:-black-forest-labs/FLUX.2-klein-4B}" NAME="${NAME:-omni-image}" diff --git a/examples/vllm-omni/qwen2.5-omni/run.sh b/examples/vllm-omni/qwen2.5-omni/run.sh index a04624bdf99d..98e020cb6ff6 100755 --- a/examples/vllm-omni/qwen2.5-omni/run.sh +++ b/examples/vllm-omni/qwen2.5-omni/run.sh @@ -6,7 +6,7 @@ # On single-GPU hosts the model's talker stage fails to load on GPU 1. set -euo pipefail -IMAGE="${IMAGE:-763104351884.dkr.ecr.us-west-2.amazonaws.com/vllm-omni:omni-cuda-v1}" +IMAGE="${IMAGE:-763104351884.dkr.ecr.us-west-2.amazonaws.com/vllm:omni-cuda-v1}" MODEL="${MODEL:-Qwen/Qwen2.5-Omni-3B}" NAME="${NAME:-omni3b}" diff --git a/examples/vllm-omni/sagemaker/deploy_video_sync.py b/examples/vllm-omni/sagemaker/deploy_video_sync.py new file mode 100644 index 000000000000..0717099aebbd --- /dev/null +++ b/examples/vllm-omni/sagemaker/deploy_video_sync.py @@ -0,0 +1,95 @@ +"""Deploy a vLLM-Omni video model to a SageMaker async inference endpoint +and invoke the new /v1/videos/sync endpoint, which blocks server-side until +generation completes and returns raw MP4 bytes. + +Async inference is required for video โ€” first-request latency includes model +load + torch.compile warmup (3-4 min for Wan2.1-VACE-1.3B), well past the +60s real-time invoke timeout. Async inference allows up to 1 hour and +deposits the response body verbatim at the configured S3 output path, so the +.out object is the raw MP4. + +Available since vLLM-Omni 0.20.0; supersedes the 0.18.0 limitation that +SageMaker async inference could only retrieve the job-ID JSON, not the MP4. +The routing middleware (`CustomAttributes="route=/v1/videos/sync"`) auto- +converts the JSON request body to multipart/form-data for the underlying +endpoint; values must therefore be JSON strings. 
+ +Validated 2026-05-11 on ml.g5.2xlarge (A10G 24 GB VRAM, 32 GB host RAM): +45 KB MP4 returned in ~10s after warmup. +""" + +import time + +import boto3 +from sagemaker.async_inference import AsyncInferenceConfig +from sagemaker.model import Model +from sagemaker.predictor import Predictor +from sagemaker.serializers import JSONSerializer + +BUCKET = "" # replace with an S3 bucket your role can read/write +ROLE_ARN = "arn:aws:iam:::role/SageMakerExecutionRole" +ENDPOINT_NAME = "vllm-omni-video-sync" + +model = Model( + image_uri="763104351884.dkr.ecr.us-west-2.amazonaws.com/vllm:omni-sagemaker-cuda-v1", + role=ROLE_ARN, + env={"SM_VLLM_MODEL": "Wan-AI/Wan2.1-VACE-1.3B-diffusers"}, + predictor_cls=Predictor, +) + +predictor = model.deploy( + instance_type="ml.g5.2xlarge", + initial_instance_count=1, + endpoint_name=ENDPOINT_NAME, + inference_ami_version="al2-ami-sagemaker-inference-gpu-3-1", + serializer=JSONSerializer(), + async_inference_config=AsyncInferenceConfig( + output_path=f"s3://{BUCKET}/vllm-omni-async-output/", + max_concurrent_invocations_per_instance=1, + ), + wait=True, +) + +# Upload the input payload to S3, then call invoke_endpoint_async with +# CustomAttributes routing to /v1/videos/sync. Values are strings because +# the middleware converts JSON to multipart/form-data. 
+s3 = boto3.client("s3") +s3.put_object( + Bucket=BUCKET, + Key="vllm-omni-async-input/request.json", + Body=( + '{"prompt": "a dog running on a beach", ' + '"num_frames": "17", "num_inference_steps": "4", ' + '"size": "480x320", "seed": "42"}' + ), + ContentType="application/json", +) + +runtime = boto3.client("sagemaker-runtime") +result = runtime.invoke_endpoint_async( + EndpointName=ENDPOINT_NAME, + InputLocation=f"s3://{BUCKET}/vllm-omni-async-input/request.json", + ContentType="application/json", + CustomAttributes="route=/v1/videos/sync", +) +output_location = result["OutputLocation"] # s3://.../.out +print(f"Output will be written to {output_location}") + +# Poll for the .out object (raw MP4 bytes). First request takes ~3-4 min +# due to model load + torch.compile; warm requests are ~3-10s. +bucket = output_location.split("/", 3)[2] +key = output_location.split("/", 3)[3] +for _ in range(120): # 10 min timeout + try: + obj = s3.get_object(Bucket=bucket, Key=key) + with open("video.mp4", "wb") as f: + f.write(obj["Body"].read()) + print(f"wrote video.mp4 (Content-Type: {obj.get('ContentType', '?')})") + break + except s3.exceptions.NoSuchKey: + time.sleep(5) +else: + raise RuntimeError("timed out waiting for async output") + +# When done: +# predictor.delete_endpoint() diff --git a/examples/vllm-omni/tts/run.sh b/examples/vllm-omni/tts/run.sh index 9f4f185a2139..3af3837ed3e4 100755 --- a/examples/vllm-omni/tts/run.sh +++ b/examples/vllm-omni/tts/run.sh @@ -3,7 +3,7 @@ # Requires: docker (with NVIDIA runtime), curl, an authenticated ECR pull. 
set -euo pipefail -IMAGE="${IMAGE:-763104351884.dkr.ecr.us-west-2.amazonaws.com/vllm-omni:omni-cuda-v1}" +IMAGE="${IMAGE:-763104351884.dkr.ecr.us-west-2.amazonaws.com/vllm:omni-cuda-v1}" MODEL="${MODEL:-Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice}" NAME="${NAME:-omni-tts}" diff --git a/examples/vllm-omni/video-sync/run.sh b/examples/vllm-omni/video-sync/run.sh new file mode 100755 index 000000000000..e3200a0e2a24 --- /dev/null +++ b/examples/vllm-omni/video-sync/run.sh @@ -0,0 +1,25 @@ +#!/usr/bin/env bash +# End-to-end sync video-generation example: start server, submit, get MP4 back. +# /v1/videos/sync (new in 0.20.0) blocks until the video is ready and returns +# raw MP4 bytes โ€” no job-ID polling needed, unlike async /v1/videos. +set -euo pipefail + +IMAGE="${IMAGE:-763104351884.dkr.ecr.us-west-2.amazonaws.com/vllm:omni-cuda-v1}" +MODEL="${MODEL:-Wan-AI/Wan2.1-T2V-1.3B-Diffusers}" +NAME="${NAME:-omni-video-sync}" + +docker run -d --name "${NAME}" --gpus all --shm-size=8g -p 8080:8000 \ + -v "${HOME}/hf-cache:/root/.cache/huggingface" \ + "${IMAGE}" --model "${MODEL}" --tensor-parallel-size 2 + +until curl -sf http://localhost:8080/health >/dev/null; do sleep 5; done + +# /v1/videos/sync requires multipart/form-data and blocks until the MP4 is ready. +curl -sf -X POST http://localhost:8080/v1/videos/sync \ + -F "prompt=a dog running on a beach at sunset" \ + -F "num_frames=17" -F "num_inference_steps=30" \ + -F "size=480x320" -F "seed=42" \ + --output video.mp4 + +echo "wrote video.mp4 ($(stat -f%z video.mp4 2>/dev/null || stat -c%s video.mp4) bytes)" +# Cleanup: docker stop "${NAME}" && docker rm "${NAME}" diff --git a/examples/vllm-omni/video/run.sh b/examples/vllm-omni/video/run.sh index 36db972d82f3..7c716dd2e0fc 100755 --- a/examples/vllm-omni/video/run.sh +++ b/examples/vllm-omni/video/run.sh @@ -3,7 +3,7 @@ # /v1/videos is async โ€” it returns a job ID; the MP4 is produced in the background. 
set -euo pipefail -IMAGE="${IMAGE:-763104351884.dkr.ecr.us-west-2.amazonaws.com/vllm-omni:omni-cuda-v1}" +IMAGE="${IMAGE:-763104351884.dkr.ecr.us-west-2.amazonaws.com/vllm:omni-cuda-v1}" MODEL="${MODEL:-Wan-AI/Wan2.1-T2V-1.3B-Diffusers}" NAME="${NAME:-omni-video}"