From 556881c0eb777c76de3d1de4d6e50fe43fbd2040 Mon Sep 17 00:00:00 2001 From: Yadan Wei Date: Sun, 26 Apr 2026 21:42:10 -0700 Subject: [PATCH 01/10] vLLM-Omni release docs Signed-off-by: Yadan Wei --- docs/.nav.yml | 2 + docs/src/data/vllm-omni/1.0.0-gpu-ec2.yml | 28 +++ .../data/vllm-omni/1.0.0-gpu-sagemaker.yml | 28 +++ docs/src/global.yml | 3 + docs/src/macros.py | 2 + docs/src/tables/vllm-omni.yml | 14 ++ docs/vllm-omni/index.md | 184 ++++++++++++++++++ examples/vllm-omni/image/invoke_image.sh | 5 + examples/vllm-omni/sagemaker/deploy_tts.py | 32 +++ .../vllm-omni/sagemaker/deploy_video_async.py | 35 ++++ examples/vllm-omni/tts/invoke_tts.sh | 6 + examples/vllm-omni/video/invoke_video.sh | 22 +++ 12 files changed, 361 insertions(+) create mode 100644 docs/src/data/vllm-omni/1.0.0-gpu-ec2.yml create mode 100644 docs/src/data/vllm-omni/1.0.0-gpu-sagemaker.yml create mode 100644 docs/src/tables/vllm-omni.yml create mode 100644 docs/vllm-omni/index.md create mode 100755 examples/vllm-omni/image/invoke_image.sh create mode 100644 examples/vllm-omni/sagemaker/deploy_tts.py create mode 100644 examples/vllm-omni/sagemaker/deploy_video_async.py create mode 100755 examples/vllm-omni/tts/invoke_tts.sh create mode 100755 examples/vllm-omni/video/invoke_video.sh diff --git a/docs/.nav.yml b/docs/.nav.yml index a94963d53480..4c2a53cdf728 100644 --- a/docs/.nav.yml +++ b/docs/.nav.yml @@ -5,12 +5,14 @@ nav: - Using Deep Learning Containers: get_started/using_dlcs.md - Release Notifications: get_started/release_notifications.md - Ray: ray/index.md + - vLLM-Omni: vllm-omni/index.md - Release Notes: - releasenotes/index.md - Base: releasenotes/base/index.md - Ray: releasenotes/ray/index.md - SGLang: releasenotes/sglang/index.md - vLLM: releasenotes/vllm/index.md + - vLLM-Omni: releasenotes/vllm-omni/index.md - PyTorch: releasenotes/pytorch/index.md - Tensorflow: releasenotes/tensorflow/index.md - Tutorials: tutorials diff --git a/docs/src/data/vllm-omni/1.0.0-gpu-ec2.yml 
b/docs/src/data/vllm-omni/1.0.0-gpu-ec2.yml new file mode 100644 index 000000000000..1d5aa65228f6 --- /dev/null +++ b/docs/src/data/vllm-omni/1.0.0-gpu-ec2.yml @@ -0,0 +1,28 @@ +framework: vLLM-Omni +version: "1.0.0" +accelerator: gpu +python: py312 +cuda: cu129 +os: amzn2023 +platform: default +public_registry: true + +tags: + - "omni-cuda-v1.0.0" + - "omni-cuda-v1.0" + - "omni-cuda-v1" + +announcements: + - "Initial release of vLLM-Omni containers for EC2, ECS, EKS" + - "Serves omni-modality models: TTS, image generation, video generation, multimodal chat" + - "Built on Amazon Linux 2023 with Python 3.12 and CUDA 12.9" + +packages: + vllm: "0.18.0" + vllm_omni: "0.18.0" + pytorch: "2.10.0" + torchvision: "0.25.0" + torchaudio: "2.10.0" + cuda: "12.9.1" + flashinfer: "0.6.6" + efa: "1.47.0" diff --git a/docs/src/data/vllm-omni/1.0.0-gpu-sagemaker.yml b/docs/src/data/vllm-omni/1.0.0-gpu-sagemaker.yml new file mode 100644 index 000000000000..588fb7e8fd01 --- /dev/null +++ b/docs/src/data/vllm-omni/1.0.0-gpu-sagemaker.yml @@ -0,0 +1,28 @@ +framework: vLLM-Omni +version: "1.0.0" +accelerator: gpu +python: py312 +cuda: cu129 +os: amzn2023 +platform: sagemaker +public_registry: true + +tags: + - "omni-cuda-sagemaker-v1.0.0" + - "omni-cuda-sagemaker-v1.0" + - "omni-cuda-sagemaker-v1" + +announcements: + - "Initial release of vLLM-Omni containers for SageMaker" + - "Includes ASGI routing middleware for /invocations dispatch via CustomAttributes" + - "Built on Amazon Linux 2023 with Python 3.12 and CUDA 12.9" + +packages: + vllm: "0.18.0" + vllm_omni: "0.18.0" + pytorch: "2.10.0" + torchvision: "0.25.0" + torchaudio: "2.10.0" + cuda: "12.9.1" + flashinfer: "0.6.6" + efa: "1.47.0" diff --git a/docs/src/global.yml b/docs/src/global.yml index 4f5133c38911..fb7ed4c95d23 100644 --- a/docs/src/global.yml +++ b/docs/src/global.yml @@ -67,7 +67,9 @@ display_names: sagemaker-xgboost: "XGBoost" sglang: "SGLang" vllm: "vLLM" + vllm_omni: "vLLM-Omni" vllm-arm64: "vLLM ARM64" + 
vllm-omni: "vLLM-Omni" pytorch-training: "PyTorch Training" pytorch-training-arm64: "PyTorch Training ARM64" pytorch-inference: "PyTorch Inference" @@ -167,6 +169,7 @@ table_order: - sglang - vllm - vllm-arm64 + - vllm-omni - pytorch-training - pytorch-inference - pytorch-training-arm64 diff --git a/docs/src/macros.py b/docs/src/macros.py index 8eab4e930138..cf5ce81be274 100644 --- a/docs/src/macros.py +++ b/docs/src/macros.py @@ -42,4 +42,6 @@ def define_env(env): "latest_ray_default_cpu": _get_latest_ray_uri("default", "cpu"), "latest_ray_sagemaker_gpu": _get_latest_ray_uri("sagemaker", "gpu"), "latest_ray_sagemaker_cpu": _get_latest_ray_uri("sagemaker", "cpu"), + "latest_vllm_omni_ec2": get_latest_image_uri("vllm-omni", "default"), + "latest_vllm_omni_sagemaker": get_latest_image_uri("vllm-omni", "sagemaker"), } diff --git a/docs/src/tables/vllm-omni.yml b/docs/src/tables/vllm-omni.yml new file mode 100644 index 000000000000..4c4ffa203fa2 --- /dev/null +++ b/docs/src/tables/vllm-omni.yml @@ -0,0 +1,14 @@ +# Table Configuration - vLLM-Omni +columns: + - field: framework_version + header: "Framework" + - field: python + header: "Python" + - field: cuda + header: "CUDA" + - field: accelerator + header: "Accelerator" + - field: platform + header: "Platform" + - field: example_url + header: "Example URL" diff --git a/docs/vllm-omni/index.md b/docs/vllm-omni/index.md new file mode 100644 index 000000000000..eb1edabcadb8 --- /dev/null +++ b/docs/vllm-omni/index.md @@ -0,0 +1,184 @@ +# vLLM-Omni Inference + +Pre-built Docker images for serving omni-modality models (text-to-speech, image generation, video generation, and multimodal chat) with +[vLLM-Omni](https://github.com/vllm-project/vllm-omni). Built on Amazon Linux 2023 with CUDA 12.9 and Python 3.12. + +## Latest Announcements + +**vLLM-Omni 1.0.0** — Initial release. Serves TTS, image, video, and omni-chat models through OpenAI-compatible APIs. 
Includes a SageMaker routing +middleware for dispatching `/invocations` to any omni endpoint via `CustomAttributes`. + +## Pull Commands + +**EC2:** + +```bash +docker pull {{ images.latest_vllm_omni_ec2 }} +``` + +**SageMaker:** + +```bash +docker pull {{ images.latest_vllm_omni_sagemaker }} +``` + +See [Available Images](../reference/available_images.md) for all image URIs and [Getting Started](../get_started/index.md) for authentication +instructions. + +## Packages + +For package versions included in each release, see the [Release Notes](../releasenotes/vllm-omni/index.md). + +## Supported Modalities + +| Modality | Route | Example Model | +| --- | --- | --- | +| Text-to-Speech | `/v1/audio/speech` | `Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice` | +| Image Generation | `/v1/images/generations` | `black-forest-labs/FLUX.2-klein-4B` | +| Video Generation | `/v1/videos` | `Wan-AI/Wan2.1-T2V-1.3B-Diffusers` | +| Multimodal Chat | `/v1/chat/completions` | `bytedance-research/BAGEL-7B-MoT`, `Qwen/Qwen2.5-Omni-3B` | + +## Model Compatibility + +- Models must have a standard HuggingFace `config.json` with a recognized `model_type`, or be diffusers pipeline models with `model_index.json`. +- Models requiring `--stage-configs-path` (e.g., CosyVoice3, Fish Speech) are not supported in v1.0.0 — the engine subprocess cannot resolve custom + model types. +- Multi-stage omni models (thinker + talker + decoder) like Qwen2.5-Omni need significantly more VRAM than the model size suggests. Refer to the + individual model cards for minimum GPU requirements. + +## EC2 Deployment + +The container runs `vllm serve --omni` and exposes the OpenAI-compatible API on port 8080. 
+ +### Start the Server + +```bash +docker run -d --gpus all \ + --shm-size=2g \ + -p 8080:8080 \ + {{ images.latest_vllm_omni_ec2 }} \ + --model Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice + +until curl -sf http://localhost:8080/health > /dev/null; do sleep 5; done +``` + +Any flag accepted by `vllm serve` may be appended to `docker run` (e.g., `--tensor-parallel-size 2`, `--max-model-len 2048`, `--enforce-eager`). + +### Text-to-Speech + +Returns raw audio bytes (WAV). + +```bash +--8<-- "examples/vllm-omni/tts/invoke_tts.sh" +``` + +### Image Generation + +Returns a JSON response with a base64-encoded image in `data[0].b64_json`. + +```bash +--8<-- "examples/vllm-omni/image/invoke_image.sh" +``` + +### Video Generation + +The `/v1/videos` endpoint is asynchronous by design — it returns a job ID immediately and generates the video in the background. The request must use +`multipart/form-data`. + +```bash +--8<-- "examples/vllm-omni/video/invoke_video.sh" +``` + +### Multimodal Chat + +Use the standard OpenAI chat-completions API. Multimodal inputs (images, audio) are supplied as URL or base64 content parts in the message list. 
+ +```bash +curl -X POST http://localhost:8080/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{"messages": [{"role": "user", "content": "Say hello in one sentence."}], "max_tokens": 64}' +``` + +## SageMaker Deployment + +### Prerequisites + +- AWS CLI configured with appropriate permissions +- An IAM execution role with SageMaker and ECR permissions (see [Ray tutorial](../ray/index.md#prerequisites) for an example setup) +- SageMaker Python SDK v2: + +```bash +pip install 'sagemaker>=2,<3' +``` + +### Routing Middleware + +The SageMaker image includes an ASGI middleware that dispatches `/invocations` to the correct vllm-omni endpoint based on the `CustomAttributes` +header: + +| `CustomAttributes` | Dispatched to | +| --- | --- | +| `route=/v1/audio/speech` | TTS | +| `route=/v1/images/generations` | Image generation | +| `route=/v1/videos` | Video generation (JSON auto-converted to form-data) | +| `route=/v1/chat/completions` | Multimodal chat | +| *(no route)* | vLLM default `/invocations` (chat/completion/embed) | + +### Environment Variables + +Any `SM_VLLM_*` env var is converted to a `--` CLI argument (e.g., `SM_VLLM_MAX_MODEL_LEN=2048` → `--max-model-len 2048`). + +| Variable | Description | Example | +| --- | --- | --- | +| `SM_VLLM_MODEL` | Model ID (HuggingFace or local path) | `Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice` | +| `SM_VLLM_MAX_MODEL_LEN` | Max sequence length | `2048` | +| `SM_VLLM_ENFORCE_EAGER` | Disable CUDA graphs | `true` | +| `SM_VLLM_TENSOR_PARALLEL_SIZE` | Number of GPUs for TP | `2` | +| `HF_TOKEN` | HuggingFace token for gated models | `hf_...` | + +### Deploy a TTS Endpoint + +!!! warning "SageMaker endpoint deployment takes several minutes and incurs costs. Remember to delete endpoints when done." + +```python +--8<-- "examples/vllm-omni/sagemaker/deploy_tts.py" +``` + +GPU deploys require `inference_ami_version` — the default SageMaker host AMI has incompatible NVIDIA drivers for CUDA 12.9 images. 
See +[ProductionVariant API reference](https://docs.aws.amazon.com/sagemaker/latest/APIReference/API_ProductionVariant.html) for valid values. + +When done, delete the endpoint: + +```python +predictor.delete_endpoint() +``` + +### Async Inference for Video and Long-Running Generation + +SageMaker real-time inference has a 60-second timeout. First requests to TTS models may exceed this due to `torch.compile` warmup (~67s); async +inference avoids the limit, as does retrying after warmup completes. + +For `/v1/videos`, async inference is required because the endpoint returns a job ID rather than the final MP4. The MP4 must be retrieved by polling +the container directly — SageMaker async inference only captures the initial JSON response. + +```python +--8<-- "examples/vllm-omni/sagemaker/deploy_video_async.py" +``` + +## Known Limitations + +- **Video generation on SageMaker returns a job ID only.** The `/v1/videos` endpoint in v1.0.0 is async by design and `POST /v1/videos/sync` (which + blocks and returns raw MP4 bytes) is not available. Direct container access (EC2) supports the full video workflow — create job, poll status, + download MP4. A sync endpoint has been added in newer vllm-omni versions and will be supported in a future release. +- **First-request latency on SageMaker real-time endpoints.** TTS models can exceed the 60s invoke timeout on the first request due to `torch.compile` + warmup. Use async inference or retry after warmup. + +## Release Notes + +See [vLLM-Omni Release Notes](../releasenotes/vllm-omni/index.md) for version history and changelogs. 
+ +## Resources + +- [vLLM-Omni Documentation](https://github.com/vllm-project/vllm-omni) +- [GitHub Repository](https://github.com/aws/deep-learning-containers) +- [Available Images](../reference/available_images.md) diff --git a/examples/vllm-omni/image/invoke_image.sh b/examples/vllm-omni/image/invoke_image.sh new file mode 100755 index 000000000000..8830334d2512 --- /dev/null +++ b/examples/vllm-omni/image/invoke_image.sh @@ -0,0 +1,5 @@ +#!/bin/bash +# Image generation via OpenAI-compatible /v1/images/generations endpoint +curl -X POST http://localhost:8080/v1/images/generations \ + -H "Content-Type: application/json" \ + -d '{"prompt": "a red apple on a white table", "size": "512x512", "n": 1}' diff --git a/examples/vllm-omni/sagemaker/deploy_tts.py b/examples/vllm-omni/sagemaker/deploy_tts.py new file mode 100644 index 000000000000..a4e2d8a91a9a --- /dev/null +++ b/examples/vllm-omni/sagemaker/deploy_tts.py @@ -0,0 +1,32 @@ +"""Deploy a vLLM-Omni TTS model to a real-time SageMaker endpoint.""" + +from sagemaker.model import Model +from sagemaker.predictor import Predictor +from sagemaker.serializers import JSONSerializer + +model = Model( + image_uri="763104351884.dkr.ecr.us-west-2.amazonaws.com/vllm-omni:omni-cuda-sagemaker-v1.0.0", + role="arn:aws:iam::ACCOUNT_ID:role/SageMakerExecutionRole", + env={"SM_VLLM_MODEL": "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice"}, + predictor_cls=Predictor, +) + +predictor = model.deploy( + instance_type="ml.g5.xlarge", + initial_instance_count=1, + endpoint_name="vllm-omni-tts", + inference_ami_version="al2-ami-sagemaker-inference-gpu-3-1", + serializer=JSONSerializer(), + wait=True, +) + +# Invoke — route /invocations to /v1/audio/speech via CustomAttributes +sm_runtime = predictor.sagemaker_session.sagemaker_runtime_client +response = sm_runtime.invoke_endpoint( + EndpointName=predictor.endpoint_name, + ContentType="application/json", + Body='{"input": "Hello world", "voice": "vivian", "language": "English"}', 
+ CustomAttributes="route=/v1/audio/speech", +) +with open("speech.wav", "wb") as f: + f.write(response["Body"].read()) diff --git a/examples/vllm-omni/sagemaker/deploy_video_async.py b/examples/vllm-omni/sagemaker/deploy_video_async.py new file mode 100644 index 000000000000..d1ac7c807354 --- /dev/null +++ b/examples/vllm-omni/sagemaker/deploy_video_async.py @@ -0,0 +1,35 @@ +"""Deploy a vLLM-Omni video model to a SageMaker async inference endpoint. + +Video generation is async by design — /v1/videos returns a job ID immediately, +so only the job metadata JSON is written to S3, not the MP4 file. To retrieve +the MP4, poll /v1/videos/{job_id}/content directly against the endpoint. +""" + +from sagemaker.async_inference import AsyncInferenceConfig +from sagemaker.model import Model +from sagemaker.predictor import Predictor +from sagemaker.serializers import JSONSerializer + +model = Model( + image_uri="763104351884.dkr.ecr.us-west-2.amazonaws.com/vllm-omni:omni-cuda-sagemaker-v1.0.0", + role="arn:aws:iam::ACCOUNT_ID:role/SageMakerExecutionRole", + env={"SM_VLLM_MODEL": "Wan-AI/Wan2.1-T2V-1.3B-Diffusers"}, + predictor_cls=Predictor, +) + +predictor = model.deploy( + instance_type="ml.g6e.xlarge", + initial_instance_count=1, + endpoint_name="vllm-omni-video-async", + inference_ami_version="al2-ami-sagemaker-inference-gpu-3-1", + serializer=JSONSerializer(), + async_inference_config=AsyncInferenceConfig( + output_path="s3://YOUR_BUCKET/vllm-omni-async-output/", + max_concurrent_invocations_per_instance=1, + ), + wait=True, +) + +# The middleware converts the JSON payload to multipart/form-data for /v1/videos. +# Response contains the job ID; use the /v1/videos/{job_id}/content endpoint to +# retrieve the MP4 bytes directly from the container. 
diff --git a/examples/vllm-omni/tts/invoke_tts.sh b/examples/vllm-omni/tts/invoke_tts.sh new file mode 100755 index 000000000000..935f318492ce --- /dev/null +++ b/examples/vllm-omni/tts/invoke_tts.sh @@ -0,0 +1,6 @@ +#!/bin/bash +# Text-to-speech via OpenAI-compatible /v1/audio/speech endpoint +curl -X POST http://localhost:8080/v1/audio/speech \ + -H "Content-Type: application/json" \ + -d '{"input": "Hello, how are you?", "voice": "vivian", "language": "English"}' \ + --output speech.wav diff --git a/examples/vllm-omni/video/invoke_video.sh b/examples/vllm-omni/video/invoke_video.sh new file mode 100755 index 000000000000..3e6c4ab36d68 --- /dev/null +++ b/examples/vllm-omni/video/invoke_video.sh @@ -0,0 +1,22 @@ +#!/bin/bash +# Video generation via /v1/videos endpoint (async — returns a job ID) +# The /v1/videos API requires multipart/form-data. +JOB=$(curl -sf -X POST http://localhost:8080/v1/videos \ + -F "prompt=a dog running on a beach" \ + -F "num_frames=17" \ + -F "num_inference_steps=4" \ + -F "size=480x320" \ + -F "seed=42") + +JOB_ID=$(echo "$JOB" | python3 -c "import json,sys;print(json.load(sys.stdin)['id'])") +echo "Job: $JOB_ID" + +# Poll until complete, then download +while true; do + STATUS=$(curl -sf "http://localhost:8080/v1/videos/$JOB_ID" | python3 -c "import json,sys;print(json.load(sys.stdin)['status'])") + [ "$STATUS" = "succeeded" ] && break + [ "$STATUS" = "failed" ] && { echo "Job failed"; exit 1; } + sleep 5 +done + +curl -sf "http://localhost:8080/v1/videos/$JOB_ID/content" --output video.mp4 From a54efab526532ebf35f3bbe5b49dc35cbb887cc8 Mon Sep 17 00:00:00 2001 From: Yadan Wei Date: Sun, 26 Apr 2026 22:18:10 -0700 Subject: [PATCH 02/10] Add Qwen2.5-Omni-3B EC2 tutorial Signed-off-by: Yadan Wei --- docs/.nav.yml | 1 + docs/vllm-omni/index.md | 4 + docs/vllm-omni/qwen2.5-omni.md | 140 ++++++++++++++++++ .../qwen2.5-omni/offline_inference.py | 83 +++++++++++ .../qwen2.5-omni/online_inference.py | 25 ++++ 5 files changed, 253 insertions(+) 
create mode 100644 docs/vllm-omni/qwen2.5-omni.md create mode 100644 examples/vllm-omni/qwen2.5-omni/offline_inference.py create mode 100644 examples/vllm-omni/qwen2.5-omni/online_inference.py diff --git a/docs/.nav.yml b/docs/.nav.yml index 4c2a53cdf728..35fd502aba22 100644 --- a/docs/.nav.yml +++ b/docs/.nav.yml @@ -6,6 +6,7 @@ nav: - Release Notifications: get_started/release_notifications.md - Ray: ray/index.md - vLLM-Omni: vllm-omni/index.md + - Qwen2.5-Omni on EC2: vllm-omni/qwen2.5-omni.md - Release Notes: - releasenotes/index.md - Base: releasenotes/base/index.md diff --git a/docs/vllm-omni/index.md b/docs/vllm-omni/index.md index eb1edabcadb8..013ac5954fc2 100644 --- a/docs/vllm-omni/index.md +++ b/docs/vllm-omni/index.md @@ -177,6 +177,10 @@ the container directly — SageMaker async inference only captures the initial J See [vLLM-Omni Release Notes](../releasenotes/vllm-omni/index.md) for version history and changelogs. +## Model Tutorials + +- [Qwen2.5-Omni-3B on EC2 GPU](qwen2.5-omni.md) — multi-GPU setup, audio output gotchas, offline + online inference + ## Resources - [vLLM-Omni Documentation](https://github.com/vllm-project/vllm-omni) diff --git a/docs/vllm-omni/qwen2.5-omni.md b/docs/vllm-omni/qwen2.5-omni.md new file mode 100644 index 000000000000..dda730ec1c49 --- /dev/null +++ b/docs/vllm-omni/qwen2.5-omni.md @@ -0,0 +1,140 @@ +# Qwen2.5-Omni-3B on EC2 GPU + +Run [Qwen/Qwen2.5-Omni-3B](https://huggingface.co/Qwen/Qwen2.5-Omni-3B) (multimodal-in / text + speech-out) using the vLLM-Omni container — both as a +local (offline) server and as a remote (online) endpoint. 
+ +## Requirements + +- **EC2 GPU instance with ≥ 4 GPUs**: + - `g5.12xlarge` / `g6.12xlarge` (4× A10G, 24 GB each) — tested + - `g6e.12xlarge` (4× L40S, 48 GB each) — preferred when available +- Amazon Linux 2023 with NVIDIA driver, Docker, and `nvidia-container-toolkit` (AWS Deep Learning AMIs include these) +- AWS credentials with ECR pull permission for `763104351884` +- Outbound internet to HuggingFace (first run downloads ~6 GB) + +!!! note "Single-GPU note" Qwen2.5-Omni-3B's default stage layout puts the talker on GPU 1. On a single-GPU instance it fails or produces distorted +audio. Use a 4-GPU instance. + +## One-time setup + +```bash +# ECR login +aws ecr get-login-password --region us-west-2 | \ + docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-west-2.amazonaws.com + +docker pull {{ images.latest_vllm_omni_ec2 }} + +mkdir -p ~/hf-cache +``` + +## Start the server + +```bash +docker run -d --name omni3b \ + --gpus all --shm-size=16g -p 8080:8080 \ + -v ~/hf-cache:/root/.cache/huggingface \ + -e HF_HUB_ENABLE_HF_TRANSFER=1 \ + {{ images.latest_vllm_omni_ec2 }} \ + Qwen/Qwen2.5-Omni-3B \ + --host 0.0.0.0 --port 8080 \ + --max-model-len 16384 --dtype bfloat16 +``` + +First start takes ~8 minutes (weight download + 3-stage model load). Wait for ready: + +```bash +until curl -sf http://localhost:8080/health >/dev/null; do sleep 10; done +echo ready +``` + +Stop and remove: + +```bash +docker stop omni3b && docker rm omni3b +``` + +## Getting clean audio out + +Three things are **required** on `/v1/chat/completions` to produce usable speech from Qwen2.5-Omni-3B: + +1. `"modalities": ["audio"]` +2. `"sampling_params_list"` — a 3-element list (thinker, talker, code2wav). The image's built-in per-stage defaults are wrong and produce noise. Use + the values shown below (from the official Qwen docs). +3. The exact Qwen system prompt. + +!!! 
warning "Omitting `sampling_params_list` produces noise even though HTTP returns 200 with valid WAV bytes." + +### Working curl + +```bash +curl -s http://localhost:8080/v1/chat/completions \ + -H 'Content-Type: application/json' \ + -d '{ + "model": "Qwen/Qwen2.5-Omni-3B", + "modalities": ["audio"], + "sampling_params_list": [ + {"temperature":0.0,"top_p":1.0,"top_k":-1,"max_tokens":2048,"seed":42,"detokenize":true,"repetition_penalty":1.1}, + {"temperature":0.9,"top_p":0.8,"top_k":40,"max_tokens":2048,"seed":42,"detokenize":true,"repetition_penalty":1.05,"stop_token_ids":[8294]}, + {"temperature":0.0,"top_p":1.0,"top_k":-1,"max_tokens":2048,"seed":42,"detokenize":true,"repetition_penalty":1.1} + ], + "messages": [ + {"role":"system","content":[{"type":"text","text":"You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech."}]}, + {"role":"user","content":[{"type":"text","text":"Tell me a short, calming bedtime lullaby story for a 6-year-old girl."}]} + ] + }' | jq -r '.choices[0].message.audio.data' | base64 -d > out.wav +``` + +## Offline inference (on the GPU instance) + +```python +--8<-- "examples/vllm-omni/qwen2.5-omni/offline_inference.py" +``` + +Run it: + +```bash +python3 offline_inference.py +aplay out/lullaby.wav # afplay on macOS +``` + +## Online inference (from a remote client) + +Open TCP 8080 in the EC2 security group to your client IP, then: + +```bash +export OMNI_ENDPOINT=http://ec2-xx-xx-xx-xx.us-west-2.compute.amazonaws.com:8080 +python3 online_inference.py +``` + +```python +--8<-- "examples/vllm-omni/qwen2.5-omni/online_inference.py" +``` + +## API overview + +OpenAI-compatible endpoints exposed by the container: + +| Endpoint | Purpose | +| --- | --- | +| `POST /v1/chat/completions` | Text / multimodal in → text or audio out (see above for audio) | +| `POST /v1/audio/speech` | Direct text-to-speech shortcut (voices: `Chelsie`, `Ethan`). 
⚠️ In v1.0.0 the shortcut bypasses the thinker and does not apply the correct sampling params, producing noisy output. Prefer the chat route. | +| `GET /v1/audio/voices` | List voices | +| `GET /v1/models` | Show served model id | +| `GET /health` | Liveness | + +## Troubleshooting + +| Symptom | Fix | +| --- | --- | +| `NVMLError_InvalidArgument` in stage 1 during startup | Single-GPU instance — use a 4-GPU instance. | +| Audio sounds like noise/gibberish | Missing `sampling_params_list` — add it per above. | +| `message.audio: {}` empty on chat completions | Using `"modalities": ["text","audio"]`. Use `["audio"]` only. | +| `Cannot perform interactive login from non-TTY device` | AWS creds expired. Refresh `~/.aws/credentials` and re-run ECR login. | +| Health never goes 200 | Inspect `docker logs omni3b`. Weight download or OOM — need ≥4 GPUs with ≥24 GB each. | + +## Costs (us-west-2, on-demand, April 2026) + +- `g5.12xlarge` ≈ $5.67 / hour +- `g6e.12xlarge` ≈ $10.49 / hour + +Stop the instance when idle; terminate to free EBS. diff --git a/examples/vllm-omni/qwen2.5-omni/offline_inference.py b/examples/vllm-omni/qwen2.5-omni/offline_inference.py new file mode 100644 index 000000000000..c71d3c57d3a2 --- /dev/null +++ b/examples/vllm-omni/qwen2.5-omni/offline_inference.py @@ -0,0 +1,83 @@ +"""Offline inference against a local vLLM-Omni server running Qwen2.5-Omni-3B. + +Assumes the server is already running on http://localhost:8080 (see the +Qwen2.5-Omni tutorial for `docker run` instructions). +""" + +import base64 +import json +import os +import pathlib + +import requests + +ENDPOINT = os.environ.get("OMNI_ENDPOINT", "http://localhost:8080") +MODEL = "Qwen/Qwen2.5-Omni-3B" +SYSTEM = ( + "You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, " + "capable of perceiving auditory and visual inputs, as well as generating " + "text and speech." +) + +# Three per-stage sampling params (thinker, talker, code2wav) are REQUIRED for +# clean audio. 
The built-in defaults produce noise. Do not omit. +SAMPLING_PARAMS_LIST = [ + { + "temperature": 0.0, + "top_p": 1.0, + "top_k": -1, + "max_tokens": 2048, + "seed": 42, + "detokenize": True, + "repetition_penalty": 1.1, + }, + { + "temperature": 0.9, + "top_p": 0.8, + "top_k": 40, + "max_tokens": 2048, + "seed": 42, + "detokenize": True, + "repetition_penalty": 1.05, + "stop_token_ids": [8294], + }, + { + "temperature": 0.0, + "top_p": 1.0, + "top_k": -1, + "max_tokens": 2048, + "seed": 42, + "detokenize": True, + "repetition_penalty": 1.1, + }, +] + + +def generate_audio(prompt: str, out_path: pathlib.Path) -> None: + payload = { + "model": MODEL, + "modalities": ["audio"], + "sampling_params_list": SAMPLING_PARAMS_LIST, + "messages": [ + {"role": "system", "content": [{"type": "text", "text": SYSTEM}]}, + {"role": "user", "content": [{"type": "text", "text": prompt}]}, + ], + } + response = requests.post( + f"{ENDPOINT}/v1/chat/completions", + headers={"Content-Type": "application/json"}, + data=json.dumps(payload), + timeout=600, + ) + response.raise_for_status() + audio_b64 = response.json()["choices"][0]["message"]["audio"]["data"] + out_path.parent.mkdir(parents=True, exist_ok=True) + out_path.write_bytes(base64.b64decode(audio_b64)) + print(f"wrote {out_path} ({out_path.stat().st_size} bytes)") + + +if __name__ == "__main__": + generate_audio( + "Tell me a short, calming bedtime lullaby story for a 6-year-old girl.", + pathlib.Path("out/lullaby.wav"), + ) diff --git a/examples/vllm-omni/qwen2.5-omni/online_inference.py b/examples/vllm-omni/qwen2.5-omni/online_inference.py new file mode 100644 index 000000000000..798f87124fea --- /dev/null +++ b/examples/vllm-omni/qwen2.5-omni/online_inference.py @@ -0,0 +1,25 @@ +"""Online inference against a remote vLLM-Omni server running Qwen2.5-Omni-3B. 
+ +Set OMNI_ENDPOINT to the public URL of your EC2 instance, e.g.: + export OMNI_ENDPOINT=http://ec2-xx-xx-xx-xx.us-west-2.compute.amazonaws.com:8080 + +See offline_inference.py for the local-server variant — the only difference is +the default endpoint. +""" + +import os +import pathlib + +from offline_inference import generate_audio + +if __name__ == "__main__": + endpoint = os.environ.get("OMNI_ENDPOINT") + if not endpoint or endpoint.startswith("http://localhost"): + raise SystemExit( + "Set OMNI_ENDPOINT to the remote server URL, e.g. " + "export OMNI_ENDPOINT=http://:8080" + ) + generate_audio( + "Briefly describe the weather on Mars today.", + pathlib.Path("out/mars.wav"), + ) From 2f42c05e7bb6a76ef7d6ad329c59575a42997feb Mon Sep 17 00:00:00 2001 From: Yadan Wei Date: Sun, 26 Apr 2026 22:19:41 -0700 Subject: [PATCH 03/10] Inline Qwen2.5-Omni example into vllm-omni tutorial Signed-off-by: Yadan Wei --- docs/.nav.yml | 1 - docs/vllm-omni/index.md | 4 - docs/vllm-omni/qwen2.5-omni.md | 140 ------------------ .../qwen2.5-omni/online_inference.py | 25 ---- 4 files changed, 170 deletions(-) delete mode 100644 docs/vllm-omni/qwen2.5-omni.md delete mode 100644 examples/vllm-omni/qwen2.5-omni/online_inference.py diff --git a/docs/.nav.yml b/docs/.nav.yml index 35fd502aba22..4c2a53cdf728 100644 --- a/docs/.nav.yml +++ b/docs/.nav.yml @@ -6,7 +6,6 @@ nav: - Release Notifications: get_started/release_notifications.md - Ray: ray/index.md - vLLM-Omni: vllm-omni/index.md - - Qwen2.5-Omni on EC2: vllm-omni/qwen2.5-omni.md - Release Notes: - releasenotes/index.md - Base: releasenotes/base/index.md diff --git a/docs/vllm-omni/index.md b/docs/vllm-omni/index.md index 013ac5954fc2..eb1edabcadb8 100644 --- a/docs/vllm-omni/index.md +++ b/docs/vllm-omni/index.md @@ -177,10 +177,6 @@ the container directly — SageMaker async inference only captures the initial J See [vLLM-Omni Release Notes](../releasenotes/vllm-omni/index.md) for version history and changelogs. 
-## Model Tutorials - -- [Qwen2.5-Omni-3B on EC2 GPU](qwen2.5-omni.md) — multi-GPU setup, audio output gotchas, offline + online inference - ## Resources - [vLLM-Omni Documentation](https://github.com/vllm-project/vllm-omni) diff --git a/docs/vllm-omni/qwen2.5-omni.md b/docs/vllm-omni/qwen2.5-omni.md deleted file mode 100644 index dda730ec1c49..000000000000 --- a/docs/vllm-omni/qwen2.5-omni.md +++ /dev/null @@ -1,140 +0,0 @@ -# Qwen2.5-Omni-3B on EC2 GPU - -Run [Qwen/Qwen2.5-Omni-3B](https://huggingface.co/Qwen/Qwen2.5-Omni-3B) (multimodal-in / text + speech-out) using the vLLM-Omni container — both as a -local (offline) server and as a remote (online) endpoint. - -## Requirements - -- **EC2 GPU instance with ≥ 4 GPUs**: - - `g5.12xlarge` / `g6.12xlarge` (4× A10G, 24 GB each) — tested - - `g6e.12xlarge` (4× L40S, 48 GB each) — preferred when available -- Amazon Linux 2023 with NVIDIA driver, Docker, and `nvidia-container-toolkit` (AWS Deep Learning AMIs include these) -- AWS credentials with ECR pull permission for `763104351884` -- Outbound internet to HuggingFace (first run downloads ~6 GB) - -!!! note "Single-GPU note" Qwen2.5-Omni-3B's default stage layout puts the talker on GPU 1. On a single-GPU instance it fails or produces distorted -audio. Use a 4-GPU instance. - -## One-time setup - -```bash -# ECR login -aws ecr get-login-password --region us-west-2 | \ - docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-west-2.amazonaws.com - -docker pull {{ images.latest_vllm_omni_ec2 }} - -mkdir -p ~/hf-cache -``` - -## Start the server - -```bash -docker run -d --name omni3b \ - --gpus all --shm-size=16g -p 8080:8080 \ - -v ~/hf-cache:/root/.cache/huggingface \ - -e HF_HUB_ENABLE_HF_TRANSFER=1 \ - {{ images.latest_vllm_omni_ec2 }} \ - Qwen/Qwen2.5-Omni-3B \ - --host 0.0.0.0 --port 8080 \ - --max-model-len 16384 --dtype bfloat16 -``` - -First start takes ~8 minutes (weight download + 3-stage model load). 
Wait for ready: - -```bash -until curl -sf http://localhost:8080/health >/dev/null; do sleep 10; done -echo ready -``` - -Stop and remove: - -```bash -docker stop omni3b && docker rm omni3b -``` - -## Getting clean audio out - -Three things are **required** on `/v1/chat/completions` to produce usable speech from Qwen2.5-Omni-3B: - -1. `"modalities": ["audio"]` -2. `"sampling_params_list"` — a 3-element list (thinker, talker, code2wav). The image's built-in per-stage defaults are wrong and produce noise. Use - the values shown below (from the official Qwen docs). -3. The exact Qwen system prompt. - -!!! warning "Omitting `sampling_params_list` produces noise even though HTTP returns 200 with valid WAV bytes." - -### Working curl - -```bash -curl -s http://localhost:8080/v1/chat/completions \ - -H 'Content-Type: application/json' \ - -d '{ - "model": "Qwen/Qwen2.5-Omni-3B", - "modalities": ["audio"], - "sampling_params_list": [ - {"temperature":0.0,"top_p":1.0,"top_k":-1,"max_tokens":2048,"seed":42,"detokenize":true,"repetition_penalty":1.1}, - {"temperature":0.9,"top_p":0.8,"top_k":40,"max_tokens":2048,"seed":42,"detokenize":true,"repetition_penalty":1.05,"stop_token_ids":[8294]}, - {"temperature":0.0,"top_p":1.0,"top_k":-1,"max_tokens":2048,"seed":42,"detokenize":true,"repetition_penalty":1.1} - ], - "messages": [ - {"role":"system","content":[{"type":"text","text":"You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech."}]}, - {"role":"user","content":[{"type":"text","text":"Tell me a short, calming bedtime lullaby story for a 6-year-old girl."}]} - ] - }' | jq -r '.choices[0].message.audio.data' | base64 -d > out.wav -``` - -## Offline inference (on the GPU instance) - -```python ---8<-- "examples/vllm-omni/qwen2.5-omni/offline_inference.py" -``` - -Run it: - -```bash -python3 offline_inference.py -aplay out/lullaby.wav # afplay on macOS -``` - -## Online 
inference (from a remote client) - -Open TCP 8080 in the EC2 security group to your client IP, then: - -```bash -export OMNI_ENDPOINT=http://ec2-xx-xx-xx-xx.us-west-2.compute.amazonaws.com:8080 -python3 online_inference.py -``` - -```python ---8<-- "examples/vllm-omni/qwen2.5-omni/online_inference.py" -``` - -## API overview - -OpenAI-compatible endpoints exposed by the container: - -| Endpoint | Purpose | -| --- | --- | -| `POST /v1/chat/completions` | Text / multimodal in → text or audio out (see above for audio) | -| `POST /v1/audio/speech` | Direct text-to-speech shortcut (voices: `Chelsie`, `Ethan`). ⚠️ In v1.0.0 the shortcut bypasses the thinker and does not apply the correct sampling params, producing noisy output. Prefer the chat route. | -| `GET /v1/audio/voices` | List voices | -| `GET /v1/models` | Show served model id | -| `GET /health` | Liveness | - -## Troubleshooting - -| Symptom | Fix | -| --- | --- | -| `NVMLError_InvalidArgument` in stage 1 during startup | Single-GPU instance — use a 4-GPU instance. | -| Audio sounds like noise/gibberish | Missing `sampling_params_list` — add it per above. | -| `message.audio: {}` empty on chat completions | Using `"modalities": ["text","audio"]`. Use `["audio"]` only. | -| `Cannot perform interactive login from non-TTY device` | AWS creds expired. Refresh `~/.aws/credentials` and re-run ECR login. | -| Health never goes 200 | Inspect `docker logs omni3b`. Weight download or OOM — need ≥4 GPUs with ≥24 GB each. | - -## Costs (us-west-2, on-demand, April 2026) - -- `g5.12xlarge` ≈ $5.67 / hour -- `g6e.12xlarge` ≈ $10.49 / hour - -Stop the instance when idle; terminate to free EBS. 
diff --git a/examples/vllm-omni/qwen2.5-omni/online_inference.py b/examples/vllm-omni/qwen2.5-omni/online_inference.py deleted file mode 100644 index 798f87124fea..000000000000 --- a/examples/vllm-omni/qwen2.5-omni/online_inference.py +++ /dev/null @@ -1,25 +0,0 @@ -"""Online inference against a remote vLLM-Omni server running Qwen2.5-Omni-3B. - -Set OMNI_ENDPOINT to the public URL of your EC2 instance, e.g.: - export OMNI_ENDPOINT=http://ec2-xx-xx-xx-xx.us-west-2.compute.amazonaws.com:8080 - -See offline_inference.py for the local-server variant — the only difference is -the default endpoint. -""" - -import os -import pathlib - -from offline_inference import generate_audio - -if __name__ == "__main__": - endpoint = os.environ.get("OMNI_ENDPOINT") - if not endpoint or endpoint.startswith("http://localhost"): - raise SystemExit( - "Set OMNI_ENDPOINT to the remote server URL, e.g. " - "export OMNI_ENDPOINT=http://:8080" - ) - generate_audio( - "Briefly describe the weather on Mars today.", - pathlib.Path("out/mars.wav"), - ) From ebc5b220725c16789f11578103d6ee3cd7ef1303 Mon Sep 17 00:00:00 2001 From: Yadan Wei Date: Sun, 26 Apr 2026 22:22:20 -0700 Subject: [PATCH 04/10] Replace EC2 curl examples with end-to-end Python clients Signed-off-by: Yadan Wei --- docs/vllm-omni/index.md | 58 +++++++++++++++------ examples/vllm-omni/image/inference.py | 32 ++++++++++++ examples/vllm-omni/image/invoke_image.sh | 5 -- examples/vllm-omni/tts/inference.py | 31 +++++++++++ examples/vllm-omni/tts/invoke_tts.sh | 6 --- examples/vllm-omni/video/inference.py | 65 ++++++++++++++++++++++++ examples/vllm-omni/video/invoke_video.sh | 22 -------- 7 files changed, 171 insertions(+), 48 deletions(-) create mode 100644 examples/vllm-omni/image/inference.py delete mode 100755 examples/vllm-omni/image/invoke_image.sh create mode 100644 examples/vllm-omni/tts/inference.py delete mode 100755 examples/vllm-omni/tts/invoke_tts.sh create mode 100644 examples/vllm-omni/video/inference.py delete mode 
100755 examples/vllm-omni/video/invoke_video.sh diff --git a/docs/vllm-omni/index.md b/docs/vllm-omni/index.md index eb1edabcadb8..9d3c1fd1c6c7 100644 --- a/docs/vllm-omni/index.md +++ b/docs/vllm-omni/index.md @@ -48,45 +48,73 @@ For package versions included in each release, see the [Release Notes](../releas ## EC2 Deployment -The container runs `vllm serve --omni` and exposes the OpenAI-compatible API on port 8080. +The container runs `vllm serve --omni` and exposes the OpenAI-compatible API on port 8080. Each example below picks a representative model for its +modality — any `vllm serve` flag may be appended to `docker run` (e.g., `--tensor-parallel-size 2`, `--max-model-len 2048`, `--enforce-eager`). -### Start the Server +All three examples below use the same Python client pattern. Install the one dependency: ```bash -docker run -d --gpus all \ - --shm-size=2g \ - -p 8080:8080 \ +pip install requests +``` + +### Text-to-Speech + +Start the server with a TTS model: + +```bash +docker run -d --gpus all --shm-size=2g -p 8080:8080 \ {{ images.latest_vllm_omni_ec2 }} \ --model Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice -until curl -sf http://localhost:8080/health > /dev/null; do sleep 5; done +until curl -sf http://localhost:8080/health >/dev/null; do sleep 5; done ``` -Any flag accepted by `vllm serve` may be appended to `docker run` (e.g., `--tensor-parallel-size 2`, `--max-model-len 2048`, `--enforce-eager`). +Submit a request and write the returned WAV bytes to disk: -### Text-to-Speech - -Returns raw audio bytes (WAV). +```python +--8<-- "examples/vllm-omni/tts/inference.py" +``` ```bash ---8<-- "examples/vllm-omni/tts/invoke_tts.sh" +python3 inference.py +aplay out/speech.wav # afplay on macOS ``` ### Image Generation -Returns a JSON response with a base64-encoded image in `data[0].b64_json`. 
+Start the server with an image-generation model: ```bash ---8<-- "examples/vllm-omni/image/invoke_image.sh" +docker run -d --gpus all --shm-size=2g -p 8080:8080 \ + {{ images.latest_vllm_omni_ec2 }} \ + --model black-forest-labs/FLUX.2-klein-4B + +until curl -sf http://localhost:8080/health >/dev/null; do sleep 5; done +``` + +The response JSON contains a base64-encoded PNG in `data[0].b64_json`: + +```python +--8<-- "examples/vllm-omni/image/inference.py" ``` ### Video Generation The `/v1/videos` endpoint is asynchronous by design — it returns a job ID immediately and generates the video in the background. The request must use -`multipart/form-data`. +`multipart/form-data`. The client below submits the job, polls until it completes, then downloads the MP4. + +Start the server with a video-generation model: ```bash ---8<-- "examples/vllm-omni/video/invoke_video.sh" +docker run -d --gpus all --shm-size=8g -p 8080:8080 \ + {{ images.latest_vllm_omni_ec2 }} \ + --model Wan-AI/Wan2.1-T2V-1.3B-Diffusers + +until curl -sf http://localhost:8080/health >/dev/null; do sleep 5; done +``` + +```python +--8<-- "examples/vllm-omni/video/inference.py" ``` ### Multimodal Chat diff --git a/examples/vllm-omni/image/inference.py b/examples/vllm-omni/image/inference.py new file mode 100644 index 000000000000..70d2f6ccf33a --- /dev/null +++ b/examples/vllm-omni/image/inference.py @@ -0,0 +1,32 @@ +"""End-to-end image generation example against a local vLLM-Omni server. + +Prereq: start the server with an image-generation model, e.g. 
+ docker run -d --gpus all -p 8080:8080 \ + --model black-forest-labs/FLUX.2-klein-4B +""" + +import base64 +import os +import pathlib + +import requests + +ENDPOINT = os.environ.get("OMNI_ENDPOINT", "http://localhost:8080") +OUT_PATH = pathlib.Path("out/image.png") + + +def generate(prompt: str, size: str = "512x512") -> bytes: + response = requests.post( + f"{ENDPOINT}/v1/images/generations", + json={"prompt": prompt, "size": size, "n": 1}, + timeout=300, + ) + response.raise_for_status() + return base64.b64decode(response.json()["data"][0]["b64_json"]) + + +if __name__ == "__main__": + image = generate("a red apple on a white table, studio lighting") + OUT_PATH.parent.mkdir(parents=True, exist_ok=True) + OUT_PATH.write_bytes(image) + print(f"wrote {OUT_PATH} ({len(image)} bytes)") diff --git a/examples/vllm-omni/image/invoke_image.sh b/examples/vllm-omni/image/invoke_image.sh deleted file mode 100755 index 8830334d2512..000000000000 --- a/examples/vllm-omni/image/invoke_image.sh +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/bash -# Image generation via OpenAI-compatible /v1/images/generations endpoint -curl -X POST http://localhost:8080/v1/images/generations \ - -H "Content-Type: application/json" \ - -d '{"prompt": "a red apple on a white table", "size": "512x512", "n": 1}' diff --git a/examples/vllm-omni/tts/inference.py b/examples/vllm-omni/tts/inference.py new file mode 100644 index 000000000000..3192b5071ffd --- /dev/null +++ b/examples/vllm-omni/tts/inference.py @@ -0,0 +1,31 @@ +"""End-to-end TTS example against a local vLLM-Omni server. + +Prereq: start the server with a TTS model, e.g. 
+ docker run -d --gpus all -p 8080:8080 \ + --model Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice +""" + +import os +import pathlib + +import requests + +ENDPOINT = os.environ.get("OMNI_ENDPOINT", "http://localhost:8080") +OUT_PATH = pathlib.Path("out/speech.wav") + + +def synthesize(text: str, voice: str = "vivian", language: str = "English") -> bytes: + response = requests.post( + f"{ENDPOINT}/v1/audio/speech", + json={"input": text, "voice": voice, "language": language}, + timeout=300, + ) + response.raise_for_status() + return response.content + + +if __name__ == "__main__": + audio = synthesize("Hello from vLLM-Omni. This is a text to speech demo.") + OUT_PATH.parent.mkdir(parents=True, exist_ok=True) + OUT_PATH.write_bytes(audio) + print(f"wrote {OUT_PATH} ({len(audio)} bytes)") diff --git a/examples/vllm-omni/tts/invoke_tts.sh b/examples/vllm-omni/tts/invoke_tts.sh deleted file mode 100755 index 935f318492ce..000000000000 --- a/examples/vllm-omni/tts/invoke_tts.sh +++ /dev/null @@ -1,6 +0,0 @@ -#!/bin/bash -# Text-to-speech via OpenAI-compatible /v1/audio/speech endpoint -curl -X POST http://localhost:8080/v1/audio/speech \ - -H "Content-Type: application/json" \ - -d '{"input": "Hello, how are you?", "voice": "vivian", "language": "English"}' \ - --output speech.wav diff --git a/examples/vllm-omni/video/inference.py b/examples/vllm-omni/video/inference.py new file mode 100644 index 000000000000..9a982fcedeb3 --- /dev/null +++ b/examples/vllm-omni/video/inference.py @@ -0,0 +1,65 @@ +"""End-to-end video generation example against a local vLLM-Omni server. + +The /v1/videos endpoint is async — it returns a job ID immediately, and the +video is generated in the background. This script submits the job, polls +until it completes, then downloads the MP4. + +Prereq: start the server with a video-generation model, e.g. 
+ docker run -d --gpus all -p 8080:8080 \ + --model Wan-AI/Wan2.1-T2V-1.3B-Diffusers +""" + +import os +import pathlib +import time + +import requests + +ENDPOINT = os.environ.get("OMNI_ENDPOINT", "http://localhost:8080") +OUT_PATH = pathlib.Path("out/video.mp4") +POLL_INTERVAL_S = 5 +POLL_TIMEOUT_S = 600 + + +def submit_job(prompt: str) -> str: + # /v1/videos requires multipart/form-data + response = requests.post( + f"{ENDPOINT}/v1/videos", + files={ + "prompt": (None, prompt), + "num_frames": (None, "17"), + "num_inference_steps": (None, "4"), + "size": (None, "480x320"), + "seed": (None, "42"), + }, + timeout=60, + ) + response.raise_for_status() + return response.json()["id"] + + +def wait_for_completion(job_id: str) -> None: + deadline = time.time() + POLL_TIMEOUT_S + while time.time() < deadline: + status = requests.get(f"{ENDPOINT}/v1/videos/{job_id}", timeout=30).json()["status"] + if status == "succeeded": + return + if status == "failed": + raise RuntimeError(f"Job {job_id} failed") + time.sleep(POLL_INTERVAL_S) + raise TimeoutError(f"Job {job_id} did not complete within {POLL_TIMEOUT_S}s") + + +def download(job_id: str, out_path: pathlib.Path) -> None: + response = requests.get(f"{ENDPOINT}/v1/videos/{job_id}/content", timeout=60) + response.raise_for_status() + out_path.parent.mkdir(parents=True, exist_ok=True) + out_path.write_bytes(response.content) + + +if __name__ == "__main__": + job_id = submit_job("a dog running on a beach at sunset") + print(f"submitted job {job_id}") + wait_for_completion(job_id) + download(job_id, OUT_PATH) + print(f"wrote {OUT_PATH} ({OUT_PATH.stat().st_size} bytes)") diff --git a/examples/vllm-omni/video/invoke_video.sh b/examples/vllm-omni/video/invoke_video.sh deleted file mode 100755 index 3e6c4ab36d68..000000000000 --- a/examples/vllm-omni/video/invoke_video.sh +++ /dev/null @@ -1,22 +0,0 @@ -#!/bin/bash -# Video generation via /v1/videos endpoint (async — returns a job ID) -# The /v1/videos API requires 
multipart/form-data. -JOB=$(curl -sf -X POST http://localhost:8080/v1/videos \ - -F "prompt=a dog running on a beach" \ - -F "num_frames=17" \ - -F "num_inference_steps=4" \ - -F "size=480x320" \ - -F "seed=42") - -JOB_ID=$(echo "$JOB" | python3 -c "import json,sys;print(json.load(sys.stdin)['id'])") -echo "Job: $JOB_ID" - -# Poll until complete, then download -while true; do - STATUS=$(curl -sf "http://localhost:8080/v1/videos/$JOB_ID" | python3 -c "import json,sys;print(json.load(sys.stdin)['status'])") - [ "$STATUS" = "succeeded" ] && break - [ "$STATUS" = "failed" ] && { echo "Job failed"; exit 1; } - sleep 5 -done - -curl -sf "http://localhost:8080/v1/videos/$JOB_ID/content" --output video.mp4 From 54fd2ee7bb31a17de588fc4a2edb74e2ab46a4b2 Mon Sep 17 00:00:00 2001 From: Yadan Wei Date: Sun, 26 Apr 2026 22:23:52 -0700 Subject: [PATCH 05/10] Switch EC2 examples back to end-to-end shell scripts Signed-off-by: Yadan Wei --- docs/vllm-omni/index.md | 60 ++++--------------------- examples/vllm-omni/image/inference.py | 32 ------------- examples/vllm-omni/image/run.sh | 22 +++++++++ examples/vllm-omni/tts/inference.py | 31 ------------- examples/vllm-omni/tts/run.sh | 22 +++++++++ examples/vllm-omni/video/inference.py | 65 --------------------------- examples/vllm-omni/video/run.sh | 36 +++++++++++++++ 7 files changed, 88 insertions(+), 180 deletions(-) delete mode 100644 examples/vllm-omni/image/inference.py create mode 100755 examples/vllm-omni/image/run.sh delete mode 100644 examples/vllm-omni/tts/inference.py create mode 100755 examples/vllm-omni/tts/run.sh delete mode 100644 examples/vllm-omni/video/inference.py create mode 100755 examples/vllm-omni/video/run.sh diff --git a/docs/vllm-omni/index.md b/docs/vllm-omni/index.md index 9d3c1fd1c6c7..8711d4094f91 100644 --- a/docs/vllm-omni/index.md +++ b/docs/vllm-omni/index.md @@ -48,73 +48,29 @@ For package versions included in each release, see the [Release Notes](../releas ## EC2 Deployment -The container runs 
`vllm serve --omni` and exposes the OpenAI-compatible API on port 8080. Each example below picks a representative model for its -modality — any `vllm serve` flag may be appended to `docker run` (e.g., `--tensor-parallel-size 2`, `--max-model-len 2048`, `--enforce-eager`). - -All three examples below use the same Python client pattern. Install the one dependency: - -```bash -pip install requests -``` +The container runs `vllm serve --omni` and exposes the OpenAI-compatible API on port 8080. Each example below is a self-contained shell script that +starts the container, waits for readiness, submits a request, and writes the output to disk. Any `vllm serve` flag may be appended to `docker run` +(e.g., `--tensor-parallel-size 2`, `--max-model-len 2048`, `--enforce-eager`). ### Text-to-Speech -Start the server with a TTS model: - -```bash -docker run -d --gpus all --shm-size=2g -p 8080:8080 \ - {{ images.latest_vllm_omni_ec2 }} \ - --model Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice - -until curl -sf http://localhost:8080/health >/dev/null; do sleep 5; done -``` - -Submit a request and write the returned WAV bytes to disk: - -```python ---8<-- "examples/vllm-omni/tts/inference.py" -``` - ```bash -python3 inference.py -aplay out/speech.wav # afplay on macOS +--8<-- "examples/vllm-omni/tts/run.sh" ``` ### Image Generation -Start the server with an image-generation model: - ```bash -docker run -d --gpus all --shm-size=2g -p 8080:8080 \ - {{ images.latest_vllm_omni_ec2 }} \ - --model black-forest-labs/FLUX.2-klein-4B - -until curl -sf http://localhost:8080/health >/dev/null; do sleep 5; done -``` - -The response JSON contains a base64-encoded PNG in `data[0].b64_json`: - -```python ---8<-- "examples/vllm-omni/image/inference.py" +--8<-- "examples/vllm-omni/image/run.sh" ``` ### Video Generation -The `/v1/videos` endpoint is asynchronous by design — it returns a job ID immediately and generates the video in the background. The request must use -`multipart/form-data`. 
The client below submits the job, polls until it completes, then downloads the MP4. - -Start the server with a video-generation model: +The `/v1/videos` endpoint is asynchronous — it returns a job ID immediately and generates the video in the background. The script below submits the +job, polls until it completes, then downloads the MP4. ```bash -docker run -d --gpus all --shm-size=8g -p 8080:8080 \ - {{ images.latest_vllm_omni_ec2 }} \ - --model Wan-AI/Wan2.1-T2V-1.3B-Diffusers - -until curl -sf http://localhost:8080/health >/dev/null; do sleep 5; done -``` - -```python ---8<-- "examples/vllm-omni/video/inference.py" +--8<-- "examples/vllm-omni/video/run.sh" ``` ### Multimodal Chat diff --git a/examples/vllm-omni/image/inference.py b/examples/vllm-omni/image/inference.py deleted file mode 100644 index 70d2f6ccf33a..000000000000 --- a/examples/vllm-omni/image/inference.py +++ /dev/null @@ -1,32 +0,0 @@ -"""End-to-end image generation example against a local vLLM-Omni server. - -Prereq: start the server with an image-generation model, e.g. 
- docker run -d --gpus all -p 8080:8080 \ - --model black-forest-labs/FLUX.2-klein-4B -""" - -import base64 -import os -import pathlib - -import requests - -ENDPOINT = os.environ.get("OMNI_ENDPOINT", "http://localhost:8080") -OUT_PATH = pathlib.Path("out/image.png") - - -def generate(prompt: str, size: str = "512x512") -> bytes: - response = requests.post( - f"{ENDPOINT}/v1/images/generations", - json={"prompt": prompt, "size": size, "n": 1}, - timeout=300, - ) - response.raise_for_status() - return base64.b64decode(response.json()["data"][0]["b64_json"]) - - -if __name__ == "__main__": - image = generate("a red apple on a white table, studio lighting") - OUT_PATH.parent.mkdir(parents=True, exist_ok=True) - OUT_PATH.write_bytes(image) - print(f"wrote {OUT_PATH} ({len(image)} bytes)") diff --git a/examples/vllm-omni/image/run.sh b/examples/vllm-omni/image/run.sh new file mode 100755 index 000000000000..1f017f939f7c --- /dev/null +++ b/examples/vllm-omni/image/run.sh @@ -0,0 +1,22 @@ +#!/usr/bin/env bash +# End-to-end image-generation example: start server, wait for ready, generate. +set -euo pipefail + +IMAGE="${IMAGE:-763104351884.dkr.ecr.us-west-2.amazonaws.com/vllm-omni:omni-cuda-v1}" +MODEL="${MODEL:-black-forest-labs/FLUX.2-klein-4B}" +NAME="${NAME:-omni-image}" + +docker run -d --name "${NAME}" --gpus all --shm-size=2g -p 8080:8080 \ + -v "${HOME}/hf-cache:/root/.cache/huggingface" \ + "${IMAGE}" --model "${MODEL}" + +until curl -sf http://localhost:8080/health >/dev/null; do sleep 5; done + +# Response JSON has data[0].b64_json — decode to PNG. 
+curl -sf -X POST http://localhost:8080/v1/images/generations \ + -H "Content-Type: application/json" \ + -d '{"prompt": "a red apple on a white table, studio lighting", "size": "512x512", "n": 1}' \ + | python3 -c "import base64,json,sys;open('image.png','wb').write(base64.b64decode(json.load(sys.stdin)['data'][0]['b64_json']))" + +echo "wrote image.png ($(stat -f%z image.png 2>/dev/null || stat -c%s image.png) bytes)" +# Cleanup: docker stop "${NAME}" && docker rm "${NAME}" diff --git a/examples/vllm-omni/tts/inference.py b/examples/vllm-omni/tts/inference.py deleted file mode 100644 index 3192b5071ffd..000000000000 --- a/examples/vllm-omni/tts/inference.py +++ /dev/null @@ -1,31 +0,0 @@ -"""End-to-end TTS example against a local vLLM-Omni server. - -Prereq: start the server with a TTS model, e.g. - docker run -d --gpus all -p 8080:8080 \ - --model Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice -""" - -import os -import pathlib - -import requests - -ENDPOINT = os.environ.get("OMNI_ENDPOINT", "http://localhost:8080") -OUT_PATH = pathlib.Path("out/speech.wav") - - -def synthesize(text: str, voice: str = "vivian", language: str = "English") -> bytes: - response = requests.post( - f"{ENDPOINT}/v1/audio/speech", - json={"input": text, "voice": voice, "language": language}, - timeout=300, - ) - response.raise_for_status() - return response.content - - -if __name__ == "__main__": - audio = synthesize("Hello from vLLM-Omni. This is a text to speech demo.") - OUT_PATH.parent.mkdir(parents=True, exist_ok=True) - OUT_PATH.write_bytes(audio) - print(f"wrote {OUT_PATH} ({len(audio)} bytes)") diff --git a/examples/vllm-omni/tts/run.sh b/examples/vllm-omni/tts/run.sh new file mode 100755 index 000000000000..cc526a23f4d6 --- /dev/null +++ b/examples/vllm-omni/tts/run.sh @@ -0,0 +1,22 @@ +#!/usr/bin/env bash +# End-to-end TTS example: start server, wait for ready, synthesize speech. +# Requires: docker (with NVIDIA runtime), curl, an authenticated ECR pull. 
+set -euo pipefail + +IMAGE="${IMAGE:-763104351884.dkr.ecr.us-west-2.amazonaws.com/vllm-omni:omni-cuda-v1}" +MODEL="${MODEL:-Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice}" +NAME="${NAME:-omni-tts}" + +docker run -d --name "${NAME}" --gpus all --shm-size=2g -p 8080:8080 \ + -v "${HOME}/hf-cache:/root/.cache/huggingface" \ + "${IMAGE}" --model "${MODEL}" + +until curl -sf http://localhost:8080/health >/dev/null; do sleep 5; done + +curl -sf -X POST http://localhost:8080/v1/audio/speech \ + -H "Content-Type: application/json" \ + -d '{"input": "Hello from vLLM-Omni.", "voice": "vivian", "language": "English"}' \ + --output speech.wav + +echo "wrote speech.wav ($(stat -f%z speech.wav 2>/dev/null || stat -c%s speech.wav) bytes)" +# Cleanup: docker stop "${NAME}" && docker rm "${NAME}" diff --git a/examples/vllm-omni/video/inference.py b/examples/vllm-omni/video/inference.py deleted file mode 100644 index 9a982fcedeb3..000000000000 --- a/examples/vllm-omni/video/inference.py +++ /dev/null @@ -1,65 +0,0 @@ -"""End-to-end video generation example against a local vLLM-Omni server. - -The /v1/videos endpoint is async — it returns a job ID immediately, and the -video is generated in the background. This script submits the job, polls -until it completes, then downloads the MP4. - -Prereq: start the server with a video-generation model, e.g. 
- docker run -d --gpus all -p 8080:8080 \ - --model Wan-AI/Wan2.1-T2V-1.3B-Diffusers -""" - -import os -import pathlib -import time - -import requests - -ENDPOINT = os.environ.get("OMNI_ENDPOINT", "http://localhost:8080") -OUT_PATH = pathlib.Path("out/video.mp4") -POLL_INTERVAL_S = 5 -POLL_TIMEOUT_S = 600 - - -def submit_job(prompt: str) -> str: - # /v1/videos requires multipart/form-data - response = requests.post( - f"{ENDPOINT}/v1/videos", - files={ - "prompt": (None, prompt), - "num_frames": (None, "17"), - "num_inference_steps": (None, "4"), - "size": (None, "480x320"), - "seed": (None, "42"), - }, - timeout=60, - ) - response.raise_for_status() - return response.json()["id"] - - -def wait_for_completion(job_id: str) -> None: - deadline = time.time() + POLL_TIMEOUT_S - while time.time() < deadline: - status = requests.get(f"{ENDPOINT}/v1/videos/{job_id}", timeout=30).json()["status"] - if status == "succeeded": - return - if status == "failed": - raise RuntimeError(f"Job {job_id} failed") - time.sleep(POLL_INTERVAL_S) - raise TimeoutError(f"Job {job_id} did not complete within {POLL_TIMEOUT_S}s") - - -def download(job_id: str, out_path: pathlib.Path) -> None: - response = requests.get(f"{ENDPOINT}/v1/videos/{job_id}/content", timeout=60) - response.raise_for_status() - out_path.parent.mkdir(parents=True, exist_ok=True) - out_path.write_bytes(response.content) - - -if __name__ == "__main__": - job_id = submit_job("a dog running on a beach at sunset") - print(f"submitted job {job_id}") - wait_for_completion(job_id) - download(job_id, OUT_PATH) - print(f"wrote {OUT_PATH} ({OUT_PATH.stat().st_size} bytes)") diff --git a/examples/vllm-omni/video/run.sh b/examples/vllm-omni/video/run.sh new file mode 100755 index 000000000000..c382ff827ea2 --- /dev/null +++ b/examples/vllm-omni/video/run.sh @@ -0,0 +1,36 @@ +#!/usr/bin/env bash +# End-to-end video-generation example: start server, submit job, poll, download. 
+# /v1/videos is async — it returns a job ID; the MP4 is produced in the background. +set -euo pipefail + +IMAGE="${IMAGE:-763104351884.dkr.ecr.us-west-2.amazonaws.com/vllm-omni:omni-cuda-v1}" +MODEL="${MODEL:-Wan-AI/Wan2.1-T2V-1.3B-Diffusers}" +NAME="${NAME:-omni-video}" + +docker run -d --name "${NAME}" --gpus all --shm-size=8g -p 8080:8080 \ + -v "${HOME}/hf-cache:/root/.cache/huggingface" \ + "${IMAGE}" --model "${MODEL}" + +until curl -sf http://localhost:8080/health >/dev/null; do sleep 5; done + +# /v1/videos requires multipart/form-data. +JOB_ID=$(curl -sf -X POST http://localhost:8080/v1/videos \ + -F "prompt=a dog running on a beach at sunset" \ + -F "num_frames=17" -F "num_inference_steps=4" \ + -F "size=480x320" -F "seed=42" \ + | python3 -c "import json,sys;print(json.load(sys.stdin)['id'])") + +echo "submitted job ${JOB_ID}" + +# Poll until succeeded (5s interval, 10 min timeout). +for _ in $(seq 1 120); do + STATUS=$(curl -sf "http://localhost:8080/v1/videos/${JOB_ID}" \ + | python3 -c "import json,sys;print(json.load(sys.stdin)['status'])") + [ "${STATUS}" = "succeeded" ] && break + [ "${STATUS}" = "failed" ] && { echo "job failed"; exit 1; } + sleep 5 +done + +curl -sf "http://localhost:8080/v1/videos/${JOB_ID}/content" --output video.mp4 +echo "wrote video.mp4 ($(stat -f%z video.mp4 2>/dev/null || stat -c%s video.mp4) bytes)" +# Cleanup: docker stop "${NAME}" && docker rm "${NAME}" From 09a04f5402f35486ba6f39ccf3a5e308ea8f7050 Mon Sep 17 00:00:00 2001 From: Yadan Wei Date: Sun, 26 Apr 2026 22:28:33 -0700 Subject: [PATCH 06/10] Add model intros and HuggingFace links to EC2 examples Signed-off-by: Yadan Wei --- docs/vllm-omni/index.md | 50 ++++++++++++++++++++++++++++++++++++++--- 1 file changed, 47 insertions(+), 3 deletions(-) diff --git a/docs/vllm-omni/index.md b/docs/vllm-omni/index.md index 8711d4094f91..764d2c61e222 100644 --- a/docs/vllm-omni/index.md +++ b/docs/vllm-omni/index.md @@ -54,18 +54,27 @@ starts the container, waits for 
readiness, submits a request, and writes the out ### Text-to-Speech +**Model:** [Qwen3-TTS-12Hz-1.7B-CustomVoice](https://huggingface.co/Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice) — a 1.7B-parameter Qwen3 text-to-speech +model supporting multiple voices and languages, runs on a single 24 GB GPU (A10G / L4). + ```bash --8<-- "examples/vllm-omni/tts/run.sh" ``` ### Image Generation +**Model:** [FLUX.2-klein-4B](https://huggingface.co/black-forest-labs/FLUX.2-klein-4B) — a 4B-parameter rectified-flow transformer from Black Forest +Labs, produces high-quality 512×512 images from text prompts, runs on a single 24 GB GPU. + ```bash --8<-- "examples/vllm-omni/image/run.sh" ``` ### Video Generation +**Model:** [Wan2.1-T2V-1.3B](https://huggingface.co/Wan-AI/Wan2.1-T2V-1.3B-Diffusers) — a 1.3B-parameter text-to-video diffusion model from the Wan +team, generates short clips at up to 480×832 resolution. Needs a 48 GB GPU (L40S) or 2× 24 GB GPUs with `--tensor-parallel-size 2`. + The `/v1/videos` endpoint is asynchronous — it returns a job ID immediately and generates the video in the background. The script below submits the job, polls until it completes, then downloads the MP4. @@ -77,12 +86,47 @@ job, polls until it completes, then downloads the MP4. Use the standard OpenAI chat-completions API. Multimodal inputs (images, audio) are supplied as URL or base64 content parts in the message list. +**Example model:** [Qwen2.5-Omni-3B](https://huggingface.co/Qwen/Qwen2.5-Omni-3B) — a 3B-parameter omni model accepting text, image, and audio inputs +and generating text or speech outputs. Multi-stage architecture (thinker + talker + code2wav) requires **≥ 4 GPUs**: `g5.12xlarge` / `g6.12xlarge` (4× +A10G) or `g6e.12xlarge` (4× L40S). 
+ +Start the server: + +```bash +docker run -d --name omni3b --gpus all --shm-size=16g -p 8080:8080 \ + -v ~/hf-cache:/root/.cache/huggingface \ + -e HF_HUB_ENABLE_HF_TRANSFER=1 \ + {{ images.latest_vllm_omni_ec2 }} \ + --model Qwen/Qwen2.5-Omni-3B \ + --host 0.0.0.0 --port 8080 \ + --max-model-len 16384 --dtype bfloat16 + +until curl -sf http://localhost:8080/health >/dev/null; do sleep 10; done +``` + +Three things are **required** on `/v1/chat/completions` to produce clean audio from Qwen2.5-Omni: + +1. `"modalities": ["audio"]` — not `["text","audio"]` (that returns empty audio). +2. `"sampling_params_list"` — a 3-element list (thinker, talker, code2wav). The image's built-in per-stage defaults produce noise; use the values from + the official Qwen docs. +3. The exact Qwen system prompt. + +!!! warning "Omitting `sampling_params_list` returns 200 with valid WAV bytes that sound like noise — the single most common footgun." + +Run the included client (supports local and remote via `OMNI_ENDPOINT`): + +```python +--8<-- "examples/vllm-omni/qwen2.5-omni/offline_inference.py" +``` + ```bash -curl -X POST http://localhost:8080/v1/chat/completions \ - -H "Content-Type: application/json" \ - -d '{"messages": [{"role": "user", "content": "Say hello in one sentence."}], "max_tokens": 64}' +python3 offline_inference.py +aplay out/lullaby.wav # afplay on macOS ``` +The `/v1/audio/speech` shortcut (voices: `Chelsie`, `Ethan`) bypasses the thinker and does not apply the correct sampling params in v1.0.0, so it +produces noisy output for Qwen2.5-Omni. Prefer `/v1/chat/completions` for this model. 
+ ## SageMaker Deployment ### Prerequisites From 357b2613447e035ed28c6083c8ac76422ccf0a3a Mon Sep 17 00:00:00 2001 From: Yadan Wei Date: Sun, 26 Apr 2026 22:33:49 -0700 Subject: [PATCH 07/10] Convert Qwen2.5-Omni example to shell script Signed-off-by: Yadan Wei --- docs/vllm-omni/index.md | 25 +----- .../qwen2.5-omni/offline_inference.py | 83 ------------------- examples/vllm-omni/qwen2.5-omni/run.sh | 46 ++++++++++ 3 files changed, 48 insertions(+), 106 deletions(-) delete mode 100644 examples/vllm-omni/qwen2.5-omni/offline_inference.py create mode 100755 examples/vllm-omni/qwen2.5-omni/run.sh diff --git a/docs/vllm-omni/index.md b/docs/vllm-omni/index.md index 764d2c61e222..9984c9d588cd 100644 --- a/docs/vllm-omni/index.md +++ b/docs/vllm-omni/index.md @@ -90,21 +90,7 @@ Use the standard OpenAI chat-completions API. Multimodal inputs (images, audio) and generating text or speech outputs. Multi-stage architecture (thinker + talker + code2wav) requires **≥ 4 GPUs**: `g5.12xlarge` / `g6.12xlarge` (4× A10G) or `g6e.12xlarge` (4× L40S). -Start the server: - -```bash -docker run -d --name omni3b --gpus all --shm-size=16g -p 8080:8080 \ - -v ~/hf-cache:/root/.cache/huggingface \ - -e HF_HUB_ENABLE_HF_TRANSFER=1 \ - {{ images.latest_vllm_omni_ec2 }} \ - --model Qwen/Qwen2.5-Omni-3B \ - --host 0.0.0.0 --port 8080 \ - --max-model-len 16384 --dtype bfloat16 - -until curl -sf http://localhost:8080/health >/dev/null; do sleep 10; done -``` - -Three things are **required** on `/v1/chat/completions` to produce clean audio from Qwen2.5-Omni: +Start the server, then submit a request. Three things are **required** on `/v1/chat/completions` to produce clean audio from Qwen2.5-Omni: 1. `"modalities": ["audio"]` — not `["text","audio"]` (that returns empty audio). 2. `"sampling_params_list"` — a 3-element list (thinker, talker, code2wav). 
The image's built-in per-stage defaults produce noise; use the values from @@ -113,15 +99,8 @@ Three things are **required** on `/v1/chat/completions` to produce clean audio f !!! warning "Omitting `sampling_params_list` returns 200 with valid WAV bytes that sound like noise — the single most common footgun." -Run the included client (supports local and remote via `OMNI_ENDPOINT`): - -```python ---8<-- "examples/vllm-omni/qwen2.5-omni/offline_inference.py" -``` - ```bash -python3 offline_inference.py -aplay out/lullaby.wav # afplay on macOS +--8<-- "examples/vllm-omni/qwen2.5-omni/run.sh" ``` The `/v1/audio/speech` shortcut (voices: `Chelsie`, `Ethan`) bypasses the thinker and does not apply the correct sampling params in v1.0.0, so it diff --git a/examples/vllm-omni/qwen2.5-omni/offline_inference.py b/examples/vllm-omni/qwen2.5-omni/offline_inference.py deleted file mode 100644 index c71d3c57d3a2..000000000000 --- a/examples/vllm-omni/qwen2.5-omni/offline_inference.py +++ /dev/null @@ -1,83 +0,0 @@ -"""Offline inference against a local vLLM-Omni server running Qwen2.5-Omni-3B. - -Assumes the server is already running on http://localhost:8080 (see the -Qwen2.5-Omni tutorial for `docker run` instructions). -""" - -import base64 -import json -import os -import pathlib - -import requests - -ENDPOINT = os.environ.get("OMNI_ENDPOINT", "http://localhost:8080") -MODEL = "Qwen/Qwen2.5-Omni-3B" -SYSTEM = ( - "You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, " - "capable of perceiving auditory and visual inputs, as well as generating " - "text and speech." -) - -# Three per-stage sampling params (thinker, talker, code2wav) are REQUIRED for -# clean audio. The built-in defaults produce noise. Do not omit. 
-SAMPLING_PARAMS_LIST = [ - { - "temperature": 0.0, - "top_p": 1.0, - "top_k": -1, - "max_tokens": 2048, - "seed": 42, - "detokenize": True, - "repetition_penalty": 1.1, - }, - { - "temperature": 0.9, - "top_p": 0.8, - "top_k": 40, - "max_tokens": 2048, - "seed": 42, - "detokenize": True, - "repetition_penalty": 1.05, - "stop_token_ids": [8294], - }, - { - "temperature": 0.0, - "top_p": 1.0, - "top_k": -1, - "max_tokens": 2048, - "seed": 42, - "detokenize": True, - "repetition_penalty": 1.1, - }, -] - - -def generate_audio(prompt: str, out_path: pathlib.Path) -> None: - payload = { - "model": MODEL, - "modalities": ["audio"], - "sampling_params_list": SAMPLING_PARAMS_LIST, - "messages": [ - {"role": "system", "content": [{"type": "text", "text": SYSTEM}]}, - {"role": "user", "content": [{"type": "text", "text": prompt}]}, - ], - } - response = requests.post( - f"{ENDPOINT}/v1/chat/completions", - headers={"Content-Type": "application/json"}, - data=json.dumps(payload), - timeout=600, - ) - response.raise_for_status() - audio_b64 = response.json()["choices"][0]["message"]["audio"]["data"] - out_path.parent.mkdir(parents=True, exist_ok=True) - out_path.write_bytes(base64.b64decode(audio_b64)) - print(f"wrote {out_path} ({out_path.stat().st_size} bytes)") - - -if __name__ == "__main__": - generate_audio( - "Tell me a short, calming bedtime lullaby story for a 6-year-old girl.", - pathlib.Path("out/lullaby.wav"), - ) diff --git a/examples/vllm-omni/qwen2.5-omni/run.sh b/examples/vllm-omni/qwen2.5-omni/run.sh new file mode 100755 index 000000000000..a04624bdf99d --- /dev/null +++ b/examples/vllm-omni/qwen2.5-omni/run.sh @@ -0,0 +1,46 @@ +#!/usr/bin/env bash +# End-to-end Qwen2.5-Omni-3B example: start server, wait for ready, +# generate speech via /v1/chat/completions. +# +# REQUIRES ≥ 4 GPUs (e.g., g5.12xlarge / g6.12xlarge / g6e.12xlarge). +# On single-GPU hosts the model's talker stage fails to load on GPU 1. 
+set -euo pipefail + +IMAGE="${IMAGE:-763104351884.dkr.ecr.us-west-2.amazonaws.com/vllm-omni:omni-cuda-v1}" +MODEL="${MODEL:-Qwen/Qwen2.5-Omni-3B}" +NAME="${NAME:-omni3b}" + +docker run -d --name "${NAME}" --gpus all --shm-size=16g -p 8080:8080 \ + -v "${HOME}/hf-cache:/root/.cache/huggingface" \ + -e HF_HUB_ENABLE_HF_TRANSFER=1 \ + "${IMAGE}" --model "${MODEL}" \ + --host 0.0.0.0 --port 8080 \ + --max-model-len 16384 --dtype bfloat16 + +# First start takes ~8 min (weight download + 3-stage load). +until curl -sf http://localhost:8080/health >/dev/null; do sleep 10; done + +# Three things are REQUIRED for clean audio: +# 1. "modalities": ["audio"] (NOT ["text","audio"] — returns empty audio) +# 2. "sampling_params_list" (3-element list: thinker, talker, code2wav; +# built-in defaults produce noise) +# 3. The exact Qwen system prompt below. +# Omitting #2 returns 200 OK with valid WAV bytes that sound like noise. +curl -sf -X POST http://localhost:8080/v1/chat/completions \ + -H 'Content-Type: application/json' \ + -d '{ + "model": "Qwen/Qwen2.5-Omni-3B", + "modalities": ["audio"], + "sampling_params_list": [ + {"temperature":0.0,"top_p":1.0,"top_k":-1,"max_tokens":2048,"seed":42,"detokenize":true,"repetition_penalty":1.1}, + {"temperature":0.9,"top_p":0.8,"top_k":40,"max_tokens":2048,"seed":42,"detokenize":true,"repetition_penalty":1.05,"stop_token_ids":[8294]}, + {"temperature":0.0,"top_p":1.0,"top_k":-1,"max_tokens":2048,"seed":42,"detokenize":true,"repetition_penalty":1.1} + ], + "messages": [ + {"role":"system","content":[{"type":"text","text":"You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech."}]}, + {"role":"user","content":[{"type":"text","text":"Tell me a short, calming bedtime lullaby story for a 6-year-old girl."}]} + ] + }' | jq -r '.choices[0].message.audio.data' | base64 -d > lullaby.wav + +echo "wrote lullaby.wav ($(stat -f%z lullaby.wav 
2>/dev/null || stat -c%s lullaby.wav) bytes)" +# Cleanup: docker stop "${NAME}" && docker rm "${NAME}" From 35e50549d89f700e6ca4218690c1457a275a93c6 Mon Sep 17 00:00:00 2001 From: Yadan Wei Date: Sun, 26 Apr 2026 23:21:35 -0700 Subject: [PATCH 08/10] Fix port mapping: container listens on 8000, map to host 8080 Signed-off-by: Yadan Wei --- examples/vllm-omni/image/run.sh | 2 +- examples/vllm-omni/tts/run.sh | 2 +- examples/vllm-omni/video/run.sh | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/vllm-omni/image/run.sh b/examples/vllm-omni/image/run.sh index 1f017f939f7c..d6dc1615ad95 100755 --- a/examples/vllm-omni/image/run.sh +++ b/examples/vllm-omni/image/run.sh @@ -6,7 +6,7 @@ IMAGE="${IMAGE:-763104351884.dkr.ecr.us-west-2.amazonaws.com/vllm-omni:omni-cuda MODEL="${MODEL:-black-forest-labs/FLUX.2-klein-4B}" NAME="${NAME:-omni-image}" -docker run -d --name "${NAME}" --gpus all --shm-size=2g -p 8080:8080 \ +docker run -d --name "${NAME}" --gpus all --shm-size=2g -p 8080:8000 \ -v "${HOME}/hf-cache:/root/.cache/huggingface" \ "${IMAGE}" --model "${MODEL}" diff --git a/examples/vllm-omni/tts/run.sh b/examples/vllm-omni/tts/run.sh index cc526a23f4d6..9f4f185a2139 100755 --- a/examples/vllm-omni/tts/run.sh +++ b/examples/vllm-omni/tts/run.sh @@ -7,7 +7,7 @@ IMAGE="${IMAGE:-763104351884.dkr.ecr.us-west-2.amazonaws.com/vllm-omni:omni-cuda MODEL="${MODEL:-Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice}" NAME="${NAME:-omni-tts}" -docker run -d --name "${NAME}" --gpus all --shm-size=2g -p 8080:8080 \ +docker run -d --name "${NAME}" --gpus all --shm-size=2g -p 8080:8000 \ -v "${HOME}/hf-cache:/root/.cache/huggingface" \ "${IMAGE}" --model "${MODEL}" diff --git a/examples/vllm-omni/video/run.sh b/examples/vllm-omni/video/run.sh index c382ff827ea2..2c443695a4f7 100755 --- a/examples/vllm-omni/video/run.sh +++ b/examples/vllm-omni/video/run.sh @@ -7,7 +7,7 @@ IMAGE="${IMAGE:-763104351884.dkr.ecr.us-west-2.amazonaws.com/vllm-omni:omni-cuda 
MODEL="${MODEL:-Wan-AI/Wan2.1-T2V-1.3B-Diffusers}" NAME="${NAME:-omni-video}" -docker run -d --name "${NAME}" --gpus all --shm-size=8g -p 8080:8080 \ +docker run -d --name "${NAME}" --gpus all --shm-size=8g -p 8080:8000 \ -v "${HOME}/hf-cache:/root/.cache/huggingface" \ "${IMAGE}" --model "${MODEL}" From c7ef1c14dd031b615ba34d13d4c5a28bfcc38175 Mon Sep 17 00:00:00 2001 From: Yadan Wei Date: Mon, 27 Apr 2026 15:25:57 -0700 Subject: [PATCH 09/10] docs(vllm-omni): correct ECR repo, image tag, and version labeling - Align version labeling with Ray convention: YAML 'version' now reflects the embedded framework version (0.18.0) instead of a DLC release number. - Add optional 'ecr_repository' field so the data-dir key can differ from the actual ECR repo name. vllm-omni images live under the 'vllm' repo, not 'vllm-omni'. - Fix SageMaker image tag: 'omni-sagemaker-cuda-v1' (verified against 763104351884.dkr.ecr.us-west-2.amazonaws.com/vllm), not the previous 'omni-cuda-sagemaker-v1'. - Rewrite the SageMaker async example to deploy TTS (works end-to-end) instead of video. The /v1/videos endpoint in 0.18.0 returns a job-ID JSON, which is what SageMaker async writes to S3; the MP4 itself is never written to S3 and cannot be retrieved via SageMaker in 0.18.0. - Clarify Known Limitations: video generation is not supported on SageMaker in 0.18.0 (use EC2 for the full video workflow). - Minor fix to EC2 video example (tensor-parallel-size 2, bumped steps, status value 'completed'). 
Signed-off-by: Yadan Wei --- .../{1.0.0-gpu-ec2.yml => 0.18.0-gpu-ec2.yml} | 5 ++- ...sagemaker.yml => 0.18.0-gpu-sagemaker.yml} | 7 ++-- docs/src/generate.py | 7 ++-- docs/src/image_config.py | 15 +++++--- docs/vllm-omni/index.md | 33 ++++++++++------- examples/vllm-omni/sagemaker/deploy_tts.py | 2 +- .../vllm-omni/sagemaker/deploy_tts_async.py | 36 +++++++++++++++++++ .../vllm-omni/sagemaker/deploy_video_async.py | 35 ------------------ examples/vllm-omni/video/run.sh | 8 ++--- 9 files changed, 82 insertions(+), 66 deletions(-) rename docs/src/data/vllm-omni/{1.0.0-gpu-ec2.yml => 0.18.0-gpu-ec2.yml} (90%) rename docs/src/data/vllm-omni/{1.0.0-gpu-sagemaker.yml => 0.18.0-gpu-sagemaker.yml} (82%) create mode 100644 examples/vllm-omni/sagemaker/deploy_tts_async.py delete mode 100644 examples/vllm-omni/sagemaker/deploy_video_async.py diff --git a/docs/src/data/vllm-omni/1.0.0-gpu-ec2.yml b/docs/src/data/vllm-omni/0.18.0-gpu-ec2.yml similarity index 90% rename from docs/src/data/vllm-omni/1.0.0-gpu-ec2.yml rename to docs/src/data/vllm-omni/0.18.0-gpu-ec2.yml index 1d5aa65228f6..ada5c1dec88b 100644 --- a/docs/src/data/vllm-omni/1.0.0-gpu-ec2.yml +++ b/docs/src/data/vllm-omni/0.18.0-gpu-ec2.yml @@ -1,5 +1,6 @@ framework: vLLM-Omni -version: "1.0.0" +version: "0.18.0" +ecr_repository: vllm accelerator: gpu python: py312 cuda: cu129 @@ -8,8 +9,6 @@ platform: default public_registry: true tags: - - "omni-cuda-v1.0.0" - - "omni-cuda-v1.0" - "omni-cuda-v1" announcements: diff --git a/docs/src/data/vllm-omni/1.0.0-gpu-sagemaker.yml b/docs/src/data/vllm-omni/0.18.0-gpu-sagemaker.yml similarity index 82% rename from docs/src/data/vllm-omni/1.0.0-gpu-sagemaker.yml rename to docs/src/data/vllm-omni/0.18.0-gpu-sagemaker.yml index 588fb7e8fd01..252e3552ead2 100644 --- a/docs/src/data/vllm-omni/1.0.0-gpu-sagemaker.yml +++ b/docs/src/data/vllm-omni/0.18.0-gpu-sagemaker.yml @@ -1,5 +1,6 @@ framework: vLLM-Omni -version: "1.0.0" +version: "0.18.0" +ecr_repository: vllm accelerator: gpu 
python: py312 cuda: cu129 @@ -8,9 +9,7 @@ platform: sagemaker public_registry: true tags: - - "omni-cuda-sagemaker-v1.0.0" - - "omni-cuda-sagemaker-v1.0" - - "omni-cuda-sagemaker-v1" + - "omni-sagemaker-cuda-v1" announcements: - "Initial release of vLLM-Omni containers for SageMaker" diff --git a/docs/src/generate.py b/docs/src/generate.py index 6189cbc5b926..43dca381d322 100644 --- a/docs/src/generate.py +++ b/docs/src/generate.py @@ -376,9 +376,12 @@ def generate_available_images(dry_run: bool = False) -> str: section = f"{AVAILABLE_IMAGES_TABLE_HEADER} {display_name}\n" if has_public_registry: - url = f"{PUBLIC_GALLERY_URL}/{repository}" + # Use ecr_repository from images (falls back to data-dir key when unset) so display + # reflects the actual ECR repo when the data-dir key differs (e.g., vllm-omni -> vllm). + ecr_repo = images[0].ecr_repository if images else repository + url = f"{PUBLIC_GALLERY_URL}/{ecr_repo}" section += ( - f"\nThese images are also available in ECR Public Gallery: [{repository}]({url})\n" + f"\nThese images are also available in ECR Public Gallery: [{ecr_repo}]({url})\n" ) if table_config.get("note"): section += f"\n{table_config['note']}\n" diff --git a/docs/src/image_config.py b/docs/src/image_config.py index f5c02e52837a..cc36a15a4572 100644 --- a/docs/src/image_config.py +++ b/docs/src/image_config.py @@ -45,6 +45,13 @@ def repository(self) -> str: """Repository name for this image.""" return self._repository + @property + def ecr_repository(self) -> str: + """ECR repository name for image URIs. 
Defaults to repository, but can be overridden + via the optional 'ecr_repository' YAML field when the data-directory key differs from + the actual ECR repo name (e.g., data dir 'vllm-omni' -> ECR repo 'vllm').""" + return self._data.get("ecr_repository") or self._repository + @property def framework_group(self) -> str: """Framework group key (or repository if not in a group).""" @@ -91,11 +98,11 @@ def get_image_uris(self) -> list[str]: uris = [] for tag in tags: - uris.append(build_ecr_uri(account, self._repository, tag, region)) + uris.append(build_ecr_uri(account, self.ecr_repository, tag, region)) if self.get("public_registry"): for tag in tags: - uris.append(build_public_ecr_uri(self._repository, tag)) + uris.append(build_public_ecr_uri(self.ecr_repository, tag)) return uris @@ -126,7 +133,7 @@ def display_framework_version(self) -> str: def display_example_url(self) -> str: """Example ECR URL for table display.""" account = self.get("example_ecr_account", GLOBAL_CONFIG["example_ecr_account"]) - return f"`{build_ecr_uri(account, self._repository, self.display_tag)}`" + return f"`{build_ecr_uri(account, self.ecr_repository, self.display_tag)}`" @property def display_platform(self) -> str: @@ -277,4 +284,4 @@ def get_latest_image_uri(repo: str, platform: str) -> str: latest = sort_by_version(matching)[0] account = latest.get("example_ecr_account", GLOBAL_CONFIG["example_ecr_account"]) - return build_ecr_uri(account, repo, latest.display_tag, "us-west-2") + return build_ecr_uri(account, latest.ecr_repository, latest.display_tag, "us-west-2") diff --git a/docs/vllm-omni/index.md b/docs/vllm-omni/index.md index 9984c9d588cd..ba7e5ffb4689 100644 --- a/docs/vllm-omni/index.md +++ b/docs/vllm-omni/index.md @@ -5,8 +5,8 @@ Pre-built Docker images for serving omni-modality models (text-to-speech, image ## Latest Announcements -**vLLM-Omni 1.0.0** — Initial release. Serves TTS, image, video, and omni-chat models through OpenAI-compatible APIs. 
Includes a SageMaker routing -middleware for dispatching `/invocations` to any omni endpoint via `CustomAttributes`. +**April 24, 2026** — vLLM-Omni 0.18.0 initial release. Serves TTS, image, video, and omni-chat models through OpenAI-compatible APIs. Includes a +SageMaker routing middleware for dispatching `/invocations` to any omni endpoint via `CustomAttributes`. ## Pull Commands @@ -41,8 +41,10 @@ For package versions included in each release, see the [Release Notes](../releas ## Model Compatibility - Models must have a standard HuggingFace `config.json` with a recognized `model_type`, or be diffusers pipeline models with `model_index.json`. -- Models requiring `--stage-configs-path` (e.g., CosyVoice3, Fish Speech) are not supported in v1.0.0 — the engine subprocess cannot resolve custom - model types. +- Some HuggingFace repos ship a `config.json` without a `model_type` field; vllm-omni's config resolver will reject these. Patching the local snapshot + with a minimal `config.json` (`{"model_type": "...", "architectures": ["..."]}`) is a common workaround, but the container's pinned `transformers` + version must also register the model type — models newer than that pin will fail at engine startup. Upgrading `transformers` in-place risks breaking + the supported models; wait for a future vllm-omni release with an updated pin. - Multi-stage omni models (thinker + talker + decoder) like Qwen2.5-Omni need significantly more VRAM than the model size suggests. Refer to the individual model cards for minimum GPU requirements. @@ -103,7 +105,7 @@ Start the server, then submit a request. 
Three things are **required** on `/v1/c --8<-- "examples/vllm-omni/qwen2.5-omni/run.sh" ``` -The `/v1/audio/speech` shortcut (voices: `Chelsie`, `Ethan`) bypasses the thinker and does not apply the correct sampling params in v1.0.0, so it +The `/v1/audio/speech` shortcut (voices: `Chelsie`, `Ethan`) bypasses the thinker and does not apply the correct sampling params in 0.18.0, so it produces noisy output for Qwen2.5-Omni. Prefer `/v1/chat/completions` for this model. ## SageMaker Deployment @@ -127,7 +129,7 @@ header: | --- | --- | | `route=/v1/audio/speech` | TTS | | `route=/v1/images/generations` | Image generation | -| `route=/v1/videos` | Video generation (JSON auto-converted to form-data) | +| `route=/v1/videos` | Video generation (JSON auto-converted to form-data) — returns job-ID only in 0.18.0, MP4 not retrievable via SageMaker | | `route=/v1/chat/completions` | Multimodal chat | | *(no route)* | vLLM default `/invocations` (chat/completion/embed) | @@ -160,23 +162,28 @@ When done, delete the endpoint: predictor.delete_endpoint() ``` -### Async Inference for Video and Long-Running Generation +### Async Inference for Long-Running TTS Generation SageMaker real-time inference has a 60-second timeout. First requests to TTS models may exceed this due to `torch.compile` warmup (~67s); async inference avoids the limit, as does retrying after warmup completes. -For `/v1/videos`, async inference is required because the endpoint returns a job ID rather than the final MP4. The MP4 must be retrieved by polling -the container directly — SageMaker async inference only captures the initial JSON response. +!!! warning "Video generation is not supported on SageMaker in 0.18.0 — see [Known Limitations](#known-limitations) below. Use EC2 for video." 
```python ---8<-- "examples/vllm-omni/sagemaker/deploy_video_async.py" +--8<-- "examples/vllm-omni/sagemaker/deploy_tts_async.py" ``` +For async inference, upload the JSON input payload to S3 first, then call `invoke_endpoint_async` with `InputLocation` set to that object's S3 URI and +`CustomAttributes="route=/v1/audio/speech"`. The resulting `.out` object in the configured S3 output path is the raw WAV audio — no polling or +additional retrieval step required. + ## Known Limitations -- **Video generation on SageMaker returns a job ID only.** The `/v1/videos` endpoint in v1.0.0 is async by design and `POST /v1/videos/sync` (which - blocks and returns raw MP4 bytes) is not available. Direct container access (EC2) supports the full video workflow — create job, poll status, - download MP4. A sync endpoint has been added in newer vllm-omni versions and will be supported in a future release. +- **Video generation is not supported on SageMaker in 0.18.0.** The `/v1/videos` endpoint is async by design — it returns a job-ID JSON immediately + and generates the MP4 in the background. Through SageMaker async inference, only that job-ID JSON is written to S3; the MP4 itself never lands in S3 + and cannot be retrieved through `invoke_endpoint` or `invoke_endpoint_async`. Use EC2 for video generation — direct container access supports the + full workflow (create job, poll status, download MP4). SageMaker support is expected once `POST /v1/videos/sync` (which blocks and returns raw MP4 + bytes) is available in a future vllm-omni release. - **First-request latency on SageMaker real-time endpoints.** TTS models can exceed the 60s invoke timeout on the first request due to `torch.compile` warmup. Use async inference or retry after warmup.
diff --git a/examples/vllm-omni/sagemaker/deploy_tts.py b/examples/vllm-omni/sagemaker/deploy_tts.py index a4e2d8a91a9a..a701bc90548e 100644 --- a/examples/vllm-omni/sagemaker/deploy_tts.py +++ b/examples/vllm-omni/sagemaker/deploy_tts.py @@ -5,7 +5,7 @@ from sagemaker.serializers import JSONSerializer model = Model( - image_uri="763104351884.dkr.ecr.us-west-2.amazonaws.com/vllm-omni:omni-cuda-sagemaker-v1.0.0", + image_uri="763104351884.dkr.ecr.us-west-2.amazonaws.com/vllm:omni-sagemaker-cuda-v1", role="arn:aws:iam:::role/SageMakerExecutionRole", env={"SM_VLLM_MODEL": "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice"}, predictor_cls=Predictor, diff --git a/examples/vllm-omni/sagemaker/deploy_tts_async.py b/examples/vllm-omni/sagemaker/deploy_tts_async.py new file mode 100644 index 000000000000..9c793f33d5b2 --- /dev/null +++ b/examples/vllm-omni/sagemaker/deploy_tts_async.py @@ -0,0 +1,36 @@ +"""Deploy a vLLM-Omni TTS model to a SageMaker async inference endpoint. + +Async inference avoids the 60-second real-time invoke timeout, which the first +TTS request can exceed due to torch.compile warmup (~67s). The /v1/audio/speech +endpoint returns raw WAV bytes, so the async output written to S3 is the usable +audio file — no polling or extra retrieval step needed. 
+""" + +from sagemaker.async_inference import AsyncInferenceConfig +from sagemaker.model import Model +from sagemaker.predictor import Predictor +from sagemaker.serializers import JSONSerializer + +model = Model( + image_uri="763104351884.dkr.ecr.us-west-2.amazonaws.com/vllm:omni-sagemaker-cuda-v1", + role="arn:aws:iam:::role/SageMakerExecutionRole", + env={"SM_VLLM_MODEL": "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice"}, + predictor_cls=Predictor, +) + +predictor = model.deploy( + instance_type="ml.g5.xlarge", + initial_instance_count=1, + endpoint_name="vllm-omni-tts-async", + inference_ami_version="al2-ami-sagemaker-inference-gpu-3-1", + serializer=JSONSerializer(), + async_inference_config=AsyncInferenceConfig( + output_path="s3:///vllm-omni-async-output/", + max_concurrent_invocations_per_instance=1, + ), + wait=True, +) + +# Invoke async — upload the JSON input to S3, then call invoke_endpoint_async. +# The resulting .out object in S3 is the raw WAV audio bytes (content-type audio/wav). +# Use CustomAttributes to route /invocations → /v1/audio/speech. diff --git a/examples/vllm-omni/sagemaker/deploy_video_async.py b/examples/vllm-omni/sagemaker/deploy_video_async.py deleted file mode 100644 index d1ac7c807354..000000000000 --- a/examples/vllm-omni/sagemaker/deploy_video_async.py +++ /dev/null @@ -1,35 +0,0 @@ -"""Deploy a vLLM-Omni video model to a SageMaker async inference endpoint. - -Video generation is async by design — /v1/videos returns a job ID immediately, -so only the job metadata JSON is written to S3, not the MP4 file. To retrieve -the MP4, poll /v1/videos//content directly against the endpoint. 
-""" - -from sagemaker.async_inference import AsyncInferenceConfig -from sagemaker.model import Model -from sagemaker.predictor import Predictor -from sagemaker.serializers import JSONSerializer - -model = Model( - image_uri="763104351884.dkr.ecr.us-west-2.amazonaws.com/vllm-omni:omni-cuda-sagemaker-v1.0.0", - role="arn:aws:iam:::role/SageMakerExecutionRole", - env={"SM_VLLM_MODEL": "Wan-AI/Wan2.1-T2V-1.3B-Diffusers"}, - predictor_cls=Predictor, -) - -predictor = model.deploy( - instance_type="ml.g6e.xlarge", - initial_instance_count=1, - endpoint_name="vllm-omni-video-async", - inference_ami_version="al2-ami-sagemaker-inference-gpu-3-1", - serializer=JSONSerializer(), - async_inference_config=AsyncInferenceConfig( - output_path="s3:///vllm-omni-async-output/", - max_concurrent_invocations_per_instance=1, - ), - wait=True, -) - -# The middleware converts the JSON payload to multipart/form-data for /v1/videos. -# Response contains the job ID; use the /v1/videos//content endpoint to -# retrieve the MP4 bytes directly from the container. diff --git a/examples/vllm-omni/video/run.sh b/examples/vllm-omni/video/run.sh index 2c443695a4f7..36db972d82f3 100755 --- a/examples/vllm-omni/video/run.sh +++ b/examples/vllm-omni/video/run.sh @@ -9,24 +9,24 @@ NAME="${NAME:-omni-video}" docker run -d --name "${NAME}" --gpus all --shm-size=8g -p 8080:8000 \ -v "${HOME}/hf-cache:/root/.cache/huggingface" \ - "${IMAGE}" --model "${MODEL}" + "${IMAGE}" --model "${MODEL}" --tensor-parallel-size 2 until curl -sf http://localhost:8080/health >/dev/null; do sleep 5; done # /v1/videos requires multipart/form-data. 
JOB_ID=$(curl -sf -X POST http://localhost:8080/v1/videos \ -F "prompt=a dog running on a beach at sunset" \ - -F "num_frames=17" -F "num_inference_steps=4" \ + -F "num_frames=17" -F "num_inference_steps=30" \ -F "size=480x320" -F "seed=42" \ | python3 -c "import json,sys;print(json.load(sys.stdin)['id'])") echo "submitted job ${JOB_ID}" -# Poll until succeeded (5s interval, 10 min timeout). +# Poll until completed (5s interval, 10 min timeout). for _ in $(seq 1 120); do STATUS=$(curl -sf "http://localhost:8080/v1/videos/${JOB_ID}" \ | python3 -c "import json,sys;print(json.load(sys.stdin)['status'])") - [ "${STATUS}" = "succeeded" ] && break + [ "${STATUS}" = "completed" ] && break [ "${STATUS}" = "failed" ] && { echo "job failed"; exit 1; } sleep 5 done From d9fd989133ceac0fa902e85975d94e23b764acd8 Mon Sep 17 00:00:00 2001 From: Yadan Wei Date: Mon, 27 Apr 2026 15:41:04 -0700 Subject: [PATCH 10/10] docs(vllm-omni): use hyphenated package key matching PyPI name The vllm-omni package on PyPI is named with a hyphen (pip install vllm-omni), not an underscore. Align the YAML package key with the PyPI project name and drop the redundant underscore display_names entry in global.yml. 
Signed-off-by: Yadan Wei --- docs/src/data/vllm-omni/0.18.0-gpu-ec2.yml | 2 +- docs/src/data/vllm-omni/0.18.0-gpu-sagemaker.yml | 2 +- docs/src/global.yml | 6 +++++- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/docs/src/data/vllm-omni/0.18.0-gpu-ec2.yml b/docs/src/data/vllm-omni/0.18.0-gpu-ec2.yml index ada5c1dec88b..a6bc7ec8b859 100644 --- a/docs/src/data/vllm-omni/0.18.0-gpu-ec2.yml +++ b/docs/src/data/vllm-omni/0.18.0-gpu-ec2.yml @@ -18,7 +18,7 @@ announcements: packages: vllm: "0.18.0" - vllm_omni: "0.18.0" + vllm-omni: "0.18.0" pytorch: "2.10.0" torchvision: "0.25.0" torchaudio: "2.10.0" diff --git a/docs/src/data/vllm-omni/0.18.0-gpu-sagemaker.yml b/docs/src/data/vllm-omni/0.18.0-gpu-sagemaker.yml index 252e3552ead2..bb61f8a78299 100644 --- a/docs/src/data/vllm-omni/0.18.0-gpu-sagemaker.yml +++ b/docs/src/data/vllm-omni/0.18.0-gpu-sagemaker.yml @@ -18,7 +18,7 @@ announcements: packages: vllm: "0.18.0" - vllm_omni: "0.18.0" + vllm-omni: "0.18.0" pytorch: "2.10.0" torchvision: "0.25.0" torchaudio: "2.10.0" diff --git a/docs/src/global.yml b/docs/src/global.yml index fb7ed4c95d23..e76cde854d3d 100644 --- a/docs/src/global.yml +++ b/docs/src/global.yml @@ -67,7 +67,6 @@ display_names: sagemaker-xgboost: "XGBoost" sglang: "SGLang" vllm: "vLLM" - vllm_omni: "vLLM-Omni" vllm-arm64: "vLLM ARM64" vllm-omni: "vLLM-Omni" pytorch-training: "PyTorch Training" @@ -101,6 +100,11 @@ display_names: known_issues: "Known Issues" # Packages + # Package keys use the same string as the YAML `packages:` field. A package + # key may coincide with a repository key from the section above: `vllm-omni` + # is both the repo key (matching the data-dir name) and the package key + # (hyphenated, matching the PyPI project name), so the single `vllm-omni` + # entry above supplies the "vLLM-Omni" display name for both. python: "Python" cuda: "CUDA" cudnn: "cuDNN"