diff --git a/docs/.nav.yml b/docs/.nav.yml index a94963d53480..4c2a53cdf728 100644 --- a/docs/.nav.yml +++ b/docs/.nav.yml @@ -5,12 +5,14 @@ nav: - Using Deep Learning Containers: get_started/using_dlcs.md - Release Notifications: get_started/release_notifications.md - Ray: ray/index.md + - vLLM-Omni: vllm-omni/index.md - Release Notes: - releasenotes/index.md - Base: releasenotes/base/index.md - Ray: releasenotes/ray/index.md - SGLang: releasenotes/sglang/index.md - vLLM: releasenotes/vllm/index.md + - vLLM-Omni: releasenotes/vllm-omni/index.md - PyTorch: releasenotes/pytorch/index.md - Tensorflow: releasenotes/tensorflow/index.md - Tutorials: tutorials diff --git a/docs/src/data/vllm-omni/0.18.0-gpu-ec2.yml b/docs/src/data/vllm-omni/0.18.0-gpu-ec2.yml new file mode 100644 index 000000000000..a6bc7ec8b859 --- /dev/null +++ b/docs/src/data/vllm-omni/0.18.0-gpu-ec2.yml @@ -0,0 +1,27 @@ +framework: vLLM-Omni +version: "0.18.0" +ecr_repository: vllm +accelerator: gpu +python: py312 +cuda: cu129 +os: amzn2023 +platform: default +public_registry: true + +tags: + - "omni-cuda-v1" + +announcements: + - "Initial release of vLLM-Omni containers for EC2, ECS, EKS" + - "Serves omni-modality models: TTS, image generation, video generation, multimodal chat" + - "Built on Amazon Linux 2023 with Python 3.12 and CUDA 12.9" + +packages: + vllm: "0.18.0" + vllm-omni: "0.18.0" + pytorch: "2.10.0" + torchvision: "0.25.0" + torchaudio: "2.10.0" + cuda: "12.9.1" + flashinfer: "0.6.6" + efa: "1.47.0" diff --git a/docs/src/data/vllm-omni/0.18.0-gpu-sagemaker.yml b/docs/src/data/vllm-omni/0.18.0-gpu-sagemaker.yml new file mode 100644 index 000000000000..bb61f8a78299 --- /dev/null +++ b/docs/src/data/vllm-omni/0.18.0-gpu-sagemaker.yml @@ -0,0 +1,27 @@ +framework: vLLM-Omni +version: "0.18.0" +ecr_repository: vllm +accelerator: gpu +python: py312 +cuda: cu129 +os: amzn2023 +platform: sagemaker +public_registry: true + +tags: + - "omni-sagemaker-cuda-v1" + +announcements: + - "Initial release 
of vLLM-Omni containers for SageMaker" + - "Includes ASGI routing middleware for /invocations dispatch via CustomAttributes" + - "Built on Amazon Linux 2023 with Python 3.12 and CUDA 12.9" + +packages: + vllm: "0.18.0" + vllm-omni: "0.18.0" + pytorch: "2.10.0" + torchvision: "0.25.0" + torchaudio: "2.10.0" + cuda: "12.9.1" + flashinfer: "0.6.6" + efa: "1.47.0" diff --git a/docs/src/generate.py b/docs/src/generate.py index 6189cbc5b926..43dca381d322 100644 --- a/docs/src/generate.py +++ b/docs/src/generate.py @@ -376,9 +376,12 @@ def generate_available_images(dry_run: bool = False) -> str: section = f"{AVAILABLE_IMAGES_TABLE_HEADER} {display_name}\n" if has_public_registry: - url = f"{PUBLIC_GALLERY_URL}/{repository}" + # Use ecr_repository from images (falls back to data-dir key when unset) so display + # reflects the actual ECR repo when the data-dir key differs (e.g., vllm-omni -> vllm). + ecr_repo = images[0].ecr_repository if images else repository + url = f"{PUBLIC_GALLERY_URL}/{ecr_repo}" section += ( - f"\nThese images are also available in ECR Public Gallery: [{repository}]({url})\n" + f"\nThese images are also available in ECR Public Gallery: [{ecr_repo}]({url})\n" ) if table_config.get("note"): section += f"\n{table_config['note']}\n" diff --git a/docs/src/global.yml b/docs/src/global.yml index 4f5133c38911..e76cde854d3d 100644 --- a/docs/src/global.yml +++ b/docs/src/global.yml @@ -68,6 +68,7 @@ display_names: sglang: "SGLang" vllm: "vLLM" vllm-arm64: "vLLM ARM64" + vllm-omni: "vLLM-Omni" pytorch-training: "PyTorch Training" pytorch-training-arm64: "PyTorch Training ARM64" pytorch-inference: "PyTorch Inference" @@ -99,6 +100,11 @@ display_names: known_issues: "Known Issues" # Packages + # Package keys use the same string as the YAML `packages:` field (underscored + # where applicable), which is distinct from repository keys in the section + # above (hyphenated, matching the data-dir name). 
For example, `vllm-omni` + # is the repo key (display: "vLLM-Omni" in tables/headings) while `vllm_omni` + # is the package key used in release notes package tables. python: "Python" cuda: "CUDA" cudnn: "cuDNN" @@ -167,6 +173,7 @@ table_order: - sglang - vllm - vllm-arm64 + - vllm-omni - pytorch-training - pytorch-inference - pytorch-training-arm64 diff --git a/docs/src/image_config.py b/docs/src/image_config.py index f5c02e52837a..cc36a15a4572 100644 --- a/docs/src/image_config.py +++ b/docs/src/image_config.py @@ -45,6 +45,13 @@ def repository(self) -> str: """Repository name for this image.""" return self._repository + @property + def ecr_repository(self) -> str: + """ECR repository name for image URIs. Defaults to repository, but can be overridden + via the optional 'ecr_repository' YAML field when the data-directory key differs from + the actual ECR repo name (e.g., data dir 'vllm-omni' -> ECR repo 'vllm').""" + return self._data.get("ecr_repository") or self._repository + @property def framework_group(self) -> str: """Framework group key (or repository if not in a group).""" @@ -91,11 +98,11 @@ def get_image_uris(self) -> list[str]: uris = [] for tag in tags: - uris.append(build_ecr_uri(account, self._repository, tag, region)) + uris.append(build_ecr_uri(account, self.ecr_repository, tag, region)) if self.get("public_registry"): for tag in tags: - uris.append(build_public_ecr_uri(self._repository, tag)) + uris.append(build_public_ecr_uri(self.ecr_repository, tag)) return uris @@ -126,7 +133,7 @@ def display_framework_version(self) -> str: def display_example_url(self) -> str: """Example ECR URL for table display.""" account = self.get("example_ecr_account", GLOBAL_CONFIG["example_ecr_account"]) - return f"`{build_ecr_uri(account, self._repository, self.display_tag)}`" + return f"`{build_ecr_uri(account, self.ecr_repository, self.display_tag)}`" @property def display_platform(self) -> str: @@ -277,4 +284,4 @@ def get_latest_image_uri(repo: str, platform: str) 
-> str: latest = sort_by_version(matching)[0] account = latest.get("example_ecr_account", GLOBAL_CONFIG["example_ecr_account"]) - return build_ecr_uri(account, repo, latest.display_tag, "us-west-2") + return build_ecr_uri(account, latest.ecr_repository, latest.display_tag, "us-west-2") diff --git a/docs/src/macros.py b/docs/src/macros.py index 8eab4e930138..cf5ce81be274 100644 --- a/docs/src/macros.py +++ b/docs/src/macros.py @@ -42,4 +42,6 @@ def define_env(env): "latest_ray_default_cpu": _get_latest_ray_uri("default", "cpu"), "latest_ray_sagemaker_gpu": _get_latest_ray_uri("sagemaker", "gpu"), "latest_ray_sagemaker_cpu": _get_latest_ray_uri("sagemaker", "cpu"), + "latest_vllm_omni_ec2": get_latest_image_uri("vllm-omni", "default"), + "latest_vllm_omni_sagemaker": get_latest_image_uri("vllm-omni", "sagemaker"), } diff --git a/docs/src/tables/vllm-omni.yml b/docs/src/tables/vllm-omni.yml new file mode 100644 index 000000000000..4c4ffa203fa2 --- /dev/null +++ b/docs/src/tables/vllm-omni.yml @@ -0,0 +1,14 @@ +# Table Configuration - vLLM-Omni +columns: + - field: framework_version + header: "Framework" + - field: python + header: "Python" + - field: cuda + header: "CUDA" + - field: accelerator + header: "Accelerator" + - field: platform + header: "Platform" + - field: example_url + header: "Example URL" diff --git a/docs/vllm-omni/index.md b/docs/vllm-omni/index.md new file mode 100644 index 000000000000..ba7e5ffb4689 --- /dev/null +++ b/docs/vllm-omni/index.md @@ -0,0 +1,198 @@ +# vLLM-Omni Inference + +Pre-built Docker images for serving omni-modality models (text-to-speech, image generation, video generation, and multimodal chat) with +[vLLM-Omni](https://github.com/vllm-project/vllm-omni). Built on Amazon Linux 2023 with CUDA 12.9 and Python 3.12. + +## Latest Announcements + +**April 24, 2026** — vLLM-Omni 0.18.0 initial release. Serves TTS, image, video, and omni-chat models through OpenAI-compatible APIs. 
Includes a +SageMaker routing middleware for dispatching `/invocations` to any omni endpoint via `CustomAttributes`. + +## Pull Commands + +**EC2:** + +```bash +docker pull {{ images.latest_vllm_omni_ec2 }} +``` + +**SageMaker:** + +```bash +docker pull {{ images.latest_vllm_omni_sagemaker }} +``` + +See [Available Images](../reference/available_images.md) for all image URIs and [Getting Started](../get_started/index.md) for authentication +instructions. + +## Packages + +For package versions included in each release, see the [Release Notes](../releasenotes/vllm-omni/index.md). + +## Supported Modalities + +| Modality | Route | Example Model | +| --- | --- | --- | +| Text-to-Speech | `/v1/audio/speech` | `Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice` | +| Image Generation | `/v1/images/generations` | `black-forest-labs/FLUX.2-klein-4B` | +| Video Generation | `/v1/videos` | `Wan-AI/Wan2.1-T2V-1.3B-Diffusers` | +| Multimodal Chat | `/v1/chat/completions` | `bytedance-research/BAGEL-7B-MoT`, `Qwen/Qwen2.5-Omni-3B` | + +## Model Compatibility + +- Models must have a standard HuggingFace `config.json` with a recognized `model_type`, or be diffusers pipeline models with `model_index.json`. +- Some HuggingFace repos ship a `config.json` without a `model_type` field; vllm-omni's config resolver will reject these. Patching the local snapshot + with a minimal `config.json` (`{"model_type": "...", "architectures": ["..."]}`) is a common workaround, but the container's pinned `transformers` + version must also register the model type — models newer than that pin will fail at engine startup. Upgrading `transformers` in-place risks breaking + the supported models; wait for a future vllm-omni release with an updated pin. +- Multi-stage omni models (thinker + talker + decoder) like Qwen2.5-Omni need significantly more VRAM than the model size suggests. Refer to the + individual model cards for minimum GPU requirements. 
+ +## EC2 Deployment + +The container runs `vllm serve --omni` and exposes the OpenAI-compatible API on port 8080. Each example below is a self-contained shell script that +starts the container, waits for readiness, submits a request, and writes the output to disk. Any `vllm serve` flag may be appended to `docker run` +(e.g., `--tensor-parallel-size 2`, `--max-model-len 2048`, `--enforce-eager`). + +### Text-to-Speech + +**Model:** [Qwen3-TTS-12Hz-1.7B-CustomVoice](https://huggingface.co/Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice) — a 1.7B-parameter Qwen3 text-to-speech +model supporting multiple voices and languages, runs on a single 24 GB GPU (A10G / L4). + +```bash +--8<-- "examples/vllm-omni/tts/run.sh" +``` + +### Image Generation + +**Model:** [FLUX.2-klein-4B](https://huggingface.co/black-forest-labs/FLUX.2-klein-4B) — a 4B-parameter rectified-flow transformer from Black Forest +Labs, produces high-quality 512×512 images from text prompts, runs on a single 24 GB GPU. + +```bash +--8<-- "examples/vllm-omni/image/run.sh" +``` + +### Video Generation + +**Model:** [Wan2.1-T2V-1.3B](https://huggingface.co/Wan-AI/Wan2.1-T2V-1.3B-Diffusers) — a 1.3B-parameter text-to-video diffusion model from the Wan +team, generates short clips at up to 480×832 resolution. Needs a 48 GB GPU (L40S) or 2× 24 GB GPUs with `--tensor-parallel-size 2`. + +The `/v1/videos` endpoint is asynchronous — it returns a job ID immediately and generates the video in the background. The script below submits the +job, polls until it completes, then downloads the MP4. + +```bash +--8<-- "examples/vllm-omni/video/run.sh" +``` + +### Multimodal Chat + +Use the standard OpenAI chat-completions API. Multimodal inputs (images, audio) are supplied as URL or base64 content parts in the message list. + +**Example model:** [Qwen2.5-Omni-3B](https://huggingface.co/Qwen/Qwen2.5-Omni-3B) — a 3B-parameter omni model accepting text, image, and audio inputs +and generating text or speech outputs. 
Multi-stage architecture (thinker + talker + code2wav) requires **≥ 4 GPUs**: `g5.12xlarge` / `g6.12xlarge` (4× +A10G) or `g6e.12xlarge` (4× L40S). + +Start the server, then submit a request. Three things are **required** on `/v1/chat/completions` to produce clean audio from Qwen2.5-Omni: + +1. `"modalities": ["audio"]` — not `["text","audio"]` (that returns empty audio). +2. `"sampling_params_list"` — a 3-element list (thinker, talker, code2wav). The image's built-in per-stage defaults produce noise; use the values from + the official Qwen docs. +3. The exact Qwen system prompt. + +!!! warning "Omitting `sampling_params_list` returns 200 with valid WAV bytes that sound like noise — the single most common footgun." + +```bash +--8<-- "examples/vllm-omni/qwen2.5-omni/run.sh" +``` + +The `/v1/audio/speech` shortcut (voices: `Chelsie`, `Ethan`) bypasses the thinker and does not apply the correct sampling params in 0.18.0, so it +produces noisy output for Qwen2.5-Omni. Prefer `/v1/chat/completions` for this model. 
+ +## SageMaker Deployment + +### Prerequisites + +- AWS CLI configured with appropriate permissions +- An IAM execution role with SageMaker and ECR permissions (see [Ray tutorial](../ray/index.md#prerequisites) for an example setup) +- SageMaker Python SDK v2: + +```bash +pip install 'sagemaker>=2,<3' +``` + +### Routing Middleware + +The SageMaker image includes an ASGI middleware that dispatches `/invocations` to the correct vllm-omni endpoint based on the `CustomAttributes` +header: + +| `CustomAttributes` | Dispatched to | +| --- | --- | +| `route=/v1/audio/speech` | TTS | +| `route=/v1/images/generations` | Image generation | +| `route=/v1/videos` | Video generation (JSON auto-converted to form-data) — returns job-ID only in 0.18.0, MP4 not retrievable via SageMaker | +| `route=/v1/chat/completions` | Multimodal chat | +| *(no route)* | vLLM default `/invocations` (chat/completion/embed) | + +### Environment Variables + +Any `SM_VLLM_*` env var is converted to a `--` CLI argument (e.g., `SM_VLLM_MAX_MODEL_LEN=2048` → `--max-model-len 2048`). + +| Variable | Description | Example | +| --- | --- | --- | +| `SM_VLLM_MODEL` | Model ID (HuggingFace or local path) | `Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice` | +| `SM_VLLM_MAX_MODEL_LEN` | Max sequence length | `2048` | +| `SM_VLLM_ENFORCE_EAGER` | Disable CUDA graphs | `true` | +| `SM_VLLM_TENSOR_PARALLEL_SIZE` | Number of GPUs for TP | `2` | +| `HF_TOKEN` | HuggingFace token for gated models | `hf_...` | + +### Deploy a TTS Endpoint + +!!! warning "SageMaker endpoint deployment takes several minutes and incurs costs. Remember to delete endpoints when done." + +```python +--8<-- "examples/vllm-omni/sagemaker/deploy_tts.py" +``` + +GPU deploys require `inference_ami_version` — the default SageMaker host AMI has incompatible NVIDIA drivers for CUDA 12.9 images. See +[ProductionVariant API reference](https://docs.aws.amazon.com/sagemaker/latest/APIReference/API_ProductionVariant.html) for valid values. 
+ +When done, delete the endpoint: + +```python +predictor.delete_endpoint() +``` + +### Async Inference for Long-Running TTS Generation + +SageMaker real-time inference has a 60-second timeout. First requests to TTS models may exceed this due to `torch.compile` warmup (~67s); async +inference avoids the limit, as does retrying after warmup completes. + +!!! warning "Video generation is not supported on SageMaker in 0.18.0 — see [Known Limitations](#known-limitations) below. Use EC2 for video." + +```python +--8<-- "examples/vllm-omni/sagemaker/deploy_tts_async.py" +``` + +For async inference, upload the JSON input payload to S3 first, then call `invoke_endpoint_async` with `InputLocation=<s3-uri-of-payload>` and +`CustomAttributes="route=/v1/audio/speech"`. The resulting `.out` object in the configured S3 output path is the raw WAV audio — no polling or +additional retrieval step required. + +## Known Limitations + +- **Video generation is not supported on SageMaker in 0.18.0.** The `/v1/videos` endpoint is async by design — it returns a job-ID JSON immediately + and generates the MP4 in the background. Through SageMaker async inference, only that job-ID JSON is written to S3; the MP4 itself never lands in S3 + and cannot be retrieved through `invoke_endpoint` or `invoke_endpoint_async`. Use EC2 for video generation — direct container access supports the + full workflow (create job, poll status, download MP4). SageMaker support is expected once `POST /v1/videos/sync` (which blocks and returns raw MP4 + bytes) is available in a future vllm-omni release. +- **First-request latency on SageMaker real-time endpoints.** TTS models can exceed the 60s invoke timeout on the first request due to `torch.compile` + warmup. Use async inference or retry after warmup. + +## Release Notes + +See [vLLM-Omni Release Notes](../releasenotes/vllm-omni/index.md) for version history and changelogs.
+ +## Resources + +- [vLLM-Omni Documentation](https://github.com/vllm-project/vllm-omni) +- [GitHub Repository](https://github.com/aws/deep-learning-containers) +- [Available Images](../reference/available_images.md) diff --git a/examples/vllm-omni/image/run.sh b/examples/vllm-omni/image/run.sh new file mode 100755 index 000000000000..d6dc1615ad95 --- /dev/null +++ b/examples/vllm-omni/image/run.sh @@ -0,0 +1,22 @@ +#!/usr/bin/env bash +# End-to-end image-generation example: start server, wait for ready, generate. +set -euo pipefail + +IMAGE="${IMAGE:-763104351884.dkr.ecr.us-west-2.amazonaws.com/vllm:omni-cuda-v1}" +MODEL="${MODEL:-black-forest-labs/FLUX.2-klein-4B}" +NAME="${NAME:-omni-image}" + +docker run -d --name "${NAME}" --gpus all --shm-size=2g -p 8080:8000 \ + -v "${HOME}/hf-cache:/root/.cache/huggingface" \ + "${IMAGE}" --model "${MODEL}" + +until curl -sf http://localhost:8080/health >/dev/null; do sleep 5; done + +# Response JSON has data[0].b64_json — decode to PNG. +curl -sf -X POST http://localhost:8080/v1/images/generations \ + -H "Content-Type: application/json" \ + -d '{"prompt": "a red apple on a white table, studio lighting", "size": "512x512", "n": 1}' \ + | python3 -c "import base64,json,sys;open('image.png','wb').write(base64.b64decode(json.load(sys.stdin)['data'][0]['b64_json']))" + +echo "wrote image.png ($(stat -f%z image.png 2>/dev/null || stat -c%s image.png) bytes)" +# Cleanup: docker stop "${NAME}" && docker rm "${NAME}" diff --git a/examples/vllm-omni/qwen2.5-omni/run.sh b/examples/vllm-omni/qwen2.5-omni/run.sh new file mode 100755 index 000000000000..a04624bdf99d --- /dev/null +++ b/examples/vllm-omni/qwen2.5-omni/run.sh @@ -0,0 +1,46 @@ +#!/usr/bin/env bash +# End-to-end Qwen2.5-Omni-3B example: start server, wait for ready, +# generate speech via /v1/chat/completions. +# +# REQUIRES ≥ 4 GPUs (e.g., g5.12xlarge / g6.12xlarge / g6e.12xlarge). +# On single-GPU hosts the model's talker stage fails to load on GPU 1.
+set -euo pipefail + +IMAGE="${IMAGE:-763104351884.dkr.ecr.us-west-2.amazonaws.com/vllm:omni-cuda-v1}" +MODEL="${MODEL:-Qwen/Qwen2.5-Omni-3B}" +NAME="${NAME:-omni3b}" + +docker run -d --name "${NAME}" --gpus all --shm-size=16g -p 8080:8080 \ + -v "${HOME}/hf-cache:/root/.cache/huggingface" \ + -e HF_HUB_ENABLE_HF_TRANSFER=1 \ + "${IMAGE}" --model "${MODEL}" \ + --host 0.0.0.0 --port 8080 \ + --max-model-len 16384 --dtype bfloat16 + +# First start takes ~8 min (weight download + 3-stage load). +until curl -sf http://localhost:8080/health >/dev/null; do sleep 10; done + +# Three things are REQUIRED for clean audio: +# 1. "modalities": ["audio"] (NOT ["text","audio"] — returns empty audio) +# 2. "sampling_params_list" (3-element list: thinker, talker, code2wav; +# built-in defaults produce noise) +# 3. The exact Qwen system prompt below. +# Omitting #2 returns 200 OK with valid WAV bytes that sound like noise. +curl -sf -X POST http://localhost:8080/v1/chat/completions \ + -H 'Content-Type: application/json' \ + -d '{ + "model": "Qwen/Qwen2.5-Omni-3B", + "modalities": ["audio"], + "sampling_params_list": [ + {"temperature":0.0,"top_p":1.0,"top_k":-1,"max_tokens":2048,"seed":42,"detokenize":true,"repetition_penalty":1.1}, + {"temperature":0.9,"top_p":0.8,"top_k":40,"max_tokens":2048,"seed":42,"detokenize":true,"repetition_penalty":1.05,"stop_token_ids":[8294]}, + {"temperature":0.0,"top_p":1.0,"top_k":-1,"max_tokens":2048,"seed":42,"detokenize":true,"repetition_penalty":1.1} + ], + "messages": [ + {"role":"system","content":[{"type":"text","text":"You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech."}]}, + {"role":"user","content":[{"type":"text","text":"Tell me a short, calming bedtime lullaby story for a 6-year-old girl."}]} + ] + }' | jq -r '.choices[0].message.audio.data' | base64 -d > lullaby.wav + +echo "wrote lullaby.wav ($(stat -f%z lullaby.wav
2>/dev/null || stat -c%s lullaby.wav) bytes)" +# Cleanup: docker stop "${NAME}" && docker rm "${NAME}" diff --git a/examples/vllm-omni/sagemaker/deploy_tts.py b/examples/vllm-omni/sagemaker/deploy_tts.py new file mode 100644 index 000000000000..a701bc90548e --- /dev/null +++ b/examples/vllm-omni/sagemaker/deploy_tts.py @@ -0,0 +1,32 @@ +"""Deploy a vLLM-Omni TTS model to a real-time SageMaker endpoint.""" + +from sagemaker.model import Model +from sagemaker.predictor import Predictor +from sagemaker.serializers import JSONSerializer + +model = Model( + image_uri="763104351884.dkr.ecr.us-west-2.amazonaws.com/vllm:omni-sagemaker-cuda-v1", + role="arn:aws:iam::<account-id>:role/SageMakerExecutionRole", + env={"SM_VLLM_MODEL": "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice"}, + predictor_cls=Predictor, +) + +predictor = model.deploy( + instance_type="ml.g5.xlarge", + initial_instance_count=1, + endpoint_name="vllm-omni-tts", + inference_ami_version="al2-ami-sagemaker-inference-gpu-3-1", + serializer=JSONSerializer(), + wait=True, +) + +# Invoke — route /invocations to /v1/audio/speech via CustomAttributes +sm_runtime = predictor.sagemaker_session.sagemaker_runtime_client +response = sm_runtime.invoke_endpoint( + EndpointName=predictor.endpoint_name, + ContentType="application/json", + Body='{"input": "Hello world", "voice": "vivian", "language": "English"}', + CustomAttributes="route=/v1/audio/speech", +) +with open("speech.wav", "wb") as f: + f.write(response["Body"].read()) diff --git a/examples/vllm-omni/sagemaker/deploy_tts_async.py b/examples/vllm-omni/sagemaker/deploy_tts_async.py new file mode 100644 index 000000000000..9c793f33d5b2 --- /dev/null +++ b/examples/vllm-omni/sagemaker/deploy_tts_async.py @@ -0,0 +1,36 @@ +"""Deploy a vLLM-Omni TTS model to a SageMaker async inference endpoint. + +Async inference avoids the 60-second real-time invoke timeout, which the first +TTS request can exceed due to torch.compile warmup (~67s).
The /v1/audio/speech +endpoint returns raw WAV bytes, so the async output written to S3 is the usable +audio file — no polling or extra retrieval step needed. +""" + +from sagemaker.async_inference import AsyncInferenceConfig +from sagemaker.model import Model +from sagemaker.predictor import Predictor +from sagemaker.serializers import JSONSerializer + +model = Model( + image_uri="763104351884.dkr.ecr.us-west-2.amazonaws.com/vllm:omni-sagemaker-cuda-v1", + role="arn:aws:iam::<account-id>:role/SageMakerExecutionRole", + env={"SM_VLLM_MODEL": "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice"}, + predictor_cls=Predictor, +) + +predictor = model.deploy( + instance_type="ml.g5.xlarge", + initial_instance_count=1, + endpoint_name="vllm-omni-tts-async", + inference_ami_version="al2-ami-sagemaker-inference-gpu-3-1", + serializer=JSONSerializer(), + async_inference_config=AsyncInferenceConfig( + output_path="s3://<your-bucket>/vllm-omni-async-output/", + max_concurrent_invocations_per_instance=1, + ), + wait=True, +) + +# Invoke async — upload the JSON input to S3, then call invoke_endpoint_async. +# The resulting .out object in S3 is the raw WAV audio bytes (content-type audio/wav). +# Use CustomAttributes to route /invocations → /v1/audio/speech. diff --git a/examples/vllm-omni/tts/run.sh b/examples/vllm-omni/tts/run.sh new file mode 100755 index 000000000000..9f4f185a2139 --- /dev/null +++ b/examples/vllm-omni/tts/run.sh @@ -0,0 +1,22 @@ +#!/usr/bin/env bash +# End-to-end TTS example: start server, wait for ready, synthesize speech. +# Requires: docker (with NVIDIA runtime), curl, an authenticated ECR pull.
+set -euo pipefail + +IMAGE="${IMAGE:-763104351884.dkr.ecr.us-west-2.amazonaws.com/vllm:omni-cuda-v1}" +MODEL="${MODEL:-Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice}" +NAME="${NAME:-omni-tts}" + +docker run -d --name "${NAME}" --gpus all --shm-size=2g -p 8080:8000 \ + -v "${HOME}/hf-cache:/root/.cache/huggingface" \ + "${IMAGE}" --model "${MODEL}" + +until curl -sf http://localhost:8080/health >/dev/null; do sleep 5; done + +curl -sf -X POST http://localhost:8080/v1/audio/speech \ + -H "Content-Type: application/json" \ + -d '{"input": "Hello from vLLM-Omni.", "voice": "vivian", "language": "English"}' \ + --output speech.wav + +echo "wrote speech.wav ($(stat -f%z speech.wav 2>/dev/null || stat -c%s speech.wav) bytes)" +# Cleanup: docker stop "${NAME}" && docker rm "${NAME}" diff --git a/examples/vllm-omni/video/run.sh b/examples/vllm-omni/video/run.sh new file mode 100755 index 000000000000..36db972d82f3 --- /dev/null +++ b/examples/vllm-omni/video/run.sh @@ -0,0 +1,36 @@ +#!/usr/bin/env bash +# End-to-end video-generation example: start server, submit job, poll, download. +# /v1/videos is async — it returns a job ID; the MP4 is produced in the background. +set -euo pipefail + +IMAGE="${IMAGE:-763104351884.dkr.ecr.us-west-2.amazonaws.com/vllm:omni-cuda-v1}" +MODEL="${MODEL:-Wan-AI/Wan2.1-T2V-1.3B-Diffusers}" +NAME="${NAME:-omni-video}" + +docker run -d --name "${NAME}" --gpus all --shm-size=8g -p 8080:8000 \ + -v "${HOME}/hf-cache:/root/.cache/huggingface" \ + "${IMAGE}" --model "${MODEL}" --tensor-parallel-size 2 + +until curl -sf http://localhost:8080/health >/dev/null; do sleep 5; done + +# /v1/videos requires multipart/form-data.
+JOB_ID=$(curl -sf -X POST http://localhost:8080/v1/videos \ + -F "prompt=a dog running on a beach at sunset" \ + -F "num_frames=17" -F "num_inference_steps=30" \ + -F "size=480x320" -F "seed=42" \ + | python3 -c "import json,sys;print(json.load(sys.stdin)['id'])") + +echo "submitted job ${JOB_ID}" + +# Poll until completed (5s interval, 10 min timeout). +for _ in $(seq 1 120); do + STATUS=$(curl -sf "http://localhost:8080/v1/videos/${JOB_ID}" \ + | python3 -c "import json,sys;print(json.load(sys.stdin)['status'])") + [ "${STATUS}" = "completed" ] && break + [ "${STATUS}" = "failed" ] && { echo "job failed"; exit 1; } + sleep 5 +done +[ "${STATUS}" = "completed" ] || { echo "timed out waiting for job ${JOB_ID}" >&2; exit 1; } +curl -sf "http://localhost:8080/v1/videos/${JOB_ID}/content" --output video.mp4 +echo "wrote video.mp4 ($(stat -f%z video.mp4 2>/dev/null || stat -c%s video.mp4) bytes)" +# Cleanup: docker stop "${NAME}" && docker rm "${NAME}"