diff --git a/docs/.nav.yml b/docs/.nav.yml
index a94963d53480..4c2a53cdf728 100644
--- a/docs/.nav.yml
+++ b/docs/.nav.yml
@@ -5,12 +5,14 @@ nav:
     - Using Deep Learning Containers: get_started/using_dlcs.md
     - Release Notifications: get_started/release_notifications.md
     - Ray: ray/index.md
+    - vLLM-Omni: vllm-omni/index.md
   - Release Notes:
     - releasenotes/index.md
     - Base: releasenotes/base/index.md
     - Ray: releasenotes/ray/index.md
     - SGLang: releasenotes/sglang/index.md
     - vLLM: releasenotes/vllm/index.md
+    - vLLM-Omni: releasenotes/vllm-omni/index.md
     - PyTorch: releasenotes/pytorch/index.md
     - Tensorflow: releasenotes/tensorflow/index.md
   - Tutorials: tutorials
diff --git a/docs/src/data/vllm-omni/0.18.0-gpu-ec2.yml b/docs/src/data/vllm-omni/0.18.0-gpu-ec2.yml
new file mode 100644
index 000000000000..a6bc7ec8b859
--- /dev/null
+++ b/docs/src/data/vllm-omni/0.18.0-gpu-ec2.yml
@@ -0,0 +1,27 @@
+framework: vLLM-Omni
+version: "0.18.0"
+ecr_repository: vllm
+accelerator: gpu
+python: py312
+cuda: cu129
+os: amzn2023
+platform: default
+public_registry: true
+
+tags:
+  - "omni-cuda-v1"
+
+announcements:
+  - "Initial release of vLLM-Omni containers for EC2, ECS, EKS"
+  - "Serves omni-modality models: TTS, image generation, video generation, multimodal chat"
+  - "Built on Amazon Linux 2023 with Python 3.12 and CUDA 12.9"
+
+packages:
+  vllm: "0.18.0"
+  vllm-omni: "0.18.0"
+  pytorch: "2.10.0"
+  torchvision: "0.25.0"
+  torchaudio: "2.10.0"
+  cuda: "12.9.1"
+  flashinfer: "0.6.6"
+  efa: "1.47.0"
diff --git a/docs/src/data/vllm-omni/0.18.0-gpu-sagemaker.yml b/docs/src/data/vllm-omni/0.18.0-gpu-sagemaker.yml
new file mode 100644
index 000000000000..bb61f8a78299
--- /dev/null
+++ b/docs/src/data/vllm-omni/0.18.0-gpu-sagemaker.yml
@@ -0,0 +1,27 @@
+framework: vLLM-Omni
+version: "0.18.0"
+ecr_repository: vllm
+accelerator: gpu
+python: py312
+cuda: cu129
+os: amzn2023
+platform: sagemaker
+public_registry: true
+
+tags:
+  - "omni-sagemaker-cuda-v1"
+
+announcements:
+  - "Initial release of vLLM-Omni containers for SageMaker"
+  - "Includes ASGI routing middleware for /invocations dispatch via CustomAttributes"
+  - "Built on Amazon Linux 2023 with Python 3.12 and CUDA 12.9"
+
+packages:
+  vllm: "0.18.0"
+  vllm-omni: "0.18.0"
+  pytorch: "2.10.0"
+  torchvision: "0.25.0"
+  torchaudio: "2.10.0"
+  cuda: "12.9.1"
+  flashinfer: "0.6.6"
+  efa: "1.47.0"
diff --git a/docs/src/generate.py b/docs/src/generate.py
index 6189cbc5b926..43dca381d322 100644
--- a/docs/src/generate.py
+++ b/docs/src/generate.py
@@ -376,9 +376,12 @@ def generate_available_images(dry_run: bool = False) -> str:
 
         section = f"{AVAILABLE_IMAGES_TABLE_HEADER} {display_name}\n"
         if has_public_registry:
-            url = f"{PUBLIC_GALLERY_URL}/{repository}"
+            # Use ecr_repository from images (falls back to data-dir key when unset) so display
+            # reflects the actual ECR repo when the data-dir key differs (e.g., vllm-omni -> vllm).
+            ecr_repo = images[0].ecr_repository if images else repository
+            url = f"{PUBLIC_GALLERY_URL}/{ecr_repo}"
             section += (
-                f"\nThese images are also available in ECR Public Gallery: [{repository}]({url})\n"
+                f"\nThese images are also available in ECR Public Gallery: [{ecr_repo}]({url})\n"
             )
         if table_config.get("note"):
             section += f"\n{table_config['note']}\n"
diff --git a/docs/src/global.yml b/docs/src/global.yml
index 4f5133c38911..e76cde854d3d 100644
--- a/docs/src/global.yml
+++ b/docs/src/global.yml
@@ -68,6 +68,7 @@ display_names:
   sglang: "SGLang"
   vllm: "vLLM"
   vllm-arm64: "vLLM ARM64"
+  vllm-omni: "vLLM-Omni"
   pytorch-training: "PyTorch Training"
   pytorch-training-arm64: "PyTorch Training ARM64"
   pytorch-inference: "PyTorch Inference"
@@ -99,6 +100,11 @@ display_names:
   known_issues: "Known Issues"
 
   # Packages
+  # Package keys use the same string as the YAML `packages:` field (underscored
+  # where applicable), which is distinct from repository keys in the section
+  # above (hyphenated, matching the data-dir name). NOTE(review): the vllm-omni
+  # data files list this package as hyphenated `vllm-omni`, not `vllm_omni`;
+  # confirm which key the release notes package tables are expected to look up.
   python: "Python"
   cuda: "CUDA"
   cudnn: "cuDNN"
@@ -167,6 +173,7 @@ table_order:
   - sglang
   - vllm
   - vllm-arm64
+  - vllm-omni
   - pytorch-training
   - pytorch-inference
   - pytorch-training-arm64
diff --git a/docs/src/image_config.py b/docs/src/image_config.py
index f5c02e52837a..cc36a15a4572 100644
--- a/docs/src/image_config.py
+++ b/docs/src/image_config.py
@@ -45,6 +45,13 @@ def repository(self) -> str:
         """Repository name for this image."""
         return self._repository
 
+    @property
+    def ecr_repository(self) -> str:
+        """ECR repository name for image URIs. Defaults to repository, but can be overridden
+        via the optional 'ecr_repository' YAML field when the data-directory key differs from
+        the actual ECR repo name (e.g., data dir 'vllm-omni' -> ECR repo 'vllm')."""
+        return self._data.get("ecr_repository") or self._repository
+
     @property
     def framework_group(self) -> str:
         """Framework group key (or repository if not in a group)."""
@@ -91,11 +98,11 @@ def get_image_uris(self) -> list[str]:
 
         uris = []
         for tag in tags:
-            uris.append(build_ecr_uri(account, self._repository, tag, region))
+            uris.append(build_ecr_uri(account, self.ecr_repository, tag, region))
 
         if self.get("public_registry"):
             for tag in tags:
-                uris.append(build_public_ecr_uri(self._repository, tag))
+                uris.append(build_public_ecr_uri(self.ecr_repository, tag))
 
         return uris
 
@@ -126,7 +133,7 @@ def display_framework_version(self) -> str:
     def display_example_url(self) -> str:
         """Example ECR URL for table display."""
         account = self.get("example_ecr_account", GLOBAL_CONFIG["example_ecr_account"])
-        return f"`{build_ecr_uri(account, self._repository, self.display_tag)}`"
+        return f"`{build_ecr_uri(account, self.ecr_repository, self.display_tag)}`"
 
     @property
     def display_platform(self) -> str:
@@ -277,4 +284,4 @@ def get_latest_image_uri(repo: str, platform: str) -> str:
 
     latest = sort_by_version(matching)[0]
     account = latest.get("example_ecr_account", GLOBAL_CONFIG["example_ecr_account"])
-    return build_ecr_uri(account, repo, latest.display_tag, "us-west-2")
+    return build_ecr_uri(account, latest.ecr_repository, latest.display_tag, "us-west-2")
diff --git a/docs/src/macros.py b/docs/src/macros.py
index 8eab4e930138..cf5ce81be274 100644
--- a/docs/src/macros.py
+++ b/docs/src/macros.py
@@ -42,4 +42,6 @@ def define_env(env):
         "latest_ray_default_cpu": _get_latest_ray_uri("default", "cpu"),
         "latest_ray_sagemaker_gpu": _get_latest_ray_uri("sagemaker", "gpu"),
         "latest_ray_sagemaker_cpu": _get_latest_ray_uri("sagemaker", "cpu"),
+        "latest_vllm_omni_ec2": get_latest_image_uri("vllm-omni", "default"),
+        "latest_vllm_omni_sagemaker": get_latest_image_uri("vllm-omni", "sagemaker"),
     }
diff --git a/docs/src/tables/vllm-omni.yml b/docs/src/tables/vllm-omni.yml
new file mode 100644
index 000000000000..4c4ffa203fa2
--- /dev/null
+++ b/docs/src/tables/vllm-omni.yml
@@ -0,0 +1,14 @@
+# Table Configuration - vLLM-Omni
+columns:
+  - field: framework_version
+    header: "Framework"
+  - field: python
+    header: "Python"
+  - field: cuda
+    header: "CUDA"
+  - field: accelerator
+    header: "Accelerator"
+  - field: platform
+    header: "Platform"
+  - field: example_url
+    header: "Example URL"
diff --git a/docs/vllm-omni/index.md b/docs/vllm-omni/index.md
new file mode 100644
index 000000000000..ba7e5ffb4689
--- /dev/null
+++ b/docs/vllm-omni/index.md
@@ -0,0 +1,198 @@
+# vLLM-Omni Inference
+
+Pre-built Docker images for serving omni-modality models (text-to-speech, image generation, video generation, and multimodal chat) with
+[vLLM-Omni](https://github.com/vllm-project/vllm-omni). Built on Amazon Linux 2023 with CUDA 12.9 and Python 3.12.
+
+## Latest Announcements
+
+**April 24, 2026** — vLLM-Omni 0.18.0 initial release. Serves TTS, image, video, and omni-chat models through OpenAI-compatible APIs. Includes a
+SageMaker routing middleware for dispatching `/invocations` to any omni endpoint via `CustomAttributes`.
+
+## Pull Commands
+
+**EC2:**
+
+```bash
+docker pull {{ images.latest_vllm_omni_ec2 }}
+```
+
+**SageMaker:**
+
+```bash
+docker pull {{ images.latest_vllm_omni_sagemaker }}
+```
+
+See [Available Images](../reference/available_images.md) for all image URIs and [Getting Started](../get_started/index.md) for authentication
+instructions.
+
+## Packages
+
+For package versions included in each release, see the [Release Notes](../releasenotes/vllm-omni/index.md).
+
+## Supported Modalities
+
+| Modality | Route | Example Model |
+| --- | --- | --- |
+| Text-to-Speech | `/v1/audio/speech` | `Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice` |
+| Image Generation | `/v1/images/generations` | `black-forest-labs/FLUX.2-klein-4B` |
+| Video Generation | `/v1/videos` | `Wan-AI/Wan2.1-T2V-1.3B-Diffusers` |
+| Multimodal Chat | `/v1/chat/completions` | `bytedance-research/BAGEL-7B-MoT`, `Qwen/Qwen2.5-Omni-3B` |
+
+## Model Compatibility
+
+- Models must have a standard HuggingFace `config.json` with a recognized `model_type`, or be diffusers pipeline models with `model_index.json`.
+- Some HuggingFace repos ship a `config.json` without a `model_type` field; vllm-omni's config resolver will reject these. Patching the local snapshot
+  with a minimal `config.json` (`{"model_type": "...", "architectures": ["..."]}`) is a common workaround, but the container's pinned `transformers`
+  version must also register the model type — models newer than that pin will fail at engine startup. Upgrading `transformers` in-place risks breaking
+  the supported models; wait for a future vllm-omni release with an updated pin.
+- Multi-stage omni models (thinker + talker + decoder) like Qwen2.5-Omni need significantly more VRAM than the model size suggests. Refer to the
+  individual model cards for minimum GPU requirements.
+
+## EC2 Deployment
+
+The container runs `vllm serve --omni` and exposes the OpenAI-compatible API on port 8080. Each example below is a self-contained shell script that
+starts the container, waits for readiness, submits a request, and writes the output to disk. Any `vllm serve` flag may be appended to `docker run`
+(e.g., `--tensor-parallel-size 2`, `--max-model-len 2048`, `--enforce-eager`).
+
+### Text-to-Speech
+
+**Model:** [Qwen3-TTS-12Hz-1.7B-CustomVoice](https://huggingface.co/Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice) — a 1.7B-parameter Qwen3 text-to-speech
+model supporting multiple voices and languages, runs on a single 24 GB GPU (A10G / L4).
+
+```bash
+--8<-- "examples/vllm-omni/tts/run.sh"
+```
+
+### Image Generation
+
+**Model:** [FLUX.2-klein-4B](https://huggingface.co/black-forest-labs/FLUX.2-klein-4B) — a 4B-parameter rectified-flow transformer from Black Forest
+Labs, produces high-quality 512×512 images from text prompts, runs on a single 24 GB GPU.
+
+```bash
+--8<-- "examples/vllm-omni/image/run.sh"
+```
+
+### Video Generation
+
+**Model:** [Wan2.1-T2V-1.3B](https://huggingface.co/Wan-AI/Wan2.1-T2V-1.3B-Diffusers) — a 1.3B-parameter text-to-video diffusion model from the Wan
+team, generates short clips at up to 480×832 resolution. Needs a 48 GB GPU (L40S) or 2× 24 GB GPUs with `--tensor-parallel-size 2`.
+
+The `/v1/videos` endpoint is asynchronous — it returns a job ID immediately and generates the video in the background. The script below submits the
+job, polls until it completes, then downloads the MP4.
+
+```bash
+--8<-- "examples/vllm-omni/video/run.sh"
+```
+
+### Multimodal Chat
+
+Use the standard OpenAI chat-completions API. Multimodal inputs (images, audio) are supplied as URL or base64 content parts in the message list.
+
+**Example model:** [Qwen2.5-Omni-3B](https://huggingface.co/Qwen/Qwen2.5-Omni-3B) — a 3B-parameter omni model accepting text, image, and audio inputs
+and generating text or speech outputs. Multi-stage architecture (thinker + talker + code2wav) requires **≥ 4 GPUs**: `g5.12xlarge` / `g6.12xlarge` (4×
+A10G) or `g6e.12xlarge` (4× L40S).
+
+Start the server, then submit a request. Three things are **required** on `/v1/chat/completions` to produce clean audio from Qwen2.5-Omni:
+
+1. `"modalities": ["audio"]` — not `["text","audio"]` (that returns empty audio).
+2. `"sampling_params_list"` — a 3-element list (thinker, talker, code2wav). The image's built-in per-stage defaults produce noise; use the values from
+   the official Qwen docs.
+3. The exact Qwen system prompt.
+
+!!! warning "Omitting `sampling_params_list` returns 200 with valid WAV bytes that sound like noise — the single most common footgun."
+
+```bash
+--8<-- "examples/vllm-omni/qwen2.5-omni/run.sh"
+```
+
+The `/v1/audio/speech` shortcut (voices: `Chelsie`, `Ethan`) bypasses the thinker and does not apply the correct sampling params in 0.18.0, so it
+produces noisy output for Qwen2.5-Omni. Prefer `/v1/chat/completions` for this model.
+
+## SageMaker Deployment
+
+### Prerequisites
+
+- AWS CLI configured with appropriate permissions
+- An IAM execution role with SageMaker and ECR permissions (see [Ray tutorial](../ray/index.md#prerequisites) for an example setup)
+- SageMaker Python SDK v2:
+
+```bash
+pip install 'sagemaker>=2,<3'
+```
+
+### Routing Middleware
+
+The SageMaker image includes an ASGI middleware that dispatches `/invocations` to the correct vllm-omni endpoint based on the `CustomAttributes`
+header:
+
+| `CustomAttributes` | Dispatched to |
+| --- | --- |
+| `route=/v1/audio/speech` | TTS |
+| `route=/v1/images/generations` | Image generation |
+| `route=/v1/videos` | Video generation (JSON auto-converted to form-data) — returns job-ID only in 0.18.0, MP4 not retrievable via SageMaker |
+| `route=/v1/chat/completions` | Multimodal chat |
+| *(no route)* | vLLM default `/invocations` (chat/completion/embed) |
+
+### Environment Variables
+
+Any `SM_VLLM_*` env var is converted to a `--<name>` CLI argument (e.g., `SM_VLLM_MAX_MODEL_LEN=2048` → `--max-model-len 2048`).
+
+| Variable | Description | Example |
+| --- | --- | --- |
+| `SM_VLLM_MODEL` | Model ID (HuggingFace or local path) | `Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice` |
+| `SM_VLLM_MAX_MODEL_LEN` | Max sequence length | `2048` |
+| `SM_VLLM_ENFORCE_EAGER` | Disable CUDA graphs | `true` |
+| `SM_VLLM_TENSOR_PARALLEL_SIZE` | Number of GPUs for TP | `2` |
+| `HF_TOKEN` | HuggingFace token for gated models | `hf_...` |
+
+### Deploy a TTS Endpoint
+
+!!! warning "SageMaker endpoint deployment takes several minutes and incurs costs. Remember to delete endpoints when done."
+
+```python
+--8<-- "examples/vllm-omni/sagemaker/deploy_tts.py"
+```
+
+GPU deploys require `inference_ami_version` — the default SageMaker host AMI has incompatible NVIDIA drivers for CUDA 12.9 images. See
+[ProductionVariant API reference](https://docs.aws.amazon.com/sagemaker/latest/APIReference/API_ProductionVariant.html) for valid values.
+
+When done, delete the endpoint:
+
+```python
+predictor.delete_endpoint()
+```
+
+### Async Inference for Long-Running TTS Generation
+
+SageMaker real-time inference has a 60-second timeout. First requests to TTS models may exceed this due to `torch.compile` warmup (~67s); async
+inference avoids the limit, as does retrying after warmup completes.
+
+!!! warning "Video generation is not supported on SageMaker in 0.18.0 — see [Known Limitations](#known-limitations) below. Use EC2 for video."
+
+```python
+--8<-- "examples/vllm-omni/sagemaker/deploy_tts_async.py"
+```
+
+For async inference, upload the JSON input payload to S3 first, then call `invoke_endpoint_async` with `InputLocation=<s3-uri>` and
+`CustomAttributes="route=/v1/audio/speech"`. The resulting `.out` object in the configured S3 output path is the raw WAV audio — no polling or
+additional retrieval step required.
+
+## Known Limitations
+
+- **Video generation is not supported on SageMaker in 0.18.0.** The `/v1/videos` endpoint is async by design — it returns a job-ID JSON immediately
+  and generates the MP4 in the background. Through SageMaker async inference, only that job-ID JSON is written to S3; the MP4 itself never lands in S3
+  and cannot be retrieved through `invoke_endpoint` or `invoke_endpoint_async`. Use EC2 for video generation — direct container access supports the
+  full workflow (create job, poll status, download MP4). SageMaker support is expected once `POST /v1/videos/sync` (which blocks and returns raw MP4
+  bytes) is available in a future vllm-omni release.
+- **First-request latency on SageMaker real-time endpoints.** TTS models can exceed the 60s invoke timeout on the first request due to `torch.compile`
+  warmup. Use async inference or retry after warmup.
+
+## Release Notes
+
+See [vLLM-Omni Release Notes](../releasenotes/vllm-omni/index.md) for version history and changelogs.
+
+## Resources
+
+- [vLLM-Omni Documentation](https://github.com/vllm-project/vllm-omni)
+- [GitHub Repository](https://github.com/aws/deep-learning-containers)
+- [Available Images](../reference/available_images.md)
diff --git a/examples/vllm-omni/image/run.sh b/examples/vllm-omni/image/run.sh
new file mode 100755
index 000000000000..d6dc1615ad95
--- /dev/null
+++ b/examples/vllm-omni/image/run.sh
@@ -0,0 +1,22 @@
+#!/usr/bin/env bash
+# End-to-end image-generation example: start server, wait for ready, generate.
+set -euo pipefail
+
+IMAGE="${IMAGE:-763104351884.dkr.ecr.us-west-2.amazonaws.com/vllm:omni-cuda-v1}"
+MODEL="${MODEL:-black-forest-labs/FLUX.2-klein-4B}"
+NAME="${NAME:-omni-image}"
+
+docker run -d --name "${NAME}" --gpus all --shm-size=2g -p 8080:8000 \
+  -v "${HOME}/hf-cache:/root/.cache/huggingface" \
+  "${IMAGE}" --model "${MODEL}"
+
+until curl -sf http://localhost:8080/health >/dev/null; do sleep 5; done
+
+# Response JSON has data[0].b64_json — decode to PNG.
+curl -sf -X POST http://localhost:8080/v1/images/generations \
+  -H "Content-Type: application/json" \
+  -d '{"prompt": "a red apple on a white table, studio lighting", "size": "512x512", "n": 1}' \
+  | python3 -c "import base64,json,sys;open('image.png','wb').write(base64.b64decode(json.load(sys.stdin)['data'][0]['b64_json']))"
+
+echo "wrote image.png ($(stat -f%z image.png 2>/dev/null || stat -c%s image.png) bytes)"
+# Cleanup:  docker stop "${NAME}" && docker rm "${NAME}"
diff --git a/examples/vllm-omni/qwen2.5-omni/run.sh b/examples/vllm-omni/qwen2.5-omni/run.sh
new file mode 100755
index 000000000000..a04624bdf99d
--- /dev/null
+++ b/examples/vllm-omni/qwen2.5-omni/run.sh
@@ -0,0 +1,46 @@
+#!/usr/bin/env bash
+# End-to-end Qwen2.5-Omni-3B example: start server, wait for ready,
+# generate speech via /v1/chat/completions.
+# Requires: docker (with NVIDIA runtime), curl, jq, base64.
+# REQUIRES ≥ 4 GPUs (e.g., g5.12xlarge / g6.12xlarge / g6e.12xlarge).
+# On single-GPU hosts the model's talker stage fails to load on GPU 1.
+set -euo pipefail
+
+IMAGE="${IMAGE:-763104351884.dkr.ecr.us-west-2.amazonaws.com/vllm:omni-cuda-v1}"
+MODEL="${MODEL:-Qwen/Qwen2.5-Omni-3B}"
+NAME="${NAME:-omni3b}"
+
+docker run -d --name "${NAME}" --gpus all --shm-size=16g -p 8080:8080 \
+  -v "${HOME}/hf-cache:/root/.cache/huggingface" \
+  -e HF_HUB_ENABLE_HF_TRANSFER=1 \
+  "${IMAGE}" --model "${MODEL}" \
+  --host 0.0.0.0 --port 8080 \
+  --max-model-len 16384 --dtype bfloat16
+
+# First start takes ~8 min (weight download + 3-stage load).
+until curl -sf http://localhost:8080/health >/dev/null; do sleep 10; done
+
+# Three things are REQUIRED for clean audio:
+#   1. "modalities": ["audio"]  (NOT ["text","audio"] — returns empty audio)
+#   2. "sampling_params_list"   (3-element list: thinker, talker, code2wav;
+#                                built-in defaults produce noise)
+#   3. The exact Qwen system prompt below.
+# Omitting #2 returns 200 OK with valid WAV bytes that sound like noise.
+curl -sf -X POST http://localhost:8080/v1/chat/completions \
+  -H 'Content-Type: application/json' \
+  -d '{
+    "model": "Qwen/Qwen2.5-Omni-3B",
+    "modalities": ["audio"],
+    "sampling_params_list": [
+      {"temperature":0.0,"top_p":1.0,"top_k":-1,"max_tokens":2048,"seed":42,"detokenize":true,"repetition_penalty":1.1},
+      {"temperature":0.9,"top_p":0.8,"top_k":40,"max_tokens":2048,"seed":42,"detokenize":true,"repetition_penalty":1.05,"stop_token_ids":[8294]},
+      {"temperature":0.0,"top_p":1.0,"top_k":-1,"max_tokens":2048,"seed":42,"detokenize":true,"repetition_penalty":1.1}
+    ],
+    "messages": [
+      {"role":"system","content":[{"type":"text","text":"You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech."}]},
+      {"role":"user","content":[{"type":"text","text":"Tell me a short, calming bedtime lullaby story for a 6-year-old girl."}]}
+    ]
+  }' | jq -r '.choices[0].message.audio.data' | base64 -d > lullaby.wav
+
+echo "wrote lullaby.wav ($(stat -f%z lullaby.wav 2>/dev/null || stat -c%s lullaby.wav) bytes)"
+# Cleanup:  docker stop "${NAME}" && docker rm "${NAME}"
diff --git a/examples/vllm-omni/sagemaker/deploy_tts.py b/examples/vllm-omni/sagemaker/deploy_tts.py
new file mode 100644
index 000000000000..a701bc90548e
--- /dev/null
+++ b/examples/vllm-omni/sagemaker/deploy_tts.py
@@ -0,0 +1,32 @@
+"""Deploy a vLLM-Omni TTS model to a real-time SageMaker endpoint."""
+
+from sagemaker.model import Model
+from sagemaker.predictor import Predictor
+from sagemaker.serializers import JSONSerializer
+
+model = Model(
+    image_uri="763104351884.dkr.ecr.us-west-2.amazonaws.com/vllm:omni-sagemaker-cuda-v1",
+    role="arn:aws:iam::<ACCOUNT>:role/SageMakerExecutionRole",
+    env={"SM_VLLM_MODEL": "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice"},
+    predictor_cls=Predictor,
+)
+
+predictor = model.deploy(
+    instance_type="ml.g5.xlarge",
+    initial_instance_count=1,
+    endpoint_name="vllm-omni-tts",
+    inference_ami_version="al2-ami-sagemaker-inference-gpu-3-1",
+    serializer=JSONSerializer(),
+    wait=True,
+)
+
+# Invoke — route /invocations to /v1/audio/speech via CustomAttributes
+sm_runtime = predictor.sagemaker_session.sagemaker_runtime_client
+response = sm_runtime.invoke_endpoint(
+    EndpointName=predictor.endpoint_name,
+    ContentType="application/json",
+    Body='{"input": "Hello world", "voice": "vivian", "language": "English"}',
+    CustomAttributes="route=/v1/audio/speech",
+)
+with open("speech.wav", "wb") as f:
+    f.write(response["Body"].read())
diff --git a/examples/vllm-omni/sagemaker/deploy_tts_async.py b/examples/vllm-omni/sagemaker/deploy_tts_async.py
new file mode 100644
index 000000000000..9c793f33d5b2
--- /dev/null
+++ b/examples/vllm-omni/sagemaker/deploy_tts_async.py
@@ -0,0 +1,36 @@
+"""Deploy a vLLM-Omni TTS model to a SageMaker async inference endpoint.
+
+Async inference avoids the 60-second real-time invoke timeout, which the first
+TTS request can exceed due to torch.compile warmup (~67s). The /v1/audio/speech
+endpoint returns raw WAV bytes, so the async output written to S3 is the usable
+audio file — no polling or extra retrieval step needed.
+"""
+
+from sagemaker.async_inference import AsyncInferenceConfig
+from sagemaker.model import Model
+from sagemaker.predictor import Predictor
+from sagemaker.serializers import JSONSerializer
+
+model = Model(
+    image_uri="763104351884.dkr.ecr.us-west-2.amazonaws.com/vllm:omni-sagemaker-cuda-v1",
+    role="arn:aws:iam::<ACCOUNT>:role/SageMakerExecutionRole",
+    env={"SM_VLLM_MODEL": "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice"},
+    predictor_cls=Predictor,
+)
+
+predictor = model.deploy(
+    instance_type="ml.g5.xlarge",
+    initial_instance_count=1,
+    endpoint_name="vllm-omni-tts-async",
+    inference_ami_version="al2-ami-sagemaker-inference-gpu-3-1",
+    serializer=JSONSerializer(),
+    async_inference_config=AsyncInferenceConfig(
+        output_path="s3://<BUCKET>/vllm-omni-async-output/",
+        max_concurrent_invocations_per_instance=1,
+    ),
+    wait=True,
+)
+
+# Invoke async — upload the JSON input to S3, then call invoke_endpoint_async.
+# The resulting .out object in S3 is the raw WAV audio bytes (content-type audio/wav).
+# Use CustomAttributes to route /invocations → /v1/audio/speech.
diff --git a/examples/vllm-omni/tts/run.sh b/examples/vllm-omni/tts/run.sh
new file mode 100755
index 000000000000..9f4f185a2139
--- /dev/null
+++ b/examples/vllm-omni/tts/run.sh
@@ -0,0 +1,22 @@
+#!/usr/bin/env bash
+# End-to-end TTS example: start server, wait for ready, synthesize speech.
+# Requires: docker (with NVIDIA runtime), curl, an authenticated ECR pull.
+set -euo pipefail
+
+IMAGE="${IMAGE:-763104351884.dkr.ecr.us-west-2.amazonaws.com/vllm:omni-cuda-v1}"
+MODEL="${MODEL:-Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice}"
+NAME="${NAME:-omni-tts}"
+
+docker run -d --name "${NAME}" --gpus all --shm-size=2g -p 8080:8000 \
+  -v "${HOME}/hf-cache:/root/.cache/huggingface" \
+  "${IMAGE}" --model "${MODEL}"
+
+until curl -sf http://localhost:8080/health >/dev/null; do sleep 5; done
+
+curl -sf -X POST http://localhost:8080/v1/audio/speech \
+  -H "Content-Type: application/json" \
+  -d '{"input": "Hello from vLLM-Omni.", "voice": "vivian", "language": "English"}' \
+  --output speech.wav
+
+echo "wrote speech.wav ($(stat -f%z speech.wav 2>/dev/null || stat -c%s speech.wav) bytes)"
+# Cleanup:  docker stop "${NAME}" && docker rm "${NAME}"
diff --git a/examples/vllm-omni/video/run.sh b/examples/vllm-omni/video/run.sh
new file mode 100755
index 000000000000..36db972d82f3
--- /dev/null
+++ b/examples/vllm-omni/video/run.sh
@@ -0,0 +1,36 @@
+#!/usr/bin/env bash
+# End-to-end video-generation example: start server, submit job, poll, download.
+# /v1/videos is async — it returns a job ID; the MP4 is produced in the background.
+set -euo pipefail
+
+IMAGE="${IMAGE:-763104351884.dkr.ecr.us-west-2.amazonaws.com/vllm:omni-cuda-v1}"
+MODEL="${MODEL:-Wan-AI/Wan2.1-T2V-1.3B-Diffusers}"
+NAME="${NAME:-omni-video}"
+
+docker run -d --name "${NAME}" --gpus all --shm-size=8g -p 8080:8000 \
+  -v "${HOME}/hf-cache:/root/.cache/huggingface" \
+  "${IMAGE}" --model "${MODEL}" --tensor-parallel-size 2
+
+until curl -sf http://localhost:8080/health >/dev/null; do sleep 5; done
+
+# /v1/videos requires multipart/form-data.
+JOB_ID=$(curl -sf -X POST http://localhost:8080/v1/videos \
+  -F "prompt=a dog running on a beach at sunset" \
+  -F "num_frames=17" -F "num_inference_steps=30" \
+  -F "size=480x320" -F "seed=42" \
+  | python3 -c "import json,sys;print(json.load(sys.stdin)['id'])")
+
+echo "submitted job ${JOB_ID}"
+
+# Poll until completed (5s interval, 10 min timeout); abort on failure or timeout.
+for _ in $(seq 1 120); do
+  STATUS=$(curl -sf "http://localhost:8080/v1/videos/${JOB_ID}" \
+    | python3 -c "import json,sys;print(json.load(sys.stdin)['status'])")
+  [ "${STATUS}" = "completed" ] && break
+  [ "${STATUS}" = "failed" ] && { echo "job failed" >&2; exit 1; }
+  sleep 5
+done
+[ "${STATUS}" = "completed" ] || { echo "timed out waiting for job ${JOB_ID} (last status: ${STATUS})" >&2; exit 1; }
+curl -sf "http://localhost:8080/v1/videos/${JOB_ID}/content" --output video.mp4
+echo "wrote video.mp4 ($(stat -f%z video.mp4 2>/dev/null || stat -c%s video.mp4) bytes)"
+# Cleanup:  docker stop "${NAME}" && docker rm "${NAME}"