From 1cc40592cd24f8b65b5cb569a1cc3f7beb4622c3 Mon Sep 17 00:00:00 2001
From: Maciej Bala <mbala@nvidia.com>
Date: Thu, 14 May 2026 13:20:41 +0200
Subject: [PATCH 01/41] Add Cosmos3 model

Signed-off-by: Maciej Bala <mbala@nvidia.com>
---
 docs/models/supported_models.md               |    1 +
 .../diffusion/cache_acceleration/cache_dit.md |   21 +
 .../diffusion/cpu_offload_diffusion.md        |    2 +
 .../diffusion/parallelism/cfg_parallel.md     |    9 +
 docs/user_guide/diffusion_features.md         |    6 +
 .../offline_inference/image_to_video.md       |   42 +-
 .../offline_inference/text_to_image.md        |   27 +
 .../offline_inference/text_to_video.md        |   25 +
 .../examples/online_serving/image_to_video.md |   76 +-
 .../examples/online_serving/text_to_image.md  |   36 +
 .../examples/online_serving/text_to_video.md  |  109 +-
 .../image_to_video/README.md                  |   42 +-
 .../image_to_video/image_to_video.py          |   70 +-
 .../offline_inference/text_to_image/README.md |   27 +-
 .../text_to_image/text_to_image.py            |    3 +-
 .../text_to_video/text_to_video.md            |   24 +
 .../text_to_video/text_to_video.py            |   29 +-
 .../online_serving/image_to_video/README.md   |   73 +-
 .../online_serving/text_to_image/README.md    |   36 +
 .../online_serving/text_to_video/README.md    |   65 +-
 tests/diffusion/cache/test_cache_dit.py       |   19 +
 tests/diffusion/models/cosmos3/__init__.py    |    2 +
 tests/diffusion/models/cosmos3/conftest.py    |  191 ++
 .../models/cosmos3/test_cosmos3_pipeline.py   | 1108 ++++++++++
 .../cosmos3/test_cosmos3_transformer.py       |  577 +++++
 tests/diffusion/test_diffusion_ipc.py         |   25 +
 tests/e2e/accuracy/test_cosmos3_similarity.py |  155 ++
 .../openai_api/test_image_server.py           |    1 +
 .../openai_api/test_video_server.py           |  116 ++
 .../diffusion/attention/backends/sdpa.py      |    3 +
 .../diffusion/cache/cache_dit_backend.py      |   71 +
 vllm_omni/diffusion/diffusion_engine.py       |   25 +-
 vllm_omni/diffusion/ipc.py                    |   14 +-
 .../diffusion/models/cosmos3/__init__.py      |   16 +
 vllm_omni/diffusion/models/cosmos3/action.py  |  217 ++
 .../cosmos3/audio_tokenizer/__init__.py       |    6 +
 .../cosmos3/audio_tokenizer/activations.py    |  147 ++
 .../alias_free_torch/__init__.py              |   16 +
 .../audio_tokenizer/alias_free_torch/act.py   |   32 +
 .../alias_free_torch/filter.py                |   95 +
 .../alias_free_torch/resample.py              |   48 +
 .../models/cosmos3/audio_tokenizer/avae.py    |  271 +++
 .../cosmos3/audio_tokenizer/bottlenecks.py    |  133 ++
 .../models/cosmos3/audio_tokenizer/config.py  |   20 +
 .../models/cosmos3/audio_tokenizer/models.py  |  614 ++++++
 .../models/cosmos3/audio_tokenizer/modules.py |  418 ++++
 .../audio_tokenizer/modules_encodec.py        |  297 +++
 .../diffusion/models/cosmos3/guardrails.py    |  430 ++++
 .../models/cosmos3/pipeline_cosmos3.py        | 1848 +++++++++++++++++
 .../models/cosmos3/sound_tokenizer.py         |  232 +++
 .../models/cosmos3/transformer_cosmos3.py     | 1586 ++++++++++++++
 vllm_omni/diffusion/registry.py               |    7 +
 vllm_omni/engine/async_omni_engine.py         |    1 +
 vllm_omni/entrypoints/openai/api_server.py    |   13 +-
 .../entrypoints/openai/protocol/__init__.py   |    2 +
 .../entrypoints/openai/protocol/videos.py     |   22 +
 vllm_omni/entrypoints/openai/serving_chat.py  |    2 +
 vllm_omni/entrypoints/openai/serving_video.py |  139 +-
 vllm_omni/inputs/data.py                      |    1 +
 59 files changed, 9561 insertions(+), 82 deletions(-)
 create mode 100644 tests/diffusion/models/cosmos3/__init__.py
 create mode 100644 tests/diffusion/models/cosmos3/conftest.py
 create mode 100644 tests/diffusion/models/cosmos3/test_cosmos3_pipeline.py
 create mode 100644 tests/diffusion/models/cosmos3/test_cosmos3_transformer.py
 create mode 100644 tests/diffusion/test_diffusion_ipc.py
 create mode 100644 tests/e2e/accuracy/test_cosmos3_similarity.py
 create mode 100644 vllm_omni/diffusion/models/cosmos3/__init__.py
 create mode 100644 vllm_omni/diffusion/models/cosmos3/action.py
 create mode 100644 vllm_omni/diffusion/models/cosmos3/audio_tokenizer/__init__.py
 create mode 100755 vllm_omni/diffusion/models/cosmos3/audio_tokenizer/activations.py
 create mode 100755 vllm_omni/diffusion/models/cosmos3/audio_tokenizer/alias_free_torch/__init__.py
 create mode 100755 vllm_omni/diffusion/models/cosmos3/audio_tokenizer/alias_free_torch/act.py
 create mode 100755 vllm_omni/diffusion/models/cosmos3/audio_tokenizer/alias_free_torch/filter.py
 create mode 100755 vllm_omni/diffusion/models/cosmos3/audio_tokenizer/alias_free_torch/resample.py
 create mode 100644 vllm_omni/diffusion/models/cosmos3/audio_tokenizer/avae.py
 create mode 100755 vllm_omni/diffusion/models/cosmos3/audio_tokenizer/bottlenecks.py
 create mode 100644 vllm_omni/diffusion/models/cosmos3/audio_tokenizer/config.py
 create mode 100755 vllm_omni/diffusion/models/cosmos3/audio_tokenizer/models.py
 create mode 100755 vllm_omni/diffusion/models/cosmos3/audio_tokenizer/modules.py
 create mode 100755 vllm_omni/diffusion/models/cosmos3/audio_tokenizer/modules_encodec.py
 create mode 100644 vllm_omni/diffusion/models/cosmos3/guardrails.py
 create mode 100644 vllm_omni/diffusion/models/cosmos3/pipeline_cosmos3.py
 create mode 100644 vllm_omni/diffusion/models/cosmos3/sound_tokenizer.py
 create mode 100644 vllm_omni/diffusion/models/cosmos3/transformer_cosmos3.py

diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md
index f30f3475888..880d7f3939d 100644
--- a/docs/models/supported_models.md
+++ b/docs/models/supported_models.md
@@ -32,6 +32,7 @@ th {
 | `ZImagePipeline` | Z-Image | `Tongyi-MAI/Z-Image-Turbo` | ✅︎ | ✅︎ | ✅︎ | ✅︎ |
 | `WanPipeline` | Wan2.1-T2V, Wan2.2-T2V, Wan2.2-TI2V | `Wan-AI/Wan2.1-T2V-1.3B-Diffusers`, `Wan-AI/Wan2.1-T2V-14B-Diffusers`, `Wan-AI/Wan2.2-T2V-A14B-Diffusers`, `Wan-AI/Wan2.2-TI2V-5B-Diffusers` | ✅︎ | ✅︎ | ✅︎ | ✅︎ |
 | `WanImageToVideoPipeline` | Wan2.2-I2V | `Wan-AI/Wan2.2-I2V-A14B-Diffusers` | ✅︎ | ✅︎ | ✅︎ | ✅︎ |
+| `Cosmos3OmniDiffusersPipeline` | Cosmos3 T2I, T2V, I2V | local Diffusers-format Cosmos3 checkpoint (`$COSMOS3_MODEL`) | ✅︎ | | | |
 | `WanSpeechToVideoPipeline` | Wan2.2-S2V | `Wan-AI/Wan2.2-S2V-14B` | ✅︎ | ✅︎ | ✅︎ | ✅︎ |
 | `Wan22VACEPipeline` | Wan2.1-VACE | `Wan-AI/Wan2.1-VACE-1.3B-diffusers`, `Wan-AI/Wan2.1-VACE-14B-diffusers` | ✅︎ | ✅︎ | ✅︎ | ✅︎ |
 | `LTX2Pipeline` | LTX-2-T2V | `Lightricks/LTX-2` | ✅︎ | ✅︎ | | |
diff --git a/docs/user_guide/diffusion/cache_acceleration/cache_dit.md b/docs/user_guide/diffusion/cache_acceleration/cache_dit.md
index eaaca84ad6d..8e55e36bd57 100644
--- a/docs/user_guide/diffusion/cache_acceleration/cache_dit.md
+++ b/docs/user_guide/diffusion/cache_acceleration/cache_dit.md
@@ -128,6 +128,22 @@ python image_edit.py \
 
 See the [image_edit.py](https://github.com/vllm-project/vllm-omni/blob/main/examples/offline_inference/image_to_image/image_edit.py) for detailed configuration options.
 
+For Cosmos3 text-to-video or image-to-video, use the video examples with the Cosmos3 pipeline class:
+
+```bash
+cd examples/offline_inference/text_to_video
+export COSMOS3_MODEL=/path/to/cosmos3-diffusers
+
+python text_to_video.py \
+    --model "$COSMOS3_MODEL" \
+    --model-class-name Cosmos3OmniDiffusersPipeline \
+    --prompt "A small warehouse robot moves a blue box across a clean floor." \
+    --cache-backend cache_dit \
+    --num-inference-steps 35
+```
+
+Cosmos3 Cache-DiT wraps the GEN denoising path. TeaCache is not implemented for Cosmos3.
+
 ### Online Serving
 
 ```bash
@@ -138,6 +154,11 @@ vllm serve Qwen/Qwen-Image --omni --port 8091 --cache-backend cache_dit
 vllm serve Qwen/Qwen-Image --omni --port 8091 \
   --cache-backend cache_dit \
   --cache-config '{"Fn_compute_blocks": 1, "residual_diff_threshold": 0.12}'
+
+# Cosmos3
+vllm serve "$COSMOS3_MODEL" --omni --port 8091 \
+  --model-class-name Cosmos3OmniDiffusersPipeline \
+  --cache-backend cache_dit
 ```
 
 ---
diff --git a/docs/user_guide/diffusion/cpu_offload_diffusion.md b/docs/user_guide/diffusion/cpu_offload_diffusion.md
index 39dc366485e..d725502da1d 100644
--- a/docs/user_guide/diffusion/cpu_offload_diffusion.md
+++ b/docs/user_guide/diffusion/cpu_offload_diffusion.md
@@ -194,6 +194,7 @@ Factory function `get_offload_backend()` selects the appropriate backend based o
 | OvisImagePipeline | `AIDC-AI/Ovis-Image-7B` | `OvisImageTransformer2DModel` | - | ✓ | `"transformer"` |
 | QwenImagePipeline | `Qwen/Qwen-Image` | `QwenImageTransformer2DModel` | ✓ | ✓ | `"transformer_blocks"` |
 | StableDiffusion3Pipeline | `stabilityai/stable-diffusion-3.5-medium` | `SD3Transformer2DModel` | - | ✓ | `"transformer_blocks"` |
+| Cosmos3OmniDiffusersPipeline | `$COSMOS3_MODEL` | `Cosmos3VFMTransformer` | - | ✓ | `"gen_layers"` |
 | Wan22I2VPipeline | `Wan-AI/Wan2.2-I2V-A14B-Diffusers` | `WanTransformer3DModel` | ✓ | ✓ | `"blocks"` |
 | Wan22Pipeline | `Wan-AI/Wan2.2-T2V-A14B-Diffusers` | `WanTransformer3DModel` | ✓ | ✓ | `"blocks"` |
 | BagelPipeline | `ByteDance-Seed/BAGEL-7B-MoT` | `Qwen2MoTModel` | - | ✓ | `"layers"`, `"customized modules"` |
@@ -201,3 +202,4 @@ Factory function `get_offload_backend()` selects the appropriate backend based o
 **Notes:**
 - Model-Level Offloading is expected to be supported by all common diffusion models (DiT and encoders) naturally
 - Layerwise Offloading requires DiT class to define `_layerwise_offload_blocks_attrs` pointing to transformer blocks
+- Cosmos3 uses the singular `_layerwise_offload_blocks_attr` compatibility path and offloads GEN decoder layers.
diff --git a/docs/user_guide/diffusion/parallelism/cfg_parallel.md b/docs/user_guide/diffusion/parallelism/cfg_parallel.md
index 5541106680a..ce468d817cd 100644
--- a/docs/user_guide/diffusion/parallelism/cfg_parallel.md
+++ b/docs/user_guide/diffusion/parallelism/cfg_parallel.md
@@ -144,6 +144,15 @@ sampling_params = OmniDiffusionSamplingParams(
 )
 ```
 
+For Cosmos3, use `guidance_scale` rather than `true_cfg_scale`:
+
+```python
+sampling_params = OmniDiffusionSamplingParams(
+    num_inference_steps=35,
+    guidance_scale=4.0,
+)
+```
+
 2. **Add negative prompt:**
 ```python
 outputs = omni.generate(
diff --git a/docs/user_guide/diffusion_features.md b/docs/user_guide/diffusion_features.md
index 18792a9d665..606c8b9aeca 100644
--- a/docs/user_guide/diffusion_features.md
+++ b/docs/user_guide/diffusion_features.md
@@ -108,6 +108,7 @@ The following tables show which models support each feature:
 | Model | ⚡TeaCache | ⚡Cache-DiT | 🔀SP (Ulysses & Ring) | 🔀CFG-Parallel | 🔀Tensor-Parallel | 🔀HSDP | 💾CPU Offload (Layerwise) | 💾VAE-Patch-Parallel | 💾Quantization | 🔄Step Execution |
 |-------|:----------:|:-----------:|:---------------------:|:--------------:|:-----------------:|:------:|:------------------------:|:--------------------:|:--------------:|:----------------:|
 | **Bagel** | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅  | ❌ | ❌ | ❌ |
+| **Cosmos3 (T2I)** | ❌ | ✅ | ✅ (Ulysses) | ✅ | ✅ | ✅ | ✅ | ✅ (decode) | ✅ | ❌ |
 | **FLUX.1-dev** | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ |
 | **FLUX.1-schnell** | ❌ | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ |
 | **FLUX.2-klein** | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ |
@@ -135,12 +136,14 @@ The following tables show which models support each feature:
 > Notes:
 > 1. Nextstep_1(T2I) does not support cache acceleration methods such as TeaCache or Cache-DiT.
 > 2. `Tongyi-MAI/Z-Image-Turbo` and `SII-GAIR/daVinci-MagiHuman-Base-1080p` are distilled models with minimal NFEs; CFG-Parallel is not necessary.
+> 3. Cosmos3 T2I uses `Cosmos3OmniDiffusersPipeline` with `modalities=["image"]`. Model-level CPU offload is not supported; use layerwise offload.
 
 ### VideoGen
 
 | Model | ⚡TeaCache | ⚡Cache-DiT | 🔀SP (Ulysses & Ring) | 🔀CFG-Parallel | 🔀Tensor-Parallel | 🔀HSDP | 💾CPU Offload (Layerwise) | 💾VAE-Patch-Parallel | 💾Quantization | 🔄Step Execution |
 |-------|:----------:|:-----------:|:---------------------:|:--------------:|:-----------------:|:------:|:------------------------:|:--------------------:|:--------------:|:----------------:|
 | **Wan2.2** | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ (encode/decode) | ❌ | ❌ |
+| **Cosmos3 (T2V/I2V)** | ❌ | ✅ | ✅ (Ulysses) | ✅ | ✅ | ✅ | ✅ | ✅ (encode/decode) | ✅ | ❌ |
 | **Wan2.1-VACE** | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ (decode) | ❌ | ❌ |
 | **LTX-2** | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
 | **LTX-2.3** | ❌ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ |
@@ -148,6 +151,9 @@ The following tables show which models support each feature:
 | **HunyuanVideo-1.5 T2V I2V** | ❌ | ✅ | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ (decode) | ✅ | ❌ |
 | **DreamID-Omni** | ❌ | ❌ | ❌ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
 
+> Notes:
+> 1. Cosmos3 T2V and I2V use `Cosmos3OmniDiffusersPipeline` with video output. I2V is selected when the request includes an input image. Model-level CPU offload is not supported; use layerwise offload.
+
 **Frame Interpolation Support**
 
 - **Supported**: Wan2.2 text-to-video, image-to-video, and TI2V pipelines
diff --git a/docs/user_guide/examples/offline_inference/image_to_video.md b/docs/user_guide/examples/offline_inference/image_to_video.md
index 6e105741a7e..5011ccf1978 100644
--- a/docs/user_guide/examples/offline_inference/image_to_video.md
+++ b/docs/user_guide/examples/offline_inference/image_to_video.md
@@ -3,7 +3,15 @@
 Source <https://github.com/vllm-project/vllm-omni/tree/main/examples/offline_inference/image_to_video>.
 
 
-This example demonstrates how to generate videos from images using Wan2.2 Image-to-Video models with vLLM-Omni's offline inference API.
+This example demonstrates how to generate videos from images using Wan2.2 Image-to-Video models and Cosmos3 with vLLM-Omni's offline inference API.
+
+## Supported Models
+
+| Model | Default Resolution | Default Frames | Default Steps | Guidance |
+|-------|--------------------|----------------|---------------|----------|
+| `Wan-AI/Wan2.2-I2V-A14B-Diffusers` | auto, 480p area | 81 | 50 | 5.0 |
+| `Wan-AI/Wan2.2-TI2V-5B-Diffusers` | auto, 480p area | 81 | 50 | 5.0 |
+| `$COSMOS3_MODEL` with `Cosmos3OmniDiffusersPipeline` | auto, 720p area | 81 | 35 | 4.0 |
 
 ## Local CLI Usage
 
@@ -51,20 +59,46 @@ python image_to_video.py \
   --output i2v_output.mp4
 ```
 
+### Cosmos3
+
+Cosmos3 uses one pipeline for text-to-image, text-to-video, and image-to-video. Set `COSMOS3_MODEL` to a local Diffusers-format Cosmos3 checkpoint or model reference, and select the pipeline explicitly.
+
+```bash
+export COSMOS3_MODEL=/path/to/cosmos3-diffusers
+
+python image_to_video.py \
+  --model "$COSMOS3_MODEL" \
+  --model-class-name Cosmos3OmniDiffusersPipeline \
+  --image cherry_blossom.jpg \
+  --prompt "Cherry blossoms swaying gently in the breeze, petals falling, smooth motion" \
+  --negative-prompt "blurry, distorted, low quality" \
+  --height 720 \
+  --width 1280 \
+  --num-frames 81 \
+  --guidance-scale 4.0 \
+  --num-inference-steps 35 \
+  --fps 24 \
+  --output cosmos3_i2v_output.mp4
+```
+
+For Cosmos3 I2V, the input image is resized and center-cropped by the pipeline. If `--height` and `--width` are omitted, this example chooses a 720p-area resolution from the input aspect ratio. Cosmos3 currently supports one prompt and one video per request, and model-level CPU offload is not supported; use `--enable-layerwise-offload` instead.
+
 Key arguments:
 
 - `--model`: Model ID (I2V-A14B for MoE, TI2V-5B for unified T2V+I2V).
+- `--model-class-name`: Explicit pipeline class. Use `Cosmos3OmniDiffusersPipeline` for Cosmos3 checkpoints.
 - `--image`: Path to input image (required).
 - `--prompt`: Text description of desired motion/animation.
 - `--height/--width`: Output resolution (auto-calculated from image if not set). Dimensions should be multiples of 16.
-- `--num-frames`: Number of frames (default 81).
+- `--num-frames`: Number of frames (default is model-specific).
 - `--guidance-scale` and `--guidance-scale-high`: CFG scale (applied to low/high-noise stages for MoE).
 - `--negative-prompt`: Optional list of artifacts to suppress.
 - `--boundary-ratio`: Boundary split ratio for two-stage MoE models.
-- `--flow-shift`: Scheduler flow shift (5.0 for 720p, 12.0 for 480p).
+- `--flow-shift`: Scheduler flow shift. Defaults are model-specific.
 - `--sample-solver`: Wan2.2 sampling solver. Use `unipc` for the default multistep solver, or `euler` for Lightning/Distill checkpoints.
-- `--num-inference-steps`: Number of denoising steps (default 50).
+- `--num-inference-steps`: Number of denoising steps (default is model-specific).
 - `--fps`: Frames per second for the saved MP4 (requires `diffusers` export_to_video).
+- `--frame-rate`: Generation frame rate for models that use it. Defaults to `--fps`.
 - `--output`: Path to save the generated video.
 - `--vae-use-slicing`: Enable VAE slicing for memory optimization.
 - `--vae-use-tiling`: Enable VAE tiling for memory optimization.
diff --git a/docs/user_guide/examples/offline_inference/text_to_image.md b/docs/user_guide/examples/offline_inference/text_to_image.md
index 3a97ffbf74b..e9bf48d7aa1 100644
--- a/docs/user_guide/examples/offline_inference/text_to_image.md
+++ b/docs/user_guide/examples/offline_inference/text_to_image.md
@@ -36,6 +36,7 @@ This folder provides several entrypoints for experimenting with text-to-image di
 | `black-forest-labs/FLUX.2-klein-4B` | 1024 x 1024 | 72.7 | 14.9 |
 | `black-forest-labs/FLUX.2-klein-9B` | 1024 x 1024 | 37.1 | 32.3 |
 | `black-forest-labs/FLUX.2-dev` | 1024 x 1024 | 65.7 | >80 (CPU offload required) |
+| `$COSMOS3_MODEL` with `Cosmos3OmniDiffusersPipeline` | 1024 x 1024 | model/checkpoint dependent | model/checkpoint dependent |
 
 !!! info
 *Peak VRAM:  based on basic single-card usage, batch size =1, without any acceleration/optimization features. FLUX.2-dev requires `--enable-cpu-offload` on a single 80 GiB GPU.
@@ -74,6 +75,7 @@ python text_to_image.py \
 
 | Argument | Type | Default | Description |
 | -------- | ---- | ------- | ----------- |
+| `--model` | str | `"Qwen/Qwen-Image"` | Diffusion model name or local path |
 | `--prompt` | str | `"a cup of coffee on the table"` | Text description for image generation |
 | `--seed` | int | `142` | Integer seed for deterministic sampling |
 | `--negative-prompt` | str | `None` | Negative prompt for classifier-free conditional guidance |
@@ -87,6 +89,9 @@ python text_to_image.py \
 | `--vae-use-slicing` | flag | off | Enable VAE slicing for memory optimization |
 | `--vae-use-tiling` | flag | off | Enable VAE tiling for memory optimization |
 | `--cfg-parallel-size` | int | `1` | Set to `2` to enable CFG Parallel |
+| `--ulysses-degree` | int | `1` | Ulysses sequence parallel degree for multi-GPU inference |
+| `--ring-degree` | int | `1` | Ring sequence parallel degree for hybrid Ulysses + Ring inference |
+| `--ulysses-mode` | str | `"strict"` | Ulysses SP mode: `"strict"` or `"advanced_uaa"` |
 | `--enable-cpu-offload` | flag | off | Enable CPU offloading for diffusion models |
 | `--lora-path` | str | — | Path to PEFT LoRA adapter folder |
 | `--lora-scale` | float | `1.0` | Scale factor for LoRA weights |
@@ -160,6 +165,28 @@ python examples/offline_inference/text_to_image/text_to_image.py \
   --output flux2-dev.png
 ```
 
+### Cosmos3
+
+Cosmos3 uses one pipeline for text-to-image, text-to-video, and image-to-video. Set `COSMOS3_MODEL` to a local Diffusers-format Cosmos3 checkpoint or model reference, and select the pipeline explicitly.
+
+```bash
+export COSMOS3_MODEL=/path/to/cosmos3-diffusers
+
+python text_to_image.py \
+  --model "$COSMOS3_MODEL" \
+  --model-class-name Cosmos3OmniDiffusersPipeline \
+  --prompt "A small warehouse robot carrying a blue box, clean product photography" \
+  --negative-prompt "blurry, distorted, low quality" \
+  --guidance-scale 7.0 \
+  --num-inference-steps 50 \
+  --height 1024 \
+  --width 1024 \
+  --num-images-per-prompt 1 \
+  --output cosmos3_t2i.png
+```
+
+This script marks text-to-image requests with `modalities=["image"]`, which selects Cosmos3 T2I. Cosmos3 currently supports one prompt per request; use `--num-images-per-prompt` to request multiple images for that prompt. Model-level CPU offload is not supported for Cosmos3; use `--enable-layerwise-offload` instead.
+
 ### Batch Requests (Multiple Prompts)
 
 You can pass multiple prompts in a single `generate` call.
diff --git a/docs/user_guide/examples/offline_inference/text_to_video.md b/docs/user_guide/examples/offline_inference/text_to_video.md
index a09dbfc979f..861af8ca1d4 100644
--- a/docs/user_guide/examples/offline_inference/text_to_video.md
+++ b/docs/user_guide/examples/offline_inference/text_to_video.md
@@ -14,6 +14,7 @@ For backend selection and SageAttention usage, see the [Diffusion Attention Back
 | `Wan-AI/Wan2.2-T2V-A14B-Diffusers` | 720x1280 | 81 | 40 | 4.0 | ~60 GiB |
 | `hunyuanvideo-community/HunyuanVideo-1.5-Diffusers-480p_t2v` | 480x832 | 121 | 50 | 6.0 | 1×A100 80GB |
 | `hunyuanvideo-community/HunyuanVideo-1.5-Diffusers-720p_t2v` | 720x1280 | 121 | 50 | 6.0 | FP8 + VAE tiling required |
+| `$COSMOS3_MODEL` with `Cosmos3OmniDiffusersPipeline` | 720x1280 | 81 | 35 | 4.0 | model/checkpoint dependent |
 
 ## Local CLI Usage
 
@@ -50,6 +51,29 @@ python text_to_video.py \
   --output ltx2_out.mp4
 ```
 
+### Cosmos3
+
+Cosmos3 uses one pipeline for text-to-image, text-to-video, and image-to-video. Set `COSMOS3_MODEL` to a local Diffusers-format Cosmos3 checkpoint or model reference, and select the pipeline explicitly.
+
+```bash
+export COSMOS3_MODEL=/path/to/cosmos3-diffusers
+
+python text_to_video.py \
+  --model "$COSMOS3_MODEL" \
+  --model-class-name Cosmos3OmniDiffusersPipeline \
+  --prompt "A small warehouse robot moves a blue box across a clean floor." \
+  --negative-prompt "blurry, distorted, low quality" \
+  --height 720 \
+  --width 1280 \
+  --num-frames 81 \
+  --guidance-scale 4.0 \
+  --num-inference-steps 35 \
+  --fps 24 \
+  --output cosmos3_t2v_output.mp4
+```
+
+Cosmos3 video generation currently supports one prompt and one video per request. The implementation supports `--cache-backend cache_dit`, `--cfg-parallel-size 2`, `--ulysses-degree`, `--tensor-parallel-size`, `--use-hsdp`, and `--enable-layerwise-offload`. Do not use `--enable-cpu-offload`; Cosmos3 does not support model-level CPU offload.
+
 ### HunyuanVideo-1.5 (480p)
 
 ```bash
@@ -127,6 +151,7 @@ python text_to_video.py \
 - `--audio-sample-rate`: audio sample rate for embedded audio (when the pipeline returns audio).
 - `--quantization`: quantization method (`fp8` for FP8, `gguf` for GGUF).
 - `--flow-shift`: scheduler flow_shift parameter.
+- `--cache-backend`: cache acceleration backend (`cache_dit` for supported models).
 
 ### Wan2.2-specific
 
diff --git a/docs/user_guide/examples/online_serving/image_to_video.md b/docs/user_guide/examples/online_serving/image_to_video.md
index 781f0c2a5ed..1ef5c9be318 100644
--- a/docs/user_guide/examples/online_serving/image_to_video.md
+++ b/docs/user_guide/examples/online_serving/image_to_video.md
@@ -3,7 +3,15 @@
 Source <https://github.com/vllm-project/vllm-omni/tree/main/examples/online_serving/image_to_video>.
 
 
-This example demonstrates how to deploy the Wan2.2 image-to-video model for online video generation using vLLM-Omni.
+This example demonstrates how to deploy image-to-video models, including Wan2.2 and Cosmos3, for online video generation using vLLM-Omni.
+
+## Supported Models
+
+| Model | Model ID |
+|-------|----------|
+| Wan2.2 I2V | `Wan-AI/Wan2.2-I2V-A14B-Diffusers` |
+| Wan2.2 TI2V | `Wan-AI/Wan2.2-TI2V-5B-Diffusers` |
+| Cosmos3 I2V | `$COSMOS3_MODEL` with `Cosmos3OmniDiffusersPipeline` |
 
 ## Start Server
 
@@ -29,6 +37,22 @@ The script allows overriding:
 - `CACHE_BACKEND` (default: `none`)
 - `ENABLE_CACHE_DIT_SUMMARY` (default: `0`)
 
+### Cosmos3
+
+Cosmos3 uses one pipeline for text-to-image, text-to-video, and image-to-video. Set `COSMOS3_MODEL` to a local Diffusers-format Cosmos3 checkpoint or model reference, and select the pipeline explicitly.
+
+```bash
+export COSMOS3_MODEL=/path/to/cosmos3-diffusers
+
+vllm serve "$COSMOS3_MODEL" \
+  --omni \
+  --port 8091 \
+  --model-class-name Cosmos3OmniDiffusersPipeline \
+  --allowed-local-media-path /path/to/input-media
+```
+
+Use `--enable-layerwise-offload`, `--cache-backend cache_dit`, `--cfg-parallel-size 2`, `--usp`, `--tensor-parallel-size`, or `--use-hsdp` as needed. Do not use `--enable-cpu-offload`; Cosmos3 does not support model-level CPU offload.
+
 ## Async Job Behavior
 
 `POST /v1/videos` is asynchronous. It creates a video job and immediately
@@ -59,6 +83,7 @@ file. Metadata is returned via response headers:
 - `X-Model`: model name used for generation
 - `X-Inference-Time-S`: wall-clock inference time in seconds
 
+### Wan2.2 Sync Request
 ```bash
 curl -X POST http://localhost:8091/v1/videos/sync \
   -F "prompt=A bear playing with yarn, smooth motion" \
@@ -79,6 +104,53 @@ curl -X POST http://localhost:8091/v1/videos/sync \
   -o sync_i2v_output.mp4
 ```
 
+### Cosmos3 Sync Request
+
+```bash
+curl -X POST http://localhost:8091/v1/videos/sync \
+  -F "prompt=Cherry blossoms swaying gently in the breeze, petals falling, smooth motion" \
+  -F "negative_prompt=blurry, distorted, low quality" \
+  -F "input_reference=@/path/to/cherry_blossom.jpg" \
+  -F "size=1280x720" \
+  -F "num_frames=81" \
+  -F "fps=24" \
+  -F "num_inference_steps=35" \
+  -F "guidance_scale=4.0" \
+  -F "seed=42" \
+  -o cosmos3_i2v_output.mp4
+```
+
+For async generation, send the same form fields to `POST /v1/videos`, poll `GET /v1/videos/{video_id}`, and download from `GET /v1/videos/{video_id}/content`. Cosmos3 currently supports one prompt and one video per request.
+
+```bash
+create_response=$(curl -s http://localhost:8091/v1/videos \
+  -H "Accept: application/json" \
+  -F "prompt=Cherry blossoms swaying gently in the breeze, petals falling, smooth motion" \
+  -F "negative_prompt=blurry, distorted, low quality" \
+  -F "input_reference=@/path/to/cherry_blossom.jpg" \
+  -F "size=1280x720" \
+  -F "num_frames=81" \
+  -F "fps=24" \
+  -F "num_inference_steps=35" \
+  -F "guidance_scale=4.0" \
+  -F "seed=42")
+
+video_id=$(echo "$create_response" | jq -r '.id')
+while true; do
+  status=$(curl -s "http://localhost:8091/v1/videos/${video_id}" | jq -r '.status')
+  if [ "$status" = "completed" ]; then
+    break
+  fi
+  if [ "$status" = "failed" ]; then
+    echo "Video generation failed"
+    exit 1
+  fi
+  sleep 2
+done
+
+curl -L "http://localhost:8091/v1/videos/${video_id}/content" -o cosmos3_i2v_output.mp4
+```
+
 ## Storage
 
 Generated video files are stored on local disk by the async video API.
@@ -103,6 +175,7 @@ export VLLM_OMNI_STORAGE_MAX_CONCURRENCY=8
 bash run_curl_image_to_video.sh
 
 # Or execute directly (OpenAI-style multipart)
+# Note: frame-interpolation arguments apply only to Wan2.2 models
 create_response=$(curl -s http://localhost:8091/v1/videos \
   -H "Accept: application/json" \
   -F "prompt=A bear playing with yarn, smooth motion" \
@@ -165,6 +238,7 @@ curl -X POST http://localhost:8091/v1/videos \
 ### Generation with Parameters
 
 ```bash
+# Note: frame-interpolation arguments apply only to Wan2.2 models
 curl -X POST http://localhost:8091/v1/videos \
   -F "prompt=A bear playing with yarn, smooth motion" \
   -F "negative_prompt=low quality, blurry, static" \
diff --git a/docs/user_guide/examples/online_serving/text_to_image.md b/docs/user_guide/examples/online_serving/text_to_image.md
index 69c1480e39f..894a1b4be6b 100644
--- a/docs/user_guide/examples/online_serving/text_to_image.md
+++ b/docs/user_guide/examples/online_serving/text_to_image.md
@@ -23,6 +23,21 @@ Or use the startup script:
 bash run_server.sh
 ```
 
+### Cosmos3
+
+Cosmos3 uses one pipeline for text-to-image, text-to-video, and image-to-video. Set `COSMOS3_MODEL` to a local Diffusers-format Cosmos3 checkpoint or model reference, and select the pipeline explicitly.
+
+```bash
+export COSMOS3_MODEL=/path/to/cosmos3-diffusers
+
+vllm serve "$COSMOS3_MODEL" \
+  --omni \
+  --port 8091 \
+  --model-class-name Cosmos3OmniDiffusersPipeline
+```
+
+Use `--enable-layerwise-offload`, `--cache-backend cache_dit`, `--cfg-parallel-size 2`, `--usp`, `--tensor-parallel-size`, or `--use-hsdp` as needed. Do not use `--enable-cpu-offload`; Cosmos3 does not support model-level CPU offload.
+
 ### Start with Parallelism Acceleration
 
 Enable Tensor Parallelism and VAE Patch Parallelism for faster inference:
@@ -71,6 +86,26 @@ curl -s http://localhost:8091/v1/chat/completions \
   }' | jq -r '.choices[0].message.content[0].image_url.url' | cut -d',' -f2- | base64 -d > output.png
 ```
 
+#### Cosmos3 Images API
+
+The dedicated image endpoint sets `modalities=["image"]` internally, which selects Cosmos3 text-to-image.
+
+```bash
+curl -X POST http://localhost:8091/v1/images/generations \
+  -H "Content-Type: application/json" \
+  -d '{
+    "prompt": "A small warehouse robot carrying a blue box, clean product photography",
+    "size": "1024x1024",
+    "n": 1,
+    "num_inference_steps": 50,
+    "guidance_scale": 7.0,
+    "negative_prompt": "blurry, distorted, low quality",
+    "seed": 42
+  }' | jq -r '.data[0].b64_json' | base64 -d > cosmos3_t2i.png
+```
+
+Cosmos3 currently supports one prompt per request. Use `n` to request multiple images for that prompt.
+
 ### Method 2: Using OpenAI Python SDK
 
 ```python
@@ -248,6 +283,7 @@ directly. For image dimensions and count, use `size` and `n` rather than
 | `height`                 | int   | None    | Image height in pixels         |
 | `width`                  | int   | None    | Image width in pixels          |
 | `size`                   | str   | None    | Image size (e.g., "1024x1024") |
+| `n`                      | int   | 1       | Number of images for `/v1/images/generations` |
 | `num_inference_steps`    | int   | 50      | Number of denoising steps      |
 | `true_cfg_scale`         | float | 4.0     | Qwen-Image CFG scale           |
 | `seed`                   | int   | None    | Random seed (reproducible)     |
diff --git a/docs/user_guide/examples/online_serving/text_to_video.md b/docs/user_guide/examples/online_serving/text_to_video.md
index b918aac19d0..d5aa8a154ff 100644
--- a/docs/user_guide/examples/online_serving/text_to_video.md
+++ b/docs/user_guide/examples/online_serving/text_to_video.md
@@ -13,6 +13,7 @@ This example demonstrates how to deploy text-to-video models for online video ge
 | Wan2.1 T2V (14B) | `Wan-AI/Wan2.1-T2V-14B-Diffusers` |
 | Wan2.2 T2V | `Wan-AI/Wan2.2-T2V-A14B-Diffusers` |
 | LTX-2 | `Lightricks/LTX-2` |
+| Cosmos3 T2V | `$COSMOS3_MODEL` with `Cosmos3OmniDiffusersPipeline` |
 
 ## Wan2.2 T2V
 
@@ -40,6 +41,23 @@ The script allows overriding:
 - `CACHE_BACKEND` (default: `none`)
 - `ENABLE_CACHE_DIT_SUMMARY` (default: `0`)
 
+## Cosmos3 T2V
+
+Cosmos3 uses one pipeline for text-to-image, text-to-video, and image-to-video. Set `COSMOS3_MODEL` to a local Diffusers-format Cosmos3 checkpoint or model reference, and select the pipeline explicitly.
+
+### Start Server
+
+```bash
+export COSMOS3_MODEL=/path/to/cosmos3-diffusers
+
+vllm serve "$COSMOS3_MODEL" \
+  --omni \
+  --port 8091 \
+  --model-class-name Cosmos3OmniDiffusersPipeline
+```
+
+Use `--enable-layerwise-offload`, `--cache-backend cache_dit`, `--cfg-parallel-size 2`, `--usp`, `--tensor-parallel-size`, or `--use-hsdp` as needed. Do not use `--enable-cpu-offload`; Cosmos3 does not support model-level CPU offload.
+
 ## Async Job Behavior
 
 `POST /v1/videos` is asynchronous. It creates a video job and immediately
@@ -85,6 +103,51 @@ curl -X POST http://localhost:8091/v1/videos/sync \
   -o sync_t2v_output.mp4
 ```
 
+### Cosmos3 Sync Request
+
+```bash
+curl -X POST http://localhost:8091/v1/videos/sync \
+  -F "prompt=A small warehouse robot moves a blue box across a clean floor." \
+  -F "negative_prompt=blurry, distorted, low quality" \
+  -F "size=1280x720" \
+  -F "num_frames=81" \
+  -F "fps=24" \
+  -F "num_inference_steps=35" \
+  -F "guidance_scale=4.0" \
+  -F "seed=42" \
+  -o cosmos3_t2v_output.mp4
+```
+
+For async generation, send the same form fields to `POST /v1/videos`, poll `GET /v1/videos/{video_id}`, and download from `GET /v1/videos/{video_id}/content`. Cosmos3 currently supports one prompt and one video per request.
+
+```bash
+create_response=$(curl -s http://localhost:8091/v1/videos \
+  -H "Accept: application/json" \
+  -F "prompt=A small warehouse robot moves a blue box across a clean floor." \
+  -F "negative_prompt=blurry, distorted, low quality" \
+  -F "size=1280x720" \
+  -F "num_frames=81" \
+  -F "fps=24" \
+  -F "num_inference_steps=35" \
+  -F "guidance_scale=4.0" \
+  -F "seed=42")
+
+video_id=$(echo "$create_response" | jq -r '.id')
+while true; do
+  status=$(curl -s "http://localhost:8091/v1/videos/${video_id}" | jq -r '.status')
+  if [ "$status" = "completed" ]; then
+    break
+  fi
+  if [ "$status" = "failed" ]; then
+    echo "Video generation failed"
+    exit 1
+  fi
+  sleep 2
+done
+
+curl -L "http://localhost:8091/v1/videos/${video_id}/content" -o cosmos3_t2v_output.mp4
+```
+
 ## Storage
 
 Generated video files are stored on local disk by the async video API.
@@ -153,6 +216,7 @@ curl -X POST http://localhost:8091/v1/videos \
 ### Generation with Parameters
 
 ```bash
+# Note: the frame-interpolation arguments apply only to Wan2.2 models
 curl -X POST http://localhost:8091/v1/videos \
   -F "prompt=A cinematic view of a futuristic city at sunset" \
   -F "width=832" \
@@ -173,32 +237,32 @@ curl -X POST http://localhost:8091/v1/videos \
 
 ## Generation Parameters
 
-| Parameter             | Type   | Default | Description                                      |
-| --------------------- | ------ | ------- | ------------------------------------------------ |
-| `prompt`              | str    | -       | Text description of the desired video            |
-| `seconds`             | str    | None    | Clip duration in seconds                         |
-| `size`                | str    | None    | Output size in `WIDTHxHEIGHT` format             |
-| `negative_prompt`     | str    | None    | Negative prompt                                  |
-| `width`               | int    | None    | Video width in pixels                            |
-| `height`              | int    | None    | Video height in pixels                           |
-| `num_frames`          | int    | None    | Number of frames to generate                     |
-| `fps`                 | int    | None    | Frames per second for output video               |
-| `num_inference_steps` | int    | None    | Number of denoising steps                        |
-| `guidance_scale`      | float  | None    | CFG guidance scale (low-noise stage)             |
-| `guidance_scale_2`    | float  | None    | CFG guidance scale (high-noise stage, Wan2.2)     |
-| `boundary_ratio`      | float  | None    | Boundary split ratio for low/high DiT (Wan2.2)   |
-| `flow_shift`          | float  | None    | Scheduler flow shift (Wan2.2)                    |
-| `seed`                | int    | None    | Random seed (reproducible)                       |
-| `lora`                | object | None    | LoRA configuration                               |
-| `enable_frame_interpolation` | bool | false | Enable RIFE frame interpolation before MP4 encoding |
-| `frame_interpolation_exp` | int | 1 | Interpolation exponent; 1=2x temporal resolution, 2=4x |
-| `frame_interpolation_scale` | float | 1.0 | RIFE inference scale; use 0.5 for high-resolution inputs |
-| `frame_interpolation_model_path` | str | None | Local directory or Hugging Face repo ID with `flownet.pkl`; defaults to `elfgum/RIFE-4.22.lite` |
+| Parameter             | Type   | Default | Description                                                                                              |
+| --------------------- | ------ | ------- |----------------------------------------------------------------------------------------------------------|
+| `prompt`              | str    | -       | Text description of the desired video                                                                    |
+| `seconds`             | str    | None    | Clip duration in seconds                                                                                 |
+| `size`                | str    | None    | Output size in `WIDTHxHEIGHT` format                                                                     |
+| `negative_prompt`     | str    | None    | Negative prompt                                                                                          |
+| `width`               | int    | None    | Video width in pixels                                                                                    |
+| `height`              | int    | None    | Video height in pixels                                                                                   |
+| `num_frames`          | int    | None    | Number of frames to generate                                                                             |
+| `fps`                 | int    | None    | Frames per second for output video                                                                       |
+| `num_inference_steps` | int    | None    | Number of denoising steps                                                                                |
+| `guidance_scale`      | float  | None    | CFG guidance scale (low-noise stage)                                                                     |
+| `guidance_scale_2`    | float  | None    | CFG guidance scale (high-noise stage, Wan2.2)                                                            |
+| `boundary_ratio`      | float  | None    | Boundary split ratio for low/high DiT (Wan2.2)                                                           |
+| `flow_shift`          | float  | None    | Scheduler flow shift                                                                                     |
+| `seed`                | int    | None    | Random seed (reproducible)                                                                               |
+| `lora`                | object | None    | LoRA configuration                                                                                       |
+| `enable_frame_interpolation` | bool | false | Enable RIFE frame interpolation before MP4 encoding (Wan2.2)                                             |
+| `frame_interpolation_exp` | int | 1 | Interpolation exponent; 1=2x temporal resolution, 2=4x (Wan2.2)                                          |
+| `frame_interpolation_scale` | float | 1.0 | RIFE inference scale; use 0.5 for high-resolution inputs (Wan2.2)                                        |
+| `frame_interpolation_model_path` | str | None | Local directory or Hugging Face repo ID with `flownet.pkl`; defaults to `elfgum/RIFE-4.22.lite` (Wan2.2) |
 
 ## Frame Interpolation
 
 Frame interpolation is an optional post-processing step for `/v1/videos` and
-`/v1/videos/sync`. It synthesizes intermediate frames between generated frames
+`/v1/videos/sync` that applies only to Wan2.2 models. It synthesizes intermediate frames between generated frames
 without rerunning the diffusion model. If the generated video has `N` frames,
 the interpolated output frame count is `(N - 1) * 2**exp + 1`. The encoder FPS
 is multiplied by `2**exp` so the output duration remains close to the original.
@@ -210,6 +274,7 @@ device without blocking the FastAPI event loop.
 Example: generate 5 frames and interpolate to 9 frames:
 
 ```bash
+# Note: the frame-interpolation arguments apply only to Wan2.2 models
 curl -X POST http://localhost:8091/v1/videos/sync \
   -F "prompt=A dog running through a park" \
   -F "num_frames=5" \
diff --git a/examples/offline_inference/image_to_video/README.md b/examples/offline_inference/image_to_video/README.md
index a458850a02b..8de4cafce78 100644
--- a/examples/offline_inference/image_to_video/README.md
+++ b/examples/offline_inference/image_to_video/README.md
@@ -1,6 +1,14 @@
 # Image-To-Video
 
-This example demonstrates how to generate videos from images using Wan2.2 Image-to-Video models with vLLM-Omni's offline inference API.
+This example demonstrates how to generate videos from images using Wan2.2 Image-to-Video models and Cosmos3 with vLLM-Omni's offline inference API.
+
+## Supported Models
+
+| Model | Default Resolution | Default Frames | Default Steps | Guidance |
+|-------|--------------------|----------------|---------------|----------|
+| `Wan-AI/Wan2.2-I2V-A14B-Diffusers` | auto, 480p area | 81 | 50 | 5.0 |
+| `Wan-AI/Wan2.2-TI2V-5B-Diffusers` | auto, 480p area | 81 | 50 | 5.0 |
+| `$COSMOS3_MODEL` with `Cosmos3OmniDiffusersPipeline` | auto, 720p area | 81 | 35 | 4.0 |
 
 ## Local CLI Usage
 
@@ -48,20 +56,46 @@ python image_to_video.py \
   --output i2v_output.mp4
 ```
 
+### Cosmos3
+
+Cosmos3 uses one pipeline for text-to-image, text-to-video, and image-to-video. Set `COSMOS3_MODEL` to a local Diffusers-format Cosmos3 checkpoint or model reference, and select the pipeline explicitly.
+
+```bash
+export COSMOS3_MODEL=/path/to/cosmos3-diffusers
+
+python image_to_video.py \
+  --model "$COSMOS3_MODEL" \
+  --model-class-name Cosmos3OmniDiffusersPipeline \
+  --image cherry_blossom.jpg \
+  --prompt "Cherry blossoms swaying gently in the breeze, petals falling, smooth motion" \
+  --negative-prompt "blurry, distorted, low quality" \
+  --height 720 \
+  --width 1280 \
+  --num-frames 81 \
+  --guidance-scale 4.0 \
+  --num-inference-steps 35 \
+  --fps 24 \
+  --output cosmos3_i2v_output.mp4
+```
+
+For Cosmos3 I2V, the input image is resized and center-cropped by the pipeline. If `--height` and `--width` are omitted, this example chooses a 720p-area resolution from the input aspect ratio. Cosmos3 currently supports one prompt and one video per request, and model-level CPU offload is not supported; use `--enable-layerwise-offload` instead.
+
 Key arguments:
 
 - `--model`: Model ID (I2V-A14B for MoE, TI2V-5B for unified T2V+I2V).
+- `--model-class-name`: Explicit pipeline class. Use `Cosmos3OmniDiffusersPipeline` for Cosmos3 checkpoints.
 - `--image`: Path to input image (required).
 - `--prompt`: Text description of desired motion/animation.
 - `--height/--width`: Output resolution (auto-calculated from image if not set). Dimensions should be multiples of 16.
-- `--num-frames`: Number of frames (default 81).
+- `--num-frames`: Number of frames (default is model-specific).
 - `--guidance-scale` and `--guidance-scale-high`: CFG scale (applied to low/high-noise stages for MoE).
 - `--negative-prompt`: Optional list of artifacts to suppress.
 - `--boundary-ratio`: Boundary split ratio for two-stage MoE models.
-- `--flow-shift`: Scheduler flow shift (5.0 for 720p, 12.0 for 480p).
+- `--flow-shift`: Scheduler flow shift. Defaults are model-specific.
 - `--sample-solver`: Wan2.2 sampling solver. Use `unipc` for the default multistep solver, or `euler` for Lightning/Distill checkpoints.
-- `--num-inference-steps`: Number of denoising steps (default 50).
+- `--num-inference-steps`: Number of denoising steps (default is model-specific).
 - `--fps`: Frames per second for the saved MP4 (requires `diffusers` export_to_video).
+- `--frame-rate`: Generation frame rate for models that use it. Defaults to `--fps`.
 - `--output`: Path to save the generated video.
 - `--vae-use-slicing`: Enable VAE slicing for memory optimization.
 - `--vae-use-tiling`: Enable VAE tiling for memory optimization.
diff --git a/examples/offline_inference/image_to_video/image_to_video.py b/examples/offline_inference/image_to_video/image_to_video.py
index 84fbf2a94ca..b89409e50e4 100644
--- a/examples/offline_inference/image_to_video/image_to_video.py
+++ b/examples/offline_inference/image_to_video/image_to_video.py
@@ -2,13 +2,14 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 """
-Image-to-Video generation example using Wan2.2 I2V/TI2V models, LTX2, or HunyuanVideo-1.5.
+Image-to-Video generation example using Wan2.2 I2V/TI2V models, LTX2, HunyuanVideo-1.5, or Cosmos3.
 
 Supports:
 - Wan2.2-I2V-A14B-Diffusers: MoE model with CLIP image encoder
 - Wan2.2-TI2V-5B-Diffusers: Unified T2V+I2V model (dense 5B)
 - LTX2 image-to-video pipeline
 - HunyuanVideo-1.5 I2V: SigLIP + VAE dual image conditioning
+- Cosmos3: unified text-to-image, text-to-video, and image-to-video pipeline
 
 Usage:
     # Wan I2V-A14B (MoE)
@@ -30,6 +31,13 @@
     python image_to_video.py --model hunyuanvideo-community/HunyuanVideo-1.5-Diffusers-480p_i2v \
         --image input.jpg --prompt "A cat playing with yarn" \
         --flow-shift 5.0 --guidance-scale 6.0
+
+    # Cosmos3 image-to-video
+    python image_to_video.py --model "$COSMOS3_MODEL" \
+        --model-class-name Cosmos3OmniDiffusersPipeline \
+        --image input.jpg --prompt "A cinematic dolly shot of a boat" \
+        --height 720 --width 1280 --num-frames 81 \
+        --num-inference-steps 35 --guidance-scale 4.0 --fps 24
 """
 
 import argparse
@@ -60,7 +68,9 @@ def parse_profiler_config(value: str) -> dict[str, Any]:
 
 
 def parse_args() -> argparse.Namespace:
-    parser = argparse.ArgumentParser(description="Generate a video from an image (Wan2.2, LTX2, HunyuanVideo-1.5).")
+    parser = argparse.ArgumentParser(
+        description="Generate a video from an image (Wan2.2, LTX2, HunyuanVideo-1.5, Cosmos3)."
+    )
     parser.add_argument(
         "--model",
         default="Wan-AI/Wan2.2-I2V-A14B-Diffusers",
@@ -69,13 +79,13 @@ def parse_args() -> argparse.Namespace:
     parser.add_argument(
         "--model-class-name",
         default=None,
-        help="Override model class name (e.g., LTX2ImageToVideoPipeline).",
+        help="Override model class name (e.g., Cosmos3OmniDiffusersPipeline or LTX2ImageToVideoPipeline).",
     )
     parser.add_argument("--image", required=True, help="Path to input image.")
     parser.add_argument("--prompt", default="", help="Text prompt describing the desired motion.")
     parser.add_argument("--negative-prompt", default="", help="Negative prompt.")
     parser.add_argument("--seed", type=int, default=42, help="Random seed.")
-    parser.add_argument("--guidance-scale", type=float, default=5.0, help="CFG scale.")
+    parser.add_argument("--guidance-scale", type=float, default=None, help="CFG scale. Default: model-specific.")
     parser.add_argument(
         "--guidance-scale-high", type=float, default=None, help="Optional separate CFG for high-noise (MoE only)."
     )
@@ -83,8 +93,10 @@ def parse_args() -> argparse.Namespace:
         "--height", type=int, default=None, help="Video height (auto-calculated from image if not set)."
     )
     parser.add_argument("--width", type=int, default=None, help="Video width (auto-calculated from image if not set).")
-    parser.add_argument("--num-frames", type=int, default=81, help="Number of frames.")
-    parser.add_argument("--num-inference-steps", type=int, default=50, help="Sampling steps.")
+    parser.add_argument("--num-frames", type=int, default=None, help="Number of frames. Default: model-specific.")
+    parser.add_argument(
+        "--num-inference-steps", type=int, default=None, help="Sampling steps. Default: model-specific."
+    )
     parser.add_argument("--boundary-ratio", type=float, default=0.875, help="Boundary split ratio for MoE models.")
     parser.add_argument(
         "--frame-rate",
@@ -93,7 +105,10 @@ def parse_args() -> argparse.Namespace:
         help="Optional generation frame rate (used by models like LTX2). Defaults to --fps.",
     )
     parser.add_argument(
-        "--flow-shift", type=float, default=5.0, help="Scheduler flow_shift (5.0 for 720p, 12.0 for 480p)."
+        "--flow-shift",
+        type=float,
+        default=None,
+        help="Scheduler flow_shift. Default: model-specific.",
     )
     parser.add_argument(
         "--sample-solver",
@@ -253,31 +268,51 @@ def calculate_dimensions(
     return height, width
 
 
+def _is_cosmos3_model(model_name: str, model_class_name: str | None = None) -> bool:
+    combined = f"{model_name} {model_class_name or ''}".lower()
+    return "cosmos3" in combined or "cosmos-3" in combined
+
+
 def main():
     args = parse_args()
     generator = torch.Generator(device=current_omni_platform.device_type).manual_seed(args.seed)
     model_name = str(args.model).lower() if args.model is not None else ""
     model_class_name = args.model_class_name
     is_ltx2 = "ltx2" in model_name or (model_class_name and "ltx2" in model_class_name.lower())
+    is_cosmos3 = _is_cosmos3_model(model_name, model_class_name)
     if model_class_name is None and is_ltx2:
         model_class_name = "LTX2ImageToVideoPipeline"
+    elif model_class_name is None and is_cosmos3:
+        model_class_name = "Cosmos3OmniDiffusersPipeline"
 
     # Load input image
     image = PIL.Image.open(args.image).convert("RGB")
 
-    fps = args.fps if args.fps is not None else (24 if is_ltx2 else 16)
+    fps = args.fps if args.fps is not None else (24 if (is_ltx2 or is_cosmos3) else 16)
     frame_rate = args.frame_rate if args.frame_rate is not None else float(fps)
-    guidance_scale = args.guidance_scale if args.guidance_scale is not None else (4.0 if is_ltx2 else 5.0)
+    guidance_scale = (
+        args.guidance_scale if args.guidance_scale is not None else (4.0 if (is_ltx2 or is_cosmos3) else 5.0)
+    )
     num_frames = args.num_frames if args.num_frames is not None else (121 if is_ltx2 else 81)
-    num_inference_steps = args.num_inference_steps if args.num_inference_steps is not None else (40 if is_ltx2 else 50)
+    num_inference_steps = (
+        args.num_inference_steps
+        if args.num_inference_steps is not None
+        else (40 if is_ltx2 else (35 if is_cosmos3 else 50))
+    )
 
     # Calculate dimensions if not provided
     height = args.height
     width = args.width
     if height is None or width is None:
-        # Default to 480P area for Wan2.2 I2V, 512x768 area for LTX2
-        max_area = 512 * 768 if is_ltx2 else 480 * 832
-        mod_value = 32 if is_ltx2 else 16
+        if is_ltx2:
+            max_area = 512 * 768
+            mod_value = 32
+        elif is_cosmos3:
+            max_area = 720 * 1280
+            mod_value = 16
+        else:
+            max_area = 480 * 832
+            mod_value = 16
         calc_height, calc_width = calculate_dimensions(image, max_area=max_area, mod_value=mod_value)
         height = height or calc_height
         width = width or calc_width
@@ -358,8 +393,10 @@ def main():
     print(f"\n{'=' * 60}")
     print("Generation Configuration:")
     print(f"  Model: {args.model}")
-    print(f"  Inference steps: {args.num_inference_steps}")
-    print(f"  Frames: {args.num_frames}")
+    if model_class_name:
+        print(f"  Model class: {model_class_name}")
+    print(f"  Inference steps: {num_inference_steps}")
+    print(f"  Frames: {num_frames}")
     print(f"  Solver: {args.sample_solver}")
     print(f"  kv_cache_dtype(config): {args.kv_cache_dtype}")
     print(f"  kv_cache_skip_steps(config): {args.kv_cache_skip_steps}")
@@ -368,7 +405,7 @@ def main():
         f"  Parallel configuration: cfg_parallel_size={args.cfg_parallel_size},"
         f" tensor_parallel_size={args.tensor_parallel_size}, vae_patch_parallel_size={args.vae_patch_parallel_size}"
     )
-    print(f"  Video size: {args.width}x{args.height}")
+    print(f"  Video size: {width}x{height}")
     print(f"{'=' * 60}\n")
 
     generation_start = time.perf_counter()
@@ -377,6 +414,7 @@ def main():
         {
             "prompt": args.prompt,
             "negative_prompt": args.negative_prompt,
+            "modalities": ["video"],
             "multi_modal_data": {"image": image},
         },
         OmniDiffusionSamplingParams(
diff --git a/examples/offline_inference/text_to_image/README.md b/examples/offline_inference/text_to_image/README.md
index c71773972b3..149e4260904 100644
--- a/examples/offline_inference/text_to_image/README.md
+++ b/examples/offline_inference/text_to_image/README.md
@@ -34,6 +34,7 @@ This folder provides several entrypoints for experimenting with text-to-image di
 | `black-forest-labs/FLUX.2-klein-4B` | 1024 x 1024 | 72.7 | 14.9 |
 | `black-forest-labs/FLUX.2-klein-9B` | 1024 x 1024 | 37.1 | 32.3 |
 | `black-forest-labs/FLUX.2-dev` | 1024 x 1024 | 65.7 | >80 (CPU offload required) |
+| `$COSMOS3_MODEL` with `Cosmos3OmniDiffusersPipeline` | 1024 x 1024 | model/checkpoint dependent | local checkpoint |
 | `HunyuanImage-3.0` | 1024 x 1024 | 80.0 (TP≥3)  | 160 |
 
 !!! info
@@ -73,11 +74,13 @@ python text_to_image.py \
 
 | Argument | Type | Default | Description |
 | -------- | ---- | ------- | ----------- |
+| `--model` | str | `"Qwen/Qwen-Image"` | Diffusion model name or local path |
+| `--model-class-name` | str | `None` | Override pipeline class, for example `Cosmos3OmniDiffusersPipeline` |
 | `--prompt` | str | `"a cup of coffee on the table"` | Text description for image generation |
 | `--seed` | int | `142` | Integer seed for deterministic sampling |
 | `--negative-prompt` | str | `None` | Negative prompt for classifier-free conditional guidance |
 | `--cfg-scale` | float | `4.0` | True CFG scale (model-specific guidance strength) |
-| `--guidance-scale` | float | `1.0` | Classifier-free guidance scale |
+| `--guidance-scale` | float | `4.0` | Classifier-free guidance scale |
 | `--num-images-per-prompt` | int | `1` | Number of images per prompt (saved as `output`, `output_1`, ...) |
 | `--num-inference-steps` | int | `50` | Diffusion sampling steps (more steps = higher quality, slower) |
 | `--height` | int | `1024` | Output image height in pixels |
@@ -177,6 +180,28 @@ python examples/offline_inference/text_to_image/text_to_image.py \
   --output flux2-dev.png
 ```
 
+### Cosmos3
+
+Cosmos3 uses one pipeline for text-to-image, text-to-video, and image-to-video. Set `COSMOS3_MODEL` to a local Diffusers-format Cosmos3 checkpoint or model reference, and select the pipeline explicitly.
+
+```bash
+export COSMOS3_MODEL=/path/to/cosmos3-diffusers
+
+python text_to_image.py \
+  --model "$COSMOS3_MODEL" \
+  --model-class-name Cosmos3OmniDiffusersPipeline \
+  --prompt "A small warehouse robot carrying a blue box, clean product photography" \
+  --negative-prompt "blurry, distorted, low quality" \
+  --guidance-scale 7.0 \
+  --num-inference-steps 50 \
+  --height 1024 \
+  --width 1024 \
+  --num-images-per-prompt 1 \
+  --output cosmos3_t2i.png
+```
+
+This script marks text-to-image requests with `modalities=["image"]`, which selects Cosmos3 T2I. Cosmos3 currently supports one prompt per request; use `--num-images-per-prompt` to request multiple images for that prompt. Model-level CPU offload is not supported for Cosmos3, so use `--enable-layerwise-offload` for offload instead.
+
 ### Batch Requests (Multiple Prompts)
 
 You can pass multiple prompts in a single `generate` call.
diff --git a/examples/offline_inference/text_to_image/text_to_image.py b/examples/offline_inference/text_to_image/text_to_image.py
index c0fd337bd93..6978b8bc1a9 100644
--- a/examples/offline_inference/text_to_image/text_to_image.py
+++ b/examples/offline_inference/text_to_image/text_to_image.py
@@ -49,7 +49,7 @@ def parse_args() -> argparse.Namespace:
         "Qwen/Qwen-Image, Tongyi-MAI/Z-Image-Turbo, Qwen/Qwen-Image-2512, stepfun-ai/NextStep-1.1, "
         "black-forest-labs/FLUX.1-dev, black-forest-labs/FLUX.2-klein-9B, "
         "black-forest-labs/FLUX.2-dev, tencent/HunyuanImage-3.0-Instruct, "
-        "meituan-longcat/LongCat-Image, OvisAI/Ovis-Image, "
+        "meituan-longcat/LongCat-Image, OvisAI/Ovis-Image, Cosmos3, "
         "stabilityai/stable-diffusion-3.5-medium, Tongyi-MAI/Z-Image-Turbo and etc.",
     )
     parser.add_argument(
@@ -456,6 +456,7 @@ def main():
         {
             "prompt": args.prompt,
             "negative_prompt": args.negative_prompt,
+            "modalities": ["image"],
         },
         OmniDiffusionSamplingParams(
             height=args.height,
diff --git a/examples/offline_inference/text_to_video/text_to_video.md b/examples/offline_inference/text_to_video/text_to_video.md
index f852e980a78..69ef1dadfe7 100644
--- a/examples/offline_inference/text_to_video/text_to_video.md
+++ b/examples/offline_inference/text_to_video/text_to_video.md
@@ -9,6 +9,7 @@ A unified script for text-to-video generation. Supports multiple models with mod
 | `Wan-AI/Wan2.2-T2V-A14B-Diffusers` | 720x1280 | 81 | 40 | 4.0 | ~60 GiB |
 | `hunyuanvideo-community/HunyuanVideo-1.5-Diffusers-480p_t2v` | 480x832 | 121 | 50 | 6.0 | 1×A100 80GB |
 | `hunyuanvideo-community/HunyuanVideo-1.5-Diffusers-720p_t2v` | 720x1280 | 121 | 50 | 6.0 | FP8 + VAE tiling required |
+| `$COSMOS3_MODEL` with `Cosmos3OmniDiffusersPipeline` | 720x1280 | 81 | 35 | 4.0 | model/checkpoint dependent |
 
 ## Local CLI Usage
 
@@ -45,6 +46,28 @@ python text_to_video.py \
   --output ltx2_out.mp4
 ```
 
+### Cosmos3
+
+Cosmos3 uses one pipeline for text-to-image, text-to-video, and image-to-video. Set `COSMOS3_MODEL` to a local Diffusers-format Cosmos3 checkpoint or model reference, and select the pipeline explicitly.
+
+```bash
+export COSMOS3_MODEL=/path/to/cosmos3-diffusers
+
+python text_to_video.py \
+  --model "$COSMOS3_MODEL" --model-class-name Cosmos3OmniDiffusersPipeline \
+  --prompt "A small warehouse robot moves a blue box across a clean floor." \
+  --negative-prompt "blurry, distorted, low quality" \
+  --height 720 \
+  --width 1280 \
+  --num-frames 81 \
+  --guidance-scale 4.0 \
+  --num-inference-steps 35 \
+  --fps 24 \
+  --output cosmos3_t2v_output.mp4
+```
+
+Cosmos3 video generation currently supports one prompt and one video per request. The implementation supports `--cache-backend cache_dit`, `--cfg-parallel-size 2`, `--ulysses-degree`, `--tensor-parallel-size`, `--use-hsdp`, and `--enable-layerwise-offload`. Do not use `--enable-cpu-offload`; Cosmos3 does not support model-level CPU offload.
+
 ### HunyuanVideo-1.5 (480p)
 
 ```bash
@@ -122,6 +145,7 @@ python text_to_video.py \
 - `--audio-sample-rate`: audio sample rate for embedded audio (when the pipeline returns audio).
 - `--quantization`: quantization method (`fp8` for FP8, `gguf` for GGUF).
 - `--flow-shift`: scheduler flow_shift parameter.
+- `--cache-backend`: cache acceleration backend for supported models (only `cache_dit` is accepted; default: none).
 
 ### Wan2.2-specific
 
diff --git a/examples/offline_inference/text_to_video/text_to_video.py b/examples/offline_inference/text_to_video/text_to_video.py
index b19f0095e64..b704f1b87eb 100644
--- a/examples/offline_inference/text_to_video/text_to_video.py
+++ b/examples/offline_inference/text_to_video/text_to_video.py
@@ -35,10 +35,26 @@
         "fps": 24,
         "output": "hunyuan_video_15_output.mp4",
     },
+    "cosmos3": {
+        "height": 720,
+        "width": 1280,
+        "num_frames": 81,
+        "num_inference_steps": 35,
+        "guidance_scale": 4.0,
+        "fps": 24,
+        "output": "cosmos3_t2v_output.mp4",
+    },
 }
 
 
-def _detect_preset(model: str) -> dict:
+def _is_cosmos3_model(model: str, model_class_name: str | None = None) -> bool:
+    combined = f"{model} {model_class_name or ''}".lower()
+    return "cosmos3" in combined or "cosmos-3" in combined
+
+
+def _detect_preset(model: str, model_class_name: str | None = None) -> dict:
+    if _is_cosmos3_model(model, model_class_name):
+        return _MODEL_PRESETS["cosmos3"]
     model_lower = model.lower()
     if "hunyuan" in model_lower:
         return _MODEL_PRESETS["hunyuan"]
@@ -58,19 +74,19 @@ def parse_profiler_config(value: str) -> dict[str, Any]:
 def parse_args() -> argparse.Namespace:
     parser = argparse.ArgumentParser(
         description="Generate a video from a text prompt. "
-        "Supports Wan2.2, HunyuanVideo-1.5, and other text-to-video models."
+        "Supports Wan2.2, HunyuanVideo-1.5, Cosmos3, and other text-to-video models."
     )
     parser.add_argument(
         "--model",
         default="Wan-AI/Wan2.2-T2V-A14B-Diffusers",
         help="Diffusers model ID or local path. "
         "Examples: Wan-AI/Wan2.2-T2V-A14B-Diffusers, "
-        "hunyuanvideo-community/HunyuanVideo-1.5-480p_t2v",
+        "hunyuanvideo-community/HunyuanVideo-1.5-480p_t2v, $COSMOS3_MODEL",
     )
     parser.add_argument(
         "--model-class-name",
         default=None,
-        help="Override model class name (e.g., LTX2TwoStagesVideoPipeline).",
+        help="Override model class name (e.g., Cosmos3OmniDiffusersPipeline or LTX2TwoStagesVideoPipeline).",
     )
     parser.add_argument("--prompt", default="A serene lakeside sunrise with mist over the water.", help="Text prompt.")
     parser.add_argument("--negative-prompt", default="", help="Negative prompt.")
@@ -108,7 +124,7 @@ def parse_args() -> argparse.Namespace:
         type=str,
         default=None,
         choices=["cache_dit"],
-        help="Cache backend for acceleration (Wan2.2). Default: None.",
+        help="Cache backend for acceleration on supported models. Default: None.",
     )
     parser.add_argument(
         "--enable-cache-dit-summary",
@@ -312,7 +328,7 @@ def main():
     print(f"  Video size: {args.width}x{args.height}")
     print(f"{'=' * 60}\n")
 
-    prompt_dict = {"prompt": args.prompt}
+    prompt_dict = {"prompt": args.prompt, "modalities": ["video"]}
     if args.negative_prompt:
         prompt_dict["negative_prompt"] = args.negative_prompt
 
@@ -323,6 +339,7 @@ def main():
         guidance_scale=args.guidance_scale,
         num_inference_steps=args.num_inference_steps,
         num_frames=args.num_frames,
+        frame_rate=args.frame_rate if args.frame_rate is not None else float(args.fps),
     )
     if args.guidance_scale_high is not None:
         sampling_kwargs["guidance_scale_2"] = args.guidance_scale_high
diff --git a/examples/online_serving/image_to_video/README.md b/examples/online_serving/image_to_video/README.md
index 285eeb27983..6f82d3a2019 100644
--- a/examples/online_serving/image_to_video/README.md
+++ b/examples/online_serving/image_to_video/README.md
@@ -1,6 +1,14 @@
 # Image-To-Video
 
-This example demonstrates how to deploy the Wan2.2 image-to-video model for online video generation using vLLM-Omni.
+This example demonstrates how to deploy image-to-video models, including Wan2.2 and Cosmos3, for online video generation using vLLM-Omni.
+
+## Supported Models
+
+| Model | Model ID |
+|-------|----------|
+| Wan2.2 I2V | `Wan-AI/Wan2.2-I2V-A14B-Diffusers` |
+| Wan2.2 TI2V | `Wan-AI/Wan2.2-TI2V-5B-Diffusers` |
+| Cosmos3 I2V | `$COSMOS3_MODEL` with `Cosmos3OmniDiffusersPipeline` |
 
 ## Start Server
 
@@ -26,6 +34,22 @@ The script allows overriding:
 - `CACHE_BACKEND` (default: `none`)
 - `ENABLE_CACHE_DIT_SUMMARY` (default: `0`)
 
+### Cosmos3
+
+Cosmos3 uses one pipeline for text-to-image, text-to-video, and image-to-video. Set `COSMOS3_MODEL` to a local Diffusers-format Cosmos3 checkpoint or model reference, and select the pipeline explicitly.
+
+```bash
+export COSMOS3_MODEL=/path/to/cosmos3-diffusers
+
+vllm serve "$COSMOS3_MODEL" \
+  --omni \
+  --port 8091 \
+  --model-class-name Cosmos3OmniDiffusersPipeline \
+  --allowed-local-media-path /path/to/media
+```
+
+Use `--enable-layerwise-offload`, `--cache-backend cache_dit`, `--cfg-parallel-size 2`, `--usp`, `--tensor-parallel-size`, or `--use-hsdp` as needed. Do not use `--enable-cpu-offload`; Cosmos3 does not support model-level CPU offload.
+
 ### Ascend / Local LightX2V Example
 
 For a local Wan2.2-LightX2V Diffusers directory on Ascend/NPU, you can start the server like this:
@@ -91,6 +115,53 @@ curl -X POST http://localhost:8091/v1/videos/sync \
   -o sync_i2v_output.mp4
 ```
 
+### Cosmos3 Sync Request
+
+```bash
+curl -X POST http://localhost:8091/v1/videos/sync \
+  -F "prompt=Cherry blossoms swaying gently in the breeze, petals falling, smooth motion" \
+  -F "negative_prompt=blurry, distorted, low quality" \
+  -F "input_reference=@/path/to/cherry_blossom.jpg" \
+  -F "size=1280x720" \
+  -F "num_frames=81" \
+  -F "fps=24" \
+  -F "num_inference_steps=35" \
+  -F "guidance_scale=4.0" \
+  -F "seed=42" \
+  -o cosmos3_i2v_output.mp4
+```
+
+For async generation, send the same form fields to `POST /v1/videos`, poll `GET /v1/videos/{video_id}`, and download from `GET /v1/videos/{video_id}/content`. Cosmos3 currently supports one prompt and one video per request.
+
+```bash
+create_response=$(curl -s http://localhost:8091/v1/videos \
+  -H "Accept: application/json" \
+  -F "prompt=Cherry blossoms swaying gently in the breeze, petals falling, smooth motion" \
+  -F "negative_prompt=blurry, distorted, low quality" \
+  -F "input_reference=@/path/to/cherry_blossom.jpg" \
+  -F "size=1280x720" \
+  -F "num_frames=81" \
+  -F "fps=24" \
+  -F "num_inference_steps=35" \
+  -F "guidance_scale=4.0" \
+  -F "seed=42")
+
+video_id=$(echo "$create_response" | jq -r '.id')
+while true; do
+  status=$(curl -s "http://localhost:8091/v1/videos/${video_id}" | jq -r '.status')
+  if [ "$status" = "completed" ]; then
+    break
+  fi
+  if [ "$status" = "failed" ]; then
+    echo "Video generation failed"
+    exit 1
+  fi
+  sleep 2
+done
+
+curl -L "http://localhost:8091/v1/videos/${video_id}/content" -o cosmos3_i2v_output.mp4
+```
+
 For Wan Lightning/Distill checkpoints, pass `{"sample_solver":"euler"}` via `extra_params`. The default solver is `unipc`.
 
 Example matching the local LightX2V deployment above:
diff --git a/examples/online_serving/text_to_image/README.md b/examples/online_serving/text_to_image/README.md
index 17d377ea3e2..41062f718b2 100644
--- a/examples/online_serving/text_to_image/README.md
+++ b/examples/online_serving/text_to_image/README.md
@@ -20,6 +20,21 @@ Or use the startup script:
 bash run_server.sh
 ```
 
+### Cosmos3
+
+Cosmos3 uses one pipeline for text-to-image, text-to-video, and image-to-video. Set `COSMOS3_MODEL` to a local Diffusers-format Cosmos3 checkpoint or model reference, and select the pipeline explicitly with `--model-class-name Cosmos3OmniDiffusersPipeline`.
+
+```bash
+export COSMOS3_MODEL=/path/to/cosmos3-diffusers
+
+vllm serve "$COSMOS3_MODEL" \
+  --omni \
+  --port 8091 \
+  --model-class-name Cosmos3OmniDiffusersPipeline
+```
+
+Use `--enable-layerwise-offload`, `--cache-backend cache_dit`, `--cfg-parallel-size 2`, `--usp`, `--tensor-parallel-size`, or `--use-hsdp` as needed. Do not use `--enable-cpu-offload`; Cosmos3 does not support model-level CPU offload.
+
 ### Start with Parallelism Acceleration
 
 Enable Tensor Parallelism and VAE Patch Parallelism for faster inference:
@@ -68,6 +83,26 @@ curl -s http://localhost:8091/v1/chat/completions \
   }' | jq -r '.choices[0].message.content[0].image_url.url' | cut -d',' -f2- | base64 -d > output.png
 ```
 
+#### Cosmos3 Images API
+
+The dedicated image endpoint sets `modalities=["image"]` internally, which selects Cosmos3 text-to-image.
+
+```bash
+curl -X POST http://localhost:8091/v1/images/generations \
+  -H "Content-Type: application/json" \
+  -d '{
+    "prompt": "A small warehouse robot carrying a blue box, clean product photography",
+    "size": "1024x1024",
+    "n": 1,
+    "num_inference_steps": 50,
+    "guidance_scale": 7.0,
+    "negative_prompt": "blurry, distorted, low quality",
+    "seed": 42
+  }' | jq -r '.data[0].b64_json' | base64 -d > cosmos3_t2i.png
+```
+
+Cosmos3 currently supports one prompt per request. Use `n` to request multiple images for that prompt.
+
 ### Method 2: Using OpenAI Python SDK
 
 ```python
@@ -226,6 +261,7 @@ count, use `size` and `n` rather than `height`, `width`, or
 | `height`                 | int   | None    | Image height in pixels         |
 | `width`                  | int   | None    | Image width in pixels          |
 | `size`                   | str   | None    | Image size (e.g., "1024x1024") |
+| `n`                      | int   | 1       | Number of images for `/v1/images/generations` |
 | `num_inference_steps`    | int   | 50      | Number of denoising steps      |
 | `true_cfg_scale`         | float | 4.0     | Qwen-Image CFG scale           |
 | `seed`                   | int   | None    | Random seed (reproducible)     |
diff --git a/examples/online_serving/text_to_video/README.md b/examples/online_serving/text_to_video/README.md
index c01e0602ff9..57922abd38a 100644
--- a/examples/online_serving/text_to_video/README.md
+++ b/examples/online_serving/text_to_video/README.md
@@ -10,6 +10,7 @@ This example demonstrates how to deploy text-to-video models for online video ge
 | Wan2.1 T2V (14B) | `Wan-AI/Wan2.1-T2V-14B-Diffusers` |
 | Wan2.2 T2V | `Wan-AI/Wan2.2-T2V-A14B-Diffusers` |
 | LTX-2 | `Lightricks/LTX-2` |
+| Cosmos3 T2V | `$COSMOS3_MODEL` with `Cosmos3OmniDiffusersPipeline` |
 
 ## Wan2.2 T2V
 
@@ -37,6 +38,23 @@ The script allows overriding:
 - `CACHE_BACKEND` (default: `none`)
 - `ENABLE_CACHE_DIT_SUMMARY` (default: `0`)
 
+## Cosmos3 T2V
+
+Cosmos3 uses one pipeline for text-to-image, text-to-video, and image-to-video. Set `COSMOS3_MODEL` to a local Diffusers-format Cosmos3 checkpoint or model reference, and select the pipeline explicitly with `--model-class-name Cosmos3OmniDiffusersPipeline`.
+
+### Start Server
+
+```bash
+export COSMOS3_MODEL=/path/to/cosmos3-diffusers
+
+vllm serve "$COSMOS3_MODEL" \
+  --omni \
+  --port 8091 \
+  --model-class-name Cosmos3OmniDiffusersPipeline
+```
+
+Use `--enable-layerwise-offload`, `--cache-backend cache_dit`, `--cfg-parallel-size 2`, `--usp`, `--tensor-parallel-size`, or `--use-hsdp` as needed. Do not use `--enable-cpu-offload`; Cosmos3 does not support model-level CPU offload.
+
 ## Async Job Behavior
 
 `POST /v1/videos` is asynchronous. It creates a video job and immediately
@@ -82,6 +100,51 @@ curl -X POST http://localhost:8091/v1/videos/sync \
   -o sync_t2v_output.mp4
 ```
 
+### Cosmos3 Sync Request
+
+```bash
+curl -X POST http://localhost:8091/v1/videos/sync \
+  -F "prompt=A small warehouse robot moves a blue box across a clean floor." \
+  -F "negative_prompt=blurry, distorted, low quality" \
+  -F "size=1280x720" \
+  -F "num_frames=81" \
+  -F "fps=24" \
+  -F "num_inference_steps=35" \
+  -F "guidance_scale=4.0" \
+  -F "seed=42" \
+  -o cosmos3_t2v_output.mp4
+```
+
+For async generation, send the same form fields to `POST /v1/videos`, poll `GET /v1/videos/{video_id}`, and download from `GET /v1/videos/{video_id}/content`. Cosmos3 currently supports one prompt and one video per request.
+
+```bash
+create_response=$(curl -s http://localhost:8091/v1/videos \
+  -H "Accept: application/json" \
+  -F "prompt=A small warehouse robot moves a blue box across a clean floor." \
+  -F "negative_prompt=blurry, distorted, low quality" \
+  -F "size=1280x720" \
+  -F "num_frames=81" \
+  -F "fps=24" \
+  -F "num_inference_steps=35" \
+  -F "guidance_scale=4.0" \
+  -F "seed=42")
+
+video_id=$(echo "$create_response" | jq -r '.id')
+while true; do
+  status=$(curl -s "http://localhost:8091/v1/videos/${video_id}" | jq -r '.status')
+  if [ "$status" = "completed" ]; then
+    break
+  fi
+  if [ "$status" = "failed" ]; then
+    echo "Video generation failed"
+    exit 1
+  fi
+  sleep 2
+done
+
+curl -L "http://localhost:8091/v1/videos/${video_id}/content" -o cosmos3_t2v_output.mp4
+```
+
 ## Storage
 
 Generated video files are stored on local disk by the async video API.
@@ -181,7 +244,7 @@ curl -X POST http://localhost:8091/v1/videos \
 | `guidance_scale`      | float  | None    | CFG guidance scale (low-noise stage)             |
 | `guidance_scale_2`    | float  | None    | CFG guidance scale (high-noise stage, Wan2.2)     |
 | `boundary_ratio`      | float  | None    | Boundary split ratio for low/high DiT (Wan2.2)   |
-| `flow_shift`          | float  | None    | Scheduler flow shift (Wan2.2)                    |
+| `flow_shift`          | float  | None    | Scheduler flow shift                             |
 | `seed`                | int    | None    | Random seed (reproducible)                       |
 | `lora`                | object | None    | LoRA configuration                               |
 
diff --git a/tests/diffusion/cache/test_cache_dit.py b/tests/diffusion/cache/test_cache_dit.py
index 0b7ef723585..8499aa39e8c 100644
--- a/tests/diffusion/cache/test_cache_dit.py
+++ b/tests/diffusion/cache/test_cache_dit.py
@@ -18,6 +18,7 @@
     cd_backend.enable_cache_for_ltx2,
     cd_backend.enable_cache_for_wan22,
     cd_backend.enable_cache_for_longcat_image,
+    cd_backend.enable_cache_for_cosmos3,
 ]
 
 SAMPLE_CACHE_CONFIG = DiffusionCacheConfig()
@@ -38,3 +39,21 @@ def test_separate_cfg(mock_cache_dit, mock_block_adapter, enabler):
     mock_cache_dit.enable_cache.assert_called_once()
     adapter_kwargs = mock_block_adapter.call_args.kwargs
     assert adapter_kwargs["has_separate_cfg"] is True
+
+
+@patch("vllm_omni.diffusion.cache.cache_dit_backend.BlockAdapter")
+@patch("vllm_omni.diffusion.cache.cache_dit_backend.cache_dit")  # innermost patch -> first mock argument
+def test_cosmos3_cache_dit_wraps_gen_layers(mock_cache_dit, mock_block_adapter):
+    """Cosmos3 should cache only the repeated GEN pathway blocks."""
+    mock_pipeline = Mock()
+    gen_layers = object()  # opaque sentinel; only compared against the adapter kwargs below
+    mock_pipeline.transformer.gen_layers = gen_layers
+
+    cd_backend.enable_cache_for_cosmos3(mock_pipeline, SAMPLE_CACHE_CONFIG)
+
+    mock_cache_dit.enable_cache.assert_called_once()
+    adapter_kwargs = mock_block_adapter.call_args.kwargs  # keyword args of the last BlockAdapter(...) call
+    assert adapter_kwargs["transformer"] is mock_pipeline.transformer
+    assert adapter_kwargs["blocks"] == [gen_layers]  # exactly one block list: transformer.gen_layers
+    assert adapter_kwargs["has_separate_cfg"] is True
+    assert adapter_kwargs["check_forward_pattern"] is False
diff --git a/tests/diffusion/models/cosmos3/__init__.py b/tests/diffusion/models/cosmos3/__init__.py
new file mode 100644
index 00000000000..208f01a7cb5
--- /dev/null
+++ b/tests/diffusion/models/cosmos3/__init__.py
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
diff --git a/tests/diffusion/models/cosmos3/conftest.py b/tests/diffusion/models/cosmos3/conftest.py
new file mode 100644
index 00000000000..58d4af9bf85
--- /dev/null
+++ b/tests/diffusion/models/cosmos3/conftest.py
@@ -0,0 +1,191 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from __future__ import annotations
+
+import sys
+import types
+from types import SimpleNamespace
+from typing import Any
+
+import pytest
+import torch
+from torch import nn
+
+
+class StubScheduler:  # scheduler test double that records set_timesteps()/step() calls
+    def __init__(self, timesteps: list[int] | None = None, *, flow_shift: float = 1.0) -> None:
+        self.timesteps = torch.tensor(timesteps or [9, 3], dtype=torch.int64)  # [9, 3] if None/empty
+        self.config = SimpleNamespace(num_train_timesteps=1000, flow_shift=flow_shift)
+        self.set_timesteps_calls: list[tuple[int, torch.device]] = []
+        self.step_calls: list[tuple[torch.Tensor, torch.Tensor, torch.Tensor]] = []
+
+    def set_timesteps(self, num_steps: int, device: torch.device) -> None:
+        self.set_timesteps_calls.append((num_steps, device))
+        self.timesteps = torch.arange(num_steps, 0, -1, dtype=torch.int64, device=device)  # num_steps..1
+
+    def step(self, noise_pred: torch.Tensor, timestep: torch.Tensor, latents: torch.Tensor, **kwargs):
+        del kwargs
+        self.step_calls.append((noise_pred.clone(), timestep.clone(), latents.clone()))  # snapshot inputs
+        return (latents + noise_pred,)  # 1-tuple whose only item is latents + noise_pred
+
+
+class _ModeLatentDist:  # minimal stand-in for a latent distribution exposing only mode()
+    def __init__(self, latents: torch.Tensor) -> None:
+        self._latents = latents
+
+    def mode(self) -> torch.Tensor:
+        return self._latents  # returns the stored tensor unchanged
+
+
+class StubCosmos3VAE:  # VAE test double: encode() yields all-ones latents, decode() echoes its input
+    dtype = torch.float32
+
+    def __init__(self, z_dim: int = 2, *, temporal: int = 4, spatial: int = 8) -> None:
+        self.config = SimpleNamespace(
+            z_dim=z_dim,
+            scale_factor_temporal=temporal,
+            scale_factor_spatial=spatial,
+            latents_mean=[0.0] * z_dim,  # one entry per latent channel
+            latents_std=[1.0] * z_dim,
+        )
+
+    def encode(self, video: torch.Tensor):
+        latent_frames = (video.shape[2] - 1) // self.config.scale_factor_temporal + 1  # dim 2 = frame axis
+        latent_height = video.shape[-2] // self.config.scale_factor_spatial
+        latent_width = video.shape[-1] // self.config.scale_factor_spatial
+        latents = torch.ones(
+            video.shape[0],
+            self.config.z_dim,
+            latent_frames,
+            latent_height,
+            latent_width,
+            dtype=video.dtype,
+            device=video.device,
+        )
+        return SimpleNamespace(latent_dist=_ModeLatentDist(latents))  # read via .latent_dist.mode()
+
+    def decode(self, latents: torch.Tensor, return_dict: bool = False):
+        del return_dict  # accepted but ignored
+        return (latents,)  # identity decode wrapped in a 1-tuple
+
+
+class StubCosmos3Transformer(nn.Module):  # records calls; outputs are token-derived constants
+    def __init__(
+        self,
+        *,
+        latent_channel_size: int = 2,
+        sound_gen: bool = False,
+        sound_dim: int = 3,
+        action_gen: bool = False,
+        action_dim: int = 4,
+    ) -> None:
+        super().__init__()
+        self.latent_channel_size = latent_channel_size
+        self.sound_gen = sound_gen
+        self.sound_dim = sound_dim
+        self.action_gen = action_gen
+        self.action_dim = action_dim
+        self.cached_kv: Any | None = None
+        self.cached_freqs_gen: Any | None = None
+        self.calls: list[dict[str, Any]] = []  # one dict appended per forward() call
+        self.reset_calls = 0  # number of reset_cache() invocations
+
+    def reset_cache(self) -> None:
+        self.reset_calls += 1
+        self.cached_kv = None
+        self.cached_freqs_gen = None
+
+    def forward(
+        self,
+        *,
+        hidden_states: torch.Tensor,
+        timestep: torch.Tensor,
+        text_ids: torch.Tensor,
+        text_mask: torch.Tensor,
+        **kwargs: Any,
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
+        token = int(text_ids.reshape(-1)[0].item()) if text_ids.numel() else 0  # first text id, 0 if empty
+        sound_latents = kwargs.get("sound_latents")
+        self.calls.append(
+            {
+                "token": token,
+                "timestep": timestep.clone(),
+                "text_mask": text_mask.clone(),
+                "cache_before": self.cached_kv,  # cache state as seen on entry to this call
+                "kwargs": dict(kwargs),
+            }
+        )
+        if self.cached_kv is None:  # empty cache: populate fake entries tagged with this call's token
+            marker = torch.tensor([token], dtype=torch.float32)
+            self.cached_kv = [(marker, marker + 100)]
+            self.cached_freqs_gen = (marker + 200, marker + 300)
+        action_latents = kwargs.get("action_latents")
+        outputs: list[torch.Tensor] = [torch.full_like(hidden_states, float(token))]
+        if action_latents is not None:
+            outputs.append(torch.full_like(action_latents, float(token + 20)))  # action output = token + 20
+        if sound_latents is not None:
+            outputs.append(torch.full_like(sound_latents, float(token + 10)))  # sound output = token + 10
+        return outputs[0] if len(outputs) == 1 else tuple(outputs)  # order: video[, action][, sound]
+
+
+def passthrough_progress_bar(iterable):  # no-op progress bar: hands back the iterable as-is
+    return iterable
+
+
+@pytest.fixture(autouse=True)  # autouse: no test has to request this fixture explicitly
+def fake_cosmos3_guardrails(monkeypatch: pytest.MonkeyPatch):
+    module = types.ModuleType("vllm_omni.diffusion.models.cosmos3.guardrails")
+    module.is_guardrails_enabled = lambda od_config: False  # guardrails always reported as disabled
+    module.ensure_initialized = lambda od_config: None
+    module.check_text_safety = lambda text: None
+    module.check_video_safety = lambda video: video  # video passes through unchanged
+    monkeypatch.setitem(sys.modules, module.__name__, module)  # monkeypatch restores sys.modules at teardown
+    return module
+
+
+@pytest.fixture
+def make_cosmos3_pipeline():  # factory fixture: call the returned function to build a stubbed pipeline
+    def _make():
+        from vllm_omni.diffusion.models.cosmos3.pipeline_cosmos3 import (
+            Cosmos3OmniDiffusersPipeline,
+        )
+
+        pipeline = object.__new__(Cosmos3OmniDiffusersPipeline)  # bypasses the pipeline's own __init__
+        nn.Module.__init__(pipeline)  # still initialise nn.Module state before assigning submodules
+        pipeline.od_config = SimpleNamespace()
+        pipeline.device = torch.device("cpu")
+        pipeline.dtype = torch.float32
+        pipeline.transformer = StubCosmos3Transformer(latent_channel_size=2)
+        pipeline.vae = StubCosmos3VAE(z_dim=2)
+        pipeline.vae_scale_factor_temporal = 4  # same values as the StubCosmos3VAE defaults
+        pipeline.vae_scale_factor_spatial = 8
+        pipeline.scheduler = StubScheduler([9, 3], flow_shift=1.0)
+        pipeline._base_scheduler_config = pipeline.scheduler.config
+        pipeline._engine_init_flow_shift = 1.0
+        pipeline._current_flow_shift = 1.0
+        pipeline._guidance_scale = None
+        pipeline._num_timesteps = None
+        pipeline.progress_bar = passthrough_progress_bar
+        pipeline._sound_tokenizer = None
+        return pipeline
+
+    return _make  # each _make() call returns a fresh pipeline instance
+
+
+def make_sampling_params(**overrides: Any) -> SimpleNamespace:  # plain helper (not a pytest fixture)
+    values = {
+        "height": None,
+        "width": None,
+        "num_frames": None,
+        "num_inference_steps": None,
+        "guidance_scale": None,
+        "seed": 123,
+        "num_outputs_per_prompt": 1,
+        "frame_rate": None,
+        "resolved_frame_rate": None,
+        "max_sequence_length": None,
+        "extra_args": {},  # new dict on every call, so callers never share it
+    }
+    values.update(overrides)  # overrides replace defaults and may add extra attributes
+    return SimpleNamespace(**values)
diff --git a/tests/diffusion/models/cosmos3/test_cosmos3_pipeline.py b/tests/diffusion/models/cosmos3/test_cosmos3_pipeline.py
new file mode 100644
index 00000000000..b068ea7e74a
--- /dev/null
+++ b/tests/diffusion/models/cosmos3/test_cosmos3_pipeline.py
@@ -0,0 +1,1108 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from __future__ import annotations
+
+import logging
+from types import SimpleNamespace
+
+import pytest
+import torch
+from PIL import Image
+from torch import nn
+
+from tests.diffusion.models.cosmos3.conftest import (
+    StubScheduler,
+    make_sampling_params,
+)
+
+pytestmark = [pytest.mark.core_model, pytest.mark.cpu, pytest.mark.diffusion]
+
+
+def _ids(value: int) -> torch.Tensor:  # 1x1 int64 tensor holding `value`
+    return torch.tensor([[value]], dtype=torch.long)
+
+
+def _mask() -> torch.Tensor:  # 1x1 int64 tensor of ones
+    return torch.ones(1, 1, dtype=torch.long)
+
+
+class TestRegistryIntegration:  # checks that Cosmos3 is wired into the registries and package exports
+    def test_pipeline_registered_and_exported(self) -> None:
+        from vllm_omni.diffusion.cache.cache_dit_backend import CUSTOM_DIT_ENABLERS
+        from vllm_omni.diffusion.models import cosmos3
+        from vllm_omni.diffusion.models.cosmos3.pipeline_cosmos3 import (
+            Cosmos3OmniDiffusersPipeline,
+        )
+        from vllm_omni.diffusion.models.progress_bar import ProgressBarMixin
+        from vllm_omni.diffusion.registry import (
+            _DIFFUSION_MODELS,
+            _DIFFUSION_POST_PROCESS_FUNCS,
+            _DIFFUSION_PRE_PROCESS_FUNCS,
+        )
+
+        assert issubclass(Cosmos3OmniDiffusersPipeline, nn.Module)
+        assert issubclass(Cosmos3OmniDiffusersPipeline, ProgressBarMixin)
+        assert Cosmos3OmniDiffusersPipeline.support_image_input is True
+        assert _DIFFUSION_MODELS["Cosmos3OmniDiffusersPipeline"] == (  # NOTE(review): looks like (pkg, module, class)
+            "cosmos3",
+            "pipeline_cosmos3",
+            "Cosmos3OmniDiffusersPipeline",
+        )
+        assert _DIFFUSION_PRE_PROCESS_FUNCS["Cosmos3OmniDiffusersPipeline"] == "get_cosmos3_pre_process_func"
+        assert _DIFFUSION_POST_PROCESS_FUNCS["Cosmos3OmniDiffusersPipeline"] == "get_cosmos3_post_process_func"
+        assert "Cosmos3OmniDiffusersPipeline" in CUSTOM_DIT_ENABLERS  # cache-dit enabler is registered too
+        assert hasattr(cosmos3, "Cosmos3OmniDiffusersPipeline")
+        assert "Cosmos3OmniDiffusersPipeline" in cosmos3.__all__
+
+
+class TestPreAndPostProcess:  # exercises the Cosmos3 pre-/post-process function factories
+    def test_preprocess_leaves_t2v_string_prompt_unchanged(self) -> None:
+        from vllm_omni.diffusion.models.cosmos3.pipeline_cosmos3 import get_cosmos3_pre_process_func
+
+        request = SimpleNamespace(
+            prompts=["A robot walks through a warehouse."],
+            sampling_params=SimpleNamespace(height=None, width=None),
+        )
+
+        result = get_cosmos3_pre_process_func(SimpleNamespace())(request)
+
+        assert result is request  # the very same request object comes back
+        assert result.prompts == ["A robot walks through a warehouse."]
+        assert result.sampling_params.height is None  # no size is filled in for a text-only prompt
+        assert result.sampling_params.width is None
+
+    def test_preprocess_resizes_i2v_image_to_720p_aspect_and_stores_tensor(self) -> None:
+        from vllm_omni.diffusion.models.cosmos3.pipeline_cosmos3 import get_cosmos3_pre_process_func
+
+        request = SimpleNamespace(
+            prompts=[
+                {
+                    "prompt": "A slow camera push.",
+                    "multi_modal_data": {"image": Image.new("RGB", (320, 160), "red")},  # 2:1 input image
+                }
+            ],
+            sampling_params=SimpleNamespace(height=None, width=None),
+        )
+
+        result = get_cosmos3_pre_process_func(SimpleNamespace())(request)
+        prompt = result.prompts[0]
+
+        assert result.sampling_params.height == 672  # size derived from the image when none was requested
+        assert result.sampling_params.width == 1344
+        preprocessed = prompt["additional_information"]["preprocessed_image"]
+        assert isinstance(preprocessed, torch.Tensor)
+        assert tuple(preprocessed.shape[-2:]) == (672, 1344)  # trailing dims are (height, width)
+
+    def test_preprocess_preserves_explicit_size_for_i2v(self) -> None:
+        from vllm_omni.diffusion.models.cosmos3.pipeline_cosmos3 import get_cosmos3_pre_process_func
+
+        request = SimpleNamespace(
+            prompts=[
+                {
+                    "prompt": "A slow camera push.",
+                    "multi_modal_data": {"image": Image.new("RGB", (320, 160), "red")},
+                }
+            ],
+            sampling_params=SimpleNamespace(height=64, width=96),  # explicit size must win over the image size
+        )
+
+        result = get_cosmos3_pre_process_func(SimpleNamespace())(request)
+
+        assert tuple(result.prompts[0]["additional_information"]["preprocessed_image"].shape[-2:]) == (64, 96)
+
+    def test_postprocess_latent_passthrough_and_t2i_shape_validation(self) -> None:
+        from vllm_omni.diffusion.models.cosmos3.pipeline_cosmos3 import get_cosmos3_post_process_func
+
+        func = get_cosmos3_post_process_func(SimpleNamespace())
+        video = torch.zeros(1, 3, 1, 4, 4)  # NOTE(review): presumably (B, C, T, H, W) with T == 1 — confirm
+
+        assert func(video, output_type="latent") is video  # latent output is returned untouched
+
+        images = func({"image": video})
+        assert len(images) == 1
+        assert images[0].size == (4, 4)
+
+        video_result = func({"video": video})
+        assert "video" in video_result
+
+        sound_result = func(
+            {
+                "video": video,
+                "audio": torch.ones(1, 2, 16),
+                "audio_sample_rate": 48000,
+            },
+            sampling_params=SimpleNamespace(extra_args={"resolved_frame_rate": 12}),
+        )
+        assert "video" in sound_result
+        assert sound_result["audio"].shape == (1, 2, 16)  # audio keeps its input shape
+        assert sound_result["audio_sample_rate"] == 48000
+        assert sound_result["fps"] == 12  # matches extra_args["resolved_frame_rate"] above
+
+        with pytest.raises(ValueError, match="text-to-image postprocess expects"):
+            func({"image": torch.zeros(1, 3, 2, 4, 4)})  # size 2 on dim 2 is rejected for image output
+
+        with pytest.raises(ValueError, match="both image and video"):
+            func({"image": video, "video": video})
+
+        with pytest.raises(ValueError, match="does not support audio output"):
+            func({"image": video, "audio": torch.ones(1, 2, 16)})
+
+
+class TestPipelineHelpers:
+    def test_get_sp_param_prefers_extra_args_then_direct_attribute(self) -> None:
+        from vllm_omni.diffusion.models.cosmos3.pipeline_cosmos3 import (
+            Cosmos3OmniDiffusersPipeline,
+        )
+
+        sp = SimpleNamespace(extra_args={"flow_shift": 3.0}, flow_shift=2.0)  # both set: extra_args wins
+        assert Cosmos3OmniDiffusersPipeline._get_sp_param(sp, "flow_shift", 1.0) == 3.0
+
+        sp = SimpleNamespace(extra_args={}, flow_shift=2.0)  # only the attribute is set
+        assert Cosmos3OmniDiffusersPipeline._get_sp_param(sp, "flow_shift", 1.0) == 2.0
+
+        sp = SimpleNamespace(extra_args={})  # neither is set: the supplied default is returned
+        assert Cosmos3OmniDiffusersPipeline._get_sp_param(sp, "flow_shift", 1.0) == 1.0
+
+    def test_apply_metadata_templates_adds_duration_and_resolution(self) -> None:
+        from vllm_omni.diffusion.models.cosmos3.pipeline_cosmos3 import (
+            Cosmos3OmniDiffusersPipeline,
+        )
+
+        prompt = Cosmos3OmniDiffusersPipeline._apply_metadata_templates(
+            "A city street.",
+            num_frames=48,  # 48 frames at 24 FPS -> "2.0 seconds" in the expected text
+            frame_rate=24,
+            height=720,
+            width=1280,
+        )
+
+        assert prompt == (  # resolution is rendered as {height}x{width}
+            "A city street. The video is 2.0 seconds long and is of 24 FPS. This video is of 720x1280 resolution."
+        )
+
+    @pytest.mark.parametrize(
+        "tokenized",
+        [
+            [1, 2],
+            (1, 2),
+            {"input_ids": [[1, 2]]},  # mapping whose input_ids carries a batch dimension
+            torch.tensor([1, 2]),
+        ],
+    )
+    def test_normalize_token_ids_accepts_common_tokenizer_outputs(self, tokenized) -> None:
+        from vllm_omni.diffusion.models.cosmos3.pipeline_cosmos3 import (
+            Cosmos3OmniDiffusersPipeline,
+        )
+
+        assert Cosmos3OmniDiffusersPipeline._normalize_token_ids(tokenized) == [1, 2]  # always a flat int list
+
+    def test_normalize_token_ids_rejects_unknown_or_non_integer_values(self) -> None:
+        from vllm_omni.diffusion.models.cosmos3.pipeline_cosmos3 import (
+            Cosmos3OmniDiffusersPipeline,
+        )
+
+        with pytest.raises(TypeError, match="must return token IDs"):
+            Cosmos3OmniDiffusersPipeline._normalize_token_ids(object())  # unsupported container type
+
+        with pytest.raises(TypeError, match="non-integer token"):
+            Cosmos3OmniDiffusersPipeline._normalize_token_ids([object()])  # list, but element is not an int
+
+    def test_tokenize_prompt_adds_generation_tokens_and_padding(self, make_cosmos3_pipeline) -> None:
+        pipeline = make_cosmos3_pipeline()
+
+        class FakeTokenizer:  # records the conversation it receives and returns fixed ids
+            eos_token_id = 99
+            pad_token_id = 0
+
+            def __init__(self) -> None:
+                self.conversations = None
+
+            def apply_chat_template(self, conversations, tokenize: bool, add_generation_prompt: bool):
+                self.conversations = conversations
+                assert tokenize is True
+                assert add_generation_prompt is True
+                return [10, 11]
+
+            def convert_tokens_to_ids(self, token: str) -> int:
+                assert token == "<|vision_start|>"  # the only token the pipeline may look up here
+                return 88
+
+        tokenizer = FakeTokenizer()
+        pipeline.tokenizer = tokenizer
+
+        input_ids, attention_mask = pipeline._tokenize_prompt(
+            "hello",
+            max_sequence_length=6,
+            use_system_prompt=True,
+            system_prompt="system",
+        )
+
+        assert input_ids.tolist() == [[10, 11, 99, 88, 0, 0]]  # template ids, eos, vision_start, pad to 6
+        assert attention_mask.tolist() == [[1, 1, 1, 1, 0, 0]]  # padding positions are masked out
+        assert tokenizer.conversations == [
+            {"role": "system", "content": "system"},
+            {"role": "user", "content": "hello"},
+        ]
+
+    def test_format_and_tokenize_uses_video_and_image_metadata_modes(self, make_cosmos3_pipeline) -> None:
+        pipeline = make_cosmos3_pipeline()
+        captured: list[tuple[str, bool, str | None]] = []  # (text, use_system_prompt, system_prompt) per call
+
+        def fake_tokenize(text, max_sequence_length, use_system_prompt=False, system_prompt=None):
+            del max_sequence_length
+            captured.append((text, use_system_prompt, system_prompt))
+            return _ids(len(captured)), _mask()
+
+        pipeline._tokenize_prompt = fake_tokenize  # type: ignore[method-assign]
+
+        pipeline._format_and_tokenize_prompts(
+            "A robot",
+            "bad",
+            num_frames=48,
+            frame_rate=24,
+            height=720,
+            width=1280,
+            max_sequence_length=32,
+            sp=SimpleNamespace(extra_args={"negative_metadata_mode": "inverse"}),
+            use_system_prompt=True,
+            is_t2i=False,
+        )
+        assert "The video is 2.0 seconds long" in captured[0][0]  # captured[0]: positive prompt text
+        assert "This video is of 720x1280 resolution" in captured[0][0]
+        assert "The video is not 2.0 seconds long" in captured[1][0]  # captured[1]: negative prompt text
+        assert captured[0][1] is True
+
+        captured.clear()  # second scenario: image mode without negative_metadata_mode
+        pipeline._format_and_tokenize_prompts(
+            "A robot",
+            "bad",
+            num_frames=1,
+            frame_rate=24,
+            height=1024,
+            width=1024,
+            max_sequence_length=32,
+            sp=SimpleNamespace(extra_args={}),
+            use_system_prompt=False,
+            is_t2i=True,
+        )
+        assert "This image is of 1024x1024 resolution" in captured[0][0]
+        assert "seconds long" not in captured[0][0]  # no duration sentence in image mode
+        assert captured[1][0] == "bad"  # negative prompt is passed through verbatim here
+
+    @pytest.mark.parametrize(
+        ("key", "expected"),
+        [
+            ("transformer.vae2llm.weight", "transformer.vae2llm.weight"),  # already-prefixed key is unchanged
+            ("model.embed_tokens.weight", "transformer.language_model.embed_tokens.weight"),
+            ("model.norm.weight", "transformer.language_model.norm.weight"),
+            ("model.norm_moe_gen.weight", "transformer.norm_moe_gen.weight"),
+            (
+                "model.layers.3.self_attn.q_proj.weight",
+                "transformer.language_model.layers.3.self_attn.q_proj.weight",
+            ),
+            (
+                "model.layers.3.self_attn.q_proj_moe_gen.weight",
+                "transformer.gen_layers.3.cross_attention.q_proj.weight",
+            ),
+            (
+                "model.layers.3.mlp_moe_gen.down_proj.weight",
+                "transformer.gen_layers.3.mlp.down_proj.weight",
+            ),
+            ("sound2llm.weight", "transformer.sound2llm.weight"),
+            ("llm2sound.bias", "transformer.llm2sound.bias"),
+            ("sound_modality_embed", "transformer.sound_modality_embed"),
+            ("sound_modality_embed.weight", "transformer.sound_modality_embed"),  # ".weight" suffix is dropped
+            ("action2llm.fc.weight", "transformer.action2llm.fc.weight"),
+            ("llm2action.bias.weight", "transformer.llm2action.bias.weight"),
+            ("action_modality_embed", "transformer.action_modality_embed"),
+            ("action_modality_embed.weight", "transformer.action_modality_embed"),
+            ("action_pos_embed.weight", None),  # None expected; presumably the key is skipped — confirm
+            ("lm_head.weight", None),
+            ("other.weight", None),
+        ],
+    )
+    def test_remap_ckpt_key(self, key: str, expected: str | None) -> None:
+        from vllm_omni.diffusion.models.cosmos3.pipeline_cosmos3 import (
+            Cosmos3OmniDiffusersPipeline,
+        )
+
+        assert Cosmos3OmniDiffusersPipeline._remap_ckpt_key(key) == expected
+
+    def test_prepare_latents_shape_uses_cosmos_temporal_and_spatial_factors(self, make_cosmos3_pipeline) -> None:
+        pipeline = make_cosmos3_pipeline()  # stub pipeline: temporal factor 4, spatial factor 8, 2 channels
+
+        latents = pipeline._prepare_latents(
+            height=16,
+            width=24,
+            num_frames=5,
+            generator=torch.Generator(device="cpu").manual_seed(0),
+        )
+
+        assert latents.shape == (1, 2, 2, 2, 3)  # NOTE(review): presumably (B, C, T', H/8, W/8) — confirm
+        assert latents.dtype == torch.float32
+
+    def test_sound_request_detection_uses_prompt_and_extra_args(self) -> None:
+        from vllm_omni.diffusion.models.cosmos3.pipeline_cosmos3 import (
+            Cosmos3OmniDiffusersPipeline,
+        )
+
+        assert Cosmos3OmniDiffusersPipeline._is_sound_request(
+            {"prompt": "x", "generate_sound": True},  # flag carried on the prompt dict
+            SimpleNamespace(extra_args={}),
+        )
+        assert Cosmos3OmniDiffusersPipeline._is_sound_request(
+            {"prompt": "x"},
+            SimpleNamespace(extra_args={"enable_sound_generation": "true"}),  # string "true" counts as enabled
+        )
+        assert not Cosmos3OmniDiffusersPipeline._is_sound_request(
+            {"prompt": "x"},
+            SimpleNamespace(extra_args={"generate_sound": False}),  # explicit False -> not a sound request
+        )
+
+    def test_prepare_sound_latents_uses_lazy_tokenizer_and_duration(self, make_cosmos3_pipeline) -> None:
+        pipeline = make_cosmos3_pipeline()
+        pipeline.transformer = pipeline.transformer.__class__(latent_channel_size=2, sound_gen=True, sound_dim=3)
+
+        class FakeSoundTokenizer:
+            sample_rate = 10
+            latent_ch = 3
+
+            def get_latent_num_samples(self, samples: int) -> int:
+                assert samples == 20  # 20 == sample_rate (10) * sound_duration (2.0) used below
+                return 5
+
+            def decode(self, latents: torch.Tensor) -> torch.Tensor:
+                return torch.ones(latents.shape[0], 2, 7)  # 7 samples; the test expects 5 back
+
+        pipeline._sound_tokenizer = FakeSoundTokenizer()  # inject the fake in place of a real tokenizer
+
+        target_samples, duration, sample_rate = pipeline._resolve_sound_target_samples(
+            SimpleNamespace(extra_args={"sound_duration": 2.0}),
+            num_frames=9,
+            frame_rate=3.0,
+        )
+        latents, latent_frames = pipeline._prepare_sound_latents(
+            target_samples,
+            torch.Generator(device="cpu").manual_seed(0),
+        )
+        audio = pipeline._decode_sound_latents(torch.zeros(1, 3, 5), target_audio_samples=5)
+
+        assert (target_samples, duration, sample_rate) == (20, 2.0, 10)
+        assert latents.shape == (1, 3, 5)  # latent_ch=3, 5 latent samples from the fake tokenizer
+        assert latent_frames == 5
+        assert audio.shape == (1, 2, 5)  # last dim equals target_audio_samples, not the decoded 7
+
+    def test_init_eagerly_loads_sound_tokenizer_when_transformer_supports_sound(
+        self,
+        tmp_path,
+        monkeypatch: pytest.MonkeyPatch,
+    ) -> None:
+        import vllm_omni.diffusion.models.cosmos3.pipeline_cosmos3 as cosmos3_module
+        from vllm_omni.diffusion.models.cosmos3 import sound_tokenizer
+
+        class FakeTokenizer:
+            @classmethod
+            def from_pretrained(cls, *args, **kwargs):
+                return cls()
+
+        class FakeVAE:
+            config = SimpleNamespace(scale_factor_temporal=4, scale_factor_spatial=8)
+
+            @classmethod
+            def from_pretrained(cls, *args, **kwargs):
+                return cls()
+
+            def to(self, device):
+                self.device = device
+                return self  # returns self so a chained .to(...) call keeps working
+
+        class FakeScheduler:
+            config = SimpleNamespace(flow_shift=1.0)
+
+            @classmethod
+            def from_pretrained(cls, *args, **kwargs):
+                return cls()
+
+        class FakeTransformer:
+            sound_gen = True  # the only attribute this fake exposes
+
+        fake_sound_tokenizer = object()  # sentinel; only its identity is asserted
+        calls = []
+
+        def fake_from_config(od_config):
+            calls.append(od_config)  # record every config the factory is called with
+            return fake_sound_tokenizer
+
+        monkeypatch.setattr(cosmos3_module, "AutoTokenizer", FakeTokenizer)
+        monkeypatch.setattr(cosmos3_module, "DistributedAutoencoderKLWan", FakeVAE)
+        monkeypatch.setattr(cosmos3_module, "UniPCMultistepScheduler", FakeScheduler)
+        monkeypatch.setattr(cosmos3_module, "Cosmos3VFMTransformer", lambda *args, **kwargs: FakeTransformer())
+        monkeypatch.setattr(sound_tokenizer.Cosmos3SoundTokenizer, "from_config", staticmethod(fake_from_config))
+        monkeypatch.setattr(
+            cosmos3_module.Cosmos3OmniDiffusersPipeline,
+            "setup_diffusion_pipeline_profiler",
+            lambda self, **kwargs: None,  # replaced with a no-op for this test
+        )
+
+        od_config = SimpleNamespace(
+            model=str(tmp_path),  # pytest temp dir; the from_pretrained loaders above are fakes
+            dtype=torch.float32,
+            enable_cpu_offload=False,
+            flow_shift=None,
+            enable_diffusion_pipeline_profiler=False,
+        )
+        pipeline = cosmos3_module.Cosmos3OmniDiffusersPipeline(od_config=od_config)
+
+        assert calls == [od_config]  # factory called exactly once, with the pipeline's config
+        assert pipeline._sound_tokenizer is fake_sound_tokenizer
+        source = pipeline.weights_sources[0]
+        assert source.subfolder is None
+        assert source.prefix == "transformer."
+        assert source.allow_patterns_overrides == ["transformer/*.safetensors"]
+
+    def test_prepare_latents_i2v_conditions_first_latent_frame(self, make_cosmos3_pipeline) -> None:
+        pipeline = make_cosmos3_pipeline()
+
+        def fake_encode(image_tensor, num_frames, height, width):
+            del image_tensor, num_frames, height, width
+            return torch.full((1, 2, 2, 2, 3), 5.0)  # constant 5.0 is what the asserts below look for
+
+        pipeline._encode_conditioning_video = fake_encode  # type: ignore[method-assign]
+
+        latents, velocity_mask, image_latent = pipeline._prepare_latents_i2v(
+            image_tensor=torch.zeros(1, 3, 16, 24),
+            height=16,
+            width=24,
+            num_frames=5,
+            generator=torch.Generator(device="cpu").manual_seed(0),
+        )
+
+        assert latents.shape == (1, 2, 2, 2, 3)
+        torch.testing.assert_close(latents[:, :, 0], torch.full((1, 2, 2, 3), 5.0))
+        assert velocity_mask.tolist() == [[[[[0.0]], [[1.0]]]]]  # 0 at index 0 (conditioned), 1 at index 1
+        torch.testing.assert_close(image_latent, torch.full((1, 2, 1, 2, 3), 5.0))
+
+    def test_prepare_action_latents_policy_uses_noise_and_raw_dim_mask(self, make_cosmos3_pipeline) -> None:
+        pipeline = make_cosmos3_pipeline()
+        pipeline.transformer = pipeline.transformer.__class__(action_gen=True, action_dim=4)
+
+        action, velocity_mask, clean, raw_dim = pipeline._prepare_action_latents(
+            mode="policy",
+            action_chunk_size=3,
+            raw_action_dim=2,
+            generator=torch.Generator(device="cpu").manual_seed(0),
+            sp=SimpleNamespace(extra_args={}),  # no "action" entry supplied in policy mode
+        )
+
+        assert action.shape == (1, 3, 4)  # chunk size 3, padded to the transformer's action_dim=4
+        assert raw_dim == 2
+        assert velocity_mask.tolist() == [[[1.0], [1.0], [1.0]]]  # every chunk step flagged 1.0
+        torch.testing.assert_close(action[:, :, 2:], torch.zeros(1, 3, 2))  # dims >= raw_action_dim are zero
+        torch.testing.assert_close(clean, torch.zeros(1, 3, 4))
+
+    def test_prepare_action_latents_forward_dynamics_conditions_supplied_actions(self, make_cosmos3_pipeline) -> None:
+        pipeline = make_cosmos3_pipeline()
+        pipeline.transformer = pipeline.transformer.__class__(action_gen=True, action_dim=4)
+
+        action, velocity_mask, clean, raw_dim = pipeline._prepare_action_latents(
+            mode="forward_dynamics",
+            action_chunk_size=2,
+            raw_action_dim=None,  # not given; the test expects 2, the width of the supplied rows
+            generator=torch.Generator(device="cpu").manual_seed(0),
+            sp=SimpleNamespace(extra_args={"action": [[1.0, 2.0], [3.0, 4.0]]}),
+        )
+
+        assert raw_dim == 2
+        assert velocity_mask.tolist() == [[[0.0], [0.0]]]  # both chunk steps flagged 0.0
+        torch.testing.assert_close(action, clean)  # returned latents equal the clean tensor
+        torch.testing.assert_close(action[0, :, :2], torch.tensor([[1.0, 2.0], [3.0, 4.0]]))
+
+    def test_set_flow_shift_rebuilds_only_when_target_changes(self, make_cosmos3_pipeline, monkeypatch) -> None:
+        import vllm_omni.diffusion.models.cosmos3.pipeline_cosmos3 as cosmos3_module
+
+        pipeline = make_cosmos3_pipeline()
+
+        class FakeUniPCMultistepScheduler:
+            calls: list[tuple[object, float]] = []  # class-level log; the class is local to this test
+
+            @classmethod
+            def from_config(cls, config, flow_shift: float):
+                cls.calls.append((config, flow_shift))
+                return StubScheduler([1], flow_shift=flow_shift)
+
+        monkeypatch.setattr(cosmos3_module, "UniPCMultistepScheduler", FakeUniPCMultistepScheduler)
+        original_scheduler = pipeline.scheduler
+
+        pipeline._set_flow_shift(1.0)  # expected no-op: same scheduler object, empty call log
+        assert pipeline.scheduler is original_scheduler
+        assert FakeUniPCMultistepScheduler.calls == []
+
+        pipeline._set_flow_shift(3.0)  # new target: scheduler is expected to be rebuilt via from_config
+        assert pipeline.scheduler is not original_scheduler
+        assert pipeline._current_flow_shift == 3.0
+        assert FakeUniPCMultistepScheduler.calls == [(pipeline._base_scheduler_config, 3.0)]
+
+
+class TestDiffuse:
+    def test_diffuse_without_cfg_runs_one_cond_forward_per_step(self, make_cosmos3_pipeline) -> None:
+        pipeline = make_cosmos3_pipeline()
+        latents = torch.zeros(1, 2, 2, 1, 1)
+
+        result = pipeline.diffuse(
+            latents=latents,
+            timesteps=torch.tensor([7, 3]),  # two steps
+            cond_ids=_ids(2),
+            cond_mask=_mask(),
+            uncond_ids=_ids(1),
+            uncond_mask=_mask(),
+            guidance_scale=1.0,  # with scale 1.0 the test expects only cond (token 2) forwards
+            shared_kwargs={"video_shape": (2, 1, 1), "fps": 24.0},
+        )
+
+        assert pipeline.transformer.reset_calls == 1  # counter kept by the stub transformer
+        assert [call["token"] for call in pipeline.transformer.calls] == [2, 2]
+        torch.testing.assert_close(result, torch.full_like(latents, 4.0))
+
+    def test_diffuse_sequential_cfg_uses_separate_caches_and_interval_skip(self, make_cosmos3_pipeline) -> None:
+        pipeline = make_cosmos3_pipeline()
+        latents = torch.zeros(1, 2, 1, 1, 1)
+
+        result = pipeline.diffuse(
+            latents=latents,
+            timesteps=torch.tensor([900, 100]),
+            cond_ids=_ids(2),
+            cond_mask=_mask(),
+            uncond_ids=_ids(1),
+            uncond_mask=_mask(),
+            guidance_scale=3.0,
+            shared_kwargs={"video_shape": (1, 1, 1), "fps": 24.0},
+            guidance_interval=(500.0, 1000.0),  # timestep 900 lies inside, 100 lies outside
+        )
+
+        assert [call["token"] for call in pipeline.transformer.calls] == [2, 1, 2]
+        assert pipeline.transformer.calls[0]["cache_before"] is None  # first cond call (token 2)
+        assert pipeline.transformer.calls[1]["cache_before"] is None  # uncond call (token 1)
+        assert pipeline.transformer.calls[2]["cache_before"] is not None  # later cond call (token 2)
+        torch.testing.assert_close(result, torch.full_like(latents, 6.0))
+
+    def test_diffuse_cfg_parallel_uses_scale_one_outside_guidance_interval(
+        self,
+        make_cosmos3_pipeline,
+    ) -> None:
+        pipeline = make_cosmos3_pipeline()
+        pipeline._cfg_parallel_active = lambda: True  # type: ignore[method-assign]
+        latents = torch.zeros(1, 2, 1, 1, 1)
+        calls = []
+
+        def fake_predict_noise_maybe_with_cfg(**kwargs):
+            calls.append(kwargs)  # keep the full kwargs of every call for the asserts below
+            return torch.ones_like(latents)
+
+        pipeline.predict_noise_maybe_with_cfg = fake_predict_noise_maybe_with_cfg  # type: ignore[method-assign]
+
+        result = pipeline.diffuse(
+            latents=latents,
+            timesteps=torch.tensor([900, 100]),
+            cond_ids=_ids(2),
+            cond_mask=_mask(),
+            uncond_ids=_ids(1),
+            uncond_mask=_mask(),
+            guidance_scale=4.0,
+            shared_kwargs={"video_shape": (1, 1, 1), "fps": 24.0},
+            guidance_interval=(500.0, 1000.0),
+        )
+
+        assert [call["true_cfg_scale"] for call in calls] == [4.0, 1.0]  # 900 inside interval, 100 outside
+        assert calls[0]["positive_kwargs"]["text_ids"].item() == 2  # cond ids
+        assert calls[0]["negative_kwargs"]["text_ids"].item() == 1  # uncond ids
+        torch.testing.assert_close(result, torch.full_like(latents, 2.0))
+
+    def test_diffuse_i2v_masks_conditioned_frame_and_reinjects_image_latent(self, make_cosmos3_pipeline) -> None:
+        pipeline = make_cosmos3_pipeline()
+        latents = torch.zeros(1, 2, 2, 1, 1)
+        velocity_mask = torch.tensor([[[[[0.0]], [[1.0]]]]])  # 0 for index 0, 1 for index 1
+        image_latent = torch.full((1, 2, 1, 1, 1), 7.0)
+
+        result = pipeline.diffuse(
+            latents=latents,
+            timesteps=torch.tensor([7]),
+            cond_ids=_ids(2),
+            cond_mask=_mask(),
+            uncond_ids=_ids(1),
+            uncond_mask=_mask(),
+            guidance_scale=1.0,
+            shared_kwargs={"video_shape": (2, 1, 1), "fps": 24.0},
+            velocity_mask=velocity_mask,
+            image_latent=image_latent,
+        )
+
+        torch.testing.assert_close(result[:, :, 0:1], image_latent)  # masked frame comes back as image_latent
+        torch.testing.assert_close(result[:, :, 1:2], torch.full((1, 2, 1, 1, 1), 2.0))
+
+    def test_diffuse_with_sound_steps_video_and_sound_jointly(self, make_cosmos3_pipeline) -> None:
+        pipeline = make_cosmos3_pipeline()
+        pipeline.transformer = pipeline.transformer.__class__(latent_channel_size=2, sound_gen=True, sound_dim=3)
+        latents = torch.zeros(1, 2, 1, 1, 1)
+        sound_latents = torch.zeros(1, 3, 2)
+
+        video_result, sound_result = pipeline.diffuse(
+            latents=latents,
+            sound_latents=sound_latents,  # with this argument diffuse returns a (video, sound) pair
+            timesteps=torch.tensor([7, 3]),
+            cond_ids=_ids(2),
+            cond_mask=_mask(),
+            uncond_ids=_ids(1),
+            uncond_mask=_mask(),
+            guidance_scale=1.0,
+            shared_kwargs={"video_shape": (1, 1, 1), "fps": 24.0},
+        )
+
+        torch.testing.assert_close(video_result, torch.full_like(latents, 4.0))
+        torch.testing.assert_close(sound_result, torch.full_like(sound_latents, 24.0))
+        assert pipeline.scheduler.step_calls[0][0].shape == (1, latents.numel() + sound_latents.numel())
+
+    def test_diffuse_with_action_steps_video_and_action_jointly(self, make_cosmos3_pipeline) -> None:
+        pipeline = make_cosmos3_pipeline()
+        pipeline.transformer = pipeline.transformer.__class__(latent_channel_size=2, action_gen=True, action_dim=4)
+        latents = torch.zeros(1, 2, 1, 1, 1)
+        action_latents = torch.zeros(1, 3, 4)
+
+        video_result, action_result = pipeline.diffuse(
+            latents=latents,
+            action_latents=action_latents,  # with this argument diffuse returns a (video, action) pair
+            action_velocity_mask=torch.ones(1, 3, 1),
+            action_condition_latents=torch.zeros(1, 3, 4),
+            timesteps=torch.tensor([7, 3]),
+            cond_ids=_ids(2),
+            cond_mask=_mask(),
+            uncond_ids=_ids(1),
+            uncond_mask=_mask(),
+            guidance_scale=1.0,
+            shared_kwargs={
+                "video_shape": (1, 1, 1),
+                "fps": 24.0,
+                "action_domain_ids": torch.tensor([0]),
+                "action_noisy_mask": torch.ones(1, 3, 1),
+            },
+        )
+
+        torch.testing.assert_close(video_result, torch.full_like(latents, 4.0))
+        torch.testing.assert_close(action_result, torch.full_like(action_latents, 44.0))
+        assert pipeline.scheduler.step_calls[0][0].shape == (1, latents.numel() + action_latents.numel())
+
+
+class TestForwardRouting:
+    def _install_forward_stubs(self, pipeline):
+        captured: dict[str, object] = {"diffuse_calls": [], "prepare_calls": []}  # record the test reads back
+
+        def fake_format(
+            prompt,
+            negative_prompt,
+            num_frames,
+            frame_rate,
+            height,
+            width,
+            max_sequence_length,
+            sp,
+            use_system_prompt=False,
+            is_t2i=False,
+        ):
+            captured["format"] = {  # overwritten on every call: only the last call is kept
+                "prompt": prompt,
+                "negative_prompt": negative_prompt,
+                "num_frames": num_frames,
+                "frame_rate": frame_rate,
+                "height": height,
+                "width": width,
+                "max_sequence_length": max_sequence_length,
+                "use_system_prompt": use_system_prompt,
+                "is_t2i": is_t2i,
+                "sp": sp,
+            }
+            return _ids(2), _mask(), _ids(1), _mask()  # presumably cond ids/mask, then uncond ids/mask
+
+        def fake_prepare(height, width, num_frames, generator):
+            captured["prepare_calls"].append((height, width, num_frames, generator.initial_seed()))
+            return torch.zeros(1, 2, 1, 1, 1)
+
+        def fake_set_flow_shift(target):
+            captured.setdefault("flow_shifts", []).append(target)  # key exists only once called
+            pipeline._current_flow_shift = target
+
+        def fake_set_scheduler_timesteps(num_inference_steps):
+            captured.setdefault("scheduler_steps", []).append(num_inference_steps)
+            pipeline.scheduler.timesteps = torch.tensor([7])  # single fixed timestep, whatever was requested
+
+        def fake_diffuse(**kwargs):
+            captured["diffuse_calls"].append(kwargs)
+            outputs = [kwargs["latents"] + len(captured["diffuse_calls"])]  # + call count: outputs differ
+            if kwargs.get("action_latents") is not None:
+                outputs.append(kwargs["action_latents"] + 3.0)
+            if kwargs.get("sound_latents") is not None:
+                outputs.append(kwargs["sound_latents"] + 2.0)
+            return outputs[0] if len(outputs) == 1 else tuple(outputs)  # bare tensor when video-only
+
+        pipeline._format_and_tokenize_prompts = fake_format  # type: ignore[method-assign]
+        pipeline._prepare_latents = fake_prepare  # type: ignore[method-assign]
+        pipeline._set_flow_shift = fake_set_flow_shift  # type: ignore[method-assign]
+        pipeline._set_scheduler_timesteps = fake_set_scheduler_timesteps  # type: ignore[method-assign]
+        pipeline.diffuse = fake_diffuse  # type: ignore[method-assign]
+        pipeline._decode_latents = lambda latents: latents  # type: ignore[method-assign]
+        return captured
+
+    def _install_sound_stubs(self, pipeline):
+        sound_latents = torch.zeros(1, 3, 4)
+        decoded_audio = torch.ones(1, 2, 20)
+
+        def fake_resolve_sound_target_samples(sp, num_frames, frame_rate):
+            del sp, num_frames, frame_rate
+            return 20, 2.0, 10  # target_samples, duration, sample_rate
+
+        def fake_prepare_sound_latents(target_samples, generator):
+            del target_samples, generator
+            return sound_latents, 4  # latents and latent frame count
+
+        pipeline._resolve_sound_target_samples = fake_resolve_sound_target_samples  # type: ignore[method-assign]
+        pipeline._prepare_sound_latents = fake_prepare_sound_latents  # type: ignore[method-assign]
+        pipeline._decode_sound_latents = lambda latents, target_samples: decoded_audio  # type: ignore[method-assign]
+        return sound_latents, decoded_audio  # returned so callers can assert on object identity
+
+    def test_forward_uses_t2i_defaults_and_generates_multiple_outputs(self, make_cosmos3_pipeline) -> None:
+        pipeline = make_cosmos3_pipeline()
+        captured = self._install_forward_stubs(pipeline)
+        req = SimpleNamespace(
+            prompts=[{"prompt": "A painted robot", "modalities": ["image"]}],
+            sampling_params=make_sampling_params(num_outputs_per_prompt=2),  # two outputs requested
+        )
+
+        output = pipeline.forward(req)
+
+        assert captured["flow_shifts"] == [3.0]
+        assert captured["scheduler_steps"] == [50, 50]  # one entry per generated output
+        assert captured["format"]["is_t2i"] is True
+        assert captured["format"]["height"] == 1024
+        assert captured["format"]["width"] == 1024
+        assert captured["format"]["num_frames"] == 1
+        assert len(captured["diffuse_calls"]) == 2
+        assert captured["diffuse_calls"][0]["guidance_interval"] == (400.0, 1000.0)
+        assert output.output["image"].shape[0] == 2  # outputs stacked along dim 0
+
+    def test_forward_uses_t2v_defaults_and_engine_flow_shift(self, make_cosmos3_pipeline) -> None:
+        pipeline = make_cosmos3_pipeline()
+        captured = self._install_forward_stubs(pipeline)
+        req = SimpleNamespace(
+            prompts=[{"prompt": "A warehouse robot", "modalities": ["video"]}],
+            sampling_params=make_sampling_params(),  # nothing overridden: defaults are under test
+        )
+
+        pipeline.forward(req)
+
+        assert captured["flow_shifts"] == [1.0]  # 1.0 here; the t2i test in this class expects 3.0
+        assert captured["scheduler_steps"] == [35]
+        assert captured["format"]["is_t2i"] is False
+        assert captured["format"]["height"] == 720
+        assert captured["format"]["width"] == 1280
+        assert captured["format"]["num_frames"] == 81
+        assert captured["diffuse_calls"][0]["guidance_interval"] is None
+
+    def test_forward_defaults_to_video_without_modalities(self, make_cosmos3_pipeline) -> None:
+        pipeline = make_cosmos3_pipeline()
+        captured = self._install_forward_stubs(pipeline)
+        req = SimpleNamespace(
+            prompts=["A warehouse robot"],  # plain string prompt: no "modalities" key at all
+            sampling_params=make_sampling_params(),
+        )
+
+        output = pipeline.forward(req)
+
+        assert captured["format"]["is_t2i"] is False
+        assert "video" in output.output
+
+    def test_forward_selects_i2v_latents_for_image_conditioning(self, make_cosmos3_pipeline) -> None:
+        pipeline = make_cosmos3_pipeline()
+        captured = self._install_forward_stubs(pipeline)
+        image_tensor = torch.zeros(1, 3, 16, 16)
+        velocity_mask = torch.tensor([[[[[0.0]], [[1.0]]]]])
+        image_latent = torch.full((1, 2, 1, 1, 1), 5.0)
+
+        def fake_prepare_i2v(image, height, width, num_frames, generator):
+            captured["i2v_prepare"] = (image, height, width, num_frames, generator.initial_seed())
+            return torch.zeros(1, 2, 2, 1, 1), velocity_mask, image_latent
+
+        def fail_prepare(*args, **kwargs):
+            del args, kwargs
+            raise AssertionError("T2V latent preparation should not run for an I2V request")
+
+        pipeline._prepare_latents = fail_prepare  # type: ignore[method-assign]
+        pipeline._prepare_latents_i2v = fake_prepare_i2v  # type: ignore[method-assign]
+        req = SimpleNamespace(
+            prompts=[
+                {
+                    "prompt": "A robot starts moving.",
+                    "modalities": ["video"],
+                    "negative_prompt": "bad",
+                    "additional_information": {"preprocessed_image": image_tensor},  # conditioning image
+                }
+            ],
+            sampling_params=make_sampling_params(height=16, width=16, num_frames=5),
+        )
+
+        pipeline.forward(req)
+
+        prepared_image, prepared_height, prepared_width, prepared_frames, _ = captured["i2v_prepare"]
+        assert prepared_image is image_tensor  # same object, not a copy
+        assert prepared_height == 16
+        assert prepared_width == 16
+        assert prepared_frames == 5
+        diffuse_call = captured["diffuse_calls"][0]
+        assert diffuse_call["velocity_mask"] is velocity_mask
+        assert diffuse_call["image_latent"] is image_latent
+        assert diffuse_call["shared_kwargs"]["noisy_frame_mask"] is velocity_mask  # mask also passed here
+
+    def test_forward_policy_action_returns_custom_output(self, make_cosmos3_pipeline) -> None:
+        pipeline = make_cosmos3_pipeline()
+        pipeline.transformer = pipeline.transformer.__class__(latent_channel_size=2, action_gen=True, action_dim=4)
+        captured = self._install_forward_stubs(pipeline)
+        image_tensor = torch.zeros(1, 3, 16, 16)
+        req = SimpleNamespace(
+            prompts=[
+                {
+                    "prompt": "Pick the block.",
+                    "modalities": ["video"],
+                    "additional_information": {"preprocessed_image": image_tensor},
+                }
+            ],
+            sampling_params=make_sampling_params(
+                height=16,
+                width=16,
+                extra_args={
+                    "action_mode": "policy",
+                    "action_chunk_size": 2,
+                    "raw_action_dim": 2,
+                    "domain_name": "bridge_orig_lerobot",  # the test expects this name to map to id 7
+                },
+            ),
+        )
+
+        output = pipeline.forward(req)
+
+        diffuse_call = captured["diffuse_calls"][0]
+        assert diffuse_call["action_latents"].shape == (1, 2, 4)  # chunk size 2, action_dim 4
+        assert diffuse_call["action_velocity_mask"].tolist() == [[[1.0], [1.0]]]
+        assert diffuse_call["shared_kwargs"]["action_domain_ids"].tolist() == [7]
+        assert diffuse_call["shared_kwargs"]["action_start_frame_offset"] == 1
+        assert output.custom_output["action"].shape == (1, 2, 2)  # last dim is raw_action_dim=2, not 4
+        assert output.custom_output["raw_action_dim"] == 2
+        assert output.custom_output["action_mode"] == "policy"
+        assert output.custom_output["domain_id"] == 7
+
+    def test_forward_action_defaults_to_reference_chunk_size(self, make_cosmos3_pipeline) -> None:
+        pipeline = make_cosmos3_pipeline()
+        pipeline.transformer = pipeline.transformer.__class__(latent_channel_size=2, action_gen=True, action_dim=4)
+        captured = self._install_forward_stubs(pipeline)
+        req = SimpleNamespace(
+            prompts=[
+                {
+                    "prompt": "Pick the block.",
+                    "modalities": ["video"],
+                    "additional_information": {"preprocessed_image": torch.zeros(1, 3, 16, 16)},
+                }
+            ],
+            sampling_params=make_sampling_params(
+                height=16,
+                width=16,
+                extra_args={
+                    "action_mode": "policy",
+                    "raw_action_dim": 2,
+                    "domain_id": 0,  # numeric id given directly; no "action_chunk_size" key is set
+                },
+            ),
+        )
+
+        pipeline.forward(req)
+
+        assert captured["format"]["num_frames"] == 17  # not passed in the request above
+        assert captured["diffuse_calls"][0]["action_latents"].shape == (1, 16, 4)  # 16 action steps
+
+    def test_forward_video_sound_decodes_and_returns_audio_payload(self, make_cosmos3_pipeline) -> None:
+        pipeline = make_cosmos3_pipeline()
+        pipeline.transformer = pipeline.transformer.__class__(latent_channel_size=2, sound_gen=True, sound_dim=3)
+        captured = self._install_forward_stubs(pipeline)
+        sound_latents = torch.zeros(1, 3, 4)  # NOTE(review): stubs below duplicate _install_sound_stubs
+        decoded_audio = torch.ones(1, 2, 20)
+
+        def fake_resolve_sound_target_samples(sp, num_frames, frame_rate):
+            del sp, num_frames, frame_rate
+            return 20, 2.0, 10  # target_samples, duration, sample_rate
+
+        def fake_prepare_sound_latents(target_samples, generator):
+            del target_samples, generator
+            return sound_latents, 4  # latents and latent frame count
+
+        pipeline._resolve_sound_target_samples = fake_resolve_sound_target_samples  # type: ignore[method-assign]
+        pipeline._prepare_sound_latents = fake_prepare_sound_latents  # type: ignore[method-assign]
+        pipeline._decode_sound_latents = lambda latents, target_samples: decoded_audio  # type: ignore[method-assign]
+
+        req = SimpleNamespace(
+            prompts=[{"prompt": "A robot", "modalities": ["video"], "generate_sound": True}],
+            sampling_params=make_sampling_params(num_frames=9, frame_rate=3.0),
+        )
+
+        output = pipeline.forward(req)
+
+        assert captured["diffuse_calls"][0]["sound_latents"] is sound_latents  # passed through by identity
+        assert output.output["audio"] is decoded_audio
+        assert output.output["audio_sample_rate"] == 10  # sample_rate returned by the resolve stub
+        assert "video" in output.output
+
+    def test_forward_decode_info_logs_only_on_rank_zero(
+        self,
+        make_cosmos3_pipeline,
+        monkeypatch: pytest.MonkeyPatch,
+        caplog,
+    ) -> None:
+        from vllm_omni.diffusion.models.cosmos3 import pipeline_cosmos3 as cosmos3_pipeline
+
+        monkeypatch.setattr(cosmos3_pipeline, "_is_rank_zero", lambda: True)  # pretend to be rank 0
+        pipeline = make_cosmos3_pipeline()
+        pipeline.transformer = pipeline.transformer.__class__(latent_channel_size=2, sound_gen=True, sound_dim=3)
+        self._install_forward_stubs(pipeline)
+        self._install_sound_stubs(pipeline)
+        req = SimpleNamespace(
+            prompts=[{"prompt": "A robot", "modalities": ["video"], "generate_sound": True}],
+            sampling_params=make_sampling_params(num_frames=9, frame_rate=3.0),
+        )
+
+        target_logger = logging.getLogger(cosmos3_pipeline.logger.name)
+        target_logger.addHandler(caplog.handler)  # attach caplog's handler directly to the module logger
+        prev_level = target_logger.level
+        target_logger.setLevel(logging.INFO)
+        try:
+            pipeline.forward(req)
+        finally:
+            target_logger.removeHandler(caplog.handler)  # always detach, even if forward raises
+            target_logger.setLevel(prev_level)
+
+        messages = [record.getMessage() for record in caplog.records if record.name == cosmos3_pipeline.logger.name]
+        assert "Decoding video..." in messages
+        assert any(message.startswith("Video decoded in ") for message in messages)
+        assert any(message.startswith("Total pipeline time: ") for message in messages)
+        assert "Decoding sound..." in messages
+
+    def test_forward_decode_info_logs_suppressed_on_nonzero_rank(
+        self,
+        make_cosmos3_pipeline,
+        monkeypatch: pytest.MonkeyPatch,
+        caplog,
+    ) -> None:
+        from vllm_omni.diffusion.models.cosmos3 import pipeline_cosmos3 as cosmos3_pipeline
+
+        monkeypatch.setattr(cosmos3_pipeline, "_is_rank_zero", lambda: False)  # pretend to be a non-zero rank
+        pipeline = make_cosmos3_pipeline()
+        pipeline.transformer = pipeline.transformer.__class__(latent_channel_size=2, sound_gen=True, sound_dim=3)
+        self._install_forward_stubs(pipeline)
+        _, decoded_audio = self._install_sound_stubs(pipeline)
+        req = SimpleNamespace(
+            prompts=[{"prompt": "A robot", "modalities": ["video"], "generate_sound": True}],
+            sampling_params=make_sampling_params(num_frames=9, frame_rate=3.0),
+        )
+
+        target_logger = logging.getLogger(cosmos3_pipeline.logger.name)
+        target_logger.addHandler(caplog.handler)  # attach caplog's handler directly to the module logger
+        prev_level = target_logger.level
+        target_logger.setLevel(logging.INFO)
+        try:
+            output = pipeline.forward(req)
+        finally:
+            target_logger.removeHandler(caplog.handler)  # always detach, even if forward raises
+            target_logger.setLevel(prev_level)
+
+        messages = [record.getMessage() for record in caplog.records if record.name == cosmos3_pipeline.logger.name]
+        assert output.output["audio"] is decoded_audio  # audio is still returned; only logging is checked
+        assert not any(
+            message == "Decoding video..."
+            or message.startswith("Video decoded in ")
+            or message.startswith("Total pipeline time: ")
+            or message == "Decoding sound..."
+            for message in messages
+        )
+
+    def test_forward_rejects_multiple_prompts(self, make_cosmos3_pipeline) -> None:
+        pipeline = make_cosmos3_pipeline()
+        req = SimpleNamespace(
+            prompts=["one", "two"],
+            sampling_params=make_sampling_params(),
+        )
+
+        with pytest.raises(ValueError, match="currently supports a single prompt"):
+            pipeline.forward(req)
+
+    def test_forward_rejects_conflicting_modalities(self, make_cosmos3_pipeline) -> None:
+        pipeline = make_cosmos3_pipeline()
+        req = SimpleNamespace(
+            prompts=[{"prompt": "one", "modalities": ["image", "video"]}],
+            sampling_params=make_sampling_params(),
+        )
+
+        with pytest.raises(ValueError, match="cannot request both image and video"):
+            pipeline.forward(req)
+
+    def test_forward_rejects_sound_for_text_to_image(self, make_cosmos3_pipeline) -> None:
+        pipeline = make_cosmos3_pipeline()
+        pipeline.transformer = pipeline.transformer.__class__(latent_channel_size=2, sound_gen=True, sound_dim=3)
+        req = SimpleNamespace(
+            prompts=[{"prompt": "A robot", "modalities": ["image"], "generate_sound": True}],
+            sampling_params=make_sampling_params(),
+        )
+
+        with pytest.raises(ValueError, match="only for video outputs"):
+            pipeline.forward(req)
+
+    def test_forward_rejects_action_without_action_modules(self, make_cosmos3_pipeline) -> None:
+        pipeline = make_cosmos3_pipeline()
+        req = SimpleNamespace(
+            prompts=[{"prompt": "A robot", "modalities": ["video"]}],
+            sampling_params=make_sampling_params(extra_args={"action_mode": "policy", "raw_action_dim": 2}),
+        )
+
+        with pytest.raises(ValueError, match="without action modules"):
+            pipeline.forward(req)
+
+    def test_forward_rejects_action_without_explicit_domain(self, make_cosmos3_pipeline) -> None:
+        pipeline = make_cosmos3_pipeline()
+        pipeline.transformer = pipeline.transformer.__class__(latent_channel_size=2, action_gen=True, action_dim=4)
+        req = SimpleNamespace(
+            prompts=[
+                {
+                    "prompt": "A robot",
+                    "modalities": ["video"],
+                    "additional_information": {"preprocessed_image": torch.zeros(1, 3, 16, 16)},
+                }
+            ],
+            sampling_params=make_sampling_params(
+                height=16,
+                width=16,
+                extra_args={"action_mode": "policy", "raw_action_dim": 2},  # no domain_id, no domain_name
+            ),
+        )
+
+        with pytest.raises(ValueError, match=r"domain_id.*domain_name"):  # both names, in this order
+            pipeline.forward(req)
+
+    def test_forward_rejects_action_with_sound(self, make_cosmos3_pipeline) -> None:
+        pipeline = make_cosmos3_pipeline()
+        pipeline.transformer = pipeline.transformer.__class__(
+            latent_channel_size=2,
+            action_gen=True,  # transformer built with both action and sound enabled
+            action_dim=4,
+            sound_gen=True,
+            sound_dim=3,
+        )
+        req = SimpleNamespace(
+            prompts=[{"prompt": "A robot", "modalities": ["video"], "generate_sound": True}],
+            sampling_params=make_sampling_params(extra_args={"action_mode": "policy", "raw_action_dim": 2}),
+        )
+
+        with pytest.raises(ValueError, match=r"action\+sound"):  # "+" escaped because match is a regex
+            pipeline.forward(req)
diff --git a/tests/diffusion/models/cosmos3/test_cosmos3_transformer.py b/tests/diffusion/models/cosmos3/test_cosmos3_transformer.py
new file mode 100644
index 00000000000..49b5821347c
--- /dev/null
+++ b/tests/diffusion/models/cosmos3/test_cosmos3_transformer.py
@@ -0,0 +1,577 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from __future__ import annotations
+
+from types import SimpleNamespace
+
+import pytest
+import torch
+from torch import nn
+
+pytestmark = [pytest.mark.core_model, pytest.mark.cpu, pytest.mark.diffusion]
+
+
+def test_compute_mrope_position_ids_text_offsets_all_axes() -> None:
+    """Text tokens are expected to get the same offset-shifted positions on all three axes."""
+    import vllm_omni.diffusion.models.cosmos3.transformer_cosmos3 as cosmos3_module
+
+    position_ids, following_offset = cosmos3_module.compute_mrope_position_ids_text(num_tokens=3, temporal_offset=5)
+
+    # Three tokens starting at offset 5 occupy 5..7, leaving 8 as the next free offset.
+    assert position_ids.tolist() == [[5, 6, 7]] * 3
+    assert following_offset == 8
+
+
+def test_compute_mrope_position_ids_vision_without_fps_modulation() -> None:
+    """With ``fps=None`` a 2x2x3 grid is expected to yield offset frame ids plus row/column ids."""
+    import vllm_omni.diffusion.models.cosmos3.transformer_cosmos3 as cosmos3_module
+
+    grid = {"grid_t": 2, "grid_h": 2, "grid_w": 3}
+    position_ids, following_offset = cosmos3_module.compute_mrope_position_ids_vision(
+        **grid, temporal_offset=10, fps=None
+    )
+
+    assert position_ids.shape == (3, 12)
+    # Axis 0: six tokens per frame, with frame ids shifted by the temporal offset.
+    assert position_ids[0].tolist() == [10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11]
+    # Axes 1 and 2: row and column ids, repeated for each of the two frames.
+    frame_rows = [0, 0, 0, 1, 1, 1]
+    frame_columns = [0, 1, 2, 0, 1, 2]
+    assert position_ids[1].tolist() == frame_rows + frame_rows
+    assert position_ids[2].tolist() == frame_columns + frame_columns
+    assert following_offset == 12
+
+
+def test_compute_mrope_position_ids_vision_with_fps_modulation() -> None:
+    """At ``fps=12`` against ``base_fps=24`` the two frames are expected at 10.0 and 12.0 as float32."""
+    import vllm_omni.diffusion.models.cosmos3.transformer_cosmos3 as cosmos3_module
+
+    modulation = {"fps": 12.0, "base_fps": 24.0, "temporal_compression_factor": 4}
+    position_ids, following_offset = cosmos3_module.compute_mrope_position_ids_vision(
+        grid_t=2,
+        grid_h=1,
+        grid_w=1,
+        temporal_offset=10,
+        **modulation,
+    )
+
+    # Only the temporal axis is checked; the second frame lands two positions after the first.
+    expected_temporal = torch.tensor([10.0, 12.0])
+    torch.testing.assert_close(position_ids[0], expected_temporal)
+    assert position_ids.dtype == torch.float32
+    assert following_offset == 13
+
+
+def test_compute_mrope_position_ids_sound_uses_sound_latent_fps() -> None:
+    """Sound tokens at 24 latent fps are expected a quarter step apart with zeroed axes 1 and 2."""
+    import vllm_omni.diffusion.models.cosmos3.transformer_cosmos3 as cosmos3_module
+
+    position_ids, following_offset = cosmos3_module.compute_mrope_position_ids_sound(
+        grid_t=3,
+        temporal_offset=10,
+        sound_latent_fps=24.0,
+        base_fps=24.0,
+        base_temporal_compression_factor=4,
+    )
+
+    torch.testing.assert_close(position_ids[0], torch.tensor([10.0, 10.25, 10.5]))
+    # Axes 1 and 2 are expected to be all zeros for sound tokens.
+    for spatial_axis in (1, 2):
+        assert position_ids[spatial_axis].tolist() == [0.0] * 3
+    assert following_offset == 11
+
+
+def test_compute_mrope_position_ids_action_uses_start_frame_offset() -> None:
+    """Action positions are expected to begin ``start_frame_offset`` after the temporal offset."""
+    import vllm_omni.diffusion.models.cosmos3.transformer_cosmos3 as cosmos3_module
+
+    position_ids, following_offset = cosmos3_module.compute_mrope_position_ids_action(
+        grid_t=3,
+        temporal_offset=10,
+        action_fps=None,
+        start_frame_offset=1,
+    )
+
+    temporal_ids, *spatial_ids = position_ids.tolist()
+    assert (temporal_ids, spatial_ids) == ([11, 12, 13], [[0, 0, 0], [0, 0, 0]])
+    assert following_offset == 14
+
+
+@pytest.mark.parametrize(
+    ("key", "value"),
+    [
+        ("qk_norm_for_diffusion", False),
+        ("qk_norm_for_text", False),
+        ("position_embedding_type", "rotary"),
+        ("unified_3d_mrope_reset_spatial_ids", False),
+        ("joint_attn_implementation", "one_way"),
+    ],
+)
+def test_validate_supported_config_rejects_unsupported_flags(key: str, value) -> None:
+    """Each listed override must be rejected with a ``ValueError`` whose message contains ``<key>=``."""
+    import vllm_omni.diffusion.models.cosmos3.transformer_cosmos3 as cosmos3_module
+    with pytest.raises(ValueError, match=f"{key}="):
+        cosmos3_module.Cosmos3VFMTransformer._validate_supported_config({key: value})
+
+
+def test_validate_supported_config_accepts_defaults() -> None:
+    """An empty config dict and ``None`` must both pass validation without raising."""
+    from vllm_omni.diffusion.models.cosmos3.transformer_cosmos3 import Cosmos3VFMTransformer as transformer_cls
+    for empty_config in ({}, None):
+        transformer_cls._validate_supported_config(empty_config)
+
+
+def test_cosmos3_hsdp_conditions_match_und_and_gen_blocks() -> None:
+    """Only the individual language-model layers and GEN layers should match the HSDP conditions."""
+    from vllm_omni.diffusion.models.cosmos3.transformer_cosmos3 import Cosmos3VFMTransformer
+
+    model = object.__new__(Cosmos3VFMTransformer)
+    nn.Module.__init__(model)
+    und_stack = nn.Module()
+    und_stack.layers = nn.ModuleList([nn.Linear(2, 2), nn.Linear(2, 2)])
+    model.language_model = und_stack
+    model.gen_layers = nn.ModuleList([nn.Linear(2, 2)])
+    model.norm_moe_gen = nn.LayerNorm(2)
+
+    conditions = model._hsdp_shard_conditions
+    matched = []
+    for name, module in model.named_modules():
+        if any(condition(name, module) for condition in conditions):
+            matched.append(name)
+
+    expected = ["language_model.layers.0", "language_model.layers.1", "gen_layers.0"]
+    assert matched == expected
+
+
+def test_cosmos3_transformer_exposes_layerwise_offload_and_repeated_blocks() -> None:
+    """The class-level offload attribute and repeated-block list must name the GEN decoder layers."""
+    from vllm_omni.diffusion.models.cosmos3.transformer_cosmos3 import Cosmos3VFMTransformer as transformer_cls
+    assert transformer_cls._layerwise_offload_blocks_attr == "gen_layers"
+    assert transformer_cls._repeated_blocks == ["Cosmos3GenDecoderLayer"]
+
+
+def test_patchify_unpatchify_round_trip_crops_padding() -> None:
+    """Patchifying a 3x5 latent with patch size 2 and unpatchifying it must return the original."""
+    from vllm_omni.diffusion.models.cosmos3.transformer_cosmos3 import Cosmos3VFMTransformer
+
+    model = object.__new__(Cosmos3VFMTransformer)
+    nn.Module.__init__(model)
+    model.latent_patch_size, model.latent_channel_size = 2, 3
+
+    latent_shape = (1, 3, 1, 3, 5)
+    latents = torch.arange(45, dtype=torch.float32).reshape(latent_shape)
+    grid = {"t": 1, "h": 3, "w": 5}
+    tokens = model.patchify(latents, **grid)
+    restored = model.unpatchify(tokens, **grid)
+    assert tokens.shape == (1, 6, 12)
+    torch.testing.assert_close(restored, latents)
+
+
+def _tiny_cosmos3_config(**overrides):
+    """Return a minimal Cosmos3 transformer config dict with ``overrides`` applied on top."""
+    defaults = {
+        "hidden_size": 8,
+        "num_hidden_layers": 0,
+        "num_attention_heads": 2,
+        "num_key_value_heads": 2,
+        "head_dim": 4,
+        "intermediate_size": 16,
+        "vocab_size": 32,
+        "latent_patch_size": 1,
+        "latent_channel": 2,
+        "rope_scaling": {"mrope_section": [1, 1, 0]},
+    }
+    return {**defaults, **overrides}
+
+
+def test_sound_modules_created_only_when_sound_config_present() -> None:
+    """Sound modules are built only when ``sound_gen`` is enabled in the config.
+
+    A bare config and an explicit ``sound_gen=False`` are both expected to skip them.
+    """
+    from vllm_omni.diffusion.models.cosmos3.transformer_cosmos3 import Cosmos3VFMTransformer
+
+    def build(tf_model_config):
+        """Instantiate the transformer from a float32 stand-in config object."""
+        return Cosmos3VFMTransformer(SimpleNamespace(tf_model_config=tf_model_config, dtype=torch.float32))
+
+    tiny = _tiny_cosmos3_config()
+    no_sound = build(tiny)
+    explicit_disabled = build({**tiny, "sound_gen": False, "sound_dim": 3})
+    with_sound = build({**tiny, "sound_gen": True, "sound_dim": 3})
+
+    # Neither disabled variant may carry a sound input projection.
+    for model in (no_sound, explicit_disabled):
+        assert model.sound_gen is False
+        assert not hasattr(model, "sound2llm")
+
+    # With sound enabled, both projections are expected to use sound_dim=3.
+    assert with_sound.sound_gen is True
+    assert with_sound.sound2llm.in_features == 3
+    assert with_sound.llm2sound.out_features == 3
+    # (8,) presumably mirrors hidden_size=8 from the tiny config.
+    assert tuple(with_sound.sound_modality_embed.shape) == (8,)
+
+
+def test_action_modules_created_only_when_action_config_present() -> None:
+    """Action modules are built only when ``action_gen`` is enabled in the config.
+
+    A bare config and an explicit ``action_gen=False`` are both expected to skip them.
+    """
+    from vllm_omni.diffusion.models.cosmos3.transformer_cosmos3 import Cosmos3VFMTransformer
+
+    def build(tf_model_config):
+        """Instantiate the transformer from a float32 stand-in config object."""
+        return Cosmos3VFMTransformer(SimpleNamespace(tf_model_config=tf_model_config, dtype=torch.float32))
+
+    tiny = _tiny_cosmos3_config()
+    no_action = build(tiny)
+    explicit_disabled = build({**tiny, "action_gen": False, "max_action_dim": 6})
+    with_action = build({**tiny, "action_gen": True, "max_action_dim": 6, "num_embodiment_domains": 9})
+
+    # Neither disabled variant may carry an action input projection.
+    for model in (no_action, explicit_disabled):
+        assert model.action_gen is False
+        assert not hasattr(model, "action2llm")
+
+    # max_action_dim and num_embodiment_domains are expected to surface on the built model.
+    assert with_action.action_gen is True
+    assert with_action.action_dim == 6
+    assert with_action.action2llm.num_domains == 9
+    # (8,) presumably mirrors hidden_size=8 from the tiny config.
+    assert tuple(with_action.action_modality_embed.shape) == (8,)
+
+
+def test_sound_latent_fps_derives_from_sound_tokenizer_config() -> None:
+    """``sound_latent_fps`` is derived from the sound tokenizer settings unless set explicitly.
+
+    The test expects 32000 / 800 == 40.0 when derived, and an explicit 12.5 to win otherwise.
+    """
+    from vllm_omni.diffusion.models.cosmos3.transformer_cosmos3 import Cosmos3VFMTransformer
+
+    tiny = _tiny_cosmos3_config(sound_gen=True, sound_dim=3)
+    tokenizer_args = {"sound_sample_rate": 32000, "sound_hop_size": 800}
+
+    def build(custom_pipeline_args):
+        """Instantiate the sound-enabled transformer with the given pipeline args."""
+        od_config = SimpleNamespace(
+            tf_model_config=tiny,
+            custom_pipeline_args=custom_pipeline_args,
+            dtype=torch.float32,
+        )
+        return Cosmos3VFMTransformer(od_config)
+
+    # Derived: no explicit sound_latent_fps in the pipeline args.
+    derived = build(dict(tokenizer_args))
+    # Explicit: the same tokenizer settings plus an override.
+    explicit = build({**tokenizer_args, "sound_latent_fps": 12.5})
+
+    assert derived.sound_latent_fps == 40.0
+    assert explicit.sound_latent_fps == 12.5
+
+
+def test_pack_unpack_sound_round_trip_and_shape_validation() -> None:
+    """``pack_sound``/``unpack_sound`` must round-trip, and a tensor not matching ``sound_dim`` raises."""
+    from vllm_omni.diffusion.models.cosmos3.transformer_cosmos3 import Cosmos3VFMTransformer
+
+    model = object.__new__(Cosmos3VFMTransformer)
+    nn.Module.__init__(model)
+    model.sound_dim = 3
+    latents = torch.arange(24, dtype=torch.float32).reshape(2, 3, 4)
+
+    tokens = model.pack_sound(latents)
+    assert tokens.shape == (2, 4, 3)
+    torch.testing.assert_close(model.unpack_sound(tokens), latents)
+    wrong_channels = torch.zeros(1, 4, 2)
+    with pytest.raises(ValueError, match="channel mismatch"):
+        model.pack_sound(wrong_channels)
+
+
+def test_pack_unpack_action_round_trip_and_shape_validation() -> None:
+    """``pack_action``/``unpack_action`` must round-trip, and a tensor not matching ``action_dim`` raises."""
+    from vllm_omni.diffusion.models.cosmos3.transformer_cosmos3 import Cosmos3VFMTransformer
+
+    model = object.__new__(Cosmos3VFMTransformer)
+    nn.Module.__init__(model)
+    model.action_dim = 3
+    latents = torch.arange(24, dtype=torch.float32).reshape(2, 4, 3)
+
+    tokens = model.pack_action(latents)
+    assert tokens.shape == (2, 4, 3)
+    torch.testing.assert_close(model.unpack_action(tokens), latents)
+    wrong_dim = torch.zeros(1, 2, 4)
+    with pytest.raises(ValueError, match="dimension mismatch"):
+        model.pack_action(wrong_dim)
+
+
+def test_forward_with_sound_returns_video_and_sound_predictions() -> None:
+    """A forward pass with sound latents returns a (video, sound) tuple shaped like its inputs."""
+    from vllm_omni.diffusion.models.cosmos3.transformer_cosmos3 import Cosmos3VFMTransformer
+
+    tf_model_config = _tiny_cosmos3_config(sound_gen=True, sound_dim=3, sound_latent_fps=24.0)
+    model = Cosmos3VFMTransformer(SimpleNamespace(tf_model_config=tf_model_config, dtype=torch.float32))
+
+    # All-zero inputs: only the output structure and shapes are checked here.
+    video = torch.zeros(1, 2, 1, 2, 2)
+    sound = torch.zeros(1, 3, 4)
+    forward_kwargs = {
+        "hidden_states": video,
+        "timestep": torch.tensor([1.0]),
+        "text_ids": torch.tensor([[1, 2]], dtype=torch.long),
+        "text_mask": torch.ones(1, 2, dtype=torch.long),
+        "video_shape": (1, 2, 2),
+        "fps": 24.0,
+        "sound_latents": sound,
+    }
+    output = model(**forward_kwargs)
+
+    assert isinstance(output, tuple)
+    video_pred, sound_pred = output
+    # Each prediction must keep the shape of the latent it corresponds to.
+    assert video_pred.shape == video.shape
+    assert sound_pred.shape == sound.shape
+
+
+def test_forward_with_action_returns_video_and_action_predictions() -> None:
+    """A forward pass with action latents returns a (video, action) tuple.
+
+    Both predictions are expected to keep the shapes of their input latents.
+    """
+    from vllm_omni.diffusion.models.cosmos3.transformer_cosmos3 import Cosmos3VFMTransformer
+
+    tf_model_config = _tiny_cosmos3_config(action_gen=True, max_action_dim=3, num_embodiment_domains=4)
+    model = Cosmos3VFMTransformer(SimpleNamespace(tf_model_config=tf_model_config, dtype=torch.float32))
+
+    # All-zero inputs: only the output structure and shapes are checked here.
+    video = torch.zeros(1, 2, 1, 2, 2)
+    action = torch.zeros(1, 5, 3)
+    forward_kwargs = {
+        "hidden_states": video,
+        "timestep": torch.tensor([1.0]),
+        "text_ids": torch.tensor([[1, 2]], dtype=torch.long),
+        "text_mask": torch.ones(1, 2, dtype=torch.long),
+        "video_shape": (1, 2, 2),
+        "fps": 24.0,
+        "action_latents": action,
+        # Presumably one domain id per sample and a mask entry per action step - confirm.
+        "action_domain_ids": torch.tensor([2]),
+        "action_noisy_mask": torch.ones(1, 5, 1),
+    }
+    output = model(**forward_kwargs)
+
+    assert isinstance(output, tuple)
+    video_pred, action_pred = output
+    # Each prediction must keep the shape of the latent it corresponds to.
+    assert video_pred.shape == video.shape
+    assert action_pred.shape == action.shape
+
+
+def test_forward_with_sound_ulysses_error_mentions_combined_sequence(monkeypatch: pytest.MonkeyPatch) -> None:
+    """With the Ulysses state stubbed, a 2 video + 1 sound token GEN sequence must raise a clear error."""
+    import vllm_omni.diffusion.models.cosmos3.transformer_cosmos3 as cosmos3_module
+
+    tf_model_config = _tiny_cosmos3_config(sound_gen=True, sound_dim=3)
+    od_config = SimpleNamespace(tf_model_config=tf_model_config, dtype=torch.float32)
+    model = cosmos3_module.Cosmos3VFMTransformer(od_config)
+    # Stub the Ulysses state lookup; (2, 0, None) presumably means size 2, rank 0, no group - confirm.
+    monkeypatch.setattr(cosmos3_module, "_get_ulysses_state", lambda: (2, 0, None))
+
+    forward_kwargs = {
+        "hidden_states": torch.zeros(1, 2, 1, 1, 2),
+        "timestep": torch.tensor([1.0]),
+        "text_ids": torch.tensor([[1, 2]], dtype=torch.long),
+        "text_mask": torch.ones(1, 2, dtype=torch.long),
+        "video_shape": (1, 1, 2),
+        "fps": 24.0,
+        "sound_latents": torch.zeros(1, 3, 1),
+    }
+
+    # The message must break the total down into video and sound tokens.
+    expected_error = r"GEN sequence length \(3 = video tokens 2 \+ sound tokens 1\).*combined media sequence"
+    with pytest.raises(ValueError, match=expected_error):
+        model(**forward_kwargs)
+
+
+def test_reset_cache_clears_und_and_gen_cache() -> None:
+    """``reset_cache`` must set both ``cached_kv`` and ``cached_freqs_gen`` back to ``None``."""
+    from vllm_omni.diffusion.models.cosmos3.transformer_cosmos3 import Cosmos3VFMTransformer
+
+    model = object.__new__(Cosmos3VFMTransformer)
+    nn.Module.__init__(model)
+    cache_attrs = ("cached_kv", "cached_freqs_gen")
+    for attr in cache_attrs:
+        setattr(model, attr, object())  # any non-None placeholder
+    model.reset_cache()
+    for attr in cache_attrs:
+        assert getattr(model, attr) is None
+
+
+def test_compute_rope_freqs_pads_text_and_offsets_vision_positions() -> None:
+    """Padded text slots are expected at position 0; vision starts at text length + modality margin."""
+    from vllm_omni.diffusion.models.cosmos3.transformer_cosmos3 import Cosmos3VFMTransformer
+
+    class RecordingRotary:
+        """Rotary stand-in that records every ``position_ids`` tensor it receives."""
+
+        def __init__(self) -> None:
+            self.position_ids: list[torch.Tensor] = []
+
+        def __call__(self, x, position_ids):
+            self.position_ids.append(position_ids.detach().cpu())
+            batch, seq = position_ids.shape[1:3]
+            return torch.zeros(batch, seq, 4), torch.ones(batch, seq, 4)
+
+    rotary = RecordingRotary()
+    model = object.__new__(Cosmos3VFMTransformer)
+    nn.Module.__init__(model)
+    model.language_model = SimpleNamespace(rotary_emb=rotary)
+    model.temporal_modality_margin, model.base_fps = 100, 24.0
+    model.temporal_compression_factor, model.enable_fps_modulation = 4, False
+
+    # Sample 0 has two unmasked text tokens, sample 1 only one.
+    text_mask = torch.tensor([[1, 1, 0], [1, 0, 0]], dtype=torch.long)
+    freqs_und, freqs_gen = model._compute_rope_freqs(
+        text_mask=text_mask,
+        t=2,
+        hp=1,
+        wp=1,
+        fps=24.0,
+        device=torch.device("cpu"),
+        dtype=torch.float32,
+    )
+
+    text_pos, vision_pos = rotary.position_ids
+    assert text_pos[:, 0, :].tolist() == [[0, 1, 0]] * 3
+    assert text_pos[:, 1, :].tolist() == [[0, 0, 0]] * 3
+    assert vision_pos[0, 0].tolist() == [102, 103]
+    assert vision_pos[0, 1].tolist() == [101, 102]
+    assert (freqs_und[0].shape, freqs_gen[0].shape) == ((2, 3, 1, 4), (2, 2, 1, 4))
+
+
+def test_compute_rope_freqs_appends_sound_positions_after_vision() -> None:
+    """Sound positions are expected after the video ones in the GEN sequence, restarting at 102."""
+    from vllm_omni.diffusion.models.cosmos3.transformer_cosmos3 import Cosmos3VFMTransformer
+
+    class RecordingRotary:
+        """Rotary stand-in that records every ``position_ids`` tensor it receives."""
+
+        def __init__(self) -> None:
+            self.position_ids: list[torch.Tensor] = []
+
+        def __call__(self, x, position_ids):
+            self.position_ids.append(position_ids.detach().cpu())
+            batch, seq = position_ids.shape[1:3]
+            return torch.zeros(batch, seq, 4), torch.ones(batch, seq, 4)
+
+    rotary = RecordingRotary()
+    model = object.__new__(Cosmos3VFMTransformer)
+    nn.Module.__init__(model)
+    model.language_model = SimpleNamespace(rotary_emb=rotary)
+    model.temporal_modality_margin, model.base_fps = 100, 24.0
+    model.temporal_compression_factor, model.enable_fps_modulation = 4, True
+    model.sound_latent_fps = 24.0
+
+    # Inputs: two text tokens, two 1x1 latent frames and three sound steps.
+    model._compute_rope_freqs(
+        text_mask=torch.tensor([[1, 1]], dtype=torch.long),
+        t=2,
+        hp=1,
+        wp=1,
+        fps=24.0,
+        device=torch.device("cpu"),
+        dtype=torch.float32,
+        t_sound=3,
+    )
+
+    # The second recorded call carries the GEN position ids.
+    _, gen_pos = rotary.position_ids
+    assert gen_pos.shape == (3, 1, 5)
+    # Video occupies 102 and 103; sound restarts at 102 in quarter steps.
+    expected_temporal = torch.tensor([102.0, 103.0, 102.0, 102.25, 102.5])
+    torch.testing.assert_close(gen_pos[0, 0], expected_temporal)
+
+
+def test_compute_rope_freqs_appends_action_positions_between_vision_and_sound() -> None:
+    """Action positions are expected between the video and sound positions of the GEN sequence."""
+    from vllm_omni.diffusion.models.cosmos3.transformer_cosmos3 import Cosmos3VFMTransformer
+
+    class RecordingRotary:
+        """Rotary stand-in that records every ``position_ids`` tensor it receives."""
+
+        def __init__(self) -> None:
+            self.position_ids: list[torch.Tensor] = []
+
+        def __call__(self, x, position_ids):
+            self.position_ids.append(position_ids.detach().cpu())
+            batch, seq = position_ids.shape[1:3]
+            return torch.zeros(batch, seq, 4), torch.ones(batch, seq, 4)
+
+    rotary = RecordingRotary()
+    model = object.__new__(Cosmos3VFMTransformer)
+    nn.Module.__init__(model)
+    model.language_model = SimpleNamespace(rotary_emb=rotary)
+    model.temporal_modality_margin, model.base_fps = 100, 24.0
+    model.temporal_compression_factor, model.enable_fps_modulation = 4, False
+    model.sound_latent_fps = 24.0
+
+    model._compute_rope_freqs(
+        text_mask=torch.tensor([[1, 1]], dtype=torch.long),
+        t=2,
+        hp=1,
+        wp=1,
+        fps=24.0,
+        device=torch.device("cpu"),
+        dtype=torch.float32,
+        t_action=2,
+        action_start_frame_offset=1,
+        t_sound=1,
+    )
+
+    _, gen_pos = rotary.position_ids
+    assert gen_pos.shape == (3, 1, 5)
+    # Expected order: video (102, 103), action (103, 104), then sound (102).
+    assert gen_pos[0, 0].tolist() == [102, 103, 103, 104, 102]
+
+
+def test_compute_rope_freqs_promotes_mixed_video_sound_position_dtypes() -> None:
+    """Video and sound positions of differing dtypes are expected to be combined as float32."""
+    from vllm_omni.diffusion.models.cosmos3.transformer_cosmos3 import Cosmos3VFMTransformer
+
+    class RecordingRotary:
+        """Rotary stand-in that records every ``position_ids`` tensor it receives."""
+
+        def __init__(self) -> None:
+            self.position_ids: list[torch.Tensor] = []
+
+        def __call__(self, x, position_ids):
+            self.position_ids.append(position_ids.detach().cpu())
+            batch, seq = position_ids.shape[1:3]
+            return torch.zeros(batch, seq, 4), torch.ones(batch, seq, 4)
+
+    rotary = RecordingRotary()
+    model = object.__new__(Cosmos3VFMTransformer)
+    nn.Module.__init__(model)
+    model.language_model = SimpleNamespace(rotary_emb=rotary)
+    model.temporal_modality_margin, model.base_fps = 100, 24.0
+    model.temporal_compression_factor, model.enable_fps_modulation = 4, True
+    model.sound_latent_fps = 24.0
+
+    # With fps=None the video ids are presumably integer-typed while sound ids are fractional.
+    model._compute_rope_freqs(
+        text_mask=torch.tensor([[1, 1]], dtype=torch.long),
+        t=1,
+        hp=1,
+        wp=1,
+        fps=None,
+        device=torch.device("cpu"),
+        dtype=torch.float32,
+        t_sound=3,
+    )
+
+    # The second recorded call carries the GEN position ids.
+    _, gen_pos = rotary.position_ids
+    assert gen_pos.dtype == torch.float32
+    # One video position at 102, then three sound positions in quarter steps from 102.
+    expected_temporal = torch.tensor([102.0, 102.0, 102.25, 102.5])
+    torch.testing.assert_close(gen_pos[0, 0], expected_temporal)
diff --git a/tests/diffusion/test_diffusion_ipc.py b/tests/diffusion/test_diffusion_ipc.py
new file mode 100644
index 00000000000..43e96b834f6
--- /dev/null
+++ b/tests/diffusion/test_diffusion_ipc.py
@@ -0,0 +1,25 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import torch
+
+from vllm_omni.diffusion.data import DiffusionOutput
+from vllm_omni.diffusion.ipc import pack_diffusion_output_shm, unpack_diffusion_output_shm
+
+
+def test_diffusion_output_dict_tensors_round_trip_through_shm() -> None:
+    """Packing replaces dict tensors with ``__tensor_shm__`` marker dicts; unpacking restores them."""
+    tensors = {
+        "image": torch.arange(300_000, dtype=torch.float32),
+        "video": torch.arange(300_000, dtype=torch.float32) * 2,
+    }
+    output = DiffusionOutput(output={**tensors, "metadata": {"keep": "inline"}})
+    pack_diffusion_output_shm(output)
+    for key in tensors:
+        assert output.output[key]["__tensor_shm__"] is True
+    assert output.output["metadata"] == {"keep": "inline"}
+
+    unpack_diffusion_output_shm(output)
+    for key, original in tensors.items():
+        torch.testing.assert_close(output.output[key], original)
+    assert output.output["metadata"] == {"keep": "inline"}
diff --git a/tests/e2e/accuracy/test_cosmos3_similarity.py b/tests/e2e/accuracy/test_cosmos3_similarity.py
new file mode 100644
index 00000000000..166c56a9318
--- /dev/null
+++ b/tests/e2e/accuracy/test_cosmos3_similarity.py
@@ -0,0 +1,155 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from __future__ import annotations
+
+import base64
+import io
+import json
+import os
+from pathlib import Path
+
+import pytest
+import requests
+import torch
+from PIL import Image
+
+from tests.e2e.accuracy.helpers import model_output_dir
+from tests.helpers.mark import hardware_test
+from tests.helpers.runtime import OmniServer
+
+pytestmark = [pytest.mark.full_model, pytest.mark.diffusion]
+
+
+MODEL_ENV_VAR = "VLLM_TEST_COSMOS3_MODEL"
+MODEL_ID = "cosmos3"
+PROMPT = "A small warehouse robot moves a blue box across a clean floor."
+NEGATIVE_PROMPT = "blurry, distorted, low quality"
+SEED = 42
+WIDTH = 256
+HEIGHT = 256
+NUM_INFERENCE_STEPS = 2
+
+
+def _model_name() -> str:
+    """Return the model from ``MODEL_ENV_VAR``, skipping the calling test when it is unset or empty."""
+    if model := os.environ.get(MODEL_ENV_VAR):
+        return model
+    pytest.skip(f"Set {MODEL_ENV_VAR} to run Cosmos3 full-model smoke tests.")
+
+
+def _server_args() -> list[str]:
+    """Return the command-line arguments passed to ``OmniServer`` by the smoke tests."""
+    # Flag -> value pairs, flattened below into a flat ``[flag, value, ...]`` list.
+    flags = {
+        "--num-gpus": "1",
+        "--model-class-name": "Cosmos3OmniDiffusersPipeline",
+        "--stage-init-timeout": "900",
+        "--init-timeout": "1200",
+    }
+
+    return [token for flag_and_value in flags.items() for token in flag_and_value]
+
+
+def _image_data_url(image: Image.Image) -> str:
+    """Encode ``image`` as PNG and return it as a base64 ``data:image/png`` URL."""
+    with io.BytesIO() as png_buffer:
+        image.save(png_buffer, format="PNG")
+        return "data:image/png;base64," + base64.b64encode(png_buffer.getvalue()).decode("ascii")
+
+
+@pytest.mark.benchmark
+@hardware_test(res={"cuda": "H100"}, num_cards=1)
+def test_cosmos3_t2i_serving_smoke(accuracy_artifact_root: Path) -> None:
+    """Smoke-test text-to-image generation through ``/v1/images/generations`` on a live server."""
+    if not torch.cuda.is_available():
+        pytest.skip("Cosmos3 full-model smoke tests require CUDA.")
+
+    model = _model_name()
+    output_dir = model_output_dir(accuracy_artifact_root, MODEL_ID)
+    generation_params = {
+        "prompt": PROMPT,
+        "negative_prompt": NEGATIVE_PROMPT,
+        "size": f"{WIDTH}x{HEIGHT}",
+        "n": 1,
+        "response_format": "b64_json",
+        "num_inference_steps": NUM_INFERENCE_STEPS,
+        "guidance_scale": 1.0,
+        "seed": SEED,
+    }
+    with OmniServer(model, _server_args(), use_omni=True) as server:
+        endpoint = f"http://{server.host}:{server.port}/v1/images/generations"
+        response = requests.post(endpoint, json={"model": server.model, **generation_params}, timeout=1800)
+
+    response.raise_for_status()
+    images = response.json()["data"]
+    assert len(images) == 1
+    png_bytes = base64.b64decode(images[0]["b64_json"])
+    image = Image.open(io.BytesIO(png_bytes)).convert("RGB")
+    # Save the artifact before asserting so a size mismatch can still be inspected.
+    image.save(output_dir / "cosmos3_t2i.png")
+    assert image.size == (WIDTH, HEIGHT)
+
+
+@pytest.mark.benchmark
+@hardware_test(res={"cuda": "H100"}, num_cards=1)
+def test_cosmos3_t2v_sync_serving_smoke(accuracy_artifact_root: Path) -> None:
+    """Smoke-test text-to-video generation through ``/v1/videos/sync`` on a live server."""
+    if not torch.cuda.is_available():
+        pytest.skip("Cosmos3 full-model smoke tests require CUDA.")
+
+    model = _model_name()
+    output_dir = model_output_dir(accuracy_artifact_root, MODEL_ID)
+    # Sent as form fields via ``data=``, hence the string-typed values.
+    form_fields = {
+        "prompt": PROMPT,
+        "negative_prompt": NEGATIVE_PROMPT,
+        "size": f"{WIDTH}x{HEIGHT}",
+        "num_frames": "1",
+        "fps": "1",
+        "num_inference_steps": str(NUM_INFERENCE_STEPS),
+        "guidance_scale": "1.0",
+        "seed": str(SEED),
+    }
+    with OmniServer(model, _server_args(), use_omni=True) as server:
+        endpoint = f"http://{server.host}:{server.port}/v1/videos/sync"
+        response = requests.post(endpoint, data={"model": server.model, **form_fields}, timeout=1800)
+
+    response.raise_for_status()
+    assert response.headers["content-type"].startswith("video/mp4")
+    assert response.content
+    # Keep the returned clip as an artifact for manual inspection.
+    (output_dir / "cosmos3_t2v.mp4").write_bytes(response.content)
+
+
+@pytest.mark.benchmark
+@hardware_test(res={"cuda": "H100"}, num_cards=1)
+def test_cosmos3_i2v_sync_serving_smoke(accuracy_artifact_root: Path) -> None:
+    """Smoke-test image-to-video generation through ``/v1/videos/sync`` on a live server."""
+    if not torch.cuda.is_available():
+        pytest.skip("Cosmos3 full-model smoke tests require CUDA.")
+
+    model = _model_name()
+    output_dir = model_output_dir(accuracy_artifact_root, MODEL_ID)
+    # A flat-colour 96x64 image is sent as the reference image.
+    reference = Image.new("RGB", (96, 64), color=(40, 80, 160))
+    form_fields = {
+        "prompt": "The blue rectangle moves slowly forward.",
+        "negative_prompt": NEGATIVE_PROMPT,
+        "image_reference": json.dumps({"image_url": _image_data_url(reference)}),
+        "size": f"{WIDTH}x{HEIGHT}",
+        "num_frames": "5",
+        "fps": "1",
+        "num_inference_steps": str(NUM_INFERENCE_STEPS),
+        "guidance_scale": "1.0",
+        "seed": str(SEED),
+    }
+    with OmniServer(model, _server_args(), use_omni=True) as server:
+        endpoint = f"http://{server.host}:{server.port}/v1/videos/sync"
+        response = requests.post(endpoint, data={"model": server.model, **form_fields}, timeout=1800)
+
+    response.raise_for_status()
+    assert response.headers["content-type"].startswith("video/mp4")
+    assert response.content
+    # Keep the returned clip as an artifact for manual inspection.
+    (output_dir / "cosmos3_i2v.mp4").write_bytes(response.content)
diff --git a/tests/entrypoints/openai_api/test_image_server.py b/tests/entrypoints/openai_api/test_image_server.py
index 40adb7a9151..cdf6ca3f8a7 100644
--- a/tests/entrypoints/openai_api/test_image_server.py
+++ b/tests/entrypoints/openai_api/test_image_server.py
@@ -495,6 +495,7 @@ def test_generate_single_image(test_client):
     img_bytes = base64.b64decode(data["data"][0]["b64_json"])
     img = Image.open(io.BytesIO(img_bytes))
     assert img.size == (64, 64)  # Our mock returns 64x64 images
+    assert test_client.app.state.engine_client.captured_prompt["modalities"] == ["image"]
 
 
 def test_generate_images_async_omni_sampling_params(async_omni_test_client):
diff --git a/tests/entrypoints/openai_api/test_video_server.py b/tests/entrypoints/openai_api/test_video_server.py
index a29f4493c28..57a09872397 100644
--- a/tests/entrypoints/openai_api/test_video_server.py
+++ b/tests/entrypoints/openai_api/test_video_server.py
@@ -13,6 +13,7 @@
 import time
 from types import SimpleNamespace
 
+import numpy as np
 import pytest
 from fastapi import FastAPI
 from fastapi.testclient import TestClient
@@ -243,6 +244,7 @@ def _fake_encode(video, fps, audio=None, audio_sample_rate=None, **kwargs):
     _wait_for_status(test_client, video_id, VideoGenerationStatus.COMPLETED.value)
 
     engine = test_client.app.state.openai_serving_video._engine_client
+    assert engine.captured_prompt["modalities"] == ["video"]
     captured = engine.captured_sampling_params_list[0]
     assert captured.num_outputs_per_prompt == 1
     assert captured.width == 640
@@ -398,6 +400,8 @@ def test_sampling_params_pass_through(test_client, mocker: MockerFixture):
             "true_cfg_scale": "4.0",
             "boundary_ratio": "0.7",
             "flow_shift": "0.25",
+            "generate_sound": "true",
+            "sound_duration": "2.5",
         },
     )
 
@@ -412,6 +416,8 @@ def test_sampling_params_pass_through(test_client, mocker: MockerFixture):
     assert captured.true_cfg_scale == 4.0
     assert captured.boundary_ratio == 0.7
     assert captured.extra_args["flow_shift"] == 0.25
+    assert captured.extra_args["generate_sound"] is True
+    assert captured.extra_args["sound_duration"] == 2.5
 
 
 def test_frame_interpolation_params_pass_to_diffusion_sampling_params(test_client, mocker: MockerFixture):
@@ -622,6 +628,109 @@ async def _generate(prompt, request_id, sampling_params_list):
 
     assert completed["stage_durations"] == {"diffuse": 2.5, "vae.decode": 0.3}
     assert completed["peak_memory_mb"] == 4096.5
+    assert completed["action"] is None
+
+
+def test_video_generation_response_exposes_action_payload(mocker: MockerFixture):
+    """The video response is expected to expose the engine's action output with its metadata.
+
+    The batched (1, 2, 2) action array should surface as two rows with shape [2, 2].
+    """
+    engine = FakeAsyncOmni()
+    handler = OmniOpenAIServingVideo.for_diffusion(
+        diffusion_engine=engine,
+        model_name="Cosmos3-8B-UVA",
+    )
+
+    async def _generate(prompt, request_id, sampling_params_list):
+        del prompt, request_id, sampling_params_list
+        custom_output = {
+            "action": np.array([[[1.5, 2.5], [3.5, 4.5]]], dtype=np.float32),
+            "raw_action_dim": 2,
+            "action_mode": "policy",
+            "domain_id": 7,
+        }
+        yield MockVideoResult([object()], custom_output=custom_output)
+
+    engine.generate = _generate
+    mocker.patch(
+        "vllm_omni.entrypoints.openai.serving_video.encode_video_base64",
+        return_value="encoded-video",
+    )
+
+    request = VideoGenerationRequest(prompt="predict actions")
+    response = asyncio.run(handler.generate_videos(request, "action-json"))
+
+    action = response.data[0].action
+    assert action is not None
+    # The same rows must appear on the typed model and in its JSON dump.
+    expected_rows = [[1.5, 2.5], [3.5, 4.5]]
+    assert action.data == expected_rows
+    assert action.shape == [2, 2]
+    assert action.dtype == "float32"
+    assert action.raw_action_dim == 2
+    assert action.action_mode == "policy"
+    assert action.domain_id == 7
+    assert response.model_dump(mode="json")["data"][0]["action"]["data"] == expected_rows
+
+
+def test_video_job_persists_action_metadata(test_client, mocker: MockerFixture):
+    """A completed video job is expected to keep the action payload in its stored metadata.
+
+    Both the polled job and the ``/v1/videos`` listing should return the same action dict.
+    """
+    engine = test_client.app.state.openai_serving_video._engine_client
+
+    async def _generate(prompt, request_id, sampling_params_list):
+        engine.captured_prompt, engine.captured_sampling_params_list = prompt, sampling_params_list
+        custom_output = {
+            "action": np.array([[[1.0, 2.0], [3.0, 4.0]]], dtype=np.float32),
+            "raw_action_dim": 2,
+            "action_mode": "policy",
+            "domain_id": 7,
+        }
+        yield MockVideoResult([object()], custom_output=custom_output)
+
+    engine.generate = _generate
+    mocker.patch("vllm_omni.entrypoints.openai.serving_video._encode_video_bytes", return_value=b"fake-video")
+
+    response = test_client.post("/v1/videos", data={"prompt": "profile me"})
+    assert response.status_code == 200
+    video_id = response.json()["id"]
+    completed = _wait_for_status(test_client, video_id, VideoGenerationStatus.COMPLETED.value)
+
+    expected_action = {
+        "data": [[1.0, 2.0], [3.0, 4.0]],
+        "shape": [2, 2],
+        "dtype": "float32",
+        "raw_action_dim": 2,
+        "action_mode": "policy",
+        "domain_id": 7,
+    }
+    # The (1, 2, 2) array is expected to be stored as two rows with shape [2, 2].
+    assert completed["action"] == expected_action
+
+    # The listing endpoint must report the same persisted payload.
+    listed_videos = test_client.get("/v1/videos").json()["data"]
+    assert listed_videos[0]["action"] == expected_action
+
+
+def test_action_extraction_accepts_unbatched_action():
+    """A 2-D action array without a batch axis must still yield one action of shape [2, 2]."""
+    custom_output = {
+        "action": np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32),
+        "raw_action_dim": 2,
+        "action_mode": "policy",
+        "domain_id": 7,
+    }
+    result = MockVideoResult([object()], custom_output=custom_output)
+
+    actions = OmniOpenAIServingVideo._extract_action_outputs(result, expected_count=1)
+    first_action = actions[0]
+
+    assert first_action is not None
+    assert first_action.data == [[1.0, 2.0], [3.0, 4.0]]
+    assert first_action.shape == [2, 2]
 
 
 def test_missing_handler_returns_503():
@@ -755,6 +864,9 @@ def test_invalid_uploaded_input_reference_returns_400(test_client):
 def test_video_request_validation():
     req = VideoGenerationRequest(prompt="test")
     assert req.prompt == "test"
+    assert req.generate_sound is False
+    assert req.sound_duration is None
+    assert VideoGenerationRequest(prompt="test", generate_sound=True, sound_duration=1.5).generate_sound is True
     with pytest.raises(ValueError):
         VideoGenerationRequest(prompt="test", size="invalid")
 
@@ -767,6 +879,8 @@ def test_video_request_validation():
         VideoGenerationRequest(prompt="test", frame_interpolation_exp=0)
     with pytest.raises(ValueError):
         VideoGenerationRequest(prompt="test", frame_interpolation_scale=0)
+    with pytest.raises(ValueError):
+        VideoGenerationRequest(prompt="test", sound_duration=0)
 
 
 def test_list_videos_supports_order_after_and_limit(test_client, mocker: MockerFixture):
@@ -1063,6 +1177,8 @@ def test_sync_t2v_returns_video_bytes(test_client, mocker: MockerFixture):
     assert float(response.headers["x-inference-time-s"]) >= 0
     assert json.loads(response.headers["x-stage-durations"]) == {}
     assert float(response.headers["x-peak-memory-mb"]) == 0.0
+    engine = test_client.app.state.openai_serving_video._engine_client
+    assert engine.captured_prompt["modalities"] == ["video"]
 
 
 def test_sync_t2v_returns_profiler_headers(test_client, mocker: MockerFixture):
diff --git a/vllm_omni/diffusion/attention/backends/sdpa.py b/vllm_omni/diffusion/attention/backends/sdpa.py
index ab71e753b25..c650313698d 100644
--- a/vllm_omni/diffusion/attention/backends/sdpa.py
+++ b/vllm_omni/diffusion/attention/backends/sdpa.py
@@ -91,6 +91,8 @@ def __init__(
         self.softmax_scale = softmax_scale
         if backend_kwargs:
             logger.warning("SDPAImpl ignoring backend_kwargs: %s", list(backend_kwargs.keys()))
+        self.num_heads = num_heads
+        self.num_kv_heads = num_kv_heads
 
     def _forward_impl(
         self,
@@ -115,6 +117,7 @@ def _forward_impl(
             dropout_p=0.0,
             is_causal=self.causal,
             scale=self.softmax_scale,
+            enable_gqa=self.num_heads != self.num_kv_heads,
         )
         out = output.permute(0, 2, 1, 3)
         return out
diff --git a/vllm_omni/diffusion/cache/cache_dit_backend.py b/vllm_omni/diffusion/cache/cache_dit_backend.py
index 436ea29664a..9e9276e467d 100644
--- a/vllm_omni/diffusion/cache/cache_dit_backend.py
+++ b/vllm_omni/diffusion/cache/cache_dit_backend.py
@@ -1438,6 +1438,76 @@ def refresh_cache_context(pipeline: Any, num_inference_steps: int, verbose: bool
     return refresh_cache_context
 
 
+def enable_cache_for_cosmos3(pipeline: Any, cache_config: Any) -> Callable[[int], None]:
+    """Enable cache-dit for Cosmos3 (T2V and I2V).
+
+    Cosmos3 has a dual-pathway architecture (UND + GEN) but only the GEN
+    pathway (``gen_layers``) runs at every denoising step.  The UND pathway
+    computes once and its K/V are cached by the pipeline itself; no cache-dit
+    needed there.  We wrap only ``gen_layers`` via ``BlockAdapter``.
+
+    Args:
+        pipeline: The Cosmos3 pipeline instance.
+        cache_config: DiffusionCacheConfig instance with cache configuration.
+
+    Returns:
+        A refresh function ``(pipeline, num_inference_steps, verbose=True)`` that refreshes the cache-dit context.
+    """
+    db_cache_config = _build_db_cache_config(cache_config)
+
+    calibrator_config = None
+    if cache_config.enable_taylorseer:
+        taylorseer_order = cache_config.taylorseer_order
+        calibrator_config = TaylorSeerCalibratorConfig(taylorseer_order=taylorseer_order)
+        logger.info(f"TaylorSeer enabled with order={taylorseer_order}")
+
+    logger.info(
+        "Enabling cache-dit on Cosmos3 gen_layers: "
+        f"Fn={db_cache_config.Fn_compute_blocks}, "
+        f"Bn={db_cache_config.Bn_compute_blocks}, "
+        f"W={db_cache_config.max_warmup_steps}"
+    )
+
+    cache_dit.enable_cache(
+        BlockAdapter(
+            transformer=pipeline.transformer,
+            blocks=[pipeline.transformer.gen_layers],
+            # Cosmos3 GEN blocks return only hidden_states.  Per-layer UND K/V
+            # conditioning uses the transformer's cache-dit fallback path.
+            forward_pattern=[ForwardPattern.Pattern_3],
+            params_modifiers=[
+                ParamsModifier(
+                    cache_config=db_cache_config,
+                    calibrator_config=calibrator_config,
+                ),
+            ],
+            check_forward_pattern=False,
+            has_separate_cfg=True,
+        ),
+        cache_config=db_cache_config,
+        calibrator_config=calibrator_config,
+    )
+
+    def refresh_cache_context(pipeline: Any, num_inference_steps: int, verbose: bool = True) -> None:
+        if cache_config.scm_steps_mask_policy is None:  # no steps mask configured: only the step count changes
+            cache_dit.refresh_context(pipeline.transformer, num_inference_steps=num_inference_steps, verbose=verbose)
+        else:  # rebuild the steps mask for the new step count
+            cache_dit.refresh_context(
+                pipeline.transformer,
+                cache_config=DBCacheConfig().reset(
+                    num_inference_steps=num_inference_steps,
+                    steps_computation_mask=cache_dit.steps_mask(
+                        mask_policy=cache_config.scm_steps_mask_policy,
+                        total_steps=num_inference_steps,
+                    ),
+                    steps_computation_policy=cache_config.scm_steps_policy,
+                ),
+                verbose=verbose,
+            )
+
+    return refresh_cache_context
+
+
 # Register custom cache-dit enablers after function definitions
 CUSTOM_DIT_ENABLERS.update(
     {
@@ -1463,6 +1533,7 @@ def refresh_cache_context(pipeline: Any, num_inference_steps: int, verbose: bool
         "ErnieImagePipeline": enable_cache_for_ernie_image,
         "HunyuanVideo15Pipeline": enable_cache_for_hunyuan_video_15,
         "HunyuanVideo15I2VPipeline": enable_cache_for_hunyuan_video_15,
+        "Cosmos3OmniDiffusersPipeline": enable_cache_for_cosmos3,
     }
 )
 
diff --git a/vllm_omni/diffusion/diffusion_engine.py b/vllm_omni/diffusion/diffusion_engine.py
index fe4a4c77e5e..17259467a64 100644
--- a/vllm_omni/diffusion/diffusion_engine.py
+++ b/vllm_omni/diffusion/diffusion_engine.py
@@ -83,6 +83,18 @@ def supports_audio_output(model_class_name: str) -> bool:
     return bool(getattr(model_cls, "support_audio_output", False))
 
 
+def _move_tensor_tree_to_cpu(value: object) -> object:
+    """Rebuild nested dicts/lists/tuples with every non-CPU tensor moved to the CPU; other leaves pass through."""
+    if isinstance(value, torch.Tensor):
+        return value if value.device.type == "cpu" else value.cpu()
+    if isinstance(value, dict):
+        return {name: _move_tensor_tree_to_cpu(child) for name, child in value.items()}
+    if isinstance(value, (list, tuple)):
+        moved = [_move_tensor_tree_to_cpu(child) for child in value]
+        return moved if isinstance(value, list) else tuple(moved)
+    return value
+
+
 def get_extra_body_params(model_class_name: str) -> frozenset[str]:
     """Return the set of extra_body keys accepted by a pipeline.
 
@@ -223,12 +235,8 @@ async def step(self, request: OmniDiffusionRequest) -> list[OmniRequestOutput]:
         # post-processing to avoid device OOM — model weights may still
         # reside on the device and leave no headroom for intermediates.
         output_data = output.output
-        if (
-            self.od_config.enable_cpu_offload
-            and isinstance(output_data, torch.Tensor)
-            and output_data.device.type != "cpu"
-        ):
-            output_data = output_data.cpu()
+        if self.od_config.enable_cpu_offload:
+            output_data = _move_tensor_tree_to_cpu(output_data)
 
         postprocess_start_time = time.perf_counter()
         if self.post_process_func is not None:
@@ -249,7 +257,10 @@ async def step(self, request: OmniDiffusionRequest) -> list[OmniRequestOutput]:
             custom_output.update(outputs.get("custom_output") or {})
             model_audio_sample_rate = outputs.get("audio_sample_rate")
             model_fps = outputs.get("fps")
-            outputs = outputs.get("video", outputs)
+            if "image" in outputs:
+                outputs = outputs["image"]
+            elif "video" in outputs:
+                outputs = outputs["video"]
         postprocess_time = time.perf_counter() - postprocess_start_time
         logger.debug("Post-processing completed in %.4f seconds", postprocess_time)
 
diff --git a/vllm_omni/diffusion/ipc.py b/vllm_omni/diffusion/ipc.py
index 6a96533fd40..d4989da3d9e 100644
--- a/vllm_omni/diffusion/ipc.py
+++ b/vllm_omni/diffusion/ipc.py
@@ -85,16 +85,26 @@ def _pack_tensor_if_large(val: torch.Tensor) -> torch.Tensor | dict:
     return val
 
 
+def _pack_value_if_large(val: object) -> object:
+    """Apply ``_pack_tensor_if_large`` to ``val`` or, for a dict, to each of its (nested) values."""
+    if isinstance(val, dict):
+        return {name: _pack_value_if_large(item) for name, item in val.items()}
+    # Anything that is neither a tensor nor a dict (lists and tuples included) is returned unchanged.
+    return _pack_tensor_if_large(val) if isinstance(val, torch.Tensor) else val
+
+
 def _unpack_if_shm_handle(val: object) -> object:
     """Reconstruct a tensor from an SHM handle dict, or return as-is."""
     if isinstance(val, dict) and val.get("__tensor_shm__"):
         return _tensor_from_shm(val)
+    if isinstance(val, dict):  # any other dict: rebuild it with every value unpacked recursively
+        return {key: _unpack_if_shm_handle(value) for key, value in val.items()}
     return val
 
 
 def _pack_diffusion_fields(output: DiffusionOutput) -> DiffusionOutput:
-    if output.output is not None and isinstance(output.output, torch.Tensor):
-        output.output = _pack_tensor_if_large(output.output)
+    if output.output is not None:
+        output.output = _pack_value_if_large(output.output)
     if output.trajectory_latents is not None and isinstance(output.trajectory_latents, torch.Tensor):
         output.trajectory_latents = _pack_tensor_if_large(output.trajectory_latents)
     if output.trajectory_timesteps is not None and isinstance(output.trajectory_timesteps, torch.Tensor):
diff --git a/vllm_omni/diffusion/models/cosmos3/__init__.py b/vllm_omni/diffusion/models/cosmos3/__init__.py
new file mode 100644
index 00000000000..6df062b5c0d
--- /dev/null
+++ b/vllm_omni/diffusion/models/cosmos3/__init__.py
@@ -0,0 +1,16 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from .pipeline_cosmos3 import (
+    Cosmos3OmniDiffusersPipeline,
+    get_cosmos3_post_process_func,
+    get_cosmos3_pre_process_func,
+)
+from .transformer_cosmos3 import Cosmos3VFMTransformer
+
+__all__ = [
+    "Cosmos3OmniDiffusersPipeline",
+    "get_cosmos3_post_process_func",
+    "get_cosmos3_pre_process_func",
+    "Cosmos3VFMTransformer",
+]
diff --git a/vllm_omni/diffusion/models/cosmos3/action.py b/vllm_omni/diffusion/models/cosmos3/action.py
new file mode 100644
index 00000000000..e2572bbb733
--- /dev/null
+++ b/vllm_omni/diffusion/models/cosmos3/action.py
@@ -0,0 +1,217 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Action-token helpers for Cosmos3 UVA/action generation."""
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+from typing import Any
+
+import numpy as np
+import torch
+
+ACTION_MODE_POLICY = "policy"
+ACTION_MODE_FORWARD_DYNAMICS = "forward_dynamics"
+ACTION_MODE_INVERSE_DYNAMICS = "inverse_dynamics"
+ACTION_MODES = {  # every value normalize_action_mode() accepts
+    ACTION_MODE_POLICY,
+    ACTION_MODE_FORWARD_DYNAMICS,
+    ACTION_MODE_INVERSE_DYNAMICS,
+}
+
+
+EMBODIMENT_TO_DOMAIN_ID: dict[str, int] = {  # lower-case domain name -> domain id; ids 8 and 15 are shared
+    "no_action": 0,
+    "av": 1,
+    "camera_pose": 2,
+    "hand_pose": 3,
+    "pusht": 4,
+    "libero": 5,
+    "umi": 6,
+    "bridge_orig_lerobot": 7,
+    "droid_lerobot": 8,
+    "robomind-franka": 8,
+    "galbot": 9,
+    "robomind-franka-dual": 12,
+    "robomind-ur": 13,
+    "agibotworld": 15,
+    "agibot_gear_gripper": 15,
+    "agibot_gear_gripper_ext": 15,
+    "fractal": 20,
+}
+
+
+VIDEO_RES_SIZE_INFO: dict[str, dict[str, tuple[int, int]]] = {  # resolution key -> aspect key -> (width, height)
+    "256": {
+        "1,1": (256, 256),
+        "4,3": (320, 256),
+        "3,4": (256, 320),
+        "16,9": (320, 192),
+        "9,16": (192, 320),
+    },
+    "480": {
+        "1,1": (640, 640),
+        "4,3": (736, 544),
+        "3,4": (544, 736),
+        "16,9": (832, 480),
+        "9,16": (480, 832),
+    },
+    "704": {
+        "1,1": (960, 960),
+        "4,3": (1088, 832),
+        "3,4": (832, 1088),
+        "16,9": (1280, 704),
+        "9,16": (704, 1280),
+    },
+    "720": {
+        "1,1": (960, 960),
+        "4,3": (1104, 832),
+        "3,4": (832, 1104),
+        "16,9": (1280, 720),
+        "9,16": (720, 1280),
+    },
+}
+
+
+def normalize_action_mode(mode: Any) -> str | None:
+    """Return ``mode`` stripped and lower-cased, ``None`` for ``None``/blank input; raise on unknown modes."""
+    # None and blank strings both mean "no action mode requested".
+    candidate = "" if mode is None else str(mode).strip().lower()
+    if not candidate:
+        return None
+    if candidate in ACTION_MODES:
+        return candidate
+    raise ValueError(f"Unsupported Cosmos3 action_mode={mode!r}; expected one of {sorted(ACTION_MODES)}.")
+
+
+def resolve_domain_id(
+    *,
+    domain_id: Any = None,
+    domain_name: Any = None,
+    require_explicit: bool = False,
+) -> int:
+    """Resolve the domain id: an explicit ``domain_id`` wins, then a ``domain_name`` lookup, else ``0``."""
+    if domain_id is not None:
+        numeric_id = int(domain_id)
+        if numeric_id >= 0:
+            return numeric_id
+        raise ValueError(f"Cosmos3 domain_id must be non-negative, got {numeric_id}.")
+    name = "" if domain_name is None else str(domain_name).strip()
+    if name:
+        mapped = EMBODIMENT_TO_DOMAIN_ID.get(name.lower())
+        if mapped is None:
+            raise ValueError(
+                f"Unknown Cosmos3 action domain_name={domain_name!r}; "
+                f"expected one of {sorted(EMBODIMENT_TO_DOMAIN_ID)} or pass domain_id directly."
+            )
+        return mapped
+    if require_explicit:
+        raise ValueError(
+            "Cosmos3 action generation requires extra_args['domain_id'] or non-empty extra_args['domain_name']."
+        )
+    return 0
+
+
+def action_condition_indexes(mode: str, action_length: int) -> list[int]:
+    """Return the action-token indexes supplied as conditioning for ``mode``."""
+    resolved = normalize_action_mode(mode)
+    if resolved not in (ACTION_MODE_POLICY, ACTION_MODE_FORWARD_DYNAMICS, ACTION_MODE_INVERSE_DYNAMICS):
+        raise AssertionError(f"Unexpected action mode: {resolved!r}")
+    # Only forward dynamics conditions on actions, and then on every one of them.
+    return list(range(action_length)) if resolved == ACTION_MODE_FORWARD_DYNAMICS else []
+
+
+def vision_condition_indexes(mode: str, video_length: int, temporal_compression_factor: int) -> list[int]:
+    """Return the latent-frame indexes supplied as conditioning for ``mode``."""
+    resolved = normalize_action_mode(mode)
+    num_latent_frames = 1 + (video_length - 1) // temporal_compression_factor
+    if resolved not in (ACTION_MODE_POLICY, ACTION_MODE_FORWARD_DYNAMICS, ACTION_MODE_INVERSE_DYNAMICS):
+        raise AssertionError(f"Unexpected action mode: {resolved!r}")
+    # Inverse dynamics conditions on every latent frame; the other modes only on the first.
+    return list(range(num_latent_frames)) if resolved == ACTION_MODE_INVERSE_DYNAMICS else [0]
+
+
+def action_start_frame_offset(mode: str, action_length: int, video_length: int) -> int:
+    """Return 1 if there are ``num_frames - 1`` actions, 0 if ``num_frames``; ``mode`` is accepted but unused."""
+    del mode
+    for offset in (1, 0):
+        if action_length == video_length - offset:
+            return offset
+    raise ValueError(
+        "Cosmos3 action_chunk_size must equal num_frames - 1 or num_frames; "
+        f"got action_chunk_size={action_length}, num_frames={video_length}."
+    )
+
+
+def build_action_condition_mask(
+    mode: str,
+    action_length: int,
+    *,
+    device: torch.device,
+    dtype: torch.dtype,
+) -> torch.Tensor:
+    """Return a ``[1, action_length, 1]`` mask holding 1.0 at the conditioned action positions of ``mode``."""
+    cond_mask = torch.zeros(1, action_length, 1, device=device, dtype=dtype)
+    cond_mask[:, action_condition_indexes(mode, action_length), :] = 1.0
+    return cond_mask
+
+
+def build_vision_condition_mask(
+    mode: str,
+    video_length: int,
+    temporal_compression_factor: int,
+    *,
+    device: torch.device,
+    dtype: torch.dtype,
+) -> torch.Tensor:
+    """Return a ``[1, 1, latent_frames, 1, 1]`` mask holding 1.0 at the conditioned latent frames of ``mode``."""
+    num_latent_frames = 1 + (video_length - 1) // temporal_compression_factor
+    cond_mask = torch.zeros(1, 1, num_latent_frames, 1, 1, device=device, dtype=dtype)
+    cond_mask[:, :, vision_condition_indexes(mode, video_length, temporal_compression_factor), :, :] = 1.0
+    return cond_mask
+
+
+def pad_action_to_dim(action: torch.Tensor, action_dim: int) -> torch.Tensor:
+    """Zero-pad the last axis of ``action`` up to ``action_dim``; raise if it is already wider."""
+    missing = action_dim - action.shape[-1]
+    if missing < 0:
+        raise ValueError(f"Cosmos3 action dimension {action.shape[-1]} exceeds model action_dim={action_dim}.")
+    # An exact fit returns the input tensor itself (no copy).
+    return action if missing == 0 else torch.cat([action, action.new_zeros((*action.shape[:-1], missing))], dim=-1)
+
+
+def load_action_tensor(action: Any = None, action_path: str | Path | None = None) -> torch.Tensor:
+    """Return ``action`` (or the JSON file at ``action_path``) as a float32 ``[T, D]`` tensor."""
+    if action is None and action_path is None:
+        raise ValueError(
+            "Cosmos3 forward_dynamics action mode requires extra_args['action'] or extra_args['action_path']."
+        )
+    if action is None:
+        # JSON is UTF-8 by specification; do not fall back to the locale's default encoding.
+        action = json.loads(Path(str(action_path)).read_text(encoding="utf-8"))
+    tensor = action.detach() if isinstance(action, torch.Tensor) else torch.as_tensor(np.asarray(action))
+    tensor = tensor.to(dtype=torch.float32)
+    if tensor.ndim == 3 and tensor.shape[0] == 1:
+        tensor = tensor.squeeze(0)  # drop a leading size-1 axis so [1, T, D] is accepted as [T, D]
+    if tensor.ndim != 2:
+        raise ValueError(f"Cosmos3 action must have shape [T, D], got {tuple(tensor.shape)}.")
+    return tensor
+
+
+def find_closest_target_size(h: int, w: int, resolution: str | int) -> tuple[int, int]:
+    """Return the ``(width, height)`` preset of ``resolution`` closest in aspect ratio to ``h / w``.
+
+    Raises ``ValueError`` for an unknown ``resolution`` or a non-positive ``h`` or ``w``.
+    """
+    key = str(resolution)
+    if key not in VIDEO_RES_SIZE_INFO:
+        raise ValueError(
+            f"Unknown Cosmos3 action resolution={resolution!r}; expected one of {sorted(VIDEO_RES_SIZE_INFO)}."
+        )
+    if h <= 0 or w <= 0:
+        raise ValueError(f"Cosmos3 action input size must be positive, got h={h}, w={w}.")
+    input_ratio = h / w
+    # min() returns the first minimum, so ties go to the earliest preset in the table.
+    sizes = VIDEO_RES_SIZE_INFO[key].values()
+    return min(sizes, key=lambda size: abs(input_ratio - size[1] / size[0]))
diff --git a/vllm_omni/diffusion/models/cosmos3/audio_tokenizer/__init__.py b/vllm_omni/diffusion/models/cosmos3/audio_tokenizer/__init__.py
new file mode 100644
index 00000000000..cfb794705ba
--- /dev/null
+++ b/vllm_omni/diffusion/models/cosmos3/audio_tokenizer/__init__.py
@@ -0,0 +1,6 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from .avae import Cosmos3AVAEAudioTokenizer
+
+__all__ = ["Cosmos3AVAEAudioTokenizer"]
diff --git a/vllm_omni/diffusion/models/cosmos3/audio_tokenizer/activations.py b/vllm_omni/diffusion/models/cosmos3/audio_tokenizer/activations.py
new file mode 100755
index 00000000000..02678a4ef09
--- /dev/null
+++ b/vllm_omni/diffusion/models/cosmos3/audio_tokenizer/activations.py
@@ -0,0 +1,147 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# Implementation adapted from https://github.com/EdwardDixon/snake under the MIT license.
+
+
+import torch
+from torch import nn, pow, sin
+from torch.nn import Parameter
+
+
+# https://github.com/jaywalnut310/vits/blob/main/commons.py
+@torch.jit.script
+def fused_add_tanh_sigmoid_multiply(
+    input_a: torch.Tensor, input_b: torch.Tensor, n_channels: list[int]
+) -> torch.Tensor:  # gated activation of a + b: tanh(first C channels) * sigmoid(remaining channels)
+    n_channels_int = n_channels[0]  # C is passed as the first element of a list
+    in_act = input_a + input_b  # [B,2*C,T]
+    t_act = torch.tanh(in_act[:, :n_channels_int, :])  # [B,C,T]
+    s_act = torch.sigmoid(in_act[:, n_channels_int:, :])  # [B,C,T]
+    acts = t_act * s_act  # [B,C,T]
+    return acts  # [B,C,T]
+
+
+# about 10% faster training. Computes x + sin^2(alpha * x) / (beta + 1e-9); the 1e-9 avoids division by zero
+@torch.jit.script
+def fused_snake(x: torch.Tensor, alpha: torch.Tensor, beta: torch.Tensor) -> torch.Tensor:
+    return x + (1.0 / (beta + 1e-9)) * pow(sin(x * alpha), 2)
+
+
+class Snake(nn.Module):
+    """
+    Implementation of a sine-based periodic activation function
+    Shape:
+        - Input: (B, C, T)
+        - Output: (B, C, T), same shape as the input
+    Parameters:
+        - alpha - trainable parameter
+    References:
+        - This activation function is from this paper by Liu Ziyin, Tilman Hartwig, Masahito Ueda:
+        https://arxiv.org/abs/2006.08195
+    Examples:
+        >>> a1 = Snake(256)
+        >>> x = torch.randn(2, 256, 100)
+        >>> x = a1(x)
+    """
+
+    def __init__(
+        self, in_features: int, alpha: float = 1.0, alpha_trainable: bool = True, alpha_logscale: bool = True
+    ) -> None:
+        """
+        Initialization.
+        INPUT:
+            - in_features: number of channels C (length of the per-channel alpha vector)
+            - alpha: initial value of the linear-scale parameter; has no effect when alpha_logscale is True
+            - alpha_trainable: whether alpha requires gradients
+            - alpha_logscale: store alpha in log space (initialized to 0, i.e. exp(0) = 1)
+        """
+        super().__init__()
+        self.in_features = in_features
+
+        # initialize alpha
+        self.alpha_logscale = alpha_logscale
+        if self.alpha_logscale:  # log scale alphas initialized to zeros
+            self.alpha = Parameter(torch.zeros(in_features) * alpha)  # zeros * alpha stays zero
+        else:  # linear scale alphas initialized to ones
+            self.alpha = Parameter(torch.ones(in_features) * alpha)
+
+        self.alpha.requires_grad = alpha_trainable
+
+        self.no_div_by_zero = 0.000000001  # not read by forward(); fused_snake hard-codes the same 1e-9
+
+    def forward(self: "Snake", x: torch.Tensor) -> torch.Tensor:
+        """
+        Forward pass of the function.
+        Applies the function to the input elementwise.
+        Snake := x + 1/a * sin^2 (xa)
+        """
+        alpha = self.alpha.unsqueeze(0).unsqueeze(-1)  # [1,C,1]
+        if self.alpha_logscale:
+            alpha = torch.exp(alpha)  # [1,C,1]
+
+        return fused_snake(x, alpha, alpha)  # [B,C,T]; alpha is passed as both frequency and magnitude term
+        # x = x + (1.0 / (alpha + self.no_div_by_zero)) * pow(sin(x * alpha), 2)
+        # return x
+
+
+class SnakeBeta(nn.Module):
+    """
+    A modified Snake function which uses separate parameters for the magnitude of the periodic components
+    Shape:
+        - Input: (B, C, T)
+        - Output: (B, C, T), same shape as the input
+    Parameters:
+        - alpha - trainable parameter that controls frequency
+        - beta - trainable parameter that controls magnitude
+    References:
+        - Modified from the paper by Liu Ziyin, Tilman Hartwig, and Masahito Ueda:
+        https://arxiv.org/abs/2006.08195
+    Examples:
+        >>> a1 = SnakeBeta(256)
+        >>> x = torch.randn(2, 256, 100)
+        >>> x = a1(x)
+    """
+
+    def __init__(
+        self, in_features: int, alpha: float = 1.0, alpha_trainable: bool = True, alpha_logscale: bool = True
+    ) -> None:
+        """
+        Initialization.
+        INPUT:
+            - in_features: number of channels C (length of the per-channel alpha and beta vectors)
+            - alpha: initial value for both alpha and beta on the linear scale; no effect when alpha_logscale is True
+            - alpha_trainable: whether alpha and beta require gradients
+            - alpha_logscale: store alpha and beta in log space (initialized to 0, i.e. exp(0) = 1)
+            alpha scales the argument of the sine term, beta divides its magnitude;
+            there is no separate beta argument.
+        """
+        super().__init__()
+        self.in_features = in_features
+
+        # initialize alpha
+        self.alpha_logscale = alpha_logscale
+        if self.alpha_logscale:  # log scale alphas initialized to zeros
+            self.alpha = Parameter(torch.zeros(in_features) * alpha)  # zeros * alpha stays zero
+            self.beta = Parameter(torch.zeros(in_features) * alpha)
+        else:  # linear scale alphas initialized to ones
+            self.alpha = Parameter(torch.ones(in_features) * alpha)
+            self.beta = Parameter(torch.ones(in_features) * alpha)  # beta starts from the alpha argument too
+
+        self.alpha.requires_grad = alpha_trainable
+        self.beta.requires_grad = alpha_trainable
+
+        self.no_div_by_zero = 0.000000001  # not read by forward(); fused_snake hard-codes the same 1e-9
+
+    def forward(self: "SnakeBeta", x: torch.Tensor) -> torch.Tensor:
+        """
+        Forward pass of the function.
+        Applies the function to the input elementwise.
+        SnakeBeta := x + 1/b * sin^2 (xa)
+        """
+        alpha = self.alpha.unsqueeze(0).unsqueeze(-1)  # [1,C,1]
+        beta = self.beta.unsqueeze(0).unsqueeze(-1)  # [1,C,1]
+        if self.alpha_logscale:
+            alpha = torch.exp(alpha)  # [1,C,1]
+            beta = torch.exp(beta)  # [1,C,1]
+
+        return fused_snake(x, alpha, beta)  # [B,C,T]
diff --git a/vllm_omni/diffusion/models/cosmos3/audio_tokenizer/alias_free_torch/__init__.py b/vllm_omni/diffusion/models/cosmos3/audio_tokenizer/alias_free_torch/__init__.py
new file mode 100755
index 00000000000..28f76f7d706
--- /dev/null
+++ b/vllm_omni/diffusion/models/cosmos3/audio_tokenizer/alias_free_torch/__init__.py
@@ -0,0 +1,16 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0
+
+from .act import Activation1d
+from .filter import LowPassFilter1d, kaiser_sinc_filter1d, sinc
+from .resample import DownSample1d, UpSample1d
+
+__all__ = [
+    "Activation1d",
+    "LowPassFilter1d",
+    "kaiser_sinc_filter1d",
+    "sinc",
+    "DownSample1d",
+    "UpSample1d",
+]
diff --git a/vllm_omni/diffusion/models/cosmos3/audio_tokenizer/alias_free_torch/act.py b/vllm_omni/diffusion/models/cosmos3/audio_tokenizer/alias_free_torch/act.py
new file mode 100755
index 00000000000..0825c181fa5
--- /dev/null
+++ b/vllm_omni/diffusion/models/cosmos3/audio_tokenizer/alias_free_torch/act.py
@@ -0,0 +1,32 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0
+
+import torch.nn as nn
+
+from .resample import DownSample1d, UpSample1d
+
+
+class Activation1d(nn.Module):
+    """Apply ``activation`` between an ``UpSample1d`` stage and a ``DownSample1d`` stage."""
+
+    def __init__(
+        self,
+        activation: nn.Module,
+        up_ratio: int = 2,
+        down_ratio: int = 2,
+        up_kernel_size: int = 12,
+        down_kernel_size: int = 12,
+    ):
+        super().__init__()
+        self.up_ratio, self.down_ratio = up_ratio, down_ratio
+        # Submodule attribute names and their order are kept: they define the state_dict keys.
+        self.act = activation
+        self.upsample = UpSample1d(ratio=up_ratio, kernel_size=up_kernel_size)
+        self.downsample = DownSample1d(ratio=down_ratio, kernel_size=down_kernel_size)
+
+    def forward(self, x):
+        """Return ``downsample(act(upsample(x)))`` for ``x`` of shape ``[B, C, T]``."""
+        upsampled = self.upsample(x)
+        activated = self.act(upsampled)
+        return self.downsample(activated)
diff --git a/vllm_omni/diffusion/models/cosmos3/audio_tokenizer/alias_free_torch/filter.py b/vllm_omni/diffusion/models/cosmos3/audio_tokenizer/alias_free_torch/filter.py
new file mode 100755
index 00000000000..56a45011ed9
--- /dev/null
+++ b/vllm_omni/diffusion/models/cosmos3/audio_tokenizer/alias_free_torch/filter.py
@@ -0,0 +1,95 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0
+
+import math
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+if "sinc" in dir(torch):  # use torch.sinc when this torch build exposes it
+    sinc = torch.sinc
+else:
+    # This code is adapted from adefossez's julius.core.sinc under the MIT License
+    # https://adefossez.github.io/julius/julius/core.html
+    def sinc(x: torch.Tensor):
+        """
+        Implementation of sinc, i.e. sin(pi * x) / (pi * x)
+        __Warning__: Different to julius.sinc, the input is multiplied by `pi`!
+        """
+        return torch.where(  # x == 0 maps to 1.0 instead of evaluating 0 / 0
+            x == 0, torch.tensor(1.0, device=x.device, dtype=x.dtype), torch.sin(math.pi * x) / math.pi / x
+        )
+
+
+# This code is adapted from adefossez's julius.lowpass.LowPassFilters under the MIT License
+# https://adefossez.github.io/julius/julius/lowpass.html
+def kaiser_sinc_filter1d(cutoff, half_width, kernel_size):  # return filter [1,1,kernel_size]
+    """Build a Kaiser-windowed sinc low-pass kernel of shape ``[1, 1, kernel_size]`` (all zeros if cutoff is 0)."""
+    half_size = kernel_size // 2
+
+    # For kaiser window
+    delta_f = 4 * half_width
+    A = 2.285 * (half_size - 1) * math.pi * delta_f + 7.95
+    if A > 50.0:
+        beta = 0.1102 * (A - 8.7)
+    elif A >= 21.0:
+        beta = 0.5842 * (A - 21) ** 0.4 + 0.07886 * (A - 21.0)
+    else:
+        beta = 0.0
+    window = torch.kaiser_window(kernel_size, beta=beta, periodic=False)  # [kernel_size]
+
+    # ratio = 0.5/cutoff -> 2 * cutoff = 1 / ratio
+    if kernel_size % 2 == 0:
+        time = torch.arange(-half_size, half_size) + 0.5  # [kernel_size]
+    else:
+        time = torch.arange(kernel_size) - half_size  # [kernel_size]
+    if cutoff == 0:
+        # zeros_like(time) would be int64 for odd kernel sizes (integer arange); match the window dtype instead.
+        filter_ = torch.zeros_like(window)  # [kernel_size]
+    else:
+        filter_ = 2 * cutoff * window * sinc(2 * cutoff * time)  # [kernel_size]
+        # Normalize filter to have sum = 1, otherwise we will have a small leakage
+        # of the constant component in the input signal.
+        filter_ /= filter_.sum()
+
+    return filter_.view(1, 1, kernel_size)  # [1,1,kernel_size]
+
+
+class LowPassFilter1d(nn.Module):
+    """Per-channel (grouped) 1-D convolution of a ``[B, C, T]`` input with a Kaiser-windowed sinc kernel."""
+
+    def __init__(
+        self,
+        cutoff=0.5,
+        half_width=0.6,
+        stride: int = 1,
+        padding: bool = True,
+        padding_mode: str = "replicate",
+        kernel_size: int = 12,
+    ):
+        # kernel_size should be even number for stylegan3 setup,
+        # in this implementation, odd number is also possible.
+        super().__init__()
+        if cutoff < 0.0:
+            # Zero itself is accepted by this check, so the message says "non-negative".
+            raise ValueError("Cutoff must be non-negative.")
+        if cutoff > 0.5:
+            raise ValueError("A cutoff above 0.5 does not make sense.")
+        self.kernel_size = kernel_size
+        self.even = kernel_size % 2 == 0
+        self.pad_left = kernel_size // 2 - int(self.even)
+        self.pad_right = kernel_size // 2
+        self.stride = stride
+        self.padding = padding
+        self.padding_mode = padding_mode
+        # Registered as a buffer named "filter"; the name is part of the module's state_dict.
+        self.register_buffer("filter", kaiser_sinc_filter1d(cutoff, half_width, kernel_size))
+
+    def forward(self, x):  # x: [B,C,T]
+        """Optionally pad ``x`` on both sides, then filter every channel with the shared kernel."""
+        _, C, _ = x.shape
+        if self.padding:
+            x = F.pad(x, (self.pad_left, self.pad_right), mode=self.padding_mode)  # [B,C,T+pad_left+pad_right]
+        return F.conv1d(x, self.filter.expand(C, -1, -1), stride=self.stride, groups=C)  # [B,C,T//stride]
diff --git a/vllm_omni/diffusion/models/cosmos3/audio_tokenizer/alias_free_torch/resample.py b/vllm_omni/diffusion/models/cosmos3/audio_tokenizer/alias_free_torch/resample.py
new file mode 100755
index 00000000000..30e9663fe18
--- /dev/null
+++ b/vllm_omni/diffusion/models/cosmos3/audio_tokenizer/alias_free_torch/resample.py
@@ -0,0 +1,48 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0
+
+import torch.nn as nn
+from torch.nn import functional as F
+
+from .filter import LowPassFilter1d, kaiser_sinc_filter1d
+
+
+class UpSample1d(nn.Module):
+    """Upsample ``[B, C, T]`` to ``[B, C, T * ratio]`` with a Kaiser-windowed sinc kernel per channel."""
+
+    def __init__(self, ratio=2, kernel_size=None):
+        super().__init__()
+        self.ratio = ratio
+        self.kernel_size = int(6 * ratio // 2) * 2 if kernel_size is None else kernel_size
+        self.stride = ratio
+        self.pad = self.kernel_size // ratio - 1
+        self.pad_left = self.pad * self.stride + (self.kernel_size - self.stride) // 2
+        self.pad_right = self.pad * self.stride + (self.kernel_size - self.stride + 1) // 2
+        kernel = kaiser_sinc_filter1d(cutoff=0.5 / ratio, half_width=0.6 / ratio, kernel_size=self.kernel_size)
+        self.register_buffer("filter", kernel)
+
+    def forward(self, x):  # x: [B,C,T]
+        _, C, _ = x.shape
+
+        x = F.pad(x, (self.pad, self.pad), mode="replicate")  # [B,C,T+2*pad]
+        x = self.ratio * F.conv_transpose1d(
+            x, self.filter.expand(C, -1, -1), stride=self.stride, groups=C
+        )  # [B,C,T*ratio+pad_left+pad_right]
+        # Explicit end index: `-self.pad_right` would select nothing when pad_right == 0 (kernel_size == ratio).
+        return x[..., self.pad_left : x.shape[-1] - self.pad_right]  # [B,C,T*ratio]
+
+
+class DownSample1d(nn.Module):
+    """Downsample ``[B, C, T]`` by ``ratio`` through a strided ``LowPassFilter1d``."""
+
+    def __init__(self, ratio=2, kernel_size=None):
+        super().__init__()
+        self.ratio = ratio
+        self.kernel_size = kernel_size if kernel_size is not None else int(6 * ratio // 2) * 2
+        self.lowpass = LowPassFilter1d(
+            cutoff=0.5 / ratio, half_width=0.6 / ratio, stride=ratio, kernel_size=self.kernel_size
+        )
+
+    def forward(self, x):  # x: [B,C,T]
+        return self.lowpass(x)  # [B,C,T//ratio]
diff --git a/vllm_omni/diffusion/models/cosmos3/audio_tokenizer/avae.py b/vllm_omni/diffusion/models/cosmos3/audio_tokenizer/avae.py
new file mode 100644
index 00000000000..03367071f4f
--- /dev/null
+++ b/vllm_omni/diffusion/models/cosmos3/audio_tokenizer/avae.py
@@ -0,0 +1,271 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Local AVAE audio tokenizer used by Cosmos3 sound generation."""
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+from vllm.logger import init_logger
+
+from vllm_omni.diffusion.models.progress_bar import _is_rank_zero
+
+from .config import AttrDict
+from .models import load_generator
+
+logger = init_logger(__name__)
+
+
+def _default_avae_config(
+    *,
+    sample_rate: int,
+    audio_channels: int,
+    io_channels: int,
+    hop_size: int,
+) -> AttrDict:
+    """Return the built-in AVAE hyperparameters (spec_convnext encoder, oobleck decoder, vae bottleneck)."""
+    # Caller-supplied values feed sampling_rate, stereo, hop_size, dec_out_channels and vocoder_input_dim only.
+    return AttrDict(
+        model_type="autoencoder_v2",
+        sampling_rate=sample_rate,
+        stereo=(audio_channels == 2),
+        use_wav_as_input=True,
+        normalize_volume=True,
+        hop_size=hop_size,
+        input_channels=1,
+        enc_type="spec_convnext",
+        enc_dim=192,
+        enc_intermediate_dim=768,
+        enc_num_layers=12,
+        enc_num_blocks=2,
+        enc_n_fft=64,
+        enc_hop_length=16,
+        enc_latent_dim=128,
+        enc_c_mults=[1, 2, 4],
+        enc_strides=[4, 4, 8],
+        enc_identity_init=False,
+        enc_use_snake=True,
+        dec_type="oobleck",
+        dec_dim=320,
+        dec_c_mults=[1, 2, 4, 8, 16],
+        dec_strides=[2, 4, 4, 8, 8],
+        dec_use_snake=True,
+        dec_final_tanh=False,
+        dec_out_channels=audio_channels,
+        dec_anti_aliasing=False,
+        dec_use_nearest_upsample=False,
+        dec_use_tanh_at_final=False,
+        bottleneck_type="vae",
+        bottleneck={"type": "vae"},
+        activation="snakebeta",
+        snake_logscale=True,
+        anti_aliasing=False,
+        use_cuda_kernel=False,
+        causal=False,
+        padding_mode="zeros",
+        vocoder_input_dim=io_channels,
+    )
+
+
+def _load_config(
+    config_path: str | Path | None,
+    *,
+    sample_rate: int,
+    audio_channels: int,
+    io_channels: int,
+    hop_size: int,
+) -> AttrDict:
+    """Return the AVAE config parsed from the JSON file ``config_path``, or the built-in defaults if it is falsy."""
+    if not config_path:
+        return _default_avae_config(
+            sample_rate=sample_rate, audio_channels=audio_channels, io_channels=io_channels, hop_size=hop_size
+        )
+    with open(config_path, encoding="utf-8") as config_file:
+        loaded = json.load(config_file)
+    # The keyword arguments are ignored on this path: the file is taken as-is.
+    return AttrDict(loaded)
+
+
+def _load_checkpoint(path: str | Path, map_location: torch.device | str) -> dict[str, torch.Tensor]:
+    """Read an AVAE checkpoint (.safetensors or torch.load format) and return its tensor-only state dict."""
+    ckpt_file = Path(path)
+    if ckpt_file.suffix != ".safetensors":
+        # NOTE(review): torch.load can unpickle arbitrary objects; only load trusted checkpoint files.
+        loaded = torch.load(ckpt_file, map_location=map_location)
+    else:
+        try:
+            from safetensors.torch import load_file
+        except ImportError as exc:
+            raise ImportError("Loading AVAE .safetensors checkpoints requires safetensors.") from exc
+        loaded = load_file(str(ckpt_file), device=str(map_location))
+
+    if not isinstance(loaded, dict):
+        raise TypeError(f"AVAE checkpoint must be a dict, got {type(loaded)!r}.")
+
+    # The weights may sit one level down; use the first wrapper key (in this order) that holds a dict.
+    wrapper_keys = ("generator", "state_dict", "model")
+    state = next((loaded[name] for name in wrapper_keys if isinstance(loaded.get(name), dict)), loaded)
+
+    # Drop non-tensor entries; a mapping that is already all tensors is returned as the same object.
+    tensors = {name: entry for name, entry in state.items() if isinstance(entry, torch.Tensor)}
+    if len(tensors) == len(state):
+        return state
+    if not tensors:
+        raise RuntimeError(f"No tensor state dict found in AVAE checkpoint keys: {list(state.keys())[:16]}")
+    return tensors
+
+
+def _strip_prefixes(
+    state_dict: dict[str, torch.Tensor],
+    model_state: dict[str, torch.Tensor],
+) -> dict[str, torch.Tensor]:
+    """Rename checkpoint keys by peeling "module."/"generator."/"model." prefixes until they match the model."""
+    wrappers = ("module.", "generator.", "model.")
+
+    def _variants(name: str) -> list[str]:
+        # ``name`` followed by each progressively less-prefixed form of it.
+        forms = [name]
+        while (hit := next((w for w in wrappers if forms[-1].startswith(w)), None)) is not None:
+            forms.append(forms[-1][len(hit) :])
+        return forms
+
+    renamed: dict[str, torch.Tensor] = {}
+    for name, tensor in state_dict.items():
+        forms = _variants(name)
+        # Prefer the least-stripped form the model knows; otherwise fall back to the fully stripped one.
+        target = next((form for form in forms if form in model_state), forms[-1])
+        renamed[target] = tensor
+    return renamed
+
+
+class Cosmos3AVAEAudioTokenizer(nn.Module):
+    """AVAE tokenizer/decoder for Cosmos3 audio latents (frozen, eval-mode wrapper around the local generator)."""
+
+    def __init__(
+        self,
+        *,
+        checkpoint_path: str | Path,
+        config_path: str | Path | None = None,  # JSON file; falsy -> built-in default config
+        sample_rate: int = 48000,
+        audio_channels: int = 2,
+        io_channels: int = 64,  # stored as latent_ch
+        hop_size: int = 1920,  # audio samples per latent frame (see temporal_compression_factor)
+        normalize_latents: bool = True,
+        normalization_type: str = "none",  # "none" or "tanh"
+        tanh_input_scale: float = 1.5,
+        tanh_output_scale: float = 3.5,
+        tanh_clamp: float = 0.995,
+        dtype: torch.dtype = torch.bfloat16,
+        device: torch.device | str = "cuda",
+    ) -> None:
+        super().__init__()
+        self.sample_rate = int(sample_rate)
+        self.audio_channels = int(audio_channels)
+        self.latent_ch = int(io_channels)
+        self.hop_size = int(hop_size)
+        self.dtype = dtype
+        self.device = torch.device(device)
+        self.normalize_volume = True  # encode() rescales each item so its peak is ~0.95
+        # normalize_latents=True upgrades the default "none" to "tanh"; any other explicit type is kept as given.
+        if normalization_type == "none" and normalize_latents:
+            normalization_type = "tanh"
+        self.normalization_type = normalization_type
+        self.tanh_input_scale = float(tanh_input_scale)
+        self.tanh_output_scale = float(tanh_output_scale)
+        self.tanh_clamp = float(tanh_clamp)
+        # Build the generator from the config, then load the checkpoint non-strictly after key-prefix cleanup.
+        config = _load_config(
+            config_path,
+            sample_rate=self.sample_rate,
+            audio_channels=self.audio_channels,
+            io_channels=self.latent_ch,
+            hop_size=self.hop_size,
+        )
+        self.model = load_generator(config.model_type, config, self.device)
+        state_dict = _strip_prefixes(
+            _load_checkpoint(checkpoint_path, self.device),
+            self.model.state_dict(),
+        )
+        matching_keys = set(state_dict).intersection(self.model.state_dict())
+        if not matching_keys:  # strict=False below would otherwise silently load nothing
+            raise RuntimeError("AVAE checkpoint did not contain any keys matching the local AVAE model.")
+        missing, unexpected = self.model.load_state_dict(state_dict, strict=False)
+        if _is_rank_zero():
+            logger.info(
+                "Loaded Cosmos3 AVAE checkpoint from %s; missing=%d unexpected=%d",
+                checkpoint_path,
+                len(missing),
+                len(unexpected),
+            )
+        # Inference-only setup: eval mode, no gradients, optional weight-norm removal, cast to the working dtype.
+        self.model.eval()
+        for param in self.model.parameters():
+            param.requires_grad = False
+        if hasattr(self.model, "remove_weight_norm"):
+            self.model.remove_weight_norm()
+        self.model.to(dtype=self.dtype)
+
+    @property
+    def temporal_compression_factor(self) -> int:  # alias of hop_size
+        return self.hop_size
+
+    def get_latent_num_samples(self, num_audio_samples: int) -> int:  # floor division by hop_size
+        return int(num_audio_samples) // self.temporal_compression_factor
+
+    def get_audio_num_samples(self, num_latent_samples: int) -> int:  # latent frames * hop_size
+        return int(num_latent_samples) * self.temporal_compression_factor
+
+    def _normalize_latent(self, latent: torch.Tensor) -> torch.Tensor:  # tanh(latent / in_scale) * out_scale
+        if self.normalization_type == "tanh":
+            in_dtype = latent.dtype  # the math runs in float32 and is cast back to this dtype
+            return (torch.tanh(latent.float() / self.tanh_input_scale) * self.tanh_output_scale).to(in_dtype)
+        if self.normalization_type != "none":
+            raise ValueError(f"Unsupported AVAE normalization_type={self.normalization_type!r}.")
+        return latent
+
+    def _denormalize_latent(self, latent: torch.Tensor) -> torch.Tensor:  # inverse of _normalize_latent
+        if self.normalization_type == "tanh":
+            in_dtype = latent.dtype
+            latent = torch.clamp(  # bound the atanh argument to [-tanh_clamp, tanh_clamp]
+                latent.float() / self.tanh_output_scale,
+                -self.tanh_clamp,
+                self.tanh_clamp,
+            )
+            return (torch.atanh(latent) * self.tanh_input_scale).to(in_dtype)
+        if self.normalization_type != "none":
+            raise ValueError(f"Unsupported AVAE normalization_type={self.normalization_type!r}.")
+        return latent
+
+    @torch.no_grad()
+    def encode(self, audio: torch.Tensor, force_pad: bool = False) -> torch.Tensor:  # audio: [B, C, T]
+        in_dtype = audio.dtype  # the result is cast back to the caller's dtype
+        x = audio.to(self.device)
+        if x.ndim != 3:
+            raise ValueError(f"AVAE audio input must be [B, C, T], got {tuple(x.shape)}.")
+        if x.shape[1] == 1 and self.audio_channels == 2:  # mono input, stereo model: duplicate the channel
+            x = x.repeat(1, 2, 1)
+        elif x.shape[1] > self.audio_channels:  # surplus channels are dropped
+            x = x[:, : self.audio_channels]
+        if self.normalize_volume:
+            x = x / (x.abs().amax(dim=(-2, -1), keepdim=True) + 1e-5) * 0.95
+        if force_pad or not self.model.training:  # __init__ puts the model in eval mode
+            pad_amount = (self.hop_size - (x.shape[-1] % self.hop_size)) % self.hop_size
+            if pad_amount:  # zero-pad the tail up to a multiple of hop_size
+                x = F.pad(x, (0, pad_amount), mode="constant", value=0)
+        encoded = self.model.encode(x.to(self.dtype))
+        latent = encoded["latent"] if isinstance(encoded, dict) else encoded
+        return self._normalize_latent(latent).to(in_dtype)
+
+    @torch.no_grad()
+    def decode(self, latent: torch.Tensor) -> torch.Tensor:  # returns audio clamped to [-1, 1]
+        in_dtype = latent.dtype
+        z = self._denormalize_latent(latent.to(self.device)).to(self.dtype)
+        decoded = self.model.decode(z)
+        if not isinstance(decoded, dict) or "decoder_out" not in decoded:
+            raise RuntimeError("AVAE decoder did not return decoder_out.")
+        audio = decoded["decoder_out"].clamp(-1.0, 1.0)
+        return audio.to(in_dtype)
diff --git a/vllm_omni/diffusion/models/cosmos3/audio_tokenizer/bottlenecks.py b/vllm_omni/diffusion/models/cosmos3/audio_tokenizer/bottlenecks.py
new file mode 100755
index 00000000000..191f653c470
--- /dev/null
+++ b/vllm_omni/diffusion/models/cosmos3/audio_tokenizer/bottlenecks.py
@@ -0,0 +1,133 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Bottleneck modules for AVAE tokenizer.
+
+This cleaned-up version only includes VAEBottleneck which is used
+by the spec_convnext encoder + oobleck decoder + vae configuration.
+"""
+
+from typing import Any
+
+import torch
+from torch import Tensor, nn
+
+
+# Base class
+class Bottleneck(nn.Module):
+    """Base class for bottleneck modules; subclasses must override ``encode`` and ``decode``."""
+
+    def __init__(self: "Bottleneck", is_discrete: bool = False) -> None:
+        super().__init__()
+        self.is_discrete = is_discrete  # stored flag; not read anywhere in this base class
+
+    def encode(
+        self: "Bottleneck", x: Tensor, return_info: bool = False, **kwargs: Any
+    ) -> Tensor | tuple[Tensor, dict[str, Any]]:
+        raise NotImplementedError  # abstract; per the annotation: a tensor, or (tensor, info dict)
+
+    def decode(self: "Bottleneck", x: Tensor, return_info: bool = False) -> Tensor | tuple[Tensor, dict[str, Any]]:
+        raise NotImplementedError  # abstract; same return convention as encode
+
+
+def vae_sample(mean: Tensor, scale: Tensor) -> tuple[Tensor, Tensor]:
+    """
+    Draw a reparameterized sample from the diagonal Gaussian described by ``mean`` and ``scale``.
+
+    Args:
+        mean: Mean of the latent distribution
+        scale: Pre-activation scale; ``softplus(scale) + 1e-4`` is used as the standard deviation
+
+    Returns:
+        latents: ``noise * stdev + mean`` with standard-normal ``noise``
+        kl: Scalar ``mean*mean + var - log(var) - 1`` summed over dim 1 and averaged over the rest
+            (no 1/2 factor is applied)
+    """
+    sigma = nn.functional.softplus(scale) + 1e-4  # [B,C,T]
+    noise = torch.randn_like(mean)  # [B,C,T]
+    sigma_sq = sigma * sigma  # [B,C,T]
+    kl_terms = mean * mean + sigma_sq - torch.log(sigma_sq) - 1  # [B,C,T]
+    sample = noise * sigma + mean  # [B,C,T]
+
+    return sample, kl_terms.sum(1).mean()
+
+
+class VAEBottleneck(Bottleneck):
+    """
+    Variational Autoencoder (VAE) bottleneck.
+
+    ``encode`` samples latents via ``vae_sample``; ``decode`` is the identity.
+    """
+
+    def __init__(self: "VAEBottleneck") -> None:
+        super().__init__(is_discrete=False)
+
+    def encode(
+        self: "VAEBottleneck", x: Tensor, return_info: bool = False, **kwargs: Any
+    ) -> Tensor | tuple[Tensor, dict[str, Any]]:
+        """
+        Split ``x`` into mean/scale halves and draw a reparameterized sample.
+
+        Args:
+            x: Input tensor with shape [B, C*2, T]; the first C channels are the
+               mean and the last C channels the (pre-softplus) scale
+            return_info: Whether to also return the info dict
+            **kwargs: Accepted for interface compatibility and ignored
+
+        Returns:
+            The sampled latents [B, C, T]; with ``return_info`` a tuple
+            ``(latents, {"kl": kl})`` where ``kl`` comes from ``vae_sample``
+
+        Note: sampling draws fresh noise (``torch.randn_like``) on every call.
+        """
+        mean, scale = x.chunk(2, dim=1)  # mean,scale: [B,C,T]
+        latents, kl = vae_sample(mean, scale)  # latents: [B,C,T]
+
+        # The KL term is always computed by vae_sample; it is only surfaced on request.
+        if not return_info:
+            return latents
+        return latents, {"kl": kl}
+
+    def decode(self: "VAEBottleneck", x: Tensor, return_info: bool = False) -> Tensor | tuple[Tensor, dict[str, Any]]:
+        """
+        Pass latents through unchanged (identity operation for VAE).
+
+        Args:
+            x: Latent tensor
+            return_info: Whether to also return an (empty) info dict
+
+        Returns:
+            ``x`` itself, or ``(x, {})`` when ``return_info`` is set
+        """
+        if not return_info:
+            return x
+
+        # Nothing is computed on the decode path, so the info dict stays empty.
+        return x, {}
+
+
+def create_bottleneck_from_config(bottleneck_config: dict[str, Any]) -> Bottleneck:
+    """
+    Create a bottleneck module from configuration.
+
+    Args:
+        bottleneck_config: Dictionary with 'type' key specifying bottleneck type
+
+    Returns:
+        Bottleneck module instance
+
+    Raises:
+        ValueError: If 'type' is missing or None (raised explicitly so the check survives ``python -O``)
+        NotImplementedError: If the type is anything other than 'vae'
+
+    Note:
+        This cleaned version only supports 'vae' bottleneck type.
+    """
+    bottleneck_type = bottleneck_config.get("type")
+    if bottleneck_type is None:
+        raise ValueError("type must be specified in bottleneck config")
+    if bottleneck_type != "vae":
+        raise NotImplementedError(
+            f"Bottleneck type '{bottleneck_type}' not supported in cleaned AVAE. "
+            f"Only 'vae' is supported for the spec_convnext + oobleck + vae configuration."
+        )
+    return VAEBottleneck()
diff --git a/vllm_omni/diffusion/models/cosmos3/audio_tokenizer/config.py b/vllm_omni/diffusion/models/cosmos3/audio_tokenizer/config.py
new file mode 100644
index 00000000000..c52a956ce4b
--- /dev/null
+++ b/vllm_omni/diffusion/models/cosmos3/audio_tokenizer/config.py
@@ -0,0 +1,20 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# Adapted from https://github.com/jik876/hifi-gan under the MIT license.
+
+from typing import Any
+
+
+class AttrDict(dict):
+    """Dict whose keys are also attributes; nested dicts (also inside lists) become ``AttrDict`` at construction."""
+
+    def __init__(self: "AttrDict", *args: Any, **kwargs: Any) -> None:
+        super().__init__((name, self._convert(raw)) for name, raw in dict(*args, **kwargs).items())
+        self.__dict__ = self  # attribute access and item access share one storage
+
+    @classmethod
+    def _convert(cls, value: Any) -> Any:
+        # Lists are rebuilt element by element; plain dicts are wrapped; everything else passes through.
+        if isinstance(value, list):
+            return [cls._convert(element) for element in value]
+        return cls(value) if isinstance(value, dict) and not isinstance(value, AttrDict) else value
diff --git a/vllm_omni/diffusion/models/cosmos3/audio_tokenizer/models.py b/vllm_omni/diffusion/models/cosmos3/audio_tokenizer/models.py
new file mode 100755
index 00000000000..41ebe5b7b65
--- /dev/null
+++ b/vllm_omni/diffusion/models/cosmos3/audio_tokenizer/models.py
@@ -0,0 +1,614 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# Adapted from https://github.com/jik876/hifi-gan under the MIT license.
+
+"""AVAE Models.
+
+This file contains only the models needed for the spec_convnext encoder +
+oobleck decoder + vae configuration.
+"""
+
+import math
+from collections.abc import Callable
+from functools import partial
+from typing import Any
+
+import torch
+import torch.nn.functional as F
+from torch import Tensor, nn
+from torch.nn.utils import remove_weight_norm
+from torch.nn.utils.parametrize import remove_parametrizations
+
+from .config import AttrDict
+from .modules import ConvNeXtBlock, OobleckDecoderBlock, WNConv1d, get_activation
+
+# for causal models we use encodec modules
+from .modules_encodec import SConv1d
+
+
+def load_generator(model_type: str, h: AttrDict, device: torch.device | str) -> nn.Module:
+    """
+    Build the AVAE generator named by ``model_type`` from config ``h`` and move it to ``device``.
+
+    Only 'autoencoder_v2' (``LatentAutoEncoderV2``) is available in this cleaned version;
+    any other value raises ``NotImplementedError``.
+    """
+    supported = ("autoencoder_v2",)
+    if model_type not in supported:
+        raise NotImplementedError(
+            f"Model type '{model_type}' not supported in cleaned AVAE. Only 'autoencoder_v2' is supported."
+        )
+    # The module is constructed first and only then transferred to the target device.
+    return LatentAutoEncoderV2(h).to(device)
+
+
+class TrimPadding(nn.Module):
+    """Drop the last ``padding`` time steps of a [B,C,T] tensor (causal-conv helper for nn.Sequential)."""
+
+    def __init__(self: "TrimPadding", padding: int) -> None:
+        super().__init__()
+        self.padding = padding
+
+    def forward(self: "TrimPadding", x: torch.Tensor) -> torch.Tensor:
+        # An explicit end index is used because ``x[:, :, :-0]`` would be empty instead of untrimmed.
+        keep = max(x.shape[-1] - self.padding, 0)
+        return x[:, :, :keep]  # [B,C,T-padding]
+
+
+class SpectrogramConvNeXtEncoder(nn.Module):
+    """
+    Spectrogram Encoder with ConvNeXtBlocks
+
+    This encoder processes input waveforms by converting them into complex STFT spectrograms
+    (real and imaginary parts concatenated along the channel dimension) and encodes them
+    using a sequence of ConvNeXtBlocks and downsampling layers.
+
+    Args (read from the config object ``h``):
+        input_channels (int): Per-channel input count; doubled when ``h.stereo`` is true.
+        enc_dim (int): Base number of channels for the encoder.
+        enc_latent_dim (int): Output channels of the final projection (falls back to ``vocoder_input_dim``).
+        enc_c_mults (List[int]): Channel multipliers at each depth of the encoder.
+        enc_strides (List[int]): Downsampling strides for each depth.
+        enc_num_blocks (int): Number of ConvNeXtBlocks to stack per depth.
+        enc_identity_init (bool): Forwarded to ConvNeXtBlock as ``identity_init``.
+        enc_n_fft (int): Number of FFT points for spectrogram computation (also used as the window length).
+        enc_hop_length (int): Hop length for the STFT.
+        enc_use_snake (bool): Whether to use Snake activation in ConvNeXtBlocks.
+        causal (bool): If True, uses causal convolutions.
+        padding_mode (str): Padding mode for the non-causal downsampling convolutions.
+
+    Inputs:
+        x (torch.Tensor): Input waveform tensor of shape `[batch, in_channels, time]`.
+
+    Outputs:
+        torch.Tensor: Encoded representation of shape `[batch, time_out, latent_dim]`.
+
+    Forward Pass:
+        - Converts waveform input into spectrograms (concatenates real and imaginary parts).
+        - Processes the spectrogram through stacked ConvNeXtBlocks and downsampling layers.
+        - Outputs the final latent representation of specified dimensionality.
+
+    Example:
+        h = AttrDict({...})  # must provide the config fields listed above
+        encoder = SpectrogramConvNeXtEncoder(h)
+        waveform = torch.randn(8, 2, 65536)  # [batch, channels, time]
+        encoded = encoder(waveform)  # Output: [8, time_out, latent_dim]
+        # time_out is set by enc_hop_length and enc_strides
+
+    NOTE: output is in [B, T, C] to be consistent with other encoders
+    """
+
+    def __init__(self: "SpectrogramConvNeXtEncoder", h: AttrDict, **kwargs: Any) -> None:
+        super().__init__()
+
+        self.in_channels = h.input_channels
+        if getattr(h, "stereo", False):
+            self.in_channels *= 2
+
+        # if "enc_latent_dim" is found in v2 config, set it as latent_dim
+        if hasattr(h, "enc_latent_dim"):
+            self.latent_dim = h.enc_latent_dim
+        else:
+            # if not found, fallback to v1 logic
+            self.latent_dim = h.vocoder_input_dim
+            if h.model_type == "vae":
+                self.latent_dim *= 2
+
+        self.channels = h.enc_dim
+
+        self.c_mults = h.enc_c_mults
+        self.strides = h.enc_strides
+        self.num_blocks = h.enc_num_blocks
+        self.identity_init = h.enc_identity_init
+        self.causal = h.causal
+        self.padding_mode = h.padding_mode
+
+        self.use_snake = h.enc_use_snake
+
+        # Basic checks
+        assert len(self.c_mults) == len(self.strides), (
+            f"The length of c_mults and strides must match. Got {len(self.c_mults)} vs {len(self.strides)}."
+        )
+
+        # Spectrogram function
+        self.n_fft = h.enc_n_fft
+        self.hop_length = h.enc_hop_length
+        self.spectrogram_fn = partial(
+            self.spectrogram,
+            n_fft=self.n_fft,
+            hop_length=self.hop_length,
+            win_length=self.n_fft,
+            window_fn=torch.hann_window,
+        )
+
+        # ---------------------------------------------------------------------
+        # 1) Initial projection (similar to the first_conv in OobleckEncoder),
+        #    but here we typically use a 1x1 conv for a "spectrogram style" input.
+        #    Input width is (n_fft + 2) per audio channel: real + imaginary bins.
+        layers = []
+        layers.append(
+            WNConv1d((self.n_fft + 2) * self.in_channels, self.c_mults[0] * self.channels, kernel_size=1, bias=False)
+        )
+
+        # ---------------------------------------------------------------------
+        # 2) Stages: For each i in range(len(c_mults)):
+        #       - Stack num_blocks of ConvNeXtBlock
+        #       - Downsample via stride convolution
+        # ---------------------------------------------------------------------
+        for i in range(len(self.c_mults)):
+            dim_in = self.c_mults[i] * self.channels
+            # Determine output dimension for the block
+            if i < len(self.c_mults) - 1:  # If not the last block
+                dim_out = self.c_mults[i + 1] * self.channels
+            else:  # For the last block, dim_out is c_mults[-1] * channels
+                dim_out = self.c_mults[-1] * self.channels
+            ds_rate = self.strides[i]
+
+            # (a) Repeated ConvNeXtBlocks
+            for _ in range(self.num_blocks):
+                layers.append(
+                    ConvNeXtBlock(
+                        dim=dim_in,
+                        intermediate_dim=dim_in * 4,
+                        identity_init=self.identity_init,
+                        use_snake=self.use_snake,
+                        causal=self.causal,
+                    )
+                )
+
+            # (b) Downsampling convolution
+            layers.append(self._create_downsample_layer(dim_in, dim_out, ds_rate, self.causal, self.padding_mode))
+
+        # ---------------------------------------------------------------------
+        # 3) Final projection from the last channel dimension to latent_dim.
+        # ---------------------------------------------------------------------
+        layers.append(WNConv1d(self.c_mults[-1] * self.channels, self.latent_dim, kernel_size=1, bias=False))
+
+        self.layers = nn.Sequential(*layers)
+
+    def spectrogram(
+        self: "SpectrogramConvNeXtEncoder",
+        wav: Tensor,
+        n_fft: int,
+        hop_length: int,
+        win_length: int,
+        window_fn: Callable[[int], torch.Tensor] = torch.hann_window,
+    ) -> Tensor:
+        """
+        wav: [B_ch,T_audio] where B_ch = batch * channels (channel folded into batch)
+        returns: [B_ch,n_fft//2+1,T_frames] complex
+        """
+        pad_size_l = (n_fft - hop_length) // 2  # manual padding, since the STFT below runs with center=False
+        pad_size_r = (n_fft - hop_length) - pad_size_l
+        with torch.autocast(device_type=wav.device.type, enabled=False):  # the STFT is computed in float32
+            wav = F.pad(wav, (pad_size_l, pad_size_r)).float()  # [B_ch,T_audio+pad]
+            spec = torch.stft(
+                wav,
+                n_fft,
+                hop_length=hop_length,
+                win_length=win_length,
+                window=window_fn(win_length).to(wav),
+                center=False,
+                normalized=False,
+                onesided=True,
+                return_complex=True,
+            )  # [B_ch,n_fft//2+1,T_frames]
+        return spec  # [B_ch,n_fft//2+1,T_frames]
+
+    def _create_downsample_layer(
+        self: "SpectrogramConvNeXtEncoder",
+        in_channels: int,
+        out_channels: int,
+        stride: int,
+        causal: bool,
+        padding_mode: str,
+    ) -> nn.Module:
+        if (
+            causal
+        ):  # use EnCodec's SConv1d for convenience without reinventing the wheels. padding_mode is reflect by default
+            downsample_layer = SConv1d(
+                in_channels=in_channels,
+                out_channels=out_channels,
+                kernel_size=2 * stride,
+                stride=stride,
+                causal=True,
+                norm="weight_norm",
+            )
+        else:  # original non-causal implementation
+            downsample_layer = WNConv1d(
+                in_channels=in_channels,
+                out_channels=out_channels,
+                kernel_size=2 * stride,
+                stride=stride,
+                padding=math.ceil(stride / 2),
+                padding_mode=padding_mode,
+            )
+        return downsample_layer
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Args:
+            x: [B,C,T_audio] waveform (mono: C=1, stereo: C=2)
+
+        Returns:
+            [B,T_latent,latent_dim]
+        """
+
+        # Handle stereo input by merging channel dim into batch dim
+        batch, channels, length = x.shape
+        if channels > 1:  # Stereo case
+            x = x.reshape(batch * channels, 1, length)  # [B*C,1,T_audio] (channel folded into batch)
+
+        # Compute the spectrogram; view_as_real yields (real, imag), so "mag"/"ph" hold the real/imaginary parts
+        with torch.autocast(device_type=x.device.type, enabled=False):
+            spec = self.spectrogram_fn(x.float().squeeze(1))  # [B*C,n_fft//2+1,T_frames] complex
+            mag, ph = torch.view_as_real(spec).chunk(2, dim=-1)  # each [B*C,n_fft//2+1,T_frames,1]
+            spectrogram = torch.cat([mag, ph], dim=1).squeeze(-1)  # [B*C,n_fft+2,T_frames]
+
+        # Cast spectrogram back to original dtype
+        spectrogram = spectrogram.to(x.dtype)  # [B*C,n_fft+2,T_frames]
+
+        # Restore stereo structure if needed
+        if channels > 1:  # Stereo case
+            freq = spectrogram.shape[1]  # Get the frequency dimension
+            spectrogram = spectrogram.reshape(
+                batch, channels * freq, *spectrogram.shape[2:]
+            )  # [B,(n_fft+2)*C,T_frames]
+
+        # forward pass the encoder
+        output = self.layers(spectrogram)  # [B,latent_dim,T_latent]
+
+        return output.transpose(1, 2)  # [B,T_latent,latent_dim]
+
+    def remove_weight_norm(self: "SpectrogramConvNeXtEncoder") -> None:
+        for module in self.modules():
+            if hasattr(module, "parametrizations"):  # for new WN implementation using parameterizations
+                try:
+                    remove_parametrizations(module, "weight")
+                except ValueError:
+                    pass  # best effort: this module is left unchanged
+            elif hasattr(module, "weight"):
+                try:
+                    remove_weight_norm(module)
+                except ValueError:
+                    pass  # best effort: this module is left unchanged
+
+
+class OobleckDecoder(nn.Module):
+    """
+    Oobleck Decoder for audio synthesis.
+
+    Decodes latent representations [B,latent_dim,T_latent] into audio waveforms [B,C,T_audio]
+    using upsampling blocks with optional Snake activation and anti-aliasing.
+    """
+
+    def __init__(
+        self: "OobleckDecoder",
+        h: AttrDict,
+    ) -> None:
+        super().__init__()
+
+        self.h = h
+
+        latent_dim = self.h.vocoder_input_dim
+
+        out_channels = self.h.input_channels  # output width comes from input_channels (x2 when stereo)
+        if getattr(h, "stereo", False):
+            out_channels *= 2
+
+        channels = self.h.dec_dim
+        c_mults = self.h.dec_c_mults
+        strides = self.h.dec_strides
+        use_snake = self.h.dec_use_snake
+        use_nearest_upsample = self.h.dec_use_nearest_upsample
+        antialias_activation = self.h.dec_anti_aliasing
+        causal = self.h.causal
+        final_tanh = self.h.dec_use_tanh_at_final  # the only tanh switch consulted in this class
+        padding_mode = self.h.padding_mode
+
+        c_mults = [1, *c_mults]  # prepend 1 so c_mults[0] * channels is the width fed to the final conv
+
+        self.depth = len(c_mults)
+
+        # Padding for the first convolution layer
+        self.first_padding = 6 if causal else 3  # kernel 7: 3 keeps the length; causal pads 6 and trims 6 below
+        first_conv = WNConv1d(
+            in_channels=latent_dim,
+            out_channels=c_mults[-1] * channels,
+            kernel_size=7,
+            padding=self.first_padding,
+            padding_mode=padding_mode,
+        )
+
+        if causal:
+            first_conv = nn.Sequential(first_conv, TrimPadding(self.first_padding))
+
+        layers = [first_conv]
+
+        for i in range(self.depth - 1, 0, -1):  # walk the multipliers from the last (input side) down to the first
+            layers += [
+                OobleckDecoderBlock(
+                    in_channels=c_mults[i] * channels,
+                    out_channels=c_mults[i - 1] * channels,
+                    stride=strides[i - 1],
+                    use_snake=use_snake,
+                    antialias_activation=antialias_activation,
+                    use_nearest_upsample=use_nearest_upsample,
+                    causal=causal,
+                    padding_mode=padding_mode,
+                )
+            ]
+
+        # Padding for the final convolution layer
+        self.final_padding = 6 if causal else 3
+        final_conv = WNConv1d(
+            in_channels=c_mults[0] * channels,
+            out_channels=out_channels,
+            kernel_size=7,
+            padding=self.final_padding,
+            padding_mode=padding_mode,
+            bias=False,
+        )
+
+        if causal:
+            final_conv = nn.Sequential(final_conv, TrimPadding(self.final_padding))
+
+        layers += [
+            get_activation(
+                "snake" if use_snake else "elu", antialias=antialias_activation, channels=c_mults[0] * channels
+            ),
+            final_conv,
+            nn.Tanh() if final_tanh else nn.Identity(),
+        ]
+
+        self.layers = nn.Sequential(*layers)
+
+    def forward(self: "OobleckDecoder", x: torch.Tensor) -> torch.Tensor:
+        """
+        Args:
+            x: [B,latent_dim,T_latent]
+
+        Returns:
+            [B,C,T_audio]
+        """
+        x = self.layers(x)  # [B,C,T_audio]
+        return x  # [B,C,T_audio]
+
+    def remove_weight_norm(self: "OobleckDecoder") -> None:
+        for module in self.modules():
+            if hasattr(module, "parametrizations"):  # for new WN implementation using parameterizations
+                try:
+                    remove_parametrizations(module, "weight")
+                except ValueError:
+                    pass  # best effort: this module is left unchanged
+            elif hasattr(module, "weight"):
+                try:
+                    remove_weight_norm(module)
+                except ValueError:
+                    pass  # best effort: this module is left unchanged
+
+
+class LatentAutoEncoderV2(nn.Module):
+    """
+    Latent autoencoder: encoder -> bottleneck -> (optional) decoder, all built from the config ``h``.
+
+    Attributes:
+        h: Configuration object containing model hyperparameters.
+        encoder (nn.Module): The encoder module based on configuration.
+        bottleneck (Bottleneck): Bottleneck module built by bottlenecks.create_bottleneck_from_config.
+        decoder (nn.Module | None): The decoder module, or None when ``h.encoder_only`` is set.
+    """
+
+    def __init__(self: "LatentAutoEncoderV2", h: AttrDict) -> None:
+        super().__init__()
+        self.h = h
+
+        # Set up basic model properties
+        self.stereo = getattr(self.h, "stereo", False)
+
+        # Determine input type (this also writes input_channels back onto the config object h)
+        self.input_type = None
+        if getattr(self.h, "use_wav_as_input", False):
+            self.input_type = "waveform"
+            self.h.input_channels = 1
+        elif getattr(self.h, "use_linear_spec_as_input", False):
+            self.input_type = "linear"
+            self.h.input_channels = self.h.num_linears
+        elif getattr(self.h, "use_discrete_code_as_input", False):
+            self.input_type = "discrete_code"
+            self.h.input_channels = 1
+        else:
+            self.input_type = "mel"
+            self.h.input_channels = self.h.num_mels
+
+        # hop_size defines the down/up sampling factor of the autoencoder
+        self.hop_size = self.h.hop_size
+
+        # Initialize encoder (NOTE: the fallback "convnext" is itself rejected just below)
+        self.enc_type = getattr(self.h, "enc_type", "convnext")
+
+        # Define encoder (only spec_convnext supported in cleaned version)
+        if self.enc_type == "spec_convnext":
+            self.encoder = SpectrogramConvNeXtEncoder(self.h)
+        else:
+            raise NotImplementedError(
+                f"Encoder type '{self.enc_type}' not supported in cleaned AVAE. Only 'spec_convnext' is supported."
+            )
+
+        # Initialize encoder projector (Identity for spec_convnext)
+        self.encoder_proj = nn.Identity()
+
+        # Initialize bottleneck from config
+        from .bottlenecks import create_bottleneck_from_config
+
+        if hasattr(self.h, "bottleneck"):
+            self.bottleneck = create_bottleneck_from_config(self.h.bottleneck)
+        else:
+            raise ValueError("Bottleneck configuration must be specified")
+
+        # Check for encoder-only mode
+        self.encoder_only = getattr(self.h, "encoder_only", False)
+
+        if not self.encoder_only:
+            # Initialize decoder
+            self.dec_type = getattr(self.h, "dec_type", "oobleck")
+            if self.dec_type == "oobleck":
+                self.decoder = OobleckDecoder(self.h)
+            else:
+                raise NotImplementedError(
+                    f"Decoder type '{self.dec_type}' not supported in cleaned AVAE. Only 'oobleck' is supported."
+                )
+        else:
+            # Skip decoder initialization
+            self.decoder = None
+
+        # Whether to freeze encoder
+        self.freeze_encoder = getattr(self.h, "freeze_encoder", False)
+        if self.freeze_encoder:
+            for param in self.encoder.parameters():
+                param.requires_grad = False
+
+    def calculate_latent_lengths(self: "LatentAutoEncoderV2", audio_lengths: torch.Tensor) -> torch.Tensor:
+        """
+        Calculates the latent lengths given the original audio lengths.
+
+        Args:
+            audio_lengths (torch.Tensor): A tensor of shape [B] containing the lengths of the original audio samples.
+
+        Returns:
+            torch.Tensor: A tensor of shape [B] containing the corresponding latent lengths.
+        """
+        if self.input_type == "waveform":
+            # The latent length is the audio length divided by the hop_size
+            latent_lengths = torch.ceil(audio_lengths.float() / self.hop_size).long()  # [B]
+        else:
+            # The latent length is same as audio_lengths
+            latent_lengths = audio_lengths  # [B]
+
+        return latent_lengths
+
+    def forward(self: "LatentAutoEncoderV2", x: torch.Tensor) -> dict[str, torch.Tensor]:
+        """
+        Forward pass through the model.
+
+        Args:
+            x (torch.Tensor): Input tensor to the model with shape [B,C,T_audio].
+
+        Returns:
+            dict[str, torch.Tensor]: Dictionary of output tensors including:
+                - encoder_out: Raw encoder output
+                - latent: Bottleneck latent representation
+                - decoder_out: Decoded output (if decoder exists)
+                - Additional outputs specific to the bottleneck type
+        """
+        return_dict = {}
+
+        # Encoder
+        encoder_out = self.encoder(x)  # [B,T_latent,enc_latent_dim]
+        encoder_out_proj = self.encoder_proj(encoder_out)  # [B,T_latent,enc_latent_dim]
+
+        # Apply bottleneck after reshaping to [B, C, T] again
+        latent, bottleneck_enc_info = self.bottleneck.encode(
+            encoder_out_proj.transpose(1, 2),
+            return_info=True,  # transpose: [B,enc_latent_dim,T_latent]
+        )  # [B,C,T_latent]
+
+        # Update return dictionary
+        return_dict.update(
+            {"encoder_out": encoder_out.transpose(1, 2), "latent": latent}  # encoder_out: [B,enc_latent_dim,T_latent]
+        )
+        # Add bottleneck-specific info to return dict
+        for k, v in bottleneck_enc_info.items():
+            return_dict[k] = v
+
+        # Decode (if decoder exists)
+        if self.decoder is not None:
+            # Apply bottleneck decode
+            decoded_latent, bottleneck_dec_info = self.bottleneck.decode(latent, return_info=True)  # [B,C,T_latent]
+            # Apply decoder
+            decoder_out = self.decoder(decoded_latent)  # [B,C,T_audio]
+
+            # Update return dictionary
+            return_dict["decoder_out"] = decoder_out  # [B,C,T_audio]
+            # Add bottleneck-specific info to return dict
+            for k, v in bottleneck_dec_info.items():
+                return_dict[k] = v
+
+        return return_dict
+
+    def encode(self: "LatentAutoEncoderV2", x: torch.Tensor) -> dict[str, torch.Tensor]:
+        """
+        Encodes input x into latent representation using encoder and bottleneck.
+
+        Args:
+            x (torch.Tensor): Input tensor with shape [B, C, T].
+
+        Returns:
+            dict[str, torch.Tensor]: Dictionary containing:
+                - latent: Bottleneck latent representation
+                - Additional outputs specific to the bottleneck type
+        """
+        encoder_out = self.encoder(x)  # [B,T_latent,enc_latent_dim]
+        encoder_out_proj = self.encoder_proj(encoder_out)  # [B,T_latent,enc_latent_dim]
+        latent, bottleneck_info = self.bottleneck.encode(
+            encoder_out_proj.transpose(1, 2),
+            return_info=True,  # transpose: [B,enc_latent_dim,T_latent]
+        )  # [B,C,T_latent]
+
+        return_dict = {"latent": latent}  # latent: [B,C,T_latent]
+        # Add bottleneck-specific info to return dict
+        for k, v in bottleneck_info.items():
+            return_dict[k] = v
+
+        return return_dict
+
+    def decode(self: "LatentAutoEncoderV2", latent: torch.Tensor) -> dict[str, torch.Tensor]:
+        """
+        Decodes continuous latent representation into output using bottleneck and decoder.
+
+        Args:
+            latent (torch.Tensor): continuous latent representation with shape [B, C, T].
+        Returns:
+            dict[str, torch.Tensor]: Dictionary containing:
+                - decoder_out: The output from the decoder
+                - Additional outputs from the bottleneck decode process
+        Raises:
+            RuntimeError: If the model was built in encoder-only mode, i.e. ``self.decoder`` is None.
+        """
+        if self.decoder is None:  # fail with a clear message instead of "'NoneType' object is not callable"
+            raise RuntimeError("decode() requires a decoder, but this model was built with encoder_only=True")
+        decoded_latent, bottleneck_info = self.bottleneck.decode(latent, return_info=True)  # [B,C,T_latent]
+        decoder_out = self.decoder(decoded_latent)  # [B,C,T_audio]
+
+        return_dict = {"decoder_out": decoder_out}  # decoder_out: [B,C,T_audio]
+        # Add bottleneck-specific info to return dict
+        for k, v in bottleneck_info.items():
+            return_dict[k] = v
+
+        return return_dict
+
+    def remove_weight_norm(self: "LatentAutoEncoderV2") -> None:
+        """Remove weight normalization from the encoder and, when present, the decoder."""
+        self.encoder.remove_weight_norm()
+        if self.decoder is not None:
+            self.decoder.remove_weight_norm()
diff --git a/vllm_omni/diffusion/models/cosmos3/audio_tokenizer/modules.py b/vllm_omni/diffusion/models/cosmos3/audio_tokenizer/modules.py
new file mode 100755
index 00000000000..03c08938dcf
--- /dev/null
+++ b/vllm_omni/diffusion/models/cosmos3/audio_tokenizer/modules.py
@@ -0,0 +1,418 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# Adapted from https://github.com/jik876/hifi-gan under the MIT license.
+
+"""AVAE Modules.
+
+This file contains only the modules needed for the spec_convnext encoder +
+oobleck decoder + vae configuration.
+"""
+
+import math
+from typing import Any, Literal
+
+import torch
+import torch.nn.functional as F
+from torch import Tensor, nn
+from torch.cuda import amp
+from torch.nn.utils import weight_norm
+
+from . import activations
+from .alias_free_torch.act import Activation1d as TorchActivation1d
+
+# for causal models we use encodec modules
+from .modules_encodec import SConvTranspose1d
+
+
+def WNConv1d(*args: Any, **kwargs: Any) -> nn.Conv1d:
+    """Build an ``nn.Conv1d`` from the given arguments and wrap it with ``weight_norm``."""
+    return weight_norm(nn.Conv1d(*args, **kwargs))
+
+
+def WNConvTranspose1d(*args: Any, **kwargs: Any) -> nn.ConvTranspose1d:
+    """Build an ``nn.ConvTranspose1d`` from the given arguments and wrap it with ``weight_norm``."""
+    return weight_norm(nn.ConvTranspose1d(*args, **kwargs))
+
+
+def zero_module(module: nn.Module) -> nn.Module:
+    """
+    Set every parameter of ``module`` to zero in place and return that same module.
+    Used for identity initialization in ConvNeXt blocks.
+    """
+    for p in module.parameters():
+        p.detach().zero_()  # detach() shares storage, so the in-place zero reaches the parameter itself
+    return module
+
+
+def may_mask(
+    x: Tensor,
+    mask: Tensor | None = None,
+) -> Tensor:
+    """
+    Apply mask to tensor if provided.
+
+    Args:
+        x: Input tensor
+        mask: Optional mask tensor, multiplied element-wise into ``x`` (so it must broadcast against it)
+
+    Returns:
+        ``x * mask`` if mask is provided, otherwise ``x`` itself (no copy)
+    """
+    if mask is not None:
+        x = x * mask
+    return x
+
+
+class LayerNorm(nn.Module):
+    """
+    LayerNorm over the last dimension with a learnable scale and an optional bias (off by default).
+    NOTE(review): ``gamma0`` is accepted by the constructor but never used in this class.
+    """
+
+    def __init__(self, size: int, gamma0: float = 1, eps: float = 1e-5, use_bias: bool = False) -> None:
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(size))
+        self.bias = nn.Parameter(torch.zeros(size)) if use_bias else None
+        self.eps = eps
+        self.size = size
+
+    def forward(self, tensor: Tensor) -> Tensor:
+        """
+        Normalize ``tensor`` over its last dimension and return it in the input dtype.
+
+        Args:
+            tensor: Input tensor of shape (B, T, C)
+
+        Returns:
+            Normalized tensor
+        """
+        dtype = tensor.dtype
+        # Meant to run in fp32; NOTE(review): no explicit cast here, this relies on autocast - confirm
+        with amp.autocast(enabled=True, dtype=torch.float32):
+            tensor = F.layer_norm(tensor, self.weight.shape, self.weight, self.bias, self.eps)
+        return tensor.to(dtype)  # hand back the caller's dtype
+
+
+class ConvNeXtBlock(nn.Module):
+    """
+    ConvNeXt 1D Block adapted from https://github.com/charactr-platform/vocos
+    which is adapted from https://github.com/facebookresearch/ConvNeXt to 1D audio signal.
+    Supports causal and non-causal mode.
+
+    Args:
+        dim (int): Number of input channels.
+        intermediate_dim (int): Dimensionality of the intermediate layer.
+        identity_init (bool): If True, zero-initializes the final 1x1 conv so the block starts as identity.
+        use_snake (bool): If True, uses SnakeBeta activation; otherwise, GELU.
+        causal (bool): If True, zero-pads 6 samples on the left only; otherwise pads 3 on each side.
+    """
+
+    def __init__(
+        self,
+        dim: int,
+        intermediate_dim: int,
+        identity_init: bool = False,
+        use_snake: bool = False,
+        causal: bool = False,
+    ) -> None:
+        super().__init__()
+        self.causal = causal
+
+        if causal:
+            # Causal padding: Only pad on the left
+            self.dwconv = nn.Sequential(
+                nn.ConstantPad1d((6, 0), 0),  # causal padding (kernel_size - 1 on the left)
+                nn.Conv1d(dim, dim, kernel_size=7, groups=dim),  # groups=dim: one filter per channel
+            )
+        else:
+            # Non-causal padding: Symmetric padding
+            self.dwconv = nn.Sequential(
+                nn.ConstantPad1d((3, 3), 0),  # symmetric padding (kernel_size // 2 on both sides)
+                nn.Conv1d(dim, dim, kernel_size=7, groups=dim),  # groups=dim: one filter per channel
+            )
+
+        self.norm = LayerNorm(dim)
+        self.pwconv1 = nn.Conv1d(dim, intermediate_dim, 1)  # pointwise/1x1 convs
+        self.act = activations.SnakeBeta(intermediate_dim) if use_snake else nn.GELU()
+
+        if identity_init:
+            self.pwconv2 = zero_module(nn.Conv1d(intermediate_dim, dim, 1))
+        else:
+            self.pwconv2 = nn.Conv1d(intermediate_dim, dim, 1)
+
+    def forward(self, x: Tensor, mask: Tensor | None = None) -> Tensor:
+        """
+        Depthwise conv -> LayerNorm -> 1x1 conv -> activation -> 1x1 conv, added to the input.
+
+        Args:
+            x: Input tensor of shape (B, C, T)
+            mask: Optional mask, multiplied into the conv input and again into the block output
+
+        Returns:
+            Output tensor of shape (B, C, T)
+        """
+        residual = x  # [B,C,T]
+        x = self.dwconv(may_mask(x, mask))  # [B,C,T]
+        x = self.norm(x.permute(0, 2, 1)).permute(0, 2, 1)  # [B,C,T] -> [B,T,C] -> [B,C,T]
+        x = self.pwconv1(x)  # [B,intermediate_dim,T]
+        x = self.act(x)  # [B,intermediate_dim,T]
+        x = self.pwconv2(x)  # [B,C,T]
+        x = residual + x  # [B,C,T]
+        return may_mask(x, mask)  # [B,C,T]
+
+    def remove_weight_norm(self) -> None:
+        """No weight norm is applied in ConvNeXtBlock."""
+        pass
+
+
+def get_activation(
+    activation: Literal["elu", "snake", "none"],
+    antialias: bool = False,
+    channels: int | None = None,
+    use_cuda_kernel: bool = False,
+) -> nn.Module:
+    """
+    Get activation module by name.
+
+    Args:
+        activation: Activation type ('elu', 'snake', or 'none'); any other value raises ValueError
+        antialias: Whether to wrap the activation in the PyTorch Activation1d (anti-aliasing) module
+        channels: Number of channels, forwarded to SnakeBeta (only read for 'snake')
+        use_cuda_kernel: Must stay False; with antialias=True a True value raises NotImplementedError
+
+    Returns:
+        Activation module
+    """
+    if activation == "elu":
+        act = nn.ELU()
+    elif activation == "snake":
+        act = activations.SnakeBeta(channels)
+    elif activation == "none":
+        act = nn.Identity()
+    else:
+        raise ValueError(f"Unknown activation {activation}")
+
+    if antialias:
+        # Only the pure-PyTorch Activation1d is available in this port; the CUDA-kernel variant is rejected
+        if use_cuda_kernel:
+            raise NotImplementedError("CUDA kernels not supported in this port")
+        else:
+            Activation1d = TorchActivation1d
+
+        act = Activation1d(act)
+
+    return act
+
+
+class ResidualUnit(nn.Module):
+    """
+    Residual unit with dilated convolutions.
+    Used in OobleckDecoderBlock.
+
+    Args:
+        in_channels: Number of input channels (the residual add expects it to match out_channels)
+        out_channels: Number of output channels
+        dilation: Dilation rate
+        kernel_size: Convolution kernel size (default: 7)
+        use_snake: Whether to use Snake activation (default: False)
+        antialias_activation: Whether to use anti-aliasing (default: False)
+        causal: Whether to use causal convolutions (default: False)
+        padding_mode: Padding mode for convolutions (default: 'zeros')
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        dilation: int,
+        kernel_size: int = 7,
+        use_snake: bool = False,
+        antialias_activation: bool = False,
+        causal: bool = False,
+        padding_mode: str = "zeros",
+    ) -> None:
+        super().__init__()
+
+        self.dilation = dilation
+        self.causal = causal
+        self.kernel_size = kernel_size
+
+        if causal:
+            self.padding = dilation * (kernel_size - 1)
+        else:
+            self.padding = (dilation * (kernel_size - 1)) // 2
+
+        # original non-causal impl used zero padding (DAC, SAVAE)
+        # Reflect padding may reduce edge artifacts (EnCodec's default), but
+        # it increases VRAM usage during training.
+        self.padding_mode = padding_mode
+
+        self.layers = nn.Sequential(
+            get_activation("snake" if use_snake else "elu", antialias=antialias_activation, channels=out_channels),
+            WNConv1d(
+                in_channels=in_channels,
+                out_channels=out_channels,
+                kernel_size=kernel_size,
+                dilation=dilation,
+                padding=self.padding,
+                padding_mode=self.padding_mode,
+            ),
+            get_activation("snake" if use_snake else "elu", antialias=antialias_activation, channels=out_channels),
+            WNConv1d(in_channels=out_channels, out_channels=out_channels, kernel_size=1, padding=0),
+        )
+
+    def forward(self, x: Tensor) -> Tensor:
+        """
+        Apply the conv stack and add the input back (residual connection).
+
+        Args:
+            x: Input tensor of shape (B, C, T)
+
+        Returns:
+            Output tensor of shape (B, C, T)
+        """
+        res = x  # [B,C,T]
+
+        # apply conv layers
+        x = self.layers(x)  # [B,C,T] (padded if causal)
+
+        if self.causal and self.padding > 0:
+            # Drop the trailing padding; skipped for padding == 0, where x[..., :-0] would be empty
+            x = x[:, :, : -self.padding]  # [B,C,T]
+
+        return x + res  # [B,C,T]
+
+
+class OobleckDecoderBlock(nn.Module):
+    """
+    Oobleck decoder block: activation, upsampling layer, then residual units with dilation 1, 3 and 9.
+
+    Args:
+        in_channels: Number of input channels
+        out_channels: Number of output channels
+        stride: Upsampling stride
+        use_snake: Whether to use Snake activation (default: False)
+        antialias_activation: Whether to use anti-aliasing on the leading activation (default: False)
+        use_nearest_upsample: Whether to use nearest neighbor upsampling (default: False)
+        causal: Whether to use causal convolutions (default: False)
+        padding_mode: Padding mode for convolutions (default: 'zeros')
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        stride: int,
+        use_snake: bool = False,
+        antialias_activation: bool = False,
+        use_nearest_upsample: bool = False,
+        causal: bool = False,
+        padding_mode: str = "zeros",
+    ) -> None:
+        super().__init__()
+
+        self.causal = causal
+
+        self.layers = nn.Sequential(
+            get_activation("snake" if use_snake else "elu", antialias=antialias_activation, channels=in_channels),
+            self._create_upsample_layer(in_channels, out_channels, stride, use_nearest_upsample, causal, padding_mode),
+            ResidualUnit(
+                in_channels=out_channels,
+                out_channels=out_channels,
+                dilation=1,
+                use_snake=use_snake,
+                causal=causal,
+                padding_mode=padding_mode,
+            ),
+            ResidualUnit(
+                in_channels=out_channels,
+                out_channels=out_channels,
+                dilation=3,
+                use_snake=use_snake,
+                causal=causal,
+                padding_mode=padding_mode,
+            ),
+            ResidualUnit(
+                in_channels=out_channels,
+                out_channels=out_channels,
+                dilation=9,
+                use_snake=use_snake,
+                causal=causal,
+                padding_mode=padding_mode,
+            ),
+        )
+
+    def _create_upsample_layer(
+        self,
+        in_channels: int,
+        out_channels: int,
+        stride: int,
+        use_nearest_upsample: bool,
+        causal: bool,
+        padding_mode: str,
+    ) -> nn.Module:
+        """
+        Create upsampling layer based on configuration.
+
+        Note: padding_mode parameter is not used in this function.
+        """
+
+        if causal:  # use EnCodec's SConvTranspose1d for convenience (it takes no padding mode argument)
+            assert not use_nearest_upsample, "use_nearest_upsample is not implemented for causal mode!"
+            upsample_layer = SConvTranspose1d(
+                in_channels=in_channels,
+                out_channels=out_channels,
+                kernel_size=2 * stride,
+                stride=stride,
+                causal=True,
+                norm="weight_norm",
+            )
+        else:
+            if use_nearest_upsample:
+                upsample_layer = nn.Sequential(
+                    nn.Upsample(scale_factor=stride, mode="nearest"),
+                    WNConv1d(
+                        in_channels=in_channels,
+                        out_channels=out_channels,
+                        kernel_size=2 * stride,
+                        stride=1,
+                        bias=False,
+                        padding="same",
+                    ),
+                )
+            else:
+                # WNConvTranspose1d only supports zeros padding mode so it's hardcoded
+                upsample_layer = WNConvTranspose1d(
+                    in_channels=in_channels,
+                    out_channels=out_channels,
+                    kernel_size=2 * stride,
+                    stride=stride,
+                    padding=math.ceil(stride / 2),
+                    output_padding=stride % 2,
+                    padding_mode="zeros",
+                )
+
+        return upsample_layer
+
+    def forward(self, x: Tensor) -> Tensor:
+        """
+        Forward pass.
+
+        Args:
+            x: Input tensor of shape (B, C, T)
+
+        Returns:
+            Output tensor of shape (B, C, T_upsampled)
+        """
+        return self.layers(x)
+
+    def remove_weight_norm(self) -> None:
+        """Remove weight normalization from every submodule that carries it, at any nesting depth."""
+        from torch.nn.utils import remove_weight_norm
+
+        for module in self.modules():
+            try:
+                remove_weight_norm(module)
+            except (ValueError, AttributeError):
+                # Module doesn't have weight norm or is not a module with weight norm
+                pass
diff --git a/vllm_omni/diffusion/models/cosmos3/audio_tokenizer/modules_encodec.py b/vllm_omni/diffusion/models/cosmos3/audio_tokenizer/modules_encodec.py
new file mode 100755
index 00000000000..007e13f24df
--- /dev/null
+++ b/vllm_omni/diffusion/models/cosmos3/audio_tokenizer/modules_encodec.py
@@ -0,0 +1,297 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# Adapted from https://github.com/facebookresearch/encodec under the MIT license.
+
+"""Convolutional layers wrappers and utilities."""
+
+import math
+import warnings
+from typing import Any
+
+import einops
+import torch
+from torch import nn
+from torch.nn import functional as F
+from torch.nn.utils import spectral_norm, weight_norm
+
+CONV_NORMALIZATIONS = frozenset(
+    ["none", "weight_norm", "spectral_norm", "time_layer_norm", "layer_norm", "time_group_norm"]
+)  # norm names accepted by apply_parametrization_norm() and get_norm_module()
+
+
+class ConvLayerNorm(nn.LayerNorm):
+    """
+    Convolution-friendly LayerNorm that moves the time axis (last dimension) to position 1, so that
+    channels come last for the normalization, and moves it back to its original position right after.
+    """
+
+    def __init__(self: "ConvLayerNorm", normalized_shape: int | list[int] | torch.Size, **kwargs: Any) -> None:
+        super().__init__(normalized_shape, **kwargs)  # plain pass-through to nn.LayerNorm
+
+    def forward(self: "ConvLayerNorm", x: torch.Tensor) -> torch.Tensor:
+        x = einops.rearrange(x, "b ... t -> b t ...")  # [B,T,C]
+        x = super().forward(x)  # [B,T,C]
+        x = einops.rearrange(x, "b t ... -> b ... t")  # [B,C,T]
+        return x  # [B,C,T]
+
+
+def apply_parametrization_norm(module: nn.Module, norm: str = "none") -> nn.Module:
+    assert norm in CONV_NORMALIZATIONS  # reject unknown norm names up front
+    if norm == "weight_norm":
+        return weight_norm(module)
+    elif norm == "spectral_norm":
+        return spectral_norm(module)
+    else:
+        # norm was already checked against CONV_NORMALIZATIONS, so any other choice
+        # doesn't need reparameterization and the module is returned unchanged.
+        return module
+
+
+def get_norm_module(module: nn.Module, causal: bool = False, norm: str = "none", **norm_kwargs) -> nn.Module:
+    """Return the post-conv normalization layer for ``norm`` (``nn.Identity`` if none is needed).
+    Raises ValueError when ``causal`` is True and ``norm`` ("time_group_norm") cannot be causal.
+    """
+    assert norm in CONV_NORMALIZATIONS
+    if norm == "layer_norm":
+        assert isinstance(module, nn.modules.conv._ConvNd)  # need a conv to read out_channels from
+        return ConvLayerNorm(module.out_channels, **norm_kwargs)
+    elif norm == "time_group_norm":
+        if causal:
+            raise ValueError("GroupNorm doesn't support causal evaluation.")
+        assert isinstance(module, nn.modules.conv._ConvNd)  # need a conv to read out_channels from
+        return nn.GroupNorm(1, module.out_channels, **norm_kwargs)
+    else:
+        return nn.Identity()
+
+
+def get_extra_padding_for_conv1d(x: torch.Tensor, kernel_size: int, stride: int, padding_total: int = 0) -> int:
+    """Return how many samples must be appended so the last conv window is full (see `pad_for_conv1d`)."""
+    length = x.shape[-1]
+    n_frames = (length - kernel_size + padding_total) / stride + 1  # fractional number of output frames
+    ideal_length = (math.ceil(n_frames) - 1) * stride + (kernel_size - padding_total)
+    return ideal_length - length
+
+
+def pad_for_conv1d(x: torch.Tensor, kernel_size: int, stride: int, padding_total: int = 0) -> torch.Tensor:
+    """Pad for a convolution to make sure that the last window is full.
+    Extra padding is added at the end. This is required to ensure that we can rebuild
+    an output of the same length, as otherwise, even with padding, some time steps
+    might get removed.
+    For instance, with total padding = 4, kernel size = 4, stride = 2:
+        0 0 1 2 3 4 5 0 0   # (0s are padding)
+        1   2   3           # (output frames of a convolution, last 0 is never used)
+        0 0 1 2 3 4 5 0     # (output of tr. conv., but pos. 5 is going to get removed as padding)
+            1 2 3 4         # once you removed padding, we are missing one time step !
+    """
+    extra_padding = get_extra_padding_for_conv1d(x, kernel_size, stride, padding_total)
+    return F.pad(x, (0, extra_padding))  # [B,C,T+extra_padding]
+
+
+def pad1d(x: torch.Tensor, paddings: tuple[int, int], mode: str = "zero", value: float = 0.0) -> torch.Tensor:
+    """Tiny wrapper around F.pad that also allows reflect padding on inputs shorter than the padding:
+    extra zeros are appended on the right first and cut off again afterwards. "zero" means "constant".
+    """
+    length = x.shape[-1]
+    padding_left, padding_right = paddings
+    assert padding_left >= 0 and padding_right >= 0, (padding_left, padding_right)
+    if mode == "reflect":
+        max_pad = max(padding_left, padding_right)
+        extra_pad = 0
+        if length <= max_pad:
+            extra_pad = max_pad - length + 1
+            x = F.pad(x, (0, extra_pad))  # [B,C,T+extra_pad]
+        padded = F.pad(x, paddings, mode, value)  # [B,C,T+padding_left+padding_right]
+        end = padded.shape[-1] - extra_pad
+        return padded[..., :end]  # [B,C,T+padding_left+padding_right]
+    else:
+        return F.pad(x, paddings, "constant" if mode == "zero" else mode, value)  # F.pad has no "zero" mode
+
+
+def unpad1d(x: torch.Tensor, paddings: tuple[int, int]) -> torch.Tensor:
+    """Remove padding from x, handling properly zero padding. Only for 1d!"""
+    padding_left, padding_right = paddings
+    assert padding_left >= 0 and padding_right >= 0, (padding_left, padding_right)
+    assert (padding_left + padding_right) <= x.shape[-1]
+    end = x.shape[-1] - padding_right  # explicit end index, so padding_right == 0 keeps the whole tail
+    return x[..., padding_left:end]
+
+
+class NormConv1d(nn.Module):
+    """Conv1d plus the normalization selected by ``norm``: weight/spectral norm reparametrize the conv
+    itself, layer/group norm run on its output, and any other accepted name adds nothing.
+    """
+
+    def __init__(self, *args, causal: bool = False, norm: str = "none", norm_kwargs: dict[str, Any] = {}, **kwargs):
+        super().__init__()  # NOTE: the shared default dict above is only unpacked here, never mutated
+        self.conv = apply_parametrization_norm(nn.Conv1d(*args, **kwargs), norm)
+        self.norm = get_norm_module(self.conv, causal, norm, **norm_kwargs)
+        self.norm_type = norm  # stored only; not read inside this class
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = self.conv(x)  # [B,C_out,T_out]
+        x = self.norm(x)  # [B,C_out,T_out]
+        return x  # [B,C_out,T_out]
+
+
+class NormConv2d(nn.Module):
+    """Conv2d plus the normalization selected by ``norm``: weight/spectral norm reparametrize the conv
+    itself, layer/group norm run on its output (always non-causal), other accepted names add nothing.
+    """
+
+    def __init__(self, *args, norm: str = "none", norm_kwargs: dict[str, Any] = {}, **kwargs):
+        super().__init__()  # NOTE: the shared default dict above is only unpacked here, never mutated
+        self.conv = apply_parametrization_norm(nn.Conv2d(*args, **kwargs), norm)
+        self.norm = get_norm_module(self.conv, causal=False, norm=norm, **norm_kwargs)
+        self.norm_type = norm  # stored only; not read inside this class
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = self.conv(x)  # [B,C_out,H_out,W_out]
+        x = self.norm(x)  # [B,C_out,H_out,W_out]
+        return x  # [B,C_out,H_out,W_out]
+
+
+class NormConvTranspose1d(nn.Module):
+    """ConvTranspose1d plus the normalization selected by ``norm``: weight/spectral norm reparametrize
+    the conv itself, layer/group norm run on its output, and any other accepted name adds nothing.
+    """
+
+    def __init__(self, *args, causal: bool = False, norm: str = "none", norm_kwargs: dict[str, Any] = {}, **kwargs):
+        super().__init__()  # NOTE: the shared default dict above is only unpacked here, never mutated
+        self.convtr = apply_parametrization_norm(nn.ConvTranspose1d(*args, **kwargs), norm)
+        self.norm = get_norm_module(self.convtr, causal, norm, **norm_kwargs)
+        self.norm_type = norm  # stored only; not read inside this class
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = self.convtr(x)  # [B,C_out,T_out]
+        x = self.norm(x)  # [B,C_out,T_out]
+        return x  # [B,C_out,T_out]
+
+
+class NormConvTranspose2d(nn.Module):
+    """ConvTranspose2d plus the normalization selected by ``norm`` (always non-causal).
+    NOTE(review): unlike the sibling wrappers, this class does not store ``norm_type``.
+    """
+
+    def __init__(self, *args, norm: str = "none", norm_kwargs: dict[str, Any] = {}, **kwargs):
+        super().__init__()  # NOTE: the shared default dict above is only unpacked here, never mutated
+        self.convtr = apply_parametrization_norm(nn.ConvTranspose2d(*args, **kwargs), norm)
+        self.norm = get_norm_module(self.convtr, causal=False, norm=norm, **norm_kwargs)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = self.convtr(x)  # [B,C_out,H_out,W_out]
+        x = self.norm(x)  # [B,C_out,H_out,W_out]
+        return x  # [B,C_out,H_out,W_out]
+
+
+class SConv1d(nn.Module):
+    """Conv1d that pads its own input (left-only when causal, split over both sides otherwise)
+    and applies the normalization selected by ``norm``.
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: int,
+        stride: int = 1,
+        dilation: int = 1,
+        groups: int = 1,
+        bias: bool = True,
+        causal: bool = False,
+        norm: str = "none",
+        norm_kwargs: dict[str, Any] = {},  # shared default dict: only forwarded, never mutated here
+        pad_mode: str = "reflect",
+    ) -> None:
+        super().__init__()
+        # warn user on unusual setup between dilation and stride
+        if stride > 1 and dilation > 1:
+            warnings.warn(
+                "SConv1d has been initialized with stride > 1 and dilation > 1"
+                f" (kernel_size={kernel_size} stride={stride}, dilation={dilation})."
+            )
+        self.conv = NormConv1d(
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride,
+            dilation=dilation,
+            groups=groups,
+            bias=bias,
+            causal=causal,
+            norm=norm,
+            norm_kwargs=norm_kwargs,
+        )
+        self.causal = causal
+        self.pad_mode = pad_mode
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:  # x: [B,C,T]
+        B, C, T = x.shape  # values unused; the unpacking fails unless x has exactly three dimensions
+        kernel_size = self.conv.conv.kernel_size[0]
+        stride = self.conv.conv.stride[0]
+        dilation = self.conv.conv.dilation[0]
+        kernel_size = (kernel_size - 1) * dilation + 1  # effective kernel size with dilations
+        padding_total = kernel_size - stride
+        extra_padding = get_extra_padding_for_conv1d(x, kernel_size, stride, padding_total)
+        if self.causal:
+            # Left padding for causal
+            x = pad1d(x, (padding_total, extra_padding), mode=self.pad_mode)  # [B,C,T+padding_total+extra_padding]
+        else:
+            # Asymmetric padding required for odd strides
+            padding_right = padding_total // 2
+            padding_left = padding_total - padding_right
+            x = pad1d(
+                x, (padding_left, padding_right + extra_padding), mode=self.pad_mode
+            )  # [B,C,T+padding_total+extra_padding]
+        return self.conv(x)  # [B,C_out,T_out]
+
+
+class SConvTranspose1d(nn.Module):
+    """ConvTranspose1d that trims the fixed ``kernel_size - stride`` padding from its own output
+    (causal: split by ``trim_right_ratio``; otherwise split over both sides) and applies ``norm``.
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: int,
+        stride: int = 1,
+        causal: bool = False,
+        norm: str = "none",
+        trim_right_ratio: float = 1.0,
+        norm_kwargs: dict[str, Any] = {},  # shared default dict: only forwarded, never mutated here
+    ) -> None:
+        super().__init__()
+        self.convtr = NormConvTranspose1d(
+            in_channels, out_channels, kernel_size, stride, causal=causal, norm=norm, norm_kwargs=norm_kwargs
+        )
+        self.causal = causal
+        self.trim_right_ratio = trim_right_ratio
+        assert self.causal or self.trim_right_ratio == 1.0, (
+            "`trim_right_ratio` != 1.0 only makes sense for causal convolutions"
+        )
+        assert self.trim_right_ratio >= 0.0 and self.trim_right_ratio <= 1.0
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:  # x: [B,C,T]
+        kernel_size = self.convtr.convtr.kernel_size[0]
+        stride = self.convtr.convtr.stride[0]
+        padding_total = kernel_size - stride  # surplus output samples beyond T*stride
+
+        y = self.convtr(x)  # [B,C_out,T*stride+padding_total]
+
+        # We will only trim fixed padding. Extra padding from `pad_for_conv1d` would be
+        # removed at the very end, when keeping only the right length for the output,
+        # as removing it here would require also passing the length at the matching layer
+        # in the encoder.
+        if self.causal:
+            # Trim the padding on the right according to the specified ratio
+            # if trim_right_ratio = 1.0, trim everything from right
+            padding_right = math.ceil(padding_total * self.trim_right_ratio)
+            padding_left = padding_total - padding_right
+            y = unpad1d(y, (padding_left, padding_right))  # [B,C_out,T_out]
+        else:
+            # Asymmetric padding required for odd strides
+            padding_right = padding_total // 2
+            padding_left = padding_total - padding_right
+            y = unpad1d(y, (padding_left, padding_right))  # [B,C_out,T_out]
+        return y  # [B,C_out,T_out]
diff --git a/vllm_omni/diffusion/models/cosmos3/guardrails.py b/vllm_omni/diffusion/models/cosmos3/guardrails.py
new file mode 100644
index 00000000000..a085c3f3a59
--- /dev/null
+++ b/vllm_omni/diffusion/models/cosmos3/guardrails.py
@@ -0,0 +1,430 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Cosmos3 guardrail hooks for vllm-omni.
+
+Text: Blocklist (keyword matching) + Qwen3Guard (0.6B LLM classifier)
+Video: SigLIP-based content safety filter + RetinaFace face blur
+
+Enable via custom_pipeline_args or the test script:
+    python test_cosmos3.py --model ... --guardrails
+"""
+
+from __future__ import annotations
+
+import os
+import warnings
+from collections.abc import Callable
+from typing import Any
+
+import cv2
+import numpy as np
+import torch
+import torch.nn as nn
+from vllm.logger import init_logger
+
+from vllm_omni.diffusion.models.progress_bar import _is_rank_zero
+
+logger = init_logger(__name__)
+
+TextGuardrailFn = Callable[[str], None]
+VideoGuardrailFn = Callable[[np.ndarray], np.ndarray]
+
+_text_guardrail: TextGuardrailFn | None = None
+_video_guardrail: VideoGuardrailFn | None = None
+_initialized = False
+
+GUARDRAIL_HF_REPO = "nvidia/Cosmos-Guardrail1"
+GUARDRAIL_HF_REVISION = "d6d4bfa899a71454a700907664f3e88f503950cf"
+CUTOFF_UNSAFE_FRAMES_PERCENT = 10
+
+
+def set_text_guardrail(fn: TextGuardrailFn) -> None:  # install a custom prompt check
+    global _text_guardrail
+    _text_guardrail = fn  # NOTE: the first _init_default_guardrails() run after this overwrites fn
+
+
+def set_video_guardrail(fn: VideoGuardrailFn) -> None:  # install a custom frame check/filter
+    global _video_guardrail
+    _video_guardrail = fn  # NOTE: the first _init_default_guardrails() run after this overwrites fn
+
+
+# ---------------------------------------------------------------------------
+# Video safety classifier (matches reference: SigLIP so400m + 3-layer head)
+# ---------------------------------------------------------------------------
+class SafetyClassifier(nn.Module):
+    """Three Linear layers with BatchNorm1d + ReLU between them (defaults: 1152 → 512 → 256 → 7 logits)."""
+
+    def __init__(self, input_size: int = 1152, num_classes: int = 7):
+        super().__init__()
+        self.layers = nn.Sequential(  # NOTE(review): presumably must mirror the checkpoint's key layout
+            nn.Linear(input_size, 512),
+            nn.BatchNorm1d(512),
+            nn.ReLU(),
+            nn.Linear(512, 256),
+            nn.BatchNorm1d(256),
+            nn.ReLU(),
+            nn.Linear(256, num_classes),
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return self.layers(x)  # one score per class; the caller takes the argmax
+
+
+CLASS_IDX_TO_NAME = {  # classifier output index → label
+    0: "Safe",
+    1: "Sexual_Content",
+    3: "Drugs",  # NOTE(review): index 2 is unmapped; _safety_check treats it as "Unknown" (unsafe) — confirm
+    4: "Child_Abuse",
+    5: "Hate_and_Harassment",
+    6: "Self-Harm",
+}
+
+
+# ---------------------------------------------------------------------------
+# Face pixelation utility
+# ---------------------------------------------------------------------------
+def _pixelate_face(face_img: np.ndarray, blocks: int = 5) -> np.ndarray:
+    height, width = face_img.shape[:2]
+    if not (height and width):  # empty crop: nothing to pixelate
+        return face_img
+    coarse = cv2.resize(face_img, (blocks, blocks), interpolation=cv2.INTER_LINEAR)
+    return cv2.resize(coarse, (width, height), interpolation=cv2.INTER_NEAREST)
+
+
+# ---------------------------------------------------------------------------
+# Default guardrail builders
+# ---------------------------------------------------------------------------
+def _download_checkpoint() -> str:
+    from huggingface_hub import snapshot_download
+
+    return snapshot_download(GUARDRAIL_HF_REPO, revision=GUARDRAIL_HF_REVISION)
+
+
+def _build_text_guardrail(offload_to_cpu: bool) -> TextGuardrailFn:
+    checkers: list[Callable[[str], tuple[bool, str]]] = []
+
+    # 1. Blocklist
+    try:
+        import nltk
+        from better_profanity import profanity as profanity_filter
+
+        ckpt_dir = _download_checkpoint()
+        blocklist_dir = os.path.join(ckpt_dir, "blocklist")
+        nltk.data.path.append(os.path.join(blocklist_dir, "nltk_data"))
+
+        def _read_keywords(dirpath: str) -> list[str]:
+            """Return stripped non-blank lines from each file in ``dirpath`` ([] if not a directory)."""
+            words: list[str] = []
+            names = sorted(os.listdir(dirpath)) if os.path.isdir(dirpath) else []
+            for fpath in (os.path.join(dirpath, name) for name in names):
+                if os.path.isfile(fpath):
+                    # Decode as UTF-8 explicitly instead of relying on the locale default.
+                    with open(fpath, encoding="utf-8") as f:
+                        words.extend(line.strip() for line in f if line.strip())
+            return words
+
+        blocklist_words = _read_keywords(os.path.join(blocklist_dir, "custom"))
+        whitelist_words = _read_keywords(os.path.join(blocklist_dir, "whitelist"))
+        profanity_filter.load_censor_words(custom_words=blocklist_words, whitelist_words=whitelist_words)
+
+        def _blocklist_check(prompt: str) -> tuple[bool, str]:
+            if profanity_filter.contains_profanity(prompt):
+                return False, "Blocked by keyword filter"
+            return True, ""
+
+        checkers.append(_blocklist_check)
+        if _is_rank_zero():
+            logger.info("Blocklist guardrail loaded (%d keywords)", len(blocklist_words))
+    except ImportError:
+        logger.warning("better-profanity or nltk not installed; skipping blocklist guardrail")
+
+    # 2. Qwen3Guard
+    try:
+        from transformers import AutoModelForCausalLM, AutoTokenizer
+
+        model_id = "Qwen/Qwen3Guard-Gen-0.6B"
+        qwen_tokenizer = AutoTokenizer.from_pretrained(model_id)
+        device = "cpu" if offload_to_cpu else "cuda"
+        qwen_model = (
+            AutoModelForCausalLM.from_pretrained(
+                model_id,
+                torch_dtype=torch.bfloat16,
+            )
+            .to(device)
+            .eval()
+        )
+
+        def _qwen_check(prompt: str) -> tuple[bool, str]:
+            conversations = [{"role": "user", "content": prompt}]
+            input_ids = qwen_tokenizer.apply_chat_template(
+                conversations,
+                tokenize=True,
+                return_tensors="pt",
+                add_generation_prompt=True,
+            ).to(device)
+            with torch.no_grad():
+                output_ids = qwen_model.generate(input_ids, max_new_tokens=128)
+            response = qwen_tokenizer.decode(
+                output_ids[0][input_ids.shape[1] :],
+                skip_special_tokens=True,
+            )
+            if "unsafe" in response.lower():
+                return False, f"Qwen3Guard: {response.strip()}"
+            return True, ""
+
+        checkers.append(_qwen_check)
+        if _is_rank_zero():
+            logger.info("Qwen3Guard guardrail loaded")
+    except ImportError:
+        logger.warning("transformers not installed; skipping Qwen3Guard")
+
+    def text_guardrail(prompt: str) -> None:
+        for checker in checkers:
+            is_safe, msg = checker(prompt)
+            if not is_safe:
+                raise ValueError(f"Guardrail blocked prompt: {msg}")
+
+    return text_guardrail
+
+
+def _build_video_guardrail(offload_to_cpu: bool) -> VideoGuardrailFn:
+    ckpt_dir = _download_checkpoint()
+    safety_checker: Callable[[np.ndarray], tuple[bool, str]] | None = None
+    face_blurrer: Callable[[np.ndarray], np.ndarray] | None = None
+
+    # 1. Video content safety filter: SigLIP so400m + SafetyClassifier
+    try:
+        from PIL import Image
+        from transformers import SiglipModel, SiglipProcessor
+
+        device = "cpu" if offload_to_cpu else "cuda"
+        siglip_id = "google/siglip-so400m-patch14-384"
+        siglip_model = SiglipModel.from_pretrained(siglip_id).to(device, dtype=torch.float32).eval()
+        siglip_processor = SiglipProcessor.from_pretrained(siglip_id)
+
+        classifier = SafetyClassifier(input_size=1152, num_classes=7)
+        ckpt_path = os.path.join(ckpt_dir, "video_content_safety_filter", "safety_filter.pt")
+        checkpoint = torch.load(ckpt_path, map_location="cpu", weights_only=True)
+        # Checkpoint keys have "network." prefix from the VideoSafetyModel wrapper.
+        state = {k.removeprefix("network."): v for k, v in checkpoint["model"].items()}
+        classifier.load_state_dict(state)
+        classifier = classifier.to(device, dtype=torch.float32).eval()
+
+        def _safety_check(frames: np.ndarray) -> tuple[bool, str]:
+            """Classify every frame; fail once the non-"Safe" share exceeds the cutoff (empty clips pass)."""
+            nonlocal siglip_model, classifier
+            total = len(frames)
+            if total == 0:  # nothing to classify, and the ratio below would divide by zero
+                return True, ""
+            if offload_to_cpu:
+                siglip_model = siglip_model.to("cuda")
+                classifier = classifier.to("cuda")
+            unsafe_count = 0
+            for frame in frames:
+                img = Image.fromarray(frame)
+                inputs = siglip_processor(images=img, return_tensors="pt").to("cuda", dtype=torch.float32)
+                with torch.no_grad():
+                    features = siglip_model.get_image_features(**inputs)
+                    features = features / features.norm(dim=-1, keepdim=True)
+                    pred = classifier(features).argmax(dim=-1).item()
+                # Indices absent from CLASS_IDX_TO_NAME resolve to "Unknown" and count as unsafe.
+                if CLASS_IDX_TO_NAME.get(pred, "Unknown") != "Safe":
+                    unsafe_count += 1
+
+            if offload_to_cpu:
+                siglip_model = siglip_model.to("cpu")
+                classifier = classifier.to("cpu")
+            if unsafe_count / total > CUTOFF_UNSAFE_FRAMES_PERCENT / 100:
+                return False, f"Video content safety: {unsafe_count}/{total} frames unsafe"
+            return True, ""
+
+        safety_checker = _safety_check
+        if _is_rank_zero():
+            logger.info("Video content safety filter loaded (SigLIP so400m + classifier)")
+    except (ImportError, FileNotFoundError) as e:
+        logger.warning("Could not load video safety filter: %s", e)
+
+    # 2. Face blur: RetinaFace + pixelation
+    try:
+        from retinaface.data import cfg_re50
+        from retinaface.layers.functions.prior_box import PriorBox
+        from retinaface.models.retinaface import RetinaFace
+        from retinaface.utils.nms.py_cpu_nms import py_cpu_nms
+
+        face_ckpt = os.path.join(ckpt_dir, "face_blur_filter", "Resnet50_Final.pth")
+        if not os.path.exists(face_ckpt):
+            raise FileNotFoundError(face_ckpt)
+
+        cfg = dict(cfg_re50)
+        cfg["pretrain"] = False
+        with warnings.catch_warnings():
+            warnings.simplefilter("ignore")
+            retinaface_net = RetinaFace(cfg=cfg, phase="test")
+
+        # Load weights (strip 'module.' prefix if present)
+        pretrained_dict = torch.load(face_ckpt, map_location="cpu", weights_only=True)
+        if "state_dict" in pretrained_dict:
+            pretrained_dict = pretrained_dict["state_dict"]
+        pretrained_dict = {
+            k.replace("module.", "", 1) if k.startswith("module.") else k: v for k, v in pretrained_dict.items()
+        }
+        retinaface_net.load_state_dict(pretrained_dict, strict=False)
+        retinaface_device = "cpu" if offload_to_cpu else "cuda"
+        retinaface_net = retinaface_net.to(retinaface_device, dtype=torch.float32).eval()
+
+        CONF_THRESH = 0.7
+        NMS_THRESH = 0.4
+        TOP_K = 5000
+        KEEP_TOP_K = 750
+
+        def _decode_batch(loc, priors, variances):
+            batch_size = loc.size(0)
+            p = priors.unsqueeze(0).expand(batch_size, -1, -1)
+            boxes = torch.cat(
+                (
+                    p[:, :, :2] + loc[:, :, :2] * variances[0] * p[:, :, 2:],
+                    p[:, :, 2:] * torch.exp(loc[:, :, 2:] * variances[1]),
+                ),
+                dim=2,
+            )
+            boxes[:, :, :2] -= boxes[:, :, 2:] / 2
+            boxes[:, :, 2:] += boxes[:, :, :2]
+            return boxes
+
+        def _face_blur(frames: np.ndarray) -> np.ndarray:
+            nonlocal retinaface_net
+            if offload_to_cpu:
+                retinaface_net = retinaface_net.to("cuda")
+
+            prior_data = None
+            scale = None
+            result_frames = []
+
+            for frame in frames:
+                frame_t = torch.from_numpy(frame).to("cuda", dtype=torch.float32)
+                frame_t = frame_t.permute(2, 0, 1).unsqueeze(0)  # [1, C, H, W]
+                frame_t = frame_t[:, [2, 1, 0], :, :]  # RGB → BGR
+                means = torch.tensor([104.0, 117.0, 123.0], device="cuda", dtype=torch.float32).view(1, 3, 1, 1)
+                frame_t = frame_t - means
+
+                h, w = frame_t.shape[2], frame_t.shape[3]
+                if prior_data is None:
+                    priorbox = PriorBox(cfg, image_size=(h, w))
+                    prior_data = priorbox.forward().to("cuda", dtype=torch.float32)
+                if scale is None:
+                    scale = torch.tensor([w, h, w, h], device="cuda", dtype=torch.float32)
+
+                with torch.no_grad():
+                    loc, conf, _ = retinaface_net(frame_t)
+
+                boxes = _decode_batch(loc, prior_data, cfg["variance"])
+                boxes = (boxes * scale).squeeze(0).cpu().numpy()
+                scores = conf.squeeze(0)[:, 1].cpu().numpy()
+
+                # Filter by confidence
+                inds = np.where(scores > CONF_THRESH)[0]
+                boxes_f = boxes[inds]
+                scores_f = scores[inds]
+                order = scores_f.argsort()[::-1][:TOP_K]
+                boxes_f = boxes_f[order]
+                scores_f = scores_f[order]
+
+                # NMS
+                dets = np.hstack((boxes_f, scores_f[:, np.newaxis])).astype(np.float32)
+                keep = py_cpu_nms(dets, NMS_THRESH)
+                dets = dets[keep][:KEEP_TOP_K]
+
+                out_frame = frame.copy()
+                for det in dets:
+                    x1, y1, x2, y2 = map(int, det[:4])
+                    if x2 - x1 < 20 or y2 - y1 < 20:
+                        continue
+                    max_h, max_w = out_frame.shape[:2]
+                    y1c, y2c = max(y1, 0), min(y2, max_h)
+                    x1c, x2c = max(x1, 0), min(x2, max_w)
+                    out_frame[y1c:y2c, x1c:x2c] = _pixelate_face(out_frame[y1c:y2c, x1c:x2c])
+
+                result_frames.append(out_frame)
+
+            if offload_to_cpu:
+                retinaface_net = retinaface_net.to("cpu")
+
+            return np.array(result_frames)
+
+        face_blurrer = _face_blur
+        if _is_rank_zero():
+            logger.info("Face blur filter loaded (RetinaFace Resnet50)")
+    except (ImportError, FileNotFoundError) as e:
+        logger.warning("Could not load face blur filter: %s", e)
+
+    def video_guardrail(frames: np.ndarray) -> np.ndarray:
+        if safety_checker is not None:
+            is_safe, msg = safety_checker(frames)
+            if not is_safe:
+                raise ValueError(f"Guardrail blocked video: {msg}")
+        if face_blurrer is not None:
+            frames = face_blurrer(frames)
+        return frames
+
+    return video_guardrail
+
+
+# ---------------------------------------------------------------------------
+# Initialization
+# ---------------------------------------------------------------------------
+def _init_default_guardrails(offload_to_cpu: bool = False) -> None:
+    """Build the default text and video guardrails on first use; later calls do nothing."""
+    global _text_guardrail, _video_guardrail, _initialized
+    if not _initialized:
+        if _is_rank_zero():
+            logger.info("Initializing Cosmos3 guardrails (offload_to_cpu=%s)...", offload_to_cpu)
+        _text_guardrail = _build_text_guardrail(offload_to_cpu)
+        _video_guardrail = _build_video_guardrail(offload_to_cpu)
+        _initialized = True
+        if _is_rank_zero():
+            logger.info("Cosmos3 guardrails initialized.")
+
+
+# ---------------------------------------------------------------------------
+# Public API
+# ---------------------------------------------------------------------------
+def ensure_initialized(od_config: Any) -> None:
+    """Build the default guardrails when ``od_config`` enables them; otherwise do nothing."""
+    if is_guardrails_enabled(od_config):
+        _init_default_guardrails(offload_to_cpu=get_offload_flag(od_config))
+
+
+def check_text_safety(prompt: str) -> None:
+    if _text_guardrail is not None:  # no-op while no text guardrail is installed
+        _text_guardrail(prompt)  # the default hook raises ValueError for a blocked prompt
+
+
+def check_video_safety(video_tensor: torch.Tensor) -> torch.Tensor:
+    """Run the video guardrail on every clip of a [C,T,H,W] or [B,C,T,H,W] tensor (assumed in [-1, 1])."""
+    if _video_guardrail is None:
+        return video_tensor
+
+    batched = video_tensor.dim() == 5
+    clips = video_tensor.detach().cpu().float()
+    checked = []
+    # Check each batch item separately so a 5D input keeps its full batch size.
+    for v in clips if batched else clips.unsqueeze(0):
+        v = v.clamp(-1, 1) * 0.5 + 0.5
+        frames_np = (v.permute(1, 2, 3, 0).numpy() * 255).round().astype(np.uint8)
+        frames_np = _video_guardrail(frames_np)
+        # Convert back to [-1, 1] to match the VAE output range.
+        result = torch.from_numpy(frames_np.copy()).float() / 127.5 - 1.0
+        checked.append(result.permute(3, 0, 1, 2))
+    stacked = torch.stack(checked)
+    return (stacked if batched else stacked[0]).to(video_tensor.device)
+
+
+def is_guardrails_enabled(od_config: Any) -> bool:
+    """Return the opt-in ``guardrails`` flag from ``od_config.model_config`` (off when unset)."""
+    cfg = getattr(od_config, "model_config", None) or {}
+    return bool(cfg.get("guardrails", False))
+
+
+def get_offload_flag(od_config: Any) -> bool:
+    model_cfg = getattr(od_config, "model_config", None)  # a missing or None config counts as empty
+    return bool((model_cfg or {}).get("offload_guardrail_models", False))
diff --git a/vllm_omni/diffusion/models/cosmos3/pipeline_cosmos3.py b/vllm_omni/diffusion/models/cosmos3/pipeline_cosmos3.py
new file mode 100644
index 00000000000..634be5f6ca7
--- /dev/null
+++ b/vllm_omni/diffusion/models/cosmos3/pipeline_cosmos3.py
@@ -0,0 +1,1848 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Cosmos3 text/image-to-video and text-to-image pipeline for vllm-omni.
+
+Single pipeline class supports T2V, I2V, and T2I; the mode is selected at
+runtime by:
+
+* ``prompt["modalities"]`` contains ``"image"``: **T2I** (text-to-image).
+* ``prompt["modalities"]`` contains ``"video"`` or is omitted: **T2V**
+  (text-to-video).
+* ``multi_modal_data['image']`` present on the prompt:  **I2V**
+  (handled by :func:`get_cosmos3_pre_process_func`)
+
+"""
+
+from __future__ import annotations
+
+import os
+import time
+from collections.abc import Iterable
+from typing import Any, ClassVar
+
+import numpy as np
+import PIL.Image
+import torch
+from diffusers import UniPCMultistepScheduler
+from diffusers.utils.torch_utils import randn_tensor
+from diffusers.video_processor import VideoProcessor
+from torch import nn
+from transformers import AutoTokenizer
+from vllm.logger import init_logger
+from vllm.model_executor.models.utils import AutoWeightsLoader
+
+from vllm_omni.diffusion.data import DiffusionOutput, OmniDiffusionConfig
+from vllm_omni.diffusion.distributed.autoencoders.autoencoder_kl_wan import DistributedAutoencoderKLWan
+from vllm_omni.diffusion.distributed.cfg_parallel import CFGParallelMixin
+from vllm_omni.diffusion.distributed.parallel_state import (
+    get_classifier_free_guidance_world_size,
+)
+from vllm_omni.diffusion.distributed.utils import get_local_device
+from vllm_omni.diffusion.model_loader.diffusers_loader import DiffusersPipelineLoader
+from vllm_omni.diffusion.models.interface import SupportImageInput
+from vllm_omni.diffusion.models.progress_bar import ProgressBarMixin, _is_rank_zero
+from vllm_omni.diffusion.profiler.diffusion_pipeline_profiler import DiffusionPipelineProfilerMixin
+from vllm_omni.diffusion.request import OmniDiffusionRequest
+
+from .action import (
+    ACTION_MODE_FORWARD_DYNAMICS,
+    ACTION_MODE_INVERSE_DYNAMICS,
+    ACTION_MODE_POLICY,
+    action_start_frame_offset,
+    build_action_condition_mask,
+    build_vision_condition_mask,
+    find_closest_target_size,
+    load_action_tensor,
+    normalize_action_mode,
+    pad_action_to_dim,
+    resolve_domain_id,
+)
+from .transformer_cosmos3 import Cosmos3VFMTransformer
+
+logger = init_logger(__name__)
+
+COSMOS3_DEFAULT_NEGATIVE_PROMPT = ""
+COSMOS3_DURATION_TEMPLATE = "The video is {duration:.1f} seconds long and is of {fps:.0f} FPS."
+COSMOS3_RESOLUTION_TEMPLATE = "This video is of {height}x{width} resolution."
+COSMOS3_IMAGE_RESOLUTION_TEMPLATE = "This image is of {height}x{width} resolution."
+COSMOS3_INVERSE_DURATION_TEMPLATE = "The video is not {duration:.1f} seconds long and is not of {fps:.0f} FPS."
+COSMOS3_INVERSE_RESOLUTION_TEMPLATE = "This video is not of {height}x{width} resolution."
+COSMOS3_INVERSE_IMAGE_RESOLUTION_TEMPLATE = "This image is not of {height}x{width} resolution."
+COSMOS3_SYSTEM_PROMPT = "You are a helpful assistant who will generate videos from a given prompt."
+COSMOS3_T2I_SYSTEM_PROMPT = "You are a helpful assistant who will generate images from a given prompt."
+
+
+# ---------------------------------------------------------------------------
+# Pre-/post-process functions (registered in registry.py)
+# ---------------------------------------------------------------------------
+def get_cosmos3_pre_process_func(od_config: OmniDiffusionConfig):
+    """Pre-process function for both T2V and I2V.
+
+    For T2V (no image in ``multi_modal_data``), the request is returned
+    unchanged after the optional guardrails check.  For I2V (image present),
+    the conditioning image is loaded, aspect-resized + center-cropped, and
+    stored back on the prompt as ``additional_information.preprocessed_image``.
+    """
+    from .guardrails import check_text_safety, ensure_initialized, is_guardrails_enabled
+
+    video_processor = VideoProcessor(vae_scale_factor=16)
+    guardrails_on = is_guardrails_enabled(od_config)
+    if guardrails_on:
+        ensure_initialized(od_config)
+
+    def _extra_args(request: OmniDiffusionRequest) -> dict[str, Any]:
+        extra = getattr(getattr(request, "sampling_params", None), "extra_args", None)
+        return extra if isinstance(extra, dict) else {}
+
+    def _request_action_mode(request: OmniDiffusionRequest) -> str | None:
+        return normalize_action_mode(_extra_args(request).get("action_mode"))
+
+    def _set_action_size_from_image(request: OmniDiffusionRequest, image: PIL.Image.Image) -> tuple[int, int]:
+        sp = request.sampling_params
+        if sp.height is not None and sp.width is not None:
+            return int(sp.height), int(sp.width)
+
+        extra = _extra_args(request)
+        resolution = extra.get("resolution", extra.get("image_size", 480))
+        target_w, target_h = find_closest_target_size(image.height, image.width, resolution)
+        if sp.height is None:
+            sp.height = target_h
+        if sp.width is None:
+            sp.width = target_w
+        return int(sp.height), int(sp.width)
+
+    def _pil_to_rgb(value: Any) -> PIL.Image.Image:
+        if isinstance(value, str):
+            return PIL.Image.open(value).convert("RGB")
+        if isinstance(value, PIL.Image.Image):
+            return value.convert("RGB")
+        raise TypeError(f"Cosmos3 action preprocessing expected PIL image or image path, got {type(value)!r}.")
+
+    def _resize_and_pad_action_image(image: PIL.Image.Image, target_h: int, target_w: int) -> PIL.Image.Image:
+        scale = min(target_w / image.width, target_h / image.height, 1.0)
+        resize_w = max(1, int(scale * image.width + 0.5))
+        resize_h = max(1, int(scale * image.height + 0.5))
+        if (resize_w, resize_h) != image.size:
+            image = image.resize((resize_w, resize_h), PIL.Image.Resampling.BICUBIC)
+
+        array = np.asarray(image)
+        pad_h = target_h - resize_h
+        pad_w = target_w - resize_w
+        if pad_h < 0 or pad_w < 0:
+            raise ValueError(
+                f"Cosmos3 action image resize exceeded target size: resized={(resize_h, resize_w)}, "
+                f"target={(target_h, target_w)}."
+            )
+        if pad_h == 0 and pad_w == 0:
+            return image
+        pad_mode = "reflect" if pad_h < resize_h and pad_w < resize_w else "edge"
+        padded = np.pad(array, ((0, pad_h), (0, pad_w), (0, 0)), mode=pad_mode)
+        return PIL.Image.fromarray(padded)
+
+    def _preprocess_action_image(image: PIL.Image.Image, target_h: int, target_w: int) -> torch.Tensor:
+        image = _resize_and_pad_action_image(image, target_h, target_w)
+        return video_processor.preprocess(image, height=target_h, width=target_w)
+
+    def _preprocess_action_video(frames: list[Any], target_h: int, target_w: int) -> torch.Tensor:
+        if not frames:
+            raise ValueError("Cosmos3 action video input must contain at least one frame.")
+        processed = [_preprocess_action_image(_pil_to_rgb(frame), target_h, target_w).squeeze(0) for frame in frames]
+        return torch.stack(processed, dim=1).unsqueeze(0).contiguous()
+
+    def pre_process_func(request: OmniDiffusionRequest) -> OmniDiffusionRequest:
+        action_mode = _request_action_mode(request)
+        if guardrails_on:
+            for prompt in request.prompts:
+                text = prompt if isinstance(prompt, str) else prompt.get("prompt", "")
+                check_text_safety(text)
+
+        for i, prompt in enumerate(request.prompts):
+            if isinstance(prompt, str):
+                continue
+            multi_modal_data = prompt.get("multi_modal_data", {}) or {}
+            raw_image = multi_modal_data.get("image")
+            raw_video = multi_modal_data.get("video")
+            if raw_image is None and not (action_mode is not None and raw_video is not None):
+                continue
+
+            if "additional_information" not in prompt:
+                prompt["additional_information"] = {}
+
+            if raw_image is None:
+                if not isinstance(raw_video, list) or not raw_video:
+                    raise TypeError("Cosmos3 action video input must be a non-empty list of PIL images or image paths.")
+                image = _pil_to_rgb(raw_video[0])
+            else:
+                image = _pil_to_rgb(raw_image)
+
+            # Auto-calculate H/W from aspect ratio (720p max area)
+            if request.sampling_params.height is None or request.sampling_params.width is None:
+                if action_mode is not None:
+                    _set_action_size_from_image(request, image)
+                else:
+                    max_area = 720 * 1280
+                    aspect_ratio = image.height / image.width
+                    mod_value = 16
+                    height = round(np.sqrt(max_area * aspect_ratio)) // mod_value * mod_value
+                    width = round(np.sqrt(max_area / aspect_ratio)) // mod_value * mod_value
+                    if request.sampling_params.height is None:
+                        request.sampling_params.height = height
+                    if request.sampling_params.width is None:
+                        request.sampling_params.width = width
+
+            target_w = request.sampling_params.width
+            target_h = request.sampling_params.height
+            if action_mode is not None:
+                prompt["additional_information"]["preprocessed_image"] = _preprocess_action_image(
+                    image,
+                    int(target_h),
+                    int(target_w),
+                )
+            else:
+                scale = max(target_w / image.width, target_h / image.height)
+                resize_w = int(np.ceil(scale * image.width))
+                resize_h = int(np.ceil(scale * image.height))
+                image = image.resize((resize_w, resize_h), PIL.Image.Resampling.LANCZOS)
+                left = (resize_w - target_w) // 2
+                top = (resize_h - target_h) // 2
+                image = image.crop((left, top, left + target_w, top + target_h))
+
+                prompt["additional_information"]["preprocessed_image"] = video_processor.preprocess(
+                    image, height=target_h, width=target_w
+                )
+            if action_mode is not None and raw_video is not None:
+                if not isinstance(raw_video, list):
+                    raise TypeError("Cosmos3 action video input must be a list of PIL images or image paths.")
+                prompt["additional_information"]["preprocessed_video"] = _preprocess_action_video(
+                    raw_video,
+                    int(target_h),
+                    int(target_w),
+                )
+            request.prompts[i] = prompt
+
+        return request
+
+    return pre_process_func
+
+
+def get_cosmos3_post_process_func(od_config: OmniDiffusionConfig):
+    from .guardrails import check_video_safety, is_guardrails_enabled
+
+    video_processor = VideoProcessor(vae_scale_factor=16)
+    guardrails_on = is_guardrails_enabled(od_config)
+
+    def _sampling_param(sampling_params, key: str, default=None):
+        extra = getattr(sampling_params, "extra_args", None)
+        if isinstance(extra, dict) and extra.get(key) is not None:
+            return extra[key]
+        value = getattr(sampling_params, key, None)
+        return default if value is None else value
+
+    def _resolve_output_fps(sampling_params):
+        """Return the output FPS (resolved_frame_rate, then frame_rate, then fps), defaulting to 24."""
+        keys = ("resolved_frame_rate", "frame_rate", "fps")
+        # First truthy candidate wins, exactly like an `a or b or c or 24.0` chain.
+        fps = next((v for v in (_sampling_param(sampling_params, k) for k in keys) if v), 24.0)
+        try:
+            fps_value = float(fps)
+        except (TypeError, ValueError):
+            fps_value = 24.0
+        # The chained comparison is False for NaN, so NaN, infinite and
+        # non-positive values all fall back to 24 instead of being returned.
+        if not 0 < fps_value < float("inf"):
+            fps_value = 24.0
+        return int(fps_value) if fps_value.is_integer() else fps_value
+
+    def post_process_func(
+        output: torch.Tensor | dict[str, torch.Tensor] | tuple,
+        output_type: str = "np",
+        sampling_params=None,
+    ):
+        if output_type == "latent":
+            return output
+
+        audio = None
+        audio_sample_rate = None
+        if isinstance(output, dict):
+            if "image" in output and "video" in output:
+                raise ValueError("Cosmos3 output cannot contain both image and video payloads.")
+            if "image" in output:
+                video = output["image"]
+            elif "video" in output:
+                video = output["video"]
+            else:
+                raise ValueError("Cosmos3 postprocess expected an 'image' or 'video' output payload.")
+            audio = output.get("audio")
+            audio_sample_rate = output.get("audio_sample_rate")
+        elif isinstance(output, tuple):
+            if len(output) == 3:
+                video, audio, audio_sample_rate = output
+            elif len(output) == 2:
+                video, audio = output
+            else:
+                raise ValueError(
+                    "Cosmos3 postprocess expects output tensor, output dict, or (video, audio[, sample_rate]) tuple."
+                )
+        else:
+            video = output
+
+        if isinstance(output, dict) and "image" in output:
+            if audio is not None:
+                raise ValueError("Cosmos3 text-to-image postprocess does not support audio output.")
+            if video.ndim != 5 or video.shape[2] != 1:
+                raise ValueError(
+                    "Cosmos3 text-to-image postprocess expects decoded output "
+                    f"with shape [B, C, 1, H, W], got {tuple(video.shape)}."
+                )
+            image = video.squeeze(2)  # [B, 3, H, W]
+            if guardrails_on:
+                # check_video_safety expects a 5D tensor; re-add T axis.
+                checked = check_video_safety(image.unsqueeze(2))
+                image = checked.squeeze(2)
+            return video_processor.postprocess(image, output_type="pil")
+        if guardrails_on:
+            video = check_video_safety(video)
+        result = {"video": video_processor.postprocess_video(video, output_type=output_type)}
+        if audio is None:
+            return result
+        if isinstance(audio, torch.Tensor):
+            audio = audio.detach().cpu()
+        result["audio"] = audio
+        result["fps"] = _resolve_output_fps(sampling_params)
+        if audio_sample_rate is not None:
+            result["audio_sample_rate"] = int(audio_sample_rate)
+        return result
+
+    return post_process_func
+
+
+# ---------------------------------------------------------------------------
+# Pipeline
+# ---------------------------------------------------------------------------
+class Cosmos3OmniDiffusersPipeline(
+    nn.Module, CFGParallelMixin, SupportImageInput, ProgressBarMixin, DiffusionPipelineProfilerMixin
+):
+    """Cosmos3 text/image-to-video / text-to-image pipeline.
+
+    Architecture: Mixture-of-Transformers with Qwen3-VL backbone.
+    - Understanding pathway: causal self-attention on text (runs once, K/V cached)
+    - Generation pathway: cross-attention on noisy visual latents (runs each step)
+
+    Supports T2V, I2V, and T2I from the same class.  Mode is selected at
+    runtime:
+
+    * **T2I** when ``prompt["modalities"]`` contains ``"image"``.  Latent
+      T-dim is forced to 1, T2I-specific scheduler defaults are applied (50 steps,
+      flow_shift=3.0, guidance_interval=[400, 1000]), the duration
+      template is suppressed, and post-process emits PIL images.
+    * **I2V** when the request supplies a preprocessed image via
+      ``multi_modal_data['image']`` (handled by
+      :func:`get_cosmos3_pre_process_func`) and the requested output modality
+      is not image.
+      Frame 0 of the initial latent is set to the VAE-encoded conditioning
+      image, frame-0 noise predictions are masked to zero, and the clean
+      image latent is re-injected at frame 0 after each scheduler step.
+    * **T2V** otherwise (default video generation).
+    """
+
+    support_image_input: ClassVar[bool] = True
+    color_format: ClassVar[str] = "RGB"
+
+    def __init__(
+        self,
+        *,
+        od_config: OmniDiffusionConfig,
+        prefix: str = "",  # accepted but never read inside this constructor
+    ) -> None:
+        super().__init__()
+        if od_config.enable_cpu_offload:
+            raise ValueError(
+                "Cosmos3 has no separate text encoder, so CPU offloading "
+                "(transformer↔encoder swapping) is not supported. "
+                "Use --enable-layerwise-offload instead."
+            )
+        self.od_config = od_config
+        self.device = get_local_device()
+        self.dtype = getattr(od_config, "dtype", torch.bfloat16)  # bf16 if the config has no dtype
+
+        model_path = od_config.model
+        local_files_only = os.path.exists(model_path)  # reused by every from_pretrained call below
+
+        # --- Tokenizer ---
+        self.tokenizer = AutoTokenizer.from_pretrained(
+            model_path,
+            subfolder="text_tokenizer",
+            local_files_only=local_files_only,
+        )
+
+        # --- VAE ---
+        self.vae = DistributedAutoencoderKLWan.from_pretrained(
+            model_path,
+            subfolder="vae",
+            torch_dtype=torch.bfloat16,  # fixed here, independent of self.dtype
+            local_files_only=local_files_only,
+        ).to(self.device)
+
+        if not hasattr(self.vae.config, "scale_factor_temporal"):
+            raise ValueError(
+                "Cosmos3 Diffusers VAE config must define scale_factor_temporal "
+                "so transformer mRoPE temporal positions can be computed correctly."
+            )
+        self.vae_scale_factor_temporal = int(self.vae.config.scale_factor_temporal)
+        self.vae_scale_factor_spatial = getattr(self.vae.config, "scale_factor_spatial", 16)
+
+        # --- Transformer (weights loaded later via weights_sources) ---
+        self.transformer = Cosmos3VFMTransformer(
+            od_config=od_config,
+            temporal_compression_factor=self.vae_scale_factor_temporal,
+        )
+
+        # --- Scheduler ---
+        # Load from checkpoint to preserve solver_order, timestep_spacing,
+        # beta_schedule, sigma bounds, flow_shift, etc. Only override
+        # flow_shift when explicitly requested by the user.
+        self.scheduler = UniPCMultistepScheduler.from_pretrained(
+            model_path,
+            subfolder="scheduler",
+            local_files_only=local_files_only,
+        )
+        if od_config.flow_shift is not None:
+            self.scheduler = UniPCMultistepScheduler.from_config(self.scheduler.config, flow_shift=od_config.flow_shift)
+
+        # --- Video processor for post-decode ---
+        self.video_processor = VideoProcessor(vae_scale_factor=self.vae_scale_factor_spatial)
+
+        # --- Weight sources for DiffusersPipelineLoader ---
+        self.weights_sources = [
+            DiffusersPipelineLoader.ComponentSource(
+                model_or_path=model_path,
+                subfolder=None,
+                revision=None,
+                prefix="transformer.",  # the synthetic prefix that _remap_ckpt_key strips again
+                fall_back_to_pt=True,
+                allow_patterns_overrides=["transformer/*.safetensors"],
+            ),
+        ]
+
+        # Snapshot the loaded scheduler config so we can rebuild the
+        # scheduler at request time when a per-request flow_shift override
+        # is supplied (T2I uses shift=3.0; T2V/I2V use the engine default).
+        self._base_scheduler_config = self.scheduler.config
+        # ``_engine_init_flow_shift`` is the shift the engine was configured
+        # with at init time (after the optional ``od_config.flow_shift``
+        # override).  This is the value T2V/I2V requests fall back to.
+        # ``_current_flow_shift`` tracks the shift the scheduler *currently*
+        # uses, since per-request rebuilds in ``_set_flow_shift`` must be
+        # detectable on the next request to restore the prior shift.
+        self._engine_init_flow_shift = float(getattr(self.scheduler.config, "flow_shift", 1.0) or 1.0)
+        self._current_flow_shift = self._engine_init_flow_shift
+
+        self._guidance_scale = None  # read back through the guidance_scale property
+        self._num_timesteps = None  # read back through the num_timesteps property
+        self._loaded_weight_names: set[str] = set()  # replaced by load_weights()
+        self._sound_tokenizer = None  # filled on demand by _get_sound_tokenizer()
+        if getattr(self.transformer, "sound_gen", False):
+            self._get_sound_tokenizer()  # build eagerly when the transformer reports sound_gen
+
+        self.setup_diffusion_pipeline_profiler(
+            enable_diffusion_pipeline_profiler=self.od_config.enable_diffusion_pipeline_profiler
+        )
+
+    # -- Weight loading --------------------------------------------------------
+
+    @staticmethod
+    def _remap_ckpt_key(key: str) -> str | None:
+        """Remap a Diffusers transformer key to the model parameter namespace.
+
+        Checkpoint keys arrive with a synthetic ``transformer.`` prefix from
+        ``weights_sources``.  The source checkpoint itself uses the Diffusers
+        transformer namespace: top-level projections plus ``model.*`` for the
+        Qwen3-VL backbone.  UND and GEN components share each layer in the
+        source and are split into separate module lists here
+        (``language_model.layers`` and ``gen_layers``).
+
+        Args:
+            key: Checkpoint tensor name, with or without the leading
+                ``transformer.`` prefix.
+
+        Returns:
+            The remapped name under ``transformer.``, or ``None`` to skip the
+            tensor (``lm_head.*``, ``action_pos_embed.*`` and any key that no
+            rule below recognizes).
+        """
+        # Strip the weights_sources prefix (at most once).
+        k = key.removeprefix("transformer.")
+
+        # Top-level generation components keep their name.
+        if k.startswith(
+            (
+                "vae2llm.",
+                "llm2vae.",
+                "time_embedder.",
+                "sound2llm.",
+                "llm2sound.",
+                "action2llm.",
+                "llm2action.",
+            )
+        ):
+            return f"transformer.{k}"
+
+        # Modality embeddings: both spellings map to the suffix-less name.
+        if k in ("sound_modality_embed", "sound_modality_embed.weight"):
+            return "transformer.sound_modality_embed"
+        if k in ("action_modality_embed", "action_modality_embed.weight"):
+            return "transformer.action_modality_embed"
+
+        # Skipped outright: action positional embeddings and the LM head.
+        if k.startswith(("action_pos_embed.", "lm_head.")):
+            return None
+
+        # embed_tokens / norm -> language_model.*; norm_moe_gen -> top level.
+        if k.startswith(("model.embed_tokens.", "model.norm.")):
+            return f"transformer.language_model.{k[len('model.') :]}"
+        if k.startswith("model.norm_moe_gen."):
+            return f"transformer.{k[len('model.') :]}"
+
+        # Everything else must be a per-layer tensor: model.layers.{i}.{rest}
+        if not k.startswith("model.layers."):
+            return None
+        parts = k[len("model.") :].split(".", 2)  # ['layers', '{i}', '{rest}']
+        if len(parts) != 3:
+            return None
+        _, layer_idx, rest = parts
+
+        und_lp = f"transformer.language_model.layers.{layer_idx}"
+        gen_lp = f"transformer.gen_layers.{layer_idx}"
+
+        # (checkpoint sub-key, destination sub-key, is GEN pathway).  Patterns
+        # end with "." so "q_proj." never matches "q_proj_moe_gen."; the table
+        # holds only constants, so no per-layer strings are built per tensor.
+        layer_rules = (
+            # UND attention
+            ("self_attn.q_proj.", "self_attn.q_proj.", False),
+            ("self_attn.k_proj.", "self_attn.k_proj.", False),
+            ("self_attn.v_proj.", "self_attn.v_proj.", False),
+            ("self_attn.o_proj.", "self_attn.o_proj.", False),
+            ("self_attn.q_norm.", "self_attn.q_norm.", False),
+            ("self_attn.k_norm.", "self_attn.k_norm.", False),
+            # GEN attention
+            ("self_attn.q_proj_moe_gen.", "cross_attention.q_proj.", True),
+            ("self_attn.k_proj_moe_gen.", "cross_attention.k_proj.", True),
+            ("self_attn.v_proj_moe_gen.", "cross_attention.v_proj.", True),
+            ("self_attn.o_proj_moe_gen.", "cross_attention.o_proj.", True),
+            ("self_attn.q_norm_moe_gen.", "cross_attention.q_norm.", True),
+            ("self_attn.k_norm_moe_gen.", "cross_attention.k_norm.", True),
+            # Norms
+            ("input_layernorm.", "input_layernorm.", False),
+            ("post_attention_layernorm.", "post_attention_layernorm.", False),
+            ("input_layernorm_moe_gen.", "input_layernorm.", True),
+            ("post_attention_layernorm_moe_gen.", "post_attention_layernorm.", True),
+            # UND MLP
+            ("mlp.gate_proj.", "mlp.gate_proj.", False),
+            ("mlp.up_proj.", "mlp.up_proj.", False),
+            ("mlp.down_proj.", "mlp.down_proj.", False),
+            # GEN MLP
+            ("mlp_moe_gen.gate_proj.", "mlp.gate_proj.", True),
+            ("mlp_moe_gen.up_proj.", "mlp.up_proj.", True),
+            ("mlp_moe_gen.down_proj.", "mlp.down_proj.", True),
+        )
+
+        for pattern, target, is_gen in layer_rules:
+            if rest.startswith(pattern):
+                layer_prefix = gen_lp if is_gen else und_lp
+                return f"{layer_prefix}.{target}{rest[len(pattern) :]}"
+
+        return None
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+        """Stream-remap checkpoint weights and load via AutoWeightsLoader.
+
+        Handles quantization, TP-aware weight_loader, and buffer loading.
+        Returns the set of loaded parameter names for strict validation.
+        """
+        state = self.state_dict()
+        allowed = set(state.keys())  # names a remapped tensor may target
+        tp_aware = {n for n, p in self.named_parameters() if hasattr(p, "weight_loader")}
+
+        def _remapped_weights() -> Iterable[tuple[str, torch.Tensor]]:
+            total = kept = 0
+            for name, tensor in weights:
+                total += 1
+                remapped = self._remap_ckpt_key(name)
+                # Tensors that remap to None or to an unknown name are dropped silently.
+                if remapped is not None and (remapped in allowed or remapped in tp_aware):
+                    kept += 1
+                    yield remapped, tensor
+            if _is_rank_zero():  # reached only once the consumer has exhausted the generator
+                logger.info(
+                    "Cosmos3 weight remap: kept %d/%d tensors",
+                    kept,
+                    total,
+                )
+
+        loader = AutoWeightsLoader(self)
+        loaded = loader.load_weights(_remapped_weights())
+        self.transformer.post_load_weights()
+        self.transformer.eval()
+        self._loaded_weight_names = set(loaded)
+        # Fail fast when the config enables sound but no loaded name contains a sound marker.
+        if getattr(self.transformer, "sound_gen", False):
+            sound_markers = ("sound2llm.", "llm2sound.", "sound_modality_embed")
+            missing = [marker.rstrip(".") for marker in sound_markers if not any(marker in name for name in loaded)]
+            if missing:
+                raise ValueError(
+                    "Cosmos3 transformer config enables sound generation, but "
+                    f"the checkpoint is missing sound weights for {missing}. "
+                    "Use a sound-capable transformer checkpoint."
+                )
+        if getattr(self.transformer, "action_gen", False):  # same check for the action markers
+            action_markers = ("action2llm.", "llm2action.", "action_modality_embed")
+            missing = [marker.rstrip(".") for marker in action_markers if not any(marker in name for name in loaded)]
+            if missing:
+                raise ValueError(
+                    "Cosmos3 transformer config enables action generation, but "
+                    f"the checkpoint is missing action weights for {missing}. "
+                    "Use an action-capable transformer checkpoint."
+                )
+        return loaded
+
+    def predict_noise(self, **kwargs) -> torch.Tensor | tuple[torch.Tensor, ...]:
+        """Run the transformer once (overrides ``CFGParallelMixin.predict_noise``).
+
+        All keyword arguments are forwarded as-is and the raw prediction is returned:
+        a tensor for video-only, or a tuple in video, action, sound order for multimodal generation.
+        """
+        return self.transformer(**kwargs)
+
+    @staticmethod
+    def _cfg_parallel_active() -> bool:
+        try:
+            return get_classifier_free_guidance_world_size() > 1  # more than one CFG rank
+        except Exception:  # best-effort probe: any failure to query the size is reported as inactive
+            return False
+
+    @staticmethod
+    def _get_sp_param(sp, key: str, default=None):
+        """Look up a per-request control on the sampling params object.
+
+        Lookup order:
+            1. ``sp.extra_args[key]`` - checked first, because the OpenAI
+               image/video endpoints forward custom controls there (e.g.
+               ``serving_video.py`` writes ``extra_args['flow_shift']``).
+            2. the attribute ``sp.<key>`` - kept for callers that set
+               attributes directly.
+            3. ``default``.
+
+        A value of ``None`` at either of the first two levels counts as
+        "not set" and falls through to the next level, so API overrides
+        forwarded through ``extra_args`` are never silently ignored.
+        """
+        extra_args = getattr(sp, "extra_args", None)
+        if isinstance(extra_args, dict):
+            from_extra = extra_args.get(key)
+            if from_extra is not None:
+                return from_extra
+        attr_value = getattr(sp, key, None)
+        return default if attr_value is None else attr_value
+
+    @staticmethod
+    def _truthy(value) -> bool:
+        if not isinstance(value, str):  # non-strings use ordinary truthiness
+            return bool(value)
+        return value.strip().lower() in {"1", "true", "yes", "on"}
+
+    @classmethod
+    def _get_prompt_param(cls, prompt_data, key: str, default=None):
+        if isinstance(prompt_data, dict):  # non-dict prompts carry no parameters
+            top_level = prompt_data.get(key)
+            if top_level is not None:
+                return top_level
+            nested = prompt_data.get("additional_information")
+            if isinstance(nested, dict) and nested.get(key) is not None:
+                return nested[key]
+        return default
+
+    @classmethod
+    def _is_sound_request(cls, prompt_data, sp) -> bool:
+        # Any truthy alias on the prompt or on the sampling params enables sound.
+        sound_flags = (
+            "sound_gen",
+            "generate_sound",
+            "enable_sound_generation",
+            "return_audio",
+            "output_audio",
+            "generate_audio",
+        )
+        return any(
+            cls._truthy(cls._get_prompt_param(prompt_data, flag, None))
+            or cls._truthy(cls._get_sp_param(sp, flag, None))
+            for flag in sound_flags
+        )
+
+    @classmethod
+    def _get_action_mode(cls, prompt_data, sp) -> str | None:
+        prompt_mode = cls._get_prompt_param(prompt_data, "action_mode", None)
+        # Sampling params take precedence; the prompt value is only the fallback.
+        return normalize_action_mode(cls._get_sp_param(sp, "action_mode", prompt_mode))
+
+    def _get_sound_tokenizer(self):
+        """Return the sound tokenizer, building it from ``od_config`` on first use."""
+        tokenizer = getattr(self, "_sound_tokenizer", None)
+        if tokenizer is None:
+            from .sound_tokenizer import Cosmos3SoundTokenizer
+
+            tokenizer = self._sound_tokenizer = Cosmos3SoundTokenizer.from_config(self.od_config)
+        return tokenizer
+
+    @staticmethod
+    def _is_t2i_request(req: OmniDiffusionRequest) -> bool:
+        """Return True when the first prompt's ``modalities`` requests image output."""
+        if not req.prompts:
+            return False
+        head = req.prompts[0]
+        requested = head.get("modalities", []) if isinstance(head, dict) else []
+        if requested is None or isinstance(requested, str):
+            # None means "nothing requested"; a bare string is a single modality.
+            requested = [] if requested is None else [requested]
+        wants_image = "image" in requested
+        if wants_image and "video" in requested:
+            raise ValueError("Cosmos3 prompt modalities cannot request both image and video output.")
+        return wants_image
+
+    def _set_flow_shift(self, target_shift: float) -> None:
+        """Switch the UniPC scheduler to ``target_shift`` if it is not already there.
+
+        When the requested shift differs from ``self._current_flow_shift`` a
+        new scheduler is built from ``self._base_scheduler_config`` with the
+        new ``flow_shift`` and the tracker is updated; otherwise nothing
+        happens.  The tracker, rather than ``self.scheduler.config``, is the
+        reference for the comparison because earlier requests may already
+        have replaced the scheduler.
+        """
+        requested = float(target_shift)
+        if requested != float(self._current_flow_shift):
+            rebuilt = UniPCMultistepScheduler.from_config(self._base_scheduler_config, flow_shift=requested)
+            self.scheduler = rebuilt
+            self._current_flow_shift = requested
+
+    def _set_scheduler_timesteps(self, num_inference_steps: int) -> None:
+        for name, value in vars(self.scheduler).items():  # setattr below only rebinds existing keys
+            if isinstance(value, torch.Tensor) and value.device.type != "cpu":
+                setattr(self.scheduler, name, value.cpu())  # move tensor state back to CPU first
+        self.scheduler.set_timesteps(num_inference_steps, device=self.device)
+
+    @property
+    def guidance_scale(self):
+        return self._guidance_scale  # initialised to None in __init__
+
+    @property
+    def do_classifier_free_guidance(self):
+        return self._guidance_scale is not None and self._guidance_scale > 1.0  # CFG only above 1.0
+
+    @property
+    def num_timesteps(self):
+        return self._num_timesteps  # initialised to None in __init__
+
+    # -- Prompt formatting -----------------------------------------------------
+
+    @staticmethod
+    def _apply_metadata_templates(
+        prompt: str,
+        num_frames: int,
+        frame_rate: float,
+        height: int,
+        width: int,
+        duration_template: str | None = COSMOS3_DURATION_TEMPLATE,
+        resolution_template: str | None = COSMOS3_RESOLUTION_TEMPLATE,
+        force_duration_template: bool = False,
+    ) -> str:
+        """Append duration and resolution metadata sentences to ``prompt``.
+
+        The duration sentence is added only for multi-frame requests unless
+        ``force_duration_template`` is set; a ``None`` template disables the
+        corresponding sentence.  Trailing dots are stripped before each append.
+        """
+        extras = []
+        if duration_template is not None and (force_duration_template or num_frames > 1):
+            extras.append(duration_template.format(duration=num_frames / frame_rate, fps=frame_rate))
+        if resolution_template is not None:
+            extras.append(resolution_template.format(height=height, width=width))
+        for sentence in extras:
+            prompt = prompt.rstrip(".") + ". " + sentence
+        return prompt
+
+    # -- Tokenization --------------------------------------------------------
+
+    def _tokenize_prompt(
+        self,
+        text: str,
+        max_sequence_length: int,
+        use_system_prompt: bool = False,
+        system_prompt: str | None = None,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        """Tokenize a prompt with the tokenizer's chat template.
+
+        The templated ids are cut to ``max_sequence_length - 2`` so the EOS and
+        ``<|vision_start|>`` markers appended afterwards still fit; the result
+        is right-padded to exactly ``max_sequence_length`` (when that is >= 2).
+
+        Returns (input_ids, attention_mask) as [1, S] tensors on device.
+        """
+        conversations = []
+        if use_system_prompt:
+            system_text = system_prompt or COSMOS3_SYSTEM_PROMPT
+            conversations.append({"role": "system", "content": system_text})
+        conversations.append({"role": "user", "content": text})
+
+        token_ids = self._normalize_token_ids(
+            self.tokenizer.apply_chat_template(conversations, tokenize=True, add_generation_prompt=True)
+        )
+        # Reserve two slots so the markers below cannot push past the limit.
+        token_ids = token_ids[: max(max_sequence_length - 2, 0)]
+        token_ids.append(self.tokenizer.eos_token_id)  # 151645
+        token_ids.append(self.tokenizer.convert_tokens_to_ids("<|vision_start|>"))  # 151652
+        seq_len = len(token_ids)
+        pad_len = max(max_sequence_length - seq_len, 0)
+        attention_mask = [1] * seq_len + [0] * pad_len
+        token_ids = token_ids + [self.tokenizer.pad_token_id or 0] * pad_len
+
+        input_ids = torch.tensor([token_ids], dtype=torch.long, device=self.device)
+        attention_mask = torch.tensor([attention_mask], dtype=torch.long, device=self.device)
+        return input_ids, attention_mask
+
+    @staticmethod
+    def _normalize_token_ids(tokenized_output: object) -> list[int]:
+        """Normalize tokenizer outputs into a flat ``list[int]``.
+
+        Different Transformers/tokenizers versions can return ``list[int]``,
+        a mapping/BatchEncoding with ``input_ids``, tensors, or
+        ``tokenizers.Encoding`` objects from ``apply_chat_template``.
+        """
+        token_ids = tokenized_output
+        while True:  # peel one wrapper per iteration until a plain list remains
+            if isinstance(token_ids, dict) and "input_ids" in token_ids:
+                token_ids = token_ids["input_ids"]
+            elif hasattr(token_ids, "input_ids"):  # attribute-style container
+                token_ids = token_ids.input_ids
+            elif hasattr(token_ids, "ids"):  # object exposing ``.ids``
+                token_ids = token_ids.ids
+            elif hasattr(token_ids, "tolist"):  # tensor / array-like
+                token_ids = token_ids.tolist()
+            elif isinstance(token_ids, tuple):
+                token_ids = list(token_ids)
+            elif isinstance(token_ids, list) and len(token_ids) == 1:
+                # A one-element list may be a batch of one that still wraps the real ids.
+                first = token_ids[0]
+                if isinstance(first, list | tuple):
+                    token_ids = list(first)
+                elif hasattr(first, "ids") or hasattr(first, "input_ids"):
+                    token_ids = first  # handled by the attribute branches on the next pass
+                elif hasattr(first, "tolist"):
+                    first_list = first.tolist()
+                    if isinstance(first_list, list | tuple):
+                        token_ids = list(first_list)
+                    else:
+                        break  # scalar element: the list already is the flat id list
+                else:
+                    break
+            else:
+                break
+
+        if not isinstance(token_ids, list):
+            raise TypeError(
+                "Cosmos3 tokenizer must return token IDs as a list-like value; "
+                f"got {type(token_ids).__name__}: {token_ids!r}"
+            )
+
+        normalized_ids = []
+        for idx, token_id in enumerate(token_ids):
+            if hasattr(token_id, "item"):  # unwrap scalar objects exposing ``.item()``
+                token_id = token_id.item()
+            try:
+                normalized_ids.append(int(token_id))
+            except (TypeError, ValueError) as exc:
+                raise TypeError(
+                    "Cosmos3 tokenizer returned a non-integer token at "
+                    f"index {idx}: {type(token_id).__name__}: {token_id!r}"
+                ) from exc
+        return normalized_ids
+
+    # -- Latent preparation --------------------------------------------------
+
+    def _prepare_latents(
+        self,
+        height: int,
+        width: int,
+        num_frames: int,
+        generator: torch.Generator,
+    ) -> torch.Tensor:
+        """Sample the initial Gaussian video latent for one request.
+
+        The returned tensor has shape ``[1, C, T_lat, H_lat, W_lat]``.
+        """
+        spatial = self.vae_scale_factor_spatial
+        channels = self.transformer.latent_channel_size
+        # One latent frame, plus one per full temporal stride after the first pixel frame.
+        latent_frames = 1 + (num_frames - 1) // self.vae_scale_factor_temporal
+        latent_shape = (1, channels, latent_frames, height // spatial, width // spatial)
+        return randn_tensor(latent_shape, generator=generator, device=self.device, dtype=self.dtype)
+
+    def _prepare_sound_latents(
+        self,
+        target_audio_samples: int,
+        generator: torch.Generator,
+    ) -> tuple[torch.Tensor, int]:
+        """Sample Gaussian sound latents sized for ``target_audio_samples``.
+
+        Returns ``(latents, latent_frames)`` with latents shaped ``[1, C, latent_frames]``.
+        """
+        tokenizer = self._get_sound_tokenizer()
+        n_latent = max(1, int(tokenizer.get_latent_num_samples(max(1, target_audio_samples))))
+        tokenizer_channels = int(getattr(tokenizer, "latent_ch", 64))
+        expected_channels = int(getattr(self.transformer, "sound_dim", tokenizer_channels))
+        if tokenizer_channels != expected_channels:
+            raise ValueError(
+                "Cosmos3 sound tokenizer latent channels do not match transformer "
+                f"sound_dim: tokenizer={tokenizer_channels}, transformer={expected_channels}."
+            )
+        noise_shape = (1, tokenizer_channels, n_latent)
+        noise = randn_tensor(noise_shape, generator=generator, device=self.device, dtype=self.dtype)
+        return noise, n_latent
+
+    def _resolve_sound_target_samples(
+        self,
+        sp,
+        num_frames: int,
+        frame_rate: float,
+    ) -> tuple[int, float, int]:
+        """Return ``(target_samples, duration_seconds, sample_rate)`` for the audio track."""
+        tokenizer = self._get_sound_tokenizer()
+        requested = self._get_sp_param(sp, "sound_duration", None)
+        if requested is None:
+            requested = self._get_sp_param(sp, "audio_duration", None)
+        seconds = float(num_frames / frame_rate if requested is None else requested)
+        seconds = max(seconds, 1.0 / max(float(frame_rate), 1.0))  # lower bound on the duration
+        rate = int(getattr(tokenizer, "sample_rate", 48000))
+        return max(1, int(round(seconds * rate))), seconds, rate
+
+    # -- VAE decode ----------------------------------------------------------
+
+    def _decode_latents(self, latents: torch.Tensor) -> torch.Tensor:
+        """Map normalized latents back to VAE space and decode them to pixels.
+
+        Per-channel ``latents_mean``/``latents_std`` are used when the VAE config
+        defines both; otherwise the scalar ``scaling_factor`` is applied.
+        """
+        latents = latents.to(self.vae.dtype)
+        vae_cfg = self.vae.config
+        if not (hasattr(vae_cfg, "latents_mean") and hasattr(vae_cfg, "latents_std")):
+            latents = latents / getattr(vae_cfg, "scaling_factor", 1.0)
+        else:
+            if not hasattr(self, "_latents_mean"):
+                mean = torch.tensor(vae_cfg.latents_mean).view(1, -1, 1, 1, 1)
+                std = torch.tensor(vae_cfg.latents_std).view(1, -1, 1, 1, 1)
+                self._latents_mean = mean.to(self.device, self.vae.dtype)  # cached on the instance
+                self._latents_std = std.to(self.device, self.vae.dtype)
+            latents = (latents * self._latents_std) + self._latents_mean
+        return self.vae.decode(latents, return_dict=False)[0]
+
+    def _decode_sound_latents(
+        self,
+        sound_latents: torch.Tensor,
+        target_audio_samples: int,
+    ) -> torch.Tensor:
+        waveform = self._get_sound_tokenizer().decode(sound_latents.to(self.dtype))
+        shortfall = target_audio_samples - waveform.shape[-1]
+        if shortfall > 0:  # too short: zero-pad on the right of the last axis
+            waveform = torch.nn.functional.pad(waveform, (0, shortfall))
+        elif shortfall < 0:  # too long: keep only the first target_audio_samples
+            waveform = waveform[..., :target_audio_samples]
+        return waveform.detach().cpu()
+
+    # -- Prompt formatting + tokenization (shared by T2V, I2V and T2I) ------
+
+    def _format_and_tokenize_prompts(
+        self,
+        prompt: str,
+        negative_prompt: str,
+        num_frames: int,
+        frame_rate: float,
+        height: int,
+        width: int,
+        max_sequence_length: int,
+        sp,
+        use_system_prompt: bool = False,
+        is_t2i: bool = False,
+    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+        """Format prompts with metadata templates and tokenize.
+
+        Returns (cond_ids, cond_mask, uncond_ids, uncond_mask).  For T2I
+        (``is_t2i=True``) the duration template is suppressed and the
+        image-flavored resolution template is used.  Boolean controls from
+        ``sp`` are parsed with ``_truthy`` so string values such as ``"false"``
+        arriving through ``extra_args`` are honored.
+        """
+        # Route cosmos3-specific controls through ``_get_sp_param`` so they
+        # are picked up from ``extra_args`` (OpenAI endpoint path) as well
+        # as from direct attributes.
+        use_duration_template = self._truthy(self._get_sp_param(sp, "use_duration_template", True)) and not is_t2i
+        dur_tmpl = COSMOS3_DURATION_TEMPLATE if use_duration_template else None
+        if self._truthy(self._get_sp_param(sp, "use_resolution_template", True)):
+            res_tmpl = COSMOS3_IMAGE_RESOLUTION_TEMPLATE if is_t2i else COSMOS3_RESOLUTION_TEMPLATE
+        else:
+            res_tmpl = None
+        prompt = self._apply_metadata_templates(
+            prompt,
+            num_frames,
+            frame_rate,
+            height,
+            width,
+            duration_template=dur_tmpl,
+            resolution_template=res_tmpl,
+        )
+        if _is_rank_zero():
+            logger.info("Final prompt: '%s'", prompt)
+
+        # Negative prompt metadata: "none" | "same" | "inverse".
+        # "same"    = same templates as positive (CFG guides caption only).
+        # "inverse" = inverted templates ("not {duration}...", "not {height}x{width}...").
+        # "none"    = no metadata on negative prompt.
+        # negative_prompt_keep_metadata upgrades "none" to "same" for video; for
+        # T2I the upgrade target is "none", so T2I keeps a plain neg prompt.
+        neg_meta_default = "none" if is_t2i else "same"
+        neg_meta_mode = self._get_sp_param(sp, "negative_metadata_mode", "none")
+        keep_metadata = self._truthy(self._get_sp_param(sp, "negative_prompt_keep_metadata", not is_t2i))
+        if keep_metadata and neg_meta_mode == "none":
+            neg_meta_mode = neg_meta_default
+
+        if neg_meta_mode == "same":
+            negative_prompt = (
+                self._apply_metadata_templates(
+                    negative_prompt,
+                    num_frames,
+                    frame_rate,
+                    height,
+                    width,
+                    duration_template=dur_tmpl,
+                    resolution_template=res_tmpl,
+                )
+                .lstrip(".")
+                .strip()
+            )
+        elif neg_meta_mode == "inverse":
+            inv_dur = COSMOS3_INVERSE_DURATION_TEMPLATE if dur_tmpl else None
+            if res_tmpl is None:
+                inv_res = None
+            elif is_t2i:
+                inv_res = COSMOS3_INVERSE_IMAGE_RESOLUTION_TEMPLATE
+            else:
+                inv_res = COSMOS3_INVERSE_RESOLUTION_TEMPLATE
+            negative_prompt = (
+                self._apply_metadata_templates(
+                    negative_prompt,
+                    num_frames,
+                    frame_rate,
+                    height,
+                    width,
+                    duration_template=inv_dur,
+                    resolution_template=inv_res,
+                    force_duration_template=True,
+                )
+                .lstrip(".")
+                .strip()
+            )
+
+        default_sys_prompt = COSMOS3_T2I_SYSTEM_PROMPT if is_t2i else COSMOS3_SYSTEM_PROMPT
+        sys_prompt = self._get_sp_param(sp, "system_prompt", default_sys_prompt) or default_sys_prompt
+        cond_ids, cond_mask = self._tokenize_prompt(
+            prompt, max_sequence_length, use_system_prompt, system_prompt=sys_prompt
+        )
+        uncond_ids, uncond_mask = self._tokenize_prompt(
+            negative_prompt, max_sequence_length, use_system_prompt, system_prompt=sys_prompt
+        )
+        return cond_ids, cond_mask, uncond_ids, uncond_mask
+
+    # -- I2V latent preparation ---------------------------------------------
+
+    def _encode_conditioning_video(
+        self,
+        image_tensor: torch.Tensor,
+        num_frames: int,
+        height: int,
+        width: int,
+    ) -> torch.Tensor:
+        """Encode the conditioning image as a constant video and normalize the latent.
+
+        The image is repeated over all ``num_frames`` pixel frames before the
+        VAE encode: the WAN VAE has temporal compression (factor 4), so a lone
+        frame produces degenerate temporal features.  The caller keeps only
+        the conditioned latent frame(s) and replaces the rest with noise.
+        ``height`` and ``width`` are accepted but not read here.
+        """
+        # [1, 3, H, W] -> [1, 3, num_frames, H, W]
+        pixels = image_tensor.unsqueeze(2).expand(-1, -1, num_frames, -1, -1).contiguous()
+        pixels = pixels.to(device=self.device, dtype=self.vae.dtype)
+        encoded = self.vae.encode(pixels).latent_dist.mode()
+        vae_cfg = self.vae.config
+
+        def _per_channel(values) -> torch.Tensor:
+            # Broadcastable [1, C, 1, 1, 1] statistic on the latent's device/dtype.
+            return torch.tensor(values).view(1, -1, 1, 1, 1).to(encoded.device, encoded.dtype)
+
+        # Normalize (inverse of _decode_latents denormalization)
+        if hasattr(vae_cfg, "latents_mean") and hasattr(vae_cfg, "latents_std"):
+            mean = _per_channel(vae_cfg.latents_mean)
+            std = _per_channel(vae_cfg.latents_std)
+            encoded = (encoded - mean) / std
+        else:
+            encoded = encoded * getattr(vae_cfg, "scaling_factor", 1.0)
+
+        return encoded.to(self.dtype)
+
+    def _encode_video_tensor(self, video_tensor: torch.Tensor) -> torch.Tensor:
+        """Encode a pixel-space video with the VAE and normalize the latent.
+
+        Accepts ``[1, 3, T, H, W]`` (or ``[3, T, H, W]``, which gets a batch
+        axis); raises ``ValueError`` for any other layout.
+        """
+        pixels = video_tensor.unsqueeze(0) if video_tensor.ndim == 4 else video_tensor
+        if pixels.ndim != 5 or pixels.shape[0] != 1 or pixels.shape[1] != 3:
+            raise ValueError(f"Cosmos3 video tensor must have shape [1, 3, T, H, W], got {tuple(pixels.shape)}.")
+
+        vae_cfg = self.vae.config
+        encoded = self.vae.encode(pixels.to(device=self.device, dtype=self.vae.dtype)).latent_dist.mode()
+        if hasattr(vae_cfg, "latents_mean") and hasattr(vae_cfg, "latents_std"):
+            # view(1, -1, 1, 1, 1): one value per entry along axis 1, broadcast over the rest.
+            mean, std = (
+                torch.tensor(stat).view(1, -1, 1, 1, 1).to(encoded.device, encoded.dtype)
+                for stat in (vae_cfg.latents_mean, vae_cfg.latents_std)
+            )
+            encoded = (encoded - mean) / std
+        else:
+            encoded = encoded * getattr(vae_cfg, "scaling_factor", 1.0)
+
+        return encoded.to(self.dtype)
+
+    def _prepare_latents_i2v(
+        self,
+        image_tensor: torch.Tensor,
+        height: int,
+        width: int,
+        num_frames: int,
+        generator: torch.Generator,
+    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        """Build the I2V starting latents: clean image in frame 0, noise after.
+
+        Returns a ``(latents, velocity_mask, image_latent)`` tuple:
+            latents: [1, C, T_lat, H_lat, W_lat]; frame 0 = image, rest = noise
+            velocity_mask: [1, 1, T_lat, 1, 1]; 0 at frame 0, 1 elsewhere
+            image_latent: [1, C, 1, H_lat, W_lat]; clean frame 0 for re-injection
+        """
+        latent_shape = (
+            1,
+            self.transformer.latent_channel_size,
+            (num_frames - 1) // self.vae_scale_factor_temporal + 1,
+            height // self.vae_scale_factor_spatial,
+            width // self.vae_scale_factor_spatial,
+        )
+        noise = randn_tensor(latent_shape, generator=generator, device=self.device, dtype=self.dtype)
+
+        # The image is encoded as a repeated-frame video; only latent frame 0 is kept clean.
+        encoded = self._encode_conditioning_video(image_tensor, num_frames, height, width)
+
+        # keep_mask is 1 where the clean latent is kept (frame 0) and 0 where noise is used.
+        keep_mask = torch.zeros(1, 1, latent_shape[2], 1, 1, device=self.device, dtype=self.dtype)
+        keep_mask[:, :, 0, :, :] = 1.0
+        latents = keep_mask * encoded + (1.0 - keep_mask) * noise
+
+        first_frame = encoded[:, :, 0:1, :, :]
+        velocity_mask = 1.0 - keep_mask
+        return latents, velocity_mask, first_frame
+
+    def _prepare_latents_action_video(
+        self,
+        video_tensor: torch.Tensor,
+        mode: str,
+        height: int,
+        width: int,
+        num_frames: int,
+        generator: torch.Generator,
+    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        """Mix an encoded action video with noise according to the action mode.
+
+        ``height``/``width`` are ignored; the latent grid comes from the last two
+        axes of ``video_tensor``.  Returns ``(latents, velocity_mask, cond_latent)``
+        and raises ``ValueError`` if the encoded (T, H, W) is not the expected grid.
+        """
+        del height, width  # unused: sizes come from video_tensor
+        spatial = self.vae_scale_factor_spatial
+        temporal = self.vae_scale_factor_temporal
+        expected_shape = (
+            1,
+            self.transformer.latent_channel_size,
+            (num_frames - 1) // temporal + 1,
+            video_tensor.shape[-2] // spatial,
+            video_tensor.shape[-1] // spatial,
+        )
+        noise = randn_tensor(expected_shape, generator=generator, device=self.device, dtype=self.dtype)
+
+        cond_latent = self._encode_video_tensor(video_tensor)
+        if cond_latent.shape[2:] != noise.shape[2:]:
+            raise ValueError(
+                "Cosmos3 action video latent shape mismatch: "
+                f"encoded={tuple(cond_latent.shape)}, expected={tuple(noise.shape)}."
+            )
+        condition_mask = build_vision_condition_mask(
+            mode, num_frames, temporal, device=self.device, dtype=self.dtype
+        )
+        latents = condition_mask * cond_latent + (1.0 - condition_mask) * noise
+        return latents, 1.0 - condition_mask, cond_latent
+
+    def _prepare_action_latents(
+        self,
+        *,
+        mode: str,
+        action_chunk_size: int,
+        raw_action_dim: int | None,
+        generator: torch.Generator,
+        sp,
+    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, int]:
+        """Build the initial action latents, their velocity mask and clean reference.
+
+        Forward-dynamics mode loads the request's action (``action``/``action_path``),
+        pads it with its last row or truncates it to ``action_chunk_size`` rows, and
+        defaults ``raw_action_dim`` to its width; other modes start from zeros and
+        require ``raw_action_dim``.  Columns from ``raw_action_dim`` on are zeroed.
+        Returns ``(action_latents, action_velocity_mask, clean_action, raw_action_dim)``.
+        """
+        action_dim = int(getattr(self.transformer, "action_dim", 64))
+        is_forward_dynamics = mode == ACTION_MODE_FORWARD_DYNAMICS
+        if not is_forward_dynamics and raw_action_dim is None:
+            raise ValueError(
+                "Cosmos3 action_mode='policy' and 'inverse_dynamics' require extra_args['raw_action_dim']."
+            )
+        if is_forward_dynamics:
+            raw_action = self._get_sp_param(sp, "action", None)
+            action = load_action_tensor(raw_action, self._get_sp_param(sp, "action_path", None))
+            missing_rows = action_chunk_size - action.shape[0]
+            if missing_rows > 0:
+                # Too short: repeat the final row until the chunk is full.
+                action = torch.cat([action, action[-1:].repeat(missing_rows, 1)], dim=0)
+            elif missing_rows < 0:
+                action = action[:action_chunk_size]
+            if raw_action_dim is None:
+                raw_action_dim = int(action.shape[-1])
+            clean_action = pad_action_to_dim(action, action_dim)
+        else:
+            clean_action = torch.zeros(action_chunk_size, action_dim, dtype=torch.float32)
+
+        raw_action_dim = int(raw_action_dim)
+        if not 0 < raw_action_dim <= action_dim:
+            raise ValueError(f"Cosmos3 raw_action_dim must be in [1, {action_dim}], got {raw_action_dim}.")
+
+        clean_action = clean_action.to(device=self.device, dtype=self.dtype).unsqueeze(0)
+        keep_mask = build_action_condition_mask(mode, action_chunk_size, device=self.device, dtype=self.dtype)
+        noise_shape = (1, action_chunk_size, action_dim)
+        noise = randn_tensor(noise_shape, generator=generator, device=self.device, dtype=self.dtype)
+
+        # Zero the trailing columns in place, in both tensors, before mixing.
+        noise[:, :, raw_action_dim:] = 0
+        clean_action[:, :, raw_action_dim:] = 0
+        action_latents = keep_mask * clean_action + (1.0 - keep_mask) * noise
+        return action_latents, 1.0 - keep_mask, clean_action, raw_action_dim
+
+    # -- Denoising loop (shared by T2I, T2V, I2V, action and sound) ---------
+
+    def diffuse(
+        self,
+        latents: torch.Tensor,
+        timesteps: torch.Tensor,
+        cond_ids: torch.Tensor,
+        cond_mask: torch.Tensor,
+        uncond_ids: torch.Tensor,
+        uncond_mask: torch.Tensor,
+        guidance_scale: float,
+        shared_kwargs: dict,
+        *,
+        action_latents: torch.Tensor | None = None,
+        action_velocity_mask: torch.Tensor | None = None,
+        action_condition_latents: torch.Tensor | None = None,
+        sound_latents: torch.Tensor | None = None,
+        velocity_mask: torch.Tensor | None = None,
+        image_latent: torch.Tensor | None = None,
+        condition_latents: torch.Tensor | None = None,
+        guidance_interval: tuple[float, float] | None = None,
+    ) -> torch.Tensor | tuple[torch.Tensor, ...]:
+        """Denoising loop with 3-mode CFG support (parallel, sequential, none).
+
+        Cosmos3's UND pathway is text-dependent, so CFG needs separate K/V
+        caches for conditional and unconditional text.
+
+        Three modes:
+          1. CFG parallel (multi-GPU): each rank handles one condition via
+             predict_noise_maybe_with_cfg; caching is rank-local.
+          2. Sequential CFG (single-GPU or cfg_size=1): two separate
+             forward passes with explicit cache swapping.  We cannot
+             batch B=2 because different text lengths would cause the
+             shorter branch to attend to padding in cross-attention.
+          3. No CFG (``guidance_scale <= 1.0``): one conditional pass per step.
+
+        Conditioning: ``velocity_mask`` zeros the conditioned frames' predictions
+        before stepping; afterwards clean content is re-injected (a blend with
+        ``condition_latents`` when given, else ``image_latent`` into frame 0),
+        since UniPC's predictor-corrector update rescales the sample
+        (sigma-dependent), so even zero velocity does not preserve those frames.
+
+        ``guidance_interval`` (T2I) restricts CFG to raw scheduler timesteps in the
+        closed interval ``[lo, hi]``; this works for both the [0, 1000] discrete
+        scale and normalized flow-matching scales.  Outside the interval, CFG
+        parallel runs both branches with scale 1.0 (identical control flow on all
+        ranks) and sequential CFG skips the unconditional pass.  Returns the video
+        latents, or a tuple with action and/or sound latents appended when supplied.
+        """
+        do_cfg = guidance_scale > 1.0
+        cfg_parallel = self._cfg_parallel_active() and do_cfg
+        self.transformer.reset_cache()
+
+        def _cfg_active_at(t: torch.Tensor) -> bool:  # True when CFG applies at timestep t
+            if guidance_interval is None:
+                return True
+            t_scalar = float(t.item()) if torch.is_tensor(t) else float(t)
+            lo, hi = guidance_interval
+            return lo <= t_scalar <= hi
+
+        def _pack_joint(
+            video_tensor: torch.Tensor,
+            action_tensor: torch.Tensor | None = None,
+            sound_tensor: torch.Tensor | None = None,
+        ):  # -> (packed [batch, N], per-tensor shapes, per-tensor flattened sizes)
+            batch = video_tensor.shape[0]
+            tensors = [video_tensor]
+            if action_tensor is not None:
+                tensors.append(action_tensor)
+            if sound_tensor is not None:
+                tensors.append(sound_tensor)
+            flats = [tensor.reshape(batch, -1) for tensor in tensors]
+            return torch.cat(flats, dim=1), [tensor.shape for tensor in tensors], [flat.shape[1] for flat in flats]
+
+        def _unpack_joint(
+            packed: torch.Tensor,
+            shapes: list[torch.Size],
+            numels: list[int],
+        ) -> tuple[torch.Tensor, ...]:  # inverse of _pack_joint
+            outputs = []
+            offset = 0
+            for shape, numel in zip(shapes, numels, strict=True):
+                outputs.append(packed[:, offset : offset + numel].reshape(shape))
+                offset += numel
+            return tuple(outputs)
+
+        def _split_noise_pred(
+            noise_pred: torch.Tensor | tuple[torch.Tensor, ...],
+        ) -> tuple[torch.Tensor, torch.Tensor | None, torch.Tensor | None]:  # (video, action, sound)
+            has_action = action_latents is not None
+            has_sound = sound_latents is not None
+            if not has_action and not has_sound:
+                if isinstance(noise_pred, tuple):
+                    raise ValueError("Cosmos3 video-only diffusion received tuple predictions.")
+                return noise_pred, None, None
+            if not isinstance(noise_pred, tuple):
+                raise ValueError("Cosmos3 multimodal diffusion expects transformer predictions as a tuple.")
+            expected = 1 + int(has_action) + int(has_sound)
+            if len(noise_pred) != expected:
+                raise ValueError(
+                    f"Cosmos3 multimodal diffusion expected {expected} predictions, got {len(noise_pred)}."
+                )
+            video_pred = noise_pred[0]
+            idx = 1
+            action_pred = noise_pred[idx] if has_action else None
+            if has_action:
+                idx += 1
+            sound_pred = noise_pred[idx] if has_sound else None
+            return video_pred, action_pred, sound_pred
+
+        def _step(
+            noise_pred: torch.Tensor | tuple[torch.Tensor, ...],
+            t: torch.Tensor,
+            latents: torch.Tensor,
+            action_latents: torch.Tensor | None,
+            sound_latents: torch.Tensor | None,
+        ) -> torch.Tensor | tuple[torch.Tensor, ...]:
+            video_pred, action_pred, sound_pred = _split_noise_pred(noise_pred)
+            if velocity_mask is not None:
+                video_pred = video_pred * velocity_mask
+            if action_pred is not None and action_velocity_mask is not None:
+                action_pred = action_pred * action_velocity_mask
+            if action_latents is None and sound_latents is None:
+                latents = self.scheduler.step(video_pred, t, latents, return_dict=False)[0]
+            else:
+                # Multimodal: all modalities go through one scheduler step as a single packed tensor.
+                packed_noise, shapes, numels = _pack_joint(video_pred, action_pred, sound_pred)
+                packed_latents, _, _ = _pack_joint(latents, action_latents, sound_latents)
+                packed_next = self.scheduler.step(packed_noise, t, packed_latents, return_dict=False)[0]
+                unpacked = _unpack_joint(packed_next, shapes, numels)
+                latents = unpacked[0]
+                idx = 1
+                if action_latents is not None:
+                    action_latents = unpacked[idx]
+                    idx += 1
+                if sound_latents is not None:
+                    sound_latents = unpacked[idx]
+            if condition_latents is not None and velocity_mask is not None:
+                latents = velocity_mask * latents + (1.0 - velocity_mask) * condition_latents
+            elif image_latent is not None:
+                latents[:, :, 0:1, :, :] = image_latent  # in-place write of the clean frame 0
+            if action_latents is not None and action_condition_latents is not None and action_velocity_mask is not None:
+                action_latents = (
+                    action_velocity_mask * action_latents + (1.0 - action_velocity_mask) * action_condition_latents
+                )
+            outputs = [latents]
+            if action_latents is not None:
+                outputs.append(action_latents)
+            if sound_latents is not None:
+                outputs.append(sound_latents)
+            return outputs[0] if len(outputs) == 1 else tuple(outputs)
+
+        def _assign_step_out(step_out: torch.Tensor | tuple[torch.Tensor, ...]) -> None:
+            nonlocal latents, action_latents, sound_latents
+            if action_latents is None and sound_latents is None:
+                assert isinstance(step_out, torch.Tensor)
+                latents = step_out
+                return
+            if not isinstance(step_out, tuple):
+                raise ValueError("Cosmos3 multimodal diffusion step returned a non-tuple result.")
+            latents = step_out[0]
+            idx = 1
+            if action_latents is not None:
+                action_latents = step_out[idx]
+                idx += 1
+            if sound_latents is not None:
+                sound_latents = step_out[idx]
+
+        if cfg_parallel:
+            for t in self.progress_bar(timesteps):
+                timestep = t.unsqueeze(0)
+                # Out-of-interval steps run with effective scale 1.0 so the
+                # combined output equals the cond branch (uncond is dropped).
+                # All ranks still execute both branches; no CFG-Parallel
+                # divergence.
+                step_scale = guidance_scale if _cfg_active_at(t) else 1.0
+                noise_pred = self.predict_noise_maybe_with_cfg(
+                    do_true_cfg=True,
+                    true_cfg_scale=step_scale,
+                    positive_kwargs=dict(
+                        hidden_states=latents,
+                        timestep=timestep,
+                        text_ids=cond_ids,
+                        text_mask=cond_mask,
+                        action_latents=action_latents,
+                        sound_latents=sound_latents,
+                        **shared_kwargs,
+                    ),
+                    negative_kwargs=dict(
+                        hidden_states=latents,
+                        timestep=timestep,
+                        text_ids=uncond_ids,
+                        text_mask=uncond_mask,
+                        action_latents=action_latents,
+                        sound_latents=sound_latents,
+                        **shared_kwargs,
+                    ),
+                    cfg_normalize=False,
+                )
+                _assign_step_out(_step(noise_pred, t, latents, action_latents, sound_latents))
+
+        elif do_cfg:
+            cond_cache: tuple = (None, None)  # (cached_kv, cached_freqs_gen) after the first cond pass
+            uncond_cache: tuple = (None, None)  # same pair, for the unconditional branch
+
+            for t in self.progress_bar(timesteps):
+                timestep = t.unsqueeze(0)
+                cfg_active = _cfg_active_at(t)
+
+                self.transformer.cached_kv, self.transformer.cached_freqs_gen = cond_cache
+                noise_cond = self.transformer(
+                    hidden_states=latents,
+                    timestep=timestep,
+                    text_ids=cond_ids,
+                    text_mask=cond_mask,
+                    action_latents=action_latents,
+                    sound_latents=sound_latents,
+                    **shared_kwargs,
+                )
+                if cond_cache[0] is None:
+                    cond_cache = (self.transformer.cached_kv, self.transformer.cached_freqs_gen)
+
+                if cfg_active:
+                    self.transformer.cached_kv, self.transformer.cached_freqs_gen = uncond_cache
+                    noise_uncond = self.transformer(
+                        hidden_states=latents,
+                        timestep=timestep,
+                        text_ids=uncond_ids,
+                        text_mask=uncond_mask,
+                        action_latents=action_latents,
+                        sound_latents=sound_latents,
+                        **shared_kwargs,
+                    )
+                    if uncond_cache[0] is None:
+                        uncond_cache = (self.transformer.cached_kv, self.transformer.cached_freqs_gen)
+                    noise_pred = self.combine_cfg_noise(noise_cond, noise_uncond, guidance_scale, cfg_normalize=False)
+                else:
+                    # Skip uncond forward entirely outside the interval; this
+                    # is correctness-preserving (CFG with scale=1 reduces to
+                    # the cond branch) and gives a free speedup for T2I.
+                    noise_pred = noise_cond
+
+                _assign_step_out(_step(noise_pred, t, latents, action_latents, sound_latents))
+
+        else:
+            for t in self.progress_bar(timesteps):
+                timestep = t.unsqueeze(0)
+                noise_pred = self.transformer(
+                    hidden_states=latents,
+                    timestep=timestep,
+                    text_ids=cond_ids,
+                    text_mask=cond_mask,
+                    action_latents=action_latents,
+                    sound_latents=sound_latents,
+                    **shared_kwargs,
+                )
+                _assign_step_out(_step(noise_pred, t, latents, action_latents, sound_latents))
+
+        outputs = [latents]
+        if action_latents is not None:
+            outputs.append(action_latents)
+        if sound_latents is not None:
+            outputs.append(sound_latents)
+        return outputs[0] if len(outputs) == 1 else tuple(outputs)
+
+    # -- Forward (main generation entry point) -------------------------------
+
+    def forward(
+        self,
+        req: OmniDiffusionRequest,
+    ) -> DiffusionOutput:
+        pipeline_start = time.time()
+
+        # --- Parse request ---
+        if len(req.prompts) > 1:
+            raise ValueError("Cosmos3OmniDiffusersPipeline currently supports a single prompt per request.")
+
+        prompt_data = req.prompts[0]
+        if isinstance(prompt_data, str):
+            prompt = prompt_data
+            negative_prompt = COSMOS3_DEFAULT_NEGATIVE_PROMPT
+            image_tensor = None
+            action_video_tensor = None
+        else:
+            prompt = prompt_data.get("prompt", "")
+            negative_prompt = prompt_data.get("negative_prompt", COSMOS3_DEFAULT_NEGATIVE_PROMPT)
+            additional_info = prompt_data.get("additional_information", {}) or {}
+            image_tensor = additional_info.get("preprocessed_image")
+            action_video_tensor = additional_info.get("preprocessed_video")
+
+        sp = req.sampling_params
+        is_t2i = self._is_t2i_request(req)
+        sound_enabled = self._is_sound_request(prompt_data, sp)
+        action_mode = self._get_action_mode(prompt_data, sp)
+        action_enabled = action_mode is not None
+        if action_enabled and is_t2i:
+            raise ValueError("Cosmos3 action generation is supported only for video outputs.")
+        if action_enabled and sound_enabled:
+            raise ValueError("Cosmos3 action+sound joint generation is not supported in this phase.")
+        if action_enabled and not getattr(self.transformer, "action_gen", False):
+            raise ValueError(
+                "Cosmos3 action generation was requested, but the transformer was "
+                "initialized without action modules. Check that the checkpoint config "
+                "enables action_gen or defines action_dim/max_action_dim and includes action weights."
+            )
+        if sound_enabled and is_t2i:
+            raise ValueError(
+                "Cosmos3 sound generation is supported only for video outputs in "
+                "this phase; text-to-image with sound is unsupported."
+            )
+        if sound_enabled and not getattr(self.transformer, "sound_gen", False):
+            raise ValueError(
+                "Cosmos3 sound generation was requested, but the transformer was "
+                "initialized without sound modules. Check that the checkpoint config "
+                "enables sound_gen or defines sound_dim and includes sound weights."
+            )
+
+        # T2I and T2V share the same model + forward path; only defaults
+        # differ:
+        #   T2I: 1024x1024, 50 steps, shift=3.0, guidance_interval=[400, 1000]
+        #   T2V: 720x1280,  35 steps, shift=engine-init, no interval
+        if is_t2i:
+            height = sp.height or 1024
+            width = sp.width or 1024
+            num_frames = 1
+            num_inference_steps = sp.num_inference_steps or 50
+            guidance_scale = sp.guidance_scale if sp.guidance_scale else 7.0
+            default_flow_shift = 3.0
+            default_guidance_interval: tuple[float, float] | None = (400.0, 1000.0)
+            batch_size = max(1, int(getattr(sp, "num_outputs_per_prompt", None) or 1))
+        else:
+            height = sp.height or 720
+            width = sp.width or 1280
+            num_frames = sp.num_frames or 81
+            num_inference_steps = sp.num_inference_steps or 35
+            guidance_scale = sp.guidance_scale if sp.guidance_scale else 4.0
+            # Fall back to the engine-init shift, NOT None: passing None
+            # to ``_set_flow_shift`` would leak a prior T2I rebuild
+            # (shift=3.0) into a subsequent video request.
+            default_flow_shift = self._engine_init_flow_shift
+            default_guidance_interval = None
+            batch_size = 1  # Existing video pipeline assumes B=1.
+
+        if action_enabled:
+            action_chunk_param = self._get_sp_param(sp, "action_chunk_size", None)
+            if action_chunk_param is not None:
+                action_chunk_size = int(action_chunk_param)
+                if sp.num_frames is None:
+                    num_frames = action_chunk_size + 1
+            elif sp.num_frames is None:
+                action_chunk_size = 16
+                num_frames = action_chunk_size + 1
+            else:
+                action_chunk_size = int(num_frames) - 1
+            if action_chunk_size <= 0:
+                raise ValueError(f"Cosmos3 action_chunk_size must be positive, got {action_chunk_size}.")
+            if num_frames not in (action_chunk_size, action_chunk_size + 1):
+                raise ValueError(
+                    "Cosmos3 action requests require num_frames to equal action_chunk_size "
+                    f"or action_chunk_size + 1; got num_frames={num_frames}, action_chunk_size={action_chunk_size}."
+                )
+            num_inference_steps = sp.num_inference_steps or 30
+            guidance_scale = sp.guidance_scale if sp.guidance_scale is not None else 1.0
+            default_flow_shift = 5.0
+
+        domain_id = None
+        if action_enabled:
+            domain_id = resolve_domain_id(
+                domain_id=self._get_sp_param(sp, "domain_id", None),
+                domain_name=self._get_sp_param(sp, "domain_name", None),
+                require_explicit=True,
+            )
+
+        # Runtime controls: prefer ``extra_args`` (OpenAI endpoints write
+        # there) over direct attrs.
+        flow_shift_target = float(self._get_sp_param(sp, "flow_shift", default_flow_shift))
+        guidance_interval = self._get_sp_param(sp, "guidance_interval", default_guidance_interval)
+
+        seed = sp.seed if sp.seed is not None else 42
+        frame_rate = self._get_sp_param(sp, "resolved_frame_rate") or self._get_sp_param(sp, "frame_rate") or 24.0
+        max_sequence_length = self._get_sp_param(sp, "max_sequence_length", 512) or 512
+        use_system_prompt = bool(self._get_sp_param(sp, "use_system_prompt", False))
+
+        if action_enabled and action_video_tensor is None:
+            extra_action_video = self._get_sp_param(sp, "action_video", None)
+            if isinstance(extra_action_video, torch.Tensor):
+                action_video_tensor = extra_action_video
+        if action_enabled and isinstance(action_video_tensor, torch.Tensor):
+            if action_video_tensor.ndim == 4:
+                action_video_tensor = action_video_tensor.unsqueeze(0)
+            if action_video_tensor.ndim != 5:
+                raise ValueError(
+                    "Cosmos3 extra_args['action_video'] must have shape [1, 3, T, H, W] "
+                    f"or [3, T, H, W], got {tuple(action_video_tensor.shape)}."
+                )
+            if sp.height is None:
+                height = int(action_video_tensor.shape[-2])
+            if sp.width is None:
+                width = int(action_video_tensor.shape[-1])
+
+        self._guidance_scale = guidance_scale
+        self._num_timesteps = num_inference_steps
+
+        # Always resolve to a concrete target shift for this request, then
+        # update the scheduler.  This is what guarantees mode-to-mode
+        # transitions restore the right schedule (no T2I to T2V leak).
+        self._set_flow_shift(flow_shift_target)
+
+        generator = torch.Generator(device=self.device).manual_seed(seed)
+
+        # --- Format prompts & tokenize (B=1; reused across loop iterations
+        # for T2I num_outputs_per_prompt > 1) ---
+        cond_ids, cond_mask, uncond_ids, uncond_mask = self._format_and_tokenize_prompts(
+            prompt,
+            negative_prompt,
+            num_frames,
+            frame_rate,
+            height,
+            width,
+            max_sequence_length,
+            sp,
+            use_system_prompt,
+            is_t2i=is_t2i,
+        )
+
+        # --- Prepare latents (T2I, T2V, or I2V) ---
+        # T2I shares _prepare_latents with T2V; the math collapses cleanly
+        # at num_frames=1 ((1-1)//4 + 1 = 1 latent frame).  For T2I with
+        # ``num_outputs_per_prompt > 1`` we loop the diffusion below;
+        # batching B=N together would require expanding text K/V (UND
+        # pathway is text-only and cached) and is left as a future
+        # optimization.
+        action_latents = None
+        action_velocity_mask = None
+        action_condition_latents = None
+        raw_action_dim = None
+        action_offset = 1
+        if action_enabled:
+            if action_video_tensor is not None and action_video_tensor.ndim == 4:
+                action_video_tensor = action_video_tensor.unsqueeze(0)
+            if action_video_tensor is not None and action_video_tensor.ndim != 5:
+                raise ValueError(
+                    "Cosmos3 action video tensor must have shape [1, 3, T, H, W] "
+                    f"or [3, T, H, W], got {tuple(action_video_tensor.shape)}."
+                )
+            if action_video_tensor is not None and action_video_tensor.shape[2] < num_frames:
+                pad = action_video_tensor[:, :, -1:].repeat(1, 1, num_frames - action_video_tensor.shape[2], 1, 1)
+                action_video_tensor = torch.cat([action_video_tensor, pad], dim=2)
+            elif action_video_tensor is not None and action_video_tensor.shape[2] > num_frames:
+                action_video_tensor = action_video_tensor[:, :, :num_frames]
+
+            if action_mode == ACTION_MODE_INVERSE_DYNAMICS and action_video_tensor is None:
+                raise ValueError("Cosmos3 inverse_dynamics action mode requires multi_modal_data['video'].")
+            if action_mode in {ACTION_MODE_POLICY, ACTION_MODE_FORWARD_DYNAMICS} and image_tensor is None:
+                if action_video_tensor is None:
+                    raise ValueError(
+                        f"Cosmos3 action_mode={action_mode!r} requires multi_modal_data['image'] "
+                        "or multi_modal_data['video']."
+                    )
+                image_tensor = action_video_tensor[:, :, 0]
+
+            raw_action_dim_param = self._get_sp_param(sp, "raw_action_dim", None)
+            raw_action_dim = int(raw_action_dim_param) if raw_action_dim_param is not None else None
+            action_prepared = self._prepare_action_latents(
+                mode=action_mode,
+                action_chunk_size=action_chunk_size,
+                raw_action_dim=raw_action_dim,
+                generator=generator,
+                sp=sp,
+            )
+            action_latents, action_velocity_mask, action_condition_latents, raw_action_dim = action_prepared
+            action_offset = action_start_frame_offset(action_mode, action_chunk_size, num_frames)
+
+        if action_enabled and action_video_tensor is not None:
+            latents, velocity_mask, condition_latents = self._prepare_latents_action_video(
+                action_video_tensor,
+                action_mode,
+                height,
+                width,
+                num_frames,
+                generator,
+            )
+            image_latent = condition_latents[:, :, 0:1]
+        elif image_tensor is not None and not is_t2i:
+            latents, velocity_mask, image_latent = self._prepare_latents_i2v(
+                image_tensor,
+                height,
+                width,
+                num_frames,
+                generator,
+            )
+            condition_latents = None
+        else:
+            latents = self._prepare_latents(height, width, num_frames, generator)
+            velocity_mask = None
+            image_latent = None
+            condition_latents = None
+
+        sound_latents = None
+        target_audio_samples = None
+        sound_sample_rate = None
+        if sound_enabled:
+            target_audio_samples, _, sound_sample_rate = self._resolve_sound_target_samples(sp, num_frames, frame_rate)
+            sound_latents, _ = self._prepare_sound_latents(target_audio_samples, generator)
+
+        T_latent = latents.shape[2]
+        H_latent = latents.shape[3]
+        W_latent = latents.shape[4]
+        video_shape = (T_latent, H_latent, W_latent)
+
+        # --- Denoising loop ---
+        shared_kwargs = dict(video_shape=video_shape, fps=frame_rate)
+        if velocity_mask is not None:
+            shared_kwargs["noisy_frame_mask"] = velocity_mask
+        if action_enabled:
+            shared_kwargs.update(
+                action_domain_ids=torch.tensor([domain_id], dtype=torch.long, device=self.device),
+                action_noisy_mask=action_velocity_mask,
+                action_start_frame_offset=action_offset,
+                action_fps=float(self._get_sp_param(sp, "action_fps", frame_rate) or frame_rate),
+            )
+
+        def _run_diffusion(start_latents):
+            self._set_scheduler_timesteps(num_inference_steps)
+            return self.diffuse(
+                latents=start_latents,
+                timesteps=self.scheduler.timesteps,
+                cond_ids=cond_ids,
+                cond_mask=cond_mask,
+                uncond_ids=uncond_ids,
+                uncond_mask=uncond_mask,
+                guidance_scale=guidance_scale,
+                shared_kwargs=shared_kwargs,
+                action_latents=action_latents,
+                action_velocity_mask=action_velocity_mask,
+                action_condition_latents=action_condition_latents,
+                sound_latents=sound_latents,
+                velocity_mask=velocity_mask,
+                image_latent=image_latent,
+                condition_latents=condition_latents,
+                guidance_interval=guidance_interval,
+            )
+
+        if is_t2i and batch_size > 1:
+            # Generate N independent images by re-running the full diffusion
+            # loop with different noise seeds.  The first sample reuses
+            # ``latents`` already drawn from ``generator``; subsequent
+            # samples draw fresh noise from the same generator (state
+            # advances per call), giving distinct outputs from a single
+            # user-provided seed.  Batched B=N would be more efficient but
+            # requires expanding cached UND text K/V to match.
+            samples = [_run_diffusion(latents)]
+            for _ in range(batch_size - 1):
+                next_latents = self._prepare_latents(height, width, num_frames, generator)
+                samples.append(_run_diffusion(next_latents))
+            latents = torch.cat(samples, dim=0)
+        else:
+            diffusion_output = _run_diffusion(latents)
+            if action_enabled and sound_enabled:
+                latents, action_latents, sound_latents = diffusion_output
+            elif action_enabled:
+                latents, action_latents = diffusion_output
+            elif sound_enabled:
+                latents, sound_latents = diffusion_output
+            else:
+                latents = diffusion_output
+
+        # --- Decode ---
+        if _is_rank_zero():
+            logger.info("Decoding video...")
+        decode_start = time.time()
+        video = self._decode_latents(latents)
+        if _is_rank_zero():
+            logger.info("Video decoded in %.2fs", time.time() - decode_start)
+            logger.info("Total pipeline time: %.2fs", time.time() - pipeline_start)
+
+        if sound_enabled:
+            if sound_latents is None or target_audio_samples is None or sound_sample_rate is None:
+                raise ValueError("Cosmos3 sound generation finished without sound latents.")
+            if _is_rank_zero():
+                logger.info("Decoding sound...")
+            audio = self._decode_sound_latents(sound_latents, target_audio_samples)
+            return DiffusionOutput(output={"video": video, "audio": audio, "audio_sample_rate": sound_sample_rate})
+
+        if action_enabled:
+            if action_latents is None or raw_action_dim is None or domain_id is None:
+                raise ValueError("Cosmos3 action generation finished without action latents.")
+            action = action_latents[:, :, :raw_action_dim].detach().cpu()
+            return DiffusionOutput(
+                output={"video": video},
+                custom_output={
+                    "action": action,
+                    "raw_action_dim": raw_action_dim,
+                    "action_mode": action_mode,
+                    "domain_id": domain_id,
+                },
+            )
+
+        return DiffusionOutput(output={"image": video} if is_t2i else {"video": video})
diff --git a/vllm_omni/diffusion/models/cosmos3/sound_tokenizer.py b/vllm_omni/diffusion/models/cosmos3/sound_tokenizer.py
new file mode 100644
index 00000000000..863561bac53
--- /dev/null
+++ b/vllm_omni/diffusion/models/cosmos3/sound_tokenizer.py
@@ -0,0 +1,232 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Cosmos3 sound tokenizer integration."""
+
+from __future__ import annotations
+
+import os
+from pathlib import Path
+from typing import Any
+
+import torch
+from vllm.logger import init_logger
+
+from vllm_omni.diffusion.data import OmniDiffusionConfig
+from vllm_omni.diffusion.distributed.utils import get_local_device
+from vllm_omni.diffusion.models.progress_bar import _is_rank_zero
+
+from .audio_tokenizer import Cosmos3AVAEAudioTokenizer
+
+logger = init_logger(__name__)
+
+DEFAULT_SOUND_SAMPLE_RATE = 48000  # fallback used by get_sound_sample_rate()
+DEFAULT_SOUND_CHANNELS = 2  # fallback used by get_sound_channels()
+DEFAULT_SOUND_DIM = 64  # fallback used by get_sound_dim()
+DEFAULT_SOUND_HOP_SIZE = 1920  # fallback used by get_sound_hop_size()
+DEFAULT_SOUND_LATENT_FPS = DEFAULT_SOUND_SAMPLE_RATE / DEFAULT_SOUND_HOP_SIZE  # 48000 / 1920 == 25.0
+SOUND_TOKENIZER_COMPONENT_NAME = "sound_tokenizer"  # sub-directory of the model root that from_config() searches
+SOUND_TOKENIZER_CHECKPOINT_NAME = "model.safetensors"  # checkpoint file expected inside that sub-directory
+
+
+def _pipeline_args(od_config: OmniDiffusionConfig) -> dict[str, Any]:  # custom_pipeline_args as a plain dict
+    return dict(getattr(od_config, "custom_pipeline_args", None) or {})  # shallow copy; {} when the attribute is missing or falsy
+
+
+def _resolve_model_file(path: Any, model_root: str | None) -> str | None:  # resolve a possibly-relative file reference against model_root
+    if not path:  # None / "" -> nothing to resolve
+        return None
+    path = str(path)
+    if "://" in path or os.path.isabs(path) or os.path.exists(path) or not model_root:  # URL-like, absolute, already exists relative to the CWD, or no root to join against
+        return path
+    return str(Path(model_root) / path)  # otherwise interpret the path relative to model_root
+
+
+def get_sound_config_value(  # look up a sound option across the config sources, returning the first non-None value
+    od_config: OmniDiffusionConfig,
+    name: str,
+    default: Any,
+    aliases: tuple[str, ...] = (),
+) -> Any:
+    keys = (name, *aliases)  # the primary name is tried before any alias
+    for config in (  # sources in priority order; within one source every key is tried before moving on
+        _pipeline_args(od_config),
+        getattr(od_config, "model_config", None),
+        getattr(od_config, "tf_model_config", None),
+    ):
+        if config is None:  # source not present on this od_config
+            continue
+        for key in keys:
+            if hasattr(config, "get"):  # mapping-like source
+                value = config.get(key, None)
+            else:  # attribute-style source
+                value = getattr(config, key, None)
+            if value is not None:  # falsy values such as 0 or "" still count as a hit
+                return value
+    return default  # nothing found in any source
+
+
+def get_sound_sample_rate(od_config: OmniDiffusionConfig) -> int:  # configured sample rate, coerced to int
+    return int(
+        get_sound_config_value(
+            od_config,
+            "sound_sample_rate",
+            DEFAULT_SOUND_SAMPLE_RATE,
+            ("sample_rate",),  # alias tried after "sound_sample_rate" within each source
+        )
+    )
+
+
+def get_sound_channels(od_config: OmniDiffusionConfig) -> int:  # configured audio channel count, coerced to int
+    return int(
+        get_sound_config_value(
+            od_config,
+            "sound_audio_channels",
+            DEFAULT_SOUND_CHANNELS,
+            ("audio_channels",),  # alias tried after "sound_audio_channels" within each source
+        )
+    )
+
+
+def get_sound_dim(od_config: OmniDiffusionConfig | None) -> int:  # configured sound latent dimension, coerced to int
+    if od_config is None:  # unlike the sample-rate/channel/hop getters, a missing config is accepted here
+        return DEFAULT_SOUND_DIM
+    return int(
+        get_sound_config_value(
+            od_config,
+            "sound_dim",
+            DEFAULT_SOUND_DIM,
+            ("io_channels", "latent_ch"),  # aliases tried in this order after "sound_dim"
+        )
+    )
+
+
+def get_sound_hop_size(od_config: OmniDiffusionConfig) -> int:  # configured hop size, coerced to int
+    return int(
+        get_sound_config_value(
+            od_config,
+            "sound_hop_size",
+            DEFAULT_SOUND_HOP_SIZE,
+            ("hop_size",),  # alias tried after "sound_hop_size" within each source
+        )
+    )
+
+
+def get_sound_latent_fps(od_config: OmniDiffusionConfig | None) -> float:  # sound latent frame rate as a float
+    if od_config is None:  # no config -> module default (sample rate / hop size of the defaults)
+        return DEFAULT_SOUND_LATENT_FPS
+    sample_rate = get_sound_sample_rate(od_config)
+    hop_size = get_sound_hop_size(od_config)
+    return float(get_sound_config_value(od_config, "sound_latent_fps", sample_rate / hop_size))  # an explicit "sound_latent_fps" wins; the sample_rate / hop_size default is evaluated either way
+
+
+class Cosmos3SoundTokenizer:
+    """Thin adapter around the local AVAE tokenizer implementation."""
+
+    def __init__(self, tokenizer: Any) -> None:
+        self.tokenizer = tokenizer
+        self.sample_rate = int(getattr(tokenizer, "sample_rate", DEFAULT_SOUND_SAMPLE_RATE))  # metadata is read from the wrapped tokenizer, with module defaults as fallback
+        self.audio_channels = int(getattr(tokenizer, "audio_channels", DEFAULT_SOUND_CHANNELS))
+        self.latent_ch = int(getattr(tokenizer, "latent_ch", DEFAULT_SOUND_DIM))
+        self.hop_size = int(getattr(tokenizer, "temporal_compression_factor", DEFAULT_SOUND_HOP_SIZE))  # hop size comes from the tokenizer's temporal_compression_factor attribute
+
+    @classmethod
+    def from_config(cls, od_config: OmniDiffusionConfig) -> Cosmos3SoundTokenizer:  # locate checkpoint/config, build the AVAE tokenizer, wrap it; raises ValueError if no checkpoint is found
+        args = _pipeline_args(od_config)
+        model_path = getattr(od_config, "model", None)
+        explicit_avae_path = (  # first truthy of the three custom_pipeline_args keys, then the environment variable
+            args.get("sound_tokenizer_path")
+            or args.get("avae_path")
+            or args.get("cosmos3_avae_path")
+            or os.environ.get("COSMOS3_SOUND_TOKENIZER_PATH")
+        )
+        explicit_config_path = args.get("sound_tokenizer_config_path") or os.environ.get(
+            "COSMOS3_SOUND_TOKENIZER_CONFIG_PATH"
+        )
+
+        model_root = str(model_path) if model_path and os.path.isdir(model_path) else None  # only an existing local directory counts as a model root
+        if model_root is None and model_path and not explicit_avae_path:  # not a local directory and no explicit checkpoint: use model_path as a repo id
+            from huggingface_hub import snapshot_download
+
+            model_root = snapshot_download(
+                repo_id=str(model_path),
+                revision=getattr(od_config, "revision", None),
+                allow_patterns=[  # restrict the download to the tokenizer config and checkpoint
+                    f"{SOUND_TOKENIZER_COMPONENT_NAME}/config.json",
+                    f"{SOUND_TOKENIZER_COMPONENT_NAME}/{SOUND_TOKENIZER_CHECKPOINT_NAME}",
+                ],
+            )
+
+        if explicit_avae_path:  # an explicit path is used as given (possibly joined to model_root); its existence is not checked here
+            avae_path = _resolve_model_file(explicit_avae_path, model_root)
+        else:  # otherwise require <model_root>/sound_tokenizer/model.safetensors to exist
+            tokenizer_dir = Path(model_root) / SOUND_TOKENIZER_COMPONENT_NAME if model_root else None
+            candidate = tokenizer_dir / SOUND_TOKENIZER_CHECKPOINT_NAME if tokenizer_dir else None
+            avae_path = str(candidate) if candidate and candidate.exists() else None
+
+        if not avae_path:
+            raise ValueError(
+                "Cosmos3 sound generation was requested, but no AVAE sound "
+                "tokenizer checkpoint was provided. Set "
+                "custom_pipeline_args['sound_tokenizer_path'] or "
+                "COSMOS3_SOUND_TOKENIZER_PATH, or include "
+                "sound_tokenizer/model.safetensors under the model path."
+            )
+
+        sample_rate = get_sound_sample_rate(od_config)  # these four come from the layered config lookup, not from the checkpoint
+        audio_channels = get_sound_channels(od_config)
+        sound_dim = get_sound_dim(od_config)
+        hop_size = get_sound_hop_size(od_config)
+
+        config_path = _resolve_model_file(explicit_config_path, model_root)  # None when no explicit config path was given
+        if config_path is None and model_root:  # fall back to <model_root>/sound_tokenizer/config.json when that file exists
+            candidate = Path(model_root) / SOUND_TOKENIZER_COMPONENT_NAME / "config.json"
+            config_path = str(candidate) if candidate.exists() else None
+        tokenizer = Cosmos3AVAEAudioTokenizer(
+            checkpoint_path=str(avae_path),
+            config_path=config_path,
+            sample_rate=sample_rate,
+            audio_channels=audio_channels,
+            io_channels=sound_dim,
+            hop_size=hop_size,
+            normalize_latents=bool(args.get("sound_normalize_latents", True)),  # NOTE(review): bool() of any non-empty string, including "false", is True
+            normalization_type=args.get("sound_normalization_type", "none"),
+            tanh_input_scale=float(args.get("sound_tanh_input_scale", 1.5)),
+            tanh_output_scale=float(args.get("sound_tanh_output_scale", 3.5)),
+            tanh_clamp=float(args.get("sound_tanh_clamp", 0.995)),
+            dtype=getattr(od_config, "dtype", torch.bfloat16),
+            device=get_local_device(),
+        )
+        if _is_rank_zero():  # log only when _is_rank_zero() is true
+            logger.info(
+                "Loaded Cosmos3 AVAE sound tokenizer from %s (sr=%d, channels=%d, latent_ch=%d, hop=%d)",
+                avae_path,
+                sample_rate,
+                audio_channels,
+                sound_dim,
+                hop_size,
+            )
+        return cls(tokenizer)
+
+    def get_latent_num_samples(self, num_audio_samples: int) -> int:  # delegates to the wrapped tokenizer; result coerced to int
+        return int(self.tokenizer.get_latent_num_samples(num_audio_samples))
+
+    def get_audio_num_samples(self, num_latent_samples: int) -> int:  # delegates to the wrapped tokenizer; result coerced to int
+        return int(self.tokenizer.get_audio_num_samples(num_latent_samples))
+
+    @torch.no_grad()
+    def decode(self, latents: torch.Tensor) -> torch.Tensor:
+        """Decode sound latents.
+
+        Args:
+            latents: ``[B, C, T]`` or ``[C, T]`` tensor.
+
+        Returns:
+            ``[B, audio_channels, N]`` tensor for batched input, or
+            ``[audio_channels, N]`` for unbatched input.
+        """
+        squeeze = latents.ndim == 2  # rank-2 input is treated as unbatched
+        if squeeze:
+            latents = latents.unsqueeze(0)  # add a batch axis for the tokenizer call
+        audio = self.tokenizer.decode(latents)
+        audio = audio.clamp(-1.0, 1.0)  # limit decoded values to [-1, 1]
+        return audio.squeeze(0) if squeeze else audio  # drop the batch axis again only if it was added above
diff --git a/vllm_omni/diffusion/models/cosmos3/transformer_cosmos3.py b/vllm_omni/diffusion/models/cosmos3/transformer_cosmos3.py
new file mode 100644
index 00000000000..22ff22caeaf
--- /dev/null
+++ b/vllm_omni/diffusion/models/cosmos3/transformer_cosmos3.py
@@ -0,0 +1,1586 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Cosmos3 VFM Transformer for vllm-omni.
+
+Implements the Mixture-of-Transformers architecture with two pathways:
+- Understanding (UND): causal self-attention on text tokens (Qwen3-VL backbone)
+- Generation (GEN): cross-attention where visual Q attends to [K_und, K_gen]
+
+Ported from the TRT-LLM integration (tekit branch user/shreyasm/cosmos3).
+"""
+
+from __future__ import annotations
+
+import math
+from typing import Any
+
+import torch
+import torch.distributed as dist
+import torch.nn as nn
+import torch.nn.functional as F
+from vllm.distributed import get_tensor_model_parallel_world_size
+from vllm.logger import init_logger
+from vllm.model_executor.layers.linear import (
+    ColumnParallelLinear,
+    RowParallelLinear,
+)
+from vllm.model_executor.layers.quantization.base_config import (
+    QuantizationConfig,
+)
+
+from vllm_omni.diffusion.attention.backends.abstract import AttentionMetadata
+from vllm_omni.diffusion.attention.layer import Attention as FrameworkAttention
+from vllm_omni.diffusion.distributed.sp_plan import SequenceParallelInput, SequenceParallelOutput
+
+logger = init_logger(__name__)
+
+
+def _get_ulysses_state() -> tuple[int, int, dist.ProcessGroup | None]:
+    """Return (ulysses_size, ulysses_rank, ulysses_pg) from vllm-omni parallel state.
+
+    Returns (1, 0, None) when the Ulysses world size is <= 1 or when any lookup raises.
+    """
+    try:
+        from vllm_omni.diffusion.distributed.parallel_state import (  # imported inside the try, so an import failure also hits the fallback
+            get_sp_group,
+            get_ulysses_parallel_rank,
+            get_ulysses_parallel_world_size,
+        )
+
+        size = get_ulysses_parallel_world_size()
+        if size <= 1:  # a single-rank group is reported the same way as "no SP"
+            return 1, 0, None
+        return size, get_ulysses_parallel_rank(), get_sp_group().ulysses_group
+    except Exception:  # best-effort: every error raised above is mapped to the inactive state
+        return 1, 0, None
+
+
+def _is_sp_active() -> bool:
+    """Check whether sequence parallelism is active in the current forward context.
+
+    Follows the Bagel pattern: read ``forward_context.sp_active`` which returns
+    True when ``sequence_parallel_size > 1`` even without ``_sp_plan`` hooks.
+    """
+    try:
+        from vllm_omni.diffusion.forward_context import (  # imported inside the try, so an import failure yields False
+            get_forward_context,
+            is_forward_context_available,
+        )
+
+        if not is_forward_context_available():  # no forward context -> report inactive
+            return False
+        return get_forward_context().sp_active
+    except Exception:  # best-effort: any error raised above is reported as "not active"
+        return False
+
+
+def _tf_config_get(config: Any, key: str, default: Any) -> Any:
+    """Read a value from TransformerConfig, dict, or simple namespace."""
+    if config is None:  # no config at all -> default
+        return default
+    if hasattr(config, "get"):  # anything exposing .get() is queried like a mapping
+        return config.get(key, default)
+    return getattr(config, key, default)  # otherwise fall back to attribute access
+
+
+def _nested_get(value: Any, key: str) -> Any:  # depth-first search for key through nested dicts, lists and tuples; None when not found
+    if isinstance(value, dict):
+        if key in value:  # a direct hit is returned as-is, even when the stored value is None
+            return value[key]
+        for child in value.values():  # otherwise recurse into the values in iteration order
+            found = _nested_get(child, key)
+            if found is not None:  # a nested None is indistinguishable from "not found"
+                return found
+    elif isinstance(value, list | tuple):  # sequences are searched element by element
+        for child in value:
+            found = _nested_get(child, key)
+            if found is not None:
+                return found
+    return None  # scalars, other types, and exhausted containers
+
+
+def _od_config_get(od_config: Any, key: str, default: Any = None) -> Any:
+    """Read Cosmos3 options from runtime, model, or transformer config."""
+    if od_config is None:
+        return default
+    for attr in ("custom_pipeline_args", "model_config"):  # checked in this order, before tf_model_config
+        source = getattr(od_config, attr, None) or {}
+        if isinstance(source, dict):  # non-dict sources are skipped in this loop
+            if key in source:  # a top-level hit is returned as-is, even if the value is None
+                return source[key]
+            found = _nested_get(source, key)  # otherwise search nested containers
+            if found is not None:
+                return found
+    tf_model_config = getattr(od_config, "tf_model_config", None)
+    if isinstance(tf_model_config, dict):  # dict form gets the same direct-then-nested lookup
+        if key in tf_model_config:
+            return tf_model_config[key]
+        found = _nested_get(tf_model_config, key)
+        if found is not None:
+            return found
+    value = _tf_config_get(tf_model_config, key, None)  # handles a non-dict tf_model_config via .get() or attribute access
+    return default if value is None else value  # on this path a None value is replaced by default
+
+
+def _as_bool(value: Any) -> bool:  # coerce a config value to bool, parsing strings explicitly
+    if isinstance(value, str):  # strings are matched case-insensitively after stripping whitespace
+        return value.strip().lower() in {"1", "true", "yes", "on"}  # every other string, including "", is False
+    return bool(value)  # non-strings use ordinary truthiness
+
+
+# ---------------------------------------------------------------------------
+# RMSNorm
+# ---------------------------------------------------------------------------
+class Qwen3VLTextRMSNorm(nn.Module):
+    """RMSNorm compatible with Qwen3-VL / T5LayerNorm."""
+
+    def __init__(
+        self,
+        hidden_size: int,
+        eps: float = 1e-6,
+        dtype: torch.dtype = torch.bfloat16,
+    ) -> None:
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size, dtype=dtype))  # per-feature scale, initialised to ones
+        self.variance_epsilon = eps
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        input_dtype = hidden_states.dtype
+        hidden_states = hidden_states.to(torch.float32)  # statistics are computed in float32
+        variance = hidden_states.pow(2).mean(-1, keepdim=True)  # mean of squares over the last axis (no mean subtraction)
+        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+        return self.weight * hidden_states.to(input_dtype)  # cast back to the input dtype before applying the scale
+
+
+class DomainAwareLinear(nn.Module):
+    """Linear projection with one weight/bias pair per action embodiment domain."""
+
+    def __init__(
+        self,
+        input_size: int,
+        output_size: int,
+        num_domains: int,
+        *,
+        dtype: torch.dtype = torch.bfloat16,
+    ) -> None:
+        super().__init__()
+        self.input_size = int(input_size)
+        self.output_size = int(output_size)
+        self.num_domains = int(num_domains)
+        self.fc = nn.Embedding(self.num_domains, self.output_size * self.input_size, dtype=dtype)  # one flattened weight matrix per domain
+        self.bias = nn.Embedding(self.num_domains, self.output_size, dtype=dtype)  # one bias vector per domain
+        nn.init.xavier_uniform_(self.fc.weight)
+        nn.init.zeros_(self.bias.weight)
+
+    def forward(self, x: torch.Tensor, domain_id: torch.Tensor) -> torch.Tensor:  # x: [B, input_size] or [B, N, input_size]; one domain id per batch row
+        if domain_id.ndim == 0:  # promote a scalar id to a length-1 batch
+            domain_id = domain_id.unsqueeze(0)
+        domain_id = domain_id.to(device=x.device, dtype=torch.long).reshape(-1)
+        if x.shape[0] != domain_id.shape[0]:  # ids are not broadcast across the batch
+            raise ValueError(
+                "Cosmos3 action domain_id batch size must match action tokens: "
+                f"tokens={x.shape[0]}, domain_id={domain_id.shape[0]}."
+            )
+        if torch.any((domain_id < 0) | (domain_id >= self.num_domains)):  # explicit range check before the embedding lookups
+            raise ValueError(f"Cosmos3 action domain_id must be in [0, {self.num_domains}), got {domain_id.tolist()}.")
+
+        weight = self.fc(domain_id).view(domain_id.shape[0], self.input_size, self.output_size)  # per-row [input_size, output_size] matrix
+        bias = self.bias(domain_id).view(domain_id.shape[0], self.output_size)
+        if x.ndim == 2:  # [B, input_size] -> [B, output_size]
+            return torch.bmm(x.unsqueeze(1), weight).squeeze(1) + bias
+        if x.ndim == 3:  # [B, N, input_size] -> [B, N, output_size]
+            return torch.bmm(x, weight) + bias.unsqueeze(1)
+        raise ValueError(f"Cosmos3 DomainAwareLinear expected rank-2 or rank-3 input, got {tuple(x.shape)}.")
+
+
+# ---------------------------------------------------------------------------
+# Rotary Position Embeddings (mRoPE)
+# ---------------------------------------------------------------------------
+def compute_mrope_position_ids_text(
+    num_tokens: int,
+    temporal_offset: int,
+) -> tuple[torch.Tensor, int]:
+    """Generate 3D mRoPE position IDs for text tokens.
+
+    Text tokens: all three axes (t, h, w) share the same monotonically
+    increasing position IDs.
+    """
+    ids = torch.arange(num_tokens, dtype=torch.long) + temporal_offset  # temporal_offset, ..., temporal_offset + num_tokens - 1
+    mrope_ids = ids.unsqueeze(0).expand(3, -1).contiguous()  # [3, num_tokens]; the three rows are identical
+    return mrope_ids, temporal_offset + num_tokens  # second item is the first offset after this segment
+
+
+def compute_mrope_position_ids_vision(
+    grid_t: int,
+    grid_h: int,
+    grid_w: int,
+    temporal_offset: int | float,
+    fps: float | None = None,
+    base_fps: float = 24.0,
+    temporal_compression_factor: int = 4,
+    base_temporal_compression_factor: int | None = None,
+    enable_fps_modulation: bool = True,
+    start_frame_offset: int = 0,
+) -> tuple[torch.Tensor, int | float]:
+    """Generate 3D mRoPE position IDs for vision tokens.
+
+    Creates a (t, h, w) position grid with spatial indices reset per segment
+    (Qwen3VL-style). Flattened in t-major order.
+    """
+    fps_modulation = enable_fps_modulation and fps is not None  # fps=None falls back to plain integer frame indices
+
+    if fps_modulation:
+        tps = fps / temporal_compression_factor  # presumably latent frames per second (name-based; TODO confirm)
+        effective_base_tcf = (  # base compression factor defaults to temporal_compression_factor
+            base_temporal_compression_factor
+            if base_temporal_compression_factor is not None
+            else temporal_compression_factor
+        )
+        base_tps = base_fps / effective_base_tcf
+        frame_indices = torch.arange(grid_t, dtype=torch.float32)
+        t_index = (  # float index: (frame + start_frame_offset) * base_tps / tps + temporal_offset
+            ((frame_indices + start_frame_offset) / tps * base_tps + temporal_offset)
+            .view(-1, 1)
+            .expand(-1, grid_h * grid_w)
+            .flatten()
+        )
+    else:
+        t_index = (  # integer index: frame + int(temporal_offset) + start_frame_offset
+            torch.arange(grid_t, dtype=torch.long).view(-1, 1).expand(-1, grid_h * grid_w).flatten()
+            + int(temporal_offset)
+            + start_frame_offset
+        )
+
+    h_index = torch.arange(grid_h, dtype=torch.long).view(1, -1, 1).expand(grid_t, -1, grid_w).flatten()  # row index 0..grid_h-1; no offset applied
+    w_index = torch.arange(grid_w, dtype=torch.long).view(1, 1, -1).expand(grid_t, grid_h, -1).flatten()  # column index 0..grid_w-1; no offset applied
+
+    if fps_modulation:
+        mrope_ids = torch.stack([t_index, h_index.to(torch.float32), w_index.to(torch.float32)], dim=0)  # float32 [3, grid_t * grid_h * grid_w]
+    else:
+        mrope_ids = torch.stack([t_index, h_index, w_index], dim=0)  # int64 [3, grid_t * grid_h * grid_w]
+
+    next_offset = math.floor(mrope_ids.max().item()) + 1  # max is over all three rows (t, h and w), not just t; always a Python int
+    return mrope_ids, next_offset
+
+
+def compute_mrope_position_ids_sound(
+    grid_t: int,
+    temporal_offset: int | float,
+    sound_latent_fps: float,
+    base_fps: float = 24.0,
+    base_temporal_compression_factor: int = 4,
+    enable_fps_modulation: bool = True,
+) -> tuple[torch.Tensor, int | float]:
+    """Generate mRoPE IDs for sound tokens as a (T, 1, 1) grid."""
+    return compute_mrope_position_ids_vision(  # thin wrapper: a vision grid with a single spatial position
+        grid_t=grid_t,
+        grid_h=1,
+        grid_w=1,
+        temporal_offset=temporal_offset,
+        fps=sound_latent_fps,
+        base_fps=base_fps,
+        temporal_compression_factor=1,  # factor 1: sound_latent_fps is used undivided as the token rate
+        base_temporal_compression_factor=base_temporal_compression_factor,
+        enable_fps_modulation=enable_fps_modulation,
+    )  # start_frame_offset is left at the helper's default of 0
+
+
+def compute_mrope_position_ids_action(
+    grid_t: int,
+    temporal_offset: int | float,
+    action_fps: float | None,
+    base_fps: float = 24.0,
+    base_temporal_compression_factor: int = 4,
+    enable_fps_modulation: bool = True,
+    start_frame_offset: int = 1,
+) -> tuple[torch.Tensor, int | float]:
+    """Generate mRoPE IDs for action tokens as a frame-rate (T, 1, 1) grid."""
+    return compute_mrope_position_ids_vision(  # thin wrapper: a vision grid with a single spatial position
+        grid_t=grid_t,
+        grid_h=1,
+        grid_w=1,
+        temporal_offset=temporal_offset,
+        fps=action_fps,  # None makes the helper use integer frame indices instead of fps modulation
+        base_fps=base_fps,
+        temporal_compression_factor=1,  # factor 1: action_fps is used undivided as the token rate
+        base_temporal_compression_factor=base_temporal_compression_factor,
+        enable_fps_modulation=enable_fps_modulation,
+        start_frame_offset=start_frame_offset,  # defaults to 1 here, versus 0 in the vision helper
+    )
+
+
+class Qwen3VLTextRotaryEmbedding(nn.Module):
+    """Multi-dimensional rotary position embedding for Qwen3-VL."""
+
+    def __init__(
+        self,
+        *,
+        head_dim: int,
+        rope_theta: float,
+        mrope_section: list[int],
+    ) -> None:
+        super().__init__()
+        self.head_dim = head_dim
+        self.rope_theta = rope_theta
+        self.mrope_section = mrope_section
+        inv_freq = 1.0 / (  # head_dim // 2 inverse frequencies: rope_theta ** -(2i / head_dim)
+            rope_theta ** (torch.arange(0, head_dim, 2, dtype=torch.int64).to(dtype=torch.float) / head_dim)
+        )
+        self.attention_scaling = 1.0  # fixed at 1.0, so the scaling applied in forward() is a no-op
+        self.register_buffer("inv_freq", inv_freq, persistent=False)  # non-persistent: not written to the state dict
+
+    def apply_interleaved_mrope(self, freqs: torch.Tensor, mrope_section: list[int]) -> torch.Tensor:
+        """Reorganize from chunked [TTT...HHH...WWW] to interleaved [THWTHW...TT]."""
+        freqs_t = freqs[0]  # start from the temporal row; NOTE: this is a view, so the writes below modify `freqs` in place
+        for dim, offset in enumerate((1, 2), start=1):  # (dim=1, offset=1) for H, then (dim=2, offset=2) for W
+            length = mrope_section[dim] * 3
+            idx = slice(offset, length, 3)  # every third slot starting at offset, up to 3 * section size
+            freqs_t[..., idx] = freqs[dim, ..., idx]  # overwrite those slots with the H / W frequencies
+        return freqs_t  # slots not overwritten keep the temporal frequencies
+
+    @torch.no_grad()
+    def forward(self, x: torch.Tensor, position_ids: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:  # x only supplies device and dtype
+        if position_ids.ndim == 2:  # rank-2 ids are shared by all three axes: add a leading axis of size 3
+            position_ids = position_ids[None, ...].expand(3, position_ids.shape[0], -1)
+        inv_freq_expanded = (  # [3, position_ids.shape[1], head_dim // 2, 1]
+            self.inv_freq[None, None, :, None].float().expand(3, position_ids.shape[1], -1, 1).to(x.device)
+        )
+        position_ids_expanded = position_ids[:, :, None, :].float()  # singleton axis inserted so the matmul forms an outer product
+
+        freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(2, 3)  # frequency axis moved last
+        freqs = self.apply_interleaved_mrope(freqs, self.mrope_section)  # collapses the leading axis of size 3
+        emb = torch.cat((freqs, freqs), dim=-1)  # duplicated to cover the full head_dim
+        cos = emb.cos() * self.attention_scaling
+        sin = emb.sin() * self.attention_scaling
+        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)  # angles are computed in float32, then cast to x.dtype
+
+
+# ---------------------------------------------------------------------------
+# RoPE application (Qwen3/Llama style)
+# ---------------------------------------------------------------------------
+def _rotate_half(x: torch.Tensor) -> torch.Tensor:  # map the last axis [x1, x2] -> [-x2, x1]
+    x1 = x[..., : x.shape[-1] // 2]  # first half of the last axis
+    x2 = x[..., x.shape[-1] // 2 :]  # second half of the last axis
+    return torch.cat((-x2, x1), dim=-1)
+
+
+def _apply_rotary_pos_emb(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    cos: torch.Tensor,
+    sin: torch.Tensor,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    """Qwen3-style RoPE: (x * cos) + (rotate_half(x) * sin).
+
+    Args:
+        q: [B, S, h, D]
+        k: [B, S, H_kv, D]
+        cos: [1, S, 1, D] or broadcastable
+        sin: [1, S, 1, D] or broadcastable
+    """
+    q_embed = (q * cos) + (_rotate_half(q) * sin)  # the same cos/sin tensors are applied to q and k
+    k_embed = (k * cos) + (_rotate_half(k) * sin)
+    return q_embed, k_embed  # new tensors; q and k are not modified in place
+
+
+# ---------------------------------------------------------------------------
+# Timestep Embedder
+# ---------------------------------------------------------------------------
+class TimestepEmbedder(nn.Module):
+    """Embeds scalar timesteps into vector representations."""
+
+    def __init__(
+        self,
+        hidden_size: int,
+        frequency_embedding_size: int = 256,
+        max_period: int = 10000,
+        target_dtype: torch.dtype = torch.bfloat16,
+    ) -> None:
+        super().__init__()
+        self.mlp = nn.Sequential(  # frequency features -> hidden_size -> hidden_size
+            nn.Linear(frequency_embedding_size, hidden_size, bias=True),
+            nn.SiLU(),
+            nn.Linear(hidden_size, hidden_size, bias=True),
+        )
+        self.frequency_embedding_size = frequency_embedding_size
+        self.hidden_size = hidden_size
+
+        half = frequency_embedding_size // 2  # NOTE(review): an odd size gives 2 * half features, one short of the first Linear's input
+        freqs = torch.exp(-math.log(max_period) * torch.arange(start=0, end=half, dtype=target_dtype) / half)  # NOTE(review): table is computed in target_dtype (bfloat16 by default), not float32 — confirm this is intended
+        self.register_buffer("freqs", freqs, persistent=False)  # non-persistent: not written to the state dict
+
+    def forward(self, t: torch.Tensor) -> torch.Tensor:  # t is indexed as a 1-D tensor of timesteps
+        args = t[:, None] * self.freqs[None]  # outer product: [len(t), half]
+        t_freq = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)  # cosine half first, then sine half
+        return self.mlp(t_freq)
+
+
+# ---------------------------------------------------------------------------
+# GatedMLP (replaces TRT-LLM GatedMLP)
+# ---------------------------------------------------------------------------
+class Cosmos3GatedMLP(nn.Module):  # gated MLP: down_proj(silu(gate_proj(x)) * up_proj(x)), built from vLLM parallel linear layers
+    def __init__(
+        self,
+        hidden_size: int = 4096,
+        intermediate_size: int = 12288,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.gate_proj = ColumnParallelLinear(  # hidden_size -> intermediate_size, no bias
+            hidden_size,
+            intermediate_size,
+            bias=False,
+            gather_output=False,
+            return_bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.gate_proj",
+        )
+        self.up_proj = ColumnParallelLinear(  # hidden_size -> intermediate_size, no bias; same settings as gate_proj
+            hidden_size,
+            intermediate_size,
+            bias=False,
+            gather_output=False,
+            return_bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.up_proj",
+        )
+        self.down_proj = RowParallelLinear(  # intermediate_size -> hidden_size, no bias
+            intermediate_size,
+            hidden_size,
+            bias=False,
+            input_is_parallel=True,
+            return_bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.down_proj",
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return self.down_proj(F.silu(self.gate_proj(x)) * self.up_proj(x))  # SiLU-gated product, then the down projection
+
+
+# ---------------------------------------------------------------------------
+# Attention Modules
+# ---------------------------------------------------------------------------
+class Cosmos3CausalAttention(nn.Module):
+    """Causal self-attention over text tokens for the understanding pathway.
+
+    ``forward`` returns ``(output, K, V)``. K is returned after the per-head
+    RMS norm and RoPE, V straight from ``v_proj``; both keep the
+    ``[B, S, H_kv_local, D]`` layout for the generation pathway's attention.
+
+    Args:
+        hidden_size: Model width fed to the projections.
+        num_attention_heads: Total query heads, divided by the TP world size.
+        num_key_value_heads: Total key/value heads, divided by the TP world size.
+        head_dim: Per-head feature size.
+        rms_norm_eps: Epsilon of the per-head Q/K RMS norms.
+        dtype: Dtype handed to the Q/K norm modules.
+        quant_config: Quantization config forwarded to the linear layers.
+        prefix: Module-name prefix; each projection appends its own suffix.
+    """
+
+    def __init__(
+        self,
+        *,
+        hidden_size: int,
+        num_attention_heads: int,
+        num_key_value_heads: int,
+        head_dim: int,
+        rms_norm_eps: float,
+        dtype: torch.dtype = torch.bfloat16,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.hidden_size = hidden_size
+        self.num_heads = num_attention_heads
+        self.num_kv_heads = num_key_value_heads
+        self.head_dim = head_dim
+
+        # Head counts owned by this tensor-parallel rank.
+        tp_size = get_tensor_model_parallel_world_size()
+        self.num_heads_local = self.num_heads // tp_size
+        self.num_kv_heads_local = self.num_kv_heads // tp_size
+
+        q_width = self.num_heads * self.head_dim
+        kv_width = self.num_kv_heads * self.head_dim
+        # Options shared by all four bias-free projections.
+        shared = {"bias": False, "return_bias": False, "quant_config": quant_config}
+        self.q_proj = ColumnParallelLinear(
+            hidden_size, q_width, gather_output=False, prefix=f"{prefix}.q_proj", **shared
+        )
+        self.k_proj = ColumnParallelLinear(
+            hidden_size, kv_width, gather_output=False, prefix=f"{prefix}.k_proj", **shared
+        )
+        self.v_proj = ColumnParallelLinear(
+            hidden_size, kv_width, gather_output=False, prefix=f"{prefix}.v_proj", **shared
+        )
+        self.o_proj = RowParallelLinear(
+            q_width, hidden_size, input_is_parallel=True, prefix=f"{prefix}.o_proj", **shared
+        )
+
+        self.q_norm = Qwen3VLTextRMSNorm(self.head_dim, eps=rms_norm_eps, dtype=dtype)
+        self.k_norm = Qwen3VLTextRMSNorm(self.head_dim, eps=rms_norm_eps, dtype=dtype)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        freqs_cos: torch.Tensor,
+        freqs_sin: torch.Tensor,
+        text_mask: torch.Tensor | None = None,
+    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        """Run causal attention and return ``(attn_output, k, v)``.
+
+        Args:
+            hidden_states: ``[B, S, hidden_size]`` text hidden states.
+            freqs_cos: RoPE cosine table passed to ``_apply_rotary_pos_emb``.
+            freqs_sin: RoPE sine table passed to ``_apply_rotary_pos_emb``.
+            text_mask: Optional ``[B, S]`` mask; non-zero entries mark keys that
+                may be attended to. It is AND-ed with the causal mask.
+        """
+        batch, seq_len, _ = hidden_states.shape
+
+        q = self.q_proj(hidden_states).view(batch, seq_len, self.num_heads_local, self.head_dim)
+        k = self.k_proj(hidden_states).view(batch, seq_len, self.num_kv_heads_local, self.head_dim)
+        v = self.v_proj(hidden_states).view(batch, seq_len, self.num_kv_heads_local, self.head_dim)
+
+        # Per-head RMS norm on Q and K only (V is left untouched).
+        q = F.rms_norm(q, (q.shape[-1],), self.q_norm.weight, self.q_norm.variance_epsilon)
+        k = F.rms_norm(k, (k.shape[-1],), self.k_norm.weight, self.k_norm.variance_epsilon)
+
+        # Qwen3-style RoPE on the normalised Q and K.
+        q, k = _apply_rotary_pos_emb(q, k, freqs_cos, freqs_sin)
+
+        # SDPA wants [B, h, S, D]; the returned k/v stay in [B, S, h, D].
+        q_t, k_t, v_t = (x.transpose(1, 2) for x in (q, k, v))
+
+        if text_mask is None:
+            out = F.scaled_dot_product_attention(q_t, k_t, v_t, is_causal=True, enable_gqa=True)
+        else:
+            device = hidden_states.device
+            causal = torch.ones(seq_len, seq_len, device=device, dtype=torch.bool).tril()
+            key_valid = text_mask[:, None, None, :].bool()  # [B, 1, 1, S]
+            allowed = causal[None, None] & key_valid  # [B, 1, S, S]
+            out = F.scaled_dot_product_attention(q_t, k_t, v_t, attn_mask=allowed, enable_gqa=True)
+
+        out = out.transpose(1, 2).contiguous().view(batch, seq_len, -1)
+        return self.o_proj(out), k, v
+
+
+class Cosmos3CrossAttention(nn.Module):
+    """Generation pathway: full attention where visual Q attends to all K/V.
+
+    Dual-path implementation:
+
+    * **Non-SP path** (SP inactive): the framework ``Attention`` layer with
+      explicit ``cat([k_und, k_gen])`` concatenation.  Text conditioning is
+      always present because K/V are physically concatenated.
+
+    * **SP path** (Ulysses active): the framework ``Attention`` layer with
+      ``joint_key/joint_value`` in ``AttentionMetadata``.  The Ulysses
+      wrapper head-slices the replicated UND K/V and performs all-to-all
+      on the sharded GEN Q/K/V so every query sees the full context.
+    """
+
+    def __init__(
+        self,
+        *,
+        hidden_size: int,
+        num_attention_heads: int,
+        num_key_value_heads: int,
+        head_dim: int,
+        rms_norm_eps: float,
+        dtype: torch.dtype = torch.bfloat16,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.hidden_size = hidden_size
+        self.num_heads = num_attention_heads
+        self.num_kv_heads = num_key_value_heads
+        self.head_dim = head_dim
+
+        tp_size = get_tensor_model_parallel_world_size()
+        self.num_heads_local = self.num_heads // tp_size
+        self.num_kv_heads_local = self.num_kv_heads // tp_size
+
+        self.q_proj = ColumnParallelLinear(
+            hidden_size,
+            self.num_heads * self.head_dim,
+            bias=False,
+            gather_output=False,
+            return_bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.q_proj",
+        )
+        self.k_proj = ColumnParallelLinear(
+            hidden_size,
+            self.num_kv_heads * self.head_dim,
+            bias=False,
+            gather_output=False,
+            return_bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.k_proj",
+        )
+        self.v_proj = ColumnParallelLinear(
+            hidden_size,
+            self.num_kv_heads * self.head_dim,
+            bias=False,
+            gather_output=False,
+            return_bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.v_proj",
+        )
+        self.o_proj = RowParallelLinear(
+            self.num_heads * self.head_dim,
+            hidden_size,
+            bias=False,
+            input_is_parallel=True,
+            return_bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.o_proj",
+        )
+
+        self.q_norm = Qwen3VLTextRMSNorm(self.head_dim, eps=rms_norm_eps, dtype=dtype)
+        self.k_norm = Qwen3VLTextRMSNorm(self.head_dim, eps=rms_norm_eps, dtype=dtype)
+
+        self.local_attn = FrameworkAttention(
+            num_heads=self.num_heads_local,
+            head_size=self.head_dim,
+            causal=False,
+            softmax_scale=1.0 / (self.head_dim**0.5),
+            num_kv_heads=self.num_kv_heads_local,
+            skip_sequence_parallel=True,
+        )
+
+        # Lazy-created on first SP forward so it captures the active SP context.
+        self._sp_attn: nn.Module | None = None
+
+    def _get_sp_attn(self) -> nn.Module:
+        """Return the SP attention layer, creating it on first use.
+
+        Head counts are the TP-local ones so they match the q/k/v views built in ``forward``.
+        """
+        if self._sp_attn is None:
+            self._sp_attn = FrameworkAttention(
+                num_heads=self.num_heads_local,  # TP-local, like ``local_attn`` and ``joint_q``
+                head_size=self.head_dim,
+                causal=False,
+                softmax_scale=1.0 / (self.head_dim**0.5),
+                num_kv_heads=self.num_kv_heads_local,
+            )
+        return self._sp_attn
+
+    # -- Non-SP path: explicit K/V concatenation + framework Attention --------
+
+    def _forward_local(
+        self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, k_und: torch.Tensor, v_und: torch.Tensor
+    ) -> torch.Tensor:
+        """Non-SP attention: GEN queries attend to UND K/V concatenated before GEN K/V."""
+        B, S_gen = q.shape[:2]
+        k_all = torch.cat([k_und, k], dim=1)
+        v_all = torch.cat([v_und, v], dim=1)
+
+        out = self.local_attn(q, k_all, v_all)
+        return out.reshape(B, S_gen, -1)
+
+    # -- SP path: framework Attention with joint_key/value -------------------
+
+    def _forward_sp(
+        self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, k_und: torch.Tensor, v_und: torch.Tensor
+    ) -> torch.Tensor:
+        """SP attention: UND K/V are passed as ``joint_key``/``joint_value`` with an empty joint query."""
+        B, S_gen = q.shape[:2]
+
+        # Zero-length joint_query satisfies the Ulysses contract
+        # (joint_query, joint_key, joint_value must all be present) without
+        # adding text tokens to Q.  joint_len=0 keeps post_attention on the
+        # standard reverse-all-to-all path (no joint-output splitting).
+        joint_q = q.new_empty(B, 0, self.num_heads_local, self.head_dim)
+
+        attn_metadata = AttentionMetadata(
+            joint_query=joint_q,
+            joint_key=k_und,
+            joint_value=v_und,
+            joint_strategy="front",
+        )
+        out = self._get_sp_attn()(q, k, v, attn_metadata)
+        return out.reshape(B, S_gen, -1)
+
+    # -- Public forward: routes to the appropriate path ----------------------
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        k_und: torch.Tensor,
+        v_und: torch.Tensor,
+        freqs_cos: torch.Tensor,
+        freqs_sin: torch.Tensor,
+    ) -> torch.Tensor:
+        """
+        Args:
+            hidden_states: [B, S_gen_local, hidden_size] (may be sequence-sharded)
+            k_und: [B, S_und, H_kv_local, D] pre-computed UND keys (TP-sharded, post-norm/RoPE)
+            v_und: [B, S_und, H_kv_local, D] pre-computed UND values (TP-sharded)
+            freqs_cos: [B, S_gen_local, 1, D]
+            freqs_sin: [B, S_gen_local, 1, D]
+
+        Returns:
+            ``o_proj`` output for the attended GEN tokens.
+        """
+        B, S_gen, _ = hidden_states.shape
+
+        q = self.q_proj(hidden_states).view(B, S_gen, self.num_heads_local, self.head_dim)
+        k = self.k_proj(hidden_states).view(B, S_gen, self.num_kv_heads_local, self.head_dim)
+        v = self.v_proj(hidden_states).view(B, S_gen, self.num_kv_heads_local, self.head_dim)
+
+        # Per-head QK norm
+        q = F.rms_norm(q, (q.shape[-1],), self.q_norm.weight, self.q_norm.variance_epsilon)
+        k = F.rms_norm(k, (k.shape[-1],), self.k_norm.weight, self.k_norm.variance_epsilon)
+
+        # Qwen3-style RoPE
+        q, k = _apply_rotary_pos_emb(q, k, freqs_cos, freqs_sin)
+
+        # Route on the runtime SP state; both paths return [B, S_gen, -1].
+        if _is_sp_active():
+            out = self._forward_sp(q, k, v, k_und, v_und)
+        else:
+            out = self._forward_local(q, k, v, k_und, v_und)
+
+        return self.o_proj(out)
+
+
+# ---------------------------------------------------------------------------
+# Decoder Layers
+# ---------------------------------------------------------------------------
+class Cosmos3UndDecoderLayer(nn.Module):
+    """Pre-norm decoder layer of the understanding pathway.
+
+    Applies causal self-attention and a gated MLP, each wrapped in a residual
+    connection, and hands back the attention K/V for the generation pathway.
+    """
+
+    def __init__(
+        self,
+        *,
+        hidden_size: int,
+        intermediate_size: int,
+        num_attention_heads: int,
+        num_key_value_heads: int,
+        head_dim: int,
+        rms_norm_eps: float,
+        dtype: torch.dtype = torch.bfloat16,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.self_attn = Cosmos3CausalAttention(
+            hidden_size=hidden_size,
+            num_attention_heads=num_attention_heads,
+            num_key_value_heads=num_key_value_heads,
+            head_dim=head_dim,
+            rms_norm_eps=rms_norm_eps,
+            dtype=dtype,
+            quant_config=quant_config,
+            prefix=f"{prefix}.self_attn",
+        )
+        # Both RMS norms share the same epsilon and dtype.
+        norm_kwargs = {"eps": rms_norm_eps, "dtype": dtype}
+        self.input_layernorm = Qwen3VLTextRMSNorm(hidden_size, **norm_kwargs)
+        self.post_attention_layernorm = Qwen3VLTextRMSNorm(hidden_size, **norm_kwargs)
+        self.mlp = Cosmos3GatedMLP(
+            hidden_size=hidden_size, intermediate_size=intermediate_size, quant_config=quant_config, prefix=f"{prefix}.mlp"
+        )
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        freqs: tuple[torch.Tensor, torch.Tensor],
+        text_mask: torch.Tensor | None = None,
+    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        """Return ``(hidden_states, K, V)``; K/V come from the self-attention module."""
+        # Attention sub-block: pre-norm, causal self-attention, residual add.
+        normed = self.input_layernorm(hidden_states)
+        cos, sin = freqs
+        attn_out, k, v = self.self_attn(normed, cos, sin, text_mask)
+        hidden_states = hidden_states + attn_out
+
+        # MLP sub-block: pre-norm, gated MLP, residual add.
+        normed = self.post_attention_layernorm(hidden_states)
+        return hidden_states + self.mlp(normed), k, v
+
+
+class Cosmos3GenDecoderLayer(nn.Module):
+    """Pre-norm GEN decoder layer: attention over UND + GEN K/V, then a gated MLP, both residual."""
+
+    def __init__(
+        self,
+        *,
+        layer_idx: int | None = None,
+        hidden_size: int,
+        intermediate_size: int,
+        num_attention_heads: int,
+        num_key_value_heads: int,
+        head_dim: int,
+        rms_norm_eps: float,
+        dtype: torch.dtype = torch.bfloat16,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.layer_idx = layer_idx
+        self.cross_attention = Cosmos3CrossAttention(
+            hidden_size=hidden_size,
+            num_attention_heads=num_attention_heads,
+            num_key_value_heads=num_key_value_heads,
+            head_dim=head_dim,
+            rms_norm_eps=rms_norm_eps,
+            dtype=dtype,
+            quant_config=quant_config,
+            prefix=f"{prefix}.cross_attention",
+        )
+        self.input_layernorm = Qwen3VLTextRMSNorm(hidden_size, eps=rms_norm_eps, dtype=dtype)
+        self.post_attention_layernorm = Qwen3VLTextRMSNorm(hidden_size, eps=rms_norm_eps, dtype=dtype)
+        self.mlp = Cosmos3GatedMLP(
+            hidden_size=hidden_size, intermediate_size=intermediate_size, quant_config=quant_config, prefix=f"{prefix}.mlp"
+        )
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        *,
+        k_und: torch.Tensor | None = None,
+        v_und: torch.Tensor | None = None,
+        freqs_cos: torch.Tensor | None = None,
+        freqs_sin: torch.Tensor | None = None,
+        cached_kv: list[tuple[torch.Tensor, torch.Tensor]] | None = None,
+        freqs_gen: tuple[torch.Tensor, torch.Tensor] | None = None,
+    ) -> torch.Tensor:
+        """Run cross-attention to the UND K/V followed by the gated MLP.
+
+        ``cached_kv[self.layer_idx]`` overrides ``k_und``/``v_und`` and ``freqs_gen``
+        overrides ``freqs_cos``/``freqs_sin``; a ``ValueError`` is raised when
+        ``layer_idx`` is unset with ``cached_kv`` or an input is still missing.
+        """
+        if cached_kv is not None:
+            if self.layer_idx is None:
+                raise ValueError("Cosmos3 GEN layer requires layer_idx when cached_kv is provided.")
+            k_und, v_und = cached_kv[self.layer_idx]
+        if freqs_gen is not None:
+            freqs_cos, freqs_sin = freqs_gen
+        if any(item is None for item in (k_und, v_und, freqs_cos, freqs_sin)):
+            raise ValueError("Cosmos3 GEN layer requires k_und/v_und/freqs_cos/freqs_sin.")
+
+        # Attention sub-block: pre-norm, cross-attention, residual add.
+        attn_in = self.input_layernorm(hidden_states)
+        attn_out = self.cross_attention(
+            attn_in, k_und=k_und, v_und=v_und, freqs_cos=freqs_cos, freqs_sin=freqs_sin
+        )
+        hidden_states = hidden_states + attn_out
+
+        # MLP sub-block: pre-norm, gated MLP, residual add.
+        mlp_in = self.post_attention_layernorm(hidden_states)
+        return hidden_states + self.mlp(mlp_in)
+
+
+# ---------------------------------------------------------------------------
+# Language Model (Understanding pathway)
+# ---------------------------------------------------------------------------
+class Cosmos3LanguageModel(nn.Module):
+    """Understanding pathway: a standard causal LM that processes text tokens.
+
+    Returns per-layer K/V tensors for the generation pathway's cross-attention.
+    The UND pathway is independent of the denoising step, so its K/V can be
+    computed once and reused across all sampling steps.
+    """
+
+    def __init__(
+        self,
+        *,
+        hidden_size: int,
+        intermediate_size: int,
+        num_hidden_layers: int,
+        num_attention_heads: int,
+        num_key_value_heads: int,
+        head_dim: int,
+        vocab_size: int,
+        rms_norm_eps: float,
+        rope_theta: float,
+        mrope_section: list[int],
+        dtype: torch.dtype = torch.bfloat16,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.embed_tokens = nn.Embedding(vocab_size, hidden_size)
+        self.rotary_emb = Qwen3VLTextRotaryEmbedding(
+            head_dim=head_dim, rope_theta=rope_theta, mrope_section=mrope_section
+        )
+        self.layers = nn.ModuleList(
+            [
+                Cosmos3UndDecoderLayer(
+                    hidden_size=hidden_size,
+                    intermediate_size=intermediate_size,
+                    num_attention_heads=num_attention_heads,
+                    num_key_value_heads=num_key_value_heads,
+                    head_dim=head_dim,
+                    rms_norm_eps=rms_norm_eps,
+                    dtype=dtype,
+                    quant_config=quant_config,
+                    prefix=f"{prefix}.layers.{i}",
+                )
+                for i in range(num_hidden_layers)
+            ]
+        )
+        self.norm = Qwen3VLTextRMSNorm(hidden_size, eps=rms_norm_eps, dtype=dtype)  # not used by forward()
+
+    def forward(
+        self,
+        text_ids: torch.Tensor,
+        text_mask: torch.Tensor,
+        freqs: tuple[torch.Tensor, torch.Tensor],
+    ) -> list[tuple[torch.Tensor, torch.Tensor]]:
+        """
+        Args:
+            text_ids: [B, S] token IDs
+            text_mask: [B, S] mask (1=real, 0=pad); cast to the embedding dtype
+            freqs: (cos, sin) each [B, S, 1, D]
+
+        Returns:
+            List of (K, V) per layer, each [B, S, H_kv, D].
+        """
+        hidden = self.embed_tokens(text_ids)
+        mask_3d = text_mask.unsqueeze(-1).to(hidden.dtype)  # [B, S, 1]; cast avoids float32 promotion
+
+        cached_kv: list[tuple[torch.Tensor, torch.Tensor]] = []
+        for layer in self.layers:
+            hidden = hidden * mask_3d
+            # NOTE(review): ``text_mask`` is not forwarded (``text_mask=None``); padding is only
+            # zeroed in the hidden states, so attention is purely causal — confirm intended.
+            hidden, k, v = layer(hidden, freqs, text_mask=None)
+            cached_kv.append((k, v))
+
+        return cached_kv
+
+
+# ---------------------------------------------------------------------------
+# Main Transformer
+# ---------------------------------------------------------------------------
+class Cosmos3GenSPPrepare(nn.Module):
+    """Pass-through module; gives ``_sp_plan`` a boundary for the GEN states and RoPE tables."""
+
+    def forward(
+        self, hidden_gen: torch.Tensor, freqs_cos: torch.Tensor, freqs_sin: torch.Tensor
+    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        """Return the three inputs unchanged, in order."""
+        # No computation here: the module only provides a named boundary.
+        passthrough = (hidden_gen, freqs_cos, freqs_sin)
+        return passthrough
+
+
+class Cosmos3VFMTransformer(nn.Module):
+    """Cosmos3 VFM Transformer: UND language model + GEN denoising layers.
+
+    The UND pathway runs once per generation (K/V cached). The GEN pathway
+    runs at each denoising step.
+
+    Layerwise offloading uses ``gen_layers`` as the block container.
+
+    Sequence parallelism uses ``_sp_plan`` to shard/gather the GEN pathway at
+    module boundaries. ``Cosmos3CrossAttention`` calls ``_is_sp_active()`` at
+    runtime and picks a framework ``Attention`` layer: the joint K/V (Ulysses)
+    one when SP is active, else one built with ``skip_sequence_parallel=True``.
+    """
+
+    _repeated_blocks = ["Cosmos3GenDecoderLayer"]  # class name of the repeated GEN block
+
+    _layerwise_offload_blocks_attr = "gen_layers"  # attribute holding the GEN ``nn.ModuleList``
+
+    packed_modules_mapping = {}
+
+    @staticmethod
+    def _is_transformer_block(name: str, module) -> bool:  # numbered child of a layer list; ``module`` unused
+        return ("gen_layers" in name or "language_model.layers" in name) and name.split(".")[-1].isdigit()
+
+    _hsdp_shard_conditions = [_is_transformer_block]
+
+    _sp_plan = {
+        "gen_sp_prepare": {
+            0: SequenceParallelInput(split_dim=1, expected_dims=3, split_output=True),  # presumably hidden_gen
+            1: SequenceParallelInput(split_dim=1, expected_dims=4, split_output=True),  # presumably freqs_cos
+            2: SequenceParallelInput(split_dim=1, expected_dims=4, split_output=True),  # presumably freqs_sin
+        },
+        "gen_sp_gather": SequenceParallelOutput(gather_dim=1, expected_dims=3),
+    }
+
+    @staticmethod
+    def _validate_supported_config(model_config: Any) -> None:
+        """Raise ``ValueError`` if any pinned config option differs from the supported value."""
+        supported = (
+            ("qk_norm_for_diffusion", True),
+            ("qk_norm_for_text", True),
+            ("position_embedding_type", "unified_3d_mrope"),
+            ("unified_3d_mrope_reset_spatial_ids", True),
+            ("joint_attn_implementation", "two_way"),
+        )
+        for key, expected in supported:
+            # ``expected`` doubles as the lookup default, so an absent key presumably passes.
+            if (actual := _tf_config_get(model_config, key, expected)) != expected:
+                raise ValueError(f"Unsupported Cosmos3 transformer config: {key}={actual!r}; expected {expected!r}.")
+
+    def __init__(
+        self,
+        od_config: object | None = None,
+        *,
+        temporal_compression_factor: int | None = None,
+    ) -> None:  # reads sizes from ``od_config.tf_model_config`` and builds the UND and GEN stacks
+        super().__init__()
+        model_config = getattr(od_config, "tf_model_config", None) if od_config else None
+        self._validate_supported_config(model_config)
+        rope_scaling = _tf_config_get(model_config, "rope_scaling", {}) or {}
+
+        self.hidden_size = int(_tf_config_get(model_config, "hidden_size", 4096))
+        self.num_hidden_layers = int(_tf_config_get(model_config, "num_hidden_layers", 36))
+        self.num_attention_heads = int(_tf_config_get(model_config, "num_attention_heads", 32))
+        self.num_key_value_heads = int(_tf_config_get(model_config, "num_key_value_heads", 8))
+        self.head_dim = int(_tf_config_get(model_config, "head_dim", 128))
+        self.intermediate_size = int(_tf_config_get(model_config, "intermediate_size", 12288))
+        self.vocab_size = int(_tf_config_get(model_config, "vocab_size", 151936))
+        self.rms_norm_eps = float(_tf_config_get(model_config, "rms_norm_eps", 1e-6))
+        self.rope_theta = float(_tf_config_get(model_config, "rope_theta", 5_000_000))
+        self.mrope_section = list(rope_scaling.get("mrope_section", [24, 20, 20]))
+        self.latent_patch_size = int(_tf_config_get(model_config, "latent_patch_size", 2))
+        self.latent_channel_size = int(_tf_config_get(model_config, "latent_channel", 48))
+        self.timestep_scale = float(_tf_config_get(model_config, "timestep_scale", 0.001))
+        self.base_fps = float(_tf_config_get(model_config, "base_fps", 24.0))
+        sound_gen_value = _od_config_get(od_config, "sound_gen", None)
+        sound_dim_value = _od_config_get(od_config, "sound_dim", None)
+        self.sound_gen = _as_bool(sound_gen_value) if sound_gen_value is not None else sound_dim_value is not None
+        self.sound_dim = int(sound_dim_value if sound_dim_value is not None else 64)
+        action_gen_value = _od_config_get(od_config, "action_gen", None)
+        action_dim_value = _od_config_get(od_config, "action_dim", None)
+        if action_dim_value is None:
+            action_dim_value = _od_config_get(od_config, "max_action_dim", None)  # fallback key
+        self.action_gen = _as_bool(action_gen_value) if action_gen_value is not None else action_dim_value is not None
+        self.action_dim = int(action_dim_value if action_dim_value is not None else 64)
+        self.num_embodiment_domains = int(_od_config_get(od_config, "num_embodiment_domains", 32))
+        from .sound_tokenizer import get_sound_latent_fps  # NOTE(review): local import, presumably avoids a cycle
+
+        self.sound_latent_fps = float(get_sound_latent_fps(od_config))
+        if temporal_compression_factor is None:
+            temporal_compression_factor = _tf_config_get(model_config, "temporal_compression_factor", 4)
+        self.temporal_compression_factor = int(temporal_compression_factor)
+        self.enable_fps_modulation = bool(_tf_config_get(model_config, "enable_fps_modulation", True))
+        self.temporal_modality_margin = int(
+            _tf_config_get(
+                model_config,
+                "unified_3d_mrope_temporal_modality_margin",
+                15000,
+            )
+        )
+        self.patch_latent_dim = (self.latent_patch_size**2) * self.latent_channel_size  # features per p x p patch
+
+        dtype = getattr(od_config, "dtype", torch.bfloat16) if od_config else torch.bfloat16
+        quant_config = getattr(od_config, "quantization_config", None) if od_config else None
+
+        self.language_model = Cosmos3LanguageModel(
+            hidden_size=self.hidden_size,
+            intermediate_size=self.intermediate_size,
+            num_hidden_layers=self.num_hidden_layers,
+            num_attention_heads=self.num_attention_heads,
+            num_key_value_heads=self.num_key_value_heads,
+            head_dim=self.head_dim,
+            vocab_size=self.vocab_size,
+            rms_norm_eps=self.rms_norm_eps,
+            rope_theta=self.rope_theta,
+            mrope_section=self.mrope_section,
+            dtype=dtype,
+            quant_config=quant_config,
+            prefix="language_model",
+        )
+
+        # vae2llm / llm2vae are small projection layers; not worth quantizing.
+        self.vae2llm = nn.Linear(self.patch_latent_dim, self.hidden_size)
+        self.llm2vae = nn.Linear(self.hidden_size, self.patch_latent_dim)
+        if self.action_gen:
+            self.action2llm = DomainAwareLinear(
+                self.action_dim,
+                self.hidden_size,
+                self.num_embodiment_domains,
+                dtype=dtype,
+            )
+            self.llm2action = DomainAwareLinear(
+                self.hidden_size,
+                self.action_dim,
+                self.num_embodiment_domains,
+                dtype=dtype,
+            )
+            self.action_modality_embed = nn.Parameter(torch.zeros(self.hidden_size, dtype=dtype))
+        if self.sound_gen:
+            self.sound2llm = nn.Linear(self.sound_dim, self.hidden_size)
+            self.llm2sound = nn.Linear(self.hidden_size, self.sound_dim)
+            self.sound_modality_embed = nn.Parameter(torch.zeros(self.hidden_size))  # default dtype, unlike action
+
+        self.time_embedder = TimestepEmbedder(self.hidden_size, target_dtype=torch.bfloat16)  # NOTE(review): fixed bf16
+
+        self.gen_layers = nn.ModuleList(
+            [
+                Cosmos3GenDecoderLayer(
+                    layer_idx=i,
+                    hidden_size=self.hidden_size,
+                    intermediate_size=self.intermediate_size,
+                    num_attention_heads=self.num_attention_heads,
+                    num_key_value_heads=self.num_key_value_heads,
+                    head_dim=self.head_dim,
+                    rms_norm_eps=self.rms_norm_eps,
+                    dtype=dtype,
+                    quant_config=quant_config,
+                    prefix=f"gen_layers.{i}",
+                )
+                for i in range(self.num_hidden_layers)
+            ]
+        )
+
+        self.norm_moe_gen = Qwen3VLTextRMSNorm(self.hidden_size, eps=self.rms_norm_eps, dtype=dtype)
+        self.gen_sp_prepare = Cosmos3GenSPPrepare()  # boundary named in ``_sp_plan``
+        self.gen_sp_gather = nn.Identity()  # boundary named in ``_sp_plan``
+
+        # SDPA backend selection for torch.nn.attention.sdpa_kernel context.
+        # Default: allow all backends; override to restrict (e.g. FlashAttention only).
+        self.sdpa_backends = [
+            torch.nn.attention.SDPBackend.CUDNN_ATTENTION,
+            torch.nn.attention.SDPBackend.FLASH_ATTENTION,
+            torch.nn.attention.SDPBackend.EFFICIENT_ATTENTION,
+            torch.nn.attention.SDPBackend.MATH,
+        ]
+
+        # Cached state (populated on first forward, reused across denoising steps)
+        self.cached_kv: list[tuple[torch.Tensor, torch.Tensor]] | None = None
+        self.cached_freqs_gen: tuple[torch.Tensor, torch.Tensor] | None = None
+
+    @property
+    def device(self) -> torch.device:  # device of the first parameter yielded by ``parameters()``
+        return next(self.parameters()).device  # ``next`` raises StopIteration if there are no parameters
+
+    # -- Patchify / Unpatchify -----------------------------------------------
+
+    def _pad_to_patch_size(self, h: int, w: int) -> tuple[int, int, int, int]:
+        """Return ``(hp, wp, H_padded, W_padded)``: patch counts and sizes rounded up to whole patches."""
+        patch = self.latent_patch_size
+        rows = (h + patch - 1) // patch  # ceil(h / patch)
+        cols = (w + patch - 1) // patch  # ceil(w / patch)
+        return rows, cols, rows * patch, cols * patch
+
+    def patchify(self, latents: torch.Tensor, t: int, h: int, w: int) -> torch.Tensor:
+        """Turn ``[B, C, t, h, w]`` latents into ``[B, t*hp*wp, p*p*C]`` patch tokens.
+
+        ``h``/``w`` are padded at the bottom/right up to a multiple of the patch size.
+        """
+        batch = latents.shape[0]
+        patch, channels = self.latent_patch_size, self.latent_channel_size
+        rows, cols, padded_h, padded_w = self._pad_to_patch_size(h, w)
+        if (padded_h, padded_w) != (h, w):
+            latents = F.pad(latents, (0, padded_w - w, 0, padded_h - h))
+        grid = latents.reshape(batch, channels, t, rows, patch, cols, patch)
+        # permute -> [B, t, hp, wp, p, p, C], then flatten patches and per-patch features.
+        return grid.permute(0, 2, 3, 5, 4, 6, 1).reshape(batch, t * rows * cols, patch * patch * channels)
+
+    def unpatchify(self, tokens: torch.Tensor, t: int, h: int, w: int) -> torch.Tensor:
+        """Turn ``[B, t*hp*wp, p*p*C]`` patch tokens back into ``[B, C, t, h, w]`` latents.
+
+        Rows/columns beyond ``h``/``w`` (patch-size padding) are cropped away.
+        """
+        batch = tokens.shape[0]
+        patch, channels = self.latent_patch_size, self.latent_channel_size
+        rows, cols, padded_h, padded_w = self._pad_to_patch_size(h, w)
+        grid = tokens.reshape(batch, t, rows, cols, patch, patch, channels)
+        # permute -> [B, C, t, hp, p, wp, p], then merge (hp, p) and (wp, p) into H and W.
+        latents = grid.permute(0, 6, 1, 2, 4, 3, 5).reshape(batch, channels, t, padded_h, padded_w)
+        if (padded_h, padded_w) == (h, w):
+            return latents
+        return latents[..., :h, :w]
+
+    def pack_sound(self, sound_latents: torch.Tensor) -> torch.Tensor:
+        """Validate ``[B, C_sound, T_sound]`` latents and return them as ``[B, T_sound, C_sound]``."""
+        shape = tuple(sound_latents.shape)
+        if len(shape) != 3:
+            raise ValueError(f"Cosmos3 sound latents must have shape [B, C, T], got {shape}.")
+        channels = shape[1]
+        if channels != self.sound_dim:
+            raise ValueError(f"Cosmos3 sound latent channel mismatch: expected {self.sound_dim}, got {channels}.")
+        return sound_latents.transpose(1, 2).contiguous()
+
+    @staticmethod
+    def unpack_sound(tokens: torch.Tensor) -> torch.Tensor:
+        """Swap the last two axes of 3-D tokens: ``[B, T_sound, C_sound]`` -> ``[B, C_sound, T_sound]``."""
+        return torch.permute(tokens, (0, 2, 1)).contiguous()
+
+    def pack_action(self, action_latents: torch.Tensor) -> torch.Tensor:
+        """Check that action latents are ``[B, T_action, action_dim]`` and return them contiguous."""
+        shape = tuple(action_latents.shape)
+        if len(shape) != 3:
+            raise ValueError(f"Cosmos3 action latents must have shape [B, T, D], got {shape}.")
+        width = shape[-1]
+        if width != self.action_dim:
+            raise ValueError(f"Cosmos3 action latent dimension mismatch: expected {self.action_dim}, got {width}.")
+        return action_latents.contiguous()
+
+    @staticmethod
+    def unpack_action(tokens: torch.Tensor) -> torch.Tensor:
+        """Return the ``[B, T_action, D_action]`` action predictions made contiguous; no reshape is applied."""
+        return tokens.contiguous()
+
+    # -- RoPE computation ----------------------------------------------------
+
+    def _compute_rope_freqs(
+        self,
+        text_mask: torch.Tensor,
+        t: int,
+        hp: int,
+        wp: int,
+        fps: float | None,
+        device: torch.device,
+        dtype: torch.dtype,
+        t_action: int | None = None,
+        action_start_frame_offset: int = 1,
+        action_fps: float | None = None,
+        t_sound: int | None = None,
+    ) -> tuple[tuple[torch.Tensor, torch.Tensor], tuple[torch.Tensor, torch.Tensor]]:
+        """Compute mRoPE cos/sin for UND text and GEN media pathways.
+
+        Media positions start ``temporal_modality_margin`` after each sample's text offset and
+        padded text slots get position 0.  Returns ``(freqs_und, freqs_gen)`` as ``(cos, sin)`` pairs.
+        """
+        S_text = text_mask.shape[1]
+        text_lengths = text_mask.sum(dim=1).long().tolist()  # one host sync, not one .item() per sample
+        effective_fps = fps if fps is not None and t > 1 else None
+        action_frames = int(t_action or 0)
+        sound_frames = int(t_sound or 0)
+
+        text_pos_list = []
+        gen_pos_list = []
+        for real_len in text_lengths:
+            t_pos, t_offset = compute_mrope_position_ids_text(real_len, temporal_offset=0)
+            media_temporal_offset = t_offset + self.temporal_modality_margin
+            v_pos, _ = compute_mrope_position_ids_vision(
+                t,
+                hp,
+                wp,
+                temporal_offset=media_temporal_offset,
+                fps=effective_fps,
+                base_fps=self.base_fps,
+                temporal_compression_factor=self.temporal_compression_factor,
+                enable_fps_modulation=self.enable_fps_modulation,
+            )
+            gen_positions = [v_pos]
+            if action_frames > 0:
+                a_pos, _ = compute_mrope_position_ids_action(
+                    action_frames,
+                    temporal_offset=media_temporal_offset,
+                    action_fps=action_fps if action_fps is not None else fps,
+                    base_fps=self.base_fps,
+                    base_temporal_compression_factor=self.temporal_compression_factor,
+                    enable_fps_modulation=self.enable_fps_modulation,
+                    start_frame_offset=action_start_frame_offset,
+                )
+                gen_positions.append(a_pos)
+            if sound_frames > 0:
+                s_pos, _ = compute_mrope_position_ids_sound(
+                    sound_frames,
+                    temporal_offset=media_temporal_offset,
+                    sound_latent_fps=self.sound_latent_fps,
+                    base_fps=self.base_fps,
+                    base_temporal_compression_factor=self.temporal_compression_factor,
+                    enable_fps_modulation=self.enable_fps_modulation,
+                )
+                gen_positions.append(s_pos)
+            pos_dtype = gen_positions[0].dtype
+            for pos in gen_positions[1:]:
+                pos_dtype = torch.promote_types(pos_dtype, pos.dtype)
+            v_pos = torch.cat([pos.to(pos_dtype) for pos in gen_positions], dim=1)
+            if real_len < S_text:
+                pad = torch.zeros(3, S_text - real_len, dtype=t_pos.dtype, device=t_pos.device)
+                t_pos = torch.cat([t_pos, pad], dim=1)
+            text_pos_list.append(t_pos)
+            gen_pos_list.append(v_pos)
+
+        text_pos_ids = torch.stack(text_pos_list, dim=1).to(device)  # [3, B, S_text]
+        gen_pos_ids = torch.stack(gen_pos_list, dim=1).to(device)  # [3, B, S_gen]
+
+        rotary_emb = self.language_model.rotary_emb
+        _dummy = torch.tensor([], dtype=dtype, device=device)
+        cos_und, sin_und = rotary_emb(_dummy, position_ids=text_pos_ids)
+        cos_gen, sin_gen = rotary_emb(_dummy, position_ids=gen_pos_ids)
+
+        freqs_und = (cos_und.unsqueeze(2), sin_und.unsqueeze(2))  # (B, S, 1, D)
+        freqs_gen = (cos_gen.unsqueeze(2), sin_gen.unsqueeze(2))
+        return freqs_und, freqs_gen
+
+    # -- Cache management ----------------------------------------------------
+
+    def reset_cache(self) -> None:
+        """Clear the cached UND K/V and GEN RoPE tensors so the next forward() rebuilds them."""
+        self.cached_kv = self.cached_freqs_gen = None
+
+    @staticmethod
+    def _validate_gen_sequence_parallel(
+        *,
+        s_gen: int,
+        s_video: int,
+        s_action: int,
+        s_sound: int,
+        has_action: bool,
+        has_sound: bool,
+        ulysses_size: int,
+    ) -> None:
+        """Raise ``ValueError`` when ``ulysses_size`` > 1 does not evenly divide ``s_gen``."""
+        if ulysses_size <= 1 or s_gen % ulysses_size == 0:
+            return
+
+        if has_action or has_sound:
+            breakdown = [f"video tokens {s_video}"]
+            breakdown += [f"action tokens {s_action}"] if has_action else []
+            breakdown += [f"sound tokens {s_sound}"] if has_sound else []
+            detail = " = " + " + ".join(breakdown)
+            hint = (
+                "Adjust the spatial resolution, frame count, action chunk size, "
+                "sound duration, or sound latent FPS so the combined media sequence is a "
+                "multiple of ulysses_degree."
+            )
+        else:
+            detail = ""
+            hint = (
+                "Adjust the spatial resolution so that "
+                "t * ceil(h/patch) * ceil(w/patch) is a multiple "
+                "of ulysses_degree."
+            )
+        raise ValueError(
+            f"GEN sequence length ({s_gen}{detail}) must be divisible by ulysses_degree ({ulysses_size}). {hint}"
+        )
+
+    # -- Forward -------------------------------------------------------------
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        timestep: torch.Tensor,
+        text_ids: torch.Tensor,
+        text_mask: torch.Tensor,
+        video_shape: tuple[int, int, int],
+        fps: float | None = None,
+        action_latents: torch.Tensor | None = None,
+        action_domain_ids: torch.Tensor | None = None,
+        action_noisy_mask: torch.Tensor | None = None,
+        action_start_frame_offset: int = 1,
+        action_fps: float | None = None,
+        sound_latents: torch.Tensor | None = None,
+        noisy_frame_mask: torch.Tensor | None = None,
+        **kwargs,
+    ) -> torch.Tensor | tuple[torch.Tensor, ...]:
+        """
+        Args:
+            hidden_states: [B, C, t, h, w] noisy latents
+            timestep: [B] diffusion timestep
+            text_ids: [B, S_text] tokenized text
+            text_mask: [B, S_text] attention mask (1=real, 0=pad)
+            video_shape: (t, h, w) in latent space
+            fps: video frame rate for temporal mRoPE modulation
+            action_latents: Optional [B, T_action, D_action] noisy action latents.
+            action_domain_ids: Optional [B] embodiment domain IDs for action projections.
+            action_noisy_mask: Optional [B, T_action, 1] mask where 1=noisy
+                action token and 0=clean conditioned token.
+            action_start_frame_offset: start-frame offset passed to the action mRoPE position ids.
+            action_fps: action frame rate for mRoPE; falls back to ``fps`` when None.
+            sound_latents: Optional [B, C_sound, T_sound] noisy sound latents.
+            noisy_frame_mask: Optional [B, 1, t, 1, 1] mask where 1=noisy (add
+                timestep embedding, predict velocity) and 0=conditioned (clean
+                context, skip timestep embedding).  None means all frames noisy
+                (T2V mode).
+
+        Returns:
+            [B, C, t, h, w] velocity prediction, or
+            tuple outputs in video, action, sound order when extra modalities are provided.
+        """
+        t, h, w = video_shape
+        hp, wp, _, _ = self._pad_to_patch_size(h, w)
+        has_action = action_latents is not None
+        has_sound = sound_latents is not None
+        if has_action and not self.action_gen:
+            raise ValueError(
+                "Cosmos3 action generation was requested, but this transformer "
+                "was initialized without action modules. Check that the "
+                "transformer config enables action_gen or defines action_dim/max_action_dim."
+            )
+        if has_sound and not self.sound_gen:
+            raise ValueError(
+                "Cosmos3 sound generation was requested, but this transformer "
+                "was initialized without sound modules. Check that the "
+                "transformer config enables sound_gen or defines sound_dim."
+            )
+
+        # Query Ulysses state at runtime
+        ulysses_size, _, _ = _get_ulysses_state()
+
+        # Patchify latents and project to hidden space
+        hidden_video = self.vae2llm(self.patchify(hidden_states, t, h, w))
+        s_video = hidden_video.shape[1]
+        s_action = s_sound = 0
+        hidden_action = None
+        hidden_sound = None
+        if action_latents is not None:
+            if action_latents.shape[0] != hidden_states.shape[0]:
+                raise ValueError(
+                    "Cosmos3 action and video batch sizes must match: "
+                    f"video={hidden_states.shape[0]}, action={action_latents.shape[0]}."
+                )
+            if action_domain_ids is None:
+                action_domain_ids = torch.zeros(action_latents.shape[0], dtype=torch.long, device=action_latents.device)
+            hidden_action = self.action2llm(self.pack_action(action_latents), action_domain_ids)
+            hidden_action = hidden_action + self.action_modality_embed.to(hidden_action.dtype)
+            s_action = hidden_action.shape[1]
+        if sound_latents is not None:
+            if sound_latents.shape[0] != hidden_states.shape[0]:
+                raise ValueError(
+                    "Cosmos3 sound and video batch sizes must match: "
+                    f"video={hidden_states.shape[0]}, sound={sound_latents.shape[0]}."
+                )
+            hidden_sound = self.sound2llm(self.pack_sound(sound_latents))
+            hidden_sound = hidden_sound + self.sound_modality_embed.to(hidden_sound.dtype)
+            s_sound = hidden_sound.shape[1]
+
+        # Timestep embedding (fp32 for precision).  For I2V: only add to noisy tokens,
+        # not conditioned ones.  Conditioned frames are clean context and should not
+        # receive the diffusion timestep signal.
+        with torch.autocast("cuda", enabled=True, dtype=torch.float32):
+            time_embed = self.time_embedder(timestep * self.timestep_scale)
+        time_embed = time_embed.to(hidden_states.dtype)
+
+        if noisy_frame_mask is not None:
+            # Build per-token mask from per-frame mask.
+            # noisy_frame_mask: [B, 1, t, 1, 1] → token mask: [B, t*hp*wp, 1]
+            token_noisy_mask = (
+                noisy_frame_mask[:, 0, :, 0, 0]  # [B, t]
+                .unsqueeze(-1)  # [B, t, 1]
+                .expand(-1, -1, hp * wp)  # [B, t, hp*wp]
+                .reshape(hidden_video.shape[0], -1, 1)  # [B, t*hp*wp, 1]
+                .to(hidden_video.dtype)  # as for action_noisy_mask: keep the mask from promoting the dtype
+            )
+            hidden_video = hidden_video + time_embed.unsqueeze(1) * token_noisy_mask
+        else:
+            hidden_video = hidden_video + time_embed.unsqueeze(1)
+
+        if hidden_action is not None:
+            if action_noisy_mask is None:
+                hidden_action = hidden_action + time_embed.unsqueeze(1)
+            else:
+                if action_noisy_mask.shape != (hidden_action.shape[0], hidden_action.shape[1], 1):
+                    raise ValueError(
+                        "Cosmos3 action_noisy_mask must have shape [B, T_action, 1], "
+                        f"got {tuple(action_noisy_mask.shape)}."
+                    )
+                hidden_action = hidden_action + time_embed.unsqueeze(1) * action_noisy_mask.to(hidden_action.dtype)
+
+        if hidden_sound is not None:
+            hidden_sound = hidden_sound + time_embed.unsqueeze(1)
+        hidden_parts = [hidden_video]
+        if hidden_action is not None:
+            hidden_parts.append(hidden_action)
+        if hidden_sound is not None:
+            hidden_parts.append(hidden_sound)
+        hidden_gen = torch.cat(hidden_parts, dim=1)
+
+        with torch.nn.attention.sdpa_kernel(self.sdpa_backends, set_priority=True):
+            # Run UND pathway once and cache K/V (replicated across all ranks)
+            if self.cached_kv is None:
+                freqs_und, freqs_gen = self._compute_rope_freqs(
+                    text_mask,
+                    t,
+                    hp,
+                    wp,
+                    fps,
+                    hidden_states.device,
+                    hidden_states.dtype,
+                    t_action=s_action,
+                    action_start_frame_offset=action_start_frame_offset,
+                    action_fps=action_fps,
+                    t_sound=s_sound,
+                )
+                cached_kv_full = self.language_model(text_ids, text_mask, freqs_und)
+                self.cached_freqs_gen = freqs_gen
+
+                # Trim to real text length (remove padding).  K/V stay replicated;
+                # the framework Attention layer head-slices them via joint_key/value.
+                # max_real_len needs a host sync (.item()), so compute it only when the cache is built.
+                max_real_len = int(text_mask.sum(dim=1).max().item())
+                self.cached_kv = [(k[:, :max_real_len], v[:, :max_real_len]) for k, v in cached_kv_full]
+
+            # Run GEN layers.  UND K/V (replicated) is passed to each layer;
+            # the Cosmos3CrossAttention forwards them as joint_key/value so the
+            # framework Attention handles the Ulysses head-slicing internally.
+            if self.cached_kv is None or self.cached_freqs_gen is None:
+                raise RuntimeError("Cosmos3 GEN cache was not initialized before running GEN layers.")
+            self._validate_gen_sequence_parallel(
+                s_gen=hidden_gen.shape[1],
+                s_video=s_video,
+                s_action=s_action,
+                s_sound=s_sound,
+                has_action=has_action,
+                has_sound=has_sound,
+                ulysses_size=ulysses_size,
+            )
+            freqs_cos, freqs_sin = self.cached_freqs_gen
+            hidden_gen, freqs_cos, freqs_sin = self.gen_sp_prepare(hidden_gen, freqs_cos, freqs_sin)
+            freqs_gen = (freqs_cos, freqs_sin)
+
+            if len(self.gen_layers) == len(self.cached_kv):
+                for layer, (k_und, v_und) in zip(self.gen_layers, self.cached_kv, strict=True):
+                    hidden_gen = layer(
+                        hidden_gen,
+                        k_und=k_und,
+                        v_und=v_und,
+                        freqs_cos=freqs_cos,
+                        freqs_sin=freqs_sin,
+                    )
+                    # Cache-dit's block wrapper may return a tuple; unwrap it.
+                    if isinstance(hidden_gen, tuple):
+                        hidden_gen = hidden_gen[0]
+            else:
+                # Cache-dit patches gen_layers to a grouped wrapper.
+                for layer in self.gen_layers:
+                    hidden_gen = layer(
+                        hidden_gen,
+                        cached_kv=self.cached_kv,
+                        freqs_gen=freqs_gen,
+                    )
+                    if isinstance(hidden_gen, tuple):
+                        hidden_gen = hidden_gen[0]
+
+            hidden_gen = self.gen_sp_gather(hidden_gen)
+
+        # Final norm and project back to latent space
+        hidden_gen = self.norm_moe_gen(hidden_gen)
+        if not has_action and not has_sound:
+            return self.unpatchify(self.llm2vae(hidden_gen), t, h, w)
+
+        split_sizes = [s_video]
+        if has_action:
+            split_sizes.append(s_action)
+        if has_sound:
+            split_sizes.append(s_sound)
+        split_hidden = hidden_gen.split(split_sizes, dim=1)
+        hidden_video = split_hidden[0]
+        video_pred = self.unpatchify(self.llm2vae(hidden_video), t, h, w)
+        outputs: list[torch.Tensor] = [video_pred]
+        if has_action:
+            hidden_action = split_hidden[1]  # action tokens directly follow the video tokens
+            assert action_domain_ids is not None
+            outputs.append(self.unpack_action(self.llm2action(hidden_action, action_domain_ids)))
+        if has_sound:
+            hidden_sound = split_hidden[-1]  # sound tokens are always the last segment
+            outputs.append(self.unpack_sound(self.llm2sound(hidden_sound)))
+        return tuple(outputs)
+
+    def post_load_weights(self) -> None:
+        """Post-load hook: cast the timestep embedder to float32."""
+        self.time_embedder.to(dtype=torch.float32)
diff --git a/vllm_omni/diffusion/registry.py b/vllm_omni/diffusion/registry.py
index b6a5a3700e8..42ffb132fe7 100644
--- a/vllm_omni/diffusion/registry.py
+++ b/vllm_omni/diffusion/registry.py
@@ -256,6 +256,11 @@
         "pipeline_omnivoice",
         "OmniVoicePipeline",
     ),
+    "Cosmos3OmniDiffusersPipeline": (
+        "cosmos3",
+        "pipeline_cosmos3",
+        "Cosmos3OmniDiffusersPipeline",
+    ),
     "DiffusersAdapterPipeline": (
         "diffusers_adapter",
         "pipeline_diffusers_adapter",
@@ -482,6 +487,7 @@ def _apply_sequence_parallel_if_enabled(model, od_config: OmniDiffusionConfig) -
     "OmniVoicePipeline": "get_omnivoice_post_process_func",
     "DreamIDOmniPipeline": "get_dreamid_omni_post_process_func",
     "SenseNovaU1Pipeline": "get_sensenova_u1_post_process_func",
+    "Cosmos3OmniDiffusersPipeline": "get_cosmos3_post_process_func",
 }
 
 _DIFFUSION_PRE_PROCESS_FUNCS = {
@@ -505,6 +511,7 @@ def _apply_sequence_parallel_if_enabled(model, od_config: OmniDiffusionConfig) -
     "HunyuanVideo15ImageToVideoPipeline": "get_hunyuan_video_15_i2v_pre_process_func",
     "HunyuanImage3ForCausalMM": "get_hunyuan_image_3_pre_process_func",
     "MagiHumanPipeline": "get_magi_human_pre_process_func",
+    "Cosmos3OmniDiffusersPipeline": "get_cosmos3_pre_process_func",
 }
 
 
diff --git a/vllm_omni/engine/async_omni_engine.py b/vllm_omni/engine/async_omni_engine.py
index 35b323487aa..7d9b8470853 100644
--- a/vllm_omni/engine/async_omni_engine.py
+++ b/vllm_omni/engine/async_omni_engine.py
@@ -1512,6 +1512,7 @@ def _create_default_diffusion_stage_cfg(kwargs: dict[str, Any]) -> list:
             "max_num_seqs": kwargs.get("max_num_seqs") or 1,
             "parallel_config": parallel_config,
             "model_class_name": kwargs.get("model_class_name", None),
+            "model_config": kwargs.get("model_config", None),
             "additional_config": kwargs.get("additional_config", None),
             "step_execution": kwargs.get("step_execution", False),
             "vae_use_slicing": kwargs.get("vae_use_slicing", False),
diff --git a/vllm_omni/entrypoints/openai/api_server.py b/vllm_omni/entrypoints/openai/api_server.py
index c1467f7190a..e6359d1c59d 100644
--- a/vllm_omni/entrypoints/openai/api_server.py
+++ b/vllm_omni/entrypoints/openai/api_server.py
@@ -1549,7 +1549,7 @@ async def generate_images(request: ImageGenerationRequest, raw_request: Request)
             return ImageGenerationResponse(created=int(time.time()), data=image_data)
 
         # Build params - pass through user values directly
-        prompt: OmniTextPrompt = {"prompt": request.prompt}
+        prompt: OmniTextPrompt = {"prompt": request.prompt, "modalities": ["image"]}
         if request.negative_prompt is not None:
             prompt["negative_prompt"] = request.negative_prompt
         gen_params = OmniDiffusionSamplingParams(num_outputs_per_prompt=request.n)
@@ -1725,7 +1725,7 @@ async def edit_images(
         )
     try:
         # 2. Build prompt & images params
-        prompt: OmniTextPrompt = {"prompt": prompt}
+        prompt: OmniTextPrompt = {"prompt": prompt, "modalities": ["image"]}
         if negative_prompt is not None:
             prompt["negative_prompt"] = negative_prompt
         input_images_list = []
@@ -2447,7 +2447,7 @@ async def _run_video_generation_job(
     started_at = time.perf_counter()
     output_path = None
     try:
-        video_bytes, stage_durations, peak_memory_mb = await handler.generate_video_bytes(
+        video_bytes, stage_durations, peak_memory_mb, action = await handler.generate_video_bytes(
             request, video_id, reference_image=reference_image
         )
 
@@ -2465,6 +2465,7 @@ async def _run_video_generation_job(
                 "inference_time_s": time.perf_counter() - started_at,
                 "stage_durations": stage_durations,
                 "peak_memory_mb": peak_memory_mb,
+                "action": action,
             },
         )
     except (EngineGenerateError, EngineDeadError) as exc:
@@ -2529,6 +2530,8 @@ async def _parse_video_form(
     flow_shift: float | None = Form(default=None),
     true_cfg_scale: float | None = Form(default=None),
     seed: int | None = Form(default=None),
+    generate_sound: bool | None = Form(default=None),
+    sound_duration: float | None = Form(default=None, gt=0.0),
     negative_prompt: str | None = Form(default=None),
     enable_frame_interpolation: bool | None = Form(default=None),
     frame_interpolation_exp: int | None = Form(default=None, ge=1),
@@ -2569,6 +2572,8 @@ async def _parse_video_form(
         "flow_shift": flow_shift,
         "true_cfg_scale": true_cfg_scale,
         "seed": seed,
+        "generate_sound": generate_sound,
+        "sound_duration": sound_duration,
         "negative_prompt": negative_prompt,
         "enable_frame_interpolation": enable_frame_interpolation,
         "frame_interpolation_exp": frame_interpolation_exp,
@@ -2672,7 +2677,7 @@ async def create_video_sync(
     raw_request.state.request_metadata = RequestResponseMetadata(request_id=request_id)
     started_at = time.perf_counter()
     try:
-        video_bytes, stage_durations, peak_memory_mb = await asyncio.wait_for(
+        video_bytes, stage_durations, peak_memory_mb, _action = await asyncio.wait_for(
             handler.generate_video_bytes(request, request_id, reference_image=reference_image),
             timeout=VIDEO_SYNC_TIMEOUT_S,
         )
diff --git a/vllm_omni/entrypoints/openai/protocol/__init__.py b/vllm_omni/entrypoints/openai/protocol/__init__.py
index c73203cc4db..58ff188250e 100644
--- a/vllm_omni/entrypoints/openai/protocol/__init__.py
+++ b/vllm_omni/entrypoints/openai/protocol/__init__.py
@@ -9,6 +9,7 @@
     ResponseFormat,
 )
 from vllm_omni.entrypoints.openai.protocol.videos import (
+    VideoAction,
     VideoData,
     VideoGenerationRequest,
     VideoGenerationResponse,
@@ -19,6 +20,7 @@
     "ImageGenerationRequest",
     "ImageGenerationResponse",
     "ResponseFormat",
+    "VideoAction",
     "VideoData",
     "VideoGenerationRequest",
     "VideoGenerationResponse",
diff --git a/vllm_omni/entrypoints/openai/protocol/videos.py b/vllm_omni/entrypoints/openai/protocol/videos.py
index d46c8d43d6b..ec5ab14e8d8 100644
--- a/vllm_omni/entrypoints/openai/protocol/videos.py
+++ b/vllm_omni/entrypoints/openai/protocol/videos.py
@@ -149,6 +149,15 @@ class VideoGenerationRequest(BaseModel):
         description="True CFG scale (model-specific parameter, may be ignored if not supported)",
     )
     seed: int | None = Field(default=None, description="Random seed for reproducibility")
+    generate_sound: bool = Field(
+        default=False,
+        description="Request model-generated audio for video models that support sound generation.",
+    )
+    sound_duration: float | None = Field(
+        default=None,
+        gt=0.0,
+        description="Duration in seconds for model-generated audio. Defaults to the generated video duration.",
+    )
 
     # vllm-omni extensions for post-generation frame interpolation.
     enable_frame_interpolation: bool = Field(
@@ -211,12 +220,24 @@ def resolve_video_params(self) -> VideoParams:
         return vp
 
 
+class VideoAction(BaseModel):  # surfaced as VideoData.action and VideoResponse.action
+    """Generated action sequence returned by action-capable video models."""
+
+    data: list[Any] = Field(..., description="JSON-serializable nested action values")
+    shape: list[int] = Field(..., description="Shape of the returned action data")
+    dtype: str | None = Field(default=None, description="Source action dtype, if available")
+    raw_action_dim: int | None = Field(default=None, description="Raw action dimension requested by the model")
+    action_mode: str | None = Field(default=None, description="Action generation mode")
+    domain_id: int | None = Field(default=None, description="Action embodiment domain id")
+
+
 class VideoData(BaseModel):
     """Single generated video data."""
 
     b64_json: str | None = Field(default=None, description="Base64-encoded MP4 video")
     url: str | None = Field(default=None, description="Video URL (not implemented)")
     revised_prompt: str | None = Field(default=None, description="Revised prompt (OpenAI compatibility, always null)")
+    action: VideoAction | None = Field(default=None, description="Generated action sequence metadata, if any")
 
 
 class VideoGenerationResponse(BaseModel):
@@ -289,6 +310,7 @@ class VideoResponse(BaseModel):
         default=0.0,
         description="Peak device memory usage in MB reported by the diffusion pipeline.",
     )
+    action: VideoAction | None = Field(default=None, description="Generated action sequence metadata, if any")
 
     @property
     def file_extension(self) -> str:
diff --git a/vllm_omni/entrypoints/openai/serving_chat.py b/vllm_omni/entrypoints/openai/serving_chat.py
index 2c375fa2928..6f1be20fa04 100644
--- a/vllm_omni/entrypoints/openai/serving_chat.py
+++ b/vllm_omni/entrypoints/openai/serving_chat.py
@@ -2450,6 +2450,7 @@ async def generate_diffusion_images(
         gen_prompt: OmniTextPrompt = {
             "prompt": prompt,
             "negative_prompt": negative_prompt,
+            "modalities": ["image"],
         }
         if pil_images:
             if len(pil_images) == 1:
@@ -2621,6 +2622,7 @@ async def _create_diffusion_chat_completion(
             gen_prompt: OmniTextPrompt = {
                 "prompt": prompt,
                 "negative_prompt": negative_prompt,
+                "modalities": ["image"],
             }
             gen_params = OmniDiffusionSamplingParams(
                 height=height,
diff --git a/vllm_omni/entrypoints/openai/serving_video.py b/vllm_omni/entrypoints/openai/serving_video.py
index 043ccd98322..f896fa15d75 100644
--- a/vllm_omni/entrypoints/openai/serving_video.py
+++ b/vllm_omni/entrypoints/openai/serving_video.py
@@ -16,6 +16,7 @@
 
 from vllm_omni.entrypoints.async_omni import AsyncOmni
 from vllm_omni.entrypoints.openai.protocol.videos import (
+    VideoAction,
     VideoData,
     VideoGenerationRequest,
     VideoGenerationResponse,
@@ -44,6 +45,7 @@ class VideoGenerationArtifacts:
 
     videos: list[Any]
     audios: list[Any | None]
+    actions: list[VideoAction | None]
     audio_sample_rate: int
     output_fps: int
     stage_durations: dict[str, float]
@@ -96,7 +98,7 @@ async def _run_and_extract(
         reference_image: ReferenceImage | None = None,
     ) -> VideoGenerationArtifacts:
         """Run the generation pipeline and extract video/audio/profiler outputs."""
-        prompt: OmniTextPrompt = OmniTextPrompt(prompt=request.prompt)
+        prompt: OmniTextPrompt = OmniTextPrompt(prompt=request.prompt, modalities=["video"])
         if request.negative_prompt is not None:
             prompt["negative_prompt"] = request.negative_prompt
 
@@ -148,6 +150,10 @@ async def _run_and_extract(
         )
         if "flow_shift" in provided_fields and request.flow_shift is not None:
             gen_params.extra_args["flow_shift"] = request.flow_shift
+        if "generate_sound" in provided_fields:
+            gen_params.extra_args["generate_sound"] = request.generate_sound
+        if "sound_duration" in provided_fields and request.sound_duration is not None:
+            gen_params.extra_args["sound_duration"] = request.sound_duration
 
         # Apply model-specific extra parameters
         if request.extra_params is not None:
@@ -173,11 +179,13 @@ async def _run_and_extract(
         result = await self._run_generation(prompt, gen_params, reference_id)
         videos = self._extract_video_outputs(result)
         audios = self._extract_audio_outputs(result, expected_count=len(videos))
+        actions = self._extract_action_outputs(result, expected_count=len(videos))
         audio_sample_rate = self._resolve_audio_sample_rate(result)
         output_fps = (vp.fps or self._resolve_fps(result) or 24) * self._resolve_video_fps_multiplier(result)
         return VideoGenerationArtifacts(
             videos=videos,
             audios=audios,
+            actions=actions,
             audio_sample_rate=audio_sample_rate,
             output_fps=output_fps,
             stage_durations=self._extract_stage_durations(result),
@@ -211,7 +219,8 @@ async def generate_videos(
                         audio_sample_rate=artifacts.audio_sample_rate,
                         video_codec_options=video_codec_options,
                     )
-                )
+                ),
+                action=artifacts.actions[idx],
             )
             for idx, video in enumerate(artifacts.videos)
         ]
@@ -230,7 +239,7 @@ async def generate_video_bytes(
         reference_id: str,
         *,
         reference_image: ReferenceImage | None = None,
-    ) -> tuple[bytes, dict[str, float], float]:
+    ) -> tuple[bytes, dict[str, float], float, VideoAction | None]:
         """Generate a video and return raw MP4 bytes, bypassing base64 encoding."""
         artifacts = await self._run_and_extract(request, reference_id, reference_image=reference_image)
         if len(artifacts.videos) > 1:
@@ -255,22 +264,15 @@ async def generate_video_bytes(
         )
         _t_encode_ms = (time.perf_counter() - _t_encode_start) * 1000
         logger.info("Video response encoding (MP4 bytes): %.2f ms", _t_encode_ms)
-        return video_bytes, artifacts.stage_durations, artifacts.peak_memory_mb
+        return video_bytes, artifacts.stage_durations, artifacts.peak_memory_mb, artifacts.actions[0]
 
     @staticmethod
     def _resolve_video_fps_multiplier(result: Any) -> int:
-        custom_output = getattr(result, "custom_output", None)
+        custom_output = OmniOpenAIServingVideo._extract_custom_output(result)
         if isinstance(custom_output, dict):
             multiplier = custom_output.get("video_fps_multiplier")
             if multiplier is not None:
                 return int(multiplier)
-        request_output = getattr(result, "request_output", None)
-        if request_output is not None:
-            custom_output = getattr(request_output, "custom_output", None)
-            if isinstance(custom_output, dict):
-                multiplier = custom_output.get("video_fps_multiplier")
-                if multiplier is not None:
-                    return int(multiplier)
         return 1
 
     def _resolve_default_sampling_params(self) -> OmniDiffusionSamplingParams:
@@ -430,6 +432,119 @@ def _extract_audio_outputs(result: Any, expected_count: int) -> list[Any | None]
 
         return [audio] + [None] * max(expected_count - 1, 0)
 
+    @classmethod
+    def _extract_action_outputs(cls, result: Any, expected_count: int) -> list[VideoAction | None]:
+        custom_output = cls._extract_custom_output(result)
+        if not custom_output or "action" not in custom_output:
+            return [None] * expected_count
+
+        action_items = cls._split_action_payload(custom_output["action"], expected_count)
+        return [
+            cls._make_video_action(action_item, custom_output) if action_item is not None else None
+            for action_item in action_items
+        ]
+
+    @staticmethod
+    def _extract_custom_output(result: Any) -> dict[str, Any]:
+        custom_output = getattr(result, "custom_output", None)
+        if isinstance(custom_output, dict):
+            return custom_output
+
+        request_output = getattr(result, "request_output", None)
+        if isinstance(request_output, dict):
+            custom_output = request_output.get("custom_output")
+            if custom_output is None:
+                custom_output = request_output.get("_custom_output")
+        elif request_output is not None:
+            custom_output = getattr(request_output, "custom_output", None)
+            if custom_output is None:
+                custom_output = getattr(request_output, "_custom_output", None)
+
+        return custom_output if isinstance(custom_output, dict) else {}
+
+    @classmethod
+    def _split_action_payload(cls, action: Any, expected_count: int) -> list[Any | None]:
+        if expected_count <= 0:
+            return []
+
+        shape = cls._shape_of(action)
+        if len(shape) >= 3:
+            count = min(shape[0], expected_count)
+            actions = [cls._index_action(action, i) for i in range(count)]
+            actions.extend([None] * (expected_count - count))
+            return actions
+
+        return [action] + [None] * (expected_count - 1)
+
+    @classmethod
+    def _make_video_action(cls, action: Any, custom_output: dict[str, Any]) -> VideoAction:
+        data = cls._to_jsonable(action)
+        if not isinstance(data, list):
+            data = [data]
+
+        action_mode = custom_output.get("action_mode")
+        return VideoAction(
+            data=data,
+            shape=cls._shape_of(action),
+            dtype=cls._dtype_of(action),
+            raw_action_dim=cls._coerce_optional_int(custom_output.get("raw_action_dim")),
+            action_mode=str(action_mode) if action_mode is not None else None,
+            domain_id=cls._coerce_optional_int(custom_output.get("domain_id")),
+        )
+
+    @staticmethod
+    def _index_action(action: Any, index: int) -> Any:
+        try:
+            return action[index]
+        except (IndexError, KeyError, TypeError):
+            return None
+
+    @classmethod
+    def _to_jsonable(cls, value: Any) -> Any:
+        if hasattr(value, "detach"):
+            value = value.detach()
+        if hasattr(value, "cpu"):
+            value = value.cpu()
+        if hasattr(value, "tolist"):
+            return cls._to_jsonable(value.tolist())
+        if isinstance(value, (list, tuple)):
+            return [cls._to_jsonable(item) for item in value]
+        if hasattr(value, "item"):
+            try:
+                return value.item()
+            except (TypeError, ValueError):
+                pass
+        return value
+
+    @classmethod
+    def _shape_of(cls, value: Any) -> list[int]:
+        shape = getattr(value, "shape", None)
+        if shape is not None:
+            try:
+                return [int(dim) for dim in shape]
+            except (TypeError, ValueError):
+                pass
+        if isinstance(value, (list, tuple)):
+            if not value:
+                return [0]
+            return [len(value)] + cls._shape_of(value[0])
+        return []
+
+    @staticmethod
+    def _dtype_of(value: Any) -> str | None:
+        dtype = getattr(value, "dtype", None)
+        return str(dtype) if dtype is not None else None
+
+    @staticmethod
+    def _coerce_optional_int(value: Any) -> int | None:
+        if value is None:
+            return None
+        try:
+            value = value.item() if hasattr(value, "item") else value
+            return int(value)
+        except (TypeError, ValueError):
+            return None
+
     def _resolve_audio_sample_rate(self, result: Any) -> int:
         result_sample_rate = self._extract_audio_sample_rate_from_result(result)
         if result_sample_rate is not None:
diff --git a/vllm_omni/inputs/data.py b/vllm_omni/inputs/data.py
index 1b80f4b1b77..99877b6b9c0 100644
--- a/vllm_omni/inputs/data.py
+++ b/vllm_omni/inputs/data.py
@@ -33,6 +33,7 @@ class OmniTextPrompt(TextPrompt):
     """
 
     negative_prompt: NotRequired[str]
+    modalities: NotRequired[list[str]]
     prompt_embeds: NotRequired[torch.Tensor]
     negative_prompt_embeds: NotRequired[torch.Tensor]
     additional_information: NotRequired[dict[str, Any]]

From ee77c619081dc6531c260ada8687f80fc484eb39 Mon Sep 17 00:00:00 2001
From: Maciej Bala <mbala@nvidia.com>
Date: Thu, 14 May 2026 13:41:22 +0200
Subject: [PATCH 02/41] Small qol improvements

Signed-off-by: Maciej Bala <mbala@nvidia.com>
---
 .../cosmos3/audio_tokenizer/activations.py    |  2 -
 .../cosmos3/audio_tokenizer/bottlenecks.py    |  3 +-
 .../models/cosmos3/audio_tokenizer/modules.py | 39 +++----------------
 .../models/cosmos3/pipeline_cosmos3.py        |  2 +-
 .../models/cosmos3/transformer_cosmos3.py     |  4 +-
 5 files changed, 10 insertions(+), 40 deletions(-)

diff --git a/vllm_omni/diffusion/models/cosmos3/audio_tokenizer/activations.py b/vllm_omni/diffusion/models/cosmos3/audio_tokenizer/activations.py
index 02678a4ef09..0c3daaa4ac5 100755
--- a/vllm_omni/diffusion/models/cosmos3/audio_tokenizer/activations.py
+++ b/vllm_omni/diffusion/models/cosmos3/audio_tokenizer/activations.py
@@ -8,7 +8,6 @@
 from torch.nn import Parameter
 
 
-# https://github.com/jaywalnut310/vits/blob/main/commons.py
 @torch.jit.script
 def fused_add_tanh_sigmoid_multiply(
     input_a: torch.Tensor, input_b: torch.Tensor, n_channels: list[int]
@@ -21,7 +20,6 @@ def fused_add_tanh_sigmoid_multiply(
     return acts  # [B,C,T]
 
 
-# about 10% faster training. no_div_by_zero (1e-9) baked in
 @torch.jit.script
 def fused_snake(x: torch.Tensor, alpha: torch.Tensor, beta: torch.Tensor) -> torch.Tensor:
     return x + (1.0 / (beta + 1e-9)) * pow(sin(x * alpha), 2)
diff --git a/vllm_omni/diffusion/models/cosmos3/audio_tokenizer/bottlenecks.py b/vllm_omni/diffusion/models/cosmos3/audio_tokenizer/bottlenecks.py
index 191f653c470..dc797d051ff 100755
--- a/vllm_omni/diffusion/models/cosmos3/audio_tokenizer/bottlenecks.py
+++ b/vllm_omni/diffusion/models/cosmos3/audio_tokenizer/bottlenecks.py
@@ -68,8 +68,7 @@ def encode(
         Encode input through VAE bottleneck.
 
         Args:
-            x: Input tensor with shape [B, C*2, T] where C*2 contains
-               concatenated mean and scale parameters
+            x: Input tensor with shape [B, C*2, T] where C*2 contains concatenated mean and scale parameters
             return_info: Whether to return additional info dict
 
         Returns:
diff --git a/vllm_omni/diffusion/models/cosmos3/audio_tokenizer/modules.py b/vllm_omni/diffusion/models/cosmos3/audio_tokenizer/modules.py
index 03c08938dcf..55a8597f128 100755
--- a/vllm_omni/diffusion/models/cosmos3/audio_tokenizer/modules.py
+++ b/vllm_omni/diffusion/models/cosmos3/audio_tokenizer/modules.py
@@ -12,7 +12,6 @@
 from typing import Any, Literal
 
 import torch
-import torch.nn.functional as F
 from torch import Tensor, nn
 from torch.cuda import amp
 from torch.nn.utils import weight_norm
@@ -63,36 +62,6 @@ def may_mask(
     return x
 
 
-class LayerNorm(nn.Module):
-    """
-    LayerNorm with optional bias.
-    PyTorch doesn't support bias=False natively.
-    """
-
-    def __init__(self, size: int, gamma0: float = 1, eps: float = 1e-5, use_bias: bool = False) -> None:
-        super().__init__()
-        self.weight = nn.Parameter(torch.ones(size))
-        self.bias = nn.Parameter(torch.zeros(size)) if use_bias else None
-        self.eps = eps
-        self.size = size
-
-    def forward(self, tensor: Tensor) -> Tensor:
-        """
-        Forward pass.
-
-        Args:
-            tensor: Input tensor of shape (B, T, C)
-
-        Returns:
-            Normalized tensor
-        """
-        dtype = tensor.dtype
-        # fp32 to avoid numerical issues
-        with amp.autocast(enabled=True, dtype=torch.float32):
-            tensor = F.layer_norm(tensor, self.weight.shape, self.weight, self.bias, self.eps)
-        return tensor.to(dtype)
-
-
 class ConvNeXtBlock(nn.Module):
     """
     ConvNeXt 1D Block adapted from https://github.com/charactr-platform/vocos
@@ -131,7 +100,7 @@ def __init__(
                 nn.Conv1d(dim, dim, kernel_size=7, groups=dim),
             )
 
-        self.norm = LayerNorm(dim)
+        self.norm = nn.LayerNorm(dim, bias=False)
         self.pwconv1 = nn.Conv1d(dim, intermediate_dim, 1)  # pointwise/1x1 convs
         self.act = activations.SnakeBeta(intermediate_dim) if use_snake else nn.GELU()
 
@@ -153,7 +122,11 @@ def forward(self, x: Tensor, mask: Tensor | None = None) -> Tensor:
         """
         residual = x  # [B,C,T]
         x = self.dwconv(may_mask(x, mask))  # [B,C,T]
-        x = self.norm(x.permute(0, 2, 1)).permute(0, 2, 1)  # [B,C,T] -> [B,T,C] -> [B,C,T]
+        x = x.permute(0, 2, 1)  # [B,C,T] -> [B,T,C]
+        dtype = x.dtype
+        with amp.autocast(enabled=True, dtype=torch.float32):
+            x = self.norm(x)
+        x = x.to(dtype).permute(0, 2, 1)  # [B,T,C] -> [B,C,T]
         x = self.pwconv1(x)  # [B,intermediate_dim,T]
         x = self.act(x)  # [B,intermediate_dim,T]
         x = self.pwconv2(x)  # [B,C,T]
diff --git a/vllm_omni/diffusion/models/cosmos3/pipeline_cosmos3.py b/vllm_omni/diffusion/models/cosmos3/pipeline_cosmos3.py
index 634be5f6ca7..3c84b39ab8a 100644
--- a/vllm_omni/diffusion/models/cosmos3/pipeline_cosmos3.py
+++ b/vllm_omni/diffusion/models/cosmos3/pipeline_cosmos3.py
@@ -1550,7 +1550,7 @@ def forward(
             raise ValueError(
                 "Cosmos3 action generation was requested, but the transformer was "
                 "initialized without action modules. Check that the checkpoint config "
-                "enables action_gen or defines action_dim/max_action_dim and includes action weights."
+                "enables action_gen and includes action weights."
             )
         if sound_enabled and is_t2i:
             raise ValueError(
diff --git a/vllm_omni/diffusion/models/cosmos3/transformer_cosmos3.py b/vllm_omni/diffusion/models/cosmos3/transformer_cosmos3.py
index 22ff22caeaf..118b78cbaed 100644
--- a/vllm_omni/diffusion/models/cosmos3/transformer_cosmos3.py
+++ b/vllm_omni/diffusion/models/cosmos3/transformer_cosmos3.py
@@ -1062,7 +1062,7 @@ def __init__(
         action_dim_value = _od_config_get(od_config, "action_dim", None)
         if action_dim_value is None:
             action_dim_value = _od_config_get(od_config, "max_action_dim", None)
-        self.action_gen = _as_bool(action_gen_value) if action_gen_value is not None else action_dim_value is not None
+        self.action_gen = _as_bool(action_gen_value) if action_gen_value is not None else False
         self.action_dim = int(action_dim_value if action_dim_value is not None else 64)
         self.num_embodiment_domains = int(_od_config_get(od_config, "num_embodiment_domains", 32))
         from .sound_tokenizer import get_sound_latent_fps
@@ -1409,7 +1409,7 @@ def forward(
             raise ValueError(
                 "Cosmos3 action generation was requested, but this transformer "
                 "was initialized without action modules. Check that the "
-                "transformer config enables action_gen or defines action_dim/max_action_dim."
+                "transformer config enables action_gen."
             )
         if has_sound and not self.sound_gen:
             raise ValueError(

From 0d0542f1316e00f7b814ed6609ff297cd265f177 Mon Sep 17 00:00:00 2001
From: Maciej Bala <mbala@nvidia.com>
Date: Thu, 14 May 2026 14:01:23 +0200
Subject: [PATCH 03/41] Updated docs for Cosmos3

Signed-off-by: Maciej Bala <mbala@nvidia.com>
---
 docs/.nav.yml                                 |   4 +-
 .../examples/offline_inference/cosmos3.md     |  91 ++++
 .../offline_inference/image_to_video.md       |  31 +-
 .../offline_inference/text_to_image.md        |  56 +-
 .../offline_inference/text_to_video.md        |  26 -
 .../examples/online_serving/cosmos3.md        |  93 ++++
 .../examples/online_serving/image_to_video.md | 120 ++---
 .../examples/online_serving/text_to_image.md  |  80 +--
 .../examples/online_serving/text_to_video.md  | 155 +-----
 examples/offline_inference/cosmos3/README.md  |  81 +++
 examples/offline_inference/cosmos3/end2end.py | 506 ++++++++++++++++++
 .../image_to_video/README.md                  |  29 +-
 .../offline_inference/text_to_image/README.md |  25 +-
 .../text_to_video/text_to_video.md            |  23 -
 examples/online_serving/cosmos3/README.md     |  63 +++
 .../cosmos3/run_curl_action_policy.sh         |  63 +++
 .../online_serving/cosmos3/run_curl_i2v.sh    |  22 +
 .../online_serving/cosmos3/run_curl_t2i.sh    |  21 +
 .../online_serving/cosmos3/run_curl_t2v.sh    |  20 +
 .../cosmos3/run_curl_t2v_sound.sh             |  22 +
 examples/online_serving/cosmos3/run_server.sh |  48 ++
 .../online_serving/image_to_video/README.md   |  66 +--
 .../online_serving/text_to_image/README.md    |  35 --
 .../online_serving/text_to_video/README.md    |  63 ---
 24 files changed, 1134 insertions(+), 609 deletions(-)
 create mode 100644 docs/user_guide/examples/offline_inference/cosmos3.md
 create mode 100644 docs/user_guide/examples/online_serving/cosmos3.md
 create mode 100644 examples/offline_inference/cosmos3/README.md
 create mode 100644 examples/offline_inference/cosmos3/end2end.py
 create mode 100644 examples/online_serving/cosmos3/README.md
 create mode 100644 examples/online_serving/cosmos3/run_curl_action_policy.sh
 create mode 100644 examples/online_serving/cosmos3/run_curl_i2v.sh
 create mode 100644 examples/online_serving/cosmos3/run_curl_t2i.sh
 create mode 100644 examples/online_serving/cosmos3/run_curl_t2v.sh
 create mode 100644 examples/online_serving/cosmos3/run_curl_t2v_sound.sh
 create mode 100644 examples/online_serving/cosmos3/run_server.sh

diff --git a/docs/.nav.yml b/docs/.nav.yml
index 55283f0e8b1..b1ad961ab0f 100644
--- a/docs/.nav.yml
+++ b/docs/.nav.yml
@@ -17,6 +17,7 @@ nav:
     - Offline Inference:
       - BAGEL-7B-MoT: user_guide/examples/offline_inference/bagel.md
       - GLM-Image Multistage End-to-End Inference: user_guide/examples/offline_inference/glm_image.md
+      - Cosmos3: user_guide/examples/offline_inference/cosmos3.md
       - Helios Video Generation: user_guide/examples/offline_inference/helios.md
       - HunyuanImage-3.0 Image-to-Text Inference: user_guide/examples/offline_inference/hunyuan_image3.md
       - Image-To-Image: user_guide/examples/offline_inference/image_to_image.md
@@ -34,6 +35,7 @@ nav:
     - Online Serving:
       - BAGEL-7B-MoT: user_guide/examples/online_serving/bagel.md
       - vLLM-Omni Helm Chart: user_guide/examples/online_serving/chart-helm.md
+      - Cosmos3: user_guide/examples/online_serving/cosmos3.md
       - Diffusers Backend Adapter: user_guide/examples/online_serving/diffusers_pipeline_adapter.md
       - GLM-Image Online Serving: user_guide/examples/online_serving/glm_image.md
       - Image-To-Image: user_guide/examples/online_serving/image_to_image.md
@@ -41,8 +43,8 @@ nav:
       - Online serving Example of vLLM-Omni for MiMo-Audio: user_guide/examples/online_serving/mimo_audio.md
       - Qwen2.5-Omni: user_guide/examples/online_serving/qwen2_5_omni.md
       - Qwen3-Omni: user_guide/examples/online_serving/qwen3_omni.md
-      - Text-To-Speech: user_guide/examples/online_serving/text_to_speech.md
       - Text-To-Image: user_guide/examples/online_serving/text_to_image.md
+      - Text-To-Speech: user_guide/examples/online_serving/text_to_speech.md
       - Text-To-Video: user_guide/examples/online_serving/text_to_video.md
   - General:
     - usage/*
diff --git a/docs/user_guide/examples/offline_inference/cosmos3.md b/docs/user_guide/examples/offline_inference/cosmos3.md
new file mode 100644
index 00000000000..a750080cc33
--- /dev/null
+++ b/docs/user_guide/examples/offline_inference/cosmos3.md
@@ -0,0 +1,91 @@
+# Cosmos3
+
+Source <https://github.com/vllm-project/vllm-omni/tree/main/examples/offline_inference/cosmos3>.
+
+
+Cosmos3 uses `Cosmos3OmniDiffusersPipeline` for text-to-image, text-to-video, image-to-video, video-with-sound, and action generation. Set `COSMOS3_MODEL` to a local Diffusers-format checkpoint before running these examples.
+
+```bash
+export COSMOS3_MODEL=/path/to/cosmos3-diffusers
+cd examples/offline_inference/cosmos3
+```
+
+## Text-to-Image
+
+```bash
+python end2end.py \
+  --task t2i \
+  --prompt "A small warehouse robot carrying a blue box, clean product photography" \
+  --negative-prompt "blurry, distorted, low quality" \
+  --output cosmos3_t2i.png
+```
+
+## Text-to-Video
+
+```bash
+python end2end.py \
+  --task t2v \
+  --prompt "A small warehouse robot moves a blue box across a clean floor." \
+  --negative-prompt "blurry, distorted, low quality" \
+  --output cosmos3_t2v.mp4
+```
+
+## Image-to-Video
+
+Download an example image or provide your own image path.
+
+```bash
+wget https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/cherry_blossom.jpg
+
+python end2end.py \
+  --task i2v \
+  --image cherry_blossom.jpg \
+  --prompt "Cherry blossoms swaying gently in the breeze, petals falling, smooth motion" \
+  --negative-prompt "blurry, distorted, low quality" \
+  --output cosmos3_i2v.mp4
+```
+
+## Video With Sound
+
+This path requires a sound-capable Cosmos3 checkpoint with `sound_gen` weights.
+
+```bash
+python end2end.py \
+  --task t2v_sound \
+  --prompt "A small warehouse robot rolls across the floor with soft motor sounds." \
+  --negative-prompt "blurry, distorted, low quality" \
+  --sound-duration 3.4 \
+  --output cosmos3_t2v_sound.mp4
+```
+
+## Action Policy
+
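+This path requires an action-capable Cosmos3 checkpoint with `action_gen` weights. The example writes a video (`--output`) plus an action JSON payload (`--action-output`). Pass either `--domain-name` or `--domain-id`. The command below reuses the `cherry_blossom.jpg` image downloaded in the Image-to-Video section.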
+
+```bash
+python end2end.py \
+  --task action_policy \
+  --image cherry_blossom.jpg \
+  --prompt "Predict the robot action for moving toward the target." \
+  --domain-name bridge_orig_lerobot \
+  --raw-action-dim 2 \
+  --action-chunk-size 16 \
+  --output cosmos3_action_policy.mp4 \
+  --action-output cosmos3_action_policy_action.json
+```
+
+## Common Options
+
+- `--enable-layerwise-offload`: use layerwise offload for memory-constrained runs.
+- `--cache-backend cache_dit`: enable Cache-DiT where supported.
+- `--cfg-parallel-size 2`, `--ulysses-degree`, `--tensor-parallel-size`, `--use-hsdp`: enable parallel execution options.
+- `--height`, `--width`, `--num-frames`, `--num-inference-steps`, `--guidance-scale`, `--fps`: override task defaults.
+
+Cosmos3 does not support model-level CPU offload, so do not pass `--enable-cpu-offload`. Use `--enable-layerwise-offload` instead.
+
+## Example materials
+
+??? abstract "end2end.py"
+    ``````py
+    --8<-- "examples/offline_inference/cosmos3/end2end.py"
+    ``````
diff --git a/docs/user_guide/examples/offline_inference/image_to_video.md b/docs/user_guide/examples/offline_inference/image_to_video.md
index 5011ccf1978..cac6c1f4c95 100644
--- a/docs/user_guide/examples/offline_inference/image_to_video.md
+++ b/docs/user_guide/examples/offline_inference/image_to_video.md
@@ -3,7 +3,7 @@
 Source <https://github.com/vllm-project/vllm-omni/tree/main/examples/offline_inference/image_to_video>.
 
 
-This example demonstrates how to generate videos from images using Wan2.2 Image-to-Video models and Cosmos3 with vLLM-Omni's offline inference API.
+This example demonstrates how to generate videos from images using Wan2.2 Image-to-Video models with vLLM-Omni's offline inference API.
 
 ## Supported Models
 
@@ -11,7 +11,6 @@ This example demonstrates how to generate videos from images using Wan2.2 Image-
 |-------|--------------------|----------------|---------------|----------|
 | `Wan-AI/Wan2.2-I2V-A14B-Diffusers` | auto, 480p area | 81 | 50 | 5.0 |
 | `Wan-AI/Wan2.2-TI2V-5B-Diffusers` | auto, 480p area | 81 | 50 | 5.0 |
-| `$COSMOS3_MODEL` with `Cosmos3OmniDiffusersPipeline` | auto, 720p area | 81 | 35 | 4.0 |
 
 ## Local CLI Usage
 
@@ -59,34 +58,10 @@ python image_to_video.py \
   --output i2v_output.mp4
 ```
 
-### Cosmos3
-
-Cosmos3 uses one pipeline for text-to-image, text-to-video, and image-to-video. Set `COSMOS3_MODEL` to a local Diffusers-format Cosmos3 checkpoint or model reference, and select the pipeline explicitly.
-
-```bash
-export COSMOS3_MODEL=/path/to/cosmos3-diffusers
-
-python image_to_video.py \
-  --model "$COSMOS3_MODEL" \
-  --model-class-name Cosmos3OmniDiffusersPipeline \
-  --image cherry_blossom.jpg \
-  --prompt "Cherry blossoms swaying gently in the breeze, petals falling, smooth motion" \
-  --negative-prompt "blurry, distorted, low quality" \
-  --height 720 \
-  --width 1280 \
-  --num-frames 81 \
-  --guidance-scale 4.0 \
-  --num-inference-steps 35 \
-  --fps 24 \
-  --output cosmos3_i2v_output.mp4
-```
-
-For Cosmos3 I2V, the input image is resized and center-cropped by the pipeline. If `--height` and `--width` are omitted, this example chooses a 720p-area resolution from the input aspect ratio. Cosmos3 currently supports one prompt and one video per request, and model-level CPU offload is not supported; use `--enable-layerwise-offload` instead.
-
 Key arguments:
 
 - `--model`: Model ID (I2V-A14B for MoE, TI2V-5B for unified T2V+I2V).
-- `--model-class-name`: explicit pipeline class. Use `Cosmos3OmniDiffusersPipeline` for Cosmos3 checkpoints.
+- `--model-class-name`: explicit pipeline class override.
 - `--image`: Path to input image (required).
 - `--prompt`: Text description of desired motion/animation.
 - `--height/--width`: Output resolution (auto-calculated from image if not set). Dimensions should be multiples of 16.
@@ -114,7 +89,7 @@ Key arguments:
 > ℹ️ If you encounter OOM errors, try using `--vae-use-slicing` and `--vae-use-tiling` to reduce memory usage.
 
 For Wan2.2 LightX2V-converted local Diffusers directories and related LoRA
-assets, see the [LoRA guide](../../diffusion/lora.md#wan22-lightx2v-offline-assembly).
+assets, see the [LoRA guide](https://github.com/vllm-project/vllm-omni/tree/main/docs/user_guide/diffusion/lora.md#wan22-lightx2v-offline-assembly).
 
 ## Example materials
 
diff --git a/docs/user_guide/examples/offline_inference/text_to_image.md b/docs/user_guide/examples/offline_inference/text_to_image.md
index e9bf48d7aa1..d2c87da6458 100644
--- a/docs/user_guide/examples/offline_inference/text_to_image.md
+++ b/docs/user_guide/examples/offline_inference/text_to_image.md
@@ -32,11 +32,12 @@ This folder provides several entrypoints for experimenting with text-to-image di
 | `AIDC-AI/Ovis-Image-7B` | 1024 x 1024 | 71.8 | 17.1 |
 | `OmniGen2/OmniGen2` |  1024 x 1024 | 20.1 | 14.7 |
 | `stabilityai/stable-diffusion-3.5-medium` | 1024 x 1024 | 20.1 | 15.6 |
-| `black-forest-labs/FLUX.1-dev` | 1024 x 1024 | 77.6 | 31.4 |
+| `black-forest-labs/FLUX.1-dev` | 1024 x 1024 | 33.9 | 31.4 |
+| `black-forest-labs/FLUX.1-schnell` | 1024 x 1024 | 33.9 | 31.4 |
 | `black-forest-labs/FLUX.2-klein-4B` | 1024 x 1024 | 72.7 | 14.9 |
 | `black-forest-labs/FLUX.2-klein-9B` | 1024 x 1024 | 37.1 | 32.3 |
 | `black-forest-labs/FLUX.2-dev` | 1024 x 1024 | 65.7 | >80 (CPU offload required) |
-| `$COSMOS3_MODEL` with `Cosmos3OmniDiffusersPipeline` | 1024 x 1024 | model/checkpoint dependent | local checkpoint |
+| `HunyuanImage-3.0` | 1024 x 1024 | 80.0 (TP≥3)  | 160 |
 
 !!! info
 *Peak VRAM:  based on basic single-card usage, batch size =1, without any acceleration/optimization features. FLUX.2-dev requires `--enable-cpu-offload` on a single 80 GiB GPU.
@@ -76,11 +77,12 @@ python text_to_image.py \
 | Argument | Type | Default | Description |
 | -------- | ---- | ------- | ----------- |
 | `--model` | str | `"Qwen/Qwen-Image"` | Diffusion model name or local path |
+| `--model-class-name` | str | `None` | Override pipeline class |
 | `--prompt` | str | `"a cup of coffee on the table"` | Text description for image generation |
 | `--seed` | int | `142` | Integer seed for deterministic sampling |
 | `--negative-prompt` | str | `None` | Negative prompt for classifier-free conditional guidance |
 | `--cfg-scale` | float | `4.0` | True CFG scale (model-specific guidance strength) |
-| `--guidance-scale` | float | `1.0` | Classifier-free guidance scale |
+| `--guidance-scale` | float | `4.0` | Classifier-free guidance scale |
 | `--num-images-per-prompt` | int | `1` | Number of images per prompt (saved as `output`, `output_1`, ...) |
 | `--num-inference-steps` | int | `50` | Diffusion sampling steps (more steps = higher quality, slower) |
 | `--height` | int | `1024` | Output image height in pixels |
@@ -95,6 +97,8 @@ python text_to_image.py \
 | `--enable-cpu-offload` | flag | off | Enable CPU offloading for diffusion models |
 | `--lora-path` | str | — | Path to PEFT LoRA adapter folder |
 | `--lora-scale` | float | `1.0` | Scale factor for LoRA weights |
+| `--use-system-prompt` | str | `None` | System prompt preset: `en_unified`, `en_vanilla`, `en_recaption`, `en_think_recaption`, `dynamic`, `None`, or `custom` (text supplied via `--system-prompt`). Recommended: `en_unified`. Only for HunyuanImage-3.0. |
+| `--system-prompt` | str | `None` | Custom system prompt text. Only used when `--use-system-prompt` is set to `custom`. Only for HunyuanImage-3.0.|
 
 **NextStep-1.1 specific arguments:**
 
@@ -128,6 +132,19 @@ python text_to_image.py \
 
 `Tongyi-MAI/Z-Image-Turbo` is a distilled version of Z-Image. Distilled diffusion models usually require less number of inference steps (4~9), and Classifier-Free Guidance (CFG) is usually NOT applied. Similar distilled models are `black-forest-labs/FLUX.2-klein-4B` and `black-forest-labs/FLUX.2-klein-9B`.
 
+Advanced UAA example (requires 2 GPUs):
+
+```bash
+python text_to_image.py \
+  --model Tongyi-MAI/Z-Image-Turbo \
+  --prompt "a cup of coffee on the table" \
+  --ulysses-degree 2 \
+  --ulysses-mode advanced_uaa \
+  --height 1024 \
+  --width 1024 \
+  --output outputs/coffee_hybrid.png
+```
+
 ### NextStep Models
 
 NextStep-1.1 supports extra arguments for dual-level CFG control:
@@ -165,28 +182,6 @@ python examples/offline_inference/text_to_image/text_to_image.py \
   --output flux2-dev.png
 ```
 
-### Cosmos3
-
-Cosmos3 uses one pipeline for text-to-image, text-to-video, and image-to-video. Set `COSMOS3_MODEL` to a local Diffusers-format Cosmos3 checkpoint or model reference, and select the pipeline explicitly.
-
-```bash
-export COSMOS3_MODEL=/path/to/cosmos3-diffusers
-
-python text_to_image.py \
-  --model "$COSMOS3_MODEL" \
-  --model-class-name Cosmos3OmniDiffusersPipeline \
-  --prompt "A small warehouse robot carrying a blue box, clean product photography" \
-  --negative-prompt "blurry, distorted, low quality" \
-  --guidance-scale 7.0 \
-  --num-inference-steps 50 \
-  --height 1024 \
-  --width 1024 \
-  --num-images-per-prompt 1 \
-  --output cosmos3_t2i.png
-```
-
-This script marks text-to-image requests with `modalities=["image"]`, which selects Cosmos3 T2I. Cosmos3 currently supports one prompt per request; use `--num-images-per-prompt` to request multiple images for that prompt. Model-level CPU offload is not supported for Cosmos3, so use `--enable-layerwise-offload` for offload instead.
-
 ### Batch Requests (Multiple Prompts)
 
 You can pass multiple prompts in a single `generate` call.
@@ -258,7 +253,7 @@ python examples/offline_inference/text_to_image/text_to_image.py \
 #### CFG Parallel
 
 Set `--cfg-parallel-size 2` to enable CFG Parallel for faster inference on multi-GPU setups.
-See more examples in the [diffusion acceleration user guide](https://github.com/vllm-project/vllm-omni/tree/main/docs/user_guide/diffusion_acceleration.md#using-cfg-parallel).
+See more examples in the [CFG Parallel user guide](https://github.com/vllm-project/vllm-omni/tree/main/docs/user_guide/diffusion/parallelism/cfg_parallel.md#using-cfg-parallel).
 
 #### LoRA
 
@@ -283,15 +278,6 @@ lora_adapter/
 
 ## Web UI Demo
 
-!!! note "Gradio is an optional dependency"
-    The Gradio demo requires the `[demo]` extras. Install them first:
-
-    ```bash
-    pip install 'vllm-omni[demo]'
-    ```
-
-    Or, if installing from source: `pip install -e '.[demo]'`
-
 Launch the Gradio demo:
 
 ```bash
diff --git a/docs/user_guide/examples/offline_inference/text_to_video.md b/docs/user_guide/examples/offline_inference/text_to_video.md
index 861af8ca1d4..bb7a1d43ece 100644
--- a/docs/user_guide/examples/offline_inference/text_to_video.md
+++ b/docs/user_guide/examples/offline_inference/text_to_video.md
@@ -5,8 +5,6 @@ Source <https://github.com/vllm-project/vllm-omni/tree/main/examples/offline_inf
 
 A unified script for text-to-video generation. Supports multiple models with model-aware defaults.
 
-For backend selection and SageAttention usage, see the [Diffusion Attention Backends](../../diffusion/attention_backends.md) guide.
-
 ## Supported Models
 
 | Model | Default Resolution | Default Frames | Default Steps | Guidance | VRAM (BF16) |
@@ -14,7 +12,6 @@ For backend selection and SageAttention usage, see the [Diffusion Attention Back
 | `Wan-AI/Wan2.2-T2V-A14B-Diffusers` | 720x1280 | 81 | 40 | 4.0 | ~60 GiB |
 | `hunyuanvideo-community/HunyuanVideo-1.5-Diffusers-480p_t2v` | 480x832 | 121 | 50 | 6.0 | 1×A100 80GB |
 | `hunyuanvideo-community/HunyuanVideo-1.5-Diffusers-720p_t2v` | 720x1280 | 121 | 50 | 6.0 | FP8 + VAE tiling required |
-| `$COSMOS3_MODEL` with `Cosmos3OmniDiffusersPipeline` | 720x1280 | 81 | 35 | 4.0 | model/checkpoint dependent |
 
 ## Local CLI Usage
 
@@ -51,29 +48,6 @@ python text_to_video.py \
   --output ltx2_out.mp4
 ```
 
-### Cosmos3
-
-Cosmos3 uses one pipeline for text-to-image, text-to-video, and image-to-video. Set `COSMOS3_MODEL` to a local Diffusers-format Cosmos3 checkpoint or model reference, and select the pipeline explicitly.
-
-```bash
-export COSMOS3_MODEL=/path/to/cosmos3-diffusers
-
-python text_to_video.py \
-  --model "$COSMOS3_MODEL" \
-  --model-class-name Cosmos3OmniDiffusersPipeline \
-  --prompt "A small warehouse robot moves a blue box across a clean floor." \
-  --negative-prompt "blurry, distorted, low quality" \
-  --height 720 \
-  --width 1280 \
-  --num-frames 81 \
-  --guidance-scale 4.0 \
-  --num-inference-steps 35 \
-  --fps 24 \
-  --output cosmos3_t2v_output.mp4
-```
-
-Cosmos3 video generation currently supports one prompt and one video per request. The implementation supports `--cache-backend cache_dit`, `--cfg-parallel-size 2`, `--ulysses-degree`, `--tensor-parallel-size`, `--use-hsdp`, and `--enable-layerwise-offload`. Do not use `--enable-cpu-offload`; Cosmos3 does not support model-level CPU offload.
-
 ### HunyuanVideo-1.5 (480p)
 
 ```bash
diff --git a/docs/user_guide/examples/online_serving/cosmos3.md b/docs/user_guide/examples/online_serving/cosmos3.md
new file mode 100644
index 00000000000..f9bbd365ec4
--- /dev/null
+++ b/docs/user_guide/examples/online_serving/cosmos3.md
@@ -0,0 +1,93 @@
+# Cosmos3
+
+Source <https://github.com/vllm-project/vllm-omni/tree/main/examples/online_serving/cosmos3>.
+
+
+This example shows Cosmos3 online serving with `Cosmos3OmniDiffusersPipeline`.
+
+Set `COSMOS3_MODEL` to a local Diffusers-format checkpoint before starting the server:
+
+```bash
+export COSMOS3_MODEL=/path/to/cosmos3-diffusers
+cd examples/online_serving/cosmos3
+bash run_server.sh
+```
+
+`run_server.sh` accepts these environment overrides:
+
+- `MODEL`: checkpoint path, defaults to `COSMOS3_MODEL`
+- `PORT`: server port, defaults to `8091`
+- `CACHE_BACKEND`: set to `cache_dit` to enable Cache-DiT
+- `ENABLE_LAYERWISE_OFFLOAD`: set to `1` to enable layerwise offload
+- `CFG_PARALLEL_SIZE`, `TENSOR_PARALLEL_SIZE`, `ULYSSES_DEGREE`, `USE_HSDP`: parallel execution controls
+- `ALLOWED_LOCAL_MEDIA_PATH`: directory from which the server may read local media files, defaults to `/` (the entire filesystem; set a narrower path on shared hosts)
+
+## Text-to-Image
+
+```bash
+bash run_curl_t2i.sh
+```
+
+The script calls `POST /v1/images/generations`, which selects Cosmos3 text-to-image through `modalities=["image"]` internally.
+
+## Text-to-Video
+
+```bash
+bash run_curl_t2v.sh
+```
+
+## Image-to-Video
+
+Download an example image or set `IMAGE_PATH` to your own image:
+
+```bash
+wget https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/cherry_blossom.jpg
+IMAGE_PATH=cherry_blossom.jpg bash run_curl_i2v.sh
+```
+
+## Video With Sound
+
+This path requires a sound-capable Cosmos3 checkpoint with `sound_gen` weights.
+
+```bash
+bash run_curl_t2v_sound.sh
+```
+
+The script passes `generate_sound=true` and `sound_duration` to the video endpoint.
+
+## Action Policy
+
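+This path requires an action-capable Cosmos3 checkpoint with `action_gen` weights. Pass either `domain_name` or `domain_id` through `extra_params`. The command below reuses the `cherry_blossom.jpg` image downloaded in the Image-to-Video section; set `IMAGE_PATH` to your own image if you skipped that step.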
+
+```bash
+IMAGE_PATH=cherry_blossom.jpg bash run_curl_action_policy.sh
+```
+
+The script uses the asynchronous `POST /v1/videos` job endpoint so it can download the MP4 and save the returned action metadata JSON.
+
+## Example materials
+
+??? abstract "run_curl_action_policy.sh"
+    ``````sh
+    --8<-- "examples/online_serving/cosmos3/run_curl_action_policy.sh"
+    ``````
+??? abstract "run_curl_i2v.sh"
+    ``````sh
+    --8<-- "examples/online_serving/cosmos3/run_curl_i2v.sh"
+    ``````
+??? abstract "run_curl_t2i.sh"
+    ``````sh
+    --8<-- "examples/online_serving/cosmos3/run_curl_t2i.sh"
+    ``````
+??? abstract "run_curl_t2v.sh"
+    ``````sh
+    --8<-- "examples/online_serving/cosmos3/run_curl_t2v.sh"
+    ``````
+??? abstract "run_curl_t2v_sound.sh"
+    ``````sh
+    --8<-- "examples/online_serving/cosmos3/run_curl_t2v_sound.sh"
+    ``````
+??? abstract "run_server.sh"
+    ``````sh
+    --8<-- "examples/online_serving/cosmos3/run_server.sh"
+    ``````
diff --git a/docs/user_guide/examples/online_serving/image_to_video.md b/docs/user_guide/examples/online_serving/image_to_video.md
index 1ef5c9be318..b30bbcdf80b 100644
--- a/docs/user_guide/examples/online_serving/image_to_video.md
+++ b/docs/user_guide/examples/online_serving/image_to_video.md
@@ -3,7 +3,7 @@
 Source <https://github.com/vllm-project/vllm-omni/tree/main/examples/online_serving/image_to_video>.
 
 
-This example demonstrates how to deploy image-to-video models, including Wan2.2 and Cosmos3, for online video generation using vLLM-Omni.
+This example demonstrates how to deploy Wan2.2 image-to-video models for online video generation using vLLM-Omni.
 
 ## Supported Models
 
@@ -11,7 +11,6 @@ This example demonstrates how to deploy image-to-video models, including Wan2.2
 |-------|----------|
 | Wan2.2 I2V | `Wan-AI/Wan2.2-I2V-A14B-Diffusers` |
 | Wan2.2 TI2V | `Wan-AI/Wan2.2-TI2V-5B-Diffusers` |
-| Cosmos3 I2V | `$COSMOS3_MODEL` with `Cosmos3OmniDiffusersPipeline` |
 
 ## Start Server
 
@@ -37,22 +36,23 @@ The script allows overriding:
 - `CACHE_BACKEND` (default: `none`)
 - `ENABLE_CACHE_DIT_SUMMARY` (default: `0`)
 
-### Cosmos3
+### Ascend / Local LightX2V Example
 
-Cosmos3 uses one pipeline for text-to-image, text-to-video, and image-to-video. Set `COSMOS3_MODEL` to a local Diffusers-format Cosmos3 checkpoint or model reference, and select the pipeline explicitly.
+For a local Wan2.2-LightX2V Diffusers directory on Ascend/NPU, you can start the server like this:
 
 ```bash
-export COSMOS3_MODEL=/path/to/cosmos3-diffusers
-
-vllm serve "$COSMOS3_MODEL" \
+vllm serve /path/to/Wan2.2-I2V-A14B-LightX2V-Diffusers-Lightning \
   --omni \
   --port 8091 \
-  --model-class-name Cosmos3OmniDiffusersPipeline \
-  --allowed-local-media-path /
+  --flow-shift 12 \
+  --cfg-parallel-size 1 \
+  --ulysses-degree 4 \
+  --use-hsdp \
+  --trust-remote-code \
+  --allowed-local-media-path / \
+  --seed 42
 ```
 
-Use `--enable-layerwise-offload`, `--cache-backend cache_dit`, `--cfg-parallel-size 2`, `--usp`, `--tensor-parallel-size`, or `--use-hsdp` as needed. Do not use `--enable-cpu-offload`; Cosmos3 does not support model-level CPU offload.
-
 ## Async Job Behavior
 
 `POST /v1/videos` is asynchronous. It creates a video job and immediately
@@ -83,7 +83,6 @@ file. Metadata is returned via response headers:
 - `X-Model`: model name used for generation
 - `X-Inference-Time-S`: wall-clock inference time in seconds
 
-### Wan2.2 Sync Request
 ```bash
 curl -X POST http://localhost:8091/v1/videos/sync \
   -F "prompt=A bear playing with yarn, smooth motion" \
@@ -97,59 +96,34 @@ curl -X POST http://localhost:8091/v1/videos/sync \
   -F "guidance_scale_2=1.0" \
   -F "boundary_ratio=0.875" \
   -F "flow_shift=12.0" \
-  -F "enable_frame_interpolation=true" \
-  -F "frame_interpolation_exp=1" \
-  -F "frame_interpolation_scale=1.0" \
+  -F 'extra_params={"sample_solver":"euler"}' \
   -F "seed=42" \
   -o sync_i2v_output.mp4
 ```
 
-### Cosmos3 Sync Request
+For Wan Lightning/Distill checkpoints, pass `{"sample_solver":"euler"}` via `extra_params`. The default solver is `unipc`.
+
+Example matching the local LightX2V deployment above:
 
 ```bash
-curl -X POST http://localhost:8091/v1/videos/sync \
-  -F "prompt=Cherry blossoms swaying gently in the breeze, petals falling, smooth motion" \
-  -F "negative_prompt=blurry, distorted, low quality" \
-  -F "input_reference=@/path/to/cherry_blossom.jpg" \
-  -F "size=1280x720" \
+curl -sS -X POST http://localhost:8091/v1/videos/sync \
+  -H "Accept: video/mp4" \
+  -F "prompt=A cat playing with yarn" \
+  -F "input_reference=@/path/to/input.jpg" \
+  -F "width=832" \
+  -F "height=480" \
   -F "num_frames=81" \
-  -F "fps=24" \
-  -F "num_inference_steps=35" \
-  -F "guidance_scale=4.0" \
+  -F "fps=16" \
+  -F "num_inference_steps=4" \
+  -F "guidance_scale=1.0" \
+  -F "guidance_scale_2=1.0" \
+  -F "boundary_ratio=0.875" \
   -F "seed=42" \
-  -o cosmos3_i2v_output.mp4
+  -F 'extra_params={"sample_solver":"euler"}' \
+  -o ./output.mp4
 ```
 
-For async generation, send the same form fields to `POST /v1/videos`, poll `GET /v1/videos/{video_id}`, and download from `GET /v1/videos/{video_id}/content`. Cosmos3 currently supports one prompt and one video per request.
-
-```bash
-create_response=$(curl -s http://localhost:8091/v1/videos \
-  -H "Accept: application/json" \
-  -F "prompt=Cherry blossoms swaying gently in the breeze, petals falling, smooth motion" \
-  -F "negative_prompt=blurry, distorted, low quality" \
-  -F "input_reference=@/path/to/cherry_blossom.jpg" \
-  -F "size=1280x720" \
-  -F "num_frames=81" \
-  -F "fps=24" \
-  -F "num_inference_steps=35" \
-  -F "guidance_scale=4.0" \
-  -F "seed=42")
-
-video_id=$(echo "$create_response" | jq -r '.id')
-while true; do
-  status=$(curl -s "http://localhost:8091/v1/videos/${video_id}" | jq -r '.status')
-  if [ "$status" = "completed" ]; then
-    break
-  fi
-  if [ "$status" = "failed" ]; then
-    echo "Video generation failed"
-    exit 1
-  fi
-  sleep 2
-done
-
-curl -L "http://localhost:8091/v1/videos/${video_id}/content" -o cosmos3_i2v_output.mp4
-```
+Use `/v1/videos/sync` if you want to write the MP4 directly to a file. `POST /v1/videos` is async and returns job metadata, not inline `b64_json`.
 
 ## Storage
 
@@ -174,8 +148,10 @@ export VLLM_OMNI_STORAGE_MAX_CONCURRENCY=8
 # Basic image-to-video generation
 bash run_curl_image_to_video.sh
 
+# Wan Lightning/Distill checkpoints
+SAMPLE_SOLVER=euler bash run_curl_image_to_video.sh
+
 # Or execute directly (OpenAI-style multipart)
-# Note: frame interpolation specific arguments are relevant only for Wan2.2 models
 create_response=$(curl -s http://localhost:8091/v1/videos \
   -H "Accept: application/json" \
   -F "prompt=A bear playing with yarn, smooth motion" \
@@ -190,9 +166,7 @@ create_response=$(curl -s http://localhost:8091/v1/videos \
   -F "guidance_scale_2=1.0" \
   -F "boundary_ratio=0.875" \
   -F "flow_shift=12.0" \
-  -F "enable_frame_interpolation=true" \
-  -F "frame_interpolation_exp=1" \
-  -F "frame_interpolation_scale=1.0" \
+  -F 'extra_params={"sample_solver":"euler"}' \
   -F "seed=42")
 
 video_id=$(echo "$create_response" | jq -r '.id')
@@ -238,7 +212,6 @@ curl -X POST http://localhost:8091/v1/videos \
 ### Generation with Parameters
 
 ```bash
-# Note: frame interpolation specific arguments are relevant only for Wan2.2 models
 curl -X POST http://localhost:8091/v1/videos \
   -F "prompt=A bear playing with yarn, smooth motion" \
   -F "negative_prompt=low quality, blurry, static" \
@@ -252,34 +225,11 @@ curl -X POST http://localhost:8091/v1/videos \
   -F "guidance_scale_2=1.0" \
   -F "boundary_ratio=0.875" \
   -F "flow_shift=12.0" \
-  -F "enable_frame_interpolation=true" \
-  -F "frame_interpolation_exp=1" \
-  -F "frame_interpolation_scale=1.0" \
+  -F 'extra_params={"sample_solver":"euler"}' \
   -F "seed=42"
 ```
 
-Frame interpolation is also available for supported Wan2.2 I2V requests. See
-[Frame Interpolation](../../diffusion/frame_interpolation.md) for worker-side
-execution details and feature constraints.
-
-### Frame Interpolation Example
-
-```bash
-curl -X POST http://localhost:8091/v1/videos/sync \
-  -F "prompt=A bear playing with yarn, smooth motion" \
-  -F "input_reference=@/path/to/qwen-bear.png" \
-  -F "width=832" \
-  -F "height=480" \
-  -F "num_frames=33" \
-  -F "fps=16" \
-  -F "num_inference_steps=40" \
-  -F "guidance_scale=1.0" \
-  -F "guidance_scale_2=1.0" \
-  -F "enable_frame_interpolation=true" \
-  -F "frame_interpolation_exp=1" \
-  -F "frame_interpolation_scale=1.0" \
-  -o sync_i2v_interpolated.mp4
-```
+`sample_solver` is supported by Wan2.2 online serving through the existing `extra_params` field, which is merged into the pipeline `extra_args`. Use `unipc` for the default multistep solver, or `euler` for Lightning/Distill checkpoints.
 
 ## Create Response Format
 
diff --git a/docs/user_guide/examples/online_serving/text_to_image.md b/docs/user_guide/examples/online_serving/text_to_image.md
index 894a1b4be6b..47b916de080 100644
--- a/docs/user_guide/examples/online_serving/text_to_image.md
+++ b/docs/user_guide/examples/online_serving/text_to_image.md
@@ -23,21 +23,6 @@ Or use the startup script:
 bash run_server.sh
 ```
 
-### Cosmos3
-
-Cosmos3 uses one pipeline for text-to-image, text-to-video, and image-to-video. Set `COSMOS3_MODEL` to a local Diffusers-format Cosmos3 checkpoint or model reference, and select the pipeline explicitly.
-
-```bash
-export COSMOS3_MODEL=/path/to/cosmos3-diffusers
-
-vllm serve "$COSMOS3_MODEL" \
-  --omni \
-  --port 8091 \
-  --model-class-name Cosmos3OmniDiffusersPipeline
-```
-
-Use `--enable-layerwise-offload`, `--cache-backend cache_dit`, `--cfg-parallel-size 2`, `--usp`, `--tensor-parallel-size`, or `--use-hsdp` as needed. Do not use `--enable-cpu-offload`; Cosmos3 does not support model-level CPU offload.
-
 ### Start with Parallelism Acceleration
 
 Enable Tensor Parallelism and VAE Patch Parallelism for faster inference:
@@ -86,26 +71,6 @@ curl -s http://localhost:8091/v1/chat/completions \
   }' | jq -r '.choices[0].message.content[0].image_url.url' | cut -d',' -f2- | base64 -d > output.png
 ```
 
-#### Cosmos3 Images API
-
-The dedicated image endpoint sets `modalities=["image"]` internally, which selects Cosmos3 text-to-image.
-
-```bash
-curl -X POST http://localhost:8091/v1/images/generations \
-  -H "Content-Type: application/json" \
-  -d '{
-    "prompt": "A small warehouse robot carrying a blue box, clean product photography",
-    "size": "1024x1024",
-    "n": 1,
-    "num_inference_steps": 50,
-    "guidance_scale": 7.0,
-    "negative_prompt": "blurry, distorted, low quality",
-    "seed": 42
-  }' | jq -r '.data[0].b64_json' | base64 -d > cosmos3_t2i.png
-```
-
-Cosmos3 currently supports one prompt per request. Use `n` to request multiple images for that prompt.
-
 ### Method 2: Using OpenAI Python SDK
 
 ```python
@@ -132,6 +97,12 @@ with open("output.png", "wb") as f:
     f.write(base64.b64decode(b64_data))
 ```
 
+!!! note
+    The OpenAI SDK's `extra_body` keyword argument merges parameters into the
+    top-level request body automatically. When using curl or Python `requests`,
+    wrap generation parameters inside a literal `"extra_body"` key in the JSON
+    instead (as shown in the curl example above).
+
 ### Method 3: Using Python Client Script
 
 ```bash
@@ -140,15 +111,6 @@ python openai_chat_client.py --prompt "A beautiful landscape painting" --output
 
 ### Method 4: Using Gradio Demo
 
-!!! note "Gradio is an optional dependency"
-    The Gradio demo requires the `[demo]` extras. Install them first:
-
-    ```bash
-    pip install 'vllm-omni[demo]'
-    ```
-
-    Or, if installing from source: `pip install -e '.[demo]'`
-
 ```bash
 python gradio_demo.py
 # Visit http://localhost:7860
@@ -221,7 +183,7 @@ lora_adapter/
 
 ### Generation with Parameters
 
-Wrap generation parameters inside `extra_body` in the request JSON:
+Use `extra_body` to pass generation parameters:
 
 ```json
 {
@@ -238,21 +200,6 @@ Wrap generation parameters inside `extra_body` in the request JSON:
 }
 ```
 
-!!! tip "Using the OpenAI SDK"
-    When using the OpenAI Python SDK, pass these parameters via the `extra_body`
-    keyword argument. The SDK merges them into the top-level request body automatically:
-
-    ```python
-    client.chat.completions.create(
-        model="Qwen/Qwen-Image",
-        messages=[...],
-        extra_body={"height": 1024, "width": 1024, "num_inference_steps": 50},
-    )
-    ```
-
-    For details on how generation parameters are handled across different clients, see the
-    [Diffusion Chat API guide](../../../../serving/diffusion_chat_api.md).
-
 ### Multimodal Input (Text + Structured Content)
 
 ```json
@@ -271,12 +218,11 @@ Wrap generation parameters inside `extra_body` in the request JSON:
 ## Generation Parameters
 
 When using `/v1/chat/completions`, pass these inside `extra_body` in the curl
-JSON, or via the `extra_body` keyword argument in the OpenAI Python SDK (see the
-[Diffusion Chat API guide](../../../../serving/diffusion_chat_api.md)).
-When using the dedicated [`/v1/images/generations`](../../../../serving/image_generation_api.md)
-endpoint, pass the supported generation controls as top-level JSON fields
-directly. For image dimensions and count, use `size` and `n` rather than
-`height`, `width`, or `num_outputs_per_prompt`.
+JSON, or via the `extra_body` keyword argument in the OpenAI Python SDK.
+When using the dedicated `/v1/images/generations` endpoint, pass the supported
+generation controls as top-level JSON fields directly. For image dimensions and
+count, use `size` and `n` rather than `height`, `width`, or
+`num_outputs_per_prompt`.
 
 | Parameter                | Type  | Default | Description                    |
 | ------------------------ | ----- | ------- | ------------------------------ |
@@ -289,6 +235,8 @@ directly. For image dimensions and count, use `size` and `n` rather than
 | `seed`                   | int   | None    | Random seed (reproducible)     |
 | `negative_prompt`        | str   | None    | Negative prompt                |
 | `num_outputs_per_prompt` | int   | 1       | Number of images to generate   |
+| `use_system_prompt` | str | None | System prompt preset: `en_unified`, `en_vanilla`, `en_recaption`, `en_think_recaption`, `dynamic`, `None`, or `custom` (text supplied via `system_prompt`). Only for HunyuanImage-3.0. |
+| `system_prompt` | str | None | Custom system prompt text. Only used when `use_system_prompt` is set to `custom`. Only for HunyuanImage-3.0. |
 
 ## Response Format
 
diff --git a/docs/user_guide/examples/online_serving/text_to_video.md b/docs/user_guide/examples/online_serving/text_to_video.md
index d5aa8a154ff..f045e0d44a4 100644
--- a/docs/user_guide/examples/online_serving/text_to_video.md
+++ b/docs/user_guide/examples/online_serving/text_to_video.md
@@ -13,7 +13,6 @@ This example demonstrates how to deploy text-to-video models for online video ge
 | Wan2.1 T2V (14B) | `Wan-AI/Wan2.1-T2V-14B-Diffusers` |
 | Wan2.2 T2V | `Wan-AI/Wan2.2-T2V-A14B-Diffusers` |
 | LTX-2 | `Lightricks/LTX-2` |
-| Cosmos3 T2V | `$COSMOS3_MODEL` with `Cosmos3OmniDiffusersPipeline` |
 
 ## Wan2.2 T2V
 
@@ -41,23 +40,6 @@ The script allows overriding:
 - `CACHE_BACKEND` (default: `none`)
 - `ENABLE_CACHE_DIT_SUMMARY` (default: `0`)
 
-## Cosmos3 T2V
-
-Cosmos3 uses one pipeline for text-to-image, text-to-video, and image-to-video. Set `COSMOS3_MODEL` to a local Diffusers-format Cosmos3 checkpoint or model reference, and select the pipeline explicitly.
-
-### Start Server
-
-```bash
-export COSMOS3_MODEL=/path/to/cosmos3-diffusers
-
-vllm serve "$COSMOS3_MODEL" \
-  --omni \
-  --port 8091 \
-  --model-class-name Cosmos3OmniDiffusersPipeline
-```
-
-Use `--enable-layerwise-offload`, `--cache-backend cache_dit`, `--cfg-parallel-size 2`, `--usp`, `--tensor-parallel-size`, or `--use-hsdp` as needed. Do not use `--enable-cpu-offload`; Cosmos3 does not support model-level CPU offload.
-
 ## Async Job Behavior
 
 `POST /v1/videos` is asynchronous. It creates a video job and immediately
@@ -103,51 +85,6 @@ curl -X POST http://localhost:8091/v1/videos/sync \
   -o sync_t2v_output.mp4
 ```
 
-### Cosmos3 Sync Request
-
-```bash
-curl -X POST http://localhost:8091/v1/videos/sync \
-  -F "prompt=A small warehouse robot moves a blue box across a clean floor." \
-  -F "negative_prompt=blurry, distorted, low quality" \
-  -F "size=1280x720" \
-  -F "num_frames=81" \
-  -F "fps=24" \
-  -F "num_inference_steps=35" \
-  -F "guidance_scale=4.0" \
-  -F "seed=42" \
-  -o cosmos3_t2v_output.mp4
-```
-
-For async generation, send the same form fields to `POST /v1/videos`, poll `GET /v1/videos/{video_id}`, and download from `GET /v1/videos/{video_id}/content`. Cosmos3 currently supports one prompt and one video per request.
-
-```bash
-create_response=$(curl -s http://localhost:8091/v1/videos \
-  -H "Accept: application/json" \
-  -F "prompt=A small warehouse robot moves a blue box across a clean floor." \
-  -F "negative_prompt=blurry, distorted, low quality" \
-  -F "size=1280x720" \
-  -F "num_frames=81" \
-  -F "fps=24" \
-  -F "num_inference_steps=35" \
-  -F "guidance_scale=4.0" \
-  -F "seed=42")
-
-video_id=$(echo "$create_response" | jq -r '.id')
-while true; do
-  status=$(curl -s "http://localhost:8091/v1/videos/${video_id}" | jq -r '.status')
-  if [ "$status" = "completed" ]; then
-    break
-  fi
-  if [ "$status" = "failed" ]; then
-    echo "Video generation failed"
-    exit 1
-  fi
-  sleep 2
-done
-
-curl -L "http://localhost:8091/v1/videos/${video_id}/content" -o cosmos3_t2v_output.mp4
-```
-
 ## Storage
 
 Generated video files are stored on local disk by the async video API.
@@ -216,7 +153,6 @@ curl -X POST http://localhost:8091/v1/videos \
 ### Generation with Parameters
 
 ```bash
-# Note: frame interpolation specific arguments are relevant only for Wan2.2 models
 curl -X POST http://localhost:8091/v1/videos \
   -F "prompt=A cinematic view of a futuristic city at sunset" \
   -F "width=832" \
@@ -229,61 +165,28 @@ curl -X POST http://localhost:8091/v1/videos \
   -F "guidance_scale_2=4.0" \
   -F "boundary_ratio=0.875" \
   -F "flow_shift=5.0" \
-  -F "enable_frame_interpolation=true" \
-  -F "frame_interpolation_exp=1" \
-  -F "frame_interpolation_scale=1.0" \
   -F "seed=42"
 ```
 
 ## Generation Parameters
 
-| Parameter             | Type   | Default | Description                                                                                              |
-| --------------------- | ------ | ------- |----------------------------------------------------------------------------------------------------------|
-| `prompt`              | str    | -       | Text description of the desired video                                                                    |
-| `seconds`             | str    | None    | Clip duration in seconds                                                                                 |
-| `size`                | str    | None    | Output size in `WIDTHxHEIGHT` format                                                                     |
-| `negative_prompt`     | str    | None    | Negative prompt                                                                                          |
-| `width`               | int    | None    | Video width in pixels                                                                                    |
-| `height`              | int    | None    | Video height in pixels                                                                                   |
-| `num_frames`          | int    | None    | Number of frames to generate                                                                             |
-| `fps`                 | int    | None    | Frames per second for output video                                                                       |
-| `num_inference_steps` | int    | None    | Number of denoising steps                                                                                |
-| `guidance_scale`      | float  | None    | CFG guidance scale (low-noise stage)                                                                     |
-| `guidance_scale_2`    | float  | None    | CFG guidance scale (high-noise stage, Wan2.2)                                                            |
-| `boundary_ratio`      | float  | None    | Boundary split ratio for low/high DiT (Wan2.2)                                                           |
-| `flow_shift`          | float  | None    | Scheduler flow shift                                                                                     |
-| `seed`                | int    | None    | Random seed (reproducible)                                                                               |
-| `lora`                | object | None    | LoRA configuration                                                                                       |
-| `enable_frame_interpolation` | bool | false | Enable RIFE frame interpolation before MP4 encoding (Wan2.2)                                             |
-| `frame_interpolation_exp` | int | 1 | Interpolation exponent; 1=2x temporal resolution, 2=4x (Wan2.2)                                          |
-| `frame_interpolation_scale` | float | 1.0 | RIFE inference scale; use 0.5 for high-resolution inputs (Wan2.2)                                        |
-| `frame_interpolation_model_path` | str | None | Local directory or Hugging Face repo ID with `flownet.pkl`; defaults to `elfgum/RIFE-4.22.lite` (Wan2.2) |
-
-## Frame Interpolation
-
-Frame interpolation is an optional post-processing step for `/v1/videos` and
-`/v1/videos/sync`, supported by Wan2.2 models. It synthesizes intermediate frames between generated frames
-without rerunning the diffusion model. If the generated video has `N` frames,
-the interpolated output frame count is `(N - 1) * 2**exp + 1`. The encoder FPS
-is multiplied by `2**exp` so the output duration remains close to the original.
-
-Frame interpolation runs in the diffusion worker post-processing path instead of
-the API server encoding path, so it can reuse the worker's current accelerator
-device without blocking the FastAPI event loop.
-
-Example: generate 5 frames and interpolate to 9 frames:
-
-```bash
-# Note: frame interpolation specific arguments are relevant only for Wan2.2 models
-curl -X POST http://localhost:8091/v1/videos/sync \
-  -F "prompt=A dog running through a park" \
-  -F "num_frames=5" \
-  -F "fps=8" \
-  -F "enable_frame_interpolation=true" \
-  -F "frame_interpolation_exp=1" \
-  -F "frame_interpolation_scale=1.0" \
-  -o sync_t2v_interpolated.mp4
-```
+| Parameter             | Type   | Default | Description                                      |
+| --------------------- | ------ | ------- | ------------------------------------------------ |
+| `prompt`              | str    | -       | Text description of the desired video            |
+| `seconds`             | str    | None    | Clip duration in seconds                         |
+| `size`                | str    | None    | Output size in `WIDTHxHEIGHT` format             |
+| `negative_prompt`     | str    | None    | Negative prompt                                  |
+| `width`               | int    | None    | Video width in pixels                            |
+| `height`              | int    | None    | Video height in pixels                           |
+| `num_frames`          | int    | None    | Number of frames to generate                     |
+| `fps`                 | int    | None    | Frames per second for output video               |
+| `num_inference_steps` | int    | None    | Number of denoising steps                        |
+| `guidance_scale`      | float  | None    | CFG guidance scale (low-noise stage)             |
+| `guidance_scale_2`    | float  | None    | CFG guidance scale (high-noise stage, Wan2.2)    |
+| `boundary_ratio`      | float  | None    | Boundary split ratio for low/high DiT (Wan2.2)   |
+| `flow_shift`          | float  | None    | Scheduler flow shift                             |
+| `seed`                | int    | None    | Random seed (reproducible)                       |
+| `lora`                | object | None    | LoRA configuration                               |
 
 ## Create Response Format
 
@@ -353,14 +256,6 @@ vllm serve Lightricks/LTX-2 --omni --port 8098 \
     --enforce-eager --flow-shift 1.0 --boundary-ratio 1.0
 ```
 
-For multi-GPU memory reduction, you can enable HSDP:
-
-```bash
-vllm serve Lightricks/LTX-2 --omni --port 8098 \
-    --enforce-eager --flow-shift 1.0 --boundary-ratio 1.0 \
-    --use-hsdp --hsdp-shard-size 2
-```
-
 #### Start with Optimization Presets
 
 Use the LTX-2 startup script with built-in optimization presets:
@@ -431,16 +326,13 @@ curl -sS -X POST http://localhost:8098/v1/videos \
 
 ## Example materials
 
-??? abstract "response.json"
-    ``````json
-    --8<-- "examples/online_serving/text_to_video/response.json"
+??? abstract "run_curl_hunyuan_video_15.sh"
+    ``````sh
+    --8<-- "examples/online_serving/text_to_video/run_curl_hunyuan_video_15.sh"
     ``````
 ??? abstract "run_curl_ltx2.sh"
     ``````sh
     --8<-- "examples/online_serving/text_to_video/run_curl_ltx2.sh"
-??? abstract "run_curl_hunyuan_video_15.sh"
-    ``````sh
-    --8<-- "examples/online_serving/text_to_video/run_curl_hunyuan_video_15.sh"
     ``````
 ??? abstract "run_curl_text_to_video.sh"
     ``````sh
@@ -450,10 +342,11 @@ curl -sS -X POST http://localhost:8098/v1/videos \
     ``````sh
     --8<-- "examples/online_serving/text_to_video/run_server.sh"
     ``````
-??? abstract "run_server_ltx2.sh"
-    ``````sh
-    --8<-- "examples/online_serving/text_to_video/run_server_ltx2.sh"
 ??? abstract "run_server_hunyuan_video_15.sh"
     ``````sh
     --8<-- "examples/online_serving/text_to_video/run_server_hunyuan_video_15.sh"
     ``````
+??? abstract "run_server_ltx2.sh"
+    ``````sh
+    --8<-- "examples/online_serving/text_to_video/run_server_ltx2.sh"
+    ``````
diff --git a/examples/offline_inference/cosmos3/README.md b/examples/offline_inference/cosmos3/README.md
new file mode 100644
index 00000000000..9674c3de449
--- /dev/null
+++ b/examples/offline_inference/cosmos3/README.md
@@ -0,0 +1,81 @@
+# Cosmos3
+
+Cosmos3 uses `Cosmos3OmniDiffusersPipeline` for text-to-image, text-to-video, image-to-video, video-with-sound, and action generation. Set `COSMOS3_MODEL` to a local Diffusers-format checkpoint before running these examples.
+
+```bash
+export COSMOS3_MODEL=/path/to/cosmos3-diffusers
+cd examples/offline_inference/cosmos3
+```
+
+## Text-to-Image
+
+```bash
+python end2end.py \
+  --task t2i \
+  --prompt "A small warehouse robot carrying a blue box, clean product photography" \
+  --negative-prompt "blurry, distorted, low quality" \
+  --output cosmos3_t2i.png
+```
+
+## Text-to-Video
+
+```bash
+python end2end.py \
+  --task t2v \
+  --prompt "A small warehouse robot moves a blue box across a clean floor." \
+  --negative-prompt "blurry, distorted, low quality" \
+  --output cosmos3_t2v.mp4
+```
+
+## Image-to-Video
+
+Download an example image or provide your own image path.
+
+```bash
+wget https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/cherry_blossom.jpg
+
+python end2end.py \
+  --task i2v \
+  --image cherry_blossom.jpg \
+  --prompt "Cherry blossoms swaying gently in the breeze, petals falling, smooth motion" \
+  --negative-prompt "blurry, distorted, low quality" \
+  --output cosmos3_i2v.mp4
+```
+
+## Video With Sound
+
+This path requires a sound-capable Cosmos3 checkpoint with `sound_gen` weights.
+
+```bash
+python end2end.py \
+  --task t2v_sound \
+  --prompt "A small warehouse robot rolls across the floor with soft motor sounds." \
+  --negative-prompt "blurry, distorted, low quality" \
+  --sound-duration 3.4 \
+  --output cosmos3_t2v_sound.mp4
+```
+
+## Action Policy
+
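+This path requires an action-capable Cosmos3 checkpoint with `action_gen` weights. The example returns a video plus an action JSON payload. Pass either `--domain-name` or `--domain-id`. The command below reuses the `cherry_blossom.jpg` image downloaded in the Image-to-Video section; pass your own file with `--image` if you skipped that step.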
+
+```bash
+python end2end.py \
+  --task action_policy \
+  --image cherry_blossom.jpg \
+  --prompt "Predict the robot action for moving toward the target." \
+  --domain-name bridge_orig_lerobot \
+  --raw-action-dim 2 \
+  --action-chunk-size 16 \
+  --output cosmos3_action_policy.mp4 \
+  --action-output cosmos3_action_policy_action.json
+```
+
+## Common Options
+
+- `--enable-layerwise-offload`: use layerwise offload for memory-constrained runs.
+- `--cache-backend cache_dit`: enable Cache-DiT where supported.
+- `--cfg-parallel-size 2`, `--ulysses-degree`, `--tensor-parallel-size`, `--use-hsdp`: enable parallel execution options.
+- `--height`, `--width`, `--num-frames`, `--num-inference-steps`, `--guidance-scale`, `--fps`: override task defaults.
+
+Do not use model-level `--enable-cpu-offload` for Cosmos3. Use `--enable-layerwise-offload` instead.
diff --git a/examples/offline_inference/cosmos3/end2end.py b/examples/offline_inference/cosmos3/end2end.py
new file mode 100644
index 00000000000..93525a39019
--- /dev/null
+++ b/examples/offline_inference/cosmos3/end2end.py
@@ -0,0 +1,506 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import argparse
+import json
+import os
+import time
+from pathlib import Path
+from typing import Any
+
+import numpy as np
+import PIL.Image
+import torch
+
+from vllm_omni.diffusion.data import DiffusionParallelConfig
+from vllm_omni.entrypoints.omni import Omni
+from vllm_omni.inputs.data import OmniDiffusionSamplingParams
+from vllm_omni.outputs import OmniRequestOutput
+from vllm_omni.platforms import current_omni_platform
+
+
+DEFAULT_NEGATIVE_PROMPT = "blurry, distorted, low quality"  # default value of --negative-prompt
+TASK_DEFAULTS = {  # keys are the --task choices; values are fallbacks main() uses for unset CLI flags
+    "t2i": {
+        "height": 1024,
+        "width": 1024,
+        "num_frames": None,  # passed through as num_frames=None; main() then skips the "Frames" log line
+        "num_inference_steps": 50,
+        "guidance_scale": 7.0,
+        "fps": 24,
+        "output": "cosmos3_t2i.png",
+    },
+    "t2v": {
+        "height": 720,
+        "width": 1280,
+        "num_frames": 81,
+        "num_inference_steps": 35,
+        "guidance_scale": 4.0,
+        "fps": 24,
+        "output": "cosmos3_t2v.mp4",
+    },
+    "i2v": {
+        "height": 720,
+        "width": 1280,
+        "num_frames": 81,
+        "num_inference_steps": 35,
+        "guidance_scale": 4.0,
+        "fps": 24,
+        "output": "cosmos3_i2v.mp4",
+    },
+    "t2v_sound": {
+        "height": 720,
+        "width": 1280,
+        "num_frames": 81,
+        "num_inference_steps": 35,
+        "guidance_scale": 4.0,
+        "fps": 24,
+        "output": "cosmos3_t2v_sound.mp4",
+    },
+    "action_policy": {
+        "height": 480,
+        "width": 640,
+        "num_frames": 17,
+        "num_inference_steps": 30,
+        "guidance_scale": 1.0,
+        "fps": 24,
+        "output": "cosmos3_action_policy.mp4",
+    },
+}
+
+
+def parse_args() -> argparse.Namespace:  # None defaults for size/steps/fps/output are filled in by main()
+    parser = argparse.ArgumentParser(description="Cosmos3 offline inference examples.")
+    parser.add_argument(
+        "--model",
+        default=os.environ.get("COSMOS3_MODEL"),  # stays None if the variable is unset; main() rejects that
+        help="Local Diffusers-format Cosmos3 checkpoint. Defaults to COSMOS3_MODEL.",
+    )
+    parser.add_argument(
+        "--task",
+        choices=sorted(TASK_DEFAULTS),  # sorted() over a dict yields its keys, i.e. the task names
+        default="t2v",
+        help="Cosmos3 example task to run.",
+    )
+    parser.add_argument(
+        "--prompt",
+        default="A small warehouse robot moves a blue box across a clean floor.",
+        help="Text prompt.",
+    )
+    parser.add_argument("--negative-prompt", default=DEFAULT_NEGATIVE_PROMPT, help="Negative prompt.")
+    parser.add_argument("--image", default=None, help="Input image for i2v or action_policy.")
+    parser.add_argument("--output", default=None, help="Output PNG or MP4 path. Default depends on --task.")
+    parser.add_argument(
+        "--action-output",
+        default=None,
+        help="Action JSON path for action_policy. Defaults to the video output stem plus _action.json.",
+    )
+    parser.add_argument("--seed", type=int, default=42, help="Random seed.")
+    parser.add_argument("--height", type=int, default=None, help="Output height. Default depends on --task.")
+    parser.add_argument("--width", type=int, default=None, help="Output width. Default depends on --task.")
+    parser.add_argument("--num-frames", type=int, default=None, help="Video frames. Default depends on --task.")
+    parser.add_argument(
+        "--num-inference-steps",
+        type=int,
+        default=None,
+        help="Sampling steps. Default depends on --task.",
+    )
+    parser.add_argument("--guidance-scale", type=float, default=None, help="CFG scale. Default depends on --task.")
+    parser.add_argument("--fps", type=int, default=None, help="Output video fps. Default depends on --task.")
+    parser.add_argument(
+        "--sound-duration",
+        type=float,
+        default=None,
+        help="Audio duration in seconds for t2v_sound. Defaults to generated video duration.",
+    )
+    parser.add_argument(
+        "--audio-sample-rate",
+        type=int,
+        default=24000,
+        help="Fallback sample rate used when muxing audio if the model does not return one.",
+    )
+    parser.add_argument(
+        "--domain-name",
+        default="bridge_orig_lerobot",  # only forwarded when --domain-id is not given (see main())
+        help="Cosmos3 action embodiment name for action_policy.",
+    )
+    parser.add_argument("--domain-id", type=int, default=None, help="Cosmos3 action embodiment id.")
+    parser.add_argument(
+        "--raw-action-dim",
+        type=int,
+        default=2,
+        help="Number of action dimensions to keep for action_policy.",
+    )
+    parser.add_argument(
+        "--action-chunk-size",
+        type=int,
+        default=16,
+        help="Number of action steps for action_policy.",
+    )
+    parser.add_argument(
+        "--cache-backend",
+        type=str,
+        default=None,
+        choices=["cache_dit"],
+        help="Cache backend for supported Cosmos3 generation paths.",
+    )
+    parser.add_argument("--enable-layerwise-offload", action="store_true", help="Enable layerwise offload.")
+    parser.add_argument("--vae-use-slicing", action="store_true", help="Enable VAE slicing.")
+    parser.add_argument("--vae-use-tiling", action="store_true", help="Enable VAE tiling.")
+    parser.add_argument("--enforce-eager", action="store_true", help="Disable torch.compile.")
+    parser.add_argument("--ulysses-degree", type=int, default=1, help="Ulysses sequence parallel degree.")
+    parser.add_argument("--ring-degree", type=int, default=1, help="Ring sequence parallel degree.")
+    parser.add_argument("--cfg-parallel-size", type=int, default=1, choices=[1, 2], help="CFG parallel size.")
+    parser.add_argument("--tensor-parallel-size", type=int, default=1, help="Tensor parallel size.")
+    parser.add_argument("--vae-patch-parallel-size", type=int, default=1, help="VAE patch parallel size.")
+    parser.add_argument("--use-hsdp", action="store_true", help="Enable HSDP.")
+    parser.add_argument("--hsdp-shard-size", type=int, default=1, help="HSDP shard size.")
+    parser.add_argument("--hsdp-replicate-size", type=int, default=1, help="HSDP replicate size.")
+    parser.add_argument(
+        "--quantization",
+        type=str,
+        default=None,
+        choices=["fp8", "mxfp8", "int8", "gguf"],
+        help="Transformer quantization method.",
+    )
+    return parser.parse_args()
+
+
+def _cache_config(cache_backend: str | None) -> dict[str, Any] | None:
+    if cache_backend == "cache_dit":  # the only backend this example carries settings for
+        return {
+            "Fn_compute_blocks": 1,
+            "Bn_compute_blocks": 0,
+            "max_warmup_steps": 4,
+            "max_cached_steps": 20,
+            "residual_diff_threshold": 0.24,
+            "max_continuous_cached_steps": 3,
+            "enable_taylorseer": False,
+            "taylorseer_order": 1,
+            "scm_steps_mask_policy": None,
+            "scm_steps_policy": "dynamic",
+        }
+    return None
+
+
+def _first_output(outputs: Any) -> Any:  # unwrap a list of results to its first element
+    if not isinstance(outputs, list):
+        return outputs
+    if not outputs:
+        raise ValueError("No output generated.")
+    return outputs[0]
+
+
+def _inner_output(output: Any) -> Any:
+    is_wrapper = isinstance(output, OmniRequestOutput) and output.is_pipeline_output  # wraps an inner result?
+    inner = output.request_output if is_wrapper else None
+    return output if inner is None else inner
+
+
+def _extract_images(outputs: Any) -> list[Any]:
+    # Unwrap the list and pipeline wrappers first, then read `images` from whatever remains.
+    output = _inner_output(_first_output(outputs))
+    # getattr serves OmniRequestOutput as well as any other object that merely exposes `.images`.
+    images = getattr(output, "images", None)
+    if not images:
+        raise ValueError("No images found in output.")
+    return images
+
+
+def _extract_video_audio_action(outputs: Any) -> tuple[Any, Any | None, int | None, dict[str, Any]]:
+    """Return ``(video, audio, audio_sample_rate, action)`` taken from the first generation output."""
+
+    def _or(current: Any, fallback: Any) -> Any:
+        # Array-safe `current or fallback`: bool() of a multi-element tensor/ndarray raises.
+        return fallback if current is None or (isinstance(current, (list, tuple, dict)) and not current) else current
+
+    outer = _first_output(outputs)
+    output = _inner_output(outer)
+    audio = audio_sample_rate = None
+    action: dict[str, Any] = {}
+
+    for candidate in (outer, output):
+        if isinstance(candidate, OmniRequestOutput):
+            media = candidate.multimodal_output or {}
+            audio = _or(audio, media.get("audio"))
+            audio_sample_rate = audio_sample_rate or media.get("audio_sample_rate")
+            action.update(candidate.custom_output or {})
+
+    if isinstance(output, OmniRequestOutput):
+        videos = (output.multimodal_output or {}).get("video")
+        if videos is None and output.images:
+            videos = output.images
+    else:
+        videos = getattr(output, "images", None)
+        mm = getattr(output, "multimodal_output", None)
+        if mm:
+            videos = _or(videos, mm.get("video"))
+            audio = _or(audio, mm.get("audio"))
+            audio_sample_rate = audio_sample_rate or mm.get("audio_sample_rate")
+
+    if isinstance(videos, list) and len(videos) == 1:
+        first = videos[0]
+        if isinstance(first, tuple) and len(first) == 2:
+            videos, audio = first
+        elif isinstance(first, dict):
+            audio = _or(audio, first.get("audio"))
+            audio_sample_rate = audio_sample_rate or first.get("audio_sample_rate")
+            videos = _or(first.get("frames"), first.get("video"))
+        elif isinstance(first, list):
+            videos = first
+
+    if isinstance(videos, tuple) and len(videos) == 2:
+        videos, audio = videos
+    elif isinstance(videos, dict):
+        audio = _or(audio, videos.get("audio"))
+        audio_sample_rate = audio_sample_rate or videos.get("audio_sample_rate")
+        videos = _or(videos.get("frames"), videos.get("video"))
+
+    if videos is None:
+        raise ValueError("No video frames found in output.")
+    return videos, audio, audio_sample_rate, action
+
+
+def _normalize_frame(frame: Any) -> Any:  # best-effort conversion of one frame to a float ndarray
+    if isinstance(frame, torch.Tensor):
+        frame_tensor = frame.detach().cpu()
+        if frame_tensor.dim() == 4 and frame_tensor.shape[0] == 1:  # drop a leading singleton axis
+            frame_tensor = frame_tensor[0]
+        if frame_tensor.dim() == 3 and frame_tensor.shape[0] in (3, 4):  # presumably CHW -> HWC; TODO confirm
+            frame_tensor = frame_tensor.permute(1, 2, 0)
+        if frame_tensor.is_floating_point():
+            frame_tensor = frame_tensor.clamp(-1, 1) * 0.5 + 0.5  # maps [-1, 1] onto [0, 1]
+        return frame_tensor.float().numpy()  # NOTE(review): integer tensors are cast but not divided by 255
+    if isinstance(frame, np.ndarray):
+        frame_array = frame
+        if frame_array.ndim == 4 and frame_array.shape[0] == 1:  # drop a leading singleton axis
+            frame_array = frame_array[0]
+        if np.issubdtype(frame_array.dtype, np.integer):
+            frame_array = frame_array.astype(np.float32) / 255.0  # integer arrays are treated as 0-255
+        return frame_array  # non-integer arrays are returned without rescaling
+    if isinstance(frame, PIL.Image.Image):
+        return np.asarray(frame).astype(np.float32) / 255.0
+    return frame  # any other type is handed back unchanged
+
+
+def _ensure_frame_list(video: Any) -> Any:
+    # Split batched/stacked ndarray video into a list of frames; leave everything else as it is.
+    if isinstance(video, list):
+        # Only element 0 is examined: an ndarray of rank 4 or 5 there replaces the whole list.
+        head = video[0] if video else None
+        if isinstance(head, np.ndarray) and head.ndim in (4, 5):
+            stacked = head[0] if head.ndim == 5 else head
+            return list(stacked)
+        return video
+    if not isinstance(video, np.ndarray):
+        return video
+    rank = video.ndim
+    if rank == 3:
+        # A single frame becomes a one-element list.
+        return [video]
+    if rank == 4:
+        return list(video)
+    if rank == 5:
+        # Keep only the first item along the leading axis.
+        return list(video[0])
+    return video
+
+
+def _video_to_array(video: Any) -> Any:  # best-effort conversion of a tensor / ndarray / frame list to float data
+    if isinstance(video, torch.Tensor):
+        video_tensor = video.detach().cpu()
+        if video_tensor.dim() == 5:
+            if video_tensor.shape[1] in (3, 4):  # presumably (B, C, T, H, W); TODO confirm
+                video_tensor = video_tensor[0].permute(1, 2, 3, 0)  # first item, then move axis 0 to the end
+            else:
+                video_tensor = video_tensor[0]  # otherwise only take the first item along axis 0
+        elif video_tensor.dim() == 4 and video_tensor.shape[0] in (3, 4):  # presumably (C, T, H, W); TODO confirm
+            video_tensor = video_tensor.permute(1, 2, 3, 0)
+        if video_tensor.is_floating_point():
+            video_tensor = video_tensor.clamp(-1, 1) * 0.5 + 0.5  # maps [-1, 1] onto [0, 1]
+        return video_tensor.float().numpy()  # NOTE(review): integer tensors are cast but not divided by 255
+    if isinstance(video, np.ndarray):
+        video_array = video
+        if video_array.ndim == 5:
+            video_array = video_array[0]  # first item along the leading axis
+        if np.issubdtype(video_array.dtype, np.integer):
+            video_array = video_array.astype(np.float32) / 255.0  # integer arrays are treated as 0-255
+        return video_array
+    if isinstance(video, list):
+        if not video:
+            raise ValueError("No video frames found in output.")
+        return [_normalize_frame(frame) for frame in video]  # lists are converted frame by frame
+    return video  # any other type is handed back unchanged
+
+
+def _save_video(video: Any, output_path: Path, fps: int, audio: Any | None, audio_sample_rate: int) -> None:
+    output_path.parent.mkdir(parents=True, exist_ok=True)  # create the destination directory if needed
+    video_array = _ensure_frame_list(_video_to_array(video))
+
+    if audio is not None:  # audio present: encode frames and audio together, then write the bytes
+        from vllm_omni.diffusion.utils.media_utils import mux_video_audio_bytes
+
+        frames_np = np.stack(video_array, axis=0) if isinstance(video_array, list) else np.asarray(video_array)
+        if frames_np.ndim == 4 and frames_np.shape[-1] == 4:
+            frames_np = frames_np[..., :3]  # keep the first three channels (4th is presumably alpha)
+        frames_u8 = (np.clip(frames_np, 0.0, 1.0) * 255).round().clip(0, 255).astype("uint8")
+
+        audio_np = audio
+        if isinstance(audio_np, list):
+            audio_np = audio_np[0] if audio_np else None  # NOTE(review): empty list -> None goes to the muxer
+        if isinstance(audio_np, torch.Tensor):
+            audio_np = audio_np.detach().cpu().float().numpy()
+        if isinstance(audio_np, np.ndarray):
+            audio_np = np.squeeze(audio_np).astype(np.float32)  # drop singleton axes, force float32
+
+        video_bytes = mux_video_audio_bytes(
+            frames_u8,
+            audio_np,
+            fps=float(fps),
+            audio_sample_rate=audio_sample_rate,
+        )
+        output_path.write_bytes(video_bytes)
+        return
+
+    try:  # no audio: fall back to diffusers' export_to_video
+        from diffusers.utils import export_to_video
+    except ImportError as exc:
+        raise ImportError("diffusers is required for export_to_video.") from exc
+    export_to_video(video_array, str(output_path), fps=fps)
+
+
+def _jsonable(value: Any) -> Any:
+    """Recursively convert tensors, NumPy arrays/scalars and containers into JSON-serializable builtins."""
+    if isinstance(value, torch.Tensor):
+        return value.detach().cpu().tolist()
+    # np.generic covers every NumPy scalar (including np.bool_, which json.dumps rejects), not just int/float.
+    if isinstance(value, (np.ndarray, np.generic)):
+        return value.tolist()
+    if isinstance(value, dict):
+        return {str(k): _jsonable(v) for k, v in value.items()}
+    if isinstance(value, (list, tuple)):
+        return [_jsonable(v) for v in value]
+    return value
+
+
+def _build_omni(args: argparse.Namespace) -> Omni:
+    # Builds the Omni engine for Cosmos3OmniDiffusersPipeline from the parsed CLI flags.
+    # `quantization` is forwarded only when it was given on the command line.
+    optional = {} if args.quantization is None else {"quantization": args.quantization}
+    return Omni(
+        model=args.model,
+        model_class_name="Cosmos3OmniDiffusersPipeline",
+        enable_layerwise_offload=args.enable_layerwise_offload,
+        vae_use_slicing=args.vae_use_slicing,
+        vae_use_tiling=args.vae_use_tiling,
+        enforce_eager=args.enforce_eager,
+        parallel_config=DiffusionParallelConfig(
+            ulysses_degree=args.ulysses_degree,
+            ring_degree=args.ring_degree,
+            cfg_parallel_size=args.cfg_parallel_size,
+            tensor_parallel_size=args.tensor_parallel_size,
+            vae_patch_parallel_size=args.vae_patch_parallel_size,
+            use_hsdp=args.use_hsdp,
+            hsdp_shard_size=args.hsdp_shard_size,
+            hsdp_replicate_size=args.hsdp_replicate_size,
+        ),
+        cache_backend=args.cache_backend,
+        cache_config=_cache_config(args.cache_backend),
+        **optional,
+    )
+
+
+def main() -> None:  # parse flags, run one Cosmos3 request, write the image/video (and action JSON)
+    args = parse_args()
+    if not args.model:
+        raise ValueError("Set COSMOS3_MODEL or pass --model with a Cosmos3 Diffusers checkpoint path.")
+
+    defaults = TASK_DEFAULTS[args.task]
+    height = args.height or defaults["height"]  # `or`: 0 falls back to the task default as well as None
+    width = args.width or defaults["width"]
+    num_frames = args.num_frames if args.num_frames is not None else defaults["num_frames"]
+    num_inference_steps = args.num_inference_steps or defaults["num_inference_steps"]
+    guidance_scale = args.guidance_scale if args.guidance_scale is not None else defaults["guidance_scale"]
+    fps = args.fps or defaults["fps"]
+    output_path = Path(args.output or defaults["output"])
+
+    if args.task in {"i2v", "action_policy"} and args.image is None:
+        raise ValueError(f"--image is required for {args.task}.")
+
+    image = PIL.Image.open(args.image).convert("RGB") if args.image else None
+    generator = torch.Generator(device=current_omni_platform.device_type).manual_seed(args.seed)
+    omni = _build_omni(args)
+
+    prompt: dict[str, Any] = {
+        "prompt": args.prompt,
+        "negative_prompt": args.negative_prompt,
+        "modalities": ["image"] if args.task == "t2i" else ["video"],  # every task except t2i asks for video
+    }
+    if image is not None:  # attached whenever --image is given, whatever the task
+        prompt["multi_modal_data"] = {"image": image}
+    if args.task == "t2v_sound":
+        prompt["generate_sound"] = True
+
+    extra_args: dict[str, Any] = {}
+    if args.task == "t2v_sound":
+        extra_args["generate_sound"] = True  # set here as well as on the prompt dict above
+        if args.sound_duration is not None:
+            extra_args["sound_duration"] = args.sound_duration
+    if args.task == "action_policy":
+        extra_args.update(
+            {
+                "action_mode": "policy",
+                "action_chunk_size": args.action_chunk_size,
+                "raw_action_dim": args.raw_action_dim,
+            }
+        )
+        if args.domain_id is not None:  # --domain-id takes precedence over --domain-name
+            extra_args["domain_id"] = args.domain_id
+        else:
+            extra_args["domain_name"] = args.domain_name
+
+    sampling = OmniDiffusionSamplingParams(
+        height=height,
+        width=width,
+        generator=generator,
+        guidance_scale=guidance_scale,
+        num_inference_steps=num_inference_steps,
+        num_frames=num_frames,
+        frame_rate=float(fps),
+        extra_args=extra_args,
+    )
+
+    print("Cosmos3 generation configuration:")
+    print(f"  Task: {args.task}")
+    print(f"  Model: {args.model}")
+    print(f"  Size: {width}x{height}")
+    if num_frames is not None:
+        print(f"  Frames: {num_frames}")
+    print(f"  Steps: {num_inference_steps}")
+    print(f"  Guidance scale: {guidance_scale}")
+
+    start = time.perf_counter()  # the timer covers only the generate() call
+    outputs = omni.generate(prompt, sampling)
+    elapsed = time.perf_counter() - start
+    print(f"Total generation time: {elapsed:.4f} seconds")
+
+    if args.task == "t2i":  # image path: save the first returned image and stop
+        images = _extract_images(outputs)
+        output_path.parent.mkdir(parents=True, exist_ok=True)
+        images[0].save(output_path)
+        print(f"Saved image to {output_path}")
+        return
+
+    video, audio, returned_sample_rate, action = _extract_video_audio_action(outputs)
+    _save_video(video, output_path, fps=fps, audio=audio, audio_sample_rate=returned_sample_rate or args.audio_sample_rate)
+    print(f"Saved video to {output_path}")
+
+    if args.task == "action_policy":
+        action_path = Path(args.action_output) if args.action_output else output_path.with_name(f"{output_path.stem}_action.json")
+        action_path.parent.mkdir(parents=True, exist_ok=True)
+        action_path.write_text(json.dumps(_jsonable(action), indent=2) + "\n", encoding="utf-8")
+        print(f"Saved action metadata to {action_path}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/offline_inference/image_to_video/README.md b/examples/offline_inference/image_to_video/README.md
index 8de4cafce78..e667d1eafcc 100644
--- a/examples/offline_inference/image_to_video/README.md
+++ b/examples/offline_inference/image_to_video/README.md
@@ -1,6 +1,6 @@
 # Image-To-Video
 
-This example demonstrates how to generate videos from images using Wan2.2 Image-to-Video models and Cosmos3 with vLLM-Omni's offline inference API.
+This example demonstrates how to generate videos from images using Wan2.2 Image-to-Video models with vLLM-Omni's offline inference API.
 
 ## Supported Models
 
@@ -8,7 +8,6 @@ This example demonstrates how to generate videos from images using Wan2.2 Image-
 |-------|--------------------|----------------|---------------|----------|
 | `Wan-AI/Wan2.2-I2V-A14B-Diffusers` | auto, 480p area | 81 | 50 | 5.0 |
 | `Wan-AI/Wan2.2-TI2V-5B-Diffusers` | auto, 480p area | 81 | 50 | 5.0 |
-| `$COSMOS3_MODEL` with `Cosmos3OmniDiffusersPipeline` | auto, 720p area | 81 | 35 | 4.0 |
 
 ## Local CLI Usage
 
@@ -56,34 +55,10 @@ python image_to_video.py \
   --output i2v_output.mp4
 ```
 
-### Cosmos3
-
-Cosmos3 uses one pipeline for text-to-image, text-to-video, and image-to-video. Set `COSMOS3_MODEL` to a local Diffusers-format Cosmos3 checkpoint or model reference, and select the pipeline explicitly.
-
-```bash
-export COSMOS3_MODEL=/path/to/cosmos3-diffusers
-
-python image_to_video.py \
-  --model "$COSMOS3_MODEL" \
-  --model-class-name Cosmos3OmniDiffusersPipeline \
-  --image cherry_blossom.jpg \
-  --prompt "Cherry blossoms swaying gently in the breeze, petals falling, smooth motion" \
-  --negative-prompt "blurry, distorted, low quality" \
-  --height 720 \
-  --width 1280 \
-  --num-frames 81 \
-  --guidance-scale 4.0 \
-  --num-inference-steps 35 \
-  --fps 24 \
-  --output cosmos3_i2v_output.mp4
-```
-
-For Cosmos3 I2V, the input image is resized and center-cropped by the pipeline. If `--height` and `--width` are omitted, this example chooses a 720p-area resolution from the input aspect ratio. Cosmos3 currently supports one prompt and one video per request, and model-level CPU offload is not supported; use `--enable-layerwise-offload` instead.
-
 Key arguments:
 
 - `--model`: Model ID (I2V-A14B for MoE, TI2V-5B for unified T2V+I2V).
-- `--model-class-name`: explicit pipeline class. Use `Cosmos3OmniDiffusersPipeline` for Cosmos3 checkpoints.
+- `--model-class-name`: explicit pipeline class override.
 - `--image`: Path to input image (required).
 - `--prompt`: Text description of desired motion/animation.
 - `--height/--width`: Output resolution (auto-calculated from image if not set). Dimensions should be multiples of 16.
diff --git a/examples/offline_inference/text_to_image/README.md b/examples/offline_inference/text_to_image/README.md
index 149e4260904..ef5c526f40f 100644
--- a/examples/offline_inference/text_to_image/README.md
+++ b/examples/offline_inference/text_to_image/README.md
@@ -34,7 +34,6 @@ This folder provides several entrypoints for experimenting with text-to-image di
 | `black-forest-labs/FLUX.2-klein-4B` | 1024 x 1024 | 72.7 | 14.9 |
 | `black-forest-labs/FLUX.2-klein-9B` | 1024 x 1024 | 37.1 | 32.3 |
 | `black-forest-labs/FLUX.2-dev` | 1024 x 1024 | 65.7 | >80 (CPU offload required) |
-| `$COSMOS3_MODEL` with `Cosmos3OmniDiffusersPipeline` | 1024 x 1024 | model/checkpoint dependent | local checkpoint |
 | `HunyuanImage-3.0` | 1024 x 1024 | 80.0 (TP≥3)  | 160 |
 
 !!! info
@@ -75,7 +74,7 @@ python text_to_image.py \
 | Argument | Type | Default | Description |
 | -------- | ---- | ------- | ----------- |
 | `--model` | str | `"Qwen/Qwen-Image"` | Diffusion model name or local path |
-| `--model-class-name` | str | `None` | Override pipeline class, for example `Cosmos3OmniDiffusersPipeline` |
+| `--model-class-name` | str | `None` | Override pipeline class |
 | `--prompt` | str | `"a cup of coffee on the table"` | Text description for image generation |
 | `--seed` | int | `142` | Integer seed for deterministic sampling |
 | `--negative-prompt` | str | `None` | Negative prompt for classifier-free conditional guidance |
@@ -180,28 +179,6 @@ python examples/offline_inference/text_to_image/text_to_image.py \
   --output flux2-dev.png
 ```
 
-### Cosmos3
-
-Cosmos3 uses one pipeline for text-to-image, text-to-video, and image-to-video. Set `COSMOS3_MODEL` to a local Diffusers-format Cosmos3 checkpoint or model reference, and select the pipeline explicitly.
-
-```bash
-export COSMOS3_MODEL=/path/to/cosmos3-diffusers
-
-python text_to_image.py \
-  --model "$COSMOS3_MODEL" \
-  --model-class-name Cosmos3OmniDiffusersPipeline \
-  --prompt "A small warehouse robot carrying a blue box, clean product photography" \
-  --negative-prompt "blurry, distorted, low quality" \
-  --guidance-scale 7.0 \
-  --num-inference-steps 50 \
-  --height 1024 \
-  --width 1024 \
-  --num-images-per-prompt 1 \
-  --output cosmos3_t2i.png
-```
-
-This script marks text-to-image requests with `modalities=["image"]`, which selects Cosmos3 T2I. Cosmos3 currently supports one prompt per request; use `--num-images-per-prompt` to request multiple images for that prompt. Model-level CPU offload is not supported for Cosmos3, so use `--enable-layerwise-offload` for offload instead.
-
 ### Batch Requests (Multiple Prompts)
 
 You can pass multiple prompts in a single `generate` call.
diff --git a/examples/offline_inference/text_to_video/text_to_video.md b/examples/offline_inference/text_to_video/text_to_video.md
index 69ef1dadfe7..936a9078179 100644
--- a/examples/offline_inference/text_to_video/text_to_video.md
+++ b/examples/offline_inference/text_to_video/text_to_video.md
@@ -9,7 +9,6 @@ A unified script for text-to-video generation. Supports multiple models with mod
 | `Wan-AI/Wan2.2-T2V-A14B-Diffusers` | 720x1280 | 81 | 40 | 4.0 | ~60 GiB |
 | `hunyuanvideo-community/HunyuanVideo-1.5-Diffusers-480p_t2v` | 480x832 | 121 | 50 | 6.0 | 1×A100 80GB |
 | `hunyuanvideo-community/HunyuanVideo-1.5-Diffusers-720p_t2v` | 720x1280 | 121 | 50 | 6.0 | FP8 + VAE tiling required |
-| `$COSMOS3_MODEL` with `Cosmos3OmniDiffusersPipeline` | 720x1280 | 81 | 35 | 4.0 | model/checkpoint dependent |
 
 ## Local CLI Usage
 
@@ -46,28 +45,6 @@ python text_to_video.py \
   --output ltx2_out.mp4
 ```
 
-### Cosmos3
-
-Cosmos3 uses one pipeline for text-to-image, text-to-video, and image-to-video. Set `COSMOS3_MODEL` to a local Diffusers-format Cosmos3 checkpoint or model reference, and select the pipeline explicitly.
-
-```bash
-export COSMOS3_MODEL=/path/to/cosmos3-diffusers
-
-python text_to_video.py \
-  --model "$COSMOS3_MODEL" \
-  --prompt "A small warehouse robot moves a blue box across a clean floor." \
-  --negative-prompt "blurry, distorted, low quality" \
-  --height 720 \
-  --width 1280 \
-  --num-frames 81 \
-  --guidance-scale 4.0 \
-  --num-inference-steps 35 \
-  --fps 24 \
-  --output cosmos3_t2v_output.mp4
-```
-
-Cosmos3 video generation currently supports one prompt and one video per request. The implementation supports `--cache-backend cache_dit`, `--cfg-parallel-size 2`, `--ulysses-degree`, `--tensor-parallel-size`, `--use-hsdp`, and `--enable-layerwise-offload`. Do not use `--enable-cpu-offload`; Cosmos3 does not support model-level CPU offload.
-
 ### HunyuanVideo-1.5 (480p)
 
 ```bash
diff --git a/examples/online_serving/cosmos3/README.md b/examples/online_serving/cosmos3/README.md
new file mode 100644
index 00000000000..fd5e1c4d93e
--- /dev/null
+++ b/examples/online_serving/cosmos3/README.md
@@ -0,0 +1,63 @@
+# Cosmos3
+
+This example shows Cosmos3 online serving with `Cosmos3OmniDiffusersPipeline`.
+
+Set `COSMOS3_MODEL` to a local Diffusers-format checkpoint before starting the server:
+
+```bash
+export COSMOS3_MODEL=/path/to/cosmos3-diffusers
+cd examples/online_serving/cosmos3
+bash run_server.sh
+```
+
+`run_server.sh` accepts these environment overrides:
+
+- `MODEL`: checkpoint path, defaults to `COSMOS3_MODEL`
+- `PORT`: server port, defaults to `8091`
+- `CACHE_BACKEND`: set to `cache_dit` to enable Cache-DiT
+- `ENABLE_LAYERWISE_OFFLOAD`: set to `1` to enable layerwise offload
+- `CFG_PARALLEL_SIZE`, `TENSOR_PARALLEL_SIZE`, `ULYSSES_DEGREE`, `USE_HSDP`: parallel execution controls
+- `ALLOWED_LOCAL_MEDIA_PATH`: local media access path, defaults to `/`
+
+## Text-to-Image
+
+```bash
+bash run_curl_t2i.sh
+```
+
+The script calls `POST /v1/images/generations`, which selects Cosmos3 text-to-image through `modalities=["image"]` internally. It needs `jq` and `base64` to decode the returned `b64_json` payload into the output PNG.
+
+## Text-to-Video
+
+```bash
+bash run_curl_t2v.sh
+```
+
+## Image-to-Video
+
+Download an example image or set `IMAGE_PATH` to your own image:
+
+```bash
+wget https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/cherry_blossom.jpg
+IMAGE_PATH=cherry_blossom.jpg bash run_curl_i2v.sh
+```
+
+## Video With Sound
+
+This path requires a sound-capable Cosmos3 checkpoint with `sound_gen` weights.
+
+```bash
+bash run_curl_t2v_sound.sh
+```
+
+The script passes `generate_sound=true` and `sound_duration` to the video endpoint.
+
+## Action Policy
+
+This path requires an action-capable Cosmos3 checkpoint with `action_gen` weights. Pass either `domain_name` or `domain_id` through `extra_params`.
+
+```bash
+IMAGE_PATH=cherry_blossom.jpg bash run_curl_action_policy.sh
+```
+
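+The script uses the asynchronous `POST /v1/videos` job endpoint: it polls `GET /v1/videos/{id}` until the job completes, saves the returned action metadata JSON, and then downloads the MP4 from `/v1/videos/{id}/content`. It requires `jq`.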
diff --git a/examples/online_serving/cosmos3/run_curl_action_policy.sh b/examples/online_serving/cosmos3/run_curl_action_policy.sh
new file mode 100644
index 00000000000..b5635e3f201
--- /dev/null
+++ b/examples/online_serving/cosmos3/run_curl_action_policy.sh
@@ -0,0 +1,63 @@
+#!/bin/bash
+# Cosmos3 action policy example. Requires an action-capable checkpoint, plus curl and jq on PATH.
+
+set -euo pipefail
+
+BASE_URL="${BASE_URL:-http://localhost:8091}"
+IMAGE_PATH="${IMAGE_PATH:-cherry_blossom.jpg}"
+OUTPUT_PATH="${OUTPUT_PATH:-cosmos3_action_policy.mp4}"
+ACTION_OUTPUT_PATH="${ACTION_OUTPUT_PATH:-cosmos3_action_policy_action.json}"
+POLL_INTERVAL="${POLL_INTERVAL:-2}"
+
+create_response=$(
+  curl -sS -X POST "${BASE_URL}/v1/videos" \
+    -H "Accept: application/json" \
+    -F "prompt=Predict the robot action for moving toward the target." \
+    -F "input_reference=@${IMAGE_PATH}" \
+    -F "size=640x480" \
+    -F "num_frames=17" \
+    -F "fps=24" \
+    -F "num_inference_steps=30" \
+    -F "guidance_scale=1.0" \
+    -F 'extra_params={"action_mode":"policy","domain_name":"bridge_orig_lerobot","raw_action_dim":2,"action_chunk_size":16}' \
+    -F "seed=42"
+)
+
+video_id="$(echo "${create_response}" | jq -r '.id')"
+if [ -z "${video_id}" ] || [ "${video_id}" = "null" ]; then
+  echo "Failed to create video job:" >&2
+  echo "${create_response}" | jq . >&2
+  exit 1
+fi
+
+echo "Created video job ${video_id}"
+while true; do
+  status_response="$(curl -sS "${BASE_URL}/v1/videos/${video_id}")"
+  status="$(echo "${status_response}" | jq -r '.status')"
+
+  case "${status}" in
+    queued|in_progress)
+      echo "Video job ${video_id} status: ${status}"
+      sleep "${POLL_INTERVAL}"
+      ;;
+    completed)
+      echo "${status_response}" | jq '.data[0].action' > "${ACTION_OUTPUT_PATH}"
+      break
+      ;;
+    failed)
+      echo "Video generation failed:" >&2
+      echo "${status_response}" | jq . >&2
+      exit 1
+      ;;
+    *)
+      echo "Unexpected status response:" >&2
+      echo "${status_response}" | jq . >&2
+      exit 1
+      ;;
+  esac
+done
+
+curl -fsS -L "${BASE_URL}/v1/videos/${video_id}/content" -o "${OUTPUT_PATH}"  # -f: fail on HTTP errors
+
+echo "Saved video to ${OUTPUT_PATH}"
+echo "Saved action metadata to ${ACTION_OUTPUT_PATH}"
diff --git a/examples/online_serving/cosmos3/run_curl_i2v.sh b/examples/online_serving/cosmos3/run_curl_i2v.sh
new file mode 100644
index 00000000000..eb65ca9621a
--- /dev/null
+++ b/examples/online_serving/cosmos3/run_curl_i2v.sh
@@ -0,0 +1,22 @@
+#!/bin/bash
+# Cosmos3 image-to-video example using the sync video API.
+# Environment overrides: BASE_URL, IMAGE_PATH, OUTPUT_PATH.
+set -euo pipefail
+
+BASE_URL="${BASE_URL:-http://localhost:8091}"
+IMAGE_PATH="${IMAGE_PATH:-cherry_blossom.jpg}"
+OUTPUT_PATH="${OUTPUT_PATH:-cosmos3_i2v.mp4}"
+# --fail: exit non-zero on HTTP errors instead of saving the error body as the video.
+curl -sS --fail -X POST "${BASE_URL}/v1/videos/sync" \
+  -F "prompt=Cherry blossoms swaying gently in the breeze, petals falling, smooth motion" \
+  -F "negative_prompt=blurry, distorted, low quality" \
+  -F "input_reference=@${IMAGE_PATH}" \
+  -F "size=1280x720" \
+  -F "num_frames=81" \
+  -F "fps=24" \
+  -F "num_inference_steps=35" \
+  -F "guidance_scale=4.0" \
+  -F "seed=42" \
+  -o "${OUTPUT_PATH}"
+
+echo "Saved video to ${OUTPUT_PATH}"
diff --git a/examples/online_serving/cosmos3/run_curl_t2i.sh b/examples/online_serving/cosmos3/run_curl_t2i.sh
new file mode 100644
index 00000000000..421b0664c35
--- /dev/null
+++ b/examples/online_serving/cosmos3/run_curl_t2i.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+# Cosmos3 text-to-image example using the images API.
+# Environment overrides: BASE_URL, OUTPUT_PATH.
+set -euo pipefail
+
+BASE_URL="${BASE_URL:-http://localhost:8091}"
+OUTPUT_PATH="${OUTPUT_PATH:-cosmos3_t2i.png}"
+# curl --fail and jq -e make the pipeline fail (via pipefail) on HTTP errors or a missing b64_json.
+curl -sS --fail -X POST "${BASE_URL}/v1/images/generations" \
+  -H "Content-Type: application/json" \
+  -d '{
+    "prompt": "A small warehouse robot carrying a blue box, clean product photography",
+    "size": "1024x1024",
+    "n": 1,
+    "num_inference_steps": 50,
+    "guidance_scale": 7.0,
+    "negative_prompt": "blurry, distorted, low quality",
+    "seed": 42
+  }' | jq -er '.data[0].b64_json' | base64 -d > "${OUTPUT_PATH}"
+
+echo "Saved image to ${OUTPUT_PATH}"
diff --git a/examples/online_serving/cosmos3/run_curl_t2v.sh b/examples/online_serving/cosmos3/run_curl_t2v.sh
new file mode 100644
index 00000000000..dc436b28277
--- /dev/null
+++ b/examples/online_serving/cosmos3/run_curl_t2v.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+# Cosmos3 text-to-video example using the sync video API.
+# Environment overrides: BASE_URL, OUTPUT_PATH.
+set -euo pipefail
+
+BASE_URL="${BASE_URL:-http://localhost:8091}"
+OUTPUT_PATH="${OUTPUT_PATH:-cosmos3_t2v.mp4}"
+# --fail: exit non-zero on HTTP errors instead of saving the error body as the video.
+curl -sS --fail -X POST "${BASE_URL}/v1/videos/sync" \
+  -F "prompt=A small warehouse robot moves a blue box across a clean floor." \
+  -F "negative_prompt=blurry, distorted, low quality" \
+  -F "size=1280x720" \
+  -F "num_frames=81" \
+  -F "fps=24" \
+  -F "num_inference_steps=35" \
+  -F "guidance_scale=4.0" \
+  -F "seed=42" \
+  -o "${OUTPUT_PATH}"
+
+echo "Saved video to ${OUTPUT_PATH}"
diff --git a/examples/online_serving/cosmos3/run_curl_t2v_sound.sh b/examples/online_serving/cosmos3/run_curl_t2v_sound.sh
new file mode 100644
index 00000000000..3c82f965b7d
--- /dev/null
+++ b/examples/online_serving/cosmos3/run_curl_t2v_sound.sh
@@ -0,0 +1,22 @@
+#!/bin/bash
+# Cosmos3 video-with-sound example. Requires a sound-capable checkpoint.
+# Environment overrides: BASE_URL, OUTPUT_PATH.
+set -euo pipefail
+
+BASE_URL="${BASE_URL:-http://localhost:8091}"
+OUTPUT_PATH="${OUTPUT_PATH:-cosmos3_t2v_sound.mp4}"
+# --fail: exit non-zero on HTTP errors instead of saving the error body as the video.
+curl -sS --fail -X POST "${BASE_URL}/v1/videos/sync" \
+  -F "prompt=A small warehouse robot rolls across the floor with soft motor sounds." \
+  -F "negative_prompt=blurry, distorted, low quality" \
+  -F "size=1280x720" \
+  -F "num_frames=81" \
+  -F "fps=24" \
+  -F "num_inference_steps=35" \
+  -F "guidance_scale=4.0" \
+  -F "generate_sound=true" \
+  -F "sound_duration=3.4" \
+  -F "seed=42" \
+  -o "${OUTPUT_PATH}"
+
+echo "Saved video to ${OUTPUT_PATH}"
diff --git a/examples/online_serving/cosmos3/run_server.sh b/examples/online_serving/cosmos3/run_server.sh
new file mode 100644
index 00000000000..5d3e1b820a4
--- /dev/null
+++ b/examples/online_serving/cosmos3/run_server.sh
@@ -0,0 +1,48 @@
+#!/bin/bash
+# Cosmos3 online serving startup script; every setting below can be overridden via the environment.
+
+set -euo pipefail
+# NOTE(review): ALLOWED_LOCAL_MEDIA_PATH defaults to "/", presumably exposing all host files to requests; narrow it outside local demos.
+MODEL="${MODEL:-${COSMOS3_MODEL:-}}"
+PORT="${PORT:-8091}"
+CACHE_BACKEND="${CACHE_BACKEND:-none}"
+ENABLE_LAYERWISE_OFFLOAD="${ENABLE_LAYERWISE_OFFLOAD:-0}"
+CFG_PARALLEL_SIZE="${CFG_PARALLEL_SIZE:-1}"
+TENSOR_PARALLEL_SIZE="${TENSOR_PARALLEL_SIZE:-1}"
+ULYSSES_DEGREE="${ULYSSES_DEGREE:-1}"
+USE_HSDP="${USE_HSDP:-0}"
+ALLOWED_LOCAL_MEDIA_PATH="${ALLOWED_LOCAL_MEDIA_PATH:-/}"
+
+if [ -z "${MODEL}" ]; then
+  echo "Set COSMOS3_MODEL or MODEL to a local Diffusers-format Cosmos3 checkpoint." >&2
+  exit 1
+fi
+
+args=(
+  vllm serve "${MODEL}"
+  --omni
+  --port "${PORT}"
+  --model-class-name Cosmos3OmniDiffusersPipeline
+  --allowed-local-media-path "${ALLOWED_LOCAL_MEDIA_PATH}"
+  --cfg-parallel-size "${CFG_PARALLEL_SIZE}"
+  --tensor-parallel-size "${TENSOR_PARALLEL_SIZE}"
+)
+
+if [ "${ULYSSES_DEGREE}" != "1" ]; then
+  args+=(--usp "${ULYSSES_DEGREE}")
+fi
+
+if [ "${CACHE_BACKEND}" != "none" ]; then
+  args+=(--cache-backend "${CACHE_BACKEND}")
+fi
+
+if [ "${ENABLE_LAYERWISE_OFFLOAD}" != "0" ]; then
+  args+=(--enable-layerwise-offload)
+fi
+
+if [ "${USE_HSDP}" != "0" ]; then
+  args+=(--use-hsdp)
+fi
+
+echo "Starting Cosmos3 server on port ${PORT}"
+exec "${args[@]}"
diff --git a/examples/online_serving/image_to_video/README.md b/examples/online_serving/image_to_video/README.md
index 6f82d3a2019..067c18d5e5f 100644
--- a/examples/online_serving/image_to_video/README.md
+++ b/examples/online_serving/image_to_video/README.md
@@ -1,6 +1,6 @@
 # Image-To-Video
 
-This example demonstrates how to deploy image-to-video models, including Wan2.2 and Cosmos3, for online video generation using vLLM-Omni.
+This example demonstrates how to deploy Wan2.2 image-to-video models for online video generation using vLLM-Omni.
 
 ## Supported Models
 
@@ -8,7 +8,6 @@ This example demonstrates how to deploy image-to-video models, including Wan2.2
 |-------|----------|
 | Wan2.2 I2V | `Wan-AI/Wan2.2-I2V-A14B-Diffusers` |
 | Wan2.2 TI2V | `Wan-AI/Wan2.2-TI2V-5B-Diffusers` |
-| Cosmos3 I2V | `$COSMOS3_MODEL` with `Cosmos3OmniDiffusersPipeline` |
 
 ## Start Server
 
@@ -34,22 +33,6 @@ The script allows overriding:
 - `CACHE_BACKEND` (default: `none`)
 - `ENABLE_CACHE_DIT_SUMMARY` (default: `0`)
 
-### Cosmos3
-
-Cosmos3 uses one pipeline for text-to-image, text-to-video, and image-to-video. Set `COSMOS3_MODEL` to a local Diffusers-format Cosmos3 checkpoint or model reference, and select the pipeline explicitly.
-
-```bash
-export COSMOS3_MODEL=/path/to/cosmos3-diffusers
-
-vllm serve "$COSMOS3_MODEL" \
-  --omni \
-  --port 8091 \
-  --model-class-name Cosmos3OmniDiffusersPipeline \
-  --allowed-local-media-path /
-```
-
-Use `--enable-layerwise-offload`, `--cache-backend cache_dit`, `--cfg-parallel-size 2`, `--usp`, `--tensor-parallel-size`, or `--use-hsdp` as needed. Do not use `--enable-cpu-offload`; Cosmos3 does not support model-level CPU offload.
-
 ### Ascend / Local LightX2V Example
 
 For a local Wan2.2-LightX2V Diffusers directory on Ascend/NPU, you can start the server like this:
@@ -115,53 +98,6 @@ curl -X POST http://localhost:8091/v1/videos/sync \
   -o sync_i2v_output.mp4
 ```
 
-### Cosmos3 Sync Request
-
-```bash
-curl -X POST http://localhost:8091/v1/videos/sync \
-  -F "prompt=Cherry blossoms swaying gently in the breeze, petals falling, smooth motion" \
-  -F "negative_prompt=blurry, distorted, low quality" \
-  -F "input_reference=@/path/to/cherry_blossom.jpg" \
-  -F "size=1280x720" \
-  -F "num_frames=81" \
-  -F "fps=24" \
-  -F "num_inference_steps=35" \
-  -F "guidance_scale=4.0" \
-  -F "seed=42" \
-  -o cosmos3_i2v_output.mp4
-```
-
-For async generation, send the same form fields to `POST /v1/videos`, poll `GET /v1/videos/{video_id}`, and download from `GET /v1/videos/{video_id}/content`. Cosmos3 currently supports one prompt and one video per request.
-
-```bash
-create_response=$(curl -s http://localhost:8091/v1/videos \
-  -H "Accept: application/json" \
-  -F "prompt=Cherry blossoms swaying gently in the breeze, petals falling, smooth motion" \
-  -F "negative_prompt=blurry, distorted, low quality" \
-  -F "input_reference=@/path/to/cherry_blossom.jpg" \
-  -F "size=1280x720" \
-  -F "num_frames=81" \
-  -F "fps=24" \
-  -F "num_inference_steps=35" \
-  -F "guidance_scale=4.0" \
-  -F "seed=42")
-
-video_id=$(echo "$create_response" | jq -r '.id')
-while true; do
-  status=$(curl -s "http://localhost:8091/v1/videos/${video_id}" | jq -r '.status')
-  if [ "$status" = "completed" ]; then
-    break
-  fi
-  if [ "$status" = "failed" ]; then
-    echo "Video generation failed"
-    exit 1
-  fi
-  sleep 2
-done
-
-curl -L "http://localhost:8091/v1/videos/${video_id}/content" -o cosmos3_i2v_output.mp4
-```
-
 For Wan Lightning/Distill checkpoints, pass `{"sample_solver":"euler"}` via `extra_params`. The default solver is `unipc`.
 
 Example matching the local LightX2V deployment above:
diff --git a/examples/online_serving/text_to_image/README.md b/examples/online_serving/text_to_image/README.md
index 41062f718b2..d212a1aa6e9 100644
--- a/examples/online_serving/text_to_image/README.md
+++ b/examples/online_serving/text_to_image/README.md
@@ -20,21 +20,6 @@ Or use the startup script:
 bash run_server.sh
 ```
 
-### Cosmos3
-
-Cosmos3 uses one pipeline for text-to-image, text-to-video, and image-to-video. Set `COSMOS3_MODEL` to a local Diffusers-format Cosmos3 checkpoint or model reference, and select the pipeline explicitly.
-
-```bash
-export COSMOS3_MODEL=/path/to/cosmos3-diffusers
-
-vllm serve "$COSMOS3_MODEL" \
-  --omni \
-  --port 8091 \
-  --model-class-name Cosmos3OmniDiffusersPipeline
-```
-
-Use `--enable-layerwise-offload`, `--cache-backend cache_dit`, `--cfg-parallel-size 2`, `--usp`, `--tensor-parallel-size`, or `--use-hsdp` as needed. Do not use `--enable-cpu-offload`; Cosmos3 does not support model-level CPU offload.
-
 ### Start with Parallelism Acceleration
 
 Enable Tensor Parallelism and VAE Patch Parallelism for faster inference:
@@ -83,26 +68,6 @@ curl -s http://localhost:8091/v1/chat/completions \
   }' | jq -r '.choices[0].message.content[0].image_url.url' | cut -d',' -f2- | base64 -d > output.png
 ```
 
-#### Cosmos3 Images API
-
-The dedicated image endpoint sets `modalities=["image"]` internally, which selects Cosmos3 text-to-image.
-
-```bash
-curl -X POST http://localhost:8091/v1/images/generations \
-  -H "Content-Type: application/json" \
-  -d '{
-    "prompt": "A small warehouse robot carrying a blue box, clean product photography",
-    "size": "1024x1024",
-    "n": 1,
-    "num_inference_steps": 50,
-    "guidance_scale": 7.0,
-    "negative_prompt": "blurry, distorted, low quality",
-    "seed": 42
-  }' | jq -r '.data[0].b64_json' | base64 -d > cosmos3_t2i.png
-```
-
-Cosmos3 currently supports one prompt per request. Use `n` to request multiple images for that prompt.
-
 ### Method 2: Using OpenAI Python SDK
 
 ```python
diff --git a/examples/online_serving/text_to_video/README.md b/examples/online_serving/text_to_video/README.md
index 57922abd38a..a334bc2eb8d 100644
--- a/examples/online_serving/text_to_video/README.md
+++ b/examples/online_serving/text_to_video/README.md
@@ -10,7 +10,6 @@ This example demonstrates how to deploy text-to-video models for online video ge
 | Wan2.1 T2V (14B) | `Wan-AI/Wan2.1-T2V-14B-Diffusers` |
 | Wan2.2 T2V | `Wan-AI/Wan2.2-T2V-A14B-Diffusers` |
 | LTX-2 | `Lightricks/LTX-2` |
-| Cosmos3 T2V | `$COSMOS3_MODEL` with `Cosmos3OmniDiffusersPipeline` |
 
 ## Wan2.2 T2V
 
@@ -38,23 +37,6 @@ The script allows overriding:
 - `CACHE_BACKEND` (default: `none`)
 - `ENABLE_CACHE_DIT_SUMMARY` (default: `0`)
 
-## Cosmos3 T2V
-
-Cosmos3 uses one pipeline for text-to-image, text-to-video, and image-to-video. Set `COSMOS3_MODEL` to a local Diffusers-format Cosmos3 checkpoint or model reference, and select the pipeline explicitly.
-
-### Start Server
-
-```bash
-export COSMOS3_MODEL=/path/to/cosmos3-diffusers
-
-vllm serve "$COSMOS3_MODEL" \
-  --omni \
-  --port 8091 \
-  --model-class-name Cosmos3OmniDiffusersPipeline
-```
-
-Use `--enable-layerwise-offload`, `--cache-backend cache_dit`, `--cfg-parallel-size 2`, `--usp`, `--tensor-parallel-size`, or `--use-hsdp` as needed. Do not use `--enable-cpu-offload`; Cosmos3 does not support model-level CPU offload.
-
 ## Async Job Behavior
 
 `POST /v1/videos` is asynchronous. It creates a video job and immediately
@@ -100,51 +82,6 @@ curl -X POST http://localhost:8091/v1/videos/sync \
   -o sync_t2v_output.mp4
 ```
 
-### Cosmos3 Sync Request
-
-```bash
-curl -X POST http://localhost:8091/v1/videos/sync \
-  -F "prompt=A small warehouse robot moves a blue box across a clean floor." \
-  -F "negative_prompt=blurry, distorted, low quality" \
-  -F "size=1280x720" \
-  -F "num_frames=81" \
-  -F "fps=24" \
-  -F "num_inference_steps=35" \
-  -F "guidance_scale=4.0" \
-  -F "seed=42" \
-  -o cosmos3_t2v_output.mp4
-```
-
-For async generation, send the same form fields to `POST /v1/videos`, poll `GET /v1/videos/{video_id}`, and download from `GET /v1/videos/{video_id}/content`. Cosmos3 currently supports one prompt and one video per request.
-
-```bash
-create_response=$(curl -s http://localhost:8091/v1/videos \
-  -H "Accept: application/json" \
-  -F "prompt=A small warehouse robot moves a blue box across a clean floor." \
-  -F "negative_prompt=blurry, distorted, low quality" \
-  -F "size=1280x720" \
-  -F "num_frames=81" \
-  -F "fps=24" \
-  -F "num_inference_steps=35" \
-  -F "guidance_scale=4.0" \
-  -F "seed=42")
-
-video_id=$(echo "$create_response" | jq -r '.id')
-while true; do
-  status=$(curl -s "http://localhost:8091/v1/videos/${video_id}" | jq -r '.status')
-  if [ "$status" = "completed" ]; then
-    break
-  fi
-  if [ "$status" = "failed" ]; then
-    echo "Video generation failed"
-    exit 1
-  fi
-  sleep 2
-done
-
-curl -L "http://localhost:8091/v1/videos/${video_id}/content" -o cosmos3_t2v_output.mp4
-```
-
 ## Storage
 
 Generated video files are stored on local disk by the async video API.

From bd4ecb318331ec31630abd3ac40a7ca96807864c Mon Sep 17 00:00:00 2001
From: Maciej Bala <mbala@nvidia.com>
Date: Thu, 14 May 2026 14:19:12 +0200
Subject: [PATCH 04/41] Cleared up docs

Signed-off-by: Maciej Bala <mbala@nvidia.com>
---
 .../offline_inference/image_to_video.md       | 17 +---
 .../offline_inference/text_to_image.md        | 37 +++-----
 .../offline_inference/text_to_video.md        |  3 +-
 .../examples/online_serving/image_to_video.md | 88 +++++++------------
 .../examples/online_serving/text_to_image.md  | 46 ++++++----
 .../examples/online_serving/text_to_video.md  | 58 ++++++++++--
 examples/offline_inference/cosmos3/end2end.py |  9 +-
 .../image_to_video/README.md                  | 15 +---
 .../offline_inference/text_to_image/README.md |  4 +-
 .../text_to_video/text_to_video.md            |  1 -
 .../online_serving/image_to_video/README.md   |  9 +-
 .../online_serving/text_to_image/README.md    |  1 -
 .../online_serving/text_to_video/README.md    |  2 +-
 13 files changed, 143 insertions(+), 147 deletions(-)

diff --git a/docs/user_guide/examples/offline_inference/image_to_video.md b/docs/user_guide/examples/offline_inference/image_to_video.md
index cac6c1f4c95..6e105741a7e 100644
--- a/docs/user_guide/examples/offline_inference/image_to_video.md
+++ b/docs/user_guide/examples/offline_inference/image_to_video.md
@@ -5,13 +5,6 @@ Source <https://github.com/vllm-project/vllm-omni/tree/main/examples/offline_inf
 
 This example demonstrates how to generate videos from images using Wan2.2 Image-to-Video models with vLLM-Omni's offline inference API.
 
-## Supported Models
-
-| Model | Default Resolution | Default Frames | Default Steps | Guidance |
-|-------|--------------------|----------------|---------------|----------|
-| `Wan-AI/Wan2.2-I2V-A14B-Diffusers` | auto, 480p area | 81 | 50 | 5.0 |
-| `Wan-AI/Wan2.2-TI2V-5B-Diffusers` | auto, 480p area | 81 | 50 | 5.0 |
-
 ## Local CLI Usage
 
 Download the example image:
@@ -61,19 +54,17 @@ python image_to_video.py \
 Key arguments:
 
 - `--model`: Model ID (I2V-A14B for MoE, TI2V-5B for unified T2V+I2V).
-- `--model-class-name`: explicit pipeline class override.
 - `--image`: Path to input image (required).
 - `--prompt`: Text description of desired motion/animation.
 - `--height/--width`: Output resolution (auto-calculated from image if not set). Dimensions should be multiples of 16.
-- `--num-frames`: Number of frames (default is model-specific).
+- `--num-frames`: Number of frames (default 81).
 - `--guidance-scale` and `--guidance-scale-high`: CFG scale (applied to low/high-noise stages for MoE).
 - `--negative-prompt`: Optional list of artifacts to suppress.
 - `--boundary-ratio`: Boundary split ratio for two-stage MoE models.
-- `--flow-shift`: Scheduler flow shift. Defaults are model-specific.
+- `--flow-shift`: Scheduler flow shift (5.0 for 720p, 12.0 for 480p).
 - `--sample-solver`: Wan2.2 sampling solver. Use `unipc` for the default multistep solver, or `euler` for Lightning/Distill checkpoints.
-- `--num-inference-steps`: Number of denoising steps (default is model-specific).
+- `--num-inference-steps`: Number of denoising steps (default 50).
 - `--fps`: Frames per second for the saved MP4 (requires `diffusers` export_to_video).
-- `--frame-rate`: Generation frame rate for models that use it. Defaults to `--fps`.
 - `--output`: Path to save the generated video.
 - `--vae-use-slicing`: Enable VAE slicing for memory optimization.
 - `--vae-use-tiling`: Enable VAE tiling for memory optimization.
@@ -89,7 +80,7 @@ Key arguments:
 > ℹ️ If you encounter OOM errors, try using `--vae-use-slicing` and `--vae-use-tiling` to reduce memory usage.
 
 For Wan2.2 LightX2V-converted local Diffusers directories and related LoRA
-assets, see the [LoRA guide](https://github.com/vllm-project/vllm-omni/tree/main/docs/user_guide/diffusion/lora.md#wan22-lightx2v-offline-assembly).
+assets, see the [LoRA guide](../../diffusion/lora.md#wan22-lightx2v-offline-assembly).
 
 ## Example materials
 
diff --git a/docs/user_guide/examples/offline_inference/text_to_image.md b/docs/user_guide/examples/offline_inference/text_to_image.md
index d2c87da6458..3a97ffbf74b 100644
--- a/docs/user_guide/examples/offline_inference/text_to_image.md
+++ b/docs/user_guide/examples/offline_inference/text_to_image.md
@@ -32,12 +32,10 @@ This folder provides several entrypoints for experimenting with text-to-image di
 | `AIDC-AI/Ovis-Image-7B` | 1024 x 1024 | 71.8 | 17.1 |
 | `OmniGen2/OmniGen2` |  1024 x 1024 | 20.1 | 14.7 |
 | `stabilityai/stable-diffusion-3.5-medium` | 1024 x 1024 | 20.1 | 15.6 |
-| `black-forest-labs/FLUX.1-dev` | 1024 x 1024 | 33.9 | 31.4 |
-| `black-forest-labs/FLUX.1-schnell` | 1024 x 1024 | 33.9 | 31.4 |
+| `black-forest-labs/FLUX.1-dev` | 1024 x 1024 | 77.6 | 31.4 |
 | `black-forest-labs/FLUX.2-klein-4B` | 1024 x 1024 | 72.7 | 14.9 |
 | `black-forest-labs/FLUX.2-klein-9B` | 1024 x 1024 | 37.1 | 32.3 |
 | `black-forest-labs/FLUX.2-dev` | 1024 x 1024 | 65.7 | >80 (CPU offload required) |
-| `HunyuanImage-3.0` | 1024 x 1024 | 80.0 (TP≥3)  | 160 |
 
 !!! info
 *Peak VRAM:  based on basic single-card usage, batch size =1, without any acceleration/optimization features. FLUX.2-dev requires `--enable-cpu-offload` on a single 80 GiB GPU.
@@ -76,13 +74,11 @@ python text_to_image.py \
 
 | Argument | Type | Default | Description |
 | -------- | ---- | ------- | ----------- |
-| `--model` | str | `"Qwen/Qwen-Image"` | Diffusion model name or local path |
-| `--model-class-name` | str | `None` | Override pipeline class |
 | `--prompt` | str | `"a cup of coffee on the table"` | Text description for image generation |
 | `--seed` | int | `142` | Integer seed for deterministic sampling |
 | `--negative-prompt` | str | `None` | Negative prompt for classifier-free conditional guidance |
 | `--cfg-scale` | float | `4.0` | True CFG scale (model-specific guidance strength) |
-| `--guidance-scale` | float | `4.0` | Classifier-free guidance scale |
+| `--guidance-scale` | float | `1.0` | Classifier-free guidance scale |
 | `--num-images-per-prompt` | int | `1` | Number of images per prompt (saved as `output`, `output_1`, ...) |
 | `--num-inference-steps` | int | `50` | Diffusion sampling steps (more steps = higher quality, slower) |
 | `--height` | int | `1024` | Output image height in pixels |
@@ -91,14 +87,9 @@ python text_to_image.py \
 | `--vae-use-slicing` | flag | off | Enable VAE slicing for memory optimization |
 | `--vae-use-tiling` | flag | off | Enable VAE tiling for memory optimization |
 | `--cfg-parallel-size` | int | `1` | Set to `2` to enable CFG Parallel |
-| `--ulysses-degree` | int | `1` | Ulysses sequence parallel degree for multi-GPU inference |
-| `--ring-degree` | int | `1` | Ring sequence parallel degree for hybrid Ulysses + Ring inference |
-| `--ulysses-mode` | str | `"strict"` | Ulysses SP mode: `"strict"` or `"advanced_uaa"` |
 | `--enable-cpu-offload` | flag | off | Enable CPU offloading for diffusion models |
 | `--lora-path` | str | — | Path to PEFT LoRA adapter folder |
 | `--lora-scale` | float | `1.0` | Scale factor for LoRA weights |
-| `--use-system-prompt` | str | `None` | System prompt preset: `en_unified`, `en_vanilla`, `en_recaption`, `en_think_recaption`, `dynamic`, `None`, or custom text. Recommended: `en_unified`. Only for HunyuanImage-3.0.|
-| `--system-prompt` | str | `None` | Custom system prompt text. Only used when `--use-system-prompt` is set to `custom`. Only for HunyuanImage-3.0.|
 
 **NextStep-1.1 specific arguments:**
 
@@ -132,19 +123,6 @@ python text_to_image.py \
 
 `Tongyi-MAI/Z-Image-Turbo` is a distilled version of Z-Image. Distilled diffusion models usually require less number of inference steps (4~9), and Classifier-Free Guidance (CFG) is usually NOT applied. Similar distilled models are `black-forest-labs/FLUX.2-klein-4B` and `black-forest-labs/FLUX.2-klein-9B`.
 
-Advanced UAA example (requires 2 GPUs):
-
-```bash
-python text_to_image.py \
-  --model Tongyi-MAI/Z-Image-Turbo \
-  --prompt "a cup of coffee on the table" \
-  --ulysses-degree 2 \
-  --ulysses-mode advanced_uaa \
-  --height 1024 \
-  --width 1024 \
-  --output outputs/coffee_hybrid.png
-```
-
 ### NextStep Models
 
 NextStep-1.1 supports extra arguments for dual-level CFG control:
@@ -253,7 +231,7 @@ python examples/offline_inference/text_to_image/text_to_image.py \
 #### CFG Parallel
 
 Set `--cfg-parallel-size 2` to enable CFG Parallel for faster inference on multi-GPU setups.
-See more examples in the [cfg_parallel user guide](https://github.com/vllm-project/vllm-omni/tree/main/docs/user_guide/parallelism/cfg_parallel.md#using-cfg-parallel).
+See more examples in the [diffusion acceleration user guide](https://github.com/vllm-project/vllm-omni/tree/main/docs/user_guide/diffusion_acceleration.md#using-cfg-parallel).
 
 #### LoRA
 
@@ -278,6 +256,15 @@ lora_adapter/
 
 ## Web UI Demo
 
+!!! note "Gradio is an optional dependency"
+    The Gradio demo requires the `[demo]` extras. Install them first:
+
+    ```bash
+    pip install 'vllm-omni[demo]'
+    ```
+
+    Or, if installing from source: `pip install -e '.[demo]'`
+
 Launch the Gradio demo:
 
 ```bash
diff --git a/docs/user_guide/examples/offline_inference/text_to_video.md b/docs/user_guide/examples/offline_inference/text_to_video.md
index bb7a1d43ece..a09dbfc979f 100644
--- a/docs/user_guide/examples/offline_inference/text_to_video.md
+++ b/docs/user_guide/examples/offline_inference/text_to_video.md
@@ -5,6 +5,8 @@ Source <https://github.com/vllm-project/vllm-omni/tree/main/examples/offline_inf
 
 A unified script for text-to-video generation. Supports multiple models with model-aware defaults.
 
+For backend selection and SageAttention usage, see the [Diffusion Attention Backends](../../diffusion/attention_backends.md) guide.
+
 ## Supported Models
 
 | Model | Default Resolution | Default Frames | Default Steps | Guidance | VRAM (BF16) |
@@ -125,7 +127,6 @@ python text_to_video.py \
 - `--audio-sample-rate`: audio sample rate for embedded audio (when the pipeline returns audio).
 - `--quantization`: quantization method (`fp8` for FP8, `gguf` for GGUF).
 - `--flow-shift`: scheduler flow_shift parameter.
-- `--cache-backend`: `cache_dit` for supported models.
 
 ### Wan2.2-specific
 
diff --git a/docs/user_guide/examples/online_serving/image_to_video.md b/docs/user_guide/examples/online_serving/image_to_video.md
index b30bbcdf80b..781f0c2a5ed 100644
--- a/docs/user_guide/examples/online_serving/image_to_video.md
+++ b/docs/user_guide/examples/online_serving/image_to_video.md
@@ -3,14 +3,7 @@
 Source <https://github.com/vllm-project/vllm-omni/tree/main/examples/online_serving/image_to_video>.
 
 
-This example demonstrates how to deploy Wan2.2 image-to-video models for online video generation using vLLM-Omni.
-
-## Supported Models
-
-| Model | Model ID |
-|-------|----------|
-| Wan2.2 I2V | `Wan-AI/Wan2.2-I2V-A14B-Diffusers` |
-| Wan2.2 TI2V | `Wan-AI/Wan2.2-TI2V-5B-Diffusers` |
+This example demonstrates how to deploy the Wan2.2 image-to-video model for online video generation using vLLM-Omni.
 
 ## Start Server
 
@@ -36,23 +29,6 @@ The script allows overriding:
 - `CACHE_BACKEND` (default: `none`)
 - `ENABLE_CACHE_DIT_SUMMARY` (default: `0`)
 
-### Ascend / Local LightX2V Example
-
-For a local Wan2.2-LightX2V Diffusers directory on Ascend/NPU, you can start the server like this:
-
-```bash
-vllm serve /path/to/Wan2.2-I2V-A14B-LightX2V-Diffusers-Lightning \
-  --omni \
-  --port 8091 \
-  --flow-shift 12 \
-  --cfg-parallel-size 1 \
-  --ulysses-degree 4 \
-  --use-hsdp \
-  --trust-remote-code \
-  --allowed-local-media-path / \
-  --seed 42
-```
-
 ## Async Job Behavior
 
 `POST /v1/videos` is asynchronous. It creates a video job and immediately
@@ -96,35 +72,13 @@ curl -X POST http://localhost:8091/v1/videos/sync \
   -F "guidance_scale_2=1.0" \
   -F "boundary_ratio=0.875" \
   -F "flow_shift=12.0" \
-  -F 'extra_params={"sample_solver":"euler"}' \
+  -F "enable_frame_interpolation=true" \
+  -F "frame_interpolation_exp=1" \
+  -F "frame_interpolation_scale=1.0" \
   -F "seed=42" \
   -o sync_i2v_output.mp4
 ```
 
-For Wan Lightning/Distill checkpoints, pass `{"sample_solver":"euler"}` via `extra_params`. The default solver is `unipc`.
-
-Example matching the local LightX2V deployment above:
-
-```bash
-curl -sS -X POST http://localhost:8091/v1/videos/sync \
-  -H "Accept: video/mp4" \
-  -F "prompt=A cat playing with yarn" \
-  -F "input_reference=@/path/to/input.jpg" \
-  -F "width=832" \
-  -F "height=480" \
-  -F "num_frames=81" \
-  -F "fps=16" \
-  -F "num_inference_steps=4" \
-  -F "guidance_scale=1.0" \
-  -F "guidance_scale_2=1.0" \
-  -F "boundary_ratio=0.875" \
-  -F "seed=42" \
-  -F 'extra_params={"sample_solver":"euler"}' \
-  -o ./output.mp4
-```
-
-Use `/v1/videos/sync` if you want to write the MP4 directly to a file. `POST /v1/videos` is async and returns job metadata, not inline `b64_json`.
-
 ## Storage
 
 Generated video files are stored on local disk by the async video API.
@@ -148,9 +102,6 @@ export VLLM_OMNI_STORAGE_MAX_CONCURRENCY=8
 # Basic image-to-video generation
 bash run_curl_image_to_video.sh
 
-# Wan Lightning/Distill checkpoints
-SAMPLE_SOLVER=euler bash run_curl_image_to_video.sh
-
 # Or execute directly (OpenAI-style multipart)
 create_response=$(curl -s http://localhost:8091/v1/videos \
   -H "Accept: application/json" \
@@ -166,7 +117,9 @@ create_response=$(curl -s http://localhost:8091/v1/videos \
   -F "guidance_scale_2=1.0" \
   -F "boundary_ratio=0.875" \
   -F "flow_shift=12.0" \
-  -F 'extra_params={"sample_solver":"euler"}' \
+  -F "enable_frame_interpolation=true" \
+  -F "frame_interpolation_exp=1" \
+  -F "frame_interpolation_scale=1.0" \
   -F "seed=42")
 
 video_id=$(echo "$create_response" | jq -r '.id')
@@ -225,11 +178,34 @@ curl -X POST http://localhost:8091/v1/videos \
   -F "guidance_scale_2=1.0" \
   -F "boundary_ratio=0.875" \
   -F "flow_shift=12.0" \
-  -F 'extra_params={"sample_solver":"euler"}' \
+  -F "enable_frame_interpolation=true" \
+  -F "frame_interpolation_exp=1" \
+  -F "frame_interpolation_scale=1.0" \
   -F "seed=42"
 ```
 
-`sample_solver` is supported by Wan2.2 online serving through the existing `extra_params` field, which is merged into the pipeline `extra_args`. Use `unipc` for the default multistep solver, or `euler` for Lightning/Distill checkpoints.
+Frame interpolation is also available for supported Wan2.2 I2V requests. See
+[Frame Interpolation](../../diffusion/frame_interpolation.md) for worker-side
+execution details and feature constraints.
+
+### Frame Interpolation Example
+
+```bash
+curl -X POST http://localhost:8091/v1/videos/sync \
+  -F "prompt=A bear playing with yarn, smooth motion" \
+  -F "input_reference=@/path/to/qwen-bear.png" \
+  -F "width=832" \
+  -F "height=480" \
+  -F "num_frames=33" \
+  -F "fps=16" \
+  -F "num_inference_steps=40" \
+  -F "guidance_scale=1.0" \
+  -F "guidance_scale_2=1.0" \
+  -F "enable_frame_interpolation=true" \
+  -F "frame_interpolation_exp=1" \
+  -F "frame_interpolation_scale=1.0" \
+  -o sync_i2v_interpolated.mp4
+```
 
 ## Create Response Format
 
diff --git a/docs/user_guide/examples/online_serving/text_to_image.md b/docs/user_guide/examples/online_serving/text_to_image.md
index 47b916de080..69c1480e39f 100644
--- a/docs/user_guide/examples/online_serving/text_to_image.md
+++ b/docs/user_guide/examples/online_serving/text_to_image.md
@@ -97,12 +97,6 @@ with open("output.png", "wb") as f:
     f.write(base64.b64decode(b64_data))
 ```
 
-!!! note
-    The OpenAI SDK's `extra_body` keyword argument merges parameters into the
-    top-level request body automatically. When using curl or Python `requests`,
-    wrap generation parameters inside a literal `"extra_body"` key in the JSON
-    instead (as shown in the curl example above).
-
 ### Method 3: Using Python Client Script
 
 ```bash
@@ -111,6 +105,15 @@ python openai_chat_client.py --prompt "A beautiful landscape painting" --output
 
 ### Method 4: Using Gradio Demo
 
+!!! note "Gradio is an optional dependency"
+    The Gradio demo requires the `[demo]` extras. Install them first:
+
+    ```bash
+    pip install 'vllm-omni[demo]'
+    ```
+
+    Or, if installing from source: `pip install -e '.[demo]'`
+
 ```bash
 python gradio_demo.py
 # Visit http://localhost:7860
@@ -183,7 +186,7 @@ lora_adapter/
 
 ### Generation with Parameters
 
-Use `extra_body` to pass generation parameters:
+Wrap generation parameters inside `extra_body` in the request JSON:
 
 ```json
 {
@@ -200,6 +203,21 @@ Use `extra_body` to pass generation parameters:
 }
 ```
 
+!!! tip "Using the OpenAI SDK"
+    When using the OpenAI Python SDK, pass these parameters via the `extra_body`
+    keyword argument. The SDK merges them into the top-level request body automatically:
+
+    ```python
+    client.chat.completions.create(
+        model="Qwen/Qwen-Image",
+        messages=[...],
+        extra_body={"height": 1024, "width": 1024, "num_inference_steps": 50},
+    )
+    ```
+
+    For details on how generation parameters are handled across different clients, see the
+    [Diffusion Chat API guide](../../../../serving/diffusion_chat_api.md).
+
 ### Multimodal Input (Text + Structured Content)
 
 ```json
@@ -218,25 +236,23 @@ Use `extra_body` to pass generation parameters:
 ## Generation Parameters
 
 When using `/v1/chat/completions`, pass these inside `extra_body` in the curl
-JSON, or via the `extra_body` keyword argument in the OpenAI Python SDK.
-When using the dedicated `/v1/images/generations` endpoint, pass the supported
-generation controls as top-level JSON fields directly. For image dimensions and
-count, use `size` and `n` rather than `height`, `width`, or
-`num_outputs_per_prompt`.
+JSON, or via the `extra_body` keyword argument in the OpenAI Python SDK (see the
+[Diffusion Chat API guide](../../../../serving/diffusion_chat_api.md)).
+When using the dedicated [`/v1/images/generations`](../../../../serving/image_generation_api.md)
+endpoint, pass the supported generation controls as top-level JSON fields
+directly. For image dimensions and count, use `size` and `n` rather than
+`height`, `width`, or `num_outputs_per_prompt`.
 
 | Parameter                | Type  | Default | Description                    |
 | ------------------------ | ----- | ------- | ------------------------------ |
 | `height`                 | int   | None    | Image height in pixels         |
 | `width`                  | int   | None    | Image width in pixels          |
 | `size`                   | str   | None    | Image size (e.g., "1024x1024") |
-| `n`                      | int   | 1       | Number of images for `/v1/images/generations` |
 | `num_inference_steps`    | int   | 50      | Number of denoising steps      |
 | `true_cfg_scale`         | float | 4.0     | Qwen-Image CFG scale           |
 | `seed`                   | int   | None    | Random seed (reproducible)     |
 | `negative_prompt`        | str   | None    | Negative prompt                |
 | `num_outputs_per_prompt` | int   | 1       | Number of images to generate   |
-| `use_system_prompt` | str | None | System prompt preset: `en_unified`, `en_vanilla`, `en_recaption`, `en_think_recaption`, `dynamic`, `None`, or custom text string. Only for HunyuanImage-3.0. |
-| `system_prompt` | str | None | Custom system prompt text. Only used when `use_system_prompt` is set to `custom`. Only for HunyuanImage-3.0. |
 
 ## Response Format
 
diff --git a/docs/user_guide/examples/online_serving/text_to_video.md b/docs/user_guide/examples/online_serving/text_to_video.md
index f045e0d44a4..b918aac19d0 100644
--- a/docs/user_guide/examples/online_serving/text_to_video.md
+++ b/docs/user_guide/examples/online_serving/text_to_video.md
@@ -165,6 +165,9 @@ curl -X POST http://localhost:8091/v1/videos \
   -F "guidance_scale_2=4.0" \
   -F "boundary_ratio=0.875" \
   -F "flow_shift=5.0" \
+  -F "enable_frame_interpolation=true" \
+  -F "frame_interpolation_exp=1" \
+  -F "frame_interpolation_scale=1.0" \
   -F "seed=42"
 ```
 
@@ -184,9 +187,38 @@ curl -X POST http://localhost:8091/v1/videos \
 | `guidance_scale`      | float  | None    | CFG guidance scale (low-noise stage)             |
 | `guidance_scale_2`    | float  | None    | CFG guidance scale (high-noise stage, Wan2.2)     |
 | `boundary_ratio`      | float  | None    | Boundary split ratio for low/high DiT (Wan2.2)   |
-| `flow_shift`          | float  | None    | Scheduler flow shift                             |
+| `flow_shift`          | float  | None    | Scheduler flow shift (Wan2.2)                    |
 | `seed`                | int    | None    | Random seed (reproducible)                       |
 | `lora`                | object | None    | LoRA configuration                               |
+| `enable_frame_interpolation` | bool | false | Enable RIFE frame interpolation before MP4 encoding |
+| `frame_interpolation_exp` | int | 1 | Interpolation exponent; 1=2x temporal resolution, 2=4x |
+| `frame_interpolation_scale` | float | 1.0 | RIFE inference scale; use 0.5 for high-resolution inputs |
+| `frame_interpolation_model_path` | str | None | Local directory or Hugging Face repo ID with `flownet.pkl`; defaults to `elfgum/RIFE-4.22.lite` |
+
+## Frame Interpolation
+
+Frame interpolation is an optional post-processing step for `/v1/videos` and
+`/v1/videos/sync`. It synthesizes intermediate frames between generated frames
+without rerunning the diffusion model. If the generated video has `N` frames,
+the interpolated output frame count is `(N - 1) * 2**exp + 1`. The encoder FPS
+is multiplied by `2**exp` so the output duration remains close to the original.
+
+Frame interpolation runs in the diffusion worker post-processing path instead of
+the API server encoding path, so it can reuse the worker's current accelerator
+device without blocking the FastAPI event loop.
+
+Example: generate 5 frames and interpolate to 9 frames:
+
+```bash
+curl -X POST http://localhost:8091/v1/videos/sync \
+  -F "prompt=A dog running through a park" \
+  -F "num_frames=5" \
+  -F "fps=8" \
+  -F "enable_frame_interpolation=true" \
+  -F "frame_interpolation_exp=1" \
+  -F "frame_interpolation_scale=1.0" \
+  -o sync_t2v_interpolated.mp4
+```
 
 ## Create Response Format
 
@@ -256,6 +288,14 @@ vllm serve Lightricks/LTX-2 --omni --port 8098 \
     --enforce-eager --flow-shift 1.0 --boundary-ratio 1.0
 ```
 
+For multi-GPU memory reduction, you can enable HSDP:
+
+```bash
+vllm serve Lightricks/LTX-2 --omni --port 8098 \
+    --enforce-eager --flow-shift 1.0 --boundary-ratio 1.0 \
+    --use-hsdp --hsdp-shard-size 2
+```
+
 #### Start with Optimization Presets
 
 Use the LTX-2 startup script with built-in optimization presets:
@@ -326,13 +366,17 @@ curl -sS -X POST http://localhost:8098/v1/videos \
 
 ## Example materials
 
-??? abstract "run_curl_hunyuan_video_15.sh"
-    ``````sh
-    --8<-- "examples/online_serving/text_to_video/run_curl_hunyuan_video_15.sh"
+??? abstract "response.json"
+    ``````json
+    --8<-- "examples/online_serving/text_to_video/response.json"
     ``````
 ??? abstract "run_curl_ltx2.sh"
     ``````sh
     --8<-- "examples/online_serving/text_to_video/run_curl_ltx2.sh"
+    ``````
+??? abstract "run_curl_hunyuan_video_15.sh"
+    ``````sh
+    --8<-- "examples/online_serving/text_to_video/run_curl_hunyuan_video_15.sh"
     ``````
 ??? abstract "run_curl_text_to_video.sh"
     ``````sh
@@ -342,11 +385,11 @@ curl -sS -X POST http://localhost:8098/v1/videos \
     ``````sh
     --8<-- "examples/online_serving/text_to_video/run_server.sh"
     ``````
-??? abstract "run_server_hunyuan_video_15.sh"
-    ``````sh
-    --8<-- "examples/online_serving/text_to_video/run_server_hunyuan_video_15.sh"
-    ``````
 ??? abstract "run_server_ltx2.sh"
     ``````sh
     --8<-- "examples/online_serving/text_to_video/run_server_ltx2.sh"
+    ``````
+??? abstract "run_server_hunyuan_video_15.sh"
+    ``````sh
+    --8<-- "examples/online_serving/text_to_video/run_server_hunyuan_video_15.sh"
     ``````
diff --git a/examples/offline_inference/cosmos3/end2end.py b/examples/offline_inference/cosmos3/end2end.py
index 93525a39019..c0d1141f28a 100644
--- a/examples/offline_inference/cosmos3/end2end.py
+++ b/examples/offline_inference/cosmos3/end2end.py
@@ -18,7 +18,6 @@
 from vllm_omni.outputs import OmniRequestOutput
 from vllm_omni.platforms import current_omni_platform
 
-
 DEFAULT_NEGATIVE_PROMPT = "blurry, distorted, low quality"
 TASK_DEFAULTS = {
     "t2i": {
@@ -492,11 +491,15 @@ def main() -> None:
         return
 
     video, audio, returned_sample_rate, action = _extract_video_audio_action(outputs)
-    _save_video(video, output_path, fps=fps, audio=audio, audio_sample_rate=returned_sample_rate or args.audio_sample_rate)
+    _save_video(
+        video, output_path, fps=fps, audio=audio, audio_sample_rate=returned_sample_rate or args.audio_sample_rate
+    )
     print(f"Saved video to {output_path}")
 
     if args.task == "action_policy":
-        action_path = Path(args.action_output) if args.action_output else output_path.with_name(f"{output_path.stem}_action.json")
+        action_path = (
+            Path(args.action_output) if args.action_output else output_path.with_name(f"{output_path.stem}_action.json")
+        )
         action_path.parent.mkdir(parents=True, exist_ok=True)
         action_path.write_text(json.dumps(_jsonable(action), indent=2) + "\n", encoding="utf-8")
         print(f"Saved action metadata to {action_path}")
diff --git a/examples/offline_inference/image_to_video/README.md b/examples/offline_inference/image_to_video/README.md
index e667d1eafcc..a458850a02b 100644
--- a/examples/offline_inference/image_to_video/README.md
+++ b/examples/offline_inference/image_to_video/README.md
@@ -2,13 +2,6 @@
 
 This example demonstrates how to generate videos from images using Wan2.2 Image-to-Video models with vLLM-Omni's offline inference API.
 
-## Supported Models
-
-| Model | Default Resolution | Default Frames | Default Steps | Guidance |
-|-------|--------------------|----------------|---------------|----------|
-| `Wan-AI/Wan2.2-I2V-A14B-Diffusers` | auto, 480p area | 81 | 50 | 5.0 |
-| `Wan-AI/Wan2.2-TI2V-5B-Diffusers` | auto, 480p area | 81 | 50 | 5.0 |
-
 ## Local CLI Usage
 
 Download the example image:
@@ -58,19 +51,17 @@ python image_to_video.py \
 Key arguments:
 
 - `--model`: Model ID (I2V-A14B for MoE, TI2V-5B for unified T2V+I2V).
-- `--model-class-name`: explicit pipeline class override.
 - `--image`: Path to input image (required).
 - `--prompt`: Text description of desired motion/animation.
 - `--height/--width`: Output resolution (auto-calculated from image if not set). Dimensions should be multiples of 16.
-- `--num-frames`: Number of frames (default is model-specific).
+- `--num-frames`: Number of frames (default 81).
 - `--guidance-scale` and `--guidance-scale-high`: CFG scale (applied to low/high-noise stages for MoE).
 - `--negative-prompt`: Optional list of artifacts to suppress.
 - `--boundary-ratio`: Boundary split ratio for two-stage MoE models.
-- `--flow-shift`: Scheduler flow shift. Defaults are model-specific.
+- `--flow-shift`: Scheduler flow shift (5.0 for 720p, 12.0 for 480p).
 - `--sample-solver`: Wan2.2 sampling solver. Use `unipc` for the default multistep solver, or `euler` for Lightning/Distill checkpoints.
-- `--num-inference-steps`: Number of denoising steps (default is model-specific).
+- `--num-inference-steps`: Number of denoising steps (default 50).
 - `--fps`: Frames per second for the saved MP4 (requires `diffusers` export_to_video).
-- `--frame-rate`: Generation frame rate for models that use it. Defaults to `--fps`.
 - `--output`: Path to save the generated video.
 - `--vae-use-slicing`: Enable VAE slicing for memory optimization.
 - `--vae-use-tiling`: Enable VAE tiling for memory optimization.
diff --git a/examples/offline_inference/text_to_image/README.md b/examples/offline_inference/text_to_image/README.md
index ef5c526f40f..c71773972b3 100644
--- a/examples/offline_inference/text_to_image/README.md
+++ b/examples/offline_inference/text_to_image/README.md
@@ -73,13 +73,11 @@ python text_to_image.py \
 
 | Argument | Type | Default | Description |
 | -------- | ---- | ------- | ----------- |
-| `--model` | str | `"Qwen/Qwen-Image"` | Diffusion model name or local path |
-| `--model-class-name` | str | `None` | Override pipeline class |
 | `--prompt` | str | `"a cup of coffee on the table"` | Text description for image generation |
 | `--seed` | int | `142` | Integer seed for deterministic sampling |
 | `--negative-prompt` | str | `None` | Negative prompt for classifier-free conditional guidance |
 | `--cfg-scale` | float | `4.0` | True CFG scale (model-specific guidance strength) |
-| `--guidance-scale` | float | `4.0` | Classifier-free guidance scale |
+| `--guidance-scale` | float | `1.0` | Classifier-free guidance scale |
 | `--num-images-per-prompt` | int | `1` | Number of images per prompt (saved as `output`, `output_1`, ...) |
 | `--num-inference-steps` | int | `50` | Diffusion sampling steps (more steps = higher quality, slower) |
 | `--height` | int | `1024` | Output image height in pixels |
diff --git a/examples/offline_inference/text_to_video/text_to_video.md b/examples/offline_inference/text_to_video/text_to_video.md
index 936a9078179..f852e980a78 100644
--- a/examples/offline_inference/text_to_video/text_to_video.md
+++ b/examples/offline_inference/text_to_video/text_to_video.md
@@ -122,7 +122,6 @@ python text_to_video.py \
 - `--audio-sample-rate`: audio sample rate for embedded audio (when the pipeline returns audio).
 - `--quantization`: quantization method (`fp8` for FP8, `gguf` for GGUF).
 - `--flow-shift`: scheduler flow_shift parameter.
-- `--cache-backend`: `cache_dit` for supported models.
 
 ### Wan2.2-specific
 
diff --git a/examples/online_serving/image_to_video/README.md b/examples/online_serving/image_to_video/README.md
index 067c18d5e5f..285eeb27983 100644
--- a/examples/online_serving/image_to_video/README.md
+++ b/examples/online_serving/image_to_video/README.md
@@ -1,13 +1,6 @@
 # Image-To-Video
 
-This example demonstrates how to deploy Wan2.2 image-to-video models for online video generation using vLLM-Omni.
-
-## Supported Models
-
-| Model | Model ID |
-|-------|----------|
-| Wan2.2 I2V | `Wan-AI/Wan2.2-I2V-A14B-Diffusers` |
-| Wan2.2 TI2V | `Wan-AI/Wan2.2-TI2V-5B-Diffusers` |
+This example demonstrates how to deploy the Wan2.2 image-to-video model for online video generation using vLLM-Omni.
 
 ## Start Server
 
diff --git a/examples/online_serving/text_to_image/README.md b/examples/online_serving/text_to_image/README.md
index d212a1aa6e9..17d377ea3e2 100644
--- a/examples/online_serving/text_to_image/README.md
+++ b/examples/online_serving/text_to_image/README.md
@@ -226,7 +226,6 @@ count, use `size` and `n` rather than `height`, `width`, or
 | `height`                 | int   | None    | Image height in pixels         |
 | `width`                  | int   | None    | Image width in pixels          |
 | `size`                   | str   | None    | Image size (e.g., "1024x1024") |
-| `n`                      | int   | 1       | Number of images for `/v1/images/generations` |
 | `num_inference_steps`    | int   | 50      | Number of denoising steps      |
 | `true_cfg_scale`         | float | 4.0     | Qwen-Image CFG scale           |
 | `seed`                   | int   | None    | Random seed (reproducible)     |
diff --git a/examples/online_serving/text_to_video/README.md b/examples/online_serving/text_to_video/README.md
index a334bc2eb8d..c01e0602ff9 100644
--- a/examples/online_serving/text_to_video/README.md
+++ b/examples/online_serving/text_to_video/README.md
@@ -181,7 +181,7 @@ curl -X POST http://localhost:8091/v1/videos \
 | `guidance_scale`      | float  | None    | CFG guidance scale (low-noise stage)             |
 | `guidance_scale_2`    | float  | None    | CFG guidance scale (high-noise stage, Wan2.2)     |
 | `boundary_ratio`      | float  | None    | Boundary split ratio for low/high DiT (Wan2.2)   |
-| `flow_shift`          | float  | None    | Scheduler flow shift                             |
+| `flow_shift`          | float  | None    | Scheduler flow shift (Wan2.2)                    |
 | `seed`                | int    | None    | Random seed (reproducible)                       |
 | `lora`                | object | None    | LoRA configuration                               |
 

From 921cc4bb30b2f29d8bedcbf8764cba6a86894856 Mon Sep 17 00:00:00 2001
From: Maciej Bala <mbala@nvidia.com>
Date: Thu, 14 May 2026 18:01:46 +0200
Subject: [PATCH 05/41] Fixed sound quality issues

Signed-off-by: Maciej Bala <mbala@nvidia.com>
---
 .../models/cosmos3/test_cosmos3_pipeline.py   |  47 ++-
 .../cosmos3/test_cosmos3_transformer.py       |  51 ++-
 .../models/cosmos3/audio_tokenizer/avae.py    |  19 +-
 .../models/cosmos3/pipeline_cosmos3.py        |  39 +-
 .../models/cosmos3/sound_tokenizer.py         | 387 ++++++++++++++++--
 .../models/cosmos3/transformer_cosmos3.py     |  25 +-
 6 files changed, 485 insertions(+), 83 deletions(-)

diff --git a/tests/diffusion/models/cosmos3/test_cosmos3_pipeline.py b/tests/diffusion/models/cosmos3/test_cosmos3_pipeline.py
index b068ea7e74a..f86e451fdaa 100644
--- a/tests/diffusion/models/cosmos3/test_cosmos3_pipeline.py
+++ b/tests/diffusion/models/cosmos3/test_cosmos3_pipeline.py
@@ -367,13 +367,10 @@ def test_prepare_sound_latents_uses_lazy_tokenizer_and_duration(self, make_cosmo
         class FakeSoundTokenizer:
             sample_rate = 10
             latent_ch = 3
-
-            def get_latent_num_samples(self, samples: int) -> int:
-                assert samples == 20
-                return 5
+            hop_size = 4
 
             def decode(self, latents: torch.Tensor) -> torch.Tensor:
-                return torch.ones(latents.shape[0], 2, 7)
+                return torch.ones(latents.shape[0], 2, 24)
 
         pipeline._sound_tokenizer = FakeSoundTokenizer()
 
@@ -383,15 +380,15 @@ def decode(self, latents: torch.Tensor) -> torch.Tensor:
             frame_rate=3.0,
         )
         latents, latent_frames = pipeline._prepare_sound_latents(
-            target_samples,
+            21,
             torch.Generator(device="cpu").manual_seed(0),
         )
-        audio = pipeline._decode_sound_latents(torch.zeros(1, 3, 5), target_audio_samples=5)
+        audio = pipeline._decode_sound_latents(torch.zeros(1, 3, 6), target_audio_samples=21)
 
         assert (target_samples, duration, sample_rate) == (20, 2.0, 10)
-        assert latents.shape == (1, 3, 5)
-        assert latent_frames == 5
-        assert audio.shape == (1, 2, 5)
+        assert latents.shape == (1, 3, 6)
+        assert latent_frames == 6
+        assert audio.shape == (1, 2, 21)
 
     def test_init_eagerly_loads_sound_tokenizer_when_transformer_supports_sound(
         self,
@@ -781,6 +778,7 @@ def test_forward_uses_t2i_defaults_and_generates_multiple_outputs(self, make_cos
         assert captured["flow_shifts"] == [3.0]
         assert captured["scheduler_steps"] == [50, 50]
         assert captured["format"]["is_t2i"] is True
+        assert captured["format"]["negative_prompt"] == ""
         assert captured["format"]["height"] == 1024
         assert captured["format"]["width"] == 1024
         assert captured["format"]["num_frames"] == 1
@@ -789,6 +787,8 @@ def test_forward_uses_t2i_defaults_and_generates_multiple_outputs(self, make_cos
         assert output.output["image"].shape[0] == 2
 
     def test_forward_uses_t2v_defaults_and_engine_flow_shift(self, make_cosmos3_pipeline) -> None:
+        from vllm_omni.diffusion.models.cosmos3.pipeline_cosmos3 import COSMOS3_T2V_NEGATIVE_PROMPT
+
         pipeline = make_cosmos3_pipeline()
         captured = self._install_forward_stubs(pipeline)
         req = SimpleNamespace(
@@ -801,9 +801,11 @@ def test_forward_uses_t2v_defaults_and_engine_flow_shift(self, make_cosmos3_pipe
         assert captured["flow_shifts"] == [1.0]
         assert captured["scheduler_steps"] == [35]
         assert captured["format"]["is_t2i"] is False
+        assert captured["format"]["negative_prompt"] == COSMOS3_T2V_NEGATIVE_PROMPT
         assert captured["format"]["height"] == 720
         assert captured["format"]["width"] == 1280
-        assert captured["format"]["num_frames"] == 81
+        assert captured["format"]["num_frames"] == 189
+        assert captured["diffuse_calls"][0]["guidance_scale"] == 6.0
         assert captured["diffuse_calls"][0]["guidance_interval"] is None
 
     def test_forward_defaults_to_video_without_modalities(self, make_cosmos3_pipeline) -> None:
@@ -819,6 +821,28 @@ def test_forward_defaults_to_video_without_modalities(self, make_cosmos3_pipelin
         assert captured["format"]["is_t2i"] is False
         assert "video" in output.output
 
+    def test_forward_flow_shifts_do_not_leak_between_t2v_and_t2i(
+        self,
+        make_cosmos3_pipeline,
+    ) -> None:
+        pipeline = make_cosmos3_pipeline()
+        captured = self._install_forward_stubs(pipeline)
+
+        pipeline.forward(
+            SimpleNamespace(
+                prompts=[{"prompt": "A warehouse robot", "modalities": ["video"]}],
+                sampling_params=make_sampling_params(),
+            )
+        )
+        pipeline.forward(
+            SimpleNamespace(
+                prompts=[{"prompt": "A painted robot", "modalities": ["image"]}],
+                sampling_params=make_sampling_params(),
+            )
+        )
+
+        assert captured["flow_shifts"] == [1.0, 3.0]
+
     def test_forward_selects_i2v_latents_for_image_conditioning(self, make_cosmos3_pipeline) -> None:
         pipeline = make_cosmos3_pipeline()
         captured = self._install_forward_stubs(pipeline)
@@ -887,6 +911,7 @@ def test_forward_policy_action_returns_custom_output(self, make_cosmos3_pipeline
 
         output = pipeline.forward(req)
 
+        assert captured["format"]["negative_prompt"] == ""
         diffuse_call = captured["diffuse_calls"][0]
         assert diffuse_call["action_latents"].shape == (1, 2, 4)
         assert diffuse_call["action_velocity_mask"].tolist() == [[[1.0], [1.0]]]
diff --git a/tests/diffusion/models/cosmos3/test_cosmos3_transformer.py b/tests/diffusion/models/cosmos3/test_cosmos3_transformer.py
index 49b5821347c..1fb870b7d97 100644
--- a/tests/diffusion/models/cosmos3/test_cosmos3_transformer.py
+++ b/tests/diffusion/models/cosmos3/test_cosmos3_transformer.py
@@ -71,15 +71,14 @@ def test_compute_mrope_position_ids_sound_uses_sound_latent_fps() -> None:
     ids, next_offset = compute_mrope_position_ids_sound(
         grid_t=3,
         temporal_offset=10,
-        sound_latent_fps=24.0,
+        sound_latent_fps=25.0,
         base_fps=24.0,
-        base_temporal_compression_factor=4,
     )
 
-    torch.testing.assert_close(ids[0], torch.tensor([10.0, 10.25, 10.5]))
+    torch.testing.assert_close(ids[0], torch.tensor([10.0, 10.96, 11.92]))
     assert ids[1].tolist() == [0.0, 0.0, 0.0]
     assert ids[2].tolist() == [0.0, 0.0, 0.0]
-    assert next_offset == 11
+    assert next_offset == 12
 
 
 def test_compute_mrope_position_ids_action_uses_start_frame_offset() -> None:
@@ -98,6 +97,24 @@ def test_compute_mrope_position_ids_action_uses_start_frame_offset() -> None:
     assert next_offset == 14
 
 
+def test_compute_mrope_position_ids_action_keeps_video_base_temporal_compression() -> None:
+    from vllm_omni.diffusion.models.cosmos3.transformer_cosmos3 import (
+        compute_mrope_position_ids_action,
+    )
+
+    ids, next_offset = compute_mrope_position_ids_action(
+        grid_t=3,
+        temporal_offset=10,
+        action_fps=24.0,
+        base_fps=24.0,
+        base_temporal_compression_factor=4,
+        start_frame_offset=0,
+    )
+
+    torch.testing.assert_close(ids[0], torch.tensor([10.0, 10.25, 10.5]))
+    assert next_offset == 11
+
+
 @pytest.mark.parametrize(
     ("key", "value"),
     [
@@ -203,6 +220,14 @@ def test_sound_modules_created_only_when_sound_config_present() -> None:
             dtype=torch.float32,
         )
     )
+    with_nested_sound_dim = Cosmos3VFMTransformer(
+        SimpleNamespace(
+            tf_model_config={**tiny, "sound_gen": True},
+            model_config={"sound_tokenizer": {"io_channels": 5}},
+            custom_pipeline_args={},
+            dtype=torch.float32,
+        )
+    )
 
     assert no_sound.sound_gen is False
     assert not hasattr(no_sound, "sound2llm")
@@ -212,6 +237,8 @@ def test_sound_modules_created_only_when_sound_config_present() -> None:
     assert with_sound.sound2llm.in_features == 3
     assert with_sound.llm2sound.out_features == 3
     assert tuple(with_sound.sound_modality_embed.shape) == (8,)
+    assert with_nested_sound_dim.sound_dim == 5
+    assert with_nested_sound_dim.sound2llm.in_features == 5
 
 
 def test_action_modules_created_only_when_action_config_present() -> None:
@@ -251,7 +278,8 @@ def test_sound_latent_fps_derives_from_sound_tokenizer_config() -> None:
     derived = Cosmos3VFMTransformer(
         SimpleNamespace(
             tf_model_config=tiny,
-            custom_pipeline_args={"sound_sample_rate": 32000, "sound_hop_size": 800},
+            model_config={"sound_tokenizer": {"sample_rate": 32000, "hop_size": 800}},
+            custom_pipeline_args={},
             dtype=torch.float32,
         )
     )
@@ -471,7 +499,8 @@ def __call__(self, x, position_ids):
     model.base_fps = 24.0
     model.temporal_compression_factor = 4
     model.enable_fps_modulation = True
-    model.sound_latent_fps = 24.0
+    model.temporal_compression_factor_sound = 1
+    model.sound_latent_fps = 25.0
 
     model._compute_rope_freqs(
         text_mask=torch.tensor([[1, 1]], dtype=torch.long),
@@ -488,7 +517,7 @@ def __call__(self, x, position_ids):
     assert gen_pos.shape == (3, 1, 5)
     torch.testing.assert_close(
         gen_pos[0, 0],
-        torch.tensor([102.0, 103.0, 102.0, 102.25, 102.5]),
+        torch.tensor([102.0, 103.0, 102.0, 102.96, 103.92]),
     )
 
 
@@ -514,7 +543,8 @@ def __call__(self, x, position_ids):
     model.base_fps = 24.0
     model.temporal_compression_factor = 4
     model.enable_fps_modulation = False
-    model.sound_latent_fps = 24.0
+    model.temporal_compression_factor_sound = 1
+    model.sound_latent_fps = 25.0
 
     model._compute_rope_freqs(
         text_mask=torch.tensor([[1, 1]], dtype=torch.long),
@@ -556,7 +586,8 @@ def __call__(self, x, position_ids):
     model.base_fps = 24.0
     model.temporal_compression_factor = 4
     model.enable_fps_modulation = True
-    model.sound_latent_fps = 24.0
+    model.temporal_compression_factor_sound = 1
+    model.sound_latent_fps = 25.0
 
     model._compute_rope_freqs(
         text_mask=torch.tensor([[1, 1]], dtype=torch.long),
@@ -573,5 +604,5 @@ def __call__(self, x, position_ids):
     assert gen_pos.dtype == torch.float32
     torch.testing.assert_close(
         gen_pos[0, 0],
-        torch.tensor([102.0, 102.0, 102.25, 102.5]),
+        torch.tensor([102.0, 102.0, 102.96, 103.92]),
     )
diff --git a/vllm_omni/diffusion/models/cosmos3/audio_tokenizer/avae.py b/vllm_omni/diffusion/models/cosmos3/audio_tokenizer/avae.py
index 03367071f4f..2ee4ad2a3ef 100644
--- a/vllm_omni/diffusion/models/cosmos3/audio_tokenizer/avae.py
+++ b/vllm_omni/diffusion/models/cosmos3/audio_tokenizer/avae.py
@@ -5,6 +5,7 @@
 from __future__ import annotations
 
 import json
+import math
 from pathlib import Path
 
 import torch
@@ -45,13 +46,13 @@ def _default_avae_config(
             "enc_hop_length": 16,
             "enc_latent_dim": 128,
             "enc_c_mults": [1, 2, 4],
-            "enc_strides": [4, 4, 8],
+            "enc_strides": [4, 5, 6],
             "enc_identity_init": False,
             "enc_use_snake": True,
             "dec_type": "oobleck",
             "dec_dim": 320,
             "dec_c_mults": [1, 2, 4, 8, 16],
-            "dec_strides": [2, 4, 4, 8, 8],
+            "dec_strides": [2, 4, 5, 6, 8],
             "dec_use_snake": True,
             "dec_final_tanh": False,
             "dec_out_channels": audio_channels,
@@ -154,7 +155,7 @@ def __init__(
         audio_channels: int = 2,
         io_channels: int = 64,
         hop_size: int = 1920,
-        normalize_latents: bool = True,
+        normalize_latents: bool = False,
         normalization_type: str = "none",
         tanh_input_scale: float = 1.5,
         tanh_output_scale: float = 3.5,
@@ -185,6 +186,18 @@ def __init__(
             io_channels=self.latent_ch,
             hop_size=self.hop_size,
         )
+        self.sample_rate = int(config.sampling_rate)
+        self.audio_channels = int(
+            getattr(config, "dec_out_channels", 2 if bool(getattr(config, "stereo", True)) else 1)
+        )
+        self.latent_ch = int(config.vocoder_input_dim)
+        self.hop_size = int(config.hop_size)
+        dec_stride_product = math.prod(int(stride) for stride in config.dec_strides)
+        if dec_stride_product != self.hop_size:
+            raise ValueError(
+                "Cosmos3 AVAE config dec_strides product must equal hop_size "
+                f"for correct latent/audio duration math: product={dec_stride_product}, hop_size={self.hop_size}."
+            )
         self.model = load_generator(config.model_type, config, self.device)
         state_dict = _strip_prefixes(
             _load_checkpoint(checkpoint_path, self.device),
diff --git a/vllm_omni/diffusion/models/cosmos3/pipeline_cosmos3.py b/vllm_omni/diffusion/models/cosmos3/pipeline_cosmos3.py
index 3c84b39ab8a..ea84377b0bc 100644
--- a/vllm_omni/diffusion/models/cosmos3/pipeline_cosmos3.py
+++ b/vllm_omni/diffusion/models/cosmos3/pipeline_cosmos3.py
@@ -15,6 +15,7 @@
 
 from __future__ import annotations
 
+import math
 import os
 import time
 from collections.abc import Iterable
@@ -62,6 +63,16 @@
 logger = init_logger(__name__)
 
 COSMOS3_DEFAULT_NEGATIVE_PROMPT = ""
+COSMOS3_VIDEO_NEGATIVE_PROMPT = (
+    "The video captures a series of frames showing ugly scenes, static with no motion, motion blur, "
+    "over-saturation, shaky footage, low resolution, grainy texture, pixelated images, poorly lit areas, "
+    "underexposed and overexposed scenes, poor color balance, washed out colors, choppy sequences, "
+    "jerky movements, low frame rate, artifacting, color banding, unnatural transitions, outdated special effects, "
+    "fake elements, unconvincing visuals, poorly edited content, jump cuts, visual noise, and flickering. "
+    "Overall, the video is of poor quality."
+)
+COSMOS3_T2V_NEGATIVE_PROMPT = COSMOS3_VIDEO_NEGATIVE_PROMPT
+COSMOS3_I2V_NEGATIVE_PROMPT = COSMOS3_VIDEO_NEGATIVE_PROMPT
 COSMOS3_DURATION_TEMPLATE = "The video is {duration:.1f} seconds long and is of {fps:.0f} FPS."
 COSMOS3_RESOLUTION_TEMPLATE = "This video is of {height}x{width} resolution."
 COSMOS3_IMAGE_RESOLUTION_TEMPLATE = "This image is of {height}x{width} resolution."
@@ -428,12 +439,6 @@ def __init__(
         # scheduler at request time when a per-request flow_shift override
         # is supplied (T2I uses shift=3.0; T2V/I2V use the engine default).
         self._base_scheduler_config = self.scheduler.config
-        # ``_engine_init_flow_shift`` is the shift the engine was configured
-        # with at init time (after the optional ``od_config.flow_shift``
-        # override).  This is the value T2V/I2V requests fall back to.
-        # ``_current_flow_shift`` tracks the shift the scheduler *currently*
-        # uses, since per-request rebuilds in ``_set_flow_shift`` must be
-        # detectable on the next request to restore the prior shift.
         self._engine_init_flow_shift = float(getattr(self.scheduler.config, "flow_shift", 1.0) or 1.0)
         self._current_flow_shift = self._engine_init_flow_shift
 
@@ -890,7 +895,11 @@ def _prepare_sound_latents(
         generator: torch.Generator,
     ) -> tuple[torch.Tensor, int]:
         sound_tokenizer = self._get_sound_tokenizer()
-        latent_frames = max(1, int(sound_tokenizer.get_latent_num_samples(max(1, target_audio_samples))))
+        hop_size = int(
+            getattr(sound_tokenizer, "hop_size", None)
+            or getattr(sound_tokenizer, "temporal_compression_factor")
+        )
+        latent_frames = max(1, math.ceil(max(1, int(target_audio_samples)) / hop_size))
         sound_dim = int(getattr(sound_tokenizer, "latent_ch", 64))
         transformer_sound_dim = int(getattr(self.transformer, "sound_dim", sound_dim))
         if sound_dim != transformer_sound_dim:
@@ -1527,12 +1536,12 @@ def forward(
         prompt_data = req.prompts[0]
         if isinstance(prompt_data, str):
             prompt = prompt_data
-            negative_prompt = COSMOS3_DEFAULT_NEGATIVE_PROMPT
+            negative_prompt = None
             image_tensor = None
             action_video_tensor = None
         else:
             prompt = prompt_data.get("prompt", "")
-            negative_prompt = prompt_data.get("negative_prompt", COSMOS3_DEFAULT_NEGATIVE_PROMPT)
+            negative_prompt = prompt_data.get("negative_prompt")
             additional_info = prompt_data.get("additional_information", {}) or {}
             image_tensor = additional_info.get("preprocessed_image")
             action_video_tensor = additional_info.get("preprocessed_video")
@@ -1563,6 +1572,14 @@ def forward(
                 "initialized without sound modules. Check that the checkpoint config "
                 "enables sound_gen or defines sound_dim and includes sound weights."
             )
+        is_i2v = image_tensor is not None and not is_t2i and not action_enabled
+        if negative_prompt is None:
+            if is_t2i or action_enabled:
+                negative_prompt = COSMOS3_DEFAULT_NEGATIVE_PROMPT
+            elif is_i2v:
+                negative_prompt = COSMOS3_I2V_NEGATIVE_PROMPT
+            else:
+                negative_prompt = COSMOS3_T2V_NEGATIVE_PROMPT
 
         # T2I and T2V share the same model + forward path; only defaults
         # differ:
@@ -1580,9 +1597,9 @@ def forward(
         else:
             height = sp.height or 720
             width = sp.width or 1280
-            num_frames = sp.num_frames or 81
+            num_frames = sp.num_frames or 189
             num_inference_steps = sp.num_inference_steps or 35
-            guidance_scale = sp.guidance_scale if sp.guidance_scale else 4.0
+            guidance_scale = sp.guidance_scale if sp.guidance_scale else 6.0
             # Fall back to the engine-init shift, NOT None: passing None
             # to ``_set_flow_shift`` would leak a prior T2I rebuild
             # (shift=3.0) into a subsequent video request.
diff --git a/vllm_omni/diffusion/models/cosmos3/sound_tokenizer.py b/vllm_omni/diffusion/models/cosmos3/sound_tokenizer.py
index 863561bac53..cceaf897083 100644
--- a/vllm_omni/diffusion/models/cosmos3/sound_tokenizer.py
+++ b/vllm_omni/diffusion/models/cosmos3/sound_tokenizer.py
@@ -4,6 +4,7 @@
 
 from __future__ import annotations
 
+import json
 import os
 from pathlib import Path
 from typing import Any
@@ -24,6 +25,11 @@
 DEFAULT_SOUND_DIM = 64
 DEFAULT_SOUND_HOP_SIZE = 1920
 DEFAULT_SOUND_LATENT_FPS = DEFAULT_SOUND_SAMPLE_RATE / DEFAULT_SOUND_HOP_SIZE
+DEFAULT_SOUND_NORMALIZE_LATENTS = False
+DEFAULT_SOUND_NORMALIZATION_TYPE = "none"
+DEFAULT_SOUND_TANH_INPUT_SCALE = 1.5
+DEFAULT_SOUND_TANH_OUTPUT_SCALE = 3.5
+DEFAULT_SOUND_TANH_CLAMP = 0.995
 SOUND_TOKENIZER_COMPONENT_NAME = "sound_tokenizer"
 SOUND_TOKENIZER_CHECKPOINT_NAME = "model.safetensors"
 
@@ -32,6 +38,109 @@ def _pipeline_args(od_config: OmniDiffusionConfig) -> dict[str, Any]:
     return dict(getattr(od_config, "custom_pipeline_args", None) or {})
 
 
+def _config_get(config: Any, key: str, default: Any = None) -> Any:
+    if config is None:
+        return default
+    if isinstance(config, dict):
+        return config.get(key, default)
+    if hasattr(config, "get"):
+        value = config.get(key, None)
+        return default if value is None else value
+    return getattr(config, key, default)
+
+
+def _config_path_get(config: Any, *keys: str) -> Any:
+    value = config
+    for key in keys:
+        value = _config_get(value, key, None)
+        if value is None:
+            return None
+    return value
+
+
+def _sound_tokenizer_config_from(config: Any) -> Any:
+    """Return nested ``sound_tokenizer`` config from Cosmos3 config shapes."""
+    for path in (
+        ("sound_tokenizer",),
+        ("model", "config", "sound_tokenizer"),
+        ("config", "sound_tokenizer"),
+        ("model_config", "sound_tokenizer"),
+    ):
+        value = _config_path_get(config, *path)
+        if value is not None:
+            return value
+    return None
+
+
+def _nested_sound_tokenizer_configs(od_config: OmniDiffusionConfig | None) -> tuple[Any, ...]:
+    if od_config is None:
+        return ()
+    configs = []
+    for source in (
+        getattr(od_config, "model_config", None),
+        getattr(od_config, "tf_model_config", None),
+    ):
+        config = _sound_tokenizer_config_from(source)
+        if config is not None:
+            configs.append(config)
+    return tuple(configs)
+
+
+def _first_value_from_configs(configs: tuple[Any, ...], keys: tuple[str, ...]) -> Any:
+    for config in configs:
+        for key in keys:
+            value = _config_get(config, key, None)
+            if value is not None:
+                return value
+    return None
+
+
+def _top_level_model_value(od_config: OmniDiffusionConfig | None, keys: tuple[str, ...]) -> Any:
+    if od_config is None:
+        return None
+    for source in (
+        getattr(od_config, "model_config", None),
+        getattr(od_config, "tf_model_config", None),
+    ):
+        for key in keys:
+            for path in ((key,), ("model", "config", key), ("config", key), ("model_config", key)):
+                value = _config_path_get(source, *path)
+                if value is not None:
+                    return value
+    return None
+
+
+def _custom_arg_value(args: dict[str, Any], keys: tuple[str, ...]) -> Any:
+    for key in keys:
+        value = args.get(key)
+        if value is not None:
+            return value
+    return None
+
+
+def _as_bool(value: Any) -> bool:
+    if isinstance(value, str):
+        return value.strip().lower() in {"1", "true", "yes", "on"}
+    return bool(value)
+
+
+def _as_audio_channels(value: Any) -> int:
+    if isinstance(value, bool):
+        return 2 if value else 1
+    # "1" is a channel count (mono), matching int 1; "0" can only be a flag.
+    if isinstance(value, str) and value.strip().lower() in {
+        "0",
+        "true",
+        "false",
+        "yes",
+        "no",
+        "on",
+        "off",
+    }:
+        return 2 if _as_bool(value) else 1
+    return int(value)
+
+
 def _resolve_model_file(path: Any, model_root: str | None) -> str | None:
     if not path:
         return None
@@ -41,12 +150,97 @@ def _resolve_model_file(path: Any, model_root: str | None) -> str | None:
     return str(Path(model_root) / path)
 
 
+def _load_sound_tokenizer_component_config(config_path: str | None) -> dict[str, Any]:
+    if not config_path:
+        return {}
+    with open(config_path, encoding="utf-8") as f:
+        config = json.load(f)
+    if not isinstance(config, dict):
+        raise TypeError(f"Cosmos3 sound tokenizer config must be a JSON object, got {type(config)!r}.")
+    return config
+
+
+def _component_audio_channels(config: dict[str, Any]) -> Any:
+    if config.get("dec_out_channels") is not None:
+        return config["dec_out_channels"]
+    if config.get("audio_channels") is not None:
+        return config["audio_channels"]
+    if config.get("stereo") is not None:
+        return 2 if _as_bool(config["stereo"]) else 1
+    return None
+
+
+def _component_arch_values(config: dict[str, Any]) -> dict[str, Any]:
+    values = {
+        "sample_rate": config.get("sampling_rate", config.get("sample_rate")),
+        "audio_channels": _component_audio_channels(config),
+        "io_channels": config.get("vocoder_input_dim", config.get("io_channels", config.get("latent_ch"))),
+        "hop_size": config.get("hop_size"),
+    }
+    return {key: value for key, value in values.items() if value is not None}
+
+
+def _resolve_arch_value(
+    od_config: OmniDiffusionConfig,
+    args: dict[str, Any],
+    component_values: dict[str, Any],
+    *,
+    field: str,
+    custom_keys: tuple[str, ...],
+    nested_keys: tuple[str, ...],
+    top_level_keys: tuple[str, ...],
+    default: Any,
+    cast,
+) -> Any:
+    custom_value = _custom_arg_value(args, custom_keys)
+    component_value = component_values.get(field)
+    if component_value is not None:
+        resolved = cast(component_value)
+        if custom_value is not None and cast(custom_value) != resolved:
+            raise ValueError(
+                "Conflicting Cosmos3 sound tokenizer architecture override for "
+                f"{field}: component config has {resolved!r}, custom args have {cast(custom_value)!r}."
+            )
+        return resolved
+
+    if custom_value is not None:
+        return cast(custom_value)
+
+    nested_value = _first_value_from_configs(_nested_sound_tokenizer_configs(od_config), nested_keys)
+    if nested_value is not None:
+        return cast(nested_value)
+
+    top_value = _top_level_model_value(od_config, top_level_keys)
+    if top_value is not None:
+        return cast(top_value)
+
+    return cast(default)
+
+
+def _resolve_normalization_value(
+    od_config: OmniDiffusionConfig,
+    args: dict[str, Any],
+    *,
+    name: str,
+    default: Any,
+    aliases: tuple[str, ...] = (),
+) -> Any:
+    keys = (f"sound_{name}", name, *aliases)
+    custom_value = _custom_arg_value(args, keys)
+    if custom_value is not None:
+        return custom_value
+    nested_value = _first_value_from_configs(_nested_sound_tokenizer_configs(od_config), (name, *aliases))
+    return default if nested_value is None else nested_value
+
+
 def get_sound_config_value(
     od_config: OmniDiffusionConfig,
     name: str,
     default: Any,
     aliases: tuple[str, ...] = (),
 ) -> Any:
+    # Backward-compatible generic accessor.  Prefer the more specific helpers
+    # below for Cosmos3 sound tokenizer fields so precedence stays explicit.
     keys = (name, *aliases)
     for config in (
         _pipeline_args(od_config),
@@ -66,57 +260,86 @@ def get_sound_config_value(
 
 
 def get_sound_sample_rate(od_config: OmniDiffusionConfig) -> int:
-    return int(
-        get_sound_config_value(
-            od_config,
-            "sound_sample_rate",
-            DEFAULT_SOUND_SAMPLE_RATE,
-            ("sample_rate",),
-        )
+    args = _pipeline_args(od_config)
+    return _resolve_arch_value(
+        od_config,
+        args,
+        {},
+        field="sample_rate",
+        custom_keys=("sound_sample_rate", "sample_rate"),
+        nested_keys=("sample_rate", "sampling_rate"),
+        top_level_keys=("sound_sample_rate", "sample_rate"),
+        default=DEFAULT_SOUND_SAMPLE_RATE,
+        cast=int,
     )
 
 
 def get_sound_channels(od_config: OmniDiffusionConfig) -> int:
-    return int(
-        get_sound_config_value(
-            od_config,
-            "sound_audio_channels",
-            DEFAULT_SOUND_CHANNELS,
-            ("audio_channels",),
-        )
+    args = _pipeline_args(od_config)
+    return _resolve_arch_value(
+        od_config,
+        args,
+        {},
+        field="audio_channels",
+        custom_keys=("sound_audio_channels", "audio_channels", "stereo"),
+        nested_keys=("audio_channels", "dec_out_channels", "stereo"),
+        top_level_keys=("sound_audio_channels", "audio_channels", "stereo"),
+        default=DEFAULT_SOUND_CHANNELS,
+        cast=_as_audio_channels,
     )
 
 
 def get_sound_dim(od_config: OmniDiffusionConfig | None) -> int:
     if od_config is None:
         return DEFAULT_SOUND_DIM
-    return int(
-        get_sound_config_value(
-            od_config,
-            "sound_dim",
-            DEFAULT_SOUND_DIM,
-            ("io_channels", "latent_ch"),
-        )
+    args = _pipeline_args(od_config)
+    custom_value = _custom_arg_value(args, ("sound_dim", "io_channels", "latent_ch"))
+    if custom_value is not None:
+        return int(custom_value)
+    top_value = _top_level_model_value(od_config, ("sound_dim",))
+    if top_value is not None:
+        return int(top_value)
+    nested_value = _first_value_from_configs(
+        _nested_sound_tokenizer_configs(od_config),
+        ("io_channels", "vocoder_input_dim", "latent_ch"),
     )
+    return int(DEFAULT_SOUND_DIM if nested_value is None else nested_value)
 
 
 def get_sound_hop_size(od_config: OmniDiffusionConfig) -> int:
-    return int(
-        get_sound_config_value(
-            od_config,
-            "sound_hop_size",
-            DEFAULT_SOUND_HOP_SIZE,
-            ("hop_size",),
-        )
+    args = _pipeline_args(od_config)
+    return _resolve_arch_value(
+        od_config,
+        args,
+        {},
+        field="hop_size",
+        custom_keys=("sound_hop_size", "hop_size"),
+        nested_keys=("hop_size",),
+        top_level_keys=("sound_hop_size", "hop_size"),
+        default=DEFAULT_SOUND_HOP_SIZE,
+        cast=int,
     )
 
 
 def get_sound_latent_fps(od_config: OmniDiffusionConfig | None) -> float:
     if od_config is None:
         return DEFAULT_SOUND_LATENT_FPS
-    sample_rate = get_sound_sample_rate(od_config)
-    hop_size = get_sound_hop_size(od_config)
-    return float(get_sound_config_value(od_config, "sound_latent_fps", sample_rate / hop_size))
+    args = _pipeline_args(od_config)
+    custom_value = _custom_arg_value(args, ("sound_latent_fps",))
+    if custom_value is not None:
+        return float(custom_value)
+    top_value = _top_level_model_value(od_config, ("sound_latent_fps",))
+    if top_value is not None:
+        return float(top_value)
+    nested_configs = _nested_sound_tokenizer_configs(od_config)
+    nested_fps = _first_value_from_configs(nested_configs, ("sound_latent_fps", "latent_fps"))
+    if nested_fps is not None:
+        return float(nested_fps)
+    sample_rate = _first_value_from_configs(nested_configs, ("sample_rate", "sampling_rate"))
+    hop_size = _first_value_from_configs(nested_configs, ("hop_size",))
+    if sample_rate is not None and hop_size is not None:
+        return float(sample_rate) / float(hop_size)
+    return float(DEFAULT_SOUND_LATENT_FPS)
 
 
 class Cosmos3SoundTokenizer:
@@ -172,15 +395,97 @@ def from_config(cls, od_config: OmniDiffusionConfig) -> Cosmos3SoundTokenizer:
                 "sound_tokenizer/model.safetensors under the model path."
             )
 
-        sample_rate = get_sound_sample_rate(od_config)
-        audio_channels = get_sound_channels(od_config)
-        sound_dim = get_sound_dim(od_config)
-        hop_size = get_sound_hop_size(od_config)
-
         config_path = _resolve_model_file(explicit_config_path, model_root)
         if config_path is None and model_root:
             candidate = Path(model_root) / SOUND_TOKENIZER_COMPONENT_NAME / "config.json"
             config_path = str(candidate) if candidate.exists() else None
+        component_config = _load_sound_tokenizer_component_config(config_path)
+        component_values = _component_arch_values(component_config)
+
+        sample_rate = _resolve_arch_value(
+            od_config,
+            args,
+            component_values,
+            field="sample_rate",
+            custom_keys=("sound_sample_rate", "sample_rate"),
+            nested_keys=("sample_rate", "sampling_rate"),
+            top_level_keys=("sound_sample_rate", "sample_rate"),
+            default=DEFAULT_SOUND_SAMPLE_RATE,
+            cast=int,
+        )
+        audio_channels = _resolve_arch_value(
+            od_config,
+            args,
+            component_values,
+            field="audio_channels",
+            custom_keys=("sound_audio_channels", "audio_channels", "stereo"),
+            nested_keys=("audio_channels", "dec_out_channels", "stereo"),
+            top_level_keys=("sound_audio_channels", "audio_channels", "stereo"),
+            default=DEFAULT_SOUND_CHANNELS,
+            cast=_as_audio_channels,
+        )
+        sound_dim = _resolve_arch_value(
+            od_config,
+            args,
+            component_values,
+            field="io_channels",
+            custom_keys=("sound_dim", "io_channels", "latent_ch"),
+            nested_keys=("io_channels", "vocoder_input_dim", "latent_ch"),
+            top_level_keys=("sound_dim",),
+            default=DEFAULT_SOUND_DIM,
+            cast=int,
+        )
+        hop_size = _resolve_arch_value(
+            od_config,
+            args,
+            component_values,
+            field="hop_size",
+            custom_keys=("sound_hop_size", "hop_size"),
+            nested_keys=("hop_size",),
+            top_level_keys=("sound_hop_size", "hop_size"),
+            default=DEFAULT_SOUND_HOP_SIZE,
+            cast=int,
+        )
+        normalize_latents = _as_bool(
+            _resolve_normalization_value(
+                od_config,
+                args,
+                name="normalize_latents",
+                default=DEFAULT_SOUND_NORMALIZE_LATENTS,
+            )
+        )
+        normalization_type = str(
+            _resolve_normalization_value(
+                od_config,
+                args,
+                name="normalization_type",
+                default=DEFAULT_SOUND_NORMALIZATION_TYPE,
+            )
+        )
+        tanh_input_scale = float(
+            _resolve_normalization_value(
+                od_config,
+                args,
+                name="tanh_input_scale",
+                default=DEFAULT_SOUND_TANH_INPUT_SCALE,
+            )
+        )
+        tanh_output_scale = float(
+            _resolve_normalization_value(
+                od_config,
+                args,
+                name="tanh_output_scale",
+                default=DEFAULT_SOUND_TANH_OUTPUT_SCALE,
+            )
+        )
+        tanh_clamp = float(
+            _resolve_normalization_value(
+                od_config,
+                args,
+                name="tanh_clamp",
+                default=DEFAULT_SOUND_TANH_CLAMP,
+            )
+        )
         tokenizer = Cosmos3AVAEAudioTokenizer(
             checkpoint_path=str(avae_path),
             config_path=config_path,
@@ -188,11 +493,11 @@ def from_config(cls, od_config: OmniDiffusionConfig) -> Cosmos3SoundTokenizer:
             audio_channels=audio_channels,
             io_channels=sound_dim,
             hop_size=hop_size,
-            normalize_latents=bool(args.get("sound_normalize_latents", True)),
-            normalization_type=args.get("sound_normalization_type", "none"),
-            tanh_input_scale=float(args.get("sound_tanh_input_scale", 1.5)),
-            tanh_output_scale=float(args.get("sound_tanh_output_scale", 3.5)),
-            tanh_clamp=float(args.get("sound_tanh_clamp", 0.995)),
+            normalize_latents=normalize_latents,
+            normalization_type=normalization_type,
+            tanh_input_scale=tanh_input_scale,
+            tanh_output_scale=tanh_output_scale,
+            tanh_clamp=tanh_clamp,
             dtype=getattr(od_config, "dtype", torch.bfloat16),
             device=get_local_device(),
         )
diff --git a/vllm_omni/diffusion/models/cosmos3/transformer_cosmos3.py b/vllm_omni/diffusion/models/cosmos3/transformer_cosmos3.py
index 118b78cbaed..9166498bc9f 100644
--- a/vllm_omni/diffusion/models/cosmos3/transformer_cosmos3.py
+++ b/vllm_omni/diffusion/models/cosmos3/transformer_cosmos3.py
@@ -268,10 +268,12 @@ def compute_mrope_position_ids_sound(
     temporal_offset: int | float,
     sound_latent_fps: float,
     base_fps: float = 24.0,
-    base_temporal_compression_factor: int = 4,
+    temporal_compression_factor_sound: int = 1,
     enable_fps_modulation: bool = True,
+    base_temporal_compression_factor: int | None = None,
 ) -> tuple[torch.Tensor, int | float]:
     """Generate mRoPE IDs for sound tokens as a (T, 1, 1) grid."""
+    del base_temporal_compression_factor
     return compute_mrope_position_ids_vision(
         grid_t=grid_t,
         grid_h=1,
@@ -279,8 +281,8 @@ def compute_mrope_position_ids_sound(
         temporal_offset=temporal_offset,
         fps=sound_latent_fps,
         base_fps=base_fps,
-        temporal_compression_factor=1,
-        base_temporal_compression_factor=base_temporal_compression_factor,
+        temporal_compression_factor=temporal_compression_factor_sound,
+        base_temporal_compression_factor=temporal_compression_factor_sound,
         enable_fps_modulation=enable_fps_modulation,
     )
 
@@ -1056,8 +1058,16 @@ def __init__(
         self.base_fps = float(_tf_config_get(model_config, "base_fps", 24.0))
         sound_gen_value = _od_config_get(od_config, "sound_gen", None)
         sound_dim_value = _od_config_get(od_config, "sound_dim", None)
+        if sound_dim_value is None:
+            sound_dim_value = _od_config_get(od_config, "io_channels", None)
+        if sound_dim_value is None:
+            sound_dim_value = _od_config_get(od_config, "vocoder_input_dim", None)
+        if sound_dim_value is None:
+            sound_dim_value = _od_config_get(od_config, "latent_ch", None)
         self.sound_gen = _as_bool(sound_gen_value) if sound_gen_value is not None else sound_dim_value is not None
-        self.sound_dim = int(sound_dim_value if sound_dim_value is not None else 64)
+        from .sound_tokenizer import get_sound_dim, get_sound_latent_fps
+
+        self.sound_dim = int(sound_dim_value if sound_dim_value is not None else get_sound_dim(od_config))
         action_gen_value = _od_config_get(od_config, "action_gen", None)
         action_dim_value = _od_config_get(od_config, "action_dim", None)
         if action_dim_value is None:
@@ -1065,12 +1075,13 @@ def __init__(
         self.action_gen = _as_bool(action_gen_value) if action_gen_value is not None else False
         self.action_dim = int(action_dim_value if action_dim_value is not None else 64)
         self.num_embodiment_domains = int(_od_config_get(od_config, "num_embodiment_domains", 32))
-        from .sound_tokenizer import get_sound_latent_fps
-
         self.sound_latent_fps = float(get_sound_latent_fps(od_config))
         if temporal_compression_factor is None:
             temporal_compression_factor = _tf_config_get(model_config, "temporal_compression_factor", 4)
         self.temporal_compression_factor = int(temporal_compression_factor)
+        self.temporal_compression_factor_sound = int(
+            _tf_config_get(model_config, "temporal_compression_factor_sound", 1)
+        )
         self.enable_fps_modulation = bool(_tf_config_get(model_config, "enable_fps_modulation", True))
         self.temporal_modality_margin = int(
             _tf_config_get(
@@ -1289,7 +1300,7 @@ def _compute_rope_freqs(
                     temporal_offset=media_temporal_offset,
                     sound_latent_fps=self.sound_latent_fps,
                     base_fps=self.base_fps,
-                    base_temporal_compression_factor=self.temporal_compression_factor,
+                    temporal_compression_factor_sound=getattr(self, "temporal_compression_factor_sound", 1),
                     enable_fps_modulation=self.enable_fps_modulation,
                 )
                 gen_positions.append(s_pos)

From 75d988874a16c810e9b67d22c0d96ffd7a6ca6ce Mon Sep 17 00:00:00 2001
From: Maciej Bala <mbala@nvidia.com>
Date: Thu, 14 May 2026 18:02:32 +0200
Subject: [PATCH 06/41] Linter fixes

Signed-off-by: Maciej Bala <mbala@nvidia.com>
---
 vllm_omni/diffusion/models/cosmos3/pipeline_cosmos3.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/vllm_omni/diffusion/models/cosmos3/pipeline_cosmos3.py b/vllm_omni/diffusion/models/cosmos3/pipeline_cosmos3.py
index ea84377b0bc..b6e9f16b398 100644
--- a/vllm_omni/diffusion/models/cosmos3/pipeline_cosmos3.py
+++ b/vllm_omni/diffusion/models/cosmos3/pipeline_cosmos3.py
@@ -896,8 +896,7 @@ def _prepare_sound_latents(
     ) -> tuple[torch.Tensor, int]:
         sound_tokenizer = self._get_sound_tokenizer()
         hop_size = int(
-            getattr(sound_tokenizer, "hop_size", None)
-            or getattr(sound_tokenizer, "temporal_compression_factor")
+            getattr(sound_tokenizer, "hop_size", None) or getattr(sound_tokenizer, "temporal_compression_factor")
         )
         latent_frames = max(1, math.ceil(max(1, int(target_audio_samples)) / hop_size))
         sound_dim = int(getattr(sound_tokenizer, "latent_ch", 64))

From 9e9e453aa1d790392ce45b24b02cf9f9c41e7d94 Mon Sep 17 00:00:00 2001
From: Maciej Bala <mbala@nvidia.com>
Date: Thu, 14 May 2026 18:05:01 +0200
Subject: [PATCH 07/41] extra cleanup

Signed-off-by: Maciej Bala <mbala@nvidia.com>
---
 docs/.nav.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/.nav.yml b/docs/.nav.yml
index b1ad961ab0f..cc25a49f7a6 100644
--- a/docs/.nav.yml
+++ b/docs/.nav.yml
@@ -43,8 +43,8 @@ nav:
       - Online serving Example of vLLM-Omni for MiMo-Audio: user_guide/examples/online_serving/mimo_audio.md
       - Qwen2.5-Omni: user_guide/examples/online_serving/qwen2_5_omni.md
       - Qwen3-Omni: user_guide/examples/online_serving/qwen3_omni.md
-      - Text-To-Image: user_guide/examples/online_serving/text_to_image.md
       - Text-To-Speech: user_guide/examples/online_serving/text_to_speech.md
+      - Text-To-Image: user_guide/examples/online_serving/text_to_image.md
       - Text-To-Video: user_guide/examples/online_serving/text_to_video.md
   - General:
     - usage/*

From 3c7cd31058bf6e600cfb32a27d3a3382d2ddb8a5 Mon Sep 17 00:00:00 2001
From: Maciej Bala <mbala@nvidia.com>
Date: Fri, 15 May 2026 09:40:50 +0200
Subject: [PATCH 08/41] Updated examples to refer to HF repo

Signed-off-by: Maciej Bala <mbala@nvidia.com>
---
 docs/models/supported_models.md                            | 2 +-
 docs/user_guide/diffusion/cache_acceleration/cache_dit.md  | 5 ++---
 docs/user_guide/diffusion/cpu_offload_diffusion.md         | 2 +-
 docs/user_guide/examples/offline_inference/cosmos3.md      | 3 +--
 docs/user_guide/examples/online_serving/cosmos3.md         | 5 ++---
 examples/offline_inference/cosmos3/README.md               | 3 +--
 examples/offline_inference/cosmos3/end2end.py              | 7 +++----
 .../offline_inference/image_to_video/image_to_video.py     | 2 +-
 examples/offline_inference/text_to_image/text_to_image.py  | 2 +-
 examples/offline_inference/text_to_video/text_to_video.py  | 2 +-
 examples/online_serving/cosmos3/README.md                  | 5 ++---
 examples/online_serving/cosmos3/run_server.sh              | 7 +------
 12 files changed, 17 insertions(+), 28 deletions(-)

diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md
index 880d7f3939d..a1165611ddf 100644
--- a/docs/models/supported_models.md
+++ b/docs/models/supported_models.md
@@ -32,7 +32,7 @@ th {
 | `ZImagePipeline` | Z-Image | `Tongyi-MAI/Z-Image-Turbo` | ✅︎ | ✅︎ | ✅︎ | ✅︎ |
 | `WanPipeline` | Wan2.1-T2V, Wan2.2-T2V, Wan2.2-TI2V | `Wan-AI/Wan2.1-T2V-1.3B-Diffusers`, `Wan-AI/Wan2.1-T2V-14B-Diffusers`, `Wan-AI/Wan2.2-T2V-A14B-Diffusers`, `Wan-AI/Wan2.2-TI2V-5B-Diffusers` | ✅︎ | ✅︎ | ✅︎ | ✅︎ |
 | `WanImageToVideoPipeline` | Wan2.2-I2V | `Wan-AI/Wan2.2-I2V-A14B-Diffusers` | ✅︎ | ✅︎ | ✅︎ | ✅︎ |
-| `Cosmos3OmniDiffusersPipeline` | Cosmos3 T2I, T2V, I2V | local Diffusers-format Cosmos3 checkpoint (`$COSMOS3_MODEL`) | ✅︎ | | | |
+| `Cosmos3OmniDiffusersPipeline` | Cosmos3 T2I, T2V, I2V, T2V with sound, action policy | `nvidia/Cosmos3-Nano` | ✅︎ | | | |
 | `WanSpeechToVideoPipeline` | Wan2.2-S2V | `Wan-AI/Wan2.2-S2V-14B` | ✅︎ | ✅︎ | ✅︎ | ✅︎ |
 | `Wan22VACEPipeline` | Wan2.1-VACE | `Wan-AI/Wan2.1-VACE-1.3B-diffusers`, `Wan-AI/Wan2.1-VACE-14B-diffusers` | ✅︎ | ✅︎ | ✅︎ | ✅︎ |
 | `LTX2Pipeline` | LTX-2-T2V | `Lightricks/LTX-2` | ✅︎ | ✅︎ | | |
diff --git a/docs/user_guide/diffusion/cache_acceleration/cache_dit.md b/docs/user_guide/diffusion/cache_acceleration/cache_dit.md
index 8e55e36bd57..93287182d4f 100644
--- a/docs/user_guide/diffusion/cache_acceleration/cache_dit.md
+++ b/docs/user_guide/diffusion/cache_acceleration/cache_dit.md
@@ -132,10 +132,9 @@ For Cosmos3 text-to-video or image-to-video, use the video examples with the Cos
 
 ```bash
 cd examples/offline_inference/text_to_video
-export COSMOS3_MODEL=/path/to/cosmos3-diffusers
 
 python text_to_video.py \
-    --model "$COSMOS3_MODEL" \
+    --model nvidia/Cosmos3-Nano \
     --model-class-name Cosmos3OmniDiffusersPipeline \
     --prompt "A small warehouse robot moves a blue box across a clean floor." \
     --cache-backend cache_dit \
@@ -156,7 +155,7 @@ vllm serve Qwen/Qwen-Image --omni --port 8091 \
   --cache-config '{"Fn_compute_blocks": 1, "residual_diff_threshold": 0.12}'
 
 # Cosmos3
-vllm serve "$COSMOS3_MODEL" --omni --port 8091 \
+vllm serve nvidia/Cosmos3-Nano --omni --port 8091 \
   --model-class-name Cosmos3OmniDiffusersPipeline \
   --cache-backend cache_dit
 ```
diff --git a/docs/user_guide/diffusion/cpu_offload_diffusion.md b/docs/user_guide/diffusion/cpu_offload_diffusion.md
index d725502da1d..0e1121b8d51 100644
--- a/docs/user_guide/diffusion/cpu_offload_diffusion.md
+++ b/docs/user_guide/diffusion/cpu_offload_diffusion.md
@@ -194,7 +194,7 @@ Factory function `get_offload_backend()` selects the appropriate backend based o
 | OvisImagePipeline | `AIDC-AI/Ovis-Image-7B` | `OvisImageTransformer2DModel` | - | ✓ | `"transformer"` |
 | QwenImagePipeline | `Qwen/Qwen-Image` | `QwenImageTransformer2DModel` | ✓ | ✓ | `"transformer_blocks"` |
 | StableDiffusion3Pipeline | `stabilityai/stable-diffusion-3.5-medium` | `SD3Transformer2DModel` | - | ✓ | `"transformer_blocks"` |
-| Cosmos3OmniDiffusersPipeline | `$COSMOS3_MODEL` | `Cosmos3VFMTransformer` | - | ✓ | `"gen_layers"` |
+| Cosmos3OmniDiffusersPipeline | `nvidia/Cosmos3-Nano` | `Cosmos3VFMTransformer` | - | ✓ | `"gen_layers"` |
 | Wan22I2VPipeline | `Wan-AI/Wan2.2-I2V-A14B-Diffusers` | `WanTransformer3DModel` | ✓ | ✓ | `"blocks"` |
 | Wan22Pipeline | `Wan-AI/Wan2.2-T2V-A14B-Diffusers` | `WanTransformer3DModel` | ✓ | ✓ | `"blocks"` |
 | BagelPipeline | `ByteDance-Seed/BAGEL-7B-MoT` | `Qwen2MoTModel` | - | ✓ | `"layers"`, `"customized modules"` |
diff --git a/docs/user_guide/examples/offline_inference/cosmos3.md b/docs/user_guide/examples/offline_inference/cosmos3.md
index a750080cc33..9d9924a1a15 100644
--- a/docs/user_guide/examples/offline_inference/cosmos3.md
+++ b/docs/user_guide/examples/offline_inference/cosmos3.md
@@ -3,10 +3,9 @@
 Source <https://github.com/vllm-project/vllm-omni/tree/main/examples/offline_inference/cosmos3>.
 
 
-Cosmos3 uses `Cosmos3OmniDiffusersPipeline` for text-to-image, text-to-video, image-to-video, video-with-sound, and action generation. Set `COSMOS3_MODEL` to a local Diffusers-format checkpoint before running these examples.
+Cosmos3 uses `Cosmos3OmniDiffusersPipeline` for text-to-image, text-to-video, image-to-video, video-with-sound, and action generation. Examples default to the `nvidia/Cosmos3-Nano` Hugging Face repo, but you can override the checkpoint with `--model` or by exporting `COSMOS3_MODEL` to a local Diffusers-format checkpoint.
 
 ```bash
-export COSMOS3_MODEL=/path/to/cosmos3-diffusers
 cd examples/offline_inference/cosmos3
 ```
 
diff --git a/docs/user_guide/examples/online_serving/cosmos3.md b/docs/user_guide/examples/online_serving/cosmos3.md
index f9bbd365ec4..d61deb427cc 100644
--- a/docs/user_guide/examples/online_serving/cosmos3.md
+++ b/docs/user_guide/examples/online_serving/cosmos3.md
@@ -5,17 +5,16 @@ Source <https://github.com/vllm-project/vllm-omni/tree/main/examples/online_serv
 
 This example shows Cosmos3 online serving with `Cosmos3OmniDiffusersPipeline`.
 
-Set `COSMOS3_MODEL` to a local Diffusers-format checkpoint before starting the server:
+The server defaults to the `nvidia/Cosmos3-Nano` Hugging Face repo. Override the checkpoint by exporting `MODEL` or `COSMOS3_MODEL` to a local Diffusers-format checkpoint.
 
 ```bash
-export COSMOS3_MODEL=/path/to/cosmos3-diffusers
 cd examples/online_serving/cosmos3
 bash run_server.sh
 ```
 
 `run_server.sh` accepts these environment overrides:
 
-- `MODEL`: checkpoint path, defaults to `COSMOS3_MODEL`
+- `MODEL`: checkpoint path or Hugging Face repo, defaults to `nvidia/Cosmos3-Nano` (or `COSMOS3_MODEL` if set)
 - `PORT`: server port, defaults to `8091`
 - `CACHE_BACKEND`: set to `cache_dit` to enable Cache-DiT
 - `ENABLE_LAYERWISE_OFFLOAD`: set to `1` to enable layerwise offload
diff --git a/examples/offline_inference/cosmos3/README.md b/examples/offline_inference/cosmos3/README.md
index 9674c3de449..7fe430da44f 100644
--- a/examples/offline_inference/cosmos3/README.md
+++ b/examples/offline_inference/cosmos3/README.md
@@ -1,9 +1,8 @@
 # Cosmos3
 
-Cosmos3 uses `Cosmos3OmniDiffusersPipeline` for text-to-image, text-to-video, image-to-video, video-with-sound, and action generation. Set `COSMOS3_MODEL` to a local Diffusers-format checkpoint before running these examples.
+Cosmos3 uses `Cosmos3OmniDiffusersPipeline` for text-to-image, text-to-video, image-to-video, video-with-sound, and action generation. Examples default to the `nvidia/Cosmos3-Nano` Hugging Face repo. To use a different checkpoint, pass `--model` or set `COSMOS3_MODEL` to a Hugging Face repo id or a local Diffusers-format checkpoint path; `--model` takes precedence over `COSMOS3_MODEL`.
 
 ```bash
-export COSMOS3_MODEL=/path/to/cosmos3-diffusers
 cd examples/offline_inference/cosmos3
 ```
 
diff --git a/examples/offline_inference/cosmos3/end2end.py b/examples/offline_inference/cosmos3/end2end.py
index c0d1141f28a..ed3db03655f 100644
--- a/examples/offline_inference/cosmos3/end2end.py
+++ b/examples/offline_inference/cosmos3/end2end.py
@@ -72,8 +72,9 @@ def parse_args() -> argparse.Namespace:
     parser = argparse.ArgumentParser(description="Cosmos3 offline inference examples.")
     parser.add_argument(
         "--model",
-        default=os.environ.get("COSMOS3_MODEL"),
-        help="Local Diffusers-format Cosmos3 checkpoint. Defaults to COSMOS3_MODEL.",
+        default=os.environ.get("COSMOS3_MODEL", "nvidia/Cosmos3-Nano"),
+        help="Cosmos3 checkpoint (Hugging Face repo id or local Diffusers-format path). "
+        "Defaults to COSMOS3_MODEL when set, otherwise nvidia/Cosmos3-Nano.",
     )
     parser.add_argument(
         "--task",
@@ -411,8 +412,6 @@ def _build_omni(args: argparse.Namespace) -> Omni:
 
 def main() -> None:
     args = parse_args()
-    if not args.model:
-        raise ValueError("Set COSMOS3_MODEL or pass --model with a Cosmos3 Diffusers checkpoint path.")
 
     defaults = TASK_DEFAULTS[args.task]
     height = args.height or defaults["height"]
diff --git a/examples/offline_inference/image_to_video/image_to_video.py b/examples/offline_inference/image_to_video/image_to_video.py
index b89409e50e4..79f6e9f1151 100644
--- a/examples/offline_inference/image_to_video/image_to_video.py
+++ b/examples/offline_inference/image_to_video/image_to_video.py
@@ -33,7 +33,7 @@
         --flow-shift 5.0 --guidance-scale 6.0
 
     # Cosmos3 image-to-video
-    python image_to_video.py --model "$COSMOS3_MODEL" \
+    python image_to_video.py --model nvidia/Cosmos3-Nano \
         --model-class-name Cosmos3OmniDiffusersPipeline \
         --image input.jpg --prompt "A cinematic dolly shot of a boat" \
         --height 720 --width 1280 --num-frames 81 \
diff --git a/examples/offline_inference/text_to_image/text_to_image.py b/examples/offline_inference/text_to_image/text_to_image.py
index 6978b8bc1a9..e986aee6b18 100644
--- a/examples/offline_inference/text_to_image/text_to_image.py
+++ b/examples/offline_inference/text_to_image/text_to_image.py
@@ -49,7 +49,7 @@ def parse_args() -> argparse.Namespace:
         "Qwen/Qwen-Image, Tongyi-MAI/Z-Image-Turbo, Qwen/Qwen-Image-2512, stepfun-ai/NextStep-1.1, "
         "black-forest-labs/FLUX.1-dev, black-forest-labs/FLUX.2-klein-9B, "
         "black-forest-labs/FLUX.2-dev, tencent/HunyuanImage-3.0-Instruct, "
-        "meituan-longcat/LongCat-Image, OvisAI/Ovis-Image, Cosmos3, "
+        "meituan-longcat/LongCat-Image, OvisAI/Ovis-Image, nvidia/Cosmos3-Nano, "
         "stabilityai/stable-diffusion-3.5-medium, Tongyi-MAI/Z-Image-Turbo and etc.",
     )
     parser.add_argument(
diff --git a/examples/offline_inference/text_to_video/text_to_video.py b/examples/offline_inference/text_to_video/text_to_video.py
index b704f1b87eb..e6a0716b04b 100644
--- a/examples/offline_inference/text_to_video/text_to_video.py
+++ b/examples/offline_inference/text_to_video/text_to_video.py
@@ -81,7 +81,7 @@ def parse_args() -> argparse.Namespace:
         default="Wan-AI/Wan2.2-T2V-A14B-Diffusers",
         help="Diffusers model ID or local path. "
         "Examples: Wan-AI/Wan2.2-T2V-A14B-Diffusers, "
-        "hunyuanvideo-community/HunyuanVideo-1.5-480p_t2v, $COSMOS3_MODEL",
+        "hunyuanvideo-community/HunyuanVideo-1.5-480p_t2v, nvidia/Cosmos3-Nano",
     )
     parser.add_argument(
         "--model-class-name",
diff --git a/examples/online_serving/cosmos3/README.md b/examples/online_serving/cosmos3/README.md
index fd5e1c4d93e..62fa00d69da 100644
--- a/examples/online_serving/cosmos3/README.md
+++ b/examples/online_serving/cosmos3/README.md
@@ -2,17 +2,16 @@
 
 This example shows Cosmos3 online serving with `Cosmos3OmniDiffusersPipeline`.
 
-Set `COSMOS3_MODEL` to a local Diffusers-format checkpoint before starting the server:
+The server defaults to the `nvidia/Cosmos3-Nano` Hugging Face repo. To use a different checkpoint, set `MODEL` (or `COSMOS3_MODEL`, which is used when `MODEL` is unset) to a Hugging Face repo id or a local Diffusers-format checkpoint path.
 
 ```bash
-export COSMOS3_MODEL=/path/to/cosmos3-diffusers
 cd examples/online_serving/cosmos3
 bash run_server.sh
 ```
 
 `run_server.sh` accepts these environment overrides:
 
-- `MODEL`: checkpoint path, defaults to `COSMOS3_MODEL`
+- `MODEL`: checkpoint path or Hugging Face repo, defaults to `COSMOS3_MODEL` if set, otherwise `nvidia/Cosmos3-Nano`
 - `PORT`: server port, defaults to `8091`
 - `CACHE_BACKEND`: set to `cache_dit` to enable Cache-DiT
 - `ENABLE_LAYERWISE_OFFLOAD`: set to `1` to enable layerwise offload
diff --git a/examples/online_serving/cosmos3/run_server.sh b/examples/online_serving/cosmos3/run_server.sh
index 5d3e1b820a4..c45685c6c0d 100644
--- a/examples/online_serving/cosmos3/run_server.sh
+++ b/examples/online_serving/cosmos3/run_server.sh
@@ -3,7 +3,7 @@
 
 set -euo pipefail
 
-MODEL="${MODEL:-${COSMOS3_MODEL:-}}"
+MODEL="${MODEL:-${COSMOS3_MODEL:-nvidia/Cosmos3-Nano}}"
 PORT="${PORT:-8091}"
 CACHE_BACKEND="${CACHE_BACKEND:-none}"
 ENABLE_LAYERWISE_OFFLOAD="${ENABLE_LAYERWISE_OFFLOAD:-0}"
@@ -13,11 +13,6 @@ ULYSSES_DEGREE="${ULYSSES_DEGREE:-1}"
 USE_HSDP="${USE_HSDP:-0}"
 ALLOWED_LOCAL_MEDIA_PATH="${ALLOWED_LOCAL_MEDIA_PATH:-/}"
 
-if [ -z "${MODEL}" ]; then
-  echo "Set COSMOS3_MODEL or MODEL to a local Diffusers-format Cosmos3 checkpoint."
-  exit 1
-fi
-
 args=(
   vllm serve "${MODEL}"
   --omni

From c36b23c93ddca88ce03cd95f78089d34153a3448 Mon Sep 17 00:00:00 2001
From: Maciej Bala <mbala@nvidia.com>
Date: Fri, 15 May 2026 15:09:37 +0200
Subject: [PATCH 09/41] Improved guardrails

Signed-off-by: Maciej Bala <mbala@nvidia.com>
---
 tests/diffusion/models/cosmos3/conftest.py    |   2 +-
 .../cosmos3/test_cosmos3_sound_tokenizer.py   | 351 ++++++++++++++++++
 .../models/test_cosmos3_guardrails.py         | 111 ++++++
 vllm_omni/diffusion/data.py                   |   4 +
 .../diffusion/models/cosmos3/guardrails.py    | 304 ++++++++++-----
 .../models/cosmos3/pipeline_cosmos3.py        |  22 +-
 vllm_omni/entrypoints/openai/serving_video.py |  19 +-
 7 files changed, 707 insertions(+), 106 deletions(-)
 create mode 100644 tests/diffusion/models/cosmos3/test_cosmos3_sound_tokenizer.py
 create mode 100644 tests/diffusion/models/test_cosmos3_guardrails.py

diff --git a/tests/diffusion/models/cosmos3/conftest.py b/tests/diffusion/models/cosmos3/conftest.py
index 58d4af9bf85..1864447aae2 100644
--- a/tests/diffusion/models/cosmos3/conftest.py
+++ b/tests/diffusion/models/cosmos3/conftest.py
@@ -136,7 +136,7 @@ def passthrough_progress_bar(iterable):
 @pytest.fixture(autouse=True)
 def fake_cosmos3_guardrails(monkeypatch: pytest.MonkeyPatch):
     module = types.ModuleType("vllm_omni.diffusion.models.cosmos3.guardrails")
-    module.is_guardrails_enabled = lambda od_config: False
+    module.is_guardrails_enabled = lambda od_config, sampling_params=None: False
     module.ensure_initialized = lambda od_config: None
     module.check_text_safety = lambda text: None
     module.check_video_safety = lambda video: video
diff --git a/tests/diffusion/models/cosmos3/test_cosmos3_sound_tokenizer.py b/tests/diffusion/models/cosmos3/test_cosmos3_sound_tokenizer.py
new file mode 100644
index 00000000000..b647bc7c0dc
--- /dev/null
+++ b/tests/diffusion/models/cosmos3/test_cosmos3_sound_tokenizer.py
@@ -0,0 +1,351 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from __future__ import annotations
+
+from types import SimpleNamespace
+
+import pytest
+import torch
+
+pytestmark = [pytest.mark.core_model, pytest.mark.cpu, pytest.mark.diffusion]
+
+
+class _FakeAVAEAudioTokenizer:
+    def __init__(self, **kwargs) -> None:
+        self.kwargs = kwargs
+        self.sample_rate = int(kwargs["sample_rate"])
+        self.audio_channels = int(kwargs["audio_channels"])
+        self.latent_ch = int(kwargs["io_channels"])
+        self.temporal_compression_factor = int(kwargs["hop_size"])
+
+    def get_latent_num_samples(self, num_audio_samples: int) -> int:
+        return int(num_audio_samples) // self.temporal_compression_factor
+
+    def get_audio_num_samples(self, num_latent_samples: int) -> int:
+        return int(num_latent_samples) * self.temporal_compression_factor
+
+    def decode(self, latents: torch.Tensor) -> torch.Tensor:
+        return torch.zeros(latents.shape[0], self.audio_channels, 8)
+
+
+def test_from_config_loads_default_sound_tokenizer_component(
+    tmp_path,
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    from vllm_omni.diffusion.models.cosmos3 import sound_tokenizer
+
+    model_dir = tmp_path / "model"
+    tokenizer_dir = model_dir / "sound_tokenizer"
+    tokenizer_dir.mkdir(parents=True)
+    checkpoint_path = tokenizer_dir / "model.safetensors"
+    config_path = tokenizer_dir / "config.json"
+    checkpoint_path.write_bytes(b"stub")
+    config_path.write_text("{}", encoding="utf-8")
+
+    created = {}
+
+    class FakeAVAE(_FakeAVAEAudioTokenizer):
+        def __init__(self, **kwargs) -> None:
+            created.update(kwargs)
+            super().__init__(**kwargs)
+
+    monkeypatch.setattr(sound_tokenizer, "Cosmos3AVAEAudioTokenizer", FakeAVAE)
+    monkeypatch.setattr(sound_tokenizer, "get_local_device", lambda: torch.device("cpu"))
+
+    tokenizer = sound_tokenizer.Cosmos3SoundTokenizer.from_config(
+        SimpleNamespace(
+            model=str(model_dir),
+            custom_pipeline_args={
+                "sound_sample_rate": 32000,
+                "sound_hop_size": 800,
+                "sound_dim": 3,
+            },
+            dtype=torch.float32,
+        )
+    )
+
+    assert created["checkpoint_path"] == str(checkpoint_path)
+    assert created["config_path"] == str(config_path)
+    assert tokenizer.sample_rate == 32000
+    assert tokenizer.latent_ch == 3
+    assert tokenizer.hop_size == 800
+
+
+def test_from_config_downloads_default_sound_tokenizer_from_hf_repo(
+    tmp_path,
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    import huggingface_hub
+
+    from vllm_omni.diffusion.models.cosmos3 import sound_tokenizer
+
+    cache_dir = tmp_path / "hf"
+    tokenizer_dir = cache_dir / "sound_tokenizer"
+    tokenizer_dir.mkdir(parents=True)
+    checkpoint_path = tokenizer_dir / "model.safetensors"
+    config_path = tokenizer_dir / "config.json"
+    checkpoint_path.write_bytes(b"stub")
+    config_path.write_text("{}", encoding="utf-8")
+
+    calls = []
+
+    def fake_snapshot_download(
+        repo_id: str,
+        *,
+        revision: str | None,
+        allow_patterns: list[str],
+    ) -> str:
+        calls.append((repo_id, revision, allow_patterns))
+        return str(cache_dir)
+
+    created = {}
+
+    class FakeAVAE(_FakeAVAEAudioTokenizer):
+        def __init__(self, **kwargs) -> None:
+            created.update(kwargs)
+            super().__init__(**kwargs)
+
+    monkeypatch.setattr(huggingface_hub, "snapshot_download", fake_snapshot_download)
+    monkeypatch.setattr(sound_tokenizer, "Cosmos3AVAEAudioTokenizer", FakeAVAE)
+    monkeypatch.setattr(sound_tokenizer, "get_local_device", lambda: torch.device("cpu"))
+
+    tokenizer = sound_tokenizer.Cosmos3SoundTokenizer.from_config(
+        SimpleNamespace(
+            model="nvidia/cosmos3",
+            revision="test-rev",
+            custom_pipeline_args={
+                "sound_sample_rate": 32000,
+                "sound_hop_size": 800,
+                "sound_dim": 3,
+            },
+            dtype=torch.float32,
+        )
+    )
+
+    assert created["checkpoint_path"] == str(checkpoint_path)
+    assert created["config_path"] == str(config_path)
+    assert tokenizer.sample_rate == 32000
+    assert tokenizer.latent_ch == 3
+    assert calls == [
+        (
+            "nvidia/cosmos3",
+            "test-rev",
+            ["sound_tokenizer/config.json", "sound_tokenizer/model.safetensors"],
+        )
+    ]
+
+
+def test_from_config_uses_fixed_sound_tokenizer_checkpoint_name(
+    tmp_path,
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    from vllm_omni.diffusion.models.cosmos3 import sound_tokenizer
+
+    model_dir = tmp_path / "model"
+    tokenizer_dir = model_dir / "sound_tokenizer"
+    tokenizer_dir.mkdir(parents=True)
+    checkpoint_path = tokenizer_dir / "model.safetensors"
+    checkpoint_path.write_bytes(b"stub")
+    (tokenizer_dir / "config.json").write_text("{}", encoding="utf-8")
+
+    created = {}
+
+    class FakeAVAE(_FakeAVAEAudioTokenizer):
+        def __init__(self, **kwargs) -> None:
+            created.update(kwargs)
+            super().__init__(**kwargs)
+
+    monkeypatch.setattr(sound_tokenizer, "Cosmos3AVAEAudioTokenizer", FakeAVAE)
+    monkeypatch.setattr(sound_tokenizer, "get_local_device", lambda: torch.device("cpu"))
+
+    sound_tokenizer.Cosmos3SoundTokenizer.from_config(
+        SimpleNamespace(model=str(model_dir), custom_pipeline_args={}, dtype=torch.float32)
+    )
+
+    assert created["checkpoint_path"] == str(checkpoint_path)
+
+
+def test_default_component_requires_sound_tokenizer_checkpoint(tmp_path) -> None:
+    from vllm_omni.diffusion.models.cosmos3 import sound_tokenizer
+
+    model_dir = tmp_path / "model"
+    (model_dir / "sound_tokenizer").mkdir(parents=True)
+
+    with pytest.raises(ValueError, match="no AVAE sound tokenizer checkpoint"):
+        sound_tokenizer.Cosmos3SoundTokenizer.from_config(
+            SimpleNamespace(model=str(model_dir), custom_pipeline_args={}, dtype=torch.float32)
+        )
+
+
+def test_from_config_uses_nested_normalization_config(
+    tmp_path,
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    from vllm_omni.diffusion.models.cosmos3 import sound_tokenizer
+
+    model_dir = tmp_path / "model"
+    tokenizer_dir = model_dir / "sound_tokenizer"
+    tokenizer_dir.mkdir(parents=True)
+    (tokenizer_dir / "model.safetensors").write_bytes(b"stub")
+    (tokenizer_dir / "config.json").write_text("{}", encoding="utf-8")
+
+    created = {}
+
+    class FakeAVAE(_FakeAVAEAudioTokenizer):
+        def __init__(self, **kwargs) -> None:
+            created.update(kwargs)
+            super().__init__(**kwargs)
+
+    monkeypatch.setattr(sound_tokenizer, "Cosmos3AVAEAudioTokenizer", FakeAVAE)
+    monkeypatch.setattr(sound_tokenizer, "get_local_device", lambda: torch.device("cpu"))
+
+    sound_tokenizer.Cosmos3SoundTokenizer.from_config(
+        SimpleNamespace(
+            model=str(model_dir),
+            custom_pipeline_args={},
+            model_config={
+                "sound_tokenizer": {
+                    "normalize_latents": False,
+                    "normalization_type": "none",
+                }
+            },
+            dtype=torch.float32,
+        )
+    )
+
+    assert created["normalize_latents"] is False
+    assert created["normalization_type"] == "none"
+
+
+def test_from_config_custom_normalization_overrides_nested_config(
+    tmp_path,
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    from vllm_omni.diffusion.models.cosmos3 import sound_tokenizer
+
+    model_dir = tmp_path / "model"
+    tokenizer_dir = model_dir / "sound_tokenizer"
+    tokenizer_dir.mkdir(parents=True)
+    (tokenizer_dir / "model.safetensors").write_bytes(b"stub")
+    (tokenizer_dir / "config.json").write_text("{}", encoding="utf-8")
+
+    created = {}
+
+    class FakeAVAE(_FakeAVAEAudioTokenizer):
+        def __init__(self, **kwargs) -> None:
+            created.update(kwargs)
+            super().__init__(**kwargs)
+
+    monkeypatch.setattr(sound_tokenizer, "Cosmos3AVAEAudioTokenizer", FakeAVAE)
+    monkeypatch.setattr(sound_tokenizer, "get_local_device", lambda: torch.device("cpu"))
+
+    sound_tokenizer.Cosmos3SoundTokenizer.from_config(
+        SimpleNamespace(
+            model=str(model_dir),
+            custom_pipeline_args={
+                "sound_normalize_latents": True,
+                "sound_normalization_type": "tanh",
+                "sound_tanh_input_scale": 2.0,
+            },
+            model_config={
+                "sound_tokenizer": {
+                    "normalize_latents": False,
+                    "normalization_type": "none",
+                    "tanh_input_scale": 1.0,
+                }
+            },
+            dtype=torch.float32,
+        )
+    )
+
+    assert created["normalize_latents"] is True
+    assert created["normalization_type"] == "tanh"
+    assert created["tanh_input_scale"] == 2.0
+
+
+def test_from_config_uses_component_config_architecture_values(
+    tmp_path,
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    from vllm_omni.diffusion.models.cosmos3 import sound_tokenizer
+
+    model_dir = tmp_path / "model"
+    tokenizer_dir = model_dir / "sound_tokenizer"
+    tokenizer_dir.mkdir(parents=True)
+    (tokenizer_dir / "model.safetensors").write_bytes(b"stub")
+    (tokenizer_dir / "config.json").write_text(
+        (
+            '{"sampling_rate": 48000, "dec_out_channels": 2, '
+            '"vocoder_input_dim": 64, "hop_size": 1920}'
+        ),
+        encoding="utf-8",
+    )
+
+    created = {}
+
+    class FakeAVAE(_FakeAVAEAudioTokenizer):
+        def __init__(self, **kwargs) -> None:
+            created.update(kwargs)
+            super().__init__(**kwargs)
+
+    monkeypatch.setattr(sound_tokenizer, "Cosmos3AVAEAudioTokenizer", FakeAVAE)
+    monkeypatch.setattr(sound_tokenizer, "get_local_device", lambda: torch.device("cpu"))
+
+    tokenizer = sound_tokenizer.Cosmos3SoundTokenizer.from_config(
+        SimpleNamespace(
+            model=str(model_dir),
+            custom_pipeline_args={},
+            model_config={
+                "sound_tokenizer": {
+                    "sample_rate": 32000,
+                    "audio_channels": 1,
+                    "io_channels": 3,
+                    "hop_size": 800,
+                }
+            },
+            dtype=torch.float32,
+        )
+    )
+
+    assert created["sample_rate"] == 48000
+    assert created["audio_channels"] == 2
+    assert created["io_channels"] == 64
+    assert created["hop_size"] == 1920
+    assert tokenizer.sample_rate == 48000
+    assert tokenizer.latent_ch == 64
+    assert tokenizer.hop_size == 1920
+
+
+def test_from_config_rejects_custom_architecture_conflict_with_component_config(
+    tmp_path,
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    from vllm_omni.diffusion.models.cosmos3 import sound_tokenizer
+
+    model_dir = tmp_path / "model"
+    tokenizer_dir = model_dir / "sound_tokenizer"
+    tokenizer_dir.mkdir(parents=True)
+    (tokenizer_dir / "model.safetensors").write_bytes(b"stub")
+    (tokenizer_dir / "config.json").write_text(
+        (
+            '{"sampling_rate": 48000, "dec_out_channels": 2, '
+            '"vocoder_input_dim": 64, "hop_size": 1920}'
+        ),
+        encoding="utf-8",
+    )
+
+    class FakeAVAE(_FakeAVAEAudioTokenizer):
+        pass
+
+    monkeypatch.setattr(sound_tokenizer, "Cosmos3AVAEAudioTokenizer", FakeAVAE)
+    monkeypatch.setattr(sound_tokenizer, "get_local_device", lambda: torch.device("cpu"))
+
+    with pytest.raises(ValueError, match=r"sample_rate.*48000.*32000"):
+        sound_tokenizer.Cosmos3SoundTokenizer.from_config(
+            SimpleNamespace(
+                model=str(model_dir),
+                custom_pipeline_args={"sound_sample_rate": 32000},
+                dtype=torch.float32,
+            )
+        )
diff --git a/tests/diffusion/models/test_cosmos3_guardrails.py b/tests/diffusion/models/test_cosmos3_guardrails.py
new file mode 100644
index 00000000000..53b03114200
--- /dev/null
+++ b/tests/diffusion/models/test_cosmos3_guardrails.py
@@ -0,0 +1,111 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from __future__ import annotations
+
+import pytest
+import torch
+from transformers.modeling_outputs import BaseModelOutputWithPooling
+from transformers.tokenization_utils_base import BatchEncoding
+
+pytestmark = [pytest.mark.core_model, pytest.mark.cpu, pytest.mark.diffusion]
+
+
+class _FakeTokenizer:
+    def __init__(self, model_inputs):
+        self.model_inputs = model_inputs
+        self.decoded_ids: list[int] | None = None
+
+    def apply_chat_template(self, conversations, *, tokenize: bool, return_tensors: str, add_generation_prompt: bool):
+        assert conversations == [{"role": "user", "content": "a safe prompt"}]
+        assert tokenize is True
+        assert return_tensors == "pt"
+        assert add_generation_prompt is True
+        return self.model_inputs
+
+    def decode(self, token_ids, *, skip_special_tokens: bool) -> str:
+        assert skip_special_tokens is True
+        self.decoded_ids = token_ids.tolist()
+        return "safe"
+
+
+class _FakeModel:
+    def __init__(self) -> None:
+        self.calls: list[tuple[tuple[object, ...], dict[str, object]]] = []
+
+    def generate(self, *args, **kwargs):
+        self.calls.append((args, kwargs))
+        input_ids = args[0] if args else kwargs["input_ids"]
+        return torch.cat([input_ids, torch.tensor([[99]], dtype=input_ids.dtype)], dim=-1)
+
+
+def test_qwen_guardrail_generation_accepts_batch_encoding() -> None:
+    from vllm_omni.diffusion.models.cosmos3.guardrails import _generate_qwen_guardrail_response
+
+    input_ids = torch.tensor([[1, 2, 3]], dtype=torch.long)
+    attention_mask = torch.ones_like(input_ids)
+    tokenizer = _FakeTokenizer(BatchEncoding({"input_ids": input_ids, "attention_mask": attention_mask}))
+    model = _FakeModel()
+
+    response = _generate_qwen_guardrail_response("a safe prompt", tokenizer, model, "cpu")
+
+    assert response == "safe"
+    assert tokenizer.decoded_ids == [99]
+    args, kwargs = model.calls[0]
+    assert args == ()
+    assert torch.equal(kwargs["input_ids"], input_ids)
+    assert torch.equal(kwargs["attention_mask"], attention_mask)
+    assert kwargs["max_new_tokens"] == 128
+
+
+def test_qwen_guardrail_generation_accepts_tensor_input_ids() -> None:
+    from vllm_omni.diffusion.models.cosmos3.guardrails import _generate_qwen_guardrail_response
+
+    input_ids = torch.tensor([[1, 2, 3]], dtype=torch.long)
+    tokenizer = _FakeTokenizer(input_ids)
+    model = _FakeModel()
+
+    response = _generate_qwen_guardrail_response("a safe prompt", tokenizer, model, "cpu")
+
+    assert response == "safe"
+    assert tokenizer.decoded_ids == [99]
+    args, kwargs = model.calls[0]
+    assert len(args) == 1
+    assert torch.equal(args[0], input_ids)
+    assert kwargs == {"max_new_tokens": 128}
+
+
+def test_siglip_feature_extraction_accepts_tensor() -> None:
+    from vllm_omni.diffusion.models.cosmos3.guardrails import _extract_siglip_image_features
+
+    features = torch.randn(1, 1152)
+
+    assert _extract_siglip_image_features(features) is features
+
+
+def test_siglip_feature_extraction_accepts_base_model_output_with_pooling() -> None:
+    from vllm_omni.diffusion.models.cosmos3.guardrails import _extract_siglip_image_features
+
+    last_hidden_state = torch.randn(1, 729, 1152)
+    pooler_output = torch.randn(1, 1152)
+    output = BaseModelOutputWithPooling(last_hidden_state=last_hidden_state, pooler_output=pooler_output)
+
+    assert _extract_siglip_image_features(output) is pooler_output
+
+
+def test_siglip_feature_extraction_accepts_tuple_output() -> None:
+    from vllm_omni.diffusion.models.cosmos3.guardrails import _extract_siglip_image_features
+
+    last_hidden_state = torch.randn(1, 729, 1152)
+    pooler_output = torch.randn(1, 1152)
+
+    assert _extract_siglip_image_features((last_hidden_state, pooler_output)) is pooler_output
+
+
+def test_siglip_feature_extraction_rejects_unpooled_features() -> None:
+    from vllm_omni.diffusion.models.cosmos3.guardrails import _extract_siglip_image_features
+
+    last_hidden_state = torch.randn(1, 729, 1152)
+
+    with pytest.raises(TypeError, match="pooled features"):
+        _extract_siglip_image_features(last_hidden_state)
diff --git a/vllm_omni/diffusion/data.py b/vllm_omni/diffusion/data.py
index 9cbc7b87798..ec4d36ae5f1 100644
--- a/vllm_omni/diffusion/data.py
+++ b/vllm_omni/diffusion/data.py
@@ -938,6 +938,10 @@ class DiffusionRequestAbortedError(RuntimeError):
     """Raised when a diffusion request ends via user-visible abort."""
 
 
+class GuardrailViolationError(ValueError):
+    """Raised when a guardrail blocks user input or generated output."""
+
+
 @dataclass
 class AttentionSpec:
     """Specifies a backend and its backend-specific parameters for one attention role."""
diff --git a/vllm_omni/diffusion/models/cosmos3/guardrails.py b/vllm_omni/diffusion/models/cosmos3/guardrails.py
index a085c3f3a59..31b8dbc2bf9 100644
--- a/vllm_omni/diffusion/models/cosmos3/guardrails.py
+++ b/vllm_omni/diffusion/models/cosmos3/guardrails.py
@@ -13,7 +13,7 @@
 
 import os
 import warnings
-from collections.abc import Callable
+from collections.abc import Callable, Mapping
 from typing import Any
 
 import cv2
@@ -22,6 +22,7 @@
 import torch.nn as nn
 from vllm.logger import init_logger
 
+from vllm_omni.diffusion.data import GuardrailViolationError
 from vllm_omni.diffusion.models.progress_bar import _is_rank_zero
 
 logger = init_logger(__name__)
@@ -100,6 +101,103 @@ def _download_checkpoint() -> str:
     return snapshot_download(GUARDRAIL_HF_REPO, revision=GUARDRAIL_HF_REVISION)
 
 
+def _move_tokenizer_output_to_device(tokenizer_output: object, device: str) -> object:
+    if hasattr(tokenizer_output, "to"):
+        return tokenizer_output.to(device)
+    if isinstance(tokenizer_output, Mapping):
+        return {
+            key: value.to(device) if hasattr(value, "to") else value for key, value in tokenizer_output.items()
+        }
+    return tokenizer_output
+
+
+def _qwen_input_length(input_ids: object) -> int:
+    if hasattr(input_ids, "shape"):
+        return int(input_ids.shape[-1])
+    if isinstance(input_ids, list | tuple):
+        if input_ids and isinstance(input_ids[0], list | tuple):
+            return len(input_ids[0])
+        return len(input_ids)
+    raise TypeError(f"Qwen3Guard tokenizer returned unsupported input_ids type: {type(input_ids).__name__}")
+
+
+def _generate_qwen_guardrail_response(prompt: str, tokenizer: Any, model: Any, device: str) -> str:
+    conversations = [{"role": "user", "content": prompt}]
+    model_inputs = tokenizer.apply_chat_template(
+        conversations,
+        tokenize=True,
+        return_tensors="pt",
+        add_generation_prompt=True,
+    )
+    model_inputs = _move_tokenizer_output_to_device(model_inputs, device)
+
+    if isinstance(model_inputs, torch.Tensor):
+        input_ids = model_inputs
+        generate_kwargs = {}
+        generate_args = (input_ids,)
+    elif isinstance(model_inputs, Mapping):
+        if "input_ids" not in model_inputs:
+            raise TypeError("Qwen3Guard tokenizer output must include input_ids.")
+        input_ids = model_inputs["input_ids"]
+        generate_kwargs = dict(model_inputs)
+        generate_args = ()
+    else:
+        input_ids = getattr(model_inputs, "input_ids", None)
+        if input_ids is None:
+            raise TypeError(
+                "Qwen3Guard tokenizer must return a tensor or mapping with input_ids; "
+                f"got {type(model_inputs).__name__}"
+            )
+        generate_kwargs = {"input_ids": input_ids}
+        generate_args = ()
+
+    input_length = _qwen_input_length(input_ids)
+    with torch.no_grad():
+        output_ids = model.generate(*generate_args, **generate_kwargs, max_new_tokens=128)
+    return tokenizer.decode(
+        output_ids[0][input_length:],
+        skip_special_tokens=True,
+    )
+
+
+def _extract_siglip_image_features(features: object) -> torch.Tensor:
+    def _validate_features(tensor: torch.Tensor) -> torch.Tensor:
+        if tensor.dim() != 2:
+            raise TypeError(
+                "SigLIP image feature extractor returned features with shape "
+                f"{tuple(tensor.shape)}; expected pooled features with shape [batch, hidden]."
+            )
+        return tensor
+
+    if isinstance(features, torch.Tensor):
+        return _validate_features(features)
+
+    pooler_output = getattr(features, "pooler_output", None)
+    if isinstance(pooler_output, torch.Tensor):
+        return _validate_features(pooler_output)
+
+    image_embeds = getattr(features, "image_embeds", None)
+    if isinstance(image_embeds, torch.Tensor):
+        return _validate_features(image_embeds)
+
+    if isinstance(features, Mapping):
+        for key in ("pooler_output", "image_embeds"):
+            value = features.get(key)
+            if isinstance(value, torch.Tensor):
+                return _validate_features(value)
+
+    if isinstance(features, list | tuple):
+        if len(features) > 1 and isinstance(features[1], torch.Tensor):
+            return _validate_features(features[1])
+        if features and isinstance(features[0], torch.Tensor):
+            return _validate_features(features[0])
+
+    raise TypeError(
+        "SigLIP image feature extractor returned unsupported output type "
+        f"{type(features).__name__}; expected a tensor or output with pooled image features."
+    )
+
+
 def _build_text_guardrail(offload_to_cpu: bool) -> TextGuardrailFn:
     checkers: list[Callable[[str], tuple[bool, str]]] = []
 
@@ -155,19 +253,7 @@ def _blocklist_check(prompt: str) -> tuple[bool, str]:
         )
 
         def _qwen_check(prompt: str) -> tuple[bool, str]:
-            conversations = [{"role": "user", "content": prompt}]
-            input_ids = qwen_tokenizer.apply_chat_template(
-                conversations,
-                tokenize=True,
-                return_tensors="pt",
-                add_generation_prompt=True,
-            ).to(device)
-            with torch.no_grad():
-                output_ids = qwen_model.generate(input_ids, max_new_tokens=128)
-            response = qwen_tokenizer.decode(
-                output_ids[0][input_ids.shape[1] :],
-                skip_special_tokens=True,
-            )
+            response = _generate_qwen_guardrail_response(prompt, qwen_tokenizer, qwen_model, device)
             if "unsafe" in response.lower():
                 return False, f"Qwen3Guard: {response.strip()}"
             return True, ""
@@ -182,7 +268,7 @@ def text_guardrail(prompt: str) -> None:
         for checker in checkers:
             is_safe, msg = checker(prompt)
             if not is_safe:
-                raise ValueError(f"Guardrail blocked prompt: {msg}")
+                raise GuardrailViolationError(f"Guardrail blocked prompt: {msg}")
 
     return text_guardrail
 
@@ -192,14 +278,18 @@ def _build_video_guardrail(offload_to_cpu: bool) -> VideoGuardrailFn:
     safety_checker: Callable[[np.ndarray], tuple[bool, str]] | None = None
     face_blurrer: Callable[[np.ndarray], np.ndarray] | None = None
 
+    # `offload_to_cpu` controls idle weight placement only; the forward pass
+    # always runs on `compute_device`, and offloaded weights return to CPU after.
+    compute_device = "cuda"
+    idle_device = "cpu" if offload_to_cpu else compute_device
+
     # 1. Video content safety filter: SigLIP so400m + SafetyClassifier
     try:
         from PIL import Image
         from transformers import SiglipModel, SiglipProcessor
 
-        device = "cpu" if offload_to_cpu else "cuda"
         siglip_id = "google/siglip-so400m-patch14-384"
-        siglip_model = SiglipModel.from_pretrained(siglip_id).to(device, dtype=torch.float32).eval()
+        siglip_model = SiglipModel.from_pretrained(siglip_id).to(idle_device, dtype=torch.float32).eval()
         siglip_processor = SiglipProcessor.from_pretrained(siglip_id)
 
         classifier = SafetyClassifier(input_size=1152, num_classes=7)
@@ -208,31 +298,35 @@ def _build_video_guardrail(offload_to_cpu: bool) -> VideoGuardrailFn:
         # Checkpoint keys have "network." prefix from the VideoSafetyModel wrapper.
         state = {k.removeprefix("network."): v for k, v in checkpoint["model"].items()}
         classifier.load_state_dict(state)
-        classifier = classifier.to(device, dtype=torch.float32).eval()
+        classifier = classifier.to(idle_device, dtype=torch.float32).eval()
 
         def _safety_check(frames: np.ndarray) -> tuple[bool, str]:
             nonlocal siglip_model, classifier
+            total = len(frames)
+            if total == 0:
+                return True, ""
             if offload_to_cpu:
-                siglip_model = siglip_model.to("cuda")
-                classifier = classifier.to("cuda")
+                siglip_model = siglip_model.to(compute_device)
+                classifier = classifier.to(compute_device)
 
             unsafe_count = 0
-            total = len(frames)
-            for frame in frames:
-                img = Image.fromarray(frame)
-                inputs = siglip_processor(images=img, return_tensors="pt").to("cuda", dtype=torch.float32)
-                with torch.no_grad():
-                    features = siglip_model.get_image_features(**inputs)
-                    features = features / features.norm(dim=-1, keepdim=True)
-                    logits = classifier(features)
-                    pred = logits.argmax(dim=-1).item()
-                class_name = CLASS_IDX_TO_NAME.get(pred, "Unknown")
-                if class_name != "Safe":
-                    unsafe_count += 1
-
-            if offload_to_cpu:
-                siglip_model = siglip_model.to("cpu")
-                classifier = classifier.to("cpu")
+            try:
+                for frame in frames:
+                    img = Image.fromarray(frame)
+                    inputs = siglip_processor(images=img, return_tensors="pt").to(compute_device, dtype=torch.float32)
+                    with torch.no_grad():
+                        features = siglip_model.get_image_features(**inputs)
+                        features = _extract_siglip_image_features(features)
+                        features = torch.nn.functional.normalize(features, p=2, dim=-1)
+                        logits = classifier(features)
+                        pred = logits.argmax(dim=-1).item()
+                    class_name = CLASS_IDX_TO_NAME.get(pred, "Unknown")
+                    if class_name != "Safe":
+                        unsafe_count += 1
+            finally:
+                if offload_to_cpu:
+                    siglip_model = siglip_model.to("cpu")
+                    classifier = classifier.to("cpu")
 
             if unsafe_count / total > CUTOFF_UNSAFE_FRAMES_PERCENT / 100:
                 return False, f"Video content safety: {unsafe_count}/{total} frames unsafe"
@@ -269,8 +363,7 @@ def _safety_check(frames: np.ndarray) -> tuple[bool, str]:
             k.replace("module.", "", 1) if k.startswith("module.") else k: v for k, v in pretrained_dict.items()
         }
         retinaface_net.load_state_dict(pretrained_dict, strict=False)
-        retinaface_device = "cpu" if offload_to_cpu else "cuda"
-        retinaface_net = retinaface_net.to(retinaface_device, dtype=torch.float32).eval()
+        retinaface_net = retinaface_net.to(idle_device, dtype=torch.float32).eval()
 
         CONF_THRESH = 0.7
         NMS_THRESH = 0.4
@@ -294,60 +387,63 @@ def _decode_batch(loc, priors, variances):
         def _face_blur(frames: np.ndarray) -> np.ndarray:
             nonlocal retinaface_net
             if offload_to_cpu:
-                retinaface_net = retinaface_net.to("cuda")
+                retinaface_net = retinaface_net.to(compute_device)
 
             prior_data = None
             scale = None
             result_frames = []
 
-            for frame in frames:
-                frame_t = torch.from_numpy(frame).to("cuda", dtype=torch.float32)
-                frame_t = frame_t.permute(2, 0, 1).unsqueeze(0)  # [1, C, H, W]
-                frame_t = frame_t[:, [2, 1, 0], :, :]  # RGB → BGR
-                means = torch.tensor([104.0, 117.0, 123.0], device="cuda", dtype=torch.float32).view(1, 3, 1, 1)
-                frame_t = frame_t - means
-
-                h, w = frame_t.shape[2], frame_t.shape[3]
-                if prior_data is None:
-                    priorbox = PriorBox(cfg, image_size=(h, w))
-                    prior_data = priorbox.forward().to("cuda", dtype=torch.float32)
-                if scale is None:
-                    scale = torch.tensor([w, h, w, h], device="cuda", dtype=torch.float32)
-
-                with torch.no_grad():
-                    loc, conf, _ = retinaface_net(frame_t)
-
-                boxes = _decode_batch(loc, prior_data, cfg["variance"])
-                boxes = (boxes * scale).squeeze(0).cpu().numpy()
-                scores = conf.squeeze(0)[:, 1].cpu().numpy()
-
-                # Filter by confidence
-                inds = np.where(scores > CONF_THRESH)[0]
-                boxes_f = boxes[inds]
-                scores_f = scores[inds]
-                order = scores_f.argsort()[::-1][:TOP_K]
-                boxes_f = boxes_f[order]
-                scores_f = scores_f[order]
-
-                # NMS
-                dets = np.hstack((boxes_f, scores_f[:, np.newaxis])).astype(np.float32)
-                keep = py_cpu_nms(dets, NMS_THRESH)
-                dets = dets[keep][:KEEP_TOP_K]
-
-                out_frame = frame.copy()
-                for det in dets:
-                    x1, y1, x2, y2 = map(int, det[:4])
-                    if x2 - x1 < 20 or y2 - y1 < 20:
-                        continue
-                    max_h, max_w = out_frame.shape[:2]
-                    y1c, y2c = max(y1, 0), min(y2, max_h)
-                    x1c, x2c = max(x1, 0), min(x2, max_w)
-                    out_frame[y1c:y2c, x1c:x2c] = _pixelate_face(out_frame[y1c:y2c, x1c:x2c])
-
-                result_frames.append(out_frame)
-
-            if offload_to_cpu:
-                retinaface_net = retinaface_net.to("cpu")
+            try:
+                for frame in frames:
+                    frame_t = torch.from_numpy(frame).to(compute_device, dtype=torch.float32)
+                    frame_t = frame_t.permute(2, 0, 1).unsqueeze(0)  # [1, C, H, W]
+                    frame_t = frame_t[:, [2, 1, 0], :, :]  # RGB → BGR
+                    means = torch.tensor(
+                        [104.0, 117.0, 123.0], device=compute_device, dtype=torch.float32
+                    ).view(1, 3, 1, 1)
+                    frame_t = frame_t - means
+
+                    h, w = frame_t.shape[2], frame_t.shape[3]
+                    if prior_data is None:
+                        priorbox = PriorBox(cfg, image_size=(h, w))
+                        prior_data = priorbox.forward().to(compute_device, dtype=torch.float32)
+                    if scale is None:
+                        scale = torch.tensor([w, h, w, h], device=compute_device, dtype=torch.float32)
+
+                    with torch.no_grad():
+                        loc, conf, _ = retinaface_net(frame_t)
+
+                    boxes = _decode_batch(loc, prior_data, cfg["variance"])
+                    boxes = (boxes * scale).squeeze(0).cpu().numpy()
+                    scores = conf.squeeze(0)[:, 1].cpu().numpy()
+
+                    # Filter by confidence
+                    inds = np.where(scores > CONF_THRESH)[0]
+                    boxes_f = boxes[inds]
+                    scores_f = scores[inds]
+                    order = scores_f.argsort()[::-1][:TOP_K]
+                    boxes_f = boxes_f[order]
+                    scores_f = scores_f[order]
+
+                    # NMS
+                    dets = np.hstack((boxes_f, scores_f[:, np.newaxis])).astype(np.float32)
+                    keep = py_cpu_nms(dets, NMS_THRESH)
+                    dets = dets[keep][:KEEP_TOP_K]
+
+                    out_frame = frame.copy()
+                    for det in dets:
+                        x1, y1, x2, y2 = map(int, det[:4])
+                        if x2 - x1 < 20 or y2 - y1 < 20:
+                            continue
+                        max_h, max_w = out_frame.shape[:2]
+                        y1c, y2c = max(y1, 0), min(y2, max_h)
+                        x1c, x2c = max(x1, 0), min(x2, max_w)
+                        out_frame[y1c:y2c, x1c:x2c] = _pixelate_face(out_frame[y1c:y2c, x1c:x2c])
+
+                    result_frames.append(out_frame)
+            finally:
+                if offload_to_cpu:
+                    retinaface_net = retinaface_net.to("cpu")
 
             return np.array(result_frames)
 
@@ -361,7 +457,7 @@ def video_guardrail(frames: np.ndarray) -> np.ndarray:
         if safety_checker is not None:
             is_safe, msg = safety_checker(frames)
             if not is_safe:
-                raise ValueError(f"Guardrail blocked video: {msg}")
+                raise GuardrailViolationError(f"Guardrail blocked video: {msg}")
         if face_blurrer is not None:
             frames = face_blurrer(frames)
         return frames
@@ -378,8 +474,13 @@ def _init_default_guardrails(offload_to_cpu: bool = False) -> None:
         return
     if _is_rank_zero():
         logger.info("Initializing Cosmos3 guardrails (offload_to_cpu=%s)...", offload_to_cpu)
-    _text_guardrail = _build_text_guardrail(offload_to_cpu)
-    _video_guardrail = _build_video_guardrail(offload_to_cpu)
+    # Build into locals first so a partial failure doesn't leave the module
+    # in a half-initialized state (one guardrail set, the other missing,
+    # and `_initialized` still False so the next call retries from scratch).
+    text_fn = _build_text_guardrail(offload_to_cpu)
+    video_fn = _build_video_guardrail(offload_to_cpu)
+    _text_guardrail = text_fn
+    _video_guardrail = video_fn
     _initialized = True
     if _is_rank_zero():
         logger.info("Cosmos3 guardrails initialized.")
@@ -419,10 +520,27 @@ def check_video_safety(video_tensor: torch.Tensor) -> torch.Tensor:
     return result.to(video_tensor.device)
 
 
-def is_guardrails_enabled(od_config: Any) -> bool:
-    return False
+def is_guardrails_enabled(od_config: Any, sampling_params: Any = None) -> bool:
+    """Resolve the active guardrail gate.
+
+    Server-level ``od_config.model_config["guardrails"]`` decides whether the
+    guardrail models are loaded at all (eager load at pipeline build time).
+    When that is False, no per-request override can turn checks back on,
+    because the singletons in this module are never populated.
+
+    When the server gate is on, ``sampling_params.extra_args["guardrails"]``
+    may override on a per-request basis: a falsy non-``None`` value skips the
+    check for that request; ``None``, missing, or truthy keeps checks on.
+    """
     cfg = getattr(od_config, "model_config", None) or {}
-    return bool(cfg.get("guardrails", True))
+    if not bool(cfg.get("guardrails", True)):
+        return False
+    if sampling_params is not None:
+        extra = getattr(sampling_params, "extra_args", None) or {}
+        per_request = extra.get("guardrails")
+        if per_request is not None:
+            return bool(per_request)
+    return True
 
 
 def get_offload_flag(od_config: Any) -> bool:
diff --git a/vllm_omni/diffusion/models/cosmos3/pipeline_cosmos3.py b/vllm_omni/diffusion/models/cosmos3/pipeline_cosmos3.py
index b6e9f16b398..2dacb80f948 100644
--- a/vllm_omni/diffusion/models/cosmos3/pipeline_cosmos3.py
+++ b/vllm_omni/diffusion/models/cosmos3/pipeline_cosmos3.py
@@ -97,8 +97,10 @@ def get_cosmos3_pre_process_func(od_config: OmniDiffusionConfig):
     from .guardrails import check_text_safety, ensure_initialized, is_guardrails_enabled
 
     video_processor = VideoProcessor(vae_scale_factor=16)
-    guardrails_on = is_guardrails_enabled(od_config)
-    if guardrails_on:
+    # Eager-load guardrail models at pipeline build time when the server-level
+    # gate is on. Per-request overrides only decide whether the loaded models
+    # are *invoked* — they cannot turn checks on without a server-side preload.
+    if is_guardrails_enabled(od_config):
         ensure_initialized(od_config)
 
     def _extra_args(request: OmniDiffusionRequest) -> dict[str, Any]:
@@ -162,7 +164,7 @@ def _preprocess_action_video(frames: list[Any], target_h: int, target_w: int) ->
 
     def pre_process_func(request: OmniDiffusionRequest) -> OmniDiffusionRequest:
         action_mode = _request_action_mode(request)
-        if guardrails_on:
+        if is_guardrails_enabled(od_config, request.sampling_params):
             for prompt in request.prompts:
                 text = prompt if isinstance(prompt, str) else prompt.get("prompt", "")
                 check_text_safety(text)
@@ -240,7 +242,6 @@ def get_cosmos3_post_process_func(od_config: OmniDiffusionConfig):
     from .guardrails import check_video_safety, is_guardrails_enabled
 
     video_processor = VideoProcessor(vae_scale_factor=16)
-    guardrails_on = is_guardrails_enabled(od_config)
 
     def _sampling_param(sampling_params, key: str, default=None):
         extra = getattr(sampling_params, "extra_args", None)
@@ -306,12 +307,12 @@ def post_process_func(
                     f"with shape [B, C, 1, H, W], got {tuple(video.shape)}."
                 )
             image = video.squeeze(2)  # [B, 3, H, W]
-            if guardrails_on:
+            if is_guardrails_enabled(od_config, sampling_params):
                 # check_video_safety expects a 5D tensor; re-add T axis.
                 checked = check_video_safety(image.unsqueeze(2))
                 image = checked.squeeze(2)
             return video_processor.postprocess(image, output_type="pil")
-        if guardrails_on:
+        if is_guardrails_enabled(od_config, sampling_params):
             video = check_video_safety(video)
         result = {"video": video_processor.postprocess_video(video, output_type=output_type)}
         if audio is None:
@@ -800,6 +801,15 @@ def _tokenize_prompt(
         token_ids = self._normalize_token_ids(
             self.tokenizer.apply_chat_template(conversations, tokenize=True, add_generation_prompt=True)
         )
+        original_token_count = len(token_ids)
+        if original_token_count > max_sequence_length and _is_rank_zero():
+            logger.warning(
+                "Cosmos3 prompt token_ids shortened to max_sequence_length: "
+                "original_token_count=%d, max_sequence_length=%d, removed_token_count=%d",
+                original_token_count,
+                max_sequence_length,
+                original_token_count - max_sequence_length,
+            )
         token_ids = token_ids[:max_sequence_length]
         token_ids.append(self.tokenizer.eos_token_id)  # 151645
         token_ids.append(self.tokenizer.convert_tokens_to_ids("<|vision_start|>"))  # 151652
diff --git a/vllm_omni/entrypoints/openai/serving_video.py b/vllm_omni/entrypoints/openai/serving_video.py
index f896fa15d75..fcc65fab4aa 100644
--- a/vllm_omni/entrypoints/openai/serving_video.py
+++ b/vllm_omni/entrypoints/openai/serving_video.py
@@ -14,6 +14,7 @@
 from vllm.engine.protocol import EngineClient
 from vllm.logger import init_logger
 
+from vllm_omni.diffusion.data import GuardrailViolationError
 from vllm_omni.entrypoints.async_omni import AsyncOmni
 from vllm_omni.entrypoints.openai.protocol.videos import (
     VideoAction,
@@ -336,12 +337,18 @@ async def _run_generation(
         )
 
         result = None
-        async for output in engine_client.generate(
-            prompt=prompt,
-            request_id=request_id,
-            sampling_params_list=sampling_params_list,
-        ):
-            result = output
+        try:
+            async for output in engine_client.generate(
+                prompt=prompt,
+                request_id=request_id,
+                sampling_params_list=sampling_params_list,
+            ):
+                result = output
+        except GuardrailViolationError as exc:
+            raise HTTPException(
+                status_code=HTTPStatus.BAD_REQUEST.value,
+                detail=str(exc),
+            ) from exc
 
         if result is None:
             raise HTTPException(

From 095f6b584ebe96896d4cd8ee058f0dca4cd6bfa3 Mon Sep 17 00:00:00 2001
From: Maciej Bala <mbala@nvidia.com>
Date: Fri, 15 May 2026 15:11:43 +0200
Subject: [PATCH 10/41] Linter fixes

Signed-off-by: Maciej Bala <mbala@nvidia.com>
---
 .../models/cosmos3/test_cosmos3_sound_tokenizer.py     | 10 ++--------
 vllm_omni/diffusion/models/cosmos3/guardrails.py       | 10 ++++------
 2 files changed, 6 insertions(+), 14 deletions(-)

diff --git a/tests/diffusion/models/cosmos3/test_cosmos3_sound_tokenizer.py b/tests/diffusion/models/cosmos3/test_cosmos3_sound_tokenizer.py
index b647bc7c0dc..cb73eb0d818 100644
--- a/tests/diffusion/models/cosmos3/test_cosmos3_sound_tokenizer.py
+++ b/tests/diffusion/models/cosmos3/test_cosmos3_sound_tokenizer.py
@@ -275,10 +275,7 @@ def test_from_config_uses_component_config_architecture_values(
     tokenizer_dir.mkdir(parents=True)
     (tokenizer_dir / "model.safetensors").write_bytes(b"stub")
     (tokenizer_dir / "config.json").write_text(
-        (
-            '{"sampling_rate": 48000, "dec_out_channels": 2, '
-            '"vocoder_input_dim": 64, "hop_size": 1920}'
-        ),
+        ('{"sampling_rate": 48000, "dec_out_channels": 2, "vocoder_input_dim": 64, "hop_size": 1920}'),
         encoding="utf-8",
     )
 
@@ -328,10 +325,7 @@ def test_from_config_rejects_custom_architecture_conflict_with_component_config(
     tokenizer_dir.mkdir(parents=True)
     (tokenizer_dir / "model.safetensors").write_bytes(b"stub")
     (tokenizer_dir / "config.json").write_text(
-        (
-            '{"sampling_rate": 48000, "dec_out_channels": 2, '
-            '"vocoder_input_dim": 64, "hop_size": 1920}'
-        ),
+        ('{"sampling_rate": 48000, "dec_out_channels": 2, "vocoder_input_dim": 64, "hop_size": 1920}'),
         encoding="utf-8",
     )
 
diff --git a/vllm_omni/diffusion/models/cosmos3/guardrails.py b/vllm_omni/diffusion/models/cosmos3/guardrails.py
index 31b8dbc2bf9..e739d17b962 100644
--- a/vllm_omni/diffusion/models/cosmos3/guardrails.py
+++ b/vllm_omni/diffusion/models/cosmos3/guardrails.py
@@ -105,9 +105,7 @@ def _move_tokenizer_output_to_device(tokenizer_output: object, device: str) -> o
     if hasattr(tokenizer_output, "to"):
         return tokenizer_output.to(device)
     if isinstance(tokenizer_output, Mapping):
-        return {
-            key: value.to(device) if hasattr(value, "to") else value for key, value in tokenizer_output.items()
-        }
+        return {key: value.to(device) if hasattr(value, "to") else value for key, value in tokenizer_output.items()}
     return tokenizer_output
 
 
@@ -398,9 +396,9 @@ def _face_blur(frames: np.ndarray) -> np.ndarray:
                     frame_t = torch.from_numpy(frame).to(compute_device, dtype=torch.float32)
                     frame_t = frame_t.permute(2, 0, 1).unsqueeze(0)  # [1, C, H, W]
                     frame_t = frame_t[:, [2, 1, 0], :, :]  # RGB → BGR
-                    means = torch.tensor(
-                        [104.0, 117.0, 123.0], device=compute_device, dtype=torch.float32
-                    ).view(1, 3, 1, 1)
+                    means = torch.tensor([104.0, 117.0, 123.0], device=compute_device, dtype=torch.float32).view(
+                        1, 3, 1, 1
+                    )
                     frame_t = frame_t - means
 
                     h, w = frame_t.shape[2], frame_t.shape[3]

From 05850957171cf87def078a357273b40b68fd5d4f Mon Sep 17 00:00:00 2001
From: Maciej Bala <mbala@nvidia.com>
Date: Mon, 18 May 2026 13:24:27 +0200
Subject: [PATCH 11/41] Reworked guardrail error

Signed-off-by: Maciej Bala <mbala@nvidia.com>
---
 tests/entrypoints/test_omni_entrypoints.py    | 86 ++++++++++++++++++-
 vllm_omni/diffusion/data.py                   | 14 +++
 vllm_omni/diffusion/diffusion_engine.py       | 17 +++-
 .../inline_stage_diffusion_client.py          |  4 +-
 vllm_omni/diffusion/stage_diffusion_client.py |  9 +-
 vllm_omni/diffusion/stage_diffusion_proc.py   |  4 +-
 .../diffusion/worker/diffusion_worker.py      | 13 ++-
 vllm_omni/engine/messages.py                  |  1 +
 vllm_omni/engine/orchestrator.py              |  1 +
 vllm_omni/entrypoints/async_omni.py           | 11 ++-
 vllm_omni/entrypoints/omni_base.py            | 20 ++++-
 vllm_omni/outputs.py                          |  4 +
 12 files changed, 172 insertions(+), 12 deletions(-)

diff --git a/tests/entrypoints/test_omni_entrypoints.py b/tests/entrypoints/test_omni_entrypoints.py
index 2360a68fc4d..f84d07b6811 100644
--- a/tests/entrypoints/test_omni_entrypoints.py
+++ b/tests/entrypoints/test_omni_entrypoints.py
@@ -13,6 +13,10 @@
 from vllm.sampling_params import RequestOutputKind, SamplingParams
 from vllm.v1.engine.exceptions import EngineDeadError, EngineGenerateError
 
+from vllm_omni.diffusion.data import (
+    DiffusionErrorType,
+    GuardrailViolationError,
+)
 from vllm_omni.engine.async_omni_engine import StageRuntimeInfo
 from vllm_omni.engine.messages import ErrorMessage, OutputMessage
 from vllm_omni.entrypoints.async_omni import AsyncOmni
@@ -413,6 +417,17 @@ def _enqueue_error_message(engine: FakeAsyncOmniEngine, msg: dict[str, Any]) ->
     )
 
 
+def _enqueue_guardrail_error_message(engine: FakeAsyncOmniEngine, msg: dict[str, Any]) -> None:
+    engine.output_q.put_nowait(
+        ErrorMessage(
+            request_id=msg["request_id"],
+            stage_id=0,
+            error="Guardrail blocked prompt: unsafe",
+            error_type=DiffusionErrorType.GUARDRAIL_VIOLATION,
+        )
+    )
+
+
 def _enqueue_fatal_error_message(engine: FakeAsyncOmniEngine, msg: dict[str, Any]) -> None:
     engine.output_q.put_nowait(
         ErrorMessage(
@@ -847,11 +862,16 @@ def _enqueue_stage_error(
     *,
     error_text: str,
     kill_engine: bool = False,
+    error_type: str | None = None,
 ):
     """Enqueue a stage error output, optionally killing the engine."""
     if kill_engine:
         engine._alive = False
-    engine_output = OmniRequestOutput.from_error(msg["request_id"], error_text)
+    engine_output = OmniRequestOutput.from_error(
+        msg["request_id"],
+        error_text,
+        error_type=error_type,
+    )
     engine_output.payload = ""
     engine.output_q.put_nowait(
         OutputMessage(
@@ -903,6 +923,49 @@ async def test_async_omni_propagates_engine_generate_error(monkeypatch: pytest.M
         app.shutdown()
 
 
+@pytest.mark.asyncio
+async def test_async_omni_rehydrates_guardrail_stage_error(monkeypatch: pytest.MonkeyPatch):
+    """Structured guardrail errors should not be flattened to EngineGenerateError."""
+
+    engine = FakeAsyncOmniEngine(
+        stage_metadata=THREE_STAGE_META,
+        on_add_request=lambda eng, msg: _enqueue_stage_error(
+            eng,
+            msg,
+            error_text="Guardrail blocked prompt: unsafe",
+            error_type=DiffusionErrorType.GUARDRAIL_VIOLATION,
+        ),
+    )
+    _patch_engine(monkeypatch, engine)
+
+    app = AsyncOmni("dummy-model")
+    try:
+        with pytest.raises(GuardrailViolationError, match="Guardrail blocked prompt"):
+            async for _ in app.generate(prompt="hello", request_id="req-guardrail-output"):
+                pass
+    finally:
+        app.shutdown()
+
+
+@pytest.mark.asyncio
+async def test_async_omni_rehydrates_guardrail_error_message(monkeypatch: pytest.MonkeyPatch):
+    """Request-scoped ErrorMessage metadata should reach the request generator."""
+
+    engine = FakeAsyncOmniEngine(
+        stage_metadata=THREE_STAGE_META,
+        on_add_request=_enqueue_guardrail_error_message,
+    )
+    _patch_engine(monkeypatch, engine)
+
+    app = AsyncOmni("dummy-model")
+    try:
+        with pytest.raises(GuardrailViolationError, match="Guardrail blocked prompt"):
+            async for _ in app.generate(prompt="hello", request_id="req-guardrail-message"):
+                pass
+    finally:
+        app.shutdown()
+
+
 # ───────── OmniBase.check_health() aggregation ─────────
 
 
@@ -1003,6 +1066,27 @@ def test_omni_propagates_engine_generate_error(monkeypatch: pytest.MonkeyPatch):
         app.shutdown()
 
 
+def test_omni_rehydrates_guardrail_stage_error(monkeypatch: pytest.MonkeyPatch):
+    """Synchronous generation should preserve structured guardrail failures."""
+    engine = FakeAsyncOmniEngine(
+        stage_metadata=THREE_STAGE_META,
+        on_add_request=lambda eng, msg: _enqueue_stage_error(
+            eng,
+            msg,
+            error_text="Guardrail blocked video: unsafe",
+            error_type=DiffusionErrorType.GUARDRAIL_VIOLATION,
+        ),
+    )
+    _patch_engine(monkeypatch, engine)
+
+    app = Omni("dummy-model")
+    try:
+        with pytest.raises(GuardrailViolationError, match="Guardrail blocked video"):
+            list(app.generate(["hello"], py_generator=False, use_tqdm=False))
+    finally:
+        app.shutdown()
+
+
 def test_omni_errored_property_alive(monkeypatch: pytest.MonkeyPatch):
     """Omni.errored (inherited from OmniBase) returns False when healthy."""
     engine = FakeAsyncOmniEngine(stage_metadata=THREE_STAGE_META)
diff --git a/vllm_omni/diffusion/data.py b/vllm_omni/diffusion/data.py
index ec4d36ae5f1..0afb9edc653 100644
--- a/vllm_omni/diffusion/data.py
+++ b/vllm_omni/diffusion/data.py
@@ -6,6 +6,7 @@
 import random
 from collections.abc import Callable, Mapping
 from dataclasses import dataclass, field, fields
+from enum import Enum
 from typing import TYPE_CHECKING, Any
 
 import diffusers
@@ -915,6 +916,7 @@ class DiffusionOutput:
     trajectory_log_probs: torch.Tensor | dict | None = None
     trajectory_decoded: list[Image.Image] | None = None
     error: str | None = None
+    error_type: str | None = None
     aborted: bool = False
     abort_message: str | None = None
 
@@ -942,6 +944,18 @@ class GuardrailViolationError(ValueError):
     """Raised when a guardrail blocks user input or generated output."""
 
 
+class DiffusionErrorType(str, Enum):
+    """Stable, serializable identifiers for recoverable diffusion errors."""
+
+    GUARDRAIL_VIOLATION = "guardrail_violation"
+
+
+def diffusion_error_type_from_exception(exc: BaseException) -> DiffusionErrorType | None:
+    if isinstance(exc, GuardrailViolationError):
+        return DiffusionErrorType.GUARDRAIL_VIOLATION
+    return None
+
+
 @dataclass
 class AttentionSpec:
     """Specifies a backend and its backend-specific parameters for one attention role."""
diff --git a/vllm_omni/diffusion/diffusion_engine.py b/vllm_omni/diffusion/diffusion_engine.py
index 17259467a64..327aeece08a 100644
--- a/vllm_omni/diffusion/diffusion_engine.py
+++ b/vllm_omni/diffusion/diffusion_engine.py
@@ -20,9 +20,12 @@
 from vllm.v1.engine.exceptions import EngineDeadError
 
 from vllm_omni.diffusion.data import (
+    DiffusionErrorType,
     DiffusionOutput,
     DiffusionRequestAbortedError,
+    GuardrailViolationError,
     OmniDiffusionConfig,
+    diffusion_error_type_from_exception,
 )
 from vllm_omni.diffusion.executor.abstract import DiffusionExecutor
 from vllm_omni.diffusion.registry import (
@@ -215,6 +218,8 @@ async def step(self, request: OmniDiffusionRequest) -> list[OmniRequestOutput]:
         if output.aborted:
             raise DiffusionRequestAbortedError(output.abort_message or "Diffusion request aborted.")
         if output.error:
+            if output.error_type == DiffusionErrorType.GUARDRAIL_VIOLATION:
+                raise GuardrailViolationError(output.error)
             raise RuntimeError(output.error)
         logger.debug("Generation completed successfully.")
 
@@ -473,13 +478,17 @@ def _busy_loop(self):
                 logger.error(
                     "Execution failed for diffusion requests %s", sched_output.scheduled_req_ids, exc_info=True
                 )
+                error_type = diffusion_error_type_from_exception(exc)
                 runner_output = BatchRunnerOutput.from_list(
                     [
                         RunnerOutput(
                             req_id=req_id,
                             step_index=None,
                             finished=True,
-                            result=DiffusionOutput(error=str(exc)),
+                            result=DiffusionOutput(
+                                error=str(exc),
+                                error_type=error_type,
+                            ),
                         )
                         for req_id in sched_output.scheduled_req_ids
                     ]
@@ -627,11 +636,15 @@ def add_req_and_wait_for_response(self, request: OmniDiffusionRequest) -> Diffus
                     raise
                 except Exception as exc:
                     logger.error("Execution failed for diffusion request %s", sched_req_id, exc_info=True)
+                    error_type = diffusion_error_type_from_exception(exc)
                     runner_output = RunnerOutput(
                         req_id=sched_req_id,
                         step_index=None,
                         finished=True,
-                        result=DiffusionOutput(error=str(exc)),
+                        result=DiffusionOutput(
+                            error=str(exc),
+                            error_type=error_type,
+                        ),
                     )
 
                 self._process_aborts_queue()
diff --git a/vllm_omni/diffusion/inline_stage_diffusion_client.py b/vllm_omni/diffusion/inline_stage_diffusion_client.py
index 62eff7ecc47..576de66457b 100644
--- a/vllm_omni/diffusion/inline_stage_diffusion_client.py
+++ b/vllm_omni/diffusion/inline_stage_diffusion_client.py
@@ -17,7 +17,7 @@
 from vllm.logger import init_logger
 from vllm.v1.engine.exceptions import EngineDeadError
 
-from vllm_omni.diffusion.data import DiffusionRequestAbortedError
+from vllm_omni.diffusion.data import DiffusionRequestAbortedError, diffusion_error_type_from_exception
 from vllm_omni.diffusion.diffusion_engine import DiffusionEngine
 from vllm_omni.diffusion.request import OmniDiffusionRequest
 from vllm_omni.engine.stage_client import StageClientBase
@@ -135,6 +135,7 @@ async def _dispatch_request(
                 images=[],
             )
             error_output.error = str(e)
+            error_output.error_type = diffusion_error_type_from_exception(e)
             self._output_queue.put_nowait(error_output)
         finally:
             self._tasks.pop(request_id, None)
@@ -241,6 +242,7 @@ async def _dispatch_batch(
                 images=[],
             )
             error_output.error = str(e)
+            error_output.error_type = diffusion_error_type_from_exception(e)
             self._output_queue.put_nowait(error_output)
         finally:
             self._tasks.pop(request_id, None)
diff --git a/vllm_omni/diffusion/stage_diffusion_client.py b/vllm_omni/diffusion/stage_diffusion_client.py
index 11809f6ef03..c06b8e43ba3 100644
--- a/vllm_omni/diffusion/stage_diffusion_client.py
+++ b/vllm_omni/diffusion/stage_diffusion_client.py
@@ -219,6 +219,7 @@ def _drain_responses(self) -> None:
                 req_id = msg.get("request_id")
                 rpc_id = msg.get("rpc_id")
                 error_msg = msg.get("error")
+                error_type = msg.get("error_type")
                 logger.error(
                     "[StageDiffusionClient] stage-%s [rep-%s] subprocess error for %s: %s",
                     self.stage_id,
@@ -235,7 +236,13 @@ def _drain_responses(self) -> None:
                 # Route request errors as error outputs so the Orchestrator
                 # sees the request complete (instead of hanging forever).
                 if req_id is not None:
-                    self._output_queue.put_nowait(OmniRequestOutput.from_error(req_id, error_msg))
+                    self._output_queue.put_nowait(
+                        OmniRequestOutput.from_error(
+                            req_id,
+                            error_msg,
+                            error_type=error_type,
+                        )
+                    )
 
     # Fields that are subprocess-local and cannot be serialized across
     # process boundaries.  They are recreated in the subprocess with
diff --git a/vllm_omni/diffusion/stage_diffusion_proc.py b/vllm_omni/diffusion/stage_diffusion_proc.py
index 2f3ad55943a..6f7420ba62b 100644
--- a/vllm_omni/diffusion/stage_diffusion_proc.py
+++ b/vllm_omni/diffusion/stage_diffusion_proc.py
@@ -23,7 +23,7 @@
 from vllm.utils.system_utils import get_mp_context
 from vllm.v1.utils import shutdown
 
-from vllm_omni.diffusion.data import DiffusionRequestAbortedError
+from vllm_omni.diffusion.data import DiffusionRequestAbortedError, diffusion_error_type_from_exception
 from vllm_omni.diffusion.diffusion_engine import DiffusionEngine
 from vllm_omni.diffusion.request import OmniDiffusionRequest
 from vllm_omni.distributed.omni_connectors.utils.serialization import (
@@ -339,6 +339,7 @@ async def _dispatch_request(
                             "type": "error",
                             "request_id": request_id,
                             "error": str(e),
+                            "error_type": diffusion_error_type_from_exception(e),
                         }
                     )
                 )
@@ -394,6 +395,7 @@ async def _dispatch_batch(
                                         "type": "error",
                                         "request_id": rid,
                                         "error": str(e),
+                                        "error_type": diffusion_error_type_from_exception(e),
                                     }
                                 )
                             )
diff --git a/vllm_omni/diffusion/worker/diffusion_worker.py b/vllm_omni/diffusion/worker/diffusion_worker.py
index 3909fd2d490..e1f9862576d 100644
--- a/vllm_omni/diffusion/worker/diffusion_worker.py
+++ b/vllm_omni/diffusion/worker/diffusion_worker.py
@@ -33,6 +33,7 @@
     OmniDiffusionConfig,
     OmniSleepTask,
     OmniWakeTask,
+    diffusion_error_type_from_exception,
 )
 from vllm_omni.diffusion.distributed.parallel_state import (
     destroy_distributed_env,
@@ -752,7 +753,12 @@ def worker_busy_loop(self) -> None:
                 except Exception as e:
                     logger.error(f"Error processing RPC: {e}", exc_info=True)
                     if self.result_mq is not None:
-                        self.return_result(DiffusionOutput(error=str(e)))
+                        self.return_result(
+                            DiffusionOutput(
+                                error=str(e),
+                                error_type=diffusion_error_type_from_exception(e),
+                            )
+                        )
 
             elif isinstance(msg, dict) and msg.get("type") == "shutdown":
                 logger.info("Worker %s: Received shutdown message", self.gpu_id)
@@ -768,7 +774,10 @@ def worker_busy_loop(self) -> None:
                         f"Error executing forward in event loop: {e}",
                         exc_info=True,
                     )
-                    output = DiffusionOutput(error=str(e))
+                    output = DiffusionOutput(
+                        error=str(e),
+                        error_type=diffusion_error_type_from_exception(e),
+                    )
 
                 try:
                     self.return_result(output)
diff --git a/vllm_omni/engine/messages.py b/vllm_omni/engine/messages.py
index 28a5c721cb1..4a5743cac2a 100644
--- a/vllm_omni/engine/messages.py
+++ b/vllm_omni/engine/messages.py
@@ -59,6 +59,7 @@ class ShutdownRequestMessage(EngineQueueMessage, kw_only=True):
 class ErrorMessage(EngineQueueMessage, kw_only=True):
     type: Literal["error"] = "error"
     error: str
+    error_type: str | None = None
     fatal: bool = False
     request_id: str | None = None
     stage_id: int | None = None
diff --git a/vllm_omni/engine/orchestrator.py b/vllm_omni/engine/orchestrator.py
index 4bc032ae20b..5931ff37608 100644
--- a/vllm_omni/engine/orchestrator.py
+++ b/vllm_omni/engine/orchestrator.py
@@ -559,6 +559,7 @@ async def _handle_stage_error(self, stage_id: int, output: Any) -> None:
                 request_id=parent_id,
                 stage_id=stage_id,
                 error=output.error,
+                error_type=getattr(output, "error_type", None),
             )
         )
         await self._cleanup_request_ids(
diff --git a/vllm_omni/entrypoints/async_omni.py b/vllm_omni/entrypoints/async_omni.py
index c22e780962a..57510540ca9 100644
--- a/vllm_omni/entrypoints/async_omni.py
+++ b/vllm_omni/entrypoints/async_omni.py
@@ -519,7 +519,7 @@ async def _process_orchestrator_results(
                         result.error,
                         error_stage_id=result.stage_id,
                     )
-                raise RuntimeError(result.error)
+                self._raise_nonfatal_error_message(result)
 
             if not isinstance(result, OutputMessage):
                 logger.warning("[AsyncOmni] Dropping unexpected per-request message %r", result)
@@ -590,6 +590,15 @@ async def _final_output_loop():
                         await self.event_resolver.resolve(msg)
                         continue
 
+                    if isinstance(msg, ErrorMessage) and msg.request_id is not None:
+                        req_state = self.request_states.get(msg.request_id)
+                        if req_state is None:
+                            logger.debug("[AsyncOmni] Dropping error for unknown req %s", msg.request_id)
+                            continue
+                        req_state.stage_id = msg.stage_id
+                        await req_state.queue.put(msg)
+                        continue
+
                     should_continue, _, stage_id, req_state = self._handle_output_message(msg)
                     if should_continue:
                         continue
diff --git a/vllm_omni/entrypoints/omni_base.py b/vllm_omni/entrypoints/omni_base.py
index 78f2b798517..19b20614560 100644
--- a/vllm_omni/entrypoints/omni_base.py
+++ b/vllm_omni/entrypoints/omni_base.py
@@ -13,6 +13,7 @@
 from vllm.logger import init_logger
 from vllm.v1.engine.exceptions import EngineDeadError, EngineGenerateError
 
+from vllm_omni.diffusion.data import DiffusionErrorType, GuardrailViolationError
 from vllm_omni.engine.async_omni_engine import AsyncOmniEngine
 from vllm_omni.engine.messages import (
     EngineQueueMessage,
@@ -324,7 +325,7 @@ def _handle_output_message(
                     msg.error,
                     error_stage_id=msg.stage_id,
                 )
-            raise RuntimeError(msg.error)
+            self._raise_nonfatal_error_message(msg)
 
         if not isinstance(msg, OutputMessage):
             logger.warning("[%s] got unexpected msg type: %s", self.__class__.__name__, msg.type)
@@ -346,6 +347,15 @@ def _handle_output_message(
 
         return False, req_id, stage_id, req_state
 
+    @staticmethod
+    def _is_guardrail_violation(error_type: str | None) -> bool:
+        return error_type == DiffusionErrorType.GUARDRAIL_VIOLATION
+
+    def _raise_nonfatal_error_message(self, msg: ErrorMessage) -> None:
+        if self._is_guardrail_violation(msg.error_type):
+            raise GuardrailViolationError(msg.error)
+        raise RuntimeError(msg.error)
+
     def _check_engine_output_error(
         self,
         result: OutputMessage,
@@ -355,13 +365,15 @@ def _check_engine_output_error(
         """Raise if ``engine_outputs`` carries an error field.
 
         Raises :class:`EngineDeadError` when ``self.errored`` indicates the
-        engine is unrecoverable, otherwise raises :class:`EngineGenerateError`
-        (recoverable, single-request failure).
+        engine is unrecoverable. For recoverable, single-request failures,
+        raises :class:`GuardrailViolationError` when the error metadata marks
+        a guardrail block, otherwise :class:`EngineGenerateError`.
         """
         engine_outputs = result.engine_outputs
         error_text = getattr(engine_outputs, "error", None)
         if error_text is None:
             return
+        error_type = getattr(engine_outputs, "error_type", None)
         logger.error(
             "[%s] Stage error for req=%s stage-%s: %s",
             self.__class__.__name__,
@@ -375,6 +387,8 @@ def _check_engine_output_error(
                 error_text,
                 error_stage_id=stage_id,
             )
+        if self._is_guardrail_violation(error_type):
+            raise GuardrailViolationError(error_text)
         raise EngineGenerateError(error_text)
 
     def _process_single_result(
diff --git a/vllm_omni/outputs.py b/vllm_omni/outputs.py
index b4d308ebf8f..bdf7b973ab8 100644
--- a/vllm_omni/outputs.py
+++ b/vllm_omni/outputs.py
@@ -102,12 +102,15 @@ class OmniRequestOutput:
 
     # error handling
     error: str | None = None
+    error_type: str | None = None
 
     @classmethod
     def from_error(
         cls,
         request_id: str,
         error_message: str,
+        *,
+        error_type: str | None = None,
     ) -> "OmniRequestOutput":
         """Create a terminal error output.
 
@@ -122,6 +125,7 @@ def from_error(
             request_id=request_id,
             finished=True,
             error=error_message,
+            error_type=error_type,
         )
 
     @classmethod

From 144ffdfc8596a17a114b9a08037808c90c235ff8 Mon Sep 17 00:00:00 2001
From: Maciej Bala <mbala@nvidia.com>
Date: Mon, 18 May 2026 14:34:56 +0200
Subject: [PATCH 12/41] Simplify sound tokenizer and bring parity with
 diffusers

Signed-off-by: Maciej Bala <mbala@nvidia.com>
---
 .../cosmos3/test_cosmos3_sound_tokenizer.py   |  84 ++-
 .../cosmos3/audio_tokenizer/activations.py    | 145 -----
 .../alias_free_torch/__init__.py              |  16 -
 .../audio_tokenizer/alias_free_torch/act.py   |  32 -
 .../alias_free_torch/filter.py                |  95 ---
 .../alias_free_torch/resample.py              |  48 --
 .../models/cosmos3/audio_tokenizer/avae.py    | 347 +++++-----
 .../cosmos3/audio_tokenizer/bottlenecks.py    | 132 ----
 .../models/cosmos3/audio_tokenizer/config.py  |  20 -
 .../models/cosmos3/audio_tokenizer/models.py  | 614 ------------------
 .../models/cosmos3/audio_tokenizer/modules.py | 391 -----------
 .../audio_tokenizer/modules_encodec.py        | 297 ---------
 .../models/cosmos3/sound_tokenizer.py         |   4 +-
 .../models/cosmos3/transformer_cosmos3.py     |  11 +-
 14 files changed, 274 insertions(+), 1962 deletions(-)
 delete mode 100755 vllm_omni/diffusion/models/cosmos3/audio_tokenizer/activations.py
 delete mode 100755 vllm_omni/diffusion/models/cosmos3/audio_tokenizer/alias_free_torch/__init__.py
 delete mode 100755 vllm_omni/diffusion/models/cosmos3/audio_tokenizer/alias_free_torch/act.py
 delete mode 100755 vllm_omni/diffusion/models/cosmos3/audio_tokenizer/alias_free_torch/filter.py
 delete mode 100755 vllm_omni/diffusion/models/cosmos3/audio_tokenizer/alias_free_torch/resample.py
 delete mode 100755 vllm_omni/diffusion/models/cosmos3/audio_tokenizer/bottlenecks.py
 delete mode 100644 vllm_omni/diffusion/models/cosmos3/audio_tokenizer/config.py
 delete mode 100755 vllm_omni/diffusion/models/cosmos3/audio_tokenizer/models.py
 delete mode 100755 vllm_omni/diffusion/models/cosmos3/audio_tokenizer/modules.py
 delete mode 100755 vllm_omni/diffusion/models/cosmos3/audio_tokenizer/modules_encodec.py

diff --git a/tests/diffusion/models/cosmos3/test_cosmos3_sound_tokenizer.py b/tests/diffusion/models/cosmos3/test_cosmos3_sound_tokenizer.py
index cb73eb0d818..0e8e5034d85 100644
--- a/tests/diffusion/models/cosmos3/test_cosmos3_sound_tokenizer.py
+++ b/tests/diffusion/models/cosmos3/test_cosmos3_sound_tokenizer.py
@@ -3,6 +3,7 @@
 
 from __future__ import annotations
 
+import json
 from types import SimpleNamespace
 
 import pytest
@@ -10,6 +11,8 @@
 
 pytestmark = [pytest.mark.core_model, pytest.mark.cpu, pytest.mark.diffusion]
 
+DIFFUSERS_SOUND_TOKENIZER_CHECKPOINT_NAME = "diffusion_pytorch_model.safetensors"
+
 
 class _FakeAVAEAudioTokenizer:
     def __init__(self, **kwargs) -> None:
@@ -38,7 +41,7 @@ def test_from_config_loads_default_sound_tokenizer_component(
     model_dir = tmp_path / "model"
     tokenizer_dir = model_dir / "sound_tokenizer"
     tokenizer_dir.mkdir(parents=True)
-    checkpoint_path = tokenizer_dir / "model.safetensors"
+    checkpoint_path = tokenizer_dir / DIFFUSERS_SOUND_TOKENIZER_CHECKPOINT_NAME
     config_path = tokenizer_dir / "config.json"
     checkpoint_path.write_bytes(b"stub")
     config_path.write_text("{}", encoding="utf-8")
@@ -83,7 +86,7 @@ def test_from_config_downloads_default_sound_tokenizer_from_hf_repo(
     cache_dir = tmp_path / "hf"
     tokenizer_dir = cache_dir / "sound_tokenizer"
     tokenizer_dir.mkdir(parents=True)
-    checkpoint_path = tokenizer_dir / "model.safetensors"
+    checkpoint_path = tokenizer_dir / DIFFUSERS_SOUND_TOKENIZER_CHECKPOINT_NAME
     config_path = tokenizer_dir / "config.json"
     checkpoint_path.write_bytes(b"stub")
     config_path.write_text("{}", encoding="utf-8")
@@ -131,12 +134,12 @@ def __init__(self, **kwargs) -> None:
         (
             "nvidia/cosmos3",
             "test-rev",
-            ["sound_tokenizer/config.json", "sound_tokenizer/model.safetensors"],
+            ["sound_tokenizer/config.json", f"sound_tokenizer/{DIFFUSERS_SOUND_TOKENIZER_CHECKPOINT_NAME}"],
         )
     ]
 
 
-def test_from_config_uses_fixed_sound_tokenizer_checkpoint_name(
+def test_from_config_uses_diffusers_sound_tokenizer_checkpoint_name(
     tmp_path,
     monkeypatch: pytest.MonkeyPatch,
 ) -> None:
@@ -145,7 +148,7 @@ def test_from_config_uses_fixed_sound_tokenizer_checkpoint_name(
     model_dir = tmp_path / "model"
     tokenizer_dir = model_dir / "sound_tokenizer"
     tokenizer_dir.mkdir(parents=True)
-    checkpoint_path = tokenizer_dir / "model.safetensors"
+    checkpoint_path = tokenizer_dir / DIFFUSERS_SOUND_TOKENIZER_CHECKPOINT_NAME
     checkpoint_path.write_bytes(b"stub")
     (tokenizer_dir / "config.json").write_text("{}", encoding="utf-8")
 
@@ -178,6 +181,20 @@ def test_default_component_requires_sound_tokenizer_checkpoint(tmp_path) -> None
         )
 
 
+def test_default_component_rejects_legacy_sound_tokenizer_checkpoint_name(tmp_path) -> None:
+    from vllm_omni.diffusion.models.cosmos3 import sound_tokenizer
+
+    model_dir = tmp_path / "model"
+    tokenizer_dir = model_dir / "sound_tokenizer"
+    tokenizer_dir.mkdir(parents=True)
+    (tokenizer_dir / "model.safetensors").write_bytes(b"stub")
+
+    with pytest.raises(ValueError, match=DIFFUSERS_SOUND_TOKENIZER_CHECKPOINT_NAME):
+        sound_tokenizer.Cosmos3SoundTokenizer.from_config(
+            SimpleNamespace(model=str(model_dir), custom_pipeline_args={}, dtype=torch.float32)
+        )
+
+
 def test_from_config_uses_nested_normalization_config(
     tmp_path,
     monkeypatch: pytest.MonkeyPatch,
@@ -187,7 +204,7 @@ def test_from_config_uses_nested_normalization_config(
     model_dir = tmp_path / "model"
     tokenizer_dir = model_dir / "sound_tokenizer"
     tokenizer_dir.mkdir(parents=True)
-    (tokenizer_dir / "model.safetensors").write_bytes(b"stub")
+    (tokenizer_dir / DIFFUSERS_SOUND_TOKENIZER_CHECKPOINT_NAME).write_bytes(b"stub")
     (tokenizer_dir / "config.json").write_text("{}", encoding="utf-8")
 
     created = {}
@@ -227,7 +244,7 @@ def test_from_config_custom_normalization_overrides_nested_config(
     model_dir = tmp_path / "model"
     tokenizer_dir = model_dir / "sound_tokenizer"
     tokenizer_dir.mkdir(parents=True)
-    (tokenizer_dir / "model.safetensors").write_bytes(b"stub")
+    (tokenizer_dir / DIFFUSERS_SOUND_TOKENIZER_CHECKPOINT_NAME).write_bytes(b"stub")
     (tokenizer_dir / "config.json").write_text("{}", encoding="utf-8")
 
     created = {}
@@ -273,7 +290,7 @@ def test_from_config_uses_component_config_architecture_values(
     model_dir = tmp_path / "model"
     tokenizer_dir = model_dir / "sound_tokenizer"
     tokenizer_dir.mkdir(parents=True)
-    (tokenizer_dir / "model.safetensors").write_bytes(b"stub")
+    (tokenizer_dir / DIFFUSERS_SOUND_TOKENIZER_CHECKPOINT_NAME).write_bytes(b"stub")
     (tokenizer_dir / "config.json").write_text(
         ('{"sampling_rate": 48000, "dec_out_channels": 2, "vocoder_input_dim": 64, "hop_size": 1920}'),
         encoding="utf-8",
@@ -323,7 +340,7 @@ def test_from_config_rejects_custom_architecture_conflict_with_component_config(
     model_dir = tmp_path / "model"
     tokenizer_dir = model_dir / "sound_tokenizer"
     tokenizer_dir.mkdir(parents=True)
-    (tokenizer_dir / "model.safetensors").write_bytes(b"stub")
+    (tokenizer_dir / DIFFUSERS_SOUND_TOKENIZER_CHECKPOINT_NAME).write_bytes(b"stub")
     (tokenizer_dir / "config.json").write_text(
         ('{"sampling_rate": 48000, "dec_out_channels": 2, "vocoder_input_dim": 64, "hop_size": 1920}'),
         encoding="utf-8",
@@ -343,3 +360,52 @@ class FakeAVAE(_FakeAVAEAudioTokenizer):
                 dtype=torch.float32,
             )
         )
+
+
+def test_avae_uses_diffusers_decoder_state_dict_layout(tmp_path) -> None:
+    from safetensors.torch import save_file
+
+    from vllm_omni.diffusion.models.cosmos3.audio_tokenizer import avae
+
+    config = {
+        "sampling_rate": 8000,
+        "hop_size": 2,
+        "dec_dim": 4,
+        "dec_c_mults": [1],
+        "dec_strides": [2],
+        "dec_out_channels": 1,
+        "vocoder_input_dim": 2,
+        "normalization_type": "none",
+    }
+    checkpoint_path = tmp_path / DIFFUSERS_SOUND_TOKENIZER_CHECKPOINT_NAME
+    config_path = tmp_path / "config.json"
+    config_path.write_text(json.dumps(config), encoding="utf-8")
+
+    decoder = avae.OobleckDecoder(
+        channels=4,
+        input_channels=2,
+        audio_channels=1,
+        upsampling_ratios=[2],
+        channel_multiples=[1],
+    )
+    save_file({f"decoder.{key}": value for key, value in decoder.state_dict().items()}, str(checkpoint_path))
+
+    tokenizer = avae.Cosmos3AVAEAudioTokenizer(
+        checkpoint_path=checkpoint_path,
+        config_path=config_path,
+        dtype=torch.float32,
+        device="cpu",
+    )
+
+    keys = set(tokenizer.state_dict())
+    assert "decoder.conv1.weight_g" in keys
+    assert "decoder.block.0.snake1.alpha" in keys
+    assert "decoder.block.0.conv_t1.weight_g" in keys
+    assert "decoder.block.0.res_unit1.conv1.weight_g" in keys
+    assert "decoder.snake1.alpha" in keys
+    assert "decoder.conv2.weight_g" in keys
+    assert not any(key.startswith("decoder.layers.") for key in keys)
+    assert not any(key.startswith("model.decoder.") for key in keys)
+    assert tokenizer.decode(torch.zeros(1, 2, 3)).shape == (1, 1, 6)
+    with pytest.raises(NotImplementedError, match="decoder-only"):
+        tokenizer.encode(torch.zeros(1, 1, 6))
diff --git a/vllm_omni/diffusion/models/cosmos3/audio_tokenizer/activations.py b/vllm_omni/diffusion/models/cosmos3/audio_tokenizer/activations.py
deleted file mode 100755
index 0c3daaa4ac5..00000000000
--- a/vllm_omni/diffusion/models/cosmos3/audio_tokenizer/activations.py
+++ /dev/null
@@ -1,145 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-# Implementation adapted from https://github.com/EdwardDixon/snake under the MIT license.
-
-
-import torch
-from torch import nn, pow, sin
-from torch.nn import Parameter
-
-
-@torch.jit.script
-def fused_add_tanh_sigmoid_multiply(
-    input_a: torch.Tensor, input_b: torch.Tensor, n_channels: list[int]
-) -> torch.Tensor:
-    n_channels_int = n_channels[0]
-    in_act = input_a + input_b  # [B,2*C,T]
-    t_act = torch.tanh(in_act[:, :n_channels_int, :])  # [B,C,T]
-    s_act = torch.sigmoid(in_act[:, n_channels_int:, :])  # [B,C,T]
-    acts = t_act * s_act  # [B,C,T]
-    return acts  # [B,C,T]
-
-
-@torch.jit.script
-def fused_snake(x: torch.Tensor, alpha: torch.Tensor, beta: torch.Tensor) -> torch.Tensor:
-    return x + (1.0 / (beta + 1e-9)) * pow(sin(x * alpha), 2)
-
-
-class Snake(nn.Module):
-    """
-    Implementation of a sine-based periodic activation function
-    Shape:
-        - Input: (B, C, T)
-        - Output: (B, C, T), same shape as the input
-    Parameters:
-        - alpha - trainable parameter
-    References:
-        - This activation function is from this paper by Liu Ziyin, Tilman Hartwig, Masahito Ueda:
-        https://arxiv.org/abs/2006.08195
-    Examples:
-        >>> a1 = snake(256)
-        >>> x = torch.randn(256)
-        >>> x = a1(x)
-    """
-
-    def __init__(
-        self, in_features: int, alpha: float = 1.0, alpha_trainable: bool = True, alpha_logscale: bool = True
-    ) -> None:
-        """
-        Initialization.
-        INPUT:
-            - in_features: shape of the input
-            - alpha: trainable parameter
-            alpha is initialized to 1 by default, higher values = higher-frequency.
-            alpha will be trained along with the rest of your model.
-        """
-        super().__init__()
-        self.in_features = in_features
-
-        # initialize alpha
-        self.alpha_logscale = alpha_logscale
-        if self.alpha_logscale:  # log scale alphas initialized to zeros
-            self.alpha = Parameter(torch.zeros(in_features) * alpha)
-        else:  # linear scale alphas initialized to ones
-            self.alpha = Parameter(torch.ones(in_features) * alpha)
-
-        self.alpha.requires_grad = alpha_trainable
-
-        self.no_div_by_zero = 0.000000001
-
-    def forward(self: "Snake", x: torch.Tensor) -> torch.Tensor:
-        """
-        Forward pass of the function.
-        Applies the function to the input elementwise.
-        Snake := x + 1/a * sin^2 (xa)
-        """
-        alpha = self.alpha.unsqueeze(0).unsqueeze(-1)  # [1,C,1]
-        if self.alpha_logscale:
-            alpha = torch.exp(alpha)  # [1,C,1]
-
-        return fused_snake(x, alpha, alpha)  # [B,C,T]
-        # x = x + (1.0 / (alpha + self.no_div_by_zero)) * pow(sin(x * alpha), 2)
-        # return x
-
-
-class SnakeBeta(nn.Module):
-    """
-    A modified Snake function which uses separate parameters for the magnitude of the periodic components
-    Shape:
-        - Input: (B, C, T)
-        - Output: (B, C, T), same shape as the input
-    Parameters:
-        - alpha - trainable parameter that controls frequency
-        - beta - trainable parameter that controls magnitude
-    References:
-        - Modified from the paper by Liu Ziyin, Tilman Hartwig, and Masahito Ueda:
-        https://arxiv.org/abs/2006.08195
-    Examples:
-        >>> a1 = snakebeta(256)
-        >>> x = torch.randn(256)
-        >>> x = a1(x)
-    """
-
-    def __init__(
-        self, in_features: int, alpha: float = 1.0, alpha_trainable: bool = True, alpha_logscale: bool = True
-    ) -> None:
-        """
-        Initialization.
-        INPUT:
-            - in_features: shape of the input
-            - alpha - trainable parameter that controls frequency
-            - beta - trainable parameter that controls magnitude
-            alpha is initialized to 1 by default, higher values = higher-frequency.
-            beta is initialized to 1 by default, higher values = higher-magnitude.
-            alpha will be trained along with the rest of your model.
-        """
-        super().__init__()
-        self.in_features = in_features
-
-        # initialize alpha
-        self.alpha_logscale = alpha_logscale
-        if self.alpha_logscale:  # log scale alphas initialized to zeros
-            self.alpha = Parameter(torch.zeros(in_features) * alpha)
-            self.beta = Parameter(torch.zeros(in_features) * alpha)
-        else:  # linear scale alphas initialized to ones
-            self.alpha = Parameter(torch.ones(in_features) * alpha)
-            self.beta = Parameter(torch.ones(in_features) * alpha)
-
-        self.alpha.requires_grad = alpha_trainable
-        self.beta.requires_grad = alpha_trainable
-
-        self.no_div_by_zero = 0.000000001
-
-    def forward(self: "SnakeBeta", x: torch.Tensor) -> torch.Tensor:
-        """
-        Forward pass of the function.
-        Applies the function to the input elementwise.
-        SnakeBeta := x + 1/b * sin^2 (xa)
-        """
-        alpha = self.alpha.unsqueeze(0).unsqueeze(-1)  # [1,C,1]
-        beta = self.beta.unsqueeze(0).unsqueeze(-1)  # [1,C,1]
-        if self.alpha_logscale:
-            alpha = torch.exp(alpha)  # [1,C,1]
-            beta = torch.exp(beta)  # [1,C,1]
-
-        return fused_snake(x, alpha, beta)  # [B,C,T]
diff --git a/vllm_omni/diffusion/models/cosmos3/audio_tokenizer/alias_free_torch/__init__.py b/vllm_omni/diffusion/models/cosmos3/audio_tokenizer/alias_free_torch/__init__.py
deleted file mode 100755
index 28f76f7d706..00000000000
--- a/vllm_omni/diffusion/models/cosmos3/audio_tokenizer/alias_free_torch/__init__.py
+++ /dev/null
@@ -1,16 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-# Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0
-
-from .act import Activation1d
-from .filter import LowPassFilter1d, kaiser_sinc_filter1d, sinc
-from .resample import DownSample1d, UpSample1d
-
-__all__ = [
-    "Activation1d",
-    "LowPassFilter1d",
-    "kaiser_sinc_filter1d",
-    "sinc",
-    "DownSample1d",
-    "UpSample1d",
-]
diff --git a/vllm_omni/diffusion/models/cosmos3/audio_tokenizer/alias_free_torch/act.py b/vllm_omni/diffusion/models/cosmos3/audio_tokenizer/alias_free_torch/act.py
deleted file mode 100755
index 0825c181fa5..00000000000
--- a/vllm_omni/diffusion/models/cosmos3/audio_tokenizer/alias_free_torch/act.py
+++ /dev/null
@@ -1,32 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-# Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0
-
-import torch.nn as nn
-
-from .resample import DownSample1d, UpSample1d
-
-
-class Activation1d(nn.Module):
-    def __init__(
-        self,
-        activation: nn.Module,
-        up_ratio: int = 2,
-        down_ratio: int = 2,
-        up_kernel_size: int = 12,
-        down_kernel_size: int = 12,
-    ):
-        super().__init__()
-        self.up_ratio = up_ratio
-        self.down_ratio = down_ratio
-        self.act = activation
-        self.upsample = UpSample1d(up_ratio, up_kernel_size)
-        self.downsample = DownSample1d(down_ratio, down_kernel_size)
-
-    # x: [B,C,T]
-    def forward(self, x):
-        x = self.upsample(x)
-        x = self.act(x)
-        x = self.downsample(x)
-
-        return x
diff --git a/vllm_omni/diffusion/models/cosmos3/audio_tokenizer/alias_free_torch/filter.py b/vllm_omni/diffusion/models/cosmos3/audio_tokenizer/alias_free_torch/filter.py
deleted file mode 100755
index 56a45011ed9..00000000000
--- a/vllm_omni/diffusion/models/cosmos3/audio_tokenizer/alias_free_torch/filter.py
+++ /dev/null
@@ -1,95 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-# Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0
-
-import math
-
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-
-if "sinc" in dir(torch):
-    sinc = torch.sinc
-else:
-    # This code is adopted from adefossez's julius.core.sinc under the MIT License
-    # https://adefossez.github.io/julius/julius/core.html
-    def sinc(x: torch.Tensor):
-        """
-        Implementation of sinc, i.e. sin(pi * x) / (pi * x)
-        __Warning__: Different to julius.sinc, the input is multiplied by `pi`!
-        """
-        return torch.where(
-            x == 0, torch.tensor(1.0, device=x.device, dtype=x.dtype), torch.sin(math.pi * x) / math.pi / x
-        )
-
-
-# This code is adopted from adefossez's julius.lowpass.LowPassFilters under the MIT License
-# https://adefossez.github.io/julius/julius/lowpass.html
-def kaiser_sinc_filter1d(cutoff, half_width, kernel_size):  # return filter [1,1,kernel_size]
-    even = kernel_size % 2 == 0
-    half_size = kernel_size // 2
-
-    # For kaiser window
-    delta_f = 4 * half_width
-    A = 2.285 * (half_size - 1) * math.pi * delta_f + 7.95
-    if A > 50.0:
-        beta = 0.1102 * (A - 8.7)
-    elif A >= 21.0:
-        beta = 0.5842 * (A - 21) ** 0.4 + 0.07886 * (A - 21.0)
-    else:
-        beta = 0.0
-    window = torch.kaiser_window(kernel_size, beta=beta, periodic=False)  # [kernel_size]
-
-    # ratio = 0.5/cutoff -> 2 * cutoff = 1 / ratio
-    if even:
-        time = torch.arange(-half_size, half_size) + 0.5  # [kernel_size]
-    else:
-        time = torch.arange(kernel_size) - half_size  # [kernel_size]
-    if cutoff == 0:
-        filter_ = torch.zeros_like(time)  # [kernel_size]
-    else:
-        filter_ = 2 * cutoff * window * sinc(2 * cutoff * time)  # [kernel_size]
-        # Normalize filter to have sum = 1, otherwise we will have a small leakage
-        # of the constant component in the input signal.
-        filter_ /= filter_.sum()
-    filter = filter_.view(1, 1, kernel_size)  # [1,1,kernel_size]
-
-    return filter  # [1,1,kernel_size]
-
-
-class LowPassFilter1d(nn.Module):
-    def __init__(
-        self,
-        cutoff=0.5,
-        half_width=0.6,
-        stride: int = 1,
-        padding: bool = True,
-        padding_mode: str = "replicate",
-        kernel_size: int = 12,
-    ):
-        # kernel_size should be even number for stylegan3 setup,
-        # in this implementation, odd number is also possible.
-        super().__init__()
-        if cutoff < -0.0:
-            raise ValueError("Minimum cutoff must be larger than zero.")
-        if cutoff > 0.5:
-            raise ValueError("A cutoff above 0.5 does not make sense.")
-        self.kernel_size = kernel_size
-        self.even = kernel_size % 2 == 0
-        self.pad_left = kernel_size // 2 - int(self.even)
-        self.pad_right = kernel_size // 2
-        self.stride = stride
-        self.padding = padding
-        self.padding_mode = padding_mode
-        filter = kaiser_sinc_filter1d(cutoff, half_width, kernel_size)
-        self.register_buffer("filter", filter)
-
-    # input [B,C,T]
-    def forward(self, x):  # x: [B,C,T]
-        _, C, _ = x.shape
-
-        if self.padding:
-            x = F.pad(x, (self.pad_left, self.pad_right), mode=self.padding_mode)  # [B,C,T+pad_left+pad_right]
-        out = F.conv1d(x, self.filter.expand(C, -1, -1), stride=self.stride, groups=C)  # [B,C,T//stride]
-
-        return out  # [B,C,T//stride]
diff --git a/vllm_omni/diffusion/models/cosmos3/audio_tokenizer/alias_free_torch/resample.py b/vllm_omni/diffusion/models/cosmos3/audio_tokenizer/alias_free_torch/resample.py
deleted file mode 100755
index 30e9663fe18..00000000000
--- a/vllm_omni/diffusion/models/cosmos3/audio_tokenizer/alias_free_torch/resample.py
+++ /dev/null
@@ -1,48 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-# Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0
-
-import torch.nn as nn
-from torch.nn import functional as F
-
-from .filter import LowPassFilter1d, kaiser_sinc_filter1d
-
-
-class UpSample1d(nn.Module):
-    def __init__(self, ratio=2, kernel_size=None):
-        super().__init__()
-        self.ratio = ratio
-        self.kernel_size = int(6 * ratio // 2) * 2 if kernel_size is None else kernel_size
-        self.stride = ratio
-        self.pad = self.kernel_size // ratio - 1
-        self.pad_left = self.pad * self.stride + (self.kernel_size - self.stride) // 2
-        self.pad_right = self.pad * self.stride + (self.kernel_size - self.stride + 1) // 2
-        filter = kaiser_sinc_filter1d(cutoff=0.5 / ratio, half_width=0.6 / ratio, kernel_size=self.kernel_size)
-        self.register_buffer("filter", filter)
-
-    # x: [B,C,T]
-    def forward(self, x):  # x: [B,C,T]
-        _, C, _ = x.shape
-
-        x = F.pad(x, (self.pad, self.pad), mode="replicate")  # [B,C,T+2*pad]
-        x = self.ratio * F.conv_transpose1d(
-            x, self.filter.expand(C, -1, -1), stride=self.stride, groups=C
-        )  # [B,C,T*ratio+pad_left+pad_right]
-        x = x[..., self.pad_left : -self.pad_right]  # [B,C,T*ratio]
-
-        return x  # [B,C,T*ratio]
-
-
-class DownSample1d(nn.Module):
-    def __init__(self, ratio=2, kernel_size=None):
-        super().__init__()
-        self.ratio = ratio
-        self.kernel_size = int(6 * ratio // 2) * 2 if kernel_size is None else kernel_size
-        self.lowpass = LowPassFilter1d(
-            cutoff=0.5 / ratio, half_width=0.6 / ratio, stride=ratio, kernel_size=self.kernel_size
-        )
-
-    def forward(self, x):  # x: [B,C,T]
-        xx = self.lowpass(x)  # [B,C,T//ratio]
-
-        return xx  # [B,C,T//ratio]
diff --git a/vllm_omni/diffusion/models/cosmos3/audio_tokenizer/avae.py b/vllm_omni/diffusion/models/cosmos3/audio_tokenizer/avae.py
index 2ee4ad2a3ef..7f04177c2d1 100644
--- a/vllm_omni/diffusion/models/cosmos3/audio_tokenizer/avae.py
+++ b/vllm_omni/diffusion/models/cosmos3/audio_tokenizer/avae.py
@@ -1,23 +1,21 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""Local AVAE audio tokenizer used by Cosmos3 sound generation."""
+"""Diffusers-format AVAE audio tokenizer used by Cosmos3 sound generation."""
 
 from __future__ import annotations
 
 import json
 import math
 from pathlib import Path
+from typing import Any
 
 import torch
-import torch.nn.functional as F
 from torch import nn
+from torch.nn.utils import weight_norm
 from vllm.logger import init_logger
 
 from vllm_omni.diffusion.models.progress_bar import _is_rank_zero
 
-from .config import AttrDict
-from .models import load_generator
-
 logger = init_logger(__name__)
 
 
@@ -27,49 +25,29 @@ def _default_avae_config(
     audio_channels: int,
     io_channels: int,
     hop_size: int,
-) -> AttrDict:
-    return AttrDict(
-        {
-            "model_type": "autoencoder_v2",
-            "sampling_rate": sample_rate,
-            "stereo": audio_channels == 2,
-            "use_wav_as_input": True,
-            "normalize_volume": True,
-            "hop_size": hop_size,
-            "input_channels": 1,
-            "enc_type": "spec_convnext",
-            "enc_dim": 192,
-            "enc_intermediate_dim": 768,
-            "enc_num_layers": 12,
-            "enc_num_blocks": 2,
-            "enc_n_fft": 64,
-            "enc_hop_length": 16,
-            "enc_latent_dim": 128,
-            "enc_c_mults": [1, 2, 4],
-            "enc_strides": [4, 5, 6],
-            "enc_identity_init": False,
-            "enc_use_snake": True,
-            "dec_type": "oobleck",
-            "dec_dim": 320,
-            "dec_c_mults": [1, 2, 4, 8, 16],
-            "dec_strides": [2, 4, 5, 6, 8],
-            "dec_use_snake": True,
-            "dec_final_tanh": False,
-            "dec_out_channels": audio_channels,
-            "dec_anti_aliasing": False,
-            "dec_use_nearest_upsample": False,
-            "dec_use_tanh_at_final": False,
-            "bottleneck_type": "vae",
-            "bottleneck": {"type": "vae"},
-            "activation": "snakebeta",
-            "snake_logscale": True,
-            "anti_aliasing": False,
-            "use_cuda_kernel": False,
-            "causal": False,
-            "padding_mode": "zeros",
-            "vocoder_input_dim": io_channels,
-        }
-    )
+) -> dict[str, Any]:
+    return {
+        "sampling_rate": sample_rate,
+        "hop_size": hop_size,
+        "dec_dim": 320,
+        "dec_c_mults": [1, 2, 4, 8, 16],
+        "dec_strides": [2, 4, 5, 6, 8],
+        "dec_out_channels": audio_channels,
+        "vocoder_input_dim": io_channels,
+        # normalization_type, normalize_latents and the tanh_* settings are
+        # intentionally absent: _config_get() prefers any non-None config value,
+        # so listing them here would silently override the arguments passed to
+        # Cosmos3AVAEAudioTokenizer.__init__ whenever no config file is given.
+        # A JSON config file can still set these keys explicitly.
+    }
+
+
+def _config_get(config: dict[str, Any], *keys: str, default: Any = None) -> Any:
+    for key in keys:
+        value = config.get(key)
+        if value is not None:
+            return value
+    return default
 
 
 def _load_config(
@@ -79,10 +57,13 @@ def _load_config(
     audio_channels: int,
     io_channels: int,
     hop_size: int,
-) -> AttrDict:
+) -> dict[str, Any]:
     if config_path:
         with open(config_path, encoding="utf-8") as f:
-            return AttrDict(json.load(f))
+            config = json.load(f)
+        if not isinstance(config, dict):
+            raise TypeError(f"Cosmos3 AVAE config must be a JSON object, got {type(config)!r}.")
+        return config
     return _default_avae_config(
         sample_rate=sample_rate,
         audio_channels=audio_channels,
@@ -103,48 +84,127 @@ def _load_checkpoint(path: str | Path, map_location: torch.device | str) -> dict
         checkpoint = torch.load(path, map_location=map_location)
 
     if not isinstance(checkpoint, dict):
-        raise TypeError(f"AVAE checkpoint must be a dict, got {type(checkpoint)!r}.")
+        raise TypeError(f"AVAE checkpoint must be a flat state dict, got {type(checkpoint)!r}.")
+    if not all(isinstance(value, torch.Tensor) for value in checkpoint.values()):
+        raise TypeError("AVAE checkpoint must be a flat tensor state dict.")
+    return checkpoint
 
-    for key in ("generator", "state_dict", "model"):
-        value = checkpoint.get(key)
-        if isinstance(value, dict):
-            checkpoint = value
-            break
 
-    if not all(isinstance(value, torch.Tensor) for value in checkpoint.values()):
-        tensor_items = {key: value for key, value in checkpoint.items() if isinstance(value, torch.Tensor)}
-        if not tensor_items:
-            raise RuntimeError(f"No tensor state dict found in AVAE checkpoint keys: {list(checkpoint.keys())[:16]}")
-        checkpoint = tensor_items
+def _validate_diffusers_state_dict(state_dict: dict[str, torch.Tensor]) -> None:
+    if not state_dict:
+        raise RuntimeError("AVAE checkpoint is empty.")
 
-    return checkpoint
+    if not any(key.startswith("decoder.") for key in state_dict):
+        raise RuntimeError("Cosmos3 AVAE checkpoint must contain diffusers-format decoder.* keys.")
+
+
+class Snake1d(nn.Module):
+    """One-dimensional Snake activation matching diffusers' Oobleck layout."""
+
+    def __init__(self, hidden_dim: int, logscale: bool = True) -> None:
+        super().__init__()
+        self.alpha = nn.Parameter(torch.zeros(1, hidden_dim, 1))
+        self.beta = nn.Parameter(torch.zeros(1, hidden_dim, 1))
+        self.logscale = logscale
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        shape = hidden_states.shape
+        alpha = torch.exp(self.alpha) if self.logscale else self.alpha
+        beta = torch.exp(self.beta) if self.logscale else self.beta
+        hidden_states = hidden_states.reshape(shape[0], shape[1], -1)
+        hidden_states = hidden_states + (beta + 1e-9).reciprocal() * torch.sin(alpha * hidden_states).pow(2)
+        return hidden_states.reshape(shape)
+
+
+class OobleckResidualUnit(nn.Module):
+    """Residual unit used by the diffusers Oobleck decoder."""
+
+    def __init__(self, dimension: int = 16, dilation: int = 1) -> None:
+        super().__init__()
+        pad = ((7 - 1) * dilation) // 2
+        self.snake1 = Snake1d(dimension)
+        self.conv1 = weight_norm(nn.Conv1d(dimension, dimension, kernel_size=7, dilation=dilation, padding=pad))
+        self.snake2 = Snake1d(dimension)
+        self.conv2 = weight_norm(nn.Conv1d(dimension, dimension, kernel_size=1))
+
+    def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
+        output_tensor = self.conv1(self.snake1(hidden_state))
+        output_tensor = self.conv2(self.snake2(output_tensor))
+        padding = (hidden_state.shape[-1] - output_tensor.shape[-1]) // 2
+        if padding > 0:
+            hidden_state = hidden_state[..., padding:-padding]
+        return hidden_state + output_tensor
+
+
+class OobleckDecoderBlock(nn.Module):
+    """Decoder block used by the diffusers Oobleck decoder."""
+
+    def __init__(self, input_dim: int, output_dim: int, stride: int = 1, output_padding: int = 0) -> None:
+        super().__init__()
+        self.snake1 = Snake1d(input_dim)
+        self.conv_t1 = weight_norm(
+            nn.ConvTranspose1d(
+                input_dim,
+                output_dim,
+                kernel_size=2 * stride,
+                stride=stride,
+                padding=math.ceil(stride / 2),
+                output_padding=output_padding,
+            )
+        )
+        self.res_unit1 = OobleckResidualUnit(output_dim, dilation=1)
+        self.res_unit2 = OobleckResidualUnit(output_dim, dilation=3)
+        self.res_unit3 = OobleckResidualUnit(output_dim, dilation=9)
+
+    def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
+        hidden_state = self.snake1(hidden_state)
+        hidden_state = self.conv_t1(hidden_state)
+        hidden_state = self.res_unit1(hidden_state)
+        hidden_state = self.res_unit2(hidden_state)
+        return self.res_unit3(hidden_state)
+
+
+class OobleckDecoder(nn.Module):
+    """Diffusers-compatible Oobleck decoder for Cosmos3 AVAE latents."""
+
+    def __init__(
+        self,
+        channels: int,
+        input_channels: int,
+        audio_channels: int,
+        upsampling_ratios: list[int],
+        channel_multiples: list[int],
+    ) -> None:
+        super().__init__()
+        strides = upsampling_ratios
+        channel_multiples = [1] + channel_multiples
 
+        self.conv1 = weight_norm(nn.Conv1d(input_channels, channels * channel_multiples[-1], kernel_size=7, padding=3))
 
-def _strip_prefixes(
-    state_dict: dict[str, torch.Tensor],
-    model_state: dict[str, torch.Tensor],
-) -> dict[str, torch.Tensor]:
-    prefixes = ("module.", "generator.", "model.")
-    normalized: dict[str, torch.Tensor] = {}
-    for key, value in state_dict.items():
-        candidates = [key]
-        current = key
-        changed = True
-        while changed:
-            changed = False
-            for prefix in prefixes:
-                if current.startswith(prefix):
-                    current = current[len(prefix) :]
-                    candidates.append(current)
-                    changed = True
-                    break
-        selected = next((candidate for candidate in candidates if candidate in model_state), candidates[-1])
-        normalized[selected] = value
-    return normalized
+        block = []
+        for stride_index, stride in enumerate(strides):
+            block.append(
+                OobleckDecoderBlock(
+                    input_dim=channels * channel_multiples[len(strides) - stride_index],
+                    output_dim=channels * channel_multiples[len(strides) - stride_index - 1],
+                    stride=stride,
+                    output_padding=stride % 2,
+                )
+            )
+        self.block = nn.ModuleList(block)
+        self.snake1 = Snake1d(channels)
+        self.conv2 = weight_norm(nn.Conv1d(channels, audio_channels, kernel_size=7, padding=3, bias=False))
+
+    def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
+        hidden_state = self.conv1(hidden_state)
+        for layer in self.block:
+            hidden_state = layer(hidden_state)
+        hidden_state = self.snake1(hidden_state)
+        return self.conv2(hidden_state)
 
 
 class Cosmos3AVAEAudioTokenizer(nn.Module):
-    """AVAE tokenizer/decoder for Cosmos3 audio latents."""
+    """Decoder-only AVAE tokenizer for Cosmos3 audio latents."""
 
     def __init__(
         self,
@@ -164,63 +224,63 @@ def __init__(
         device: torch.device | str = "cuda",
     ) -> None:
         super().__init__()
-        self.sample_rate = int(sample_rate)
-        self.audio_channels = int(audio_channels)
-        self.latent_ch = int(io_channels)
-        self.hop_size = int(hop_size)
         self.dtype = dtype
         self.device = torch.device(device)
-        self.normalize_volume = True
-
-        if normalization_type == "none" and normalize_latents:
-            normalization_type = "tanh"
-        self.normalization_type = normalization_type
-        self.tanh_input_scale = float(tanh_input_scale)
-        self.tanh_output_scale = float(tanh_output_scale)
-        self.tanh_clamp = float(tanh_clamp)
 
         config = _load_config(
             config_path,
-            sample_rate=self.sample_rate,
-            audio_channels=self.audio_channels,
-            io_channels=self.latent_ch,
-            hop_size=self.hop_size,
+            sample_rate=sample_rate,
+            audio_channels=audio_channels,
+            io_channels=io_channels,
+            hop_size=hop_size,
         )
-        self.sample_rate = int(config.sampling_rate)
+        self.sample_rate = int(_config_get(config, "sampling_rate", "sample_rate", default=sample_rate))
         self.audio_channels = int(
-            getattr(config, "dec_out_channels", 2 if bool(getattr(config, "stereo", True)) else 1)
+            _config_get(
+                config,
+                "dec_out_channels",
+                "audio_channels",
+                default=2 if bool(config.get("stereo", audio_channels == 2)) else 1,
+            )
+        )
+        self.latent_ch = int(_config_get(config, "vocoder_input_dim", "io_channels", "latent_ch", default=io_channels))
+        dec_strides = [int(stride) for stride in _config_get(config, "dec_strides", default=[2, 4, 5, 6, 8])]
+        self.hop_size = int(
+            _config_get(config, "hop_size", default=math.prod(dec_strides) if dec_strides else hop_size)
         )
-        self.latent_ch = int(config.vocoder_input_dim)
-        self.hop_size = int(config.hop_size)
-        dec_stride_product = math.prod(int(stride) for stride in config.dec_strides)
+        dec_stride_product = math.prod(dec_strides)
         if dec_stride_product != self.hop_size:
             raise ValueError(
                 "Cosmos3 AVAE config dec_strides product must equal hop_size "
                 f"for correct latent/audio duration math: product={dec_stride_product}, hop_size={self.hop_size}."
             )
-        self.model = load_generator(config.model_type, config, self.device)
-        state_dict = _strip_prefixes(
-            _load_checkpoint(checkpoint_path, self.device),
-            self.model.state_dict(),
+
+        normalization_type = str(_config_get(config, "normalization_type", default=normalization_type))
+        normalize_latents = bool(_config_get(config, "normalize_latents", default=normalize_latents))
+        if normalization_type == "none" and normalize_latents:
+            normalization_type = "tanh"
+        self.normalization_type = normalization_type
+        self.tanh_input_scale = float(_config_get(config, "tanh_input_scale", default=tanh_input_scale))
+        self.tanh_output_scale = float(_config_get(config, "tanh_output_scale", default=tanh_output_scale))
+        self.tanh_clamp = float(_config_get(config, "tanh_clamp", default=tanh_clamp))
+
+        self.decoder = OobleckDecoder(
+            channels=int(_config_get(config, "dec_dim", default=320)),
+            input_channels=self.latent_ch,
+            audio_channels=self.audio_channels,
+            upsampling_ratios=list(reversed(dec_strides)),
+            channel_multiples=list(_config_get(config, "dec_c_mults", default=[1, 2, 4, 8, 16])),
         )
-        matching_keys = set(state_dict).intersection(self.model.state_dict())
-        if not matching_keys:
-            raise RuntimeError("AVAE checkpoint did not contain any keys matching the local AVAE model.")
-        missing, unexpected = self.model.load_state_dict(state_dict, strict=False)
-        if _is_rank_zero():
-            logger.info(
-                "Loaded Cosmos3 AVAE checkpoint from %s; missing=%d unexpected=%d",
-                checkpoint_path,
-                len(missing),
-                len(unexpected),
-            )
+        state_dict = _load_checkpoint(checkpoint_path, self.device)
+        _validate_diffusers_state_dict(state_dict)
+        self.load_state_dict(state_dict, strict=True)
 
-        self.model.eval()
-        for param in self.model.parameters():
+        self.eval()
+        for param in self.parameters():
             param.requires_grad = False
-        if hasattr(self.model, "remove_weight_norm"):
-            self.model.remove_weight_norm()
-        self.model.to(dtype=self.dtype)
+        self.to(device=self.device, dtype=self.dtype)
+        if _is_rank_zero():
+            logger.info("Loaded diffusers-format Cosmos3 AVAE checkpoint from %s", checkpoint_path)
 
     @property
     def temporal_compression_factor(self) -> int:
@@ -232,14 +292,6 @@ def get_latent_num_samples(self, num_audio_samples: int) -> int:
     def get_audio_num_samples(self, num_latent_samples: int) -> int:
         return int(num_latent_samples) * self.temporal_compression_factor
 
-    def _normalize_latent(self, latent: torch.Tensor) -> torch.Tensor:
-        if self.normalization_type == "tanh":
-            in_dtype = latent.dtype
-            return (torch.tanh(latent.float() / self.tanh_input_scale) * self.tanh_output_scale).to(in_dtype)
-        if self.normalization_type != "none":
-            raise ValueError(f"Unsupported AVAE normalization_type={self.normalization_type!r}.")
-        return latent
-
     def _denormalize_latent(self, latent: torch.Tensor) -> torch.Tensor:
         if self.normalization_type == "tanh":
             in_dtype = latent.dtype
@@ -255,30 +307,15 @@ def _denormalize_latent(self, latent: torch.Tensor) -> torch.Tensor:
 
     @torch.no_grad()
     def encode(self, audio: torch.Tensor, force_pad: bool = False) -> torch.Tensor:
-        in_dtype = audio.dtype
-        x = audio.to(self.device)
-        if x.ndim != 3:
-            raise ValueError(f"AVAE audio input must be [B, C, T], got {tuple(x.shape)}.")
-        if x.shape[1] == 1 and self.audio_channels == 2:
-            x = x.repeat(1, 2, 1)
-        elif x.shape[1] > self.audio_channels:
-            x = x[:, : self.audio_channels]
-        if self.normalize_volume:
-            x = x / (x.abs().amax(dim=(-2, -1), keepdim=True) + 1e-5) * 0.95
-        if force_pad or not self.model.training:
-            pad_amount = (self.hop_size - (x.shape[-1] % self.hop_size)) % self.hop_size
-            if pad_amount:
-                x = F.pad(x, (0, pad_amount), mode="constant", value=0)
-        encoded = self.model.encode(x.to(self.dtype))
-        latent = encoded["latent"] if isinstance(encoded, dict) else encoded
-        return self._normalize_latent(latent).to(in_dtype)
+        del audio, force_pad
+        raise NotImplementedError("Cosmos3AVAEAudioTokenizer is decoder-only for diffusers-format sound_tokenizer/.")
 
     @torch.no_grad()
     def decode(self, latent: torch.Tensor) -> torch.Tensor:
         in_dtype = latent.dtype
+        squeeze = latent.ndim == 2
+        if squeeze:
+            latent = latent.unsqueeze(0)
         z = self._denormalize_latent(latent.to(self.device)).to(self.dtype)
-        decoded = self.model.decode(z)
-        if not isinstance(decoded, dict) or "decoder_out" not in decoded:
-            raise RuntimeError("AVAE decoder did not return decoder_out.")
-        audio = decoded["decoder_out"].clamp(-1.0, 1.0)
-        return audio.to(in_dtype)
+        audio = self.decoder(z).clamp(-1.0, 1.0).to(in_dtype)
+        return audio.squeeze(0) if squeeze else audio
diff --git a/vllm_omni/diffusion/models/cosmos3/audio_tokenizer/bottlenecks.py b/vllm_omni/diffusion/models/cosmos3/audio_tokenizer/bottlenecks.py
deleted file mode 100755
index dc797d051ff..00000000000
--- a/vllm_omni/diffusion/models/cosmos3/audio_tokenizer/bottlenecks.py
+++ /dev/null
@@ -1,132 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""Bottleneck modules for AVAE tokenizer.
-
-This cleaned-up version only includes VAEBottleneck which is used
-by the spec_convnext encoder + oobleck decoder + vae configuration.
-"""
-
-from typing import Any
-
-import torch
-from torch import Tensor, nn
-
-
-# Base class
-class Bottleneck(nn.Module):
-    """Base class for bottleneck modules."""
-
-    def __init__(self: "Bottleneck", is_discrete: bool = False) -> None:
-        super().__init__()
-        self.is_discrete = is_discrete
-
-    def encode(
-        self: "Bottleneck", x: Tensor, return_info: bool = False, **kwargs: Any
-    ) -> Tensor | tuple[Tensor, dict[str, Any]]:
-        raise NotImplementedError
-
-    def decode(self: "Bottleneck", x: Tensor, return_info: bool = False) -> Tensor | tuple[Tensor, dict[str, Any]]:
-        raise NotImplementedError
-
-
-def vae_sample(mean: Tensor, scale: Tensor) -> tuple[Tensor, Tensor]:
-    """
-    Sample from VAE latent distribution.
-
-    Args:
-        mean: Mean of the latent distribution
-        scale: Scale parameter (will be passed through softplus)
-
-    Returns:
-        latents: Sampled latents
-        kl: KL divergence loss
-    """
-    stdev = nn.functional.softplus(scale) + 1e-4  # [B,C,T]
-    var = stdev * stdev  # [B,C,T]
-    logvar = torch.log(var)  # [B,C,T]
-    latents = torch.randn_like(mean) * stdev + mean  # [B,C,T]
-
-    kl = (mean * mean + var - logvar - 1).sum(1).mean()  # scalar
-
-    return latents, kl
-
-
-class VAEBottleneck(Bottleneck):
-    """
-    Variational Autoencoder (VAE) bottleneck.
-
-    Applies VAE reparameterization trick during encoding.
-    """
-
-    def __init__(self: "VAEBottleneck") -> None:
-        super().__init__(is_discrete=False)
-
-    def encode(
-        self: "VAEBottleneck", x: Tensor, return_info: bool = False, **kwargs: Any
-    ) -> Tensor | tuple[Tensor, dict[str, Any]]:
-        """
-        Encode input through VAE bottleneck.
-
-        Args:
-            x: Input tensor with shape [B, C*2, T] where C*2 contains concatenated mean and scale parameters
-            return_info: Whether to return additional info dict
-
-        Returns:
-            Sampled latents (and optionally info dict with KL divergence)
-        """
-        info: dict[str, Any] = {}
-
-        mean, scale = x.chunk(2, dim=1)  # mean,scale: [B,C,T]
-        x, kl = vae_sample(mean, scale)  # x: [B,C,T]
-
-        info["kl"] = kl
-
-        if return_info:
-            return x, info
-        else:
-            return x
-
-    def decode(self: "VAEBottleneck", x: Tensor, return_info: bool = False) -> Tensor | tuple[Tensor, dict[str, Any]]:
-        """
-        Decode from latents (identity operation for VAE).
-
-        Args:
-            x: Latent tensor
-            return_info: Whether to return additional info dict
-
-        Returns:
-            Latents (and optionally empty info dict)
-        """
-        info: dict[str, Any] = {}
-        if return_info:
-            return x, info
-        else:
-            return x
-
-
-def create_bottleneck_from_config(bottleneck_config: dict[str, Any]) -> Bottleneck:
-    """
-    Create a bottleneck module from configuration.
-
-    Args:
-        bottleneck_config: Dictionary with 'type' key specifying bottleneck type
-
-    Returns:
-        Bottleneck module instance
-
-    Note:
-        This cleaned version only supports 'vae' bottleneck type.
-    """
-    bottleneck_type = bottleneck_config.get("type", None)
-
-    assert bottleneck_type is not None, "type must be specified in bottleneck config"
-
-    if bottleneck_type == "vae":
-        bottleneck = VAEBottleneck()
-    else:
-        raise NotImplementedError(
-            f"Bottleneck type '{bottleneck_type}' not supported in cleaned AVAE. "
-            f"Only 'vae' is supported for the spec_convnext + oobleck + vae configuration."
-        )
-
-    return bottleneck
diff --git a/vllm_omni/diffusion/models/cosmos3/audio_tokenizer/config.py b/vllm_omni/diffusion/models/cosmos3/audio_tokenizer/config.py
deleted file mode 100644
index c52a956ce4b..00000000000
--- a/vllm_omni/diffusion/models/cosmos3/audio_tokenizer/config.py
+++ /dev/null
@@ -1,20 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-# Adapted from https://github.com/jik876/hifi-gan under the MIT license.
-
-from typing import Any
-
-
-class AttrDict(dict):
-    def __init__(self: "AttrDict", *args: Any, **kwargs: Any) -> None:
-        values = dict(*args, **kwargs)
-        super().__init__({key: self._convert(value) for key, value in values.items()})
-        self.__dict__ = self
-
-    @classmethod
-    def _convert(cls, value: Any) -> Any:
-        if isinstance(value, dict) and not isinstance(value, AttrDict):
-            return cls(value)
-        if isinstance(value, list):
-            return [cls._convert(item) for item in value]
-        return value
diff --git a/vllm_omni/diffusion/models/cosmos3/audio_tokenizer/models.py b/vllm_omni/diffusion/models/cosmos3/audio_tokenizer/models.py
deleted file mode 100755
index 41ebe5b7b65..00000000000
--- a/vllm_omni/diffusion/models/cosmos3/audio_tokenizer/models.py
+++ /dev/null
@@ -1,614 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-# Adapted from https://github.com/jik876/hifi-gan under the MIT license.
-
-"""AVAE Models.
-
-This file contains only the models needed for the spec_convnext encoder +
-oobleck decoder + vae configuration.
-"""
-
-import math
-from collections.abc import Callable
-from functools import partial
-from typing import Any
-
-import torch
-import torch.nn.functional as F
-from torch import Tensor, nn
-from torch.nn.utils import remove_weight_norm
-from torch.nn.utils.parametrize import remove_parametrizations
-
-from .config import AttrDict
-from .modules import ConvNeXtBlock, OobleckDecoderBlock, WNConv1d, get_activation
-
-# for causal models we use encodec modules
-from .modules_encodec import SConv1d
-
-
-def load_generator(model_type: str, h: AttrDict, device: torch.device | str) -> nn.Module:
-    """
-    Load generator model based on model_type.
-
-    Cleaned version only supports 'autoencoder_v2' type.
-    """
-    if model_type in ["autoencoder_v2"]:
-        generator = LatentAutoEncoderV2(h).to(device)
-    else:
-        raise NotImplementedError(
-            f"Model type '{model_type}' not supported in cleaned AVAE. Only 'autoencoder_v2' is supported."
-        )
-
-    return generator
-
-
-class TrimPadding(nn.Module):
-    """
-    Used for causal convolution support of a conv layer wrapped with nn.Sequential
-    """
-
-    def __init__(self: "TrimPadding", padding: int) -> None:
-        super().__init__()
-        self.padding = padding
-
-    def forward(self: "TrimPadding", x: torch.Tensor) -> torch.Tensor:
-        return x[:, :, : -self.padding]  # [B,C,T-padding]
-
-
-class SpectrogramConvNeXtEncoder(nn.Module):
-    """
-    Spectrogram Encoder with ConvNeXtBlocks
-
-    This encoder processes input waveforms by converting them into spectrograms
-    (magnitude and phase concatenated along the channel dimension) and encodes them
-    using a sequence of ConvNeXtBlocks and downsampling layers.
-
-    Args (mapped from h):
-        in_channels (int): Number of input audio channels (1 for mono, 2 for stereo).
-        channels (int): Base number of channels for the encoder.
-        latent_dim (int): Dimensionality of the final latent representation.
-        c_mults (List[int]): Channel multipliers at each depth of the encoder.
-        strides (List[int]): Downsampling strides for each depth.
-        num_blocks (int): Number of ConvNeXtBlocks to stack per depth.
-        identity_init (bool): Whether to initialize the 1x1 convs in residual paths as zeros.
-        n_fft (int): Number of FFT points for spectrogram computation.
-        hop_length (int): Hop length for the STFT.
-        use_snake (bool): Whether to use Snake activation in ConvNeXtBlocks.
-        causal (bool): If True, uses causal convolutions.
-        padding_mode (str): Padding mode for convolutions (default: 'zeros').
-
-    Inputs:
-        x (torch.Tensor): Input waveform tensor of shape `[batch, in_channels, time]`.
-
-    Outputs:
-        torch.Tensor: Encoded representation of shape `[batch, time_out, latent_dim]`.
-
-    Forward Pass:
-        - Converts waveform input into spectrograms (concatenates magnitude and phase).
-        - Processes the spectrogram through stacked ConvNeXtBlocks and downsampling layers.
-        - Outputs the final latent representation of specified dimensionality.
-
-    Example:
-        encoder = SpectrogramConvNeXtEncoder(
-            in_channels=2, channels=256, latent_dim=128, c_mults=[1, 2, 4], strides=[4, 4, 8]
-        )
-        waveform = torch.randn(8, 2, 65536)  # [batch, channels, time]
-        encoded = encoder(waveform)  # Output: [8, time_out, 128]
-
-    NOTE: output is in [B, T, C] to be consistent with other encoders
-    """
-
-    def __init__(self: "SpectrogramConvNeXtEncoder", h: AttrDict, **kwargs: Any) -> None:
-        super().__init__()
-
-        self.in_channels = h.input_channels
-        if getattr(h, "stereo", False):
-            self.in_channels *= 2
-
-        # if "enc_latent_dim" is found in v2 config, set it as latent_dim
-        if hasattr(h, "enc_latent_dim"):
-            self.latent_dim = h.enc_latent_dim
-        else:
-            # if not found, fallback to v1 logic
-            self.latent_dim = h.vocoder_input_dim
-            if h.model_type == "vae":
-                self.latent_dim *= 2
-
-        self.channels = h.enc_dim
-
-        self.c_mults = h.enc_c_mults
-        self.strides = h.enc_strides
-        self.num_blocks = h.enc_num_blocks
-        self.identity_init = h.enc_identity_init
-        self.causal = h.causal
-        self.padding_mode = h.padding_mode
-
-        self.use_snake = h.enc_use_snake
-
-        # Basic checks
-        assert len(self.c_mults) == len(self.strides), (
-            f"The length of c_mults and strides must match. Got {len(self.c_mults)} vs {len(self.strides)}."
-        )
-
-        # Spectrogram function
-        self.n_fft = h.enc_n_fft
-        self.hop_length = h.enc_hop_length
-        self.spectrogram_fn = partial(
-            self.spectrogram,
-            n_fft=self.n_fft,
-            hop_length=self.hop_length,
-            win_length=self.n_fft,
-            window_fn=torch.hann_window,
-        )
-
-        # ---------------------------------------------------------------------
-        # 1) Initial projection (similar to the first_conv in OobleckEncoder),
-        #    but here we typically use a 1x1 conv for a "spectrogram style" input.
-        # ---------------------------------------------------------------------
-        layers = []
-        layers.append(
-            WNConv1d((self.n_fft + 2) * self.in_channels, self.c_mults[0] * self.channels, kernel_size=1, bias=False)
-        )
-
-        # ---------------------------------------------------------------------
-        # 2) Stages: For each i in range(len(c_mults)):
-        #       - Stack num_blocks of ConvNeXtBlock
-        #       - Downsample via stride convolution
-        # ---------------------------------------------------------------------
-        for i in range(len(self.c_mults)):
-            dim_in = self.c_mults[i] * self.channels
-            # Determine output dimension for the block
-            if i < len(self.c_mults) - 1:  # If not the last block
-                dim_out = self.c_mults[i + 1] * self.channels
-            else:  # For the last block, dim_out is c_mults[-1] * channels
-                dim_out = self.c_mults[-1] * self.channels
-            ds_rate = self.strides[i]
-
-            # (a) Repeated ConvNeXtBlocks
-            for _ in range(self.num_blocks):
-                layers.append(
-                    ConvNeXtBlock(
-                        dim=dim_in,
-                        intermediate_dim=dim_in * 4,
-                        identity_init=self.identity_init,
-                        use_snake=self.use_snake,
-                        causal=self.causal,
-                    )
-                )
-
-            # (b) Downsampling convolution
-            layers.append(self._create_downsample_layer(dim_in, dim_out, ds_rate, self.causal, self.padding_mode))
-
-        # ---------------------------------------------------------------------
-        # 3) Final projection from the last channel dimension to latent_dim.
-        # ---------------------------------------------------------------------
-        layers.append(WNConv1d(self.c_mults[-1] * self.channels, self.latent_dim, kernel_size=1, bias=False))
-
-        self.layers = nn.Sequential(*layers)
-
-    def spectrogram(
-        self: "SpectrogramConvNeXtEncoder",
-        wav: Tensor,
-        n_fft: int,
-        hop_length: int,
-        win_length: int,
-        window_fn: Callable[[int], torch.Tensor] = torch.hann_window,
-    ) -> Tensor:
-        """
-        wav: [B_ch,T_audio] where B_ch = batch * channels (channel folded into batch)
-        returns: [B_ch,n_fft//2+1,T_frames] complex
-        """
-        pad_size_l = (n_fft - hop_length) // 2
-        pad_size_r = (n_fft - hop_length) - pad_size_l
-        with torch.autocast(device_type=wav.device.type, enabled=False):
-            wav = F.pad(wav, (pad_size_l, pad_size_r)).float()  # [B_ch,T_audio+pad]
-            spec = torch.stft(
-                wav,
-                n_fft,
-                hop_length=hop_length,
-                win_length=win_length,
-                window=window_fn(win_length).to(wav),
-                center=False,
-                normalized=False,
-                onesided=True,
-                return_complex=True,
-            )  # [B_ch,n_fft//2+1,T_frames]
-        return spec  # [B_ch,n_fft//2+1,T_frames]
-
-    def _create_downsample_layer(
-        self: "SpectrogramConvNeXtEncoder",
-        in_channels: int,
-        out_channels: int,
-        stride: int,
-        causal: bool,
-        padding_mode: str,
-    ) -> nn.Module:
-        if (
-            causal
-        ):  # use EnCodec's SConv1d for convenience without reinventing the wheels. padding_mode is reflect by default
-            downsample_layer = SConv1d(
-                in_channels=in_channels,
-                out_channels=out_channels,
-                kernel_size=2 * stride,
-                stride=stride,
-                causal=True,
-                norm="weight_norm",
-            )
-        else:  # original non-causal implementation
-            downsample_layer = WNConv1d(
-                in_channels=in_channels,
-                out_channels=out_channels,
-                kernel_size=2 * stride,
-                stride=stride,
-                padding=math.ceil(stride / 2),
-                padding_mode=padding_mode,
-            )
-        return downsample_layer
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        """
-        Args:
-            x: [B,C,T_audio] waveform (mono: C=1, stereo: C=2)
-
-        Returns:
-            [B,T_latent,latent_dim]
-        """
-
-        # Handle stereo input by merging channel dim into batch dim
-        batch, channels, length = x.shape
-        if channels > 1:  # Stereo case
-            x = x.reshape(batch * channels, 1, length)  # [B*C,1,T_audio] (channel folded into batch)
-
-        # Compute the spectrogram
-        with torch.autocast(device_type=x.device.type, enabled=False):
-            spec = self.spectrogram_fn(x.float().squeeze(1))  # [B*C,n_fft//2+1,T_frames] complex
-            mag, ph = torch.view_as_real(spec).chunk(2, dim=-1)  # each [B*C,n_fft//2+1,T_frames,1]
-            spectrogram = torch.cat([mag, ph], dim=1).squeeze(-1)  # [B*C,n_fft+2,T_frames]
-
-        # Cast spectrogram back to original dtype
-        spectrogram = spectrogram.to(x.dtype)  # [B*C,n_fft+2,T_frames]
-
-        # Restore stereo structure if needed
-        if channels > 1:  # Stereo case
-            freq = spectrogram.shape[1]  # Get the frequency dimension
-            spectrogram = spectrogram.reshape(
-                batch, channels * freq, *spectrogram.shape[2:]
-            )  # [B,(n_fft+2)*C,T_frames]
-
-        # forward pass the encoder
-        output = self.layers(spectrogram)  # [B,latent_dim,T_latent]
-
-        return output.transpose(1, 2)  # [B,T_latent,latent_dim]
-
-    def remove_weight_norm(self: "SpectrogramConvNeXtEncoder") -> None:
-        for module in self.modules():
-            if hasattr(module, "parametrizations"):  # for new WN implementation using parameterizations
-                try:
-                    remove_parametrizations(module, "weight")
-                except ValueError:
-                    pass
-            elif hasattr(module, "weight"):
-                try:
-                    remove_weight_norm(module)
-                except ValueError:
-                    pass
-
-
-class OobleckDecoder(nn.Module):
-    """
-    Oobleck Decoder for audio synthesis.
-
-    Decodes latent representations into audio waveforms using
-    upsampling blocks with optional Snake activation and anti-aliasing.
-    """
-
-    def __init__(
-        self: "OobleckDecoder",
-        h: AttrDict,
-    ) -> None:
-        super().__init__()
-
-        self.h = h
-
-        latent_dim = self.h.vocoder_input_dim
-
-        out_channels = self.h.input_channels
-        if getattr(h, "stereo", False):
-            out_channels *= 2
-
-        channels = self.h.dec_dim
-        c_mults = self.h.dec_c_mults
-        strides = self.h.dec_strides
-        use_snake = self.h.dec_use_snake
-        use_nearest_upsample = self.h.dec_use_nearest_upsample
-        antialias_activation = self.h.dec_anti_aliasing
-        causal = self.h.causal
-        final_tanh = self.h.dec_use_tanh_at_final
-        padding_mode = self.h.padding_mode
-
-        c_mults = [1, *c_mults]
-
-        self.depth = len(c_mults)
-
-        # Padding for the first convolution layer
-        self.first_padding = 6 if causal else 3
-        first_conv = WNConv1d(
-            in_channels=latent_dim,
-            out_channels=c_mults[-1] * channels,
-            kernel_size=7,
-            padding=self.first_padding,
-            padding_mode=padding_mode,
-        )
-
-        if causal:
-            first_conv = nn.Sequential(first_conv, TrimPadding(self.first_padding))
-
-        layers = [first_conv]
-
-        for i in range(self.depth - 1, 0, -1):
-            layers += [
-                OobleckDecoderBlock(
-                    in_channels=c_mults[i] * channels,
-                    out_channels=c_mults[i - 1] * channels,
-                    stride=strides[i - 1],
-                    use_snake=use_snake,
-                    antialias_activation=antialias_activation,
-                    use_nearest_upsample=use_nearest_upsample,
-                    causal=causal,
-                    padding_mode=padding_mode,
-                )
-            ]
-
-        # Padding for the final convolution layer
-        self.final_padding = 6 if causal else 3
-        final_conv = WNConv1d(
-            in_channels=c_mults[0] * channels,
-            out_channels=out_channels,
-            kernel_size=7,
-            padding=self.final_padding,
-            padding_mode=padding_mode,
-            bias=False,
-        )
-
-        if causal:
-            final_conv = nn.Sequential(final_conv, TrimPadding(self.final_padding))
-
-        layers += [
-            get_activation(
-                "snake" if use_snake else "elu", antialias=antialias_activation, channels=c_mults[0] * channels
-            ),
-            final_conv,
-            nn.Tanh() if final_tanh else nn.Identity(),
-        ]
-
-        self.layers = nn.Sequential(*layers)
-
-    def forward(self: "OobleckDecoder", x: torch.Tensor) -> torch.Tensor:
-        """
-        Args:
-            x: [B,latent_dim,T_latent]
-
-        Returns:
-            [B,C,T_audio]
-        """
-        x = self.layers(x)  # [B,C,T_audio]
-        return x  # [B,C,T_audio]
-
-    def remove_weight_norm(self: "OobleckDecoder") -> None:
-        for module in self.modules():
-            if hasattr(module, "parametrizations"):  # for new WN implementation using parameterizations
-                try:
-                    remove_parametrizations(module, "weight")
-                except ValueError:
-                    pass
-            elif hasattr(module, "weight"):
-                try:
-                    remove_weight_norm(module)
-                except ValueError:
-                    pass
-
-
-class LatentAutoEncoderV2(nn.Module):
-    """
-    A Latent AutoEncoder class with cleaner implementation to generalize using bottleneck.py
-
-    Attributes:
-        h: Configuration object containing model hyperparameters.
-        encoder (nn.Module): The encoder module based on configuration.
-        bottleneck (Bottleneck): Bottleneck module from bottleneck.py.
-        decoder (nn.Module): The decoder module based on configuration.
-    """
-
-    def __init__(self: "LatentAutoEncoderV2", h: AttrDict) -> None:
-        super().__init__()
-        self.h = h
-
-        # Set up basic model properties
-        self.stereo = getattr(self.h, "stereo", False)
-
-        # Determine input type
-        self.input_type = None
-        if getattr(self.h, "use_wav_as_input", False):
-            self.input_type = "waveform"
-            self.h.input_channels = 1
-        elif getattr(self.h, "use_linear_spec_as_input", False):
-            self.input_type = "linear"
-            self.h.input_channels = self.h.num_linears
-        elif getattr(self.h, "use_discrete_code_as_input", False):
-            self.input_type = "discrete_code"
-            self.h.input_channels = 1
-        else:
-            self.input_type = "mel"
-            self.h.input_channels = self.h.num_mels
-
-        # hop_size defines the down/up sampling factor of the autoencoder
-        self.hop_size = self.h.hop_size
-
-        # Initialize encoder
-        self.enc_type = getattr(self.h, "enc_type", "convnext")
-
-        # Define encoder (only spec_convnext supported in cleaned version)
-        if self.enc_type == "spec_convnext":
-            self.encoder = SpectrogramConvNeXtEncoder(self.h)
-        else:
-            raise NotImplementedError(
-                f"Encoder type '{self.enc_type}' not supported in cleaned AVAE. Only 'spec_convnext' is supported."
-            )
-
-        # Initialize encoder projector (Identity for spec_convnext)
-        self.encoder_proj = nn.Identity()
-
-        # Initialize bottleneck from config
-        from .bottlenecks import create_bottleneck_from_config
-
-        if hasattr(self.h, "bottleneck"):
-            self.bottleneck = create_bottleneck_from_config(self.h.bottleneck)
-        else:
-            raise ValueError("Bottleneck configuration must be specified")
-
-        # Check for encoder-only mode
-        self.encoder_only = getattr(self.h, "encoder_only", False)
-
-        if not self.encoder_only:
-            # Initialize decoder
-            self.dec_type = getattr(self.h, "dec_type", "oobleck")
-            if self.dec_type == "oobleck":
-                self.decoder = OobleckDecoder(self.h)
-            else:
-                raise NotImplementedError(
-                    f"Decoder type '{self.dec_type}' not supported in cleaned AVAE. Only 'oobleck' is supported."
-                )
-        else:
-            # Skip decoder initialization
-            self.decoder = None
-
-        # Whether to freeze encoder
-        self.freeze_encoder = getattr(self.h, "freeze_encoder", False)
-        if self.freeze_encoder:
-            for param in self.encoder.parameters():
-                param.requires_grad = False
-
-    def calculate_latent_lengths(self: "LatentAutoEncoderV2", audio_lengths: torch.Tensor) -> torch.Tensor:
-        """
-        Calculates the latent lengths given the original audio lengths.
-
-        Args:
-            audio_lengths (torch.Tensor): A tensor of shape [B] containing the lengths of the original audio samples.
-
-        Returns:
-            torch.Tensor: A tensor of shape [B] containing the corresponding latent lengths.
-        """
-        if self.input_type == "waveform":
-            # The latent length is the audio length divided by the hop_size
-            latent_lengths = torch.ceil(audio_lengths.float() / self.hop_size).long()  # [B]
-        else:
-            # The latent length is same as audio_lengths
-            latent_lengths = audio_lengths  # [B]
-
-        return latent_lengths
-
-    def forward(self: "LatentAutoEncoderV2", x: torch.Tensor) -> dict[str, torch.Tensor]:
-        """
-        Forward pass through the model.
-
-        Args:
-            x (torch.Tensor): Input tensor to the model with shape [B,C,T_audio].
-
-        Returns:
-            dict[str, torch.Tensor]: Dictionary of output tensors including:
-                - encoder_out: Raw encoder output
-                - latent: Bottleneck latent representation
-                - decoder_out: Decoded output (if decoder exists)
-                - Additional outputs specific to the bottleneck type
-        """
-        return_dict = {}
-
-        # Encoder
-        encoder_out = self.encoder(x)  # [B,T_latent,enc_latent_dim]
-        encoder_out_proj = self.encoder_proj(encoder_out)  # [B,T_latent,enc_latent_dim]
-
-        # Apply bottleneck after reshaping to [B, C, T] again
-        latent, bottleneck_enc_info = self.bottleneck.encode(
-            encoder_out_proj.transpose(1, 2),
-            return_info=True,  # transpose: [B,enc_latent_dim,T_latent]
-        )  # [B,C,T_latent]
-
-        # Update return dictionary
-        return_dict.update(
-            {"encoder_out": encoder_out.transpose(1, 2), "latent": latent}  # encoder_out: [B,enc_latent_dim,T_latent]
-        )
-        # Add bottleneck-specific info to return dict
-        for k, v in bottleneck_enc_info.items():
-            return_dict[k] = v
-
-        # Decode (if decoder exists)
-        if self.decoder is not None:
-            # Apply bottleneck decode
-            decoded_latent, bottleneck_dec_info = self.bottleneck.decode(latent, return_info=True)  # [B,C,T_latent]
-            # Apply decoder
-            decoder_out = self.decoder(decoded_latent)  # [B,C,T_audio]
-
-            # Update return dictionary
-            return_dict["decoder_out"] = decoder_out  # [B,C,T_audio]
-            # Add bottleneck-specific info to return dict
-            for k, v in bottleneck_dec_info.items():
-                return_dict[k] = v
-
-        return return_dict
-
-    def encode(self: "LatentAutoEncoderV2", x: torch.Tensor) -> dict[str, torch.Tensor]:
-        """
-        Encodes input x into latent representation using encoder and bottleneck.
-
-        Args:
-            x (torch.Tensor): Input tensor with shape [B, C, T].
-
-        Returns:
-            dict[str, torch.Tensor]: Dictionary containing:
-                - latent: Bottleneck latent representation
-                - Additional outputs specific to the bottleneck type
-        """
-        encoder_out = self.encoder(x)  # [B,T_latent,enc_latent_dim]
-        encoder_out_proj = self.encoder_proj(encoder_out)  # [B,T_latent,enc_latent_dim]
-        latent, bottleneck_info = self.bottleneck.encode(
-            encoder_out_proj.transpose(1, 2),
-            return_info=True,  # transpose: [B,enc_latent_dim,T_latent]
-        )  # [B,C,T_latent]
-
-        return_dict = {"latent": latent}  # latent: [B,C,T_latent]
-        # Add bottleneck-specific info to return dict
-        for k, v in bottleneck_info.items():
-            return_dict[k] = v
-
-        return return_dict
-
-    def decode(self: "LatentAutoEncoderV2", latent: torch.Tensor) -> dict[str, torch.Tensor]:
-        """
-        Decodes continuous latent representation into output using bottleneck and decoder.
-
-        Args:
-            latent (torch.Tensor): continuous latent representation with shape [B, C, T].
-
-        Returns:
-            dict[str, torch.Tensor]: Dictionary containing:
-                - decoder_out: The output from the decoder
-                - Additional outputs from the bottleneck decode process
-        """
-        # Apply bottleneck decode
-        decoded_latent, bottleneck_info = self.bottleneck.decode(latent, return_info=True)  # [B,C,T_latent]
-
-        # Apply decoder
-        decoder_out = self.decoder(decoded_latent)  # [B,C,T_audio]
-
-        return_dict = {"decoder_out": decoder_out}  # decoder_out: [B,C,T_audio]
-        # Add bottleneck-specific info to return dict
-        for k, v in bottleneck_info.items():
-            return_dict[k] = v
-
-        return return_dict
-
-    def remove_weight_norm(self: "LatentAutoEncoderV2") -> None:
-        """Remove weight normalization from all components."""
-        self.encoder.remove_weight_norm()
-        if self.decoder is not None:
-            self.decoder.remove_weight_norm()
diff --git a/vllm_omni/diffusion/models/cosmos3/audio_tokenizer/modules.py b/vllm_omni/diffusion/models/cosmos3/audio_tokenizer/modules.py
deleted file mode 100755
index 55a8597f128..00000000000
--- a/vllm_omni/diffusion/models/cosmos3/audio_tokenizer/modules.py
+++ /dev/null
@@ -1,391 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-# Adapted from https://github.com/jik876/hifi-gan under the MIT license.
-
-"""AVAE Modules.
-
-This file contains only the modules needed for the spec_convnext encoder +
-oobleck decoder + vae configuration.
-"""
-
-import math
-from typing import Any, Literal
-
-import torch
-from torch import Tensor, nn
-from torch.cuda import amp
-from torch.nn.utils import weight_norm
-
-from . import activations
-from .alias_free_torch.act import Activation1d as TorchActivation1d
-
-# for causal models we use encodec modules
-from .modules_encodec import SConvTranspose1d
-
-
-def WNConv1d(*args: Any, **kwargs: Any) -> nn.Conv1d:
-    """Weight-normalized 1D convolution."""
-    return weight_norm(nn.Conv1d(*args, **kwargs))
-
-
-def WNConvTranspose1d(*args: Any, **kwargs: Any) -> nn.ConvTranspose1d:
-    """Weight-normalized 1D transpose convolution."""
-    return weight_norm(nn.ConvTranspose1d(*args, **kwargs))
-
-
-def zero_module(module: nn.Module) -> nn.Module:
-    """
-    Zero out the parameters of a module and return it.
-    Used for identity initialization in ConvNeXt blocks.
-    """
-    for p in module.parameters():
-        p.detach().zero_()
-    return module
-
-
-def may_mask(
-    x: Tensor,
-    mask: Tensor | None = None,
-) -> Tensor:
-    """
-    Apply mask to tensor if provided.
-
-    Args:
-        x: Input tensor
-        mask: Optional mask tensor
-
-    Returns:
-        Masked tensor if mask is provided, otherwise original tensor
-    """
-    if mask is not None:
-        x = x * mask
-    return x
-
-
-class ConvNeXtBlock(nn.Module):
-    """
-    ConvNeXt 1D Block adapted from https://github.com/charactr-platform/vocos
-    which is adapted from https://github.com/facebookresearch/ConvNeXt to 1D audio signal.
-    Supports causal and non-causal mode.
-
-    Args:
-        dim (int): Number of input channels.
-        intermediate_dim (int): Dimensionality of the intermediate layer.
-        identity_init (bool): If True, initializes the 1x1 conv in residual paths to zero (identity-friendly).
-        use_snake (bool): If True, uses SnakeBeta activation; otherwise, GELU.
-        causal (bool): If True, applies causal padding; otherwise, applies symmetric padding for non-causal.
-    """
-
-    def __init__(
-        self,
-        dim: int,
-        intermediate_dim: int,
-        identity_init: bool = False,
-        use_snake: bool = False,
-        causal: bool = False,
-    ):
-        super().__init__()
-        self.causal = causal
-
-        if causal:
-            # Causal padding: Only pad on the left
-            self.dwconv = nn.Sequential(
-                nn.ConstantPad1d((6, 0), 0),  # causal padding
-                nn.Conv1d(dim, dim, kernel_size=7, groups=dim),
-            )
-        else:
-            # Non-causal padding: Symmetric padding
-            self.dwconv = nn.Sequential(
-                nn.ConstantPad1d((3, 3), 0),  # symmetric padding (kernel_size // 2 on both sides)
-                nn.Conv1d(dim, dim, kernel_size=7, groups=dim),
-            )
-
-        self.norm = nn.LayerNorm(dim, bias=False)
-        self.pwconv1 = nn.Conv1d(dim, intermediate_dim, 1)  # pointwise/1x1 convs
-        self.act = activations.SnakeBeta(intermediate_dim) if use_snake else nn.GELU()
-
-        if identity_init:
-            self.pwconv2 = zero_module(nn.Conv1d(intermediate_dim, dim, 1))
-        else:
-            self.pwconv2 = nn.Conv1d(intermediate_dim, dim, 1)
-
-    def forward(self, x: Tensor, mask: Tensor | None = None) -> Tensor:
-        """
-        Forward pass.
-
-        Args:
-            x: Input tensor of shape (B, C, T)
-            mask: Optional mask tensor
-
-        Returns:
-            Output tensor of shape (B, C, T)
-        """
-        residual = x  # [B,C,T]
-        x = self.dwconv(may_mask(x, mask))  # [B,C,T]
-        x = x.permute(0, 2, 1)  # [B,C,T] -> [B,T,C]
-        dtype = x.dtype
-        with amp.autocast(enabled=True, dtype=torch.float32):
-            x = self.norm(x)
-        x = x.to(dtype).permute(0, 2, 1)  # [B,T,C] -> [B,C,T]
-        x = self.pwconv1(x)  # [B,intermediate_dim,T]
-        x = self.act(x)  # [B,intermediate_dim,T]
-        x = self.pwconv2(x)  # [B,C,T]
-        x = residual + x  # [B,C,T]
-        return may_mask(x, mask)  # [B,C,T]
-
-    def remove_weight_norm(self) -> None:
-        """No weight norm is applied in ConvNeXtBlock."""
-        pass
-
-
-def get_activation(
-    activation: Literal["elu", "snake", "none"],
-    antialias: bool = False,
-    channels: int | None = None,
-    use_cuda_kernel: bool = False,
-) -> nn.Module:
-    """
-    Get activation module by name.
-
-    Args:
-        activation: Activation type ('elu', 'snake', or 'none')
-        antialias: Whether to wrap with anti-aliasing
-        channels: Number of channels (required for snake activation)
-        use_cuda_kernel: Whether to use CUDA kernel (not supported)
-
-    Returns:
-        Activation module
-    """
-    if activation == "elu":
-        act = nn.ELU()
-    elif activation == "snake":
-        act = activations.SnakeBeta(channels)
-    elif activation == "none":
-        act = nn.Identity()
-    else:
-        raise ValueError(f"Unknown activation {activation}")
-
-    if antialias:
-        # select which Activation1d, lazy-load cuda version to ensure backward compatibility
-        if use_cuda_kernel:
-            raise NotImplementedError("CUDA kernels not supported in this port")
-        else:
-            Activation1d = TorchActivation1d
-
-        act = Activation1d(act)
-
-    return act
-
-
-class ResidualUnit(nn.Module):
-    """
-    Residual unit with dilated convolutions.
-    Used in OobleckDecoderBlock.
-
-    Args:
-        in_channels: Number of input channels
-        out_channels: Number of output channels
-        dilation: Dilation rate
-        kernel_size: Convolution kernel size (default: 7)
-        use_snake: Whether to use Snake activation (default: False)
-        antialias_activation: Whether to use anti-aliasing (default: False)
-        causal: Whether to use causal convolutions (default: False)
-        padding_mode: Padding mode for convolutions (default: 'zeros')
-    """
-
-    def __init__(
-        self,
-        in_channels: int,
-        out_channels: int,
-        dilation: int,
-        kernel_size: int = 7,
-        use_snake: bool = False,
-        antialias_activation: bool = False,
-        causal: bool = False,
-        padding_mode: str = "zeros",
-    ) -> None:
-        super().__init__()
-
-        self.dilation = dilation
-        self.causal = causal
-        self.kernel_size = kernel_size
-
-        if causal:
-            self.padding = dilation * (kernel_size - 1)
-        else:
-            self.padding = (dilation * (kernel_size - 1)) // 2
-
-        # original non-causal impl used zero padding (DAC, SAVAE)
-        # Reflect padding may reduce edge artifacts (EnCodec's default), but
-        # it increases VRAM usage during training.
-        self.padding_mode = padding_mode
-
-        self.layers = nn.Sequential(
-            get_activation("snake" if use_snake else "elu", antialias=antialias_activation, channels=out_channels),
-            WNConv1d(
-                in_channels=in_channels,
-                out_channels=out_channels,
-                kernel_size=kernel_size,
-                dilation=dilation,
-                padding=self.padding,
-                padding_mode=self.padding_mode,
-            ),
-            get_activation("snake" if use_snake else "elu", antialias=antialias_activation, channels=out_channels),
-            WNConv1d(in_channels=out_channels, out_channels=out_channels, kernel_size=1, padding=0),
-        )
-
-    def forward(self, x: Tensor) -> Tensor:
-        """
-        Forward pass.
-
-        Args:
-            x: Input tensor of shape (B, C, T)
-
-        Returns:
-            Output tensor of shape (B, C, T)
-        """
-        res = x  # [B,C,T]
-
-        # apply conv layers
-        x = self.layers(x)  # [B,C,T] (padded if causal)
-
-        if self.causal:
-            # Trim right padding to get the causal output
-            x = x[:, :, : -self.padding]  # [B,C,T]
-
-        return x + res  # [B,C,T]
-
-
-class OobleckDecoderBlock(nn.Module):
-    """
-    Oobleck decoder block with upsampling and residual units.
-
-    Args:
-        in_channels: Number of input channels
-        out_channels: Number of output channels
-        stride: Upsampling stride
-        use_snake: Whether to use Snake activation (default: False)
-        antialias_activation: Whether to use anti-aliasing (default: False)
-        use_nearest_upsample: Whether to use nearest neighbor upsampling (default: False)
-        causal: Whether to use causal convolutions (default: False)
-        padding_mode: Padding mode for convolutions (default: 'zeros')
-    """
-
-    def __init__(
-        self,
-        in_channels: int,
-        out_channels: int,
-        stride: int,
-        use_snake: bool = False,
-        antialias_activation: bool = False,
-        use_nearest_upsample: bool = False,
-        causal: bool = False,
-        padding_mode: str = "zeros",
-    ) -> None:
-        super().__init__()
-
-        self.causal = causal
-
-        self.layers = nn.Sequential(
-            get_activation("snake" if use_snake else "elu", antialias=antialias_activation, channels=in_channels),
-            self._create_upsample_layer(in_channels, out_channels, stride, use_nearest_upsample, causal, padding_mode),
-            ResidualUnit(
-                in_channels=out_channels,
-                out_channels=out_channels,
-                dilation=1,
-                use_snake=use_snake,
-                causal=causal,
-                padding_mode=padding_mode,
-            ),
-            ResidualUnit(
-                in_channels=out_channels,
-                out_channels=out_channels,
-                dilation=3,
-                use_snake=use_snake,
-                causal=causal,
-                padding_mode=padding_mode,
-            ),
-            ResidualUnit(
-                in_channels=out_channels,
-                out_channels=out_channels,
-                dilation=9,
-                use_snake=use_snake,
-                causal=causal,
-                padding_mode=padding_mode,
-            ),
-        )
-
-    def _create_upsample_layer(
-        self,
-        in_channels: int,
-        out_channels: int,
-        stride: int,
-        use_nearest_upsample: bool,
-        causal: bool,
-        padding_mode: str,
-    ) -> nn.Module:
-        """
-        Create upsampling layer based on configuration.
-
-        Note: padding_mode parameter is not used in this function.
-        """
-
-        if causal:  # use EnCodec's SConvTransposed1d for convenience. padding_mode is reflect by default
-            assert not use_nearest_upsample, "use_nearest_upsample is not implemented for causal mode!"
-            upsample_layer = SConvTranspose1d(
-                in_channels=in_channels,
-                out_channels=out_channels,
-                kernel_size=2 * stride,
-                stride=stride,
-                causal=True,
-                norm="weight_norm",
-            )
-        else:
-            if use_nearest_upsample:
-                upsample_layer = nn.Sequential(
-                    nn.Upsample(scale_factor=stride, mode="nearest"),
-                    WNConv1d(
-                        in_channels=in_channels,
-                        out_channels=out_channels,
-                        kernel_size=2 * stride,
-                        stride=1,
-                        bias=False,
-                        padding="same",
-                    ),
-                )
-            else:
-                # WNConvTranspose1d only supports zeros padding mode so it's hardcoded
-                upsample_layer = WNConvTranspose1d(
-                    in_channels=in_channels,
-                    out_channels=out_channels,
-                    kernel_size=2 * stride,
-                    stride=stride,
-                    padding=math.ceil(stride / 2),
-                    output_padding=stride % 2,
-                    padding_mode="zeros",
-                )
-
-        return upsample_layer
-
-    def forward(self, x: Tensor) -> Tensor:
-        """
-        Forward pass.
-
-        Args:
-            x: Input tensor of shape (B, C, T)
-
-        Returns:
-            Output tensor of shape (B, C, T_upsampled)
-        """
-        return self.layers(x)
-
-    def remove_weight_norm(self) -> None:
-        """Remove weight normalization from all layers."""
-        from torch.nn.utils import remove_weight_norm
-
-        for layer in self.layers:
-            try:
-                remove_weight_norm(layer)
-            except (ValueError, AttributeError):
-                # Layer doesn't have weight norm or is not a module with weight norm
-                pass
diff --git a/vllm_omni/diffusion/models/cosmos3/audio_tokenizer/modules_encodec.py b/vllm_omni/diffusion/models/cosmos3/audio_tokenizer/modules_encodec.py
deleted file mode 100755
index 007e13f24df..00000000000
--- a/vllm_omni/diffusion/models/cosmos3/audio_tokenizer/modules_encodec.py
+++ /dev/null
@@ -1,297 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-# Adapted from https://github.com/facebookresearch/encodec under the MIT license.
-
-"""Convolutional layers wrappers and utilities."""
-
-import math
-import warnings
-from typing import Any
-
-import einops
-import torch
-from torch import nn
-from torch.nn import functional as F
-from torch.nn.utils import spectral_norm, weight_norm
-
-CONV_NORMALIZATIONS = frozenset(
-    ["none", "weight_norm", "spectral_norm", "time_layer_norm", "layer_norm", "time_group_norm"]
-)
-
-
-class ConvLayerNorm(nn.LayerNorm):
-    """
-    Convolution-friendly LayerNorm that moves channels to last dimensions
-    before running the normalization and moves them back to original position right after.
-    """
-
-    def __init__(self: "ConvLayerNorm", normalized_shape: int | list[int] | torch.Size, **kwargs: Any) -> None:
-        super().__init__(normalized_shape, **kwargs)
-
-    def forward(self: "ConvLayerNorm", x: torch.Tensor) -> torch.Tensor:
-        x = einops.rearrange(x, "b ... t -> b t ...")  # [B,T,C]
-        x = super().forward(x)  # [B,T,C]
-        x = einops.rearrange(x, "b t ... -> b ... t")  # [B,C,T]
-        return x  # [B,C,T]
-
-
-def apply_parametrization_norm(module: nn.Module, norm: str = "none") -> nn.Module:
-    assert norm in CONV_NORMALIZATIONS
-    if norm == "weight_norm":
-        return weight_norm(module)
-    elif norm == "spectral_norm":
-        return spectral_norm(module)
-    else:
-        # We already check was in CONV_NORMALIZATION, so any other choice
-        # doesn't need reparameterization.
-        return module
-
-
-def get_norm_module(module: nn.Module, causal: bool = False, norm: str = "none", **norm_kwargs) -> nn.Module:
-    """Return the proper normalization module. If causal is True, this will ensure the returned
-    module is causal, or return an error if the normalization doesn't support causal evaluation.
-    """
-    assert norm in CONV_NORMALIZATIONS
-    if norm == "layer_norm":
-        assert isinstance(module, nn.modules.conv._ConvAnd)
-        return ConvLayerNorm(module.out_channels, **norm_kwargs)
-    elif norm == "time_group_norm":
-        if causal:
-            raise ValueError("GroupNorm doesn't support causal evaluation.")
-        assert isinstance(module, nn.modules.conv._ConvAnd)
-        return nn.GroupNorm(1, module.out_channels, **norm_kwargs)
-    else:
-        return nn.Identity()
-
-
-def get_extra_padding_for_conv1d(x: torch.Tensor, kernel_size: int, stride: int, padding_total: int = 0) -> int:
-    """See `pad_for_conv1d`."""
-    length = x.shape[-1]
-    n_frames = (length - kernel_size + padding_total) / stride + 1
-    ideal_length = (math.ceil(n_frames) - 1) * stride + (kernel_size - padding_total)
-    return ideal_length - length
-
-
-def pad_for_conv1d(x: torch.Tensor, kernel_size: int, stride: int, padding_total: int = 0):
-    """Pad for a convolution to make sure that the last window is full.
-    Extra padding is added at the end. This is required to ensure that we can rebuild
-    an output of the same length, as otherwise, even with padding, some time steps
-    might get removed.
-    For instance, with total padding = 4, kernel size = 4, stride = 2:
-        0 0 1 2 3 4 5 0 0   # (0s are padding)
-        1   2   3           # (output frames of a convolution, last 0 is never used)
-        0 0 1 2 3 4 5 0     # (output of tr. conv., but pos. 5 is going to get removed as padding)
-            1 2 3 4         # once you removed padding, we are missing one time step !
-    """
-    extra_padding = get_extra_padding_for_conv1d(x, kernel_size, stride, padding_total)
-    return F.pad(x, (0, extra_padding))  # [B,C,T+extra_padding]
-
-
-def pad1d(x: torch.Tensor, paddings: tuple[int, int], mode: str = "zero", value: float = 0.0):
-    """Tiny wrapper around F.pad, just to allow for reflect padding on small input.
-    If this is the case, we insert extra 0 padding to the right before the reflection happen.
-    """
-    length = x.shape[-1]
-    padding_left, padding_right = paddings
-    assert padding_left >= 0 and padding_right >= 0, (padding_left, padding_right)
-    if mode == "reflect":
-        max_pad = max(padding_left, padding_right)
-        extra_pad = 0
-        if length <= max_pad:
-            extra_pad = max_pad - length + 1
-            x = F.pad(x, (0, extra_pad))  # [B,C,T+extra_pad]
-        padded = F.pad(x, paddings, mode, value)  # [B,C,T+padding_left+padding_right]
-        end = padded.shape[-1] - extra_pad
-        return padded[..., :end]  # [B,C,T+padding_left+padding_right]
-    else:
-        return F.pad(x, paddings, mode, value)  # [B,C,T+padding_left+padding_right]
-
-
-def unpad1d(x: torch.Tensor, paddings: tuple[int, int]):
-    """Remove padding from x, handling properly zero padding. Only for 1d!"""
-    padding_left, padding_right = paddings
-    assert padding_left >= 0 and padding_right >= 0, (padding_left, padding_right)
-    assert (padding_left + padding_right) <= x.shape[-1]
-    end = x.shape[-1] - padding_right
-    return x[..., padding_left:end]
-
-
-class NormConv1d(nn.Module):
-    """Wrapper around Conv1d and normalization applied to this conv
-    to provide a uniform interface across normalization approaches.
-    """
-
-    def __init__(self, *args, causal: bool = False, norm: str = "none", norm_kwargs: dict[str, Any] = {}, **kwargs):
-        super().__init__()
-        self.conv = apply_parametrization_norm(nn.Conv1d(*args, **kwargs), norm)
-        self.norm = get_norm_module(self.conv, causal, norm, **norm_kwargs)
-        self.norm_type = norm
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        x = self.conv(x)  # [B,C_out,T_out]
-        x = self.norm(x)  # [B,C_out,T_out]
-        return x  # [B,C_out,T_out]
-
-
-class NormConv2d(nn.Module):
-    """Wrapper around Conv2d and normalization applied to this conv
-    to provide a uniform interface across normalization approaches.
-    """
-
-    def __init__(self, *args, norm: str = "none", norm_kwargs: dict[str, Any] = {}, **kwargs):
-        super().__init__()
-        self.conv = apply_parametrization_norm(nn.Conv2d(*args, **kwargs), norm)
-        self.norm = get_norm_module(self.conv, causal=False, norm=norm, **norm_kwargs)
-        self.norm_type = norm
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        x = self.conv(x)  # [B,C_out,H_out,W_out]
-        x = self.norm(x)  # [B,C_out,H_out,W_out]
-        return x  # [B,C_out,H_out,W_out]
-
-
-class NormConvTranspose1d(nn.Module):
-    """Wrapper around ConvTranspose1d and normalization applied to this conv
-    to provide a uniform interface across normalization approaches.
-    """
-
-    def __init__(self, *args, causal: bool = False, norm: str = "none", norm_kwargs: dict[str, Any] = {}, **kwargs):
-        super().__init__()
-        self.convtr = apply_parametrization_norm(nn.ConvTranspose1d(*args, **kwargs), norm)
-        self.norm = get_norm_module(self.convtr, causal, norm, **norm_kwargs)
-        self.norm_type = norm
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        x = self.convtr(x)  # [B,C_out,T_out]
-        x = self.norm(x)  # [B,C_out,T_out]
-        return x  # [B,C_out,T_out]
-
-
-class NormConvTranspose2d(nn.Module):
-    """Wrapper around ConvTranspose2d and normalization applied to this conv
-    to provide a uniform interface across normalization approaches.
-    """
-
-    def __init__(self, *args, norm: str = "none", norm_kwargs: dict[str, Any] = {}, **kwargs):
-        super().__init__()
-        self.convtr = apply_parametrization_norm(nn.ConvTranspose2d(*args, **kwargs), norm)
-        self.norm = get_norm_module(self.convtr, causal=False, norm=norm, **norm_kwargs)
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        x = self.convtr(x)  # [B,C_out,H_out,W_out]
-        x = self.norm(x)  # [B,C_out,H_out,W_out]
-        return x  # [B,C_out,H_out,W_out]
-
-
-class SConv1d(nn.Module):
-    """Conv1d with some builtin handling of asymmetric or causal padding
-    and normalization.
-    """
-
-    def __init__(
-        self,
-        in_channels: int,
-        out_channels: int,
-        kernel_size: int,
-        stride: int = 1,
-        dilation: int = 1,
-        groups: int = 1,
-        bias: bool = True,
-        causal: bool = False,
-        norm: str = "none",
-        norm_kwargs: dict[str, Any] = {},
-        pad_mode: str = "reflect",
-    ):
-        super().__init__()
-        # warn user on unusual setup between dilation and stride
-        if stride > 1 and dilation > 1:
-            warnings.warn(
-                "SConv1d has been initialized with stride > 1 and dilation > 1"
-                f" (kernel_size={kernel_size} stride={stride}, dilation={dilation})."
-            )
-        self.conv = NormConv1d(
-            in_channels,
-            out_channels,
-            kernel_size,
-            stride,
-            dilation=dilation,
-            groups=groups,
-            bias=bias,
-            causal=causal,
-            norm=norm,
-            norm_kwargs=norm_kwargs,
-        )
-        self.causal = causal
-        self.pad_mode = pad_mode
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:  # x: [B,C,T]
-        B, C, T = x.shape
-        kernel_size = self.conv.conv.kernel_size[0]
-        stride = self.conv.conv.stride[0]
-        dilation = self.conv.conv.dilation[0]
-        kernel_size = (kernel_size - 1) * dilation + 1  # effective kernel size with dilations
-        padding_total = kernel_size - stride
-        extra_padding = get_extra_padding_for_conv1d(x, kernel_size, stride, padding_total)
-        if self.causal:
-            # Left padding for causal
-            x = pad1d(x, (padding_total, extra_padding), mode=self.pad_mode)  # [B,C,T+padding_total+extra_padding]
-        else:
-            # Asymmetric padding required for odd strides
-            padding_right = padding_total // 2
-            padding_left = padding_total - padding_right
-            x = pad1d(
-                x, (padding_left, padding_right + extra_padding), mode=self.pad_mode
-            )  # [B,C,T+padding_total+extra_padding]
-        return self.conv(x)  # [B,C_out,T_out]
-
-
-class SConvTranspose1d(nn.Module):
-    """ConvTranspose1d with some builtin handling of asymmetric or causal padding
-    and normalization.
-    """
-
-    def __init__(
-        self,
-        in_channels: int,
-        out_channels: int,
-        kernel_size: int,
-        stride: int = 1,
-        causal: bool = False,
-        norm: str = "none",
-        trim_right_ratio: float = 1.0,
-        norm_kwargs: dict[str, Any] = {},
-    ):
-        super().__init__()
-        self.convtr = NormConvTranspose1d(
-            in_channels, out_channels, kernel_size, stride, causal=causal, norm=norm, norm_kwargs=norm_kwargs
-        )
-        self.causal = causal
-        self.trim_right_ratio = trim_right_ratio
-        assert self.causal or self.trim_right_ratio == 1.0, (
-            "`trim_right_ratio` != 1.0 only makes sense for causal convolutions"
-        )
-        assert self.trim_right_ratio >= 0.0 and self.trim_right_ratio <= 1.0
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:  # x: [B,C,T]
-        kernel_size = self.convtr.convtr.kernel_size[0]
-        stride = self.convtr.convtr.stride[0]
-        padding_total = kernel_size - stride
-
-        y = self.convtr(x)  # [B,C_out,T*stride+padding_total]
-
-        # We will only trim fixed padding. Extra padding from `pad_for_conv1d` would be
-        # removed at the very end, when keeping only the right length for the output,
-        # as removing it here would require also passing the length at the matching layer
-        # in the encoder.
-        if self.causal:
-            # Trim the padding on the right according to the specified ratio
-            # if trim_right_ratio = 1.0, trim everything from right
-            padding_right = math.ceil(padding_total * self.trim_right_ratio)
-            padding_left = padding_total - padding_right
-            y = unpad1d(y, (padding_left, padding_right))  # [B,C_out,T_out]
-        else:
-            # Asymmetric padding required for odd strides
-            padding_right = padding_total // 2
-            padding_left = padding_total - padding_right
-            y = unpad1d(y, (padding_left, padding_right))  # [B,C_out,T_out]
-        return y  # [B,C_out,T_out]
diff --git a/vllm_omni/diffusion/models/cosmos3/sound_tokenizer.py b/vllm_omni/diffusion/models/cosmos3/sound_tokenizer.py
index cceaf897083..281b7e1d9f0 100644
--- a/vllm_omni/diffusion/models/cosmos3/sound_tokenizer.py
+++ b/vllm_omni/diffusion/models/cosmos3/sound_tokenizer.py
@@ -31,7 +31,7 @@
 DEFAULT_SOUND_TANH_OUTPUT_SCALE = 3.5
 DEFAULT_SOUND_TANH_CLAMP = 0.995
 SOUND_TOKENIZER_COMPONENT_NAME = "sound_tokenizer"
-SOUND_TOKENIZER_CHECKPOINT_NAME = "model.safetensors"
+SOUND_TOKENIZER_CHECKPOINT_NAME = "diffusion_pytorch_model.safetensors"
 
 
 def _pipeline_args(od_config: OmniDiffusionConfig) -> dict[str, Any]:
@@ -392,7 +392,7 @@ def from_config(cls, od_config: OmniDiffusionConfig) -> Cosmos3SoundTokenizer:
                 "tokenizer checkpoint was provided. Set "
                 "custom_pipeline_args['sound_tokenizer_path'] or "
                 "COSMOS3_SOUND_TOKENIZER_PATH, or include "
-                "sound_tokenizer/model.safetensors under the model path."
+                f"{SOUND_TOKENIZER_COMPONENT_NAME}/{SOUND_TOKENIZER_CHECKPOINT_NAME} under the model path."
             )
 
         config_path = _resolve_model_file(explicit_config_path, model_root)
diff --git a/vllm_omni/diffusion/models/cosmos3/transformer_cosmos3.py b/vllm_omni/diffusion/models/cosmos3/transformer_cosmos3.py
index 9166498bc9f..dec8400a4ec 100644
--- a/vllm_omni/diffusion/models/cosmos3/transformer_cosmos3.py
+++ b/vllm_omni/diffusion/models/cosmos3/transformer_cosmos3.py
@@ -399,11 +399,10 @@ def __init__(
         target_dtype: torch.dtype = torch.bfloat16,
     ) -> None:
         super().__init__()
-        self.mlp = nn.Sequential(
-            nn.Linear(frequency_embedding_size, hidden_size, bias=True),
-            nn.SiLU(),
-            nn.Linear(hidden_size, hidden_size, bias=True),
-        )
+        # Following diffusers naming pattern here for checkpoint compatibility.
+        self.linear_1 = nn.Linear(frequency_embedding_size, hidden_size, bias=True)
+        self.act = nn.SiLU()
+        self.linear_2 = nn.Linear(hidden_size, hidden_size, bias=True)
         self.frequency_embedding_size = frequency_embedding_size
         self.hidden_size = hidden_size
 
@@ -414,7 +413,7 @@ def __init__(
     def forward(self, t: torch.Tensor) -> torch.Tensor:
         args = t[:, None] * self.freqs[None]
         t_freq = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
-        return self.mlp(t_freq)
+        return self.linear_2(self.act(self.linear_1(t_freq)))
 
 
 # ---------------------------------------------------------------------------

From 6775a9128b27badf912f5bda2b39ed7a130db217 Mon Sep 17 00:00:00 2001
From: Maciej Bala <mbala@nvidia.com>
Date: Mon, 18 May 2026 17:22:17 +0200
Subject: [PATCH 13/41] Rename _layerwise_offload_blocks_attr

Signed-off-by: Maciej Bala <mbala@nvidia.com>
---
 vllm_omni/diffusion/models/cosmos3/transformer_cosmos3.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm_omni/diffusion/models/cosmos3/transformer_cosmos3.py b/vllm_omni/diffusion/models/cosmos3/transformer_cosmos3.py
index dec8400a4ec..353f77d7598 100644
--- a/vllm_omni/diffusion/models/cosmos3/transformer_cosmos3.py
+++ b/vllm_omni/diffusion/models/cosmos3/transformer_cosmos3.py
@@ -996,7 +996,7 @@ class Cosmos3VFMTransformer(nn.Module):
 
     _repeated_blocks = ["Cosmos3GenDecoderLayer"]
 
-    _layerwise_offload_blocks_attr = "gen_layers"
+    _layerwise_offload_blocks_attrs = ["gen_layers"]
 
     packed_modules_mapping = {}
 

From ab3c262ce7716d965ed85c9f2959cbec4cf124c7 Mon Sep 17 00:00:00 2001
From: Maciej Bala <mbala@nvidia.com>
Date: Mon, 18 May 2026 18:34:47 +0200
Subject: [PATCH 14/41] Added yaml example without guardrails

Signed-off-by: Maciej Bala <mbala@nvidia.com>
---
 .../cosmos3/cosmos3_no_guardrails.yaml        | 36 +++++++++++++++++++
 1 file changed, 36 insertions(+)
 create mode 100644 examples/online_serving/cosmos3/cosmos3_no_guardrails.yaml

diff --git a/examples/online_serving/cosmos3/cosmos3_no_guardrails.yaml b/examples/online_serving/cosmos3/cosmos3_no_guardrails.yaml
new file mode 100644
index 00000000000..c677bfaf294
--- /dev/null
+++ b/examples/online_serving/cosmos3/cosmos3_no_guardrails.yaml
@@ -0,0 +1,36 @@
+# Cosmos3 stage config that disables guardrails at server startup.
+#
+# Usage:
+#   vllm serve nvidia/Cosmos3-Nano --omni \
+#     --model-class-name Cosmos3OmniDiffusersPipeline \
+#     --stage-configs-path examples/online_serving/cosmos3/cosmos3_no_guardrails.yaml \
+#     --port 8091
+#
+# The legacy `stage_args:` schema is used because Cosmos3 is not yet declared
+# in `vllm_omni/config/pipeline_registry.py`, so the newer `--deploy-config`
+# path returns no stages for this model and falls back to the default-diffusion
+# factory, which does not honor user-supplied model_config overrides. The
+# legacy `--stage-configs-path` loader merges this file directly into the
+# stage's engine_args, so `model_config.guardrails: false` reaches
+# `OmniDiffusionConfig.model_config` and gates `ensure_initialized()` at
+# pipeline build time.
+#
+# Other CLI flags (parallelism, cache backend, layerwise offload, etc.) are
+# still honored: this file only sets the fields it explicitly overrides; the
+# rest fall back to CLI / OmniDiffusionConfig defaults.
+
+stage_args:
+  - stage_id: 0
+    stage_type: diffusion
+    runtime:
+      devices: "0"
+    engine_args:
+      model_class_name: Cosmos3OmniDiffusersPipeline
+      max_num_seqs: 1
+      enforce_eager: true
+      trust_remote_code: true
+      model_config:
+        guardrails: false
+        offload_guardrail_models: false
+    final_output: true
+    final_output_type: image

From 89484947821e0a236fcc12f67e0af10c16a46a65 Mon Sep 17 00:00:00 2001
From: Maciej Bala <mbala@nvidia.com>
Date: Tue, 19 May 2026 17:15:44 +0200
Subject: [PATCH 15/41] Updated examples; improved action generation

Signed-off-by: Maciej Bala <mbala@nvidia.com>
---
 .gitignore                                    |   2 +
 examples/offline_inference/cosmos3/README.md  | 166 ++++++--
 examples/offline_inference/cosmos3/end2end.py | 397 +++++++++++++++---
 .../inputs/action_forward_dynamics_av.json    |  17 +
 .../action_forward_dynamics_camera.jsonl      |   2 +
 .../inputs/action_forward_dynamics_robot.json |  17 +
 .../inputs/action_inverse_dynamics_av.json    |  16 +
 .../inputs/action_inverse_dynamics_robot.json |  16 +
 .../cosmos3/inputs/action_policy_av.json      |  16 +
 .../cosmos3/inputs/action_policy_robot.json   |  16 +
 .../offline_inference/cosmos3/inputs/i2v.json |  11 +
 .../offline_inference/cosmos3/inputs/t2i.json |   9 +
 .../offline_inference/cosmos3/inputs/t2v.json |  10 +
 .../cosmos3/inputs/t2v_sound.json             |  12 +
 examples/online_serving/cosmos3/README.md     | 117 +++++-
 .../run_curl_action_forward_dynamics_av.sh    |  73 ++++
 ...run_curl_action_forward_dynamics_camera.sh |  76 ++++
 .../run_curl_action_forward_dynamics_robot.sh |  77 ++++
 .../run_curl_action_inverse_dynamics_av.sh    | 106 +++++
 .../run_curl_action_inverse_dynamics_robot.sh | 106 +++++
 .../cosmos3/run_curl_action_policy.sh         |  65 ++-
 .../cosmos3/run_curl_action_policy_av.sh      | 107 +++++
 .../online_serving/cosmos3/run_curl_i2v.sh    |  42 +-
 .../online_serving/cosmos3/run_curl_t2i.sh    |  44 +-
 .../online_serving/cosmos3/run_curl_t2v.sh    |  34 +-
 .../cosmos3/run_curl_t2v_sound.sh             |  39 +-
 .../models/cosmos3/test_cosmos3_pipeline.py   |  87 ++++
 .../cosmos3/test_cosmos3_transformer.py       |   2 +-
 .../models/test_cosmos3_guardrails.py         |  37 --
 .../diffusion/models/cosmos3/guardrails.py    | 136 +-----
 .../models/cosmos3/pipeline_cosmos3.py        |   6 +-
 31 files changed, 1545 insertions(+), 316 deletions(-)
 create mode 100644 examples/offline_inference/cosmos3/inputs/action_forward_dynamics_av.json
 create mode 100644 examples/offline_inference/cosmos3/inputs/action_forward_dynamics_camera.jsonl
 create mode 100644 examples/offline_inference/cosmos3/inputs/action_forward_dynamics_robot.json
 create mode 100644 examples/offline_inference/cosmos3/inputs/action_inverse_dynamics_av.json
 create mode 100644 examples/offline_inference/cosmos3/inputs/action_inverse_dynamics_robot.json
 create mode 100644 examples/offline_inference/cosmos3/inputs/action_policy_av.json
 create mode 100644 examples/offline_inference/cosmos3/inputs/action_policy_robot.json
 create mode 100644 examples/offline_inference/cosmos3/inputs/i2v.json
 create mode 100644 examples/offline_inference/cosmos3/inputs/t2i.json
 create mode 100644 examples/offline_inference/cosmos3/inputs/t2v.json
 create mode 100644 examples/offline_inference/cosmos3/inputs/t2v_sound.json
 create mode 100755 examples/online_serving/cosmos3/run_curl_action_forward_dynamics_av.sh
 create mode 100755 examples/online_serving/cosmos3/run_curl_action_forward_dynamics_camera.sh
 create mode 100755 examples/online_serving/cosmos3/run_curl_action_forward_dynamics_robot.sh
 create mode 100755 examples/online_serving/cosmos3/run_curl_action_inverse_dynamics_av.sh
 create mode 100755 examples/online_serving/cosmos3/run_curl_action_inverse_dynamics_robot.sh
 mode change 100644 => 100755 examples/online_serving/cosmos3/run_curl_action_policy.sh
 create mode 100755 examples/online_serving/cosmos3/run_curl_action_policy_av.sh
 mode change 100644 => 100755 examples/online_serving/cosmos3/run_curl_i2v.sh
 mode change 100644 => 100755 examples/online_serving/cosmos3/run_curl_t2i.sh
 mode change 100644 => 100755 examples/online_serving/cosmos3/run_curl_t2v.sh
 mode change 100644 => 100755 examples/online_serving/cosmos3/run_curl_t2v_sound.sh

diff --git a/.gitignore b/.gitignore
index 378de441c7e..06ff3f5667e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -241,7 +241,9 @@ datasets/
 *.json
 !tests/dfx/perf/tests/*.json
 !apps/ComfyUI-vLLM-Omni/example_workflows/*.json
+!examples/offline_inference/cosmos3/inputs/*.json
 *.jsonl
+!examples/offline_inference/cosmos3/inputs/*.jsonl
 *.parquet
 
 # Output files
diff --git a/examples/offline_inference/cosmos3/README.md b/examples/offline_inference/cosmos3/README.md
index 7fe430da44f..aa59b3af93d 100644
--- a/examples/offline_inference/cosmos3/README.md
+++ b/examples/offline_inference/cosmos3/README.md
@@ -1,80 +1,162 @@
 # Cosmos3
 
-Cosmos3 uses `Cosmos3OmniDiffusersPipeline` for text-to-image, text-to-video, image-to-video, video-with-sound, and action generation. Examples default to the `nvidia/Cosmos3-Nano` Hugging Face repo, but you can override the checkpoint with `--model` or by exporting `COSMOS3_MODEL` to a local Diffusers-format checkpoint.
+Cosmos3 uses `Cosmos3OmniDiffusersPipeline` for text-to-image, text-to-video, image-to-video, video-with-sound, and action generation (policy, forward dynamics, inverse dynamics). Examples default to the `nvidia/Cosmos3-Nano` Hugging Face repo; override with `--model` or by exporting `COSMOS3_MODEL` to a local Diffusers-format checkpoint.
 
-```bash
-cd examples/offline_inference/cosmos3
-```
+## Canonical inputs
+
+Each modality has a JSON file under `inputs/` carrying the long-form prompt and the matching sampling parameters:
+
+| Modality                          | Input file                                       |
+| --------------------------------- | ------------------------------------------------ |
+| Text-to-Image                     | `inputs/t2i.json`                                |
+| Text-to-Video                     | `inputs/t2v.json`                                |
+| Text-to-Video with sound          | `inputs/t2v_sound.json`                          |
+| Image-to-Video                    | `inputs/i2v.json`                                |
+| Action — policy (robot)           | `inputs/action_policy_robot.json`                |
+| Action — policy (AV)              | `inputs/action_policy_av.json`                   |
+| Action — forward dynamics (robot) | `inputs/action_forward_dynamics_robot.json`      |
+| Action — forward dynamics (AV)    | `inputs/action_forward_dynamics_av.json`         |
+| Action — forward dynamics (camera)| `inputs/action_forward_dynamics_camera.jsonl`    |
+| Action — inverse dynamics (robot) | `inputs/action_inverse_dynamics_robot.json`      |
+| Action — inverse dynamics (AV)    | `inputs/action_inverse_dynamics_av.json`         |
+
+Pass any of these to `--input-json`. Recognized fields (`prompt`, `negative_prompt`, `vision_path`, `action_path`, `height`, `width`, `num_frames`, `num_inference_steps`, `guidance_scale`, `flow_shift`, `fps`, `seed`, `action_mode`, `action_chunk_size`, `raw_action_dim`, `domain_name`, `domain_id`, `generate_sound`, `sound_duration`) override the task defaults; explicit CLI flags still win over the JSON record. Unrecognized fields are ignored with a warning.
+
+`vision_path` and `action_path` may be local paths or `http(s)` URLs. Remote assets are downloaded to a cache directory (`COSMOS3_EXAMPLE_CACHE`, defaults to `$TMPDIR/cosmos3_examples`).
+
+JSONL inputs (e.g. `action_forward_dynamics_camera.jsonl`) generate one output per record, with `_0`, `_1`, … appended to the output stem.
 
 ## Text-to-Image
 
 ```bash
-python end2end.py \
-  --task t2i \
-  --prompt "A small warehouse robot carrying a blue box, clean product photography" \
-  --negative-prompt "blurry, distorted, low quality" \
-  --output cosmos3_t2i.png
+python end2end.py --task t2i --input-json inputs/t2i.json --output cosmos3_t2i.png
 ```
 
 ## Text-to-Video
 
 ```bash
-python end2end.py \
-  --task t2v \
-  --prompt "A small warehouse robot moves a blue box across a clean floor." \
-  --negative-prompt "blurry, distorted, low quality" \
-  --output cosmos3_t2v.mp4
+python end2end.py --task t2v --input-json inputs/t2v.json --output cosmos3_t2v.mp4
 ```
 
 ## Image-to-Video
 
-Download an example image or provide your own image path.
+The companion image (`robot_153.jpg`) is referenced by URL inside `inputs/i2v.json` and auto-cached on first run.
+
+```bash
+python end2end.py --task i2v --input-json inputs/i2v.json --output cosmos3_i2v.mp4
+```
+
+To use your own image, override the vision path:
 
 ```bash
-wget https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/cherry_blossom.jpg
-
-python end2end.py \
-  --task i2v \
-  --image cherry_blossom.jpg \
-  --prompt "Cherry blossoms swaying gently in the breeze, petals falling, smooth motion" \
-  --negative-prompt "blurry, distorted, low quality" \
-  --output cosmos3_i2v.mp4
+python end2end.py --task i2v --input-json inputs/i2v.json --vision-path /path/to/image.jpg --prompt "..."
 ```
 
 ## Video With Sound
 
-This path requires a sound-capable Cosmos3 checkpoint with `sound_gen` weights.
+```bash
+python end2end.py --task t2v_sound --input-json inputs/t2v_sound.json --output cosmos3_t2v_sound.mp4
+```
+
+The JSON sets `generate_sound: true` and `sound_duration: 3.4`; override on the command line with `--sound-duration` if needed.
+
+## Action — Policy
+
+Policy mode consumes an image plus a language instruction and returns a video together with the predicted action chunk. The bundled vision asset for these modes is a video clip (`bridge_0.mp4` / `av_vision_25_*.mp4`); `end2end.py` auto-extracts the first frame for image-input modes (see [Video assets for action modes](#video-assets-for-action-modes)).
+
+Robot (`bridge_orig_lerobot`, `raw_action_dim=10`, `action_chunk_size=16`):
+
+```bash
+python end2end.py --task action_policy --input-json inputs/action_policy_robot.json \
+  --output cosmos3_action_policy_robot.mp4 \
+  --action-output cosmos3_action_policy_robot_action.json
+```
+
+Autonomous vehicle (`raw_action_dim=9`, `action_chunk_size=60`, "Please go backward"):
+
+```bash
+python end2end.py --task action_policy --input-json inputs/action_policy_av.json \
+  --output cosmos3_action_policy_av.mp4 \
+  --action-output cosmos3_action_policy_av_action.json
+```
+
+## Action — Forward Dynamics
+
+Forward dynamics consumes a vision input plus a chunk of action data and predicts the resulting video.
+When the vision input is a video, the example uses the first `action_chunk_size + 1` frames to match
+native Cosmos3 conditioning. An action path (URL or local path) is required, supplied via `--action-path` or the JSON
+`action_path` field; the bundled JSON points at a cosmos-dependencies asset, which is downloaded and cached locally on first run.
+
+Robot:
 
 ```bash
-python end2end.py \
-  --task t2v_sound \
-  --prompt "A small warehouse robot rolls across the floor with soft motor sounds." \
-  --negative-prompt "blurry, distorted, low quality" \
-  --sound-duration 3.4 \
-  --output cosmos3_t2v_sound.mp4
+python end2end.py --task action_forward_dynamics \
+  --input-json inputs/action_forward_dynamics_robot.json \
+  --output cosmos3_forward_dynamics_robot.mp4
 ```
 
-## Action Policy
+Autonomous vehicle:
+
+```bash
+python end2end.py --task action_forward_dynamics \
+  --input-json inputs/action_forward_dynamics_av.json \
+  --output cosmos3_forward_dynamics_av.mp4
+```
 
-This path requires an action-capable Cosmos3 checkpoint with `action_gen` weights. The example returns a video plus an action JSON payload. Pass either `--domain-name` or `--domain-id`.
+Camera-pose (JSONL with two scenes — `mountain` and `solar`):
 
 ```bash
-python end2end.py \
-  --task action_policy \
-  --image cherry_blossom.jpg \
-  --prompt "Predict the robot action for moving toward the target." \
-  --domain-name bridge_orig_lerobot \
-  --raw-action-dim 2 \
-  --action-chunk-size 16 \
-  --output cosmos3_action_policy.mp4 \
-  --action-output cosmos3_action_policy_action.json
+python end2end.py --task action_forward_dynamics \
+  --input-json inputs/action_forward_dynamics_camera.jsonl \
+  --output cosmos3_forward_dynamics_camera.mp4
+# Produces cosmos3_forward_dynamics_camera_0.mp4 and cosmos3_forward_dynamics_camera_1.mp4
 ```
 
+## Action — Inverse Dynamics
+
+Inverse dynamics consumes a video plus a language instruction and predicts the action chunk. Video input is fed through `multi_modal_data["video"]`. The action JSON is written to the `--action-output` path.
+
+Robot:
+
+```bash
+python end2end.py --task action_inverse_dynamics \
+  --input-json inputs/action_inverse_dynamics_robot.json \
+  --output cosmos3_inverse_dynamics_robot.mp4 \
+  --action-output cosmos3_inverse_dynamics_robot_action.json
+```
+
+Autonomous vehicle:
+
+```bash
+python end2end.py --task action_inverse_dynamics \
+  --input-json inputs/action_inverse_dynamics_av.json \
+  --output cosmos3_inverse_dynamics_av.mp4 \
+  --action-output cosmos3_inverse_dynamics_av_action.json
+```
+
+## Video assets for action modes
+
+`forward_dynamics` uses the first `action_chunk_size + 1` frames when `--vision-path` resolves to a
+video file, matching the native Cosmos3 action loader. Still images are also accepted as a fallback.
+`policy` uses a still image; when its `--vision-path` resolves to a video file, end2end.py extracts
+the first frame automatically. Video frame loading requires `imageio` with the ffmpeg plugin:
+
+```bash
+pip install "imageio[ffmpeg]"
+```
+
+To bypass video loading/extraction, pass `--vision-path /path/to/still.jpg`.
+
 ## Common Options
 
+- `--input-json PATH`: load any of the `inputs/*.json` or `inputs/*.jsonl` records; CLI flags still override individual fields.
+- `--vision-path PATH_OR_URL`: image or video input (alias `--image` is kept for back-compat).
+- `--action-path PATH_OR_URL`: action JSON for forward-dynamics.
+- `--action-mode {forward_dynamics,inverse_dynamics,policy}`: override action_mode (otherwise derived from `--task`).
+- `--generate-sound`: force-enable sound generation outside the `t2v_sound` task.
 - `--enable-layerwise-offload`: use layerwise offload for memory-constrained runs.
 - `--cache-backend cache_dit`: enable Cache-DiT where supported.
 - `--cfg-parallel-size 2`, `--ulysses-degree`, `--tensor-parallel-size`, `--use-hsdp`: enable parallel execution options.
-- `--height`, `--width`, `--num-frames`, `--num-inference-steps`, `--guidance-scale`, `--fps`: override task defaults.
+- `--height`, `--width`, `--num-frames`, `--num-inference-steps`, `--guidance-scale`, `--flow-shift`, `--fps`: override JSON/task defaults.
 
 Do not use model-level `--enable-cpu-offload` for Cosmos3. Use `--enable-layerwise-offload` instead.
diff --git a/examples/offline_inference/cosmos3/end2end.py b/examples/offline_inference/cosmos3/end2end.py
index ed3db03655f..f28fd50ef97 100644
--- a/examples/offline_inference/cosmos3/end2end.py
+++ b/examples/offline_inference/cosmos3/end2end.py
@@ -2,9 +2,14 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import argparse
+import hashlib
 import json
 import os
+import sys
+import tempfile
 import time
+import urllib.parse
+import urllib.request
 from pathlib import Path
 from typing import Any
 
@@ -21,38 +26,42 @@
 DEFAULT_NEGATIVE_PROMPT = "blurry, distorted, low quality"
 TASK_DEFAULTS = {
     "t2i": {
-        "height": 1024,
-        "width": 1024,
+        "height": 960,
+        "width": 960,
         "num_frames": None,
         "num_inference_steps": 50,
-        "guidance_scale": 7.0,
+        "guidance_scale": 4.0,
+        "flow_shift": 3.0,
         "fps": 24,
         "output": "cosmos3_t2i.png",
     },
     "t2v": {
         "height": 720,
         "width": 1280,
-        "num_frames": 81,
+        "num_frames": 189,
         "num_inference_steps": 35,
-        "guidance_scale": 4.0,
+        "guidance_scale": 6.0,
+        "flow_shift": 10.0,
         "fps": 24,
         "output": "cosmos3_t2v.mp4",
     },
     "i2v": {
         "height": 720,
         "width": 1280,
-        "num_frames": 81,
+        "num_frames": 189,
         "num_inference_steps": 35,
-        "guidance_scale": 4.0,
+        "guidance_scale": 6.0,
+        "flow_shift": 10.0,
         "fps": 24,
         "output": "cosmos3_i2v.mp4",
     },
     "t2v_sound": {
         "height": 720,
         "width": 1280,
-        "num_frames": 81,
+        "num_frames": 189,
         "num_inference_steps": 35,
-        "guidance_scale": 4.0,
+        "guidance_scale": 6.0,
+        "flow_shift": 10.0,
         "fps": 24,
         "output": "cosmos3_t2v_sound.mp4",
     },
@@ -62,9 +71,68 @@
         "num_frames": 17,
         "num_inference_steps": 30,
         "guidance_scale": 1.0,
+        "flow_shift": 5.0,
         "fps": 24,
         "output": "cosmos3_action_policy.mp4",
     },
+    "action_forward_dynamics": {
+        "height": 480,
+        "width": 640,
+        "num_frames": 17,
+        "num_inference_steps": 30,
+        "guidance_scale": 1.0,
+        "flow_shift": 5.0,
+        "fps": 5,
+        "output": "cosmos3_action_forward_dynamics.mp4",
+    },
+    "action_inverse_dynamics": {
+        "height": 480,
+        "width": 640,
+        "num_frames": 17,
+        "num_inference_steps": 30,
+        "guidance_scale": 1.0,
+        "flow_shift": 5.0,
+        "fps": 5,
+        "output": "cosmos3_action_inverse_dynamics.mp4",
+    },
+}
+
+_INPUTS_DIR = Path(__file__).resolve().parent / "inputs"
+_TASK_ACTION_MODES = {
+    "action_policy": "policy",
+    "action_forward_dynamics": "forward_dynamics",
+    "action_inverse_dynamics": "inverse_dynamics",
+}
+_ACTION_TASKS = set(_TASK_ACTION_MODES)
+_VIDEO_INPUT_TASKS = {"action_inverse_dynamics"}
+_IMAGE_INPUT_TASKS = {"i2v", "action_policy", "action_forward_dynamics"}
+_VIDEO_EXTENSIONS = {".mp4", ".mov", ".mkv", ".webm", ".avi"}
+_CACHE_DIR = Path(
+    os.environ.get(
+        "COSMOS3_EXAMPLE_CACHE",
+        str(Path(tempfile.gettempdir()) / "cosmos3_examples"),
+    )
+)
+_JSON_TO_ATTR = {
+    "prompt": "prompt",
+    "negative_prompt": "negative_prompt",
+    "vision_path": "vision_path",
+    "action_path": "action_path",
+    "height": "height",
+    "width": "width",
+    "num_frames": "num_frames",
+    "num_inference_steps": "num_inference_steps",
+    "guidance_scale": "guidance_scale",
+    "flow_shift": "flow_shift",
+    "fps": "fps",
+    "seed": "seed",
+    "action_mode": "action_mode",
+    "action_chunk_size": "action_chunk_size",
+    "raw_action_dim": "raw_action_dim",
+    "domain_name": "domain_name",
+    "domain_id": "domain_id",
+    "generate_sound": "generate_sound",
+    "sound_duration": "sound_duration",
 }
 
 
@@ -82,18 +150,53 @@ def parse_args() -> argparse.Namespace:
         default="t2v",
         help="Cosmos3 example task to run.",
     )
+    parser.add_argument(
+        "--input-json",
+        default=None,
+        help="Path to a JSON or JSONL input file (e.g. inputs/t2v.json). When given, every recognized "
+        "field overrides the matching default; explicit CLI flags still win. Use JSONL to batch multiple "
+        "generations in one invocation (e.g. inputs/action_forward_dynamics_camera.jsonl).",
+    )
     parser.add_argument(
         "--prompt",
         default="A small warehouse robot moves a blue box across a clean floor.",
-        help="Text prompt.",
+        help="Text prompt. Overrides any prompt loaded from --input-json.",
     )
     parser.add_argument("--negative-prompt", default=DEFAULT_NEGATIVE_PROMPT, help="Negative prompt.")
-    parser.add_argument("--image", default=None, help="Input image for i2v or action_policy.")
+    parser.add_argument(
+        "--image",
+        default=None,
+        help="Input image path for i2v / image-input action tasks. Alias for --vision-path.",
+    )
+    parser.add_argument(
+        "--vision-path",
+        default=None,
+        help="Vision input as a local path or http(s) URL. Image file for i2v / policy; image or video file "
+        "for forward_dynamics; video file for inverse_dynamics. If a video is supplied for i2v / policy, "
+        "the first frame is extracted automatically (requires imageio).",
+    )
+    parser.add_argument(
+        "--action-path",
+        default=None,
+        help="Local path or URL to an action JSON for forward_dynamics tasks.",
+    )
+    parser.add_argument(
+        "--action-mode",
+        default=None,
+        choices=["forward_dynamics", "inverse_dynamics", "policy"],
+        help="Override action_mode. Defaults are derived from --task.",
+    )
+    parser.add_argument(
+        "--generate-sound",
+        action="store_true",
+        help="Enable sound generation.",
+    )
     parser.add_argument("--output", default=None, help="Output PNG or MP4 path. Default depends on --task.")
     parser.add_argument(
         "--action-output",
         default=None,
-        help="Action JSON path for action_policy. Defaults to the video output stem plus _action.json.",
+        help="Action JSON path for inverse_dynamics / action_policy outputs. "
+        "Defaults to the video output stem plus _action.json.",
     )
     parser.add_argument("--seed", type=int, default=42, help="Random seed.")
     parser.add_argument("--height", type=int, default=None, help="Output height. Default depends on --task.")
@@ -106,6 +209,12 @@ def parse_args() -> argparse.Namespace:
         help="Sampling steps. Default depends on --task.",
     )
     parser.add_argument("--guidance-scale", type=float, default=None, help="CFG scale. Default depends on --task.")
+    parser.add_argument(
+        "--flow-shift",
+        type=float,
+        default=None,
+        help="Flow-matching scheduler shift. Default depends on --task (cosmos3-internal: 3.0 t2i / 10.0 t2v/i2v / 5.0 action).",
+    )
     parser.add_argument("--fps", type=int, default=None, help="Output video fps. Default depends on --task.")
     parser.add_argument(
         "--sound-duration",
@@ -183,6 +292,114 @@ def _cache_config(cache_backend: str | None) -> dict[str, Any] | None:
     }
 
 
+def _is_url(value: str) -> bool:
+    return urllib.parse.urlparse(value).scheme in {"http", "https"}
+
+
+def _is_video_path(value: str) -> bool:
+    parsed = urllib.parse.urlparse(value)
+    target = parsed.path if parsed.scheme else value
+    return Path(target).suffix.lower() in _VIDEO_EXTENSIONS
+
+
+def _resolve_local_path(path_or_url: str) -> str:
+    if not _is_url(path_or_url):
+        return path_or_url
+    _CACHE_DIR.mkdir(parents=True, exist_ok=True)
+    suffix = Path(urllib.parse.urlparse(path_or_url).path).suffix or ""
+    digest = hashlib.sha256(path_or_url.encode("utf-8")).hexdigest()[:16]
+    target = _CACHE_DIR / f"{digest}{suffix}"
+    if not target.exists():
+        print(f"Downloading {path_or_url} -> {target}")
+        with urllib.request.urlopen(path_or_url) as response, open(target, "wb") as fh:
+            fh.write(response.read())
+    return str(target)
+
+
+def _first_video_frame(video_path: str) -> PIL.Image.Image:
+    try:
+        import imageio.v3 as iio  # type: ignore[import-not-found]
+    except ImportError as exc:
+        raise ImportError(
+            "Extracting the first frame of a video for an image-input task requires imageio. "
+            "Install with `pip install imageio[ffmpeg]` or pass a still image via --vision-path."
+        ) from exc
+    frame = np.asarray(iio.imread(video_path, index=0))
+    return PIL.Image.fromarray(frame).convert("RGB")
+
+
+def _load_video_frames_from(path_or_url: str, max_frames: int) -> list[PIL.Image.Image]:
+    if max_frames <= 0:
+        raise ValueError(f"max_frames must be positive, got {max_frames}.")
+
+    try:
+        import imageio.v3 as iio  # type: ignore[import-not-found]
+    except ImportError as exc:
+        raise ImportError(
+            "Loading video frames for Cosmos3 action forward-dynamics requires imageio. "
+            "Install with `pip install imageio[ffmpeg]` or pass a still image via --vision-path."
+        ) from exc
+
+    local = _resolve_local_path(path_or_url)
+    frames: list[PIL.Image.Image] = []
+    for frame in iio.imiter(local):
+        frames.append(PIL.Image.fromarray(np.asarray(frame)).convert("RGB"))
+        if len(frames) >= max_frames:
+            break
+    if not frames:
+        raise ValueError(f"Cosmos3 action video input contains no frames: {path_or_url}")
+    return frames
+
+
+def _load_image_from(path_or_url: str) -> PIL.Image.Image:
+    local = _resolve_local_path(path_or_url)
+    if _is_video_path(path_or_url):
+        return _first_video_frame(local)
+    return PIL.Image.open(local).convert("RGB")
+
+
+def _load_input_records(path: str) -> list[dict[str, Any]]:
+    src = Path(path)
+    if not src.exists():
+        candidate = _INPUTS_DIR / path
+        if candidate.exists():
+            src = candidate
+    if not src.exists():
+        raise FileNotFoundError(f"Input JSON file not found: {path}")
+    text = src.read_text(encoding="utf-8").strip()
+    if src.suffix == ".jsonl":
+        return [json.loads(line) for line in text.splitlines() if line.strip()]
+    return [json.loads(text)]
+
+
+def _cli_provided_attrs(argv: list[str]) -> set[str]:
+    provided: set[str] = set()
+    for token in argv:
+        if not token.startswith("--"):
+            continue
+        flag = token.split("=", 1)[0][2:]
+        provided.add(flag.replace("-", "_"))
+    return provided
+
+
+def _apply_record(record: dict[str, Any], args: argparse.Namespace, cli_set: set[str]) -> None:
+    # --image and --vision-path are aliases for the same visual input. A CLI
+    # value for either should suppress a JSON override of the other.
+    effective_cli_set = set(cli_set)
+    if "image" in effective_cli_set or "vision_path" in effective_cli_set:
+        effective_cli_set |= {"image", "vision_path"}
+    for key, value in record.items():
+        attr = _JSON_TO_ATTR.get(key)
+        if attr is None:
+            print(f"Ignoring unknown input-json field: {key}")
+            continue
+        if attr in effective_cli_set:
+            continue
+        if attr == "generate_sound" and not bool(value):
+            continue
+        setattr(args, attr, value)
+
+
 def _first_output(outputs: Any) -> Any:
     if isinstance(outputs, list):
         if not outputs:
@@ -410,53 +627,95 @@ def _build_omni(args: argparse.Namespace) -> Omni:
     return Omni(**kwargs)
 
 
-def main() -> None:
-    args = parse_args()
+def _resolve_action_mode(task: str, args: argparse.Namespace) -> str | None:
+    if getattr(args, "action_mode", None):
+        return args.action_mode
+    return _TASK_ACTION_MODES.get(task)
 
-    defaults = TASK_DEFAULTS[args.task]
-    height = args.height or defaults["height"]
-    width = args.width or defaults["width"]
-    num_frames = args.num_frames if args.num_frames is not None else defaults["num_frames"]
-    num_inference_steps = args.num_inference_steps or defaults["num_inference_steps"]
-    guidance_scale = args.guidance_scale if args.guidance_scale is not None else defaults["guidance_scale"]
-    fps = args.fps or defaults["fps"]
-    output_path = Path(args.output or defaults["output"])
 
-    if args.task in {"i2v", "action_policy"} and args.image is None:
-        raise ValueError(f"--image is required for {args.task}.")
-
-    image = PIL.Image.open(args.image).convert("RGB") if args.image else None
-    generator = torch.Generator(device=current_omni_platform.device_type).manual_seed(args.seed)
-    omni = _build_omni(args)
+def _build_prompt_and_extra(
+    args: argparse.Namespace,
+    task: str,
+    action_mode: str | None,
+) -> tuple[dict[str, Any], dict[str, Any]]:
+    vision_path = args.vision_path or args.image
 
     prompt: dict[str, Any] = {
         "prompt": args.prompt,
         "negative_prompt": args.negative_prompt,
-        "modalities": ["image"] if args.task == "t2i" else ["video"],
+        "modalities": ["image"] if task == "t2i" else ["video"],
     }
-    if image is not None:
-        prompt["multi_modal_data"] = {"image": image}
-    if args.task == "t2v_sound":
-        prompt["generate_sound"] = True
+
+    if task in _VIDEO_INPUT_TASKS:
+        if not vision_path:
+            raise ValueError(f"--vision-path (video) is required for {task}.")
+        local_video = _resolve_local_path(vision_path)
+        prompt["multi_modal_data"] = {"video": local_video}
+    elif task == "action_forward_dynamics" and vision_path and _is_video_path(vision_path):
+        prompt["multi_modal_data"] = {"video": _load_video_frames_from(vision_path, args.action_chunk_size + 1)}
+    elif task in _IMAGE_INPUT_TASKS:
+        if not vision_path:
+            raise ValueError(f"--vision-path (image) is required for {task}.")
+        prompt["multi_modal_data"] = {"image": _load_image_from(vision_path)}
+    elif vision_path:
+        prompt["multi_modal_data"] = {"image": _load_image_from(vision_path)}
 
     extra_args: dict[str, Any] = {}
-    if args.task == "t2v_sound":
+
+    if getattr(args, "flow_shift", None) is not None:
+        extra_args["flow_shift"] = float(args.flow_shift)
+
+    sound_enabled = bool(getattr(args, "generate_sound", False)) or task == "t2v_sound"
+    if sound_enabled and action_mode is not None:
+        raise ValueError("Cosmos3 does not support action modes combined with sound generation.")
+    if sound_enabled:
+        prompt["generate_sound"] = True
         extra_args["generate_sound"] = True
         if args.sound_duration is not None:
             extra_args["sound_duration"] = args.sound_duration
-    if args.task == "action_policy":
-        extra_args.update(
-            {
-                "action_mode": "policy",
-                "action_chunk_size": args.action_chunk_size,
-                "raw_action_dim": args.raw_action_dim,
-            }
-        )
+
+    if action_mode is not None:
+        extra_args["action_mode"] = action_mode
+        extra_args["action_chunk_size"] = args.action_chunk_size
+        if action_mode in {"policy", "inverse_dynamics"}:
+            extra_args["raw_action_dim"] = args.raw_action_dim
+        elif args.raw_action_dim is not None:
+            extra_args["raw_action_dim"] = args.raw_action_dim
         if args.domain_id is not None:
             extra_args["domain_id"] = args.domain_id
         else:
             extra_args["domain_name"] = args.domain_name
+        if action_mode == "forward_dynamics":
+            if not args.action_path:
+                raise ValueError("--action-path is required for forward_dynamics.")
+            extra_args["action_path"] = _resolve_local_path(args.action_path)
+        elif args.action_path:
+            extra_args["action_path"] = _resolve_local_path(args.action_path)
+
+    return prompt, extra_args
+
+
+def _run_one(
+    omni: Omni,
+    args: argparse.Namespace,
+    task: str,
+    output_path: Path,
+    record_index: int | None = None,
+) -> None:
+    defaults = TASK_DEFAULTS[task]
+    height = args.height or defaults["height"]
+    width = args.width or defaults["width"]
+    num_frames = args.num_frames if args.num_frames is not None else defaults["num_frames"]
+    num_inference_steps = args.num_inference_steps or defaults["num_inference_steps"]
+    guidance_scale = args.guidance_scale if args.guidance_scale is not None else defaults["guidance_scale"]
+    fps = args.fps or defaults["fps"]
+    if args.flow_shift is None and defaults.get("flow_shift") is not None:
+        args.flow_shift = defaults["flow_shift"]
+
+    action_mode = _resolve_action_mode(task, args)
+    prompt, extra_args = _build_prompt_and_extra(args, task, action_mode)
 
+    generator = torch.Generator(device=current_omni_platform.device_type).manual_seed(args.seed)
     sampling = OmniDiffusionSamplingParams(
         height=height,
         width=width,
@@ -469,20 +728,26 @@ def main() -> None:
     )
 
     print("Cosmos3 generation configuration:")
-    print(f"  Task: {args.task}")
+    print(f"  Task: {task}")
+    if action_mode:
+        print(f"  Action mode: {action_mode}")
+    if record_index is not None:
+        print(f"  Record: {record_index}")
     print(f"  Model: {args.model}")
     print(f"  Size: {width}x{height}")
     if num_frames is not None:
         print(f"  Frames: {num_frames}")
     print(f"  Steps: {num_inference_steps}")
     print(f"  Guidance scale: {guidance_scale}")
+    if args.flow_shift is not None:
+        print(f"  Flow shift: {args.flow_shift}")
 
     start = time.perf_counter()
     outputs = omni.generate(prompt, sampling)
     elapsed = time.perf_counter() - start
     print(f"Total generation time: {elapsed:.4f} seconds")
 
-    if args.task == "t2i":
+    if task == "t2i":
         images = _extract_images(outputs)
         output_path.parent.mkdir(parents=True, exist_ok=True)
         images[0].save(output_path)
@@ -495,13 +760,47 @@ def main() -> None:
     )
     print(f"Saved video to {output_path}")
 
-    if args.task == "action_policy":
-        action_path = (
+    if action_mode in {"policy", "inverse_dynamics"} and action:
+        action_out = (
             Path(args.action_output) if args.action_output else output_path.with_name(f"{output_path.stem}_action.json")
         )
-        action_path.parent.mkdir(parents=True, exist_ok=True)
-        action_path.write_text(json.dumps(_jsonable(action), indent=2) + "\n", encoding="utf-8")
-        print(f"Saved action metadata to {action_path}")
+        action_out.parent.mkdir(parents=True, exist_ok=True)
+        action_out.write_text(json.dumps(_jsonable(action), indent=2) + "\n", encoding="utf-8")
+        print(f"Saved action metadata to {action_out}")
+
+
+def _record_output_path(base: Path, index: int, total: int) -> Path:
+    if total <= 1:
+        return base
+    return base.with_name(f"{base.stem}_{index}{base.suffix}")
+
+
+def main() -> None:
+    args = parse_args()
+    cli_set = _cli_provided_attrs(sys.argv[1:])
+
+    records: list[dict[str, Any]] = [{}]
+    if args.input_json:
+        records = _load_input_records(args.input_json)
+        if not records:
+            raise ValueError(f"--input-json {args.input_json} contained no records.")
+
+    omni = _build_omni(args)
+
+    base_output = Path(args.output or TASK_DEFAULTS[args.task]["output"])
+
+    for index, record in enumerate(records):
+        record_args = argparse.Namespace(**vars(args))
+        if record:
+            _apply_record(record, record_args, cli_set)
+        output_path = _record_output_path(base_output, index, len(records))
+        _run_one(
+            omni,
+            record_args,
+            args.task,
+            output_path,
+            record_index=index if len(records) > 1 else None,
+        )
 
 
 if __name__ == "__main__":
diff --git a/examples/offline_inference/cosmos3/inputs/action_forward_dynamics_av.json b/examples/offline_inference/cosmos3/inputs/action_forward_dynamics_av.json
new file mode 100644
index 00000000000..2a990696c92
--- /dev/null
+++ b/examples/offline_inference/cosmos3/inputs/action_forward_dynamics_av.json
@@ -0,0 +1,17 @@
+{
+    "action_mode": "forward_dynamics",
+    "prompt": "You are an autonomous vehicle planning system. This video is captured from a first-person perspective looking at the scene.",
+    "vision_path": "https://github.com/nvidia-cosmos/cosmos-dependencies/raw/refs/heads/assets/cosmos3/inputs/action/av_vision_25_73d01c91-51f0-46cf-9b76-5682a76fb349.mp4",
+    "action_path": "https://github.com/nvidia-cosmos/cosmos-dependencies/raw/refs/heads/assets/cosmos3/inputs/action/av_action_25.json",
+    "domain_name": "av",
+    "height": 480,
+    "width": 640,
+    "num_frames": 61,
+    "fps": 10,
+    "num_inference_steps": 30,
+    "guidance_scale": 1.0,
+    "flow_shift": 5.0,
+    "seed": 0,
+    "action_chunk_size": 60,
+    "raw_action_dim": 9
+}
diff --git a/examples/offline_inference/cosmos3/inputs/action_forward_dynamics_camera.jsonl b/examples/offline_inference/cosmos3/inputs/action_forward_dynamics_camera.jsonl
new file mode 100644
index 00000000000..8786eb7d556
--- /dev/null
+++ b/examples/offline_inference/cosmos3/inputs/action_forward_dynamics_camera.jsonl
@@ -0,0 +1,2 @@
+{"action_mode":"forward_dynamics","prompt":"A serene landscape video of a calm body of water in the foreground, leading up to rolling green pastoral hills and a prominent mountain peak partially shrouded in low-hanging fog under a moody, overcast gray sky. This video is captured from a first-person perspective looking at the scene.","vision_path":"https://github.com/nvidia-cosmos/cosmos-dependencies/raw/refs/heads/assets/cosmos3/inputs/action/mountain_720.png","action_path":"https://github.com/nvidia-cosmos/cosmos-dependencies/raw/refs/heads/assets/cosmos3/inputs/action/camera_action_44.json","domain_name":"camera_pose","height":480,"width":640,"num_frames":61,"fps":30,"num_inference_steps":30,"guidance_scale":1.0,"flow_shift":5.0,"seed":0,"action_chunk_size":60}
+{"action_mode":"forward_dynamics","prompt":"An architectural video of a modern elevated terrace at twilight, characterized by a large, intricate white geometric canopy supporting integrated solar panels, slatted wooden benches, and a unique cylindrical wooden seating pod, overlooking a distant campus. This video is captured from a first-person perspective looking at the scene.","vision_path":"https://github.com/nvidia-cosmos/cosmos-dependencies/raw/refs/heads/assets/cosmos3/inputs/action/solar_720.png","action_path":"https://github.com/nvidia-cosmos/cosmos-dependencies/raw/refs/heads/assets/cosmos3/inputs/action/camera_action_44.json","domain_name":"camera_pose","height":480,"width":640,"num_frames":61,"fps":30,"num_inference_steps":30,"guidance_scale":1.0,"flow_shift":5.0,"seed":0,"action_chunk_size":60}
diff --git a/examples/offline_inference/cosmos3/inputs/action_forward_dynamics_robot.json b/examples/offline_inference/cosmos3/inputs/action_forward_dynamics_robot.json
new file mode 100644
index 00000000000..966dbdce7aa
--- /dev/null
+++ b/examples/offline_inference/cosmos3/inputs/action_forward_dynamics_robot.json
@@ -0,0 +1,17 @@
+{
+    "action_mode": "forward_dynamics",
+    "prompt": "Put the pot to the left of the purple item. This video is captured from a first-person perspective looking at the scene.",
+    "vision_path": "https://github.com/nvidia-cosmos/cosmos-dependencies/raw/refs/heads/assets/cosmos3/inputs/action/bridge_0.mp4",
+    "action_path": "https://github.com/nvidia-cosmos/cosmos-dependencies/raw/refs/heads/assets/cosmos3/inputs/action/bridge_0.json",
+    "domain_name": "bridge_orig_lerobot",
+    "height": 544,
+    "width": 736,
+    "num_frames": 17,
+    "fps": 5,
+    "num_inference_steps": 30,
+    "guidance_scale": 1.0,
+    "flow_shift": 5.0,
+    "seed": 0,
+    "action_chunk_size": 16,
+    "raw_action_dim": 10
+}
diff --git a/examples/offline_inference/cosmos3/inputs/action_inverse_dynamics_av.json b/examples/offline_inference/cosmos3/inputs/action_inverse_dynamics_av.json
new file mode 100644
index 00000000000..7e746501533
--- /dev/null
+++ b/examples/offline_inference/cosmos3/inputs/action_inverse_dynamics_av.json
@@ -0,0 +1,16 @@
+{
+    "action_mode": "inverse_dynamics",
+    "prompt": "You are an autonomous vehicle planning system. This video is captured from a first-person perspective looking at the scene.",
+    "vision_path": "https://github.com/nvidia-cosmos/cosmos-dependencies/raw/refs/heads/assets/cosmos3/inputs/action/av_vision_25_73d01c91-51f0-46cf-9b76-5682a76fb349.mp4",
+    "domain_name": "av",
+    "height": 480,
+    "width": 640,
+    "num_frames": 61,
+    "fps": 10,
+    "num_inference_steps": 30,
+    "guidance_scale": 1.0,
+    "flow_shift": 5.0,
+    "seed": 0,
+    "action_chunk_size": 60,
+    "raw_action_dim": 9
+}
diff --git a/examples/offline_inference/cosmos3/inputs/action_inverse_dynamics_robot.json b/examples/offline_inference/cosmos3/inputs/action_inverse_dynamics_robot.json
new file mode 100644
index 00000000000..4cd7e68de05
--- /dev/null
+++ b/examples/offline_inference/cosmos3/inputs/action_inverse_dynamics_robot.json
@@ -0,0 +1,16 @@
+{
+    "action_mode": "inverse_dynamics",
+    "prompt": "Put the pot to the left of the purple item. This video is captured from a first-person perspective looking at the scene.",
+    "vision_path": "https://github.com/nvidia-cosmos/cosmos-dependencies/raw/refs/heads/assets/cosmos3/inputs/action/bridge_0.mp4",
+    "domain_name": "bridge_orig_lerobot",
+    "height": 480,
+    "width": 640,
+    "num_frames": 17,
+    "fps": 5,
+    "num_inference_steps": 30,
+    "guidance_scale": 1.0,
+    "flow_shift": 5.0,
+    "seed": 0,
+    "action_chunk_size": 16,
+    "raw_action_dim": 10
+}
diff --git a/examples/offline_inference/cosmos3/inputs/action_policy_av.json b/examples/offline_inference/cosmos3/inputs/action_policy_av.json
new file mode 100644
index 00000000000..1e9a6506753
--- /dev/null
+++ b/examples/offline_inference/cosmos3/inputs/action_policy_av.json
@@ -0,0 +1,16 @@
+{
+    "action_mode": "policy",
+    "prompt": "You are an autonomous vehicle planning system. Please go backward. This video is captured from a first-person perspective looking at the scene.",
+    "vision_path": "https://github.com/nvidia-cosmos/cosmos-dependencies/raw/refs/heads/assets/cosmos3/inputs/action/av_vision_25_73d01c91-51f0-46cf-9b76-5682a76fb349.mp4",
+    "domain_name": "av",
+    "height": 480,
+    "width": 640,
+    "num_frames": 61,
+    "fps": 10,
+    "num_inference_steps": 30,
+    "guidance_scale": 1.0,
+    "flow_shift": 5.0,
+    "seed": 0,
+    "action_chunk_size": 60,
+    "raw_action_dim": 9
+}
diff --git a/examples/offline_inference/cosmos3/inputs/action_policy_robot.json b/examples/offline_inference/cosmos3/inputs/action_policy_robot.json
new file mode 100644
index 00000000000..937fd839cc7
--- /dev/null
+++ b/examples/offline_inference/cosmos3/inputs/action_policy_robot.json
@@ -0,0 +1,16 @@
+{
+    "action_mode": "policy",
+    "prompt": "Put the pot to the left of the purple item. This video is captured from a first-person perspective looking at the scene.",
+    "vision_path": "https://github.com/nvidia-cosmos/cosmos-dependencies/raw/refs/heads/assets/cosmos3/inputs/action/bridge_0.mp4",
+    "domain_name": "bridge_orig_lerobot",
+    "height": 480,
+    "width": 640,
+    "num_frames": 17,
+    "fps": 5,
+    "num_inference_steps": 30,
+    "guidance_scale": 1.0,
+    "flow_shift": 5.0,
+    "seed": 0,
+    "action_chunk_size": 16,
+    "raw_action_dim": 10
+}
diff --git a/examples/offline_inference/cosmos3/inputs/i2v.json b/examples/offline_inference/cosmos3/inputs/i2v.json
new file mode 100644
index 00000000000..c45302b3af0
--- /dev/null
+++ b/examples/offline_inference/cosmos3/inputs/i2v.json
@@ -0,0 +1,11 @@
+{
+    "prompt": "The video opens with a view of a testing environment, characterized by a large wooden table at the center. On this table, two robot arms are positioned at opposite ends, with the left arm closer to the camera and the right arm further away. Between the hands lies a dark wooden shelf with a red spherical object on its top rack, likely serving as a platform or obstacle. In the background, various pieces of equipment, including a tripod, a chair, are visible. A person wearing a blue jacket and black pants stands near the center of the room, observing the experiment, with a static hand position throughout. The floor is tiled with a patterned design, and additional items like a small robot figure and some cables can be seen scattered around the space. As the video progresses, the right robotic hand extends outward, moving from its initial position towards the red spherical object on the shelf. The hand then picks up the object and places it on the lowest rack of the shelf, completing a smooth, deliberate manipulation. The left robotic hand remains stationary throughout the sequence. No new objects appear in the video; all existing elements maintain their positions except for the movement of the right robotic hand. The scene concludes with the right robotic hand returning to its initial position, while the left hand continues to rest on the table. The overall environment remains unchanged, with the focus remaining on the interaction between the robotic hands and the wooden block, highlighting precise control during the demonstration.",
+    "vision_path": "https://github.com/nvidia-cosmos/cosmos-dependencies/raw/refs/heads/assets/cosmos3/inputs/vision/robot_153.jpg",
+    "height": 720,
+    "width": 1280,
+    "num_frames": 189,
+    "num_inference_steps": 35,
+    "guidance_scale": 6.0,
+    "flow_shift": 10.0,
+    "fps": 24
+}
diff --git a/examples/offline_inference/cosmos3/inputs/t2i.json b/examples/offline_inference/cosmos3/inputs/t2i.json
new file mode 100644
index 00000000000..b7e1c7fee9c
--- /dev/null
+++ b/examples/offline_inference/cosmos3/inputs/t2i.json
@@ -0,0 +1,9 @@
+{
+    "prompt": "A medium shot of a modern robotics research laboratory with white walls and a gray floor. A robotic arm with a metallic finish is mounted on a clean white workbench, its gripper positioned above a row of small colored objects. A laptop and neatly arranged tools sit beside the robot. A large monitor on the wall behind displays a software interface. The scene is brightly lit by overhead fluorescent lights.",
+    "height": 960,
+    "width": 960,
+    "num_inference_steps": 50,
+    "guidance_scale": 4.0,
+    "flow_shift": 3.0,
+    "fps": 24
+}
diff --git a/examples/offline_inference/cosmos3/inputs/t2v.json b/examples/offline_inference/cosmos3/inputs/t2v.json
new file mode 100644
index 00000000000..485f4e700bb
--- /dev/null
+++ b/examples/offline_inference/cosmos3/inputs/t2v.json
@@ -0,0 +1,10 @@
+{
+    "prompt": "The video opens with a view of a well-lit indoor space featuring a wooden display case with compartments filled with various fruits, including bananas, apples, pears, oranges, and carambolas. The bananas are neatly arranged in the middle compartment, while apples are in the left and a mix of pears, oranges, and carambolas are in the right. Two robotic arms with grippers are positioned at the bottom of the frame, with the one on the left remaining stationary, partially obscuring the apples. The robotic arm on the right begins its action, extending towards the right side of the display case. It carefully picks up a pear from the fruit section, placing it into a plastic bag in the shopping cart nearby, which has red handles. After securing the pear, the arm retracts back to its original position. The process repeats as the robotic arm picks up an orange and places it in the bag, followed by a carambola. The final frame captures the robotic arm returning to its initial position, leaving the display case and surrounding area unchanged. The video showcases a seamless and efficient automated fruit-picking process, highlighting the precision and efficiency of modern robotics in a retail setting.",
+    "height": 720,
+    "width": 1280,
+    "num_frames": 189,
+    "num_inference_steps": 35,
+    "guidance_scale": 6.0,
+    "flow_shift": 10.0,
+    "fps": 24
+}
diff --git a/examples/offline_inference/cosmos3/inputs/t2v_sound.json b/examples/offline_inference/cosmos3/inputs/t2v_sound.json
new file mode 100644
index 00000000000..f4ecdce266b
--- /dev/null
+++ b/examples/offline_inference/cosmos3/inputs/t2v_sound.json
@@ -0,0 +1,12 @@
+{
+    "prompt": "The video opens with a view of a well-lit indoor space featuring a wooden display case with compartments filled with various fruits, including bananas, apples, pears, oranges, and carambolas. The bananas are neatly arranged in the middle compartment, while apples are in the left and a mix of pears, oranges, and carambolas are in the right. Two robotic arms with grippers are positioned at the bottom of the frame, with the one on the left remaining stationary, partially obscuring the apples. The robotic arm on the right begins its action, extending towards the right side of the display case. It carefully picks up a pear from the fruit section, placing it into a plastic bag in the shopping cart nearby, which has red handles. After securing the pear, the arm retracts back to its original position. The process repeats as the robotic arm picks up an orange and places it in the bag, followed by a carambola. The final frame captures the robotic arm returning to its initial position, leaving the display case and surrounding area unchanged. Audio description: the soft whir of servo motors, gentle thuds as fruits land in the plastic bag, the rustle of the bag settling in the shopping cart, and a faint refrigeration hum in the background.",
+    "height": 720,
+    "width": 1280,
+    "num_frames": 189,
+    "num_inference_steps": 35,
+    "guidance_scale": 6.0,
+    "flow_shift": 10.0,
+    "fps": 24,
+    "generate_sound": true,
+    "sound_duration": 7.875
+}
diff --git a/examples/online_serving/cosmos3/README.md b/examples/online_serving/cosmos3/README.md
index 62fa00d69da..d027618bbb3 100644
--- a/examples/online_serving/cosmos3/README.md
+++ b/examples/online_serving/cosmos3/README.md
@@ -18,13 +18,49 @@ bash run_server.sh
 - `CFG_PARALLEL_SIZE`, `TENSOR_PARALLEL_SIZE`, `ULYSSES_DEGREE`, `USE_HSDP`: parallel execution controls
 - `ALLOWED_LOCAL_MEDIA_PATH`: local media access path, defaults to `/`
 
+## Disabling guardrails
+
+Cosmos3 ships with safety guardrails that check prompts and blur faces in generated output. Two override paths are available, depending on whether you want to skip the guardrails globally or for a single request.
+
+### Server-wide (skip loading guardrail models entirely)
+
+Start the server with `--stage-configs-path cosmos3_no_guardrails.yaml`, which sets `model_config.guardrails: false` on the diffusion stage so the guardrail models are never loaded:
+
+```bash
+vllm serve nvidia/Cosmos3-Nano --omni \
+  --model-class-name Cosmos3OmniDiffusersPipeline \
+  --stage-configs-path examples/online_serving/cosmos3/cosmos3_no_guardrails.yaml \
+  --port 8091
+```
+
+Other CLI flags (parallelism, cache backend, layerwise offload, etc.) are still honored; the YAML only overrides the guardrail toggle. When this path is used, per-request overrides cannot turn guardrails back on — the underlying models are not in memory.
+
+### Per-request (skip checks for a single generation)
+
+When the server has guardrails enabled, an individual request can opt out by passing `guardrails: false` inside `extra_params`. The server merges `extra_params` into the pipeline's `extra_args`, and the guardrail gate reads `extra_args["guardrails"]` as a per-request override:
+
+```bash
+curl -sS -X POST "${BASE_URL}/v1/videos/sync" \
+  -F "prompt=..." \
+  -F 'extra_params={"guardrails": false}' \
+  -o cosmos3_no_check.mp4
+```
+
+For action-mode requests, fold the override into the existing `extra_params` object alongside `action_mode`, `domain_name`, and the rest. Any value other than `false`, or omitting the field entirely, keeps the default behavior.
+
+## Curl scripts
+
+Each script sources its prompt and sampling parameters from the canonical input file shared with the offline example at `../../offline_inference/cosmos3/inputs/`. Override the input file with `INPUT_JSON=` (or `INPUT_JSONL=` for the camera variant) or the parent directory with `INPUTS_DIR=`.
+
+Companion vision and action assets are auto-downloaded from `nvidia-cosmos/cosmos-dependencies` on first run, so the scripts work out of the box once the server is up (they need `curl` and `jq` on the client). Image-input action modes (`policy`, `forward_dynamics`) extract the first frame of the source `.mp4` via `ffmpeg`, which is already a Cosmos3 system dependency; the camera-pose variant starts from a still PNG and skips that step.
+
 ## Text-to-Image
 
 ```bash
 bash run_curl_t2i.sh
 ```
 
-The script calls `POST /v1/images/generations`, which selects Cosmos3 text-to-image through `modalities=["image"]` internally.
+Calls `POST /v1/images/generations`, which selects Cosmos3 text-to-image through `modalities=["image"]` internally.
 
 ## Text-to-Video
 
@@ -34,29 +70,88 @@ bash run_curl_t2v.sh
 
 ## Image-to-Video
 
-Download an example image or set `IMAGE_PATH` to your own image:
+The companion image (`robot_153.jpg`) is auto-downloaded on first run. To use your own image:
 
 ```bash
-wget https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/cherry_blossom.jpg
-IMAGE_PATH=cherry_blossom.jpg bash run_curl_i2v.sh
+IMAGE_PATH=/path/to/your.jpg bash run_curl_i2v.sh
 ```
 
 ## Video With Sound
 
-This path requires a sound-capable Cosmos3 checkpoint with `sound_gen` weights.
-
 ```bash
 bash run_curl_t2v_sound.sh
 ```
 
-The script passes `generate_sound=true` and `sound_duration` to the video endpoint.
+The script reads `sound_duration` from `inputs/t2v_sound.json` and posts `generate_sound=true` to `/v1/videos/sync`.
+
+## Action — Policy
+
+Policy mode returns a video plus a predicted action chunk; both are saved.
+
+Robot (`bridge_orig_lerobot`, `raw_action_dim=10`):
+
+```bash
+bash run_curl_action_policy.sh
+```
+
+Autonomous vehicle (`raw_action_dim=9`, "Please go backward"):
+
+```bash
+bash run_curl_action_policy_av.sh
+```
+
+## Action — Forward Dynamics
 
-## Action Policy
+Forward-dynamics scripts download both the source vision asset and the matching `action_path` JSON. The action JSON is passed as `action_path` inside `extra_params`, so it must be readable by the server process — that works out of the box on a same-host deployment with the default `ALLOWED_LOCAL_MEDIA_PATH=/`. For cross-host setups, share the file (e.g. via a mounted volume) or inline the action data into `extra_params` instead.
 
-This path requires an action-capable Cosmos3 checkpoint with `action_gen` weights. Pass either `domain_name` or `domain_id` through `extra_params`.
+Robot:
 
 ```bash
-IMAGE_PATH=cherry_blossom.jpg bash run_curl_action_policy.sh
+bash run_curl_action_forward_dynamics_robot.sh
 ```
 
-The script uses the asynchronous `POST /v1/videos` job endpoint so it can download the MP4 and save the returned action metadata JSON.
+Autonomous vehicle:
+
+```bash
+bash run_curl_action_forward_dynamics_av.sh
+```
+
+Camera-pose (two scenes — `SCENE_INDEX=0` for mountain (default), `SCENE_INDEX=1` for solar):
+
+```bash
+bash run_curl_action_forward_dynamics_camera.sh
+SCENE_INDEX=1 bash run_curl_action_forward_dynamics_camera.sh
+```
+
+## Action — Inverse Dynamics
+
+> **Known limitation:** the online `/v1/videos` endpoint accepts image bytes only via the `input_reference` form field. Inverse-dynamics needs the full source video, so the scripts below will currently fail at upload time. They are shipped pre-wired so they begin working unchanged once the server gains video upload support. In the meantime, run inverse-dynamics through the offline path:
+>
+> ```bash
+> cd ../../offline_inference/cosmos3
+> python end2end.py --task action_inverse_dynamics \
+>   --input-json inputs/action_inverse_dynamics_robot.json \
+>   --output cosmos3_inverse_dynamics_robot.mp4 \
+>   --action-output cosmos3_inverse_dynamics_robot_action.json
+> ```
+
+Curl variants (kept for forward compatibility):
+
+```bash
+bash run_curl_action_inverse_dynamics_robot.sh
+bash run_curl_action_inverse_dynamics_av.sh
+```
+
+## Common script overrides
+
+Every curl script accepts a small set of env overrides:
+
+- `BASE_URL`: server URL, defaults to `http://localhost:8091`
+- `OUTPUT_PATH`: where to save the generated image / video
+- `ACTION_OUTPUT_PATH`: where to save predicted action JSON (policy / inverse_dynamics)
+- `INPUT_JSON` / `INPUT_JSONL` (camera) / `INPUTS_DIR`: alternate source for prompt and sampling parameters
+- `IMAGE_PATH` / `VIDEO_PATH`: pre-existing vision asset (skip auto-download / frame-extraction)
+- `ACTION_PATH` (forward-dynamics): pre-existing action JSON on the server's filesystem
+- `POLL_INTERVAL` (async scripts): seconds between status checks
+
+Async scripts use `POST /v1/videos` so they can download the MP4 once the job completes and save the action JSON returned in the status response.
diff --git a/examples/online_serving/cosmos3/run_curl_action_forward_dynamics_av.sh b/examples/online_serving/cosmos3/run_curl_action_forward_dynamics_av.sh
new file mode 100755
index 00000000000..201c3dcbaa6
--- /dev/null
+++ b/examples/online_serving/cosmos3/run_curl_action_forward_dynamics_av.sh
@@ -0,0 +1,73 @@
+#!/bin/bash
+# Cosmos3 forward-dynamics example (autonomous vehicle, image input + action).
+#
+# See run_curl_action_forward_dynamics_robot.sh for notes on how action_path
+# is consumed by the server. Asset downloads use `curl -f` so HTTP errors abort.
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+INPUTS_DIR="${INPUTS_DIR:-${SCRIPT_DIR}/../../offline_inference/cosmos3/inputs}"
+INPUT_JSON="${INPUT_JSON:-${INPUTS_DIR}/action_forward_dynamics_av.json}"
+
+BASE_URL="${BASE_URL:-http://localhost:8091}"
+OUTPUT_PATH="${OUTPUT_PATH:-cosmos3_forward_dynamics_av.mp4}"
+IMAGE_PATH="${IMAGE_PATH:-av_vision_25_frame0.jpg}"
+VIDEO_PATH="${VIDEO_PATH:-av_vision_25.mp4}"
+ACTION_PATH="${ACTION_PATH:-$(pwd)/av_action_25.json}"
+
+if [ ! -f "${INPUT_JSON}" ]; then
+  echo "Missing input JSON: ${INPUT_JSON}" >&2
+  exit 1
+fi
+
+PROMPT="$(jq -r '.prompt' "${INPUT_JSON}")"
+VISION_URL="$(jq -r '.vision_path' "${INPUT_JSON}")"
+ACTION_URL="$(jq -r '.action_path' "${INPUT_JSON}")"
+DOMAIN_NAME="$(jq -r '.domain_name' "${INPUT_JSON}")"
+RAW_ACTION_DIM="$(jq -r '.raw_action_dim' "${INPUT_JSON}")"
+ACTION_CHUNK_SIZE="$(jq -r '.action_chunk_size' "${INPUT_JSON}")"
+NUM_FRAMES="$(jq -r '.num_frames // 61' "${INPUT_JSON}")"
+FPS="$(jq -r '.fps // 10' "${INPUT_JSON}")"
+HEIGHT="$(jq -r '.height // 480' "${INPUT_JSON}")"
+WIDTH="$(jq -r '.width // 640' "${INPUT_JSON}")"
+NUM_INFERENCE_STEPS="$(jq -r '.num_inference_steps // 30' "${INPUT_JSON}")"
+GUIDANCE_SCALE="$(jq -r '.guidance_scale // 1.0' "${INPUT_JSON}")"
+FLOW_SHIFT="$(jq -r '.flow_shift // 5.0' "${INPUT_JSON}")"
+SEED="$(jq -r '.seed // 0' "${INPUT_JSON}")"
+
+if [ ! -f "${IMAGE_PATH}" ]; then
+  if [ ! -f "${VIDEO_PATH}" ]; then
+    echo "Downloading ${VISION_URL} -> ${VIDEO_PATH}"
+    curl -fsSL "${VISION_URL}" -o "${VIDEO_PATH}"  # -f: never cache an HTTP error page as the asset
+  fi
+  echo "Extracting first frame -> ${IMAGE_PATH}"
+  ffmpeg -y -loglevel error -i "${VIDEO_PATH}" -vf "select=eq(n\,0)" -vframes 1 "${IMAGE_PATH}"
+fi
+
+if [ ! -f "${ACTION_PATH}" ]; then
+  echo "Downloading ${ACTION_URL} -> ${ACTION_PATH}"
+  curl -fsSL "${ACTION_URL}" -o "${ACTION_PATH}"  # -f: never cache an HTTP error page as the asset
+fi
+
+EXTRA_PARAMS="$(jq -nc \
+  --arg domain "${DOMAIN_NAME}" \
+  --argjson dim "${RAW_ACTION_DIM}" \
+  --argjson chunk "${ACTION_CHUNK_SIZE}" \
+  --arg action_path "${ACTION_PATH}" \
+  '{action_mode:"forward_dynamics", domain_name:$domain, raw_action_dim:$dim, action_chunk_size:$chunk, action_path:$action_path}')"
+
+curl -sS -X POST "${BASE_URL}/v1/videos/sync" \
+  -F "prompt=${PROMPT}" \
+  -F "input_reference=@${IMAGE_PATH}" \
+  -F "size=${WIDTH}x${HEIGHT}" \
+  -F "num_frames=${NUM_FRAMES}" \
+  -F "fps=${FPS}" \
+  -F "num_inference_steps=${NUM_INFERENCE_STEPS}" \
+  -F "guidance_scale=${GUIDANCE_SCALE}" \
+  -F "flow_shift=${FLOW_SHIFT}" \
+  -F "extra_params=${EXTRA_PARAMS}" \
+  -F "seed=${SEED}" \
+  -o "${OUTPUT_PATH}"
+
+echo "Saved video to ${OUTPUT_PATH}"
diff --git a/examples/online_serving/cosmos3/run_curl_action_forward_dynamics_camera.sh b/examples/online_serving/cosmos3/run_curl_action_forward_dynamics_camera.sh
new file mode 100755
index 00000000000..cc349e167b8
--- /dev/null
+++ b/examples/online_serving/cosmos3/run_curl_action_forward_dynamics_camera.sh
@@ -0,0 +1,76 @@
+#!/bin/bash
+# Cosmos3 forward-dynamics example (camera_pose domain, image input + action).
+#
+# The JSONL input ships two scenes: mountain (index 0) and solar (index 1).
+# Pick one with SCENE_INDEX. Vision input is a still PNG, so no ffmpeg step.
+# Forward-dynamics returns only a video (no predicted action), so this uses
+# the sync video endpoint. Asset downloads use `curl -f` so HTTP errors abort.
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+INPUTS_DIR="${INPUTS_DIR:-${SCRIPT_DIR}/../../offline_inference/cosmos3/inputs}"
+INPUT_JSONL="${INPUT_JSONL:-${INPUTS_DIR}/action_forward_dynamics_camera.jsonl}"
+SCENE_INDEX="${SCENE_INDEX:-0}"
+
+BASE_URL="${BASE_URL:-http://localhost:8091}"
+OUTPUT_PATH="${OUTPUT_PATH:-cosmos3_forward_dynamics_camera.mp4}"
+
+if [ ! -f "${INPUT_JSONL}" ]; then
+  echo "Missing input JSONL: ${INPUT_JSONL}" >&2
+  exit 1
+fi
+
+RECORD="$(awk -v line="$((SCENE_INDEX + 1))" 'NR == line' "${INPUT_JSONL}")"  # JSONL: one record per line
+if [ -z "${RECORD}" ]; then
+  echo "SCENE_INDEX=${SCENE_INDEX} out of range for ${INPUT_JSONL}" >&2
+  exit 1
+fi
+
+PROMPT="$(jq -r '.prompt' <<<"${RECORD}")"  # here-strings pass the record verbatim (no echo escape handling)
+VISION_URL="$(jq -r '.vision_path' <<<"${RECORD}")"
+ACTION_URL="$(jq -r '.action_path' <<<"${RECORD}")"
+DOMAIN_NAME="$(jq -r '.domain_name' <<<"${RECORD}")"
+ACTION_CHUNK_SIZE="$(jq -r '.action_chunk_size' <<<"${RECORD}")"
+NUM_FRAMES="$(jq -r '.num_frames // 61' <<<"${RECORD}")"
+FPS="$(jq -r '.fps // 30' <<<"${RECORD}")"
+HEIGHT="$(jq -r '.height // 480' <<<"${RECORD}")"
+WIDTH="$(jq -r '.width // 640' <<<"${RECORD}")"
+NUM_INFERENCE_STEPS="$(jq -r '.num_inference_steps // 30' <<<"${RECORD}")"
+GUIDANCE_SCALE="$(jq -r '.guidance_scale // 1.0' <<<"${RECORD}")"
+FLOW_SHIFT="$(jq -r '.flow_shift // 5.0' <<<"${RECORD}")"
+SEED="$(jq -r '.seed // 0' <<<"${RECORD}")"
+
+IMAGE_PATH="${IMAGE_PATH:-camera_scene_${SCENE_INDEX}.png}"
+ACTION_PATH="${ACTION_PATH:-$(pwd)/camera_action_44.json}"
+
+if [ ! -f "${IMAGE_PATH}" ]; then
+  echo "Downloading ${VISION_URL} -> ${IMAGE_PATH}"
+  curl -fsSL "${VISION_URL}" -o "${IMAGE_PATH}"  # -f: never cache an HTTP error page as the asset
+fi
+
+if [ ! -f "${ACTION_PATH}" ]; then
+  echo "Downloading ${ACTION_URL} -> ${ACTION_PATH}"
+  curl -fsSL "${ACTION_URL}" -o "${ACTION_PATH}"  # -f: never cache an HTTP error page as the asset
+fi
+
+EXTRA_PARAMS="$(jq -nc \
+  --arg domain "${DOMAIN_NAME}" \
+  --argjson chunk "${ACTION_CHUNK_SIZE}" \
+  --arg action_path "${ACTION_PATH}" \
+  '{action_mode:"forward_dynamics", domain_name:$domain, action_chunk_size:$chunk, action_path:$action_path}')"
+
+curl -sS -X POST "${BASE_URL}/v1/videos/sync" \
+  -F "prompt=${PROMPT}" \
+  -F "input_reference=@${IMAGE_PATH}" \
+  -F "size=${WIDTH}x${HEIGHT}" \
+  -F "num_frames=${NUM_FRAMES}" \
+  -F "fps=${FPS}" \
+  -F "num_inference_steps=${NUM_INFERENCE_STEPS}" \
+  -F "guidance_scale=${GUIDANCE_SCALE}" \
+  -F "flow_shift=${FLOW_SHIFT}" \
+  -F "extra_params=${EXTRA_PARAMS}" \
+  -F "seed=${SEED}" \
+  -o "${OUTPUT_PATH}"
+
+echo "Saved video to ${OUTPUT_PATH}"
diff --git a/examples/online_serving/cosmos3/run_curl_action_forward_dynamics_robot.sh b/examples/online_serving/cosmos3/run_curl_action_forward_dynamics_robot.sh
new file mode 100755
index 00000000000..28d8ce76d45
--- /dev/null
+++ b/examples/online_serving/cosmos3/run_curl_action_forward_dynamics_robot.sh
@@ -0,0 +1,77 @@
+#!/bin/bash
+# Cosmos3 forward-dynamics example (bridge_orig_lerobot, image input + action).
+#
+# Forward-dynamics consumes an image plus a chunk of action data and predicts
+# the resulting video. There is no predicted-action output, so this script
+# uses the sync video endpoint (raw MP4 response). The action JSON is
+# referenced via `action_path` in extra_params, so it must be readable by
+# the server process — works out of the box when client and server share a
+# filesystem and run_server.sh keeps its default `--allowed-local-media-path /`.
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+INPUTS_DIR="${INPUTS_DIR:-${SCRIPT_DIR}/../../offline_inference/cosmos3/inputs}"
+INPUT_JSON="${INPUT_JSON:-${INPUTS_DIR}/action_forward_dynamics_robot.json}"
+
+BASE_URL="${BASE_URL:-http://localhost:8091}"
+OUTPUT_PATH="${OUTPUT_PATH:-cosmos3_forward_dynamics_robot.mp4}"
+IMAGE_PATH="${IMAGE_PATH:-bridge_0_frame0.jpg}"
+VIDEO_PATH="${VIDEO_PATH:-bridge_0.mp4}"
+ACTION_PATH="${ACTION_PATH:-$(pwd)/bridge_0.json}"
+
+if [ ! -f "${INPUT_JSON}" ]; then
+  echo "Missing input JSON: ${INPUT_JSON}" >&2
+  exit 1
+fi
+
+PROMPT="$(jq -r '.prompt' "${INPUT_JSON}")"
+VISION_URL="$(jq -r '.vision_path' "${INPUT_JSON}")"
+ACTION_URL="$(jq -r '.action_path' "${INPUT_JSON}")"
+DOMAIN_NAME="$(jq -r '.domain_name' "${INPUT_JSON}")"
+RAW_ACTION_DIM="$(jq -r '.raw_action_dim' "${INPUT_JSON}")"
+ACTION_CHUNK_SIZE="$(jq -r '.action_chunk_size' "${INPUT_JSON}")"
+NUM_FRAMES="$(jq -r '.num_frames // 17' "${INPUT_JSON}")"
+FPS="$(jq -r '.fps // 5' "${INPUT_JSON}")"
+HEIGHT="$(jq -r '.height // 480' "${INPUT_JSON}")"
+WIDTH="$(jq -r '.width // 640' "${INPUT_JSON}")"
+NUM_INFERENCE_STEPS="$(jq -r '.num_inference_steps // 30' "${INPUT_JSON}")"
+GUIDANCE_SCALE="$(jq -r '.guidance_scale // 1.0' "${INPUT_JSON}")"
+FLOW_SHIFT="$(jq -r '.flow_shift // 5.0' "${INPUT_JSON}")"
+SEED="$(jq -r '.seed // 0' "${INPUT_JSON}")"
+
+if [ ! -f "${IMAGE_PATH}" ]; then
+  if [ ! -f "${VIDEO_PATH}" ]; then
+    echo "Downloading ${VISION_URL} -> ${VIDEO_PATH}"
+    curl -fsSL "${VISION_URL}" -o "${VIDEO_PATH}"  # -f: never cache an HTTP error page as the asset
+  fi
+  echo "Extracting first frame -> ${IMAGE_PATH}"
+  ffmpeg -y -loglevel error -i "${VIDEO_PATH}" -vf "select=eq(n\,0)" -vframes 1 "${IMAGE_PATH}"
+fi
+
+if [ ! -f "${ACTION_PATH}" ]; then
+  echo "Downloading ${ACTION_URL} -> ${ACTION_PATH}"
+  curl -fsSL "${ACTION_URL}" -o "${ACTION_PATH}"  # -f: never cache an HTTP error page as the asset
+fi
+
+EXTRA_PARAMS="$(jq -nc \
+  --arg domain "${DOMAIN_NAME}" \
+  --argjson dim "${RAW_ACTION_DIM}" \
+  --argjson chunk "${ACTION_CHUNK_SIZE}" \
+  --arg action_path "${ACTION_PATH}" \
+  '{action_mode:"forward_dynamics", domain_name:$domain, raw_action_dim:$dim, action_chunk_size:$chunk, action_path:$action_path}')"
+
+curl -sS -X POST "${BASE_URL}/v1/videos/sync" \
+  -F "prompt=${PROMPT}" \
+  -F "input_reference=@${IMAGE_PATH}" \
+  -F "size=${WIDTH}x${HEIGHT}" \
+  -F "num_frames=${NUM_FRAMES}" \
+  -F "fps=${FPS}" \
+  -F "num_inference_steps=${NUM_INFERENCE_STEPS}" \
+  -F "guidance_scale=${GUIDANCE_SCALE}" \
+  -F "flow_shift=${FLOW_SHIFT}" \
+  -F "extra_params=${EXTRA_PARAMS}" \
+  -F "seed=${SEED}" \
+  -o "${OUTPUT_PATH}"
+
+echo "Saved video to ${OUTPUT_PATH}"
diff --git a/examples/online_serving/cosmos3/run_curl_action_inverse_dynamics_av.sh b/examples/online_serving/cosmos3/run_curl_action_inverse_dynamics_av.sh
new file mode 100755
index 00000000000..458c3358d3d
--- /dev/null
+++ b/examples/online_serving/cosmos3/run_curl_action_inverse_dynamics_av.sh
@@ -0,0 +1,106 @@
+#!/bin/bash
+# Cosmos3 inverse-dynamics example (autonomous vehicle, video input).
+#
+# KNOWN LIMITATION: as of writing, the online `/v1/videos` endpoint accepts
+# image bytes only via the `input_reference` form field. Inverse-dynamics
+# needs the full source video, so this script will currently fail at upload
+# time. The offline path (`examples/offline_inference/cosmos3/end2end.py
+# --task action_inverse_dynamics --input-json inputs/action_inverse_dynamics_av.json`)
+# does support video input today. The script below is kept ready so it
+# starts working when the server gains video upload support.
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+INPUTS_DIR="${INPUTS_DIR:-${SCRIPT_DIR}/../../offline_inference/cosmos3/inputs}"
+INPUT_JSON="${INPUT_JSON:-${INPUTS_DIR}/action_inverse_dynamics_av.json}"
+
+BASE_URL="${BASE_URL:-http://localhost:8091}"
+OUTPUT_PATH="${OUTPUT_PATH:-cosmos3_inverse_dynamics_av.mp4}"
+ACTION_OUTPUT_PATH="${ACTION_OUTPUT_PATH:-cosmos3_inverse_dynamics_av_action.json}"
+VIDEO_PATH="${VIDEO_PATH:-av_vision_25.mp4}"
+POLL_INTERVAL="${POLL_INTERVAL:-2}"
+
+if [ ! -f "${INPUT_JSON}" ]; then
+  echo "Missing input JSON: ${INPUT_JSON}" >&2
+  exit 1
+fi
+
+PROMPT="$(jq -r '.prompt' "${INPUT_JSON}")"
+VISION_URL="$(jq -r '.vision_path' "${INPUT_JSON}")"
+DOMAIN_NAME="$(jq -r '.domain_name' "${INPUT_JSON}")"
+RAW_ACTION_DIM="$(jq -r '.raw_action_dim' "${INPUT_JSON}")"
+ACTION_CHUNK_SIZE="$(jq -r '.action_chunk_size' "${INPUT_JSON}")"
+NUM_FRAMES="$(jq -r '.num_frames // 17' "${INPUT_JSON}")"
+FPS="$(jq -r '.fps // 10' "${INPUT_JSON}")"
+HEIGHT="$(jq -r '.height // 480' "${INPUT_JSON}")"
+WIDTH="$(jq -r '.width // 640' "${INPUT_JSON}")"
+NUM_INFERENCE_STEPS="$(jq -r '.num_inference_steps // 30' "${INPUT_JSON}")"
+GUIDANCE_SCALE="$(jq -r '.guidance_scale // 1.0' "${INPUT_JSON}")"
+FLOW_SHIFT="$(jq -r '.flow_shift // 5.0' "${INPUT_JSON}")"
+SEED="$(jq -r '.seed // 0' "${INPUT_JSON}")"
+
+if [ ! -f "${VIDEO_PATH}" ]; then
+  echo "Downloading ${VISION_URL} -> ${VIDEO_PATH}"
+  curl -fsSL "${VISION_URL}" -o "${VIDEO_PATH}"  # -f: fail on HTTP errors instead of caching the error body as the video
+fi
+
+EXTRA_PARAMS="$(jq -nc \
+  --arg domain "${DOMAIN_NAME}" \
+  --argjson dim "${RAW_ACTION_DIM}" \
+  --argjson chunk "${ACTION_CHUNK_SIZE}" \
+  '{action_mode:"inverse_dynamics", domain_name:$domain, raw_action_dim:$dim, action_chunk_size:$chunk}')"
+
+create_response=$(
+  curl -sS -X POST "${BASE_URL}/v1/videos" \
+    -H "Accept: application/json" \
+    --form-string "prompt=${PROMPT}" \
+    -F "input_reference=@${VIDEO_PATH}" \
+    -F "size=${WIDTH}x${HEIGHT}" \
+    -F "num_frames=${NUM_FRAMES}" \
+    -F "fps=${FPS}" \
+    -F "num_inference_steps=${NUM_INFERENCE_STEPS}" \
+    -F "guidance_scale=${GUIDANCE_SCALE}" \
+    -F "flow_shift=${FLOW_SHIFT}" \
+    -F "extra_params=${EXTRA_PARAMS}" \
+    -F "seed=${SEED}"
+)
+
+video_id="$(jq -r '.id // empty' <<<"${create_response}" 2>/dev/null || true)"  # empty when the reply has no id or is not JSON
+if [ -z "${video_id}" ] || [ "${video_id}" = "null" ]; then
+  echo "Failed to create video job:" >&2
+  jq . <<<"${create_response}" >&2 2>/dev/null || printf '%s\n' "${create_response}" >&2
+  exit 1
+fi
+
+echo "Created video job ${video_id}"
+while true; do
+  status_response="$(curl -sS "${BASE_URL}/v1/videos/${video_id}")"
+  status="$(jq -r '.status // empty' <<<"${status_response}" 2>/dev/null || true)"  # non-JSON replies fall through to *)
+
+  case "${status}" in
+    queued|in_progress)
+      echo "Video job ${video_id} status: ${status}"
+      sleep "${POLL_INTERVAL}"
+      ;;
+    completed)
+      echo "${status_response}" | jq '.data[0].action' > "${ACTION_OUTPUT_PATH}"
+      break
+      ;;
+    failed)
+      echo "Video generation failed:" >&2
+      echo "${status_response}" | jq . >&2
+      exit 1
+      ;;
+    *)
+      echo "Unexpected status response:" >&2
+      jq . <<<"${status_response}" >&2 2>/dev/null || printf '%s\n' "${status_response}" >&2
+      exit 1
+      ;;
+  esac
+done
+
+curl -fsS -L "${BASE_URL}/v1/videos/${video_id}/content" -o "${OUTPUT_PATH}"  # -f: do not save an HTTP error body as the .mp4
+
+echo "Saved video to ${OUTPUT_PATH}"
+echo "Saved action metadata to ${ACTION_OUTPUT_PATH}"
diff --git a/examples/online_serving/cosmos3/run_curl_action_inverse_dynamics_robot.sh b/examples/online_serving/cosmos3/run_curl_action_inverse_dynamics_robot.sh
new file mode 100755
index 00000000000..a718a84b428
--- /dev/null
+++ b/examples/online_serving/cosmos3/run_curl_action_inverse_dynamics_robot.sh
@@ -0,0 +1,106 @@
+#!/bin/bash
+# Cosmos3 inverse-dynamics example (bridge_orig_lerobot, video input).
+#
+# KNOWN LIMITATION: as of writing, the online `/v1/videos` endpoint accepts
+# image bytes only via the `input_reference` form field. Inverse-dynamics
+# needs the full source video, so this script will currently fail at upload
+# time. The offline path (`examples/offline_inference/cosmos3/end2end.py
+# --task action_inverse_dynamics --input-json inputs/action_inverse_dynamics_robot.json`)
+# does support video input today. The script below is kept ready so it
+# starts working when the server gains video upload support.
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+INPUTS_DIR="${INPUTS_DIR:-${SCRIPT_DIR}/../../offline_inference/cosmos3/inputs}"
+INPUT_JSON="${INPUT_JSON:-${INPUTS_DIR}/action_inverse_dynamics_robot.json}"
+
+BASE_URL="${BASE_URL:-http://localhost:8091}"
+OUTPUT_PATH="${OUTPUT_PATH:-cosmos3_inverse_dynamics_robot.mp4}"
+ACTION_OUTPUT_PATH="${ACTION_OUTPUT_PATH:-cosmos3_inverse_dynamics_robot_action.json}"
+VIDEO_PATH="${VIDEO_PATH:-bridge_0.mp4}"
+POLL_INTERVAL="${POLL_INTERVAL:-2}"
+
+if [ ! -f "${INPUT_JSON}" ]; then
+  echo "Missing input JSON: ${INPUT_JSON}" >&2
+  exit 1
+fi
+
+PROMPT="$(jq -r '.prompt' "${INPUT_JSON}")"
+VISION_URL="$(jq -r '.vision_path' "${INPUT_JSON}")"
+DOMAIN_NAME="$(jq -r '.domain_name' "${INPUT_JSON}")"
+RAW_ACTION_DIM="$(jq -r '.raw_action_dim' "${INPUT_JSON}")"
+ACTION_CHUNK_SIZE="$(jq -r '.action_chunk_size' "${INPUT_JSON}")"
+NUM_FRAMES="$(jq -r '.num_frames // 17' "${INPUT_JSON}")"
+FPS="$(jq -r '.fps // 5' "${INPUT_JSON}")"
+HEIGHT="$(jq -r '.height // 480' "${INPUT_JSON}")"
+WIDTH="$(jq -r '.width // 640' "${INPUT_JSON}")"
+NUM_INFERENCE_STEPS="$(jq -r '.num_inference_steps // 30' "${INPUT_JSON}")"
+GUIDANCE_SCALE="$(jq -r '.guidance_scale // 1.0' "${INPUT_JSON}")"
+FLOW_SHIFT="$(jq -r '.flow_shift // 5.0' "${INPUT_JSON}")"
+SEED="$(jq -r '.seed // 0' "${INPUT_JSON}")"
+
+if [ ! -f "${VIDEO_PATH}" ]; then
+  echo "Downloading ${VISION_URL} -> ${VIDEO_PATH}"
+  curl -fsSL "${VISION_URL}" -o "${VIDEO_PATH}"  # -f: fail on HTTP errors instead of caching the error body as the video
+fi
+
+EXTRA_PARAMS="$(jq -nc \
+  --arg domain "${DOMAIN_NAME}" \
+  --argjson dim "${RAW_ACTION_DIM}" \
+  --argjson chunk "${ACTION_CHUNK_SIZE}" \
+  '{action_mode:"inverse_dynamics", domain_name:$domain, raw_action_dim:$dim, action_chunk_size:$chunk}')"
+
+create_response=$(
+  curl -sS -X POST "${BASE_URL}/v1/videos" \
+    -H "Accept: application/json" \
+    --form-string "prompt=${PROMPT}" \
+    -F "input_reference=@${VIDEO_PATH}" \
+    -F "size=${WIDTH}x${HEIGHT}" \
+    -F "num_frames=${NUM_FRAMES}" \
+    -F "fps=${FPS}" \
+    -F "num_inference_steps=${NUM_INFERENCE_STEPS}" \
+    -F "guidance_scale=${GUIDANCE_SCALE}" \
+    -F "flow_shift=${FLOW_SHIFT}" \
+    -F "extra_params=${EXTRA_PARAMS}" \
+    -F "seed=${SEED}"
+)
+
+video_id="$(jq -r '.id // empty' <<<"${create_response}" 2>/dev/null || true)"  # empty when the reply has no id or is not JSON
+if [ -z "${video_id}" ] || [ "${video_id}" = "null" ]; then
+  echo "Failed to create video job:" >&2
+  jq . <<<"${create_response}" >&2 2>/dev/null || printf '%s\n' "${create_response}" >&2
+  exit 1
+fi
+
+echo "Created video job ${video_id}"
+while true; do
+  status_response="$(curl -sS "${BASE_URL}/v1/videos/${video_id}")"
+  status="$(jq -r '.status // empty' <<<"${status_response}" 2>/dev/null || true)"  # non-JSON replies fall through to *)
+
+  case "${status}" in
+    queued|in_progress)
+      echo "Video job ${video_id} status: ${status}"
+      sleep "${POLL_INTERVAL}"
+      ;;
+    completed)
+      echo "${status_response}" | jq '.data[0].action' > "${ACTION_OUTPUT_PATH}"
+      break
+      ;;
+    failed)
+      echo "Video generation failed:" >&2
+      echo "${status_response}" | jq . >&2
+      exit 1
+      ;;
+    *)
+      echo "Unexpected status response:" >&2
+      jq . <<<"${status_response}" >&2 2>/dev/null || printf '%s\n' "${status_response}" >&2
+      exit 1
+      ;;
+  esac
+done
+
+curl -fsS -L "${BASE_URL}/v1/videos/${video_id}/content" -o "${OUTPUT_PATH}"  # -f: do not save an HTTP error body as the .mp4
+
+echo "Saved video to ${OUTPUT_PATH}"
+echo "Saved action metadata to ${ACTION_OUTPUT_PATH}"
diff --git a/examples/online_serving/cosmos3/run_curl_action_policy.sh b/examples/online_serving/cosmos3/run_curl_action_policy.sh
old mode 100644
new mode 100755
index b5635e3f201..930ab699457
--- a/examples/online_serving/cosmos3/run_curl_action_policy.sh
+++ b/examples/online_serving/cosmos3/run_curl_action_policy.sh
@@ -1,26 +1,71 @@
 #!/bin/bash
-# Cosmos3 action policy example. Requires an action-capable checkpoint.
+# Cosmos3 action policy example (bridge_orig_lerobot, image input).
+#
+# Cosmos3 policy mode consumes an image plus a language instruction and
+# generates a video together with the predicted action sequence. The example
+# image is the first frame of bridge_0.mp4 (cosmos-dependencies), extracted
+# locally with ffmpeg so the request matches the prompt scene.
 
 set -euo pipefail
 
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+INPUTS_DIR="${INPUTS_DIR:-${SCRIPT_DIR}/../../offline_inference/cosmos3/inputs}"
+INPUT_JSON="${INPUT_JSON:-${INPUTS_DIR}/action_policy_robot.json}"
+
 BASE_URL="${BASE_URL:-http://localhost:8091}"
-IMAGE_PATH="${IMAGE_PATH:-cherry_blossom.jpg}"
 OUTPUT_PATH="${OUTPUT_PATH:-cosmos3_action_policy.mp4}"
 ACTION_OUTPUT_PATH="${ACTION_OUTPUT_PATH:-cosmos3_action_policy_action.json}"
+IMAGE_PATH="${IMAGE_PATH:-bridge_0_frame0.jpg}"
+VIDEO_PATH="${VIDEO_PATH:-bridge_0.mp4}"
 POLL_INTERVAL="${POLL_INTERVAL:-2}"
 
+if [ ! -f "${INPUT_JSON}" ]; then
+  echo "Missing input JSON: ${INPUT_JSON}" >&2
+  exit 1
+fi
+
+PROMPT="$(jq -r '.prompt' "${INPUT_JSON}")"
+VISION_URL="$(jq -r '.vision_path' "${INPUT_JSON}")"
+DOMAIN_NAME="$(jq -r '.domain_name' "${INPUT_JSON}")"
+RAW_ACTION_DIM="$(jq -r '.raw_action_dim' "${INPUT_JSON}")"
+ACTION_CHUNK_SIZE="$(jq -r '.action_chunk_size' "${INPUT_JSON}")"
+NUM_FRAMES="$(jq -r '.num_frames // 17' "${INPUT_JSON}")"
+FPS="$(jq -r '.fps // 5' "${INPUT_JSON}")"
+HEIGHT="$(jq -r '.height // 480' "${INPUT_JSON}")"
+WIDTH="$(jq -r '.width // 640' "${INPUT_JSON}")"
+NUM_INFERENCE_STEPS="$(jq -r '.num_inference_steps // 30' "${INPUT_JSON}")"
+GUIDANCE_SCALE="$(jq -r '.guidance_scale // 1.0' "${INPUT_JSON}")"
+FLOW_SHIFT="$(jq -r '.flow_shift // 5.0' "${INPUT_JSON}")"
+SEED="$(jq -r '.seed // 0' "${INPUT_JSON}")"
+
+if [ ! -f "${IMAGE_PATH}" ]; then
+  if [ ! -f "${VIDEO_PATH}" ]; then
+    echo "Downloading ${VISION_URL} -> ${VIDEO_PATH}"
+    curl -fsSL "${VISION_URL}" -o "${VIDEO_PATH}"  # -f: fail on HTTP errors instead of caching the error body as the video
+  fi
+  echo "Extracting first frame -> ${IMAGE_PATH}"
+  ffmpeg -y -loglevel error -i "${VIDEO_PATH}" -vf "select=eq(n\,0)" -vframes 1 "${IMAGE_PATH}"
+fi
+
+EXTRA_PARAMS="$(jq -nc \
+  --arg domain "${DOMAIN_NAME}" \
+  --argjson dim "${RAW_ACTION_DIM}" \
+  --argjson chunk "${ACTION_CHUNK_SIZE}" \
+  '{action_mode:"policy", domain_name:$domain, raw_action_dim:$dim, action_chunk_size:$chunk}')"
+
 create_response=$(
   curl -sS -X POST "${BASE_URL}/v1/videos" \
     -H "Accept: application/json" \
-    -F "prompt=Predict the robot action for moving toward the target." \
+    --form-string "prompt=${PROMPT}" \
     -F "input_reference=@${IMAGE_PATH}" \
-    -F "size=640x480" \
-    -F "num_frames=17" \
-    -F "fps=24" \
-    -F "num_inference_steps=30" \
-    -F "guidance_scale=1.0" \
-    -F 'extra_params={"action_mode":"policy","domain_name":"bridge_orig_lerobot","raw_action_dim":2,"action_chunk_size":16}' \
-    -F "seed=42"
+    -F "size=${WIDTH}x${HEIGHT}" \
+    -F "num_frames=${NUM_FRAMES}" \
+    -F "fps=${FPS}" \
+    -F "num_inference_steps=${NUM_INFERENCE_STEPS}" \
+    -F "guidance_scale=${GUIDANCE_SCALE}" \
+    -F "flow_shift=${FLOW_SHIFT}" \
+    -F "extra_params=${EXTRA_PARAMS}" \
+    -F "seed=${SEED}"
 )
 
 video_id="$(echo "${create_response}" | jq -r '.id')"
diff --git a/examples/online_serving/cosmos3/run_curl_action_policy_av.sh b/examples/online_serving/cosmos3/run_curl_action_policy_av.sh
new file mode 100755
index 00000000000..9f7a3aa18d2
--- /dev/null
+++ b/examples/online_serving/cosmos3/run_curl_action_policy_av.sh
@@ -0,0 +1,107 @@
+#!/bin/bash
+# Cosmos3 action policy example (autonomous vehicle domain, image input).
+#
+# The example image is the first frame of the AV vision clip
+# (cosmos-dependencies), extracted locally with ffmpeg so the request matches
+# the prompt scene.
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+INPUTS_DIR="${INPUTS_DIR:-${SCRIPT_DIR}/../../offline_inference/cosmos3/inputs}"
+INPUT_JSON="${INPUT_JSON:-${INPUTS_DIR}/action_policy_av.json}"
+
+BASE_URL="${BASE_URL:-http://localhost:8091}"
+OUTPUT_PATH="${OUTPUT_PATH:-cosmos3_action_policy_av.mp4}"
+ACTION_OUTPUT_PATH="${ACTION_OUTPUT_PATH:-cosmos3_action_policy_av_action.json}"
+IMAGE_PATH="${IMAGE_PATH:-av_vision_25_frame0.jpg}"
+VIDEO_PATH="${VIDEO_PATH:-av_vision_25.mp4}"
+POLL_INTERVAL="${POLL_INTERVAL:-2}"
+
+if [ ! -f "${INPUT_JSON}" ]; then
+  echo "Missing input JSON: ${INPUT_JSON}" >&2
+  exit 1
+fi
+
+PROMPT="$(jq -r '.prompt' "${INPUT_JSON}")"
+VISION_URL="$(jq -r '.vision_path' "${INPUT_JSON}")"
+DOMAIN_NAME="$(jq -r '.domain_name' "${INPUT_JSON}")"
+RAW_ACTION_DIM="$(jq -r '.raw_action_dim' "${INPUT_JSON}")"
+ACTION_CHUNK_SIZE="$(jq -r '.action_chunk_size' "${INPUT_JSON}")"
+NUM_FRAMES="$(jq -r '.num_frames // 17' "${INPUT_JSON}")"
+FPS="$(jq -r '.fps // 10' "${INPUT_JSON}")"
+HEIGHT="$(jq -r '.height // 480' "${INPUT_JSON}")"
+WIDTH="$(jq -r '.width // 640' "${INPUT_JSON}")"
+NUM_INFERENCE_STEPS="$(jq -r '.num_inference_steps // 30' "${INPUT_JSON}")"
+GUIDANCE_SCALE="$(jq -r '.guidance_scale // 1.0' "${INPUT_JSON}")"
+FLOW_SHIFT="$(jq -r '.flow_shift // 5.0' "${INPUT_JSON}")"
+SEED="$(jq -r '.seed // 0' "${INPUT_JSON}")"
+
+if [ ! -f "${IMAGE_PATH}" ]; then
+  if [ ! -f "${VIDEO_PATH}" ]; then
+    echo "Downloading ${VISION_URL} -> ${VIDEO_PATH}"
+    curl -fsSL "${VISION_URL}" -o "${VIDEO_PATH}"  # -f: fail on HTTP errors instead of caching the error body as the video
+  fi
+  echo "Extracting first frame -> ${IMAGE_PATH}"
+  ffmpeg -y -loglevel error -i "${VIDEO_PATH}" -vf "select=eq(n\,0)" -vframes 1 "${IMAGE_PATH}"
+fi
+
+EXTRA_PARAMS="$(jq -nc \
+  --arg domain "${DOMAIN_NAME}" \
+  --argjson dim "${RAW_ACTION_DIM}" \
+  --argjson chunk "${ACTION_CHUNK_SIZE}" \
+  '{action_mode:"policy", domain_name:$domain, raw_action_dim:$dim, action_chunk_size:$chunk}')"
+
+create_response=$(
+  curl -sS -X POST "${BASE_URL}/v1/videos" \
+    -H "Accept: application/json" \
+    --form-string "prompt=${PROMPT}" \
+    -F "input_reference=@${IMAGE_PATH}" \
+    -F "size=${WIDTH}x${HEIGHT}" \
+    -F "num_frames=${NUM_FRAMES}" \
+    -F "fps=${FPS}" \
+    -F "num_inference_steps=${NUM_INFERENCE_STEPS}" \
+    -F "guidance_scale=${GUIDANCE_SCALE}" \
+    -F "flow_shift=${FLOW_SHIFT}" \
+    -F "extra_params=${EXTRA_PARAMS}" \
+    -F "seed=${SEED}"
+)
+
+video_id="$(jq -r '.id // empty' <<<"${create_response}" 2>/dev/null || true)"  # empty when the reply has no id or is not JSON
+if [ -z "${video_id}" ] || [ "${video_id}" = "null" ]; then
+  echo "Failed to create video job:" >&2
+  jq . <<<"${create_response}" >&2 2>/dev/null || printf '%s\n' "${create_response}" >&2
+  exit 1
+fi
+
+echo "Created video job ${video_id}"
+while true; do
+  status_response="$(curl -sS "${BASE_URL}/v1/videos/${video_id}")"
+  status="$(jq -r '.status // empty' <<<"${status_response}" 2>/dev/null || true)"  # non-JSON replies fall through to *)
+
+  case "${status}" in
+    queued|in_progress)
+      echo "Video job ${video_id} status: ${status}"
+      sleep "${POLL_INTERVAL}"
+      ;;
+    completed)
+      echo "${status_response}" | jq '.data[0].action' > "${ACTION_OUTPUT_PATH}"
+      break
+      ;;
+    failed)
+      echo "Video generation failed:" >&2
+      echo "${status_response}" | jq . >&2
+      exit 1
+      ;;
+    *)
+      echo "Unexpected status response:" >&2
+      jq . <<<"${status_response}" >&2 2>/dev/null || printf '%s\n' "${status_response}" >&2
+      exit 1
+      ;;
+  esac
+done
+
+curl -fsS -L "${BASE_URL}/v1/videos/${video_id}/content" -o "${OUTPUT_PATH}"  # -f: do not save an HTTP error body as the .mp4
+
+echo "Saved video to ${OUTPUT_PATH}"
+echo "Saved action metadata to ${ACTION_OUTPUT_PATH}"
diff --git a/examples/online_serving/cosmos3/run_curl_i2v.sh b/examples/online_serving/cosmos3/run_curl_i2v.sh
old mode 100644
new mode 100755
index eb65ca9621a..b4d1a594a59
--- a/examples/online_serving/cosmos3/run_curl_i2v.sh
+++ b/examples/online_serving/cosmos3/run_curl_i2v.sh
@@ -1,21 +1,49 @@
 #!/bin/bash
 # Cosmos3 image-to-video example using the sync video API.
+#
+# The prompt is loaded from the canonical input JSON shared with the offline
+# example. The companion image (robot_153.jpg) is auto-downloaded if missing.
 
 set -euo pipefail
 
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+INPUTS_DIR="${INPUTS_DIR:-${SCRIPT_DIR}/../../offline_inference/cosmos3/inputs}"
+INPUT_JSON="${INPUT_JSON:-${INPUTS_DIR}/i2v.json}"
+
 BASE_URL="${BASE_URL:-http://localhost:8091}"
-IMAGE_PATH="${IMAGE_PATH:-cherry_blossom.jpg}"
 OUTPUT_PATH="${OUTPUT_PATH:-cosmos3_i2v.mp4}"
+IMAGE_PATH="${IMAGE_PATH:-robot_153.jpg}"
+
+if [ ! -f "${INPUT_JSON}" ]; then
+  echo "Missing input JSON: ${INPUT_JSON}" >&2
+  exit 1
+fi
+
+PROMPT="$(jq -r '.prompt' "${INPUT_JSON}")"
+VISION_URL="$(jq -r '.vision_path' "${INPUT_JSON}")"
+HEIGHT="$(jq -r '.height // 720' "${INPUT_JSON}")"
+WIDTH="$(jq -r '.width // 1280' "${INPUT_JSON}")"
+NUM_FRAMES="$(jq -r '.num_frames // 189' "${INPUT_JSON}")"
+FPS="$(jq -r '.fps // 24' "${INPUT_JSON}")"
+NUM_INFERENCE_STEPS="$(jq -r '.num_inference_steps // 35' "${INPUT_JSON}")"
+GUIDANCE_SCALE="$(jq -r '.guidance_scale // 6.0' "${INPUT_JSON}")"
+FLOW_SHIFT="$(jq -r '.flow_shift // 10.0' "${INPUT_JSON}")"
+
+if [ ! -f "${IMAGE_PATH}" ]; then
+  echo "Downloading ${VISION_URL} -> ${IMAGE_PATH}"
+  curl -fsSL "${VISION_URL}" -o "${IMAGE_PATH}"  # -f: fail on HTTP errors instead of caching the error body as the image
+fi
 
 curl -sS -X POST "${BASE_URL}/v1/videos/sync" \
-  -F "prompt=Cherry blossoms swaying gently in the breeze, petals falling, smooth motion" \
+  --form-string "prompt=${PROMPT}" \
   -F "negative_prompt=blurry, distorted, low quality" \
   -F "input_reference=@${IMAGE_PATH}" \
-  -F "size=1280x720" \
-  -F "num_frames=81" \
-  -F "fps=24" \
-  -F "num_inference_steps=35" \
-  -F "guidance_scale=4.0" \
+  -F "size=${WIDTH}x${HEIGHT}" \
+  -F "num_frames=${NUM_FRAMES}" \
+  -F "fps=${FPS}" \
+  -F "num_inference_steps=${NUM_INFERENCE_STEPS}" \
+  -F "guidance_scale=${GUIDANCE_SCALE}" \
+  -F "flow_shift=${FLOW_SHIFT}" \
   -F "seed=42" \
   -o "${OUTPUT_PATH}"
 
diff --git a/examples/online_serving/cosmos3/run_curl_t2i.sh b/examples/online_serving/cosmos3/run_curl_t2i.sh
old mode 100644
new mode 100755
index 421b0664c35..04519446336
--- a/examples/online_serving/cosmos3/run_curl_t2i.sh
+++ b/examples/online_serving/cosmos3/run_curl_t2i.sh
@@ -1,21 +1,47 @@
 #!/bin/bash
 # Cosmos3 text-to-image example using the images API.
+#
+# The prompt is loaded from the canonical input JSON shared with the offline
+# example so updates only need to happen in one place.
 
 set -euo pipefail
 
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+INPUTS_DIR="${INPUTS_DIR:-${SCRIPT_DIR}/../../offline_inference/cosmos3/inputs}"
+INPUT_JSON="${INPUT_JSON:-${INPUTS_DIR}/t2i.json}"
+
 BASE_URL="${BASE_URL:-http://localhost:8091}"
 OUTPUT_PATH="${OUTPUT_PATH:-cosmos3_t2i.png}"
 
+if [ ! -f "${INPUT_JSON}" ]; then
+  echo "Missing input JSON: ${INPUT_JSON}" >&2
+  exit 1
+fi
+
+PROMPT="$(jq -r '.prompt' "${INPUT_JSON}")"
+HEIGHT="$(jq -r '.height // 960' "${INPUT_JSON}")"
+WIDTH="$(jq -r '.width // 960' "${INPUT_JSON}")"
+NUM_INFERENCE_STEPS="$(jq -r '.num_inference_steps // 50' "${INPUT_JSON}")"
+GUIDANCE_SCALE="$(jq -r '.guidance_scale // 4.0' "${INPUT_JSON}")"
+FLOW_SHIFT="$(jq -r '.flow_shift // 3.0' "${INPUT_JSON}")"
+
 curl -sS -X POST "${BASE_URL}/v1/images/generations" \
   -H "Content-Type: application/json" \
-  -d '{
-    "prompt": "A small warehouse robot carrying a blue box, clean product photography",
-    "size": "1024x1024",
-    "n": 1,
-    "num_inference_steps": 50,
-    "guidance_scale": 7.0,
-    "negative_prompt": "blurry, distorted, low quality",
-    "seed": 42
-  }' | jq -r '.data[0].b64_json' | base64 -d > "${OUTPUT_PATH}"
+  -d "$(jq -nc \
+        --arg prompt "${PROMPT}" \
+        --arg negative "blurry, distorted, low quality" \
+        --arg size "${WIDTH}x${HEIGHT}" \
+        --argjson steps "${NUM_INFERENCE_STEPS}" \
+        --argjson guidance "${GUIDANCE_SCALE}" \
+        --argjson flow_shift "${FLOW_SHIFT}" \
+        '{prompt:$prompt,
+          size:$size,
+          n:1,
+          num_inference_steps:$steps,
+          guidance_scale:$guidance,
+          flow_shift:$flow_shift,
+          negative_prompt:$negative,
+          seed:42}')" \
+  | jq -r '.data[0].b64_json' | base64 -d > "${OUTPUT_PATH}"
 
 echo "Saved image to ${OUTPUT_PATH}"
diff --git a/examples/online_serving/cosmos3/run_curl_t2v.sh b/examples/online_serving/cosmos3/run_curl_t2v.sh
old mode 100644
new mode 100755
index dc436b28277..c6cd147579f
--- a/examples/online_serving/cosmos3/run_curl_t2v.sh
+++ b/examples/online_serving/cosmos3/run_curl_t2v.sh
@@ -1,19 +1,41 @@
 #!/bin/bash
 # Cosmos3 text-to-video example using the sync video API.
+#
+# The prompt is loaded from the canonical input JSON shared with the offline
+# example so updates only need to happen in one place.
 
 set -euo pipefail
 
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+INPUTS_DIR="${INPUTS_DIR:-${SCRIPT_DIR}/../../offline_inference/cosmos3/inputs}"
+INPUT_JSON="${INPUT_JSON:-${INPUTS_DIR}/t2v.json}"
+
 BASE_URL="${BASE_URL:-http://localhost:8091}"
 OUTPUT_PATH="${OUTPUT_PATH:-cosmos3_t2v.mp4}"
 
+if [ ! -f "${INPUT_JSON}" ]; then
+  echo "Missing input JSON: ${INPUT_JSON}" >&2
+  exit 1
+fi
+
+PROMPT="$(jq -r '.prompt' "${INPUT_JSON}")"
+HEIGHT="$(jq -r '.height // 720' "${INPUT_JSON}")"
+WIDTH="$(jq -r '.width // 1280' "${INPUT_JSON}")"
+NUM_FRAMES="$(jq -r '.num_frames // 189' "${INPUT_JSON}")"
+FPS="$(jq -r '.fps // 24' "${INPUT_JSON}")"
+NUM_INFERENCE_STEPS="$(jq -r '.num_inference_steps // 35' "${INPUT_JSON}")"
+GUIDANCE_SCALE="$(jq -r '.guidance_scale // 6.0' "${INPUT_JSON}")"
+FLOW_SHIFT="$(jq -r '.flow_shift // 10.0' "${INPUT_JSON}")"
+
 curl -sS -X POST "${BASE_URL}/v1/videos/sync" \
-  -F "prompt=A small warehouse robot moves a blue box across a clean floor." \
+  --form-string "prompt=${PROMPT}" \
   -F "negative_prompt=blurry, distorted, low quality" \
-  -F "size=1280x720" \
-  -F "num_frames=81" \
-  -F "fps=24" \
-  -F "num_inference_steps=35" \
-  -F "guidance_scale=4.0" \
+  -F "size=${WIDTH}x${HEIGHT}" \
+  -F "num_frames=${NUM_FRAMES}" \
+  -F "fps=${FPS}" \
+  -F "num_inference_steps=${NUM_INFERENCE_STEPS}" \
+  -F "guidance_scale=${GUIDANCE_SCALE}" \
+  -F "flow_shift=${FLOW_SHIFT}" \
   -F "seed=42" \
   -o "${OUTPUT_PATH}"
 
diff --git a/examples/online_serving/cosmos3/run_curl_t2v_sound.sh b/examples/online_serving/cosmos3/run_curl_t2v_sound.sh
old mode 100644
new mode 100755
index 3c82f965b7d..e026dfa6c04
--- a/examples/online_serving/cosmos3/run_curl_t2v_sound.sh
+++ b/examples/online_serving/cosmos3/run_curl_t2v_sound.sh
@@ -1,21 +1,44 @@
 #!/bin/bash
-# Cosmos3 video-with-sound example. Requires a sound-capable checkpoint.
+# Cosmos3 video-with-sound example.
+#
+# The prompt is loaded from the canonical input JSON shared with the offline
+# example. sound_duration is read from the JSON when present.
 
 set -euo pipefail
 
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+INPUTS_DIR="${INPUTS_DIR:-${SCRIPT_DIR}/../../offline_inference/cosmos3/inputs}"
+INPUT_JSON="${INPUT_JSON:-${INPUTS_DIR}/t2v_sound.json}"
+
 BASE_URL="${BASE_URL:-http://localhost:8091}"
 OUTPUT_PATH="${OUTPUT_PATH:-cosmos3_t2v_sound.mp4}"
 
+if [ ! -f "${INPUT_JSON}" ]; then
+  echo "Missing input JSON: ${INPUT_JSON}" >&2
+  exit 1
+fi
+
+PROMPT="$(jq -r '.prompt' "${INPUT_JSON}")"
+SOUND_DURATION="$(jq -r '.sound_duration // 3.4' "${INPUT_JSON}")"
+HEIGHT="$(jq -r '.height // 720' "${INPUT_JSON}")"
+WIDTH="$(jq -r '.width // 1280' "${INPUT_JSON}")"
+NUM_FRAMES="$(jq -r '.num_frames // 189' "${INPUT_JSON}")"
+FPS="$(jq -r '.fps // 24' "${INPUT_JSON}")"
+NUM_INFERENCE_STEPS="$(jq -r '.num_inference_steps // 35' "${INPUT_JSON}")"
+GUIDANCE_SCALE="$(jq -r '.guidance_scale // 6.0' "${INPUT_JSON}")"
+FLOW_SHIFT="$(jq -r '.flow_shift // 10.0' "${INPUT_JSON}")"
+
 curl -sS -X POST "${BASE_URL}/v1/videos/sync" \
-  -F "prompt=A small warehouse robot rolls across the floor with soft motor sounds." \
+  --form-string "prompt=${PROMPT}" \
   -F "negative_prompt=blurry, distorted, low quality" \
-  -F "size=1280x720" \
-  -F "num_frames=81" \
-  -F "fps=24" \
-  -F "num_inference_steps=35" \
-  -F "guidance_scale=4.0" \
+  -F "size=${WIDTH}x${HEIGHT}" \
+  -F "num_frames=${NUM_FRAMES}" \
+  -F "fps=${FPS}" \
+  -F "num_inference_steps=${NUM_INFERENCE_STEPS}" \
+  -F "guidance_scale=${GUIDANCE_SCALE}" \
+  -F "flow_shift=${FLOW_SHIFT}" \
   -F "generate_sound=true" \
-  -F "sound_duration=3.4" \
+  -F "sound_duration=${SOUND_DURATION}" \
   -F "seed=42" \
   -o "${OUTPUT_PATH}"
 
diff --git a/tests/diffusion/models/cosmos3/test_cosmos3_pipeline.py b/tests/diffusion/models/cosmos3/test_cosmos3_pipeline.py
index f86e451fdaa..3a4b33962de 100644
--- a/tests/diffusion/models/cosmos3/test_cosmos3_pipeline.py
+++ b/tests/diffusion/models/cosmos3/test_cosmos3_pipeline.py
@@ -111,6 +111,34 @@ def test_preprocess_preserves_explicit_size_for_i2v(self) -> None:
 
         assert tuple(result.prompts[0]["additional_information"]["preprocessed_image"].shape[-2:]) == (64, 96)
 
+    def test_preprocess_action_video_stores_image_and_video_tensors(self) -> None:
+        from vllm_omni.diffusion.models.cosmos3.pipeline_cosmos3 import get_cosmos3_pre_process_func
+
+        frames = [
+            Image.new("RGB", (8, 4), "red"),
+            Image.new("RGB", (8, 4), "green"),
+            Image.new("RGB", (8, 4), "blue"),
+        ]
+        request = SimpleNamespace(
+            prompts=[
+                {
+                    "prompt": "Move the robot.",
+                    "multi_modal_data": {"video": frames},
+                }
+            ],
+            sampling_params=SimpleNamespace(
+                height=16,
+                width=32,
+                extra_args={"action_mode": "forward_dynamics"},
+            ),
+        )
+
+        result = get_cosmos3_pre_process_func(SimpleNamespace())(request)
+        additional = result.prompts[0]["additional_information"]
+
+        assert tuple(additional["preprocessed_image"].shape) == (1, 3, 16, 32)
+        assert tuple(additional["preprocessed_video"].shape) == (1, 3, 3, 16, 32)
+
     def test_postprocess_latent_passthrough_and_t2i_shape_validation(self) -> None:
         from vllm_omni.diffusion.models.cosmos3.pipeline_cosmos3 import get_cosmos3_post_process_func
 
@@ -950,6 +978,65 @@ def test_forward_action_defaults_to_reference_chunk_size(self, make_cosmos3_pipe
         assert captured["format"]["num_frames"] == 17
         assert captured["diffuse_calls"][0]["action_latents"].shape == (1, 16, 4)
 
+    def test_forward_forward_dynamics_uses_action_video_conditioning(self, make_cosmos3_pipeline) -> None:
+        pipeline = make_cosmos3_pipeline()
+        pipeline.transformer = pipeline.transformer.__class__(latent_channel_size=2, action_gen=True, action_dim=4)
+        captured = self._install_forward_stubs(pipeline)
+        video_tensor = torch.zeros(1, 3, 3, 16, 16)
+        condition_latents = torch.full((1, 2, 1, 2, 2), 6.0)
+
+        def fake_prepare_action_latents(**kwargs):
+            captured["prepare_action_latents"] = kwargs
+            action_latents = torch.zeros(1, 2, 4)
+            action_velocity_mask = torch.zeros(1, 2, 1)
+            clean_action = torch.zeros(1, 2, 4)
+            return action_latents, action_velocity_mask, clean_action, 2
+
+        def fake_prepare_action_video(video, mode, height, width, num_frames, generator):
+            captured["prepare_action_video"] = (video, mode, height, width, num_frames, generator.initial_seed())
+            latents = torch.zeros(1, 2, 1, 2, 2)
+            velocity_mask = torch.ones(1, 1, 1, 1, 1)
+            return latents, velocity_mask, condition_latents
+
+        def fail_prepare_i2v(*args, **kwargs):
+            raise AssertionError("forward_dynamics video input must not route through the i2v image path")
+
+        pipeline._prepare_action_latents = fake_prepare_action_latents  # type: ignore[method-assign]
+        pipeline._prepare_latents_action_video = fake_prepare_action_video  # type: ignore[method-assign]
+        pipeline._prepare_latents_i2v = fail_prepare_i2v  # type: ignore[method-assign]
+        req = SimpleNamespace(
+            prompts=[
+                {
+                    "prompt": "Move the robot.",
+                    "modalities": ["video"],
+                    "additional_information": {
+                        "preprocessed_image": torch.zeros(1, 3, 16, 16),
+                        "preprocessed_video": video_tensor,
+                    },
+                }
+            ],
+            sampling_params=make_sampling_params(
+                height=16,
+                width=16,
+                num_frames=3,
+                extra_args={
+                    "action_mode": "forward_dynamics",
+                    "action_chunk_size": 2,
+                    "domain_id": 0,
+                },
+            ),
+        )
+
+        pipeline.forward(req)
+
+        prepared_video, mode, height, width, num_frames, seed = captured["prepare_action_video"]
+        assert prepared_video is video_tensor
+        assert (mode, height, width, num_frames, seed) == ("forward_dynamics", 16, 16, 3, 123)
+        assert captured["prepare_action_latents"]["mode"] == "forward_dynamics"
+        diffuse_call = captured["diffuse_calls"][0]
+        assert diffuse_call["condition_latents"] is condition_latents
+        torch.testing.assert_close(diffuse_call["image_latent"], condition_latents[:, :, 0:1])
+
     def test_forward_video_sound_decodes_and_returns_audio_payload(self, make_cosmos3_pipeline) -> None:
         pipeline = make_cosmos3_pipeline()
         pipeline.transformer = pipeline.transformer.__class__(latent_channel_size=2, sound_gen=True, sound_dim=3)
diff --git a/tests/diffusion/models/cosmos3/test_cosmos3_transformer.py b/tests/diffusion/models/cosmos3/test_cosmos3_transformer.py
index 1fb870b7d97..c25a5229290 100644
--- a/tests/diffusion/models/cosmos3/test_cosmos3_transformer.py
+++ b/tests/diffusion/models/cosmos3/test_cosmos3_transformer.py
@@ -164,7 +164,7 @@ def test_cosmos3_hsdp_conditions_match_und_and_gen_blocks() -> None:
 def test_cosmos3_transformer_exposes_layerwise_offload_and_repeated_blocks() -> None:
     from vllm_omni.diffusion.models.cosmos3.transformer_cosmos3 import Cosmos3VFMTransformer
 
-    assert Cosmos3VFMTransformer._layerwise_offload_blocks_attr == "gen_layers"
+    assert Cosmos3VFMTransformer._layerwise_offload_blocks_attrs == ["gen_layers"]
     assert Cosmos3VFMTransformer._repeated_blocks == ["Cosmos3GenDecoderLayer"]
 
 
diff --git a/tests/diffusion/models/test_cosmos3_guardrails.py b/tests/diffusion/models/test_cosmos3_guardrails.py
index 53b03114200..2e3457e174a 100644
--- a/tests/diffusion/models/test_cosmos3_guardrails.py
+++ b/tests/diffusion/models/test_cosmos3_guardrails.py
@@ -5,7 +5,6 @@
 
 import pytest
 import torch
-from transformers.modeling_outputs import BaseModelOutputWithPooling
 from transformers.tokenization_utils_base import BatchEncoding
 
 pytestmark = [pytest.mark.core_model, pytest.mark.cpu, pytest.mark.diffusion]
@@ -73,39 +72,3 @@ def test_qwen_guardrail_generation_accepts_tensor_input_ids() -> None:
     assert len(args) == 1
     assert torch.equal(args[0], input_ids)
     assert kwargs == {"max_new_tokens": 128}
-
-
-def test_siglip_feature_extraction_accepts_tensor() -> None:
-    from vllm_omni.diffusion.models.cosmos3.guardrails import _extract_siglip_image_features
-
-    features = torch.randn(1, 1152)
-
-    assert _extract_siglip_image_features(features) is features
-
-
-def test_siglip_feature_extraction_accepts_base_model_output_with_pooling() -> None:
-    from vllm_omni.diffusion.models.cosmos3.guardrails import _extract_siglip_image_features
-
-    last_hidden_state = torch.randn(1, 729, 1152)
-    pooler_output = torch.randn(1, 1152)
-    output = BaseModelOutputWithPooling(last_hidden_state=last_hidden_state, pooler_output=pooler_output)
-
-    assert _extract_siglip_image_features(output) is pooler_output
-
-
-def test_siglip_feature_extraction_accepts_tuple_output() -> None:
-    from vllm_omni.diffusion.models.cosmos3.guardrails import _extract_siglip_image_features
-
-    last_hidden_state = torch.randn(1, 729, 1152)
-    pooler_output = torch.randn(1, 1152)
-
-    assert _extract_siglip_image_features((last_hidden_state, pooler_output)) is pooler_output
-
-
-def test_siglip_feature_extraction_rejects_unpooled_features() -> None:
-    from vllm_omni.diffusion.models.cosmos3.guardrails import _extract_siglip_image_features
-
-    last_hidden_state = torch.randn(1, 729, 1152)
-
-    with pytest.raises(TypeError, match="pooled features"):
-        _extract_siglip_image_features(last_hidden_state)
diff --git a/vllm_omni/diffusion/models/cosmos3/guardrails.py b/vllm_omni/diffusion/models/cosmos3/guardrails.py
index e739d17b962..0aaac1a7639 100644
--- a/vllm_omni/diffusion/models/cosmos3/guardrails.py
+++ b/vllm_omni/diffusion/models/cosmos3/guardrails.py
@@ -3,7 +3,7 @@
 """Cosmos3 guardrail hooks for vllm-omni.
 
 Text: Blocklist (keyword matching) + Qwen3Guard (0.6B LLM classifier)
-Video: SigLIP-based content safety filter + RetinaFace face blur
+Video: RetinaFace face blur
 
 Enable via custom_pipeline_args or the test script:
     python test_cosmos3.py --model ... --guardrails
@@ -19,7 +19,6 @@
 import cv2
 import numpy as np
 import torch
-import torch.nn as nn
 from vllm.logger import init_logger
 
 from vllm_omni.diffusion.data import GuardrailViolationError
@@ -36,7 +35,6 @@
 
 GUARDRAIL_HF_REPO = "nvidia/Cosmos-Guardrail1"
 GUARDRAIL_HF_REVISION = "d6d4bfa899a71454a700907664f3e88f503950cf"
-CUTOFF_UNSAFE_FRAMES_PERCENT = 10
 
 
 def set_text_guardrail(fn: TextGuardrailFn) -> None:
@@ -49,38 +47,6 @@ def set_video_guardrail(fn: VideoGuardrailFn) -> None:
     _video_guardrail = fn
 
 
-# ---------------------------------------------------------------------------
-# Video safety classifier (matches reference: SigLIP so400m + 3-layer head)
-# ---------------------------------------------------------------------------
-class SafetyClassifier(nn.Module):
-    """3-layer classifier with BatchNorm (1152 → 512 → 256 → 7)."""
-
-    def __init__(self, input_size: int = 1152, num_classes: int = 7):
-        super().__init__()
-        self.layers = nn.Sequential(
-            nn.Linear(input_size, 512),
-            nn.BatchNorm1d(512),
-            nn.ReLU(),
-            nn.Linear(512, 256),
-            nn.BatchNorm1d(256),
-            nn.ReLU(),
-            nn.Linear(256, num_classes),
-        )
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        return self.layers(x)
-
-
-CLASS_IDX_TO_NAME = {
-    0: "Safe",
-    1: "Sexual_Content",
-    3: "Drugs",
-    4: "Child_Abuse",
-    5: "Hate_and_Harassment",
-    6: "Self-Harm",
-}
-
-
 # ---------------------------------------------------------------------------
 # Face pixelation utility
 # ---------------------------------------------------------------------------
@@ -158,44 +124,6 @@ def _generate_qwen_guardrail_response(prompt: str, tokenizer: Any, model: Any, d
     )
 
 
-def _extract_siglip_image_features(features: object) -> torch.Tensor:
-    def _validate_features(tensor: torch.Tensor) -> torch.Tensor:
-        if tensor.dim() != 2:
-            raise TypeError(
-                "SigLIP image feature extractor returned features with shape "
-                f"{tuple(tensor.shape)}; expected pooled features with shape [batch, hidden]."
-            )
-        return tensor
-
-    if isinstance(features, torch.Tensor):
-        return _validate_features(features)
-
-    pooler_output = getattr(features, "pooler_output", None)
-    if isinstance(pooler_output, torch.Tensor):
-        return _validate_features(pooler_output)
-
-    image_embeds = getattr(features, "image_embeds", None)
-    if isinstance(image_embeds, torch.Tensor):
-        return _validate_features(image_embeds)
-
-    if isinstance(features, Mapping):
-        for key in ("pooler_output", "image_embeds"):
-            value = features.get(key)
-            if isinstance(value, torch.Tensor):
-                return _validate_features(value)
-
-    if isinstance(features, list | tuple):
-        if len(features) > 1 and isinstance(features[1], torch.Tensor):
-            return _validate_features(features[1])
-        if features and isinstance(features[0], torch.Tensor):
-            return _validate_features(features[0])
-
-    raise TypeError(
-        "SigLIP image feature extractor returned unsupported output type "
-        f"{type(features).__name__}; expected a tensor or output with pooled image features."
-    )
-
-
 def _build_text_guardrail(offload_to_cpu: bool) -> TextGuardrailFn:
     checkers: list[Callable[[str], tuple[bool, str]]] = []
 
@@ -273,7 +201,6 @@ def text_guardrail(prompt: str) -> None:
 
 def _build_video_guardrail(offload_to_cpu: bool) -> VideoGuardrailFn:
     ckpt_dir = _download_checkpoint()
-    safety_checker: Callable[[np.ndarray], tuple[bool, str]] | None = None
     face_blurrer: Callable[[np.ndarray], np.ndarray] | None = None
 
     # `offload_to_cpu` controls idle weight placement only; the forward pass
@@ -281,62 +208,7 @@ def _build_video_guardrail(offload_to_cpu: bool) -> VideoGuardrailFn:
     compute_device = "cuda"
     idle_device = "cpu" if offload_to_cpu else compute_device
 
-    # 1. Video content safety filter: SigLIP so400m + SafetyClassifier
-    try:
-        from PIL import Image
-        from transformers import SiglipModel, SiglipProcessor
-
-        siglip_id = "google/siglip-so400m-patch14-384"
-        siglip_model = SiglipModel.from_pretrained(siglip_id).to(idle_device, dtype=torch.float32).eval()
-        siglip_processor = SiglipProcessor.from_pretrained(siglip_id)
-
-        classifier = SafetyClassifier(input_size=1152, num_classes=7)
-        ckpt_path = os.path.join(ckpt_dir, "video_content_safety_filter", "safety_filter.pt")
-        checkpoint = torch.load(ckpt_path, map_location="cpu", weights_only=True)
-        # Checkpoint keys have "network." prefix from the VideoSafetyModel wrapper.
-        state = {k.removeprefix("network."): v for k, v in checkpoint["model"].items()}
-        classifier.load_state_dict(state)
-        classifier = classifier.to(idle_device, dtype=torch.float32).eval()
-
-        def _safety_check(frames: np.ndarray) -> tuple[bool, str]:
-            nonlocal siglip_model, classifier
-            total = len(frames)
-            if total == 0:
-                return True, ""
-            if offload_to_cpu:
-                siglip_model = siglip_model.to(compute_device)
-                classifier = classifier.to(compute_device)
-
-            unsafe_count = 0
-            try:
-                for frame in frames:
-                    img = Image.fromarray(frame)
-                    inputs = siglip_processor(images=img, return_tensors="pt").to(compute_device, dtype=torch.float32)
-                    with torch.no_grad():
-                        features = siglip_model.get_image_features(**inputs)
-                        features = _extract_siglip_image_features(features)
-                        features = torch.nn.functional.normalize(features, p=2, dim=-1)
-                        logits = classifier(features)
-                        pred = logits.argmax(dim=-1).item()
-                    class_name = CLASS_IDX_TO_NAME.get(pred, "Unknown")
-                    if class_name != "Safe":
-                        unsafe_count += 1
-            finally:
-                if offload_to_cpu:
-                    siglip_model = siglip_model.to("cpu")
-                    classifier = classifier.to("cpu")
-
-            if unsafe_count / total > CUTOFF_UNSAFE_FRAMES_PERCENT / 100:
-                return False, f"Video content safety: {unsafe_count}/{total} frames unsafe"
-            return True, ""
-
-        safety_checker = _safety_check
-        if _is_rank_zero():
-            logger.info("Video content safety filter loaded (SigLIP so400m + classifier)")
-    except (ImportError, FileNotFoundError) as e:
-        logger.warning("Could not load video safety filter: %s", e)
-
-    # 2. Face blur: RetinaFace + pixelation
+    # Face blur: RetinaFace + pixelation
     try:
         from retinaface.data import cfg_re50
         from retinaface.layers.functions.prior_box import PriorBox
@@ -452,10 +324,6 @@ def _face_blur(frames: np.ndarray) -> np.ndarray:
         logger.warning("Could not load face blur filter: %s", e)
 
     def video_guardrail(frames: np.ndarray) -> np.ndarray:
-        if safety_checker is not None:
-            is_safe, msg = safety_checker(frames)
-            if not is_safe:
-                raise GuardrailViolationError(f"Guardrail blocked video: {msg}")
         if face_blurrer is not None:
             frames = face_blurrer(frames)
         return frames
diff --git a/vllm_omni/diffusion/models/cosmos3/pipeline_cosmos3.py b/vllm_omni/diffusion/models/cosmos3/pipeline_cosmos3.py
index 2dacb80f948..f05d036c525 100644
--- a/vllm_omni/diffusion/models/cosmos3/pipeline_cosmos3.py
+++ b/vllm_omni/diffusion/models/cosmos3/pipeline_cosmos3.py
@@ -1651,7 +1651,6 @@ def forward(
         flow_shift_target = float(self._get_sp_param(sp, "flow_shift", default_flow_shift))
         guidance_interval = self._get_sp_param(sp, "guidance_interval", default_guidance_interval)
 
-        seed = sp.seed if sp.seed is not None else 42
         frame_rate = self._get_sp_param(sp, "resolved_frame_rate") or self._get_sp_param(sp, "frame_rate") or 24.0
         max_sequence_length = self._get_sp_param(sp, "max_sequence_length", 512) or 512
         use_system_prompt = bool(self._get_sp_param(sp, "use_system_prompt", False))
@@ -1681,7 +1680,10 @@ def forward(
         # transitions restore the right schedule (no T2I to T2V leak).
         self._set_flow_shift(flow_shift_target)
 
-        generator = torch.Generator(device=self.device).manual_seed(seed)
+        generator = sp.generator
+        if generator is None:
+            seed = sp.seed if sp.seed is not None else 42
+            generator = torch.Generator(device=self.device).manual_seed(seed)
 
         # --- Format prompts & tokenize (B=1; reused across loop iterations
         # for T2I num_outputs_per_prompt > 1) ---

From 4fa8f655858d9a23837eb3e12add7f2862ac369d Mon Sep 17 00:00:00 2001
From: Maciej Bala <mbala@nvidia.com>
Date: Tue, 19 May 2026 18:28:26 +0200
Subject: [PATCH 16/41] Doc cleanup

Signed-off-by: Maciej Bala <mbala@nvidia.com>
---
 docs/user_guide/diffusion_features.md | 1 -
 1 file changed, 1 deletion(-)

diff --git a/docs/user_guide/diffusion_features.md b/docs/user_guide/diffusion_features.md
index 9967eac3385..52c04fd8c17 100644
--- a/docs/user_guide/diffusion_features.md
+++ b/docs/user_guide/diffusion_features.md
@@ -151,7 +151,6 @@ The following tables show which models support each feature:
 | **HunyuanVideo-1.5 T2V I2V** |     ❌     |     ✅      |           ❌           |       ✅        |         ✅         |         ❌         |   ✅    |             ✅             |      ✅ (decode)      |       ✅        |        ❌         |
 | **DreamID-Omni**             |     ❌     |     ❌      |           ❌           |       ✅        |         ❌         |         ❌         |   ✅    |             ✅             |          ❌           |       ❌        |        ❌         |
 | **Cosmos3**                  |     ❌     |     ✅      |           ✅           |       ✅        |         ✅         |         ❌         |   ✅    |             ✅             |  ✅ (encode/decode)   |       ✅        |        ❌         |
-| **Cosmos3**                  | ❌ | ✅ | ✅ (Ulysses) | ✅ | ✅ | ✅ | ✅ | ✅ (encode/decode) | ✅ | ❌ |
 
 > Notes:
 > 1. Cosmos3 T2V and I2V use `Cosmos3OmniDiffusersPipeline` with video output. I2V is selected when the request includes an input image. Model-level CPU offload is not supported; use layerwise offload.

From 3ac4c71f0ff15924e6a392d1b3a79e2d2adb455e Mon Sep 17 00:00:00 2001
From: Maciej Bala <mbala@nvidia.com>
Date: Tue, 19 May 2026 18:29:06 +0200
Subject: [PATCH 17/41] Doc cleanup v2

Signed-off-by: Maciej Bala <mbala@nvidia.com>
---
 docs/user_guide/diffusion_features.md | 2 --
 1 file changed, 2 deletions(-)

diff --git a/docs/user_guide/diffusion_features.md b/docs/user_guide/diffusion_features.md
index 52c04fd8c17..c14ad293a2e 100644
--- a/docs/user_guide/diffusion_features.md
+++ b/docs/user_guide/diffusion_features.md
@@ -152,8 +152,6 @@ The following tables show which models support each feature:
 | **DreamID-Omni**             |     ❌     |     ❌      |           ❌           |       ✅        |         ❌         |         ❌         |   ✅    |             ✅             |          ❌           |       ❌        |        ❌         |
 | **Cosmos3**                  |     ❌     |     ✅      |           ✅           |       ✅        |         ✅         |         ❌         |   ✅    |             ✅             |  ✅ (encode/decode)   |       ✅        |        ❌         |
 
-> Notes:
-> 1. Cosmos3 T2V and I2V use `Cosmos3OmniDiffusersPipeline` with video output. I2V is selected when the request includes an input image. Model-level CPU offload is not supported; use layerwise offload.
 
 **Frame Interpolation Support**
 

From b16e4c7e86a1dc809eebf31b1ab1af5a10e5b98c Mon Sep 17 00:00:00 2001
From: Maciej Bala <mbala@nvidia.com>
Date: Tue, 19 May 2026 19:29:06 +0200
Subject: [PATCH 18/41] Updated deploy config

Signed-off-by: Maciej Bala <mbala@nvidia.com>
---
 .../examples/online_serving/cosmos3.md        | 18 ++++++++
 examples/online_serving/cosmos3/README.md     | 13 ++++--
 .../cosmos3/cosmos3_no_guardrails.yaml        | 41 ++++++-------------
 examples/online_serving/cosmos3/run_server.sh |  6 ++-
 vllm_omni/config/pipeline_registry.py         |  4 ++
 vllm_omni/deploy/cosmos3.yaml                 | 14 +++++++
 .../diffusion/models/cosmos3/pipeline.py      | 26 ++++++++++++
 vllm_omni/entrypoints/utils.py                |  1 +
 8 files changed, 90 insertions(+), 33 deletions(-)
 create mode 100644 vllm_omni/deploy/cosmos3.yaml
 create mode 100644 vllm_omni/diffusion/models/cosmos3/pipeline.py

diff --git a/docs/user_guide/examples/online_serving/cosmos3.md b/docs/user_guide/examples/online_serving/cosmos3.md
index d61deb427cc..25fec71fe33 100644
--- a/docs/user_guide/examples/online_serving/cosmos3.md
+++ b/docs/user_guide/examples/online_serving/cosmos3.md
@@ -20,6 +20,24 @@ bash run_server.sh
 - `ENABLE_LAYERWISE_OFFLOAD`: set to `1` to enable layerwise offload
 - `CFG_PARALLEL_SIZE`, `TENSOR_PARALLEL_SIZE`, `ULYSSES_DEGREE`, `USE_HSDP`: parallel execution controls
 - `ALLOWED_LOCAL_MEDIA_PATH`: local media access path, defaults to `/`
+- `DEPLOY_CONFIG`: optional deploy YAML override; defaults to the bundled Cosmos3 deploy config
+
+## Disabling Guardrails
+
+Cosmos3 ships with prompt and generated-output guardrails. To skip loading guardrail models for the whole server, start with the no-guardrails deploy override:
+
+```bash
+vllm serve nvidia/Cosmos3-Nano --omni \
+  --deploy-config examples/online_serving/cosmos3/cosmos3_no_guardrails.yaml \
+  --port 8091
+```
+
+The helper script accepts the same override through `DEPLOY_CONFIG`:
+
+```bash
+cd examples/online_serving/cosmos3
+DEPLOY_CONFIG=cosmos3_no_guardrails.yaml bash run_server.sh
+```
 
 ## Text-to-Image
 
diff --git a/examples/online_serving/cosmos3/README.md b/examples/online_serving/cosmos3/README.md
index d027618bbb3..2eb5e06f6fa 100644
--- a/examples/online_serving/cosmos3/README.md
+++ b/examples/online_serving/cosmos3/README.md
@@ -17,6 +17,7 @@ bash run_server.sh
 - `ENABLE_LAYERWISE_OFFLOAD`: set to `1` to enable layerwise offload
 - `CFG_PARALLEL_SIZE`, `TENSOR_PARALLEL_SIZE`, `ULYSSES_DEGREE`, `USE_HSDP`: parallel execution controls
 - `ALLOWED_LOCAL_MEDIA_PATH`: local media access path, defaults to `/`
+- `DEPLOY_CONFIG`: optional deploy YAML override; defaults to the bundled Cosmos3 deploy config
 
 ## Disabling guardrails
 
@@ -24,15 +25,21 @@ Cosmos3 ships with safety guardrails that check prompts and apply generated-outp
 
 ### Server-wide (skip loading guardrail models entirely)
 
-Start the server with `--stage-configs-path cosmos3_no_guardrails.yaml`, which sets `model_config.guardrails: false` on the diffusion stage so the guardrail models are never loaded:
+Start the server with `--deploy-config cosmos3_no_guardrails.yaml`, which sets `model_config.guardrails: false` on the diffusion stage so the guardrail models are never loaded:
 
 ```bash
 vllm serve nvidia/Cosmos3-Nano --omni \
-  --model-class-name Cosmos3OmniDiffusersPipeline \
-  --stage-configs-path examples/online_serving/cosmos3/cosmos3_no_guardrails.yaml \
+  --deploy-config examples/online_serving/cosmos3/cosmos3_no_guardrails.yaml \
   --port 8091
 ```
 
+The same override can be used with the helper script:
+
+```bash
+cd examples/online_serving/cosmos3
+DEPLOY_CONFIG=cosmos3_no_guardrails.yaml bash run_server.sh
+```
+
 Other CLI flags (parallelism, cache backend, layerwise offload, etc.) are still honored; the YAML only overrides the guardrail toggle. When this path is used, per-request overrides cannot turn guardrails back on — the underlying models are not in memory.
 
 ### Per-request (skip checks for a single generation)
diff --git a/examples/online_serving/cosmos3/cosmos3_no_guardrails.yaml b/examples/online_serving/cosmos3/cosmos3_no_guardrails.yaml
index c677bfaf294..858a3b2ab6c 100644
--- a/examples/online_serving/cosmos3/cosmos3_no_guardrails.yaml
+++ b/examples/online_serving/cosmos3/cosmos3_no_guardrails.yaml
@@ -1,36 +1,19 @@
-# Cosmos3 stage config that disables guardrails at server startup.
+# Cosmos3 deploy override that disables guardrails at server startup.
 #
 # Usage:
 #   vllm serve nvidia/Cosmos3-Nano --omni \
-#     --model-class-name Cosmos3OmniDiffusersPipeline \
-#     --stage-configs-path examples/online_serving/cosmos3/cosmos3_no_guardrails.yaml \
+#     --deploy-config examples/online_serving/cosmos3/cosmos3_no_guardrails.yaml \
 #     --port 8091
 #
-# The legacy `stage_args:` schema is used because Cosmos3 is not yet declared
-# in `vllm_omni/config/pipeline_registry.py`, so the newer `--deploy-config`
-# path returns no stages for this model and falls back to the default-diffusion
-# factory which does not honor user-supplied model_config overrides. The
-# legacy `--stage-configs-path` loader merges this file directly into the
-# stage's engine_args, so `model_config.guardrails: false` reaches
-# `OmniDiffusionConfig.model_config` and gates `ensure_initialized()` at
-# pipeline build time.
-#
-# Other CLI flags (parallelism, cache backend, layerwise offload, etc.) are
-# still honored: this file only sets the fields it explicitly overrides; the
-# rest fall back to CLI / OmniDiffusionConfig defaults.
 
-stage_args:
+async_chunk: false
+trust_remote_code: true
+
+stages:
   - stage_id: 0
-    stage_type: diffusion
-    runtime:
-      devices: "0"
-    engine_args:
-      model_class_name: Cosmos3OmniDiffusersPipeline
-      max_num_seqs: 1
-      enforce_eager: true
-      trust_remote_code: true
-      model_config:
-        guardrails: false
-        offload_guardrail_models: false
-    final_output: true
-    final_output_type: image
+    max_num_seqs: 1
+    enforce_eager: true
+    model_class_name: Cosmos3OmniDiffusersPipeline
+    model_config:
+      guardrails: false
+      offload_guardrail_models: false
diff --git a/examples/online_serving/cosmos3/run_server.sh b/examples/online_serving/cosmos3/run_server.sh
index c45685c6c0d..9b75e810d34 100644
--- a/examples/online_serving/cosmos3/run_server.sh
+++ b/examples/online_serving/cosmos3/run_server.sh
@@ -12,17 +12,21 @@ TENSOR_PARALLEL_SIZE="${TENSOR_PARALLEL_SIZE:-1}"
 ULYSSES_DEGREE="${ULYSSES_DEGREE:-1}"
 USE_HSDP="${USE_HSDP:-0}"
 ALLOWED_LOCAL_MEDIA_PATH="${ALLOWED_LOCAL_MEDIA_PATH:-/}"
+DEPLOY_CONFIG="${DEPLOY_CONFIG:-}"
 
 args=(
   vllm serve "${MODEL}"
   --omni
   --port "${PORT}"
-  --model-class-name Cosmos3OmniDiffusersPipeline
   --allowed-local-media-path "${ALLOWED_LOCAL_MEDIA_PATH}"
   --cfg-parallel-size "${CFG_PARALLEL_SIZE}"
   --tensor-parallel-size "${TENSOR_PARALLEL_SIZE}"
 )
 
+if [ -n "${DEPLOY_CONFIG}" ]; then
+  args+=(--deploy-config "${DEPLOY_CONFIG}")
+fi
+
 if [ "${ULYSSES_DEGREE}" != "1" ]; then
   args+=(--usp "${ULYSSES_DEGREE}")
 fi
diff --git a/vllm_omni/config/pipeline_registry.py b/vllm_omni/config/pipeline_registry.py
index 555f35e173a..046aa89f82c 100644
--- a/vllm_omni/config/pipeline_registry.py
+++ b/vllm_omni/config/pipeline_registry.py
@@ -33,6 +33,10 @@
 # --- Multi-stage omni pipelines (LLM-centric; audio / video I/O) ---
 _OMNI_PIPELINES: dict[str, tuple[str, str]] = {
     # model_type -> (module_path, variable_name)
+    "cosmos3": (
+        "vllm_omni.diffusion.models.cosmos3.pipeline",
+        "COSMOS3_PIPELINE",
+    ),
     "qwen2_5_omni": (
         "vllm_omni.model_executor.models.qwen2_5_omni.pipeline",
         "QWEN2_5_OMNI_PIPELINE",
diff --git a/vllm_omni/deploy/cosmos3.yaml b/vllm_omni/deploy/cosmos3.yaml
new file mode 100644
index 00000000000..2f3ed85a797
--- /dev/null
+++ b/vllm_omni/deploy/cosmos3.yaml
@@ -0,0 +1,14 @@
+# Cosmos3 single-stage diffusion deploy config.
+#
+# This config is auto-loaded for Diffusers repos whose model_index.json has
+# _class_name: Cosmos3OmniDiffusersPipeline. Pass --deploy-config only for
+# local overrides such as disabling guardrails.
+
+async_chunk: false
+trust_remote_code: true
+
+stages:
+  - stage_id: 0
+    max_num_seqs: 1
+    enforce_eager: true
+    model_class_name: Cosmos3OmniDiffusersPipeline
diff --git a/vllm_omni/diffusion/models/cosmos3/pipeline.py b/vllm_omni/diffusion/models/cosmos3/pipeline.py
new file mode 100644
index 00000000000..a6c84959586
--- /dev/null
+++ b/vllm_omni/diffusion/models/cosmos3/pipeline.py
@@ -0,0 +1,26 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Cosmos3 deploy-schema topology."""
+
+from vllm_omni.config.stage_config import (
+    PipelineConfig,
+    StageExecutionType,
+    StagePipelineConfig,
+)
+
+COSMOS3_PIPELINE = PipelineConfig(
+    model_type="cosmos3",
+    model_arch="Cosmos3ForConditionalGeneration",
+    hf_architectures=("Cosmos3ForConditionalGeneration",),
+    diffusers_class_name="Cosmos3OmniDiffusersPipeline",
+    stages=(
+        StagePipelineConfig(
+            stage_id=0,
+            model_stage="diffusion",
+            execution_type=StageExecutionType.DIFFUSION,
+            input_sources=(),
+            final_output=True,
+            final_output_type="image",
+        ),
+    ),
+)
diff --git a/vllm_omni/entrypoints/utils.py b/vllm_omni/entrypoints/utils.py
index de7eb5f4c7e..79351a3266b 100644
--- a/vllm_omni/entrypoints/utils.py
+++ b/vllm_omni/entrypoints/utils.py
@@ -36,6 +36,7 @@ def _warn_deprecated_explicit_keys(kwargs: dict[str, Any]) -> None:
 
 
 _DIFFUSERS_CLASS_TO_CONFIG: dict[str, str] = {
+    "Cosmos3OmniDiffusersPipeline": "cosmos3",
     "GlmImagePipeline": "glm_image",
 }
 

From 149700e3a8bc0192be5dc8c73319ed61a149f48e Mon Sep 17 00:00:00 2001
From: Maciej Bala <mbala@nvidia.com>
Date: Wed, 20 May 2026 11:31:55 +0200
Subject: [PATCH 19/41] Removed examples for now

Signed-off-by: Maciej Bala <mbala@nvidia.com>
---
 .gitignore                                    |   2 -
 docs/.nav.yml                                 |   2 -
 .../diffusion/cache_acceleration/cache_dit.md |  20 -
 .../diffusion/cpu_offload_diffusion.md        |   2 -
 .../diffusion/parallelism/cfg_parallel.md     |   9 -
 .../examples/offline_inference/cosmos3.md     |  90 --
 .../examples/online_serving/cosmos3.md        | 110 ---
 examples/offline_inference/cosmos3/README.md  | 162 ----
 examples/offline_inference/cosmos3/end2end.py | 807 ------------------
 .../inputs/action_forward_dynamics_av.json    |  17 -
 .../action_forward_dynamics_camera.jsonl      |   2 -
 .../inputs/action_forward_dynamics_robot.json |  17 -
 .../inputs/action_inverse_dynamics_av.json    |  16 -
 .../inputs/action_inverse_dynamics_robot.json |  16 -
 .../cosmos3/inputs/action_policy_av.json      |  16 -
 .../cosmos3/inputs/action_policy_robot.json   |  16 -
 .../offline_inference/cosmos3/inputs/i2v.json |  11 -
 .../offline_inference/cosmos3/inputs/t2i.json |   9 -
 .../offline_inference/cosmos3/inputs/t2v.json |  10 -
 .../cosmos3/inputs/t2v_sound.json             |  12 -
 .../image_to_video/image_to_video.py          |  70 +-
 .../text_to_image/text_to_image.py            |   3 +-
 .../text_to_video/text_to_video.py            |  29 +-
 examples/online_serving/cosmos3/README.md     | 164 ----
 .../cosmos3/cosmos3_no_guardrails.yaml        |  19 -
 .../run_curl_action_forward_dynamics_av.sh    |  73 --
 ...run_curl_action_forward_dynamics_camera.sh |  76 --
 .../run_curl_action_forward_dynamics_robot.sh |  77 --
 .../run_curl_action_inverse_dynamics_av.sh    | 106 ---
 .../run_curl_action_inverse_dynamics_robot.sh | 106 ---
 .../cosmos3/run_curl_action_policy.sh         | 108 ---
 .../cosmos3/run_curl_action_policy_av.sh      | 107 ---
 .../online_serving/cosmos3/run_curl_i2v.sh    |  50 --
 .../online_serving/cosmos3/run_curl_t2i.sh    |  47 -
 .../online_serving/cosmos3/run_curl_t2v.sh    |  42 -
 .../cosmos3/run_curl_t2v_sound.sh             |  45 -
 examples/online_serving/cosmos3/run_server.sh |  47 -
 37 files changed, 23 insertions(+), 2492 deletions(-)
 delete mode 100644 docs/user_guide/examples/offline_inference/cosmos3.md
 delete mode 100644 docs/user_guide/examples/online_serving/cosmos3.md
 delete mode 100644 examples/offline_inference/cosmos3/README.md
 delete mode 100644 examples/offline_inference/cosmos3/end2end.py
 delete mode 100644 examples/offline_inference/cosmos3/inputs/action_forward_dynamics_av.json
 delete mode 100644 examples/offline_inference/cosmos3/inputs/action_forward_dynamics_camera.jsonl
 delete mode 100644 examples/offline_inference/cosmos3/inputs/action_forward_dynamics_robot.json
 delete mode 100644 examples/offline_inference/cosmos3/inputs/action_inverse_dynamics_av.json
 delete mode 100644 examples/offline_inference/cosmos3/inputs/action_inverse_dynamics_robot.json
 delete mode 100644 examples/offline_inference/cosmos3/inputs/action_policy_av.json
 delete mode 100644 examples/offline_inference/cosmos3/inputs/action_policy_robot.json
 delete mode 100644 examples/offline_inference/cosmos3/inputs/i2v.json
 delete mode 100644 examples/offline_inference/cosmos3/inputs/t2i.json
 delete mode 100644 examples/offline_inference/cosmos3/inputs/t2v.json
 delete mode 100644 examples/offline_inference/cosmos3/inputs/t2v_sound.json
 delete mode 100644 examples/online_serving/cosmos3/README.md
 delete mode 100644 examples/online_serving/cosmos3/cosmos3_no_guardrails.yaml
 delete mode 100755 examples/online_serving/cosmos3/run_curl_action_forward_dynamics_av.sh
 delete mode 100755 examples/online_serving/cosmos3/run_curl_action_forward_dynamics_camera.sh
 delete mode 100755 examples/online_serving/cosmos3/run_curl_action_forward_dynamics_robot.sh
 delete mode 100755 examples/online_serving/cosmos3/run_curl_action_inverse_dynamics_av.sh
 delete mode 100755 examples/online_serving/cosmos3/run_curl_action_inverse_dynamics_robot.sh
 delete mode 100755 examples/online_serving/cosmos3/run_curl_action_policy.sh
 delete mode 100755 examples/online_serving/cosmos3/run_curl_action_policy_av.sh
 delete mode 100755 examples/online_serving/cosmos3/run_curl_i2v.sh
 delete mode 100755 examples/online_serving/cosmos3/run_curl_t2i.sh
 delete mode 100755 examples/online_serving/cosmos3/run_curl_t2v.sh
 delete mode 100755 examples/online_serving/cosmos3/run_curl_t2v_sound.sh
 delete mode 100644 examples/online_serving/cosmos3/run_server.sh

diff --git a/.gitignore b/.gitignore
index 06ff3f5667e..378de441c7e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -241,9 +241,7 @@ datasets/
 *.json
 !tests/dfx/perf/tests/*.json
 !apps/ComfyUI-vLLM-Omni/example_workflows/*.json
-!examples/offline_inference/cosmos3/inputs/*.json
 *.jsonl
-!examples/offline_inference/cosmos3/inputs/*.jsonl
 *.parquet
 
 # Output files
diff --git a/docs/.nav.yml b/docs/.nav.yml
index 7392c9f5fc7..a60a4368dd4 100644
--- a/docs/.nav.yml
+++ b/docs/.nav.yml
@@ -18,7 +18,6 @@ nav:
     - Offline Inference:
       - BAGEL-7B-MoT: user_guide/examples/offline_inference/bagel.md
       - GLM-Image Multistage End-to-End Inference: user_guide/examples/offline_inference/glm_image.md
-      - Cosmos3: user_guide/examples/offline_inference/cosmos3.md
       - Helios Video Generation: user_guide/examples/offline_inference/helios.md
       - HunyuanImage-3.0 Image-to-Text Inference: user_guide/examples/offline_inference/hunyuan_image3.md
       - Image-To-Image: user_guide/examples/offline_inference/image_to_image.md
@@ -36,7 +35,6 @@ nav:
     - Online Serving:
       - BAGEL-7B-MoT: user_guide/examples/online_serving/bagel.md
       - vLLM-Omni Helm Chart: user_guide/examples/online_serving/chart-helm.md
-      - Cosmos3: user_guide/examples/online_serving/cosmos3.md
       - Diffusers Backend Adapter: user_guide/examples/online_serving/diffusers_pipeline_adapter.md
       - GLM-Image Online Serving: user_guide/examples/online_serving/glm_image.md
       - Image-To-Image: user_guide/examples/online_serving/image_to_image.md
diff --git a/docs/user_guide/diffusion/cache_acceleration/cache_dit.md b/docs/user_guide/diffusion/cache_acceleration/cache_dit.md
index 93287182d4f..eaaca84ad6d 100644
--- a/docs/user_guide/diffusion/cache_acceleration/cache_dit.md
+++ b/docs/user_guide/diffusion/cache_acceleration/cache_dit.md
@@ -128,21 +128,6 @@ python image_edit.py \
 
 See the [image_edit.py](https://github.com/vllm-project/vllm-omni/blob/main/examples/offline_inference/image_to_image/image_edit.py) for detailed configuration options.
 
-For Cosmos3 text-to-video or image-to-video, use the video examples with the Cosmos3 pipeline class:
-
-```bash
-cd examples/offline_inference/text_to_video
-
-python text_to_video.py \
-    --model nvidia/Cosmos3-Nano \
-    --model-class-name Cosmos3OmniDiffusersPipeline \
-    --prompt "A small warehouse robot moves a blue box across a clean floor." \
-    --cache-backend cache_dit \
-    --num-inference-steps 35
-```
-
-Cosmos3 Cache-DiT wraps the GEN denoising path. TeaCache is not implemented for Cosmos3.
-
 ### Online Serving
 
 ```bash
@@ -153,11 +138,6 @@ vllm serve Qwen/Qwen-Image --omni --port 8091 --cache-backend cache_dit
 vllm serve Qwen/Qwen-Image --omni --port 8091 \
   --cache-backend cache_dit \
   --cache-config '{"Fn_compute_blocks": 1, "residual_diff_threshold": 0.12}'
-
-# Cosmos3
-vllm serve nvidia/Cosmos3-Nano --omni --port 8091 \
-  --model-class-name Cosmos3OmniDiffusersPipeline \
-  --cache-backend cache_dit
 ```
 
 ---
diff --git a/docs/user_guide/diffusion/cpu_offload_diffusion.md b/docs/user_guide/diffusion/cpu_offload_diffusion.md
index 0e1121b8d51..39dc366485e 100644
--- a/docs/user_guide/diffusion/cpu_offload_diffusion.md
+++ b/docs/user_guide/diffusion/cpu_offload_diffusion.md
@@ -194,7 +194,6 @@ Factory function `get_offload_backend()` selects the appropriate backend based o
 | OvisImagePipeline | `AIDC-AI/Ovis-Image-7B` | `OvisImageTransformer2DModel` | - | ✓ | `"transformer"` |
 | QwenImagePipeline | `Qwen/Qwen-Image` | `QwenImageTransformer2DModel` | ✓ | ✓ | `"transformer_blocks"` |
 | StableDiffusion3Pipeline | `stabilityai/stable-diffusion-3.5-medium` | `SD3Transformer2DModel` | - | ✓ | `"transformer_blocks"` |
-| Cosmos3OmniDiffusersPipeline | `nvidia/Cosmos3-Nano` | `Cosmos3VFMTransformer` | - | ✓ | `"gen_layers"` |
 | Wan22I2VPipeline | `Wan-AI/Wan2.2-I2V-A14B-Diffusers` | `WanTransformer3DModel` | ✓ | ✓ | `"blocks"` |
 | Wan22Pipeline | `Wan-AI/Wan2.2-T2V-A14B-Diffusers` | `WanTransformer3DModel` | ✓ | ✓ | `"blocks"` |
 | BagelPipeline | `ByteDance-Seed/BAGEL-7B-MoT` | `Qwen2MoTModel` | - | ✓ | `"layers"`, `"customized modules"` |
@@ -202,4 +201,3 @@ Factory function `get_offload_backend()` selects the appropriate backend based o
 **Notes:**
 - Model-Level Offloading is expected to be supported by all common diffusion models (DiT and encoders) naturally
 - Layerwise Offloading requires DiT class to define `_layerwise_offload_blocks_attrs` pointing to transformer blocks
-- Cosmos3 uses the singular `_layerwise_offload_blocks_attr` compatibility path and offloads GEN decoder layers.
diff --git a/docs/user_guide/diffusion/parallelism/cfg_parallel.md b/docs/user_guide/diffusion/parallelism/cfg_parallel.md
index ce468d817cd..5541106680a 100644
--- a/docs/user_guide/diffusion/parallelism/cfg_parallel.md
+++ b/docs/user_guide/diffusion/parallelism/cfg_parallel.md
@@ -144,15 +144,6 @@ sampling_params = OmniDiffusionSamplingParams(
 )
 ```
 
-For Cosmos3, use `guidance_scale` rather than `true_cfg_scale`:
-
-```python
-sampling_params = OmniDiffusionSamplingParams(
-    num_inference_steps=35,
-    guidance_scale=4.0,
-)
-```
-
 2. **Add negative prompt:**
 ```python
 outputs = omni.generate(
diff --git a/docs/user_guide/examples/offline_inference/cosmos3.md b/docs/user_guide/examples/offline_inference/cosmos3.md
deleted file mode 100644
index 9d9924a1a15..00000000000
--- a/docs/user_guide/examples/offline_inference/cosmos3.md
+++ /dev/null
@@ -1,90 +0,0 @@
-# Cosmos3
-
-Source <https://github.com/vllm-project/vllm-omni/tree/main/examples/offline_inference/cosmos3>.
-
-
-Cosmos3 uses `Cosmos3OmniDiffusersPipeline` for text-to-image, text-to-video, image-to-video, video-with-sound, and action generation. Examples default to the `nvidia/Cosmos3-Nano` Hugging Face repo, but you can override the checkpoint with `--model` or by exporting `COSMOS3_MODEL` to a local Diffusers-format checkpoint.
-
-```bash
-cd examples/offline_inference/cosmos3
-```
-
-## Text-to-Image
-
-```bash
-python end2end.py \
-  --task t2i \
-  --prompt "A small warehouse robot carrying a blue box, clean product photography" \
-  --negative-prompt "blurry, distorted, low quality" \
-  --output cosmos3_t2i.png
-```
-
-## Text-to-Video
-
-```bash
-python end2end.py \
-  --task t2v \
-  --prompt "A small warehouse robot moves a blue box across a clean floor." \
-  --negative-prompt "blurry, distorted, low quality" \
-  --output cosmos3_t2v.mp4
-```
-
-## Image-to-Video
-
-Download an example image or provide your own image path.
-
-```bash
-wget https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/cherry_blossom.jpg
-
-python end2end.py \
-  --task i2v \
-  --image cherry_blossom.jpg \
-  --prompt "Cherry blossoms swaying gently in the breeze, petals falling, smooth motion" \
-  --negative-prompt "blurry, distorted, low quality" \
-  --output cosmos3_i2v.mp4
-```
-
-## Video With Sound
-
-This path requires a sound-capable Cosmos3 checkpoint with `sound_gen` weights.
-
-```bash
-python end2end.py \
-  --task t2v_sound \
-  --prompt "A small warehouse robot rolls across the floor with soft motor sounds." \
-  --negative-prompt "blurry, distorted, low quality" \
-  --sound-duration 3.4 \
-  --output cosmos3_t2v_sound.mp4
-```
-
-## Action Policy
-
-This path requires an action-capable Cosmos3 checkpoint with `action_gen` weights. The example returns a video plus an action JSON payload. Pass either `--domain-name` or `--domain-id`.
-
-```bash
-python end2end.py \
-  --task action_policy \
-  --image cherry_blossom.jpg \
-  --prompt "Predict the robot action for moving toward the target." \
-  --domain-name bridge_orig_lerobot \
-  --raw-action-dim 2 \
-  --action-chunk-size 16 \
-  --output cosmos3_action_policy.mp4 \
-  --action-output cosmos3_action_policy_action.json
-```
-
-## Common Options
-
-- `--enable-layerwise-offload`: use layerwise offload for memory-constrained runs.
-- `--cache-backend cache_dit`: enable Cache-DiT where supported.
-- `--cfg-parallel-size 2`, `--ulysses-degree`, `--tensor-parallel-size`, `--use-hsdp`: enable parallel execution options.
-- `--height`, `--width`, `--num-frames`, `--num-inference-steps`, `--guidance-scale`, `--fps`: override task defaults.
-
-Do not use model-level `--enable-cpu-offload` for Cosmos3. Use `--enable-layerwise-offload` instead.
-
-## Example materials
-
-??? abstract "end2end.py"
-    ``````py
-    --8<-- "examples/offline_inference/cosmos3/end2end.py"
-    ``````
diff --git a/docs/user_guide/examples/online_serving/cosmos3.md b/docs/user_guide/examples/online_serving/cosmos3.md
deleted file mode 100644
index 25fec71fe33..00000000000
--- a/docs/user_guide/examples/online_serving/cosmos3.md
+++ /dev/null
@@ -1,110 +0,0 @@
-# Cosmos3
-
-Source <https://github.com/vllm-project/vllm-omni/tree/main/examples/online_serving/cosmos3>.
-
-
-This example shows Cosmos3 online serving with `Cosmos3OmniDiffusersPipeline`.
-
-The server defaults to the `nvidia/Cosmos3-Nano` Hugging Face repo. Override the checkpoint by exporting `MODEL` or `COSMOS3_MODEL` to a local Diffusers-format checkpoint.
-
-```bash
-cd examples/online_serving/cosmos3
-bash run_server.sh
-```
-
-`run_server.sh` accepts these environment overrides:
-
-- `MODEL`: checkpoint path or Hugging Face repo, defaults to `nvidia/Cosmos3-Nano` (or `COSMOS3_MODEL` if set)
-- `PORT`: server port, defaults to `8091`
-- `CACHE_BACKEND`: set to `cache_dit` to enable Cache-DiT
-- `ENABLE_LAYERWISE_OFFLOAD`: set to `1` to enable layerwise offload
-- `CFG_PARALLEL_SIZE`, `TENSOR_PARALLEL_SIZE`, `ULYSSES_DEGREE`, `USE_HSDP`: parallel execution controls
-- `ALLOWED_LOCAL_MEDIA_PATH`: local media access path, defaults to `/`
-- `DEPLOY_CONFIG`: optional deploy YAML override; defaults to the bundled Cosmos3 deploy config
-
-## Disabling Guardrails
-
-Cosmos3 ships with prompt and generated-output guardrails. To skip loading guardrail models for the whole server, start with the no-guardrails deploy override:
-
-```bash
-vllm serve nvidia/Cosmos3-Nano --omni \
-  --deploy-config examples/online_serving/cosmos3/cosmos3_no_guardrails.yaml \
-  --port 8091
-```
-
-The helper script accepts the same override through `DEPLOY_CONFIG`:
-
-```bash
-cd examples/online_serving/cosmos3
-DEPLOY_CONFIG=cosmos3_no_guardrails.yaml bash run_server.sh
-```
-
-## Text-to-Image
-
-```bash
-bash run_curl_t2i.sh
-```
-
-The script calls `POST /v1/images/generations`, which selects Cosmos3 text-to-image through `modalities=["image"]` internally.
-
-## Text-to-Video
-
-```bash
-bash run_curl_t2v.sh
-```
-
-## Image-to-Video
-
-Download an example image or set `IMAGE_PATH` to your own image:
-
-```bash
-wget https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/cherry_blossom.jpg
-IMAGE_PATH=cherry_blossom.jpg bash run_curl_i2v.sh
-```
-
-## Video With Sound
-
-This path requires a sound-capable Cosmos3 checkpoint with `sound_gen` weights.
-
-```bash
-bash run_curl_t2v_sound.sh
-```
-
-The script passes `generate_sound=true` and `sound_duration` to the video endpoint.
-
-## Action Policy
-
-This path requires an action-capable Cosmos3 checkpoint with `action_gen` weights. Pass either `domain_name` or `domain_id` through `extra_params`.
-
-```bash
-IMAGE_PATH=cherry_blossom.jpg bash run_curl_action_policy.sh
-```
-
-The script uses the asynchronous `POST /v1/videos` job endpoint so it can download the MP4 and save the returned action metadata JSON.
-
-## Example materials
-
-??? abstract "run_curl_action_policy.sh"
-    ``````sh
-    --8<-- "examples/online_serving/cosmos3/run_curl_action_policy.sh"
-    ``````
-??? abstract "run_curl_i2v.sh"
-    ``````sh
-    --8<-- "examples/online_serving/cosmos3/run_curl_i2v.sh"
-    ``````
-??? abstract "run_curl_t2i.sh"
-    ``````sh
-    --8<-- "examples/online_serving/cosmos3/run_curl_t2i.sh"
-    ``````
-??? abstract "run_curl_t2v.sh"
-    ``````sh
-    --8<-- "examples/online_serving/cosmos3/run_curl_t2v.sh"
-    ``````
-??? abstract "run_curl_t2v_sound.sh"
-    ``````sh
-    --8<-- "examples/online_serving/cosmos3/run_curl_t2v_sound.sh"
-    ``````
-??? abstract "run_server.sh"
-    ``````sh
-    --8<-- "examples/online_serving/cosmos3/run_server.sh"
-    ``````
diff --git a/examples/offline_inference/cosmos3/README.md b/examples/offline_inference/cosmos3/README.md
deleted file mode 100644
index aa59b3af93d..00000000000
--- a/examples/offline_inference/cosmos3/README.md
+++ /dev/null
@@ -1,162 +0,0 @@
-# Cosmos3
-
-Cosmos3 uses `Cosmos3OmniDiffusersPipeline` for text-to-image, text-to-video, image-to-video, video-with-sound, and action generation (policy, forward dynamics, inverse dynamics). Examples default to the `nvidia/Cosmos3-Nano` Hugging Face repo; override with `--model` or by exporting `COSMOS3_MODEL` to a local Diffusers-format checkpoint.
-
-## Canonical inputs
-
-Each modality has a JSON file under `inputs/` carrying the long-form prompt and the matching sampling parameters:
-
-| Modality                          | Input file                                       |
-| --------------------------------- | ------------------------------------------------ |
-| Text-to-Image                     | `inputs/t2i.json`                                |
-| Text-to-Video                     | `inputs/t2v.json`                                |
-| Text-to-Video with sound          | `inputs/t2v_sound.json`                          |
-| Image-to-Video                    | `inputs/i2v.json`                                |
-| Action — policy (robot)           | `inputs/action_policy_robot.json`                |
-| Action — policy (AV)              | `inputs/action_policy_av.json`                   |
-| Action — forward dynamics (robot) | `inputs/action_forward_dynamics_robot.json`      |
-| Action — forward dynamics (AV)    | `inputs/action_forward_dynamics_av.json`         |
-| Action — forward dynamics (camera)| `inputs/action_forward_dynamics_camera.jsonl`    |
-| Action — inverse dynamics (robot) | `inputs/action_inverse_dynamics_robot.json`      |
-| Action — inverse dynamics (AV)    | `inputs/action_inverse_dynamics_av.json`         |
-
-Pass any of these to `--input-json`. Recognized fields (`prompt`, `negative_prompt`, `vision_path`, `action_path`, `height`, `width`, `num_frames`, `num_inference_steps`, `guidance_scale`, `fps`, `seed`, `action_mode`, `action_chunk_size`, `raw_action_dim`, `domain_name`, `domain_id`, `generate_sound`, `sound_duration`) override the task defaults; explicit CLI flags still win over the JSON record.
-
-`vision_path` and `action_path` may be local paths or `http(s)` URLs. Remote assets are downloaded to a cache directory (`COSMOS3_EXAMPLE_CACHE`, defaults to `$TMPDIR/cosmos3_examples`).
-
-JSONL inputs (e.g. `action_forward_dynamics_camera.jsonl`) generate one output per record, with `_0`, `_1`, … appended to the output stem.
-
-## Text-to-Image
-
-```bash
-python end2end.py --task t2i --input-json inputs/t2i.json --output cosmos3_t2i.png
-```
-
-## Text-to-Video
-
-```bash
-python end2end.py --task t2v --input-json inputs/t2v.json --output cosmos3_t2v.mp4
-```
-
-## Image-to-Video
-
-The companion image (`robot_153.jpg`) is referenced by URL inside `inputs/i2v.json` and auto-cached on first run.
-
-```bash
-python end2end.py --task i2v --input-json inputs/i2v.json --output cosmos3_i2v.mp4
-```
-
-To use your own image, override the vision path:
-
-```bash
-python end2end.py --task i2v --input-json inputs/i2v.json --vision-path /path/to/image.jpg --prompt "..."
-```
-
-## Video With Sound
-
-```bash
-python end2end.py --task t2v_sound --input-json inputs/t2v_sound.json --output cosmos3_t2v_sound.mp4
-```
-
-The JSON sets `generate_sound: true` and `sound_duration: 3.4`; override on the command line with `--sound-duration` if needed.
-
-## Action — Policy
-
-Policy mode consumes an image plus a language instruction and returns a video together with the predicted action chunk. The bundled vision asset for these modes is a video clip (`bridge_0.mp4` / `av_vision_25_*.mp4`); end2end.py auto-extracts the first frame for image-input modes (see [Video assets for image-input action modes](#video-assets-for-image-input-action-modes)).
-
-Robot (`bridge_orig_lerobot`, `raw_action_dim=10`, `action_chunk_size=16`):
-
-```bash
-python end2end.py --task action_policy --input-json inputs/action_policy_robot.json \
-  --output cosmos3_action_policy_robot.mp4 \
-  --action-output cosmos3_action_policy_robot_action.json
-```
-
-Autonomous vehicle (`raw_action_dim=9`, `action_chunk_size=60`, "Please go backward"):
-
-```bash
-python end2end.py --task action_policy --input-json inputs/action_policy_av.json \
-  --output cosmos3_action_policy_av.mp4 \
-  --action-output cosmos3_action_policy_av_action.json
-```
-
-## Action — Forward Dynamics
-
-Forward dynamics consumes a vision input plus a chunk of action data and predicts the resulting video.
-When the vision input is a video, the example uses the first `action_chunk_size + 1` frames to match
-native Cosmos3 conditioning. `--action-path` (URL or local path) is required; the JSON points at the
-cosmos-dependencies asset and gets cached locally on first run.
-
-Robot:
-
-```bash
-python end2end.py --task action_forward_dynamics \
-  --input-json inputs/action_forward_dynamics_robot.json \
-  --output cosmos3_forward_dynamics_robot.mp4
-```
-
-Autonomous vehicle:
-
-```bash
-python end2end.py --task action_forward_dynamics \
-  --input-json inputs/action_forward_dynamics_av.json \
-  --output cosmos3_forward_dynamics_av.mp4
-```
-
-Camera-pose (JSONL with two scenes — `mountain` and `solar`):
-
-```bash
-python end2end.py --task action_forward_dynamics \
-  --input-json inputs/action_forward_dynamics_camera.jsonl \
-  --output cosmos3_forward_dynamics_camera.mp4
-# Produces cosmos3_forward_dynamics_camera_0.mp4 and cosmos3_forward_dynamics_camera_1.mp4
-```
-
-## Action — Inverse Dynamics
-
-Inverse dynamics consumes a video plus a language instruction and predicts the action chunk. Video input is fed through `multi_modal_data["video"]`. The action JSON is written to the `--action-output` path.
-
-Robot:
-
-```bash
-python end2end.py --task action_inverse_dynamics \
-  --input-json inputs/action_inverse_dynamics_robot.json \
-  --output cosmos3_inverse_dynamics_robot.mp4 \
-  --action-output cosmos3_inverse_dynamics_robot_action.json
-```
-
-Autonomous vehicle:
-
-```bash
-python end2end.py --task action_inverse_dynamics \
-  --input-json inputs/action_inverse_dynamics_av.json \
-  --output cosmos3_inverse_dynamics_av.mp4 \
-  --action-output cosmos3_inverse_dynamics_av_action.json
-```
-
-## Video assets for action modes
-
-`forward_dynamics` uses the first `action_chunk_size + 1` frames when `--vision-path` resolves to a
-video file, matching the native Cosmos3 action loader. Still images are also accepted as a fallback.
-`policy` uses a still image; when its `--vision-path` resolves to a video file, end2end.py extracts
-the first frame automatically. Video frame loading requires `imageio` with the ffmpeg plugin:
-
-```bash
-pip install "imageio[ffmpeg]"
-```
-
-To bypass video loading/extraction, pass `--vision-path /path/to/still.jpg`.
-
-## Common Options
-
-- `--input-json PATH`: load any of the `inputs/*.json` or `inputs/*.jsonl` records; CLI flags still override individual fields.
-- `--vision-path PATH_OR_URL`: image or video input (alias `--image` is kept for back-compat).
-- `--action-path PATH_OR_URL`: action JSON for forward-dynamics.
-- `--action-mode {forward_dynamics,inverse_dynamics,policy}`: override action_mode (otherwise derived from `--task`).
-- `--generate-sound`: force-enable sound generation outside the `t2v_sound` task.
-- `--enable-layerwise-offload`: use layerwise offload for memory-constrained runs.
-- `--cache-backend cache_dit`: enable Cache-DiT where supported.
-- `--cfg-parallel-size 2`, `--ulysses-degree`, `--tensor-parallel-size`, `--use-hsdp`: enable parallel execution options.
-- `--height`, `--width`, `--num-frames`, `--num-inference-steps`, `--guidance-scale`, `--fps`: override JSON/task defaults.
-
-Do not use model-level `--enable-cpu-offload` for Cosmos3. Use `--enable-layerwise-offload` instead.
diff --git a/examples/offline_inference/cosmos3/end2end.py b/examples/offline_inference/cosmos3/end2end.py
deleted file mode 100644
index f28fd50ef97..00000000000
--- a/examples/offline_inference/cosmos3/end2end.py
+++ /dev/null
@@ -1,807 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import argparse
-import hashlib
-import json
-import os
-import sys
-import tempfile
-import time
-import urllib.parse
-import urllib.request
-from pathlib import Path
-from typing import Any
-
-import numpy as np
-import PIL.Image
-import torch
-
-from vllm_omni.diffusion.data import DiffusionParallelConfig
-from vllm_omni.entrypoints.omni import Omni
-from vllm_omni.inputs.data import OmniDiffusionSamplingParams
-from vllm_omni.outputs import OmniRequestOutput
-from vllm_omni.platforms import current_omni_platform
-
-DEFAULT_NEGATIVE_PROMPT = "blurry, distorted, low quality"
-TASK_DEFAULTS = {
-    "t2i": {
-        "height": 960,
-        "width": 960,
-        "num_frames": None,
-        "num_inference_steps": 50,
-        "guidance_scale": 4.0,
-        "flow_shift": 3.0,
-        "fps": 24,
-        "output": "cosmos3_t2i.png",
-    },
-    "t2v": {
-        "height": 720,
-        "width": 1280,
-        "num_frames": 189,
-        "num_inference_steps": 35,
-        "guidance_scale": 6.0,
-        "flow_shift": 10.0,
-        "fps": 24,
-        "output": "cosmos3_t2v.mp4",
-    },
-    "i2v": {
-        "height": 720,
-        "width": 1280,
-        "num_frames": 189,
-        "num_inference_steps": 35,
-        "guidance_scale": 6.0,
-        "flow_shift": 10.0,
-        "fps": 24,
-        "output": "cosmos3_i2v.mp4",
-    },
-    "t2v_sound": {
-        "height": 720,
-        "width": 1280,
-        "num_frames": 189,
-        "num_inference_steps": 35,
-        "guidance_scale": 6.0,
-        "flow_shift": 10.0,
-        "fps": 24,
-        "output": "cosmos3_t2v_sound.mp4",
-    },
-    "action_policy": {
-        "height": 480,
-        "width": 640,
-        "num_frames": 17,
-        "num_inference_steps": 30,
-        "guidance_scale": 1.0,
-        "flow_shift": 5.0,
-        "fps": 24,
-        "output": "cosmos3_action_policy.mp4",
-    },
-    "action_forward_dynamics": {
-        "height": 480,
-        "width": 640,
-        "num_frames": 17,
-        "num_inference_steps": 30,
-        "guidance_scale": 1.0,
-        "flow_shift": 5.0,
-        "fps": 5,
-        "output": "cosmos3_action_forward_dynamics.mp4",
-    },
-    "action_inverse_dynamics": {
-        "height": 480,
-        "width": 640,
-        "num_frames": 17,
-        "num_inference_steps": 30,
-        "guidance_scale": 1.0,
-        "flow_shift": 5.0,
-        "fps": 5,
-        "output": "cosmos3_action_inverse_dynamics.mp4",
-    },
-}
-
-_INPUTS_DIR = Path(__file__).resolve().parent / "inputs"
-_TASK_ACTION_MODES = {
-    "action_policy": "policy",
-    "action_forward_dynamics": "forward_dynamics",
-    "action_inverse_dynamics": "inverse_dynamics",
-}
-_ACTION_TASKS = set(_TASK_ACTION_MODES)
-_VIDEO_INPUT_TASKS = {"action_inverse_dynamics"}
-_IMAGE_INPUT_TASKS = {"i2v", "action_policy", "action_forward_dynamics"}
-_VIDEO_EXTENSIONS = {".mp4", ".mov", ".mkv", ".webm", ".avi"}
-_CACHE_DIR = Path(
-    os.environ.get(
-        "COSMOS3_EXAMPLE_CACHE",
-        str(Path(tempfile.gettempdir()) / "cosmos3_examples"),
-    )
-)
-_JSON_TO_ATTR = {
-    "prompt": "prompt",
-    "negative_prompt": "negative_prompt",
-    "vision_path": "vision_path",
-    "action_path": "action_path",
-    "height": "height",
-    "width": "width",
-    "num_frames": "num_frames",
-    "num_inference_steps": "num_inference_steps",
-    "guidance_scale": "guidance_scale",
-    "flow_shift": "flow_shift",
-    "fps": "fps",
-    "seed": "seed",
-    "action_mode": "action_mode",
-    "action_chunk_size": "action_chunk_size",
-    "raw_action_dim": "raw_action_dim",
-    "domain_name": "domain_name",
-    "domain_id": "domain_id",
-    "generate_sound": "generate_sound",
-    "sound_duration": "sound_duration",
-}
-
-
-def parse_args() -> argparse.Namespace:
-    parser = argparse.ArgumentParser(description="Cosmos3 offline inference examples.")
-    parser.add_argument(
-        "--model",
-        default=os.environ.get("COSMOS3_MODEL", "nvidia/Cosmos3-Nano"),
-        help="Cosmos3 checkpoint (Hugging Face repo id or local Diffusers-format path). "
-        "Defaults to COSMOS3_MODEL when set, otherwise nvidia/Cosmos3-Nano.",
-    )
-    parser.add_argument(
-        "--task",
-        choices=sorted(TASK_DEFAULTS),
-        default="t2v",
-        help="Cosmos3 example task to run.",
-    )
-    parser.add_argument(
-        "--input-json",
-        default=None,
-        help="Path to a JSON or JSONL input file (e.g. inputs/t2v.json). When given, every recognized "
-        "field overrides the matching default; explicit CLI flags still win. Use JSONL to batch multiple "
-        "generations in one invocation (e.g. inputs/action_forward_dynamics_camera.jsonl).",
-    )
-    parser.add_argument(
-        "--prompt",
-        default="A small warehouse robot moves a blue box across a clean floor.",
-        help="Text prompt. Overrides any prompt loaded from --input-json.",
-    )
-    parser.add_argument("--negative-prompt", default=DEFAULT_NEGATIVE_PROMPT, help="Negative prompt.")
-    parser.add_argument(
-        "--image",
-        default=None,
-        help="Input image path for i2v / image-input action tasks. Alias for --vision-path.",
-    )
-    parser.add_argument(
-        "--vision-path",
-        default=None,
-        help="Vision input as a local path or http(s) URL. Image file for i2v / policy; image or video file "
-        "for forward_dynamics; video file for inverse_dynamics. If a video is supplied for i2v / policy, "
-        "the first frame is extracted automatically (requires imageio).",
-    )
-    parser.add_argument(
-        "--action-path",
-        default=None,
-        help="Local path or URL to an action JSON for forward_dynamics tasks.",
-    )
-    parser.add_argument(
-        "--action-mode",
-        default=None,
-        choices=["forward_dynamics", "inverse_dynamics", "policy"],
-        help="Override action_mode. Defaults are derived from --task.",
-    )
-    parser.add_argument(
-        "--generate-sound",
-        action="store_true",
-        help="Enable sound generation.",
-    )
-    parser.add_argument("--output", default=None, help="Output PNG or MP4 path. Default depends on --task.")
-    parser.add_argument(
-        "--action-output",
-        default=None,
-        help="Action JSON path for inverse_dynamics / action_policy outputs. "
-        "Defaults to the video output stem plus _action.json.",
-    )
-    parser.add_argument("--seed", type=int, default=42, help="Random seed.")
-    parser.add_argument("--height", type=int, default=None, help="Output height. Default depends on --task.")
-    parser.add_argument("--width", type=int, default=None, help="Output width. Default depends on --task.")
-    parser.add_argument("--num-frames", type=int, default=None, help="Video frames. Default depends on --task.")
-    parser.add_argument(
-        "--num-inference-steps",
-        type=int,
-        default=None,
-        help="Sampling steps. Default depends on --task.",
-    )
-    parser.add_argument("--guidance-scale", type=float, default=None, help="CFG scale. Default depends on --task.")
-    parser.add_argument(
-        "--flow-shift",
-        type=float,
-        default=None,
-        help="Flow-matching scheduler shift. Default depends on --task (cosmos3-internal: 3.0 t2i / 10.0 t2v/i2v / 5.0 action).",
-    )
-    parser.add_argument("--fps", type=int, default=None, help="Output video fps. Default depends on --task.")
-    parser.add_argument(
-        "--sound-duration",
-        type=float,
-        default=None,
-        help="Audio duration in seconds for t2v_sound. Defaults to generated video duration.",
-    )
-    parser.add_argument(
-        "--audio-sample-rate",
-        type=int,
-        default=24000,
-        help="Fallback sample rate used when muxing audio if the model does not return one.",
-    )
-    parser.add_argument(
-        "--domain-name",
-        default="bridge_orig_lerobot",
-        help="Cosmos3 action embodiment name for action_policy.",
-    )
-    parser.add_argument("--domain-id", type=int, default=None, help="Cosmos3 action embodiment id.")
-    parser.add_argument(
-        "--raw-action-dim",
-        type=int,
-        default=2,
-        help="Number of action dimensions to keep for action_policy.",
-    )
-    parser.add_argument(
-        "--action-chunk-size",
-        type=int,
-        default=16,
-        help="Number of action steps for action_policy.",
-    )
-    parser.add_argument(
-        "--cache-backend",
-        type=str,
-        default=None,
-        choices=["cache_dit"],
-        help="Cache backend for supported Cosmos3 generation paths.",
-    )
-    parser.add_argument("--enable-layerwise-offload", action="store_true", help="Enable layerwise offload.")
-    parser.add_argument("--vae-use-slicing", action="store_true", help="Enable VAE slicing.")
-    parser.add_argument("--vae-use-tiling", action="store_true", help="Enable VAE tiling.")
-    parser.add_argument("--enforce-eager", action="store_true", help="Disable torch.compile.")
-    parser.add_argument("--ulysses-degree", type=int, default=1, help="Ulysses sequence parallel degree.")
-    parser.add_argument("--ring-degree", type=int, default=1, help="Ring sequence parallel degree.")
-    parser.add_argument("--cfg-parallel-size", type=int, default=1, choices=[1, 2], help="CFG parallel size.")
-    parser.add_argument("--tensor-parallel-size", type=int, default=1, help="Tensor parallel size.")
-    parser.add_argument("--vae-patch-parallel-size", type=int, default=1, help="VAE patch parallel size.")
-    parser.add_argument("--use-hsdp", action="store_true", help="Enable HSDP.")
-    parser.add_argument("--hsdp-shard-size", type=int, default=1, help="HSDP shard size.")
-    parser.add_argument("--hsdp-replicate-size", type=int, default=1, help="HSDP replicate size.")
-    parser.add_argument(
-        "--quantization",
-        type=str,
-        default=None,
-        choices=["fp8", "mxfp8", "int8", "gguf"],
-        help="Transformer quantization method.",
-    )
-    return parser.parse_args()
-
-
-def _cache_config(cache_backend: str | None) -> dict[str, Any] | None:
-    if cache_backend != "cache_dit":
-        return None
-    return {
-        "Fn_compute_blocks": 1,
-        "Bn_compute_blocks": 0,
-        "max_warmup_steps": 4,
-        "max_cached_steps": 20,
-        "residual_diff_threshold": 0.24,
-        "max_continuous_cached_steps": 3,
-        "enable_taylorseer": False,
-        "taylorseer_order": 1,
-        "scm_steps_mask_policy": None,
-        "scm_steps_policy": "dynamic",
-    }
-
-
-def _is_url(value: str) -> bool:
-    return urllib.parse.urlparse(value).scheme in {"http", "https"}
-
-
-def _is_video_path(value: str) -> bool:
-    parsed = urllib.parse.urlparse(value)
-    target = parsed.path if parsed.scheme else value
-    return Path(target).suffix.lower() in _VIDEO_EXTENSIONS
-
-
-def _resolve_local_path(path_or_url: str) -> str:
-    if not _is_url(path_or_url):
-        return path_or_url
-    _CACHE_DIR.mkdir(parents=True, exist_ok=True)
-    suffix = Path(urllib.parse.urlparse(path_or_url).path).suffix or ""
-    digest = hashlib.sha256(path_or_url.encode("utf-8")).hexdigest()[:16]
-    target = _CACHE_DIR / f"{digest}{suffix}"
-    if not target.exists():
-        print(f"Downloading {path_or_url} -> {target}")
-        with urllib.request.urlopen(path_or_url) as response, open(target, "wb") as fh:
-            fh.write(response.read())
-    return str(target)
-
-
-def _first_video_frame(video_path: str) -> PIL.Image.Image:
-    try:
-        import imageio.v3 as iio  # type: ignore[import-not-found]
-    except ImportError as exc:
-        raise ImportError(
-            "Extracting the first frame of a video for an image-input task requires imageio. "
-            "Install with `pip install imageio[ffmpeg]` or pass a still image via --vision-path."
-        ) from exc
-    frame = np.asarray(iio.imread(video_path, index=0))
-    return PIL.Image.fromarray(frame).convert("RGB")
-
-
-def _load_video_frames_from(path_or_url: str, max_frames: int) -> list[PIL.Image.Image]:
-    if max_frames <= 0:
-        raise ValueError(f"max_frames must be positive, got {max_frames}.")
-
-    try:
-        import imageio.v3 as iio  # type: ignore[import-not-found]
-    except ImportError as exc:
-        raise ImportError(
-            "Loading video frames for Cosmos3 action forward-dynamics requires imageio. "
-            "Install with `pip install imageio[ffmpeg]` or pass a still image via --vision-path."
-        ) from exc
-
-    local = _resolve_local_path(path_or_url)
-    frames: list[PIL.Image.Image] = []
-    for frame in iio.imiter(local):
-        frames.append(PIL.Image.fromarray(np.asarray(frame)).convert("RGB"))
-        if len(frames) >= max_frames:
-            break
-    if not frames:
-        raise ValueError(f"Cosmos3 action video input contains no frames: {path_or_url}")
-    return frames
-
-
-def _load_image_from(path_or_url: str) -> PIL.Image.Image:
-    local = _resolve_local_path(path_or_url)
-    if _is_video_path(path_or_url):
-        return _first_video_frame(local)
-    return PIL.Image.open(local).convert("RGB")
-
-
-def _load_input_records(path: str) -> list[dict[str, Any]]:
-    src = Path(path)
-    if not src.exists():
-        candidate = _INPUTS_DIR / path
-        if candidate.exists():
-            src = candidate
-    if not src.exists():
-        raise FileNotFoundError(f"Input JSON file not found: {path}")
-    text = src.read_text(encoding="utf-8").strip()
-    if src.suffix == ".jsonl":
-        return [json.loads(line) for line in text.splitlines() if line.strip()]
-    return [json.loads(text)]
-
-
-def _cli_provided_attrs(argv: list[str]) -> set[str]:
-    provided: set[str] = set()
-    for token in argv:
-        if not token.startswith("--"):
-            continue
-        flag = token.split("=", 1)[0][2:]
-        provided.add(flag.replace("-", "_"))
-    return provided
-
-
-def _apply_record(record: dict[str, Any], args: argparse.Namespace, cli_set: set[str]) -> None:
-    # --image and --vision-path are aliases for the same visual input. A CLI
-    # value for either should suppress a JSON override of the other.
-    effective_cli_set = set(cli_set)
-    if "image" in effective_cli_set or "vision_path" in effective_cli_set:
-        effective_cli_set |= {"image", "vision_path"}
-    for key, value in record.items():
-        attr = _JSON_TO_ATTR.get(key)
-        if attr is None:
-            print(f"Ignoring unknown input-json field: {key}")
-            continue
-        if attr in effective_cli_set:
-            continue
-        if attr == "generate_sound" and not bool(value):
-            continue
-        setattr(args, attr, value)
-
-
-def _first_output(outputs: Any) -> Any:
-    if isinstance(outputs, list):
-        if not outputs:
-            raise ValueError("No output generated.")
-        return outputs[0]
-    return outputs
-
-
-def _inner_output(output: Any) -> Any:
-    if isinstance(output, OmniRequestOutput) and output.is_pipeline_output and output.request_output is not None:
-        return output.request_output
-    return output
-
-
-def _extract_images(outputs: Any) -> list[Any]:
-    output = _inner_output(_first_output(outputs))
-    if isinstance(output, OmniRequestOutput) and output.images:
-        return output.images
-    images = getattr(output, "images", None)
-    if images:
-        return images
-    raise ValueError("No images found in output.")
-
-
-def _extract_video_audio_action(outputs: Any) -> tuple[Any, Any | None, int | None, dict[str, Any]]:
-    outer = _first_output(outputs)
-    output = _inner_output(outer)
-    audio = None
-    audio_sample_rate = None
-    action = {}
-
-    for candidate in (outer, output):
-        if isinstance(candidate, OmniRequestOutput):
-            if candidate.multimodal_output:
-                audio = audio or candidate.multimodal_output.get("audio")
-                audio_sample_rate = audio_sample_rate or candidate.multimodal_output.get("audio_sample_rate")
-            if candidate.custom_output:
-                action.update(candidate.custom_output)
-
-    videos = None
-    if isinstance(output, OmniRequestOutput):
-        if output.multimodal_output:
-            videos = output.multimodal_output.get("video")
-            audio = audio or output.multimodal_output.get("audio")
-            audio_sample_rate = audio_sample_rate or output.multimodal_output.get("audio_sample_rate")
-        if videos is None and output.images:
-            videos = output.images
-    else:
-        videos = getattr(output, "images", None)
-        mm = getattr(output, "multimodal_output", None)
-        if mm:
-            videos = videos or mm.get("video")
-            audio = audio or mm.get("audio")
-            audio_sample_rate = audio_sample_rate or mm.get("audio_sample_rate")
-
-    if isinstance(videos, list) and len(videos) == 1:
-        first = videos[0]
-        if isinstance(first, tuple) and len(first) == 2:
-            videos, audio = first
-        elif isinstance(first, dict):
-            audio = audio or first.get("audio")
-            audio_sample_rate = audio_sample_rate or first.get("audio_sample_rate")
-            videos = first.get("frames") or first.get("video")
-        elif isinstance(first, list):
-            videos = first
-
-    if isinstance(videos, tuple) and len(videos) == 2:
-        videos, audio = videos
-    elif isinstance(videos, dict):
-        audio = audio or videos.get("audio")
-        audio_sample_rate = audio_sample_rate or videos.get("audio_sample_rate")
-        videos = videos.get("frames") or videos.get("video")
-
-    if videos is None:
-        raise ValueError("No video frames found in output.")
-    return videos, audio, audio_sample_rate, action
-
-
-def _normalize_frame(frame: Any) -> Any:
-    if isinstance(frame, torch.Tensor):
-        frame_tensor = frame.detach().cpu()
-        if frame_tensor.dim() == 4 and frame_tensor.shape[0] == 1:
-            frame_tensor = frame_tensor[0]
-        if frame_tensor.dim() == 3 and frame_tensor.shape[0] in (3, 4):
-            frame_tensor = frame_tensor.permute(1, 2, 0)
-        if frame_tensor.is_floating_point():
-            frame_tensor = frame_tensor.clamp(-1, 1) * 0.5 + 0.5
-        return frame_tensor.float().numpy()
-    if isinstance(frame, np.ndarray):
-        frame_array = frame
-        if frame_array.ndim == 4 and frame_array.shape[0] == 1:
-            frame_array = frame_array[0]
-        if np.issubdtype(frame_array.dtype, np.integer):
-            frame_array = frame_array.astype(np.float32) / 255.0
-        return frame_array
-    if isinstance(frame, PIL.Image.Image):
-        return np.asarray(frame).astype(np.float32) / 255.0
-    return frame
-
-
-def _ensure_frame_list(video: Any) -> Any:
-    if isinstance(video, list):
-        if not video:
-            return video
-        first = video[0]
-        if isinstance(first, np.ndarray):
-            if first.ndim == 5:
-                return list(first[0])
-            if first.ndim == 4:
-                return list(first)
-            if first.ndim == 3:
-                return video
-        return video
-    if isinstance(video, np.ndarray):
-        if video.ndim == 5:
-            return list(video[0])
-        if video.ndim == 4:
-            return list(video)
-        if video.ndim == 3:
-            return [video]
-    return video
-
-
-def _video_to_array(video: Any) -> Any:
-    if isinstance(video, torch.Tensor):
-        video_tensor = video.detach().cpu()
-        if video_tensor.dim() == 5:
-            if video_tensor.shape[1] in (3, 4):
-                video_tensor = video_tensor[0].permute(1, 2, 3, 0)
-            else:
-                video_tensor = video_tensor[0]
-        elif video_tensor.dim() == 4 and video_tensor.shape[0] in (3, 4):
-            video_tensor = video_tensor.permute(1, 2, 3, 0)
-        if video_tensor.is_floating_point():
-            video_tensor = video_tensor.clamp(-1, 1) * 0.5 + 0.5
-        return video_tensor.float().numpy()
-    if isinstance(video, np.ndarray):
-        video_array = video
-        if video_array.ndim == 5:
-            video_array = video_array[0]
-        if np.issubdtype(video_array.dtype, np.integer):
-            video_array = video_array.astype(np.float32) / 255.0
-        return video_array
-    if isinstance(video, list):
-        if not video:
-            raise ValueError("No video frames found in output.")
-        return [_normalize_frame(frame) for frame in video]
-    return video
-
-
-def _save_video(video: Any, output_path: Path, fps: int, audio: Any | None, audio_sample_rate: int) -> None:
-    output_path.parent.mkdir(parents=True, exist_ok=True)
-    video_array = _ensure_frame_list(_video_to_array(video))
-
-    if audio is not None:
-        from vllm_omni.diffusion.utils.media_utils import mux_video_audio_bytes
-
-        frames_np = np.stack(video_array, axis=0) if isinstance(video_array, list) else np.asarray(video_array)
-        if frames_np.ndim == 4 and frames_np.shape[-1] == 4:
-            frames_np = frames_np[..., :3]
-        frames_u8 = (np.clip(frames_np, 0.0, 1.0) * 255).round().clip(0, 255).astype("uint8")
-
-        audio_np = audio
-        if isinstance(audio_np, list):
-            audio_np = audio_np[0] if audio_np else None
-        if isinstance(audio_np, torch.Tensor):
-            audio_np = audio_np.detach().cpu().float().numpy()
-        if isinstance(audio_np, np.ndarray):
-            audio_np = np.squeeze(audio_np).astype(np.float32)
-
-        video_bytes = mux_video_audio_bytes(
-            frames_u8,
-            audio_np,
-            fps=float(fps),
-            audio_sample_rate=audio_sample_rate,
-        )
-        output_path.write_bytes(video_bytes)
-        return
-
-    try:
-        from diffusers.utils import export_to_video
-    except ImportError as exc:
-        raise ImportError("diffusers is required for export_to_video.") from exc
-    export_to_video(video_array, str(output_path), fps=fps)
-
-
-def _jsonable(value: Any) -> Any:
-    if isinstance(value, torch.Tensor):
-        return value.detach().cpu().tolist()
-    if isinstance(value, np.ndarray):
-        return value.tolist()
-    if isinstance(value, (np.integer, np.floating)):
-        return value.item()
-    if isinstance(value, dict):
-        return {str(k): _jsonable(v) for k, v in value.items()}
-    if isinstance(value, (list, tuple)):
-        return [_jsonable(v) for v in value]
-    return value
-
-
-def _build_omni(args: argparse.Namespace) -> Omni:
-    parallel_config = DiffusionParallelConfig(
-        ulysses_degree=args.ulysses_degree,
-        ring_degree=args.ring_degree,
-        cfg_parallel_size=args.cfg_parallel_size,
-        tensor_parallel_size=args.tensor_parallel_size,
-        vae_patch_parallel_size=args.vae_patch_parallel_size,
-        use_hsdp=args.use_hsdp,
-        hsdp_shard_size=args.hsdp_shard_size,
-        hsdp_replicate_size=args.hsdp_replicate_size,
-    )
-    kwargs: dict[str, Any] = {
-        "model": args.model,
-        "model_class_name": "Cosmos3OmniDiffusersPipeline",
-        "enable_layerwise_offload": args.enable_layerwise_offload,
-        "vae_use_slicing": args.vae_use_slicing,
-        "vae_use_tiling": args.vae_use_tiling,
-        "enforce_eager": args.enforce_eager,
-        "parallel_config": parallel_config,
-        "cache_backend": args.cache_backend,
-        "cache_config": _cache_config(args.cache_backend),
-    }
-    if args.quantization is not None:
-        kwargs["quantization"] = args.quantization
-    return Omni(**kwargs)
-
-
-def _resolve_action_mode(task: str, args: argparse.Namespace) -> str | None:
-    if getattr(args, "action_mode", None):
-        return args.action_mode
-    return _TASK_ACTION_MODES.get(task)
-
-
-def _build_prompt_and_extra(
-    args: argparse.Namespace,
-    task: str,
-    action_mode: str | None,
-) -> tuple[dict[str, Any], dict[str, Any]]:
-    vision_path = args.vision_path or args.image
-
-    prompt: dict[str, Any] = {
-        "prompt": args.prompt,
-        "negative_prompt": args.negative_prompt,
-        "modalities": ["image"] if task == "t2i" else ["video"],
-    }
-
-    if task in _VIDEO_INPUT_TASKS:
-        if not vision_path:
-            raise ValueError(f"--vision-path (video) is required for {task}.")
-        local_video = _resolve_local_path(vision_path)
-        prompt["multi_modal_data"] = {"video": local_video}
-    elif task == "action_forward_dynamics" and vision_path and _is_video_path(vision_path):
-        prompt["multi_modal_data"] = {"video": _load_video_frames_from(vision_path, args.action_chunk_size + 1)}
-    elif task in _IMAGE_INPUT_TASKS:
-        if not vision_path:
-            raise ValueError(f"--vision-path (image) is required for {task}.")
-        prompt["multi_modal_data"] = {"image": _load_image_from(vision_path)}
-    elif vision_path:
-        prompt["multi_modal_data"] = {"image": _load_image_from(vision_path)}
-
-    extra_args: dict[str, Any] = {}
-
-    if getattr(args, "flow_shift", None) is not None:
-        extra_args["flow_shift"] = float(args.flow_shift)
-
-    sound_enabled = bool(getattr(args, "generate_sound", False)) or task == "t2v_sound"
-    if sound_enabled and action_mode is not None:
-        raise ValueError("Cosmos3 does not support action modes combined with sound generation.")
-    if sound_enabled:
-        prompt["generate_sound"] = True
-        extra_args["generate_sound"] = True
-        if args.sound_duration is not None:
-            extra_args["sound_duration"] = args.sound_duration
-
-    if action_mode is not None:
-        extra_args["action_mode"] = action_mode
-        extra_args["action_chunk_size"] = args.action_chunk_size
-        if action_mode in {"policy", "inverse_dynamics"}:
-            extra_args["raw_action_dim"] = args.raw_action_dim
-        elif args.raw_action_dim is not None:
-            extra_args["raw_action_dim"] = args.raw_action_dim
-        if args.domain_id is not None:
-            extra_args["domain_id"] = args.domain_id
-        else:
-            extra_args["domain_name"] = args.domain_name
-        if action_mode == "forward_dynamics":
-            if not args.action_path:
-                raise ValueError("--action-path is required for forward_dynamics.")
-            extra_args["action_path"] = _resolve_local_path(args.action_path)
-        elif args.action_path:
-            extra_args["action_path"] = _resolve_local_path(args.action_path)
-
-    return prompt, extra_args
-
-
-def _run_one(
-    omni: Omni,
-    args: argparse.Namespace,
-    task: str,
-    output_path: Path,
-    record_index: int | None = None,
-) -> None:
-    defaults = TASK_DEFAULTS[task]
-    height = args.height or defaults["height"]
-    width = args.width or defaults["width"]
-    num_frames = args.num_frames if args.num_frames is not None else defaults["num_frames"]
-    num_inference_steps = args.num_inference_steps or defaults["num_inference_steps"]
-    guidance_scale = args.guidance_scale if args.guidance_scale is not None else defaults["guidance_scale"]
-    fps = args.fps or defaults["fps"]
-    if args.flow_shift is None and defaults.get("flow_shift") is not None:
-        args.flow_shift = defaults["flow_shift"]
-
-    action_mode = _resolve_action_mode(task, args)
-    prompt, extra_args = _build_prompt_and_extra(args, task, action_mode)
-
-    generator = torch.Generator(device=current_omni_platform.device_type).manual_seed(args.seed)
-    sampling = OmniDiffusionSamplingParams(
-        height=height,
-        width=width,
-        generator=generator,
-        guidance_scale=guidance_scale,
-        num_inference_steps=num_inference_steps,
-        num_frames=num_frames,
-        frame_rate=float(fps),
-        extra_args=extra_args,
-    )
-
-    print("Cosmos3 generation configuration:")
-    print(f"  Task: {task}")
-    if action_mode:
-        print(f"  Action mode: {action_mode}")
-    if record_index is not None:
-        print(f"  Record: {record_index}")
-    print(f"  Model: {args.model}")
-    print(f"  Size: {width}x{height}")
-    if num_frames is not None:
-        print(f"  Frames: {num_frames}")
-    print(f"  Steps: {num_inference_steps}")
-    print(f"  Guidance scale: {guidance_scale}")
-    if args.flow_shift is not None:
-        print(f"  Flow shift: {args.flow_shift}")
-
-    start = time.perf_counter()
-    outputs = omni.generate(prompt, sampling)
-    elapsed = time.perf_counter() - start
-    print(f"Total generation time: {elapsed:.4f} seconds")
-
-    if task == "t2i":
-        images = _extract_images(outputs)
-        output_path.parent.mkdir(parents=True, exist_ok=True)
-        images[0].save(output_path)
-        print(f"Saved image to {output_path}")
-        return
-
-    video, audio, returned_sample_rate, action = _extract_video_audio_action(outputs)
-    _save_video(
-        video, output_path, fps=fps, audio=audio, audio_sample_rate=returned_sample_rate or args.audio_sample_rate
-    )
-    print(f"Saved video to {output_path}")
-
-    if action_mode in {"policy", "inverse_dynamics"} and action:
-        action_out = (
-            Path(args.action_output) if args.action_output else output_path.with_name(f"{output_path.stem}_action.json")
-        )
-        action_out.parent.mkdir(parents=True, exist_ok=True)
-        action_out.write_text(json.dumps(_jsonable(action), indent=2) + "\n", encoding="utf-8")
-        print(f"Saved action metadata to {action_out}")
-
-
-def _record_output_path(base: Path, index: int, total: int) -> Path:
-    if total <= 1:
-        return base
-    return base.with_name(f"{base.stem}_{index}{base.suffix}")
-
-
-def main() -> None:
-    args = parse_args()
-    cli_set = _cli_provided_attrs(sys.argv[1:])
-
-    records: list[dict[str, Any]] = [{}]
-    if args.input_json:
-        records = _load_input_records(args.input_json)
-        if not records:
-            raise ValueError(f"--input-json {args.input_json} contained no records.")
-
-    omni = _build_omni(args)
-
-    base_output = Path(args.output or TASK_DEFAULTS[args.task]["output"])
-
-    for index, record in enumerate(records):
-        record_args = argparse.Namespace(**vars(args))
-        if record:
-            _apply_record(record, record_args, cli_set)
-        output_path = _record_output_path(base_output, index, len(records))
-        _run_one(
-            omni,
-            record_args,
-            args.task,
-            output_path,
-            record_index=index if len(records) > 1 else None,
-        )
-
-
-if __name__ == "__main__":
-    main()
diff --git a/examples/offline_inference/cosmos3/inputs/action_forward_dynamics_av.json b/examples/offline_inference/cosmos3/inputs/action_forward_dynamics_av.json
deleted file mode 100644
index 2a990696c92..00000000000
--- a/examples/offline_inference/cosmos3/inputs/action_forward_dynamics_av.json
+++ /dev/null
@@ -1,17 +0,0 @@
-{
-    "action_mode": "forward_dynamics",
-    "prompt": "You are an autonomous vehicle planning system. This video is captured from a first-person perspective looking at the scene.",
-    "vision_path": "https://github.com/nvidia-cosmos/cosmos-dependencies/raw/refs/heads/assets/cosmos3/inputs/action/av_vision_25_73d01c91-51f0-46cf-9b76-5682a76fb349.mp4",
-    "action_path": "https://github.com/nvidia-cosmos/cosmos-dependencies/raw/refs/heads/assets/cosmos3/inputs/action/av_action_25.json",
-    "domain_name": "av",
-    "height": 480,
-    "width": 640,
-    "num_frames": 61,
-    "fps": 10,
-    "num_inference_steps": 30,
-    "guidance_scale": 1.0,
-    "flow_shift": 5.0,
-    "seed": 0,
-    "action_chunk_size": 60,
-    "raw_action_dim": 9
-}
diff --git a/examples/offline_inference/cosmos3/inputs/action_forward_dynamics_camera.jsonl b/examples/offline_inference/cosmos3/inputs/action_forward_dynamics_camera.jsonl
deleted file mode 100644
index 8786eb7d556..00000000000
--- a/examples/offline_inference/cosmos3/inputs/action_forward_dynamics_camera.jsonl
+++ /dev/null
@@ -1,2 +0,0 @@
-{"action_mode":"forward_dynamics","prompt":"A serene landscape video of a calm body of water in the foreground, leading up to rolling green pastoral hills and a prominent mountain peak partially shrouded in low-hanging fog under a moody, overcast gray sky. This video is captured from a first-person perspective looking at the scene.","vision_path":"https://github.com/nvidia-cosmos/cosmos-dependencies/raw/refs/heads/assets/cosmos3/inputs/action/mountain_720.png","action_path":"https://github.com/nvidia-cosmos/cosmos-dependencies/raw/refs/heads/assets/cosmos3/inputs/action/camera_action_44.json","domain_name":"camera_pose","height":480,"width":640,"num_frames":61,"fps":30,"num_inference_steps":30,"guidance_scale":1.0,"flow_shift":5.0,"seed":0,"action_chunk_size":60}
-{"action_mode":"forward_dynamics","prompt":"An architectural video of a modern elevated terrace at twilight, characterized by a large, intricate white geometric canopy supporting integrated solar panels, slatted wooden benches, and a unique cylindrical wooden seating pod, overlooking a distant campus. This video is captured from a first-person perspective looking at the scene.","vision_path":"https://github.com/nvidia-cosmos/cosmos-dependencies/raw/refs/heads/assets/cosmos3/inputs/action/solar_720.png","action_path":"https://github.com/nvidia-cosmos/cosmos-dependencies/raw/refs/heads/assets/cosmos3/inputs/action/camera_action_44.json","domain_name":"camera_pose","height":480,"width":640,"num_frames":61,"fps":30,"num_inference_steps":30,"guidance_scale":1.0,"flow_shift":5.0,"seed":0,"action_chunk_size":60}
diff --git a/examples/offline_inference/cosmos3/inputs/action_forward_dynamics_robot.json b/examples/offline_inference/cosmos3/inputs/action_forward_dynamics_robot.json
deleted file mode 100644
index 966dbdce7aa..00000000000
--- a/examples/offline_inference/cosmos3/inputs/action_forward_dynamics_robot.json
+++ /dev/null
@@ -1,17 +0,0 @@
-{
-    "action_mode": "forward_dynamics",
-    "prompt": "Put the pot to the left of the purple item. This video is captured from a first-person perspective looking at the scene.",
-    "vision_path": "https://github.com/nvidia-cosmos/cosmos-dependencies/raw/refs/heads/assets/cosmos3/inputs/action/bridge_0.mp4",
-    "action_path": "https://github.com/nvidia-cosmos/cosmos-dependencies/raw/refs/heads/assets/cosmos3/inputs/action/bridge_0.json",
-    "domain_name": "bridge_orig_lerobot",
-    "height": 544,
-    "width": 736,
-    "num_frames": 17,
-    "fps": 5,
-    "num_inference_steps": 30,
-    "guidance_scale": 1.0,
-    "flow_shift": 5.0,
-    "seed": 0,
-    "action_chunk_size": 16,
-    "raw_action_dim": 10
-}
diff --git a/examples/offline_inference/cosmos3/inputs/action_inverse_dynamics_av.json b/examples/offline_inference/cosmos3/inputs/action_inverse_dynamics_av.json
deleted file mode 100644
index 7e746501533..00000000000
--- a/examples/offline_inference/cosmos3/inputs/action_inverse_dynamics_av.json
+++ /dev/null
@@ -1,16 +0,0 @@
-{
-    "action_mode": "inverse_dynamics",
-    "prompt": "You are an autonomous vehicle planning system. This video is captured from a first-person perspective looking at the scene.",
-    "vision_path": "https://github.com/nvidia-cosmos/cosmos-dependencies/raw/refs/heads/assets/cosmos3/inputs/action/av_vision_25_73d01c91-51f0-46cf-9b76-5682a76fb349.mp4",
-    "domain_name": "av",
-    "height": 480,
-    "width": 640,
-    "num_frames": 61,
-    "fps": 10,
-    "num_inference_steps": 30,
-    "guidance_scale": 1.0,
-    "flow_shift": 5.0,
-    "seed": 0,
-    "action_chunk_size": 60,
-    "raw_action_dim": 9
-}
diff --git a/examples/offline_inference/cosmos3/inputs/action_inverse_dynamics_robot.json b/examples/offline_inference/cosmos3/inputs/action_inverse_dynamics_robot.json
deleted file mode 100644
index 4cd7e68de05..00000000000
--- a/examples/offline_inference/cosmos3/inputs/action_inverse_dynamics_robot.json
+++ /dev/null
@@ -1,16 +0,0 @@
-{
-    "action_mode": "inverse_dynamics",
-    "prompt": "Put the pot to the left of the purple item. This video is captured from a first-person perspective looking at the scene.",
-    "vision_path": "https://github.com/nvidia-cosmos/cosmos-dependencies/raw/refs/heads/assets/cosmos3/inputs/action/bridge_0.mp4",
-    "domain_name": "bridge_orig_lerobot",
-    "height": 480,
-    "width": 640,
-    "num_frames": 17,
-    "fps": 5,
-    "num_inference_steps": 30,
-    "guidance_scale": 1.0,
-    "flow_shift": 5.0,
-    "seed": 0,
-    "action_chunk_size": 16,
-    "raw_action_dim": 10
-}
diff --git a/examples/offline_inference/cosmos3/inputs/action_policy_av.json b/examples/offline_inference/cosmos3/inputs/action_policy_av.json
deleted file mode 100644
index 1e9a6506753..00000000000
--- a/examples/offline_inference/cosmos3/inputs/action_policy_av.json
+++ /dev/null
@@ -1,16 +0,0 @@
-{
-    "action_mode": "policy",
-    "prompt": "You are an autonomous vehicle planning system. Please go backward. This video is captured from a first-person perspective looking at the scene.",
-    "vision_path": "https://github.com/nvidia-cosmos/cosmos-dependencies/raw/refs/heads/assets/cosmos3/inputs/action/av_vision_25_73d01c91-51f0-46cf-9b76-5682a76fb349.mp4",
-    "domain_name": "av",
-    "height": 480,
-    "width": 640,
-    "num_frames": 61,
-    "fps": 10,
-    "num_inference_steps": 30,
-    "guidance_scale": 1.0,
-    "flow_shift": 5.0,
-    "seed": 0,
-    "action_chunk_size": 60,
-    "raw_action_dim": 9
-}
diff --git a/examples/offline_inference/cosmos3/inputs/action_policy_robot.json b/examples/offline_inference/cosmos3/inputs/action_policy_robot.json
deleted file mode 100644
index 937fd839cc7..00000000000
--- a/examples/offline_inference/cosmos3/inputs/action_policy_robot.json
+++ /dev/null
@@ -1,16 +0,0 @@
-{
-    "action_mode": "policy",
-    "prompt": "Put the pot to the left of the purple item. This video is captured from a first-person perspective looking at the scene.",
-    "vision_path": "https://github.com/nvidia-cosmos/cosmos-dependencies/raw/refs/heads/assets/cosmos3/inputs/action/bridge_0.mp4",
-    "domain_name": "bridge_orig_lerobot",
-    "height": 480,
-    "width": 640,
-    "num_frames": 17,
-    "fps": 5,
-    "num_inference_steps": 30,
-    "guidance_scale": 1.0,
-    "flow_shift": 5.0,
-    "seed": 0,
-    "action_chunk_size": 16,
-    "raw_action_dim": 10
-}
diff --git a/examples/offline_inference/cosmos3/inputs/i2v.json b/examples/offline_inference/cosmos3/inputs/i2v.json
deleted file mode 100644
index c45302b3af0..00000000000
--- a/examples/offline_inference/cosmos3/inputs/i2v.json
+++ /dev/null
@@ -1,11 +0,0 @@
-{
-    "prompt": "The video opens with a view of a testing environment, characterized by a large wooden table at the center. On this table, two robot arms are positioned at opposite ends, with the left arm closer to the camera and the right arm further away. Between the hands lies a dark wooden shelf with a red spherical object on its top rack, likely serving as a platform or obstacle. In the background, various pieces of equipment, including a tripod, a chair, are visible. A person wearing a blue jacket and black pants stands near the center of the room, observing the experiment, with a static hand position throughout. The floor is tiled with a patterned design, and additional items like a small robot figure and some cables can be seen scattered around the space. As the video progresses, the right robotic hand extends outward, moving from its initial position towards the red spherical object on the shelf. The hand then picks up the object and places it on the lowest rack of the shelf, completing a smooth, deliberate manipulation. The left robotic hand remains stationary throughout the sequence. No new objects appear in the video; all existing elements maintain their positions except for the movement of the right robotic hand. The scene concludes with the right robotic hand returning to its initial position, while the left hand continues to rest on the table. The overall environment remains unchanged, with the focus remaining on the interaction between the robotic hands and the wooden block, highlighting precise control during the demonstration.",
-    "vision_path": "https://github.com/nvidia-cosmos/cosmos-dependencies/raw/refs/heads/assets/cosmos3/inputs/vision/robot_153.jpg",
-    "height": 720,
-    "width": 1280,
-    "num_frames": 189,
-    "num_inference_steps": 35,
-    "guidance_scale": 6.0,
-    "flow_shift": 10.0,
-    "fps": 24
-}
diff --git a/examples/offline_inference/cosmos3/inputs/t2i.json b/examples/offline_inference/cosmos3/inputs/t2i.json
deleted file mode 100644
index b7e1c7fee9c..00000000000
--- a/examples/offline_inference/cosmos3/inputs/t2i.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-    "prompt": "A medium shot of a modern robotics research laboratory with white walls and a gray floor. A robotic arm with a metallic finish is mounted on a clean white workbench, its gripper positioned above a row of small colored objects. A laptop and neatly arranged tools sit beside the robot. A large monitor on the wall behind displays a software interface. The scene is brightly lit by overhead fluorescent lights.",
-    "height": 960,
-    "width": 960,
-    "num_inference_steps": 50,
-    "guidance_scale": 4.0,
-    "flow_shift": 3.0,
-    "fps": 24
-}
diff --git a/examples/offline_inference/cosmos3/inputs/t2v.json b/examples/offline_inference/cosmos3/inputs/t2v.json
deleted file mode 100644
index 485f4e700bb..00000000000
--- a/examples/offline_inference/cosmos3/inputs/t2v.json
+++ /dev/null
@@ -1,10 +0,0 @@
-{
-    "prompt": "The video opens with a view of a well-lit indoor space featuring a wooden display case with compartments filled with various fruits, including bananas, apples, pears, oranges, and carambolas. The bananas are neatly arranged in the middle compartment, while apples are in the left and a mix of pears, oranges, and carambolas are in the right. Two robotic arms with grippers are positioned at the bottom of the frame, with the one on the left remaining stationary, partially obscuring the apples. The robotic arm on the right begins its action, extending towards the right side of the display case. It carefully picks up a pear from the fruit section, placing it into a plastic bag in the shopping cart nearby, which has red handles. After securing the pear, the arm retracts back to its original position. The process repeats as the robotic arm picks up an orange and places it in the bag, followed by a carambola. The final frame captures the robotic arm returning to its initial position, leaving the display case and surrounding area unchanged. The video showcases a seamless and efficient automated fruit-picking process, highlighting the precision and efficiency of modern robotics in a retail setting.",
-    "height": 720,
-    "width": 1280,
-    "num_frames": 189,
-    "num_inference_steps": 35,
-    "guidance_scale": 6.0,
-    "flow_shift": 10.0,
-    "fps": 24
-}
diff --git a/examples/offline_inference/cosmos3/inputs/t2v_sound.json b/examples/offline_inference/cosmos3/inputs/t2v_sound.json
deleted file mode 100644
index f4ecdce266b..00000000000
--- a/examples/offline_inference/cosmos3/inputs/t2v_sound.json
+++ /dev/null
@@ -1,12 +0,0 @@
-{
-    "prompt": "The video opens with a view of a well-lit indoor space featuring a wooden display case with compartments filled with various fruits, including bananas, apples, pears, oranges, and carambolas. The bananas are neatly arranged in the middle compartment, while apples are in the left and a mix of pears, oranges, and carambolas are in the right. Two robotic arms with grippers are positioned at the bottom of the frame, with the one on the left remaining stationary, partially obscuring the apples. The robotic arm on the right begins its action, extending towards the right side of the display case. It carefully picks up a pear from the fruit section, placing it into a plastic bag in the shopping cart nearby, which has red handles. After securing the pear, the arm retracts back to its original position. The process repeats as the robotic arm picks up an orange and places it in the bag, followed by a carambola. The final frame captures the robotic arm returning to its initial position, leaving the display case and surrounding area unchanged. Audio description: the soft whir of servo motors, gentle thuds as fruits land in the plastic bag, the rustle of the bag settling in the shopping cart, and a faint refrigeration hum in the background.",
-    "height": 720,
-    "width": 1280,
-    "num_frames": 189,
-    "num_inference_steps": 35,
-    "guidance_scale": 6.0,
-    "flow_shift": 10.0,
-    "fps": 24,
-    "generate_sound": true,
-    "sound_duration": 7.875
-}
diff --git a/examples/offline_inference/image_to_video/image_to_video.py b/examples/offline_inference/image_to_video/image_to_video.py
index 6ac63815c75..9e2521233b6 100644
--- a/examples/offline_inference/image_to_video/image_to_video.py
+++ b/examples/offline_inference/image_to_video/image_to_video.py
@@ -2,14 +2,13 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 """
-Image-to-Video generation example using Wan2.2 I2V/TI2V models, LTX2, HunyuanVideo-1.5, or Cosmos3.
+Image-to-Video generation example using Wan2.2 I2V/TI2V models, LTX2, or HunyuanVideo-1.5.
 
 Supports:
 - Wan2.2-I2V-A14B-Diffusers: MoE model with CLIP image encoder
 - Wan2.2-TI2V-5B-Diffusers: Unified T2V+I2V model (dense 5B)
 - LTX2 image-to-video pipeline
 - HunyuanVideo-1.5 I2V: SigLIP + VAE dual image conditioning
-- Cosmos3: unified text-to-image, text-to-video, and image-to-video pipeline
 
 Usage:
     # Wan I2V-A14B (MoE)
@@ -31,13 +30,6 @@
     python image_to_video.py --model hunyuanvideo-community/HunyuanVideo-1.5-Diffusers-480p_i2v \
         --image input.jpg --prompt "A cat playing with yarn" \
         --flow-shift 5.0 --guidance-scale 6.0
-
-    # Cosmos3 image-to-video
-    python image_to_video.py --model nvidia/Cosmos3-Nano \
-        --model-class-name Cosmos3OmniDiffusersPipeline \
-        --image input.jpg --prompt "A cinematic dolly shot of a boat" \
-        --height 720 --width 1280 --num-frames 81 \
-        --num-inference-steps 35 --guidance-scale 4.0 --fps 24
 """
 
 import argparse
@@ -68,9 +60,7 @@ def parse_profiler_config(value: str) -> dict[str, Any]:
 
 
 def parse_args() -> argparse.Namespace:
-    parser = argparse.ArgumentParser(
-        description="Generate a video from an image (Wan2.2, LTX2, HunyuanVideo-1.5, Cosmos3)."
-    )
+    parser = argparse.ArgumentParser(description="Generate a video from an image (Wan2.2, LTX2, HunyuanVideo-1.5).")
     parser.add_argument(
         "--model",
         default="Wan-AI/Wan2.2-I2V-A14B-Diffusers",
@@ -79,13 +69,13 @@ def parse_args() -> argparse.Namespace:
     parser.add_argument(
         "--model-class-name",
         default=None,
-        help="Override model class name (e.g., Cosmos3OmniDiffusersPipeline or LTX2ImageToVideoPipeline).",
+        help="Override model class name (e.g., LTX2ImageToVideoPipeline).",
     )
     parser.add_argument("--image", required=True, help="Path to input image.")
     parser.add_argument("--prompt", default="", help="Text prompt describing the desired motion.")
     parser.add_argument("--negative-prompt", default="", help="Negative prompt.")
     parser.add_argument("--seed", type=int, default=42, help="Random seed.")
-    parser.add_argument("--guidance-scale", type=float, default=None, help="CFG scale. Default: model-specific.")
+    parser.add_argument("--guidance-scale", type=float, default=5.0, help="CFG scale.")
     parser.add_argument(
         "--guidance-scale-high", type=float, default=None, help="Optional separate CFG for high-noise (MoE only)."
     )
@@ -93,10 +83,8 @@ def parse_args() -> argparse.Namespace:
         "--height", type=int, default=None, help="Video height (auto-calculated from image if not set)."
     )
     parser.add_argument("--width", type=int, default=None, help="Video width (auto-calculated from image if not set).")
-    parser.add_argument("--num-frames", type=int, default=None, help="Number of frames. Default: model-specific.")
-    parser.add_argument(
-        "--num-inference-steps", type=int, default=None, help="Sampling steps. Default: model-specific."
-    )
+    parser.add_argument("--num-frames", type=int, default=81, help="Number of frames.")
+    parser.add_argument("--num-inference-steps", type=int, default=50, help="Sampling steps.")
     parser.add_argument("--boundary-ratio", type=float, default=0.875, help="Boundary split ratio for MoE models.")
     parser.add_argument(
         "--frame-rate",
@@ -105,10 +93,7 @@ def parse_args() -> argparse.Namespace:
         help="Optional generation frame rate (used by models like LTX2). Defaults to --fps.",
     )
     parser.add_argument(
-        "--flow-shift",
-        type=float,
-        default=None,
-        help="Scheduler flow_shift. Default: model-specific.",
+        "--flow-shift", type=float, default=5.0, help="Scheduler flow_shift (5.0 for 720p, 12.0 for 480p)."
     )
     parser.add_argument(
         "--sample-solver",
@@ -276,51 +261,31 @@ def calculate_dimensions(
     return height, width
 
 
-def _is_cosmos3_model(model_name: str, model_class_name: str | None = None) -> bool:
-    combined = f"{model_name} {model_class_name or ''}".lower()
-    return "cosmos3" in combined or "cosmos-3" in combined
-
-
 def main():
     args = parse_args()
     generator = torch.Generator(device=current_omni_platform.device_type).manual_seed(args.seed)
     model_name = str(args.model).lower() if args.model is not None else ""
     model_class_name = args.model_class_name
     is_ltx2 = "ltx2" in model_name or (model_class_name and "ltx2" in model_class_name.lower())
-    is_cosmos3 = _is_cosmos3_model(model_name, model_class_name)
     if model_class_name is None and is_ltx2:
         model_class_name = "LTX2ImageToVideoPipeline"
-    elif model_class_name is None and is_cosmos3:
-        model_class_name = "Cosmos3OmniDiffusersPipeline"
 
     # Load input image
     image = PIL.Image.open(args.image).convert("RGB")
 
-    fps = args.fps if args.fps is not None else (24 if (is_ltx2 or is_cosmos3) else 16)
+    fps = args.fps if args.fps is not None else (24 if is_ltx2 else 16)
     frame_rate = args.frame_rate if args.frame_rate is not None else float(fps)
-    guidance_scale = (
-        args.guidance_scale if args.guidance_scale is not None else (4.0 if (is_ltx2 or is_cosmos3) else 5.0)
-    )
+    guidance_scale = args.guidance_scale if args.guidance_scale is not None else (4.0 if is_ltx2 else 5.0)
     num_frames = args.num_frames if args.num_frames is not None else (121 if is_ltx2 else 81)
-    num_inference_steps = (
-        args.num_inference_steps
-        if args.num_inference_steps is not None
-        else (40 if is_ltx2 else (35 if is_cosmos3 else 50))
-    )
+    num_inference_steps = args.num_inference_steps if args.num_inference_steps is not None else (40 if is_ltx2 else 50)
 
     # Calculate dimensions if not provided
     height = args.height
     width = args.width
     if height is None or width is None:
-        if is_ltx2:
-            max_area = 512 * 768
-            mod_value = 32
-        elif is_cosmos3:
-            max_area = 720 * 1280
-            mod_value = 16
-        else:
-            max_area = 480 * 832
-            mod_value = 16
+        # Default to 480P area for Wan2.2 I2V, 512x768 area for LTX2
+        max_area = 512 * 768 if is_ltx2 else 480 * 832
+        mod_value = 32 if is_ltx2 else 16
         calc_height, calc_width = calculate_dimensions(image, max_area=max_area, mod_value=mod_value)
         height = height or calc_height
         width = width or calc_width
@@ -402,10 +367,8 @@ def main():
     print(f"\n{'=' * 60}")
     print("Generation Configuration:")
     print(f"  Model: {args.model}")
-    if model_class_name:
-        print(f"  Model class: {model_class_name}")
-    print(f"  Inference steps: {num_inference_steps}")
-    print(f"  Frames: {num_frames}")
+    print(f"  Inference steps: {args.num_inference_steps}")
+    print(f"  Frames: {args.num_frames}")
     print(f"  Solver: {args.sample_solver}")
     print(f"  diffusion_kv_cache_dtype(config): {args.diffusion_kv_cache_dtype}")
     print(f"  diffusion_kv_cache_skip_steps(config): {args.diffusion_kv_cache_skip_steps}")
@@ -415,7 +378,7 @@ def main():
         f" tensor_parallel_size={args.tensor_parallel_size}, vae_patch_parallel_size={args.vae_patch_parallel_size},"
         f" pipeline_parallel_size={args.pipeline_parallel_size}"
     )
-    print(f"  Video size: {width}x{height}")
+    print(f"  Video size: {args.width}x{args.height}")
     print(f"{'=' * 60}\n")
 
     generation_start = time.perf_counter()
@@ -424,7 +387,6 @@ def main():
         {
             "prompt": args.prompt,
             "negative_prompt": args.negative_prompt,
-            "modalities": ["video"],
             "multi_modal_data": {"image": image},
         },
         OmniDiffusionSamplingParams(
diff --git a/examples/offline_inference/text_to_image/text_to_image.py b/examples/offline_inference/text_to_image/text_to_image.py
index e986aee6b18..c0fd337bd93 100644
--- a/examples/offline_inference/text_to_image/text_to_image.py
+++ b/examples/offline_inference/text_to_image/text_to_image.py
@@ -49,7 +49,7 @@ def parse_args() -> argparse.Namespace:
         "Qwen/Qwen-Image, Tongyi-MAI/Z-Image-Turbo, Qwen/Qwen-Image-2512, stepfun-ai/NextStep-1.1, "
         "black-forest-labs/FLUX.1-dev, black-forest-labs/FLUX.2-klein-9B, "
         "black-forest-labs/FLUX.2-dev, tencent/HunyuanImage-3.0-Instruct, "
-        "meituan-longcat/LongCat-Image, OvisAI/Ovis-Image, nvidia/Cosmos3-Nano, "
+        "meituan-longcat/LongCat-Image, OvisAI/Ovis-Image, "
         "stabilityai/stable-diffusion-3.5-medium, Tongyi-MAI/Z-Image-Turbo and etc.",
     )
     parser.add_argument(
@@ -456,7 +456,6 @@ def main():
         {
             "prompt": args.prompt,
             "negative_prompt": args.negative_prompt,
-            "modalities": ["image"],
         },
         OmniDiffusionSamplingParams(
             height=args.height,
diff --git a/examples/offline_inference/text_to_video/text_to_video.py b/examples/offline_inference/text_to_video/text_to_video.py
index aa1e44736c9..9e1775dd14b 100644
--- a/examples/offline_inference/text_to_video/text_to_video.py
+++ b/examples/offline_inference/text_to_video/text_to_video.py
@@ -35,26 +35,10 @@
         "fps": 24,
         "output": "hunyuan_video_15_output.mp4",
     },
-    "cosmos3": {
-        "height": 720,
-        "width": 1280,
-        "num_frames": 81,
-        "num_inference_steps": 35,
-        "guidance_scale": 4.0,
-        "fps": 24,
-        "output": "cosmos3_t2v_output.mp4",
-    },
 }
 
 
-def _is_cosmos3_model(model: str, model_class_name: str | None = None) -> bool:
-    combined = f"{model} {model_class_name or ''}".lower()
-    return "cosmos3" in combined or "cosmos-3" in combined
-
-
-def _detect_preset(model: str, model_class_name: str | None = None) -> dict:
-    if _is_cosmos3_model(model, model_class_name):
-        return _MODEL_PRESETS["cosmos3"]
+def _detect_preset(model: str) -> dict:
     model_lower = model.lower()
     if "hunyuan" in model_lower:
         return _MODEL_PRESETS["hunyuan"]
@@ -74,19 +58,19 @@ def parse_profiler_config(value: str) -> dict[str, Any]:
 def parse_args() -> argparse.Namespace:
     parser = argparse.ArgumentParser(
         description="Generate a video from a text prompt. "
-        "Supports Wan2.2, HunyuanVideo-1.5, Cosmos3, and other text-to-video models."
+        "Supports Wan2.2, HunyuanVideo-1.5, and other text-to-video models."
     )
     parser.add_argument(
         "--model",
         default="Wan-AI/Wan2.2-T2V-A14B-Diffusers",
         help="Diffusers model ID or local path. "
         "Examples: Wan-AI/Wan2.2-T2V-A14B-Diffusers, "
-        "hunyuanvideo-community/HunyuanVideo-1.5-480p_t2v, nvidia/Cosmos3-Nano",
+        "hunyuanvideo-community/HunyuanVideo-1.5-480p_t2v",
     )
     parser.add_argument(
         "--model-class-name",
         default=None,
-        help="Override model class name (e.g., Cosmos3OmniDiffusersPipeline or LTX2TwoStagesVideoPipeline).",
+        help="Override model class name (e.g., LTX2TwoStagesVideoPipeline).",
     )
     parser.add_argument("--prompt", default="A serene lakeside sunrise with mist over the water.", help="Text prompt.")
     parser.add_argument("--negative-prompt", default="", help="Negative prompt.")
@@ -124,7 +108,7 @@ def parse_args() -> argparse.Namespace:
         type=str,
         default=None,
         choices=["cache_dit"],
-        help="Cache backend for acceleration on supported models. Default: None.",
+        help="Cache backend for acceleration (Wan2.2). Default: None.",
     )
     parser.add_argument(
         "--enable-cache-dit-summary",
@@ -338,7 +322,7 @@ def main():
     print(f"  Video size: {args.width}x{args.height}")
     print(f"{'=' * 60}\n")
 
-    prompt_dict = {"prompt": args.prompt, "modalities": ["video"]}
+    prompt_dict = {"prompt": args.prompt}
     if args.negative_prompt:
         prompt_dict["negative_prompt"] = args.negative_prompt
 
@@ -349,7 +333,6 @@ def main():
         guidance_scale=args.guidance_scale,
         num_inference_steps=args.num_inference_steps,
         num_frames=args.num_frames,
-        frame_rate=args.frame_rate if args.frame_rate is not None else float(args.fps),
     )
     if args.guidance_scale_high is not None:
         sampling_kwargs["guidance_scale_2"] = args.guidance_scale_high
diff --git a/examples/online_serving/cosmos3/README.md b/examples/online_serving/cosmos3/README.md
deleted file mode 100644
index 2eb5e06f6fa..00000000000
--- a/examples/online_serving/cosmos3/README.md
+++ /dev/null
@@ -1,164 +0,0 @@
-# Cosmos3
-
-This example shows Cosmos3 online serving with `Cosmos3OmniDiffusersPipeline`.
-
-The server defaults to the `nvidia/Cosmos3-Nano` Hugging Face repo. Override the checkpoint by exporting `MODEL` or `COSMOS3_MODEL` to a local Diffusers-format checkpoint.
-
-```bash
-cd examples/online_serving/cosmos3
-bash run_server.sh
-```
-
-`run_server.sh` accepts these environment overrides:
-
-- `MODEL`: checkpoint path or Hugging Face repo, defaults to `nvidia/Cosmos3-Nano` (or `COSMOS3_MODEL` if set)
-- `PORT`: server port, defaults to `8091`
-- `CACHE_BACKEND`: set to `cache_dit` to enable Cache-DiT
-- `ENABLE_LAYERWISE_OFFLOAD`: set to `1` to enable layerwise offload
-- `CFG_PARALLEL_SIZE`, `TENSOR_PARALLEL_SIZE`, `ULYSSES_DEGREE`, `USE_HSDP`: parallel execution controls
-- `ALLOWED_LOCAL_MEDIA_PATH`: local media access path, defaults to `/`
-- `DEPLOY_CONFIG`: optional deploy YAML override; defaults to the bundled Cosmos3 deploy config
-
-## Disabling guardrails
-
-Cosmos3 ships with safety guardrails that check prompts and apply generated-output face blurring. Two override paths are available depending on whether you want to skip the guardrails globally or on a single request.
-
-### Server-wide (skip loading guardrail models entirely)
-
-Start the server with `--deploy-config cosmos3_no_guardrails.yaml`, which sets `model_config.guardrails: false` on the diffusion stage so the guardrail models are never loaded:
-
-```bash
-vllm serve nvidia/Cosmos3-Nano --omni \
-  --deploy-config examples/online_serving/cosmos3/cosmos3_no_guardrails.yaml \
-  --port 8091
-```
-
-The same override can be used with the helper script:
-
-```bash
-cd examples/online_serving/cosmos3
-DEPLOY_CONFIG=cosmos3_no_guardrails.yaml bash run_server.sh
-```
-
-Other CLI flags (parallelism, cache backend, layerwise offload, etc.) are still honored; the YAML only overrides the guardrail toggle. When this path is used, per-request overrides cannot turn guardrails back on — the underlying models are not in memory.
-
-### Per-request (skip checks for a single generation)
-
-When the server has guardrails enabled, an individual request can opt out by passing `guardrails: false` inside `extra_params`. The server merges `extra_params` into the pipeline's `extra_args`, and the guardrail gate reads `extra_args["guardrails"]` as a per-request override:
-
-```bash
-curl -sS -X POST "${BASE_URL}/v1/videos/sync" \
-  -F "prompt=..." \
-  -F 'extra_params={"guardrails": false}' \
-  -o cosmos3_no_check.mp4
-```
-
-For action-mode requests, fold the override into the existing `extra_params` object alongside `action_mode`, `domain_name`, and the rest. Anything other than `false` (or a missing field) keeps the default behavior.
-
-## Curl scripts
-
-Each script sources its prompt and sampling parameters from the canonical input file shared with the offline example at `../../offline_inference/cosmos3/inputs/`. Override the input file with `INPUT_JSON=` (or `INPUT_JSONL=` for the camera variant) or the parent directory with `INPUTS_DIR=`.
-
-Companion vision and action assets are auto-downloaded from `nvidia-cosmos/cosmos-dependencies` on first run, so the scripts work out of the box once the server is up. Image-input action modes (`policy`, `forward_dynamics`) extract the first frame of the source `.mp4` via `ffmpeg`, which is already a Cosmos3 system dependency.
-
-## Text-to-Image
-
-```bash
-bash run_curl_t2i.sh
-```
-
-Calls `POST /v1/images/generations`, which selects Cosmos3 text-to-image through `modalities=["image"]` internally.
-
-## Text-to-Video
-
-```bash
-bash run_curl_t2v.sh
-```
-
-## Image-to-Video
-
-The companion image (`robot_153.jpg`) is auto-downloaded on first run. To use your own image:
-
-```bash
-IMAGE_PATH=/path/to/your.jpg bash run_curl_i2v.sh
-```
-
-## Video With Sound
-
-```bash
-bash run_curl_t2v_sound.sh
-```
-
-The script reads `sound_duration` from `inputs/t2v_sound.json` and posts `generate_sound=true` to `/v1/videos/sync`.
-
-## Action — Policy
-
-Policy mode returns a video plus a predicted action chunk; both are saved.
-
-Robot (`bridge_orig_lerobot`, `raw_action_dim=10`):
-
-```bash
-bash run_curl_action_policy.sh
-```
-
-Autonomous vehicle (`raw_action_dim=9`, "Please go backward"):
-
-```bash
-bash run_curl_action_policy_av.sh
-```
-
-## Action — Forward Dynamics
-
-Forward-dynamics scripts download both the source vision asset and the matching `action_path` JSON. The action JSON is passed as `action_path` inside `extra_params`, so it must be readable by the server process — that works out of the box on a same-host deployment with the default `ALLOWED_LOCAL_MEDIA_PATH=/`. For cross-host setups, share the file (e.g. via a mounted volume) or inline the action data into `extra_params` instead.
-
-Robot:
-
-```bash
-bash run_curl_action_forward_dynamics_robot.sh
-```
-
-Autonomous vehicle:
-
-```bash
-bash run_curl_action_forward_dynamics_av.sh
-```
-
-Camera-pose (two scenes — `SCENE_INDEX=0` for mountain (default), `SCENE_INDEX=1` for solar):
-
-```bash
-bash run_curl_action_forward_dynamics_camera.sh
-SCENE_INDEX=1 bash run_curl_action_forward_dynamics_camera.sh
-```
-
-## Action — Inverse Dynamics
-
-> **Known limitation:** the online `/v1/videos` endpoint accepts image bytes only via the `input_reference` form field. Inverse-dynamics needs the full source video, so the scripts below will currently fail at upload time. They are shipped pre-wired so they begin working unchanged once the server gains video upload support. In the meantime, run inverse-dynamics through the offline path:
->
-> ```bash
-> cd ../../offline_inference/cosmos3
-> python end2end.py --task action_inverse_dynamics \
->   --input-json inputs/action_inverse_dynamics_robot.json \
->   --output cosmos3_inverse_dynamics_robot.mp4 \
->   --action-output cosmos3_inverse_dynamics_robot_action.json
-> ```
-
-Curl variants (kept for forward compatibility):
-
-```bash
-bash run_curl_action_inverse_dynamics_robot.sh
-bash run_curl_action_inverse_dynamics_av.sh
-```
-
-## Common script overrides
-
-Every curl script accepts a small set of env overrides:
-
-- `BASE_URL`: server URL, defaults to `http://localhost:8091`
-- `OUTPUT_PATH`: where to save the generated image / video
-- `ACTION_OUTPUT_PATH`: where to save predicted action JSON (policy / inverse_dynamics)
-- `INPUT_JSON` / `INPUT_JSONL` (camera) / `INPUTS_DIR`: alternate source for prompt and sampling parameters
-- `IMAGE_PATH` / `VIDEO_PATH`: pre-existing vision asset (skip auto-download / frame-extraction)
-- `ACTION_PATH` (forward-dynamics): pre-existing action JSON on the server's filesystem
-- `POLL_INTERVAL` (async scripts): seconds between status checks
-
-Async scripts use `POST /v1/videos` so they can download the MP4 once the job completes and save the action JSON returned in the status response.
diff --git a/examples/online_serving/cosmos3/cosmos3_no_guardrails.yaml b/examples/online_serving/cosmos3/cosmos3_no_guardrails.yaml
deleted file mode 100644
index 858a3b2ab6c..00000000000
--- a/examples/online_serving/cosmos3/cosmos3_no_guardrails.yaml
+++ /dev/null
@@ -1,19 +0,0 @@
-# Cosmos3 deploy override that disables guardrails at server startup.
-#
-# Usage:
-#   vllm serve nvidia/Cosmos3-Nano --omni \
-#     --deploy-config examples/online_serving/cosmos3/cosmos3_no_guardrails.yaml \
-#     --port 8091
-#
-
-async_chunk: false
-trust_remote_code: true
-
-stages:
-  - stage_id: 0
-    max_num_seqs: 1
-    enforce_eager: true
-    model_class_name: Cosmos3OmniDiffusersPipeline
-    model_config:
-      guardrails: false
-      offload_guardrail_models: false
diff --git a/examples/online_serving/cosmos3/run_curl_action_forward_dynamics_av.sh b/examples/online_serving/cosmos3/run_curl_action_forward_dynamics_av.sh
deleted file mode 100755
index 201c3dcbaa6..00000000000
--- a/examples/online_serving/cosmos3/run_curl_action_forward_dynamics_av.sh
+++ /dev/null
@@ -1,73 +0,0 @@
-#!/bin/bash
-# Cosmos3 forward-dynamics example (autonomous vehicle, image input + action).
-#
-# See run_curl_action_forward_dynamics_robot.sh for notes on how action_path
-# is consumed by the server.
-
-set -euo pipefail
-
-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-INPUTS_DIR="${INPUTS_DIR:-${SCRIPT_DIR}/../../offline_inference/cosmos3/inputs}"
-INPUT_JSON="${INPUT_JSON:-${INPUTS_DIR}/action_forward_dynamics_av.json}"
-
-BASE_URL="${BASE_URL:-http://localhost:8091}"
-OUTPUT_PATH="${OUTPUT_PATH:-cosmos3_forward_dynamics_av.mp4}"
-IMAGE_PATH="${IMAGE_PATH:-av_vision_25_frame0.jpg}"
-VIDEO_PATH="${VIDEO_PATH:-av_vision_25.mp4}"
-ACTION_PATH="${ACTION_PATH:-$(pwd)/av_action_25.json}"
-
-if [ ! -f "${INPUT_JSON}" ]; then
-  echo "Missing input JSON: ${INPUT_JSON}" >&2
-  exit 1
-fi
-
-PROMPT="$(jq -r '.prompt' "${INPUT_JSON}")"
-VISION_URL="$(jq -r '.vision_path' "${INPUT_JSON}")"
-ACTION_URL="$(jq -r '.action_path' "${INPUT_JSON}")"
-DOMAIN_NAME="$(jq -r '.domain_name' "${INPUT_JSON}")"
-RAW_ACTION_DIM="$(jq -r '.raw_action_dim' "${INPUT_JSON}")"
-ACTION_CHUNK_SIZE="$(jq -r '.action_chunk_size' "${INPUT_JSON}")"
-NUM_FRAMES="$(jq -r '.num_frames // 61' "${INPUT_JSON}")"
-FPS="$(jq -r '.fps // 10' "${INPUT_JSON}")"
-HEIGHT="$(jq -r '.height // 480' "${INPUT_JSON}")"
-WIDTH="$(jq -r '.width // 640' "${INPUT_JSON}")"
-NUM_INFERENCE_STEPS="$(jq -r '.num_inference_steps // 30' "${INPUT_JSON}")"
-GUIDANCE_SCALE="$(jq -r '.guidance_scale // 1.0' "${INPUT_JSON}")"
-FLOW_SHIFT="$(jq -r '.flow_shift // 5.0' "${INPUT_JSON}")"
-SEED="$(jq -r '.seed // 0' "${INPUT_JSON}")"
-
-if [ ! -f "${IMAGE_PATH}" ]; then
-  if [ ! -f "${VIDEO_PATH}" ]; then
-    echo "Downloading ${VISION_URL} -> ${VIDEO_PATH}"
-    curl -sSL "${VISION_URL}" -o "${VIDEO_PATH}"
-  fi
-  echo "Extracting first frame -> ${IMAGE_PATH}"
-  ffmpeg -y -loglevel error -i "${VIDEO_PATH}" -vf "select=eq(n\,0)" -vframes 1 "${IMAGE_PATH}"
-fi
-
-if [ ! -f "${ACTION_PATH}" ]; then
-  echo "Downloading ${ACTION_URL} -> ${ACTION_PATH}"
-  curl -sSL "${ACTION_URL}" -o "${ACTION_PATH}"
-fi
-
-EXTRA_PARAMS="$(jq -nc \
-  --arg domain "${DOMAIN_NAME}" \
-  --argjson dim "${RAW_ACTION_DIM}" \
-  --argjson chunk "${ACTION_CHUNK_SIZE}" \
-  --arg action_path "${ACTION_PATH}" \
-  '{action_mode:"forward_dynamics", domain_name:$domain, raw_action_dim:$dim, action_chunk_size:$chunk, action_path:$action_path}')"
-
-curl -sS -X POST "${BASE_URL}/v1/videos/sync" \
-  -F "prompt=${PROMPT}" \
-  -F "input_reference=@${IMAGE_PATH}" \
-  -F "size=${WIDTH}x${HEIGHT}" \
-  -F "num_frames=${NUM_FRAMES}" \
-  -F "fps=${FPS}" \
-  -F "num_inference_steps=${NUM_INFERENCE_STEPS}" \
-  -F "guidance_scale=${GUIDANCE_SCALE}" \
-  -F "flow_shift=${FLOW_SHIFT}" \
-  -F "extra_params=${EXTRA_PARAMS}" \
-  -F "seed=${SEED}" \
-  -o "${OUTPUT_PATH}"
-
-echo "Saved video to ${OUTPUT_PATH}"
diff --git a/examples/online_serving/cosmos3/run_curl_action_forward_dynamics_camera.sh b/examples/online_serving/cosmos3/run_curl_action_forward_dynamics_camera.sh
deleted file mode 100755
index cc349e167b8..00000000000
--- a/examples/online_serving/cosmos3/run_curl_action_forward_dynamics_camera.sh
+++ /dev/null
@@ -1,76 +0,0 @@
-#!/bin/bash
-# Cosmos3 forward-dynamics example (camera_pose domain, image input + action).
-#
-# The JSONL input ships two scenes: mountain (index 0) and solar (index 1).
-# Pick one with SCENE_INDEX. Vision input is a still PNG, so no ffmpeg step.
-# Forward-dynamics returns only a video (no predicted action), so this uses
-# the sync video endpoint.
-
-set -euo pipefail
-
-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-INPUTS_DIR="${INPUTS_DIR:-${SCRIPT_DIR}/../../offline_inference/cosmos3/inputs}"
-INPUT_JSONL="${INPUT_JSONL:-${INPUTS_DIR}/action_forward_dynamics_camera.jsonl}"
-SCENE_INDEX="${SCENE_INDEX:-0}"
-
-BASE_URL="${BASE_URL:-http://localhost:8091}"
-OUTPUT_PATH="${OUTPUT_PATH:-cosmos3_forward_dynamics_camera.mp4}"
-
-if [ ! -f "${INPUT_JSONL}" ]; then
-  echo "Missing input JSONL: ${INPUT_JSONL}" >&2
-  exit 1
-fi
-
-RECORD="$(awk "NR==$((SCENE_INDEX + 1))" "${INPUT_JSONL}")"
-if [ -z "${RECORD}" ]; then
-  echo "SCENE_INDEX=${SCENE_INDEX} out of range for ${INPUT_JSONL}" >&2
-  exit 1
-fi
-
-PROMPT="$(echo "${RECORD}" | jq -r '.prompt')"
-VISION_URL="$(echo "${RECORD}" | jq -r '.vision_path')"
-ACTION_URL="$(echo "${RECORD}" | jq -r '.action_path')"
-DOMAIN_NAME="$(echo "${RECORD}" | jq -r '.domain_name')"
-ACTION_CHUNK_SIZE="$(echo "${RECORD}" | jq -r '.action_chunk_size')"
-NUM_FRAMES="$(echo "${RECORD}" | jq -r '.num_frames // 61')"
-FPS="$(echo "${RECORD}" | jq -r '.fps // 30')"
-HEIGHT="$(echo "${RECORD}" | jq -r '.height // 480')"
-WIDTH="$(echo "${RECORD}" | jq -r '.width // 640')"
-NUM_INFERENCE_STEPS="$(echo "${RECORD}" | jq -r '.num_inference_steps // 30')"
-GUIDANCE_SCALE="$(echo "${RECORD}" | jq -r '.guidance_scale // 1.0')"
-FLOW_SHIFT="$(echo "${RECORD}" | jq -r '.flow_shift // 5.0')"
-SEED="$(echo "${RECORD}" | jq -r '.seed // 0')"
-
-IMAGE_PATH="${IMAGE_PATH:-camera_scene_${SCENE_INDEX}.png}"
-ACTION_PATH="${ACTION_PATH:-$(pwd)/camera_action_44.json}"
-
-if [ ! -f "${IMAGE_PATH}" ]; then
-  echo "Downloading ${VISION_URL} -> ${IMAGE_PATH}"
-  curl -sSL "${VISION_URL}" -o "${IMAGE_PATH}"
-fi
-
-if [ ! -f "${ACTION_PATH}" ]; then
-  echo "Downloading ${ACTION_URL} -> ${ACTION_PATH}"
-  curl -sSL "${ACTION_URL}" -o "${ACTION_PATH}"
-fi
-
-EXTRA_PARAMS="$(jq -nc \
-  --arg domain "${DOMAIN_NAME}" \
-  --argjson chunk "${ACTION_CHUNK_SIZE}" \
-  --arg action_path "${ACTION_PATH}" \
-  '{action_mode:"forward_dynamics", domain_name:$domain, action_chunk_size:$chunk, action_path:$action_path}')"
-
-curl -sS -X POST "${BASE_URL}/v1/videos/sync" \
-  -F "prompt=${PROMPT}" \
-  -F "input_reference=@${IMAGE_PATH}" \
-  -F "size=${WIDTH}x${HEIGHT}" \
-  -F "num_frames=${NUM_FRAMES}" \
-  -F "fps=${FPS}" \
-  -F "num_inference_steps=${NUM_INFERENCE_STEPS}" \
-  -F "guidance_scale=${GUIDANCE_SCALE}" \
-  -F "flow_shift=${FLOW_SHIFT}" \
-  -F "extra_params=${EXTRA_PARAMS}" \
-  -F "seed=${SEED}" \
-  -o "${OUTPUT_PATH}"
-
-echo "Saved video to ${OUTPUT_PATH}"
diff --git a/examples/online_serving/cosmos3/run_curl_action_forward_dynamics_robot.sh b/examples/online_serving/cosmos3/run_curl_action_forward_dynamics_robot.sh
deleted file mode 100755
index 28d8ce76d45..00000000000
--- a/examples/online_serving/cosmos3/run_curl_action_forward_dynamics_robot.sh
+++ /dev/null
@@ -1,77 +0,0 @@
-#!/bin/bash
-# Cosmos3 forward-dynamics example (bridge_orig_lerobot, image input + action).
-#
-# Forward-dynamics consumes an image plus a chunk of action data and predicts
-# the resulting video. There is no predicted-action output, so this script
-# uses the sync video endpoint (raw MP4 response). The action JSON is
-# referenced via `action_path` in extra_params, so it must be readable by
-# the server process — works out of the box when client and server share a
-# filesystem and run_server.sh keeps its default `--allowed-local-media-path /`.
-
-set -euo pipefail
-
-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-INPUTS_DIR="${INPUTS_DIR:-${SCRIPT_DIR}/../../offline_inference/cosmos3/inputs}"
-INPUT_JSON="${INPUT_JSON:-${INPUTS_DIR}/action_forward_dynamics_robot.json}"
-
-BASE_URL="${BASE_URL:-http://localhost:8091}"
-OUTPUT_PATH="${OUTPUT_PATH:-cosmos3_forward_dynamics_robot.mp4}"
-IMAGE_PATH="${IMAGE_PATH:-bridge_0_frame0.jpg}"
-VIDEO_PATH="${VIDEO_PATH:-bridge_0.mp4}"
-ACTION_PATH="${ACTION_PATH:-$(pwd)/bridge_0.json}"
-
-if [ ! -f "${INPUT_JSON}" ]; then
-  echo "Missing input JSON: ${INPUT_JSON}" >&2
-  exit 1
-fi
-
-PROMPT="$(jq -r '.prompt' "${INPUT_JSON}")"
-VISION_URL="$(jq -r '.vision_path' "${INPUT_JSON}")"
-ACTION_URL="$(jq -r '.action_path' "${INPUT_JSON}")"
-DOMAIN_NAME="$(jq -r '.domain_name' "${INPUT_JSON}")"
-RAW_ACTION_DIM="$(jq -r '.raw_action_dim' "${INPUT_JSON}")"
-ACTION_CHUNK_SIZE="$(jq -r '.action_chunk_size' "${INPUT_JSON}")"
-NUM_FRAMES="$(jq -r '.num_frames // 17' "${INPUT_JSON}")"
-FPS="$(jq -r '.fps // 5' "${INPUT_JSON}")"
-HEIGHT="$(jq -r '.height // 480' "${INPUT_JSON}")"
-WIDTH="$(jq -r '.width // 640' "${INPUT_JSON}")"
-NUM_INFERENCE_STEPS="$(jq -r '.num_inference_steps // 30' "${INPUT_JSON}")"
-GUIDANCE_SCALE="$(jq -r '.guidance_scale // 1.0' "${INPUT_JSON}")"
-FLOW_SHIFT="$(jq -r '.flow_shift // 5.0' "${INPUT_JSON}")"
-SEED="$(jq -r '.seed // 0' "${INPUT_JSON}")"
-
-if [ ! -f "${IMAGE_PATH}" ]; then
-  if [ ! -f "${VIDEO_PATH}" ]; then
-    echo "Downloading ${VISION_URL} -> ${VIDEO_PATH}"
-    curl -sSL "${VISION_URL}" -o "${VIDEO_PATH}"
-  fi
-  echo "Extracting first frame -> ${IMAGE_PATH}"
-  ffmpeg -y -loglevel error -i "${VIDEO_PATH}" -vf "select=eq(n\,0)" -vframes 1 "${IMAGE_PATH}"
-fi
-
-if [ ! -f "${ACTION_PATH}" ]; then
-  echo "Downloading ${ACTION_URL} -> ${ACTION_PATH}"
-  curl -sSL "${ACTION_URL}" -o "${ACTION_PATH}"
-fi
-
-EXTRA_PARAMS="$(jq -nc \
-  --arg domain "${DOMAIN_NAME}" \
-  --argjson dim "${RAW_ACTION_DIM}" \
-  --argjson chunk "${ACTION_CHUNK_SIZE}" \
-  --arg action_path "${ACTION_PATH}" \
-  '{action_mode:"forward_dynamics", domain_name:$domain, raw_action_dim:$dim, action_chunk_size:$chunk, action_path:$action_path}')"
-
-curl -sS -X POST "${BASE_URL}/v1/videos/sync" \
-  -F "prompt=${PROMPT}" \
-  -F "input_reference=@${IMAGE_PATH}" \
-  -F "size=${WIDTH}x${HEIGHT}" \
-  -F "num_frames=${NUM_FRAMES}" \
-  -F "fps=${FPS}" \
-  -F "num_inference_steps=${NUM_INFERENCE_STEPS}" \
-  -F "guidance_scale=${GUIDANCE_SCALE}" \
-  -F "flow_shift=${FLOW_SHIFT}" \
-  -F "extra_params=${EXTRA_PARAMS}" \
-  -F "seed=${SEED}" \
-  -o "${OUTPUT_PATH}"
-
-echo "Saved video to ${OUTPUT_PATH}"
diff --git a/examples/online_serving/cosmos3/run_curl_action_inverse_dynamics_av.sh b/examples/online_serving/cosmos3/run_curl_action_inverse_dynamics_av.sh
deleted file mode 100755
index 458c3358d3d..00000000000
--- a/examples/online_serving/cosmos3/run_curl_action_inverse_dynamics_av.sh
+++ /dev/null
@@ -1,106 +0,0 @@
-#!/bin/bash
-# Cosmos3 inverse-dynamics example (autonomous vehicle, video input).
-#
-# KNOWN LIMITATION: as of writing, the online `/v1/videos` endpoint accepts
-# image bytes only via the `input_reference` form field. Inverse-dynamics
-# needs the full source video, so this script will currently fail at upload
-# time. The offline path (`examples/offline_inference/cosmos3/end2end.py
-# --task action_inverse_dynamics --input-json inputs/action_inverse_dynamics_av.json`)
-# does support video input today. The script below is kept ready so it
-# starts working when the server gains video upload support.
-
-set -euo pipefail
-
-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-INPUTS_DIR="${INPUTS_DIR:-${SCRIPT_DIR}/../../offline_inference/cosmos3/inputs}"
-INPUT_JSON="${INPUT_JSON:-${INPUTS_DIR}/action_inverse_dynamics_av.json}"
-
-BASE_URL="${BASE_URL:-http://localhost:8091}"
-OUTPUT_PATH="${OUTPUT_PATH:-cosmos3_inverse_dynamics_av.mp4}"
-ACTION_OUTPUT_PATH="${ACTION_OUTPUT_PATH:-cosmos3_inverse_dynamics_av_action.json}"
-VIDEO_PATH="${VIDEO_PATH:-av_vision_25.mp4}"
-POLL_INTERVAL="${POLL_INTERVAL:-2}"
-
-if [ ! -f "${INPUT_JSON}" ]; then
-  echo "Missing input JSON: ${INPUT_JSON}" >&2
-  exit 1
-fi
-
-PROMPT="$(jq -r '.prompt' "${INPUT_JSON}")"
-VISION_URL="$(jq -r '.vision_path' "${INPUT_JSON}")"
-DOMAIN_NAME="$(jq -r '.domain_name' "${INPUT_JSON}")"
-RAW_ACTION_DIM="$(jq -r '.raw_action_dim' "${INPUT_JSON}")"
-ACTION_CHUNK_SIZE="$(jq -r '.action_chunk_size' "${INPUT_JSON}")"
-NUM_FRAMES="$(jq -r '.num_frames // 17' "${INPUT_JSON}")"
-FPS="$(jq -r '.fps // 10' "${INPUT_JSON}")"
-HEIGHT="$(jq -r '.height // 480' "${INPUT_JSON}")"
-WIDTH="$(jq -r '.width // 640' "${INPUT_JSON}")"
-NUM_INFERENCE_STEPS="$(jq -r '.num_inference_steps // 30' "${INPUT_JSON}")"
-GUIDANCE_SCALE="$(jq -r '.guidance_scale // 1.0' "${INPUT_JSON}")"
-FLOW_SHIFT="$(jq -r '.flow_shift // 5.0' "${INPUT_JSON}")"
-SEED="$(jq -r '.seed // 0' "${INPUT_JSON}")"
-
-if [ ! -f "${VIDEO_PATH}" ]; then
-  echo "Downloading ${VISION_URL} -> ${VIDEO_PATH}"
-  curl -sSL "${VISION_URL}" -o "${VIDEO_PATH}"
-fi
-
-EXTRA_PARAMS="$(jq -nc \
-  --arg domain "${DOMAIN_NAME}" \
-  --argjson dim "${RAW_ACTION_DIM}" \
-  --argjson chunk "${ACTION_CHUNK_SIZE}" \
-  '{action_mode:"inverse_dynamics", domain_name:$domain, raw_action_dim:$dim, action_chunk_size:$chunk}')"
-
-create_response=$(
-  curl -sS -X POST "${BASE_URL}/v1/videos" \
-    -H "Accept: application/json" \
-    -F "prompt=${PROMPT}" \
-    -F "input_reference=@${VIDEO_PATH}" \
-    -F "size=${WIDTH}x${HEIGHT}" \
-    -F "num_frames=${NUM_FRAMES}" \
-    -F "fps=${FPS}" \
-    -F "num_inference_steps=${NUM_INFERENCE_STEPS}" \
-    -F "guidance_scale=${GUIDANCE_SCALE}" \
-    -F "flow_shift=${FLOW_SHIFT}" \
-    -F "extra_params=${EXTRA_PARAMS}" \
-    -F "seed=${SEED}"
-)
-
-video_id="$(echo "${create_response}" | jq -r '.id')"
-if [ -z "${video_id}" ] || [ "${video_id}" = "null" ]; then
-  echo "Failed to create video job:"
-  echo "${create_response}" | jq .
-  exit 1
-fi
-
-echo "Created video job ${video_id}"
-while true; do
-  status_response="$(curl -sS "${BASE_URL}/v1/videos/${video_id}")"
-  status="$(echo "${status_response}" | jq -r '.status')"
-
-  case "${status}" in
-    queued|in_progress)
-      echo "Video job ${video_id} status: ${status}"
-      sleep "${POLL_INTERVAL}"
-      ;;
-    completed)
-      echo "${status_response}" | jq '.data[0].action' > "${ACTION_OUTPUT_PATH}"
-      break
-      ;;
-    failed)
-      echo "Video generation failed:"
-      echo "${status_response}" | jq .
-      exit 1
-      ;;
-    *)
-      echo "Unexpected status response:"
-      echo "${status_response}" | jq .
-      exit 1
-      ;;
-  esac
-done
-
-curl -sS -L "${BASE_URL}/v1/videos/${video_id}/content" -o "${OUTPUT_PATH}"
-
-echo "Saved video to ${OUTPUT_PATH}"
-echo "Saved action metadata to ${ACTION_OUTPUT_PATH}"
diff --git a/examples/online_serving/cosmos3/run_curl_action_inverse_dynamics_robot.sh b/examples/online_serving/cosmos3/run_curl_action_inverse_dynamics_robot.sh
deleted file mode 100755
index a718a84b428..00000000000
--- a/examples/online_serving/cosmos3/run_curl_action_inverse_dynamics_robot.sh
+++ /dev/null
@@ -1,106 +0,0 @@
-#!/bin/bash
-# Cosmos3 inverse-dynamics example (bridge_orig_lerobot, video input).
-#
-# KNOWN LIMITATION: as of writing, the online `/v1/videos` endpoint accepts
-# image bytes only via the `input_reference` form field. Inverse-dynamics
-# needs the full source video, so this script will currently fail at upload
-# time. The offline path (`examples/offline_inference/cosmos3/end2end.py
-# --task action_inverse_dynamics --input-json inputs/action_inverse_dynamics_robot.json`)
-# does support video input today. The script below is kept ready so it
-# starts working when the server gains video upload support.
-
-set -euo pipefail
-
-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-INPUTS_DIR="${INPUTS_DIR:-${SCRIPT_DIR}/../../offline_inference/cosmos3/inputs}"
-INPUT_JSON="${INPUT_JSON:-${INPUTS_DIR}/action_inverse_dynamics_robot.json}"
-
-BASE_URL="${BASE_URL:-http://localhost:8091}"
-OUTPUT_PATH="${OUTPUT_PATH:-cosmos3_inverse_dynamics_robot.mp4}"
-ACTION_OUTPUT_PATH="${ACTION_OUTPUT_PATH:-cosmos3_inverse_dynamics_robot_action.json}"
-VIDEO_PATH="${VIDEO_PATH:-bridge_0.mp4}"
-POLL_INTERVAL="${POLL_INTERVAL:-2}"
-
-if [ ! -f "${INPUT_JSON}" ]; then
-  echo "Missing input JSON: ${INPUT_JSON}" >&2
-  exit 1
-fi
-
-PROMPT="$(jq -r '.prompt' "${INPUT_JSON}")"
-VISION_URL="$(jq -r '.vision_path' "${INPUT_JSON}")"
-DOMAIN_NAME="$(jq -r '.domain_name' "${INPUT_JSON}")"
-RAW_ACTION_DIM="$(jq -r '.raw_action_dim' "${INPUT_JSON}")"
-ACTION_CHUNK_SIZE="$(jq -r '.action_chunk_size' "${INPUT_JSON}")"
-NUM_FRAMES="$(jq -r '.num_frames // 17' "${INPUT_JSON}")"
-FPS="$(jq -r '.fps // 5' "${INPUT_JSON}")"
-HEIGHT="$(jq -r '.height // 480' "${INPUT_JSON}")"
-WIDTH="$(jq -r '.width // 640' "${INPUT_JSON}")"
-NUM_INFERENCE_STEPS="$(jq -r '.num_inference_steps // 30' "${INPUT_JSON}")"
-GUIDANCE_SCALE="$(jq -r '.guidance_scale // 1.0' "${INPUT_JSON}")"
-FLOW_SHIFT="$(jq -r '.flow_shift // 5.0' "${INPUT_JSON}")"
-SEED="$(jq -r '.seed // 0' "${INPUT_JSON}")"
-
-if [ ! -f "${VIDEO_PATH}" ]; then
-  echo "Downloading ${VISION_URL} -> ${VIDEO_PATH}"
-  curl -sSL "${VISION_URL}" -o "${VIDEO_PATH}"
-fi
-
-EXTRA_PARAMS="$(jq -nc \
-  --arg domain "${DOMAIN_NAME}" \
-  --argjson dim "${RAW_ACTION_DIM}" \
-  --argjson chunk "${ACTION_CHUNK_SIZE}" \
-  '{action_mode:"inverse_dynamics", domain_name:$domain, raw_action_dim:$dim, action_chunk_size:$chunk}')"
-
-create_response=$(
-  curl -sS -X POST "${BASE_URL}/v1/videos" \
-    -H "Accept: application/json" \
-    -F "prompt=${PROMPT}" \
-    -F "input_reference=@${VIDEO_PATH}" \
-    -F "size=${WIDTH}x${HEIGHT}" \
-    -F "num_frames=${NUM_FRAMES}" \
-    -F "fps=${FPS}" \
-    -F "num_inference_steps=${NUM_INFERENCE_STEPS}" \
-    -F "guidance_scale=${GUIDANCE_SCALE}" \
-    -F "flow_shift=${FLOW_SHIFT}" \
-    -F "extra_params=${EXTRA_PARAMS}" \
-    -F "seed=${SEED}"
-)
-
-video_id="$(echo "${create_response}" | jq -r '.id')"
-if [ -z "${video_id}" ] || [ "${video_id}" = "null" ]; then
-  echo "Failed to create video job:"
-  echo "${create_response}" | jq .
-  exit 1
-fi
-
-echo "Created video job ${video_id}"
-while true; do
-  status_response="$(curl -sS "${BASE_URL}/v1/videos/${video_id}")"
-  status="$(echo "${status_response}" | jq -r '.status')"
-
-  case "${status}" in
-    queued|in_progress)
-      echo "Video job ${video_id} status: ${status}"
-      sleep "${POLL_INTERVAL}"
-      ;;
-    completed)
-      echo "${status_response}" | jq '.data[0].action' > "${ACTION_OUTPUT_PATH}"
-      break
-      ;;
-    failed)
-      echo "Video generation failed:"
-      echo "${status_response}" | jq .
-      exit 1
-      ;;
-    *)
-      echo "Unexpected status response:"
-      echo "${status_response}" | jq .
-      exit 1
-      ;;
-  esac
-done
-
-curl -sS -L "${BASE_URL}/v1/videos/${video_id}/content" -o "${OUTPUT_PATH}"
-
-echo "Saved video to ${OUTPUT_PATH}"
-echo "Saved action metadata to ${ACTION_OUTPUT_PATH}"
diff --git a/examples/online_serving/cosmos3/run_curl_action_policy.sh b/examples/online_serving/cosmos3/run_curl_action_policy.sh
deleted file mode 100755
index 930ab699457..00000000000
--- a/examples/online_serving/cosmos3/run_curl_action_policy.sh
+++ /dev/null
@@ -1,108 +0,0 @@
-#!/bin/bash
-# Cosmos3 action policy example (bridge_orig_lerobot, image input).
-#
-# Cosmos3 policy mode consumes an image plus a language instruction and
-# generates a video together with the predicted action sequence. The example
-# image is the first frame of bridge_0.mp4 (cosmos-dependencies), extracted
-# locally with ffmpeg so the request matches the prompt scene.
-
-set -euo pipefail
-
-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-INPUTS_DIR="${INPUTS_DIR:-${SCRIPT_DIR}/../../offline_inference/cosmos3/inputs}"
-INPUT_JSON="${INPUT_JSON:-${INPUTS_DIR}/action_policy_robot.json}"
-
-BASE_URL="${BASE_URL:-http://localhost:8091}"
-OUTPUT_PATH="${OUTPUT_PATH:-cosmos3_action_policy.mp4}"
-ACTION_OUTPUT_PATH="${ACTION_OUTPUT_PATH:-cosmos3_action_policy_action.json}"
-IMAGE_PATH="${IMAGE_PATH:-bridge_0_frame0.jpg}"
-VIDEO_PATH="${VIDEO_PATH:-bridge_0.mp4}"
-POLL_INTERVAL="${POLL_INTERVAL:-2}"
-
-if [ ! -f "${INPUT_JSON}" ]; then
-  echo "Missing input JSON: ${INPUT_JSON}" >&2
-  exit 1
-fi
-
-PROMPT="$(jq -r '.prompt' "${INPUT_JSON}")"
-VISION_URL="$(jq -r '.vision_path' "${INPUT_JSON}")"
-DOMAIN_NAME="$(jq -r '.domain_name' "${INPUT_JSON}")"
-RAW_ACTION_DIM="$(jq -r '.raw_action_dim' "${INPUT_JSON}")"
-ACTION_CHUNK_SIZE="$(jq -r '.action_chunk_size' "${INPUT_JSON}")"
-NUM_FRAMES="$(jq -r '.num_frames // 17' "${INPUT_JSON}")"
-FPS="$(jq -r '.fps // 5' "${INPUT_JSON}")"
-HEIGHT="$(jq -r '.height // 480' "${INPUT_JSON}")"
-WIDTH="$(jq -r '.width // 640' "${INPUT_JSON}")"
-NUM_INFERENCE_STEPS="$(jq -r '.num_inference_steps // 30' "${INPUT_JSON}")"
-GUIDANCE_SCALE="$(jq -r '.guidance_scale // 1.0' "${INPUT_JSON}")"
-FLOW_SHIFT="$(jq -r '.flow_shift // 5.0' "${INPUT_JSON}")"
-SEED="$(jq -r '.seed // 0' "${INPUT_JSON}")"
-
-if [ ! -f "${IMAGE_PATH}" ]; then
-  if [ ! -f "${VIDEO_PATH}" ]; then
-    echo "Downloading ${VISION_URL} -> ${VIDEO_PATH}"
-    curl -sSL "${VISION_URL}" -o "${VIDEO_PATH}"
-  fi
-  echo "Extracting first frame -> ${IMAGE_PATH}"
-  ffmpeg -y -loglevel error -i "${VIDEO_PATH}" -vf "select=eq(n\,0)" -vframes 1 "${IMAGE_PATH}"
-fi
-
-EXTRA_PARAMS="$(jq -nc \
-  --arg domain "${DOMAIN_NAME}" \
-  --argjson dim "${RAW_ACTION_DIM}" \
-  --argjson chunk "${ACTION_CHUNK_SIZE}" \
-  '{action_mode:"policy", domain_name:$domain, raw_action_dim:$dim, action_chunk_size:$chunk}')"
-
-create_response=$(
-  curl -sS -X POST "${BASE_URL}/v1/videos" \
-    -H "Accept: application/json" \
-    -F "prompt=${PROMPT}" \
-    -F "input_reference=@${IMAGE_PATH}" \
-    -F "size=${WIDTH}x${HEIGHT}" \
-    -F "num_frames=${NUM_FRAMES}" \
-    -F "fps=${FPS}" \
-    -F "num_inference_steps=${NUM_INFERENCE_STEPS}" \
-    -F "guidance_scale=${GUIDANCE_SCALE}" \
-    -F "flow_shift=${FLOW_SHIFT}" \
-    -F "extra_params=${EXTRA_PARAMS}" \
-    -F "seed=${SEED}"
-)
-
-video_id="$(echo "${create_response}" | jq -r '.id')"
-if [ -z "${video_id}" ] || [ "${video_id}" = "null" ]; then
-  echo "Failed to create video job:"
-  echo "${create_response}" | jq .
-  exit 1
-fi
-
-echo "Created video job ${video_id}"
-while true; do
-  status_response="$(curl -sS "${BASE_URL}/v1/videos/${video_id}")"
-  status="$(echo "${status_response}" | jq -r '.status')"
-
-  case "${status}" in
-    queued|in_progress)
-      echo "Video job ${video_id} status: ${status}"
-      sleep "${POLL_INTERVAL}"
-      ;;
-    completed)
-      echo "${status_response}" | jq '.data[0].action' > "${ACTION_OUTPUT_PATH}"
-      break
-      ;;
-    failed)
-      echo "Video generation failed:"
-      echo "${status_response}" | jq .
-      exit 1
-      ;;
-    *)
-      echo "Unexpected status response:"
-      echo "${status_response}" | jq .
-      exit 1
-      ;;
-  esac
-done
-
-curl -sS -L "${BASE_URL}/v1/videos/${video_id}/content" -o "${OUTPUT_PATH}"
-
-echo "Saved video to ${OUTPUT_PATH}"
-echo "Saved action metadata to ${ACTION_OUTPUT_PATH}"
diff --git a/examples/online_serving/cosmos3/run_curl_action_policy_av.sh b/examples/online_serving/cosmos3/run_curl_action_policy_av.sh
deleted file mode 100755
index 9f7a3aa18d2..00000000000
--- a/examples/online_serving/cosmos3/run_curl_action_policy_av.sh
+++ /dev/null
@@ -1,107 +0,0 @@
-#!/bin/bash
-# Cosmos3 action policy example (autonomous vehicle domain, image input).
-#
-# The example image is the first frame of the AV vision clip
-# (cosmos-dependencies), extracted locally with ffmpeg so the request matches
-# the prompt scene.
-
-set -euo pipefail
-
-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-INPUTS_DIR="${INPUTS_DIR:-${SCRIPT_DIR}/../../offline_inference/cosmos3/inputs}"
-INPUT_JSON="${INPUT_JSON:-${INPUTS_DIR}/action_policy_av.json}"
-
-BASE_URL="${BASE_URL:-http://localhost:8091}"
-OUTPUT_PATH="${OUTPUT_PATH:-cosmos3_action_policy_av.mp4}"
-ACTION_OUTPUT_PATH="${ACTION_OUTPUT_PATH:-cosmos3_action_policy_av_action.json}"
-IMAGE_PATH="${IMAGE_PATH:-av_vision_25_frame0.jpg}"
-VIDEO_PATH="${VIDEO_PATH:-av_vision_25.mp4}"
-POLL_INTERVAL="${POLL_INTERVAL:-2}"
-
-if [ ! -f "${INPUT_JSON}" ]; then
-  echo "Missing input JSON: ${INPUT_JSON}" >&2
-  exit 1
-fi
-
-PROMPT="$(jq -r '.prompt' "${INPUT_JSON}")"
-VISION_URL="$(jq -r '.vision_path' "${INPUT_JSON}")"
-DOMAIN_NAME="$(jq -r '.domain_name' "${INPUT_JSON}")"
-RAW_ACTION_DIM="$(jq -r '.raw_action_dim' "${INPUT_JSON}")"
-ACTION_CHUNK_SIZE="$(jq -r '.action_chunk_size' "${INPUT_JSON}")"
-NUM_FRAMES="$(jq -r '.num_frames // 17' "${INPUT_JSON}")"
-FPS="$(jq -r '.fps // 10' "${INPUT_JSON}")"
-HEIGHT="$(jq -r '.height // 480' "${INPUT_JSON}")"
-WIDTH="$(jq -r '.width // 640' "${INPUT_JSON}")"
-NUM_INFERENCE_STEPS="$(jq -r '.num_inference_steps // 30' "${INPUT_JSON}")"
-GUIDANCE_SCALE="$(jq -r '.guidance_scale // 1.0' "${INPUT_JSON}")"
-FLOW_SHIFT="$(jq -r '.flow_shift // 5.0' "${INPUT_JSON}")"
-SEED="$(jq -r '.seed // 0' "${INPUT_JSON}")"
-
-if [ ! -f "${IMAGE_PATH}" ]; then
-  if [ ! -f "${VIDEO_PATH}" ]; then
-    echo "Downloading ${VISION_URL} -> ${VIDEO_PATH}"
-    curl -sSL "${VISION_URL}" -o "${VIDEO_PATH}"
-  fi
-  echo "Extracting first frame -> ${IMAGE_PATH}"
-  ffmpeg -y -loglevel error -i "${VIDEO_PATH}" -vf "select=eq(n\,0)" -vframes 1 "${IMAGE_PATH}"
-fi
-
-EXTRA_PARAMS="$(jq -nc \
-  --arg domain "${DOMAIN_NAME}" \
-  --argjson dim "${RAW_ACTION_DIM}" \
-  --argjson chunk "${ACTION_CHUNK_SIZE}" \
-  '{action_mode:"policy", domain_name:$domain, raw_action_dim:$dim, action_chunk_size:$chunk}')"
-
-create_response=$(
-  curl -sS -X POST "${BASE_URL}/v1/videos" \
-    -H "Accept: application/json" \
-    -F "prompt=${PROMPT}" \
-    -F "input_reference=@${IMAGE_PATH}" \
-    -F "size=${WIDTH}x${HEIGHT}" \
-    -F "num_frames=${NUM_FRAMES}" \
-    -F "fps=${FPS}" \
-    -F "num_inference_steps=${NUM_INFERENCE_STEPS}" \
-    -F "guidance_scale=${GUIDANCE_SCALE}" \
-    -F "flow_shift=${FLOW_SHIFT}" \
-    -F "extra_params=${EXTRA_PARAMS}" \
-    -F "seed=${SEED}"
-)
-
-video_id="$(echo "${create_response}" | jq -r '.id')"
-if [ -z "${video_id}" ] || [ "${video_id}" = "null" ]; then
-  echo "Failed to create video job:"
-  echo "${create_response}" | jq .
-  exit 1
-fi
-
-echo "Created video job ${video_id}"
-while true; do
-  status_response="$(curl -sS "${BASE_URL}/v1/videos/${video_id}")"
-  status="$(echo "${status_response}" | jq -r '.status')"
-
-  case "${status}" in
-    queued|in_progress)
-      echo "Video job ${video_id} status: ${status}"
-      sleep "${POLL_INTERVAL}"
-      ;;
-    completed)
-      echo "${status_response}" | jq '.data[0].action' > "${ACTION_OUTPUT_PATH}"
-      break
-      ;;
-    failed)
-      echo "Video generation failed:"
-      echo "${status_response}" | jq .
-      exit 1
-      ;;
-    *)
-      echo "Unexpected status response:"
-      echo "${status_response}" | jq .
-      exit 1
-      ;;
-  esac
-done
-
-curl -sS -L "${BASE_URL}/v1/videos/${video_id}/content" -o "${OUTPUT_PATH}"
-
-echo "Saved video to ${OUTPUT_PATH}"
-echo "Saved action metadata to ${ACTION_OUTPUT_PATH}"
diff --git a/examples/online_serving/cosmos3/run_curl_i2v.sh b/examples/online_serving/cosmos3/run_curl_i2v.sh
deleted file mode 100755
index b4d1a594a59..00000000000
--- a/examples/online_serving/cosmos3/run_curl_i2v.sh
+++ /dev/null
@@ -1,50 +0,0 @@
-#!/bin/bash
-# Cosmos3 image-to-video example using the sync video API.
-#
-# The prompt is loaded from the canonical input JSON shared with the offline
-# example. The companion image (robot_153.jpg) is auto-downloaded if missing.
-
-set -euo pipefail
-
-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-INPUTS_DIR="${INPUTS_DIR:-${SCRIPT_DIR}/../../offline_inference/cosmos3/inputs}"
-INPUT_JSON="${INPUT_JSON:-${INPUTS_DIR}/i2v.json}"
-
-BASE_URL="${BASE_URL:-http://localhost:8091}"
-OUTPUT_PATH="${OUTPUT_PATH:-cosmos3_i2v.mp4}"
-IMAGE_PATH="${IMAGE_PATH:-robot_153.jpg}"
-
-if [ ! -f "${INPUT_JSON}" ]; then
-  echo "Missing input JSON: ${INPUT_JSON}" >&2
-  exit 1
-fi
-
-PROMPT="$(jq -r '.prompt' "${INPUT_JSON}")"
-VISION_URL="$(jq -r '.vision_path' "${INPUT_JSON}")"
-HEIGHT="$(jq -r '.height // 720' "${INPUT_JSON}")"
-WIDTH="$(jq -r '.width // 1280' "${INPUT_JSON}")"
-NUM_FRAMES="$(jq -r '.num_frames // 189' "${INPUT_JSON}")"
-FPS="$(jq -r '.fps // 24' "${INPUT_JSON}")"
-NUM_INFERENCE_STEPS="$(jq -r '.num_inference_steps // 35' "${INPUT_JSON}")"
-GUIDANCE_SCALE="$(jq -r '.guidance_scale // 6.0' "${INPUT_JSON}")"
-FLOW_SHIFT="$(jq -r '.flow_shift // 10.0' "${INPUT_JSON}")"
-
-if [ ! -f "${IMAGE_PATH}" ]; then
-  echo "Downloading ${VISION_URL} -> ${IMAGE_PATH}"
-  curl -sSL "${VISION_URL}" -o "${IMAGE_PATH}"
-fi
-
-curl -sS -X POST "${BASE_URL}/v1/videos/sync" \
-  -F "prompt=${PROMPT}" \
-  -F "negative_prompt=blurry, distorted, low quality" \
-  -F "input_reference=@${IMAGE_PATH}" \
-  -F "size=${WIDTH}x${HEIGHT}" \
-  -F "num_frames=${NUM_FRAMES}" \
-  -F "fps=${FPS}" \
-  -F "num_inference_steps=${NUM_INFERENCE_STEPS}" \
-  -F "guidance_scale=${GUIDANCE_SCALE}" \
-  -F "flow_shift=${FLOW_SHIFT}" \
-  -F "seed=42" \
-  -o "${OUTPUT_PATH}"
-
-echo "Saved video to ${OUTPUT_PATH}"
diff --git a/examples/online_serving/cosmos3/run_curl_t2i.sh b/examples/online_serving/cosmos3/run_curl_t2i.sh
deleted file mode 100755
index 04519446336..00000000000
--- a/examples/online_serving/cosmos3/run_curl_t2i.sh
+++ /dev/null
@@ -1,47 +0,0 @@
-#!/bin/bash
-# Cosmos3 text-to-image example using the images API.
-#
-# The prompt is loaded from the canonical input JSON shared with the offline
-# example so updates only need to happen in one place.
-
-set -euo pipefail
-
-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-INPUTS_DIR="${INPUTS_DIR:-${SCRIPT_DIR}/../../offline_inference/cosmos3/inputs}"
-INPUT_JSON="${INPUT_JSON:-${INPUTS_DIR}/t2i.json}"
-
-BASE_URL="${BASE_URL:-http://localhost:8091}"
-OUTPUT_PATH="${OUTPUT_PATH:-cosmos3_t2i.png}"
-
-if [ ! -f "${INPUT_JSON}" ]; then
-  echo "Missing input JSON: ${INPUT_JSON}" >&2
-  exit 1
-fi
-
-PROMPT="$(jq -r '.prompt' "${INPUT_JSON}")"
-HEIGHT="$(jq -r '.height // 960' "${INPUT_JSON}")"
-WIDTH="$(jq -r '.width // 960' "${INPUT_JSON}")"
-NUM_INFERENCE_STEPS="$(jq -r '.num_inference_steps // 50' "${INPUT_JSON}")"
-GUIDANCE_SCALE="$(jq -r '.guidance_scale // 4.0' "${INPUT_JSON}")"
-FLOW_SHIFT="$(jq -r '.flow_shift // 3.0' "${INPUT_JSON}")"
-
-curl -sS -X POST "${BASE_URL}/v1/images/generations" \
-  -H "Content-Type: application/json" \
-  -d "$(jq -nc \
-        --arg prompt "${PROMPT}" \
-        --arg negative "blurry, distorted, low quality" \
-        --arg size "${WIDTH}x${HEIGHT}" \
-        --argjson steps "${NUM_INFERENCE_STEPS}" \
-        --argjson guidance "${GUIDANCE_SCALE}" \
-        --argjson flow_shift "${FLOW_SHIFT}" \
-        '{prompt:$prompt,
-          size:$size,
-          n:1,
-          num_inference_steps:$steps,
-          guidance_scale:$guidance,
-          flow_shift:$flow_shift,
-          negative_prompt:$negative,
-          seed:42}')" \
-  | jq -r '.data[0].b64_json' | base64 -d > "${OUTPUT_PATH}"
-
-echo "Saved image to ${OUTPUT_PATH}"
diff --git a/examples/online_serving/cosmos3/run_curl_t2v.sh b/examples/online_serving/cosmos3/run_curl_t2v.sh
deleted file mode 100755
index c6cd147579f..00000000000
--- a/examples/online_serving/cosmos3/run_curl_t2v.sh
+++ /dev/null
@@ -1,42 +0,0 @@
-#!/bin/bash
-# Cosmos3 text-to-video example using the sync video API.
-#
-# The prompt is loaded from the canonical input JSON shared with the offline
-# example so updates only need to happen in one place.
-
-set -euo pipefail
-
-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-INPUTS_DIR="${INPUTS_DIR:-${SCRIPT_DIR}/../../offline_inference/cosmos3/inputs}"
-INPUT_JSON="${INPUT_JSON:-${INPUTS_DIR}/t2v.json}"
-
-BASE_URL="${BASE_URL:-http://localhost:8091}"
-OUTPUT_PATH="${OUTPUT_PATH:-cosmos3_t2v.mp4}"
-
-if [ ! -f "${INPUT_JSON}" ]; then
-  echo "Missing input JSON: ${INPUT_JSON}" >&2
-  exit 1
-fi
-
-PROMPT="$(jq -r '.prompt' "${INPUT_JSON}")"
-HEIGHT="$(jq -r '.height // 720' "${INPUT_JSON}")"
-WIDTH="$(jq -r '.width // 1280' "${INPUT_JSON}")"
-NUM_FRAMES="$(jq -r '.num_frames // 189' "${INPUT_JSON}")"
-FPS="$(jq -r '.fps // 24' "${INPUT_JSON}")"
-NUM_INFERENCE_STEPS="$(jq -r '.num_inference_steps // 35' "${INPUT_JSON}")"
-GUIDANCE_SCALE="$(jq -r '.guidance_scale // 6.0' "${INPUT_JSON}")"
-FLOW_SHIFT="$(jq -r '.flow_shift // 10.0' "${INPUT_JSON}")"
-
-curl -sS -X POST "${BASE_URL}/v1/videos/sync" \
-  -F "prompt=${PROMPT}" \
-  -F "negative_prompt=blurry, distorted, low quality" \
-  -F "size=${WIDTH}x${HEIGHT}" \
-  -F "num_frames=${NUM_FRAMES}" \
-  -F "fps=${FPS}" \
-  -F "num_inference_steps=${NUM_INFERENCE_STEPS}" \
-  -F "guidance_scale=${GUIDANCE_SCALE}" \
-  -F "flow_shift=${FLOW_SHIFT}" \
-  -F "seed=42" \
-  -o "${OUTPUT_PATH}"
-
-echo "Saved video to ${OUTPUT_PATH}"
diff --git a/examples/online_serving/cosmos3/run_curl_t2v_sound.sh b/examples/online_serving/cosmos3/run_curl_t2v_sound.sh
deleted file mode 100755
index e026dfa6c04..00000000000
--- a/examples/online_serving/cosmos3/run_curl_t2v_sound.sh
+++ /dev/null
@@ -1,45 +0,0 @@
-#!/bin/bash
-# Cosmos3 video-with-sound example.
-#
-# The prompt is loaded from the canonical input JSON shared with the offline
-# example. sound_duration is read from the JSON when present.
-
-set -euo pipefail
-
-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-INPUTS_DIR="${INPUTS_DIR:-${SCRIPT_DIR}/../../offline_inference/cosmos3/inputs}"
-INPUT_JSON="${INPUT_JSON:-${INPUTS_DIR}/t2v_sound.json}"
-
-BASE_URL="${BASE_URL:-http://localhost:8091}"
-OUTPUT_PATH="${OUTPUT_PATH:-cosmos3_t2v_sound.mp4}"
-
-if [ ! -f "${INPUT_JSON}" ]; then
-  echo "Missing input JSON: ${INPUT_JSON}" >&2
-  exit 1
-fi
-
-PROMPT="$(jq -r '.prompt' "${INPUT_JSON}")"
-SOUND_DURATION="$(jq -r '.sound_duration // 3.4' "${INPUT_JSON}")"
-HEIGHT="$(jq -r '.height // 720' "${INPUT_JSON}")"
-WIDTH="$(jq -r '.width // 1280' "${INPUT_JSON}")"
-NUM_FRAMES="$(jq -r '.num_frames // 189' "${INPUT_JSON}")"
-FPS="$(jq -r '.fps // 24' "${INPUT_JSON}")"
-NUM_INFERENCE_STEPS="$(jq -r '.num_inference_steps // 35' "${INPUT_JSON}")"
-GUIDANCE_SCALE="$(jq -r '.guidance_scale // 6.0' "${INPUT_JSON}")"
-FLOW_SHIFT="$(jq -r '.flow_shift // 10.0' "${INPUT_JSON}")"
-
-curl -sS -X POST "${BASE_URL}/v1/videos/sync" \
-  -F "prompt=${PROMPT}" \
-  -F "negative_prompt=blurry, distorted, low quality" \
-  -F "size=${WIDTH}x${HEIGHT}" \
-  -F "num_frames=${NUM_FRAMES}" \
-  -F "fps=${FPS}" \
-  -F "num_inference_steps=${NUM_INFERENCE_STEPS}" \
-  -F "guidance_scale=${GUIDANCE_SCALE}" \
-  -F "flow_shift=${FLOW_SHIFT}" \
-  -F "generate_sound=true" \
-  -F "sound_duration=${SOUND_DURATION}" \
-  -F "seed=42" \
-  -o "${OUTPUT_PATH}"
-
-echo "Saved video to ${OUTPUT_PATH}"
diff --git a/examples/online_serving/cosmos3/run_server.sh b/examples/online_serving/cosmos3/run_server.sh
deleted file mode 100644
index 9b75e810d34..00000000000
--- a/examples/online_serving/cosmos3/run_server.sh
+++ /dev/null
@@ -1,47 +0,0 @@
-#!/bin/bash
-# Cosmos3 online serving startup script.
-
-set -euo pipefail
-
-MODEL="${MODEL:-${COSMOS3_MODEL:-nvidia/Cosmos3-Nano}}"
-PORT="${PORT:-8091}"
-CACHE_BACKEND="${CACHE_BACKEND:-none}"
-ENABLE_LAYERWISE_OFFLOAD="${ENABLE_LAYERWISE_OFFLOAD:-0}"
-CFG_PARALLEL_SIZE="${CFG_PARALLEL_SIZE:-1}"
-TENSOR_PARALLEL_SIZE="${TENSOR_PARALLEL_SIZE:-1}"
-ULYSSES_DEGREE="${ULYSSES_DEGREE:-1}"
-USE_HSDP="${USE_HSDP:-0}"
-ALLOWED_LOCAL_MEDIA_PATH="${ALLOWED_LOCAL_MEDIA_PATH:-/}"
-DEPLOY_CONFIG="${DEPLOY_CONFIG:-}"
-
-args=(
-  vllm serve "${MODEL}"
-  --omni
-  --port "${PORT}"
-  --allowed-local-media-path "${ALLOWED_LOCAL_MEDIA_PATH}"
-  --cfg-parallel-size "${CFG_PARALLEL_SIZE}"
-  --tensor-parallel-size "${TENSOR_PARALLEL_SIZE}"
-)
-
-if [ -n "${DEPLOY_CONFIG}" ]; then
-  args+=(--deploy-config "${DEPLOY_CONFIG}")
-fi
-
-if [ "${ULYSSES_DEGREE}" != "1" ]; then
-  args+=(--usp "${ULYSSES_DEGREE}")
-fi
-
-if [ "${CACHE_BACKEND}" != "none" ]; then
-  args+=(--cache-backend "${CACHE_BACKEND}")
-fi
-
-if [ "${ENABLE_LAYERWISE_OFFLOAD}" != "0" ]; then
-  args+=(--enable-layerwise-offload)
-fi
-
-if [ "${USE_HSDP}" != "0" ]; then
-  args+=(--use-hsdp)
-fi
-
-echo "Starting Cosmos3 server on port ${PORT}"
-exec "${args[@]}"

From a0638308179e050c9737e189c96f1ae53436a9d5 Mon Sep 17 00:00:00 2001
From: Maciej Bala <mbala@nvidia.com>
Date: Wed, 20 May 2026 11:58:41 +0200
Subject: [PATCH 20/41] Simplify tests

Signed-off-by: Maciej Bala <mbala@nvidia.com>
---
 tests/diffusion/models/cosmos3/conftest.py    |    1 +
 .../models/cosmos3/test_cosmos3_pipeline.py   | 1407 ++++-------------
 .../cosmos3/test_cosmos3_sound_tokenizer.py   |  319 +---
 .../cosmos3/test_cosmos3_transformer.py       |  510 +-----
 .../models/test_cosmos3_guardrails.py         |   38 +-
 tests/e2e/accuracy/test_cosmos3_similarity.py |  107 +-
 6 files changed, 498 insertions(+), 1884 deletions(-)

diff --git a/tests/diffusion/models/cosmos3/conftest.py b/tests/diffusion/models/cosmos3/conftest.py
index 1864447aae2..80a7105d2ca 100644
--- a/tests/diffusion/models/cosmos3/conftest.py
+++ b/tests/diffusion/models/cosmos3/conftest.py
@@ -180,6 +180,7 @@ def make_sampling_params(**overrides: Any) -> SimpleNamespace:
         "num_frames": None,
         "num_inference_steps": None,
         "guidance_scale": None,
+        "generator": None,
         "seed": 123,
         "num_outputs_per_prompt": 1,
         "frame_rate": None,
diff --git a/tests/diffusion/models/cosmos3/test_cosmos3_pipeline.py b/tests/diffusion/models/cosmos3/test_cosmos3_pipeline.py
index 3a4b33962de..452d8d4e8b5 100644
--- a/tests/diffusion/models/cosmos3/test_cosmos3_pipeline.py
+++ b/tests/diffusion/models/cosmos3/test_cosmos3_pipeline.py
@@ -3,7 +3,6 @@
 
 from __future__ import annotations
 
-import logging
 from types import SimpleNamespace
 
 import pytest
@@ -11,10 +10,7 @@
 from PIL import Image
 from torch import nn
 
-from tests.diffusion.models.cosmos3.conftest import (
-    StubScheduler,
-    make_sampling_params,
-)
+from tests.diffusion.models.cosmos3.conftest import make_sampling_params
 
 pytestmark = [pytest.mark.core_model, pytest.mark.cpu, pytest.mark.diffusion]
 
@@ -27,712 +23,208 @@ def _mask() -> torch.Tensor:
     return torch.ones(1, 1, dtype=torch.long)
 
 
-class TestRegistryIntegration:
-    def test_pipeline_registered_and_exported(self) -> None:
-        from vllm_omni.diffusion.cache.cache_dit_backend import CUSTOM_DIT_ENABLERS
-        from vllm_omni.diffusion.models import cosmos3
-        from vllm_omni.diffusion.models.cosmos3.pipeline_cosmos3 import (
-            Cosmos3OmniDiffusersPipeline,
-        )
-        from vllm_omni.diffusion.models.progress_bar import ProgressBarMixin
-        from vllm_omni.diffusion.registry import (
-            _DIFFUSION_MODELS,
-            _DIFFUSION_POST_PROCESS_FUNCS,
-            _DIFFUSION_PRE_PROCESS_FUNCS,
-        )
-
-        assert issubclass(Cosmos3OmniDiffusersPipeline, nn.Module)
-        assert issubclass(Cosmos3OmniDiffusersPipeline, ProgressBarMixin)
-        assert Cosmos3OmniDiffusersPipeline.support_image_input is True
-        assert _DIFFUSION_MODELS["Cosmos3OmniDiffusersPipeline"] == (
-            "cosmos3",
-            "pipeline_cosmos3",
-            "Cosmos3OmniDiffusersPipeline",
-        )
-        assert _DIFFUSION_PRE_PROCESS_FUNCS["Cosmos3OmniDiffusersPipeline"] == "get_cosmos3_pre_process_func"
-        assert _DIFFUSION_POST_PROCESS_FUNCS["Cosmos3OmniDiffusersPipeline"] == "get_cosmos3_post_process_func"
-        assert "Cosmos3OmniDiffusersPipeline" in CUSTOM_DIT_ENABLERS
-        assert hasattr(cosmos3, "Cosmos3OmniDiffusersPipeline")
-        assert "Cosmos3OmniDiffusersPipeline" in cosmos3.__all__
-
-
-class TestPreAndPostProcess:
-    def test_preprocess_leaves_t2v_string_prompt_unchanged(self) -> None:
-        from vllm_omni.diffusion.models.cosmos3.pipeline_cosmos3 import get_cosmos3_pre_process_func
-
-        request = SimpleNamespace(
-            prompts=["A robot walks through a warehouse."],
-            sampling_params=SimpleNamespace(height=None, width=None),
-        )
-
-        result = get_cosmos3_pre_process_func(SimpleNamespace())(request)
-
-        assert result is request
-        assert result.prompts == ["A robot walks through a warehouse."]
-        assert result.sampling_params.height is None
-        assert result.sampling_params.width is None
-
-    def test_preprocess_resizes_i2v_image_to_720p_aspect_and_stores_tensor(self) -> None:
-        from vllm_omni.diffusion.models.cosmos3.pipeline_cosmos3 import get_cosmos3_pre_process_func
-
-        request = SimpleNamespace(
-            prompts=[
-                {
-                    "prompt": "A slow camera push.",
-                    "multi_modal_data": {"image": Image.new("RGB", (320, 160), "red")},
-                }
-            ],
-            sampling_params=SimpleNamespace(height=None, width=None),
-        )
-
-        result = get_cosmos3_pre_process_func(SimpleNamespace())(request)
-        prompt = result.prompts[0]
-
-        assert result.sampling_params.height == 672
-        assert result.sampling_params.width == 1344
-        preprocessed = prompt["additional_information"]["preprocessed_image"]
-        assert isinstance(preprocessed, torch.Tensor)
-        assert tuple(preprocessed.shape[-2:]) == (672, 1344)
-
-    def test_preprocess_preserves_explicit_size_for_i2v(self) -> None:
-        from vllm_omni.diffusion.models.cosmos3.pipeline_cosmos3 import get_cosmos3_pre_process_func
-
-        request = SimpleNamespace(
-            prompts=[
-                {
-                    "prompt": "A slow camera push.",
-                    "multi_modal_data": {"image": Image.new("RGB", (320, 160), "red")},
-                }
-            ],
-            sampling_params=SimpleNamespace(height=64, width=96),
-        )
-
-        result = get_cosmos3_pre_process_func(SimpleNamespace())(request)
+def test_pipeline_registered_and_exported() -> None:
+    from vllm_omni.diffusion.cache.cache_dit_backend import CUSTOM_DIT_ENABLERS
+    from vllm_omni.diffusion.models import cosmos3
+    from vllm_omni.diffusion.models.cosmos3.pipeline_cosmos3 import Cosmos3OmniDiffusersPipeline
+    from vllm_omni.diffusion.models.progress_bar import ProgressBarMixin
+    from vllm_omni.diffusion.registry import (
+        _DIFFUSION_MODELS,
+        _DIFFUSION_POST_PROCESS_FUNCS,
+        _DIFFUSION_PRE_PROCESS_FUNCS,
+    )
 
-        assert tuple(result.prompts[0]["additional_information"]["preprocessed_image"].shape[-2:]) == (64, 96)
+    assert issubclass(Cosmos3OmniDiffusersPipeline, nn.Module)
+    assert issubclass(Cosmos3OmniDiffusersPipeline, ProgressBarMixin)
+    assert Cosmos3OmniDiffusersPipeline.support_image_input is True
+    assert _DIFFUSION_MODELS["Cosmos3OmniDiffusersPipeline"] == (
+        "cosmos3",
+        "pipeline_cosmos3",
+        "Cosmos3OmniDiffusersPipeline",
+    )
+    assert _DIFFUSION_PRE_PROCESS_FUNCS["Cosmos3OmniDiffusersPipeline"] == "get_cosmos3_pre_process_func"
+    assert _DIFFUSION_POST_PROCESS_FUNCS["Cosmos3OmniDiffusersPipeline"] == "get_cosmos3_post_process_func"
+    assert "Cosmos3OmniDiffusersPipeline" in CUSTOM_DIT_ENABLERS
+    assert "Cosmos3OmniDiffusersPipeline" in cosmos3.__all__
 
-    def test_preprocess_action_video_stores_image_and_video_tensors(self) -> None:
-        from vllm_omni.diffusion.models.cosmos3.pipeline_cosmos3 import get_cosmos3_pre_process_func
 
-        frames = [
-            Image.new("RGB", (8, 4), "red"),
-            Image.new("RGB", (8, 4), "green"),
-            Image.new("RGB", (8, 4), "blue"),
-        ]
-        request = SimpleNamespace(
-            prompts=[
-                {
-                    "prompt": "Move the robot.",
-                    "multi_modal_data": {"video": frames},
-                }
-            ],
-            sampling_params=SimpleNamespace(
-                height=16,
-                width=32,
-                extra_args={"action_mode": "forward_dynamics"},
-            ),
-        )
+def test_preprocess_i2v_image_and_action_video_inputs() -> None:
+    from vllm_omni.diffusion.models.cosmos3.pipeline_cosmos3 import get_cosmos3_pre_process_func
 
-        result = get_cosmos3_pre_process_func(SimpleNamespace())(request)
-        additional = result.prompts[0]["additional_information"]
+    preprocess = get_cosmos3_pre_process_func(SimpleNamespace())
+    i2v = SimpleNamespace(
+        prompts=[{"prompt": "A slow camera push.", "multi_modal_data": {"image": Image.new("RGB", (320, 160))}}],
+        sampling_params=SimpleNamespace(height=None, width=None, extra_args={}),
+    )
 
-        assert tuple(additional["preprocessed_image"].shape) == (1, 3, 16, 32)
-        assert tuple(additional["preprocessed_video"].shape) == (1, 3, 3, 16, 32)
+    result = preprocess(i2v)
+    assert (result.sampling_params.height, result.sampling_params.width) == (672, 1344)
+    assert tuple(result.prompts[0]["additional_information"]["preprocessed_image"].shape[-2:]) == (672, 1344)
 
-    def test_postprocess_latent_passthrough_and_t2i_shape_validation(self) -> None:
-        from vllm_omni.diffusion.models.cosmos3.pipeline_cosmos3 import get_cosmos3_post_process_func
+    frames = [Image.new("RGB", (8, 4), color) for color in ("red", "green", "blue")]
+    action = SimpleNamespace(
+        prompts=[{"prompt": "Move.", "multi_modal_data": {"video": frames}}],
+        sampling_params=SimpleNamespace(height=16, width=32, extra_args={"action_mode": "forward_dynamics"}),
+    )
 
-        func = get_cosmos3_post_process_func(SimpleNamespace())
-        video = torch.zeros(1, 3, 1, 4, 4)
+    additional = preprocess(action).prompts[0]["additional_information"]
+    assert tuple(additional["preprocessed_image"].shape) == (1, 3, 16, 32)
+    assert tuple(additional["preprocessed_video"].shape) == (1, 3, 3, 16, 32)
 
-        assert func(video, output_type="latent") is video
 
-        images = func({"image": video})
-        assert len(images) == 1
-        assert images[0].size == (4, 4)
+def test_postprocess_handles_image_video_audio_and_validation() -> None:
+    from vllm_omni.diffusion.models.cosmos3.pipeline_cosmos3 import get_cosmos3_post_process_func
 
-        video_result = func({"video": video})
-        assert "video" in video_result
+    func = get_cosmos3_post_process_func(SimpleNamespace())
+    video = torch.zeros(1, 3, 1, 4, 4)
 
-        sound_result = func(
-            {
-                "video": video,
-                "audio": torch.ones(1, 2, 16),
-                "audio_sample_rate": 48000,
-            },
+    assert func(video, output_type="latent") is video
+    assert func({"image": video})[0].size == (4, 4)
+    assert "video" in func({"video": video})
+    assert (
+        func(
+            {"video": video, "audio": torch.ones(1, 2, 16), "audio_sample_rate": 48000},
             sampling_params=SimpleNamespace(extra_args={"resolved_frame_rate": 12}),
-        )
-        assert "video" in sound_result
-        assert sound_result["audio"].shape == (1, 2, 16)
-        assert sound_result["audio_sample_rate"] == 48000
-        assert sound_result["fps"] == 12
-
-        with pytest.raises(ValueError, match="text-to-image postprocess expects"):
-            func({"image": torch.zeros(1, 3, 2, 4, 4)})
-
-        with pytest.raises(ValueError, match="both image and video"):
-            func({"image": video, "video": video})
-
-        with pytest.raises(ValueError, match="does not support audio output"):
-            func({"image": video, "audio": torch.ones(1, 2, 16)})
-
-
-class TestPipelineHelpers:
-    def test_get_sp_param_prefers_extra_args_then_direct_attribute(self) -> None:
-        from vllm_omni.diffusion.models.cosmos3.pipeline_cosmos3 import (
-            Cosmos3OmniDiffusersPipeline,
-        )
-
-        sp = SimpleNamespace(extra_args={"flow_shift": 3.0}, flow_shift=2.0)
-        assert Cosmos3OmniDiffusersPipeline._get_sp_param(sp, "flow_shift", 1.0) == 3.0
-
-        sp = SimpleNamespace(extra_args={}, flow_shift=2.0)
-        assert Cosmos3OmniDiffusersPipeline._get_sp_param(sp, "flow_shift", 1.0) == 2.0
-
-        sp = SimpleNamespace(extra_args={})
-        assert Cosmos3OmniDiffusersPipeline._get_sp_param(sp, "flow_shift", 1.0) == 1.0
-
-    def test_apply_metadata_templates_adds_duration_and_resolution(self) -> None:
-        from vllm_omni.diffusion.models.cosmos3.pipeline_cosmos3 import (
-            Cosmos3OmniDiffusersPipeline,
-        )
-
-        prompt = Cosmos3OmniDiffusersPipeline._apply_metadata_templates(
-            "A city street.",
-            num_frames=48,
-            frame_rate=24,
-            height=720,
-            width=1280,
-        )
-
-        assert prompt == (
-            "A city street. The video is 2.0 seconds long and is of 24 FPS. This video is of 720x1280 resolution."
-        )
-
-    @pytest.mark.parametrize(
-        "tokenized",
-        [
-            [1, 2],
-            (1, 2),
-            {"input_ids": [[1, 2]]},
-            torch.tensor([1, 2]),
-        ],
+        )["audio_sample_rate"]
+        == 48000
     )
-    def test_normalize_token_ids_accepts_common_tokenizer_outputs(self, tokenized) -> None:
-        from vllm_omni.diffusion.models.cosmos3.pipeline_cosmos3 import (
-            Cosmos3OmniDiffusersPipeline,
-        )
-
-        assert Cosmos3OmniDiffusersPipeline._normalize_token_ids(tokenized) == [1, 2]
-
-    def test_normalize_token_ids_rejects_unknown_or_non_integer_values(self) -> None:
-        from vllm_omni.diffusion.models.cosmos3.pipeline_cosmos3 import (
-            Cosmos3OmniDiffusersPipeline,
-        )
-
-        with pytest.raises(TypeError, match="must return token IDs"):
-            Cosmos3OmniDiffusersPipeline._normalize_token_ids(object())
-
-        with pytest.raises(TypeError, match="non-integer token"):
-            Cosmos3OmniDiffusersPipeline._normalize_token_ids([object()])
-
-    def test_tokenize_prompt_adds_generation_tokens_and_padding(self, make_cosmos3_pipeline) -> None:
-        pipeline = make_cosmos3_pipeline()
-
-        class FakeTokenizer:
-            eos_token_id = 99
-            pad_token_id = 0
-
-            def __init__(self) -> None:
-                self.conversations = None
-
-            def apply_chat_template(self, conversations, tokenize: bool, add_generation_prompt: bool):
-                self.conversations = conversations
-                assert tokenize is True
-                assert add_generation_prompt is True
-                return [10, 11]
-
-            def convert_tokens_to_ids(self, token: str) -> int:
-                assert token == "<|vision_start|>"
-                return 88
-
-        tokenizer = FakeTokenizer()
-        pipeline.tokenizer = tokenizer
-
-        input_ids, attention_mask = pipeline._tokenize_prompt(
-            "hello",
-            max_sequence_length=6,
-            use_system_prompt=True,
-            system_prompt="system",
-        )
-
-        assert input_ids.tolist() == [[10, 11, 99, 88, 0, 0]]
-        assert attention_mask.tolist() == [[1, 1, 1, 1, 0, 0]]
-        assert tokenizer.conversations == [
-            {"role": "system", "content": "system"},
-            {"role": "user", "content": "hello"},
-        ]
-
-    def test_format_and_tokenize_uses_video_and_image_metadata_modes(self, make_cosmos3_pipeline) -> None:
-        pipeline = make_cosmos3_pipeline()
-        captured: list[tuple[str, bool, str | None]] = []
 
-        def fake_tokenize(text, max_sequence_length, use_system_prompt=False, system_prompt=None):
-            del max_sequence_length
-            captured.append((text, use_system_prompt, system_prompt))
-            return _ids(len(captured)), _mask()
-
-        pipeline._tokenize_prompt = fake_tokenize  # type: ignore[method-assign]
-
-        pipeline._format_and_tokenize_prompts(
-            "A robot",
-            "bad",
-            num_frames=48,
-            frame_rate=24,
-            height=720,
-            width=1280,
-            max_sequence_length=32,
-            sp=SimpleNamespace(extra_args={"negative_metadata_mode": "inverse"}),
-            use_system_prompt=True,
-            is_t2i=False,
-        )
-        assert "The video is 2.0 seconds long" in captured[0][0]
-        assert "This video is of 720x1280 resolution" in captured[0][0]
-        assert "The video is not 2.0 seconds long" in captured[1][0]
-        assert captured[0][1] is True
-
-        captured.clear()
-        pipeline._format_and_tokenize_prompts(
-            "A robot",
-            "bad",
-            num_frames=1,
-            frame_rate=24,
-            height=1024,
-            width=1024,
-            max_sequence_length=32,
-            sp=SimpleNamespace(extra_args={}),
-            use_system_prompt=False,
-            is_t2i=True,
-        )
-        assert "This image is of 1024x1024 resolution" in captured[0][0]
-        assert "seconds long" not in captured[0][0]
-        assert captured[1][0] == "bad"
-
-    @pytest.mark.parametrize(
-        ("key", "expected"),
-        [
-            ("transformer.vae2llm.weight", "transformer.vae2llm.weight"),
-            ("model.embed_tokens.weight", "transformer.language_model.embed_tokens.weight"),
-            ("model.norm.weight", "transformer.language_model.norm.weight"),
-            ("model.norm_moe_gen.weight", "transformer.norm_moe_gen.weight"),
-            (
-                "model.layers.3.self_attn.q_proj.weight",
-                "transformer.language_model.layers.3.self_attn.q_proj.weight",
-            ),
-            (
-                "model.layers.3.self_attn.q_proj_moe_gen.weight",
-                "transformer.gen_layers.3.cross_attention.q_proj.weight",
-            ),
-            (
-                "model.layers.3.mlp_moe_gen.down_proj.weight",
-                "transformer.gen_layers.3.mlp.down_proj.weight",
-            ),
-            ("sound2llm.weight", "transformer.sound2llm.weight"),
-            ("llm2sound.bias", "transformer.llm2sound.bias"),
-            ("sound_modality_embed", "transformer.sound_modality_embed"),
-            ("sound_modality_embed.weight", "transformer.sound_modality_embed"),
-            ("action2llm.fc.weight", "transformer.action2llm.fc.weight"),
-            ("llm2action.bias.weight", "transformer.llm2action.bias.weight"),
-            ("action_modality_embed", "transformer.action_modality_embed"),
-            ("action_modality_embed.weight", "transformer.action_modality_embed"),
-            ("action_pos_embed.weight", None),
-            ("lm_head.weight", None),
-            ("other.weight", None),
-        ],
+    with pytest.raises(ValueError, match="text-to-image postprocess expects"):
+        func({"image": torch.zeros(1, 3, 2, 4, 4)})
+    with pytest.raises(ValueError, match="both image and video"):
+        func({"image": video, "video": video})
+
+
+def test_prompt_formatting_and_checkpoint_key_remap(make_cosmos3_pipeline) -> None:
+    from vllm_omni.diffusion.models.cosmos3.pipeline_cosmos3 import Cosmos3OmniDiffusersPipeline
+
+    pipeline = make_cosmos3_pipeline()
+    captured: list[str] = []
+    pipeline._tokenize_prompt = lambda text, *args, **kwargs: (captured.append(text) or _ids(len(captured)), _mask())
+
+    pipeline._format_and_tokenize_prompts(
+        "A robot",
+        "bad",
+        num_frames=48,
+        frame_rate=24,
+        height=720,
+        width=1280,
+        max_sequence_length=32,
+        sp=SimpleNamespace(extra_args={"negative_metadata_mode": "inverse"}),
+        use_system_prompt=True,
+        is_t2i=False,
     )
-    def test_remap_ckpt_key(self, key: str, expected: str | None) -> None:
-        from vllm_omni.diffusion.models.cosmos3.pipeline_cosmos3 import (
-            Cosmos3OmniDiffusersPipeline,
-        )
-
-        assert Cosmos3OmniDiffusersPipeline._remap_ckpt_key(key) == expected
-
-    def test_prepare_latents_shape_uses_cosmos_temporal_and_spatial_factors(self, make_cosmos3_pipeline) -> None:
-        pipeline = make_cosmos3_pipeline()
-
-        latents = pipeline._prepare_latents(
-            height=16,
-            width=24,
-            num_frames=5,
-            generator=torch.Generator(device="cpu").manual_seed(0),
-        )
-
-        assert latents.shape == (1, 2, 2, 2, 3)
-        assert latents.dtype == torch.float32
-
-    def test_sound_request_detection_uses_prompt_and_extra_args(self) -> None:
-        from vllm_omni.diffusion.models.cosmos3.pipeline_cosmos3 import (
-            Cosmos3OmniDiffusersPipeline,
-        )
-
-        assert Cosmos3OmniDiffusersPipeline._is_sound_request(
-            {"prompt": "x", "generate_sound": True},
-            SimpleNamespace(extra_args={}),
-        )
-        assert Cosmos3OmniDiffusersPipeline._is_sound_request(
-            {"prompt": "x"},
-            SimpleNamespace(extra_args={"enable_sound_generation": "true"}),
-        )
-        assert not Cosmos3OmniDiffusersPipeline._is_sound_request(
-            {"prompt": "x"},
-            SimpleNamespace(extra_args={"generate_sound": False}),
-        )
-
-    def test_prepare_sound_latents_uses_lazy_tokenizer_and_duration(self, make_cosmos3_pipeline) -> None:
-        pipeline = make_cosmos3_pipeline()
-        pipeline.transformer = pipeline.transformer.__class__(latent_channel_size=2, sound_gen=True, sound_dim=3)
-
-        class FakeSoundTokenizer:
-            sample_rate = 10
-            latent_ch = 3
-            hop_size = 4
-
-            def decode(self, latents: torch.Tensor) -> torch.Tensor:
-                return torch.ones(latents.shape[0], 2, 24)
-
-        pipeline._sound_tokenizer = FakeSoundTokenizer()
-
-        target_samples, duration, sample_rate = pipeline._resolve_sound_target_samples(
-            SimpleNamespace(extra_args={"sound_duration": 2.0}),
-            num_frames=9,
-            frame_rate=3.0,
-        )
-        latents, latent_frames = pipeline._prepare_sound_latents(
-            21,
-            torch.Generator(device="cpu").manual_seed(0),
-        )
-        audio = pipeline._decode_sound_latents(torch.zeros(1, 3, 6), target_audio_samples=21)
-
-        assert (target_samples, duration, sample_rate) == (20, 2.0, 10)
-        assert latents.shape == (1, 3, 6)
-        assert latent_frames == 6
-        assert audio.shape == (1, 2, 21)
-
-    def test_init_eagerly_loads_sound_tokenizer_when_transformer_supports_sound(
-        self,
-        tmp_path,
-        monkeypatch: pytest.MonkeyPatch,
-    ) -> None:
-        import vllm_omni.diffusion.models.cosmos3.pipeline_cosmos3 as cosmos3_module
-        from vllm_omni.diffusion.models.cosmos3 import sound_tokenizer
-
-        class FakeTokenizer:
-            @classmethod
-            def from_pretrained(cls, *args, **kwargs):
-                return cls()
-
-        class FakeVAE:
-            config = SimpleNamespace(scale_factor_temporal=4, scale_factor_spatial=8)
-
-            @classmethod
-            def from_pretrained(cls, *args, **kwargs):
-                return cls()
-
-            def to(self, device):
-                self.device = device
-                return self
-
-        class FakeScheduler:
-            config = SimpleNamespace(flow_shift=1.0)
-
-            @classmethod
-            def from_pretrained(cls, *args, **kwargs):
-                return cls()
-
-        class FakeTransformer:
-            sound_gen = True
-
-        fake_sound_tokenizer = object()
-        calls = []
-
-        def fake_from_config(od_config):
-            calls.append(od_config)
-            return fake_sound_tokenizer
-
-        monkeypatch.setattr(cosmos3_module, "AutoTokenizer", FakeTokenizer)
-        monkeypatch.setattr(cosmos3_module, "DistributedAutoencoderKLWan", FakeVAE)
-        monkeypatch.setattr(cosmos3_module, "UniPCMultistepScheduler", FakeScheduler)
-        monkeypatch.setattr(cosmos3_module, "Cosmos3VFMTransformer", lambda *args, **kwargs: FakeTransformer())
-        monkeypatch.setattr(sound_tokenizer.Cosmos3SoundTokenizer, "from_config", staticmethod(fake_from_config))
-        monkeypatch.setattr(
-            cosmos3_module.Cosmos3OmniDiffusersPipeline,
-            "setup_diffusion_pipeline_profiler",
-            lambda self, **kwargs: None,
-        )
-
-        od_config = SimpleNamespace(
-            model=str(tmp_path),
-            dtype=torch.float32,
-            enable_cpu_offload=False,
-            flow_shift=None,
-            enable_diffusion_pipeline_profiler=False,
-        )
-        pipeline = cosmos3_module.Cosmos3OmniDiffusersPipeline(od_config=od_config)
-
-        assert calls == [od_config]
-        assert pipeline._sound_tokenizer is fake_sound_tokenizer
-        source = pipeline.weights_sources[0]
-        assert source.subfolder is None
-        assert source.prefix == "transformer."
-        assert source.allow_patterns_overrides == ["transformer/*.safetensors"]
-
-    def test_prepare_latents_i2v_conditions_first_latent_frame(self, make_cosmos3_pipeline) -> None:
-        pipeline = make_cosmos3_pipeline()
-
-        def fake_encode(image_tensor, num_frames, height, width):
-            del image_tensor, num_frames, height, width
-            return torch.full((1, 2, 2, 2, 3), 5.0)
-
-        pipeline._encode_conditioning_video = fake_encode  # type: ignore[method-assign]
-
-        latents, velocity_mask, image_latent = pipeline._prepare_latents_i2v(
-            image_tensor=torch.zeros(1, 3, 16, 24),
-            height=16,
-            width=24,
-            num_frames=5,
-            generator=torch.Generator(device="cpu").manual_seed(0),
-        )
-
-        assert latents.shape == (1, 2, 2, 2, 3)
-        torch.testing.assert_close(latents[:, :, 0], torch.full((1, 2, 2, 3), 5.0))
-        assert velocity_mask.tolist() == [[[[[0.0]], [[1.0]]]]]
-        torch.testing.assert_close(image_latent, torch.full((1, 2, 1, 2, 3), 5.0))
-
-    def test_prepare_action_latents_policy_uses_noise_and_raw_dim_mask(self, make_cosmos3_pipeline) -> None:
-        pipeline = make_cosmos3_pipeline()
-        pipeline.transformer = pipeline.transformer.__class__(action_gen=True, action_dim=4)
-
-        action, velocity_mask, clean, raw_dim = pipeline._prepare_action_latents(
-            mode="policy",
-            action_chunk_size=3,
-            raw_action_dim=2,
-            generator=torch.Generator(device="cpu").manual_seed(0),
-            sp=SimpleNamespace(extra_args={}),
-        )
-
-        assert action.shape == (1, 3, 4)
-        assert raw_dim == 2
-        assert velocity_mask.tolist() == [[[1.0], [1.0], [1.0]]]
-        torch.testing.assert_close(action[:, :, 2:], torch.zeros(1, 3, 2))
-        torch.testing.assert_close(clean, torch.zeros(1, 3, 4))
-
-    def test_prepare_action_latents_forward_dynamics_conditions_supplied_actions(self, make_cosmos3_pipeline) -> None:
-        pipeline = make_cosmos3_pipeline()
-        pipeline.transformer = pipeline.transformer.__class__(action_gen=True, action_dim=4)
-
-        action, velocity_mask, clean, raw_dim = pipeline._prepare_action_latents(
-            mode="forward_dynamics",
-            action_chunk_size=2,
-            raw_action_dim=None,
-            generator=torch.Generator(device="cpu").manual_seed(0),
-            sp=SimpleNamespace(extra_args={"action": [[1.0, 2.0], [3.0, 4.0]]}),
-        )
-
-        assert raw_dim == 2
-        assert velocity_mask.tolist() == [[[0.0], [0.0]]]
-        torch.testing.assert_close(action, clean)
-        torch.testing.assert_close(action[0, :, :2], torch.tensor([[1.0, 2.0], [3.0, 4.0]]))
-
-    def test_set_flow_shift_rebuilds_only_when_target_changes(self, make_cosmos3_pipeline, monkeypatch) -> None:
-        import vllm_omni.diffusion.models.cosmos3.pipeline_cosmos3 as cosmos3_module
-
-        pipeline = make_cosmos3_pipeline()
-
-        class FakeUniPCMultistepScheduler:
-            calls: list[tuple[object, float]] = []
-
-            @classmethod
-            def from_config(cls, config, flow_shift: float):
-                cls.calls.append((config, flow_shift))
-                return StubScheduler([1], flow_shift=flow_shift)
-
-        monkeypatch.setattr(cosmos3_module, "UniPCMultistepScheduler", FakeUniPCMultistepScheduler)
-        original_scheduler = pipeline.scheduler
-
-        pipeline._set_flow_shift(1.0)
-        assert pipeline.scheduler is original_scheduler
-        assert FakeUniPCMultistepScheduler.calls == []
-
-        pipeline._set_flow_shift(3.0)
-        assert pipeline.scheduler is not original_scheduler
-        assert pipeline._current_flow_shift == 3.0
-        assert FakeUniPCMultistepScheduler.calls == [(pipeline._base_scheduler_config, 3.0)]
-
-
-class TestDiffuse:
-    def test_diffuse_without_cfg_runs_one_cond_forward_per_step(self, make_cosmos3_pipeline) -> None:
-        pipeline = make_cosmos3_pipeline()
-        latents = torch.zeros(1, 2, 2, 1, 1)
-
-        result = pipeline.diffuse(
-            latents=latents,
-            timesteps=torch.tensor([7, 3]),
-            cond_ids=_ids(2),
-            cond_mask=_mask(),
-            uncond_ids=_ids(1),
-            uncond_mask=_mask(),
-            guidance_scale=1.0,
-            shared_kwargs={"video_shape": (2, 1, 1), "fps": 24.0},
-        )
-
-        assert pipeline.transformer.reset_calls == 1
-        assert [call["token"] for call in pipeline.transformer.calls] == [2, 2]
-        torch.testing.assert_close(result, torch.full_like(latents, 4.0))
-
-    def test_diffuse_sequential_cfg_uses_separate_caches_and_interval_skip(self, make_cosmos3_pipeline) -> None:
-        pipeline = make_cosmos3_pipeline()
-        latents = torch.zeros(1, 2, 1, 1, 1)
-
-        result = pipeline.diffuse(
-            latents=latents,
-            timesteps=torch.tensor([900, 100]),
-            cond_ids=_ids(2),
-            cond_mask=_mask(),
-            uncond_ids=_ids(1),
-            uncond_mask=_mask(),
-            guidance_scale=3.0,
-            shared_kwargs={"video_shape": (1, 1, 1), "fps": 24.0},
-            guidance_interval=(500.0, 1000.0),
-        )
-
-        assert [call["token"] for call in pipeline.transformer.calls] == [2, 1, 2]
-        assert pipeline.transformer.calls[0]["cache_before"] is None
-        assert pipeline.transformer.calls[1]["cache_before"] is None
-        assert pipeline.transformer.calls[2]["cache_before"] is not None
-        torch.testing.assert_close(result, torch.full_like(latents, 6.0))
-
-    def test_diffuse_cfg_parallel_uses_scale_one_outside_guidance_interval(
-        self,
-        make_cosmos3_pipeline,
-    ) -> None:
-        pipeline = make_cosmos3_pipeline()
-        pipeline._cfg_parallel_active = lambda: True  # type: ignore[method-assign]
-        latents = torch.zeros(1, 2, 1, 1, 1)
-        calls = []
-
-        def fake_predict_noise_maybe_with_cfg(**kwargs):
-            calls.append(kwargs)
-            return torch.ones_like(latents)
-
-        pipeline.predict_noise_maybe_with_cfg = fake_predict_noise_maybe_with_cfg  # type: ignore[method-assign]
-
-        result = pipeline.diffuse(
-            latents=latents,
-            timesteps=torch.tensor([900, 100]),
-            cond_ids=_ids(2),
-            cond_mask=_mask(),
-            uncond_ids=_ids(1),
-            uncond_mask=_mask(),
-            guidance_scale=4.0,
-            shared_kwargs={"video_shape": (1, 1, 1), "fps": 24.0},
-            guidance_interval=(500.0, 1000.0),
-        )
-
-        assert [call["true_cfg_scale"] for call in calls] == [4.0, 1.0]
-        assert calls[0]["positive_kwargs"]["text_ids"].item() == 2
-        assert calls[0]["negative_kwargs"]["text_ids"].item() == 1
-        torch.testing.assert_close(result, torch.full_like(latents, 2.0))
-
-    def test_diffuse_i2v_masks_conditioned_frame_and_reinjects_image_latent(self, make_cosmos3_pipeline) -> None:
-        pipeline = make_cosmos3_pipeline()
-        latents = torch.zeros(1, 2, 2, 1, 1)
-        velocity_mask = torch.tensor([[[[[0.0]], [[1.0]]]]])
-        image_latent = torch.full((1, 2, 1, 1, 1), 7.0)
-
-        result = pipeline.diffuse(
-            latents=latents,
-            timesteps=torch.tensor([7]),
-            cond_ids=_ids(2),
-            cond_mask=_mask(),
-            uncond_ids=_ids(1),
-            uncond_mask=_mask(),
-            guidance_scale=1.0,
-            shared_kwargs={"video_shape": (2, 1, 1), "fps": 24.0},
-            velocity_mask=velocity_mask,
-            image_latent=image_latent,
-        )
-
-        torch.testing.assert_close(result[:, :, 0:1], image_latent)
-        torch.testing.assert_close(result[:, :, 1:2], torch.full((1, 2, 1, 1, 1), 2.0))
-
-    def test_diffuse_with_sound_steps_video_and_sound_jointly(self, make_cosmos3_pipeline) -> None:
-        pipeline = make_cosmos3_pipeline()
-        pipeline.transformer = pipeline.transformer.__class__(latent_channel_size=2, sound_gen=True, sound_dim=3)
-        latents = torch.zeros(1, 2, 1, 1, 1)
-        sound_latents = torch.zeros(1, 3, 2)
-
-        video_result, sound_result = pipeline.diffuse(
-            latents=latents,
-            sound_latents=sound_latents,
-            timesteps=torch.tensor([7, 3]),
-            cond_ids=_ids(2),
-            cond_mask=_mask(),
-            uncond_ids=_ids(1),
-            uncond_mask=_mask(),
-            guidance_scale=1.0,
-            shared_kwargs={"video_shape": (1, 1, 1), "fps": 24.0},
-        )
-
-        torch.testing.assert_close(video_result, torch.full_like(latents, 4.0))
-        torch.testing.assert_close(sound_result, torch.full_like(sound_latents, 24.0))
-        assert pipeline.scheduler.step_calls[0][0].shape == (1, latents.numel() + sound_latents.numel())
-
-    def test_diffuse_with_action_steps_video_and_action_jointly(self, make_cosmos3_pipeline) -> None:
-        pipeline = make_cosmos3_pipeline()
-        pipeline.transformer = pipeline.transformer.__class__(latent_channel_size=2, action_gen=True, action_dim=4)
-        latents = torch.zeros(1, 2, 1, 1, 1)
-        action_latents = torch.zeros(1, 3, 4)
-
-        video_result, action_result = pipeline.diffuse(
-            latents=latents,
-            action_latents=action_latents,
-            action_velocity_mask=torch.ones(1, 3, 1),
-            action_condition_latents=torch.zeros(1, 3, 4),
-            timesteps=torch.tensor([7, 3]),
-            cond_ids=_ids(2),
-            cond_mask=_mask(),
-            uncond_ids=_ids(1),
-            uncond_mask=_mask(),
-            guidance_scale=1.0,
-            shared_kwargs={
-                "video_shape": (1, 1, 1),
-                "fps": 24.0,
-                "action_domain_ids": torch.tensor([0]),
-                "action_noisy_mask": torch.ones(1, 3, 1),
-            },
-        )
-
-        torch.testing.assert_close(video_result, torch.full_like(latents, 4.0))
-        torch.testing.assert_close(action_result, torch.full_like(action_latents, 44.0))
-        assert pipeline.scheduler.step_calls[0][0].shape == (1, latents.numel() + action_latents.numel())
+    assert "The video is 2.0 seconds long" in captured[0]
+    assert "The video is not 2.0 seconds long" in captured[1]
+
+    remaps = {
+        "model.embed_tokens.weight": "transformer.language_model.embed_tokens.weight",
+        "model.layers.3.self_attn.q_proj.weight": "transformer.language_model.layers.3.self_attn.q_proj.weight",
+        "model.layers.3.self_attn.q_proj_moe_gen.weight": "transformer.gen_layers.3.cross_attention.q_proj.weight",
+        "sound2llm.weight": "transformer.sound2llm.weight",
+        "action_modality_embed.weight": "transformer.action_modality_embed",
+        "lm_head.weight": None,
+    }
+    assert {key: Cosmos3OmniDiffusersPipeline._remap_ckpt_key(key) for key in remaps} == remaps
+
+
+def test_prepare_latents_for_video_image_sound_and_action(make_cosmos3_pipeline) -> None:
+    pipeline = make_cosmos3_pipeline()
+    latents = pipeline._prepare_latents(16, 24, 5, torch.Generator(device="cpu").manual_seed(0))
+    assert latents.shape == (1, 2, 2, 2, 3)
+
+    pipeline._encode_conditioning_video = lambda *args, **kwargs: torch.full((1, 2, 2, 2, 3), 5.0)
+    i2v_latents, velocity_mask, image_latent = pipeline._prepare_latents_i2v(
+        torch.zeros(1, 3, 16, 24), 16, 24, 5, torch.Generator(device="cpu").manual_seed(0)
+    )
+    torch.testing.assert_close(i2v_latents[:, :, 0], torch.full((1, 2, 2, 3), 5.0))
+    assert velocity_mask.tolist() == [[[[[0.0]], [[1.0]]]]]
+    assert image_latent.shape == (1, 2, 1, 2, 3)
+
+    pipeline.transformer = pipeline.transformer.__class__(latent_channel_size=2, sound_gen=True, sound_dim=3)
+    pipeline._sound_tokenizer = SimpleNamespace(
+        sample_rate=10,
+        latent_ch=3,
+        hop_size=4,
+        decode=lambda x: torch.ones(x.shape[0], 2, 24),
+    )
+    assert pipeline._resolve_sound_target_samples(SimpleNamespace(extra_args={"sound_duration": 2.0}), 9, 3.0) == (
+        20,
+        2.0,
+        10,
+    )
+    sound_latents, latent_frames = pipeline._prepare_sound_latents(21, torch.Generator(device="cpu").manual_seed(0))
+    assert (sound_latents.shape, latent_frames) == (torch.Size([1, 3, 6]), 6)
+    assert pipeline._decode_sound_latents(torch.zeros(1, 3, 6), target_audio_samples=21).shape == (1, 2, 21)
+
+    pipeline.transformer = pipeline.transformer.__class__(action_gen=True, action_dim=4)
+    action, action_mask, clean, raw_dim = pipeline._prepare_action_latents(
+        mode="forward_dynamics",
+        action_chunk_size=2,
+        raw_action_dim=None,
+        generator=torch.Generator(device="cpu").manual_seed(0),
+        sp=SimpleNamespace(extra_args={"action": [[1.0, 2.0], [3.0, 4.0]]}),
+    )
+    assert raw_dim == 2
+    assert action_mask.tolist() == [[[0.0], [0.0]]]
+    torch.testing.assert_close(action, clean)
+
+
+def test_diffuse_covers_cfg_i2v_and_multimodal_steps(make_cosmos3_pipeline) -> None:
+    pipeline = make_cosmos3_pipeline()
+    latents = torch.zeros(1, 2, 1, 1, 1)
+
+    result = pipeline.diffuse(
+        latents=latents,
+        timesteps=torch.tensor([900, 100]),
+        cond_ids=_ids(2),
+        cond_mask=_mask(),
+        uncond_ids=_ids(1),
+        uncond_mask=_mask(),
+        guidance_scale=3.0,
+        shared_kwargs={"video_shape": (1, 1, 1), "fps": 24.0},
+        guidance_interval=(500.0, 1000.0),
+    )
+    assert [call["token"] for call in pipeline.transformer.calls] == [2, 1, 2]
+    torch.testing.assert_close(result, torch.full_like(latents, 6.0))
+
+    i2v = pipeline.diffuse(
+        latents=torch.zeros(1, 2, 2, 1, 1),
+        timesteps=torch.tensor([7]),
+        cond_ids=_ids(2),
+        cond_mask=_mask(),
+        uncond_ids=_ids(1),
+        uncond_mask=_mask(),
+        guidance_scale=1.0,
+        shared_kwargs={"video_shape": (2, 1, 1), "fps": 24.0},
+        velocity_mask=torch.tensor([[[[[0.0]], [[1.0]]]]]),
+        image_latent=torch.full((1, 2, 1, 1, 1), 7.0),
+    )
+    torch.testing.assert_close(i2v[:, :, 0:1], torch.full((1, 2, 1, 1, 1), 7.0))
+
+    pipeline.transformer = pipeline.transformer.__class__(latent_channel_size=2, action_gen=True, action_dim=4)
+    video_result, action_result = pipeline.diffuse(
+        latents=latents,
+        action_latents=torch.zeros(1, 3, 4),
+        action_velocity_mask=torch.ones(1, 3, 1),
+        action_condition_latents=torch.zeros(1, 3, 4),
+        timesteps=torch.tensor([7, 3]),
+        cond_ids=_ids(2),
+        cond_mask=_mask(),
+        uncond_ids=_ids(1),
+        uncond_mask=_mask(),
+        guidance_scale=1.0,
+        shared_kwargs={"video_shape": (1, 1, 1), "fps": 24.0, "action_domain_ids": torch.tensor([0])},
+    )
+    torch.testing.assert_close(video_result, torch.full_like(latents, 4.0))
+    torch.testing.assert_close(action_result, torch.full((), 44.0).expand_as(action_result))
 
 
 class TestForwardRouting:
     def _install_forward_stubs(self, pipeline):
         captured: dict[str, object] = {"diffuse_calls": [], "prepare_calls": []}
 
-        def fake_format(
-            prompt,
-            negative_prompt,
-            num_frames,
-            frame_rate,
-            height,
-            width,
-            max_sequence_length,
-            sp,
-            use_system_prompt=False,
-            is_t2i=False,
-        ):
+        def fake_format(prompt, negative_prompt, num_frames, frame_rate, height, width, *args, **kwargs):
             captured["format"] = {
                 "prompt": prompt,
                 "negative_prompt": negative_prompt,
@@ -740,10 +232,7 @@ def fake_format(
                 "frame_rate": frame_rate,
                 "height": height,
                 "width": width,
-                "max_sequence_length": max_sequence_length,
-                "use_system_prompt": use_system_prompt,
-                "is_t2i": is_t2i,
-                "sp": sp,
+                "is_t2i": kwargs["is_t2i"],
             }
             return _ids(2), _mask(), _ids(1), _mask()
 
@@ -751,14 +240,6 @@ def fake_prepare(height, width, num_frames, generator):
             captured["prepare_calls"].append((height, width, num_frames, generator.initial_seed()))
             return torch.zeros(1, 2, 1, 1, 1)
 
-        def fake_set_flow_shift(target):
-            captured.setdefault("flow_shifts", []).append(target)
-            pipeline._current_flow_shift = target
-
-        def fake_set_scheduler_timesteps(num_inference_steps):
-            captured.setdefault("scheduler_steps", []).append(num_inference_steps)
-            pipeline.scheduler.timesteps = torch.tensor([7])
-
         def fake_diffuse(**kwargs):
             captured["diffuse_calls"].append(kwargs)
             outputs = [kwargs["latents"] + len(captured["diffuse_calls"])]
@@ -768,453 +249,137 @@ def fake_diffuse(**kwargs):
                 outputs.append(kwargs["sound_latents"] + 2.0)
             return outputs[0] if len(outputs) == 1 else tuple(outputs)
 
-        pipeline._format_and_tokenize_prompts = fake_format  # type: ignore[method-assign]
-        pipeline._prepare_latents = fake_prepare  # type: ignore[method-assign]
-        pipeline._set_flow_shift = fake_set_flow_shift  # type: ignore[method-assign]
-        pipeline._set_scheduler_timesteps = fake_set_scheduler_timesteps  # type: ignore[method-assign]
-        pipeline.diffuse = fake_diffuse  # type: ignore[method-assign]
-        pipeline._decode_latents = lambda latents: latents  # type: ignore[method-assign]
-        return captured
-
-    def _install_sound_stubs(self, pipeline):
-        sound_latents = torch.zeros(1, 3, 4)
-        decoded_audio = torch.ones(1, 2, 20)
-
-        def fake_resolve_sound_target_samples(sp, num_frames, frame_rate):
-            del sp, num_frames, frame_rate
-            return 20, 2.0, 10
-
-        def fake_prepare_sound_latents(target_samples, generator):
-            del target_samples, generator
-            return sound_latents, 4
-
-        pipeline._resolve_sound_target_samples = fake_resolve_sound_target_samples  # type: ignore[method-assign]
-        pipeline._prepare_sound_latents = fake_prepare_sound_latents  # type: ignore[method-assign]
-        pipeline._decode_sound_latents = lambda latents, target_samples: decoded_audio  # type: ignore[method-assign]
-        return sound_latents, decoded_audio
+        pipeline._format_and_tokenize_prompts = fake_format
+        pipeline._prepare_latents = fake_prepare
+        pipeline._set_flow_shift = lambda target: captured.setdefault("flow_shifts", []).append(target)
 
-    def test_forward_uses_t2i_defaults_and_generates_multiple_outputs(self, make_cosmos3_pipeline) -> None:
-        pipeline = make_cosmos3_pipeline()
-        captured = self._install_forward_stubs(pipeline)
-        req = SimpleNamespace(
-            prompts=[{"prompt": "A painted robot", "modalities": ["image"]}],
-            sampling_params=make_sampling_params(num_outputs_per_prompt=2),
-        )
-
-        output = pipeline.forward(req)
-
-        assert captured["flow_shifts"] == [3.0]
-        assert captured["scheduler_steps"] == [50, 50]
-        assert captured["format"]["is_t2i"] is True
-        assert captured["format"]["negative_prompt"] == ""
-        assert captured["format"]["height"] == 1024
-        assert captured["format"]["width"] == 1024
-        assert captured["format"]["num_frames"] == 1
-        assert len(captured["diffuse_calls"]) == 2
-        assert captured["diffuse_calls"][0]["guidance_interval"] == (400.0, 1000.0)
-        assert output.output["image"].shape[0] == 2
-
-    def test_forward_uses_t2v_defaults_and_engine_flow_shift(self, make_cosmos3_pipeline) -> None:
-        from vllm_omni.diffusion.models.cosmos3.pipeline_cosmos3 import COSMOS3_T2V_NEGATIVE_PROMPT
-
-        pipeline = make_cosmos3_pipeline()
-        captured = self._install_forward_stubs(pipeline)
-        req = SimpleNamespace(
-            prompts=[{"prompt": "A warehouse robot", "modalities": ["video"]}],
-            sampling_params=make_sampling_params(),
-        )
-
-        pipeline.forward(req)
-
-        assert captured["flow_shifts"] == [1.0]
-        assert captured["scheduler_steps"] == [35]
-        assert captured["format"]["is_t2i"] is False
-        assert captured["format"]["negative_prompt"] == COSMOS3_T2V_NEGATIVE_PROMPT
-        assert captured["format"]["height"] == 720
-        assert captured["format"]["width"] == 1280
-        assert captured["format"]["num_frames"] == 189
-        assert captured["diffuse_calls"][0]["guidance_scale"] == 6.0
-        assert captured["diffuse_calls"][0]["guidance_interval"] is None
-
-    def test_forward_defaults_to_video_without_modalities(self, make_cosmos3_pipeline) -> None:
-        pipeline = make_cosmos3_pipeline()
-        captured = self._install_forward_stubs(pipeline)
-        req = SimpleNamespace(
-            prompts=["A warehouse robot"],
-            sampling_params=make_sampling_params(),
-        )
-
-        output = pipeline.forward(req)
+        def fake_set_scheduler_timesteps(steps):
+            captured.setdefault("scheduler_steps", []).append(steps)
+            pipeline.scheduler.timesteps = torch.tensor([7])
 
-        assert captured["format"]["is_t2i"] is False
-        assert "video" in output.output
+        pipeline._set_scheduler_timesteps = fake_set_scheduler_timesteps
+        pipeline.diffuse = fake_diffuse
+        pipeline._decode_latents = lambda latents: latents
+        return captured
 
-    def test_forward_flow_shifts_do_not_leak_between_t2v_and_t2i(
+    @pytest.mark.parametrize(
+        ("prompt", "sampling_params", "expected"),
+        [
+            (
+                {"prompt": "A painted robot", "modalities": ["image"]},
+                make_sampling_params(num_outputs_per_prompt=2),
+                {"key": "image", "is_t2i": True, "flow": [3.0], "steps": [50, 50], "frames": 1},
+            ),
+            (
+                "A warehouse robot",
+                make_sampling_params(),
+                {"key": "video", "is_t2i": False, "flow": [1.0], "steps": [35], "frames": 189},
+            ),
+        ],
+    )
+    def test_forward_defaults_and_mode_selection(
         self,
         make_cosmos3_pipeline,
+        prompt,
+        sampling_params,
+        expected,
     ) -> None:
         pipeline = make_cosmos3_pipeline()
         captured = self._install_forward_stubs(pipeline)
 
-        pipeline.forward(
-            SimpleNamespace(
-                prompts=[{"prompt": "A warehouse robot", "modalities": ["video"]}],
-                sampling_params=make_sampling_params(),
-            )
-        )
-        pipeline.forward(
-            SimpleNamespace(
-                prompts=[{"prompt": "A painted robot", "modalities": ["image"]}],
-                sampling_params=make_sampling_params(),
-            )
-        )
+        output = pipeline.forward(SimpleNamespace(prompts=[prompt], sampling_params=sampling_params))
 
-        assert captured["flow_shifts"] == [1.0, 3.0]
+        assert expected["key"] in output.output
+        assert captured["format"]["is_t2i"] is expected["is_t2i"]
+        assert captured["format"]["num_frames"] == expected["frames"]
+        assert captured["flow_shifts"] == expected["flow"]
+        assert captured["scheduler_steps"] == expected["steps"]
 
-    def test_forward_selects_i2v_latents_for_image_conditioning(self, make_cosmos3_pipeline) -> None:
+    def test_forward_i2v_sound_and_action_routes(self, make_cosmos3_pipeline) -> None:
         pipeline = make_cosmos3_pipeline()
         captured = self._install_forward_stubs(pipeline)
         image_tensor = torch.zeros(1, 3, 16, 16)
-        velocity_mask = torch.tensor([[[[[0.0]], [[1.0]]]]])
-        image_latent = torch.full((1, 2, 1, 1, 1), 5.0)
-
-        def fake_prepare_i2v(image, height, width, num_frames, generator):
-            captured["i2v_prepare"] = (image, height, width, num_frames, generator.initial_seed())
-            return torch.zeros(1, 2, 2, 1, 1), velocity_mask, image_latent
-
-        def fail_prepare(*args, **kwargs):
-            del args, kwargs
-            raise AssertionError("T2V latent preparation should not run for an I2V request")
-
-        pipeline._prepare_latents = fail_prepare  # type: ignore[method-assign]
-        pipeline._prepare_latents_i2v = fake_prepare_i2v  # type: ignore[method-assign]
-        req = SimpleNamespace(
-            prompts=[
-                {
-                    "prompt": "A robot starts moving.",
-                    "modalities": ["video"],
-                    "negative_prompt": "bad",
-                    "additional_information": {"preprocessed_image": image_tensor},
-                }
-            ],
-            sampling_params=make_sampling_params(height=16, width=16, num_frames=5),
-        )
-
-        pipeline.forward(req)
-
-        prepared_image, prepared_height, prepared_width, prepared_frames, _ = captured["i2v_prepare"]
-        assert prepared_image is image_tensor
-        assert prepared_height == 16
-        assert prepared_width == 16
-        assert prepared_frames == 5
-        diffuse_call = captured["diffuse_calls"][0]
-        assert diffuse_call["velocity_mask"] is velocity_mask
-        assert diffuse_call["image_latent"] is image_latent
-        assert diffuse_call["shared_kwargs"]["noisy_frame_mask"] is velocity_mask
+        velocity_mask = torch.ones(1, 1, 1, 1, 1)
 
-    def test_forward_policy_action_returns_custom_output(self, make_cosmos3_pipeline) -> None:
-        pipeline = make_cosmos3_pipeline()
-        pipeline.transformer = pipeline.transformer.__class__(latent_channel_size=2, action_gen=True, action_dim=4)
-        captured = self._install_forward_stubs(pipeline)
-        image_tensor = torch.zeros(1, 3, 16, 16)
-        req = SimpleNamespace(
-            prompts=[
-                {
-                    "prompt": "Pick the block.",
-                    "modalities": ["video"],
-                    "additional_information": {"preprocessed_image": image_tensor},
-                }
-            ],
-            sampling_params=make_sampling_params(
-                height=16,
-                width=16,
-                extra_args={
-                    "action_mode": "policy",
-                    "action_chunk_size": 2,
-                    "raw_action_dim": 2,
-                    "domain_name": "bridge_orig_lerobot",
-                },
-            ),
+        pipeline._prepare_latents_i2v = lambda *args, **kwargs: (
+            torch.zeros(1, 2, 1, 1, 1),
+            velocity_mask,
+            torch.zeros(1, 2, 1, 1, 1),
         )
-
-        output = pipeline.forward(req)
-
-        assert captured["format"]["negative_prompt"] == ""
-        diffuse_call = captured["diffuse_calls"][0]
-        assert diffuse_call["action_latents"].shape == (1, 2, 4)
-        assert diffuse_call["action_velocity_mask"].tolist() == [[[1.0], [1.0]]]
-        assert diffuse_call["shared_kwargs"]["action_domain_ids"].tolist() == [7]
-        assert diffuse_call["shared_kwargs"]["action_start_frame_offset"] == 1
-        assert output.custom_output["action"].shape == (1, 2, 2)
-        assert output.custom_output["raw_action_dim"] == 2
-        assert output.custom_output["action_mode"] == "policy"
-        assert output.custom_output["domain_id"] == 7
-
-    def test_forward_action_defaults_to_reference_chunk_size(self, make_cosmos3_pipeline) -> None:
-        pipeline = make_cosmos3_pipeline()
-        pipeline.transformer = pipeline.transformer.__class__(latent_channel_size=2, action_gen=True, action_dim=4)
-        captured = self._install_forward_stubs(pipeline)
-        req = SimpleNamespace(
-            prompts=[
-                {
-                    "prompt": "Pick the block.",
-                    "modalities": ["video"],
-                    "additional_information": {"preprocessed_image": torch.zeros(1, 3, 16, 16)},
-                }
-            ],
-            sampling_params=make_sampling_params(
-                height=16,
-                width=16,
-                extra_args={
-                    "action_mode": "policy",
-                    "raw_action_dim": 2,
-                    "domain_id": 0,
-                },
-            ),
-        )
-
-        pipeline.forward(req)
-
-        assert captured["format"]["num_frames"] == 17
-        assert captured["diffuse_calls"][0]["action_latents"].shape == (1, 16, 4)
-
-    def test_forward_forward_dynamics_uses_action_video_conditioning(self, make_cosmos3_pipeline) -> None:
-        pipeline = make_cosmos3_pipeline()
-        pipeline.transformer = pipeline.transformer.__class__(latent_channel_size=2, action_gen=True, action_dim=4)
-        captured = self._install_forward_stubs(pipeline)
-        video_tensor = torch.zeros(1, 3, 3, 16, 16)
-        condition_latents = torch.full((1, 2, 1, 2, 2), 6.0)
-
-        def fake_prepare_action_latents(**kwargs):
-            captured["prepare_action_latents"] = kwargs
-            action_latents = torch.zeros(1, 2, 4)
-            action_velocity_mask = torch.zeros(1, 2, 1)
-            clean_action = torch.zeros(1, 2, 4)
-            return action_latents, action_velocity_mask, clean_action, 2
-
-        def fake_prepare_action_video(video, mode, height, width, num_frames, generator):
-            captured["prepare_action_video"] = (video, mode, height, width, num_frames, generator.initial_seed())
-            latents = torch.zeros(1, 2, 1, 2, 2)
-            velocity_mask = torch.ones(1, 1, 1, 1, 1)
-            return latents, velocity_mask, condition_latents
-
-        def fail_prepare_i2v(*args, **kwargs):
-            raise AssertionError("forward_dynamics video input must not route through the i2v image path")
-
-        pipeline._prepare_action_latents = fake_prepare_action_latents  # type: ignore[method-assign]
-        pipeline._prepare_latents_action_video = fake_prepare_action_video  # type: ignore[method-assign]
-        pipeline._prepare_latents_i2v = fail_prepare_i2v  # type: ignore[method-assign]
-        req = SimpleNamespace(
-            prompts=[
-                {
-                    "prompt": "Move the robot.",
-                    "modalities": ["video"],
-                    "additional_information": {
-                        "preprocessed_image": torch.zeros(1, 3, 16, 16),
-                        "preprocessed_video": video_tensor,
-                    },
-                }
-            ],
-            sampling_params=make_sampling_params(
-                height=16,
-                width=16,
-                num_frames=3,
-                extra_args={
-                    "action_mode": "forward_dynamics",
-                    "action_chunk_size": 2,
-                    "domain_id": 0,
-                },
-            ),
+        pipeline.forward(
+            SimpleNamespace(
+                prompts=[
+                    {
+                        "prompt": "move",
+                        "modalities": ["video"],
+                        "additional_information": {"preprocessed_image": image_tensor},
+                    }
+                ],
+                sampling_params=make_sampling_params(height=16, width=16, num_frames=5),
+            )
         )
+        assert captured["diffuse_calls"][-1]["shared_kwargs"]["noisy_frame_mask"] is velocity_mask
 
-        pipeline.forward(req)
-
-        prepared_video, mode, height, width, num_frames, seed = captured["prepare_action_video"]
-        assert prepared_video is video_tensor
-        assert (mode, height, width, num_frames, seed) == ("forward_dynamics", 16, 16, 3, 123)
-        assert captured["prepare_action_latents"]["mode"] == "forward_dynamics"
-        diffuse_call = captured["diffuse_calls"][0]
-        assert diffuse_call["condition_latents"] is condition_latents
-        torch.testing.assert_close(diffuse_call["image_latent"], condition_latents[:, :, 0:1])
-
-    def test_forward_video_sound_decodes_and_returns_audio_payload(self, make_cosmos3_pipeline) -> None:
-        pipeline = make_cosmos3_pipeline()
         pipeline.transformer = pipeline.transformer.__class__(latent_channel_size=2, sound_gen=True, sound_dim=3)
-        captured = self._install_forward_stubs(pipeline)
         sound_latents = torch.zeros(1, 3, 4)
-        decoded_audio = torch.ones(1, 2, 20)
-
-        def fake_resolve_sound_target_samples(sp, num_frames, frame_rate):
-            del sp, num_frames, frame_rate
-            return 20, 2.0, 10
-
-        def fake_prepare_sound_latents(target_samples, generator):
-            del target_samples, generator
-            return sound_latents, 4
-
-        pipeline._resolve_sound_target_samples = fake_resolve_sound_target_samples  # type: ignore[method-assign]
-        pipeline._prepare_sound_latents = fake_prepare_sound_latents  # type: ignore[method-assign]
-        pipeline._decode_sound_latents = lambda latents, target_samples: decoded_audio  # type: ignore[method-assign]
-
-        req = SimpleNamespace(
-            prompts=[{"prompt": "A robot", "modalities": ["video"], "generate_sound": True}],
-            sampling_params=make_sampling_params(num_frames=9, frame_rate=3.0),
+        pipeline._resolve_sound_target_samples = lambda *args: (20, 2.0, 10)
+        pipeline._prepare_sound_latents = lambda *args: (sound_latents, 4)
+        pipeline._decode_sound_latents = lambda *args: torch.ones(1, 2, 20)
+        output = pipeline.forward(
+            SimpleNamespace(
+                prompts=[{"prompt": "A robot", "modalities": ["video"], "generate_sound": True}],
+                sampling_params=make_sampling_params(num_frames=9, frame_rate=3.0),
+            )
         )
-
-        output = pipeline.forward(req)
-
-        assert captured["diffuse_calls"][0]["sound_latents"] is sound_latents
-        assert output.output["audio"] is decoded_audio
+        assert captured["diffuse_calls"][-1]["sound_latents"] is sound_latents
         assert output.output["audio_sample_rate"] == 10
-        assert "video" in output.output
 
-    def test_forward_decode_info_logs_only_on_rank_zero(
-        self,
-        make_cosmos3_pipeline,
-        monkeypatch: pytest.MonkeyPatch,
-        caplog,
-    ) -> None:
-        from vllm_omni.diffusion.models.cosmos3 import pipeline_cosmos3 as cosmos3_pipeline
-
-        monkeypatch.setattr(cosmos3_pipeline, "_is_rank_zero", lambda: True)
-        pipeline = make_cosmos3_pipeline()
-        pipeline.transformer = pipeline.transformer.__class__(latent_channel_size=2, sound_gen=True, sound_dim=3)
-        self._install_forward_stubs(pipeline)
-        self._install_sound_stubs(pipeline)
-        req = SimpleNamespace(
-            prompts=[{"prompt": "A robot", "modalities": ["video"], "generate_sound": True}],
-            sampling_params=make_sampling_params(num_frames=9, frame_rate=3.0),
+        pipeline.transformer = pipeline.transformer.__class__(latent_channel_size=2, action_gen=True, action_dim=4)
+        output = pipeline.forward(
+            SimpleNamespace(
+                prompts=[
+                    {
+                        "prompt": "Pick the block.",
+                        "modalities": ["video"],
+                        "additional_information": {"preprocessed_image": image_tensor},
+                    }
+                ],
+                sampling_params=make_sampling_params(
+                    height=16,
+                    width=16,
+                    extra_args={
+                        "action_mode": "policy",
+                        "action_chunk_size": 2,
+                        "raw_action_dim": 2,
+                        "domain_name": "bridge_orig_lerobot",
+                    },
+                ),
+            )
         )
+        assert captured["diffuse_calls"][-1]["shared_kwargs"]["action_domain_ids"].tolist() == [7]
+        assert output.custom_output["action"].shape == (1, 2, 2)
 
-        target_logger = logging.getLogger(cosmos3_pipeline.logger.name)
-        target_logger.addHandler(caplog.handler)
-        prev_level = target_logger.level
-        target_logger.setLevel(logging.INFO)
-        try:
-            pipeline.forward(req)
-        finally:
-            target_logger.removeHandler(caplog.handler)
-            target_logger.setLevel(prev_level)
-
-        messages = [record.getMessage() for record in caplog.records if record.name == cosmos3_pipeline.logger.name]
-        assert "Decoding video..." in messages
-        assert any(message.startswith("Video decoded in ") for message in messages)
-        assert any(message.startswith("Total pipeline time: ") for message in messages)
-        assert "Decoding sound..." in messages
-
-    def test_forward_decode_info_logs_suppressed_on_nonzero_rank(
+    @pytest.mark.parametrize(
+        ("prompt", "sampling_params", "message"),
+        [
+            (["one", "two"], make_sampling_params(), "single prompt"),
+            ([{"prompt": "one", "modalities": ["image", "video"]}], make_sampling_params(), "both image and video"),
+            (
+                [{"prompt": "x", "modalities": ["image"], "generate_sound": True}],
+                make_sampling_params(),
+                "only for video",
+            ),
+        ],
+    )
+    def test_forward_rejects_invalid_public_requests(
         self,
         make_cosmos3_pipeline,
-        monkeypatch: pytest.MonkeyPatch,
-        caplog,
+        prompt,
+        sampling_params,
+        message,
     ) -> None:
-        from vllm_omni.diffusion.models.cosmos3 import pipeline_cosmos3 as cosmos3_pipeline
-
-        monkeypatch.setattr(cosmos3_pipeline, "_is_rank_zero", lambda: False)
         pipeline = make_cosmos3_pipeline()
         pipeline.transformer = pipeline.transformer.__class__(latent_channel_size=2, sound_gen=True, sound_dim=3)
-        self._install_forward_stubs(pipeline)
-        _, decoded_audio = self._install_sound_stubs(pipeline)
-        req = SimpleNamespace(
-            prompts=[{"prompt": "A robot", "modalities": ["video"], "generate_sound": True}],
-            sampling_params=make_sampling_params(num_frames=9, frame_rate=3.0),
-        )
-
-        target_logger = logging.getLogger(cosmos3_pipeline.logger.name)
-        target_logger.addHandler(caplog.handler)
-        prev_level = target_logger.level
-        target_logger.setLevel(logging.INFO)
-        try:
-            output = pipeline.forward(req)
-        finally:
-            target_logger.removeHandler(caplog.handler)
-            target_logger.setLevel(prev_level)
-
-        messages = [record.getMessage() for record in caplog.records if record.name == cosmos3_pipeline.logger.name]
-        assert output.output["audio"] is decoded_audio
-        assert not any(
-            message == "Decoding video..."
-            or message.startswith("Video decoded in ")
-            or message.startswith("Total pipeline time: ")
-            or message == "Decoding sound..."
-            for message in messages
-        )
-
-    def test_forward_rejects_multiple_prompts(self, make_cosmos3_pipeline) -> None:
-        pipeline = make_cosmos3_pipeline()
-        req = SimpleNamespace(
-            prompts=["one", "two"],
-            sampling_params=make_sampling_params(),
-        )
-
-        with pytest.raises(ValueError, match="currently supports a single prompt"):
-            pipeline.forward(req)
-
-    def test_forward_rejects_conflicting_modalities(self, make_cosmos3_pipeline) -> None:
-        pipeline = make_cosmos3_pipeline()
-        req = SimpleNamespace(
-            prompts=[{"prompt": "one", "modalities": ["image", "video"]}],
-            sampling_params=make_sampling_params(),
-        )
-
-        with pytest.raises(ValueError, match="cannot request both image and video"):
-            pipeline.forward(req)
-
-    def test_forward_rejects_sound_for_text_to_image(self, make_cosmos3_pipeline) -> None:
-        pipeline = make_cosmos3_pipeline()
-        pipeline.transformer = pipeline.transformer.__class__(latent_channel_size=2, sound_gen=True, sound_dim=3)
-        req = SimpleNamespace(
-            prompts=[{"prompt": "A robot", "modalities": ["image"], "generate_sound": True}],
-            sampling_params=make_sampling_params(),
-        )
-
-        with pytest.raises(ValueError, match="only for video outputs"):
-            pipeline.forward(req)
-
-    def test_forward_rejects_action_without_action_modules(self, make_cosmos3_pipeline) -> None:
-        pipeline = make_cosmos3_pipeline()
-        req = SimpleNamespace(
-            prompts=[{"prompt": "A robot", "modalities": ["video"]}],
-            sampling_params=make_sampling_params(extra_args={"action_mode": "policy", "raw_action_dim": 2}),
-        )
-
-        with pytest.raises(ValueError, match="without action modules"):
-            pipeline.forward(req)
-
-    def test_forward_rejects_action_without_explicit_domain(self, make_cosmos3_pipeline) -> None:
-        pipeline = make_cosmos3_pipeline()
-        pipeline.transformer = pipeline.transformer.__class__(latent_channel_size=2, action_gen=True, action_dim=4)
-        req = SimpleNamespace(
-            prompts=[
-                {
-                    "prompt": "A robot",
-                    "modalities": ["video"],
-                    "additional_information": {"preprocessed_image": torch.zeros(1, 3, 16, 16)},
-                }
-            ],
-            sampling_params=make_sampling_params(
-                height=16,
-                width=16,
-                extra_args={"action_mode": "policy", "raw_action_dim": 2},
-            ),
-        )
-
-        with pytest.raises(ValueError, match=r"domain_id.*domain_name"):
-            pipeline.forward(req)
-
-    def test_forward_rejects_action_with_sound(self, make_cosmos3_pipeline) -> None:
-        pipeline = make_cosmos3_pipeline()
-        pipeline.transformer = pipeline.transformer.__class__(
-            latent_channel_size=2,
-            action_gen=True,
-            action_dim=4,
-            sound_gen=True,
-            sound_dim=3,
-        )
-        req = SimpleNamespace(
-            prompts=[{"prompt": "A robot", "modalities": ["video"], "generate_sound": True}],
-            sampling_params=make_sampling_params(extra_args={"action_mode": "policy", "raw_action_dim": 2}),
-        )
 
-        with pytest.raises(ValueError, match=r"action\+sound"):
-            pipeline.forward(req)
+        with pytest.raises(ValueError, match=message):
+            pipeline.forward(SimpleNamespace(prompts=prompt, sampling_params=sampling_params))
diff --git a/tests/diffusion/models/cosmos3/test_cosmos3_sound_tokenizer.py b/tests/diffusion/models/cosmos3/test_cosmos3_sound_tokenizer.py
index 0e8e5034d85..47664c59e77 100644
--- a/tests/diffusion/models/cosmos3/test_cosmos3_sound_tokenizer.py
+++ b/tests/diffusion/models/cosmos3/test_cosmos3_sound_tokenizer.py
@@ -4,6 +4,7 @@
 from __future__ import annotations
 
 import json
+from pathlib import Path
 from types import SimpleNamespace
 
 import pytest
@@ -32,21 +33,17 @@ def decode(self, latents: torch.Tensor) -> torch.Tensor:
         return torch.zeros(latents.shape[0], self.audio_channels, 8)
 
 
-def test_from_config_loads_default_sound_tokenizer_component(
-    tmp_path,
-    monkeypatch: pytest.MonkeyPatch,
-) -> None:
-    from vllm_omni.diffusion.models.cosmos3 import sound_tokenizer
-
-    model_dir = tmp_path / "model"
-    tokenizer_dir = model_dir / "sound_tokenizer"
+def _write_component(root: Path, config: dict | None = None, checkpoint_name: str | None = None) -> Path:
+    tokenizer_dir = root / "sound_tokenizer"
     tokenizer_dir.mkdir(parents=True)
-    checkpoint_path = tokenizer_dir / DIFFUSERS_SOUND_TOKENIZER_CHECKPOINT_NAME
-    config_path = tokenizer_dir / "config.json"
-    checkpoint_path.write_bytes(b"stub")
-    config_path.write_text("{}", encoding="utf-8")
+    if checkpoint_name:
+        (tokenizer_dir / checkpoint_name).write_bytes(b"stub")
+    (tokenizer_dir / "config.json").write_text(json.dumps(config or {}), encoding="utf-8")
+    return tokenizer_dir
 
-    created = {}
+
+def _patch_fake_avae(monkeypatch: pytest.MonkeyPatch, created: dict) -> None:
+    from vllm_omni.diffusion.models.cosmos3 import sound_tokenizer
 
     class FakeAVAE(_FakeAVAEAudioTokenizer):
         def __init__(self, **kwargs) -> None:
@@ -56,80 +53,55 @@ def __init__(self, **kwargs) -> None:
     monkeypatch.setattr(sound_tokenizer, "Cosmos3AVAEAudioTokenizer", FakeAVAE)
     monkeypatch.setattr(sound_tokenizer, "get_local_device", lambda: torch.device("cpu"))
 
+
+def test_from_config_loads_local_diffusers_component(tmp_path, monkeypatch: pytest.MonkeyPatch) -> None:
+    from vllm_omni.diffusion.models.cosmos3 import sound_tokenizer
+
+    model_dir = tmp_path / "model"
+    tokenizer_dir = _write_component(model_dir, checkpoint_name=DIFFUSERS_SOUND_TOKENIZER_CHECKPOINT_NAME)
+    created = {}
+    _patch_fake_avae(monkeypatch, created)
+
     tokenizer = sound_tokenizer.Cosmos3SoundTokenizer.from_config(
         SimpleNamespace(
             model=str(model_dir),
-            custom_pipeline_args={
-                "sound_sample_rate": 32000,
-                "sound_hop_size": 800,
-                "sound_dim": 3,
-            },
+            custom_pipeline_args={"sound_sample_rate": 32000, "sound_hop_size": 800, "sound_dim": 3},
             dtype=torch.float32,
         )
     )
 
-    assert created["checkpoint_path"] == str(checkpoint_path)
-    assert created["config_path"] == str(config_path)
-    assert tokenizer.sample_rate == 32000
-    assert tokenizer.latent_ch == 3
-    assert tokenizer.hop_size == 800
+    assert created["checkpoint_path"] == str(tokenizer_dir / DIFFUSERS_SOUND_TOKENIZER_CHECKPOINT_NAME)
+    assert created["config_path"] == str(tokenizer_dir / "config.json")
+    assert (tokenizer.sample_rate, tokenizer.latent_ch, tokenizer.hop_size) == (32000, 3, 800)
 
 
-def test_from_config_downloads_default_sound_tokenizer_from_hf_repo(
-    tmp_path,
-    monkeypatch: pytest.MonkeyPatch,
-) -> None:
+def test_from_config_downloads_component_from_hf_repo(tmp_path, monkeypatch: pytest.MonkeyPatch) -> None:
     import huggingface_hub
 
     from vllm_omni.diffusion.models.cosmos3 import sound_tokenizer
 
     cache_dir = tmp_path / "hf"
-    tokenizer_dir = cache_dir / "sound_tokenizer"
-    tokenizer_dir.mkdir(parents=True)
-    checkpoint_path = tokenizer_dir / DIFFUSERS_SOUND_TOKENIZER_CHECKPOINT_NAME
-    config_path = tokenizer_dir / "config.json"
-    checkpoint_path.write_bytes(b"stub")
-    config_path.write_text("{}", encoding="utf-8")
-
+    _write_component(cache_dir, checkpoint_name=DIFFUSERS_SOUND_TOKENIZER_CHECKPOINT_NAME)
     calls = []
+    created = {}
+    _patch_fake_avae(monkeypatch, created)
 
-    def fake_snapshot_download(
-        repo_id: str,
-        *,
-        revision: str | None,
-        allow_patterns: list[str],
-    ) -> str:
+    def fake_snapshot_download(repo_id: str, *, revision: str | None, allow_patterns: list[str]) -> str:
         calls.append((repo_id, revision, allow_patterns))
         return str(cache_dir)
 
-    created = {}
-
-    class FakeAVAE(_FakeAVAEAudioTokenizer):
-        def __init__(self, **kwargs) -> None:
-            created.update(kwargs)
-            super().__init__(**kwargs)
-
     monkeypatch.setattr(huggingface_hub, "snapshot_download", fake_snapshot_download)
-    monkeypatch.setattr(sound_tokenizer, "Cosmos3AVAEAudioTokenizer", FakeAVAE)
-    monkeypatch.setattr(sound_tokenizer, "get_local_device", lambda: torch.device("cpu"))
 
-    tokenizer = sound_tokenizer.Cosmos3SoundTokenizer.from_config(
+    sound_tokenizer.Cosmos3SoundTokenizer.from_config(
         SimpleNamespace(
             model="nvidia/cosmos3",
             revision="test-rev",
-            custom_pipeline_args={
-                "sound_sample_rate": 32000,
-                "sound_hop_size": 800,
-                "sound_dim": 3,
-            },
+            custom_pipeline_args={"sound_sample_rate": 32000, "sound_hop_size": 800, "sound_dim": 3},
             dtype=torch.float32,
         )
     )
 
-    assert created["checkpoint_path"] == str(checkpoint_path)
-    assert created["config_path"] == str(config_path)
-    assert tokenizer.sample_rate == 32000
-    assert tokenizer.latent_ch == 3
+    assert created["checkpoint_path"].endswith(DIFFUSERS_SOUND_TOKENIZER_CHECKPOINT_NAME)
     assert calls == [
         (
             "nvidia/cosmos3",
@@ -139,125 +111,40 @@ def __init__(self, **kwargs) -> None:
     ]
 
 
-def test_from_config_uses_diffusers_sound_tokenizer_checkpoint_name(
-    tmp_path,
-    monkeypatch: pytest.MonkeyPatch,
-) -> None:
+@pytest.mark.parametrize(
+    ("checkpoint_name", "message"),
+    [
+        (None, "no AVAE sound tokenizer checkpoint"),
+        ("model.safetensors", DIFFUSERS_SOUND_TOKENIZER_CHECKPOINT_NAME),
+    ],
+)
+def test_default_component_requires_diffusers_checkpoint_name(tmp_path, checkpoint_name, message) -> None:
     from vllm_omni.diffusion.models.cosmos3 import sound_tokenizer
 
     model_dir = tmp_path / "model"
-    tokenizer_dir = model_dir / "sound_tokenizer"
-    tokenizer_dir.mkdir(parents=True)
-    checkpoint_path = tokenizer_dir / DIFFUSERS_SOUND_TOKENIZER_CHECKPOINT_NAME
-    checkpoint_path.write_bytes(b"stub")
-    (tokenizer_dir / "config.json").write_text("{}", encoding="utf-8")
-
-    created = {}
-
-    class FakeAVAE(_FakeAVAEAudioTokenizer):
-        def __init__(self, **kwargs) -> None:
-            created.update(kwargs)
-            super().__init__(**kwargs)
+    _write_component(model_dir, checkpoint_name=checkpoint_name)
 
-    monkeypatch.setattr(sound_tokenizer, "Cosmos3AVAEAudioTokenizer", FakeAVAE)
-    monkeypatch.setattr(sound_tokenizer, "get_local_device", lambda: torch.device("cpu"))
-
-    sound_tokenizer.Cosmos3SoundTokenizer.from_config(
-        SimpleNamespace(model=str(model_dir), custom_pipeline_args={}, dtype=torch.float32)
-    )
-
-    assert created["checkpoint_path"] == str(checkpoint_path)
-
-
-def test_default_component_requires_sound_tokenizer_checkpoint(tmp_path) -> None:
-    from vllm_omni.diffusion.models.cosmos3 import sound_tokenizer
-
-    model_dir = tmp_path / "model"
-    (model_dir / "sound_tokenizer").mkdir(parents=True)
-
-    with pytest.raises(ValueError, match="no AVAE sound tokenizer checkpoint"):
+    with pytest.raises(ValueError, match=message):
         sound_tokenizer.Cosmos3SoundTokenizer.from_config(
             SimpleNamespace(model=str(model_dir), custom_pipeline_args={}, dtype=torch.float32)
         )
 
 
-def test_default_component_rejects_legacy_sound_tokenizer_checkpoint_name(tmp_path) -> None:
+def test_component_config_precedence_and_conflict_detection(tmp_path, monkeypatch: pytest.MonkeyPatch) -> None:
     from vllm_omni.diffusion.models.cosmos3 import sound_tokenizer
 
+    component_config = {
+        "sampling_rate": 48000,
+        "dec_out_channels": 2,
+        "vocoder_input_dim": 64,
+        "hop_size": 1920,
+    }
     model_dir = tmp_path / "model"
-    tokenizer_dir = model_dir / "sound_tokenizer"
-    tokenizer_dir.mkdir(parents=True)
-    (tokenizer_dir / "model.safetensors").write_bytes(b"stub")
-
-    with pytest.raises(ValueError, match=DIFFUSERS_SOUND_TOKENIZER_CHECKPOINT_NAME):
-        sound_tokenizer.Cosmos3SoundTokenizer.from_config(
-            SimpleNamespace(model=str(model_dir), custom_pipeline_args={}, dtype=torch.float32)
-        )
-
-
-def test_from_config_uses_nested_normalization_config(
-    tmp_path,
-    monkeypatch: pytest.MonkeyPatch,
-) -> None:
-    from vllm_omni.diffusion.models.cosmos3 import sound_tokenizer
-
-    model_dir = tmp_path / "model"
-    tokenizer_dir = model_dir / "sound_tokenizer"
-    tokenizer_dir.mkdir(parents=True)
-    (tokenizer_dir / DIFFUSERS_SOUND_TOKENIZER_CHECKPOINT_NAME).write_bytes(b"stub")
-    (tokenizer_dir / "config.json").write_text("{}", encoding="utf-8")
-
-    created = {}
-
-    class FakeAVAE(_FakeAVAEAudioTokenizer):
-        def __init__(self, **kwargs) -> None:
-            created.update(kwargs)
-            super().__init__(**kwargs)
-
-    monkeypatch.setattr(sound_tokenizer, "Cosmos3AVAEAudioTokenizer", FakeAVAE)
-    monkeypatch.setattr(sound_tokenizer, "get_local_device", lambda: torch.device("cpu"))
-
-    sound_tokenizer.Cosmos3SoundTokenizer.from_config(
-        SimpleNamespace(
-            model=str(model_dir),
-            custom_pipeline_args={},
-            model_config={
-                "sound_tokenizer": {
-                    "normalize_latents": False,
-                    "normalization_type": "none",
-                }
-            },
-            dtype=torch.float32,
-        )
-    )
-
-    assert created["normalize_latents"] is False
-    assert created["normalization_type"] == "none"
-
-
-def test_from_config_custom_normalization_overrides_nested_config(
-    tmp_path,
-    monkeypatch: pytest.MonkeyPatch,
-) -> None:
-    from vllm_omni.diffusion.models.cosmos3 import sound_tokenizer
-
-    model_dir = tmp_path / "model"
-    tokenizer_dir = model_dir / "sound_tokenizer"
-    tokenizer_dir.mkdir(parents=True)
-    (tokenizer_dir / DIFFUSERS_SOUND_TOKENIZER_CHECKPOINT_NAME).write_bytes(b"stub")
-    (tokenizer_dir / "config.json").write_text("{}", encoding="utf-8")
-
+    _write_component(model_dir, component_config, DIFFUSERS_SOUND_TOKENIZER_CHECKPOINT_NAME)
     created = {}
+    _patch_fake_avae(monkeypatch, created)
 
-    class FakeAVAE(_FakeAVAEAudioTokenizer):
-        def __init__(self, **kwargs) -> None:
-            created.update(kwargs)
-            super().__init__(**kwargs)
-
-    monkeypatch.setattr(sound_tokenizer, "Cosmos3AVAEAudioTokenizer", FakeAVAE)
-    monkeypatch.setattr(sound_tokenizer, "get_local_device", lambda: torch.device("cpu"))
-
-    sound_tokenizer.Cosmos3SoundTokenizer.from_config(
+    tokenizer = sound_tokenizer.Cosmos3SoundTokenizer.from_config(
         SimpleNamespace(
             model=str(model_dir),
             custom_pipeline_args={
@@ -265,92 +152,32 @@ def __init__(self, **kwargs) -> None:
                 "sound_normalization_type": "tanh",
                 "sound_tanh_input_scale": 2.0,
             },
-            model_config={
-                "sound_tokenizer": {
-                    "normalize_latents": False,
-                    "normalization_type": "none",
-                    "tanh_input_scale": 1.0,
-                }
-            },
-            dtype=torch.float32,
-        )
-    )
-
-    assert created["normalize_latents"] is True
-    assert created["normalization_type"] == "tanh"
-    assert created["tanh_input_scale"] == 2.0
-
-
-def test_from_config_uses_component_config_architecture_values(
-    tmp_path,
-    monkeypatch: pytest.MonkeyPatch,
-) -> None:
-    from vllm_omni.diffusion.models.cosmos3 import sound_tokenizer
-
-    model_dir = tmp_path / "model"
-    tokenizer_dir = model_dir / "sound_tokenizer"
-    tokenizer_dir.mkdir(parents=True)
-    (tokenizer_dir / DIFFUSERS_SOUND_TOKENIZER_CHECKPOINT_NAME).write_bytes(b"stub")
-    (tokenizer_dir / "config.json").write_text(
-        ('{"sampling_rate": 48000, "dec_out_channels": 2, "vocoder_input_dim": 64, "hop_size": 1920}'),
-        encoding="utf-8",
-    )
-
-    created = {}
-
-    class FakeAVAE(_FakeAVAEAudioTokenizer):
-        def __init__(self, **kwargs) -> None:
-            created.update(kwargs)
-            super().__init__(**kwargs)
-
-    monkeypatch.setattr(sound_tokenizer, "Cosmos3AVAEAudioTokenizer", FakeAVAE)
-    monkeypatch.setattr(sound_tokenizer, "get_local_device", lambda: torch.device("cpu"))
-
-    tokenizer = sound_tokenizer.Cosmos3SoundTokenizer.from_config(
-        SimpleNamespace(
-            model=str(model_dir),
-            custom_pipeline_args={},
             model_config={
                 "sound_tokenizer": {
                     "sample_rate": 32000,
                     "audio_channels": 1,
                     "io_channels": 3,
                     "hop_size": 800,
+                    "normalize_latents": False,
+                    "normalization_type": "none",
                 }
             },
             dtype=torch.float32,
         )
     )
 
-    assert created["sample_rate"] == 48000
-    assert created["audio_channels"] == 2
-    assert created["io_channels"] == 64
-    assert created["hop_size"] == 1920
-    assert tokenizer.sample_rate == 48000
-    assert tokenizer.latent_ch == 64
-    assert tokenizer.hop_size == 1920
-
-
-def test_from_config_rejects_custom_architecture_conflict_with_component_config(
-    tmp_path,
-    monkeypatch: pytest.MonkeyPatch,
-) -> None:
-    from vllm_omni.diffusion.models.cosmos3 import sound_tokenizer
-
-    model_dir = tmp_path / "model"
-    tokenizer_dir = model_dir / "sound_tokenizer"
-    tokenizer_dir.mkdir(parents=True)
-    (tokenizer_dir / DIFFUSERS_SOUND_TOKENIZER_CHECKPOINT_NAME).write_bytes(b"stub")
-    (tokenizer_dir / "config.json").write_text(
-        ('{"sampling_rate": 48000, "dec_out_channels": 2, "vocoder_input_dim": 64, "hop_size": 1920}'),
-        encoding="utf-8",
+    assert (created["sample_rate"], created["audio_channels"], created["io_channels"], created["hop_size"]) == (
+        48000,
+        2,
+        64,
+        1920,
     )
-
-    class FakeAVAE(_FakeAVAEAudioTokenizer):
-        pass
-
-    monkeypatch.setattr(sound_tokenizer, "Cosmos3AVAEAudioTokenizer", FakeAVAE)
-    monkeypatch.setattr(sound_tokenizer, "get_local_device", lambda: torch.device("cpu"))
+    assert (created["normalize_latents"], created["normalization_type"], created["tanh_input_scale"]) == (
+        True,
+        "tanh",
+        2.0,
+    )
+    assert (tokenizer.sample_rate, tokenizer.latent_ch, tokenizer.hop_size) == (48000, 64, 1920)
 
     with pytest.raises(ValueError, match=r"sample_rate.*48000.*32000"):
         sound_tokenizer.Cosmos3SoundTokenizer.from_config(
@@ -381,13 +208,7 @@ def test_avae_uses_diffusers_decoder_state_dict_layout(tmp_path) -> None:
     config_path = tmp_path / "config.json"
     config_path.write_text(json.dumps(config), encoding="utf-8")
 
-    decoder = avae.OobleckDecoder(
-        channels=4,
-        input_channels=2,
-        audio_channels=1,
-        upsampling_ratios=[2],
-        channel_multiples=[1],
-    )
+    decoder = avae.OobleckDecoder(4, 2, 1, [2], [1])
     save_file({f"decoder.{key}": value for key, value in decoder.state_dict().items()}, str(checkpoint_path))
 
     tokenizer = avae.Cosmos3AVAEAudioTokenizer(
@@ -398,14 +219,8 @@ def test_avae_uses_diffusers_decoder_state_dict_layout(tmp_path) -> None:
     )
 
     keys = set(tokenizer.state_dict())
-    assert "decoder.conv1.weight_g" in keys
-    assert "decoder.block.0.snake1.alpha" in keys
-    assert "decoder.block.0.conv_t1.weight_g" in keys
-    assert "decoder.block.0.res_unit1.conv1.weight_g" in keys
-    assert "decoder.snake1.alpha" in keys
-    assert "decoder.conv2.weight_g" in keys
-    assert not any(key.startswith("decoder.layers.") for key in keys)
-    assert not any(key.startswith("model.decoder.") for key in keys)
+    assert {"decoder.conv1.weight_g", "decoder.block.0.conv_t1.weight_g", "decoder.conv2.weight_g"} <= keys
+    assert not any(key.startswith(("decoder.layers.", "model.decoder.")) for key in keys)
     assert tokenizer.decode(torch.zeros(1, 2, 3)).shape == (1, 1, 6)
     with pytest.raises(NotImplementedError, match="decoder-only"):
         tokenizer.encode(torch.zeros(1, 1, 6))
diff --git a/tests/diffusion/models/cosmos3/test_cosmos3_transformer.py b/tests/diffusion/models/cosmos3/test_cosmos3_transformer.py
index c25a5229290..efe73e2d41d 100644
--- a/tests/diffusion/models/cosmos3/test_cosmos3_transformer.py
+++ b/tests/diffusion/models/cosmos3/test_cosmos3_transformer.py
@@ -12,107 +12,59 @@
 pytestmark = [pytest.mark.core_model, pytest.mark.cpu, pytest.mark.diffusion]
 
 
-def test_compute_mrope_position_ids_text_offsets_all_axes() -> None:
-    from vllm_omni.diffusion.models.cosmos3.transformer_cosmos3 import (
-        compute_mrope_position_ids_text,
-    )
-
-    ids, next_offset = compute_mrope_position_ids_text(num_tokens=3, temporal_offset=5)
-
-    assert ids.tolist() == [[5, 6, 7], [5, 6, 7], [5, 6, 7]]
-    assert next_offset == 8
+def _tiny_cosmos3_config(**overrides):
+    config = {
+        "hidden_size": 8,
+        "num_hidden_layers": 0,
+        "num_attention_heads": 2,
+        "num_key_value_heads": 2,
+        "head_dim": 4,
+        "intermediate_size": 16,
+        "vocab_size": 32,
+        "latent_patch_size": 1,
+        "latent_channel": 2,
+        "rope_scaling": {"mrope_section": [1, 1, 0]},
+    }
+    config.update(overrides)
+    return config
 
 
-def test_compute_mrope_position_ids_vision_without_fps_modulation() -> None:
+def test_mrope_position_ids_cover_text_video_sound_and_action() -> None:
     from vllm_omni.diffusion.models.cosmos3.transformer_cosmos3 import (
+        compute_mrope_position_ids_action,
+        compute_mrope_position_ids_sound,
+        compute_mrope_position_ids_text,
         compute_mrope_position_ids_vision,
     )
 
-    ids, next_offset = compute_mrope_position_ids_vision(
-        grid_t=2,
-        grid_h=2,
-        grid_w=3,
-        temporal_offset=10,
-        fps=None,
-    )
-
-    assert ids.shape == (3, 12)
-    assert ids[0].tolist() == [10] * 6 + [11] * 6
-    assert ids[1].tolist() == [0, 0, 0, 1, 1, 1] * 2
-    assert ids[2].tolist() == [0, 1, 2, 0, 1, 2] * 2
-    assert next_offset == 12
+    text_ids, text_offset = compute_mrope_position_ids_text(num_tokens=3, temporal_offset=5)
+    assert text_ids.tolist() == [[5, 6, 7], [5, 6, 7], [5, 6, 7]]
+    assert text_offset == 8
 
+    vision_ids, vision_offset = compute_mrope_position_ids_vision(2, 2, 3, temporal_offset=10, fps=None)
+    assert vision_ids.shape == (3, 12)
+    assert vision_ids[0].tolist() == [10] * 6 + [11] * 6
+    assert vision_offset == 12
 
-def test_compute_mrope_position_ids_vision_with_fps_modulation() -> None:
-    from vllm_omni.diffusion.models.cosmos3.transformer_cosmos3 import (
-        compute_mrope_position_ids_vision,
-    )
-
-    ids, next_offset = compute_mrope_position_ids_vision(
-        grid_t=2,
-        grid_h=1,
-        grid_w=1,
+    modulated_ids, modulated_offset = compute_mrope_position_ids_vision(
+        2,
+        1,
+        1,
         temporal_offset=10,
         fps=12.0,
         base_fps=24.0,
         temporal_compression_factor=4,
     )
+    torch.testing.assert_close(modulated_ids[0], torch.tensor([10.0, 12.0]))
+    assert modulated_offset == 13
 
-    torch.testing.assert_close(ids[0], torch.tensor([10.0, 12.0]))
-    assert ids.dtype == torch.float32
-    assert next_offset == 13
-
-
-def test_compute_mrope_position_ids_sound_uses_sound_latent_fps() -> None:
-    from vllm_omni.diffusion.models.cosmos3.transformer_cosmos3 import (
-        compute_mrope_position_ids_sound,
-    )
-
-    ids, next_offset = compute_mrope_position_ids_sound(
-        grid_t=3,
-        temporal_offset=10,
-        sound_latent_fps=25.0,
-        base_fps=24.0,
-    )
-
-    torch.testing.assert_close(ids[0], torch.tensor([10.0, 10.96, 11.92]))
-    assert ids[1].tolist() == [0.0, 0.0, 0.0]
-    assert ids[2].tolist() == [0.0, 0.0, 0.0]
-    assert next_offset == 12
-
-
-def test_compute_mrope_position_ids_action_uses_start_frame_offset() -> None:
-    from vllm_omni.diffusion.models.cosmos3.transformer_cosmos3 import (
-        compute_mrope_position_ids_action,
-    )
-
-    ids, next_offset = compute_mrope_position_ids_action(
-        grid_t=3,
-        temporal_offset=10,
-        action_fps=None,
-        start_frame_offset=1,
-    )
-
-    assert ids.tolist() == [[11, 12, 13], [0, 0, 0], [0, 0, 0]]
-    assert next_offset == 14
-
-
-def test_compute_mrope_position_ids_action_keeps_video_base_temporal_compression() -> None:
-    from vllm_omni.diffusion.models.cosmos3.transformer_cosmos3 import (
-        compute_mrope_position_ids_action,
-    )
-
-    ids, next_offset = compute_mrope_position_ids_action(
-        grid_t=3,
-        temporal_offset=10,
-        action_fps=24.0,
-        base_fps=24.0,
-        base_temporal_compression_factor=4,
-        start_frame_offset=0,
-    )
+    sound_ids, sound_offset = compute_mrope_position_ids_sound(3, temporal_offset=10, sound_latent_fps=25.0)
+    torch.testing.assert_close(sound_ids[0], torch.tensor([10.0, 10.96, 11.92]))
+    assert sound_offset == 12
 
-    torch.testing.assert_close(ids[0], torch.tensor([10.0, 10.25, 10.5]))
-    assert next_offset == 11
+    action_ids, action_offset = compute_mrope_position_ids_action(3, temporal_offset=10, action_fps=None)
+    assert action_ids.tolist() == [[11, 12, 13], [0, 0, 0], [0, 0, 0]]
+    assert action_offset == 14
 
 
 @pytest.mark.parametrize(
@@ -130,16 +82,11 @@ def test_validate_supported_config_rejects_unsupported_flags(key: str, value) ->
 
     with pytest.raises(ValueError, match=f"{key}="):
         Cosmos3VFMTransformer._validate_supported_config({key: value})
-
-
-def test_validate_supported_config_accepts_defaults() -> None:
-    from vllm_omni.diffusion.models.cosmos3.transformer_cosmos3 import Cosmos3VFMTransformer
-
     Cosmos3VFMTransformer._validate_supported_config({})
     Cosmos3VFMTransformer._validate_supported_config(None)
 
 
-def test_cosmos3_hsdp_conditions_match_und_and_gen_blocks() -> None:
+def test_transformer_sharding_offload_and_patch_round_trip_contracts() -> None:
     from vllm_omni.diffusion.models.cosmos3.transformer_cosmos3 import Cosmos3VFMTransformer
 
     model = object.__new__(Cosmos3VFMTransformer)
@@ -149,110 +96,34 @@ def test_cosmos3_hsdp_conditions_match_und_and_gen_blocks() -> None:
     model.gen_layers = nn.ModuleList([nn.Linear(2, 2)])
     model.norm_moe_gen = nn.LayerNorm(2)
 
-    conditions = model._hsdp_shard_conditions
     matched = [
-        name for name, module in model.named_modules() if any(condition(name, module) for condition in conditions)
-    ]
-
-    assert matched == [
-        "language_model.layers.0",
-        "language_model.layers.1",
-        "gen_layers.0",
+        name
+        for name, module in model.named_modules()
+        if any(condition(name, module) for condition in model._hsdp_shard_conditions)
     ]
-
-
-def test_cosmos3_transformer_exposes_layerwise_offload_and_repeated_blocks() -> None:
-    from vllm_omni.diffusion.models.cosmos3.transformer_cosmos3 import Cosmos3VFMTransformer
-
+    assert matched == ["language_model.layers.0", "language_model.layers.1", "gen_layers.0"]
     assert Cosmos3VFMTransformer._layerwise_offload_blocks_attrs == ["gen_layers"]
     assert Cosmos3VFMTransformer._repeated_blocks == ["Cosmos3GenDecoderLayer"]
 
-
-def test_patchify_unpatchify_round_trip_crops_padding() -> None:
-    from vllm_omni.diffusion.models.cosmos3.transformer_cosmos3 import Cosmos3VFMTransformer
-
-    model = object.__new__(Cosmos3VFMTransformer)
-    nn.Module.__init__(model)
     model.latent_patch_size = 2
     model.latent_channel_size = 3
-
     latents = torch.arange(1 * 3 * 1 * 3 * 5, dtype=torch.float32).reshape(1, 3, 1, 3, 5)
-
-    tokens = model.patchify(latents, t=1, h=3, w=5)
-    restored = model.unpatchify(tokens, t=1, h=3, w=5)
-
-    assert tokens.shape == (1, 6, 12)
-    torch.testing.assert_close(restored, latents)
-
-
-def _tiny_cosmos3_config(**overrides):
-    config = {
-        "hidden_size": 8,
-        "num_hidden_layers": 0,
-        "num_attention_heads": 2,
-        "num_key_value_heads": 2,
-        "head_dim": 4,
-        "intermediate_size": 16,
-        "vocab_size": 32,
-        "latent_patch_size": 1,
-        "latent_channel": 2,
-        "rope_scaling": {"mrope_section": [1, 1, 0]},
-    }
-    config.update(overrides)
-    return config
+    torch.testing.assert_close(model.unpatchify(model.patchify(latents, t=1, h=3, w=5), t=1, h=3, w=5), latents)
 
 
-def test_sound_modules_created_only_when_sound_config_present() -> None:
+def test_sound_and_action_modules_follow_config() -> None:
     from vllm_omni.diffusion.models.cosmos3.transformer_cosmos3 import Cosmos3VFMTransformer
 
     tiny = _tiny_cosmos3_config()
-
-    no_sound = Cosmos3VFMTransformer(SimpleNamespace(tf_model_config=tiny, dtype=torch.float32))
-    explicit_disabled = Cosmos3VFMTransformer(
-        SimpleNamespace(
-            tf_model_config={**tiny, "sound_gen": False, "sound_dim": 3},
-            dtype=torch.float32,
-        )
-    )
+    no_modal = Cosmos3VFMTransformer(SimpleNamespace(tf_model_config=tiny, dtype=torch.float32))
     with_sound = Cosmos3VFMTransformer(
-        SimpleNamespace(
-            tf_model_config={**tiny, "sound_gen": True, "sound_dim": 3},
-            dtype=torch.float32,
-        )
-    )
-    with_nested_sound_dim = Cosmos3VFMTransformer(
         SimpleNamespace(
             tf_model_config={**tiny, "sound_gen": True},
-            model_config={"sound_tokenizer": {"io_channels": 5}},
+            model_config={"sound_tokenizer": {"io_channels": 5, "sample_rate": 32000, "hop_size": 800}},
             custom_pipeline_args={},
             dtype=torch.float32,
         )
     )
-
-    assert no_sound.sound_gen is False
-    assert not hasattr(no_sound, "sound2llm")
-    assert explicit_disabled.sound_gen is False
-    assert not hasattr(explicit_disabled, "sound2llm")
-    assert with_sound.sound_gen is True
-    assert with_sound.sound2llm.in_features == 3
-    assert with_sound.llm2sound.out_features == 3
-    assert tuple(with_sound.sound_modality_embed.shape) == (8,)
-    assert with_nested_sound_dim.sound_dim == 5
-    assert with_nested_sound_dim.sound2llm.in_features == 5
-
-
-def test_action_modules_created_only_when_action_config_present() -> None:
-    from vllm_omni.diffusion.models.cosmos3.transformer_cosmos3 import Cosmos3VFMTransformer
-
-    tiny = _tiny_cosmos3_config()
-
-    no_action = Cosmos3VFMTransformer(SimpleNamespace(tf_model_config=tiny, dtype=torch.float32))
-    explicit_disabled = Cosmos3VFMTransformer(
-        SimpleNamespace(
-            tf_model_config={**tiny, "action_gen": False, "max_action_dim": 6},
-            dtype=torch.float32,
-        )
-    )
     with_action = Cosmos3VFMTransformer(
         SimpleNamespace(
             tf_model_config={**tiny, "action_gen": True, "max_action_dim": 6, "num_embodiment_domains": 9},
@@ -260,156 +131,78 @@ def test_action_modules_created_only_when_action_config_present() -> None:
         )
     )
 
-    assert no_action.action_gen is False
-    assert not hasattr(no_action, "action2llm")
-    assert explicit_disabled.action_gen is False
-    assert not hasattr(explicit_disabled, "action2llm")
-    assert with_action.action_gen is True
+    assert no_modal.sound_gen is False
+    assert no_modal.action_gen is False
+    assert not hasattr(no_modal, "sound2llm")
+    assert not hasattr(no_modal, "action2llm")
+    assert with_sound.sound_dim == 5
+    assert with_sound.sound_latent_fps == 40.0
+    assert with_sound.sound2llm.in_features == 5
     assert with_action.action_dim == 6
     assert with_action.action2llm.num_domains == 9
-    assert tuple(with_action.action_modality_embed.shape) == (8,)
 
 
-def test_sound_latent_fps_derives_from_sound_tokenizer_config() -> None:
-    from vllm_omni.diffusion.models.cosmos3.transformer_cosmos3 import Cosmos3VFMTransformer
-
-    tiny = _tiny_cosmos3_config(sound_gen=True, sound_dim=3)
-
-    derived = Cosmos3VFMTransformer(
-        SimpleNamespace(
-            tf_model_config=tiny,
-            model_config={"sound_tokenizer": {"sample_rate": 32000, "hop_size": 800}},
-            custom_pipeline_args={},
-            dtype=torch.float32,
-        )
-    )
-    explicit = Cosmos3VFMTransformer(
-        SimpleNamespace(
-            tf_model_config=tiny,
-            custom_pipeline_args={
-                "sound_sample_rate": 32000,
-                "sound_hop_size": 800,
-                "sound_latent_fps": 12.5,
-            },
-            dtype=torch.float32,
-        )
-    )
-
-    assert derived.sound_latent_fps == 40.0
-    assert explicit.sound_latent_fps == 12.5
-
-
-def test_pack_unpack_sound_round_trip_and_shape_validation() -> None:
+def test_sound_and_action_pack_unpack_validate_shapes() -> None:
     from vllm_omni.diffusion.models.cosmos3.transformer_cosmos3 import Cosmos3VFMTransformer
 
     model = object.__new__(Cosmos3VFMTransformer)
     nn.Module.__init__(model)
     model.sound_dim = 3
+    model.action_dim = 3
 
-    latents = torch.arange(2 * 3 * 4, dtype=torch.float32).reshape(2, 3, 4)
-    tokens = model.pack_sound(latents)
-    restored = model.unpack_sound(tokens)
+    sound = torch.arange(2 * 3 * 4, dtype=torch.float32).reshape(2, 3, 4)
+    action = torch.arange(2 * 4 * 3, dtype=torch.float32).reshape(2, 4, 3)
+    torch.testing.assert_close(model.unpack_sound(model.pack_sound(sound)), sound)
+    torch.testing.assert_close(model.unpack_action(model.pack_action(action)), action)
 
-    assert tokens.shape == (2, 4, 3)
-    torch.testing.assert_close(restored, latents)
     with pytest.raises(ValueError, match="channel mismatch"):
         model.pack_sound(torch.zeros(1, 4, 2))
-
-
-def test_pack_unpack_action_round_trip_and_shape_validation() -> None:
-    from vllm_omni.diffusion.models.cosmos3.transformer_cosmos3 import Cosmos3VFMTransformer
-
-    model = object.__new__(Cosmos3VFMTransformer)
-    nn.Module.__init__(model)
-    model.action_dim = 3
-
-    latents = torch.arange(2 * 4 * 3, dtype=torch.float32).reshape(2, 4, 3)
-    tokens = model.pack_action(latents)
-    restored = model.unpack_action(tokens)
-
-    assert tokens.shape == (2, 4, 3)
-    torch.testing.assert_close(restored, latents)
     with pytest.raises(ValueError, match="dimension mismatch"):
         model.pack_action(torch.zeros(1, 2, 4))
 
 
-def test_forward_with_sound_returns_video_and_sound_predictions() -> None:
-    from vllm_omni.diffusion.models.cosmos3.transformer_cosmos3 import Cosmos3VFMTransformer
-
-    model = Cosmos3VFMTransformer(
-        SimpleNamespace(
-            tf_model_config=_tiny_cosmos3_config(sound_gen=True, sound_dim=3, sound_latent_fps=24.0),
-            dtype=torch.float32,
-        )
-    )
-
-    video = torch.zeros(1, 2, 1, 2, 2)
-    sound = torch.zeros(1, 3, 4)
-    output = model(
-        hidden_states=video,
-        timestep=torch.tensor([1.0]),
-        text_ids=torch.tensor([[1, 2]], dtype=torch.long),
-        text_mask=torch.ones(1, 2, dtype=torch.long),
-        video_shape=(1, 2, 2),
-        fps=24.0,
-        sound_latents=sound,
-    )
-
-    assert isinstance(output, tuple)
-    video_pred, sound_pred = output
-    assert video_pred.shape == video.shape
-    assert sound_pred.shape == sound.shape
-
-
-def test_forward_with_action_returns_video_and_action_predictions() -> None:
+@pytest.mark.parametrize(
+    ("config", "extra_kwargs", "expected_shapes"),
+    [
+        (
+            _tiny_cosmos3_config(sound_gen=True, sound_dim=3, sound_latent_fps=24.0),
+            {"sound_latents": torch.zeros(1, 3, 4)},
+            [(1, 2, 1, 2, 2), (1, 3, 4)],
+        ),
+        (
+            _tiny_cosmos3_config(action_gen=True, max_action_dim=3, num_embodiment_domains=4),
+            {"action_latents": torch.zeros(1, 5, 3), "action_domain_ids": torch.tensor([2])},
+            [(1, 2, 1, 2, 2), (1, 5, 3)],
+        ),
+    ],
+)
+def test_forward_returns_video_plus_optional_modality_predictions(config, extra_kwargs, expected_shapes) -> None:
     from vllm_omni.diffusion.models.cosmos3.transformer_cosmos3 import Cosmos3VFMTransformer
 
-    model = Cosmos3VFMTransformer(
-        SimpleNamespace(
-            tf_model_config=_tiny_cosmos3_config(
-                action_gen=True,
-                max_action_dim=3,
-                num_embodiment_domains=4,
-            ),
-            dtype=torch.float32,
-        )
-    )
-
-    video = torch.zeros(1, 2, 1, 2, 2)
-    action = torch.zeros(1, 5, 3)
-    output = model(
-        hidden_states=video,
+    output = Cosmos3VFMTransformer(SimpleNamespace(tf_model_config=config, dtype=torch.float32))(
+        hidden_states=torch.zeros(1, 2, 1, 2, 2),
         timestep=torch.tensor([1.0]),
         text_ids=torch.tensor([[1, 2]], dtype=torch.long),
         text_mask=torch.ones(1, 2, dtype=torch.long),
         video_shape=(1, 2, 2),
         fps=24.0,
-        action_latents=action,
-        action_domain_ids=torch.tensor([2]),
         action_noisy_mask=torch.ones(1, 5, 1),
+        **extra_kwargs,
     )
 
     assert isinstance(output, tuple)
-    video_pred, action_pred = output
-    assert video_pred.shape == video.shape
-    assert action_pred.shape == action.shape
+    assert [tuple(tensor.shape) for tensor in output] == expected_shapes
 
 
 def test_forward_with_sound_ulysses_error_mentions_combined_sequence(monkeypatch: pytest.MonkeyPatch) -> None:
     import vllm_omni.diffusion.models.cosmos3.transformer_cosmos3 as cosmos3_module
 
     model = cosmos3_module.Cosmos3VFMTransformer(
-        SimpleNamespace(
-            tf_model_config=_tiny_cosmos3_config(sound_gen=True, sound_dim=3),
-            dtype=torch.float32,
-        )
+        SimpleNamespace(tf_model_config=_tiny_cosmos3_config(sound_gen=True, sound_dim=3), dtype=torch.float32)
     )
     monkeypatch.setattr(cosmos3_module, "_get_ulysses_state", lambda: (2, 0, None))
 
-    with pytest.raises(
-        ValueError,
-        match=r"GEN sequence length \(3 = video tokens 2 \+ sound tokens 1\).*combined media sequence",
-    ):
+    with pytest.raises(ValueError, match=r"GEN sequence length \(3 = video tokens 2 \+ sound tokens 1\)"):
         model(
             hidden_states=torch.zeros(1, 2, 1, 1, 2),
             timestep=torch.tensor([1.0]),
@@ -421,21 +214,7 @@ def test_forward_with_sound_ulysses_error_mentions_combined_sequence(monkeypatch
         )
 
 
-def test_reset_cache_clears_und_and_gen_cache() -> None:
-    from vllm_omni.diffusion.models.cosmos3.transformer_cosmos3 import Cosmos3VFMTransformer
-
-    model = object.__new__(Cosmos3VFMTransformer)
-    nn.Module.__init__(model)
-    model.cached_kv = object()
-    model.cached_freqs_gen = object()
-
-    model.reset_cache()
-
-    assert model.cached_kv is None
-    assert model.cached_freqs_gen is None
-
-
-def test_compute_rope_freqs_pads_text_and_offsets_vision_positions() -> None:
+def test_compute_rope_freqs_places_text_video_action_and_sound_positions() -> None:
     from vllm_omni.diffusion.models.cosmos3.transformer_cosmos3 import Cosmos3VFMTransformer
 
     class FakeRotary:
@@ -445,8 +224,7 @@ def __init__(self) -> None:
         def __call__(self, x, position_ids):
             del x
             self.position_ids.append(position_ids.detach().cpu())
-            batch = position_ids.shape[1]
-            seq = position_ids.shape[2]
+            batch, seq = position_ids.shape[1], position_ids.shape[2]
             return torch.zeros(batch, seq, 4), torch.ones(batch, seq, 4)
 
     rotary = FakeRotary()
@@ -456,6 +234,8 @@ def __call__(self, x, position_ids):
     model.temporal_modality_margin = 100
     model.base_fps = 24.0
     model.temporal_compression_factor = 4
+    model.temporal_compression_factor_sound = 1
+    model.sound_latent_fps = 25.0
     model.enable_fps_modulation = False
 
     freqs_und, freqs_gen = model._compute_rope_freqs(
@@ -467,85 +247,13 @@ def __call__(self, x, position_ids):
         device=torch.device("cpu"),
         dtype=torch.float32,
     )
-
     text_pos, vision_pos = rotary.position_ids
     assert text_pos[:, 0, :].tolist() == [[0, 1, 0], [0, 1, 0], [0, 1, 0]]
-    assert text_pos[:, 1, :].tolist() == [[0, 0, 0], [0, 0, 0], [0, 0, 0]]
     assert vision_pos[0, 0].tolist() == [102, 103]
-    assert vision_pos[0, 1].tolist() == [101, 102]
     assert freqs_und[0].shape == (2, 3, 1, 4)
     assert freqs_gen[0].shape == (2, 2, 1, 4)
 
-
-def test_compute_rope_freqs_appends_sound_positions_after_vision() -> None:
-    from vllm_omni.diffusion.models.cosmos3.transformer_cosmos3 import Cosmos3VFMTransformer
-
-    class FakeRotary:
-        def __init__(self) -> None:
-            self.position_ids: list[torch.Tensor] = []
-
-        def __call__(self, x, position_ids):
-            del x
-            self.position_ids.append(position_ids.detach().cpu())
-            batch = position_ids.shape[1]
-            seq = position_ids.shape[2]
-            return torch.zeros(batch, seq, 4), torch.ones(batch, seq, 4)
-
-    rotary = FakeRotary()
-    model = object.__new__(Cosmos3VFMTransformer)
-    nn.Module.__init__(model)
-    model.language_model = SimpleNamespace(rotary_emb=rotary)
-    model.temporal_modality_margin = 100
-    model.base_fps = 24.0
-    model.temporal_compression_factor = 4
-    model.enable_fps_modulation = True
-    model.temporal_compression_factor_sound = 1
-    model.sound_latent_fps = 25.0
-
-    model._compute_rope_freqs(
-        text_mask=torch.tensor([[1, 1]], dtype=torch.long),
-        t=2,
-        hp=1,
-        wp=1,
-        fps=24.0,
-        device=torch.device("cpu"),
-        dtype=torch.float32,
-        t_sound=3,
-    )
-
-    _, gen_pos = rotary.position_ids
-    assert gen_pos.shape == (3, 1, 5)
-    torch.testing.assert_close(
-        gen_pos[0, 0],
-        torch.tensor([102.0, 103.0, 102.0, 102.96, 103.92]),
-    )
-
-
-def test_compute_rope_freqs_appends_action_positions_between_vision_and_sound() -> None:
-    from vllm_omni.diffusion.models.cosmos3.transformer_cosmos3 import Cosmos3VFMTransformer
-
-    class FakeRotary:
-        def __init__(self) -> None:
-            self.position_ids: list[torch.Tensor] = []
-
-        def __call__(self, x, position_ids):
-            del x
-            self.position_ids.append(position_ids.detach().cpu())
-            batch = position_ids.shape[1]
-            seq = position_ids.shape[2]
-            return torch.zeros(batch, seq, 4), torch.ones(batch, seq, 4)
-
-    rotary = FakeRotary()
-    model = object.__new__(Cosmos3VFMTransformer)
-    nn.Module.__init__(model)
-    model.language_model = SimpleNamespace(rotary_emb=rotary)
-    model.temporal_modality_margin = 100
-    model.base_fps = 24.0
-    model.temporal_compression_factor = 4
-    model.enable_fps_modulation = False
-    model.temporal_compression_factor_sound = 1
-    model.sound_latent_fps = 25.0
-
+    rotary.position_ids.clear()
     model._compute_rope_freqs(
         text_mask=torch.tensor([[1, 1]], dtype=torch.long),
         t=2,
@@ -562,47 +270,3 @@ def __call__(self, x, position_ids):
     _, gen_pos = rotary.position_ids
     assert gen_pos.shape == (3, 1, 5)
     assert gen_pos[0, 0].tolist() == [102, 103, 103, 104, 102]
-
-
-def test_compute_rope_freqs_promotes_mixed_video_sound_position_dtypes() -> None:
-    from vllm_omni.diffusion.models.cosmos3.transformer_cosmos3 import Cosmos3VFMTransformer
-
-    class FakeRotary:
-        def __init__(self) -> None:
-            self.position_ids: list[torch.Tensor] = []
-
-        def __call__(self, x, position_ids):
-            del x
-            self.position_ids.append(position_ids.detach().cpu())
-            batch = position_ids.shape[1]
-            seq = position_ids.shape[2]
-            return torch.zeros(batch, seq, 4), torch.ones(batch, seq, 4)
-
-    rotary = FakeRotary()
-    model = object.__new__(Cosmos3VFMTransformer)
-    nn.Module.__init__(model)
-    model.language_model = SimpleNamespace(rotary_emb=rotary)
-    model.temporal_modality_margin = 100
-    model.base_fps = 24.0
-    model.temporal_compression_factor = 4
-    model.enable_fps_modulation = True
-    model.temporal_compression_factor_sound = 1
-    model.sound_latent_fps = 25.0
-
-    model._compute_rope_freqs(
-        text_mask=torch.tensor([[1, 1]], dtype=torch.long),
-        t=1,
-        hp=1,
-        wp=1,
-        fps=None,
-        device=torch.device("cpu"),
-        dtype=torch.float32,
-        t_sound=3,
-    )
-
-    _, gen_pos = rotary.position_ids
-    assert gen_pos.dtype == torch.float32
-    torch.testing.assert_close(
-        gen_pos[0, 0],
-        torch.tensor([102.0, 102.0, 102.96, 103.92]),
-    )
diff --git a/tests/diffusion/models/test_cosmos3_guardrails.py b/tests/diffusion/models/test_cosmos3_guardrails.py
index 2e3457e174a..9ef45f77181 100644
--- a/tests/diffusion/models/test_cosmos3_guardrails.py
+++ b/tests/diffusion/models/test_cosmos3_guardrails.py
@@ -38,12 +38,16 @@ def generate(self, *args, **kwargs):
         return torch.cat([input_ids, torch.tensor([[99]], dtype=input_ids.dtype)], dim=-1)
 
 
-def test_qwen_guardrail_generation_accepts_batch_encoding() -> None:
+@pytest.mark.parametrize("as_batch_encoding", [True, False])
+def test_qwen_guardrail_generation_accepts_supported_tokenizer_outputs(as_batch_encoding: bool) -> None:
     from vllm_omni.diffusion.models.cosmos3.guardrails import _generate_qwen_guardrail_response
 
     input_ids = torch.tensor([[1, 2, 3]], dtype=torch.long)
     attention_mask = torch.ones_like(input_ids)
-    tokenizer = _FakeTokenizer(BatchEncoding({"input_ids": input_ids, "attention_mask": attention_mask}))
+    model_inputs = (
+        BatchEncoding({"input_ids": input_ids, "attention_mask": attention_mask}) if as_batch_encoding else input_ids
+    )
+    tokenizer = _FakeTokenizer(model_inputs)
     model = _FakeModel()
 
     response = _generate_qwen_guardrail_response("a safe prompt", tokenizer, model, "cpu")
@@ -51,24 +55,12 @@ def test_qwen_guardrail_generation_accepts_batch_encoding() -> None:
     assert response == "safe"
     assert tokenizer.decoded_ids == [99]
     args, kwargs = model.calls[0]
-    assert args == ()
-    assert torch.equal(kwargs["input_ids"], input_ids)
-    assert torch.equal(kwargs["attention_mask"], attention_mask)
-    assert kwargs["max_new_tokens"] == 128
-
-
-def test_qwen_guardrail_generation_accepts_tensor_input_ids() -> None:
-    from vllm_omni.diffusion.models.cosmos3.guardrails import _generate_qwen_guardrail_response
-
-    input_ids = torch.tensor([[1, 2, 3]], dtype=torch.long)
-    tokenizer = _FakeTokenizer(input_ids)
-    model = _FakeModel()
-
-    response = _generate_qwen_guardrail_response("a safe prompt", tokenizer, model, "cpu")
-
-    assert response == "safe"
-    assert tokenizer.decoded_ids == [99]
-    args, kwargs = model.calls[0]
-    assert len(args) == 1
-    assert torch.equal(args[0], input_ids)
-    assert kwargs == {"max_new_tokens": 128}
+    if as_batch_encoding:
+        assert args == ()
+        assert torch.equal(kwargs["input_ids"], input_ids)
+        assert torch.equal(kwargs["attention_mask"], attention_mask)
+        assert kwargs["max_new_tokens"] == 128
+    else:
+        assert len(args) == 1
+        assert torch.equal(args[0], input_ids)
+        assert kwargs == {"max_new_tokens": 128}
diff --git a/tests/e2e/accuracy/test_cosmos3_similarity.py b/tests/e2e/accuracy/test_cosmos3_similarity.py
index 166c56a9318..ff2350096e0 100644
--- a/tests/e2e/accuracy/test_cosmos3_similarity.py
+++ b/tests/e2e/accuracy/test_cosmos3_similarity.py
@@ -20,14 +20,12 @@
 
 pytestmark = [pytest.mark.full_model, pytest.mark.diffusion]
 
-
 MODEL_ENV_VAR = "VLLM_TEST_COSMOS3_MODEL"
 MODEL_ID = "cosmos3"
 PROMPT = "A small warehouse robot moves a blue box across a clean floor."
 NEGATIVE_PROMPT = "blurry, distorted, low quality"
 SEED = 42
-WIDTH = 256
-HEIGHT = 256
+WIDTH = HEIGHT = 256
 NUM_INFERENCE_STEPS = 2
 
 
@@ -35,6 +33,8 @@ def _model_name() -> str:
     model = os.environ.get(MODEL_ENV_VAR)
     if not model:
         pytest.skip(f"Set {MODEL_ENV_VAR} to run Cosmos3 full-model smoke tests.")
+    if not torch.cuda.is_available():
+        pytest.skip("Cosmos3 full-model smoke tests require CUDA.")
     return model
 
 
@@ -54,19 +54,14 @@ def _server_args() -> list[str]:
 def _image_data_url(image: Image.Image) -> str:
     buf = io.BytesIO()
     image.save(buf, format="PNG")
-    encoded = base64.b64encode(buf.getvalue()).decode("ascii")
-    return f"data:image/png;base64,{encoded}"
+    return f"data:image/png;base64,{base64.b64encode(buf.getvalue()).decode('ascii')}"
 
 
 @pytest.mark.benchmark
 @hardware_test(res={"cuda": "H100"}, num_cards=1)
 def test_cosmos3_t2i_serving_smoke(accuracy_artifact_root: Path) -> None:
-    if not torch.cuda.is_available():
-        pytest.skip("Cosmos3 full-model smoke tests require CUDA.")
-
-    model = _model_name()
     output_dir = model_output_dir(accuracy_artifact_root, MODEL_ID)
-    with OmniServer(model, _server_args(), use_omni=True) as server:
+    with OmniServer(_model_name(), _server_args(), use_omni=True) as server:
         response = requests.post(
             f"http://{server.host}:{server.port}/v1/images/generations",
             json={
@@ -91,65 +86,47 @@ def test_cosmos3_t2i_serving_smoke(accuracy_artifact_root: Path) -> None:
     assert image.size == (WIDTH, HEIGHT)
 
 
+@pytest.mark.parametrize(
+    ("name", "prompt", "num_frames", "image_reference"),
+    [
+        ("t2v", PROMPT, "1", None),
+        (
+            "i2v",
+            "The blue rectangle moves slowly forward.",
+            "5",
+            Image.new("RGB", (96, 64), color=(40, 80, 160)),
+        ),
+    ],
+)
 @pytest.mark.benchmark
 @hardware_test(res={"cuda": "H100"}, num_cards=1)
-def test_cosmos3_t2v_sync_serving_smoke(accuracy_artifact_root: Path) -> None:
-    if not torch.cuda.is_available():
-        pytest.skip("Cosmos3 full-model smoke tests require CUDA.")
-
-    model = _model_name()
-    output_dir = model_output_dir(accuracy_artifact_root, MODEL_ID)
-    with OmniServer(model, _server_args(), use_omni=True) as server:
-        response = requests.post(
-            f"http://{server.host}:{server.port}/v1/videos/sync",
-            data={
-                "model": server.model,
-                "prompt": PROMPT,
-                "negative_prompt": NEGATIVE_PROMPT,
-                "size": f"{WIDTH}x{HEIGHT}",
-                "num_frames": "1",
-                "fps": "1",
-                "num_inference_steps": str(NUM_INFERENCE_STEPS),
-                "guidance_scale": "1.0",
-                "seed": str(SEED),
-            },
-            timeout=1800,
-        )
-
-    response.raise_for_status()
-    assert response.headers["content-type"].startswith("video/mp4")
-    assert response.content
-    (output_dir / "cosmos3_t2v.mp4").write_bytes(response.content)
-
-
-@pytest.mark.benchmark
-@hardware_test(res={"cuda": "H100"}, num_cards=1)
-def test_cosmos3_i2v_sync_serving_smoke(accuracy_artifact_root: Path) -> None:
-    if not torch.cuda.is_available():
-        pytest.skip("Cosmos3 full-model smoke tests require CUDA.")
-
-    model = _model_name()
+def test_cosmos3_video_serving_smoke(
+    accuracy_artifact_root: Path,
+    name: str,
+    prompt: str,
+    num_frames: str,
+    image_reference: Image.Image | None,
+) -> None:
     output_dir = model_output_dir(accuracy_artifact_root, MODEL_ID)
-    reference = Image.new("RGB", (96, 64), color=(40, 80, 160))
-    with OmniServer(model, _server_args(), use_omni=True) as server:
-        response = requests.post(
-            f"http://{server.host}:{server.port}/v1/videos/sync",
-            data={
-                "model": server.model,
-                "prompt": "The blue rectangle moves slowly forward.",
-                "negative_prompt": NEGATIVE_PROMPT,
-                "image_reference": json.dumps({"image_url": _image_data_url(reference)}),
-                "size": f"{WIDTH}x{HEIGHT}",
-                "num_frames": "5",
-                "fps": "1",
-                "num_inference_steps": str(NUM_INFERENCE_STEPS),
-                "guidance_scale": "1.0",
-                "seed": str(SEED),
-            },
-            timeout=1800,
-        )
+    data = {
+        "model": "",
+        "prompt": prompt,
+        "negative_prompt": NEGATIVE_PROMPT,
+        "size": f"{WIDTH}x{HEIGHT}",
+        "num_frames": num_frames,
+        "fps": "1",
+        "num_inference_steps": str(NUM_INFERENCE_STEPS),
+        "guidance_scale": "1.0",
+        "seed": str(SEED),
+    }
+    if image_reference is not None:
+        data["image_reference"] = json.dumps({"image_url": _image_data_url(image_reference)})
+
+    with OmniServer(_model_name(), _server_args(), use_omni=True) as server:
+        data["model"] = server.model
+        response = requests.post(f"http://{server.host}:{server.port}/v1/videos/sync", data=data, timeout=1800)
 
     response.raise_for_status()
     assert response.headers["content-type"].startswith("video/mp4")
     assert response.content
-    (output_dir / "cosmos3_i2v.mp4").write_bytes(response.content)
+    (output_dir / f"cosmos3_{name}.mp4").write_bytes(response.content)

From 386d40fb1f45a18802b510ac358137ffb6d4fec1 Mon Sep 17 00:00:00 2001
From: Maciej Bala <mbala@nvidia.com>
Date: Wed, 20 May 2026 14:08:28 +0200
Subject: [PATCH 21/41] Scope Cosmos3 to core generation

Signed-off-by: Maciej Bala <mbala@nvidia.com>
---
 docs/models/supported_models.md               |   2 +-
 tests/diffusion/models/cosmos3/conftest.py    |  20 +-
 .../models/cosmos3/test_cosmos3_pipeline.py   | 126 +--
 .../cosmos3/test_cosmos3_sound_tokenizer.py   | 226 ------
 .../cosmos3/test_cosmos3_transformer.py       | 130 +--
 .../openai_api/test_video_server.py           | 202 +----
 vllm_omni/diffusion/models/cosmos3/action.py  | 217 -----
 .../cosmos3/audio_tokenizer/__init__.py       |   6 -
 .../models/cosmos3/audio_tokenizer/avae.py    | 321 --------
 .../models/cosmos3/pipeline_cosmos3.py        | 751 +-----------------
 .../models/cosmos3/sound_tokenizer.py         | 537 -------------
 .../models/cosmos3/transformer_cosmos3.py     | 367 +--------
 vllm_omni/entrypoints/openai/api_server.py    |   9 +-
 .../entrypoints/openai/protocol/__init__.py   |   2 -
 .../entrypoints/openai/protocol/videos.py     |  22 -
 vllm_omni/entrypoints/openai/serving_video.py | 260 +-----
 16 files changed, 77 insertions(+), 3121 deletions(-)
 delete mode 100644 tests/diffusion/models/cosmos3/test_cosmos3_sound_tokenizer.py
 delete mode 100644 vllm_omni/diffusion/models/cosmos3/action.py
 delete mode 100644 vllm_omni/diffusion/models/cosmos3/audio_tokenizer/__init__.py
 delete mode 100644 vllm_omni/diffusion/models/cosmos3/audio_tokenizer/avae.py
 delete mode 100644 vllm_omni/diffusion/models/cosmos3/sound_tokenizer.py

diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md
index a1165611ddf..1ca01d3658e 100644
--- a/docs/models/supported_models.md
+++ b/docs/models/supported_models.md
@@ -32,7 +32,7 @@ th {
 | `ZImagePipeline` | Z-Image | `Tongyi-MAI/Z-Image-Turbo` | ✅︎ | ✅︎ | ✅︎ | ✅︎ |
 | `WanPipeline` | Wan2.1-T2V, Wan2.2-T2V, Wan2.2-TI2V | `Wan-AI/Wan2.1-T2V-1.3B-Diffusers`, `Wan-AI/Wan2.1-T2V-14B-Diffusers`, `Wan-AI/Wan2.2-T2V-A14B-Diffusers`, `Wan-AI/Wan2.2-TI2V-5B-Diffusers` | ✅︎ | ✅︎ | ✅︎ | ✅︎ |
 | `WanImageToVideoPipeline` | Wan2.2-I2V | `Wan-AI/Wan2.2-I2V-A14B-Diffusers` | ✅︎ | ✅︎ | ✅︎ | ✅︎ |
-| `Cosmos3OmniDiffusersPipeline` | Cosmos3 T2I, T2V, I2V, T2V with sound, action policy | `nvidia/Cosmos3-Nano` | ✅︎ | | | |
+| `Cosmos3OmniDiffusersPipeline` | Cosmos3 T2I, T2V, I2V | `nvidia/Cosmos3-Nano` | ✅︎ | | | |
 | `WanSpeechToVideoPipeline` | Wan2.2-S2V | `Wan-AI/Wan2.2-S2V-14B` | ✅︎ | ✅︎ | ✅︎ | ✅︎ |
 | `Wan22VACEPipeline` | Wan2.1-VACE | `Wan-AI/Wan2.1-VACE-1.3B-diffusers`, `Wan-AI/Wan2.1-VACE-14B-diffusers` | ✅︎ | ✅︎ | ✅︎ | ✅︎ |
 | `LTX2Pipeline` | LTX-2-T2V | `Lightricks/LTX-2` | ✅︎ | ✅︎ | | |
diff --git a/tests/diffusion/models/cosmos3/conftest.py b/tests/diffusion/models/cosmos3/conftest.py
index 80a7105d2ca..aa3bba1acd5 100644
--- a/tests/diffusion/models/cosmos3/conftest.py
+++ b/tests/diffusion/models/cosmos3/conftest.py
@@ -75,17 +75,9 @@ def __init__(
         self,
         *,
         latent_channel_size: int = 2,
-        sound_gen: bool = False,
-        sound_dim: int = 3,
-        action_gen: bool = False,
-        action_dim: int = 4,
     ) -> None:
         super().__init__()
         self.latent_channel_size = latent_channel_size
-        self.sound_gen = sound_gen
-        self.sound_dim = sound_dim
-        self.action_gen = action_gen
-        self.action_dim = action_dim
         self.cached_kv: Any | None = None
         self.cached_freqs_gen: Any | None = None
         self.calls: list[dict[str, Any]] = []
@@ -104,9 +96,8 @@ def forward(
         text_ids: torch.Tensor,
         text_mask: torch.Tensor,
         **kwargs: Any,
-    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
+    ) -> torch.Tensor:
         token = int(text_ids.reshape(-1)[0].item()) if text_ids.numel() else 0
-        sound_latents = kwargs.get("sound_latents")
         self.calls.append(
             {
                 "token": token,
@@ -120,13 +111,7 @@ def forward(
             marker = torch.tensor([token], dtype=torch.float32)
             self.cached_kv = [(marker, marker + 100)]
             self.cached_freqs_gen = (marker + 200, marker + 300)
-        action_latents = kwargs.get("action_latents")
-        outputs: list[torch.Tensor] = [torch.full_like(hidden_states, float(token))]
-        if action_latents is not None:
-            outputs.append(torch.full_like(action_latents, float(token + 20)))
-        if sound_latents is not None:
-            outputs.append(torch.full_like(sound_latents, float(token + 10)))
-        return outputs[0] if len(outputs) == 1 else tuple(outputs)
+        return torch.full_like(hidden_states, float(token))
 
 
 def passthrough_progress_bar(iterable):
@@ -167,7 +152,6 @@ def _make():
         pipeline._guidance_scale = None
         pipeline._num_timesteps = None
         pipeline.progress_bar = passthrough_progress_bar
-        pipeline._sound_tokenizer = None
         return pipeline
 
     return _make
diff --git a/tests/diffusion/models/cosmos3/test_cosmos3_pipeline.py b/tests/diffusion/models/cosmos3/test_cosmos3_pipeline.py
index 452d8d4e8b5..28a53d35074 100644
--- a/tests/diffusion/models/cosmos3/test_cosmos3_pipeline.py
+++ b/tests/diffusion/models/cosmos3/test_cosmos3_pipeline.py
@@ -48,7 +48,7 @@ def test_pipeline_registered_and_exported() -> None:
     assert "Cosmos3OmniDiffusersPipeline" in cosmos3.__all__
 
 
-def test_preprocess_i2v_image_and_action_video_inputs() -> None:
+def test_preprocess_i2v_image_input() -> None:
     from vllm_omni.diffusion.models.cosmos3.pipeline_cosmos3 import get_cosmos3_pre_process_func
 
     preprocess = get_cosmos3_pre_process_func(SimpleNamespace())
@@ -61,18 +61,8 @@ def test_preprocess_i2v_image_and_action_video_inputs() -> None:
     assert (result.sampling_params.height, result.sampling_params.width) == (672, 1344)
     assert tuple(result.prompts[0]["additional_information"]["preprocessed_image"].shape[-2:]) == (672, 1344)
 
-    frames = [Image.new("RGB", (8, 4), color) for color in ("red", "green", "blue")]
-    action = SimpleNamespace(
-        prompts=[{"prompt": "Move.", "multi_modal_data": {"video": frames}}],
-        sampling_params=SimpleNamespace(height=16, width=32, extra_args={"action_mode": "forward_dynamics"}),
-    )
-
-    additional = preprocess(action).prompts[0]["additional_information"]
-    assert tuple(additional["preprocessed_image"].shape) == (1, 3, 16, 32)
-    assert tuple(additional["preprocessed_video"].shape) == (1, 3, 3, 16, 32)
-
 
-def test_postprocess_handles_image_video_audio_and_validation() -> None:
+def test_postprocess_handles_image_video_and_validation() -> None:
     from vllm_omni.diffusion.models.cosmos3.pipeline_cosmos3 import get_cosmos3_post_process_func
 
     func = get_cosmos3_post_process_func(SimpleNamespace())
@@ -81,13 +71,6 @@ def test_postprocess_handles_image_video_audio_and_validation() -> None:
     assert func(video, output_type="latent") is video
     assert func({"image": video})[0].size == (4, 4)
     assert "video" in func({"video": video})
-    assert (
-        func(
-            {"video": video, "audio": torch.ones(1, 2, 16), "audio_sample_rate": 48000},
-            sampling_params=SimpleNamespace(extra_args={"resolved_frame_rate": 12}),
-        )["audio_sample_rate"]
-        == 48000
-    )
 
     with pytest.raises(ValueError, match="text-to-image postprocess expects"):
         func({"image": torch.zeros(1, 3, 2, 4, 4)})
@@ -121,14 +104,12 @@ def test_prompt_formatting_and_checkpoint_key_remap(make_cosmos3_pipeline) -> No
         "model.embed_tokens.weight": "transformer.language_model.embed_tokens.weight",
         "model.layers.3.self_attn.q_proj.weight": "transformer.language_model.layers.3.self_attn.q_proj.weight",
         "model.layers.3.self_attn.q_proj_moe_gen.weight": "transformer.gen_layers.3.cross_attention.q_proj.weight",
-        "sound2llm.weight": "transformer.sound2llm.weight",
-        "action_modality_embed.weight": "transformer.action_modality_embed",
         "lm_head.weight": None,
     }
     assert {key: Cosmos3OmniDiffusersPipeline._remap_ckpt_key(key) for key in remaps} == remaps
 
 
-def test_prepare_latents_for_video_image_sound_and_action(make_cosmos3_pipeline) -> None:
+def test_prepare_latents_for_video_and_image(make_cosmos3_pipeline) -> None:
     pipeline = make_cosmos3_pipeline()
     latents = pipeline._prepare_latents(16, 24, 5, torch.Generator(device="cpu").manual_seed(0))
     assert latents.shape == (1, 2, 2, 2, 3)
@@ -141,36 +122,8 @@ def test_prepare_latents_for_video_image_sound_and_action(make_cosmos3_pipeline)
     assert velocity_mask.tolist() == [[[[[0.0]], [[1.0]]]]]
     assert image_latent.shape == (1, 2, 1, 2, 3)
 
-    pipeline.transformer = pipeline.transformer.__class__(latent_channel_size=2, sound_gen=True, sound_dim=3)
-    pipeline._sound_tokenizer = SimpleNamespace(
-        sample_rate=10,
-        latent_ch=3,
-        hop_size=4,
-        decode=lambda x: torch.ones(x.shape[0], 2, 24),
-    )
-    assert pipeline._resolve_sound_target_samples(SimpleNamespace(extra_args={"sound_duration": 2.0}), 9, 3.0) == (
-        20,
-        2.0,
-        10,
-    )
-    sound_latents, latent_frames = pipeline._prepare_sound_latents(21, torch.Generator(device="cpu").manual_seed(0))
-    assert (sound_latents.shape, latent_frames) == (torch.Size([1, 3, 6]), 6)
-    assert pipeline._decode_sound_latents(torch.zeros(1, 3, 6), target_audio_samples=21).shape == (1, 2, 21)
-
-    pipeline.transformer = pipeline.transformer.__class__(action_gen=True, action_dim=4)
-    action, action_mask, clean, raw_dim = pipeline._prepare_action_latents(
-        mode="forward_dynamics",
-        action_chunk_size=2,
-        raw_action_dim=None,
-        generator=torch.Generator(device="cpu").manual_seed(0),
-        sp=SimpleNamespace(extra_args={"action": [[1.0, 2.0], [3.0, 4.0]]}),
-    )
-    assert raw_dim == 2
-    assert action_mask.tolist() == [[[0.0], [0.0]]]
-    torch.testing.assert_close(action, clean)
 
-
-def test_diffuse_covers_cfg_i2v_and_multimodal_steps(make_cosmos3_pipeline) -> None:
+def test_diffuse_covers_cfg_and_i2v_steps(make_cosmos3_pipeline) -> None:
     pipeline = make_cosmos3_pipeline()
     latents = torch.zeros(1, 2, 1, 1, 1)
 
@@ -202,23 +155,6 @@ def test_diffuse_covers_cfg_i2v_and_multimodal_steps(make_cosmos3_pipeline) -> N
     )
     torch.testing.assert_close(i2v[:, :, 0:1], torch.full((1, 2, 1, 1, 1), 7.0))
 
-    pipeline.transformer = pipeline.transformer.__class__(latent_channel_size=2, action_gen=True, action_dim=4)
-    video_result, action_result = pipeline.diffuse(
-        latents=latents,
-        action_latents=torch.zeros(1, 3, 4),
-        action_velocity_mask=torch.ones(1, 3, 1),
-        action_condition_latents=torch.zeros(1, 3, 4),
-        timesteps=torch.tensor([7, 3]),
-        cond_ids=_ids(2),
-        cond_mask=_mask(),
-        uncond_ids=_ids(1),
-        uncond_mask=_mask(),
-        guidance_scale=1.0,
-        shared_kwargs={"video_shape": (1, 1, 1), "fps": 24.0, "action_domain_ids": torch.tensor([0])},
-    )
-    torch.testing.assert_close(video_result, torch.full_like(latents, 4.0))
-    torch.testing.assert_close(action_result, torch.full((), 44.0).expand_as(action_result))
-
 
 class TestForwardRouting:
     def _install_forward_stubs(self, pipeline):
@@ -242,12 +178,7 @@ def fake_prepare(height, width, num_frames, generator):
 
         def fake_diffuse(**kwargs):
             captured["diffuse_calls"].append(kwargs)
-            outputs = [kwargs["latents"] + len(captured["diffuse_calls"])]
-            if kwargs.get("action_latents") is not None:
-                outputs.append(kwargs["action_latents"] + 3.0)
-            if kwargs.get("sound_latents") is not None:
-                outputs.append(kwargs["sound_latents"] + 2.0)
-            return outputs[0] if len(outputs) == 1 else tuple(outputs)
+            return kwargs["latents"] + len(captured["diffuse_calls"])
 
         pipeline._format_and_tokenize_prompts = fake_format
         pipeline._prepare_latents = fake_prepare
@@ -295,7 +226,7 @@ def test_forward_defaults_and_mode_selection(
         assert captured["flow_shifts"] == expected["flow"]
         assert captured["scheduler_steps"] == expected["steps"]
 
-    def test_forward_i2v_sound_and_action_routes(self, make_cosmos3_pipeline) -> None:
+    def test_forward_i2v_route(self, make_cosmos3_pipeline) -> None:
         pipeline = make_cosmos3_pipeline()
         captured = self._install_forward_stubs(pipeline)
         image_tensor = torch.zeros(1, 3, 16, 16)
@@ -320,55 +251,11 @@ def test_forward_i2v_sound_and_action_routes(self, make_cosmos3_pipeline) -> Non
         )
         assert captured["diffuse_calls"][-1]["shared_kwargs"]["noisy_frame_mask"] is velocity_mask
 
-        pipeline.transformer = pipeline.transformer.__class__(latent_channel_size=2, sound_gen=True, sound_dim=3)
-        sound_latents = torch.zeros(1, 3, 4)
-        pipeline._resolve_sound_target_samples = lambda *args: (20, 2.0, 10)
-        pipeline._prepare_sound_latents = lambda *args: (sound_latents, 4)
-        pipeline._decode_sound_latents = lambda *args: torch.ones(1, 2, 20)
-        output = pipeline.forward(
-            SimpleNamespace(
-                prompts=[{"prompt": "A robot", "modalities": ["video"], "generate_sound": True}],
-                sampling_params=make_sampling_params(num_frames=9, frame_rate=3.0),
-            )
-        )
-        assert captured["diffuse_calls"][-1]["sound_latents"] is sound_latents
-        assert output.output["audio_sample_rate"] == 10
-
-        pipeline.transformer = pipeline.transformer.__class__(latent_channel_size=2, action_gen=True, action_dim=4)
-        output = pipeline.forward(
-            SimpleNamespace(
-                prompts=[
-                    {
-                        "prompt": "Pick the block.",
-                        "modalities": ["video"],
-                        "additional_information": {"preprocessed_image": image_tensor},
-                    }
-                ],
-                sampling_params=make_sampling_params(
-                    height=16,
-                    width=16,
-                    extra_args={
-                        "action_mode": "policy",
-                        "action_chunk_size": 2,
-                        "raw_action_dim": 2,
-                        "domain_name": "bridge_orig_lerobot",
-                    },
-                ),
-            )
-        )
-        assert captured["diffuse_calls"][-1]["shared_kwargs"]["action_domain_ids"].tolist() == [7]
-        assert output.custom_output["action"].shape == (1, 2, 2)
-
     @pytest.mark.parametrize(
         ("prompt", "sampling_params", "message"),
         [
             (["one", "two"], make_sampling_params(), "single prompt"),
             ([{"prompt": "one", "modalities": ["image", "video"]}], make_sampling_params(), "both image and video"),
-            (
-                [{"prompt": "x", "modalities": ["image"], "generate_sound": True}],
-                make_sampling_params(),
-                "only for video",
-            ),
         ],
     )
     def test_forward_rejects_invalid_public_requests(
@@ -379,7 +266,6 @@ def test_forward_rejects_invalid_public_requests(
         message,
     ) -> None:
         pipeline = make_cosmos3_pipeline()
-        pipeline.transformer = pipeline.transformer.__class__(latent_channel_size=2, sound_gen=True, sound_dim=3)
 
         with pytest.raises(ValueError, match=message):
             pipeline.forward(SimpleNamespace(prompts=prompt, sampling_params=sampling_params))
diff --git a/tests/diffusion/models/cosmos3/test_cosmos3_sound_tokenizer.py b/tests/diffusion/models/cosmos3/test_cosmos3_sound_tokenizer.py
deleted file mode 100644
index 47664c59e77..00000000000
--- a/tests/diffusion/models/cosmos3/test_cosmos3_sound_tokenizer.py
+++ /dev/null
@@ -1,226 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-from __future__ import annotations
-
-import json
-from pathlib import Path
-from types import SimpleNamespace
-
-import pytest
-import torch
-
-pytestmark = [pytest.mark.core_model, pytest.mark.cpu, pytest.mark.diffusion]
-
-DIFFUSERS_SOUND_TOKENIZER_CHECKPOINT_NAME = "diffusion_pytorch_model.safetensors"
-
-
-class _FakeAVAEAudioTokenizer:
-    def __init__(self, **kwargs) -> None:
-        self.kwargs = kwargs
-        self.sample_rate = int(kwargs["sample_rate"])
-        self.audio_channels = int(kwargs["audio_channels"])
-        self.latent_ch = int(kwargs["io_channels"])
-        self.temporal_compression_factor = int(kwargs["hop_size"])
-
-    def get_latent_num_samples(self, num_audio_samples: int) -> int:
-        return int(num_audio_samples) // self.temporal_compression_factor
-
-    def get_audio_num_samples(self, num_latent_samples: int) -> int:
-        return int(num_latent_samples) * self.temporal_compression_factor
-
-    def decode(self, latents: torch.Tensor) -> torch.Tensor:
-        return torch.zeros(latents.shape[0], self.audio_channels, 8)
-
-
-def _write_component(root: Path, config: dict | None = None, checkpoint_name: str | None = None) -> Path:
-    tokenizer_dir = root / "sound_tokenizer"
-    tokenizer_dir.mkdir(parents=True)
-    if checkpoint_name:
-        (tokenizer_dir / checkpoint_name).write_bytes(b"stub")
-    (tokenizer_dir / "config.json").write_text(json.dumps(config or {}), encoding="utf-8")
-    return tokenizer_dir
-
-
-def _patch_fake_avae(monkeypatch: pytest.MonkeyPatch, created: dict) -> None:
-    from vllm_omni.diffusion.models.cosmos3 import sound_tokenizer
-
-    class FakeAVAE(_FakeAVAEAudioTokenizer):
-        def __init__(self, **kwargs) -> None:
-            created.update(kwargs)
-            super().__init__(**kwargs)
-
-    monkeypatch.setattr(sound_tokenizer, "Cosmos3AVAEAudioTokenizer", FakeAVAE)
-    monkeypatch.setattr(sound_tokenizer, "get_local_device", lambda: torch.device("cpu"))
-
-
-def test_from_config_loads_local_diffusers_component(tmp_path, monkeypatch: pytest.MonkeyPatch) -> None:
-    from vllm_omni.diffusion.models.cosmos3 import sound_tokenizer
-
-    model_dir = tmp_path / "model"
-    tokenizer_dir = _write_component(model_dir, checkpoint_name=DIFFUSERS_SOUND_TOKENIZER_CHECKPOINT_NAME)
-    created = {}
-    _patch_fake_avae(monkeypatch, created)
-
-    tokenizer = sound_tokenizer.Cosmos3SoundTokenizer.from_config(
-        SimpleNamespace(
-            model=str(model_dir),
-            custom_pipeline_args={"sound_sample_rate": 32000, "sound_hop_size": 800, "sound_dim": 3},
-            dtype=torch.float32,
-        )
-    )
-
-    assert created["checkpoint_path"] == str(tokenizer_dir / DIFFUSERS_SOUND_TOKENIZER_CHECKPOINT_NAME)
-    assert created["config_path"] == str(tokenizer_dir / "config.json")
-    assert (tokenizer.sample_rate, tokenizer.latent_ch, tokenizer.hop_size) == (32000, 3, 800)
-
-
-def test_from_config_downloads_component_from_hf_repo(tmp_path, monkeypatch: pytest.MonkeyPatch) -> None:
-    import huggingface_hub
-
-    from vllm_omni.diffusion.models.cosmos3 import sound_tokenizer
-
-    cache_dir = tmp_path / "hf"
-    _write_component(cache_dir, checkpoint_name=DIFFUSERS_SOUND_TOKENIZER_CHECKPOINT_NAME)
-    calls = []
-    created = {}
-    _patch_fake_avae(monkeypatch, created)
-
-    def fake_snapshot_download(repo_id: str, *, revision: str | None, allow_patterns: list[str]) -> str:
-        calls.append((repo_id, revision, allow_patterns))
-        return str(cache_dir)
-
-    monkeypatch.setattr(huggingface_hub, "snapshot_download", fake_snapshot_download)
-
-    sound_tokenizer.Cosmos3SoundTokenizer.from_config(
-        SimpleNamespace(
-            model="nvidia/cosmos3",
-            revision="test-rev",
-            custom_pipeline_args={"sound_sample_rate": 32000, "sound_hop_size": 800, "sound_dim": 3},
-            dtype=torch.float32,
-        )
-    )
-
-    assert created["checkpoint_path"].endswith(DIFFUSERS_SOUND_TOKENIZER_CHECKPOINT_NAME)
-    assert calls == [
-        (
-            "nvidia/cosmos3",
-            "test-rev",
-            ["sound_tokenizer/config.json", f"sound_tokenizer/{DIFFUSERS_SOUND_TOKENIZER_CHECKPOINT_NAME}"],
-        )
-    ]
-
-
-@pytest.mark.parametrize(
-    ("checkpoint_name", "message"),
-    [
-        (None, "no AVAE sound tokenizer checkpoint"),
-        ("model.safetensors", DIFFUSERS_SOUND_TOKENIZER_CHECKPOINT_NAME),
-    ],
-)
-def test_default_component_requires_diffusers_checkpoint_name(tmp_path, checkpoint_name, message) -> None:
-    from vllm_omni.diffusion.models.cosmos3 import sound_tokenizer
-
-    model_dir = tmp_path / "model"
-    _write_component(model_dir, checkpoint_name=checkpoint_name)
-
-    with pytest.raises(ValueError, match=message):
-        sound_tokenizer.Cosmos3SoundTokenizer.from_config(
-            SimpleNamespace(model=str(model_dir), custom_pipeline_args={}, dtype=torch.float32)
-        )
-
-
-def test_component_config_precedence_and_conflict_detection(tmp_path, monkeypatch: pytest.MonkeyPatch) -> None:
-    from vllm_omni.diffusion.models.cosmos3 import sound_tokenizer
-
-    component_config = {
-        "sampling_rate": 48000,
-        "dec_out_channels": 2,
-        "vocoder_input_dim": 64,
-        "hop_size": 1920,
-    }
-    model_dir = tmp_path / "model"
-    _write_component(model_dir, component_config, DIFFUSERS_SOUND_TOKENIZER_CHECKPOINT_NAME)
-    created = {}
-    _patch_fake_avae(monkeypatch, created)
-
-    tokenizer = sound_tokenizer.Cosmos3SoundTokenizer.from_config(
-        SimpleNamespace(
-            model=str(model_dir),
-            custom_pipeline_args={
-                "sound_normalize_latents": True,
-                "sound_normalization_type": "tanh",
-                "sound_tanh_input_scale": 2.0,
-            },
-            model_config={
-                "sound_tokenizer": {
-                    "sample_rate": 32000,
-                    "audio_channels": 1,
-                    "io_channels": 3,
-                    "hop_size": 800,
-                    "normalize_latents": False,
-                    "normalization_type": "none",
-                }
-            },
-            dtype=torch.float32,
-        )
-    )
-
-    assert (created["sample_rate"], created["audio_channels"], created["io_channels"], created["hop_size"]) == (
-        48000,
-        2,
-        64,
-        1920,
-    )
-    assert (created["normalize_latents"], created["normalization_type"], created["tanh_input_scale"]) == (
-        True,
-        "tanh",
-        2.0,
-    )
-    assert (tokenizer.sample_rate, tokenizer.latent_ch, tokenizer.hop_size) == (48000, 64, 1920)
-
-    with pytest.raises(ValueError, match=r"sample_rate.*48000.*32000"):
-        sound_tokenizer.Cosmos3SoundTokenizer.from_config(
-            SimpleNamespace(
-                model=str(model_dir),
-                custom_pipeline_args={"sound_sample_rate": 32000},
-                dtype=torch.float32,
-            )
-        )
-
-
-def test_avae_uses_diffusers_decoder_state_dict_layout(tmp_path) -> None:
-    from safetensors.torch import save_file
-
-    from vllm_omni.diffusion.models.cosmos3.audio_tokenizer import avae
-
-    config = {
-        "sampling_rate": 8000,
-        "hop_size": 2,
-        "dec_dim": 4,
-        "dec_c_mults": [1],
-        "dec_strides": [2],
-        "dec_out_channels": 1,
-        "vocoder_input_dim": 2,
-        "normalization_type": "none",
-    }
-    checkpoint_path = tmp_path / DIFFUSERS_SOUND_TOKENIZER_CHECKPOINT_NAME
-    config_path = tmp_path / "config.json"
-    config_path.write_text(json.dumps(config), encoding="utf-8")
-
-    decoder = avae.OobleckDecoder(4, 2, 1, [2], [1])
-    save_file({f"decoder.{key}": value for key, value in decoder.state_dict().items()}, str(checkpoint_path))
-
-    tokenizer = avae.Cosmos3AVAEAudioTokenizer(
-        checkpoint_path=checkpoint_path,
-        config_path=config_path,
-        dtype=torch.float32,
-        device="cpu",
-    )
-
-    keys = set(tokenizer.state_dict())
-    assert {"decoder.conv1.weight_g", "decoder.block.0.conv_t1.weight_g", "decoder.conv2.weight_g"} <= keys
-    assert not any(key.startswith(("decoder.layers.", "model.decoder.")) for key in keys)
-    assert tokenizer.decode(torch.zeros(1, 2, 3)).shape == (1, 1, 6)
-    with pytest.raises(NotImplementedError, match="decoder-only"):
-        tokenizer.encode(torch.zeros(1, 1, 6))
diff --git a/tests/diffusion/models/cosmos3/test_cosmos3_transformer.py b/tests/diffusion/models/cosmos3/test_cosmos3_transformer.py
index efe73e2d41d..c37f1186873 100644
--- a/tests/diffusion/models/cosmos3/test_cosmos3_transformer.py
+++ b/tests/diffusion/models/cosmos3/test_cosmos3_transformer.py
@@ -29,10 +29,8 @@ def _tiny_cosmos3_config(**overrides):
     return config
 
 
-def test_mrope_position_ids_cover_text_video_sound_and_action() -> None:
+def test_mrope_position_ids_cover_text_and_video() -> None:
     from vllm_omni.diffusion.models.cosmos3.transformer_cosmos3 import (
-        compute_mrope_position_ids_action,
-        compute_mrope_position_ids_sound,
         compute_mrope_position_ids_text,
         compute_mrope_position_ids_vision,
     )
@@ -58,14 +56,6 @@ def test_mrope_position_ids_cover_text_video_sound_and_action() -> None:
     torch.testing.assert_close(modulated_ids[0], torch.tensor([10.0, 12.0]))
     assert modulated_offset == 13
 
-    sound_ids, sound_offset = compute_mrope_position_ids_sound(3, temporal_offset=10, sound_latent_fps=25.0)
-    torch.testing.assert_close(sound_ids[0], torch.tensor([10.0, 10.96, 11.92]))
-    assert sound_offset == 12
-
-    action_ids, action_offset = compute_mrope_position_ids_action(3, temporal_offset=10, action_fps=None)
-    assert action_ids.tolist() == [[11, 12, 13], [0, 0, 0], [0, 0, 0]]
-    assert action_offset == 14
-
 
 @pytest.mark.parametrize(
     ("key", "value"),
@@ -111,110 +101,24 @@ def test_transformer_sharding_offload_and_patch_round_trip_contracts() -> None:
     torch.testing.assert_close(model.unpatchify(model.patchify(latents, t=1, h=3, w=5), t=1, h=3, w=5), latents)
 
 
-def test_sound_and_action_modules_follow_config() -> None:
-    from vllm_omni.diffusion.models.cosmos3.transformer_cosmos3 import Cosmos3VFMTransformer
-
-    tiny = _tiny_cosmos3_config()
-    no_modal = Cosmos3VFMTransformer(SimpleNamespace(tf_model_config=tiny, dtype=torch.float32))
-    with_sound = Cosmos3VFMTransformer(
-        SimpleNamespace(
-            tf_model_config={**tiny, "sound_gen": True},
-            model_config={"sound_tokenizer": {"io_channels": 5, "sample_rate": 32000, "hop_size": 800}},
-            custom_pipeline_args={},
-            dtype=torch.float32,
-        )
-    )
-    with_action = Cosmos3VFMTransformer(
-        SimpleNamespace(
-            tf_model_config={**tiny, "action_gen": True, "max_action_dim": 6, "num_embodiment_domains": 9},
-            dtype=torch.float32,
-        )
-    )
-
-    assert no_modal.sound_gen is False
-    assert no_modal.action_gen is False
-    assert not hasattr(no_modal, "sound2llm")
-    assert not hasattr(no_modal, "action2llm")
-    assert with_sound.sound_dim == 5
-    assert with_sound.sound_latent_fps == 40.0
-    assert with_sound.sound2llm.in_features == 5
-    assert with_action.action_dim == 6
-    assert with_action.action2llm.num_domains == 9
-
-
-def test_sound_and_action_pack_unpack_validate_shapes() -> None:
+def test_forward_returns_video_prediction() -> None:
     from vllm_omni.diffusion.models.cosmos3.transformer_cosmos3 import Cosmos3VFMTransformer
 
-    model = object.__new__(Cosmos3VFMTransformer)
-    nn.Module.__init__(model)
-    model.sound_dim = 3
-    model.action_dim = 3
-
-    sound = torch.arange(2 * 3 * 4, dtype=torch.float32).reshape(2, 3, 4)
-    action = torch.arange(2 * 4 * 3, dtype=torch.float32).reshape(2, 4, 3)
-    torch.testing.assert_close(model.unpack_sound(model.pack_sound(sound)), sound)
-    torch.testing.assert_close(model.unpack_action(model.pack_action(action)), action)
-
-    with pytest.raises(ValueError, match="channel mismatch"):
-        model.pack_sound(torch.zeros(1, 4, 2))
-    with pytest.raises(ValueError, match="dimension mismatch"):
-        model.pack_action(torch.zeros(1, 2, 4))
-
-
-@pytest.mark.parametrize(
-    ("config", "extra_kwargs", "expected_shapes"),
-    [
-        (
-            _tiny_cosmos3_config(sound_gen=True, sound_dim=3, sound_latent_fps=24.0),
-            {"sound_latents": torch.zeros(1, 3, 4)},
-            [(1, 2, 1, 2, 2), (1, 3, 4)],
-        ),
-        (
-            _tiny_cosmos3_config(action_gen=True, max_action_dim=3, num_embodiment_domains=4),
-            {"action_latents": torch.zeros(1, 5, 3), "action_domain_ids": torch.tensor([2])},
-            [(1, 2, 1, 2, 2), (1, 5, 3)],
-        ),
-    ],
-)
-def test_forward_returns_video_plus_optional_modality_predictions(config, extra_kwargs, expected_shapes) -> None:
-    from vllm_omni.diffusion.models.cosmos3.transformer_cosmos3 import Cosmos3VFMTransformer
-
-    output = Cosmos3VFMTransformer(SimpleNamespace(tf_model_config=config, dtype=torch.float32))(
+    output = Cosmos3VFMTransformer(
+        SimpleNamespace(tf_model_config=_tiny_cosmos3_config(), dtype=torch.float32)
+    )(
         hidden_states=torch.zeros(1, 2, 1, 2, 2),
         timestep=torch.tensor([1.0]),
         text_ids=torch.tensor([[1, 2]], dtype=torch.long),
         text_mask=torch.ones(1, 2, dtype=torch.long),
         video_shape=(1, 2, 2),
         fps=24.0,
-        action_noisy_mask=torch.ones(1, 5, 1),
-        **extra_kwargs,
     )
 
-    assert isinstance(output, tuple)
-    assert [tuple(tensor.shape) for tensor in output] == expected_shapes
+    assert tuple(output.shape) == (1, 2, 1, 2, 2)
 
 
-def test_forward_with_sound_ulysses_error_mentions_combined_sequence(monkeypatch: pytest.MonkeyPatch) -> None:
-    import vllm_omni.diffusion.models.cosmos3.transformer_cosmos3 as cosmos3_module
-
-    model = cosmos3_module.Cosmos3VFMTransformer(
-        SimpleNamespace(tf_model_config=_tiny_cosmos3_config(sound_gen=True, sound_dim=3), dtype=torch.float32)
-    )
-    monkeypatch.setattr(cosmos3_module, "_get_ulysses_state", lambda: (2, 0, None))
-
-    with pytest.raises(ValueError, match=r"GEN sequence length \(3 = video tokens 2 \+ sound tokens 1\)"):
-        model(
-            hidden_states=torch.zeros(1, 2, 1, 1, 2),
-            timestep=torch.tensor([1.0]),
-            text_ids=torch.tensor([[1, 2]], dtype=torch.long),
-            text_mask=torch.ones(1, 2, dtype=torch.long),
-            video_shape=(1, 1, 2),
-            fps=24.0,
-            sound_latents=torch.zeros(1, 3, 1),
-        )
-
-
-def test_compute_rope_freqs_places_text_video_action_and_sound_positions() -> None:
+def test_compute_rope_freqs_places_text_and_video_positions() -> None:
     from vllm_omni.diffusion.models.cosmos3.transformer_cosmos3 import Cosmos3VFMTransformer
 
     class FakeRotary:
@@ -234,8 +138,6 @@ def __call__(self, x, position_ids):
     model.temporal_modality_margin = 100
     model.base_fps = 24.0
     model.temporal_compression_factor = 4
-    model.temporal_compression_factor_sound = 1
-    model.sound_latent_fps = 25.0
     model.enable_fps_modulation = False
 
     freqs_und, freqs_gen = model._compute_rope_freqs(
@@ -252,21 +154,3 @@ def __call__(self, x, position_ids):
     assert vision_pos[0, 0].tolist() == [102, 103]
     assert freqs_und[0].shape == (2, 3, 1, 4)
     assert freqs_gen[0].shape == (2, 2, 1, 4)
-
-    rotary.position_ids.clear()
-    model._compute_rope_freqs(
-        text_mask=torch.tensor([[1, 1]], dtype=torch.long),
-        t=2,
-        hp=1,
-        wp=1,
-        fps=24.0,
-        device=torch.device("cpu"),
-        dtype=torch.float32,
-        t_action=2,
-        action_start_frame_offset=1,
-        t_sound=1,
-    )
-
-    _, gen_pos = rotary.position_ids
-    assert gen_pos.shape == (3, 1, 5)
-    assert gen_pos[0, 0].tolist() == [102, 103, 103, 104, 102]
diff --git a/tests/entrypoints/openai_api/test_video_server.py b/tests/entrypoints/openai_api/test_video_server.py
index 57a09872397..d982cce7f35 100644
--- a/tests/entrypoints/openai_api/test_video_server.py
+++ b/tests/entrypoints/openai_api/test_video_server.py
@@ -39,17 +39,11 @@ class MockVideoResult:
     def __init__(
         self,
         videos,
-        audios=None,
-        sample_rate=None,
         custom_output=None,
         stage_durations=None,
         peak_memory_mb=0.0,
     ):
         self.multimodal_output = {"video": videos}
-        if audios is not None:
-            self.multimodal_output["audio"] = audios
-        if sample_rate is not None:
-            self.multimodal_output["audio_sample_rate"] = sample_rate
         self._custom_output = custom_output or {}
         self.stage_durations = stage_durations or {}
         self.peak_memory_mb = peak_memory_mb
@@ -179,49 +173,10 @@ def test_async_video_generation_bypasses_base64(test_client, mocker: MockerFixtu
     mock_base64.assert_not_called()
 
 
-def test_async_video_generation_with_audio_bypasses_base64(test_client, mocker: MockerFixture):
-    """Regression test: Ensure async video generation passes audio through
-    generate_video_bytes without bouncing through base64 encoding."""
-    mock_encode = mocker.patch(
-        "vllm_omni.entrypoints.openai.serving_video._encode_video_bytes",
-        return_value=b"raw-mp4-bytes",
-    )
-
-    mock_base64 = mocker.patch(
-        "vllm_omni.entrypoints.openai.serving_video.encode_video_base64",
-        side_effect=RuntimeError("Regression: async video path should not base64 encode"),
-    )
-
-    engine = test_client.app.state.openai_serving_video._engine_client
-
-    async def _generate(prompt, request_id, sampling_params_list):
-        engine.captured_prompt = prompt
-        engine.captured_sampling_params_list = sampling_params_list
-        yield MockVideoResult([object()], audios=[object()], sample_rate=48000)
-
-    engine.generate = _generate
-
-    response = test_client.post(
-        "/v1/videos",
-        data={"prompt": "A base64 test with audio."},
-    )
-    assert response.status_code == 200
-    video_id = response.json()["id"]
-
-    _wait_for_status(test_client, video_id, VideoGenerationStatus.COMPLETED.value)
-    mock_base64.assert_not_called()
-
-    mock_encode.assert_called_once()
-    kwargs = mock_encode.call_args.kwargs
-    assert "audio" in kwargs
-    assert kwargs["audio"] is not None
-    assert kwargs["audio_sample_rate"] == 48000
-
-
 def test_t2v_video_generation_form(test_client, mocker: MockerFixture):
     fps_values = []
 
-    def _fake_encode(video, fps, audio=None, audio_sample_rate=None, **kwargs):
+    def _fake_encode(video, fps, **kwargs):
         fps_values.append(fps)
         return b"fake-video"
 
@@ -336,7 +291,7 @@ def test_i2v_video_generation_with_image_reference_form(test_client, mocker: Moc
 def test_seconds_defaults_fps_and_frames(test_client, mocker: MockerFixture):
     fps_values = []
 
-    def _fake_encode(video, fps, audio=None, audio_sample_rate=None, **kwargs):
+    def _fake_encode(video, fps, **kwargs):
         fps_values.append(fps)
         return b"fake-video"
 
@@ -400,8 +355,6 @@ def test_sampling_params_pass_through(test_client, mocker: MockerFixture):
             "true_cfg_scale": "4.0",
             "boundary_ratio": "0.7",
             "flow_shift": "0.25",
-            "generate_sound": "true",
-            "sound_duration": "2.5",
         },
     )
 
@@ -416,8 +369,6 @@ def test_sampling_params_pass_through(test_client, mocker: MockerFixture):
     assert captured.true_cfg_scale == 4.0
     assert captured.boundary_ratio == 0.7
     assert captured.extra_args["flow_shift"] == 0.25
-    assert captured.extra_args["generate_sound"] is True
-    assert captured.extra_args["sound_duration"] == 2.5
 
 
 def test_frame_interpolation_params_pass_to_diffusion_sampling_params(test_client, mocker: MockerFixture):
@@ -562,47 +513,6 @@ def _fake_encode(video, fps, **kwargs):
     assert fps_values == [16]
 
 
-def test_audio_sample_rate_comes_from_model_config(test_client, mocker: MockerFixture):
-    audio_sample_rates = []
-
-    def _fake_encode(video, fps, audio=None, audio_sample_rate=None, video_codec_options=None):
-        del video, fps, audio, video_codec_options
-        audio_sample_rates.append(audio_sample_rate)
-        return b"fake-video"
-
-    engine = test_client.app.state.openai_serving_video._engine_client
-    engine.model_config = SimpleNamespace(
-        hf_config=SimpleNamespace(
-            vocoder=SimpleNamespace(
-                config=SimpleNamespace(output_sampling_rate=16000),
-            ),
-        ),
-    )
-
-    async def _generate(prompt, request_id, sampling_params_list):
-        engine.captured_prompt = prompt
-        engine.captured_sampling_params_list = sampling_params_list
-        import numpy as np
-
-        yield MockVideoResult([np.zeros((1, 64, 64, 3), dtype=np.uint8)], audios=[object()])
-
-    engine.generate = _generate
-
-    mocker.patch(
-        "vllm_omni.entrypoints.openai.serving_video._encode_video_bytes",
-        side_effect=_fake_encode,
-    )
-    response = test_client.post(
-        "/v1/videos",
-        data={"prompt": "video with audio"},
-    )
-
-    assert response.status_code == 200
-    video_id = response.json()["id"]
-    _wait_for_status(test_client, video_id, VideoGenerationStatus.COMPLETED.value)
-    assert audio_sample_rates == [16000]
-
-
 def test_video_job_persists_profiler_metadata(test_client, mocker: MockerFixture):
     engine = test_client.app.state.openai_serving_video._engine_client
 
@@ -628,109 +538,6 @@ async def _generate(prompt, request_id, sampling_params_list):
 
     assert completed["stage_durations"] == {"diffuse": 2.5, "vae.decode": 0.3}
     assert completed["peak_memory_mb"] == 4096.5
-    assert completed["action"] is None
-
-
-def test_video_generation_response_exposes_action_payload(mocker: MockerFixture):
-    engine = FakeAsyncOmni()
-    handler = OmniOpenAIServingVideo.for_diffusion(
-        diffusion_engine=engine,
-        model_name="Cosmos3-8B-UVA",
-    )
-
-    async def _generate(prompt, request_id, sampling_params_list):
-        del prompt, request_id, sampling_params_list
-        yield MockVideoResult(
-            [object()],
-            custom_output={
-                "action": np.array([[[1.5, 2.5], [3.5, 4.5]]], dtype=np.float32),
-                "raw_action_dim": 2,
-                "action_mode": "policy",
-                "domain_id": 7,
-            },
-        )
-
-    engine.generate = _generate
-    mocker.patch(
-        "vllm_omni.entrypoints.openai.serving_video.encode_video_base64",
-        return_value="encoded-video",
-    )
-
-    response = asyncio.run(
-        handler.generate_videos(
-            VideoGenerationRequest(prompt="predict actions"),
-            "action-json",
-        )
-    )
-
-    action = response.data[0].action
-    assert action is not None
-    assert action.data == [[1.5, 2.5], [3.5, 4.5]]
-    assert action.shape == [2, 2]
-    assert action.dtype == "float32"
-    assert action.raw_action_dim == 2
-    assert action.action_mode == "policy"
-    assert action.domain_id == 7
-    assert response.model_dump(mode="json")["data"][0]["action"]["data"] == [[1.5, 2.5], [3.5, 4.5]]
-
-
-def test_video_job_persists_action_metadata(test_client, mocker: MockerFixture):
-    engine = test_client.app.state.openai_serving_video._engine_client
-
-    async def _generate(prompt, request_id, sampling_params_list):
-        engine.captured_prompt = prompt
-        engine.captured_sampling_params_list = sampling_params_list
-        yield MockVideoResult(
-            [object()],
-            custom_output={
-                "action": np.array([[[1.0, 2.0], [3.0, 4.0]]], dtype=np.float32),
-                "raw_action_dim": 2,
-                "action_mode": "policy",
-                "domain_id": 7,
-            },
-        )
-
-    engine.generate = _generate
-    mocker.patch(
-        "vllm_omni.entrypoints.openai.serving_video._encode_video_bytes",
-        return_value=b"fake-video",
-    )
-
-    response = test_client.post("/v1/videos", data={"prompt": "profile me"})
-    assert response.status_code == 200
-    video_id = response.json()["id"]
-    completed = _wait_for_status(test_client, video_id, VideoGenerationStatus.COMPLETED.value)
-
-    expected_action = {
-        "data": [[1.0, 2.0], [3.0, 4.0]],
-        "shape": [2, 2],
-        "dtype": "float32",
-        "raw_action_dim": 2,
-        "action_mode": "policy",
-        "domain_id": 7,
-    }
-    assert completed["action"] == expected_action
-
-    listed = test_client.get("/v1/videos").json()
-    assert listed["data"][0]["action"] == expected_action
-
-
-def test_action_extraction_accepts_unbatched_action():
-    result = MockVideoResult(
-        [object()],
-        custom_output={
-            "action": np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32),
-            "raw_action_dim": 2,
-            "action_mode": "policy",
-            "domain_id": 7,
-        },
-    )
-
-    actions = OmniOpenAIServingVideo._extract_action_outputs(result, expected_count=1)
-
-    assert actions[0] is not None
-    assert actions[0].data == [[1.0, 2.0], [3.0, 4.0]]
-    assert actions[0].shape == [2, 2]
 
 
 def test_missing_handler_returns_503():
@@ -864,9 +671,6 @@ def test_invalid_uploaded_input_reference_returns_400(test_client):
 def test_video_request_validation():
     req = VideoGenerationRequest(prompt="test")
     assert req.prompt == "test"
-    assert req.generate_sound is False
-    assert req.sound_duration is None
-    assert VideoGenerationRequest(prompt="test", generate_sound=True, sound_duration=1.5).generate_sound is True
     with pytest.raises(ValueError):
         VideoGenerationRequest(prompt="test", size="invalid")
 
@@ -879,8 +683,6 @@ def test_video_request_validation():
         VideoGenerationRequest(prompt="test", frame_interpolation_exp=0)
     with pytest.raises(ValueError):
         VideoGenerationRequest(prompt="test", frame_interpolation_scale=0)
-    with pytest.raises(ValueError):
-        VideoGenerationRequest(prompt="test", sound_duration=0)
 
 
 def test_list_videos_supports_order_after_and_limit(test_client, mocker: MockerFixture):
diff --git a/vllm_omni/diffusion/models/cosmos3/action.py b/vllm_omni/diffusion/models/cosmos3/action.py
deleted file mode 100644
index e2572bbb733..00000000000
--- a/vllm_omni/diffusion/models/cosmos3/action.py
+++ /dev/null
@@ -1,217 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""Action-token helpers for Cosmos3 UVA/action generation."""
-
-from __future__ import annotations
-
-import json
-from pathlib import Path
-from typing import Any
-
-import numpy as np
-import torch
-
-ACTION_MODE_POLICY = "policy"
-ACTION_MODE_FORWARD_DYNAMICS = "forward_dynamics"
-ACTION_MODE_INVERSE_DYNAMICS = "inverse_dynamics"
-ACTION_MODES = {
-    ACTION_MODE_POLICY,
-    ACTION_MODE_FORWARD_DYNAMICS,
-    ACTION_MODE_INVERSE_DYNAMICS,
-}
-
-
-EMBODIMENT_TO_DOMAIN_ID: dict[str, int] = {
-    "no_action": 0,
-    "av": 1,
-    "camera_pose": 2,
-    "hand_pose": 3,
-    "pusht": 4,
-    "libero": 5,
-    "umi": 6,
-    "bridge_orig_lerobot": 7,
-    "droid_lerobot": 8,
-    "robomind-franka": 8,
-    "galbot": 9,
-    "robomind-franka-dual": 12,
-    "robomind-ur": 13,
-    "agibotworld": 15,
-    "agibot_gear_gripper": 15,
-    "agibot_gear_gripper_ext": 15,
-    "fractal": 20,
-}
-
-
-VIDEO_RES_SIZE_INFO: dict[str, dict[str, tuple[int, int]]] = {
-    "256": {
-        "1,1": (256, 256),
-        "4,3": (320, 256),
-        "3,4": (256, 320),
-        "16,9": (320, 192),
-        "9,16": (192, 320),
-    },
-    "480": {
-        "1,1": (640, 640),
-        "4,3": (736, 544),
-        "3,4": (544, 736),
-        "16,9": (832, 480),
-        "9,16": (480, 832),
-    },
-    "704": {
-        "1,1": (960, 960),
-        "4,3": (1088, 832),
-        "3,4": (832, 1088),
-        "16,9": (1280, 704),
-        "9,16": (704, 1280),
-    },
-    "720": {
-        "1,1": (960, 960),
-        "4,3": (1104, 832),
-        "3,4": (832, 1104),
-        "16,9": (1280, 720),
-        "9,16": (720, 1280),
-    },
-}
-
-
-def normalize_action_mode(mode: Any) -> str | None:
-    if mode is None:
-        return None
-    normalized = str(mode).strip().lower()
-    if not normalized:
-        return None
-    if normalized not in ACTION_MODES:
-        raise ValueError(f"Unsupported Cosmos3 action_mode={mode!r}; expected one of {sorted(ACTION_MODES)}.")
-    return normalized
-
-
-def resolve_domain_id(
-    *,
-    domain_id: Any = None,
-    domain_name: Any = None,
-    require_explicit: bool = False,
-) -> int:
-    if domain_id is not None:
-        resolved = int(domain_id)
-        if resolved < 0:
-            raise ValueError(f"Cosmos3 domain_id must be non-negative, got {resolved}.")
-        return resolved
-
-    if domain_name is None or str(domain_name).strip() == "":
-        if require_explicit:
-            raise ValueError(
-                "Cosmos3 action generation requires extra_args['domain_id'] or non-empty extra_args['domain_name']."
-            )
-        return 0
-
-    key = str(domain_name).strip().lower()
-    if key not in EMBODIMENT_TO_DOMAIN_ID:
-        raise ValueError(
-            f"Unknown Cosmos3 action domain_name={domain_name!r}; "
-            f"expected one of {sorted(EMBODIMENT_TO_DOMAIN_ID)} or pass domain_id directly."
-        )
-    return EMBODIMENT_TO_DOMAIN_ID[key]
-
-
-def action_condition_indexes(mode: str, action_length: int) -> list[int]:
-    mode = normalize_action_mode(mode)
-    if mode == ACTION_MODE_FORWARD_DYNAMICS:
-        return list(range(action_length))
-    if mode in {ACTION_MODE_POLICY, ACTION_MODE_INVERSE_DYNAMICS}:
-        return []
-    raise AssertionError(f"Unexpected action mode: {mode!r}")
-
-
-def vision_condition_indexes(mode: str, video_length: int, temporal_compression_factor: int) -> list[int]:
-    mode = normalize_action_mode(mode)
-    latent_frames = (video_length - 1) // temporal_compression_factor + 1
-    if mode in {ACTION_MODE_POLICY, ACTION_MODE_FORWARD_DYNAMICS}:
-        return [0]
-    if mode == ACTION_MODE_INVERSE_DYNAMICS:
-        return list(range(latent_frames))
-    raise AssertionError(f"Unexpected action mode: {mode!r}")
-
-
-def action_start_frame_offset(mode: str, action_length: int, video_length: int) -> int:
-    del mode
-    if action_length == video_length - 1:
-        return 1
-    if action_length == video_length:
-        return 0
-    raise ValueError(
-        "Cosmos3 action_chunk_size must equal num_frames - 1 or num_frames; "
-        f"got action_chunk_size={action_length}, num_frames={video_length}."
-    )
-
-
-def build_action_condition_mask(
-    mode: str,
-    action_length: int,
-    *,
-    device: torch.device,
-    dtype: torch.dtype,
-) -> torch.Tensor:
-    mask = torch.zeros(1, action_length, 1, device=device, dtype=dtype)
-    for idx in action_condition_indexes(mode, action_length):
-        mask[:, idx, :] = 1.0
-    return mask
-
-
-def build_vision_condition_mask(
-    mode: str,
-    video_length: int,
-    temporal_compression_factor: int,
-    *,
-    device: torch.device,
-    dtype: torch.dtype,
-) -> torch.Tensor:
-    latent_frames = (video_length - 1) // temporal_compression_factor + 1
-    mask = torch.zeros(1, 1, latent_frames, 1, 1, device=device, dtype=dtype)
-    for idx in vision_condition_indexes(mode, video_length, temporal_compression_factor):
-        mask[:, :, idx, :, :] = 1.0
-    return mask
-
-
-def pad_action_to_dim(action: torch.Tensor, action_dim: int) -> torch.Tensor:
-    if action.shape[-1] > action_dim:
-        raise ValueError(f"Cosmos3 action dimension {action.shape[-1]} exceeds model action_dim={action_dim}.")
-    if action.shape[-1] == action_dim:
-        return action
-    padding = torch.zeros(*action.shape[:-1], action_dim - action.shape[-1], dtype=action.dtype, device=action.device)
-    return torch.cat([action, padding], dim=-1)
-
-
-def load_action_tensor(action: Any = None, action_path: str | Path | None = None) -> torch.Tensor:
-    if action is None and action_path is None:
-        raise ValueError(
-            "Cosmos3 forward_dynamics action mode requires extra_args['action'] or extra_args['action_path']."
-        )
-    if action is None:
-        action = json.loads(Path(str(action_path)).read_text())
-    if isinstance(action, torch.Tensor):
-        tensor = action.detach().to(dtype=torch.float32)
-    else:
-        tensor = torch.as_tensor(np.asarray(action), dtype=torch.float32)
-    if tensor.ndim == 3 and tensor.shape[0] == 1:
-        tensor = tensor.squeeze(0)
-    if tensor.ndim != 2:
-        raise ValueError(f"Cosmos3 action must have shape [T, D], got {tuple(tensor.shape)}.")
-    return tensor
-
-
-def find_closest_target_size(h: int, w: int, resolution: str | int) -> tuple[int, int]:
-    key = str(resolution)
-    if key not in VIDEO_RES_SIZE_INFO:
-        raise ValueError(
-            f"Unknown Cosmos3 action resolution={resolution!r}; expected one of {sorted(VIDEO_RES_SIZE_INFO)}."
-        )
-    input_ratio = h / w
-    best_size = None
-    best_diff = float("inf")
-    for cand_w, cand_h in VIDEO_RES_SIZE_INFO[key].values():
-        diff = abs(input_ratio - cand_h / cand_w)
-        if diff < best_diff:
-            best_diff = diff
-            best_size = (cand_w, cand_h)
-    assert best_size is not None
-    return best_size
diff --git a/vllm_omni/diffusion/models/cosmos3/audio_tokenizer/__init__.py b/vllm_omni/diffusion/models/cosmos3/audio_tokenizer/__init__.py
deleted file mode 100644
index cfb794705ba..00000000000
--- a/vllm_omni/diffusion/models/cosmos3/audio_tokenizer/__init__.py
+++ /dev/null
@@ -1,6 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-from .avae import Cosmos3AVAEAudioTokenizer
-
-__all__ = ["Cosmos3AVAEAudioTokenizer"]
diff --git a/vllm_omni/diffusion/models/cosmos3/audio_tokenizer/avae.py b/vllm_omni/diffusion/models/cosmos3/audio_tokenizer/avae.py
deleted file mode 100644
index 7f04177c2d1..00000000000
--- a/vllm_omni/diffusion/models/cosmos3/audio_tokenizer/avae.py
+++ /dev/null
@@ -1,321 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""Diffusers-format AVAE audio tokenizer used by Cosmos3 sound generation."""
-
-from __future__ import annotations
-
-import json
-import math
-from pathlib import Path
-from typing import Any
-
-import torch
-from torch import nn
-from torch.nn.utils import weight_norm
-from vllm.logger import init_logger
-
-from vllm_omni.diffusion.models.progress_bar import _is_rank_zero
-
-logger = init_logger(__name__)
-
-
-def _default_avae_config(
-    *,
-    sample_rate: int,
-    audio_channels: int,
-    io_channels: int,
-    hop_size: int,
-) -> dict[str, Any]:
-    return {
-        "sampling_rate": sample_rate,
-        "hop_size": hop_size,
-        "dec_dim": 320,
-        "dec_c_mults": [1, 2, 4, 8, 16],
-        "dec_strides": [2, 4, 5, 6, 8],
-        "dec_out_channels": audio_channels,
-        "vocoder_input_dim": io_channels,
-        "normalization_type": "none",
-        "normalize_latents": False,
-        "tanh_input_scale": 1.5,
-        "tanh_output_scale": 3.5,
-        "tanh_clamp": 0.995,
-    }
-
-
-def _config_get(config: dict[str, Any], *keys: str, default: Any = None) -> Any:
-    for key in keys:
-        value = config.get(key)
-        if value is not None:
-            return value
-    return default
-
-
-def _load_config(
-    config_path: str | Path | None,
-    *,
-    sample_rate: int,
-    audio_channels: int,
-    io_channels: int,
-    hop_size: int,
-) -> dict[str, Any]:
-    if config_path:
-        with open(config_path, encoding="utf-8") as f:
-            config = json.load(f)
-        if not isinstance(config, dict):
-            raise TypeError(f"Cosmos3 AVAE config must be a JSON object, got {type(config)!r}.")
-        return config
-    return _default_avae_config(
-        sample_rate=sample_rate,
-        audio_channels=audio_channels,
-        io_channels=io_channels,
-        hop_size=hop_size,
-    )
-
-
-def _load_checkpoint(path: str | Path, map_location: torch.device | str) -> dict[str, torch.Tensor]:
-    path = Path(path)
-    if path.suffix == ".safetensors":
-        try:
-            from safetensors.torch import load_file
-        except ImportError as exc:
-            raise ImportError("Loading AVAE .safetensors checkpoints requires safetensors.") from exc
-        checkpoint = load_file(str(path), device=str(map_location))
-    else:
-        checkpoint = torch.load(path, map_location=map_location)
-
-    if not isinstance(checkpoint, dict):
-        raise TypeError(f"AVAE checkpoint must be a flat state dict, got {type(checkpoint)!r}.")
-    if not all(isinstance(value, torch.Tensor) for value in checkpoint.values()):
-        raise TypeError("AVAE checkpoint must be a flat tensor state dict.")
-    return checkpoint
-
-
-def _validate_diffusers_state_dict(state_dict: dict[str, torch.Tensor]) -> None:
-    if not state_dict:
-        raise RuntimeError("AVAE checkpoint is empty.")
-
-    if not any(key.startswith("decoder.") for key in state_dict):
-        raise RuntimeError("Cosmos3 AVAE checkpoint must contain diffusers-format decoder.* keys.")
-
-
-class Snake1d(nn.Module):
-    """One-dimensional Snake activation matching diffusers' Oobleck layout."""
-
-    def __init__(self, hidden_dim: int, logscale: bool = True) -> None:
-        super().__init__()
-        self.alpha = nn.Parameter(torch.zeros(1, hidden_dim, 1))
-        self.beta = nn.Parameter(torch.zeros(1, hidden_dim, 1))
-        self.logscale = logscale
-
-    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
-        shape = hidden_states.shape
-        alpha = torch.exp(self.alpha) if self.logscale else self.alpha
-        beta = torch.exp(self.beta) if self.logscale else self.beta
-        hidden_states = hidden_states.reshape(shape[0], shape[1], -1)
-        hidden_states = hidden_states + (beta + 1e-9).reciprocal() * torch.sin(alpha * hidden_states).pow(2)
-        return hidden_states.reshape(shape)
-
-
-class OobleckResidualUnit(nn.Module):
-    """Residual unit used by the diffusers Oobleck decoder."""
-
-    def __init__(self, dimension: int = 16, dilation: int = 1) -> None:
-        super().__init__()
-        pad = ((7 - 1) * dilation) // 2
-        self.snake1 = Snake1d(dimension)
-        self.conv1 = weight_norm(nn.Conv1d(dimension, dimension, kernel_size=7, dilation=dilation, padding=pad))
-        self.snake2 = Snake1d(dimension)
-        self.conv2 = weight_norm(nn.Conv1d(dimension, dimension, kernel_size=1))
-
-    def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
-        output_tensor = self.conv1(self.snake1(hidden_state))
-        output_tensor = self.conv2(self.snake2(output_tensor))
-        padding = (hidden_state.shape[-1] - output_tensor.shape[-1]) // 2
-        if padding > 0:
-            hidden_state = hidden_state[..., padding:-padding]
-        return hidden_state + output_tensor
-
-
-class OobleckDecoderBlock(nn.Module):
-    """Decoder block used by the diffusers Oobleck decoder."""
-
-    def __init__(self, input_dim: int, output_dim: int, stride: int = 1, output_padding: int = 0) -> None:
-        super().__init__()
-        self.snake1 = Snake1d(input_dim)
-        self.conv_t1 = weight_norm(
-            nn.ConvTranspose1d(
-                input_dim,
-                output_dim,
-                kernel_size=2 * stride,
-                stride=stride,
-                padding=math.ceil(stride / 2),
-                output_padding=output_padding,
-            )
-        )
-        self.res_unit1 = OobleckResidualUnit(output_dim, dilation=1)
-        self.res_unit2 = OobleckResidualUnit(output_dim, dilation=3)
-        self.res_unit3 = OobleckResidualUnit(output_dim, dilation=9)
-
-    def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
-        hidden_state = self.snake1(hidden_state)
-        hidden_state = self.conv_t1(hidden_state)
-        hidden_state = self.res_unit1(hidden_state)
-        hidden_state = self.res_unit2(hidden_state)
-        return self.res_unit3(hidden_state)
-
-
-class OobleckDecoder(nn.Module):
-    """Diffusers-compatible Oobleck decoder for Cosmos3 AVAE latents."""
-
-    def __init__(
-        self,
-        channels: int,
-        input_channels: int,
-        audio_channels: int,
-        upsampling_ratios: list[int],
-        channel_multiples: list[int],
-    ) -> None:
-        super().__init__()
-        strides = upsampling_ratios
-        channel_multiples = [1] + channel_multiples
-
-        self.conv1 = weight_norm(nn.Conv1d(input_channels, channels * channel_multiples[-1], kernel_size=7, padding=3))
-
-        block = []
-        for stride_index, stride in enumerate(strides):
-            block.append(
-                OobleckDecoderBlock(
-                    input_dim=channels * channel_multiples[len(strides) - stride_index],
-                    output_dim=channels * channel_multiples[len(strides) - stride_index - 1],
-                    stride=stride,
-                    output_padding=stride % 2,
-                )
-            )
-        self.block = nn.ModuleList(block)
-        self.snake1 = Snake1d(channels)
-        self.conv2 = weight_norm(nn.Conv1d(channels, audio_channels, kernel_size=7, padding=3, bias=False))
-
-    def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
-        hidden_state = self.conv1(hidden_state)
-        for layer in self.block:
-            hidden_state = layer(hidden_state)
-        hidden_state = self.snake1(hidden_state)
-        return self.conv2(hidden_state)
-
-
-class Cosmos3AVAEAudioTokenizer(nn.Module):
-    """Decoder-only AVAE tokenizer for Cosmos3 audio latents."""
-
-    def __init__(
-        self,
-        *,
-        checkpoint_path: str | Path,
-        config_path: str | Path | None = None,
-        sample_rate: int = 48000,
-        audio_channels: int = 2,
-        io_channels: int = 64,
-        hop_size: int = 1920,
-        normalize_latents: bool = False,
-        normalization_type: str = "none",
-        tanh_input_scale: float = 1.5,
-        tanh_output_scale: float = 3.5,
-        tanh_clamp: float = 0.995,
-        dtype: torch.dtype = torch.bfloat16,
-        device: torch.device | str = "cuda",
-    ) -> None:
-        super().__init__()
-        self.dtype = dtype
-        self.device = torch.device(device)
-
-        config = _load_config(
-            config_path,
-            sample_rate=sample_rate,
-            audio_channels=audio_channels,
-            io_channels=io_channels,
-            hop_size=hop_size,
-        )
-        self.sample_rate = int(_config_get(config, "sampling_rate", "sample_rate", default=sample_rate))
-        self.audio_channels = int(
-            _config_get(
-                config,
-                "dec_out_channels",
-                "audio_channels",
-                default=2 if bool(config.get("stereo", audio_channels == 2)) else 1,
-            )
-        )
-        self.latent_ch = int(_config_get(config, "vocoder_input_dim", "io_channels", "latent_ch", default=io_channels))
-        dec_strides = [int(stride) for stride in _config_get(config, "dec_strides", default=[2, 4, 5, 6, 8])]
-        self.hop_size = int(
-            _config_get(config, "hop_size", default=math.prod(dec_strides) if dec_strides else hop_size)
-        )
-        dec_stride_product = math.prod(dec_strides)
-        if dec_stride_product != self.hop_size:
-            raise ValueError(
-                "Cosmos3 AVAE config dec_strides product must equal hop_size "
-                f"for correct latent/audio duration math: product={dec_stride_product}, hop_size={self.hop_size}."
-            )
-
-        normalization_type = str(_config_get(config, "normalization_type", default=normalization_type))
-        normalize_latents = bool(_config_get(config, "normalize_latents", default=normalize_latents))
-        if normalization_type == "none" and normalize_latents:
-            normalization_type = "tanh"
-        self.normalization_type = normalization_type
-        self.tanh_input_scale = float(_config_get(config, "tanh_input_scale", default=tanh_input_scale))
-        self.tanh_output_scale = float(_config_get(config, "tanh_output_scale", default=tanh_output_scale))
-        self.tanh_clamp = float(_config_get(config, "tanh_clamp", default=tanh_clamp))
-
-        self.decoder = OobleckDecoder(
-            channels=int(_config_get(config, "dec_dim", default=320)),
-            input_channels=self.latent_ch,
-            audio_channels=self.audio_channels,
-            upsampling_ratios=list(reversed(dec_strides)),
-            channel_multiples=list(_config_get(config, "dec_c_mults", default=[1, 2, 4, 8, 16])),
-        )
-        state_dict = _load_checkpoint(checkpoint_path, self.device)
-        _validate_diffusers_state_dict(state_dict)
-        self.load_state_dict(state_dict, strict=True)
-
-        self.eval()
-        for param in self.parameters():
-            param.requires_grad = False
-        self.to(device=self.device, dtype=self.dtype)
-        if _is_rank_zero():
-            logger.info("Loaded diffusers-format Cosmos3 AVAE checkpoint from %s", checkpoint_path)
-
-    @property
-    def temporal_compression_factor(self) -> int:
-        return self.hop_size
-
-    def get_latent_num_samples(self, num_audio_samples: int) -> int:
-        return int(num_audio_samples) // self.temporal_compression_factor
-
-    def get_audio_num_samples(self, num_latent_samples: int) -> int:
-        return int(num_latent_samples) * self.temporal_compression_factor
-
-    def _denormalize_latent(self, latent: torch.Tensor) -> torch.Tensor:
-        if self.normalization_type == "tanh":
-            in_dtype = latent.dtype
-            latent = torch.clamp(
-                latent.float() / self.tanh_output_scale,
-                -self.tanh_clamp,
-                self.tanh_clamp,
-            )
-            return (torch.atanh(latent) * self.tanh_input_scale).to(in_dtype)
-        if self.normalization_type != "none":
-            raise ValueError(f"Unsupported AVAE normalization_type={self.normalization_type!r}.")
-        return latent
-
-    @torch.no_grad()
-    def encode(self, audio: torch.Tensor, force_pad: bool = False) -> torch.Tensor:
-        del audio, force_pad
-        raise NotImplementedError("Cosmos3AVAEAudioTokenizer is decoder-only for diffusers-format sound_tokenizer/.")
-
-    @torch.no_grad()
-    def decode(self, latent: torch.Tensor) -> torch.Tensor:
-        in_dtype = latent.dtype
-        squeeze = latent.ndim == 2
-        if squeeze:
-            latent = latent.unsqueeze(0)
-        z = self._denormalize_latent(latent.to(self.device)).to(self.dtype)
-        audio = self.decoder(z).clamp(-1.0, 1.0).to(in_dtype)
-        return audio.squeeze(0) if squeeze else audio
diff --git a/vllm_omni/diffusion/models/cosmos3/pipeline_cosmos3.py b/vllm_omni/diffusion/models/cosmos3/pipeline_cosmos3.py
index f05d036c525..70ef823eb12 100644
--- a/vllm_omni/diffusion/models/cosmos3/pipeline_cosmos3.py
+++ b/vllm_omni/diffusion/models/cosmos3/pipeline_cosmos3.py
@@ -45,19 +45,6 @@
 from vllm_omni.diffusion.profiler.diffusion_pipeline_profiler import DiffusionPipelineProfilerMixin
 from vllm_omni.diffusion.request import OmniDiffusionRequest
 
-from .action import (
-    ACTION_MODE_FORWARD_DYNAMICS,
-    ACTION_MODE_INVERSE_DYNAMICS,
-    ACTION_MODE_POLICY,
-    action_start_frame_offset,
-    build_action_condition_mask,
-    build_vision_condition_mask,
-    find_closest_target_size,
-    load_action_tensor,
-    normalize_action_mode,
-    pad_action_to_dim,
-    resolve_domain_id,
-)
 from .transformer_cosmos3 import Cosmos3VFMTransformer
 
 logger = init_logger(__name__)
@@ -103,67 +90,14 @@ def get_cosmos3_pre_process_func(od_config: OmniDiffusionConfig):
     if is_guardrails_enabled(od_config):
         ensure_initialized(od_config)
 
-    def _extra_args(request: OmniDiffusionRequest) -> dict[str, Any]:
-        extra = getattr(getattr(request, "sampling_params", None), "extra_args", None)
-        return extra if isinstance(extra, dict) else {}
-
-    def _request_action_mode(request: OmniDiffusionRequest) -> str | None:
-        return normalize_action_mode(_extra_args(request).get("action_mode"))
-
-    def _set_action_size_from_image(request: OmniDiffusionRequest, image: PIL.Image.Image) -> tuple[int, int]:
-        sp = request.sampling_params
-        if sp.height is not None and sp.width is not None:
-            return int(sp.height), int(sp.width)
-
-        extra = _extra_args(request)
-        resolution = extra.get("resolution", extra.get("image_size", 480))
-        target_w, target_h = find_closest_target_size(image.height, image.width, resolution)
-        if sp.height is None:
-            sp.height = target_h
-        if sp.width is None:
-            sp.width = target_w
-        return int(sp.height), int(sp.width)
-
     def _pil_to_rgb(value: Any) -> PIL.Image.Image:
         if isinstance(value, str):
             return PIL.Image.open(value).convert("RGB")
         if isinstance(value, PIL.Image.Image):
             return value.convert("RGB")
-        raise TypeError(f"Cosmos3 action preprocessing expected PIL image or image path, got {type(value)!r}.")
-
-    def _resize_and_pad_action_image(image: PIL.Image.Image, target_h: int, target_w: int) -> PIL.Image.Image:
-        scale = min(target_w / image.width, target_h / image.height, 1.0)
-        resize_w = max(1, int(scale * image.width + 0.5))
-        resize_h = max(1, int(scale * image.height + 0.5))
-        if (resize_w, resize_h) != image.size:
-            image = image.resize((resize_w, resize_h), PIL.Image.Resampling.BICUBIC)
-
-        array = np.asarray(image)
-        pad_h = target_h - resize_h
-        pad_w = target_w - resize_w
-        if pad_h < 0 or pad_w < 0:
-            raise ValueError(
-                f"Cosmos3 action image resize exceeded target size: resized={(resize_h, resize_w)}, "
-                f"target={(target_h, target_w)}."
-            )
-        if pad_h == 0 and pad_w == 0:
-            return image
-        pad_mode = "reflect" if pad_h < resize_h and pad_w < resize_w else "edge"
-        padded = np.pad(array, ((0, pad_h), (0, pad_w), (0, 0)), mode=pad_mode)
-        return PIL.Image.fromarray(padded)
-
-    def _preprocess_action_image(image: PIL.Image.Image, target_h: int, target_w: int) -> torch.Tensor:
-        image = _resize_and_pad_action_image(image, target_h, target_w)
-        return video_processor.preprocess(image, height=target_h, width=target_w)
-
-    def _preprocess_action_video(frames: list[Any], target_h: int, target_w: int) -> torch.Tensor:
-        if not frames:
-            raise ValueError("Cosmos3 action video input must contain at least one frame.")
-        processed = [_preprocess_action_image(_pil_to_rgb(frame), target_h, target_w).squeeze(0) for frame in frames]
-        return torch.stack(processed, dim=1).unsqueeze(0).contiguous()
+        raise TypeError(f"Cosmos3 preprocessing expected PIL image or image path, got {type(value)!r}.")
 
     def pre_process_func(request: OmniDiffusionRequest) -> OmniDiffusionRequest:
-        action_mode = _request_action_mode(request)
         if is_guardrails_enabled(od_config, request.sampling_params):
             for prompt in request.prompts:
                 text = prompt if isinstance(prompt, str) else prompt.get("prompt", "")
@@ -174,63 +108,39 @@ def pre_process_func(request: OmniDiffusionRequest) -> OmniDiffusionRequest:
                 continue
             multi_modal_data = prompt.get("multi_modal_data", {}) or {}
             raw_image = multi_modal_data.get("image")
-            raw_video = multi_modal_data.get("video")
-            if raw_image is None and not (action_mode is not None and raw_video is not None):
+            if raw_image is None:
                 continue
 
             if "additional_information" not in prompt:
                 prompt["additional_information"] = {}
 
-            if raw_image is None:
-                if not isinstance(raw_video, list) or not raw_video:
-                    raise TypeError("Cosmos3 action video input must be a non-empty list of PIL images or image paths.")
-                image = _pil_to_rgb(raw_video[0])
-            else:
-                image = _pil_to_rgb(raw_image)
+            image = _pil_to_rgb(raw_image)
 
             # Auto-calculate H/W from aspect ratio (720p max area)
             if request.sampling_params.height is None or request.sampling_params.width is None:
-                if action_mode is not None:
-                    _set_action_size_from_image(request, image)
-                else:
-                    max_area = 720 * 1280
-                    aspect_ratio = image.height / image.width
-                    mod_value = 16
-                    height = round(np.sqrt(max_area * aspect_ratio)) // mod_value * mod_value
-                    width = round(np.sqrt(max_area / aspect_ratio)) // mod_value * mod_value
-                    if request.sampling_params.height is None:
-                        request.sampling_params.height = height
-                    if request.sampling_params.width is None:
-                        request.sampling_params.width = width
+                max_area = 720 * 1280
+                aspect_ratio = image.height / image.width
+                mod_value = 16
+                height = round(np.sqrt(max_area * aspect_ratio)) // mod_value * mod_value
+                width = round(np.sqrt(max_area / aspect_ratio)) // mod_value * mod_value
+                if request.sampling_params.height is None:
+                    request.sampling_params.height = height
+                if request.sampling_params.width is None:
+                    request.sampling_params.width = width
 
             target_w = request.sampling_params.width
             target_h = request.sampling_params.height
-            if action_mode is not None:
-                prompt["additional_information"]["preprocessed_image"] = _preprocess_action_image(
-                    image,
-                    int(target_h),
-                    int(target_w),
-                )
-            else:
-                scale = max(target_w / image.width, target_h / image.height)
-                resize_w = int(np.ceil(scale * image.width))
-                resize_h = int(np.ceil(scale * image.height))
-                image = image.resize((resize_w, resize_h), PIL.Image.Resampling.LANCZOS)
-                left = (resize_w - target_w) // 2
-                top = (resize_h - target_h) // 2
-                image = image.crop((left, top, left + target_w, top + target_h))
-
-                prompt["additional_information"]["preprocessed_image"] = video_processor.preprocess(
-                    image, height=target_h, width=target_w
-                )
-            if action_mode is not None and raw_video is not None:
-                if not isinstance(raw_video, list):
-                    raise TypeError("Cosmos3 action video input must be a list of PIL images or image paths.")
-                prompt["additional_information"]["preprocessed_video"] = _preprocess_action_video(
-                    raw_video,
-                    int(target_h),
-                    int(target_w),
-                )
+            scale = max(target_w / image.width, target_h / image.height)
+            resize_w = int(np.ceil(scale * image.width))
+            resize_h = int(np.ceil(scale * image.height))
+            image = image.resize((resize_w, resize_h), PIL.Image.Resampling.LANCZOS)
+            left = (resize_w - target_w) // 2
+            top = (resize_h - target_h) // 2
+            image = image.crop((left, top, left + target_w, top + target_h))
+
+            prompt["additional_information"]["preprocessed_image"] = video_processor.preprocess(
+                image, height=target_h, width=target_w
+            )
             request.prompts[i] = prompt
 
         return request
@@ -243,28 +153,6 @@ def get_cosmos3_post_process_func(od_config: OmniDiffusionConfig):
 
     video_processor = VideoProcessor(vae_scale_factor=16)
 
-    def _sampling_param(sampling_params, key: str, default=None):
-        extra = getattr(sampling_params, "extra_args", None)
-        if isinstance(extra, dict) and extra.get(key) is not None:
-            return extra[key]
-        value = getattr(sampling_params, key, None)
-        return default if value is None else value
-
-    def _resolve_output_fps(sampling_params):
-        fps = (
-            _sampling_param(sampling_params, "resolved_frame_rate")
-            or _sampling_param(sampling_params, "frame_rate")
-            or _sampling_param(sampling_params, "fps")
-            or 24.0
-        )
-        try:
-            fps_value = float(fps)
-        except (TypeError, ValueError):
-            fps_value = 24.0
-        if fps_value <= 0:
-            fps_value = 24.0
-        return int(fps_value) if fps_value.is_integer() else fps_value
-
     def post_process_func(
         output: torch.Tensor | dict[str, torch.Tensor] | tuple,
         output_type: str = "np",
@@ -273,8 +161,6 @@ def post_process_func(
         if output_type == "latent":
             return output
 
-        audio = None
-        audio_sample_rate = None
         if isinstance(output, dict):
             if "image" in output and "video" in output:
                 raise ValueError("Cosmos3 output cannot contain both image and video payloads.")
@@ -284,23 +170,10 @@ def post_process_func(
                 video = output["video"]
             else:
                 raise ValueError("Cosmos3 postprocess expected an 'image' or 'video' output payload.")
-            audio = output.get("audio")
-            audio_sample_rate = output.get("audio_sample_rate")
-        elif isinstance(output, tuple):
-            if len(output) == 3:
-                video, audio, audio_sample_rate = output
-            elif len(output) == 2:
-                video, audio = output
-            else:
-                raise ValueError(
-                    "Cosmos3 postprocess expects output tensor, output dict, or (video, audio[, sample_rate]) tuple."
-                )
         else:
             video = output
 
         if isinstance(output, dict) and "image" in output:
-            if audio is not None:
-                raise ValueError("Cosmos3 text-to-image postprocess does not support audio output.")
             if video.ndim != 5 or video.shape[2] != 1:
                 raise ValueError(
                     "Cosmos3 text-to-image postprocess expects decoded output "
@@ -314,16 +187,7 @@ def post_process_func(
             return video_processor.postprocess(image, output_type="pil")
         if is_guardrails_enabled(od_config, sampling_params):
             video = check_video_safety(video)
-        result = {"video": video_processor.postprocess_video(video, output_type=output_type)}
-        if audio is None:
-            return result
-        if isinstance(audio, torch.Tensor):
-            audio = audio.detach().cpu()
-        result["audio"] = audio
-        result["fps"] = _resolve_output_fps(sampling_params)
-        if audio_sample_rate is not None:
-            result["audio_sample_rate"] = int(audio_sample_rate)
-        return result
+        return {"video": video_processor.postprocess_video(video, output_type=output_type)}
 
     return post_process_func
 
@@ -446,9 +310,6 @@ def __init__(
         self._guidance_scale = None
         self._num_timesteps = None
         self._loaded_weight_names: set[str] = set()
-        self._sound_tokenizer = None
-        if getattr(self.transformer, "sound_gen", False):
-            self._get_sound_tokenizer()
 
         self.setup_diffusion_pipeline_profiler(
             enable_diffusion_pipeline_profiler=self.od_config.enable_diffusion_pipeline_profiler
@@ -479,19 +340,9 @@ def _remap_ckpt_key(key: str) -> str | None:
                 "vae2llm.",
                 "llm2vae.",
                 "time_embedder.",
-                "sound2llm.",
-                "llm2sound.",
-                "action2llm.",
-                "llm2action.",
             )
         ):
             return f"transformer.{k}"
-        if k in ("sound_modality_embed", "sound_modality_embed.weight"):
-            return "transformer.sound_modality_embed"
-        if k in ("action_modality_embed", "action_modality_embed.weight"):
-            return "transformer.action_modality_embed"
-        if k.startswith("action_pos_embed."):
-            return None
 
         # Skip lm_head
         if k.startswith("lm_head."):
@@ -590,31 +441,12 @@ def _remapped_weights() -> Iterable[tuple[str, torch.Tensor]]:
         self.transformer.post_load_weights()
         self.transformer.eval()
         self._loaded_weight_names = set(loaded)
-        if getattr(self.transformer, "sound_gen", False):
-            sound_markers = ("sound2llm.", "llm2sound.", "sound_modality_embed")
-            missing = [marker.rstrip(".") for marker in sound_markers if not any(marker in name for name in loaded)]
-            if missing:
-                raise ValueError(
-                    "Cosmos3 transformer config enables sound generation, but "
-                    f"the checkpoint is missing sound weights for {missing}. "
-                    "Use a sound-capable transformer checkpoint."
-                )
-        if getattr(self.transformer, "action_gen", False):
-            action_markers = ("action2llm.", "llm2action.", "action_modality_embed")
-            missing = [marker.rstrip(".") for marker in action_markers if not any(marker in name for name in loaded)]
-            if missing:
-                raise ValueError(
-                    "Cosmos3 transformer config enables action generation, but "
-                    f"the checkpoint is missing action weights for {missing}. "
-                    "Use an action-capable transformer checkpoint."
-                )
         return loaded
 
     def predict_noise(self, **kwargs) -> torch.Tensor | tuple[torch.Tensor, ...]:
         """Override CFGParallelMixin.predict_noise for Cosmos3.
 
-        The transformer returns the raw prediction: video-only as a tensor,
-        or a tuple in video, action, sound order for multimodal generation.
+        The transformer returns the raw video noise prediction.
         """
         return self.transformer(**kwargs)
 
@@ -649,55 +481,6 @@ def _get_sp_param(sp, key: str, default=None):
             return val
         return default
 
-    @staticmethod
-    def _truthy(value) -> bool:
-        if isinstance(value, str):
-            return value.strip().lower() in {"1", "true", "yes", "on"}
-        return bool(value)
-
-    @classmethod
-    def _get_prompt_param(cls, prompt_data, key: str, default=None):
-        if not isinstance(prompt_data, dict):
-            return default
-        if prompt_data.get(key) is not None:
-            return prompt_data[key]
-        additional = prompt_data.get("additional_information")
-        if isinstance(additional, dict) and additional.get(key) is not None:
-            return additional[key]
-        return default
-
-    @classmethod
-    def _is_sound_request(cls, prompt_data, sp) -> bool:
-        keys = (
-            "sound_gen",
-            "generate_sound",
-            "enable_sound_generation",
-            "return_audio",
-            "output_audio",
-            "generate_audio",
-        )
-        for key in keys:
-            if cls._truthy(cls._get_prompt_param(prompt_data, key, None)):
-                return True
-            if cls._truthy(cls._get_sp_param(sp, key, None)):
-                return True
-        return False
-
-    @classmethod
-    def _get_action_mode(cls, prompt_data, sp) -> str | None:
-        return normalize_action_mode(
-            cls._get_sp_param(sp, "action_mode", cls._get_prompt_param(prompt_data, "action_mode", None))
-        )
-
-    def _get_sound_tokenizer(self):
-        if not hasattr(self, "_sound_tokenizer"):
-            self._sound_tokenizer = None
-        if self._sound_tokenizer is None:
-            from .sound_tokenizer import Cosmos3SoundTokenizer
-
-            self._sound_tokenizer = Cosmos3SoundTokenizer.from_config(self.od_config)
-        return self._sound_tokenizer
-
     @staticmethod
     def _is_t2i_request(req: OmniDiffusionRequest) -> bool:
         """Detect text-to-image mode from request-level prompt modalities."""
@@ -899,47 +682,6 @@ def _prepare_latents(
         )
         return randn_tensor(shape, generator=generator, device=self.device, dtype=self.dtype)
 
-    def _prepare_sound_latents(
-        self,
-        target_audio_samples: int,
-        generator: torch.Generator,
-    ) -> tuple[torch.Tensor, int]:
-        sound_tokenizer = self._get_sound_tokenizer()
-        hop_size = int(
-            getattr(sound_tokenizer, "hop_size", None) or getattr(sound_tokenizer, "temporal_compression_factor")
-        )
-        latent_frames = max(1, math.ceil(max(1, int(target_audio_samples)) / hop_size))
-        sound_dim = int(getattr(sound_tokenizer, "latent_ch", 64))
-        transformer_sound_dim = int(getattr(self.transformer, "sound_dim", sound_dim))
-        if sound_dim != transformer_sound_dim:
-            raise ValueError(
-                "Cosmos3 sound tokenizer latent channels do not match transformer "
-                f"sound_dim: tokenizer={sound_dim}, transformer={transformer_sound_dim}."
-            )
-        latents = randn_tensor(
-            (1, sound_dim, latent_frames),
-            generator=generator,
-            device=self.device,
-            dtype=self.dtype,
-        )
-        return latents, latent_frames
-
-    def _resolve_sound_target_samples(
-        self,
-        sp,
-        num_frames: int,
-        frame_rate: float,
-    ) -> tuple[int, float, int]:
-        sound_tokenizer = self._get_sound_tokenizer()
-        duration = self._get_sp_param(sp, "sound_duration", None)
-        if duration is None:
-            duration = self._get_sp_param(sp, "audio_duration", None)
-        if duration is None:
-            duration = num_frames / frame_rate
-        duration = max(float(duration), 1.0 / max(float(frame_rate), 1.0))
-        sample_rate = int(getattr(sound_tokenizer, "sample_rate", 48000))
-        return max(1, int(round(duration * sample_rate))), duration, sample_rate
-
     # -- VAE decode ----------------------------------------------------------
 
     def _decode_latents(self, latents: torch.Tensor) -> torch.Tensor:
@@ -961,19 +703,6 @@ def _decode_latents(self, latents: torch.Tensor) -> torch.Tensor:
         video = self.vae.decode(latents, return_dict=False)[0]
         return video
 
-    def _decode_sound_latents(
-        self,
-        sound_latents: torch.Tensor,
-        target_audio_samples: int,
-    ) -> torch.Tensor:
-        sound_tokenizer = self._get_sound_tokenizer()
-        audio = sound_tokenizer.decode(sound_latents.to(self.dtype))
-        if audio.shape[-1] > target_audio_samples:
-            audio = audio[..., :target_audio_samples]
-        elif audio.shape[-1] < target_audio_samples:
-            audio = torch.nn.functional.pad(audio, (0, target_audio_samples - audio.shape[-1]))
-        return audio.detach().cpu()
-
     # -- Prompt formatting + tokenization (shared by T2V and I2V) ------------
 
     def _format_and_tokenize_prompts(
@@ -1114,30 +843,6 @@ def _encode_conditioning_video(
 
         return latent.to(self.dtype)
 
-    def _encode_video_tensor(self, video_tensor: torch.Tensor) -> torch.Tensor:
-        """VAE-encode a preprocessed pixel video [1, 3, T, H, W]."""
-        if video_tensor.ndim == 4:
-            video_tensor = video_tensor.unsqueeze(0)
-        if video_tensor.ndim != 5:
-            raise ValueError(f"Cosmos3 video tensor must have shape [1, 3, T, H, W], got {tuple(video_tensor.shape)}.")
-        if video_tensor.shape[0] != 1 or video_tensor.shape[1] != 3:
-            raise ValueError(f"Cosmos3 video tensor must have shape [1, 3, T, H, W], got {tuple(video_tensor.shape)}.")
-
-        video = video_tensor.to(device=self.device, dtype=self.vae.dtype)
-        latent = self.vae.encode(video).latent_dist.mode()
-
-        if hasattr(self.vae.config, "latents_mean") and hasattr(self.vae.config, "latents_std"):
-            latents_mean = (
-                torch.tensor(self.vae.config.latents_mean).view(1, -1, 1, 1, 1).to(latent.device, latent.dtype)
-            )
-            latents_std = torch.tensor(self.vae.config.latents_std).view(1, -1, 1, 1, 1).to(latent.device, latent.dtype)
-            latent = (latent - latents_mean) / latents_std
-        else:
-            scaling_factor = getattr(self.vae.config, "scaling_factor", 1.0)
-            latent = latent * scaling_factor
-
-        return latent.to(self.dtype)
-
     def _prepare_latents_i2v(
         self,
         image_tensor: torch.Tensor,
@@ -1174,98 +879,6 @@ def _prepare_latents_i2v(
         velocity_mask = 1.0 - condition_mask
         return latents, velocity_mask, image_latent
 
-    def _prepare_latents_action_video(
-        self,
-        video_tensor: torch.Tensor,
-        mode: str,
-        height: int,
-        width: int,
-        num_frames: int,
-        generator: torch.Generator,
-    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-        """Prepare video latents for action modes with mode-specific conditioning."""
-        del height, width
-        C = self.transformer.latent_channel_size
-        T_lat = (num_frames - 1) // self.vae_scale_factor_temporal + 1
-        H_lat = video_tensor.shape[-2] // self.vae_scale_factor_spatial
-        W_lat = video_tensor.shape[-1] // self.vae_scale_factor_spatial
-
-        noise = randn_tensor(
-            (1, C, T_lat, H_lat, W_lat),
-            generator=generator,
-            device=self.device,
-            dtype=self.dtype,
-        )
-        cond_latent = self._encode_video_tensor(video_tensor)
-        if cond_latent.shape[2:] != noise.shape[2:]:
-            raise ValueError(
-                "Cosmos3 action video latent shape mismatch: "
-                f"encoded={tuple(cond_latent.shape)}, expected={tuple(noise.shape)}."
-            )
-        condition_mask = build_vision_condition_mask(
-            mode,
-            num_frames,
-            self.vae_scale_factor_temporal,
-            device=self.device,
-            dtype=self.dtype,
-        )
-        latents = condition_mask * cond_latent + (1.0 - condition_mask) * noise
-        velocity_mask = 1.0 - condition_mask
-        return latents, velocity_mask, cond_latent
-
-    def _prepare_action_latents(
-        self,
-        *,
-        mode: str,
-        action_chunk_size: int,
-        raw_action_dim: int | None,
-        generator: torch.Generator,
-        sp,
-    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, int]:
-        action_dim = int(getattr(self.transformer, "action_dim", 64))
-        if mode == ACTION_MODE_FORWARD_DYNAMICS:
-            action = load_action_tensor(
-                self._get_sp_param(sp, "action", None),
-                self._get_sp_param(sp, "action_path", None),
-            )
-            if action.shape[0] < action_chunk_size:
-                pad = action[-1:].repeat(action_chunk_size - action.shape[0], 1)
-                action = torch.cat([action, pad], dim=0)
-            elif action.shape[0] > action_chunk_size:
-                action = action[:action_chunk_size]
-            if raw_action_dim is None:
-                raw_action_dim = int(action.shape[-1])
-            clean_action = pad_action_to_dim(action, action_dim)
-        else:
-            if raw_action_dim is None:
-                raise ValueError(
-                    "Cosmos3 action_mode='policy' and 'inverse_dynamics' require extra_args['raw_action_dim']."
-                )
-            clean_action = torch.zeros(action_chunk_size, action_dim, dtype=torch.float32)
-
-        raw_action_dim = int(raw_action_dim)
-        if raw_action_dim <= 0 or raw_action_dim > action_dim:
-            raise ValueError(f"Cosmos3 raw_action_dim must be in [1, {action_dim}], got {raw_action_dim}.")
-
-        clean_action = clean_action.to(device=self.device, dtype=self.dtype).unsqueeze(0)
-        condition_mask = build_action_condition_mask(
-            mode,
-            action_chunk_size,
-            device=self.device,
-            dtype=self.dtype,
-        )
-        noise = randn_tensor(
-            (1, action_chunk_size, action_dim),
-            generator=generator,
-            device=self.device,
-            dtype=self.dtype,
-        )
-        noise[:, :, raw_action_dim:] = 0
-        clean_action[:, :, raw_action_dim:] = 0
-        action_latents = condition_mask * clean_action + (1.0 - condition_mask) * noise
-        action_velocity_mask = 1.0 - condition_mask
-        return action_latents, action_velocity_mask, clean_action, raw_action_dim
-
     # -- Denoising loop (shared by T2V and I2V) -----------------------------
 
     def diffuse(
@@ -1279,15 +892,11 @@ def diffuse(
         guidance_scale: float,
         shared_kwargs: dict,
         *,
-        action_latents: torch.Tensor | None = None,
-        action_velocity_mask: torch.Tensor | None = None,
-        action_condition_latents: torch.Tensor | None = None,
-        sound_latents: torch.Tensor | None = None,
         velocity_mask: torch.Tensor | None = None,
         image_latent: torch.Tensor | None = None,
         condition_latents: torch.Tensor | None = None,
         guidance_interval: tuple[float, float] | None = None,
-    ) -> torch.Tensor | tuple[torch.Tensor, ...]:
+    ) -> torch.Tensor:
         """Denoising loop with 3-mode CFG support (parallel, sequential, none).
 
         Cosmos3's UND pathway is text-dependent, so CFG needs separate K/V
@@ -1326,112 +935,21 @@ def _cfg_active_at(t: torch.Tensor) -> bool:
             lo, hi = guidance_interval
             return lo <= t_scalar <= hi
 
-        def _pack_joint(
-            video_tensor: torch.Tensor,
-            action_tensor: torch.Tensor | None = None,
-            sound_tensor: torch.Tensor | None = None,
-        ):
-            batch = video_tensor.shape[0]
-            tensors = [video_tensor]
-            if action_tensor is not None:
-                tensors.append(action_tensor)
-            if sound_tensor is not None:
-                tensors.append(sound_tensor)
-            flats = [tensor.reshape(batch, -1) for tensor in tensors]
-            return torch.cat(flats, dim=1), [tensor.shape for tensor in tensors], [flat.shape[1] for flat in flats]
-
-        def _unpack_joint(
-            packed: torch.Tensor,
-            shapes: list[torch.Size],
-            numels: list[int],
-        ) -> tuple[torch.Tensor, ...]:
-            outputs = []
-            offset = 0
-            for shape, numel in zip(shapes, numels, strict=True):
-                outputs.append(packed[:, offset : offset + numel].reshape(shape))
-                offset += numel
-            return tuple(outputs)
-
-        def _split_noise_pred(
-            noise_pred: torch.Tensor | tuple[torch.Tensor, ...],
-        ) -> tuple[torch.Tensor, torch.Tensor | None, torch.Tensor | None]:
-            has_action = action_latents is not None
-            has_sound = sound_latents is not None
-            if not has_action and not has_sound:
-                if isinstance(noise_pred, tuple):
-                    raise ValueError("Cosmos3 video-only diffusion received tuple predictions.")
-                return noise_pred, None, None
-            if not isinstance(noise_pred, tuple):
-                raise ValueError("Cosmos3 multimodal diffusion expects transformer predictions as a tuple.")
-            expected = 1 + int(has_action) + int(has_sound)
-            if len(noise_pred) != expected:
-                raise ValueError(
-                    f"Cosmos3 multimodal diffusion expected {expected} predictions, got {len(noise_pred)}."
-                )
-            video_pred = noise_pred[0]
-            idx = 1
-            action_pred = noise_pred[idx] if has_action else None
-            if has_action:
-                idx += 1
-            sound_pred = noise_pred[idx] if has_sound else None
-            return video_pred, action_pred, sound_pred
-
         def _step(
-            noise_pred: torch.Tensor | tuple[torch.Tensor, ...],
+            noise_pred: torch.Tensor,
             t: torch.Tensor,
             latents: torch.Tensor,
-            action_latents: torch.Tensor | None,
-            sound_latents: torch.Tensor | None,
-        ) -> torch.Tensor | tuple[torch.Tensor, ...]:
-            video_pred, action_pred, sound_pred = _split_noise_pred(noise_pred)
+        ) -> torch.Tensor:
+            if isinstance(noise_pred, tuple):
+                raise ValueError("Cosmos3 video-only diffusion received tuple predictions.")
             if velocity_mask is not None:
-                video_pred = video_pred * velocity_mask
-            if action_pred is not None and action_velocity_mask is not None:
-                action_pred = action_pred * action_velocity_mask
-            if action_latents is None and sound_latents is None:
-                latents = self.scheduler.step(video_pred, t, latents, return_dict=False)[0]
-            else:
-                packed_noise, shapes, numels = _pack_joint(video_pred, action_pred, sound_pred)
-                packed_latents, _, _ = _pack_joint(latents, action_latents, sound_latents)
-                packed_next = self.scheduler.step(packed_noise, t, packed_latents, return_dict=False)[0]
-                unpacked = _unpack_joint(packed_next, shapes, numels)
-                latents = unpacked[0]
-                idx = 1
-                if action_latents is not None:
-                    action_latents = unpacked[idx]
-                    idx += 1
-                if sound_latents is not None:
-                    sound_latents = unpacked[idx]
+                noise_pred = noise_pred * velocity_mask
+            latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
             if condition_latents is not None and velocity_mask is not None:
                 latents = velocity_mask * latents + (1.0 - velocity_mask) * condition_latents
             elif image_latent is not None:
                 latents[:, :, 0:1, :, :] = image_latent
-            if action_latents is not None and action_condition_latents is not None and action_velocity_mask is not None:
-                action_latents = (
-                    action_velocity_mask * action_latents + (1.0 - action_velocity_mask) * action_condition_latents
-                )
-            outputs = [latents]
-            if action_latents is not None:
-                outputs.append(action_latents)
-            if sound_latents is not None:
-                outputs.append(sound_latents)
-            return outputs[0] if len(outputs) == 1 else tuple(outputs)
-
-        def _assign_step_out(step_out: torch.Tensor | tuple[torch.Tensor, ...]) -> None:
-            nonlocal latents, action_latents, sound_latents
-            if action_latents is None and sound_latents is None:
-                assert isinstance(step_out, torch.Tensor)
-                latents = step_out
-                return
-            if not isinstance(step_out, tuple):
-                raise ValueError("Cosmos3 multimodal diffusion step returned a non-tuple result.")
-            latents = step_out[0]
-            idx = 1
-            if action_latents is not None:
-                action_latents = step_out[idx]
-                idx += 1
-            if sound_latents is not None:
-                sound_latents = step_out[idx]
+            return latents
 
         if cfg_parallel:
             for t in self.progress_bar(timesteps):
@@ -1449,8 +967,6 @@ def _assign_step_out(step_out: torch.Tensor | tuple[torch.Tensor, ...]) -> None:
                         timestep=timestep,
                         text_ids=cond_ids,
                         text_mask=cond_mask,
-                        action_latents=action_latents,
-                        sound_latents=sound_latents,
                         **shared_kwargs,
                     ),
                     negative_kwargs=dict(
@@ -1458,13 +974,11 @@ def _assign_step_out(step_out: torch.Tensor | tuple[torch.Tensor, ...]) -> None:
                         timestep=timestep,
                         text_ids=uncond_ids,
                         text_mask=uncond_mask,
-                        action_latents=action_latents,
-                        sound_latents=sound_latents,
                         **shared_kwargs,
                     ),
                     cfg_normalize=False,
                 )
-                _assign_step_out(_step(noise_pred, t, latents, action_latents, sound_latents))
+                latents = _step(noise_pred, t, latents)
 
         elif do_cfg:
             cond_cache: tuple = (None, None)
@@ -1480,8 +994,6 @@ def _assign_step_out(step_out: torch.Tensor | tuple[torch.Tensor, ...]) -> None:
                     timestep=timestep,
                     text_ids=cond_ids,
                     text_mask=cond_mask,
-                    action_latents=action_latents,
-                    sound_latents=sound_latents,
                     **shared_kwargs,
                 )
                 if cond_cache[0] is None:
@@ -1494,8 +1006,6 @@ def _assign_step_out(step_out: torch.Tensor | tuple[torch.Tensor, ...]) -> None:
                         timestep=timestep,
                         text_ids=uncond_ids,
                         text_mask=uncond_mask,
-                        action_latents=action_latents,
-                        sound_latents=sound_latents,
                         **shared_kwargs,
                     )
                     if uncond_cache[0] is None:
@@ -1507,7 +1017,7 @@ def _assign_step_out(step_out: torch.Tensor | tuple[torch.Tensor, ...]) -> None:
                     # the cond branch) and gives a free speedup for T2I.
                     noise_pred = noise_cond
 
-                _assign_step_out(_step(noise_pred, t, latents, action_latents, sound_latents))
+                latents = _step(noise_pred, t, latents)
 
         else:
             for t in self.progress_bar(timesteps):
@@ -1517,18 +1027,11 @@ def _assign_step_out(step_out: torch.Tensor | tuple[torch.Tensor, ...]) -> None:
                     timestep=timestep,
                     text_ids=cond_ids,
                     text_mask=cond_mask,
-                    action_latents=action_latents,
-                    sound_latents=sound_latents,
                     **shared_kwargs,
                 )
-                _assign_step_out(_step(noise_pred, t, latents, action_latents, sound_latents))
+                latents = _step(noise_pred, t, latents)
 
-        outputs = [latents]
-        if action_latents is not None:
-            outputs.append(action_latents)
-        if sound_latents is not None:
-            outputs.append(sound_latents)
-        return outputs[0] if len(outputs) == 1 else tuple(outputs)
+        return latents
 
     # -- Forward (main generation entry point) -------------------------------
 
@@ -1547,43 +1050,17 @@ def forward(
             prompt = prompt_data
             negative_prompt = None
             image_tensor = None
-            action_video_tensor = None
         else:
             prompt = prompt_data.get("prompt", "")
             negative_prompt = prompt_data.get("negative_prompt")
             additional_info = prompt_data.get("additional_information", {}) or {}
             image_tensor = additional_info.get("preprocessed_image")
-            action_video_tensor = additional_info.get("preprocessed_video")
 
         sp = req.sampling_params
         is_t2i = self._is_t2i_request(req)
-        sound_enabled = self._is_sound_request(prompt_data, sp)
-        action_mode = self._get_action_mode(prompt_data, sp)
-        action_enabled = action_mode is not None
-        if action_enabled and is_t2i:
-            raise ValueError("Cosmos3 action generation is supported only for video outputs.")
-        if action_enabled and sound_enabled:
-            raise ValueError("Cosmos3 action+sound joint generation is not supported in this phase.")
-        if action_enabled and not getattr(self.transformer, "action_gen", False):
-            raise ValueError(
-                "Cosmos3 action generation was requested, but the transformer was "
-                "initialized without action modules. Check that the checkpoint config "
-                "enables action_gen and includes action weights."
-            )
-        if sound_enabled and is_t2i:
-            raise ValueError(
-                "Cosmos3 sound generation is supported only for video outputs in "
-                "this phase; text-to-image with sound is unsupported."
-            )
-        if sound_enabled and not getattr(self.transformer, "sound_gen", False):
-            raise ValueError(
-                "Cosmos3 sound generation was requested, but the transformer was "
-                "initialized without sound modules. Check that the checkpoint config "
-                "enables sound_gen or defines sound_dim and includes sound weights."
-            )
-        is_i2v = image_tensor is not None and not is_t2i and not action_enabled
+        is_i2v = image_tensor is not None and not is_t2i
         if negative_prompt is None:
-            if is_t2i or action_enabled:
+            if is_t2i:
                 negative_prompt = COSMOS3_DEFAULT_NEGATIVE_PROMPT
             elif is_i2v:
                 negative_prompt = COSMOS3_I2V_NEGATIVE_PROMPT
@@ -1616,36 +1093,6 @@ def forward(
             default_guidance_interval = None
             batch_size = 1  # Existing video pipeline assumes B=1.
 
-        if action_enabled:
-            action_chunk_param = self._get_sp_param(sp, "action_chunk_size", None)
-            if action_chunk_param is not None:
-                action_chunk_size = int(action_chunk_param)
-                if sp.num_frames is None:
-                    num_frames = action_chunk_size + 1
-            elif sp.num_frames is None:
-                action_chunk_size = 16
-                num_frames = action_chunk_size + 1
-            else:
-                action_chunk_size = int(num_frames) - 1
-            if action_chunk_size <= 0:
-                raise ValueError(f"Cosmos3 action_chunk_size must be positive, got {action_chunk_size}.")
-            if num_frames not in (action_chunk_size, action_chunk_size + 1):
-                raise ValueError(
-                    "Cosmos3 action requests require num_frames to equal action_chunk_size "
-                    f"or action_chunk_size + 1; got num_frames={num_frames}, action_chunk_size={action_chunk_size}."
-                )
-            num_inference_steps = sp.num_inference_steps or 30
-            guidance_scale = sp.guidance_scale if sp.guidance_scale is not None else 1.0
-            default_flow_shift = 5.0
-
-        domain_id = None
-        if action_enabled:
-            domain_id = resolve_domain_id(
-                domain_id=self._get_sp_param(sp, "domain_id", None),
-                domain_name=self._get_sp_param(sp, "domain_name", None),
-                require_explicit=True,
-            )
-
         # Runtime controls: prefer ``extra_args`` (OpenAI endpoints write
         # there) over direct attrs.
         flow_shift_target = float(self._get_sp_param(sp, "flow_shift", default_flow_shift))
@@ -1655,23 +1102,6 @@ def forward(
         max_sequence_length = self._get_sp_param(sp, "max_sequence_length", 512) or 512
         use_system_prompt = bool(self._get_sp_param(sp, "use_system_prompt", False))
 
-        if action_enabled and action_video_tensor is None:
-            extra_action_video = self._get_sp_param(sp, "action_video", None)
-            if isinstance(extra_action_video, torch.Tensor):
-                action_video_tensor = extra_action_video
-        if action_enabled and isinstance(action_video_tensor, torch.Tensor):
-            if action_video_tensor.ndim == 4:
-                action_video_tensor = action_video_tensor.unsqueeze(0)
-            if action_video_tensor.ndim != 5:
-                raise ValueError(
-                    "Cosmos3 extra_args['action_video'] must have shape [1, 3, T, H, W] "
-                    f"or [3, T, H, W], got {tuple(action_video_tensor.shape)}."
-                )
-            if sp.height is None:
-                height = int(action_video_tensor.shape[-2])
-            if sp.width is None:
-                width = int(action_video_tensor.shape[-1])
-
         self._guidance_scale = guidance_scale
         self._num_timesteps = num_inference_steps
 
@@ -1707,58 +1137,7 @@ def forward(
         # batching B=N together would require expanding text K/V (UND
         # pathway is text-only and cached) and is left as a future
         # optimization.
-        action_latents = None
-        action_velocity_mask = None
-        action_condition_latents = None
-        raw_action_dim = None
-        action_offset = 1
-        if action_enabled:
-            if action_video_tensor is not None and action_video_tensor.ndim == 4:
-                action_video_tensor = action_video_tensor.unsqueeze(0)
-            if action_video_tensor is not None and action_video_tensor.ndim != 5:
-                raise ValueError(
-                    "Cosmos3 action video tensor must have shape [1, 3, T, H, W] "
-                    f"or [3, T, H, W], got {tuple(action_video_tensor.shape)}."
-                )
-            if action_video_tensor is not None and action_video_tensor.shape[2] < num_frames:
-                pad = action_video_tensor[:, :, -1:].repeat(1, 1, num_frames - action_video_tensor.shape[2], 1, 1)
-                action_video_tensor = torch.cat([action_video_tensor, pad], dim=2)
-            elif action_video_tensor is not None and action_video_tensor.shape[2] > num_frames:
-                action_video_tensor = action_video_tensor[:, :, :num_frames]
-
-            if action_mode == ACTION_MODE_INVERSE_DYNAMICS and action_video_tensor is None:
-                raise ValueError("Cosmos3 inverse_dynamics action mode requires multi_modal_data['video'].")
-            if action_mode in {ACTION_MODE_POLICY, ACTION_MODE_FORWARD_DYNAMICS} and image_tensor is None:
-                if action_video_tensor is None:
-                    raise ValueError(
-                        f"Cosmos3 action_mode={action_mode!r} requires multi_modal_data['image'] "
-                        "or multi_modal_data['video']."
-                    )
-                image_tensor = action_video_tensor[:, :, 0]
-
-            raw_action_dim_param = self._get_sp_param(sp, "raw_action_dim", None)
-            raw_action_dim = int(raw_action_dim_param) if raw_action_dim_param is not None else None
-            action_prepared = self._prepare_action_latents(
-                mode=action_mode,
-                action_chunk_size=action_chunk_size,
-                raw_action_dim=raw_action_dim,
-                generator=generator,
-                sp=sp,
-            )
-            action_latents, action_velocity_mask, action_condition_latents, raw_action_dim = action_prepared
-            action_offset = action_start_frame_offset(action_mode, action_chunk_size, num_frames)
-
-        if action_enabled and action_video_tensor is not None:
-            latents, velocity_mask, condition_latents = self._prepare_latents_action_video(
-                action_video_tensor,
-                action_mode,
-                height,
-                width,
-                num_frames,
-                generator,
-            )
-            image_latent = condition_latents[:, :, 0:1]
-        elif image_tensor is not None and not is_t2i:
+        if image_tensor is not None and not is_t2i:
             latents, velocity_mask, image_latent = self._prepare_latents_i2v(
                 image_tensor,
                 height,
@@ -1773,13 +1152,6 @@ def forward(
             image_latent = None
             condition_latents = None
 
-        sound_latents = None
-        target_audio_samples = None
-        sound_sample_rate = None
-        if sound_enabled:
-            target_audio_samples, _, sound_sample_rate = self._resolve_sound_target_samples(sp, num_frames, frame_rate)
-            sound_latents, _ = self._prepare_sound_latents(target_audio_samples, generator)
-
         T_latent = latents.shape[2]
         H_latent = latents.shape[3]
         W_latent = latents.shape[4]
@@ -1789,13 +1161,6 @@ def forward(
         shared_kwargs = dict(video_shape=video_shape, fps=frame_rate)
         if velocity_mask is not None:
             shared_kwargs["noisy_frame_mask"] = velocity_mask
-        if action_enabled:
-            shared_kwargs.update(
-                action_domain_ids=torch.tensor([domain_id], dtype=torch.long, device=self.device),
-                action_noisy_mask=action_velocity_mask,
-                action_start_frame_offset=action_offset,
-                action_fps=float(self._get_sp_param(sp, "action_fps", frame_rate) or frame_rate),
-            )
 
         def _run_diffusion(start_latents):
             self._set_scheduler_timesteps(num_inference_steps)
@@ -1808,10 +1173,6 @@ def _run_diffusion(start_latents):
                 uncond_mask=uncond_mask,
                 guidance_scale=guidance_scale,
                 shared_kwargs=shared_kwargs,
-                action_latents=action_latents,
-                action_velocity_mask=action_velocity_mask,
-                action_condition_latents=action_condition_latents,
-                sound_latents=sound_latents,
                 velocity_mask=velocity_mask,
                 image_latent=image_latent,
                 condition_latents=condition_latents,
@@ -1832,15 +1193,7 @@ def _run_diffusion(start_latents):
                 samples.append(_run_diffusion(next_latents))
             latents = torch.cat(samples, dim=0)
         else:
-            diffusion_output = _run_diffusion(latents)
-            if action_enabled and sound_enabled:
-                latents, action_latents, sound_latents = diffusion_output
-            elif action_enabled:
-                latents, action_latents = diffusion_output
-            elif sound_enabled:
-                latents, sound_latents = diffusion_output
-            else:
-                latents = diffusion_output
+            latents = _run_diffusion(latents)
 
         # --- Decode ---
         if _is_rank_zero():
@@ -1851,26 +1204,4 @@ def _run_diffusion(start_latents):
             logger.info("Video decoded in %.2fs", time.time() - decode_start)
             logger.info("Total pipeline time: %.2fs", time.time() - pipeline_start)
 
-        if sound_enabled:
-            if sound_latents is None or target_audio_samples is None or sound_sample_rate is None:
-                raise ValueError("Cosmos3 sound generation finished without sound latents.")
-            if _is_rank_zero():
-                logger.info("Decoding sound...")
-            audio = self._decode_sound_latents(sound_latents, target_audio_samples)
-            return DiffusionOutput(output={"video": video, "audio": audio, "audio_sample_rate": sound_sample_rate})
-
-        if action_enabled:
-            if action_latents is None or raw_action_dim is None or domain_id is None:
-                raise ValueError("Cosmos3 action generation finished without action latents.")
-            action = action_latents[:, :, :raw_action_dim].detach().cpu()
-            return DiffusionOutput(
-                output={"video": video},
-                custom_output={
-                    "action": action,
-                    "raw_action_dim": raw_action_dim,
-                    "action_mode": action_mode,
-                    "domain_id": domain_id,
-                },
-            )
-
         return DiffusionOutput(output={"image": video} if is_t2i else {"video": video})
diff --git a/vllm_omni/diffusion/models/cosmos3/sound_tokenizer.py b/vllm_omni/diffusion/models/cosmos3/sound_tokenizer.py
deleted file mode 100644
index 281b7e1d9f0..00000000000
--- a/vllm_omni/diffusion/models/cosmos3/sound_tokenizer.py
+++ /dev/null
@@ -1,537 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""Cosmos3 sound tokenizer integration."""
-
-from __future__ import annotations
-
-import json
-import os
-from pathlib import Path
-from typing import Any
-
-import torch
-from vllm.logger import init_logger
-
-from vllm_omni.diffusion.data import OmniDiffusionConfig
-from vllm_omni.diffusion.distributed.utils import get_local_device
-from vllm_omni.diffusion.models.progress_bar import _is_rank_zero
-
-from .audio_tokenizer import Cosmos3AVAEAudioTokenizer
-
-logger = init_logger(__name__)
-
-DEFAULT_SOUND_SAMPLE_RATE = 48000
-DEFAULT_SOUND_CHANNELS = 2
-DEFAULT_SOUND_DIM = 64
-DEFAULT_SOUND_HOP_SIZE = 1920
-DEFAULT_SOUND_LATENT_FPS = DEFAULT_SOUND_SAMPLE_RATE / DEFAULT_SOUND_HOP_SIZE
-DEFAULT_SOUND_NORMALIZE_LATENTS = False
-DEFAULT_SOUND_NORMALIZATION_TYPE = "none"
-DEFAULT_SOUND_TANH_INPUT_SCALE = 1.5
-DEFAULT_SOUND_TANH_OUTPUT_SCALE = 3.5
-DEFAULT_SOUND_TANH_CLAMP = 0.995
-SOUND_TOKENIZER_COMPONENT_NAME = "sound_tokenizer"
-SOUND_TOKENIZER_CHECKPOINT_NAME = "diffusion_pytorch_model.safetensors"
-
-
-def _pipeline_args(od_config: OmniDiffusionConfig) -> dict[str, Any]:
-    return dict(getattr(od_config, "custom_pipeline_args", None) or {})
-
-
-def _config_get(config: Any, key: str, default: Any = None) -> Any:
-    if config is None:
-        return default
-    if isinstance(config, dict):
-        return config.get(key, default)
-    if hasattr(config, "get"):
-        value = config.get(key, None)
-        return default if value is None else value
-    return getattr(config, key, default)
-
-
-def _config_path_get(config: Any, *keys: str) -> Any:
-    value = config
-    for key in keys:
-        value = _config_get(value, key, None)
-        if value is None:
-            return None
-    return value
-
-
-def _sound_tokenizer_config_from(config: Any) -> Any:
-    """Return nested ``sound_tokenizer`` config from Cosmos3 config shapes."""
-    for path in (
-        ("sound_tokenizer",),
-        ("model", "config", "sound_tokenizer"),
-        ("config", "sound_tokenizer"),
-        ("model_config", "sound_tokenizer"),
-    ):
-        value = _config_path_get(config, *path)
-        if value is not None:
-            return value
-    return None
-
-
-def _nested_sound_tokenizer_configs(od_config: OmniDiffusionConfig | None) -> tuple[Any, ...]:
-    if od_config is None:
-        return ()
-    configs = []
-    for source in (
-        getattr(od_config, "model_config", None),
-        getattr(od_config, "tf_model_config", None),
-    ):
-        config = _sound_tokenizer_config_from(source)
-        if config is not None:
-            configs.append(config)
-    return tuple(configs)
-
-
-def _first_value_from_configs(configs: tuple[Any, ...], keys: tuple[str, ...]) -> Any:
-    for config in configs:
-        for key in keys:
-            value = _config_get(config, key, None)
-            if value is not None:
-                return value
-    return None
-
-
-def _top_level_model_value(od_config: OmniDiffusionConfig | None, keys: tuple[str, ...]) -> Any:
-    if od_config is None:
-        return None
-    for source in (
-        getattr(od_config, "model_config", None),
-        getattr(od_config, "tf_model_config", None),
-    ):
-        for key in keys:
-            for path in ((key,), ("model", "config", key), ("config", key), ("model_config", key)):
-                value = _config_path_get(source, *path)
-                if value is not None:
-                    return value
-    return None
-
-
-def _custom_arg_value(args: dict[str, Any], keys: tuple[str, ...]) -> Any:
-    for key in keys:
-        value = args.get(key)
-        if value is not None:
-            return value
-    return None
-
-
-def _as_bool(value: Any) -> bool:
-    if isinstance(value, str):
-        return value.strip().lower() in {"1", "true", "yes", "on"}
-    return bool(value)
-
-
-def _as_audio_channels(value: Any) -> int:
-    if isinstance(value, bool):
-        return 2 if value else 1
-    if isinstance(value, str) and value.strip().lower() in {
-        "1",
-        "0",
-        "true",
-        "false",
-        "yes",
-        "no",
-        "on",
-        "off",
-    }:
-        return 2 if _as_bool(value) else 1
-    return int(value)
-
-
-def _resolve_model_file(path: Any, model_root: str | None) -> str | None:
-    if not path:
-        return None
-    path = str(path)
-    if "://" in path or os.path.isabs(path) or os.path.exists(path) or not model_root:
-        return path
-    return str(Path(model_root) / path)
-
-
-def _load_sound_tokenizer_component_config(config_path: str | None) -> dict[str, Any]:
-    if not config_path:
-        return {}
-    with open(config_path, encoding="utf-8") as f:
-        config = json.load(f)
-    if not isinstance(config, dict):
-        raise TypeError(f"Cosmos3 sound tokenizer config must be a JSON object, got {type(config)!r}.")
-    return config
-
-
-def _component_audio_channels(config: dict[str, Any]) -> Any:
-    if config.get("dec_out_channels") is not None:
-        return config["dec_out_channels"]
-    if config.get("audio_channels") is not None:
-        return config["audio_channels"]
-    if config.get("stereo") is not None:
-        return 2 if _as_bool(config["stereo"]) else 1
-    return None
-
-
-def _component_arch_values(config: dict[str, Any]) -> dict[str, Any]:
-    values = {
-        "sample_rate": config.get("sampling_rate", config.get("sample_rate")),
-        "audio_channels": _component_audio_channels(config),
-        "io_channels": config.get("vocoder_input_dim", config.get("io_channels", config.get("latent_ch"))),
-        "hop_size": config.get("hop_size"),
-    }
-    return {key: value for key, value in values.items() if value is not None}
-
-
-def _resolve_arch_value(
-    od_config: OmniDiffusionConfig,
-    args: dict[str, Any],
-    component_values: dict[str, Any],
-    *,
-    field: str,
-    custom_keys: tuple[str, ...],
-    nested_keys: tuple[str, ...],
-    top_level_keys: tuple[str, ...],
-    default: Any,
-    cast,
-) -> Any:
-    custom_value = _custom_arg_value(args, custom_keys)
-    component_value = component_values.get(field)
-    if component_value is not None:
-        resolved = cast(component_value)
-        if custom_value is not None and cast(custom_value) != resolved:
-            raise ValueError(
-                "Conflicting Cosmos3 sound tokenizer architecture override for "
-                f"{field}: component config has {resolved!r}, custom args have {cast(custom_value)!r}."
-            )
-        return resolved
-
-    if custom_value is not None:
-        return cast(custom_value)
-
-    nested_value = _first_value_from_configs(_nested_sound_tokenizer_configs(od_config), nested_keys)
-    if nested_value is not None:
-        return cast(nested_value)
-
-    top_value = _top_level_model_value(od_config, top_level_keys)
-    if top_value is not None:
-        return cast(top_value)
-
-    return cast(default)
-
-
-def _resolve_normalization_value(
-    od_config: OmniDiffusionConfig,
-    args: dict[str, Any],
-    *,
-    name: str,
-    default: Any,
-    aliases: tuple[str, ...] = (),
-) -> Any:
-    keys = (f"sound_{name}", name, *aliases)
-    custom_value = _custom_arg_value(args, keys)
-    if custom_value is not None:
-        return custom_value
-    nested_value = _first_value_from_configs(_nested_sound_tokenizer_configs(od_config), (name, *aliases))
-    return default if nested_value is None else nested_value
-
-
-def get_sound_config_value(
-    od_config: OmniDiffusionConfig,
-    name: str,
-    default: Any,
-    aliases: tuple[str, ...] = (),
-) -> Any:
-    # Backward-compatible generic accessor.  Prefer the more specific helpers
-    # below for Cosmos3 sound tokenizer fields so precedence stays explicit.
-    keys = (name, *aliases)
-    for config in (
-        _pipeline_args(od_config),
-        getattr(od_config, "model_config", None),
-        getattr(od_config, "tf_model_config", None),
-    ):
-        if config is None:
-            continue
-        for key in keys:
-            if hasattr(config, "get"):
-                value = config.get(key, None)
-            else:
-                value = getattr(config, key, None)
-            if value is not None:
-                return value
-    return default
-
-
-def get_sound_sample_rate(od_config: OmniDiffusionConfig) -> int:
-    args = _pipeline_args(od_config)
-    return _resolve_arch_value(
-        od_config,
-        args,
-        {},
-        field="sample_rate",
-        custom_keys=("sound_sample_rate", "sample_rate"),
-        nested_keys=("sample_rate", "sampling_rate"),
-        top_level_keys=("sound_sample_rate", "sample_rate"),
-        default=DEFAULT_SOUND_SAMPLE_RATE,
-        cast=int,
-    )
-
-
-def get_sound_channels(od_config: OmniDiffusionConfig) -> int:
-    args = _pipeline_args(od_config)
-    return _resolve_arch_value(
-        od_config,
-        args,
-        {},
-        field="audio_channels",
-        custom_keys=("sound_audio_channels", "audio_channels", "stereo"),
-        nested_keys=("audio_channels", "dec_out_channels", "stereo"),
-        top_level_keys=("sound_audio_channels", "audio_channels", "stereo"),
-        default=DEFAULT_SOUND_CHANNELS,
-        cast=_as_audio_channels,
-    )
-
-
-def get_sound_dim(od_config: OmniDiffusionConfig | None) -> int:
-    if od_config is None:
-        return DEFAULT_SOUND_DIM
-    args = _pipeline_args(od_config)
-    custom_value = _custom_arg_value(args, ("sound_dim", "io_channels", "latent_ch"))
-    if custom_value is not None:
-        return int(custom_value)
-    top_value = _top_level_model_value(od_config, ("sound_dim",))
-    if top_value is not None:
-        return int(top_value)
-    nested_value = _first_value_from_configs(
-        _nested_sound_tokenizer_configs(od_config),
-        ("io_channels", "vocoder_input_dim", "latent_ch"),
-    )
-    return int(DEFAULT_SOUND_DIM if nested_value is None else nested_value)
-
-
-def get_sound_hop_size(od_config: OmniDiffusionConfig) -> int:
-    args = _pipeline_args(od_config)
-    return _resolve_arch_value(
-        od_config,
-        args,
-        {},
-        field="hop_size",
-        custom_keys=("sound_hop_size", "hop_size"),
-        nested_keys=("hop_size",),
-        top_level_keys=("sound_hop_size", "hop_size"),
-        default=DEFAULT_SOUND_HOP_SIZE,
-        cast=int,
-    )
-
-
-def get_sound_latent_fps(od_config: OmniDiffusionConfig | None) -> float:
-    if od_config is None:
-        return DEFAULT_SOUND_LATENT_FPS
-    args = _pipeline_args(od_config)
-    custom_value = _custom_arg_value(args, ("sound_latent_fps",))
-    if custom_value is not None:
-        return float(custom_value)
-    top_value = _top_level_model_value(od_config, ("sound_latent_fps",))
-    if top_value is not None:
-        return float(top_value)
-    nested_configs = _nested_sound_tokenizer_configs(od_config)
-    nested_fps = _first_value_from_configs(nested_configs, ("sound_latent_fps", "latent_fps"))
-    if nested_fps is not None:
-        return float(nested_fps)
-    sample_rate = _first_value_from_configs(nested_configs, ("sample_rate", "sampling_rate"))
-    hop_size = _first_value_from_configs(nested_configs, ("hop_size",))
-    if sample_rate is not None and hop_size is not None:
-        return float(sample_rate) / float(hop_size)
-    return float(DEFAULT_SOUND_LATENT_FPS)
-
-
-class Cosmos3SoundTokenizer:
-    """Thin adapter around the local AVAE tokenizer implementation."""
-
-    def __init__(self, tokenizer: Any) -> None:
-        self.tokenizer = tokenizer
-        self.sample_rate = int(getattr(tokenizer, "sample_rate", DEFAULT_SOUND_SAMPLE_RATE))
-        self.audio_channels = int(getattr(tokenizer, "audio_channels", DEFAULT_SOUND_CHANNELS))
-        self.latent_ch = int(getattr(tokenizer, "latent_ch", DEFAULT_SOUND_DIM))
-        self.hop_size = int(getattr(tokenizer, "temporal_compression_factor", DEFAULT_SOUND_HOP_SIZE))
-
-    @classmethod
-    def from_config(cls, od_config: OmniDiffusionConfig) -> Cosmos3SoundTokenizer:
-        args = _pipeline_args(od_config)
-        model_path = getattr(od_config, "model", None)
-        explicit_avae_path = (
-            args.get("sound_tokenizer_path")
-            or args.get("avae_path")
-            or args.get("cosmos3_avae_path")
-            or os.environ.get("COSMOS3_SOUND_TOKENIZER_PATH")
-        )
-        explicit_config_path = args.get("sound_tokenizer_config_path") or os.environ.get(
-            "COSMOS3_SOUND_TOKENIZER_CONFIG_PATH"
-        )
-
-        model_root = str(model_path) if model_path and os.path.isdir(model_path) else None
-        if model_root is None and model_path and not explicit_avae_path:
-            from huggingface_hub import snapshot_download
-
-            model_root = snapshot_download(
-                repo_id=str(model_path),
-                revision=getattr(od_config, "revision", None),
-                allow_patterns=[
-                    f"{SOUND_TOKENIZER_COMPONENT_NAME}/config.json",
-                    f"{SOUND_TOKENIZER_COMPONENT_NAME}/{SOUND_TOKENIZER_CHECKPOINT_NAME}",
-                ],
-            )
-
-        if explicit_avae_path:
-            avae_path = _resolve_model_file(explicit_avae_path, model_root)
-        else:
-            tokenizer_dir = Path(model_root) / SOUND_TOKENIZER_COMPONENT_NAME if model_root else None
-            candidate = tokenizer_dir / SOUND_TOKENIZER_CHECKPOINT_NAME if tokenizer_dir else None
-            avae_path = str(candidate) if candidate and candidate.exists() else None
-
-        if not avae_path:
-            raise ValueError(
-                "Cosmos3 sound generation was requested, but no AVAE sound "
-                "tokenizer checkpoint was provided. Set "
-                "custom_pipeline_args['sound_tokenizer_path'] or "
-                "COSMOS3_SOUND_TOKENIZER_PATH, or include "
-                f"{SOUND_TOKENIZER_COMPONENT_NAME}/{SOUND_TOKENIZER_CHECKPOINT_NAME} under the model path."
-            )
-
-        config_path = _resolve_model_file(explicit_config_path, model_root)
-        if config_path is None and model_root:
-            candidate = Path(model_root) / SOUND_TOKENIZER_COMPONENT_NAME / "config.json"
-            config_path = str(candidate) if candidate.exists() else None
-        component_config = _load_sound_tokenizer_component_config(config_path)
-        component_values = _component_arch_values(component_config)
-
-        sample_rate = _resolve_arch_value(
-            od_config,
-            args,
-            component_values,
-            field="sample_rate",
-            custom_keys=("sound_sample_rate", "sample_rate"),
-            nested_keys=("sample_rate", "sampling_rate"),
-            top_level_keys=("sound_sample_rate", "sample_rate"),
-            default=DEFAULT_SOUND_SAMPLE_RATE,
-            cast=int,
-        )
-        audio_channels = _resolve_arch_value(
-            od_config,
-            args,
-            component_values,
-            field="audio_channels",
-            custom_keys=("sound_audio_channels", "audio_channels", "stereo"),
-            nested_keys=("audio_channels", "dec_out_channels", "stereo"),
-            top_level_keys=("sound_audio_channels", "audio_channels", "stereo"),
-            default=DEFAULT_SOUND_CHANNELS,
-            cast=_as_audio_channels,
-        )
-        sound_dim = _resolve_arch_value(
-            od_config,
-            args,
-            component_values,
-            field="io_channels",
-            custom_keys=("sound_dim", "io_channels", "latent_ch"),
-            nested_keys=("io_channels", "vocoder_input_dim", "latent_ch"),
-            top_level_keys=("sound_dim",),
-            default=DEFAULT_SOUND_DIM,
-            cast=int,
-        )
-        hop_size = _resolve_arch_value(
-            od_config,
-            args,
-            component_values,
-            field="hop_size",
-            custom_keys=("sound_hop_size", "hop_size"),
-            nested_keys=("hop_size",),
-            top_level_keys=("sound_hop_size", "hop_size"),
-            default=DEFAULT_SOUND_HOP_SIZE,
-            cast=int,
-        )
-        normalize_latents = _as_bool(
-            _resolve_normalization_value(
-                od_config,
-                args,
-                name="normalize_latents",
-                default=DEFAULT_SOUND_NORMALIZE_LATENTS,
-            )
-        )
-        normalization_type = str(
-            _resolve_normalization_value(
-                od_config,
-                args,
-                name="normalization_type",
-                default=DEFAULT_SOUND_NORMALIZATION_TYPE,
-            )
-        )
-        tanh_input_scale = float(
-            _resolve_normalization_value(
-                od_config,
-                args,
-                name="tanh_input_scale",
-                default=DEFAULT_SOUND_TANH_INPUT_SCALE,
-            )
-        )
-        tanh_output_scale = float(
-            _resolve_normalization_value(
-                od_config,
-                args,
-                name="tanh_output_scale",
-                default=DEFAULT_SOUND_TANH_OUTPUT_SCALE,
-            )
-        )
-        tanh_clamp = float(
-            _resolve_normalization_value(
-                od_config,
-                args,
-                name="tanh_clamp",
-                default=DEFAULT_SOUND_TANH_CLAMP,
-            )
-        )
-        tokenizer = Cosmos3AVAEAudioTokenizer(
-            checkpoint_path=str(avae_path),
-            config_path=config_path,
-            sample_rate=sample_rate,
-            audio_channels=audio_channels,
-            io_channels=sound_dim,
-            hop_size=hop_size,
-            normalize_latents=normalize_latents,
-            normalization_type=normalization_type,
-            tanh_input_scale=tanh_input_scale,
-            tanh_output_scale=tanh_output_scale,
-            tanh_clamp=tanh_clamp,
-            dtype=getattr(od_config, "dtype", torch.bfloat16),
-            device=get_local_device(),
-        )
-        if _is_rank_zero():
-            logger.info(
-                "Loaded Cosmos3 AVAE sound tokenizer from %s (sr=%d, channels=%d, latent_ch=%d, hop=%d)",
-                avae_path,
-                sample_rate,
-                audio_channels,
-                sound_dim,
-                hop_size,
-            )
-        return cls(tokenizer)
-
-    def get_latent_num_samples(self, num_audio_samples: int) -> int:
-        return int(self.tokenizer.get_latent_num_samples(num_audio_samples))
-
-    def get_audio_num_samples(self, num_latent_samples: int) -> int:
-        return int(self.tokenizer.get_audio_num_samples(num_latent_samples))
-
-    @torch.no_grad()
-    def decode(self, latents: torch.Tensor) -> torch.Tensor:
-        """Decode sound latents.
-
-        Args:
-            latents: ``[B, C, T]`` or ``[C, T]`` tensor.
-
-        Returns:
-            ``[B, audio_channels, N]`` tensor for batched input, or
-            ``[audio_channels, N]`` for unbatched input.
-        """
-        squeeze = latents.ndim == 2
-        if squeeze:
-            latents = latents.unsqueeze(0)
-        audio = self.tokenizer.decode(latents)
-        audio = audio.clamp(-1.0, 1.0)
-        return audio.squeeze(0) if squeeze else audio
diff --git a/vllm_omni/diffusion/models/cosmos3/transformer_cosmos3.py b/vllm_omni/diffusion/models/cosmos3/transformer_cosmos3.py
index 353f77d7598..12d30be4ba6 100644
--- a/vllm_omni/diffusion/models/cosmos3/transformer_cosmos3.py
+++ b/vllm_omni/diffusion/models/cosmos3/transformer_cosmos3.py
@@ -83,51 +83,6 @@ def _tf_config_get(config: Any, key: str, default: Any) -> Any:
     return getattr(config, key, default)
 
 
-def _nested_get(value: Any, key: str) -> Any:
-    if isinstance(value, dict):
-        if key in value:
-            return value[key]
-        for child in value.values():
-            found = _nested_get(child, key)
-            if found is not None:
-                return found
-    elif isinstance(value, list | tuple):
-        for child in value:
-            found = _nested_get(child, key)
-            if found is not None:
-                return found
-    return None
-
-
-def _od_config_get(od_config: Any, key: str, default: Any = None) -> Any:
-    """Read Cosmos3 options from runtime, model, or transformer config."""
-    if od_config is None:
-        return default
-    for attr in ("custom_pipeline_args", "model_config"):
-        source = getattr(od_config, attr, None) or {}
-        if isinstance(source, dict):
-            if key in source:
-                return source[key]
-            found = _nested_get(source, key)
-            if found is not None:
-                return found
-    tf_model_config = getattr(od_config, "tf_model_config", None)
-    if isinstance(tf_model_config, dict):
-        if key in tf_model_config:
-            return tf_model_config[key]
-        found = _nested_get(tf_model_config, key)
-        if found is not None:
-            return found
-    value = _tf_config_get(tf_model_config, key, None)
-    return default if value is None else value
-
-
-def _as_bool(value: Any) -> bool:
-    if isinstance(value, str):
-        return value.strip().lower() in {"1", "true", "yes", "on"}
-    return bool(value)
-
-
 # ---------------------------------------------------------------------------
 # RMSNorm
 # ---------------------------------------------------------------------------
@@ -152,47 +107,6 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         return self.weight * hidden_states.to(input_dtype)
 
 
-class DomainAwareLinear(nn.Module):
-    """Linear projection with one weight/bias pair per action embodiment domain."""
-
-    def __init__(
-        self,
-        input_size: int,
-        output_size: int,
-        num_domains: int,
-        *,
-        dtype: torch.dtype = torch.bfloat16,
-    ) -> None:
-        super().__init__()
-        self.input_size = int(input_size)
-        self.output_size = int(output_size)
-        self.num_domains = int(num_domains)
-        self.fc = nn.Embedding(self.num_domains, self.output_size * self.input_size, dtype=dtype)
-        self.bias = nn.Embedding(self.num_domains, self.output_size, dtype=dtype)
-        nn.init.xavier_uniform_(self.fc.weight)
-        nn.init.zeros_(self.bias.weight)
-
-    def forward(self, x: torch.Tensor, domain_id: torch.Tensor) -> torch.Tensor:
-        if domain_id.ndim == 0:
-            domain_id = domain_id.unsqueeze(0)
-        domain_id = domain_id.to(device=x.device, dtype=torch.long).reshape(-1)
-        if x.shape[0] != domain_id.shape[0]:
-            raise ValueError(
-                "Cosmos3 action domain_id batch size must match action tokens: "
-                f"tokens={x.shape[0]}, domain_id={domain_id.shape[0]}."
-            )
-        if torch.any((domain_id < 0) | (domain_id >= self.num_domains)):
-            raise ValueError(f"Cosmos3 action domain_id must be in [0, {self.num_domains}), got {domain_id.tolist()}.")
-
-        weight = self.fc(domain_id).view(domain_id.shape[0], self.input_size, self.output_size)
-        bias = self.bias(domain_id).view(domain_id.shape[0], self.output_size)
-        if x.ndim == 2:
-            return torch.bmm(x.unsqueeze(1), weight).squeeze(1) + bias
-        if x.ndim == 3:
-            return torch.bmm(x, weight) + bias.unsqueeze(1)
-        raise ValueError(f"Cosmos3 DomainAwareLinear expected rank-2 or rank-3 input, got {tuple(x.shape)}.")
-
-
 # ---------------------------------------------------------------------------
 # Rotary Position Embeddings (mRoPE)
 # ---------------------------------------------------------------------------
@@ -263,54 +177,6 @@ def compute_mrope_position_ids_vision(
     return mrope_ids, next_offset
 
 
-def compute_mrope_position_ids_sound(
-    grid_t: int,
-    temporal_offset: int | float,
-    sound_latent_fps: float,
-    base_fps: float = 24.0,
-    temporal_compression_factor_sound: int = 1,
-    enable_fps_modulation: bool = True,
-    base_temporal_compression_factor: int | None = None,
-) -> tuple[torch.Tensor, int | float]:
-    """Generate mRoPE IDs for sound tokens as a (T, 1, 1) grid."""
-    del base_temporal_compression_factor
-    return compute_mrope_position_ids_vision(
-        grid_t=grid_t,
-        grid_h=1,
-        grid_w=1,
-        temporal_offset=temporal_offset,
-        fps=sound_latent_fps,
-        base_fps=base_fps,
-        temporal_compression_factor=temporal_compression_factor_sound,
-        base_temporal_compression_factor=temporal_compression_factor_sound,
-        enable_fps_modulation=enable_fps_modulation,
-    )
-
-
-def compute_mrope_position_ids_action(
-    grid_t: int,
-    temporal_offset: int | float,
-    action_fps: float | None,
-    base_fps: float = 24.0,
-    base_temporal_compression_factor: int = 4,
-    enable_fps_modulation: bool = True,
-    start_frame_offset: int = 1,
-) -> tuple[torch.Tensor, int | float]:
-    """Generate mRoPE IDs for action tokens as a frame-rate (T, 1, 1) grid."""
-    return compute_mrope_position_ids_vision(
-        grid_t=grid_t,
-        grid_h=1,
-        grid_w=1,
-        temporal_offset=temporal_offset,
-        fps=action_fps,
-        base_fps=base_fps,
-        temporal_compression_factor=1,
-        base_temporal_compression_factor=base_temporal_compression_factor,
-        enable_fps_modulation=enable_fps_modulation,
-        start_frame_offset=start_frame_offset,
-    )
-
-
 class Qwen3VLTextRotaryEmbedding(nn.Module):
     """Multi-dimensional rotary position embedding for Qwen3-VL."""
 
@@ -1055,32 +921,9 @@ def __init__(
         self.latent_channel_size = int(_tf_config_get(model_config, "latent_channel", 48))
         self.timestep_scale = float(_tf_config_get(model_config, "timestep_scale", 0.001))
         self.base_fps = float(_tf_config_get(model_config, "base_fps", 24.0))
-        sound_gen_value = _od_config_get(od_config, "sound_gen", None)
-        sound_dim_value = _od_config_get(od_config, "sound_dim", None)
-        if sound_dim_value is None:
-            sound_dim_value = _od_config_get(od_config, "io_channels", None)
-        if sound_dim_value is None:
-            sound_dim_value = _od_config_get(od_config, "vocoder_input_dim", None)
-        if sound_dim_value is None:
-            sound_dim_value = _od_config_get(od_config, "latent_ch", None)
-        self.sound_gen = _as_bool(sound_gen_value) if sound_gen_value is not None else sound_dim_value is not None
-        from .sound_tokenizer import get_sound_dim, get_sound_latent_fps
-
-        self.sound_dim = int(sound_dim_value if sound_dim_value is not None else get_sound_dim(od_config))
-        action_gen_value = _od_config_get(od_config, "action_gen", None)
-        action_dim_value = _od_config_get(od_config, "action_dim", None)
-        if action_dim_value is None:
-            action_dim_value = _od_config_get(od_config, "max_action_dim", None)
-        self.action_gen = _as_bool(action_gen_value) if action_gen_value is not None else False
-        self.action_dim = int(action_dim_value if action_dim_value is not None else 64)
-        self.num_embodiment_domains = int(_od_config_get(od_config, "num_embodiment_domains", 32))
-        self.sound_latent_fps = float(get_sound_latent_fps(od_config))
         if temporal_compression_factor is None:
             temporal_compression_factor = _tf_config_get(model_config, "temporal_compression_factor", 4)
         self.temporal_compression_factor = int(temporal_compression_factor)
-        self.temporal_compression_factor_sound = int(
-            _tf_config_get(model_config, "temporal_compression_factor_sound", 1)
-        )
         self.enable_fps_modulation = bool(_tf_config_get(model_config, "enable_fps_modulation", True))
         self.temporal_modality_margin = int(
             _tf_config_get(
@@ -1113,25 +956,6 @@ def __init__(
         # vae2llm / llm2vae are small projection layers; not worth quantizing.
         self.vae2llm = nn.Linear(self.patch_latent_dim, self.hidden_size)
         self.llm2vae = nn.Linear(self.hidden_size, self.patch_latent_dim)
-        if self.action_gen:
-            self.action2llm = DomainAwareLinear(
-                self.action_dim,
-                self.hidden_size,
-                self.num_embodiment_domains,
-                dtype=dtype,
-            )
-            self.llm2action = DomainAwareLinear(
-                self.hidden_size,
-                self.action_dim,
-                self.num_embodiment_domains,
-                dtype=dtype,
-            )
-            self.action_modality_embed = nn.Parameter(torch.zeros(self.hidden_size, dtype=dtype))
-        if self.sound_gen:
-            self.sound2llm = nn.Linear(self.sound_dim, self.hidden_size)
-            self.llm2sound = nn.Linear(self.hidden_size, self.sound_dim)
-            self.sound_modality_embed = nn.Parameter(torch.zeros(self.hidden_size))
-
         self.time_embedder = TimestepEmbedder(self.hidden_size, target_dtype=torch.bfloat16)
 
         self.gen_layers = nn.ModuleList(
@@ -1211,36 +1035,6 @@ def unpatchify(self, tokens: torch.Tensor, t: int, h: int, w: int) -> torch.Tens
             x = x[:, :, :, :h, :w]
         return x
 
-    def pack_sound(self, sound_latents: torch.Tensor) -> torch.Tensor:
-        """[B, C_sound, T_sound] -> [B, T_sound, C_sound]."""
-        if sound_latents.ndim != 3:
-            raise ValueError(f"Cosmos3 sound latents must have shape [B, C, T], got {tuple(sound_latents.shape)}.")
-        if sound_latents.shape[1] != self.sound_dim:
-            raise ValueError(
-                f"Cosmos3 sound latent channel mismatch: expected {self.sound_dim}, got {sound_latents.shape[1]}."
-            )
-        return sound_latents.permute(0, 2, 1).contiguous()
-
-    @staticmethod
-    def unpack_sound(tokens: torch.Tensor) -> torch.Tensor:
-        """[B, T_sound, C_sound] -> [B, C_sound, T_sound]."""
-        return tokens.permute(0, 2, 1).contiguous()
-
-    def pack_action(self, action_latents: torch.Tensor) -> torch.Tensor:
-        """Validate and return action latents as [B, T_action, D_action] tokens."""
-        if action_latents.ndim != 3:
-            raise ValueError(f"Cosmos3 action latents must have shape [B, T, D], got {tuple(action_latents.shape)}.")
-        if action_latents.shape[-1] != self.action_dim:
-            raise ValueError(
-                f"Cosmos3 action latent dimension mismatch: expected {self.action_dim}, got {action_latents.shape[-1]}."
-            )
-        return action_latents.contiguous()
-
-    @staticmethod
-    def unpack_action(tokens: torch.Tensor) -> torch.Tensor:
-        """Return [B, T_action, D_action] action predictions."""
-        return tokens.contiguous()
-
     # -- RoPE computation ----------------------------------------------------
 
     def _compute_rope_freqs(
@@ -1252,18 +1046,12 @@ def _compute_rope_freqs(
         fps: float | None,
         device: torch.device,
         dtype: torch.dtype,
-        t_action: int | None = None,
-        action_start_frame_offset: int = 1,
-        action_fps: float | None = None,
-        t_sound: int | None = None,
     ) -> tuple[tuple[torch.Tensor, torch.Tensor], tuple[torch.Tensor, torch.Tensor]]:
         """Compute mRoPE cos/sin for UND text and GEN media pathways."""
         B = text_mask.shape[0]
         S_text = text_mask.shape[1]
         text_lengths = text_mask.sum(dim=1).long()
         effective_fps = fps if fps is not None and t > 1 else None
-        action_frames = int(t_action or 0)
-        sound_frames = int(t_sound or 0)
 
         text_pos_list = []
         gen_pos_list = []
@@ -1281,32 +1069,6 @@ def _compute_rope_freqs(
                 temporal_compression_factor=self.temporal_compression_factor,
                 enable_fps_modulation=self.enable_fps_modulation,
             )
-            gen_positions = [v_pos]
-            if action_frames > 0:
-                a_pos, _ = compute_mrope_position_ids_action(
-                    action_frames,
-                    temporal_offset=media_temporal_offset,
-                    action_fps=action_fps if action_fps is not None else fps,
-                    base_fps=self.base_fps,
-                    base_temporal_compression_factor=self.temporal_compression_factor,
-                    enable_fps_modulation=self.enable_fps_modulation,
-                    start_frame_offset=action_start_frame_offset,
-                )
-                gen_positions.append(a_pos)
-            if sound_frames > 0:
-                s_pos, _ = compute_mrope_position_ids_sound(
-                    sound_frames,
-                    temporal_offset=media_temporal_offset,
-                    sound_latent_fps=self.sound_latent_fps,
-                    base_fps=self.base_fps,
-                    temporal_compression_factor_sound=getattr(self, "temporal_compression_factor_sound", 1),
-                    enable_fps_modulation=self.enable_fps_modulation,
-                )
-                gen_positions.append(s_pos)
-            pos_dtype = gen_positions[0].dtype
-            for pos in gen_positions[1:]:
-                pos_dtype = torch.promote_types(pos_dtype, pos.dtype)
-            v_pos = torch.cat([pos.to(pos_dtype) for pos in gen_positions], dim=1)
             if real_len < S_text:
                 t_pos = torch.cat(
                     [t_pos, torch.zeros(3, S_text - real_len, dtype=t_pos.dtype)],
@@ -1338,34 +1100,18 @@ def _validate_gen_sequence_parallel(
         *,
         s_gen: int,
         s_video: int,
-        s_action: int,
-        s_sound: int,
-        has_action: bool,
-        has_sound: bool,
         ulysses_size: int,
     ) -> None:
         if ulysses_size <= 1 or s_gen % ulysses_size == 0:
             return
 
-        detail_parts = [f"video tokens {s_video}"]
-        if has_action:
-            detail_parts.append(f"action tokens {s_action}")
-        if has_sound:
-            detail_parts.append(f"sound tokens {s_sound}")
-        detail = " = " + " + ".join(detail_parts) if len(detail_parts) > 1 else ""
         adjust_detail = (
-            "Adjust the spatial resolution, frame count, action chunk size, "
-            "sound duration, or sound latent FPS so the combined media sequence is a "
-            "multiple of ulysses_degree."
-            if has_action or has_sound
-            else (
-                "Adjust the spatial resolution so that "
-                "t * ceil(h/patch) * ceil(w/patch) is a multiple "
-                "of ulysses_degree."
-            )
+            "Adjust the spatial resolution so that "
+            "t * ceil(h/patch) * ceil(w/patch) is a multiple "
+            "of ulysses_degree."
         )
         raise ValueError(
-            f"GEN sequence length ({s_gen}{detail}) must be divisible by "
+            f"GEN sequence length ({s_gen} video tokens {s_video}) must be divisible by "
             f"ulysses_degree ({ulysses_size}). {adjust_detail}"
         )
 
@@ -1379,15 +1125,9 @@ def forward(
         text_mask: torch.Tensor,
         video_shape: tuple[int, int, int],
         fps: float | None = None,
-        action_latents: torch.Tensor | None = None,
-        action_domain_ids: torch.Tensor | None = None,
-        action_noisy_mask: torch.Tensor | None = None,
-        action_start_frame_offset: int = 1,
-        action_fps: float | None = None,
-        sound_latents: torch.Tensor | None = None,
         noisy_frame_mask: torch.Tensor | None = None,
         **kwargs,
-    ) -> torch.Tensor | tuple[torch.Tensor, ...]:
+    ) -> torch.Tensor:
         """
         Args:
             hidden_states: [B, C, t, h, w] noisy latents
@@ -1396,37 +1136,17 @@ def forward(
             text_mask: [B, S_text] attention mask (1=real, 0=pad)
             video_shape: (t, h, w) in latent space
             fps: video frame rate for temporal mRoPE modulation
-            action_latents: Optional [B, T_action, D_action] noisy action latents.
-            action_domain_ids: Optional [B] embodiment domain IDs for action projections.
-            action_noisy_mask: Optional [B, T_action, 1] mask where 1=noisy
-                action token and 0=clean conditioned token.
-            sound_latents: Optional [B, C_sound, T_sound] noisy sound latents.
             noisy_frame_mask: Optional [B, 1, t, 1, 1] mask where 1=noisy (add
                 timestep embedding, predict velocity) and 0=conditioned (clean
                 context, skip timestep embedding).  None means all frames noisy
                 (T2V mode).
 
         Returns:
-            [B, C, t, h, w] velocity prediction, or
-            tuple outputs in video, action, sound order when extra modalities are provided.
+            [B, C, t, h, w] velocity prediction.
         """
         t, h, w = video_shape
         hp, wp, _, _ = self._pad_to_patch_size(h, w)
         max_real_len = int(text_mask.sum(dim=1).max().item())
-        has_action = action_latents is not None
-        has_sound = sound_latents is not None
-        if has_action and not self.action_gen:
-            raise ValueError(
-                "Cosmos3 action generation was requested, but this transformer "
-                "was initialized without action modules. Check that the "
-                "transformer config enables action_gen."
-            )
-        if has_sound and not self.sound_gen:
-            raise ValueError(
-                "Cosmos3 sound generation was requested, but this transformer "
-                "was initialized without sound modules. Check that the "
-                "transformer config enables sound_gen or defines sound_dim."
-            )
 
         # Query Ulysses state at runtime
         ulysses_size, _, _ = _get_ulysses_state()
@@ -1434,30 +1154,6 @@ def forward(
         # Patchify latents and project to hidden space
         hidden_video = self.vae2llm(self.patchify(hidden_states, t, h, w))
         s_video = hidden_video.shape[1]
-        s_action = 0
-        hidden_action = None
-        s_sound = 0
-        hidden_sound = None
-        if action_latents is not None:
-            if action_latents.shape[0] != hidden_states.shape[0]:
-                raise ValueError(
-                    "Cosmos3 action and video batch sizes must match: "
-                    f"video={hidden_states.shape[0]}, action={action_latents.shape[0]}."
-                )
-            if action_domain_ids is None:
-                action_domain_ids = torch.zeros(action_latents.shape[0], dtype=torch.long, device=action_latents.device)
-            hidden_action = self.action2llm(self.pack_action(action_latents), action_domain_ids)
-            hidden_action = hidden_action + self.action_modality_embed.to(hidden_action.dtype)
-            s_action = hidden_action.shape[1]
-        if sound_latents is not None:
-            if sound_latents.shape[0] != hidden_states.shape[0]:
-                raise ValueError(
-                    "Cosmos3 sound and video batch sizes must match: "
-                    f"video={hidden_states.shape[0]}, sound={sound_latents.shape[0]}."
-                )
-            hidden_sound = self.sound2llm(self.pack_sound(sound_latents))
-            hidden_sound = hidden_sound + self.sound_modality_embed.to(hidden_sound.dtype)
-            s_sound = hidden_sound.shape[1]
 
         # Timestep embedding (fp32 for precision).
         # For I2V: only add to noisy tokens, not conditioned ones.
@@ -1480,25 +1176,7 @@ def forward(
         else:
             hidden_video = hidden_video + time_embed.unsqueeze(1)
 
-        if hidden_action is not None:
-            if action_noisy_mask is None:
-                hidden_action = hidden_action + time_embed.unsqueeze(1)
-            else:
-                if action_noisy_mask.shape != (hidden_action.shape[0], hidden_action.shape[1], 1):
-                    raise ValueError(
-                        "Cosmos3 action_noisy_mask must have shape [B, T_action, 1], "
-                        f"got {tuple(action_noisy_mask.shape)}."
-                    )
-                hidden_action = hidden_action + time_embed.unsqueeze(1) * action_noisy_mask.to(hidden_action.dtype)
-
-        if hidden_sound is not None:
-            hidden_sound = hidden_sound + time_embed.unsqueeze(1)
-        hidden_parts = [hidden_video]
-        if hidden_action is not None:
-            hidden_parts.append(hidden_action)
-        if hidden_sound is not None:
-            hidden_parts.append(hidden_sound)
-        hidden_gen = torch.cat(hidden_parts, dim=1)
+        hidden_gen = hidden_video
 
         with torch.nn.attention.sdpa_kernel(self.sdpa_backends, set_priority=True):
             # Run UND pathway once and cache K/V (replicated across all ranks)
@@ -1511,10 +1189,6 @@ def forward(
                     fps,
                     hidden_states.device,
                     hidden_states.dtype,
-                    t_action=s_action,
-                    action_start_frame_offset=action_start_frame_offset,
-                    action_fps=action_fps,
-                    t_sound=s_sound,
                 )
                 cached_kv_full = self.language_model(text_ids, text_mask, freqs_und)
                 self.cached_freqs_gen = freqs_gen
@@ -1531,10 +1205,6 @@ def forward(
             self._validate_gen_sequence_parallel(
                 s_gen=hidden_gen.shape[1],
                 s_video=s_video,
-                s_action=s_action,
-                s_sound=s_sound,
-                has_action=has_action,
-                has_sound=has_sound,
                 ulysses_size=ulysses_size,
             )
             freqs_cos, freqs_sin = self.cached_freqs_gen
@@ -1568,28 +1238,7 @@ def forward(
 
         # Final norm and project back to latent space
         hidden_gen = self.norm_moe_gen(hidden_gen)
-        if not has_action and not has_sound:
-            return self.unpatchify(self.llm2vae(hidden_gen), t, h, w)
-
-        split_sizes = [s_video]
-        if has_action:
-            split_sizes.append(s_action)
-        if has_sound:
-            split_sizes.append(s_sound)
-        split_hidden = hidden_gen.split(split_sizes, dim=1)
-        hidden_video = split_hidden[0]
-        video_pred = self.unpatchify(self.llm2vae(hidden_video), t, h, w)
-        outputs: list[torch.Tensor] = [video_pred]
-        split_idx = 1
-        if has_action:
-            hidden_action = split_hidden[split_idx]
-            split_idx += 1
-            assert action_domain_ids is not None
-            outputs.append(self.unpack_action(self.llm2action(hidden_action, action_domain_ids)))
-        if has_sound:
-            hidden_sound = split_hidden[split_idx]
-            outputs.append(self.unpack_sound(self.llm2sound(hidden_sound)))
-        return tuple(outputs)
+        return self.unpatchify(self.llm2vae(hidden_gen), t, h, w)
 
     def post_load_weights(self) -> None:
         """Post-load processing: ensure correct dtypes."""
diff --git a/vllm_omni/entrypoints/openai/api_server.py b/vllm_omni/entrypoints/openai/api_server.py
index b38d2b65242..2be82957d4a 100644
--- a/vllm_omni/entrypoints/openai/api_server.py
+++ b/vllm_omni/entrypoints/openai/api_server.py
@@ -2449,7 +2449,7 @@ async def _run_video_generation_job(
     started_at = time.perf_counter()
     output_path = None
     try:
-        video_bytes, stage_durations, peak_memory_mb, action = await handler.generate_video_bytes(
+        video_bytes, stage_durations, peak_memory_mb = await handler.generate_video_bytes(
             request, video_id, reference_image=reference_image
         )
 
@@ -2467,7 +2467,6 @@ async def _run_video_generation_job(
                 "inference_time_s": time.perf_counter() - started_at,
                 "stage_durations": stage_durations,
                 "peak_memory_mb": peak_memory_mb,
-                "action": action,
             },
         )
     except (EngineGenerateError, EngineDeadError) as exc:
@@ -2532,8 +2531,6 @@ async def _parse_video_form(
     flow_shift: float | None = Form(default=None),
     true_cfg_scale: float | None = Form(default=None),
     seed: int | None = Form(default=None),
-    generate_sound: bool | None = Form(default=None),
-    sound_duration: float | None = Form(default=None, gt=0.0),
     negative_prompt: str | None = Form(default=None),
     enable_frame_interpolation: bool | None = Form(default=None),
     frame_interpolation_exp: int | None = Form(default=None, ge=1),
@@ -2574,8 +2571,6 @@ async def _parse_video_form(
         "flow_shift": flow_shift,
         "true_cfg_scale": true_cfg_scale,
         "seed": seed,
-        "generate_sound": generate_sound,
-        "sound_duration": sound_duration,
         "negative_prompt": negative_prompt,
         "enable_frame_interpolation": enable_frame_interpolation,
         "frame_interpolation_exp": frame_interpolation_exp,
@@ -2679,7 +2674,7 @@ async def create_video_sync(
     raw_request.state.request_metadata = RequestResponseMetadata(request_id=request_id)
     started_at = time.perf_counter()
     try:
-        video_bytes, stage_durations, peak_memory_mb, _action = await asyncio.wait_for(
+        video_bytes, stage_durations, peak_memory_mb = await asyncio.wait_for(
             handler.generate_video_bytes(request, request_id, reference_image=reference_image),
             timeout=VIDEO_SYNC_TIMEOUT_S,
         )
diff --git a/vllm_omni/entrypoints/openai/protocol/__init__.py b/vllm_omni/entrypoints/openai/protocol/__init__.py
index 58ff188250e..c73203cc4db 100644
--- a/vllm_omni/entrypoints/openai/protocol/__init__.py
+++ b/vllm_omni/entrypoints/openai/protocol/__init__.py
@@ -9,7 +9,6 @@
     ResponseFormat,
 )
 from vllm_omni.entrypoints.openai.protocol.videos import (
-    VideoAction,
     VideoData,
     VideoGenerationRequest,
     VideoGenerationResponse,
@@ -20,7 +19,6 @@
     "ImageGenerationRequest",
     "ImageGenerationResponse",
     "ResponseFormat",
-    "VideoAction",
     "VideoData",
     "VideoGenerationRequest",
     "VideoGenerationResponse",
diff --git a/vllm_omni/entrypoints/openai/protocol/videos.py b/vllm_omni/entrypoints/openai/protocol/videos.py
index ec5ab14e8d8..d46c8d43d6b 100644
--- a/vllm_omni/entrypoints/openai/protocol/videos.py
+++ b/vllm_omni/entrypoints/openai/protocol/videos.py
@@ -149,15 +149,6 @@ class VideoGenerationRequest(BaseModel):
         description="True CFG scale (model-specific parameter, may be ignored if not supported)",
     )
     seed: int | None = Field(default=None, description="Random seed for reproducibility")
-    generate_sound: bool = Field(
-        default=False,
-        description="Request model-generated audio for video models that support sound generation.",
-    )
-    sound_duration: float | None = Field(
-        default=None,
-        gt=0.0,
-        description="Duration in seconds for model-generated audio. Defaults to the generated video duration.",
-    )
 
     # vllm-omni extensions for post-generation frame interpolation.
     enable_frame_interpolation: bool = Field(
@@ -220,24 +211,12 @@ def resolve_video_params(self) -> VideoParams:
         return vp
 
 
-class VideoAction(BaseModel):
-    """Generated action sequence returned by action-capable video models."""
-
-    data: list[Any] = Field(..., description="JSON-serializable nested action values")
-    shape: list[int] = Field(..., description="Shape of the returned action data")
-    dtype: str | None = Field(default=None, description="Source action dtype, if available")
-    raw_action_dim: int | None = Field(default=None, description="Raw action dimension requested by the model")
-    action_mode: str | None = Field(default=None, description="Action generation mode")
-    domain_id: int | None = Field(default=None, description="Action embodiment domain id")
-
-
 class VideoData(BaseModel):
     """Single generated video data."""
 
     b64_json: str | None = Field(default=None, description="Base64-encoded MP4 video")
     url: str | None = Field(default=None, description="Video URL (not implemented)")
     revised_prompt: str | None = Field(default=None, description="Revised prompt (OpenAI compatibility, always null)")
-    action: VideoAction | None = Field(default=None, description="Generated action sequence metadata, if any")
 
 
 class VideoGenerationResponse(BaseModel):
@@ -310,7 +289,6 @@ class VideoResponse(BaseModel):
         default=0.0,
         description="Peak device memory usage in MB reported by the diffusion pipeline.",
     )
-    action: VideoAction | None = Field(default=None, description="Generated action sequence metadata, if any")
 
     @property
     def file_extension(self) -> str:
diff --git a/vllm_omni/entrypoints/openai/serving_video.py b/vllm_omni/entrypoints/openai/serving_video.py
index fcc65fab4aa..d21d09b2ba3 100644
--- a/vllm_omni/entrypoints/openai/serving_video.py
+++ b/vllm_omni/entrypoints/openai/serving_video.py
@@ -17,7 +17,6 @@
 from vllm_omni.diffusion.data import GuardrailViolationError
 from vllm_omni.entrypoints.async_omni import AsyncOmni
 from vllm_omni.entrypoints.openai.protocol.videos import (
-    VideoAction,
     VideoData,
     VideoGenerationRequest,
     VideoGenerationResponse,
@@ -45,9 +44,6 @@ class VideoGenerationArtifacts:
     """Normalized outputs and profiler metadata extracted from one request."""
 
     videos: list[Any]
-    audios: list[Any | None]
-    actions: list[VideoAction | None]
-    audio_sample_rate: int
     output_fps: int
     stage_durations: dict[str, float]
     peak_memory_mb: float
@@ -98,7 +94,7 @@ async def _run_and_extract(
         *,
         reference_image: ReferenceImage | None = None,
     ) -> VideoGenerationArtifacts:
-        """Run the generation pipeline and extract video/audio/profiler outputs."""
+        """Run the generation pipeline and extract video/profiler outputs."""
         prompt: OmniTextPrompt = OmniTextPrompt(prompt=request.prompt, modalities=["video"])
         if request.negative_prompt is not None:
             prompt["negative_prompt"] = request.negative_prompt
@@ -151,10 +147,6 @@ async def _run_and_extract(
         )
         if "flow_shift" in provided_fields and request.flow_shift is not None:
             gen_params.extra_args["flow_shift"] = request.flow_shift
-        if "generate_sound" in provided_fields:
-            gen_params.extra_args["generate_sound"] = request.generate_sound
-        if "sound_duration" in provided_fields and request.sound_duration is not None:
-            gen_params.extra_args["sound_duration"] = request.sound_duration
 
         # Apply model-specific extra parameters
         if request.extra_params is not None:
@@ -179,15 +171,9 @@ async def _run_and_extract(
 
         result = await self._run_generation(prompt, gen_params, reference_id)
         videos = self._extract_video_outputs(result)
-        audios = self._extract_audio_outputs(result, expected_count=len(videos))
-        actions = self._extract_action_outputs(result, expected_count=len(videos))
-        audio_sample_rate = self._resolve_audio_sample_rate(result)
         output_fps = (vp.fps or self._resolve_fps(result) or 24) * self._resolve_video_fps_multiplier(result)
         return VideoGenerationArtifacts(
             videos=videos,
-            audios=audios,
-            actions=actions,
-            audio_sample_rate=audio_sample_rate,
             output_fps=output_fps,
             stage_durations=self._extract_stage_durations(result),
             peak_memory_mb=self._extract_peak_memory_mb(result),
@@ -210,20 +196,13 @@ async def generate_videos(
         _t_encode_start = time.perf_counter()
         video_data = [
             VideoData(
-                b64_json=(
-                    encode_video_base64(video, fps=artifacts.output_fps, video_codec_options=video_codec_options)
-                    if artifacts.audios[idx] is None
-                    else encode_video_base64(
-                        video,
-                        fps=artifacts.output_fps,
-                        audio=artifacts.audios[idx],
-                        audio_sample_rate=artifacts.audio_sample_rate,
-                        video_codec_options=video_codec_options,
-                    )
+                b64_json=encode_video_base64(
+                    video,
+                    fps=artifacts.output_fps,
+                    video_codec_options=video_codec_options,
                 ),
-                action=artifacts.actions[idx],
             )
-            for idx, video in enumerate(artifacts.videos)
+            for video in artifacts.videos
         ]
         _t_encode_ms = (time.perf_counter() - _t_encode_start) * 1000
         logger.info("Video response encoding (MP4+base64): %.2f ms", _t_encode_ms)
@@ -240,7 +219,7 @@ async def generate_video_bytes(
         reference_id: str,
         *,
         reference_image: ReferenceImage | None = None,
-    ) -> tuple[bytes, dict[str, float], float, VideoAction | None]:
+    ) -> tuple[bytes, dict[str, float], float]:
         """Generate a video and return raw MP4 bytes, bypassing base64 encoding."""
         artifacts = await self._run_and_extract(request, reference_id, reference_image=reference_image)
         if len(artifacts.videos) > 1:
@@ -249,8 +228,6 @@ async def generate_video_bytes(
                 reference_id,
                 len(artifacts.videos),
             )
-        audio = artifacts.audios[0]
-
         video_codec_options = {"preset": "ultrafast", "threads": "0"}
         if request.extra_params is not None and isinstance(request.extra_params, dict):
             if "video_codec_options" in request.extra_params:
@@ -260,12 +237,11 @@ async def generate_video_bytes(
         video_bytes = _encode_video_bytes(
             artifacts.videos[0],
             fps=artifacts.output_fps,
-            **({"audio": audio, "audio_sample_rate": artifacts.audio_sample_rate} if audio is not None else {}),
             video_codec_options=video_codec_options,
         )
         _t_encode_ms = (time.perf_counter() - _t_encode_start) * 1000
         logger.info("Video response encoding (MP4 bytes): %.2f ms", _t_encode_ms)
-        return video_bytes, artifacts.stage_durations, artifacts.peak_memory_mb, artifacts.actions[0]
+        return video_bytes, artifacts.stage_durations, artifacts.peak_memory_mb
 
     @staticmethod
     def _resolve_video_fps_multiplier(result: Any) -> int:
@@ -407,50 +383,6 @@ def _extract_video_outputs(self, result: Any) -> list[Any]:
             )
         return normalized
 
-    @staticmethod
-    def _extract_audio_outputs(result: Any, expected_count: int) -> list[Any | None]:
-        audio = None
-        if hasattr(result, "multimodal_output") and result.multimodal_output:
-            audio = result.multimodal_output.get("audio")
-        elif hasattr(result, "request_output"):
-            request_output = result.request_output
-            if isinstance(request_output, dict) and request_output.get("multimodal_output"):
-                mm_output = request_output.get("multimodal_output") or {}
-                audio = mm_output.get("audio")
-            elif hasattr(request_output, "multimodal_output") and request_output.multimodal_output:
-                audio = request_output.multimodal_output.get("audio")
-
-        if audio is None:
-            return [None] * expected_count
-
-        if isinstance(audio, (list, tuple)):
-            if len(audio) == expected_count and any(hasattr(item, "shape") or hasattr(item, "ndim") for item in audio):
-                return list(audio)
-            if expected_count == 1:
-                return [audio]
-
-        if hasattr(audio, "ndim") and getattr(audio, "ndim", None) is not None and audio.ndim > 1:
-            first_dim = getattr(audio, "shape", [0])[0]
-            if first_dim == expected_count:
-                return [audio[i] for i in range(expected_count)]
-
-        if expected_count == 1:
-            return [audio]
-
-        return [audio] + [None] * max(expected_count - 1, 0)
-
-    @classmethod
-    def _extract_action_outputs(cls, result: Any, expected_count: int) -> list[VideoAction | None]:
-        custom_output = cls._extract_custom_output(result)
-        if not custom_output or "action" not in custom_output:
-            return [None] * expected_count
-
-        action_items = cls._split_action_payload(custom_output["action"], expected_count)
-        return [
-            cls._make_video_action(action_item, custom_output) if action_item is not None else None
-            for action_item in action_items
-        ]
-
     @staticmethod
     def _extract_custom_output(result: Any) -> dict[str, Any]:
         custom_output = getattr(result, "custom_output", None)
@@ -469,102 +401,6 @@ def _extract_custom_output(result: Any) -> dict[str, Any]:
 
         return custom_output if isinstance(custom_output, dict) else {}
 
-    @classmethod
-    def _split_action_payload(cls, action: Any, expected_count: int) -> list[Any | None]:
-        if expected_count <= 0:
-            return []
-
-        shape = cls._shape_of(action)
-        if len(shape) >= 3:
-            count = min(shape[0], expected_count)
-            actions = [cls._index_action(action, i) for i in range(count)]
-            actions.extend([None] * (expected_count - count))
-            return actions
-
-        return [action] + [None] * (expected_count - 1)
-
-    @classmethod
-    def _make_video_action(cls, action: Any, custom_output: dict[str, Any]) -> VideoAction:
-        data = cls._to_jsonable(action)
-        if not isinstance(data, list):
-            data = [data]
-
-        action_mode = custom_output.get("action_mode")
-        return VideoAction(
-            data=data,
-            shape=cls._shape_of(action),
-            dtype=cls._dtype_of(action),
-            raw_action_dim=cls._coerce_optional_int(custom_output.get("raw_action_dim")),
-            action_mode=str(action_mode) if action_mode is not None else None,
-            domain_id=cls._coerce_optional_int(custom_output.get("domain_id")),
-        )
-
-    @staticmethod
-    def _index_action(action: Any, index: int) -> Any:
-        try:
-            return action[index]
-        except (IndexError, KeyError, TypeError):
-            return None
-
-    @classmethod
-    def _to_jsonable(cls, value: Any) -> Any:
-        if hasattr(value, "detach"):
-            value = value.detach()
-        if hasattr(value, "cpu"):
-            value = value.cpu()
-        if hasattr(value, "tolist"):
-            return cls._to_jsonable(value.tolist())
-        if isinstance(value, (list, tuple)):
-            return [cls._to_jsonable(item) for item in value]
-        if hasattr(value, "item"):
-            try:
-                return value.item()
-            except (TypeError, ValueError):
-                pass
-        return value
-
-    @classmethod
-    def _shape_of(cls, value: Any) -> list[int]:
-        shape = getattr(value, "shape", None)
-        if shape is not None:
-            try:
-                return [int(dim) for dim in shape]
-            except (TypeError, ValueError):
-                pass
-        if isinstance(value, (list, tuple)):
-            if not value:
-                return [0]
-            return [len(value)] + cls._shape_of(value[0])
-        return []
-
-    @staticmethod
-    def _dtype_of(value: Any) -> str | None:
-        dtype = getattr(value, "dtype", None)
-        return str(dtype) if dtype is not None else None
-
-    @staticmethod
-    def _coerce_optional_int(value: Any) -> int | None:
-        if value is None:
-            return None
-        try:
-            value = value.item() if hasattr(value, "item") else value
-            return int(value)
-        except (TypeError, ValueError):
-            return None
-
-    def _resolve_audio_sample_rate(self, result: Any) -> int:
-        result_sample_rate = self._extract_audio_sample_rate_from_result(result)
-        if result_sample_rate is not None:
-            return result_sample_rate
-
-        model_config = getattr(self._engine_client, "model_config", None)
-        hf_config = getattr(model_config, "hf_config", None)
-        config_sample_rate = self._extract_audio_sample_rate_from_config(hf_config)
-        if config_sample_rate is not None:
-            return config_sample_rate
-
-        return 24000
-
     @staticmethod
     def _resolve_fps(result: Any) -> int | None:
         """Extract fps from multimodal_output if the model reported it."""
@@ -604,86 +440,6 @@ def _resolve_fps(result: Any) -> int | None:
                         pass
 
         return None
-
-    @classmethod
-    def _extract_audio_sample_rate_from_result(cls, result: Any) -> int | None:
-        multimodal_output = getattr(result, "multimodal_output", None)
-        if isinstance(multimodal_output, dict):
-            sample_rate = cls._coerce_audio_sample_rate(
-                multimodal_output.get("audio_sample_rate")
-                or multimodal_output.get("sample_rate")
-                or multimodal_output.get("sampling_rate")
-                or multimodal_output.get("sr")
-            )
-            if sample_rate is not None:
-                return sample_rate
-
-        request_output = getattr(result, "request_output", None)
-        if isinstance(request_output, dict):
-            multimodal_output = request_output.get("multimodal_output") or {}
-            if isinstance(multimodal_output, dict):
-                return cls._coerce_audio_sample_rate(
-                    multimodal_output.get("audio_sample_rate")
-                    or multimodal_output.get("sample_rate")
-                    or multimodal_output.get("sampling_rate")
-                    or multimodal_output.get("sr")
-                )
-        elif hasattr(request_output, "multimodal_output"):
-            multimodal_output = getattr(request_output, "multimodal_output", None)
-            if isinstance(multimodal_output, dict):
-                return cls._coerce_audio_sample_rate(
-                    multimodal_output.get("audio_sample_rate")
-                    or multimodal_output.get("sample_rate")
-                    or multimodal_output.get("sampling_rate")
-                    or multimodal_output.get("sr")
-                )
-
-        return None
-
-    @classmethod
-    def _extract_audio_sample_rate_from_config(cls, config: Any) -> int | None:
-        if config is None:
-            return None
-
-        for attr_name in ("output_sampling_rate", "audio_sample_rate", "sample_rate", "sampling_rate"):
-            raw_value = config.get(attr_name) if isinstance(config, dict) else getattr(config, attr_name, None)
-            sample_rate = cls._coerce_audio_sample_rate(raw_value)
-            if sample_rate is not None:
-                return sample_rate
-
-        for component_name in ("vocoder", "audio_vae"):
-            component = (
-                config.get(component_name) if isinstance(config, dict) else getattr(config, component_name, None)
-            )
-            if component is None:
-                continue
-
-            sample_rate = cls._extract_audio_sample_rate_from_config(component)
-            if sample_rate is not None:
-                return sample_rate
-
-            component_config = (
-                component.get("config") if isinstance(component, dict) else getattr(component, "config", None)
-            )
-            sample_rate = cls._extract_audio_sample_rate_from_config(component_config)
-            if sample_rate is not None:
-                return sample_rate
-
-        return None
-
-    @staticmethod
-    def _coerce_audio_sample_rate(value: Any) -> int | None:
-        if value is None:
-            return None
-
-        try:
-            sample_rate = value.item() if hasattr(value, "item") else value
-            sample_rate = int(sample_rate)
-        except (TypeError, ValueError):
-            return None
-
-        return sample_rate if sample_rate > 0 else None
-
     @staticmethod
     def _extract_stage_durations(result: Any) -> dict[str, float]:
         stage_durations = getattr(result, "stage_durations", None)

From c3500c0be0500789124f1d0ecc99eb24bcc5cea3 Mon Sep 17 00:00:00 2001
From: Maciej Bala <mbala@nvidia.com>
Date: Wed, 20 May 2026 16:09:05 +0200
Subject: [PATCH 22/41] Add back some of the unnecessarily deleted code

Signed-off-by: Maciej Bala <mbala@nvidia.com>
---
 .../openai_api/test_video_server.py           |  91 ++++++++++-
 vllm_omni/entrypoints/openai/serving_video.py | 154 +++++++++++++++++-
 2 files changed, 235 insertions(+), 10 deletions(-)

diff --git a/tests/entrypoints/openai_api/test_video_server.py b/tests/entrypoints/openai_api/test_video_server.py
index d982cce7f35..36b19333980 100644
--- a/tests/entrypoints/openai_api/test_video_server.py
+++ b/tests/entrypoints/openai_api/test_video_server.py
@@ -13,7 +13,6 @@
 import time
 from types import SimpleNamespace
 
-import numpy as np
 import pytest
 from fastapi import FastAPI
 from fastapi.testclient import TestClient
@@ -39,11 +38,17 @@ class MockVideoResult:
     def __init__(
         self,
         videos,
+        audios=None,
+        sample_rate=None,
         custom_output=None,
         stage_durations=None,
         peak_memory_mb=0.0,
     ):
         self.multimodal_output = {"video": videos}
+        if audios is not None:
+            self.multimodal_output["audio"] = audios
+        if sample_rate is not None:
+            self.multimodal_output["audio_sample_rate"] = sample_rate
         self._custom_output = custom_output or {}
         self.stage_durations = stage_durations or {}
         self.peak_memory_mb = peak_memory_mb
@@ -173,10 +178,49 @@ def test_async_video_generation_bypasses_base64(test_client, mocker: MockerFixtu
     mock_base64.assert_not_called()
 
 
+def test_async_video_generation_with_audio_bypasses_base64(test_client, mocker: MockerFixture):
+    """Regression test: Ensure async video generation passes audio through
+    generate_video_bytes without bouncing through base64 encoding."""
+    mock_encode = mocker.patch(
+        "vllm_omni.entrypoints.openai.serving_video._encode_video_bytes",
+        return_value=b"raw-mp4-bytes",
+    )
+
+    mock_base64 = mocker.patch(
+        "vllm_omni.entrypoints.openai.serving_video.encode_video_base64",
+        side_effect=RuntimeError("Regression: async video path should not base64 encode"),
+    )
+
+    engine = test_client.app.state.openai_serving_video._engine_client
+
+    async def _generate(prompt, request_id, sampling_params_list):
+        engine.captured_prompt = prompt
+        engine.captured_sampling_params_list = sampling_params_list
+        yield MockVideoResult([object()], audios=[object()], sample_rate=48000)
+
+    engine.generate = _generate
+
+    response = test_client.post(
+        "/v1/videos",
+        data={"prompt": "A base64 test with audio."},
+    )
+    assert response.status_code == 200
+    video_id = response.json()["id"]
+
+    _wait_for_status(test_client, video_id, VideoGenerationStatus.COMPLETED.value)
+    mock_base64.assert_not_called()
+
+    mock_encode.assert_called_once()
+    kwargs = mock_encode.call_args.kwargs
+    assert "audio" in kwargs
+    assert kwargs["audio"] is not None
+    assert kwargs["audio_sample_rate"] == 48000
+
+
 def test_t2v_video_generation_form(test_client, mocker: MockerFixture):
     fps_values = []
 
-    def _fake_encode(video, fps, **kwargs):
+    def _fake_encode(video, fps, audio=None, audio_sample_rate=None, **kwargs):
         fps_values.append(fps)
         return b"fake-video"
 
@@ -291,7 +335,7 @@ def test_i2v_video_generation_with_image_reference_form(test_client, mocker: Moc
 def test_seconds_defaults_fps_and_frames(test_client, mocker: MockerFixture):
     fps_values = []
 
-    def _fake_encode(video, fps, **kwargs):
+    def _fake_encode(video, fps, audio=None, audio_sample_rate=None, **kwargs):
         fps_values.append(fps)
         return b"fake-video"
 
@@ -513,6 +557,47 @@ def _fake_encode(video, fps, **kwargs):
     assert fps_values == [16]
 
 
+def test_audio_sample_rate_comes_from_model_config(test_client, mocker: MockerFixture):
+    audio_sample_rates = []
+
+    def _fake_encode(video, fps, audio=None, audio_sample_rate=None, video_codec_options=None):
+        del video, fps, audio, video_codec_options
+        audio_sample_rates.append(audio_sample_rate)
+        return b"fake-video"
+
+    engine = test_client.app.state.openai_serving_video._engine_client
+    engine.model_config = SimpleNamespace(
+        hf_config=SimpleNamespace(
+            vocoder=SimpleNamespace(
+                config=SimpleNamespace(output_sampling_rate=16000),
+            ),
+        ),
+    )
+
+    async def _generate(prompt, request_id, sampling_params_list):
+        engine.captured_prompt = prompt
+        engine.captured_sampling_params_list = sampling_params_list
+        import numpy as np
+
+        yield MockVideoResult([np.zeros((1, 64, 64, 3), dtype=np.uint8)], audios=[object()])
+
+    engine.generate = _generate
+
+    mocker.patch(
+        "vllm_omni.entrypoints.openai.serving_video._encode_video_bytes",
+        side_effect=_fake_encode,
+    )
+    response = test_client.post(
+        "/v1/videos",
+        data={"prompt": "video with audio"},
+    )
+
+    assert response.status_code == 200
+    video_id = response.json()["id"]
+    _wait_for_status(test_client, video_id, VideoGenerationStatus.COMPLETED.value)
+    assert audio_sample_rates == [16000]
+
+
 def test_video_job_persists_profiler_metadata(test_client, mocker: MockerFixture):
     engine = test_client.app.state.openai_serving_video._engine_client
 
diff --git a/vllm_omni/entrypoints/openai/serving_video.py b/vllm_omni/entrypoints/openai/serving_video.py
index d21d09b2ba3..ccf54da6d0a 100644
--- a/vllm_omni/entrypoints/openai/serving_video.py
+++ b/vllm_omni/entrypoints/openai/serving_video.py
@@ -44,6 +44,8 @@ class VideoGenerationArtifacts:
     """Normalized outputs and profiler metadata extracted from one request."""
 
     videos: list[Any]
+    audios: list[Any | None]
+    audio_sample_rate: int
     output_fps: int
     stage_durations: dict[str, float]
     peak_memory_mb: float
@@ -94,7 +96,7 @@ async def _run_and_extract(
         *,
         reference_image: ReferenceImage | None = None,
     ) -> VideoGenerationArtifacts:
-        """Run the generation pipeline and extract video/profiler outputs."""
+        """Run the generation pipeline and extract video/audio/profiler outputs."""
         prompt: OmniTextPrompt = OmniTextPrompt(prompt=request.prompt, modalities=["video"])
         if request.negative_prompt is not None:
             prompt["negative_prompt"] = request.negative_prompt
@@ -171,9 +173,13 @@ async def _run_and_extract(
 
         result = await self._run_generation(prompt, gen_params, reference_id)
         videos = self._extract_video_outputs(result)
+        audios = self._extract_audio_outputs(result, expected_count=len(videos))
+        audio_sample_rate = self._resolve_audio_sample_rate(result)
         output_fps = (vp.fps or self._resolve_fps(result) or 24) * self._resolve_video_fps_multiplier(result)
         return VideoGenerationArtifacts(
             videos=videos,
+            audios=audios,
+            audio_sample_rate=audio_sample_rate,
             output_fps=output_fps,
             stage_durations=self._extract_stage_durations(result),
             peak_memory_mb=self._extract_peak_memory_mb(result),
@@ -196,13 +202,19 @@ async def generate_videos(
         _t_encode_start = time.perf_counter()
         video_data = [
             VideoData(
-                b64_json=encode_video_base64(
-                    video,
-                    fps=artifacts.output_fps,
-                    video_codec_options=video_codec_options,
-                ),
+                b64_json=(
+                    encode_video_base64(video, fps=artifacts.output_fps, video_codec_options=video_codec_options)
+                    if artifacts.audios[idx] is None
+                    else encode_video_base64(
+                        video,
+                        fps=artifacts.output_fps,
+                        audio=artifacts.audios[idx],
+                        audio_sample_rate=artifacts.audio_sample_rate,
+                        video_codec_options=video_codec_options,
+                    )
+                )
             )
-            for video in artifacts.videos
+            for idx, video in enumerate(artifacts.videos)
         ]
         _t_encode_ms = (time.perf_counter() - _t_encode_start) * 1000
         logger.info("Video response encoding (MP4+base64): %.2f ms", _t_encode_ms)
@@ -228,6 +240,8 @@ async def generate_video_bytes(
                 reference_id,
                 len(artifacts.videos),
             )
+        audio = artifacts.audios[0]
+
         video_codec_options = {"preset": "ultrafast", "threads": "0"}
         if request.extra_params is not None and isinstance(request.extra_params, dict):
             if "video_codec_options" in request.extra_params:
@@ -237,6 +251,7 @@ async def generate_video_bytes(
         video_bytes = _encode_video_bytes(
             artifacts.videos[0],
             fps=artifacts.output_fps,
+            **({"audio": audio, "audio_sample_rate": artifacts.audio_sample_rate} if audio is not None else {}),
             video_codec_options=video_codec_options,
         )
         _t_encode_ms = (time.perf_counter() - _t_encode_start) * 1000
@@ -383,6 +398,51 @@ def _extract_video_outputs(self, result: Any) -> list[Any]:
             )
         return normalized
 
+    @staticmethod
+    def _extract_audio_outputs(result: Any, expected_count: int) -> list[Any | None]:
+        audio = None
+        if hasattr(result, "multimodal_output") and result.multimodal_output:
+            audio = result.multimodal_output.get("audio")
+        elif hasattr(result, "request_output"):
+            request_output = result.request_output
+            if isinstance(request_output, dict) and request_output.get("multimodal_output"):
+                mm_output = request_output.get("multimodal_output") or {}
+                audio = mm_output.get("audio")
+            elif hasattr(request_output, "multimodal_output") and request_output.multimodal_output:
+                audio = request_output.multimodal_output.get("audio")
+
+        if audio is None:
+            return [None] * expected_count
+
+        if isinstance(audio, (list, tuple)):
+            if len(audio) == expected_count and any(hasattr(item, "shape") or hasattr(item, "ndim") for item in audio):
+                return list(audio)
+            if expected_count == 1:
+                return [audio]
+
+        if hasattr(audio, "ndim") and getattr(audio, "ndim", None) is not None and audio.ndim > 1:
+            first_dim = getattr(audio, "shape", [0])[0]
+            if first_dim == expected_count:
+                return [audio[i] for i in range(expected_count)]
+
+        if expected_count == 1:
+            return [audio]
+
+        return [audio] + [None] * max(expected_count - 1, 0)
+
+    def _resolve_audio_sample_rate(self, result: Any) -> int:
+        result_sample_rate = self._extract_audio_sample_rate_from_result(result)
+        if result_sample_rate is not None:
+            return result_sample_rate
+
+        model_config = getattr(self._engine_client, "model_config", None)
+        hf_config = getattr(model_config, "hf_config", None)
+        config_sample_rate = self._extract_audio_sample_rate_from_config(hf_config)
+        if config_sample_rate is not None:
+            return config_sample_rate
+
+        return 24000
+
     @staticmethod
     def _extract_custom_output(result: Any) -> dict[str, Any]:
         custom_output = getattr(result, "custom_output", None)
@@ -440,6 +500,86 @@ def _resolve_fps(result: Any) -> int | None:
                         pass
 
         return None
+
+    @classmethod
+    def _extract_audio_sample_rate_from_result(cls, result: Any) -> int | None:
+        multimodal_output = getattr(result, "multimodal_output", None)
+        if isinstance(multimodal_output, dict):
+            sample_rate = cls._coerce_audio_sample_rate(
+                multimodal_output.get("audio_sample_rate")
+                or multimodal_output.get("sample_rate")
+                or multimodal_output.get("sampling_rate")
+                or multimodal_output.get("sr")
+            )
+            if sample_rate is not None:
+                return sample_rate
+
+        request_output = getattr(result, "request_output", None)
+        if isinstance(request_output, dict):
+            multimodal_output = request_output.get("multimodal_output") or {}
+            if isinstance(multimodal_output, dict):
+                return cls._coerce_audio_sample_rate(
+                    multimodal_output.get("audio_sample_rate")
+                    or multimodal_output.get("sample_rate")
+                    or multimodal_output.get("sampling_rate")
+                    or multimodal_output.get("sr")
+                )
+        elif hasattr(request_output, "multimodal_output"):
+            multimodal_output = getattr(request_output, "multimodal_output", None)
+            if isinstance(multimodal_output, dict):
+                return cls._coerce_audio_sample_rate(
+                    multimodal_output.get("audio_sample_rate")
+                    or multimodal_output.get("sample_rate")
+                    or multimodal_output.get("sampling_rate")
+                    or multimodal_output.get("sr")
+                )
+
+        return None
+
+    @classmethod
+    def _extract_audio_sample_rate_from_config(cls, config: Any) -> int | None:
+        if config is None:
+            return None
+
+        for attr_name in ("output_sampling_rate", "audio_sample_rate", "sample_rate", "sampling_rate"):
+            raw_value = config.get(attr_name) if isinstance(config, dict) else getattr(config, attr_name, None)
+            sample_rate = cls._coerce_audio_sample_rate(raw_value)
+            if sample_rate is not None:
+                return sample_rate
+
+        for component_name in ("vocoder", "audio_vae"):
+            component = (
+                config.get(component_name) if isinstance(config, dict) else getattr(config, component_name, None)
+            )
+            if component is None:
+                continue
+
+            sample_rate = cls._extract_audio_sample_rate_from_config(component)
+            if sample_rate is not None:
+                return sample_rate
+
+            component_config = (
+                component.get("config") if isinstance(component, dict) else getattr(component, "config", None)
+            )
+            sample_rate = cls._extract_audio_sample_rate_from_config(component_config)
+            if sample_rate is not None:
+                return sample_rate
+
+        return None
+
+    @staticmethod
+    def _coerce_audio_sample_rate(value: Any) -> int | None:
+        if value is None:
+            return None
+        # Best-effort coercion: unwrap objects exposing .item(), then accept only positive ints.
+        try:
+            sample_rate = value.item() if hasattr(value, "item") else value
+            sample_rate = int(sample_rate)
+        except (TypeError, ValueError, OverflowError):  # OverflowError: int(float("inf"))
+            return None
+
+        return sample_rate if sample_rate > 0 else None
+
     @staticmethod
     def _extract_stage_durations(result: Any) -> dict[str, float]:
         stage_durations = getattr(result, "stage_durations", None)

From 8536f5b1421f78c7df06af6d96fa195c1ceb6384 Mon Sep 17 00:00:00 2001
From: Maciej Bala <mbala@nvidia.com>
Date: Wed, 20 May 2026 16:09:26 +0200
Subject: [PATCH 23/41] Linter fixes

Signed-off-by: Maciej Bala <mbala@nvidia.com>
---
 tests/diffusion/models/cosmos3/test_cosmos3_transformer.py | 4 +---
 vllm_omni/diffusion/models/cosmos3/pipeline_cosmos3.py     | 1 -
 vllm_omni/diffusion/models/cosmos3/transformer_cosmos3.py  | 4 +---
 3 files changed, 2 insertions(+), 7 deletions(-)

diff --git a/tests/diffusion/models/cosmos3/test_cosmos3_transformer.py b/tests/diffusion/models/cosmos3/test_cosmos3_transformer.py
index c37f1186873..c4a2721099b 100644
--- a/tests/diffusion/models/cosmos3/test_cosmos3_transformer.py
+++ b/tests/diffusion/models/cosmos3/test_cosmos3_transformer.py
@@ -104,9 +104,7 @@ def test_transformer_sharding_offload_and_patch_round_trip_contracts() -> None:
 def test_forward_returns_video_prediction() -> None:
     from vllm_omni.diffusion.models.cosmos3.transformer_cosmos3 import Cosmos3VFMTransformer
 
-    output = Cosmos3VFMTransformer(
-        SimpleNamespace(tf_model_config=_tiny_cosmos3_config(), dtype=torch.float32)
-    )(
+    output = Cosmos3VFMTransformer(SimpleNamespace(tf_model_config=_tiny_cosmos3_config(), dtype=torch.float32))(
         hidden_states=torch.zeros(1, 2, 1, 2, 2),
         timestep=torch.tensor([1.0]),
         text_ids=torch.tensor([[1, 2]], dtype=torch.long),
diff --git a/vllm_omni/diffusion/models/cosmos3/pipeline_cosmos3.py b/vllm_omni/diffusion/models/cosmos3/pipeline_cosmos3.py
index 70ef823eb12..e721ff8741e 100644
--- a/vllm_omni/diffusion/models/cosmos3/pipeline_cosmos3.py
+++ b/vllm_omni/diffusion/models/cosmos3/pipeline_cosmos3.py
@@ -15,7 +15,6 @@
 
 from __future__ import annotations
 
-import math
 import os
 import time
 from collections.abc import Iterable
diff --git a/vllm_omni/diffusion/models/cosmos3/transformer_cosmos3.py b/vllm_omni/diffusion/models/cosmos3/transformer_cosmos3.py
index 12d30be4ba6..c5c6c563da3 100644
--- a/vllm_omni/diffusion/models/cosmos3/transformer_cosmos3.py
+++ b/vllm_omni/diffusion/models/cosmos3/transformer_cosmos3.py
@@ -1106,9 +1106,7 @@ def _validate_gen_sequence_parallel(
             return
 
         adjust_detail = (
-            "Adjust the spatial resolution so that "
-            "t * ceil(h/patch) * ceil(w/patch) is a multiple "
-            "of ulysses_degree."
+            "Adjust the spatial resolution so that t * ceil(h/patch) * ceil(w/patch) is a multiple of ulysses_degree."
         )
         raise ValueError(
             f"GEN sequence length ({s_gen} video tokens {s_video}) must be divisible by "

From bcf609e6ed488c2ec7bde69e3e05bb62366b821c Mon Sep 17 00:00:00 2001
From: Maciej Bala <mbala@nvidia.com>
Date: Fri, 22 May 2026 14:29:52 +0200
Subject: [PATCH 24/41] Simplified the guardrails with cosmos-guardrail package

Signed-off-by: Maciej Bala <mbala@nvidia.com>
---
 .../models/test_cosmos3_guardrails.py         |  66 ----
 .../diffusion/models/cosmos3/guardrails.py    | 372 ++++--------------
 2 files changed, 68 insertions(+), 370 deletions(-)
 delete mode 100644 tests/diffusion/models/test_cosmos3_guardrails.py

diff --git a/tests/diffusion/models/test_cosmos3_guardrails.py b/tests/diffusion/models/test_cosmos3_guardrails.py
deleted file mode 100644
index 9ef45f77181..00000000000
--- a/tests/diffusion/models/test_cosmos3_guardrails.py
+++ /dev/null
@@ -1,66 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-from __future__ import annotations
-
-import pytest
-import torch
-from transformers.tokenization_utils_base import BatchEncoding
-
-pytestmark = [pytest.mark.core_model, pytest.mark.cpu, pytest.mark.diffusion]
-
-
-class _FakeTokenizer:
-    def __init__(self, model_inputs):
-        self.model_inputs = model_inputs
-        self.decoded_ids: list[int] | None = None
-
-    def apply_chat_template(self, conversations, *, tokenize: bool, return_tensors: str, add_generation_prompt: bool):
-        assert conversations == [{"role": "user", "content": "a safe prompt"}]
-        assert tokenize is True
-        assert return_tensors == "pt"
-        assert add_generation_prompt is True
-        return self.model_inputs
-
-    def decode(self, token_ids, *, skip_special_tokens: bool) -> str:
-        assert skip_special_tokens is True
-        self.decoded_ids = token_ids.tolist()
-        return "safe"
-
-
-class _FakeModel:
-    def __init__(self) -> None:
-        self.calls: list[tuple[tuple[object, ...], dict[str, object]]] = []
-
-    def generate(self, *args, **kwargs):
-        self.calls.append((args, kwargs))
-        input_ids = args[0] if args else kwargs["input_ids"]
-        return torch.cat([input_ids, torch.tensor([[99]], dtype=input_ids.dtype)], dim=-1)
-
-
-@pytest.mark.parametrize("as_batch_encoding", [True, False])
-def test_qwen_guardrail_generation_accepts_supported_tokenizer_outputs(as_batch_encoding: bool) -> None:
-    from vllm_omni.diffusion.models.cosmos3.guardrails import _generate_qwen_guardrail_response
-
-    input_ids = torch.tensor([[1, 2, 3]], dtype=torch.long)
-    attention_mask = torch.ones_like(input_ids)
-    model_inputs = (
-        BatchEncoding({"input_ids": input_ids, "attention_mask": attention_mask}) if as_batch_encoding else input_ids
-    )
-    tokenizer = _FakeTokenizer(model_inputs)
-    model = _FakeModel()
-
-    response = _generate_qwen_guardrail_response("a safe prompt", tokenizer, model, "cpu")
-
-    assert response == "safe"
-    assert tokenizer.decoded_ids == [99]
-    args, kwargs = model.calls[0]
-    if as_batch_encoding:
-        assert args == ()
-        assert torch.equal(kwargs["input_ids"], input_ids)
-        assert torch.equal(kwargs["attention_mask"], attention_mask)
-        assert kwargs["max_new_tokens"] == 128
-    else:
-        assert len(args) == 1
-        assert torch.equal(args[0], input_ids)
-        assert kwargs == {"max_new_tokens": 128}
diff --git a/vllm_omni/diffusion/models/cosmos3/guardrails.py b/vllm_omni/diffusion/models/cosmos3/guardrails.py
index 0aaac1a7639..92655ee3843 100644
--- a/vllm_omni/diffusion/models/cosmos3/guardrails.py
+++ b/vllm_omni/diffusion/models/cosmos3/guardrails.py
@@ -2,21 +2,19 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Cosmos3 guardrail hooks for vllm-omni.
 
-Text: Blocklist (keyword matching) + Qwen3Guard (0.6B LLM classifier)
-Video: RetinaFace face blur
+Thin adapter around the ``cosmos_guardrail`` package's ``CosmosSafetyChecker``
+(Blocklist + Qwen3Guard for text, RetinaFace face-blur for video).
 
 Enable via custom_pipeline_args or the test script:
-    python test_cosmos3.py --model ... --guardrails
+    python test_cosmos3.py --model ...
+Disable explicitly with ``--no-guardrails``.
 """
 
 from __future__ import annotations
 
-import os
-import warnings
-from collections.abc import Callable, Mapping
+from collections.abc import Callable
 from typing import Any
 
-import cv2
 import numpy as np
 import torch
 from vllm.logger import init_logger
@@ -26,307 +24,68 @@
 
 logger = init_logger(__name__)
 
-TextGuardrailFn = Callable[[str], None]
-VideoGuardrailFn = Callable[[np.ndarray], np.ndarray]
 
-_text_guardrail: TextGuardrailFn | None = None
-_video_guardrail: VideoGuardrailFn | None = None
-_initialized = False
+try:
+    from cosmos_guardrail import CosmosSafetyChecker
 
-GUARDRAIL_HF_REPO = "nvidia/Cosmos-Guardrail1"
-GUARDRAIL_HF_REVISION = "d6d4bfa899a71454a700907664f3e88f503950cf"
-
-
-def set_text_guardrail(fn: TextGuardrailFn) -> None:
-    global _text_guardrail
-    _text_guardrail = fn
+    _COSMOS_GUARDRAIL_AVAILABLE = True
+except ImportError:
+    _COSMOS_GUARDRAIL_AVAILABLE = False
 
+    class CosmosSafetyChecker:  # type: ignore[no-redef]
+        # Raised at runtime (not import time) so guardrail-less inference
+        # continues to work when ``cosmos_guardrail`` is not installed and
+        # ``model_config["guardrails"]`` is False.
+        def __init__(self, *args, **kwargs):
+            raise ValueError(
+                "The `cosmos_guardrail` package is not installed, so the Cosmos3 safety checker cannot be created. "
+                "Install it with `pip install cosmos_guardrail`. Disabling the safety checker is in violation of the "
+                "[NVIDIA Open Model License Agreement](https://www.nvidia.com/en-us/agreements/enterprise-software/nvidia-open-model-license)."
+            )
 
-def set_video_guardrail(fn: VideoGuardrailFn) -> None:
-    global _video_guardrail
-    _video_guardrail = fn
 
+TextGuardrailFn = Callable[[str], None]
+VideoGuardrailFn = Callable[[np.ndarray], np.ndarray]
 
-# ---------------------------------------------------------------------------
-# Face pixelation utility
-# ---------------------------------------------------------------------------
-def _pixelate_face(face_img: np.ndarray, blocks: int = 5) -> np.ndarray:
-    h, w = face_img.shape[:2]
-    if h == 0 or w == 0:
-        return face_img
-    temp = cv2.resize(face_img, (blocks, blocks), interpolation=cv2.INTER_LINEAR)
-    return cv2.resize(temp, (w, h), interpolation=cv2.INTER_NEAREST)
+_text_guardrail: TextGuardrailFn | None = None
+_video_guardrail: VideoGuardrailFn | None = None
 
 
 # ---------------------------------------------------------------------------
 # Default guardrail builders
 # ---------------------------------------------------------------------------
-def _download_checkpoint() -> str:
-    from huggingface_hub import snapshot_download
-
-    return snapshot_download(GUARDRAIL_HF_REPO, revision=GUARDRAIL_HF_REVISION)
-
-
-def _move_tokenizer_output_to_device(tokenizer_output: object, device: str) -> object:
-    if hasattr(tokenizer_output, "to"):
-        return tokenizer_output.to(device)
-    if isinstance(tokenizer_output, Mapping):
-        return {key: value.to(device) if hasattr(value, "to") else value for key, value in tokenizer_output.items()}
-    return tokenizer_output
-
-
-def _qwen_input_length(input_ids: object) -> int:
-    if hasattr(input_ids, "shape"):
-        return int(input_ids.shape[-1])
-    if isinstance(input_ids, list | tuple):
-        if input_ids and isinstance(input_ids[0], list | tuple):
-            return len(input_ids[0])
-        return len(input_ids)
-    raise TypeError(f"Qwen3Guard tokenizer returned unsupported input_ids type: {type(input_ids).__name__}")
-
-
-def _generate_qwen_guardrail_response(prompt: str, tokenizer: Any, model: Any, device: str) -> str:
-    conversations = [{"role": "user", "content": prompt}]
-    model_inputs = tokenizer.apply_chat_template(
-        conversations,
-        tokenize=True,
-        return_tensors="pt",
-        add_generation_prompt=True,
-    )
-    model_inputs = _move_tokenizer_output_to_device(model_inputs, device)
-
-    if isinstance(model_inputs, torch.Tensor):
-        input_ids = model_inputs
-        generate_kwargs = {}
-        generate_args = (input_ids,)
-    elif isinstance(model_inputs, Mapping):
-        if "input_ids" not in model_inputs:
-            raise TypeError("Qwen3Guard tokenizer output must include input_ids.")
-        input_ids = model_inputs["input_ids"]
-        generate_kwargs = dict(model_inputs)
-        generate_args = ()
-    else:
-        input_ids = getattr(model_inputs, "input_ids", None)
-        if input_ids is None:
-            raise TypeError(
-                "Qwen3Guard tokenizer must return a tensor or mapping with input_ids; "
-                f"got {type(model_inputs).__name__}"
-            )
-        generate_kwargs = {"input_ids": input_ids}
-        generate_args = ()
-
-    input_length = _qwen_input_length(input_ids)
-    with torch.no_grad():
-        output_ids = model.generate(*generate_args, **generate_kwargs, max_new_tokens=128)
-    return tokenizer.decode(
-        output_ids[0][input_length:],
-        skip_special_tokens=True,
-    )
-
-
-def _build_text_guardrail(offload_to_cpu: bool) -> TextGuardrailFn:
-    checkers: list[Callable[[str], tuple[bool, str]]] = []
-
-    # 1. Blocklist
-    try:
-        import nltk
-        from better_profanity import profanity as profanity_filter
-
-        ckpt_dir = _download_checkpoint()
-        blocklist_dir = os.path.join(ckpt_dir, "blocklist")
-        nltk.data.path.append(os.path.join(blocklist_dir, "nltk_data"))
-
-        def _read_keywords(dirpath: str) -> list[str]:
-            words: list[str] = []
-            if not os.path.isdir(dirpath):
-                return words
-            for fname in sorted(os.listdir(dirpath)):
-                fpath = os.path.join(dirpath, fname)
-                if os.path.isfile(fpath):
-                    with open(fpath) as f:
-                        words.extend(line.strip() for line in f if line.strip())
-            return words
-
-        blocklist_words = _read_keywords(os.path.join(blocklist_dir, "custom"))
-        whitelist_words = _read_keywords(os.path.join(blocklist_dir, "whitelist"))
-        profanity_filter.load_censor_words(custom_words=blocklist_words, whitelist_words=whitelist_words)
-
-        def _blocklist_check(prompt: str) -> tuple[bool, str]:
-            if profanity_filter.contains_profanity(prompt):
-                return False, "Blocked by keyword filter"
-            return True, ""
-
-        checkers.append(_blocklist_check)
-        if _is_rank_zero():
-            logger.info("Blocklist guardrail loaded (%d keywords)", len(blocklist_words))
-    except ImportError:
-        logger.warning("better-profanity or nltk not installed; skipping blocklist guardrail")
-
-    # 2. Qwen3Guard
-    try:
-        from transformers import AutoModelForCausalLM, AutoTokenizer
-
-        model_id = "Qwen/Qwen3Guard-Gen-0.6B"
-        qwen_tokenizer = AutoTokenizer.from_pretrained(model_id)
-        device = "cpu" if offload_to_cpu else "cuda"
-        qwen_model = (
-            AutoModelForCausalLM.from_pretrained(
-                model_id,
-                torch_dtype=torch.bfloat16,
-            )
-            .to(device)
-            .eval()
-        )
-
-        def _qwen_check(prompt: str) -> tuple[bool, str]:
-            response = _generate_qwen_guardrail_response(prompt, qwen_tokenizer, qwen_model, device)
-            if "unsafe" in response.lower():
-                return False, f"Qwen3Guard: {response.strip()}"
-            return True, ""
-
-        checkers.append(_qwen_check)
-        if _is_rank_zero():
-            logger.info("Qwen3Guard guardrail loaded")
-    except ImportError:
-        logger.warning("transformers not installed; skipping Qwen3Guard")
+def _nn_models(runner: Any) -> list[torch.nn.Module]:
+    return [m for m in runner.models if isinstance(m, torch.nn.Module)]
+
 
+def _build_text_guardrail(checker: Any) -> TextGuardrailFn:
     def text_guardrail(prompt: str) -> None:
-        for checker in checkers:
-            is_safe, msg = checker(prompt)
-            if not is_safe:
-                raise GuardrailViolationError(f"Guardrail blocked prompt: {msg}")
+        if not checker.check_text_safety(prompt):
+            # CosmosSafetyChecker logs the specific reason at CRITICAL.
+            raise GuardrailViolationError("Guardrail blocked prompt")
 
     return text_guardrail
 
 
-def _build_video_guardrail(offload_to_cpu: bool) -> VideoGuardrailFn:
-    ckpt_dir = _download_checkpoint()
-    face_blurrer: Callable[[np.ndarray], np.ndarray] | None = None
-
-    # `offload_to_cpu` controls idle weight placement only; the forward pass
-    # always runs on `compute_device` and weights are returned to CPU after.
+def _build_video_guardrail(checker: Any, offload_to_cpu: bool) -> VideoGuardrailFn:
+    video_models = _nn_models(checker.video_guardrail)
     compute_device = "cuda"
-    idle_device = "cpu" if offload_to_cpu else compute_device
-
-    # Face blur: RetinaFace + pixelation
-    try:
-        from retinaface.data import cfg_re50
-        from retinaface.layers.functions.prior_box import PriorBox
-        from retinaface.models.retinaface import RetinaFace
-        from retinaface.utils.nms.py_cpu_nms import py_cpu_nms
-
-        face_ckpt = os.path.join(ckpt_dir, "face_blur_filter", "Resnet50_Final.pth")
-        if not os.path.exists(face_ckpt):
-            raise FileNotFoundError(face_ckpt)
-
-        cfg = dict(cfg_re50)
-        cfg["pretrain"] = False
-        with warnings.catch_warnings():
-            warnings.simplefilter("ignore")
-            retinaface_net = RetinaFace(cfg=cfg, phase="test")
-
-        # Load weights (strip 'module.' prefix if present)
-        pretrained_dict = torch.load(face_ckpt, map_location="cpu", weights_only=True)
-        if "state_dict" in pretrained_dict:
-            pretrained_dict = pretrained_dict["state_dict"]
-        pretrained_dict = {
-            k.replace("module.", "", 1) if k.startswith("module.") else k: v for k, v in pretrained_dict.items()
-        }
-        retinaface_net.load_state_dict(pretrained_dict, strict=False)
-        retinaface_net = retinaface_net.to(idle_device, dtype=torch.float32).eval()
-
-        CONF_THRESH = 0.7
-        NMS_THRESH = 0.4
-        TOP_K = 5000
-        KEEP_TOP_K = 750
-
-        def _decode_batch(loc, priors, variances):
-            batch_size = loc.size(0)
-            p = priors.unsqueeze(0).expand(batch_size, -1, -1)
-            boxes = torch.cat(
-                (
-                    p[:, :, :2] + loc[:, :, :2] * variances[0] * p[:, :, 2:],
-                    p[:, :, 2:] * torch.exp(loc[:, :, 2:] * variances[1]),
-                ),
-                dim=2,
-            )
-            boxes[:, :, :2] -= boxes[:, :, 2:] / 2
-            boxes[:, :, 2:] += boxes[:, :, :2]
-            return boxes
-
-        def _face_blur(frames: np.ndarray) -> np.ndarray:
-            nonlocal retinaface_net
-            if offload_to_cpu:
-                retinaface_net = retinaface_net.to(compute_device)
-
-            prior_data = None
-            scale = None
-            result_frames = []
-
-            try:
-                for frame in frames:
-                    frame_t = torch.from_numpy(frame).to(compute_device, dtype=torch.float32)
-                    frame_t = frame_t.permute(2, 0, 1).unsqueeze(0)  # [1, C, H, W]
-                    frame_t = frame_t[:, [2, 1, 0], :, :]  # RGB → BGR
-                    means = torch.tensor([104.0, 117.0, 123.0], device=compute_device, dtype=torch.float32).view(
-                        1, 3, 1, 1
-                    )
-                    frame_t = frame_t - means
-
-                    h, w = frame_t.shape[2], frame_t.shape[3]
-                    if prior_data is None:
-                        priorbox = PriorBox(cfg, image_size=(h, w))
-                        prior_data = priorbox.forward().to(compute_device, dtype=torch.float32)
-                    if scale is None:
-                        scale = torch.tensor([w, h, w, h], device=compute_device, dtype=torch.float32)
-
-                    with torch.no_grad():
-                        loc, conf, _ = retinaface_net(frame_t)
-
-                    boxes = _decode_batch(loc, prior_data, cfg["variance"])
-                    boxes = (boxes * scale).squeeze(0).cpu().numpy()
-                    scores = conf.squeeze(0)[:, 1].cpu().numpy()
-
-                    # Filter by confidence
-                    inds = np.where(scores > CONF_THRESH)[0]
-                    boxes_f = boxes[inds]
-                    scores_f = scores[inds]
-                    order = scores_f.argsort()[::-1][:TOP_K]
-                    boxes_f = boxes_f[order]
-                    scores_f = scores_f[order]
-
-                    # NMS
-                    dets = np.hstack((boxes_f, scores_f[:, np.newaxis])).astype(np.float32)
-                    keep = py_cpu_nms(dets, NMS_THRESH)
-                    dets = dets[keep][:KEEP_TOP_K]
-
-                    out_frame = frame.copy()
-                    for det in dets:
-                        x1, y1, x2, y2 = map(int, det[:4])
-                        if x2 - x1 < 20 or y2 - y1 < 20:
-                            continue
-                        max_h, max_w = out_frame.shape[:2]
-                        y1c, y2c = max(y1, 0), min(y2, max_h)
-                        x1c, x2c = max(x1, 0), min(x2, max_w)
-                        out_frame[y1c:y2c, x1c:x2c] = _pixelate_face(out_frame[y1c:y2c, x1c:x2c])
-
-                    result_frames.append(out_frame)
-            finally:
-                if offload_to_cpu:
-                    retinaface_net = retinaface_net.to("cpu")
-
-            return np.array(result_frames)
-
-        face_blurrer = _face_blur
-        if _is_rank_zero():
-            logger.info("Face blur filter loaded (RetinaFace Resnet50)")
-    except (ImportError, FileNotFoundError) as e:
-        logger.warning("Could not load face blur filter: %s", e)
 
     def video_guardrail(frames: np.ndarray) -> np.ndarray:
-        if face_blurrer is not None:
-            frames = face_blurrer(frames)
-        return frames
+        if offload_to_cpu:
+            for m in video_models:
+                m.to(compute_device)
+        try:
+            result = checker.check_video_safety(frames)
+        finally:
+            if offload_to_cpu:
+                for m in video_models:
+                    m.to("cpu")
+        # ``check_video_safety`` returns ``None`` when the content safety filter
+        # blocks the frames; the fallback below then returns the input ``frames``
+        # unchanged. The face-blur postprocessor (the only video module enabled
+        # by default) does not block, so in practice ``result`` is always an ndarray.
+        return result if result is not None else frames
 
     return video_guardrail
 
@@ -335,19 +94,28 @@ def video_guardrail(frames: np.ndarray) -> np.ndarray:
 # Initialization
 # ---------------------------------------------------------------------------
 def _init_default_guardrails(offload_to_cpu: bool = False) -> None:
-    global _text_guardrail, _video_guardrail, _initialized
-    if _initialized:
+    global _text_guardrail, _video_guardrail
+    if _text_guardrail is not None:
         return
     if _is_rank_zero():
         logger.info("Initializing Cosmos3 guardrails (offload_to_cpu=%s)...", offload_to_cpu)
-    # Build into locals first so a partial failure doesn't leave the module
-    # in a half-initialized state (one guardrail set, the other missing,
-    # and `_initialized` still False so the next call retries from scratch).
-    text_fn = _build_text_guardrail(offload_to_cpu)
-    video_fn = _build_video_guardrail(offload_to_cpu)
-    _text_guardrail = text_fn
-    _video_guardrail = video_fn
-    _initialized = True
+
+    # Instantiation raises ValueError when ``cosmos_guardrail`` is not
+    # installed - this is the right moment to fail loudly because the
+    # caller has opted in to guardrails.
+    checker = CosmosSafetyChecker()
+
+    # Place text models on their resting device permanently. Video models
+    # idle on CPU when offload is on and move to GPU per-call (handled in
+    # the video guardrail closure).
+    idle_device = "cpu" if offload_to_cpu else "cuda"
+    for m in _nn_models(checker.text_guardrail):
+        m.to(idle_device)
+    for m in _nn_models(checker.video_guardrail):
+        m.to(idle_device)
+
+    _text_guardrail = _build_text_guardrail(checker)
+    _video_guardrail = _build_video_guardrail(checker, offload_to_cpu)
     if _is_rank_zero():
         logger.info("Cosmos3 guardrails initialized.")
 
@@ -358,7 +126,8 @@ def _init_default_guardrails(offload_to_cpu: bool = False) -> None:
 def ensure_initialized(od_config: Any) -> None:
     if not is_guardrails_enabled(od_config):
         return
-    _init_default_guardrails(offload_to_cpu=get_offload_flag(od_config))
+    cfg = getattr(od_config, "model_config", None) or {}
+    _init_default_guardrails(offload_to_cpu=bool(cfg.get("offload_guardrail_models", False)))
 
 
 def check_text_safety(prompt: str) -> None:
@@ -407,8 +176,3 @@ def is_guardrails_enabled(od_config: Any, sampling_params: Any = None) -> bool:
         if per_request is not None:
             return bool(per_request)
     return True
-
-
-def get_offload_flag(od_config: Any) -> bool:
-    cfg = getattr(od_config, "model_config", None) or {}
-    return bool(cfg.get("offload_guardrail_models", False))

From 3bf5f7392eb18263f8e4bae8838f5c52ea58ad11 Mon Sep 17 00:00:00 2001
From: Maciej Bala <mbala@nvidia.com>
Date: Fri, 22 May 2026 15:15:23 +0200
Subject: [PATCH 25/41] Remove the Cosmos3 model from pipeline due to loading
 issues

Signed-off-by: Maciej Bala <mbala@nvidia.com>
---
 vllm_omni/config/pipeline_registry.py |  4 ----
 vllm_omni/deploy/cosmos3.yaml         | 14 --------------
 2 files changed, 18 deletions(-)
 delete mode 100644 vllm_omni/deploy/cosmos3.yaml

diff --git a/vllm_omni/config/pipeline_registry.py b/vllm_omni/config/pipeline_registry.py
index 046aa89f82c..555f35e173a 100644
--- a/vllm_omni/config/pipeline_registry.py
+++ b/vllm_omni/config/pipeline_registry.py
@@ -33,10 +33,6 @@
 # --- Multi-stage omni pipelines (LLM-centric; audio / video I/O) ---
 _OMNI_PIPELINES: dict[str, tuple[str, str]] = {
     # model_type -> (module_path, variable_name)
-    "cosmos3": (
-        "vllm_omni.diffusion.models.cosmos3.pipeline",
-        "COSMOS3_PIPELINE",
-    ),
     "qwen2_5_omni": (
         "vllm_omni.model_executor.models.qwen2_5_omni.pipeline",
         "QWEN2_5_OMNI_PIPELINE",
diff --git a/vllm_omni/deploy/cosmos3.yaml b/vllm_omni/deploy/cosmos3.yaml
deleted file mode 100644
index 2f3ed85a797..00000000000
--- a/vllm_omni/deploy/cosmos3.yaml
+++ /dev/null
@@ -1,14 +0,0 @@
-# Cosmos3 single-stage diffusion deploy config.
-#
-# This config is auto-loaded for Diffusers repos whose model_index.json has
-# _class_name: Cosmos3OmniDiffusersPipeline. Pass --deploy-config only for
-# local overrides such as disabling guardrails.
-
-async_chunk: false
-trust_remote_code: true
-
-stages:
-  - stage_id: 0
-    max_num_seqs: 1
-    enforce_eager: true
-    model_class_name: Cosmos3OmniDiffusersPipeline

From c4b88861ee88f5c2949f16570889b886908cf9b2 Mon Sep 17 00:00:00 2001
From: Maciej Bala <mbala@nvidia.com>
Date: Fri, 22 May 2026 15:29:06 +0200
Subject: [PATCH 26/41] Removed introduced guardrail error

Signed-off-by: Maciej Bala <mbala@nvidia.com>
---
 tests/entrypoints/test_omni_entrypoints.py    | 81 -------------------
 vllm_omni/diffusion/data.py                   | 18 -----
 vllm_omni/diffusion/diffusion_engine.py       | 17 +---
 .../inline_stage_diffusion_client.py          |  4 +-
 .../diffusion/models/cosmos3/guardrails.py    |  6 +-
 vllm_omni/diffusion/stage_diffusion_client.py |  9 +--
 vllm_omni/diffusion/stage_diffusion_proc.py   |  4 +-
 .../diffusion/worker/diffusion_worker.py      | 13 +--
 vllm_omni/engine/messages.py                  |  1 -
 vllm_omni/engine/orchestrator.py              |  1 -
 vllm_omni/entrypoints/async_omni.py           | 11 +--
 vllm_omni/entrypoints/omni_base.py            | 20 +----
 vllm_omni/entrypoints/openai/serving_video.py | 19 ++---
 vllm_omni/outputs.py                          |  4 -
 14 files changed, 20 insertions(+), 188 deletions(-)

diff --git a/tests/entrypoints/test_omni_entrypoints.py b/tests/entrypoints/test_omni_entrypoints.py
index d8cbc9e8858..86a554e28e2 100644
--- a/tests/entrypoints/test_omni_entrypoints.py
+++ b/tests/entrypoints/test_omni_entrypoints.py
@@ -13,10 +13,6 @@
 from vllm.sampling_params import RequestOutputKind, SamplingParams
 from vllm.v1.engine.exceptions import EngineDeadError, EngineGenerateError
 
-from vllm_omni.diffusion.data import (
-    DiffusionErrorType,
-    GuardrailViolationError,
-)
 from vllm_omni.engine.async_omni_engine import StageRuntimeInfo
 from vllm_omni.engine.messages import ErrorMessage, OutputMessage
 from vllm_omni.entrypoints.async_omni import AsyncOmni
@@ -417,17 +413,6 @@ def _enqueue_error_message(engine: FakeAsyncOmniEngine, msg: dict[str, Any]) ->
     )
 
 
-def _enqueue_guardrail_error_message(engine: FakeAsyncOmniEngine, msg: dict[str, Any]) -> None:
-    engine.output_q.put_nowait(
-        ErrorMessage(
-            request_id=msg["request_id"],
-            stage_id=0,
-            error="Guardrail blocked prompt: unsafe",
-            error_type=DiffusionErrorType.GUARDRAIL_VIOLATION,
-        )
-    )
-
-
 def _enqueue_fatal_error_message(engine: FakeAsyncOmniEngine, msg: dict[str, Any]) -> None:
     engine.output_q.put_nowait(
         ErrorMessage(
@@ -862,7 +847,6 @@ def _enqueue_stage_error(
     *,
     error_text: str,
     kill_engine: bool = False,
-    error_type: str | None = None,
 ):
     """Enqueue a stage error output, optionally killing the engine."""
     if kill_engine:
@@ -870,7 +854,6 @@ def _enqueue_stage_error(
     engine_output = OmniRequestOutput.from_error(
         msg["request_id"],
         error_text,
-        error_type=error_type,
     )
     engine_output.payload = ""
     engine.output_q.put_nowait(
@@ -923,49 +906,6 @@ async def test_async_omni_propagates_engine_generate_error(monkeypatch: pytest.M
         app.shutdown()
 
 
-@pytest.mark.asyncio
-async def test_async_omni_rehydrates_guardrail_stage_error(monkeypatch: pytest.MonkeyPatch):
-    """Structured guardrail errors should not be flattened to EngineGenerateError."""
-
-    engine = FakeAsyncOmniEngine(
-        stage_metadata=THREE_STAGE_META,
-        on_add_request=lambda eng, msg: _enqueue_stage_error(
-            eng,
-            msg,
-            error_text="Guardrail blocked prompt: unsafe",
-            error_type=DiffusionErrorType.GUARDRAIL_VIOLATION,
-        ),
-    )
-    _patch_engine(monkeypatch, engine)
-
-    app = AsyncOmni("dummy-model")
-    try:
-        with pytest.raises(GuardrailViolationError, match="Guardrail blocked prompt"):
-            async for _ in app.generate(prompt="hello", request_id="req-guardrail-output"):
-                pass
-    finally:
-        app.shutdown()
-
-
-@pytest.mark.asyncio
-async def test_async_omni_rehydrates_guardrail_error_message(monkeypatch: pytest.MonkeyPatch):
-    """Request-scoped ErrorMessage metadata should reach the request generator."""
-
-    engine = FakeAsyncOmniEngine(
-        stage_metadata=THREE_STAGE_META,
-        on_add_request=_enqueue_guardrail_error_message,
-    )
-    _patch_engine(monkeypatch, engine)
-
-    app = AsyncOmni("dummy-model")
-    try:
-        with pytest.raises(GuardrailViolationError, match="Guardrail blocked prompt"):
-            async for _ in app.generate(prompt="hello", request_id="req-guardrail-message"):
-                pass
-    finally:
-        app.shutdown()
-
-
 # ───────── OmniBase.check_health() aggregation ─────────
 
 
@@ -1080,27 +1020,6 @@ def test_omni_propagates_engine_generate_error(monkeypatch: pytest.MonkeyPatch):
         app.shutdown()
 
 
-def test_omni_rehydrates_guardrail_stage_error(monkeypatch: pytest.MonkeyPatch):
-    """Synchronous generation should preserve structured guardrail failures."""
-    engine = FakeAsyncOmniEngine(
-        stage_metadata=THREE_STAGE_META,
-        on_add_request=lambda eng, msg: _enqueue_stage_error(
-            eng,
-            msg,
-            error_text="Guardrail blocked video: unsafe",
-            error_type=DiffusionErrorType.GUARDRAIL_VIOLATION,
-        ),
-    )
-    _patch_engine(monkeypatch, engine)
-
-    app = Omni("dummy-model")
-    try:
-        with pytest.raises(GuardrailViolationError, match="Guardrail blocked video"):
-            list(app.generate(["hello"], py_generator=False, use_tqdm=False))
-    finally:
-        app.shutdown()
-
-
 def test_omni_errored_property_alive(monkeypatch: pytest.MonkeyPatch):
     """Omni.errored (inherited from OmniBase) returns False when healthy."""
     engine = FakeAsyncOmniEngine(stage_metadata=THREE_STAGE_META)
diff --git a/vllm_omni/diffusion/data.py b/vllm_omni/diffusion/data.py
index 7131f3605c2..c45ee63e7f1 100644
--- a/vllm_omni/diffusion/data.py
+++ b/vllm_omni/diffusion/data.py
@@ -6,7 +6,6 @@
 import random
 from collections.abc import Callable, Mapping
 from dataclasses import dataclass, field, fields
-from enum import Enum
 from typing import TYPE_CHECKING, Any
 
 import diffusers
@@ -945,7 +944,6 @@ class DiffusionOutput:
     trajectory_log_probs: torch.Tensor | dict | None = None
     trajectory_decoded: list[Image.Image] | None = None
     error: str | None = None
-    error_type: str | None = None
     aborted: bool = False
     abort_message: str | None = None
 
@@ -969,22 +967,6 @@ class DiffusionRequestAbortedError(RuntimeError):
     """Raised when a diffusion request ends via user-visible abort."""
 
 
-class GuardrailViolationError(ValueError):
-    """Raised when a guardrail blocks user input or generated output."""
-
-
-class DiffusionErrorType(str, Enum):
-    """Stable, serializable identifiers for recoverable diffusion errors."""
-
-    GUARDRAIL_VIOLATION = "guardrail_violation"
-
-
-def diffusion_error_type_from_exception(exc: BaseException) -> DiffusionErrorType | None:
-    if isinstance(exc, GuardrailViolationError):
-        return DiffusionErrorType.GUARDRAIL_VIOLATION
-    return None
-
-
 @dataclass
 class AttentionSpec:
     """Specifies a backend and its backend-specific parameters for one attention role."""
diff --git a/vllm_omni/diffusion/diffusion_engine.py b/vllm_omni/diffusion/diffusion_engine.py
index a3137037d3e..0c22c1632e1 100644
--- a/vllm_omni/diffusion/diffusion_engine.py
+++ b/vllm_omni/diffusion/diffusion_engine.py
@@ -20,12 +20,9 @@
 from vllm.v1.engine.exceptions import EngineDeadError
 
 from vllm_omni.diffusion.data import (
-    DiffusionErrorType,
     DiffusionOutput,
     DiffusionRequestAbortedError,
-    GuardrailViolationError,
     OmniDiffusionConfig,
-    diffusion_error_type_from_exception,
 )
 from vllm_omni.diffusion.executor.abstract import DiffusionExecutor
 from vllm_omni.diffusion.registry import (
@@ -225,8 +222,6 @@ async def step(self, request: OmniDiffusionRequest) -> list[OmniRequestOutput]:
         if output.aborted:
             raise DiffusionRequestAbortedError(output.abort_message or "Diffusion request aborted.")
         if output.error:
-            if output.error_type == DiffusionErrorType.GUARDRAIL_VIOLATION:
-                raise GuardrailViolationError(output.error)
             raise RuntimeError(output.error)
         logger.debug("Generation completed successfully.")
 
@@ -485,17 +480,13 @@ def _busy_loop(self):
                 logger.error(
                     "Execution failed for diffusion requests %s", sched_output.scheduled_req_ids, exc_info=True
                 )
-                error_type = diffusion_error_type_from_exception(exc)
                 runner_output = BatchRunnerOutput.from_list(
                     [
                         RunnerOutput(
                             req_id=req_id,
                             step_index=None,
                             finished=True,
-                            result=DiffusionOutput(
-                                error=str(exc),
-                                error_type=error_type,
-                            ),
+                            result=DiffusionOutput(error=str(exc)),
                         )
                         for req_id in sched_output.scheduled_req_ids
                     ]
@@ -648,15 +639,11 @@ def add_req_and_wait_for_response(self, request: OmniDiffusionRequest) -> Diffus
                     raise
                 except Exception as exc:
                     logger.error("Execution failed for diffusion request %s", sched_req_id, exc_info=True)
-                    error_type = diffusion_error_type_from_exception(exc)
                     runner_output = RunnerOutput(
                         req_id=sched_req_id,
                         step_index=None,
                         finished=True,
-                        result=DiffusionOutput(
-                            error=str(exc),
-                            error_type=error_type,
-                        ),
+                        result=DiffusionOutput(error=str(exc)),
                     )
 
                 self._process_aborts_queue()
diff --git a/vllm_omni/diffusion/inline_stage_diffusion_client.py b/vllm_omni/diffusion/inline_stage_diffusion_client.py
index 558662ff902..b6f881d1cdc 100644
--- a/vllm_omni/diffusion/inline_stage_diffusion_client.py
+++ b/vllm_omni/diffusion/inline_stage_diffusion_client.py
@@ -17,7 +17,7 @@
 from vllm.logger import init_logger
 from vllm.v1.engine.exceptions import EngineDeadError
 
-from vllm_omni.diffusion.data import DiffusionRequestAbortedError, diffusion_error_type_from_exception
+from vllm_omni.diffusion.data import DiffusionRequestAbortedError
 from vllm_omni.diffusion.diffusion_engine import DiffusionEngine
 from vllm_omni.diffusion.request import OmniDiffusionRequest
 from vllm_omni.engine.stage_client import StageClientBase
@@ -148,7 +148,6 @@ async def _dispatch_request(
                 images=[],
             )
             error_output.error = str(e)
-            error_output.error_type = diffusion_error_type_from_exception(e)
             self._output_queue.put_nowait(error_output)
         finally:
             self._tasks.pop(request_id, None)
@@ -255,7 +254,6 @@ async def _dispatch_batch(
                 images=[],
             )
             error_output.error = str(e)
-            error_output.error_type = diffusion_error_type_from_exception(e)
             self._output_queue.put_nowait(error_output)
         finally:
             self._tasks.pop(request_id, None)
diff --git a/vllm_omni/diffusion/models/cosmos3/guardrails.py b/vllm_omni/diffusion/models/cosmos3/guardrails.py
index 92655ee3843..571265e32c4 100644
--- a/vllm_omni/diffusion/models/cosmos3/guardrails.py
+++ b/vllm_omni/diffusion/models/cosmos3/guardrails.py
@@ -19,7 +19,6 @@
 import torch
 from vllm.logger import init_logger
 
-from vllm_omni.diffusion.data import GuardrailViolationError
 from vllm_omni.diffusion.models.progress_bar import _is_rank_zero
 
 logger = init_logger(__name__)
@@ -39,7 +38,8 @@ class CosmosSafetyChecker:  # type: ignore[no-redef]
         def __init__(self, *args, **kwargs):
             raise ValueError(
                 f"You have disabled the safety checker for {self.__class__}. This is in violation of the "
-                "[NVIDIA Open Model License Agreement](https://www.nvidia.com/en-us/agreements/enterprise-software/nvidia-open-model-license). "
+                "[NVIDIA Open Model License Agreement]"
+                "(https://www.nvidia.com/en-us/agreements/enterprise-software/nvidia-open-model-license). "
                 f"Please ensure that you are compliant with the license agreement."
             )
 
@@ -62,7 +62,7 @@ def _build_text_guardrail(checker: Any) -> TextGuardrailFn:
     def text_guardrail(prompt: str) -> None:
         if not checker.check_text_safety(prompt):
             # CosmosSafetyChecker logs the specific reason at CRITICAL.
-            raise GuardrailViolationError("Guardrail blocked prompt")
+            raise ValueError("Guardrail blocked prompt")
 
     return text_guardrail
 
diff --git a/vllm_omni/diffusion/stage_diffusion_client.py b/vllm_omni/diffusion/stage_diffusion_client.py
index 92208b7c52b..6ab44b4b0c6 100644
--- a/vllm_omni/diffusion/stage_diffusion_client.py
+++ b/vllm_omni/diffusion/stage_diffusion_client.py
@@ -228,7 +228,6 @@ def _drain_responses(self) -> None:
                 req_id = msg.get("request_id")
                 rpc_id = msg.get("rpc_id")
                 error_msg = msg.get("error")
-                error_type = msg.get("error_type")
                 logger.error(
                     "[StageDiffusionClient] stage-%s [rep-%s] subprocess error for %s: %s",
                     self.stage_id,
@@ -245,13 +244,7 @@ def _drain_responses(self) -> None:
                 # Route request errors as error outputs so the Orchestrator
                 # sees the request complete (instead of hanging forever).
                 if req_id is not None:
-                    self._output_queue.put_nowait(
-                        OmniRequestOutput.from_error(
-                            req_id,
-                            error_msg,
-                            error_type=error_type,
-                        )
-                    )
+                    self._output_queue.put_nowait(OmniRequestOutput.from_error(req_id, error_msg))
 
     # Fields that are subprocess-local and cannot be serialized across
     # process boundaries.  They are recreated in the subprocess with
diff --git a/vllm_omni/diffusion/stage_diffusion_proc.py b/vllm_omni/diffusion/stage_diffusion_proc.py
index dd0ed6ef865..871a29729f2 100644
--- a/vllm_omni/diffusion/stage_diffusion_proc.py
+++ b/vllm_omni/diffusion/stage_diffusion_proc.py
@@ -24,7 +24,7 @@
 from vllm.utils.system_utils import get_mp_context
 from vllm.v1.utils import shutdown
 
-from vllm_omni.diffusion.data import DiffusionRequestAbortedError, diffusion_error_type_from_exception
+from vllm_omni.diffusion.data import DiffusionRequestAbortedError
 from vllm_omni.diffusion.diffusion_engine import DiffusionEngine
 from vllm_omni.diffusion.request import OmniDiffusionRequest
 from vllm_omni.distributed.omni_connectors.utils.serialization import (
@@ -399,7 +399,6 @@ async def _dispatch_request(
                             "type": "error",
                             "request_id": request_id,
                             "error": str(e),
-                            "error_type": diffusion_error_type_from_exception(e),
                         }
                     )
                 )
@@ -483,7 +482,6 @@ async def _dispatch_batch(
                                         "type": "error",
                                         "request_id": rid,
                                         "error": str(e),
-                                        "error_type": diffusion_error_type_from_exception(e),
                                     }
                                 )
                             )
diff --git a/vllm_omni/diffusion/worker/diffusion_worker.py b/vllm_omni/diffusion/worker/diffusion_worker.py
index 66a674c82cf..e2c4b97c101 100644
--- a/vllm_omni/diffusion/worker/diffusion_worker.py
+++ b/vllm_omni/diffusion/worker/diffusion_worker.py
@@ -33,7 +33,6 @@
     OmniDiffusionConfig,
     OmniSleepTask,
     OmniWakeTask,
-    diffusion_error_type_from_exception,
 )
 from vllm_omni.diffusion.distributed.parallel_state import (
     destroy_distributed_env,
@@ -753,12 +752,7 @@ def worker_busy_loop(self) -> None:
                 except Exception as e:
                     logger.error(f"Error processing RPC: {e}", exc_info=True)
                     if self.result_mq is not None:
-                        self.return_result(
-                            DiffusionOutput(
-                                error=str(e),
-                                error_type=diffusion_error_type_from_exception(e),
-                            )
-                        )
+                        self.return_result(DiffusionOutput(error=str(e)))
 
             elif isinstance(msg, dict) and msg.get("type") == "shutdown":
                 logger.info("Worker %s: Received shutdown message", self.gpu_id)
@@ -774,10 +768,7 @@ def worker_busy_loop(self) -> None:
                         f"Error executing forward in event loop: {e}",
                         exc_info=True,
                     )
-                    output = DiffusionOutput(
-                        error=str(e),
-                        error_type=diffusion_error_type_from_exception(e),
-                    )
+                    output = DiffusionOutput(error=str(e))
 
                 try:
                     self.return_result(output)
diff --git a/vllm_omni/engine/messages.py b/vllm_omni/engine/messages.py
index eaab7226ea5..0e55105f13f 100644
--- a/vllm_omni/engine/messages.py
+++ b/vllm_omni/engine/messages.py
@@ -71,7 +71,6 @@ class UnregisterRemoteReplicaMessage(EngineQueueMessage, kw_only=True):
 class ErrorMessage(EngineQueueMessage, kw_only=True):
     type: Literal["error"] = "error"
     error: str
-    error_type: str | None = None
     fatal: bool = False
     request_id: str | None = None
     stage_id: int | None = None
diff --git a/vllm_omni/engine/orchestrator.py b/vllm_omni/engine/orchestrator.py
index 6af983d57a7..27bb44ad7e5 100644
--- a/vllm_omni/engine/orchestrator.py
+++ b/vllm_omni/engine/orchestrator.py
@@ -691,7 +691,6 @@ async def _handle_stage_error(self, stage_id: int, output: Any) -> None:
                 request_id=parent_id,
                 stage_id=stage_id,
                 error=output.error,
-                error_type=getattr(output, "error_type", None),
             )
         )
         await self._cleanup_request_ids(
diff --git a/vllm_omni/entrypoints/async_omni.py b/vllm_omni/entrypoints/async_omni.py
index 2bc81c6dc28..3485f98f71d 100644
--- a/vllm_omni/entrypoints/async_omni.py
+++ b/vllm_omni/entrypoints/async_omni.py
@@ -520,7 +520,7 @@ async def _process_orchestrator_results(
                         result.error,
                         error_stage_id=result.stage_id,
                     )
-                self._raise_nonfatal_error_message(result)
+                raise RuntimeError(result.error)
 
             if not isinstance(result, OutputMessage):
                 logger.warning("[AsyncOmni] Dropping unexpected per-request message %r", result)
@@ -591,15 +591,6 @@ async def _final_output_loop():
                         await self.event_resolver.resolve(msg)
                         continue
 
-                    if isinstance(msg, ErrorMessage) and msg.request_id is not None:
-                        req_state = self.request_states.get(msg.request_id)
-                        if req_state is None:
-                            logger.debug("[AsyncOmni] Dropping error for unknown req %s", msg.request_id)
-                            continue
-                        req_state.stage_id = msg.stage_id
-                        await req_state.queue.put(msg)
-                        continue
-
                     should_continue, _, stage_id, req_state = self._handle_output_message(msg)
                     if should_continue:
                         continue
diff --git a/vllm_omni/entrypoints/omni_base.py b/vllm_omni/entrypoints/omni_base.py
index b8a21670fd0..f2367437ea5 100644
--- a/vllm_omni/entrypoints/omni_base.py
+++ b/vllm_omni/entrypoints/omni_base.py
@@ -13,7 +13,6 @@
 from vllm.logger import init_logger
 from vllm.v1.engine.exceptions import EngineDeadError, EngineGenerateError
 
-from vllm_omni.diffusion.data import DiffusionErrorType, GuardrailViolationError
 from vllm_omni.engine.async_omni_engine import AsyncOmniEngine
 from vllm_omni.engine.messages import (
     EngineQueueMessage,
@@ -326,7 +325,7 @@ def _handle_output_message(
                     msg.error,
                     error_stage_id=msg.stage_id,
                 )
-            self._raise_nonfatal_error_message(msg)
+            raise RuntimeError(msg.error)
 
         if not isinstance(msg, OutputMessage):
             logger.warning("[%s] got unexpected msg type: %s", self.__class__.__name__, msg.type)
@@ -348,15 +347,6 @@ def _handle_output_message(
 
         return False, req_id, stage_id, req_state
 
-    @staticmethod
-    def _is_guardrail_violation(error_type: str | None) -> bool:
-        return error_type == DiffusionErrorType.GUARDRAIL_VIOLATION
-
-    def _raise_nonfatal_error_message(self, msg: ErrorMessage) -> None:
-        if self._is_guardrail_violation(msg.error_type):
-            raise GuardrailViolationError(msg.error)
-        raise RuntimeError(msg.error)
-
     def _check_engine_output_error(
         self,
         result: OutputMessage,
@@ -366,15 +356,13 @@ def _check_engine_output_error(
         """Raise if ``engine_outputs`` carries an error field.
 
         Raises :class:`EngineDeadError` when ``self.errored`` indicates the
-        engine is unrecoverable. For recoverable, single-request failures,
-        raises :class:`GuardrailViolationError` when the error metadata marks
-        a guardrail block, otherwise :class:`EngineGenerateError`.
+        engine is unrecoverable, otherwise :class:`EngineGenerateError`
+        (recoverable, single-request failure).
         """
         engine_outputs = result.engine_outputs
         error_text = getattr(engine_outputs, "error", None)
         if error_text is None:
             return
-        error_type = getattr(engine_outputs, "error_type", None)
         logger.error(
             "[%s] Stage error for req=%s stage-%s: %s",
             self.__class__.__name__,
@@ -388,8 +376,6 @@ def _check_engine_output_error(
                 error_text,
                 error_stage_id=stage_id,
             )
-        if self._is_guardrail_violation(error_type):
-            raise GuardrailViolationError(error_text)
         raise EngineGenerateError(error_text)
 
     def _process_single_result(
diff --git a/vllm_omni/entrypoints/openai/serving_video.py b/vllm_omni/entrypoints/openai/serving_video.py
index ccf54da6d0a..b6ed49996fe 100644
--- a/vllm_omni/entrypoints/openai/serving_video.py
+++ b/vllm_omni/entrypoints/openai/serving_video.py
@@ -14,7 +14,6 @@
 from vllm.engine.protocol import EngineClient
 from vllm.logger import init_logger
 
-from vllm_omni.diffusion.data import GuardrailViolationError
 from vllm_omni.entrypoints.async_omni import AsyncOmni
 from vllm_omni.entrypoints.openai.protocol.videos import (
     VideoData,
@@ -328,18 +327,12 @@ async def _run_generation(
         )
 
         result = None
-        try:
-            async for output in engine_client.generate(
-                prompt=prompt,
-                request_id=request_id,
-                sampling_params_list=sampling_params_list,
-            ):
-                result = output
-        except GuardrailViolationError as exc:
-            raise HTTPException(
-                status_code=HTTPStatus.BAD_REQUEST.value,
-                detail=str(exc),
-            ) from exc
+        async for output in engine_client.generate(
+            prompt=prompt,
+            request_id=request_id,
+            sampling_params_list=sampling_params_list,
+        ):
+            result = output
 
         if result is None:
             raise HTTPException(
diff --git a/vllm_omni/outputs.py b/vllm_omni/outputs.py
index bdf7b973ab8..b4d308ebf8f 100644
--- a/vllm_omni/outputs.py
+++ b/vllm_omni/outputs.py
@@ -102,15 +102,12 @@ class OmniRequestOutput:
 
     # error handling
     error: str | None = None
-    error_type: str | None = None
 
     @classmethod
     def from_error(
         cls,
         request_id: str,
         error_message: str,
-        *,
-        error_type: str | None = None,
     ) -> "OmniRequestOutput":
         """Create a terminal error output.
 
@@ -125,7 +122,6 @@ def from_error(
             request_id=request_id,
             finished=True,
             error=error_message,
-            error_type=error_type,
         )
 
     @classmethod

From 0d325bf733d024c8393d81d7e753575c1da93b2b Mon Sep 17 00:00:00 2001
From: Maciej Bala <mbala@nvidia.com>
Date: Fri, 22 May 2026 15:35:26 +0200
Subject: [PATCH 27/41] Improved error message

Signed-off-by: Maciej Bala <mbala@nvidia.com>
---
 vllm_omni/diffusion/models/cosmos3/guardrails.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/vllm_omni/diffusion/models/cosmos3/guardrails.py b/vllm_omni/diffusion/models/cosmos3/guardrails.py
index 571265e32c4..71525a6272e 100644
--- a/vllm_omni/diffusion/models/cosmos3/guardrails.py
+++ b/vllm_omni/diffusion/models/cosmos3/guardrails.py
@@ -41,6 +41,7 @@ def __init__(self, *args, **kwargs):
                 "[NVIDIA Open Model License Agreement]"
                 "(https://www.nvidia.com/en-us/agreements/enterprise-software/nvidia-open-model-license). "
                 f"Please ensure that you are compliant with the license agreement."
+                f" Please install cosmos-guardrail package to enable safety checks."
             )
 
 

From 397d2feb2404ed4a0ef423bfb459f96870d2e42f Mon Sep 17 00:00:00 2001
From: Maciej Bala <mbala@nvidia.com>
Date: Fri, 22 May 2026 23:26:46 +0200
Subject: [PATCH 28/41] Remove custom RMSNorm

Signed-off-by: Maciej Bala <mbala@nvidia.com>
---
 .../models/cosmos3/transformer_cosmos3.py     | 45 +++++--------------
 1 file changed, 11 insertions(+), 34 deletions(-)

diff --git a/vllm_omni/diffusion/models/cosmos3/transformer_cosmos3.py b/vllm_omni/diffusion/models/cosmos3/transformer_cosmos3.py
index c5c6c563da3..7568207b7eb 100644
--- a/vllm_omni/diffusion/models/cosmos3/transformer_cosmos3.py
+++ b/vllm_omni/diffusion/models/cosmos3/transformer_cosmos3.py
@@ -31,6 +31,7 @@
 from vllm_omni.diffusion.attention.backends.abstract import AttentionMetadata
 from vllm_omni.diffusion.attention.layer import Attention as FrameworkAttention
 from vllm_omni.diffusion.distributed.sp_plan import SequenceParallelInput, SequenceParallelOutput
+from vllm_omni.diffusion.layers.norm import RMSNorm
 
 logger = init_logger(__name__)
 
@@ -83,30 +84,6 @@ def _tf_config_get(config: Any, key: str, default: Any) -> Any:
     return getattr(config, key, default)
 
 
-# ---------------------------------------------------------------------------
-# RMSNorm
-# ---------------------------------------------------------------------------
-class Qwen3VLTextRMSNorm(nn.Module):
-    """RMSNorm compatible with Qwen3-VL / T5LayerNorm."""
-
-    def __init__(
-        self,
-        hidden_size: int,
-        eps: float = 1e-6,
-        dtype: torch.dtype = torch.bfloat16,
-    ) -> None:
-        super().__init__()
-        self.weight = nn.Parameter(torch.ones(hidden_size, dtype=dtype))
-        self.variance_epsilon = eps
-
-    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
-        input_dtype = hidden_states.dtype
-        hidden_states = hidden_states.to(torch.float32)
-        variance = hidden_states.pow(2).mean(-1, keepdim=True)
-        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
-        return self.weight * hidden_states.to(input_dtype)
-
-
 # ---------------------------------------------------------------------------
 # Rotary Position Embeddings (mRoPE)
 # ---------------------------------------------------------------------------
@@ -395,8 +372,8 @@ def __init__(
             prefix=f"{prefix}.o_proj",
         )
 
-        self.q_norm = Qwen3VLTextRMSNorm(self.head_dim, eps=rms_norm_eps, dtype=dtype)
-        self.k_norm = Qwen3VLTextRMSNorm(self.head_dim, eps=rms_norm_eps, dtype=dtype)
+        self.q_norm = RMSNorm(self.head_dim, eps=rms_norm_eps)
+        self.k_norm = RMSNorm(self.head_dim, eps=rms_norm_eps)
 
     def forward(
         self,
@@ -509,8 +486,8 @@ def __init__(
             prefix=f"{prefix}.o_proj",
         )
 
-        self.q_norm = Qwen3VLTextRMSNorm(self.head_dim, eps=rms_norm_eps, dtype=dtype)
-        self.k_norm = Qwen3VLTextRMSNorm(self.head_dim, eps=rms_norm_eps, dtype=dtype)
+        self.q_norm = RMSNorm(self.head_dim, eps=rms_norm_eps)
+        self.k_norm = RMSNorm(self.head_dim, eps=rms_norm_eps)
 
         self.local_attn = FrameworkAttention(
             num_heads=self.num_heads_local,
@@ -648,8 +625,8 @@ def __init__(
             quant_config=quant_config,
             prefix=f"{prefix}.self_attn",
         )
-        self.input_layernorm = Qwen3VLTextRMSNorm(hidden_size, eps=rms_norm_eps, dtype=dtype)
-        self.post_attention_layernorm = Qwen3VLTextRMSNorm(hidden_size, eps=rms_norm_eps, dtype=dtype)
+        self.input_layernorm = RMSNorm(hidden_size, eps=rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(hidden_size, eps=rms_norm_eps)
         self.mlp = Cosmos3GatedMLP(
             hidden_size=hidden_size,
             intermediate_size=intermediate_size,
@@ -707,8 +684,8 @@ def __init__(
             quant_config=quant_config,
             prefix=f"{prefix}.cross_attention",
         )
-        self.input_layernorm = Qwen3VLTextRMSNorm(hidden_size, eps=rms_norm_eps, dtype=dtype)
-        self.post_attention_layernorm = Qwen3VLTextRMSNorm(hidden_size, eps=rms_norm_eps, dtype=dtype)
+        self.input_layernorm = RMSNorm(hidden_size, eps=rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(hidden_size, eps=rms_norm_eps)
         self.mlp = Cosmos3GatedMLP(
             hidden_size=hidden_size,
             intermediate_size=intermediate_size,
@@ -802,7 +779,7 @@ def __init__(
                 for i in range(num_hidden_layers)
             ]
         )
-        self.norm = Qwen3VLTextRMSNorm(hidden_size, eps=rms_norm_eps, dtype=dtype)
+        self.norm = RMSNorm(hidden_size, eps=rms_norm_eps)
 
     def forward(
         self,
@@ -976,7 +953,7 @@ def __init__(
             ]
         )
 
-        self.norm_moe_gen = Qwen3VLTextRMSNorm(self.hidden_size, eps=self.rms_norm_eps, dtype=dtype)
+        self.norm_moe_gen = RMSNorm(self.hidden_size, eps=self.rms_norm_eps)
         self.gen_sp_prepare = Cosmos3GenSPPrepare()
         self.gen_sp_gather = nn.Identity()
 

From 82b8a2cd873b711afbb437a07f8670dfb975d82f Mon Sep 17 00:00:00 2001
From: Maciej Bala <mbala@nvidia.com>
Date: Fri, 22 May 2026 23:57:53 +0200
Subject: [PATCH 29/41] Added Cosmos3 as a pipeline

Signed-off-by: Maciej Bala <mbala@nvidia.com>
---
 vllm_omni/config/pipeline_registry.py          |  4 ++++
 vllm_omni/deploy/cosmos3_omni.yaml             | 14 ++++++++++++++
 vllm_omni/diffusion/models/cosmos3/pipeline.py |  2 +-
 vllm_omni/entrypoints/utils.py                 |  2 +-
 4 files changed, 20 insertions(+), 2 deletions(-)
 create mode 100644 vllm_omni/deploy/cosmos3_omni.yaml

diff --git a/vllm_omni/config/pipeline_registry.py b/vllm_omni/config/pipeline_registry.py
index 555f35e173a..3616168df7d 100644
--- a/vllm_omni/config/pipeline_registry.py
+++ b/vllm_omni/config/pipeline_registry.py
@@ -33,6 +33,10 @@
 # --- Multi-stage omni pipelines (LLM-centric; audio / video I/O) ---
 _OMNI_PIPELINES: dict[str, tuple[str, str]] = {
     # model_type -> (module_path, variable_name)
+    "cosmos3_omni": (
+        "vllm_omni.diffusion.models.cosmos3.pipeline",
+        "COSMOS3_PIPELINE",
+    ),
     "qwen2_5_omni": (
         "vllm_omni.model_executor.models.qwen2_5_omni.pipeline",
         "QWEN2_5_OMNI_PIPELINE",
diff --git a/vllm_omni/deploy/cosmos3_omni.yaml b/vllm_omni/deploy/cosmos3_omni.yaml
new file mode 100644
index 00000000000..2f3ed85a797
--- /dev/null
+++ b/vllm_omni/deploy/cosmos3_omni.yaml
@@ -0,0 +1,14 @@
+# Cosmos3 single-stage diffusion deploy config.
+#
+# This config is auto-loaded for Diffusers repos whose model_index.json has
+# _class_name: Cosmos3OmniDiffusersPipeline. Pass --deploy-config only for
+# local overrides such as disabling guardrails.
+
+async_chunk: false
+trust_remote_code: true
+
+stages:
+  - stage_id: 0
+    max_num_seqs: 1
+    enforce_eager: true
+    model_class_name: Cosmos3OmniDiffusersPipeline
diff --git a/vllm_omni/diffusion/models/cosmos3/pipeline.py b/vllm_omni/diffusion/models/cosmos3/pipeline.py
index a6c84959586..23bd47f0a3d 100644
--- a/vllm_omni/diffusion/models/cosmos3/pipeline.py
+++ b/vllm_omni/diffusion/models/cosmos3/pipeline.py
@@ -9,7 +9,7 @@
 )
 
 COSMOS3_PIPELINE = PipelineConfig(
-    model_type="cosmos3",
+    model_type="cosmos3_omni",
     model_arch="Cosmos3ForConditionalGeneration",
     hf_architectures=("Cosmos3ForConditionalGeneration",),
     diffusers_class_name="Cosmos3OmniDiffusersPipeline",
diff --git a/vllm_omni/entrypoints/utils.py b/vllm_omni/entrypoints/utils.py
index 79351a3266b..4ec0698da00 100644
--- a/vllm_omni/entrypoints/utils.py
+++ b/vllm_omni/entrypoints/utils.py
@@ -36,7 +36,7 @@ def _warn_deprecated_explicit_keys(kwargs: dict[str, Any]) -> None:
 
 
 _DIFFUSERS_CLASS_TO_CONFIG: dict[str, str] = {
-    "Cosmos3OmniDiffusersPipeline": "cosmos3",
+    "Cosmos3OmniDiffusersPipeline": "cosmos3_omni",
     "GlmImagePipeline": "glm_image",
 }
 

From 2e88a23ee35024f45a00fe5b75b183b955fd4d80 Mon Sep 17 00:00:00 2001
From: Maciej Bala <mbala@nvidia.com>
Date: Sat, 23 May 2026 00:21:45 +0200
Subject: [PATCH 30/41] Cleanup

Signed-off-by: Maciej Bala <mbala@nvidia.com>
---
 vllm_omni/entrypoints/omni_base.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm_omni/entrypoints/omni_base.py b/vllm_omni/entrypoints/omni_base.py
index f2367437ea5..7bd27be817d 100644
--- a/vllm_omni/entrypoints/omni_base.py
+++ b/vllm_omni/entrypoints/omni_base.py
@@ -356,7 +356,7 @@ def _check_engine_output_error(
         """Raise if ``engine_outputs`` carries an error field.
 
         Raises :class:`EngineDeadError` when ``self.errored`` indicates the
-        engine is unrecoverable, otherwise :class:`EngineGenerateError`
+        engine is unrecoverable, otherwise raises :class:`EngineGenerateError`
         (recoverable, single-request failure).
         """
         engine_outputs = result.engine_outputs

From bd12fd4fc609fa55f299165b802ad75e7e97c639 Mon Sep 17 00:00:00 2001
From: Maciej Bala <mbala@nvidia.com>
Date: Mon, 25 May 2026 10:56:23 +0200
Subject: [PATCH 31/41] Adapted to new checkpoint format

Signed-off-by: Maciej Bala <mbala@nvidia.com>
---
 .../models/cosmos3/test_cosmos3_pipeline.py   | 17 ++++-
 .../models/cosmos3/pipeline_cosmos3.py        | 60 +++++++--------
 .../models/cosmos3/transformer_cosmos3.py     | 74 +++++++++----------
 3 files changed, 81 insertions(+), 70 deletions(-)

diff --git a/tests/diffusion/models/cosmos3/test_cosmos3_pipeline.py b/tests/diffusion/models/cosmos3/test_cosmos3_pipeline.py
index 28a53d35074..a936fabcee5 100644
--- a/tests/diffusion/models/cosmos3/test_cosmos3_pipeline.py
+++ b/tests/diffusion/models/cosmos3/test_cosmos3_pipeline.py
@@ -101,10 +101,21 @@ def test_prompt_formatting_and_checkpoint_key_remap(make_cosmos3_pipeline) -> No
     assert "The video is not 2.0 seconds long" in captured[1]
 
     remaps = {
+        "embed_tokens.weight": "transformer.language_model.embed_tokens.weight",
         "model.embed_tokens.weight": "transformer.language_model.embed_tokens.weight",
-        "model.layers.3.self_attn.q_proj.weight": "transformer.language_model.layers.3.self_attn.q_proj.weight",
-        "model.layers.3.self_attn.q_proj_moe_gen.weight": "transformer.gen_layers.3.cross_attention.q_proj.weight",
-        "lm_head.weight": None,
+        "norm.weight": "transformer.language_model.norm.weight",
+        "norm_moe_gen.weight": "transformer.norm_moe_gen.weight",
+        "proj_in.weight": "transformer.proj_in.weight",
+        "proj_out.bias": "transformer.proj_out.bias",
+        "layers.3.self_attn.to_q.weight": "transformer.language_model.layers.3.self_attn.to_q.weight",
+        "layers.3.self_attn.to_out.weight": "transformer.language_model.layers.3.self_attn.to_out.weight",
+        "layers.3.self_attn.norm_q.weight": "transformer.language_model.layers.3.self_attn.norm_q.weight",
+        "layers.3.self_attn.add_q_proj.weight": "transformer.gen_layers.3.cross_attention.to_q.weight",
+        "layers.3.self_attn.to_add_out.weight": "transformer.gen_layers.3.cross_attention.to_out.weight",
+        "layers.3.self_attn.norm_added_q.weight": "transformer.gen_layers.3.cross_attention.norm_q.weight",
+        "transformer.model.layers.3.self_attn.add_k_proj.weight": (
+            "transformer.gen_layers.3.cross_attention.to_k.weight"
+        ),
     }
     assert {key: Cosmos3OmniDiffusersPipeline._remap_ckpt_key(key) for key in remaps} == remaps
 
diff --git a/vllm_omni/diffusion/models/cosmos3/pipeline_cosmos3.py b/vllm_omni/diffusion/models/cosmos3/pipeline_cosmos3.py
index e721ff8741e..8dd0ae4708f 100644
--- a/vllm_omni/diffusion/models/cosmos3/pipeline_cosmos3.py
+++ b/vllm_omni/diffusion/models/cosmos3/pipeline_cosmos3.py
@@ -321,10 +321,12 @@ def _remap_ckpt_key(key: str) -> str | None:
         """Remap a Diffusers transformer key to the model parameter namespace.
 
         Checkpoint keys arrive with a synthetic ``transformer.`` prefix from
-        ``weights_sources``.  The source checkpoint itself uses the Diffusers
-        transformer namespace: top-level projections plus ``model.*`` for the
-        Qwen3-VL backbone.  UND and GEN components share each layer in the
-        source and are split into separate module lists here.
+        ``weights_sources``.  The source checkpoint itself uses the prefixless
+        Diffusers transformer namespace: top-level projections plus Qwen3-VL
+        backbone keys.  UND and GEN components share each layer in the source
+        and are split into separate module lists here.  Some sources wrap the
+        transformer namespace under ``model.``; that wrapper is structural and
+        is stripped before applying the Cosmos3 leaf-name remap.
 
         Returns the remapped name under ``transformer.``, or None to skip.
         """
@@ -332,12 +334,14 @@ def _remap_ckpt_key(key: str) -> str | None:
         # Strip the weights_sources prefix
         if k.startswith("transformer."):
             k = k[len("transformer.") :]
+        if k.startswith("model."):
+            k = k[len("model.") :]
 
         # Top-level generation components.
         if k.startswith(
             (
-                "vae2llm.",
-                "llm2vae.",
+                "proj_in.",
+                "proj_out.",
                 "time_embedder.",
             )
         ):
@@ -347,19 +351,15 @@ def _remap_ckpt_key(key: str) -> str | None:
         if k.startswith("lm_head."):
             return None
 
-        # embed_tokens / norm → language_model.*
-        if k.startswith("model.embed_tokens."):
-            return f"transformer.language_model.{k[len('model.') :]}"
-        if k.startswith("model.norm."):
-            return f"transformer.language_model.{k[len('model.') :]}"
+        # embed_tokens / norm -> language_model.*
+        if k.startswith("embed_tokens."):
+            return f"transformer.language_model.{k}"
+        if k.startswith("norm."):
+            return f"transformer.language_model.{k}"
 
-        # norm_moe_gen → top level
-        if k.startswith("model.norm_moe_gen."):
-            return f"transformer.{k[len('model.') :]}"
-
-        if not k.startswith("model.layers."):
-            return None
-        k = k[len("model.") :]
+        # norm_moe_gen -> top level
+        if k.startswith("norm_moe_gen."):
+            return f"transformer.{k}"
 
         if not k.startswith("layers."):
             return None
@@ -375,19 +375,19 @@ def _remap_ckpt_key(key: str) -> str | None:
 
         _LAYER_MAP = {
             # UND attention
-            "self_attn.q_proj.": f"{und_lp}.self_attn.q_proj.",
-            "self_attn.k_proj.": f"{und_lp}.self_attn.k_proj.",
-            "self_attn.v_proj.": f"{und_lp}.self_attn.v_proj.",
-            "self_attn.o_proj.": f"{und_lp}.self_attn.o_proj.",
-            "self_attn.q_norm.": f"{und_lp}.self_attn.q_norm.",
-            "self_attn.k_norm.": f"{und_lp}.self_attn.k_norm.",
+            "self_attn.to_q.": f"{und_lp}.self_attn.to_q.",
+            "self_attn.to_k.": f"{und_lp}.self_attn.to_k.",
+            "self_attn.to_v.": f"{und_lp}.self_attn.to_v.",
+            "self_attn.to_out.": f"{und_lp}.self_attn.to_out.",
+            "self_attn.norm_q.": f"{und_lp}.self_attn.norm_q.",
+            "self_attn.norm_k.": f"{und_lp}.self_attn.norm_k.",
             # GEN attention
-            "self_attn.q_proj_moe_gen.": f"{gen_lp}.cross_attention.q_proj.",
-            "self_attn.k_proj_moe_gen.": f"{gen_lp}.cross_attention.k_proj.",
-            "self_attn.v_proj_moe_gen.": f"{gen_lp}.cross_attention.v_proj.",
-            "self_attn.o_proj_moe_gen.": f"{gen_lp}.cross_attention.o_proj.",
-            "self_attn.q_norm_moe_gen.": f"{gen_lp}.cross_attention.q_norm.",
-            "self_attn.k_norm_moe_gen.": f"{gen_lp}.cross_attention.k_norm.",
+            "self_attn.add_q_proj.": f"{gen_lp}.cross_attention.to_q.",
+            "self_attn.add_k_proj.": f"{gen_lp}.cross_attention.to_k.",
+            "self_attn.add_v_proj.": f"{gen_lp}.cross_attention.to_v.",
+            "self_attn.to_add_out.": f"{gen_lp}.cross_attention.to_out.",
+            "self_attn.norm_added_q.": f"{gen_lp}.cross_attention.norm_q.",
+            "self_attn.norm_added_k.": f"{gen_lp}.cross_attention.norm_k.",
             # Norms
             "input_layernorm.": f"{und_lp}.input_layernorm.",
             "post_attention_layernorm.": f"{und_lp}.post_attention_layernorm.",
diff --git a/vllm_omni/diffusion/models/cosmos3/transformer_cosmos3.py b/vllm_omni/diffusion/models/cosmos3/transformer_cosmos3.py
index 7568207b7eb..c1f6464caf3 100644
--- a/vllm_omni/diffusion/models/cosmos3/transformer_cosmos3.py
+++ b/vllm_omni/diffusion/models/cosmos3/transformer_cosmos3.py
@@ -335,45 +335,45 @@ def __init__(
         self.num_heads_local = self.num_heads // tp_size
         self.num_kv_heads_local = self.num_kv_heads // tp_size
 
-        self.q_proj = ColumnParallelLinear(
+        self.to_q = ColumnParallelLinear(
             hidden_size,
             self.num_heads * self.head_dim,
             bias=False,
             gather_output=False,
             return_bias=False,
             quant_config=quant_config,
-            prefix=f"{prefix}.q_proj",
+            prefix=f"{prefix}.to_q",
         )
-        self.k_proj = ColumnParallelLinear(
+        self.to_k = ColumnParallelLinear(
             hidden_size,
             self.num_kv_heads * self.head_dim,
             bias=False,
             gather_output=False,
             return_bias=False,
             quant_config=quant_config,
-            prefix=f"{prefix}.k_proj",
+            prefix=f"{prefix}.to_k",
         )
-        self.v_proj = ColumnParallelLinear(
+        self.to_v = ColumnParallelLinear(
             hidden_size,
             self.num_kv_heads * self.head_dim,
             bias=False,
             gather_output=False,
             return_bias=False,
             quant_config=quant_config,
-            prefix=f"{prefix}.v_proj",
+            prefix=f"{prefix}.to_v",
         )
-        self.o_proj = RowParallelLinear(
+        self.to_out = RowParallelLinear(
             self.num_heads * self.head_dim,
             hidden_size,
             bias=False,
             input_is_parallel=True,
             return_bias=False,
             quant_config=quant_config,
-            prefix=f"{prefix}.o_proj",
+            prefix=f"{prefix}.to_out",
         )
 
-        self.q_norm = RMSNorm(self.head_dim, eps=rms_norm_eps)
-        self.k_norm = RMSNorm(self.head_dim, eps=rms_norm_eps)
+        self.norm_q = RMSNorm(self.head_dim, eps=rms_norm_eps)
+        self.norm_k = RMSNorm(self.head_dim, eps=rms_norm_eps)
 
     def forward(
         self,
@@ -384,13 +384,13 @@ def forward(
     ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
         B, S, _ = hidden_states.shape
 
-        q = self.q_proj(hidden_states).view(B, S, self.num_heads_local, self.head_dim)
-        k = self.k_proj(hidden_states).view(B, S, self.num_kv_heads_local, self.head_dim)
-        v = self.v_proj(hidden_states).view(B, S, self.num_kv_heads_local, self.head_dim)
+        q = self.to_q(hidden_states).view(B, S, self.num_heads_local, self.head_dim)
+        k = self.to_k(hidden_states).view(B, S, self.num_kv_heads_local, self.head_dim)
+        v = self.to_v(hidden_states).view(B, S, self.num_kv_heads_local, self.head_dim)
 
         # Per-head QK norm
-        q = F.rms_norm(q, (q.shape[-1],), self.q_norm.weight, self.q_norm.variance_epsilon)
-        k = F.rms_norm(k, (k.shape[-1],), self.k_norm.weight, self.k_norm.variance_epsilon)
+        q = F.rms_norm(q, (q.shape[-1],), self.norm_q.weight, self.norm_q.variance_epsilon)
+        k = F.rms_norm(k, (k.shape[-1],), self.norm_k.weight, self.norm_k.variance_epsilon)
 
         # Qwen3-style RoPE
         q, k = _apply_rotary_pos_emb(q, k, freqs_cos, freqs_sin)
@@ -409,7 +409,7 @@ def forward(
             out = F.scaled_dot_product_attention(q_t, k_t, v_t, is_causal=True, enable_gqa=True)
 
         out = out.transpose(1, 2).contiguous().view(B, S, -1)
-        return self.o_proj(out), k, v
+        return self.to_out(out), k, v
 
 
 class Cosmos3CrossAttention(nn.Module):
@@ -449,45 +449,45 @@ def __init__(
         self.num_heads_local = self.num_heads // tp_size
         self.num_kv_heads_local = self.num_kv_heads // tp_size
 
-        self.q_proj = ColumnParallelLinear(
+        self.to_q = ColumnParallelLinear(
             hidden_size,
             self.num_heads * self.head_dim,
             bias=False,
             gather_output=False,
             return_bias=False,
             quant_config=quant_config,
-            prefix=f"{prefix}.q_proj",
+            prefix=f"{prefix}.to_q",
         )
-        self.k_proj = ColumnParallelLinear(
+        self.to_k = ColumnParallelLinear(
             hidden_size,
             self.num_kv_heads * self.head_dim,
             bias=False,
             gather_output=False,
             return_bias=False,
             quant_config=quant_config,
-            prefix=f"{prefix}.k_proj",
+            prefix=f"{prefix}.to_k",
         )
-        self.v_proj = ColumnParallelLinear(
+        self.to_v = ColumnParallelLinear(
             hidden_size,
             self.num_kv_heads * self.head_dim,
             bias=False,
             gather_output=False,
             return_bias=False,
             quant_config=quant_config,
-            prefix=f"{prefix}.v_proj",
+            prefix=f"{prefix}.to_v",
         )
-        self.o_proj = RowParallelLinear(
+        self.to_out = RowParallelLinear(
             self.num_heads * self.head_dim,
             hidden_size,
             bias=False,
             input_is_parallel=True,
             return_bias=False,
             quant_config=quant_config,
-            prefix=f"{prefix}.o_proj",
+            prefix=f"{prefix}.to_out",
         )
 
-        self.q_norm = RMSNorm(self.head_dim, eps=rms_norm_eps)
-        self.k_norm = RMSNorm(self.head_dim, eps=rms_norm_eps)
+        self.norm_q = RMSNorm(self.head_dim, eps=rms_norm_eps)
+        self.norm_k = RMSNorm(self.head_dim, eps=rms_norm_eps)
 
         self.local_attn = FrameworkAttention(
             num_heads=self.num_heads_local,
@@ -576,13 +576,13 @@ def forward(
         """
         B, S_gen, _ = hidden_states.shape
 
-        q = self.q_proj(hidden_states).view(B, S_gen, self.num_heads_local, self.head_dim)
-        k = self.k_proj(hidden_states).view(B, S_gen, self.num_kv_heads_local, self.head_dim)
-        v = self.v_proj(hidden_states).view(B, S_gen, self.num_kv_heads_local, self.head_dim)
+        q = self.to_q(hidden_states).view(B, S_gen, self.num_heads_local, self.head_dim)
+        k = self.to_k(hidden_states).view(B, S_gen, self.num_kv_heads_local, self.head_dim)
+        v = self.to_v(hidden_states).view(B, S_gen, self.num_kv_heads_local, self.head_dim)
 
         # Per-head QK norm
-        q = F.rms_norm(q, (q.shape[-1],), self.q_norm.weight, self.q_norm.variance_epsilon)
-        k = F.rms_norm(k, (k.shape[-1],), self.k_norm.weight, self.k_norm.variance_epsilon)
+        q = F.rms_norm(q, (q.shape[-1],), self.norm_q.weight, self.norm_q.variance_epsilon)
+        k = F.rms_norm(k, (k.shape[-1],), self.norm_k.weight, self.norm_k.variance_epsilon)
 
         # Qwen3-style RoPE
         q, k = _apply_rotary_pos_emb(q, k, freqs_cos, freqs_sin)
@@ -592,7 +592,7 @@ def forward(
         else:
             out = self._forward_local(q, k, v, k_und, v_und)
 
-        return self.o_proj(out)
+        return self.to_out(out)
 
 
 # ---------------------------------------------------------------------------
@@ -930,9 +930,9 @@ def __init__(
             prefix="language_model",
         )
 
-        # vae2llm / llm2vae are small projection layers; not worth quantizing.
-        self.vae2llm = nn.Linear(self.patch_latent_dim, self.hidden_size)
-        self.llm2vae = nn.Linear(self.hidden_size, self.patch_latent_dim)
+        # Video projection layers are small; not worth quantizing.
+        self.proj_in = nn.Linear(self.patch_latent_dim, self.hidden_size)
+        self.proj_out = nn.Linear(self.hidden_size, self.patch_latent_dim)
         self.time_embedder = TimestepEmbedder(self.hidden_size, target_dtype=torch.bfloat16)
 
         self.gen_layers = nn.ModuleList(
@@ -1127,7 +1127,7 @@ def forward(
         ulysses_size, _, _ = _get_ulysses_state()
 
         # Patchify latents and project to hidden space
-        hidden_video = self.vae2llm(self.patchify(hidden_states, t, h, w))
+        hidden_video = self.proj_in(self.patchify(hidden_states, t, h, w))
         s_video = hidden_video.shape[1]
 
         # Timestep embedding (fp32 for precision).
@@ -1213,7 +1213,7 @@ def forward(
 
         # Final norm and project back to latent space
         hidden_gen = self.norm_moe_gen(hidden_gen)
-        return self.unpatchify(self.llm2vae(hidden_gen), t, h, w)
+        return self.unpatchify(self.proj_out(hidden_gen), t, h, w)
 
     def post_load_weights(self) -> None:
         """Post-load processing: ensure correct dtypes."""

From 2c8ec8405a6ac15d3c99f4552263997daca2f5d2 Mon Sep 17 00:00:00 2001
From: Maciej Bala <mbala@nvidia.com>
Date: Mon, 25 May 2026 18:40:32 +0200
Subject: [PATCH 32/41] Linter fixes

Signed-off-by: Maciej Bala <mbala@nvidia.com>
---
 tests/diffusion/cache/test_cache_dit.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/diffusion/cache/test_cache_dit.py b/tests/diffusion/cache/test_cache_dit.py
index 62e1c78e6fc..146084a508c 100644
--- a/tests/diffusion/cache/test_cache_dit.py
+++ b/tests/diffusion/cache/test_cache_dit.py
@@ -64,8 +64,8 @@ def test_cosmos3_cache_dit_wraps_gen_layers(mock_cache_dit, mock_block_adapter):
     assert adapter_kwargs["blocks"] == [gen_layers]
     assert adapter_kwargs["has_separate_cfg"] is True
     assert adapter_kwargs["check_forward_pattern"] is False
-    
-    
+
+
 # This test is skipped on ROCm since rocm_unquantized_gemm doesn't support CPU backend
 @pytest.mark.skipif(
     current_omni_platform.is_rocm(),

From 6d16f65688f57d4c0943f3c11947edeae2cc8f5f Mon Sep 17 00:00:00 2001
From: Maciej Bala <mbala@nvidia.com>
Date: Mon, 25 May 2026 23:04:13 +0200
Subject: [PATCH 33/41] Cleaned up the code

Signed-off-by: Maciej Bala <mbala@nvidia.com>
---
 .../models/cosmos3/pipeline_cosmos3.py        |  8 +++-
 .../models/cosmos3/transformer_cosmos3.py     | 40 +++++++------------
 vllm_omni/inputs/data.py                      |  2 +
 3 files changed, 22 insertions(+), 28 deletions(-)

diff --git a/vllm_omni/diffusion/models/cosmos3/pipeline_cosmos3.py b/vllm_omni/diffusion/models/cosmos3/pipeline_cosmos3.py
index 8dd0ae4708f..5921a535c01 100644
--- a/vllm_omni/diffusion/models/cosmos3/pipeline_cosmos3.py
+++ b/vllm_omni/diffusion/models/cosmos3/pipeline_cosmos3.py
@@ -238,7 +238,7 @@ def __init__(
             )
         self.od_config = od_config
         self.device = get_local_device()
-        self.dtype = getattr(od_config, "dtype", torch.bfloat16)
+        self.dtype = od_config.dtype
 
         model_path = od_config.model
         local_files_only = os.path.exists(model_path)
@@ -254,7 +254,7 @@ def __init__(
         self.vae = DistributedAutoencoderKLWan.from_pretrained(
             model_path,
             subfolder="vae",
-            torch_dtype=torch.bfloat16,
+            torch_dtype=self.dtype,
             local_files_only=local_files_only,
         ).to(self.device)
 
@@ -493,6 +493,10 @@ def _is_t2i_request(req: OmniDiffusionRequest) -> bool:
             modalities = [modalities]
         if "image" in modalities and "video" in modalities:
             raise ValueError("Cosmos3 prompt modalities cannot request both image and video output.")
+
+        accepted_modalities = ["image", "video", "text", "audio"]
+        if any([x not in accepted_modalities for x in modalities]):
+            raise ValueError(f"Incorrect modality value in {modalities}, expected one of {accepted_modalities}.")
         return "image" in modalities
 
     def _set_flow_shift(self, target_shift: float) -> None:
diff --git a/vllm_omni/diffusion/models/cosmos3/transformer_cosmos3.py b/vllm_omni/diffusion/models/cosmos3/transformer_cosmos3.py
index c1f6464caf3..8406edb7b96 100644
--- a/vllm_omni/diffusion/models/cosmos3/transformer_cosmos3.py
+++ b/vllm_omni/diffusion/models/cosmos3/transformer_cosmos3.py
@@ -30,6 +30,7 @@
 
 from vllm_omni.diffusion.attention.backends.abstract import AttentionMetadata
 from vllm_omni.diffusion.attention.layer import Attention as FrameworkAttention
+from vllm_omni.diffusion.data import OmniDiffusionConfig
 from vllm_omni.diffusion.distributed.sp_plan import SequenceParallelInput, SequenceParallelOutput
 from vllm_omni.diffusion.layers.norm import RMSNorm
 
@@ -41,19 +42,16 @@ def _get_ulysses_state() -> tuple[int, int, dist.ProcessGroup | None]:
 
     Returns (1, 0, None) when sequence parallelism is not active.
     """
-    try:
-        from vllm_omni.diffusion.distributed.parallel_state import (
-            get_sp_group,
-            get_ulysses_parallel_rank,
-            get_ulysses_parallel_world_size,
-        )
-
-        size = get_ulysses_parallel_world_size()
-        if size <= 1:
-            return 1, 0, None
-        return size, get_ulysses_parallel_rank(), get_sp_group().ulysses_group
-    except Exception:
+    from vllm_omni.diffusion.distributed.parallel_state import (
+        get_sp_group,
+        get_ulysses_parallel_rank,
+        get_ulysses_parallel_world_size,
+    )
+
+    size = get_ulysses_parallel_world_size()
+    if size <= 1:
         return 1, 0, None
+    return size, get_ulysses_parallel_rank(), get_sp_group().ulysses_group
 
 
 def _is_sp_active() -> bool:
@@ -321,7 +319,6 @@ def __init__(
         num_key_value_heads: int,
         head_dim: int,
         rms_norm_eps: float,
-        dtype: torch.dtype = torch.bfloat16,
         quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
@@ -435,7 +432,6 @@ def __init__(
         num_key_value_heads: int,
         head_dim: int,
         rms_norm_eps: float,
-        dtype: torch.dtype = torch.bfloat16,
         quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
@@ -610,7 +606,6 @@ def __init__(
         num_key_value_heads: int,
         head_dim: int,
         rms_norm_eps: float,
-        dtype: torch.dtype = torch.bfloat16,
         quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
@@ -621,7 +616,6 @@ def __init__(
             num_key_value_heads=num_key_value_heads,
             head_dim=head_dim,
             rms_norm_eps=rms_norm_eps,
-            dtype=dtype,
             quant_config=quant_config,
             prefix=f"{prefix}.self_attn",
         )
@@ -668,7 +662,6 @@ def __init__(
         num_key_value_heads: int,
         head_dim: int,
         rms_norm_eps: float,
-        dtype: torch.dtype = torch.bfloat16,
         quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
@@ -680,7 +673,6 @@ def __init__(
             num_key_value_heads=num_key_value_heads,
             head_dim=head_dim,
             rms_norm_eps=rms_norm_eps,
-            dtype=dtype,
             quant_config=quant_config,
             prefix=f"{prefix}.cross_attention",
         )
@@ -752,7 +744,6 @@ def __init__(
         rms_norm_eps: float,
         rope_theta: float,
         mrope_section: list[int],
-        dtype: torch.dtype = torch.bfloat16,
         quant_config: QuantizationConfig | None = None,
         prefix: str = "",
     ) -> None:
@@ -772,7 +763,6 @@ def __init__(
                     num_key_value_heads=num_key_value_heads,
                     head_dim=head_dim,
                     rms_norm_eps=rms_norm_eps,
-                    dtype=dtype,
                     quant_config=quant_config,
                     prefix=f"{prefix}.layers.{i}",
                 )
@@ -875,12 +865,12 @@ def _validate_supported_config(model_config: Any) -> None:
 
     def __init__(
         self,
-        od_config: object | None = None,
+        od_config: OmniDiffusionConfig,
         *,
         temporal_compression_factor: int | None = None,
     ) -> None:
         super().__init__()
-        model_config = getattr(od_config, "tf_model_config", None) if od_config else None
+        model_config = od_config.tf_model_config
         self._validate_supported_config(model_config)
         rope_scaling = _tf_config_get(model_config, "rope_scaling", {}) or {}
 
@@ -911,7 +901,7 @@ def __init__(
         )
         self.patch_latent_dim = (self.latent_patch_size**2) * self.latent_channel_size
 
-        dtype = getattr(od_config, "dtype", torch.bfloat16) if od_config else torch.bfloat16
+        dtype = od_config.dtype
         quant_config = getattr(od_config, "quantization_config", None) if od_config else None
 
         self.language_model = Cosmos3LanguageModel(
@@ -925,7 +915,6 @@ def __init__(
             rms_norm_eps=self.rms_norm_eps,
             rope_theta=self.rope_theta,
             mrope_section=self.mrope_section,
-            dtype=dtype,
             quant_config=quant_config,
             prefix="language_model",
         )
@@ -933,7 +922,7 @@ def __init__(
         # Video projection layers are small; not worth quantizing.
         self.proj_in = nn.Linear(self.patch_latent_dim, self.hidden_size)
         self.proj_out = nn.Linear(self.hidden_size, self.patch_latent_dim)
-        self.time_embedder = TimestepEmbedder(self.hidden_size, target_dtype=torch.bfloat16)
+        self.time_embedder = TimestepEmbedder(self.hidden_size, target_dtype=dtype)
 
         self.gen_layers = nn.ModuleList(
             [
@@ -945,7 +934,6 @@ def __init__(
                     num_key_value_heads=self.num_key_value_heads,
                     head_dim=self.head_dim,
                     rms_norm_eps=self.rms_norm_eps,
-                    dtype=dtype,
                     quant_config=quant_config,
                     prefix=f"gen_layers.{i}",
                 )
diff --git a/vllm_omni/inputs/data.py b/vllm_omni/inputs/data.py
index 99877b6b9c0..a2b6bf722d5 100644
--- a/vllm_omni/inputs/data.py
+++ b/vllm_omni/inputs/data.py
@@ -33,6 +33,8 @@ class OmniTextPrompt(TextPrompt):
     """
 
     negative_prompt: NotRequired[str]
+    # Using modalities field to differentiate between different tasks for the same pipeline
+    # for example Cosmos3OmniDiffusersPipeline handles t2i and t2v in the same pipeline.
     modalities: NotRequired[list[str]]
     prompt_embeds: NotRequired[torch.Tensor]
     negative_prompt_embeds: NotRequired[torch.Tensor]

From 85dfeb262a536aa2a98e23f17743877240e871cf Mon Sep 17 00:00:00 2001
From: Maciej Bala <mbala@nvidia.com>
Date: Mon, 25 May 2026 23:45:12 +0200
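Subject: [PATCH 34/41] Fixed Cosmos3 transformer test, added engine and IPC tests

Stubbed _get_ulysses_state in test_forward_returns_video_prediction and
added unit tests for _move_tensor_tree_to_cpu and the shared-memory tensor
packing helpers (_pack_value_if_large / _unpack_if_shm_handle).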

Signed-off-by: Maciej Bala <mbala@nvidia.com>
---
 .../cosmos3/test_cosmos3_transformer.py       |  10 +-
 tests/diffusion/test_diffusion_engine.py      |  77 +++++++++++
 tests/diffusion/test_diffusion_ipc.py         | 120 +++++++++++++++++-
 3 files changed, 203 insertions(+), 4 deletions(-)

diff --git a/tests/diffusion/models/cosmos3/test_cosmos3_transformer.py b/tests/diffusion/models/cosmos3/test_cosmos3_transformer.py
index c4a2721099b..730079c116a 100644
--- a/tests/diffusion/models/cosmos3/test_cosmos3_transformer.py
+++ b/tests/diffusion/models/cosmos3/test_cosmos3_transformer.py
@@ -101,10 +101,14 @@ def test_transformer_sharding_offload_and_patch_round_trip_contracts() -> None:
     torch.testing.assert_close(model.unpatchify(model.patchify(latents, t=1, h=3, w=5), t=1, h=3, w=5), latents)
 
 
-def test_forward_returns_video_prediction() -> None:
-    from vllm_omni.diffusion.models.cosmos3.transformer_cosmos3 import Cosmos3VFMTransformer
+def test_forward_returns_video_prediction(monkeypatch: pytest.MonkeyPatch) -> None:
+    from vllm_omni.diffusion.models.cosmos3 import transformer_cosmos3
+
+    monkeypatch.setattr(transformer_cosmos3, "_get_ulysses_state", lambda: (1, 0, None))
 
-    output = Cosmos3VFMTransformer(SimpleNamespace(tf_model_config=_tiny_cosmos3_config(), dtype=torch.float32))(
+    output = transformer_cosmos3.Cosmos3VFMTransformer(
+        SimpleNamespace(tf_model_config=_tiny_cosmos3_config(), dtype=torch.float32)
+    )(
         hidden_states=torch.zeros(1, 2, 1, 2, 2),
         timestep=torch.tensor([1.0]),
         text_ids=torch.tensor([[1, 2]], dtype=torch.long),
diff --git a/tests/diffusion/test_diffusion_engine.py b/tests/diffusion/test_diffusion_engine.py
index dca9a001ad8..5091a6f6e5a 100644
--- a/tests/diffusion/test_diffusion_engine.py
+++ b/tests/diffusion/test_diffusion_engine.py
@@ -10,6 +10,9 @@
 from typing import Any
 
 import pytest
+import torch
+
+from vllm_omni.diffusion.diffusion_engine import _move_tensor_tree_to_cpu
 
 
 @dataclass
@@ -63,6 +66,80 @@ def update_from_output(self, sched_output, runner_output):
         return [req.request_id for req in sched_output.scheduled_new_reqs]
 
 
+@pytest.mark.core_model
+@pytest.mark.diffusion
+@pytest.mark.cpu
+def test_move_tensor_tree_keeps_cpu_tensor_identity() -> None:
+    tensor = torch.arange(8, dtype=torch.float32)
+
+    moved = _move_tensor_tree_to_cpu(tensor)
+
+    assert moved is tensor
+
+
+@pytest.mark.core_model
+@pytest.mark.diffusion
+@pytest.mark.cpu
+def test_move_tensor_tree_preserves_nested_structure_without_mutating_input() -> None:
+    tensor = torch.arange(4, dtype=torch.float32)
+    nested_tensor = torch.arange(6, dtype=torch.float32).reshape(2, 3)
+    sentinel = object()
+    payload = {
+        "tensor": tensor,
+        "list": [nested_tensor, sentinel],
+        "tuple": ({"inner": tensor}, "metadata"),
+        "scalar": 3,
+    }
+
+    moved = _move_tensor_tree_to_cpu(payload)
+
+    assert moved is not payload
+    assert set(moved) == {"tensor", "list", "tuple", "scalar"}
+    assert moved["list"] is not payload["list"]
+    assert moved["tuple"] is not payload["tuple"]
+    assert moved["tuple"][0] is not payload["tuple"][0]
+    assert moved["tensor"] is tensor
+    assert moved["list"][0] is nested_tensor
+    assert moved["list"][1] is sentinel
+    assert moved["tuple"][0]["inner"] is tensor
+    assert moved["tuple"][1] == "metadata"
+    assert moved["scalar"] == 3
+    assert payload["list"][0] is nested_tensor
+    assert payload["list"][1] is sentinel
+    assert payload["tuple"][0]["inner"] is tensor
+    assert payload["tuple"][1] == "metadata"
+
+
+@pytest.mark.core_model
+@pytest.mark.diffusion
+@pytest.mark.cpu
+def test_move_tensor_tree_returns_non_tensor_values_unchanged() -> None:
+    value = object()
+
+    moved = _move_tensor_tree_to_cpu(value)
+
+    assert moved is value
+
+
+@pytest.mark.diffusion
+@pytest.mark.cuda
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA required")
+def test_move_tensor_tree_moves_nested_cuda_tensors_to_cpu() -> None:
+    tensor = torch.arange(8, dtype=torch.float32, device="cuda")
+    other = torch.arange(4, dtype=torch.int64, device="cuda")
+    payload = {"tensor": tensor, "items": [other, ("keep", tensor)]}
+
+    moved = _move_tensor_tree_to_cpu(payload)
+
+    assert moved["tensor"].device.type == "cpu"
+    assert moved["items"][0].device.type == "cpu"
+    assert moved["items"][1][1].device.type == "cpu"
+    torch.testing.assert_close(moved["tensor"], tensor.cpu())
+    torch.testing.assert_close(moved["items"][0], other.cpu())
+    torch.testing.assert_close(moved["items"][1][1], tensor.cpu())
+    assert moved["items"][1][0] == "keep"
+
+
 @pytest.mark.asyncio
 async def test_async_add_req_and_wait_for_response():
     from vllm_omni.diffusion.diffusion_engine import DiffusionEngine
diff --git a/tests/diffusion/test_diffusion_ipc.py b/tests/diffusion/test_diffusion_ipc.py
index 43e96b834f6..b7995e51601 100644
--- a/tests/diffusion/test_diffusion_ipc.py
+++ b/tests/diffusion/test_diffusion_ipc.py
@@ -1,10 +1,31 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+import contextlib
+
+import pytest
 import torch
 
 from vllm_omni.diffusion.data import DiffusionOutput
-from vllm_omni.diffusion.ipc import pack_diffusion_output_shm, unpack_diffusion_output_shm
+from vllm_omni.diffusion.ipc import (
+    _SHM_TENSOR_THRESHOLD,
+    _pack_value_if_large,
+    _unpack_if_shm_handle,
+    pack_diffusion_output_shm,
+    unpack_diffusion_output_shm,
+)
+
+pytestmark = [pytest.mark.core_model, pytest.mark.diffusion, pytest.mark.cpu]
+
+
+def _large_numel(dtype: torch.dtype) -> int:
+    return (_SHM_TENSOR_THRESHOLD // torch.empty((), dtype=dtype).element_size()) + 1
+
+
+def _cleanup_shm_handle(value: object) -> None:
+    if isinstance(value, dict) and value.get("__tensor_shm__"):
+        with contextlib.suppress(FileNotFoundError):
+            _unpack_if_shm_handle(value)
 
 
 def test_diffusion_output_dict_tensors_round_trip_through_shm() -> None:
@@ -23,3 +44,100 @@ def test_diffusion_output_dict_tensors_round_trip_through_shm() -> None:
     torch.testing.assert_close(output.output["image"], image)
     torch.testing.assert_close(output.output["video"], video)
     assert output.output["metadata"] == {"keep": "inline"}
+
+
+def test_pack_value_keeps_tensor_at_threshold_inline() -> None:
+    tensor = torch.arange(
+        _SHM_TENSOR_THRESHOLD // torch.empty((), dtype=torch.float32).element_size(),
+        dtype=torch.float32,
+    )
+
+    packed = _pack_value_if_large(tensor)
+
+    assert packed is tensor
+
+
+def test_pack_value_packs_large_tensor_and_round_trips() -> None:
+    tensor = torch.arange(_large_numel(torch.float32), dtype=torch.float32)
+    packed = _pack_value_if_large(tensor)
+
+    try:
+        assert isinstance(packed, dict)
+        assert packed["__tensor_shm__"] is True
+        assert packed["shape"] == [tensor.numel()]
+        assert packed["torch_dtype"] == "torch.float32"
+
+        unpacked = _unpack_if_shm_handle(packed)
+        assert isinstance(unpacked, torch.Tensor)
+        torch.testing.assert_close(unpacked, tensor)
+    finally:
+        _cleanup_shm_handle(packed)
+
+
+def test_pack_value_recurses_nested_dicts_without_mutating_other_values() -> None:
+    large = torch.arange(_large_numel(torch.float32), dtype=torch.float32)
+    small = torch.arange(8, dtype=torch.float32)
+    list_tensor = torch.arange(_large_numel(torch.float32), dtype=torch.float32)
+    payload = {
+        "media": {
+            "large": large,
+            "small": small,
+        },
+        "list_value": [list_tensor],
+        "metadata": {"prompt": "keep inline"},
+    }
+
+    packed = _pack_value_if_large(payload)
+
+    try:
+        assert packed is not payload
+        assert packed["media"] is not payload["media"]
+        assert packed["media"]["large"]["__tensor_shm__"] is True
+        assert packed["media"]["small"] is small
+        assert packed["list_value"] is payload["list_value"]
+        assert packed["list_value"][0] is list_tensor
+        assert packed["metadata"] == {"prompt": "keep inline"}
+
+        unpacked_large = _unpack_if_shm_handle(packed["media"]["large"])
+        assert isinstance(unpacked_large, torch.Tensor)
+        torch.testing.assert_close(unpacked_large, large)
+    finally:
+        handle = packed.get("media", {}).get("large") if isinstance(packed, dict) else None
+        _cleanup_shm_handle(handle)
+
+
+def test_pack_value_preserves_dtype_shape_and_values_for_bfloat16() -> None:
+    tensor = torch.arange(_large_numel(torch.bfloat16), dtype=torch.float32).to(torch.bfloat16).reshape(1, -1)
+    packed = _pack_value_if_large(tensor)
+
+    try:
+        assert isinstance(packed, dict)
+        assert packed["__tensor_shm__"] is True
+        assert packed["shape"] == list(tensor.shape)
+        assert packed["torch_dtype"] == "torch.bfloat16"
+        assert packed["numpy_dtype"] == "float32"
+
+        unpacked = _unpack_if_shm_handle(packed)
+        assert isinstance(unpacked, torch.Tensor)
+        assert unpacked.dtype == torch.bfloat16
+        torch.testing.assert_close(unpacked, tensor)
+    finally:
+        _cleanup_shm_handle(packed)
+
+
+def test_pack_value_packs_non_contiguous_large_tensor_values() -> None:
+    tensor = torch.arange(_large_numel(torch.float32) * 2, dtype=torch.float32).reshape(-1, 2)[:, 0]
+    assert not tensor.is_contiguous()
+
+    packed = _pack_value_if_large(tensor)
+
+    try:
+        assert isinstance(packed, dict)
+        assert packed["__tensor_shm__"] is True
+        assert packed["shape"] == list(tensor.shape)
+
+        unpacked = _unpack_if_shm_handle(packed)
+        assert isinstance(unpacked, torch.Tensor)
+        torch.testing.assert_close(unpacked, tensor)
+    finally:
+        _cleanup_shm_handle(packed)

From c6185a1324397b3729b1c67f4b59b8438d317957 Mon Sep 17 00:00:00 2001
From: Maciej Bala <mbala@nvidia.com>
Date: Tue, 26 May 2026 10:17:12 +0200
Subject: [PATCH 35/41] Fixed multi-GPU with deploy configs

Signed-off-by: Maciej Bala <mbala@nvidia.com>
---
 tests/test_config_factory.py     | 58 +++++++++++++++++++++++++++++
 vllm_omni/config/stage_config.py | 63 ++++++++++++++++++++++++++++++++
 2 files changed, 121 insertions(+)

diff --git a/tests/test_config_factory.py b/tests/test_config_factory.py
index 9ac3d859c1e..2691aadf51c 100644
--- a/tests/test_config_factory.py
+++ b/tests/test_config_factory.py
@@ -173,6 +173,64 @@ def test_to_omegaconf_omits_none_deploy_overrides_for_engine_args(self):
         for name in deploy_override_field_names() - {"devices"}:
             assert name not in engine_args
 
+    def test_to_omegaconf_diffusion_folds_flat_parallel_keys(self):
+        """Flat parallelism overrides on a diffusion stage must be folded into
+        ``engine_args.parallel_config`` and reflected in ``runtime.devices``.
+
+        Without this, OmniDiffusionConfig.from_kwargs filters out flat
+        ``ulysses_degree`` / ``ring_degree`` / etc. and the diffusion pipeline
+        launches single-GPU regardless of the CLI flag.
+        """
+        config = StageConfig(
+            stage_id=0,
+            model_stage="diffusion",
+            stage_type=StageType.DIFFUSION,
+            runtime_overrides={"ulysses_degree": 2},
+        )
+        omega_config = config.to_omegaconf()
+
+        assert "ulysses_degree" not in omega_config.engine_args
+        assert omega_config.engine_args.parallel_config.ulysses_degree == 2
+        assert omega_config.runtime.devices == "0,1"
+
+    def test_to_omegaconf_diffusion_default_world_size_one(self):
+        """Diffusion stages with no parallelism overrides default to a single GPU."""
+        config = StageConfig(
+            stage_id=0,
+            model_stage="diffusion",
+            stage_type=StageType.DIFFUSION,
+        )
+        omega_config = config.to_omegaconf()
+        assert omega_config.runtime.devices == "0"
+
+    def test_to_omegaconf_diffusion_preserves_explicit_devices(self):
+        """When the deploy YAML or CLI sets ``runtime.devices`` explicitly, the
+        derived world-size calculation must not overwrite it."""
+        config = StageConfig(
+            stage_id=0,
+            model_stage="diffusion",
+            stage_type=StageType.DIFFUSION,
+            yaml_runtime={"devices": "2,3"},
+            runtime_overrides={"ulysses_degree": 2},
+        )
+        omega_config = config.to_omegaconf()
+        assert omega_config.runtime.devices == "2,3"
+        assert omega_config.engine_args.parallel_config.ulysses_degree == 2
+
+    def test_to_omegaconf_diffusion_merges_yaml_parallel_config(self):
+        """CLI flat overrides win over a deploy YAML ``parallel_config`` dict."""
+        config = StageConfig(
+            stage_id=0,
+            model_stage="diffusion",
+            stage_type=StageType.DIFFUSION,
+            yaml_engine_args={"parallel_config": {"tensor_parallel_size": 1, "ulysses_degree": 1}},
+            runtime_overrides={"ulysses_degree": 4},
+        )
+        omega_config = config.to_omegaconf()
+        assert omega_config.engine_args.parallel_config.ulysses_degree == 4
+        assert omega_config.engine_args.parallel_config.tensor_parallel_size == 1
+        assert omega_config.runtime.devices == "0,1,2,3"
+
 
 class TestModelPipeline:
     """Tests for ModelPipeline class."""
diff --git a/vllm_omni/config/stage_config.py b/vllm_omni/config/stage_config.py
index c459ecabe73..651706eef90 100644
--- a/vllm_omni/config/stage_config.py
+++ b/vllm_omni/config/stage_config.py
@@ -7,6 +7,7 @@
 import dataclasses
 import re
 import warnings
+from collections.abc import Mapping
 from dataclasses import asdict, dataclass, field, fields
 from enum import Enum
 from pathlib import Path
@@ -41,6 +42,58 @@ def _warn_deprecated_kwargs(kwargs: dict[str, Any]) -> None:
 _STAGE_OVERRIDE_PATTERN = re.compile(r"^stage_(\d+)_(.+)$")
 
 
+def _diffusion_parallel_field_names() -> frozenset[str]:
+    """Names of the fields on ``DiffusionParallelConfig``.
+
+    Lazy-imported because ``vllm_omni.diffusion.data`` pulls in heavy
+    diffusion-only dependencies that we don't want loaded just to construct
+    LLM stage configs.
+    """
+    from vllm_omni.diffusion.data import DiffusionParallelConfig
+
+    return frozenset(f.name for f in fields(DiffusionParallelConfig))
+
+
+def _fold_flat_parallel_keys(engine_args: dict[str, Any]) -> dict[str, Any]:
+    """Fold flat parallelism keys in *engine_args* into a nested ``parallel_config``.
+
+    Diffusion stages express parallelism via ``OmniDiffusionConfig.parallel_config``
+    (a ``DiffusionParallelConfig``), not via top-level engine args. CLI flags
+    like ``--ulysses-degree`` land as flat keys in ``engine_args`` and would be
+    silently dropped by ``OmniDiffusionConfig.from_kwargs`` (which filters
+    kwargs to ``OmniDiffusionConfig``'s own field set), leaving
+    ``parallel_config`` at its default with ``world_size=1``.
+
+    Returns the merged parallel_config dict so callers can use it to derive
+    ``runtime.devices`` without instantiating ``DiffusionParallelConfig`` twice.
+    """
+    parallel_field_names = _diffusion_parallel_field_names()
+    parallel_dict: dict[str, Any] = {}
+
+    existing = engine_args.get("parallel_config")
+    if isinstance(existing, Mapping):
+        parallel_dict.update(dict(existing))
+    elif dataclasses.is_dataclass(existing) and not isinstance(existing, type):
+        parallel_dict.update(asdict(existing))
+
+    flat_keys = parallel_field_names & engine_args.keys()
+    for key in flat_keys:
+        value = engine_args.pop(key)
+        if value is not None:
+            parallel_dict[key] = value
+
+    if parallel_dict:
+        engine_args["parallel_config"] = parallel_dict
+    return parallel_dict
+
+
+def _diffusion_world_size(parallel_dict: dict[str, Any]) -> int:
+    """Build a ``DiffusionParallelConfig`` from *parallel_dict* and return ``world_size``."""
+    from vllm_omni.diffusion.data import DiffusionParallelConfig
+
+    return max(1, int(DiffusionParallelConfig.from_dict(parallel_dict).world_size))
+
+
 def build_stage_runtime_overrides(
     stage_id: int,
     cli_overrides: dict[str, Any],
@@ -951,6 +1004,16 @@ def to_omegaconf(self) -> Any:
         if self.runtime_overrides.get("devices") is not None:
             runtime["devices"] = self.runtime_overrides["devices"]
 
+        if StageType(self.stage_type) == StageType.DIFFUSION:
+            parallel_dict = _fold_flat_parallel_keys(engine_args)
+            # Mirror the legacy ``_create_default_diffusion_stage_cfg`` factory:
+            # when neither the deploy YAML nor CLI set ``runtime.devices``,
+            # derive it from ``parallel_config.world_size`` so the orchestrator
+            # spawns the right number of GPU workers.
+            if "devices" not in runtime:
+                world_size = _diffusion_world_size(parallel_dict)
+                runtime["devices"] = ",".join(str(i) for i in range(world_size))
+
         # Legacy compat: migrate runtime.max_batch_size → engine_args.max_num_seqs
         legacy_mbs = runtime.pop("max_batch_size", None)
         cli_mbs = self.runtime_overrides.get("max_batch_size")

From ed5e391f3630b64df303ce23f7c7788eb31438a7 Mon Sep 17 00:00:00 2001
From: Maciej Bala <mbala@nvidia.com>
Date: Thu, 28 May 2026 10:37:52 +0200
Subject: [PATCH 36/41] Linter fix

Signed-off-by: Maciej Bala <mbala@nvidia.com>
---
 tests/test_config_factory.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_config_factory.py b/tests/test_config_factory.py
index a8f6b7dbd63..bc32309d8aa 100644
--- a/tests/test_config_factory.py
+++ b/tests/test_config_factory.py
@@ -230,7 +230,7 @@ def test_to_omegaconf_diffusion_merges_yaml_parallel_config(self):
         assert omega_config.engine_args.parallel_config.ulysses_degree == 4
         assert omega_config.engine_args.parallel_config.tensor_parallel_size == 1
         assert omega_config.runtime.devices == "0,1,2,3"
-        
+
     def test_to_omegaconf_diffusion_parallel_overrides_replace_nested_values(self):
         config = StageConfig(
             stage_id=1,

From 5a4eb97198018f0c0825d4e8611db20ab9f99d18 Mon Sep 17 00:00:00 2001
From: Maciej Bala <mbala@nvidia.com>
Date: Thu, 28 May 2026 12:18:26 +0200
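Subject: [PATCH 37/41] Reverted c6185a1324397b3729b1c67f4b59b8438d317957

Reverts "Fixed multi-GPU with deploy configs" (the diffusion parallel-key
folding in stage_config.py and its tests) and additionally removes the
_loaded_weight_names attribute from the Cosmos3 pipeline.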

Signed-off-by: Maciej Bala <mbala@nvidia.com>
---
 tests/test_config_factory.py                  | 58 -----------------
 vllm_omni/config/stage_config.py              | 63 -------------------
 .../models/cosmos3/pipeline_cosmos3.py        |  2 -
 3 files changed, 123 deletions(-)

diff --git a/tests/test_config_factory.py b/tests/test_config_factory.py
index bc32309d8aa..0db1d4fbae8 100644
--- a/tests/test_config_factory.py
+++ b/tests/test_config_factory.py
@@ -173,64 +173,6 @@ def test_to_omegaconf_omits_none_deploy_overrides_for_engine_args(self):
         for name in deploy_override_field_names() - {"devices"}:
             assert name not in engine_args
 
-    def test_to_omegaconf_diffusion_folds_flat_parallel_keys(self):
-        """Flat parallelism overrides on a diffusion stage must be folded into
-        ``engine_args.parallel_config`` and reflected in ``runtime.devices``.
-
-        Without this, OmniDiffusionConfig.from_kwargs filters out flat
-        ``ulysses_degree`` / ``ring_degree`` / etc. and the diffusion pipeline
-        launches single-GPU regardless of the CLI flag.
-        """
-        config = StageConfig(
-            stage_id=0,
-            model_stage="diffusion",
-            stage_type=StageType.DIFFUSION,
-            runtime_overrides={"ulysses_degree": 2},
-        )
-        omega_config = config.to_omegaconf()
-
-        assert "ulysses_degree" not in omega_config.engine_args
-        assert omega_config.engine_args.parallel_config.ulysses_degree == 2
-        assert omega_config.runtime.devices == "0,1"
-
-    def test_to_omegaconf_diffusion_default_world_size_one(self):
-        """Diffusion stages with no parallelism overrides default to a single GPU."""
-        config = StageConfig(
-            stage_id=0,
-            model_stage="diffusion",
-            stage_type=StageType.DIFFUSION,
-        )
-        omega_config = config.to_omegaconf()
-        assert omega_config.runtime.devices == "0"
-
-    def test_to_omegaconf_diffusion_preserves_explicit_devices(self):
-        """When the deploy YAML or CLI sets ``runtime.devices`` explicitly, the
-        derived world-size calculation must not overwrite it."""
-        config = StageConfig(
-            stage_id=0,
-            model_stage="diffusion",
-            stage_type=StageType.DIFFUSION,
-            yaml_runtime={"devices": "2,3"},
-            runtime_overrides={"ulysses_degree": 2},
-        )
-        omega_config = config.to_omegaconf()
-        assert omega_config.runtime.devices == "2,3"
-        assert omega_config.engine_args.parallel_config.ulysses_degree == 2
-
-    def test_to_omegaconf_diffusion_merges_yaml_parallel_config(self):
-        """CLI flat overrides win over a deploy YAML ``parallel_config`` dict."""
-        config = StageConfig(
-            stage_id=0,
-            model_stage="diffusion",
-            stage_type=StageType.DIFFUSION,
-            yaml_engine_args={"parallel_config": {"tensor_parallel_size": 1, "ulysses_degree": 1}},
-            runtime_overrides={"ulysses_degree": 4},
-        )
-        omega_config = config.to_omegaconf()
-        assert omega_config.engine_args.parallel_config.ulysses_degree == 4
-        assert omega_config.engine_args.parallel_config.tensor_parallel_size == 1
-        assert omega_config.runtime.devices == "0,1,2,3"
-
     def test_to_omegaconf_diffusion_parallel_overrides_replace_nested_values(self):
         config = StageConfig(
             stage_id=1,
diff --git a/vllm_omni/config/stage_config.py b/vllm_omni/config/stage_config.py
index bfbc8f12559..f6a88546f82 100644
--- a/vllm_omni/config/stage_config.py
+++ b/vllm_omni/config/stage_config.py
@@ -7,7 +7,6 @@
 import dataclasses
 import re
 import warnings
-from collections.abc import Mapping
 from dataclasses import asdict, dataclass, field, fields
 from enum import Enum
 from pathlib import Path
@@ -42,58 +41,6 @@ def _warn_deprecated_kwargs(kwargs: dict[str, Any]) -> None:
 _STAGE_OVERRIDE_PATTERN = re.compile(r"^stage_(\d+)_(.+)$")
 
 
-def _diffusion_parallel_field_names() -> frozenset[str]:
-    """Names of the fields on ``DiffusionParallelConfig``.
-
-    Lazy-imported because ``vllm_omni.diffusion.data`` pulls in heavy
-    diffusion-only dependencies that we don't want loaded just to construct
-    LLM stage configs.
-    """
-    from vllm_omni.diffusion.data import DiffusionParallelConfig
-
-    return frozenset(f.name for f in fields(DiffusionParallelConfig))
-
-
-def _fold_flat_parallel_keys(engine_args: dict[str, Any]) -> dict[str, Any]:
-    """Fold flat parallelism keys in *engine_args* into a nested ``parallel_config``.
-
-    Diffusion stages express parallelism via ``OmniDiffusionConfig.parallel_config``
-    (a ``DiffusionParallelConfig``), not via top-level engine args. CLI flags
-    like ``--ulysses-degree`` land as flat keys in ``engine_args`` and would be
-    silently dropped by ``OmniDiffusionConfig.from_kwargs`` (which filters
-    kwargs to ``OmniDiffusionConfig``'s own field set), leaving
-    ``parallel_config`` at its default with ``world_size=1``.
-
-    Returns the merged parallel_config dict so callers can use it to derive
-    ``runtime.devices`` without instantiating ``DiffusionParallelConfig`` twice.
-    """
-    parallel_field_names = _diffusion_parallel_field_names()
-    parallel_dict: dict[str, Any] = {}
-
-    existing = engine_args.get("parallel_config")
-    if isinstance(existing, Mapping):
-        parallel_dict.update(dict(existing))
-    elif dataclasses.is_dataclass(existing) and not isinstance(existing, type):
-        parallel_dict.update(asdict(existing))
-
-    flat_keys = parallel_field_names & engine_args.keys()
-    for key in flat_keys:
-        value = engine_args.pop(key)
-        if value is not None:
-            parallel_dict[key] = value
-
-    if parallel_dict:
-        engine_args["parallel_config"] = parallel_dict
-    return parallel_dict
-
-
-def _diffusion_world_size(parallel_dict: dict[str, Any]) -> int:
-    """Build a ``DiffusionParallelConfig`` from *parallel_dict* and return ``world_size``."""
-    from vllm_omni.diffusion.data import DiffusionParallelConfig
-
-    return max(1, int(DiffusionParallelConfig.from_dict(parallel_dict).world_size))
-
-
 def build_stage_runtime_overrides(
     stage_id: int,
     cli_overrides: dict[str, Any],
@@ -1077,16 +1024,6 @@ def to_omegaconf(self) -> Any:
         if runtime_overrides.get("num_replicas") is not None:
             runtime["num_replicas"] = runtime_overrides["num_replicas"]
 
-        if StageType(self.stage_type) == StageType.DIFFUSION:
-            parallel_dict = _fold_flat_parallel_keys(engine_args)
-            # Mirror the legacy ``_create_default_diffusion_stage_cfg`` factory:
-            # when neither the deploy YAML nor CLI set ``runtime.devices``,
-            # derive it from ``parallel_config.world_size`` so the orchestrator
-            # spawns the right number of GPU workers.
-            if "devices" not in runtime:
-                world_size = _diffusion_world_size(parallel_dict)
-                runtime["devices"] = ",".join(str(i) for i in range(world_size))
-
         # Legacy compat: migrate runtime.max_batch_size → engine_args.max_num_seqs
         legacy_mbs = runtime.pop("max_batch_size", None)
         cli_mbs = runtime_overrides.get("max_batch_size")
diff --git a/vllm_omni/diffusion/models/cosmos3/pipeline_cosmos3.py b/vllm_omni/diffusion/models/cosmos3/pipeline_cosmos3.py
index 5921a535c01..934d4af8f37 100644
--- a/vllm_omni/diffusion/models/cosmos3/pipeline_cosmos3.py
+++ b/vllm_omni/diffusion/models/cosmos3/pipeline_cosmos3.py
@@ -308,7 +308,6 @@ def __init__(
 
         self._guidance_scale = None
         self._num_timesteps = None
-        self._loaded_weight_names: set[str] = set()
 
         self.setup_diffusion_pipeline_profiler(
             enable_diffusion_pipeline_profiler=self.od_config.enable_diffusion_pipeline_profiler
@@ -439,7 +438,6 @@ def _remapped_weights() -> Iterable[tuple[str, torch.Tensor]]:
         loaded = loader.load_weights(_remapped_weights())
         self.transformer.post_load_weights()
         self.transformer.eval()
-        self._loaded_weight_names = set(loaded)
         return loaded
 
     def predict_noise(self, **kwargs) -> torch.Tensor | tuple[torch.Tensor, ...]:

From d9a35abbeac54d1a798c2a99fc96eb582e6f24b2 Mon Sep 17 00:00:00 2001
From: Maciej Bala <mbala@nvidia.com>
Date: Thu, 28 May 2026 12:30:14 +0200
Subject: [PATCH 38/41] Fix CUDNN_ATTN for GQA

Signed-off-by: Maciej Bala <mbala@nvidia.com>
---
 vllm_omni/diffusion/attention/backends/cudnn_attn.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/vllm_omni/diffusion/attention/backends/cudnn_attn.py b/vllm_omni/diffusion/attention/backends/cudnn_attn.py
index f27fe18706f..44026c56910 100644
--- a/vllm_omni/diffusion/attention/backends/cudnn_attn.py
+++ b/vllm_omni/diffusion/attention/backends/cudnn_attn.py
@@ -51,6 +51,8 @@ def __init__(
     ) -> None:
         self.causal = causal
         self.softmax_scale = softmax_scale
+        self.num_heads = num_heads
+        self.num_kv_heads = num_kv_heads
 
     def forward_cuda(
         self,
@@ -84,6 +86,7 @@ def forward_cuda(
                     dropout_p=0.0,
                     is_causal=self.causal,
                     scale=self.softmax_scale,
+                    enable_gqa=self.num_kv_heads not in (None, self.num_heads),  # unset num_kv_heads is not GQA
                 )
         except RuntimeError as e:
             if "No available kernel" not in str(e):

From 7c91b3cde4afce921d39269914192480f8bc7202 Mon Sep 17 00:00:00 2001
From: Maciej Bala <mbala@nvidia.com>
Date: Thu, 28 May 2026 14:13:41 +0200
Subject: [PATCH 39/41] Answered review comments

Signed-off-by: Maciej Bala <mbala@nvidia.com>
---
 .../models/cosmos3/pipeline_cosmos3.py        | 108 ++++++------------
 .../models/cosmos3/transformer_cosmos3.py     |  53 ++++-----
 2 files changed, 54 insertions(+), 107 deletions(-)

diff --git a/vllm_omni/diffusion/models/cosmos3/pipeline_cosmos3.py b/vllm_omni/diffusion/models/cosmos3/pipeline_cosmos3.py
index 934d4af8f37..9a20d4fa80d 100644
--- a/vllm_omni/diffusion/models/cosmos3/pipeline_cosmos3.py
+++ b/vllm_omni/diffusion/models/cosmos3/pipeline_cosmos3.py
@@ -48,17 +48,6 @@
 
 logger = init_logger(__name__)
 
-COSMOS3_DEFAULT_NEGATIVE_PROMPT = ""
-COSMOS3_VIDEO_NEGATIVE_PROMPT = (
-    "The video captures a series of frames showing ugly scenes, static with no motion, motion blur, "
-    "over-saturation, shaky footage, low resolution, grainy texture, pixelated images, poorly lit areas, "
-    "underexposed and overexposed scenes, poor color balance, washed out colors, choppy sequences, "
-    "jerky movements, low frame rate, artifacting, color banding, unnatural transitions, outdated special effects, "
-    "fake elements, unconvincing visuals, poorly edited content, jump cuts, visual noise, and flickering. "
-    "Overall, the video is of poor quality."
-)
-COSMOS3_T2V_NEGATIVE_PROMPT = COSMOS3_VIDEO_NEGATIVE_PROMPT
-COSMOS3_I2V_NEGATIVE_PROMPT = COSMOS3_VIDEO_NEGATIVE_PROMPT
 COSMOS3_DURATION_TEMPLATE = "The video is {duration:.1f} seconds long and is of {fps:.0f} FPS."
 COSMOS3_RESOLUTION_TEMPLATE = "This video is of {height}x{width} resolution."
 COSMOS3_IMAGE_RESOLUTION_TEMPLATE = "This image is of {height}x{width} resolution."
@@ -544,20 +533,21 @@ def _apply_metadata_templates(
         resolution_template: str | None = COSMOS3_RESOLUTION_TEMPLATE,
         force_duration_template: bool = False,
     ) -> str:
-        """Append duration and resolution metadata to a prompt.
-
-        Strips trailing dot and appends ``". <template>"`` for each.
         """
+        Join the prompt and its duration/resolution sentences with ". ", ending in a single dot.
+        """
+        parts: list[str] = []
+        head = prompt.strip().rstrip(".").rstrip()  # trim whitespace first: "foo. " must not become "foo.."
+        if head:
+            parts.append(head)
         if duration_template is not None and (num_frames > 1 or force_duration_template):
             duration = num_frames / frame_rate
-            dur_text = duration_template.format(duration=duration, fps=frame_rate)
-            prompt = prompt.rstrip(".") + ". " + dur_text
-
+            parts.append(duration_template.format(duration=duration, fps=frame_rate).rstrip("."))
         if resolution_template is not None:
-            res_text = resolution_template.format(height=height, width=width)
-            prompt = prompt.rstrip(".") + ". " + res_text
-
-        return prompt
+            parts.append(resolution_template.format(height=height, width=width).rstrip("."))
+        if not parts:
+            return ""
+        return ". ".join(parts) + "."
 
     # -- Tokenization --------------------------------------------------------
 
@@ -748,54 +738,26 @@ def _format_and_tokenize_prompts(
         if _is_rank_zero():
             logger.info("Final prompt: '%s'", prompt)
 
-        # Negative prompt metadata: "none" | "same" | "inverse".
-        # "same"    = same templates as positive (CFG guides caption only).
-        # "inverse" = inverted templates ("not {duration}...", "not {height}x{width}...").
-        # "none"    = no metadata on negative prompt.
-        # negative_prompt_keep_metadata=True upgrades "none" to "same" (compat).
-        # T2I uses a plain neg prompt by default.
-        neg_meta_default = "none" if is_t2i else "same"
-        neg_meta_mode = self._get_sp_param(sp, "negative_metadata_mode", "none")
-        keep_metadata = bool(self._get_sp_param(sp, "negative_prompt_keep_metadata", not is_t2i))
-        if keep_metadata and neg_meta_mode == "none":
-            neg_meta_mode = neg_meta_default
-
-        if neg_meta_mode == "same":
-            negative_prompt = (
-                self._apply_metadata_templates(
-                    negative_prompt,
-                    num_frames,
-                    frame_rate,
-                    height,
-                    width,
-                    duration_template=dur_tmpl,
-                    resolution_template=res_tmpl,
-                )
-                .lstrip(".")
-                .strip()
-            )
-        elif neg_meta_mode == "inverse":
-            inv_dur = COSMOS3_INVERSE_DURATION_TEMPLATE if dur_tmpl else None
-            if res_tmpl is None:
-                inv_res = None
-            elif is_t2i:
-                inv_res = COSMOS3_INVERSE_IMAGE_RESOLUTION_TEMPLATE
-            else:
-                inv_res = COSMOS3_INVERSE_RESOLUTION_TEMPLATE
-            negative_prompt = (
-                self._apply_metadata_templates(
-                    negative_prompt,
-                    num_frames,
-                    frame_rate,
-                    height,
-                    width,
-                    duration_template=inv_dur,
-                    resolution_template=inv_res,
-                    force_duration_template=True,
-                )
-                .lstrip(".")
-                .strip()
-            )
+        # Negative prompt: inverse templates ("not {duration}...", "not {height}x{width}...").
+        # Applied whenever the matching positive template is enabled; an empty
+        # negative_prompt yields output that starts with the template, not a dot.
+        inv_dur = COSMOS3_INVERSE_DURATION_TEMPLATE if dur_tmpl else None
+        if res_tmpl is None:
+            inv_res = None
+        elif is_t2i:
+            inv_res = COSMOS3_INVERSE_IMAGE_RESOLUTION_TEMPLATE
+        else:
+            inv_res = COSMOS3_INVERSE_RESOLUTION_TEMPLATE
+        negative_prompt = self._apply_metadata_templates(
+            negative_prompt,
+            num_frames,
+            frame_rate,
+            height,
+            width,
+            duration_template=inv_dur,
+            resolution_template=inv_res,
+            force_duration_template=True,
+        )
 
         default_sys_prompt = COSMOS3_T2I_SYSTEM_PROMPT if is_t2i else COSMOS3_SYSTEM_PROMPT
         sys_prompt = self._get_sp_param(sp, "system_prompt", default_sys_prompt) or default_sys_prompt
@@ -1059,14 +1021,8 @@ def forward(
 
         sp = req.sampling_params
         is_t2i = self._is_t2i_request(req)
-        is_i2v = image_tensor is not None and not is_t2i
         if negative_prompt is None:
-            if is_t2i:
-                negative_prompt = COSMOS3_DEFAULT_NEGATIVE_PROMPT
-            elif is_i2v:
-                negative_prompt = COSMOS3_I2V_NEGATIVE_PROMPT
-            else:
-                negative_prompt = COSMOS3_T2V_NEGATIVE_PROMPT
+            negative_prompt = ""
 
         # T2I and T2V share the same model + forward path; only defaults
         # differ:
diff --git a/vllm_omni/diffusion/models/cosmos3/transformer_cosmos3.py b/vllm_omni/diffusion/models/cosmos3/transformer_cosmos3.py
index 8406edb7b96..9dbaaad4b57 100644
--- a/vllm_omni/diffusion/models/cosmos3/transformer_cosmos3.py
+++ b/vllm_omni/diffusion/models/cosmos3/transformer_cosmos3.py
@@ -32,6 +32,7 @@
 from vllm_omni.diffusion.attention.layer import Attention as FrameworkAttention
 from vllm_omni.diffusion.data import OmniDiffusionConfig
 from vllm_omni.diffusion.distributed.sp_plan import SequenceParallelInput, SequenceParallelOutput
+from vllm_omni.diffusion.forward_context import get_forward_context, is_forward_context_available
 from vllm_omni.diffusion.layers.norm import RMSNorm
 
 logger = init_logger(__name__)
@@ -60,17 +61,10 @@ def _is_sp_active() -> bool:
     Follows the Bagel pattern: read ``forward_context.sp_active`` which returns
     True when ``sequence_parallel_size > 1`` even without ``_sp_plan`` hooks.
     """
-    try:
-        from vllm_omni.diffusion.forward_context import (
-            get_forward_context,
-            is_forward_context_available,
-        )
 
-        if not is_forward_context_available():
-            return False
-        return get_forward_context().sp_active
-    except Exception:
+    if not is_forward_context_available():
         return False
+    return get_forward_context().sp_active
 
 
 def _tf_config_get(config: Any, key: str, default: Any) -> Any:
@@ -109,7 +103,6 @@ def compute_mrope_position_ids_vision(
     temporal_compression_factor: int = 4,
     base_temporal_compression_factor: int | None = None,
     enable_fps_modulation: bool = True,
-    start_frame_offset: int = 0,
 ) -> tuple[torch.Tensor, int | float]:
     """Generate 3D mRoPE position IDs for vision tokens.
 
@@ -127,17 +120,10 @@ def compute_mrope_position_ids_vision(
         )
         base_tps = base_fps / effective_base_tcf
         frame_indices = torch.arange(grid_t, dtype=torch.float32)
-        t_index = (
-            ((frame_indices + start_frame_offset) / tps * base_tps + temporal_offset)
-            .view(-1, 1)
-            .expand(-1, grid_h * grid_w)
-            .flatten()
-        )
+        t_index = (frame_indices / tps * base_tps + temporal_offset).view(-1, 1).expand(-1, grid_h * grid_w).flatten()
     else:
-        t_index = (
-            torch.arange(grid_t, dtype=torch.long).view(-1, 1).expand(-1, grid_h * grid_w).flatten()
-            + int(temporal_offset)
-            + start_frame_offset
+        t_index = torch.arange(grid_t, dtype=torch.long).view(-1, 1).expand(-1, grid_h * grid_w).flatten() + int(
+            temporal_offset
         )
 
     h_index = torch.arange(grid_h, dtype=torch.long).view(1, -1, 1).expand(grid_t, -1, grid_w).flatten()
@@ -769,30 +755,31 @@ def __init__(
                 for i in range(num_hidden_layers)
             ]
         )
+        # TODO: Not used right now, will be used in the future for prompt upsampler.
         self.norm = RMSNorm(hidden_size, eps=rms_norm_eps)
 
     def forward(
         self,
         text_ids: torch.Tensor,
-        text_mask: torch.Tensor,
         freqs: tuple[torch.Tensor, torch.Tensor],
     ) -> list[tuple[torch.Tensor, torch.Tensor]]:
         """
         Args:
             text_ids: [B, S] token IDs
-            text_mask: [B, S] float mask (1=real, 0=pad)
             freqs: (cos, sin) each [B, S, 1, D]
 
         Returns:
             List of (K, V) per layer, each [B, S, H_kv, D].
+
+        No padding mask is applied: with right-padding + causal self-attention,
+        real query positions only attend to real keys, and the caller trims pad
+        K/V via ``max_real_len`` before the GEN cross-attention sees them.
         """
         hidden = self.embed_tokens(text_ids)
-        mask_3d = text_mask.unsqueeze(-1)  # [B, S, 1]
 
         cached_kv: list[tuple[torch.Tensor, torch.Tensor]] = []
         for layer in self.layers:
-            hidden = hidden * mask_3d
-            hidden, k, v = layer(hidden, freqs, text_mask=None)
+            hidden, k, v = layer(hidden, freqs)
             cached_kv.append((k, v))
 
         return cached_kv
@@ -1064,7 +1051,6 @@ def reset_cache(self) -> None:
     def _validate_gen_sequence_parallel(
         *,
         s_gen: int,
-        s_video: int,
         ulysses_size: int,
     ) -> None:
         if ulysses_size <= 1 or s_gen % ulysses_size == 0:
@@ -1074,7 +1060,7 @@ def _validate_gen_sequence_parallel(
             "Adjust the spatial resolution so that t * ceil(h/patch) * ceil(w/patch) is a multiple of ulysses_degree."
         )
         raise ValueError(
-            f"GEN sequence length ({s_gen} video tokens {s_video}) must be divisible by "
+            f"GEN sequence length ({s_gen} video tokens) must be divisible by "
             f"ulysses_degree ({ulysses_size}). {adjust_detail}"
         )
 
@@ -1109,14 +1095,20 @@ def forward(
         """
         t, h, w = video_shape
         hp, wp, _, _ = self._pad_to_patch_size(h, w)
-        max_real_len = int(text_mask.sum(dim=1).max().item())
+        text_lengths = text_mask.sum(dim=1)
+        min_real_len = int(text_lengths.min().item())
+        max_real_len = int(text_lengths.max().item())
+        if min_real_len != max_real_len:
+            raise ValueError(
+                f"Cosmos3 requires identical real text lengths within a batch "
+                f"(got min={min_real_len}, max={max_real_len})."
+            )
 
         # Query Ulysses state at runtime
         ulysses_size, _, _ = _get_ulysses_state()
 
         # Patchify latents and project to hidden space
         hidden_video = self.proj_in(self.patchify(hidden_states, t, h, w))
-        s_video = hidden_video.shape[1]
 
         # Timestep embedding (fp32 for precision).
         # For I2V: only add to noisy tokens, not conditioned ones.
@@ -1153,7 +1145,7 @@ def forward(
                     hidden_states.device,
                     hidden_states.dtype,
                 )
-                cached_kv_full = self.language_model(text_ids, text_mask, freqs_und)
+                cached_kv_full = self.language_model(text_ids, freqs_und)
                 self.cached_freqs_gen = freqs_gen
 
                 # Trim to real text length (remove padding).  K/V stay replicated;
@@ -1167,7 +1159,6 @@ def forward(
                 raise RuntimeError("Cosmos3 GEN cache was not initialized before running GEN layers.")
             self._validate_gen_sequence_parallel(
                 s_gen=hidden_gen.shape[1],
-                s_video=s_video,
                 ulysses_size=ulysses_size,
             )
             freqs_cos, freqs_sin = self.cached_freqs_gen

From b78f8813e8e80f4bff5974eb12614942faa41955 Mon Sep 17 00:00:00 2001
From: Maciej Bala <mbala@nvidia.com>
Date: Thu, 28 May 2026 17:58:43 +0200
Subject: [PATCH 40/41] Add Cosmos3 sound generation

---
 docs/models/supported_models.md               |   2 +-
 tests/diffusion/models/cosmos3/conftest.py    |  13 +-
 .../models/cosmos3/test_cosmos3_pipeline.py   |  71 ++-
 .../cosmos3/test_cosmos3_sound_tokenizer.py   | 226 ++++++++
 .../cosmos3/test_cosmos3_transformer.py       | 105 +++-
 .../openai_api/test_video_server.py           |   9 +
 .../cosmos3/audio_tokenizer/__init__.py       |   6 +
 .../models/cosmos3/audio_tokenizer/avae.py    | 323 +++++++++++
 .../models/cosmos3/pipeline_cosmos3.py        | 293 +++++++++-
 .../models/cosmos3/sound_tokenizer.py         | 537 ++++++++++++++++++
 .../models/cosmos3/transformer_cosmos3.py     | 195 ++++++-
 vllm_omni/entrypoints/openai/api_server.py    |   4 +
 .../entrypoints/openai/protocol/videos.py     |   9 +
 vllm_omni/entrypoints/openai/serving_video.py |   4 +
 14 files changed, 1766 insertions(+), 31 deletions(-)
 create mode 100644 tests/diffusion/models/cosmos3/test_cosmos3_sound_tokenizer.py
 create mode 100644 vllm_omni/diffusion/models/cosmos3/audio_tokenizer/__init__.py
 create mode 100644 vllm_omni/diffusion/models/cosmos3/audio_tokenizer/avae.py
 create mode 100644 vllm_omni/diffusion/models/cosmos3/sound_tokenizer.py

diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md
index cdfa7ead751..9101265a866 100644
--- a/docs/models/supported_models.md
+++ b/docs/models/supported_models.md
@@ -33,7 +33,7 @@ th {
 | `ZImagePipeline` | Z-Image | `Tongyi-MAI/Z-Image-Turbo` | ✅︎ | ✅︎ | ✅︎ | ✅︎ |
 | `WanPipeline` | Wan2.1-T2V, Wan2.2-T2V, Wan2.2-TI2V | `Wan-AI/Wan2.1-T2V-1.3B-Diffusers`, `Wan-AI/Wan2.1-T2V-14B-Diffusers`, `Wan-AI/Wan2.2-T2V-A14B-Diffusers`, `Wan-AI/Wan2.2-TI2V-5B-Diffusers` | ✅︎ | ✅︎ | ✅︎ | ✅︎ |
 | `WanImageToVideoPipeline` | Wan2.2-I2V | `Wan-AI/Wan2.2-I2V-A14B-Diffusers` | ✅︎ | ✅︎ | ✅︎ | ✅︎ |
-| `Cosmos3OmniDiffusersPipeline` | Cosmos3 T2I, T2V, I2V | `nvidia/Cosmos3-Nano` | ✅︎ | | | |
+| `Cosmos3OmniDiffusersPipeline` | Cosmos3 T2I, T2V, I2V, T2V with sound | `nvidia/Cosmos3-Nano` | ✅︎ | | | |
 | `WanSpeechToVideoPipeline` | Wan2.2-S2V | `Wan-AI/Wan2.2-S2V-14B` | ✅︎ | ✅︎ | ✅︎ | ✅︎ |
 | `Wan22VACEPipeline` | Wan2.1-VACE | `Wan-AI/Wan2.1-VACE-1.3B-diffusers`, `Wan-AI/Wan2.1-VACE-14B-diffusers` | ✅︎ | ✅︎ | ✅︎ | ✅︎ |
 | `LTX2Pipeline` | LTX-2-T2V | `Lightricks/LTX-2` | ✅︎ | ✅︎ | | |
diff --git a/tests/diffusion/models/cosmos3/conftest.py b/tests/diffusion/models/cosmos3/conftest.py
index aa3bba1acd5..7075065447c 100644
--- a/tests/diffusion/models/cosmos3/conftest.py
+++ b/tests/diffusion/models/cosmos3/conftest.py
@@ -75,9 +75,13 @@ def __init__(
         self,
         *,
         latent_channel_size: int = 2,
+        sound_gen: bool = False,
+        sound_dim: int = 3,
     ) -> None:
         super().__init__()
         self.latent_channel_size = latent_channel_size
+        self.sound_gen = sound_gen
+        self.sound_dim = sound_dim
         self.cached_kv: Any | None = None
         self.cached_freqs_gen: Any | None = None
         self.calls: list[dict[str, Any]] = []
@@ -96,8 +100,9 @@ def forward(
         text_ids: torch.Tensor,
         text_mask: torch.Tensor,
         **kwargs: Any,
-    ) -> torch.Tensor:
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         token = int(text_ids.reshape(-1)[0].item()) if text_ids.numel() else 0
+        sound_latents = kwargs.get("sound_latents")
         self.calls.append(
             {
                 "token": token,
@@ -111,7 +116,10 @@ def forward(
             marker = torch.tensor([token], dtype=torch.float32)
             self.cached_kv = [(marker, marker + 100)]
             self.cached_freqs_gen = (marker + 200, marker + 300)
-        return torch.full_like(hidden_states, float(token))
+        outputs: list[torch.Tensor] = [torch.full_like(hidden_states, float(token))]
+        if sound_latents is not None:
+            outputs.append(torch.full_like(sound_latents, float(token + 10)))
+        return outputs[0] if len(outputs) == 1 else tuple(outputs)
 
 
 def passthrough_progress_bar(iterable):
@@ -152,6 +160,7 @@ def _make():
         pipeline._guidance_scale = None
         pipeline._num_timesteps = None
         pipeline.progress_bar = passthrough_progress_bar
+        pipeline._sound_tokenizer = None
         return pipeline
 
     return _make
diff --git a/tests/diffusion/models/cosmos3/test_cosmos3_pipeline.py b/tests/diffusion/models/cosmos3/test_cosmos3_pipeline.py
index a936fabcee5..b4471973b7d 100644
--- a/tests/diffusion/models/cosmos3/test_cosmos3_pipeline.py
+++ b/tests/diffusion/models/cosmos3/test_cosmos3_pipeline.py
@@ -62,7 +62,7 @@ def test_preprocess_i2v_image_input() -> None:
     assert tuple(result.prompts[0]["additional_information"]["preprocessed_image"].shape[-2:]) == (672, 1344)
 
 
-def test_postprocess_handles_image_video_and_validation() -> None:
+def test_postprocess_handles_image_video_audio_and_validation() -> None:
     from vllm_omni.diffusion.models.cosmos3.pipeline_cosmos3 import get_cosmos3_post_process_func
 
     func = get_cosmos3_post_process_func(SimpleNamespace())
@@ -71,6 +71,13 @@ def test_postprocess_handles_image_video_and_validation() -> None:
     assert func(video, output_type="latent") is video
     assert func({"image": video})[0].size == (4, 4)
     assert "video" in func({"video": video})
+    assert (
+        func(
+            {"video": video, "audio": torch.ones(1, 2, 16), "audio_sample_rate": 48000},
+            sampling_params=SimpleNamespace(extra_args={"resolved_frame_rate": 12}),
+        )["audio_sample_rate"]
+        == 48000
+    )
 
     with pytest.raises(ValueError, match="text-to-image postprocess expects"):
         func({"image": torch.zeros(1, 3, 2, 4, 4)})
@@ -120,7 +127,7 @@ def test_prompt_formatting_and_checkpoint_key_remap(make_cosmos3_pipeline) -> No
     assert {key: Cosmos3OmniDiffusersPipeline._remap_ckpt_key(key) for key in remaps} == remaps
 
 
-def test_prepare_latents_for_video_and_image(make_cosmos3_pipeline) -> None:
+def test_prepare_latents_for_video_image_and_sound(make_cosmos3_pipeline) -> None:
     pipeline = make_cosmos3_pipeline()
     latents = pipeline._prepare_latents(16, 24, 5, torch.Generator(device="cpu").manual_seed(0))
     assert latents.shape == (1, 2, 2, 2, 3)
@@ -133,8 +140,24 @@ def test_prepare_latents_for_video_and_image(make_cosmos3_pipeline) -> None:
     assert velocity_mask.tolist() == [[[[[0.0]], [[1.0]]]]]
     assert image_latent.shape == (1, 2, 1, 2, 3)
 
+    pipeline.transformer = pipeline.transformer.__class__(latent_channel_size=2, sound_gen=True, sound_dim=3)
+    pipeline._sound_tokenizer = SimpleNamespace(
+        sample_rate=10,
+        latent_ch=3,
+        hop_size=4,
+        decode=lambda x: torch.ones(x.shape[0], 2, 24),
+    )
+    assert pipeline._resolve_sound_target_samples(SimpleNamespace(extra_args={"sound_duration": 2.0}), 9, 3.0) == (
+        20,
+        2.0,
+        10,
+    )
+    sound_latents, latent_frames = pipeline._prepare_sound_latents(21, torch.Generator(device="cpu").manual_seed(0))
+    assert (sound_latents.shape, latent_frames) == (torch.Size([1, 3, 6]), 6)
+    assert pipeline._decode_sound_latents(torch.zeros(1, 3, 6), target_audio_samples=21).shape == (1, 2, 21)
 
-def test_diffuse_covers_cfg_and_i2v_steps(make_cosmos3_pipeline) -> None:
+
+def test_diffuse_covers_cfg_i2v_and_sound_steps(make_cosmos3_pipeline) -> None:
     pipeline = make_cosmos3_pipeline()
     latents = torch.zeros(1, 2, 1, 1, 1)
 
@@ -166,6 +189,21 @@ def test_diffuse_covers_cfg_and_i2v_steps(make_cosmos3_pipeline) -> None:
     )
     torch.testing.assert_close(i2v[:, :, 0:1], torch.full((1, 2, 1, 1, 1), 7.0))
 
+    pipeline.transformer = pipeline.transformer.__class__(latent_channel_size=2, sound_gen=True, sound_dim=3)
+    video_result, sound_result = pipeline.diffuse(
+        latents=latents,
+        sound_latents=torch.zeros(1, 3, 4),
+        timesteps=torch.tensor([7, 3]),
+        cond_ids=_ids(2),
+        cond_mask=_mask(),
+        uncond_ids=_ids(1),
+        uncond_mask=_mask(),
+        guidance_scale=1.0,
+        shared_kwargs={"video_shape": (1, 1, 1), "fps": 24.0},
+    )
+    torch.testing.assert_close(video_result, torch.full_like(latents, 4.0))
+    torch.testing.assert_close(sound_result, torch.full((), 24.0).expand_as(sound_result))
+
 
 class TestForwardRouting:
     def _install_forward_stubs(self, pipeline):
@@ -189,7 +227,10 @@ def fake_prepare(height, width, num_frames, generator):
 
         def fake_diffuse(**kwargs):
             captured["diffuse_calls"].append(kwargs)
-            return kwargs["latents"] + len(captured["diffuse_calls"])
+            outputs = [kwargs["latents"] + len(captured["diffuse_calls"])]
+            if kwargs.get("sound_latents") is not None:
+                outputs.append(kwargs["sound_latents"] + 2.0)
+            return outputs[0] if len(outputs) == 1 else tuple(outputs)
 
         pipeline._format_and_tokenize_prompts = fake_format
         pipeline._prepare_latents = fake_prepare
@@ -237,7 +278,7 @@ def test_forward_defaults_and_mode_selection(
         assert captured["flow_shifts"] == expected["flow"]
         assert captured["scheduler_steps"] == expected["steps"]
 
-    def test_forward_i2v_route(self, make_cosmos3_pipeline) -> None:
+    def test_forward_i2v_and_sound_routes(self, make_cosmos3_pipeline) -> None:
         pipeline = make_cosmos3_pipeline()
         captured = self._install_forward_stubs(pipeline)
         image_tensor = torch.zeros(1, 3, 16, 16)
@@ -262,11 +303,30 @@ def test_forward_i2v_route(self, make_cosmos3_pipeline) -> None:
         )
         assert captured["diffuse_calls"][-1]["shared_kwargs"]["noisy_frame_mask"] is velocity_mask
 
+        pipeline.transformer = pipeline.transformer.__class__(latent_channel_size=2, sound_gen=True, sound_dim=3)
+        sound_latents = torch.zeros(1, 3, 4)
+        pipeline._resolve_sound_target_samples = lambda *args: (20, 2.0, 10)
+        pipeline._prepare_sound_latents = lambda *args: (sound_latents, 4)
+        pipeline._decode_sound_latents = lambda *args: torch.ones(1, 2, 20)
+        output = pipeline.forward(
+            SimpleNamespace(
+                prompts=[{"prompt": "A robot", "modalities": ["video"], "generate_sound": True}],
+                sampling_params=make_sampling_params(num_frames=9, frame_rate=3.0),
+            )
+        )
+        assert captured["diffuse_calls"][-1]["sound_latents"] is sound_latents
+        assert output.output["audio_sample_rate"] == 10
+
     @pytest.mark.parametrize(
         ("prompt", "sampling_params", "message"),
         [
             (["one", "two"], make_sampling_params(), "single prompt"),
             ([{"prompt": "one", "modalities": ["image", "video"]}], make_sampling_params(), "both image and video"),
+            (
+                [{"prompt": "x", "modalities": ["image"], "generate_sound": True}],
+                make_sampling_params(),
+                "only for video",
+            ),
         ],
     )
     def test_forward_rejects_invalid_public_requests(
@@ -277,6 +337,7 @@ def test_forward_rejects_invalid_public_requests(
         message,
     ) -> None:
         pipeline = make_cosmos3_pipeline()
+        pipeline.transformer = pipeline.transformer.__class__(latent_channel_size=2, sound_gen=True, sound_dim=3)
 
         with pytest.raises(ValueError, match=message):
             pipeline.forward(SimpleNamespace(prompts=prompt, sampling_params=sampling_params))
diff --git a/tests/diffusion/models/cosmos3/test_cosmos3_sound_tokenizer.py b/tests/diffusion/models/cosmos3/test_cosmos3_sound_tokenizer.py
new file mode 100644
index 00000000000..47664c59e77
--- /dev/null
+++ b/tests/diffusion/models/cosmos3/test_cosmos3_sound_tokenizer.py
@@ -0,0 +1,226 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+from types import SimpleNamespace
+
+import pytest
+import torch
+
+pytestmark = [pytest.mark.core_model, pytest.mark.cpu, pytest.mark.diffusion]
+
+DIFFUSERS_SOUND_TOKENIZER_CHECKPOINT_NAME = "diffusion_pytorch_model.safetensors"
+
+
+class _FakeAVAEAudioTokenizer:
+    def __init__(self, **kwargs) -> None:
+        self.kwargs = kwargs
+        self.sample_rate = int(kwargs["sample_rate"])
+        self.audio_channels = int(kwargs["audio_channels"])
+        self.latent_ch = int(kwargs["io_channels"])
+        self.temporal_compression_factor = int(kwargs["hop_size"])
+
+    def get_latent_num_samples(self, num_audio_samples: int) -> int:
+        return int(num_audio_samples) // self.temporal_compression_factor
+
+    def get_audio_num_samples(self, num_latent_samples: int) -> int:
+        return int(num_latent_samples) * self.temporal_compression_factor
+
+    def decode(self, latents: torch.Tensor) -> torch.Tensor:
+        return torch.zeros(latents.shape[0], self.audio_channels, 8)
+
+
+def _write_component(root: Path, config: dict | None = None, checkpoint_name: str | None = None) -> Path:
+    tokenizer_dir = root / "sound_tokenizer"
+    tokenizer_dir.mkdir(parents=True)
+    if checkpoint_name:
+        (tokenizer_dir / checkpoint_name).write_bytes(b"stub")
+    (tokenizer_dir / "config.json").write_text(json.dumps(config or {}), encoding="utf-8")
+    return tokenizer_dir
+
+
+def _patch_fake_avae(monkeypatch: pytest.MonkeyPatch, created: dict) -> None:
+    from vllm_omni.diffusion.models.cosmos3 import sound_tokenizer
+
+    class FakeAVAE(_FakeAVAEAudioTokenizer):
+        def __init__(self, **kwargs) -> None:
+            created.update(kwargs)
+            super().__init__(**kwargs)
+
+    monkeypatch.setattr(sound_tokenizer, "Cosmos3AVAEAudioTokenizer", FakeAVAE)
+    monkeypatch.setattr(sound_tokenizer, "get_local_device", lambda: torch.device("cpu"))
+
+
+def test_from_config_loads_local_diffusers_component(tmp_path, monkeypatch: pytest.MonkeyPatch) -> None:
+    from vllm_omni.diffusion.models.cosmos3 import sound_tokenizer
+
+    model_dir = tmp_path / "model"
+    tokenizer_dir = _write_component(model_dir, checkpoint_name=DIFFUSERS_SOUND_TOKENIZER_CHECKPOINT_NAME)
+    created = {}
+    _patch_fake_avae(monkeypatch, created)
+
+    tokenizer = sound_tokenizer.Cosmos3SoundTokenizer.from_config(
+        SimpleNamespace(
+            model=str(model_dir),
+            custom_pipeline_args={"sound_sample_rate": 32000, "sound_hop_size": 800, "sound_dim": 3},
+            dtype=torch.float32,
+        )
+    )
+
+    assert created["checkpoint_path"] == str(tokenizer_dir / DIFFUSERS_SOUND_TOKENIZER_CHECKPOINT_NAME)
+    assert created["config_path"] == str(tokenizer_dir / "config.json")
+    assert (tokenizer.sample_rate, tokenizer.latent_ch, tokenizer.hop_size) == (32000, 3, 800)
+
+
+def test_from_config_downloads_component_from_hf_repo(tmp_path, monkeypatch: pytest.MonkeyPatch) -> None:
+    import huggingface_hub
+
+    from vllm_omni.diffusion.models.cosmos3 import sound_tokenizer
+
+    cache_dir = tmp_path / "hf"
+    _write_component(cache_dir, checkpoint_name=DIFFUSERS_SOUND_TOKENIZER_CHECKPOINT_NAME)
+    calls = []
+    created = {}
+    _patch_fake_avae(monkeypatch, created)
+
+    def fake_snapshot_download(repo_id: str, *, revision: str | None, allow_patterns: list[str]) -> str:
+        calls.append((repo_id, revision, allow_patterns))
+        return str(cache_dir)
+
+    monkeypatch.setattr(huggingface_hub, "snapshot_download", fake_snapshot_download)
+
+    sound_tokenizer.Cosmos3SoundTokenizer.from_config(
+        SimpleNamespace(
+            model="nvidia/cosmos3",
+            revision="test-rev",
+            custom_pipeline_args={"sound_sample_rate": 32000, "sound_hop_size": 800, "sound_dim": 3},
+            dtype=torch.float32,
+        )
+    )
+
+    assert created["checkpoint_path"].endswith(DIFFUSERS_SOUND_TOKENIZER_CHECKPOINT_NAME)
+    assert calls == [
+        (
+            "nvidia/cosmos3",
+            "test-rev",
+            ["sound_tokenizer/config.json", f"sound_tokenizer/{DIFFUSERS_SOUND_TOKENIZER_CHECKPOINT_NAME}"],
+        )
+    ]
+
+
+@pytest.mark.parametrize(
+    ("checkpoint_name", "message"),
+    [
+        (None, "no AVAE sound tokenizer checkpoint"),
+        ("model.safetensors", DIFFUSERS_SOUND_TOKENIZER_CHECKPOINT_NAME),
+    ],
+)
+def test_default_component_requires_diffusers_checkpoint_name(tmp_path, checkpoint_name, message) -> None:
+    from vllm_omni.diffusion.models.cosmos3 import sound_tokenizer
+
+    model_dir = tmp_path / "model"
+    _write_component(model_dir, checkpoint_name=checkpoint_name)
+
+    with pytest.raises(ValueError, match=message):
+        sound_tokenizer.Cosmos3SoundTokenizer.from_config(
+            SimpleNamespace(model=str(model_dir), custom_pipeline_args={}, dtype=torch.float32)
+        )
+
+
+def test_component_config_precedence_and_conflict_detection(tmp_path, monkeypatch: pytest.MonkeyPatch) -> None:
+    from vllm_omni.diffusion.models.cosmos3 import sound_tokenizer
+
+    component_config = {
+        "sampling_rate": 48000,
+        "dec_out_channels": 2,
+        "vocoder_input_dim": 64,
+        "hop_size": 1920,
+    }
+    model_dir = tmp_path / "model"
+    _write_component(model_dir, component_config, DIFFUSERS_SOUND_TOKENIZER_CHECKPOINT_NAME)
+    created = {}
+    _patch_fake_avae(monkeypatch, created)
+
+    tokenizer = sound_tokenizer.Cosmos3SoundTokenizer.from_config(
+        SimpleNamespace(
+            model=str(model_dir),
+            custom_pipeline_args={
+                "sound_normalize_latents": True,
+                "sound_normalization_type": "tanh",
+                "sound_tanh_input_scale": 2.0,
+            },
+            model_config={
+                "sound_tokenizer": {
+                    "sample_rate": 32000,
+                    "audio_channels": 1,
+                    "io_channels": 3,
+                    "hop_size": 800,
+                    "normalize_latents": False,
+                    "normalization_type": "none",
+                }
+            },
+            dtype=torch.float32,
+        )
+    )
+
+    assert (created["sample_rate"], created["audio_channels"], created["io_channels"], created["hop_size"]) == (
+        48000,
+        2,
+        64,
+        1920,
+    )
+    assert (created["normalize_latents"], created["normalization_type"], created["tanh_input_scale"]) == (
+        True,
+        "tanh",
+        2.0,
+    )
+    assert (tokenizer.sample_rate, tokenizer.latent_ch, tokenizer.hop_size) == (48000, 64, 1920)
+
+    with pytest.raises(ValueError, match=r"sample_rate.*48000.*32000"):
+        sound_tokenizer.Cosmos3SoundTokenizer.from_config(
+            SimpleNamespace(
+                model=str(model_dir),
+                custom_pipeline_args={"sound_sample_rate": 32000},
+                dtype=torch.float32,
+            )
+        )
+
+
+def test_avae_uses_diffusers_decoder_state_dict_layout(tmp_path) -> None:
+    from safetensors.torch import save_file
+
+    from vllm_omni.diffusion.models.cosmos3.audio_tokenizer import avae
+
+    config = {
+        "sampling_rate": 8000,
+        "hop_size": 2,
+        "dec_dim": 4,
+        "dec_c_mults": [1],
+        "dec_strides": [2],
+        "dec_out_channels": 1,
+        "vocoder_input_dim": 2,
+        "normalization_type": "none",
+    }
+    checkpoint_path = tmp_path / DIFFUSERS_SOUND_TOKENIZER_CHECKPOINT_NAME
+    config_path = tmp_path / "config.json"
+    config_path.write_text(json.dumps(config), encoding="utf-8")
+
+    decoder = avae.OobleckDecoder(4, 2, 1, [2], [1])
+    save_file({f"decoder.{key}": value for key, value in decoder.state_dict().items()}, str(checkpoint_path))
+
+    tokenizer = avae.Cosmos3AVAEAudioTokenizer(
+        checkpoint_path=checkpoint_path,
+        config_path=config_path,
+        dtype=torch.float32,
+        device="cpu",
+    )
+
+    keys = set(tokenizer.state_dict())
+    assert {"decoder.conv1.weight_g", "decoder.block.0.conv_t1.weight_g", "decoder.conv2.weight_g"} <= keys
+    assert not any(key.startswith(("decoder.layers.", "model.decoder.")) for key in keys)
+    assert tokenizer.decode(torch.zeros(1, 2, 3)).shape == (1, 1, 6)
+    with pytest.raises(NotImplementedError, match="decoder-only"):
+        tokenizer.encode(torch.zeros(1, 1, 6))
diff --git a/tests/diffusion/models/cosmos3/test_cosmos3_transformer.py b/tests/diffusion/models/cosmos3/test_cosmos3_transformer.py
index 730079c116a..38db56e0c26 100644
--- a/tests/diffusion/models/cosmos3/test_cosmos3_transformer.py
+++ b/tests/diffusion/models/cosmos3/test_cosmos3_transformer.py
@@ -29,8 +29,9 @@ def _tiny_cosmos3_config(**overrides):
     return config
 
 
-def test_mrope_position_ids_cover_text_and_video() -> None:
+def test_mrope_position_ids_cover_text_video_and_sound() -> None:
     from vllm_omni.diffusion.models.cosmos3.transformer_cosmos3 import (
+        compute_mrope_position_ids_sound,
         compute_mrope_position_ids_text,
         compute_mrope_position_ids_vision,
     )
@@ -56,6 +57,10 @@ def test_mrope_position_ids_cover_text_and_video() -> None:
     torch.testing.assert_close(modulated_ids[0], torch.tensor([10.0, 12.0]))
     assert modulated_offset == 13
 
+    sound_ids, sound_offset = compute_mrope_position_ids_sound(3, temporal_offset=10, sound_latent_fps=25.0)
+    torch.testing.assert_close(sound_ids[0], torch.tensor([10.0, 10.96, 11.92]))
+    assert sound_offset == 12
+
 
 @pytest.mark.parametrize(
     ("key", "value"),
@@ -115,12 +120,90 @@ def test_forward_returns_video_prediction(monkeypatch: pytest.MonkeyPatch) -> No
         text_mask=torch.ones(1, 2, dtype=torch.long),
         video_shape=(1, 2, 2),
         fps=24.0,
+        sound_latents=torch.zeros(1, 3, 4),
     )
 
     assert tuple(output.shape) == (1, 2, 1, 2, 2)
 
 
-def test_compute_rope_freqs_places_text_and_video_positions() -> None:
+def test_sound_modules_follow_config() -> None:
+    from vllm_omni.diffusion.models.cosmos3.transformer_cosmos3 import Cosmos3VFMTransformer
+
+    tiny = _tiny_cosmos3_config()
+    no_modal = Cosmos3VFMTransformer(SimpleNamespace(tf_model_config=tiny, dtype=torch.float32))
+    with_sound = Cosmos3VFMTransformer(
+        SimpleNamespace(
+            tf_model_config={**tiny, "sound_gen": True},
+            model_config={"sound_tokenizer": {"io_channels": 5, "sample_rate": 32000, "hop_size": 800}},
+            custom_pipeline_args={},
+            dtype=torch.float32,
+        )
+    )
+
+    assert no_modal.sound_gen is False
+    assert not hasattr(no_modal, "audio_proj_in")
+    assert with_sound.sound_dim == 5
+    assert with_sound.sound_latent_fps == 40.0
+    assert with_sound.audio_proj_in.in_features == 5
+
+
+def test_sound_pack_unpack_validate_shapes() -> None:
+    from vllm_omni.diffusion.models.cosmos3.transformer_cosmos3 import Cosmos3VFMTransformer
+
+    model = object.__new__(Cosmos3VFMTransformer)
+    nn.Module.__init__(model)
+    model.sound_dim = 3
+
+    sound = torch.arange(2 * 3 * 4, dtype=torch.float32).reshape(2, 3, 4)
+    torch.testing.assert_close(model.unpack_sound(model.pack_sound(sound)), sound)
+
+    with pytest.raises(ValueError, match="channel mismatch"):
+        model.pack_sound(torch.zeros(1, 4, 2))
+
+
+def test_forward_returns_video_and_sound_predictions() -> None:
+    from vllm_omni.diffusion.models.cosmos3.transformer_cosmos3 import Cosmos3VFMTransformer
+
+    output = Cosmos3VFMTransformer(
+        SimpleNamespace(
+            tf_model_config=_tiny_cosmos3_config(sound_gen=True, sound_dim=3, sound_latent_fps=24.0),
+            dtype=torch.float32,
+        )
+    )(
+        hidden_states=torch.zeros(1, 2, 1, 2, 2),
+        timestep=torch.tensor([1.0]),
+        text_ids=torch.tensor([[1, 2]], dtype=torch.long),
+        text_mask=torch.ones(1, 2, dtype=torch.long),
+        video_shape=(1, 2, 2),
+        fps=24.0,
+        sound_latents=torch.zeros(1, 3, 4),
+    )
+
+    assert isinstance(output, tuple)
+    assert [tuple(tensor.shape) for tensor in output] == [(1, 2, 1, 2, 2), (1, 3, 4)]
+
+
+def test_forward_with_sound_ulysses_error_mentions_combined_sequence(monkeypatch: pytest.MonkeyPatch) -> None:
+    import vllm_omni.diffusion.models.cosmos3.transformer_cosmos3 as cosmos3_module
+
+    model = cosmos3_module.Cosmos3VFMTransformer(
+        SimpleNamespace(tf_model_config=_tiny_cosmos3_config(sound_gen=True, sound_dim=3), dtype=torch.float32)
+    )
+    monkeypatch.setattr(cosmos3_module, "_get_ulysses_state", lambda: (2, 0, None))
+
+    with pytest.raises(ValueError, match=r"GEN sequence length \(3 = video tokens 2 \+ sound tokens 1\)"):
+        model(
+            hidden_states=torch.zeros(1, 2, 1, 1, 2),
+            timestep=torch.tensor([1.0]),
+            text_ids=torch.tensor([[1, 2]], dtype=torch.long),
+            text_mask=torch.ones(1, 2, dtype=torch.long),
+            video_shape=(1, 1, 2),
+            fps=24.0,
+            sound_latents=torch.zeros(1, 3, 1),
+        )
+
+
+def test_compute_rope_freqs_places_text_video_and_sound_positions() -> None:
     from vllm_omni.diffusion.models.cosmos3.transformer_cosmos3 import Cosmos3VFMTransformer
 
     class FakeRotary:
@@ -140,6 +223,8 @@ def __call__(self, x, position_ids):
     model.temporal_modality_margin = 100
     model.base_fps = 24.0
     model.temporal_compression_factor = 4
+    model.temporal_compression_factor_sound = 1
+    model.sound_latent_fps = 25.0
     model.enable_fps_modulation = False
 
     freqs_und, freqs_gen = model._compute_rope_freqs(
@@ -156,3 +241,19 @@ def __call__(self, x, position_ids):
     assert vision_pos[0, 0].tolist() == [102, 103]
     assert freqs_und[0].shape == (2, 3, 1, 4)
     assert freqs_gen[0].shape == (2, 2, 1, 4)
+
+    rotary.position_ids.clear()
+    model._compute_rope_freqs(
+        text_mask=torch.tensor([[1, 1]], dtype=torch.long),
+        t=2,
+        hp=1,
+        wp=1,
+        fps=24.0,
+        device=torch.device("cpu"),
+        dtype=torch.float32,
+        t_sound=1,
+    )
+
+    _, gen_pos = rotary.position_ids
+    assert gen_pos.shape == (3, 1, 3)
+    assert gen_pos[0, 0].tolist() == [102, 103, 102]
diff --git a/tests/entrypoints/openai_api/test_video_server.py b/tests/entrypoints/openai_api/test_video_server.py
index 36b19333980..de1f14c7455 100644
--- a/tests/entrypoints/openai_api/test_video_server.py
+++ b/tests/entrypoints/openai_api/test_video_server.py
@@ -399,6 +399,8 @@ def test_sampling_params_pass_through(test_client, mocker: MockerFixture):
             "true_cfg_scale": "4.0",
             "boundary_ratio": "0.7",
             "flow_shift": "0.25",
+            "generate_sound": "true",
+            "sound_duration": "2.5",
         },
     )
 
@@ -413,6 +415,8 @@ def test_sampling_params_pass_through(test_client, mocker: MockerFixture):
     assert captured.true_cfg_scale == 4.0
     assert captured.boundary_ratio == 0.7
     assert captured.extra_args["flow_shift"] == 0.25
+    assert captured.extra_args["generate_sound"] is True
+    assert captured.extra_args["sound_duration"] == 2.5
 
 
 def test_frame_interpolation_params_pass_to_diffusion_sampling_params(test_client, mocker: MockerFixture):
@@ -756,6 +760,9 @@ def test_invalid_uploaded_input_reference_returns_400(test_client):
 def test_video_request_validation():
     req = VideoGenerationRequest(prompt="test")
     assert req.prompt == "test"
+    assert req.generate_sound is False
+    assert req.sound_duration is None
+    assert VideoGenerationRequest(prompt="test", generate_sound=True, sound_duration=1.5).generate_sound is True
     with pytest.raises(ValueError):
         VideoGenerationRequest(prompt="test", size="invalid")
 
@@ -768,6 +775,8 @@ def test_video_request_validation():
         VideoGenerationRequest(prompt="test", frame_interpolation_exp=0)
     with pytest.raises(ValueError):
         VideoGenerationRequest(prompt="test", frame_interpolation_scale=0)
+    with pytest.raises(ValueError):
+        VideoGenerationRequest(prompt="test", sound_duration=0)
 
 
 def test_list_videos_supports_order_after_and_limit(test_client, mocker: MockerFixture):
diff --git a/vllm_omni/diffusion/models/cosmos3/audio_tokenizer/__init__.py b/vllm_omni/diffusion/models/cosmos3/audio_tokenizer/__init__.py
new file mode 100644
index 00000000000..cfb794705ba
--- /dev/null
+++ b/vllm_omni/diffusion/models/cosmos3/audio_tokenizer/__init__.py
@@ -0,0 +1,6 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from .avae import Cosmos3AVAEAudioTokenizer
+
+__all__ = ["Cosmos3AVAEAudioTokenizer"]
diff --git a/vllm_omni/diffusion/models/cosmos3/audio_tokenizer/avae.py b/vllm_omni/diffusion/models/cosmos3/audio_tokenizer/avae.py
new file mode 100644
index 00000000000..4ddb8d41527
--- /dev/null
+++ b/vllm_omni/diffusion/models/cosmos3/audio_tokenizer/avae.py
@@ -0,0 +1,323 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Diffusers-format AVAE audio tokenizer used by Cosmos3 sound generation."""
+
+from __future__ import annotations
+
+import json
+import math
+from pathlib import Path
+from typing import Any
+
+import torch
+from torch import nn
+from torch.nn.utils import weight_norm
+from vllm.logger import init_logger
+
+from vllm_omni.diffusion.models.progress_bar import _is_rank_zero
+
+logger = init_logger(__name__)
+
+
+def _default_avae_config(
+    *,
+    sample_rate: int,
+    audio_channels: int,
+    io_channels: int,
+    hop_size: int,
+) -> dict[str, Any]:
+    return {
+        "sampling_rate": sample_rate,
+        "hop_size": hop_size,
+        "dec_dim": 320,
+        "dec_c_mults": [1, 2, 4, 8, 16],
+        "dec_strides": [2, 4, 5, 6, 8],
+        "dec_out_channels": audio_channels,
+        "vocoder_input_dim": io_channels,
+        "normalization_type": "none",
+        "normalize_latents": False,
+        "tanh_input_scale": 1.5,
+        "tanh_output_scale": 3.5,
+        "tanh_clamp": 0.995,
+    }
+
+
+def _config_get(config: dict[str, Any], *keys: str, default: Any = None) -> Any:
+    for key in keys:
+        value = config.get(key)
+        if value is not None:
+            return value
+    return default
+
+
+def _load_config(
+    config_path: str | Path | None,
+    *,
+    sample_rate: int,
+    audio_channels: int,
+    io_channels: int,
+    hop_size: int,
+) -> dict[str, Any]:
+    if config_path:
+        with open(config_path, encoding="utf-8") as f:
+            config = json.load(f)
+        if not isinstance(config, dict):
+            raise TypeError(f"Cosmos3 AVAE config must be a JSON object, got {type(config)!r}.")
+        return config
+    return _default_avae_config(
+        sample_rate=sample_rate,
+        audio_channels=audio_channels,
+        io_channels=io_channels,
+        hop_size=hop_size,
+    )
+
+
+def _load_checkpoint(path: str | Path, map_location: torch.device | str) -> dict[str, torch.Tensor]:
+    """Load a flat ``{name: tensor}`` state dict from a .safetensors or torch checkpoint file."""
+    path = Path(path)
+    if path.suffix == ".safetensors":
+        try:
+            from safetensors.torch import load_file
+        except ImportError as exc:
+            raise ImportError("Loading AVAE .safetensors checkpoints requires safetensors.") from exc
+        checkpoint = load_file(str(path), device=str(map_location))
+    else:
+        checkpoint = torch.load(path, map_location=map_location, weights_only=True)  # never unpickle code
+    if not isinstance(checkpoint, dict):
+        raise TypeError(f"AVAE checkpoint must be a flat state dict, got {type(checkpoint)!r}.")
+    if not all(isinstance(value, torch.Tensor) for value in checkpoint.values()):
+        raise TypeError("AVAE checkpoint must be a flat tensor state dict.")
+    return checkpoint
+
+
+def _validate_diffusers_state_dict(state_dict: dict[str, torch.Tensor]) -> None:
+    if not state_dict:
+        raise RuntimeError("AVAE checkpoint is empty.")
+
+    if not any(key.startswith("decoder.") for key in state_dict):
+        raise RuntimeError("Cosmos3 AVAE checkpoint must contain diffusers-format decoder.* keys.")
+
+
+class Snake1d(nn.Module):
+    """One-dimensional Snake activation matching diffusers' Oobleck layout."""
+
+    def __init__(self, hidden_dim: int, logscale: bool = True) -> None:
+        super().__init__()
+        self.alpha = nn.Parameter(torch.zeros(1, hidden_dim, 1))
+        self.beta = nn.Parameter(torch.zeros(1, hidden_dim, 1))
+        self.logscale = logscale
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        shape = hidden_states.shape
+        alpha = torch.exp(self.alpha) if self.logscale else self.alpha
+        beta = torch.exp(self.beta) if self.logscale else self.beta
+        hidden_states = hidden_states.reshape(shape[0], shape[1], -1)
+        hidden_states = hidden_states + (beta + 1e-9).reciprocal() * torch.sin(alpha * hidden_states).pow(2)
+        return hidden_states.reshape(shape)
+
+
+class OobleckResidualUnit(nn.Module):
+    """Residual unit used by the diffusers Oobleck decoder."""
+
+    def __init__(self, dimension: int = 16, dilation: int = 1) -> None:
+        super().__init__()
+        pad = ((7 - 1) * dilation) // 2
+        self.snake1 = Snake1d(dimension)
+        self.conv1 = weight_norm(nn.Conv1d(dimension, dimension, kernel_size=7, dilation=dilation, padding=pad))
+        self.snake2 = Snake1d(dimension)
+        self.conv2 = weight_norm(nn.Conv1d(dimension, dimension, kernel_size=1))
+
+    def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
+        output_tensor = self.conv1(self.snake1(hidden_state))
+        output_tensor = self.conv2(self.snake2(output_tensor))
+        padding = (hidden_state.shape[-1] - output_tensor.shape[-1]) // 2
+        if padding > 0:
+            hidden_state = hidden_state[..., padding:-padding]
+        return hidden_state + output_tensor
+
+
+class OobleckDecoderBlock(nn.Module):
+    """Decoder block used by the diffusers Oobleck decoder."""
+
+    def __init__(self, input_dim: int, output_dim: int, stride: int = 1, output_padding: int = 0) -> None:
+        super().__init__()
+        self.snake1 = Snake1d(input_dim)
+        self.conv_t1 = weight_norm(
+            nn.ConvTranspose1d(
+                input_dim,
+                output_dim,
+                kernel_size=2 * stride,
+                stride=stride,
+                padding=math.ceil(stride / 2),
+                output_padding=output_padding,
+            )
+        )
+        self.res_unit1 = OobleckResidualUnit(output_dim, dilation=1)
+        self.res_unit2 = OobleckResidualUnit(output_dim, dilation=3)
+        self.res_unit3 = OobleckResidualUnit(output_dim, dilation=9)
+
+    def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
+        hidden_state = self.snake1(hidden_state)
+        hidden_state = self.conv_t1(hidden_state)
+        hidden_state = self.res_unit1(hidden_state)
+        hidden_state = self.res_unit2(hidden_state)
+        return self.res_unit3(hidden_state)
+
+
+class OobleckDecoder(nn.Module):
+    """Diffusers-compatible Oobleck decoder for Cosmos3 AVAE latents."""
+
+    def __init__(
+        self,
+        channels: int,
+        input_channels: int,
+        audio_channels: int,
+        upsampling_ratios: list[int],
+        channel_multiples: list[int],
+    ) -> None:
+        super().__init__()
+        strides = upsampling_ratios
+        channel_multiples = [1] + channel_multiples
+
+        self.conv1 = weight_norm(nn.Conv1d(input_channels, channels * channel_multiples[-1], kernel_size=7, padding=3))
+
+        block = []
+        for stride_index, stride in enumerate(strides):
+            block.append(
+                OobleckDecoderBlock(
+                    input_dim=channels * channel_multiples[len(strides) - stride_index],
+                    output_dim=channels * channel_multiples[len(strides) - stride_index - 1],
+                    stride=stride,
+                    output_padding=stride % 2,
+                )
+            )
+        self.block = nn.ModuleList(block)
+        self.snake1 = Snake1d(channels)
+        self.conv2 = weight_norm(nn.Conv1d(channels, audio_channels, kernel_size=7, padding=3, bias=False))
+
+    def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
+        hidden_state = self.conv1(hidden_state)
+        for layer in self.block:
+            hidden_state = layer(hidden_state)
+        hidden_state = self.snake1(hidden_state)
+        return self.conv2(hidden_state)
+
+
+class Cosmos3AVAEAudioTokenizer(nn.Module):
+    """Decoder-only AVAE tokenizer: loads decoder.* weights and turns audio latents into waveforms."""
+
+    def __init__(
+        self,
+        *,
+        checkpoint_path: str | Path,
+        config_path: str | Path | None = None,
+        sample_rate: int = 48000,
+        audio_channels: int = 2,
+        io_channels: int = 64,
+        hop_size: int = 1920,
+        normalize_latents: bool = False,
+        normalization_type: str = "none",
+        tanh_input_scale: float = 1.5,
+        tanh_output_scale: float = 3.5,
+        tanh_clamp: float = 0.995,
+        dtype: torch.dtype = torch.bfloat16,
+        device: torch.device | str = "cuda",
+    ) -> None:
+        super().__init__()
+        self.dtype = dtype
+        self.device = torch.device(device)
+
+        config = _load_config(
+            config_path,
+            sample_rate=sample_rate,
+            audio_channels=audio_channels,
+            io_channels=io_channels,
+            hop_size=hop_size,
+        )
+        # Keys present in the loaded config take precedence; the keyword arguments only serve as fallbacks.
+        self.sample_rate = int(_config_get(config, "sampling_rate", "sample_rate", default=sample_rate))
+        self.audio_channels = int(
+            _config_get(
+                config,
+                "dec_out_channels",
+                "audio_channels",
+                default=2 if bool(config.get("stereo", audio_channels == 2)) else 1,
+            )
+        )
+        self.latent_ch = int(_config_get(config, "vocoder_input_dim", "io_channels", "latent_ch", default=io_channels))
+        dec_strides = [int(stride) for stride in _config_get(config, "dec_strides", default=[2, 4, 5, 6, 8])]
+        dec_stride_product = math.prod(dec_strides)
+        self.hop_size = int(_config_get(config, "hop_size", default=dec_stride_product if dec_strides else hop_size))
+        if dec_stride_product != self.hop_size:
+            raise ValueError(
+                "Cosmos3 AVAE config dec_strides product must equal hop_size "
+                f"for correct latent/audio duration math: product={dec_stride_product}, hop_size={self.hop_size}."
+            )
+
+        normalization_type = str(_config_get(config, "normalization_type", default=normalization_type))
+        normalize_latents = bool(_config_get(config, "normalize_latents", default=normalize_latents))
+        # normalize_latents=True without an explicit scheme selects tanh normalization.
+        if normalization_type == "none" and normalize_latents:
+            normalization_type = "tanh"
+        self.normalization_type = normalization_type
+        self.tanh_input_scale = float(_config_get(config, "tanh_input_scale", default=tanh_input_scale))
+        self.tanh_output_scale = float(_config_get(config, "tanh_output_scale", default=tanh_output_scale))
+        self.tanh_clamp = float(_config_get(config, "tanh_clamp", default=tanh_clamp))
+
+        self.decoder = OobleckDecoder(
+            channels=int(_config_get(config, "dec_dim", default=320)),
+            input_channels=self.latent_ch,
+            audio_channels=self.audio_channels,
+            upsampling_ratios=list(reversed(dec_strides)),
+            channel_multiples=list(_config_get(config, "dec_c_mults", default=[1, 2, 4, 8, 16])),
+        )
+        state_dict = _load_checkpoint(checkpoint_path, self.device)
+        _validate_diffusers_state_dict(state_dict)
+
+        # strict=False skips the encoder weights the checkpoint also carries; decoder weights must all be present.
+        if missing_keys := self.load_state_dict(state_dict, strict=False).missing_keys:
+            raise RuntimeError(f"Cosmos3 AVAE checkpoint is missing decoder weights: {sorted(missing_keys)}")
+        self.eval()
+        for param in self.parameters():
+            param.requires_grad = False
+        self.to(device=self.device, dtype=self.dtype)
+        if _is_rank_zero():
+            logger.info("Loaded diffusers-format Cosmos3 AVAE checkpoint from %s", checkpoint_path)
+
+    @property
+    def temporal_compression_factor(self) -> int:
+        return self.hop_size
+
+    def get_latent_num_samples(self, num_audio_samples: int) -> int:
+        return int(num_audio_samples) // self.temporal_compression_factor
+
+    def get_audio_num_samples(self, num_latent_samples: int) -> int:
+        return int(num_latent_samples) * self.temporal_compression_factor
+
+    def _denormalize_latent(self, latent: torch.Tensor) -> torch.Tensor:
+        if self.normalization_type == "tanh":
+            in_dtype = latent.dtype
+            latent = torch.clamp(
+                latent.float() / self.tanh_output_scale,
+                -self.tanh_clamp,
+                self.tanh_clamp,
+            )
+            return (torch.atanh(latent) * self.tanh_input_scale).to(in_dtype)
+        if self.normalization_type != "none":
+            raise ValueError(f"Unsupported AVAE normalization_type={self.normalization_type!r}.")
+        return latent
+
+    @torch.no_grad()
+    def encode(self, audio: torch.Tensor, force_pad: bool = False) -> torch.Tensor:
+        del audio, force_pad
+        raise NotImplementedError("Cosmos3AVAEAudioTokenizer is decoder-only for diffusers-format sound_tokenizer/.")
+
+    @torch.no_grad()
+    def decode(self, latent: torch.Tensor) -> torch.Tensor:
+        in_dtype = latent.dtype
+        squeeze = latent.ndim == 2
+        if squeeze:
+            latent = latent.unsqueeze(0)
+        z = self._denormalize_latent(latent.to(self.device)).to(self.dtype)
+        audio = self.decoder(z).clamp(-1.0, 1.0).to(in_dtype)
+        return audio.squeeze(0) if squeeze else audio
diff --git a/vllm_omni/diffusion/models/cosmos3/pipeline_cosmos3.py b/vllm_omni/diffusion/models/cosmos3/pipeline_cosmos3.py
index 9a20d4fa80d..32c129c613f 100644
--- a/vllm_omni/diffusion/models/cosmos3/pipeline_cosmos3.py
+++ b/vllm_omni/diffusion/models/cosmos3/pipeline_cosmos3.py
@@ -15,6 +15,7 @@
 
 from __future__ import annotations
 
+import math
 import os
 import time
 from collections.abc import Iterable
@@ -141,6 +142,28 @@ def get_cosmos3_post_process_func(od_config: OmniDiffusionConfig):
 
     video_processor = VideoProcessor(vae_scale_factor=16)
 
+    def _sampling_param(sampling_params, key: str, default=None):
+        extra = getattr(sampling_params, "extra_args", None)
+        if isinstance(extra, dict) and extra.get(key) is not None:
+            return extra[key]
+        value = getattr(sampling_params, key, None)
+        return default if value is None else value
+
+    def _resolve_output_fps(sampling_params):  # first usable frame rate from sampling_params, else 24
+        fps = (
+            _sampling_param(sampling_params, "resolved_frame_rate")
+            or _sampling_param(sampling_params, "frame_rate")
+            or _sampling_param(sampling_params, "fps")
+            or 24.0
+        )
+        try:
+            fps_value = float(fps)
+        except (TypeError, ValueError):
+            fps_value = 24.0
+        if not math.isfinite(fps_value) or fps_value <= 0:  # NaN, +/-inf and non-positive rates fall back
+            fps_value = 24.0
+        return int(fps_value) if fps_value.is_integer() else fps_value
+
     def post_process_func(
         output: torch.Tensor | dict[str, torch.Tensor] | tuple,
         output_type: str = "np",
@@ -149,6 +172,8 @@ def post_process_func(
         if output_type == "latent":
             return output
 
+        audio = None
+        audio_sample_rate = None
         if isinstance(output, dict):
             if "image" in output and "video" in output:
                 raise ValueError("Cosmos3 output cannot contain both image and video payloads.")
@@ -158,10 +183,23 @@ def post_process_func(
                 video = output["video"]
             else:
                 raise ValueError("Cosmos3 postprocess expected an 'image' or 'video' output payload.")
+            audio = output.get("audio")
+            audio_sample_rate = output.get("audio_sample_rate")
+        elif isinstance(output, tuple):
+            if len(output) == 3:
+                video, audio, audio_sample_rate = output
+            elif len(output) == 2:
+                video, audio = output
+            else:
+                raise ValueError(
+                    "Cosmos3 postprocess expects output tensor, output dict, or (video, audio[, sample_rate]) tuple."
+                )
         else:
             video = output
 
         if isinstance(output, dict) and "image" in output:
+            if audio is not None:
+                raise ValueError("Cosmos3 text-to-image postprocess does not support audio output.")
             if video.ndim != 5 or video.shape[2] != 1:
                 raise ValueError(
                     "Cosmos3 text-to-image postprocess expects decoded output "
@@ -175,7 +213,16 @@ def post_process_func(
             return video_processor.postprocess(image, output_type="pil")
         if is_guardrails_enabled(od_config, sampling_params):
             video = check_video_safety(video)
-        return {"video": video_processor.postprocess_video(video, output_type=output_type)}
+        result = {"video": video_processor.postprocess_video(video, output_type=output_type)}
+        if audio is None:
+            return result
+        if isinstance(audio, torch.Tensor):
+            audio = audio.detach().cpu()
+        result["audio"] = audio
+        result["fps"] = _resolve_output_fps(sampling_params)
+        if audio_sample_rate is not None:
+            result["audio_sample_rate"] = int(audio_sample_rate)
+        return result
 
     return post_process_func
 
@@ -297,6 +344,9 @@ def __init__(
 
         self._guidance_scale = None
         self._num_timesteps = None
+        self._sound_tokenizer = None
+        if getattr(self.transformer, "sound_gen", False):
+            self._get_sound_tokenizer()
 
         self.setup_diffusion_pipeline_profiler(
             enable_diffusion_pipeline_profiler=self.od_config.enable_diffusion_pipeline_profiler
@@ -331,9 +381,13 @@ def _remap_ckpt_key(key: str) -> str | None:
                 "proj_in.",
                 "proj_out.",
                 "time_embedder.",
+                "audio_proj_in.",
+                "audio_proj_out.",
             )
         ):
             return f"transformer.{k}"
+        if k in ("audio_modality_embed", "audio_modality_embed.weight"):
+            return "transformer.audio_modality_embed"
 
         # Skip lm_head
         if k.startswith("lm_head."):
@@ -427,12 +481,22 @@ def _remapped_weights() -> Iterable[tuple[str, torch.Tensor]]:
         loaded = loader.load_weights(_remapped_weights())
         self.transformer.post_load_weights()
         self.transformer.eval()
+        if getattr(self.transformer, "sound_gen", False):
+            sound_markers = ("audio_proj_in.", "audio_proj_out.", "audio_modality_embed")
+            missing = [marker.rstrip(".") for marker in sound_markers if not any(marker in name for name in loaded)]
+            if missing:
+                raise ValueError(
+                    "Cosmos3 transformer config enables sound generation, but "
+                    f"the checkpoint is missing sound weights for {missing}. "
+                    "Use a sound-capable transformer checkpoint."
+                )
         return loaded
 
     def predict_noise(self, **kwargs) -> torch.Tensor | tuple[torch.Tensor, ...]:
         """Override CFGParallelMixin.predict_noise for Cosmos3.
 
-        The transformer returns the raw video noise prediction.
+        The transformer returns the raw prediction: video-only as a tensor,
+        or a tuple in video, sound order for sound generation.
         """
         return self.transformer(**kwargs)
 
@@ -467,6 +531,49 @@ def _get_sp_param(sp, key: str, default=None):
             return val
         return default
 
+    @staticmethod
+    def _truthy(value) -> bool:
+        """Strings are true only for "1"/"true"/"yes"/"on" (trimmed, any case); other types use bool()."""
+        is_text = isinstance(value, str)
+        return value.strip().lower() in {"1", "true", "yes", "on"} if is_text else bool(value)
+
+    @classmethod
+    def _get_prompt_param(cls, prompt_data, key: str, default=None):
+        """Fetch ``key`` from the prompt dict, then from its ``additional_information`` dict."""
+        if not isinstance(prompt_data, dict):
+            return default
+        # Top-level entries win over the nested mapping; None counts as "unset" in both.
+        for scope in (prompt_data, prompt_data.get("additional_information")):
+            if isinstance(scope, dict) and scope.get(key) is not None:
+                return scope[key]
+        return default
+
+    @classmethod
+    def _is_sound_request(cls, prompt_data, sp) -> bool:
+        """Return True when the prompt or the sampling params switch on any sound flag."""
+        sound_flags = (
+            "sound_gen",
+            "generate_sound",
+            "enable_sound_generation",
+            "return_audio",
+            "output_audio",
+            "generate_audio",
+        )
+        return any(
+            cls._truthy(getter(source, flag, None))
+            for flag in sound_flags
+            for getter, source in ((cls._get_prompt_param, prompt_data), (cls._get_sp_param, sp))
+        )
+
+    def _get_sound_tokenizer(self):  # returns the cached sound tokenizer, building it on first use
+        if not hasattr(self, "_sound_tokenizer"):  # tolerates instances where the attribute was never set
+            self._sound_tokenizer = None
+        if self._sound_tokenizer is None:
+            from .sound_tokenizer import Cosmos3SoundTokenizer  # imported only when a tokenizer is needed
+
+            self._sound_tokenizer = Cosmos3SoundTokenizer.from_config(self.od_config)
+        return self._sound_tokenizer
+
     @staticmethod
     def _is_t2i_request(req: OmniDiffusionRequest) -> bool:
         """Detect text-to-image mode from request-level prompt modalities."""
@@ -673,6 +780,47 @@ def _prepare_latents(
         )
         return randn_tensor(shape, generator=generator, device=self.device, dtype=self.dtype)
 
+    def _prepare_sound_latents(
+        self,
+        target_audio_samples: int,
+        generator: torch.Generator,
+    ) -> tuple[torch.Tensor, int]:
+        """Draw initial sound noise of shape ``(1, latent channels, latent frames)``.
+
+        Returns the noise tensor together with its number of latent frames.
+        """
+        tokenizer = self._get_sound_tokenizer()
+        stride = int(getattr(tokenizer, "hop_size", None) or getattr(tokenizer, "temporal_compression_factor"))
+        # One latent frame covers ``stride`` samples; keep at least one sample and one frame.
+        latent_frames = max(1, math.ceil(max(1, int(target_audio_samples)) / stride))
+        sound_dim = int(getattr(tokenizer, "latent_ch", 64))
+        transformer_sound_dim = int(getattr(self.transformer, "sound_dim", sound_dim))
+        # Tokenizer and transformer must agree on the sound channel count.
+        if sound_dim != transformer_sound_dim:
+            raise ValueError(
+                "Cosmos3 sound tokenizer latent channels do not match transformer "
+                f"sound_dim: tokenizer={sound_dim}, transformer={transformer_sound_dim}."
+            )
+        noise_shape = (1, sound_dim, latent_frames)
+        noise = randn_tensor(noise_shape, generator=generator, device=self.device, dtype=self.dtype)
+        return noise, latent_frames
+
+    def _resolve_sound_target_samples(
+        self,
+        sp,
+        num_frames: int,
+        frame_rate: float,
+    ) -> tuple[int, float, int]:
+        """Return ``(target sample count, duration in seconds, sample rate)`` for the audio track."""
+        tokenizer = self._get_sound_tokenizer()
+        # An explicit duration wins ("sound_duration", then "audio_duration"); else use the video length.
+        secs = self._get_sp_param(sp, "sound_duration", None)
+        if secs is None:
+            secs = self._get_sp_param(sp, "audio_duration", None)
+        secs = max(float(num_frames / frame_rate if secs is None else secs), 1.0 / max(float(frame_rate), 1.0))
+        sample_rate = int(getattr(tokenizer, "sample_rate", 48000))
+        return max(1, int(round(secs * sample_rate))), secs, sample_rate
+
     # -- VAE decode ----------------------------------------------------------
 
     def _decode_latents(self, latents: torch.Tensor) -> torch.Tensor:
@@ -694,6 +842,19 @@ def _decode_latents(self, latents: torch.Tensor) -> torch.Tensor:
         video = self.vae.decode(latents, return_dict=False)[0]
         return video
 
+    def _decode_sound_latents(
+        self,
+        sound_latents: torch.Tensor,
+        target_audio_samples: int,
+    ) -> torch.Tensor:
+        """Decode sound latents and fit the last axis to exactly ``target_audio_samples``."""
+        waveform = self._get_sound_tokenizer().decode(sound_latents.to(self.dtype))
+        if (shortfall := target_audio_samples - waveform.shape[-1]) < 0:  # too long: drop the tail
+            waveform = waveform[..., :target_audio_samples]
+        elif shortfall > 0:  # too short: right-pad the last axis (F.pad pads with zeros by default)
+            waveform = torch.nn.functional.pad(waveform, (0, shortfall))
+        return waveform.detach().cpu()
+
     # -- Prompt formatting + tokenization (shared by T2V and I2V) ------------
 
     def _format_and_tokenize_prompts(
@@ -855,11 +1016,12 @@ def diffuse(
         guidance_scale: float,
         shared_kwargs: dict,
         *,
+        sound_latents: torch.Tensor | None = None,
         velocity_mask: torch.Tensor | None = None,
         image_latent: torch.Tensor | None = None,
         condition_latents: torch.Tensor | None = None,
         guidance_interval: tuple[float, float] | None = None,
-    ) -> torch.Tensor:
+    ) -> torch.Tensor | tuple[torch.Tensor, ...]:
         """Denoising loop with 3-mode CFG support (parallel, sequential, none).
 
         Cosmos3's UND pathway is text-dependent, so CFG needs separate K/V
@@ -898,21 +1060,82 @@ def _cfg_active_at(t: torch.Tensor) -> bool:
             lo, hi = guidance_interval
             return lo <= t_scalar <= hi
 
+        def _pack_joint(
+            video_tensor: torch.Tensor,
+            sound_tensor: torch.Tensor | None = None,
+        ):
+            """Flatten each modality per sample and concatenate; also return shapes and flat widths."""
+            rows = video_tensor.shape[0]
+            parts = [video_tensor] if sound_tensor is None else [video_tensor, sound_tensor]
+            flattened = [part.reshape(rows, -1) for part in parts]
+            widths = [flat.shape[1] for flat in flattened]
+            return torch.cat(flattened, dim=1), [part.shape for part in parts], widths
+
+        def _unpack_joint(
+            packed: torch.Tensor,
+            shapes: list[torch.Size],
+            numels: list[int],
+        ) -> tuple[torch.Tensor, ...]:
+            """Slice consecutive column ranges of ``packed`` and reshape each to its recorded shape."""
+            pieces, start = [], 0
+            for shape, width in zip(shapes, numels, strict=True):
+                pieces.append(packed[:, start : start + width].reshape(shape))
+                start += width
+            return tuple(pieces)
+
+        def _split_noise_pred(
+            noise_pred: torch.Tensor | tuple[torch.Tensor, ...],
+        ) -> tuple[torch.Tensor, torch.Tensor | None]:
+            """Validate the transformer output and return it as ``(video_pred, sound_pred or None)``."""
+            if sound_latents is None:
+                if isinstance(noise_pred, tuple):
+                    raise ValueError("Cosmos3 video-only diffusion received tuple predictions.")
+                return noise_pred, None
+            if not isinstance(noise_pred, tuple):
+                raise ValueError("Cosmos3 multimodal diffusion expects transformer predictions as a tuple.")
+            if len(noise_pred) != 2:
+                raise ValueError(f"Cosmos3 sound diffusion expected 2 predictions, got {len(noise_pred)}.")
+            return noise_pred[0], noise_pred[1]
+
         def _step(
-            noise_pred: torch.Tensor,
+            noise_pred: torch.Tensor | tuple[torch.Tensor, ...],
             t: torch.Tensor,
             latents: torch.Tensor,
-        ) -> torch.Tensor:
-            if isinstance(noise_pred, tuple):
-                raise ValueError("Cosmos3 video-only diffusion received tuple predictions.")
+            sound_latents: torch.Tensor | None,
+        ) -> torch.Tensor | tuple[torch.Tensor, ...]:
+            video_pred, sound_pred = _split_noise_pred(noise_pred)
             if velocity_mask is not None:
-                noise_pred = noise_pred * velocity_mask
-            latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
+                video_pred = video_pred * velocity_mask
+            if sound_latents is None:
+                latents = self.scheduler.step(video_pred, t, latents, return_dict=False)[0]
+            else:
+                packed_noise, shapes, numels = _pack_joint(video_pred, sound_pred)
+                packed_latents, _, _ = _pack_joint(latents, sound_latents)
+                packed_next = self.scheduler.step(packed_noise, t, packed_latents, return_dict=False)[0]
+                unpacked = _unpack_joint(packed_next, shapes, numels)
+                latents = unpacked[0]
+                if sound_latents is not None:
+                    sound_latents = unpacked[1]
             if condition_latents is not None and velocity_mask is not None:
                 latents = velocity_mask * latents + (1.0 - velocity_mask) * condition_latents
             elif image_latent is not None:
                 latents[:, :, 0:1, :, :] = image_latent
-            return latents
+            outputs = [latents]
+            if sound_latents is not None:
+                outputs.append(sound_latents)
+            return outputs[0] if len(outputs) == 1 else tuple(outputs)
+
+        def _assign_step_out(step_out: torch.Tensor | tuple[torch.Tensor, ...]) -> None:
+            """Write one ``_step`` result back into the loop's ``latents`` / ``sound_latents``."""
+            nonlocal latents, sound_latents
+            if sound_latents is None:
+                if not isinstance(step_out, torch.Tensor):  # explicit raise: asserts vanish under -O
+                    raise ValueError("Cosmos3 video-only diffusion step returned a non-tensor result.")
+                latents = step_out
+                return
+            if not isinstance(step_out, tuple):
+                raise ValueError("Cosmos3 multimodal diffusion step returned a non-tuple result.")
+            latents, sound_latents = step_out[0], step_out[1]
 
         if cfg_parallel:
             for t in self.progress_bar(timesteps):
@@ -930,6 +1153,7 @@ def _step(
                         timestep=timestep,
                         text_ids=cond_ids,
                         text_mask=cond_mask,
+                        sound_latents=sound_latents,
                         **shared_kwargs,
                     ),
                     negative_kwargs=dict(
@@ -937,11 +1161,12 @@ def _step(
                         timestep=timestep,
                         text_ids=uncond_ids,
                         text_mask=uncond_mask,
+                        sound_latents=sound_latents,
                         **shared_kwargs,
                     ),
                     cfg_normalize=False,
                 )
-                latents = _step(noise_pred, t, latents)
+                _assign_step_out(_step(noise_pred, t, latents, sound_latents))
 
         elif do_cfg:
             cond_cache: tuple = (None, None)
@@ -957,6 +1182,7 @@ def _step(
                     timestep=timestep,
                     text_ids=cond_ids,
                     text_mask=cond_mask,
+                    sound_latents=sound_latents,
                     **shared_kwargs,
                 )
                 if cond_cache[0] is None:
@@ -969,6 +1195,7 @@ def _step(
                         timestep=timestep,
                         text_ids=uncond_ids,
                         text_mask=uncond_mask,
+                        sound_latents=sound_latents,
                         **shared_kwargs,
                     )
                     if uncond_cache[0] is None:
@@ -980,7 +1207,7 @@ def _step(
                     # the cond branch) and gives a free speedup for T2I.
                     noise_pred = noise_cond
 
-                latents = _step(noise_pred, t, latents)
+                _assign_step_out(_step(noise_pred, t, latents, sound_latents))
 
         else:
             for t in self.progress_bar(timesteps):
@@ -990,11 +1217,15 @@ def _step(
                     timestep=timestep,
                     text_ids=cond_ids,
                     text_mask=cond_mask,
+                    sound_latents=sound_latents,
                     **shared_kwargs,
                 )
-                latents = _step(noise_pred, t, latents)
+                _assign_step_out(_step(noise_pred, t, latents, sound_latents))
 
-        return latents
+        outputs = [latents]
+        if sound_latents is not None:
+            outputs.append(sound_latents)
+        return outputs[0] if len(outputs) == 1 else tuple(outputs)
 
     # -- Forward (main generation entry point) -------------------------------
 
@@ -1021,6 +1252,18 @@ def forward(
 
         sp = req.sampling_params
         is_t2i = self._is_t2i_request(req)
+        sound_enabled = self._is_sound_request(prompt_data, sp)
+        if sound_enabled and is_t2i:
+            raise ValueError(
+                "Cosmos3 sound generation is supported only for video outputs in "
+                "this phase; text-to-image with sound is unsupported."
+            )
+        if sound_enabled and not getattr(self.transformer, "sound_gen", False):
+            raise ValueError(
+                "Cosmos3 sound generation was requested, but the transformer was "
+                "initialized without sound modules. Check that the checkpoint config "
+                "enables sound_gen or defines sound_dim and includes sound weights."
+            )
         if negative_prompt is None:
             negative_prompt = ""
 
@@ -1109,6 +1352,13 @@ def forward(
             image_latent = None
             condition_latents = None
 
+        sound_latents = None
+        target_audio_samples = None
+        sound_sample_rate = None
+        if sound_enabled:
+            target_audio_samples, _, sound_sample_rate = self._resolve_sound_target_samples(sp, num_frames, frame_rate)
+            sound_latents, _ = self._prepare_sound_latents(target_audio_samples, generator)
+
         T_latent = latents.shape[2]
         H_latent = latents.shape[3]
         W_latent = latents.shape[4]
@@ -1130,6 +1380,7 @@ def _run_diffusion(start_latents):
                 uncond_mask=uncond_mask,
                 guidance_scale=guidance_scale,
                 shared_kwargs=shared_kwargs,
+                sound_latents=sound_latents,
                 velocity_mask=velocity_mask,
                 image_latent=image_latent,
                 condition_latents=condition_latents,
@@ -1150,7 +1401,11 @@ def _run_diffusion(start_latents):
                 samples.append(_run_diffusion(next_latents))
             latents = torch.cat(samples, dim=0)
         else:
-            latents = _run_diffusion(latents)
+            diffusion_output = _run_diffusion(latents)
+            if sound_enabled:
+                latents, sound_latents = diffusion_output
+            else:
+                latents = diffusion_output
 
         # --- Decode ---
         if _is_rank_zero():
@@ -1161,4 +1416,12 @@ def _run_diffusion(start_latents):
             logger.info("Video decoded in %.2fs", time.time() - decode_start)
             logger.info("Total pipeline time: %.2fs", time.time() - pipeline_start)
 
+        if sound_enabled:
+            if sound_latents is None or target_audio_samples is None or sound_sample_rate is None:
+                raise ValueError("Cosmos3 sound generation finished without sound latents.")
+            if _is_rank_zero():
+                logger.info("Decoding sound...")
+            audio = self._decode_sound_latents(sound_latents, target_audio_samples)
+            return DiffusionOutput(output={"video": video, "audio": audio, "audio_sample_rate": sound_sample_rate})
+
         return DiffusionOutput(output={"image": video} if is_t2i else {"video": video})
diff --git a/vllm_omni/diffusion/models/cosmos3/sound_tokenizer.py b/vllm_omni/diffusion/models/cosmos3/sound_tokenizer.py
new file mode 100644
index 00000000000..281b7e1d9f0
--- /dev/null
+++ b/vllm_omni/diffusion/models/cosmos3/sound_tokenizer.py
@@ -0,0 +1,537 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Cosmos3 sound tokenizer integration."""
+
+from __future__ import annotations
+
+import json
+import os
+from pathlib import Path
+from typing import Any
+
+import torch
+from vllm.logger import init_logger
+
+from vllm_omni.diffusion.data import OmniDiffusionConfig
+from vllm_omni.diffusion.distributed.utils import get_local_device
+from vllm_omni.diffusion.models.progress_bar import _is_rank_zero
+
+from .audio_tokenizer import Cosmos3AVAEAudioTokenizer
+
+logger = init_logger(__name__)
+
+DEFAULT_SOUND_SAMPLE_RATE = 48000
+DEFAULT_SOUND_CHANNELS = 2
+DEFAULT_SOUND_DIM = 64
+DEFAULT_SOUND_HOP_SIZE = 1920
+DEFAULT_SOUND_LATENT_FPS = DEFAULT_SOUND_SAMPLE_RATE / DEFAULT_SOUND_HOP_SIZE
+DEFAULT_SOUND_NORMALIZE_LATENTS = False
+DEFAULT_SOUND_NORMALIZATION_TYPE = "none"
+DEFAULT_SOUND_TANH_INPUT_SCALE = 1.5
+DEFAULT_SOUND_TANH_OUTPUT_SCALE = 3.5
+DEFAULT_SOUND_TANH_CLAMP = 0.995
+SOUND_TOKENIZER_COMPONENT_NAME = "sound_tokenizer"
+SOUND_TOKENIZER_CHECKPOINT_NAME = "diffusion_pytorch_model.safetensors"
+
+
+def _pipeline_args(od_config: OmniDiffusionConfig) -> dict[str, Any]:
+    return dict(getattr(od_config, "custom_pipeline_args", None) or {})  # fresh dict; {} when unset or falsy
+
+
+def _config_get(config: Any, key: str, default: Any = None) -> Any:
+    """Look up ``key`` on a dict, on an object with ``.get``, or as a plain attribute."""
+    if config is None:
+        return default
+    if isinstance(config, dict):
+        return config.get(key, default)
+    if not hasattr(config, "get"):
+        return getattr(config, key, default)
+    return default if (found := config.get(key, None)) is None else found
+
+
+def _config_path_get(config: Any, *keys: str) -> Any:
+    """Walk ``keys`` through nested configs; None as soon as a level is missing or None."""
+    node = config
+    for step in keys:
+        if (node := _config_get(node, step, None)) is None:
+            break
+    return node
+
+
+def _sound_tokenizer_config_from(config: Any) -> Any:
+    """Find the ``sound_tokenizer`` sub-config under any of the known parent paths.
+
+    The top level is tried first, then ``model.config``, ``config`` and
+    ``model_config``; the first non-None hit is returned, otherwise None.
+    """
+    parent_paths = ((), ("model", "config"), ("config",), ("model_config",))
+    for parents in parent_paths:
+        found = _config_path_get(config, *parents, "sound_tokenizer")
+        if found is not None:
+            return found
+    return None
+
+
+def _nested_sound_tokenizer_configs(od_config: OmniDiffusionConfig | None) -> tuple[Any, ...]:
+    """Collect the ``sound_tokenizer`` sub-configs found in the model configs.
+
+    ``model_config`` is searched before ``tf_model_config``; sources without a
+    sub-config contribute nothing, so the result may be empty.
+    """
+    if od_config is None:
+        return ()
+    # getattr defaults tolerate configs that lack either attribute.
+    attr_names = ("model_config", "tf_model_config")
+    found = (_sound_tokenizer_config_from(getattr(od_config, name, None)) for name in attr_names)
+    return tuple(config for config in found if config is not None)
+
+
+def _first_value_from_configs(configs: tuple[Any, ...], keys: tuple[str, ...]) -> Any:
+    """Return the first non-None value stored under any of ``keys``.
+
+    Configs are scanned in order and, within each config, keys in order.
+    """
+    lookups = (_config_get(config, key, None) for config in configs for key in keys)
+    return next((value for value in lookups if value is not None), None)
+
+
+def _top_level_model_value(od_config: OmniDiffusionConfig | None, keys: tuple[str, ...]) -> Any:
+    """Return the first non-None value of any key in ``keys`` from the model configs."""
+    if od_config is None:
+        return None
+    # Each key is probed at the top level and below these parent paths, in this order.
+    parent_paths = ((), ("model", "config"), ("config",), ("model_config",))
+    for attr_name in ("model_config", "tf_model_config"):
+        source = getattr(od_config, attr_name, None)
+        for key in keys:
+            for parents in parent_paths:
+                if (found := _config_path_get(source, *parents, key)) is not None:
+                    return found
+    return None
+
+
+def _custom_arg_value(args: dict[str, Any], keys: tuple[str, ...]) -> Any:
+    """Return the first non-None ``args`` entry among ``keys``, or None.
+
+    Keys are tried in the order given, so earlier aliases take precedence.
+    """
+    return next((value for value in map(args.get, keys) if value is not None), None)
+
+
+def _as_bool(value: Any) -> bool:
+    """Coerce to bool; strings are true only for "1", "true", "yes", "on" (trimmed, any case)."""
+    truthy_words = {"1", "true", "yes", "on"}
+    return value.strip().lower() in truthy_words if isinstance(value, str) else bool(value)
+
+
+def _as_audio_channels(value: Any) -> int:
+    """Convert a channel count or a stereo flag into a number of audio channels.
+
+    Booleans and flag-like strings (including "1" and "0") act as a stereo
+    switch: 2 when true, 1 when false. Anything else goes through ``int``.
+    """
+    stereo_on = {"1", "true", "yes", "on"}
+    stereo_off = {"0", "false", "no", "off"}
+    if isinstance(value, bool):
+        return 2 if value else 1
+    if isinstance(value, str):
+        word = value.strip().lower()
+        if word in stereo_on | stereo_off:
+            return 2 if word in stereo_on else 1
+    return int(value)
+
+
+def _resolve_model_file(path: Any, model_root: str | None) -> str | None:
+    """Pass through URLs, absolute/existing paths, or any path without a root; else join onto it."""
+    if not path:
+        return None
+    text = str(path)
+    keep_as_is = "://" in text or os.path.isabs(text) or os.path.exists(text) or not model_root
+    return text if keep_as_is else str(Path(model_root) / text)
+
+
+def _load_sound_tokenizer_component_config(config_path: str | None) -> dict[str, Any]:
+    """Load the tokenizer's JSON config; {} without a path, TypeError if it is not an object."""
+    if not config_path:
+        return {}
+    parsed = json.loads(Path(config_path).read_text(encoding="utf-8"))
+    if isinstance(parsed, dict):
+        return parsed
+    raise TypeError(f"Cosmos3 sound tokenizer config must be a JSON object, got {type(parsed)!r}.")
+
+
+def _component_audio_channels(config: dict[str, Any]) -> Any:
+    """Return the channel count from ``dec_out_channels``/``audio_channels``, else from ``stereo``."""
+    for field in ("dec_out_channels", "audio_channels"):
+        if (explicit := config.get(field)) is not None:
+            return explicit
+    if (stereo := config.get("stereo")) is None:
+        return None
+    return 2 if _as_bool(stereo) else 1
+
+
+def _component_arch_values(config: dict[str, Any]) -> dict[str, Any]:  # architecture fields of the component config
+    values = {
+        "sample_rate": config.get("sampling_rate", config.get("sample_rate")),  # "sampling_rate" wins if present
+        "audio_channels": _component_audio_channels(config),
+        "io_channels": config.get("vocoder_input_dim", config.get("io_channels", config.get("latent_ch"))),
+        "hop_size": config.get("hop_size"),
+    }
+    return {key: value for key, value in values.items() if value is not None}  # None fields are omitted
+
+
+def _resolve_arch_value(
+    od_config: OmniDiffusionConfig,
+    args: dict[str, Any],
+    component_values: dict[str, Any],
+    *,
+    field: str,
+    custom_keys: tuple[str, ...],
+    nested_keys: tuple[str, ...],
+    top_level_keys: tuple[str, ...],
+    default: Any,
+    cast,
+) -> Any:
+    """Resolve one tokenizer architecture field and return it passed through ``cast``.
+
+    Precedence: component config, custom args, nested config, top-level config, ``default``.
+    Custom args that contradict the component config raise ValueError.
+    """
+    custom_value = _custom_arg_value(args, custom_keys)
+    component_value = component_values.get(field)
+    if component_value is None:
+        if custom_value is not None:
+            return cast(custom_value)
+        # Neither component config nor custom args set the field: consult the model configs.
+        fallback = _first_value_from_configs(_nested_sound_tokenizer_configs(od_config), nested_keys)
+        if fallback is None:
+            fallback = _top_level_model_value(od_config, top_level_keys)
+        return cast(default if fallback is None else fallback)
+    # The component config takes precedence; custom args may only restate the same value.
+    resolved = cast(component_value)
+    if custom_value is not None and cast(custom_value) != resolved:
+        raise ValueError(
+            "Conflicting Cosmos3 sound tokenizer architecture override for "
+            f"{field}: component config has {resolved!r}, custom args have {cast(custom_value)!r}."
+        )
+    return resolved
+
+
+def _resolve_normalization_value(
+    od_config: OmniDiffusionConfig,
+    args: dict[str, Any],
+    *,
+    name: str,
+    default: Any,
+    aliases: tuple[str, ...] = (),
+) -> Any:
+    """Resolve a normalization setting: custom args, then nested tokenizer config, else ``default``."""
+    plain_keys = (name, *aliases)
+    chosen = _custom_arg_value(args, (f"sound_{name}", *plain_keys))
+    if chosen is None:
+        chosen = _first_value_from_configs(_nested_sound_tokenizer_configs(od_config), plain_keys)
+    return default if chosen is None else chosen
+
+
+def get_sound_config_value(
+    od_config: OmniDiffusionConfig,
+    name: str,
+    default: Any,
+    aliases: tuple[str, ...] = (),
+) -> Any:
+    """Return the first non-None value of ``name`` (or an alias) across the config sources.
+
+    Sources, in order: custom pipeline args, ``model_config``, ``tf_model_config``.
+    Within one source the keys are tried as ``name`` first, then each alias.
+    """
+    # Backward-compatible generic accessor.  Prefer the more specific helpers
+    # below for Cosmos3 sound tokenizer fields so precedence stays explicit.
+    sources = (
+        _pipeline_args(od_config),
+        getattr(od_config, "model_config", None),
+        getattr(od_config, "tf_model_config", None),
+    )
+    keys = (name, *aliases)
+    for source in sources:
+        for key in keys:
+            if (found := _config_get(source, key, None)) is not None:
+                return found
+    return default
+
+
+def get_sound_sample_rate(od_config: OmniDiffusionConfig) -> int:
+    """Return the sound sample rate from custom args or the model configs (default 48000)."""
+    return _resolve_arch_value(
+        od_config,
+        _pipeline_args(od_config),
+        {},
+        field="sample_rate",
+        custom_keys=("sound_sample_rate", "sample_rate"),
+        nested_keys=("sample_rate", "sampling_rate"),
+        top_level_keys=("sound_sample_rate", "sample_rate"),
+        default=DEFAULT_SOUND_SAMPLE_RATE,
+        cast=int,
+    )
+
+
+def get_sound_channels(od_config: OmniDiffusionConfig) -> int:
+    """Return the audio channel count from custom args or the model configs (default 2)."""
+    return _resolve_arch_value(
+        od_config,
+        _pipeline_args(od_config),
+        {},
+        field="audio_channels",
+        custom_keys=("sound_audio_channels", "audio_channels", "stereo"),
+        nested_keys=("audio_channels", "dec_out_channels", "stereo"),
+        top_level_keys=("sound_audio_channels", "audio_channels", "stereo"),
+        default=DEFAULT_SOUND_CHANNELS,
+        cast=_as_audio_channels,
+    )
+
+
+def get_sound_dim(od_config: OmniDiffusionConfig | None) -> int:
+    """Return the configured sound dimension (``sound_dim``).
+
+    Precedence: custom pipeline args, top-level ``sound_dim`` of the model
+    configs, the nested tokenizer config, then ``DEFAULT_SOUND_DIM``.
+    """
+    if od_config is None:
+        return DEFAULT_SOUND_DIM
+    dim = _custom_arg_value(_pipeline_args(od_config), ("sound_dim", "io_channels", "latent_ch"))
+    if dim is None:
+        dim = _top_level_model_value(od_config, ("sound_dim",))
+    if dim is None:
+        nested_keys = ("io_channels", "vocoder_input_dim", "latent_ch")
+        dim = _first_value_from_configs(_nested_sound_tokenizer_configs(od_config), nested_keys)
+    return int(DEFAULT_SOUND_DIM if dim is None else dim)
+
+
+def get_sound_hop_size(od_config: OmniDiffusionConfig) -> int:
+    """Return the tokenizer hop size from custom args or the model configs (default 1920)."""
+    return _resolve_arch_value(
+        od_config,
+        _pipeline_args(od_config),
+        {},
+        field="hop_size",
+        custom_keys=("sound_hop_size", "hop_size"),
+        nested_keys=("hop_size",),
+        top_level_keys=("sound_hop_size", "hop_size"),
+        default=DEFAULT_SOUND_HOP_SIZE,
+        cast=int,
+    )
+
+
+def get_sound_latent_fps(od_config: OmniDiffusionConfig | None) -> float:
+    """Return the sound latent frame rate: an explicit setting, else sample rate / hop size."""
+    if od_config is None:
+        return DEFAULT_SOUND_LATENT_FPS
+    fps = _custom_arg_value(_pipeline_args(od_config), ("sound_latent_fps",))
+    if fps is None:
+        fps = _top_level_model_value(od_config, ("sound_latent_fps",))
+    if fps is not None:
+        return float(fps)
+    nested = _nested_sound_tokenizer_configs(od_config)
+    fps = _first_value_from_configs(nested, ("sound_latent_fps", "latent_fps"))
+    if fps is not None:
+        return float(fps)
+    # No explicit fps anywhere: derive it from the nested tokenizer config when possible.
+    rate = _first_value_from_configs(nested, ("sample_rate", "sampling_rate"))
+    hop = _first_value_from_configs(nested, ("hop_size",))
+    if rate is None or hop is None:
+        return float(DEFAULT_SOUND_LATENT_FPS)
+    return float(rate) / float(hop)
+
+
+class Cosmos3SoundTokenizer:
+    """Thin adapter around the local AVAE tokenizer implementation."""
+
+    def __init__(self, tokenizer: Any) -> None:  # wraps ``tokenizer`` and mirrors four of its attributes as ints
+        self.tokenizer = tokenizer  # attributes missing on the tokenizer fall back to the module defaults below
+        self.sample_rate = int(getattr(tokenizer, "sample_rate", DEFAULT_SOUND_SAMPLE_RATE))
+        self.audio_channels = int(getattr(tokenizer, "audio_channels", DEFAULT_SOUND_CHANNELS))
+        self.latent_ch = int(getattr(tokenizer, "latent_ch", DEFAULT_SOUND_DIM))
+        self.hop_size = int(getattr(tokenizer, "temporal_compression_factor", DEFAULT_SOUND_HOP_SIZE))
+
+    @classmethod
+    def from_config(cls, od_config: OmniDiffusionConfig) -> Cosmos3SoundTokenizer:
+        args = _pipeline_args(od_config)
+        model_path = getattr(od_config, "model", None)
+        explicit_avae_path = (
+            args.get("sound_tokenizer_path")
+            or args.get("avae_path")
+            or args.get("cosmos3_avae_path")
+            or os.environ.get("COSMOS3_SOUND_TOKENIZER_PATH")
+        )
+        explicit_config_path = args.get("sound_tokenizer_config_path") or os.environ.get(
+            "COSMOS3_SOUND_TOKENIZER_CONFIG_PATH"
+        )
+
+        model_root = str(model_path) if model_path and os.path.isdir(model_path) else None
+        if model_root is None and model_path and not explicit_avae_path:
+            from huggingface_hub import snapshot_download
+
+            model_root = snapshot_download(
+                repo_id=str(model_path),
+                revision=getattr(od_config, "revision", None),
+                allow_patterns=[
+                    f"{SOUND_TOKENIZER_COMPONENT_NAME}/config.json",
+                    f"{SOUND_TOKENIZER_COMPONENT_NAME}/{SOUND_TOKENIZER_CHECKPOINT_NAME}",
+                ],
+            )
+
+        if explicit_avae_path:
+            avae_path = _resolve_model_file(explicit_avae_path, model_root)
+        else:
+            tokenizer_dir = Path(model_root) / SOUND_TOKENIZER_COMPONENT_NAME if model_root else None
+            candidate = tokenizer_dir / SOUND_TOKENIZER_CHECKPOINT_NAME if tokenizer_dir else None
+            avae_path = str(candidate) if candidate and candidate.exists() else None
+
+        if not avae_path:
+            raise ValueError(
+                "Cosmos3 sound generation was requested, but no AVAE sound "
+                "tokenizer checkpoint was provided. Set "
+                "custom_pipeline_args['sound_tokenizer_path'] or "
+                "COSMOS3_SOUND_TOKENIZER_PATH, or include "
+                f"{SOUND_TOKENIZER_COMPONENT_NAME}/{SOUND_TOKENIZER_CHECKPOINT_NAME} under the model path."
+            )
+
+        config_path = _resolve_model_file(explicit_config_path, model_root)
+        if config_path is None and model_root:
+            candidate = Path(model_root) / SOUND_TOKENIZER_COMPONENT_NAME / "config.json"
+            config_path = str(candidate) if candidate.exists() else None
+        component_config = _load_sound_tokenizer_component_config(config_path)
+        component_values = _component_arch_values(component_config)
+
+        sample_rate = _resolve_arch_value(
+            od_config,
+            args,
+            component_values,
+            field="sample_rate",
+            custom_keys=("sound_sample_rate", "sample_rate"),
+            nested_keys=("sample_rate", "sampling_rate"),
+            top_level_keys=("sound_sample_rate", "sample_rate"),
+            default=DEFAULT_SOUND_SAMPLE_RATE,
+            cast=int,
+        )
+        audio_channels = _resolve_arch_value(
+            od_config,
+            args,
+            component_values,
+            field="audio_channels",
+            custom_keys=("sound_audio_channels", "audio_channels", "stereo"),
+            nested_keys=("audio_channels", "dec_out_channels", "stereo"),
+            top_level_keys=("sound_audio_channels", "audio_channels", "stereo"),
+            default=DEFAULT_SOUND_CHANNELS,
+            cast=_as_audio_channels,
+        )
+        sound_dim = _resolve_arch_value(
+            od_config,
+            args,
+            component_values,
+            field="io_channels",
+            custom_keys=("sound_dim", "io_channels", "latent_ch"),
+            nested_keys=("io_channels", "vocoder_input_dim", "latent_ch"),
+            top_level_keys=("sound_dim",),
+            default=DEFAULT_SOUND_DIM,
+            cast=int,
+        )
+        hop_size = _resolve_arch_value(
+            od_config,
+            args,
+            component_values,
+            field="hop_size",
+            custom_keys=("sound_hop_size", "hop_size"),
+            nested_keys=("hop_size",),
+            top_level_keys=("sound_hop_size", "hop_size"),
+            default=DEFAULT_SOUND_HOP_SIZE,
+            cast=int,
+        )
+        normalize_latents = _as_bool(
+            _resolve_normalization_value(
+                od_config,
+                args,
+                name="normalize_latents",
+                default=DEFAULT_SOUND_NORMALIZE_LATENTS,
+            )
+        )
+        normalization_type = str(
+            _resolve_normalization_value(
+                od_config,
+                args,
+                name="normalization_type",
+                default=DEFAULT_SOUND_NORMALIZATION_TYPE,
+            )
+        )
+        tanh_input_scale = float(
+            _resolve_normalization_value(
+                od_config,
+                args,
+                name="tanh_input_scale",
+                default=DEFAULT_SOUND_TANH_INPUT_SCALE,
+            )
+        )
+        tanh_output_scale = float(
+            _resolve_normalization_value(
+                od_config,
+                args,
+                name="tanh_output_scale",
+                default=DEFAULT_SOUND_TANH_OUTPUT_SCALE,
+            )
+        )
+        tanh_clamp = float(
+            _resolve_normalization_value(
+                od_config,
+                args,
+                name="tanh_clamp",
+                default=DEFAULT_SOUND_TANH_CLAMP,
+            )
+        )
+        tokenizer = Cosmos3AVAEAudioTokenizer(
+            checkpoint_path=str(avae_path),
+            config_path=config_path,
+            sample_rate=sample_rate,
+            audio_channels=audio_channels,
+            io_channels=sound_dim,
+            hop_size=hop_size,
+            normalize_latents=normalize_latents,
+            normalization_type=normalization_type,
+            tanh_input_scale=tanh_input_scale,
+            tanh_output_scale=tanh_output_scale,
+            tanh_clamp=tanh_clamp,
+            dtype=getattr(od_config, "dtype", torch.bfloat16),
+            device=get_local_device(),
+        )
+        if _is_rank_zero():
+            logger.info(
+                "Loaded Cosmos3 AVAE sound tokenizer from %s (sr=%d, channels=%d, latent_ch=%d, hop=%d)",
+                avae_path,
+                sample_rate,
+                audio_channels,
+                sound_dim,
+                hop_size,
+            )
+        return cls(tokenizer)
+
+    def get_latent_num_samples(self, num_audio_samples: int) -> int:
+        return int(self.tokenizer.get_latent_num_samples(num_audio_samples))
+
+    def get_audio_num_samples(self, num_latent_samples: int) -> int:
+        return int(self.tokenizer.get_audio_num_samples(num_latent_samples))
+
+    @torch.no_grad()
+    def decode(self, latents: torch.Tensor) -> torch.Tensor:
+        """Decode sound latents.
+
+        Args:
+            latents: ``[B, C, T]`` or ``[C, T]`` tensor.
+
+        Returns:
+            ``[B, audio_channels, N]`` tensor for batched input, or
+            ``[audio_channels, N]`` for unbatched input.
+        """
+        squeeze = latents.ndim == 2
+        if squeeze:
+            latents = latents.unsqueeze(0)
+        audio = self.tokenizer.decode(latents)
+        audio = audio.clamp(-1.0, 1.0)
+        return audio.squeeze(0) if squeeze else audio
diff --git a/vllm_omni/diffusion/models/cosmos3/transformer_cosmos3.py b/vllm_omni/diffusion/models/cosmos3/transformer_cosmos3.py
index 9dbaaad4b57..1f5d822367d 100644
--- a/vllm_omni/diffusion/models/cosmos3/transformer_cosmos3.py
+++ b/vllm_omni/diffusion/models/cosmos3/transformer_cosmos3.py
@@ -76,6 +76,51 @@ def _tf_config_get(config: Any, key: str, default: Any) -> Any:
     return getattr(config, key, default)
 
 
+def _nested_get(value: Any, key: str) -> Any:
+    """Depth-first lookup of ``key`` in nested dicts/lists/tuples; a dict's own entry wins over its children."""
+    if isinstance(value, dict):
+        if key in value:
+            return value[key]
+        children = value.values()
+    elif isinstance(value, list | tuple):
+        children = value
+    else:
+        return None
+    for child in children:
+        if (found := _nested_get(child, key)) is not None:
+            return found
+    return None
+
+
+def _od_config_get(od_config: Any, key: str, default: Any = None) -> Any:
+    """Read Cosmos3 options from runtime, model, or transformer config.
+
+    Dict sources are searched in order (``custom_pipeline_args``, ``model_config``,
+    ``tf_model_config``), each via ``_nested_get``, which tries the direct key before
+    descending. A ``None`` value counts as missing, so an unset runtime argument can
+    no longer mask a later source or bypass ``default``.
+
+    Returns ``default`` when ``od_config`` is None or no source yields a value.
+    """
+    if od_config is None:
+        return default
+    for attr in ("custom_pipeline_args", "model_config", "tf_model_config"):
+        source = getattr(od_config, attr, None)
+        if isinstance(source, dict):
+            found = _nested_get(source, key)
+            if found is not None:
+                return found
+    # Last resort: let _tf_config_get read the transformer config (covers non-dict configs).
+    value = _tf_config_get(getattr(od_config, "tf_model_config", None), key, None)
+    return default if value is None else value
+
+
+def _as_bool(value: Any) -> bool:
+    """Coerce to bool; strings are true only for 1/true/yes/on (stripped, case-insensitive)."""
+    is_text = isinstance(value, str)
+    return value.strip().lower() in {"1", "true", "yes", "on"} if is_text else bool(value)
+
+
 # ---------------------------------------------------------------------------
 # Rotary Position Embeddings (mRoPE)
 # ---------------------------------------------------------------------------
@@ -138,6 +183,30 @@ def compute_mrope_position_ids_vision(
     return mrope_ids, next_offset
 
 
+def compute_mrope_position_ids_sound(
+    grid_t: int,
+    temporal_offset: int | float,
+    sound_latent_fps: float,
+    base_fps: float = 24.0,
+    temporal_compression_factor_sound: int = 1,
+    enable_fps_modulation: bool = True,
+    base_temporal_compression_factor: int | None = None,
+) -> tuple[torch.Tensor, int | float]:
+    """Build mRoPE IDs for sound tokens by delegating to the vision helper with a (T, 1, 1) grid."""
+    del base_temporal_compression_factor  # accepted but ignored: the sound factor is passed as its own base below
+    return compute_mrope_position_ids_vision(
+        grid_t=grid_t,
+        grid_h=1,
+        grid_w=1,
+        temporal_offset=temporal_offset,
+        fps=sound_latent_fps,
+        base_fps=base_fps,
+        temporal_compression_factor=temporal_compression_factor_sound,
+        base_temporal_compression_factor=temporal_compression_factor_sound,
+        enable_fps_modulation=enable_fps_modulation,
+    )
+
+
 class Qwen3VLTextRotaryEmbedding(nn.Module):
     """Multi-dimensional rotary position embedding for Qwen3-VL."""
 
@@ -875,9 +944,25 @@ def __init__(
         self.latent_channel_size = int(_tf_config_get(model_config, "latent_channel", 48))
         self.timestep_scale = float(_tf_config_get(model_config, "timestep_scale", 0.001))
         self.base_fps = float(_tf_config_get(model_config, "base_fps", 24.0))
+        sound_gen_value = _od_config_get(od_config, "sound_gen", None)
+        sound_dim_value = _od_config_get(od_config, "sound_dim", None)
+        if sound_dim_value is None:
+            sound_dim_value = _od_config_get(od_config, "io_channels", None)
+        if sound_dim_value is None:
+            sound_dim_value = _od_config_get(od_config, "vocoder_input_dim", None)
+        if sound_dim_value is None:
+            sound_dim_value = _od_config_get(od_config, "latent_ch", None)
+        self.sound_gen = _as_bool(sound_gen_value) if sound_gen_value is not None else sound_dim_value is not None
+        from .sound_tokenizer import get_sound_dim, get_sound_latent_fps
+
+        self.sound_dim = int(sound_dim_value if sound_dim_value is not None else get_sound_dim(od_config))
+        self.sound_latent_fps = float(get_sound_latent_fps(od_config))
         if temporal_compression_factor is None:
             temporal_compression_factor = _tf_config_get(model_config, "temporal_compression_factor", 4)
         self.temporal_compression_factor = int(temporal_compression_factor)
+        self.temporal_compression_factor_sound = int(
+            _tf_config_get(model_config, "temporal_compression_factor_sound", 1)
+        )
         self.enable_fps_modulation = bool(_tf_config_get(model_config, "enable_fps_modulation", True))
         self.temporal_modality_margin = int(
             _tf_config_get(
@@ -910,6 +995,12 @@ def __init__(
         self.proj_in = nn.Linear(self.patch_latent_dim, self.hidden_size)
         self.proj_out = nn.Linear(self.hidden_size, self.patch_latent_dim)
         self.time_embedder = TimestepEmbedder(self.hidden_size, target_dtype=dtype)
+        if self.sound_gen:  # sound projections and modality embedding exist only when sound_gen is set
+            self.audio_proj_in = nn.Linear(self.sound_dim, self.hidden_size)
+            self.audio_proj_out = nn.Linear(self.hidden_size, self.sound_dim)
+            self.audio_modality_embed = nn.Parameter(torch.zeros(self.hidden_size))
+        # NOTE(review): the line below rebuilds time_embedder (already set with target_dtype=dtype) — confirm intent.
+        self.time_embedder = TimestepEmbedder(self.hidden_size, target_dtype=torch.bfloat16)
 
         self.gen_layers = nn.ModuleList(
             [
@@ -987,6 +1078,21 @@ def unpatchify(self, tokens: torch.Tensor, t: int, h: int, w: int) -> torch.Tens
             x = x[:, :, :, :h, :w]
         return x
 
+    def pack_sound(self, sound_latents: torch.Tensor) -> torch.Tensor:
+        """Validate ``[B, C_sound, T_sound]`` latents and return them as contiguous ``[B, T_sound, C_sound]``."""
+        shape = tuple(sound_latents.shape)
+        if len(shape) != 3:
+            raise ValueError(f"Cosmos3 sound latents must have shape [B, C, T], got {shape}.")
+        channels = shape[1]
+        if channels != self.sound_dim:
+            raise ValueError(f"Cosmos3 sound latent channel mismatch: expected {self.sound_dim}, got {channels}.")
+        return sound_latents.permute(0, 2, 1).contiguous()
+
+    @staticmethod
+    def unpack_sound(tokens: torch.Tensor) -> torch.Tensor:
+        """Transpose ``[B, T_sound, C_sound]`` tokens back to a contiguous ``[B, C_sound, T_sound]`` tensor."""
+        return torch.permute(tokens, (0, 2, 1)).contiguous()
+
     # -- RoPE computation ----------------------------------------------------
 
     def _compute_rope_freqs(
@@ -998,12 +1104,14 @@ def _compute_rope_freqs(
         fps: float | None,
         device: torch.device,
         dtype: torch.dtype,
+        t_sound: int | None = None,
     ) -> tuple[tuple[torch.Tensor, torch.Tensor], tuple[torch.Tensor, torch.Tensor]]:
         """Compute mRoPE cos/sin for UND text and GEN media pathways."""
         B = text_mask.shape[0]
         S_text = text_mask.shape[1]
         text_lengths = text_mask.sum(dim=1).long()
         effective_fps = fps if fps is not None and t > 1 else None
+        sound_frames = int(t_sound or 0)
 
         text_pos_list = []
         gen_pos_list = []
@@ -1021,6 +1129,21 @@ def _compute_rope_freqs(
                 temporal_compression_factor=self.temporal_compression_factor,
                 enable_fps_modulation=self.enable_fps_modulation,
             )
+            gen_positions = [v_pos]
+            if sound_frames > 0:
+                s_pos, _ = compute_mrope_position_ids_sound(
+                    sound_frames,
+                    temporal_offset=media_temporal_offset,
+                    sound_latent_fps=self.sound_latent_fps,
+                    base_fps=self.base_fps,
+                    temporal_compression_factor_sound=getattr(self, "temporal_compression_factor_sound", 1),
+                    enable_fps_modulation=self.enable_fps_modulation,
+                )
+                gen_positions.append(s_pos)
+            pos_dtype = gen_positions[0].dtype
+            for pos in gen_positions[1:]:
+                pos_dtype = torch.promote_types(pos_dtype, pos.dtype)
+            v_pos = torch.cat([pos.to(pos_dtype) for pos in gen_positions], dim=1)
             if real_len < S_text:
                 t_pos = torch.cat(
                     [t_pos, torch.zeros(3, S_text - real_len, dtype=t_pos.dtype)],
@@ -1051,16 +1174,31 @@ def reset_cache(self) -> None:
     def _validate_gen_sequence_parallel(
         *,
         s_gen: int,
+        s_video: int,
+        s_sound: int,
+        has_sound: bool,
         ulysses_size: int,
     ) -> None:
         if ulysses_size <= 1 or s_gen % ulysses_size == 0:
             return
 
+        detail_parts = [f"video tokens {s_video}"]
+        if has_sound:
+            detail_parts.append(f"sound tokens {s_sound}")
+        detail = " = " + " + ".join(detail_parts) if len(detail_parts) > 1 else ""
         adjust_detail = (
-            "Adjust the spatial resolution so that t * ceil(h/patch) * ceil(w/patch) is a multiple of ulysses_degree."
+            "Adjust the spatial resolution, frame count, sound duration, "
+            "or sound latent FPS so the combined media sequence is a "
+            "multiple of ulysses_degree."
+            if has_sound
+            else (
+                "Adjust the spatial resolution so that "
+                "t * ceil(h/patch) * ceil(w/patch) is a multiple "
+                "of ulysses_degree."
+            )
         )
         raise ValueError(
-            f"GEN sequence length ({s_gen} video tokens) must be divisible by "
+            f"GEN sequence length ({s_gen}{detail}) must be divisible by "
             f"ulysses_degree ({ulysses_size}). {adjust_detail}"
         )
 
@@ -1074,9 +1212,10 @@ def forward(
         text_mask: torch.Tensor,
         video_shape: tuple[int, int, int],
         fps: float | None = None,
+        sound_latents: torch.Tensor | None = None,
         noisy_frame_mask: torch.Tensor | None = None,
         **kwargs,
-    ) -> torch.Tensor:
+    ) -> torch.Tensor | tuple[torch.Tensor, ...]:
         """
         Args:
             hidden_states: [B, C, t, h, w] noisy latents
@@ -1085,13 +1224,15 @@ def forward(
             text_mask: [B, S_text] attention mask (1=real, 0=pad)
             video_shape: (t, h, w) in latent space
             fps: video frame rate for temporal mRoPE modulation
+            sound_latents: Optional [B, C_sound, T_sound] noisy sound latents.
             noisy_frame_mask: Optional [B, 1, t, 1, 1] mask where 1=noisy (add
                 timestep embedding, predict velocity) and 0=conditioned (clean
                 context, skip timestep embedding).  None means all frames noisy
                 (T2V mode).
 
         Returns:
-            [B, C, t, h, w] velocity prediction.
+            [B, C, t, h, w] velocity prediction, or
+            tuple outputs in video, sound order when sound latents are provided.
         """
         t, h, w = video_shape
         hp, wp, _, _ = self._pad_to_patch_size(h, w)
@@ -1103,12 +1244,31 @@ def forward(
                 f"Cosmos3 requires identical real text lengths within a batch "
                 f"(got min={min_real_len}, max={max_real_len})."
             )
+        has_sound = sound_latents is not None
+        if has_sound and not self.sound_gen:
+            raise ValueError(
+                "Cosmos3 sound generation was requested, but this transformer "
+                "was initialized without sound modules. Check that the "
+                "transformer config enables sound_gen or defines sound_dim."
+            )
 
         # Query Ulysses state at runtime
         ulysses_size, _, _ = _get_ulysses_state()
 
         # Patchify latents and project to hidden space
         hidden_video = self.proj_in(self.patchify(hidden_states, t, h, w))
+        s_video = hidden_video.shape[1]
+        s_sound = 0
+        hidden_sound = None
+        if sound_latents is not None:
+            if sound_latents.shape[0] != hidden_states.shape[0]:
+                raise ValueError(
+                    "Cosmos3 sound and video batch sizes must match: "
+                    f"video={hidden_states.shape[0]}, sound={sound_latents.shape[0]}."
+                )
+            hidden_sound = self.audio_proj_in(self.pack_sound(sound_latents))
+            hidden_sound = hidden_sound + self.audio_modality_embed.to(hidden_sound.dtype)
+            s_sound = hidden_sound.shape[1]
 
         # Timestep embedding (fp32 for precision).
         # For I2V: only add to noisy tokens, not conditioned ones.
@@ -1131,7 +1291,12 @@ def forward(
         else:
             hidden_video = hidden_video + time_embed.unsqueeze(1)
 
-        hidden_gen = hidden_video
+        if hidden_sound is not None:
+            hidden_sound = hidden_sound + time_embed.unsqueeze(1)
+        hidden_parts = [hidden_video]
+        if hidden_sound is not None:
+            hidden_parts.append(hidden_sound)
+        hidden_gen = torch.cat(hidden_parts, dim=1)
 
         with torch.nn.attention.sdpa_kernel(self.sdpa_backends, set_priority=True):
             # Run UND pathway once and cache K/V (replicated across all ranks)
@@ -1144,6 +1309,7 @@ def forward(
                     fps,
                     hidden_states.device,
                     hidden_states.dtype,
+                    t_sound=s_sound,
                 )
                 cached_kv_full = self.language_model(text_ids, freqs_und)
                 self.cached_freqs_gen = freqs_gen
@@ -1159,6 +1325,9 @@ def forward(
                 raise RuntimeError("Cosmos3 GEN cache was not initialized before running GEN layers.")
             self._validate_gen_sequence_parallel(
                 s_gen=hidden_gen.shape[1],
+                s_video=s_video,
+                s_sound=s_sound,
+                has_sound=has_sound,
                 ulysses_size=ulysses_size,
             )
             freqs_cos, freqs_sin = self.cached_freqs_gen
@@ -1192,7 +1361,21 @@ def forward(
 
         # Final norm and project back to latent space
         hidden_gen = self.norm_moe_gen(hidden_gen)
-        return self.unpatchify(self.proj_out(hidden_gen), t, h, w)
+        if not has_sound:
+            return self.unpatchify(self.proj_out(hidden_gen), t, h, w)
+
+        split_sizes = [s_video]
+        if has_sound:
+            split_sizes.append(s_sound)
+        split_hidden = hidden_gen.split(split_sizes, dim=1)
+        hidden_video = split_hidden[0]
+        video_pred = self.unpatchify(self.proj_out(hidden_video), t, h, w)
+        outputs: list[torch.Tensor] = [video_pred]
+        split_idx = 1
+        if has_sound:
+            hidden_sound = split_hidden[split_idx]
+            outputs.append(self.unpack_sound(self.audio_proj_out(hidden_sound)))
+        return tuple(outputs)
 
     def post_load_weights(self) -> None:
         """Post-load processing: ensure correct dtypes."""
diff --git a/vllm_omni/entrypoints/openai/api_server.py b/vllm_omni/entrypoints/openai/api_server.py
index e28e7763c4d..d7757dd80b5 100644
--- a/vllm_omni/entrypoints/openai/api_server.py
+++ b/vllm_omni/entrypoints/openai/api_server.py
@@ -2517,6 +2517,8 @@ async def _parse_video_form(
     flow_shift: float | None = Form(default=None),
     true_cfg_scale: float | None = Form(default=None),
     seed: int | None = Form(default=None),
+    generate_sound: bool | None = Form(default=None),
+    sound_duration: float | None = Form(default=None, gt=0.0),
     negative_prompt: str | None = Form(default=None),
     enable_frame_interpolation: bool | None = Form(default=None),
     frame_interpolation_exp: int | None = Form(default=None, ge=1),
@@ -2557,6 +2559,8 @@ async def _parse_video_form(
         "flow_shift": flow_shift,
         "true_cfg_scale": true_cfg_scale,
         "seed": seed,
+        "generate_sound": generate_sound,
+        "sound_duration": sound_duration,
         "negative_prompt": negative_prompt,
         "enable_frame_interpolation": enable_frame_interpolation,
         "frame_interpolation_exp": frame_interpolation_exp,
diff --git a/vllm_omni/entrypoints/openai/protocol/videos.py b/vllm_omni/entrypoints/openai/protocol/videos.py
index d46c8d43d6b..887e3ce67ea 100644
--- a/vllm_omni/entrypoints/openai/protocol/videos.py
+++ b/vllm_omni/entrypoints/openai/protocol/videos.py
@@ -149,6 +149,15 @@ class VideoGenerationRequest(BaseModel):
         description="True CFG scale (model-specific parameter, may be ignored if not supported)",
     )
     seed: int | None = Field(default=None, description="Random seed for reproducibility")
+    generate_sound: bool = Field(
+        default=False,
+        description="Request model-generated audio for video models that support sound generation.",
+    )
+    sound_duration: float | None = Field(
+        default=None,
+        gt=0.0,
+        description="Duration in seconds for model-generated audio. Defaults to the generated video duration.",
+    )
 
     # vllm-omni extensions for post-generation frame interpolation.
     enable_frame_interpolation: bool = Field(
diff --git a/vllm_omni/entrypoints/openai/serving_video.py b/vllm_omni/entrypoints/openai/serving_video.py
index b6ed49996fe..57a76594a0f 100644
--- a/vllm_omni/entrypoints/openai/serving_video.py
+++ b/vllm_omni/entrypoints/openai/serving_video.py
@@ -148,6 +148,10 @@ async def _run_and_extract(
         )
         if "flow_shift" in provided_fields and request.flow_shift is not None:
             gen_params.extra_args["flow_shift"] = request.flow_shift
+        if "generate_sound" in provided_fields:
+            gen_params.extra_args["generate_sound"] = request.generate_sound
+        if "sound_duration" in provided_fields and request.sound_duration is not None:
+            gen_params.extra_args["sound_duration"] = request.sound_duration
 
         # Apply model-specific extra parameters
         if request.extra_params is not None:

From 2d0725e030b52baa74f5a5537cec52b221607f17 Mon Sep 17 00:00:00 2001
From: Maciej Bala <mbala@nvidia.com>
Date: Thu, 28 May 2026 18:01:54 +0200
Subject: [PATCH 41/41] Added action generation

Signed-off-by: Maciej Bala <mbala@nvidia.com>
---
 docs/models/supported_models.md               |   2 +-
 tests/diffusion/models/cosmos3/conftest.py    |   7 +
 .../models/cosmos3/test_cosmos3_pipeline.py   |  69 ++-
 .../cosmos3/test_cosmos3_transformer.py       |  63 ++-
 .../openai_api/test_video_server.py           | 109 ++++
 vllm_omni/diffusion/models/cosmos3/action.py  | 217 ++++++++
 .../models/cosmos3/pipeline_cosmos3.py        | 501 ++++++++++++++++--
 .../models/cosmos3/transformer_cosmos3.py     | 184 ++++++-
 vllm_omni/entrypoints/openai/api_server.py    |   5 +-
 .../entrypoints/openai/protocol/__init__.py   |   2 +
 .../entrypoints/openai/protocol/videos.py     |  13 +
 vllm_omni/entrypoints/openai/serving_video.py | 119 ++++-
 12 files changed, 1209 insertions(+), 82 deletions(-)
 create mode 100644 vllm_omni/diffusion/models/cosmos3/action.py

diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md
index 9101265a866..ebebf554b69 100644
--- a/docs/models/supported_models.md
+++ b/docs/models/supported_models.md
@@ -33,7 +33,7 @@ th {
 | `ZImagePipeline` | Z-Image | `Tongyi-MAI/Z-Image-Turbo` | ✅︎ | ✅︎ | ✅︎ | ✅︎ |
 | `WanPipeline` | Wan2.1-T2V, Wan2.2-T2V, Wan2.2-TI2V | `Wan-AI/Wan2.1-T2V-1.3B-Diffusers`, `Wan-AI/Wan2.1-T2V-14B-Diffusers`, `Wan-AI/Wan2.2-T2V-A14B-Diffusers`, `Wan-AI/Wan2.2-TI2V-5B-Diffusers` | ✅︎ | ✅︎ | ✅︎ | ✅︎ |
 | `WanImageToVideoPipeline` | Wan2.2-I2V | `Wan-AI/Wan2.2-I2V-A14B-Diffusers` | ✅︎ | ✅︎ | ✅︎ | ✅︎ |
-| `Cosmos3OmniDiffusersPipeline` | Cosmos3 T2I, T2V, I2V, T2V with sound | `nvidia/Cosmos3-Nano` | ✅︎ | | | |
+| `Cosmos3OmniDiffusersPipeline` | Cosmos3 T2I, T2V, I2V, T2V with sound, action policy | `nvidia/Cosmos3-Nano` | ✅︎ | | | |
 | `WanSpeechToVideoPipeline` | Wan2.2-S2V | `Wan-AI/Wan2.2-S2V-14B` | ✅︎ | ✅︎ | ✅︎ | ✅︎ |
 | `Wan22VACEPipeline` | Wan2.1-VACE | `Wan-AI/Wan2.1-VACE-1.3B-diffusers`, `Wan-AI/Wan2.1-VACE-14B-diffusers` | ✅︎ | ✅︎ | ✅︎ | ✅︎ |
 | `LTX2Pipeline` | LTX-2-T2V | `Lightricks/LTX-2` | ✅︎ | ✅︎ | | |
diff --git a/tests/diffusion/models/cosmos3/conftest.py b/tests/diffusion/models/cosmos3/conftest.py
index 7075065447c..80a7105d2ca 100644
--- a/tests/diffusion/models/cosmos3/conftest.py
+++ b/tests/diffusion/models/cosmos3/conftest.py
@@ -77,11 +77,15 @@ def __init__(
         latent_channel_size: int = 2,
         sound_gen: bool = False,
         sound_dim: int = 3,
+        action_gen: bool = False,
+        action_dim: int = 4,
     ) -> None:
         super().__init__()
         self.latent_channel_size = latent_channel_size
         self.sound_gen = sound_gen
         self.sound_dim = sound_dim
+        self.action_gen = action_gen
+        self.action_dim = action_dim
         self.cached_kv: Any | None = None
         self.cached_freqs_gen: Any | None = None
         self.calls: list[dict[str, Any]] = []
@@ -116,7 +120,10 @@ def forward(
             marker = torch.tensor([token], dtype=torch.float32)
             self.cached_kv = [(marker, marker + 100)]
             self.cached_freqs_gen = (marker + 200, marker + 300)
+        action_latents = kwargs.get("action_latents")
         outputs: list[torch.Tensor] = [torch.full_like(hidden_states, float(token))]
+        if action_latents is not None:
+            outputs.append(torch.full_like(action_latents, float(token + 20)))
         if sound_latents is not None:
             outputs.append(torch.full_like(sound_latents, float(token + 10)))
         return outputs[0] if len(outputs) == 1 else tuple(outputs)
diff --git a/tests/diffusion/models/cosmos3/test_cosmos3_pipeline.py b/tests/diffusion/models/cosmos3/test_cosmos3_pipeline.py
index b4471973b7d..52d47f8a2ed 100644
--- a/tests/diffusion/models/cosmos3/test_cosmos3_pipeline.py
+++ b/tests/diffusion/models/cosmos3/test_cosmos3_pipeline.py
@@ -48,7 +48,7 @@ def test_pipeline_registered_and_exported() -> None:
     assert "Cosmos3OmniDiffusersPipeline" in cosmos3.__all__
 
 
-def test_preprocess_i2v_image_input() -> None:
+def test_preprocess_i2v_image_and_action_video_inputs() -> None:
     from vllm_omni.diffusion.models.cosmos3.pipeline_cosmos3 import get_cosmos3_pre_process_func
 
     preprocess = get_cosmos3_pre_process_func(SimpleNamespace())
@@ -61,6 +61,16 @@ def test_preprocess_i2v_image_input() -> None:
     assert (result.sampling_params.height, result.sampling_params.width) == (672, 1344)
     assert tuple(result.prompts[0]["additional_information"]["preprocessed_image"].shape[-2:]) == (672, 1344)
 
+    frames = [Image.new("RGB", (8, 4), color) for color in ("red", "green", "blue")]
+    action = SimpleNamespace(
+        prompts=[{"prompt": "Move.", "multi_modal_data": {"video": frames}}],
+        sampling_params=SimpleNamespace(height=16, width=32, extra_args={"action_mode": "forward_dynamics"}),
+    )
+
+    additional = preprocess(action).prompts[0]["additional_information"]
+    assert tuple(additional["preprocessed_image"].shape) == (1, 3, 16, 32)
+    assert tuple(additional["preprocessed_video"].shape) == (1, 3, 3, 16, 32)
+
 
 def test_postprocess_handles_image_video_audio_and_validation() -> None:
     from vllm_omni.diffusion.models.cosmos3.pipeline_cosmos3 import get_cosmos3_post_process_func
@@ -127,7 +137,7 @@ def test_prompt_formatting_and_checkpoint_key_remap(make_cosmos3_pipeline) -> No
     assert {key: Cosmos3OmniDiffusersPipeline._remap_ckpt_key(key) for key in remaps} == remaps
 
 
-def test_prepare_latents_for_video_image_and_sound(make_cosmos3_pipeline) -> None:
+def test_prepare_latents_for_video_image_sound_and_action(make_cosmos3_pipeline) -> None:
     pipeline = make_cosmos3_pipeline()
     latents = pipeline._prepare_latents(16, 24, 5, torch.Generator(device="cpu").manual_seed(0))
     assert latents.shape == (1, 2, 2, 2, 3)
@@ -156,8 +166,20 @@ def test_prepare_latents_for_video_image_and_sound(make_cosmos3_pipeline) -> Non
     assert (sound_latents.shape, latent_frames) == (torch.Size([1, 3, 6]), 6)
     assert pipeline._decode_sound_latents(torch.zeros(1, 3, 6), target_audio_samples=21).shape == (1, 2, 21)
 
+    pipeline.transformer = pipeline.transformer.__class__(action_gen=True, action_dim=4)
+    action, action_mask, clean, raw_dim = pipeline._prepare_action_latents(
+        mode="forward_dynamics",
+        action_chunk_size=2,
+        raw_action_dim=None,
+        generator=torch.Generator(device="cpu").manual_seed(0),
+        sp=SimpleNamespace(extra_args={"action": [[1.0, 2.0], [3.0, 4.0]]}),
+    )
+    assert raw_dim == 2
+    assert action_mask.tolist() == [[[0.0], [0.0]]]
+    torch.testing.assert_close(action, clean)
+
 
-def test_diffuse_covers_cfg_i2v_and_sound_steps(make_cosmos3_pipeline) -> None:
+def test_diffuse_covers_cfg_i2v_and_multimodal_steps(make_cosmos3_pipeline) -> None:
     pipeline = make_cosmos3_pipeline()
     latents = torch.zeros(1, 2, 1, 1, 1)
 
@@ -189,20 +211,22 @@ def test_diffuse_covers_cfg_i2v_and_sound_steps(make_cosmos3_pipeline) -> None:
     )
     torch.testing.assert_close(i2v[:, :, 0:1], torch.full((1, 2, 1, 1, 1), 7.0))
 
-    pipeline.transformer = pipeline.transformer.__class__(latent_channel_size=2, sound_gen=True, sound_dim=3)
-    video_result, sound_result = pipeline.diffuse(
+    pipeline.transformer = pipeline.transformer.__class__(latent_channel_size=2, action_gen=True, action_dim=4)
+    video_result, action_result = pipeline.diffuse(
         latents=latents,
-        sound_latents=torch.zeros(1, 3, 4),
+        action_latents=torch.zeros(1, 3, 4),
+        action_velocity_mask=torch.ones(1, 3, 1),
+        action_condition_latents=torch.zeros(1, 3, 4),
         timesteps=torch.tensor([7, 3]),
         cond_ids=_ids(2),
         cond_mask=_mask(),
         uncond_ids=_ids(1),
         uncond_mask=_mask(),
         guidance_scale=1.0,
-        shared_kwargs={"video_shape": (1, 1, 1), "fps": 24.0},
+        shared_kwargs={"video_shape": (1, 1, 1), "fps": 24.0, "action_domain_ids": torch.tensor([0])},
     )
     torch.testing.assert_close(video_result, torch.full_like(latents, 4.0))
-    torch.testing.assert_close(sound_result, torch.full((), 24.0).expand_as(sound_result))
+    torch.testing.assert_close(action_result, torch.full((), 44.0).expand_as(action_result))
 
 
 class TestForwardRouting:
@@ -228,6 +252,8 @@ def fake_prepare(height, width, num_frames, generator):
         def fake_diffuse(**kwargs):
             captured["diffuse_calls"].append(kwargs)
             outputs = [kwargs["latents"] + len(captured["diffuse_calls"])]
+            if kwargs.get("action_latents") is not None:
+                outputs.append(kwargs["action_latents"] + 3.0)
             if kwargs.get("sound_latents") is not None:
                 outputs.append(kwargs["sound_latents"] + 2.0)
             return outputs[0] if len(outputs) == 1 else tuple(outputs)
@@ -278,7 +304,7 @@ def test_forward_defaults_and_mode_selection(
         assert captured["flow_shifts"] == expected["flow"]
         assert captured["scheduler_steps"] == expected["steps"]
 
-    def test_forward_i2v_and_sound_routes(self, make_cosmos3_pipeline) -> None:
+    def test_forward_i2v_sound_and_action_routes(self, make_cosmos3_pipeline) -> None:
         pipeline = make_cosmos3_pipeline()
         captured = self._install_forward_stubs(pipeline)
         image_tensor = torch.zeros(1, 3, 16, 16)
@@ -317,6 +343,31 @@ def test_forward_i2v_and_sound_routes(self, make_cosmos3_pipeline) -> None:
         assert captured["diffuse_calls"][-1]["sound_latents"] is sound_latents
         assert output.output["audio_sample_rate"] == 10
 
+        pipeline.transformer = pipeline.transformer.__class__(latent_channel_size=2, action_gen=True, action_dim=4)
+        output = pipeline.forward(
+            SimpleNamespace(
+                prompts=[
+                    {
+                        "prompt": "Pick the block.",
+                        "modalities": ["video"],
+                        "additional_information": {"preprocessed_image": image_tensor},
+                    }
+                ],
+                sampling_params=make_sampling_params(
+                    height=16,
+                    width=16,
+                    extra_args={
+                        "action_mode": "policy",
+                        "action_chunk_size": 2,
+                        "raw_action_dim": 2,
+                        "domain_name": "bridge_orig_lerobot",
+                    },
+                ),
+            )
+        )
+        assert captured["diffuse_calls"][-1]["shared_kwargs"]["action_domain_ids"].tolist() == [7]
+        assert output.custom_output["action"].shape == (1, 2, 2)
+
     @pytest.mark.parametrize(
         ("prompt", "sampling_params", "message"),
         [
diff --git a/tests/diffusion/models/cosmos3/test_cosmos3_transformer.py b/tests/diffusion/models/cosmos3/test_cosmos3_transformer.py
index 38db56e0c26..d2f22b81760 100644
--- a/tests/diffusion/models/cosmos3/test_cosmos3_transformer.py
+++ b/tests/diffusion/models/cosmos3/test_cosmos3_transformer.py
@@ -29,8 +29,9 @@ def _tiny_cosmos3_config(**overrides):
     return config
 
 
-def test_mrope_position_ids_cover_text_video_and_sound() -> None:
+def test_mrope_position_ids_cover_text_video_sound_and_action() -> None:
     from vllm_omni.diffusion.models.cosmos3.transformer_cosmos3 import (
+        compute_mrope_position_ids_action,
         compute_mrope_position_ids_sound,
         compute_mrope_position_ids_text,
         compute_mrope_position_ids_vision,
@@ -61,6 +62,10 @@ def test_mrope_position_ids_cover_text_video_and_sound() -> None:
     torch.testing.assert_close(sound_ids[0], torch.tensor([10.0, 10.96, 11.92]))
     assert sound_offset == 12
 
+    action_ids, action_offset = compute_mrope_position_ids_action(3, temporal_offset=10, action_fps=None)
+    assert action_ids.tolist() == [[11, 12, 13], [0, 0, 0], [0, 0, 0]]
+    assert action_offset == 14
+
 
 @pytest.mark.parametrize(
     ("key", "value"),
@@ -126,7 +131,7 @@ def test_forward_returns_video_prediction(monkeypatch: pytest.MonkeyPatch) -> No
     assert tuple(output.shape) == (1, 2, 1, 2, 2)
 
 
-def test_sound_modules_follow_config() -> None:
+def test_sound_and_action_modules_follow_config() -> None:
     from vllm_omni.diffusion.models.cosmos3.transformer_cosmos3 import Cosmos3VFMTransformer
 
     tiny = _tiny_cosmos3_config()
@@ -139,48 +144,74 @@ def test_sound_modules_follow_config() -> None:
             dtype=torch.float32,
         )
     )
+    with_action = Cosmos3VFMTransformer(
+        SimpleNamespace(
+            tf_model_config={**tiny, "action_gen": True, "max_action_dim": 6, "num_embodiment_domains": 9},
+            dtype=torch.float32,
+        )
+    )
 
     assert no_modal.sound_gen is False
+    assert no_modal.action_gen is False
     assert not hasattr(no_modal, "audio_proj_in")
+    assert not hasattr(no_modal, "action_proj_in")
     assert with_sound.sound_dim == 5
     assert with_sound.sound_latent_fps == 40.0
     assert with_sound.audio_proj_in.in_features == 5
+    assert with_action.action_dim == 6
+    assert with_action.action_proj_in.num_domains == 9
 
 
-def test_sound_pack_unpack_validate_shapes() -> None:
+def test_sound_and_action_pack_unpack_validate_shapes() -> None:
     from vllm_omni.diffusion.models.cosmos3.transformer_cosmos3 import Cosmos3VFMTransformer
 
     model = object.__new__(Cosmos3VFMTransformer)
     nn.Module.__init__(model)
     model.sound_dim = 3
+    model.action_dim = 3
 
     sound = torch.arange(2 * 3 * 4, dtype=torch.float32).reshape(2, 3, 4)
+    action = torch.arange(2 * 4 * 3, dtype=torch.float32).reshape(2, 4, 3)
     torch.testing.assert_close(model.unpack_sound(model.pack_sound(sound)), sound)
+    torch.testing.assert_close(model.unpack_action(model.pack_action(action)), action)
 
     with pytest.raises(ValueError, match="channel mismatch"):
         model.pack_sound(torch.zeros(1, 4, 2))
+    with pytest.raises(ValueError, match="dimension mismatch"):
+        model.pack_action(torch.zeros(1, 2, 4))
 
 
-def test_forward_returns_video_and_sound_predictions() -> None:
+@pytest.mark.parametrize(
+    ("config", "extra_kwargs", "expected_shapes"),
+    [
+        (
+            _tiny_cosmos3_config(sound_gen=True, sound_dim=3, sound_latent_fps=24.0),
+            {"sound_latents": torch.zeros(1, 3, 4)},
+            [(1, 2, 1, 2, 2), (1, 3, 4)],
+        ),
+        (
+            _tiny_cosmos3_config(action_gen=True, max_action_dim=3, num_embodiment_domains=4),
+            {"action_latents": torch.zeros(1, 5, 3), "action_domain_ids": torch.tensor([2])},
+            [(1, 2, 1, 2, 2), (1, 5, 3)],
+        ),
+    ],
+)
+def test_forward_returns_video_plus_optional_modality_predictions(config, extra_kwargs, expected_shapes) -> None:
     from vllm_omni.diffusion.models.cosmos3.transformer_cosmos3 import Cosmos3VFMTransformer
 
-    output = Cosmos3VFMTransformer(
-        SimpleNamespace(
-            tf_model_config=_tiny_cosmos3_config(sound_gen=True, sound_dim=3, sound_latent_fps=24.0),
-            dtype=torch.float32,
-        )
-    )(
+    output = Cosmos3VFMTransformer(SimpleNamespace(tf_model_config=config, dtype=torch.float32))(
         hidden_states=torch.zeros(1, 2, 1, 2, 2),
         timestep=torch.tensor([1.0]),
         text_ids=torch.tensor([[1, 2]], dtype=torch.long),
         text_mask=torch.ones(1, 2, dtype=torch.long),
         video_shape=(1, 2, 2),
         fps=24.0,
-        sound_latents=torch.zeros(1, 3, 4),
+        action_noisy_mask=torch.ones(1, 5, 1),
+        **extra_kwargs,
     )
 
     assert isinstance(output, tuple)
-    assert [tuple(tensor.shape) for tensor in output] == [(1, 2, 1, 2, 2), (1, 3, 4)]
+    assert [tuple(tensor.shape) for tensor in output] == expected_shapes
 
 
 def test_forward_with_sound_ulysses_error_mentions_combined_sequence(monkeypatch: pytest.MonkeyPatch) -> None:
@@ -203,7 +234,7 @@ def test_forward_with_sound_ulysses_error_mentions_combined_sequence(monkeypatch
         )
 
 
-def test_compute_rope_freqs_places_text_video_and_sound_positions() -> None:
+def test_compute_rope_freqs_places_text_video_action_and_sound_positions() -> None:
     from vllm_omni.diffusion.models.cosmos3.transformer_cosmos3 import Cosmos3VFMTransformer
 
     class FakeRotary:
@@ -251,9 +282,11 @@ def __call__(self, x, position_ids):
         fps=24.0,
         device=torch.device("cpu"),
         dtype=torch.float32,
+        t_action=2,
+        action_start_frame_offset=1,
         t_sound=1,
     )
 
     _, gen_pos = rotary.position_ids
-    assert gen_pos.shape == (3, 1, 3)
-    assert gen_pos[0, 0].tolist() == [102, 103, 102]
+    assert gen_pos.shape == (3, 1, 5)
+    assert gen_pos[0, 0].tolist() == [102, 103, 103, 104, 102]
diff --git a/tests/entrypoints/openai_api/test_video_server.py b/tests/entrypoints/openai_api/test_video_server.py
index de1f14c7455..dcff3054b38 100644
--- a/tests/entrypoints/openai_api/test_video_server.py
+++ b/tests/entrypoints/openai_api/test_video_server.py
@@ -627,6 +627,115 @@ async def _generate(prompt, request_id, sampling_params_list):
 
     assert completed["stage_durations"] == {"diffuse": 2.5, "vae.decode": 0.3}
     assert completed["peak_memory_mb"] == 4096.5
+    assert completed["action"] is None
+
+
+def test_video_generation_response_exposes_action_payload(mocker: MockerFixture):
+    engine = FakeAsyncOmni()
+    handler = OmniOpenAIServingVideo.for_diffusion(
+        diffusion_engine=engine,
+        model_name="Cosmos3-8B-UVA",
+    )
+
+    async def _generate(prompt, request_id, sampling_params_list):
+        del prompt, request_id, sampling_params_list  # the stub ignores its inputs
+        import numpy as np
+
+        yield MockVideoResult(
+            [object()],
+            custom_output={
+                "action": np.array([[[1.5, 2.5], [3.5, 4.5]]], dtype=np.float32),  # shape (1, 2, 2)
+                "raw_action_dim": 2,
+                "action_mode": "policy",
+                "domain_id": 7,
+            },
+        )
+
+    engine.generate = _generate
+    mocker.patch(
+        "vllm_omni.entrypoints.openai.serving_video.encode_video_base64",
+        return_value="encoded-video",
+    )
+
+    response = asyncio.run(
+        handler.generate_videos(
+            VideoGenerationRequest(prompt="predict actions"),
+            "action-json",
+        )
+    )
+
+    action = response.data[0].action
+    assert action is not None
+    assert action.data == [[1.5, 2.5], [3.5, 4.5]]  # the leading batch axis of the array is gone
+    assert action.shape == [2, 2]
+    assert action.dtype == "float32"
+    assert action.raw_action_dim == 2
+    assert action.action_mode == "policy"
+    assert action.domain_id == 7
+    assert response.model_dump(mode="json")["data"][0]["action"]["data"] == [[1.5, 2.5], [3.5, 4.5]]
+
+
+def test_video_job_persists_action_metadata(test_client, mocker: MockerFixture):
+    engine = test_client.app.state.openai_serving_video._engine_client
+
+    async def _generate(prompt, request_id, sampling_params_list):
+        import numpy as np
+
+        engine.captured_prompt = prompt
+        engine.captured_sampling_params_list = sampling_params_list
+        yield MockVideoResult(
+            [object()],
+            custom_output={
+                "action": np.array([[[1.0, 2.0], [3.0, 4.0]]], dtype=np.float32),  # shape (1, 2, 2)
+                "raw_action_dim": 2,
+                "action_mode": "policy",
+                "domain_id": 7,
+            },
+        )
+
+    engine.generate = _generate
+    mocker.patch(
+        "vllm_omni.entrypoints.openai.serving_video._encode_video_bytes",
+        return_value=b"fake-video",
+    )
+
+    response = test_client.post("/v1/videos", data={"prompt": "profile me"})
+    assert response.status_code == 200
+    video_id = response.json()["id"]
+    completed = _wait_for_status(test_client, video_id, VideoGenerationStatus.COMPLETED.value)
+
+    expected_action = {  # compared against both the completed job and the job listing below
+        "data": [[1.0, 2.0], [3.0, 4.0]],
+        "shape": [2, 2],
+        "dtype": "float32",
+        "raw_action_dim": 2,
+        "action_mode": "policy",
+        "domain_id": 7,
+    }
+    assert completed["action"] == expected_action
+
+    listed = test_client.get("/v1/videos").json()
+    assert listed["data"][0]["action"] == expected_action
+
+
+def test_action_extraction_accepts_unbatched_action():
+    import numpy as np
+
+    result = MockVideoResult(
+        [object()],
+        custom_output={
+            "action": np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32),  # 2-D: no batch axis
+            "raw_action_dim": 2,
+            "action_mode": "policy",
+            "domain_id": 7,
+        },
+    )
+
+    actions = OmniOpenAIServingVideo._extract_action_outputs(result, expected_count=1)
+
+    assert actions[0] is not None
+    assert actions[0].data == [[1.0, 2.0], [3.0, 4.0]]  # the 2-D input comes back unchanged
+    assert actions[0].shape == [2, 2]
 
 
 def test_missing_handler_returns_503():
diff --git a/vllm_omni/diffusion/models/cosmos3/action.py b/vllm_omni/diffusion/models/cosmos3/action.py
new file mode 100644
index 00000000000..e2572bbb733
--- /dev/null
+++ b/vllm_omni/diffusion/models/cosmos3/action.py
@@ -0,0 +1,217 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Action-token helpers for Cosmos3 UVA/action generation."""
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+from typing import Any
+
+import numpy as np
+import torch
+
+ACTION_MODE_POLICY = "policy"  # conditions on latent frame 0 only; no action steps are given
+ACTION_MODE_FORWARD_DYNAMICS = "forward_dynamics"  # conditions on latent frame 0 and on every action step
+ACTION_MODE_INVERSE_DYNAMICS = "inverse_dynamics"  # conditions on every latent frame; no action steps are given
+ACTION_MODES = {  # the only values normalize_action_mode() accepts
+    ACTION_MODE_POLICY,
+    ACTION_MODE_FORWARD_DYNAMICS,
+    ACTION_MODE_INVERSE_DYNAMICS,
+}
+
+
+EMBODIMENT_TO_DOMAIN_ID: dict[str, int] = {  # keys are matched after strip().lower() in resolve_domain_id()
+    "no_action": 0,  # 0 is also what resolve_domain_id() returns when no domain is supplied
+    "av": 1,
+    "camera_pose": 2,
+    "hand_pose": 3,
+    "pusht": 4,
+    "libero": 5,
+    "umi": 6,
+    "bridge_orig_lerobot": 7,
+    "droid_lerobot": 8,
+    "robomind-franka": 8,  # shares id 8 with droid_lerobot
+    "galbot": 9,
+    "robomind-franka-dual": 12,
+    "robomind-ur": 13,
+    "agibotworld": 15,
+    "agibot_gear_gripper": 15,  # the three agibot* names share id 15
+    "agibot_gear_gripper_ext": 15,
+    "fractal": 20,
+}
+
+
+VIDEO_RES_SIZE_INFO: dict[str, dict[str, tuple[int, int]]] = {  # resolution -> aspect key -> (width, height)
+    "256": {
+        "1,1": (256, 256),
+        "4,3": (320, 256),
+        "3,4": (256, 320),
+        "16,9": (320, 192),
+        "9,16": (192, 320),
+    },
+    "480": {  # used by the pipeline preprocessor when the request names no resolution
+        "1,1": (640, 640),
+        "4,3": (736, 544),
+        "3,4": (544, 736),
+        "16,9": (832, 480),
+        "9,16": (480, 832),
+    },
+    "704": {
+        "1,1": (960, 960),
+        "4,3": (1088, 832),
+        "3,4": (832, 1088),
+        "16,9": (1280, 704),
+        "9,16": (704, 1280),
+    },
+    "720": {
+        "1,1": (960, 960),
+        "4,3": (1104, 832),
+        "3,4": (832, 1104),
+        "16,9": (1280, 720),
+        "9,16": (720, 1280),
+    },
+}
+
+
+def normalize_action_mode(mode: Any) -> str | None:
+    """Return *mode* stripped and lower-cased, or None when it is None or blank.
+
+    Raises ValueError when the cleaned value is not one of ACTION_MODES.
+    """
+    cleaned = "" if mode is None else str(mode).strip().lower()
+    if cleaned and cleaned not in ACTION_MODES:
+        raise ValueError(f"Unsupported Cosmos3 action_mode={mode!r}; expected one of {sorted(ACTION_MODES)}.")
+    return cleaned or None
+
+
+def resolve_domain_id(
+    *,
+    domain_id: Any = None,
+    domain_name: Any = None,
+    require_explicit: bool = False,
+) -> int:
+    """Resolve the embodiment domain id: explicit *domain_id* first, then *domain_name*, else 0."""
+    if domain_id is not None:
+        numeric_id = int(domain_id)
+        if numeric_id < 0:
+            raise ValueError(f"Cosmos3 domain_id must be non-negative, got {numeric_id}.")
+        return numeric_id
+    name = "" if domain_name is None else str(domain_name).strip()
+    if name:
+        # Names are matched case-insensitively against the embodiment table.
+        known_id = EMBODIMENT_TO_DOMAIN_ID.get(name.lower())
+        if known_id is None:
+            expected = f"expected one of {sorted(EMBODIMENT_TO_DOMAIN_ID)} or pass domain_id directly."
+            raise ValueError(f"Unknown Cosmos3 action domain_name={domain_name!r}; {expected}")
+        return known_id
+    if require_explicit:
+        raise ValueError(
+            "Cosmos3 action generation requires extra_args['domain_id'] or non-empty extra_args['domain_name']."
+        )
+    # Neither an id nor a usable name was supplied: fall back to domain 0.
+    return 0
+
+
+def action_condition_indexes(mode: str, action_length: int) -> list[int]:
+    """List the action steps supplied as conditioning: all of them for forward dynamics, else none."""
+    canonical = normalize_action_mode(mode)
+    if canonical not in {ACTION_MODE_POLICY, ACTION_MODE_FORWARD_DYNAMICS, ACTION_MODE_INVERSE_DYNAMICS}:
+        raise AssertionError(f"Unexpected action mode: {canonical!r}")
+    # Policy and inverse dynamics get no action steps as input.
+    return list(range(action_length)) if canonical == ACTION_MODE_FORWARD_DYNAMICS else []
+
+
+def vision_condition_indexes(mode: str, video_length: int, temporal_compression_factor: int) -> list[int]:
+    """List the conditioned latent frames: every frame for inverse dynamics, only frame 0 otherwise."""
+    canonical = normalize_action_mode(mode)
+    latent_count = (video_length - 1) // temporal_compression_factor + 1
+    if canonical not in {ACTION_MODE_POLICY, ACTION_MODE_FORWARD_DYNAMICS, ACTION_MODE_INVERSE_DYNAMICS}:
+        raise AssertionError(f"Unexpected action mode: {canonical!r}")
+    # Inverse dynamics conditions on the whole latent clip; the other modes only on its first frame.
+    return list(range(latent_count)) if canonical == ACTION_MODE_INVERSE_DYNAMICS else [0]
+
+
+def action_start_frame_offset(mode: str, action_length: int, video_length: int) -> int:
+    """Return 1 when *action_length* is ``video_length - 1`` and 0 when they are equal, else raise."""
+    del mode  # accepted for call-site symmetry but not consulted
+    gap = video_length - action_length
+    if gap not in (0, 1):
+        raise ValueError(
+            "Cosmos3 action_chunk_size must equal num_frames - 1 or num_frames; "
+            f"got action_chunk_size={action_length}, num_frames={video_length}."
+        )
+    return int(gap)
+
+
+def build_action_condition_mask(
+    mode: str, action_length: int, *, device: torch.device, dtype: torch.dtype
+) -> torch.Tensor:
+    """Build a ``(1, action_length, 1)`` mask holding 1.0 at the conditioned action steps."""
+    # Allocate first so that an invalid length fails before the mode is inspected.
+    mask = torch.zeros(1, action_length, 1, device=device, dtype=dtype)
+    conditioned = action_condition_indexes(mode, action_length)
+    if conditioned:
+        # One indexed write covers every conditioned step.
+        mask[:, conditioned, :] = 1.0
+    return mask
+
+
+def build_vision_condition_mask(
+    mode: str, video_length: int, temporal_compression_factor: int, *, device: torch.device, dtype: torch.dtype
+) -> torch.Tensor:
+    """Build a ``(1, 1, latent_frames, 1, 1)`` mask holding 1.0 at the conditioned latent frames.
+
+    ``latent_frames`` is ``(video_length - 1) // temporal_compression_factor + 1``.
+    """
+    latent_count = (video_length - 1) // temporal_compression_factor + 1
+    mask = torch.zeros(1, 1, latent_count, 1, 1, device=device, dtype=dtype)
+    conditioned = vision_condition_indexes(mode, video_length, temporal_compression_factor)
+    if conditioned:
+        mask[:, :, conditioned] = 1.0
+    return mask
+
+
+def pad_action_to_dim(action: torch.Tensor, action_dim: int) -> torch.Tensor:
+    """Zero-pad the last axis of *action* up to *action_dim*; return it as-is when already that wide."""
+    missing = action_dim - action.shape[-1]
+    if missing < 0:
+        raise ValueError(f"Cosmos3 action dimension {action.shape[-1]} exceeds model action_dim={action_dim}.")
+    # new_zeros() inherits the dtype and device of *action*.
+    return action if missing == 0 else torch.cat([action, action.new_zeros((*action.shape[:-1], missing))], dim=-1)
+
+
+def load_action_tensor(action: Any = None, action_path: str | Path | None = None) -> torch.Tensor:
+    """Return *action*, or the JSON document at *action_path*, as a float32 tensor of shape ``[T, D]``."""
+    if action is None:
+        if action_path is None:
+            raise ValueError(
+                "Cosmos3 forward_dynamics action mode requires extra_args['action'] or extra_args['action_path']."
+            )
+        # JSON is UTF-8 by specification, so do not fall back to the locale's default encoding.
+        action = json.loads(Path(str(action_path)).read_text(encoding="utf-8"))
+    source = action.detach() if isinstance(action, torch.Tensor) else np.asarray(action)
+    tensor = torch.as_tensor(source, dtype=torch.float32)
+    if tensor.ndim == 3 and tensor.shape[0] == 1:
+        tensor = tensor.squeeze(0)  # accept a singleton leading batch axis
+    if tensor.ndim != 2:
+        raise ValueError(f"Cosmos3 action must have shape [T, D], got {tuple(tensor.shape)}.")
+    return tensor
+
+
+def find_closest_target_size(h: int, w: int, resolution: str | int) -> tuple[int, int]:
+    """Pick the ``(width, height)`` preset of *resolution* whose height/width ratio is nearest ``h / w``.
+
+    Ties keep the preset listed first. Raises ValueError for a resolution without presets.
+    """
+    presets = VIDEO_RES_SIZE_INFO.get(str(resolution))
+    if presets is None:
+        raise ValueError(
+            f"Unknown Cosmos3 action resolution={resolution!r}; expected one of {sorted(VIDEO_RES_SIZE_INFO)}."
+        )
+    target_ratio = h / w
+    def ratio_error(size: tuple[int, int]) -> float:
+        width, height = size
+        return abs(target_ratio - height / width)
+    # min() returns the first of equally good candidates.
+    return min(presets.values(), key=ratio_error)
diff --git a/vllm_omni/diffusion/models/cosmos3/pipeline_cosmos3.py b/vllm_omni/diffusion/models/cosmos3/pipeline_cosmos3.py
index 32c129c613f..45ef135086c 100644
--- a/vllm_omni/diffusion/models/cosmos3/pipeline_cosmos3.py
+++ b/vllm_omni/diffusion/models/cosmos3/pipeline_cosmos3.py
@@ -45,6 +45,19 @@
 from vllm_omni.diffusion.profiler.diffusion_pipeline_profiler import DiffusionPipelineProfilerMixin
 from vllm_omni.diffusion.request import OmniDiffusionRequest
 
+from .action import (
+    ACTION_MODE_FORWARD_DYNAMICS,
+    ACTION_MODE_INVERSE_DYNAMICS,
+    ACTION_MODE_POLICY,
+    action_start_frame_offset,
+    build_action_condition_mask,
+    build_vision_condition_mask,
+    find_closest_target_size,
+    load_action_tensor,
+    normalize_action_mode,
+    pad_action_to_dim,
+    resolve_domain_id,
+)
 from .transformer_cosmos3 import Cosmos3VFMTransformer
 
 logger = init_logger(__name__)
@@ -79,14 +92,67 @@ def get_cosmos3_pre_process_func(od_config: OmniDiffusionConfig):
     if is_guardrails_enabled(od_config):
         ensure_initialized(od_config)
 
+    def _extra_args(request: OmniDiffusionRequest) -> dict[str, Any]:
+        extra = getattr(getattr(request, "sampling_params", None), "extra_args", None)
+        return extra if isinstance(extra, dict) else {}  # missing or non-dict extra_args -> {}
+
+    def _request_action_mode(request: OmniDiffusionRequest) -> str | None:
+        return normalize_action_mode(_extra_args(request).get("action_mode"))  # None when unset or blank
+
+    def _set_action_size_from_image(request: OmniDiffusionRequest, image: PIL.Image.Image) -> tuple[int, int]:
+        """Fill a missing height/width from the closest action preset and return ``(height, width)``."""
+        params = request.sampling_params
+        if params.height is None or params.width is None:
+            options = _extra_args(request)
+            # "resolution" wins over "image_size"; 480 is used when neither is supplied.
+            preset = options["resolution"] if "resolution" in options else options.get("image_size", 480)
+            preset_w, preset_h = find_closest_target_size(image.height, image.width, preset)
+            if params.height is None:
+                params.height = preset_h
+            if params.width is None:
+                params.width = preset_w
+        return int(params.height), int(params.width)
+
     def _pil_to_rgb(value: Any) -> PIL.Image.Image:
         if isinstance(value, str):
             return PIL.Image.open(value).convert("RGB")
         if isinstance(value, PIL.Image.Image):
             return value.convert("RGB")
-        raise TypeError(f"Cosmos3 preprocessing expected PIL image or image path, got {type(value)!r}.")
+        raise TypeError(f"Cosmos3 action preprocessing expected PIL image or image path, got {type(value)!r}.")
+
+    def _resize_and_pad_action_image(image: PIL.Image.Image, target_h: int, target_w: int) -> PIL.Image.Image:
+        """Shrink *image* to fit the target (never enlarging it), then pad its bottom/right edges to size."""
+        # The 1.0 cap means inputs smaller than the target are padded instead of being upscaled.
+        shrink = min(target_w / image.width, target_h / image.height, 1.0)
+        new_w, new_h = (max(1, int(shrink * side + 0.5)) for side in image.size)
+        if image.size != (new_w, new_h):
+            image = image.resize((new_w, new_h), PIL.Image.Resampling.BICUBIC)
+        extra_h = target_h - new_h
+        extra_w = target_w - new_w
+        if min(extra_h, extra_w) < 0:
+            raise ValueError(
+                f"Cosmos3 action image resize exceeded target size: resized={(new_h, new_w)}, "
+                f"target={(target_h, target_w)}."
+            )
+        if not (extra_h or extra_w):
+            return image
+        # Reflect only when each pad is smaller than the resized side; otherwise repeat the edge pixels.
+        mode = "edge" if extra_h >= new_h or extra_w >= new_w else "reflect"
+        pixels = np.pad(np.asarray(image), ((0, extra_h), (0, extra_w), (0, 0)), mode=mode)
+        return PIL.Image.fromarray(pixels)
+
+    def _preprocess_action_image(image: PIL.Image.Image, target_h: int, target_w: int) -> torch.Tensor:
+        image = _resize_and_pad_action_image(image, target_h, target_w)  # exact target size, no cropping
+        return video_processor.preprocess(image, height=target_h, width=target_w)
+
+    def _preprocess_action_video(frames: list[Any], target_h: int, target_w: int) -> torch.Tensor:
+        if not frames:
+            raise ValueError("Cosmos3 action video input must contain at least one frame.")
+        processed = [_preprocess_action_image(_pil_to_rgb(frame), target_h, target_w).squeeze(0) for frame in frames]
+        return torch.stack(processed, dim=1).unsqueeze(0).contiguous()  # frames -> axis 1, plus batch axis 0
 
     def pre_process_func(request: OmniDiffusionRequest) -> OmniDiffusionRequest:
+        action_mode = _request_action_mode(request)
         if is_guardrails_enabled(od_config, request.sampling_params):
             for prompt in request.prompts:
                 text = prompt if isinstance(prompt, str) else prompt.get("prompt", "")
@@ -97,39 +163,63 @@ def pre_process_func(request: OmniDiffusionRequest) -> OmniDiffusionRequest:
                 continue
             multi_modal_data = prompt.get("multi_modal_data", {}) or {}
             raw_image = multi_modal_data.get("image")
-            if raw_image is None:
+            raw_video = multi_modal_data.get("video")
+            if raw_image is None and not (action_mode is not None and raw_video is not None):
                 continue
 
             if "additional_information" not in prompt:
                 prompt["additional_information"] = {}
 
-            image = _pil_to_rgb(raw_image)
+            if raw_image is None:
+                if not isinstance(raw_video, list) or not raw_video:
+                    raise TypeError("Cosmos3 action video input must be a non-empty list of PIL images or image paths.")
+                image = _pil_to_rgb(raw_video[0])
+            else:
+                image = _pil_to_rgb(raw_image)
 
             # Auto-calculate H/W from aspect ratio (720p max area)
             if request.sampling_params.height is None or request.sampling_params.width is None:
-                max_area = 720 * 1280
-                aspect_ratio = image.height / image.width
-                mod_value = 16
-                height = round(np.sqrt(max_area * aspect_ratio)) // mod_value * mod_value
-                width = round(np.sqrt(max_area / aspect_ratio)) // mod_value * mod_value
-                if request.sampling_params.height is None:
-                    request.sampling_params.height = height
-                if request.sampling_params.width is None:
-                    request.sampling_params.width = width
+                if action_mode is not None:
+                    _set_action_size_from_image(request, image)
+                else:
+                    max_area = 720 * 1280
+                    aspect_ratio = image.height / image.width
+                    mod_value = 16
+                    height = round(np.sqrt(max_area * aspect_ratio)) // mod_value * mod_value
+                    width = round(np.sqrt(max_area / aspect_ratio)) // mod_value * mod_value
+                    if request.sampling_params.height is None:
+                        request.sampling_params.height = height
+                    if request.sampling_params.width is None:
+                        request.sampling_params.width = width
 
             target_w = request.sampling_params.width
             target_h = request.sampling_params.height
-            scale = max(target_w / image.width, target_h / image.height)
-            resize_w = int(np.ceil(scale * image.width))
-            resize_h = int(np.ceil(scale * image.height))
-            image = image.resize((resize_w, resize_h), PIL.Image.Resampling.LANCZOS)
-            left = (resize_w - target_w) // 2
-            top = (resize_h - target_h) // 2
-            image = image.crop((left, top, left + target_w, top + target_h))
-
-            prompt["additional_information"]["preprocessed_image"] = video_processor.preprocess(
-                image, height=target_h, width=target_w
-            )
+            if action_mode is not None:
+                prompt["additional_information"]["preprocessed_image"] = _preprocess_action_image(
+                    image,
+                    int(target_h),
+                    int(target_w),
+                )
+            else:
+                scale = max(target_w / image.width, target_h / image.height)
+                resize_w = int(np.ceil(scale * image.width))
+                resize_h = int(np.ceil(scale * image.height))
+                image = image.resize((resize_w, resize_h), PIL.Image.Resampling.LANCZOS)
+                left = (resize_w - target_w) // 2
+                top = (resize_h - target_h) // 2
+                image = image.crop((left, top, left + target_w, top + target_h))
+
+                prompt["additional_information"]["preprocessed_image"] = video_processor.preprocess(
+                    image, height=target_h, width=target_w
+                )
+            if action_mode is not None and raw_video is not None:
+                if not isinstance(raw_video, list):
+                    raise TypeError("Cosmos3 action video input must be a list of PIL images or image paths.")
+                prompt["additional_information"]["preprocessed_video"] = _preprocess_action_video(
+                    raw_video,
+                    int(target_h),
+                    int(target_w),
+                )
             request.prompts[i] = prompt
 
         return request
@@ -381,13 +471,19 @@ def _remap_ckpt_key(key: str) -> str | None:
                 "proj_in.",
                 "proj_out.",
                 "time_embedder.",
-                "audio_proj_in.",
-                "audio_proj_out.",
+                "sound2llm.",
+                "llm2sound.",
+                "action_proj_in.",
+                "action_proj_out.",
             )
         ):
             return f"transformer.{k}"
-        if k in ("audio_modality_embed", "audio_modality_embed.weight"):
-            return "transformer.audio_modality_embed"
+        if k in ("sound_modality_embed", "sound_modality_embed.weight"):
+            return "transformer.sound_modality_embed"
+        if k in ("action_modality_embed", "action_modality_embed.weight"):
+            return "transformer.action_modality_embed"
+        if k.startswith("action_pos_embed."):
+            return None
 
         # Skip lm_head
         if k.startswith("lm_head."):
@@ -490,13 +586,22 @@ def _remapped_weights() -> Iterable[tuple[str, torch.Tensor]]:
                     f"the checkpoint is missing sound weights for {missing}. "
                     "Use a sound-capable transformer checkpoint."
                 )
+        if getattr(self.transformer, "action_gen", False):
+            action_markers = ("action_proj_in.", "action_proj_out.", "action_modality_embed")
+            missing = [marker.rstrip(".") for marker in action_markers if not any(marker in name for name in loaded)]
+            if missing:
+                raise ValueError(
+                    "Cosmos3 transformer config enables action generation, but "
+                    f"the checkpoint is missing action weights for {missing}. "
+                    "Use an action-capable transformer checkpoint."
+                )
         return loaded
 
     def predict_noise(self, **kwargs) -> torch.Tensor | tuple[torch.Tensor, ...]:
         """Override CFGParallelMixin.predict_noise for Cosmos3.
 
         The transformer returns the raw prediction: video-only as a tensor,
-        or a tuple in video, sound order for sound generation.
+        or a tuple in video, action, sound order for multimodal generation.
         """
         return self.transformer(**kwargs)
 
@@ -565,6 +670,12 @@ def _is_sound_request(cls, prompt_data, sp) -> bool:
                 return True
         return False
 
+    @classmethod
+    def _get_action_mode(cls, prompt_data, sp) -> str | None:
+        """Return the normalized action mode; the prompt-level value is the fallback for ``sp``."""
+        prompt_level_mode = cls._get_prompt_param(prompt_data, "action_mode", None)
+        return normalize_action_mode(cls._get_sp_param(sp, "action_mode", prompt_level_mode))
+
     def _get_sound_tokenizer(self):
         if not hasattr(self, "_sound_tokenizer"):
             self._sound_tokenizer = None
@@ -967,6 +1078,30 @@ def _encode_conditioning_video(
 
         return latent.to(self.dtype)
 
+    def _encode_video_tensor(self, video_tensor: torch.Tensor) -> torch.Tensor:
+        """VAE-encode a preprocessed pixel video and normalize the latent.
+
+        Accepts ``[1, 3, T, H, W]`` or an unbatched ``[3, T, H, W]`` tensor; any other
+        layout raises ``ValueError``. The result is cast to ``self.dtype``.
+        """
+        pixels = video_tensor.unsqueeze(0) if video_tensor.ndim == 4 else video_tensor
+        if pixels.ndim != 5 or pixels.shape[0] != 1 or pixels.shape[1] != 3:
+            raise ValueError(f"Cosmos3 video tensor must have shape [1, 3, T, H, W], got {tuple(pixels.shape)}.")
+
+        pixels = pixels.to(device=self.device, dtype=self.vae.dtype)
+        latent = self.vae.encode(pixels).latent_dist.mode()
+        vae_config = self.vae.config
+
+        if not (hasattr(vae_config, "latents_mean") and hasattr(vae_config, "latents_std")):
+            # No per-channel statistics available: apply the scalar scaling factor.
+            return (latent * getattr(vae_config, "scaling_factor", 1.0)).to(self.dtype)
+
+        def _stat(values) -> torch.Tensor:
+            # Shape the statistic as [1, C, 1, 1, 1] on the latent's device and dtype.
+            return torch.tensor(values).view(1, -1, 1, 1, 1).to(latent.device, latent.dtype)
+
+        return ((latent - _stat(vae_config.latents_mean)) / _stat(vae_config.latents_std)).to(self.dtype)
+
     def _prepare_latents_i2v(
         self,
         image_tensor: torch.Tensor,
@@ -1003,6 +1138,98 @@ def _prepare_latents_i2v(
         velocity_mask = 1.0 - condition_mask
         return latents, velocity_mask, image_latent
 
+    def _prepare_latents_action_video(
+        self,
+        video_tensor: torch.Tensor,
+        mode: str,
+        height: int,
+        width: int,
+        num_frames: int,
+        generator: torch.Generator,
+    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        """Build initial video latents for an action request.
+
+        The conditioning video is VAE-encoded and blended with fresh noise according to
+        the mode-specific vision condition mask. Returns ``(latents, velocity_mask,
+        cond_latent)`` with ``velocity_mask = 1 - condition_mask``.
+        """
+        del height, width  # Latent spatial size is taken from the video tensor instead.
+        spatial, temporal = self.vae_scale_factor_spatial, self.vae_scale_factor_temporal
+        noise_shape = (
+            1,
+            self.transformer.latent_channel_size,
+            (num_frames - 1) // temporal + 1,
+            video_tensor.shape[-2] // spatial,
+            video_tensor.shape[-1] // spatial,
+        )
+        noise = randn_tensor(noise_shape, generator=generator, device=self.device, dtype=self.dtype)
+
+        cond_latent = self._encode_video_tensor(video_tensor)
+        if cond_latent.shape[2:] != noise.shape[2:]:
+            raise ValueError(
+                "Cosmos3 action video latent shape mismatch: "
+                f"encoded={tuple(cond_latent.shape)}, expected={tuple(noise.shape)}."
+            )
+
+        # Where the mask is 1 the encoded video is kept; elsewhere the latent starts as noise.
+        condition_mask = build_vision_condition_mask(mode, num_frames, temporal, device=self.device, dtype=self.dtype)
+        velocity_mask = 1.0 - condition_mask
+        latents = condition_mask * cond_latent + velocity_mask * noise
+        return latents, velocity_mask, cond_latent
+
+    def _prepare_action_latents(
+        self,
+        *,
+        mode: str,
+        action_chunk_size: int,
+        raw_action_dim: int | None,
+        generator: torch.Generator,
+        sp,
+    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, int]:
+        """Build initial action latents, their velocity mask and the clean conditioning actions.
+
+        Forward dynamics loads the action sequence via ``load_action_tensor`` and pads or
+        truncates it to ``action_chunk_size``; other modes start from zeros and require an
+        explicit ``raw_action_dim``. Channels at or beyond ``raw_action_dim`` are zeroed.
+        """
+        action_dim = int(getattr(self.transformer, "action_dim", 64))
+        if mode != ACTION_MODE_FORWARD_DYNAMICS:
+            if raw_action_dim is None:
+                raise ValueError(
+                    "Cosmos3 action_mode='policy' and 'inverse_dynamics' require extra_args['raw_action_dim']."
+                )
+            clean_action = torch.zeros(action_chunk_size, action_dim, dtype=torch.float32)
+        else:
+            action = load_action_tensor(
+                self._get_sp_param(sp, "action", None),
+                self._get_sp_param(sp, "action_path", None),
+            )
+            missing_steps = action_chunk_size - action.shape[0]
+            if missing_steps > 0:
+                # Too short: repeat the final action to fill the chunk.
+                action = torch.cat([action, action[-1:].repeat(missing_steps, 1)], dim=0)
+            elif missing_steps < 0:
+                action = action[:action_chunk_size]
+            if raw_action_dim is None:
+                raw_action_dim = int(action.shape[-1])
+            clean_action = pad_action_to_dim(action, action_dim)
+
+        raw_action_dim = int(raw_action_dim)
+        if not 0 < raw_action_dim <= action_dim:
+            raise ValueError(f"Cosmos3 raw_action_dim must be in [1, {action_dim}], got {raw_action_dim}.")
+
+        latent_kwargs = dict(device=self.device, dtype=self.dtype)
+        clean_action = clean_action.to(**latent_kwargs).unsqueeze(0)
+        condition_mask = build_action_condition_mask(mode, action_chunk_size, **latent_kwargs)
+        noise = randn_tensor((1, action_chunk_size, action_dim), generator=generator, **latent_kwargs)
+        # Zero every channel at or beyond raw_action_dim in both tensors.
+        noise[:, :, raw_action_dim:] = 0
+        clean_action[:, :, raw_action_dim:] = 0
+
+        action_velocity_mask = 1.0 - condition_mask
+        action_latents = condition_mask * clean_action + action_velocity_mask * noise
+        return action_latents, action_velocity_mask, clean_action, raw_action_dim
+
     # -- Denoising loop (shared by T2V and I2V) -----------------------------
 
     def diffuse(
@@ -1016,6 +1243,9 @@ def diffuse(
         guidance_scale: float,
         shared_kwargs: dict,
         *,
+        action_latents: torch.Tensor | None = None,
+        action_velocity_mask: torch.Tensor | None = None,
+        action_condition_latents: torch.Tensor | None = None,
         sound_latents: torch.Tensor | None = None,
         velocity_mask: torch.Tensor | None = None,
         image_latent: torch.Tensor | None = None,
@@ -1062,10 +1292,13 @@ def _cfg_active_at(t: torch.Tensor) -> bool:
 
         def _pack_joint(
             video_tensor: torch.Tensor,
+            action_tensor: torch.Tensor | None = None,
             sound_tensor: torch.Tensor | None = None,
         ):
             batch = video_tensor.shape[0]
             tensors = [video_tensor]
+            if action_tensor is not None:
+                tensors.append(action_tensor)
             if sound_tensor is not None:
                 tensors.append(sound_tensor)
             flats = [tensor.reshape(batch, -1) for tensor in tensors]
@@ -1085,57 +1318,84 @@ def _unpack_joint(
 
         def _split_noise_pred(
             noise_pred: torch.Tensor | tuple[torch.Tensor, ...],
-        ) -> tuple[torch.Tensor, torch.Tensor | None]:
+        ) -> tuple[torch.Tensor, torch.Tensor | None, torch.Tensor | None]:
+            has_action = action_latents is not None
             has_sound = sound_latents is not None
-            if not has_sound:
+            if not has_action and not has_sound:
                 if isinstance(noise_pred, tuple):
                     raise ValueError("Cosmos3 video-only diffusion received tuple predictions.")
-                return noise_pred, None
+                return noise_pred, None, None
             if not isinstance(noise_pred, tuple):
                 raise ValueError("Cosmos3 multimodal diffusion expects transformer predictions as a tuple.")
-            if len(noise_pred) != 2:
-                raise ValueError(f"Cosmos3 sound diffusion expected 2 predictions, got {len(noise_pred)}.")
-            return noise_pred[0], noise_pred[1]
+            expected = 1 + int(has_action) + int(has_sound)
+            if len(noise_pred) != expected:
+                raise ValueError(
+                    f"Cosmos3 multimodal diffusion expected {expected} predictions, got {len(noise_pred)}."
+                )
+            video_pred = noise_pred[0]
+            idx = 1
+            action_pred = noise_pred[idx] if has_action else None
+            if has_action:
+                idx += 1
+            sound_pred = noise_pred[idx] if has_sound else None
+            return video_pred, action_pred, sound_pred
 
         def _step(
             noise_pred: torch.Tensor | tuple[torch.Tensor, ...],
             t: torch.Tensor,
             latents: torch.Tensor,
+            action_latents: torch.Tensor | None,
             sound_latents: torch.Tensor | None,
         ) -> torch.Tensor | tuple[torch.Tensor, ...]:
-            video_pred, sound_pred = _split_noise_pred(noise_pred)
+            video_pred, action_pred, sound_pred = _split_noise_pred(noise_pred)
             if velocity_mask is not None:
                 video_pred = video_pred * velocity_mask
-            if sound_latents is None:
+            if action_pred is not None and action_velocity_mask is not None:
+                action_pred = action_pred * action_velocity_mask
+            if action_latents is None and sound_latents is None:
                 latents = self.scheduler.step(video_pred, t, latents, return_dict=False)[0]
             else:
-                packed_noise, shapes, numels = _pack_joint(video_pred, sound_pred)
-                packed_latents, _, _ = _pack_joint(latents, sound_latents)
+                packed_noise, shapes, numels = _pack_joint(video_pred, action_pred, sound_pred)
+                packed_latents, _, _ = _pack_joint(latents, action_latents, sound_latents)
                 packed_next = self.scheduler.step(packed_noise, t, packed_latents, return_dict=False)[0]
                 unpacked = _unpack_joint(packed_next, shapes, numels)
                 latents = unpacked[0]
+                idx = 1
+                if action_latents is not None:
+                    action_latents = unpacked[idx]
+                    idx += 1
                 if sound_latents is not None:
-                    sound_latents = unpacked[1]
+                    sound_latents = unpacked[idx]
             if condition_latents is not None and velocity_mask is not None:
                 latents = velocity_mask * latents + (1.0 - velocity_mask) * condition_latents
             elif image_latent is not None:
                 latents[:, :, 0:1, :, :] = image_latent
+            if action_latents is not None and action_condition_latents is not None and action_velocity_mask is not None:
+                action_latents = (
+                    action_velocity_mask * action_latents + (1.0 - action_velocity_mask) * action_condition_latents
+                )
             outputs = [latents]
+            if action_latents is not None:
+                outputs.append(action_latents)
             if sound_latents is not None:
                 outputs.append(sound_latents)
             return outputs[0] if len(outputs) == 1 else tuple(outputs)
 
         def _assign_step_out(step_out: torch.Tensor | tuple[torch.Tensor, ...]) -> None:
-            nonlocal latents, sound_latents
-            if sound_latents is None:
+            nonlocal latents, action_latents, sound_latents
+            if action_latents is None and sound_latents is None:
                 assert isinstance(step_out, torch.Tensor)
                 latents = step_out
                 return
             if not isinstance(step_out, tuple):
                 raise ValueError("Cosmos3 multimodal diffusion step returned a non-tuple result.")
             latents = step_out[0]
+            idx = 1
+            if action_latents is not None:
+                action_latents = step_out[idx]
+                idx += 1
             if sound_latents is not None:
-                sound_latents = step_out[1]
+                sound_latents = step_out[idx]
 
         if cfg_parallel:
             for t in self.progress_bar(timesteps):
@@ -1153,6 +1413,7 @@ def _assign_step_out(step_out: torch.Tensor | tuple[torch.Tensor, ...]) -> None:
                         timestep=timestep,
                         text_ids=cond_ids,
                         text_mask=cond_mask,
+                        action_latents=action_latents,
                         sound_latents=sound_latents,
                         **shared_kwargs,
                     ),
@@ -1161,12 +1422,13 @@ def _assign_step_out(step_out: torch.Tensor | tuple[torch.Tensor, ...]) -> None:
                         timestep=timestep,
                         text_ids=uncond_ids,
                         text_mask=uncond_mask,
+                        action_latents=action_latents,
                         sound_latents=sound_latents,
                         **shared_kwargs,
                     ),
                     cfg_normalize=False,
                 )
-                _assign_step_out(_step(noise_pred, t, latents, sound_latents))
+                _assign_step_out(_step(noise_pred, t, latents, action_latents, sound_latents))
 
         elif do_cfg:
             cond_cache: tuple = (None, None)
@@ -1182,6 +1444,7 @@ def _assign_step_out(step_out: torch.Tensor | tuple[torch.Tensor, ...]) -> None:
                     timestep=timestep,
                     text_ids=cond_ids,
                     text_mask=cond_mask,
+                    action_latents=action_latents,
                     sound_latents=sound_latents,
                     **shared_kwargs,
                 )
@@ -1195,6 +1458,7 @@ def _assign_step_out(step_out: torch.Tensor | tuple[torch.Tensor, ...]) -> None:
                         timestep=timestep,
                         text_ids=uncond_ids,
                         text_mask=uncond_mask,
+                        action_latents=action_latents,
                         sound_latents=sound_latents,
                         **shared_kwargs,
                     )
@@ -1207,7 +1471,7 @@ def _assign_step_out(step_out: torch.Tensor | tuple[torch.Tensor, ...]) -> None:
                     # the cond branch) and gives a free speedup for T2I.
                     noise_pred = noise_cond
 
-                _assign_step_out(_step(noise_pred, t, latents, sound_latents))
+                _assign_step_out(_step(noise_pred, t, latents, action_latents, sound_latents))
 
         else:
             for t in self.progress_bar(timesteps):
@@ -1217,12 +1481,15 @@ def _assign_step_out(step_out: torch.Tensor | tuple[torch.Tensor, ...]) -> None:
                     timestep=timestep,
                     text_ids=cond_ids,
                     text_mask=cond_mask,
+                    action_latents=action_latents,
                     sound_latents=sound_latents,
                     **shared_kwargs,
                 )
-                _assign_step_out(_step(noise_pred, t, latents, sound_latents))
+                _assign_step_out(_step(noise_pred, t, latents, action_latents, sound_latents))
 
         outputs = [latents]
+        if action_latents is not None:
+            outputs.append(action_latents)
         if sound_latents is not None:
             outputs.append(sound_latents)
         return outputs[0] if len(outputs) == 1 else tuple(outputs)
@@ -1244,15 +1511,29 @@ def forward(
             prompt = prompt_data
             negative_prompt = None
             image_tensor = None
+            action_video_tensor = None
         else:
             prompt = prompt_data.get("prompt", "")
             negative_prompt = prompt_data.get("negative_prompt")
             additional_info = prompt_data.get("additional_information", {}) or {}
             image_tensor = additional_info.get("preprocessed_image")
+            action_video_tensor = additional_info.get("preprocessed_video")
 
         sp = req.sampling_params
         is_t2i = self._is_t2i_request(req)
         sound_enabled = self._is_sound_request(prompt_data, sp)
+        action_mode = self._get_action_mode(prompt_data, sp)
+        action_enabled = action_mode is not None
+        if action_enabled and is_t2i:
+            raise ValueError("Cosmos3 action generation is supported only for video outputs.")
+        if action_enabled and sound_enabled:
+            raise ValueError("Cosmos3 action+sound joint generation is not supported in this phase.")
+        if action_enabled and not getattr(self.transformer, "action_gen", False):
+            raise ValueError(
+                "Cosmos3 action generation was requested, but the transformer was "
+                "initialized without action modules. Check that the checkpoint config "
+                "enables action_gen and includes action weights."
+            )
         if sound_enabled and is_t2i:
             raise ValueError(
                 "Cosmos3 sound generation is supported only for video outputs in "
@@ -1293,6 +1574,36 @@ def forward(
             default_guidance_interval = None
             batch_size = 1  # Existing video pipeline assumes B=1.
 
+        if action_enabled:
+            action_chunk_param = self._get_sp_param(sp, "action_chunk_size", None)
+            if action_chunk_param is not None:
+                action_chunk_size = int(action_chunk_param)
+                if sp.num_frames is None:
+                    num_frames = action_chunk_size + 1
+            elif sp.num_frames is None:
+                action_chunk_size = 16
+                num_frames = action_chunk_size + 1
+            else:
+                action_chunk_size = int(num_frames) - 1
+            if action_chunk_size <= 0:
+                raise ValueError(f"Cosmos3 action_chunk_size must be positive, got {action_chunk_size}.")
+            if num_frames not in (action_chunk_size, action_chunk_size + 1):
+                raise ValueError(
+                    "Cosmos3 action requests require num_frames to equal action_chunk_size "
+                    f"or action_chunk_size + 1; got num_frames={num_frames}, action_chunk_size={action_chunk_size}."
+                )
+            num_inference_steps = sp.num_inference_steps or 30
+            guidance_scale = sp.guidance_scale if sp.guidance_scale is not None else 1.0
+            default_flow_shift = 5.0
+
+        domain_id = None
+        if action_enabled:
+            domain_id = resolve_domain_id(
+                domain_id=self._get_sp_param(sp, "domain_id", None),
+                domain_name=self._get_sp_param(sp, "domain_name", None),
+                require_explicit=True,
+            )
+
         # Runtime controls: prefer ``extra_args`` (OpenAI endpoints write
         # there) over direct attrs.
         flow_shift_target = float(self._get_sp_param(sp, "flow_shift", default_flow_shift))
@@ -1302,6 +1613,23 @@ def forward(
         max_sequence_length = self._get_sp_param(sp, "max_sequence_length", 512) or 512
         use_system_prompt = bool(self._get_sp_param(sp, "use_system_prompt", False))
 
+        if action_enabled and action_video_tensor is None:
+            extra_action_video = self._get_sp_param(sp, "action_video", None)
+            if isinstance(extra_action_video, torch.Tensor):
+                action_video_tensor = extra_action_video
+        if action_enabled and isinstance(action_video_tensor, torch.Tensor):
+            if action_video_tensor.ndim == 4:
+                action_video_tensor = action_video_tensor.unsqueeze(0)
+            if action_video_tensor.ndim != 5:
+                raise ValueError(
+                    "Cosmos3 extra_args['action_video'] must have shape [1, 3, T, H, W] "
+                    f"or [3, T, H, W], got {tuple(action_video_tensor.shape)}."
+                )
+            if sp.height is None:
+                height = int(action_video_tensor.shape[-2])
+            if sp.width is None:
+                width = int(action_video_tensor.shape[-1])
+
         self._guidance_scale = guidance_scale
         self._num_timesteps = num_inference_steps
 
@@ -1337,7 +1665,58 @@ def forward(
         # batching B=N together would require expanding text K/V (UND
         # pathway is text-only and cached) and is left as a future
         # optimization.
-        if image_tensor is not None and not is_t2i:
+        action_latents = None
+        action_velocity_mask = None
+        action_condition_latents = None
+        raw_action_dim = None
+        action_offset = 1
+        if action_enabled:
+            if action_video_tensor is not None and action_video_tensor.ndim == 4:
+                action_video_tensor = action_video_tensor.unsqueeze(0)
+            if action_video_tensor is not None and action_video_tensor.ndim != 5:
+                raise ValueError(
+                    "Cosmos3 action video tensor must have shape [1, 3, T, H, W] "
+                    f"or [3, T, H, W], got {tuple(action_video_tensor.shape)}."
+                )
+            if action_video_tensor is not None and action_video_tensor.shape[2] < num_frames:
+                pad = action_video_tensor[:, :, -1:].repeat(1, 1, num_frames - action_video_tensor.shape[2], 1, 1)
+                action_video_tensor = torch.cat([action_video_tensor, pad], dim=2)
+            elif action_video_tensor is not None and action_video_tensor.shape[2] > num_frames:
+                action_video_tensor = action_video_tensor[:, :, :num_frames]
+
+            if action_mode == ACTION_MODE_INVERSE_DYNAMICS and action_video_tensor is None:
+                raise ValueError("Cosmos3 inverse_dynamics action mode requires multi_modal_data['video'].")
+            if action_mode in {ACTION_MODE_POLICY, ACTION_MODE_FORWARD_DYNAMICS} and image_tensor is None:
+                if action_video_tensor is None:
+                    raise ValueError(
+                        f"Cosmos3 action_mode={action_mode!r} requires multi_modal_data['image'] "
+                        "or multi_modal_data['video']."
+                    )
+                image_tensor = action_video_tensor[:, :, 0]
+
+            raw_action_dim_param = self._get_sp_param(sp, "raw_action_dim", None)
+            raw_action_dim = int(raw_action_dim_param) if raw_action_dim_param is not None else None
+            action_prepared = self._prepare_action_latents(
+                mode=action_mode,
+                action_chunk_size=action_chunk_size,
+                raw_action_dim=raw_action_dim,
+                generator=generator,
+                sp=sp,
+            )
+            action_latents, action_velocity_mask, action_condition_latents, raw_action_dim = action_prepared
+            action_offset = action_start_frame_offset(action_mode, action_chunk_size, num_frames)
+
+        if action_enabled and action_video_tensor is not None:
+            latents, velocity_mask, condition_latents = self._prepare_latents_action_video(
+                action_video_tensor,
+                action_mode,
+                height,
+                width,
+                num_frames,
+                generator,
+            )
+            image_latent = condition_latents[:, :, 0:1]
+        elif image_tensor is not None and not is_t2i:
             latents, velocity_mask, image_latent = self._prepare_latents_i2v(
                 image_tensor,
                 height,
@@ -1368,6 +1747,13 @@ def forward(
         shared_kwargs = dict(video_shape=video_shape, fps=frame_rate)
         if velocity_mask is not None:
             shared_kwargs["noisy_frame_mask"] = velocity_mask
+        if action_enabled:
+            shared_kwargs.update(
+                action_domain_ids=torch.tensor([domain_id], dtype=torch.long, device=self.device),
+                action_noisy_mask=action_velocity_mask,
+                action_start_frame_offset=action_offset,
+                action_fps=float(self._get_sp_param(sp, "action_fps", frame_rate) or frame_rate),
+            )
 
         def _run_diffusion(start_latents):
             self._set_scheduler_timesteps(num_inference_steps)
@@ -1380,6 +1766,9 @@ def _run_diffusion(start_latents):
                 uncond_mask=uncond_mask,
                 guidance_scale=guidance_scale,
                 shared_kwargs=shared_kwargs,
+                action_latents=action_latents,
+                action_velocity_mask=action_velocity_mask,
+                action_condition_latents=action_condition_latents,
                 sound_latents=sound_latents,
                 velocity_mask=velocity_mask,
                 image_latent=image_latent,
@@ -1402,7 +1791,11 @@ def _run_diffusion(start_latents):
             latents = torch.cat(samples, dim=0)
         else:
             diffusion_output = _run_diffusion(latents)
-            if sound_enabled:
+            if action_enabled and sound_enabled:
+                latents, action_latents, sound_latents = diffusion_output
+            elif action_enabled:
+                latents, action_latents = diffusion_output
+            elif sound_enabled:
                 latents, sound_latents = diffusion_output
             else:
                 latents = diffusion_output
@@ -1424,4 +1817,18 @@ def _run_diffusion(start_latents):
             audio = self._decode_sound_latents(sound_latents, target_audio_samples)
             return DiffusionOutput(output={"video": video, "audio": audio, "audio_sample_rate": sound_sample_rate})
 
+        if action_enabled:
+            if action_latents is None or raw_action_dim is None or domain_id is None:
+                raise ValueError("Cosmos3 action generation finished without action latents.")
+            action = action_latents[:, :, :raw_action_dim].detach().cpu()
+            return DiffusionOutput(
+                output={"video": video},
+                custom_output={
+                    "action": action,
+                    "raw_action_dim": raw_action_dim,
+                    "action_mode": action_mode,
+                    "domain_id": domain_id,
+                },
+            )
+
         return DiffusionOutput(output={"image": video} if is_t2i else {"video": video})
diff --git a/vllm_omni/diffusion/models/cosmos3/transformer_cosmos3.py b/vllm_omni/diffusion/models/cosmos3/transformer_cosmos3.py
index 1f5d822367d..52a52f8d042 100644
--- a/vllm_omni/diffusion/models/cosmos3/transformer_cosmos3.py
+++ b/vllm_omni/diffusion/models/cosmos3/transformer_cosmos3.py
@@ -121,6 +121,47 @@ def _as_bool(value: Any) -> bool:
     return bool(value)
 
 
+class DomainAwareLinear(nn.Module):
+    """Per-domain affine projection: every embodiment domain has its own weight and bias.
+
+    Parameters live in two ``nn.Embedding`` tables indexed by ``domain_id``; each
+    weight row is reshaped to ``(input_size, output_size)`` at call time.
+    """
+
+    def __init__(
+        self, input_size: int, output_size: int, num_domains: int, *, dtype: torch.dtype = torch.bfloat16
+    ) -> None:
+        super().__init__()
+        self.input_size, self.output_size = int(input_size), int(output_size)
+        self.num_domains = int(num_domains)
+        self.fc = nn.Embedding(self.num_domains, self.output_size * self.input_size, dtype=dtype)
+        self.bias = nn.Embedding(self.num_domains, self.output_size, dtype=dtype)
+        nn.init.xavier_uniform_(self.fc.weight)
+        nn.init.zeros_(self.bias.weight)
+
+    def forward(self, x: torch.Tensor, domain_id: torch.Tensor) -> torch.Tensor:
+        """Project ``x`` ([B, in] or [B, N, in]) with the parameters selected by ``domain_id``."""
+        ids = domain_id.unsqueeze(0) if domain_id.ndim == 0 else domain_id
+        ids = ids.to(device=x.device, dtype=torch.long).reshape(-1)
+        batch = ids.shape[0]
+        if x.shape[0] != batch:
+            raise ValueError(
+                "Cosmos3 action domain_id batch size must match action tokens: "
+                f"tokens={x.shape[0]}, domain_id={batch}."
+            )
+        if torch.any((ids < 0) | (ids >= self.num_domains)):
+            raise ValueError(f"Cosmos3 action domain_id must be in [0, {self.num_domains}), got {ids.tolist()}.")
+
+        weight = self.fc(ids).view(batch, self.input_size, self.output_size)
+        bias = self.bias(ids).view(batch, self.output_size)
+        if x.ndim == 3:
+            return torch.bmm(x, weight) + bias.unsqueeze(1)
+        if x.ndim != 2:
+            raise ValueError(f"Cosmos3 DomainAwareLinear expected rank-2 or rank-3 input, got {tuple(x.shape)}.")
+        # Rank-2 input: add a singleton token axis for bmm, then drop it again.
+        return torch.bmm(x.unsqueeze(1), weight).squeeze(1) + bias
+
+
 # ---------------------------------------------------------------------------
 # Rotary Position Embeddings (mRoPE)
 # ---------------------------------------------------------------------------
@@ -207,6 +248,30 @@ def compute_mrope_position_ids_sound(
     )
 
 
+def compute_mrope_position_ids_action(
+    grid_t: int,
+    temporal_offset: int | float,
+    action_fps: float | None,
+    base_fps: float = 24.0,
+    base_temporal_compression_factor: int = 4,
+    enable_fps_modulation: bool = True,
+    start_frame_offset: int = 1,
+) -> tuple[torch.Tensor, int | float]:
+    """Generate mRoPE IDs for action tokens as a frame-rate (T, 1, 1) grid via the vision helper."""
+    return compute_mrope_position_ids_vision(
+        grid_t=grid_t,
+        grid_h=1,  # action tokens use a 1x1 spatial grid
+        grid_w=1,
+        temporal_offset=temporal_offset,
+        fps=action_fps,  # the action rate is passed as the helper's fps
+        base_fps=base_fps,
+        temporal_compression_factor=1,  # always 1 here, independent of the caller's arguments
+        base_temporal_compression_factor=base_temporal_compression_factor,
+        enable_fps_modulation=enable_fps_modulation,
+        start_frame_offset=start_frame_offset,
+    )
+
+
 class Qwen3VLTextRotaryEmbedding(nn.Module):
     """Multi-dimensional rotary position embedding for Qwen3-VL."""
 
@@ -956,6 +1021,13 @@ def __init__(
         from .sound_tokenizer import get_sound_dim, get_sound_latent_fps
 
         self.sound_dim = int(sound_dim_value if sound_dim_value is not None else get_sound_dim(od_config))
+        action_gen_value = _od_config_get(od_config, "action_gen", None)
+        action_dim_value = _od_config_get(od_config, "action_dim", None)
+        if action_dim_value is None:
+            action_dim_value = _od_config_get(od_config, "max_action_dim", None)
+        self.action_gen = _as_bool(action_gen_value) if action_gen_value is not None else False
+        self.action_dim = int(action_dim_value if action_dim_value is not None else 64)
+        self.num_embodiment_domains = int(_od_config_get(od_config, "num_embodiment_domains", 32))
         self.sound_latent_fps = float(get_sound_latent_fps(od_config))
         if temporal_compression_factor is None:
             temporal_compression_factor = _tf_config_get(model_config, "temporal_compression_factor", 4)
@@ -995,6 +1067,20 @@ def __init__(
         self.proj_in = nn.Linear(self.patch_latent_dim, self.hidden_size)
         self.proj_out = nn.Linear(self.hidden_size, self.patch_latent_dim)
         self.time_embedder = TimestepEmbedder(self.hidden_size, target_dtype=dtype)
+        if self.action_gen:
+            self.action_proj_in = DomainAwareLinear(
+                self.action_dim,
+                self.hidden_size,
+                self.num_embodiment_domains,
+                dtype=dtype,
+            )
+            self.action_proj_out = DomainAwareLinear(
+                self.hidden_size,
+                self.action_dim,
+                self.num_embodiment_domains,
+                dtype=dtype,
+            )
+            self.action_modality_embed = nn.Parameter(torch.zeros(self.hidden_size, dtype=dtype))
         if self.sound_gen:
             self.audio_proj_in = nn.Linear(self.sound_dim, self.hidden_size)
             self.audio_proj_out = nn.Linear(self.hidden_size, self.sound_dim)
@@ -1093,6 +1179,21 @@ def unpack_sound(tokens: torch.Tensor) -> torch.Tensor:
         """[B, T_sound, C_sound] -> [B, C_sound, T_sound]."""
         return tokens.permute(0, 2, 1).contiguous()
 
+    def pack_action(self, action_latents: torch.Tensor) -> torch.Tensor:
+        """Check that action latents are [B, T_action, D_action] and return them as contiguous tokens."""
+        dims = tuple(action_latents.shape)
+        if len(dims) != 3:
+            raise ValueError(f"Cosmos3 action latents must have shape [B, T, D], got {dims}.")
+        if dims[-1] != self.action_dim:
+            message = f"Cosmos3 action latent dimension mismatch: expected {self.action_dim}, got {dims[-1]}."
+            raise ValueError(message)
+        return action_latents.contiguous()
+
+    @staticmethod
+    def unpack_action(tokens: torch.Tensor) -> torch.Tensor:
+        """Return [B, T_action, D_action] action predictions as a contiguous tensor; no axes are reordered."""
+        return tokens.contiguous()
+
     # -- RoPE computation ----------------------------------------------------
 
     def _compute_rope_freqs(
@@ -1104,6 +1205,9 @@ def _compute_rope_freqs(
         fps: float | None,
         device: torch.device,
         dtype: torch.dtype,
+        t_action: int | None = None,
+        action_start_frame_offset: int = 1,
+        action_fps: float | None = None,
         t_sound: int | None = None,
     ) -> tuple[tuple[torch.Tensor, torch.Tensor], tuple[torch.Tensor, torch.Tensor]]:
         """Compute mRoPE cos/sin for UND text and GEN media pathways."""
@@ -1111,6 +1215,7 @@ def _compute_rope_freqs(
         S_text = text_mask.shape[1]
         text_lengths = text_mask.sum(dim=1).long()
         effective_fps = fps if fps is not None and t > 1 else None
+        action_frames = int(t_action or 0)
         sound_frames = int(t_sound or 0)
 
         text_pos_list = []
@@ -1130,6 +1235,17 @@ def _compute_rope_freqs(
                 enable_fps_modulation=self.enable_fps_modulation,
             )
             gen_positions = [v_pos]
+            if action_frames > 0:
+                a_pos, _ = compute_mrope_position_ids_action(
+                    action_frames,
+                    temporal_offset=media_temporal_offset,
+                    action_fps=action_fps if action_fps is not None else fps,
+                    base_fps=self.base_fps,
+                    base_temporal_compression_factor=self.temporal_compression_factor,
+                    enable_fps_modulation=self.enable_fps_modulation,
+                    start_frame_offset=action_start_frame_offset,
+                )
+                gen_positions.append(a_pos)
             if sound_frames > 0:
                 s_pos, _ = compute_mrope_position_ids_sound(
                     sound_frames,
@@ -1175,7 +1291,9 @@ def _validate_gen_sequence_parallel(
         *,
         s_gen: int,
         s_video: int,
+        s_action: int,
         s_sound: int,
+        has_action: bool,
         has_sound: bool,
         ulysses_size: int,
     ) -> None:
@@ -1183,14 +1301,16 @@ def _validate_gen_sequence_parallel(
             return
 
         detail_parts = [f"video tokens {s_video}"]
+        if has_action:
+            detail_parts.append(f"action tokens {s_action}")
         if has_sound:
             detail_parts.append(f"sound tokens {s_sound}")
         detail = " = " + " + ".join(detail_parts) if len(detail_parts) > 1 else ""
         adjust_detail = (
-            "Adjust the spatial resolution, frame count, sound duration, "
-            "or sound latent FPS so the combined media sequence is a "
+            "Adjust the spatial resolution, frame count, action chunk size, "
+            "sound duration, or sound latent FPS so the combined media sequence is a "
             "multiple of ulysses_degree."
-            if has_sound
+            if has_action or has_sound
             else (
                 "Adjust the spatial resolution so that "
                 "t * ceil(h/patch) * ceil(w/patch) is a multiple "
@@ -1212,6 +1332,11 @@ def forward(
         text_mask: torch.Tensor,
         video_shape: tuple[int, int, int],
         fps: float | None = None,
+        action_latents: torch.Tensor | None = None,
+        action_domain_ids: torch.Tensor | None = None,
+        action_noisy_mask: torch.Tensor | None = None,
+        action_start_frame_offset: int = 1,
+        action_fps: float | None = None,
         sound_latents: torch.Tensor | None = None,
         noisy_frame_mask: torch.Tensor | None = None,
         **kwargs,
@@ -1224,6 +1349,10 @@ def forward(
             text_mask: [B, S_text] attention mask (1=real, 0=pad)
             video_shape: (t, h, w) in latent space
             fps: video frame rate for temporal mRoPE modulation
+            action_latents: Optional [B, T_action, D_action] noisy action latents.
+            action_domain_ids: Optional [B] embodiment domain IDs for action projections.
+            action_noisy_mask: Optional [B, T_action, 1] mask where 1=noisy
+                action token and 0=clean conditioned token.
             sound_latents: Optional [B, C_sound, T_sound] noisy sound latents.
             noisy_frame_mask: Optional [B, 1, t, 1, 1] mask where 1=noisy (add
                 timestep embedding, predict velocity) and 0=conditioned (clean
@@ -1232,7 +1361,7 @@ def forward(
 
         Returns:
             [B, C, t, h, w] velocity prediction, or
-            tuple outputs in video, sound order when sound latents are provided.
+            tuple outputs in video, action, sound order when extra modalities are provided.
         """
         t, h, w = video_shape
         hp, wp, _, _ = self._pad_to_patch_size(h, w)
@@ -1244,7 +1373,14 @@ def forward(
                 f"Cosmos3 requires identical real text lengths within a batch "
                 f"(got min={min_real_len}, max={max_real_len})."
             )
+        has_action = action_latents is not None
         has_sound = sound_latents is not None
+        if has_action and not self.action_gen:
+            raise ValueError(
+                "Cosmos3 action generation was requested, but this transformer "
+                "was initialized without action modules. Check that the "
+                "transformer config enables action_gen."
+            )
         if has_sound and not self.sound_gen:
             raise ValueError(
                 "Cosmos3 sound generation was requested, but this transformer "
@@ -1258,8 +1394,21 @@ def forward(
         # Patchify latents and project to hidden space
         hidden_video = self.proj_in(self.patchify(hidden_states, t, h, w))
         s_video = hidden_video.shape[1]
+        s_action = 0
+        hidden_action = None
         s_sound = 0
         hidden_sound = None
+        if action_latents is not None:
+            if action_latents.shape[0] != hidden_states.shape[0]:
+                raise ValueError(
+                    "Cosmos3 action and video batch sizes must match: "
+                    f"video={hidden_states.shape[0]}, action={action_latents.shape[0]}."
+                )
+            if action_domain_ids is None:
+                action_domain_ids = torch.zeros(action_latents.shape[0], dtype=torch.long, device=action_latents.device)
+            hidden_action = self.action_proj_in(self.pack_action(action_latents), action_domain_ids)
+            hidden_action = hidden_action + self.action_modality_embed.to(hidden_action.dtype)
+            s_action = hidden_action.shape[1]
         if sound_latents is not None:
             if sound_latents.shape[0] != hidden_states.shape[0]:
                 raise ValueError(
@@ -1291,9 +1440,22 @@ def forward(
         else:
             hidden_video = hidden_video + time_embed.unsqueeze(1)
 
+        if hidden_action is not None:
+            if action_noisy_mask is None:
+                hidden_action = hidden_action + time_embed.unsqueeze(1)
+            else:
+                if action_noisy_mask.shape != (hidden_action.shape[0], hidden_action.shape[1], 1):
+                    raise ValueError(
+                        "Cosmos3 action_noisy_mask must have shape [B, T_action, 1], "
+                        f"got {tuple(action_noisy_mask.shape)}."
+                    )
+                hidden_action = hidden_action + time_embed.unsqueeze(1) * action_noisy_mask.to(hidden_action.dtype)
+
         if hidden_sound is not None:
             hidden_sound = hidden_sound + time_embed.unsqueeze(1)
         hidden_parts = [hidden_video]
+        if hidden_action is not None:
+            hidden_parts.append(hidden_action)
         if hidden_sound is not None:
             hidden_parts.append(hidden_sound)
         hidden_gen = torch.cat(hidden_parts, dim=1)
@@ -1309,6 +1471,9 @@ def forward(
                     fps,
                     hidden_states.device,
                     hidden_states.dtype,
+                    t_action=s_action,
+                    action_start_frame_offset=action_start_frame_offset,
+                    action_fps=action_fps,
                     t_sound=s_sound,
                 )
                 cached_kv_full = self.language_model(text_ids, freqs_und)
@@ -1326,7 +1491,9 @@ def forward(
             self._validate_gen_sequence_parallel(
                 s_gen=hidden_gen.shape[1],
                 s_video=s_video,
+                s_action=s_action,
                 s_sound=s_sound,
+                has_action=has_action,
                 has_sound=has_sound,
                 ulysses_size=ulysses_size,
             )
@@ -1361,10 +1528,12 @@ def forward(
 
         # Final norm and project back to latent space
         hidden_gen = self.norm_moe_gen(hidden_gen)
-        if not has_sound:
+        if not has_action and not has_sound:
             return self.unpatchify(self.proj_out(hidden_gen), t, h, w)
 
         split_sizes = [s_video]
+        if has_action:
+            split_sizes.append(s_action)
         if has_sound:
             split_sizes.append(s_sound)
         split_hidden = hidden_gen.split(split_sizes, dim=1)
@@ -1372,6 +1541,11 @@ def forward(
         video_pred = self.unpatchify(self.proj_out(hidden_video), t, h, w)
         outputs: list[torch.Tensor] = [video_pred]
         split_idx = 1
+        if has_action:
+            hidden_action = split_hidden[split_idx]
+            split_idx += 1
+            assert action_domain_ids is not None
+            outputs.append(self.unpack_action(self.action_proj_out(hidden_action, action_domain_ids)))
         if has_sound:
             hidden_sound = split_hidden[split_idx]
             outputs.append(self.unpack_sound(self.audio_proj_out(hidden_sound)))
diff --git a/vllm_omni/entrypoints/openai/api_server.py b/vllm_omni/entrypoints/openai/api_server.py
index d7757dd80b5..ac8a7ad5449 100644
--- a/vllm_omni/entrypoints/openai/api_server.py
+++ b/vllm_omni/entrypoints/openai/api_server.py
@@ -2435,7 +2435,7 @@ async def _run_video_generation_job(
     started_at = time.perf_counter()
     output_path = None
     try:
-        video_bytes, stage_durations, peak_memory_mb = await handler.generate_video_bytes(
+        video_bytes, stage_durations, peak_memory_mb, action = await handler.generate_video_bytes(
             request, video_id, reference_image=reference_image
         )
 
@@ -2453,6 +2453,7 @@ async def _run_video_generation_job(
                 "inference_time_s": time.perf_counter() - started_at,
                 "stage_durations": stage_durations,
                 "peak_memory_mb": peak_memory_mb,
+                "action": action,
             },
         )
     except (EngineGenerateError, EngineDeadError) as exc:
@@ -2664,7 +2665,7 @@ async def create_video_sync(
     raw_request.state.request_metadata = RequestResponseMetadata(request_id=request_id)
     started_at = time.perf_counter()
     try:
-        video_bytes, stage_durations, peak_memory_mb = await asyncio.wait_for(
+        video_bytes, stage_durations, peak_memory_mb, _action = await asyncio.wait_for(
             handler.generate_video_bytes(request, request_id, reference_image=reference_image),
             timeout=VIDEO_SYNC_TIMEOUT_S,
         )
diff --git a/vllm_omni/entrypoints/openai/protocol/__init__.py b/vllm_omni/entrypoints/openai/protocol/__init__.py
index c68f6f59879..0d8ddd82d90 100644
--- a/vllm_omni/entrypoints/openai/protocol/__init__.py
+++ b/vllm_omni/entrypoints/openai/protocol/__init__.py
@@ -13,6 +13,7 @@
     ResponseFormat,
 )
 from vllm_omni.entrypoints.openai.protocol.videos import (
+    VideoAction,
     VideoData,
     VideoGenerationRequest,
     VideoGenerationResponse,
@@ -27,6 +28,7 @@
     "ImageGenerationRequest",
     "ImageGenerationResponse",
     "ResponseFormat",
+    "VideoAction",
     "VideoData",
     "VideoGenerationRequest",
     "VideoGenerationResponse",
diff --git a/vllm_omni/entrypoints/openai/protocol/videos.py b/vllm_omni/entrypoints/openai/protocol/videos.py
index 887e3ce67ea..ec5ab14e8d8 100644
--- a/vllm_omni/entrypoints/openai/protocol/videos.py
+++ b/vllm_omni/entrypoints/openai/protocol/videos.py
@@ -220,12 +220,24 @@ def resolve_video_params(self) -> VideoParams:
         return vp
 
 
+class VideoAction(BaseModel):
+    """Generated action sequence, with optional metadata, returned by action-capable video models."""
+
+    data: list[Any] = Field(..., description="JSON-serializable nested action values")  # required
+    shape: list[int] = Field(..., description="Shape of the returned action data")  # required
+    dtype: str | None = Field(default=None, description="Source action dtype, if available")
+    raw_action_dim: int | None = Field(default=None, description="Raw action dimension requested by the model")
+    action_mode: str | None = Field(default=None, description="Action generation mode")
+    domain_id: int | None = Field(default=None, description="Action embodiment domain id")
+
+
 class VideoData(BaseModel):
     """Single generated video data."""
 
     b64_json: str | None = Field(default=None, description="Base64-encoded MP4 video")
     url: str | None = Field(default=None, description="Video URL (not implemented)")
     revised_prompt: str | None = Field(default=None, description="Revised prompt (OpenAI compatibility, always null)")
+    action: VideoAction | None = Field(default=None, description="Generated action sequence metadata, if any")
 
 
 class VideoGenerationResponse(BaseModel):
@@ -298,6 +310,7 @@ class VideoResponse(BaseModel):
         default=0.0,
         description="Peak device memory usage in MB reported by the diffusion pipeline.",
     )
+    action: VideoAction | None = Field(default=None, description="Generated action sequence metadata, if any")
 
     @property
     def file_extension(self) -> str:
diff --git a/vllm_omni/entrypoints/openai/serving_video.py b/vllm_omni/entrypoints/openai/serving_video.py
index 57a76594a0f..9b173f9c0d1 100644
--- a/vllm_omni/entrypoints/openai/serving_video.py
+++ b/vllm_omni/entrypoints/openai/serving_video.py
@@ -16,6 +16,7 @@
 
 from vllm_omni.entrypoints.async_omni import AsyncOmni
 from vllm_omni.entrypoints.openai.protocol.videos import (
+    VideoAction,
     VideoData,
     VideoGenerationRequest,
     VideoGenerationResponse,
@@ -44,6 +45,7 @@ class VideoGenerationArtifacts:
 
     videos: list[Any]
     audios: list[Any | None]
+    actions: list[VideoAction | None]
     audio_sample_rate: int
     output_fps: int
     stage_durations: dict[str, float]
@@ -177,11 +179,13 @@ async def _run_and_extract(
         result = await self._run_generation(prompt, gen_params, reference_id)
         videos = self._extract_video_outputs(result)
         audios = self._extract_audio_outputs(result, expected_count=len(videos))
+        actions = self._extract_action_outputs(result, expected_count=len(videos))
         audio_sample_rate = self._resolve_audio_sample_rate(result)
         output_fps = (vp.fps or self._resolve_fps(result) or 24) * self._resolve_video_fps_multiplier(result)
         return VideoGenerationArtifacts(
             videos=videos,
             audios=audios,
+            actions=actions,
             audio_sample_rate=audio_sample_rate,
             output_fps=output_fps,
             stage_durations=self._extract_stage_durations(result),
@@ -215,7 +219,8 @@ async def generate_videos(
                         audio_sample_rate=artifacts.audio_sample_rate,
                         video_codec_options=video_codec_options,
                     )
-                )
+                ),
+                action=artifacts.actions[idx],
             )
             for idx, video in enumerate(artifacts.videos)
         ]
@@ -234,7 +239,7 @@ async def generate_video_bytes(
         reference_id: str,
         *,
         reference_image: ReferenceImage | None = None,
-    ) -> tuple[bytes, dict[str, float], float]:
+    ) -> tuple[bytes, dict[str, float], float, VideoAction | None]:
         """Generate a video and return raw MP4 bytes, bypassing base64 encoding."""
         artifacts = await self._run_and_extract(request, reference_id, reference_image=reference_image)
         if len(artifacts.videos) > 1:
@@ -259,7 +264,7 @@ async def generate_video_bytes(
         )
         _t_encode_ms = (time.perf_counter() - _t_encode_start) * 1000
         logger.info("Video response encoding (MP4 bytes): %.2f ms", _t_encode_ms)
-        return video_bytes, artifacts.stage_durations, artifacts.peak_memory_mb
+        return video_bytes, artifacts.stage_durations, artifacts.peak_memory_mb, artifacts.actions[0]
 
     @staticmethod
     def _resolve_video_fps_multiplier(result: Any) -> int:
@@ -440,6 +445,18 @@ def _resolve_audio_sample_rate(self, result: Any) -> int:
 
         return 24000
 
+    @classmethod
+    def _extract_action_outputs(cls, result: Any, expected_count: int) -> list[VideoAction | None]:
+        """Build one ``VideoAction`` (or ``None``) per expected video from the result's custom output."""
+        metadata = cls._extract_custom_output(result)
+        if "action" not in (metadata or {}):
+            return [None for _ in range(expected_count)]
+        per_video: list[VideoAction | None] = []
+        for payload in cls._split_action_payload(metadata["action"], expected_count):
+            converted = None if payload is None else cls._make_video_action(payload, metadata)
+            per_video.append(converted)
+        return per_video
+
     @staticmethod
     def _extract_custom_output(result: Any) -> dict[str, Any]:
         custom_output = getattr(result, "custom_output", None)
@@ -458,6 +475,102 @@ def _extract_custom_output(result: Any) -> dict[str, Any]:
 
         return custom_output if isinstance(custom_output, dict) else {}
 
+    @classmethod
+    def _split_action_payload(cls, action: Any, expected_count: int) -> list[Any | None]:
+        """Split an action payload into ``expected_count`` per-video entries, padding with ``None``.
+
+        Payloads of rank >= 3 are indexed along dim 0; anything else goes to the first video only.
+        """
+        if expected_count <= 0:
+            return []
+        dims = cls._shape_of(action)
+        if len(dims) < 3:
+            return [action, *([None] * (expected_count - 1))]
+        batched = min(dims[0], expected_count)
+        return [cls._index_action(action, i) if i < batched else None for i in range(expected_count)]
+
+    @classmethod
+    def _make_video_action(cls, action: Any, custom_output: dict[str, Any]) -> VideoAction:
+        """Wrap one action payload and the metadata found in ``custom_output`` in a ``VideoAction``."""
+        payload = cls._to_jsonable(action)
+        mode = custom_output.get("action_mode")
+        # Shape and dtype are read from the original payload, not from the JSON-converted copy.
+        fields = {
+            "data": payload if isinstance(payload, list) else [payload],  # ``data`` must be a list
+            "shape": cls._shape_of(action),
+            "dtype": cls._dtype_of(action),
+            "raw_action_dim": cls._coerce_optional_int(custom_output.get("raw_action_dim")),
+            "action_mode": None if mode is None else str(mode),
+            "domain_id": cls._coerce_optional_int(custom_output.get("domain_id")),
+        }
+        return VideoAction(**fields)
+
+    @staticmethod
+    def _index_action(action: Any, index: int) -> Any:  # best-effort ``action[index]``
+        try:
+            return action[index]
+        except (IndexError, KeyError, TypeError):
+            return None  # out of range, missing key, or payload not subscriptable
+
+    @classmethod
+    def _to_jsonable(cls, value: Any) -> Any:
+        """Recursively convert tensor/array-like values into plain Python lists and scalars."""
+        for detour in ("detach", "cpu"):
+            if hasattr(value, detour):
+                value = getattr(value, detour)()
+        if hasattr(value, "tolist"):
+            return cls._to_jsonable(value.tolist())
+        if isinstance(value, (list, tuple)):
+            return list(map(cls._to_jsonable, value))
+        if not hasattr(value, "item"):
+            return value
+        try:
+            return value.item()
+        except (TypeError, ValueError):
+            return value
+
+    @classmethod
+    def _shape_of(cls, value: Any) -> list[int]:
+        """Infer a shape from a ``.shape`` attribute, else from nested list/tuple lengths."""
+        declared = getattr(value, "shape", None)
+        if declared is not None:
+            try:
+                return list(map(int, declared))
+            except (TypeError, ValueError):
+                pass  # unusable ``shape`` attribute; fall back to sequence inspection
+        if not isinstance(value, (list, tuple)):
+            return []
+        # Only the first element is inspected, so ragged nesting is not detected.
+        return [len(value), *cls._shape_of(value[0])] if value else [0]
+
+    @staticmethod
+    def _dtype_of(value: Any) -> str | None:
+        """Return ``str(value.dtype)``, or ``None`` when ``value`` has no non-None ``dtype`` attribute."""
+        return None if (found := getattr(value, "dtype", None)) is None else str(found)
+
+    @staticmethod
+    def _coerce_optional_int(value: Any) -> int | None:
+        """Best-effort ``int`` conversion for optional metadata; ``None`` when it cannot be converted."""
+        if value is None:
+            return None
+        try:  # ``.item()`` unwraps single-element tensors/arrays before the int conversion
+            return int(value.item() if hasattr(value, "item") else value)
+        except (TypeError, ValueError, RuntimeError, OverflowError):  # multi-element tensor, inf, ...
+            return None
+
+    def _resolve_audio_sample_rate(self, result: Any) -> int:  # result first, then hf_config, else 24000
+        result_sample_rate = self._extract_audio_sample_rate_from_result(result)
+        if result_sample_rate is not None:
+            return result_sample_rate
+        # NOTE(review): a same-named method seems to be defined earlier in this file; confirm and keep only one.
+        model_config = getattr(self._engine_client, "model_config", None)
+        hf_config = getattr(model_config, "hf_config", None)
+        config_sample_rate = self._extract_audio_sample_rate_from_config(hf_config)
+        if config_sample_rate is not None:
+            return config_sample_rate
+        # Nothing reported a sample rate, so use the hard-coded 24000 default.
+        return 24000
+
     @staticmethod
     def _resolve_fps(result: Any) -> int | None:
         """Extract fps from multimodal_output if the model reported it."""