From a37fd9e307c90f5b6433676235051119030bc4d7 Mon Sep 17 00:00:00 2001
From: Maciej Bala <mbala@nvidia.com>
Date: Thu, 28 May 2026 17:58:43 +0200
Subject: [PATCH 01/11] Add Cosmos3 sound generation

Signed-off-by: Maciej Bala <mbala@nvidia.com>
Signed-off-by: lishunyang12 <lishunyang12@163.com>
---
 docs/models/supported_models.md               |   2 +-
 tests/diffusion/models/cosmos3/conftest.py    | 185 ++++++
 .../models/cosmos3/test_cosmos3_pipeline.py   |  72 ++-
 .../cosmos3/test_cosmos3_sound_tokenizer.py   | 226 ++++++++
 .../cosmos3/test_cosmos3_transformer.py       | 105 +++-
 .../openai_api/test_video_server.py           |   9 +
 .../cosmos3/audio_tokenizer/__init__.py       |   6 +
 .../models/cosmos3/audio_tokenizer/avae.py    | 323 +++++++++++
 .../models/cosmos3/pipeline_cosmos3.py        | 293 +++++++++-
 .../models/cosmos3/sound_tokenizer.py         | 537 ++++++++++++++++++
 .../models/cosmos3/transformer_cosmos3.py     | 195 ++++++-
 vllm_omni/entrypoints/openai/api_server.py    |   4 +
 .../entrypoints/openai/protocol/videos.py     |   9 +
 vllm_omni/entrypoints/openai/serving_video.py |   4 +
 14 files changed, 1941 insertions(+), 29 deletions(-)
 create mode 100644 tests/diffusion/models/cosmos3/conftest.py
 create mode 100644 tests/diffusion/models/cosmos3/test_cosmos3_sound_tokenizer.py
 create mode 100644 vllm_omni/diffusion/models/cosmos3/audio_tokenizer/__init__.py
 create mode 100644 vllm_omni/diffusion/models/cosmos3/audio_tokenizer/avae.py
 create mode 100644 vllm_omni/diffusion/models/cosmos3/sound_tokenizer.py

diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md
index c4e181d5917..6482d503c8b 100644
--- a/docs/models/supported_models.md
+++ b/docs/models/supported_models.md
@@ -33,7 +33,7 @@ th {
 | `ZImagePipeline` | Z-Image | `Tongyi-MAI/Z-Image-Turbo` | ✅︎ | ✅︎ | ✅︎ | ✅︎ |
 | `WanPipeline` | Wan2.1-T2V, Wan2.2-T2V, Wan2.2-TI2V | `Wan-AI/Wan2.1-T2V-1.3B-Diffusers`, `Wan-AI/Wan2.1-T2V-14B-Diffusers`, `Wan-AI/Wan2.2-T2V-A14B-Diffusers`, `Wan-AI/Wan2.2-TI2V-5B-Diffusers` | ✅︎ | ✅︎ | ✅︎ | ✅︎ |
 | `WanImageToVideoPipeline` | Wan2.2-I2V | `Wan-AI/Wan2.2-I2V-A14B-Diffusers` | ✅︎ | ✅︎ | ✅︎ | ✅︎ |
-| `Cosmos3OmniDiffusersPipeline` | Cosmos3 T2I, T2V, I2V | `nvidia/Cosmos3-Nano` | ✅︎ | | | |
+| `Cosmos3OmniDiffusersPipeline` | Cosmos3 T2I, T2V, I2V, T2V with sound | `nvidia/Cosmos3-Nano` | ✅︎ | | | |
 | `WanSpeechToVideoPipeline` | Wan2.2-S2V | `Wan-AI/Wan2.2-S2V-14B` | ✅︎ | ✅︎ | ✅︎ | ✅︎ |
 | `Wan22VACEPipeline` | Wan2.1-VACE | `Wan-AI/Wan2.1-VACE-1.3B-diffusers`, `Wan-AI/Wan2.1-VACE-14B-diffusers` | ✅︎ | ✅︎ | ✅︎ | ✅︎ |
 | `LTX2Pipeline` | LTX-2-T2V | `Lightricks/LTX-2` | ✅︎ | ✅︎ | | |
diff --git a/tests/diffusion/models/cosmos3/conftest.py b/tests/diffusion/models/cosmos3/conftest.py
new file mode 100644
index 00000000000..7075065447c
--- /dev/null
+++ b/tests/diffusion/models/cosmos3/conftest.py
@@ -0,0 +1,185 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from __future__ import annotations
+
+import sys
+import types
+from types import SimpleNamespace
+from typing import Any
+
+import pytest
+import torch
+from torch import nn
+
+
+class StubScheduler:  # Scheduler test double that records every set_timesteps()/step() call.
+    def __init__(self, timesteps: list[int] | None = None, *, flow_shift: float = 1.0) -> None:
+        self.timesteps = torch.tensor(timesteps or [9, 3], dtype=torch.int64)  # None/empty -> [9, 3]
+        self.config = SimpleNamespace(num_train_timesteps=1000, flow_shift=flow_shift)
+        self.set_timesteps_calls: list[tuple[int, torch.device]] = []  # (num_steps, device) per call
+        self.step_calls: list[tuple[torch.Tensor, torch.Tensor, torch.Tensor]] = []  # cloned inputs
+
+    def set_timesteps(self, num_steps: int, device: torch.device) -> None:
+        self.set_timesteps_calls.append((num_steps, device))
+        self.timesteps = torch.arange(num_steps, 0, -1, dtype=torch.int64, device=device)  # num_steps..1
+
+    def step(self, noise_pred: torch.Tensor, timestep: torch.Tensor, latents: torch.Tensor, **kwargs):
+        del kwargs  # extra keyword arguments are accepted but ignored
+        self.step_calls.append((noise_pred.clone(), timestep.clone(), latents.clone()))
+        return (latents + noise_pred,)  # 1-tuple; the update is a plain sum so results are predictable
+
+
+class _ModeLatentDist:  # Stand-in latent distribution whose mode() returns the stored tensor.
+    def __init__(self, latents: torch.Tensor) -> None:
+        self._latents = latents
+
+    def mode(self) -> torch.Tensor:
+        return self._latents  # same tensor object, no copy
+
+
+class StubCosmos3VAE:  # VAE test double: encode() yields all-ones latents, decode() echoes its input.
+    dtype = torch.float32
+
+    def __init__(self, z_dim: int = 2, *, temporal: int = 4, spatial: int = 8) -> None:
+        self.config = SimpleNamespace(
+            z_dim=z_dim,
+            scale_factor_temporal=temporal,
+            scale_factor_spatial=spatial,
+            latents_mean=[0.0] * z_dim,  # one entry per latent channel
+            latents_std=[1.0] * z_dim,
+        )
+
+    def encode(self, video: torch.Tensor):
+        latent_frames = (video.shape[2] - 1) // self.config.scale_factor_temporal + 1  # dim 2 read as frames
+        latent_height = video.shape[-2] // self.config.scale_factor_spatial
+        latent_width = video.shape[-1] // self.config.scale_factor_spatial
+        latents = torch.ones(
+            video.shape[0],
+            self.config.z_dim,
+            latent_frames,
+            latent_height,
+            latent_width,
+            dtype=video.dtype,
+            device=video.device,
+        )
+        return SimpleNamespace(latent_dist=_ModeLatentDist(latents))  # read back via .latent_dist.mode()
+
+    def decode(self, latents: torch.Tensor, return_dict: bool = False):
+        del return_dict  # accepted but unused
+        return (latents,)  # 1-tuple wrapping the unmodified input
+
+
+class StubCosmos3Transformer(nn.Module):  # Transformer test double: logs calls, returns constant tensors.
+    def __init__(
+        self,
+        *,
+        latent_channel_size: int = 2,
+        sound_gen: bool = False,
+        sound_dim: int = 3,
+    ) -> None:
+        super().__init__()
+        self.latent_channel_size = latent_channel_size
+        self.sound_gen = sound_gen
+        self.sound_dim = sound_dim
+        self.cached_kv: Any | None = None  # filled on the first forward(), cleared by reset_cache()
+        self.cached_freqs_gen: Any | None = None
+        self.calls: list[dict[str, Any]] = []  # one record per forward() call
+        self.reset_calls = 0
+
+    def reset_cache(self) -> None:
+        self.reset_calls += 1  # lets tests count how often the cache was reset
+        self.cached_kv = None
+        self.cached_freqs_gen = None
+
+    def forward(
+        self,
+        *,
+        hidden_states: torch.Tensor,
+        timestep: torch.Tensor,
+        text_ids: torch.Tensor,
+        text_mask: torch.Tensor,
+        **kwargs: Any,
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
+        token = int(text_ids.reshape(-1)[0].item()) if text_ids.numel() else 0  # first id, 0 if empty
+        sound_latents = kwargs.get("sound_latents")
+        self.calls.append(
+            {
+                "token": token,
+                "timestep": timestep.clone(),
+                "text_mask": text_mask.clone(),
+                "cache_before": self.cached_kv,  # cache as seen on entry, before it may be filled below
+                "kwargs": dict(kwargs),  # shallow copy
+            }
+        )
+        if self.cached_kv is None:
+            marker = torch.tensor([token], dtype=torch.float32)  # cache values encode the filling token
+            self.cached_kv = [(marker, marker + 100)]
+            self.cached_freqs_gen = (marker + 200, marker + 300)
+        outputs: list[torch.Tensor] = [torch.full_like(hidden_states, float(token))]  # filled with token
+        if sound_latents is not None:
+            outputs.append(torch.full_like(sound_latents, float(token + 10)))  # filled with token + 10
+        return outputs[0] if len(outputs) == 1 else tuple(outputs)  # bare tensor unless sound was passed
+
+
+def passthrough_progress_bar(iterable):  # No-op progress bar replacement used by the pipeline fixture.
+    return iterable  # returned unchanged, nothing is wrapped or displayed
+
+
+@pytest.fixture(autouse=True)
+def fake_cosmos3_guardrails(monkeypatch: pytest.MonkeyPatch):  # Autouse: swaps in a stub guardrails module.
+    module = types.ModuleType("vllm_omni.diffusion.models.cosmos3.guardrails")
+    module.is_guardrails_enabled = lambda od_config, sampling_params=None: False  # always disabled
+    module.ensure_initialized = lambda od_config: None
+    module.check_text_safety = lambda text: None
+    module.check_video_safety = lambda video: video  # returns the video untouched
+    monkeypatch.setitem(sys.modules, module.__name__, module)  # monkeypatch restores sys.modules afterwards
+    return module
+
+
+@pytest.fixture
+def make_cosmos3_pipeline():  # Factory fixture: each call to the returned function builds a stubbed pipeline.
+    def _make():
+        from vllm_omni.diffusion.models.cosmos3.pipeline_cosmos3 import (  # imported at call time
+            Cosmos3OmniDiffusersPipeline,
+        )
+
+        pipeline = object.__new__(Cosmos3OmniDiffusersPipeline)  # allocate without running its __init__
+        nn.Module.__init__(pipeline)  # still set up nn.Module state so attributes can be assigned
+        pipeline.od_config = SimpleNamespace()
+        pipeline.device = torch.device("cpu")
+        pipeline.dtype = torch.float32
+        pipeline.transformer = StubCosmos3Transformer(latent_channel_size=2)
+        pipeline.vae = StubCosmos3VAE(z_dim=2)
+        pipeline.vae_scale_factor_temporal = 4  # same values as the StubCosmos3VAE defaults
+        pipeline.vae_scale_factor_spatial = 8
+        pipeline.scheduler = StubScheduler([9, 3], flow_shift=1.0)
+        pipeline._base_scheduler_config = pipeline.scheduler.config
+        pipeline._engine_init_flow_shift = 1.0
+        pipeline._current_flow_shift = 1.0
+        pipeline._guidance_scale = None
+        pipeline._num_timesteps = None
+        pipeline.progress_bar = passthrough_progress_bar
+        pipeline._sound_tokenizer = None  # tests that exercise sound assign their own tokenizer
+        return pipeline
+
+    return _make
+
+
+def make_sampling_params(**overrides: Any) -> SimpleNamespace:  # Default sampling params, overridable by kw.
+    values = {
+        "height": None,
+        "width": None,
+        "num_frames": None,
+        "num_inference_steps": None,
+        "guidance_scale": None,
+        "generator": None,
+        "seed": 123,
+        "num_outputs_per_prompt": 1,
+        "frame_rate": None,
+        "resolved_frame_rate": None,
+        "max_sequence_length": None,
+        "extra_args": {},  # a fresh dict on every call
+    }
+    values.update(overrides)  # overrides may replace defaults or add new attributes
+    return SimpleNamespace(**values)
diff --git a/tests/diffusion/models/cosmos3/test_cosmos3_pipeline.py b/tests/diffusion/models/cosmos3/test_cosmos3_pipeline.py
index 31b40b6eee5..b6116d9265d 100644
--- a/tests/diffusion/models/cosmos3/test_cosmos3_pipeline.py
+++ b/tests/diffusion/models/cosmos3/test_cosmos3_pipeline.py
@@ -227,7 +227,7 @@ def test_preprocess_i2v_image_input() -> None:
     assert tuple(result.prompts[0]["additional_information"]["preprocessed_image"].shape[-2:]) == (672, 1344)
 
 
-def test_postprocess_handles_image_video_and_validation() -> None:
+def test_postprocess_handles_image_video_audio_and_validation() -> None:
     from vllm_omni.diffusion.models.cosmos3.pipeline_cosmos3 import get_cosmos3_post_process_func
 
     func = get_cosmos3_post_process_func(SimpleNamespace())
@@ -235,6 +235,14 @@ def test_postprocess_handles_image_video_and_validation() -> None:
 
     assert func(video, output_type="latent") is video
     assert func({"image": video})[0].size == (4, 4)
+    assert "video" in func({"video": video})
+    assert (
+        func(
+            {"video": video, "audio": torch.ones(1, 2, 16), "audio_sample_rate": 48000},
+            sampling_params=SimpleNamespace(extra_args={"resolved_frame_rate": 12}),
+        )["audio_sample_rate"]
+        == 48000
+    )
 
     with pytest.raises(ValueError, match="text-to-image postprocess expects"):
         func({"image": torch.zeros(1, 3, 2, 4, 4)})
@@ -293,7 +301,7 @@ def test_prompt_formatting_and_checkpoint_key_remap(make_cosmos3_pipeline) -> No
     assert {key: Cosmos3OmniDiffusersPipeline._remap_ckpt_key(key) for key in remaps} == remaps
 
 
-def test_prepare_latents_for_video_and_image(make_cosmos3_pipeline) -> None:
+def test_prepare_latents_for_video_image_and_sound(make_cosmos3_pipeline) -> None:
     pipeline = make_cosmos3_pipeline()
     latents = pipeline._prepare_latents(16, 24, 5, torch.Generator(device="cpu").manual_seed(0))
     assert latents.shape == (1, 2, 2, 2, 3)
@@ -306,8 +314,24 @@ def test_prepare_latents_for_video_and_image(make_cosmos3_pipeline) -> None:
     assert velocity_mask.tolist() == [[[[[0.0]], [[1.0]]]]]
     assert image_latent.shape == (1, 2, 1, 2, 3)
 
+    pipeline.transformer = pipeline.transformer.__class__(latent_channel_size=2, sound_gen=True, sound_dim=3)
+    pipeline._sound_tokenizer = SimpleNamespace(
+        sample_rate=10,
+        latent_ch=3,
+        hop_size=4,
+        decode=lambda x: torch.ones(x.shape[0], 2, 24),
+    )
+    assert pipeline._resolve_sound_target_samples(SimpleNamespace(extra_args={"sound_duration": 2.0}), 9, 3.0) == (
+        20,
+        2.0,
+        10,
+    )
+    sound_latents, latent_frames = pipeline._prepare_sound_latents(21, torch.Generator(device="cpu").manual_seed(0))
+    assert (sound_latents.shape, latent_frames) == (torch.Size([1, 3, 6]), 6)
+    assert pipeline._decode_sound_latents(torch.zeros(1, 3, 6), target_audio_samples=21).shape == (1, 2, 21)
 
-def test_diffuse_covers_cfg_and_i2v_steps(make_cosmos3_pipeline) -> None:
+
+def test_diffuse_covers_cfg_i2v_and_sound_steps(make_cosmos3_pipeline) -> None:
     pipeline = make_cosmos3_pipeline()
     latents = torch.zeros(1, 2, 1, 1, 1)
 
@@ -339,6 +363,21 @@ def test_diffuse_covers_cfg_and_i2v_steps(make_cosmos3_pipeline) -> None:
     )
     torch.testing.assert_close(i2v[:, :, 0:1], torch.full((1, 2, 1, 1, 1), 7.0))
 
+    pipeline.transformer = pipeline.transformer.__class__(latent_channel_size=2, sound_gen=True, sound_dim=3)
+    video_result, sound_result = pipeline.diffuse(
+        latents=latents,
+        sound_latents=torch.zeros(1, 3, 4),
+        timesteps=torch.tensor([7, 3]),
+        cond_ids=_ids(2),
+        cond_mask=_mask(),
+        uncond_ids=_ids(1),
+        uncond_mask=_mask(),
+        guidance_scale=1.0,
+        shared_kwargs={"video_shape": (1, 1, 1), "fps": 24.0},
+    )
+    torch.testing.assert_close(video_result, torch.full_like(latents, 4.0))
+    torch.testing.assert_close(sound_result, torch.full((), 24.0).expand_as(sound_result))
+
 
 def test_diffuse_keeps_paired_cfg_when_cache_dit_active(make_cosmos3_pipeline) -> None:
     """With cache-dit active the uncond pass is kept even outside the guidance
@@ -395,7 +434,10 @@ def fake_prepare(height, width, num_frames, generator):
 
         def fake_diffuse(**kwargs):
             captured["diffuse_calls"].append(kwargs)
-            return kwargs["latents"] + len(captured["diffuse_calls"])
+            outputs = [kwargs["latents"] + len(captured["diffuse_calls"])]
+            if kwargs.get("sound_latents") is not None:
+                outputs.append(kwargs["sound_latents"] + 2.0)
+            return outputs[0] if len(outputs) == 1 else tuple(outputs)
 
         pipeline._format_and_tokenize_prompts = fake_format
         pipeline._prepare_latents = fake_prepare
@@ -437,7 +479,7 @@ def test_forward_defaults_and_mode_selection(
         assert captured["flow_shifts"] == expected["flow"]
         assert [call[0] for call in pipeline.scheduler.set_timesteps_calls] == expected["steps"]
 
-    def test_forward_i2v_route(self, make_cosmos3_pipeline) -> None:
+    def test_forward_i2v_and_sound_routes(self, make_cosmos3_pipeline) -> None:
         pipeline = make_cosmos3_pipeline()
         captured = self._install_forward_stubs(pipeline)
         image_tensor = torch.zeros(1, 3, 16, 16)
@@ -462,11 +504,30 @@ def test_forward_i2v_route(self, make_cosmos3_pipeline) -> None:
         )
         assert captured["diffuse_calls"][-1]["shared_kwargs"]["noisy_frame_mask"] is velocity_mask
 
+        pipeline.transformer = pipeline.transformer.__class__(latent_channel_size=2, sound_gen=True, sound_dim=3)
+        sound_latents = torch.zeros(1, 3, 4)
+        pipeline._resolve_sound_target_samples = lambda *args: (20, 2.0, 10)
+        pipeline._prepare_sound_latents = lambda *args: (sound_latents, 4)
+        pipeline._decode_sound_latents = lambda *args: torch.ones(1, 2, 20)
+        output = pipeline.forward(
+            SimpleNamespace(
+                prompts=[{"prompt": "A robot", "modalities": ["video"], "generate_sound": True}],
+                sampling_params=make_sampling_params(num_frames=9, frame_rate=3.0),
+            )
+        )
+        assert captured["diffuse_calls"][-1]["sound_latents"] is sound_latents
+        assert output.output["audio_sample_rate"] == 10
+
     @pytest.mark.parametrize(
         ("prompt", "sampling_params", "message"),
         [
             (["one", "two"], make_sampling_params(), "single prompt"),
             ([{"prompt": "one", "modalities": ["image", "video"]}], make_sampling_params(), "both image and video"),
+            (
+                [{"prompt": "x", "modalities": ["image"], "generate_sound": True}],
+                make_sampling_params(),
+                "only for video",
+            ),
         ],
     )
     def test_forward_rejects_invalid_public_requests(
@@ -477,6 +538,7 @@ def test_forward_rejects_invalid_public_requests(
         message,
     ) -> None:
         pipeline = make_cosmos3_pipeline()
+        pipeline.transformer = pipeline.transformer.__class__(latent_channel_size=2, sound_gen=True, sound_dim=3)
 
         with pytest.raises(ValueError, match=message):
             pipeline.forward(SimpleNamespace(prompts=prompt, sampling_params=sampling_params))
diff --git a/tests/diffusion/models/cosmos3/test_cosmos3_sound_tokenizer.py b/tests/diffusion/models/cosmos3/test_cosmos3_sound_tokenizer.py
new file mode 100644
index 00000000000..47664c59e77
--- /dev/null
+++ b/tests/diffusion/models/cosmos3/test_cosmos3_sound_tokenizer.py
@@ -0,0 +1,226 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+from types import SimpleNamespace
+
+import pytest
+import torch
+
+pytestmark = [pytest.mark.core_model, pytest.mark.cpu, pytest.mark.diffusion]
+
+DIFFUSERS_SOUND_TOKENIZER_CHECKPOINT_NAME = "diffusion_pytorch_model.safetensors"
+
+
+class _FakeAVAEAudioTokenizer:  # Lightweight fake for the AVAE tokenizer; keeps the constructor kwargs.
+    def __init__(self, **kwargs) -> None:
+        self.kwargs = kwargs
+        self.sample_rate = int(kwargs["sample_rate"])  # KeyError if a required kwarg is missing
+        self.audio_channels = int(kwargs["audio_channels"])
+        self.latent_ch = int(kwargs["io_channels"])
+        self.temporal_compression_factor = int(kwargs["hop_size"])
+
+    def get_latent_num_samples(self, num_audio_samples: int) -> int:
+        return int(num_audio_samples) // self.temporal_compression_factor  # floor division
+
+    def get_audio_num_samples(self, num_latent_samples: int) -> int:
+        return int(num_latent_samples) * self.temporal_compression_factor
+
+    def decode(self, latents: torch.Tensor) -> torch.Tensor:
+        return torch.zeros(latents.shape[0], self.audio_channels, 8)  # fixed 8 samples for any input length
+
+
+def _write_component(root: Path, config: dict | None = None, checkpoint_name: str | None = None) -> Path:
+    tokenizer_dir = root / "sound_tokenizer"  # creates <root>/sound_tokenizer and returns its path
+    tokenizer_dir.mkdir(parents=True)
+    if checkpoint_name:
+        (tokenizer_dir / checkpoint_name).write_bytes(b"stub")  # placeholder bytes, not a real checkpoint
+    (tokenizer_dir / "config.json").write_text(json.dumps(config or {}), encoding="utf-8")  # always written
+    return tokenizer_dir
+
+
+def _patch_fake_avae(monkeypatch: pytest.MonkeyPatch, created: dict) -> None:  # Swap AVAE for a recording fake.
+    from vllm_omni.diffusion.models.cosmos3 import sound_tokenizer
+
+    class FakeAVAE(_FakeAVAEAudioTokenizer):
+        def __init__(self, **kwargs) -> None:
+            created.update(kwargs)  # expose the constructor kwargs to the calling test
+            super().__init__(**kwargs)
+
+    monkeypatch.setattr(sound_tokenizer, "Cosmos3AVAEAudioTokenizer", FakeAVAE)
+    monkeypatch.setattr(sound_tokenizer, "get_local_device", lambda: torch.device("cpu"))  # force CPU
+
+
+def test_from_config_loads_local_diffusers_component(tmp_path, monkeypatch: pytest.MonkeyPatch) -> None:
+    from vllm_omni.diffusion.models.cosmos3 import sound_tokenizer
+
+    model_dir = tmp_path / "model"  # local directory passed as od_config.model
+    tokenizer_dir = _write_component(model_dir, checkpoint_name=DIFFUSERS_SOUND_TOKENIZER_CHECKPOINT_NAME)
+    created = {}  # receives the kwargs handed to the fake AVAE constructor
+    _patch_fake_avae(monkeypatch, created)
+
+    tokenizer = sound_tokenizer.Cosmos3SoundTokenizer.from_config(
+        SimpleNamespace(
+            model=str(model_dir),
+            custom_pipeline_args={"sound_sample_rate": 32000, "sound_hop_size": 800, "sound_dim": 3},
+            dtype=torch.float32,
+        )
+    )
+
+    assert created["checkpoint_path"] == str(tokenizer_dir / DIFFUSERS_SOUND_TOKENIZER_CHECKPOINT_NAME)
+    assert created["config_path"] == str(tokenizer_dir / "config.json")
+    assert (tokenizer.sample_rate, tokenizer.latent_ch, tokenizer.hop_size) == (32000, 3, 800)  # from args
+
+
+def test_from_config_downloads_component_from_hf_repo(tmp_path, monkeypatch: pytest.MonkeyPatch) -> None:
+    import huggingface_hub
+
+    from vllm_omni.diffusion.models.cosmos3 import sound_tokenizer
+
+    cache_dir = tmp_path / "hf"  # directory the fake download reports as the snapshot location
+    _write_component(cache_dir, checkpoint_name=DIFFUSERS_SOUND_TOKENIZER_CHECKPOINT_NAME)
+    calls = []  # (repo_id, revision, allow_patterns) per fake download
+    created = {}
+    _patch_fake_avae(monkeypatch, created)
+
+    def fake_snapshot_download(repo_id: str, *, revision: str | None, allow_patterns: list[str]) -> str:
+        calls.append((repo_id, revision, allow_patterns))
+        return str(cache_dir)  # no network access; points at the pre-written component
+
+    monkeypatch.setattr(huggingface_hub, "snapshot_download", fake_snapshot_download)
+
+    sound_tokenizer.Cosmos3SoundTokenizer.from_config(
+        SimpleNamespace(
+            model="nvidia/cosmos3",  # not created on disk by this test
+            revision="test-rev",
+            custom_pipeline_args={"sound_sample_rate": 32000, "sound_hop_size": 800, "sound_dim": 3},
+            dtype=torch.float32,
+        )
+    )
+
+    assert created["checkpoint_path"].endswith(DIFFUSERS_SOUND_TOKENIZER_CHECKPOINT_NAME)
+    assert calls == [  # exactly one download, restricted to the two tokenizer files
+        (
+            "nvidia/cosmos3",
+            "test-rev",
+            ["sound_tokenizer/config.json", f"sound_tokenizer/{DIFFUSERS_SOUND_TOKENIZER_CHECKPOINT_NAME}"],
+        )
+    ]
+
+
+@pytest.mark.parametrize(
+    ("checkpoint_name", "message"),
+    [
+        (None, "no AVAE sound tokenizer checkpoint"),  # no checkpoint file written at all
+        ("model.safetensors", DIFFUSERS_SOUND_TOKENIZER_CHECKPOINT_NAME),  # file present under another name
+    ],
+)
+def test_default_component_requires_diffusers_checkpoint_name(tmp_path, checkpoint_name, message) -> None:
+    from vllm_omni.diffusion.models.cosmos3 import sound_tokenizer
+
+    model_dir = tmp_path / "model"
+    _write_component(model_dir, checkpoint_name=checkpoint_name)
+
+    with pytest.raises(ValueError, match=message):  # message is applied as a regex by pytest
+        sound_tokenizer.Cosmos3SoundTokenizer.from_config(
+            SimpleNamespace(model=str(model_dir), custom_pipeline_args={}, dtype=torch.float32)
+        )
+
+
+def test_component_config_precedence_and_conflict_detection(tmp_path, monkeypatch: pytest.MonkeyPatch) -> None:
+    from vllm_omni.diffusion.models.cosmos3 import sound_tokenizer
+
+    component_config = {  # written to sound_tokenizer/config.json
+        "sampling_rate": 48000,
+        "dec_out_channels": 2,
+        "vocoder_input_dim": 64,
+        "hop_size": 1920,
+    }
+    model_dir = tmp_path / "model"
+    _write_component(model_dir, component_config, DIFFUSERS_SOUND_TOKENIZER_CHECKPOINT_NAME)
+    created = {}
+    _patch_fake_avae(monkeypatch, created)
+
+    tokenizer = sound_tokenizer.Cosmos3SoundTokenizer.from_config(
+        SimpleNamespace(
+            model=str(model_dir),
+            custom_pipeline_args={
+                "sound_normalize_latents": True,
+                "sound_normalization_type": "tanh",
+                "sound_tanh_input_scale": 2.0,
+            },
+            model_config={
+                "sound_tokenizer": {  # every value here differs from what the asserts below expect
+                    "sample_rate": 32000,
+                    "audio_channels": 1,
+                    "io_channels": 3,
+                    "hop_size": 800,
+                    "normalize_latents": False,
+                    "normalization_type": "none",
+                }
+            },
+            dtype=torch.float32,
+        )
+    )
+
+    assert (created["sample_rate"], created["audio_channels"], created["io_channels"], created["hop_size"]) == (
+        48000,  # expected values match component_config, not model_config
+        2,
+        64,
+        1920,
+    )
+    assert (created["normalize_latents"], created["normalization_type"], created["tanh_input_scale"]) == (
+        True,  # expected values match custom_pipeline_args, not model_config
+        "tanh",
+        2.0,
+    )
+    assert (tokenizer.sample_rate, tokenizer.latent_ch, tokenizer.hop_size) == (48000, 64, 1920)
+
+    with pytest.raises(ValueError, match=r"sample_rate.*48000.*32000"):  # explicit arg contradicts config.json
+        sound_tokenizer.Cosmos3SoundTokenizer.from_config(
+            SimpleNamespace(
+                model=str(model_dir),
+                custom_pipeline_args={"sound_sample_rate": 32000},
+                dtype=torch.float32,
+            )
+        )
+
+
+def test_avae_uses_diffusers_decoder_state_dict_layout(tmp_path) -> None:  # Uses the real AVAE class, no fake.
+    from safetensors.torch import save_file
+
+    from vllm_omni.diffusion.models.cosmos3.audio_tokenizer import avae
+
+    config = {  # small settings so the decoder is cheap to build on CPU
+        "sampling_rate": 8000,
+        "hop_size": 2,
+        "dec_dim": 4,
+        "dec_c_mults": [1],
+        "dec_strides": [2],
+        "dec_out_channels": 1,
+        "vocoder_input_dim": 2,
+        "normalization_type": "none",
+    }
+    checkpoint_path = tmp_path / DIFFUSERS_SOUND_TOKENIZER_CHECKPOINT_NAME
+    config_path = tmp_path / "config.json"
+    config_path.write_text(json.dumps(config), encoding="utf-8")
+
+    decoder = avae.OobleckDecoder(4, 2, 1, [2], [1])  # NOTE(review): presumably mirrors the config above
+    save_file({f"decoder.{key}": value for key, value in decoder.state_dict().items()}, str(checkpoint_path))
+
+    tokenizer = avae.Cosmos3AVAEAudioTokenizer(
+        checkpoint_path=checkpoint_path,
+        config_path=config_path,
+        dtype=torch.float32,
+        device="cpu",
+    )
+
+    keys = set(tokenizer.state_dict())
+    assert {"decoder.conv1.weight_g", "decoder.block.0.conv_t1.weight_g", "decoder.conv2.weight_g"} <= keys
+    assert not any(key.startswith(("decoder.layers.", "model.decoder.")) for key in keys)  # no other layout
+    assert tokenizer.decode(torch.zeros(1, 2, 3)).shape == (1, 1, 6)  # 3 latent steps -> 6 audio samples
+    with pytest.raises(NotImplementedError, match="decoder-only"):
+        tokenizer.encode(torch.zeros(1, 1, 6))
diff --git a/tests/diffusion/models/cosmos3/test_cosmos3_transformer.py b/tests/diffusion/models/cosmos3/test_cosmos3_transformer.py
index 730079c116a..38db56e0c26 100644
--- a/tests/diffusion/models/cosmos3/test_cosmos3_transformer.py
+++ b/tests/diffusion/models/cosmos3/test_cosmos3_transformer.py
@@ -29,8 +29,9 @@ def _tiny_cosmos3_config(**overrides):
     return config
 
 
-def test_mrope_position_ids_cover_text_and_video() -> None:
+def test_mrope_position_ids_cover_text_video_and_sound() -> None:
     from vllm_omni.diffusion.models.cosmos3.transformer_cosmos3 import (
+        compute_mrope_position_ids_sound,
         compute_mrope_position_ids_text,
         compute_mrope_position_ids_vision,
     )
@@ -56,6 +57,10 @@ def test_mrope_position_ids_cover_text_and_video() -> None:
     torch.testing.assert_close(modulated_ids[0], torch.tensor([10.0, 12.0]))
     assert modulated_offset == 13
 
+    sound_ids, sound_offset = compute_mrope_position_ids_sound(3, temporal_offset=10, sound_latent_fps=25.0)
+    torch.testing.assert_close(sound_ids[0], torch.tensor([10.0, 10.96, 11.92]))
+    assert sound_offset == 12
+
 
 @pytest.mark.parametrize(
     ("key", "value"),
@@ -115,12 +120,90 @@ def test_forward_returns_video_prediction(monkeypatch: pytest.MonkeyPatch) -> No
         text_mask=torch.ones(1, 2, dtype=torch.long),
         video_shape=(1, 2, 2),
         fps=24.0,
+        sound_latents=torch.zeros(1, 3, 4),
     )
 
     assert tuple(output.shape) == (1, 2, 1, 2, 2)
 
 
-def test_compute_rope_freqs_places_text_and_video_positions() -> None:
+def test_sound_modules_follow_config() -> None:  # Sound attributes should exist only when sound_gen is set.
+    from vllm_omni.diffusion.models.cosmos3.transformer_cosmos3 import Cosmos3VFMTransformer
+
+    tiny = _tiny_cosmos3_config()
+    no_modal = Cosmos3VFMTransformer(SimpleNamespace(tf_model_config=tiny, dtype=torch.float32))
+    with_sound = Cosmos3VFMTransformer(
+        SimpleNamespace(
+            tf_model_config={**tiny, "sound_gen": True},
+            model_config={"sound_tokenizer": {"io_channels": 5, "sample_rate": 32000, "hop_size": 800}},
+            custom_pipeline_args={},
+            dtype=torch.float32,
+        )
+    )
+
+    assert no_modal.sound_gen is False
+    assert not hasattr(no_modal, "audio_proj_in")
+    assert with_sound.sound_dim == 5  # equals io_channels above
+    assert with_sound.sound_latent_fps == 40.0  # equals 32000 / 800; presumably sample_rate / hop_size
+    assert with_sound.audio_proj_in.in_features == 5
+
+
+def test_sound_pack_unpack_validate_shapes() -> None:  # pack_sound/unpack_sound round trip and channel check.
+    from vllm_omni.diffusion.models.cosmos3.transformer_cosmos3 import Cosmos3VFMTransformer
+
+    model = object.__new__(Cosmos3VFMTransformer)  # allocate without running the transformer __init__
+    nn.Module.__init__(model)
+    model.sound_dim = 3  # the only attribute this test sets on the bare instance
+
+    sound = torch.arange(2 * 3 * 4, dtype=torch.float32).reshape(2, 3, 4)  # distinct value per element
+    torch.testing.assert_close(model.unpack_sound(model.pack_sound(sound)), sound)
+
+    with pytest.raises(ValueError, match="channel mismatch"):
+        model.pack_sound(torch.zeros(1, 4, 2))  # dim 1 is 4 while sound_dim is 3
+
+
+def test_forward_returns_video_and_sound_predictions() -> None:  # sound_gen=True forward yields a 2-tuple.
+    from vllm_omni.diffusion.models.cosmos3.transformer_cosmos3 import Cosmos3VFMTransformer
+
+    output = Cosmos3VFMTransformer(  # the model is built and called in a single expression
+        SimpleNamespace(
+            tf_model_config=_tiny_cosmos3_config(sound_gen=True, sound_dim=3, sound_latent_fps=24.0),
+            dtype=torch.float32,
+        )
+    )(
+        hidden_states=torch.zeros(1, 2, 1, 2, 2),
+        timestep=torch.tensor([1.0]),
+        text_ids=torch.tensor([[1, 2]], dtype=torch.long),
+        text_mask=torch.ones(1, 2, dtype=torch.long),
+        video_shape=(1, 2, 2),
+        fps=24.0,
+        sound_latents=torch.zeros(1, 3, 4),
+    )
+
+    assert isinstance(output, tuple)
+    assert [tuple(tensor.shape) for tensor in output] == [(1, 2, 1, 2, 2), (1, 3, 4)]  # same shapes as inputs
+
+
+def test_forward_with_sound_ulysses_error_mentions_combined_sequence(monkeypatch: pytest.MonkeyPatch) -> None:
+    import vllm_omni.diffusion.models.cosmos3.transformer_cosmos3 as cosmos3_module
+
+    model = cosmos3_module.Cosmos3VFMTransformer(
+        SimpleNamespace(tf_model_config=_tiny_cosmos3_config(sound_gen=True, sound_dim=3), dtype=torch.float32)
+    )
+    monkeypatch.setattr(cosmos3_module, "_get_ulysses_state", lambda: (2, 0, None))  # presumably world size 2
+
+    with pytest.raises(ValueError, match=r"GEN sequence length \(3 = video tokens 2 \+ sound tokens 1\)"):
+        model(
+            hidden_states=torch.zeros(1, 2, 1, 1, 2),
+            timestep=torch.tensor([1.0]),
+            text_ids=torch.tensor([[1, 2]], dtype=torch.long),
+            text_mask=torch.ones(1, 2, dtype=torch.long),
+            video_shape=(1, 1, 2),
+            fps=24.0,
+            sound_latents=torch.zeros(1, 3, 1),  # a single sound step, matching "sound tokens 1" in the message
+        )
+
+
+def test_compute_rope_freqs_places_text_video_and_sound_positions() -> None:
     from vllm_omni.diffusion.models.cosmos3.transformer_cosmos3 import Cosmos3VFMTransformer
 
     class FakeRotary:
@@ -140,6 +223,8 @@ def __call__(self, x, position_ids):
     model.temporal_modality_margin = 100
     model.base_fps = 24.0
     model.temporal_compression_factor = 4
+    model.temporal_compression_factor_sound = 1
+    model.sound_latent_fps = 25.0
     model.enable_fps_modulation = False
 
     freqs_und, freqs_gen = model._compute_rope_freqs(
@@ -156,3 +241,19 @@ def __call__(self, x, position_ids):
     assert vision_pos[0, 0].tolist() == [102, 103]
     assert freqs_und[0].shape == (2, 3, 1, 4)
     assert freqs_gen[0].shape == (2, 2, 1, 4)
+
+    rotary.position_ids.clear()
+    model._compute_rope_freqs(
+        text_mask=torch.tensor([[1, 1]], dtype=torch.long),
+        t=2,
+        hp=1,
+        wp=1,
+        fps=24.0,
+        device=torch.device("cpu"),
+        dtype=torch.float32,
+        t_sound=1,
+    )
+
+    _, gen_pos = rotary.position_ids
+    assert gen_pos.shape == (3, 1, 3)
+    assert gen_pos[0, 0].tolist() == [102, 103, 102]
diff --git a/tests/entrypoints/openai_api/test_video_server.py b/tests/entrypoints/openai_api/test_video_server.py
index 36b19333980..de1f14c7455 100644
--- a/tests/entrypoints/openai_api/test_video_server.py
+++ b/tests/entrypoints/openai_api/test_video_server.py
@@ -399,6 +399,8 @@ def test_sampling_params_pass_through(test_client, mocker: MockerFixture):
             "true_cfg_scale": "4.0",
             "boundary_ratio": "0.7",
             "flow_shift": "0.25",
+            "generate_sound": "true",
+            "sound_duration": "2.5",
         },
     )
 
@@ -413,6 +415,8 @@ def test_sampling_params_pass_through(test_client, mocker: MockerFixture):
     assert captured.true_cfg_scale == 4.0
     assert captured.boundary_ratio == 0.7
     assert captured.extra_args["flow_shift"] == 0.25
+    assert captured.extra_args["generate_sound"] is True
+    assert captured.extra_args["sound_duration"] == 2.5
 
 
 def test_frame_interpolation_params_pass_to_diffusion_sampling_params(test_client, mocker: MockerFixture):
@@ -756,6 +760,9 @@ def test_invalid_uploaded_input_reference_returns_400(test_client):
 def test_video_request_validation():
     req = VideoGenerationRequest(prompt="test")
     assert req.prompt == "test"
+    assert req.generate_sound is False
+    assert req.sound_duration is None
+    assert VideoGenerationRequest(prompt="test", generate_sound=True, sound_duration=1.5).generate_sound is True
     with pytest.raises(ValueError):
         VideoGenerationRequest(prompt="test", size="invalid")
 
@@ -768,6 +775,8 @@ def test_video_request_validation():
         VideoGenerationRequest(prompt="test", frame_interpolation_exp=0)
     with pytest.raises(ValueError):
         VideoGenerationRequest(prompt="test", frame_interpolation_scale=0)
+    with pytest.raises(ValueError):
+        VideoGenerationRequest(prompt="test", sound_duration=0)
 
 
 def test_list_videos_supports_order_after_and_limit(test_client, mocker: MockerFixture):
diff --git a/vllm_omni/diffusion/models/cosmos3/audio_tokenizer/__init__.py b/vllm_omni/diffusion/models/cosmos3/audio_tokenizer/__init__.py
new file mode 100644
index 00000000000..cfb794705ba
--- /dev/null
+++ b/vllm_omni/diffusion/models/cosmos3/audio_tokenizer/__init__.py
@@ -0,0 +1,6 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from .avae import Cosmos3AVAEAudioTokenizer
+
+__all__ = ["Cosmos3AVAEAudioTokenizer"]
diff --git a/vllm_omni/diffusion/models/cosmos3/audio_tokenizer/avae.py b/vllm_omni/diffusion/models/cosmos3/audio_tokenizer/avae.py
new file mode 100644
index 00000000000..4ddb8d41527
--- /dev/null
+++ b/vllm_omni/diffusion/models/cosmos3/audio_tokenizer/avae.py
@@ -0,0 +1,323 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Diffusers-format AVAE audio tokenizer used by Cosmos3 sound generation."""
+
+from __future__ import annotations
+
+import json
+import math
+from pathlib import Path
+from typing import Any
+
+import torch
+from torch import nn
+from torch.nn.utils import weight_norm
+from vllm.logger import init_logger
+
+from vllm_omni.diffusion.models.progress_bar import _is_rank_zero
+
+logger = init_logger(__name__)
+
+
+def _default_avae_config(*, sample_rate: int, audio_channels: int, io_channels: int, hop_size: int) -> dict[str, Any]:
+    """Build the AVAE config used when no config file is supplied.
+
+    The keyword arguments fill the sampling rate, hop size, decoder output
+    channel count and vocoder input dimension; every other entry is a fixed
+    default.
+    """
+    return {
+        "sampling_rate": sample_rate,
+        "hop_size": hop_size,
+        "dec_dim": 320,
+        "dec_c_mults": [1, 2, 4, 8, 16],
+        "dec_strides": [2, 4, 5, 6, 8],
+        "dec_out_channels": audio_channels,
+        "vocoder_input_dim": io_channels,
+        "normalization_type": "none",
+        "normalize_latents": False,
+        "tanh_input_scale": 1.5,
+        "tanh_output_scale": 3.5,
+        "tanh_clamp": 0.995,
+    }
+
+
+def _config_get(config: dict[str, Any], *keys: str, default: Any = None) -> Any:
+    """Return the first non-``None`` value stored in ``config`` under ``keys``.
+
+    Keys are tried in order; ``default`` is returned when every key is absent or maps to ``None``.
+    """
+    return next((config[name] for name in keys if config.get(name) is not None), default)
+
+
+def _load_config(
+    config_path: str | Path | None,
+    *,
+    sample_rate: int,
+    audio_channels: int,
+    io_channels: int,
+    hop_size: int,
+) -> dict[str, Any]:
+    """Read the AVAE JSON config at ``config_path``, or build the default one when no path is given."""
+    if not config_path:
+        return _default_avae_config(
+            sample_rate=sample_rate,
+            audio_channels=audio_channels,
+            io_channels=io_channels,
+            hop_size=hop_size,
+        )
+    loaded = json.loads(Path(config_path).read_text(encoding="utf-8"))
+    if not isinstance(loaded, dict):
+        raise TypeError(f"Cosmos3 AVAE config must be a JSON object, got {type(loaded)!r}.")
+    return loaded
+
+
+def _load_checkpoint(path: str | Path, map_location: torch.device | str) -> dict[str, torch.Tensor]:
+    """Load a flat tensor state dict from a ``.safetensors`` file or any other ``torch.load``-able checkpoint."""
+    path = Path(path)
+    if path.suffix == ".safetensors":
+        try:
+            from safetensors.torch import load_file
+        except ImportError as exc:
+            raise ImportError("Loading AVAE .safetensors checkpoints requires safetensors.") from exc
+        checkpoint = load_file(str(path), device=str(map_location))
+    else:
+        checkpoint = torch.load(path, map_location=map_location, weights_only=True)  # no arbitrary unpickling
+    if not isinstance(checkpoint, dict):
+        raise TypeError(f"AVAE checkpoint must be a flat state dict, got {type(checkpoint)!r}.")
+    if not all(isinstance(value, torch.Tensor) for value in checkpoint.values()):
+        raise TypeError("AVAE checkpoint must be a flat tensor state dict.")
+    return checkpoint
+
+
+def _validate_diffusers_state_dict(state_dict: dict[str, torch.Tensor]) -> None:
+    """Raise ``RuntimeError`` for a state dict that is empty or has no ``decoder.``-prefixed key."""
+    if not state_dict:
+        raise RuntimeError("AVAE checkpoint is empty.")
+    if all(not name.startswith("decoder.") for name in state_dict):
+        raise RuntimeError("Cosmos3 AVAE checkpoint must contain diffusers-format decoder.* keys.")
+
+
+class Snake1d(nn.Module):
+    """Snake activation over (batch, channels, ...) inputs with per-channel alpha/beta, as in diffusers' Oobleck."""
+
+    def __init__(self, hidden_dim: int, logscale: bool = True) -> None:
+        super().__init__()
+        self.logscale = logscale
+        self.alpha = nn.Parameter(torch.zeros(1, hidden_dim, 1))
+        self.beta = nn.Parameter(torch.zeros(1, hidden_dim, 1))
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        original_shape = hidden_states.shape
+        alpha, beta = (self.alpha.exp(), self.beta.exp()) if self.logscale else (self.alpha, self.beta)
+        flat = hidden_states.reshape(original_shape[0], original_shape[1], -1)
+        # x + sin^2(alpha * x) / beta, with a small epsilon keeping the reciprocal finite when beta is 0.
+        flat = flat + (beta + 1e-9).reciprocal() * torch.sin(alpha * flat).pow(2)
+        return flat.reshape(original_shape)
+
+
+class OobleckResidualUnit(nn.Module):
+    """Residual unit of the diffusers Oobleck decoder: Snake -> dilated 7-tap conv -> Snake -> 1x1 conv, plus skip."""
+
+    def __init__(self, dimension: int = 16, dilation: int = 1) -> None:
+        super().__init__()
+        pad = ((7 - 1) * dilation) // 2  # padding for the kernel_size=7 dilated conv below
+        self.snake1 = Snake1d(dimension)
+        self.conv1 = weight_norm(nn.Conv1d(dimension, dimension, kernel_size=7, dilation=dilation, padding=pad))
+        self.snake2 = Snake1d(dimension)
+        self.conv2 = weight_norm(nn.Conv1d(dimension, dimension, kernel_size=1))
+
+    def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
+        output_tensor = self.conv1(self.snake1(hidden_state))
+        output_tensor = self.conv2(self.snake2(output_tensor))
+        padding = (hidden_state.shape[-1] - output_tensor.shape[-1]) // 2  # length lost by the convs, per side
+        if padding > 0:
+            hidden_state = hidden_state[..., padding:-padding]  # centre-crop the skip path to match
+        return hidden_state + output_tensor
+
+
+class OobleckDecoderBlock(nn.Module):
+    """Decoder block of the diffusers Oobleck decoder: Snake, transposed conv, residual units (dilation 1, 3, 9)."""
+
+    def __init__(self, input_dim: int, output_dim: int, stride: int = 1, output_padding: int = 0) -> None:
+        super().__init__()
+        self.snake1 = Snake1d(input_dim)
+        self.conv_t1 = weight_norm(
+            nn.ConvTranspose1d(
+                input_dim,
+                output_dim,
+                kernel_size=2 * stride,
+                stride=stride,  # the transposed conv is the only strided layer in this block
+                padding=math.ceil(stride / 2),
+                output_padding=output_padding,
+            )
+        )
+        self.res_unit1 = OobleckResidualUnit(output_dim, dilation=1)
+        self.res_unit2 = OobleckResidualUnit(output_dim, dilation=3)
+        self.res_unit3 = OobleckResidualUnit(output_dim, dilation=9)
+
+    def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
+        hidden_state = self.snake1(hidden_state)
+        hidden_state = self.conv_t1(hidden_state)  # input_dim -> output_dim channels
+        hidden_state = self.res_unit1(hidden_state)
+        hidden_state = self.res_unit2(hidden_state)
+        return self.res_unit3(hidden_state)
+
+
+class OobleckDecoder(nn.Module):
+    """Diffusers-compatible Oobleck decoder turning Cosmos3 AVAE latents into audio channels."""
+
+    def __init__(
+        self,
+        channels: int,
+        input_channels: int,
+        audio_channels: int,
+        upsampling_ratios: list[int],
+        channel_multiples: list[int],
+    ) -> None:
+        super().__init__()
+        strides = upsampling_ratios
+        channel_multiples = [1] + channel_multiples  # prepends 1 on a new list; the argument is left untouched
+
+        self.conv1 = weight_norm(nn.Conv1d(input_channels, channels * channel_multiples[-1], kernel_size=7, padding=3))
+
+        block = []
+        for stride_index, stride in enumerate(strides):
+            block.append(
+                OobleckDecoderBlock(
+                    input_dim=channels * channel_multiples[len(strides) - stride_index],
+                    output_dim=channels * channel_multiples[len(strides) - stride_index - 1],
+                    stride=stride,
+                    output_padding=stride % 2,  # 1 for odd strides, 0 for even ones
+                )
+            )
+        self.block = nn.ModuleList(block)
+        self.snake1 = Snake1d(channels)  # the last block outputs channels * 1 features
+        self.conv2 = weight_norm(nn.Conv1d(channels, audio_channels, kernel_size=7, padding=3, bias=False))
+
+    def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
+        hidden_state = self.conv1(hidden_state)
+        for layer in self.block:  # blocks run in the order given by upsampling_ratios
+            hidden_state = layer(hidden_state)
+        hidden_state = self.snake1(hidden_state)
+        return self.conv2(hidden_state)
+
+
+class Cosmos3AVAEAudioTokenizer(nn.Module):
+    """Decoder-only AVAE tokenizer for Cosmos3 audio latents; ``encode`` is intentionally unimplemented."""
+
+    def __init__(
+        self,
+        *,
+        checkpoint_path: str | Path,
+        config_path: str | Path | None = None,
+        sample_rate: int = 48000,
+        audio_channels: int = 2,
+        io_channels: int = 64,
+        hop_size: int = 1920,
+        normalize_latents: bool = False,
+        normalization_type: str = "none",
+        tanh_input_scale: float = 1.5,
+        tanh_output_scale: float = 3.5,
+        tanh_clamp: float = 0.995,
+        dtype: torch.dtype = torch.bfloat16,
+        device: torch.device | str = "cuda",
+    ) -> None:
+        super().__init__()
+        self.dtype = dtype
+        self.device = torch.device(device)
+
+        config = _load_config(
+            config_path,
+            sample_rate=sample_rate,
+            audio_channels=audio_channels,
+            io_channels=io_channels,
+            hop_size=hop_size,
+        )
+        self.sample_rate = int(_config_get(config, "sampling_rate", "sample_rate", default=sample_rate))
+        self.audio_channels = int(
+            _config_get(
+                config,
+                "dec_out_channels",
+                "audio_channels",
+                default=2 if bool(config.get("stereo", audio_channels == 2)) else 1,
+            )
+        )
+        self.latent_ch = int(_config_get(config, "vocoder_input_dim", "io_channels", "latent_ch", default=io_channels))
+        dec_strides = [int(stride) for stride in _config_get(config, "dec_strides", default=[2, 4, 5, 6, 8])]
+        self.hop_size = int(
+            _config_get(config, "hop_size", default=math.prod(dec_strides) if dec_strides else hop_size)
+        )
+        dec_stride_product = math.prod(dec_strides)
+        if dec_stride_product != self.hop_size:
+            raise ValueError(
+                "Cosmos3 AVAE config dec_strides product must equal hop_size "
+                f"for correct latent/audio duration math: product={dec_stride_product}, hop_size={self.hop_size}."
+            )
+
+        normalization_type = str(_config_get(config, "normalization_type", default=normalization_type))
+        normalize_latents = bool(_config_get(config, "normalize_latents", default=normalize_latents))
+        if normalization_type == "none" and normalize_latents:  # the flag alone selects "tanh"
+            normalization_type = "tanh"
+        self.normalization_type = normalization_type
+        self.tanh_input_scale = float(_config_get(config, "tanh_input_scale", default=tanh_input_scale))
+        self.tanh_output_scale = float(_config_get(config, "tanh_output_scale", default=tanh_output_scale))
+        self.tanh_clamp = float(_config_get(config, "tanh_clamp", default=tanh_clamp))
+
+        self.decoder = OobleckDecoder(
+            channels=int(_config_get(config, "dec_dim", default=320)),
+            input_channels=self.latent_ch,
+            audio_channels=self.audio_channels,
+            upsampling_ratios=list(reversed(dec_strides)),
+            channel_multiples=list(_config_get(config, "dec_c_mults", default=[1, 2, 4, 8, 16])),
+        )
+        state_dict = _load_checkpoint(checkpoint_path, self.device)
+        _validate_diffusers_state_dict(state_dict)
+
+        # strict=False is only for the checkpoint's encoder weights (unsupported here); decoder weights must all load.
+        missing_keys = self.load_state_dict(state_dict, strict=False).missing_keys
+        if missing_keys:
+            raise RuntimeError(f"Cosmos3 AVAE checkpoint is missing decoder weights: {sorted(missing_keys)}.")
+        self.eval()
+        self.requires_grad_(False)
+        self.to(device=self.device, dtype=self.dtype)
+        if _is_rank_zero():
+            logger.info("Loaded diffusers-format Cosmos3 AVAE checkpoint from %s", checkpoint_path)
+
+    @property
+    def temporal_compression_factor(self) -> int:
+        return self.hop_size
+
+    def get_latent_num_samples(self, num_audio_samples: int) -> int:
+        return int(num_audio_samples) // self.temporal_compression_factor
+
+    def get_audio_num_samples(self, num_latent_samples: int) -> int:
+        return int(num_latent_samples) * self.temporal_compression_factor
+
+    def _denormalize_latent(self, latent: torch.Tensor) -> torch.Tensor:
+        if self.normalization_type == "tanh":
+            in_dtype = latent.dtype
+            latent = torch.clamp(
+                latent.float() / self.tanh_output_scale,
+                -self.tanh_clamp,
+                self.tanh_clamp,
+            )
+            return (torch.atanh(latent) * self.tanh_input_scale).to(in_dtype)  # computed in float32, cast back
+        if self.normalization_type != "none":
+            raise ValueError(f"Unsupported AVAE normalization_type={self.normalization_type!r}.")
+        return latent
+
+    @torch.no_grad()
+    def encode(self, audio: torch.Tensor, force_pad: bool = False) -> torch.Tensor:
+        del audio, force_pad
+        raise NotImplementedError("Cosmos3AVAEAudioTokenizer is decoder-only for diffusers-format sound_tokenizer/.")
+
+    @torch.no_grad()
+    def decode(self, latent: torch.Tensor) -> torch.Tensor:
+        in_dtype = latent.dtype
+        squeeze = latent.ndim == 2  # 2-D input gets a leading dim for the decoder, removed again on return
+        if squeeze:
+            latent = latent.unsqueeze(0)
+        z = self._denormalize_latent(latent.to(self.device)).to(self.dtype)
+        audio = self.decoder(z).clamp(-1.0, 1.0).to(in_dtype)
+        return audio.squeeze(0) if squeeze else audio
diff --git a/vllm_omni/diffusion/models/cosmos3/pipeline_cosmos3.py b/vllm_omni/diffusion/models/cosmos3/pipeline_cosmos3.py
index 102b9216082..543add3ac46 100644
--- a/vllm_omni/diffusion/models/cosmos3/pipeline_cosmos3.py
+++ b/vllm_omni/diffusion/models/cosmos3/pipeline_cosmos3.py
@@ -15,6 +15,7 @@
 
 from __future__ import annotations
 
+import math
 import os
 import time
 from collections.abc import Iterable
@@ -160,6 +161,28 @@ def get_cosmos3_post_process_func(od_config: OmniDiffusionConfig):
 
     video_processor = VideoProcessor(vae_scale_factor=16)
 
+    def _sampling_param(sampling_params, key: str, default=None):
+        """Look up ``key`` in ``extra_args`` first, then as an attribute; ``None`` values fall through."""
+        extra_args = getattr(sampling_params, "extra_args", None)
+        found = extra_args.get(key) if isinstance(extra_args, dict) else None
+        found = getattr(sampling_params, key, None) if found is None else found
+        return default if found is None else found
+
+    def _resolve_output_fps(sampling_params):
+        """Pick the output frame rate from the sampling params, defaulting to 24.
+
+        Unparsable, non-finite or non-positive values also fall back to 24.
+        """
+        candidates = (_sampling_param(sampling_params, key) for key in ("resolved_frame_rate", "frame_rate", "fps"))
+        fps = next((value for value in candidates if value), 24.0)
+        try:
+            fps_value = float(fps)
+        except (TypeError, ValueError, OverflowError):
+            fps_value = 24.0
+        if not math.isfinite(fps_value) or fps_value <= 0:
+            fps_value = 24.0
+        return int(fps_value) if fps_value.is_integer() else fps_value
+
     def post_process_func(
         output: torch.Tensor | dict[str, torch.Tensor] | tuple,
         output_type: str = "np",
@@ -168,6 +191,8 @@ def post_process_func(
         if output_type == "latent":
             return output
 
+        audio = None
+        audio_sample_rate = None
         if isinstance(output, dict):
             if "image" in output and "video" in output:
                 raise ValueError("Cosmos3 output cannot contain both image and video payloads.")
@@ -177,10 +202,23 @@ def post_process_func(
                 video = output["video"]
             else:
                 raise ValueError("Cosmos3 postprocess expected an 'image' or 'video' output payload.")
+            audio = output.get("audio")
+            audio_sample_rate = output.get("audio_sample_rate")
+        elif isinstance(output, tuple):
+            if len(output) == 3:
+                video, audio, audio_sample_rate = output
+            elif len(output) == 2:
+                video, audio = output
+            else:
+                raise ValueError(
+                    "Cosmos3 postprocess expects output tensor, output dict, or (video, audio[, sample_rate]) tuple."
+                )
         else:
             video = output
 
         if isinstance(output, dict) and "image" in output:
+            if audio is not None:
+                raise ValueError("Cosmos3 text-to-image postprocess does not support audio output.")
             if video.ndim != 5 or video.shape[2] != 1:
                 raise ValueError(
                     "Cosmos3 text-to-image postprocess expects decoded output "
@@ -194,7 +232,16 @@ def post_process_func(
             return video_processor.postprocess(image, output_type="pil")
         if is_guardrails_enabled(od_config, sampling_params):
             video = check_video_safety(video)
-        return video_processor.postprocess_video(video, output_type=output_type)
+        result = {"video": video_processor.postprocess_video(video, output_type=output_type)}
+        if audio is None:
+            return result
+        if isinstance(audio, torch.Tensor):
+            audio = audio.detach().cpu()
+        result["audio"] = audio
+        result["fps"] = _resolve_output_fps(sampling_params)
+        if audio_sample_rate is not None:
+            result["audio_sample_rate"] = int(audio_sample_rate)
+        return result
 
     return post_process_func
 
@@ -317,6 +364,9 @@ def __init__(
 
         self._guidance_scale = None
         self._num_timesteps = None
+        self._sound_tokenizer = None
+        if getattr(self.transformer, "sound_gen", False):
+            self._get_sound_tokenizer()
 
         # Set True by ``enable_cache_for_cosmos3`` when cache-dit is enabled on
         # this pipeline. Tells the sequential-CFG loop to keep paired
@@ -357,9 +407,13 @@ def _remap_ckpt_key(key: str) -> str | None:
                 "proj_in.",
                 "proj_out.",
                 "time_embedder.",
+                "audio_proj_in.",
+                "audio_proj_out.",
             )
         ):
             return f"transformer.{k}"
+        if k in ("audio_modality_embed", "audio_modality_embed.weight"):
+            return "transformer.audio_modality_embed"
 
         # Skip lm_head
         if k.startswith("lm_head."):
@@ -453,12 +507,22 @@ def _remapped_weights() -> Iterable[tuple[str, torch.Tensor]]:
         loaded = loader.load_weights(_remapped_weights())
         self.transformer.post_load_weights()
         self.transformer.eval()
+        if getattr(self.transformer, "sound_gen", False):
+            sound_markers = ("audio_proj_in.", "audio_proj_out.", "audio_modality_embed")
+            missing = [marker.rstrip(".") for marker in sound_markers if not any(marker in name for name in loaded)]
+            if missing:
+                raise ValueError(
+                    "Cosmos3 transformer config enables sound generation, but "
+                    f"the checkpoint is missing sound weights for {missing}. "
+                    "Use a sound-capable transformer checkpoint."
+                )
         return loaded
 
     def predict_noise(self, **kwargs) -> torch.Tensor | tuple[torch.Tensor, ...]:
         """Override CFGParallelMixin.predict_noise for Cosmos3.
 
-        The transformer returns the raw video noise prediction.
+        The transformer returns the raw prediction: video-only as a tensor,
+        or a tuple in video, sound order for sound generation.
         """
         return self.transformer(**kwargs)
 
@@ -509,6 +573,49 @@ def _get_sp_param(sp: OmniDiffusionSamplingParams, key: str, default: Any = None
             return val
         return default
 
+    @staticmethod
+    def _truthy(value) -> bool:
+        """Interpret ``value`` as a flag; strings are true only for 1/true/yes/on (case-insensitive, trimmed)."""
+        is_text = isinstance(value, str)
+        return value.strip().lower() in {"1", "true", "yes", "on"} if is_text else bool(value)
+
+    @classmethod
+    def _get_prompt_param(cls, prompt_data, key: str, default=None):
+        """Fetch ``key`` from the prompt dict, then from its ``additional_information`` dict, else ``default``."""
+        if not isinstance(prompt_data, dict):
+            return default
+        for source in (prompt_data, prompt_data.get("additional_information")):
+            found = source.get(key) if isinstance(source, dict) else None
+            if found is not None:
+                return found
+        return default
+
+    @classmethod
+    def _is_sound_request(cls, prompt_data, sp) -> bool:
+        """Return True when any sound-enabling flag is truthy on the prompt or on the sampling params."""
+        flag_names = (
+            "sound_gen",
+            "generate_sound",
+            "enable_sound_generation",
+            "return_audio",
+            "output_audio",
+            "generate_audio",
+        )
+        return any(
+            cls._truthy(cls._get_prompt_param(prompt_data, name, None))
+            or cls._truthy(cls._get_sp_param(sp, name, None))
+            for name in flag_names
+        )
+
+    def _get_sound_tokenizer(self):
+        if not hasattr(self, "_sound_tokenizer"):  # tolerate calls made before the attribute exists
+            self._sound_tokenizer = None
+        if self._sound_tokenizer is None:  # build once, then reuse the cached instance
+            from .sound_tokenizer import Cosmos3SoundTokenizer
+
+            self._sound_tokenizer = Cosmos3SoundTokenizer.from_config(self.od_config)
+        return self._sound_tokenizer
+
     @staticmethod
     def _is_t2i_request(req: OmniDiffusionRequest) -> bool:
         """Detect text-to-image mode from request-level prompt modalities."""
@@ -721,6 +828,47 @@ def _prepare_latents(
         )
         return randn_tensor(shape, generator=generator, device=self.device, dtype=self.dtype)
 
+    def _prepare_sound_latents(
+        self,
+        target_audio_samples: int,
+        generator: torch.Generator,
+    ) -> tuple[torch.Tensor, int]:
+        """Draw initial sound noise covering ``target_audio_samples`` and return it with its frame count."""
+        tokenizer = self._get_sound_tokenizer()
+        hop_size = int(getattr(tokenizer, "hop_size", None) or getattr(tokenizer, "temporal_compression_factor"))
+        # Round up so the latent spans at least the requested audio; never fewer than one frame.
+        latent_frames = max(1, math.ceil(max(1, int(target_audio_samples)) / hop_size))
+        tokenizer_dim = int(getattr(tokenizer, "latent_ch", 64))
+        transformer_dim = int(getattr(self.transformer, "sound_dim", tokenizer_dim))
+        if tokenizer_dim != transformer_dim:
+            raise ValueError(
+                "Cosmos3 sound tokenizer latent channels do not match transformer "
+                f"sound_dim: tokenizer={tokenizer_dim}, transformer={transformer_dim}."
+            )
+        latents = randn_tensor(
+            (1, tokenizer_dim, latent_frames),
+            generator=generator,
+            device=self.device,
+            dtype=self.dtype,
+        )
+        return latents, latent_frames
+
+    def _resolve_sound_target_samples(
+        self,
+        sp,
+        num_frames: int,
+        frame_rate: float,
+    ) -> tuple[int, float, int]:
+        """Return ``(target_samples, duration_seconds, sample_rate)`` for the audio to generate."""
+        tokenizer = self._get_sound_tokenizer()
+        requested = self._get_sp_param(sp, "sound_duration", None)
+        if requested is None:
+            requested = self._get_sp_param(sp, "audio_duration", None)
+        seconds = num_frames / frame_rate if requested is None else requested  # default: match the video length
+        seconds = max(float(seconds), 1.0 / max(float(frame_rate), 1.0))  # at least one frame period
+        sample_rate = int(getattr(tokenizer, "sample_rate", 48000))
+        return max(1, int(round(seconds * sample_rate))), seconds, sample_rate
+
     # -- VAE decode ----------------------------------------------------------
 
     def _decode_latents(self, latents: torch.Tensor) -> torch.Tensor:
@@ -742,6 +890,19 @@ def _decode_latents(self, latents: torch.Tensor) -> torch.Tensor:
         video = self.vae.decode(latents, return_dict=False)[0]
         return video
 
+    def _decode_sound_latents(
+        self,
+        sound_latents: torch.Tensor,
+        target_audio_samples: int,
+    ) -> torch.Tensor:
+        """Decode sound latents to a CPU waveform trimmed or zero-padded to ``target_audio_samples``."""
+        audio = self._get_sound_tokenizer().decode(sound_latents.to(self.dtype))
+        shortfall = target_audio_samples - audio.shape[-1]
+        if shortfall > 0:
+            # F.pad in its default constant mode appends zeros at the end of the last dimension.
+            audio = torch.nn.functional.pad(audio, (0, shortfall))
+        return audio[..., :target_audio_samples].detach().cpu()
+
     # -- Prompt formatting + tokenization (shared by T2V and I2V) ------------
 
     def _format_and_tokenize_prompts(
@@ -903,11 +1064,12 @@ def diffuse(
         guidance_scale: float,
         shared_kwargs: dict,
         *,
+        sound_latents: torch.Tensor | None = None,
         velocity_mask: torch.Tensor | None = None,
         image_latent: torch.Tensor | None = None,
         condition_latents: torch.Tensor | None = None,
         guidance_interval: tuple[float, float] | None = None,
-    ) -> torch.Tensor:
+    ) -> torch.Tensor | tuple[torch.Tensor, ...]:
         """Denoising loop with 3-mode CFG support (parallel, sequential, none).
 
         Cosmos3's UND pathway is text-dependent, so CFG needs separate K/V
@@ -946,21 +1108,82 @@ def _cfg_active_at(t: torch.Tensor) -> bool:
             lo, hi = guidance_interval
             return lo <= t_scalar <= hi
 
+        def _pack_joint(
+            video_tensor: torch.Tensor,
+            sound_tensor: torch.Tensor | None = None,
+        ):
+            """Flatten each modality per sample and concatenate; also return the shapes and widths to undo it."""
+            parts = [part for part in (video_tensor, sound_tensor) if part is not None]
+            rows = video_tensor.shape[0]
+            flattened = [part.reshape(rows, -1) for part in parts]
+            widths = [flat.shape[1] for flat in flattened]
+            return torch.cat(flattened, dim=1), [part.shape for part in parts], widths
+
+        def _unpack_joint(
+            packed: torch.Tensor,
+            shapes: list[torch.Size],
+            numels: list[int],
+        ) -> tuple[torch.Tensor, ...]:
+            """Slice ``packed`` along dim 1 into consecutive chunks and restore each chunk's original shape."""
+            restored, start = [], 0
+            for shape, width in zip(shapes, numels, strict=True):
+                restored.append(packed[:, start : start + width].reshape(shape))
+                start += width
+            return tuple(restored)
+
+        def _split_noise_pred(
+            noise_pred: torch.Tensor | tuple[torch.Tensor, ...],
+        ) -> tuple[torch.Tensor, torch.Tensor | None]:
+            """Normalise a transformer prediction into a ``(video, sound_or_None)`` pair, validating its form."""
+            if sound_latents is None:
+                if isinstance(noise_pred, tuple):
+                    raise ValueError("Cosmos3 video-only diffusion received tuple predictions.")
+                return noise_pred, None
+            if not isinstance(noise_pred, tuple):
+                raise ValueError("Cosmos3 multimodal diffusion expects transformer predictions as a tuple.")
+            if len(noise_pred) != 2:
+                raise ValueError(f"Cosmos3 sound diffusion expected 2 predictions, got {len(noise_pred)}.")
+            return noise_pred[0], noise_pred[1]
+
         def _step(
-            noise_pred: torch.Tensor,
+            noise_pred: torch.Tensor | tuple[torch.Tensor, ...],
             t: torch.Tensor,
             latents: torch.Tensor,
-        ) -> torch.Tensor:
-            if isinstance(noise_pred, tuple):
-                raise ValueError("Cosmos3 noise prediction must be a single tensor; got a tuple.")
+            sound_latents: torch.Tensor | None,
+        ) -> torch.Tensor | tuple[torch.Tensor, ...]:
+            video_pred, sound_pred = _split_noise_pred(noise_pred)
             if velocity_mask is not None:
-                noise_pred = noise_pred * velocity_mask
-            latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
+                video_pred = video_pred * velocity_mask
+            if sound_latents is None:
+                latents = self.scheduler.step(video_pred, t, latents, return_dict=False)[0]
+            else:
+                packed_noise, shapes, numels = _pack_joint(video_pred, sound_pred)
+                packed_latents, _, _ = _pack_joint(latents, sound_latents)
+                packed_next = self.scheduler.step(packed_noise, t, packed_latents, return_dict=False)[0]
+                unpacked = _unpack_joint(packed_next, shapes, numels)
+                latents = unpacked[0]
+                if sound_latents is not None:
+                    sound_latents = unpacked[1]
             if condition_latents is not None and velocity_mask is not None:
                 latents = velocity_mask * latents + (1.0 - velocity_mask) * condition_latents
             elif image_latent is not None:
                 latents[:, :, 0:1, :, :] = image_latent
-            return latents
+            outputs = [latents]
+            if sound_latents is not None:
+                outputs.append(sound_latents)
+            return outputs[0] if len(outputs) == 1 else tuple(outputs)
+
+        def _assign_step_out(step_out: torch.Tensor | tuple[torch.Tensor, ...]) -> None:
+            """Store a ``_step`` result back into the enclosing ``latents`` / ``sound_latents`` variables."""
+            nonlocal latents, sound_latents
+            if sound_latents is None:
+                if not isinstance(step_out, torch.Tensor):
+                    raise ValueError("Cosmos3 video-only diffusion step returned a non-tensor result.")
+                latents = step_out
+                return
+            if not isinstance(step_out, tuple):
+                raise ValueError("Cosmos3 multimodal diffusion step returned a non-tuple result.")
+            latents, sound_latents = step_out[0], step_out[1]
 
         if cfg_parallel:
             for t in self.progress_bar(timesteps):
@@ -978,6 +1201,7 @@ def _step(
                         timestep=timestep,
                         text_ids=cond_ids,
                         text_mask=cond_mask,
+                        sound_latents=sound_latents,
                         **shared_kwargs,
                     ),
                     negative_kwargs=dict(
@@ -985,11 +1209,12 @@ def _step(
                         timestep=timestep,
                         text_ids=uncond_ids,
                         text_mask=uncond_mask,
+                        sound_latents=sound_latents,
                         **shared_kwargs,
                     ),
                     cfg_normalize=False,
                 )
-                latents = _step(noise_pred, t, latents)
+                _assign_step_out(_step(noise_pred, t, latents, sound_latents))
 
         elif do_cfg:
             cond_cache: tuple = (None, None)
@@ -1007,6 +1232,7 @@ def _step(
                     timestep=timestep,
                     text_ids=cond_ids,
                     text_mask=cond_mask,
+                    sound_latents=sound_latents,
                     **shared_kwargs,
                 )
                 if cond_cache[0] is None:
@@ -1019,6 +1245,7 @@ def _step(
                         timestep=timestep,
                         text_ids=uncond_ids,
                         text_mask=uncond_mask,
+                        sound_latents=sound_latents,
                         **shared_kwargs,
                     )
                     if uncond_cache[0] is None:
@@ -1031,7 +1258,7 @@ def _step(
                 else:
                     noise_pred = noise_cond
 
-                latents = _step(noise_pred, t, latents)
+                _assign_step_out(_step(noise_pred, t, latents, sound_latents))
 
         else:
             for t in self.progress_bar(timesteps):
@@ -1041,11 +1268,15 @@ def _step(
                     timestep=timestep,
                     text_ids=cond_ids,
                     text_mask=cond_mask,
+                    sound_latents=sound_latents,
                     **shared_kwargs,
                 )
-                latents = _step(noise_pred, t, latents)
+                _assign_step_out(_step(noise_pred, t, latents, sound_latents))
 
-        return latents
+        outputs = [latents]
+        if sound_latents is not None:
+            outputs.append(sound_latents)
+        return outputs[0] if len(outputs) == 1 else tuple(outputs)
 
     # -- Forward (main generation entry point) -------------------------------
 
@@ -1072,6 +1303,18 @@ def forward(
 
         sp = req.sampling_params
         is_t2i = self._is_t2i_request(req)
+        sound_enabled = self._is_sound_request(prompt_data, sp)
+        if sound_enabled and is_t2i:
+            raise ValueError(
+                "Cosmos3 sound generation is supported only for video outputs in "
+                "this phase; text-to-image with sound is unsupported."
+            )
+        if sound_enabled and not getattr(self.transformer, "sound_gen", False):
+            raise ValueError(
+                "Cosmos3 sound generation was requested, but the transformer was "
+                "initialized without sound modules. Check that the checkpoint config "
+                "enables sound_gen or defines sound_dim and includes sound weights."
+            )
         if negative_prompt is None:
             negative_prompt = ""
 
@@ -1163,6 +1406,13 @@ def forward(
             image_latent = None
             condition_latents = None
 
+        sound_latents = None
+        target_audio_samples = None
+        sound_sample_rate = None
+        if sound_enabled:
+            target_audio_samples, _, sound_sample_rate = self._resolve_sound_target_samples(sp, num_frames, frame_rate)
+            sound_latents, _ = self._prepare_sound_latents(target_audio_samples, generator)
+
         T_latent = latents.shape[2]
         H_latent = latents.shape[3]
         W_latent = latents.shape[4]
@@ -1184,6 +1434,7 @@ def _run_diffusion(start_latents):
                 uncond_mask=uncond_mask,
                 guidance_scale=guidance_scale,
                 shared_kwargs=shared_kwargs,
+                sound_latents=sound_latents,
                 velocity_mask=velocity_mask,
                 image_latent=image_latent,
                 condition_latents=condition_latents,
@@ -1204,7 +1455,11 @@ def _run_diffusion(start_latents):
                 samples.append(_run_diffusion(next_latents))
             latents = torch.cat(samples, dim=0)
         else:
-            latents = _run_diffusion(latents)
+            diffusion_output = _run_diffusion(latents)
+            if sound_enabled:
+                latents, sound_latents = diffusion_output
+            else:
+                latents = diffusion_output
 
         # --- Decode ---
         if _is_rank_zero():
@@ -1215,4 +1470,12 @@ def _run_diffusion(start_latents):
             logger.info("Video decoded in %.2fs", time.time() - decode_start)
             logger.info("Total pipeline time: %.2fs", time.time() - pipeline_start)
 
+        if sound_enabled:
+            if sound_latents is None or target_audio_samples is None or sound_sample_rate is None:
+                raise ValueError("Cosmos3 sound generation finished without sound latents.")
+            if _is_rank_zero():
+                logger.info("Decoding sound...")
+            audio = self._decode_sound_latents(sound_latents, target_audio_samples)
+            return DiffusionOutput(output={"video": video, "audio": audio, "audio_sample_rate": sound_sample_rate})
+
         return DiffusionOutput(output={"image": video} if is_t2i else {"video": video})
diff --git a/vllm_omni/diffusion/models/cosmos3/sound_tokenizer.py b/vllm_omni/diffusion/models/cosmos3/sound_tokenizer.py
new file mode 100644
index 00000000000..281b7e1d9f0
--- /dev/null
+++ b/vllm_omni/diffusion/models/cosmos3/sound_tokenizer.py
@@ -0,0 +1,537 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Cosmos3 sound tokenizer integration."""
+
+from __future__ import annotations
+
+import json
+import os
+from pathlib import Path
+from typing import Any
+
+import torch
+from vllm.logger import init_logger
+
+from vllm_omni.diffusion.data import OmniDiffusionConfig
+from vllm_omni.diffusion.distributed.utils import get_local_device
+from vllm_omni.diffusion.models.progress_bar import _is_rank_zero
+
+from .audio_tokenizer import Cosmos3AVAEAudioTokenizer
+
+logger = init_logger(__name__)
+
+DEFAULT_SOUND_SAMPLE_RATE = 48000
+DEFAULT_SOUND_CHANNELS = 2
+DEFAULT_SOUND_DIM = 64
+DEFAULT_SOUND_HOP_SIZE = 1920
+DEFAULT_SOUND_LATENT_FPS = DEFAULT_SOUND_SAMPLE_RATE / DEFAULT_SOUND_HOP_SIZE
+DEFAULT_SOUND_NORMALIZE_LATENTS = False
+DEFAULT_SOUND_NORMALIZATION_TYPE = "none"
+DEFAULT_SOUND_TANH_INPUT_SCALE = 1.5
+DEFAULT_SOUND_TANH_OUTPUT_SCALE = 3.5
+DEFAULT_SOUND_TANH_CLAMP = 0.995
+SOUND_TOKENIZER_COMPONENT_NAME = "sound_tokenizer"
+SOUND_TOKENIZER_CHECKPOINT_NAME = "diffusion_pytorch_model.safetensors"
+
+
+def _pipeline_args(od_config: OmniDiffusionConfig) -> dict[str, Any]:
+    return dict(getattr(od_config, "custom_pipeline_args", None) or {})
+
+
+def _config_get(config: Any, key: str, default: Any = None) -> Any:
+    if config is None:
+        return default
+    if isinstance(config, dict):
+        return config.get(key, default)
+    if hasattr(config, "get"):
+        value = config.get(key, None)
+        return default if value is None else value
+    return getattr(config, key, default)
+
+
+def _config_path_get(config: Any, *keys: str) -> Any:
+    value = config
+    for key in keys:
+        value = _config_get(value, key, None)
+        if value is None:
+            return None
+    return value
+
+
+def _sound_tokenizer_config_from(config: Any) -> Any:
+    """Return nested ``sound_tokenizer`` config from Cosmos3 config shapes."""
+    for path in (
+        ("sound_tokenizer",),
+        ("model", "config", "sound_tokenizer"),
+        ("config", "sound_tokenizer"),
+        ("model_config", "sound_tokenizer"),
+    ):
+        value = _config_path_get(config, *path)
+        if value is not None:
+            return value
+    return None
+
+
+def _nested_sound_tokenizer_configs(od_config: OmniDiffusionConfig | None) -> tuple[Any, ...]:
+    if od_config is None:
+        return ()
+    configs = []
+    for source in (
+        getattr(od_config, "model_config", None),
+        getattr(od_config, "tf_model_config", None),
+    ):
+        config = _sound_tokenizer_config_from(source)
+        if config is not None:
+            configs.append(config)
+    return tuple(configs)
+
+
+def _first_value_from_configs(configs: tuple[Any, ...], keys: tuple[str, ...]) -> Any:
+    for config in configs:
+        for key in keys:
+            value = _config_get(config, key, None)
+            if value is not None:
+                return value
+    return None
+
+
+def _top_level_model_value(od_config: OmniDiffusionConfig | None, keys: tuple[str, ...]) -> Any:
+    if od_config is None:
+        return None
+    for source in (
+        getattr(od_config, "model_config", None),
+        getattr(od_config, "tf_model_config", None),
+    ):
+        for key in keys:
+            for path in ((key,), ("model", "config", key), ("config", key), ("model_config", key)):
+                value = _config_path_get(source, *path)
+                if value is not None:
+                    return value
+    return None
+
+
+def _custom_arg_value(args: dict[str, Any], keys: tuple[str, ...]) -> Any:
+    for key in keys:
+        value = args.get(key)
+        if value is not None:
+            return value
+    return None
+
+
+def _as_bool(value: Any) -> bool:
+    if isinstance(value, str):
+        return value.strip().lower() in {"1", "true", "yes", "on"}
+    return bool(value)
+
+
+def _as_audio_channels(value: Any) -> int:
+    if isinstance(value, bool):
+        return 2 if value else 1
+    if isinstance(value, str) and value.strip().lower() in {
+        "1",
+        "0",
+        "true",
+        "false",
+        "yes",
+        "no",
+        "on",
+        "off",
+    }:
+        return 2 if _as_bool(value) else 1
+    return int(value)
+
+
+def _resolve_model_file(path: Any, model_root: str | None) -> str | None:
+    if not path:
+        return None
+    path = str(path)
+    if "://" in path or os.path.isabs(path) or os.path.exists(path) or not model_root:
+        return path
+    return str(Path(model_root) / path)
+
+
+def _load_sound_tokenizer_component_config(config_path: str | None) -> dict[str, Any]:
+    if not config_path:
+        return {}
+    with open(config_path, encoding="utf-8") as f:
+        config = json.load(f)
+    if not isinstance(config, dict):
+        raise TypeError(f"Cosmos3 sound tokenizer config must be a JSON object, got {type(config)!r}.")
+    return config
+
+
+def _component_audio_channels(config: dict[str, Any]) -> Any:
+    if config.get("dec_out_channels") is not None:
+        return config["dec_out_channels"]
+    if config.get("audio_channels") is not None:
+        return config["audio_channels"]
+    if config.get("stereo") is not None:
+        return 2 if _as_bool(config["stereo"]) else 1
+    return None
+
+
+def _component_arch_values(config: dict[str, Any]) -> dict[str, Any]:
+    values = {
+        "sample_rate": config.get("sampling_rate", config.get("sample_rate")),
+        "audio_channels": _component_audio_channels(config),
+        "io_channels": config.get("vocoder_input_dim", config.get("io_channels", config.get("latent_ch"))),
+        "hop_size": config.get("hop_size"),
+    }
+    return {key: value for key, value in values.items() if value is not None}
+
+
+def _resolve_arch_value(
+    od_config: OmniDiffusionConfig,
+    args: dict[str, Any],
+    component_values: dict[str, Any],
+    *,
+    field: str,
+    custom_keys: tuple[str, ...],
+    nested_keys: tuple[str, ...],
+    top_level_keys: tuple[str, ...],
+    default: Any,
+    cast,
+) -> Any:
+    custom_value = _custom_arg_value(args, custom_keys)
+    component_value = component_values.get(field)
+    if component_value is not None:
+        resolved = cast(component_value)
+        if custom_value is not None and cast(custom_value) != resolved:
+            raise ValueError(
+                "Conflicting Cosmos3 sound tokenizer architecture override for "
+                f"{field}: component config has {resolved!r}, custom args have {cast(custom_value)!r}."
+            )
+        return resolved
+
+    if custom_value is not None:
+        return cast(custom_value)
+
+    nested_value = _first_value_from_configs(_nested_sound_tokenizer_configs(od_config), nested_keys)
+    if nested_value is not None:
+        return cast(nested_value)
+
+    top_value = _top_level_model_value(od_config, top_level_keys)
+    if top_value is not None:
+        return cast(top_value)
+
+    return cast(default)
+
+
+def _resolve_normalization_value(
+    od_config: OmniDiffusionConfig,
+    args: dict[str, Any],
+    *,
+    name: str,
+    default: Any,
+    aliases: tuple[str, ...] = (),
+) -> Any:
+    keys = (f"sound_{name}", name, *aliases)
+    custom_value = _custom_arg_value(args, keys)
+    if custom_value is not None:
+        return custom_value
+    nested_value = _first_value_from_configs(_nested_sound_tokenizer_configs(od_config), (name, *aliases))
+    return default if nested_value is None else nested_value
+
+
+def get_sound_config_value(
+    od_config: OmniDiffusionConfig,
+    name: str,
+    default: Any,
+    aliases: tuple[str, ...] = (),
+) -> Any:
+    """Return the first non-``None`` value stored under ``name`` or one of ``aliases``.
+
+    Backward-compatible generic accessor.  Prefer the more specific helpers
+    below for Cosmos3 sound tokenizer fields so precedence stays explicit.
+    Lookup order: ``custom_pipeline_args``, ``model_config``, ``tf_model_config``;
+    ``default`` is returned when none of them provides a value.
+    """
+    sources = (
+        _pipeline_args(od_config),
+        getattr(od_config, "model_config", None),
+        getattr(od_config, "tf_model_config", None),
+    )
+    for source in sources:
+        for key in (name, *aliases):
+            value = _config_get(source, key, None)
+            if value is not None:
+                return value
+    return default
+
+
+def get_sound_sample_rate(od_config: OmniDiffusionConfig) -> int:
+    args = _pipeline_args(od_config)
+    return _resolve_arch_value(
+        od_config,
+        args,
+        {},
+        field="sample_rate",
+        custom_keys=("sound_sample_rate", "sample_rate"),
+        nested_keys=("sample_rate", "sampling_rate"),
+        top_level_keys=("sound_sample_rate", "sample_rate"),
+        default=DEFAULT_SOUND_SAMPLE_RATE,
+        cast=int,
+    )
+
+
+def get_sound_channels(od_config: OmniDiffusionConfig) -> int:
+    args = _pipeline_args(od_config)
+    return _resolve_arch_value(
+        od_config,
+        args,
+        {},
+        field="audio_channels",
+        custom_keys=("sound_audio_channels", "audio_channels", "stereo"),
+        nested_keys=("audio_channels", "dec_out_channels", "stereo"),
+        top_level_keys=("sound_audio_channels", "audio_channels", "stereo"),
+        default=DEFAULT_SOUND_CHANNELS,
+        cast=_as_audio_channels,
+    )
+
+
+def get_sound_dim(od_config: OmniDiffusionConfig | None) -> int:
+    if od_config is None:
+        return DEFAULT_SOUND_DIM
+    args = _pipeline_args(od_config)
+    custom_value = _custom_arg_value(args, ("sound_dim", "io_channels", "latent_ch"))
+    if custom_value is not None:
+        return int(custom_value)
+    top_value = _top_level_model_value(od_config, ("sound_dim",))
+    if top_value is not None:
+        return int(top_value)
+    nested_value = _first_value_from_configs(
+        _nested_sound_tokenizer_configs(od_config),
+        ("io_channels", "vocoder_input_dim", "latent_ch"),
+    )
+    return int(DEFAULT_SOUND_DIM if nested_value is None else nested_value)
+
+
+def get_sound_hop_size(od_config: OmniDiffusionConfig) -> int:
+    args = _pipeline_args(od_config)
+    return _resolve_arch_value(
+        od_config,
+        args,
+        {},
+        field="hop_size",
+        custom_keys=("sound_hop_size", "hop_size"),
+        nested_keys=("hop_size",),
+        top_level_keys=("sound_hop_size", "hop_size"),
+        default=DEFAULT_SOUND_HOP_SIZE,
+        cast=int,
+    )
+
+
+def get_sound_latent_fps(od_config: OmniDiffusionConfig | None) -> float:
+    if od_config is None:
+        return DEFAULT_SOUND_LATENT_FPS
+    args = _pipeline_args(od_config)
+    custom_value = _custom_arg_value(args, ("sound_latent_fps",))
+    if custom_value is not None:
+        return float(custom_value)
+    top_value = _top_level_model_value(od_config, ("sound_latent_fps",))
+    if top_value is not None:
+        return float(top_value)
+    nested_configs = _nested_sound_tokenizer_configs(od_config)
+    nested_fps = _first_value_from_configs(nested_configs, ("sound_latent_fps", "latent_fps"))
+    if nested_fps is not None:
+        return float(nested_fps)
+    sample_rate = _first_value_from_configs(nested_configs, ("sample_rate", "sampling_rate"))
+    hop_size = _first_value_from_configs(nested_configs, ("hop_size",))
+    if sample_rate is not None and hop_size is not None:
+        return float(sample_rate) / float(hop_size)
+    return float(DEFAULT_SOUND_LATENT_FPS)
+
+
+class Cosmos3SoundTokenizer:
+    """Thin adapter around the local AVAE tokenizer implementation."""
+
+    def __init__(self, tokenizer: Any) -> None:
+        self.tokenizer = tokenizer
+        self.sample_rate = int(getattr(tokenizer, "sample_rate", DEFAULT_SOUND_SAMPLE_RATE))
+        self.audio_channels = int(getattr(tokenizer, "audio_channels", DEFAULT_SOUND_CHANNELS))
+        self.latent_ch = int(getattr(tokenizer, "latent_ch", DEFAULT_SOUND_DIM))
+        self.hop_size = int(getattr(tokenizer, "temporal_compression_factor", DEFAULT_SOUND_HOP_SIZE))
+
+    @classmethod
+    def from_config(cls, od_config: OmniDiffusionConfig) -> Cosmos3SoundTokenizer:
+        args = _pipeline_args(od_config)
+        model_path = getattr(od_config, "model", None)
+        explicit_avae_path = (
+            args.get("sound_tokenizer_path")
+            or args.get("avae_path")
+            or args.get("cosmos3_avae_path")
+            or os.environ.get("COSMOS3_SOUND_TOKENIZER_PATH")
+        )
+        explicit_config_path = args.get("sound_tokenizer_config_path") or os.environ.get(
+            "COSMOS3_SOUND_TOKENIZER_CONFIG_PATH"
+        )
+
+        model_root = str(model_path) if model_path and os.path.isdir(model_path) else None
+        if model_root is None and model_path and not explicit_avae_path:
+            from huggingface_hub import snapshot_download
+
+            model_root = snapshot_download(
+                repo_id=str(model_path),
+                revision=getattr(od_config, "revision", None),
+                allow_patterns=[
+                    f"{SOUND_TOKENIZER_COMPONENT_NAME}/config.json",
+                    f"{SOUND_TOKENIZER_COMPONENT_NAME}/{SOUND_TOKENIZER_CHECKPOINT_NAME}",
+                ],
+            )
+
+        if explicit_avae_path:
+            avae_path = _resolve_model_file(explicit_avae_path, model_root)
+        else:
+            tokenizer_dir = Path(model_root) / SOUND_TOKENIZER_COMPONENT_NAME if model_root else None
+            candidate = tokenizer_dir / SOUND_TOKENIZER_CHECKPOINT_NAME if tokenizer_dir else None
+            avae_path = str(candidate) if candidate and candidate.exists() else None
+
+        if not avae_path:
+            raise ValueError(
+                "Cosmos3 sound generation was requested, but no AVAE sound "
+                "tokenizer checkpoint was provided. Set "
+                "custom_pipeline_args['sound_tokenizer_path'] or "
+                "COSMOS3_SOUND_TOKENIZER_PATH, or include "
+                f"{SOUND_TOKENIZER_COMPONENT_NAME}/{SOUND_TOKENIZER_CHECKPOINT_NAME} under the model path."
+            )
+
+        config_path = _resolve_model_file(explicit_config_path, model_root)
+        if config_path is None and model_root:
+            candidate = Path(model_root) / SOUND_TOKENIZER_COMPONENT_NAME / "config.json"
+            config_path = str(candidate) if candidate.exists() else None
+        component_config = _load_sound_tokenizer_component_config(config_path)
+        component_values = _component_arch_values(component_config)
+
+        sample_rate = _resolve_arch_value(
+            od_config,
+            args,
+            component_values,
+            field="sample_rate",
+            custom_keys=("sound_sample_rate", "sample_rate"),
+            nested_keys=("sample_rate", "sampling_rate"),
+            top_level_keys=("sound_sample_rate", "sample_rate"),
+            default=DEFAULT_SOUND_SAMPLE_RATE,
+            cast=int,
+        )
+        audio_channels = _resolve_arch_value(
+            od_config,
+            args,
+            component_values,
+            field="audio_channels",
+            custom_keys=("sound_audio_channels", "audio_channels", "stereo"),
+            nested_keys=("audio_channels", "dec_out_channels", "stereo"),
+            top_level_keys=("sound_audio_channels", "audio_channels", "stereo"),
+            default=DEFAULT_SOUND_CHANNELS,
+            cast=_as_audio_channels,
+        )
+        sound_dim = _resolve_arch_value(
+            od_config,
+            args,
+            component_values,
+            field="io_channels",
+            custom_keys=("sound_dim", "io_channels", "latent_ch"),
+            nested_keys=("io_channels", "vocoder_input_dim", "latent_ch"),
+            top_level_keys=("sound_dim",),
+            default=DEFAULT_SOUND_DIM,
+            cast=int,
+        )
+        hop_size = _resolve_arch_value(
+            od_config,
+            args,
+            component_values,
+            field="hop_size",
+            custom_keys=("sound_hop_size", "hop_size"),
+            nested_keys=("hop_size",),
+            top_level_keys=("sound_hop_size", "hop_size"),
+            default=DEFAULT_SOUND_HOP_SIZE,
+            cast=int,
+        )
+        normalize_latents = _as_bool(
+            _resolve_normalization_value(
+                od_config,
+                args,
+                name="normalize_latents",
+                default=DEFAULT_SOUND_NORMALIZE_LATENTS,
+            )
+        )
+        normalization_type = str(
+            _resolve_normalization_value(
+                od_config,
+                args,
+                name="normalization_type",
+                default=DEFAULT_SOUND_NORMALIZATION_TYPE,
+            )
+        )
+        tanh_input_scale = float(
+            _resolve_normalization_value(
+                od_config,
+                args,
+                name="tanh_input_scale",
+                default=DEFAULT_SOUND_TANH_INPUT_SCALE,
+            )
+        )
+        tanh_output_scale = float(
+            _resolve_normalization_value(
+                od_config,
+                args,
+                name="tanh_output_scale",
+                default=DEFAULT_SOUND_TANH_OUTPUT_SCALE,
+            )
+        )
+        tanh_clamp = float(
+            _resolve_normalization_value(
+                od_config,
+                args,
+                name="tanh_clamp",
+                default=DEFAULT_SOUND_TANH_CLAMP,
+            )
+        )
+        tokenizer = Cosmos3AVAEAudioTokenizer(
+            checkpoint_path=str(avae_path),
+            config_path=config_path,
+            sample_rate=sample_rate,
+            audio_channels=audio_channels,
+            io_channels=sound_dim,
+            hop_size=hop_size,
+            normalize_latents=normalize_latents,
+            normalization_type=normalization_type,
+            tanh_input_scale=tanh_input_scale,
+            tanh_output_scale=tanh_output_scale,
+            tanh_clamp=tanh_clamp,
+            dtype=getattr(od_config, "dtype", torch.bfloat16),
+            device=get_local_device(),
+        )
+        if _is_rank_zero():
+            logger.info(
+                "Loaded Cosmos3 AVAE sound tokenizer from %s (sr=%d, channels=%d, latent_ch=%d, hop=%d)",
+                avae_path,
+                sample_rate,
+                audio_channels,
+                sound_dim,
+                hop_size,
+            )
+        return cls(tokenizer)
+
+    def get_latent_num_samples(self, num_audio_samples: int) -> int:
+        return int(self.tokenizer.get_latent_num_samples(num_audio_samples))
+
+    def get_audio_num_samples(self, num_latent_samples: int) -> int:
+        return int(self.tokenizer.get_audio_num_samples(num_latent_samples))
+
+    @torch.no_grad()
+    def decode(self, latents: torch.Tensor) -> torch.Tensor:
+        """Decode sound latents.
+
+        Args:
+            latents: ``[B, C, T]`` or ``[C, T]`` tensor.
+
+        Returns:
+            ``[B, audio_channels, N]`` tensor for batched input, or
+            ``[audio_channels, N]`` for unbatched input.
+        """
+        squeeze = latents.ndim == 2
+        if squeeze:
+            latents = latents.unsqueeze(0)
+        audio = self.tokenizer.decode(latents)
+        audio = audio.clamp(-1.0, 1.0)
+        return audio.squeeze(0) if squeeze else audio
diff --git a/vllm_omni/diffusion/models/cosmos3/transformer_cosmos3.py b/vllm_omni/diffusion/models/cosmos3/transformer_cosmos3.py
index 7b3848a089a..49af3fde3d0 100644
--- a/vllm_omni/diffusion/models/cosmos3/transformer_cosmos3.py
+++ b/vllm_omni/diffusion/models/cosmos3/transformer_cosmos3.py
@@ -76,6 +76,51 @@ def _tf_config_get(config: Any, key: str, default: Any) -> Any:
     return getattr(config, key, default)
 
 
+def _nested_get(value: Any, key: str) -> Any:
+    if isinstance(value, dict):
+        if key in value:
+            return value[key]
+        for child in value.values():
+            found = _nested_get(child, key)
+            if found is not None:
+                return found
+    elif isinstance(value, list | tuple):
+        for child in value:
+            found = _nested_get(child, key)
+            if found is not None:
+                return found
+    return None
+
+
+def _od_config_get(od_config: Any, key: str, default: Any = None) -> Any:
+    """Read Cosmos3 options from runtime, model, or transformer config.
+
+    Sources are tried in order: ``custom_pipeline_args``, ``model_config``,
+    ``tf_model_config``; dict sources are searched recursively.  ``None``
+    counts as missing, so later sources and ``default`` still apply.
+    """
+    if od_config is None:
+        return default
+    tf_model_config = getattr(od_config, "tf_model_config", None)
+    sources = (
+        getattr(od_config, "custom_pipeline_args", None),
+        getattr(od_config, "model_config", None),
+        tf_model_config,
+    )
+    for source in sources:
+        found = _nested_get(source, key) if isinstance(source, dict) else None
+        if found is not None:
+            return found
+    value = _tf_config_get(tf_model_config, key, None)
+    return default if value is None else value
+
+
+def _as_bool(value: Any) -> bool:
+    if isinstance(value, str):
+        return value.strip().lower() in {"1", "true", "yes", "on"}
+    return bool(value)
+
+
 # ---------------------------------------------------------------------------
 # Rotary Position Embeddings (mRoPE)
 # ---------------------------------------------------------------------------
@@ -138,6 +183,30 @@ def compute_mrope_position_ids_vision(
     return mrope_ids, next_offset
 
 
+def compute_mrope_position_ids_sound(
+    grid_t: int,
+    temporal_offset: int | float,
+    sound_latent_fps: float,
+    base_fps: float = 24.0,
+    temporal_compression_factor_sound: int = 1,
+    enable_fps_modulation: bool = True,
+    base_temporal_compression_factor: int | None = None,
+) -> tuple[torch.Tensor, int | float]:
+    """Generate mRoPE IDs for sound tokens as a (T, 1, 1) grid."""
+    del base_temporal_compression_factor
+    return compute_mrope_position_ids_vision(
+        grid_t=grid_t,
+        grid_h=1,
+        grid_w=1,
+        temporal_offset=temporal_offset,
+        fps=sound_latent_fps,
+        base_fps=base_fps,
+        temporal_compression_factor=temporal_compression_factor_sound,
+        base_temporal_compression_factor=temporal_compression_factor_sound,
+        enable_fps_modulation=enable_fps_modulation,
+    )
+
+
 class Qwen3VLTextRotaryEmbedding(nn.Module):
     """Multi-dimensional rotary position embedding for Qwen3-VL."""
 
@@ -859,9 +928,25 @@ def __init__(
         self.latent_channel_size = int(_tf_config_get(model_config, "latent_channel", 48))
         self.timestep_scale = float(_tf_config_get(model_config, "timestep_scale", 0.001))
         self.base_fps = float(_tf_config_get(model_config, "base_fps", 24.0))
+        sound_gen_value = _od_config_get(od_config, "sound_gen", None)
+        sound_dim_value = _od_config_get(od_config, "sound_dim", None)
+        if sound_dim_value is None:
+            sound_dim_value = _od_config_get(od_config, "io_channels", None)
+        if sound_dim_value is None:
+            sound_dim_value = _od_config_get(od_config, "vocoder_input_dim", None)
+        if sound_dim_value is None:
+            sound_dim_value = _od_config_get(od_config, "latent_ch", None)
+        self.sound_gen = _as_bool(sound_gen_value) if sound_gen_value is not None else sound_dim_value is not None
+        from .sound_tokenizer import get_sound_dim, get_sound_latent_fps
+
+        self.sound_dim = int(sound_dim_value if sound_dim_value is not None else get_sound_dim(od_config))
+        self.sound_latent_fps = float(get_sound_latent_fps(od_config))
         if temporal_compression_factor is None:
             temporal_compression_factor = _tf_config_get(model_config, "temporal_compression_factor", 4)
         self.temporal_compression_factor = int(temporal_compression_factor)
+        self.temporal_compression_factor_sound = int(
+            _tf_config_get(model_config, "temporal_compression_factor_sound", 1)
+        )
         self.enable_fps_modulation = bool(_tf_config_get(model_config, "enable_fps_modulation", True))
         self.temporal_modality_margin = int(
             _tf_config_get(
@@ -894,6 +979,12 @@ def __init__(
         self.proj_in = nn.Linear(self.patch_latent_dim, self.hidden_size)
         self.proj_out = nn.Linear(self.hidden_size, self.patch_latent_dim)
         self.time_embedder = TimestepEmbedder(self.hidden_size, target_dtype=dtype)
+        if self.sound_gen:
+            # Sound-only modules.  time_embedder is already built above with the
+            # configured dtype, so it is not re-created (pinned to bf16) here.
+            self.audio_proj_in = nn.Linear(self.sound_dim, self.hidden_size)
+            self.audio_proj_out = nn.Linear(self.hidden_size, self.sound_dim)
+            self.audio_modality_embed = nn.Parameter(torch.zeros(self.hidden_size))
 
         self.gen_layers = nn.ModuleList(
             [
@@ -962,6 +1053,21 @@ def unpatchify(self, tokens: torch.Tensor, t: int, h: int, w: int) -> torch.Tens
             x = x[:, :, :, :h, :w]
         return x
 
+    def pack_sound(self, sound_latents: torch.Tensor) -> torch.Tensor:
+        """[B, C_sound, T_sound] -> [B, T_sound, C_sound]."""
+        if sound_latents.ndim != 3:
+            raise ValueError(f"Cosmos3 sound latents must have shape [B, C, T], got {tuple(sound_latents.shape)}.")
+        if sound_latents.shape[1] != self.sound_dim:
+            raise ValueError(
+                f"Cosmos3 sound latent channel mismatch: expected {self.sound_dim}, got {sound_latents.shape[1]}."
+            )
+        return sound_latents.permute(0, 2, 1).contiguous()
+
+    @staticmethod
+    def unpack_sound(tokens: torch.Tensor) -> torch.Tensor:
+        """[B, T_sound, C_sound] -> [B, C_sound, T_sound]."""
+        return tokens.permute(0, 2, 1).contiguous()
+
     # -- RoPE computation ----------------------------------------------------
 
     def _compute_rope_freqs(
@@ -973,12 +1079,14 @@ def _compute_rope_freqs(
         fps: float | None,
         device: torch.device,
         dtype: torch.dtype,
+        t_sound: int | None = None,
     ) -> tuple[tuple[torch.Tensor, torch.Tensor], tuple[torch.Tensor, torch.Tensor]]:
         """Compute mRoPE cos/sin for UND text and GEN media pathways."""
         B = text_mask.shape[0]
         S_text = text_mask.shape[1]
         text_lengths = text_mask.sum(dim=1).long()
         effective_fps = fps if fps is not None and t > 1 else None
+        sound_frames = int(t_sound or 0)
 
         text_pos_list = []
         gen_pos_list = []
@@ -996,6 +1104,21 @@ def _compute_rope_freqs(
                 temporal_compression_factor=self.temporal_compression_factor,
                 enable_fps_modulation=self.enable_fps_modulation,
             )
+            gen_positions = [v_pos]
+            if sound_frames > 0:
+                s_pos, _ = compute_mrope_position_ids_sound(
+                    sound_frames,
+                    temporal_offset=media_temporal_offset,
+                    sound_latent_fps=self.sound_latent_fps,
+                    base_fps=self.base_fps,
+                    temporal_compression_factor_sound=getattr(self, "temporal_compression_factor_sound", 1),
+                    enable_fps_modulation=self.enable_fps_modulation,
+                )
+                gen_positions.append(s_pos)
+            pos_dtype = gen_positions[0].dtype
+            for pos in gen_positions[1:]:
+                pos_dtype = torch.promote_types(pos_dtype, pos.dtype)
+            v_pos = torch.cat([pos.to(pos_dtype) for pos in gen_positions], dim=1)
             if real_len < S_text:
                 t_pos = torch.cat(
                     [t_pos, torch.zeros(3, S_text - real_len, dtype=t_pos.dtype)],
@@ -1026,16 +1149,31 @@ def reset_cache(self) -> None:
     def _validate_gen_sequence_parallel(
         *,
         s_gen: int,
+        s_video: int,
+        s_sound: int,
+        has_sound: bool,
         ulysses_size: int,
     ) -> None:
         if ulysses_size <= 1 or s_gen % ulysses_size == 0:
             return
 
+        detail_parts = [f"video tokens {s_video}"]
+        if has_sound:
+            detail_parts.append(f"sound tokens {s_sound}")
+        detail = " = " + " + ".join(detail_parts) if len(detail_parts) > 1 else ""
         adjust_detail = (
-            "Adjust the spatial resolution so that t * ceil(h/patch) * ceil(w/patch) is a multiple of ulysses_degree."
+            "Adjust the spatial resolution, frame count, sound duration, "
+            "or sound latent FPS so the combined media sequence is a "
+            "multiple of ulysses_degree."
+            if has_sound
+            else (
+                "Adjust the spatial resolution so that "
+                "t * ceil(h/patch) * ceil(w/patch) is a multiple "
+                "of ulysses_degree."
+            )
         )
         raise ValueError(
-            f"GEN sequence length ({s_gen} video tokens) must be divisible by "
+            f"GEN sequence length ({s_gen}{detail}) must be divisible by "
             f"ulysses_degree ({ulysses_size}). {adjust_detail}"
         )
 
@@ -1049,9 +1187,10 @@ def forward(
         text_mask: torch.Tensor,
         video_shape: tuple[int, int, int],
         fps: float | None = None,
+        sound_latents: torch.Tensor | None = None,
         noisy_frame_mask: torch.Tensor | None = None,
         **kwargs,
-    ) -> torch.Tensor:
+    ) -> torch.Tensor | tuple[torch.Tensor, ...]:
         """
         Args:
             hidden_states: [B, C, t, h, w] noisy latents
@@ -1060,13 +1199,15 @@ def forward(
             text_mask: [B, S_text] attention mask (1=real, 0=pad)
             video_shape: (t, h, w) in latent space
             fps: video frame rate for temporal mRoPE modulation
+            sound_latents: Optional [B, C_sound, T_sound] noisy sound latents.
             noisy_frame_mask: Optional [B, 1, t, 1, 1] mask where 1=noisy (add
                 timestep embedding, predict velocity) and 0=conditioned (clean
                 context, skip timestep embedding).  None means all frames noisy
                 (T2V mode).
 
         Returns:
-            [B, C, t, h, w] velocity prediction.
+            [B, C, t, h, w] velocity prediction, or
+            tuple outputs in video, sound order when sound latents are provided.
         """
         t, h, w = video_shape
         hp, wp, _, _ = self._pad_to_patch_size(h, w)
@@ -1078,12 +1219,31 @@ def forward(
                 f"Cosmos3 requires identical real text lengths within a batch "
                 f"(got min={min_real_len}, max={max_real_len})."
             )
+        has_sound = sound_latents is not None
+        if has_sound and not self.sound_gen:
+            raise ValueError(
+                "Cosmos3 sound generation was requested, but this transformer "
+                "was initialized without sound modules. Check that the "
+                "transformer config enables sound_gen or defines sound_dim."
+            )
 
         # Query Ulysses state at runtime
         ulysses_size, _, _ = _get_ulysses_state()
 
         # Patchify latents and project to hidden space
         hidden_video = self.proj_in(self.patchify(hidden_states, t, h, w))
+        s_video = hidden_video.shape[1]
+        s_sound = 0
+        hidden_sound = None
+        if sound_latents is not None:
+            if sound_latents.shape[0] != hidden_states.shape[0]:
+                raise ValueError(
+                    "Cosmos3 sound and video batch sizes must match: "
+                    f"video={hidden_states.shape[0]}, sound={sound_latents.shape[0]}."
+                )
+            hidden_sound = self.audio_proj_in(self.pack_sound(sound_latents))
+            hidden_sound = hidden_sound + self.audio_modality_embed.to(hidden_sound.dtype)
+            s_sound = hidden_sound.shape[1]
 
         # Timestep embedding (fp32 for precision).
         # For I2V: only add to noisy tokens, not conditioned ones.
@@ -1106,7 +1266,12 @@ def forward(
         else:
             hidden_video = hidden_video + time_embed.unsqueeze(1)
 
-        hidden_gen = hidden_video
+        if hidden_sound is not None:
+            hidden_sound = hidden_sound + time_embed.unsqueeze(1)
+        hidden_parts = [hidden_video]
+        if hidden_sound is not None:
+            hidden_parts.append(hidden_sound)
+        hidden_gen = torch.cat(hidden_parts, dim=1)
 
         # Run UND pathway once and cache K/V (replicated across all ranks)
         if self.cached_kv is None:
@@ -1118,6 +1283,7 @@ def forward(
                 fps,
                 hidden_states.device,
                 hidden_states.dtype,
+                t_sound=s_sound,
             )
             cached_kv_full = self.language_model(text_ids, freqs_und)
             self.cached_freqs_gen = freqs_gen
@@ -1133,6 +1299,9 @@ def forward(
             raise RuntimeError("Cosmos3 GEN cache was not initialized before running GEN layers.")
         self._validate_gen_sequence_parallel(
             s_gen=hidden_gen.shape[1],
+            s_video=s_video,
+            s_sound=s_sound,
+            has_sound=has_sound,
             ulysses_size=ulysses_size,
         )
         freqs_cos, freqs_sin = self.cached_freqs_gen
@@ -1166,7 +1335,21 @@ def forward(
 
         # Final norm and project back to latent space
         hidden_gen = self.norm_moe_gen(hidden_gen)
-        return self.unpatchify(self.proj_out(hidden_gen), t, h, w)
+        if not has_sound:
+            return self.unpatchify(self.proj_out(hidden_gen), t, h, w)
+
+        split_sizes = [s_video]
+        if has_sound:
+            split_sizes.append(s_sound)
+        split_hidden = hidden_gen.split(split_sizes, dim=1)
+        hidden_video = split_hidden[0]
+        video_pred = self.unpatchify(self.proj_out(hidden_video), t, h, w)
+        outputs: list[torch.Tensor] = [video_pred]
+        split_idx = 1
+        if has_sound:
+            hidden_sound = split_hidden[split_idx]
+            outputs.append(self.unpack_sound(self.audio_proj_out(hidden_sound)))
+        return tuple(outputs)
 
     def post_load_weights(self) -> None:
         """Post-load processing: ensure correct dtypes."""
diff --git a/vllm_omni/entrypoints/openai/api_server.py b/vllm_omni/entrypoints/openai/api_server.py
index 2c738853128..28acc7379c9 100644
--- a/vllm_omni/entrypoints/openai/api_server.py
+++ b/vllm_omni/entrypoints/openai/api_server.py
@@ -2627,6 +2627,8 @@ async def _parse_video_form(
     flow_shift: float | None = Form(default=None),
     true_cfg_scale: float | None = Form(default=None),
     seed: int | None = Form(default=None),
+    generate_sound: bool | None = Form(default=None),
+    sound_duration: float | None = Form(default=None, gt=0.0),
     negative_prompt: str | None = Form(default=None),
     enable_frame_interpolation: bool | None = Form(default=None),
     frame_interpolation_exp: int | None = Form(default=None, ge=1),
@@ -2667,6 +2669,8 @@ async def _parse_video_form(
         "flow_shift": flow_shift,
         "true_cfg_scale": true_cfg_scale,
         "seed": seed,
+        "generate_sound": generate_sound,
+        "sound_duration": sound_duration,
         "negative_prompt": negative_prompt,
         "enable_frame_interpolation": enable_frame_interpolation,
         "frame_interpolation_exp": frame_interpolation_exp,
diff --git a/vllm_omni/entrypoints/openai/protocol/videos.py b/vllm_omni/entrypoints/openai/protocol/videos.py
index d46c8d43d6b..887e3ce67ea 100644
--- a/vllm_omni/entrypoints/openai/protocol/videos.py
+++ b/vllm_omni/entrypoints/openai/protocol/videos.py
@@ -149,6 +149,15 @@ class VideoGenerationRequest(BaseModel):
         description="True CFG scale (model-specific parameter, may be ignored if not supported)",
     )
     seed: int | None = Field(default=None, description="Random seed for reproducibility")
+    generate_sound: bool = Field(
+        default=False,
+        description="Request model-generated audio for video models that support sound generation.",
+    )
+    sound_duration: float | None = Field(
+        default=None,
+        gt=0.0,
+        description="Duration in seconds for model-generated audio. Defaults to the generated video duration.",
+    )
 
     # vllm-omni extensions for post-generation frame interpolation.
     enable_frame_interpolation: bool = Field(
diff --git a/vllm_omni/entrypoints/openai/serving_video.py b/vllm_omni/entrypoints/openai/serving_video.py
index b6ed49996fe..57a76594a0f 100644
--- a/vllm_omni/entrypoints/openai/serving_video.py
+++ b/vllm_omni/entrypoints/openai/serving_video.py
@@ -148,6 +148,10 @@ async def _run_and_extract(
         )
         if "flow_shift" in provided_fields and request.flow_shift is not None:
             gen_params.extra_args["flow_shift"] = request.flow_shift
+        if "generate_sound" in provided_fields:
+            gen_params.extra_args["generate_sound"] = request.generate_sound
+        if "sound_duration" in provided_fields and request.sound_duration is not None:
+            gen_params.extra_args["sound_duration"] = request.sound_duration
 
         # Apply model-specific extra parameters
         if request.extra_params is not None:

From 1b7e40d1e6ae95f14c357a297ae292020b5e1a2f Mon Sep 17 00:00:00 2001
From: Maciej Bala <mbala@nvidia.com>
Date: Tue, 2 Jun 2026 10:52:47 +0200
Subject: [PATCH 02/11] Fix tests; small improvements

Signed-off-by: Maciej Bala <mbala@nvidia.com>
Signed-off-by: lishunyang12 <lishunyang12@163.com>
---
 tests/diffusion/models/cosmos3/conftest.py    | 185 ------------------
 .../models/cosmos3/test_cosmos3_pipeline.py   |  17 +-
 .../cosmos3/test_cosmos3_transformer.py       |   9 +-
 .../models/cosmos3/pipeline_cosmos3.py        |  11 +-
 .../models/cosmos3/transformer_cosmos3.py     |   2 -
 5 files changed, 26 insertions(+), 198 deletions(-)
 delete mode 100644 tests/diffusion/models/cosmos3/conftest.py

diff --git a/tests/diffusion/models/cosmos3/conftest.py b/tests/diffusion/models/cosmos3/conftest.py
deleted file mode 100644
index 7075065447c..00000000000
--- a/tests/diffusion/models/cosmos3/conftest.py
+++ /dev/null
@@ -1,185 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-from __future__ import annotations
-
-import sys
-import types
-from types import SimpleNamespace
-from typing import Any
-
-import pytest
-import torch
-from torch import nn
-
-
-class StubScheduler:
-    def __init__(self, timesteps: list[int] | None = None, *, flow_shift: float = 1.0) -> None:
-        self.timesteps = torch.tensor(timesteps or [9, 3], dtype=torch.int64)
-        self.config = SimpleNamespace(num_train_timesteps=1000, flow_shift=flow_shift)
-        self.set_timesteps_calls: list[tuple[int, torch.device]] = []
-        self.step_calls: list[tuple[torch.Tensor, torch.Tensor, torch.Tensor]] = []
-
-    def set_timesteps(self, num_steps: int, device: torch.device) -> None:
-        self.set_timesteps_calls.append((num_steps, device))
-        self.timesteps = torch.arange(num_steps, 0, -1, dtype=torch.int64, device=device)
-
-    def step(self, noise_pred: torch.Tensor, timestep: torch.Tensor, latents: torch.Tensor, **kwargs):
-        del kwargs
-        self.step_calls.append((noise_pred.clone(), timestep.clone(), latents.clone()))
-        return (latents + noise_pred,)
-
-
-class _ModeLatentDist:
-    def __init__(self, latents: torch.Tensor) -> None:
-        self._latents = latents
-
-    def mode(self) -> torch.Tensor:
-        return self._latents
-
-
-class StubCosmos3VAE:
-    dtype = torch.float32
-
-    def __init__(self, z_dim: int = 2, *, temporal: int = 4, spatial: int = 8) -> None:
-        self.config = SimpleNamespace(
-            z_dim=z_dim,
-            scale_factor_temporal=temporal,
-            scale_factor_spatial=spatial,
-            latents_mean=[0.0] * z_dim,
-            latents_std=[1.0] * z_dim,
-        )
-
-    def encode(self, video: torch.Tensor):
-        latent_frames = (video.shape[2] - 1) // self.config.scale_factor_temporal + 1
-        latent_height = video.shape[-2] // self.config.scale_factor_spatial
-        latent_width = video.shape[-1] // self.config.scale_factor_spatial
-        latents = torch.ones(
-            video.shape[0],
-            self.config.z_dim,
-            latent_frames,
-            latent_height,
-            latent_width,
-            dtype=video.dtype,
-            device=video.device,
-        )
-        return SimpleNamespace(latent_dist=_ModeLatentDist(latents))
-
-    def decode(self, latents: torch.Tensor, return_dict: bool = False):
-        del return_dict
-        return (latents,)
-
-
-class StubCosmos3Transformer(nn.Module):
-    def __init__(
-        self,
-        *,
-        latent_channel_size: int = 2,
-        sound_gen: bool = False,
-        sound_dim: int = 3,
-    ) -> None:
-        super().__init__()
-        self.latent_channel_size = latent_channel_size
-        self.sound_gen = sound_gen
-        self.sound_dim = sound_dim
-        self.cached_kv: Any | None = None
-        self.cached_freqs_gen: Any | None = None
-        self.calls: list[dict[str, Any]] = []
-        self.reset_calls = 0
-
-    def reset_cache(self) -> None:
-        self.reset_calls += 1
-        self.cached_kv = None
-        self.cached_freqs_gen = None
-
-    def forward(
-        self,
-        *,
-        hidden_states: torch.Tensor,
-        timestep: torch.Tensor,
-        text_ids: torch.Tensor,
-        text_mask: torch.Tensor,
-        **kwargs: Any,
-    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
-        token = int(text_ids.reshape(-1)[0].item()) if text_ids.numel() else 0
-        sound_latents = kwargs.get("sound_latents")
-        self.calls.append(
-            {
-                "token": token,
-                "timestep": timestep.clone(),
-                "text_mask": text_mask.clone(),
-                "cache_before": self.cached_kv,
-                "kwargs": dict(kwargs),
-            }
-        )
-        if self.cached_kv is None:
-            marker = torch.tensor([token], dtype=torch.float32)
-            self.cached_kv = [(marker, marker + 100)]
-            self.cached_freqs_gen = (marker + 200, marker + 300)
-        outputs: list[torch.Tensor] = [torch.full_like(hidden_states, float(token))]
-        if sound_latents is not None:
-            outputs.append(torch.full_like(sound_latents, float(token + 10)))
-        return outputs[0] if len(outputs) == 1 else tuple(outputs)
-
-
-def passthrough_progress_bar(iterable):
-    return iterable
-
-
-@pytest.fixture(autouse=True)
-def fake_cosmos3_guardrails(monkeypatch: pytest.MonkeyPatch):
-    module = types.ModuleType("vllm_omni.diffusion.models.cosmos3.guardrails")
-    module.is_guardrails_enabled = lambda od_config, sampling_params=None: False
-    module.ensure_initialized = lambda od_config: None
-    module.check_text_safety = lambda text: None
-    module.check_video_safety = lambda video: video
-    monkeypatch.setitem(sys.modules, module.__name__, module)
-    return module
-
-
-@pytest.fixture
-def make_cosmos3_pipeline():
-    def _make():
-        from vllm_omni.diffusion.models.cosmos3.pipeline_cosmos3 import (
-            Cosmos3OmniDiffusersPipeline,
-        )
-
-        pipeline = object.__new__(Cosmos3OmniDiffusersPipeline)
-        nn.Module.__init__(pipeline)
-        pipeline.od_config = SimpleNamespace()
-        pipeline.device = torch.device("cpu")
-        pipeline.dtype = torch.float32
-        pipeline.transformer = StubCosmos3Transformer(latent_channel_size=2)
-        pipeline.vae = StubCosmos3VAE(z_dim=2)
-        pipeline.vae_scale_factor_temporal = 4
-        pipeline.vae_scale_factor_spatial = 8
-        pipeline.scheduler = StubScheduler([9, 3], flow_shift=1.0)
-        pipeline._base_scheduler_config = pipeline.scheduler.config
-        pipeline._engine_init_flow_shift = 1.0
-        pipeline._current_flow_shift = 1.0
-        pipeline._guidance_scale = None
-        pipeline._num_timesteps = None
-        pipeline.progress_bar = passthrough_progress_bar
-        pipeline._sound_tokenizer = None
-        return pipeline
-
-    return _make
-
-
-def make_sampling_params(**overrides: Any) -> SimpleNamespace:
-    values = {
-        "height": None,
-        "width": None,
-        "num_frames": None,
-        "num_inference_steps": None,
-        "guidance_scale": None,
-        "generator": None,
-        "seed": 123,
-        "num_outputs_per_prompt": 1,
-        "frame_rate": None,
-        "resolved_frame_rate": None,
-        "max_sequence_length": None,
-        "extra_args": {},
-    }
-    values.update(overrides)
-    return SimpleNamespace(**values)
diff --git a/tests/diffusion/models/cosmos3/test_cosmos3_pipeline.py b/tests/diffusion/models/cosmos3/test_cosmos3_pipeline.py
index b6116d9265d..3c042275341 100644
--- a/tests/diffusion/models/cosmos3/test_cosmos3_pipeline.py
+++ b/tests/diffusion/models/cosmos3/test_cosmos3_pipeline.py
@@ -78,9 +78,13 @@ def __init__(
         self,
         *,
         latent_channel_size: int = 2,
+        sound_gen: bool = False,
+        sound_dim: int = 3,
     ) -> None:
         super().__init__()
         self.latent_channel_size = latent_channel_size
+        self.sound_gen = sound_gen
+        self.sound_dim = sound_dim
         self.cached_kv: Any | None = None
         self.cached_freqs_gen: Any | None = None
         self.calls: list[dict[str, Any]] = []
@@ -99,8 +103,9 @@ def forward(
         text_ids: torch.Tensor,
         text_mask: torch.Tensor,
         **kwargs: Any,
-    ) -> torch.Tensor:
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         token = int(text_ids.reshape(-1)[0].item()) if text_ids.numel() else 0
+        sound_latents = kwargs.get("sound_latents")
         self.calls.append(
             {
                 "token": token,
@@ -114,7 +119,10 @@ def forward(
             marker = torch.tensor([token], dtype=torch.float32)
             self.cached_kv = [(marker, marker + 100)]
             self.cached_freqs_gen = (marker + 200, marker + 300)
-        return torch.full_like(hidden_states, float(token))
+        outputs: list[torch.Tensor] = [torch.full_like(hidden_states, float(token))]
+        if sound_latents is not None:
+            outputs.append(torch.full_like(sound_latents, float(token + 10)))
+        return outputs[0] if len(outputs) == 1 else tuple(outputs)
 
 
 def passthrough_progress_bar(iterable):
@@ -155,6 +163,7 @@ def _make():
         pipeline._guidance_scale = None
         pipeline._num_timesteps = None
         pipeline._cache_dit_requires_paired_cfg = False
+        pipeline._sound_tokenizer = None
         pipeline.progress_bar = passthrough_progress_bar
         return pipeline
 
@@ -235,7 +244,9 @@ def test_postprocess_handles_image_video_audio_and_validation() -> None:
 
     assert func(video, output_type="latent") is video
     assert func({"image": video})[0].size == (4, 4)
-    assert "video" in func({"video": video})
+    # Video-only postprocess returns the bare processed video (not a dict),
+    # matching the image/latent branches and peer audio-capable pipelines.
+    assert not isinstance(func({"video": video}), dict)
     assert (
         func(
             {"video": video, "audio": torch.ones(1, 2, 16), "audio_sample_rate": 48000},
diff --git a/tests/diffusion/models/cosmos3/test_cosmos3_transformer.py b/tests/diffusion/models/cosmos3/test_cosmos3_transformer.py
index 38db56e0c26..bd1a9588b7e 100644
--- a/tests/diffusion/models/cosmos3/test_cosmos3_transformer.py
+++ b/tests/diffusion/models/cosmos3/test_cosmos3_transformer.py
@@ -120,7 +120,6 @@ def test_forward_returns_video_prediction(monkeypatch: pytest.MonkeyPatch) -> No
         text_mask=torch.ones(1, 2, dtype=torch.long),
         video_shape=(1, 2, 2),
         fps=24.0,
-        sound_latents=torch.zeros(1, 3, 4),
     )
 
     assert tuple(output.shape) == (1, 2, 1, 2, 2)
@@ -161,10 +160,12 @@ def test_sound_pack_unpack_validate_shapes() -> None:
         model.pack_sound(torch.zeros(1, 4, 2))
 
 
-def test_forward_returns_video_and_sound_predictions() -> None:
-    from vllm_omni.diffusion.models.cosmos3.transformer_cosmos3 import Cosmos3VFMTransformer
+def test_forward_returns_video_and_sound_predictions(monkeypatch: pytest.MonkeyPatch) -> None:
+    from vllm_omni.diffusion.models.cosmos3 import transformer_cosmos3
 
-    output = Cosmos3VFMTransformer(
+    monkeypatch.setattr(transformer_cosmos3, "_get_ulysses_state", lambda: (1, 0, None))
+
+    output = transformer_cosmos3.Cosmos3VFMTransformer(
         SimpleNamespace(
             tf_model_config=_tiny_cosmos3_config(sound_gen=True, sound_dim=3, sound_latent_fps=24.0),
             dtype=torch.float32,
diff --git a/vllm_omni/diffusion/models/cosmos3/pipeline_cosmos3.py b/vllm_omni/diffusion/models/cosmos3/pipeline_cosmos3.py
index 543add3ac46..672f77715a2 100644
--- a/vllm_omni/diffusion/models/cosmos3/pipeline_cosmos3.py
+++ b/vllm_omni/diffusion/models/cosmos3/pipeline_cosmos3.py
@@ -232,13 +232,16 @@ def post_process_func(
             return video_processor.postprocess(image, output_type="pil")
         if is_guardrails_enabled(od_config, sampling_params):
             video = check_video_safety(video)
-        result = {"video": video_processor.postprocess_video(video, output_type=output_type)}
+        processed_video = video_processor.postprocess_video(video, output_type=output_type)
         if audio is None:
-            return result
+            return processed_video
         if isinstance(audio, torch.Tensor):
             audio = audio.detach().cpu()
-        result["audio"] = audio
-        result["fps"] = _resolve_output_fps(sampling_params)
+        result = {
+            "video": processed_video,
+            "audio": audio,
+            "fps": _resolve_output_fps(sampling_params),
+        }
         if audio_sample_rate is not None:
             result["audio_sample_rate"] = int(audio_sample_rate)
         return result
diff --git a/vllm_omni/diffusion/models/cosmos3/transformer_cosmos3.py b/vllm_omni/diffusion/models/cosmos3/transformer_cosmos3.py
index 49af3fde3d0..31fbf69d66d 100644
--- a/vllm_omni/diffusion/models/cosmos3/transformer_cosmos3.py
+++ b/vllm_omni/diffusion/models/cosmos3/transformer_cosmos3.py
@@ -984,8 +984,6 @@ def __init__(
             self.audio_proj_out = nn.Linear(self.hidden_size, self.sound_dim)
             self.audio_modality_embed = nn.Parameter(torch.zeros(self.hidden_size))
 
-        self.time_embedder = TimestepEmbedder(self.hidden_size, target_dtype=torch.bfloat16)
-
         self.gen_layers = nn.ModuleList(
             [
                 Cosmos3GenDecoderLayer(

From 6638fbc5e91a4ce1b7deda0fd362c61f696d7d7e Mon Sep 17 00:00:00 2001
From: Bartosz Stefaniak <bstefaniak@nvidia.com>
Date: Tue, 2 Jun 2026 16:08:46 +0000
Subject: [PATCH 03/11] Remove unused parameter

Signed-off-by: lishunyang12 <lishunyang12@163.com>
---
 vllm_omni/diffusion/models/cosmos3/transformer_cosmos3.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/vllm_omni/diffusion/models/cosmos3/transformer_cosmos3.py b/vllm_omni/diffusion/models/cosmos3/transformer_cosmos3.py
index 31fbf69d66d..e1810bd7103 100644
--- a/vllm_omni/diffusion/models/cosmos3/transformer_cosmos3.py
+++ b/vllm_omni/diffusion/models/cosmos3/transformer_cosmos3.py
@@ -190,10 +190,8 @@ def compute_mrope_position_ids_sound(
     base_fps: float = 24.0,
     temporal_compression_factor_sound: int = 1,
     enable_fps_modulation: bool = True,
-    base_temporal_compression_factor: int | None = None,
 ) -> tuple[torch.Tensor, int | float]:
     """Generate mRoPE IDs for sound tokens as a (T, 1, 1) grid."""
-    del base_temporal_compression_factor
     return compute_mrope_position_ids_vision(
         grid_t=grid_t,
         grid_h=1,

From 9b8b239f10a11972025edf4f4c7d6e3a07e47eb3 Mon Sep 17 00:00:00 2001
From: Bartosz Stefaniak <bstefaniak@nvidia.com>
Date: Tue, 2 Jun 2026 19:03:00 +0000
Subject: [PATCH 04/11] Add comment about packing modalities into a single tensor

Signed-off-by: lishunyang12 <lishunyang12@163.com>
---
 vllm_omni/diffusion/models/cosmos3/pipeline_cosmos3.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/vllm_omni/diffusion/models/cosmos3/pipeline_cosmos3.py b/vllm_omni/diffusion/models/cosmos3/pipeline_cosmos3.py
index 672f77715a2..90de6575eee 100644
--- a/vllm_omni/diffusion/models/cosmos3/pipeline_cosmos3.py
+++ b/vllm_omni/diffusion/models/cosmos3/pipeline_cosmos3.py
@@ -1111,6 +1111,9 @@ def _cfg_active_at(t: torch.Tensor) -> bool:
             lo, hi = guidance_interval
             return lo <= t_scalar <= hi
 
+        # Joint scheduler step over multiple modalities. Safe for flow-matching schedulers
+        # because the update is linear per element; revisit this if Cosmos3 adopts a
+        # scheduler with cross-element dependencies (e.g. per-modality timestep).
         def _pack_joint(
             video_tensor: torch.Tensor,
             sound_tensor: torch.Tensor | None = None,

From e82a83180838178a2ddc02015c8ecffb81981541 Mon Sep 17 00:00:00 2001
From: Bartosz Stefaniak <bstefaniak@nvidia.com>
Date: Tue, 2 Jun 2026 19:03:30 +0000
Subject: [PATCH 05/11] Enable sound generation only through "generate_sound",
 "sound_gen" flags

Signed-off-by: lishunyang12 <lishunyang12@163.com>
---
 vllm_omni/diffusion/models/cosmos3/pipeline_cosmos3.py | 10 +---------
 1 file changed, 1 insertion(+), 9 deletions(-)

diff --git a/vllm_omni/diffusion/models/cosmos3/pipeline_cosmos3.py b/vllm_omni/diffusion/models/cosmos3/pipeline_cosmos3.py
index 90de6575eee..33c05efbf95 100644
--- a/vllm_omni/diffusion/models/cosmos3/pipeline_cosmos3.py
+++ b/vllm_omni/diffusion/models/cosmos3/pipeline_cosmos3.py
@@ -595,15 +595,7 @@ def _get_prompt_param(cls, prompt_data, key: str, default=None):
 
     @classmethod
     def _is_sound_request(cls, prompt_data, sp) -> bool:
-        keys = (
-            "sound_gen",
-            "generate_sound",
-            "enable_sound_generation",
-            "return_audio",
-            "output_audio",
-            "generate_audio",
-        )
-        for key in keys:
+        for key in ("generate_sound", "sound_gen"):
             if cls._truthy(cls._get_prompt_param(prompt_data, key, None)):
                 return True
             if cls._truthy(cls._get_sp_param(sp, key, None)):

From 2ee73c6019513ace43ff996a95096d671d56802b Mon Sep 17 00:00:00 2001
From: Bartosz Stefaniak <bstefaniak@nvidia.com>
Date: Tue, 2 Jun 2026 19:17:50 +0000
Subject: [PATCH 06/11] Pass sound_dim/sound_latent_fps into transformer from
 initialized sound tokenizer

Signed-off-by: lishunyang12 <lishunyang12@163.com>
---
 .../models/cosmos3/test_cosmos3_pipeline.py   | 122 ++++++++++++++++++
 .../cosmos3/test_cosmos3_sound_tokenizer.py   |  14 +-
 .../cosmos3/test_cosmos3_transformer.py       |  43 ++++--
 .../models/cosmos3/pipeline_cosmos3.py        |  19 ++-
 .../models/cosmos3/sound_tokenizer.py         |  47 ++-----
 .../models/cosmos3/transformer_cosmos3.py     |  41 ++++--
 6 files changed, 215 insertions(+), 71 deletions(-)

diff --git a/tests/diffusion/models/cosmos3/test_cosmos3_pipeline.py b/tests/diffusion/models/cosmos3/test_cosmos3_pipeline.py
index 3c042275341..0e441766a97 100644
--- a/tests/diffusion/models/cosmos3/test_cosmos3_pipeline.py
+++ b/tests/diffusion/models/cosmos3/test_cosmos3_pipeline.py
@@ -73,6 +73,24 @@ def decode(self, latents: torch.Tensor, return_dict: bool = False):
         return (latents,)
 
 
+class StubCosmos3AVAE:
+    def __init__(self, **kwargs: Any) -> None:
+        self.kwargs = kwargs
+        self.sample_rate = int(kwargs["sample_rate"])
+        self.audio_channels = int(kwargs["audio_channels"])
+        self.latent_ch = int(kwargs["io_channels"])
+        self.temporal_compression_factor = int(kwargs["hop_size"])
+
+    def get_latent_num_samples(self, num_audio_samples: int) -> int:
+        return int(num_audio_samples) // self.temporal_compression_factor
+
+    def get_audio_num_samples(self, num_latent_samples: int) -> int:
+        return int(num_latent_samples) * self.temporal_compression_factor
+
+    def decode(self, latents: torch.Tensor) -> torch.Tensor:
+        return torch.zeros(latents.shape[0], self.audio_channels, 8)
+
+
 class StubCosmos3Transformer(nn.Module):
     def __init__(
         self,
@@ -80,11 +98,13 @@ def __init__(
         latent_channel_size: int = 2,
         sound_gen: bool = False,
         sound_dim: int = 3,
+        sound_latent_fps: float = 25.0,
     ) -> None:
         super().__init__()
         self.latent_channel_size = latent_channel_size
         self.sound_gen = sound_gen
         self.sound_dim = sound_dim
+        self.sound_latent_fps = sound_latent_fps
         self.cached_kv: Any | None = None
         self.cached_freqs_gen: Any | None = None
         self.calls: list[dict[str, Any]] = []
@@ -222,6 +242,108 @@ def test_pipeline_registered_and_exported() -> None:
     assert "Cosmos3OmniDiffusersPipeline" in cosmos3.__all__
 
 
+@pytest.fixture
+def stub_real_pipeline_init(monkeypatch: pytest.MonkeyPatch):
+    from vllm_omni.diffusion.models.cosmos3 import pipeline_cosmos3
+
+    class _StubAutoTokenizer:
+        @classmethod
+        def from_pretrained(cls, *args, **kwargs):
+            return SimpleNamespace()
+
+    class _StubDiffusersVAE:
+        config = SimpleNamespace(scale_factor_temporal=4, scale_factor_spatial=8)
+
+        @classmethod
+        def from_pretrained(cls, *args, **kwargs):
+            return cls()
+
+        def to(self, _device):
+            return self
+
+    class _StubDiffusersScheduler:
+        config = SimpleNamespace(flow_shift=1.0)
+
+        @classmethod
+        def from_pretrained(cls, *args, **kwargs):
+            return cls()
+
+    class _StubVideoProcessor:
+        def __init__(self, *args, **kwargs) -> None:
+            pass
+
+    monkeypatch.setattr(pipeline_cosmos3, "AutoTokenizer", _StubAutoTokenizer)
+    monkeypatch.setattr(pipeline_cosmos3, "DistributedAutoencoderKLWan", _StubDiffusersVAE)
+    monkeypatch.setattr(pipeline_cosmos3, "UniPCMultistepScheduler", _StubDiffusersScheduler)
+    monkeypatch.setattr(pipeline_cosmos3, "VideoProcessor", _StubVideoProcessor)
+    monkeypatch.setattr(pipeline_cosmos3, "get_local_device", lambda: torch.device("cpu"))
+
+
+def _make_od_config(*, sound_gen: bool) -> SimpleNamespace:
+    tf_model_config = {
+        "hidden_size": 8,
+        "num_hidden_layers": 0,
+        "num_attention_heads": 2,
+        "num_key_value_heads": 2,
+        "head_dim": 4,
+        "intermediate_size": 16,
+        "vocab_size": 32,
+        "latent_patch_size": 1,
+        "latent_channel": 2,
+        "rope_scaling": {"mrope_section": [1, 1, 0]},
+    }
+    if sound_gen:
+        tf_model_config["sound_gen"] = True
+    return SimpleNamespace(
+        enable_cpu_offload=False,
+        enable_diffusion_pipeline_profiler=False,
+        model="/nonexistent/model/path",
+        dtype=torch.float32,
+        flow_shift=None,
+        quantization_config=None,
+        custom_pipeline_args={},
+        model_config={},
+        tf_model_config=tf_model_config,
+    )
+
+
+def test_pipeline_init_skips_tokenizer_when_sound_disabled(stub_real_pipeline_init) -> None:
+    from vllm_omni.diffusion.models.cosmos3.pipeline_cosmos3 import Cosmos3OmniDiffusersPipeline
+
+    pipeline = Cosmos3OmniDiffusersPipeline(od_config=_make_od_config(sound_gen=False))
+
+    assert pipeline._sound_tokenizer is None
+    assert pipeline.transformer.sound_gen is False
+    assert not hasattr(pipeline.transformer, "audio_proj_in")
+    assert not hasattr(pipeline.transformer, "audio_proj_out")
+
+
+def test_pipeline_init_passes_tokenizer_attrs_into_transformer(
+    stub_real_pipeline_init,
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    from vllm_omni.diffusion.models.cosmos3 import sound_tokenizer
+    from vllm_omni.diffusion.models.cosmos3.pipeline_cosmos3 import Cosmos3OmniDiffusersPipeline
+
+    stub_tokenizer = sound_tokenizer.Cosmos3SoundTokenizer(
+        StubCosmos3AVAE(sample_rate=32000, audio_channels=2, io_channels=5, hop_size=800)
+    )
+    monkeypatch.setattr(
+        sound_tokenizer.Cosmos3SoundTokenizer,
+        "from_config",
+        classmethod(lambda cls, od_config: stub_tokenizer),
+    )
+
+    pipeline = Cosmos3OmniDiffusersPipeline(od_config=_make_od_config(sound_gen=True))
+
+    assert pipeline._sound_tokenizer is stub_tokenizer
+    assert pipeline.transformer.sound_gen is True
+    assert pipeline.transformer.sound_dim == pipeline._sound_tokenizer.latent_ch == 5
+    assert pipeline.transformer.sound_latent_fps == pipeline._sound_tokenizer.latent_fps == 40.0
+    assert pipeline.transformer.audio_proj_in.in_features == 5
+    assert pipeline.transformer.audio_proj_out.out_features == 5
+
+
 def test_preprocess_i2v_image_input() -> None:
     from vllm_omni.diffusion.models.cosmos3.pipeline_cosmos3 import get_cosmos3_pre_process_func
 
diff --git a/tests/diffusion/models/cosmos3/test_cosmos3_sound_tokenizer.py b/tests/diffusion/models/cosmos3/test_cosmos3_sound_tokenizer.py
index 47664c59e77..7ab04cc212f 100644
--- a/tests/diffusion/models/cosmos3/test_cosmos3_sound_tokenizer.py
+++ b/tests/diffusion/models/cosmos3/test_cosmos3_sound_tokenizer.py
@@ -72,7 +72,12 @@ def test_from_config_loads_local_diffusers_component(tmp_path, monkeypatch: pyte
 
     assert created["checkpoint_path"] == str(tokenizer_dir / DIFFUSERS_SOUND_TOKENIZER_CHECKPOINT_NAME)
     assert created["config_path"] == str(tokenizer_dir / "config.json")
-    assert (tokenizer.sample_rate, tokenizer.latent_ch, tokenizer.hop_size) == (32000, 3, 800)
+    assert (tokenizer.sample_rate, tokenizer.latent_ch, tokenizer.hop_size, tokenizer.latent_fps) == (
+        32000,
+        3,
+        800,
+        40.0,
+    )
 
 
 def test_from_config_downloads_component_from_hf_repo(tmp_path, monkeypatch: pytest.MonkeyPatch) -> None:
@@ -177,7 +182,12 @@ def test_component_config_precedence_and_conflict_detection(tmp_path, monkeypatc
         "tanh",
         2.0,
     )
-    assert (tokenizer.sample_rate, tokenizer.latent_ch, tokenizer.hop_size) == (48000, 64, 1920)
+    assert (tokenizer.sample_rate, tokenizer.latent_ch, tokenizer.hop_size, tokenizer.latent_fps) == (
+        48000,
+        64,
+        1920,
+        25.0,
+    )
 
     with pytest.raises(ValueError, match=r"sample_rate.*48000.*32000"):
         sound_tokenizer.Cosmos3SoundTokenizer.from_config(
diff --git a/tests/diffusion/models/cosmos3/test_cosmos3_transformer.py b/tests/diffusion/models/cosmos3/test_cosmos3_transformer.py
index bd1a9588b7e..6878b6b96ed 100644
--- a/tests/diffusion/models/cosmos3/test_cosmos3_transformer.py
+++ b/tests/diffusion/models/cosmos3/test_cosmos3_transformer.py
@@ -4,7 +4,7 @@
 from __future__ import annotations
 
 from types import SimpleNamespace
-
+from typing import Any
 import pytest
 import torch
 from torch import nn
@@ -125,18 +125,16 @@ def test_forward_returns_video_prediction(monkeypatch: pytest.MonkeyPatch) -> No
     assert tuple(output.shape) == (1, 2, 1, 2, 2)
 
 
-def test_sound_modules_follow_config() -> None:
+def test_sound_modules_follow_injected_sound_dim() -> None:
     from vllm_omni.diffusion.models.cosmos3.transformer_cosmos3 import Cosmos3VFMTransformer
 
     tiny = _tiny_cosmos3_config()
     no_modal = Cosmos3VFMTransformer(SimpleNamespace(tf_model_config=tiny, dtype=torch.float32))
     with_sound = Cosmos3VFMTransformer(
-        SimpleNamespace(
-            tf_model_config={**tiny, "sound_gen": True},
-            model_config={"sound_tokenizer": {"io_channels": 5, "sample_rate": 32000, "hop_size": 800}},
-            custom_pipeline_args={},
-            dtype=torch.float32,
-        )
+        SimpleNamespace(tf_model_config=tiny, dtype=torch.float32),
+        sound_gen=True,
+        sound_dim=5,
+        sound_latent_fps=40.0,
     )
 
     assert no_modal.sound_gen is False
@@ -146,6 +144,23 @@ def test_sound_modules_follow_config() -> None:
     assert with_sound.audio_proj_in.in_features == 5
 
 
+@pytest.mark.parametrize(
+    "kwargs",
+    [
+        {"sound_gen": True},
+        {"sound_gen": True, "sound_dim": 5},
+        {"sound_gen": True, "sound_latent_fps": 40.0},
+    ],
+)
+def test_transformer_requires_sound_dim_and_fps_when_sound_gen_true(kwargs: dict[str, Any]) -> None:
+    from vllm_omni.diffusion.models.cosmos3.transformer_cosmos3 import Cosmos3VFMTransformer
+    with pytest.raises(ValueError, match=r"requires an explicit sound_dim and sound_latent_fps"):
+        Cosmos3VFMTransformer(
+            SimpleNamespace(tf_model_config=_tiny_cosmos3_config(), dtype=torch.float32),
+            **kwargs,
+        )
+
+
 def test_sound_pack_unpack_validate_shapes() -> None:
     from vllm_omni.diffusion.models.cosmos3.transformer_cosmos3 import Cosmos3VFMTransformer
 
@@ -167,9 +182,12 @@ def test_forward_returns_video_and_sound_predictions(monkeypatch: pytest.MonkeyP
 
     output = transformer_cosmos3.Cosmos3VFMTransformer(
         SimpleNamespace(
-            tf_model_config=_tiny_cosmos3_config(sound_gen=True, sound_dim=3, sound_latent_fps=24.0),
+            tf_model_config=_tiny_cosmos3_config(),
             dtype=torch.float32,
-        )
+        ),
+        sound_gen=True,
+        sound_dim=3,
+        sound_latent_fps=40.0,
     )(
         hidden_states=torch.zeros(1, 2, 1, 2, 2),
         timestep=torch.tensor([1.0]),
@@ -188,7 +206,10 @@ def test_forward_with_sound_ulysses_error_mentions_combined_sequence(monkeypatch
     import vllm_omni.diffusion.models.cosmos3.transformer_cosmos3 as cosmos3_module
 
     model = cosmos3_module.Cosmos3VFMTransformer(
-        SimpleNamespace(tf_model_config=_tiny_cosmos3_config(sound_gen=True, sound_dim=3), dtype=torch.float32)
+        SimpleNamespace(tf_model_config=_tiny_cosmos3_config(), dtype=torch.float32),
+        sound_gen=True,
+        sound_dim=3,
+        sound_latent_fps=40.0,
     )
     monkeypatch.setattr(cosmos3_module, "_get_ulysses_state", lambda: (2, 0, None))
 
diff --git a/vllm_omni/diffusion/models/cosmos3/pipeline_cosmos3.py b/vllm_omni/diffusion/models/cosmos3/pipeline_cosmos3.py
index 33c05efbf95..5290e21204e 100644
--- a/vllm_omni/diffusion/models/cosmos3/pipeline_cosmos3.py
+++ b/vllm_omni/diffusion/models/cosmos3/pipeline_cosmos3.py
@@ -46,7 +46,7 @@
 from vllm_omni.diffusion.request import OmniDiffusionRequest
 from vllm_omni.inputs.data import OmniDiffusionSamplingParams
 
-from .transformer_cosmos3 import Cosmos3VFMTransformer
+from .transformer_cosmos3 import Cosmos3VFMTransformer, resolve_sound_gen
 
 logger = init_logger(__name__)
 
@@ -324,10 +324,22 @@ def __init__(
         self.vae_scale_factor_temporal = int(self.vae.config.scale_factor_temporal)
         self.vae_scale_factor_spatial = getattr(self.vae.config, "scale_factor_spatial", 16)
 
+        sound_gen = resolve_sound_gen(od_config)
+        sound_dim = None
+        sound_latent_fps = None
+        self._sound_tokenizer = None
+        if sound_gen:
+            self._sound_tokenizer = self._get_sound_tokenizer()
+            sound_dim = self._sound_tokenizer.latent_ch
+            sound_latent_fps = self._sound_tokenizer.latent_fps
+
         # --- Transformer (weights loaded later via weights_sources) ---
         self.transformer = Cosmos3VFMTransformer(
             od_config=od_config,
             temporal_compression_factor=self.vae_scale_factor_temporal,
+            sound_gen=sound_gen,
+            sound_dim=sound_dim,
+            sound_latent_fps=sound_latent_fps,
         )
 
         # --- Scheduler ---
@@ -367,9 +379,6 @@ def __init__(
 
         self._guidance_scale = None
         self._num_timesteps = None
-        self._sound_tokenizer = None
-        if getattr(self.transformer, "sound_gen", False):
-            self._get_sound_tokenizer()
 
         # Set True by ``enable_cache_for_cosmos3`` when cache-dit is enabled on
         # this pipeline. Tells the sequential-CFG loop to keep paired
@@ -603,8 +612,6 @@ def _is_sound_request(cls, prompt_data, sp) -> bool:
         return False
 
     def _get_sound_tokenizer(self):
-        if not hasattr(self, "_sound_tokenizer"):
-            self._sound_tokenizer = None
         if self._sound_tokenizer is None:
             from .sound_tokenizer import Cosmos3SoundTokenizer
 
diff --git a/vllm_omni/diffusion/models/cosmos3/sound_tokenizer.py b/vllm_omni/diffusion/models/cosmos3/sound_tokenizer.py
index 281b7e1d9f0..4e2d6f7ee76 100644
--- a/vllm_omni/diffusion/models/cosmos3/sound_tokenizer.py
+++ b/vllm_omni/diffusion/models/cosmos3/sound_tokenizer.py
@@ -289,23 +289,6 @@ def get_sound_channels(od_config: OmniDiffusionConfig) -> int:
     )
 
 
-def get_sound_dim(od_config: OmniDiffusionConfig | None) -> int:
-    if od_config is None:
-        return DEFAULT_SOUND_DIM
-    args = _pipeline_args(od_config)
-    custom_value = _custom_arg_value(args, ("sound_dim", "io_channels", "latent_ch"))
-    if custom_value is not None:
-        return int(custom_value)
-    top_value = _top_level_model_value(od_config, ("sound_dim",))
-    if top_value is not None:
-        return int(top_value)
-    nested_value = _first_value_from_configs(
-        _nested_sound_tokenizer_configs(od_config),
-        ("io_channels", "vocoder_input_dim", "latent_ch"),
-    )
-    return int(DEFAULT_SOUND_DIM if nested_value is None else nested_value)
-
-
 def get_sound_hop_size(od_config: OmniDiffusionConfig) -> int:
     args = _pipeline_args(od_config)
     return _resolve_arch_value(
@@ -321,27 +304,6 @@ def get_sound_hop_size(od_config: OmniDiffusionConfig) -> int:
     )
 
 
-def get_sound_latent_fps(od_config: OmniDiffusionConfig | None) -> float:
-    if od_config is None:
-        return DEFAULT_SOUND_LATENT_FPS
-    args = _pipeline_args(od_config)
-    custom_value = _custom_arg_value(args, ("sound_latent_fps",))
-    if custom_value is not None:
-        return float(custom_value)
-    top_value = _top_level_model_value(od_config, ("sound_latent_fps",))
-    if top_value is not None:
-        return float(top_value)
-    nested_configs = _nested_sound_tokenizer_configs(od_config)
-    nested_fps = _first_value_from_configs(nested_configs, ("sound_latent_fps", "latent_fps"))
-    if nested_fps is not None:
-        return float(nested_fps)
-    sample_rate = _first_value_from_configs(nested_configs, ("sample_rate", "sampling_rate"))
-    hop_size = _first_value_from_configs(nested_configs, ("hop_size",))
-    if sample_rate is not None and hop_size is not None:
-        return float(sample_rate) / float(hop_size)
-    return float(DEFAULT_SOUND_LATENT_FPS)
-
-
 class Cosmos3SoundTokenizer:
     """Thin adapter around the local AVAE tokenizer implementation."""
 
@@ -351,6 +313,11 @@ def __init__(self, tokenizer: Any) -> None:
         self.audio_channels = int(getattr(tokenizer, "audio_channels", DEFAULT_SOUND_CHANNELS))
         self.latent_ch = int(getattr(tokenizer, "latent_ch", DEFAULT_SOUND_DIM))
         self.hop_size = int(getattr(tokenizer, "temporal_compression_factor", DEFAULT_SOUND_HOP_SIZE))
+        if self.hop_size <= 0:
+            raise ValueError(
+                f"Cosmos3 sound tokenizer hop_size must be positive, got {self.hop_size}."
+            )
+        self.latent_fps = float(self.sample_rate) / float(self.hop_size)
 
     @classmethod
     def from_config(cls, od_config: OmniDiffusionConfig) -> Cosmos3SoundTokenizer:
@@ -503,12 +470,14 @@ def from_config(cls, od_config: OmniDiffusionConfig) -> Cosmos3SoundTokenizer:
         )
         if _is_rank_zero():
             logger.info(
-                "Loaded Cosmos3 AVAE sound tokenizer from %s (sr=%d, channels=%d, latent_ch=%d, hop=%d)",
+                "Loaded Cosmos3 AVAE sound tokenizer from %s "
+                "(sr=%d, channels=%d, latent_ch=%d, hop=%d, latent_fps=%.3f)",
                 avae_path,
                 sample_rate,
                 audio_channels,
                 sound_dim,
                 hop_size,
+                float(sample_rate) / float(hop_size),
             )
         return cls(tokenizer)
 
diff --git a/vllm_omni/diffusion/models/cosmos3/transformer_cosmos3.py b/vllm_omni/diffusion/models/cosmos3/transformer_cosmos3.py
index e1810bd7103..5ff2683fdda 100644
--- a/vllm_omni/diffusion/models/cosmos3/transformer_cosmos3.py
+++ b/vllm_omni/diffusion/models/cosmos3/transformer_cosmos3.py
@@ -121,6 +121,21 @@ def _as_bool(value: Any) -> bool:
     return bool(value)
 
 
+def resolve_sound_gen(od_config: Any) -> bool:
+    """Capability gate shared by the pipeline and transformer.
+
+    Explicit ``sound_gen`` flag wins (including an explicit False);
+    otherwise infer from the presence of any sound-width key in od_config.
+    """
+    sound_gen_value = _od_config_get(od_config, "sound_gen", None)
+    if sound_gen_value is not None:
+        return _as_bool(sound_gen_value)
+    for key in ("sound_dim", "io_channels", "vocoder_input_dim", "latent_ch"):
+        if _od_config_get(od_config, key, None) is not None:
+            return True
+    return False
+
+
 # ---------------------------------------------------------------------------
 # Rotary Position Embeddings (mRoPE)
 # ---------------------------------------------------------------------------
@@ -906,6 +921,9 @@ def __init__(
         od_config: OmniDiffusionConfig,
         *,
         temporal_compression_factor: int | None = None,
+        sound_gen: bool = False,
+        sound_dim: int | None = None,
+        sound_latent_fps: float | None = None,
     ) -> None:
         super().__init__()
         model_config = od_config.tf_model_config
@@ -926,19 +944,16 @@ def __init__(
         self.latent_channel_size = int(_tf_config_get(model_config, "latent_channel", 48))
         self.timestep_scale = float(_tf_config_get(model_config, "timestep_scale", 0.001))
         self.base_fps = float(_tf_config_get(model_config, "base_fps", 24.0))
-        sound_gen_value = _od_config_get(od_config, "sound_gen", None)
-        sound_dim_value = _od_config_get(od_config, "sound_dim", None)
-        if sound_dim_value is None:
-            sound_dim_value = _od_config_get(od_config, "io_channels", None)
-        if sound_dim_value is None:
-            sound_dim_value = _od_config_get(od_config, "vocoder_input_dim", None)
-        if sound_dim_value is None:
-            sound_dim_value = _od_config_get(od_config, "latent_ch", None)
-        self.sound_gen = _as_bool(sound_gen_value) if sound_gen_value is not None else sound_dim_value is not None
-        from .sound_tokenizer import get_sound_dim, get_sound_latent_fps
-
-        self.sound_dim = int(sound_dim_value if sound_dim_value is not None else get_sound_dim(od_config))
-        self.sound_latent_fps = float(get_sound_latent_fps(od_config))
+        self.sound_gen = sound_gen
+        self.sound_dim = sound_dim
+        self.sound_latent_fps = sound_latent_fps
+
+        if self.sound_gen and (sound_dim is None or sound_latent_fps is None):
+            raise ValueError(
+                "Cosmos3VFMTransformer requires an explicit sound_dim and sound_latent_fps when sound_gen is True; "
+                "the pipeline must pass Cosmos3SoundTokenizer.latent_ch so the audio projection "
+                "layers are sized from the authoritative AVAE latent width."
+            )
         if temporal_compression_factor is None:
             temporal_compression_factor = _tf_config_get(model_config, "temporal_compression_factor", 4)
         self.temporal_compression_factor = int(temporal_compression_factor)

From 04ffce45365e12d9726fcb74904040ab9352372f Mon Sep 17 00:00:00 2001
From: Bartosz Stefaniak <bstefaniak@nvidia.com>
Date: Tue, 2 Jun 2026 19:17:58 +0000
Subject: [PATCH 07/11] Update recipes

Signed-off-by: lishunyang12 <lishunyang12@163.com>
---
 recipes/README.md              |  2 +-
 recipes/nvidia/Cosmos3-Nano.md | 20 ++++++++++++++++++++
 2 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/recipes/README.md b/recipes/README.md
index 48e9d0a80e7..416ed77fe93 100644
--- a/recipes/README.md
+++ b/recipes/README.md
@@ -36,7 +36,7 @@ recipes/
 | [`LTX/LTX-2.md`](./LTX/LTX-2.md) | Text-to-video and image-to-video serving | 1x H200 141GB |
 | [`LTX/LTX-2.3.md`](./LTX/LTX-2.3.md) | Text-to-video with audio generation (22B) | 1x GPU (96GB VRAM) |
 | [`mistralai/Voxtral-TTS.md`](./mistralai/Voxtral-TTS.md) | Online serving for TTS | 1x RTX 4090 24GB |
-| [`nvidia/Cosmos3-Nano.md`](./nvidia/Cosmos3-Nano.md) | Text-to-image, text-to-video, and image-to-video generation | 1x H200 141GB / B300 |
+| [`nvidia/Cosmos3-Nano.md`](./nvidia/Cosmos3-Nano.md) | Text-to-image, text-to-video, image-to-video generation, text to video with sound  | 1x H200 141GB / B300 |
 | [`OpenBMB/MiniCPM-o-4_5.md`](./OpenBMB/MiniCPM-o-4_5.md) | Online serving for omni multimodal chat (text / image / audio / video → text + 24 kHz speech) | 2x A100/H100 80GB / 3x mid-tier GPU / 8x RTX 4090 24GB |
 | [`OpenBMB/VoxCPM2.md`](./OpenBMB/VoxCPM2.md) | Online + offline TTS with native AR pipeline (48 kHz, 30+ languages) | 1x RTX 4090 24GB |
 | [`Qwen/Qwen-Image.md`](./Qwen/Qwen-Image.md) | Text-to-image serving with step-wise continuous batching replay and ModelOpt mixed FP8/NVFP4 | 1x A100 80GB / 2x B200 |
diff --git a/recipes/nvidia/Cosmos3-Nano.md b/recipes/nvidia/Cosmos3-Nano.md
index 5d5e524da58..8113bfb7081 100644
--- a/recipes/nvidia/Cosmos3-Nano.md
+++ b/recipes/nvidia/Cosmos3-Nano.md
@@ -20,6 +20,7 @@ the mode is selected per request:
 - **T2V** — `POST /v1/videos/sync` with `num_frames > 1` and no reference image.
 - **I2V** — `POST /v1/videos/sync` with a reference image (`input_reference` file
   upload, or `image_reference` JSON).
+- **T2VS** — `POST /v1/videos/sync` with `num_frames > 1`, no reference image and `generate_sound=true`.
 
 ## References
 
@@ -116,6 +117,25 @@ curl -sS -X POST http://localhost:8000/v1/videos/sync \
   -F "seed=1111" \
   -F "input_reference=@/path/to/reference.jpg;type=image/jpeg" \
   -o cosmos3_i2v.mp4
+
+
+# Text-to-video-with-sound
+curl -sS -X POST http://localhost:8000/v1/videos/sync \
+  -H "Accept: video/mp4" \
+  -F "prompt=The video opens with a view of a well-lit indoor fruit display. A robotic arm picks up a pear, an orange, and a carambola one by one, placing each into a plastic bag in a shopping cart with red handles. The video is 7.875 seconds long, 24 FPS, and 1280x720. Audio description: soft servo whirs, gentle fruit thuds, plastic bag rustling, and a faint refrigeration hum." \
+  -F "negative_prompt=blurry, distorted, low quality" \
+  -F "size=1280x720" \
+  -F "num_frames=189" \
+  -F "fps=24" \
+  -F "num_inference_steps=35" \
+  -F "guidance_scale=6.0" \
+  -F "max_sequence_length=4096" \
+  -F "flow_shift=10.0" \
+  -F "seed=42" \
+  -F "generate_sound=true" \
+  -F "sound_duration=7.875" \
+  -F 'extra_params={"use_resolution_template":false,"use_duration_template":false,"guardrails":true}' \
+  -o cosmos3_t2v_with_sound.mp4
 ```
 
 #### Notes

From a0a98683b955f9d30deb47ec80f9b31713532acf Mon Sep 17 00:00:00 2001
From: Bartosz Stefaniak <bstefaniak@nvidia.com>
Date: Tue, 2 Jun 2026 19:19:43 +0000
Subject: [PATCH 08/11] lint

Signed-off-by: Bartosz Stefaniak <bstefaniak@nvidia.com>
Signed-off-by: lishunyang12 <lishunyang12@163.com>
---
 recipes/nvidia/Cosmos3-Nano.md                             | 2 +-
 tests/diffusion/models/cosmos3/test_cosmos3_transformer.py | 2 ++
 vllm_omni/diffusion/models/cosmos3/sound_tokenizer.py      | 4 +---
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/recipes/nvidia/Cosmos3-Nano.md b/recipes/nvidia/Cosmos3-Nano.md
index 8113bfb7081..1698fa14f61 100644
--- a/recipes/nvidia/Cosmos3-Nano.md
+++ b/recipes/nvidia/Cosmos3-Nano.md
@@ -131,7 +131,7 @@ curl -sS -X POST http://localhost:8000/v1/videos/sync \
   -F "guidance_scale=6.0" \
   -F "max_sequence_length=4096" \
   -F "flow_shift=10.0" \
-  -F "seed=42" \
+  -F "seed=0" \
   -F "generate_sound=true" \
   -F "sound_duration=7.875" \
   -F 'extra_params={"use_resolution_template":false,"use_duration_template":false,"guardrails":true}' \
diff --git a/tests/diffusion/models/cosmos3/test_cosmos3_transformer.py b/tests/diffusion/models/cosmos3/test_cosmos3_transformer.py
index 6878b6b96ed..062cd8abf98 100644
--- a/tests/diffusion/models/cosmos3/test_cosmos3_transformer.py
+++ b/tests/diffusion/models/cosmos3/test_cosmos3_transformer.py
@@ -5,6 +5,7 @@
 
 from types import SimpleNamespace
 from typing import Any
+
 import pytest
 import torch
 from torch import nn
@@ -154,6 +155,7 @@ def test_sound_modules_follow_injected_sound_dim() -> None:
 )
 def test_transformer_requires_sound_dim_and_fps_when_sound_gen_true(kwargs: dict[str, Any]) -> None:
     from vllm_omni.diffusion.models.cosmos3.transformer_cosmos3 import Cosmos3VFMTransformer
+
     with pytest.raises(ValueError, match=r"requires an explicit sound_dim and sound_latent_fps"):
         Cosmos3VFMTransformer(
             SimpleNamespace(tf_model_config=_tiny_cosmos3_config(), dtype=torch.float32),
diff --git a/vllm_omni/diffusion/models/cosmos3/sound_tokenizer.py b/vllm_omni/diffusion/models/cosmos3/sound_tokenizer.py
index 4e2d6f7ee76..66937b33c6b 100644
--- a/vllm_omni/diffusion/models/cosmos3/sound_tokenizer.py
+++ b/vllm_omni/diffusion/models/cosmos3/sound_tokenizer.py
@@ -314,9 +314,7 @@ def __init__(self, tokenizer: Any) -> None:
         self.latent_ch = int(getattr(tokenizer, "latent_ch", DEFAULT_SOUND_DIM))
         self.hop_size = int(getattr(tokenizer, "temporal_compression_factor", DEFAULT_SOUND_HOP_SIZE))
         if self.hop_size <= 0:
-            raise ValueError(
-                f"Cosmos3 sound tokenizer hop_size must be positive, got {self.hop_size}."
-            )
+            raise ValueError(f"Cosmos3 sound tokenizer hop_size must be positive, got {self.hop_size}.")
         self.latent_fps = float(self.sample_rate) / float(self.hop_size)
 
     @classmethod

From 8c340fe5dda614aec9d12702ccdb704962b074ce Mon Sep 17 00:00:00 2001
From: lishunyang12 <lishunyang12@163.com>
Date: Tue, 2 Jun 2026 19:09:50 +0000
Subject: [PATCH 09/11] add video+sound usage to Cosmos3-Nano recipe

Signed-off-by: lishunyang12 <lishunyang12@163.com>
---
 recipes/nvidia/Cosmos3-Nano.md | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/recipes/nvidia/Cosmos3-Nano.md b/recipes/nvidia/Cosmos3-Nano.md
index 1698fa14f61..41df35e2883 100644
--- a/recipes/nvidia/Cosmos3-Nano.md
+++ b/recipes/nvidia/Cosmos3-Nano.md
@@ -6,7 +6,7 @@
 
 - Vendor: NVIDIA
 - Model: `nvidia/Cosmos3-Nano`
-- Task: Text-to-image (T2I), text-to-video (T2V), and image-to-video (I2V) generation
+- Task: Text-to-image (T2I), text-to-video (T2V), and image-to-video (I2V) generation, with optional synchronized audio (video + sound)
 - Mode: Online serving with the OpenAI-compatible image/video APIs, plus offline generation via the `Omni` API
 - Maintainer: Community
 
@@ -20,13 +20,17 @@ the mode is selected per request:
 - **T2V** — `POST /v1/videos/sync` with `num_frames > 1` and no reference image.
 - **I2V** — `POST /v1/videos/sync` with a reference image (`input_reference` file
   upload, or `image_reference` JSON).
-- **T2VS** — `POST /v1/videos/sync` with `num_frames > 1`, no reference image and `generate_sound=true`.
+- **T2VS / I2VS** — add `generate_sound=true` (and optional `sound_duration`) to a
+  T2V/I2V `/v1/videos/sync` request to also generate synchronized audio, muxed into
+  the mp4 as AAC 48 kHz stereo. See the official model card's "Video + Audio" examples.
 
 ## References
 
 - Model card (authoritative usage + example assets): <https://huggingface.co/nvidia/Cosmos3-Nano>
 - Example inputs/outputs live in the repo's `assets/` (`example_t2v_prompt.json`,
-  `example_i2v_prompt.json`, `example_i2v_input.jpg`, `negative_prompt.json`).
+  `example_i2v_prompt.json`, `example_i2v_input.jpg`, `negative_prompt.json`;
+  audio examples: `example_t2vs_prompt.json`, `example_t2vs_output.mp4`,
+  `example_i2vs_output.mp4`).
 - Prompt upsampling (recommended for quality): the model expects JSON-upsampled
   structured prompts; see NVIDIA's `cosmos-framework` prompt-upsampling docs.
 - Pipeline: [`vllm_omni/diffusion/models/cosmos3/pipeline_cosmos3.py`](../../vllm_omni/diffusion/models/cosmos3/pipeline_cosmos3.py)
@@ -163,8 +167,8 @@ curl -sS -X POST http://localhost:8000/v1/videos/sync \
     the server fails at pipeline build with a gated-repo / safety-checker error.
   - A guardrail-blocked prompt currently returns HTTP 500
     (`"Guardrail blocked prompt"`).
-  - Video + audio, and action (policy / forward- / inverse-dynamics) modalities
-    are not part of this integration yet.
+  - Action (policy / forward- / inverse-dynamics) modalities are not part of
+    this integration yet.
 
 ### 1x GPU (Offline generation)
 

From 96243ef11ae2cbde89e8a96e681b3ba0645a40a4 Mon Sep 17 00:00:00 2001
From: lishunyang12 <lishunyang12@163.com>
Date: Tue, 2 Jun 2026 20:22:03 +0000
Subject: [PATCH 10/11] add Cosmos3-Super recipe

Signed-off-by: lishunyang12 <lishunyang12@163.com>
---
 recipes/README.md               |  1 +
 recipes/nvidia/Cosmos3-Super.md | 87 +++++++++++++++++++++++++++++++++
 2 files changed, 88 insertions(+)
 create mode 100644 recipes/nvidia/Cosmos3-Super.md

diff --git a/recipes/README.md b/recipes/README.md
index 416ed77fe93..161bcdd5edc 100644
--- a/recipes/README.md
+++ b/recipes/README.md
@@ -37,6 +37,7 @@ recipes/
 | [`LTX/LTX-2.3.md`](./LTX/LTX-2.3.md) | Text-to-video with audio generation (22B) | 1x GPU (96GB VRAM) |
 | [`mistralai/Voxtral-TTS.md`](./mistralai/Voxtral-TTS.md) | Online serving for TTS | 1x RTX 4090 24GB |
 | [`nvidia/Cosmos3-Nano.md`](./nvidia/Cosmos3-Nano.md) | Text-to-image, text-to-video, image-to-video generation, text to video with sound  | 1x H200 141GB / B300 |
+| [`nvidia/Cosmos3-Super.md`](./nvidia/Cosmos3-Super.md) | 64B T2I / T2V / I2V generation (+ optional audio) | 8x H200/H100/A100 (recommended) / 2x H200 or B300 (minimum) |
 | [`OpenBMB/MiniCPM-o-4_5.md`](./OpenBMB/MiniCPM-o-4_5.md) | Online serving for omni multimodal chat (text / image / audio / video → text + 24 kHz speech) | 2x A100/H100 80GB / 3x mid-tier GPU / 8x RTX 4090 24GB |
 | [`OpenBMB/VoxCPM2.md`](./OpenBMB/VoxCPM2.md) | Online + offline TTS with native AR pipeline (48 kHz, 30+ languages) | 1x RTX 4090 24GB |
 | [`Qwen/Qwen-Image.md`](./Qwen/Qwen-Image.md) | Text-to-image serving with step-wise continuous batching replay and ModelOpt mixed FP8/NVFP4 | 1x A100 80GB / 2x B200 |
diff --git a/recipes/nvidia/Cosmos3-Super.md b/recipes/nvidia/Cosmos3-Super.md
new file mode 100644
index 00000000000..33d3ec093b9
--- /dev/null
+++ b/recipes/nvidia/Cosmos3-Super.md
@@ -0,0 +1,87 @@
+# Cosmos3-Super
+
+> Frontier 64B world model: text-to-image, text-to-video, image-to-video (+ optional audio)
+
+## Summary
+
+- Vendor: NVIDIA
+- Model: `nvidia/Cosmos3-Super` (64B; also `Cosmos3-Super-Text2Image`, `Cosmos3-Super-Image2Video`)
+- Task: T2I, T2V, I2V generation, with optional synchronized audio (video + sound)
+- Mode: Online serving with the OpenAI-compatible image/video APIs
+- Maintainer: Community
+
+## When to use this recipe
+
+Use this recipe to deploy the 64B `nvidia/Cosmos3-Super` for the highest-quality
+Cosmos3 generation. It shares the same `Cosmos3OmniDiffusersPipeline` and request
+formats as [Cosmos3-Nano](./Cosmos3-Nano.md) — only the checkpoint size and the
+recommended parallelism differ. Mode is selected per request (T2I →
+`/v1/images/generations`; T2V/I2V → `/v1/videos/sync`; add `generate_sound=true`
+for audio).
+
+## References
+
+- Model card (authoritative usage + example assets): <https://huggingface.co/nvidia/Cosmos3-Super>
+- Nano recipe (same APIs/params): [`Cosmos3-Nano.md`](./Cosmos3-Nano.md)
+- Pipeline: [`vllm_omni/diffusion/models/cosmos3/pipeline_cosmos3.py`](../../vllm_omni/diffusion/models/cosmos3/pipeline_cosmos3.py)
+
+## Hardware Support
+
+## GPU
+
+### 8x H200/H100/A100 (recommended, per model card)
+
+```bash
+vllm serve nvidia/Cosmos3-Super \
+  --omni \
+  --host 0.0.0.0 --port 8000 \
+  --cfg-parallel-size 2 \
+  --ulysses-degree 4 \
+  --use-hsdp --hsdp-shard-size 8 \
+  --init-timeout 1800
+```
+
+### 2x H200 / B300 (minimum)
+
+```bash
+vllm serve nvidia/Cosmos3-Super \
+  --omni \
+  --host 0.0.0.0 --port 8000 \
+  --cfg-parallel-size 2 \
+  --use-hsdp --hsdp-shard-size 2 \
+  --init-timeout 1800
+```
+
+Guardrails are on by default (gated `nvidia/Cosmos-1.0-Guardrail` — `pip install
+cosmos-guardrail`, accept the license, set `HF_TOKEN`); add `--no-guardrails` to
+disable. `--enable-layerwise-offload` reduces VRAM on smaller GPUs.
+
+#### Verification
+
+Requests are identical to Nano (see [`Cosmos3-Nano.md`](./Cosmos3-Nano.md) for full
+T2I/T2V/I2V/T2VS curls); official params: `size=1280x720, num_frames=189, fps=24,
+num_inference_steps=35, guidance_scale=6.0, flow_shift=10.0, max_sequence_length=4096`.
+
+```bash
+curl http://localhost:8000/v1/models
+# T2V (official prompt assets give best quality)
+curl -sS -X POST http://localhost:8000/v1/videos/sync -H "Accept: video/mp4" \
+  -F "model=nvidia/Cosmos3-Super" -F "prompt=A robot arm is cleaning a plate in the kitchen" \
+  -F "size=1280x720" -F "num_frames=189" -F "fps=24" -F "num_inference_steps=35" \
+  -F "guidance_scale=6.0" -F "max_sequence_length=4096" -F "flow_shift=10.0" \
+  -F 'extra_params={"use_resolution_template":false,"use_duration_template":false,"guardrails":true}' \
+  -F "seed=17" -o cosmos3_super_t2v.mp4
+```
+
+#### Notes
+
+- **Measured (2x B300, bf16, guardrails off, official 2-GPU config above):**
+  - T2I 1024², 50 steps → **~6 s**
+  - T2V 1280×720, 189 frames, 35 steps → **~197 s**
+  - I2V 1280×720, 189 frames, 35 steps → **~200 s**
+  - T2V + sound (189 frames, 35 steps) → **~198 s**, output muxes **AAC 48 kHz stereo**
+  - (NVIDIA's reference: 8×H200 @ 50 steps ≈ 55 s/video; 2×H200 @ 35 steps ≈ 3 min/video.)
+- **Memory:** ~61.5 GiB per GPU when sharded across 2 GPUs (HSDP shard 2); repo ~135 GB on disk.
+- Same generation defaults, supported sizes, and `generate_sound`/`sound_duration`
+  semantics as Nano. Action (policy / forward- / inverse-dynamics) modalities are
+  not part of this integration yet.

From 7765517a2eb595a673a0fcfd71c800e2df85bafe Mon Sep 17 00:00:00 2001
From: lishunyang12 <lishunyang12@163.com>
Date: Tue, 2 Jun 2026 20:29:47 +0000
Subject: [PATCH 11/11] polish Cosmos3 recipes: add model field, install note,
 Super curls

Signed-off-by: lishunyang12 <lishunyang12@163.com>
---
 recipes/nvidia/Cosmos3-Nano.md  | 12 +++++++++---
 recipes/nvidia/Cosmos3-Super.md | 21 +++++++++++++++++++++
 2 files changed, 30 insertions(+), 3 deletions(-)

diff --git a/recipes/nvidia/Cosmos3-Nano.md b/recipes/nvidia/Cosmos3-Nano.md
index 41df35e2883..57f6b983cda 100644
--- a/recipes/nvidia/Cosmos3-Nano.md
+++ b/recipes/nvidia/Cosmos3-Nano.md
@@ -52,6 +52,9 @@ the mode is selected per request:
 
 #### Command
 
+Requires the `vllm-omni` package (or the `vllm/vllm-omni:cosmos3` container),
+which provides the `vllm serve … --omni` entrypoint used below.
+
 Safety guardrails are **on by default** (NVIDIA Open Model License). They load
 the **gated** `nvidia/Cosmos-1.0-Guardrail` model, so to keep them on you must:
 
@@ -126,6 +129,7 @@ curl -sS -X POST http://localhost:8000/v1/videos/sync \
 # Text-to-video-with-sound
 curl -sS -X POST http://localhost:8000/v1/videos/sync \
   -H "Accept: video/mp4" \
+  -F "model=nvidia/Cosmos3-Nano" \
   -F "prompt=The video opens with a view of a well-lit indoor fruit display. A robotic arm picks up a pear, an orange, and a carambola one by one, placing each into a plastic bag in a shopping cart with red handles. The video is 7.875 seconds long, 24 FPS, and 1280x720. Audio description: soft servo whirs, gentle fruit thuds, plastic bag rustling, and a faint refrigeration hum." \
   -F "negative_prompt=blurry, distorted, low quality" \
   -F "size=1280x720" \
@@ -158,7 +162,9 @@ curl -sS -X POST http://localhost:8000/v1/videos/sync \
   3:4, 9:16. Defaults: T2I 1024², 50 steps, guidance 7.0; T2V/I2V 1280×720,
   189 frames, 35 steps, guidance 6.0, `flow_shift=10.0`.
 - **Key flags / params:** `--no-guardrails` (server) or
-  `extra_params={"guardrails":false}` (per request) toggles safety;
+  `extra_params={"guardrails":false}` (per request) toggles safety. The
+  per-request flag only takes effect when the server was launched **with**
+  guardrails enabled (it cannot re-enable them on a `--no-guardrails` server).
   `use_resolution_template` / `use_duration_template` are off by default and only
   needed when not using upsampled prompts that already encode resolution/duration.
 - **Known limitations:**
@@ -194,8 +200,8 @@ def main():
         model_class_name="Cosmos3OmniDiffusersPipeline",
         trust_remote_code=True,
         enforce_eager=True,
-        # Keep guardrails on by installing cosmos-guardrail + gated-repo access;
-        # this disables them for a quick local run.
+        # Guardrails are disabled here for a quick local run. To enable them, install
+        # cosmos-guardrail, obtain gated-repo access, and remove this setting.
         model_config={"guardrails": False},
     )
     gen = torch.Generator(device="cpu").manual_seed(42)
diff --git a/recipes/nvidia/Cosmos3-Super.md b/recipes/nvidia/Cosmos3-Super.md
index 33d3ec093b9..528b7a77393 100644
--- a/recipes/nvidia/Cosmos3-Super.md
+++ b/recipes/nvidia/Cosmos3-Super.md
@@ -29,6 +29,9 @@ for audio).
 
 ## GPU
 
+Requires the `vllm-omni` package (or the `vllm/vllm-omni:cosmos3` container),
+which provides the `vllm serve … --omni` entrypoint used below.
+
 ### 8x H200/H100/A100 (recommended, per model card)
 
 ```bash
@@ -71,6 +74,24 @@ curl -sS -X POST http://localhost:8000/v1/videos/sync -H "Accept: video/mp4" \
   -F "guidance_scale=6.0" -F "max_sequence_length=4096" -F "flow_shift=10.0" \
   -F 'extra_params={"use_resolution_template":false,"use_duration_template":false,"guardrails":true}' \
   -F "seed=17" -o cosmos3_super_t2v.mp4
+
+# I2V — add an uploaded reference image
+curl -sS -X POST http://localhost:8000/v1/videos/sync -H "Accept: video/mp4" \
+  -F "model=nvidia/Cosmos3-Super" -F "prompt=The scene comes to life with smooth, natural motion." \
+  -F "size=1280x720" -F "num_frames=189" -F "fps=24" -F "num_inference_steps=35" \
+  -F "guidance_scale=6.0" -F "max_sequence_length=4096" -F "flow_shift=10.0" \
+  -F 'extra_params={"use_resolution_template":false,"use_duration_template":false,"guardrails":true}' \
+  -F "seed=1111" -F "input_reference=@/path/to/reference.jpg;type=image/jpeg" \
+  -o cosmos3_super_i2v.mp4
+
+# T2V + sound — add generate_sound/sound_duration (output is muxed with AAC 48 kHz stereo audio)
+curl -sS -X POST http://localhost:8000/v1/videos/sync -H "Accept: video/mp4" \
+  -F "model=nvidia/Cosmos3-Super" -F "prompt=A robot arm is cleaning a plate in the kitchen" \
+  -F "size=1280x720" -F "num_frames=189" -F "fps=24" -F "num_inference_steps=35" \
+  -F "guidance_scale=6.0" -F "max_sequence_length=4096" -F "flow_shift=10.0" \
+  -F "generate_sound=true" -F "sound_duration=7.875" \
+  -F 'extra_params={"use_resolution_template":false,"use_duration_template":false,"guardrails":true}' \
+  -F "seed=17" -o cosmos3_super_t2vs.mp4
 ```
 
 #### Notes