From a37fd9e307c90f5b6433676235051119030bc4d7 Mon Sep 17 00:00:00 2001 From: Maciej Bala Date: Thu, 28 May 2026 17:58:43 +0200 Subject: [PATCH 01/11] Add Cosmos3 sound generation Signed-off-by: Maciej Bala Signed-off-by: lishunyang12 --- docs/models/supported_models.md | 2 +- tests/diffusion/models/cosmos3/conftest.py | 185 ++++++ .../models/cosmos3/test_cosmos3_pipeline.py | 72 ++- .../cosmos3/test_cosmos3_sound_tokenizer.py | 226 ++++++++ .../cosmos3/test_cosmos3_transformer.py | 105 +++- .../openai_api/test_video_server.py | 9 + .../cosmos3/audio_tokenizer/__init__.py | 6 + .../models/cosmos3/audio_tokenizer/avae.py | 323 +++++++++++ .../models/cosmos3/pipeline_cosmos3.py | 293 +++++++++- .../models/cosmos3/sound_tokenizer.py | 537 ++++++++++++++++++ .../models/cosmos3/transformer_cosmos3.py | 195 ++++++- vllm_omni/entrypoints/openai/api_server.py | 4 + .../entrypoints/openai/protocol/videos.py | 9 + vllm_omni/entrypoints/openai/serving_video.py | 4 + 14 files changed, 1941 insertions(+), 29 deletions(-) create mode 100644 tests/diffusion/models/cosmos3/conftest.py create mode 100644 tests/diffusion/models/cosmos3/test_cosmos3_sound_tokenizer.py create mode 100644 vllm_omni/diffusion/models/cosmos3/audio_tokenizer/__init__.py create mode 100644 vllm_omni/diffusion/models/cosmos3/audio_tokenizer/avae.py create mode 100644 vllm_omni/diffusion/models/cosmos3/sound_tokenizer.py diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index c4e181d5917..6482d503c8b 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -33,7 +33,7 @@ th { | `ZImagePipeline` | Z-Image | `Tongyi-MAI/Z-Image-Turbo` | ✅︎ | ✅︎ | ✅︎ | ✅︎ | | `WanPipeline` | Wan2.1-T2V, Wan2.2-T2V, Wan2.2-TI2V | `Wan-AI/Wan2.1-T2V-1.3B-Diffusers`, `Wan-AI/Wan2.1-T2V-14B-Diffusers`, `Wan-AI/Wan2.2-T2V-A14B-Diffusers`, `Wan-AI/Wan2.2-TI2V-5B-Diffusers` | ✅︎ | ✅︎ | ✅︎ | ✅︎ | | `WanImageToVideoPipeline` | Wan2.2-I2V | `Wan-AI/Wan2.2-I2V-A14B-Diffusers` | ✅︎ | 
✅︎ | ✅︎ | ✅︎ | -| `Cosmos3OmniDiffusersPipeline` | Cosmos3 T2I, T2V, I2V | `nvidia/Cosmos3-Nano` | ✅︎ | | | | +| `Cosmos3OmniDiffusersPipeline` | Cosmos3 T2I, T2V, I2V, T2V with sound | `nvidia/Cosmos3-Nano` | ✅︎ | | | | | `WanSpeechToVideoPipeline` | Wan2.2-S2V | `Wan-AI/Wan2.2-S2V-14B` | ✅︎ | ✅︎ | ✅︎ | ✅︎ | | `Wan22VACEPipeline` | Wan2.1-VACE | `Wan-AI/Wan2.1-VACE-1.3B-diffusers`, `Wan-AI/Wan2.1-VACE-14B-diffusers` | ✅︎ | ✅︎ | ✅︎ | ✅︎ | | `LTX2Pipeline` | LTX-2-T2V | `Lightricks/LTX-2` | ✅︎ | ✅︎ | | | diff --git a/tests/diffusion/models/cosmos3/conftest.py b/tests/diffusion/models/cosmos3/conftest.py new file mode 100644 index 00000000000..7075065447c --- /dev/null +++ b/tests/diffusion/models/cosmos3/conftest.py @@ -0,0 +1,185 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from __future__ import annotations + +import sys +import types +from types import SimpleNamespace +from typing import Any + +import pytest +import torch +from torch import nn + + +class StubScheduler: + def __init__(self, timesteps: list[int] | None = None, *, flow_shift: float = 1.0) -> None: + self.timesteps = torch.tensor(timesteps or [9, 3], dtype=torch.int64) + self.config = SimpleNamespace(num_train_timesteps=1000, flow_shift=flow_shift) + self.set_timesteps_calls: list[tuple[int, torch.device]] = [] + self.step_calls: list[tuple[torch.Tensor, torch.Tensor, torch.Tensor]] = [] + + def set_timesteps(self, num_steps: int, device: torch.device) -> None: + self.set_timesteps_calls.append((num_steps, device)) + self.timesteps = torch.arange(num_steps, 0, -1, dtype=torch.int64, device=device) + + def step(self, noise_pred: torch.Tensor, timestep: torch.Tensor, latents: torch.Tensor, **kwargs): + del kwargs + self.step_calls.append((noise_pred.clone(), timestep.clone(), latents.clone())) + return (latents + noise_pred,) + + +class _ModeLatentDist: + def __init__(self, latents: torch.Tensor) -> None: + self._latents = latents + 
+ def mode(self) -> torch.Tensor: + return self._latents + + +class StubCosmos3VAE: + dtype = torch.float32 + + def __init__(self, z_dim: int = 2, *, temporal: int = 4, spatial: int = 8) -> None: + self.config = SimpleNamespace( + z_dim=z_dim, + scale_factor_temporal=temporal, + scale_factor_spatial=spatial, + latents_mean=[0.0] * z_dim, + latents_std=[1.0] * z_dim, + ) + + def encode(self, video: torch.Tensor): + latent_frames = (video.shape[2] - 1) // self.config.scale_factor_temporal + 1 + latent_height = video.shape[-2] // self.config.scale_factor_spatial + latent_width = video.shape[-1] // self.config.scale_factor_spatial + latents = torch.ones( + video.shape[0], + self.config.z_dim, + latent_frames, + latent_height, + latent_width, + dtype=video.dtype, + device=video.device, + ) + return SimpleNamespace(latent_dist=_ModeLatentDist(latents)) + + def decode(self, latents: torch.Tensor, return_dict: bool = False): + del return_dict + return (latents,) + + +class StubCosmos3Transformer(nn.Module): + def __init__( + self, + *, + latent_channel_size: int = 2, + sound_gen: bool = False, + sound_dim: int = 3, + ) -> None: + super().__init__() + self.latent_channel_size = latent_channel_size + self.sound_gen = sound_gen + self.sound_dim = sound_dim + self.cached_kv: Any | None = None + self.cached_freqs_gen: Any | None = None + self.calls: list[dict[str, Any]] = [] + self.reset_calls = 0 + + def reset_cache(self) -> None: + self.reset_calls += 1 + self.cached_kv = None + self.cached_freqs_gen = None + + def forward( + self, + *, + hidden_states: torch.Tensor, + timestep: torch.Tensor, + text_ids: torch.Tensor, + text_mask: torch.Tensor, + **kwargs: Any, + ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: + token = int(text_ids.reshape(-1)[0].item()) if text_ids.numel() else 0 + sound_latents = kwargs.get("sound_latents") + self.calls.append( + { + "token": token, + "timestep": timestep.clone(), + "text_mask": text_mask.clone(), + "cache_before": self.cached_kv, + 
"kwargs": dict(kwargs), + } + ) + if self.cached_kv is None: + marker = torch.tensor([token], dtype=torch.float32) + self.cached_kv = [(marker, marker + 100)] + self.cached_freqs_gen = (marker + 200, marker + 300) + outputs: list[torch.Tensor] = [torch.full_like(hidden_states, float(token))] + if sound_latents is not None: + outputs.append(torch.full_like(sound_latents, float(token + 10))) + return outputs[0] if len(outputs) == 1 else tuple(outputs) + + +def passthrough_progress_bar(iterable): + return iterable + + +@pytest.fixture(autouse=True) +def fake_cosmos3_guardrails(monkeypatch: pytest.MonkeyPatch): + module = types.ModuleType("vllm_omni.diffusion.models.cosmos3.guardrails") + module.is_guardrails_enabled = lambda od_config, sampling_params=None: False + module.ensure_initialized = lambda od_config: None + module.check_text_safety = lambda text: None + module.check_video_safety = lambda video: video + monkeypatch.setitem(sys.modules, module.__name__, module) + return module + + +@pytest.fixture +def make_cosmos3_pipeline(): + def _make(): + from vllm_omni.diffusion.models.cosmos3.pipeline_cosmos3 import ( + Cosmos3OmniDiffusersPipeline, + ) + + pipeline = object.__new__(Cosmos3OmniDiffusersPipeline) + nn.Module.__init__(pipeline) + pipeline.od_config = SimpleNamespace() + pipeline.device = torch.device("cpu") + pipeline.dtype = torch.float32 + pipeline.transformer = StubCosmos3Transformer(latent_channel_size=2) + pipeline.vae = StubCosmos3VAE(z_dim=2) + pipeline.vae_scale_factor_temporal = 4 + pipeline.vae_scale_factor_spatial = 8 + pipeline.scheduler = StubScheduler([9, 3], flow_shift=1.0) + pipeline._base_scheduler_config = pipeline.scheduler.config + pipeline._engine_init_flow_shift = 1.0 + pipeline._current_flow_shift = 1.0 + pipeline._guidance_scale = None + pipeline._num_timesteps = None + pipeline.progress_bar = passthrough_progress_bar + pipeline._sound_tokenizer = None + return pipeline + + return _make + + +def make_sampling_params(**overrides: 
Any) -> SimpleNamespace: + values = { + "height": None, + "width": None, + "num_frames": None, + "num_inference_steps": None, + "guidance_scale": None, + "generator": None, + "seed": 123, + "num_outputs_per_prompt": 1, + "frame_rate": None, + "resolved_frame_rate": None, + "max_sequence_length": None, + "extra_args": {}, + } + values.update(overrides) + return SimpleNamespace(**values) diff --git a/tests/diffusion/models/cosmos3/test_cosmos3_pipeline.py b/tests/diffusion/models/cosmos3/test_cosmos3_pipeline.py index 31b40b6eee5..b6116d9265d 100644 --- a/tests/diffusion/models/cosmos3/test_cosmos3_pipeline.py +++ b/tests/diffusion/models/cosmos3/test_cosmos3_pipeline.py @@ -227,7 +227,7 @@ def test_preprocess_i2v_image_input() -> None: assert tuple(result.prompts[0]["additional_information"]["preprocessed_image"].shape[-2:]) == (672, 1344) -def test_postprocess_handles_image_video_and_validation() -> None: +def test_postprocess_handles_image_video_audio_and_validation() -> None: from vllm_omni.diffusion.models.cosmos3.pipeline_cosmos3 import get_cosmos3_post_process_func func = get_cosmos3_post_process_func(SimpleNamespace()) @@ -235,6 +235,14 @@ def test_postprocess_handles_image_video_and_validation() -> None: assert func(video, output_type="latent") is video assert func({"image": video})[0].size == (4, 4) + assert "video" in func({"video": video}) + assert ( + func( + {"video": video, "audio": torch.ones(1, 2, 16), "audio_sample_rate": 48000}, + sampling_params=SimpleNamespace(extra_args={"resolved_frame_rate": 12}), + )["audio_sample_rate"] + == 48000 + ) with pytest.raises(ValueError, match="text-to-image postprocess expects"): func({"image": torch.zeros(1, 3, 2, 4, 4)}) @@ -293,7 +301,7 @@ def test_prompt_formatting_and_checkpoint_key_remap(make_cosmos3_pipeline) -> No assert {key: Cosmos3OmniDiffusersPipeline._remap_ckpt_key(key) for key in remaps} == remaps -def test_prepare_latents_for_video_and_image(make_cosmos3_pipeline) -> None: +def 
test_prepare_latents_for_video_image_and_sound(make_cosmos3_pipeline) -> None: pipeline = make_cosmos3_pipeline() latents = pipeline._prepare_latents(16, 24, 5, torch.Generator(device="cpu").manual_seed(0)) assert latents.shape == (1, 2, 2, 2, 3) @@ -306,8 +314,24 @@ def test_prepare_latents_for_video_and_image(make_cosmos3_pipeline) -> None: assert velocity_mask.tolist() == [[[[[0.0]], [[1.0]]]]] assert image_latent.shape == (1, 2, 1, 2, 3) + pipeline.transformer = pipeline.transformer.__class__(latent_channel_size=2, sound_gen=True, sound_dim=3) + pipeline._sound_tokenizer = SimpleNamespace( + sample_rate=10, + latent_ch=3, + hop_size=4, + decode=lambda x: torch.ones(x.shape[0], 2, 24), + ) + assert pipeline._resolve_sound_target_samples(SimpleNamespace(extra_args={"sound_duration": 2.0}), 9, 3.0) == ( + 20, + 2.0, + 10, + ) + sound_latents, latent_frames = pipeline._prepare_sound_latents(21, torch.Generator(device="cpu").manual_seed(0)) + assert (sound_latents.shape, latent_frames) == (torch.Size([1, 3, 6]), 6) + assert pipeline._decode_sound_latents(torch.zeros(1, 3, 6), target_audio_samples=21).shape == (1, 2, 21) -def test_diffuse_covers_cfg_and_i2v_steps(make_cosmos3_pipeline) -> None: + +def test_diffuse_covers_cfg_i2v_and_sound_steps(make_cosmos3_pipeline) -> None: pipeline = make_cosmos3_pipeline() latents = torch.zeros(1, 2, 1, 1, 1) @@ -339,6 +363,21 @@ def test_diffuse_covers_cfg_and_i2v_steps(make_cosmos3_pipeline) -> None: ) torch.testing.assert_close(i2v[:, :, 0:1], torch.full((1, 2, 1, 1, 1), 7.0)) + pipeline.transformer = pipeline.transformer.__class__(latent_channel_size=2, sound_gen=True, sound_dim=3) + video_result, sound_result = pipeline.diffuse( + latents=latents, + sound_latents=torch.zeros(1, 3, 4), + timesteps=torch.tensor([7, 3]), + cond_ids=_ids(2), + cond_mask=_mask(), + uncond_ids=_ids(1), + uncond_mask=_mask(), + guidance_scale=1.0, + shared_kwargs={"video_shape": (1, 1, 1), "fps": 24.0}, + ) + 
torch.testing.assert_close(video_result, torch.full_like(latents, 4.0)) + torch.testing.assert_close(sound_result, torch.full((), 24.0).expand_as(sound_result)) + def test_diffuse_keeps_paired_cfg_when_cache_dit_active(make_cosmos3_pipeline) -> None: """With cache-dit active the uncond pass is kept even outside the guidance @@ -395,7 +434,10 @@ def fake_prepare(height, width, num_frames, generator): def fake_diffuse(**kwargs): captured["diffuse_calls"].append(kwargs) - return kwargs["latents"] + len(captured["diffuse_calls"]) + outputs = [kwargs["latents"] + len(captured["diffuse_calls"])] + if kwargs.get("sound_latents") is not None: + outputs.append(kwargs["sound_latents"] + 2.0) + return outputs[0] if len(outputs) == 1 else tuple(outputs) pipeline._format_and_tokenize_prompts = fake_format pipeline._prepare_latents = fake_prepare @@ -437,7 +479,7 @@ def test_forward_defaults_and_mode_selection( assert captured["flow_shifts"] == expected["flow"] assert [call[0] for call in pipeline.scheduler.set_timesteps_calls] == expected["steps"] - def test_forward_i2v_route(self, make_cosmos3_pipeline) -> None: + def test_forward_i2v_and_sound_routes(self, make_cosmos3_pipeline) -> None: pipeline = make_cosmos3_pipeline() captured = self._install_forward_stubs(pipeline) image_tensor = torch.zeros(1, 3, 16, 16) @@ -462,11 +504,30 @@ def test_forward_i2v_route(self, make_cosmos3_pipeline) -> None: ) assert captured["diffuse_calls"][-1]["shared_kwargs"]["noisy_frame_mask"] is velocity_mask + pipeline.transformer = pipeline.transformer.__class__(latent_channel_size=2, sound_gen=True, sound_dim=3) + sound_latents = torch.zeros(1, 3, 4) + pipeline._resolve_sound_target_samples = lambda *args: (20, 2.0, 10) + pipeline._prepare_sound_latents = lambda *args: (sound_latents, 4) + pipeline._decode_sound_latents = lambda *args: torch.ones(1, 2, 20) + output = pipeline.forward( + SimpleNamespace( + prompts=[{"prompt": "A robot", "modalities": ["video"], "generate_sound": True}], + 
sampling_params=make_sampling_params(num_frames=9, frame_rate=3.0), + ) + ) + assert captured["diffuse_calls"][-1]["sound_latents"] is sound_latents + assert output.output["audio_sample_rate"] == 10 + @pytest.mark.parametrize( ("prompt", "sampling_params", "message"), [ (["one", "two"], make_sampling_params(), "single prompt"), ([{"prompt": "one", "modalities": ["image", "video"]}], make_sampling_params(), "both image and video"), + ( + [{"prompt": "x", "modalities": ["image"], "generate_sound": True}], + make_sampling_params(), + "only for video", + ), ], ) def test_forward_rejects_invalid_public_requests( @@ -477,6 +538,7 @@ def test_forward_rejects_invalid_public_requests( message, ) -> None: pipeline = make_cosmos3_pipeline() + pipeline.transformer = pipeline.transformer.__class__(latent_channel_size=2, sound_gen=True, sound_dim=3) with pytest.raises(ValueError, match=message): pipeline.forward(SimpleNamespace(prompts=prompt, sampling_params=sampling_params)) diff --git a/tests/diffusion/models/cosmos3/test_cosmos3_sound_tokenizer.py b/tests/diffusion/models/cosmos3/test_cosmos3_sound_tokenizer.py new file mode 100644 index 00000000000..47664c59e77 --- /dev/null +++ b/tests/diffusion/models/cosmos3/test_cosmos3_sound_tokenizer.py @@ -0,0 +1,226 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from __future__ import annotations + +import json +from pathlib import Path +from types import SimpleNamespace + +import pytest +import torch + +pytestmark = [pytest.mark.core_model, pytest.mark.cpu, pytest.mark.diffusion] + +DIFFUSERS_SOUND_TOKENIZER_CHECKPOINT_NAME = "diffusion_pytorch_model.safetensors" + + +class _FakeAVAEAudioTokenizer: + def __init__(self, **kwargs) -> None: + self.kwargs = kwargs + self.sample_rate = int(kwargs["sample_rate"]) + self.audio_channels = int(kwargs["audio_channels"]) + self.latent_ch = int(kwargs["io_channels"]) + self.temporal_compression_factor = int(kwargs["hop_size"]) 
+ + def get_latent_num_samples(self, num_audio_samples: int) -> int: + return int(num_audio_samples) // self.temporal_compression_factor + + def get_audio_num_samples(self, num_latent_samples: int) -> int: + return int(num_latent_samples) * self.temporal_compression_factor + + def decode(self, latents: torch.Tensor) -> torch.Tensor: + return torch.zeros(latents.shape[0], self.audio_channels, 8) + + +def _write_component(root: Path, config: dict | None = None, checkpoint_name: str | None = None) -> Path: + tokenizer_dir = root / "sound_tokenizer" + tokenizer_dir.mkdir(parents=True) + if checkpoint_name: + (tokenizer_dir / checkpoint_name).write_bytes(b"stub") + (tokenizer_dir / "config.json").write_text(json.dumps(config or {}), encoding="utf-8") + return tokenizer_dir + + +def _patch_fake_avae(monkeypatch: pytest.MonkeyPatch, created: dict) -> None: + from vllm_omni.diffusion.models.cosmos3 import sound_tokenizer + + class FakeAVAE(_FakeAVAEAudioTokenizer): + def __init__(self, **kwargs) -> None: + created.update(kwargs) + super().__init__(**kwargs) + + monkeypatch.setattr(sound_tokenizer, "Cosmos3AVAEAudioTokenizer", FakeAVAE) + monkeypatch.setattr(sound_tokenizer, "get_local_device", lambda: torch.device("cpu")) + + +def test_from_config_loads_local_diffusers_component(tmp_path, monkeypatch: pytest.MonkeyPatch) -> None: + from vllm_omni.diffusion.models.cosmos3 import sound_tokenizer + + model_dir = tmp_path / "model" + tokenizer_dir = _write_component(model_dir, checkpoint_name=DIFFUSERS_SOUND_TOKENIZER_CHECKPOINT_NAME) + created = {} + _patch_fake_avae(monkeypatch, created) + + tokenizer = sound_tokenizer.Cosmos3SoundTokenizer.from_config( + SimpleNamespace( + model=str(model_dir), + custom_pipeline_args={"sound_sample_rate": 32000, "sound_hop_size": 800, "sound_dim": 3}, + dtype=torch.float32, + ) + ) + + assert created["checkpoint_path"] == str(tokenizer_dir / DIFFUSERS_SOUND_TOKENIZER_CHECKPOINT_NAME) + assert created["config_path"] == str(tokenizer_dir / 
"config.json") + assert (tokenizer.sample_rate, tokenizer.latent_ch, tokenizer.hop_size) == (32000, 3, 800) + + +def test_from_config_downloads_component_from_hf_repo(tmp_path, monkeypatch: pytest.MonkeyPatch) -> None: + import huggingface_hub + + from vllm_omni.diffusion.models.cosmos3 import sound_tokenizer + + cache_dir = tmp_path / "hf" + _write_component(cache_dir, checkpoint_name=DIFFUSERS_SOUND_TOKENIZER_CHECKPOINT_NAME) + calls = [] + created = {} + _patch_fake_avae(monkeypatch, created) + + def fake_snapshot_download(repo_id: str, *, revision: str | None, allow_patterns: list[str]) -> str: + calls.append((repo_id, revision, allow_patterns)) + return str(cache_dir) + + monkeypatch.setattr(huggingface_hub, "snapshot_download", fake_snapshot_download) + + sound_tokenizer.Cosmos3SoundTokenizer.from_config( + SimpleNamespace( + model="nvidia/cosmos3", + revision="test-rev", + custom_pipeline_args={"sound_sample_rate": 32000, "sound_hop_size": 800, "sound_dim": 3}, + dtype=torch.float32, + ) + ) + + assert created["checkpoint_path"].endswith(DIFFUSERS_SOUND_TOKENIZER_CHECKPOINT_NAME) + assert calls == [ + ( + "nvidia/cosmos3", + "test-rev", + ["sound_tokenizer/config.json", f"sound_tokenizer/{DIFFUSERS_SOUND_TOKENIZER_CHECKPOINT_NAME}"], + ) + ] + + +@pytest.mark.parametrize( + ("checkpoint_name", "message"), + [ + (None, "no AVAE sound tokenizer checkpoint"), + ("model.safetensors", DIFFUSERS_SOUND_TOKENIZER_CHECKPOINT_NAME), + ], +) +def test_default_component_requires_diffusers_checkpoint_name(tmp_path, checkpoint_name, message) -> None: + from vllm_omni.diffusion.models.cosmos3 import sound_tokenizer + + model_dir = tmp_path / "model" + _write_component(model_dir, checkpoint_name=checkpoint_name) + + with pytest.raises(ValueError, match=message): + sound_tokenizer.Cosmos3SoundTokenizer.from_config( + SimpleNamespace(model=str(model_dir), custom_pipeline_args={}, dtype=torch.float32) + ) + + +def 
test_component_config_precedence_and_conflict_detection(tmp_path, monkeypatch: pytest.MonkeyPatch) -> None: + from vllm_omni.diffusion.models.cosmos3 import sound_tokenizer + + component_config = { + "sampling_rate": 48000, + "dec_out_channels": 2, + "vocoder_input_dim": 64, + "hop_size": 1920, + } + model_dir = tmp_path / "model" + _write_component(model_dir, component_config, DIFFUSERS_SOUND_TOKENIZER_CHECKPOINT_NAME) + created = {} + _patch_fake_avae(monkeypatch, created) + + tokenizer = sound_tokenizer.Cosmos3SoundTokenizer.from_config( + SimpleNamespace( + model=str(model_dir), + custom_pipeline_args={ + "sound_normalize_latents": True, + "sound_normalization_type": "tanh", + "sound_tanh_input_scale": 2.0, + }, + model_config={ + "sound_tokenizer": { + "sample_rate": 32000, + "audio_channels": 1, + "io_channels": 3, + "hop_size": 800, + "normalize_latents": False, + "normalization_type": "none", + } + }, + dtype=torch.float32, + ) + ) + + assert (created["sample_rate"], created["audio_channels"], created["io_channels"], created["hop_size"]) == ( + 48000, + 2, + 64, + 1920, + ) + assert (created["normalize_latents"], created["normalization_type"], created["tanh_input_scale"]) == ( + True, + "tanh", + 2.0, + ) + assert (tokenizer.sample_rate, tokenizer.latent_ch, tokenizer.hop_size) == (48000, 64, 1920) + + with pytest.raises(ValueError, match=r"sample_rate.*48000.*32000"): + sound_tokenizer.Cosmos3SoundTokenizer.from_config( + SimpleNamespace( + model=str(model_dir), + custom_pipeline_args={"sound_sample_rate": 32000}, + dtype=torch.float32, + ) + ) + + +def test_avae_uses_diffusers_decoder_state_dict_layout(tmp_path) -> None: + from safetensors.torch import save_file + + from vllm_omni.diffusion.models.cosmos3.audio_tokenizer import avae + + config = { + "sampling_rate": 8000, + "hop_size": 2, + "dec_dim": 4, + "dec_c_mults": [1], + "dec_strides": [2], + "dec_out_channels": 1, + "vocoder_input_dim": 2, + "normalization_type": "none", + } + checkpoint_path = 
tmp_path / DIFFUSERS_SOUND_TOKENIZER_CHECKPOINT_NAME + config_path = tmp_path / "config.json" + config_path.write_text(json.dumps(config), encoding="utf-8") + + decoder = avae.OobleckDecoder(4, 2, 1, [2], [1]) + save_file({f"decoder.{key}": value for key, value in decoder.state_dict().items()}, str(checkpoint_path)) + + tokenizer = avae.Cosmos3AVAEAudioTokenizer( + checkpoint_path=checkpoint_path, + config_path=config_path, + dtype=torch.float32, + device="cpu", + ) + + keys = set(tokenizer.state_dict()) + assert {"decoder.conv1.weight_g", "decoder.block.0.conv_t1.weight_g", "decoder.conv2.weight_g"} <= keys + assert not any(key.startswith(("decoder.layers.", "model.decoder.")) for key in keys) + assert tokenizer.decode(torch.zeros(1, 2, 3)).shape == (1, 1, 6) + with pytest.raises(NotImplementedError, match="decoder-only"): + tokenizer.encode(torch.zeros(1, 1, 6)) diff --git a/tests/diffusion/models/cosmos3/test_cosmos3_transformer.py b/tests/diffusion/models/cosmos3/test_cosmos3_transformer.py index 730079c116a..38db56e0c26 100644 --- a/tests/diffusion/models/cosmos3/test_cosmos3_transformer.py +++ b/tests/diffusion/models/cosmos3/test_cosmos3_transformer.py @@ -29,8 +29,9 @@ def _tiny_cosmos3_config(**overrides): return config -def test_mrope_position_ids_cover_text_and_video() -> None: +def test_mrope_position_ids_cover_text_video_and_sound() -> None: from vllm_omni.diffusion.models.cosmos3.transformer_cosmos3 import ( + compute_mrope_position_ids_sound, compute_mrope_position_ids_text, compute_mrope_position_ids_vision, ) @@ -56,6 +57,10 @@ def test_mrope_position_ids_cover_text_and_video() -> None: torch.testing.assert_close(modulated_ids[0], torch.tensor([10.0, 12.0])) assert modulated_offset == 13 + sound_ids, sound_offset = compute_mrope_position_ids_sound(3, temporal_offset=10, sound_latent_fps=25.0) + torch.testing.assert_close(sound_ids[0], torch.tensor([10.0, 10.96, 11.92])) + assert sound_offset == 12 + @pytest.mark.parametrize( ("key", "value"), @@ 
-115,12 +120,90 @@ def test_forward_returns_video_prediction(monkeypatch: pytest.MonkeyPatch) -> No text_mask=torch.ones(1, 2, dtype=torch.long), video_shape=(1, 2, 2), fps=24.0, + sound_latents=torch.zeros(1, 3, 4), ) assert tuple(output.shape) == (1, 2, 1, 2, 2) -def test_compute_rope_freqs_places_text_and_video_positions() -> None: +def test_sound_modules_follow_config() -> None: + from vllm_omni.diffusion.models.cosmos3.transformer_cosmos3 import Cosmos3VFMTransformer + + tiny = _tiny_cosmos3_config() + no_modal = Cosmos3VFMTransformer(SimpleNamespace(tf_model_config=tiny, dtype=torch.float32)) + with_sound = Cosmos3VFMTransformer( + SimpleNamespace( + tf_model_config={**tiny, "sound_gen": True}, + model_config={"sound_tokenizer": {"io_channels": 5, "sample_rate": 32000, "hop_size": 800}}, + custom_pipeline_args={}, + dtype=torch.float32, + ) + ) + + assert no_modal.sound_gen is False + assert not hasattr(no_modal, "audio_proj_in") + assert with_sound.sound_dim == 5 + assert with_sound.sound_latent_fps == 40.0 + assert with_sound.audio_proj_in.in_features == 5 + + +def test_sound_pack_unpack_validate_shapes() -> None: + from vllm_omni.diffusion.models.cosmos3.transformer_cosmos3 import Cosmos3VFMTransformer + + model = object.__new__(Cosmos3VFMTransformer) + nn.Module.__init__(model) + model.sound_dim = 3 + + sound = torch.arange(2 * 3 * 4, dtype=torch.float32).reshape(2, 3, 4) + torch.testing.assert_close(model.unpack_sound(model.pack_sound(sound)), sound) + + with pytest.raises(ValueError, match="channel mismatch"): + model.pack_sound(torch.zeros(1, 4, 2)) + + +def test_forward_returns_video_and_sound_predictions() -> None: + from vllm_omni.diffusion.models.cosmos3.transformer_cosmos3 import Cosmos3VFMTransformer + + output = Cosmos3VFMTransformer( + SimpleNamespace( + tf_model_config=_tiny_cosmos3_config(sound_gen=True, sound_dim=3, sound_latent_fps=24.0), + dtype=torch.float32, + ) + )( + hidden_states=torch.zeros(1, 2, 1, 2, 2), + 
timestep=torch.tensor([1.0]), + text_ids=torch.tensor([[1, 2]], dtype=torch.long), + text_mask=torch.ones(1, 2, dtype=torch.long), + video_shape=(1, 2, 2), + fps=24.0, + sound_latents=torch.zeros(1, 3, 4), + ) + + assert isinstance(output, tuple) + assert [tuple(tensor.shape) for tensor in output] == [(1, 2, 1, 2, 2), (1, 3, 4)] + + +def test_forward_with_sound_ulysses_error_mentions_combined_sequence(monkeypatch: pytest.MonkeyPatch) -> None: + import vllm_omni.diffusion.models.cosmos3.transformer_cosmos3 as cosmos3_module + + model = cosmos3_module.Cosmos3VFMTransformer( + SimpleNamespace(tf_model_config=_tiny_cosmos3_config(sound_gen=True, sound_dim=3), dtype=torch.float32) + ) + monkeypatch.setattr(cosmos3_module, "_get_ulysses_state", lambda: (2, 0, None)) + + with pytest.raises(ValueError, match=r"GEN sequence length \(3 = video tokens 2 \+ sound tokens 1\)"): + model( + hidden_states=torch.zeros(1, 2, 1, 1, 2), + timestep=torch.tensor([1.0]), + text_ids=torch.tensor([[1, 2]], dtype=torch.long), + text_mask=torch.ones(1, 2, dtype=torch.long), + video_shape=(1, 1, 2), + fps=24.0, + sound_latents=torch.zeros(1, 3, 1), + ) + + +def test_compute_rope_freqs_places_text_video_and_sound_positions() -> None: from vllm_omni.diffusion.models.cosmos3.transformer_cosmos3 import Cosmos3VFMTransformer class FakeRotary: @@ -140,6 +223,8 @@ def __call__(self, x, position_ids): model.temporal_modality_margin = 100 model.base_fps = 24.0 model.temporal_compression_factor = 4 + model.temporal_compression_factor_sound = 1 + model.sound_latent_fps = 25.0 model.enable_fps_modulation = False freqs_und, freqs_gen = model._compute_rope_freqs( @@ -156,3 +241,19 @@ def __call__(self, x, position_ids): assert vision_pos[0, 0].tolist() == [102, 103] assert freqs_und[0].shape == (2, 3, 1, 4) assert freqs_gen[0].shape == (2, 2, 1, 4) + + rotary.position_ids.clear() + model._compute_rope_freqs( + text_mask=torch.tensor([[1, 1]], dtype=torch.long), + t=2, + hp=1, + wp=1, + fps=24.0, + 
device=torch.device("cpu"), + dtype=torch.float32, + t_sound=1, + ) + + _, gen_pos = rotary.position_ids + assert gen_pos.shape == (3, 1, 3) + assert gen_pos[0, 0].tolist() == [102, 103, 102] diff --git a/tests/entrypoints/openai_api/test_video_server.py b/tests/entrypoints/openai_api/test_video_server.py index 36b19333980..de1f14c7455 100644 --- a/tests/entrypoints/openai_api/test_video_server.py +++ b/tests/entrypoints/openai_api/test_video_server.py @@ -399,6 +399,8 @@ def test_sampling_params_pass_through(test_client, mocker: MockerFixture): "true_cfg_scale": "4.0", "boundary_ratio": "0.7", "flow_shift": "0.25", + "generate_sound": "true", + "sound_duration": "2.5", }, ) @@ -413,6 +415,8 @@ def test_sampling_params_pass_through(test_client, mocker: MockerFixture): assert captured.true_cfg_scale == 4.0 assert captured.boundary_ratio == 0.7 assert captured.extra_args["flow_shift"] == 0.25 + assert captured.extra_args["generate_sound"] is True + assert captured.extra_args["sound_duration"] == 2.5 def test_frame_interpolation_params_pass_to_diffusion_sampling_params(test_client, mocker: MockerFixture): @@ -756,6 +760,9 @@ def test_invalid_uploaded_input_reference_returns_400(test_client): def test_video_request_validation(): req = VideoGenerationRequest(prompt="test") assert req.prompt == "test" + assert req.generate_sound is False + assert req.sound_duration is None + assert VideoGenerationRequest(prompt="test", generate_sound=True, sound_duration=1.5).generate_sound is True with pytest.raises(ValueError): VideoGenerationRequest(prompt="test", size="invalid") @@ -768,6 +775,8 @@ def test_video_request_validation(): VideoGenerationRequest(prompt="test", frame_interpolation_exp=0) with pytest.raises(ValueError): VideoGenerationRequest(prompt="test", frame_interpolation_scale=0) + with pytest.raises(ValueError): + VideoGenerationRequest(prompt="test", sound_duration=0) def test_list_videos_supports_order_after_and_limit(test_client, mocker: MockerFixture): diff --git 
a/vllm_omni/diffusion/models/cosmos3/audio_tokenizer/__init__.py b/vllm_omni/diffusion/models/cosmos3/audio_tokenizer/__init__.py new file mode 100644 index 00000000000..cfb794705ba --- /dev/null +++ b/vllm_omni/diffusion/models/cosmos3/audio_tokenizer/__init__.py @@ -0,0 +1,6 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from .avae import Cosmos3AVAEAudioTokenizer + +__all__ = ["Cosmos3AVAEAudioTokenizer"] diff --git a/vllm_omni/diffusion/models/cosmos3/audio_tokenizer/avae.py b/vllm_omni/diffusion/models/cosmos3/audio_tokenizer/avae.py new file mode 100644 index 00000000000..4ddb8d41527 --- /dev/null +++ b/vllm_omni/diffusion/models/cosmos3/audio_tokenizer/avae.py @@ -0,0 +1,323 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Diffusers-format AVAE audio tokenizer used by Cosmos3 sound generation.""" + +from __future__ import annotations + +import json +import math +from pathlib import Path +from typing import Any + +import torch +from torch import nn +from torch.nn.utils import weight_norm +from vllm.logger import init_logger + +from vllm_omni.diffusion.models.progress_bar import _is_rank_zero + +logger = init_logger(__name__) + + +def _default_avae_config( + *, + sample_rate: int, + audio_channels: int, + io_channels: int, + hop_size: int, +) -> dict[str, Any]: + return { + "sampling_rate": sample_rate, + "hop_size": hop_size, + "dec_dim": 320, + "dec_c_mults": [1, 2, 4, 8, 16], + "dec_strides": [2, 4, 5, 6, 8], + "dec_out_channels": audio_channels, + "vocoder_input_dim": io_channels, + "normalization_type": "none", + "normalize_latents": False, + "tanh_input_scale": 1.5, + "tanh_output_scale": 3.5, + "tanh_clamp": 0.995, + } + + +def _config_get(config: dict[str, Any], *keys: str, default: Any = None) -> Any: + for key in keys: + value = config.get(key) + if value is not None: + return value + return default + + +def 
_load_config( + config_path: str | Path | None, + *, + sample_rate: int, + audio_channels: int, + io_channels: int, + hop_size: int, +) -> dict[str, Any]: + if config_path: + with open(config_path, encoding="utf-8") as f: + config = json.load(f) + if not isinstance(config, dict): + raise TypeError(f"Cosmos3 AVAE config must be a JSON object, got {type(config)!r}.") + return config + return _default_avae_config( + sample_rate=sample_rate, + audio_channels=audio_channels, + io_channels=io_channels, + hop_size=hop_size, + ) + + +def _load_checkpoint(path: str | Path, map_location: torch.device | str) -> dict[str, torch.Tensor]: + path = Path(path) + if path.suffix == ".safetensors": + try: + from safetensors.torch import load_file + except ImportError as exc: + raise ImportError("Loading AVAE .safetensors checkpoints requires safetensors.") from exc + checkpoint = load_file(str(path), device=str(map_location)) + else: + checkpoint = torch.load(path, map_location=map_location) + + if not isinstance(checkpoint, dict): + raise TypeError(f"AVAE checkpoint must be a flat state dict, got {type(checkpoint)!r}.") + if not all(isinstance(value, torch.Tensor) for value in checkpoint.values()): + raise TypeError("AVAE checkpoint must be a flat tensor state dict.") + return checkpoint + + +def _validate_diffusers_state_dict(state_dict: dict[str, torch.Tensor]) -> None: + if not state_dict: + raise RuntimeError("AVAE checkpoint is empty.") + + if not any(key.startswith("decoder.") for key in state_dict): + raise RuntimeError("Cosmos3 AVAE checkpoint must contain diffusers-format decoder.* keys.") + + +class Snake1d(nn.Module): + """One-dimensional Snake activation matching diffusers' Oobleck layout.""" + + def __init__(self, hidden_dim: int, logscale: bool = True) -> None: + super().__init__() + self.alpha = nn.Parameter(torch.zeros(1, hidden_dim, 1)) + self.beta = nn.Parameter(torch.zeros(1, hidden_dim, 1)) + self.logscale = logscale + + def forward(self, hidden_states: 
class OobleckResidualUnit(nn.Module):
    """Snake/conv residual branch added back onto its (possibly trimmed) input.

    The branch is ``snake1 -> conv1 (kernel 7, dilated) -> snake2 -> conv2
    (kernel 1)``. Submodule attribute names are part of the checkpoint key
    layout and must not be renamed.
    """

    def __init__(self, dimension: int = 16, dilation: int = 1) -> None:
        super().__init__()
        kernel = 7
        # Padding that keeps the dilated kernel-7 convolution length-preserving.
        same_pad = (kernel - 1) * dilation // 2
        self.snake1 = Snake1d(dimension)
        self.conv1 = weight_norm(
            nn.Conv1d(dimension, dimension, kernel_size=kernel, dilation=dilation, padding=same_pad)
        )
        self.snake2 = Snake1d(dimension)
        self.conv2 = weight_norm(nn.Conv1d(dimension, dimension, kernel_size=1))

    def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
        """Apply the residual branch and add it to ``hidden_state``."""
        branch = self.snake1(hidden_state)
        branch = self.conv1(branch)
        branch = self.conv2(self.snake2(branch))
        # If the branch came out shorter, crop the skip path symmetrically.
        trim = (hidden_state.shape[-1] - branch.shape[-1]) // 2
        skip = hidden_state[..., trim:-trim] if trim > 0 else hidden_state
        return skip + branch
def __init__(
    self,
    *,
    checkpoint_path: str | Path,
    config_path: str | Path | None = None,
    sample_rate: int = 48000,
    audio_channels: int = 2,
    io_channels: int = 64,
    hop_size: int = 1920,
    normalize_latents: bool = False,
    normalization_type: str = "none",
    tanh_input_scale: float = 1.5,
    tanh_output_scale: float = 3.5,
    tanh_clamp: float = 0.995,
    dtype: torch.dtype = torch.bfloat16,
    device: torch.device | str = "cuda",
) -> None:
    """Build the Oobleck decoder and load its weights from ``checkpoint_path``.

    Values present in the JSON file at ``config_path`` win over the keyword
    arguments. Without a config file the keyword arguments are used.

    Args:
        checkpoint_path: Diffusers-format checkpoint containing ``decoder.*`` keys.
        config_path: Optional JSON config describing the decoder.
        sample_rate: Fallback audio sample rate.
        audio_channels: Fallback number of output channels.
        io_channels: Fallback number of latent channels.
        hop_size: Fallback number of audio samples per latent frame.
        normalize_latents: Fallback flag; when true and ``normalization_type``
            is ``"none"``, tanh normalization is selected.
        normalization_type: Fallback latent normalization name.
        tanh_input_scale: Fallback tanh input scale.
        tanh_output_scale: Fallback tanh output scale.
        tanh_clamp: Fallback clamp applied before ``atanh``.
        dtype: Parameter dtype the module is cast to after loading.
        device: Device the checkpoint is loaded on and the module is moved to.

    Raises:
        ValueError: The product of ``dec_strides`` differs from ``hop_size``.
        RuntimeError: The checkpoint does not provide a weight for every
            parameter of this module.
    """
    super().__init__()
    self.dtype = dtype
    self.device = torch.device(device)

    config = _load_config(
        config_path,
        sample_rate=sample_rate,
        audio_channels=audio_channels,
        io_channels=io_channels,
        hop_size=hop_size,
    )
    if not config_path:
        # The built-in default config hard-codes these fields, which would
        # shadow the caller's keyword arguments in the lookups below.
        config = {
            **config,
            "normalization_type": normalization_type,
            "normalize_latents": normalize_latents,
            "tanh_input_scale": tanh_input_scale,
            "tanh_output_scale": tanh_output_scale,
            "tanh_clamp": tanh_clamp,
        }

    self.sample_rate = int(_config_get(config, "sampling_rate", "sample_rate", default=sample_rate))
    self.audio_channels = int(
        _config_get(
            config,
            "dec_out_channels",
            "audio_channels",
            default=2 if bool(config.get("stereo", audio_channels == 2)) else 1,
        )
    )
    self.latent_ch = int(_config_get(config, "vocoder_input_dim", "io_channels", "latent_ch", default=io_channels))
    dec_strides = [int(stride) for stride in _config_get(config, "dec_strides", default=[2, 4, 5, 6, 8])]
    self.hop_size = int(
        _config_get(config, "hop_size", default=math.prod(dec_strides) if dec_strides else hop_size)
    )
    dec_stride_product = math.prod(dec_strides)
    if dec_stride_product != self.hop_size:
        raise ValueError(
            "Cosmos3 AVAE config dec_strides product must equal hop_size "
            f"for correct latent/audio duration math: product={dec_stride_product}, hop_size={self.hop_size}."
        )

    normalization_type = str(_config_get(config, "normalization_type", default=normalization_type))
    normalize_latents = bool(_config_get(config, "normalize_latents", default=normalize_latents))
    if normalization_type == "none" and normalize_latents:
        normalization_type = "tanh"
    self.normalization_type = normalization_type
    self.tanh_input_scale = float(_config_get(config, "tanh_input_scale", default=tanh_input_scale))
    self.tanh_output_scale = float(_config_get(config, "tanh_output_scale", default=tanh_output_scale))
    self.tanh_clamp = float(_config_get(config, "tanh_clamp", default=tanh_clamp))

    self.decoder = OobleckDecoder(
        channels=int(_config_get(config, "dec_dim", default=320)),
        input_channels=self.latent_ch,
        audio_channels=self.audio_channels,
        # The decoder upsamples in the reverse of the configured stride order.
        upsampling_ratios=list(reversed(dec_strides)),
        channel_multiples=list(_config_get(config, "dec_c_mults", default=[1, 2, 4, 8, 16])),
    )
    state_dict = _load_checkpoint(checkpoint_path, self.device)
    _validate_diffusers_state_dict(state_dict)

    # The checkpoint also contains encoder weights, which we do not support
    # here, hence strict=False. That flag also hides *missing* keys, so check
    # those explicitly instead of decoding with randomly initialised weights.
    load_result = self.load_state_dict(state_dict, strict=False)
    if load_result.missing_keys:
        preview = ", ".join(load_result.missing_keys[:5])
        raise RuntimeError(
            f"Cosmos3 AVAE checkpoint {checkpoint_path} is missing "
            f"{len(load_result.missing_keys)} decoder weight(s), e.g. {preview}."
        )

    self.eval()
    for param in self.parameters():
        param.requires_grad = False
    self.to(device=self.device, dtype=self.dtype)
    if _is_rank_zero():
        logger.info("Loaded diffusers-format Cosmos3 AVAE checkpoint from %s", checkpoint_path)
-self.tanh_clamp, + self.tanh_clamp, + ) + return (torch.atanh(latent) * self.tanh_input_scale).to(in_dtype) + if self.normalization_type != "none": + raise ValueError(f"Unsupported AVAE normalization_type={self.normalization_type!r}.") + return latent + + @torch.no_grad() + def encode(self, audio: torch.Tensor, force_pad: bool = False) -> torch.Tensor: + del audio, force_pad + raise NotImplementedError("Cosmos3AVAEAudioTokenizer is decoder-only for diffusers-format sound_tokenizer/.") + + @torch.no_grad() + def decode(self, latent: torch.Tensor) -> torch.Tensor: + in_dtype = latent.dtype + squeeze = latent.ndim == 2 + if squeeze: + latent = latent.unsqueeze(0) + z = self._denormalize_latent(latent.to(self.device)).to(self.dtype) + audio = self.decoder(z).clamp(-1.0, 1.0).to(in_dtype) + return audio.squeeze(0) if squeeze else audio diff --git a/vllm_omni/diffusion/models/cosmos3/pipeline_cosmos3.py b/vllm_omni/diffusion/models/cosmos3/pipeline_cosmos3.py index 102b9216082..543add3ac46 100644 --- a/vllm_omni/diffusion/models/cosmos3/pipeline_cosmos3.py +++ b/vllm_omni/diffusion/models/cosmos3/pipeline_cosmos3.py @@ -15,6 +15,7 @@ from __future__ import annotations +import math import os import time from collections.abc import Iterable @@ -160,6 +161,28 @@ def get_cosmos3_post_process_func(od_config: OmniDiffusionConfig): video_processor = VideoProcessor(vae_scale_factor=16) + def _sampling_param(sampling_params, key: str, default=None): + extra = getattr(sampling_params, "extra_args", None) + if isinstance(extra, dict) and extra.get(key) is not None: + return extra[key] + value = getattr(sampling_params, key, None) + return default if value is None else value + + def _resolve_output_fps(sampling_params): + fps = ( + _sampling_param(sampling_params, "resolved_frame_rate") + or _sampling_param(sampling_params, "frame_rate") + or _sampling_param(sampling_params, "fps") + or 24.0 + ) + try: + fps_value = float(fps) + except (TypeError, ValueError): + fps_value = 24.0 + 
if fps_value <= 0: + fps_value = 24.0 + return int(fps_value) if fps_value.is_integer() else fps_value + def post_process_func( output: torch.Tensor | dict[str, torch.Tensor] | tuple, output_type: str = "np", @@ -168,6 +191,8 @@ def post_process_func( if output_type == "latent": return output + audio = None + audio_sample_rate = None if isinstance(output, dict): if "image" in output and "video" in output: raise ValueError("Cosmos3 output cannot contain both image and video payloads.") @@ -177,10 +202,23 @@ def post_process_func( video = output["video"] else: raise ValueError("Cosmos3 postprocess expected an 'image' or 'video' output payload.") + audio = output.get("audio") + audio_sample_rate = output.get("audio_sample_rate") + elif isinstance(output, tuple): + if len(output) == 3: + video, audio, audio_sample_rate = output + elif len(output) == 2: + video, audio = output + else: + raise ValueError( + "Cosmos3 postprocess expects output tensor, output dict, or (video, audio[, sample_rate]) tuple." 
+ ) else: video = output if isinstance(output, dict) and "image" in output: + if audio is not None: + raise ValueError("Cosmos3 text-to-image postprocess does not support audio output.") if video.ndim != 5 or video.shape[2] != 1: raise ValueError( "Cosmos3 text-to-image postprocess expects decoded output " @@ -194,7 +232,16 @@ def post_process_func( return video_processor.postprocess(image, output_type="pil") if is_guardrails_enabled(od_config, sampling_params): video = check_video_safety(video) - return video_processor.postprocess_video(video, output_type=output_type) + result = {"video": video_processor.postprocess_video(video, output_type=output_type)} + if audio is None: + return result + if isinstance(audio, torch.Tensor): + audio = audio.detach().cpu() + result["audio"] = audio + result["fps"] = _resolve_output_fps(sampling_params) + if audio_sample_rate is not None: + result["audio_sample_rate"] = int(audio_sample_rate) + return result return post_process_func @@ -317,6 +364,9 @@ def __init__( self._guidance_scale = None self._num_timesteps = None + self._sound_tokenizer = None + if getattr(self.transformer, "sound_gen", False): + self._get_sound_tokenizer() # Set True by ``enable_cache_for_cosmos3`` when cache-dit is enabled on # this pipeline. 
Tells the sequential-CFG loop to keep paired @@ -357,9 +407,13 @@ def _remap_ckpt_key(key: str) -> str | None: "proj_in.", "proj_out.", "time_embedder.", + "audio_proj_in.", + "audio_proj_out.", ) ): return f"transformer.{k}" + if k in ("audio_modality_embed", "audio_modality_embed.weight"): + return "transformer.audio_modality_embed" # Skip lm_head if k.startswith("lm_head."): @@ -453,12 +507,22 @@ def _remapped_weights() -> Iterable[tuple[str, torch.Tensor]]: loaded = loader.load_weights(_remapped_weights()) self.transformer.post_load_weights() self.transformer.eval() + if getattr(self.transformer, "sound_gen", False): + sound_markers = ("audio_proj_in.", "audio_proj_out.", "audio_modality_embed") + missing = [marker.rstrip(".") for marker in sound_markers if not any(marker in name for name in loaded)] + if missing: + raise ValueError( + "Cosmos3 transformer config enables sound generation, but " + f"the checkpoint is missing sound weights for {missing}. " + "Use a sound-capable transformer checkpoint." + ) return loaded def predict_noise(self, **kwargs) -> torch.Tensor | tuple[torch.Tensor, ...]: """Override CFGParallelMixin.predict_noise for Cosmos3. - The transformer returns the raw video noise prediction. + The transformer returns the raw prediction: video-only as a tensor, + or a tuple in video, sound order for sound generation. 
""" return self.transformer(**kwargs) @@ -509,6 +573,49 @@ def _get_sp_param(sp: OmniDiffusionSamplingParams, key: str, default: Any = None return val return default + @staticmethod + def _truthy(value) -> bool: + if isinstance(value, str): + return value.strip().lower() in {"1", "true", "yes", "on"} + return bool(value) + + @classmethod + def _get_prompt_param(cls, prompt_data, key: str, default=None): + if not isinstance(prompt_data, dict): + return default + if prompt_data.get(key) is not None: + return prompt_data[key] + additional = prompt_data.get("additional_information") + if isinstance(additional, dict) and additional.get(key) is not None: + return additional[key] + return default + + @classmethod + def _is_sound_request(cls, prompt_data, sp) -> bool: + keys = ( + "sound_gen", + "generate_sound", + "enable_sound_generation", + "return_audio", + "output_audio", + "generate_audio", + ) + for key in keys: + if cls._truthy(cls._get_prompt_param(prompt_data, key, None)): + return True + if cls._truthy(cls._get_sp_param(sp, key, None)): + return True + return False + + def _get_sound_tokenizer(self): + if not hasattr(self, "_sound_tokenizer"): + self._sound_tokenizer = None + if self._sound_tokenizer is None: + from .sound_tokenizer import Cosmos3SoundTokenizer + + self._sound_tokenizer = Cosmos3SoundTokenizer.from_config(self.od_config) + return self._sound_tokenizer + @staticmethod def _is_t2i_request(req: OmniDiffusionRequest) -> bool: """Detect text-to-image mode from request-level prompt modalities.""" @@ -721,6 +828,47 @@ def _prepare_latents( ) return randn_tensor(shape, generator=generator, device=self.device, dtype=self.dtype) + def _prepare_sound_latents( + self, + target_audio_samples: int, + generator: torch.Generator, + ) -> tuple[torch.Tensor, int]: + sound_tokenizer = self._get_sound_tokenizer() + hop_size = int( + getattr(sound_tokenizer, "hop_size", None) or getattr(sound_tokenizer, "temporal_compression_factor") + ) + latent_frames = max(1, 
math.ceil(max(1, int(target_audio_samples)) / hop_size)) + sound_dim = int(getattr(sound_tokenizer, "latent_ch", 64)) + transformer_sound_dim = int(getattr(self.transformer, "sound_dim", sound_dim)) + if sound_dim != transformer_sound_dim: + raise ValueError( + "Cosmos3 sound tokenizer latent channels do not match transformer " + f"sound_dim: tokenizer={sound_dim}, transformer={transformer_sound_dim}." + ) + latents = randn_tensor( + (1, sound_dim, latent_frames), + generator=generator, + device=self.device, + dtype=self.dtype, + ) + return latents, latent_frames + + def _resolve_sound_target_samples( + self, + sp, + num_frames: int, + frame_rate: float, + ) -> tuple[int, float, int]: + sound_tokenizer = self._get_sound_tokenizer() + duration = self._get_sp_param(sp, "sound_duration", None) + if duration is None: + duration = self._get_sp_param(sp, "audio_duration", None) + if duration is None: + duration = num_frames / frame_rate + duration = max(float(duration), 1.0 / max(float(frame_rate), 1.0)) + sample_rate = int(getattr(sound_tokenizer, "sample_rate", 48000)) + return max(1, int(round(duration * sample_rate))), duration, sample_rate + # -- VAE decode ---------------------------------------------------------- def _decode_latents(self, latents: torch.Tensor) -> torch.Tensor: @@ -742,6 +890,19 @@ def _decode_latents(self, latents: torch.Tensor) -> torch.Tensor: video = self.vae.decode(latents, return_dict=False)[0] return video + def _decode_sound_latents( + self, + sound_latents: torch.Tensor, + target_audio_samples: int, + ) -> torch.Tensor: + sound_tokenizer = self._get_sound_tokenizer() + audio = sound_tokenizer.decode(sound_latents.to(self.dtype)) + if audio.shape[-1] > target_audio_samples: + audio = audio[..., :target_audio_samples] + elif audio.shape[-1] < target_audio_samples: + audio = torch.nn.functional.pad(audio, (0, target_audio_samples - audio.shape[-1])) + return audio.detach().cpu() + # -- Prompt formatting + tokenization (shared by T2V and I2V) 
------------ def _format_and_tokenize_prompts( @@ -903,11 +1064,12 @@ def diffuse( guidance_scale: float, shared_kwargs: dict, *, + sound_latents: torch.Tensor | None = None, velocity_mask: torch.Tensor | None = None, image_latent: torch.Tensor | None = None, condition_latents: torch.Tensor | None = None, guidance_interval: tuple[float, float] | None = None, - ) -> torch.Tensor: + ) -> torch.Tensor | tuple[torch.Tensor, ...]: """Denoising loop with 3-mode CFG support (parallel, sequential, none). Cosmos3's UND pathway is text-dependent, so CFG needs separate K/V @@ -946,21 +1108,82 @@ def _cfg_active_at(t: torch.Tensor) -> bool: lo, hi = guidance_interval return lo <= t_scalar <= hi + def _pack_joint( + video_tensor: torch.Tensor, + sound_tensor: torch.Tensor | None = None, + ): + batch = video_tensor.shape[0] + tensors = [video_tensor] + if sound_tensor is not None: + tensors.append(sound_tensor) + flats = [tensor.reshape(batch, -1) for tensor in tensors] + return torch.cat(flats, dim=1), [tensor.shape for tensor in tensors], [flat.shape[1] for flat in flats] + + def _unpack_joint( + packed: torch.Tensor, + shapes: list[torch.Size], + numels: list[int], + ) -> tuple[torch.Tensor, ...]: + outputs = [] + offset = 0 + for shape, numel in zip(shapes, numels, strict=True): + outputs.append(packed[:, offset : offset + numel].reshape(shape)) + offset += numel + return tuple(outputs) + + def _split_noise_pred( + noise_pred: torch.Tensor | tuple[torch.Tensor, ...], + ) -> tuple[torch.Tensor, torch.Tensor | None]: + has_sound = sound_latents is not None + if not has_sound: + if isinstance(noise_pred, tuple): + raise ValueError("Cosmos3 video-only diffusion received tuple predictions.") + return noise_pred, None + if not isinstance(noise_pred, tuple): + raise ValueError("Cosmos3 multimodal diffusion expects transformer predictions as a tuple.") + if len(noise_pred) != 2: + raise ValueError(f"Cosmos3 sound diffusion expected 2 predictions, got {len(noise_pred)}.") + return 
noise_pred[0], noise_pred[1] + def _step( - noise_pred: torch.Tensor, + noise_pred: torch.Tensor | tuple[torch.Tensor, ...], t: torch.Tensor, latents: torch.Tensor, - ) -> torch.Tensor: - if isinstance(noise_pred, tuple): - raise ValueError("Cosmos3 noise prediction must be a single tensor; got a tuple.") + sound_latents: torch.Tensor | None, + ) -> torch.Tensor | tuple[torch.Tensor, ...]: + video_pred, sound_pred = _split_noise_pred(noise_pred) if velocity_mask is not None: - noise_pred = noise_pred * velocity_mask - latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0] + video_pred = video_pred * velocity_mask + if sound_latents is None: + latents = self.scheduler.step(video_pred, t, latents, return_dict=False)[0] + else: + packed_noise, shapes, numels = _pack_joint(video_pred, sound_pred) + packed_latents, _, _ = _pack_joint(latents, sound_latents) + packed_next = self.scheduler.step(packed_noise, t, packed_latents, return_dict=False)[0] + unpacked = _unpack_joint(packed_next, shapes, numels) + latents = unpacked[0] + if sound_latents is not None: + sound_latents = unpacked[1] if condition_latents is not None and velocity_mask is not None: latents = velocity_mask * latents + (1.0 - velocity_mask) * condition_latents elif image_latent is not None: latents[:, :, 0:1, :, :] = image_latent - return latents + outputs = [latents] + if sound_latents is not None: + outputs.append(sound_latents) + return outputs[0] if len(outputs) == 1 else tuple(outputs) + + def _assign_step_out(step_out: torch.Tensor | tuple[torch.Tensor, ...]) -> None: + nonlocal latents, sound_latents + if sound_latents is None: + assert isinstance(step_out, torch.Tensor) + latents = step_out + return + if not isinstance(step_out, tuple): + raise ValueError("Cosmos3 multimodal diffusion step returned a non-tuple result.") + latents = step_out[0] + if sound_latents is not None: + sound_latents = step_out[1] if cfg_parallel: for t in self.progress_bar(timesteps): @@ -978,6 +1201,7 
@@ def _step( timestep=timestep, text_ids=cond_ids, text_mask=cond_mask, + sound_latents=sound_latents, **shared_kwargs, ), negative_kwargs=dict( @@ -985,11 +1209,12 @@ def _step( timestep=timestep, text_ids=uncond_ids, text_mask=uncond_mask, + sound_latents=sound_latents, **shared_kwargs, ), cfg_normalize=False, ) - latents = _step(noise_pred, t, latents) + _assign_step_out(_step(noise_pred, t, latents, sound_latents)) elif do_cfg: cond_cache: tuple = (None, None) @@ -1007,6 +1232,7 @@ def _step( timestep=timestep, text_ids=cond_ids, text_mask=cond_mask, + sound_latents=sound_latents, **shared_kwargs, ) if cond_cache[0] is None: @@ -1019,6 +1245,7 @@ def _step( timestep=timestep, text_ids=uncond_ids, text_mask=uncond_mask, + sound_latents=sound_latents, **shared_kwargs, ) if uncond_cache[0] is None: @@ -1031,7 +1258,7 @@ def _step( else: noise_pred = noise_cond - latents = _step(noise_pred, t, latents) + _assign_step_out(_step(noise_pred, t, latents, sound_latents)) else: for t in self.progress_bar(timesteps): @@ -1041,11 +1268,15 @@ def _step( timestep=timestep, text_ids=cond_ids, text_mask=cond_mask, + sound_latents=sound_latents, **shared_kwargs, ) - latents = _step(noise_pred, t, latents) + _assign_step_out(_step(noise_pred, t, latents, sound_latents)) - return latents + outputs = [latents] + if sound_latents is not None: + outputs.append(sound_latents) + return outputs[0] if len(outputs) == 1 else tuple(outputs) # -- Forward (main generation entry point) ------------------------------- @@ -1072,6 +1303,18 @@ def forward( sp = req.sampling_params is_t2i = self._is_t2i_request(req) + sound_enabled = self._is_sound_request(prompt_data, sp) + if sound_enabled and is_t2i: + raise ValueError( + "Cosmos3 sound generation is supported only for video outputs in " + "this phase; text-to-image with sound is unsupported." 
+ ) + if sound_enabled and not getattr(self.transformer, "sound_gen", False): + raise ValueError( + "Cosmos3 sound generation was requested, but the transformer was " + "initialized without sound modules. Check that the checkpoint config " + "enables sound_gen or defines sound_dim and includes sound weights." + ) if negative_prompt is None: negative_prompt = "" @@ -1163,6 +1406,13 @@ def forward( image_latent = None condition_latents = None + sound_latents = None + target_audio_samples = None + sound_sample_rate = None + if sound_enabled: + target_audio_samples, _, sound_sample_rate = self._resolve_sound_target_samples(sp, num_frames, frame_rate) + sound_latents, _ = self._prepare_sound_latents(target_audio_samples, generator) + T_latent = latents.shape[2] H_latent = latents.shape[3] W_latent = latents.shape[4] @@ -1184,6 +1434,7 @@ def _run_diffusion(start_latents): uncond_mask=uncond_mask, guidance_scale=guidance_scale, shared_kwargs=shared_kwargs, + sound_latents=sound_latents, velocity_mask=velocity_mask, image_latent=image_latent, condition_latents=condition_latents, @@ -1204,7 +1455,11 @@ def _run_diffusion(start_latents): samples.append(_run_diffusion(next_latents)) latents = torch.cat(samples, dim=0) else: - latents = _run_diffusion(latents) + diffusion_output = _run_diffusion(latents) + if sound_enabled: + latents, sound_latents = diffusion_output + else: + latents = diffusion_output # --- Decode --- if _is_rank_zero(): @@ -1215,4 +1470,12 @@ def _run_diffusion(start_latents): logger.info("Video decoded in %.2fs", time.time() - decode_start) logger.info("Total pipeline time: %.2fs", time.time() - pipeline_start) + if sound_enabled: + if sound_latents is None or target_audio_samples is None or sound_sample_rate is None: + raise ValueError("Cosmos3 sound generation finished without sound latents.") + if _is_rank_zero(): + logger.info("Decoding sound...") + audio = self._decode_sound_latents(sound_latents, target_audio_samples) + return 
DiffusionOutput(output={"video": video, "audio": audio, "audio_sample_rate": sound_sample_rate}) + return DiffusionOutput(output={"image": video} if is_t2i else {"video": video}) diff --git a/vllm_omni/diffusion/models/cosmos3/sound_tokenizer.py b/vllm_omni/diffusion/models/cosmos3/sound_tokenizer.py new file mode 100644 index 00000000000..281b7e1d9f0 --- /dev/null +++ b/vllm_omni/diffusion/models/cosmos3/sound_tokenizer.py @@ -0,0 +1,537 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Cosmos3 sound tokenizer integration.""" + +from __future__ import annotations + +import json +import os +from pathlib import Path +from typing import Any + +import torch +from vllm.logger import init_logger + +from vllm_omni.diffusion.data import OmniDiffusionConfig +from vllm_omni.diffusion.distributed.utils import get_local_device +from vllm_omni.diffusion.models.progress_bar import _is_rank_zero + +from .audio_tokenizer import Cosmos3AVAEAudioTokenizer + +logger = init_logger(__name__) + +DEFAULT_SOUND_SAMPLE_RATE = 48000 +DEFAULT_SOUND_CHANNELS = 2 +DEFAULT_SOUND_DIM = 64 +DEFAULT_SOUND_HOP_SIZE = 1920 +DEFAULT_SOUND_LATENT_FPS = DEFAULT_SOUND_SAMPLE_RATE / DEFAULT_SOUND_HOP_SIZE +DEFAULT_SOUND_NORMALIZE_LATENTS = False +DEFAULT_SOUND_NORMALIZATION_TYPE = "none" +DEFAULT_SOUND_TANH_INPUT_SCALE = 1.5 +DEFAULT_SOUND_TANH_OUTPUT_SCALE = 3.5 +DEFAULT_SOUND_TANH_CLAMP = 0.995 +SOUND_TOKENIZER_COMPONENT_NAME = "sound_tokenizer" +SOUND_TOKENIZER_CHECKPOINT_NAME = "diffusion_pytorch_model.safetensors" + + +def _pipeline_args(od_config: OmniDiffusionConfig) -> dict[str, Any]: + return dict(getattr(od_config, "custom_pipeline_args", None) or {}) + + +def _config_get(config: Any, key: str, default: Any = None) -> Any: + if config is None: + return default + if isinstance(config, dict): + return config.get(key, default) + if hasattr(config, "get"): + value = config.get(key, None) + return default if value is None else value 
+ return getattr(config, key, default) + + +def _config_path_get(config: Any, *keys: str) -> Any: + value = config + for key in keys: + value = _config_get(value, key, None) + if value is None: + return None + return value + + +def _sound_tokenizer_config_from(config: Any) -> Any: + """Return nested ``sound_tokenizer`` config from Cosmos3 config shapes.""" + for path in ( + ("sound_tokenizer",), + ("model", "config", "sound_tokenizer"), + ("config", "sound_tokenizer"), + ("model_config", "sound_tokenizer"), + ): + value = _config_path_get(config, *path) + if value is not None: + return value + return None + + +def _nested_sound_tokenizer_configs(od_config: OmniDiffusionConfig | None) -> tuple[Any, ...]: + if od_config is None: + return () + configs = [] + for source in ( + getattr(od_config, "model_config", None), + getattr(od_config, "tf_model_config", None), + ): + config = _sound_tokenizer_config_from(source) + if config is not None: + configs.append(config) + return tuple(configs) + + +def _first_value_from_configs(configs: tuple[Any, ...], keys: tuple[str, ...]) -> Any: + for config in configs: + for key in keys: + value = _config_get(config, key, None) + if value is not None: + return value + return None + + +def _top_level_model_value(od_config: OmniDiffusionConfig | None, keys: tuple[str, ...]) -> Any: + if od_config is None: + return None + for source in ( + getattr(od_config, "model_config", None), + getattr(od_config, "tf_model_config", None), + ): + for key in keys: + for path in ((key,), ("model", "config", key), ("config", key), ("model_config", key)): + value = _config_path_get(source, *path) + if value is not None: + return value + return None + + +def _custom_arg_value(args: dict[str, Any], keys: tuple[str, ...]) -> Any: + for key in keys: + value = args.get(key) + if value is not None: + return value + return None + + +def _as_bool(value: Any) -> bool: + if isinstance(value, str): + return value.strip().lower() in {"1", "true", "yes", "on"} + 
return bool(value) + + +def _as_audio_channels(value: Any) -> int: + if isinstance(value, bool): + return 2 if value else 1 + if isinstance(value, str) and value.strip().lower() in { + "1", + "0", + "true", + "false", + "yes", + "no", + "on", + "off", + }: + return 2 if _as_bool(value) else 1 + return int(value) + + +def _resolve_model_file(path: Any, model_root: str | None) -> str | None: + if not path: + return None + path = str(path) + if "://" in path or os.path.isabs(path) or os.path.exists(path) or not model_root: + return path + return str(Path(model_root) / path) + + +def _load_sound_tokenizer_component_config(config_path: str | None) -> dict[str, Any]: + if not config_path: + return {} + with open(config_path, encoding="utf-8") as f: + config = json.load(f) + if not isinstance(config, dict): + raise TypeError(f"Cosmos3 sound tokenizer config must be a JSON object, got {type(config)!r}.") + return config + + +def _component_audio_channels(config: dict[str, Any]) -> Any: + if config.get("dec_out_channels") is not None: + return config["dec_out_channels"] + if config.get("audio_channels") is not None: + return config["audio_channels"] + if config.get("stereo") is not None: + return 2 if _as_bool(config["stereo"]) else 1 + return None + + +def _component_arch_values(config: dict[str, Any]) -> dict[str, Any]: + values = { + "sample_rate": config.get("sampling_rate", config.get("sample_rate")), + "audio_channels": _component_audio_channels(config), + "io_channels": config.get("vocoder_input_dim", config.get("io_channels", config.get("latent_ch"))), + "hop_size": config.get("hop_size"), + } + return {key: value for key, value in values.items() if value is not None} + + +def _resolve_arch_value( + od_config: OmniDiffusionConfig, + args: dict[str, Any], + component_values: dict[str, Any], + *, + field: str, + custom_keys: tuple[str, ...], + nested_keys: tuple[str, ...], + top_level_keys: tuple[str, ...], + default: Any, + cast, +) -> Any: + custom_value = 
def get_sound_config_value(
    od_config: OmniDiffusionConfig,
    name: str,
    default: Any,
    aliases: tuple[str, ...] = (),
) -> Any:
    """Generic, backward-compatible lookup of a sound setting.

    Sources are searched in this order: the custom pipeline args, then
    ``od_config.model_config``, then ``od_config.tf_model_config``. Within each
    source ``name`` is tried before ``aliases``. The first non-``None`` value
    wins. Prefer the field-specific helpers in this module where precedence
    should stay explicit.

    Returns:
        The first non-``None`` value found, otherwise ``default``.
    """
    lookup_names = (name, *aliases)
    sources = (
        # Same copy of the custom pipeline args that _pipeline_args() builds.
        dict(getattr(od_config, "custom_pipeline_args", None) or {}),
        getattr(od_config, "model_config", None),
        getattr(od_config, "tf_model_config", None),
    )
    for source in sources:
        if source is None:
            continue
        mapping_like = hasattr(source, "get")
        for lookup in lookup_names:
            found = source.get(lookup, None) if mapping_like else getattr(source, lookup, None)
            if found is not None:
                return found
    return default
od_config, + args, + {}, + field="hop_size", + custom_keys=("sound_hop_size", "hop_size"), + nested_keys=("hop_size",), + top_level_keys=("sound_hop_size", "hop_size"), + default=DEFAULT_SOUND_HOP_SIZE, + cast=int, + ) + + +def get_sound_latent_fps(od_config: OmniDiffusionConfig | None) -> float: + if od_config is None: + return DEFAULT_SOUND_LATENT_FPS + args = _pipeline_args(od_config) + custom_value = _custom_arg_value(args, ("sound_latent_fps",)) + if custom_value is not None: + return float(custom_value) + top_value = _top_level_model_value(od_config, ("sound_latent_fps",)) + if top_value is not None: + return float(top_value) + nested_configs = _nested_sound_tokenizer_configs(od_config) + nested_fps = _first_value_from_configs(nested_configs, ("sound_latent_fps", "latent_fps")) + if nested_fps is not None: + return float(nested_fps) + sample_rate = _first_value_from_configs(nested_configs, ("sample_rate", "sampling_rate")) + hop_size = _first_value_from_configs(nested_configs, ("hop_size",)) + if sample_rate is not None and hop_size is not None: + return float(sample_rate) / float(hop_size) + return float(DEFAULT_SOUND_LATENT_FPS) + + +class Cosmos3SoundTokenizer: + """Thin adapter around the local AVAE tokenizer implementation.""" + + def __init__(self, tokenizer: Any) -> None: + self.tokenizer = tokenizer + self.sample_rate = int(getattr(tokenizer, "sample_rate", DEFAULT_SOUND_SAMPLE_RATE)) + self.audio_channels = int(getattr(tokenizer, "audio_channels", DEFAULT_SOUND_CHANNELS)) + self.latent_ch = int(getattr(tokenizer, "latent_ch", DEFAULT_SOUND_DIM)) + self.hop_size = int(getattr(tokenizer, "temporal_compression_factor", DEFAULT_SOUND_HOP_SIZE)) + + @classmethod + def from_config(cls, od_config: OmniDiffusionConfig) -> Cosmos3SoundTokenizer: + args = _pipeline_args(od_config) + model_path = getattr(od_config, "model", None) + explicit_avae_path = ( + args.get("sound_tokenizer_path") + or args.get("avae_path") + or args.get("cosmos3_avae_path") + or 
os.environ.get("COSMOS3_SOUND_TOKENIZER_PATH") + ) + explicit_config_path = args.get("sound_tokenizer_config_path") or os.environ.get( + "COSMOS3_SOUND_TOKENIZER_CONFIG_PATH" + ) + + model_root = str(model_path) if model_path and os.path.isdir(model_path) else None + if model_root is None and model_path and not explicit_avae_path: + from huggingface_hub import snapshot_download + + model_root = snapshot_download( + repo_id=str(model_path), + revision=getattr(od_config, "revision", None), + allow_patterns=[ + f"{SOUND_TOKENIZER_COMPONENT_NAME}/config.json", + f"{SOUND_TOKENIZER_COMPONENT_NAME}/{SOUND_TOKENIZER_CHECKPOINT_NAME}", + ], + ) + + if explicit_avae_path: + avae_path = _resolve_model_file(explicit_avae_path, model_root) + else: + tokenizer_dir = Path(model_root) / SOUND_TOKENIZER_COMPONENT_NAME if model_root else None + candidate = tokenizer_dir / SOUND_TOKENIZER_CHECKPOINT_NAME if tokenizer_dir else None + avae_path = str(candidate) if candidate and candidate.exists() else None + + if not avae_path: + raise ValueError( + "Cosmos3 sound generation was requested, but no AVAE sound " + "tokenizer checkpoint was provided. Set " + "custom_pipeline_args['sound_tokenizer_path'] or " + "COSMOS3_SOUND_TOKENIZER_PATH, or include " + f"{SOUND_TOKENIZER_COMPONENT_NAME}/{SOUND_TOKENIZER_CHECKPOINT_NAME} under the model path." 
+ ) + + config_path = _resolve_model_file(explicit_config_path, model_root) + if config_path is None and model_root: + candidate = Path(model_root) / SOUND_TOKENIZER_COMPONENT_NAME / "config.json" + config_path = str(candidate) if candidate.exists() else None + component_config = _load_sound_tokenizer_component_config(config_path) + component_values = _component_arch_values(component_config) + + sample_rate = _resolve_arch_value( + od_config, + args, + component_values, + field="sample_rate", + custom_keys=("sound_sample_rate", "sample_rate"), + nested_keys=("sample_rate", "sampling_rate"), + top_level_keys=("sound_sample_rate", "sample_rate"), + default=DEFAULT_SOUND_SAMPLE_RATE, + cast=int, + ) + audio_channels = _resolve_arch_value( + od_config, + args, + component_values, + field="audio_channels", + custom_keys=("sound_audio_channels", "audio_channels", "stereo"), + nested_keys=("audio_channels", "dec_out_channels", "stereo"), + top_level_keys=("sound_audio_channels", "audio_channels", "stereo"), + default=DEFAULT_SOUND_CHANNELS, + cast=_as_audio_channels, + ) + sound_dim = _resolve_arch_value( + od_config, + args, + component_values, + field="io_channels", + custom_keys=("sound_dim", "io_channels", "latent_ch"), + nested_keys=("io_channels", "vocoder_input_dim", "latent_ch"), + top_level_keys=("sound_dim",), + default=DEFAULT_SOUND_DIM, + cast=int, + ) + hop_size = _resolve_arch_value( + od_config, + args, + component_values, + field="hop_size", + custom_keys=("sound_hop_size", "hop_size"), + nested_keys=("hop_size",), + top_level_keys=("sound_hop_size", "hop_size"), + default=DEFAULT_SOUND_HOP_SIZE, + cast=int, + ) + normalize_latents = _as_bool( + _resolve_normalization_value( + od_config, + args, + name="normalize_latents", + default=DEFAULT_SOUND_NORMALIZE_LATENTS, + ) + ) + normalization_type = str( + _resolve_normalization_value( + od_config, + args, + name="normalization_type", + default=DEFAULT_SOUND_NORMALIZATION_TYPE, + ) + ) + tanh_input_scale = 
float( + _resolve_normalization_value( + od_config, + args, + name="tanh_input_scale", + default=DEFAULT_SOUND_TANH_INPUT_SCALE, + ) + ) + tanh_output_scale = float( + _resolve_normalization_value( + od_config, + args, + name="tanh_output_scale", + default=DEFAULT_SOUND_TANH_OUTPUT_SCALE, + ) + ) + tanh_clamp = float( + _resolve_normalization_value( + od_config, + args, + name="tanh_clamp", + default=DEFAULT_SOUND_TANH_CLAMP, + ) + ) + tokenizer = Cosmos3AVAEAudioTokenizer( + checkpoint_path=str(avae_path), + config_path=config_path, + sample_rate=sample_rate, + audio_channels=audio_channels, + io_channels=sound_dim, + hop_size=hop_size, + normalize_latents=normalize_latents, + normalization_type=normalization_type, + tanh_input_scale=tanh_input_scale, + tanh_output_scale=tanh_output_scale, + tanh_clamp=tanh_clamp, + dtype=getattr(od_config, "dtype", torch.bfloat16), + device=get_local_device(), + ) + if _is_rank_zero(): + logger.info( + "Loaded Cosmos3 AVAE sound tokenizer from %s (sr=%d, channels=%d, latent_ch=%d, hop=%d)", + avae_path, + sample_rate, + audio_channels, + sound_dim, + hop_size, + ) + return cls(tokenizer) + + def get_latent_num_samples(self, num_audio_samples: int) -> int: + return int(self.tokenizer.get_latent_num_samples(num_audio_samples)) + + def get_audio_num_samples(self, num_latent_samples: int) -> int: + return int(self.tokenizer.get_audio_num_samples(num_latent_samples)) + + @torch.no_grad() + def decode(self, latents: torch.Tensor) -> torch.Tensor: + """Decode sound latents. + + Args: + latents: ``[B, C, T]`` or ``[C, T]`` tensor. + + Returns: + ``[B, audio_channels, N]`` tensor for batched input, or + ``[audio_channels, N]`` for unbatched input. 
+ """ + squeeze = latents.ndim == 2 + if squeeze: + latents = latents.unsqueeze(0) + audio = self.tokenizer.decode(latents) + audio = audio.clamp(-1.0, 1.0) + return audio.squeeze(0) if squeeze else audio diff --git a/vllm_omni/diffusion/models/cosmos3/transformer_cosmos3.py b/vllm_omni/diffusion/models/cosmos3/transformer_cosmos3.py index 7b3848a089a..49af3fde3d0 100644 --- a/vllm_omni/diffusion/models/cosmos3/transformer_cosmos3.py +++ b/vllm_omni/diffusion/models/cosmos3/transformer_cosmos3.py @@ -76,6 +76,51 @@ def _tf_config_get(config: Any, key: str, default: Any) -> Any: return getattr(config, key, default) +def _nested_get(value: Any, key: str) -> Any: + if isinstance(value, dict): + if key in value: + return value[key] + for child in value.values(): + found = _nested_get(child, key) + if found is not None: + return found + elif isinstance(value, list | tuple): + for child in value: + found = _nested_get(child, key) + if found is not None: + return found + return None + + +def _od_config_get(od_config: Any, key: str, default: Any = None) -> Any: + """Read Cosmos3 options from runtime, model, or transformer config.""" + if od_config is None: + return default + for attr in ("custom_pipeline_args", "model_config"): + source = getattr(od_config, attr, None) or {} + if isinstance(source, dict): + if key in source: + return source[key] + found = _nested_get(source, key) + if found is not None: + return found + tf_model_config = getattr(od_config, "tf_model_config", None) + if isinstance(tf_model_config, dict): + if key in tf_model_config: + return tf_model_config[key] + found = _nested_get(tf_model_config, key) + if found is not None: + return found + value = _tf_config_get(tf_model_config, key, None) + return default if value is None else value + + +def _as_bool(value: Any) -> bool: + if isinstance(value, str): + return value.strip().lower() in {"1", "true", "yes", "on"} + return bool(value) + + # 
--------------------------------------------------------------------------- # Rotary Position Embeddings (mRoPE) # --------------------------------------------------------------------------- @@ -138,6 +183,30 @@ def compute_mrope_position_ids_vision( return mrope_ids, next_offset +def compute_mrope_position_ids_sound( + grid_t: int, + temporal_offset: int | float, + sound_latent_fps: float, + base_fps: float = 24.0, + temporal_compression_factor_sound: int = 1, + enable_fps_modulation: bool = True, + base_temporal_compression_factor: int | None = None, +) -> tuple[torch.Tensor, int | float]: + """Generate mRoPE IDs for sound tokens as a (T, 1, 1) grid.""" + del base_temporal_compression_factor + return compute_mrope_position_ids_vision( + grid_t=grid_t, + grid_h=1, + grid_w=1, + temporal_offset=temporal_offset, + fps=sound_latent_fps, + base_fps=base_fps, + temporal_compression_factor=temporal_compression_factor_sound, + base_temporal_compression_factor=temporal_compression_factor_sound, + enable_fps_modulation=enable_fps_modulation, + ) + + class Qwen3VLTextRotaryEmbedding(nn.Module): """Multi-dimensional rotary position embedding for Qwen3-VL.""" @@ -859,9 +928,25 @@ def __init__( self.latent_channel_size = int(_tf_config_get(model_config, "latent_channel", 48)) self.timestep_scale = float(_tf_config_get(model_config, "timestep_scale", 0.001)) self.base_fps = float(_tf_config_get(model_config, "base_fps", 24.0)) + sound_gen_value = _od_config_get(od_config, "sound_gen", None) + sound_dim_value = _od_config_get(od_config, "sound_dim", None) + if sound_dim_value is None: + sound_dim_value = _od_config_get(od_config, "io_channels", None) + if sound_dim_value is None: + sound_dim_value = _od_config_get(od_config, "vocoder_input_dim", None) + if sound_dim_value is None: + sound_dim_value = _od_config_get(od_config, "latent_ch", None) + self.sound_gen = _as_bool(sound_gen_value) if sound_gen_value is not None else sound_dim_value is not None + from .sound_tokenizer 
import get_sound_dim, get_sound_latent_fps + + self.sound_dim = int(sound_dim_value if sound_dim_value is not None else get_sound_dim(od_config)) + self.sound_latent_fps = float(get_sound_latent_fps(od_config)) if temporal_compression_factor is None: temporal_compression_factor = _tf_config_get(model_config, "temporal_compression_factor", 4) self.temporal_compression_factor = int(temporal_compression_factor) + self.temporal_compression_factor_sound = int( + _tf_config_get(model_config, "temporal_compression_factor_sound", 1) + ) self.enable_fps_modulation = bool(_tf_config_get(model_config, "enable_fps_modulation", True)) self.temporal_modality_margin = int( _tf_config_get( @@ -894,6 +979,12 @@ def __init__( self.proj_in = nn.Linear(self.patch_latent_dim, self.hidden_size) self.proj_out = nn.Linear(self.hidden_size, self.patch_latent_dim) self.time_embedder = TimestepEmbedder(self.hidden_size, target_dtype=dtype) + if self.sound_gen: + self.audio_proj_in = nn.Linear(self.sound_dim, self.hidden_size) + self.audio_proj_out = nn.Linear(self.hidden_size, self.sound_dim) + self.audio_modality_embed = nn.Parameter(torch.zeros(self.hidden_size)) + + self.time_embedder = TimestepEmbedder(self.hidden_size, target_dtype=torch.bfloat16) self.gen_layers = nn.ModuleList( [ @@ -962,6 +1053,21 @@ def unpatchify(self, tokens: torch.Tensor, t: int, h: int, w: int) -> torch.Tens x = x[:, :, :, :h, :w] return x + def pack_sound(self, sound_latents: torch.Tensor) -> torch.Tensor: + """[B, C_sound, T_sound] -> [B, T_sound, C_sound].""" + if sound_latents.ndim != 3: + raise ValueError(f"Cosmos3 sound latents must have shape [B, C, T], got {tuple(sound_latents.shape)}.") + if sound_latents.shape[1] != self.sound_dim: + raise ValueError( + f"Cosmos3 sound latent channel mismatch: expected {self.sound_dim}, got {sound_latents.shape[1]}." 
+ ) + return sound_latents.permute(0, 2, 1).contiguous() + + @staticmethod + def unpack_sound(tokens: torch.Tensor) -> torch.Tensor: + """[B, T_sound, C_sound] -> [B, C_sound, T_sound].""" + return tokens.permute(0, 2, 1).contiguous() + # -- RoPE computation ---------------------------------------------------- def _compute_rope_freqs( @@ -973,12 +1079,14 @@ def _compute_rope_freqs( fps: float | None, device: torch.device, dtype: torch.dtype, + t_sound: int | None = None, ) -> tuple[tuple[torch.Tensor, torch.Tensor], tuple[torch.Tensor, torch.Tensor]]: """Compute mRoPE cos/sin for UND text and GEN media pathways.""" B = text_mask.shape[0] S_text = text_mask.shape[1] text_lengths = text_mask.sum(dim=1).long() effective_fps = fps if fps is not None and t > 1 else None + sound_frames = int(t_sound or 0) text_pos_list = [] gen_pos_list = [] @@ -996,6 +1104,21 @@ def _compute_rope_freqs( temporal_compression_factor=self.temporal_compression_factor, enable_fps_modulation=self.enable_fps_modulation, ) + gen_positions = [v_pos] + if sound_frames > 0: + s_pos, _ = compute_mrope_position_ids_sound( + sound_frames, + temporal_offset=media_temporal_offset, + sound_latent_fps=self.sound_latent_fps, + base_fps=self.base_fps, + temporal_compression_factor_sound=getattr(self, "temporal_compression_factor_sound", 1), + enable_fps_modulation=self.enable_fps_modulation, + ) + gen_positions.append(s_pos) + pos_dtype = gen_positions[0].dtype + for pos in gen_positions[1:]: + pos_dtype = torch.promote_types(pos_dtype, pos.dtype) + v_pos = torch.cat([pos.to(pos_dtype) for pos in gen_positions], dim=1) if real_len < S_text: t_pos = torch.cat( [t_pos, torch.zeros(3, S_text - real_len, dtype=t_pos.dtype)], @@ -1026,16 +1149,31 @@ def reset_cache(self) -> None: def _validate_gen_sequence_parallel( *, s_gen: int, + s_video: int, + s_sound: int, + has_sound: bool, ulysses_size: int, ) -> None: if ulysses_size <= 1 or s_gen % ulysses_size == 0: return + detail_parts = [f"video tokens {s_video}"] 
+ if has_sound: + detail_parts.append(f"sound tokens {s_sound}") + detail = " = " + " + ".join(detail_parts) if len(detail_parts) > 1 else "" adjust_detail = ( - "Adjust the spatial resolution so that t * ceil(h/patch) * ceil(w/patch) is a multiple of ulysses_degree." + "Adjust the spatial resolution, frame count, sound duration, " + "or sound latent FPS so the combined media sequence is a " + "multiple of ulysses_degree." + if has_sound + else ( + "Adjust the spatial resolution so that " + "t * ceil(h/patch) * ceil(w/patch) is a multiple " + "of ulysses_degree." + ) ) raise ValueError( - f"GEN sequence length ({s_gen} video tokens) must be divisible by " + f"GEN sequence length ({s_gen}{detail}) must be divisible by " f"ulysses_degree ({ulysses_size}). {adjust_detail}" ) @@ -1049,9 +1187,10 @@ def forward( text_mask: torch.Tensor, video_shape: tuple[int, int, int], fps: float | None = None, + sound_latents: torch.Tensor | None = None, noisy_frame_mask: torch.Tensor | None = None, **kwargs, - ) -> torch.Tensor: + ) -> torch.Tensor | tuple[torch.Tensor, ...]: """ Args: hidden_states: [B, C, t, h, w] noisy latents @@ -1060,13 +1199,15 @@ def forward( text_mask: [B, S_text] attention mask (1=real, 0=pad) video_shape: (t, h, w) in latent space fps: video frame rate for temporal mRoPE modulation + sound_latents: Optional [B, C_sound, T_sound] noisy sound latents. noisy_frame_mask: Optional [B, 1, t, 1, 1] mask where 1=noisy (add timestep embedding, predict velocity) and 0=conditioned (clean context, skip timestep embedding). None means all frames noisy (T2V mode). Returns: - [B, C, t, h, w] velocity prediction. + [B, C, t, h, w] velocity prediction, or + tuple outputs in video, sound order when sound latents are provided. """ t, h, w = video_shape hp, wp, _, _ = self._pad_to_patch_size(h, w) @@ -1078,12 +1219,31 @@ def forward( f"Cosmos3 requires identical real text lengths within a batch " f"(got min={min_real_len}, max={max_real_len})." 
) + has_sound = sound_latents is not None + if has_sound and not self.sound_gen: + raise ValueError( + "Cosmos3 sound generation was requested, but this transformer " + "was initialized without sound modules. Check that the " + "transformer config enables sound_gen or defines sound_dim." + ) # Query Ulysses state at runtime ulysses_size, _, _ = _get_ulysses_state() # Patchify latents and project to hidden space hidden_video = self.proj_in(self.patchify(hidden_states, t, h, w)) + s_video = hidden_video.shape[1] + s_sound = 0 + hidden_sound = None + if sound_latents is not None: + if sound_latents.shape[0] != hidden_states.shape[0]: + raise ValueError( + "Cosmos3 sound and video batch sizes must match: " + f"video={hidden_states.shape[0]}, sound={sound_latents.shape[0]}." + ) + hidden_sound = self.audio_proj_in(self.pack_sound(sound_latents)) + hidden_sound = hidden_sound + self.audio_modality_embed.to(hidden_sound.dtype) + s_sound = hidden_sound.shape[1] # Timestep embedding (fp32 for precision). # For I2V: only add to noisy tokens, not conditioned ones. 
@@ -1106,7 +1266,12 @@ def forward( else: hidden_video = hidden_video + time_embed.unsqueeze(1) - hidden_gen = hidden_video + if hidden_sound is not None: + hidden_sound = hidden_sound + time_embed.unsqueeze(1) + hidden_parts = [hidden_video] + if hidden_sound is not None: + hidden_parts.append(hidden_sound) + hidden_gen = torch.cat(hidden_parts, dim=1) # Run UND pathway once and cache K/V (replicated across all ranks) if self.cached_kv is None: @@ -1118,6 +1283,7 @@ def forward( fps, hidden_states.device, hidden_states.dtype, + t_sound=s_sound, ) cached_kv_full = self.language_model(text_ids, freqs_und) self.cached_freqs_gen = freqs_gen @@ -1133,6 +1299,9 @@ def forward( raise RuntimeError("Cosmos3 GEN cache was not initialized before running GEN layers.") self._validate_gen_sequence_parallel( s_gen=hidden_gen.shape[1], + s_video=s_video, + s_sound=s_sound, + has_sound=has_sound, ulysses_size=ulysses_size, ) freqs_cos, freqs_sin = self.cached_freqs_gen @@ -1166,7 +1335,21 @@ def forward( # Final norm and project back to latent space hidden_gen = self.norm_moe_gen(hidden_gen) - return self.unpatchify(self.proj_out(hidden_gen), t, h, w) + if not has_sound: + return self.unpatchify(self.proj_out(hidden_gen), t, h, w) + + split_sizes = [s_video] + if has_sound: + split_sizes.append(s_sound) + split_hidden = hidden_gen.split(split_sizes, dim=1) + hidden_video = split_hidden[0] + video_pred = self.unpatchify(self.proj_out(hidden_video), t, h, w) + outputs: list[torch.Tensor] = [video_pred] + split_idx = 1 + if has_sound: + hidden_sound = split_hidden[split_idx] + outputs.append(self.unpack_sound(self.audio_proj_out(hidden_sound))) + return tuple(outputs) def post_load_weights(self) -> None: """Post-load processing: ensure correct dtypes.""" diff --git a/vllm_omni/entrypoints/openai/api_server.py b/vllm_omni/entrypoints/openai/api_server.py index 2c738853128..28acc7379c9 100644 --- a/vllm_omni/entrypoints/openai/api_server.py +++ 
b/vllm_omni/entrypoints/openai/api_server.py @@ -2627,6 +2627,8 @@ async def _parse_video_form( flow_shift: float | None = Form(default=None), true_cfg_scale: float | None = Form(default=None), seed: int | None = Form(default=None), + generate_sound: bool | None = Form(default=None), + sound_duration: float | None = Form(default=None, gt=0.0), negative_prompt: str | None = Form(default=None), enable_frame_interpolation: bool | None = Form(default=None), frame_interpolation_exp: int | None = Form(default=None, ge=1), @@ -2667,6 +2669,8 @@ async def _parse_video_form( "flow_shift": flow_shift, "true_cfg_scale": true_cfg_scale, "seed": seed, + "generate_sound": generate_sound, + "sound_duration": sound_duration, "negative_prompt": negative_prompt, "enable_frame_interpolation": enable_frame_interpolation, "frame_interpolation_exp": frame_interpolation_exp, diff --git a/vllm_omni/entrypoints/openai/protocol/videos.py b/vllm_omni/entrypoints/openai/protocol/videos.py index d46c8d43d6b..887e3ce67ea 100644 --- a/vllm_omni/entrypoints/openai/protocol/videos.py +++ b/vllm_omni/entrypoints/openai/protocol/videos.py @@ -149,6 +149,15 @@ class VideoGenerationRequest(BaseModel): description="True CFG scale (model-specific parameter, may be ignored if not supported)", ) seed: int | None = Field(default=None, description="Random seed for reproducibility") + generate_sound: bool = Field( + default=False, + description="Request model-generated audio for video models that support sound generation.", + ) + sound_duration: float | None = Field( + default=None, + gt=0.0, + description="Duration in seconds for model-generated audio. Defaults to the generated video duration.", + ) # vllm-omni extensions for post-generation frame interpolation. 
enable_frame_interpolation: bool = Field( diff --git a/vllm_omni/entrypoints/openai/serving_video.py b/vllm_omni/entrypoints/openai/serving_video.py index b6ed49996fe..57a76594a0f 100644 --- a/vllm_omni/entrypoints/openai/serving_video.py +++ b/vllm_omni/entrypoints/openai/serving_video.py @@ -148,6 +148,10 @@ async def _run_and_extract( ) if "flow_shift" in provided_fields and request.flow_shift is not None: gen_params.extra_args["flow_shift"] = request.flow_shift + if "generate_sound" in provided_fields: + gen_params.extra_args["generate_sound"] = request.generate_sound + if "sound_duration" in provided_fields and request.sound_duration is not None: + gen_params.extra_args["sound_duration"] = request.sound_duration # Apply model-specific extra parameters if request.extra_params is not None: From 1b7e40d1e6ae95f14c357a297ae292020b5e1a2f Mon Sep 17 00:00:00 2001 From: Maciej Bala Date: Tue, 2 Jun 2026 10:52:47 +0200 Subject: [PATCH 02/11] Fix tests; small improvements Signed-off-by: Maciej Bala Signed-off-by: lishunyang12 --- tests/diffusion/models/cosmos3/conftest.py | 185 ------------------ .../models/cosmos3/test_cosmos3_pipeline.py | 17 +- .../cosmos3/test_cosmos3_transformer.py | 9 +- .../models/cosmos3/pipeline_cosmos3.py | 11 +- .../models/cosmos3/transformer_cosmos3.py | 2 - 5 files changed, 26 insertions(+), 198 deletions(-) delete mode 100644 tests/diffusion/models/cosmos3/conftest.py diff --git a/tests/diffusion/models/cosmos3/conftest.py b/tests/diffusion/models/cosmos3/conftest.py deleted file mode 100644 index 7075065447c..00000000000 --- a/tests/diffusion/models/cosmos3/conftest.py +++ /dev/null @@ -1,185 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from __future__ import annotations - -import sys -import types -from types import SimpleNamespace -from typing import Any - -import pytest -import torch -from torch import nn - - -class StubScheduler: - def __init__(self, 
timesteps: list[int] | None = None, *, flow_shift: float = 1.0) -> None: - self.timesteps = torch.tensor(timesteps or [9, 3], dtype=torch.int64) - self.config = SimpleNamespace(num_train_timesteps=1000, flow_shift=flow_shift) - self.set_timesteps_calls: list[tuple[int, torch.device]] = [] - self.step_calls: list[tuple[torch.Tensor, torch.Tensor, torch.Tensor]] = [] - - def set_timesteps(self, num_steps: int, device: torch.device) -> None: - self.set_timesteps_calls.append((num_steps, device)) - self.timesteps = torch.arange(num_steps, 0, -1, dtype=torch.int64, device=device) - - def step(self, noise_pred: torch.Tensor, timestep: torch.Tensor, latents: torch.Tensor, **kwargs): - del kwargs - self.step_calls.append((noise_pred.clone(), timestep.clone(), latents.clone())) - return (latents + noise_pred,) - - -class _ModeLatentDist: - def __init__(self, latents: torch.Tensor) -> None: - self._latents = latents - - def mode(self) -> torch.Tensor: - return self._latents - - -class StubCosmos3VAE: - dtype = torch.float32 - - def __init__(self, z_dim: int = 2, *, temporal: int = 4, spatial: int = 8) -> None: - self.config = SimpleNamespace( - z_dim=z_dim, - scale_factor_temporal=temporal, - scale_factor_spatial=spatial, - latents_mean=[0.0] * z_dim, - latents_std=[1.0] * z_dim, - ) - - def encode(self, video: torch.Tensor): - latent_frames = (video.shape[2] - 1) // self.config.scale_factor_temporal + 1 - latent_height = video.shape[-2] // self.config.scale_factor_spatial - latent_width = video.shape[-1] // self.config.scale_factor_spatial - latents = torch.ones( - video.shape[0], - self.config.z_dim, - latent_frames, - latent_height, - latent_width, - dtype=video.dtype, - device=video.device, - ) - return SimpleNamespace(latent_dist=_ModeLatentDist(latents)) - - def decode(self, latents: torch.Tensor, return_dict: bool = False): - del return_dict - return (latents,) - - -class StubCosmos3Transformer(nn.Module): - def __init__( - self, - *, - latent_channel_size: int = 2, - 
sound_gen: bool = False, - sound_dim: int = 3, - ) -> None: - super().__init__() - self.latent_channel_size = latent_channel_size - self.sound_gen = sound_gen - self.sound_dim = sound_dim - self.cached_kv: Any | None = None - self.cached_freqs_gen: Any | None = None - self.calls: list[dict[str, Any]] = [] - self.reset_calls = 0 - - def reset_cache(self) -> None: - self.reset_calls += 1 - self.cached_kv = None - self.cached_freqs_gen = None - - def forward( - self, - *, - hidden_states: torch.Tensor, - timestep: torch.Tensor, - text_ids: torch.Tensor, - text_mask: torch.Tensor, - **kwargs: Any, - ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: - token = int(text_ids.reshape(-1)[0].item()) if text_ids.numel() else 0 - sound_latents = kwargs.get("sound_latents") - self.calls.append( - { - "token": token, - "timestep": timestep.clone(), - "text_mask": text_mask.clone(), - "cache_before": self.cached_kv, - "kwargs": dict(kwargs), - } - ) - if self.cached_kv is None: - marker = torch.tensor([token], dtype=torch.float32) - self.cached_kv = [(marker, marker + 100)] - self.cached_freqs_gen = (marker + 200, marker + 300) - outputs: list[torch.Tensor] = [torch.full_like(hidden_states, float(token))] - if sound_latents is not None: - outputs.append(torch.full_like(sound_latents, float(token + 10))) - return outputs[0] if len(outputs) == 1 else tuple(outputs) - - -def passthrough_progress_bar(iterable): - return iterable - - -@pytest.fixture(autouse=True) -def fake_cosmos3_guardrails(monkeypatch: pytest.MonkeyPatch): - module = types.ModuleType("vllm_omni.diffusion.models.cosmos3.guardrails") - module.is_guardrails_enabled = lambda od_config, sampling_params=None: False - module.ensure_initialized = lambda od_config: None - module.check_text_safety = lambda text: None - module.check_video_safety = lambda video: video - monkeypatch.setitem(sys.modules, module.__name__, module) - return module - - -@pytest.fixture -def make_cosmos3_pipeline(): - def _make(): - from 
vllm_omni.diffusion.models.cosmos3.pipeline_cosmos3 import ( - Cosmos3OmniDiffusersPipeline, - ) - - pipeline = object.__new__(Cosmos3OmniDiffusersPipeline) - nn.Module.__init__(pipeline) - pipeline.od_config = SimpleNamespace() - pipeline.device = torch.device("cpu") - pipeline.dtype = torch.float32 - pipeline.transformer = StubCosmos3Transformer(latent_channel_size=2) - pipeline.vae = StubCosmos3VAE(z_dim=2) - pipeline.vae_scale_factor_temporal = 4 - pipeline.vae_scale_factor_spatial = 8 - pipeline.scheduler = StubScheduler([9, 3], flow_shift=1.0) - pipeline._base_scheduler_config = pipeline.scheduler.config - pipeline._engine_init_flow_shift = 1.0 - pipeline._current_flow_shift = 1.0 - pipeline._guidance_scale = None - pipeline._num_timesteps = None - pipeline.progress_bar = passthrough_progress_bar - pipeline._sound_tokenizer = None - return pipeline - - return _make - - -def make_sampling_params(**overrides: Any) -> SimpleNamespace: - values = { - "height": None, - "width": None, - "num_frames": None, - "num_inference_steps": None, - "guidance_scale": None, - "generator": None, - "seed": 123, - "num_outputs_per_prompt": 1, - "frame_rate": None, - "resolved_frame_rate": None, - "max_sequence_length": None, - "extra_args": {}, - } - values.update(overrides) - return SimpleNamespace(**values) diff --git a/tests/diffusion/models/cosmos3/test_cosmos3_pipeline.py b/tests/diffusion/models/cosmos3/test_cosmos3_pipeline.py index b6116d9265d..3c042275341 100644 --- a/tests/diffusion/models/cosmos3/test_cosmos3_pipeline.py +++ b/tests/diffusion/models/cosmos3/test_cosmos3_pipeline.py @@ -78,9 +78,13 @@ def __init__( self, *, latent_channel_size: int = 2, + sound_gen: bool = False, + sound_dim: int = 3, ) -> None: super().__init__() self.latent_channel_size = latent_channel_size + self.sound_gen = sound_gen + self.sound_dim = sound_dim self.cached_kv: Any | None = None self.cached_freqs_gen: Any | None = None self.calls: list[dict[str, Any]] = [] @@ -99,8 +103,9 @@ def 
forward( text_ids: torch.Tensor, text_mask: torch.Tensor, **kwargs: Any, - ) -> torch.Tensor: + ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: token = int(text_ids.reshape(-1)[0].item()) if text_ids.numel() else 0 + sound_latents = kwargs.get("sound_latents") self.calls.append( { "token": token, @@ -114,7 +119,10 @@ def forward( marker = torch.tensor([token], dtype=torch.float32) self.cached_kv = [(marker, marker + 100)] self.cached_freqs_gen = (marker + 200, marker + 300) - return torch.full_like(hidden_states, float(token)) + outputs: list[torch.Tensor] = [torch.full_like(hidden_states, float(token))] + if sound_latents is not None: + outputs.append(torch.full_like(sound_latents, float(token + 10))) + return outputs[0] if len(outputs) == 1 else tuple(outputs) def passthrough_progress_bar(iterable): @@ -155,6 +163,7 @@ def _make(): pipeline._guidance_scale = None pipeline._num_timesteps = None pipeline._cache_dit_requires_paired_cfg = False + pipeline._sound_tokenizer = None pipeline.progress_bar = passthrough_progress_bar return pipeline @@ -235,7 +244,9 @@ def test_postprocess_handles_image_video_audio_and_validation() -> None: assert func(video, output_type="latent") is video assert func({"image": video})[0].size == (4, 4) - assert "video" in func({"video": video}) + # Video-only postprocess returns the bare processed video (not a dict), + # matching the image/latent branches and peer audio-capable pipelines. 
+ assert not isinstance(func({"video": video}), dict) assert ( func( {"video": video, "audio": torch.ones(1, 2, 16), "audio_sample_rate": 48000}, diff --git a/tests/diffusion/models/cosmos3/test_cosmos3_transformer.py b/tests/diffusion/models/cosmos3/test_cosmos3_transformer.py index 38db56e0c26..bd1a9588b7e 100644 --- a/tests/diffusion/models/cosmos3/test_cosmos3_transformer.py +++ b/tests/diffusion/models/cosmos3/test_cosmos3_transformer.py @@ -120,7 +120,6 @@ def test_forward_returns_video_prediction(monkeypatch: pytest.MonkeyPatch) -> No text_mask=torch.ones(1, 2, dtype=torch.long), video_shape=(1, 2, 2), fps=24.0, - sound_latents=torch.zeros(1, 3, 4), ) assert tuple(output.shape) == (1, 2, 1, 2, 2) @@ -161,10 +160,12 @@ def test_sound_pack_unpack_validate_shapes() -> None: model.pack_sound(torch.zeros(1, 4, 2)) -def test_forward_returns_video_and_sound_predictions() -> None: - from vllm_omni.diffusion.models.cosmos3.transformer_cosmos3 import Cosmos3VFMTransformer +def test_forward_returns_video_and_sound_predictions(monkeypatch: pytest.MonkeyPatch) -> None: + from vllm_omni.diffusion.models.cosmos3 import transformer_cosmos3 - output = Cosmos3VFMTransformer( + monkeypatch.setattr(transformer_cosmos3, "_get_ulysses_state", lambda: (1, 0, None)) + + output = transformer_cosmos3.Cosmos3VFMTransformer( SimpleNamespace( tf_model_config=_tiny_cosmos3_config(sound_gen=True, sound_dim=3, sound_latent_fps=24.0), dtype=torch.float32, diff --git a/vllm_omni/diffusion/models/cosmos3/pipeline_cosmos3.py b/vllm_omni/diffusion/models/cosmos3/pipeline_cosmos3.py index 543add3ac46..672f77715a2 100644 --- a/vllm_omni/diffusion/models/cosmos3/pipeline_cosmos3.py +++ b/vllm_omni/diffusion/models/cosmos3/pipeline_cosmos3.py @@ -232,13 +232,16 @@ def post_process_func( return video_processor.postprocess(image, output_type="pil") if is_guardrails_enabled(od_config, sampling_params): video = check_video_safety(video) - result = {"video": video_processor.postprocess_video(video, 
output_type=output_type)} + processed_video = video_processor.postprocess_video(video, output_type=output_type) if audio is None: - return result + return processed_video if isinstance(audio, torch.Tensor): audio = audio.detach().cpu() - result["audio"] = audio - result["fps"] = _resolve_output_fps(sampling_params) + result = { + "video": processed_video, + "audio": audio, + "fps": _resolve_output_fps(sampling_params), + } if audio_sample_rate is not None: result["audio_sample_rate"] = int(audio_sample_rate) return result diff --git a/vllm_omni/diffusion/models/cosmos3/transformer_cosmos3.py b/vllm_omni/diffusion/models/cosmos3/transformer_cosmos3.py index 49af3fde3d0..31fbf69d66d 100644 --- a/vllm_omni/diffusion/models/cosmos3/transformer_cosmos3.py +++ b/vllm_omni/diffusion/models/cosmos3/transformer_cosmos3.py @@ -984,8 +984,6 @@ def __init__( self.audio_proj_out = nn.Linear(self.hidden_size, self.sound_dim) self.audio_modality_embed = nn.Parameter(torch.zeros(self.hidden_size)) - self.time_embedder = TimestepEmbedder(self.hidden_size, target_dtype=torch.bfloat16) - self.gen_layers = nn.ModuleList( [ Cosmos3GenDecoderLayer( From 6638fbc5e91a4ce1b7deda0fd362c61f696d7d7e Mon Sep 17 00:00:00 2001 From: Bartosz Stefaniak Date: Tue, 2 Jun 2026 16:08:46 +0000 Subject: [PATCH 03/11] Remove unused parameter Signed-off-by: lishunyang12 --- vllm_omni/diffusion/models/cosmos3/transformer_cosmos3.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/vllm_omni/diffusion/models/cosmos3/transformer_cosmos3.py b/vllm_omni/diffusion/models/cosmos3/transformer_cosmos3.py index 31fbf69d66d..e1810bd7103 100644 --- a/vllm_omni/diffusion/models/cosmos3/transformer_cosmos3.py +++ b/vllm_omni/diffusion/models/cosmos3/transformer_cosmos3.py @@ -190,10 +190,8 @@ def compute_mrope_position_ids_sound( base_fps: float = 24.0, temporal_compression_factor_sound: int = 1, enable_fps_modulation: bool = True, - base_temporal_compression_factor: int | None = None, ) -> tuple[torch.Tensor, int | 
float]: """Generate mRoPE IDs for sound tokens as a (T, 1, 1) grid.""" - del base_temporal_compression_factor return compute_mrope_position_ids_vision( grid_t=grid_t, grid_h=1, From 9b8b239f10a11972025edf4f4c7d6e3a07e47eb3 Mon Sep 17 00:00:00 2001 From: Bartosz Stefaniak Date: Tue, 2 Jun 2026 19:03:00 +0000 Subject: [PATCH 04/11] Comment about packed modalities into single tensor Signed-off-by: lishunyang12 --- vllm_omni/diffusion/models/cosmos3/pipeline_cosmos3.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/vllm_omni/diffusion/models/cosmos3/pipeline_cosmos3.py b/vllm_omni/diffusion/models/cosmos3/pipeline_cosmos3.py index 672f77715a2..90de6575eee 100644 --- a/vllm_omni/diffusion/models/cosmos3/pipeline_cosmos3.py +++ b/vllm_omni/diffusion/models/cosmos3/pipeline_cosmos3.py @@ -1111,6 +1111,9 @@ def _cfg_active_at(t: torch.Tensor) -> bool: lo, hi = guidance_interval return lo <= t_scalar <= hi + # Joint scheduler step over multiple modalities. Safe for flow-matching schedulers + # because the update is linear per element; revisit this if Cosmos3 adopts a + # scheduler with cross-element dependencies (e.g. per-modality timestep). 
def _pack_joint( video_tensor: torch.Tensor, sound_tensor: torch.Tensor | None = None, From e82a83180838178a2ddc02015c8ecffb81981541 Mon Sep 17 00:00:00 2001 From: Bartosz Stefaniak Date: Tue, 2 Jun 2026 19:03:30 +0000 Subject: [PATCH 05/11] Enable sound generation only through "generate_sound", "sound_gen" flags Signed-off-by: lishunyang12 --- vllm_omni/diffusion/models/cosmos3/pipeline_cosmos3.py | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/vllm_omni/diffusion/models/cosmos3/pipeline_cosmos3.py b/vllm_omni/diffusion/models/cosmos3/pipeline_cosmos3.py index 90de6575eee..33c05efbf95 100644 --- a/vllm_omni/diffusion/models/cosmos3/pipeline_cosmos3.py +++ b/vllm_omni/diffusion/models/cosmos3/pipeline_cosmos3.py @@ -595,15 +595,7 @@ def _get_prompt_param(cls, prompt_data, key: str, default=None): @classmethod def _is_sound_request(cls, prompt_data, sp) -> bool: - keys = ( - "sound_gen", - "generate_sound", - "enable_sound_generation", - "return_audio", - "output_audio", - "generate_audio", - ) - for key in keys: + for key in ("generate_sound", "sound_gen"): if cls._truthy(cls._get_prompt_param(prompt_data, key, None)): return True if cls._truthy(cls._get_sp_param(sp, key, None)): From 2ee73c6019513ace43ff996a95096d671d56802b Mon Sep 17 00:00:00 2001 From: Bartosz Stefaniak Date: Tue, 2 Jun 2026 19:17:50 +0000 Subject: [PATCH 06/11] Pass sound_dim/sound_latent_fps into transformer from initialized sound tokenizer Signed-off-by: lishunyang12 --- .../models/cosmos3/test_cosmos3_pipeline.py | 122 ++++++++++++++++++ .../cosmos3/test_cosmos3_sound_tokenizer.py | 14 +- .../cosmos3/test_cosmos3_transformer.py | 43 ++++-- .../models/cosmos3/pipeline_cosmos3.py | 19 ++- .../models/cosmos3/sound_tokenizer.py | 47 ++----- .../models/cosmos3/transformer_cosmos3.py | 41 ++++-- 6 files changed, 215 insertions(+), 71 deletions(-) diff --git a/tests/diffusion/models/cosmos3/test_cosmos3_pipeline.py
b/tests/diffusion/models/cosmos3/test_cosmos3_pipeline.py index 3c042275341..0e441766a97 100644 --- a/tests/diffusion/models/cosmos3/test_cosmos3_pipeline.py +++ b/tests/diffusion/models/cosmos3/test_cosmos3_pipeline.py @@ -73,6 +73,24 @@ def decode(self, latents: torch.Tensor, return_dict: bool = False): return (latents,) +class StubCosmos3AVAE: + def __init__(self, **kwargs: Any) -> None: + self.kwargs = kwargs + self.sample_rate = int(kwargs["sample_rate"]) + self.audio_channels = int(kwargs["audio_channels"]) + self.latent_ch = int(kwargs["io_channels"]) + self.temporal_compression_factor = int(kwargs["hop_size"]) + + def get_latent_num_samples(self, num_audio_samples: int) -> int: + return int(num_audio_samples) // self.temporal_compression_factor + + def get_audio_num_samples(self, num_latent_samples: int) -> int: + return int(num_latent_samples) * self.temporal_compression_factor + + def decode(self, latents: torch.Tensor) -> torch.Tensor: + return torch.zeros(latents.shape[0], self.audio_channels, 8) + + class StubCosmos3Transformer(nn.Module): def __init__( self, @@ -80,11 +98,13 @@ def __init__( latent_channel_size: int = 2, sound_gen: bool = False, sound_dim: int = 3, + sound_latent_fps: float = 25.0, ) -> None: super().__init__() self.latent_channel_size = latent_channel_size self.sound_gen = sound_gen self.sound_dim = sound_dim + self.sound_latent_fps = sound_latent_fps self.cached_kv: Any | None = None self.cached_freqs_gen: Any | None = None self.calls: list[dict[str, Any]] = [] @@ -222,6 +242,108 @@ def test_pipeline_registered_and_exported() -> None: assert "Cosmos3OmniDiffusersPipeline" in cosmos3.__all__ +@pytest.fixture +def stub_real_pipeline_init(monkeypatch: pytest.MonkeyPatch): + from vllm_omni.diffusion.models.cosmos3 import pipeline_cosmos3 + + class _StubAutoTokenizer: + @classmethod + def from_pretrained(cls, *args, **kwargs): + return SimpleNamespace() + + class _StubDiffusersVAE: + config = SimpleNamespace(scale_factor_temporal=4, 
scale_factor_spatial=8) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + return cls() + + def to(self, _device): + return self + + class _StubDiffusersScheduler: + config = SimpleNamespace(flow_shift=1.0) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + return cls() + + class _StubVideoProcessor: + def __init__(self, *args, **kwargs) -> None: + pass + + monkeypatch.setattr(pipeline_cosmos3, "AutoTokenizer", _StubAutoTokenizer) + monkeypatch.setattr(pipeline_cosmos3, "DistributedAutoencoderKLWan", _StubDiffusersVAE) + monkeypatch.setattr(pipeline_cosmos3, "UniPCMultistepScheduler", _StubDiffusersScheduler) + monkeypatch.setattr(pipeline_cosmos3, "VideoProcessor", _StubVideoProcessor) + monkeypatch.setattr(pipeline_cosmos3, "get_local_device", lambda: torch.device("cpu")) + + +def _make_od_config(*, sound_gen: bool) -> SimpleNamespace: + tf_model_config = { + "hidden_size": 8, + "num_hidden_layers": 0, + "num_attention_heads": 2, + "num_key_value_heads": 2, + "head_dim": 4, + "intermediate_size": 16, + "vocab_size": 32, + "latent_patch_size": 1, + "latent_channel": 2, + "rope_scaling": {"mrope_section": [1, 1, 0]}, + } + if sound_gen: + tf_model_config["sound_gen"] = True + return SimpleNamespace( + enable_cpu_offload=False, + enable_diffusion_pipeline_profiler=False, + model="/nonexistent/model/path", + dtype=torch.float32, + flow_shift=None, + quantization_config=None, + custom_pipeline_args={}, + model_config={}, + tf_model_config=tf_model_config, + ) + + +def test_pipeline_init_skips_tokenizer_when_sound_disabled(stub_real_pipeline_init) -> None: + from vllm_omni.diffusion.models.cosmos3.pipeline_cosmos3 import Cosmos3OmniDiffusersPipeline + + pipeline = Cosmos3OmniDiffusersPipeline(od_config=_make_od_config(sound_gen=False)) + + assert pipeline._sound_tokenizer is None + assert pipeline.transformer.sound_gen is False + assert not hasattr(pipeline.transformer, "audio_proj_in") + assert not hasattr(pipeline.transformer, 
"audio_proj_out") + + +def test_pipeline_init_passes_tokenizer_attrs_into_transformer( + stub_real_pipeline_init, + monkeypatch: pytest.MonkeyPatch, +) -> None: + from vllm_omni.diffusion.models.cosmos3 import sound_tokenizer + from vllm_omni.diffusion.models.cosmos3.pipeline_cosmos3 import Cosmos3OmniDiffusersPipeline + + stub_tokenizer = sound_tokenizer.Cosmos3SoundTokenizer( + StubCosmos3AVAE(sample_rate=32000, audio_channels=2, io_channels=5, hop_size=800) + ) + monkeypatch.setattr( + sound_tokenizer.Cosmos3SoundTokenizer, + "from_config", + classmethod(lambda cls, od_config: stub_tokenizer), + ) + + pipeline = Cosmos3OmniDiffusersPipeline(od_config=_make_od_config(sound_gen=True)) + + assert pipeline._sound_tokenizer is stub_tokenizer + assert pipeline.transformer.sound_gen is True + assert pipeline.transformer.sound_dim == pipeline._sound_tokenizer.latent_ch == 5 + assert pipeline.transformer.sound_latent_fps == pipeline._sound_tokenizer.latent_fps == 40.0 + assert pipeline.transformer.audio_proj_in.in_features == 5 + assert pipeline.transformer.audio_proj_out.out_features == 5 + + def test_preprocess_i2v_image_input() -> None: from vllm_omni.diffusion.models.cosmos3.pipeline_cosmos3 import get_cosmos3_pre_process_func diff --git a/tests/diffusion/models/cosmos3/test_cosmos3_sound_tokenizer.py b/tests/diffusion/models/cosmos3/test_cosmos3_sound_tokenizer.py index 47664c59e77..7ab04cc212f 100644 --- a/tests/diffusion/models/cosmos3/test_cosmos3_sound_tokenizer.py +++ b/tests/diffusion/models/cosmos3/test_cosmos3_sound_tokenizer.py @@ -72,7 +72,12 @@ def test_from_config_loads_local_diffusers_component(tmp_path, monkeypatch: pyte assert created["checkpoint_path"] == str(tokenizer_dir / DIFFUSERS_SOUND_TOKENIZER_CHECKPOINT_NAME) assert created["config_path"] == str(tokenizer_dir / "config.json") - assert (tokenizer.sample_rate, tokenizer.latent_ch, tokenizer.hop_size) == (32000, 3, 800) + assert (tokenizer.sample_rate, tokenizer.latent_ch, tokenizer.hop_size, 
tokenizer.latent_fps) == ( + 32000, + 3, + 800, + 40.0, + ) def test_from_config_downloads_component_from_hf_repo(tmp_path, monkeypatch: pytest.MonkeyPatch) -> None: @@ -177,7 +182,12 @@ def test_component_config_precedence_and_conflict_detection(tmp_path, monkeypatc "tanh", 2.0, ) - assert (tokenizer.sample_rate, tokenizer.latent_ch, tokenizer.hop_size) == (48000, 64, 1920) + assert (tokenizer.sample_rate, tokenizer.latent_ch, tokenizer.hop_size, tokenizer.latent_fps) == ( + 48000, + 64, + 1920, + 25.0, + ) with pytest.raises(ValueError, match=r"sample_rate.*48000.*32000"): sound_tokenizer.Cosmos3SoundTokenizer.from_config( diff --git a/tests/diffusion/models/cosmos3/test_cosmos3_transformer.py b/tests/diffusion/models/cosmos3/test_cosmos3_transformer.py index bd1a9588b7e..6878b6b96ed 100644 --- a/tests/diffusion/models/cosmos3/test_cosmos3_transformer.py +++ b/tests/diffusion/models/cosmos3/test_cosmos3_transformer.py @@ -4,7 +4,7 @@ from __future__ import annotations from types import SimpleNamespace - +from typing import Any import pytest import torch from torch import nn @@ -125,18 +125,16 @@ def test_forward_returns_video_prediction(monkeypatch: pytest.MonkeyPatch) -> No assert tuple(output.shape) == (1, 2, 1, 2, 2) -def test_sound_modules_follow_config() -> None: +def test_sound_modules_follow_injected_sound_dim() -> None: from vllm_omni.diffusion.models.cosmos3.transformer_cosmos3 import Cosmos3VFMTransformer tiny = _tiny_cosmos3_config() no_modal = Cosmos3VFMTransformer(SimpleNamespace(tf_model_config=tiny, dtype=torch.float32)) with_sound = Cosmos3VFMTransformer( - SimpleNamespace( - tf_model_config={**tiny, "sound_gen": True}, - model_config={"sound_tokenizer": {"io_channels": 5, "sample_rate": 32000, "hop_size": 800}}, - custom_pipeline_args={}, - dtype=torch.float32, - ) + SimpleNamespace(tf_model_config=tiny, dtype=torch.float32), + sound_gen=True, + sound_dim=5, + sound_latent_fps=40.0, ) assert no_modal.sound_gen is False @@ -146,6 +144,23 @@ def 
test_sound_modules_follow_config() -> None: assert with_sound.audio_proj_in.in_features == 5 +@pytest.mark.parametrize( + "kwargs", + [ + {"sound_gen": True}, + {"sound_gen": True, "sound_dim": 5}, + {"sound_gen": True, "sound_latent_fps": 40.0}, + ], +) +def test_transformer_requires_sound_dim_and_fps_when_sound_gen_true(kwargs: dict[str, Any]) -> None: + from vllm_omni.diffusion.models.cosmos3.transformer_cosmos3 import Cosmos3VFMTransformer + with pytest.raises(ValueError, match=r"requires an explicit sound_dim and sound_latent_fps"): + Cosmos3VFMTransformer( + SimpleNamespace(tf_model_config=_tiny_cosmos3_config(), dtype=torch.float32), + **kwargs, + ) + + def test_sound_pack_unpack_validate_shapes() -> None: from vllm_omni.diffusion.models.cosmos3.transformer_cosmos3 import Cosmos3VFMTransformer @@ -167,9 +182,12 @@ def test_forward_returns_video_and_sound_predictions(monkeypatch: pytest.MonkeyP output = transformer_cosmos3.Cosmos3VFMTransformer( SimpleNamespace( - tf_model_config=_tiny_cosmos3_config(sound_gen=True, sound_dim=3, sound_latent_fps=24.0), + tf_model_config=_tiny_cosmos3_config(), dtype=torch.float32, - ) + ), + sound_gen=True, + sound_dim=3, + sound_latent_fps=40.0, )( hidden_states=torch.zeros(1, 2, 1, 2, 2), timestep=torch.tensor([1.0]), @@ -188,7 +206,10 @@ def test_forward_with_sound_ulysses_error_mentions_combined_sequence(monkeypatch import vllm_omni.diffusion.models.cosmos3.transformer_cosmos3 as cosmos3_module model = cosmos3_module.Cosmos3VFMTransformer( - SimpleNamespace(tf_model_config=_tiny_cosmos3_config(sound_gen=True, sound_dim=3), dtype=torch.float32) + SimpleNamespace(tf_model_config=_tiny_cosmos3_config(), dtype=torch.float32), + sound_gen=True, + sound_dim=3, + sound_latent_fps=40.0, ) monkeypatch.setattr(cosmos3_module, "_get_ulysses_state", lambda: (2, 0, None)) diff --git a/vllm_omni/diffusion/models/cosmos3/pipeline_cosmos3.py b/vllm_omni/diffusion/models/cosmos3/pipeline_cosmos3.py index 33c05efbf95..5290e21204e 100644 
--- a/vllm_omni/diffusion/models/cosmos3/pipeline_cosmos3.py +++ b/vllm_omni/diffusion/models/cosmos3/pipeline_cosmos3.py @@ -46,7 +46,7 @@ from vllm_omni.diffusion.request import OmniDiffusionRequest from vllm_omni.inputs.data import OmniDiffusionSamplingParams -from .transformer_cosmos3 import Cosmos3VFMTransformer +from .transformer_cosmos3 import Cosmos3VFMTransformer, resolve_sound_gen logger = init_logger(__name__) @@ -324,10 +324,22 @@ def __init__( self.vae_scale_factor_temporal = int(self.vae.config.scale_factor_temporal) self.vae_scale_factor_spatial = getattr(self.vae.config, "scale_factor_spatial", 16) + sound_gen = resolve_sound_gen(od_config) + sound_dim = None + sound_latent_fps = None + self._sound_tokenizer = None + if sound_gen: + self._sound_tokenizer = self._get_sound_tokenizer() + sound_dim = self._sound_tokenizer.latent_ch + sound_latent_fps = self._sound_tokenizer.latent_fps + # --- Transformer (weights loaded later via weights_sources) --- self.transformer = Cosmos3VFMTransformer( od_config=od_config, temporal_compression_factor=self.vae_scale_factor_temporal, + sound_gen=sound_gen, + sound_dim=sound_dim, + sound_latent_fps=sound_latent_fps, ) # --- Scheduler --- @@ -367,9 +379,6 @@ def __init__( self._guidance_scale = None self._num_timesteps = None - self._sound_tokenizer = None - if getattr(self.transformer, "sound_gen", False): - self._get_sound_tokenizer() # Set True by ``enable_cache_for_cosmos3`` when cache-dit is enabled on # this pipeline. 
Tells the sequential-CFG loop to keep paired @@ -603,8 +612,6 @@ def _is_sound_request(cls, prompt_data, sp) -> bool: return False def _get_sound_tokenizer(self): - if not hasattr(self, "_sound_tokenizer"): - self._sound_tokenizer = None if self._sound_tokenizer is None: from .sound_tokenizer import Cosmos3SoundTokenizer diff --git a/vllm_omni/diffusion/models/cosmos3/sound_tokenizer.py b/vllm_omni/diffusion/models/cosmos3/sound_tokenizer.py index 281b7e1d9f0..4e2d6f7ee76 100644 --- a/vllm_omni/diffusion/models/cosmos3/sound_tokenizer.py +++ b/vllm_omni/diffusion/models/cosmos3/sound_tokenizer.py @@ -289,23 +289,6 @@ def get_sound_channels(od_config: OmniDiffusionConfig) -> int: ) -def get_sound_dim(od_config: OmniDiffusionConfig | None) -> int: - if od_config is None: - return DEFAULT_SOUND_DIM - args = _pipeline_args(od_config) - custom_value = _custom_arg_value(args, ("sound_dim", "io_channels", "latent_ch")) - if custom_value is not None: - return int(custom_value) - top_value = _top_level_model_value(od_config, ("sound_dim",)) - if top_value is not None: - return int(top_value) - nested_value = _first_value_from_configs( - _nested_sound_tokenizer_configs(od_config), - ("io_channels", "vocoder_input_dim", "latent_ch"), - ) - return int(DEFAULT_SOUND_DIM if nested_value is None else nested_value) - - def get_sound_hop_size(od_config: OmniDiffusionConfig) -> int: args = _pipeline_args(od_config) return _resolve_arch_value( @@ -321,27 +304,6 @@ def get_sound_hop_size(od_config: OmniDiffusionConfig) -> int: ) -def get_sound_latent_fps(od_config: OmniDiffusionConfig | None) -> float: - if od_config is None: - return DEFAULT_SOUND_LATENT_FPS - args = _pipeline_args(od_config) - custom_value = _custom_arg_value(args, ("sound_latent_fps",)) - if custom_value is not None: - return float(custom_value) - top_value = _top_level_model_value(od_config, ("sound_latent_fps",)) - if top_value is not None: - return float(top_value) - nested_configs = 
_nested_sound_tokenizer_configs(od_config) - nested_fps = _first_value_from_configs(nested_configs, ("sound_latent_fps", "latent_fps")) - if nested_fps is not None: - return float(nested_fps) - sample_rate = _first_value_from_configs(nested_configs, ("sample_rate", "sampling_rate")) - hop_size = _first_value_from_configs(nested_configs, ("hop_size",)) - if sample_rate is not None and hop_size is not None: - return float(sample_rate) / float(hop_size) - return float(DEFAULT_SOUND_LATENT_FPS) - - class Cosmos3SoundTokenizer: """Thin adapter around the local AVAE tokenizer implementation.""" @@ -351,6 +313,11 @@ def __init__(self, tokenizer: Any) -> None: self.audio_channels = int(getattr(tokenizer, "audio_channels", DEFAULT_SOUND_CHANNELS)) self.latent_ch = int(getattr(tokenizer, "latent_ch", DEFAULT_SOUND_DIM)) self.hop_size = int(getattr(tokenizer, "temporal_compression_factor", DEFAULT_SOUND_HOP_SIZE)) + if self.hop_size <= 0: + raise ValueError( + f"Cosmos3 sound tokenizer hop_size must be positive, got {self.hop_size}." 
+ ) + self.latent_fps = float(self.sample_rate) / float(self.hop_size) @classmethod def from_config(cls, od_config: OmniDiffusionConfig) -> Cosmos3SoundTokenizer: @@ -503,12 +470,14 @@ def from_config(cls, od_config: OmniDiffusionConfig) -> Cosmos3SoundTokenizer: ) if _is_rank_zero(): logger.info( - "Loaded Cosmos3 AVAE sound tokenizer from %s (sr=%d, channels=%d, latent_ch=%d, hop=%d)", + "Loaded Cosmos3 AVAE sound tokenizer from %s " + "(sr=%d, channels=%d, latent_ch=%d, hop=%d, latent_fps=%.3f)", avae_path, sample_rate, audio_channels, sound_dim, hop_size, + float(sample_rate) / float(hop_size), ) return cls(tokenizer) diff --git a/vllm_omni/diffusion/models/cosmos3/transformer_cosmos3.py b/vllm_omni/diffusion/models/cosmos3/transformer_cosmos3.py index e1810bd7103..5ff2683fdda 100644 --- a/vllm_omni/diffusion/models/cosmos3/transformer_cosmos3.py +++ b/vllm_omni/diffusion/models/cosmos3/transformer_cosmos3.py @@ -121,6 +121,21 @@ def _as_bool(value: Any) -> bool: return bool(value) +def resolve_sound_gen(od_config: Any) -> bool: + """Capability gate shared by the pipeline and transformer. + + Explicit ``sound_gen`` flag wins (including an explicit False); + otherwise infer from the presence of any sound-width key in od_config. 
+ """ + sound_gen_value = _od_config_get(od_config, "sound_gen", None) + if sound_gen_value is not None: + return _as_bool(sound_gen_value) + for key in ("sound_dim", "io_channels", "vocoder_input_dim", "latent_ch"): + if _od_config_get(od_config, key, None) is not None: + return True + return False + + # --------------------------------------------------------------------------- # Rotary Position Embeddings (mRoPE) # --------------------------------------------------------------------------- @@ -906,6 +921,9 @@ def __init__( od_config: OmniDiffusionConfig, *, temporal_compression_factor: int | None = None, + sound_gen: bool = False, + sound_dim: int | None = None, + sound_latent_fps: float | None = None, ) -> None: super().__init__() model_config = od_config.tf_model_config @@ -926,19 +944,16 @@ def __init__( self.latent_channel_size = int(_tf_config_get(model_config, "latent_channel", 48)) self.timestep_scale = float(_tf_config_get(model_config, "timestep_scale", 0.001)) self.base_fps = float(_tf_config_get(model_config, "base_fps", 24.0)) - sound_gen_value = _od_config_get(od_config, "sound_gen", None) - sound_dim_value = _od_config_get(od_config, "sound_dim", None) - if sound_dim_value is None: - sound_dim_value = _od_config_get(od_config, "io_channels", None) - if sound_dim_value is None: - sound_dim_value = _od_config_get(od_config, "vocoder_input_dim", None) - if sound_dim_value is None: - sound_dim_value = _od_config_get(od_config, "latent_ch", None) - self.sound_gen = _as_bool(sound_gen_value) if sound_gen_value is not None else sound_dim_value is not None - from .sound_tokenizer import get_sound_dim, get_sound_latent_fps - - self.sound_dim = int(sound_dim_value if sound_dim_value is not None else get_sound_dim(od_config)) - self.sound_latent_fps = float(get_sound_latent_fps(od_config)) + self.sound_gen = sound_gen + self.sound_dim = sound_dim + self.sound_latent_fps = sound_latent_fps + + if self.sound_gen and (sound_dim is None or sound_latent_fps is 
None): + raise ValueError( + "Cosmos3VFMTransformer requires an explicit sound_dim and sound_latent_fps when sound_gen is True; " + "the pipeline must pass Cosmos3SoundTokenizer.latent_ch so the audio projection " + "layers are sized from the authoritative AVAE latent width." + ) if temporal_compression_factor is None: temporal_compression_factor = _tf_config_get(model_config, "temporal_compression_factor", 4) self.temporal_compression_factor = int(temporal_compression_factor) From 04ffce45365e12d9726fcb74904040ab9352372f Mon Sep 17 00:00:00 2001 From: Bartosz Stefaniak Date: Tue, 2 Jun 2026 19:17:58 +0000 Subject: [PATCH 07/11] Update recipes Signed-off-by: lishunyang12 --- recipes/README.md | 2 +- recipes/nvidia/Cosmos3-Nano.md | 20 ++++++++++++++++++++ 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/recipes/README.md b/recipes/README.md index 48e9d0a80e7..416ed77fe93 100644 --- a/recipes/README.md +++ b/recipes/README.md @@ -36,7 +36,7 @@ recipes/ | [`LTX/LTX-2.md`](./LTX/LTX-2.md) | Text-to-video and image-to-video serving | 1x H200 141GB | | [`LTX/LTX-2.3.md`](./LTX/LTX-2.3.md) | Text-to-video with audio generation (22B) | 1x GPU (96GB VRAM) | | [`mistralai/Voxtral-TTS.md`](./mistralai/Voxtral-TTS.md) | Online serving for TTS | 1x RTX 4090 24GB | -| [`nvidia/Cosmos3-Nano.md`](./nvidia/Cosmos3-Nano.md) | Text-to-image, text-to-video, and image-to-video generation | 1x H200 141GB / B300 | +| [`nvidia/Cosmos3-Nano.md`](./nvidia/Cosmos3-Nano.md) | Text-to-image, text-to-video, image-to-video generation, text to video with sound | 1x H200 141GB / B300 | | [`OpenBMB/MiniCPM-o-4_5.md`](./OpenBMB/MiniCPM-o-4_5.md) | Online serving for omni multimodal chat (text / image / audio / video → text + 24 kHz speech) | 2x A100/H100 80GB / 3x mid-tier GPU / 8x RTX 4090 24GB | | [`OpenBMB/VoxCPM2.md`](./OpenBMB/VoxCPM2.md) | Online + offline TTS with native AR pipeline (48 kHz, 30+ languages) | 1x RTX 4090 24GB | | [`Qwen/Qwen-Image.md`](./Qwen/Qwen-Image.md) | 
Text-to-image serving with step-wise continuous batching replay and ModelOpt mixed FP8/NVFP4 | 1x A100 80GB / 2x B200 | diff --git a/recipes/nvidia/Cosmos3-Nano.md b/recipes/nvidia/Cosmos3-Nano.md index 5d5e524da58..8113bfb7081 100644 --- a/recipes/nvidia/Cosmos3-Nano.md +++ b/recipes/nvidia/Cosmos3-Nano.md @@ -20,6 +20,7 @@ the mode is selected per request: - **T2V** — `POST /v1/videos/sync` with `num_frames > 1` and no reference image. - **I2V** — `POST /v1/videos/sync` with a reference image (`input_reference` file upload, or `image_reference` JSON). +- **T2VS** — `POST /v1/videos/sync` with `num_frames > 1`, no reference image and `generate_sound=true`. ## References @@ -116,6 +117,25 @@ curl -sS -X POST http://localhost:8000/v1/videos/sync \ -F "seed=1111" \ -F "input_reference=@/path/to/reference.jpg;type=image/jpeg" \ -o cosmos3_i2v.mp4 + + +# Text-to-video-with-sound +curl -sS -X POST http://localhost:8000/v1/videos/sync \ + -H "Accept: video/mp4" \ + -F "prompt=The video opens with a view of a well-lit indoor fruit display. A robotic arm picks up a pear, an orange, and a carambola one by one, placing each into a plastic bag in a shopping cart with red handles. The video is 7.875 seconds long, 24 FPS, and 1280x720. Audio description: soft servo whirs, gentle fruit thuds, plastic bag rustling, and a faint refrigeration hum." 
\ + -F "negative_prompt=blurry, distorted, low quality" \ + -F "size=1280x720" \ + -F "num_frames=189" \ + -F "fps=24" \ + -F "num_inference_steps=35" \ + -F "guidance_scale=6.0" \ + -F "max_sequence_length=4096" \ + -F "flow_shift=10.0" \ + -F "seed=42" \ + -F "generate_sound=true" \ + -F "sound_duration=7.875" \ + -F 'extra_params={"use_resolution_template":false,"use_duration_template":false,"guardrails":true}' \ + -o cosmos3_t2v_with_sound.mp4 ``` #### Notes From a0a98683b955f9d30deb47ec80f9b31713532acf Mon Sep 17 00:00:00 2001 From: Bartosz Stefaniak Date: Tue, 2 Jun 2026 19:19:43 +0000 Subject: [PATCH 08/11] lint Signed-off-by: Bartosz Stefaniak Signed-off-by: lishunyang12 --- recipes/nvidia/Cosmos3-Nano.md | 2 +- tests/diffusion/models/cosmos3/test_cosmos3_transformer.py | 2 ++ vllm_omni/diffusion/models/cosmos3/sound_tokenizer.py | 4 +--- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/recipes/nvidia/Cosmos3-Nano.md b/recipes/nvidia/Cosmos3-Nano.md index 8113bfb7081..1698fa14f61 100644 --- a/recipes/nvidia/Cosmos3-Nano.md +++ b/recipes/nvidia/Cosmos3-Nano.md @@ -131,7 +131,7 @@ curl -sS -X POST http://localhost:8000/v1/videos/sync \ -F "guidance_scale=6.0" \ -F "max_sequence_length=4096" \ -F "flow_shift=10.0" \ - -F "seed=42" \ + -F "seed=0" \ -F "generate_sound=true" \ -F "sound_duration=7.875" \ -F 'extra_params={"use_resolution_template":false,"use_duration_template":false,"guardrails":true}' \ diff --git a/tests/diffusion/models/cosmos3/test_cosmos3_transformer.py b/tests/diffusion/models/cosmos3/test_cosmos3_transformer.py index 6878b6b96ed..062cd8abf98 100644 --- a/tests/diffusion/models/cosmos3/test_cosmos3_transformer.py +++ b/tests/diffusion/models/cosmos3/test_cosmos3_transformer.py @@ -5,6 +5,7 @@ from types import SimpleNamespace from typing import Any + import pytest import torch from torch import nn @@ -154,6 +155,7 @@ def test_sound_modules_follow_injected_sound_dim() -> None: ) def 
test_transformer_requires_sound_dim_and_fps_when_sound_gen_true(kwargs: dict[str, Any]) -> None: from vllm_omni.diffusion.models.cosmos3.transformer_cosmos3 import Cosmos3VFMTransformer + with pytest.raises(ValueError, match=r"requires an explicit sound_dim and sound_latent_fps"): Cosmos3VFMTransformer( SimpleNamespace(tf_model_config=_tiny_cosmos3_config(), dtype=torch.float32), diff --git a/vllm_omni/diffusion/models/cosmos3/sound_tokenizer.py b/vllm_omni/diffusion/models/cosmos3/sound_tokenizer.py index 4e2d6f7ee76..66937b33c6b 100644 --- a/vllm_omni/diffusion/models/cosmos3/sound_tokenizer.py +++ b/vllm_omni/diffusion/models/cosmos3/sound_tokenizer.py @@ -314,9 +314,7 @@ def __init__(self, tokenizer: Any) -> None: self.latent_ch = int(getattr(tokenizer, "latent_ch", DEFAULT_SOUND_DIM)) self.hop_size = int(getattr(tokenizer, "temporal_compression_factor", DEFAULT_SOUND_HOP_SIZE)) if self.hop_size <= 0: - raise ValueError( - f"Cosmos3 sound tokenizer hop_size must be positive, got {self.hop_size}." 
- ) + raise ValueError(f"Cosmos3 sound tokenizer hop_size must be positive, got {self.hop_size}.") self.latent_fps = float(self.sample_rate) / float(self.hop_size) @classmethod From 8c340fe5dda614aec9d12702ccdb704962b074ce Mon Sep 17 00:00:00 2001 From: lishunyang12 Date: Tue, 2 Jun 2026 19:09:50 +0000 Subject: [PATCH 09/11] add video+sound usage to Cosmos3-Nano recipe Signed-off-by: lishunyang12 --- recipes/nvidia/Cosmos3-Nano.md | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/recipes/nvidia/Cosmos3-Nano.md b/recipes/nvidia/Cosmos3-Nano.md index 1698fa14f61..41df35e2883 100644 --- a/recipes/nvidia/Cosmos3-Nano.md +++ b/recipes/nvidia/Cosmos3-Nano.md @@ -6,7 +6,7 @@ - Vendor: NVIDIA - Model: `nvidia/Cosmos3-Nano` -- Task: Text-to-image (T2I), text-to-video (T2V), and image-to-video (I2V) generation +- Task: Text-to-image (T2I), text-to-video (T2V), and image-to-video (I2V) generation, with optional synchronized audio (video + sound) - Mode: Online serving with the OpenAI-compatible image/video APIs, plus offline generation via the `Omni` API - Maintainer: Community @@ -20,13 +20,17 @@ the mode is selected per request: - **T2V** — `POST /v1/videos/sync` with `num_frames > 1` and no reference image. - **I2V** — `POST /v1/videos/sync` with a reference image (`input_reference` file upload, or `image_reference` JSON). -- **T2VS** — `POST /v1/videos/sync` with `num_frames > 1`, no reference image and `generate_sound=true`. +- **T2VS / I2VS** — add `generate_sound=true` (and optional `sound_duration`) to a + T2V/I2V `/v1/videos/sync` request to also generate synchronized audio, muxed into + the mp4 as AAC 48 kHz stereo. See the official model card's "Video + Audio" examples. ## References - Model card (authoritative usage + example assets): - Example inputs/outputs live in the repo's `assets/` (`example_t2v_prompt.json`, - `example_i2v_prompt.json`, `example_i2v_input.jpg`, `negative_prompt.json`). 
+ `example_i2v_prompt.json`, `example_i2v_input.jpg`, `negative_prompt.json`; + audio examples: `example_t2vs_prompt.json`, `example_t2vs_output.mp4`, + `example_i2vs_output.mp4`). - Prompt upsampling (recommended for quality): the model expects JSON-upsampled structured prompts; see NVIDIA's `cosmos-framework` prompt-upsampling docs. - Pipeline: [`vllm_omni/diffusion/models/cosmos3/pipeline_cosmos3.py`](../../vllm_omni/diffusion/models/cosmos3/pipeline_cosmos3.py) @@ -163,8 +167,8 @@ curl -sS -X POST http://localhost:8000/v1/videos/sync \ the server fails at pipeline build with a gated-repo / safety-checker error. - A guardrail-blocked prompt currently returns HTTP 500 (`"Guardrail blocked prompt"`). - - Video + audio, and action (policy / forward- / inverse-dynamics) modalities - are not part of this integration yet. + - Action (policy / forward- / inverse-dynamics) modalities are not part of + this integration yet. ### 1x GPU (Offline generation) From 96243ef11ae2cbde89e8a96e681b3ba0645a40a4 Mon Sep 17 00:00:00 2001 From: lishunyang12 Date: Tue, 2 Jun 2026 20:22:03 +0000 Subject: [PATCH 10/11] add Cosmos3-Super recipe Signed-off-by: lishunyang12 --- recipes/README.md | 1 + recipes/nvidia/Cosmos3-Super.md | 87 +++++++++++++++++++++++++++++++++ 2 files changed, 88 insertions(+) create mode 100644 recipes/nvidia/Cosmos3-Super.md diff --git a/recipes/README.md b/recipes/README.md index 416ed77fe93..161bcdd5edc 100644 --- a/recipes/README.md +++ b/recipes/README.md @@ -37,6 +37,7 @@ recipes/ | [`LTX/LTX-2.3.md`](./LTX/LTX-2.3.md) | Text-to-video with audio generation (22B) | 1x GPU (96GB VRAM) | | [`mistralai/Voxtral-TTS.md`](./mistralai/Voxtral-TTS.md) | Online serving for TTS | 1x RTX 4090 24GB | | [`nvidia/Cosmos3-Nano.md`](./nvidia/Cosmos3-Nano.md) | Text-to-image, text-to-video, image-to-video generation, text to video with sound | 1x H200 141GB / B300 | +| [`nvidia/Cosmos3-Super.md`](./nvidia/Cosmos3-Super.md) | 64B T2I / T2V / I2V generation (+ optional audio) 
| 8x H200/H100/A100 / 2x H200 / B300 | | [`OpenBMB/MiniCPM-o-4_5.md`](./OpenBMB/MiniCPM-o-4_5.md) | Online serving for omni multimodal chat (text / image / audio / video → text + 24 kHz speech) | 2x A100/H100 80GB / 3x mid-tier GPU / 8x RTX 4090 24GB | | [`OpenBMB/VoxCPM2.md`](./OpenBMB/VoxCPM2.md) | Online + offline TTS with native AR pipeline (48 kHz, 30+ languages) | 1x RTX 4090 24GB | | [`Qwen/Qwen-Image.md`](./Qwen/Qwen-Image.md) | Text-to-image serving with step-wise continuous batching replay and ModelOpt mixed FP8/NVFP4 | 1x A100 80GB / 2x B200 | diff --git a/recipes/nvidia/Cosmos3-Super.md b/recipes/nvidia/Cosmos3-Super.md new file mode 100644 index 00000000000..33d3ec093b9 --- /dev/null +++ b/recipes/nvidia/Cosmos3-Super.md @@ -0,0 +1,87 @@ +# Cosmos3-Super + +> Frontier 64B world model: text-to-image, text-to-video, image-to-video (+ optional audio) + +## Summary + +- Vendor: NVIDIA +- Model: `nvidia/Cosmos3-Super` (64B; also `Cosmos3-Super-Text2Image`, `Cosmos3-Super-Image2Video`) +- Task: T2I, T2V, I2V generation, with optional synchronized audio (video + sound) +- Mode: Online serving with the OpenAI-compatible image/video APIs +- Maintainer: Community + +## When to use this recipe + +Use this recipe to deploy the 64B `nvidia/Cosmos3-Super` for the highest-quality +Cosmos3 generation. It shares the same `Cosmos3OmniDiffusersPipeline` and request +formats as [Cosmos3-Nano](./Cosmos3-Nano.md) — only the checkpoint size and the +recommended parallelism differ. Mode is selected per request (T2I → +`/v1/images/generations`; T2V/I2V → `/v1/videos/sync`; add `generate_sound=true` +for audio). 
+ +## References + +- Model card (authoritative usage + example assets): [`nvidia/Cosmos3-Super`](https://huggingface.co/nvidia/Cosmos3-Super) +- Nano recipe (same APIs/params): [`Cosmos3-Nano.md`](./Cosmos3-Nano.md) +- Pipeline: [`vllm_omni/diffusion/models/cosmos3/pipeline_cosmos3.py`](../../vllm_omni/diffusion/models/cosmos3/pipeline_cosmos3.py) + +## Hardware Support + +## GPU + +### 8x H200/H100/A100 (recommended, per model card) + +```bash +vllm serve nvidia/Cosmos3-Super \ + --omni \ + --host 0.0.0.0 --port 8000 \ + --cfg-parallel-size 2 \ + --ulysses-degree 4 \ + --use-hsdp --hsdp-shard-size 8 \ + --init-timeout 1800 +``` + +### 2x H200 / B300 (minimum) + +```bash +vllm serve nvidia/Cosmos3-Super \ + --omni \ + --host 0.0.0.0 --port 8000 \ + --cfg-parallel-size 2 \ + --use-hsdp --hsdp-shard-size 2 \ + --init-timeout 1800 +``` + +Guardrails are on by default (gated `nvidia/Cosmos-1.0-Guardrail` — `pip install +cosmos-guardrail`, accept the license, set `HF_TOKEN`); add `--no-guardrails` to +disable. `--enable-layerwise-offload` reduces VRAM on smaller GPUs. + +#### Verification + +Requests are identical to Nano (see [`Cosmos3-Nano.md`](./Cosmos3-Nano.md) for full +T2I/T2V/I2V/T2VS curls); official params: `size=1280x720, num_frames=189, fps=24, +num_inference_steps=35, guidance_scale=6.0, flow_shift=10.0, max_sequence_length=4096`. 
+ +```bash +curl http://localhost:8000/v1/models +# T2V (official prompt assets give best quality) +curl -sS -X POST http://localhost:8000/v1/videos/sync -H "Accept: video/mp4" \ + -F "model=nvidia/Cosmos3-Super" -F "prompt=A robot arm is cleaning a plate in the kitchen" \ + -F "size=1280x720" -F "num_frames=189" -F "fps=24" -F "num_inference_steps=35" \ + -F "guidance_scale=6.0" -F "max_sequence_length=4096" -F "flow_shift=10.0" \ + -F 'extra_params={"use_resolution_template":false,"use_duration_template":false,"guardrails":true}' \ + -F "seed=17" -o cosmos3_super_t2v.mp4 +``` + +#### Notes + +- **Measured (2x B300, bf16, guardrails off, official 2-GPU config above):** + - T2I 1024², 50 steps → **~6 s** + - T2V 1280×720, 189 frames, 35 steps → **~197 s** + - I2V 1280×720, 189 frames, 35 steps → **~200 s** + - T2V + sound (189 frames, 35 steps) → **~198 s**, output muxes **AAC 48 kHz stereo** + - (NVIDIA's reference: 8×H200 @ 50 steps ≈ 55 s/video; 2×H200 @ 35 steps ≈ 3 min/video.) +- **Memory:** ~61.5 GiB per GPU when sharded across 2 GPUs (HSDP shard 2); repo ~135 GB on disk. +- Same generation defaults, supported sizes, and `generate_sound`/`sound_duration` + semantics as Nano. Action (policy / forward- / inverse-dynamics) modalities are + not part of this integration yet. 
From 7765517a2eb595a673a0fcfd71c800e2df85bafe Mon Sep 17 00:00:00 2001 From: lishunyang12 Date: Tue, 2 Jun 2026 20:29:47 +0000 Subject: [PATCH 11/11] polish Cosmos3 recipes: add model field, install note, Super curls Signed-off-by: lishunyang12 --- recipes/nvidia/Cosmos3-Nano.md | 12 +++++++++--- recipes/nvidia/Cosmos3-Super.md | 21 +++++++++++++++++++++ 2 files changed, 30 insertions(+), 3 deletions(-) diff --git a/recipes/nvidia/Cosmos3-Nano.md b/recipes/nvidia/Cosmos3-Nano.md index 41df35e2883..57f6b983cda 100644 --- a/recipes/nvidia/Cosmos3-Nano.md +++ b/recipes/nvidia/Cosmos3-Nano.md @@ -52,6 +52,9 @@ the mode is selected per request: #### Command +Requires the `vllm-omni` package (or the `vllm/vllm-omni:cosmos3` container), +which provides the `vllm serve … --omni` entrypoint used below. + Safety guardrails are **on by default** (NVIDIA Open Model License). They load the **gated** `nvidia/Cosmos-1.0-Guardrail` model, so to keep them on you must: @@ -126,6 +129,7 @@ curl -sS -X POST http://localhost:8000/v1/videos/sync \ # Text-to-video-with-sound curl -sS -X POST http://localhost:8000/v1/videos/sync \ -H "Accept: video/mp4" \ + -F "model=nvidia/Cosmos3-Nano" \ -F "prompt=The video opens with a view of a well-lit indoor fruit display. A robotic arm picks up a pear, an orange, and a carambola one by one, placing each into a plastic bag in a shopping cart with red handles. The video is 7.875 seconds long, 24 FPS, and 1280x720. Audio description: soft servo whirs, gentle fruit thuds, plastic bag rustling, and a faint refrigeration hum." \ -F "negative_prompt=blurry, distorted, low quality" \ -F "size=1280x720" \ @@ -158,7 +162,9 @@ curl -sS -X POST http://localhost:8000/v1/videos/sync \ 3:4, 9:16. Defaults: T2I 1024², 50 steps, guidance 7.0; T2V/I2V 1280×720, 189 frames, 35 steps, guidance 6.0, `flow_shift=10.0`. 
- **Key flags / params:** `--no-guardrails` (server) or - `extra_params={"guardrails":false}` (per request) toggles safety; + `extra_params={"guardrails":false}` (per request) toggles safety. The + per-request flag only takes effect when the server was launched **with** + guardrails enabled (it cannot re-enable them on a `--no-guardrails` server). `use_resolution_template` / `use_duration_template` are off by default and only needed when not using upsampled prompts that already encode resolution/duration. - **Known limitations:** @@ -194,8 +200,8 @@ def main(): model_class_name="Cosmos3OmniDiffusersPipeline", trust_remote_code=True, enforce_eager=True, - # Keep guardrails on by installing cosmos-guardrail + gated-repo access; - # this disables them for a quick local run. + # Guardrails are disabled here for a quick local run; install + # cosmos-guardrail + gated-repo access and drop this to enable them. model_config={"guardrails": False}, ) gen = torch.Generator(device="cpu").manual_seed(42) diff --git a/recipes/nvidia/Cosmos3-Super.md b/recipes/nvidia/Cosmos3-Super.md index 33d3ec093b9..528b7a77393 100644 --- a/recipes/nvidia/Cosmos3-Super.md +++ b/recipes/nvidia/Cosmos3-Super.md @@ -29,6 +29,9 @@ for audio). ## GPU +Requires the `vllm-omni` package (or the `vllm/vllm-omni:cosmos3` container), +which provides the `vllm serve … --omni` entrypoint used below. + ### 8x H200/H100/A100 (recommended, per model card) ```bash @@ -71,6 +74,24 @@ curl -sS -X POST http://localhost:8000/v1/videos/sync -H "Accept: video/mp4" \ -F "guidance_scale=6.0" -F "max_sequence_length=4096" -F "flow_shift=10.0" \ -F 'extra_params={"use_resolution_template":false,"use_duration_template":false,"guardrails":true}' \ -F "seed=17" -o cosmos3_super_t2v.mp4 + +# I2V — add an uploaded reference image +curl -sS -X POST http://localhost:8000/v1/videos/sync -H "Accept: video/mp4" \ + -F "model=nvidia/Cosmos3-Super" -F "prompt=The scene comes to life with smooth, natural motion." 
\ + -F "size=1280x720" -F "num_frames=189" -F "fps=24" -F "num_inference_steps=35" \ + -F "guidance_scale=6.0" -F "max_sequence_length=4096" -F "flow_shift=10.0" \ + -F 'extra_params={"use_resolution_template":false,"use_duration_template":false,"guardrails":true}' \ + -F "seed=1111" -F "input_reference=@/path/to/reference.jpg;type=image/jpeg" \ + -o cosmos3_super_i2v.mp4 + +# T2V + sound — add generate_sound/sound_duration (output muxes AAC 48 kHz stereo) +curl -sS -X POST http://localhost:8000/v1/videos/sync -H "Accept: video/mp4" \ + -F "model=nvidia/Cosmos3-Super" -F "prompt=A robot arm is cleaning a plate in the kitchen" \ + -F "size=1280x720" -F "num_frames=189" -F "fps=24" -F "num_inference_steps=35" \ + -F "guidance_scale=6.0" -F "max_sequence_length=4096" -F "flow_shift=10.0" \ + -F "generate_sound=true" -F "sound_duration=7.875" \ + -F 'extra_params={"use_resolution_template":false,"use_duration_template":false,"guardrails":true}' \ + -F "seed=17" -o cosmos3_super_t2vs.mp4 ``` #### Notes