Skip to content
Open
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 28 additions & 0 deletions components/src/dynamo/common/utils/video_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,34 @@ def normalize_video_frames(images: list) -> list:
return list(frames)


def normalize_image_frames(images: list) -> list:
"""Normalize stage_output.images into a flat list of PIL Images.

Image diffusion pipelines usually return PIL Images, but some (e.g. the
Cosmos3 native pipeline) return numpy arrays shaped ``[batch, frames, H, W,
C]`` even for single images. Collapse leading batch/frame dims and convert
each frame to a PIL Image; PIL inputs pass through unchanged.
"""
from PIL import Image

out: list = []
for item in images:
if isinstance(item, Image.Image):
out.append(item)
continue
arr = np.asarray(item)
while arr.ndim > 4: # [batch, frames, H, W, C] -> [frames, H, W, C]

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

normalize_image_frames collapses a [B, F, H, W, C] Cosmos3 array by taking arr[0], so image requests with n > 1 silently drop every generated batch after the first. Fix: preserve and flatten all leading batch/frame dimensions before converting frames to PIL images.

🤖 AI Fix

In components/src/dynamo/common/utils/video_utils.py, update normalize_image_frames to replace the while arr.ndim > 4: arr = arr[0] logic with validation that the last three dimensions are H, W, C and arr = arr.reshape((-1, *arr.shape[-3:])) so all [B, F, H, W, C] outputs are emitted.

arr = arr[0]
if arr.dtype != np.uint8: # frames share a dtype/range; convert once
arr = ((arr.clip(0, 1) * 255).round() if arr.max() <= 1.0 else arr).astype(
np.uint8
)
frames = arr if arr.ndim == 4 else arr[None] # -> [N, H, W, C]
for frame in frames:
out.append(Image.fromarray(frame))
return out


def frames_to_numpy(images: list) -> np.ndarray:
"""Convert a list of PIL Images to a numpy array suitable for video encoding.

Expand Down
15 changes: 15 additions & 0 deletions components/src/dynamo/vllm/omni/args.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,6 +187,16 @@ def add_arguments(self, parser) -> None:
default=False,
help="Disable torch.compile and force eager execution for diffusion models.",
)
add_negatable_bool_argument(
g,
flag_name="--cosmos3-guardrails",
env_var="DYN_OMNI_COSMOS3_GUARDRAILS",
default=True,
help=(
"Enable Cosmos3 text/video safety guardrails (loads guardrail models "
"at startup). Use --no-cosmos3-guardrails to disable."
),
)

# TTS parameters
tts_g = parser.add_argument_group(
Expand Down Expand Up @@ -333,6 +343,11 @@ class OmniConfig(DynamoRuntimeConfig):
stage_configs_path: Optional[str] = None
default_video_fps: int = 16

# Cosmos3 safety guardrails. When False, routed into
# od_config.model_config["guardrails"]=False so the diffusion engine skips
# loading the guardrail models (see base_handler._build_omni_kwargs).
cosmos3_guardrails: bool = True

# Nested structs — each group of fields has a clear destination
diffusion: OmniDiffusionKwargs = dataclasses.field(
default_factory=OmniDiffusionKwargs
Expand Down
6 changes: 6 additions & 0 deletions components/src/dynamo/vllm/omni/base_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,12 @@ def _build_omni_kwargs(self, config) -> Dict[str, Any]:
if config.stage_configs_path:
omni_kwargs["stage_configs_path"] = config.stage_configs_path

# Cosmos3 guardrails toggle -> od_config.model_config["guardrails"].
# Mirrors vllm-omni serve's --cosmos3-no-guardrails; when disabled the
# diffusion engine skips loading the guardrail models entirely.
if not config.cosmos3_guardrails:
omni_kwargs["model_config"] = {"guardrails": False}

for field, value in dataclasses.asdict(config.diffusion).items():
if value is not None:
omni_kwargs[field] = value
Expand Down
7 changes: 5 additions & 2 deletions components/src/dynamo/vllm/omni/output_formatter.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,10 @@
from dynamo.common.storage import upload_to_fs
from dynamo.common.utils.engine_response import normalize_finish_reason
from dynamo.common.utils.output_modalities import RequestType
from dynamo.common.utils.video_utils import normalize_video_frames
from dynamo.common.utils.video_utils import (
normalize_image_frames,
normalize_video_frames,
)

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -241,7 +244,7 @@ async def _prepare_images(
self, images: list, request_id: str, response_format: Optional[str] = None
) -> list:
outlist = []
for img in images:
for img in normalize_image_frames(images):
buf = BytesIO()
img.save(buf, format="PNG")
image_bytes = buf.getvalue()
Expand Down
163 changes: 163 additions & 0 deletions docs/backends/vllm/cosmos3.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,163 @@
---
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
title: Cosmos3
---

Run NVIDIA's **Cosmos3** omni model through Dynamo's
[vLLM-Omni backend](vllm-omni.md) for **text-to-image**, **text-to-video**, and
**image-to-video** generation.

Cosmos3 is a unified world foundation model (WFM) for Physical AI, built on a
Mixture-of-Transformers (MoT) architecture. A single `Cosmos3OmniTransformer`
runs a Qwen-style "understanding" stream alongside a "generation" stream
joined by a 3D multimodal RoPE, replacing the separate Predict / Reason /
Transfer models from earlier Cosmos releases. See the
[Cosmos World Foundation Model Platform paper](https://huggingface.co/papers/2501.03575)
for the architectural background, and the
[diffusers Cosmos3 reference](https://huggingface.co/docs/diffusers/main/en/api/pipelines/cosmos3) for the underlying pipeline.

Cosmos3 support in Dynamo is provided by the native vLLM-Omni pipeline added in
[vllm-project/vllm-omni#3454](https://github.com/vllm-project/vllm-omni/pull/3454).

## Checkpoints

Both checkpoints share the same `Cosmos3OmniPipeline` class and Dynamo flags;
swap the model identifier on the worker (`--model …`) and in request payloads.

| Checkpoint | Description | HF Hub |
|------------|-------------|--------|
| `nvidia/Cosmos3-Nano` | Smaller, faster — default in this repo's launch scripts | [Cosmos3-Nano](https://huggingface.co/nvidia/Cosmos3-Nano) |
| `nvidia/Cosmos3-Super` | Larger, higher quality | [Cosmos3-Super](https://huggingface.co/nvidia/Cosmos3-Super) |
Comment on lines +31 to +32

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor | ⚡ Quick win

Use descriptive link labels for checkpoint URLs.

[link] is too generic and already flagged by markdownlint. Use labels like Cosmos3-Nano / Cosmos3-Super.

As per coding guidelines, for **/*.md documentation quality should be maintained; replacing non-descriptive link text improves clarity and lint compliance.

🧰 Tools
🪛 markdownlint-cli2 (0.22.1)

[warning] 31-31: Link text should be descriptive

(MD059, descriptive-link-text)


[warning] 32-32: Link text should be descriptive

(MD059, descriptive-link-text)

🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@docs/backends/vllm/cosmos3.md` around lines 31 - 32, Replace the generic link
text "[link]" with descriptive labels matching the checkpoint names so the table
rows for `nvidia/Cosmos3-Nano` and `nvidia/Cosmos3-Super` use link text like
"Cosmos3-Nano" and "Cosmos3-Super" respectively; update the markdown links in
the table to read `[Cosmos3-Nano](https://huggingface.co/nvidia/Cosmos3-Nano)`
and `[Cosmos3-Super](https://huggingface.co/nvidia/Cosmos3-Super)` so linting
passes and the labels clearly identify the checkpoints.

⚠️ Potential issue | 🟠 Major | ⚡ Quick win

Fix the checkpoint links that currently fail docs link-check CI.

The current Hugging Face checkpoint URLs are failing lychee with 401, which blocks docs checks. Please switch these to URLs that pass CI (or update the docs-link-check allowlist for these exact domains/statuses).

🧰 Tools
🪛 markdownlint-cli2 (0.22.1)

[warning] 31-31: Link text should be descriptive

(MD059, descriptive-link-text)


[warning] 32-32: Link text should be descriptive

(MD059, descriptive-link-text)

🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@docs/backends/vllm/cosmos3.md` around lines 31 - 32, The HF checkpoint links
for the model names `nvidia/Cosmos3-Nano` and `nvidia/Cosmos3-Super` in the docs
page are returning 401 in link-check CI; update the two Markdown link targets
for those model entries so they point to publicly accessible URLs that pass docs
link-check (for example swap the current https://huggingface.co/... checkpoint
links for the public model hub pages or an official NVIDIA/public docs page), or
alternatively add those exact HF URLs/statuses to the docs-link-check allowlist;
change the two link targets referenced alongside the `nvidia/Cosmos3-Nano` and
`nvidia/Cosmos3-Super` entries to the new URLs or add them to the allowlist so
CI no longer fails.


## Supported modalities

| Task | Endpoint | `--output-modalities` |
|------|----------|-----------------------|
| Text-to-Image | `/v1/images/generations` | `image` |
| Text-to-Video | `/v1/videos` | `video` |
| Image-to-Video | `/v1/videos` (with `input_reference`) | `video` |

## Prerequisites

This guide builds on the [vLLM-Omni backend guide](vllm-omni.md) — see it for general setup, `etcd`/`nats`, and OpenAI-endpoint details.

### Installation

This branch carries Dynamo code changes (the Cosmos3 worker flags and image
output handling) on top of a pinned vLLM-Omni, so run Dynamo **from source on
this branch** — a released `ai-dynamo` wheel will not include the integration.

1. Clone and check out the branch:

```bash
git clone https://github.com/ai-dynamo/dynamo.git
cd dynamo
git checkout cosmos3-omni-integration
```

2. Create a Python 3.12 environment:

```bash
uv venv --python 3.12 --seed
source .venv/bin/activate
```

3. Build and install Dynamo from source (the branch's Cosmos3 code must be
live, and the Rust core `ai-dynamo-runtime` isn't published for this dev
version, so it has to be built locally). See
[Building from source](../../getting-started/building-from-source.md) for
prerequisites (Rust toolchain, system deps); the key steps from the repo root:

```bash
uv pip install pip maturin
(cd lib/bindings/python && maturin develop --uv) # builds ai-dynamo-runtime
uv pip install -e lib/gpu_memory_service
uv pip install -e ".[vllm]" # also pulls vllm==0.21.0
```

4. Install the Cosmos3-capable vLLM-Omni, pinned to the PR commit (its dynamic
`setup.py` pulls the matching pipeline deps — `diffusers==0.38`, `torchsde`,
`x-transformers`):

```bash
uv pip install "vllm-omni @ git+https://github.com/vllm-project/vllm-omni.git@e826f626afb47c8c3c39ccf892ed247f442f6bd2"
```

5. Start etcd and NATS:

```bash
docker compose -f dev/docker-compose.yml up -d
```

## Serve

Quick start — each script launches the frontend on `:8000` plus a
single-modality worker and prints a sample request:

```bash
examples/backends/vllm/launch/agg_omni_cosmos3_image.sh # text-to-image
examples/backends/vllm/launch/agg_omni_cosmos3_video.sh # text-to-video
examples/backends/vllm/launch/agg_omni_cosmos3_i2v.sh # image-to-video
```

Manual launch:

```bash
python -m dynamo.frontend --http-port 8000 &

# Set --output-modalities to "image" or "video".
# --no-cosmos3-guardrails skips loading the safety guardrail models.
python -m dynamo.vllm.omni \
--model nvidia/Cosmos3-Nano \
--output-modalities image \
--no-cosmos3-guardrails \
--media-output-fs-url file:///tmp/dynamo_media
Comment on lines +112 to +114

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major | ⚡ Quick win

The multiline shell example is not copy/paste-safe.

The inline comments after line-continuation backslashes break the command. Move those comments to separate lines (or provide separate command variants) so the snippet executes as documented.

🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@docs/backends/vllm/cosmos3.md` around lines 112 - 114, The multiline shell
snippet in the docs uses backslash line continuations with inline comments after
the backslashes which breaks copy/paste execution; update the example around the
flags --output-modalities, --no-cosmos3-guardrails, and --media-output-fs-url so
comments are on their own lines (or provide separate full-command variants)
instead of trailing the backslashes, ensuring each continued line ends only with
the backslash and the flag text so the shell command is copy/paste-safe.

```

Cosmos3-specific flags:

| Flag | Purpose |
|------|---------|
| `--no-cosmos3-guardrails` | Disable the Cosmos3 text/video safety guardrails (otherwise loaded at startup). |
| `--flow-shift <float>` | Scheduler flow-shift (image default `3.0`). Launch-time only — not a per-request image parameter. |
| `--media-output-fs-url file://<dir>` | Destination for media when `response_format: "url"`. |

## Requests

### Text-to-image

Run from the repo root; `cosmos3/t2i.json` is the official Cosmos3 t2i payload
(prompt verbatim) mapped to the Dynamo request schema:

```bash
curl -s -X POST http://localhost:8000/v1/images/generations \
-H 'Content-Type: application/json' \
--data-binary @examples/backends/vllm/launch/cosmos3/t2i.json \
| jq -r '.data[0].b64_json' | base64 -d > out.png
```

- `size` must be one of `256x256`, `512x512`, `1024x1024`, `1792x1024`,
`1024x1792`, `1536x1024`, `1024x1536`, `auto` — the payload uses `1024x1024`
(the official `960x960` is not an allowed image size).
- Put `num_inference_steps`, `guidance_scale`, `seed`, and `negative_prompt`
under `nvext` — top-level values are ignored.

### Text-to-video

```bash
curl -s http://localhost:8000/v1/videos \
-H 'Content-Type: application/json' \
--data-binary @examples/backends/vllm/launch/cosmos3/t2v.json | jq
```

The official `t2v.json` payload is `1280x720`, `192` frames @ `24` fps (8s).

### Image-to-video

`i2v.json` adds `input_reference`, which carries the official payload's
`vision_path` as an http URL. Local file paths are rejected; supply an http(s)
URL or a base64 `data:` URI:

```bash
curl -s http://localhost:8000/v1/videos \
-H 'Content-Type: application/json' \
--data-binary @examples/backends/vllm/launch/cosmos3/i2v.json | jq
```
63 changes: 63 additions & 0 deletions examples/backends/vllm/launch/agg_omni_cosmos3_i2v.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Aggregated Cosmos3 image-to-video generation (1 GPU).
# Same worker as text-to-video (registers the "video" modality); i2v is driven
# by adding "input_reference" to the /v1/videos request. The image loader
# rejects local file paths — pass a data: URI (base64) or an http(s) URL.
# --no-cosmos3-guardrails skips loading the safety guardrail models.

set -e
trap 'echo Cleaning up...; kill 0' EXIT

SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../common/gpu_utils.sh"
source "$SCRIPT_DIR/../../../common/launch_utils.sh"

MODEL="nvidia/Cosmos3-Nano"

# Parse command line arguments
EXTRA_ARGS=()
while [[ $# -gt 0 ]]; do
case $1 in
--model)
MODEL="$2"
shift 2
;;
*)
EXTRA_ARGS+=("$1")
shift
;;
esac
done

HTTP_PORT="${DYN_HTTP_PORT:-8000}"
GPU_MEM_ARGS=$(build_vllm_gpu_mem_args)
print_launch_banner --no-curl "Launching vLLM-Omni Cosmos3 Image-to-Video (1 GPU)" "$MODEL" "$HTTP_PORT"
print_curl_footer <<CURL
# Official Cosmos3 image-to-video payload (prompt + vision_path verbatim).
# input_reference must be an http(s) URL or a data: URI (local paths are rejected).
curl -s http://localhost:${HTTP_PORT}/v1/videos \\
-H 'Content-Type: application/json' \\
--data-binary @${SCRIPT_DIR}/cosmos3/i2v.json | jq
CURL


python -m dynamo.frontend &
FRONTEND_PID=$!

sleep 2

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major | ⚡ Quick win

Remove fixed readiness sleep and use shared health-check orchestration.

sleep 2 is a fragile startup gate and violates the launch-script convention for readiness handling.

As per coding guidelines, launch scripts should “Avoid readiness sleeps/polls; rely on the shared framework health-check patterns instead.”

🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@examples/backends/vllm/launch/agg_omni_cosmos3_i2v.sh` at line 50, Remove the
fragile fixed wait ("sleep 2") from agg_omni_cosmos3_i2v.sh and replace it with
the project’s shared health-check orchestration: remove the "sleep 2" line and
invoke the centralized readiness check (use the launch framework's health-check
helper or wait-for-ready wrapper used by other launch scripts) to block until
the service reports healthy; ensure you call the same health-check entrypoint
used elsewhere in the repo so the script follows the launch-script convention
for readiness handling.


echo "Starting Omni worker..."
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \
python -m dynamo.vllm.omni \
--model "$MODEL" \
--output-modalities video \
--no-cosmos3-guardrails \
--media-output-fs-url file:///tmp/dynamo_media \
$GPU_MEM_ARGS \
"${EXTRA_ARGS[@]}" &

# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit
60 changes: 60 additions & 0 deletions examples/backends/vllm/launch/agg_omni_cosmos3_image.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Aggregated Cosmos3 text-to-image generation (1 GPU).
# Uses the native vLLM-Omni Cosmos3 pipeline; --no-cosmos3-guardrails skips
# loading the safety guardrail models. A worker serves a single modality, so
# this script registers the "image" modality (see agg_omni_cosmos3_video.sh
# for text-to-video).

set -e
trap 'echo Cleaning up...; kill 0' EXIT

SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../common/launch_utils.sh"

Comment on lines +15 to +16

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major | ⚡ Quick win

Align this launcher with shared vLLM GPU-memory utilities.

This script skips gpu_utils.sh and does not use build_vllm_gpu_mem_args, so users can’t control VRAM behavior consistently with the other Cosmos3 launchers.

As per coding guidelines, launchers should source gpu_utils.sh and “Use build_vllm_gpu_mem_args() to construct GPU memory CLI flags for vLLM.”

Also applies to: 50-57

🧰 Tools
🪛 Shellcheck (0.11.0)

[info] 15-15: Not following: ./../../../common/launch_utils.sh was not specified as input (see shellcheck -x).

(SC1091)

🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@examples/backends/vllm/launch/agg_omni_cosmos3_image.sh` around lines 15 -
16, The launcher missing gpu_utils integration should source gpu_utils.sh (via
SCRIPT_DIR/../../../common/gpu_utils.sh) and use build_vllm_gpu_mem_args() when
constructing the vLLM CLI invocation in agg_omni_cosmos3_image.sh; update the
script to source gpu_utils.sh near the other shared utils and insert the output
of build_vllm_gpu_mem_args into the vLLM/vllm-server command-line assembly so
GPU memory flags are consistent with other Cosmos3 launchers.

MODEL="nvidia/Cosmos3-Nano"

# Parse command line arguments
EXTRA_ARGS=()
while [[ $# -gt 0 ]]; do
case $1 in
--model)
MODEL="$2"
shift 2
;;
*)
EXTRA_ARGS+=("$1")
shift
;;
esac
done

HTTP_PORT="${DYN_HTTP_PORT:-8000}"
print_launch_banner --no-curl "Launching vLLM-Omni Cosmos3 Image Generation (1 GPU)" "$MODEL" "$HTTP_PORT"
print_curl_footer <<CURL
# Official Cosmos3 text-to-image payload (prompt verbatim)
curl -s -X POST http://localhost:${HTTP_PORT}/v1/images/generations \\
-H 'Content-Type: application/json' \\
--data-binary @${SCRIPT_DIR}/cosmos3/t2i.json \\
| jq -r '.data[0].b64_json' | base64 -d > t2i.png
CURL


python -m dynamo.frontend &
FRONTEND_PID=$!

sleep 2

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major | ⚡ Quick win

Replace fixed startup sleep with framework readiness handling.

Using sleep 2 introduces flaky startup behavior across machines.

As per coding guidelines, launch scripts should “Avoid readiness sleeps/polls; rely on the shared framework health-check patterns instead.”

🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@examples/backends/vllm/launch/agg_omni_cosmos3_image.sh` at line 48, Replace
the fixed "sleep 2" with a proper readiness check: remove the "sleep 2" line and
instead call the shared framework health-check/ready helper (e.g., a common
script or function like wait_for_framework_ready or check_framework_health) in a
loop with a timeout and non-zero exit if not ready; ensure the script waits for
the specific service(s) the launch depends on and logs progress/errors so
startup is deterministic and not flaky.


echo "Starting Omni worker..."
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \
python -m dynamo.vllm.omni \
--model "$MODEL" \
--output-modalities image \
--no-cosmos3-guardrails \
--media-output-fs-url file:///tmp/dynamo_media \
"${EXTRA_ARGS[@]}" &

# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit
Loading
Loading