ai-dynamo · ayushag-nv · May 29, 2026 · May 29, 2026 · May 29, 2026 · May 29, 2026
@@ -82,6 +82,34 @@ def normalize_video_frames(images: list) -> list:
     return list(frames)
 
 
+def normalize_image_frames(images: list) -> list:
+    """Normalize stage_output.images into a flat list of PIL Images.
+
+    Image diffusion pipelines usually return PIL Images, but some (e.g. the
+    Cosmos3 native pipeline) return numpy arrays shaped ``[batch, frames, H, W,
+    C]`` even for single images. Collapse leading batch/frame dims and convert
+    each frame to a PIL Image; PIL inputs pass through unchanged.
+    """
+    from PIL import Image
+
+    out: list = []
+    for item in images:
+        if isinstance(item, Image.Image):
+            out.append(item)
+            continue
+        arr = np.asarray(item)
+        while arr.ndim > 4:  # [batch, frames, H, W, C] -> [frames, H, W, C]
+            arr = arr[0]
+        if arr.dtype != np.uint8:  # frames share a dtype/range; convert once
+            arr = ((arr.clip(0, 1) * 255).round() if arr.max() <= 1.0 else arr).astype(
+                np.uint8
+            )
+        frames = arr if arr.ndim == 4 else arr[None]  # -> [N, H, W, C]
+        for frame in frames:
+            out.append(Image.fromarray(frame))
+    return out
+
+
 def frames_to_numpy(images: list) -> np.ndarray:
     """Convert a list of PIL Images to a numpy array suitable for video encoding.
 

@@ -187,6 +187,16 @@ def add_arguments(self, parser) -> None:
             default=False,
             help="Disable torch.compile and force eager execution for diffusion models.",
         )
+        add_negatable_bool_argument(
+            g,
+            flag_name="--cosmos3-guardrails",
+            env_var="DYN_OMNI_COSMOS3_GUARDRAILS",
+            default=True,
+            help=(
+                "Enable Cosmos3 text/video safety guardrails (loads guardrail models "
+                "at startup). Use --no-cosmos3-guardrails to disable."
+            ),
+        )
 
         # TTS parameters
         tts_g = parser.add_argument_group(
@@ -333,6 +343,11 @@ class OmniConfig(DynamoRuntimeConfig):
     stage_configs_path: Optional[str] = None
     default_video_fps: int = 16
 
+    # Cosmos3 safety guardrails. When False, routed into
+    # od_config.model_config["guardrails"]=False so the diffusion engine skips
+    # loading the guardrail models (see base_handler._build_omni_kwargs).
+    cosmos3_guardrails: bool = True
+
     # Nested structs — each group of fields has a clear destination
     diffusion: OmniDiffusionKwargs = dataclasses.field(
         default_factory=OmniDiffusionKwargs

@@ -77,6 +77,12 @@ def _build_omni_kwargs(self, config) -> Dict[str, Any]:
         if config.stage_configs_path:
             omni_kwargs["stage_configs_path"] = config.stage_configs_path
 
+        # Cosmos3 guardrails toggle -> od_config.model_config["guardrails"].
+        # Mirrors vllm-omni serve's --cosmos3-no-guardrails; when disabled the
+        # diffusion engine skips loading the guardrail models entirely.
+        if not config.cosmos3_guardrails:
+            omni_kwargs["model_config"] = {"guardrails": False}
+
         for field, value in dataclasses.asdict(config.diffusion).items():
             if value is not None:
                 omni_kwargs[field] = value

@@ -28,7 +28,10 @@
 from dynamo.common.storage import upload_to_fs
 from dynamo.common.utils.engine_response import normalize_finish_reason
 from dynamo.common.utils.output_modalities import RequestType
-from dynamo.common.utils.video_utils import normalize_video_frames
+from dynamo.common.utils.video_utils import (
+    normalize_image_frames,
+    normalize_video_frames,
+)
 
 logger = logging.getLogger(__name__)
 
@@ -241,7 +244,7 @@ async def _prepare_images(
         self, images: list, request_id: str, response_format: Optional[str] = None
     ) -> list:
         outlist = []
-        for img in images:
+        for img in normalize_image_frames(images):
             buf = BytesIO()
             img.save(buf, format="PNG")
             image_bytes = buf.getvalue()

@@ -0,0 +1,163 @@
+---
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+title: Cosmos3
+---
+
+Run NVIDIA's **Cosmos3** omni model through Dynamo's
+[vLLM-Omni backend](vllm-omni.md) for **text-to-image**, **text-to-video**, and
+**image-to-video** generation.
+
+Cosmos3 is a unified world foundation model (WFM) for Physical AI, built on a
+Mixture-of-Transformers (MoT) architecture. A single `Cosmos3OmniTransformer`
+runs a Qwen-style "understanding" stream alongside a "generation" stream
+joined by a 3D multimodal RoPE, replacing the separate Predict / Reason /
+Transfer models from earlier Cosmos releases. See the
+[Cosmos World Foundation Model Platform paper](https://huggingface.co/papers/2501.03575)
+for the architectural background, and the
+[diffusers Cosmos3 reference](https://huggingface.co/docs/diffusers/main/en/api/pipelines/cosmos3) for the underlying pipeline.
+
+Cosmos3 support in Dynamo is provided by the native vLLM-Omni pipeline added in
+[vllm-project/vllm-omni#3454](https://github.com/vllm-project/vllm-omni/pull/3454).
+
+## Checkpoints
+
+Both checkpoints share the same `Cosmos3OmniPipeline` class and Dynamo flags;
+swap the model identifier on the worker (`--model …`) and in request payloads.
+
+| Checkpoint | Description | HF Hub |
+|------------|-------------|--------|
+| `nvidia/Cosmos3-Nano` | Smaller, faster — default in this repo's launch scripts | [link](https://huggingface.co/nvidia/Cosmos3-Nano) |
+| `nvidia/Cosmos3-Super` | Larger, higher quality | [link](https://huggingface.co/nvidia/Cosmos3-Super) |
+
+## Supported modalities
+
+| Task | Endpoint | `--output-modalities` |
+|------|----------|-----------------------|
+| Text-to-Image | `/v1/images/generations` | `image` |
+| Text-to-Video | `/v1/videos` | `video` |
+| Image-to-Video | `/v1/videos` (with `input_reference`) | `video` |
+
+## Prerequisites
+
+This guide builds on the [vLLM-Omni backend guide](vllm-omni.md) — see it for general setup, `etcd`/`nats`, and OpenAI-endpoint details.
+
+### Installation
+
+This branch carries Dynamo code changes (the Cosmos3 worker flags and image
+output handling) on top of a pinned vLLM-Omni, so run Dynamo **from source on
+this branch** — a released `ai-dynamo` wheel will not include the integration.
+
+1. Clone and check out the branch:
+
+   ```bash
+   git clone https://github.com/ai-dynamo/dynamo.git
+   cd dynamo
+   git checkout cosmos3-omni-integration
+   ```
+
+2. Create a Python 3.12 environment:
+
+   ```bash
+   uv venv --python 3.12 --seed
+   source .venv/bin/activate
+   ```
+
+3. Build and install Dynamo from source (the branch's Cosmos3 code must be
+   live, and the Rust core `ai-dynamo-runtime` isn't published for this dev
+   version, so it has to be built locally). See
+   [Building from source](../../getting-started/building-from-source.md) for
+   prerequisites (Rust toolchain, system deps); the key steps from the repo root:
+
+   ```bash
+   uv pip install pip maturin
+   (cd lib/bindings/python && maturin develop --uv)   # builds ai-dynamo-runtime
+   uv pip install -e lib/gpu_memory_service
+   uv pip install -e ".[vllm]"                         # also pulls vllm==0.21.0
+   ```
+
+4. Install the Cosmos3-capable vLLM-Omni, pinned to the PR commit (its dynamic
+   `setup.py` pulls the matching pipeline deps — `diffusers==0.38`, `torchsde`,
+   `x-transformers`):
+
+   ```bash
+   uv pip install "vllm-omni @ git+https://github.com/vllm-project/vllm-omni.git@e826f626afb47c8c3c39ccf892ed247f442f6bd2"
+   ```
+
+5. Start etcd and NATS:
+
+   ```bash
+   docker compose -f dev/docker-compose.yml up -d
+   ```
+
+## Serve
+
+Quick start — each script launches the frontend on `:8000` plus a
+single-modality worker and prints a sample request:
+
+```bash
+examples/backends/vllm/launch/agg_omni_cosmos3_image.sh   # text-to-image
+examples/backends/vllm/launch/agg_omni_cosmos3_video.sh   # text-to-video
+examples/backends/vllm/launch/agg_omni_cosmos3_i2v.sh     # image-to-video
+```
+
+Manual launch:
+
+```bash
+python -m dynamo.frontend --http-port 8000 &
+
+# --output-modalities: image (or: video)
+# --no-cosmos3-guardrails: skip loading the safety guardrail models
+python -m dynamo.vllm.omni \
+    --model nvidia/Cosmos3-Nano \
+    --output-modalities image \
+    --no-cosmos3-guardrails \
+    --media-output-fs-url file:///tmp/dynamo_media
+```
+
+Cosmos3-specific flags:
+
+| Flag | Purpose |
+|------|---------|
+| `--no-cosmos3-guardrails` | Disable the Cosmos3 text/video safety guardrails (otherwise loaded at startup). |
+| `--flow-shift <float>` | Scheduler flow-shift (image default `3.0`). Launch-time only — not a per-request image parameter. |
+| `--media-output-fs-url file://<dir>` | Destination for media when `response_format: "url"`. |
+
+## Requests
+
+### Text-to-image
+
+Run from the repo root; `cosmos3/t2i.json` is the official Cosmos3 t2i payload
+(prompt verbatim) mapped to the Dynamo request schema:
+
+```bash
+curl -s -X POST http://localhost:8000/v1/images/generations \
+  -H 'Content-Type: application/json' \
+  --data-binary @examples/backends/vllm/launch/cosmos3/t2i.json \
+  | jq -r '.data[0].b64_json' | base64 -d > out.png
+```
+
+- `size` must be one of `256x256`, `512x512`, `1024x1024`, `1792x1024`,
+  `1024x1792`, `1536x1024`, `1024x1536`, `auto` — the payload uses `1024x1024`
+  (the official `960x960` is not an allowed image size).
+- Put `num_inference_steps`, `guidance_scale`, `seed`, and `negative_prompt`
+  under `nvext` — top-level values are ignored.
+
+### Text-to-video
+
+```bash
+curl -s http://localhost:8000/v1/videos \
+  -H 'Content-Type: application/json' \
+  --data-binary @examples/backends/vllm/launch/cosmos3/t2v.json | jq
+```
+
+The official `t2v.json` payload is `1280x720`, `192` frames @ `24` fps (8s).
+
+### Image-to-video
+
+`i2v.json` adds `input_reference`, set to the official payload's `vision_path`
+(an http URL). Local paths are rejected; use an http(s) URL or a `data:`
+base64 URI:
+
+```bash
+curl -s http://localhost:8000/v1/videos \
+  -H 'Content-Type: application/json' \
+  --data-binary @examples/backends/vllm/launch/cosmos3/i2v.json | jq
+```
@@ -0,0 +1,63 @@
+#!/bin/bash
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Aggregated Cosmos3 image-to-video generation (1 GPU).
+# Same worker as text-to-video (registers the "video" modality); i2v is driven
+# by adding "input_reference" to the /v1/videos request. The image loader
+# rejects local file paths — pass a data: URI (base64) or an http(s) URL.
+# --no-cosmos3-guardrails skips loading the safety guardrail models.
+#
+# Usage: agg_omni_cosmos3_i2v.sh [--model <id>] [extra worker args...]
+
+# Abort on the first failing command. On any exit, signal the whole process
+# group (kill 0) so the background frontend and worker are stopped too.
+set -e
+trap 'echo Cleaning up...; kill 0' EXIT
+
+# Resolve the script's real directory (symlinks followed) so the relative
+# paths below work from any working directory.
+SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
+# build_vllm_gpu_mem_args, print_launch_banner, print_curl_footer and
+# wait_any_exit are not defined in this script; presumably they come from
+# these two sourced files — confirm.
+source "$SCRIPT_DIR/../../../common/gpu_utils.sh"
+source "$SCRIPT_DIR/../../../common/launch_utils.sh"
+
+# Default checkpoint; override with --model <id>.
+MODEL="nvidia/Cosmos3-Nano"
+
+# Parse command line arguments
+# --model is consumed here; every other argument is collected and forwarded
+# unchanged to the worker command line.
+EXTRA_ARGS=()
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        --model)
+            MODEL="$2"
+            shift 2
+            ;;
+        *)
+            EXTRA_ARGS+=("$1")
+            shift
+            ;;
+    esac
+done
+
+# Port used in the banner and sample request; 8000 unless DYN_HTTP_PORT is set.
+HTTP_PORT="${DYN_HTTP_PORT:-8000}"
+GPU_MEM_ARGS=$(build_vllm_gpu_mem_args)
+print_launch_banner --no-curl "Launching vLLM-Omni Cosmos3 Image-to-Video (1 GPU)" "$MODEL" "$HTTP_PORT"
+# The heredoc delimiter is unquoted, so ${HTTP_PORT} and ${SCRIPT_DIR} expand
+# here and each "\\" reaches print_curl_footer as a single backslash.
+print_curl_footer <<CURL
+# Official Cosmos3 image-to-video payload (prompt + vision_path verbatim).
+# input_reference must be an http(s) URL or a data: URI (local paths are rejected).
+curl -s http://localhost:${HTTP_PORT}/v1/videos \\
+  -H 'Content-Type: application/json' \\
+  --data-binary @${SCRIPT_DIR}/cosmos3/i2v.json | jq
+CURL
+
+
+# Start the frontend in the background.
+# NOTE(review): FRONTEND_PID is not referenced again in this script — confirm
+# whether a sourced helper reads it.
+python -m dynamo.frontend &
+FRONTEND_PID=$!
+
+# Fixed delay before launching the worker (presumably to let the frontend
+# come up first).
+sleep 2
+
+echo "Starting Omni worker..."
+# DYN_SYSTEM_PORT (default 8081) is set for the worker process only.
+# $GPU_MEM_ARGS is intentionally unquoted so it word-splits into separate
+# flags and disappears when empty.
+DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \
+    python -m dynamo.vllm.omni \
+    --model "$MODEL" \
+    --output-modalities video \
+    --no-cosmos3-guardrails \
+    --media-output-fs-url file:///tmp/dynamo_media \
+    $GPU_MEM_ARGS \
+    "${EXTRA_ARGS[@]}" &
+
+# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
+wait_any_exit
@@ -0,0 +1,60 @@
+#!/bin/bash
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Aggregated Cosmos3 text-to-image generation (1 GPU).
+# Uses the native vLLM-Omni Cosmos3 pipeline; --no-cosmos3-guardrails skips
+# loading the safety guardrail models. A worker serves a single modality, so
+# this script registers the "image" modality (see agg_omni_cosmos3_video.sh
+# for text-to-video).
+#
+# Usage: agg_omni_cosmos3_image.sh [--model <id>] [extra worker args...]
+
+# Abort on the first failing command. On any exit, signal the whole process
+# group (kill 0) so the background frontend and worker are stopped too.
+set -e
+trap 'echo Cleaning up...; kill 0' EXIT
+
+# Resolve the script's real directory (symlinks followed) so the relative
+# paths below work from any working directory.
+SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
+# print_launch_banner, print_curl_footer and wait_any_exit are not defined in
+# this script; presumably they come from this sourced file — confirm.
+source "$SCRIPT_DIR/../../../common/launch_utils.sh"
+
+# Default checkpoint; override with --model <id>.
+MODEL="nvidia/Cosmos3-Nano"
+
+# Parse command line arguments
+# --model is consumed here; every other argument is collected and forwarded
+# unchanged to the worker command line.
+EXTRA_ARGS=()
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        --model)
+            MODEL="$2"
+            shift 2
+            ;;
+        *)
+            EXTRA_ARGS+=("$1")
+            shift
+            ;;
+    esac
+done
+
+# Port used in the banner and sample request; 8000 unless DYN_HTTP_PORT is set.
+HTTP_PORT="${DYN_HTTP_PORT:-8000}"
+print_launch_banner --no-curl "Launching vLLM-Omni Cosmos3 Image Generation (1 GPU)" "$MODEL" "$HTTP_PORT"
+# The heredoc delimiter is unquoted, so ${HTTP_PORT} and ${SCRIPT_DIR} expand
+# here and each "\\" reaches print_curl_footer as a single backslash.
+print_curl_footer <<CURL
+# Official Cosmos3 text-to-image payload (prompt verbatim)
+curl -s -X POST http://localhost:${HTTP_PORT}/v1/images/generations \\
+  -H 'Content-Type: application/json' \\
+  --data-binary @${SCRIPT_DIR}/cosmos3/t2i.json \\
+  | jq -r '.data[0].b64_json' | base64 -d > t2i.png
+CURL
+
+
+# Start the frontend in the background.
+# NOTE(review): FRONTEND_PID is not referenced again in this script — confirm
+# whether a sourced helper reads it.
+python -m dynamo.frontend &
+FRONTEND_PID=$!
+
+# Fixed delay before launching the worker (presumably to let the frontend
+# come up first).
+sleep 2
+
+echo "Starting Omni worker..."
+# DYN_SYSTEM_PORT (default 8081) is set for the worker process only.
+DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \
+    python -m dynamo.vllm.omni \
+    --model "$MODEL" \
+    --output-modalities image \
+    --no-cosmos3-guardrails \
+    --media-output-fs-url file:///tmp/dynamo_media \
+    "${EXTRA_ARGS[@]}" &
+
+# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
+wait_any_exit