ai-dynamo · ayushag-nv · May 29, 2026 · May 29, 2026 · May 29, 2026 · May 29, 2026
@@ -82,6 +82,34 @@ def normalize_video_frames(images: list) -> list:
     return list(frames)
 
 
+def normalize_image_frames(images: list) -> list:
+    """Normalize stage_output.images into a flat list of PIL Images.
+
+    Image diffusion pipelines usually return PIL Images, but some (e.g. the
+    Cosmos3 native pipeline) return numpy arrays shaped ``[batch, frames, H, W,
+    C]`` even for single images. Collapse leading batch/frame dims and convert
+    each frame to a PIL Image; PIL inputs pass through unchanged.
+    """
+    from PIL import Image
+
+    out: list = []
+    for item in images:
+        if isinstance(item, Image.Image):
+            out.append(item)
+            continue
+        arr = np.asarray(item)
+        while arr.ndim > 4:  # [batch, frames, H, W, C] -> [frames, H, W, C]
+            arr = arr[0]
+        if arr.dtype != np.uint8:  # frames share a dtype/range; convert once
+            arr = ((arr.clip(0, 1) * 255).round() if arr.max() <= 1.0 else arr).astype(
+                np.uint8
+            )
+        frames = arr if arr.ndim == 4 else arr[None]  # -> [N, H, W, C]
+        for frame in frames:
+            out.append(Image.fromarray(frame))
+    return out
+
+
 def frames_to_numpy(images: list) -> np.ndarray:
     """Convert a list of PIL Images to a numpy array suitable for video encoding.
 

@@ -187,6 +187,16 @@ def add_arguments(self, parser) -> None:
             default=False,
             help="Disable torch.compile and force eager execution for diffusion models.",
         )
+        add_negatable_bool_argument(
+            g,
+            flag_name="--cosmos3-guardrails",
+            env_var="DYN_OMNI_COSMOS3_GUARDRAILS",
+            default=True,
+            help=(
+                "Enable Cosmos3 text/video safety guardrails (loads guardrail models "
+                "at startup). Use --no-cosmos3-guardrails to disable."
+            ),
+        )
 
         # TTS parameters
         tts_g = parser.add_argument_group(
@@ -333,6 +343,11 @@ class OmniConfig(DynamoRuntimeConfig):
     stage_configs_path: Optional[str] = None
     default_video_fps: int = 16
 
+    # Cosmos3 safety guardrails. When False, routed into
+    # od_config.model_config["guardrails"]=False so the diffusion engine skips
+    # loading the guardrail models (see base_handler._build_omni_kwargs).
+    cosmos3_guardrails: bool = True
+
     # Nested structs — each group of fields has a clear destination
     diffusion: OmniDiffusionKwargs = dataclasses.field(
         default_factory=OmniDiffusionKwargs

@@ -77,6 +77,12 @@ def _build_omni_kwargs(self, config) -> Dict[str, Any]:
         if config.stage_configs_path:
             omni_kwargs["stage_configs_path"] = config.stage_configs_path
 
+        # Cosmos3 guardrails toggle -> od_config.model_config["guardrails"].
+        # Mirrors vllm-omni serve's --cosmos3-no-guardrails; when disabled the
+        # diffusion engine skips loading the guardrail models entirely.
+        if not config.cosmos3_guardrails:
+            omni_kwargs["model_config"] = {"guardrails": False}
+
         for field, value in dataclasses.asdict(config.diffusion).items():
             if value is not None:
                 omni_kwargs[field] = value

@@ -28,7 +28,10 @@
 from dynamo.common.storage import upload_to_fs
 from dynamo.common.utils.engine_response import normalize_finish_reason
 from dynamo.common.utils.output_modalities import RequestType
-from dynamo.common.utils.video_utils import normalize_video_frames
+from dynamo.common.utils.video_utils import (
+    normalize_image_frames,
+    normalize_video_frames,
+)
 
 logger = logging.getLogger(__name__)
 
@@ -241,7 +244,7 @@ async def _prepare_images(
         self, images: list, request_id: str, response_format: Optional[str] = None
     ) -> list:
         outlist = []
-        for img in images:
+        for img in normalize_image_frames(images):
             buf = BytesIO()
             img.save(buf, format="PNG")
             image_bytes = buf.getvalue()

@@ -63,7 +63,12 @@ vllm:
     base_image_tag: 22.04
     runtime_image_tag: v0.21.0
   flashinf_ref: v0.6.8.post1
-  vllm_omni_ref: "v0.21.0rc1"
+  # Cosmos3 support is not yet in a released vllm-omni; install from the
+  # canonical repo pinned to the PR head commit (vllm-project/vllm-omni#3454,
+  # == refs/pull/3454/head). When vllm_omni_git_url is set, install_vllm_omni.sh
+  # installs from git@ref; otherwise it falls back to "vllm-omni==<ref>".
+  vllm_omni_ref: "e826f626afb47c8c3c39ccf892ed247f442f6bd2"
+  vllm_omni_git_url: "https://github.com/vllm-project/vllm-omni.git"
   nixl_ref: v1.1.0
   max_jobs: "10"
   enable_media_ffmpeg: "false"

@@ -9,7 +9,15 @@ set -euo pipefail
 VLLM_OMNI_PROTECTED_PACKAGES_FILE="${VLLM_OMNI_PROTECTED_PACKAGES_FILE:-/tmp/vllm_omni_protected_packages.txt}"
 
 PROTECTED_CONSTRAINTS="$(mktemp /tmp/vllm-openai-protected.XXXXXX.txt)"
-VLLM_OMNI_VERSION="${VLLM_OMNI_REF#v}"
+
+# When VLLM_OMNI_GIT_URL is set, install from a git ref (e.g. an unreleased PR
+# commit); otherwise fall back to the matching PyPI release.
+VLLM_OMNI_GIT_URL="${VLLM_OMNI_GIT_URL:-}"
+if [ -n "${VLLM_OMNI_GIT_URL}" ]; then
+  VLLM_OMNI_SPEC="vllm-omni @ git+${VLLM_OMNI_GIT_URL}@${VLLM_OMNI_REF}"
+else
+  VLLM_OMNI_SPEC="vllm-omni==${VLLM_OMNI_REF#v}"
+fi
 
 cleanup() {
   rm -rf "${PROTECTED_CONSTRAINTS}"
@@ -41,11 +49,11 @@ if [ "${VLLM_OMNI_TARGET_DEVICE}" = "cuda" ]; then
   uv pip install --system \
     --prerelease=allow \
     --constraints "${PROTECTED_CONSTRAINTS}" \
-    "vllm-omni==${VLLM_OMNI_VERSION}"
+    "${VLLM_OMNI_SPEC}"
 else
   uv pip install \
     --prerelease=allow \
     --constraints "${PROTECTED_CONSTRAINTS}" \
-    "vllm-omni==${VLLM_OMNI_VERSION}"
+    "${VLLM_OMNI_SPEC}"
 fi
 
@@ -99,6 +99,7 @@ ARG MAX_JOBS={{ context.vllm.max_jobs }}
 ARG FLASHINF_REF={{ context.vllm.flashinf_ref }}
 {% endif %}
 ARG VLLM_OMNI_REF={{ context.vllm.vllm_omni_ref }}
+ARG VLLM_OMNI_GIT_URL={{ context.vllm.vllm_omni_git_url }}
 
 {% if device == "cuda" -%}
 # If left blank, then we will fallback to vLLM defaults

@@ -19,6 +19,7 @@ ARG PYTHON_VERSION
 ARG ENABLE_KVBM
 ARG ENABLE_GPU_MEMORY_SERVICE
 ARG VLLM_OMNI_REF
+ARG VLLM_OMNI_GIT_URL
 ARG NIXL_REF
 {% if device == "cuda" %}
 ARG CUDA_MAJOR

@@ -0,0 +1,72 @@
+#!/bin/bash
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Aggregated Cosmos3 image-to-video generation (1 GPU).
+# Same worker as text-to-video (registers the "video" modality); i2v is driven
+# by adding "input_reference" to the /v1/videos request. The image loader
+# rejects local file paths — pass a data: URI (base64) or an http(s) URL.
+# --no-cosmos3-guardrails skips loading the safety guardrail models.
+#
+# Usage: pass --model <name> to override the default model; every other
+# argument is forwarded unchanged to the Omni worker.
+
+set -e
+trap 'echo Cleaning up...; kill 0' EXIT
+
+SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
+source "$SCRIPT_DIR/../../../common/gpu_utils.sh"
+source "$SCRIPT_DIR/../../../common/launch_utils.sh"
+
+MODEL="nvidia/Cosmos3-Nano"
+
+# Parse command line arguments
+EXTRA_ARGS=()
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        --model)
+            # Report a missing value explicitly; otherwise `shift 2` fails
+            # under `set -e` and the script exits without any explanation.
+            if [[ $# -lt 2 ]]; then
+                echo "Error: --model requires a value" >&2
+                exit 1
+            fi
+            MODEL="$2"
+            shift 2
+            ;;
+        *)
+            EXTRA_ARGS+=("$1")
+            shift
+            ;;
+    esac
+done
+
+HTTP_PORT="${DYN_HTTP_PORT:-8000}"
+GPU_MEM_ARGS=$(build_vllm_gpu_mem_args)
+print_launch_banner --no-curl "Launching vLLM-Omni Cosmos3 Image-to-Video (1 GPU)" "$MODEL" "$HTTP_PORT"
+print_curl_footer <<CURL
+# input_reference must be an http(s) URL or a data: URI (local paths are rejected)
+curl -s http://localhost:${HTTP_PORT}/v1/videos \\
+  -H 'Content-Type: application/json' \\
+  -d '{
+    "model": "${MODEL}",
+    "prompt": "The scene comes alive, gentle camera motion",
+    "input_reference": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/ai2d-demo.jpg",
+    "size": "512x512",
+    "response_format": "url",
+    "nvext": {
+      "num_inference_steps": 20,
+      "num_frames": 17
+    }
+  }' | jq
+CURL
+
+
+python -m dynamo.frontend &
+FRONTEND_PID=$!
+
+sleep 2
+
+echo "Starting Omni worker..."
+# GPU_MEM_ARGS is intentionally unquoted so it word-splits into separate flags.
+DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \
+    python -m dynamo.vllm.omni \
+    --model "$MODEL" \
+    --output-modalities video \
+    --no-cosmos3-guardrails \
+    --media-output-fs-url file:///tmp/dynamo_media \
+    $GPU_MEM_ARGS \
+    "${EXTRA_ARGS[@]}" &
+
+# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
+wait_any_exit
@@ -0,0 +1,63 @@
+#!/bin/bash
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Aggregated Cosmos3 text-to-image generation (1 GPU).
+# Uses the native vLLM-Omni Cosmos3 pipeline; --no-cosmos3-guardrails skips
+# loading the safety guardrail models. A worker serves a single modality, so
+# this script registers the "image" modality (see agg_omni_cosmos3_video.sh
+# for text-to-video).
+#
+# Usage: pass --model <name> to override the default model; every other
+# argument is forwarded unchanged to the Omni worker.
+
+set -e
+trap 'echo Cleaning up...; kill 0' EXIT
+
+SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
+source "$SCRIPT_DIR/../../../common/launch_utils.sh"
+
+MODEL="nvidia/Cosmos3-Nano"
+
+# Parse command line arguments
+EXTRA_ARGS=()
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        --model)
+            # Report a missing value explicitly; otherwise `shift 2` fails
+            # under `set -e` and the script exits without any explanation.
+            if [[ $# -lt 2 ]]; then
+                echo "Error: --model requires a value" >&2
+                exit 1
+            fi
+            MODEL="$2"
+            shift 2
+            ;;
+        *)
+            EXTRA_ARGS+=("$1")
+            shift
+            ;;
+    esac
+done
+
+HTTP_PORT="${DYN_HTTP_PORT:-8000}"
+print_launch_banner --no-curl "Launching vLLM-Omni Cosmos3 Image Generation (1 GPU)" "$MODEL" "$HTTP_PORT"
+print_curl_footer <<CURL
+curl -s -X POST http://localhost:${HTTP_PORT}/v1/images/generations \\
+  -H 'Content-Type: application/json' \\
+  -d '{
+    "model": "${MODEL}",
+    "prompt": "A robot standing in a bright laboratory",
+    "size": "512x512",
+    "num_inference_steps": 20
+  }' | jq
+CURL
+
+
+python -m dynamo.frontend &
+FRONTEND_PID=$!
+
+sleep 2
+
+echo "Starting Omni worker..."
+DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \
+    python -m dynamo.vllm.omni \
+    --model "$MODEL" \
+    --output-modalities image \
+    --no-cosmos3-guardrails \
+    --media-output-fs-url file:///tmp/dynamo_media \
+    "${EXTRA_ARGS[@]}" &
+
+# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
+wait_any_exit
@@ -0,0 +1,70 @@
+#!/bin/bash
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Aggregated Cosmos3 text-to-video generation (1 GPU).
+# Uses the native vLLM-Omni Cosmos3 pipeline; --no-cosmos3-guardrails skips
+# loading the safety guardrail models. A worker serves a single modality, so
+# this script registers the "video" modality (see agg_omni_cosmos3_image.sh
+# for text-to-image).
+#
+# Usage: pass --model <name> to override the default model; every other
+# argument is forwarded unchanged to the Omni worker.
+
+set -e
+trap 'echo Cleaning up...; kill 0' EXIT
+
+SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
+source "$SCRIPT_DIR/../../../common/gpu_utils.sh"
+source "$SCRIPT_DIR/../../../common/launch_utils.sh"
+
+MODEL="nvidia/Cosmos3-Nano"
+
+# Parse command line arguments
+EXTRA_ARGS=()
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        --model)
+            # Report a missing value explicitly; otherwise `shift 2` fails
+            # under `set -e` and the script exits without any explanation.
+            if [[ $# -lt 2 ]]; then
+                echo "Error: --model requires a value" >&2
+                exit 1
+            fi
+            MODEL="$2"
+            shift 2
+            ;;
+        *)
+            EXTRA_ARGS+=("$1")
+            shift
+            ;;
+    esac
+done
+
+HTTP_PORT="${DYN_HTTP_PORT:-8000}"
+GPU_MEM_ARGS=$(build_vllm_gpu_mem_args)
+print_launch_banner --no-curl "Launching vLLM-Omni Cosmos3 Video Generation (1 GPU)" "$MODEL" "$HTTP_PORT"
+print_curl_footer <<CURL
+curl -s http://localhost:${HTTP_PORT}/v1/videos \\
+  -H 'Content-Type: application/json' \\
+  -d '{
+    "model": "${MODEL}",
+    "prompt": "A waterfall in a green forest, gentle mist",
+    "size": "512x512",
+    "response_format": "url",
+    "nvext": {
+      "num_inference_steps": 20,
+      "num_frames": 17
+    }
+  }' | jq
+CURL
+
+
+python -m dynamo.frontend &
+FRONTEND_PID=$!
+
+sleep 2
+
+echo "Starting Omni worker..."
+# GPU_MEM_ARGS is intentionally unquoted so it word-splits into separate flags.
+DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \
+    python -m dynamo.vllm.omni \
+    --model "$MODEL" \
+    --output-modalities video \
+    --no-cosmos3-guardrails \
+    --media-output-fs-url file:///tmp/dynamo_media \
+    $GPU_MEM_ARGS \
+    "${EXTRA_ARGS[@]}" &
+
+# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
+wait_any_exit