From ebe677986b23a3619370de88645e63de15c100e9 Mon Sep 17 00:00:00 2001
From: ayushag <ayushag@nvidia.com>
Date: Fri, 29 May 2026 10:12:52 -0700
Subject: [PATCH 1/9] feat(omni): add Cosmos3 image generation support

Signed-off-by: ayushag <ayushag@nvidia.com>
---
 .../src/dynamo/common/utils/video_utils.py    | 28 +++++++++++++++++++
 components/src/dynamo/vllm/omni/args.py       | 15 ++++++++++
 .../src/dynamo/vllm/omni/base_handler.py      |  6 ++++
 .../src/dynamo/vllm/omni/output_formatter.py  |  7 +++--
 container/context.yaml                        |  7 ++++-
 container/deps/vllm/install_vllm_omni.sh      | 14 ++++++++--
 container/templates/args.Dockerfile           |  1 +
 container/templates/vllm_runtime.Dockerfile   |  1 +
 8 files changed, 73 insertions(+), 6 deletions(-)

diff --git a/components/src/dynamo/common/utils/video_utils.py b/components/src/dynamo/common/utils/video_utils.py
index 37326d3280bc..cddd7655114d 100644
--- a/components/src/dynamo/common/utils/video_utils.py
+++ b/components/src/dynamo/common/utils/video_utils.py
@@ -82,6 +82,34 @@ def normalize_video_frames(images: list) -> list:
     return list(frames)
 
 
+def normalize_image_frames(images: list) -> list:
+    """Normalize stage_output.images into a flat list of PIL Images.
+
+    PIL inputs pass through unchanged. Array-likes (e.g. the Cosmos3 native
+    pipeline's ``[batch, frames, H, W, C]`` output) have every leading dim
+    flattened, so each batch entry and frame becomes its own PIL Image. Values
+    with max <= 1 are scaled to 0-255; other non-uint8 data is clipped to 0-255.
+    """
+    from PIL import Image
+
+    out: list = []
+    for item in images:
+        if isinstance(item, Image.Image):
+            out.append(item)
+            continue
+        arr = np.asarray(item)
+        if arr.size == 0:  # nothing to convert; max() would raise on empty input
+            continue
+        if arr.dtype != np.uint8:  # frames share a dtype/range; convert once
+            unit = arr.max() <= 1.0  # heuristic: values look normalized to [0, 1]
+            arr = (arr.clip(0, 1) * 255).round() if unit else arr.clip(0, 255)
+            arr = arr.astype(np.uint8)
+        # Flatten every leading dim so no batch entry is dropped: -> [N, H, W, C]
+        frames = arr.reshape(-1, *arr.shape[-3:])
+        out.extend(Image.fromarray(frame) for frame in frames)
+    return out
+
+
 def frames_to_numpy(images: list) -> np.ndarray:
     """Convert a list of PIL Images to a numpy array suitable for video encoding.
 
diff --git a/components/src/dynamo/vllm/omni/args.py b/components/src/dynamo/vllm/omni/args.py
index fd64cd5ec7f6..d0f5abb07066 100644
--- a/components/src/dynamo/vllm/omni/args.py
+++ b/components/src/dynamo/vllm/omni/args.py
@@ -187,6 +187,16 @@ def add_arguments(self, parser) -> None:
             default=False,
             help="Disable torch.compile and force eager execution for diffusion models.",
         )
+        add_negatable_bool_argument(
+            g,
+            flag_name="--cosmos3-guardrails",
+            env_var="DYN_OMNI_COSMOS3_GUARDRAILS",
+            default=True,
+            help=(
+                "Enable Cosmos3 text/video safety guardrails (loads guardrail models "
+                "at startup). Use --no-cosmos3-guardrails to disable."
+            ),
+        )
 
         # TTS parameters
         tts_g = parser.add_argument_group(
@@ -333,6 +343,11 @@ class OmniConfig(DynamoRuntimeConfig):
     stage_configs_path: Optional[str] = None
     default_video_fps: int = 16
 
+    # Cosmos3 safety guardrails. When False, routed into
+    # od_config.model_config["guardrails"]=False so the diffusion engine skips
+    # loading the guardrail models (see base_handler._build_omni_kwargs).
+    cosmos3_guardrails: bool = True
+
     # Nested structs — each group of fields has a clear destination
     diffusion: OmniDiffusionKwargs = dataclasses.field(
         default_factory=OmniDiffusionKwargs
diff --git a/components/src/dynamo/vllm/omni/base_handler.py b/components/src/dynamo/vllm/omni/base_handler.py
index 85f30a2b0297..bf4d98d7e15a 100644
--- a/components/src/dynamo/vllm/omni/base_handler.py
+++ b/components/src/dynamo/vllm/omni/base_handler.py
@@ -77,6 +77,12 @@ def _build_omni_kwargs(self, config) -> Dict[str, Any]:
         if config.stage_configs_path:
             omni_kwargs["stage_configs_path"] = config.stage_configs_path
 
+        # Cosmos3 guardrails toggle -> od_config.model_config["guardrails"].
+        # Mirrors vllm-omni serve's --cosmos3-no-guardrails; when disabled the
+        # diffusion engine skips loading the guardrail models entirely.
+        if not config.cosmos3_guardrails:
+            omni_kwargs["model_config"] = {"guardrails": False}
+
         for field, value in dataclasses.asdict(config.diffusion).items():
             if value is not None:
                 omni_kwargs[field] = value
diff --git a/components/src/dynamo/vllm/omni/output_formatter.py b/components/src/dynamo/vllm/omni/output_formatter.py
index 9816bd3f69a5..d425e8e4cff5 100644
--- a/components/src/dynamo/vllm/omni/output_formatter.py
+++ b/components/src/dynamo/vllm/omni/output_formatter.py
@@ -28,7 +28,10 @@
 from dynamo.common.storage import upload_to_fs
 from dynamo.common.utils.engine_response import normalize_finish_reason
 from dynamo.common.utils.output_modalities import RequestType
-from dynamo.common.utils.video_utils import normalize_video_frames
+from dynamo.common.utils.video_utils import (
+    normalize_image_frames,
+    normalize_video_frames,
+)
 
 logger = logging.getLogger(__name__)
 
@@ -241,7 +244,7 @@ async def _prepare_images(
         self, images: list, request_id: str, response_format: Optional[str] = None
     ) -> list:
         outlist = []
-        for img in images:
+        for img in normalize_image_frames(images):
             buf = BytesIO()
             img.save(buf, format="PNG")
             image_bytes = buf.getvalue()
diff --git a/container/context.yaml b/container/context.yaml
index 0a3c1a777316..b76132838b5f 100644
--- a/container/context.yaml
+++ b/container/context.yaml
@@ -63,7 +63,12 @@ vllm:
     base_image_tag: 22.04
     runtime_image_tag: v0.21.0
   flashinf_ref: v0.6.8.post1
-  vllm_omni_ref: "v0.21.0rc1"
+  # Cosmos3 support is not yet in a released vllm-omni; install from the
+  # canonical repo pinned to the PR head commit (vllm-project/vllm-omni#3454,
+  # == refs/pull/3454/head). When vllm_omni_git_url is set, install_vllm_omni.sh
+  # installs from git@ref; otherwise it falls back to "vllm-omni==<ref>".
+  vllm_omni_ref: "e826f626afb47c8c3c39ccf892ed247f442f6bd2"
+  vllm_omni_git_url: "https://github.com/vllm-project/vllm-omni.git"
   nixl_ref: v1.1.0
   max_jobs: "10"
   enable_media_ffmpeg: "false"
diff --git a/container/deps/vllm/install_vllm_omni.sh b/container/deps/vllm/install_vllm_omni.sh
index 83b08875a621..c8dad9cba3cf 100755
--- a/container/deps/vllm/install_vllm_omni.sh
+++ b/container/deps/vllm/install_vllm_omni.sh
@@ -9,7 +9,15 @@ set -euo pipefail
 VLLM_OMNI_PROTECTED_PACKAGES_FILE="${VLLM_OMNI_PROTECTED_PACKAGES_FILE:-/tmp/vllm_omni_protected_packages.txt}"
 
 PROTECTED_CONSTRAINTS="$(mktemp /tmp/vllm-openai-protected.XXXXXX.txt)"
-VLLM_OMNI_VERSION="${VLLM_OMNI_REF#v}"
+
+# When VLLM_OMNI_GIT_URL is set, install from a git ref (e.g. an unreleased PR
+# commit); otherwise fall back to the matching PyPI release.
+VLLM_OMNI_GIT_URL="${VLLM_OMNI_GIT_URL:-}"
+if [ -n "${VLLM_OMNI_GIT_URL}" ]; then
+  VLLM_OMNI_SPEC="vllm-omni @ git+${VLLM_OMNI_GIT_URL}@${VLLM_OMNI_REF}"
+else
+  VLLM_OMNI_SPEC="vllm-omni==${VLLM_OMNI_REF#v}"
+fi
 
 cleanup() {
   rm -rf "${PROTECTED_CONSTRAINTS}"
@@ -41,11 +49,11 @@ if [ "${VLLM_OMNI_TARGET_DEVICE}" = "cuda" ]; then
   uv pip install --system \
     --prerelease=allow \
     --constraints "${PROTECTED_CONSTRAINTS}" \
-    "vllm-omni==${VLLM_OMNI_VERSION}"
+    "${VLLM_OMNI_SPEC}"
 else
   uv pip install \
     --prerelease=allow \
     --constraints "${PROTECTED_CONSTRAINTS}" \
-    "vllm-omni==${VLLM_OMNI_VERSION}"
+    "${VLLM_OMNI_SPEC}"
 fi
 
diff --git a/container/templates/args.Dockerfile b/container/templates/args.Dockerfile
index 4e21b5ba1ea2..f5d7aaf14bcb 100644
--- a/container/templates/args.Dockerfile
+++ b/container/templates/args.Dockerfile
@@ -99,6 +99,7 @@ ARG MAX_JOBS={{ context.vllm.max_jobs }}
 ARG FLASHINF_REF={{ context.vllm.flashinf_ref }}
 {% endif %}
 ARG VLLM_OMNI_REF={{ context.vllm.vllm_omni_ref }}
+ARG VLLM_OMNI_GIT_URL={{ context.vllm.vllm_omni_git_url }}
 
 {% if device == "cuda" -%}
 # If left blank, then we will fallback to vLLM defaults
diff --git a/container/templates/vllm_runtime.Dockerfile b/container/templates/vllm_runtime.Dockerfile
index 71cccc155aaa..c82186bf0162 100644
--- a/container/templates/vllm_runtime.Dockerfile
+++ b/container/templates/vllm_runtime.Dockerfile
@@ -19,6 +19,7 @@ ARG PYTHON_VERSION
 ARG ENABLE_KVBM
 ARG ENABLE_GPU_MEMORY_SERVICE
 ARG VLLM_OMNI_REF
+ARG VLLM_OMNI_GIT_URL
 ARG NIXL_REF
 {% if device == "cuda" %}
 ARG CUDA_MAJOR

From b9b9ca3b83925eb1b9550f28f8ca7ce2ac2774f4 Mon Sep 17 00:00:00 2001
From: ayushag <ayushag@nvidia.com>
Date: Fri, 29 May 2026 10:22:45 -0700
Subject: [PATCH 2/9] feat(examples): add Cosmos3 omni image/video launch
 scripts

Signed-off-by: ayushag <ayushag@nvidia.com>
---
 .../vllm/launch/agg_omni_cosmos3_image.sh     | 63 +++++++++++++++++
 .../vllm/launch/agg_omni_cosmos3_video.sh     | 70 +++++++++++++++++++
 2 files changed, 133 insertions(+)
 create mode 100755 examples/backends/vllm/launch/agg_omni_cosmos3_image.sh
 create mode 100755 examples/backends/vllm/launch/agg_omni_cosmos3_video.sh

diff --git a/examples/backends/vllm/launch/agg_omni_cosmos3_image.sh b/examples/backends/vllm/launch/agg_omni_cosmos3_image.sh
new file mode 100755
index 000000000000..afc6f4f1aa7e
--- /dev/null
+++ b/examples/backends/vllm/launch/agg_omni_cosmos3_image.sh
@@ -0,0 +1,63 @@
+#!/bin/bash
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Aggregated Cosmos3 text-to-image generation (1 GPU).
+# Uses the native vLLM-Omni Cosmos3 pipeline; --no-cosmos3-guardrails skips
+# loading the safety guardrail models. A worker serves a single modality, so
+# this script registers the "image" modality (see agg_omni_cosmos3_video.sh
+# for text-to-video).
+
+set -e
+trap 'echo Cleaning up...; kill 0' EXIT
+
+SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
+source "$SCRIPT_DIR/../../../common/launch_utils.sh"
+
+MODEL="nvidia/Cosmos3-Nano"
+
+# Parse command line arguments
+EXTRA_ARGS=()
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        --model)
+            MODEL="$2"
+            shift 2
+            ;;
+        *)
+            EXTRA_ARGS+=("$1")
+            shift
+            ;;
+    esac
+done
+
+HTTP_PORT="${DYN_HTTP_PORT:-8000}"
+print_launch_banner --no-curl "Launching vLLM-Omni Cosmos3 Image Generation (1 GPU)" "$MODEL" "$HTTP_PORT"
+print_curl_footer <<CURL
+curl -s -X POST http://localhost:${HTTP_PORT}/v1/images/generations \\
+  -H 'Content-Type: application/json' \\
+  -d '{
+    "model": "${MODEL}",
+    "prompt": "A robot standing in a bright laboratory",
+    "size": "512x512",
+    "num_inference_steps": 20
+  }' | jq
+CURL
+
+
+python -m dynamo.frontend &
+FRONTEND_PID=$!
+
+sleep 2
+
+echo "Starting Omni worker..."
+DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \
+    python -m dynamo.vllm.omni \
+    --model "$MODEL" \
+    --output-modalities image \
+    --no-cosmos3-guardrails \
+    --media-output-fs-url file:///tmp/dynamo_media \
+    "${EXTRA_ARGS[@]}" &
+
+# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
+wait_any_exit
diff --git a/examples/backends/vllm/launch/agg_omni_cosmos3_video.sh b/examples/backends/vllm/launch/agg_omni_cosmos3_video.sh
new file mode 100755
index 000000000000..a6067354a91e
--- /dev/null
+++ b/examples/backends/vllm/launch/agg_omni_cosmos3_video.sh
@@ -0,0 +1,70 @@
+#!/bin/bash
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Aggregated Cosmos3 text-to-video generation (1 GPU).
+# Uses the native vLLM-Omni Cosmos3 pipeline; --no-cosmos3-guardrails skips
+# loading the safety guardrail models. A worker serves a single modality, so
+# this script registers the "video" modality (see agg_omni_cosmos3_image.sh
+# for text-to-image).
+
+set -e
+trap 'echo Cleaning up...; kill 0' EXIT
+
+SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
+source "$SCRIPT_DIR/../../../common/gpu_utils.sh"
+source "$SCRIPT_DIR/../../../common/launch_utils.sh"
+
+MODEL="nvidia/Cosmos3-Nano"
+
+# Parse command line arguments
+EXTRA_ARGS=()
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        --model)
+            MODEL="$2"
+            shift 2
+            ;;
+        *)
+            EXTRA_ARGS+=("$1")
+            shift
+            ;;
+    esac
+done
+
+HTTP_PORT="${DYN_HTTP_PORT:-8000}"
+GPU_MEM_ARGS=$(build_vllm_gpu_mem_args)
+print_launch_banner --no-curl "Launching vLLM-Omni Cosmos3 Video Generation (1 GPU)" "$MODEL" "$HTTP_PORT"
+print_curl_footer <<CURL
+curl -s http://localhost:${HTTP_PORT}/v1/videos \\
+  -H 'Content-Type: application/json' \\
+  -d '{
+    "model": "${MODEL}",
+    "prompt": "A waterfall in a green forest, gentle mist",
+    "size": "512x512",
+    "response_format": "url",
+    "nvext": {
+      "num_inference_steps": 20,
+      "num_frames": 17
+    }
+  }' | jq
+CURL
+
+
+python -m dynamo.frontend &
+FRONTEND_PID=$!
+
+sleep 2
+
+echo "Starting Omni worker..."
+DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \
+    python -m dynamo.vllm.omni \
+    --model "$MODEL" \
+    --output-modalities video \
+    --no-cosmos3-guardrails \
+    --media-output-fs-url file:///tmp/dynamo_media \
+    $GPU_MEM_ARGS \
+    "${EXTRA_ARGS[@]}" &
+
+# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
+wait_any_exit

From 22812d0d54e49c2b020e4d85f08bc6980214d3fd Mon Sep 17 00:00:00 2001
From: ayushag <ayushag@nvidia.com>
Date: Fri, 29 May 2026 10:26:44 -0700
Subject: [PATCH 3/9] feat(examples): add Cosmos3 omni image-to-video launch
 script

Signed-off-by: ayushag <ayushag@nvidia.com>
---
 .../vllm/launch/agg_omni_cosmos3_i2v.sh       | 72 +++++++++++++++++++
 1 file changed, 72 insertions(+)
 create mode 100755 examples/backends/vllm/launch/agg_omni_cosmos3_i2v.sh

diff --git a/examples/backends/vllm/launch/agg_omni_cosmos3_i2v.sh b/examples/backends/vllm/launch/agg_omni_cosmos3_i2v.sh
new file mode 100755
index 000000000000..bb37e58dd8b9
--- /dev/null
+++ b/examples/backends/vllm/launch/agg_omni_cosmos3_i2v.sh
@@ -0,0 +1,72 @@
+#!/bin/bash
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Aggregated Cosmos3 image-to-video generation (1 GPU).
+# Same worker as text-to-video (registers the "video" modality); i2v is driven
+# by adding "input_reference" to the /v1/videos request. The image loader
+# rejects local file paths — pass a data: URI (base64) or an http(s) URL.
+# --no-cosmos3-guardrails skips loading the safety guardrail models.
+
+set -e
+trap 'echo Cleaning up...; kill 0' EXIT
+
+SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
+source "$SCRIPT_DIR/../../../common/gpu_utils.sh"
+source "$SCRIPT_DIR/../../../common/launch_utils.sh"
+
+MODEL="nvidia/Cosmos3-Nano"
+
+# Parse command line arguments
+EXTRA_ARGS=()
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        --model)
+            MODEL="$2"
+            shift 2
+            ;;
+        *)
+            EXTRA_ARGS+=("$1")
+            shift
+            ;;
+    esac
+done
+
+HTTP_PORT="${DYN_HTTP_PORT:-8000}"
+GPU_MEM_ARGS=$(build_vllm_gpu_mem_args)
+print_launch_banner --no-curl "Launching vLLM-Omni Cosmos3 Image-to-Video (1 GPU)" "$MODEL" "$HTTP_PORT"
+print_curl_footer <<CURL
+# input_reference must be an http(s) URL or a data: URI (local paths are rejected)
+curl -s http://localhost:${HTTP_PORT}/v1/videos \\
+  -H 'Content-Type: application/json' \\
+  -d '{
+    "model": "${MODEL}",
+    "prompt": "The scene comes alive, gentle camera motion",
+    "input_reference": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/ai2d-demo.jpg",
+    "size": "512x512",
+    "response_format": "url",
+    "nvext": {
+      "num_inference_steps": 20,
+      "num_frames": 17
+    }
+  }' | jq
+CURL
+
+
+python -m dynamo.frontend &
+FRONTEND_PID=$!
+
+sleep 2
+
+echo "Starting Omni worker..."
+DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \
+    python -m dynamo.vllm.omni \
+    --model "$MODEL" \
+    --output-modalities video \
+    --no-cosmos3-guardrails \
+    --media-output-fs-url file:///tmp/dynamo_media \
+    $GPU_MEM_ARGS \
+    "${EXTRA_ARGS[@]}" &
+
+# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
+wait_any_exit

From 7744835c195faa9b1cb979b9624dfd8c168b32f2 Mon Sep 17 00:00:00 2001
From: ayushag <ayushag@nvidia.com>
Date: Fri, 29 May 2026 13:29:34 -0700
Subject: [PATCH 4/9] chore(cosmos3): add docs and sample payloads; revert
 container git install

Signed-off-by: ayushag <ayushag@nvidia.com>
---
 container/context.yaml                        |   7 +-
 container/deps/vllm/install_vllm_omni.sh      |  14 +-
 container/templates/args.Dockerfile           |   1 -
 container/templates/vllm_runtime.Dockerfile   |   1 -
 docs/backends/vllm/cosmos3.md                 | 163 ++++++++++++++++++
 .../vllm/launch/agg_omni_cosmos3_i2v.sh       |  15 +-
 .../vllm/launch/agg_omni_cosmos3_image.sh     |   9 +-
 .../vllm/launch/agg_omni_cosmos3_video.sh     |  12 +-
 .../backends/vllm/launch/cosmos3/i2v.json     |  12 ++
 .../backends/vllm/launch/cosmos3/t2i.json     |  11 ++
 .../backends/vllm/launch/cosmos3/t2v.json     |  11 ++
 11 files changed, 209 insertions(+), 47 deletions(-)
 create mode 100644 docs/backends/vllm/cosmos3.md
 create mode 100644 examples/backends/vllm/launch/cosmos3/i2v.json
 create mode 100644 examples/backends/vllm/launch/cosmos3/t2i.json
 create mode 100644 examples/backends/vllm/launch/cosmos3/t2v.json

diff --git a/container/context.yaml b/container/context.yaml
index b76132838b5f..0a3c1a777316 100644
--- a/container/context.yaml
+++ b/container/context.yaml
@@ -63,12 +63,7 @@ vllm:
     base_image_tag: 22.04
     runtime_image_tag: v0.21.0
   flashinf_ref: v0.6.8.post1
-  # Cosmos3 support is not yet in a released vllm-omni; install from the
-  # canonical repo pinned to the PR head commit (vllm-project/vllm-omni#3454,
-  # == refs/pull/3454/head). When vllm_omni_git_url is set, install_vllm_omni.sh
-  # installs from git@ref; otherwise it falls back to "vllm-omni==<ref>".
-  vllm_omni_ref: "e826f626afb47c8c3c39ccf892ed247f442f6bd2"
-  vllm_omni_git_url: "https://github.com/vllm-project/vllm-omni.git"
+  vllm_omni_ref: "v0.21.0rc1"
   nixl_ref: v1.1.0
   max_jobs: "10"
   enable_media_ffmpeg: "false"
diff --git a/container/deps/vllm/install_vllm_omni.sh b/container/deps/vllm/install_vllm_omni.sh
index c8dad9cba3cf..83b08875a621 100755
--- a/container/deps/vllm/install_vllm_omni.sh
+++ b/container/deps/vllm/install_vllm_omni.sh
@@ -9,15 +9,7 @@ set -euo pipefail
 VLLM_OMNI_PROTECTED_PACKAGES_FILE="${VLLM_OMNI_PROTECTED_PACKAGES_FILE:-/tmp/vllm_omni_protected_packages.txt}"
 
 PROTECTED_CONSTRAINTS="$(mktemp /tmp/vllm-openai-protected.XXXXXX.txt)"
-
-# When VLLM_OMNI_GIT_URL is set, install from a git ref (e.g. an unreleased PR
-# commit); otherwise fall back to the matching PyPI release.
-VLLM_OMNI_GIT_URL="${VLLM_OMNI_GIT_URL:-}"
-if [ -n "${VLLM_OMNI_GIT_URL}" ]; then
-  VLLM_OMNI_SPEC="vllm-omni @ git+${VLLM_OMNI_GIT_URL}@${VLLM_OMNI_REF}"
-else
-  VLLM_OMNI_SPEC="vllm-omni==${VLLM_OMNI_REF#v}"
-fi
+VLLM_OMNI_VERSION="${VLLM_OMNI_REF#v}"
 
 cleanup() {
   rm -rf "${PROTECTED_CONSTRAINTS}"
@@ -49,11 +41,11 @@ if [ "${VLLM_OMNI_TARGET_DEVICE}" = "cuda" ]; then
   uv pip install --system \
     --prerelease=allow \
     --constraints "${PROTECTED_CONSTRAINTS}" \
-    "${VLLM_OMNI_SPEC}"
+    "vllm-omni==${VLLM_OMNI_VERSION}"
 else
   uv pip install \
     --prerelease=allow \
     --constraints "${PROTECTED_CONSTRAINTS}" \
-    "${VLLM_OMNI_SPEC}"
+    "vllm-omni==${VLLM_OMNI_VERSION}"
 fi
 
diff --git a/container/templates/args.Dockerfile b/container/templates/args.Dockerfile
index f5d7aaf14bcb..4e21b5ba1ea2 100644
--- a/container/templates/args.Dockerfile
+++ b/container/templates/args.Dockerfile
@@ -99,7 +99,6 @@ ARG MAX_JOBS={{ context.vllm.max_jobs }}
 ARG FLASHINF_REF={{ context.vllm.flashinf_ref }}
 {% endif %}
 ARG VLLM_OMNI_REF={{ context.vllm.vllm_omni_ref }}
-ARG VLLM_OMNI_GIT_URL={{ context.vllm.vllm_omni_git_url }}
 
 {% if device == "cuda" -%}
 # If left blank, then we will fallback to vLLM defaults
diff --git a/container/templates/vllm_runtime.Dockerfile b/container/templates/vllm_runtime.Dockerfile
index c82186bf0162..71cccc155aaa 100644
--- a/container/templates/vllm_runtime.Dockerfile
+++ b/container/templates/vllm_runtime.Dockerfile
@@ -19,7 +19,6 @@ ARG PYTHON_VERSION
 ARG ENABLE_KVBM
 ARG ENABLE_GPU_MEMORY_SERVICE
 ARG VLLM_OMNI_REF
-ARG VLLM_OMNI_GIT_URL
 ARG NIXL_REF
 {% if device == "cuda" %}
 ARG CUDA_MAJOR
diff --git a/docs/backends/vllm/cosmos3.md b/docs/backends/vllm/cosmos3.md
new file mode 100644
index 000000000000..dc3a79278ea7
--- /dev/null
+++ b/docs/backends/vllm/cosmos3.md
@@ -0,0 +1,163 @@
+---
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+title: Cosmos3
+---
+
+Run NVIDIA's **Cosmos3** omni model through Dynamo's
+[vLLM-Omni backend](vllm-omni.md) for **text-to-image**, **text-to-video**, and
+**image-to-video** generation.
+
+Cosmos3 is a unified world foundation model (WFM) for Physical AI, built on a
+Mixture-of-Transformers (MoT) architecture. A single `Cosmos3OmniTransformer`
+runs a Qwen-style "understanding" stream alongside a "generation" stream
+joined by a 3D multimodal RoPE, replacing the separate Predict / Reason /
+Transfer models from earlier Cosmos releases. See the
+[Cosmos World Foundation Model Platform paper](https://huggingface.co/papers/2501.03575)
+for the architectural background, and the
+[diffusers Cosmos3 reference](https://huggingface.co/docs/diffusers/main/en/api/pipelines/cosmos3) for the underlying pipeline.
+
+Cosmos3 support in Dynamo is provided by the native vLLM-Omni pipeline added in
+[vllm-project/vllm-omni#3454](https://github.com/vllm-project/vllm-omni/pull/3454).
+
+## Checkpoints
+
+Both checkpoints share the same `Cosmos3OmniPipeline` class and Dynamo flags;
+swap the model identifier on the worker (`--model …`) and in request payloads.
+
+| Checkpoint | Description | HF Hub |
+|------------|-------------|--------|
+| `nvidia/Cosmos3-Nano` | Smaller, faster — default in this repo's launch scripts | [link](https://huggingface.co/nvidia/Cosmos3-Nano) |
+| `nvidia/Cosmos3-Super` | Larger, higher quality | [link](https://huggingface.co/nvidia/Cosmos3-Super) |
+
+## Supported modalities
+
+| Task | Endpoint | `--output-modalities` |
+|------|----------|-----------------------|
+| Text-to-Image | `/v1/images/generations` | `image` |
+| Text-to-Video | `/v1/videos` | `video` |
+| Image-to-Video | `/v1/videos` (with `input_reference`) | `video` |
+
+## Prerequisites
+
+This guide builds on the [vLLM-Omni backend guide](vllm-omni.md) — see it for general setup, `etcd`/`nats`, and OpenAI-endpoint details.
+
+### Installation
+
+This branch carries Dynamo code changes (the Cosmos3 worker flags and image
+output handling) on top of a pinned vLLM-Omni, so run Dynamo **from source on
+this branch** — a released `ai-dynamo` wheel will not include the integration.
+
+1. Clone and check out the branch:
+
+   ```bash
+   git clone https://github.com/ai-dynamo/dynamo.git
+   cd dynamo
+   git checkout cosmos3-omni-integration
+   ```
+
+2. Create a Python 3.12 environment:
+
+   ```bash
+   uv venv --python 3.12 --seed
+   source .venv/bin/activate
+   ```
+
+3. Build and install Dynamo from source (the branch's Cosmos3 code must be
+   live, and the Rust core `ai-dynamo-runtime` isn't published for this dev
+   version, so it has to be built locally). See
+   [Building from source](../../getting-started/building-from-source.md) for
+   prerequisites (Rust toolchain, system deps); the key steps from the repo root:
+
+   ```bash
+   uv pip install pip maturin
+   (cd lib/bindings/python && maturin develop --uv)   # builds ai-dynamo-runtime
+   uv pip install -e lib/gpu_memory_service
+   uv pip install -e ".[vllm]"                         # also pulls vllm==0.21.0
+   ```
+
+4. Install the Cosmos3-capable vLLM-Omni, pinned to the PR commit (its dynamic
+   `setup.py` pulls the matching pipeline deps — `diffusers==0.38`, `torchsde`,
+   `x-transformers`):
+
+   ```bash
+   uv pip install "vllm-omni @ git+https://github.com/vllm-project/vllm-omni.git@e826f626afb47c8c3c39ccf892ed247f442f6bd2"
+   ```
+
+5. Start etcd and NATS:
+
+   ```bash
+   docker compose -f dev/docker-compose.yml up -d
+   ```
+
+## Serve
+
+Quick start — each script launches the frontend on `:8000` plus a
+single-modality worker and prints a sample request:
+
+```bash
+examples/backends/vllm/launch/agg_omni_cosmos3_image.sh   # text-to-image
+examples/backends/vllm/launch/agg_omni_cosmos3_video.sh   # text-to-video
+examples/backends/vllm/launch/agg_omni_cosmos3_i2v.sh     # image-to-video
+```
+
+Manual launch:
+
+```bash
+python -m dynamo.frontend --http-port 8000 &
+# Use "--output-modalities video" for video; --no-cosmos3-guardrails skips loading the safety guardrail models.
+python -m dynamo.vllm.omni \
+    --model nvidia/Cosmos3-Nano \
+    --output-modalities image \
+    --no-cosmos3-guardrails \
+    --media-output-fs-url file:///tmp/dynamo_media
+```
+
+Flags commonly used with Cosmos3 (only `--no-cosmos3-guardrails` is Cosmos3-specific):
+
+| Flag | Purpose |
+|------|---------|
+| `--no-cosmos3-guardrails` | Disable the Cosmos3 text/video safety guardrails (otherwise loaded at startup). |
+| `--flow-shift <float>` | Scheduler flow-shift (image default `3.0`). Launch-time only — not a per-request image parameter. |
+| `--media-output-fs-url file://<dir>` | Destination for media when `response_format: "url"`. |
+
+## Requests
+
+### Text-to-image
+
+Run from the repo root; `cosmos3/t2i.json` is the official Cosmos3 t2i payload
+(prompt verbatim) mapped to the Dynamo request schema:
+
+```bash
+curl -s -X POST http://localhost:8000/v1/images/generations \
+  -H 'Content-Type: application/json' \
+  --data-binary @examples/backends/vllm/launch/cosmos3/t2i.json \
+  | jq -r '.data[0].b64_json' | base64 -d > out.png
+```
+
+- `size` must be one of `256x256`, `512x512`, `1024x1024`, `1792x1024`,
+  `1024x1792`, `1536x1024`, `1024x1536`, `auto` — the payload uses `1024x1024`
+  (the official `960x960` is not an allowed image size).
+- Put `num_inference_steps`, `guidance_scale`, `seed`, and `negative_prompt`
+  under `nvext` — top-level values are ignored.
+
+### Text-to-video
+
+```bash
+curl -s http://localhost:8000/v1/videos \
+  -H 'Content-Type: application/json' \
+  --data-binary @examples/backends/vllm/launch/cosmos3/t2v.json | jq
+```
+
+The official `t2v.json` payload is `1280x720`, `192` frames @ `24` fps (8s).
+
+### Image-to-video
+
+`i2v.json` adds `input_reference` (the official `vision_path` — an http URL;
+local paths are rejected, use an http(s) URL or a `data:` base64 URI):
+
+```bash
+curl -s http://localhost:8000/v1/videos \
+  -H 'Content-Type: application/json' \
+  --data-binary @examples/backends/vllm/launch/cosmos3/i2v.json | jq
+```
diff --git a/examples/backends/vllm/launch/agg_omni_cosmos3_i2v.sh b/examples/backends/vllm/launch/agg_omni_cosmos3_i2v.sh
index bb37e58dd8b9..0f8bd6877c05 100755
--- a/examples/backends/vllm/launch/agg_omni_cosmos3_i2v.sh
+++ b/examples/backends/vllm/launch/agg_omni_cosmos3_i2v.sh
@@ -36,20 +36,11 @@ HTTP_PORT="${DYN_HTTP_PORT:-8000}"
 GPU_MEM_ARGS=$(build_vllm_gpu_mem_args)
 print_launch_banner --no-curl "Launching vLLM-Omni Cosmos3 Image-to-Video (1 GPU)" "$MODEL" "$HTTP_PORT"
 print_curl_footer <<CURL
-# input_reference must be an http(s) URL or a data: URI (local paths are rejected)
+# Official Cosmos3 image-to-video payload (prompt + vision_path verbatim).
+# input_reference must be an http(s) URL or a data: URI (local paths are rejected).
 curl -s http://localhost:${HTTP_PORT}/v1/videos \\
   -H 'Content-Type: application/json' \\
-  -d '{
-    "model": "${MODEL}",
-    "prompt": "The scene comes alive, gentle camera motion",
-    "input_reference": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/ai2d-demo.jpg",
-    "size": "512x512",
-    "response_format": "url",
-    "nvext": {
-      "num_inference_steps": 20,
-      "num_frames": 17
-    }
-  }' | jq
+  --data-binary @${SCRIPT_DIR}/cosmos3/i2v.json | jq
 CURL
 
 
diff --git a/examples/backends/vllm/launch/agg_omni_cosmos3_image.sh b/examples/backends/vllm/launch/agg_omni_cosmos3_image.sh
index afc6f4f1aa7e..472da96839cd 100755
--- a/examples/backends/vllm/launch/agg_omni_cosmos3_image.sh
+++ b/examples/backends/vllm/launch/agg_omni_cosmos3_image.sh
@@ -34,14 +34,11 @@ done
 HTTP_PORT="${DYN_HTTP_PORT:-8000}"
 print_launch_banner --no-curl "Launching vLLM-Omni Cosmos3 Image Generation (1 GPU)" "$MODEL" "$HTTP_PORT"
 print_curl_footer <<CURL
+# Official Cosmos3 text-to-image payload (prompt verbatim)
 curl -s -X POST http://localhost:${HTTP_PORT}/v1/images/generations \\
   -H 'Content-Type: application/json' \\
-  -d '{
-    "model": "${MODEL}",
-    "prompt": "A robot standing in a bright laboratory",
-    "size": "512x512",
-    "num_inference_steps": 20
-  }' | jq
+  --data-binary @${SCRIPT_DIR}/cosmos3/t2i.json \\
+  | jq -r '.data[0].b64_json' | base64 -d > t2i.png
 CURL
 
 
diff --git a/examples/backends/vllm/launch/agg_omni_cosmos3_video.sh b/examples/backends/vllm/launch/agg_omni_cosmos3_video.sh
index a6067354a91e..4681749fc81c 100755
--- a/examples/backends/vllm/launch/agg_omni_cosmos3_video.sh
+++ b/examples/backends/vllm/launch/agg_omni_cosmos3_video.sh
@@ -36,18 +36,10 @@ HTTP_PORT="${DYN_HTTP_PORT:-8000}"
 GPU_MEM_ARGS=$(build_vllm_gpu_mem_args)
 print_launch_banner --no-curl "Launching vLLM-Omni Cosmos3 Video Generation (1 GPU)" "$MODEL" "$HTTP_PORT"
 print_curl_footer <<CURL
+# Official Cosmos3 text-to-video payload (prompt verbatim; 1280x720, 192 frames @ 24fps)
 curl -s http://localhost:${HTTP_PORT}/v1/videos \\
   -H 'Content-Type: application/json' \\
-  -d '{
-    "model": "${MODEL}",
-    "prompt": "A waterfall in a green forest, gentle mist",
-    "size": "512x512",
-    "response_format": "url",
-    "nvext": {
-      "num_inference_steps": 20,
-      "num_frames": 17
-    }
-  }' | jq
+  --data-binary @${SCRIPT_DIR}/cosmos3/t2v.json | jq
 CURL
 
 
diff --git a/examples/backends/vllm/launch/cosmos3/i2v.json b/examples/backends/vllm/launch/cosmos3/i2v.json
new file mode 100644
index 000000000000..1835aa49624b
--- /dev/null
+++ b/examples/backends/vllm/launch/cosmos3/i2v.json
@@ -0,0 +1,12 @@
+{
+  "model": "nvidia/Cosmos3-Nano",
+  "prompt": "{\"scene_imagination\": \"focus: robotic manipulation demo in a lab with two arms and a wooden shelf. define: left arm stationary, right arm reaches for red sphere on top shelf, places it on bottom shelf, returns. visualize: first frame shows both silver/gray articulated hands at table, dark wood shelf center, red ball on top rack, person observing mid-ground. refine: camera is static overhead-ish POV, wide-angle showing room equipment (tripod, chair, cables). analyze: timing\\u20140:00 idle, 0:01-0:03 reach, 0:03-0:05 grip/redistribute, 0:05-0:08 retract. clarify: lighting is even fluorescent/lab lighting; palette muted grays/beiges with red accent. refine: keep left hand motionless throughout. visualize: smooth mechanical motions, no glitches. define: mood technical, clinical, demonstrative. analyze: ensure all actions end \\u2264 0:08.\", \"temporal_caption\": \"Two robotic arms sit at a wooden table flanking a small wooden shelf; the right arm extends to grasp a red spherical object from the top shelf, repositions it onto the lower shelf, then retracts while the left arm remains still.\", \"audio_description\": \"Low ambient hum of laboratory HVAC and faint electrical whirring from motors; soft mechanical clicks as joints articulate during the reach and release; subtle creak of the wooden shelf under weight; no human speech or music.\", \"subjects\": [{\"description\": \"Left robotic arm with a multi-jointed metallic hand featuring four articulated fingers made of brushed silver and gray composite material, mounted on a black base.\", \"appearance_details\": \"Visible actuator joints, knuckle articulation, matte finish on palm, glossy highlights on finger segments.\", \"relationship\": \"Paired counterpart to the right robotic arm; both interact with the central wooden shelf assembly.\", \"location\": \"Left foreground, resting on the light-colored wooden table\", \"relative_size\": \"Medium within frame\", \"orientation\": \"Hand oriented toward 
the center of the table\", \"pose\": \"Fingers loosely splayed, hovering just above the tabletop\", \"action\": \"Remains stationary throughout the sequence\", \"state_changes\": \"No significant change.\", \"clothing\": \"\", \"expression\": \"\", \"gender\": \"\", \"age\": \"\", \"skin_tone_and_texture\": \"\", \"facial_features\": \"\", \"number_of_subjects\": 1, \"number_of_arms\": 0, \"number_of_legs\": 0}, {\"description\": \"Right robotic arm with a similar multi-jointed metallic hand, matching the left arm's design but positioned further back near the shelf.\", \"appearance_details\": \"Brushed silver-gray fingers, visible motor housings, slight wear marks suggesting use.\", \"relationship\": \"Primary actor performing the manipulation task; paired with the left arm.\", \"location\": \"Right side, mid-ground near the wooden shelf\", \"relative_size\": \"Medium within frame\", \"orientation\": \"Hand extended forward toward the shelf\", \"pose\": \"Initially relaxed, then extending upward and forward to grasp an object\", \"action\": \"Reaches up, grasps the red spherical object from the top shelf, lowers it to the bottom shelf, then retracts\", \"state_changes\": \"Transitions from resting to reaching, gripping, lowering, and returning to rest.\", \"clothing\": \"\", \"expression\": \"\", \"gender\": \"\", \"age\": \"\", \"skin_tone_and_texture\": \"\", \"facial_features\": \"\", \"number_of_subjects\": 1, \"number_of_arms\": 0, \"number_of_legs\": 0}, {\"description\": \"A person standing in the background wearing a blue zip-up jacket over a white shirt and dark trousers, hands in pockets.\", \"appearance_details\": \"Casual posture, partially obscured by equipment; only torso and legs clearly visible.\", \"relationship\": \"Observer monitoring the robotic demonstration\", \"location\": \"Center background, between the two robotic arms\", \"relative_size\": \"Small within frame\", \"orientation\": \"Facing roughly toward the table and robotic arms\", 
\"pose\": \"Standing upright with hands in jacket pockets\", \"action\": \"Watches the robotic operation without moving\", \"state_changes\": \"No significant change.\", \"clothing\": \"Blue jacket, white undershirt, black pants\", \"expression\": \"\", \"gender\": \"Unknown\", \"age\": \"Adult\", \"skin_tone_and_texture\": \"Not clearly visible due to distance and clothing\", \"facial_features\": \"\", \"number_of_subjects\": 1, \"number_of_arms\": 2, \"number_of_legs\": 2}], \"background_setting\": \"An indoor robotics testing laboratory with a large light-colored wooden worktable in the foreground, tiled gray floor, scattered equipment including a black office chair, a tripod, a small robot figure, computer monitors, cables, and storage bins against the walls.\", \"lighting\": {\"conditions\": \"Even, diffuse interior lighting typical of a lab environment\", \"direction\": \"Top-down overhead illumination\", \"shadows\": \"Soft, short shadows directly beneath objects\", \"illumination_effect\": \"Subtle specular highlights on the metallic robotic hands\"}, \"aesthetics\": {\"composition\": \"Symmetrical framing with the wooden shelf centered between two robotic arms, leading lines from the table edges\", \"color_scheme\": \"Muted neutrals\\u2014beige wood, gray metal, black accents\\u2014with a single saturated red accent from the spherical object\", \"mood_atmosphere\": \"Clinical, focused, technological\", \"patterns\": \"Grid pattern of the floor tiles\"}, \"cinematography\": {\"camera_motion\": \"Static\", \"framing\": \"Wide shot capturing the full table and surrounding workspace\", \"camera_angle\": \"High-angle, looking down at the table surface\", \"depth_of_field\": \"Deep\", \"focus\": \"Sharp focus across the entire scene, particularly on the robotic arms and shelf\", \"lens_focal_length\": \"Wide-angle approximately 18-24mm equivalent\"}, \"style_medium\": \"Photoreal live-action footage\", \"artistic_style\": \"Documentary technical demonstration\", 
\"context\": \"A robotics research or development session demonstrating precise object manipulation capabilities using dual robotic arms in a controlled lab setting.\", \"actions\": [{\"time\": \"0:00-0:01\", \"description\": \"Both robotic arms are stationary; the right arm hovers near the top shelf where the red sphere rests.\"}, {\"time\": \"0:01-0:03\", \"description\": \"The right robotic hand extends upward and forward, its fingers closing around the red spherical object on the top shelf.\"}, {\"time\": \"0:03-0:05\", \"description\": \"The right hand lifts the red sphere off the top shelf and carefully lowers it onto the lowest rack of the wooden shelf.\"}, {\"time\": \"0:05-0:07\", \"description\": \"The right hand releases the sphere and retracts back to its initial position beside the shelf.\"}, {\"time\": \"0:07-0:08\", \"description\": \"Both arms settle into their resting positions; the person in the background remains still, observing.\"}], \"text_and_signage_elements\": [], \"segments\": [{\"segment_index\": 0, \"time_range\": \"0:00-0:01\", \"description\": \"Establishing view of the lab table with both robotic arms at rest and the red sphere on the top shelf.\", \"key_changes\": \"None; scene is set.\", \"camera\": \"Static high-angle wide shot\"}, {\"segment_index\": 1, \"time_range\": \"0:01-0:05\", \"description\": \"The right robotic hand reaches up, grasps the red sphere, and transfers it to the lower shelf.\", \"key_changes\": \"Red sphere moves from top shelf to bottom shelf.\", \"camera\": \"Static high-angle wide shot\"}, {\"segment_index\": 2, \"time_range\": \"0:05-0:08\", \"description\": \"The right hand retracts to its original position; both arms remain still as the demonstration concludes.\", \"key_changes\": \"Right arm returns to resting pose; scene stabilizes.\", \"camera\": \"Static high-angle wide shot\"}], \"transitions\": [], \"resolution\": {\"W\": 1280, \"H\": 720}, \"aspect_ratio\": \"16,9\", \"duration\": \"0:08\", 
\"fps\": 24}",
+  "input_reference": "https://github.com/nvidia-cosmos/cosmos-dependencies/raw/refs/heads/assets/cosmos3/inputs/vision/robot_153.jpg",
+  "size": "1280x720",
+  "response_format": "url",
+  "nvext": {
+    "num_inference_steps": 35,
+    "num_frames": 192,
+    "fps": 24
+  }
+}
\ No newline at end of file
diff --git a/examples/backends/vllm/launch/cosmos3/t2i.json b/examples/backends/vllm/launch/cosmos3/t2i.json
new file mode 100644
index 000000000000..608856224f57
--- /dev/null
+++ b/examples/backends/vllm/launch/cosmos3/t2i.json
@@ -0,0 +1,11 @@
+{
+  "model": "nvidia/Cosmos3-Nano",
+  "prompt": "{\"scene_imagination\": \"focus: on a modern robotics lab, emphasizing the workspace and equipment. define: the central subject as a robotic arm mounted on a white workbench. visualize: the arm's metallic finish, joints, and gripper in a ready position above objects. refine: the objects it interacts with \\u2013 small colored cubes (red, yellow, green) and black cylinders. analyze: the composition, noting the rule-of-thirds placement of the robot on the right and the laptop on the left. visualize: the background elements, including the wall-mounted monitor showing software UI and a whiteboard with diagrams. define: the lighting as bright, clean fluorescent overhead light, creating a sterile, professional atmosphere. refine: the color scheme to be predominantly white and gray with accents of red, yellow, and green from the objects.\", \"comprehensive_t2i_caption\": \"A high-angle, medium shot of a modern robotics laboratory featuring a sophisticated silver and grey articulated robotic arm positioned over a white workbench. The robotic arm has multiple cylindrical segments connected by visible hinges and features a complex black and silver mechanical gripper at its end-effector. On the white bench surface, several small, colorful geometric blocks\\u2014yellow, blue, and red\\u2014are arranged near the base of the arm. To the left, a black Dell laptop is open, displaying a dark interface, next to a clear plastic container holding black cylindrical components and a small white power supply unit. A large computer monitor on the white wall behind the bench displays a technical user interface with a prominent 'Robotics' logo, a grid of icons, and various data readouts. To the right, a vertical white panel contains two circular control knobs and a digital display. The room is brightly lit by overhead fluorescent lights, casting soft shadows on the smooth, grey-tiled floor. 
In the background, a white dry-erase board holds hand-drawn sketches of mechanical parts, and a second monitor sits on a stand further back. The overall aesthetic is clean, industrial, and highly functional, emphasizing precision engineering and advanced technology.\", \"subjects\": [{\"description\": \"A sophisticated robotic arm with a metallic, silver-toned finish. It consists of multiple articulated segments connected by joints, ending in a complex mechanical gripper or tool head. The arm is mounted on a white workbench and is positioned above a set of small, colored objects.\", \"appearance_details\": \"The arm features visible screws, bolts, and cylindrical joints. The gripper at the end has a multi-fingered or multi-pronged design made of black and silver materials. The overall construction appears modular and industrial.\", \"relationship\": \"The robotic arm is the central focus, interacting with the small colored blocks on the workbench and positioned in front of a laptop and a computer monitor.\", \"location\": \"Center and right foreground\", \"relative_size\": \"Large within frame\", \"orientation\": \"Facing left towards the workbench\", \"pose\": \"The arm is extended horizontally across the workbench, with its gripper hovering just above the objects.\", \"clothing\": \"\", \"expression\": \"\", \"gender\": \"\", \"age\": \"\", \"skin_tone_and_texture\": \"\", \"facial_features\": \"\", \"number_of_subjects\": 0, \"number_of_arms\": 0, \"number_of_legs\": 0, \"number_of_hands\": 0, \"number_of_fingers\": 0}, {\"description\": \"A group of small, solid-colored geometric blocks arranged in a row on the white workbench. They appear to be made of plastic or painted wood.\", \"appearance_details\": \"The blocks are rectangular prisms of varying heights. 
From left to right, they are red, yellow, green, and another red block.\", \"relationship\": \"Positioned directly beneath the robotic arm's gripper, likely serving as targets for manipulation.\", \"location\": \"Left-center foreground on the workbench\", \"relative_size\": \"Small\", \"orientation\": \"Aligned vertically in a line\", \"pose\": \"Stationary on the flat surface\", \"clothing\": \"\", \"expression\": \"\", \"gender\": \"\", \"age\": \"\", \"skin_tone_and_texture\": \"\", \"facial_features\": \"\", \"number_of_subjects\": 4, \"number_of_arms\": 0, \"number_of_legs\": 0, \"number_of_hands\": 0, \"number_of_fingers\": 0}, {\"description\": \"A black laptop computer sitting open on the white workbench. The screen is dark or displaying a dark interface.\", \"appearance_details\": \"Standard clamshell design with a visible keyboard and trackpad. It is positioned to the left of the robotic arm.\", \"relationship\": \"Placed on the same workbench as the robotic arm, likely used for controlling or monitoring the system.\", \"location\": \"Left foreground\", \"relative_size\": \"Medium\", \"orientation\": \"Facing slightly right\", \"pose\": \"Open and resting on the bench\", \"clothing\": \"\", \"expression\": \"\", \"gender\": \"\", \"age\": \"\", \"skin_tone_and_texture\": \"\", \"facial_features\": \"\", \"number_of_subjects\": 0, \"number_of_arms\": 0, \"number_of_legs\": 0, \"number_of_hands\": 0, \"number_of_fingers\": 0}], \"subject_details\": {\"environment_type\": \"Modern robotics research laboratory\", \"primary_equipment\": \"Articulated robotic arm with a mechanical gripper\", \"task_context\": \"Object manipulation demonstration using colored blocks\", \"color_palette\": \"Clean whites and grays with vibrant accent colors\"}, \"background_setting\": \"An indoor laboratory or workshop environment. The walls are white with some areas covered by translucent plastic sheeting. 
There is a large computer monitor on the wall displaying a software interface. To the right, there is a white vertical panel with two circular knobs and a digital display. In the far background, another monitor is visible on a stand, and a whiteboard with hand-drawn diagrams and text is partially visible on the left wall. The floor is a light gray tile or linoleum.\", \"lighting\": {\"conditions\": \"Bright indoor lighting\", \"direction\": \"Top-lit from overhead fluorescent fixtures\", \"shadows\": \"Soft, diffused shadows cast downwards onto the workbench and floor.\", \"illumination_effect\": \"\", \"illumination effect\": \"The scene is evenly illuminated with a cool, neutral tone, typical of a professional workspace.\"}, \"aesthetics\": {\"composition\": \"The robotic arm dominates the right side of the frame, creating a sense of scale and technological complexity. The laptop and blocks provide balance on the left.\", \"color_scheme\": \"Predominantly monochromatic white and gray, punctuated by the primary colors of the blocks (red, yellow, green).\", \"mood_atmosphere\": \"Professional, clean, and focused on technology and precision.\", \"patterns\": \"Geometric patterns are present in the robotic arm's segments and the arrangement of the blocks.\"}, \"cinematography\": {\"framing\": \"Medium shot\", \"camera_angle\": \"Eye-level, slightly angled down towards the workbench\", \"depth_of_field\": \"Deep, with both the foreground robotic arm and background monitor in relatively clear focus.\", \"focus\": \"Sharp focus on the robotic arm and the objects on the bench.\", \"lens_focal_length\": \"Standard wide-angle\"}, \"style_medium\": \"Photograph\", \"artistic_style\": \"Clean industrial photography\", \"context\": \"This image depicts a robotics research or development setting, likely showcasing a prototype or an educational project involving object recognition and manipulation.\", \"text_and_signage_elements\": [{\"text\": \"Robotics\", \"category\": 
\"ui_text\", \"appearance\": \"Bold, sans-serif font, dark blue or black color on a light background.\", \"spatial\": \"Top center of the computer monitor on the wall.\", \"context\": \"Title or header of the software application running on the monitor.\"}, {\"text\": \"System status, battery level, etc.\", \"category\": \"ui_text\", \"appearance\": \"Small, white or light gray text and icons on a dark blue sidebar.\", \"spatial\": \"Right edge of the computer monitor.\", \"context\": \"User interface elements for monitoring the robotic system.\"}, {\"text\": \"Hand-drawn diagrams and notes\", \"category\": \"physical_in_scene\", \"appearance\": \"Black marker lines and illegible handwritten text on a whiteboard.\", \"spatial\": \"Far left background on the wall.\", \"context\": \"Technical schematics or planning notes related to the project.\"}], \"quadrant_scan\": {\"top_left\": \"A white wall features a whiteboard with hand-drawn sketches of mechanical components. A black Dell laptop sits on a white workbench, angled toward the center. A clear plastic container holds black cylindrical objects. The ceiling has recessed fluorescent lighting panels.\", \"top_right\": \"A large computer monitor displays a software interface with a 'Robotics' logo, a grid of icons, and a digital clock reading '10:58'. To the right of the monitor is a white vertical panel with two circular dials and a small digital display. The wall is covered in translucent plastic sheeting.\", \"bottom_left\": \"The lower portion of the white workbench shows a white power supply unit and a black cable management box. Several small, colorful geometric blocks (yellow, blue, red) are scattered on the bench. The floor is a light gray tiled surface.\", \"bottom_right\": \"A complex, articulated robotic arm with a silver and black finish extends from the right side of the frame. Its end effector is a multi-jointed gripper. 
The arm rests on a white platform that appears to be part of a larger machine or workstation.\", \"absolute_center\": \"The focal point is the intersection of the robotic arm's end effector and the white workbench where the colorful blocks are placed. The arm's intricate mechanical joints and the smooth surface of the bench create a contrast between heavy machinery and a clean workspace.\"}, \"resolution\": {\"W\": 960, \"H\": 960}, \"aspect_ratio\": \"1,1\"}",
+  "size": "1024x1024",
+  "n": 1,
+  "nvext": {
+    "num_inference_steps": 50,
+    "guidance_scale": 6.0,
+    "seed": 42
+  }
+}
\ No newline at end of file
diff --git a/examples/backends/vllm/launch/cosmos3/t2v.json b/examples/backends/vllm/launch/cosmos3/t2v.json
new file mode 100644
index 000000000000..2bbb97da5604
--- /dev/null
+++ b/examples/backends/vllm/launch/cosmos3/t2v.json
@@ -0,0 +1,11 @@
+{
+  "model": "nvidia/Cosmos3-Nano",
+  "prompt": "{\"scene_imagination\": \"focus: automated fruit-picking demo in a retail display case. define: wooden multi-compartment case, bananas center, apples left, pears/oranges/carambolas right. visualize: two robotic arms at bottom of frame; left arm static, partially blocking apples; right arm active. refine: red-handled shopping cart with plastic bag to the right. analyze: timing \\u2014 right arm extends, picks pear, places in bag (0:01-0:03), retracts (0:03-0:04); repeats for orange (0:04-0:06) and carambola (0:06-0:07); final return to rest by 0:08. clarify: bright even indoor lighting, clean modern supermarket aesthetic, photoreal. visualize: smooth mechanical motion, grippers closing on fruit, slight sway as item is lifted. refine: camera locked off, eye-level medium-wide, deep focus so all compartments read clearly. analyze: audio is low ambient hum plus soft pneumatic click and bag rustle per pick. clarify: no people visible, only the two robotic arms are subjects.\", \"temporal_caption\": \"At 0:00, the scene opens on a well-lit wooden fruit display case with three compartments: bananas centered, apples on the left, and a mix of pears, oranges, and carambolas on the right. Two robotic arms sit at the bottom of the frame; the left arm is stationary, its gripper hovering just above the apples. The right arm begins to extend toward the right compartment at 0:01, fingers opening. At 0:02, the gripper closes around a greenish-yellow pear. From 0:02 to 0:03, the arm lifts the pear smoothly upward and rotates slightly rightward, aligning it over the red-handled shopping cart beside the case. At 0:03, the pear is released into the open plastic bag, settling among other selected fruit. The arm retracts back to its resting position by 0:04. At 0:05, the right arm re-extends toward the right compartment, this time targeting an orange. At 0:06, the gripper closes around the orange and lifts it. 
By 0:06-0:07, the orange is placed into the same plastic bag, joining the pear. At 0:07, the arm retracts. At 0:08, the right arm returns fully to its initial position, leaving the display case unchanged and the process complete.\", \"audio_description\": \"Low ambient room tone and faint HVAC hum 0:00-0:08. Soft pneumatic hiss as the right arm's joints articulate 0:01-0:02. Gentle fabric/silicone crinkle of the gripper closing around the pear at 0:02. Subtle metallic clink as the pear settles into the plastic bag at 0:03. Light bag-rustle 0:03-0:04. Second joint articulation hiss 0:05-0:06. Gripper close-on-orange at 0:06. Bag rustle as orange drops at 0:07. Final retract hiss 0:07-0:08. No dialogue, no music.\", \"subjects\": [{\"description\": \"Robotic arm with articulated joints and black-and-silver gripper, mounted at the lower edge of the frame\", \"appearance_details\": \"Glossy aluminum segments, thin silver cables along the forearm, black rubberized gripper pads\", \"relationship\": \"Active agent performing fruit-picking tasks; paired with a second identical but idle arm\", \"location\": \"right foreground, extending from bottom edge toward the fruit case\", \"relative_size\": \"Medium within frame\", \"orientation\": \"Reaching diagonally upward and to the right\", \"pose\": \"Extended reach, fingers open then closing around fruit\", \"action\": \"Picks up a pear, then an orange, then a carambola, placing each into a plastic bag\", \"state_changes\": \"Cycles between extended reach, closed grip, lift, rotate, release, and retract\", \"clothing\": \"\", \"expression\": \"\", \"gender\": \"\", \"age\": \"\", \"skin_tone_and_texture\": \"\", \"facial_features\": \"\", \"number_of_subjects\": 2, \"number_of_arms\": 0, \"number_of_legs\": 0}, {\"description\": \"Wooden multi-compartment fruit display case filled with assorted produce\", \"appearance_details\": \"Warm honey-toned wood grain, glass-topped compartments, neatly arranged fruit\", 
\"relationship\": \"Primary object being interacted with by the robotic arms\", \"location\": \"center background filling most of the frame\", \"relative_size\": \"Large within frame\", \"orientation\": \"Facing camera directly\", \"pose\": \"Static fixture\", \"action\": \"Holds bananas, apples, and mixed fruits\", \"state_changes\": \"No significant change.\", \"clothing\": \"\", \"expression\": \"\", \"gender\": \"\", \"age\": \"\", \"skin_tone_and_texture\": \"\", \"facial_features\": \"\", \"number_of_subjects\": 1, \"number_of_arms\": 0, \"number_of_legs\": 0}, {\"description\": \"Red-handled shopping cart with open plastic bag containing selected fruit\", \"appearance_details\": \"Bright red tubular handles, white wire basket frame, clear plastic bag with some printed text\", \"relationship\": \"Receiving container for picked fruit\", \"location\": \"right foreground, adjacent to the fruit case\", \"relative_size\": \"Medium within frame\", \"orientation\": \"Side profile facing left toward the case\", \"pose\": \"Stationary\", \"action\": \"Accumulates a pear, an orange, and a carambola\", \"state_changes\": \"Bag contents increase across the clip\", \"clothing\": \"\", \"expression\": \"\", \"gender\": \"\", \"age\": \"\", \"skin_tone_and_texture\": \"\", \"facial_features\": \"\", \"number_of_subjects\": 1, \"number_of_arms\": 0, \"number_of_legs\": 0}], \"background_setting\": \"Interior of a modern retail grocery store or demonstration kiosk, featuring a polished wooden fruit display case under bright overhead lighting, with the red-handled shopping cart positioned nearby\", \"lighting\": {\"conditions\": \"Bright, even artificial interior lighting typical of a retail environment\", \"direction\": \"Top-down overhead illumination with mild fill from front\", \"shadows\": \"Soft, short shadows beneath the robotic arms and inside the compartments\", \"illumination_effect\": \"Subtle highlights on the glossy aluminum joints and the wet sheen of the 
fruit\"}, \"aesthetics\": {\"composition\": \"Centered symmetrical framing with the fruit case dominating the middle, robotic arms anchoring the lower corners\", \"color_scheme\": \"Warm wood browns, vibrant fruit colors (yellow bananas, red apples, green pears, orange oranges, star-shaped carambolas), accented by the red cart handles\", \"mood_atmosphere\": \"Clean, efficient, futuristic yet familiar \\u2014 showcasing precision automation in everyday retail\", \"patterns\": \"Repeating rows of fruit within the compartment grids\"}, \"cinematography\": {\"camera_motion\": \"Static\", \"framing\": \"Medium-wide shot encompassing the full display case and both robotic arms\", \"camera_angle\": \"Eye-level, straight-on\", \"depth_of_field\": \"Deep\", \"focus\": \"Sharp across the entire display case and robotic arms\", \"lens_focal_length\": \"Standard 35mm equivalent\"}, \"style_medium\": \"Photoreal live-action video\", \"artistic_style\": \"Clean commercial product-demo realism\", \"context\": \"A demonstration of an automated fruit-picking system in a retail setting, highlighting the precision and efficiency of modern robotics\", \"actions\": [{\"time\": \"0:00-0:01\", \"description\": \"Static establishing view of the fruit display case and two robotic arms; left arm idle, right arm begins to extend.\"}, {\"time\": \"0:01-0:03\", \"description\": \"Right robotic arm reaches into the right compartment, grasps a pear, lifts it, and rotates it over the shopping cart.\"}, {\"time\": \"0:03-0:04\", \"description\": \"Pear is released into the plastic bag; arm retracts back to its resting position.\"}, {\"time\": \"0:04-0:06\", \"description\": \"Right arm re-extends, targets an orange, closes its gripper, and lifts it out of the compartment.\"}, {\"time\": \"0:06-0:07\", \"description\": \"Orange is dropped into the plastic bag alongside the pear; arm begins retracting.\"}, {\"time\": \"0:07-0:08\", \"description\": \"Arm completes its return to the original 
position; final state matches the opening frame.\"}], \"text_and_signage_elements\": [], \"segments\": [{\"segment_index\": 0, \"time_range\": \"0:00-0:04\", \"description\": \"Establishing view of the fruit case as the right robotic arm picks a pear and places it in the shopping cart's plastic bag.\", \"key_changes\": \"Pear transferred from compartment to bag; arm transitions from extended to retracted.\", \"camera\": \"Locked-off static medium-wide shot.\"}, {\"segment_index\": 1, \"time_range\": \"0:04-0:08\", \"description\": \"The right robotic arm repeats the cycle, picking an orange and then a carambola, adding them to the bag before returning to its starting position.\", \"key_changes\": \"Orange and carambola added to bag; arm returns to rest.\", \"camera\": \"Continues locked-off static framing.\"}], \"transitions\": [], \"resolution\": {\"W\": 1280, \"H\": 720}, \"aspect_ratio\": \"16,9\", \"duration\": \"0:08\", \"fps\": 24}",
+  "size": "1280x720",
+  "response_format": "url",
+  "nvext": {
+    "num_inference_steps": 35,
+    "num_frames": 192,
+    "fps": 24
+  }
+}
\ No newline at end of file

From 0034bee9c80934e57afa13d3ae66cf363675d5aa Mon Sep 17 00:00:00 2001
From: ayushag <ayushag@nvidia.com>
Date: Fri, 29 May 2026 13:46:24 -0700
Subject: [PATCH 5/9] test(omni): add Cosmos3 tests and refine guide

Signed-off-by: ayushag <ayushag@nvidia.com>
---
 .../dynamo/common/tests/test_video_utils.py   | 64 +++++++++++++++++++
 .../dynamo/vllm/tests/omni/test_omni_args.py  | 14 ++++
 .../vllm/tests/omni/test_omni_base_handler.py | 22 +++++++
 docs/backends/vllm/cosmos3.md                 | 15 +++--
 4 files changed, 108 insertions(+), 7 deletions(-)

diff --git a/components/src/dynamo/common/tests/test_video_utils.py b/components/src/dynamo/common/tests/test_video_utils.py
index fab867fb611c..6e134fd01ff2 100644
--- a/components/src/dynamo/common/tests/test_video_utils.py
+++ b/components/src/dynamo/common/tests/test_video_utils.py
@@ -154,3 +154,67 @@ def test_v2_api_fallback_writes_all_frames(self):
 
             assert writer.append_data.call_count == 4
             writer.close.assert_called_once()
+
+
+# ---------------------------------------------------------------------------
+# normalize_image_frames
+# ---------------------------------------------------------------------------
+
+
+class TestNormalizeImageFrames:
+    """Tests for normalize_image_frames() — flattens DiffusionFormatter image
+    inputs to PIL Images. Image pipelines usually emit PIL Images; the Cosmos3
+    native pipeline emits 5D numpy arrays shaped ``[B, F, H, W, C]``."""
+
+    def test_pil_inputs_returned_by_identity(self):
+        """PIL inputs must pass through without conversion or copy."""
+        from PIL import Image
+
+        from dynamo.common.utils.video_utils import normalize_image_frames
+
+        a = Image.new("RGB", (4, 4), (255, 0, 0))
+        b = Image.new("RGB", (4, 4), (0, 255, 0))
+        out = normalize_image_frames([a, b])
+
+        assert len(out) == 2
+        assert out[0] is a and out[1] is b
+
+    def test_uint8_hwc_numpy_preserves_pixels(self):
+        from PIL import Image
+
+        from dynamo.common.utils.video_utils import normalize_image_frames
+
+        arr = np.full((4, 4, 3), 7, dtype=np.uint8)
+        out = normalize_image_frames([arr])
+
+        assert len(out) == 1
+        assert isinstance(out[0], Image.Image)
+        assert out[0].size == (4, 4)  # PIL is (W, H)
+        assert np.asarray(out[0])[0, 0].tolist() == [7, 7, 7]
+
+    def test_cosmos3_5d_strips_batch_and_preserves_frame_order(self):
+        """[B, F, H, W, C] collapses to F PIL frames in order. Distinct
+        per-frame content guards against wrong-axis indexing regressions."""
+        from dynamo.common.utils.video_utils import normalize_image_frames
+
+        arr = np.zeros((1, 3, 4, 4, 3), dtype=np.uint8)
+        arr[0, 0] = 10  # frame 0 fill
+        arr[0, 1] = 20  # frame 1 fill
+        arr[0, 2] = 30  # frame 2 fill
+
+        out = normalize_image_frames([arr])
+
+        assert len(out) == 3
+        assert np.asarray(out[0])[0, 0, 0] == 10
+        assert np.asarray(out[1])[0, 0, 0] == 20
+        assert np.asarray(out[2])[0, 0, 0] == 30
+
+    def test_float_zero_to_one_scaled_to_uint8(self):
+        """float32 [0, 1] inputs must be rescaled to uint8 [0, 255]."""
+        from dynamo.common.utils.video_utils import normalize_image_frames
+
+        arr = np.full((4, 4, 3), 0.5, dtype=np.float32)
+        out = normalize_image_frames([arr])
+
+        # 0.5 * 255 = 127.5; numpy rounds half to even, so this yields exactly 128.
+        assert np.asarray(out[0])[0, 0, 0] == 128
diff --git a/components/src/dynamo/vllm/tests/omni/test_omni_args.py b/components/src/dynamo/vllm/tests/omni/test_omni_args.py
index 92380e489412..22b5213c41cb 100644
--- a/components/src/dynamo/vllm/tests/omni/test_omni_args.py
+++ b/components/src/dynamo/vllm/tests/omni/test_omni_args.py
@@ -75,6 +75,7 @@ def _make_omni_config(**overrides) -> OmniConfig:
         "tts_ref_audio_max_bytes": 50 * 1024 * 1024,
         "stage_id": None,
         "omni_router": False,
+        "cosmos3_guardrails": True,
     }
     flat_defaults.update(flat_overrides)
 
@@ -191,3 +192,16 @@ def test_omni_config_imports_cleanly():
 
     assert OmniConfig is not None
     assert callable(parse_omni_args)
+
+
+# --- Cosmos3 guardrails ---
+
+
+def test_omni_config_cosmos3_guardrails_default_enabled():
+    assert OmniConfig.cosmos3_guardrails is True
+
+
+def test_omni_config_cosmos3_guardrails_overridable():
+    config = _make_omni_config(cosmos3_guardrails=False)
+    assert config.cosmos3_guardrails is False
+    config.validate()  # disabling guardrails must not fail validation
diff --git a/components/src/dynamo/vllm/tests/omni/test_omni_base_handler.py b/components/src/dynamo/vllm/tests/omni/test_omni_base_handler.py
index f27dddda4c27..2ac2f3886000 100644
--- a/components/src/dynamo/vllm/tests/omni/test_omni_base_handler.py
+++ b/components/src/dynamo/vllm/tests/omni/test_omni_base_handler.py
@@ -101,3 +101,25 @@ def test_output_modalities_forwarded_to_async_omni(self):
         kwargs = _build_kwargs(config)
 
         assert kwargs["output_modalities"] == ["image"]
+
+
+class TestCosmos3Guardrails:
+    """`cosmos3_guardrails=False` should route into omni_kwargs as
+    ``model_config={"guardrails": False}``; the default (True) leaves
+    model_config untouched so vllm-omni applies its own default."""
+
+    def test_disabled_routes_into_model_config(self):
+        config = _make_config()
+        config.cosmos3_guardrails = False
+
+        kwargs = _build_kwargs(config)
+
+        assert kwargs.get("model_config") == {"guardrails": False}
+
+    def test_enabled_does_not_set_model_config(self):
+        config = _make_config()
+        config.cosmos3_guardrails = True
+
+        kwargs = _build_kwargs(config)
+
+        assert "model_config" not in kwargs
diff --git a/docs/backends/vllm/cosmos3.md b/docs/backends/vllm/cosmos3.md
index dc3a79278ea7..e57f3279d38c 100644
--- a/docs/backends/vllm/cosmos3.md
+++ b/docs/backends/vllm/cosmos3.md
@@ -8,14 +8,15 @@ Run NVIDIA's **Cosmos3** omni model through Dynamo's
 [vLLM-Omni backend](vllm-omni.md) for **text-to-image**, **text-to-video**, and
 **image-to-video** generation.
 
-Cosmos3 is a unified world foundation model (WFM) for Physical AI, built on a
-Mixture-of-Transformers (MoT) architecture. A single `Cosmos3OmniTransformer`
-runs a Qwen-style "understanding" stream alongside a "generation" stream
-joined by a 3D multimodal RoPE, replacing the separate Predict / Reason /
-Transfer models from earlier Cosmos releases. See the
+Cosmos3 is a unified world foundation model for Physical AI, built on a
+Mixture-of-Transformers architecture. A single `Cosmos3OmniTransformer` runs
+a Qwen-style "understanding" stream alongside a "generation" stream joined
+by a 3D multimodal RoPE, replacing the separate Predict / Reason / Transfer
+models from earlier Cosmos releases. See the
 [Cosmos World Foundation Model Platform paper](https://huggingface.co/papers/2501.03575)
 for the architectural background, and the
-[diffusers Cosmos3 reference](https://huggingface.co/docs/diffusers/main/en/api/pipelines/cosmos3) for the underlying pipeline.
+[diffusers Cosmos3 reference](https://huggingface.co/docs/diffusers/main/en/api/pipelines/cosmos3)
+for the underlying pipeline.
 
 Cosmos3 support in Dynamo is provided by the native vLLM-Omni pipeline added in
 [vllm-project/vllm-omni#3454](https://github.com/vllm-project/vllm-omni/pull/3454).
@@ -27,7 +28,7 @@ swap the model identifier on the worker (`--model …`) and in request payloads.
 
 | Checkpoint | Description | HF Hub |
 |------------|-------------|--------|
-| `nvidia/Cosmos3-Nano` | Smaller, faster — default in this repo's launch scripts | [link](https://huggingface.co/nvidia/Cosmos3-Nano) |
+| `nvidia/Cosmos3-Nano` | Smaller, faster — default in the Dynamo launch scripts below | [link](https://huggingface.co/nvidia/Cosmos3-Nano) |
 | `nvidia/Cosmos3-Super` | Larger, higher quality | [link](https://huggingface.co/nvidia/Cosmos3-Super) |
 
 ## Supported modalities

From 001eacbe3d1edcac484910b5c678a24df5f7a73e Mon Sep 17 00:00:00 2001
From: Harrison King Saturley-Hall <hsaturleyhal@nvidia.com>
Date: Sun, 31 May 2026 04:09:41 -0400
Subject: [PATCH 6/9] feat(omni): install vllm-omni from PR #3454 for Cosmos3
 support

Cosmos3 pipelines are only in the unreleased vllm-omni PR
vllm-project/vllm-omni#3454, not in any released wheel. Re-enable the
git-install mechanism (reverted in 7744835c195) so the vllm-runtime
container installs vllm-omni from the canonical repo pinned to the
current PR head SHA (65b83d87, == refs/pull/3454/head).

When vllm_omni_git_url is set, install_vllm_omni.sh installs
"vllm-omni @ git+<url>@<ref>"; otherwise it falls back to the released
"vllm-omni==<ref>" wheel.

Signed-off-by: Harrison King Saturley-Hall <hsaturleyhal@nvidia.com>
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 container/context.yaml                      | 10 +++++++++-
 container/deps/vllm/install_vllm_omni.sh    | 14 +++++++++++---
 container/templates/args.Dockerfile         |  1 +
 container/templates/vllm_runtime.Dockerfile |  1 +
 4 files changed, 22 insertions(+), 4 deletions(-)

diff --git a/container/context.yaml b/container/context.yaml
index 0a3c1a777316..b6f397da160e 100644
--- a/container/context.yaml
+++ b/container/context.yaml
@@ -63,7 +63,15 @@ vllm:
     base_image_tag: 22.04
     runtime_image_tag: v0.21.0
   flashinf_ref: v0.6.8.post1
-  vllm_omni_ref: "v0.21.0rc1"
+  # Cosmos3 support is not yet in a released vllm-omni; install from the
+  # canonical repo pinned to the PR head commit (vllm-project/vllm-omni#3454,
+  # == refs/pull/3454/head). When vllm_omni_git_url is set, install_vllm_omni.sh
+  # installs from git@ref; otherwise it falls back to "vllm-omni==<ref>".
+  vllm_omni_ref: "65b83d87ad786aa786b248f0242e2ed1b4a8161f"
+  # If vllm_omni_git_url is defined, vllm-omni is NOT installed from PyPI; it is
+  # installed from the git commit SHA (or ref) defined in vllm_omni_ref above.
+  # Leave it unset/empty to install the released "vllm-omni==<vllm_omni_ref>" wheel.
+  vllm_omni_git_url: "https://github.com/vllm-project/vllm-omni.git"
   nixl_ref: v1.1.0
   max_jobs: "10"
   enable_media_ffmpeg: "false"
diff --git a/container/deps/vllm/install_vllm_omni.sh b/container/deps/vllm/install_vllm_omni.sh
index 83b08875a621..c8dad9cba3cf 100755
--- a/container/deps/vllm/install_vllm_omni.sh
+++ b/container/deps/vllm/install_vllm_omni.sh
@@ -9,7 +9,15 @@ set -euo pipefail
 VLLM_OMNI_PROTECTED_PACKAGES_FILE="${VLLM_OMNI_PROTECTED_PACKAGES_FILE:-/tmp/vllm_omni_protected_packages.txt}"
 
 PROTECTED_CONSTRAINTS="$(mktemp /tmp/vllm-openai-protected.XXXXXX.txt)"
-VLLM_OMNI_VERSION="${VLLM_OMNI_REF#v}"
+
+# When VLLM_OMNI_GIT_URL is set, install from a git ref (e.g. an unreleased PR
+# commit); otherwise fall back to the matching PyPI release.
+VLLM_OMNI_GIT_URL="${VLLM_OMNI_GIT_URL:-}"
+if [ -n "${VLLM_OMNI_GIT_URL}" ]; then
+  VLLM_OMNI_SPEC="vllm-omni @ git+${VLLM_OMNI_GIT_URL}@${VLLM_OMNI_REF}"
+else
+  VLLM_OMNI_SPEC="vllm-omni==${VLLM_OMNI_REF#v}"
+fi
 
 cleanup() {
   rm -rf "${PROTECTED_CONSTRAINTS}"
@@ -41,11 +49,11 @@ if [ "${VLLM_OMNI_TARGET_DEVICE}" = "cuda" ]; then
   uv pip install --system \
     --prerelease=allow \
     --constraints "${PROTECTED_CONSTRAINTS}" \
-    "vllm-omni==${VLLM_OMNI_VERSION}"
+    "${VLLM_OMNI_SPEC}"
 else
   uv pip install \
     --prerelease=allow \
     --constraints "${PROTECTED_CONSTRAINTS}" \
-    "vllm-omni==${VLLM_OMNI_VERSION}"
+    "${VLLM_OMNI_SPEC}"
 fi
 
diff --git a/container/templates/args.Dockerfile b/container/templates/args.Dockerfile
index 4e21b5ba1ea2..f5d7aaf14bcb 100644
--- a/container/templates/args.Dockerfile
+++ b/container/templates/args.Dockerfile
@@ -99,6 +99,7 @@ ARG MAX_JOBS={{ context.vllm.max_jobs }}
 ARG FLASHINF_REF={{ context.vllm.flashinf_ref }}
 {% endif %}
 ARG VLLM_OMNI_REF={{ context.vllm.vllm_omni_ref }}
+ARG VLLM_OMNI_GIT_URL={{ context.vllm.vllm_omni_git_url }}
 
 {% if device == "cuda" -%}
 # If left blank, then we will fallback to vLLM defaults
diff --git a/container/templates/vllm_runtime.Dockerfile b/container/templates/vllm_runtime.Dockerfile
index 71cccc155aaa..c82186bf0162 100644
--- a/container/templates/vllm_runtime.Dockerfile
+++ b/container/templates/vllm_runtime.Dockerfile
@@ -19,6 +19,7 @@ ARG PYTHON_VERSION
 ARG ENABLE_KVBM
 ARG ENABLE_GPU_MEMORY_SERVICE
 ARG VLLM_OMNI_REF
+ARG VLLM_OMNI_GIT_URL
 ARG NIXL_REF
 {% if device == "cuda" %}
 ARG CUDA_MAJOR

From 22d56b960660cc89480ea8de9489adda4ddd1112 Mon Sep 17 00:00:00 2001
From: Harrison Saturley-Hall <hsaturleyhal@nvidia.com>
Date: Fri, 29 May 2026 16:01:38 -0400
Subject: [PATCH 7/9] chore(container): build in-tree ffmpeg CLI and route
 imageio through it (#10091)

Co-authored-by: Claude Opus 4.7 <noreply@anthropic.com>
(cherry picked from commit dc2f3521455feed94c052d20612c8810fb6d14be)
---
 .../dynamo/common/tests/test_video_utils.py   |  4 +-
 .../src/dynamo/common/utils/video_utils.py    | 12 ++---
 .../video_generation_handler.py               |  2 +-
 container/context.yaml                        |  6 +++
 container/deps/requirements.common.txt        |  9 +++-
 container/deps/requirements.sglang.txt        | 12 +++++
 container/deps/requirements.trtllm.txt        |  7 ++-
 container/deps/requirements.vllm.txt          | 14 ++++++
 container/templates/args.Dockerfile           |  6 +++
 container/templates/dynamo_base.Dockerfile    |  2 +-
 container/templates/dynamo_runtime.Dockerfile |  8 +++-
 container/templates/sglang_runtime.Dockerfile | 30 ++++++++++---
 container/templates/trtllm_runtime.Dockerfile | 18 ++++++++
 container/templates/vllm_runtime.Dockerfile   | 12 +++++
 container/templates/wheel_builder.Dockerfile  | 44 +++++++++++++++----
 docs/backends/trtllm/trtllm-diffusion.md      |  6 ++-
 16 files changed, 163 insertions(+), 29 deletions(-)
 create mode 100644 container/deps/requirements.sglang.txt
 create mode 100644 container/deps/requirements.vllm.txt

diff --git a/components/src/dynamo/common/tests/test_video_utils.py b/components/src/dynamo/common/tests/test_video_utils.py
index 6e134fd01ff2..3b240710082f 100644
--- a/components/src/dynamo/common/tests/test_video_utils.py
+++ b/components/src/dynamo/common/tests/test_video_utils.py
@@ -41,7 +41,7 @@ def _mock_iio_v2(self):
         iio.get_writer = MagicMock(return_value=writer)
         return iio, writer
 
-    def test_mp4_selects_libx264_codec(self):
+    def test_mp4_selects_h264_nvenc_codec(self):
         from dynamo.common.utils.video_utils import encode_to_video_bytes
 
         iio = self._mock_iio_v3()
@@ -56,7 +56,7 @@ def test_mp4_selects_libx264_codec(self):
 
             iio.imwrite.assert_called_once()
             _, kwargs = iio.imwrite.call_args
-            assert kwargs.get("codec") == "libx264"
+            assert kwargs.get("codec") == "h264_nvenc"
             assert kwargs.get("fps") == 8
 
     def test_webm_selects_libvpx_vp9_codec(self):
diff --git a/components/src/dynamo/common/utils/video_utils.py b/components/src/dynamo/common/utils/video_utils.py
index cddd7655114d..d6ce6dd826bb 100644
--- a/components/src/dynamo/common/utils/video_utils.py
+++ b/components/src/dynamo/common/utils/video_utils.py
@@ -182,13 +182,15 @@ def encode_to_mp4(
     logger.info(f"Encoding {len(frames)} frames to {output_path} at {fps} fps")
 
     try:
-        # Use imageio to write MP4
-        # imageio.v3 API
+        # Use imageio to write MP4. We use h264_nvenc (NVIDIA HW encoder) instead
+        # of libx264 because the in-tree ffmpeg build is LGPL-only and libx264
+        # is GPL-licensed; see container/templates/wheel_builder.Dockerfile.
+        # Requires an NVIDIA GPU and driver with NVENC support at runtime.
         if hasattr(iio, "imwrite"):
-            iio.imwrite(output_path, frames, fps=fps, codec="libx264")
+            iio.imwrite(output_path, frames, fps=fps, codec="h264_nvenc")
         else:
             # Fall back to v2 API
-            writer = iio.get_writer(output_path, fps=fps, codec="libx264")  # type: ignore[attr-defined]
+            writer = iio.get_writer(output_path, fps=fps, codec="h264_nvenc")  # type: ignore[attr-defined]
             try:
                 for frame in frames:
                     writer.append_data(frame)
@@ -243,7 +245,7 @@ def encode_to_video_bytes(
         if output_format == "webm":
             kwargs["codec"] = "libvpx-vp9"
         elif output_format == "mp4":
-            kwargs["codec"] = "libx264"
+            kwargs["codec"] = "h264_nvenc"
         else:
             raise ValueError(f"No codec specified for response format: {output_format}")
 
diff --git a/components/src/dynamo/sglang/request_handlers/video_generation/video_generation_handler.py b/components/src/dynamo/sglang/request_handlers/video_generation/video_generation_handler.py
index 0a2d516c4de2..8ad5c6044b4c 100644
--- a/components/src/dynamo/sglang/request_handlers/video_generation/video_generation_handler.py
+++ b/components/src/dynamo/sglang/request_handlers/video_generation/video_generation_handler.py
@@ -259,7 +259,7 @@ async def _generate_video(
         return video_bytes
 
     async def _frames_to_video(
-        self, frames: list, fps: int, codec: str = "libx264"
+        self, frames: list, fps: int, codec: str = "h264_nvenc"
     ) -> bytes:
         """Convert list of frames to video bytes.
 
diff --git a/container/context.yaml b/container/context.yaml
index b6f397da160e..4fee22a94c32 100644
--- a/container/context.yaml
+++ b/container/context.yaml
@@ -39,6 +39,10 @@ dynamo:
   enable_media_ffmpeg: "false"
   enable_gpu_memory_service: "true"
   ffmpeg_version: "8.1"
+  # ffmpeg build inputs (only consumed when ENABLE_MEDIA_FFMPEG=true).
+  nv_codec_headers_ref: "n13.0.19.0"
+  libvpx_ref: "v1.14.1"
+  sccache_version: "v0.14.0"
   efa_version: 1.47.0
 
 vllm:
@@ -79,6 +83,8 @@ vllm:
   enable_kvbm: "true"
   enable_modelexpress_p2p: "false"
   modelexpress_ref: "76fc5d7f06c37121ee8789a29fac6f9b08c4743a"  # v0.3.0
+  # aws-sdk-cpp tag for the NIXL OBJ / S3 backend (built in wheel_builder).
+  aws_sdk_cpp_version: "1.11.760"
 
 sglang:
   cuda12.9:
diff --git a/container/deps/requirements.common.txt b/container/deps/requirements.common.txt
index 7265f7cd5401..d04cb50bc344 100644
--- a/container/deps/requirements.common.txt
+++ b/container/deps/requirements.common.txt
@@ -4,6 +4,13 @@
 # Core runtime dependencies shared by ALL Dynamo containers.
 # See README.md in this directory for version pinning strategy.
 
+# Force a source install of imageio-ffmpeg (pure-Python wrapper). The PyPI wheel
+# bundles a prebuilt, GPL-encumbered ffmpeg binary in <site-packages>/imageio_ffmpeg/binaries/
+# that has CVE exposure; we point imageio at the in-tree LGPL ffmpeg CLI via
+# IMAGEIO_FFMPEG_EXE instead. This directive is honored by pip and uv when this
+# file is passed via --requirement, and applies to the whole install.
+--no-binary imageio-ffmpeg
+
 aiohttp>=3.9.0,<4.0
 fastapi==0.120.1
 grpcio-tools<=1.76.0  # May have platform-specific builds; pins grpcio ecosystem version
@@ -11,7 +18,7 @@ httpx==0.28.1
 
 # Video generation: encode frames to MP4 (used by TRT-LLM, vLLM-Omni, SGLang diffusion)
 imageio>=2.37.0
-imageio-ffmpeg>=0.6.0
+imageio-ffmpeg>=0.6.0  # binary skipped per --no-binary directive at top of file
 # Shared plotting utility used by runtime diagnostics and benchmark tooling.
 matplotlib==3.10.7
 msgspec==0.19.0
diff --git a/container/deps/requirements.sglang.txt b/container/deps/requirements.sglang.txt
new file mode 100644
index 000000000000..bc2e5679d398
--- /dev/null
+++ b/container/deps/requirements.sglang.txt
@@ -0,0 +1,12 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Third-party Python dependencies for the sglang runtime image. Installed
+# with --force-reinstall --no-deps to replace the upstream lmsysorg/sglang
+# base image's imageio-ffmpeg wheel (which ships a GPL-encumbered prebuilt
+# ffmpeg binary) with a source build that leaves no binary on disk.
+# IMAGEIO_FFMPEG_EXE points imageio at the in-tree LGPL ffmpeg CLI.
+
+--no-binary imageio-ffmpeg
+
+imageio-ffmpeg>=0.6.0  # binary skipped per --no-binary directive at top of file
diff --git a/container/deps/requirements.trtllm.txt b/container/deps/requirements.trtllm.txt
index 4f93903e2157..3952a6deec70 100644
--- a/container/deps/requirements.trtllm.txt
+++ b/container/deps/requirements.trtllm.txt
@@ -5,10 +5,15 @@
 # with --no-deps so upstream nvcr.io/nvidia/tensorrt-llm/release's solve
 # stays intact. Sorted alphabetically per pre-commit requirements-txt-fixer.
 
+# Force a source install of imageio-ffmpeg. The PyPI wheel bundles a prebuilt,
+# GPL-encumbered ffmpeg binary that has CVE exposure; we point imageio at the
+# in-tree LGPL ffmpeg CLI via IMAGEIO_FFMPEG_EXE instead.
+--no-binary imageio-ffmpeg
+
 # Used by the trtllm video_diffusion handler to encode generated frames to MP4.
 # Upstream tensorrt-llm/release does not ship them.
 imageio>=2.37.0
-imageio-ffmpeg>=0.6.0
+imageio-ffmpeg>=0.6.0  # binary skipped per --no-binary directive at top of file
 # Required by ai_dynamo_runtime + gpu_memory_service. Upstream tensorrt-llm/release
 # does not ship them; vllm/vllm-openai does (which is why DYN-2204's vllm path
 # does not need this).
diff --git a/container/deps/requirements.vllm.txt b/container/deps/requirements.vllm.txt
new file mode 100644
index 000000000000..5e596820f44d
--- /dev/null
+++ b/container/deps/requirements.vllm.txt
@@ -0,0 +1,14 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Third-party Python dependencies for the vllm runtime image. Installed
+# with --reinstall-package imageio-ffmpeg --no-deps to replace the upstream
+# vllm/vllm-openai base image's imageio-ffmpeg wheel (which ships a
+# GPL-encumbered prebuilt ffmpeg binary) with a source build that leaves
+# no binary on disk. vLLM-Omni uses diffusers.export_to_video and doesn't
+# invoke imageio-ffmpeg, so no IMAGEIO_FFMPEG_EXE is needed — this is
+# purely to clear the GPL binary.
+
+--no-binary imageio-ffmpeg
+
+imageio-ffmpeg>=0.6.0  # binary skipped per --no-binary directive at top of file
diff --git a/container/templates/args.Dockerfile b/container/templates/args.Dockerfile
index f5d7aaf14bcb..999478cf8d4c 100644
--- a/container/templates/args.Dockerfile
+++ b/container/templates/args.Dockerfile
@@ -55,12 +55,15 @@ ARG ETCD_VERSION={{ context.dynamo.etcd_version }}
 
 ARG ENABLE_MEDIA_FFMPEG={{ context[framework].enable_media_ffmpeg }}
 ARG FFMPEG_VERSION={{ context.dynamo.ffmpeg_version }}
+ARG NV_CODEC_HEADERS_REF={{ context.dynamo.nv_codec_headers_ref }}
+ARG LIBVPX_REF={{ context.dynamo.libvpx_ref }}
 {% if device == "cuda" -%}
 ARG ENABLE_GPU_MEMORY_SERVICE={{ context[framework].enable_gpu_memory_service }}
 {% endif %}
 
 # SCCACHE configuration
 ARG USE_SCCACHE
+ARG SCCACHE_VERSION={{ context.dynamo.sccache_version }}
 ARG SCCACHE_BUCKET=""
 ARG SCCACHE_REGION=""
 
@@ -108,6 +111,9 @@ ARG DEEPGEMM_REF=""
 # ModelExpress for P2P weight transfer (optional)
 ARG ENABLE_MODELEXPRESS_P2P={{ context.vllm.enable_modelexpress_p2p }}
 ARG MODELEXPRESS_REF={{ context.vllm.modelexpress_ref }}
+
+# aws-sdk-cpp tag for the NIXL OBJ / S3 backend (built in wheel_builder).
+ARG AWS_SDK_CPP_VERSION={{ context.vllm.aws_sdk_cpp_version }}
 {% endif %}
 {%- endif -%}
 
diff --git a/container/templates/dynamo_base.Dockerfile b/container/templates/dynamo_base.Dockerfile
index dca5b95b32f8..f38d61cdda35 100644
--- a/container/templates/dynamo_base.Dockerfile
+++ b/container/templates/dynamo_base.Dockerfile
@@ -22,7 +22,7 @@ RUN apt clean && apt-get update -y && \
 
 # Install sccache into the base image so downstream stages can COPY it
 # instead of downloading from GitHub (avoids 502 errors under parallel builds)
-ARG SCCACHE_VERSION=v0.14.0
+ARG SCCACHE_VERSION
 RUN ARCH_ALT=$([ "${TARGETARCH}" = "amd64" ] && echo "x86_64" || echo "aarch64") && \
     wget --tries=3 --waitretry=5 \
         "https://github.com/mozilla/sccache/releases/download/${SCCACHE_VERSION}/sccache-${SCCACHE_VERSION}-${ARCH_ALT}-unknown-linux-musl.tar.gz" && \
diff --git a/container/templates/dynamo_runtime.Dockerfile b/container/templates/dynamo_runtime.Dockerfile
index eb2863c3213b..ef3cb0f74c30 100644
--- a/container/templates/dynamo_runtime.Dockerfile
+++ b/container/templates/dynamo_runtime.Dockerfile
@@ -42,13 +42,17 @@ COPY --chown=dynamo: --from=wheel_builder ${NIXL_PREFIX}/ ${NIXL_PREFIX}/
 COPY --chown=dynamo: --from=wheel_builder /opt/dynamo/dist/nixl/ /opt/dynamo/wheelhouse/nixl/
 COPY --chown=dynamo: --from=wheel_builder /workspace/nixl/build/src/bindings/python/nixl-meta/nixl-*.whl /opt/dynamo/wheelhouse/nixl/
 
-# Always copy FFmpeg so libs are available for Rust checks in CI
+# Always copy FFmpeg so libs are available for Rust checks in CI.
+# libvpx.so* is copied too (best-effort, in a subshell so `|| true` cannot mask
+# failures of the required copies): libavcodec.so depends on libvpx.so.9.
 RUN --mount=type=bind,from=wheel_builder,source=/usr/local/,target=/tmp/usr/local/ \
     mkdir -p /usr/local/lib/pkgconfig && \
     cp -rnL /tmp/usr/local/include/libav* /tmp/usr/local/include/libsw* /usr/local/include/ && \
     cp -nL /tmp/usr/local/lib/libav*.so /tmp/usr/local/lib/libsw*.so /usr/local/lib/ && \
+    (cp -nL /tmp/usr/local/lib/lib*vpx*.so* /usr/local/lib/ 2>/dev/null || true) && \
     cp -nL /tmp/usr/local/lib/pkgconfig/libav*.pc /tmp/usr/local/lib/pkgconfig/libsw*.pc /usr/local/lib/pkgconfig/ && \
-    cp -r /tmp/usr/local/src/ffmpeg /usr/local/src/
+    cp -r /tmp/usr/local/src/ffmpeg /usr/local/src/ && \
+    ldconfig
 
 {% if target not in ("dev", "local-dev") %}
 # Copy built artifacts (not needed for dev/local-dev; users build from source)
diff --git a/container/templates/sglang_runtime.Dockerfile b/container/templates/sglang_runtime.Dockerfile
index 0ee140c4ec72..b138daac91d7 100644
--- a/container/templates/sglang_runtime.Dockerfile
+++ b/container/templates/sglang_runtime.Dockerfile
@@ -29,15 +29,24 @@ RUN userdel -r ubuntu > /dev/null 2>&1 || true \
     # NOTE: Setting ENV UMASK=002 does NOT work - umask is a shell builtin, not an environment variable
     && mkdir -p /etc/profile.d && echo 'umask 002' > /etc/profile.d/00-umask.sh
 
-{% if context.sglang.enable_media_ffmpeg == "true" %}
-# Copy ffmpeg
+# Copy ffmpeg from wheel_builder: versioned shared libs (libav*.so*,
+# libsw*.so*) for the Rust media-ffmpeg decoder, plus the LGPL CLI binary
+# (built with h264_nvenc + libvpx_vp9 encoders) that imageio targets via
+# IMAGEIO_FFMPEG_EXE for video encoding. Ungated by enable_media_ffmpeg
+# because the upstream lmsysorg/sglang base image always ships
+# imageio-ffmpeg with a GPL-encumbered prebuilt binary that we replace
+# unconditionally below, so imageio needs the LGPL CLI to target. Only the
+# libvpx copy is best-effort; its `|| true` is confined to a subshell.
 RUN --mount=type=bind,from=wheel_builder,source=/usr/local/,target=/tmp/usr/local/ \
     mkdir -p /usr/local/lib/pkgconfig && \
     cp -rnL /tmp/usr/local/include/libav* /tmp/usr/local/include/libsw* /usr/local/include/ && \
-    cp -nL /tmp/usr/local/lib/libav*.so /tmp/usr/local/lib/libsw*.so /usr/local/lib/ && \
+    cp -nL /tmp/usr/local/lib/libav*.so* /tmp/usr/local/lib/libsw*.so* /usr/local/lib/ && \
+    (cp -nL /tmp/usr/local/lib/lib*vpx*.so* /usr/local/lib/ 2>/dev/null || true) && \
     cp -nL /tmp/usr/local/lib/pkgconfig/libav*.pc /tmp/usr/local/lib/pkgconfig/libsw*.pc /usr/local/lib/pkgconfig/ && \
-    cp -r /tmp/usr/local/src/ffmpeg /usr/local/src/
-{% endif %}
+    cp -nL /tmp/usr/local/bin/ffmpeg /usr/local/bin/ffmpeg && \
+    cp -r /tmp/usr/local/src/ffmpeg /usr/local/src/ && \
+    ldconfig
+ENV IMAGEIO_FFMPEG_EXE=/usr/local/bin/ffmpeg
 
 {% if target not in ("dev", "local-dev") %}
 # Runtime target installs only the prebuilt Dynamo wheels. SGLang and its NIXL
@@ -86,6 +95,17 @@ RUN --mount=type=bind,source=./container/deps/requirements.common.txt,target=/tm
     export PIP_CACHE_DIR=/root/.cache/pip && \
     pip install --break-system-packages --no-deps $(grep -E '^nvtx==' /tmp/requirements.common.txt)
 
+# Replace the upstream lmsysorg/sglang image's imageio-ffmpeg (which ships a
+# GPL-encumbered prebuilt ffmpeg binary in <site-packages>/imageio_ffmpeg/binaries/)
+# with a source install that leaves no binary on disk. IMAGEIO_FFMPEG_EXE points
+# imageio at the LGPL CLI we copied from wheel_builder above. The --no-binary
+# directive lives in the requirements file itself.
+RUN --mount=type=bind,source=./container/deps/requirements.sglang.txt,target=/tmp/requirements.sglang.txt \
+    --mount=type=cache,target=/root/.cache/pip,sharing=locked \
+    export PIP_CACHE_DIR=/root/.cache/pip && \
+    pip install --break-system-packages --force-reinstall --no-deps \
+        --requirement /tmp/requirements.sglang.txt
+
 # Copy tests, deploy and components for CI with correct ownership
 COPY --chmod=775 --chown=dynamo:0 tests /workspace/tests
 COPY --chmod=775 --chown=dynamo:0 examples /workspace/examples
diff --git a/container/templates/trtllm_runtime.Dockerfile b/container/templates/trtllm_runtime.Dockerfile
index 5285fa2dbf21..617a95287a91 100644
--- a/container/templates/trtllm_runtime.Dockerfile
+++ b/container/templates/trtllm_runtime.Dockerfile
@@ -127,6 +127,9 @@ RUN --mount=type=cache,target=/root/.cache/uv,sharing=locked \
     \
     # Third-party deps Dynamo wheels declare but upstream lacks, plus the
     # huggingface-hub pin and KVBM-matching nixl-cu13. See the file for context.
+    # The requirements.trtllm.txt file itself carries a `--no-binary imageio-ffmpeg`
+    # directive that keeps the GPL-encumbered prebuilt ffmpeg off disk; IMAGEIO_FFMPEG_EXE
+    # below points imageio at the in-tree LGPL CLI.
     uv pip install --no-deps --requirement /tmp/requirements.trtllm.txt && \
     \
     if [ "${ENABLE_KVBM}" = "true" ]; then \
@@ -143,6 +146,20 @@ RUN --mount=type=cache,target=/root/.cache/uv,sharing=locked \
     fi
 {% endif %}
 
+# Copy the in-tree LGPL ffmpeg from wheel_builder. The TRT-LLM diffusion handler
+# always encodes video (video_handler.py:263 → encode_to_video_bytes), so the
+# CLI and its libav* / libvpx runtime libs need to be present in this image and
+# imageio must be pointed at it via IMAGEIO_FFMPEG_EXE. Ungated by
+# enable_media_ffmpeg because TRT-LLM unconditionally needs the encoder.
+RUN --mount=type=bind,from=wheel_builder,source=/usr/local/,target=/tmp/usr/local/ \
+    cp -nL /tmp/usr/local/lib/libav*.so* /usr/local/lib/ 2>/dev/null || true && \
+    cp -nL /tmp/usr/local/lib/libsw*.so* /usr/local/lib/ 2>/dev/null || true && \
+    cp -nL /tmp/usr/local/lib/lib*vpx*.so* /usr/local/lib/ 2>/dev/null || true && \
+    cp -nL /tmp/usr/local/bin/ffmpeg /usr/local/bin/ffmpeg && \
+    cp -r /tmp/usr/local/src/ffmpeg /usr/local/src/ && \
+    ldconfig
+ENV IMAGEIO_FFMPEG_EXE=/usr/local/bin/ffmpeg
+
 # Pull /workspace_src (incl. ATTRIBUTION/LICENSE) from the transport stage and
 # wire up the launch screen in a single RUN — saves the standalone workspace COPY layer.
 RUN --mount=type=bind,from=workspace_files,source=/workspace_src,target=/tmp/workspace_src \
@@ -181,6 +198,7 @@ ENV DYNAMO_HOME=/workspace \
     HOME=/home/dynamo \
     VIRTUAL_ENV=/opt/dynamo/venv \
     PATH=/opt/dynamo/venv/bin:/usr/local/bin/etcd:${PATH} \
+    IMAGEIO_FFMPEG_EXE=/usr/local/bin/ffmpeg \
     LD_PRELOAD=/opt/dynamo/libstdc++.so.6:/usr/local/lib/python3.12/dist-packages/tensorrt_llm/libs/nixl/libnixl.so \
     NIXL_PLUGIN_DIR=/usr/local/lib/python3.12/dist-packages/tensorrt_llm/libs/nixl/plugins
 
diff --git a/container/templates/vllm_runtime.Dockerfile b/container/templates/vllm_runtime.Dockerfile
index c82186bf0162..65a39711f014 100644
--- a/container/templates/vllm_runtime.Dockerfile
+++ b/container/templates/vllm_runtime.Dockerfile
@@ -202,6 +202,18 @@ RUN --mount=type=bind,from=wheel_builder,source=/usr/local/,target=/tmp/usr/loca
     cp -r /tmp/usr/local/src/ffmpeg /usr/local/src/
 {% endif %}
 
+# Replace the upstream vllm/vllm-openai image's imageio-ffmpeg (which ships
+# a GPL-encumbered prebuilt ffmpeg binary) with a source install that leaves
+# no binary on disk. vLLM-Omni uses diffusers.export_to_video and doesn't
+# invoke imageio-ffmpeg, so no IMAGEIO_FFMPEG_EXE is needed — this is
+# purely to clear the GPL binary. The --no-binary directive lives in the
+# requirements file itself.
+RUN --mount=type=bind,source=./container/deps/requirements.vllm.txt,target=/tmp/requirements.vllm.txt \
+    --mount=type=cache,target=/root/.cache/uv,sharing=locked \
+    export UV_CACHE_DIR=/root/.cache/uv && \
+    uv pip install {{ pip_target }} --reinstall-package imageio-ffmpeg --no-deps \
+        --requirement /tmp/requirements.vllm.txt
+
 # Remove the vLLM source tree shipped in the base image to avoid pytest
 # collection conflicts (duplicate conftest plugin registration) and stale
 # tool scripts referencing files not present in Dynamo's build context.
diff --git a/container/templates/wheel_builder.Dockerfile b/container/templates/wheel_builder.Dockerfile
index 19e07cbf64f9..faa3865c045e 100644
--- a/container/templates/wheel_builder.Dockerfile
+++ b/container/templates/wheel_builder.Dockerfile
@@ -255,9 +255,16 @@ RUN if [ "$USE_SCCACHE" = "true" ]; then \
 ENV SCCACHE_BUCKET=${USE_SCCACHE:+${SCCACHE_BUCKET}} \
     SCCACHE_REGION=${USE_SCCACHE:+${SCCACHE_REGION}}
 
-# Always build FFmpeg so libs are available for Rust checks in CI
-# Do not delete the source tarball for legal reasons
+# Always build FFmpeg so libs are available for Rust checks in CI.
+# We also build the ffmpeg CLI with h264_nvenc + libvpx_vp9 encoders so Python
+# code can encode video without the GPL-licensed binary shipped by imageio-ffmpeg.
+# Stays LGPL-only: --disable-gpl --disable-nonfree are preserved; H.264 comes from
+# NVIDIA's NVENC (proprietary HW encoder, already a runtime dependency of these
+# GPU images) and VP9 from libvpx (BSD).
+# Do not delete the source tarball for legal reasons.
 ARG FFMPEG_VERSION
+ARG NV_CODEC_HEADERS_REF
+ARG LIBVPX_REF
 RUN --mount=type=secret,id=aws-web-identity-token,target=/run/secrets/aws-token \
     --mount=type=secret,id=aws-role-arn,env=AWS_ROLE_ARN \
     export AWS_WEB_IDENTITY_TOKEN_FILE=/run/secrets/aws-token && \
@@ -266,11 +273,26 @@ RUN --mount=type=secret,id=aws-web-identity-token,target=/run/secrets/aws-token
         eval $(/tmp/use-sccache.sh setup-env); \
     fi && \
     if [ "$DEVICE" = "xpu" ] || [ "$DEVICE" = "cpu" ]; then \
-    apt-get update -y && apt-get install -y build-essential pkg-config xz-utils; \
+    apt-get update -y && apt-get install -y build-essential pkg-config xz-utils git yasm; \
     apt-get clean && rm -rf /var/lib/apt/lists/*; \
     elif [ "$DEVICE" = "cuda" ]; then \
-    dnf install -y --setopt=tsflags=nocontexts pkg-config xz; \
+    dnf install -y --setopt=tsflags=nocontexts pkg-config xz git yasm; \
     fi && \
+    # nv-codec-headers: provides the NVENC/NVDEC API headers ffmpeg compiles against.
+    # Header-only, no runtime dep here; libcuda/libnvidia-encode are loaded at runtime
+    # in the consuming container.
+    cd /tmp && \
+    git clone --depth 1 --branch ${NV_CODEC_HEADERS_REF} https://github.com/FFmpeg/nv-codec-headers.git && \
+    make -C nv-codec-headers PREFIX=/usr/local install && \
+    # libvpx: BSD-licensed VP9 encoder needed for the WebM output path. Built from
+    # source so we don't need to track distro package names (libvpx-dev on Debian
+    # vs libvpx-devel via EPEL on RHEL/manylinux).
+    git clone --depth 1 --branch ${LIBVPX_REF} https://chromium.googlesource.com/webm/libvpx.git && \
+    cd libvpx && \
+    ./configure --prefix=/usr/local --enable-shared --disable-static --disable-examples --disable-unit-tests --disable-tools --disable-docs && \
+    make -j$(nproc) && \
+    make install && \
+    ldconfig && \
     cd /tmp && \
     curl --retry 5 --retry-delay 3 -LO https://ffmpeg.org/releases/ffmpeg-${FFMPEG_VERSION}.tar.xz && \
     tar xf ffmpeg-${FFMPEG_VERSION}.tar.xz && \
@@ -279,17 +301,21 @@ RUN --mount=type=secret,id=aws-web-identity-token,target=/run/secrets/aws-token
         --prefix=/usr/local \
         --disable-gpl \
         --disable-nonfree \
-        --disable-programs \
         --disable-doc \
         --disable-static \
         --disable-x86asm \
         --disable-network \
-        --disable-encoders \
-        --disable-muxers \
         --disable-bsfs \
         --disable-devices \
         --disable-libdrm \
-        --enable-shared && \
+        --enable-shared \
+        --enable-nvenc \
+        --enable-libvpx \
+        --disable-encoders \
+        --enable-encoder=h264_nvenc,libvpx_vp9 \
+        --disable-muxers \
+        --enable-muxer=mov,mp4,matroska,webm \
+        --enable-protocol=file,pipe && \
     make -j$(nproc) && \
     make install && \
     /tmp/use-sccache.sh show-stats "FFMPEG" && \
@@ -401,7 +427,7 @@ ENV PKG_CONFIG_PATH="/usr/local/libfabric/lib/pkgconfig:${PKG_CONFIG_PATH}"
 
 {% if framework == "vllm" and device == "cuda" %}
 # Build and install AWS SDK C++ (required for NIXL OBJ backend / S3 support)
-ARG AWS_SDK_CPP_VERSION=1.11.760
+ARG AWS_SDK_CPP_VERSION
 RUN --mount=type=secret,id=aws-web-identity-token,target=/run/secrets/aws-token \
     --mount=type=secret,id=aws-role-arn,env=AWS_ROLE_ARN \
     export AWS_WEB_IDENTITY_TOKEN_FILE=/run/secrets/aws-token && \
diff --git a/docs/backends/trtllm/trtllm-diffusion.md b/docs/backends/trtllm/trtllm-diffusion.md
index 360c9928d75e..18044ceba9ec 100644
--- a/docs/backends/trtllm/trtllm-diffusion.md
+++ b/docs/backends/trtllm/trtllm-diffusion.md
@@ -15,10 +15,12 @@ image generation through `--modality image_diffusion` flag.
 
 - **TensorRT-LLM with visual_gen**: The `visual_gen` module is part of TensorRT-LLM (`tensorrt_llm._torch.visual_gen`). Install TensorRT-LLM following the [official instructions](https://github.com/NVIDIA/TensorRT-LLM#installation).
 - **dynamo-runtime with multimodal API**: The Dynamo runtime must include `ModelType.Videos` or `ModelType.Images` support. Ensure you're using a compatible version.
-- **VIDEO diffusion: imageio with ffmpeg**: Required for encoding generated frames to MP4 video:
+- **VIDEO diffusion: imageio with ffmpeg**: Required for encoding generated frames to MP4 video. The Dynamo TRT-LLM runtime container ships an LGPL-only ffmpeg CLI built with the NVIDIA NVENC H.264 encoder (`h264_nvenc`) and `libvpx_vp9` for WebM, and points `imageio` at it via `IMAGEIO_FFMPEG_EXE=/usr/local/bin/ffmpeg` — the GPL-encumbered ffmpeg binary normally shipped inside the `imageio-ffmpeg` PyPI wheel is **not** installed. If you're running outside the container, install the Python wrapper without the bundled binary and point it at your own ffmpeg:
   ```bash
-  pip install imageio[ffmpeg]
+  pip install --no-binary imageio-ffmpeg "imageio[ffmpeg]"
+  export IMAGEIO_FFMPEG_EXE=/path/to/your/ffmpeg
   ```
+  MP4 output requires an NVIDIA GPU at runtime (`h264_nvenc` is a hardware encoder); WebM output uses the software `libvpx_vp9` encoder instead.
 
 ## Supported Models
 

From 2c480644baef440ffaf67f308166c43080756975 Mon Sep 17 00:00:00 2001
From: Harrison King Saturley-Hall <hsaturleyhal@nvidia.com>
Date: Sun, 31 May 2026 05:12:14 -0400
Subject: [PATCH 8/9] fix(omni): install git in vllm-runtime for git-based
 vllm-omni install

The vllm-runtime build failed at install_vllm_omni.sh with "Git executable
not found" because uv needs git to fetch the vllm-omni PR pin
(git+https://...@65b83d87), but the upstream vllm/vllm-openai runtime image
does not ship git. The released-wheel install never needed it.

Add git to the existing omni apt step, gated on VLLM_OMNI_GIT_URL via
${VLLM_OMNI_GIT_URL:+git} so the PyPI-wheel path (and the eventual revert)
keeps the runtime image lean.

Signed-off-by: Harrison King Saturley-Hall <hsaturleyhal@nvidia.com>
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 container/templates/vllm_runtime.Dockerfile | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/container/templates/vllm_runtime.Dockerfile b/container/templates/vllm_runtime.Dockerfile
index 65a39711f014..cd6322ed8ce0 100644
--- a/container/templates/vllm_runtime.Dockerfile
+++ b/container/templates/vllm_runtime.Dockerfile
@@ -164,13 +164,16 @@ RUN --mount=type=cache,target=/root/.cache/uv,sharing=locked \
     fi
 
 # vLLM-Omni's audio helpers shell out to SoX, and the launch script examples use
-# jq for readable curl output just like the upstream omni image does.
+# jq for readable curl output just like the upstream omni image does. git is only
+# pulled in when VLLM_OMNI_GIT_URL is set (installing vllm-omni from an unreleased
+# git ref) — the upstream vllm runtime image does not ship git.
 RUN set -eux; \
     apt-get update; \
     DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
         jq \
         sox \
-        libsox-fmt-all; \
+        libsox-fmt-all \
+        ${VLLM_OMNI_GIT_URL:+git}; \
     rm -rf /var/lib/apt/lists/*
 
 # Layer the released vLLM-Omni package matching the pinned upstream ref while

From 271214ef7366d73543247c17da432cc07d0b5526 Mon Sep 17 00:00:00 2001
From: Harrison King Saturley-Hall <hsaturleyhal@nvidia.com>
Date: Sun, 31 May 2026 08:41:03 -0400
Subject: [PATCH 9/9] test(omni): use real PIL images in output_formatter image
 tests

The diffusion image tests fed bare MagicMock() objects as images. Since
ebe677986b2 routed _prepare_images through normalize_image_frames(), a
non-PIL input takes the np.asarray(item).max() path; MagicMock.__iter__
defaults to empty, so np.asarray(MagicMock()) is a zero-size array and
arr.max() raises "zero-size array to reduction operation maximum". These
8 tests only began running in CI once the runtime image built successfully,
which is what exposed the failure.

Swap the MagicMock image doubles for real PIL images via a _make_pil_image()
helper, so they hit the isinstance(item, Image.Image) pass-through and
img.save(buf, format="PNG") produces real PNG bytes. Assertions unchanged.

Signed-off-by: Harrison King Saturley-Hall <hsaturleyhal@nvidia.com>
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 .../vllm/tests/omni/test_output_formatter.py  | 39 ++++++++-----------
 1 file changed, 17 insertions(+), 22 deletions(-)

diff --git a/components/src/dynamo/vllm/tests/omni/test_output_formatter.py b/components/src/dynamo/vllm/tests/omni/test_output_formatter.py
index db04855787c1..d45cb30f6518 100644
--- a/components/src/dynamo/vllm/tests/omni/test_output_formatter.py
+++ b/components/src/dynamo/vllm/tests/omni/test_output_formatter.py
@@ -115,36 +115,39 @@ def _make_diffusion_formatter():
     )
 
 
+def _make_pil_image(size=(4, 4)):
+    # Use a real PIL image: normalize_image_frames() passes PIL inputs through
+    # unchanged, whereas a MagicMock falls into the np.asarray(item).max() path
+    # and raises "zero-size array to reduction operation maximum".
+    from PIL import Image
+
+    return Image.new("RGB", size, (123, 222, 64))
+
+
 class TestDiffusionFormatterPrepareImages:
     @pytest.mark.asyncio
     async def test_b64_json(self):
         f = _make_diffusion_formatter()
-        img = MagicMock()
-        img.save = lambda b, format: b.write(b"fake_png_data")
-        results = await f._prepare_images([img], "req-1", "b64_json")
+        results = await f._prepare_images([_make_pil_image()], "req-1", "b64_json")
         assert len(results) == 1
         assert results[0].startswith("data:image/png;base64,")
 
     @pytest.mark.asyncio
     async def test_b64_default_when_none(self):
         f = _make_diffusion_formatter()
-        img = MagicMock()
-        img.save = lambda b, format: b.write(b"data")
-        results = await f._prepare_images([img], "req-1", None)
+        results = await f._prepare_images([_make_pil_image()], "req-1", None)
         assert results[0].startswith("data:image/png;base64,")
 
     @pytest.mark.asyncio
     async def test_invalid_format(self):
         f = _make_diffusion_formatter()
         with pytest.raises(ValueError, match="Invalid response format"):
-            await f._prepare_images([MagicMock()], "req-1", "invalid")
+            await f._prepare_images([_make_pil_image()], "req-1", "invalid")
 
     @pytest.mark.asyncio
     async def test_multiple_images(self):
         f = _make_diffusion_formatter()
-        imgs = [MagicMock() for _ in range(3)]
-        for img in imgs:
-            img.save = lambda b, format: b.write(b"px")
+        imgs = [_make_pil_image() for _ in range(3)]
         results = await f._prepare_images(imgs, "req-1", "b64_json")
         assert len(results) == 3
 
@@ -155,10 +158,8 @@ async def test_chat_completion_format(self):
         from dynamo.common.utils.output_modalities import RequestType
 
         f = _make_diffusion_formatter()
-        img = MagicMock()
-        img.save = lambda b, format: b.write(b"px")
         chunk = await f._encode_image(
-            [img], "req-1", request_type=RequestType.CHAT_COMPLETION
+            [_make_pil_image()], "req-1", request_type=RequestType.CHAT_COMPLETION
         )
         assert chunk["object"] == "chat.completion.chunk"
         assert chunk["choices"][0]["delta"]["content"][0]["type"] == "image_url"
@@ -168,10 +169,8 @@ async def test_image_generation_b64_format(self):
         from dynamo.common.utils.output_modalities import RequestType
 
         f = _make_diffusion_formatter()
-        img = MagicMock()
-        img.save = lambda b, format: b.write(b"px")
         chunk = await f._encode_image(
-            [img],
+            [_make_pil_image()],
             "req-1",
             response_format="b64_json",
             request_type=RequestType.IMAGE_GENERATION,
@@ -183,10 +182,8 @@ async def test_image_generation_default_format_returns_b64(self):
         from dynamo.common.utils.output_modalities import RequestType
 
         f = _make_diffusion_formatter()
-        img = MagicMock()
-        img.save = lambda b, format: b.write(b"px")
         chunk = await f._encode_image(
-            [img],
+            [_make_pil_image()],
             "req-1",
             response_format=None,
             request_type=RequestType.IMAGE_GENERATION,
@@ -381,9 +378,7 @@ async def test_routes_image(self):
         f = OutputFormatter(model_name="test-model")
         stage = MagicMock()
         stage.final_output_type = "image"
-        img = MagicMock()
-        img.save = lambda b, format: b.write(b"px")
-        stage.images = [img]
+        stage.images = [_make_pil_image()]
         chunk = await f.format(
             stage, "req-1", request_type=RequestType.CHAT_COMPLETION, **self._FULL_CTX
         )