From ebe677986b23a3619370de88645e63de15c100e9 Mon Sep 17 00:00:00 2001 From: ayushag Date: Fri, 29 May 2026 10:12:52 -0700 Subject: [PATCH 1/9] feat(omni): add Cosmos3 image generation support Signed-off-by: ayushag --- .../src/dynamo/common/utils/video_utils.py | 28 +++++++++++++++++++ components/src/dynamo/vllm/omni/args.py | 15 ++++++++++ .../src/dynamo/vllm/omni/base_handler.py | 6 ++++ .../src/dynamo/vllm/omni/output_formatter.py | 7 +++-- container/context.yaml | 7 ++++- container/deps/vllm/install_vllm_omni.sh | 14 ++++++++-- container/templates/args.Dockerfile | 1 + container/templates/vllm_runtime.Dockerfile | 1 + 8 files changed, 73 insertions(+), 6 deletions(-) diff --git a/components/src/dynamo/common/utils/video_utils.py b/components/src/dynamo/common/utils/video_utils.py index 37326d3280bc..cddd7655114d 100644 --- a/components/src/dynamo/common/utils/video_utils.py +++ b/components/src/dynamo/common/utils/video_utils.py @@ -82,6 +82,34 @@ def normalize_video_frames(images: list) -> list: return list(frames) +def normalize_image_frames(images: list) -> list: + """Normalize stage_output.images into a flat list of PIL Images. + + Image diffusion pipelines usually return PIL Images, but some (e.g. the + Cosmos3 native pipeline) return numpy arrays shaped ``[batch, frames, H, W, + C]`` even for single images. Collapse leading batch/frame dims and convert + each frame to a PIL Image; PIL inputs pass through unchanged. 
+ """ + from PIL import Image + + out: list = [] + for item in images: + if isinstance(item, Image.Image): + out.append(item) + continue + arr = np.asarray(item) + while arr.ndim > 4: # [batch, frames, H, W, C] -> [frames, H, W, C] + arr = arr[0] + if arr.dtype != np.uint8: # frames share a dtype/range; convert once + arr = ((arr.clip(0, 1) * 255).round() if arr.max() <= 1.0 else arr).astype( + np.uint8 + ) + frames = arr if arr.ndim == 4 else arr[None] # -> [N, H, W, C] + for frame in frames: + out.append(Image.fromarray(frame)) + return out + + def frames_to_numpy(images: list) -> np.ndarray: """Convert a list of PIL Images to a numpy array suitable for video encoding. diff --git a/components/src/dynamo/vllm/omni/args.py b/components/src/dynamo/vllm/omni/args.py index fd64cd5ec7f6..d0f5abb07066 100644 --- a/components/src/dynamo/vllm/omni/args.py +++ b/components/src/dynamo/vllm/omni/args.py @@ -187,6 +187,16 @@ def add_arguments(self, parser) -> None: default=False, help="Disable torch.compile and force eager execution for diffusion models.", ) + add_negatable_bool_argument( + g, + flag_name="--cosmos3-guardrails", + env_var="DYN_OMNI_COSMOS3_GUARDRAILS", + default=True, + help=( + "Enable Cosmos3 text/video safety guardrails (loads guardrail models " + "at startup). Use --no-cosmos3-guardrails to disable." + ), + ) # TTS parameters tts_g = parser.add_argument_group( @@ -333,6 +343,11 @@ class OmniConfig(DynamoRuntimeConfig): stage_configs_path: Optional[str] = None default_video_fps: int = 16 + # Cosmos3 safety guardrails. When False, routed into + # od_config.model_config["guardrails"]=False so the diffusion engine skips + # loading the guardrail models (see base_handler._build_omni_kwargs). 
+ cosmos3_guardrails: bool = True + # Nested structs — each group of fields has a clear destination diffusion: OmniDiffusionKwargs = dataclasses.field( default_factory=OmniDiffusionKwargs diff --git a/components/src/dynamo/vllm/omni/base_handler.py b/components/src/dynamo/vllm/omni/base_handler.py index 85f30a2b0297..bf4d98d7e15a 100644 --- a/components/src/dynamo/vllm/omni/base_handler.py +++ b/components/src/dynamo/vllm/omni/base_handler.py @@ -77,6 +77,12 @@ def _build_omni_kwargs(self, config) -> Dict[str, Any]: if config.stage_configs_path: omni_kwargs["stage_configs_path"] = config.stage_configs_path + # Cosmos3 guardrails toggle -> od_config.model_config["guardrails"]. + # Mirrors vllm-omni serve's --cosmos3-no-guardrails; when disabled the + # diffusion engine skips loading the guardrail models entirely. + if not config.cosmos3_guardrails: + omni_kwargs["model_config"] = {"guardrails": False} + for field, value in dataclasses.asdict(config.diffusion).items(): if value is not None: omni_kwargs[field] = value diff --git a/components/src/dynamo/vllm/omni/output_formatter.py b/components/src/dynamo/vllm/omni/output_formatter.py index 9816bd3f69a5..d425e8e4cff5 100644 --- a/components/src/dynamo/vllm/omni/output_formatter.py +++ b/components/src/dynamo/vllm/omni/output_formatter.py @@ -28,7 +28,10 @@ from dynamo.common.storage import upload_to_fs from dynamo.common.utils.engine_response import normalize_finish_reason from dynamo.common.utils.output_modalities import RequestType -from dynamo.common.utils.video_utils import normalize_video_frames +from dynamo.common.utils.video_utils import ( + normalize_image_frames, + normalize_video_frames, +) logger = logging.getLogger(__name__) @@ -241,7 +244,7 @@ async def _prepare_images( self, images: list, request_id: str, response_format: Optional[str] = None ) -> list: outlist = [] - for img in images: + for img in normalize_image_frames(images): buf = BytesIO() img.save(buf, format="PNG") image_bytes = buf.getvalue() 
diff --git a/container/context.yaml b/container/context.yaml index 0a3c1a777316..b76132838b5f 100644 --- a/container/context.yaml +++ b/container/context.yaml @@ -63,7 +63,12 @@ vllm: base_image_tag: 22.04 runtime_image_tag: v0.21.0 flashinf_ref: v0.6.8.post1 - vllm_omni_ref: "v0.21.0rc1" + # Cosmos3 support is not yet in a released vllm-omni; install from the + # canonical repo pinned to the PR head commit (vllm-project/vllm-omni#3454, + # == refs/pull/3454/head). When vllm_omni_git_url is set, install_vllm_omni.sh + # installs from git@ref; otherwise it falls back to "vllm-omni==". + vllm_omni_ref: "e826f626afb47c8c3c39ccf892ed247f442f6bd2" + vllm_omni_git_url: "https://github.com/vllm-project/vllm-omni.git" nixl_ref: v1.1.0 max_jobs: "10" enable_media_ffmpeg: "false" diff --git a/container/deps/vllm/install_vllm_omni.sh b/container/deps/vllm/install_vllm_omni.sh index 83b08875a621..c8dad9cba3cf 100755 --- a/container/deps/vllm/install_vllm_omni.sh +++ b/container/deps/vllm/install_vllm_omni.sh @@ -9,7 +9,15 @@ set -euo pipefail VLLM_OMNI_PROTECTED_PACKAGES_FILE="${VLLM_OMNI_PROTECTED_PACKAGES_FILE:-/tmp/vllm_omni_protected_packages.txt}" PROTECTED_CONSTRAINTS="$(mktemp /tmp/vllm-openai-protected.XXXXXX.txt)" -VLLM_OMNI_VERSION="${VLLM_OMNI_REF#v}" + +# When VLLM_OMNI_GIT_URL is set, install from a git ref (e.g. an unreleased PR +# commit); otherwise fall back to the matching PyPI release. 
+VLLM_OMNI_GIT_URL="${VLLM_OMNI_GIT_URL:-}" +if [ -n "${VLLM_OMNI_GIT_URL}" ]; then + VLLM_OMNI_SPEC="vllm-omni @ git+${VLLM_OMNI_GIT_URL}@${VLLM_OMNI_REF}" +else + VLLM_OMNI_SPEC="vllm-omni==${VLLM_OMNI_REF#v}" +fi cleanup() { rm -rf "${PROTECTED_CONSTRAINTS}" @@ -41,11 +49,11 @@ if [ "${VLLM_OMNI_TARGET_DEVICE}" = "cuda" ]; then uv pip install --system \ --prerelease=allow \ --constraints "${PROTECTED_CONSTRAINTS}" \ - "vllm-omni==${VLLM_OMNI_VERSION}" + "${VLLM_OMNI_SPEC}" else uv pip install \ --prerelease=allow \ --constraints "${PROTECTED_CONSTRAINTS}" \ - "vllm-omni==${VLLM_OMNI_VERSION}" + "${VLLM_OMNI_SPEC}" fi diff --git a/container/templates/args.Dockerfile b/container/templates/args.Dockerfile index 4e21b5ba1ea2..f5d7aaf14bcb 100644 --- a/container/templates/args.Dockerfile +++ b/container/templates/args.Dockerfile @@ -99,6 +99,7 @@ ARG MAX_JOBS={{ context.vllm.max_jobs }} ARG FLASHINF_REF={{ context.vllm.flashinf_ref }} {% endif %} ARG VLLM_OMNI_REF={{ context.vllm.vllm_omni_ref }} +ARG VLLM_OMNI_GIT_URL={{ context.vllm.vllm_omni_git_url }} {% if device == "cuda" -%} # If left blank, then we will fallback to vLLM defaults diff --git a/container/templates/vllm_runtime.Dockerfile b/container/templates/vllm_runtime.Dockerfile index 71cccc155aaa..c82186bf0162 100644 --- a/container/templates/vllm_runtime.Dockerfile +++ b/container/templates/vllm_runtime.Dockerfile @@ -19,6 +19,7 @@ ARG PYTHON_VERSION ARG ENABLE_KVBM ARG ENABLE_GPU_MEMORY_SERVICE ARG VLLM_OMNI_REF +ARG VLLM_OMNI_GIT_URL ARG NIXL_REF {% if device == "cuda" %} ARG CUDA_MAJOR From b9b9ca3b83925eb1b9550f28f8ca7ce2ac2774f4 Mon Sep 17 00:00:00 2001 From: ayushag Date: Fri, 29 May 2026 10:22:45 -0700 Subject: [PATCH 2/9] feat(examples): add Cosmos3 omni image/video launch scripts Signed-off-by: ayushag --- .../vllm/launch/agg_omni_cosmos3_image.sh | 63 +++++++++++++++++ .../vllm/launch/agg_omni_cosmos3_video.sh | 70 +++++++++++++++++++ 2 files changed, 133 insertions(+) create mode 100755 
examples/backends/vllm/launch/agg_omni_cosmos3_image.sh create mode 100755 examples/backends/vllm/launch/agg_omni_cosmos3_video.sh diff --git a/examples/backends/vllm/launch/agg_omni_cosmos3_image.sh b/examples/backends/vllm/launch/agg_omni_cosmos3_image.sh new file mode 100755 index 000000000000..afc6f4f1aa7e --- /dev/null +++ b/examples/backends/vllm/launch/agg_omni_cosmos3_image.sh @@ -0,0 +1,63 @@ +#!/bin/bash +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Aggregated Cosmos3 text-to-image generation (1 GPU). +# Uses the native vLLM-Omni Cosmos3 pipeline; --no-cosmos3-guardrails skips +# loading the safety guardrail models. A worker serves a single modality, so +# this script registers the "image" modality (see agg_omni_cosmos3_video.sh +# for text-to-video). + +set -e +trap 'echo Cleaning up...; kill 0' EXIT + +SCRIPT_DIR="$(dirname "$(readlink -f "$0")")" +source "$SCRIPT_DIR/../../../common/launch_utils.sh" + +MODEL="nvidia/Cosmos3-Nano" + +# Parse command line arguments +EXTRA_ARGS=() +while [[ $# -gt 0 ]]; do + case $1 in + --model) + MODEL="$2" + shift 2 + ;; + *) + EXTRA_ARGS+=("$1") + shift + ;; + esac +done + +HTTP_PORT="${DYN_HTTP_PORT:-8000}" +print_launch_banner --no-curl "Launching vLLM-Omni Cosmos3 Image Generation (1 GPU)" "$MODEL" "$HTTP_PORT" +print_curl_footer < Date: Fri, 29 May 2026 10:26:44 -0700 Subject: [PATCH 3/9] feat(examples): add Cosmos3 omni image-to-video launch script Signed-off-by: ayushag --- .../vllm/launch/agg_omni_cosmos3_i2v.sh | 72 +++++++++++++++++++ 1 file changed, 72 insertions(+) create mode 100755 examples/backends/vllm/launch/agg_omni_cosmos3_i2v.sh diff --git a/examples/backends/vllm/launch/agg_omni_cosmos3_i2v.sh b/examples/backends/vllm/launch/agg_omni_cosmos3_i2v.sh new file mode 100755 index 000000000000..bb37e58dd8b9 --- /dev/null +++ b/examples/backends/vllm/launch/agg_omni_cosmos3_i2v.sh @@ -0,0 +1,72 @@ 
+#!/bin/bash +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Aggregated Cosmos3 image-to-video generation (1 GPU). +# Same worker as text-to-video (registers the "video" modality); i2v is driven +# by adding "input_reference" to the /v1/videos request. The image loader +# rejects local file paths — pass a data: URI (base64) or an http(s) URL. +# --no-cosmos3-guardrails skips loading the safety guardrail models. + +set -e +trap 'echo Cleaning up...; kill 0' EXIT + +SCRIPT_DIR="$(dirname "$(readlink -f "$0")")" +source "$SCRIPT_DIR/../../../common/gpu_utils.sh" +source "$SCRIPT_DIR/../../../common/launch_utils.sh" + +MODEL="nvidia/Cosmos3-Nano" + +# Parse command line arguments +EXTRA_ARGS=() +while [[ $# -gt 0 ]]; do + case $1 in + --model) + MODEL="$2" + shift 2 + ;; + *) + EXTRA_ARGS+=("$1") + shift + ;; + esac +done + +HTTP_PORT="${DYN_HTTP_PORT:-8000}" +GPU_MEM_ARGS=$(build_vllm_gpu_mem_args) +print_launch_banner --no-curl "Launching vLLM-Omni Cosmos3 Image-to-Video (1 GPU)" "$MODEL" "$HTTP_PORT" +print_curl_footer < Date: Fri, 29 May 2026 13:29:34 -0700 Subject: [PATCH 4/9] chore(cosmos3): add docs and sample payloads; revert container git install Signed-off-by: ayushag --- container/context.yaml | 7 +- container/deps/vllm/install_vllm_omni.sh | 14 +- container/templates/args.Dockerfile | 1 - container/templates/vllm_runtime.Dockerfile | 1 - docs/backends/vllm/cosmos3.md | 163 ++++++++++++++++++ .../vllm/launch/agg_omni_cosmos3_i2v.sh | 15 +- .../vllm/launch/agg_omni_cosmos3_image.sh | 9 +- .../vllm/launch/agg_omni_cosmos3_video.sh | 12 +- .../backends/vllm/launch/cosmos3/i2v.json | 12 ++ .../backends/vllm/launch/cosmos3/t2i.json | 11 ++ .../backends/vllm/launch/cosmos3/t2v.json | 11 ++ 11 files changed, 209 insertions(+), 47 deletions(-) create mode 100644 docs/backends/vllm/cosmos3.md create mode 100644 examples/backends/vllm/launch/cosmos3/i2v.json create 
mode 100644 examples/backends/vllm/launch/cosmos3/t2i.json create mode 100644 examples/backends/vllm/launch/cosmos3/t2v.json diff --git a/container/context.yaml b/container/context.yaml index b76132838b5f..0a3c1a777316 100644 --- a/container/context.yaml +++ b/container/context.yaml @@ -63,12 +63,7 @@ vllm: base_image_tag: 22.04 runtime_image_tag: v0.21.0 flashinf_ref: v0.6.8.post1 - # Cosmos3 support is not yet in a released vllm-omni; install from the - # canonical repo pinned to the PR head commit (vllm-project/vllm-omni#3454, - # == refs/pull/3454/head). When vllm_omni_git_url is set, install_vllm_omni.sh - # installs from git@ref; otherwise it falls back to "vllm-omni==". - vllm_omni_ref: "e826f626afb47c8c3c39ccf892ed247f442f6bd2" - vllm_omni_git_url: "https://github.com/vllm-project/vllm-omni.git" + vllm_omni_ref: "v0.21.0rc1" nixl_ref: v1.1.0 max_jobs: "10" enable_media_ffmpeg: "false" diff --git a/container/deps/vllm/install_vllm_omni.sh b/container/deps/vllm/install_vllm_omni.sh index c8dad9cba3cf..83b08875a621 100755 --- a/container/deps/vllm/install_vllm_omni.sh +++ b/container/deps/vllm/install_vllm_omni.sh @@ -9,15 +9,7 @@ set -euo pipefail VLLM_OMNI_PROTECTED_PACKAGES_FILE="${VLLM_OMNI_PROTECTED_PACKAGES_FILE:-/tmp/vllm_omni_protected_packages.txt}" PROTECTED_CONSTRAINTS="$(mktemp /tmp/vllm-openai-protected.XXXXXX.txt)" - -# When VLLM_OMNI_GIT_URL is set, install from a git ref (e.g. an unreleased PR -# commit); otherwise fall back to the matching PyPI release. 
-VLLM_OMNI_GIT_URL="${VLLM_OMNI_GIT_URL:-}" -if [ -n "${VLLM_OMNI_GIT_URL}" ]; then - VLLM_OMNI_SPEC="vllm-omni @ git+${VLLM_OMNI_GIT_URL}@${VLLM_OMNI_REF}" -else - VLLM_OMNI_SPEC="vllm-omni==${VLLM_OMNI_REF#v}" -fi +VLLM_OMNI_VERSION="${VLLM_OMNI_REF#v}" cleanup() { rm -rf "${PROTECTED_CONSTRAINTS}" @@ -49,11 +41,11 @@ if [ "${VLLM_OMNI_TARGET_DEVICE}" = "cuda" ]; then uv pip install --system \ --prerelease=allow \ --constraints "${PROTECTED_CONSTRAINTS}" \ - "${VLLM_OMNI_SPEC}" + "vllm-omni==${VLLM_OMNI_VERSION}" else uv pip install \ --prerelease=allow \ --constraints "${PROTECTED_CONSTRAINTS}" \ - "${VLLM_OMNI_SPEC}" + "vllm-omni==${VLLM_OMNI_VERSION}" fi diff --git a/container/templates/args.Dockerfile b/container/templates/args.Dockerfile index f5d7aaf14bcb..4e21b5ba1ea2 100644 --- a/container/templates/args.Dockerfile +++ b/container/templates/args.Dockerfile @@ -99,7 +99,6 @@ ARG MAX_JOBS={{ context.vllm.max_jobs }} ARG FLASHINF_REF={{ context.vllm.flashinf_ref }} {% endif %} ARG VLLM_OMNI_REF={{ context.vllm.vllm_omni_ref }} -ARG VLLM_OMNI_GIT_URL={{ context.vllm.vllm_omni_git_url }} {% if device == "cuda" -%} # If left blank, then we will fallback to vLLM defaults diff --git a/container/templates/vllm_runtime.Dockerfile b/container/templates/vllm_runtime.Dockerfile index c82186bf0162..71cccc155aaa 100644 --- a/container/templates/vllm_runtime.Dockerfile +++ b/container/templates/vllm_runtime.Dockerfile @@ -19,7 +19,6 @@ ARG PYTHON_VERSION ARG ENABLE_KVBM ARG ENABLE_GPU_MEMORY_SERVICE ARG VLLM_OMNI_REF -ARG VLLM_OMNI_GIT_URL ARG NIXL_REF {% if device == "cuda" %} ARG CUDA_MAJOR diff --git a/docs/backends/vllm/cosmos3.md b/docs/backends/vllm/cosmos3.md new file mode 100644 index 000000000000..dc3a79278ea7 --- /dev/null +++ b/docs/backends/vllm/cosmos3.md @@ -0,0 +1,163 @@ +--- +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 +title: Cosmos3 +--- + +Run NVIDIA's **Cosmos3** omni model through Dynamo's +[vLLM-Omni backend](vllm-omni.md) for **text-to-image**, **text-to-video**, and +**image-to-video** generation. + +Cosmos3 is a unified world foundation model (WFM) for Physical AI, built on a +Mixture-of-Transformers (MoT) architecture. A single `Cosmos3OmniTransformer` +runs a Qwen-style "understanding" stream alongside a "generation" stream +joined by a 3D multimodal RoPE, replacing the separate Predict / Reason / +Transfer models from earlier Cosmos releases. See the +[Cosmos World Foundation Model Platform paper](https://huggingface.co/papers/2501.03575) +for the architectural background, and the +[diffusers Cosmos3 reference](https://huggingface.co/docs/diffusers/main/en/api/pipelines/cosmos3) for the underlying pipeline. + +Cosmos3 support in Dynamo is provided by the native vLLM-Omni pipeline added in +[vllm-project/vllm-omni#3454](https://github.com/vllm-project/vllm-omni/pull/3454). + +## Checkpoints + +Both checkpoints share the same `Cosmos3OmniPipeline` class and Dynamo flags; +swap the model identifier on the worker (`--model …`) and in request payloads. + +| Checkpoint | Description | HF Hub | +|------------|-------------|--------| +| `nvidia/Cosmos3-Nano` | Smaller, faster — default in this repo's launch scripts | [link](https://huggingface.co/nvidia/Cosmos3-Nano) | +| `nvidia/Cosmos3-Super` | Larger, higher quality | [link](https://huggingface.co/nvidia/Cosmos3-Super) | + +## Supported modalities + +| Task | Endpoint | `--output-modalities` | +|------|----------|-----------------------| +| Text-to-Image | `/v1/images/generations` | `image` | +| Text-to-Video | `/v1/videos` | `video` | +| Image-to-Video | `/v1/videos` (with `input_reference`) | `video` | + +## Prerequisites + +This guide builds on the [vLLM-Omni backend guide](vllm-omni.md) — see it for general setup, `etcd`/`nats`, and OpenAI-endpoint details. 
+ +### Installation + +This branch carries Dynamo code changes (the Cosmos3 worker flags and image +output handling) on top of a pinned vLLM-Omni, so run Dynamo **from source on +this branch** — a released `ai-dynamo` wheel will not include the integration. + +1. Clone and check out the branch: + + ```bash + git clone https://github.com/ai-dynamo/dynamo.git + cd dynamo + git checkout cosmos3-omni-integration + ``` + +2. Create a Python 3.12 environment: + + ```bash + uv venv --python 3.12 --seed + source .venv/bin/activate + ``` + +3. Build and install Dynamo from source (the branch's Cosmos3 code must be + live, and the Rust core `ai-dynamo-runtime` isn't published for this dev + version, so it has to be built locally). See + [Building from source](../../getting-started/building-from-source.md) for + prerequisites (Rust toolchain, system deps); the key steps from the repo root: + + ```bash + uv pip install pip maturin + (cd lib/bindings/python && maturin develop --uv) # builds ai-dynamo-runtime + uv pip install -e lib/gpu_memory_service + uv pip install -e ".[vllm]" # also pulls vllm==0.21.0 + ``` + +4. Install the Cosmos3-capable vLLM-Omni, pinned to the PR commit (its dynamic + `setup.py` pulls the matching pipeline deps — `diffusers==0.38`, `torchsde`, + `x-transformers`): + + ```bash + uv pip install "vllm-omni @ git+https://github.com/vllm-project/vllm-omni.git@e826f626afb47c8c3c39ccf892ed247f442f6bd2" + ``` + +5. 
Start etcd and NATS: + + ```bash + docker compose -f dev/docker-compose.yml up -d + ``` + +## Serve + +Quick start — each script launches the frontend on `:8000` plus a +single-modality worker and prints a sample request: + +```bash +examples/backends/vllm/launch/agg_omni_cosmos3_image.sh # text-to-image +examples/backends/vllm/launch/agg_omni_cosmos3_video.sh # text-to-video +examples/backends/vllm/launch/agg_omni_cosmos3_i2v.sh # image-to-video +``` + +Manual launch (use `--output-modalities video` for a video worker; `--no-cosmos3-guardrails` skips loading the safety guardrail models): + +```bash +python -m dynamo.frontend --http-port 8000 & + +python -m dynamo.vllm.omni \ + --model nvidia/Cosmos3-Nano \ + --output-modalities image \ + --no-cosmos3-guardrails \ + --media-output-fs-url file:///tmp/dynamo_media +``` + +Cosmos3-specific flags: + +| Flag | Purpose | +|------|---------| +| `--no-cosmos3-guardrails` | Disable the Cosmos3 text/video safety guardrails (otherwise loaded at startup). | +| `--flow-shift <value>` | Scheduler flow-shift (image default `3.0`). Launch-time only — not a per-request image parameter. | +| `--media-output-fs-url file://<path>` | Destination for media when `response_format: "url"`. | + +## Requests + +### Text-to-image + +Run from the repo root; `cosmos3/t2i.json` is the official Cosmos3 t2i payload +(prompt verbatim) mapped to the Dynamo request schema: + +```bash +curl -s -X POST http://localhost:8000/v1/images/generations \ + -H 'Content-Type: application/json' \ + --data-binary @examples/backends/vllm/launch/cosmos3/t2i.json \ + | jq -r '.data[0].b64_json' | base64 -d > out.png +``` + +- `size` must be one of `256x256`, `512x512`, `1024x1024`, `1792x1024`, + `1024x1792`, `1536x1024`, `1024x1536`, `auto` — the payload uses `1024x1024` + (the official `960x960` is not an allowed image size). +- Put `num_inference_steps`, `guidance_scale`, `seed`, and `negative_prompt` + under `nvext` — top-level values are ignored.
+ +### Text-to-video + +```bash +curl -s http://localhost:8000/v1/videos \ + -H 'Content-Type: application/json' \ + --data-binary @examples/backends/vllm/launch/cosmos3/t2v.json | jq +``` + +The official `t2v.json` payload is `1280x720`, `192` frames @ `24` fps (8s). + +### Image-to-video + +`i2v.json` adds `input_reference` (the official `vision_path` — an http URL; +local paths are rejected, use an http(s) URL or a `data:` base64 URI): + +```bash +curl -s http://localhost:8000/v1/videos \ + -H 'Content-Type: application/json' \ + --data-binary @examples/backends/vllm/launch/cosmos3/i2v.json | jq +``` diff --git a/examples/backends/vllm/launch/agg_omni_cosmos3_i2v.sh b/examples/backends/vllm/launch/agg_omni_cosmos3_i2v.sh index bb37e58dd8b9..0f8bd6877c05 100755 --- a/examples/backends/vllm/launch/agg_omni_cosmos3_i2v.sh +++ b/examples/backends/vllm/launch/agg_omni_cosmos3_i2v.sh @@ -36,20 +36,11 @@ HTTP_PORT="${DYN_HTTP_PORT:-8000}" GPU_MEM_ARGS=$(build_vllm_gpu_mem_args) print_launch_banner --no-curl "Launching vLLM-Omni Cosmos3 Image-to-Video (1 GPU)" "$MODEL" "$HTTP_PORT" print_curl_footer < t2i.png CURL diff --git a/examples/backends/vllm/launch/agg_omni_cosmos3_video.sh b/examples/backends/vllm/launch/agg_omni_cosmos3_video.sh index a6067354a91e..4681749fc81c 100755 --- a/examples/backends/vllm/launch/agg_omni_cosmos3_video.sh +++ b/examples/backends/vllm/launch/agg_omni_cosmos3_video.sh @@ -36,18 +36,10 @@ HTTP_PORT="${DYN_HTTP_PORT:-8000}" GPU_MEM_ARGS=$(build_vllm_gpu_mem_args) print_launch_banner --no-curl "Launching vLLM-Omni Cosmos3 Video Generation (1 GPU)" "$MODEL" "$HTTP_PORT" print_curl_footer < Date: Fri, 29 May 2026 13:46:24 -0700 Subject: [PATCH 5/9] test(omni): add Cosmos3 tests and refine guide Signed-off-by: ayushag --- .../dynamo/common/tests/test_video_utils.py | 64 +++++++++++++++++++ .../dynamo/vllm/tests/omni/test_omni_args.py | 14 ++++ .../vllm/tests/omni/test_omni_base_handler.py | 22 +++++++ docs/backends/vllm/cosmos3.md | 15 +++-- 4 
files changed, 108 insertions(+), 7 deletions(-) diff --git a/components/src/dynamo/common/tests/test_video_utils.py b/components/src/dynamo/common/tests/test_video_utils.py index fab867fb611c..6e134fd01ff2 100644 --- a/components/src/dynamo/common/tests/test_video_utils.py +++ b/components/src/dynamo/common/tests/test_video_utils.py @@ -154,3 +154,67 @@ def test_v2_api_fallback_writes_all_frames(self): assert writer.append_data.call_count == 4 writer.close.assert_called_once() + + +# --------------------------------------------------------------------------- +# normalize_image_frames +# --------------------------------------------------------------------------- + + +class TestNormalizeImageFrames: + """Tests for normalize_image_frames() — flattens DiffusionFormatter image + inputs to PIL. Image pipelines usually emit PIL Images; the Cosmos3 native + pipeline emits 5D numpy ``[B, F, H, W, C]``.""" + + def test_pil_inputs_returned_by_identity(self): + """PIL inputs must pass through without conversion or copy.""" + from PIL import Image + + from dynamo.common.utils.video_utils import normalize_image_frames + + a = Image.new("RGB", (4, 4), (255, 0, 0)) + b = Image.new("RGB", (4, 4), (0, 255, 0)) + out = normalize_image_frames([a, b]) + + assert len(out) == 2 + assert out[0] is a and out[1] is b + + def test_uint8_hwc_numpy_preserves_pixels(self): + from PIL import Image + + from dynamo.common.utils.video_utils import normalize_image_frames + + arr = np.full((4, 4, 3), 7, dtype=np.uint8) + out = normalize_image_frames([arr]) + + assert len(out) == 1 + assert isinstance(out[0], Image.Image) + assert out[0].size == (4, 4) # PIL is (W, H) + assert np.asarray(out[0])[0, 0].tolist() == [7, 7, 7] + + def test_cosmos3_5d_strips_batch_and_preserves_frame_order(self): + """[B, F, H, W, C] collapses to F PIL frames in order. 
Distinct + per-frame content guards against wrong-axis indexing regressions.""" + from dynamo.common.utils.video_utils import normalize_image_frames + + arr = np.zeros((1, 3, 4, 4, 3), dtype=np.uint8) + arr[0, 0] = 10 # frame 0 fill + arr[0, 1] = 20 # frame 1 fill + arr[0, 2] = 30 # frame 2 fill + + out = normalize_image_frames([arr]) + + assert len(out) == 3 + assert np.asarray(out[0])[0, 0, 0] == 10 + assert np.asarray(out[1])[0, 0, 0] == 20 + assert np.asarray(out[2])[0, 0, 0] == 30 + + def test_float_zero_to_one_scaled_to_uint8(self): + """float32 [0, 1] inputs must be rescaled to uint8 [0, 255].""" + from dynamo.common.utils.video_utils import normalize_image_frames + + arr = np.full((4, 4, 3), 0.5, dtype=np.float32) + out = normalize_image_frames([arr]) + + # 0.5 * 255 = 127.5; numpy's banker's rounding yields exactly 128. + assert np.asarray(out[0])[0, 0, 0] == 128 diff --git a/components/src/dynamo/vllm/tests/omni/test_omni_args.py b/components/src/dynamo/vllm/tests/omni/test_omni_args.py index 92380e489412..22b5213c41cb 100644 --- a/components/src/dynamo/vllm/tests/omni/test_omni_args.py +++ b/components/src/dynamo/vllm/tests/omni/test_omni_args.py @@ -75,6 +75,7 @@ def _make_omni_config(**overrides) -> OmniConfig: "tts_ref_audio_max_bytes": 50 * 1024 * 1024, "stage_id": None, "omni_router": False, + "cosmos3_guardrails": True, } flat_defaults.update(flat_overrides) @@ -191,3 +192,16 @@ def test_omni_config_imports_cleanly(): assert OmniConfig is not None assert callable(parse_omni_args) + + +# --- Cosmos3 guardrails --- + + +def test_omni_config_cosmos3_guardrails_default_enabled(): + assert OmniConfig.cosmos3_guardrails is True + + +def test_omni_config_cosmos3_guardrails_overridable(): + config = _make_omni_config(cosmos3_guardrails=False) + assert config.cosmos3_guardrails is False + config.validate() # disabling guardrails must not fail validation diff --git a/components/src/dynamo/vllm/tests/omni/test_omni_base_handler.py 
b/components/src/dynamo/vllm/tests/omni/test_omni_base_handler.py index f27dddda4c27..2ac2f3886000 100644 --- a/components/src/dynamo/vllm/tests/omni/test_omni_base_handler.py +++ b/components/src/dynamo/vllm/tests/omni/test_omni_base_handler.py @@ -101,3 +101,25 @@ def test_output_modalities_forwarded_to_async_omni(self): kwargs = _build_kwargs(config) assert kwargs["output_modalities"] == ["image"] + + +class TestCosmos3Guardrails: + """`cosmos3_guardrails=False` should route into omni_kwargs as + ``model_config={"guardrails": False}``; the default (True) leaves + model_config untouched so vllm-omni applies its own default.""" + + def test_disabled_routes_into_model_config(self): + config = _make_config() + config.cosmos3_guardrails = False + + kwargs = _build_kwargs(config) + + assert kwargs.get("model_config") == {"guardrails": False} + + def test_enabled_does_not_set_model_config(self): + config = _make_config() + config.cosmos3_guardrails = True + + kwargs = _build_kwargs(config) + + assert "model_config" not in kwargs diff --git a/docs/backends/vllm/cosmos3.md b/docs/backends/vllm/cosmos3.md index dc3a79278ea7..e57f3279d38c 100644 --- a/docs/backends/vllm/cosmos3.md +++ b/docs/backends/vllm/cosmos3.md @@ -8,14 +8,15 @@ Run NVIDIA's **Cosmos3** omni model through Dynamo's [vLLM-Omni backend](vllm-omni.md) for **text-to-image**, **text-to-video**, and **image-to-video** generation. -Cosmos3 is a unified world foundation model (WFM) for Physical AI, built on a -Mixture-of-Transformers (MoT) architecture. A single `Cosmos3OmniTransformer` -runs a Qwen-style "understanding" stream alongside a "generation" stream -joined by a 3D multimodal RoPE, replacing the separate Predict / Reason / -Transfer models from earlier Cosmos releases. See the +Cosmos3 is a unified world foundation model for Physical AI, built on a +Mixture-of-Transformers architecture. 
A single `Cosmos3OmniTransformer` runs +a Qwen-style "understanding" stream alongside a "generation" stream joined +by a 3D multimodal RoPE, replacing the separate Predict / Reason / Transfer +models from earlier Cosmos releases. See the [Cosmos World Foundation Model Platform paper](https://huggingface.co/papers/2501.03575) for the architectural background, and the -[diffusers Cosmos3 reference](https://huggingface.co/docs/diffusers/main/en/api/pipelines/cosmos3) for the underlying pipeline. +[diffusers Cosmos3 reference](https://huggingface.co/docs/diffusers/main/en/api/pipelines/cosmos3) +for the underlying pipeline. Cosmos3 support in Dynamo is provided by the native vLLM-Omni pipeline added in [vllm-project/vllm-omni#3454](https://github.com/vllm-project/vllm-omni/pull/3454). @@ -27,7 +28,7 @@ swap the model identifier on the worker (`--model …`) and in request payloads. | Checkpoint | Description | HF Hub | |------------|-------------|--------| -| `nvidia/Cosmos3-Nano` | Smaller, faster — default in this repo's launch scripts | [link](https://huggingface.co/nvidia/Cosmos3-Nano) | +| `nvidia/Cosmos3-Nano` | Smaller, faster — default in the Dynamo launch scripts below | [link](https://huggingface.co/nvidia/Cosmos3-Nano) | | `nvidia/Cosmos3-Super` | Larger, higher quality | [link](https://huggingface.co/nvidia/Cosmos3-Super) | ## Supported modalities From 001eacbe3d1edcac484910b5c678a24df5f7a73e Mon Sep 17 00:00:00 2001 From: Harrison King Saturley-Hall Date: Sun, 31 May 2026 04:09:41 -0400 Subject: [PATCH 6/9] feat(omni): install vllm-omni from PR #3454 for Cosmos3 support Cosmos3 pipelines are only in the unreleased vllm-omni PR vllm-project/vllm-omni#3454, not in any released wheel. Re-enable the git-install mechanism (reverted in 7744835c195) so the vllm-runtime container installs vllm-omni from the canonical repo pinned to the current PR head SHA (65b83d87, == refs/pull/3454/head). 
When vllm_omni_git_url is set, install_vllm_omni.sh installs "vllm-omni @ git+@"; otherwise it falls back to the released "vllm-omni==" wheel. Signed-off-by: Harrison King Saturley-Hall Co-Authored-By: Claude Opus 4.8 --- container/context.yaml | 10 +++++++++- container/deps/vllm/install_vllm_omni.sh | 14 +++++++++++--- container/templates/args.Dockerfile | 1 + container/templates/vllm_runtime.Dockerfile | 1 + 4 files changed, 22 insertions(+), 4 deletions(-) diff --git a/container/context.yaml b/container/context.yaml index 0a3c1a777316..b6f397da160e 100644 --- a/container/context.yaml +++ b/container/context.yaml @@ -63,7 +63,15 @@ vllm: base_image_tag: 22.04 runtime_image_tag: v0.21.0 flashinf_ref: v0.6.8.post1 - vllm_omni_ref: "v0.21.0rc1" + # Cosmos3 support is not yet in a released vllm-omni; install from the + # canonical repo pinned to the PR head commit (vllm-project/vllm-omni#3454, + # == refs/pull/3454/head). When vllm_omni_git_url is set, install_vllm_omni.sh + # installs from git@ref; otherwise it falls back to "vllm-omni==". + vllm_omni_ref: "65b83d87ad786aa786b248f0242e2ed1b4a8161f" + # If vllm_omni_git_url is defined, vllm-omni is NOT installed from PyPI; it is + # installed from the git commit SHA (or ref) defined in vllm_omni_ref above. + # Leave it unset/empty to install the released "vllm-omni==" wheel. 
+ vllm_omni_git_url: "https://github.com/vllm-project/vllm-omni.git" nixl_ref: v1.1.0 max_jobs: "10" enable_media_ffmpeg: "false" diff --git a/container/deps/vllm/install_vllm_omni.sh b/container/deps/vllm/install_vllm_omni.sh index 83b08875a621..c8dad9cba3cf 100755 --- a/container/deps/vllm/install_vllm_omni.sh +++ b/container/deps/vllm/install_vllm_omni.sh @@ -9,7 +9,15 @@ set -euo pipefail VLLM_OMNI_PROTECTED_PACKAGES_FILE="${VLLM_OMNI_PROTECTED_PACKAGES_FILE:-/tmp/vllm_omni_protected_packages.txt}" PROTECTED_CONSTRAINTS="$(mktemp /tmp/vllm-openai-protected.XXXXXX.txt)" -VLLM_OMNI_VERSION="${VLLM_OMNI_REF#v}" + +# When VLLM_OMNI_GIT_URL is set, install from a git ref (e.g. an unreleased PR +# commit); otherwise fall back to the matching PyPI release. +VLLM_OMNI_GIT_URL="${VLLM_OMNI_GIT_URL:-}" +if [ -n "${VLLM_OMNI_GIT_URL}" ]; then + VLLM_OMNI_SPEC="vllm-omni @ git+${VLLM_OMNI_GIT_URL}@${VLLM_OMNI_REF}" +else + VLLM_OMNI_SPEC="vllm-omni==${VLLM_OMNI_REF#v}" +fi cleanup() { rm -rf "${PROTECTED_CONSTRAINTS}" @@ -41,11 +49,11 @@ if [ "${VLLM_OMNI_TARGET_DEVICE}" = "cuda" ]; then uv pip install --system \ --prerelease=allow \ --constraints "${PROTECTED_CONSTRAINTS}" \ - "vllm-omni==${VLLM_OMNI_VERSION}" + "${VLLM_OMNI_SPEC}" else uv pip install \ --prerelease=allow \ --constraints "${PROTECTED_CONSTRAINTS}" \ - "vllm-omni==${VLLM_OMNI_VERSION}" + "${VLLM_OMNI_SPEC}" fi diff --git a/container/templates/args.Dockerfile b/container/templates/args.Dockerfile index 4e21b5ba1ea2..f5d7aaf14bcb 100644 --- a/container/templates/args.Dockerfile +++ b/container/templates/args.Dockerfile @@ -99,6 +99,7 @@ ARG MAX_JOBS={{ context.vllm.max_jobs }} ARG FLASHINF_REF={{ context.vllm.flashinf_ref }} {% endif %} ARG VLLM_OMNI_REF={{ context.vllm.vllm_omni_ref }} +ARG VLLM_OMNI_GIT_URL={{ context.vllm.vllm_omni_git_url }} {% if device == "cuda" -%} # If left blank, then we will fallback to vLLM defaults diff --git a/container/templates/vllm_runtime.Dockerfile 
b/container/templates/vllm_runtime.Dockerfile index 71cccc155aaa..c82186bf0162 100644 --- a/container/templates/vllm_runtime.Dockerfile +++ b/container/templates/vllm_runtime.Dockerfile @@ -19,6 +19,7 @@ ARG PYTHON_VERSION ARG ENABLE_KVBM ARG ENABLE_GPU_MEMORY_SERVICE ARG VLLM_OMNI_REF +ARG VLLM_OMNI_GIT_URL ARG NIXL_REF {% if device == "cuda" %} ARG CUDA_MAJOR From 22d56b960660cc89480ea8de9489adda4ddd1112 Mon Sep 17 00:00:00 2001 From: Harrison Saturley-Hall Date: Fri, 29 May 2026 16:01:38 -0400 Subject: [PATCH 7/9] chore(container): build in-tree ffmpeg CLI and route imageio through it (#10091) Co-authored-by: Claude Opus 4.7 (cherry picked from commit dc2f3521455feed94c052d20612c8810fb6d14be) --- .../dynamo/common/tests/test_video_utils.py | 4 +- .../src/dynamo/common/utils/video_utils.py | 12 ++--- .../video_generation_handler.py | 2 +- container/context.yaml | 6 +++ container/deps/requirements.common.txt | 9 +++- container/deps/requirements.sglang.txt | 12 +++++ container/deps/requirements.trtllm.txt | 7 ++- container/deps/requirements.vllm.txt | 14 ++++++ container/templates/args.Dockerfile | 6 +++ container/templates/dynamo_base.Dockerfile | 2 +- container/templates/dynamo_runtime.Dockerfile | 8 +++- container/templates/sglang_runtime.Dockerfile | 30 ++++++++++--- container/templates/trtllm_runtime.Dockerfile | 18 ++++++++ container/templates/vllm_runtime.Dockerfile | 12 +++++ container/templates/wheel_builder.Dockerfile | 44 +++++++++++++++---- docs/backends/trtllm/trtllm-diffusion.md | 6 ++- 16 files changed, 163 insertions(+), 29 deletions(-) create mode 100644 container/deps/requirements.sglang.txt create mode 100644 container/deps/requirements.vllm.txt diff --git a/components/src/dynamo/common/tests/test_video_utils.py b/components/src/dynamo/common/tests/test_video_utils.py index 6e134fd01ff2..3b240710082f 100644 --- a/components/src/dynamo/common/tests/test_video_utils.py +++ b/components/src/dynamo/common/tests/test_video_utils.py @@ -41,7 +41,7 @@ 
def _mock_iio_v2(self): iio.get_writer = MagicMock(return_value=writer) return iio, writer - def test_mp4_selects_libx264_codec(self): + def test_mp4_selects_h264_nvenc_codec(self): from dynamo.common.utils.video_utils import encode_to_video_bytes iio = self._mock_iio_v3() @@ -56,7 +56,7 @@ def test_mp4_selects_libx264_codec(self): iio.imwrite.assert_called_once() _, kwargs = iio.imwrite.call_args - assert kwargs.get("codec") == "libx264" + assert kwargs.get("codec") == "h264_nvenc" assert kwargs.get("fps") == 8 def test_webm_selects_libvpx_vp9_codec(self): diff --git a/components/src/dynamo/common/utils/video_utils.py b/components/src/dynamo/common/utils/video_utils.py index cddd7655114d..d6ce6dd826bb 100644 --- a/components/src/dynamo/common/utils/video_utils.py +++ b/components/src/dynamo/common/utils/video_utils.py @@ -182,13 +182,15 @@ def encode_to_mp4( logger.info(f"Encoding {len(frames)} frames to {output_path} at {fps} fps") try: - # Use imageio to write MP4 - # imageio.v3 API + # Use imageio to write MP4. We use h264_nvenc (NVIDIA HW encoder) instead + # of libx264 because the in-tree ffmpeg build is LGPL-only and libx264 + # is GPL-licensed; see container/templates/wheel_builder.Dockerfile. + # Requires a CUDA-capable GPU at runtime. 
if hasattr(iio, "imwrite"): - iio.imwrite(output_path, frames, fps=fps, codec="libx264") + iio.imwrite(output_path, frames, fps=fps, codec="h264_nvenc") else: # Fall back to v2 API - writer = iio.get_writer(output_path, fps=fps, codec="libx264") # type: ignore[attr-defined] + writer = iio.get_writer(output_path, fps=fps, codec="h264_nvenc") # type: ignore[attr-defined] try: for frame in frames: writer.append_data(frame) @@ -243,7 +245,7 @@ def encode_to_video_bytes( if output_format == "webm": kwargs["codec"] = "libvpx-vp9" elif output_format == "mp4": - kwargs["codec"] = "libx264" + kwargs["codec"] = "h264_nvenc" else: raise ValueError(f"No codec specified for response format: {output_format}") diff --git a/components/src/dynamo/sglang/request_handlers/video_generation/video_generation_handler.py b/components/src/dynamo/sglang/request_handlers/video_generation/video_generation_handler.py index 0a2d516c4de2..8ad5c6044b4c 100644 --- a/components/src/dynamo/sglang/request_handlers/video_generation/video_generation_handler.py +++ b/components/src/dynamo/sglang/request_handlers/video_generation/video_generation_handler.py @@ -259,7 +259,7 @@ async def _generate_video( return video_bytes async def _frames_to_video( - self, frames: list, fps: int, codec: str = "libx264" + self, frames: list, fps: int, codec: str = "h264_nvenc" ) -> bytes: """Convert list of frames to video bytes. diff --git a/container/context.yaml b/container/context.yaml index b6f397da160e..4fee22a94c32 100644 --- a/container/context.yaml +++ b/container/context.yaml @@ -39,6 +39,10 @@ dynamo: enable_media_ffmpeg: "false" enable_gpu_memory_service: "true" ffmpeg_version: "8.1" + # ffmpeg build inputs (only consumed when ENABLE_MEDIA_FFMPEG=true). 
+ nv_codec_headers_ref: "n13.0.19.0" + libvpx_ref: "v1.14.1" + sccache_version: "v0.14.0" efa_version: 1.47.0 vllm: @@ -79,6 +83,8 @@ vllm: enable_kvbm: "true" enable_modelexpress_p2p: "false" modelexpress_ref: "76fc5d7f06c37121ee8789a29fac6f9b08c4743a" # v0.3.0 + # aws-sdk-cpp tag for the NIXL OBJ / S3 backend (built in wheel_builder). + aws_sdk_cpp_version: "1.11.760" sglang: cuda12.9: diff --git a/container/deps/requirements.common.txt b/container/deps/requirements.common.txt index 7265f7cd5401..d04cb50bc344 100644 --- a/container/deps/requirements.common.txt +++ b/container/deps/requirements.common.txt @@ -4,6 +4,13 @@ # Core runtime dependencies shared by ALL Dynamo containers. # See README.md in this directory for version pinning strategy. +# Force a source install of imageio-ffmpeg (pure-Python wrapper). The PyPI wheel +# bundles a prebuilt, GPL-encumbered ffmpeg binary in <site-packages>/imageio_ffmpeg/binaries/ +# that has CVE exposure; we point imageio at the in-tree LGPL ffmpeg CLI via +# IMAGEIO_FFMPEG_EXE instead. This directive is honored by pip and uv when this +# file is passed via --requirement, and applies to the whole install. +--no-binary imageio-ffmpeg + aiohttp>=3.9.0,<4.0 fastapi==0.120.1 grpcio-tools<=1.76.0 # May have platform-specific builds; pins grpcio ecosystem version @@ -11,7 +18,7 @@ httpx==0.28.1 # Video generation: encode frames to MP4 (used by TRT-LLM, vLLM-Omni, SGLang diffusion) imageio>=2.37.0 -imageio-ffmpeg>=0.6.0 +imageio-ffmpeg>=0.6.0 # binary skipped per --no-binary directive at top of file # Shared plotting utility used by runtime diagnostics and benchmark tooling. matplotlib==3.10.7 msgspec==0.19.0 diff --git a/container/deps/requirements.sglang.txt b/container/deps/requirements.sglang.txt new file mode 100644 index 000000000000..bc2e5679d398 --- /dev/null +++ b/container/deps/requirements.sglang.txt @@ -0,0 +1,12 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0 +# +# Third-party Python dependencies for the sglang runtime image. Installed +# with --force-reinstall --no-deps to replace the upstream lmsysorg/sglang +# base image's imageio-ffmpeg wheel (which ships a GPL-encumbered prebuilt +# ffmpeg binary) with a source build that leaves no binary on disk. +# IMAGEIO_FFMPEG_EXE points imageio at the in-tree LGPL ffmpeg CLI. + +--no-binary imageio-ffmpeg + +imageio-ffmpeg>=0.6.0 # binary skipped per --no-binary directive at top of file diff --git a/container/deps/requirements.trtllm.txt b/container/deps/requirements.trtllm.txt index 4f93903e2157..3952a6deec70 100644 --- a/container/deps/requirements.trtllm.txt +++ b/container/deps/requirements.trtllm.txt @@ -5,10 +5,15 @@ # with --no-deps so upstream nvcr.io/nvidia/tensorrt-llm/release's solve # stays intact. Sorted alphabetically per pre-commit requirements-txt-fixer. +# Force a source install of imageio-ffmpeg. The PyPI wheel bundles a prebuilt, +# GPL-encumbered ffmpeg binary that has CVE exposure; we point imageio at the +# in-tree LGPL ffmpeg CLI via IMAGEIO_FFMPEG_EXE instead. +--no-binary imageio-ffmpeg + # Used by the trtllm video_diffusion handler to encode generated frames to MP4. # Upstream tensorrt-llm/release does not ship them. imageio>=2.37.0 -imageio-ffmpeg>=0.6.0 +imageio-ffmpeg>=0.6.0 # binary skipped per --no-binary directive at top of file # Required by ai_dynamo_runtime + gpu_memory_service. Upstream tensorrt-llm/release # does not ship them; vllm/vllm-openai does (which is why DYN-2204's vllm path # does not need this). diff --git a/container/deps/requirements.vllm.txt b/container/deps/requirements.vllm.txt new file mode 100644 index 000000000000..5e596820f44d --- /dev/null +++ b/container/deps/requirements.vllm.txt @@ -0,0 +1,14 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 +# +# Third-party Python dependencies for the vllm runtime image. Installed +# with --reinstall-package imageio-ffmpeg --no-deps to replace the upstream +# vllm/vllm-openai base image's imageio-ffmpeg wheel (which ships a +# GPL-encumbered prebuilt ffmpeg binary) with a source build that leaves +# no binary on disk. vLLM-Omni uses diffusers.export_to_video and doesn't +# invoke imageio-ffmpeg, so no IMAGEIO_FFMPEG_EXE is needed — this is +# purely to clear the GPL binary. + +--no-binary imageio-ffmpeg + +imageio-ffmpeg>=0.6.0 # binary skipped per --no-binary directive at top of file diff --git a/container/templates/args.Dockerfile b/container/templates/args.Dockerfile index f5d7aaf14bcb..999478cf8d4c 100644 --- a/container/templates/args.Dockerfile +++ b/container/templates/args.Dockerfile @@ -55,12 +55,15 @@ ARG ETCD_VERSION={{ context.dynamo.etcd_version }} ARG ENABLE_MEDIA_FFMPEG={{ context[framework].enable_media_ffmpeg }} ARG FFMPEG_VERSION={{ context.dynamo.ffmpeg_version }} +ARG NV_CODEC_HEADERS_REF={{ context.dynamo.nv_codec_headers_ref }} +ARG LIBVPX_REF={{ context.dynamo.libvpx_ref }} {% if device == "cuda" -%} ARG ENABLE_GPU_MEMORY_SERVICE={{ context[framework].enable_gpu_memory_service }} {% endif %} # SCCACHE configuration ARG USE_SCCACHE +ARG SCCACHE_VERSION={{ context.dynamo.sccache_version }} ARG SCCACHE_BUCKET="" ARG SCCACHE_REGION="" @@ -108,6 +111,9 @@ ARG DEEPGEMM_REF="" # ModelExpress for P2P weight transfer (optional) ARG ENABLE_MODELEXPRESS_P2P={{ context.vllm.enable_modelexpress_p2p }} ARG MODELEXPRESS_REF={{ context.vllm.modelexpress_ref }} + +# aws-sdk-cpp tag for the NIXL OBJ / S3 backend (built in wheel_builder). 
+ARG AWS_SDK_CPP_VERSION={{ context.vllm.aws_sdk_cpp_version }} {% endif %} {%- endif -%} diff --git a/container/templates/dynamo_base.Dockerfile b/container/templates/dynamo_base.Dockerfile index dca5b95b32f8..f38d61cdda35 100644 --- a/container/templates/dynamo_base.Dockerfile +++ b/container/templates/dynamo_base.Dockerfile @@ -22,7 +22,7 @@ RUN apt clean && apt-get update -y && \ # Install sccache into the base image so downstream stages can COPY it # instead of downloading from GitHub (avoids 502 errors under parallel builds) -ARG SCCACHE_VERSION=v0.14.0 +ARG SCCACHE_VERSION RUN ARCH_ALT=$([ "${TARGETARCH}" = "amd64" ] && echo "x86_64" || echo "aarch64") && \ wget --tries=3 --waitretry=5 \ "https://github.com/mozilla/sccache/releases/download/${SCCACHE_VERSION}/sccache-${SCCACHE_VERSION}-${ARCH_ALT}-unknown-linux-musl.tar.gz" && \ diff --git a/container/templates/dynamo_runtime.Dockerfile b/container/templates/dynamo_runtime.Dockerfile index eb2863c3213b..ef3cb0f74c30 100644 --- a/container/templates/dynamo_runtime.Dockerfile +++ b/container/templates/dynamo_runtime.Dockerfile @@ -42,13 +42,17 @@ COPY --chown=dynamo: --from=wheel_builder ${NIXL_PREFIX}/ ${NIXL_PREFIX}/ COPY --chown=dynamo: --from=wheel_builder /opt/dynamo/dist/nixl/ /opt/dynamo/wheelhouse/nixl/ COPY --chown=dynamo: --from=wheel_builder /workspace/nixl/build/src/bindings/python/nixl-meta/nixl-*.whl /opt/dynamo/wheelhouse/nixl/ -# Always copy FFmpeg so libs are available for Rust checks in CI +# Always copy FFmpeg so libs are available for Rust checks in CI. +# libvpx.so* is included because the in-tree ffmpeg is built with --enable-libvpx, +# so libavcodec.so has a runtime dependency on libvpx.so.9. 
RUN --mount=type=bind,from=wheel_builder,source=/usr/local/,target=/tmp/usr/local/ \ mkdir -p /usr/local/lib/pkgconfig && \ cp -rnL /tmp/usr/local/include/libav* /tmp/usr/local/include/libsw* /usr/local/include/ && \ cp -nL /tmp/usr/local/lib/libav*.so /tmp/usr/local/lib/libsw*.so /usr/local/lib/ && \ + cp -nL /tmp/usr/local/lib/lib*vpx*.so* /usr/local/lib/ 2>/dev/null || true && \ cp -nL /tmp/usr/local/lib/pkgconfig/libav*.pc /tmp/usr/local/lib/pkgconfig/libsw*.pc /usr/local/lib/pkgconfig/ && \ - cp -r /tmp/usr/local/src/ffmpeg /usr/local/src/ + cp -r /tmp/usr/local/src/ffmpeg /usr/local/src/ && \ + ldconfig {% if target not in ("dev", "local-dev") %} # Copy built artifacts (not needed for dev/local-dev; users build from source) diff --git a/container/templates/sglang_runtime.Dockerfile b/container/templates/sglang_runtime.Dockerfile index 0ee140c4ec72..b138daac91d7 100644 --- a/container/templates/sglang_runtime.Dockerfile +++ b/container/templates/sglang_runtime.Dockerfile @@ -29,15 +29,24 @@ RUN userdel -r ubuntu > /dev/null 2>&1 || true \ # NOTE: Setting ENV UMASK=002 does NOT work - umask is a shell builtin, not an environment variable && mkdir -p /etc/profile.d && echo 'umask 002' > /etc/profile.d/00-umask.sh -{% if context.sglang.enable_media_ffmpeg == "true" %} -# Copy ffmpeg +# Copy ffmpeg from wheel_builder: versioned shared libs (libav*.so*, +# libsw*.so*) for the Rust media-ffmpeg decoder, plus the LGPL CLI binary +# (built with h264_nvenc + libvpx_vp9 encoders) that imageio targets via +# IMAGEIO_FFMPEG_EXE for video encoding. Ungated by enable_media_ffmpeg +# because the upstream lmsysorg/sglang base image always ships +# imageio-ffmpeg with a GPL-encumbered prebuilt binary that we replace +# unconditionally below; the LGPL CLI must be present so imageio has +# something to target. 
RUN --mount=type=bind,from=wheel_builder,source=/usr/local/,target=/tmp/usr/local/ \ mkdir -p /usr/local/lib/pkgconfig && \ cp -rnL /tmp/usr/local/include/libav* /tmp/usr/local/include/libsw* /usr/local/include/ && \ - cp -nL /tmp/usr/local/lib/libav*.so /tmp/usr/local/lib/libsw*.so /usr/local/lib/ && \ + cp -nL /tmp/usr/local/lib/libav*.so* /tmp/usr/local/lib/libsw*.so* /usr/local/lib/ && \ + cp -nL /tmp/usr/local/lib/lib*vpx*.so* /usr/local/lib/ 2>/dev/null || true && \ cp -nL /tmp/usr/local/lib/pkgconfig/libav*.pc /tmp/usr/local/lib/pkgconfig/libsw*.pc /usr/local/lib/pkgconfig/ && \ - cp -r /tmp/usr/local/src/ffmpeg /usr/local/src/ -{% endif %} + cp -nL /tmp/usr/local/bin/ffmpeg /usr/local/bin/ffmpeg && \ + cp -r /tmp/usr/local/src/ffmpeg /usr/local/src/ && \ + ldconfig +ENV IMAGEIO_FFMPEG_EXE=/usr/local/bin/ffmpeg {% if target not in ("dev", "local-dev") %} # Runtime target installs only the prebuilt Dynamo wheels. SGLang and its NIXL @@ -86,6 +95,17 @@ RUN --mount=type=bind,source=./container/deps/requirements.common.txt,target=/tm export PIP_CACHE_DIR=/root/.cache/pip && \ pip install --break-system-packages --no-deps $(grep -E '^nvtx==' /tmp/requirements.common.txt) +# Replace the upstream lmsysorg/sglang image's imageio-ffmpeg (which ships a +# GPL-encumbered prebuilt ffmpeg binary in <site-packages>/imageio_ffmpeg/binaries/) +# with a source install that leaves no binary on disk. IMAGEIO_FFMPEG_EXE points +# imageio at the LGPL CLI we copied from wheel_builder above. The --no-binary +# directive lives in the requirements file itself.
+RUN --mount=type=bind,source=./container/deps/requirements.sglang.txt,target=/tmp/requirements.sglang.txt \ + --mount=type=cache,target=/root/.cache/pip,sharing=locked \ + export PIP_CACHE_DIR=/root/.cache/pip && \ + pip install --break-system-packages --force-reinstall --no-deps \ + --requirement /tmp/requirements.sglang.txt + # Copy tests, deploy and components for CI with correct ownership COPY --chmod=775 --chown=dynamo:0 tests /workspace/tests COPY --chmod=775 --chown=dynamo:0 examples /workspace/examples diff --git a/container/templates/trtllm_runtime.Dockerfile b/container/templates/trtllm_runtime.Dockerfile index 5285fa2dbf21..617a95287a91 100644 --- a/container/templates/trtllm_runtime.Dockerfile +++ b/container/templates/trtllm_runtime.Dockerfile @@ -127,6 +127,9 @@ RUN --mount=type=cache,target=/root/.cache/uv,sharing=locked \ \ # Third-party deps Dynamo wheels declare but upstream lacks, plus the # huggingface-hub pin and KVBM-matching nixl-cu13. See the file for context. + # The requirements.trtllm.txt file itself carries a `--no-binary imageio-ffmpeg` + # directive that keeps the GPL-encumbered prebuilt ffmpeg off disk; IMAGEIO_FFMPEG_EXE + # below points imageio at the in-tree LGPL CLI. uv pip install --no-deps --requirement /tmp/requirements.trtllm.txt && \ \ if [ "${ENABLE_KVBM}" = "true" ]; then \ @@ -143,6 +146,20 @@ RUN --mount=type=cache,target=/root/.cache/uv,sharing=locked \ fi {% endif %} +# Copy the in-tree LGPL ffmpeg from wheel_builder. The TRT-LLM diffusion handler +# always encodes video (video_handler.py:263 → encode_to_video_bytes), so the +# CLI and its libav* / libvpx runtime libs need to be present in this image and +# imageio must be pointed at it via IMAGEIO_FFMPEG_EXE. Ungated by +# enable_media_ffmpeg because TRT-LLM unconditionally needs the encoder. 
+RUN --mount=type=bind,from=wheel_builder,source=/usr/local/,target=/tmp/usr/local/ \ + cp -nL /tmp/usr/local/lib/libav*.so* /usr/local/lib/ 2>/dev/null || true && \ + cp -nL /tmp/usr/local/lib/libsw*.so* /usr/local/lib/ 2>/dev/null || true && \ + cp -nL /tmp/usr/local/lib/lib*vpx*.so* /usr/local/lib/ 2>/dev/null || true && \ + cp -nL /tmp/usr/local/bin/ffmpeg /usr/local/bin/ffmpeg && \ + cp -r /tmp/usr/local/src/ffmpeg /usr/local/src/ && \ + ldconfig +ENV IMAGEIO_FFMPEG_EXE=/usr/local/bin/ffmpeg + # Pull /workspace_src (incl. ATTRIBUTION/LICENSE) from the transport stage and # wire up the launch screen in a single RUN — saves the standalone workspace COPY layer. RUN --mount=type=bind,from=workspace_files,source=/workspace_src,target=/tmp/workspace_src \ @@ -181,6 +198,7 @@ ENV DYNAMO_HOME=/workspace \ HOME=/home/dynamo \ VIRTUAL_ENV=/opt/dynamo/venv \ PATH=/opt/dynamo/venv/bin:/usr/local/bin/etcd:${PATH} \ + IMAGEIO_FFMPEG_EXE=/usr/local/bin/ffmpeg \ LD_PRELOAD=/opt/dynamo/libstdc++.so.6:/usr/local/lib/python3.12/dist-packages/tensorrt_llm/libs/nixl/libnixl.so \ NIXL_PLUGIN_DIR=/usr/local/lib/python3.12/dist-packages/tensorrt_llm/libs/nixl/plugins diff --git a/container/templates/vllm_runtime.Dockerfile b/container/templates/vllm_runtime.Dockerfile index c82186bf0162..65a39711f014 100644 --- a/container/templates/vllm_runtime.Dockerfile +++ b/container/templates/vllm_runtime.Dockerfile @@ -202,6 +202,18 @@ RUN --mount=type=bind,from=wheel_builder,source=/usr/local/,target=/tmp/usr/loca cp -r /tmp/usr/local/src/ffmpeg /usr/local/src/ {% endif %} +# Replace the upstream vllm/vllm-openai image's imageio-ffmpeg (which ships +# a GPL-encumbered prebuilt ffmpeg binary) with a source install that leaves +# no binary on disk. vLLM-Omni uses diffusers.export_to_video and doesn't +# invoke imageio-ffmpeg, so no IMAGEIO_FFMPEG_EXE is needed — this is +# purely to clear the GPL binary. The --no-binary directive lives in the +# requirements file itself. 
+RUN --mount=type=bind,source=./container/deps/requirements.vllm.txt,target=/tmp/requirements.vllm.txt \ + --mount=type=cache,target=/root/.cache/uv,sharing=locked \ + export UV_CACHE_DIR=/root/.cache/uv && \ + uv pip install {{ pip_target }} --reinstall-package imageio-ffmpeg --no-deps \ + --requirement /tmp/requirements.vllm.txt + # Remove the vLLM source tree shipped in the base image to avoid pytest # collection conflicts (duplicate conftest plugin registration) and stale # tool scripts referencing files not present in Dynamo's build context. diff --git a/container/templates/wheel_builder.Dockerfile b/container/templates/wheel_builder.Dockerfile index 19e07cbf64f9..faa3865c045e 100644 --- a/container/templates/wheel_builder.Dockerfile +++ b/container/templates/wheel_builder.Dockerfile @@ -255,9 +255,16 @@ RUN if [ "$USE_SCCACHE" = "true" ]; then \ ENV SCCACHE_BUCKET=${USE_SCCACHE:+${SCCACHE_BUCKET}} \ SCCACHE_REGION=${USE_SCCACHE:+${SCCACHE_REGION}} -# Always build FFmpeg so libs are available for Rust checks in CI -# Do not delete the source tarball for legal reasons +# Always build FFmpeg so libs are available for Rust checks in CI. +# We also build the ffmpeg CLI with h264_nvenc + libvpx_vp9 encoders so Python +# code can encode video without the GPL-licensed binary shipped by imageio-ffmpeg. +# Stays LGPL-only: --disable-gpl --disable-nonfree are preserved; H.264 comes from +# NVIDIA's NVENC (proprietary HW encoder, already a runtime dependency of these +# GPU images) and VP9 from libvpx (BSD). +# Do not delete the source tarball for legal reasons. 
ARG FFMPEG_VERSION +ARG NV_CODEC_HEADERS_REF +ARG LIBVPX_REF RUN --mount=type=secret,id=aws-web-identity-token,target=/run/secrets/aws-token \ --mount=type=secret,id=aws-role-arn,env=AWS_ROLE_ARN \ export AWS_WEB_IDENTITY_TOKEN_FILE=/run/secrets/aws-token && \ @@ -266,11 +273,26 @@ RUN --mount=type=secret,id=aws-web-identity-token,target=/run/secrets/aws-token eval $(/tmp/use-sccache.sh setup-env); \ fi && \ if [ "$DEVICE" = "xpu" ] || [ "$DEVICE" = "cpu" ]; then \ - apt-get update -y && apt-get install -y build-essential pkg-config xz-utils; \ + apt-get update -y && apt-get install -y build-essential pkg-config xz-utils git yasm; \ apt-get clean && rm -rf /var/lib/apt/lists/*; \ elif [ "$DEVICE" = "cuda" ]; then \ - dnf install -y --setopt=tsflags=nocontexts pkg-config xz; \ + dnf install -y --setopt=tsflags=nocontexts pkg-config xz git yasm; \ fi && \ + # nv-codec-headers: provides the NVENC/NVDEC API headers ffmpeg compiles against. + # Header-only, no runtime dep here; libcuda/libnvidia-encode are loaded at runtime + # in the consuming container. + cd /tmp && \ + git clone --depth 1 --branch ${NV_CODEC_HEADERS_REF} https://github.com/FFmpeg/nv-codec-headers.git && \ + make -C nv-codec-headers PREFIX=/usr/local install && \ + # libvpx: BSD-licensed VP9 encoder needed for the WebM output path. Built from + # source so we don't need to track distro package names (libvpx-dev on Debian + # vs libvpx-devel via EPEL on RHEL/manylinux). 
+ git clone --depth 1 --branch ${LIBVPX_REF} https://chromium.googlesource.com/webm/libvpx.git && \ + cd libvpx && \ + ./configure --prefix=/usr/local --enable-shared --disable-static --disable-examples --disable-unit-tests --disable-tools --disable-docs && \ + make -j$(nproc) && \ + make install && \ + ldconfig && \ cd /tmp && \ curl --retry 5 --retry-delay 3 -LO https://ffmpeg.org/releases/ffmpeg-${FFMPEG_VERSION}.tar.xz && \ tar xf ffmpeg-${FFMPEG_VERSION}.tar.xz && \ @@ -279,17 +301,21 @@ RUN --mount=type=secret,id=aws-web-identity-token,target=/run/secrets/aws-token --prefix=/usr/local \ --disable-gpl \ --disable-nonfree \ - --disable-programs \ --disable-doc \ --disable-static \ --disable-x86asm \ --disable-network \ - --disable-encoders \ - --disable-muxers \ --disable-bsfs \ --disable-devices \ --disable-libdrm \ - --enable-shared && \ + --enable-shared \ + --enable-nvenc \ + --enable-libvpx \ + --disable-encoders \ + --enable-encoder=h264_nvenc,libvpx_vp9 \ + --disable-muxers \ + --enable-muxer=mov,mp4,matroska,webm \ + --enable-protocol=file,pipe && \ make -j$(nproc) && \ make install && \ /tmp/use-sccache.sh show-stats "FFMPEG" && \ @@ -401,7 +427,7 @@ ENV PKG_CONFIG_PATH="/usr/local/libfabric/lib/pkgconfig:${PKG_CONFIG_PATH}" {% if framework == "vllm" and device == "cuda" %} # Build and install AWS SDK C++ (required for NIXL OBJ backend / S3 support) -ARG AWS_SDK_CPP_VERSION=1.11.760 +ARG AWS_SDK_CPP_VERSION RUN --mount=type=secret,id=aws-web-identity-token,target=/run/secrets/aws-token \ --mount=type=secret,id=aws-role-arn,env=AWS_ROLE_ARN \ export AWS_WEB_IDENTITY_TOKEN_FILE=/run/secrets/aws-token && \ diff --git a/docs/backends/trtllm/trtllm-diffusion.md b/docs/backends/trtllm/trtllm-diffusion.md index 360c9928d75e..18044ceba9ec 100644 --- a/docs/backends/trtllm/trtllm-diffusion.md +++ b/docs/backends/trtllm/trtllm-diffusion.md @@ -15,10 +15,12 @@ image generation through `--modality image_diffusion` flag. 
- **TensorRT-LLM with visual_gen**: The `visual_gen` module is part of TensorRT-LLM (`tensorrt_llm._torch.visual_gen`). Install TensorRT-LLM following the [official instructions](https://github.com/NVIDIA/TensorRT-LLM#installation). - **dynamo-runtime with multimodal API**: The Dynamo runtime must include `ModelType.Videos` or `ModelType.Images` support. Ensure you're using a compatible version. -- **VIDEO diffusion: imageio with ffmpeg**: Required for encoding generated frames to MP4 video: +- **VIDEO diffusion: imageio with ffmpeg**: Required for encoding generated frames to MP4 video. The Dynamo TRT-LLM runtime container ships an LGPL-only ffmpeg CLI built with the NVIDIA NVENC H.264 encoder (`h264_nvenc`) and `libvpx_vp9` for WebM, and points `imageio` at it via `IMAGEIO_FFMPEG_EXE=/usr/local/bin/ffmpeg` — the GPL-encumbered ffmpeg binary normally shipped inside the `imageio-ffmpeg` PyPI wheel is **not** installed. If you're running outside the container, install the Python wrapper without the bundled binary and point it at your own ffmpeg: ```bash - pip install imageio[ffmpeg] + pip install --no-binary imageio-ffmpeg "imageio[ffmpeg]" + export IMAGEIO_FFMPEG_EXE=/path/to/your/ffmpeg ``` + MP4 output requires an NVIDIA GPU at runtime (NVENC is a hardware encoder). ## Supported Models From 2c480644baef440ffaf67f308166c43080756975 Mon Sep 17 00:00:00 2001 From: Harrison King Saturley-Hall Date: Sun, 31 May 2026 05:12:14 -0400 Subject: [PATCH 8/9] fix(omni): install git in vllm-runtime for git-based vllm-omni install The vllm-runtime build failed at install_vllm_omni.sh with "Git executable not found" because uv needs git to fetch the vllm-omni PR pin (git+https://...@65b83d87), but the upstream vllm/vllm-openai runtime image does not ship git. The released-wheel install never needed it. Add git to the existing omni apt step, gated on VLLM_OMNI_GIT_URL via ${VLLM_OMNI_GIT_URL:+git} so the PyPI-wheel path (and the eventual revert) keeps the runtime image lean. 
Signed-off-by: Harrison King Saturley-Hall Co-Authored-By: Claude Opus 4.8 --- container/templates/vllm_runtime.Dockerfile | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/container/templates/vllm_runtime.Dockerfile b/container/templates/vllm_runtime.Dockerfile index 65a39711f014..cd6322ed8ce0 100644 --- a/container/templates/vllm_runtime.Dockerfile +++ b/container/templates/vllm_runtime.Dockerfile @@ -164,13 +164,16 @@ RUN --mount=type=cache,target=/root/.cache/uv,sharing=locked \ fi # vLLM-Omni's audio helpers shell out to SoX, and the launch script examples use -# jq for readable curl output just like the upstream omni image does. +# jq for readable curl output just like the upstream omni image does. git is only +# pulled in when VLLM_OMNI_GIT_URL is set (installing vllm-omni from an unreleased +# git ref) — the upstream vllm runtime image does not ship git. RUN set -eux; \ apt-get update; \ DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ jq \ sox \ - libsox-fmt-all; \ + libsox-fmt-all \ + ${VLLM_OMNI_GIT_URL:+git}; \ rm -rf /var/lib/apt/lists/* # Layer the released vLLM-Omni package matching the pinned upstream ref while From 271214ef7366d73543247c17da432cc07d0b5526 Mon Sep 17 00:00:00 2001 From: Harrison King Saturley-Hall Date: Sun, 31 May 2026 08:41:03 -0400 Subject: [PATCH 9/9] test(omni): use real PIL images in output_formatter image tests The diffusion image tests fed bare MagicMock() objects as images. Since ebe677986b2 routed _prepare_images through normalize_image_frames(), a non-PIL input takes the np.asarray(item).max() path; MagicMock.__iter__ defaults to empty, so np.asarray(MagicMock()) is a zero-size array and arr.max() raises "zero-size array to reduction operation maximum". These 8 tests only ran in CI once the runtime image built, exposing the failure. 
Swap the MagicMock image doubles for real PIL images via a _make_pil_image() helper, so they hit the isinstance(item, Image.Image) pass-through and img.save(buf, format="PNG") produces real PNG bytes. Assertions unchanged. Signed-off-by: Harrison King Saturley-Hall Co-Authored-By: Claude Opus 4.8 --- .../vllm/tests/omni/test_output_formatter.py | 39 ++++++++----------- 1 file changed, 17 insertions(+), 22 deletions(-) diff --git a/components/src/dynamo/vllm/tests/omni/test_output_formatter.py b/components/src/dynamo/vllm/tests/omni/test_output_formatter.py index db04855787c1..d45cb30f6518 100644 --- a/components/src/dynamo/vllm/tests/omni/test_output_formatter.py +++ b/components/src/dynamo/vllm/tests/omni/test_output_formatter.py @@ -115,36 +115,39 @@ def _make_diffusion_formatter(): ) +def _make_pil_image(size=(4, 4)): + # Use a real PIL image: normalize_image_frames() passes PIL inputs through + # unchanged, whereas a MagicMock falls into the np.asarray(item).max() path + # and raises "zero-size array to reduction operation maximum". 
+ from PIL import Image + + return Image.new("RGB", size, (123, 222, 64)) + + class TestDiffusionFormatterPrepareImages: @pytest.mark.asyncio async def test_b64_json(self): f = _make_diffusion_formatter() - img = MagicMock() - img.save = lambda b, format: b.write(b"fake_png_data") - results = await f._prepare_images([img], "req-1", "b64_json") + results = await f._prepare_images([_make_pil_image()], "req-1", "b64_json") assert len(results) == 1 assert results[0].startswith("data:image/png;base64,") @pytest.mark.asyncio async def test_b64_default_when_none(self): f = _make_diffusion_formatter() - img = MagicMock() - img.save = lambda b, format: b.write(b"data") - results = await f._prepare_images([img], "req-1", None) + results = await f._prepare_images([_make_pil_image()], "req-1", None) assert results[0].startswith("data:image/png;base64,") @pytest.mark.asyncio async def test_invalid_format(self): f = _make_diffusion_formatter() with pytest.raises(ValueError, match="Invalid response format"): - await f._prepare_images([MagicMock()], "req-1", "invalid") + await f._prepare_images([_make_pil_image()], "req-1", "invalid") @pytest.mark.asyncio async def test_multiple_images(self): f = _make_diffusion_formatter() - imgs = [MagicMock() for _ in range(3)] - for img in imgs: - img.save = lambda b, format: b.write(b"px") + imgs = [_make_pil_image() for _ in range(3)] results = await f._prepare_images(imgs, "req-1", "b64_json") assert len(results) == 3 @@ -155,10 +158,8 @@ async def test_chat_completion_format(self): from dynamo.common.utils.output_modalities import RequestType f = _make_diffusion_formatter() - img = MagicMock() - img.save = lambda b, format: b.write(b"px") chunk = await f._encode_image( - [img], "req-1", request_type=RequestType.CHAT_COMPLETION + [_make_pil_image()], "req-1", request_type=RequestType.CHAT_COMPLETION ) assert chunk["object"] == "chat.completion.chunk" assert chunk["choices"][0]["delta"]["content"][0]["type"] == "image_url" @@ -168,10 +169,8 
@@ async def test_image_generation_b64_format(self): from dynamo.common.utils.output_modalities import RequestType f = _make_diffusion_formatter() - img = MagicMock() - img.save = lambda b, format: b.write(b"px") chunk = await f._encode_image( - [img], + [_make_pil_image()], "req-1", response_format="b64_json", request_type=RequestType.IMAGE_GENERATION, @@ -183,10 +182,8 @@ async def test_image_generation_default_format_returns_b64(self): from dynamo.common.utils.output_modalities import RequestType f = _make_diffusion_formatter() - img = MagicMock() - img.save = lambda b, format: b.write(b"px") chunk = await f._encode_image( - [img], + [_make_pil_image()], "req-1", response_format=None, request_type=RequestType.IMAGE_GENERATION, @@ -381,9 +378,7 @@ async def test_routes_image(self): f = OutputFormatter(model_name="test-model") stage = MagicMock() stage.final_output_type = "image" - img = MagicMock() - img.save = lambda b, format: b.write(b"px") - stage.images = [img] + stage.images = [_make_pil_image()] chunk = await f.format( stage, "req-1", request_type=RequestType.CHAT_COMPLETION, **self._FULL_CTX )