Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
d282d39
feat(vllm-omni): bump to 0.20.0rc1 and align with upstream vllm v0.20.0
May 2, 2026
0b31a62
fix(vllm-omni): split cuda/build requirements install for vllm v0.20.0
May 5, 2026
dc1af5b
refactor(vllm-omni): align Dockerfile with upstream vllm v0.20.0 and …
May 5, 2026
5dc4627
chore(vllm-omni): compute SETUPTOOLS_SCM_PRETEND_VERSION in versions.env
May 5, 2026
d1d1ba1
fix(vllm-omni): explicit cuda-compat-13-0 upgrade for CVE-2025-33219
May 5, 2026
d8be5aa
feat(vllm-omni): add stable-audio-open-1.0 to smoke-test matrix
May 7, 2026
00bce6f
feat(vllm-omni): support ref_audio_s3 in smoke-test, add voice-clone …
May 7, 2026
b6a3627
fix(vllm-omni): pass large request body to curl via file, not argv
May 7, 2026
85fe09f
feat(vllm-omni): bump to v0.20.0 final and expand smoke-test matrix
May 8, 2026
da00c60
chore: fix ruff lint+format on scripts/autocurrency/agent-fix.py
May 8, 2026
52df3a3
Merge branch 'main' into omni-0.20.0
Yadan-Wei May 8, 2026
fe1a499
fix: resolve agent-fix.py merge-conflict markers from main merge
May 8, 2026
2a736bf
chore(vllm-omni): allowlist GHSA-98h9-4798-4q5v (diffusers trust_remo…
May 8, 2026
4a03c47
docs(vllm-omni): note --middleware contract in SageMaker proxy module
May 8, 2026
2393207
test(vllm-omni): add async SageMaker endpoint test for video generation
May 8, 2026
d60a22f
Merge branch 'main' into omni-0.20.0
Yadan-Wei May 8, 2026
2a94ad3
chore: sync agent-fix.py with main (structured GitHub API + prompt fi…
May 8, 2026
b85fd71
refactor(vllm-omni): inline capacity-skip into async_endpoint fixture
May 8, 2026
1611a11
Merge remote-tracking branch 'origin/omni-0.20.0' into omni-0.20.0
May 8, 2026
48a26f3
chore(vllm-omni): allowlist 2 go/stdlib CVEs in mooncake/libetcd_wrap…
May 8, 2026
02691e9
test(vllm-omni): switch video async endpoint test to ml.g5.2xlarge
May 11, 2026
bf4f94d
chore(vllm-omni): allowlist 3 more mooncake go/stdlib CVEs
May 11, 2026
97e64fb
ci(vllm-omni): rename prod_image tags + bump cosyvoice fleet to g6e.x…
May 11, 2026
b2f9c8d
chore(vllm-omni): bump DLC_MINOR_VERSION 0 → 1
May 11, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions .github/config/image/vllm-omni-ec2-amzn2023.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,15 @@ image:

common:
framework: "vllm_omni"
framework_version: "0.18.0"
vllm_ref: "v0.18.0"
framework_version: "0.20.0"
vllm_ref: "v0.20.0"
job_type: "general"
python_version: "py312"
cuda_version: "cu129"
cuda_version: "cu130"
os_version: "amzn2023"
customer_type: "ec2"
arch_type: "x86"
prod_image: "vllm-omni:0.18-gpu-py312-ec2"
prod_image: "vllm:omni-cuda-v1"
device_type: "gpu"
contributor: "None"

Expand Down
8 changes: 4 additions & 4 deletions .github/config/image/vllm-omni-sagemaker-amzn2023.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,16 +6,16 @@ image:

common:
framework: "vllm_omni"
framework_version: "0.18.0"
vllm_ref: "v0.18.0"
framework_version: "0.20.0"
vllm_ref: "v0.20.0"
job_type: "general"
python_version: "py312"
cuda_version: "cu129"
cuda_version: "cu130"
os_version: "amzn2023"
customer_type: "sagemaker"
platform: "sagemaker"
arch_type: "x86"
prod_image: "vllm-omni:0.18-gpu-py312-sagemaker"
prod_image: "vllm:omni-sagemaker-cuda-v1"
device_type: "gpu"
contributor: "None"

Expand Down
88 changes: 87 additions & 1 deletion .github/config/model-tests/vllm-omni-model-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,33 @@ smoke-test:
test_request: '{"input": "Hello, how are you?", "voice": "vivian", "language": "English"}'
validate: "binary_size_gt:1000"

# Voice-clone TTS: ref_audio_s3 is fetched by the workflow, base64-encoded,
# and injected as ref_audio before invoking the smoke-test script.
# ref_text MUST be the exact transcript of the reference audio — mismatched
# transcripts can cause Code2Wav malformed output (upstream issue #3124).
- name: "qwen3-tts-12hz-1.7b-base"
s3_model: "qwen3-tts-12hz-1.7b-base.tar.gz"
fleet: "x86-g6xl-runner"
extra_args: ""
route: "/v1/audio/speech"
test_request: '{"input": "Hello, this is a voice cloning smoke test.", "ref_audio_s3": "s3://dlc-cicd-models/test-fixtures/audio/tts_ref_vivian.wav", "ref_text": "The quick brown fox jumps over the lazy dog near the riverbank at sunset.", "language": "English"}'
validate: "binary_size_gt:1000"

# CosyVoice3 is zero-shot voice-clone only (no preset voices). The upstream
# vendored fixture ships at tests/assets/cosyvoice3/zero_shot_prompt.wav;
# we mirror it under test-fixtures/audio/ for CI isolation.
# Fleet bumped from x86-g6xl-runner (16 GB RAM) to x86-g6exl-runner
# (32 GB RAM) on 2026-05-11: cosyvoice with --trust-remote-code on 16 GB
# was causing host SIGKILL during model load on vllm-omni 0.20.0 final.
# Last green run was 2026-05-07 on rc1; regression in final.
- name: "cosyvoice3-0.5b"
s3_model: "cosyvoice3-0.5b.tar.gz"
fleet: "x86-g6exl-runner"
extra_args: "--trust-remote-code --enforce-eager"
route: "/v1/audio/speech"
test_request: '{"input": "Hello, this is a voice cloning smoke test.", "ref_audio_s3": "s3://dlc-cicd-models/test-fixtures/audio/cosyvoice3_ref.wav", "ref_text": "希望你以后能够做的比我还好呦。", "response_format": "wav", "stream": false}'
validate: "binary_size_gt:1000"

# --- Image generation models (route: /v1/images/generations) ---
- name: "flux2-klein-4b"
s3_model: "flux2-klein-4b.tar.gz"
Expand All @@ -36,7 +63,18 @@ smoke-test:
test_request: '{"prompt": "a red apple on a white table", "size": "512x512", "n": 1}'
validate: "json_field:data[0].b64_json"

# --- Video generation models (route: /v1/videos) ---
# ERNIE-Image-Turbo: 8-step distilled DiT image gen, added in vllm-omni
# #2861. ErnieImagePipeline only landed in v0.20.0 final (post-rc1).
- name: "ernie-image-turbo"
s3_model: "ernie-image-turbo.tar.gz"
fleet: "x86-g6exl-runner"
extra_args: ""
route: "/v1/images/generations"
test_request: '{"prompt": "a red apple on a white table", "size": "512x512", "n": 1}'
validate: "json_field:data[0].b64_json"

# --- Video generation models ---
# Async route (POST /v1/videos): returns job ID, requires polling.
- name: "wan2.1-t2v-1.3b"
s3_model: "wan2.1-t2v-1.3b.tar.gz"
fleet: "x86-g6exl-runner"
Expand All @@ -46,6 +84,54 @@ smoke-test:
test_request: 'prompt=a dog running on a beach&num_frames=17&num_inference_steps=4&size=480x320&seed=42'
validate: "json_field:id"

# Sync route (POST /v1/videos/sync): blocks until complete, returns raw
# video/mp4 bytes. New in v0.20.0 — compatible with SageMaker endpoints.
- name: "wan2.1-t2v-1.3b-sync"
s3_model: "wan2.1-t2v-1.3b.tar.gz"
fleet: "x86-g6exl-runner"
extra_args: ""
route: "/v1/videos/sync"
content_type: "multipart/form-data"
test_request: 'prompt=a dog running on a beach&num_frames=17&num_inference_steps=4&size=480x320&seed=42'
validate: "binary_size_gt:1000"

# Wan2.1-VACE: unified video creation/editing pipeline (WanVACEPipeline,
# added in vllm-omni #1885). Distinct from WanPipeline T2V — accepts
# text + optional video/mask/reference image. 1.3B variant fits L40S.
# Validated 2026-05-08 on g6e.2xlarge: 46 KB MP4 in 2.37s, peak GPU 19.3 GB.
- name: "wan2.1-vace-1.3b"
s3_model: "wan2.1-vace-1.3b.tar.gz"
fleet: "x86-g6exl-runner"
extra_args: ""
route: "/v1/videos/sync"
content_type: "multipart/form-data"
test_request: 'prompt=a dog running on a beach&num_frames=17&num_inference_steps=4&size=480x320&seed=42'
validate: "binary_size_gt:1000"

# Wan2.2-I2V-A14B: 27B-total / 14B-active MoE, image-to-video. Tarball
# is 107 GB and needs g6e.12xlarge. Pre-staged at
# s3://dlc-cicd-models/omni-models/wan2.2-i2v-a14b.tar.gz but not enabled
# because (a) g6e.12xlarge is currently hitting ICE (insufficient-capacity
# errors) in us-west-2, and (b) /v1/videos/sync for I2V needs an `image`
# form field that the current smoke-test harness doesn't fetch (analogous
# to the ref_audio_s3 pattern, but for images).
# - name: "wan2.2-i2v-a14b"
# s3_model: "wan2.2-i2v-a14b.tar.gz"
# fleet: "x86-g6e12xl-runner"
# extra_args: ""
# route: "/v1/videos/sync"
# content_type: "multipart/form-data"
# test_request: 'prompt=a dog running on a beach&image_s3=s3://dlc-cicd-models/test-fixtures/images/i2v_seed.png&num_frames=17&num_inference_steps=4&size=480x320&seed=42'
# validate: "binary_size_gt:1000"

# --- Audio generation models (route: /v1/audio/generate, new in v0.20.0 per vllm-project/vllm-omni#1794) ---
- name: "stable-audio-open-1.0"
s3_model: "stable-audio-open-1.0.tar.gz"
fleet: "x86-g6xl-runner"
extra_args: "--gpu-memory-utilization 0.9 --trust-remote-code --enforce-eager"
route: "/v1/audio/generate"
test_request: '{"input": "The sound of a dog barking", "audio_length": 5.0, "guidance_scale": 7.0, "num_inference_steps": 50, "seed": 42}'
validate: "binary_size_gt:10000"

# --- Omni chat models (route: /v1/chat/completions, fallthrough) ---
# model is big, won't run for now
# - name: "bagel-7b-mot"
Expand Down
32 changes: 30 additions & 2 deletions .github/workflows/reusable-vllm-omni-model-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ jobs:
${{ inputs.image-uri }} \
--model ${{ steps.resolve.outputs.model_path }} \
--port 8080 \
--stage-init-timeout 600 \
--stage-init-timeout 900 \
${{ matrix.model.extra_args }})
echo "CONTAINER_ID=$CONTAINER_ID" >> $GITHUB_ENV

Expand Down Expand Up @@ -158,11 +158,39 @@ jobs:
docker cp test/vllm-omni/scripts/vllm_omni_${{ inputs.customer-type }}_smoke_test.sh \
${CONTAINER_ID}:/tmp/smoke_test.sh

- name: Prepare test request
  # Write test_request to /tmp/test_request.body and copy it into the
  # container. If the request carries a ref_audio_s3 key, download that
  # object and replace the key with ref_audio holding an inline base64
  # data URL. Handing the body over as a file (instead of an argv string)
  # keeps it clear of the shell argument-length limit (~128KB) once the
  # inlined audio pushes the JSON past ~400KB.
  env:
    # Pass the matrix value to the shell as data. Inlining the expression
    # into the script pastes it in as source text, so an apostrophe in a
    # test prompt would terminate the single-quoted string and break (or
    # inject commands into) this step.
    TEST_REQUEST: ${{ matrix.model.test_request }}
  run: |
    set -euo pipefail
    REQUEST="$TEST_REQUEST"
    if [[ "$REQUEST" == *ref_audio_s3* ]]; then
      REF_S3=$(python3 -c 'import json,sys;print(json.loads(sys.stdin.read()).get("ref_audio_s3",""))' <<< "$REQUEST")
      if [ -n "$REF_S3" ]; then
        echo "Fetching ref_audio from $REF_S3"
        aws s3 cp "$REF_S3" /tmp/ref_audio.wav --quiet
        # The heredoc body and its PY terminator sit at the block scalar's
        # base indent so that, once YAML strips that indent, the terminator
        # starts at column 0 as bash requires for a plain << heredoc.
        REQUEST=$(REQUEST="$REQUEST" python3 <<'PY'
    import base64, json, os
    payload = json.loads(os.environ["REQUEST"])
    with open("/tmp/ref_audio.wav", "rb") as f:
        payload["ref_audio"] = "data:audio/wav;base64," + base64.b64encode(f.read()).decode()
    payload.pop("ref_audio_s3", None)
    print(json.dumps(payload))
    PY
        )
      fi
    fi
    # printf is a shell builtin, so the (possibly large) body never goes
    # through an exec argv here.
    printf '%s' "$REQUEST" > /tmp/test_request.body
    echo "Request size: $(wc -c < /tmp/test_request.body) bytes"
    docker cp /tmp/test_request.body "${CONTAINER_ID}:/tmp/test_request.body"

- name: Run smoke test
run: |
docker exec ${CONTAINER_ID} bash /tmp/smoke_test.sh \
"${{ matrix.model.route }}" \
'${{ matrix.model.test_request }}' \
'@/tmp/test_request.body' \
"${{ matrix.model.validate }}" \
"${{ matrix.model.content_type || 'application/json' }}"

Expand Down
95 changes: 52 additions & 43 deletions docker/vllm_omni/Dockerfile.amzn2023
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
ARG CUDA_VERSION=12.9.1
ARG CUDA_VERSION=13.0.2
ARG PYTHON_VERSION=3.12
ARG VLLM_VERSION=0.18.0
ARG FLASHINFER_VERSION=0.6.6
ARG DEEPGEMM_GIT_REF=477618cd51baffca09c4b0b87e97c03fe827ef03
ARG VLLM_VERSION=0.20.0
ARG FLASHINFER_VERSION=0.6.8.post1
ARG DEEPEP_COMMIT_HASH=73b6ea4

# =============================================================================
Expand Down Expand Up @@ -67,10 +66,16 @@ WORKDIR /workspace/vllm
COPY --from=source /src/ ./

# Install PyTorch + build deps (slow-changing, cached)
RUN --mount=type=cache,target=/root/.cache/uv uv pip install -r requirements/cuda.txt -r requirements/build.txt \
# Upstream vllm v0.20.0 moved requirements/build.txt → requirements/build/cuda.txt
# and installs it as a separate step after cuda.txt so that only cuda.txt touches
# the torch index URL resolution. See docker/Dockerfile in vllm v0.20.0.
RUN --mount=type=cache,target=/root/.cache/uv uv pip install -r requirements/cuda.txt \
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')

ARG torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0 10.0 12.0'
RUN --mount=type=cache,target=/root/.cache/uv uv pip install -r requirements/build/cuda.txt \
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')

ARG torch_cuda_arch_list='7.5 8.0 8.6 8.9 9.0 10.0 11.0 12.0+PTX'
ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}

ARG max_jobs=32
Expand All @@ -82,11 +87,14 @@ ENV NVCC_THREADS=${nvcc_threads}
ENV VLLM_TARGET_DEVICE=cuda
ARG VLLM_REF
ARG VLLM_VERSION
ENV SETUPTOOLS_SCM_PRETEND_VERSION="${VLLM_VERSION}+amzn2023"
# Wheel version tag — pass via --build-arg from workflow (see docker/vllm_omni/versions.env).
# Example: 0.20.0+amzn2023.abcdef12
ARG SETUPTOOLS_SCM_PRETEND_VERSION
ENV SETUPTOOLS_SCM_PRETEND_VERSION=${SETUPTOOLS_SCM_PRETEND_VERSION:-${VLLM_VERSION}+amzn2023}

# --- Pre-built wheel support ---
# Fetch wheels from S3 into docker/vllm/prebuilt_wheels/ BEFORE docker build:
# bash scripts/vllm/amzn2023/fetch_cached_wheels.sh cu129 v0.18.0
# bash scripts/vllm/amzn2023/fetch_cached_wheels.sh cu130 v0.20.0
# The directory is empty by default so COPY always succeeds.
COPY docker/vllm/prebuilt_wheels/ /tmp/prebuilt_wheels/

Expand All @@ -102,7 +110,7 @@ COPY docker/vllm/sccache-cache/ /root/.cache/sccache/
RUN --mount=type=cache,target=/root/.cache/ccache --mount=type=cache,target=/root/.cache/uv \
if [ "$USE_PREBUILT_WHEEL" = "1" ] && ls /tmp/prebuilt_wheels/*.whl >/dev/null 2>&1; then \
echo "✅ Using pre-built vLLM wheel — skipping compilation" \
&& mkdir -p dist \
&& rm -rf dist && mkdir -p dist \
&& cp /tmp/prebuilt_wheels/*.whl dist/; \
elif [ "$USE_SCCACHE" = "1" ]; then \
echo "🔧 Building vLLM with sccache..." \
Expand All @@ -126,16 +134,6 @@ RUN --mount=type=cache,target=/root/.cache/ccache --mount=type=cache,target=/roo
&& python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \
fi

# Build DeepGEMM wheel
ARG DEEPGEMM_GIT_REF
RUN --mount=type=cache,target=/root/.cache/uv mkdir -p /tmp/deepgemm/dist \
&& VLLM_DOCKER_BUILD_CONTEXT=1 TORCH_CUDA_ARCH_LIST="9.0a 10.0a" \
tools/install_deepgemm.sh --cuda-version "${CUDA_VERSION}" \
${DEEPGEMM_GIT_REF:+--ref "$DEEPGEMM_GIT_REF"} \
--wheel-dir /tmp/deepgemm/dist \
|| echo "DeepGEMM build skipped (CUDA version requirement not met)"
RUN mkdir -p /tmp/deepgemm/dist

# Build DeepEP wheels
ARG DEEPEP_COMMIT_HASH
ARG NVSHMEM_VER
Expand Down Expand Up @@ -193,29 +191,26 @@ COPY --from=build /workspace/vllm/requirements/cuda.txt /tmp/cuda.txt
RUN --mount=type=cache,target=/root/.cache/uv uv pip install -r /tmp/cuda.txt \
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')

# Install vLLM wheel
# Install vLLM wheel (pick the most recent vllm-*.whl and log which one)
COPY --from=build /workspace/vllm/dist /tmp/vllm-dist
RUN --mount=type=cache,target=/root/.cache/uv uv pip install /tmp/vllm-dist/*.whl \
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')

# Install FlashInfer cubin/jit-cache
RUN --mount=type=cache,target=/root/.cache/uv ls /tmp/vllm-dist/*.whl \
&& VLLM_WHL=$(ls -t /tmp/vllm-dist/vllm-*.whl | head -1) \
&& echo "Installing ${VLLM_WHL}" \
&& uv pip install "${VLLM_WHL}" \
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')

# Install FlashInfer JIT cache (requires CUDA-version-specific index URL).
# flashinfer-python and flashinfer-cubin are already pulled in via requirements/cuda.txt.
# Pre-download cubins so the first inference request doesn't pay JIT compile latency.
ARG FLASHINFER_VERSION
RUN --mount=type=cache,target=/root/.cache/uv uv pip install flashinfer-cubin==${FLASHINFER_VERSION} \
flashinfer-jit-cache==${FLASHINFER_VERSION} \
--extra-index-url https://flashinfer.ai/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
RUN --mount=type=cache,target=/root/.cache/uv uv pip install flashinfer-jit-cache==${FLASHINFER_VERSION} \
--extra-index-url https://flashinfer.ai/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') \
&& flashinfer show-config \
&& flashinfer download-cubin

# Install serving extras
RUN --mount=type=cache,target=/root/.cache/uv uv pip install accelerate hf_transfer modelscope \
"bitsandbytes>=0.46.1" "timm>=1.0.17" "runai-model-streamer[s3,gcs]>=0.15.3"

# Install DeepGEMM wheel from build stage
COPY --from=build /tmp/deepgemm/dist /tmp/deepgemm/dist
RUN --mount=type=cache,target=/root/.cache/uv \
if ls /tmp/deepgemm/dist/*.whl >/dev/null 2>&1; then \
uv pip install /tmp/deepgemm/dist/*.whl; \
else \
echo "No DeepGEMM wheels to install; skipping."; \
fi && rm -rf /tmp/deepgemm
# Install serving extras (matches upstream vllm v0.20.0 serving extras set)
RUN --mount=type=cache,target=/root/.cache/uv uv pip install accelerate modelscope \
"bitsandbytes>=0.46.1" "timm>=1.0.17" "runai-model-streamer[s3,gcs]>=0.15.7"

# Install DeepEP wheels from build stage
COPY --from=build /tmp/ep_kernels_workspace/dist /tmp/ep_kernels/dist
Expand Down Expand Up @@ -259,12 +254,19 @@ WORKDIR /vllm-workspace

# Install the runtime OS packages (the Python interpreter selected by
# PYTHON_VERSION, libibverbs, shadow-utils, tar, gzip and the numactl
# packages) with weak dependencies disabled. The dnf cache is removed in the
# same RUN so it is not kept in this layer.
RUN dnf install -y --setopt=install_weak_deps=False \
python${PYTHON_VERSION} libibverbs shadow-utils tar gzip \
numactl numactl-libs numactl-devel \
&& dnf clean all && rm -rf /var/cache/dnf

# Install uv by piping the upstream installer script into sh, then put
# /root/.local/bin first on PATH (presumably where the installer places the
# uv binary — confirm against the installer's defaults).
# NOTE(review): the installer URL is unversioned, so the uv release can differ
# between builds. Unless pipefail is enabled by a SHELL instruction outside
# this hunk, a curl failure is masked by the pipe. Consider pinning a uv
# version and verifying the download.
RUN curl -LsSf https://astral.sh/uv/install.sh | sh
ENV PATH="/root/.local/bin:${PATH}"

# CUDA forward-compatibility toggle for datacenter GPUs with older drivers.
# Disabled (0) by default. Set to 1 at runtime
# (e.g., -e VLLM_ENABLE_CUDA_COMPATIBILITY=1) only when the host NVIDIA driver
# is older than what CUDA 13 normally requires.
# See: https://docs.nvidia.com/deploy/cuda-compatibility/
ENV VLLM_ENABLE_CUDA_COMPATIBILITY=0

# Runtime JIT compilation tools (Triton/Inductor, FlashInfer, DeepGEMM)
RUN CUDA_DASH=$(echo $CUDA_VERSION | cut -d. -f1,2 | tr '.' '-') \
&& dnf install -y --setopt=install_weak_deps=False \
Expand All @@ -284,21 +286,22 @@ ENV PATH="/opt/venv/bin:${PATH}"
ENV VIRTUAL_ENV="/opt/venv"
ENV LD_LIBRARY_PATH=/usr/local/nvidia/lib64:/usr/local/cuda/lib64:${LD_LIBRARY_PATH}
ENV VLLM_USAGE_SOURCE=production-docker-image
ENV HF_HUB_ENABLE_HF_TRANSFER=1
ENV HF_XET_HIGH_PERFORMANCE=1


# =============================================================================
# STAGE: omni-deps — install vllm-omni on top of runtime venv
# =============================================================================
FROM runtime AS omni-deps

ARG VLLM_OMNI_VERSION=0.18.0
ARG VLLM_OMNI_VERSION=0.20.0

# System deps for omni-modality (TTS, audio, image/video)
# Enable SPAL (Supplementary Packages for Amazon Linux) for espeak-ng, sox, ffmpeg
# Enable SPAL (Supplementary Packages for Amazon Linux) for espeak-ng, ffmpeg.
# sox dropped — vllm-omni v0.20.0 removed sox from its deps (#2745)
RUN dnf upgrade -y --releasever=latest --setopt=install_weak_deps=False system-release \
&& dnf install -y spal-release \
&& dnf install -y --setopt=install_weak_deps=False espeak-ng sox ffmpeg-free \
&& dnf install -y --setopt=install_weak_deps=False espeak-ng ffmpeg-free \
&& dnf clean all && rm -rf /var/cache/dnf

# Install vllm-omni (pure Python, no compilation)
Expand Down Expand Up @@ -400,6 +403,9 @@ FROM omni-base AS vllm-omni-ec2-amzn2023

ARG CACHE_REFRESH=0
# Apply security-only package upgrades, then upgrade cuda-compat-13-0 by name
# to pick up the fix for CVE-2025-33219: that package comes from the NVIDIA
# repo and is not flagged in AL2023 security advisories, so the --security
# pass misses it. Finish by clearing dnf caches and /tmp, and pointing
# /usr/bin/python3 at the venv interpreter.
RUN dnf upgrade -y --security --releasever latest --setopt=install_weak_deps=False \
    && dnf upgrade -y --releasever latest cuda-compat-13-0 \
    && dnf clean all \
    && rm -rf /var/cache/dnf /tmp/* \
    && ln -sf /opt/venv/bin/python3 /usr/bin/python3

Expand All @@ -413,6 +419,9 @@ FROM omni-base AS vllm-omni-sagemaker-amzn2023

ARG CACHE_REFRESH=0
# Apply security-only package upgrades, then upgrade cuda-compat-13-0 by name
# to pick up the fix for CVE-2025-33219: that package comes from the NVIDIA
# repo and is not flagged in AL2023 security advisories, so the --security
# pass misses it. Finish by clearing dnf caches and /tmp, and pointing
# /usr/bin/python3 at the venv interpreter.
RUN dnf upgrade -y --security --releasever latest --setopt=install_weak_deps=False \
    && dnf upgrade -y --releasever latest cuda-compat-13-0 \
    && dnf clean all \
    && rm -rf /var/cache/dnf /tmp/* \
    && ln -sf /opt/venv/bin/python3 /usr/bin/python3

Expand Down
Loading
Loading