Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .github/config/image/vllm-omni-ec2-amzn2023.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@ image:

common:
framework: "vllm_omni"
framework_version: "0.20.0"
vllm_ref: "v0.20.0"
framework_version: "0.21.0rc1"
vllm_ref: "v0.21.0"
job_type: "general"
python_version: "py312"
cuda_version: "cu130"
Expand Down
4 changes: 2 additions & 2 deletions .github/config/image/vllm-omni-sagemaker-amzn2023.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@ image:

common:
framework: "vllm_omni"
framework_version: "0.20.0"
vllm_ref: "v0.20.0"
framework_version: "0.21.0rc1"
vllm_ref: "v0.21.0"
job_type: "general"
python_version: "py312"
cuda_version: "cu130"
Expand Down
12 changes: 5 additions & 7 deletions .github/config/model-tests/vllm-omni-model-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -185,18 +185,16 @@ benchmark:
# See: https://github.com/vllm-project/vllm-omni/issues/3124
# Runs on L4 (x86-g6xl-runner);
#
# Thresholds temporarily loosened for vllm-omni 0.20.0: upstream regression
# introduced by vllm-omni#3203 (commit 01f500a5) un-batches Code2Wav decode
# chunks; observed RPS 0.281 vs prior 0.4, audio RTF mult 1.109 vs prior 1.6,
# p95 e2e 15919ms vs prior 11000ms. Fix is merged upstream as
# vllm-omni#3485 (post-0.20.0) and will land in the next omni point release.
# Re-tighten to (0.4 / 1.6 / 11000) once that release is picked up.
# Thresholds restored to pre-regression baseline (0.4 / 1.6 / 11000) on
# vllm-omni 0.21.0rc1: vllm-omni#3485 fix for the #3203 Code2Wav un-batching
# regression is now picked up. Observed on rc1: rps=1.302, audio rtf
# mult=5.033, p95 e2e=3499ms — comfortably within all three thresholds
# (RPS and RTF mult above their minimums, p95 e2e below its maximum).
- name: "qwen3-tts-12hz-1.7b-base"
s3_model: "qwen3-tts-12hz-1.7b-base.tar.gz"
fleet: "x86-g6xl-runner"
extra_args: ""
benchmark_type: "tts-base"
benchmark_config: '{"concurrency": 4, "num_prompts": 20, "ref_audio_s3": "s3://dlc-cicd-models/test-fixtures/audio/tts_ref_vivian.wav", "ref_text": "The quick brown fox jumps over the lazy dog near the riverbank at sunset.", "language": "English", "min_rps": 0.27, "min_audio_rtf_mult": 1.0, "max_p95_e2e_ms": 17000}'
benchmark_config: '{"concurrency": 4, "num_prompts": 20, "ref_audio_s3": "s3://dlc-cicd-models/test-fixtures/audio/tts_ref_vivian.wav", "ref_text": "The quick brown fox jumps over the lazy dog near the riverbank at sunset.", "language": "English", "min_rps": 0.4, "min_audio_rtf_mult": 1.6, "max_p95_e2e_ms": 11000}'

# CosyVoice3 zero-shot voice-clone — same /v1/audio/speech route as Qwen3-TTS,
# uses the tts-base benchmark client with ref_audio_s3. Fleet matches the
Expand Down
32 changes: 24 additions & 8 deletions .github/workflows/autorelease-vllm-omni.yml
Original file line number Diff line number Diff line change
Expand Up @@ -132,10 +132,16 @@ jobs:
- name: Fetch cached vLLM wheel
id: wheel-cache
run: |
# The workflow's framework-version is the omni package version
# (e.g. 0.21.0rc1), which is NOT the version stamped on the vllm
# wheel filename. Source versions.env to read VLLM_VERSION (the
# vllm core version, e.g. 0.21.0) so the cache key + filename glob
# match wheels uploaded by any workflow on the same vllm core.
set -a; source docker/vllm_omni/versions.env; set +a
OUTPUT=$(bash scripts/vllm/amzn2023/fetch_cached_wheels.sh \
${{ needs.load-config-ec2.outputs.cuda-version }} \
${{ needs.load-config-ec2.outputs.vllm-ref }} \
${{ needs.load-config-ec2.outputs.framework-version }})
"${VLLM_REF}" \
"${VLLM_VERSION}")
echo "$OUTPUT"
HIT=$(echo "$OUTPUT" | grep -o 'cache-hit=.*' | cut -d= -f2)
echo "hit=${HIT}" >> $GITHUB_OUTPUT
Expand Down Expand Up @@ -187,10 +193,12 @@ jobs:
- name: Upload vLLM wheel to cache
if: success() && steps.wheel-cache.outputs.hit != 'true'
run: |
# Use vllm core version, not omni package version (see fetch step).
set -a; source docker/vllm_omni/versions.env; set +a
bash scripts/vllm/amzn2023/upload_cached_wheels.sh \
${{ needs.load-config-ec2.outputs.cuda-version }} \
${{ needs.load-config-ec2.outputs.vllm-ref }} \
${{ needs.load-config-ec2.outputs.framework-version }}
"${VLLM_REF}" \
"${VLLM_VERSION}"

- name: Sync sccache cache to S3
if: success() && steps.wheel-cache.outputs.hit != 'true'
Expand All @@ -217,10 +225,16 @@ jobs:
- name: Fetch cached vLLM wheel
id: wheel-cache
run: |
# The workflow's framework-version is the omni package version
# (e.g. 0.21.0rc1), which is NOT the version stamped on the vllm
# wheel filename. Source versions.env to read VLLM_VERSION (the
# vllm core version, e.g. 0.21.0) so the cache key + filename glob
# match wheels uploaded by any workflow on the same vllm core.
set -a; source docker/vllm_omni/versions.env; set +a
OUTPUT=$(bash scripts/vllm/amzn2023/fetch_cached_wheels.sh \
${{ needs.load-config-sagemaker.outputs.cuda-version }} \
${{ needs.load-config-sagemaker.outputs.vllm-ref }} \
${{ needs.load-config-sagemaker.outputs.framework-version }})
"${VLLM_REF}" \
"${VLLM_VERSION}")
echo "$OUTPUT"
HIT=$(echo "$OUTPUT" | grep -o 'cache-hit=.*' | cut -d= -f2)
echo "hit=${HIT}" >> $GITHUB_OUTPUT
Expand Down Expand Up @@ -272,10 +286,12 @@ jobs:
- name: Upload vLLM wheel to cache
if: success() && steps.wheel-cache.outputs.hit != 'true'
run: |
# Use vllm core version, not omni package version (see fetch step).
set -a; source docker/vllm_omni/versions.env; set +a
bash scripts/vllm/amzn2023/upload_cached_wheels.sh \
${{ needs.load-config-sagemaker.outputs.cuda-version }} \
${{ needs.load-config-sagemaker.outputs.vllm-ref }} \
${{ needs.load-config-sagemaker.outputs.framework-version }}
"${VLLM_REF}" \
"${VLLM_VERSION}"

- name: Sync sccache cache to S3
if: success() && steps.wheel-cache.outputs.hit != 'true'
Expand Down
16 changes: 12 additions & 4 deletions .github/workflows/pr-vllm-omni-ec2-amzn2023.yml
Original file line number Diff line number Diff line change
Expand Up @@ -152,10 +152,16 @@ jobs:
- name: Fetch cached vLLM wheel
id: wheel-cache
run: |
# The workflow's framework-version is the omni package version
# (e.g. 0.21.0rc1), which is NOT the version stamped on the vllm
# wheel filename. Source versions.env to read VLLM_VERSION (the
# vllm core version, e.g. 0.21.0) so the cache key + filename glob
# match wheels uploaded by any workflow on the same vllm core.
set -a; source docker/vllm_omni/versions.env; set +a
OUTPUT=$(bash scripts/vllm/amzn2023/fetch_cached_wheels.sh \
${{ needs.load-config.outputs.cuda-version }} \
${{ needs.load-config.outputs.vllm-ref }} \
${{ needs.load-config.outputs.framework-version }})
"${VLLM_REF}" \
"${VLLM_VERSION}")
echo "$OUTPUT"
HIT=$(echo "$OUTPUT" | grep -o 'cache-hit=.*' | cut -d= -f2)
echo "hit=${HIT}" >> $GITHUB_OUTPUT
Expand Down Expand Up @@ -207,10 +213,12 @@ jobs:
- name: Upload vLLM wheel to cache
if: success() && steps.wheel-cache.outputs.hit != 'true'
run: |
# Use vllm core version, not omni package version (see fetch step).
set -a; source docker/vllm_omni/versions.env; set +a
bash scripts/vllm/amzn2023/upload_cached_wheels.sh \
${{ needs.load-config.outputs.cuda-version }} \
${{ needs.load-config.outputs.vllm-ref }} \
${{ needs.load-config.outputs.framework-version }}
"${VLLM_REF}" \
"${VLLM_VERSION}"

- name: Sync sccache cache to S3
if: success() && steps.wheel-cache.outputs.hit != 'true'
Expand Down
53 changes: 38 additions & 15 deletions docker/vllm_omni/Dockerfile.amzn2023
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
ARG CUDA_VERSION=13.0.2
ARG PYTHON_VERSION=3.12
ARG VLLM_VERSION=0.20.0
ARG VLLM_VERSION=0.21.0
ARG FLASHINFER_VERSION=0.6.8.post1
ARG DEEPEP_COMMIT_HASH=73b6ea4

Expand Down Expand Up @@ -201,14 +201,14 @@ RUN --mount=type=cache,target=/root/.cache/uv ls /tmp/vllm-dist/*.whl \

# Install FlashInfer JIT cache (requires CUDA-version-specific index URL).
# flashinfer-python and flashinfer-cubin are already pulled in via requirements/cuda.txt.
# Pre-download cubins so the first inference request doesn't pay JIT compile latency.
# Cubins are downloaded later, AFTER all wheel installs that may overwrite
# flashinfer files (vllm wheel, EP kernels, KV connectors). See upstream vllm
# v0.21.0 Dockerfile — downloading earlier wastes ~2.5 GB on layer duplication.
ARG FLASHINFER_VERSION
RUN --mount=type=cache,target=/root/.cache/uv uv pip install flashinfer-jit-cache==${FLASHINFER_VERSION} \
--extra-index-url https://flashinfer.ai/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') \
&& flashinfer show-config \
&& flashinfer download-cubin
--extra-index-url https://flashinfer.ai/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')

# Install serving extras (matches upstream vllm v0.20.0 serving extras set)
# Install serving extras (matches upstream vllm v0.21.0 serving extras set)
RUN --mount=type=cache,target=/root/.cache/uv uv pip install accelerate modelscope \
"bitsandbytes>=0.46.1" "timm>=1.0.17" "runai-model-streamer[s3,gcs]>=0.15.7"

Expand All @@ -228,9 +228,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \
CUDA_MAJOR="${CUDA_VERSION%%.*}"; \
CUDA_DASH=$(echo $CUDA_VERSION | cut -d. -f1,2 | tr '.' '-'); \
if [ "$INSTALL_KV_CONNECTORS" = "true" ]; then \
if [ "$CUDA_MAJOR" -ge 13 ]; then \
uv pip install nixl-cu13; \
fi; \
uv pip install -r /tmp/kv_connectors.txt --no-build || ( \
dnf install -y --setopt=install_weak_deps=False \
libcusparse-devel-${CUDA_DASH} \
Expand All @@ -240,8 +237,16 @@ RUN --mount=type=cache,target=/root/.cache/uv \
&& dnf remove -y libcusparse-devel-${CUDA_DASH} libcublas-devel-${CUDA_DASH} libcusolver-devel-${CUDA_DASH} \
&& dnf clean all && rm -rf /var/cache/dnf \
); \
# Force-reinstall the matching CUDA wheel so the correct nixl_ep_cpp.so
# is installed (upstream vllm v0.21.0 fix).
uv pip install --force-reinstall --no-deps nixl-cu${CUDA_MAJOR}; \
fi

# Pre-download FlashInfer cubins AFTER all wheel installs (vllm wheel, EP
# kernels, KV connectors) finish — earlier installs may overwrite flashinfer
# package files. Downloading here avoids ~2.5 GB layer duplication.
RUN flashinfer show-config && flashinfer download-cubin

# =============================================================================
# STAGE 3: runtime — minimal image with clean venv
# =============================================================================
Expand All @@ -267,7 +272,9 @@ ENV PATH="/root/.local/bin:${PATH}"
# See: https://docs.nvidia.com/deploy/cuda-compatibility/
ENV VLLM_ENABLE_CUDA_COMPATIBILITY=0

# Runtime JIT compilation tools (Triton/Inductor, FlashInfer, DeepGEMM)
# Runtime JIT compilation tools (Triton/Inductor, FlashInfer, DeepGEMM).
# Upstream vllm v0.21.0 switched libcublas → libcublas-devel so cublas headers
# are present at runtime for JIT (e.g. fastsafetensors / nccl_allocator).
RUN CUDA_DASH=$(echo $CUDA_VERSION | cut -d. -f1,2 | tr '.' '-') \
&& dnf install -y --setopt=install_weak_deps=False \
gcc python${PYTHON_VERSION}-devel \
Expand All @@ -276,7 +283,7 @@ RUN CUDA_DASH=$(echo $CUDA_VERSION | cut -d. -f1,2 | tr '.' '-') \
cuda-nvrtc-${CUDA_DASH} \
cuda-cuobjdump-${CUDA_DASH} \
libcurand-devel-${CUDA_DASH} \
libcublas-${CUDA_DASH} \
libcublas-devel-${CUDA_DASH} \
&& dnf clean all && rm -rf /var/cache/dnf

COPY --from=deps /opt/venv /opt/venv
Expand All @@ -294,7 +301,7 @@ ENV HF_XET_HIGH_PERFORMANCE=1
# =============================================================================
FROM runtime AS omni-deps

ARG VLLM_OMNI_VERSION=0.20.0
ARG VLLM_OMNI_VERSION=0.21.0rc1

# System deps for omni-modality (TTS, audio, image/video)
# Enable SPAL (Supplementary Packages for Amazon Linux) for espeak-ng, ffmpeg.
Expand All @@ -304,8 +311,24 @@ RUN dnf upgrade -y --releasever=latest --setopt=install_weak_deps=False system-r
&& dnf install -y --setopt=install_weak_deps=False espeak-ng ffmpeg-free \
&& dnf clean all && rm -rf /var/cache/dnf

# Install vllm-omni (pure Python, no compilation)
RUN --mount=type=cache,target=/root/.cache/uv uv pip install vllm-omni==${VLLM_OMNI_VERSION}
# Install vllm-omni (pure Python, no compilation).
# --prerelease=allow needed because 0.21.0rc1 is a PEP 440 pre-release;
# strip when bumping to a stable 0.21.0.
RUN --mount=type=cache,target=/root/.cache/uv uv pip install --prerelease=allow vllm-omni==${VLLM_OMNI_VERSION}

# Pin transformers <5.9.0. vllm-omni 0.21.0rc1's qwen3_tts module calls
# create_causal_mask(input_embeds=...). The kwarg was renamed to
# `inputs_embeds` in transformers 5.5.1 with a deprecated `input_embeds`
# alias kept in place — versions 5.5.1..5.8.1 still accept the call with a
# deprecation warning. transformers 5.9.0 (released 2026-05-20, see
# https://github.com/huggingface/transformers/releases/tag/v5.9.0) dropped
# the @deprecate_kwarg("input_embeds", ...) decorator from
# src/transformers/masking_utils.py, breaking qwen3-tts smoke tests with:
# TypeError: create_causal_mask() got an unexpected keyword argument 'input_embeds'
# vllm core 0.21.0's pin (>=4.56.0, !=5.0..5.4, !=5.5.0) is too loose — pip
# resolves to 5.9.x. Cap ourselves at <5.9.0 until vllm-omni updates the
# call site to use `inputs_embeds`.
RUN --mount=type=cache,target=/root/.cache/uv uv pip install --force-reinstall --no-deps "transformers>=4.56.0,<5.9.0"

# =============================================================================
# STAGE: builder-oss-omni — OSS compliance for omni venv
Expand All @@ -327,7 +350,7 @@ ARG PYTHON="python3"
ARG PYTHON_VERSION=3.12
ARG CUDA_VERSION
ARG DLC_MAJOR_VERSION=1
ARG DLC_MINOR_VERSION=0
ARG DLC_MINOR_VERSION=3

LABEL maintainer="Amazon AI"
LABEL dlc_major_version="${DLC_MAJOR_VERSION}"
Expand Down
9 changes: 5 additions & 4 deletions docker/vllm_omni/versions.env
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,10 @@

# ── vLLM source & version ──────────────────────────────────────
export VLLM_REPO="https://github.com/vllm-project/vllm.git"
export VLLM_VERSION="0.20.0"
export VLLM_VERSION="0.21.0"
export VLLM_REF="v${VLLM_VERSION}"
export VLLM_OMNI_VERSION="0.20.0"
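# vllm-omni 0.21.0rc1 is a PEP 440 pre-release; the installer must opt in to
# pre-releases (Dockerfile.amzn2023 uses `uv pip install --prerelease=allow`;
# the plain-pip equivalent is `--pre`).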
export VLLM_OMNI_VERSION="0.21.0rc1"

# Wheel version tag — PEP 440 local-version encoding the pinned ref for
# traceability. Commit SHAs are truncated to 8 chars; tags/branches are
Expand All @@ -36,9 +37,9 @@ export EFA_VERSION="1.47.0"

# ── DLC image versioning ───────────────────────────────────────
export DLC_MAJOR_VERSION="1"
export DLC_MINOR_VERSION="2"
export DLC_MINOR_VERSION="3"

# ── Build configuration ────────────────────────────────────────
# Aligned with upstream vllm v0.20.0 Dockerfile.
# Aligned with upstream vllm v0.21.0 Dockerfile.
export torch_cuda_arch_list="7.5 8.0 8.6 8.9 9.0 10.0 11.0 12.0+PTX"
export INSTALL_KV_CONNECTORS="true"
Loading