diff --git a/.github/config/image/vllm-omni-ec2-amzn2023.yml b/.github/config/image/vllm-omni-ec2-amzn2023.yml index aadeb2bb75fc..fbdbd835c8d2 100644 --- a/.github/config/image/vllm-omni-ec2-amzn2023.yml +++ b/.github/config/image/vllm-omni-ec2-amzn2023.yml @@ -6,8 +6,8 @@ image: common: framework: "vllm_omni" - framework_version: "0.20.0" - vllm_ref: "v0.20.0" + framework_version: "0.21.0rc1" + vllm_ref: "v0.21.0" job_type: "general" python_version: "py312" cuda_version: "cu130" diff --git a/.github/config/image/vllm-omni-sagemaker-amzn2023.yml b/.github/config/image/vllm-omni-sagemaker-amzn2023.yml index 9d7fe12575e3..c46dae617879 100644 --- a/.github/config/image/vllm-omni-sagemaker-amzn2023.yml +++ b/.github/config/image/vllm-omni-sagemaker-amzn2023.yml @@ -6,8 +6,8 @@ image: common: framework: "vllm_omni" - framework_version: "0.20.0" - vllm_ref: "v0.20.0" + framework_version: "0.21.0rc1" + vllm_ref: "v0.21.0" job_type: "general" python_version: "py312" cuda_version: "cu130" diff --git a/.github/config/model-tests/vllm-omni-model-tests.yml b/.github/config/model-tests/vllm-omni-model-tests.yml index 89d3287b8364..10954eb366c9 100644 --- a/.github/config/model-tests/vllm-omni-model-tests.yml +++ b/.github/config/model-tests/vllm-omni-model-tests.yml @@ -185,18 +185,16 @@ benchmark: # See: https://github.com/vllm-project/vllm-omni/issues/3124 # Runs on L4 (x86-g6xl-runner); # - # Thresholds temporarily loosened for vllm-omni 0.20.0: upstream regression - # introduced by vllm-omni#3203 (commit 01f500a5) un-batches Code2Wav decode - # chunks; observed RPS 0.281 vs prior 0.4, audio RTF mult 1.109 vs prior 1.6, - # p95 e2e 15919ms vs prior 11000ms. Fix is merged upstream as - # vllm-omni#3485 (post-0.20.0) and will land in the next omni point release. - # Re-tighten to (0.4 / 1.6 / 11000) once that release is picked up. 
+ # Thresholds restored to pre-regression baseline (0.4 / 1.6 / 11000) on + # vllm-omni 0.21.0rc1: vllm-omni#3485 fix for the #3203 Code2Wav un-batching + # regression is now picked up. Observed on rc1: rps=1.302, audio rtf + # mult=5.033, p95 e2e=3499ms — all well within the restored thresholds. - name: "qwen3-tts-12hz-1.7b-base" s3_model: "qwen3-tts-12hz-1.7b-base.tar.gz" fleet: "x86-g6xl-runner" extra_args: "" benchmark_type: "tts-base" - benchmark_config: '{"concurrency": 4, "num_prompts": 20, "ref_audio_s3": "s3://dlc-cicd-models/test-fixtures/audio/tts_ref_vivian.wav", "ref_text": "The quick brown fox jumps over the lazy dog near the riverbank at sunset.", "language": "English", "min_rps": 0.27, "min_audio_rtf_mult": 1.0, "max_p95_e2e_ms": 17000}' + benchmark_config: '{"concurrency": 4, "num_prompts": 20, "ref_audio_s3": "s3://dlc-cicd-models/test-fixtures/audio/tts_ref_vivian.wav", "ref_text": "The quick brown fox jumps over the lazy dog near the riverbank at sunset.", "language": "English", "min_rps": 0.4, "min_audio_rtf_mult": 1.6, "max_p95_e2e_ms": 11000}' # CosyVoice3 zero-shot voice-clone — same /v1/audio/speech route as Qwen3-TTS, # uses the tts-base benchmark client with ref_audio_s3. Fleet matches the diff --git a/.github/workflows/autorelease-vllm-omni.yml b/.github/workflows/autorelease-vllm-omni.yml index e187d7cb35a4..2db102fd2a20 100644 --- a/.github/workflows/autorelease-vllm-omni.yml +++ b/.github/workflows/autorelease-vllm-omni.yml @@ -132,10 +132,16 @@ jobs: - name: Fetch cached vLLM wheel id: wheel-cache run: | + # The workflow's framework-version is the omni package version + # (e.g. 0.21.0rc1), which is NOT the version stamped on the vllm + # wheel filename. Source versions.env to read VLLM_VERSION (the + # vllm core version, e.g. 0.21.0) so the cache key + filename glob + # match wheels uploaded by any workflow on the same vllm core. 
+ set -a; source docker/vllm_omni/versions.env; set +a OUTPUT=$(bash scripts/vllm/amzn2023/fetch_cached_wheels.sh \ ${{ needs.load-config-ec2.outputs.cuda-version }} \ - ${{ needs.load-config-ec2.outputs.vllm-ref }} \ - ${{ needs.load-config-ec2.outputs.framework-version }}) + "${VLLM_REF}" \ + "${VLLM_VERSION}") echo "$OUTPUT" HIT=$(echo "$OUTPUT" | grep -o 'cache-hit=.*' | cut -d= -f2) echo "hit=${HIT}" >> $GITHUB_OUTPUT @@ -187,10 +193,12 @@ jobs: - name: Upload vLLM wheel to cache if: success() && steps.wheel-cache.outputs.hit != 'true' run: | + # Use vllm core version, not omni package version (see fetch step). + set -a; source docker/vllm_omni/versions.env; set +a bash scripts/vllm/amzn2023/upload_cached_wheels.sh \ ${{ needs.load-config-ec2.outputs.cuda-version }} \ - ${{ needs.load-config-ec2.outputs.vllm-ref }} \ - ${{ needs.load-config-ec2.outputs.framework-version }} + "${VLLM_REF}" \ + "${VLLM_VERSION}" - name: Sync sccache cache to S3 if: success() && steps.wheel-cache.outputs.hit != 'true' @@ -217,10 +225,16 @@ jobs: - name: Fetch cached vLLM wheel id: wheel-cache run: | + # The workflow's framework-version is the omni package version + # (e.g. 0.21.0rc1), which is NOT the version stamped on the vllm + # wheel filename. Source versions.env to read VLLM_VERSION (the + # vllm core version, e.g. 0.21.0) so the cache key + filename glob + # match wheels uploaded by any workflow on the same vllm core. 
+ set -a; source docker/vllm_omni/versions.env; set +a OUTPUT=$(bash scripts/vllm/amzn2023/fetch_cached_wheels.sh \ ${{ needs.load-config-sagemaker.outputs.cuda-version }} \ - ${{ needs.load-config-sagemaker.outputs.vllm-ref }} \ - ${{ needs.load-config-sagemaker.outputs.framework-version }}) + "${VLLM_REF}" \ + "${VLLM_VERSION}") echo "$OUTPUT" HIT=$(echo "$OUTPUT" | grep -o 'cache-hit=.*' | cut -d= -f2) echo "hit=${HIT}" >> $GITHUB_OUTPUT @@ -272,10 +286,12 @@ jobs: - name: Upload vLLM wheel to cache if: success() && steps.wheel-cache.outputs.hit != 'true' run: | + # Use vllm core version, not omni package version (see fetch step). + set -a; source docker/vllm_omni/versions.env; set +a bash scripts/vllm/amzn2023/upload_cached_wheels.sh \ ${{ needs.load-config-sagemaker.outputs.cuda-version }} \ - ${{ needs.load-config-sagemaker.outputs.vllm-ref }} \ - ${{ needs.load-config-sagemaker.outputs.framework-version }} + "${VLLM_REF}" \ + "${VLLM_VERSION}" - name: Sync sccache cache to S3 if: success() && steps.wheel-cache.outputs.hit != 'true' diff --git a/.github/workflows/pr-vllm-omni-ec2-amzn2023.yml b/.github/workflows/pr-vllm-omni-ec2-amzn2023.yml index a6e1990ccb68..d99ac1c103db 100644 --- a/.github/workflows/pr-vllm-omni-ec2-amzn2023.yml +++ b/.github/workflows/pr-vllm-omni-ec2-amzn2023.yml @@ -152,10 +152,16 @@ jobs: - name: Fetch cached vLLM wheel id: wheel-cache run: | + # The workflow's framework-version is the omni package version + # (e.g. 0.21.0rc1), which is NOT the version stamped on the vllm + # wheel filename. Source versions.env to read VLLM_VERSION (the + # vllm core version, e.g. 0.21.0) so the cache key + filename glob + # match wheels uploaded by any workflow on the same vllm core. 
+ set -a; source docker/vllm_omni/versions.env; set +a OUTPUT=$(bash scripts/vllm/amzn2023/fetch_cached_wheels.sh \ ${{ needs.load-config.outputs.cuda-version }} \ - ${{ needs.load-config.outputs.vllm-ref }} \ - ${{ needs.load-config.outputs.framework-version }}) + "${VLLM_REF}" \ + "${VLLM_VERSION}") echo "$OUTPUT" HIT=$(echo "$OUTPUT" | grep -o 'cache-hit=.*' | cut -d= -f2) echo "hit=${HIT}" >> $GITHUB_OUTPUT @@ -207,10 +213,12 @@ jobs: - name: Upload vLLM wheel to cache if: success() && steps.wheel-cache.outputs.hit != 'true' run: | + # Use vllm core version, not omni package version (see fetch step). + set -a; source docker/vllm_omni/versions.env; set +a bash scripts/vllm/amzn2023/upload_cached_wheels.sh \ ${{ needs.load-config.outputs.cuda-version }} \ - ${{ needs.load-config.outputs.vllm-ref }} \ - ${{ needs.load-config.outputs.framework-version }} + "${VLLM_REF}" \ + "${VLLM_VERSION}" - name: Sync sccache cache to S3 if: success() && steps.wheel-cache.outputs.hit != 'true' diff --git a/docker/vllm_omni/Dockerfile.amzn2023 b/docker/vllm_omni/Dockerfile.amzn2023 index 3d022078df2a..50e74b582640 100644 --- a/docker/vllm_omni/Dockerfile.amzn2023 +++ b/docker/vllm_omni/Dockerfile.amzn2023 @@ -1,6 +1,6 @@ ARG CUDA_VERSION=13.0.2 ARG PYTHON_VERSION=3.12 -ARG VLLM_VERSION=0.20.0 +ARG VLLM_VERSION=0.21.0 ARG FLASHINFER_VERSION=0.6.8.post1 ARG DEEPEP_COMMIT_HASH=73b6ea4 @@ -201,14 +201,14 @@ RUN --mount=type=cache,target=/root/.cache/uv ls /tmp/vllm-dist/*.whl \ # Install FlashInfer JIT cache (requires CUDA-version-specific index URL). # flashinfer-python and flashinfer-cubin are already pulled in via requirements/cuda.txt. -# Pre-download cubins so the first inference request doesn't pay JIT compile latency. +# Cubins are downloaded later, AFTER all wheel installs that may overwrite +# flashinfer files (vllm wheel, EP kernels, KV connectors). See upstream vllm +# v0.21.0 Dockerfile — downloading earlier wastes ~2.5 GB on layer duplication. 
ARG FLASHINFER_VERSION RUN --mount=type=cache,target=/root/.cache/uv uv pip install flashinfer-jit-cache==${FLASHINFER_VERSION} \ - --extra-index-url https://flashinfer.ai/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') \ - && flashinfer show-config \ - && flashinfer download-cubin + --extra-index-url https://flashinfer.ai/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') -# Install serving extras (matches upstream vllm v0.20.0 serving extras set) +# Install serving extras (matches upstream vllm v0.21.0 serving extras set) RUN --mount=type=cache,target=/root/.cache/uv uv pip install accelerate modelscope \ "bitsandbytes>=0.46.1" "timm>=1.0.17" "runai-model-streamer[s3,gcs]>=0.15.7" @@ -228,9 +228,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \ CUDA_MAJOR="${CUDA_VERSION%%.*}"; \ CUDA_DASH=$(echo $CUDA_VERSION | cut -d. -f1,2 | tr '.' '-'); \ if [ "$INSTALL_KV_CONNECTORS" = "true" ]; then \ - if [ "$CUDA_MAJOR" -ge 13 ]; then \ - uv pip install nixl-cu13; \ - fi; \ uv pip install -r /tmp/kv_connectors.txt --no-build || ( \ dnf install -y --setopt=install_weak_deps=False \ libcusparse-devel-${CUDA_DASH} \ @@ -240,8 +237,16 @@ RUN --mount=type=cache,target=/root/.cache/uv \ && dnf remove -y libcusparse-devel-${CUDA_DASH} libcublas-devel-${CUDA_DASH} libcusolver-devel-${CUDA_DASH} \ && dnf clean all && rm -rf /var/cache/dnf \ ); \ + # Force-reinstall the matching CUDA wheel so the correct nixl_ep_cpp.so + # is installed (upstream vllm v0.21.0 fix). + uv pip install --force-reinstall --no-deps nixl-cu${CUDA_MAJOR}; \ fi +# Pre-download FlashInfer cubins AFTER all wheel installs (vllm wheel, EP +# kernels, KV connectors) finish — earlier installs may overwrite flashinfer +# package files. Downloading here avoids ~2.5 GB layer duplication. 
+RUN flashinfer show-config && flashinfer download-cubin + # ============================================================================= # STAGE 3: runtime — minimal image with clean venv # ============================================================================= @@ -267,7 +272,9 @@ ENV PATH="/root/.local/bin:${PATH}" # See: https://docs.nvidia.com/deploy/cuda-compatibility/ ENV VLLM_ENABLE_CUDA_COMPATIBILITY=0 -# Runtime JIT compilation tools (Triton/Inductor, FlashInfer, DeepGEMM) +# Runtime JIT compilation tools (Triton/Inductor, FlashInfer, DeepGEMM). +# Upstream vllm v0.21.0 switched libcublas → libcublas-devel so cublas headers +# are present at runtime for JIT (e.g. fastsafetensors / nccl_allocator). RUN CUDA_DASH=$(echo $CUDA_VERSION | cut -d. -f1,2 | tr '.' '-') \ && dnf install -y --setopt=install_weak_deps=False \ gcc python${PYTHON_VERSION}-devel \ @@ -276,7 +283,7 @@ RUN CUDA_DASH=$(echo $CUDA_VERSION | cut -d. -f1,2 | tr '.' '-') \ cuda-nvrtc-${CUDA_DASH} \ cuda-cuobjdump-${CUDA_DASH} \ libcurand-devel-${CUDA_DASH} \ - libcublas-${CUDA_DASH} \ + libcublas-devel-${CUDA_DASH} \ && dnf clean all && rm -rf /var/cache/dnf COPY --from=deps /opt/venv /opt/venv @@ -294,7 +301,7 @@ ENV HF_XET_HIGH_PERFORMANCE=1 # ============================================================================= FROM runtime AS omni-deps -ARG VLLM_OMNI_VERSION=0.20.0 +ARG VLLM_OMNI_VERSION=0.21.0rc1 # System deps for omni-modality (TTS, audio, image/video) # Enable SPAL (Supplementary Packages for Amazon Linux) for espeak-ng, ffmpeg. @@ -304,8 +311,24 @@ RUN dnf upgrade -y --releasever=latest --setopt=install_weak_deps=False system-r && dnf install -y --setopt=install_weak_deps=False espeak-ng ffmpeg-free \ && dnf clean all && rm -rf /var/cache/dnf -# Install vllm-omni (pure Python, no compilation) -RUN --mount=type=cache,target=/root/.cache/uv uv pip install vllm-omni==${VLLM_OMNI_VERSION} +# Install vllm-omni (pure Python, no compilation). 
+# --prerelease=allow needed because 0.21.0rc1 is a PEP 440 pre-release; +# strip when bumping to a stable 0.21.0. +RUN --mount=type=cache,target=/root/.cache/uv uv pip install --prerelease=allow vllm-omni==${VLLM_OMNI_VERSION} + +# Pin transformers <5.9.0. vllm-omni 0.21.0rc1's qwen3_tts module calls +# create_causal_mask(input_embeds=...). The kwarg was renamed to +# `inputs_embeds` in transformers 5.5.1 with a deprecated `input_embeds` +# alias kept in place — versions 5.5.1..5.8.1 still accept the call with a +# deprecation warning. transformers 5.9.0 (released 2026-05-20, see +# https://github.com/huggingface/transformers/releases/tag/v5.9.0) dropped +# the @deprecate_kwarg("input_embeds", ...) decorator from +# src/transformers/masking_utils.py, breaking qwen3-tts smoke tests with: +# TypeError: create_causal_mask() got an unexpected keyword argument 'input_embeds' +# vllm core 0.21.0's pin (>=4.56.0, !=5.0..5.4, !=5.5.0) is too loose — pip +# resolves to 5.9.x. Cap ourselves at <5.9.0 until vllm-omni updates the +# call site to use `inputs_embeds`. 
+RUN --mount=type=cache,target=/root/.cache/uv uv pip install --force-reinstall --no-deps "transformers>=4.56.0,<5.9.0" # ============================================================================= # STAGE: builder-oss-omni — OSS compliance for omni venv @@ -327,7 +350,7 @@ ARG PYTHON="python3" ARG PYTHON_VERSION=3.12 ARG CUDA_VERSION ARG DLC_MAJOR_VERSION=1 -ARG DLC_MINOR_VERSION=0 +ARG DLC_MINOR_VERSION=3 LABEL maintainer="Amazon AI" LABEL dlc_major_version="${DLC_MAJOR_VERSION}" diff --git a/docker/vllm_omni/versions.env b/docker/vllm_omni/versions.env index e8d9e1582cd2..8024e13bfc24 100755 --- a/docker/vllm_omni/versions.env +++ b/docker/vllm_omni/versions.env @@ -11,9 +11,10 @@ # ── vLLM source & version ────────────────────────────────────── export VLLM_REPO="https://github.com/vllm-project/vllm.git" -export VLLM_VERSION="0.20.0" +export VLLM_VERSION="0.21.0" export VLLM_REF="v${VLLM_VERSION}" -export VLLM_OMNI_VERSION="0.20.0" +# vllm-omni 0.21.0rc1 is a pre-release; Dockerfile.amzn2023 installs it with `uv pip install --prerelease=allow`. +export VLLM_OMNI_VERSION="0.21.0rc1" # Wheel version tag — PEP 440 local-version encoding the pinned ref for # traceability. Commit SHAs are truncated to 8 chars; tags/branches are @@ -36,9 +37,9 @@ export EFA_VERSION="1.47.0" # ── DLC image versioning ─────────────────────────────────────── export DLC_MAJOR_VERSION="1" -export DLC_MINOR_VERSION="2" +export DLC_MINOR_VERSION="3" # ── Build configuration ──────────────────────────────────────── -# Aligned with upstream vllm v0.20.0 Dockerfile. +# Aligned with upstream vllm v0.21.0 Dockerfile. export torch_cuda_arch_list="7.5 8.0 8.6 8.9 9.0 10.0 11.0 12.0+PTX" export INSTALL_KV_CONNECTORS="true"