From 9a9b97619492075b48623b728f9c11e0a3f57223 Mon Sep 17 00:00:00 2001 From: Yadan Wei Date: Mon, 18 May 2026 16:26:00 -0700 Subject: [PATCH 1/5] feat(vllm-omni): prepare 0.21.0rc1 release branch Bumps the vLLM-Omni AL2023 image to vllm-omni 0.21.0rc1 (pre-release), which rebases onto upstream vLLM v0.21.0. Cherry-picks three Dockerfile changes from the upstream vLLM v0.20.0 -> v0.21.0 diff that are relevant to our fork: - libcublas-${CUDA_DASH} -> libcublas-devel-${CUDA_DASH} in the runtime stage so cublas headers are present for JIT (fastsafetensors, nccl_allocator). - FlashInfer download-cubin moved to a final RUN after vllm wheel, EP kernels, and KV connectors install. Earlier downloads cause ~2.5 GB layer duplication when later pip installs overwrite flashinfer files. - nixl-cu${CUDA_MAJOR} --force-reinstall --no-deps after the kv_connectors install, replacing the bare nixl-cu13 install, so the matching nixl_ep_cpp.so is shipped. Skipped upstream changes that don't apply to our AL2023 fork: BUILD_OS=manylinux apt/dnf branching (we are dnf-only), nvidia-cutlass-dsl[cu13] strip-shim (we pin CUDA 13), DeepGEMM multi-Python interpreter matrix (single-Python build), and the sagemaker-entrypoint.sh path move (we ship our own entrypoints). Also adds --prerelease=allow on the omni install since 0.21.0rc1 is a PEP 440 pre-release; uv would otherwise refuse to resolve it. Strip when bumping to a stable 0.21.0. DLC_MINOR_VERSION 2 -> 3, tagging this image v1.3. This is a preparation PR for the official release. No public docs or release notes are updated; those land in the follow-up PR once 0.21.0 ships final. No test-suite additions: per the new vllm-omni-release skill audit (Step 4b/4c), neither SenseNova-U1 nor Tencent Covo-Audio-Chat clears the gating rules right now (existing image-gen route already covered; g6e12xl-runner is ICE in us-west-2). Endpoint test routes / content-types are unchanged in 0.21.0rc1, so no new endpoint cases. 
Signed-off-by: Yadan Wei --- .../config/image/vllm-omni-ec2-amzn2023.yml | 4 +- .../image/vllm-omni-sagemaker-amzn2023.yml | 4 +- docker/vllm_omni/Dockerfile.amzn2023 | 39 ++++++++++++------- docker/vllm_omni/versions.env | 9 +++-- 4 files changed, 33 insertions(+), 23 deletions(-) diff --git a/.github/config/image/vllm-omni-ec2-amzn2023.yml b/.github/config/image/vllm-omni-ec2-amzn2023.yml index aadeb2bb75fc..fbdbd835c8d2 100644 --- a/.github/config/image/vllm-omni-ec2-amzn2023.yml +++ b/.github/config/image/vllm-omni-ec2-amzn2023.yml @@ -6,8 +6,8 @@ image: common: framework: "vllm_omni" - framework_version: "0.20.0" - vllm_ref: "v0.20.0" + framework_version: "0.21.0rc1" + vllm_ref: "v0.21.0" job_type: "general" python_version: "py312" cuda_version: "cu130" diff --git a/.github/config/image/vllm-omni-sagemaker-amzn2023.yml b/.github/config/image/vllm-omni-sagemaker-amzn2023.yml index 9d7fe12575e3..c46dae617879 100644 --- a/.github/config/image/vllm-omni-sagemaker-amzn2023.yml +++ b/.github/config/image/vllm-omni-sagemaker-amzn2023.yml @@ -6,8 +6,8 @@ image: common: framework: "vllm_omni" - framework_version: "0.20.0" - vllm_ref: "v0.20.0" + framework_version: "0.21.0rc1" + vllm_ref: "v0.21.0" job_type: "general" python_version: "py312" cuda_version: "cu130" diff --git a/docker/vllm_omni/Dockerfile.amzn2023 b/docker/vllm_omni/Dockerfile.amzn2023 index 3d022078df2a..ce5999eb456b 100644 --- a/docker/vllm_omni/Dockerfile.amzn2023 +++ b/docker/vllm_omni/Dockerfile.amzn2023 @@ -1,6 +1,6 @@ ARG CUDA_VERSION=13.0.2 ARG PYTHON_VERSION=3.12 -ARG VLLM_VERSION=0.20.0 +ARG VLLM_VERSION=0.21.0 ARG FLASHINFER_VERSION=0.6.8.post1 ARG DEEPEP_COMMIT_HASH=73b6ea4 @@ -201,14 +201,14 @@ RUN --mount=type=cache,target=/root/.cache/uv ls /tmp/vllm-dist/*.whl \ # Install FlashInfer JIT cache (requires CUDA-version-specific index URL). # flashinfer-python and flashinfer-cubin are already pulled in via requirements/cuda.txt. 
-# Pre-download cubins so the first inference request doesn't pay JIT compile latency. +# Cubins are downloaded later, AFTER all wheel installs that may overwrite +# flashinfer files (vllm wheel, EP kernels, KV connectors). See upstream vllm +# v0.21.0 Dockerfile — downloading earlier wastes ~2.5 GB on layer duplication. ARG FLASHINFER_VERSION RUN --mount=type=cache,target=/root/.cache/uv uv pip install flashinfer-jit-cache==${FLASHINFER_VERSION} \ - --extra-index-url https://flashinfer.ai/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') \ - && flashinfer show-config \ - && flashinfer download-cubin + --extra-index-url https://flashinfer.ai/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') -# Install serving extras (matches upstream vllm v0.20.0 serving extras set) +# Install serving extras (matches upstream vllm v0.21.0 serving extras set) RUN --mount=type=cache,target=/root/.cache/uv uv pip install accelerate modelscope \ "bitsandbytes>=0.46.1" "timm>=1.0.17" "runai-model-streamer[s3,gcs]>=0.15.7" @@ -228,9 +228,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \ CUDA_MAJOR="${CUDA_VERSION%%.*}"; \ CUDA_DASH=$(echo $CUDA_VERSION | cut -d. -f1,2 | tr '.' '-'); \ if [ "$INSTALL_KV_CONNECTORS" = "true" ]; then \ - if [ "$CUDA_MAJOR" -ge 13 ]; then \ - uv pip install nixl-cu13; \ - fi; \ uv pip install -r /tmp/kv_connectors.txt --no-build || ( \ dnf install -y --setopt=install_weak_deps=False \ libcusparse-devel-${CUDA_DASH} \ @@ -240,8 +237,16 @@ RUN --mount=type=cache,target=/root/.cache/uv \ && dnf remove -y libcusparse-devel-${CUDA_DASH} libcublas-devel-${CUDA_DASH} libcusolver-devel-${CUDA_DASH} \ && dnf clean all && rm -rf /var/cache/dnf \ ); \ + # Force-reinstall the matching CUDA wheel so the correct nixl_ep_cpp.so + # is installed (upstream vllm v0.21.0 fix). 
+ uv pip install --force-reinstall --no-deps nixl-cu${CUDA_MAJOR}; \ fi +# Pre-download FlashInfer cubins AFTER all wheel installs (vllm wheel, EP +# kernels, KV connectors) finish — earlier installs may overwrite flashinfer +# package files. Downloading here avoids ~2.5 GB layer duplication. +RUN flashinfer show-config && flashinfer download-cubin + # ============================================================================= # STAGE 3: runtime — minimal image with clean venv # ============================================================================= @@ -267,7 +272,9 @@ ENV PATH="/root/.local/bin:${PATH}" # See: https://docs.nvidia.com/deploy/cuda-compatibility/ ENV VLLM_ENABLE_CUDA_COMPATIBILITY=0 -# Runtime JIT compilation tools (Triton/Inductor, FlashInfer, DeepGEMM) +# Runtime JIT compilation tools (Triton/Inductor, FlashInfer, DeepGEMM). +# Upstream vllm v0.21.0 switched libcublas → libcublas-devel so cublas headers +# are present at runtime for JIT (e.g. fastsafetensors / nccl_allocator). RUN CUDA_DASH=$(echo $CUDA_VERSION | cut -d. -f1,2 | tr '.' '-') \ && dnf install -y --setopt=install_weak_deps=False \ gcc python${PYTHON_VERSION}-devel \ @@ -276,7 +283,7 @@ RUN CUDA_DASH=$(echo $CUDA_VERSION | cut -d. -f1,2 | tr '.' '-') \ cuda-nvrtc-${CUDA_DASH} \ cuda-cuobjdump-${CUDA_DASH} \ libcurand-devel-${CUDA_DASH} \ - libcublas-${CUDA_DASH} \ + libcublas-devel-${CUDA_DASH} \ && dnf clean all && rm -rf /var/cache/dnf COPY --from=deps /opt/venv /opt/venv @@ -294,7 +301,7 @@ ENV HF_XET_HIGH_PERFORMANCE=1 # ============================================================================= FROM runtime AS omni-deps -ARG VLLM_OMNI_VERSION=0.20.0 +ARG VLLM_OMNI_VERSION=0.21.0rc1 # System deps for omni-modality (TTS, audio, image/video) # Enable SPAL (Supplementary Packages for Amazon Linux) for espeak-ng, ffmpeg. 
@@ -304,8 +311,10 @@ RUN dnf upgrade -y --releasever=latest --setopt=install_weak_deps=False system-r && dnf install -y --setopt=install_weak_deps=False espeak-ng ffmpeg-free \ && dnf clean all && rm -rf /var/cache/dnf -# Install vllm-omni (pure Python, no compilation) -RUN --mount=type=cache,target=/root/.cache/uv uv pip install vllm-omni==${VLLM_OMNI_VERSION} +# Install vllm-omni (pure Python, no compilation). +# --prerelease=allow needed because 0.21.0rc1 is a PEP 440 pre-release; +# strip when bumping to a stable 0.21.0. +RUN --mount=type=cache,target=/root/.cache/uv uv pip install --prerelease=allow vllm-omni==${VLLM_OMNI_VERSION} # ============================================================================= # STAGE: builder-oss-omni — OSS compliance for omni venv @@ -327,7 +336,7 @@ ARG PYTHON="python3" ARG PYTHON_VERSION=3.12 ARG CUDA_VERSION ARG DLC_MAJOR_VERSION=1 -ARG DLC_MINOR_VERSION=0 +ARG DLC_MINOR_VERSION=3 LABEL maintainer="Amazon AI" LABEL dlc_major_version="${DLC_MAJOR_VERSION}" diff --git a/docker/vllm_omni/versions.env b/docker/vllm_omni/versions.env index e8d9e1582cd2..8024e13bfc24 100755 --- a/docker/vllm_omni/versions.env +++ b/docker/vllm_omni/versions.env @@ -11,9 +11,10 @@ # ── vLLM source & version ────────────────────────────────────── export VLLM_REPO="https://github.com/vllm-project/vllm.git" -export VLLM_VERSION="0.20.0" +export VLLM_VERSION="0.21.0" export VLLM_REF="v${VLLM_VERSION}" -export VLLM_OMNI_VERSION="0.20.0" +# vllm-omni 0.21.0rc1 is a pre-release; pip install must resolve with --pre. +export VLLM_OMNI_VERSION="0.21.0rc1" # Wheel version tag — PEP 440 local-version encoding the pinned ref for # traceability. 
Commit SHAs are truncated to 8 chars; tags/branches are @@ -36,9 +37,9 @@ export EFA_VERSION="1.47.0" # ── DLC image versioning ─────────────────────────────────────── export DLC_MAJOR_VERSION="1" -export DLC_MINOR_VERSION="2" +export DLC_MINOR_VERSION="3" # ── Build configuration ──────────────────────────────────────── -# Aligned with upstream vllm v0.20.0 Dockerfile. +# Aligned with upstream vllm v0.21.0 Dockerfile. export torch_cuda_arch_list="7.5 8.0 8.6 8.9 9.0 10.0 11.0 12.0+PTX" export INSTALL_KV_CONNECTORS="true" From 014ce51915a7d39f70df701dbaf7f445a9fc946d Mon Sep 17 00:00:00 2001 From: Yadan Wei Date: Wed, 20 May 2026 14:21:54 -0700 Subject: [PATCH 2/5] test(vllm-omni): restore qwen3-tts-base thresholds to pre-regression baseline MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit vllm-omni 0.20.0 had a regression from vllm-omni#3203 that un-batched Code2Wav decode chunks. Thresholds were loosened to (0.27 / 1.0 / 17000). vllm-omni#3485 fix is now picked up in 0.21.0rc1. Observed on this branch: rps=1.302, audio rtf mult=5.033, p95 e2e=3499ms — comfortably clear of the original (0.4 / 1.6 / 11000) thresholds: rps and audio RTF multiple exceed their minimums, and p95 e2e is well under its maximum. Restore those values as the comment explicitly directed once the fix landed. 
Signed-off-by: Yadan Wei --- .github/config/model-tests/vllm-omni-model-tests.yml | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/.github/config/model-tests/vllm-omni-model-tests.yml b/.github/config/model-tests/vllm-omni-model-tests.yml index 89d3287b8364..10954eb366c9 100644 --- a/.github/config/model-tests/vllm-omni-model-tests.yml +++ b/.github/config/model-tests/vllm-omni-model-tests.yml @@ -185,18 +185,16 @@ benchmark: # See: https://github.com/vllm-project/vllm-omni/issues/3124 # Runs on L4 (x86-g6xl-runner); # - # Thresholds temporarily loosened for vllm-omni 0.20.0: upstream regression - # introduced by vllm-omni#3203 (commit 01f500a5) un-batches Code2Wav decode - # chunks; observed RPS 0.281 vs prior 0.4, audio RTF mult 1.109 vs prior 1.6, - # p95 e2e 15919ms vs prior 11000ms. Fix is merged upstream as - # vllm-omni#3485 (post-0.20.0) and will land in the next omni point release. - # Re-tighten to (0.4 / 1.6 / 11000) once that release is picked up. + # Thresholds restored to pre-regression baseline (0.4 / 1.6 / 11000) on + # vllm-omni 0.21.0rc1: vllm-omni#3485 fix for the #3203 Code2Wav un-batching + # regression is now picked up. Observed on rc1: rps=1.302, audio rtf + # mult=5.033, p95 e2e=3499ms — all well within the restored thresholds. 
- name: "qwen3-tts-12hz-1.7b-base" s3_model: "qwen3-tts-12hz-1.7b-base.tar.gz" fleet: "x86-g6xl-runner" extra_args: "" benchmark_type: "tts-base" - benchmark_config: '{"concurrency": 4, "num_prompts": 20, "ref_audio_s3": "s3://dlc-cicd-models/test-fixtures/audio/tts_ref_vivian.wav", "ref_text": "The quick brown fox jumps over the lazy dog near the riverbank at sunset.", "language": "English", "min_rps": 0.27, "min_audio_rtf_mult": 1.0, "max_p95_e2e_ms": 17000}' + benchmark_config: '{"concurrency": 4, "num_prompts": 20, "ref_audio_s3": "s3://dlc-cicd-models/test-fixtures/audio/tts_ref_vivian.wav", "ref_text": "The quick brown fox jumps over the lazy dog near the riverbank at sunset.", "language": "English", "min_rps": 0.4, "min_audio_rtf_mult": 1.6, "max_p95_e2e_ms": 11000}' # CosyVoice3 zero-shot voice-clone — same /v1/audio/speech route as Qwen3-TTS, # uses the tts-base benchmark client with ref_audio_s3. Fleet matches the From f55e3daf4bc944ea70acda7f4f3fcde29a066a55 Mon Sep 17 00:00:00 2001 From: Yadan Wei Date: Wed, 20 May 2026 16:33:49 -0700 Subject: [PATCH 3/5] fix(vllm-omni): use vllm core version (not omni package version) for wheel cache key MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The workflow was passing framework_version (= 0.21.0rc1, the omni package version) into fetch_cached_wheels.sh as the vLLM version. That makes the cache key sha256(...,version:0.21.0rc1,...) and the filename glob 'vllm-0.21.0rc1*.whl' — neither matches wheels uploaded for vllm core 0.21.0. Result: every omni build is a forced cache miss, even when a matching vllm core wheel exists in S3. Source docker/vllm_omni/versions.env first and pass VLLM_VERSION (= 0.21.0) to fetch + upload. Now omni shares the cache with any other workflow building the same vllm core ref/version. 
Signed-off-by: Yadan Wei --- .github/workflows/pr-vllm-omni-ec2-amzn2023.yml | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/.github/workflows/pr-vllm-omni-ec2-amzn2023.yml b/.github/workflows/pr-vllm-omni-ec2-amzn2023.yml index a6e1990ccb68..d99ac1c103db 100644 --- a/.github/workflows/pr-vllm-omni-ec2-amzn2023.yml +++ b/.github/workflows/pr-vllm-omni-ec2-amzn2023.yml @@ -152,10 +152,16 @@ jobs: - name: Fetch cached vLLM wheel id: wheel-cache run: | + # The workflow's framework-version is the omni package version + # (e.g. 0.21.0rc1), which is NOT the version stamped on the vllm + # wheel filename. Source versions.env to read VLLM_VERSION (the + # vllm core version, e.g. 0.21.0) so the cache key + filename glob + # match wheels uploaded by any workflow on the same vllm core. + set -a; source docker/vllm_omni/versions.env; set +a OUTPUT=$(bash scripts/vllm/amzn2023/fetch_cached_wheels.sh \ ${{ needs.load-config.outputs.cuda-version }} \ - ${{ needs.load-config.outputs.vllm-ref }} \ - ${{ needs.load-config.outputs.framework-version }}) + "${VLLM_REF}" \ + "${VLLM_VERSION}") echo "$OUTPUT" HIT=$(echo "$OUTPUT" | grep -o 'cache-hit=.*' | cut -d= -f2) echo "hit=${HIT}" >> $GITHUB_OUTPUT @@ -207,10 +213,12 @@ jobs: - name: Upload vLLM wheel to cache if: success() && steps.wheel-cache.outputs.hit != 'true' run: | + # Use vllm core version, not omni package version (see fetch step). 
+ set -a; source docker/vllm_omni/versions.env; set +a bash scripts/vllm/amzn2023/upload_cached_wheels.sh \ ${{ needs.load-config.outputs.cuda-version }} \ - ${{ needs.load-config.outputs.vllm-ref }} \ - ${{ needs.load-config.outputs.framework-version }} + "${VLLM_REF}" \ + "${VLLM_VERSION}" - name: Sync sccache cache to S3 if: success() && steps.wheel-cache.outputs.hit != 'true' From 1c344a0556c2cb2cfd77f03f5b2686c404152bbd Mon Sep 17 00:00:00 2001 From: Yadan Wei Date: Wed, 20 May 2026 16:34:02 -0700 Subject: [PATCH 4/5] fix(vllm-omni): pin transformers <5.9.0 for qwen3-tts compatibility vllm-omni 0.21.0rc1's qwen3_tts module calls create_causal_mask(..., input_embeds=...) at modeling_qwen3_tts_tokenizer_v2.py:576. transformers renamed the kwarg to `inputs_embeds` in 5.5.1 (kept input_embeds as a deprecated alias via @deprecate_kwarg) and removed the decorator outright in 5.9.0 (released 2026-05-20). Reference: https://github.com/huggingface/transformers/releases/tag/v5.9.0 vllm core 0.21.0's pin (>=4.56.0, !=5.0..5.4, !=5.5.0) doesn't upper-bound past 5.5, so pip resolves to 5.9.x and breaks qwen3-tts smoke tests with: TypeError: create_causal_mask() got an unexpected keyword argument 'input_embeds' Cap at <5.9.0 (last working release line is 5.8.x). Drop when vllm-omni updates the call site to use `inputs_embeds`. Signed-off-by: Yadan Wei --- docker/vllm_omni/Dockerfile.amzn2023 | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/docker/vllm_omni/Dockerfile.amzn2023 b/docker/vllm_omni/Dockerfile.amzn2023 index ce5999eb456b..50e74b582640 100644 --- a/docker/vllm_omni/Dockerfile.amzn2023 +++ b/docker/vllm_omni/Dockerfile.amzn2023 @@ -316,6 +316,20 @@ RUN dnf upgrade -y --releasever=latest --setopt=install_weak_deps=False system-r # strip when bumping to a stable 0.21.0. RUN --mount=type=cache,target=/root/.cache/uv uv pip install --prerelease=allow vllm-omni==${VLLM_OMNI_VERSION} +# Pin transformers <5.9.0. 
vllm-omni 0.21.0rc1's qwen3_tts module calls +# create_causal_mask(input_embeds=...). The kwarg was renamed to +# `inputs_embeds` in transformers 5.5.1 with a deprecated `input_embeds` +# alias kept in place — versions 5.5.1..5.8.1 still accept the call with a +# deprecation warning. transformers 5.9.0 (released 2026-05-20, see +# https://github.com/huggingface/transformers/releases/tag/v5.9.0) dropped +# the @deprecate_kwarg("input_embeds", ...) decorator from +# src/transformers/masking_utils.py, breaking qwen3-tts smoke tests with: +# TypeError: create_causal_mask() got an unexpected keyword argument 'input_embeds' +# vllm core 0.21.0's pin (>=4.56.0, !=5.0..5.4, !=5.5.0) is too loose — pip +# resolves to 5.9.x. Cap ourselves at <5.9.0 until vllm-omni updates the +# call site to use `inputs_embeds`. +RUN --mount=type=cache,target=/root/.cache/uv uv pip install --force-reinstall --no-deps "transformers>=4.56.0,<5.9.0" + # ============================================================================= # STAGE: builder-oss-omni — OSS compliance for omni venv # ============================================================================= From 193ee3350c3f4ff404fa55c93b1e53f61cdb9914 Mon Sep 17 00:00:00 2001 From: Yadan Wei Date: Thu, 21 May 2026 10:21:27 -0700 Subject: [PATCH 5/5] fix(vllm-omni): use vllm core version for wheel cache key in autorelease Mirror the same fix applied to pr-vllm-omni-ec2-amzn2023.yml (f55e3daf) for both build-ec2 and build-sagemaker jobs in the scheduled autorelease workflow. framework-version (= 0.21.0rc1, the omni package version) is not the version stamped on the vllm wheel filename, so passing it to fetch/upload_cached_wheels.sh forces a cache miss every run. Source docker/vllm_omni/versions.env and pass VLLM_VERSION (= 0.21.0) + VLLM_REF (= v0.21.0) instead so the autorelease shares the wheel cache with PR builds on the same vllm core ref. 
Signed-off-by: Yadan Wei --- .github/workflows/autorelease-vllm-omni.yml | 32 +++++++++++++++------ 1 file changed, 24 insertions(+), 8 deletions(-) diff --git a/.github/workflows/autorelease-vllm-omni.yml b/.github/workflows/autorelease-vllm-omni.yml index e187d7cb35a4..2db102fd2a20 100644 --- a/.github/workflows/autorelease-vllm-omni.yml +++ b/.github/workflows/autorelease-vllm-omni.yml @@ -132,10 +132,16 @@ jobs: - name: Fetch cached vLLM wheel id: wheel-cache run: | + # The workflow's framework-version is the omni package version + # (e.g. 0.21.0rc1), which is NOT the version stamped on the vllm + # wheel filename. Source versions.env to read VLLM_VERSION (the + # vllm core version, e.g. 0.21.0) so the cache key + filename glob + # match wheels uploaded by any workflow on the same vllm core. + set -a; source docker/vllm_omni/versions.env; set +a OUTPUT=$(bash scripts/vllm/amzn2023/fetch_cached_wheels.sh \ ${{ needs.load-config-ec2.outputs.cuda-version }} \ - ${{ needs.load-config-ec2.outputs.vllm-ref }} \ - ${{ needs.load-config-ec2.outputs.framework-version }}) + "${VLLM_REF}" \ + "${VLLM_VERSION}") echo "$OUTPUT" HIT=$(echo "$OUTPUT" | grep -o 'cache-hit=.*' | cut -d= -f2) echo "hit=${HIT}" >> $GITHUB_OUTPUT @@ -187,10 +193,12 @@ jobs: - name: Upload vLLM wheel to cache if: success() && steps.wheel-cache.outputs.hit != 'true' run: | + # Use vllm core version, not omni package version (see fetch step). + set -a; source docker/vllm_omni/versions.env; set +a bash scripts/vllm/amzn2023/upload_cached_wheels.sh \ ${{ needs.load-config-ec2.outputs.cuda-version }} \ - ${{ needs.load-config-ec2.outputs.vllm-ref }} \ - ${{ needs.load-config-ec2.outputs.framework-version }} + "${VLLM_REF}" \ + "${VLLM_VERSION}" - name: Sync sccache cache to S3 if: success() && steps.wheel-cache.outputs.hit != 'true' @@ -217,10 +225,16 @@ jobs: - name: Fetch cached vLLM wheel id: wheel-cache run: | + # The workflow's framework-version is the omni package version + # (e.g. 
0.21.0rc1), which is NOT the version stamped on the vllm + # wheel filename. Source versions.env to read VLLM_VERSION (the + # vllm core version, e.g. 0.21.0) so the cache key + filename glob + # match wheels uploaded by any workflow on the same vllm core. + set -a; source docker/vllm_omni/versions.env; set +a OUTPUT=$(bash scripts/vllm/amzn2023/fetch_cached_wheels.sh \ ${{ needs.load-config-sagemaker.outputs.cuda-version }} \ - ${{ needs.load-config-sagemaker.outputs.vllm-ref }} \ - ${{ needs.load-config-sagemaker.outputs.framework-version }}) + "${VLLM_REF}" \ + "${VLLM_VERSION}") echo "$OUTPUT" HIT=$(echo "$OUTPUT" | grep -o 'cache-hit=.*' | cut -d= -f2) echo "hit=${HIT}" >> $GITHUB_OUTPUT @@ -272,10 +286,12 @@ jobs: - name: Upload vLLM wheel to cache if: success() && steps.wheel-cache.outputs.hit != 'true' run: | + # Use vllm core version, not omni package version (see fetch step). + set -a; source docker/vllm_omni/versions.env; set +a bash scripts/vllm/amzn2023/upload_cached_wheels.sh \ ${{ needs.load-config-sagemaker.outputs.cuda-version }} \ - ${{ needs.load-config-sagemaker.outputs.vllm-ref }} \ - ${{ needs.load-config-sagemaker.outputs.framework-version }} + "${VLLM_REF}" \ + "${VLLM_VERSION}" - name: Sync sccache cache to S3 if: success() && steps.wheel-cache.outputs.hit != 'true'