From 9a9b97619492075b48623b728f9c11e0a3f57223 Mon Sep 17 00:00:00 2001
From: Yadan Wei <yadanwei@amazon.com>
Date: Mon, 18 May 2026 16:26:00 -0700
Subject: [PATCH 1/5] feat(vllm-omni): prepare 0.21.0rc1 release branch

Bumps the vLLM-Omni AL2023 image to vllm-omni 0.21.0rc1 (pre-release),
which rebases onto upstream vLLM v0.21.0. Cherry-picks three Dockerfile
changes from the upstream vLLM v0.20.0 -> v0.21.0 diff that are relevant
to our fork:

- libcublas-${CUDA_DASH} -> libcublas-devel-${CUDA_DASH} in the runtime
  stage so cublas headers are present for JIT (fastsafetensors,
  nccl_allocator).
- FlashInfer download-cubin moved to a final RUN after vllm wheel,
  EP kernels, and KV connectors install. Earlier downloads cause
  ~2.5 GB layer duplication when later pip installs overwrite
  flashinfer files.
- nixl-cu${CUDA_MAJOR} --force-reinstall --no-deps after the kv_connectors
  install, replacing the bare nixl-cu13 install, so the matching
  nixl_ep_cpp.so is shipped.

Skipped upstream changes that don't apply to our AL2023 fork:
BUILD_OS=manylinux apt/dnf branching (we are dnf-only),
nvidia-cutlass-dsl[cu13] strip-shim (we pin CUDA 13), DeepGEMM
multi-Python interpreter matrix (single-Python build), and the
sagemaker-entrypoint.sh path move (we ship our own entrypoints).

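Also adds --prerelease=allow on the omni install since 0.21.0rc1 is a
PEP 440 pre-release. Note that this flag admits pre-releases for every
package in the resolution, not just vllm-omni (uv's default mode already
accepts a pre-release named by an explicit == pin). Strip when bumping
to a stable 0.21.0.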

DLC_MINOR_VERSION 2 -> 3, tagging this image v1.3.

This is a preparation PR for the official release. No public docs or
release notes are updated; those land in the follow-up PR once 0.21.0
ships final.

No test-suite additions: per the new vllm-omni-release skill audit
(Step 4b/4c), neither SenseNova-U1 nor Tencent Covo-Audio-Chat clears
the gating rules right now (existing image-gen route already covered;
g6e12xl-runner is ICE in us-west-2). Endpoint test routes /
content-types are unchanged in 0.21.0rc1, so no new endpoint cases.

Signed-off-by: Yadan Wei <yadanwei@amazon.com>
---
 .../config/image/vllm-omni-ec2-amzn2023.yml   |  4 +-
 .../image/vllm-omni-sagemaker-amzn2023.yml    |  4 +-
 docker/vllm_omni/Dockerfile.amzn2023          | 39 ++++++++++++-------
 docker/vllm_omni/versions.env                 |  9 +++--
 4 files changed, 33 insertions(+), 23 deletions(-)

diff --git a/.github/config/image/vllm-omni-ec2-amzn2023.yml b/.github/config/image/vllm-omni-ec2-amzn2023.yml
index aadeb2bb75fc..fbdbd835c8d2 100644
--- a/.github/config/image/vllm-omni-ec2-amzn2023.yml
+++ b/.github/config/image/vllm-omni-ec2-amzn2023.yml
@@ -6,8 +6,8 @@ image:
 
 common:
   framework: "vllm_omni"
-  framework_version: "0.20.0"
-  vllm_ref: "v0.20.0"
+  framework_version: "0.21.0rc1"
+  vllm_ref: "v0.21.0"
   job_type: "general"
   python_version: "py312"
   cuda_version: "cu130"
diff --git a/.github/config/image/vllm-omni-sagemaker-amzn2023.yml b/.github/config/image/vllm-omni-sagemaker-amzn2023.yml
index 9d7fe12575e3..c46dae617879 100644
--- a/.github/config/image/vllm-omni-sagemaker-amzn2023.yml
+++ b/.github/config/image/vllm-omni-sagemaker-amzn2023.yml
@@ -6,8 +6,8 @@ image:
 
 common:
   framework: "vllm_omni"
-  framework_version: "0.20.0"
-  vllm_ref: "v0.20.0"
+  framework_version: "0.21.0rc1"
+  vllm_ref: "v0.21.0"
   job_type: "general"
   python_version: "py312"
   cuda_version: "cu130"
diff --git a/docker/vllm_omni/Dockerfile.amzn2023 b/docker/vllm_omni/Dockerfile.amzn2023
index 3d022078df2a..ce5999eb456b 100644
--- a/docker/vllm_omni/Dockerfile.amzn2023
+++ b/docker/vllm_omni/Dockerfile.amzn2023
@@ -1,6 +1,6 @@
 ARG CUDA_VERSION=13.0.2
 ARG PYTHON_VERSION=3.12
-ARG VLLM_VERSION=0.20.0
+ARG VLLM_VERSION=0.21.0
 ARG FLASHINFER_VERSION=0.6.8.post1
 ARG DEEPEP_COMMIT_HASH=73b6ea4
 
@@ -201,14 +201,14 @@ RUN --mount=type=cache,target=/root/.cache/uv ls /tmp/vllm-dist/*.whl \
 
 # Install FlashInfer JIT cache (requires CUDA-version-specific index URL).
 # flashinfer-python and flashinfer-cubin are already pulled in via requirements/cuda.txt.
-# Pre-download cubins so the first inference request doesn't pay JIT compile latency.
+# Cubins are downloaded later, AFTER all wheel installs that may overwrite
+# flashinfer files (vllm wheel, EP kernels, KV connectors). See upstream vllm
+# v0.21.0 Dockerfile — downloading earlier wastes ~2.5 GB on layer duplication.
 ARG FLASHINFER_VERSION
 RUN --mount=type=cache,target=/root/.cache/uv uv pip install flashinfer-jit-cache==${FLASHINFER_VERSION} \
-  --extra-index-url https://flashinfer.ai/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') \
-  && flashinfer show-config \
-  && flashinfer download-cubin
+  --extra-index-url https://flashinfer.ai/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
 
-# Install serving extras (matches upstream vllm v0.20.0 serving extras set)
+# Install serving extras (matches upstream vllm v0.21.0 serving extras set)
 RUN --mount=type=cache,target=/root/.cache/uv uv pip install accelerate modelscope \
   "bitsandbytes>=0.46.1" "timm>=1.0.17" "runai-model-streamer[s3,gcs]>=0.15.7"
 
@@ -228,9 +228,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \
     CUDA_MAJOR="${CUDA_VERSION%%.*}"; \
     CUDA_DASH=$(echo $CUDA_VERSION | cut -d. -f1,2 | tr '.' '-'); \
     if [ "$INSTALL_KV_CONNECTORS" = "true" ]; then \
-        if [ "$CUDA_MAJOR" -ge 13 ]; then \
-            uv pip install nixl-cu13; \
-        fi; \
         uv pip install -r /tmp/kv_connectors.txt --no-build || ( \
             dnf install -y --setopt=install_weak_deps=False \
                 libcusparse-devel-${CUDA_DASH} \
@@ -240,8 +237,16 @@ RUN --mount=type=cache,target=/root/.cache/uv \
             && dnf remove -y libcusparse-devel-${CUDA_DASH} libcublas-devel-${CUDA_DASH} libcusolver-devel-${CUDA_DASH} \
             && dnf clean all && rm -rf /var/cache/dnf \
         ); \
+        # Force-reinstall the matching CUDA wheel so the correct nixl_ep_cpp.so
+        # is installed (upstream vllm v0.21.0 fix).
+        uv pip install --force-reinstall --no-deps nixl-cu${CUDA_MAJOR}; \
     fi
 
+# Pre-download FlashInfer cubins AFTER all wheel installs (vllm wheel, EP
+# kernels, KV connectors) finish — earlier installs may overwrite flashinfer
+# package files. Downloading here avoids ~2.5 GB layer duplication.
+RUN flashinfer show-config && flashinfer download-cubin
+
 # =============================================================================
 # STAGE 3: runtime — minimal image with clean venv
 # =============================================================================
@@ -267,7 +272,9 @@ ENV PATH="/root/.local/bin:${PATH}"
 # See: https://docs.nvidia.com/deploy/cuda-compatibility/
 ENV VLLM_ENABLE_CUDA_COMPATIBILITY=0
 
-# Runtime JIT compilation tools (Triton/Inductor, FlashInfer, DeepGEMM)
+# Runtime JIT compilation tools (Triton/Inductor, FlashInfer, DeepGEMM).
+# Upstream vllm v0.21.0 switched libcublas → libcublas-devel so cublas headers
+# are present at runtime for JIT (e.g. fastsafetensors / nccl_allocator).
 RUN CUDA_DASH=$(echo $CUDA_VERSION | cut -d. -f1,2 | tr '.' '-') \
   && dnf install -y --setopt=install_weak_deps=False \
     gcc python${PYTHON_VERSION}-devel \
@@ -276,7 +283,7 @@ RUN CUDA_DASH=$(echo $CUDA_VERSION | cut -d. -f1,2 | tr '.' '-') \
     cuda-nvrtc-${CUDA_DASH} \
     cuda-cuobjdump-${CUDA_DASH} \
     libcurand-devel-${CUDA_DASH} \
-    libcublas-${CUDA_DASH} \
+    libcublas-devel-${CUDA_DASH} \
   && dnf clean all && rm -rf /var/cache/dnf
 
 COPY --from=deps /opt/venv /opt/venv
@@ -294,7 +301,7 @@ ENV HF_XET_HIGH_PERFORMANCE=1
 # =============================================================================
 FROM runtime AS omni-deps
 
-ARG VLLM_OMNI_VERSION=0.20.0
+ARG VLLM_OMNI_VERSION=0.21.0rc1
 
 # System deps for omni-modality (TTS, audio, image/video)
 # Enable SPAL (Supplementary Packages for Amazon Linux) for espeak-ng, ffmpeg.
@@ -304,8 +311,10 @@ RUN dnf upgrade -y --releasever=latest --setopt=install_weak_deps=False system-r
   && dnf install -y --setopt=install_weak_deps=False espeak-ng ffmpeg-free \
   && dnf clean all && rm -rf /var/cache/dnf
 
-# Install vllm-omni (pure Python, no compilation)
-RUN --mount=type=cache,target=/root/.cache/uv uv pip install vllm-omni==${VLLM_OMNI_VERSION}
+# Install vllm-omni (pure Python, no compilation).
+# --prerelease=allow needed because 0.21.0rc1 is a PEP 440 pre-release;
+# strip when bumping to a stable 0.21.0.
+RUN --mount=type=cache,target=/root/.cache/uv uv pip install --prerelease=allow vllm-omni==${VLLM_OMNI_VERSION}
 
 # =============================================================================
 # STAGE: builder-oss-omni — OSS compliance for omni venv
@@ -327,7 +336,7 @@ ARG PYTHON="python3"
 ARG PYTHON_VERSION=3.12
 ARG CUDA_VERSION
 ARG DLC_MAJOR_VERSION=1
-ARG DLC_MINOR_VERSION=0
+ARG DLC_MINOR_VERSION=3
 
 LABEL maintainer="Amazon AI"
 LABEL dlc_major_version="${DLC_MAJOR_VERSION}"
diff --git a/docker/vllm_omni/versions.env b/docker/vllm_omni/versions.env
index e8d9e1582cd2..8024e13bfc24 100755
--- a/docker/vllm_omni/versions.env
+++ b/docker/vllm_omni/versions.env
@@ -11,9 +11,10 @@
 
 # ── vLLM source & version ──────────────────────────────────────
 export VLLM_REPO="https://github.com/vllm-project/vllm.git"
-export VLLM_VERSION="0.20.0"
+export VLLM_VERSION="0.21.0"
 export VLLM_REF="v${VLLM_VERSION}"
-export VLLM_OMNI_VERSION="0.20.0"
+# vllm-omni 0.21.0rc1 is a pre-release; Dockerfile installs it with --prerelease=allow.
+export VLLM_OMNI_VERSION="0.21.0rc1"
 
 # Wheel version tag — PEP 440 local-version encoding the pinned ref for
 # traceability. Commit SHAs are truncated to 8 chars; tags/branches are
@@ -36,9 +37,9 @@ export EFA_VERSION="1.47.0"
 
 # ── DLC image versioning ───────────────────────────────────────
 export DLC_MAJOR_VERSION="1"
-export DLC_MINOR_VERSION="2"
+export DLC_MINOR_VERSION="3"
 
 # ── Build configuration ────────────────────────────────────────
-# Aligned with upstream vllm v0.20.0 Dockerfile.
+# Aligned with upstream vllm v0.21.0 Dockerfile.
 export torch_cuda_arch_list="7.5 8.0 8.6 8.9 9.0 10.0 11.0 12.0+PTX"
 export INSTALL_KV_CONNECTORS="true"

From 014ce51915a7d39f70df701dbaf7f445a9fc946d Mon Sep 17 00:00:00 2001
From: Yadan Wei <yadanwei@amazon.com>
Date: Wed, 20 May 2026 14:21:54 -0700
Subject: [PATCH 2/5] test(vllm-omni): restore qwen3-tts-base thresholds to
 pre-regression baseline
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

vllm-omni 0.20.0 had a regression from vllm-omni#3203 that un-batched
Code2Wav decode chunks. Thresholds were loosened to (0.27 / 1.0 / 17000).

The vllm-omni#3485 fix is now picked up in 0.21.0rc1. Observed on this
branch: rps=1.302, audio rtf mult=5.033, p95 e2e=3499ms — comfortably
clearing the original (0.4 / 1.6 / 11000) baseline (rps and audio rtf
mult are minimums; p95 e2e is a maximum). Restore those values, as the
removed comment directed once the fix landed.

Signed-off-by: Yadan Wei <yadanwei@amazon.com>
---
 .github/config/model-tests/vllm-omni-model-tests.yml | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/.github/config/model-tests/vllm-omni-model-tests.yml b/.github/config/model-tests/vllm-omni-model-tests.yml
index 89d3287b8364..10954eb366c9 100644
--- a/.github/config/model-tests/vllm-omni-model-tests.yml
+++ b/.github/config/model-tests/vllm-omni-model-tests.yml
@@ -185,18 +185,16 @@ benchmark:
     # See: https://github.com/vllm-project/vllm-omni/issues/3124
     # Runs on L4 (x86-g6xl-runner);
     #
-    # Thresholds temporarily loosened for vllm-omni 0.20.0: upstream regression
-    # introduced by vllm-omni#3203 (commit 01f500a5) un-batches Code2Wav decode
-    # chunks; observed RPS 0.281 vs prior 0.4, audio RTF mult 1.109 vs prior 1.6,
-    # p95 e2e 15919ms vs prior 11000ms. Fix is merged upstream as
-    # vllm-omni#3485 (post-0.20.0) and will land in the next omni point release.
-    # Re-tighten to (0.4 / 1.6 / 11000) once that release is picked up.
+    # Thresholds restored to pre-regression baseline (0.4 / 1.6 / 11000) on
+    # vllm-omni 0.21.0rc1: vllm-omni#3485 fix for the #3203 Code2Wav un-batching
+    # regression is now picked up. Observed on rc1: rps=1.302, audio rtf
+    # mult=5.033, p95 e2e=3499ms — all three clear the baseline with margin.
     - name: "qwen3-tts-12hz-1.7b-base"
       s3_model: "qwen3-tts-12hz-1.7b-base.tar.gz"
       fleet: "x86-g6xl-runner"
       extra_args: ""
       benchmark_type: "tts-base"
-      benchmark_config: '{"concurrency": 4, "num_prompts": 20, "ref_audio_s3": "s3://dlc-cicd-models/test-fixtures/audio/tts_ref_vivian.wav", "ref_text": "The quick brown fox jumps over the lazy dog near the riverbank at sunset.", "language": "English", "min_rps": 0.27, "min_audio_rtf_mult": 1.0, "max_p95_e2e_ms": 17000}'
+      benchmark_config: '{"concurrency": 4, "num_prompts": 20, "ref_audio_s3": "s3://dlc-cicd-models/test-fixtures/audio/tts_ref_vivian.wav", "ref_text": "The quick brown fox jumps over the lazy dog near the riverbank at sunset.", "language": "English", "min_rps": 0.4, "min_audio_rtf_mult": 1.6, "max_p95_e2e_ms": 11000}'
 
     # CosyVoice3 zero-shot voice-clone — same /v1/audio/speech route as Qwen3-TTS,
     # uses the tts-base benchmark client with ref_audio_s3. Fleet matches the

From f55e3daf4bc944ea70acda7f4f3fcde29a066a55 Mon Sep 17 00:00:00 2001
From: Yadan Wei <yadanwei@amazon.com>
Date: Wed, 20 May 2026 16:33:49 -0700
Subject: [PATCH 3/5] fix(vllm-omni): use vllm core version (not omni package
 version) for wheel cache key
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The workflow was passing framework_version (= 0.21.0rc1, the omni package
version) into fetch_cached_wheels.sh as the vLLM version. That makes the
cache key sha256(...,version:0.21.0rc1,...) and the filename glob
'vllm-0.21.0rc1*.whl' — neither matches wheels uploaded for vllm core
0.21.0. Result: every omni build is a forced cache miss, even when a
matching vllm core wheel exists in S3.

Source docker/vllm_omni/versions.env first and pass VLLM_VERSION
(= 0.21.0) to fetch + upload. Now omni shares the cache with any other
workflow building the same vllm core ref/version.

Signed-off-by: Yadan Wei <yadanwei@amazon.com>
---
 .github/workflows/pr-vllm-omni-ec2-amzn2023.yml | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/pr-vllm-omni-ec2-amzn2023.yml b/.github/workflows/pr-vllm-omni-ec2-amzn2023.yml
index a6e1990ccb68..d99ac1c103db 100644
--- a/.github/workflows/pr-vllm-omni-ec2-amzn2023.yml
+++ b/.github/workflows/pr-vllm-omni-ec2-amzn2023.yml
@@ -152,10 +152,16 @@ jobs:
       - name: Fetch cached vLLM wheel
         id: wheel-cache
         run: |
+          # The workflow's framework-version is the omni package version
+          # (e.g. 0.21.0rc1), which is NOT the version stamped on the vllm
+          # wheel filename. Source versions.env to read VLLM_VERSION (the
+          # vllm core version, e.g. 0.21.0) so the cache key + filename glob
+          # match wheels uploaded by any workflow on the same vllm core.
+          set -a; source docker/vllm_omni/versions.env; set +a
           OUTPUT=$(bash scripts/vllm/amzn2023/fetch_cached_wheels.sh \
             ${{ needs.load-config.outputs.cuda-version }} \
-            ${{ needs.load-config.outputs.vllm-ref }} \
-            ${{ needs.load-config.outputs.framework-version }})
+            "${VLLM_REF}" \
+            "${VLLM_VERSION}")
           echo "$OUTPUT"
           HIT=$(echo "$OUTPUT" | grep -o 'cache-hit=.*' | cut -d= -f2)
           echo "hit=${HIT}" >> $GITHUB_OUTPUT
@@ -207,10 +213,12 @@ jobs:
       - name: Upload vLLM wheel to cache
         if: success() && steps.wheel-cache.outputs.hit != 'true'
         run: |
+          # Use vllm core version, not omni package version (see fetch step).
+          set -a; source docker/vllm_omni/versions.env; set +a
           bash scripts/vllm/amzn2023/upload_cached_wheels.sh \
             ${{ needs.load-config.outputs.cuda-version }} \
-            ${{ needs.load-config.outputs.vllm-ref }} \
-            ${{ needs.load-config.outputs.framework-version }}
+            "${VLLM_REF}" \
+            "${VLLM_VERSION}"
 
       - name: Sync sccache cache to S3
         if: success() && steps.wheel-cache.outputs.hit != 'true'

From 1c344a0556c2cb2cfd77f03f5b2686c404152bbd Mon Sep 17 00:00:00 2001
From: Yadan Wei <yadanwei@amazon.com>
Date: Wed, 20 May 2026 16:34:02 -0700
Subject: [PATCH 4/5] fix(vllm-omni): pin transformers <5.9.0 for qwen3-tts
 compatibility

vllm-omni 0.21.0rc1's qwen3_tts module calls
create_causal_mask(..., input_embeds=...) at modeling_qwen3_tts_tokenizer_v2.py:576.
transformers renamed the kwarg to `inputs_embeds` in 5.5.1 (kept
input_embeds as a deprecated alias via @deprecate_kwarg) and removed the
decorator outright in 5.9.0 (released 2026-05-20).

Reference: https://github.com/huggingface/transformers/releases/tag/v5.9.0

vllm core 0.21.0's pin (>=4.56.0, !=5.0..5.4, !=5.5.0) doesn't upper-bound
past 5.5, so pip resolves to 5.9.x and breaks qwen3-tts smoke tests with:

    TypeError: create_causal_mask() got an unexpected keyword argument 'input_embeds'

Cap at <5.9.0 (last working release line is 5.8.x). Drop when vllm-omni
updates the call site to use `inputs_embeds`.

Signed-off-by: Yadan Wei <yadanwei@amazon.com>
---
 docker/vllm_omni/Dockerfile.amzn2023 | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/docker/vllm_omni/Dockerfile.amzn2023 b/docker/vllm_omni/Dockerfile.amzn2023
index ce5999eb456b..50e74b582640 100644
--- a/docker/vllm_omni/Dockerfile.amzn2023
+++ b/docker/vllm_omni/Dockerfile.amzn2023
@@ -316,6 +316,20 @@ RUN dnf upgrade -y --releasever=latest --setopt=install_weak_deps=False system-r
 # strip when bumping to a stable 0.21.0.
 RUN --mount=type=cache,target=/root/.cache/uv uv pip install --prerelease=allow vllm-omni==${VLLM_OMNI_VERSION}
 
+# Pin transformers <5.9.0. vllm-omni 0.21.0rc1's qwen3_tts module calls
+# create_causal_mask(input_embeds=...). The kwarg was renamed to
+# `inputs_embeds` in transformers 5.5.1 with a deprecated `input_embeds`
+# alias kept in place — versions 5.5.1..5.8.1 still accept the call with a
+# deprecation warning. transformers 5.9.0 (released 2026-05-20, see
+# https://github.com/huggingface/transformers/releases/tag/v5.9.0) dropped
+# the @deprecate_kwarg("input_embeds", ...) decorator from
+# src/transformers/masking_utils.py, breaking qwen3-tts smoke tests with:
+#     TypeError: create_causal_mask() got an unexpected keyword argument 'input_embeds'
+# vllm core 0.21.0's pin (>=4.56.0, !=5.0..5.4, !=5.5.0) is too loose — pip
+# resolves to 5.9.x. Cap ourselves at <5.9.0 until vllm-omni updates the
+# call site to use `inputs_embeds`.
+RUN --mount=type=cache,target=/root/.cache/uv uv pip install --force-reinstall --no-deps "transformers>=4.56.0,<5.9.0"
+
 # =============================================================================
 # STAGE: builder-oss-omni — OSS compliance for omni venv
 # =============================================================================

From 193ee3350c3f4ff404fa55c93b1e53f61cdb9914 Mon Sep 17 00:00:00 2001
From: Yadan Wei <yadanwei@amazon.com>
Date: Thu, 21 May 2026 10:21:27 -0700
Subject: [PATCH 5/5] fix(vllm-omni): use vllm core version for wheel cache key
 in autorelease

Mirror the same fix applied to pr-vllm-omni-ec2-amzn2023.yml (f55e3daf)
for both build-ec2 and build-sagemaker jobs in the scheduled autorelease
workflow. framework-version (= 0.21.0rc1, the omni package version) is
not the version stamped on the vllm wheel filename, so passing it to
fetch/upload_cached_wheels.sh forces a cache miss every run.

Source docker/vllm_omni/versions.env and pass VLLM_VERSION (= 0.21.0)
+ VLLM_REF (= v0.21.0) instead so the autorelease shares the wheel
cache with PR builds on the same vllm core ref.

Signed-off-by: Yadan Wei <yadanwei@amazon.com>
---
 .github/workflows/autorelease-vllm-omni.yml | 32 +++++++++++++++------
 1 file changed, 24 insertions(+), 8 deletions(-)

diff --git a/.github/workflows/autorelease-vllm-omni.yml b/.github/workflows/autorelease-vllm-omni.yml
index e187d7cb35a4..2db102fd2a20 100644
--- a/.github/workflows/autorelease-vllm-omni.yml
+++ b/.github/workflows/autorelease-vllm-omni.yml
@@ -132,10 +132,16 @@ jobs:
       - name: Fetch cached vLLM wheel
         id: wheel-cache
         run: |
+          # The workflow's framework-version is the omni package version
+          # (e.g. 0.21.0rc1), which is NOT the version stamped on the vllm
+          # wheel filename. Source versions.env to read VLLM_VERSION (the
+          # vllm core version, e.g. 0.21.0) so the cache key + filename glob
+          # match wheels uploaded by any workflow on the same vllm core.
+          set -a; source docker/vllm_omni/versions.env; set +a
           OUTPUT=$(bash scripts/vllm/amzn2023/fetch_cached_wheels.sh \
             ${{ needs.load-config-ec2.outputs.cuda-version }} \
-            ${{ needs.load-config-ec2.outputs.vllm-ref }} \
-            ${{ needs.load-config-ec2.outputs.framework-version }})
+            "${VLLM_REF}" \
+            "${VLLM_VERSION}")
           echo "$OUTPUT"
           HIT=$(echo "$OUTPUT" | grep -o 'cache-hit=.*' | cut -d= -f2)
           echo "hit=${HIT}" >> $GITHUB_OUTPUT
@@ -187,10 +193,12 @@ jobs:
       - name: Upload vLLM wheel to cache
         if: success() && steps.wheel-cache.outputs.hit != 'true'
         run: |
+          # Use vllm core version, not omni package version (see fetch step).
+          set -a; source docker/vllm_omni/versions.env; set +a
           bash scripts/vllm/amzn2023/upload_cached_wheels.sh \
             ${{ needs.load-config-ec2.outputs.cuda-version }} \
-            ${{ needs.load-config-ec2.outputs.vllm-ref }} \
-            ${{ needs.load-config-ec2.outputs.framework-version }}
+            "${VLLM_REF}" \
+            "${VLLM_VERSION}"
 
       - name: Sync sccache cache to S3
         if: success() && steps.wheel-cache.outputs.hit != 'true'
@@ -217,10 +225,16 @@ jobs:
       - name: Fetch cached vLLM wheel
         id: wheel-cache
         run: |
+          # The workflow's framework-version is the omni package version
+          # (e.g. 0.21.0rc1), which is NOT the version stamped on the vllm
+          # wheel filename. Source versions.env to read VLLM_VERSION (the
+          # vllm core version, e.g. 0.21.0) so the cache key + filename glob
+          # match wheels uploaded by any workflow on the same vllm core.
+          set -a; source docker/vllm_omni/versions.env; set +a
           OUTPUT=$(bash scripts/vllm/amzn2023/fetch_cached_wheels.sh \
             ${{ needs.load-config-sagemaker.outputs.cuda-version }} \
-            ${{ needs.load-config-sagemaker.outputs.vllm-ref }} \
-            ${{ needs.load-config-sagemaker.outputs.framework-version }})
+            "${VLLM_REF}" \
+            "${VLLM_VERSION}")
           echo "$OUTPUT"
           HIT=$(echo "$OUTPUT" | grep -o 'cache-hit=.*' | cut -d= -f2)
           echo "hit=${HIT}" >> $GITHUB_OUTPUT
@@ -272,10 +286,12 @@ jobs:
       - name: Upload vLLM wheel to cache
         if: success() && steps.wheel-cache.outputs.hit != 'true'
         run: |
+          # Use vllm core version, not omni package version (see fetch step).
+          set -a; source docker/vllm_omni/versions.env; set +a
           bash scripts/vllm/amzn2023/upload_cached_wheels.sh \
             ${{ needs.load-config-sagemaker.outputs.cuda-version }} \
-            ${{ needs.load-config-sagemaker.outputs.vllm-ref }} \
-            ${{ needs.load-config-sagemaker.outputs.framework-version }}
+            "${VLLM_REF}" \
+            "${VLLM_VERSION}"
 
       - name: Sync sccache cache to S3
         if: success() && steps.wheel-cache.outputs.hit != 'true'