aws · Yadan-Wei · May 21, 2026 · May 18, 2026 · May 20, 2026 · May 20, 2026
diff --git a/.github/config/image/vllm-omni-ec2-amzn2023.yml b/.github/config/image/vllm-omni-ec2-amzn2023.yml
@@ -6,8 +6,8 @@ image:
 
 common:
   framework: "vllm_omni"
-  framework_version: "0.20.0"
-  vllm_ref: "v0.20.0"
+  framework_version: "0.21.0rc1"
+  vllm_ref: "v0.21.0"
   job_type: "general"
   python_version: "py312"
   cuda_version: "cu130"

diff --git a/.github/config/image/vllm-omni-sagemaker-amzn2023.yml b/.github/config/image/vllm-omni-sagemaker-amzn2023.yml
@@ -6,8 +6,8 @@ image:
 
 common:
   framework: "vllm_omni"
-  framework_version: "0.20.0"
-  vllm_ref: "v0.20.0"
+  framework_version: "0.21.0rc1"
+  vllm_ref: "v0.21.0"
   job_type: "general"
   python_version: "py312"
   cuda_version: "cu130"

diff --git a/.github/config/model-tests/vllm-omni-model-tests.yml b/.github/config/model-tests/vllm-omni-model-tests.yml
@@ -185,18 +185,16 @@ benchmark:
     # See: https://github.com/vllm-project/vllm-omni/issues/3124
     # Runs on L4 (x86-g6xl-runner);
     #
-    # Thresholds temporarily loosened for vllm-omni 0.20.0: upstream regression
-    # introduced by vllm-omni#3203 (commit 01f500a5) un-batches Code2Wav decode
-    # chunks; observed RPS 0.281 vs prior 0.4, audio RTF mult 1.109 vs prior 1.6,
-    # p95 e2e 15919ms vs prior 11000ms. Fix is merged upstream as
-    # vllm-omni#3485 (post-0.20.0) and will land in the next omni point release.
-    # Re-tighten to (0.4 / 1.6 / 11000) once that release is picked up.
+    # Thresholds restored to pre-regression baseline (0.4 / 1.6 / 11000) on
+    # vllm-omni 0.21.0rc1: vllm-omni#3485 fix for the #3203 Code2Wav un-batching
+    # regression is now picked up. Observed on rc1: rps=1.302, audio rtf
+    # mult=5.033, p95 e2e=3499ms — all comfortably within the thresholds.
     - name: "qwen3-tts-12hz-1.7b-base"
       s3_model: "qwen3-tts-12hz-1.7b-base.tar.gz"
       fleet: "x86-g6xl-runner"
       extra_args: ""
       benchmark_type: "tts-base"
-      benchmark_config: '{"concurrency": 4, "num_prompts": 20, "ref_audio_s3": "s3://dlc-cicd-models/test-fixtures/audio/tts_ref_vivian.wav", "ref_text": "The quick brown fox jumps over the lazy dog near the riverbank at sunset.", "language": "English", "min_rps": 0.27, "min_audio_rtf_mult": 1.0, "max_p95_e2e_ms": 17000}'
+      benchmark_config: '{"concurrency": 4, "num_prompts": 20, "ref_audio_s3": "s3://dlc-cicd-models/test-fixtures/audio/tts_ref_vivian.wav", "ref_text": "The quick brown fox jumps over the lazy dog near the riverbank at sunset.", "language": "English", "min_rps": 0.4, "min_audio_rtf_mult": 1.6, "max_p95_e2e_ms": 11000}'
 
     # CosyVoice3 zero-shot voice-clone — same /v1/audio/speech route as Qwen3-TTS,
     # uses the tts-base benchmark client with ref_audio_s3. Fleet matches the

diff --git a/.github/workflows/autorelease-vllm-omni.yml b/.github/workflows/autorelease-vllm-omni.yml
@@ -132,10 +132,16 @@ jobs:
       - name: Fetch cached vLLM wheel
         id: wheel-cache
         run: |
+          # The workflow's framework-version is the omni package version
+          # (e.g. 0.21.0rc1), which is NOT the version stamped on the vllm
+          # wheel filename. Source versions.env to read VLLM_VERSION (the
+          # vllm core version, e.g. 0.21.0) so the cache key + filename glob
+          # match wheels uploaded by any workflow on the same vllm core.
+          set -a; source docker/vllm_omni/versions.env; set +a
           OUTPUT=$(bash scripts/vllm/amzn2023/fetch_cached_wheels.sh \
             ${{ needs.load-config-ec2.outputs.cuda-version }} \
-            ${{ needs.load-config-ec2.outputs.vllm-ref }} \
-            ${{ needs.load-config-ec2.outputs.framework-version }})
+            "${VLLM_REF}" \
+            "${VLLM_VERSION}")
           echo "$OUTPUT"
           HIT=$(echo "$OUTPUT" | grep -o 'cache-hit=.*' | cut -d= -f2)
           echo "hit=${HIT}" >> $GITHUB_OUTPUT
@@ -187,10 +193,12 @@ jobs:
       - name: Upload vLLM wheel to cache
         if: success() && steps.wheel-cache.outputs.hit != 'true'
         run: |
+          # Use vllm core version, not omni package version (see fetch step).
+          set -a; source docker/vllm_omni/versions.env; set +a
           bash scripts/vllm/amzn2023/upload_cached_wheels.sh \
             ${{ needs.load-config-ec2.outputs.cuda-version }} \
-            ${{ needs.load-config-ec2.outputs.vllm-ref }} \
-            ${{ needs.load-config-ec2.outputs.framework-version }}
+            "${VLLM_REF}" \
+            "${VLLM_VERSION}"
 
       - name: Sync sccache cache to S3
         if: success() && steps.wheel-cache.outputs.hit != 'true'
@@ -217,10 +225,16 @@ jobs:
       - name: Fetch cached vLLM wheel
         id: wheel-cache
         run: |
+          # The workflow's framework-version is the omni package version
+          # (e.g. 0.21.0rc1), which is NOT the version stamped on the vllm
+          # wheel filename. Source versions.env to read VLLM_VERSION (the
+          # vllm core version, e.g. 0.21.0) so the cache key + filename glob
+          # match wheels uploaded by any workflow on the same vllm core.
+          set -a; source docker/vllm_omni/versions.env; set +a
           OUTPUT=$(bash scripts/vllm/amzn2023/fetch_cached_wheels.sh \
             ${{ needs.load-config-sagemaker.outputs.cuda-version }} \
-            ${{ needs.load-config-sagemaker.outputs.vllm-ref }} \
-            ${{ needs.load-config-sagemaker.outputs.framework-version }})
+            "${VLLM_REF}" \
+            "${VLLM_VERSION}")
           echo "$OUTPUT"
           HIT=$(echo "$OUTPUT" | grep -o 'cache-hit=.*' | cut -d= -f2)
           echo "hit=${HIT}" >> $GITHUB_OUTPUT
@@ -272,10 +286,12 @@ jobs:
       - name: Upload vLLM wheel to cache
         if: success() && steps.wheel-cache.outputs.hit != 'true'
         run: |
+          # Use vllm core version, not omni package version (see fetch step).
+          set -a; source docker/vllm_omni/versions.env; set +a
           bash scripts/vllm/amzn2023/upload_cached_wheels.sh \
             ${{ needs.load-config-sagemaker.outputs.cuda-version }} \
-            ${{ needs.load-config-sagemaker.outputs.vllm-ref }} \
-            ${{ needs.load-config-sagemaker.outputs.framework-version }}
+            "${VLLM_REF}" \
+            "${VLLM_VERSION}"
 
       - name: Sync sccache cache to S3
         if: success() && steps.wheel-cache.outputs.hit != 'true'

diff --git a/.github/workflows/pr-vllm-omni-ec2-amzn2023.yml b/.github/workflows/pr-vllm-omni-ec2-amzn2023.yml
@@ -152,10 +152,16 @@ jobs:
       - name: Fetch cached vLLM wheel
         id: wheel-cache
         run: |
+          # The workflow's framework-version is the omni package version
+          # (e.g. 0.21.0rc1), which is NOT the version stamped on the vllm
+          # wheel filename. Source versions.env to read VLLM_VERSION (the
+          # vllm core version, e.g. 0.21.0) so the cache key + filename glob
+          # match wheels uploaded by any workflow on the same vllm core.
+          set -a; source docker/vllm_omni/versions.env; set +a
           OUTPUT=$(bash scripts/vllm/amzn2023/fetch_cached_wheels.sh \
             ${{ needs.load-config.outputs.cuda-version }} \
-            ${{ needs.load-config.outputs.vllm-ref }} \
-            ${{ needs.load-config.outputs.framework-version }})
+            "${VLLM_REF}" \
+            "${VLLM_VERSION}")
           echo "$OUTPUT"
           HIT=$(echo "$OUTPUT" | grep -o 'cache-hit=.*' | cut -d= -f2)
           echo "hit=${HIT}" >> $GITHUB_OUTPUT
@@ -207,10 +213,12 @@ jobs:
       - name: Upload vLLM wheel to cache
         if: success() && steps.wheel-cache.outputs.hit != 'true'
         run: |
+          # Use vllm core version, not omni package version (see fetch step).
+          set -a; source docker/vllm_omni/versions.env; set +a
           bash scripts/vllm/amzn2023/upload_cached_wheels.sh \
             ${{ needs.load-config.outputs.cuda-version }} \
-            ${{ needs.load-config.outputs.vllm-ref }} \
-            ${{ needs.load-config.outputs.framework-version }}
+            "${VLLM_REF}" \
+            "${VLLM_VERSION}"
 
       - name: Sync sccache cache to S3
         if: success() && steps.wheel-cache.outputs.hit != 'true'

diff --git a/docker/vllm_omni/Dockerfile.amzn2023 b/docker/vllm_omni/Dockerfile.amzn2023
@@ -1,6 +1,6 @@
 ARG CUDA_VERSION=13.0.2
 ARG PYTHON_VERSION=3.12
-ARG VLLM_VERSION=0.20.0
+ARG VLLM_VERSION=0.21.0
 ARG FLASHINFER_VERSION=0.6.8.post1
 ARG DEEPEP_COMMIT_HASH=73b6ea4
 
@@ -201,14 +201,14 @@ RUN --mount=type=cache,target=/root/.cache/uv ls /tmp/vllm-dist/*.whl \
 
 # Install FlashInfer JIT cache (requires CUDA-version-specific index URL).
 # flashinfer-python and flashinfer-cubin are already pulled in via requirements/cuda.txt.
-# Pre-download cubins so the first inference request doesn't pay JIT compile latency.
+# Cubins are downloaded later, AFTER all wheel installs that may overwrite
+# flashinfer files (vllm wheel, EP kernels, KV connectors). See upstream vllm
+# v0.21.0 Dockerfile — downloading earlier wastes ~2.5 GB on layer duplication.
 ARG FLASHINFER_VERSION
 RUN --mount=type=cache,target=/root/.cache/uv uv pip install flashinfer-jit-cache==${FLASHINFER_VERSION} \
-  --extra-index-url https://flashinfer.ai/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') \
-  && flashinfer show-config \
-  && flashinfer download-cubin
+  --extra-index-url https://flashinfer.ai/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
 
-# Install serving extras (matches upstream vllm v0.20.0 serving extras set)
+# Install serving extras (matches upstream vllm v0.21.0 serving extras set)
 RUN --mount=type=cache,target=/root/.cache/uv uv pip install accelerate modelscope \
   "bitsandbytes>=0.46.1" "timm>=1.0.17" "runai-model-streamer[s3,gcs]>=0.15.7"
 
@@ -228,9 +228,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \
     CUDA_MAJOR="${CUDA_VERSION%%.*}"; \
     CUDA_DASH=$(echo $CUDA_VERSION | cut -d. -f1,2 | tr '.' '-'); \
     if [ "$INSTALL_KV_CONNECTORS" = "true" ]; then \
-        if [ "$CUDA_MAJOR" -ge 13 ]; then \
-            uv pip install nixl-cu13; \
-        fi; \
         uv pip install -r /tmp/kv_connectors.txt --no-build || ( \
             dnf install -y --setopt=install_weak_deps=False \
                 libcusparse-devel-${CUDA_DASH} \
@@ -240,8 +237,16 @@ RUN --mount=type=cache,target=/root/.cache/uv \
             && dnf remove -y libcusparse-devel-${CUDA_DASH} libcublas-devel-${CUDA_DASH} libcusolver-devel-${CUDA_DASH} \
             && dnf clean all && rm -rf /var/cache/dnf \
         ); \
+        # Force-reinstall the matching CUDA wheel so the correct nixl_ep_cpp.so
+        # is installed (upstream vllm v0.21.0 fix).
+        uv pip install --force-reinstall --no-deps nixl-cu${CUDA_MAJOR}; \
     fi
 
+# Pre-download FlashInfer cubins AFTER all wheel installs (vllm wheel, EP
+# kernels, KV connectors) finish — earlier installs may overwrite flashinfer
+# package files. Downloading here avoids ~2.5 GB layer duplication.
+RUN flashinfer show-config && flashinfer download-cubin
+
 # =============================================================================
 # STAGE 3: runtime — minimal image with clean venv
 # =============================================================================
@@ -267,7 +272,9 @@ ENV PATH="/root/.local/bin:${PATH}"
 # See: https://docs.nvidia.com/deploy/cuda-compatibility/
 ENV VLLM_ENABLE_CUDA_COMPATIBILITY=0
 
-# Runtime JIT compilation tools (Triton/Inductor, FlashInfer, DeepGEMM)
+# Runtime JIT compilation tools (Triton/Inductor, FlashInfer, DeepGEMM).
+# Upstream vllm v0.21.0 switched libcublas → libcublas-devel so cublas headers
+# are present at runtime for JIT (e.g. fastsafetensors / nccl_allocator).
 RUN CUDA_DASH=$(echo $CUDA_VERSION | cut -d. -f1,2 | tr '.' '-') \
   && dnf install -y --setopt=install_weak_deps=False \
     gcc python${PYTHON_VERSION}-devel \
@@ -276,7 +283,7 @@ RUN CUDA_DASH=$(echo $CUDA_VERSION | cut -d. -f1,2 | tr '.' '-') \
     cuda-nvrtc-${CUDA_DASH} \
     cuda-cuobjdump-${CUDA_DASH} \
     libcurand-devel-${CUDA_DASH} \
-    libcublas-${CUDA_DASH} \
+    libcublas-devel-${CUDA_DASH} \
   && dnf clean all && rm -rf /var/cache/dnf
 
 COPY --from=deps /opt/venv /opt/venv
@@ -294,7 +301,7 @@ ENV HF_XET_HIGH_PERFORMANCE=1
 # =============================================================================
 FROM runtime AS omni-deps
 
-ARG VLLM_OMNI_VERSION=0.20.0
+ARG VLLM_OMNI_VERSION=0.21.0rc1
 
 # System deps for omni-modality (TTS, audio, image/video)
 # Enable SPAL (Supplementary Packages for Amazon Linux) for espeak-ng, ffmpeg.
@@ -304,8 +311,24 @@ RUN dnf upgrade -y --releasever=latest --setopt=install_weak_deps=False system-r
   && dnf install -y --setopt=install_weak_deps=False espeak-ng ffmpeg-free \
   && dnf clean all && rm -rf /var/cache/dnf
 
-# Install vllm-omni (pure Python, no compilation)
-RUN --mount=type=cache,target=/root/.cache/uv uv pip install vllm-omni==${VLLM_OMNI_VERSION}
+# Install vllm-omni (pure Python, no compilation).
+# 0.21.0rc1 is a PEP 440 pre-release. NOTE(review): --prerelease=allow also lets
+# transitive deps resolve to pre-releases; drop the flag on stable 0.21.0.
+RUN --mount=type=cache,target=/root/.cache/uv uv pip install --prerelease=allow vllm-omni==${VLLM_OMNI_VERSION}
+
+# Pin transformers <5.9.0. vllm-omni 0.21.0rc1's qwen3_tts module calls
+# create_causal_mask(input_embeds=...). The kwarg was renamed to
+# `inputs_embeds` in transformers 5.5.1 with a deprecated `input_embeds`
+# alias kept in place — versions 5.5.1..5.8.1 still accept the call with a
+# deprecation warning. transformers 5.9.0 (released 2026-05-20, see
+# https://github.com/huggingface/transformers/releases/tag/v5.9.0) dropped
+# the @deprecate_kwarg("input_embeds", ...) decorator from
+# src/transformers/masking_utils.py, breaking qwen3-tts smoke tests with:
+#     TypeError: create_causal_mask() got an unexpected keyword argument 'input_embeds'
+# vllm core 0.21.0's pin (>=4.56.0, !=5.0..5.4, !=5.5.0) is too loose — pip
+# resolves to 5.9.x. Cap ourselves at <5.9.0 until vllm-omni updates the
+# call site to use `inputs_embeds`.
+RUN --mount=type=cache,target=/root/.cache/uv uv pip install --force-reinstall --no-deps "transformers>=4.56.0,<5.9.0"
 
 # =============================================================================
 # STAGE: builder-oss-omni — OSS compliance for omni venv
@@ -327,7 +350,7 @@ ARG PYTHON="python3"
 ARG PYTHON_VERSION=3.12
 ARG CUDA_VERSION
 ARG DLC_MAJOR_VERSION=1
-ARG DLC_MINOR_VERSION=0
+ARG DLC_MINOR_VERSION=3
 
 LABEL maintainer="Amazon AI"
 LABEL dlc_major_version="${DLC_MAJOR_VERSION}"

diff --git a/docker/vllm_omni/versions.env b/docker/vllm_omni/versions.env
@@ -11,9 +11,10 @@
 
 # ── vLLM source & version ──────────────────────────────────────
 export VLLM_REPO="https://github.com/vllm-project/vllm.git"
-export VLLM_VERSION="0.20.0"
+export VLLM_VERSION="0.21.0"
 export VLLM_REF="v${VLLM_VERSION}"
-export VLLM_OMNI_VERSION="0.20.0"
+# vllm-omni 0.21.0rc1 is a pre-release; installs need uv --prerelease=allow (pip: --pre).
+export VLLM_OMNI_VERSION="0.21.0rc1"
 
 # Wheel version tag — PEP 440 local-version encoding the pinned ref for
 # traceability. Commit SHAs are truncated to 8 chars; tags/branches are
@@ -36,9 +37,9 @@ export EFA_VERSION="1.47.0"
 
 # ── DLC image versioning ───────────────────────────────────────
 export DLC_MAJOR_VERSION="1"
-export DLC_MINOR_VERSION="2"
+export DLC_MINOR_VERSION="3"
 
 # ── Build configuration ────────────────────────────────────────
-# Aligned with upstream vllm v0.20.0 Dockerfile.
+# Aligned with upstream vllm v0.21.0 Dockerfile.
 export torch_cuda_arch_list="7.5 8.0 8.6 8.9 9.0 10.0 11.0 12.0+PTX"
 export INSTALL_KV_CONNECTORS="true"