aws · Yadan-Wei · May 12, 2026 · May 2, 2026 · May 5, 2026 · May 5, 2026
diff --git a/.github/config/image/vllm-omni-ec2-amzn2023.yml b/.github/config/image/vllm-omni-ec2-amzn2023.yml
@@ -6,15 +6,15 @@ image:
 
 common:
   framework: "vllm_omni"
-  framework_version: "0.18.0"
-  vllm_ref: "v0.18.0"
+  framework_version: "0.20.0"
+  vllm_ref: "v0.20.0"
   job_type: "general"
   python_version: "py312"
-  cuda_version: "cu129"
+  cuda_version: "cu130"
   os_version: "amzn2023"
   customer_type: "ec2"
   arch_type: "x86"
-  prod_image: "vllm-omni:0.18-gpu-py312-ec2"
+  prod_image: "vllm:omni-cuda-v1"
   device_type: "gpu"
   contributor: "None"
 

diff --git a/.github/config/image/vllm-omni-sagemaker-amzn2023.yml b/.github/config/image/vllm-omni-sagemaker-amzn2023.yml
@@ -6,16 +6,16 @@ image:
 
 common:
   framework: "vllm_omni"
-  framework_version: "0.18.0"
-  vllm_ref: "v0.18.0"
+  framework_version: "0.20.0"
+  vllm_ref: "v0.20.0"
   job_type: "general"
   python_version: "py312"
-  cuda_version: "cu129"
+  cuda_version: "cu130"
   os_version: "amzn2023"
   customer_type: "sagemaker"
   platform: "sagemaker"
   arch_type: "x86"
-  prod_image: "vllm-omni:0.18-gpu-py312-sagemaker"
+  prod_image: "vllm:omni-sagemaker-cuda-v1"
   device_type: "gpu"
   contributor: "None"
 

diff --git a/.github/config/model-tests/vllm-omni-model-tests.yml b/.github/config/model-tests/vllm-omni-model-tests.yml
@@ -27,6 +27,33 @@ smoke-test:
       test_request: '{"input": "Hello, how are you?", "voice": "vivian", "language": "English"}'
       validate: "binary_size_gt:1000"
 
+    # Voice-clone TTS: ref_audio_s3 is fetched by the workflow, base64-encoded,
+    # and injected as ref_audio before invoking the smoke-test script.
+    # ref_text MUST be the exact transcript of the reference audio — mismatched
+    # transcripts can cause Code2Wav malformed output (upstream issue #3124).
+    - name: "qwen3-tts-12hz-1.7b-base"
+      s3_model: "qwen3-tts-12hz-1.7b-base.tar.gz"
+      fleet: "x86-g6xl-runner"
+      extra_args: ""
+      route: "/v1/audio/speech"
+      test_request: '{"input": "Hello, this is a voice cloning smoke test.", "ref_audio_s3": "s3://dlc-cicd-models/test-fixtures/audio/tts_ref_vivian.wav", "ref_text": "The quick brown fox jumps over the lazy dog near the riverbank at sunset.", "language": "English"}'
+      validate: "binary_size_gt:1000"
+
+    # CosyVoice3 is zero-shot voice-clone only (no preset voices). The upstream
+    # vendored fixture ships at tests/assets/cosyvoice3/zero_shot_prompt.wav;
+    # we mirror it under test-fixtures/audio/ for CI isolation.
+    # Fleet bumped from x86-g6xl-runner (16 GB RAM) to x86-g6exl-runner
+    # (32 GB RAM) on 2026-05-11: cosyvoice with --trust-remote-code on 16 GB
+    # was causing host SIGKILL during model load on vllm-omni 0.20.0 final.
+    # Last green run was 2026-05-07 on rc1; regression in final.
+    - name: "cosyvoice3-0.5b"
+      s3_model: "cosyvoice3-0.5b.tar.gz"
+      fleet: "x86-g6exl-runner"
+      extra_args: "--trust-remote-code --enforce-eager"
+      route: "/v1/audio/speech"
+      test_request: '{"input": "Hello, this is a voice cloning smoke test.", "ref_audio_s3": "s3://dlc-cicd-models/test-fixtures/audio/cosyvoice3_ref.wav", "ref_text": "希望你以后能够做的比我还好呦。", "response_format": "wav", "stream": false}'
+      validate: "binary_size_gt:1000"
+
     # --- Image generation models (route: /v1/images/generations) ---
     - name: "flux2-klein-4b"
       s3_model: "flux2-klein-4b.tar.gz"
@@ -36,7 +63,18 @@ smoke-test:
       test_request: '{"prompt": "a red apple on a white table", "size": "512x512", "n": 1}'
       validate: "json_field:data[0].b64_json"
 
-    # --- Video generation models (route: /v1/videos) ---
+    # ERNIE-Image-Turbo: 8-step distilled DiT image gen, added in vllm-omni
+    # #2861. ErnieImagePipeline only landed in v0.20.0 final (post-rc1).
+    - name: "ernie-image-turbo"
+      s3_model: "ernie-image-turbo.tar.gz"
+      fleet: "x86-g6exl-runner"
+      extra_args: ""
+      route: "/v1/images/generations"
+      test_request: '{"prompt": "a red apple on a white table", "size": "512x512", "n": 1}'
+      validate: "json_field:data[0].b64_json"
+
+    # --- Video generation models ---
+    # Async route (POST /v1/videos): returns job ID, requires polling.
     - name: "wan2.1-t2v-1.3b"
       s3_model: "wan2.1-t2v-1.3b.tar.gz"
       fleet: "x86-g6exl-runner"
@@ -46,6 +84,54 @@ smoke-test:
       test_request: 'prompt=a dog running on a beach&num_frames=17&num_inference_steps=4&size=480x320&seed=42'
       validate: "json_field:id"
 
+    # Sync route (POST /v1/videos/sync): blocks until complete, returns raw
+    # video/mp4 bytes. New in v0.20.0 — compatible with SageMaker endpoints.
+    - name: "wan2.1-t2v-1.3b-sync"
+      s3_model: "wan2.1-t2v-1.3b.tar.gz"
+      fleet: "x86-g6exl-runner"
+      extra_args: ""
+      route: "/v1/videos/sync"
+      content_type: "multipart/form-data"
+      test_request: 'prompt=a dog running on a beach&num_frames=17&num_inference_steps=4&size=480x320&seed=42'
+      validate: "binary_size_gt:1000"
+
+    # Wan2.1-VACE: unified video creation/editing pipeline (WanVACEPipeline,
+    # added in vllm-omni #1885). Distinct from WanPipeline T2V — accepts
+    # text + optional video/mask/reference image. 1.3B variant fits L40S.
+    # Validated 2026-05-08 on g6e.2xlarge: 46 KB MP4 in 2.37s, peak GPU 19.3 GB.
+    - name: "wan2.1-vace-1.3b"
+      s3_model: "wan2.1-vace-1.3b.tar.gz"
+      fleet: "x86-g6exl-runner"
+      extra_args: ""
+      route: "/v1/videos/sync"
+      content_type: "multipart/form-data"
+      test_request: 'prompt=a dog running on a beach&num_frames=17&num_inference_steps=4&size=480x320&seed=42'
+      validate: "binary_size_gt:1000"
+
+    # Wan2.2-I2V-A14B: 27B-total / 14B-active MoE, image-to-video. Tarball
+    # is 107 GB and needs g6e.12xlarge. Pre-staged at
+    # s3://dlc-cicd-models/omni-models/wan2.2-i2v-a14b.tar.gz but not enabled
+    # because (a) g6e.12xl has insufficient capacity (ICE) in us-west-2, (b) /v1/videos/sync
+    # for I2V needs an `image` form field that the current smoke-test harness
+    # doesn't fetch (analogous to the ref_audio_s3 pattern, but for images).
+    # - name: "wan2.2-i2v-a14b"
+    #   s3_model: "wan2.2-i2v-a14b.tar.gz"
+    #   fleet: "x86-g6e12xl-runner"
+    #   extra_args: ""
+    #   route: "/v1/videos/sync"
+    #   content_type: "multipart/form-data"
+    #   test_request: 'prompt=a dog running on a beach&image_s3=s3://dlc-cicd-models/test-fixtures/images/i2v_seed.png&num_frames=17&num_inference_steps=4&size=480x320&seed=42'
+    #   validate: "binary_size_gt:1000"
+
+    # --- Audio generation models (route: /v1/audio/generate, new in v0.20.0 per vllm-project/vllm-omni#1794) ---
+    - name: "stable-audio-open-1.0"
+      s3_model: "stable-audio-open-1.0.tar.gz"
+      fleet: "x86-g6xl-runner"
+      extra_args: "--gpu-memory-utilization 0.9 --trust-remote-code --enforce-eager"
+      route: "/v1/audio/generate"
+      test_request: '{"input": "The sound of a dog barking", "audio_length": 5.0, "guidance_scale": 7.0, "num_inference_steps": 50, "seed": 42}'
+      validate: "binary_size_gt:10000"
+
     # --- Omni chat models (route: /v1/chat/completions, fallthrough) ---
     # model is big, won't run for now
     # - name: "bagel-7b-mot"

diff --git a/.github/workflows/reusable-vllm-omni-model-tests.yml b/.github/workflows/reusable-vllm-omni-model-tests.yml
@@ -108,7 +108,7 @@ jobs:
             ${{ inputs.image-uri }} \
             --model ${{ steps.resolve.outputs.model_path }} \
             --port 8080 \
-            --stage-init-timeout 600 \
+            --stage-init-timeout 900 \
             ${{ matrix.model.extra_args }})
           echo "CONTAINER_ID=$CONTAINER_ID" >> $GITHUB_ENV
 
@@ -158,11 +158,44 @@ jobs:
           docker cp test/vllm-omni/scripts/vllm_omni_${{ inputs.customer-type }}_smoke_test.sh \
             ${CONTAINER_ID}:/tmp/smoke_test.sh
 
+      - name: Prepare test request
+        # Write test_request to a file and expand any S3-backed voice-clone
+        # reference audio into an inlined base64 data URL. Using a file keeps
+        # us under the shell argument-length limit (~128KB) when ref_audio
+        # payloads push the JSON past ~400KB.
+        # test_request is passed through the environment rather than being
+        # interpolated into the script text, so a payload containing a single
+        # quote cannot terminate the shell string early.
+        env:
+          TEST_REQUEST: ${{ matrix.model.test_request }}
+        run: |
+          set -euo pipefail
+          REQUEST="$TEST_REQUEST"
+          if [[ "$REQUEST" == *ref_audio_s3* ]]; then
+            REF_S3=$(python3 -c 'import json,sys;print(json.loads(sys.stdin.read()).get("ref_audio_s3",""))' <<< "$REQUEST")
+            if [ -n "$REF_S3" ]; then
+              echo "Fetching ref_audio from $REF_S3"
+              aws s3 cp "$REF_S3" /tmp/ref_audio.wav --quiet
+              REQUEST=$(REQUEST="$REQUEST" python3 <<'PY'
+          import base64, json, os
+          payload = json.loads(os.environ["REQUEST"])
+          with open("/tmp/ref_audio.wav", "rb") as f:
+              payload["ref_audio"] = "data:audio/wav;base64," + base64.b64encode(f.read()).decode()
+          payload.pop("ref_audio_s3", None)
+          print(json.dumps(payload))
+          PY
+              )
+            fi
+          fi
+          printf '%s' "$REQUEST" > /tmp/test_request.body
+          echo "Request size: $(wc -c < /tmp/test_request.body) bytes"
+          docker cp /tmp/test_request.body ${CONTAINER_ID}:/tmp/test_request.body
+
       - name: Run smoke test
         run: |
           docker exec ${CONTAINER_ID} bash /tmp/smoke_test.sh \
             "${{ matrix.model.route }}" \
-            '${{ matrix.model.test_request }}' \
+            '@/tmp/test_request.body' \
             "${{ matrix.model.validate }}" \
             "${{ matrix.model.content_type || 'application/json' }}"
 

diff --git a/docker/vllm_omni/Dockerfile.amzn2023 b/docker/vllm_omni/Dockerfile.amzn2023
@@ -1,8 +1,7 @@
-ARG CUDA_VERSION=12.9.1
+ARG CUDA_VERSION=13.0.2
 ARG PYTHON_VERSION=3.12
-ARG VLLM_VERSION=0.18.0
-ARG FLASHINFER_VERSION=0.6.6
-ARG DEEPGEMM_GIT_REF=477618cd51baffca09c4b0b87e97c03fe827ef03
+ARG VLLM_VERSION=0.20.0
+ARG FLASHINFER_VERSION=0.6.8.post1
 ARG DEEPEP_COMMIT_HASH=73b6ea4
 
 # =============================================================================
@@ -67,10 +66,16 @@ WORKDIR /workspace/vllm
 COPY --from=source /src/ ./
 
 # Install PyTorch + build deps (slow-changing, cached)
-RUN --mount=type=cache,target=/root/.cache/uv uv pip install -r requirements/cuda.txt -r requirements/build.txt \
+# Upstream vllm v0.20.0 moved requirements/build.txt → requirements/build/cuda.txt
+# and installs it as a separate step after cuda.txt. NOTE: both steps here still
+# pass the torch --extra-index-url. See docker/Dockerfile in vllm v0.20.0.
+RUN --mount=type=cache,target=/root/.cache/uv uv pip install -r requirements/cuda.txt \
   --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
 
-ARG torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0 10.0 12.0'
+RUN --mount=type=cache,target=/root/.cache/uv uv pip install -r requirements/build/cuda.txt \
+  --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
+
+ARG torch_cuda_arch_list='7.5 8.0 8.6 8.9 9.0 10.0 11.0 12.0+PTX'
 ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
 
 ARG max_jobs=32
@@ -82,11 +87,14 @@ ENV NVCC_THREADS=${nvcc_threads}
 ENV VLLM_TARGET_DEVICE=cuda
 ARG VLLM_REF
 ARG VLLM_VERSION
-ENV SETUPTOOLS_SCM_PRETEND_VERSION="${VLLM_VERSION}+amzn2023"
+# Wheel version tag — pass via --build-arg from workflow (see docker/vllm_omni/versions.env).
+# Example: 0.20.0+amzn2023.abcdef12
+ARG SETUPTOOLS_SCM_PRETEND_VERSION
+ENV SETUPTOOLS_SCM_PRETEND_VERSION=${SETUPTOOLS_SCM_PRETEND_VERSION:-${VLLM_VERSION}+amzn2023}
 
 # --- Pre-built wheel support ---
 # Fetch wheels from S3 into docker/vllm/prebuilt_wheels/ BEFORE docker build:
-#   bash scripts/vllm/amzn2023/fetch_cached_wheels.sh cu129 v0.18.0
+#   bash scripts/vllm/amzn2023/fetch_cached_wheels.sh cu130 v0.20.0
 # The directory is empty by default so COPY always succeeds.
 COPY docker/vllm/prebuilt_wheels/ /tmp/prebuilt_wheels/
 
@@ -102,7 +110,7 @@ COPY docker/vllm/sccache-cache/ /root/.cache/sccache/
 RUN --mount=type=cache,target=/root/.cache/ccache --mount=type=cache,target=/root/.cache/uv \
     if [ "$USE_PREBUILT_WHEEL" = "1" ] && ls /tmp/prebuilt_wheels/*.whl >/dev/null 2>&1; then \
       echo "✅ Using pre-built vLLM wheel — skipping compilation" \
-      && mkdir -p dist \
+      && rm -rf dist && mkdir -p dist \
       && cp /tmp/prebuilt_wheels/*.whl dist/; \
     elif [ "$USE_SCCACHE" = "1" ]; then \
       echo "🔧 Building vLLM with sccache..." \
@@ -126,16 +134,6 @@ RUN --mount=type=cache,target=/root/.cache/ccache --mount=type=cache,target=/roo
       && python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \
     fi
 
-# Build DeepGEMM wheel
-ARG DEEPGEMM_GIT_REF
-RUN --mount=type=cache,target=/root/.cache/uv mkdir -p /tmp/deepgemm/dist \
-  && VLLM_DOCKER_BUILD_CONTEXT=1 TORCH_CUDA_ARCH_LIST="9.0a 10.0a" \
-    tools/install_deepgemm.sh --cuda-version "${CUDA_VERSION}" \
-    ${DEEPGEMM_GIT_REF:+--ref "$DEEPGEMM_GIT_REF"} \
-    --wheel-dir /tmp/deepgemm/dist \
-  || echo "DeepGEMM build skipped (CUDA version requirement not met)"
-RUN mkdir -p /tmp/deepgemm/dist
-
 # Build DeepEP wheels
 ARG DEEPEP_COMMIT_HASH
 ARG NVSHMEM_VER
@@ -193,29 +191,26 @@ COPY --from=build /workspace/vllm/requirements/cuda.txt /tmp/cuda.txt
 RUN --mount=type=cache,target=/root/.cache/uv uv pip install -r /tmp/cuda.txt \
   --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
 
-# Install vLLM wheel
+# Install vLLM wheel (pick the most recent vllm-*.whl and log which one)
 COPY --from=build /workspace/vllm/dist /tmp/vllm-dist
-RUN --mount=type=cache,target=/root/.cache/uv uv pip install /tmp/vllm-dist/*.whl \
-  --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
-
-# Install FlashInfer cubin/jit-cache
+RUN --mount=type=cache,target=/root/.cache/uv ls /tmp/vllm-dist/*.whl \
+  && VLLM_WHL=$(ls -t /tmp/vllm-dist/vllm-*.whl | head -1) \
+  && echo "Installing ${VLLM_WHL}" \
+  && uv pip install "${VLLM_WHL}" \
+    --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
+
+# Install FlashInfer JIT cache (requires CUDA-version-specific index URL).
+# flashinfer-python and flashinfer-cubin are already pulled in via requirements/cuda.txt.
+# Pre-download cubins so the first inference request doesn't pay JIT compile latency.
 ARG FLASHINFER_VERSION
-RUN --mount=type=cache,target=/root/.cache/uv uv pip install flashinfer-cubin==${FLASHINFER_VERSION} \
-  flashinfer-jit-cache==${FLASHINFER_VERSION} \
-  --extra-index-url https://flashinfer.ai/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
+RUN --mount=type=cache,target=/root/.cache/uv uv pip install flashinfer-jit-cache==${FLASHINFER_VERSION} \
+  --extra-index-url https://flashinfer.ai/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') \
+  && flashinfer show-config \
+  && flashinfer download-cubin
 
-# Install serving extras
-RUN --mount=type=cache,target=/root/.cache/uv uv pip install accelerate hf_transfer modelscope \
-  "bitsandbytes>=0.46.1" "timm>=1.0.17" "runai-model-streamer[s3,gcs]>=0.15.3"
-
-# Install DeepGEMM wheel from build stage
-COPY --from=build /tmp/deepgemm/dist /tmp/deepgemm/dist
-RUN --mount=type=cache,target=/root/.cache/uv \
-  if ls /tmp/deepgemm/dist/*.whl >/dev/null 2>&1; then \
-    uv pip install /tmp/deepgemm/dist/*.whl; \
-  else \
-    echo "No DeepGEMM wheels to install; skipping."; \
-  fi && rm -rf /tmp/deepgemm
+# Install serving extras (matches upstream vllm v0.20.0 serving extras set)
+RUN --mount=type=cache,target=/root/.cache/uv uv pip install accelerate modelscope \
+  "bitsandbytes>=0.46.1" "timm>=1.0.17" "runai-model-streamer[s3,gcs]>=0.15.7"
 
 # Install DeepEP wheels from build stage
 COPY --from=build /tmp/ep_kernels_workspace/dist /tmp/ep_kernels/dist
@@ -259,12 +254,19 @@ WORKDIR /vllm-workspace
 
 RUN dnf install -y --setopt=install_weak_deps=False \
   python${PYTHON_VERSION} libibverbs shadow-utils tar gzip \
+  numactl numactl-libs numactl-devel \
   && dnf clean all && rm -rf /var/cache/dnf
 
 # Install uv
 RUN curl -LsSf https://astral.sh/uv/install.sh | sh
 ENV PATH="/root/.local/bin:${PATH}"
 
+# CUDA forward compatibility for datacenter GPUs with older drivers (off by default).
+# Set to 1 at runtime (e.g., -e VLLM_ENABLE_CUDA_COMPATIBILITY=1) only when
+# the host NVIDIA driver is older than what CUDA 13 normally requires.
+# See: https://docs.nvidia.com/deploy/cuda-compatibility/
+ENV VLLM_ENABLE_CUDA_COMPATIBILITY=0
+
 # Runtime JIT compilation tools (Triton/Inductor, FlashInfer, DeepGEMM)
 RUN CUDA_DASH=$(echo $CUDA_VERSION | cut -d. -f1,2 | tr '.' '-') \
   && dnf install -y --setopt=install_weak_deps=False \
@@ -284,21 +286,22 @@ ENV PATH="/opt/venv/bin:${PATH}"
 ENV VIRTUAL_ENV="/opt/venv"
 ENV LD_LIBRARY_PATH=/usr/local/nvidia/lib64:/usr/local/cuda/lib64:${LD_LIBRARY_PATH}
 ENV VLLM_USAGE_SOURCE=production-docker-image
-ENV HF_HUB_ENABLE_HF_TRANSFER=1
+ENV HF_XET_HIGH_PERFORMANCE=1
 
 
 # =============================================================================
 # STAGE: omni-deps — install vllm-omni on top of runtime venv
 # =============================================================================
 FROM runtime AS omni-deps
 
-ARG VLLM_OMNI_VERSION=0.18.0
+ARG VLLM_OMNI_VERSION=0.20.0
 
 # System deps for omni-modality (TTS, audio, image/video)
-# Enable SPAL (Supplementary Packages for Amazon Linux) for espeak-ng, sox, ffmpeg
+# Enable SPAL (Supplementary Packages for Amazon Linux) for espeak-ng, ffmpeg.
+# sox dropped — vllm-omni v0.20.0 removed sox from its deps (#2745)
 RUN dnf upgrade -y --releasever=latest --setopt=install_weak_deps=False system-release \
   && dnf install -y spal-release \
-  && dnf install -y --setopt=install_weak_deps=False espeak-ng sox ffmpeg-free \
+  && dnf install -y --setopt=install_weak_deps=False espeak-ng ffmpeg-free \
   && dnf clean all && rm -rf /var/cache/dnf
 
 # Install vllm-omni (pure Python, no compilation)
@@ -400,6 +403,9 @@ FROM omni-base AS vllm-omni-ec2-amzn2023
 
 ARG CACHE_REFRESH=0
 RUN dnf upgrade -y --security --releasever latest --setopt=install_weak_deps=False \
+  # Explicitly upgrade cuda-compat-13-0 to pick up fix for CVE-2025-33219
+  # (NVIDIA repo — not flagged in AL2023 security advisories, so --security misses it)
+  && dnf upgrade -y --releasever latest cuda-compat-13-0 \
   && dnf clean all && rm -rf /var/cache/dnf /tmp/* \
   && ln -sf /opt/venv/bin/python3 /usr/bin/python3
 
@@ -413,6 +419,9 @@ FROM omni-base AS vllm-omni-sagemaker-amzn2023
 
 ARG CACHE_REFRESH=0
 RUN dnf upgrade -y --security --releasever latest --setopt=install_weak_deps=False \
+  # Explicitly upgrade cuda-compat-13-0 to pick up fix for CVE-2025-33219
+  # (NVIDIA repo — not flagged in AL2023 security advisories, so --security misses it)
+  && dnf upgrade -y --releasever latest cuda-compat-13-0 \
   && dnf clean all && rm -rf /var/cache/dnf /tmp/* \
   && ln -sf /opt/venv/bin/python3 /usr/bin/python3