From 8c1f814611deb4bc8d9c61329e0f8217e50a5990 Mon Sep 17 00:00:00 2001
From: Andreas Karatzas <akaratza@amd.com>
Date: Thu, 12 Mar 2026 22:11:42 -0500
Subject: [PATCH 01/23] [ROCm][CI] Add ROCm Docker Hub registry cache and
 weekly cleanup pipeline

Signed-off-by: Andreas Karatzas <akaratza@amd.com>
---
 .buildkite/hardware_tests/amd.yaml            |  26 ++-
 .buildkite/scripts/ci-bake.sh                 | 172 ++++++++++++++++++
 .buildkite/test-amd.yaml                      |  13 +-
 docker/Dockerfile.rocm                        | 143 +++++++++++----
 docker/docker-bake-rocm.hcl                   |  81 +++++++++
 .../python_only_compile_rocm.sh               |  71 ++++++++
 6 files changed, 449 insertions(+), 57 deletions(-)
 create mode 100644 .buildkite/scripts/ci-bake.sh
 create mode 100644 docker/docker-bake-rocm.hcl
 create mode 100644 tests/standalone_tests/python_only_compile_rocm.sh

diff --git a/.buildkite/hardware_tests/amd.yaml b/.buildkite/hardware_tests/amd.yaml
index 23a23723ad93..6f96db2110e6 100644
--- a/.buildkite/hardware_tests/amd.yaml
+++ b/.buildkite/hardware_tests/amd.yaml
@@ -1,4 +1,4 @@
-group: Hardware - AMD Build 
+group: Hardware - AMD Build
 steps:
   - label: "AMD: :docker: build image"
     key: image-build-amd
@@ -6,25 +6,21 @@ steps:
     device: amd_cpu
     no_plugin: true
     commands:
-    - >
-      docker build
-      --build-arg max_jobs=16
-      --build-arg REMOTE_VLLM=1
-      --build-arg ARG_PYTORCH_ROCM_ARCH='gfx90a;gfx942;gfx950'
-      --build-arg VLLM_BRANCH=$BUILDKITE_COMMIT
-      --tag "rocm/vllm-ci:${BUILDKITE_COMMIT}"
-      -f docker/Dockerfile.rocm
-      --target test
-      --no-cache
-      --progress plain .
-    - docker push "rocm/vllm-ci:${BUILDKITE_COMMIT}"
+      - bash .buildkite/scripts/ci-bake.sh test-rocm-ci
     env:
       DOCKER_BUILDKIT: "1"
+      IMAGE_TAG: "rocm/vllm-ci:${BUILDKITE_COMMIT}"
+      VLLM_BAKE_FILE: "docker/docker-bake-rocm.hcl"
+      CI_HCL_URL: "https://raw.githubusercontent.com/vllm-project/ci-infra/main/docker/ci-rocm.hcl"
+      PYTORCH_ROCM_ARCH: "gfx90a;gfx942;gfx950"
+    timeout_in_minutes: 600
     retry:
       automatic:
-        - exit_status: -1  # Agent was lost
+        - exit_status: -1   # Agent was lost
           limit: 1
         - exit_status: -10  # Agent was lost
           limit: 1
-        - exit_status: 1  # Machine occasionally fail
+        - exit_status: 128  # Git / network connectivity issues
+          limit: 1
+        - exit_status: 1    # Machine occasionally fails
           limit: 1
diff --git a/.buildkite/scripts/ci-bake.sh b/.buildkite/scripts/ci-bake.sh
new file mode 100644
index 000000000000..410c5c352366
--- /dev/null
+++ b/.buildkite/scripts/ci-bake.sh
@@ -0,0 +1,172 @@
+#!/bin/bash
+# ci-bake.sh - Wrapper script for Docker buildx bake CI builds
+#
+# Canonical location: vllm repo at .buildkite/scripts/ci-bake.sh
+# Kept in sync with ci-infra repo at buildkite/scripts/ci-bake.sh.
+# Update both when making changes; the vllm copy is what actually runs in CI
+# (pinned to the vllm commit under test).
+#
+# This script handles the common setup for running docker buildx bake:
+# - Downloads ci.hcl from ci-infra
+# - Detects and uses local buildkitd if available (custom AMI with warm cache)
+# - Falls back to docker-container driver on regular instances
+# - Runs bake with --print for debugging
+# - Runs the actual build
+#
+# Usage:
+#   ci-bake.sh [TARGET]
+#
+# Environment variables (all optional, with sensible defaults):
+#   CI_HCL_URL          - URL to ci.hcl (default: from ci-infra main branch)
+#   VLLM_CI_BRANCH      - ci-infra branch to use (default: main)
+#   VLLM_BAKE_FILE      - Path to vLLM's bake file (default: docker/docker-bake.hcl)
+#   BUILDER_NAME        - Name for buildx builder (default: vllm-builder)
+#
+# Build configuration (passed through to bake via environment):
+#   BUILDKITE_COMMIT    - Git commit (auto-detected from Buildkite)
+#   PARENT_COMMIT       - Parent commit (HEAD~1) for cache fallback (auto-computed)
+#   IMAGE_TAG           - Primary image tag
+#   IMAGE_TAG_LATEST    - Latest tag (optional)
+#   CACHE_FROM          - Cache source
+#   CACHE_FROM_BASE     - Base branch cache source
+#   CACHE_FROM_MAIN     - Main branch cache source
+#   CACHE_TO            - Cache destination
+#   VLLM_USE_PRECOMPILED    - Use precompiled wheels
+#   VLLM_MERGE_BASE_COMMIT  - Merge base commit for precompiled
+
+set -euo pipefail
+
+# Check if image already exists (skip build if it does)
+if [[ -n "${IMAGE_TAG:-}" ]]; then
+    echo "--- :mag: Checking if image exists"
+    if docker manifest inspect "${IMAGE_TAG}" >/dev/null 2>&1; then
+        echo "Image already exists: ${IMAGE_TAG}"
+        echo "Skipping build"
+        exit 0
+    fi
+    echo "Image not found, proceeding with build"
+fi
+
+# Configuration with defaults
+TARGET="${1:-test-ci}"
+CI_HCL_URL="${CI_HCL_URL:-https://raw.githubusercontent.com/vllm-project/ci-infra/${VLLM_CI_BRANCH:-main}/docker/ci.hcl}"
+VLLM_BAKE_FILE="${VLLM_BAKE_FILE:-docker/docker-bake.hcl}"
+BUILDER_NAME="${BUILDER_NAME:-vllm-builder}"
+CI_HCL_PATH="/tmp/ci.hcl"
+BUILDKIT_SOCKET="/run/buildkit/buildkitd.sock"
+
+echo "--- :docker: Setting up Docker buildx bake"
+echo "Target: ${TARGET}"
+echo "CI HCL URL: ${CI_HCL_URL}"
+echo "vLLM bake file: ${VLLM_BAKE_FILE}"
+
+
+# Check if vLLM bake file exists
+if [[ ! -f "${VLLM_BAKE_FILE}" ]]; then
+    echo "Error: vLLM bake file not found at ${VLLM_BAKE_FILE}"
+    echo "Make sure you're running from the vLLM repository root"
+    exit 1
+fi
+
+# Download ci.hcl
+echo "--- :arrow_down: Downloading ci.hcl"
+curl -sSfL -o "${CI_HCL_PATH}" "${CI_HCL_URL}"
+echo "Downloaded to ${CI_HCL_PATH}"
+
+# Set up buildx builder
+# Priority: 1) local buildkitd socket (custom AMI) 2) existing builder 3) new docker-container builder
+echo "--- :buildkite: Setting up buildx builder"
+
+if [[ -S "${BUILDKIT_SOCKET}" ]]; then
+    # Custom AMI with standalone buildkitd - use remote driver for warm cache
+    echo "✅ Found local buildkitd socket at ${BUILDKIT_SOCKET}"
+    echo "Using remote driver to connect to buildkitd (warm cache available)"
+
+    # Check if baked-vllm-builder already exists and is using the socket
+    if docker buildx inspect baked-vllm-builder >/dev/null 2>&1; then
+        echo "Using existing baked-vllm-builder"
+        docker buildx use baked-vllm-builder
+    else
+        echo "Creating baked-vllm-builder with remote driver"
+        docker buildx create \
+            --name baked-vllm-builder \
+            --driver remote \
+            --use \
+            "unix://${BUILDKIT_SOCKET}"
+    fi
+    docker buildx inspect --bootstrap
+elif docker buildx inspect "${BUILDER_NAME}" >/dev/null 2>&1; then
+    # Existing builder available
+    echo "Using existing builder: ${BUILDER_NAME}"
+    docker buildx use "${BUILDER_NAME}"
+    docker buildx inspect --bootstrap
+else
+    # No local buildkitd, no existing builder - create new docker-container builder
+    echo "No local buildkitd found, using docker-container driver"
+    docker buildx create --name "${BUILDER_NAME}" --driver docker-container --use
+    docker buildx inspect --bootstrap
+fi
+
+# Show builder info
+echo "Active builder:"
+docker buildx ls | grep -E '^NAME|^[^[:space:]]+ ?\*' || docker buildx ls
+
+# Deepen shallow clones so HEAD~1 and merge-base are available.
+# Buildkite agents often clone with --depth=1; without deepening, git rev-parse
+# HEAD~1 and git merge-base both silently fail, disabling the per-commit cache layers.
+if git rev-parse --is-shallow-repository 2>/dev/null | grep -q "true"; then
+    echo "Shallow clone detected — deepening for cache key computation"
+    # --deepen=1 extends the current shallow clone by 1 commit along the
+    # already-fetched branch, making HEAD~1 available.  Unlike --depth=2
+    # with a refspec, it operates on the currently checked-out branch and
+    # is safe in detached-HEAD (Buildkite) checkout state.
+    git fetch --deepen=1 origin 2>/dev/null || true
+fi
+
+# Compute parent commit for cache fallback (if not already set)
+if [[ -z "${PARENT_COMMIT:-}" ]]; then
+    PARENT_COMMIT=$(git rev-parse HEAD~1 2>/dev/null || echo "")
+    if [[ -n "${PARENT_COMMIT}" ]]; then
+        echo "Computed parent commit for cache fallback: ${PARENT_COMMIT}"
+        export PARENT_COMMIT
+    else
+        echo "Could not determine parent commit (may be first commit in repo)"
+    fi
+else
+    echo "Using provided PARENT_COMMIT: ${PARENT_COMMIT}"
+fi
+
+# Compute merge-base with main for an additional cache fallback layer.
+# Mirrors the VLLM_MERGE_BASE_COMMIT pattern used in ci.hcl (CUDA).
+# Useful for long-lived PRs where parent-commit cache may be missing but the
+# merge-base (a real main commit) maps to a warm :rocm-latest snapshot.
+if [[ -z "${VLLM_MERGE_BASE_COMMIT:-}" ]]; then
+    # Fetch just the tip of main so merge-base can be resolved on shallow clones.
+    git fetch --depth=1 origin main 2>/dev/null || true
+    VLLM_MERGE_BASE_COMMIT=$(git merge-base HEAD origin/main 2>/dev/null || echo "")
+    if [[ -n "${VLLM_MERGE_BASE_COMMIT}" ]]; then
+        echo "Computed merge base commit for cache fallback: ${VLLM_MERGE_BASE_COMMIT}"
+        export VLLM_MERGE_BASE_COMMIT
+    else
+        echo "Could not determine merge base (will skip that cache layer)"
+    fi
+else
+    echo "Using provided VLLM_MERGE_BASE_COMMIT: ${VLLM_MERGE_BASE_COMMIT}"
+fi
+
+# Print resolved configuration for debugging and upload as a Buildkite artifact
+echo "--- :page_facing_up: Resolved bake configuration"
+BAKE_CONFIG_FILE="bake-config-build-${BUILDKITE_BUILD_NUMBER:-local}.json"
+docker buildx bake -f "${VLLM_BAKE_FILE}" -f "${CI_HCL_PATH}" --print "${TARGET}" | tee "${BAKE_CONFIG_FILE}" || true
+if command -v buildkite-agent >/dev/null 2>&1 && [[ -n "${BUILDKITE_BUILD_NUMBER:-}" ]]; then
+    # Best effort: an upload failure must not fail the build, but must not be logged as a success.
+    buildkite-agent artifact upload "${BAKE_CONFIG_FILE}" && echo "Uploaded ${BAKE_CONFIG_FILE} as Buildkite artifact" || echo "WARNING: failed to upload ${BAKE_CONFIG_FILE}"
+else
+    echo "Saved bake config to ${BAKE_CONFIG_FILE} (not in Buildkite, skipping upload)"
+fi
+
+# Run the actual build
+echo "--- :docker: Building ${TARGET}"
+docker buildx bake -f "${VLLM_BAKE_FILE}" -f "${CI_HCL_PATH}" --progress plain "${TARGET}"
+
+echo "--- :white_check_mark: Build complete"
diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml
index a4c98f86ee07..ad04b0ed50e3 100644
--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
@@ -97,10 +97,10 @@ steps:
   mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a]
   agent_pool: mi250_1
   source_file_dependencies:
-  - tests/standalone_tests/python_only_compile.sh
+  - tests/standalone_tests/python_only_compile_rocm.sh
   - setup.py
   commands:
-  - bash standalone_tests/python_only_compile.sh
+  - bash standalone_tests/python_only_compile_rocm.sh
 
 - label: Basic Correctness Test # 20min
   timeout_in_minutes: 30
@@ -1429,12 +1429,11 @@ steps:
   mirror_hardwares: [amdexperimental]
   agent_pool: mi325_1
   optional: true
-  # grade: Blocking
   source_file_dependencies:
-  - tests/standalone_tests/python_only_compile.sh
+  - tests/standalone_tests/python_only_compile_rocm.sh
   - setup.py
   commands:
-  - bash standalone_tests/python_only_compile.sh
+  - bash standalone_tests/python_only_compile_rocm.sh
 
 - label: Basic Correctness Test # 20min
   timeout_in_minutes: 30
@@ -3189,10 +3188,10 @@ steps:
   agent_pool: mi355_1
   optional: true
   source_file_dependencies:
-  - tests/standalone_tests/python_only_compile.sh
+  - tests/standalone_tests/python_only_compile_rocm.sh
   - setup.py
   commands:
-  - bash standalone_tests/python_only_compile.sh
+  - bash standalone_tests/python_only_compile_rocm.sh
 
 - label: Basic Correctness Test # 20min
   timeout_in_minutes: 30
diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm
index f8a4274a179f..37f1c487e4c0 100644
--- a/docker/Dockerfile.rocm
+++ b/docker/Dockerfile.rocm
@@ -19,7 +19,9 @@ ENV PYTORCH_ROCM_ARCH=${ARG_PYTORCH_ROCM_ARCH:-${PYTORCH_ROCM_ARCH}}
 # Install some basic utilities
 RUN apt-get update -q -y && apt-get install -q -y \
     sqlite3 libsqlite3-dev libfmt-dev libmsgpack-dev libsuitesparse-dev \
-    apt-transport-https ca-certificates wget curl
+    apt-transport-https ca-certificates wget curl \
+    ccache mold \
+    && update-alternatives --install /usr/bin/ld ld /usr/bin/mold 100
 RUN python3 -m pip install --upgrade pip
 # Remove sccache only if not using sccache (it exists in base image from Dockerfile.rocm_base)
 ARG USE_SCCACHE
@@ -38,6 +40,11 @@ ENV UV_HTTP_TIMEOUT=500
 ENV UV_INDEX_STRATEGY="unsafe-best-match"
 # Use copy mode to avoid hardlink failures with Docker cache mounts
 ENV UV_LINK_MODE=copy
+# ccache directory — persisted across layer rebuilds via --mount=type=cache
+ENV CCACHE_DIR=/root/.cache/ccache
+# Compilation parallelism — set via --build-arg max_jobs=N; when empty, build steps fall back to $(nproc)
+ARG max_jobs
+ENV MAX_JOBS=${max_jobs}
 
 # Install sccache if USE_SCCACHE is enabled (for release builds)
 ARG USE_SCCACHE
@@ -94,14 +101,49 @@ ONBUILD RUN git clone ${VLLM_REPO} \
                && git fetch upstream ; fi
 FROM fetch_vllm_${REMOTE_VLLM} AS fetch_vllm
 
+# -----------------------
+# HIP kernel compilation stage (csrc-build-rocm)
+#
+# Intentionally copies ONLY build-critical files (CMakeLists.txt, csrc/, cmake/)
+# so that Python-only changes to vllm/*.py do NOT invalidate this expensive layer.
+# BuildKit's registry cache (--cache-from) reuses this layer across commits
+# whenever only Python code changed — turning a 2-hour HIP recompile into seconds.
+#
+# Note: only applies when REMOTE_VLLM=0 (default), so the build context
+# contains the vllm source. Release builds (REMOTE_VLLM=1) always compile fully.
+FROM base AS csrc-build-rocm
+ARG COMMON_WORKDIR
+WORKDIR ${COMMON_WORKDIR}/vllm
+# Copy only files HIP compilation depends on — vllm/**/*.py changes don't bust this
+COPY requirements/rocm-build.txt requirements/rocm-build.txt
+COPY pyproject.toml setup.py CMakeLists.txt ./
+COPY cmake cmake/
+COPY csrc csrc/
+COPY vllm/envs.py vllm/envs.py
+COPY vllm/__init__.py vllm/__init__.py
+# Dummy version prevents git-state from busting the cache key on every commit
+ENV SETUPTOOLS_SCM_PRETEND_VERSION="0.0.0+csrc.build"
+RUN --mount=type=cache,target=/root/.cache/ccache \
+    --mount=type=cache,target=/root/.cache/uv \
+    uv pip install --system -r requirements/rocm-build.txt \
+    && MAX_JOBS="${MAX_JOBS:-$(nproc)}" python3 setup.py bdist_wheel --dist-dir=dist
+
 # -----------------------
 # vLLM build stages
 FROM fetch_vllm AS build_vllm
-# Build vLLM (setup.py auto-detects sccache in PATH)
-RUN cd vllm \
-    && python3 -m pip install -r requirements/rocm.txt \
-    && python3 setup.py clean --all  \
-    && python3 setup.py bdist_wheel --dist-dir=dist
+ARG COMMON_WORKDIR
+# Re-use the pre-built HIP kernel wheel from csrc-build-rocm.
+# When VLLM_PRECOMPILED_WHEEL_LOCATION is set, setup.py extracts the compiled
+# .so files from this wheel instead of recompiling HIP kernels.
+# Python-only changes complete in minutes instead of hours.
+COPY --from=csrc-build-rocm ${COMMON_WORKDIR}/vllm/dist /precompiled-wheels
+ENV VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX=1
+RUN --mount=type=cache,target=/root/.cache/ccache \
+    --mount=type=cache,target=/root/.cache/uv \
+    cd vllm \
+    && uv pip install --system -r requirements/rocm-build.txt \
+    && export VLLM_PRECOMPILED_WHEEL_LOCATION=$(ls /precompiled-wheels/*.whl) \
+    && MAX_JOBS="${MAX_JOBS:-$(nproc)}" python3 setup.py bdist_wheel --dist-dir=dist
 FROM scratch AS export_vllm
 ARG COMMON_WORKDIR
 COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/dist/*.whl /
@@ -143,12 +185,14 @@ RUN apt-get -y update && apt-get -y install autoconf libtool pkg-config \
 
 RUN uv pip install --system meson auditwheel patchelf tomlkit
 
-RUN cd /usr/local/src && \
+RUN --mount=type=cache,target=/root/.cache/ccache \
+    cd /usr/local/src && \
     git clone ${UCX_REPO} &&  \
     cd ucx  && \
     git checkout ${UCX_BRANCH} && \
     ./autogen.sh && \
     mkdir build && cd build && \
+    CC="ccache gcc" CXX="ccache g++" \
     ../configure \
         --prefix=/usr/local/ucx \
         --enable-shared \
@@ -160,20 +204,22 @@ RUN cd /usr/local/src && \
         --with-verbs \
         --with-dm \
         --enable-mt && \
-    make -j && \
+    make -j$(nproc) && \
     make install
 
 ENV PATH=/usr/local/ucx/bin:$PATH
 ENV LD_LIBRARY_PATH=${UCX_HOME}/lib:${LD_LIBRARY_PATH}
 
-RUN git clone ${RIXL_REPO} /opt/rixl && \
+RUN --mount=type=cache,target=/root/.cache/ccache \
+    git clone ${RIXL_REPO} /opt/rixl && \
     cd /opt/rixl && \
     git checkout ${RIXL_BRANCH} && \
+    CC="ccache gcc" CXX="ccache g++" \
     meson setup build --prefix=${RIXL_HOME} \
                      -Ducx_path=${UCX_HOME} \
                      -Drocm_path=${ROCM_PATH} && \
     cd build && \
-    ninja && \
+    ninja -j$(nproc) && \
     ninja install
 
 # Generate RIXL wheel
@@ -184,34 +230,43 @@ RUN cd /opt/rixl && mkdir -p /app/install && \
         --ucx-plugins-dir ${UCX_HOME}/lib/ucx \
         --nixl-plugins-dir ${RIXL_HOME}/lib/x86_64-linux-gnu/plugins
 
-# DeepEP build stage
-FROM base AS build_deep
+# -----------------------
+# ROCShmem build stage — split from DeepEP so changing DEEPEP_BRANCH
+# does not invalidate the slow cmake+make ROCSHMEM build.
+FROM base AS build_rocshmem
 ARG ROCSHMEM_BRANCH="ba0bf0f3"
 ARG ROCSHMEM_REPO="https://github.com/ROCm/rocm-systems.git"
-ARG DEEPEP_BRANCH="e84464ec"
-ARG DEEPEP_REPO="https://github.com/ROCm/DeepEP.git"
-ARG DEEPEP_NIC="cx7"
 ENV ROCSHMEM_DIR=/opt/rocshmem
 
-RUN git clone ${ROCSHMEM_REPO} \
+RUN --mount=type=cache,target=/root/.cache/ccache \
+    git clone ${ROCSHMEM_REPO} \
  && cd rocm-systems \
  && git checkout ${ROCSHMEM_BRANCH} \
  && mkdir -p projects/rocshmem/build \
  && cd projects/rocshmem/build \
- && cmake .. \
+ && CC="ccache gcc" CXX="ccache g++" cmake .. \
     -DCMAKE_INSTALL_PREFIX="${ROCSHMEM_DIR}" \
     -DROCM_PATH=/opt/rocm \
     -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
     -DUSE_EXTERNAL_MPI=OFF \
- && make -j \
+ && make -j$(nproc) \
  && make install
 
-# Build DeepEP wheel.
-# DeepEP looks for rocshmem at ROCSHMEM_DIR.
-RUN git clone ${DEEPEP_REPO} \
+# -----------------------
+# DeepEP build stage — depends on ROCShmem, builds the HIP kernel wheel.
+# Kept separate so the ROCShmem layer above is reused when only DEEPEP_BRANCH changes.
+FROM build_rocshmem AS build_deepep
+ARG DEEPEP_BRANCH="e84464ec"
+ARG DEEPEP_REPO="https://github.com/ROCm/DeepEP.git"
+ARG DEEPEP_NIC="cx7"
+
+# Build DeepEP wheel. DeepEP looks for rocshmem at ROCSHMEM_DIR (inherited from build_rocshmem).
+RUN --mount=type=cache,target=/root/.cache/ccache \
+    git clone ${DEEPEP_REPO} \
  && cd DeepEP \
  && git checkout ${DEEPEP_BRANCH} \
- && python3 setup.py --variant rocm --nic ${DEEPEP_NIC} bdist_wheel --dist-dir=/app/deep_install
+ && MAX_JOBS="${MAX_JOBS:-$(nproc)}" python3 setup.py --variant rocm --nic ${DEEPEP_NIC} bdist_wheel --dist-dir=/app/deep_install
+
 
 # -----------------------
 # vLLM wheel release build stage (for building distributable wheels)
@@ -252,8 +307,9 @@ RUN if [ "$GIT_REPO_CHECK" != "0" ]; then \
 # Extract version from git BEFORE any modifications (pin_rocm_dependencies.py modifies requirements/rocm.txt)
 # This ensures setuptools_scm sees clean repo state for version detection
 RUN --mount=type=bind,source=.git,target=vllm/.git \
+    --mount=type=cache,target=/root/.cache/uv \
     cd vllm \
-    && pip install setuptools_scm regex \
+    && uv pip install --system setuptools_scm regex \
     && VLLM_VERSION=$(python3 -c "import setuptools_scm; print(setuptools_scm.get_version())") \
     && echo "Detected vLLM version: ${VLLM_VERSION}" \
     && echo "${VLLM_VERSION}" > /tmp/vllm_version.txt
@@ -289,18 +345,19 @@ RUN echo "Pinning vLLM dependencies to custom wheel versions..." \
     && python3 /tmp/pin_rocm_dependencies.py /install ${COMMON_WORKDIR}/vllm/requirements/rocm.txt
 
 # Install dependencies using custom wheels from /install
-RUN cd vllm \
+RUN --mount=type=cache,target=/root/.cache/uv \
+    cd vllm \
     && echo "Building vLLM with custom wheels from /install" \
-    && python3 -m pip install --find-links /install -r requirements/rocm.txt \
-    && python3 setup.py clean --all
+    && uv pip install --system --find-links /install -r requirements/rocm.txt
 
 # Build wheel using pre-extracted version to avoid dirty state from modified requirements/rocm.txt
-# (setup.py auto-detects sccache in PATH)
+# (setup.py auto-detects ccache/sccache in PATH)
 RUN --mount=type=bind,source=.git,target=vllm/.git \
+    --mount=type=cache,target=/root/.cache/ccache \
     cd vllm \
     && export SETUPTOOLS_SCM_PRETEND_VERSION=$(cat /tmp/vllm_version.txt) \
     && echo "Building wheel with version: ${SETUPTOOLS_SCM_PRETEND_VERSION}" \
-    && python3 setup.py bdist_wheel --dist-dir=dist
+    && MAX_JOBS="${MAX_JOBS:-$(nproc)}" python3 setup.py bdist_wheel --dist-dir=dist
 
 FROM scratch AS export_vllm_wheel_release
 ARG COMMON_WORKDIR
@@ -321,22 +378,33 @@ RUN python3 -m pip install --upgrade pip && rm -rf /var/lib/apt/lists/*
 
 # Install vLLM using uv (inherited from base stage)
 # Note: No -U flag to avoid upgrading PyTorch ROCm to CUDA version
+# Note: rocm-test.txt contains a git+ URL (fastsafetensors) that uv cannot resolve;
+#       we install non-git requirements with uv and git+ requirements with pip separately.
 RUN --mount=type=bind,from=export_vllm,src=/,target=/install \
     --mount=type=cache,target=/root/.cache/uv \
+    --mount=type=cache,target=/root/.cache/pip \
     cd /install \
     && uv pip install --system -r requirements/rocm.txt \
-    && uv pip install --system -r requirements/rocm-test.txt \
+    && grep -v 'git+' requirements/rocm-test.txt | uv pip install --system -r /dev/stdin \
+    && { grep 'git+' requirements/rocm-test.txt > /tmp/git-reqs.txt || [ "$?" -eq 1 ]; } \
+    && { [ ! -s /tmp/git-reqs.txt ] || pip install --no-deps -r /tmp/git-reqs.txt; } \
+    && rm /tmp/git-reqs.txt \
     && pip uninstall -y vllm \
     && uv pip install --system *.whl
 
+# Store the vLLM wheel in the image for python_only_compile_rocm.sh.
+# The wheel is only available via bind mount during the RUN above; we need it
+# accessible at test runtime so the python-only compile test can reinstall
+# vLLM without a compiler (no wheels.vllm.ai equivalent exists for ROCm).
+COPY --from=export_vllm /*.whl /opt/vllm-wheels/
+
 # Install RIXL wheel
 RUN --mount=type=bind,from=build_rixl,src=/app/install,target=/rixl_install \
     uv pip install --system /rixl_install/*.whl
 
 # Install DeepEP wheel
-RUN --mount=type=bind,from=build_deep,src=/app/deep_install,target=/deep_install \
+RUN --mount=type=bind,from=build_deepep,src=/app/deep_install,target=/deep_install \
     uv pip install --system /deep_install/*.whl
-COPY --from=build_deep /opt/rocshmem /opt/rocshmem
 
 # RIXL/MoRIIO runtime dependencies (RDMA userspace libraries)
 RUN apt-get update -q -y && apt-get install -q -y \
@@ -351,9 +419,9 @@ ARG COMMON_WORKDIR
 COPY --from=build_vllm ${COMMON_WORKDIR}/vllm /vllm-workspace
 
 # install development dependencies (for testing)
-RUN cd /vllm-workspace \
-    && python3 -m pip install -e tests/vllm_test_utils \
-    && python3 -m pip install pytest-shard
+RUN --mount=type=cache,target=/root/.cache/uv \
+    cd /vllm-workspace \
+    && uv pip install --system -e tests/vllm_test_utils pytest-shard
 
 # enable fast downloads from hf (for testing)
 RUN --mount=type=cache,target=/root/.cache/uv \
@@ -363,7 +431,8 @@ ENV HF_HUB_ENABLE_HF_TRANSFER=1
 # install audio decode package `torchcodec` from source (required due to 
 # ROCm and torch version mismatch) for tests with datasets package
 COPY tools/install_torchcodec_rocm.sh /tmp/install_torchcodec.sh
-RUN bash /tmp/install_torchcodec.sh \
+RUN --mount=type=cache,target=/root/.cache/pip \
+    bash /tmp/install_torchcodec.sh \
     && rm /tmp/install_torchcodec.sh \
     && apt-get clean \
     && rm -rf /var/lib/apt/lists/*
@@ -418,6 +487,10 @@ RUN --mount=type=bind,from=export_vllm,src=/,target=/install \
     && pip uninstall -y vllm \
     && uv pip install --system *.whl
 
+# Install DeepEP wheel
+RUN --mount=type=bind,from=build_deepep,src=/app/deep_install,target=/deep_install \
+    uv pip install --system /deep_install/*.whl
+
 ARG COMMON_WORKDIR
 ARG BASE_IMAGE
 
diff --git a/docker/docker-bake-rocm.hcl b/docker/docker-bake-rocm.hcl
new file mode 100644
index 000000000000..2e3525d6ee2c
--- /dev/null
+++ b/docker/docker-bake-rocm.hcl
@@ -0,0 +1,81 @@
+# docker-bake-rocm.hcl - vLLM ROCm Docker build configuration
+#
+# This file lives in the vLLM repo at docker/docker-bake-rocm.hcl
+# Equivalent of docker-bake.hcl for ROCm builds.
+#
+# Usage:
+#   docker buildx bake -f docker/docker-bake-rocm.hcl              # Build test (default)
+#   docker buildx bake -f docker/docker-bake-rocm.hcl final-rocm   # Build final image
+#   docker buildx bake -f docker/docker-bake-rocm.hcl --print      # Show resolved config
+#
+# CI usage (with ci-rocm.hcl overlay from ci-infra):
+#   docker buildx bake -f docker/docker-bake-rocm.hcl -f /tmp/ci-rocm.hcl test-rocm-ci
+
+variable "MAX_JOBS" {
+  # Empty string lets the Dockerfile fall back to $(nproc) via
+  # MAX_JOBS="${MAX_JOBS:-$(nproc)}" in each RUN step, which uses all
+  # available cores on whatever machine the build runs on.
+  # Override with --set '*.args.max_jobs=8' for local builds on small machines.
+  default = ""
+}
+
+variable "PYTORCH_ROCM_ARCH" {
+  default = "gfx90a;gfx942;gfx950"
+}
+
+variable "COMMIT" {
+  default = ""
+}
+
+# REMOTE_VLLM=0: use local source via Docker build context (ONBUILD COPY ./ vllm/)
+# REMOTE_VLLM=1: clone from GitHub at VLLM_BRANCH (standalone builds without local source)
+variable "REMOTE_VLLM" {
+  default = "0"
+}
+
+variable "VLLM_BRANCH" {
+  default = "main"
+}
+
+group "default" {
+  targets = ["test-rocm"]
+}
+
+target "_common-rocm" {
+  dockerfile = "docker/Dockerfile.rocm"
+  context    = "."
+  args = {
+    max_jobs              = MAX_JOBS
+    ARG_PYTORCH_ROCM_ARCH = PYTORCH_ROCM_ARCH
+    REMOTE_VLLM           = REMOTE_VLLM
+    VLLM_BRANCH           = VLLM_BRANCH
+  }
+}
+
+target "_labels" {
+  labels = {
+    "org.opencontainers.image.source"      = "https://github.com/vllm-project/vllm"
+    "org.opencontainers.image.vendor"      = "vLLM"
+    "org.opencontainers.image.title"       = "vLLM ROCm"
+    "org.opencontainers.image.description" = "vLLM: A high-throughput and memory-efficient inference and serving engine for LLMs (ROCm)"
+    "org.opencontainers.image.licenses"    = "Apache-2.0"
+    "org.opencontainers.image.revision"    = COMMIT
+  }
+  annotations = [
+    "index,manifest:org.opencontainers.image.revision=${COMMIT}",
+  ]
+}
+
+target "test-rocm" {
+  inherits = ["_common-rocm", "_labels"]
+  target   = "test"
+  tags     = ["rocm/vllm:test"]
+  output   = ["type=docker"]
+}
+
+target "final-rocm" {
+  inherits = ["_common-rocm", "_labels"]
+  target   = "final"
+  tags     = ["rocm/vllm:latest"]
+  output   = ["type=docker"]
+}
diff --git a/tests/standalone_tests/python_only_compile_rocm.sh b/tests/standalone_tests/python_only_compile_rocm.sh
new file mode 100644
index 000000000000..0760eb413872
--- /dev/null
+++ b/tests/standalone_tests/python_only_compile_rocm.sh
@@ -0,0 +1,71 @@
+#!/bin/bash
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# ROCm equivalent of python_only_compile.sh.
+#
+# Goal: verify that a user without any C/C++ compiler can install and import
+# vLLM from a pre-built ROCm wheel (i.e., all HIP kernel .so files are already
+# compiled into the wheel — no recompilation is triggered at install time).
+#
+# This differs from the CUDA version in one key way: there is no
+# wheels.vllm.ai equivalent for ROCm, so we reinstall from the wheel that was
+# baked into the test image at /opt/vllm-wheels/ during the Docker build
+# (COPY --from=export_vllm /*.whl /opt/vllm-wheels/ in Dockerfile.rocm).
+
+set -e
+
+WHEEL_DIR="/opt/vllm-wheels"
+
+echo "=== ROCm Python-only Installation Test ==="
+echo "Verifies vLLM is installable and importable without a C++ compiler."
+echo ""
+
+# Confirm the wheel is present in the image
+if ! ls "${WHEEL_DIR}"/*.whl &>/dev/null; then
+    echo "ERROR: No wheel found at ${WHEEL_DIR}/*.whl"
+    echo "The Dockerfile.rocm test stage must have COPY --from=export_vllm /*.whl /opt/vllm-wheels/"
+    exit 1
+fi
+
+WHEEL_PATH=$(ls "${WHEEL_DIR}"/*.whl | head -1)
+echo "Found wheel: ${WHEEL_PATH}"
+
+cd /vllm-workspace/
+
+# Restore the vllm source tree so __init__.py can be patched
+# (same pattern as the CUDA python_only_compile.sh)
+pip3 uninstall -y vllm
+mv src/vllm ./vllm
+
+# Sentinel: append a side-effect to __init__.py so we can verify the installed
+# code actually ran (not a cached .pyc from the previous install)
+echo 'import os; os.system("touch /tmp/rocm_python_only.file")' >> vllm/__init__.py
+
+echo ""
+echo "=== Removing C/C++ compilers ==="
+apt-get remove --purge build-essential -y
+apt-get autoremove -y
+echo "Compilers removed. Verifying cc/g++ are gone:"
+if command -v cc  >/dev/null 2>&1; then echo "  WARNING: cc is still installed";  else echo "  cc:  not found (expected)"; fi
+if command -v g++ >/dev/null 2>&1; then echo "  WARNING: g++ is still installed"; else echo "  g++: not found (expected)"; fi
+
+echo ""
+echo "=== Installing vLLM from pre-built wheel (no compiler) ==="
+echo "Wheel: ${WHEEL_PATH}"
+# --no-build-isolation + --no-deps: install exactly the wheel, no setup.py
+# compilation triggered; HIP .so files are already inside the wheel.
+pip3 install --no-build-isolation --no-deps "${WHEEL_PATH}"
+
+echo ""
+echo "=== Importing vLLM ==="
+python3 -c 'import vllm; print(f"vLLM {vllm.__version__} imported successfully")'
+
+# Verify our sentinel side-effect fired (confirms the patched __init__.py ran)
+if [ ! -f /tmp/rocm_python_only.file ]; then
+    echo "ERROR: sentinel file not created — python-only installation failed"
+    exit 1
+fi
+
+echo ""
+echo "=== ROCm Python-only Installation Test PASSED ==="

From 4716340ba3fac328a32bcfd058045544690acd24 Mon Sep 17 00:00:00 2001
From: Andreas Karatzas <akaratza@amd.com>
Date: Thu, 12 Mar 2026 22:45:02 -0500
Subject: [PATCH 02/23] [ROCm][CI] Add ROCm Docker Hub registry cache and
 weekly cleanup pipeline

Signed-off-by: Andreas Karatzas <akaratza@amd.com>
---
 .buildkite/scripts/ci-bake.sh | 12 ++++++------
 docker/Dockerfile.rocm        |  6 ++++++
 2 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/.buildkite/scripts/ci-bake.sh b/.buildkite/scripts/ci-bake.sh
index 410c5c352366..f40c4a2a37dd 100644
--- a/.buildkite/scripts/ci-bake.sh
+++ b/.buildkite/scripts/ci-bake.sh
@@ -82,14 +82,14 @@ if [[ -S "${BUILDKIT_SOCKET}" ]]; then
     echo "✅ Found local buildkitd socket at ${BUILDKIT_SOCKET}"
     echo "Using remote driver to connect to buildkitd (warm cache available)"
 
-    # Check if baked-vllm-builder already exists and is using the socket
-    if docker buildx inspect baked-vllm-builder >/dev/null 2>&1; then
-        echo "Using existing baked-vllm-builder"
-        docker buildx use baked-vllm-builder
+    # Reuse ${BUILDER_NAME} if it already exists (its driver/endpoint is not re-checked)
+    if docker buildx inspect "${BUILDER_NAME}" >/dev/null 2>&1; then
+        echo "Using existing builder: ${BUILDER_NAME}"
+        docker buildx use "${BUILDER_NAME}"
     else
-        echo "Creating baked-vllm-builder with remote driver"
+        echo "Creating builder '${BUILDER_NAME}' with remote driver"
         docker buildx create \
-            --name baked-vllm-builder \
+            --name "${BUILDER_NAME}" \
             --driver remote \
             --use \
             "unix://${BUILDKIT_SOCKET}"
diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm
index 37f1c487e4c0..528c4c5d85d4 100644
--- a/docker/Dockerfile.rocm
+++ b/docker/Dockerfile.rocm
@@ -406,6 +406,9 @@ RUN --mount=type=bind,from=build_rixl,src=/app/install,target=/rixl_install \
 RUN --mount=type=bind,from=build_deepep,src=/app/deep_install,target=/deep_install \
     uv pip install --system /deep_install/*.whl
 
+# Copy rocshmem runtime libraries
+COPY --from=build_rocshmem /opt/rocshmem /opt/rocshmem
+
 # RIXL/MoRIIO runtime dependencies (RDMA userspace libraries)
 RUN apt-get update -q -y && apt-get install -q -y \
     librdmacm1 \
@@ -491,6 +494,9 @@ RUN --mount=type=bind,from=export_vllm,src=/,target=/install \
 RUN --mount=type=bind,from=build_deepep,src=/app/deep_install,target=/deep_install \
     uv pip install --system /deep_install/*.whl
 
+# Copy rocshmem runtime libraries
+COPY --from=build_rocshmem /opt/rocshmem /opt/rocshmem
+
 ARG COMMON_WORKDIR
 ARG BASE_IMAGE
 

From f7086c27bf15ace333b204261685085af7a00ddc Mon Sep 17 00:00:00 2001
From: Andreas Karatzas <akaratza@amd.com>
Date: Fri, 13 Mar 2026 14:53:12 -0500
Subject: [PATCH 03/23] [ROCm][CI] Add ROCm Docker Hub registry cache and
 weekly cleanup pipeline

Signed-off-by: Andreas Karatzas <akaratza@amd.com>
---
 docker/Dockerfile.rocm | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm
index 528c4c5d85d4..299ea05118b0 100644
--- a/docker/Dockerfile.rocm
+++ b/docker/Dockerfile.rocm
@@ -16,12 +16,14 @@ FROM ${BASE_IMAGE} AS base
 ARG ARG_PYTORCH_ROCM_ARCH
 ENV PYTORCH_ROCM_ARCH=${ARG_PYTORCH_ROCM_ARCH:-${PYTORCH_ROCM_ARCH}}
 
-# Install some basic utilities
+# Install build dependencies and utilities
 RUN apt-get update -q -y && apt-get install -q -y \
     sqlite3 libsqlite3-dev libfmt-dev libmsgpack-dev libsuitesparse-dev \
     apt-transport-https ca-certificates wget curl \
-    ccache mold \
-    && update-alternatives --install /usr/bin/ld ld /usr/bin/mold 100
+    ccache mold
+# Use mold as the default linker — significantly faster than GNU ld/gold for
+# the large C++ link steps in ROCm extension builds (e.g. vLLM, DeepEP, FA).
+RUN update-alternatives --install /usr/bin/ld ld /usr/bin/mold 100
 RUN python3 -m pip install --upgrade pip
 # Remove sccache only if not using sccache (it exists in base image from Dockerfile.rocm_base)
 ARG USE_SCCACHE
@@ -115,7 +117,8 @@ FROM base AS csrc-build-rocm
 ARG COMMON_WORKDIR
 WORKDIR ${COMMON_WORKDIR}/vllm
 # Copy only files HIP compilation depends on — vllm/**/*.py changes don't bust this
-COPY requirements/rocm-build.txt requirements/rocm-build.txt
+COPY requirements/common.txt requirements/common.txt
+COPY requirements/rocm.txt requirements/rocm.txt
 COPY pyproject.toml setup.py CMakeLists.txt ./
 COPY cmake cmake/
 COPY csrc csrc/
@@ -125,7 +128,7 @@ COPY vllm/__init__.py vllm/__init__.py
 ENV SETUPTOOLS_SCM_PRETEND_VERSION="0.0.0+csrc.build"
 RUN --mount=type=cache,target=/root/.cache/ccache \
     --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system -r requirements/rocm-build.txt \
+    uv pip install --system -r requirements/rocm.txt \
     && MAX_JOBS="${MAX_JOBS:-$(nproc)}" python3 setup.py bdist_wheel --dist-dir=dist
 
 # -----------------------
@@ -141,7 +144,7 @@ ENV VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX=1
 RUN --mount=type=cache,target=/root/.cache/ccache \
     --mount=type=cache,target=/root/.cache/uv \
     cd vllm \
-    && uv pip install --system -r requirements/rocm-build.txt \
+    && uv pip install --system -r requirements/rocm.txt \
     && export VLLM_PRECOMPILED_WHEEL_LOCATION=$(ls /precompiled-wheels/*.whl) \
     && MAX_JOBS="${MAX_JOBS:-$(nproc)}" python3 setup.py bdist_wheel --dist-dir=dist
 FROM scratch AS export_vllm

From 8e657f02a93435053edb424060c3d55a128b382f Mon Sep 17 00:00:00 2001
From: Andreas Karatzas <akaratza@amd.com>
Date: Fri, 13 Mar 2026 16:38:51 -0500
Subject: [PATCH 04/23] [ROCm][CI] Add ROCm Docker Hub registry cache and
 weekly cleanup pipeline

Signed-off-by: Andreas Karatzas <akaratza@amd.com>
---
 .buildkite/hardware_tests/amd.yaml | 77 ++++++++++++++++++++++++++++++
 CMakeLists.txt                     |  6 ++-
 docker/Dockerfile.rocm             | 38 +--------------
 docker/docker-bake-rocm.hcl        | 46 ++++++++++++++++++
 4 files changed, 129 insertions(+), 38 deletions(-)

diff --git a/.buildkite/hardware_tests/amd.yaml b/.buildkite/hardware_tests/amd.yaml
index 6f96db2110e6..919ef3e3edb2 100644
--- a/.buildkite/hardware_tests/amd.yaml
+++ b/.buildkite/hardware_tests/amd.yaml
@@ -1,5 +1,6 @@
 group: Hardware - AMD Build
 steps:
+  # Image with all architectures
   - label: "AMD: :docker: build image"
     key: image-build-amd
     depends_on: []
@@ -24,3 +25,79 @@ steps:
           limit: 1
         - exit_status: 1    # Machine occasionally fails
           limit: 1
+
+  # Per-architecture images
+  - label: "AMD: :docker: build image (gfx90a)"
+    key: image-build-amd-gfx90a
+    depends_on: []  # no upstream steps declared
+    device: amd_cpu
+    no_plugin: true
+    commands:
+      - bash .buildkite/scripts/ci-bake.sh test-rocm-gfx90a-ci
+    env:
+      DOCKER_BUILDKIT: "1"
+      IMAGE_TAG: "rocm/vllm-ci:${BUILDKITE_COMMIT}-gfx90a"  # commit tag with arch suffix
+      VLLM_BAKE_FILE: "docker/docker-bake-rocm.hcl"
+      CI_HCL_URL: "https://raw.githubusercontent.com/vllm-project/ci-infra/main/docker/ci-rocm.hcl"
+      PYTORCH_ROCM_ARCH: "gfx90a"  # same arch the test-rocm-gfx90a bake target passes as ARG_PYTORCH_ROCM_ARCH
+    timeout_in_minutes: 600  # 10 hours
+    retry:
+      automatic:
+        - exit_status: -1  # NOTE(review): agent lost per Buildkite retry docs — confirm
+          limit: 1
+        - exit_status: -10  # NOTE(review): presumably also an agent-lost code — confirm
+          limit: 1
+        - exit_status: 128  # NOTE(review): presumably a git/checkout failure — confirm
+          limit: 1
+        - exit_status: 1  # Machine occasionally fails
+          limit: 1
+
+  - label: "AMD: :docker: build image (gfx942)"
+    key: image-build-amd-gfx942
+    depends_on: []  # no upstream steps declared
+    device: amd_cpu
+    no_plugin: true
+    commands:
+      - bash .buildkite/scripts/ci-bake.sh test-rocm-gfx942-ci
+    env:
+      DOCKER_BUILDKIT: "1"
+      IMAGE_TAG: "rocm/vllm-ci:${BUILDKITE_COMMIT}-gfx942"  # commit tag with arch suffix
+      VLLM_BAKE_FILE: "docker/docker-bake-rocm.hcl"
+      CI_HCL_URL: "https://raw.githubusercontent.com/vllm-project/ci-infra/main/docker/ci-rocm.hcl"
+      PYTORCH_ROCM_ARCH: "gfx942"  # same arch the test-rocm-gfx942 bake target passes as ARG_PYTORCH_ROCM_ARCH
+    timeout_in_minutes: 600  # 10 hours
+    retry:
+      automatic:
+        - exit_status: -1  # NOTE(review): agent lost per Buildkite retry docs — confirm
+          limit: 1
+        - exit_status: -10  # NOTE(review): presumably also an agent-lost code — confirm
+          limit: 1
+        - exit_status: 128  # NOTE(review): presumably a git/checkout failure — confirm
+          limit: 1
+        - exit_status: 1  # Machine occasionally fails
+          limit: 1
+
+  - label: "AMD: :docker: build image (gfx950)"
+    key: image-build-amd-gfx950
+    depends_on: []  # no upstream steps declared
+    device: amd_cpu
+    no_plugin: true
+    commands:
+      - bash .buildkite/scripts/ci-bake.sh test-rocm-gfx950-ci
+    env:
+      DOCKER_BUILDKIT: "1"
+      IMAGE_TAG: "rocm/vllm-ci:${BUILDKITE_COMMIT}-gfx950"  # commit tag with arch suffix
+      VLLM_BAKE_FILE: "docker/docker-bake-rocm.hcl"
+      CI_HCL_URL: "https://raw.githubusercontent.com/vllm-project/ci-infra/main/docker/ci-rocm.hcl"
+      PYTORCH_ROCM_ARCH: "gfx950"  # same arch the test-rocm-gfx950 bake target passes as ARG_PYTORCH_ROCM_ARCH
+    timeout_in_minutes: 600  # 10 hours
+    retry:
+      automatic:
+        - exit_status: -1  # NOTE(review): agent lost per Buildkite retry docs — confirm
+          limit: 1
+        - exit_status: -10  # NOTE(review): presumably also an agent-lost code — confirm
+          limit: 1
+        - exit_status: 128  # NOTE(review): presumably a git/checkout failure — confirm
+          limit: 1
+        - exit_status: 1  # Machine occasionally fails
+          limit: 1
diff --git a/CMakeLists.txt b/CMakeLists.txt
index bbadfdc5e9e3..35e9716c1a3f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1184,8 +1184,10 @@ if(VLLM_GPU_LANG STREQUAL "HIP")
     WITH_SOABI)
 endif()
 
-# For CUDA and HIP builds also build the triton_kernels external package.
-if(VLLM_GPU_LANG STREQUAL "CUDA" OR VLLM_GPU_LANG STREQUAL "HIP")
+# Fetch and vendor triton_kernels (Python-only, no compilation).
+# Skipped for HIP/ROCm - the git clone of the full triton repo is expensive
+# and triton_kernels is optional at runtime (graceful fallback in import_utils).
+if(VLLM_GPU_LANG STREQUAL "CUDA")
     include(cmake/external_projects/triton_kernels.cmake)
 endif()
 
diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm
index 299ea05118b0..b63ea9c33fd2 100644
--- a/docker/Dockerfile.rocm
+++ b/docker/Dockerfile.rocm
@@ -103,49 +103,15 @@ ONBUILD RUN git clone ${VLLM_REPO} \
                && git fetch upstream ; fi
 FROM fetch_vllm_${REMOTE_VLLM} AS fetch_vllm
 
-# -----------------------
-# HIP kernel compilation stage (csrc-build-rocm)
-#
-# Intentionally copies ONLY build-critical files (CMakeLists.txt, csrc/, cmake/)
-# so that Python-only changes to vllm/*.py do NOT invalidate this expensive layer.
-# BuildKit's registry cache (--cache-from ECR) reuses this layer across commits
-# whenever only Python code changed — turning a 2-hour HIP recompile into seconds.
-#
-# Note: only applies when REMOTE_VLLM=0 (default), so the build context
-# contains the vllm source. Release builds (REMOTE_VLLM=1) always compile fully.
-FROM base AS csrc-build-rocm
-ARG COMMON_WORKDIR
-WORKDIR ${COMMON_WORKDIR}/vllm
-# Copy only files HIP compilation depends on — vllm/**/*.py changes don't bust this
-COPY requirements/common.txt requirements/common.txt
-COPY requirements/rocm.txt requirements/rocm.txt
-COPY pyproject.toml setup.py CMakeLists.txt ./
-COPY cmake cmake/
-COPY csrc csrc/
-COPY vllm/envs.py vllm/envs.py
-COPY vllm/__init__.py vllm/__init__.py
-# Dummy version prevents git-state from busting the cache key on every commit
-ENV SETUPTOOLS_SCM_PRETEND_VERSION="0.0.0+csrc.build"
-RUN --mount=type=cache,target=/root/.cache/ccache \
-    --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system -r requirements/rocm.txt \
-    && MAX_JOBS="${MAX_JOBS:-$(nproc)}" python3 setup.py bdist_wheel --dist-dir=dist
-
 # -----------------------
 # vLLM build stages
 FROM fetch_vllm AS build_vllm
-ARG COMMON_WORKDIR
-# Re-use the pre-built HIP kernel wheel from csrc-build-rocm.
-# When VLLM_PRECOMPILED_WHEEL_LOCATION is set, setup.py extracts the compiled
-# .so files from this wheel instead of recompiling HIP kernels.
-# Python-only changes complete in minutes instead of hours.
-COPY --from=csrc-build-rocm ${COMMON_WORKDIR}/vllm/dist /precompiled-wheels
-ENV VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX=1
+# Build vLLM wheel (setup.py auto-detects ccache/sccache in PATH)
 RUN --mount=type=cache,target=/root/.cache/ccache \
     --mount=type=cache,target=/root/.cache/uv \
     cd vllm \
     && uv pip install --system -r requirements/rocm.txt \
-    && export VLLM_PRECOMPILED_WHEEL_LOCATION=$(ls /precompiled-wheels/*.whl) \
+    && python3 setup.py clean --all \
     && MAX_JOBS="${MAX_JOBS:-$(nproc)}" python3 setup.py bdist_wheel --dist-dir=dist
 FROM scratch AS export_vllm
 ARG COMMON_WORKDIR
diff --git a/docker/docker-bake-rocm.hcl b/docker/docker-bake-rocm.hcl
index 2e3525d6ee2c..bf473149a5d0 100644
--- a/docker/docker-bake-rocm.hcl
+++ b/docker/docker-bake-rocm.hcl
@@ -73,6 +73,52 @@ target "test-rocm" {
   output   = ["type=docker"]
 }
 
+# Per-architecture test targets — build in parallel on separate agents to avoid
+# compiling expensive HIP kernels (e.g. rocm/attention.hip) for all 3 archs
+# sequentially. Each image is built for a single GPU architecture only.
+# Usage: docker buildx bake -f docker/docker-bake-rocm.hcl test-rocm-all
+target "test-rocm-gfx90a" {
+  inherits = ["_common-rocm", "_labels"]
+  target   = "test"
+  args     = { ARG_PYTORCH_ROCM_ARCH = "gfx90a" }
+  tags     = ["rocm/vllm:test-gfx90a"]
+  output   = ["type=docker"]
+}
+
+target "test-rocm-gfx942" {
+  inherits = ["_common-rocm", "_labels"]
+  target   = "test"
+  args     = { ARG_PYTORCH_ROCM_ARCH = "gfx942" }
+  tags     = ["rocm/vllm:test-gfx942"]
+  output   = ["type=docker"]
+}
+
+target "test-rocm-gfx950" {
+  inherits = ["_common-rocm", "_labels"]
+  target   = "test"
+  args     = { ARG_PYTORCH_ROCM_ARCH = "gfx950" }
+  tags     = ["rocm/vllm:test-gfx950"]
+  output   = ["type=docker"]
+}
+
+group "test-rocm-all" {
+  targets = ["test-rocm-gfx90a", "test-rocm-gfx942", "test-rocm-gfx950"]
+}
+
+# Per-architecture CI targets — the ci-rocm.hcl overlay in ci-infra extends
+# these with cache-from/cache-to and registry push configuration.
+target "test-rocm-gfx90a-ci" {
+  inherits = ["test-rocm-gfx90a"]
+}
+
+target "test-rocm-gfx942-ci" {
+  inherits = ["test-rocm-gfx942"]
+}
+
+target "test-rocm-gfx950-ci" {
+  inherits = ["test-rocm-gfx950"]
+}
+
 target "final-rocm" {
   inherits = ["_common-rocm", "_labels"]
   target   = "final"

From 20d0a5fba5299323da1ef78010e3fef1b6ca6be8 Mon Sep 17 00:00:00 2001
From: Andreas Karatzas <akaratza@amd.com>
Date: Sat, 14 Mar 2026 15:08:26 -0500
Subject: [PATCH 05/23] [ROCm][CI] Add ROCm Docker Hub registry cache and
 weekly cleanup pipeline

Signed-off-by: Andreas Karatzas <akaratza@amd.com>
---
 docker/docker-bake-rocm.hcl | 6 ++----
 setup.py                    | 7 +++++--
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/docker/docker-bake-rocm.hcl b/docker/docker-bake-rocm.hcl
index bf473149a5d0..e227c57b8532 100644
--- a/docker/docker-bake-rocm.hcl
+++ b/docker/docker-bake-rocm.hcl
@@ -12,11 +12,9 @@
 #   docker buildx bake -f docker/docker-bake-rocm.hcl -f /tmp/ci-rocm.hcl test-rocm-ci
 
 variable "MAX_JOBS" {
-  # Empty string lets the Dockerfile fall back to $(nproc) via
-  # MAX_JOBS="${MAX_JOBS:-$(nproc)}" in each RUN step, which uses all
-  # available cores on whatever machine the build runs on.
+  # Cap parallelism to avoid OOM during linking on large machines.
   # Override with --set '*.args.max_jobs=8' for local builds on small machines.
-  default = ""
+  default = "64"
 }
 
 variable "PYTORCH_ROCM_ARCH" {
diff --git a/setup.py b/setup.py
index fa13fff4e62e..051f3bd07baf 100644
--- a/setup.py
+++ b/setup.py
@@ -298,7 +298,7 @@ def run(self):
             os.makedirs(os.path.dirname(dst_file), exist_ok=True)
             self.copy_file(file, dst_file)
 
-        if _is_cuda() or _is_hip():
+        if _is_cuda():
             # copy vllm/third_party/triton_kernels/**/*.py from self.build_lib
             # to current directory so that they can be included in the editable
             # build
@@ -887,7 +887,10 @@ def _read_requirements(filename: str) -> list[str]:
     ext_modules.append(CMakeExtension(name="vllm.cumem_allocator"))
     # Optional since this doesn't get built (produce an .so file). This is just
     # copying the relevant .py files from the source repository.
-    ext_modules.append(CMakeExtension(name="vllm.triton_kernels", optional=True))
+    # Skipped for ROCm — CMake already gates this on CUDA and the git clone
+    # of the full triton repo is expensive.
+    if not _is_hip():
+        ext_modules.append(CMakeExtension(name="vllm.triton_kernels", optional=True))
 
 if _is_hip():
     ext_modules.append(CMakeExtension(name="vllm._rocm_C"))

From f76a302c4f9c9578995e729480cf90a0e98ab30c Mon Sep 17 00:00:00 2001
From: Andreas Karatzas <akaratza@amd.com>
Date: Sat, 14 Mar 2026 15:49:04 -0500
Subject: [PATCH 06/23] [ROCm][CI] Add ROCm Docker Hub registry cache and
 weekly cleanup pipeline

Signed-off-by: Andreas Karatzas <akaratza@amd.com>
---
 docker/Dockerfile.rocm | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm
index b63ea9c33fd2..7e438d420a74 100644
--- a/docker/Dockerfile.rocm
+++ b/docker/Dockerfile.rocm
@@ -354,7 +354,9 @@ RUN --mount=type=bind,from=export_vllm,src=/,target=/install \
     --mount=type=cache,target=/root/.cache/pip \
     cd /install \
     && uv pip install --system -r requirements/rocm.txt \
-    && grep -v 'git+' requirements/rocm-test.txt | uv pip install --system -r /dev/stdin \
+    && grep -v 'git+' requirements/rocm-test.txt > requirements/_rocm-test-nogit.txt \
+    && uv pip install --system -r requirements/_rocm-test-nogit.txt \
+    && rm requirements/_rocm-test-nogit.txt \
     && grep 'git+' requirements/rocm-test.txt > /tmp/git-reqs.txt \
     && pip install --no-deps -r /tmp/git-reqs.txt \
     && rm /tmp/git-reqs.txt \

From e40694eafe48de3385381d15eb995a98ec26935e Mon Sep 17 00:00:00 2001
From: Andreas Karatzas <akaratza@amd.com>
Date: Sat, 14 Mar 2026 15:56:01 -0500
Subject: [PATCH 07/23] [ROCm][CI] Add ROCm Docker Hub registry cache and
 weekly cleanup pipeline

Signed-off-by: Andreas Karatzas <akaratza@amd.com>
---
 docker/Dockerfile.rocm | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm
index 7e438d420a74..8cfcf86ff99b 100644
--- a/docker/Dockerfile.rocm
+++ b/docker/Dockerfile.rocm
@@ -354,9 +354,9 @@ RUN --mount=type=bind,from=export_vllm,src=/,target=/install \
     --mount=type=cache,target=/root/.cache/pip \
     cd /install \
     && uv pip install --system -r requirements/rocm.txt \
-    && grep -v 'git+' requirements/rocm-test.txt > requirements/_rocm-test-nogit.txt \
-    && uv pip install --system -r requirements/_rocm-test-nogit.txt \
-    && rm requirements/_rocm-test-nogit.txt \
+    && grep -v 'git+' requirements/rocm-test.txt > /tmp/rocm-test-nogit.txt \
+    && cd requirements && uv pip install --system -r /tmp/rocm-test-nogit.txt && cd /install \
+    && rm /tmp/rocm-test-nogit.txt \
     && grep 'git+' requirements/rocm-test.txt > /tmp/git-reqs.txt \
     && pip install --no-deps -r /tmp/git-reqs.txt \
     && rm /tmp/git-reqs.txt \

From e8346a09e912c9830454503ecb9f8204bba7643d Mon Sep 17 00:00:00 2001
From: Andreas Karatzas <akaratza@amd.com>
Date: Sat, 14 Mar 2026 16:01:20 -0500
Subject: [PATCH 08/23] [ROCm][CI] Add ROCm Docker Hub registry cache and
 weekly cleanup pipeline

Signed-off-by: Andreas Karatzas <akaratza@amd.com>
---
 docker/Dockerfile.rocm | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm
index 8cfcf86ff99b..11727cbb57ef 100644
--- a/docker/Dockerfile.rocm
+++ b/docker/Dockerfile.rocm
@@ -230,8 +230,11 @@ ARG DEEPEP_REPO="https://github.com/ROCm/DeepEP.git"
 ARG DEEPEP_NIC="cx7"
 
 # Build DeepEP wheel. DeepEP looks for rocshmem at ROCSHMEM_DIR (inherited from build_rocshmem).
+# DeepEP only supports gfx942 and gfx950 — override PYTORCH_ROCM_ARCH to avoid
+# the gfx90a in the default list causing a build failure.
 RUN --mount=type=cache,target=/root/.cache/ccache \
-    git clone ${DEEPEP_REPO} \
+    export PYTORCH_ROCM_ARCH="gfx942;gfx950" \
+ && git clone ${DEEPEP_REPO} \
  && cd DeepEP \
  && git checkout ${DEEPEP_BRANCH} \
  && MAX_JOBS="${MAX_JOBS:-$(nproc)}" python3 setup.py --variant rocm --nic ${DEEPEP_NIC} bdist_wheel --dist-dir=/app/deep_install

From a2c6035758d014758373fc26bcb8745ce9dd426c Mon Sep 17 00:00:00 2001
From: Andreas Karatzas <akaratza@amd.com>
Date: Sat, 14 Mar 2026 16:11:07 -0500
Subject: [PATCH 09/23] [ROCm][CI] Add ROCm Docker Hub registry cache and
 weekly cleanup pipeline

Signed-off-by: Andreas Karatzas <akaratza@amd.com>
---
 docker/Dockerfile.rocm     | 17 ++++++-----------
 requirements/rocm-test.txt |  2 +-
 2 files changed, 7 insertions(+), 12 deletions(-)

diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm
index 11727cbb57ef..d33d77ca0f52 100644
--- a/docker/Dockerfile.rocm
+++ b/docker/Dockerfile.rocm
@@ -348,21 +348,16 @@ FROM base AS test
 
 RUN python3 -m pip install --upgrade pip && rm -rf /var/lib/apt/lists/*
 
-# Install vLLM using uv (inherited from base stage)
-# Note: No -U flag to avoid upgrading PyTorch ROCm to CUDA version
-# Note: rocm-test.txt contains a git+ URL (fastsafetensors) that uv cannot resolve;
-#       we install non-git requirements with uv and git+ requirements with pip separately.
+# Install vLLM dependencies and test requirements
 RUN --mount=type=bind,from=export_vllm,src=/,target=/install \
     --mount=type=cache,target=/root/.cache/uv \
-    --mount=type=cache,target=/root/.cache/pip \
     cd /install \
     && uv pip install --system -r requirements/rocm.txt \
-    && grep -v 'git+' requirements/rocm-test.txt > /tmp/rocm-test-nogit.txt \
-    && cd requirements && uv pip install --system -r /tmp/rocm-test-nogit.txt && cd /install \
-    && rm /tmp/rocm-test-nogit.txt \
-    && grep 'git+' requirements/rocm-test.txt > /tmp/git-reqs.txt \
-    && pip install --no-deps -r /tmp/git-reqs.txt \
-    && rm /tmp/git-reqs.txt \
+    && if grep -q 'git+' requirements/rocm-test.txt; then \
+           pip install -r requirements/rocm-test.txt; \
+       else \
+           uv pip install --system -r requirements/rocm-test.txt; \
+       fi \
     && pip uninstall -y vllm \
     && uv pip install --system *.whl
 
diff --git a/requirements/rocm-test.txt b/requirements/rocm-test.txt
index e616a99c5315..50edda4fe263 100644
--- a/requirements/rocm-test.txt
+++ b/requirements/rocm-test.txt
@@ -79,7 +79,7 @@ pqdm==0.2.0
     # via lm-eval
 
 # Required for fastsafetensors test
-fastsafetensors @ git+https://github.com/foundation-model-stack/fastsafetensors.git@d6f998a03432b2452f8de2bb5cefb5af9795d459
+fastsafetensors==0.2.2
 # Required for suffix decoding test
 arctic-inference == 0.1.1
 # Required for Nemotron test

From 067a486c8f0cfbe5604dc16954857b6bf46e8e28 Mon Sep 17 00:00:00 2001
From: Andreas Karatzas <akaratza@amd.com>
Date: Sat, 14 Mar 2026 16:20:08 -0500
Subject: [PATCH 10/23] [ROCm][CI] Add ROCm Docker Hub registry cache and
 weekly cleanup pipeline

Signed-off-by: Andreas Karatzas <akaratza@amd.com>
---
 docker/docker-bake-rocm.hcl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docker/docker-bake-rocm.hcl b/docker/docker-bake-rocm.hcl
index e227c57b8532..723f38999efd 100644
--- a/docker/docker-bake-rocm.hcl
+++ b/docker/docker-bake-rocm.hcl
@@ -60,7 +60,7 @@ target "_labels" {
     "org.opencontainers.image.revision"    = COMMIT
   }
   annotations = [
-    "index,manifest:org.opencontainers.image.revision=${COMMIT}",
+    "manifest:org.opencontainers.image.revision=${COMMIT}",
   ]
 }
 

From 3509942db63fa6c31c6de91ce5ab58ba6d7889a5 Mon Sep 17 00:00:00 2001
From: Andreas Karatzas <akaratza@amd.com>
Date: Sat, 14 Mar 2026 17:11:48 -0500
Subject: [PATCH 11/23] [ROCm][CI] Add ROCm Docker Hub registry cache and
 weekly cleanup pipeline

Signed-off-by: Andreas Karatzas <akaratza@amd.com>
---
 docker/Dockerfile.rocm | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm
index d33d77ca0f52..b7ed836dbe35 100644
--- a/docker/Dockerfile.rocm
+++ b/docker/Dockerfile.rocm
@@ -21,9 +21,10 @@ RUN apt-get update -q -y && apt-get install -q -y \
     sqlite3 libsqlite3-dev libfmt-dev libmsgpack-dev libsuitesparse-dev \
     apt-transport-https ca-certificates wget curl \
     ccache mold
-# Use mold as the default linker — significantly faster than GNU ld/gold for
-# the large C++ link steps in ROCm extension builds (e.g. vLLM, DeepEP, FA).
-RUN update-alternatives --install /usr/bin/ld ld /usr/bin/mold 100
+# Note: mold is installed but NOT set as the system default linker because
+# some packages (e.g. aiter) use JIT compilation at runtime with flags
+# that mold doesn't support (--cref). Build stages opt in via CMAKE_LINKER_TYPE
+# or LDFLAGS="-fuse-ld=mold".
 RUN python3 -m pip install --upgrade pip
 # Remove sccache only if not using sccache (it exists in base image from Dockerfile.rocm_base)
 ARG USE_SCCACHE
@@ -107,12 +108,13 @@ FROM fetch_vllm_${REMOTE_VLLM} AS fetch_vllm
 # vLLM build stages
 FROM fetch_vllm AS build_vllm
 # Build vLLM wheel (setup.py auto-detects ccache/sccache in PATH)
+# Use mold linker for faster linking of large C++ extensions
 RUN --mount=type=cache,target=/root/.cache/ccache \
     --mount=type=cache,target=/root/.cache/uv \
     cd vllm \
     && uv pip install --system -r requirements/rocm.txt \
     && python3 setup.py clean --all \
-    && MAX_JOBS="${MAX_JOBS:-$(nproc)}" python3 setup.py bdist_wheel --dist-dir=dist
+    && LDFLAGS="-fuse-ld=mold" MAX_JOBS="${MAX_JOBS:-$(nproc)}" python3 setup.py bdist_wheel --dist-dir=dist
 FROM scratch AS export_vllm
 ARG COMMON_WORKDIR
 COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/dist/*.whl /
@@ -237,7 +239,7 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
  && git clone ${DEEPEP_REPO} \
  && cd DeepEP \
  && git checkout ${DEEPEP_BRANCH} \
- && MAX_JOBS="${MAX_JOBS:-$(nproc)}" python3 setup.py --variant rocm --nic ${DEEPEP_NIC} bdist_wheel --dist-dir=/app/deep_install
+ && LDFLAGS="-fuse-ld=mold" MAX_JOBS="${MAX_JOBS:-$(nproc)}" python3 setup.py --variant rocm --nic ${DEEPEP_NIC} bdist_wheel --dist-dir=/app/deep_install
 
 
 # -----------------------

From 3ca62e109dfe16f414aa0ed4739dce0ca8243618 Mon Sep 17 00:00:00 2001
From: Andreas Karatzas <akaratza@amd.com>
Date: Sat, 14 Mar 2026 18:17:35 -0500
Subject: [PATCH 12/23] [ROCm][CI] Add ROCm Docker Hub registry cache and
 weekly cleanup pipeline

Signed-off-by: Andreas Karatzas <akaratza@amd.com>
---
 docker/Dockerfile.rocm           | 16 ++++-----
 docker/docker-bake-rocm.hcl      |  6 ++--
 tools/install_torchcodec_rocm.sh | 57 +++++++++++++++++++++++++++++---
 3 files changed, 64 insertions(+), 15 deletions(-)

diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm
index b7ed836dbe35..8186b0a9c894 100644
--- a/docker/Dockerfile.rocm
+++ b/docker/Dockerfile.rocm
@@ -397,20 +397,18 @@ RUN --mount=type=cache,target=/root/.cache/uv \
     cd /vllm-workspace \
     && uv pip install --system -e tests/vllm_test_utils pytest-shard
 
+# Pre-install FFmpeg dev libs so torchcodec can be built from source at test
+# time without apt-get update (saves ~10s per whisper/audio test step).
+RUN apt-get update -q -y && apt-get install -q -y --no-install-recommends \
+    pkg-config ffmpeg libavcodec-dev libavformat-dev libavutil-dev \
+    libswscale-dev libavdevice-dev libavfilter-dev libswresample-dev \
+    && rm -rf /var/lib/apt/lists/*
+
 # enable fast downloads from hf (for testing)
 RUN --mount=type=cache,target=/root/.cache/uv \
     uv pip install --system hf_transfer
 ENV HF_HUB_ENABLE_HF_TRANSFER=1
 
-# install audio decode package `torchcodec` from source (required due to 
-# ROCm and torch version mismatch) for tests with datasets package
-COPY tools/install_torchcodec_rocm.sh /tmp/install_torchcodec.sh
-RUN --mount=type=cache,target=/root/.cache/pip \
-    bash /tmp/install_torchcodec.sh \
-    && rm /tmp/install_torchcodec.sh \
-    && apt-get clean \
-    && rm -rf /var/lib/apt/lists/*
-
 # Copy in the v1 package (for python-only install test group)
 COPY --from=export_vllm /vllm_v1 /usr/local/lib/python${PYTHON_VERSION}/dist-packages/vllm/v1
 
diff --git a/docker/docker-bake-rocm.hcl b/docker/docker-bake-rocm.hcl
index 723f38999efd..ad7b933a788c 100644
--- a/docker/docker-bake-rocm.hcl
+++ b/docker/docker-bake-rocm.hcl
@@ -12,9 +12,11 @@
 #   docker buildx bake -f docker/docker-bake-rocm.hcl -f /tmp/ci-rocm.hcl test-rocm-ci
 
 variable "MAX_JOBS" {
-  # Cap parallelism to avoid OOM during linking on large machines.
+  # Empty string lets the Dockerfile fall back to $(nproc) via
+  # MAX_JOBS="${MAX_JOBS:-$(nproc)}" in each RUN step, which uses all
+  # available cores on whatever machine the build runs on.
   # Override with --set '*.args.max_jobs=8' for local builds on small machines.
-  default = "64"
+  default = ""
 }
 
 variable "PYTORCH_ROCM_ARCH" {
diff --git a/tools/install_torchcodec_rocm.sh b/tools/install_torchcodec_rocm.sh
index 6cb3b39fd66a..210d7b24145a 100755
--- a/tools/install_torchcodec_rocm.sh
+++ b/tools/install_torchcodec_rocm.sh
@@ -3,12 +3,16 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 # Script to install TorchCodec from source (required for ROCm compatibility)
+# The PyPI wheel is built against upstream PyTorch and has ABI mismatches with
+# ROCm's custom torch build, so we must compile from source.
 
 set -e
 
 TORCHCODEC_REPO="${TORCHCODEC_REPO:-https://github.com/pytorch/torchcodec.git}"
 # Pin to a specific release for reproducibility; update as needed.
 TORCHCODEC_BRANCH="${TORCHCODEC_BRANCH:-v0.10.0}"
+# Cache directory for pre-built wheels to avoid redundant recompilation.
+TORCHCODEC_WHEEL_CACHE="${TORCHCODEC_WHEEL_CACHE:-/root/.cache/torchcodec-wheels}"
 
 echo "=== TorchCodec Installation Script ==="
 
@@ -18,9 +22,26 @@ if python3 -c "from torchcodec.decoders import VideoDecoder" 2>/dev/null; then
     exit 0
 fi
 
+# Try a cached wheel first (kept in a per-branch/arch directory; see mkdir below)
+ARCH_TAG="${PYTORCH_ROCM_ARCH:-all}"
+ARCH_TAG="${ARCH_TAG//;/_}"  # normalize: replace ; with _ for use in a path
+CACHED_WHEEL="${TORCHCODEC_WHEEL_CACHE}/torchcodec-${TORCHCODEC_BRANCH}-${ARCH_TAG}"
+mkdir -p "$CACHED_WHEEL"  # a directory: pip needs the wheel's PEP 427 filename
+CACHED_WHEEL_FILE=$(ls "$CACHED_WHEEL"/torchcodec-*.whl 2>/dev/null | head -1)
+if [ -n "$CACHED_WHEEL_FILE" ]; then
+    echo "Found cached wheel: $CACHED_WHEEL_FILE"
+    pip install "$CACHED_WHEEL_FILE" && {
+        echo "Installed from cached wheel."
+        echo "=== TorchCodec installation complete ==="
+        exit 0
+    }
+    echo "Cached wheel installation failed, rebuilding from source..."
+fi
+
 echo "TorchCodec not found. Installing from source..."
 
-# Install system dependencies (FFmpeg + pkg-config)
+# Install system dependencies (FFmpeg + pkg-config) if not already present.
+# The Docker test image pre-installs these, so this is a fallback for other envs.
 install_system_deps() {
     if command -v apt-get &> /dev/null; then
         echo "Installing system dependencies..."
@@ -56,6 +77,12 @@ export pybind11_DIR=$(python3 -c "import pybind11; print(pybind11.get_cmake_dir(
 export CMAKE_PREFIX_PATH="${pybind11_DIR}:${CMAKE_PREFIX_PATH}"
 echo "pybind11_DIR set to: $pybind11_DIR"
 
+# Log the GPU architectures requested via PYTORCH_ROCM_ARCH (if set). This script
+# only reports the value; it does not set or narrow it for the build itself.
+if [ -n "$PYTORCH_ROCM_ARCH" ]; then
+    echo "Building for PYTORCH_ROCM_ARCH=$PYTORCH_ROCM_ARCH"
+fi
+
 # Create temp directory for build
 BUILD_DIR=$(mktemp -d -t torchcodec-XXXXXX)
 echo "Building in temporary directory: $BUILD_DIR"
@@ -77,9 +104,31 @@ cd torchcodec
 export TORCHCODEC_CMAKE_BUILD_DIR="${PWD}/build"
 export TORCHCODEC_DISABLE_COMPILE_WARNING_AS_ERROR=1
 export I_CONFIRM_THIS_IS_NOT_A_LICENSE_VIOLATION=1
+# Use ninja for faster builds and parallelize compilation
+export CMAKE_GENERATOR=Ninja
+export MAX_JOBS="${MAX_JOBS:-$(nproc)}"
+# Use ccache if available to speed up recompilation
+if command -v ccache &> /dev/null; then
+    export CMAKE_C_COMPILER_LAUNCHER=ccache
+    export CMAKE_CXX_COMPILER_LAUNCHER=ccache
+fi
 
-echo "Building TorchCodec..."
-pip install . --no-build-isolation
+echo "Building TorchCodec (MAX_JOBS=$MAX_JOBS)..."
+pip wheel . --no-build-isolation --no-deps -w "$BUILD_DIR/dist"
+
+# Install the built wheel
+BUILT_WHEEL=$(ls "$BUILD_DIR/dist"/torchcodec-*.whl 2>/dev/null | head -1)
+if [ -z "$BUILT_WHEEL" ]; then
+    echo "Error: No wheel produced"
+    exit 1
+fi
+
+pip install "$BUILT_WHEEL"
+
+# Cache the wheel for future runs
+mkdir -p "$TORCHCODEC_WHEEL_CACHE"
+cp "$BUILT_WHEEL" "$CACHED_WHEEL"
+echo "Cached wheel to: $CACHED_WHEEL"
 
 # Verify installation
 echo "Verifying installation..."
@@ -88,4 +137,4 @@ if python3 -c "from torchcodec.decoders import VideoDecoder; print('TorchCodec i
 else
     echo "Error: TorchCodec installation failed verification"
     exit 1
-fi
\ No newline at end of file
+fi

From 624b413595288b9f1b1e14c96b32266c69dc26f7 Mon Sep 17 00:00:00 2001
From: Andreas Karatzas <akaratza@amd.com>
Date: Sat, 14 Mar 2026 18:33:00 -0500
Subject: [PATCH 13/23] [ROCm][CI] Add ROCm Docker Hub registry cache and
 weekly cleanup pipeline

Signed-off-by: Andreas Karatzas <akaratza@amd.com>
---
 .buildkite/scripts/hardware_ci/run-amd-test.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.buildkite/scripts/hardware_ci/run-amd-test.sh b/.buildkite/scripts/hardware_ci/run-amd-test.sh
index 1c43c404d247..da76ea8eda53 100755
--- a/.buildkite/scripts/hardware_ci/run-amd-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh
@@ -378,7 +378,7 @@ wait_for_clean_gpus
 
 # --- Pull test image ---
 echo "--- Pulling container"
-image_name="rocm/vllm-ci:${BUILDKITE_COMMIT}"
+image_name="${DOCKER_IMAGE_NAME:-rocm/vllm-ci:${BUILDKITE_COMMIT}}"
 container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
 docker pull "${image_name}"
 

From 04f3ee6c9197f55b1c44160fb6b10e098f44da5c Mon Sep 17 00:00:00 2001
From: Andreas Karatzas <akaratza@amd.com>
Date: Sat, 14 Mar 2026 19:05:40 -0500
Subject: [PATCH 14/23] [ROCm][CI] Add ROCm Docker Hub registry cache and
 weekly cleanup pipeline

Signed-off-by: Andreas Karatzas <akaratza@amd.com>
---
 .buildkite/hardware_tests/amd.yaml |  6 ++++--
 docker/Dockerfile.rocm             | 15 ++++++---------
 2 files changed, 10 insertions(+), 11 deletions(-)

diff --git a/.buildkite/hardware_tests/amd.yaml b/.buildkite/hardware_tests/amd.yaml
index 919ef3e3edb2..74978d47a4db 100644
--- a/.buildkite/hardware_tests/amd.yaml
+++ b/.buildkite/hardware_tests/amd.yaml
@@ -1,7 +1,9 @@
 group: Hardware - AMD Build
 steps:
-  # Image with all architectures
-  - label: "AMD: :docker: build image"
+  # Fat multi-arch image - only auto-runs on main (cache warming / release).
+  # On PR builds, the Jinja template gates this behind a manual block step.
+  # This YAML is the source-of-truth for step shape; the template adds the block logic.
+  - label: "AMD: :docker: build image (all archs)"
     key: image-build-amd
     depends_on: []
     device: amd_cpu
diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm
index 8186b0a9c894..1aa852709784 100644
--- a/docker/Dockerfile.rocm
+++ b/docker/Dockerfile.rocm
@@ -380,12 +380,16 @@ RUN --mount=type=bind,from=build_deepep,src=/app/deep_install,target=/deep_insta
 # Copy rocshmem runtime libraries
 COPY --from=build_rocshmem /opt/rocshmem /opt/rocshmem
 
-# RIXL/MoRIIO runtime dependencies (RDMA userspace libraries)
-RUN apt-get update -q -y && apt-get install -q -y \
+# RDMA userspace libraries (RIXL/MoRIIO runtime) + FFmpeg dev libs (torchcodec
+# source builds at test time). Combined into one apt-get to avoid a redundant
+# apt-get update round-trip.
+RUN apt-get update -q -y && apt-get install -q -y --no-install-recommends \
     librdmacm1 \
     libibverbs1 \
     ibverbs-providers \
     ibverbs-utils \
+    pkg-config ffmpeg libavcodec-dev libavformat-dev libavutil-dev \
+    libswscale-dev libavdevice-dev libavfilter-dev libswresample-dev \
     && rm -rf /var/lib/apt/lists/*
 
 WORKDIR /vllm-workspace
@@ -397,13 +401,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \
     cd /vllm-workspace \
     && uv pip install --system -e tests/vllm_test_utils pytest-shard
 
-# Pre-install FFmpeg dev libs so torchcodec can be built from source at test
-# time without apt-get update (saves ~10s per whisper/audio test step).
-RUN apt-get update -q -y && apt-get install -q -y --no-install-recommends \
-    pkg-config ffmpeg libavcodec-dev libavformat-dev libavutil-dev \
-    libswscale-dev libavdevice-dev libavfilter-dev libswresample-dev \
-    && rm -rf /var/lib/apt/lists/*
-
 # enable fast downloads from hf (for testing)
 RUN --mount=type=cache,target=/root/.cache/uv \
     uv pip install --system hf_transfer

From 87a03a81a89c221ce6fe3f34cf32f9de0fa45e70 Mon Sep 17 00:00:00 2001
From: Andreas Karatzas <akaratza@amd.com>
Date: Sat, 14 Mar 2026 19:22:42 -0500
Subject: [PATCH 15/23] [ROCm][CI] Add ROCm Docker Hub registry cache and
 weekly cleanup pipeline

Signed-off-by: Andreas Karatzas <akaratza@amd.com>
---
 docker/Dockerfile.rocm | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm
index 1aa852709784..1de5ebaa37e3 100644
--- a/docker/Dockerfile.rocm
+++ b/docker/Dockerfile.rocm
@@ -220,6 +220,9 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
     -DROCM_PATH=/opt/rocm \
     -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
     -DUSE_EXTERNAL_MPI=OFF \
+    -DAMDGPU_TARGETS="${PYTORCH_ROCM_ARCH}" \
+    -DBUILD_TESTS=OFF \
+    -DBUILD_EXAMPLES=OFF \
  && make -j$(nproc) \
  && make install
 

From 17f5ee99e52a83a97dca6abd853c04330f629ce5 Mon Sep 17 00:00:00 2001
From: Andreas Karatzas <akaratza@amd.com>
Date: Sat, 14 Mar 2026 22:34:47 -0500
Subject: [PATCH 16/23] [ROCm][CI] Add ROCm Docker Hub registry cache and
 weekly cleanup pipeline

Signed-off-by: Andreas Karatzas <akaratza@amd.com>
---
 docker/Dockerfile.rocm | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm
index 1de5ebaa37e3..d3bccfd73978 100644
--- a/docker/Dockerfile.rocm
+++ b/docker/Dockerfile.rocm
@@ -45,8 +45,10 @@ ENV UV_INDEX_STRATEGY="unsafe-best-match"
 ENV UV_LINK_MODE=copy
 # ccache directory — persisted across layer rebuilds via --mount=type=cache
 ENV CCACHE_DIR=/root/.cache/ccache
-# Compilation parallelism — overridable via --build-arg max_jobs=N; falls back to nproc
-ARG max_jobs
+# Compilation parallelism — overridable via --build-arg max_jobs=N.
+# Default to 64 to avoid race conditions with hipify at very high core counts
+# (e.g. 256-core machines where -j=256 can start compiling before hipify finishes).
+ARG max_jobs=64
 ENV MAX_JOBS=${max_jobs}
 
 # Install sccache if USE_SCCACHE is enabled (for release builds)

From 5af45e04d9e55c26dcf578a91b77ce1d84cb4bb2 Mon Sep 17 00:00:00 2001
From: Andreas Karatzas <akaratza@amd.com>
Date: Sat, 14 Mar 2026 22:41:18 -0500
Subject: [PATCH 17/23] [ROCm][CI] Chain hipify targets sequentially to resolve
 potential race condition in highly concurrent max job settings

Signed-off-by: Andreas Karatzas <akaratza@amd.com>
---
 cmake/utils.cmake | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/cmake/utils.cmake b/cmake/utils.cmake
index bdb2ba74d944..6afe04724501 100644
--- a/cmake/utils.cmake
+++ b/cmake/utils.cmake
@@ -81,6 +81,15 @@ function (hipify_sources_target OUT_SRCS NAME ORIG_SRCS)
     BYPRODUCTS ${HIP_SRCS}
     COMMENT "Running hipify on ${NAME} extension source files.")
 
+  # Chain hipify targets so they run sequentially: each one runs shutil.copytree
+  # into a shared output directory, and parallel runs can revert another target's
+  # .hip files to .cu originals. State is kept in a GLOBAL property because
+  get_property(_vllm_last_hipify_target GLOBAL PROPERTY VLLM_LAST_HIPIFY_TARGET)
+  if (_vllm_last_hipify_target)  # PARENT_SCOPE is lost when the caller is itself a function.
+    add_dependencies(hipify${NAME} ${_vllm_last_hipify_target})
+  endif()
+  set_property(GLOBAL PROPERTY VLLM_LAST_HIPIFY_TARGET "hipify${NAME}")
+
   # Swap out original extension sources with hipified sources.
   list(APPEND HIP_SRCS ${CXX_SRCS})
   set(${OUT_SRCS} ${HIP_SRCS} PARENT_SCOPE)

From 7f1f98c08aabc6c116879c05808068f0ba52d638 Mon Sep 17 00:00:00 2001
From: Andreas Karatzas <akaratza@amd.com>
Date: Thu, 26 Mar 2026 00:21:32 -0500
Subject: [PATCH 18/23] [ROCm][CI] Wire CI_BASE_IMAGE into bake targets and
 Jinja build steps

Signed-off-by: Andreas Karatzas <akaratza@amd.com>
---
 .buildkite/hardware_tests/amd-ci-base.yaml | 39 ++++++++++
 .buildkite/hardware_tests/amd.yaml         | 16 ++++
 docker/Dockerfile.rocm                     | 88 +++++++++++++---------
 3 files changed, 107 insertions(+), 36 deletions(-)
 create mode 100644 .buildkite/hardware_tests/amd-ci-base.yaml

diff --git a/.buildkite/hardware_tests/amd-ci-base.yaml b/.buildkite/hardware_tests/amd-ci-base.yaml
new file mode 100644
index 000000000000..63ab32f42624
--- /dev/null
+++ b/.buildkite/hardware_tests/amd-ci-base.yaml
@@ -0,0 +1,39 @@
+# Scheduled pipeline: build and push the ROCm CI base image (Tier 1).
+#
+# This image contains all slow, stable CI dependencies (RIXL+UCX, DeepEP+rocshmem,
+# torchcodec, RDMA libs, hf_transfer, pytest-shard, MIOPEN env vars) and is used
+# as the base for every per-PR test image build via CI_BASE_IMAGE in amd.yaml.
+#
+# Rebuild triggers (configure in Buildkite UI → Pipelines → Schedules):
+#   - Weekly cron (e.g. Sunday 00:00 UTC) on the main branch
+#   - Manual trigger when RIXL_BRANCH, DEEPEP_BRANCH, or ROCSHMEM_BRANCH changes
+#
+# Produces:
+#   rocm/vllm-dev:ci_base              <- stable tag, always points to most recent build
+#   rocm/vllm-dev:ci_base-YYYYMMDD     <- dated snapshot (UTC build date) for rollback
+group: Hardware - AMD CI Base Build
+steps:
+  - label: "AMD: :docker: build ci_base image"
+    key: image-build-amd-ci-base
+    depends_on: []
+    device: amd_cpu
+    no_plugin: true
+    commands:
+      - export DATED_TAG="rocm/vllm-dev:ci_base-$(date -u +%Y%m%d)"
+      - export IMAGE_TAG="$DATED_TAG"
+      - export CI_BASE_IMAGE_TAG_DATED="$DATED_TAG"
+      - bash .buildkite/scripts/ci-bake.sh ci-base-rocm-ci
+    env:
+      DOCKER_BUILDKIT: "1"
+      VLLM_BAKE_FILE: "docker/docker-bake-rocm.hcl"
+      CI_HCL_URL: "https://raw.githubusercontent.com/vllm-project/ci-infra/main/docker/ci-rocm.hcl"
+      CI_BASE_IMAGE_TAG: "rocm/vllm-dev:ci_base"
+      DOCKERHUB_CACHE_TO: "rocm/vllm-ci-cache:rocm-latest"
+    retry:
+      automatic:
+        - exit_status: -1  # Agent was lost
+          limit: 1
+        - exit_status: -10  # Agent was lost
+          limit: 1
+        - exit_status: 1  # Machine occasionally fails
+          limit: 1
diff --git a/.buildkite/hardware_tests/amd.yaml b/.buildkite/hardware_tests/amd.yaml
index 74978d47a4db..e784d9a87542 100644
--- a/.buildkite/hardware_tests/amd.yaml
+++ b/.buildkite/hardware_tests/amd.yaml
@@ -9,12 +9,16 @@ steps:
     device: amd_cpu
     no_plugin: true
     commands:
+      - docker pull rocm/vllm-dev:ci_base
       - bash .buildkite/scripts/ci-bake.sh test-rocm-ci
     env:
       DOCKER_BUILDKIT: "1"
       IMAGE_TAG: "rocm/vllm-ci:${BUILDKITE_COMMIT}"
       VLLM_BAKE_FILE: "docker/docker-bake-rocm.hcl"
       CI_HCL_URL: "https://raw.githubusercontent.com/vllm-project/ci-infra/main/docker/ci-rocm.hcl"
+      CI_BASE_IMAGE: "rocm/vllm-dev:ci_base"
+      REMOTE_VLLM: "1"
+      VLLM_BRANCH: "${BUILDKITE_COMMIT}"
       PYTORCH_ROCM_ARCH: "gfx90a;gfx942;gfx950"
     timeout_in_minutes: 600
     retry:
@@ -35,12 +39,16 @@ steps:
     device: amd_cpu
     no_plugin: true
     commands:
+      - docker pull rocm/vllm-dev:ci_base
       - bash .buildkite/scripts/ci-bake.sh test-rocm-gfx90a-ci
     env:
       DOCKER_BUILDKIT: "1"
       IMAGE_TAG: "rocm/vllm-ci:${BUILDKITE_COMMIT}-gfx90a"
       VLLM_BAKE_FILE: "docker/docker-bake-rocm.hcl"
       CI_HCL_URL: "https://raw.githubusercontent.com/vllm-project/ci-infra/main/docker/ci-rocm.hcl"
+      CI_BASE_IMAGE: "rocm/vllm-dev:ci_base"
+      REMOTE_VLLM: "1"
+      VLLM_BRANCH: "${BUILDKITE_COMMIT}"
       PYTORCH_ROCM_ARCH: "gfx90a"
     timeout_in_minutes: 600
     retry:
@@ -60,12 +68,16 @@ steps:
     device: amd_cpu
     no_plugin: true
     commands:
+      - docker pull rocm/vllm-dev:ci_base
       - bash .buildkite/scripts/ci-bake.sh test-rocm-gfx942-ci
     env:
       DOCKER_BUILDKIT: "1"
       IMAGE_TAG: "rocm/vllm-ci:${BUILDKITE_COMMIT}-gfx942"
       VLLM_BAKE_FILE: "docker/docker-bake-rocm.hcl"
       CI_HCL_URL: "https://raw.githubusercontent.com/vllm-project/ci-infra/main/docker/ci-rocm.hcl"
+      CI_BASE_IMAGE: "rocm/vllm-dev:ci_base"
+      REMOTE_VLLM: "1"
+      VLLM_BRANCH: "${BUILDKITE_COMMIT}"
       PYTORCH_ROCM_ARCH: "gfx942"
     timeout_in_minutes: 600
     retry:
@@ -85,12 +97,16 @@ steps:
     device: amd_cpu
     no_plugin: true
     commands:
+      - docker pull rocm/vllm-dev:ci_base
       - bash .buildkite/scripts/ci-bake.sh test-rocm-gfx950-ci
     env:
       DOCKER_BUILDKIT: "1"
       IMAGE_TAG: "rocm/vllm-ci:${BUILDKITE_COMMIT}-gfx950"
       VLLM_BAKE_FILE: "docker/docker-bake-rocm.hcl"
       CI_HCL_URL: "https://raw.githubusercontent.com/vllm-project/ci-infra/main/docker/ci-rocm.hcl"
+      CI_BASE_IMAGE: "rocm/vllm-dev:ci_base"
+      REMOTE_VLLM: "1"
+      VLLM_BRANCH: "${BUILDKITE_COMMIT}"
       PYTORCH_ROCM_ARCH: "gfx950"
     timeout_in_minutes: 600
     retry:
diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm
index d3bccfd73978..dd80c284f69c 100644
--- a/docker/Dockerfile.rocm
+++ b/docker/Dockerfile.rocm
@@ -2,6 +2,7 @@
 ARG REMOTE_VLLM="0"
 ARG COMMON_WORKDIR=/app
 ARG BASE_IMAGE=rocm/vllm-dev:base
+ARG CI_BASE_IMAGE=rocm/vllm-dev:ci_base
 
 # Sccache configuration (only used in release pipeline)
 ARG USE_SCCACHE
@@ -350,8 +351,56 @@ COPY --from=build_vllm_wheel_release ${COMMON_WORKDIR}/vllm/.buildkite /.buildki
 COPY --from=build_vllm_wheel_release ${COMMON_WORKDIR}/vllm/vllm/v1 /vllm_v1
 
 # -----------------------
-# Test vLLM image
-FROM base AS test
+# CI base image (Tier 1) — stable, rarely-changing CI dependencies.
+# Rebuilt weekly (or when RIXL/DeepEP/ROCSHMEM branch ARGs change).
+# Per-PR test builds pull this as CI_BASE_IMAGE instead of rebuilding
+# these slow layers from scratch every commit.
+FROM base AS ci_base
+
+# Install RIXL wheel (pre-built in build_rixl stage)
+RUN --mount=type=bind,from=build_rixl,src=/app/install,target=/rixl_install \
+    uv pip install --system /rixl_install/*.whl
+
+# Install DeepEP wheel (pre-built in build_deepep stage)
+RUN --mount=type=bind,from=build_deepep,src=/app/deep_install,target=/deep_install \
+    uv pip install --system /deep_install/*.whl
+
+# Copy rocshmem runtime libraries
+COPY --from=build_rocshmem /opt/rocshmem /opt/rocshmem
+
+# RDMA userspace libraries (RIXL/MoRIIO runtime) + FFmpeg dev libs (torchcodec).
+# Combined into one apt-get to avoid a redundant apt-get update round-trip.
+RUN apt-get update -q -y && apt-get install -q -y --no-install-recommends \
+    librdmacm1 \
+    libibverbs1 \
+    ibverbs-providers \
+    ibverbs-utils \
+    pkg-config ffmpeg libavcodec-dev libavformat-dev libavutil-dev \
+    libswscale-dev libavdevice-dev libavfilter-dev libswresample-dev \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install torchcodec from source (ROCm/torch version mismatch prevents PyPI install).
+# Pre-building here avoids rebuilding it on every per-PR test image.
+COPY tools/install_torchcodec_rocm.sh /tmp/install_torchcodec.sh
+RUN bash /tmp/install_torchcodec.sh \
+    && rm /tmp/install_torchcodec.sh \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/*
+
+# Stable test tooling that doesn't depend on the vLLM wheel
+RUN --mount=type=cache,target=/root/.cache/uv \
+    uv pip install --system hf_transfer pytest-shard
+ENV HF_HUB_ENABLE_HF_TRANSFER=1
+
+# Suppress MIOpen 3D convolution performance regressions
+# See: https://github.com/pytorch/pytorch/issues/169857
+ENV MIOPEN_DEBUG_CONV_DIRECT=0
+ENV MIOPEN_DEBUG_CONV_GEMM=0
+
+# -----------------------
+# Test vLLM image (Tier 2) — thin per-PR layer on top of ci_base.
+# Only rebuilds vLLM wheel + workspace, which change every PR.
+FROM ${CI_BASE_IMAGE} AS test
 
 RUN python3 -m pip install --upgrade pip && rm -rf /var/lib/apt/lists/*
 
@@ -374,29 +423,6 @@ RUN --mount=type=bind,from=export_vllm,src=/,target=/install \
 # vLLM without a compiler (no wheels.vllm.ai equivalent exists for ROCm).
 COPY --from=export_vllm /*.whl /opt/vllm-wheels/
 
-# Install RIXL wheel
-RUN --mount=type=bind,from=build_rixl,src=/app/install,target=/rixl_install \
-    uv pip install --system /rixl_install/*.whl
-
-# Install DeepEP wheel
-RUN --mount=type=bind,from=build_deepep,src=/app/deep_install,target=/deep_install \
-    uv pip install --system /deep_install/*.whl
-
-# Copy rocshmem runtime libraries
-COPY --from=build_rocshmem /opt/rocshmem /opt/rocshmem
-
-# RDMA userspace libraries (RIXL/MoRIIO runtime) + FFmpeg dev libs (torchcodec
-# source builds at test time). Combined into one apt-get to avoid a redundant
-# apt-get update round-trip.
-RUN apt-get update -q -y && apt-get install -q -y --no-install-recommends \
-    librdmacm1 \
-    libibverbs1 \
-    ibverbs-providers \
-    ibverbs-utils \
-    pkg-config ffmpeg libavcodec-dev libavformat-dev libavutil-dev \
-    libswscale-dev libavdevice-dev libavfilter-dev libswresample-dev \
-    && rm -rf /var/lib/apt/lists/*
-
 WORKDIR /vllm-workspace
 ARG COMMON_WORKDIR
 COPY --from=build_vllm ${COMMON_WORKDIR}/vllm /vllm-workspace
@@ -404,21 +430,11 @@ COPY --from=build_vllm ${COMMON_WORKDIR}/vllm /vllm-workspace
 # install development dependencies (for testing)
 RUN --mount=type=cache,target=/root/.cache/uv \
     cd /vllm-workspace \
-    && uv pip install --system -e tests/vllm_test_utils pytest-shard
-
-# enable fast downloads from hf (for testing)
-RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system hf_transfer
-ENV HF_HUB_ENABLE_HF_TRANSFER=1
+    && uv pip install --system -e tests/vllm_test_utils
 
 # Copy in the v1 package (for python-only install test group)
 COPY --from=export_vllm /vllm_v1 /usr/local/lib/python${PYTHON_VERSION}/dist-packages/vllm/v1
 
-# Set MIOPEN ENVS to resolve performance regressions in MIOpen 3D convolution kernel
-# See: https://github.com/pytorch/pytorch/issues/169857
-ENV MIOPEN_DEBUG_CONV_DIRECT=0
-ENV MIOPEN_DEBUG_CONV_GEMM=0
-
 # Source code is used in the `python_only_compile.sh` test
 # We hide it inside `src/` so that this source code
 # will not be imported by other tests

From 064edb8de96e0755ff8c2676a24dd0d98307303a Mon Sep 17 00:00:00 2001
From: Andreas Karatzas <akaratza@amd.com>
Date: Thu, 26 Mar 2026 00:28:33 -0500
Subject: [PATCH 19/23] [ROCm][CI] Wire CI_BASE_IMAGE into bake targets and
 Jinja build steps

Signed-off-by: Andreas Karatzas <akaratza@amd.com>
---
 .buildkite/test-amd.yaml | 85 ++--------------------------------------
 1 file changed, 4 insertions(+), 81 deletions(-)

diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml
index b691e5705696..254f9f5d0b00 100644
--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
@@ -192,9 +192,10 @@ steps:
   commands:
   - bash standalone_tests/python_only_compile_rocm.sh
 
-- label: Basic Correctness Test # 20min
-  timeout_in_minutes: 30
-  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a]
+
+- label: Basic Correctness Test # TBD
+  timeout_in_minutes: 180
+  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdgfx90a]
   agent_pool: mi250_1
   fast_check: true
   torch_nightly: true
@@ -1453,18 +1454,6 @@ steps:
   timeout_in_minutes: 180
   mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325]
   agent_pool: mi325_1
-  optional: true
-  source_file_dependencies:
-  - tests/standalone_tests/python_only_compile_rocm.sh
-  - setup.py
-  commands:
-  - bash standalone_tests/python_only_compile_rocm.sh
-
-- label: Basic Correctness Test # 20min
-  timeout_in_minutes: 30
-  mirror_hardwares: [amdexperimental, amdproduction]
-  agent_pool: mi325_1
-  # grade: Blocking
   fast_check: true
   torch_nightly: true
   working_dir: "/vllm-workspace/tests"
@@ -3005,72 +2994,6 @@ steps:
   timeout_in_minutes: 180
   mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355]
   agent_pool: mi355_1
-  optional: true
-  soft_fail: true
-  source_file_dependencies:
-  - requirements/nightly_torch_test.txt
-  commands:
-  - bash standalone_tests/pytorch_nightly_dependency.sh
-
-- label: Async Engine, Inputs, Utils, Worker Test # 10min
-  timeout_in_minutes: 15
-  mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
-  agent_pool: mi355_1
-  optional: true
-  source_file_dependencies:
-  - vllm/
-  - tests/multimodal
-  - tests/utils_
-  commands:
-  - pytest -v -s -m 'not cpu_test' multimodal
-  - pytest -v -s utils_
-
-- label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 20min
-  timeout_in_minutes: 30
-  mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
-  agent_pool: mi355_1
-  optional: true
-  source_file_dependencies:
-  - vllm/
-  - tests/test_inputs.py
-  - tests/test_outputs.py
-  - tests/test_pooling_params.py
-  - tests/multimodal
-  - tests/renderers
-  - tests/standalone_tests/lazy_imports.py
-  - tests/tokenizers_
-  - tests/tool_parsers
-  - tests/transformers_utils
-  - tests/config
-  no_gpu: true
-  commands:
-  - python3 standalone_tests/lazy_imports.py
-  - pytest -v -s test_inputs.py
-  - pytest -v -s test_outputs.py
-  - pytest -v -s test_pooling_params.py
-  - pytest -v -s -m 'cpu_test' multimodal
-  - pytest -v -s renderers
-  - pytest -v -s tokenizers_
-  - pytest -v -s tool_parsers
-  - pytest -v -s transformers_utils
-  - pytest -v -s config
-
-- label: Python-only Installation Test # 10min
-  timeout_in_minutes: 20
-  mirror_hardwares: [amdexperimental]
-  agent_pool: mi355_1
-  optional: true
-  source_file_dependencies:
-  - tests/standalone_tests/python_only_compile_rocm.sh
-  - setup.py
-  commands:
-  - bash standalone_tests/python_only_compile_rocm.sh
-
-- label: Basic Correctness Test # 20min
-  timeout_in_minutes: 30
-  mirror_hardwares: [amdexperimental, amdproduction]
-  agent_pool: mi355_1
-  optional: true
   fast_check: true
   torch_nightly: true
   working_dir: "/vllm-workspace/tests"

From 12f3da72ef86f5df9e5863d07cd8fdd65880eb88 Mon Sep 17 00:00:00 2001
From: Andreas Karatzas <akaratza@amd.com>
Date: Thu, 26 Mar 2026 00:29:37 -0500
Subject: [PATCH 20/23] [ROCm][CI] Wire CI_BASE_IMAGE into bake targets and
 Jinja build steps

Signed-off-by: Andreas Karatzas <akaratza@amd.com>
---
 .buildkite/test-amd.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml
index 254f9f5d0b00..c75707b659b3 100644
--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
@@ -193,9 +193,9 @@ steps:
   - bash standalone_tests/python_only_compile_rocm.sh
 
 
-- label: Basic Correctness Test # TBD
+- label: Basic Correctness # TBD
   timeout_in_minutes: 180
-  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdgfx90a]
+  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250]
   agent_pool: mi250_1
   fast_check: true
   torch_nightly: true

From 758ef6a45f3d1c62377a6d67f3907c5d63bdc22b Mon Sep 17 00:00:00 2001
From: Andreas Karatzas <akaratza@amd.com>
Date: Thu, 26 Mar 2026 02:01:05 -0500
Subject: [PATCH 21/23] [ROCm][CI] Wire CI_BASE_IMAGE into bake targets and
 Jinja build steps

Signed-off-by: Andreas Karatzas <akaratza@amd.com>
---
 docker/Dockerfile.rocm      | 17 ++++++++++++++---
 docker/docker-bake-rocm.hcl | 18 ++++++++++++++++++
 2 files changed, 32 insertions(+), 3 deletions(-)

diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm
index dd80c284f69c..28357fa2d48a 100644
--- a/docker/Dockerfile.rocm
+++ b/docker/Dockerfile.rocm
@@ -26,7 +26,6 @@ RUN apt-get update -q -y && apt-get install -q -y \
 # some packages (e.g. aiter) use JIT compilation at runtime with flags
 # that mold doesn't support (--cref). Build stages opt in via CMAKE_LINKER_TYPE
 # or LDFLAGS="-fuse-ld=mold".
-RUN python3 -m pip install --upgrade pip
 # Remove sccache only if not using sccache (it exists in base image from Dockerfile.rocm_base)
 ARG USE_SCCACHE
 RUN if [ "$USE_SCCACHE" != "1" ]; then \
@@ -392,6 +391,20 @@ RUN --mount=type=cache,target=/root/.cache/uv \
     uv pip install --system hf_transfer pytest-shard
 ENV HF_HUB_ENABLE_HF_TRANSFER=1
 
+# Pre-install vLLM runtime + test dependencies (stable between PRs).
+# The per-PR test stage re-runs the same install, but uv resolves in <100ms
+# because 99% of packages are already present from ci_base.
+COPY requirements/rocm.txt /tmp/rocm-reqs.txt
+COPY requirements/rocm-test.txt /tmp/rocm-test-reqs.txt
+RUN --mount=type=cache,target=/root/.cache/uv \
+    uv pip install --system -r /tmp/rocm-reqs.txt \
+    && if grep -q 'git+' /tmp/rocm-test-reqs.txt; then \
+           pip install -r /tmp/rocm-test-reqs.txt; \
+       else \
+           uv pip install --system -r /tmp/rocm-test-reqs.txt; \
+       fi \
+    && rm /tmp/rocm-reqs.txt /tmp/rocm-test-reqs.txt
+
 # Suppress MIOpen 3D convolution performance regressions
 # See: https://github.com/pytorch/pytorch/issues/169857
 ENV MIOPEN_DEBUG_CONV_DIRECT=0
@@ -402,8 +415,6 @@ ENV MIOPEN_DEBUG_CONV_GEMM=0
 # Only rebuilds vLLM wheel + workspace, which change every PR.
 FROM ${CI_BASE_IMAGE} AS test
 
-RUN python3 -m pip install --upgrade pip && rm -rf /var/lib/apt/lists/*
-
 # Install vLLM dependencies and test requirements
 RUN --mount=type=bind,from=export_vllm,src=/,target=/install \
     --mount=type=cache,target=/root/.cache/uv \
diff --git a/docker/docker-bake-rocm.hcl b/docker/docker-bake-rocm.hcl
index ad7b933a788c..24e50393d9a7 100644
--- a/docker/docker-bake-rocm.hcl
+++ b/docker/docker-bake-rocm.hcl
@@ -37,6 +37,13 @@ variable "VLLM_BRANCH" {
   default = "main"
 }
 
+# CI_BASE_IMAGE: pre-built ci_base image that the per-PR "test" stage builds FROM.
+# Defaults to the published "rocm/vllm-dev:ci_base" tag; set the CI_BASE_IMAGE
+# environment variable to use a different (e.g. dated or locally built) image.
+variable "CI_BASE_IMAGE" {
+  default = "rocm/vllm-dev:ci_base"
+}
+
 group "default" {
   targets = ["test-rocm"]
 }
@@ -49,6 +56,7 @@ target "_common-rocm" {
     ARG_PYTORCH_ROCM_ARCH = PYTORCH_ROCM_ARCH
     REMOTE_VLLM           = REMOTE_VLLM
     VLLM_BRANCH           = VLLM_BRANCH
+    CI_BASE_IMAGE         = CI_BASE_IMAGE
   }
 }
 
@@ -119,6 +127,16 @@ target "test-rocm-gfx950-ci" {
   inherits = ["test-rocm-gfx950"]
 }
 
+# CI base image target — builds only the ci_base stage (RIXL, DeepEP,
+# torchcodec, requirements, etc.). Used by the weekly scheduled build and
+# the auto-rebuild trigger when requirements change in a PR.
+target "ci-base-rocm" {
+  inherits = ["_common-rocm", "_labels"]
+  target   = "ci_base"
+  tags     = ["rocm/vllm-dev:ci_base"]
+  output   = ["type=docker"]
+}
+
 target "final-rocm" {
   inherits = ["_common-rocm", "_labels"]
   target   = "final"

From 3c5c2ab5810447926d3adeb0b1eaa1a30d239fce Mon Sep 17 00:00:00 2001
From: Andreas Karatzas <akaratza@amd.com>
Date: Thu, 26 Mar 2026 04:31:50 -0500
Subject: [PATCH 22/23] [ROCm][CI] Wire CI_BASE_IMAGE into bake targets and
 Jinja build steps

Signed-off-by: Andreas Karatzas <akaratza@amd.com>
---
 .buildkite/scripts/ci-bake.sh |    2 +-
 .pre-commit-config.yaml       |   37 +-
 docker/Dockerfile.rocm        |   45 +-
 requirements/rocm-test.in     |   83 ++
 requirements/rocm-test.txt    | 1451 ++++++++++++++++++++++++++++++---
 5 files changed, 1489 insertions(+), 129 deletions(-)
 create mode 100644 requirements/rocm-test.in

diff --git a/.buildkite/scripts/ci-bake.sh b/.buildkite/scripts/ci-bake.sh
index f40c4a2a37dd..4eb690ec403d 100644
--- a/.buildkite/scripts/ci-bake.sh
+++ b/.buildkite/scripts/ci-bake.sh
@@ -37,7 +37,7 @@
 set -euo pipefail
 
 # Check if image already exists (skip build if it does)
-if [[ -n "${IMAGE_TAG:-}" ]]; then
+if [[ -n "${IMAGE_TAG:-}" && "${FORCE_BUILD:-0}" != "1" ]]; then
     echo "--- :mag: Checking if image exists"
     if docker manifest inspect "${IMAGE_TAG}" >/dev/null 2>&1; then
         echo "Image already exists: ${IMAGE_TAG}"
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 0b17ad7335c7..e53274480cc0 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -36,11 +36,46 @@ repos:
   hooks:
   - id: actionlint
 - repo: https://github.com/astral-sh/uv-pre-commit
-  rev: 0.9.1
+  rev: 0.11.1
   hooks:
     - id: pip-compile
       args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu129, --python-platform, x86_64-manylinux_2_28, --python-version, "3.12"]
       files: ^requirements/test\.(in|txt)$
+    - id: pip-compile
+      alias: pip-compile-rocm
+      name: pip-compile-rocm
+      args: [
+        requirements/rocm-test.in, -o, requirements/rocm-test.txt,
+        --index-strategy, unsafe-best-match,
+        -c, requirements/rocm.txt,
+        --python-platform, x86_64-manylinux_2_28,
+        --python-version, "3.12",
+        # Exclude torch and CUDA/NVIDIA packages
+        --no-emit-package, torch,
+        --no-emit-package, torchvision,
+        --no-emit-package, torchaudio,
+        --no-emit-package, triton,
+        --no-emit-package, cuda-bindings,
+        --no-emit-package, cuda-pathfinder,
+        --no-emit-package, cuda-toolkit,
+        --no-emit-package, cupy-cuda12x,
+        --no-emit-package, nvidia-cublas,
+        --no-emit-package, nvidia-cuda-cupti,
+        --no-emit-package, nvidia-cuda-nvrtc,
+        --no-emit-package, nvidia-cuda-runtime,
+        --no-emit-package, nvidia-cudnn-cu13,
+        --no-emit-package, nvidia-cufft,
+        --no-emit-package, nvidia-cufile,
+        --no-emit-package, nvidia-curand,
+        --no-emit-package, nvidia-cusolver,
+        --no-emit-package, nvidia-cusparse,
+        --no-emit-package, nvidia-cusparselt-cu13,
+        --no-emit-package, nvidia-nccl-cu13,
+        --no-emit-package, nvidia-nvjitlink,
+        --no-emit-package, nvidia-nvshmem-cu13,
+        --no-emit-package, nvidia-nvtx,
+      ]
+      files: ^requirements/rocm-test\.(in|txt)$
 - repo: local
   hooks:
   - id: format-torch-nightly-test
diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm
index 28357fa2d48a..3be12e4f9781 100644
--- a/docker/Dockerfile.rocm
+++ b/docker/Dockerfile.rocm
@@ -356,19 +356,16 @@ COPY --from=build_vllm_wheel_release ${COMMON_WORKDIR}/vllm/vllm/v1 /vllm_v1
 # these slow layers from scratch every commit.
 FROM base AS ci_base
 
-# Install RIXL wheel (pre-built in build_rixl stage)
+# Install RIXL + DeepEP wheels (pre-built in earlier stages)
 RUN --mount=type=bind,from=build_rixl,src=/app/install,target=/rixl_install \
-    uv pip install --system /rixl_install/*.whl
-
-# Install DeepEP wheel (pre-built in build_deepep stage)
-RUN --mount=type=bind,from=build_deepep,src=/app/deep_install,target=/deep_install \
-    uv pip install --system /deep_install/*.whl
+    --mount=type=bind,from=build_deepep,src=/app/deep_install,target=/deep_install \
+    uv pip install --system /rixl_install/*.whl /deep_install/*.whl
 
 # Copy rocshmem runtime libraries
 COPY --from=build_rocshmem /opt/rocshmem /opt/rocshmem
 
-# RDMA userspace libraries (RIXL/MoRIIO runtime) + FFmpeg dev libs (torchcodec).
-# Combined into one apt-get to avoid a redundant apt-get update round-trip.
+# RDMA userspace libraries (RIXL/MoRIIO runtime) + FFmpeg dev libs (torchcodec
+# links against libav* at runtime, so the -dev packages must stay installed).
 RUN apt-get update -q -y && apt-get install -q -y --no-install-recommends \
     librdmacm1 \
     libibverbs1 \
@@ -379,31 +376,24 @@ RUN apt-get update -q -y && apt-get install -q -y --no-install-recommends \
     && rm -rf /var/lib/apt/lists/*
 
 # Install torchcodec from source (ROCm/torch version mismatch prevents PyPI install).
-# Pre-building here avoids rebuilding it on every per-PR test image.
 COPY tools/install_torchcodec_rocm.sh /tmp/install_torchcodec.sh
 RUN bash /tmp/install_torchcodec.sh \
     && rm /tmp/install_torchcodec.sh \
-    && apt-get clean \
-    && rm -rf /var/lib/apt/lists/*
+    && apt-get clean && rm -rf /var/lib/apt/lists/*
 
-# Stable test tooling that doesn't depend on the vLLM wheel
+# hf_transfer for fast model downloads (pytest-shard is in rocm-test.txt lockfile)
 RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system hf_transfer pytest-shard
+    uv pip install --system hf_transfer
 ENV HF_HUB_ENABLE_HF_TRANSFER=1
 
-# Pre-install vLLM runtime + test dependencies (stable between PRs).
+# Pre-install vLLM test dependencies (stable between PRs).
+# rocm-test.txt is a fully-resolved lockfile (no relative -r includes).
 # The per-PR test stage re-runs the same install, but uv resolves in <100ms
 # because 99% of packages are already present from ci_base.
-COPY requirements/rocm.txt /tmp/rocm-reqs.txt
 COPY requirements/rocm-test.txt /tmp/rocm-test-reqs.txt
 RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system -r /tmp/rocm-reqs.txt \
-    && if grep -q 'git+' /tmp/rocm-test-reqs.txt; then \
-           pip install -r /tmp/rocm-test-reqs.txt; \
-       else \
-           uv pip install --system -r /tmp/rocm-test-reqs.txt; \
-       fi \
-    && rm /tmp/rocm-reqs.txt /tmp/rocm-test-reqs.txt
+    uv pip install --system -r /tmp/rocm-test-reqs.txt \
+    && rm /tmp/rocm-test-reqs.txt
 
 # Suppress MIOpen 3D convolution performance regressions
 # See: https://github.com/pytorch/pytorch/issues/169857
@@ -415,17 +405,12 @@ ENV MIOPEN_DEBUG_CONV_GEMM=0
 # Only rebuilds vLLM wheel + workspace, which change every PR.
 FROM ${CI_BASE_IMAGE} AS test
 
-# Install vLLM dependencies and test requirements
+# Install the vLLM wheel.
+# Test deps are already in ci_base. The rocm-test.txt lockfile is compiled with
+# -c rocm.txt, which only pins versions; other deps resolve from wheel metadata.
 RUN --mount=type=bind,from=export_vllm,src=/,target=/install \
     --mount=type=cache,target=/root/.cache/uv \
     cd /install \
-    && uv pip install --system -r requirements/rocm.txt \
-    && if grep -q 'git+' requirements/rocm-test.txt; then \
-           pip install -r requirements/rocm-test.txt; \
-       else \
-           uv pip install --system -r requirements/rocm-test.txt; \
-       fi \
-    && pip uninstall -y vllm \
     && uv pip install --system *.whl
 
 # Store the vLLM wheel in the image for python_only_compile_rocm.sh.
diff --git a/requirements/rocm-test.in b/requirements/rocm-test.in
new file mode 100644
index 000000000000..856fab7e9f65
--- /dev/null
+++ b/requirements/rocm-test.in
@@ -0,0 +1,83 @@
+# testing
+pytest
+tensorizer==2.10.1
+pytest-forked
+pytest-asyncio
+pytest-rerunfailures
+pytest-shard
+pytest-timeout
+pytest-cov
+
+# testing utils
+albumentations # required for Nemotron Parse in test_common.py
+av # required for audio_in_video tests
+backoff # required for phi4mm test
+blobfile # required for kimi-vl test
+einops # required for MPT, qwen-vl
+httpx
+librosa # required for audio tests
+vector_quantize_pytorch # required for minicpmo_26 test
+vocos # required for minicpmo_26 test
+peft>=0.15.0 # required for phi-4-mm test
+pqdm
+ray[cgraph,default]>=2.48.0 # Ray Compiled Graph, required by pipeline parallelism tests
+resampy # required for audio tests
+sentence-transformers>=5.2.0 # required for embedding tests
+soundfile # required for audio tests
+jiwer # required for audio tests
+tblib # for pickling test exceptions
+timm>=1.0.17 # required for internvl and gemma3n-mm test
+transformers_stream_generator # required for qwen-vl test
+matplotlib # required for qwen-vl test
+mistral_common[image,audio]>=1.10.0 # required for voxtral test
+num2words # required for smolvlm test
+open_clip_torch==2.32.0 # Required for nemotron_vl test, Nemotron Parse in test_common.py
+opencv-python-headless>=4.13.0 # required for video test
+datamodel_code_generator # required for minicpm3 test
+lm-eval[api]>=0.4.11 # required for model evaluation test
+mteb[bm25s]>=2, <3 # required for mteb test
+transformers==4.57.5
+tokenizers==0.22.0
+schemathesis>=3.39.15 # Required for openai schema test
+# quantization
+bitsandbytes==0.49.2
+buildkite-test-collector==0.1.9
+
+genai_perf>=0.0.8
+tritonclient>=2.51.0
+
+# The version of gRPC libraries should be consistent with each other
+grpcio==1.78.0
+grpcio-reflection==1.78.0
+
+arctic-inference==0.1.1 # Required for suffix decoding test
+numba==0.61.2 # Required for N-gram speculative decoding
+numpy
+runai-model-streamer[s3,gcs,azure]==0.15.7
+fastsafetensors>=0.2.2 # 0.2.2 contains important fixes for multi-GPU mem usage
+instanttensor>=0.1.5
+pydantic>=2.12 # 2.11 leads to error on python 3.13
+decord==0.6.0
+
+# Prithvi tests
+terratorch>=1.2.2
+imagehash # Required for Prithvi tests
+segmentation-models-pytorch>0.4.0 # Required for Prithvi tests
+
+gpt-oss>=0.0.7; python_version > '3.11'
+
+perceptron # required for isaac test
+kaldi-native-fbank>=1.18.7 # required for fireredasr2 test
+
+# Newer versions of datasets require torchcodec, which makes the tests fail in CI because of a missing library.
+# Older versions are in conflict with terratorch requirements.
+datasets>=3.3.0,<=3.6.0
+
+openpyxl # required for perf comparison excel report
+plotly # required for perf comparison html report
+
+# ROCm-specific extras (not in CUDA test.in)
+rapidfuzz
+torchgeo==0.7.0
+multiprocess==0.70.16
+huggingface-hub==0.36.2
diff --git a/requirements/rocm-test.txt b/requirements/rocm-test.txt
index 9428ee112ae2..dd4c7c24f40c 100644
--- a/requirements/rocm-test.txt
+++ b/requirements/rocm-test.txt
@@ -1,116 +1,1373 @@
-# Common dependencies
--r common.txt
-
-# Test infrastructure
-tblib==3.1.0
-pytest==8.3.5
-pytest-asyncio==0.24.0
-pytest-timeout==2.3.1
-pytest-cov==6.3.0
-pytest-forked==1.6.0
-pytest-rerunfailures==14.0
-pytest-shard==0.1.2
-
-# Async/HTTP dependencies
-anyio==4.6.2.post1
-    # via httpx, starlette
+# This file was autogenerated by uv via the following command:
+#    uv pip compile requirements/rocm-test.in -o requirements/rocm-test.txt --index-strategy unsafe-best-match -c requirements/rocm.txt --python-platform x86_64-manylinux_2_28 --python-version 3.12 --no-emit-package torch --no-emit-package torchvision --no-emit-package torchaudio --no-emit-package triton --no-emit-package cuda-bindings --no-emit-package cuda-pathfinder --no-emit-package cuda-toolkit --no-emit-package cupy-cuda12x --no-emit-package nvidia-cublas --no-emit-package nvidia-cuda-cupti --no-emit-package nvidia-cuda-nvrtc --no-emit-package nvidia-cuda-runtime --no-emit-package nvidia-cudnn-cu13 --no-emit-package nvidia-cufft --no-emit-package nvidia-cufile --no-emit-package nvidia-curand --no-emit-package nvidia-cusolver --no-emit-package nvidia-cusparse --no-emit-package nvidia-cusparselt-cu13 --no-emit-package nvidia-nccl-cu13 --no-emit-package nvidia-nvjitlink --no-emit-package nvidia-nvshmem-cu13 --no-emit-package nvidia-nvtx
+absl-py==2.4.0
+    # via
+    #   rouge-score
+    #   tensorboard
+accelerate==1.13.0
+    # via peft
+aenum==3.1.17
+    # via lightly
+affine==2.4.0
+    # via rasterio
+aiohappyeyeballs==2.6.1
+    # via aiohttp
 aiohttp==3.13.3
-    # via gpt-oss
-httpx==0.27.2
-    # HTTP testing
-
-# Audio processing dependencies
+    # via
+    #   -c requirements/common.txt
+    #   aiohttp-cors
+    #   fsspec
+    #   gpt-oss
+    #   lm-eval
+    #   ray
+aiohttp-cors==0.8.1
+    # via ray
+aiosignal==1.4.0
+    # via aiohttp
+albucore==0.1.2
+    # via terratorch
+albumentations==1.4.6
+    # via
+    #   -r requirements/rocm-test.in
+    #   terratorch
+alembic==1.18.4
+    # via optuna
+annotated-doc==0.0.4
+    # via
+    #   fastapi
+    #   typer
+annotated-types==0.7.0
+    # via pydantic
+antlr4-python3-runtime==4.9.3
+    # via
+    #   hydra-core
+    #   omegaconf
+anyio==4.6.2.post1
+    # via
+    #   httpx
+    #   starlette
+arctic-inference==0.1.1
+    # via -r requirements/rocm-test.in
+argcomplete==3.6.3
+    # via datamodel-code-generator
+arrow==1.4.0
+    # via isoduration
+attrs==26.1.0
+    # via
+    #   aiohttp
+    #   fiona
+    #   jsonlines
+    #   jsonschema
+    #   pytest-subtests
+    #   rasterio
+    #   referencing
 audioread==3.0.1
     # via librosa
+av==16.1.0
+    # via -r requirements/rocm-test.in
+azure-core==1.39.0
+    # via
+    #   azure-identity
+    #   azure-storage-blob
+azure-identity==1.25.3
+    # via runai-model-streamer-azure
+azure-storage-blob==12.28.0
+    # via runai-model-streamer-azure
+backoff==2.2.1
+    # via
+    #   -r requirements/rocm-test.in
+    #   schemathesis
+bitsandbytes==0.49.2
+    # via
+    #   -r requirements/rocm-test.in
+    #   lightning
+black==26.3.1
+    # via datamodel-code-generator
+blobfile==3.0.0
+    # via -r requirements/rocm-test.in
+bm25s==0.2.13
+    # via mteb
+boto3==1.42.74
+    # via
+    #   runai-model-streamer-s3
+    #   tensorizer
+botocore==1.42.74
+    # via
+    #   boto3
+    #   s3transfer
+bounded-pool-executor==0.0.3
+    # via pqdm
+buildkite-test-collector==0.1.9
+    # via -r requirements/rocm-test.in
+certifi==2026.2.25
+    # via
+    #   fiona
+    #   httpcore
+    #   httpx
+    #   lightly
+    #   pyogrio
+    #   pyproj
+    #   rasterio
+    #   requests
+    #   sentry-sdk
 cffi==1.17.1
-    # via soundfile
+    # via
+    #   cryptography
+    #   soundfile
+chardet==5.2.0
+    # via mbstrdecoder
+charset-normalizer==3.4.6
+    # via requests
+choreographer==1.2.1
+    # via kaleido
+chz==0.4.0
+    # via gpt-oss
+click==8.3.1
+    # via
+    #   black
+    #   click-plugins
+    #   cligj
+    #   fiona
+    #   jiwer
+    #   nltk
+    #   rasterio
+    #   ray
+    #   schemathesis
+    #   typer
+    #   uvicorn
+    #   wandb
+click-plugins==1.1.1.2
+    # via fiona
+cligj==0.7.2
+    # via
+    #   fiona
+    #   rasterio
+colorama==0.4.6
+    # via
+    #   perceptron
+    #   sacrebleu
+    #   schemathesis
+colorful==0.5.8
+    # via ray
+colorlog==6.10.1
+    # via optuna
+contourpy==1.3.3
+    # via matplotlib
+coverage==7.13.5
+    # via pytest-cov
+cramjam==2.11.0
+    # via fastparquet
+cryptography==46.0.0
+    # via
+    #   azure-identity
+    #   azure-storage-blob
+    #   google-auth
+    #   msal
+    #   pyjwt
+cycler==0.12.1
+    # via matplotlib
+datamodel-code-generator==0.55.0
+    # via -r requirements/rocm-test.in
+dataproperty==1.1.0
+    # via
+    #   pytablewriter
+    #   tabledata
+datasets==3.6.0
+    # via
+    #   -r requirements/rocm-test.in
+    #   evaluate
+    #   lm-eval
+    #   mteb
 decorator==5.2.1
     # via librosa
+decord==0.6.0
+    # via -r requirements/rocm-test.in
+diffusers==0.37.0
+    # via terratorch
+dill==0.3.8
+    # via
+    #   datasets
+    #   evaluate
+    #   lm-eval
+    #   multiprocess
+distlib==0.4.0
+    # via virtualenv
+docker==7.1.0
+    # via gpt-oss
+docopt==0.6.2
+    # via num2words
+docstring-parser==0.17.0
+    # via jsonargparse
+einops==0.8.2
+    # via
+    #   -r requirements/rocm-test.in
+    #   encodec
+    #   terratorch
+    #   torchgeo
+    #   vector-quantize-pytorch
+    #   vocos
+einx==0.4.2
+    # via vector-quantize-pytorch
+encodec==0.1.1
+    # via vocos
+et-xmlfile==2.0.0
+    # via openpyxl
+evaluate==0.4.6
+    # via lm-eval
+fastapi==0.135.2
+    # via
+    #   -c requirements/common.txt
+    #   gpt-oss
+fastparquet==2026.3.0
+    # via genai-perf
+fastsafetensors==0.2.2
+    # via -r requirements/rocm-test.in
+filelock==3.25.2
+    # via
+    #   -c requirements/common.txt
+    #   blobfile
+    #   datasets
+    #   diffusers
+    #   huggingface-hub
+    #   python-discovery
+    #   ray
+    #   torch
+    #   transformers
+    #   virtualenv
+fiona==1.10.1
+    # via torchgeo
+fonttools==4.62.1
+    # via matplotlib
+fqdn==1.5.1
+    # via jsonschema
+frozendict==2.4.7
+    # via einx
+frozenlist==1.8.0
+    # via
+    #   aiohttp
+    #   aiosignal
+fsspec==2025.3.0
+    # via
+    #   datasets
+    #   evaluate
+    #   fastparquet
+    #   huggingface-hub
+    #   lightning
+    #   pytorch-lightning
+    #   tacoreader
+    #   torch
+ftfy==6.3.1
+    # via open-clip-torch
+genai-perf==0.0.16
+    # via -r requirements/rocm-test.in
+genson==1.3.0
+    # via datamodel-code-generator
+geopandas==1.1.3
+    # via terratorch
+gitdb==4.0.12
+    # via gitpython
+gitpython==3.1.46
+    # via wandb
+google-api-core==2.30.0
+    # via
+    #   google-cloud-core
+    #   google-cloud-storage
+    #   opencensus
+google-auth==2.49.1
+    # via
+    #   google-api-core
+    #   google-cloud-core
+    #   google-cloud-storage
+    #   runai-model-streamer-gcs
+google-cloud-core==2.5.0
+    # via google-cloud-storage
+google-cloud-storage==3.10.1
+    # via runai-model-streamer-gcs
+google-crc32c==1.8.0
+    # via
+    #   google-cloud-storage
+    #   google-resumable-media
+google-resumable-media==2.8.0
+    # via google-cloud-storage
+googleapis-common-protos==1.73.0
+    # via google-api-core
+gpt-oss==0.0.8
+    # via -r requirements/rocm-test.in
+graphql-core==3.2.8
+    # via hypothesis-graphql
+greenlet==3.3.2
+    # via sqlalchemy
+grpcio==1.78.0
+    # via
+    #   -c requirements/rocm.txt
+    #   -r requirements/rocm-test.in
+    #   grpcio-reflection
+    #   ray
+    #   tensorboard
+grpcio-reflection==1.78.0
+    # via
+    #   -c requirements/rocm.txt
+    #   -r requirements/rocm-test.in
+h11==0.16.0
+    # via
+    #   httpcore
+    #   uvicorn
+h2==4.3.0
+    # via httpx
+h5py==3.16.0
+    # via terratorch
+harfile==0.4.0
+    # via schemathesis
+hf-xet==1.4.2
+    # via huggingface-hub
+hiredis==3.3.1
+    # via tensorizer
+hpack==4.1.0
+    # via h2
+html2text==2025.4.15
+    # via gpt-oss
+httpcore==1.0.9
+    # via httpx
+httpx==0.27.2
+    # via
+    #   -r requirements/rocm-test.in
+    #   diffusers
+    #   perceptron
+    #   schemathesis
+huggingface-hub==0.36.2
+    # via
+    #   -r requirements/rocm-test.in
+    #   accelerate
+    #   datasets
+    #   diffusers
+    #   evaluate
+    #   open-clip-torch
+    #   peft
+    #   segmentation-models-pytorch
+    #   sentence-transformers
+    #   terratorch
+    #   timm
+    #   tokenizers
+    #   transformers
+    #   vocos
+humanize==4.15.0
+    # via runai-model-streamer
+hydra-core==1.3.2
+    # via
+    #   lightly
+    #   lightning
+hyperframe==6.1.0
+    # via h2
+hypothesis==6.151.9
+    # via
+    #   hypothesis-graphql
+    #   hypothesis-jsonschema
+    #   schemathesis
+hypothesis-graphql==0.12.0
+    # via schemathesis
+hypothesis-jsonschema==0.23.1
+    # via schemathesis
+idna==3.11
+    # via
+    #   anyio
+    #   httpx
+    #   jsonschema
+    #   requests
+    #   yarl
+imagehash==4.3.2
+    # via -r requirements/rocm-test.in
+imageio==2.37.3
+    # via scikit-image
+importlib-metadata==8.7.1
+    # via
+    #   diffusers
+    #   opentelemetry-api
+importlib-resources==6.5.2
+    # via typeshed-client
+inflect==7.5.0
+    # via datamodel-code-generator
+iniconfig==2.3.0
+    # via pytest
+instanttensor==0.1.6
+    # via -r requirements/rocm-test.in
+isodate==0.7.2
+    # via azure-storage-blob
+isoduration==20.11.0
+    # via jsonschema
+isort==8.0.1
+    # via datamodel-code-generator
+jinja2==3.1.6
+    # via
+    #   datamodel-code-generator
+    #   genai-perf
+    #   lm-eval
+    #   torch
+jiwer==4.0.0
+    # via -r requirements/rocm-test.in
+jmespath==1.1.0
+    # via
+    #   boto3
+    #   botocore
+joblib==1.5.3
+    # via
+    #   librosa
+    #   nltk
+    #   scikit-learn
+jsonargparse==4.47.0
+    # via
+    #   lightning
+    #   terratorch
+jsonlines==4.0.0
+    # via lm-eval
+jsonnet==0.21.0
+    # via jsonargparse
+jsonpointer==3.1.0
+    # via jsonschema
+jsonschema==4.26.0
+    # via
+    #   hypothesis-jsonschema
+    #   mistral-common
+    #   ray
+    #   schemathesis
+jsonschema-specifications==2025.9.1
+    # via jsonschema
+junit-xml==1.9
+    # via schemathesis
+kaldi-native-fbank==1.22.3
+    # via -r requirements/rocm-test.in
+kaleido==1.0.0
+    # via genai-perf
+kiwisolver==1.5.0
+    # via matplotlib
+kornia==0.8.2
+    # via torchgeo
+kornia-rs==0.1.10
+    # via kornia
 lazy-loader==0.4
-    # via librosa
+    # via
+    #   librosa
+    #   scikit-image
+libnacl==2.1.0
+    # via tensorizer
+librosa==0.10.2.post1
+    # via -r requirements/rocm-test.in
+lightly==1.5.22
+    # via
+    #   terratorch
+    #   torchgeo
+lightly-utils==0.0.2
+    # via lightly
+lightning==2.6.1
+    # via
+    #   terratorch
+    #   torchgeo
+lightning-utilities==0.15.3
+    # via
+    #   lightning
+    #   pytorch-lightning
+    #   torchmetrics
+llvmlite==0.44.0
+    # via numba
+lm-eval==0.4.11
+    # via -r requirements/rocm-test.in
+logistro==2.0.1
+    # via
+    #   choreographer
+    #   kaleido
+lxml==6.0.2
+    # via
+    #   blobfile
+    #   gpt-oss
+    #   sacrebleu
+mako==1.3.10
+    # via alembic
+markdown==3.10.2
+    # via tensorboard
+markdown-it-py==4.0.0
+    # via rich
+markupsafe==3.0.3
+    # via
+    #   jinja2
+    #   mako
+    #   werkzeug
+matplotlib==3.10.8
+    # via
+    #   -r requirements/rocm-test.in
+    #   lightning
+    #   torchgeo
+mbstrdecoder==1.1.4
+    # via
+    #   dataproperty
+    #   pytablewriter
+    #   typepy
+mdurl==0.1.2
+    # via markdown-it-py
+mistral-common==1.10.0
+    # via
+    #   -c requirements/common.txt
+    #   -r requirements/rocm-test.in
+more-itertools==10.8.0
+    # via
+    #   inflect
+    #   lm-eval
+mpmath==1.3.0
+    # via sympy
+msal==1.35.1
+    # via
+    #   azure-identity
+    #   msal-extensions
+msal-extensions==1.3.1
+    # via azure-identity
+msgpack==1.1.2
+    # via
+    #   librosa
+    #   ray
+mteb==2.11.5
+    # via -r requirements/rocm-test.in
+multidict==6.7.1
+    # via
+    #   aiohttp
+    #   yarl
+multiprocess==0.70.16
+    # via
+    #   -r requirements/rocm-test.in
+    #   datasets
+    #   evaluate
+mypy-extensions==1.1.0
+    # via black
+narwhals==2.18.0
+    # via plotly
+networkx==3.6.1
+    # via
+    #   scikit-image
+    #   torch
+nltk==3.9.3
+    # via rouge-score
+num2words==0.5.14
+    # via -r requirements/rocm-test.in
+numba==0.61.2
+    # via
+    #   -c requirements/rocm.txt
+    #   -r requirements/rocm-test.in
+    #   librosa
+    #   resampy
+numkong==7.1.1
+    # via albucore
+numpy==2.2.6
+    # via
+    #   -r requirements/rocm-test.in
+    #   accelerate
+    #   albucore
+    #   albumentations
+    #   bitsandbytes
+    #   bm25s
+    #   contourpy
+    #   cupy-cuda12x
+    #   datasets
+    #   decord
+    #   diffusers
+    #   einx
+    #   encodec
+    #   evaluate
+    #   fastparquet
+    #   genai-perf
+    #   geopandas
+    #   h5py
+    #   imagehash
+    #   imageio
+    #   librosa
+    #   lightly
+    #   lightly-utils
+    #   lm-eval
+    #   matplotlib
+    #   mistral-common
+    #   mteb
+    #   numba
+    #   opencv-python-headless
+    #   optuna
+    #   pandas
+    #   patsy
+    #   peft
+    #   perceptron
+    #   pycocotools
+    #   pyogrio
+    #   pytrec-eval-terrier
+    #   pywavelets
+    #   rasterio
+    #   resampy
+    #   rioxarray
+    #   rouge-score
+    #   runai-model-streamer
+    #   sacrebleu
+    #   scikit-image
+    #   scikit-learn
+    #   scipy
+    #   segmentation-models-pytorch
+    #   sentence-transformers
+    #   shapely
+    #   soundfile
+    #   soxr
+    #   statsmodels
+    #   tensorboard
+    #   tensorboardx
+    #   tensorizer
+    #   terratorch
+    #   tifffile
+    #   torchgeo
+    #   torchmetrics
+    #   torchvision
+    #   transformers
+    #   tritonclient
+    #   vocos
+    #   xarray
+omegaconf==2.3.0
+    # via
+    #   hydra-core
+    #   lightning
+open-clip-torch==2.32.0
+    # via -r requirements/rocm-test.in
+openai-harmony==0.0.8
+    # via
+    #   -c requirements/common.txt
+    #   gpt-oss
+opencensus==0.11.4
+    # via ray
+opencensus-context==0.1.3
+    # via opencensus
+opencv-python-headless==4.13.0.92
+    # via
+    #   -c requirements/common.txt
+    #   -r requirements/rocm-test.in
+    #   albumentations
+    #   mistral-common
+openpyxl==3.1.5
+    # via -r requirements/rocm-test.in
+opentelemetry-api==1.40.0
+    # via
+    #   -c requirements/common.txt
+    #   opentelemetry-exporter-prometheus
+    #   opentelemetry-sdk
+    #   opentelemetry-semantic-conventions
+opentelemetry-exporter-prometheus==0.61b0
+    # via ray
+opentelemetry-proto==1.40.0
+    # via ray
+opentelemetry-sdk==1.40.0
+    # via
+    #   -c requirements/common.txt
+    #   opentelemetry-exporter-prometheus
+    #   ray
+opentelemetry-semantic-conventions==0.61b0
+    # via opentelemetry-sdk
+optuna==3.6.1
+    # via genai-perf
+orjson==3.11.7
+    # via
+    #   genai-perf
+    #   kaleido
+packaging==26.0
+    # via
+    #   -c requirements/rocm.txt
+    #   accelerate
+    #   bitsandbytes
+    #   black
+    #   datasets
+    #   evaluate
+    #   fastparquet
+    #   geopandas
+    #   huggingface-hub
+    #   hydra-core
+    #   kaleido
+    #   kornia
+    #   lazy-loader
+    #   lightning
+    #   lightning-utilities
+    #   matplotlib
+    #   optuna
+    #   peft
+    #   plotly
+    #   pooch
+    #   pyogrio
+    #   pytest
+    #   pytest-rerunfailures
+    #   pytorch-lightning
+    #   ray
+    #   rioxarray
+    #   scikit-image
+    #   statsmodels
+    #   tensorboard
+    #   tensorboardx
+    #   torchmetrics
+    #   transformers
+    #   typepy
+    #   wandb
+    #   xarray
+pandas==3.0.1
+    # via
+    #   datasets
+    #   evaluate
+    #   fastparquet
+    #   genai-perf
+    #   geopandas
+    #   statsmodels
+    #   tacoreader
+    #   torchgeo
+    #   xarray
+pathspec==1.0.4
+    # via black
+pathvalidate==3.3.1
+    # via pytablewriter
+patsy==1.0.2
+    # via statsmodels
+peft==0.18.1
+    # via -r requirements/rocm-test.in
+perceptron==0.1.4
+    # via -r requirements/rocm-test.in
+perf-analyzer==0.1.0
+    # via genai-perf
+pillow==12.1.1
+    # via
+    #   diffusers
+    #   genai-perf
+    #   imagehash
+    #   imageio
+    #   lightly-utils
+    #   matplotlib
+    #   mistral-common
+    #   perceptron
+    #   scikit-image
+    #   segmentation-models-pytorch
+    #   tensorboard
+    #   torchgeo
+    #   torchvision
 platformdirs==4.3.6
-    # via pooch
+    # via
+    #   black
+    #   pooch
+    #   python-discovery
+    #   virtualenv
+    #   wandb
+plotly==6.6.0
+    # via
+    #   -r requirements/rocm-test.in
+    #   genai-perf
+pluggy==1.6.0
+    # via
+    #   pytest
+    #   pytest-cov
+polars==1.39.3
+    # via mteb
+polars-runtime-32==1.39.3
+    # via polars
 pooch==1.8.2
     # via librosa
-soundfile==0.13.1
-    # via librosa
-soxr==0.5.0.post1
-    # via librosa
-librosa==0.10.2.post1
-
-# Retrieval and search
-bm25s==0.2.13
-    # via mteb
+portalocker==3.2.0
+    # via sacrebleu
+pqdm==0.2.0
+    # via -r requirements/rocm-test.in
+prometheus-client==0.24.1
+    # via
+    #   -c requirements/common.txt
+    #   opentelemetry-exporter-prometheus
+    #   ray
+propcache==0.4.1
+    # via
+    #   aiohttp
+    #   yarl
+proto-plus==1.27.1
+    # via google-api-core
+protobuf==6.33.6
+    # via
+    #   -c requirements/common.txt
+    #   google-api-core
+    #   googleapis-common-protos
+    #   grpcio-reflection
+    #   opentelemetry-proto
+    #   proto-plus
+    #   ray
+    #   tensorboard
+    #   tensorboardx
+    #   tensorizer
+    #   wandb
+psutil==7.2.2
+    # via
+    #   accelerate
+    #   peft
+    #   tensorizer
+py==1.11.0
+    # via pytest-forked
+py-spy==0.4.1
+    # via ray
+pyarrow==23.0.1
+    # via
+    #   datasets
+    #   genai-perf
+    #   tacoreader
+    #   terratorch
+pyasn1==0.6.3
+    # via pyasn1-modules
+pyasn1-modules==0.4.2
+    # via google-auth
+pycocotools==2.0.11
+    # via terratorch
+pycountry==26.2.16
+    # via pydantic-extra-types
+pycparser==3.0
+    # via cffi
+pycryptodomex==3.23.0
+    # via blobfile
+pydantic==2.12.5
+    # via
+    #   -c requirements/common.txt
+    #   -r requirements/rocm-test.in
+    #   albumentations
+    #   datamodel-code-generator
+    #   fastapi
+    #   gpt-oss
+    #   lightly
+    #   mistral-common
+    #   mteb
+    #   openai-harmony
+    #   pydantic-extra-types
+    #   ray
+    #   wandb
+pydantic-core==2.41.5
+    # via pydantic
+pydantic-extra-types==2.11.1
+    # via mistral-common
+pygments==2.19.2
+    # via rich
+pyjwt==2.12.1
+    # via msal
+pyogrio==0.12.1
+    # via geopandas
+pyparsing==3.3.2
+    # via
+    #   matplotlib
+    #   rasterio
+pyproj==3.7.2
+    # via
+    #   geopandas
+    #   rioxarray
+    #   torchgeo
+pyrate-limiter==3.9.0
+    # via schemathesis
 pystemmer==3.0.0
     # via mteb
-
-# Multi-modal processing
-av==16.1.0
-    # required for audio_in_video tests
-resampy==0.4.3
-    # audio processing, required for audio_in_video tests
-blobfile==3.0.0
-    # Multi-Modal Models Test
-decord==0.6.0
-    # video processing, required by entrypoints/openai/chat_completion/test_video.py
+pytablewriter==1.2.1
+    # via lm-eval
+pytest==8.3.5
+    # via
+    #   -r requirements/rocm-test.in
+    #   buildkite-test-collector
+    #   genai-perf
+    #   pytest-asyncio
+    #   pytest-cov
+    #   pytest-forked
+    #   pytest-mock
+    #   pytest-rerunfailures
+    #   pytest-shard
+    #   pytest-subtests
+    #   pytest-timeout
+    #   schemathesis
+pytest-asyncio==0.24.0
+    # via -r requirements/rocm-test.in
+pytest-cov==6.3.0
+    # via -r requirements/rocm-test.in
+pytest-forked==1.6.0
+    # via -r requirements/rocm-test.in
+pytest-mock==3.15.1
+    # via genai-perf
+pytest-rerunfailures==14.0
+    # via -r requirements/rocm-test.in
+pytest-shard==0.1.2
+    # via -r requirements/rocm-test.in
+pytest-subtests==0.14.2
+    # via schemathesis
+pytest-timeout==2.3.1
+    # via -r requirements/rocm-test.in
+python-box==7.4.1
+    # via terratorch
+python-dateutil==2.9.0.post0
+    # via
+    #   arrow
+    #   botocore
+    #   lightly
+    #   matplotlib
+    #   pandas
+    #   typepy
+python-discovery==1.2.0
+    # via virtualenv
+python-rapidjson==1.23
+    # via tritonclient
+pytokens==0.4.1
+    # via black
+pytorch-lightning==2.6.1
+    # via
+    #   lightly
+    #   lightning
+pytrec-eval-terrier==0.5.10
+    # via mteb
+pytz==2026.1.post1
+    # via typepy
+pywavelets==1.9.0
+    # via imagehash
+pyyaml==6.0.3
+    # via
+    #   accelerate
+    #   albumentations
+    #   datamodel-code-generator
+    #   datasets
+    #   genai-perf
+    #   huggingface-hub
+    #   jsonargparse
+    #   lightning
+    #   omegaconf
+    #   optuna
+    #   peft
+    #   pytorch-lightning
+    #   ray
+    #   responses
+    #   schemathesis
+    #   timm
+    #   transformers
+    #   vocos
+    #   wandb
 rapidfuzz==3.12.1
-
-# OpenAI compatibility and testing
-gpt-oss==0.0.8
-    # OpenAI compatibility tests
+    # via
+    #   -r requirements/rocm-test.in
+    #   jiwer
+rasterio==1.5.0
+    # via
+    #   rioxarray
+    #   terratorch
+    #   torchgeo
+ray==2.54.0
+    # via -r requirements/rocm-test.in
+redis==7.3.0
+    # via tensorizer
+referencing==0.37.0
+    # via
+    #   jsonschema
+    #   jsonschema-specifications
+regex==2026.2.28
+    # via
+    #   diffusers
+    #   nltk
+    #   open-clip-torch
+    #   sacrebleu
+    #   tiktoken
+    #   transformers
+requests==2.32.5
+    # via
+    #   -c requirements/common.txt
+    #   azure-core
+    #   buildkite-test-collector
+    #   datasets
+    #   diffusers
+    #   docker
+    #   evaluate
+    #   google-api-core
+    #   google-cloud-storage
+    #   gpt-oss
+    #   huggingface-hub
+    #   lightly
+    #   lm-eval
+    #   mistral-common
+    #   msal
+    #   mteb
+    #   pooch
+    #   ray
+    #   responses
+    #   schemathesis
+    #   starlette-testclient
+    #   tacoreader
+    #   tiktoken
+    #   transformers
+    #   wandb
+resampy==0.4.3
+    # via -r requirements/rocm-test.in
+responses==0.26.0
+    # via genai-perf
+rfc3339-validator==0.1.4
+    # via jsonschema
+rfc3987==1.3.8
+    # via jsonschema
+rich==14.3.3
+    # via
+    #   genai-perf
+    #   lightning
+    #   mteb
+    #   perceptron
+    #   terratorch
+    #   typer
+rioxarray==0.22.0
+    # via terratorch
+rouge-score==0.1.2
+    # via lm-eval
+rpds-py==0.30.0
+    # via
+    #   jsonschema
+    #   referencing
+rtree==1.4.1
+    # via torchgeo
+runai-model-streamer==0.15.7
+    # via
+    #   -c requirements/rocm.txt
+    #   -r requirements/rocm-test.in
+runai-model-streamer-azure==0.15.7
+    # via runai-model-streamer
+runai-model-streamer-gcs==0.15.7
+    # via runai-model-streamer
+runai-model-streamer-s3==0.15.7
+    # via runai-model-streamer
+s3transfer==0.16.0
+    # via boto3
+sacrebleu==2.6.0
+    # via lm-eval
+safetensors==0.7.0
+    # via
+    #   accelerate
+    #   diffusers
+    #   open-clip-torch
+    #   peft
+    #   segmentation-models-pytorch
+    #   timm
+    #   transformers
 schemathesis==3.39.15
-    # OpenAI schema test
-
-# Evaluation and benchmarking
-lm-eval[api]==0.4.11
-jiwer==4.0.0
-
-# Required for multiprocessed tests that use spawn method, Datasets and Evaluate Test
-multiprocess==0.70.16
-
-# Required for v1/metrics/test_engine_logger_apis.py
-ray[cgraph,default]>=2.48.0
-
+    # via -r requirements/rocm-test.in
+scikit-image==0.26.0
+    # via
+    #   albumentations
+    #   terratorch
+scikit-learn==1.8.0
+    # via
+    #   albumentations
+    #   librosa
+    #   lm-eval
+    #   mteb
+    #   sentence-transformers
+    #   terratorch
+scipy==1.17.1
+    # via
+    #   albumentations
+    #   bm25s
+    #   imagehash
+    #   librosa
+    #   mteb
+    #   pytrec-eval-terrier
+    #   scikit-image
+    #   scikit-learn
+    #   sentence-transformers
+    #   statsmodels
+    #   vocos
+segmentation-models-pytorch==0.5.0
+    # via
+    #   -r requirements/rocm-test.in
+    #   terratorch
+    #   torchgeo
+sentence-transformers==5.3.0
+    # via
+    #   -r requirements/rocm-test.in
+    #   mteb
+sentry-sdk==2.55.0
+    # via wandb
+setuptools==79.0.1
+    # via
+    #   -c requirements/common.txt
+    #   -c requirements/rocm.txt
+    #   pytablewriter
+    #   tensorboard
+    #   torch
+shapely==2.1.2
+    # via
+    #   geopandas
+    #   torchgeo
+shellingham==1.5.4
+    # via
+    #   perceptron
+    #   typer
+simplejson==3.20.2
+    # via choreographer
+six==1.17.0
+    # via
+    #   -c requirements/common.txt
+    #   junit-xml
+    #   lightly
+    #   opencensus
+    #   python-dateutil
+    #   rfc3339-validator
+    #   rouge-score
+smart-open==7.5.1
+    # via ray
+smmap==5.0.3
+    # via gitdb
+sniffio==1.3.1
+    # via
+    #   anyio
+    #   httpx
+sortedcontainers==2.4.0
+    # via hypothesis
+soundfile==0.13.1
+    # via
+    #   -r requirements/rocm-test.in
+    #   genai-perf
+    #   librosa
+    #   mistral-common
+soxr==0.5.0.post1
+    # via
+    #   librosa
+    #   mistral-common
+sqlalchemy==2.0.48
+    # via
+    #   alembic
+    #   optuna
+sqlitedict==2.1.0
+    # via lm-eval
+starlette==0.52.1
+    # via
+    #   fastapi
+    #   schemathesis
+    #   starlette-testclient
+starlette-testclient==0.4.1
+    # via schemathesis
+statsmodels==0.14.6
+    # via genai-perf
+stringzilla==4.6.0
+    # via albucore
+structlog==25.5.0
+    # via gpt-oss
+sympy==1.14.0
+    # via
+    #   einx
+    #   torch
+tabledata==1.3.4
+    # via pytablewriter
+tabulate==0.10.0
+    # via sacrebleu
+tacoreader==0.5.6
+    # via terratorch
+tblib==3.1.0
+    # via -r requirements/rocm-test.in
+tcolorpy==0.1.7
+    # via pytablewriter
+tenacity==9.1.4
+    # via
+    #   gpt-oss
+    #   lm-eval
+tensorboard==2.20.0
+    # via terratorch
+tensorboard-data-server==0.7.2
+    # via tensorboard
+tensorboardx==2.6.4
+    # via lightning
+tensorizer==2.10.1
+    # via
+    #   -c requirements/rocm.txt
+    #   -r requirements/rocm-test.in
+termcolor==3.3.0
+    # via
+    #   gpt-oss
+    #   terratorch
+terratorch==1.2.2
+    # via -r requirements/rocm-test.in
+threadpoolctl==3.6.0
+    # via scikit-learn
+tifffile==2026.3.3
+    # via
+    #   scikit-image
+    #   terratorch
+tiktoken==0.12.0
+    # via
+    #   -c requirements/common.txt
+    #   gpt-oss
+    #   lm-eval
+    #   mistral-common
+timm==1.0.17
+    # via
+    #   -c requirements/rocm.txt
+    #   -r requirements/rocm-test.in
+    #   open-clip-torch
+    #   segmentation-models-pytorch
+    #   terratorch
+    #   torchgeo
+tokenizers==0.22.0
+    # via
+    #   -c requirements/common.txt
+    #   -r requirements/rocm-test.in
+    #   transformers
+tomli==2.4.0
+    # via schemathesis
+tomli-w==1.2.0
+    # via schemathesis
 torchgeo==0.7.0
+    # via
+    #   -r requirements/rocm-test.in
+    #   terratorch
+torchmetrics==1.9.0
+    # via
+    #   lightning
+    #   pytorch-lightning
+    #   terratorch
+    #   torchgeo
+tqdm==4.67.3
+    # via
+    #   datasets
+    #   evaluate
+    #   huggingface-hub
+    #   lightly
+    #   lightning
+    #   lm-eval
+    #   mteb
+    #   nltk
+    #   open-clip-torch
+    #   optuna
+    #   peft
+    #   pqdm
+    #   pytorch-lightning
+    #   segmentation-models-pytorch
+    #   sentence-transformers
+    #   tacoreader
+    #   terratorch
+    #   transformers
+transformers==4.57.5
+    # via
+    #   -c requirements/common.txt
+    #   -r requirements/rocm-test.in
+    #   genai-perf
+    #   peft
+    #   sentence-transformers
+    #   transformers-stream-generator
+transformers-stream-generator==0.0.5
+    # via -r requirements/rocm-test.in
+tritonclient==2.66.0
+    # via -r requirements/rocm-test.in
+typeguard==4.5.1
+    # via inflect
+typepy==1.3.4
+    # via
+    #   dataproperty
+    #   pytablewriter
+    #   tabledata
+typer==0.24.1
+    # via
+    #   fastsafetensors
+    #   perceptron
+typeshed-client==2.9.0
+    # via jsonargparse
+typing-extensions==4.15.0
+    # via
+    #   -c requirements/common.txt
+    #   aiosignal
+    #   albumentations
+    #   alembic
+    #   azure-core
+    #   azure-identity
+    #   azure-storage-blob
+    #   chz
+    #   fastapi
+    #   grpcio
+    #   huggingface-hub
+    #   librosa
+    #   lightning
+    #   lightning-utilities
+    #   lm-eval
+    #   mistral-common
+    #   mteb
+    #   opentelemetry-api
+    #   opentelemetry-sdk
+    #   opentelemetry-semantic-conventions
+    #   pqdm
+    #   pydantic
+    #   pydantic-core
+    #   pydantic-extra-types
+    #   pytorch-lightning
+    #   referencing
+    #   sentence-transformers
+    #   sqlalchemy
+    #   starlette
+    #   torch
+    #   torchgeo
+    #   typeguard
+    #   typeshed-client
+    #   typing-inspection
+    #   wandb
+typing-inspection==0.4.2
+    # via
+    #   fastapi
+    #   pydantic
+tzdata==2025.3
+    # via arrow
+uri-template==1.3.0
+    # via jsonschema
+urllib3==2.6.3
+    # via
+    #   blobfile
+    #   botocore
+    #   docker
+    #   lightly
+    #   requests
+    #   responses
+    #   sentry-sdk
+    #   tritonclient
+uvicorn==0.42.0
+    # via gpt-oss
+vector-quantize-pytorch==1.28.0
+    # via -r requirements/rocm-test.in
+virtualenv==21.2.0
+    # via ray
+vocos==0.1.0
+    # via -r requirements/rocm-test.in
+wandb==0.25.1
     # via terratorch
-# MTEB Benchmark Test
-mteb[bm25s]>=2, <3
-
-# Utilities
-num2words==0.5.14
+wcwidth==0.6.0
+    # via ftfy
+webcolors==25.10.0
+    # via jsonschema
+werkzeug==3.1.6
+    # via
+    #   schemathesis
+    #   tensorboard
+word2number==1.1
     # via lm-eval
-pqdm==0.2.0
+wrapt==2.1.2
+    # via smart-open
+xarray==2026.2.0
+    # via rioxarray
+xxhash==3.6.0
+    # via
+    #   datasets
+    #   evaluate
+yarl==1.23.0
+    # via
+    #   aiohttp
+    #   schemathesis
+zipp==3.23.0
+    # via importlib-metadata
+zstandard==0.25.0
     # via lm-eval
 
-# Required for fastsafetensors test
-fastsafetensors==0.2.2
-# Required for suffix decoding test
-arctic-inference == 0.1.1
-# Required for Nemotron test
-open-clip-torch==2.32.0
-# Required for isaac Multi-Modal generation test
-perceptron==0.1.4
-# Required for the multi-modal models test
-timm==1.0.17
-# Required for plugins test
-albumentations==1.4.6
-# Pin transformers version
-transformers==4.57.5
-# Pin HF Hub version
-huggingface-hub==0.36.2
-# Pin Mistral Common
-mistral-common[image,audio]==1.10.0
-# Required for Prithvi tests
-terratorch==1.2.2
-# Required for Prithvi tests
-segmentation-models-pytorch==0.5.0
-# Required for Prithvi tests
-imagehash==4.3.2
-# Required for bitsandbytes quantization test
-bitsandbytes==0.49.2
-# Examples (tensorizer) tests
-tensorizer==2.10.1
-# Multi-modal models test (`allendou/FireRedASR2-LLM-vllm`)
-kaldi-native-fbank==1.22.3
-# Pinning numpy version
-numpy==2.2.6
+# The following packages were excluded from the output:
+# torch
+# torchvision
+# torchaudio
+# triton
+# cuda-bindings
+# cuda-pathfinder
+# cuda-toolkit
+# cupy-cuda12x
+# nvidia-cublas
+# nvidia-cuda-cupti
+# nvidia-cuda-nvrtc
+# nvidia-cuda-runtime
+# nvidia-cudnn-cu13
+# nvidia-cufft
+# nvidia-cufile
+# nvidia-curand
+# nvidia-cusolver
+# nvidia-cusparse
+# nvidia-cusparselt-cu13
+# nvidia-nccl-cu13
+# nvidia-nvjitlink
+# nvidia-nvshmem-cu13
+# nvidia-nvtx

From d2dd9b03aad43caf67552a4383842e793f50198e Mon Sep 17 00:00:00 2001
From: Andreas Karatzas <akaratza@amd.com>
Date: Thu, 26 Mar 2026 13:08:31 -0500
Subject: [PATCH 23/23] [ROCm][CI] Wire CI_BASE_IMAGE into bake targets and
 Jinja build steps

Signed-off-by: Andreas Karatzas <akaratza@amd.com>
---
 .buildkite/scripts/hardware_ci/run-amd-test.sh | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/.buildkite/scripts/hardware_ci/run-amd-test.sh b/.buildkite/scripts/hardware_ci/run-amd-test.sh
index c1c6549daf03..ca7ec6c5dd7c 100755
--- a/.buildkite/scripts/hardware_ci/run-amd-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh
@@ -380,7 +380,13 @@ wait_for_clean_gpus
 
 # --- Pull test image ---
 echo "--- Pulling container"
-image_name="${DOCKER_IMAGE_NAME:-rocm/vllm-ci:${BUILDKITE_COMMIT}}"
+if [[ -z "${DOCKER_IMAGE_NAME:-}" ]]; then
+  echo "Error: DOCKER_IMAGE_NAME is not set. The pipeline must pass the per-arch" \
+       "image tag (e.g. rocm/vllm-ci:\$COMMIT-gfx90a). Check pool_to_arch mapping" \
+       "in test-template-amd.j2." >&2
+  exit 1
+fi
+image_name="${DOCKER_IMAGE_NAME}"
 container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
 docker pull "${image_name}"