From 8c1f814611deb4bc8d9c61329e0f8217e50a5990 Mon Sep 17 00:00:00 2001 From: Andreas Karatzas Date: Thu, 12 Mar 2026 22:11:42 -0500 Subject: [PATCH 01/23] [ROCm][CI] Add ROCm Docker Hub registry cache and weekly cleanup pipeline Signed-off-by: Andreas Karatzas --- .buildkite/hardware_tests/amd.yaml | 26 ++- .buildkite/scripts/ci-bake.sh | 172 ++++++++++++++++++ .buildkite/test-amd.yaml | 13 +- docker/Dockerfile.rocm | 143 +++++++++++---- docker/docker-bake-rocm.hcl | 81 +++++++++ .../python_only_compile_rocm.sh | 71 ++++++++ 6 files changed, 449 insertions(+), 57 deletions(-) create mode 100644 .buildkite/scripts/ci-bake.sh create mode 100644 docker/docker-bake-rocm.hcl create mode 100644 tests/standalone_tests/python_only_compile_rocm.sh diff --git a/.buildkite/hardware_tests/amd.yaml b/.buildkite/hardware_tests/amd.yaml index 23a23723ad93..6f96db2110e6 100644 --- a/.buildkite/hardware_tests/amd.yaml +++ b/.buildkite/hardware_tests/amd.yaml @@ -1,4 +1,4 @@ -group: Hardware - AMD Build +group: Hardware - AMD Build steps: - label: "AMD: :docker: build image" key: image-build-amd @@ -6,25 +6,21 @@ steps: device: amd_cpu no_plugin: true commands: - - > - docker build - --build-arg max_jobs=16 - --build-arg REMOTE_VLLM=1 - --build-arg ARG_PYTORCH_ROCM_ARCH='gfx90a;gfx942;gfx950' - --build-arg VLLM_BRANCH=$BUILDKITE_COMMIT - --tag "rocm/vllm-ci:${BUILDKITE_COMMIT}" - -f docker/Dockerfile.rocm - --target test - --no-cache - --progress plain . 
- - docker push "rocm/vllm-ci:${BUILDKITE_COMMIT}" + - bash .buildkite/scripts/ci-bake.sh test-rocm-ci env: DOCKER_BUILDKIT: "1" + IMAGE_TAG: "rocm/vllm-ci:${BUILDKITE_COMMIT}" + VLLM_BAKE_FILE: "docker/docker-bake-rocm.hcl" + CI_HCL_URL: "https://raw.githubusercontent.com/vllm-project/ci-infra/main/docker/ci-rocm.hcl" + PYTORCH_ROCM_ARCH: "gfx90a;gfx942;gfx950" + timeout_in_minutes: 600 retry: automatic: - - exit_status: -1 # Agent was lost + - exit_status: -1 # Agent was lost limit: 1 - exit_status: -10 # Agent was lost limit: 1 - - exit_status: 1 # Machine occasionally fail + - exit_status: 128 # Git / network connectivity issues + limit: 1 + - exit_status: 1 # Machine occasionally fails limit: 1 diff --git a/.buildkite/scripts/ci-bake.sh b/.buildkite/scripts/ci-bake.sh new file mode 100644 index 000000000000..410c5c352366 --- /dev/null +++ b/.buildkite/scripts/ci-bake.sh @@ -0,0 +1,172 @@ +#!/bin/bash +# ci-bake.sh - Wrapper script for Docker buildx bake CI builds +# +# Canonical location: vllm repo at .buildkite/scripts/ci-bake.sh +# Kept in sync with ci-infra repo at buildkite/scripts/ci-bake.sh. +# Update both when making changes; the vllm copy is what actually runs in CI +# (pinned to the vllm commit under test). 
+# +# This script handles the common setup for running docker buildx bake: +# - Downloads ci.hcl from ci-infra +# - Detects and uses local buildkitd if available (custom AMI with warm cache) +# - Falls back to docker-container driver on regular instances +# - Runs bake with --print for debugging +# - Runs the actual build +# +# Usage: +# ci-bake.sh [TARGET] +# +# Environment variables (all optional, with sensible defaults): +# CI_HCL_URL - URL to ci.hcl (default: from ci-infra main branch) +# VLLM_CI_BRANCH - ci-infra branch to use (default: main) +# VLLM_BAKE_FILE - Path to vLLM's bake file (default: docker/docker-bake.hcl) +# BUILDER_NAME - Name for buildx builder (default: vllm-builder) +# +# Build configuration (passed through to bake via environment): +# BUILDKITE_COMMIT - Git commit (auto-detected from Buildkite) +# PARENT_COMMIT - Parent commit (HEAD~1) for cache fallback (auto-computed) +# IMAGE_TAG - Primary image tag +# IMAGE_TAG_LATEST - Latest tag (optional) +# CACHE_FROM - Cache source +# CACHE_FROM_BASE - Base branch cache source +# CACHE_FROM_MAIN - Main branch cache source +# CACHE_TO - Cache destination +# VLLM_USE_PRECOMPILED - Use precompiled wheels +# VLLM_MERGE_BASE_COMMIT - Merge base commit for precompiled + +set -euo pipefail + +# Check if image already exists (skip build if it does) +if [[ -n "${IMAGE_TAG:-}" ]]; then + echo "--- :mag: Checking if image exists" + if docker manifest inspect "${IMAGE_TAG}" >/dev/null 2>&1; then + echo "Image already exists: ${IMAGE_TAG}" + echo "Skipping build" + exit 0 + fi + echo "Image not found, proceeding with build" +fi + +# Configuration with defaults +TARGET="${1:-test-ci}" +CI_HCL_URL="${CI_HCL_URL:-https://raw.githubusercontent.com/vllm-project/ci-infra/main/docker/ci.hcl}" +VLLM_BAKE_FILE="${VLLM_BAKE_FILE:-docker/docker-bake.hcl}" +BUILDER_NAME="${BUILDER_NAME:-vllm-builder}" +CI_HCL_PATH="/tmp/ci.hcl" +BUILDKIT_SOCKET="/run/buildkit/buildkitd.sock" + +echo "--- :docker: Setting up Docker buildx 
bake" +echo "Target: ${TARGET}" +echo "CI HCL URL: ${CI_HCL_URL}" +echo "vLLM bake file: ${VLLM_BAKE_FILE}" + + +# Check if vLLM bake file exists +if [[ ! -f "${VLLM_BAKE_FILE}" ]]; then + echo "Error: vLLM bake file not found at ${VLLM_BAKE_FILE}" + echo "Make sure you're running from the vLLM repository root" + exit 1 +fi + +# Download ci.hcl +echo "--- :arrow_down: Downloading ci.hcl" +curl -sSfL -o "${CI_HCL_PATH}" "${CI_HCL_URL}" +echo "Downloaded to ${CI_HCL_PATH}" + +# Set up buildx builder +# Priority: 1) local buildkitd socket (custom AMI) 2) existing builder 3) new docker-container builder +echo "--- :buildkite: Setting up buildx builder" + +if [[ -S "${BUILDKIT_SOCKET}" ]]; then + # Custom AMI with standalone buildkitd - use remote driver for warm cache + echo "✅ Found local buildkitd socket at ${BUILDKIT_SOCKET}" + echo "Using remote driver to connect to buildkitd (warm cache available)" + + # Check if baked-vllm-builder already exists and is using the socket + if docker buildx inspect baked-vllm-builder >/dev/null 2>&1; then + echo "Using existing baked-vllm-builder" + docker buildx use baked-vllm-builder + else + echo "Creating baked-vllm-builder with remote driver" + docker buildx create \ + --name baked-vllm-builder \ + --driver remote \ + --use \ + "unix://${BUILDKIT_SOCKET}" + fi + docker buildx inspect --bootstrap +elif docker buildx inspect "${BUILDER_NAME}" >/dev/null 2>&1; then + # Existing builder available + echo "Using existing builder: ${BUILDER_NAME}" + docker buildx use "${BUILDER_NAME}" + docker buildx inspect --bootstrap +else + # No local buildkitd, no existing builder - create new docker-container builder + echo "No local buildkitd found, using docker-container driver" + docker buildx create --name "${BUILDER_NAME}" --driver docker-container --use + docker buildx inspect --bootstrap +fi + +# Show builder info +echo "Active builder:" +docker buildx ls | grep -E '^\*|^NAME' || docker buildx ls + +# Deepen shallow clones so HEAD~1 and 
merge-base are available. +# Buildkite agents often clone with --depth=1; without deepening, git rev-parse +# HEAD~1 and git merge-base both silently fail, disabling the per-commit cache layers. +if git rev-parse --is-shallow-repository 2>/dev/null | grep -q "true"; then + echo "Shallow clone detected — deepening for cache key computation" + # --deepen=1 extends the current shallow clone by 1 commit along the + # already-fetched branch, making HEAD~1 available. Unlike --depth=2 + # with a refspec, it operates on the currently checked-out branch and + # is safe in detached-HEAD (Buildkite) checkout state. + git fetch --deepen=1 origin 2>/dev/null || true +fi + +# Compute parent commit for cache fallback (if not already set) +if [[ -z "${PARENT_COMMIT:-}" ]]; then + PARENT_COMMIT=$(git rev-parse HEAD~1 2>/dev/null || echo "") + if [[ -n "${PARENT_COMMIT}" ]]; then + echo "Computed parent commit for cache fallback: ${PARENT_COMMIT}" + export PARENT_COMMIT + else + echo "Could not determine parent commit (may be first commit in repo)" + fi +else + echo "Using provided PARENT_COMMIT: ${PARENT_COMMIT}" +fi + +# Compute merge-base with main for an additional cache fallback layer. +# Mirrors the VLLM_MERGE_BASE_COMMIT pattern used in ci.hcl (CUDA). +# Useful for long-lived PRs where parent-commit cache may be missing but the +# merge-base (a real main commit) maps to a warm :rocm-latest snapshot. +if [[ -z "${VLLM_MERGE_BASE_COMMIT:-}" ]]; then + # Fetch just the tip of main so merge-base can be resolved on shallow clones. 
+ git fetch --depth=1 origin main 2>/dev/null || true + VLLM_MERGE_BASE_COMMIT=$(git merge-base HEAD origin/main 2>/dev/null || echo "") + if [[ -n "${VLLM_MERGE_BASE_COMMIT}" ]]; then + echo "Computed merge base commit for cache fallback: ${VLLM_MERGE_BASE_COMMIT}" + export VLLM_MERGE_BASE_COMMIT + else + echo "Could not determine merge base (will skip that cache layer)" + fi +else + echo "Using provided VLLM_MERGE_BASE_COMMIT: ${VLLM_MERGE_BASE_COMMIT}" +fi + +# Print resolved configuration for debugging and upload as a Buildkite artifact +echo "--- :page_facing_up: Resolved bake configuration" +BAKE_CONFIG_FILE="bake-config-build-${BUILDKITE_BUILD_NUMBER:-local}.json" +docker buildx bake -f "${VLLM_BAKE_FILE}" -f "${CI_HCL_PATH}" --print "${TARGET}" | tee "${BAKE_CONFIG_FILE}" || true +if command -v buildkite-agent >/dev/null 2>&1 && [[ -n "${BUILDKITE_BUILD_NUMBER:-}" ]]; then + buildkite-agent artifact upload "${BAKE_CONFIG_FILE}" || true + echo "Uploaded ${BAKE_CONFIG_FILE} as Buildkite artifact" +else + echo "Saved bake config to ${BAKE_CONFIG_FILE} (not in Buildkite, skipping upload)" +fi + +# Run the actual build +echo "--- :docker: Building ${TARGET}" +docker buildx bake -f "${VLLM_BAKE_FILE}" -f "${CI_HCL_PATH}" --progress plain "${TARGET}" + +echo "--- :white_check_mark: Build complete" diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index a4c98f86ee07..ad04b0ed50e3 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -97,10 +97,10 @@ steps: mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] agent_pool: mi250_1 source_file_dependencies: - - tests/standalone_tests/python_only_compile.sh + - tests/standalone_tests/python_only_compile_rocm.sh - setup.py commands: - - bash standalone_tests/python_only_compile.sh + - bash standalone_tests/python_only_compile_rocm.sh - label: Basic Correctness Test # 20min timeout_in_minutes: 30 @@ -1429,12 +1429,11 @@ steps: mirror_hardwares: [amdexperimental] agent_pool: 
mi325_1 optional: true - # grade: Blocking source_file_dependencies: - - tests/standalone_tests/python_only_compile.sh + - tests/standalone_tests/python_only_compile_rocm.sh - setup.py commands: - - bash standalone_tests/python_only_compile.sh + - bash standalone_tests/python_only_compile_rocm.sh - label: Basic Correctness Test # 20min timeout_in_minutes: 30 @@ -3189,10 +3188,10 @@ steps: agent_pool: mi355_1 optional: true source_file_dependencies: - - tests/standalone_tests/python_only_compile.sh + - tests/standalone_tests/python_only_compile_rocm.sh - setup.py commands: - - bash standalone_tests/python_only_compile.sh + - bash standalone_tests/python_only_compile_rocm.sh - label: Basic Correctness Test # 20min timeout_in_minutes: 30 diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm index f8a4274a179f..37f1c487e4c0 100644 --- a/docker/Dockerfile.rocm +++ b/docker/Dockerfile.rocm @@ -19,7 +19,9 @@ ENV PYTORCH_ROCM_ARCH=${ARG_PYTORCH_ROCM_ARCH:-${PYTORCH_ROCM_ARCH}} # Install some basic utilities RUN apt-get update -q -y && apt-get install -q -y \ sqlite3 libsqlite3-dev libfmt-dev libmsgpack-dev libsuitesparse-dev \ - apt-transport-https ca-certificates wget curl + apt-transport-https ca-certificates wget curl \ + ccache mold \ + && update-alternatives --install /usr/bin/ld ld /usr/bin/mold 100 RUN python3 -m pip install --upgrade pip # Remove sccache only if not using sccache (it exists in base image from Dockerfile.rocm_base) ARG USE_SCCACHE @@ -38,6 +40,11 @@ ENV UV_HTTP_TIMEOUT=500 ENV UV_INDEX_STRATEGY="unsafe-best-match" # Use copy mode to avoid hardlink failures with Docker cache mounts ENV UV_LINK_MODE=copy +# ccache directory — persisted across layer rebuilds via --mount=type=cache +ENV CCACHE_DIR=/root/.cache/ccache +# Compilation parallelism — overridable via --build-arg max_jobs=N; falls back to nproc +ARG max_jobs +ENV MAX_JOBS=${max_jobs} # Install sccache if USE_SCCACHE is enabled (for release builds) ARG USE_SCCACHE @@ -94,14 +101,49 @@ 
ONBUILD RUN git clone ${VLLM_REPO} \ && git fetch upstream ; fi FROM fetch_vllm_${REMOTE_VLLM} AS fetch_vllm +# ----------------------- +# HIP kernel compilation stage (csrc-build-rocm) +# +# Intentionally copies ONLY build-critical files (CMakeLists.txt, csrc/, cmake/) +# so that Python-only changes to vllm/*.py do NOT invalidate this expensive layer. +# BuildKit's registry cache (--cache-from ECR) reuses this layer across commits +# whenever only Python code changed — turning a 2-hour HIP recompile into seconds. +# +# Note: only applies when REMOTE_VLLM=0 (default), so the build context +# contains the vllm source. Release builds (REMOTE_VLLM=1) always compile fully. +FROM base AS csrc-build-rocm +ARG COMMON_WORKDIR +WORKDIR ${COMMON_WORKDIR}/vllm +# Copy only files HIP compilation depends on — vllm/**/*.py changes don't bust this +COPY requirements/rocm-build.txt requirements/rocm-build.txt +COPY pyproject.toml setup.py CMakeLists.txt ./ +COPY cmake cmake/ +COPY csrc csrc/ +COPY vllm/envs.py vllm/envs.py +COPY vllm/__init__.py vllm/__init__.py +# Dummy version prevents git-state from busting the cache key on every commit +ENV SETUPTOOLS_SCM_PRETEND_VERSION="0.0.0+csrc.build" +RUN --mount=type=cache,target=/root/.cache/ccache \ + --mount=type=cache,target=/root/.cache/uv \ + uv pip install --system -r requirements/rocm-build.txt \ + && MAX_JOBS="${MAX_JOBS:-$(nproc)}" python3 setup.py bdist_wheel --dist-dir=dist + # ----------------------- # vLLM build stages FROM fetch_vllm AS build_vllm -# Build vLLM (setup.py auto-detects sccache in PATH) -RUN cd vllm \ - && python3 -m pip install -r requirements/rocm.txt \ - && python3 setup.py clean --all \ - && python3 setup.py bdist_wheel --dist-dir=dist +ARG COMMON_WORKDIR +# Re-use the pre-built HIP kernel wheel from csrc-build-rocm. +# When VLLM_PRECOMPILED_WHEEL_LOCATION is set, setup.py extracts the compiled +# .so files from this wheel instead of recompiling HIP kernels. 
+# Python-only changes complete in minutes instead of hours. +COPY --from=csrc-build-rocm ${COMMON_WORKDIR}/vllm/dist /precompiled-wheels +ENV VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX=1 +RUN --mount=type=cache,target=/root/.cache/ccache \ + --mount=type=cache,target=/root/.cache/uv \ + cd vllm \ + && uv pip install --system -r requirements/rocm-build.txt \ + && export VLLM_PRECOMPILED_WHEEL_LOCATION=$(ls /precompiled-wheels/*.whl) \ + && MAX_JOBS="${MAX_JOBS:-$(nproc)}" python3 setup.py bdist_wheel --dist-dir=dist FROM scratch AS export_vllm ARG COMMON_WORKDIR COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/dist/*.whl / @@ -143,12 +185,14 @@ RUN apt-get -y update && apt-get -y install autoconf libtool pkg-config \ RUN uv pip install --system meson auditwheel patchelf tomlkit -RUN cd /usr/local/src && \ +RUN --mount=type=cache,target=/root/.cache/ccache \ + cd /usr/local/src && \ git clone ${UCX_REPO} && \ cd ucx && \ git checkout ${UCX_BRANCH} && \ ./autogen.sh && \ mkdir build && cd build && \ + CC="ccache gcc" CXX="ccache g++" \ ../configure \ --prefix=/usr/local/ucx \ --enable-shared \ @@ -160,20 +204,22 @@ RUN cd /usr/local/src && \ --with-verbs \ --with-dm \ --enable-mt && \ - make -j && \ + make -j$(nproc) && \ make install ENV PATH=/usr/local/ucx/bin:$PATH ENV LD_LIBRARY_PATH=${UCX_HOME}/lib:${LD_LIBRARY_PATH} -RUN git clone ${RIXL_REPO} /opt/rixl && \ +RUN --mount=type=cache,target=/root/.cache/ccache \ + git clone ${RIXL_REPO} /opt/rixl && \ cd /opt/rixl && \ git checkout ${RIXL_BRANCH} && \ + CC="ccache gcc" CXX="ccache g++" \ meson setup build --prefix=${RIXL_HOME} \ -Ducx_path=${UCX_HOME} \ -Drocm_path=${ROCM_PATH} && \ cd build && \ - ninja && \ + ninja -j$(nproc) && \ ninja install # Generate RIXL wheel @@ -184,34 +230,43 @@ RUN cd /opt/rixl && mkdir -p /app/install && \ --ucx-plugins-dir ${UCX_HOME}/lib/ucx \ --nixl-plugins-dir ${RIXL_HOME}/lib/x86_64-linux-gnu/plugins -# DeepEP build stage -FROM base AS build_deep +# ----------------------- +# ROCShmem 
build stage — split from DeepEP so changing DEEPEP_BRANCH +# does not invalidate the slow cmake+make ROCSHMEM build. +FROM base AS build_rocshmem ARG ROCSHMEM_BRANCH="ba0bf0f3" ARG ROCSHMEM_REPO="https://github.com/ROCm/rocm-systems.git" -ARG DEEPEP_BRANCH="e84464ec" -ARG DEEPEP_REPO="https://github.com/ROCm/DeepEP.git" -ARG DEEPEP_NIC="cx7" ENV ROCSHMEM_DIR=/opt/rocshmem -RUN git clone ${ROCSHMEM_REPO} \ +RUN --mount=type=cache,target=/root/.cache/ccache \ + git clone ${ROCSHMEM_REPO} \ && cd rocm-systems \ && git checkout ${ROCSHMEM_BRANCH} \ && mkdir -p projects/rocshmem/build \ && cd projects/rocshmem/build \ - && cmake .. \ + && CC="ccache gcc" CXX="ccache g++" cmake .. \ -DCMAKE_INSTALL_PREFIX="${ROCSHMEM_DIR}" \ -DROCM_PATH=/opt/rocm \ -DCMAKE_POSITION_INDEPENDENT_CODE=ON \ -DUSE_EXTERNAL_MPI=OFF \ - && make -j \ + && make -j$(nproc) \ && make install -# Build DeepEP wheel. -# DeepEP looks for rocshmem at ROCSHMEM_DIR. -RUN git clone ${DEEPEP_REPO} \ +# ----------------------- +# DeepEP build stage — depends on ROCShmem, builds the HIP kernel wheel. +# Kept separate so the ROCShmem layer above is reused when only DEEPEP_BRANCH changes. +FROM build_rocshmem AS build_deepep +ARG DEEPEP_BRANCH="e84464ec" +ARG DEEPEP_REPO="https://github.com/ROCm/DeepEP.git" +ARG DEEPEP_NIC="cx7" + +# Build DeepEP wheel. DeepEP looks for rocshmem at ROCSHMEM_DIR (inherited from build_rocshmem). 
+RUN --mount=type=cache,target=/root/.cache/ccache \ + git clone ${DEEPEP_REPO} \ && cd DeepEP \ && git checkout ${DEEPEP_BRANCH} \ - && python3 setup.py --variant rocm --nic ${DEEPEP_NIC} bdist_wheel --dist-dir=/app/deep_install + && MAX_JOBS="${MAX_JOBS:-$(nproc)}" python3 setup.py --variant rocm --nic ${DEEPEP_NIC} bdist_wheel --dist-dir=/app/deep_install + # ----------------------- # vLLM wheel release build stage (for building distributable wheels) @@ -252,8 +307,9 @@ RUN if [ "$GIT_REPO_CHECK" != "0" ]; then \ # Extract version from git BEFORE any modifications (pin_rocm_dependencies.py modifies requirements/rocm.txt) # This ensures setuptools_scm sees clean repo state for version detection RUN --mount=type=bind,source=.git,target=vllm/.git \ + --mount=type=cache,target=/root/.cache/uv \ cd vllm \ - && pip install setuptools_scm regex \ + && uv pip install --system setuptools_scm regex \ && VLLM_VERSION=$(python3 -c "import setuptools_scm; print(setuptools_scm.get_version())") \ && echo "Detected vLLM version: ${VLLM_VERSION}" \ && echo "${VLLM_VERSION}" > /tmp/vllm_version.txt @@ -289,18 +345,19 @@ RUN echo "Pinning vLLM dependencies to custom wheel versions..." 
\ && python3 /tmp/pin_rocm_dependencies.py /install ${COMMON_WORKDIR}/vllm/requirements/rocm.txt # Install dependencies using custom wheels from /install -RUN cd vllm \ +RUN --mount=type=cache,target=/root/.cache/uv \ + cd vllm \ && echo "Building vLLM with custom wheels from /install" \ - && python3 -m pip install --find-links /install -r requirements/rocm.txt \ - && python3 setup.py clean --all + && uv pip install --system --find-links /install -r requirements/rocm.txt # Build wheel using pre-extracted version to avoid dirty state from modified requirements/rocm.txt -# (setup.py auto-detects sccache in PATH) +# (setup.py auto-detects ccache/sccache in PATH) RUN --mount=type=bind,source=.git,target=vllm/.git \ + --mount=type=cache,target=/root/.cache/ccache \ cd vllm \ && export SETUPTOOLS_SCM_PRETEND_VERSION=$(cat /tmp/vllm_version.txt) \ && echo "Building wheel with version: ${SETUPTOOLS_SCM_PRETEND_VERSION}" \ - && python3 setup.py bdist_wheel --dist-dir=dist + && MAX_JOBS="${MAX_JOBS:-$(nproc)}" python3 setup.py bdist_wheel --dist-dir=dist FROM scratch AS export_vllm_wheel_release ARG COMMON_WORKDIR @@ -321,22 +378,33 @@ RUN python3 -m pip install --upgrade pip && rm -rf /var/lib/apt/lists/* # Install vLLM using uv (inherited from base stage) # Note: No -U flag to avoid upgrading PyTorch ROCm to CUDA version +# Note: rocm-test.txt contains a git+ URL (fastsafetensors) that uv cannot resolve; +# we install non-git requirements with uv and git+ requirements with pip separately. 
RUN --mount=type=bind,from=export_vllm,src=/,target=/install \ --mount=type=cache,target=/root/.cache/uv \ + --mount=type=cache,target=/root/.cache/pip \ cd /install \ && uv pip install --system -r requirements/rocm.txt \ - && uv pip install --system -r requirements/rocm-test.txt \ + && grep -v 'git+' requirements/rocm-test.txt | uv pip install --system -r /dev/stdin \ + && grep 'git+' requirements/rocm-test.txt > /tmp/git-reqs.txt \ + && pip install --no-deps -r /tmp/git-reqs.txt \ + && rm /tmp/git-reqs.txt \ && pip uninstall -y vllm \ && uv pip install --system *.whl +# Store the vLLM wheel in the image for python_only_compile_rocm.sh. +# The wheel is only available via bind mount during the RUN above; we need it +# accessible at test runtime so the python-only compile test can reinstall +# vLLM without a compiler (no wheels.vllm.ai equivalent exists for ROCm). +COPY --from=export_vllm /*.whl /opt/vllm-wheels/ + # Install RIXL wheel RUN --mount=type=bind,from=build_rixl,src=/app/install,target=/rixl_install \ uv pip install --system /rixl_install/*.whl # Install DeepEP wheel -RUN --mount=type=bind,from=build_deep,src=/app/deep_install,target=/deep_install \ +RUN --mount=type=bind,from=build_deepep,src=/app/deep_install,target=/deep_install \ uv pip install --system /deep_install/*.whl -COPY --from=build_deep /opt/rocshmem /opt/rocshmem # RIXL/MoRIIO runtime dependencies (RDMA userspace libraries) RUN apt-get update -q -y && apt-get install -q -y \ @@ -351,9 +419,9 @@ ARG COMMON_WORKDIR COPY --from=build_vllm ${COMMON_WORKDIR}/vllm /vllm-workspace # install development dependencies (for testing) -RUN cd /vllm-workspace \ - && python3 -m pip install -e tests/vllm_test_utils \ - && python3 -m pip install pytest-shard +RUN --mount=type=cache,target=/root/.cache/uv \ + cd /vllm-workspace \ + && uv pip install --system -e tests/vllm_test_utils pytest-shard # enable fast downloads from hf (for testing) RUN --mount=type=cache,target=/root/.cache/uv \ @@ -363,7 +431,8 @@ 
ENV HF_HUB_ENABLE_HF_TRANSFER=1 # install audio decode package `torchcodec` from source (required due to # ROCm and torch version mismatch) for tests with datasets package COPY tools/install_torchcodec_rocm.sh /tmp/install_torchcodec.sh -RUN bash /tmp/install_torchcodec.sh \ +RUN --mount=type=cache,target=/root/.cache/pip \ + bash /tmp/install_torchcodec.sh \ && rm /tmp/install_torchcodec.sh \ && apt-get clean \ && rm -rf /var/lib/apt/lists/* @@ -418,6 +487,10 @@ RUN --mount=type=bind,from=export_vllm,src=/,target=/install \ && pip uninstall -y vllm \ && uv pip install --system *.whl +# Install DeepEP wheel +RUN --mount=type=bind,from=build_deepep,src=/app/deep_install,target=/deep_install \ + uv pip install --system /deep_install/*.whl + ARG COMMON_WORKDIR ARG BASE_IMAGE diff --git a/docker/docker-bake-rocm.hcl b/docker/docker-bake-rocm.hcl new file mode 100644 index 000000000000..2e3525d6ee2c --- /dev/null +++ b/docker/docker-bake-rocm.hcl @@ -0,0 +1,81 @@ +# docker-bake-rocm.hcl - vLLM ROCm Docker build configuration +# +# This file lives in the vLLM repo at docker/docker-bake-rocm.hcl +# Equivalent of docker-bake.hcl for ROCm builds. +# +# Usage: +# docker buildx bake -f docker/docker-bake-rocm.hcl # Build test (default) +# docker buildx bake -f docker/docker-bake-rocm.hcl final-rocm # Build final image +# docker buildx bake -f docker/docker-bake-rocm.hcl --print # Show resolved config +# +# CI usage (with ci-rocm.hcl overlay from ci-infra): +# docker buildx bake -f docker/docker-bake-rocm.hcl -f /tmp/ci-rocm.hcl test-rocm-ci + +variable "MAX_JOBS" { + # Empty string lets the Dockerfile fall back to $(nproc) via + # MAX_JOBS="${MAX_JOBS:-$(nproc)}" in each RUN step, which uses all + # available cores on whatever machine the build runs on. + # Override with --set '*.args.max_jobs=8' for local builds on small machines. 
+ default = "" +} + +variable "PYTORCH_ROCM_ARCH" { + default = "gfx90a;gfx942;gfx950" +} + +variable "COMMIT" { + default = "" +} + +# REMOTE_VLLM=0: use local source via Docker build context (ONBUILD COPY ./ vllm/) +# REMOTE_VLLM=1: clone from GitHub at VLLM_BRANCH (standalone builds without local source) +variable "REMOTE_VLLM" { + default = "0" +} + +variable "VLLM_BRANCH" { + default = "main" +} + +group "default" { + targets = ["test-rocm"] +} + +target "_common-rocm" { + dockerfile = "docker/Dockerfile.rocm" + context = "." + args = { + max_jobs = MAX_JOBS + ARG_PYTORCH_ROCM_ARCH = PYTORCH_ROCM_ARCH + REMOTE_VLLM = REMOTE_VLLM + VLLM_BRANCH = VLLM_BRANCH + } +} + +target "_labels" { + labels = { + "org.opencontainers.image.source" = "https://github.com/vllm-project/vllm" + "org.opencontainers.image.vendor" = "vLLM" + "org.opencontainers.image.title" = "vLLM ROCm" + "org.opencontainers.image.description" = "vLLM: A high-throughput and memory-efficient inference and serving engine for LLMs (ROCm)" + "org.opencontainers.image.licenses" = "Apache-2.0" + "org.opencontainers.image.revision" = COMMIT + } + annotations = [ + "index,manifest:org.opencontainers.image.revision=${COMMIT}", + ] +} + +target "test-rocm" { + inherits = ["_common-rocm", "_labels"] + target = "test" + tags = ["rocm/vllm:test"] + output = ["type=docker"] +} + +target "final-rocm" { + inherits = ["_common-rocm", "_labels"] + target = "final" + tags = ["rocm/vllm:latest"] + output = ["type=docker"] +} diff --git a/tests/standalone_tests/python_only_compile_rocm.sh b/tests/standalone_tests/python_only_compile_rocm.sh new file mode 100644 index 000000000000..0760eb413872 --- /dev/null +++ b/tests/standalone_tests/python_only_compile_rocm.sh @@ -0,0 +1,71 @@ +#!/bin/bash +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +# ROCm equivalent of python_only_compile.sh. 
+# +# Goal: verify that a user without any C/C++ compiler can install and import +# vLLM from a pre-built ROCm wheel (i.e., all HIP kernel .so files are already +# compiled into the wheel — no recompilation is triggered at install time). +# +# This differs from the CUDA version in one key way: there is no +# wheels.vllm.ai equivalent for ROCm, so we reinstall from the wheel that was +# baked into the test image at /opt/vllm-wheels/ during the Docker build +# (COPY --from=export_vllm /*.whl /opt/vllm-wheels/ in Dockerfile.rocm). + +set -e + +WHEEL_DIR="/opt/vllm-wheels" + +echo "=== ROCm Python-only Installation Test ===" +echo "Verifies vLLM is installable and importable without a C++ compiler." +echo "" + +# Confirm the wheel is present in the image +if ! ls "${WHEEL_DIR}"/*.whl &>/dev/null; then + echo "ERROR: No wheel found at ${WHEEL_DIR}/*.whl" + echo "The Dockerfile.rocm test stage must have COPY --from=export_vllm /*.whl /opt/vllm-wheels/" + exit 1 +fi + +WHEEL_PATH=$(ls "${WHEEL_DIR}"/*.whl | head -1) +echo "Found wheel: ${WHEEL_PATH}" + +cd /vllm-workspace/ + +# Restore the vllm source tree so __init__.py can be patched +# (same pattern as the CUDA python_only_compile.sh) +pip3 uninstall -y vllm +mv src/vllm ./vllm + +# Sentinel: append a side-effect to __init__.py so we can verify the installed +# code actually ran (not a cached .pyc from the previous install) +echo 'import os; os.system("touch /tmp/rocm_python_only.file")' >> vllm/__init__.py + +echo "" +echo "=== Removing C/C++ compilers ===" +apt-get remove --purge build-essential -y +apt-get autoremove -y +echo "Compilers removed. Verifying cc/g++ are gone:" +! command -v cc && echo " cc: not found (expected)" +! 
command -v g++ && echo " g++: not found (expected)" + +echo "" +echo "=== Installing vLLM from pre-built wheel (no compiler) ===" +echo "Wheel: ${WHEEL_PATH}" +# --no-build-isolation + --no-deps: install exactly the wheel, no setup.py +# compilation triggered; HIP .so files are already inside the wheel. +pip3 install --no-build-isolation --no-deps "${WHEEL_PATH}" + +echo "" +echo "=== Importing vLLM ===" +python3 -c 'import vllm; print(f"vLLM {vllm.__version__} imported successfully")' + +# Verify our sentinel side-effect fired (confirms the patched __init__.py ran) +if [ ! -f /tmp/rocm_python_only.file ]; then + echo "ERROR: sentinel file not created — python-only installation failed" + exit 1 +fi + +echo "" +echo "=== ROCm Python-only Installation Test PASSED ===" From 4716340ba3fac328a32bcfd058045544690acd24 Mon Sep 17 00:00:00 2001 From: Andreas Karatzas Date: Thu, 12 Mar 2026 22:45:02 -0500 Subject: [PATCH 02/23] [ROCm][CI] Add ROCm Docker Hub registry cache and weekly cleanup pipeline Signed-off-by: Andreas Karatzas --- .buildkite/scripts/ci-bake.sh | 12 ++++++------ docker/Dockerfile.rocm | 6 ++++++ 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/.buildkite/scripts/ci-bake.sh b/.buildkite/scripts/ci-bake.sh index 410c5c352366..f40c4a2a37dd 100644 --- a/.buildkite/scripts/ci-bake.sh +++ b/.buildkite/scripts/ci-bake.sh @@ -82,14 +82,14 @@ if [[ -S "${BUILDKIT_SOCKET}" ]]; then echo "✅ Found local buildkitd socket at ${BUILDKIT_SOCKET}" echo "Using remote driver to connect to buildkitd (warm cache available)" - # Check if baked-vllm-builder already exists and is using the socket - if docker buildx inspect baked-vllm-builder >/dev/null 2>&1; then - echo "Using existing baked-vllm-builder" - docker buildx use baked-vllm-builder + # Check if a builder named ${BUILDER_NAME} already exists (only existence is checked, not its driver or endpoint) + if docker buildx inspect "${BUILDER_NAME}" >/dev/null 2>&1; then + echo "Using existing builder: ${BUILDER_NAME}" + docker buildx use "${BUILDER_NAME}" 
else - echo "Creating baked-vllm-builder with remote driver" + echo "Creating builder '${BUILDER_NAME}' with remote driver" docker buildx create \ - --name baked-vllm-builder \ + --name "${BUILDER_NAME}" \ --driver remote \ --use \ "unix://${BUILDKIT_SOCKET}" diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm index 37f1c487e4c0..528c4c5d85d4 100644 --- a/docker/Dockerfile.rocm +++ b/docker/Dockerfile.rocm @@ -406,6 +406,9 @@ RUN --mount=type=bind,from=build_rixl,src=/app/install,target=/rixl_install \ RUN --mount=type=bind,from=build_deepep,src=/app/deep_install,target=/deep_install \ uv pip install --system /deep_install/*.whl +# Copy rocshmem runtime libraries +COPY --from=build_rocshmem /opt/rocshmem /opt/rocshmem + # RIXL/MoRIIO runtime dependencies (RDMA userspace libraries) RUN apt-get update -q -y && apt-get install -q -y \ librdmacm1 \ @@ -491,6 +494,9 @@ RUN --mount=type=bind,from=export_vllm,src=/,target=/install \ RUN --mount=type=bind,from=build_deepep,src=/app/deep_install,target=/deep_install \ uv pip install --system /deep_install/*.whl +# Copy rocshmem runtime libraries +COPY --from=build_rocshmem /opt/rocshmem /opt/rocshmem + ARG COMMON_WORKDIR ARG BASE_IMAGE From f7086c27bf15ace333b204261685085af7a00ddc Mon Sep 17 00:00:00 2001 From: Andreas Karatzas Date: Fri, 13 Mar 2026 14:53:12 -0500 Subject: [PATCH 03/23] [ROCm][CI] Add ROCm Docker Hub registry cache and weekly cleanup pipeline Signed-off-by: Andreas Karatzas --- docker/Dockerfile.rocm | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm index 528c4c5d85d4..299ea05118b0 100644 --- a/docker/Dockerfile.rocm +++ b/docker/Dockerfile.rocm @@ -16,12 +16,14 @@ FROM ${BASE_IMAGE} AS base ARG ARG_PYTORCH_ROCM_ARCH ENV PYTORCH_ROCM_ARCH=${ARG_PYTORCH_ROCM_ARCH:-${PYTORCH_ROCM_ARCH}} -# Install some basic utilities +# Install build dependencies and utilities RUN apt-get update -q -y && apt-get install -q -y \ sqlite3 
libsqlite3-dev libfmt-dev libmsgpack-dev libsuitesparse-dev \ apt-transport-https ca-certificates wget curl \ - ccache mold \ - && update-alternatives --install /usr/bin/ld ld /usr/bin/mold 100 + ccache mold +# Use mold as the default linker — significantly faster than GNU ld/gold for +# the large C++ link steps in ROCm extension builds (e.g. vLLM, DeepEP, FA). +RUN update-alternatives --install /usr/bin/ld ld /usr/bin/mold 100 RUN python3 -m pip install --upgrade pip # Remove sccache only if not using sccache (it exists in base image from Dockerfile.rocm_base) ARG USE_SCCACHE @@ -115,7 +117,8 @@ FROM base AS csrc-build-rocm ARG COMMON_WORKDIR WORKDIR ${COMMON_WORKDIR}/vllm # Copy only files HIP compilation depends on — vllm/**/*.py changes don't bust this -COPY requirements/rocm-build.txt requirements/rocm-build.txt +COPY requirements/common.txt requirements/common.txt +COPY requirements/rocm.txt requirements/rocm.txt COPY pyproject.toml setup.py CMakeLists.txt ./ COPY cmake cmake/ COPY csrc csrc/ @@ -125,7 +128,7 @@ COPY vllm/__init__.py vllm/__init__.py ENV SETUPTOOLS_SCM_PRETEND_VERSION="0.0.0+csrc.build" RUN --mount=type=cache,target=/root/.cache/ccache \ --mount=type=cache,target=/root/.cache/uv \ - uv pip install --system -r requirements/rocm-build.txt \ + uv pip install --system -r requirements/rocm.txt \ && MAX_JOBS="${MAX_JOBS:-$(nproc)}" python3 setup.py bdist_wheel --dist-dir=dist # ----------------------- @@ -141,7 +144,7 @@ ENV VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX=1 RUN --mount=type=cache,target=/root/.cache/ccache \ --mount=type=cache,target=/root/.cache/uv \ cd vllm \ - && uv pip install --system -r requirements/rocm-build.txt \ + && uv pip install --system -r requirements/rocm.txt \ && export VLLM_PRECOMPILED_WHEEL_LOCATION=$(ls /precompiled-wheels/*.whl) \ && MAX_JOBS="${MAX_JOBS:-$(nproc)}" python3 setup.py bdist_wheel --dist-dir=dist FROM scratch AS export_vllm From 8e657f02a93435053edb424060c3d55a128b382f Mon Sep 17 00:00:00 2001 From: Andreas 
Karatzas Date: Fri, 13 Mar 2026 16:38:51 -0500 Subject: [PATCH 04/23] [ROCm][CI] Add ROCm Docker Hub registry cache and weekly cleanup pipeline Signed-off-by: Andreas Karatzas --- .buildkite/hardware_tests/amd.yaml | 77 ++++++++++++++++++++++++++++++ CMakeLists.txt | 6 ++- docker/Dockerfile.rocm | 38 +-------------- docker/docker-bake-rocm.hcl | 46 ++++++++++++++++++ 4 files changed, 129 insertions(+), 38 deletions(-) diff --git a/.buildkite/hardware_tests/amd.yaml b/.buildkite/hardware_tests/amd.yaml index 6f96db2110e6..919ef3e3edb2 100644 --- a/.buildkite/hardware_tests/amd.yaml +++ b/.buildkite/hardware_tests/amd.yaml @@ -1,5 +1,6 @@ group: Hardware - AMD Build steps: + # Image with all architectures - label: "AMD: :docker: build image" key: image-build-amd depends_on: [] @@ -24,3 +25,79 @@ steps: limit: 1 - exit_status: 1 # Machine occasionally fails limit: 1 + + # Per-architecture images + - label: "AMD: :docker: build image (gfx90a)" + key: image-build-amd-gfx90a + depends_on: [] + device: amd_cpu + no_plugin: true + commands: + - bash .buildkite/scripts/ci-bake.sh test-rocm-gfx90a-ci + env: + DOCKER_BUILDKIT: "1" + IMAGE_TAG: "rocm/vllm-ci:${BUILDKITE_COMMIT}-gfx90a" + VLLM_BAKE_FILE: "docker/docker-bake-rocm.hcl" + CI_HCL_URL: "https://raw.githubusercontent.com/vllm-project/ci-infra/main/docker/ci-rocm.hcl" + PYTORCH_ROCM_ARCH: "gfx90a" + timeout_in_minutes: 600 + retry: + automatic: + - exit_status: -1 + limit: 1 + - exit_status: -10 + limit: 1 + - exit_status: 128 + limit: 1 + - exit_status: 1 + limit: 1 + + - label: "AMD: :docker: build image (gfx942)" + key: image-build-amd-gfx942 + depends_on: [] + device: amd_cpu + no_plugin: true + commands: + - bash .buildkite/scripts/ci-bake.sh test-rocm-gfx942-ci + env: + DOCKER_BUILDKIT: "1" + IMAGE_TAG: "rocm/vllm-ci:${BUILDKITE_COMMIT}-gfx942" + VLLM_BAKE_FILE: "docker/docker-bake-rocm.hcl" + CI_HCL_URL: "https://raw.githubusercontent.com/vllm-project/ci-infra/main/docker/ci-rocm.hcl" + PYTORCH_ROCM_ARCH: 
"gfx942" + timeout_in_minutes: 600 + retry: + automatic: + - exit_status: -1 + limit: 1 + - exit_status: -10 + limit: 1 + - exit_status: 128 + limit: 1 + - exit_status: 1 + limit: 1 + + - label: "AMD: :docker: build image (gfx950)" + key: image-build-amd-gfx950 + depends_on: [] + device: amd_cpu + no_plugin: true + commands: + - bash .buildkite/scripts/ci-bake.sh test-rocm-gfx950-ci + env: + DOCKER_BUILDKIT: "1" + IMAGE_TAG: "rocm/vllm-ci:${BUILDKITE_COMMIT}-gfx950" + VLLM_BAKE_FILE: "docker/docker-bake-rocm.hcl" + CI_HCL_URL: "https://raw.githubusercontent.com/vllm-project/ci-infra/main/docker/ci-rocm.hcl" + PYTORCH_ROCM_ARCH: "gfx950" + timeout_in_minutes: 600 + retry: + automatic: + - exit_status: -1 + limit: 1 + - exit_status: -10 + limit: 1 + - exit_status: 128 + limit: 1 + - exit_status: 1 + limit: 1 diff --git a/CMakeLists.txt b/CMakeLists.txt index bbadfdc5e9e3..35e9716c1a3f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1184,8 +1184,10 @@ if(VLLM_GPU_LANG STREQUAL "HIP") WITH_SOABI) endif() -# For CUDA and HIP builds also build the triton_kernels external package. -if(VLLM_GPU_LANG STREQUAL "CUDA" OR VLLM_GPU_LANG STREQUAL "HIP") +# Fetch and vendor triton_kernels (Python-only, no compilation). +# Skipped for HIP/ROCm - the git clone of the full triton repo is expensive +# and triton_kernels is optional at runtime (graceful fallback in import_utils). 
+if(VLLM_GPU_LANG STREQUAL "CUDA") include(cmake/external_projects/triton_kernels.cmake) endif() diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm index 299ea05118b0..b63ea9c33fd2 100644 --- a/docker/Dockerfile.rocm +++ b/docker/Dockerfile.rocm @@ -103,49 +103,15 @@ ONBUILD RUN git clone ${VLLM_REPO} \ && git fetch upstream ; fi FROM fetch_vllm_${REMOTE_VLLM} AS fetch_vllm -# ----------------------- -# HIP kernel compilation stage (csrc-build-rocm) -# -# Intentionally copies ONLY build-critical files (CMakeLists.txt, csrc/, cmake/) -# so that Python-only changes to vllm/*.py do NOT invalidate this expensive layer. -# BuildKit's registry cache (--cache-from ECR) reuses this layer across commits -# whenever only Python code changed — turning a 2-hour HIP recompile into seconds. -# -# Note: only applies when REMOTE_VLLM=0 (default), so the build context -# contains the vllm source. Release builds (REMOTE_VLLM=1) always compile fully. -FROM base AS csrc-build-rocm -ARG COMMON_WORKDIR -WORKDIR ${COMMON_WORKDIR}/vllm -# Copy only files HIP compilation depends on — vllm/**/*.py changes don't bust this -COPY requirements/common.txt requirements/common.txt -COPY requirements/rocm.txt requirements/rocm.txt -COPY pyproject.toml setup.py CMakeLists.txt ./ -COPY cmake cmake/ -COPY csrc csrc/ -COPY vllm/envs.py vllm/envs.py -COPY vllm/__init__.py vllm/__init__.py -# Dummy version prevents git-state from busting the cache key on every commit -ENV SETUPTOOLS_SCM_PRETEND_VERSION="0.0.0+csrc.build" -RUN --mount=type=cache,target=/root/.cache/ccache \ - --mount=type=cache,target=/root/.cache/uv \ - uv pip install --system -r requirements/rocm.txt \ - && MAX_JOBS="${MAX_JOBS:-$(nproc)}" python3 setup.py bdist_wheel --dist-dir=dist - # ----------------------- # vLLM build stages FROM fetch_vllm AS build_vllm -ARG COMMON_WORKDIR -# Re-use the pre-built HIP kernel wheel from csrc-build-rocm. 
-# When VLLM_PRECOMPILED_WHEEL_LOCATION is set, setup.py extracts the compiled -# .so files from this wheel instead of recompiling HIP kernels. -# Python-only changes complete in minutes instead of hours. -COPY --from=csrc-build-rocm ${COMMON_WORKDIR}/vllm/dist /precompiled-wheels -ENV VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX=1 +# Build vLLM wheel (setup.py auto-detects ccache/sccache in PATH) RUN --mount=type=cache,target=/root/.cache/ccache \ --mount=type=cache,target=/root/.cache/uv \ cd vllm \ && uv pip install --system -r requirements/rocm.txt \ - && export VLLM_PRECOMPILED_WHEEL_LOCATION=$(ls /precompiled-wheels/*.whl) \ + && python3 setup.py clean --all \ && MAX_JOBS="${MAX_JOBS:-$(nproc)}" python3 setup.py bdist_wheel --dist-dir=dist FROM scratch AS export_vllm ARG COMMON_WORKDIR diff --git a/docker/docker-bake-rocm.hcl b/docker/docker-bake-rocm.hcl index 2e3525d6ee2c..bf473149a5d0 100644 --- a/docker/docker-bake-rocm.hcl +++ b/docker/docker-bake-rocm.hcl @@ -73,6 +73,52 @@ target "test-rocm" { output = ["type=docker"] } +# Per-architecture test targets — build in parallel on separate agents to avoid +# compiling expensive HIP kernels (e.g. rocm/attention.hip) for all 3 archs +# sequentially. Each image only links for one architecture. 
+# Usage: docker buildx bake -f docker/docker-bake-rocm.hcl test-rocm-all +target "test-rocm-gfx90a" { + inherits = ["_common-rocm", "_labels"] + target = "test" + args = { ARG_PYTORCH_ROCM_ARCH = "gfx90a" } + tags = ["rocm/vllm:test-gfx90a"] + output = ["type=docker"] +} + +target "test-rocm-gfx942" { + inherits = ["_common-rocm", "_labels"] + target = "test" + args = { ARG_PYTORCH_ROCM_ARCH = "gfx942" } + tags = ["rocm/vllm:test-gfx942"] + output = ["type=docker"] +} + +target "test-rocm-gfx950" { + inherits = ["_common-rocm", "_labels"] + target = "test" + args = { ARG_PYTORCH_ROCM_ARCH = "gfx950" } + tags = ["rocm/vllm:test-gfx950"] + output = ["type=docker"] +} + +group "test-rocm-all" { + targets = ["test-rocm-gfx90a", "test-rocm-gfx942", "test-rocm-gfx950"] +} + +# Per-architecture CI targets — the ci-rocm.hcl overlay in ci-infra extends +# these with cache-from/cache-to and registry push configuration. +target "test-rocm-gfx90a-ci" { + inherits = ["test-rocm-gfx90a"] +} + +target "test-rocm-gfx942-ci" { + inherits = ["test-rocm-gfx942"] +} + +target "test-rocm-gfx950-ci" { + inherits = ["test-rocm-gfx950"] +} + target "final-rocm" { inherits = ["_common-rocm", "_labels"] target = "final" From 20d0a5fba5299323da1ef78010e3fef1b6ca6be8 Mon Sep 17 00:00:00 2001 From: Andreas Karatzas Date: Sat, 14 Mar 2026 15:08:26 -0500 Subject: [PATCH 05/23] [ROCm][CI] Add ROCm Docker Hub registry cache and weekly cleanup pipeline Signed-off-by: Andreas Karatzas --- docker/docker-bake-rocm.hcl | 6 ++---- setup.py | 7 +++++-- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/docker/docker-bake-rocm.hcl b/docker/docker-bake-rocm.hcl index bf473149a5d0..e227c57b8532 100644 --- a/docker/docker-bake-rocm.hcl +++ b/docker/docker-bake-rocm.hcl @@ -12,11 +12,9 @@ # docker buildx bake -f docker/docker-bake-rocm.hcl -f /tmp/ci-rocm.hcl test-rocm-ci variable "MAX_JOBS" { - # Empty string lets the Dockerfile fall back to $(nproc) via - # MAX_JOBS="${MAX_JOBS:-$(nproc)}" in 
each RUN step, which uses all - # available cores on whatever machine the build runs on. + # Cap parallelism to avoid OOM during linking on large machines. # Override with --set '*.args.max_jobs=8' for local builds on small machines. - default = "" + default = "64" } variable "PYTORCH_ROCM_ARCH" { diff --git a/setup.py b/setup.py index fa13fff4e62e..051f3bd07baf 100644 --- a/setup.py +++ b/setup.py @@ -298,7 +298,7 @@ def run(self): os.makedirs(os.path.dirname(dst_file), exist_ok=True) self.copy_file(file, dst_file) - if _is_cuda() or _is_hip(): + if _is_cuda(): # copy vllm/third_party/triton_kernels/**/*.py from self.build_lib # to current directory so that they can be included in the editable # build @@ -887,7 +887,10 @@ def _read_requirements(filename: str) -> list[str]: ext_modules.append(CMakeExtension(name="vllm.cumem_allocator")) # Optional since this doesn't get built (produce an .so file). This is just # copying the relevant .py files from the source repository. - ext_modules.append(CMakeExtension(name="vllm.triton_kernels", optional=True)) + # Skipped for ROCm — CMake already gates this on CUDA and the git clone + # of the full triton repo is expensive. 
+ if not _is_hip(): + ext_modules.append(CMakeExtension(name="vllm.triton_kernels", optional=True)) if _is_hip(): ext_modules.append(CMakeExtension(name="vllm._rocm_C")) From f76a302c4f9c9578995e729480cf90a0e98ab30c Mon Sep 17 00:00:00 2001 From: Andreas Karatzas Date: Sat, 14 Mar 2026 15:49:04 -0500 Subject: [PATCH 06/23] [ROCm][CI] Add ROCm Docker Hub registry cache and weekly cleanup pipeline Signed-off-by: Andreas Karatzas --- docker/Dockerfile.rocm | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm index b63ea9c33fd2..7e438d420a74 100644 --- a/docker/Dockerfile.rocm +++ b/docker/Dockerfile.rocm @@ -354,7 +354,9 @@ RUN --mount=type=bind,from=export_vllm,src=/,target=/install \ --mount=type=cache,target=/root/.cache/pip \ cd /install \ && uv pip install --system -r requirements/rocm.txt \ - && grep -v 'git+' requirements/rocm-test.txt | uv pip install --system -r /dev/stdin \ + && grep -v 'git+' requirements/rocm-test.txt > requirements/_rocm-test-nogit.txt \ + && uv pip install --system -r requirements/_rocm-test-nogit.txt \ + && rm requirements/_rocm-test-nogit.txt \ && grep 'git+' requirements/rocm-test.txt > /tmp/git-reqs.txt \ && pip install --no-deps -r /tmp/git-reqs.txt \ && rm /tmp/git-reqs.txt \ From e40694eafe48de3385381d15eb995a98ec26935e Mon Sep 17 00:00:00 2001 From: Andreas Karatzas Date: Sat, 14 Mar 2026 15:56:01 -0500 Subject: [PATCH 07/23] [ROCm][CI] Add ROCm Docker Hub registry cache and weekly cleanup pipeline Signed-off-by: Andreas Karatzas --- docker/Dockerfile.rocm | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm index 7e438d420a74..8cfcf86ff99b 100644 --- a/docker/Dockerfile.rocm +++ b/docker/Dockerfile.rocm @@ -354,9 +354,9 @@ RUN --mount=type=bind,from=export_vllm,src=/,target=/install \ --mount=type=cache,target=/root/.cache/pip \ cd /install \ && uv pip install --system -r requirements/rocm.txt \ - && 
grep -v 'git+' requirements/rocm-test.txt > requirements/_rocm-test-nogit.txt \ - && uv pip install --system -r requirements/_rocm-test-nogit.txt \ - && rm requirements/_rocm-test-nogit.txt \ + && grep -v 'git+' requirements/rocm-test.txt > /tmp/rocm-test-nogit.txt \ + && cd requirements && uv pip install --system -r /tmp/rocm-test-nogit.txt && cd /install \ + && rm /tmp/rocm-test-nogit.txt \ && grep 'git+' requirements/rocm-test.txt > /tmp/git-reqs.txt \ && pip install --no-deps -r /tmp/git-reqs.txt \ && rm /tmp/git-reqs.txt \ From e8346a09e912c9830454503ecb9f8204bba7643d Mon Sep 17 00:00:00 2001 From: Andreas Karatzas Date: Sat, 14 Mar 2026 16:01:20 -0500 Subject: [PATCH 08/23] [ROCm][CI] Add ROCm Docker Hub registry cache and weekly cleanup pipeline Signed-off-by: Andreas Karatzas --- docker/Dockerfile.rocm | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm index 8cfcf86ff99b..11727cbb57ef 100644 --- a/docker/Dockerfile.rocm +++ b/docker/Dockerfile.rocm @@ -230,8 +230,11 @@ ARG DEEPEP_REPO="https://github.com/ROCm/DeepEP.git" ARG DEEPEP_NIC="cx7" # Build DeepEP wheel. DeepEP looks for rocshmem at ROCSHMEM_DIR (inherited from build_rocshmem). +# DeepEP only supports gfx942 and gfx950 — override PYTORCH_ROCM_ARCH to avoid +# the gfx90a in the default list causing a build failure. 
RUN --mount=type=cache,target=/root/.cache/ccache \ - git clone ${DEEPEP_REPO} \ + export PYTORCH_ROCM_ARCH="gfx942;gfx950" \ + && git clone ${DEEPEP_REPO} \ && cd DeepEP \ && git checkout ${DEEPEP_BRANCH} \ && MAX_JOBS="${MAX_JOBS:-$(nproc)}" python3 setup.py --variant rocm --nic ${DEEPEP_NIC} bdist_wheel --dist-dir=/app/deep_install From a2c6035758d014758373fc26bcb8745ce9dd426c Mon Sep 17 00:00:00 2001 From: Andreas Karatzas Date: Sat, 14 Mar 2026 16:11:07 -0500 Subject: [PATCH 09/23] [ROCm][CI] Add ROCm Docker Hub registry cache and weekly cleanup pipeline Signed-off-by: Andreas Karatzas --- docker/Dockerfile.rocm | 17 ++++++----------- requirements/rocm-test.txt | 2 +- 2 files changed, 7 insertions(+), 12 deletions(-) diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm index 11727cbb57ef..d33d77ca0f52 100644 --- a/docker/Dockerfile.rocm +++ b/docker/Dockerfile.rocm @@ -348,21 +348,16 @@ FROM base AS test RUN python3 -m pip install --upgrade pip && rm -rf /var/lib/apt/lists/* -# Install vLLM using uv (inherited from base stage) -# Note: No -U flag to avoid upgrading PyTorch ROCm to CUDA version -# Note: rocm-test.txt contains a git+ URL (fastsafetensors) that uv cannot resolve; -# we install non-git requirements with uv and git+ requirements with pip separately. 
+# Install vLLM dependencies and test requirements RUN --mount=type=bind,from=export_vllm,src=/,target=/install \ --mount=type=cache,target=/root/.cache/uv \ - --mount=type=cache,target=/root/.cache/pip \ cd /install \ && uv pip install --system -r requirements/rocm.txt \ - && grep -v 'git+' requirements/rocm-test.txt > /tmp/rocm-test-nogit.txt \ - && cd requirements && uv pip install --system -r /tmp/rocm-test-nogit.txt && cd /install \ - && rm /tmp/rocm-test-nogit.txt \ - && grep 'git+' requirements/rocm-test.txt > /tmp/git-reqs.txt \ - && pip install --no-deps -r /tmp/git-reqs.txt \ - && rm /tmp/git-reqs.txt \ + && if grep -q 'git+' requirements/rocm-test.txt; then \ + pip install -r requirements/rocm-test.txt; \ + else \ + uv pip install --system -r requirements/rocm-test.txt; \ + fi \ && pip uninstall -y vllm \ && uv pip install --system *.whl diff --git a/requirements/rocm-test.txt b/requirements/rocm-test.txt index e616a99c5315..50edda4fe263 100644 --- a/requirements/rocm-test.txt +++ b/requirements/rocm-test.txt @@ -79,7 +79,7 @@ pqdm==0.2.0 # via lm-eval # Required for fastsafetensors test -fastsafetensors @ git+https://github.com/foundation-model-stack/fastsafetensors.git@d6f998a03432b2452f8de2bb5cefb5af9795d459 +fastsafetensors==0.2.2 # Required for suffix decoding test arctic-inference == 0.1.1 # Required for Nemotron test From 067a486c8f0cfbe5604dc16954857b6bf46e8e28 Mon Sep 17 00:00:00 2001 From: Andreas Karatzas Date: Sat, 14 Mar 2026 16:20:08 -0500 Subject: [PATCH 10/23] [ROCm][CI] Add ROCm Docker Hub registry cache and weekly cleanup pipeline Signed-off-by: Andreas Karatzas --- docker/docker-bake-rocm.hcl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/docker-bake-rocm.hcl b/docker/docker-bake-rocm.hcl index e227c57b8532..723f38999efd 100644 --- a/docker/docker-bake-rocm.hcl +++ b/docker/docker-bake-rocm.hcl @@ -60,7 +60,7 @@ target "_labels" { "org.opencontainers.image.revision" = COMMIT } annotations = [ - 
"index,manifest:org.opencontainers.image.revision=${COMMIT}", + "manifest:org.opencontainers.image.revision=${COMMIT}", ] } From 3509942db63fa6c31c6de91ce5ab58ba6d7889a5 Mon Sep 17 00:00:00 2001 From: Andreas Karatzas Date: Sat, 14 Mar 2026 17:11:48 -0500 Subject: [PATCH 11/23] [ROCm][CI] Add ROCm Docker Hub registry cache and weekly cleanup pipeline Signed-off-by: Andreas Karatzas --- docker/Dockerfile.rocm | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm index d33d77ca0f52..b7ed836dbe35 100644 --- a/docker/Dockerfile.rocm +++ b/docker/Dockerfile.rocm @@ -21,9 +21,10 @@ RUN apt-get update -q -y && apt-get install -q -y \ sqlite3 libsqlite3-dev libfmt-dev libmsgpack-dev libsuitesparse-dev \ apt-transport-https ca-certificates wget curl \ ccache mold -# Use mold as the default linker — significantly faster than GNU ld/gold for -# the large C++ link steps in ROCm extension builds (e.g. vLLM, DeepEP, FA). -RUN update-alternatives --install /usr/bin/ld ld /usr/bin/mold 100 +# Note: mold is installed but NOT set as the system default linker because +# some packages (e.g. aiter) use JIT compilation at runtime with flags +# that mold doesn't support (--cref). Build stages opt in via CMAKE_LINKER_TYPE +# or LDFLAGS="-fuse-ld=mold". 
RUN python3 -m pip install --upgrade pip # Remove sccache only if not using sccache (it exists in base image from Dockerfile.rocm_base) ARG USE_SCCACHE @@ -107,12 +108,13 @@ FROM fetch_vllm_${REMOTE_VLLM} AS fetch_vllm # vLLM build stages FROM fetch_vllm AS build_vllm # Build vLLM wheel (setup.py auto-detects ccache/sccache in PATH) +# Use mold linker for faster linking of large C++ extensions RUN --mount=type=cache,target=/root/.cache/ccache \ --mount=type=cache,target=/root/.cache/uv \ cd vllm \ && uv pip install --system -r requirements/rocm.txt \ && python3 setup.py clean --all \ - && MAX_JOBS="${MAX_JOBS:-$(nproc)}" python3 setup.py bdist_wheel --dist-dir=dist + && LDFLAGS="-fuse-ld=mold" MAX_JOBS="${MAX_JOBS:-$(nproc)}" python3 setup.py bdist_wheel --dist-dir=dist FROM scratch AS export_vllm ARG COMMON_WORKDIR COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/dist/*.whl / @@ -237,7 +239,7 @@ RUN --mount=type=cache,target=/root/.cache/ccache \ && git clone ${DEEPEP_REPO} \ && cd DeepEP \ && git checkout ${DEEPEP_BRANCH} \ - && MAX_JOBS="${MAX_JOBS:-$(nproc)}" python3 setup.py --variant rocm --nic ${DEEPEP_NIC} bdist_wheel --dist-dir=/app/deep_install + && LDFLAGS="-fuse-ld=mold" MAX_JOBS="${MAX_JOBS:-$(nproc)}" python3 setup.py --variant rocm --nic ${DEEPEP_NIC} bdist_wheel --dist-dir=/app/deep_install # ----------------------- From 3ca62e109dfe16f414aa0ed4739dce0ca8243618 Mon Sep 17 00:00:00 2001 From: Andreas Karatzas Date: Sat, 14 Mar 2026 18:17:35 -0500 Subject: [PATCH 12/23] [ROCm][CI] Add ROCm Docker Hub registry cache and weekly cleanup pipeline Signed-off-by: Andreas Karatzas --- docker/Dockerfile.rocm | 16 ++++----- docker/docker-bake-rocm.hcl | 6 ++-- tools/install_torchcodec_rocm.sh | 57 +++++++++++++++++++++++++++++--- 3 files changed, 64 insertions(+), 15 deletions(-) diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm index b7ed836dbe35..8186b0a9c894 100644 --- a/docker/Dockerfile.rocm +++ b/docker/Dockerfile.rocm @@ -397,20 +397,18 @@ RUN 
--mount=type=cache,target=/root/.cache/uv \ cd /vllm-workspace \ && uv pip install --system -e tests/vllm_test_utils pytest-shard +# Pre-install FFmpeg dev libs so torchcodec can be built from source at test +# time without apt-get update (saves ~10s per whisper/audio test step). +RUN apt-get update -q -y && apt-get install -q -y --no-install-recommends \ + pkg-config ffmpeg libavcodec-dev libavformat-dev libavutil-dev \ + libswscale-dev libavdevice-dev libavfilter-dev libswresample-dev \ + && rm -rf /var/lib/apt/lists/* + # enable fast downloads from hf (for testing) RUN --mount=type=cache,target=/root/.cache/uv \ uv pip install --system hf_transfer ENV HF_HUB_ENABLE_HF_TRANSFER=1 -# install audio decode package `torchcodec` from source (required due to -# ROCm and torch version mismatch) for tests with datasets package -COPY tools/install_torchcodec_rocm.sh /tmp/install_torchcodec.sh -RUN --mount=type=cache,target=/root/.cache/pip \ - bash /tmp/install_torchcodec.sh \ - && rm /tmp/install_torchcodec.sh \ - && apt-get clean \ - && rm -rf /var/lib/apt/lists/* - # Copy in the v1 package (for python-only install test group) COPY --from=export_vllm /vllm_v1 /usr/local/lib/python${PYTHON_VERSION}/dist-packages/vllm/v1 diff --git a/docker/docker-bake-rocm.hcl b/docker/docker-bake-rocm.hcl index 723f38999efd..ad7b933a788c 100644 --- a/docker/docker-bake-rocm.hcl +++ b/docker/docker-bake-rocm.hcl @@ -12,9 +12,11 @@ # docker buildx bake -f docker/docker-bake-rocm.hcl -f /tmp/ci-rocm.hcl test-rocm-ci variable "MAX_JOBS" { - # Cap parallelism to avoid OOM during linking on large machines. + # Empty string lets the Dockerfile fall back to $(nproc) via + # MAX_JOBS="${MAX_JOBS:-$(nproc)}" in each RUN step, which uses all + # available cores on whatever machine the build runs on. # Override with --set '*.args.max_jobs=8' for local builds on small machines. 
- default = "64" + default = "" } variable "PYTORCH_ROCM_ARCH" { diff --git a/tools/install_torchcodec_rocm.sh b/tools/install_torchcodec_rocm.sh index 6cb3b39fd66a..210d7b24145a 100755 --- a/tools/install_torchcodec_rocm.sh +++ b/tools/install_torchcodec_rocm.sh @@ -3,12 +3,16 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Script to install TorchCodec from source (required for ROCm compatibility) +# The PyPI wheel is built against upstream PyTorch and has ABI mismatches with +# ROCm's custom torch build, so we must compile from source. set -e TORCHCODEC_REPO="${TORCHCODEC_REPO:-https://github.com/pytorch/torchcodec.git}" # Pin to a specific release for reproducibility; update as needed. TORCHCODEC_BRANCH="${TORCHCODEC_BRANCH:-v0.10.0}" +# Cache directory for pre-built wheels to avoid redundant recompilation. +TORCHCODEC_WHEEL_CACHE="${TORCHCODEC_WHEEL_CACHE:-/root/.cache/torchcodec-wheels}" echo "=== TorchCodec Installation Script ===" @@ -18,9 +22,26 @@ if python3 -c "from torchcodec.decoders import VideoDecoder" 2>/dev/null; then exit 0 fi +# Try to install from cached wheel first +ARCH_TAG="${PYTORCH_ROCM_ARCH:-all}" +# Normalize arch tag (replace ; with _) for use in filename +ARCH_TAG="${ARCH_TAG//;/_}" +CACHED_WHEEL="${TORCHCODEC_WHEEL_CACHE}/torchcodec-${TORCHCODEC_BRANCH}-${ARCH_TAG}.whl" + +if [ -f "$CACHED_WHEEL" ]; then + echo "Found cached wheel: $CACHED_WHEEL" + pip install "$CACHED_WHEEL" && { + echo "Installed from cached wheel." + echo "=== TorchCodec installation complete ===" + exit 0 + } + echo "Cached wheel installation failed, rebuilding from source..." +fi + echo "TorchCodec not found. Installing from source..." -# Install system dependencies (FFmpeg + pkg-config) +# Install system dependencies (FFmpeg + pkg-config) if not already present. +# The Docker test image pre-installs these, so this is a fallback for other envs. 
install_system_deps() { if command -v apt-get &> /dev/null; then echo "Installing system dependencies..." @@ -56,6 +77,12 @@ export pybind11_DIR=$(python3 -c "import pybind11; print(pybind11.get_cmake_dir( export CMAKE_PREFIX_PATH="${pybind11_DIR}:${CMAKE_PREFIX_PATH}" echo "pybind11_DIR set to: $pybind11_DIR" +# Limit GPU architectures to only what this image targets. +# The default builds for all supported archs which is very slow. +if [ -n "$PYTORCH_ROCM_ARCH" ]; then + echo "Building for PYTORCH_ROCM_ARCH=$PYTORCH_ROCM_ARCH" +fi + # Create temp directory for build BUILD_DIR=$(mktemp -d -t torchcodec-XXXXXX) echo "Building in temporary directory: $BUILD_DIR" @@ -77,9 +104,31 @@ cd torchcodec export TORCHCODEC_CMAKE_BUILD_DIR="${PWD}/build" export TORCHCODEC_DISABLE_COMPILE_WARNING_AS_ERROR=1 export I_CONFIRM_THIS_IS_NOT_A_LICENSE_VIOLATION=1 +# Use ninja for faster builds and parallelize compilation +export CMAKE_GENERATOR=Ninja +export MAX_JOBS="${MAX_JOBS:-$(nproc)}" +# Use ccache if available to speed up recompilation +if command -v ccache &> /dev/null; then + export CMAKE_C_COMPILER_LAUNCHER=ccache + export CMAKE_CXX_COMPILER_LAUNCHER=ccache +fi -echo "Building TorchCodec..." -pip install . --no-build-isolation +echo "Building TorchCodec (MAX_JOBS=$MAX_JOBS)..." +pip wheel . --no-build-isolation --no-deps -w "$BUILD_DIR/dist" + +# Install the built wheel +BUILT_WHEEL=$(ls "$BUILD_DIR/dist"/torchcodec-*.whl 2>/dev/null | head -1) +if [ -z "$BUILT_WHEEL" ]; then + echo "Error: No wheel produced" + exit 1 +fi + +pip install "$BUILT_WHEEL" + +# Cache the wheel for future runs +mkdir -p "$TORCHCODEC_WHEEL_CACHE" +cp "$BUILT_WHEEL" "$CACHED_WHEEL" +echo "Cached wheel to: $CACHED_WHEEL" # Verify installation echo "Verifying installation..." 
@@ -88,4 +137,4 @@ if python3 -c "from torchcodec.decoders import VideoDecoder; print('TorchCodec i else echo "Error: TorchCodec installation failed verification" exit 1 -fi \ No newline at end of file +fi From 624b413595288b9f1b1e14c96b32266c69dc26f7 Mon Sep 17 00:00:00 2001 From: Andreas Karatzas Date: Sat, 14 Mar 2026 18:33:00 -0500 Subject: [PATCH 13/23] [ROCm][CI] Add ROCm Docker Hub registry cache and weekly cleanup pipeline Signed-off-by: Andreas Karatzas --- .buildkite/scripts/hardware_ci/run-amd-test.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/scripts/hardware_ci/run-amd-test.sh b/.buildkite/scripts/hardware_ci/run-amd-test.sh index 1c43c404d247..da76ea8eda53 100755 --- a/.buildkite/scripts/hardware_ci/run-amd-test.sh +++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh @@ -378,7 +378,7 @@ wait_for_clean_gpus # --- Pull test image --- echo "--- Pulling container" -image_name="rocm/vllm-ci:${BUILDKITE_COMMIT}" +image_name="${DOCKER_IMAGE_NAME:-rocm/vllm-ci:${BUILDKITE_COMMIT}}" container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)" docker pull "${image_name}" From 04f3ee6c9197f55b1c44160fb6b10e098f44da5c Mon Sep 17 00:00:00 2001 From: Andreas Karatzas Date: Sat, 14 Mar 2026 19:05:40 -0500 Subject: [PATCH 14/23] [ROCm][CI] Add ROCm Docker Hub registry cache and weekly cleanup pipeline Signed-off-by: Andreas Karatzas --- .buildkite/hardware_tests/amd.yaml | 6 ++++-- docker/Dockerfile.rocm | 15 ++++++--------- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/.buildkite/hardware_tests/amd.yaml b/.buildkite/hardware_tests/amd.yaml index 919ef3e3edb2..74978d47a4db 100644 --- a/.buildkite/hardware_tests/amd.yaml +++ b/.buildkite/hardware_tests/amd.yaml @@ -1,7 +1,9 @@ group: Hardware - AMD Build steps: - # Image with all architectures - - label: "AMD: :docker: build image" + # Fat multi-arch image - only auto-runs on main (cache warming / release). 
+ # On PR builds, the Jinja template gates this behind a manual block step. + # This YAML is the source-of-truth for step shape; the template adds the block logic. + - label: "AMD: :docker: build image (all archs)" key: image-build-amd depends_on: [] device: amd_cpu diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm index 8186b0a9c894..1aa852709784 100644 --- a/docker/Dockerfile.rocm +++ b/docker/Dockerfile.rocm @@ -380,12 +380,16 @@ RUN --mount=type=bind,from=build_deepep,src=/app/deep_install,target=/deep_insta # Copy rocshmem runtime libraries COPY --from=build_rocshmem /opt/rocshmem /opt/rocshmem -# RIXL/MoRIIO runtime dependencies (RDMA userspace libraries) -RUN apt-get update -q -y && apt-get install -q -y \ +# RDMA userspace libraries (RIXL/MoRIIO runtime) + FFmpeg dev libs (torchcodec +# source builds at test time). Combined into one apt-get to avoid a redundant +# apt-get update round-trip. +RUN apt-get update -q -y && apt-get install -q -y --no-install-recommends \ librdmacm1 \ libibverbs1 \ ibverbs-providers \ ibverbs-utils \ + pkg-config ffmpeg libavcodec-dev libavformat-dev libavutil-dev \ + libswscale-dev libavdevice-dev libavfilter-dev libswresample-dev \ && rm -rf /var/lib/apt/lists/* WORKDIR /vllm-workspace @@ -397,13 +401,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \ cd /vllm-workspace \ && uv pip install --system -e tests/vllm_test_utils pytest-shard -# Pre-install FFmpeg dev libs so torchcodec can be built from source at test -# time without apt-get update (saves ~10s per whisper/audio test step). 
-RUN apt-get update -q -y && apt-get install -q -y --no-install-recommends \ - pkg-config ffmpeg libavcodec-dev libavformat-dev libavutil-dev \ - libswscale-dev libavdevice-dev libavfilter-dev libswresample-dev \ - && rm -rf /var/lib/apt/lists/* - # enable fast downloads from hf (for testing) RUN --mount=type=cache,target=/root/.cache/uv \ uv pip install --system hf_transfer From 87a03a81a89c221ce6fe3f34cf32f9de0fa45e70 Mon Sep 17 00:00:00 2001 From: Andreas Karatzas Date: Sat, 14 Mar 2026 19:22:42 -0500 Subject: [PATCH 15/23] [ROCm][CI] Add ROCm Docker Hub registry cache and weekly cleanup pipeline Signed-off-by: Andreas Karatzas --- docker/Dockerfile.rocm | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm index 1aa852709784..1de5ebaa37e3 100644 --- a/docker/Dockerfile.rocm +++ b/docker/Dockerfile.rocm @@ -220,6 +220,9 @@ RUN --mount=type=cache,target=/root/.cache/ccache \ -DROCM_PATH=/opt/rocm \ -DCMAKE_POSITION_INDEPENDENT_CODE=ON \ -DUSE_EXTERNAL_MPI=OFF \ + -DAMDGPU_TARGETS="${PYTORCH_ROCM_ARCH}" \ + -DBUILD_TESTS=OFF \ + -DBUILD_EXAMPLES=OFF \ && make -j$(nproc) \ && make install From 17f5ee99e52a83a97dca6abd853c04330f629ce5 Mon Sep 17 00:00:00 2001 From: Andreas Karatzas Date: Sat, 14 Mar 2026 22:34:47 -0500 Subject: [PATCH 16/23] [ROCm][CI] Add ROCm Docker Hub registry cache and weekly cleanup pipeline Signed-off-by: Andreas Karatzas --- docker/Dockerfile.rocm | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm index 1de5ebaa37e3..d3bccfd73978 100644 --- a/docker/Dockerfile.rocm +++ b/docker/Dockerfile.rocm @@ -45,8 +45,10 @@ ENV UV_INDEX_STRATEGY="unsafe-best-match" ENV UV_LINK_MODE=copy # ccache directory — persisted across layer rebuilds via --mount=type=cache ENV CCACHE_DIR=/root/.cache/ccache -# Compilation parallelism — overridable via --build-arg max_jobs=N; falls back to nproc -ARG max_jobs +# Compilation parallelism — overridable 
via --build-arg max_jobs=N. +# Default to 64 to avoid race conditions with hipify at very high core counts +# (e.g. 256-core machines where -j=256 can start compiling before hipify finishes). +ARG max_jobs=64 ENV MAX_JOBS=${max_jobs} # Install sccache if USE_SCCACHE is enabled (for release builds) From 5af45e04d9e55c26dcf578a91b77ce1d84cb4bb2 Mon Sep 17 00:00:00 2001 From: Andreas Karatzas Date: Sat, 14 Mar 2026 22:41:18 -0500 Subject: [PATCH 17/23] [ROCm][CI] Chain hipify targets sequentially to resolve potential race condition in highly concurrent max job settings Signed-off-by: Andreas Karatzas --- cmake/utils.cmake | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/cmake/utils.cmake b/cmake/utils.cmake index bdb2ba74d944..6afe04724501 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -81,6 +81,15 @@ function (hipify_sources_target OUT_SRCS NAME ORIG_SRCS) BYPRODUCTS ${HIP_SRCS} COMMENT "Running hipify on ${NAME} extension source files.") + # Chain hipify targets so they run sequentially. Each hipify target runs + # shutil.copytree into a shared output directory; running them in parallel + # causes a race where one target's copytree overwrites .hip files produced + # by another target back to .cu originals. + if (DEFINED _VLLM_LAST_HIPIFY_TARGET) + add_dependencies(hipify${NAME} ${_VLLM_LAST_HIPIFY_TARGET}) + endif() + set(_VLLM_LAST_HIPIFY_TARGET "hipify${NAME}" PARENT_SCOPE) + # Swap out original extension sources with hipified sources. 
list(APPEND HIP_SRCS ${CXX_SRCS}) set(${OUT_SRCS} ${HIP_SRCS} PARENT_SCOPE) From 7f1f98c08aabc6c116879c05808068f0ba52d638 Mon Sep 17 00:00:00 2001 From: Andreas Karatzas Date: Thu, 26 Mar 2026 00:21:32 -0500 Subject: [PATCH 18/23] [ROCm][CI] Wire CI_BASE_IMAGE into bake targets and Jinja build steps Signed-off-by: Andreas Karatzas --- .buildkite/hardware_tests/amd-ci-base.yaml | 39 ++++++++++ .buildkite/hardware_tests/amd.yaml | 16 ++++ docker/Dockerfile.rocm | 88 +++++++++++++--------- 3 files changed, 107 insertions(+), 36 deletions(-) create mode 100644 .buildkite/hardware_tests/amd-ci-base.yaml diff --git a/.buildkite/hardware_tests/amd-ci-base.yaml b/.buildkite/hardware_tests/amd-ci-base.yaml new file mode 100644 index 000000000000..63ab32f42624 --- /dev/null +++ b/.buildkite/hardware_tests/amd-ci-base.yaml @@ -0,0 +1,39 @@ +# Scheduled pipeline: build and push the ROCm CI base image (Tier 1). +# +# This image contains all slow, stable CI dependencies (RIXL+UCX, DeepEP+rocshmem, +# torchcodec, RDMA libs, hf_transfer, pytest-shard, MIOPEN env vars) and is used +# as the base for every per-PR test image build via CI_BASE_IMAGE in amd.yaml. +# +# Rebuild triggers (configure in Buildkite UI → Pipelines → Schedules): +# - Weekly cron (e.g. 
Sunday 00:00 UTC) on the main branch +# - Manual trigger when RIXL_BRANCH, DEEPEP_BRANCH, or ROCSHMEM_BRANCH changes +# +# Produces: +# rocm/vllm-dev:ci_base <- stable tag, always points to most recent build +# rocm/vllm-dev:ci_base-YYYYMMDD <- dated snapshot for rollback +group: Hardware - AMD CI Base Build +steps: + - label: "AMD: :docker: build ci_base image" + key: image-build-amd-ci-base + depends_on: [] + device: amd_cpu + no_plugin: true + commands: + - export DATED_TAG="rocm/vllm-dev:ci_base-$(date +%Y%m%d)" + - export IMAGE_TAG="$DATED_TAG" + - export CI_BASE_IMAGE_TAG_DATED="$DATED_TAG" + - bash .buildkite/scripts/ci-bake.sh ci-base-rocm-ci + env: + DOCKER_BUILDKIT: "1" + VLLM_BAKE_FILE: "docker/docker-bake-rocm.hcl" + CI_HCL_URL: "https://raw.githubusercontent.com/vllm-project/ci-infra/main/docker/ci-rocm.hcl" + CI_BASE_IMAGE_TAG: "rocm/vllm-dev:ci_base" + DOCKERHUB_CACHE_TO: "rocm/vllm-ci-cache:rocm-latest" + retry: + automatic: + - exit_status: -1 # Agent was lost + limit: 1 + - exit_status: -10 # Agent was lost + limit: 1 + - exit_status: 1 # Machine occasionally fails + limit: 1 diff --git a/.buildkite/hardware_tests/amd.yaml b/.buildkite/hardware_tests/amd.yaml index 74978d47a4db..e784d9a87542 100644 --- a/.buildkite/hardware_tests/amd.yaml +++ b/.buildkite/hardware_tests/amd.yaml @@ -9,12 +9,16 @@ steps: device: amd_cpu no_plugin: true commands: + - docker pull rocm/vllm-dev:ci_base - bash .buildkite/scripts/ci-bake.sh test-rocm-ci env: DOCKER_BUILDKIT: "1" IMAGE_TAG: "rocm/vllm-ci:${BUILDKITE_COMMIT}" VLLM_BAKE_FILE: "docker/docker-bake-rocm.hcl" CI_HCL_URL: "https://raw.githubusercontent.com/vllm-project/ci-infra/main/docker/ci-rocm.hcl" + CI_BASE_IMAGE: "rocm/vllm-dev:ci_base" + REMOTE_VLLM: "1" + VLLM_BRANCH: "${BUILDKITE_COMMIT}" PYTORCH_ROCM_ARCH: "gfx90a;gfx942;gfx950" timeout_in_minutes: 600 retry: @@ -35,12 +39,16 @@ steps: device: amd_cpu no_plugin: true commands: + - docker pull rocm/vllm-dev:ci_base - bash .buildkite/scripts/ci-bake.sh
test-rocm-gfx90a-ci env: DOCKER_BUILDKIT: "1" IMAGE_TAG: "rocm/vllm-ci:${BUILDKITE_COMMIT}-gfx90a" VLLM_BAKE_FILE: "docker/docker-bake-rocm.hcl" CI_HCL_URL: "https://raw.githubusercontent.com/vllm-project/ci-infra/main/docker/ci-rocm.hcl" + CI_BASE_IMAGE: "rocm/vllm-dev:ci_base" + REMOTE_VLLM: "1" + VLLM_BRANCH: "${BUILDKITE_COMMIT}" PYTORCH_ROCM_ARCH: "gfx90a" timeout_in_minutes: 600 retry: @@ -60,12 +68,16 @@ steps: device: amd_cpu no_plugin: true commands: + - docker pull rocm/vllm-dev:ci_base - bash .buildkite/scripts/ci-bake.sh test-rocm-gfx942-ci env: DOCKER_BUILDKIT: "1" IMAGE_TAG: "rocm/vllm-ci:${BUILDKITE_COMMIT}-gfx942" VLLM_BAKE_FILE: "docker/docker-bake-rocm.hcl" CI_HCL_URL: "https://raw.githubusercontent.com/vllm-project/ci-infra/main/docker/ci-rocm.hcl" + CI_BASE_IMAGE: "rocm/vllm-dev:ci_base" + REMOTE_VLLM: "1" + VLLM_BRANCH: "${BUILDKITE_COMMIT}" PYTORCH_ROCM_ARCH: "gfx942" timeout_in_minutes: 600 retry: @@ -85,12 +97,16 @@ steps: device: amd_cpu no_plugin: true commands: + - docker pull rocm/vllm-dev:ci_base - bash .buildkite/scripts/ci-bake.sh test-rocm-gfx950-ci env: DOCKER_BUILDKIT: "1" IMAGE_TAG: "rocm/vllm-ci:${BUILDKITE_COMMIT}-gfx950" VLLM_BAKE_FILE: "docker/docker-bake-rocm.hcl" CI_HCL_URL: "https://raw.githubusercontent.com/vllm-project/ci-infra/main/docker/ci-rocm.hcl" + CI_BASE_IMAGE: "rocm/vllm-dev:ci_base" + REMOTE_VLLM: "1" + VLLM_BRANCH: "${BUILDKITE_COMMIT}" PYTORCH_ROCM_ARCH: "gfx950" timeout_in_minutes: 600 retry: diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm index d3bccfd73978..dd80c284f69c 100644 --- a/docker/Dockerfile.rocm +++ b/docker/Dockerfile.rocm @@ -2,6 +2,7 @@ ARG REMOTE_VLLM="0" ARG COMMON_WORKDIR=/app ARG BASE_IMAGE=rocm/vllm-dev:base +ARG CI_BASE_IMAGE=rocm/vllm-dev:ci_base # Sccache configuration (only used in release pipeline) ARG USE_SCCACHE @@ -350,8 +351,56 @@ COPY --from=build_vllm_wheel_release ${COMMON_WORKDIR}/vllm/.buildkite /.buildki COPY --from=build_vllm_wheel_release 
${COMMON_WORKDIR}/vllm/vllm/v1 /vllm_v1 # ----------------------- -# Test vLLM image -FROM base AS test +# CI base image (Tier 1) — stable, rarely-changing CI dependencies. +# Rebuilt weekly (or when RIXL/DeepEP/ROCSHMEM branch ARGs change). +# Per-PR test builds pull this as CI_BASE_IMAGE instead of rebuilding +# these slow layers from scratch every commit. +FROM base AS ci_base + +# Install RIXL wheel (pre-built in build_rixl stage) +RUN --mount=type=bind,from=build_rixl,src=/app/install,target=/rixl_install \ + uv pip install --system /rixl_install/*.whl + +# Install DeepEP wheel (pre-built in build_deepep stage) +RUN --mount=type=bind,from=build_deepep,src=/app/deep_install,target=/deep_install \ + uv pip install --system /deep_install/*.whl + +# Copy rocshmem runtime libraries +COPY --from=build_rocshmem /opt/rocshmem /opt/rocshmem + +# RDMA userspace libraries (RIXL/MoRIIO runtime) + FFmpeg dev libs (torchcodec). +# Combined into one apt-get to avoid a redundant apt-get update round-trip. +RUN apt-get update -q -y && apt-get install -q -y --no-install-recommends \ + librdmacm1 \ + libibverbs1 \ + ibverbs-providers \ + ibverbs-utils \ + pkg-config ffmpeg libavcodec-dev libavformat-dev libavutil-dev \ + libswscale-dev libavdevice-dev libavfilter-dev libswresample-dev \ + && rm -rf /var/lib/apt/lists/* + +# Install torchcodec from source (ROCm/torch version mismatch prevents PyPI install). +# Pre-building here avoids rebuilding it on every per-PR test image. 
+COPY tools/install_torchcodec_rocm.sh /tmp/install_torchcodec.sh +RUN bash /tmp/install_torchcodec.sh \ + && rm /tmp/install_torchcodec.sh \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +# Stable test tooling that doesn't depend on the vLLM wheel +RUN --mount=type=cache,target=/root/.cache/uv \ + uv pip install --system hf_transfer pytest-shard +ENV HF_HUB_ENABLE_HF_TRANSFER=1 + +# Suppress MIOpen 3D convolution performance regressions +# See: https://github.com/pytorch/pytorch/issues/169857 +ENV MIOPEN_DEBUG_CONV_DIRECT=0 +ENV MIOPEN_DEBUG_CONV_GEMM=0 + +# ----------------------- +# Test vLLM image (Tier 2) — thin per-PR layer on top of ci_base. +# Only rebuilds vLLM wheel + workspace, which change every PR. +FROM ${CI_BASE_IMAGE} AS test RUN python3 -m pip install --upgrade pip && rm -rf /var/lib/apt/lists/* @@ -374,29 +423,6 @@ RUN --mount=type=bind,from=export_vllm,src=/,target=/install \ # vLLM without a compiler (no wheels.vllm.ai equivalent exists for ROCm). COPY --from=export_vllm /*.whl /opt/vllm-wheels/ -# Install RIXL wheel -RUN --mount=type=bind,from=build_rixl,src=/app/install,target=/rixl_install \ - uv pip install --system /rixl_install/*.whl - -# Install DeepEP wheel -RUN --mount=type=bind,from=build_deepep,src=/app/deep_install,target=/deep_install \ - uv pip install --system /deep_install/*.whl - -# Copy rocshmem runtime libraries -COPY --from=build_rocshmem /opt/rocshmem /opt/rocshmem - -# RDMA userspace libraries (RIXL/MoRIIO runtime) + FFmpeg dev libs (torchcodec -# source builds at test time). Combined into one apt-get to avoid a redundant -# apt-get update round-trip. 
-RUN apt-get update -q -y && apt-get install -q -y --no-install-recommends \ - librdmacm1 \ - libibverbs1 \ - ibverbs-providers \ - ibverbs-utils \ - pkg-config ffmpeg libavcodec-dev libavformat-dev libavutil-dev \ - libswscale-dev libavdevice-dev libavfilter-dev libswresample-dev \ - && rm -rf /var/lib/apt/lists/* - WORKDIR /vllm-workspace ARG COMMON_WORKDIR COPY --from=build_vllm ${COMMON_WORKDIR}/vllm /vllm-workspace @@ -404,21 +430,11 @@ COPY --from=build_vllm ${COMMON_WORKDIR}/vllm /vllm-workspace # install development dependencies (for testing) RUN --mount=type=cache,target=/root/.cache/uv \ cd /vllm-workspace \ - && uv pip install --system -e tests/vllm_test_utils pytest-shard - -# enable fast downloads from hf (for testing) -RUN --mount=type=cache,target=/root/.cache/uv \ - uv pip install --system hf_transfer -ENV HF_HUB_ENABLE_HF_TRANSFER=1 + && uv pip install --system -e tests/vllm_test_utils # Copy in the v1 package (for python-only install test group) COPY --from=export_vllm /vllm_v1 /usr/local/lib/python${PYTHON_VERSION}/dist-packages/vllm/v1 -# Set MIOPEN ENVS to resolve performance regressions in MIOpen 3D convolution kernel -# See: https://github.com/pytorch/pytorch/issues/169857 -ENV MIOPEN_DEBUG_CONV_DIRECT=0 -ENV MIOPEN_DEBUG_CONV_GEMM=0 - # Source code is used in the `python_only_compile.sh` test # We hide it inside `src/` so that this source code # will not be imported by other tests From 064edb8de96e0755ff8c2676a24dd0d98307303a Mon Sep 17 00:00:00 2001 From: Andreas Karatzas Date: Thu, 26 Mar 2026 00:28:33 -0500 Subject: [PATCH 19/23] [ROCm][CI] Wire CI_BASE_IMAGE into bake targets and Jinja build steps Signed-off-by: Andreas Karatzas --- .buildkite/test-amd.yaml | 85 ++-------------------------------------- 1 file changed, 4 insertions(+), 81 deletions(-) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index b691e5705696..254f9f5d0b00 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -192,9 +192,10 @@ 
steps: commands: - bash standalone_tests/python_only_compile_rocm.sh -- label: Basic Correctness Test # 20min - timeout_in_minutes: 30 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a] + +- label: Basic Correctness Test # TBD + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdgfx90a] agent_pool: mi250_1 fast_check: true torch_nightly: true @@ -1453,18 +1454,6 @@ steps: timeout_in_minutes: 180 mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] agent_pool: mi325_1 - optional: true - source_file_dependencies: - - tests/standalone_tests/python_only_compile_rocm.sh - - setup.py - commands: - - bash standalone_tests/python_only_compile_rocm.sh - -- label: Basic Correctness Test # 20min - timeout_in_minutes: 30 - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_1 - # grade: Blocking fast_check: true torch_nightly: true working_dir: "/vllm-workspace/tests" @@ -3005,72 +2994,6 @@ steps: timeout_in_minutes: 180 mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] agent_pool: mi355_1 - optional: true - soft_fail: true - source_file_dependencies: - - requirements/nightly_torch_test.txt - commands: - - bash standalone_tests/pytorch_nightly_dependency.sh - -- label: Async Engine, Inputs, Utils, Worker Test # 10min - timeout_in_minutes: 15 - mirror_hardwares: [amdexperimental, amdproduction, amdtentative] - agent_pool: mi355_1 - optional: true - source_file_dependencies: - - vllm/ - - tests/multimodal - - tests/utils_ - commands: - - pytest -v -s -m 'not cpu_test' multimodal - - pytest -v -s utils_ - -- label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 20min - timeout_in_minutes: 30 - mirror_hardwares: [amdexperimental, amdproduction, amdtentative] - agent_pool: mi355_1 - optional: true - source_file_dependencies: - - vllm/ - - tests/test_inputs.py - - tests/test_outputs.py - - tests/test_pooling_params.py - - tests/multimodal - 
- tests/renderers - - tests/standalone_tests/lazy_imports.py - - tests/tokenizers_ - - tests/tool_parsers - - tests/transformers_utils - - tests/config - no_gpu: true - commands: - - python3 standalone_tests/lazy_imports.py - - pytest -v -s test_inputs.py - - pytest -v -s test_outputs.py - - pytest -v -s test_pooling_params.py - - pytest -v -s -m 'cpu_test' multimodal - - pytest -v -s renderers - - pytest -v -s tokenizers_ - - pytest -v -s tool_parsers - - pytest -v -s transformers_utils - - pytest -v -s config - -- label: Python-only Installation Test # 10min - timeout_in_minutes: 20 - mirror_hardwares: [amdexperimental] - agent_pool: mi355_1 - optional: true - source_file_dependencies: - - tests/standalone_tests/python_only_compile_rocm.sh - - setup.py - commands: - - bash standalone_tests/python_only_compile_rocm.sh - -- label: Basic Correctness Test # 20min - timeout_in_minutes: 30 - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi355_1 - optional: true fast_check: true torch_nightly: true working_dir: "/vllm-workspace/tests" From 12f3da72ef86f5df9e5863d07cd8fdd65880eb88 Mon Sep 17 00:00:00 2001 From: Andreas Karatzas Date: Thu, 26 Mar 2026 00:29:37 -0500 Subject: [PATCH 20/23] [ROCm][CI] Wire CI_BASE_IMAGE into bake targets and Jinja build steps Signed-off-by: Andreas Karatzas --- .buildkite/test-amd.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 254f9f5d0b00..c75707b659b3 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -193,9 +193,9 @@ steps: - bash standalone_tests/python_only_compile_rocm.sh -- label: Basic Correctness Test # TBD +- label: Basic Correctness # TBD timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdgfx90a] + mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] agent_pool: mi250_1 fast_check: true torch_nightly: true From 
758ef6a45f3d1c62377a6d67f3907c5d63bdc22b Mon Sep 17 00:00:00 2001 From: Andreas Karatzas Date: Thu, 26 Mar 2026 02:01:05 -0500 Subject: [PATCH 21/23] [ROCm][CI] Wire CI_BASE_IMAGE into bake targets and Jinja build steps Signed-off-by: Andreas Karatzas --- docker/Dockerfile.rocm | 17 ++++++++++++++--- docker/docker-bake-rocm.hcl | 18 ++++++++++++++++++ 2 files changed, 32 insertions(+), 3 deletions(-) diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm index dd80c284f69c..28357fa2d48a 100644 --- a/docker/Dockerfile.rocm +++ b/docker/Dockerfile.rocm @@ -26,7 +26,6 @@ RUN apt-get update -q -y && apt-get install -q -y \ # some packages (e.g. aiter) use JIT compilation at runtime with flags # that mold doesn't support (--cref). Build stages opt in via CMAKE_LINKER_TYPE # or LDFLAGS="-fuse-ld=mold". -RUN python3 -m pip install --upgrade pip # Remove sccache only if not using sccache (it exists in base image from Dockerfile.rocm_base) ARG USE_SCCACHE RUN if [ "$USE_SCCACHE" != "1" ]; then \ @@ -392,6 +391,20 @@ RUN --mount=type=cache,target=/root/.cache/uv \ uv pip install --system hf_transfer pytest-shard ENV HF_HUB_ENABLE_HF_TRANSFER=1 +# Pre-install vLLM runtime + test dependencies (stable between PRs). +# The per-PR test stage re-runs the same install, but uv resolves in <100ms +# because 99% of packages are already present from ci_base. 
+COPY requirements/rocm.txt /tmp/rocm-reqs.txt +COPY requirements/rocm-test.txt /tmp/rocm-test-reqs.txt +RUN --mount=type=cache,target=/root/.cache/uv \ + uv pip install --system -r /tmp/rocm-reqs.txt \ + && if grep -q 'git+' /tmp/rocm-test-reqs.txt; then \ + pip install -r /tmp/rocm-test-reqs.txt; \ + else \ + uv pip install --system -r /tmp/rocm-test-reqs.txt; \ + fi \ + && rm /tmp/rocm-reqs.txt /tmp/rocm-test-reqs.txt + # Suppress MIOpen 3D convolution performance regressions # See: https://github.com/pytorch/pytorch/issues/169857 ENV MIOPEN_DEBUG_CONV_DIRECT=0 @@ -402,8 +415,6 @@ ENV MIOPEN_DEBUG_CONV_GEMM=0 # Only rebuilds vLLM wheel + workspace, which change every PR. FROM ${CI_BASE_IMAGE} AS test -RUN python3 -m pip install --upgrade pip && rm -rf /var/lib/apt/lists/* - # Install vLLM dependencies and test requirements RUN --mount=type=bind,from=export_vllm,src=/,target=/install \ --mount=type=cache,target=/root/.cache/uv \ diff --git a/docker/docker-bake-rocm.hcl b/docker/docker-bake-rocm.hcl index ad7b933a788c..24e50393d9a7 100644 --- a/docker/docker-bake-rocm.hcl +++ b/docker/docker-bake-rocm.hcl @@ -37,6 +37,13 @@ variable "VLLM_BRANCH" { default = "main" } +# CI_BASE_IMAGE: pre-built ci_base image for per-PR test builds. +# Defaults to the published "rocm/vllm-dev:ci_base" image; CI sets the same +# value explicitly via the CI_BASE_IMAGE environment variable. +variable "CI_BASE_IMAGE" { + default = "rocm/vllm-dev:ci_base" +} + group "default" { targets = ["test-rocm"] } @@ -49,6 +56,7 @@ target "_common-rocm" { ARG_PYTORCH_ROCM_ARCH = PYTORCH_ROCM_ARCH REMOTE_VLLM = REMOTE_VLLM VLLM_BRANCH = VLLM_BRANCH + CI_BASE_IMAGE = CI_BASE_IMAGE } } @@ -119,6 +127,16 @@ target "test-rocm-gfx950-ci" { inherits = ["test-rocm-gfx950"] } +# CI base image target — builds only the ci_base stage (RIXL, DeepEP, +# torchcodec, requirements, etc.). Used by the weekly scheduled build and +# the auto-rebuild trigger when requirements change in a PR.
+target "ci-base-rocm" { + inherits = ["_common-rocm", "_labels"] + target = "ci_base" + tags = ["rocm/vllm-dev:ci_base"] + output = ["type=docker"] +} + target "final-rocm" { inherits = ["_common-rocm", "_labels"] target = "final" From 3c5c2ab5810447926d3adeb0b1eaa1a30d239fce Mon Sep 17 00:00:00 2001 From: Andreas Karatzas Date: Thu, 26 Mar 2026 04:31:50 -0500 Subject: [PATCH 22/23] [ROCm][CI] Wire CI_BASE_IMAGE into bake targets and Jinja build steps Signed-off-by: Andreas Karatzas --- .buildkite/scripts/ci-bake.sh | 2 +- .pre-commit-config.yaml | 37 +- docker/Dockerfile.rocm | 45 +- requirements/rocm-test.in | 83 ++ requirements/rocm-test.txt | 1451 ++++++++++++++++++++++++++++++--- 5 files changed, 1489 insertions(+), 129 deletions(-) create mode 100644 requirements/rocm-test.in diff --git a/.buildkite/scripts/ci-bake.sh b/.buildkite/scripts/ci-bake.sh index f40c4a2a37dd..4eb690ec403d 100644 --- a/.buildkite/scripts/ci-bake.sh +++ b/.buildkite/scripts/ci-bake.sh @@ -37,7 +37,7 @@ set -euo pipefail # Check if image already exists (skip build if it does) -if [[ -n "${IMAGE_TAG:-}" ]]; then +if [[ -n "${IMAGE_TAG:-}" && "${FORCE_BUILD:-0}" != "1" ]]; then echo "--- :mag: Checking if image exists" if docker manifest inspect "${IMAGE_TAG}" >/dev/null 2>&1; then echo "Image already exists: ${IMAGE_TAG}" diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 0b17ad7335c7..e53274480cc0 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -36,11 +36,46 @@ repos: hooks: - id: actionlint - repo: https://github.com/astral-sh/uv-pre-commit - rev: 0.9.1 + rev: 0.11.1 hooks: - id: pip-compile args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu129, --python-platform, x86_64-manylinux_2_28, --python-version, "3.12"] files: ^requirements/test\.(in|txt)$ + - id: pip-compile + alias: pip-compile-rocm + name: pip-compile-rocm + args: [ + requirements/rocm-test.in, -o, 
requirements/rocm-test.txt, + --index-strategy, unsafe-best-match, + -c, requirements/rocm.txt, + --python-platform, x86_64-manylinux_2_28, + --python-version, "3.12", + # Exclude torch and CUDA/NVIDIA packages + --no-emit-package, torch, + --no-emit-package, torchvision, + --no-emit-package, torchaudio, + --no-emit-package, triton, + --no-emit-package, cuda-bindings, + --no-emit-package, cuda-pathfinder, + --no-emit-package, cuda-toolkit, + --no-emit-package, cupy-cuda12x, + --no-emit-package, nvidia-cublas, + --no-emit-package, nvidia-cuda-cupti, + --no-emit-package, nvidia-cuda-nvrtc, + --no-emit-package, nvidia-cuda-runtime, + --no-emit-package, nvidia-cudnn-cu13, + --no-emit-package, nvidia-cufft, + --no-emit-package, nvidia-cufile, + --no-emit-package, nvidia-curand, + --no-emit-package, nvidia-cusolver, + --no-emit-package, nvidia-cusparse, + --no-emit-package, nvidia-cusparselt-cu13, + --no-emit-package, nvidia-nccl-cu13, + --no-emit-package, nvidia-nvjitlink, + --no-emit-package, nvidia-nvshmem-cu13, + --no-emit-package, nvidia-nvtx, + ] + files: ^requirements/rocm-test\.(in|txt)$ - repo: local hooks: - id: format-torch-nightly-test diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm index 28357fa2d48a..3be12e4f9781 100644 --- a/docker/Dockerfile.rocm +++ b/docker/Dockerfile.rocm @@ -356,19 +356,16 @@ COPY --from=build_vllm_wheel_release ${COMMON_WORKDIR}/vllm/vllm/v1 /vllm_v1 # these slow layers from scratch every commit. 
FROM base AS ci_base -# Install RIXL wheel (pre-built in build_rixl stage) +# Install RIXL + DeepEP wheels (pre-built in earlier stages) RUN --mount=type=bind,from=build_rixl,src=/app/install,target=/rixl_install \ - uv pip install --system /rixl_install/*.whl - -# Install DeepEP wheel (pre-built in build_deepep stage) -RUN --mount=type=bind,from=build_deepep,src=/app/deep_install,target=/deep_install \ - uv pip install --system /deep_install/*.whl + --mount=type=bind,from=build_deepep,src=/app/deep_install,target=/deep_install \ + uv pip install --system /rixl_install/*.whl /deep_install/*.whl # Copy rocshmem runtime libraries COPY --from=build_rocshmem /opt/rocshmem /opt/rocshmem -# RDMA userspace libraries (RIXL/MoRIIO runtime) + FFmpeg dev libs (torchcodec). -# Combined into one apt-get to avoid a redundant apt-get update round-trip. +# RDMA userspace libraries (RIXL/MoRIIO runtime) + FFmpeg dev libs (torchcodec +# links against libav* at runtime, so the -dev packages must stay installed). RUN apt-get update -q -y && apt-get install -q -y --no-install-recommends \ librdmacm1 \ libibverbs1 \ @@ -379,31 +376,24 @@ RUN apt-get update -q -y && apt-get install -q -y --no-install-recommends \ && rm -rf /var/lib/apt/lists/* # Install torchcodec from source (ROCm/torch version mismatch prevents PyPI install). -# Pre-building here avoids rebuilding it on every per-PR test image. 
COPY tools/install_torchcodec_rocm.sh /tmp/install_torchcodec.sh RUN bash /tmp/install_torchcodec.sh \ && rm /tmp/install_torchcodec.sh \ - && apt-get clean \ - && rm -rf /var/lib/apt/lists/* + && apt-get clean && rm -rf /var/lib/apt/lists/* -# Stable test tooling that doesn't depend on the vLLM wheel +# hf_transfer for fast model downloads (pytest-shard is in rocm-test.txt lockfile) RUN --mount=type=cache,target=/root/.cache/uv \ - uv pip install --system hf_transfer pytest-shard + uv pip install --system hf_transfer ENV HF_HUB_ENABLE_HF_TRANSFER=1 -# Pre-install vLLM runtime + test dependencies (stable between PRs). +# Pre-install vLLM test dependencies (stable between PRs). +# rocm-test.txt is a fully-resolved lockfile (no relative -r includes). # The per-PR test stage re-runs the same install, but uv resolves in <100ms # because 99% of packages are already present from ci_base. -COPY requirements/rocm.txt /tmp/rocm-reqs.txt COPY requirements/rocm-test.txt /tmp/rocm-test-reqs.txt RUN --mount=type=cache,target=/root/.cache/uv \ - uv pip install --system -r /tmp/rocm-reqs.txt \ - && if grep -q 'git+' /tmp/rocm-test-reqs.txt; then \ - pip install -r /tmp/rocm-test-reqs.txt; \ - else \ - uv pip install --system -r /tmp/rocm-test-reqs.txt; \ - fi \ - && rm /tmp/rocm-reqs.txt /tmp/rocm-test-reqs.txt + uv pip install --system -r /tmp/rocm-test-reqs.txt \ + && rm /tmp/rocm-test-reqs.txt # Suppress MIOpen 3D convolution performance regressions # See: https://github.com/pytorch/pytorch/issues/169857 @@ -415,17 +405,12 @@ ENV MIOPEN_DEBUG_CONV_GEMM=0 # Only rebuilds vLLM wheel + workspace, which change every PR. FROM ${CI_BASE_IMAGE} AS test -# Install vLLM dependencies and test requirements +# Install the vLLM wheel. +# Runtime + test deps are already in ci_base (rocm-test.txt lockfile is +# compiled with -c rocm.txt, so all runtime packages are covered). 
RUN --mount=type=bind,from=export_vllm,src=/,target=/install \ --mount=type=cache,target=/root/.cache/uv \ cd /install \ - && uv pip install --system -r requirements/rocm.txt \ - && if grep -q 'git+' requirements/rocm-test.txt; then \ - pip install -r requirements/rocm-test.txt; \ - else \ - uv pip install --system -r requirements/rocm-test.txt; \ - fi \ - && pip uninstall -y vllm \ && uv pip install --system *.whl # Store the vLLM wheel in the image for python_only_compile_rocm.sh. diff --git a/requirements/rocm-test.in b/requirements/rocm-test.in new file mode 100644 index 000000000000..856fab7e9f65 --- /dev/null +++ b/requirements/rocm-test.in @@ -0,0 +1,83 @@ +# testing +pytest +tensorizer==2.10.1 +pytest-forked +pytest-asyncio +pytest-rerunfailures +pytest-shard +pytest-timeout +pytest-cov + +# testing utils +albumentations # required for Nemotron Parse in test_common.py +av # required for audio_in_video tests +backoff # required for phi4mm test +blobfile # required for kimi-vl test +einops # required for MPT, qwen-vl +httpx +librosa # required for audio tests +vector_quantize_pytorch # required for minicpmo_26 test +vocos # required for minicpmo_26 test +peft>=0.15.0 # required for phi-4-mm test +pqdm +ray[cgraph,default]>=2.48.0 # Ray Compiled Graph, required by pipeline parallelism tests +resampy # required for audio tests +sentence-transformers>=5.2.0 # required for embedding tests +soundfile # required for audio tests +jiwer # required for audio tests +tblib # for pickling test exceptions +timm>=1.0.17 # required for internvl and gemma3n-mm test +transformers_stream_generator # required for qwen-vl test +matplotlib # required for qwen-vl test +mistral_common[image,audio]>=1.10.0 # required for voxtral test +num2words # required for smolvlm test +open_clip_torch==2.32.0 # Required for nemotron_vl test, Nemotron Parse in test_common.py +opencv-python-headless>=4.13.0 # required for video test +datamodel_code_generator # required for minicpm3 test 
+lm-eval[api]>=0.4.11 # required for model evaluation test +mteb[bm25s]>=2, <3 # required for mteb test +transformers==4.57.5 +tokenizers==0.22.0 +schemathesis>=3.39.15 # Required for openai schema test +# quantization +bitsandbytes==0.49.2 +buildkite-test-collector==0.1.9 + +genai_perf>=0.0.8 +tritonclient>=2.51.0 + +# The version of gRPC libraries should be consistent with each other +grpcio==1.78.0 +grpcio-reflection==1.78.0 + +arctic-inference==0.1.1 # Required for suffix decoding test +numba==0.61.2 # Required for N-gram speculative decoding +numpy +runai-model-streamer[s3,gcs,azure]==0.15.7 +fastsafetensors>=0.2.2 # 0.2.2 contains important fixes for multi-GPU mem usage +instanttensor>=0.1.5 +pydantic>=2.12 # 2.11 leads to error on python 3.13 +decord==0.6.0 + +# Prithvi tests +terratorch>=1.2.2 +imagehash # Required for Prithvi tests +segmentation-models-pytorch>0.4.0 # Required for Prithvi tests + +gpt-oss>=0.0.7; python_version > '3.11' + +perceptron # required for isaac test +kaldi-native-fbank>=1.18.7 # required for fireredasr2 test + +# Newer versions of datasets require torchcodec, which makes the tests fail in CI because of a missing library. +# Older versions are in conflict with terratorch requirements.
+datasets>=3.3.0,<=3.6.0 + +openpyxl # required for perf comparison excel report +plotly # required for perf comparison html report + +# ROCm-specific extras (not in CUDA test.in) +rapidfuzz +torchgeo==0.7.0 +multiprocess==0.70.16 +huggingface-hub==0.36.2 diff --git a/requirements/rocm-test.txt b/requirements/rocm-test.txt index 9428ee112ae2..dd4c7c24f40c 100644 --- a/requirements/rocm-test.txt +++ b/requirements/rocm-test.txt @@ -1,116 +1,1373 @@ -# Common dependencies --r common.txt - -# Test infrastructure -tblib==3.1.0 -pytest==8.3.5 -pytest-asyncio==0.24.0 -pytest-timeout==2.3.1 -pytest-cov==6.3.0 -pytest-forked==1.6.0 -pytest-rerunfailures==14.0 -pytest-shard==0.1.2 - -# Async/HTTP dependencies -anyio==4.6.2.post1 - # via httpx, starlette +# This file was autogenerated by uv via the following command: +# uv pip compile requirements/rocm-test.in -o requirements/rocm-test.txt --index-strategy unsafe-best-match -c requirements/rocm.txt --python-platform x86_64-manylinux_2_28 --python-version 3.12 --no-emit-package torch --no-emit-package torchvision --no-emit-package torchaudio --no-emit-package triton --no-emit-package cuda-bindings --no-emit-package cuda-pathfinder --no-emit-package cuda-toolkit --no-emit-package cupy-cuda12x --no-emit-package nvidia-cublas --no-emit-package nvidia-cuda-cupti --no-emit-package nvidia-cuda-nvrtc --no-emit-package nvidia-cuda-runtime --no-emit-package nvidia-cudnn-cu13 --no-emit-package nvidia-cufft --no-emit-package nvidia-cufile --no-emit-package nvidia-curand --no-emit-package nvidia-cusolver --no-emit-package nvidia-cusparse --no-emit-package nvidia-cusparselt-cu13 --no-emit-package nvidia-nccl-cu13 --no-emit-package nvidia-nvjitlink --no-emit-package nvidia-nvshmem-cu13 --no-emit-package nvidia-nvtx +absl-py==2.4.0 + # via + # rouge-score + # tensorboard +accelerate==1.13.0 + # via peft +aenum==3.1.17 + # via lightly +affine==2.4.0 + # via rasterio +aiohappyeyeballs==2.6.1 + # via aiohttp aiohttp==3.13.3 - # via gpt-oss 
-httpx==0.27.2 - # HTTP testing - -# Audio processing dependencies + # via + # -c requirements/common.txt + # aiohttp-cors + # fsspec + # gpt-oss + # lm-eval + # ray +aiohttp-cors==0.8.1 + # via ray +aiosignal==1.4.0 + # via aiohttp +albucore==0.1.2 + # via terratorch +albumentations==1.4.6 + # via + # -r requirements/rocm-test.in + # terratorch +alembic==1.18.4 + # via optuna +annotated-doc==0.0.4 + # via + # fastapi + # typer +annotated-types==0.7.0 + # via pydantic +antlr4-python3-runtime==4.9.3 + # via + # hydra-core + # omegaconf +anyio==4.6.2.post1 + # via + # httpx + # starlette +arctic-inference==0.1.1 + # via -r requirements/rocm-test.in +argcomplete==3.6.3 + # via datamodel-code-generator +arrow==1.4.0 + # via isoduration +attrs==26.1.0 + # via + # aiohttp + # fiona + # jsonlines + # jsonschema + # pytest-subtests + # rasterio + # referencing audioread==3.0.1 # via librosa +av==16.1.0 + # via -r requirements/rocm-test.in +azure-core==1.39.0 + # via + # azure-identity + # azure-storage-blob +azure-identity==1.25.3 + # via runai-model-streamer-azure +azure-storage-blob==12.28.0 + # via runai-model-streamer-azure +backoff==2.2.1 + # via + # -r requirements/rocm-test.in + # schemathesis +bitsandbytes==0.49.2 + # via + # -r requirements/rocm-test.in + # lightning +black==26.3.1 + # via datamodel-code-generator +blobfile==3.0.0 + # via -r requirements/rocm-test.in +bm25s==0.2.13 + # via mteb +boto3==1.42.74 + # via + # runai-model-streamer-s3 + # tensorizer +botocore==1.42.74 + # via + # boto3 + # s3transfer +bounded-pool-executor==0.0.3 + # via pqdm +buildkite-test-collector==0.1.9 + # via -r requirements/rocm-test.in +certifi==2026.2.25 + # via + # fiona + # httpcore + # httpx + # lightly + # pyogrio + # pyproj + # rasterio + # requests + # sentry-sdk cffi==1.17.1 - # via soundfile + # via + # cryptography + # soundfile +chardet==5.2.0 + # via mbstrdecoder +charset-normalizer==3.4.6 + # via requests +choreographer==1.2.1 + # via kaleido +chz==0.4.0 + # via 
gpt-oss +click==8.3.1 + # via + # black + # click-plugins + # cligj + # fiona + # jiwer + # nltk + # rasterio + # ray + # schemathesis + # typer + # uvicorn + # wandb +click-plugins==1.1.1.2 + # via fiona +cligj==0.7.2 + # via + # fiona + # rasterio +colorama==0.4.6 + # via + # perceptron + # sacrebleu + # schemathesis +colorful==0.5.8 + # via ray +colorlog==6.10.1 + # via optuna +contourpy==1.3.3 + # via matplotlib +coverage==7.13.5 + # via pytest-cov +cramjam==2.11.0 + # via fastparquet +cryptography==46.0.0 + # via + # azure-identity + # azure-storage-blob + # google-auth + # msal + # pyjwt +cycler==0.12.1 + # via matplotlib +datamodel-code-generator==0.55.0 + # via -r requirements/rocm-test.in +dataproperty==1.1.0 + # via + # pytablewriter + # tabledata +datasets==3.6.0 + # via + # -r requirements/rocm-test.in + # evaluate + # lm-eval + # mteb decorator==5.2.1 # via librosa +decord==0.6.0 + # via -r requirements/rocm-test.in +diffusers==0.37.0 + # via terratorch +dill==0.3.8 + # via + # datasets + # evaluate + # lm-eval + # multiprocess +distlib==0.4.0 + # via virtualenv +docker==7.1.0 + # via gpt-oss +docopt==0.6.2 + # via num2words +docstring-parser==0.17.0 + # via jsonargparse +einops==0.8.2 + # via + # -r requirements/rocm-test.in + # encodec + # terratorch + # torchgeo + # vector-quantize-pytorch + # vocos +einx==0.4.2 + # via vector-quantize-pytorch +encodec==0.1.1 + # via vocos +et-xmlfile==2.0.0 + # via openpyxl +evaluate==0.4.6 + # via lm-eval +fastapi==0.135.2 + # via + # -c requirements/common.txt + # gpt-oss +fastparquet==2026.3.0 + # via genai-perf +fastsafetensors==0.2.2 + # via -r requirements/rocm-test.in +filelock==3.25.2 + # via + # -c requirements/common.txt + # blobfile + # datasets + # diffusers + # huggingface-hub + # python-discovery + # ray + # torch + # transformers + # virtualenv +fiona==1.10.1 + # via torchgeo +fonttools==4.62.1 + # via matplotlib +fqdn==1.5.1 + # via jsonschema +frozendict==2.4.7 + # via einx +frozenlist==1.8.0 + # 
via + # aiohttp + # aiosignal +fsspec==2025.3.0 + # via + # datasets + # evaluate + # fastparquet + # huggingface-hub + # lightning + # pytorch-lightning + # tacoreader + # torch +ftfy==6.3.1 + # via open-clip-torch +genai-perf==0.0.16 + # via -r requirements/rocm-test.in +genson==1.3.0 + # via datamodel-code-generator +geopandas==1.1.3 + # via terratorch +gitdb==4.0.12 + # via gitpython +gitpython==3.1.46 + # via wandb +google-api-core==2.30.0 + # via + # google-cloud-core + # google-cloud-storage + # opencensus +google-auth==2.49.1 + # via + # google-api-core + # google-cloud-core + # google-cloud-storage + # runai-model-streamer-gcs +google-cloud-core==2.5.0 + # via google-cloud-storage +google-cloud-storage==3.10.1 + # via runai-model-streamer-gcs +google-crc32c==1.8.0 + # via + # google-cloud-storage + # google-resumable-media +google-resumable-media==2.8.0 + # via google-cloud-storage +googleapis-common-protos==1.73.0 + # via google-api-core +gpt-oss==0.0.8 + # via -r requirements/rocm-test.in +graphql-core==3.2.8 + # via hypothesis-graphql +greenlet==3.3.2 + # via sqlalchemy +grpcio==1.78.0 + # via + # -c requirements/rocm.txt + # -r requirements/rocm-test.in + # grpcio-reflection + # ray + # tensorboard +grpcio-reflection==1.78.0 + # via + # -c requirements/rocm.txt + # -r requirements/rocm-test.in +h11==0.16.0 + # via + # httpcore + # uvicorn +h2==4.3.0 + # via httpx +h5py==3.16.0 + # via terratorch +harfile==0.4.0 + # via schemathesis +hf-xet==1.4.2 + # via huggingface-hub +hiredis==3.3.1 + # via tensorizer +hpack==4.1.0 + # via h2 +html2text==2025.4.15 + # via gpt-oss +httpcore==1.0.9 + # via httpx +httpx==0.27.2 + # via + # -r requirements/rocm-test.in + # diffusers + # perceptron + # schemathesis +huggingface-hub==0.36.2 + # via + # -r requirements/rocm-test.in + # accelerate + # datasets + # diffusers + # evaluate + # open-clip-torch + # peft + # segmentation-models-pytorch + # sentence-transformers + # terratorch + # timm + # tokenizers + # 
transformers + # vocos +humanize==4.15.0 + # via runai-model-streamer +hydra-core==1.3.2 + # via + # lightly + # lightning +hyperframe==6.1.0 + # via h2 +hypothesis==6.151.9 + # via + # hypothesis-graphql + # hypothesis-jsonschema + # schemathesis +hypothesis-graphql==0.12.0 + # via schemathesis +hypothesis-jsonschema==0.23.1 + # via schemathesis +idna==3.11 + # via + # anyio + # httpx + # jsonschema + # requests + # yarl +imagehash==4.3.2 + # via -r requirements/rocm-test.in +imageio==2.37.3 + # via scikit-image +importlib-metadata==8.7.1 + # via + # diffusers + # opentelemetry-api +importlib-resources==6.5.2 + # via typeshed-client +inflect==7.5.0 + # via datamodel-code-generator +iniconfig==2.3.0 + # via pytest +instanttensor==0.1.6 + # via -r requirements/rocm-test.in +isodate==0.7.2 + # via azure-storage-blob +isoduration==20.11.0 + # via jsonschema +isort==8.0.1 + # via datamodel-code-generator +jinja2==3.1.6 + # via + # datamodel-code-generator + # genai-perf + # lm-eval + # torch +jiwer==4.0.0 + # via -r requirements/rocm-test.in +jmespath==1.1.0 + # via + # boto3 + # botocore +joblib==1.5.3 + # via + # librosa + # nltk + # scikit-learn +jsonargparse==4.47.0 + # via + # lightning + # terratorch +jsonlines==4.0.0 + # via lm-eval +jsonnet==0.21.0 + # via jsonargparse +jsonpointer==3.1.0 + # via jsonschema +jsonschema==4.26.0 + # via + # hypothesis-jsonschema + # mistral-common + # ray + # schemathesis +jsonschema-specifications==2025.9.1 + # via jsonschema +junit-xml==1.9 + # via schemathesis +kaldi-native-fbank==1.22.3 + # via -r requirements/rocm-test.in +kaleido==1.0.0 + # via genai-perf +kiwisolver==1.5.0 + # via matplotlib +kornia==0.8.2 + # via torchgeo +kornia-rs==0.1.10 + # via kornia lazy-loader==0.4 - # via librosa + # via + # librosa + # scikit-image +libnacl==2.1.0 + # via tensorizer +librosa==0.10.2.post1 + # via -r requirements/rocm-test.in +lightly==1.5.22 + # via + # terratorch + # torchgeo +lightly-utils==0.0.2 + # via lightly 
+lightning==2.6.1 + # via + # terratorch + # torchgeo +lightning-utilities==0.15.3 + # via + # lightning + # pytorch-lightning + # torchmetrics +llvmlite==0.44.0 + # via numba +lm-eval==0.4.11 + # via -r requirements/rocm-test.in +logistro==2.0.1 + # via + # choreographer + # kaleido +lxml==6.0.2 + # via + # blobfile + # gpt-oss + # sacrebleu +mako==1.3.10 + # via alembic +markdown==3.10.2 + # via tensorboard +markdown-it-py==4.0.0 + # via rich +markupsafe==3.0.3 + # via + # jinja2 + # mako + # werkzeug +matplotlib==3.10.8 + # via + # -r requirements/rocm-test.in + # lightning + # torchgeo +mbstrdecoder==1.1.4 + # via + # dataproperty + # pytablewriter + # typepy +mdurl==0.1.2 + # via markdown-it-py +mistral-common==1.10.0 + # via + # -c requirements/common.txt + # -r requirements/rocm-test.in +more-itertools==10.8.0 + # via + # inflect + # lm-eval +mpmath==1.3.0 + # via sympy +msal==1.35.1 + # via + # azure-identity + # msal-extensions +msal-extensions==1.3.1 + # via azure-identity +msgpack==1.1.2 + # via + # librosa + # ray +mteb==2.11.5 + # via -r requirements/rocm-test.in +multidict==6.7.1 + # via + # aiohttp + # yarl +multiprocess==0.70.16 + # via + # -r requirements/rocm-test.in + # datasets + # evaluate +mypy-extensions==1.1.0 + # via black +narwhals==2.18.0 + # via plotly +networkx==3.6.1 + # via + # scikit-image + # torch +nltk==3.9.3 + # via rouge-score +num2words==0.5.14 + # via -r requirements/rocm-test.in +numba==0.61.2 + # via + # -c requirements/rocm.txt + # -r requirements/rocm-test.in + # librosa + # resampy +numkong==7.1.1 + # via albucore +numpy==2.2.6 + # via + # -r requirements/rocm-test.in + # accelerate + # albucore + # albumentations + # bitsandbytes + # bm25s + # contourpy + # cupy-cuda12x + # datasets + # decord + # diffusers + # einx + # encodec + # evaluate + # fastparquet + # genai-perf + # geopandas + # h5py + # imagehash + # imageio + # librosa + # lightly + # lightly-utils + # lm-eval + # matplotlib + # mistral-common + # mteb + # 
numba + # opencv-python-headless + # optuna + # pandas + # patsy + # peft + # perceptron + # pycocotools + # pyogrio + # pytrec-eval-terrier + # pywavelets + # rasterio + # resampy + # rioxarray + # rouge-score + # runai-model-streamer + # sacrebleu + # scikit-image + # scikit-learn + # scipy + # segmentation-models-pytorch + # sentence-transformers + # shapely + # soundfile + # soxr + # statsmodels + # tensorboard + # tensorboardx + # tensorizer + # terratorch + # tifffile + # torchgeo + # torchmetrics + # torchvision + # transformers + # tritonclient + # vocos + # xarray +omegaconf==2.3.0 + # via + # hydra-core + # lightning +open-clip-torch==2.32.0 + # via -r requirements/rocm-test.in +openai-harmony==0.0.8 + # via + # -c requirements/common.txt + # gpt-oss +opencensus==0.11.4 + # via ray +opencensus-context==0.1.3 + # via opencensus +opencv-python-headless==4.13.0.92 + # via + # -c requirements/common.txt + # -r requirements/rocm-test.in + # albumentations + # mistral-common +openpyxl==3.1.5 + # via -r requirements/rocm-test.in +opentelemetry-api==1.40.0 + # via + # -c requirements/common.txt + # opentelemetry-exporter-prometheus + # opentelemetry-sdk + # opentelemetry-semantic-conventions +opentelemetry-exporter-prometheus==0.61b0 + # via ray +opentelemetry-proto==1.40.0 + # via ray +opentelemetry-sdk==1.40.0 + # via + # -c requirements/common.txt + # opentelemetry-exporter-prometheus + # ray +opentelemetry-semantic-conventions==0.61b0 + # via opentelemetry-sdk +optuna==3.6.1 + # via genai-perf +orjson==3.11.7 + # via + # genai-perf + # kaleido +packaging==26.0 + # via + # -c requirements/rocm.txt + # accelerate + # bitsandbytes + # black + # datasets + # evaluate + # fastparquet + # geopandas + # huggingface-hub + # hydra-core + # kaleido + # kornia + # lazy-loader + # lightning + # lightning-utilities + # matplotlib + # optuna + # peft + # plotly + # pooch + # pyogrio + # pytest + # pytest-rerunfailures + # pytorch-lightning + # ray + # rioxarray + # 
scikit-image + # statsmodels + # tensorboard + # tensorboardx + # torchmetrics + # transformers + # typepy + # wandb + # xarray +pandas==3.0.1 + # via + # datasets + # evaluate + # fastparquet + # genai-perf + # geopandas + # statsmodels + # tacoreader + # torchgeo + # xarray +pathspec==1.0.4 + # via black +pathvalidate==3.3.1 + # via pytablewriter +patsy==1.0.2 + # via statsmodels +peft==0.18.1 + # via -r requirements/rocm-test.in +perceptron==0.1.4 + # via -r requirements/rocm-test.in +perf-analyzer==0.1.0 + # via genai-perf +pillow==12.1.1 + # via + # diffusers + # genai-perf + # imagehash + # imageio + # lightly-utils + # matplotlib + # mistral-common + # perceptron + # scikit-image + # segmentation-models-pytorch + # tensorboard + # torchgeo + # torchvision platformdirs==4.3.6 - # via pooch + # via + # black + # pooch + # python-discovery + # virtualenv + # wandb +plotly==6.6.0 + # via + # -r requirements/rocm-test.in + # genai-perf +pluggy==1.6.0 + # via + # pytest + # pytest-cov +polars==1.39.3 + # via mteb +polars-runtime-32==1.39.3 + # via polars pooch==1.8.2 # via librosa -soundfile==0.13.1 - # via librosa -soxr==0.5.0.post1 - # via librosa -librosa==0.10.2.post1 - -# Retrieval and search -bm25s==0.2.13 - # via mteb +portalocker==3.2.0 + # via sacrebleu +pqdm==0.2.0 + # via -r requirements/rocm-test.in +prometheus-client==0.24.1 + # via + # -c requirements/common.txt + # opentelemetry-exporter-prometheus + # ray +propcache==0.4.1 + # via + # aiohttp + # yarl +proto-plus==1.27.1 + # via google-api-core +protobuf==6.33.6 + # via + # -c requirements/common.txt + # google-api-core + # googleapis-common-protos + # grpcio-reflection + # opentelemetry-proto + # proto-plus + # ray + # tensorboard + # tensorboardx + # tensorizer + # wandb +psutil==7.2.2 + # via + # accelerate + # peft + # tensorizer +py==1.11.0 + # via pytest-forked +py-spy==0.4.1 + # via ray +pyarrow==23.0.1 + # via + # datasets + # genai-perf + # tacoreader + # terratorch +pyasn1==0.6.3 + # via 
pyasn1-modules +pyasn1-modules==0.4.2 + # via google-auth +pycocotools==2.0.11 + # via terratorch +pycountry==26.2.16 + # via pydantic-extra-types +pycparser==3.0 + # via cffi +pycryptodomex==3.23.0 + # via blobfile +pydantic==2.12.5 + # via + # -c requirements/common.txt + # -r requirements/rocm-test.in + # albumentations + # datamodel-code-generator + # fastapi + # gpt-oss + # lightly + # mistral-common + # mteb + # openai-harmony + # pydantic-extra-types + # ray + # wandb +pydantic-core==2.41.5 + # via pydantic +pydantic-extra-types==2.11.1 + # via mistral-common +pygments==2.19.2 + # via rich +pyjwt==2.12.1 + # via msal +pyogrio==0.12.1 + # via geopandas +pyparsing==3.3.2 + # via + # matplotlib + # rasterio +pyproj==3.7.2 + # via + # geopandas + # rioxarray + # torchgeo +pyrate-limiter==3.9.0 + # via schemathesis pystemmer==3.0.0 # via mteb - -# Multi-modal processing -av==16.1.0 - # required for audio_in_video tests -resampy==0.4.3 - # audio processing, required for audio_in_video tests -blobfile==3.0.0 - # Multi-Modal Models Test -decord==0.6.0 - # video processing, required by entrypoints/openai/chat_completion/test_video.py +pytablewriter==1.2.1 + # via lm-eval +pytest==8.3.5 + # via + # -r requirements/rocm-test.in + # buildkite-test-collector + # genai-perf + # pytest-asyncio + # pytest-cov + # pytest-forked + # pytest-mock + # pytest-rerunfailures + # pytest-shard + # pytest-subtests + # pytest-timeout + # schemathesis +pytest-asyncio==0.24.0 + # via -r requirements/rocm-test.in +pytest-cov==6.3.0 + # via -r requirements/rocm-test.in +pytest-forked==1.6.0 + # via -r requirements/rocm-test.in +pytest-mock==3.15.1 + # via genai-perf +pytest-rerunfailures==14.0 + # via -r requirements/rocm-test.in +pytest-shard==0.1.2 + # via -r requirements/rocm-test.in +pytest-subtests==0.14.2 + # via schemathesis +pytest-timeout==2.3.1 + # via -r requirements/rocm-test.in +python-box==7.4.1 + # via terratorch +python-dateutil==2.9.0.post0 + # via + # arrow + # botocore + 
# lightly + # matplotlib + # pandas + # typepy +python-discovery==1.2.0 + # via virtualenv +python-rapidjson==1.23 + # via tritonclient +pytokens==0.4.1 + # via black +pytorch-lightning==2.6.1 + # via + # lightly + # lightning +pytrec-eval-terrier==0.5.10 + # via mteb +pytz==2026.1.post1 + # via typepy +pywavelets==1.9.0 + # via imagehash +pyyaml==6.0.3 + # via + # accelerate + # albumentations + # datamodel-code-generator + # datasets + # genai-perf + # huggingface-hub + # jsonargparse + # lightning + # omegaconf + # optuna + # peft + # pytorch-lightning + # ray + # responses + # schemathesis + # timm + # transformers + # vocos + # wandb rapidfuzz==3.12.1 - -# OpenAI compatibility and testing -gpt-oss==0.0.8 - # OpenAI compatibility tests + # via + # -r requirements/rocm-test.in + # jiwer +rasterio==1.5.0 + # via + # rioxarray + # terratorch + # torchgeo +ray==2.54.0 + # via -r requirements/rocm-test.in +redis==7.3.0 + # via tensorizer +referencing==0.37.0 + # via + # jsonschema + # jsonschema-specifications +regex==2026.2.28 + # via + # diffusers + # nltk + # open-clip-torch + # sacrebleu + # tiktoken + # transformers +requests==2.32.5 + # via + # -c requirements/common.txt + # azure-core + # buildkite-test-collector + # datasets + # diffusers + # docker + # evaluate + # google-api-core + # google-cloud-storage + # gpt-oss + # huggingface-hub + # lightly + # lm-eval + # mistral-common + # msal + # mteb + # pooch + # ray + # responses + # schemathesis + # starlette-testclient + # tacoreader + # tiktoken + # transformers + # wandb +resampy==0.4.3 + # via -r requirements/rocm-test.in +responses==0.26.0 + # via genai-perf +rfc3339-validator==0.1.4 + # via jsonschema +rfc3987==1.3.8 + # via jsonschema +rich==14.3.3 + # via + # genai-perf + # lightning + # mteb + # perceptron + # terratorch + # typer +rioxarray==0.22.0 + # via terratorch +rouge-score==0.1.2 + # via lm-eval +rpds-py==0.30.0 + # via + # jsonschema + # referencing +rtree==1.4.1 + # via torchgeo 
+runai-model-streamer==0.15.7 + # via + # -c requirements/rocm.txt + # -r requirements/rocm-test.in +runai-model-streamer-azure==0.15.7 + # via runai-model-streamer +runai-model-streamer-gcs==0.15.7 + # via runai-model-streamer +runai-model-streamer-s3==0.15.7 + # via runai-model-streamer +s3transfer==0.16.0 + # via boto3 +sacrebleu==2.6.0 + # via lm-eval +safetensors==0.7.0 + # via + # accelerate + # diffusers + # open-clip-torch + # peft + # segmentation-models-pytorch + # timm + # transformers schemathesis==3.39.15 - # OpenAI schema test - -# Evaluation and benchmarking -lm-eval[api]==0.4.11 -jiwer==4.0.0 - -# Required for multiprocessed tests that use spawn method, Datasets and Evaluate Test -multiprocess==0.70.16 - -# Required for v1/metrics/test_engine_logger_apis.py -ray[cgraph,default]>=2.48.0 - + # via -r requirements/rocm-test.in +scikit-image==0.26.0 + # via + # albumentations + # terratorch +scikit-learn==1.8.0 + # via + # albumentations + # librosa + # lm-eval + # mteb + # sentence-transformers + # terratorch +scipy==1.17.1 + # via + # albumentations + # bm25s + # imagehash + # librosa + # mteb + # pytrec-eval-terrier + # scikit-image + # scikit-learn + # sentence-transformers + # statsmodels + # vocos +segmentation-models-pytorch==0.5.0 + # via + # -r requirements/rocm-test.in + # terratorch + # torchgeo +sentence-transformers==5.3.0 + # via + # -r requirements/rocm-test.in + # mteb +sentry-sdk==2.55.0 + # via wandb +setuptools==79.0.1 + # via + # -c requirements/common.txt + # -c requirements/rocm.txt + # pytablewriter + # tensorboard + # torch +shapely==2.1.2 + # via + # geopandas + # torchgeo +shellingham==1.5.4 + # via + # perceptron + # typer +simplejson==3.20.2 + # via choreographer +six==1.17.0 + # via + # -c requirements/common.txt + # junit-xml + # lightly + # opencensus + # python-dateutil + # rfc3339-validator + # rouge-score +smart-open==7.5.1 + # via ray +smmap==5.0.3 + # via gitdb +sniffio==1.3.1 + # via + # anyio + # httpx 
+sortedcontainers==2.4.0 + # via hypothesis +soundfile==0.13.1 + # via + # -r requirements/rocm-test.in + # genai-perf + # librosa + # mistral-common +soxr==0.5.0.post1 + # via + # librosa + # mistral-common +sqlalchemy==2.0.48 + # via + # alembic + # optuna +sqlitedict==2.1.0 + # via lm-eval +starlette==0.52.1 + # via + # fastapi + # schemathesis + # starlette-testclient +starlette-testclient==0.4.1 + # via schemathesis +statsmodels==0.14.6 + # via genai-perf +stringzilla==4.6.0 + # via albucore +structlog==25.5.0 + # via gpt-oss +sympy==1.14.0 + # via + # einx + # torch +tabledata==1.3.4 + # via pytablewriter +tabulate==0.10.0 + # via sacrebleu +tacoreader==0.5.6 + # via terratorch +tblib==3.1.0 + # via -r requirements/rocm-test.in +tcolorpy==0.1.7 + # via pytablewriter +tenacity==9.1.4 + # via + # gpt-oss + # lm-eval +tensorboard==2.20.0 + # via terratorch +tensorboard-data-server==0.7.2 + # via tensorboard +tensorboardx==2.6.4 + # via lightning +tensorizer==2.10.1 + # via + # -c requirements/rocm.txt + # -r requirements/rocm-test.in +termcolor==3.3.0 + # via + # gpt-oss + # terratorch +terratorch==1.2.2 + # via -r requirements/rocm-test.in +threadpoolctl==3.6.0 + # via scikit-learn +tifffile==2026.3.3 + # via + # scikit-image + # terratorch +tiktoken==0.12.0 + # via + # -c requirements/common.txt + # gpt-oss + # lm-eval + # mistral-common +timm==1.0.17 + # via + # -c requirements/rocm.txt + # -r requirements/rocm-test.in + # open-clip-torch + # segmentation-models-pytorch + # terratorch + # torchgeo +tokenizers==0.22.0 + # via + # -c requirements/common.txt + # -r requirements/rocm-test.in + # transformers +tomli==2.4.0 + # via schemathesis +tomli-w==1.2.0 + # via schemathesis torchgeo==0.7.0 + # via + # -r requirements/rocm-test.in + # terratorch +torchmetrics==1.9.0 + # via + # lightning + # pytorch-lightning + # terratorch + # torchgeo +tqdm==4.67.3 + # via + # datasets + # evaluate + # huggingface-hub + # lightly + # lightning + # lm-eval + # mteb + # nltk 
+ # open-clip-torch + # optuna + # peft + # pqdm + # pytorch-lightning + # segmentation-models-pytorch + # sentence-transformers + # tacoreader + # terratorch + # transformers +transformers==4.57.5 + # via + # -c requirements/common.txt + # -r requirements/rocm-test.in + # genai-perf + # peft + # sentence-transformers + # transformers-stream-generator +transformers-stream-generator==0.0.5 + # via -r requirements/rocm-test.in +tritonclient==2.66.0 + # via -r requirements/rocm-test.in +typeguard==4.5.1 + # via inflect +typepy==1.3.4 + # via + # dataproperty + # pytablewriter + # tabledata +typer==0.24.1 + # via + # fastsafetensors + # perceptron +typeshed-client==2.9.0 + # via jsonargparse +typing-extensions==4.15.0 + # via + # -c requirements/common.txt + # aiosignal + # albumentations + # alembic + # azure-core + # azure-identity + # azure-storage-blob + # chz + # fastapi + # grpcio + # huggingface-hub + # librosa + # lightning + # lightning-utilities + # lm-eval + # mistral-common + # mteb + # opentelemetry-api + # opentelemetry-sdk + # opentelemetry-semantic-conventions + # pqdm + # pydantic + # pydantic-core + # pydantic-extra-types + # pytorch-lightning + # referencing + # sentence-transformers + # sqlalchemy + # starlette + # torch + # torchgeo + # typeguard + # typeshed-client + # typing-inspection + # wandb +typing-inspection==0.4.2 + # via + # fastapi + # pydantic +tzdata==2025.3 + # via arrow +uri-template==1.3.0 + # via jsonschema +urllib3==2.6.3 + # via + # blobfile + # botocore + # docker + # lightly + # requests + # responses + # sentry-sdk + # tritonclient +uvicorn==0.42.0 + # via gpt-oss +vector-quantize-pytorch==1.28.0 + # via -r requirements/rocm-test.in +virtualenv==21.2.0 + # via ray +vocos==0.1.0 + # via -r requirements/rocm-test.in +wandb==0.25.1 # via terratorch -# MTEB Benchmark Test -mteb[bm25s]>=2, <3 - -# Utilities -num2words==0.5.14 +wcwidth==0.6.0 + # via ftfy +webcolors==25.10.0 + # via jsonschema +werkzeug==3.1.6 + # via + # 
schemathesis + # tensorboard +word2number==1.1 # via lm-eval -pqdm==0.2.0 +wrapt==2.1.2 + # via smart-open +xarray==2026.2.0 + # via rioxarray +xxhash==3.6.0 + # via + # datasets + # evaluate +yarl==1.23.0 + # via + # aiohttp + # schemathesis +zipp==3.23.0 + # via importlib-metadata +zstandard==0.25.0 # via lm-eval -# Required for fastsafetensors test -fastsafetensors==0.2.2 -# Required for suffix decoding test -arctic-inference == 0.1.1 -# Required for Nemotron test -open-clip-torch==2.32.0 -# Required for isaac Multi-Modal generation test -perceptron==0.1.4 -# Required for the multi-modal models test -timm==1.0.17 -# Required for plugins test -albumentations==1.4.6 -# Pin transformers version -transformers==4.57.5 -# Pin HF Hub version -huggingface-hub==0.36.2 -# Pin Mistral Common -mistral-common[image,audio]==1.10.0 -# Required for Prithvi tests -terratorch==1.2.2 -# Required for Prithvi tests -segmentation-models-pytorch==0.5.0 -# Required for Prithvi tests -imagehash==4.3.2 -# Required for bitsandbytes quantization test -bitsandbytes==0.49.2 -# Examples (tensorizer) tests -tensorizer==2.10.1 -# Multi-modal models test (`allendou/FireRedASR2-LLM-vllm`) -kaldi-native-fbank==1.22.3 -# Pinning numpy version -numpy==2.2.6 +# The following packages were excluded from the output: +# torch +# torchvision +# torchaudio +# triton +# cuda-bindings +# cuda-pathfinder +# cuda-toolkit +# cupy-cuda12x +# nvidia-cublas +# nvidia-cuda-cupti +# nvidia-cuda-nvrtc +# nvidia-cuda-runtime +# nvidia-cudnn-cu13 +# nvidia-cufft +# nvidia-cufile +# nvidia-curand +# nvidia-cusolver +# nvidia-cusparse +# nvidia-cusparselt-cu13 +# nvidia-nccl-cu13 +# nvidia-nvjitlink +# nvidia-nvshmem-cu13 +# nvidia-nvtx From d2dd9b03aad43caf67552a4383842e793f50198e Mon Sep 17 00:00:00 2001 From: Andreas Karatzas Date: Thu, 26 Mar 2026 13:08:31 -0500 Subject: [PATCH 23/23] [ROCm][CI] Wire CI_BASE_IMAGE into bake targets and Jinja build steps Signed-off-by: Andreas Karatzas --- 
.buildkite/scripts/hardware_ci/run-amd-test.sh | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/.buildkite/scripts/hardware_ci/run-amd-test.sh b/.buildkite/scripts/hardware_ci/run-amd-test.sh index c1c6549daf03..ca7ec6c5dd7c 100755 --- a/.buildkite/scripts/hardware_ci/run-amd-test.sh +++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh @@ -380,7 +380,13 @@ wait_for_clean_gpus # --- Pull test image --- echo "--- Pulling container" -image_name="${DOCKER_IMAGE_NAME:-rocm/vllm-ci:${BUILDKITE_COMMIT}}" +if [[ -z "${DOCKER_IMAGE_NAME:-}" ]]; then + echo "Error: DOCKER_IMAGE_NAME is not set. The pipeline must pass the per-arch" \ + "image tag (e.g. rocm/vllm-ci:\$COMMIT-gfx90a). Check pool_to_arch mapping" \ + "in test-template-amd.j2." >&2 + exit 1 +fi +image_name="${DOCKER_IMAGE_NAME}" container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)" docker pull "${image_name}"