From 6d138a4ea2e4ee463f387c0ae3dc4cffea6e82f7 Mon Sep 17 00:00:00 2001
From: Andreas Karatzas
Date: Thu, 12 Mar 2026 22:17:06 -0500
Subject: [PATCH 1/8] [ROCm][CI] Add ROCm Docker Hub registry cache and weekly cleanup pipeline

Signed-off-by: Andreas Karatzas
---
 .../cleanup-dockerhub-rocm-cache.yaml         |  35 ++++
 .../scripts/cleanup-dockerhub-rocm-cache.sh   | 161 ++++++++++++++++++
 docker/ci-rocm.hcl                            | 127 ++++++++++++++
 3 files changed, 323 insertions(+)
 create mode 100644 buildkite/pipelines/cleanup-dockerhub-rocm-cache.yaml
 create mode 100644 buildkite/scripts/cleanup-dockerhub-rocm-cache.sh
 create mode 100644 docker/ci-rocm.hcl

diff --git a/buildkite/pipelines/cleanup-dockerhub-rocm-cache.yaml b/buildkite/pipelines/cleanup-dockerhub-rocm-cache.yaml
new file mode 100644
index 00000000..f033dc2d
--- /dev/null
+++ b/buildkite/pipelines/cleanup-dockerhub-rocm-cache.yaml
@@ -0,0 +1,35 @@
+# cleanup-dockerhub-rocm-cache.yaml
+#
+# Buildkite scheduled pipeline — weekly cleanup of stale BuildKit registry-cache
+# tags from Docker Hub (rocm/vllm-ci-cache).
+#
+# Setup in Buildkite UI:
+#   1. Create a new pipeline pointing at this file.
+#   2. Under "Schedules", add:
+#        Cron: 0 3 * * 0 (Sundays at 03:00 UTC)
+#        Branch: main
+#   3. Under "Environment Variables" (or via Buildkite secrets), set:
+#        DOCKERHUB_USERNAME - Docker Hub account with delete access on rocm/vllm-ci-cache
+#        DOCKERHUB_TOKEN - Docker Hub personal access token (read/write/delete scope)
+#
+# Optional overrides (set as env vars on the schedule or the pipeline):
+#   KEEP_DAYS - Days of tags to retain (default: 30)
+#   DRY_RUN - Set to "1" for a safe preview run that deletes nothing
+
+steps:
+  - label: ":wastebasket: Clean stale ROCm cache tags from Docker Hub"
+    key: cleanup-rocm-cache
+    agents:
+      # NOTE(review): the image-build steps in test-template-amd.j2 target the
+      # "amd-cpu" queue (hyphen); confirm an "amd_cpu" queue actually exists.
+      queue: amd_cpu
+    commands:
+      - bash buildkite/scripts/cleanup-dockerhub-rocm-cache.sh
+    env:
+      # Interpolated at pipeline-upload time so that a value set on the
+      # schedule or pipeline wins; a literal here would silently override it.
+      KEEP_DAYS: "${KEEP_DAYS:-30}"
+      DRY_RUN: "${DRY_RUN:-0}"
+    timeout_in_minutes: 15
+    retry:
+      automatic:
+        - exit_status: -1  # Agent lost
+          limit: 2
+        - exit_status: 1  # Transient API error
+          limit: 2
diff --git a/buildkite/scripts/cleanup-dockerhub-rocm-cache.sh b/buildkite/scripts/cleanup-dockerhub-rocm-cache.sh
new file mode 100644
index 00000000..40ef65b8
--- /dev/null
+++ b/buildkite/scripts/cleanup-dockerhub-rocm-cache.sh
@@ -0,0 +1,161 @@
+#!/bin/bash
+# cleanup-dockerhub-rocm-cache.sh
+#
+# Removes stale commit-tagged BuildKit cache entries from Docker Hub.
+#
+# Background: every CI build (PR and main) writes a commit-specific tag
+# (rocm-<sha>) to rocm/vllm-ci-cache via ci-rocm.hcl's get_cache_to_rocm().
+# Without periodic cleanup these accumulate indefinitely at several GB each.
+#
+# This script should be run on a schedule (e.g., weekly via a Buildkite
+# scheduled build or cron).
+# It keeps:
+#   - rocm-latest (the warm main-branch baseline, never deleted)
+#   - any tag pushed within the last KEEP_DAYS days
+#   - any tag whose push timestamp is missing or cannot be parsed
+#
+# Required environment variables:
+#   DOCKERHUB_USERNAME - Docker Hub account with delete access on rocm/vllm-ci-cache
+#   DOCKERHUB_TOKEN - Docker Hub personal access token (read/write/delete scope)
+#
+# Optional:
+#   KEEP_DAYS - Age threshold in days (default: 30)
+#   CACHE_REPO - Docker Hub repo to clean (default: rocm/vllm-ci-cache)
+#   DRY_RUN - Set to "1" to list tags that would be deleted without
+#             actually deleting them (default: 0)
+#
+# Exit status: 0 on success (or nothing to delete), 1 if any deletion failed.
+
+set -euo pipefail
+
+KEEP_DAYS="${KEEP_DAYS:-30}"
+CACHE_REPO="${CACHE_REPO:-rocm/vllm-ci-cache}"
+DRY_RUN="${DRY_RUN:-0}"
+
+DOCKERHUB_USERNAME="${DOCKERHUB_USERNAME:?DOCKERHUB_USERNAME must be set}"
+DOCKERHUB_TOKEN="${DOCKERHUB_TOKEN:?DOCKERHUB_TOKEN must be set}"
+
+NAMESPACE="${CACHE_REPO%%/*}"
+REPONAME="${CACHE_REPO##*/}"
+
+echo "=== Docker Hub ROCm cache cleanup ==="
+echo "Repo: ${CACHE_REPO}"
+echo "Keep days: ${KEEP_DAYS}"
+echo "Dry run: ${DRY_RUN}"
+echo ""
+
+# ── Auth ──────────────────────────────────────────────────────────────────────
+
+echo "--- :key: Authenticating with Docker Hub"
+# The login payload is serialized by json.dumps (so quotes/backslashes in the
+# credentials cannot corrupt the JSON) and fed to curl on stdin (so the token
+# never appears in the process argument list).
+JWT=$(python3 -c \
+    'import json, os; print(json.dumps({"username": os.environ["DOCKERHUB_USERNAME"], "password": os.environ["DOCKERHUB_TOKEN"]}))' \
+  | curl -sSf "https://hub.docker.com/v2/users/login" \
+      -H "Content-Type: application/json" \
+      -d @- \
+  | python3 -c "import sys,json; print(json.load(sys.stdin)['token'])")
+echo "Authenticated as ${DOCKERHUB_USERNAME}"
+
+# ── List all tags with pagination ─────────────────────────────────────────────
+
+echo "--- :mag: Listing tags in ${CACHE_REPO}"
+
+declare -a STALE_TAGS=()
+# GNU date first; the python3 fallback covers platforms without `date -d`.
+CUTOFF_EPOCH=$(date -d "-${KEEP_DAYS} days" +%s 2>/dev/null \
+  || python3 -c "import time; print(int(time.time()) - ${KEEP_DAYS}*86400)")
+
+PAGE_URL="https://hub.docker.com/v2/repositories/${NAMESPACE}/${REPONAME}/tags/?page_size=100"
+TOTAL_CHECKED=0
+TOTAL_KEPT=0
+
+while [[ -n "${PAGE_URL}" && "${PAGE_URL}" != "null" ]]; do
+  RESPONSE=$(curl -sSf "${PAGE_URL}" \
+    -H "Authorization: Bearer ${JWT}")
+
+  # Extract next page URL (empty string ends the loop)
+  PAGE_URL=$(echo "${RESPONSE}" | python3 -c \
+    "import sys,json; d=json.load(sys.stdin); print(d.get('next') or '')")
+
+  # Process each tag on this page (one JSON object per line)
+  while IFS= read -r TAG_JSON; do
+    NAME=$(echo "${TAG_JSON}" | python3 -c \
+      "import sys,json; d=json.load(sys.stdin); print(d['name'])")
+    LAST_UPDATED=$(echo "${TAG_JSON}" | python3 -c \
+      "import sys,json; d=json.load(sys.stdin); print(d.get('tag_last_pushed') or d.get('last_updated') or '')")
+
+    TOTAL_CHECKED=$((TOTAL_CHECKED + 1))
+
+    # Always keep rocm-latest and any non-commit tags
+    if [[ "${NAME}" == "rocm-latest" ]]; then
+      echo " KEEP ${NAME} (protected baseline tag)"
+      TOTAL_KEPT=$((TOTAL_KEPT + 1))
+      continue
+    fi
+
+    # Only touch commit-specific tags (rocm-<40-hex-char sha>)
+    if ! [[ "${NAME}" =~ ^rocm-[0-9a-f]{40}$ ]]; then
+      echo " KEEP ${NAME} (not a commit tag)"
+      TOTAL_KEPT=$((TOTAL_KEPT + 1))
+      continue
+    fi
+
+    # Parse the push timestamp and compare against cutoff
+    if [[ -z "${LAST_UPDATED}" ]]; then
+      echo " KEEP ${NAME} (no timestamp — skipping to be safe)"
+      TOTAL_KEPT=$((TOTAL_KEPT + 1))
+      continue
+    fi
+
+    # The timestamp is passed as argv, not spliced into the Python source, so
+    # unexpected characters in the API response cannot alter the program.
+    # On any parse error nothing is printed and TAG_EPOCH is left empty.
+    TAG_EPOCH=$(python3 -c \
+      "import sys, datetime; s = sys.argv[1].rstrip('Z').split('.')[0]; \
+      print(int(datetime.datetime.fromisoformat(s).replace(tzinfo=datetime.timezone.utc).timestamp()))" \
+      "${LAST_UPDATED}" 2>/dev/null || echo "")
+
+    # An unparseable timestamp must never be read as "infinitely old":
+    # keep the tag rather than risk deleting a fresh cache entry.
+    if [[ -z "${TAG_EPOCH}" ]]; then
+      echo " KEEP ${NAME} (unparseable timestamp '${LAST_UPDATED}' — skipping to be safe)"
+      TOTAL_KEPT=$((TOTAL_KEPT + 1))
+      continue
+    fi
+
+    if [[ "${TAG_EPOCH}" -lt "${CUTOFF_EPOCH}" ]]; then
+      AGE_DAYS=$(( ($(date +%s) - TAG_EPOCH) / 86400 ))
+      echo " STALE ${NAME} (${AGE_DAYS}d old, last pushed ${LAST_UPDATED})"
+      STALE_TAGS+=("${NAME}")
+    else
+      echo " KEEP ${NAME} (recent, last pushed ${LAST_UPDATED})"
+      TOTAL_KEPT=$((TOTAL_KEPT + 1))
+    fi
+  done < <(echo "${RESPONSE}" | python3 -c \
+    "import sys,json; [print(json.dumps(t)) for t in json.load(sys.stdin).get('results',[])]")
+done
+
+echo ""
+echo "Checked: ${TOTAL_CHECKED} tags — keeping ${TOTAL_KEPT}, deleting ${#STALE_TAGS[@]}"
+
+if [[ "${#STALE_TAGS[@]}" -eq 0 ]]; then
+  echo "Nothing to delete."
+  exit 0
+fi
+
+# ── Delete stale tags ─────────────────────────────────────────────────────────
+
+echo ""
+if [[ "${DRY_RUN}" == "1" ]]; then
+  echo "--- :no_entry: DRY RUN — the following tags would be deleted:"
+  printf ' %s\n' "${STALE_TAGS[@]}"
+  echo "Set DRY_RUN=0 to actually delete them."
+  exit 0
+fi
+
+echo "--- :wastebasket: Deleting ${#STALE_TAGS[@]} stale tags"
+DELETED=0
+FAILED=0
+
+for TAG in "${STALE_TAGS[@]}"; do
+  # `|| true`: a transport-level curl failure (reported as HTTP 000) is counted
+  # as one failed tag instead of aborting the remaining deletions via `set -e`.
+  HTTP_STATUS=$(curl -sSo /dev/null -w "%{http_code}" \
+    -X DELETE \
+    "https://hub.docker.com/v2/repositories/${NAMESPACE}/${REPONAME}/tags/${TAG}/" \
+    -H "Authorization: Bearer ${JWT}") || true
+
+  if [[ "${HTTP_STATUS}" == "204" ]]; then
+    echo " Deleted: ${TAG}"
+    DELETED=$((DELETED + 1))
+  else
+    echo " FAILED (HTTP ${HTTP_STATUS}): ${TAG}"
+    FAILED=$((FAILED + 1))
+  fi
+done
+
+echo ""
+echo "=== Cleanup complete: ${DELETED} deleted, ${FAILED} failed ==="
+if [[ "${FAILED}" -gt 0 ]]; then
+  exit 1
+fi
diff --git a/docker/ci-rocm.hcl b/docker/ci-rocm.hcl
new file mode 100644
index 00000000..466df841
--- /dev/null
+++ b/docker/ci-rocm.hcl
@@ -0,0 +1,127 @@
+# ci-rocm.hcl - CI-specific configuration for vLLM ROCm Docker builds
+#
+# This file lives in ci-infra repo at docker/ci-rocm.hcl
+# Used with: docker buildx bake -f docker/docker-bake-rocm.hcl -f ci-rocm.hcl test-rocm-ci
+#
+# Registry cache: Docker Hub (rocm/vllm-ci-cache) is used exclusively.
+# AMD build agents already have Docker Hub credentials (they push the test
+# image to rocm/vllm-ci), so no additional credential setup is required.
+#
+# sccache is disabled (USE_SCCACHE=0): AMD build agents have no AWS S3
+# credentials; enabling sccache causes every HIP compilation to stall on
+# S3 auth timeouts. BuildKit's own layer cache handles stage-level caching.
+
+# CI metadata
+
+variable "BUILDKITE_COMMIT" {
+  default = ""
+}
+
+variable "BUILDKITE_BUILD_NUMBER" {
+  default = ""
+}
+
+variable "BUILDKITE_BUILD_ID" {
+  default = ""
+}
+
+variable "PARENT_COMMIT" {
+  default = ""
+}
+
+# Merge-base of HEAD with main — provides a more stable cache fallback than
+# parent commit for long-lived PRs. Mirrors the VLLM_MERGE_BASE_COMMIT
+# pattern used in ci.hcl (CUDA). Auto-computed by ci-bake.sh when unset.
+variable "VLLM_MERGE_BASE_COMMIT" {
+  default = ""
+}
+
+# Bridge to vLLM's COMMIT variable for OCI labels
+variable "COMMIT" {
+  default = BUILDKITE_COMMIT
+}
+
+# Image tags (set by CI)
+
+variable "IMAGE_TAG" {
+  default = ""
+}
+
+variable "IMAGE_TAG_LATEST" {
+  default = ""
+}
+
+# ROCm-specific GPU architecture targets
+
+variable "PYTORCH_ROCM_ARCH" {
+  default = "gfx90a;gfx942;gfx950"
+}
+
+# Docker Hub registry cache for AMD builds.
+#
+# A separate repo (rocm/vllm-ci-cache) is used for BuildKit layer cache so
+# that mode=max intermediate-stage blobs don't pollute the image repo.
+# Docker Hub auto-creates the repo on first push.
+#
+# DOCKERHUB_CACHE_TO is set by the pipeline only on main-branch builds to
+# keep the :rocm-latest tag warm for PR builds to pull from.
+
+variable "DOCKERHUB_CACHE_REPO" {
+  default = "rocm/vllm-ci-cache"
+}
+
+variable "DOCKERHUB_CACHE_TO" {
+  default = ""
+}
+
+# Functions
+
+# Cache import sources, most specific first. Entries whose commit variable is
+# empty collapse to "" and are dropped by compact().
+# Note: `mode` is a cache *export* option, so it is set only in
+# get_cache_to_rocm(); import refs carry just type and ref.
+function "get_cache_from_rocm" {
+  params = []
+  result = compact([
+    # Exact commit hit — fastest cache on re-runs of the same commit
+    BUILDKITE_COMMIT != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:rocm-${BUILDKITE_COMMIT}" : "",
+    # Parent commit — useful cache for incremental changes
+    PARENT_COMMIT != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:rocm-${PARENT_COMMIT}" : "",
+    # Merge-base with main — stable fallback for long-lived or rebased PRs;
+    # maps to a real main-branch commit whose cache layers are likely warm
+    VLLM_MERGE_BASE_COMMIT != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:rocm-${VLLM_MERGE_BASE_COMMIT}" : "",
+    # Warm baseline — kept current by main-branch builds
+    "type=registry,ref=${DOCKERHUB_CACHE_REPO}:rocm-latest",
+  ])
+}
+
+# Cache export destinations. mode=max exports layers of every build stage,
+# not only those of the final image.
+function "get_cache_to_rocm" {
+  params = []
+  result = compact([
+    # Commit-specific tag for traceability and re-run cache hits
+    BUILDKITE_COMMIT != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:rocm-${BUILDKITE_COMMIT},mode=max,compression=zstd" : "",
+    # rocm-latest — only set on main-branch builds (controlled by pipeline via DOCKERHUB_CACHE_TO)
+    DOCKERHUB_CACHE_TO != "" ? "type=registry,ref=${DOCKERHUB_CACHE_TO},mode=max,compression=zstd" : "",
+  ])
+}
+
+# CI targets
+
+target "_ci-rocm" {
+  annotations = [
+    "index,manifest:vllm.buildkite.build_number=${BUILDKITE_BUILD_NUMBER}",
+    "index,manifest:vllm.buildkite.build_id=${BUILDKITE_BUILD_ID}",
+  ]
+  args = {
+    ARG_PYTORCH_ROCM_ARCH = PYTORCH_ROCM_ARCH
+    USE_SCCACHE = 0
+  }
+}
+
+target "test-rocm-ci" {
+  inherits = ["_common-rocm", "_ci-rocm", "_labels"]
+  target = "test"
+  cache-from = get_cache_from_rocm()
+  cache-to = get_cache_to_rocm()
+  tags = compact([
+    IMAGE_TAG,
+    IMAGE_TAG_LATEST,
+  ])
+  output = ["type=registry"]
+}

From 301adfaa37c857e91d440a08ab7416b33e697324 Mon Sep 17 00:00:00 2001
From: Andreas Karatzas
Date: Thu, 12 Mar 2026 22:20:56 -0500
Subject: [PATCH 2/8] [ROCm][CI] Add ROCm Docker Hub registry cache and weekly cleanup pipeline

Signed-off-by: Andreas Karatzas
---
 buildkite/bootstrap-amd.sh     |  7 +++++++
 buildkite/test-template-amd.j2 | 24 +++++++++++-------------
 2 files changed, 18 insertions(+), 13 deletions(-)

diff --git a/buildkite/bootstrap-amd.sh b/buildkite/bootstrap-amd.sh
index ebb527a6..97e4da67 100644
--- a/buildkite/bootstrap-amd.sh
+++ b/buildkite/bootstrap-amd.sh
@@ -112,6 +112,13 @@ upload_pipeline() {
     exit 0
 }
 
+# AMD agents use BUILDKITE_GIT_CLONE_FLAGS=--depth=1 to avoid timing out on
+# full clones of the vllm repo (~184k objects).
Deepen here so git merge-base +# has enough history. 50 commits covers all realistic PR branch depths. +if git rev-parse --is-shallow-repository 2>/dev/null | grep -q "true"; then + git fetch --depth=50 origin main 2>/dev/null || true +fi + get_diff() { $(git add .) echo $(git diff --name-only --diff-filter=ACMDR $(git merge-base origin/main HEAD)) diff --git a/buildkite/test-template-amd.j2 b/buildkite/test-template-amd.j2 index efb8dccc..a3ff2cc0 100644 --- a/buildkite/test-template-amd.j2 +++ b/buildkite/test-template-amd.j2 @@ -288,22 +288,20 @@ plugins: depends_on: ~ soft_fail: false commands: - # Handle the introduction of test target in Dockerfile.rocm - - > - docker build - --build-arg max_jobs=16 - --build-arg REMOTE_VLLM=1 - --build-arg ARG_PYTORCH_ROCM_ARCH='gfx90a;gfx942;gfx950' - --build-arg VLLM_BRANCH=$BUILDKITE_COMMIT - --tag {{ docker_image_amd }} - -f docker/Dockerfile.rocm - --target test - --no-cache - --progress plain . - - "docker push {{ docker_image_amd }}" + - bash .buildkite/scripts/ci-bake.sh test-rocm-ci key: "amd-build" + # DOCKERHUB_CACHE_TO is intentionally absent on non-main builds: + # hardware test and PR builds must not overwrite :rocm-latest (reserved for main). 
env: DOCKER_BUILDKIT: "1" + IMAGE_TAG: "{{ docker_image_amd }}" + VLLM_BAKE_FILE: "docker/docker-bake-rocm.hcl" + CI_HCL_URL: "https://raw.githubusercontent.com/vllm-project/ci-infra/{{ vllm_ci_branch | default('main') }}/docker/ci-rocm.hcl" + PYTORCH_ROCM_ARCH: "gfx90a;gfx942;gfx950" + {% if branch == "main" %} + IMAGE_TAG_LATEST: "rocm/vllm-ci:latest" + DOCKERHUB_CACHE_TO: "rocm/vllm-ci-cache:rocm-latest" + {% endif %} retry: automatic: - exit_status: -1 # Agent was lost From 334f25e7fae4443ad9d7195490a02efb2d620910 Mon Sep 17 00:00:00 2001 From: Andreas Karatzas Date: Fri, 13 Mar 2026 16:39:13 -0500 Subject: [PATCH 3/8] [ROCm][CI] Add ROCm Docker Hub registry cache and weekly cleanup pipeline Signed-off-by: Andreas Karatzas --- .../pipeline_generator/buildkite_step.py | 23 ++++- buildkite/test-template-amd.j2 | 97 ++++++++++++++++++- 2 files changed, 117 insertions(+), 3 deletions(-) diff --git a/buildkite/pipeline_generator/buildkite_step.py b/buildkite/pipeline_generator/buildkite_step.py index d853f2c5..7d3ba939 100644 --- a/buildkite/pipeline_generator/buildkite_step.py +++ b/buildkite/pipeline_generator/buildkite_step.py @@ -323,6 +323,27 @@ def _create_amd_mirror_step(step: Step, original_commands: List[str], amd: Dict[ DeviceType.AMD_MI355_8: AgentQueue.AMD_MI355_8, } + # Map device type to GPU architecture for per-arch image builds. + # When a per-arch build step (image-build-amd-) exists, prefer it + # over the fat all-arch build (image-build-amd) for faster CI. 
+ _device_to_arch = { + DeviceType.AMD_MI250_1: "gfx90a", + DeviceType.AMD_MI250_2: "gfx90a", + DeviceType.AMD_MI250_4: "gfx90a", + DeviceType.AMD_MI250_8: "gfx90a", + DeviceType.AMD_MI325_1: "gfx942", + DeviceType.AMD_MI325_2: "gfx942", + DeviceType.AMD_MI325_4: "gfx942", + DeviceType.AMD_MI325_8: "gfx942", + DeviceType.AMD_MI355_1: "gfx950", + DeviceType.AMD_MI355_2: "gfx950", + DeviceType.AMD_MI355_4: "gfx950", + DeviceType.AMD_MI355_8: "gfx950", + } + arch = _device_to_arch.get(amd_device) + arch_build_key = f"image-build-amd-{arch}" if arch else None + build_dep = arch_build_key if arch_build_key else "image-build-amd" + amd_queue = amd_queue_map.get(amd_device) if not amd_queue: raise ValueError(f"Invalid AMD device: {amd_device}. Valid devices: {list(amd_queue_map.keys())}") @@ -330,7 +351,7 @@ def _create_amd_mirror_step(step: Step, original_commands: List[str], amd: Dict[ return BuildkiteCommandStep( label=amd_label, commands=[amd_command_wrapped], - depends_on=["image-build-amd"], + depends_on=[build_dep], agents={"queue": amd_queue}, env={"DOCKER_BUILDKIT": "1"}, priority=200, diff --git a/buildkite/test-template-amd.j2 b/buildkite/test-template-amd.j2 index a3ff2cc0..99185521 100644 --- a/buildkite/test-template-amd.j2 +++ b/buildkite/test-template-amd.j2 @@ -9,6 +9,20 @@ {% set docker_image_cpu = "public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT-cpu" %} {% endif %} {% set docker_image_amd = "rocm/vllm-ci:$BUILDKITE_COMMIT" %} +{# Per-architecture image tags and agent_pool → arch/build-key mapping #} +{% set docker_image_amd_gfx90a = "rocm/vllm-ci:$BUILDKITE_COMMIT-gfx90a" %} +{% set docker_image_amd_gfx942 = "rocm/vllm-ci:$BUILDKITE_COMMIT-gfx942" %} +{% set docker_image_amd_gfx950 = "rocm/vllm-ci:$BUILDKITE_COMMIT-gfx950" %} +{% set pool_to_arch = { + "mi250_1": "gfx90a", "mi250_2": "gfx90a", "mi250_4": "gfx90a", "mi250_8": "gfx90a", + "mi325_1": "gfx942", "mi325_2": "gfx942", "mi325_4": "gfx942", "mi325_8": "gfx942", + "mi355_1": 
"gfx950", "mi355_2": "gfx950", "mi355_4": "gfx950", "mi355_8": "gfx950", +} %} +{% set arch_to_build_key = { + "gfx90a": "amd-build-gfx90a", + "gfx942": "amd-build-gfx942", + "gfx950": "amd-build-gfx950", +} %} {% set default_working_dir = "/vllm-workspace/tests" %} {% set hf_home = "/root/.cache/huggingface" %} {% set hf_home_efs = "/mnt/efs/hf_cache" %} @@ -315,6 +329,81 @@ plugins: agents: queue: amd-cpu + - label: "AMD: :docker: build image (gfx90a)" + depends_on: ~ + soft_fail: false + commands: + - bash .buildkite/scripts/ci-bake.sh test-rocm-gfx90a-ci + key: "amd-build-gfx90a" + env: + DOCKER_BUILDKIT: "1" + IMAGE_TAG: "{{ docker_image_amd_gfx90a }}" + VLLM_BAKE_FILE: "docker/docker-bake-rocm.hcl" + CI_HCL_URL: "https://raw.githubusercontent.com/vllm-project/ci-infra/{{ vllm_ci_branch | default('main') }}/docker/ci-rocm.hcl" + PYTORCH_ROCM_ARCH: "gfx90a" + retry: + automatic: + - exit_status: -1 + limit: 2 + - exit_status: -10 + limit: 2 + - exit_status: 128 + limit: 2 + - exit_status: 1 + limit: 1 + agents: + queue: amd-cpu + + - label: "AMD: :docker: build image (gfx942)" + depends_on: ~ + soft_fail: false + commands: + - bash .buildkite/scripts/ci-bake.sh test-rocm-gfx942-ci + key: "amd-build-gfx942" + env: + DOCKER_BUILDKIT: "1" + IMAGE_TAG: "{{ docker_image_amd_gfx942 }}" + VLLM_BAKE_FILE: "docker/docker-bake-rocm.hcl" + CI_HCL_URL: "https://raw.githubusercontent.com/vllm-project/ci-infra/{{ vllm_ci_branch | default('main') }}/docker/ci-rocm.hcl" + PYTORCH_ROCM_ARCH: "gfx942" + retry: + automatic: + - exit_status: -1 + limit: 2 + - exit_status: -10 + limit: 2 + - exit_status: 128 + limit: 2 + - exit_status: 1 + limit: 1 + agents: + queue: amd-cpu + + - label: "AMD: :docker: build image (gfx950)" + depends_on: ~ + soft_fail: false + commands: + - bash .buildkite/scripts/ci-bake.sh test-rocm-gfx950-ci + key: "amd-build-gfx950" + env: + DOCKER_BUILDKIT: "1" + IMAGE_TAG: "{{ docker_image_amd_gfx950 }}" + VLLM_BAKE_FILE: "docker/docker-bake-rocm.hcl" + 
CI_HCL_URL: "https://raw.githubusercontent.com/vllm-project/ci-infra/{{ vllm_ci_branch | default('main') }}/docker/ci-rocm.hcl" + PYTORCH_ROCM_ARCH: "gfx950" + retry: + automatic: + - exit_status: -1 + limit: 2 + - exit_status: -10 + limit: 2 + - exit_status: 128 + limit: 2 + - exit_status: 1 + limit: 1 + agents: + queue: amd-cpu + {% for step in steps %} {% if step.mirror_hardwares and mirror_hw in step.mirror_hardwares %} @@ -336,9 +425,13 @@ plugins: {% set ns.blocked = 0 %} {% endif %} + {# Resolve per-arch build key from agent_pool (e.g. mi250_1 → amd-build-gfx90a) #} + {% set step_arch = pool_to_arch.get(step.agent_pool, "") %} + {% set step_build_key = arch_to_build_key.get(step_arch, "amd-build") %} + {% if (ns.blocked == 1 or (step.optional and nightly != "1")) and not (step.autorun_on_main == true and branch == "main") %} - block: "Run {{ step.agent_pool }}: {{ step.label }}" - depends_on: amd-build + depends_on: {{ step_build_key }} key: block-{{ step.agent_pool }}-{{ step.label | replace(" ", "-") | lower | replace("(", "") | replace(")", "") | replace("%", "") | replace(",", "-") | replace("+", "-") }} {% endif %} @@ -346,7 +439,7 @@ plugins: {% if (ns.blocked == 1 or (step.optional and nightly != "1")) and not (step.autorun_on_main == true and branch == "main") %} depends_on: block-{{ step.agent_pool }}-{{ step.label | replace(" ", "-") | lower | replace("(", "") | replace(")", "") | replace("%", "") | replace(",", "-") | replace("+", "-") }} {% else %} - depends_on: amd-build + depends_on: {{ step_build_key }} {% endif %} agents: {% if step.agent_pool %} From 0101524f4bd2c0df408640a4541c1fbc09256a13 Mon Sep 17 00:00:00 2001 From: Andreas Karatzas Date: Sat, 14 Mar 2026 00:28:03 -0500 Subject: [PATCH 4/8] [ROCm][CI] Add ROCm Docker Hub registry cache and weekly cleanup pipeline Signed-off-by: Andreas Karatzas --- buildkite/test-template-amd.j2 | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/buildkite/test-template-amd.j2 
b/buildkite/test-template-amd.j2 index 99185521..952c58ef 100644 --- a/buildkite/test-template-amd.j2 +++ b/buildkite/test-template-amd.j2 @@ -227,7 +227,7 @@ plugins: containers: - image: {{ image }} command: - - bash -c "{{ '(command nvidia-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 && cd ' ~ ((step.working_dir or default_working_dir) | safe) ~ ' && ' ~ (step.command or (step.commands | join(" && ")) | safe) }}" + - bash -c "{{ ('(command nvidia-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 && cd ' ~ ((step.working_dir or default_working_dir) | safe) ~ ' && ' ~ (step.command or (step.commands | join(" && ")) | safe)) }}" resources: limits: nvidia.com/gpu: {{ step.num_gpus or 1 }} @@ -260,7 +260,7 @@ plugins: containers: - image: {{ image }} command: - - bash -c "{{ '(command nvidia-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 && cd ' ~ ((step.working_dir or default_working_dir) | safe) ~ ' && ' ~ (step.command or (step.commands | join(" && ")) | safe) }}" + - bash -c "{{ ('(command nvidia-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 && cd ' ~ ((step.working_dir or default_working_dir) | safe) ~ ' && ' ~ (step.command or (step.commands | join(" && ")) | safe)) }}" resources: limits: nvidia.com/gpu: {{ step.num_gpus or 1 }} @@ -426,8 +426,8 @@ plugins: {% endif %} {# Resolve per-arch build key from agent_pool (e.g. 
mi250_1 → amd-build-gfx90a) #} - {% set step_arch = pool_to_arch.get(step.agent_pool, "") %} - {% set step_build_key = arch_to_build_key.get(step_arch, "amd-build") %} + {% set step_arch = pool_to_arch[step.agent_pool] | default("") %} + {% set step_build_key = arch_to_build_key[step_arch] | default("amd-build") %} {% if (ns.blocked == 1 or (step.optional and nightly != "1")) and not (step.autorun_on_main == true and branch == "main") %} - block: "Run {{ step.agent_pool }}: {{ step.label }}" @@ -447,7 +447,7 @@ plugins: {% else %} queue: amd_mi325_1 {% endif %} - command: bash .buildkite/scripts/hardware_ci/run-amd-test.sh "(command rocm-smi || true) && export VLLM_TEST_GROUP_NAME={{ step.agent_pool }}-{{ step.label | replace(" ", "-") | lower | replace("(", "") | replace(")", "") | replace("%", "") | replace(",", "-") | replace("+", "-") }} && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 && cd {{ (step.working_dir or default_working_dir) | safe }} ; {{ step.command or (step.commands | join(" && ")) | safe }}" + command: bash .buildkite/scripts/hardware_ci/run-amd-test.sh "(command rocm-smi || true) && export VLLM_TEST_GROUP_NAME={{ step.agent_pool }}-{{ step.label | replace(" ", "-") | lower | replace("(", "") | replace(")", "") | replace("%", "") | replace(",", "-") | replace("+", "-") }} && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 && cd {{ (step.working_dir or default_working_dir) | safe }} ; {{ (step.command or (step.commands | join(" && ")) | safe) }}" env: DOCKER_BUILDKIT: "1" priority: 100 From 586ba38920768e1cafc892012cdf7065df2f5931 Mon Sep 17 00:00:00 2001 From: Andreas Karatzas Date: Sat, 14 Mar 2026 16:48:43 -0500 Subject: [PATCH 5/8] [ROCm][CI] Add ROCm Docker Hub registry cache and weekly cleanup pipeline Signed-off-by: Andreas Karatzas --- docker/ci-rocm.hcl | 31 +++++++++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/docker/ci-rocm.hcl b/docker/ci-rocm.hcl index 466df841..0aa89c26 100644 --- a/docker/ci-rocm.hcl 
+++ b/docker/ci-rocm.hcl @@ -105,8 +105,8 @@ function "get_cache_to_rocm" { target "_ci-rocm" { annotations = [ - "index,manifest:vllm.buildkite.build_number=${BUILDKITE_BUILD_NUMBER}", - "index,manifest:vllm.buildkite.build_id=${BUILDKITE_BUILD_ID}", + "manifest:vllm.buildkite.build_number=${BUILDKITE_BUILD_NUMBER}", + "manifest:vllm.buildkite.build_id=${BUILDKITE_BUILD_ID}", ] args = { ARG_PYTORCH_ROCM_ARCH = PYTORCH_ROCM_ARCH @@ -125,3 +125,30 @@ target "test-rocm-ci" { ]) output = ["type=registry"] } + +# Per-architecture CI targets — each builds for a single GPU arch and pushes +# to the registry so test agents can pull the image. +# Each per-arch build step sets IMAGE_TAG to e.g. rocm/vllm-ci:-gfx942 +target "test-rocm-gfx90a-ci" { + inherits = ["test-rocm-gfx90a", "_ci-rocm"] + cache-from = get_cache_from_rocm() + cache-to = get_cache_to_rocm() + tags = compact([IMAGE_TAG]) + output = ["type=registry"] +} + +target "test-rocm-gfx942-ci" { + inherits = ["test-rocm-gfx942", "_ci-rocm"] + cache-from = get_cache_from_rocm() + cache-to = get_cache_to_rocm() + tags = compact([IMAGE_TAG]) + output = ["type=registry"] +} + +target "test-rocm-gfx950-ci" { + inherits = ["test-rocm-gfx950", "_ci-rocm"] + cache-from = get_cache_from_rocm() + cache-to = get_cache_to_rocm() + tags = compact([IMAGE_TAG]) + output = ["type=registry"] +} From 87285835d50ab2f8d3dec4ca08e176e8054c5803 Mon Sep 17 00:00:00 2001 From: Andreas Karatzas Date: Sat, 14 Mar 2026 18:33:19 -0500 Subject: [PATCH 6/8] [ROCm][CI] Add ROCm Docker Hub registry cache and weekly cleanup pipeline Signed-off-by: Andreas Karatzas --- buildkite/test-template-amd.j2 | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/buildkite/test-template-amd.j2 b/buildkite/test-template-amd.j2 index 5ccf6545..c3e7c748 100644 --- a/buildkite/test-template-amd.j2 +++ b/buildkite/test-template-amd.j2 @@ -23,6 +23,11 @@ "gfx942": "amd-build-gfx942", "gfx950": "amd-build-gfx950", } %} +{% set arch_to_image = { + "gfx90a": 
docker_image_amd_gfx90a, + "gfx942": docker_image_amd_gfx942, + "gfx950": docker_image_amd_gfx950, +} %} {% set default_working_dir = "/vllm-workspace/tests" %} {% set hf_home = "/root/.cache/huggingface" %} {% set hf_home_efs = "/mnt/efs/hf_cache" %} @@ -450,6 +455,7 @@ plugins: command: bash .buildkite/scripts/hardware_ci/run-amd-test.sh env: DOCKER_BUILDKIT: "1" + DOCKER_IMAGE_NAME: "{{ arch_to_image[step_arch] | default(docker_image_amd) }}" VLLM_TEST_COMMANDS: "(command rocm-smi || true) && export VLLM_TEST_GROUP_NAME={{ step.agent_pool }}-{{ step.label | replace(' ', '-') | lower | replace('(', '') | replace(')', '') | replace('%', '') | replace(',', '-') | replace('+', '-') }} && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 && cd {{ (step.working_dir or default_working_dir) | safe }} && {{ (step.command or (step.commands | join(' && '))) | replace('\"', '\\\"') | safe }}" priority: 100 {% if step.grade and step.grade == "Blocking" %} From e2d5bf74210a33b52975832fb3a546ccdb7209de Mon Sep 17 00:00:00 2001 From: Andreas Karatzas Date: Sat, 14 Mar 2026 19:05:58 -0500 Subject: [PATCH 7/8] [ROCm][CI] Add ROCm Docker Hub registry cache and weekly cleanup pipeline Signed-off-by: Andreas Karatzas --- buildkite/pipeline_generator/buildkite_step.py | 16 +++++++++++----- buildkite/test-template-amd.j2 | 15 ++++++++++++--- 2 files changed, 23 insertions(+), 8 deletions(-) diff --git a/buildkite/pipeline_generator/buildkite_step.py b/buildkite/pipeline_generator/buildkite_step.py index 1fc5a7e5..a147a470 100644 --- a/buildkite/pipeline_generator/buildkite_step.py +++ b/buildkite/pipeline_generator/buildkite_step.py @@ -231,16 +231,17 @@ def convert_group_step_to_buildkite_step( # Create AMD mirror step and its block step if specified/applicable if step.mirror and step.mirror.get("amd"): - amd_block_step = None + amd_step = _create_amd_mirror_step(step, step_commands, step.mirror["amd"]) + # Block step depends on the same build the mirror step uses + # (per-arch when 
available, fat build otherwise). + mirror_build_dep = amd_step.depends_on[0] if amd_step.depends_on else "image-build-amd" amd_block_step = BuildkiteBlockStep( block=f"Run AMD: {step.label}", - depends_on=["image-build-amd"], + depends_on=[mirror_build_dep], key=f"block-amd-{_generate_step_key(step.label)}", ) amd_mirror_steps.append(amd_block_step) - amd_step = _create_amd_mirror_step(step, step_commands, step.mirror["amd"]) - if amd_block_step: - amd_step.depends_on.extend([amd_block_step.key]) + amd_step.depends_on.append(amd_block_step.key) amd_mirror_steps.append(amd_step) buildkite_group_steps.append( @@ -261,6 +262,11 @@ def _step_should_run(step: Step, list_file_diff: List[str]) -> bool: return False global_config = get_global_config() if step.key and step.key.startswith("image-build"): + # Fat all-arch build (image-build-amd) only auto-runs on main; + # on PR branches it gets a block step so it's on-demand. + # Per-arch builds (image-build-amd-gfx*) always auto-run. + if step.key == "image-build-amd" and global_config["branch"] != "main": + return False return True if global_config["nightly"] == "1": return True diff --git a/buildkite/test-template-amd.j2 b/buildkite/test-template-amd.j2 index c3e7c748..7efc44dc 100644 --- a/buildkite/test-template-amd.j2 +++ b/buildkite/test-template-amd.j2 @@ -303,14 +303,23 @@ plugins: - group: "AMD Tests" depends_on: ~ steps: - - label: "AMD: :docker: build image" + # Fat multi-arch image: auto-runs on main (for cache warming / release), + # blocked behind manual approval on PR / hardware-test builds. 
+ {% if branch != "main" %} + - block: "Run AMD fat build (all archs)" + key: "block-amd-build" + {% endif %} + + - label: "AMD: :docker: build image (all archs)" + {% if branch == "main" %} depends_on: ~ + {% else %} + depends_on: "block-amd-build" + {% endif %} soft_fail: false commands: - bash .buildkite/scripts/ci-bake.sh test-rocm-ci key: "amd-build" - # DOCKERHUB_CACHE_TO is intentionally absent on non-main builds: - # hardware test and PR builds must not overwrite :rocm-latest (reserved for main). env: DOCKER_BUILDKIT: "1" IMAGE_TAG: "{{ docker_image_amd }}" From f841d98e10a082b7a1a4a0dac0417823314aab6a Mon Sep 17 00:00:00 2001 From: Andreas Karatzas Date: Sat, 14 Mar 2026 21:15:10 -0500 Subject: [PATCH 8/8] [ROCm][CI] Add ROCm Docker Hub registry cache and weekly cleanup pipeline Signed-off-by: Andreas Karatzas --- buildkite/test-template-amd.j2 | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/buildkite/test-template-amd.j2 b/buildkite/test-template-amd.j2 index 7efc44dc..31546932 100644 --- a/buildkite/test-template-amd.j2 +++ b/buildkite/test-template-amd.j2 @@ -333,11 +333,11 @@ plugins: retry: automatic: - exit_status: -1 # Agent was lost - limit: 2 + limit: 1 - exit_status: -10 # Agent was lost - limit: 2 + limit: 1 - exit_status: 128 # Git connectivity issues - limit: 2 + limit: 1 - exit_status: 1 # Machine occasionally fail limit: 1 agents: @@ -358,11 +358,11 @@ plugins: retry: automatic: - exit_status: -1 - limit: 2 + limit: 1 - exit_status: -10 - limit: 2 + limit: 1 - exit_status: 128 - limit: 2 + limit: 1 - exit_status: 1 limit: 1 agents: @@ -383,11 +383,11 @@ plugins: retry: automatic: - exit_status: -1 - limit: 2 + limit: 1 - exit_status: -10 - limit: 2 + limit: 1 - exit_status: 128 - limit: 2 + limit: 1 - exit_status: 1 limit: 1 agents: @@ -408,11 +408,11 @@ plugins: retry: automatic: - exit_status: -1 - limit: 2 + limit: 1 - exit_status: -10 - limit: 2 + limit: 1 - exit_status: 128 - limit: 2 
+ limit: 1 - exit_status: 1 limit: 1 agents: @@ -478,10 +478,12 @@ plugins: retry: automatic: - exit_status: -1 # Agent was lost - limit: 2 + limit: 1 - exit_status: -10 # Agent was lost - limit: 2 + limit: 1 - exit_status: 128 # Git connectivity issues - limit: 2 + limit: 1 + - exit_status: 1 # Machine occasionally fail + limit: 1 {% endif %} {% endfor %}