Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions buildkite/bootstrap-amd.sh
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,13 @@ upload_pipeline() {
exit 0
}

# AMD agents use BUILDKITE_GIT_CLONE_FLAGS=--depth=1 to avoid timing out on
# full clones of the vllm repo (~184k objects). Deepen here so git merge-base
# has enough history. 50 commits covers all realistic PR branch depths.
if git rev-parse --is-shallow-repository 2>/dev/null | grep -q "true"; then
git fetch --depth=50 origin main 2>/dev/null || true
fi

get_diff() {
$(git add .)
echo $(git diff --name-only --diff-filter=ACMDR $(git merge-base origin/main HEAD))
Expand Down
39 changes: 33 additions & 6 deletions buildkite/pipeline_generator/buildkite_step.py
Original file line number Diff line number Diff line change
Expand Up @@ -231,16 +231,17 @@ def convert_group_step_to_buildkite_step(

# Create AMD mirror step and its block step if specified/applicable
if step.mirror and step.mirror.get("amd"):
amd_block_step = None
amd_step = _create_amd_mirror_step(step, step_commands, step.mirror["amd"])
# Block step depends on the same build the mirror step uses
# (per-arch when available, fat build otherwise).
mirror_build_dep = amd_step.depends_on[0] if amd_step.depends_on else "image-build-amd"
amd_block_step = BuildkiteBlockStep(
block=f"Run AMD: {step.label}",
depends_on=["image-build-amd"],
depends_on=[mirror_build_dep],
key=f"block-amd-{_generate_step_key(step.label)}",
)
amd_mirror_steps.append(amd_block_step)
amd_step = _create_amd_mirror_step(step, step_commands, step.mirror["amd"])
if amd_block_step:
amd_step.depends_on.extend([amd_block_step.key])
amd_step.depends_on.append(amd_block_step.key)
amd_mirror_steps.append(amd_step)

buildkite_group_steps.append(
Expand All @@ -261,6 +262,11 @@ def _step_should_run(step: Step, list_file_diff: List[str]) -> bool:
return False
global_config = get_global_config()
if step.key and step.key.startswith("image-build"):
# Fat all-arch build (image-build-amd) only auto-runs on main;
# on PR branches it gets a block step so it's on-demand.
# Per-arch builds (image-build-amd-gfx*) always auto-run.
if step.key == "image-build-amd" and global_config["branch"] != "main":
return False
return True
if global_config["nightly"] == "1":
return True
Expand Down Expand Up @@ -330,14 +336,35 @@ def _create_amd_mirror_step(step: Step, original_commands: List[str], amd: Dict[
DeviceType.AMD_MI355_8: AgentQueue.AMD_MI355_8,
}

# Map device type to GPU architecture for per-arch image builds.
# When a per-arch build step (image-build-amd-<arch>) exists, prefer it
# over the fat all-arch build (image-build-amd) for faster CI.
_device_to_arch = {
DeviceType.AMD_MI250_1: "gfx90a",
DeviceType.AMD_MI250_2: "gfx90a",
DeviceType.AMD_MI250_4: "gfx90a",
DeviceType.AMD_MI250_8: "gfx90a",
DeviceType.AMD_MI325_1: "gfx942",
DeviceType.AMD_MI325_2: "gfx942",
DeviceType.AMD_MI325_4: "gfx942",
DeviceType.AMD_MI325_8: "gfx942",
DeviceType.AMD_MI355_1: "gfx950",
DeviceType.AMD_MI355_2: "gfx950",
DeviceType.AMD_MI355_4: "gfx950",
DeviceType.AMD_MI355_8: "gfx950",
}
arch = _device_to_arch.get(amd_device)
arch_build_key = f"image-build-amd-{arch}" if arch else None
build_dep = arch_build_key if arch_build_key else "image-build-amd"

amd_queue = amd_queue_map.get(amd_device)
if not amd_queue:
raise ValueError(f"Invalid AMD device: {amd_device}. Valid devices: {list(amd_queue_map.keys())}")

return BuildkiteCommandStep(
label=amd_label,
commands=[amd_command_wrapped],
depends_on=["image-build-amd"],
depends_on=[build_dep],
agents={"queue": amd_queue},
env={"DOCKER_BUILDKIT": "1", "VLLM_TEST_COMMANDS": amd_commands_str},
priority=200,
Expand Down
35 changes: 35 additions & 0 deletions buildkite/pipelines/cleanup-dockerhub-rocm-cache.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# cleanup-dockerhub-rocm-cache.yaml
#
# Buildkite scheduled pipeline — weekly cleanup of stale BuildKit registry-cache
# tags from Docker Hub (rocm/vllm-ci-cache).
#
# Setup in Buildkite UI:
#   1. Create a new pipeline pointing at this file.
#   2. Under "Schedules", add:
#        Cron:   0 3 * * 0   (Sundays at 03:00 UTC)
#        Branch: main
#   3. Under "Environment Variables" (or via Buildkite secrets), set:
#        DOCKERHUB_USERNAME - Docker Hub account with delete access on rocm/vllm-ci-cache
#        DOCKERHUB_TOKEN    - Docker Hub personal access token (read/write/delete scope)
#
# Optional overrides (set as env vars on the schedule or the pipeline):
#   KEEP_DAYS - Days of tags to retain (default: 30)
#   DRY_RUN   - Set to "1" for a safe preview run that deletes nothing

steps:
  - label: ":wastebasket: Clean stale ROCm cache tags from Docker Hub"
    key: cleanup-rocm-cache
    agents:
      queue: amd_cpu
    commands:
      - bash buildkite/scripts/cleanup-dockerhub-rocm-cache.sh
    env:
      # Defaults here mirror the script's own fallbacks; schedule-level env
      # vars can override both.
      KEEP_DAYS: "30"
      DRY_RUN: "0"
    timeout_in_minutes: 15
    retry:
      automatic:
        - exit_status: -1 # Agent lost
          limit: 2
        - exit_status: 1 # NOTE(review): the script exits 1 both for transient
                         # API errors AND for failed tag deletions; a retry
                         # just reruns the cleanup, which is safe to repeat.
          limit: 2
161 changes: 161 additions & 0 deletions buildkite/scripts/cleanup-dockerhub-rocm-cache.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
#!/bin/bash
# cleanup-dockerhub-rocm-cache.sh
#
# Removes stale commit-tagged BuildKit cache entries from Docker Hub.
#
# Background: every CI build (PR and main) writes a commit-specific tag
# (rocm-<sha>) to rocm/vllm-ci-cache via ci-rocm.hcl's get_cache_to_rocm().
# Without periodic cleanup these accumulate indefinitely at several GB each.
#
# This script should be run on a schedule (e.g., weekly via a Buildkite
# scheduled build or cron). It keeps:
#   - rocm-latest (the warm main-branch baseline, never deleted)
#   - any non-commit tag (anything not matching rocm-<40-hex sha>)
#   - any tag pushed within the last KEEP_DAYS days
#   - any tag whose push timestamp is missing OR unparsable (never delete on
#     bad metadata)
#
# Required environment variables:
#   DOCKERHUB_USERNAME - Docker Hub account with push rights to rocm/vllm-ci-cache
#   DOCKERHUB_TOKEN    - Docker Hub personal access token (read/write/delete scope)
#
# Optional:
#   KEEP_DAYS  - Age threshold in days (default: 30)
#   CACHE_REPO - Docker Hub repo to clean (default: rocm/vllm-ci-cache)
#   DRY_RUN    - Set to "1" to list tags that would be deleted without
#                actually deleting them (default: 0)

set -euo pipefail

KEEP_DAYS="${KEEP_DAYS:-30}"
CACHE_REPO="${CACHE_REPO:-rocm/vllm-ci-cache}"
DRY_RUN="${DRY_RUN:-0}"

DOCKERHUB_USERNAME="${DOCKERHUB_USERNAME:?DOCKERHUB_USERNAME must be set}"
DOCKERHUB_TOKEN="${DOCKERHUB_TOKEN:?DOCKERHUB_TOKEN must be set}"

NAMESPACE="${CACHE_REPO%%/*}"
REPONAME="${CACHE_REPO##*/}"

echo "=== Docker Hub ROCm cache cleanup ==="
echo "Repo:      ${CACHE_REPO}"
echo "Keep days: ${KEEP_DAYS}"
echo "Dry run:   ${DRY_RUN}"
echo ""

# ── Auth ──────────────────────────────────────────────────────────────────────

echo "--- :key: Authenticating with Docker Hub"
JWT=$(curl -sSf "https://hub.docker.com/v2/users/login" \
  -H "Content-Type: application/json" \
  -d "{\"username\":\"${DOCKERHUB_USERNAME}\",\"password\":\"${DOCKERHUB_TOKEN}\"}" \
  | python3 -c "import sys,json; print(json.load(sys.stdin)['token'])")
echo "Authenticated as ${DOCKERHUB_USERNAME}"

# ── List all tags with pagination ─────────────────────────────────────────────

echo "--- :mag: Listing tags in ${CACHE_REPO}"

declare -a STALE_TAGS=()
# GNU date first; BSD/other platforms fall back to a python one-liner.
CUTOFF_EPOCH=$(date -d "-${KEEP_DAYS} days" +%s 2>/dev/null \
  || python3 -c "import time; print(int(time.time()) - ${KEEP_DAYS}*86400)")
NOW_EPOCH=$(date +%s)

PAGE_URL="https://hub.docker.com/v2/repositories/${NAMESPACE}/${REPONAME}/tags/?page_size=100"
TOTAL_CHECKED=0
TOTAL_KEPT=0

while [[ -n "${PAGE_URL}" && "${PAGE_URL}" != "null" ]]; do
  RESPONSE=$(curl -sSf "${PAGE_URL}" \
    -H "Authorization: Bearer ${JWT}")

  # Extract next page URL (empty string ends the loop)
  PAGE_URL=$(echo "${RESPONSE}" | python3 -c \
    "import sys,json; d=json.load(sys.stdin); print(d.get('next') or '')")

  # Process each tag on this page. The loop body runs in the current shell
  # (process substitution, not a pipe) so STALE_TAGS mutations persist.
  while IFS= read -r TAG_JSON; do
    NAME=$(echo "${TAG_JSON}" | python3 -c \
      "import sys,json; d=json.load(sys.stdin); print(d['name'])")
    LAST_UPDATED=$(echo "${TAG_JSON}" | python3 -c \
      "import sys,json; d=json.load(sys.stdin); print(d.get('tag_last_pushed') or d.get('last_updated') or '')")

    TOTAL_CHECKED=$((TOTAL_CHECKED + 1))

    # Always keep rocm-latest and any non-commit tags
    if [[ "${NAME}" == "rocm-latest" ]]; then
      echo "  KEEP  ${NAME} (protected baseline tag)"
      TOTAL_KEPT=$((TOTAL_KEPT + 1))
      continue
    fi

    # Only touch commit-specific tags (rocm-<40-hex-char sha>)
    if ! [[ "${NAME}" =~ ^rocm-[0-9a-f]{40}$ ]]; then
      echo "  KEEP  ${NAME} (not a commit tag)"
      TOTAL_KEPT=$((TOTAL_KEPT + 1))
      continue
    fi

    # Parse the push timestamp and compare against cutoff
    if [[ -z "${LAST_UPDATED}" ]]; then
      echo "  KEEP  ${NAME} (no timestamp — skipping to be safe)"
      TOTAL_KEPT=$((TOTAL_KEPT + 1))
      continue
    fi

    # The timestamp goes in via argv — never interpolated into the Python
    # program text, so unexpected characters cannot break the one-liner.
    # On parse failure we emit "" and KEEP the tag: a bad timestamp must
    # never cause a deletion (a "0" fallback would read as infinitely old).
    TAG_EPOCH=$(python3 -c "
import sys, datetime
s = sys.argv[1].rstrip('Z').split('.')[0]
try:
    dt = datetime.datetime.fromisoformat(s).replace(tzinfo=datetime.timezone.utc)
    print(int(dt.timestamp()))
except ValueError:
    print('')
" "${LAST_UPDATED}" 2>/dev/null || echo "")

    if [[ -z "${TAG_EPOCH}" ]]; then
      echo "  KEEP  ${NAME} (unparsable timestamp '${LAST_UPDATED}' — skipping to be safe)"
      TOTAL_KEPT=$((TOTAL_KEPT + 1))
      continue
    fi

    if [[ "${TAG_EPOCH}" -lt "${CUTOFF_EPOCH}" ]]; then
      AGE_DAYS=$(( (NOW_EPOCH - TAG_EPOCH) / 86400 ))
      echo "  STALE ${NAME} (${AGE_DAYS}d old, last pushed ${LAST_UPDATED})"
      STALE_TAGS+=("${NAME}")
    else
      echo "  KEEP  ${NAME} (recent, last pushed ${LAST_UPDATED})"
      TOTAL_KEPT=$((TOTAL_KEPT + 1))
    fi
  done < <(echo "${RESPONSE}" | python3 -c \
    "import sys,json; [print(json.dumps(t)) for t in json.load(sys.stdin).get('results',[])]")
done

echo ""
echo "Checked: ${TOTAL_CHECKED} tags — keeping ${TOTAL_KEPT}, deleting ${#STALE_TAGS[@]}"

# Exit before expanding STALE_TAGS: "${STALE_TAGS[@]}" on an empty array
# trips `set -u` on bash < 4.4.
if [[ "${#STALE_TAGS[@]}" -eq 0 ]]; then
  echo "Nothing to delete."
  exit 0
fi

# ── Delete stale tags ─────────────────────────────────────────────────────────

echo ""
if [[ "${DRY_RUN}" == "1" ]]; then
  echo "--- :no_entry: DRY RUN — the following tags would be deleted:"
  printf '  %s\n' "${STALE_TAGS[@]}"
  echo "Set DRY_RUN=0 to actually delete them."
  exit 0
fi

echo "--- :wastebasket: Deleting ${#STALE_TAGS[@]} stale tags"
DELETED=0
FAILED=0

for TAG in "${STALE_TAGS[@]}"; do
  # Docker Hub returns 204 No Content on a successful tag delete.
  HTTP_STATUS=$(curl -sSo /dev/null -w "%{http_code}" \
    -X DELETE \
    "https://hub.docker.com/v2/repositories/${NAMESPACE}/${REPONAME}/tags/${TAG}/" \
    -H "Authorization: Bearer ${JWT}")

  if [[ "${HTTP_STATUS}" == "204" ]]; then
    echo "  Deleted: ${TAG}"
    DELETED=$((DELETED + 1))
  else
    echo "  FAILED (HTTP ${HTTP_STATUS}): ${TAG}"
    FAILED=$((FAILED + 1))
  fi
done

echo ""
echo "=== Cleanup complete: ${DELETED} deleted, ${FAILED} failed ==="
if [[ "${FAILED}" -gt 0 ]]; then
  exit 1
fi
Loading