Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
8c1f814
[ROCm][CI] Add ROCm Docker Hub registry cache and weekly cleanup pipe…
AndreasKaratzas Mar 13, 2026
4716340
[ROCm][CI] Add ROCm Docker Hub registry cache and weekly cleanup pipe…
AndreasKaratzas Mar 13, 2026
f7086c2
[ROCm][CI] Add ROCm Docker Hub registry cache and weekly cleanup pipe…
AndreasKaratzas Mar 13, 2026
8e657f0
[ROCm][CI] Add ROCm Docker Hub registry cache and weekly cleanup pipe…
AndreasKaratzas Mar 13, 2026
20d0a5f
[ROCm][CI] Add ROCm Docker Hub registry cache and weekly cleanup pipe…
AndreasKaratzas Mar 14, 2026
f76a302
[ROCm][CI] Add ROCm Docker Hub registry cache and weekly cleanup pipe…
AndreasKaratzas Mar 14, 2026
24e69a1
Merge remote-tracking branch 'origin/main' into akaratza_optimize_doc…
AndreasKaratzas Mar 14, 2026
e40694e
[ROCm][CI] Add ROCm Docker Hub registry cache and weekly cleanup pipe…
AndreasKaratzas Mar 14, 2026
e8346a0
[ROCm][CI] Add ROCm Docker Hub registry cache and weekly cleanup pipe…
AndreasKaratzas Mar 14, 2026
a2c6035
[ROCm][CI] Add ROCm Docker Hub registry cache and weekly cleanup pipe…
AndreasKaratzas Mar 14, 2026
067a486
[ROCm][CI] Add ROCm Docker Hub registry cache and weekly cleanup pipe…
AndreasKaratzas Mar 14, 2026
3509942
[ROCm][CI] Add ROCm Docker Hub registry cache and weekly cleanup pipe…
AndreasKaratzas Mar 14, 2026
3ca62e1
[ROCm][CI] Add ROCm Docker Hub registry cache and weekly cleanup pipe…
AndreasKaratzas Mar 14, 2026
624b413
[ROCm][CI] Add ROCm Docker Hub registry cache and weekly cleanup pipe…
AndreasKaratzas Mar 14, 2026
04f3ee6
[ROCm][CI] Add ROCm Docker Hub registry cache and weekly cleanup pipe…
AndreasKaratzas Mar 15, 2026
87a03a8
[ROCm][CI] Add ROCm Docker Hub registry cache and weekly cleanup pipe…
AndreasKaratzas Mar 15, 2026
17f5ee9
[ROCm][CI] Add ROCm Docker Hub registry cache and weekly cleanup pipe…
AndreasKaratzas Mar 15, 2026
5af45e0
[ROCm][CI] Chain hipify targets sequentially to resolve potential rac…
AndreasKaratzas Mar 15, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
107 changes: 91 additions & 16 deletions .buildkite/hardware_tests/amd.yaml
Original file line number Diff line number Diff line change
@@ -1,30 +1,105 @@
group: Hardware - AMD Build
group: Hardware - AMD Build
steps:
- label: "AMD: :docker: build image"
# Fat multi-arch image - only auto-runs on main (cache warming / release).
# On PR builds, the Jinja template gates this behind a manual block step.
# This YAML is the source-of-truth for step shape; the template adds the block logic.
- label: "AMD: :docker: build image (all archs)"
key: image-build-amd
depends_on: []
device: amd_cpu
no_plugin: true
commands:
- >
docker build
--build-arg max_jobs=16
--build-arg REMOTE_VLLM=1
--build-arg ARG_PYTORCH_ROCM_ARCH='gfx90a;gfx942;gfx950'
--build-arg VLLM_BRANCH=$BUILDKITE_COMMIT
--tag "rocm/vllm-ci:${BUILDKITE_COMMIT}"
-f docker/Dockerfile.rocm
--target test
--no-cache
--progress plain .
- docker push "rocm/vllm-ci:${BUILDKITE_COMMIT}"
- bash .buildkite/scripts/ci-bake.sh test-rocm-ci
env:
DOCKER_BUILDKIT: "1"
IMAGE_TAG: "rocm/vllm-ci:${BUILDKITE_COMMIT}"
VLLM_BAKE_FILE: "docker/docker-bake-rocm.hcl"
CI_HCL_URL: "https://raw.githubusercontent.com/vllm-project/ci-infra/main/docker/ci-rocm.hcl"
PYTORCH_ROCM_ARCH: "gfx90a;gfx942;gfx950"
timeout_in_minutes: 600
retry:
automatic:
- exit_status: -1 # Agent was lost
- exit_status: -1 # Agent was lost
limit: 1
- exit_status: -10 # Agent was lost
limit: 1
- exit_status: 1 # Machine occasionally fail
- exit_status: 128 # Git / network connectivity issues
limit: 1
- exit_status: 1 # Machine occasionally fails
limit: 1

# Per-architecture images
# Builds the gfx90a-only CI image by running ci-bake.sh with the
# test-rocm-gfx90a-ci bake target. The step declares no dependencies
# (depends_on is empty).
- label: "AMD: :docker: build image (gfx90a)"
key: image-build-amd-gfx90a
depends_on: []
device: amd_cpu
no_plugin: true
commands:
- bash .buildkite/scripts/ci-bake.sh test-rocm-gfx90a-ci
env:
DOCKER_BUILDKIT: "1"
# ci-bake.sh exits early without building when this tag already exists.
IMAGE_TAG: "rocm/vllm-ci:${BUILDKITE_COMMIT}-gfx90a"
# ci-bake.sh passes this file and the downloaded CI_HCL_URL file to
# `docker buildx bake -f`.
VLLM_BAKE_FILE: "docker/docker-bake-rocm.hcl"
CI_HCL_URL: "https://raw.githubusercontent.com/vllm-project/ci-infra/main/docker/ci-rocm.hcl"
# NOTE(review): not read by ci-bake.sh itself; presumably consumed by the
# bake files to select the GPU architecture - confirm.
PYTORCH_ROCM_ARCH: "gfx90a"
timeout_in_minutes: 600
# Each listed exit status is retried automatically at most once.
retry:
automatic:
- exit_status: -1 # Agent was lost
limit: 1
- exit_status: -10 # Agent was lost
limit: 1
- exit_status: 128 # Git / network connectivity issues
limit: 1
- exit_status: 1 # Machine occasionally fails
limit: 1

# Builds the gfx942-only CI image by running ci-bake.sh with the
# test-rocm-gfx942-ci bake target. The step declares no dependencies
# (depends_on is empty).
- label: "AMD: :docker: build image (gfx942)"
key: image-build-amd-gfx942
depends_on: []
device: amd_cpu
no_plugin: true
commands:
- bash .buildkite/scripts/ci-bake.sh test-rocm-gfx942-ci
env:
DOCKER_BUILDKIT: "1"
# ci-bake.sh exits early without building when this tag already exists.
IMAGE_TAG: "rocm/vllm-ci:${BUILDKITE_COMMIT}-gfx942"
# ci-bake.sh passes this file and the downloaded CI_HCL_URL file to
# `docker buildx bake -f`.
VLLM_BAKE_FILE: "docker/docker-bake-rocm.hcl"
CI_HCL_URL: "https://raw.githubusercontent.com/vllm-project/ci-infra/main/docker/ci-rocm.hcl"
# NOTE(review): not read by ci-bake.sh itself; presumably consumed by the
# bake files to select the GPU architecture - confirm.
PYTORCH_ROCM_ARCH: "gfx942"
timeout_in_minutes: 600
# Each listed exit status is retried automatically at most once.
retry:
automatic:
- exit_status: -1 # Agent was lost
limit: 1
- exit_status: -10 # Agent was lost
limit: 1
- exit_status: 128 # Git / network connectivity issues
limit: 1
- exit_status: 1 # Machine occasionally fails
limit: 1

# Builds the gfx950-only CI image by running ci-bake.sh with the
# test-rocm-gfx950-ci bake target. The step declares no dependencies
# (depends_on is empty).
- label: "AMD: :docker: build image (gfx950)"
key: image-build-amd-gfx950
depends_on: []
device: amd_cpu
no_plugin: true
commands:
- bash .buildkite/scripts/ci-bake.sh test-rocm-gfx950-ci
env:
DOCKER_BUILDKIT: "1"
# ci-bake.sh exits early without building when this tag already exists.
IMAGE_TAG: "rocm/vllm-ci:${BUILDKITE_COMMIT}-gfx950"
# ci-bake.sh passes this file and the downloaded CI_HCL_URL file to
# `docker buildx bake -f`.
VLLM_BAKE_FILE: "docker/docker-bake-rocm.hcl"
CI_HCL_URL: "https://raw.githubusercontent.com/vllm-project/ci-infra/main/docker/ci-rocm.hcl"
# NOTE(review): not read by ci-bake.sh itself; presumably consumed by the
# bake files to select the GPU architecture - confirm.
PYTORCH_ROCM_ARCH: "gfx950"
timeout_in_minutes: 600
# Each listed exit status is retried automatically at most once.
retry:
automatic:
- exit_status: -1 # Agent was lost
limit: 1
- exit_status: -10 # Agent was lost
limit: 1
- exit_status: 128 # Git / network connectivity issues
limit: 1
- exit_status: 1 # Machine occasionally fails
limit: 1
172 changes: 172 additions & 0 deletions .buildkite/scripts/ci-bake.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,172 @@
#!/bin/bash
# ci-bake.sh - Wrapper script for Docker buildx bake CI builds
#
# Canonical location: vllm repo at .buildkite/scripts/ci-bake.sh
# Kept in sync with ci-infra repo at buildkite/scripts/ci-bake.sh.
# Update both when making changes; the vllm copy is what actually runs in CI
# (pinned to the vllm commit under test).
#
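# This script handles the common setup for running docker buildx bake:
# - Skips the build entirely (exit 0) if IMAGE_TAG is set and already exists
# - Downloads ci.hcl from ci-infra
# - Detects and uses local buildkitd if available (custom AMI with warm cache)
# - Falls back to docker-container driver on regular instances
# - Computes PARENT_COMMIT and VLLM_MERGE_BASE_COMMIT for cache fallback
#   when they are not already provided
# - Runs bake with --print for debugging
# - Runs the actual build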
#
# Usage:
# ci-bake.sh [TARGET]
#
# Environment variables (all optional, with sensible defaults):
# CI_HCL_URL - URL to ci.hcl (default: from ci-infra main branch)
# VLLM_CI_BRANCH - ci-infra branch to use (default: main)
# VLLM_BAKE_FILE - Path to vLLM's bake file (default: docker/docker-bake.hcl)
# BUILDER_NAME - Name for buildx builder (default: vllm-builder)
#
# Build configuration (passed through to bake via environment):
# BUILDKITE_COMMIT - Git commit (auto-detected from Buildkite)
# PARENT_COMMIT - Parent commit (HEAD~1) for cache fallback (auto-computed)
# IMAGE_TAG - Primary image tag
# IMAGE_TAG_LATEST - Latest tag (optional)
# CACHE_FROM - Cache source
# CACHE_FROM_BASE - Base branch cache source
# CACHE_FROM_MAIN - Main branch cache source
# CACHE_TO - Cache destination
# VLLM_USE_PRECOMPILED - Use precompiled wheels
# VLLM_MERGE_BASE_COMMIT - Merge base commit for precompiled

set -euo pipefail

# Fast path: when IMAGE_TAG is set and `docker manifest inspect` succeeds for
# it, there is nothing to build, so exit successfully right away. Any failure
# of the inspect call (including a missing image) falls through to the build.
if [[ -n "${IMAGE_TAG:-}" ]]; then
  echo "--- :mag: Checking if image exists"
  if ! docker manifest inspect "${IMAGE_TAG}" >/dev/null 2>&1; then
    echo "Image not found, proceeding with build"
  else
    echo "Image already exists: ${IMAGE_TAG}"
    echo "Skipping build"
    exit 0
  fi
fi

# Configuration with defaults.
#   TARGET          - bake target, taken from the first positional argument
#   VLLM_CI_BRANCH  - ci-infra branch used to build the default CI_HCL_URL
#   CI_HCL_URL      - explicit URL of the CI bake file; overrides VLLM_CI_BRANCH
#   VLLM_BAKE_FILE  - vLLM's own bake file, relative to the repository root
#   BUILDER_NAME    - name of the buildx builder to use or create
# CI_HCL_PATH and BUILDKIT_SOCKET are fixed locations, not overridable.
TARGET="${1:-test-ci}"
# Honor VLLM_CI_BRANCH as documented in the header of this script; previously
# it was documented but ignored and "main" was hard-coded into the URL.
VLLM_CI_BRANCH="${VLLM_CI_BRANCH:-main}"
CI_HCL_URL="${CI_HCL_URL:-https://raw.githubusercontent.com/vllm-project/ci-infra/${VLLM_CI_BRANCH}/docker/ci.hcl}"
VLLM_BAKE_FILE="${VLLM_BAKE_FILE:-docker/docker-bake.hcl}"
BUILDER_NAME="${BUILDER_NAME:-vllm-builder}"
CI_HCL_PATH="/tmp/ci.hcl"
BUILDKIT_SOCKET="/run/buildkit/buildkitd.sock"

echo "--- :docker: Setting up Docker buildx bake"
echo "Target: ${TARGET}"
echo "CI HCL URL: ${CI_HCL_URL}"
echo "vLLM bake file: ${VLLM_BAKE_FILE}"


# Abort early when the vLLM bake file is not a regular file at the given path.
# The path is resolved against the current working directory.
[[ -f "${VLLM_BAKE_FILE}" ]] || {
  echo "Error: vLLM bake file not found at ${VLLM_BAKE_FILE}"
  echo "Make sure you're running from the vLLM repository root"
  exit 1
}

# Download ci.hcl to CI_HCL_PATH.
# -f makes HTTP errors fail the command (and, via `set -e`, the script);
# --retry re-attempts errors curl considers transient, so a brief network
# blip does not fail the whole image build.
echo "--- :arrow_down: Downloading ci.hcl"
curl -sSfL --retry 3 --retry-delay 2 -o "${CI_HCL_PATH}" "${CI_HCL_URL}"
echo "Downloaded to ${CI_HCL_PATH}"

# Select the buildx builder to build with.
# - A builder named BUILDER_NAME that already exists is always reused as-is
#   (only its existence is checked, not which driver it uses).
# - Otherwise a new builder is created: with the remote driver when a local
#   buildkitd socket is present (custom AMI, warm cache), else with the
#   docker-container driver.
# In every case the selected builder is bootstrapped afterwards.
echo "--- :buildkite: Setting up buildx builder"

have_buildkitd_socket=false
if [[ -S "${BUILDKIT_SOCKET}" ]]; then
  have_buildkitd_socket=true
  echo "✅ Found local buildkitd socket at ${BUILDKIT_SOCKET}"
  echo "Using remote driver to connect to buildkitd (warm cache available)"
fi

if docker buildx inspect "${BUILDER_NAME}" >/dev/null 2>&1; then
  echo "Using existing builder: ${BUILDER_NAME}"
  docker buildx use "${BUILDER_NAME}"
elif [[ "${have_buildkitd_socket}" == true ]]; then
  echo "Creating builder '${BUILDER_NAME}' with remote driver"
  docker buildx create \
    --name "${BUILDER_NAME}" \
    --driver remote \
    --use \
    "unix://${BUILDKIT_SOCKET}"
else
  echo "No local buildkitd found, using docker-container driver"
  docker buildx create --name "${BUILDER_NAME}" --driver docker-container --use
fi

docker buildx inspect --bootstrap

# Show builder info: the header row plus the currently selected builder.
# `docker buildx ls` marks the selected builder with a `*` that FOLLOWS the
# builder name, so a pattern anchored as '^\*' never matches. Match an
# unindented name followed by `*` instead; indented node rows are skipped.
# If grep selects nothing, fall back to printing the full listing.
echo "Active builder:"
docker buildx ls | grep -E '^NAME|^[^[:space:]]+[[:space:]]*\*' || docker buildx ls

# Make HEAD~1 reachable on shallow checkouts.
# When git reports the repository as shallow, extend the history by one
# commit with `git fetch --deepen=1`. The fetch is best-effort: its stderr is
# discarded and a failure does not abort the script.
shallow_state="$(git rev-parse --is-shallow-repository 2>/dev/null || true)"
if [[ "${shallow_state}" == *true* ]]; then
  echo "Shallow clone detected — deepening for cache key computation"
  git fetch --deepen=1 origin 2>/dev/null || true
fi

# Compute parent commit for cache fallback (if not already set).
# A non-empty PARENT_COMMIT from the environment is used unchanged.
# Otherwise HEAD~1 is resolved and exported when it exists.
#
# `--verify --quiet` matters: a plain `git rev-parse HEAD~1` prints the
# literal text "HEAD~1" on stdout when the revision cannot be resolved, which
# would be exported as a bogus commit id. With --verify, nothing is printed on
# failure, so PARENT_COMMIT stays empty and the fallback message is shown.
if [[ -z "${PARENT_COMMIT:-}" ]]; then
  PARENT_COMMIT=$(git rev-parse --verify --quiet HEAD~1 2>/dev/null || echo "")
  if [[ -n "${PARENT_COMMIT}" ]]; then
    echo "Computed parent commit for cache fallback: ${PARENT_COMMIT}"
    export PARENT_COMMIT
  else
    echo "Could not determine parent commit (may be first commit in repo)"
  fi
else
  echo "Using provided PARENT_COMMIT: ${PARENT_COMMIT}"
fi

# Merge base with origin/main, used as one more cache fallback layer.
# A non-empty VLLM_MERGE_BASE_COMMIT from the environment wins. Otherwise the
# tip of origin/main is fetched (best-effort, errors ignored) and
# `git merge-base HEAD origin/main` is tried; the variable is exported only
# when that yields a commit.
if [[ -n "${VLLM_MERGE_BASE_COMMIT:-}" ]]; then
  echo "Using provided VLLM_MERGE_BASE_COMMIT: ${VLLM_MERGE_BASE_COMMIT}"
else
  git fetch --depth=1 origin main 2>/dev/null || true
  VLLM_MERGE_BASE_COMMIT="$(git merge-base HEAD origin/main 2>/dev/null || true)"
  if [[ -z "${VLLM_MERGE_BASE_COMMIT}" ]]; then
    echo "Could not determine merge base (will skip that cache layer)"
  else
    echo "Computed merge base commit for cache fallback: ${VLLM_MERGE_BASE_COMMIT}"
    export VLLM_MERGE_BASE_COMMIT
  fi
fi

# Print the resolved bake configuration for debugging and, when running under
# Buildkite, upload it as a build artifact. Both steps are best-effort: a
# failure of `--print` or of the upload never aborts the build.
echo "--- :page_facing_up: Resolved bake configuration"
BAKE_CONFIG_FILE="bake-config-build-${BUILDKITE_BUILD_NUMBER:-local}.json"
docker buildx bake -f "${VLLM_BAKE_FILE}" -f "${CI_HCL_PATH}" --print "${TARGET}" | tee "${BAKE_CONFIG_FILE}" || true
if command -v buildkite-agent >/dev/null 2>&1 && [[ -n "${BUILDKITE_BUILD_NUMBER:-}" ]]; then
  # Report success only when the upload actually succeeded; previously the
  # "Uploaded" line was printed even after a failed upload.
  if buildkite-agent artifact upload "${BAKE_CONFIG_FILE}"; then
    echo "Uploaded ${BAKE_CONFIG_FILE} as Buildkite artifact"
  else
    echo "Warning: failed to upload ${BAKE_CONFIG_FILE} as Buildkite artifact (continuing)"
  fi
else
  echo "Saved bake config to ${BAKE_CONFIG_FILE} (not in Buildkite, skipping upload)"
fi

# Build the requested target from both bake files with plain progress output.
# Under `set -e` a failing bake aborts the script before the final message.
bake_args=(
  -f "${VLLM_BAKE_FILE}"
  -f "${CI_HCL_PATH}"
  --progress plain
  "${TARGET}"
)

echo "--- :docker: Building ${TARGET}"
docker buildx bake "${bake_args[@]}"

echo "--- :white_check_mark: Build complete"
2 changes: 1 addition & 1 deletion .buildkite/scripts/hardware_ci/run-amd-test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -378,7 +378,7 @@ wait_for_clean_gpus

# --- Pull test image ---
echo "--- Pulling container"
image_name="rocm/vllm-ci:${BUILDKITE_COMMIT}"
image_name="${DOCKER_IMAGE_NAME:-rocm/vllm-ci:${BUILDKITE_COMMIT}}"
container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
docker pull "${image_name}"

Expand Down
13 changes: 6 additions & 7 deletions .buildkite/test-amd.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -97,10 +97,10 @@ steps:
mirror_hardwares: [amdexperimental, amdproduction, amdgfx90a]
agent_pool: mi250_1
source_file_dependencies:
- tests/standalone_tests/python_only_compile.sh
- tests/standalone_tests/python_only_compile_rocm.sh
- setup.py
commands:
- bash standalone_tests/python_only_compile.sh
- bash standalone_tests/python_only_compile_rocm.sh

- label: Basic Correctness Test # 20min
timeout_in_minutes: 30
Expand Down Expand Up @@ -1429,12 +1429,11 @@ steps:
mirror_hardwares: [amdexperimental]
agent_pool: mi325_1
optional: true
# grade: Blocking
source_file_dependencies:
- tests/standalone_tests/python_only_compile.sh
- tests/standalone_tests/python_only_compile_rocm.sh
- setup.py
commands:
- bash standalone_tests/python_only_compile.sh
- bash standalone_tests/python_only_compile_rocm.sh

- label: Basic Correctness Test # 20min
timeout_in_minutes: 30
Expand Down Expand Up @@ -3189,10 +3188,10 @@ steps:
agent_pool: mi355_1
optional: true
source_file_dependencies:
- tests/standalone_tests/python_only_compile.sh
- tests/standalone_tests/python_only_compile_rocm.sh
- setup.py
commands:
- bash standalone_tests/python_only_compile.sh
- bash standalone_tests/python_only_compile_rocm.sh

- label: Basic Correctness Test # 20min
timeout_in_minutes: 30
Expand Down
6 changes: 4 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -1184,8 +1184,10 @@ if(VLLM_GPU_LANG STREQUAL "HIP")
WITH_SOABI)
endif()

# For CUDA and HIP builds also build the triton_kernels external package.
if(VLLM_GPU_LANG STREQUAL "CUDA" OR VLLM_GPU_LANG STREQUAL "HIP")
# Fetch and vendor triton_kernels (Python-only, no compilation).
# Skipped for HIP/ROCm - the git clone of the full triton repo is expensive
# and triton_kernels is optional at runtime (graceful fallback in import_utils).
if(VLLM_GPU_LANG STREQUAL "CUDA")
include(cmake/external_projects/triton_kernels.cmake)
endif()

Expand Down
9 changes: 9 additions & 0 deletions cmake/utils.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,15 @@ function (hipify_sources_target OUT_SRCS NAME ORIG_SRCS)
BYPRODUCTS ${HIP_SRCS}
COMMENT "Running hipify on ${NAME} extension source files.")

# Chain hipify targets so they run sequentially. Each hipify target runs
# shutil.copytree into a shared output directory; running them in parallel
# causes a race where one target's copytree overwrites .hip files produced
# by another target back to .cu originals.
#
# The name of the most recently created hipify target is kept in a GLOBAL
# property. A PARENT_SCOPE variable is only visible in the immediate caller's
# scope, so it is lost between calls whenever this function is invoked from
# inside another function and the chain is never formed.
# NOTE(review): callers are not visible here - confirm how this is invoked.
get_property(_vllm_prev_hipify_target GLOBAL PROPERTY VLLM_LAST_HIPIFY_TARGET)
if (_vllm_prev_hipify_target)
  add_dependencies(hipify${NAME} ${_vllm_prev_hipify_target})
endif()
set_property(GLOBAL PROPERTY VLLM_LAST_HIPIFY_TARGET "hipify${NAME}")
# Still publish the variable to the caller for anything that reads it.
set(_VLLM_LAST_HIPIFY_TARGET "hipify${NAME}" PARENT_SCOPE)

# Swap out original extension sources with hipified sources.
list(APPEND HIP_SRCS ${CXX_SRCS})
set(${OUT_SRCS} ${HIP_SRCS} PARENT_SCOPE)
Expand Down
Loading
Loading