From 78f97b429340758f2c5619e878710e45081f43fb Mon Sep 17 00:00:00 2001
From: Xin Huang
Date: Thu, 30 Apr 2026 18:01:11 +0800
Subject: [PATCH 1/2] CI: retry docker pulls in workflow image downloads

Retry image pulls in ATOM, vLLM, and flash attention workflows so
transient registry failures do not fail CI immediately. Add a shared
helper where the job checks out aiter and keep an inline retry for the
ATOM job that checks out the ATOM repo.
---
Reviewer notes (below the '---' marker, so not part of the commit message):
 * docker_pull_with_retry.sh takes exactly one argument (the image) and
   reads DOCKER_PULL_MAX_ATTEMPTS (default 3) and
   DOCKER_PULL_RETRY_DELAY_SECONDS (default 10) from the environment.
   It exits 0 on the first successful pull, 1 once every attempt has
   failed, and 2 on a usage error.
 * The final failure message of the inline ATOM retry goes to stderr,
   matching the shared helper.
 * NOTE(review): the From: header carries no e-mail address in this copy;
   `git am` needs one, so restore it before applying.
 * NOTE(review): indentation of the YAML context lines was reconstructed;
   confirm it against the target workflow files.

 .github/scripts/docker_pull_with_retry.sh     | 28 +++++++++++++++++++
 .github/workflows/atom-test.yaml              | 15 +++++++++-
 .../flash_attention_integration.yaml          |  6 ++--
 .github/workflows/vllm_benchmark.yaml         |  9 ++++--
 4 files changed, 53 insertions(+), 5 deletions(-)
 create mode 100644 .github/scripts/docker_pull_with_retry.sh

diff --git a/.github/scripts/docker_pull_with_retry.sh b/.github/scripts/docker_pull_with_retry.sh
new file mode 100644
index 0000000000..3631897a57
--- /dev/null
+++ b/.github/scripts/docker_pull_with_retry.sh
@@ -0,0 +1,28 @@
+#!/bin/bash
+
+set -euo pipefail
+
+if [ "$#" -ne 1 ]; then
+  echo "Usage: $0 <image>" >&2
+  exit 2
+fi
+
+IMAGE="$1"
+MAX_ATTEMPTS="${DOCKER_PULL_MAX_ATTEMPTS:-3}"
+RETRY_DELAY_SECONDS="${DOCKER_PULL_RETRY_DELAY_SECONDS:-10}"
+
+for attempt in $(seq 1 "${MAX_ATTEMPTS}"); do
+  echo "Pulling Docker image '${IMAGE}' (attempt ${attempt}/${MAX_ATTEMPTS})"
+  if docker pull "${IMAGE}"; then
+    echo "Docker pull succeeded for '${IMAGE}' on attempt ${attempt}"
+    exit 0
+  fi
+
+  if [ "${attempt}" -lt "${MAX_ATTEMPTS}" ]; then
+    echo "Docker pull failed for '${IMAGE}' on attempt ${attempt}; retrying in ${RETRY_DELAY_SECONDS}s..."
+    sleep "${RETRY_DELAY_SECONDS}"
+  fi
+done
+
+echo "Docker pull failed for '${IMAGE}' after ${MAX_ATTEMPTS} attempts" >&2
+exit 1
diff --git a/.github/workflows/atom-test.yaml b/.github/workflows/atom-test.yaml
index ae554c8abe..eefed76905 100644
--- a/.github/workflows/atom-test.yaml
+++ b/.github/workflows/atom-test.yaml
@@ -21,6 +21,8 @@ env:
   ATOM_BRANCH: "main"
   ATOM_REPOSITORY_URL: "ROCm/ATOM"
   BASE_IMAGE: "rocm/atom-dev:latest"
+  DOCKER_PULL_MAX_ATTEMPTS: "3"
+  DOCKER_PULL_RETRY_DELAY_SECONDS: "10"
   GITHUB_REPO_URL: ${{ github.event.pull_request.head.repo.clone_url || 'https://github.com/ROCm/Aiter.git' }}
   GITHUB_COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.event.head_commit.id }}
 
@@ -95,7 +97,18 @@ jobs:
       - name: Download the ATOM base image
         if: matrix.run_on_pr == true || github.event_name != 'pull_request'
         run: |
-          docker pull ${{ env.BASE_IMAGE }}
+          for attempt in $(seq 1 "${DOCKER_PULL_MAX_ATTEMPTS}"); do
+            if docker pull "${{ env.BASE_IMAGE }}"; then
+              echo "Docker pull succeeded on attempt ${attempt}"
+              exit 0
+            fi
+            echo "Docker pull attempt ${attempt} failed"
+            if [ "${attempt}" -lt "${DOCKER_PULL_MAX_ATTEMPTS}" ]; then
+              sleep "${DOCKER_PULL_RETRY_DELAY_SECONDS}"
+            fi
+          done
+          echo "Docker pull failed after ${DOCKER_PULL_MAX_ATTEMPTS} attempts" >&2
+          exit 1
 
       - name: Generate Dockerfile
         if: matrix.run_on_pr == true || github.event_name != 'pull_request'
diff --git a/.github/workflows/flash_attention_integration.yaml b/.github/workflows/flash_attention_integration.yaml
index 69d99ab889..930ca65b5f 100644
--- a/.github/workflows/flash_attention_integration.yaml
+++ b/.github/workflows/flash_attention_integration.yaml
@@ -34,6 +34,8 @@ env:
   FA_REPOSITORY_URL: https://github.com/Dao-AILab/flash-attention.git
   BASE_IMAGE: rocm/pytorch:latest@sha256:683765a52c61341e1674fe730ab3be861a444a45a36c0a8caae7653a08a0e208
   AITER_SUBMODULE_PATH: third_party/aiter
+  DOCKER_PULL_MAX_ATTEMPTS: "3"
+  DOCKER_PULL_RETRY_DELAY_SECONDS: "10"
 
 jobs:
   check-signal:
@@ -118,7 +120,7 @@ jobs:
         run: docker login -u rocmshared -p ${{ secrets.DOCKER_PASSWORD }} || true
 
       - name: Pull base image
-        run: docker pull ${{ env.BASE_IMAGE }}
+        run: bash .github/scripts/docker_pull_with_retry.sh "${{ env.BASE_IMAGE }}"
 
       - name: Generate Dockerfile
         run: |
@@ -311,7 +313,7 @@ jobs:
         run: docker login -u rocmshared -p ${{ secrets.DOCKER_PASSWORD }} || true
 
       - name: Pull base image
-        run: docker pull ${{ env.BASE_IMAGE }}
+        run: bash .github/scripts/docker_pull_with_retry.sh "${{ env.BASE_IMAGE }}"
 
       - name: Generate Dockerfile
         run: |
diff --git a/.github/workflows/vllm_benchmark.yaml b/.github/workflows/vllm_benchmark.yaml
index 865b9bebba..74ae7290e7 100644
--- a/.github/workflows/vllm_benchmark.yaml
+++ b/.github/workflows/vllm_benchmark.yaml
@@ -16,6 +16,8 @@ env:
   VLLM_BRANCH: "main"
   VLLM_REPOSITORY_URL: "https://github.com/vllm-project/vllm"
   BASE_IMAGE: rocm/vllm-dev:nightly
+  DOCKER_PULL_MAX_ATTEMPTS: "3"
+  DOCKER_PULL_RETRY_DELAY_SECONDS: "10"
   GITHUB_REPO_URL: ${{ github.event.pull_request.head.repo.clone_url || 'https://github.com/ROCm/aiter.git' }}
   GITHUB_COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
 
@@ -62,7 +64,7 @@ jobs:
 
       - name: Download the vLLM base image
         run: |
-          docker pull ${{ env.BASE_IMAGE }}
+          bash .github/scripts/docker_pull_with_retry.sh "${{ env.BASE_IMAGE }}"
 
       - name: Generate Dockerfile
         run: |
@@ -129,12 +131,15 @@ jobs:
             kv_cache_dtype: 'fp8_kvcache'
 
     steps:
+      - name: Checkout aiter repo
+        uses: actions/checkout@v4
+
       - name: Docker login
         run: docker login -u rocmshared -p ${{ secrets.DOCKER_PASSWORD }}
 
       - name: Download the vLLM image
         run: |
-          docker pull rocm/aiter-ci:${{ env.GITHUB_COMMIT_SHA }}
+          bash .github/scripts/docker_pull_with_retry.sh "rocm/aiter-ci:${{ env.GITHUB_COMMIT_SHA }}"
 
       - name: Run benchmarks
         run: |

From f45f2c41db8f9c48d362f9e51688eb821945464b Mon Sep 17 00:00:00 2001
From: Xin Huang
Date: Sun, 3 May 2026 13:14:55 +0800
Subject: [PATCH 2/2] CI: narrow vLLM helper checkout scope

Use actions/checkout@v6 for the vLLM benchmark helper checkout and
sparse-checkout only .github/scripts so the job does not clone the full
repository just to access the docker pull retry helper.
---
Reviewer notes (below the '---' marker, so not part of the commit message):
 * NOTE(review): this patch's pre-image blob (566bc43204) is not the
   post-image of PATCH 1/2 (74ae7290e7), and its context already contains
   `with:` / `ref:` lines that PATCH 1/2 does not add. Presumably the
   branch picked up other changes in between; confirm the series is
   applied on the intended base.

 .github/workflows/vllm_benchmark.yaml | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/vllm_benchmark.yaml b/.github/workflows/vllm_benchmark.yaml
index 566bc43204..c1402660ad 100644
--- a/.github/workflows/vllm_benchmark.yaml
+++ b/.github/workflows/vllm_benchmark.yaml
@@ -146,10 +146,13 @@ jobs:
             extra_env_args: -e VLLM_USE_TRITON_FLASH_ATTN=0
 
     steps:
-      - name: Checkout aiter repo
-        uses: actions/checkout@v4
+      - name: Checkout Docker pull retry helper
+        uses: actions/checkout@v6
         with:
+          repository: ${{ github.event.pull_request.head.repo.full_name || github.repository }}
           ref: ${{ github.event.pull_request.head.sha || github.sha }}
+          sparse-checkout: |
+            .github/scripts
 
       - name: Download Aiter wheel artifact
         uses: actions/download-artifact@v4