diff --git a/.github/scripts/docker_pull_with_retry.sh b/.github/scripts/docker_pull_with_retry.sh
new file mode 100644
index 0000000000..3631897a57
--- /dev/null
+++ b/.github/scripts/docker_pull_with_retry.sh
@@ -0,0 +1,48 @@
+#!/bin/bash
+#
+# Pull a Docker image, retrying when the pull fails.
+#
+# Usage: docker_pull_with_retry.sh <image>
+#
+# Environment:
+#   DOCKER_PULL_MAX_ATTEMPTS         total number of pull attempts (default: 3)
+#   DOCKER_PULL_RETRY_DELAY_SECONDS  pause between attempts, passed to sleep
+#                                    (default: 10)
+#
+# Exit status: 0 as soon as a pull succeeds, 1 when every attempt fails,
+# 2 on a usage or configuration error.
+
+set -euo pipefail
+
+if [ "$#" -ne 1 ]; then
+  echo "Usage: $0 <image>" >&2
+  exit 2
+fi
+
+IMAGE="$1"
+MAX_ATTEMPTS="${DOCKER_PULL_MAX_ATTEMPTS:-3}"
+RETRY_DELAY_SECONDS="${DOCKER_PULL_RETRY_DELAY_SECONDS:-10}"
+
+# A zero or non-numeric attempt count would make seq emit nothing, so the loop
+# below would be skipped and the script would fail without ever calling docker.
+if ! [[ "${MAX_ATTEMPTS}" =~ ^0*[1-9][0-9]*$ ]]; then
+  echo "DOCKER_PULL_MAX_ATTEMPTS must be a positive integer, got '${MAX_ATTEMPTS}'" >&2
+  exit 2
+fi
+
+for attempt in $(seq 1 "${MAX_ATTEMPTS}"); do
+  echo "Pulling Docker image '${IMAGE}' (attempt ${attempt}/${MAX_ATTEMPTS})"
+  if docker pull "${IMAGE}"; then
+    echo "Docker pull succeeded for '${IMAGE}' on attempt ${attempt}"
+    exit 0
+  fi
+
+  # Only pause when another attempt follows; the last failure exits right away.
+  if [ "${attempt}" -lt "${MAX_ATTEMPTS}" ]; then
+    echo "Docker pull failed for '${IMAGE}' on attempt ${attempt}; retrying in ${RETRY_DELAY_SECONDS}s..."
+    sleep "${RETRY_DELAY_SECONDS}"
+  fi
+done
+
+echo "Docker pull failed for '${IMAGE}' after ${MAX_ATTEMPTS} attempts" >&2
+exit 1
diff --git a/.github/workflows/atom-test.yaml b/.github/workflows/atom-test.yaml
index af5494b8d6..8ee16ecfdc 100644
--- a/.github/workflows/atom-test.yaml
+++ b/.github/workflows/atom-test.yaml
@@ -21,6 +21,8 @@ env:
   ATOM_BRANCH: "main"
   ATOM_REPOSITORY_URL: "ROCm/ATOM"
   BASE_IMAGE: "rocm/atom-dev:latest"
+  DOCKER_PULL_MAX_ATTEMPTS: "3"
+  DOCKER_PULL_RETRY_DELAY_SECONDS: "10"
   GITHUB_REPO_URL: ${{ github.event.pull_request.head.repo.clone_url || 'https://github.com/ROCm/Aiter.git' }}
   GITHUB_COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.event.head_commit.id }}
 
@@ -95,7 +97,18 @@ jobs:
       - name: Download the ATOM base image
         if: matrix.run_on_pr == true || github.event_name != 'pull_request'
         run: |
-          docker pull ${{ env.BASE_IMAGE }}
+          for attempt in $(seq 1 "${DOCKER_PULL_MAX_ATTEMPTS}"); do
+            if docker pull "${{ env.BASE_IMAGE }}"; then
+              echo "Docker pull succeeded on attempt ${attempt}"
+              exit 0
+            fi
+            echo "Docker pull attempt ${attempt} failed"
+            if [ "${attempt}" -lt "${DOCKER_PULL_MAX_ATTEMPTS}" ]; then
+              sleep "${DOCKER_PULL_RETRY_DELAY_SECONDS}"
+            fi
+          done
+          echo "Docker pull failed after ${DOCKER_PULL_MAX_ATTEMPTS} attempts" >&2
+          exit 1
 
       - name: Generate Dockerfile
         if: matrix.run_on_pr == true || github.event_name != 'pull_request'
diff --git a/.github/workflows/flash_attention_integration.yaml b/.github/workflows/flash_attention_integration.yaml
index a441a27ce3..e43b5dd601 100644
--- a/.github/workflows/flash_attention_integration.yaml
+++ b/.github/workflows/flash_attention_integration.yaml
@@ -34,6 +34,8 @@ env:
   FA_REPOSITORY_URL: https://github.com/Dao-AILab/flash-attention.git
   BASE_IMAGE: rocm/pytorch:latest@sha256:683765a52c61341e1674fe730ab3be861a444a45a36c0a8caae7653a08a0e208
   AITER_SUBMODULE_PATH: third_party/aiter
+  DOCKER_PULL_MAX_ATTEMPTS: "3"
+  DOCKER_PULL_RETRY_DELAY_SECONDS: "10"
 
 jobs:
   check-signal:
@@ -133,7 +135,7 @@ jobs:
           exit 0
 
       - name: Pull base image
-        run: docker pull ${{ env.BASE_IMAGE }}
+        run: bash .github/scripts/docker_pull_with_retry.sh "${{ env.BASE_IMAGE }}"
 
       - name: Generate Dockerfile
         run: |
@@ -341,7 +343,7 @@ jobs:
           exit 0
 
       - name: Pull base image
-        run: docker pull ${{ env.BASE_IMAGE }}
+        run: bash .github/scripts/docker_pull_with_retry.sh "${{ env.BASE_IMAGE }}"
 
       - name: Generate Dockerfile
         run: |
diff --git a/.github/workflows/vllm_benchmark.yaml b/.github/workflows/vllm_benchmark.yaml
index 8272c043fd..c1402660ad 100644
--- a/.github/workflows/vllm_benchmark.yaml
+++ b/.github/workflows/vllm_benchmark.yaml
@@ -16,6 +16,8 @@ env:
   VLLM_BRANCH: "main"
   VLLM_REPOSITORY_URL: "https://github.com/vllm-project/vllm"
   BASE_IMAGE: rocm/vllm-dev:nightly
+  DOCKER_PULL_MAX_ATTEMPTS: "3"
+  DOCKER_PULL_RETRY_DELAY_SECONDS: "10"
   AITER_WHEEL_ARTIFACT_NAME: aiter-vllm-whl-${{ github.run_id }}
   GITHUB_REPO_URL: ${{ github.event.pull_request.head.repo.clone_url || 'https://github.com/ROCm/aiter.git' }}
   GITHUB_COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
@@ -81,7 +83,7 @@ jobs:
       - name: Build Aiter wheel in vLLM base image
         run: |
           set -euo pipefail
-          docker pull ${{ env.BASE_IMAGE }}
+          bash .github/scripts/docker_pull_with_retry.sh "${{ env.BASE_IMAGE }}"
           docker run --rm \
             --network=host \
             -v "${{ github.workspace }}:/workspace" \
@@ -144,6 +146,14 @@ jobs:
             extra_env_args: -e VLLM_USE_TRITON_FLASH_ATTN=0
 
     steps:
+      - name: Checkout Docker pull retry helper
+        uses: actions/checkout@v6
+        with:
+          repository: ${{ github.event.pull_request.head.repo.full_name || github.repository }}
+          ref: ${{ github.event.pull_request.head.sha || github.sha }}
+          sparse-checkout: |
+            .github/scripts
+
       - name: Download Aiter wheel artifact
         uses: actions/download-artifact@v4
         with:
@@ -170,7 +180,7 @@ jobs:
 
       - name: Download the vLLM base image
         run: |
-          docker pull ${{ env.BASE_IMAGE }}
+          bash .github/scripts/docker_pull_with_retry.sh "${{ env.BASE_IMAGE }}"
 
       - name: Run benchmarks
         run: |