Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 28 additions & 0 deletions .github/scripts/docker_pull_with_retry.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
#!/bin/bash
#
# Pull a Docker image, retrying when the pull fails.
#
# Usage: docker_pull_with_retry.sh <image>
#
# Environment:
#   DOCKER_PULL_MAX_ATTEMPTS         Total number of pull attempts; must be a
#                                    positive integer (default: 3).
#   DOCKER_PULL_RETRY_DELAY_SECONDS  Seconds to sleep between attempts; must be
#                                    a non-negative number (default: 10).
#
# Exit status:
#   0  the image was pulled successfully
#   1  every attempt failed
#   2  wrong number of arguments or invalid configuration

set -euo pipefail

if [ "$#" -ne 1 ]; then
  echo "Usage: $0 <image>" >&2
  exit 2
fi

IMAGE="$1"
MAX_ATTEMPTS="${DOCKER_PULL_MAX_ATTEMPTS:-3}"
RETRY_DELAY_SECONDS="${DOCKER_PULL_RETRY_DELAY_SECONDS:-10}"

# Validate the configuration before the first pull. Without this, a bad
# attempt count would skip the loop entirely, and a bad delay would make
# `sleep` fail under `set -e` and abort the script after the first failed
# pull instead of retrying.
if ! [[ "${MAX_ATTEMPTS}" =~ ^[0-9]+$ ]] || (( 10#${MAX_ATTEMPTS} < 1 )); then
  echo "DOCKER_PULL_MAX_ATTEMPTS must be a positive integer, got '${MAX_ATTEMPTS}'" >&2
  exit 2
fi
if ! [[ "${RETRY_DELAY_SECONDS}" =~ ^[0-9]+(\.[0-9]+)?$ ]]; then
  echo "DOCKER_PULL_RETRY_DELAY_SECONDS must be a non-negative number, got '${RETRY_DELAY_SECONDS}'" >&2
  exit 2
fi

# Force base 10 so a zero-padded value such as "08" is not parsed as octal
# by the arithmetic comparisons below.
MAX_ATTEMPTS=$(( 10#${MAX_ATTEMPTS} ))

for (( attempt = 1; attempt <= MAX_ATTEMPTS; attempt++ )); do
  echo "Pulling Docker image '${IMAGE}' (attempt ${attempt}/${MAX_ATTEMPTS})"
  if docker pull "${IMAGE}"; then
    echo "Docker pull succeeded for '${IMAGE}' on attempt ${attempt}"
    exit 0
  fi

  # Only wait when another attempt will follow; the final failure falls
  # straight through to the error report below.
  if (( attempt < MAX_ATTEMPTS )); then
    echo "Docker pull failed for '${IMAGE}' on attempt ${attempt}; retrying in ${RETRY_DELAY_SECONDS}s..." >&2
    sleep "${RETRY_DELAY_SECONDS}"
  fi
done

echo "Docker pull failed for '${IMAGE}' after ${MAX_ATTEMPTS} attempts" >&2
exit 1
15 changes: 14 additions & 1 deletion .github/workflows/atom-test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ env:
ATOM_BRANCH: "main"
ATOM_REPOSITORY_URL: "ROCm/ATOM"
BASE_IMAGE: "rocm/atom-dev:latest"
DOCKER_PULL_MAX_ATTEMPTS: "3"
DOCKER_PULL_RETRY_DELAY_SECONDS: "10"
GITHUB_REPO_URL: ${{ github.event.pull_request.head.repo.clone_url || 'https://github.com/ROCm/Aiter.git' }}
GITHUB_COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.event.head_commit.id }}

Expand Down Expand Up @@ -95,7 +97,18 @@ jobs:
- name: Download the ATOM base image
if: matrix.run_on_pr == true || github.event_name != 'pull_request'
run: |
docker pull ${{ env.BASE_IMAGE }}
# Retry the base-image pull so that a single failed pull does not fail the job.
# BASE_IMAGE is read from the step environment (it is set in the workflow-level
# env block) instead of being spliced into the script through a workflow
# expression, matching how the DOCKER_PULL_* settings are already consumed.
for attempt in $(seq 1 "${DOCKER_PULL_MAX_ATTEMPTS}"); do
  if docker pull "${BASE_IMAGE}"; then
    echo "Docker pull succeeded on attempt ${attempt}"
    exit 0
  fi
  echo "Docker pull attempt ${attempt} failed" >&2
  # Sleep only when another attempt will follow.
  if [ "${attempt}" -lt "${DOCKER_PULL_MAX_ATTEMPTS}" ]; then
    sleep "${DOCKER_PULL_RETRY_DELAY_SECONDS}"
  fi
done
echo "Docker pull failed after ${DOCKER_PULL_MAX_ATTEMPTS} attempts" >&2
exit 1

- name: Generate Dockerfile
if: matrix.run_on_pr == true || github.event_name != 'pull_request'
Expand Down
6 changes: 4 additions & 2 deletions .github/workflows/flash_attention_integration.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@ env:
FA_REPOSITORY_URL: https://github.com/Dao-AILab/flash-attention.git
BASE_IMAGE: rocm/pytorch:latest@sha256:683765a52c61341e1674fe730ab3be861a444a45a36c0a8caae7653a08a0e208
AITER_SUBMODULE_PATH: third_party/aiter
DOCKER_PULL_MAX_ATTEMPTS: "3"
DOCKER_PULL_RETRY_DELAY_SECONDS: "10"

jobs:
check-signal:
Expand Down Expand Up @@ -133,7 +135,7 @@ jobs:
exit 0

- name: Pull base image
run: docker pull ${{ env.BASE_IMAGE }}
run: bash .github/scripts/docker_pull_with_retry.sh "${{ env.BASE_IMAGE }}"

- name: Generate Dockerfile
run: |
Expand Down Expand Up @@ -341,7 +343,7 @@ jobs:
exit 0

- name: Pull base image
run: docker pull ${{ env.BASE_IMAGE }}
run: bash .github/scripts/docker_pull_with_retry.sh "${{ env.BASE_IMAGE }}"

- name: Generate Dockerfile
run: |
Expand Down
14 changes: 12 additions & 2 deletions .github/workflows/vllm_benchmark.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ env:
VLLM_BRANCH: "main"
VLLM_REPOSITORY_URL: "https://github.com/vllm-project/vllm"
BASE_IMAGE: rocm/vllm-dev:nightly
DOCKER_PULL_MAX_ATTEMPTS: "3"
DOCKER_PULL_RETRY_DELAY_SECONDS: "10"
AITER_WHEEL_ARTIFACT_NAME: aiter-vllm-whl-${{ github.run_id }}
GITHUB_REPO_URL: ${{ github.event.pull_request.head.repo.clone_url || 'https://github.com/ROCm/aiter.git' }}
GITHUB_COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
Expand Down Expand Up @@ -81,7 +83,7 @@ jobs:
- name: Build Aiter wheel in vLLM base image
run: |
set -euo pipefail
docker pull ${{ env.BASE_IMAGE }}
bash .github/scripts/docker_pull_with_retry.sh "${{ env.BASE_IMAGE }}"
docker run --rm \
--network=host \
-v "${{ github.workspace }}:/workspace" \
Expand Down Expand Up @@ -144,6 +146,14 @@ jobs:
extra_env_args: -e VLLM_USE_TRITON_FLASH_ATTN=0

steps:
- name: Checkout Docker pull retry helper
uses: actions/checkout@v6
with:
repository: ${{ github.event.pull_request.head.repo.full_name || github.repository }}
ref: ${{ github.event.pull_request.head.sha || github.sha }}
sparse-checkout: |
.github/scripts

- name: Download Aiter wheel artifact
uses: actions/download-artifact@v4
with:
Expand All @@ -170,7 +180,7 @@ jobs:

- name: Download the vLLM base image
run: |
docker pull ${{ env.BASE_IMAGE }}
bash .github/scripts/docker_pull_with_retry.sh "${{ env.BASE_IMAGE }}"

- name: Run benchmarks
run: |
Expand Down
Loading