From ed688a2145651a30425faa59e11bfaaf1b735810 Mon Sep 17 00:00:00 2001
From: Justin Pierce
Date: Tue, 19 Aug 2025 10:45:14 -0400
Subject: [PATCH] Add retries for external dependencies

Wrap three steps that depend on external services in retry loops:

- 01_install_requirements.sh: retry `dnf -y upgrade --nobest` up to 5
  times, cleaning the dnf cache and backing off (15s x attempt number)
  between attempts.
- ocp_install_env.sh: retry `oc adm release extract` up to 5 times,
  10s apart, using a fresh temporary directory for each attempt and
  removing the directory of every failed attempt.
- utils.sh: `podman pull` the registry image up to 5 times, 10s apart,
  before starting the registry container.

No cleanup or sleep is performed after the final failed attempt; each
loop reports the failure on stderr and then exits or returns 1.

---
Review notes: leading whitespace was reconstructed from a
whitespace-collapsed copy of this patch. If the context lines do not
match the tree, apply with `git apply --ignore-whitespace`. The
post-image blob ids on the `index` lines predate the review changes.

 01_install_requirements.sh | 27 ++++++++++++++++++++++++++-
 ocp_install_env.sh         | 29 ++++++++++++++++++++++++++---
 utils.sh                   | 20 ++++++++++++++++++++
 3 files changed, 72 insertions(+), 4 deletions(-)

diff --git a/01_install_requirements.sh b/01_install_requirements.sh
index 8fe354c05..3f5eaad5e 100755
--- a/01_install_requirements.sh
+++ b/01_install_requirements.sh
@@ -40,7 +40,32 @@ sudo dnf -y clean all
 old_version=$(sudo dnf info NetworkManager | grep Version | cut -d ':' -f 2)
 
 # Update to latest packages first
-sudo dnf -y upgrade --nobest
+# Retry the upgrade so that a transient failure does not abort the install.
+# Number of attempts
+MAX_RETRIES=5
+# Delay between attempts (in seconds), multiplied by the attempt number
+_YUM_RETRY_BACKOFF=15
+
+attempt=1
+while (( attempt <= MAX_RETRIES )); do
+  if sudo dnf -y upgrade --nobest; then
+    echo "System upgraded successfully."
+    break
+  fi
+
+  # Out of attempts: fail now rather than cleaning the cache and sleeping first.
+  if (( attempt == MAX_RETRIES )); then
+    echo "ERROR: Failed to upgrade system after $MAX_RETRIES attempts." >&2
+    exit 1
+  fi
+
+  echo "Upgrade failed (attempt $attempt). Cleaning cache and retrying..." >&2
+  # Cache cleanup is best-effort: do not let it abort the retries if errexit is set.
+  sudo dnf clean all || true
+  sudo rm -rf /var/cache/dnf/* || true
+  sleep $(( _YUM_RETRY_BACKOFF * attempt ))
+  attempt=$(( attempt + 1 ))
+done
 
 new_version=$(sudo dnf info NetworkManager | grep Version | cut -d ':' -f 2)
 # If NetworkManager was upgraded it needs to be restarted
diff --git a/ocp_install_env.sh b/ocp_install_env.sh
index 241c62054..521ca000d 100644
--- a/ocp_install_env.sh
+++ b/ocp_install_env.sh
@@ -19,15 +19,38 @@ function extract_command() {
     local cmd
     local outdir
     local extract_dir
+    local attempt
+    local MAX_RETRIES=5
+    local SLEEP_BETWEEN=10
 
     cmd="$1"
     release_image="$2"
     outdir="$3"
 
-    extract_dir=$(mktemp --tmpdir -d "installer--XXXXXXXXXX")
-    _tmpfiles="$_tmpfiles $extract_dir"
+    # Retry loop for oc adm release extract to handle quay.io blips
+    for (( attempt = 1; attempt <= MAX_RETRIES; attempt++ )); do
+        # Fresh directory per attempt so a partial extraction is never reused.
+        extract_dir=$(mktemp --tmpdir -d "installer--XXXXXXXXXX") || return 1
 
-    oc adm release extract --registry-config "${PULL_SECRET_FILE}" --command=$cmd --to "${extract_dir}" ${release_image}
+        if oc adm release extract --registry-config "${PULL_SECRET_FILE}" --command="$cmd" --to "${extract_dir}" "${release_image}"; then
+            echo "Successfully extracted $cmd"
+            break
+        fi
+
+        # Remove the failed attempt's directory on every failure, including the
+        # last one: it has not been added to _tmpfiles yet.
+        rm -rf -- "${extract_dir}"
+
+        if (( attempt == MAX_RETRIES )); then
+            echo "Failed to extract $cmd from ${release_image} after $MAX_RETRIES attempts" >&2
+            return 1
+        fi
+
+        echo "Extraction failed, retrying in ${SLEEP_BETWEEN}s..." >&2
+        sleep "${SLEEP_BETWEEN}"
+    done
+
+    _tmpfiles="$_tmpfiles $extract_dir"
 
     if [[ $cmd == "oc.rhel8" ]]; then
         cmd="oc"
diff --git a/utils.sh b/utils.sh
index 454c28cf2..73382956b 100755
--- a/utils.sh
+++ b/utils.sh
@@ -617,6 +617,26 @@ EOF
     if [[ "$reg_state" != "running" || $restart_registry -eq 1 ]]; then
         sudo podman rm registry -f || true
 
+        # Prefixed names so this loop cannot clobber MAX_RETRIES/attempt used elsewhere.
+        _PULL_MAX_RETRIES=5
+        _PULL_RETRY_DELAY=10
+
+        # Try pulling the image first to tolerate quay.io errors like 504s.
+        for (( _pull_attempt = 1; _pull_attempt <= _PULL_MAX_RETRIES; _pull_attempt++ )); do
+            if sudo podman pull "${DOCKER_REGISTRY_IMAGE}"; then
+                echo "Successfully pulled ${DOCKER_REGISTRY_IMAGE}"
+                break
+            fi
+
+            if (( _pull_attempt == _PULL_MAX_RETRIES )); then
+                echo "Failed to pull ${DOCKER_REGISTRY_IMAGE} after $_PULL_MAX_RETRIES attempts" >&2
+                exit 1
+            fi
+
+            echo "Pull failed, retrying in ${_PULL_RETRY_DELAY}s..." >&2
+            sleep "${_PULL_RETRY_DELAY}"
+        done
+
         sudo podman run -d --name registry --net=host --privileged \
             -v ${REGISTRY_DIR}/data:/var/lib/registry:z \
             -v ${REGISTRY_DIR}/auth:/auth:z \