diff --git a/pkg/controller/build/assets/buildah-build.sh b/pkg/controller/build/assets/buildah-build.sh
index 0c77101ee9..29c1a7fc01 100644
--- a/pkg/controller/build/assets/buildah-build.sh
+++ b/pkg/controller/build/assets/buildah-build.sh
@@ -8,6 +8,34 @@ set -xeuo
 ETC_PKI_ENTITLEMENT_MOUNTPOINT="${ETC_PKI_ENTITLEMENT_MOUNTPOINT:-}"
 ETC_PKI_RPM_GPG_MOUNTPOINT="${ETC_PKI_RPM_GPG_MOUNTPOINT:-}"
 ETC_YUM_REPOS_D_MOUNTPOINT="${ETC_YUM_REPOS_D_MOUNTPOINT:-}"
+MAX_RETRIES="${MAX_RETRIES:-3}"
+
+# Run a command until it exits successfully, making at most MAX_RETRIES
+# attempts in total (the command always runs at least once).
+# Adapted from https://gist.github.com/sj26/88e1c6584397bb7c13bd11108a579746
+#
+# Globals:   MAX_RETRIES (read) - maximum number of attempts.
+# Arguments: the command to run, followed by its arguments.
+# Outputs:   progress messages on stderr.
+# Returns:   0 once the command succeeds, otherwise the exit status of the
+#            final failed attempt.
+retry() {
+	local count=0
+	# Local, so the captured status never leaks into the rest of the script.
+	local rc=0
+
+	until "$@"; do
+		rc=$?
+		count=$((count + 1))
+		if [ "$count" -lt "$MAX_RETRIES" ]; then
+			echo "Retry $count/$MAX_RETRIES exited $rc, retrying..." >&2
+		else
+			echo "Retry $count/$MAX_RETRIES exited $rc, no more retries left." >&2
+			return "$rc"
+		fi
+	done
+	return 0
+}
 
 build_context="$HOME/context"
 
@@ -69,10 +97,10 @@ if [[ -n "$ETC_PKI_RPM_GPG_MOUNTPOINT" ]] && [[ -d "$ETC_PKI_RPM_GPG_MOUNTPOINT"
 fi
 
 # Build our image.
-buildah bud "${build_args[@]}" "$build_context"
+retry buildah bud "${build_args[@]}" "$build_context"
 
 # Push our built image.
-buildah push \
+retry buildah push \
 	--storage-driver vfs \
 	--authfile="$FINAL_IMAGE_PUSH_CREDS" \
 	--digestfile="/tmp/done/digestfile" \
diff --git a/pkg/controller/build/assets/wait.sh b/pkg/controller/build/assets/wait.sh
index bf1675fe63..e90418f186 100644
--- a/pkg/controller/build/assets/wait.sh
+++ b/pkg/controller/build/assets/wait.sh
@@ -4,12 +4,14 @@
 # within the Build Controller binary (see //go:embed) and injected into a
 # custom build pod.
 
-# Wait until the done file appears.
+# Wait until the digestfile appears. The presence of this file indicates
+# that the build operation is complete.
 while [ ! -f "/tmp/done/digestfile" ]
 do
 	sleep 1
 done
 
+# Inject the contents of the digestfile into a ConfigMap.
oc create configmap \ "$DIGEST_CONFIGMAP_NAME" \ --namespace openshift-machine-config-operator \ diff --git a/pkg/controller/build/image_build_request.go b/pkg/controller/build/image_build_request.go index 6301f4c379..2cc999f79c 100644 --- a/pkg/controller/build/image_build_request.go +++ b/pkg/controller/build/image_build_request.go @@ -197,6 +197,17 @@ func (i ImageBuildRequest) toBuildPod() *corev1.Pod { // nolint:dupl // I don't want to deduplicate this yet since there are still some unknowns. func (i ImageBuildRequest) toBuildahPod() *corev1.Pod { env := []corev1.EnvVar{ + // How many times the build / push steps should be retried. In the future, + // this should be wired up to the MachineOSConfig or other higher-level + // API. This is useful for retrying builds / pushes when they fail due to a + // transient condition such as a temporary network issue. It does *NOT* + // handle situations where the build pod is evicted or rescheduled. A + // higher-level abstraction will be needed such as a Kubernetes Job + // (https://kubernetes.io/docs/concepts/workloads/controllers/job/). + { + Name: "MAX_RETRIES", + Value: "3", + }, { Name: "DIGEST_CONFIGMAP_NAME", Value: i.getDigestConfigMapName(),