From 21238381d41818c3c88026af90c9d4564a0c4df8 Mon Sep 17 00:00:00 2001 From: Aurelio <19254254+Aureliolo@users.noreply.github.com> Date: Sun, 24 May 2026 16:55:46 +0200 Subject: [PATCH 1/3] ci: retry cosign sign on transient GHCR/Rekor failures --- .github/actions/publish-apko-base/action.yml | 13 +-- .../actions/publish-image-loaded/action.yml | 15 ++-- .github/scripts/cosign_sign_with_retry.sh | 87 +++++++++++++++++++ 3 files changed, 103 insertions(+), 12 deletions(-) create mode 100644 .github/scripts/cosign_sign_with_retry.sh diff --git a/.github/actions/publish-apko-base/action.yml b/.github/actions/publish-apko-base/action.yml index 7cfbe90db4..7d5629f9f5 100644 --- a/.github/actions/publish-apko-base/action.yml +++ b/.github/actions/publish-apko-base/action.yml @@ -157,6 +157,12 @@ runs: if: steps.cosign_2.outcome == 'failure' uses: sigstore/cosign-installer@6f9f17788090df1f26f669e9d70d6ae9567deba6 # v4.1.2 + # Wrap the cosign call in cosign_sign_with_retry.sh so a GHCR / + # Rekor / Fulcio 5xx during signature upload retries on the same + # budget every docker push already gets via docker_push_with_retry.sh. + # The helper preserves the `createLogEntryConflict` idempotency + # branch (already-signed digest -> success) and only retries on + # the shared transient regex. - name: Sign image shell: bash env: @@ -165,11 +171,8 @@ runs: run: | set -euo pipefail if [ -z "$DIGEST" ]; then echo "::error::No digest"; exit 1; fi - if ! 
cosign sign --yes "ghcr.io/aureliolo/synthorg-${IMAGE_NAME}-base@${DIGEST}" 2>&1 | tee /tmp/cosign-out; then - if grep -q 'createLogEntryConflict' /tmp/cosign-out; then - echo "::notice::Image already signed -- skipping" - else cat /tmp/cosign-out; exit 1; fi - fi + bash "${GITHUB_WORKSPACE}/.github/scripts/cosign_sign_with_retry.sh" \ + "ghcr.io/aureliolo/synthorg-${IMAGE_NAME}-base@${DIGEST}" # Confirm the input tag still resolves to the digest cosign just # signed and that the cosign signature artifact is reachable on diff --git a/.github/actions/publish-image-loaded/action.yml b/.github/actions/publish-image-loaded/action.yml index 86e4d7c1cc..f17fd2ec47 100644 --- a/.github/actions/publish-image-loaded/action.yml +++ b/.github/actions/publish-image-loaded/action.yml @@ -240,6 +240,12 @@ runs: if: steps.cosign_2.outcome == 'failure' uses: sigstore/cosign-installer@6f9f17788090df1f26f669e9d70d6ae9567deba6 # v4.1.2 + # Wrap the cosign call in cosign_sign_with_retry.sh so a GHCR / + # Rekor / Fulcio 5xx during signature upload retries on the same + # budget every docker push already gets via docker_push_with_retry.sh. + # The helper preserves the `createLogEntryConflict` idempotency + # branch (already-signed digest -> success) and only retries on + # the shared transient regex. - name: Sign image shell: bash env: @@ -251,13 +257,8 @@ runs: echo "::error::Push step did not produce a digest -- cannot sign image" exit 1 fi - if ! cosign sign --yes "ghcr.io/aureliolo/synthorg-${IMAGE_NAME}@${DIGEST}" 2>&1 | tee /tmp/cosign-out; then - if grep -q 'createLogEntryConflict' /tmp/cosign-out; then - echo "::notice::Image already signed in transparency log -- skipping" - else - exit 1 - fi - fi + bash "${GITHUB_WORKSPACE}/.github/scripts/cosign_sign_with_retry.sh" \ + "ghcr.io/aureliolo/synthorg-${IMAGE_NAME}@${DIGEST}" # Belt-and-braces: confirm every tag pushed in this step resolves # (in the registry, NOW) to the same digest cosign just signed. 
A diff --git a/.github/scripts/cosign_sign_with_retry.sh b/.github/scripts/cosign_sign_with_retry.sh new file mode 100644 index 0000000000..58997ef280 --- /dev/null +++ b/.github/scripts/cosign_sign_with_retry.sh @@ -0,0 +1,87 @@ +#!/usr/bin/env bash +# Retry cosign sign on transient registry / Rekor / Fulcio failures. +# +# `cosign sign` against a published digest is idempotent: signing the +# same digest twice either succeeds again or hits a Rekor +# `createLogEntryConflict` (already-logged) response, which callers +# treat as success. Transient GHCR/Rekor/Fulcio errors (5xx, 429, TLS +# handshake stalls, connection resets) almost always settle inside the +# next attempt window, so a bounded retry turns a noisy infra blip +# into a green run instead of failing the whole Docker workflow. +# +# Usage: +# cosign_sign_with_retry.sh <ref> +# <ref> is the full image reference, e.g. +# ghcr.io/aureliolo/synthorg-sandbox-base@sha256:abc... +# +# Behaviour: +# - Captures combined stdout+stderr of `cosign sign --yes <ref>`. +# - On exit 0: prints captured output, exits 0. +# - On exit non-0: +# * Output contains `createLogEntryConflict` -> already signed, +# emit `::notice::` and exit 0 (preserves the idempotency branch +# the inline shell blocks used to carry). +# * Output matches the shared transient regex sourced from +# docker_push_with_retry.sh (single source of truth for +# "is this a registry-side flake?") -> warn + sleep + retry +# with exponential backoff. +# * Otherwise -> non-transient cosign / Rekor / Fulcio error, +# surface output and exit with cosign's exit code. +# - Final attempt with no terminal classification: surface all +# output and exit with cosign's exit code. +set -euo pipefail + +REF="${1:?usage: cosign_sign_with_retry.sh }" + +# Same regex the docker push helper uses; `--print-transient-re` keeps +# both scripts in lockstep so a new transient signature added in one +# place automatically protects every signing call too. 
+SCRIPT_DIR="$(dirname "$(readlink -f "$0")")" +TRANSIENT_RE="$("$SCRIPT_DIR/docker_push_with_retry.sh" --print-transient-re)" + +# 4 attempts, backoff 15s -> 30s -> 60s = ~1m45s of wait in the worst +# case before the final attempt. Matches docker_push_with_retry.sh so +# cosign rides through GHCR's typical 30-90s unicorn windows under the +# same budget the push step already does. +ATTEMPTS=4 +BACKOFF=15 + +for ((i = 1; i <= ATTEMPTS; i++)); do + out="" + rc=0 + out="$(cosign sign --yes "$REF" 2>&1)" || rc=$? + if [ "$rc" -eq 0 ]; then + printf '%s\n' "$out" + exit 0 + fi + + # Idempotency branch: a re-sign that lost the createLogEntry race is + # success, not a transient error. Check this BEFORE the regex so + # attempt 1 -> 5xx -> attempt 2 -> conflict resolves cleanly. + if printf '%s' "$out" | grep -q 'createLogEntryConflict'; then + printf '%s\n' "$out" + echo "::notice::Image ${REF} already signed -- skipping" + exit 0 + fi + + if [ "$i" -eq "$ATTEMPTS" ]; then + printf '%s\n' "$out" + echo "::error::cosign sign ${REF} failed after ${ATTEMPTS} attempts (last exit ${rc})" >&2 + exit "$rc" + fi + + if printf '%s' "$out" | grep -qiE "$TRANSIENT_RE"; then + printf '%s\n' "$out" >&2 + echo "::warning::cosign sign ${REF} hit transient error (attempt ${i}/${ATTEMPTS}, rc=${rc}); sleeping ${BACKOFF}s before retry" >&2 + sleep "$BACKOFF" + BACKOFF=$((BACKOFF * 2)) + continue + fi + + # Non-transient: surface output and bubble up immediately. Auth + # denials, malformed digests, Rekor schema rejections, etc. will + # never improve on a retry. 
+ printf '%s\n' "$out" + echo "::error::cosign sign ${REF} failed with non-transient error (exit ${rc}); not retrying" >&2 + exit "$rc" +done From edb4a83e8b86d7d06776135a0ef344e991e77ce4 Mon Sep 17 00:00:00 2001 From: Aurelio <19254254+Aureliolo@users.noreply.github.com> Date: Sun, 24 May 2026 17:13:11 +0200 Subject: [PATCH 2/3] ci: strip pre-existing comment rot in publish-image-loaded action --- .../actions/publish-image-loaded/action.yml | 34 +++++++++---------- 1 file changed, 16 insertions(+), 18 deletions(-) diff --git a/.github/actions/publish-image-loaded/action.yml b/.github/actions/publish-image-loaded/action.yml index f17fd2ec47..df63403f88 100644 --- a/.github/actions/publish-image-loaded/action.yml +++ b/.github/actions/publish-image-loaded/action.yml @@ -14,16 +14,15 @@ description: >- the expected names). This action then runs the security-critical publish path that must be byte-identical across every caller. - Background on the digest-pinning: a previous version used - `${REPO}:sha-X-amd64` tag refs in `docker manifest create`. - `docker manifest create` re-resolves tag references against the - registry on every call, so when a concurrent main-push run overwrote - the per-arch tag between two iterations of a tag-push run's loop, - the manifest lists for different destination tags ended up with - different per-arch digests. cosign signed only the last one, leaving - the user-facing version tag unsigned. Capturing the per-arch digest - immediately after push and referencing manifest list members by - `@sha256:digest` makes every iteration produce byte-identical bytes. + Why digest-pinning: `docker manifest create` re-resolves tag + references against the registry on every call. 
When a concurrent + main-push run overwrites a per-arch tag between two iterations of a + tag-push run's loop, the manifest lists for different destination + tags end up with different per-arch digests, and cosign signs only + the last one, leaving the user-facing version tag unsigned. + Capturing the per-arch digest immediately after push and referencing + manifest list members by `@sha256:digest` makes every iteration + produce byte-identical bytes, immune to concurrent tag mutations. Caller responsibilities: - `packages: write`, `id-token: write`, `attestations: write` @@ -61,11 +60,11 @@ outputs: runs: using: composite steps: - # GHCR's token endpoint occasionally exceeds the docker client's default - # 30s deadline (the failure mode that took down main 2026-05-13). Drive - # `docker login` directly through nick-fields/retry so the login is - # re-attempted on `context deadline exceeded`. Token via env + stdin so - # it never lands on a command line. + # GHCR's token endpoint occasionally exceeds the docker client's + # default 30s deadline. Drive `docker login` directly through + # nick-fields/retry so the login is re-attempted on `context + # deadline exceeded`. Token via env + stdin so it never lands on + # a command line. - name: Log in to GHCR uses: nick-fields/retry@ad984534de44a9489a53aefd81eb77f87c70dc60 # v4.0.0 env: @@ -109,9 +108,8 @@ runs: # # Note: the manifest-list digest below is still read via # `docker buildx imagetools inspect` because `docker manifest - # push` stdout is unreliable across Docker versions (PR #1650 - # documented this: some versions emit the digest, some only - # print `Created manifest list`). + # push` stdout is unreliable across Docker versions: some emit + # the digest, some only print `Created manifest list`. 
- name: Push per-arch images and assemble manifest list id: push shell: bash From 67877f8f68fad67fbfd70ebfd6bb7b7da555d08d Mon Sep 17 00:00:00 2001 From: Aurelio <19254254+Aureliolo@users.noreply.github.com> Date: Sun, 24 May 2026 17:27:25 +0200 Subject: [PATCH 3/3] fix: babysit round 1, address 3 gemini findings on cosign retry helper G1 (CRITICAL): invoke sibling docker_push_with_retry.sh via bash (script ships 100644, bare invocation would fail Permission denied on Linux runner). G2: printf -- '%s\n' for hyphen-safety and grep compat. G3: guard grep -E against empty TRANSIENT_RE (would otherwise match every line and retry non-transient errors). --- .github/scripts/cosign_sign_with_retry.sh | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/.github/scripts/cosign_sign_with_retry.sh b/.github/scripts/cosign_sign_with_retry.sh index 58997ef280..22959dadb3 100644 --- a/.github/scripts/cosign_sign_with_retry.sh +++ b/.github/scripts/cosign_sign_with_retry.sh @@ -37,7 +37,12 @@ REF="${1:?usage: cosign_sign_with_retry.sh }" # both scripts in lockstep so a new transient signature added in one # place automatically protects every signing call too. SCRIPT_DIR="$(dirname "$(readlink -f "$0")")" -TRANSIENT_RE="$("$SCRIPT_DIR/docker_push_with_retry.sh" --print-transient-re)" +# Invoke the sibling helper via `bash` rather than relying on the +# execute bit. Both scripts ship as git mode 100644 (no execute bit), +# matching the existing publish-* action call sites which always use +# `bash "$RETRY" ...`; a bare `"$SCRIPT_DIR/..."` would fail +# "Permission denied" on the Linux runner. +TRANSIENT_RE="$(bash "$SCRIPT_DIR/docker_push_with_retry.sh" --print-transient-re)" # 4 attempts, backoff 15s -> 30s -> 60s = ~1m45s of wait in the worst # case before the final attempt. 
Matches docker_push_with_retry.sh so @@ -58,7 +63,7 @@ for ((i = 1; i <= ATTEMPTS; i++)); do # Idempotency branch: a re-sign that lost the createLogEntry race is # success, not a transient error. Check this BEFORE the regex so # attempt 1 -> 5xx -> attempt 2 -> conflict resolves cleanly. - if printf '%s' "$out" | grep -q 'createLogEntryConflict'; then + if printf -- '%s\n' "$out" | grep -q 'createLogEntryConflict'; then printf '%s\n' "$out" echo "::notice::Image ${REF} already signed -- skipping" exit 0 @@ -70,7 +75,10 @@ for ((i = 1; i <= ATTEMPTS; i++)); do exit "$rc" fi - if printf '%s' "$out" | grep -qiE "$TRANSIENT_RE"; then + # Guard against an empty `$TRANSIENT_RE` (e.g. sibling helper drift + # that silently prints nothing) - `grep -E ""` matches every line, + # which would retry auth failures and other non-transient errors. + if [[ -n "$TRANSIENT_RE" ]] && printf -- '%s\n' "$out" | grep -qiE "$TRANSIENT_RE"; then printf '%s\n' "$out" >&2 echo "::warning::cosign sign ${REF} hit transient error (attempt ${i}/${ATTEMPTS}, rc=${rc}); sleeping ${BACKOFF}s before retry" >&2 sleep "$BACKOFF"