From f112d2b562ba4c50e98b7640c9640518db5b7676 Mon Sep 17 00:00:00 2001 From: Lior Date: Sun, 24 May 2026 23:23:11 -0400 Subject: [PATCH 1/8] ci(infra): build installer ISO on PRs + main + release publish MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds .github/workflows/build-installer-iso.yml — Linux runner builds the .#installer-iso flake output on every PR touching flake.nix / flake.lock / infra/nixos/**, every push to main hitting those paths, manual workflow_dispatch, and release publish. Why on a Linux runner (not the existing macos-26 gate matrix): the ISO target is x86_64-linux. Building on macOS requires the nix-darwin linux-builder VM. Ubuntu-24.04 builds directly — faster, cheaper, no cross-compile path. Pipeline: 1. Checkout (full history for reproducible flake.lock pinning) 2. Install Nix via DeterminateSystems/nix-installer-action@v22 (SHA-pinned ef8a148080ab6020fd15196c2084a2eea5ff2d25) 3. Magic Nix cache action@v13 for /nix/store reuse across runs 4. nix flake metadata for the run summary 5. nix flake check --no-build (cheap eval-only fail-fast) 6. nix build .#installer-iso 7. Capture iso path/name/size/sha256, write to GITHUB_STEP_SUMMARY 8. 
Upload as workflow artifact (90-day retention, no re-compression) Second job (attach-to-release) runs only on release events: - Re-builds the ISO at the tag for build-from-source reproducibility - Uploads ISO + .sha256 to the release assets - permissions: contents: write scoped to this job only Security discipline: - Runner pinned to ubuntu-24.04 (not -latest), matches gate.yml - Third-party actions SHA-pinned with # vX.Y.Z trailing comments - Workflow-level permissions: contents: read; only attach-to-release elevates to write - github.event.release.tag_name (attacker-controllable) passed via env: RELEASE_TAG, never interpolated into run: shell — per the GitHub Actions injection guide flagged by the security-reminder PreToolUse hook - Concurrency cancel-in-progress only for PR events (main + release queue so every event gets a record) Benefits: - Maintainers can review a PR and grab the rebuilt ISO from the workflow run page — no local Nix required - flake.nix can't go stale silently; CI catches breakage - Releases automatically ship a downloadable ISO + checksum Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/workflows/build-installer-iso.yml | 160 ++++++++++++++++++++++ 1 file changed, 160 insertions(+) create mode 100644 .github/workflows/build-installer-iso.yml diff --git a/.github/workflows/build-installer-iso.yml b/.github/workflows/build-installer-iso.yml new file mode 100644 index 0000000000..eaefcd7540 --- /dev/null +++ b/.github/workflows/build-installer-iso.yml @@ -0,0 +1,160 @@ +# .github/workflows/build-installer-iso.yml +# +# Builds the Zeta cluster installer ISO from infra/nixos/hosts/installer/ +# via the repo-root flake. Runs on every PR that touches the flake/infra, +# every push to main, and on tag push (to attach the ISO to a Release). +# +# Why on a Linux runner and not the existing macOS gate matrix: +# The ISO target is `x86_64-linux`. 
Building it on macOS requires the +# nix-darwin `linux-builder` VM (Apple Virtualization.framework + +# Rosetta 2). That works locally for maintainers, but the gate CI +# already runs on ubuntu-24.04 — building directly there is faster, +# cheaper, and uses no cross-compile. +# +# Discipline (per .github/workflows/gate.yml): +# - Runner pinned to ubuntu-24.04 (not -latest) +# - Third-party actions SHA-pinned with trailing # vX.Y.Z comments +# - permissions: contents: read at workflow level (tag-push job +# elevates to contents: write only for the release-attach step) +# - Concurrency: workflow-scoped, cancel-in-progress only for PRs +# - github.event.* values that may be attacker-controlled (release +# tag names, etc.) are passed via env: not interpolated into +# run: lines, per the GitHub Actions injection guide. + +name: build-installer-iso + +on: + pull_request: + types: [opened, reopened, synchronize, ready_for_review] + paths: + - 'flake.nix' + - 'flake.lock' + - 'infra/nixos/**' + - '.github/workflows/build-installer-iso.yml' + push: + branches: [main] + paths: + - 'flake.nix' + - 'flake.lock' + - 'infra/nixos/**' + - '.github/workflows/build-installer-iso.yml' + workflow_dispatch: + release: + types: [published] + +permissions: + contents: read + +concurrency: + group: build-installer-iso-${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: ${{ github.event_name == 'pull_request' }} + +jobs: + build: + name: build-iso + runs-on: ubuntu-24.04 + timeout-minutes: 60 + steps: + - name: Checkout + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + # Need full history so the flake.lock pinning is reproducible + # and any `git describe` style versioning works. + fetch-depth: 0 + + - name: Install Nix + # Determinate Systems Nix installer — same one maintainers use + # locally. Enables flakes + nix-command by default. 
+ uses: DeterminateSystems/nix-installer-action@ef8a148080ab6020fd15196c2084a2eea5ff2d25 # v22 + + - name: Set up Nix store cache + # Reduces cold-build time from ~15min to ~3min by reusing the + # /nix/store across runs. Pull-only on PRs from forks to avoid + # cache-poisoning vectors. + uses: DeterminateSystems/magic-nix-cache-action@565684385bcd71bad329742eefe8d12f2e765b39 # v13 + + - name: Show flake metadata + run: nix flake metadata --json | jq '{description, lastModified, revision}' + + - name: Check flake evaluates + # Cheap eval-only check — catches typos, missing imports, + # undefined attributes before paying for a full build. + run: nix flake check --no-build --show-trace + + - name: Build installer ISO + # The actual build. Produces result/iso/zeta-installer-*.iso. + run: nix build .#installer-iso --print-build-logs + + - name: Locate ISO + capture metadata + id: iso + run: | + set -euo pipefail + iso_path=$(find result/iso -name 'zeta-installer-*.iso' | head -1) + iso_name=$(basename "$iso_path") + iso_size=$(stat -c%s "$iso_path" | numfmt --to=iec --suffix=B) + iso_sha256=$(sha256sum "$iso_path" | awk '{print $1}') + echo "path=$iso_path" >> "$GITHUB_OUTPUT" + echo "name=$iso_name" >> "$GITHUB_OUTPUT" + echo "size=$iso_size" >> "$GITHUB_OUTPUT" + echo "sha256=$iso_sha256" >> "$GITHUB_OUTPUT" + { + echo "## Installer ISO built" + echo "" + echo "| Field | Value |" + echo "|---|---|" + echo "| File | \`$iso_name\` |" + echo "| Size | $iso_size |" + echo "| SHA256 | \`$iso_sha256\` |" + } >> "$GITHUB_STEP_SUMMARY" + + - name: Upload ISO as workflow artifact + # Available for download from the workflow run page for ~90 days. + # Anyone reviewing the PR can grab it and dd it to a USB stick + # without needing Nix installed locally. 
+ uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3 + with: + name: ${{ steps.iso.outputs.name }} + path: ${{ steps.iso.outputs.path }} + if-no-files-found: error + retention-days: 90 + compression-level: 0 # ISO is already compressed; re-zipping wastes time + + attach-to-release: + name: attach-iso-to-release + needs: build + if: github.event_name == 'release' + runs-on: ubuntu-24.04 + timeout-minutes: 30 + permissions: + contents: write # needed to upload the ISO as a release asset + steps: + - name: Checkout + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + + - name: Install Nix + uses: DeterminateSystems/nix-installer-action@ef8a148080ab6020fd15196c2084a2eea5ff2d25 # v22 + + - name: Set up Nix store cache + uses: DeterminateSystems/magic-nix-cache-action@565684385bcd71bad329742eefe8d12f2e765b39 # v13 + + - name: Rebuild ISO for the tagged release + # Re-builds at the tag so the ISO ships exactly the source the + # release advertises. The build job above already produced one, + # but artifacts and release assets are different stores. + run: nix build .#installer-iso --print-build-logs + + - name: Upload ISO + SHA256 to the release + # Release tag_name is set by whoever created the release — + # treated as attacker-controlled per the GH Actions injection + # guide. Passed via env, never interpolated into the shell. 
+ env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + RELEASE_TAG: ${{ github.event.release.tag_name }} + run: | + set -euo pipefail + iso_path=$(find result/iso -name 'zeta-installer-*.iso' | head -1) + sha256sum "$iso_path" | awk '{print $1}' > "${iso_path}.sha256" + gh release upload "$RELEASE_TAG" \ + "$iso_path" \ + "${iso_path}.sha256" \ + --clobber From 9d673dddf1f66f928f0695403ab67a2a3eef6e89 Mon Sep 17 00:00:00 2001 From: Lior Date: Sun, 24 May 2026 23:27:00 -0400 Subject: [PATCH 2/8] fix(ci): unblock build-installer-iso (actionlint SC2129 + drop magic-nix-cache FlakeHub-auth dep) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two CI failures on PR #4905, both fixable in place. actionlint SC2129 (lint job): the metadata-capture step at line ~96 had 4 sequential `echo ... >> "$GITHUB_OUTPUT"` redirects. shellcheck flagged the pattern and recommends `{ ...; } >> file`. Grouped the 4 echoes accordingly. Matches the style already used for the GITHUB_STEP_SUMMARY block lower in the same step. build-iso job: failed with "Unable to authenticate to FlakeHub. Individuals must register at FlakeHub.com; Organizations must create an organization at FlakeHub.com." This came from `DeterminateSystems/magic-nix-cache-action@v13`, which now requires a FlakeHub account/org that the project doesn't have set up. The auth failure propagates into the substituter chain nix uses during `nix flake check`, causing the eval-only step to fail before the build can even start. Removed the magic-nix-cache step from both jobs (build + attach- to-release). Builds will be uncached (~10-15 min cold) instead of ~3 min warm — acceptable trade-off vs requiring contributors to set up FlakeHub. Follow-up to wire `actions/cache` on /nix/store or swap to `nix-community/cache-nix-action` (no FlakeHub auth needed) is tracked in the comment block left in place of the removed step. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/workflows/build-installer-iso.yml | 26 +++++++++++++---------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/.github/workflows/build-installer-iso.yml b/.github/workflows/build-installer-iso.yml index eaefcd7540..1167880936 100644 --- a/.github/workflows/build-installer-iso.yml +++ b/.github/workflows/build-installer-iso.yml @@ -67,11 +67,13 @@ jobs: # locally. Enables flakes + nix-command by default. uses: DeterminateSystems/nix-installer-action@ef8a148080ab6020fd15196c2084a2eea5ff2d25 # v22 - - name: Set up Nix store cache - # Reduces cold-build time from ~15min to ~3min by reusing the - # /nix/store across runs. Pull-only on PRs from forks to avoid - # cache-poisoning vectors. - uses: DeterminateSystems/magic-nix-cache-action@565684385bcd71bad329742eefe8d12f2e765b39 # v13 + # NOTE: previously had `DeterminateSystems/magic-nix-cache-action` + # here but it now requires FlakeHub auth (FlakeHub.com account / + # organization registration), which the project doesn't have set + # up. Removed to unblock builds. First-build cost is ~10-15 min + # instead of ~3 min cached. Follow-up to add `actions/cache` on + # /nix/store or `nix-community/cache-nix-action` (no FlakeHub + # auth required) is tracked separately. 
- name: Show flake metadata run: nix flake metadata --json | jq '{description, lastModified, revision}' @@ -93,10 +95,12 @@ jobs: iso_name=$(basename "$iso_path") iso_size=$(stat -c%s "$iso_path" | numfmt --to=iec --suffix=B) iso_sha256=$(sha256sum "$iso_path" | awk '{print $1}') - echo "path=$iso_path" >> "$GITHUB_OUTPUT" - echo "name=$iso_name" >> "$GITHUB_OUTPUT" - echo "size=$iso_size" >> "$GITHUB_OUTPUT" - echo "sha256=$iso_sha256" >> "$GITHUB_OUTPUT" + { + echo "path=$iso_path" + echo "name=$iso_name" + echo "size=$iso_size" + echo "sha256=$iso_sha256" + } >> "$GITHUB_OUTPUT" { echo "## Installer ISO built" echo "" @@ -134,8 +138,8 @@ jobs: - name: Install Nix uses: DeterminateSystems/nix-installer-action@ef8a148080ab6020fd15196c2084a2eea5ff2d25 # v22 - - name: Set up Nix store cache - uses: DeterminateSystems/magic-nix-cache-action@565684385bcd71bad329742eefe8d12f2e765b39 # v13 + # magic-nix-cache-action removed — requires FlakeHub auth not yet + # set up. See note in the build job above. - name: Rebuild ISO for the tagged release # Re-builds at the tag so the ISO ships exactly the source the From 99c0de2951a7a9a335ddb49e13e7ab646fe5afba Mon Sep 17 00:00:00 2001 From: Lior Date: Sun, 24 May 2026 23:30:16 -0400 Subject: [PATCH 3/8] =?UTF-8?q?fix(infra):=20real=20flake=20bugs=20CI=20ca?= =?UTF-8?q?ught=20=E2=80=94=20openssh=20conflict=20+=20cuda=20unfree=20pre?= =?UTF-8?q?dicate?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The newly-added build-installer-iso workflow's `nix flake check` step surfaced two real bugs in the substrate landed via PR #4898. These went undetected before because no CI ever ran `nix flake check` on this repo until now. Bug 1: services.openssh.enable conflict in installer config Upstream `installation-cd-minimal.nix` (imported on line 24) sets services.openssh.enable = true. Our installer config set it to false for the no-credentials-in-Git security posture. 
NixOS module merge fails eval: error: The option `services.openssh.enable' has conflicting definition values: - In `/nixos/modules/profiles/installation-device.nix': true - In `infra/nixos/hosts/installer/configuration.nix': false Fix: `lib.mkForce false` so our value wins the merge. Documented the WHY in a comment block so the next reader knows why mkForce is necessary. Bug 2: cuda_cuobjdump unfree license in gpu.nix `nixpkgs.config.allowUnfreePredicate` enumerated a hand-picked list (cuda_cudart, cuda_nvcc, cuda-merged, libcublas, libcudnn). CUDA's transitive dependency cuda_cuobjdump-12.4.99 wasn't in the list, so flake-check refused to evaluate: error: Package 'cuda_cuobjdump-12.4.99' has an unfree license ('CUDA EULA'), refusing to evaluate. Fix: switched from explicit enumeration to prefix-based matching: - hard-list nvidia-x11, nvidia-settings, nvidia-persistenced, nvidia-docker, nvidia-container-toolkit - prefix-allow cuda* (covers cuda_cudart, cuda_nvcc, cuda_cuobjdump, cuda_nvprune, cuda_cccl, cuda_nvtx, cuda_profiler_api, etc) - prefix-allow libcu*, libnv*, libnp* (libcublas, libcurand, libcusolver, libcusparse, libcufft, libcudnn, libnpp, libnvjpeg, libnvjitlink, ...) - explicit allow for cuda-merged umbrella package Comments name what each pattern covers so future-readers know what's being permitted. This is exactly what the CI workflow added in this PR is supposed to catch — it's catching two bugs on its first real run. Both fixes land in this PR so reviewers see the workflow + the bugs it caught together. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- infra/nixos/hosts/installer/configuration.nix | 8 +++++- infra/nixos/modules/gpu.nix | 25 +++++++++++++------ 2 files changed, 25 insertions(+), 8 deletions(-) diff --git a/infra/nixos/hosts/installer/configuration.nix b/infra/nixos/hosts/installer/configuration.nix index 0f554636b6..48c5c9f2c4 100644 --- a/infra/nixos/hosts/installer/configuration.nix +++ b/infra/nixos/hosts/installer/configuration.nix @@ -73,7 +73,13 @@ # SSH key into `users.users.nixos.openssh.authorizedKeys.keys` here # before building the ISO. services.openssh = { - enable = false; + # mkForce: upstream installation-cd-minimal.nix enables SSH by + # default. We force it OFF to keep the installer console-only + # by default (per the no-credentials-in-Git security posture + # documented above). Without mkForce, the module-merge fails + # eval with `option 'services.openssh.enable' has conflicting + # definition values: true (upstream) vs false (ours)`. + enable = lib.mkForce false; settings = { PermitRootLogin = lib.mkForce "prohibit-password"; PasswordAuthentication = lib.mkForce false; diff --git a/infra/nixos/modules/gpu.nix b/infra/nixos/modules/gpu.nix index 8cab1db3b1..d527650e42 100644 --- a/infra/nixos/modules/gpu.nix +++ b/infra/nixos/modules/gpu.nix @@ -14,16 +14,27 @@ # Permit unfree packages (NVIDIA driver, cuda) # --------------------------------------------------------------------------- nixpkgs.config.allowUnfreePredicate = pkg: - builtins.elem (lib.getName pkg) [ + let + name = lib.getName pkg; + in + # Explicit nvidia driver components + builtins.elem name [ "nvidia-x11" "nvidia-settings" "nvidia-persistenced" - "cuda_cudart" - "cuda_nvcc" - "cuda-merged" - "libcublas" - "libcudnn" - ]; + "nvidia-docker" + "nvidia-container-toolkit" + ] + # CUDA toolchain — `cuda_*` covers cuda_cudart, cuda_nvcc, cuda_cuobjdump, + # cuda_nvprune, cuda_cccl, cuda_nvtx, cuda_profiler_api, etc. 
+ || lib.hasPrefix "cuda" name + # CUDA support libraries — libcublas, libcurand, libcusolver, libcusparse, + # libcufft, libcudnn, libnpp, libnvjpeg, libnvjitlink, ... + || lib.hasPrefix "libcu" name + || lib.hasPrefix "libnv" name + || lib.hasPrefix "libnp" name + # The umbrella package that pulls everything together + || name == "cuda-merged"; # --------------------------------------------------------------------------- # Kernel modules + driver From 920b691fb8a32badeaff7bc174455fab52bbcb0f Mon Sep 17 00:00:00 2001 From: Otto Date: Sun, 24 May 2026 23:45:01 -0400 Subject: [PATCH 4/8] fix(ci): address Copilot review findings on build-installer-iso MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Five fixes raised on PR 4905: P0 — sha256 sidecar to /nix/store (read-only EROFS) attach-to-release wrote ${iso_path}.sha256 next to the nix-store iso path; the parent dir is read-only on the runner so the upload step would EROFS. Write sidecar to $RUNNER_TEMP and upload that file instead. P1 — attach-to-release checkout missing fetch-depth: 0 Build job pins fetch-depth: 0 for reproducible flake.lock + git-describe; release job inherited default depth 1. Match the build job so tag builds can't silently drift. P2 — header comment said "tag push"; actual trigger is release: published. Updated to match. P2 — find ... | head -1 is non-deterministic on multi-match and silent on no-match. Switched to find -print -quit + an explicit ::error:: + exit 1 if nothing found. Applied at both call sites (build + attach-to-release). P2 — release events ran build then attach-to-release, building the ISO twice. Skip build on release events (attach-to-release rebuilds at the tag — the verification on PR/main already ran). actionlint clean; yaml-valid. 
Co-Authored-By: Claude --- .github/workflows/build-installer-iso.yml | 38 ++++++++++++++++++++--- 1 file changed, 33 insertions(+), 5 deletions(-) diff --git a/.github/workflows/build-installer-iso.yml b/.github/workflows/build-installer-iso.yml index 1167880936..52c2194270 100644 --- a/.github/workflows/build-installer-iso.yml +++ b/.github/workflows/build-installer-iso.yml @@ -2,7 +2,8 @@ # # Builds the Zeta cluster installer ISO from infra/nixos/hosts/installer/ # via the repo-root flake. Runs on every PR that touches the flake/infra, -# every push to main, and on tag push (to attach the ISO to a Release). +# every push to main, and on release publish (to attach the ISO to the +# Release as a downloadable asset). # # Why on a Linux runner and not the existing macOS gate matrix: # The ISO target is `x86_64-linux`. Building it on macOS requires the @@ -52,6 +53,10 @@ concurrency: jobs: build: name: build-iso + # Skip on release events — attach-to-release rebuilds at the tag so the + # asset matches the release exactly. Running both would build the ISO + # twice per release publish for no added verification. + if: github.event_name != 'release' runs-on: ubuntu-24.04 timeout-minutes: 60 steps: @@ -91,7 +96,14 @@ jobs: id: iso run: | set -euo pipefail - iso_path=$(find result/iso -name 'zeta-installer-*.iso' | head -1) + # Deterministic single-match: -print -quit returns at most one path + # and exits immediately. find ... | head -1 races on multi-match + # and silently picks whichever line printed first. + iso_path=$(find result/iso -name 'zeta-installer-*.iso' -print -quit) + if [[ -z "${iso_path}" || ! 
-e "${iso_path}" ]]; then + echo "::error::No installer ISO found under result/iso/ (looked for zeta-installer-*.iso)" >&2 + exit 1 + fi iso_name=$(basename "$iso_path") iso_size=$(stat -c%s "$iso_path" | numfmt --to=iec --suffix=B) iso_sha256=$(sha256sum "$iso_path" | awk '{print $1}') @@ -134,6 +146,12 @@ jobs: steps: - name: Checkout uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + # Match the build job: full history + tags so `git describe` + # style versioning and flake.lock pinning are reproducible at + # the tag. Default fetch-depth: 1 omits tags/history and risks + # silent drift on release builds. + fetch-depth: 0 - name: Install Nix uses: DeterminateSystems/nix-installer-action@ef8a148080ab6020fd15196c2084a2eea5ff2d25 # v22 @@ -156,9 +174,19 @@ jobs: RELEASE_TAG: ${{ github.event.release.tag_name }} run: | set -euo pipefail - iso_path=$(find result/iso -name 'zeta-installer-*.iso' | head -1) - sha256sum "$iso_path" | awk '{print $1}' > "${iso_path}.sha256" + # Deterministic single-match (same pattern as build job). + iso_path=$(find result/iso -name 'zeta-installer-*.iso' -print -quit) + if [[ -z "${iso_path}" || ! -e "${iso_path}" ]]; then + echo "::error::No installer ISO found under result/iso/ (looked for zeta-installer-*.iso)" >&2 + exit 1 + fi + # iso_path resolves into /nix/store/... via the result/ symlink, + # which is read-only. Write the .sha256 sidecar to the workspace + # (RUNNER_TEMP is writable) so the upload step doesn't EROFS. 
+ iso_name=$(basename "$iso_path") + sha256_path="${RUNNER_TEMP:-/tmp}/${iso_name}.sha256" + sha256sum "$iso_path" | awk '{print $1}' > "$sha256_path" gh release upload "$RELEASE_TAG" \ "$iso_path" \ - "${iso_path}.sha256" \ + "$sha256_path" \ --clobber From 3ff7d44759cc2fe8ab63c9c8c3aaa9a23dacd66a Mon Sep 17 00:00:00 2001 From: Lior Date: Sun, 24 May 2026 23:47:51 -0400 Subject: [PATCH 5/8] fix(ci): drop needs:build + pin attach-to-release checkout to tag MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two follow-up findings on top of peer Otto's review-fix commit 920b691fb. Both real. Codex P1 — needs:build short-circuits attach-to-release on release events Peer Otto kept `needs: build` on attach-to-release. Build is now skipped on release events (`if: github.event_name != 'release'`). When a needed job is skipped via `if:`, downstream jobs depending on it via `needs:` are ALSO skipped by default — meaning attach-to-release would never run. Fix: removed `needs: build`. The two jobs are independent: attach-to-release does its own checkout + build at the release tag. Copilot P1 — explicit ref pinning on attach-to-release checkout Peer Otto fixed fetch-depth: 0 but didn't add `ref:` to pin the checkout to the release tag. On release events GITHUB_REF defaults to the tag, so the implicit behavior is correct today. Explicit pinning is defense in depth against future payload variation + reads clearer at the call site. Fix: `ref: ${{ github.event.release.tag_name }}` (also renamed the step from "Checkout" to "Checkout at the release tag" for matching clarity). My own larger refactor commit (f0775d999) was dropped — it overlapped substantively with peer Otto's 920b691fb (same root findings, slightly different approach). Honoring peer Otto's work per the honor-those-that-came-before discipline; this commit lands only the residual gaps Codex + Copilot still flagged. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/workflows/build-installer-iso.yml | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build-installer-iso.yml b/.github/workflows/build-installer-iso.yml index 52c2194270..b86c94e2a0 100644 --- a/.github/workflows/build-installer-iso.yml +++ b/.github/workflows/build-installer-iso.yml @@ -137,16 +137,26 @@ jobs: attach-to-release: name: attach-iso-to-release - needs: build + # No `needs: build`. The build job is skipped on release events + # (see its own `if: github.event_name != 'release'`); declaring + # `needs: build` here would short-circuit attach-to-release to + # "skipped" too because skipped-by-`if` propagates through + # `needs:` by default. The two jobs are independent: attach-to- + # release builds the ISO itself at the release tag, with its + # own ref pinning. if: github.event_name == 'release' runs-on: ubuntu-24.04 timeout-minutes: 30 permissions: contents: write # needed to upload the ISO as a release asset steps: - - name: Checkout + - name: Checkout at the release tag uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: + # Explicit ref pin: on release events GITHUB_REF defaults + # to the tag, but pinning explicitly is defense in depth + # against payload variation (and reads more clearly). + ref: ${{ github.event.release.tag_name }} # Match the build job: full history + tags so `git describe` # style versioning and flake.lock pinning are reproducible at # the tag. 
Default fetch-depth: 1 omits tags/history and risks From 8c390e006ddae11e10ba8f232ef3a62085818b34 Mon Sep 17 00:00:00 2001 From: Lior Date: Sun, 24 May 2026 23:59:27 -0400 Subject: [PATCH 6/8] ci: re-trigger after PR #4907 (nix-darwin pin fix) landed From 4af30a1db7637d99ee3d4ebaf8c628be7a977d0d Mon Sep 17 00:00:00 2001 From: Lior Date: Mon, 25 May 2026 00:10:03 -0400 Subject: [PATCH 7/8] fix(ci): release upload safety + sha256sum standard format + header cleanup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 3 findings on PR #4905 after the prior nix-darwin-pin retrigger. P0 (security) — gh release upload tag-as-positional-arg flag injection `gh release upload "$RELEASE_TAG" ...` parses `$RELEASE_TAG` positionally; git tag names can legally start with `-`, which would make gh treat the tag as a flag. If a release is ever created with such a tag, the upload step could be coerced into unintended gh-CLI behavior. Two-layer defense: 1. Hard-fail if RELEASE_TAG starts with `-` (case match) 2. Add `--` separator before positional args (belt + suspenders against any future argv-injection vector) Also re-ordered the call to put `--clobber` before the `--` so the trailing args are unambiguously positional. P1 — .sha256 file format Was writing just the hash: `<hash>\n` Standard sha256sum format: `<hash>  <filename>\n` The standard format lets consumers verify with `sha256sum --check` out of the box. Switched to `( cd dir && sha256sum name )` so the filename in the sidecar matches the ISO basename (not the full /nix/store path). P2 — header comment "tag-push job" stale Header still said "tag-push job elevates to contents: write" but the actual job is `attach-to-release` (triggered by `release: published`, not tag push). Renamed accordingly in the comment. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/workflows/build-installer-iso.yml | 28 ++++++++++++++++------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/.github/workflows/build-installer-iso.yml b/.github/workflows/build-installer-iso.yml index b86c94e2a0..8b5e6ef0ca 100644 --- a/.github/workflows/build-installer-iso.yml +++ b/.github/workflows/build-installer-iso.yml @@ -15,8 +15,9 @@ # Discipline (per .github/workflows/gate.yml): # - Runner pinned to ubuntu-24.04 (not -latest) # - Third-party actions SHA-pinned with trailing # vX.Y.Z comments -# - permissions: contents: read at workflow level (tag-push job -# elevates to contents: write only for the release-attach step) +# - permissions: contents: read at workflow level. The +# `attach-to-release` job elevates to `contents: write` only +# for itself (release-asset upload). # - Concurrency: workflow-scoped, cancel-in-progress only for PRs # - github.event.* values that may be attacker-controlled (release # tag names, etc.) are passed via env: not interpolated into @@ -191,12 +192,23 @@ jobs: exit 1 fi # iso_path resolves into /nix/store/... via the result/ symlink, - # which is read-only. Write the .sha256 sidecar to the workspace - # (RUNNER_TEMP is writable) so the upload step doesn't EROFS. + # which is read-only. Write the .sha256 sidecar to RUNNER_TEMP + # (writable) so the upload step doesn't EROFS. iso_name=$(basename "$iso_path") sha256_path="${RUNNER_TEMP:-/tmp}/${iso_name}.sha256" - sha256sum "$iso_path" | awk '{print $1}' > "$sha256_path" - gh release upload "$RELEASE_TAG" \ + # Use the standard `sha256sum` format (`<hash>  <filename>`) + # so consumers can verify with `sha256sum --check`. + ( cd "$(dirname "$iso_path")" && sha256sum "$iso_name" ) > "$sha256_path" + # Refuse to upload if RELEASE_TAG looks like a flag-injection + # vector. Tag names can legally start with `-`, which would + # make `gh release upload "$RELEASE_TAG" ...` parse the tag as + # a flag. 
Hard-fail if so — release operators should rename. + case "$RELEASE_TAG" in + -*) echo "::error::RELEASE_TAG starts with '-' which is a flag-injection risk: $RELEASE_TAG" >&2; exit 1 ;; + esac + # `--` separator: belt + suspenders defense against any future + # argv-injection vector even if the tag-name check above is + # bypassed somehow. + gh release upload --clobber -- "$RELEASE_TAG" \ "$iso_path" \ - "$sha256_path" \ - --clobber + "$sha256_path" From ef95e73cb3815854fe5b4c09b2bb9b20ffff4caf Mon Sep 17 00:00:00 2001 From: Lior Date: Mon, 25 May 2026 00:22:10 -0400 Subject: [PATCH 8/8] fix(ci+infra): enforce single ISO match + clarify cuda predicate scope MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 3 Copilot findings on PR #4905. P1 — gpu.nix cuda predicate comment mismatch Comment said "`cuda_*` covers ..." but the actual predicate is `lib.hasPrefix "cuda" name` (no underscore). The broader form is intentional — nixpkgs uses both spellings (cuda_cudart with underscore + cudatoolkit + cudaPackages.* aliases without). Updated comment to document this explicitly so it matches the code. P1 — find -print -quit silently picks first match (build job) P1 — find -print -quit silently picks first match (attach-to-release job) Peer Otto's prior fix switched from `find ... | head -1` to `find ... -print -quit` for determinism, but both are silent on multi-match. Multiple ISOs under result/iso/ would be a substrate surprise (build layout changed, leftover artifact, etc.) and silently picking one is worse than failing loudly — especially for release-asset upload where the wrong ISO would ship to the public. Switched both sites to mapfile + explicit count check: - 0 matches → fail loudly with directory listing - >1 matches → fail loudly with all candidates printed - 1 match → proceed Same pattern, same error format in both jobs. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/workflows/build-installer-iso.yml | 32 +++++++++++++++++------ infra/nixos/modules/gpu.nix | 8 ++++-- 2 files changed, 30 insertions(+), 10 deletions(-) diff --git a/.github/workflows/build-installer-iso.yml b/.github/workflows/build-installer-iso.yml index 8b5e6ef0ca..a3b1a94248 100644 --- a/.github/workflows/build-installer-iso.yml +++ b/.github/workflows/build-installer-iso.yml @@ -97,14 +97,21 @@ jobs: id: iso run: | set -euo pipefail - # Deterministic single-match: -print -quit returns at most one path - # and exits immediately. find ... | head -1 races on multi-match - # and silently picks whichever line printed first. - iso_path=$(find result/iso -name 'zeta-installer-*.iso' -print -quit) - if [[ -z "${iso_path}" || ! -e "${iso_path}" ]]; then + # Enforce single match. Multiple matches would be a substrate + # surprise (build layout changed?) and silently picking one + # is worse than failing loudly. + mapfile -t iso_candidates < <(find result/iso -maxdepth 1 -type f -name 'zeta-installer-*.iso' | sort) + if [ "${#iso_candidates[@]}" -eq 0 ]; then echo "::error::No installer ISO found under result/iso/ (looked for zeta-installer-*.iso)" >&2 + ls -la result/iso/ >&2 || true exit 1 fi + if [ "${#iso_candidates[@]}" -gt 1 ]; then + echo "::error::Multiple installer ISOs under result/iso/; expected exactly one:" >&2 + printf ' %s\n' "${iso_candidates[@]}" >&2 + exit 1 + fi + iso_path="${iso_candidates[0]}" iso_name=$(basename "$iso_path") iso_size=$(stat -c%s "$iso_path" | numfmt --to=iec --suffix=B) iso_sha256=$(sha256sum "$iso_path" | awk '{print $1}') @@ -185,12 +192,21 @@ jobs: RELEASE_TAG: ${{ github.event.release.tag_name }} run: | set -euo pipefail - # Deterministic single-match (same pattern as build job). - iso_path=$(find result/iso -name 'zeta-installer-*.iso' -print -quit) - if [[ -z "${iso_path}" || ! 
-e "${iso_path}" ]]; then + # Enforce single match — release assets are publicly downloaded + # and silently picking one when multiple exist would be worse + # than failing loudly. + mapfile -t iso_candidates < <(find result/iso -maxdepth 1 -type f -name 'zeta-installer-*.iso' | sort) + if [ "${#iso_candidates[@]}" -eq 0 ]; then echo "::error::No installer ISO found under result/iso/ (looked for zeta-installer-*.iso)" >&2 + ls -la result/iso/ >&2 || true + exit 1 + fi + if [ "${#iso_candidates[@]}" -gt 1 ]; then + echo "::error::Multiple installer ISOs under result/iso/; refusing to upload an arbitrary one to release ${RELEASE_TAG}:" >&2 + printf ' %s\n' "${iso_candidates[@]}" >&2 exit 1 fi + iso_path="${iso_candidates[0]}" # iso_path resolves into /nix/store/... via the result/ symlink, # which is read-only. Write the .sha256 sidecar to RUNNER_TEMP # (writable) so the upload step doesn't EROFS. diff --git a/infra/nixos/modules/gpu.nix b/infra/nixos/modules/gpu.nix index d527650e42..7323d6fde8 100644 --- a/infra/nixos/modules/gpu.nix +++ b/infra/nixos/modules/gpu.nix @@ -25,8 +25,12 @@ "nvidia-docker" "nvidia-container-toolkit" ] - # CUDA toolchain — `cuda_*` covers cuda_cudart, cuda_nvcc, cuda_cuobjdump, - # cuda_nvprune, cuda_cccl, cuda_nvtx, cuda_profiler_api, etc. + # CUDA toolchain — `cuda`-prefixed packages: covers cuda_cudart, + # cuda_nvcc, cuda_cuobjdump, cuda_nvprune, cuda_cccl, cuda_nvtx, + # cuda_profiler_api, AND the underscore-less variants like + # cudatoolkit + cudaPackages.* aliases. The predicate is + # intentionally broader than the underscore-only set because + # nixpkgs uses both spellings depending on the package generation. || lib.hasPrefix "cuda" name # CUDA support libraries — libcublas, libcurand, libcusolver, libcusparse, # libcufft, libcudnn, libnpp, libnvjpeg, libnvjitlink, ...