diff --git a/full-ai-cluster/nixos/modules/common.nix b/full-ai-cluster/nixos/modules/common.nix index e37c244b0c..6e5e85abbf 100644 --- a/full-ai-cluster/nixos/modules/common.nix +++ b/full-ai-cluster/nixos/modules/common.nix @@ -5,6 +5,11 @@ { config, pkgs, lib, stateVersion ? "24.11", ... }: { + # iter-5.2 (B-0792): per-node hostname injection lives in its own + # module so every host (control-plane, worker-gpu, worker-template, + # future configs) inherits the override capability automatically. + imports = [ ./injected-hostname.nix ]; + nix.settings = { experimental-features = [ "nix-command" "flakes" ]; auto-optimise-store = true; @@ -31,6 +36,24 @@ networking.networkmanager.enable = true; networking.firewall.enable = true; + # iter-5.1 (B-0792): Avahi mDNS publishing so cluster nodes resolve + # via `.local` from operator Mac (Bonjour) + Linux peers + # (nss-mdns) on the LAN without IP-discovery step. Without this, + # `ssh zeta@control-plane.local` fails to resolve even though the + # node is up. Empirical anchor: 2026-05-26 iter-4.2 PC1 test + # surfaced the gap. + services.avahi = { + enable = true; + nssmdns4 = true; + openFirewall = true; # firewall hole for mDNS (5353/udp) + publish = { + enable = true; + addresses = true; + workstation = true; + domain = true; + }; + }; + services.openssh = { enable = true; settings = { diff --git a/full-ai-cluster/nixos/modules/injected-hostname.nix b/full-ai-cluster/nixos/modules/injected-hostname.nix new file mode 100644 index 0000000000..e997c5738a --- /dev/null +++ b/full-ai-cluster/nixos/modules/injected-hostname.nix @@ -0,0 +1,58 @@ +# full-ai-cluster/nixos/modules/injected-hostname.nix +# +# iter-5.2 (B-0792): per-node hostname injection for multi-node clusters. +# +# Without this, every USB-installed node uses the flake host config's +# hardcoded `networking.hostName` (e.g., "control-plane" for every node +# built from the control-plane flake). 
Result for multi-node: +# +# - All control-plane nodes have hostname "control-plane" +# - mDNS publishing collides; second + later nodes auto-rename to +# "control-plane-2.local" / "control-plane-3.local" via Avahi's +# conflict-resolution but underlying NixOS hostname stays +# "control-plane" (confusing in logs, journalctl, kubectl, etc.) +# +# Fix: zeta-install.sh reads `zeta-hostname.txt` from the USB ESP (if +# the operator passed `--host NAME` to zflash) + writes the chosen +# hostname to `/mnt/etc/zeta/cluster-node-id` during install. This +# module reads that file at NixOS evaluation time + overrides +# `networking.hostName`. +# +# Imported by `common.nix` so EVERY host (control-plane, worker-gpu, +# worker-template, future configs) gets the override capability +# automatically. Default behavior preserved: if the file doesn't +# exist, `networking.hostName` from the per-host config remains in +# effect (e.g., `control-plane` stays `control-plane` for the +# zero-typing single-node case). +# +# Aaron 2026-05-26: "make any multi node changes we need to like +# think though mdns names when we have two control planes." + +{ config, lib, ... }: + +let + # NOTE(review): this reads the filesystem of the machine performing + # the evaluation. Flake evaluation is pure by default, and in pure + # mode absolute paths outside the Nix store are not readable, so + # pathExists is expected to be false there. Confirm that + # `nixos-install --flake` / `nixos-rebuild --flake` run with + # `--impure`, and that at install time the file is present under + # /etc/zeta of the evaluating system (zeta-install.sh writes it + # under /mnt/etc/zeta). + idFile = "/etc/zeta/cluster-node-id"; + injectedRaw = + if builtins.pathExists idFile + then builtins.readFile idFile + else null; + # Trim surrounding whitespace (space, tab, CR, LF) in one pass; an + # empty or whitespace-only file does not match and yields null, so + # the per-host hostname stays in effect. Interior characters are + # kept as-is so a malformed name still fails the hostName type + # check loudly instead of being silently ignored. + injected = + if injectedRaw == null + then null + else + let + trimmedMatch = builtins.match "[ \t\r\n]*(.*[^ \t\r\n])[ \t\r\n]*" injectedRaw; + in + if trimmedMatch == null then null else builtins.head trimmedMatch; +in +{ + # Module-system priorities: the LOWER number wins. mkOverride 50 + # therefore beats both a per-host mkDefault (priority 1000) and a + # plain `networking.hostName = "..."` (priority 100), so the value + # injected by zeta-install.sh always replaces the flake's per-host + # hostname. It ties with mkForce (also priority 50): a mkForce'd + # hostName that differs from the injected one fails evaluation + # with a conflicting-definitions error instead of winning. + # Operators who need to override the injected name must remove + # the id file or use a priority below 50 (e.g. lib.mkOverride 40).
+ networking.hostName = lib.mkIf (injected != null) (lib.mkOverride 50 injected); +} diff --git a/full-ai-cluster/tools/zflash.ts b/full-ai-cluster/tools/zflash.ts index 6f0ebbc943..46b0c486b9 100755 --- a/full-ai-cluster/tools/zflash.ts +++ b/full-ai-cluster/tools/zflash.ts @@ -615,8 +615,11 @@ function dumpDiagnostics(context: string): void { ); } -async function injectPubkeyToUsb(pubkeyPath: string): Promise { +async function injectPubkeyToUsb(pubkeyPath: string, hostOverride: string | null): Promise { process.stdout.write(`\niter-4.2: injecting ${pubkeyPath} into freshly-flashed USB ESP ...\n`); + if (hostOverride !== null) { + process.stdout.write(`iter-5.2: ALSO injecting hostname '${hostOverride}' into ESP ...\n`); + } // Brief settle so macOS re-reads partition table after dd await new Promise((r) => setTimeout(r, 2000)); @@ -693,6 +696,41 @@ async function injectPubkeyToUsb(pubkeyPath: string): Promise { } process.stdout.write(`iter-4.2: wrote pubkey to ${target}\n`); + // iter-5.2 (B-0792): if --host was passed, write zeta-hostname.txt + // to ESP in the same mount session (covered by the same sudo + // timestamp window; no additional Touch ID). zeta-install.sh reads + // this file at install time + writes to /etc/zeta/cluster-node-id; + // injected-hostname.nix module reads that file at NixOS evaluation + // time + overrides networking.hostName. + // + // Hostname validation already happened at flag-parse time (RFC1123 + // check); re-verify shape here as defense-in-depth before writing. 
+ if (hostOverride !== null) { + if (!/^[a-zA-Z0-9]([a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?$/.test(hostOverride)) { + unmountEsp(espPart, mountResult); + bail( + 3, + `iter-5.2 inject failed: hostname '${hostOverride}' fails RFC1123 validation at write time (should have caught at flag-parse; internal bug)`, + ); + } + const hostnameTarget = join(mountPoint, "zeta-hostname.txt"); + try { + execFileSync("sudo", ["tee", hostnameTarget], { + input: hostOverride + "\n", + stdio: ["pipe", "ignore", "inherit"], + }); + } catch (e) { + dumpDiagnostics(`sudo tee ${hostnameTarget} failed`); + unmountEsp(espPart, mountResult); + bail( + 3, + `iter-5.2 inject failed: sudo tee ${hostnameTarget} failed: ${e instanceof Error ? e.message : String(e)}`, + ); + } + process.stdout.write(`iter-5.2: wrote hostname '${hostOverride}' to ${hostnameTarget}\n`); + process.stdout.write(`iter-5.2: installed node will be reachable as ssh zeta@${hostOverride}.local\n`); + } + // Unmount via the matching method (diskutil-mounted → diskutil // unmount; mount_msdos-mounted → sudo umount + rmSync tmpdir). unmountEsp(espPart, mountResult); @@ -725,14 +763,16 @@ async function main() { "--no-inject", "--skip-freshness-check", "--skip-iso-pull", + "--host", ]); const argv = process.argv.slice(2); - // Two-arg flag parsing for --ssh-key + // Two-arg flag parsing for --ssh-key and --host let sshKeyOverride: string | null = null; let noInject = false; let skipFreshnessCheck = false; let skipIsoPull = false; + let hostOverride: string | null = null; const rawFlags: string[] = []; const positional: string[] = []; for (let i = 0; i < argv.length; i++) { @@ -765,6 +805,27 @@ async function main() { skipIsoPull = true; continue; } + if (a === "--host") { + const next = argv[i + 1]; + if (!next || next.startsWith("-")) { + bail(2, "--host requires a name argument (e.g., --host pikachu)"); + } + // iter-5.2 (B-0792): hostname per RFC1123 — alphanumeric + hyphens, + // no leading/trailing hyphen, 1-63 chars. 
Reject empty + invalid + // shapes BEFORE writing to USB so cluster-side substrate doesn't + // have to handle garbage. Aaron 2026-05-26 architectural framing: + // hostname is a unique identity, NOT a role label — operator picks + // any short memorable name (pikachu, charizard, sapphire, etc.). + if (!/^[a-zA-Z0-9]([a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?$/.test(next)) { + bail( + 2, + `--host '${next}' is not a valid RFC1123 hostname (alphanumeric + hyphens, 1-63 chars, no leading/trailing hyphen)`, + ); + } + hostOverride = next; + i++; + continue; + } if (a.startsWith("-")) { rawFlags.push(a); continue; @@ -797,6 +858,10 @@ async function main() { " --no-inject skip the iter-4.2 ESP pubkey write (v1 manual-edit fallback)\n" + " --skip-freshness-check bypass iter-4.3 stale-checkout detection (NOT recommended)\n" + " --skip-iso-pull bypass iter-4.3 CI-ISO auto-download (use local newest)\n" + + " --host iter-5.2 inject node hostname (RFC1123); decoupled from\n" + + " role-stack — e.g., --host pikachu installs as pikachu\n" + + " regardless of flake role config. 
Default: flake config name\n" + + " (control-plane for the zero-typing single-node path)\n" + " iso-path (optional) explicit ISO; default = newest under ~/Downloads,\n" + " auto-pulled from CI if origin/main has fresher build\n" + " Run zflash-setup once first to install Touch ID for sudo.\n", @@ -864,9 +929,15 @@ async function main() { } if (willInject) { - await injectPubkeyToUsb(pubkeyPath); + await injectPubkeyToUsb(pubkeyPath, hostOverride); } else { process.stdout.write("\n(iter-4.2 inject skipped per --no-inject or missing pubkey)\n"); + if (hostOverride !== null) { + process.stdout.write( + `(iter-5.2 hostname inject ALSO skipped — --host ${hostOverride} requires --no-inject NOT set;\n` + + ` re-run without --no-inject if you want the hostname to land on the USB ESP)\n`, + ); + } } } diff --git a/full-ai-cluster/usb-nixos-installer/zeta-install.sh b/full-ai-cluster/usb-nixos-installer/zeta-install.sh index f49b915cea..6063cf8491 100755 --- a/full-ai-cluster/usb-nixos-installer/zeta-install.sh +++ b/full-ai-cluster/usb-nixos-installer/zeta-install.sh @@ -348,6 +348,126 @@ else echo "==============================" fi +# ── Step 6.6: iter-5.2 hostname injection (B-0792) ────────────── +# +# Per the maintainer 2026-05-26: "since our different roles are +# multi install you can be control plane AND gpu node AND cpu +# node these distinctions are not very elegant and host names +# tied to them are not great either" — hostname should be just +# a unique identity, decoupled from role-stack selection. +# +# zflash on macOS writes the operator's chosen hostname to +# `zeta-hostname.txt` on the USB ESP if --host was passed +# (e.g., zflash --host pikachu). This step writes that to +# /mnt/etc/zeta/cluster-node-id where the NixOS module +# `injected-hostname.nix` reads it via builtins.readFile at +# evaluation time + overrides networking.hostName. 
+# +# If no zeta-hostname.txt on ESP: skip; the flake's per-host +# config default (e.g., "control-plane") stays in effect. +# Backward-compatible with single-node zero-typing path. +echo +echo "[iter-5.2] ── probing boot USB for injected hostname ──" +HOSTNAME_DST="/mnt/etc/zeta/cluster-node-id" +HOSTNAME_FILE="" +# Reuse the SEARCH_DIRS pattern from the iter-4.2 pubkey probe; +# zflash writes zeta-hostname.txt alongside zeta-authorized-keys.pub +# in the same ESP mount session. +if [ ${#SEARCH_DIRS[@]} -gt 0 ]; then + HOSTNAME_FILE=$(sudo find "${SEARCH_DIRS[@]}" \ + -maxdepth 5 -name "zeta-hostname.txt" -type f 2>/dev/null | head -1 || true) +fi +# Also check the PROBE_MOUNT in case the USB ESP was mounted there +# during iter-4.2 probe (don't re-mount; it's already there). +if [ -z "$HOSTNAME_FILE" ] && [ -f "$PROBE_MOUNT/zeta-hostname.txt" ]; then + HOSTNAME_FILE="$PROBE_MOUNT/zeta-hostname.txt" +fi +if [ -n "$HOSTNAME_FILE" ]; then + # Validate: hostname per RFC1123 (alphanumeric + hyphens, no + # leading/trailing hyphen, 1-63 chars). Strip whitespace + newlines. 
+ # Read a bounded prefix of the file, drop all whitespace, then + # validate the FULL value. Over-long names fall into the WARN + # branch below instead of being silently truncated into a different + # hostname; the 1-63 length bound lives in the regex (same pattern + # zflash applies to --host at flag-parse time). + INJECTED_HOSTNAME=$(sudo head -c 256 "$HOSTNAME_FILE" | tr -d '[:space:]') + if [ -n "$INJECTED_HOSTNAME" ] \ + && echo "$INJECTED_HOSTNAME" \ + | grep -Eq '^[a-zA-Z0-9]([a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?$'; then + echo "[iter-5.2] found injected hostname: $INJECTED_HOSTNAME (source: $HOSTNAME_FILE)" + sudo mkdir -p "$(dirname "$HOSTNAME_DST")" + echo "$INJECTED_HOSTNAME" | sudo tee "$HOSTNAME_DST" >/dev/null + sudo chmod 0644 "$HOSTNAME_DST" + echo "[iter-5.2] wrote $HOSTNAME_DST" + echo "[iter-5.2] networking.hostName will be '$INJECTED_HOSTNAME' on first boot" + echo "[iter-5.2] ssh access: ssh zeta@${INJECTED_HOSTNAME}.local" + else + echo "[iter-5.2] WARN: $HOSTNAME_FILE contains invalid hostname '$INJECTED_HOSTNAME'" + echo "[iter-5.2] (must match RFC1123: alphanumeric + hyphens, 1-63 chars)" + echo "[iter-5.2] falling back to flake default ($HOST)" + fi +else + echo "[iter-5.2] no zeta-hostname.txt on USB ESP" + echo "[iter-5.2] using flake default hostname for #$HOST" +fi +echo + +# ── Step 6.7: iter-5.1 wifi persistence (B-0792) ──────────────── +# +# By the time this step runs, the live installer is already on the +# network — either via ethernet auto-DHCP (no profile to copy; this +# is a no-op) or via nmtui setup at first boot (`zeta-first-boot.sh` +# Step 2 launches nmtui when ethernet is absent; operator entered +# wifi creds once via TUI; NetworkManager wrote a .nmconnection +# profile to /etc/NetworkManager/system-connections/). +# +# Without this step, the freshly-installed system inherits the +# NixOS NetworkManager service but NOT the operator's connection +# profile. Result: wifi-only mini-PCs boot installed system, +# NetworkManager comes up with empty profile dir, no wifi, no SSH. +# The maintainer 2026-05-26: "we won't have ethernet for most +# machines it needs to remember the wifi on setup." +# +# Fix: copy *.nmconnection files from the live installer to /mnt. +# NetworkManager requires chmod 0600 + chown root:root on these +# files. 
The explicit chown + chmod after each cp below enforce both. +echo +echo "[iter-5.1] ── checking for NetworkManager connection profiles to persist ──" +NM_SRC="/etc/NetworkManager/system-connections" +NM_DST="/mnt/etc/NetworkManager/system-connections" +NM_PROFILE_COUNT=0 +if [ -d "$NM_SRC" ]; then + # Enumerate .nmconnection files via find (NOT glob; bash globs + # would need nullglob to handle the empty-dir case, but find + + # filtered-output handles it naturally with no shell-option deps) + NM_PROFILES=$(sudo find "$NM_SRC" -maxdepth 1 -name "*.nmconnection" -type f 2>/dev/null || true) + if [ -n "$NM_PROFILES" ]; then + NM_PROFILE_COUNT=$(echo "$NM_PROFILES" | wc -l | tr -d ' ') + sudo mkdir -p "$NM_DST" + sudo chmod 0700 "$NM_DST" + # Copy preserving permissions; NM requires 0600 root:root on each + # .nmconnection file (else it ignores them at startup with a + # "permissions not strict enough" warning in journalctl) + # IFS= stops read from trimming leading/trailing whitespace, so + # profile file names are passed to cp exactly as find printed them. + echo "$NM_PROFILES" | while IFS= read -r src; do + [ -n "$src" ] || continue + name=$(basename "$src") + dst="$NM_DST/$name" + sudo cp -p "$src" "$dst" + sudo chown root:root "$dst" + sudo chmod 0600 "$dst" + # Print SSID (parsed from [wifi] ssid=...) without printing the psk. + # Per 802.11 spec, SSIDs MAY contain '=' (and arbitrary bytes + # including spaces). awk -F= '...; print $2' would truncate after + # the first '='. sed-after-first-'ssid=' preserves the full SSID. 
+ ssid=$(sudo sed -n 's/^ssid=//p' "$dst" 2>/dev/null | head -1) + [ -z "$ssid" ] && ssid="(unknown)" + echo "[iter-5.1] persisted: $name (ssid=$ssid)" + done + echo "[iter-5.1] $NM_PROFILE_COUNT NetworkManager profile(s) persisted to installed system" + echo "[iter-5.1] installed system will reconnect to wifi automatically on reboot" + else + echo "[iter-5.1] no .nmconnection profiles in $NM_SRC (ethernet-DHCP path; nothing to persist)" + fi +else + echo "[iter-5.1] $NM_SRC does not exist; skipping wifi persistence (no harm; ethernet-DHCP works)" +fi +echo + echo "Running nixos-install --flake /mnt/etc/zeta/full-ai-cluster#$HOST ..." sudo nixos-install --flake "/mnt/etc/zeta/full-ai-cluster#$HOST" --no-root-password