Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions full-ai-cluster/nixos/modules/common.nix
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,11 @@
{ config, pkgs, lib, stateVersion ? "24.11", ... }:

{
# iter-5.2 (B-0792): per-node hostname injection lives in its own
# module so every host (control-plane, worker-gpu, worker-template,
# future configs) inherits the override capability automatically.
imports = [ ./injected-hostname.nix ];

nix.settings = {
experimental-features = [ "nix-command" "flakes" ];
auto-optimise-store = true;
Expand All @@ -31,6 +36,24 @@
networking.networkmanager.enable = true;
networking.firewall.enable = true;

# iter-5.1 (B-0792): Avahi mDNS publishing so cluster nodes resolve
# via `<hostname>.local` from operator Mac (Bonjour) + Linux peers
# (nss-mdns) on the LAN without IP-discovery step. Without this,
# `ssh zeta@control-plane.local` fails to resolve even though the
# node is up. Empirical anchor: 2026-05-26 iter-4.2 PC1 test
# surfaced the gap.
services.avahi = {
enable = true;
nssmdns4 = true;
openFirewall = true; # firewall hole for mDNS (5353/udp)
publish = {
enable = true;
addresses = true;
workstation = true;
domain = true;
};
};

services.openssh = {
enable = true;
settings = {
Expand Down
58 changes: 58 additions & 0 deletions full-ai-cluster/nixos/modules/injected-hostname.nix
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
# full-ai-cluster/nixos/modules/injected-hostname.nix
#
# iter-5.2 (B-0792): per-node hostname injection for multi-node clusters.
#
# Without this, every USB-installed node uses the flake host config's
# hardcoded `networking.hostName` (e.g., "control-plane" for every node
# built from the control-plane flake). Result for multi-node:
#
# - All control-plane nodes have hostname "control-plane"
# - mDNS publishing collides; second + later nodes auto-rename to
# "control-plane-2.local" / "control-plane-3.local" via Avahi's
# conflict-resolution but underlying NixOS hostname stays
# "control-plane" (confusing in logs, journalctl, kubectl, etc.)
#
# Fix: zeta-install.sh reads `zeta-hostname.txt` from the USB ESP (if
# the operator passed `--host <name>` to zflash) + writes the chosen
# hostname to `/mnt/etc/zeta/cluster-node-id` during install. This
# module reads that file at NixOS evaluation time + overrides
# `networking.hostName`.
#
# Imported by `common.nix` so EVERY host (control-plane, worker-gpu,
# worker-template, future configs) gets the override capability
# automatically. Default behavior preserved: if the file doesn't
# exist, `networking.hostName` from the per-host config remains in
# effect (e.g., `control-plane` stays `control-plane` for the
# zero-typing single-node case).
#
# Aaron 2026-05-26: "make any multi node changes we need to like
# think though mdns names when we have two control planes."

{ config, lib, ... }:

let
  # File written by zeta-install.sh (as /mnt/etc/zeta/cluster-node-id during
  # install). It is read here at NixOS evaluation time.
  idFile = "/etc/zeta/cluster-node-id";

  # Raw file contents, or null when the file is absent (single-node
  # zero-typing path: the per-host hostName stays in effect).
  #
  # NOTE(review): this is an absolute host path read at evaluation time.
  # Flake evaluation is pure by default, and at install time the file lives
  # under /mnt rather than /etc — confirm that the evaluation performed by
  # `nixos-install --flake` / `nixos-rebuild --flake` can actually see it
  # (it may require --impure).
  injectedRaw =
    if builtins.pathExists idFile
    then builtins.readFile idFile
    else null;

  # Strip ALL leading and trailing whitespace (spaces, tabs, CR, LF), not just
  # a single trailing space/newline. builtins.match must match the whole
  # string and returns null when there is no non-whitespace character at all,
  # which covers the "empty or whitespace-only file" case.
  trimMatch =
    if injectedRaw == null
    then null
    else builtins.match "[ \t\r\n]*(.*[^ \t\r\n])[ \t\r\n]*" injectedRaw;

  # The trimmed hostname, or null when there is nothing usable to inject.
  injected =
    if trimMatch == null
    then null
    else builtins.head trimMatch;
in
{
  # Priority semantics (LOWER number wins in the NixOS module system):
  #   mkDefault        = 1000
  #   plain definition =  100
  #   mkForce          =   50
  #
  # mkOverride 50 therefore beats both a per-host `mkDefault` and a plain
  # `networking.hostName = "..."` definition, so the value injected by
  # zeta-install.sh replaces the flake's per-host name. It sits at the SAME
  # priority as mkForce: an operator `mkForce` with a different value is
  # reported as a conflicting definition rather than winning. An operator who
  # wants to take back control must use a numerically lower priority
  # (e.g. `lib.mkOverride 40 "name"`) or change/remove the id file.
  networking.hostName = lib.mkIf (injected != null) (lib.mkOverride 50 injected);
}
77 changes: 74 additions & 3 deletions full-ai-cluster/tools/zflash.ts
Original file line number Diff line number Diff line change
Expand Up @@ -615,8 +615,11 @@ function dumpDiagnostics(context: string): void {
);
}

async function injectPubkeyToUsb(pubkeyPath: string): Promise<void> {
async function injectPubkeyToUsb(pubkeyPath: string, hostOverride: string | null): Promise<void> {
process.stdout.write(`\niter-4.2: injecting ${pubkeyPath} into freshly-flashed USB ESP ...\n`);
if (hostOverride !== null) {
process.stdout.write(`iter-5.2: ALSO injecting hostname '${hostOverride}' into ESP ...\n`);
}

// Brief settle so macOS re-reads partition table after dd
await new Promise((r) => setTimeout(r, 2000));
Expand Down Expand Up @@ -693,6 +696,41 @@ async function injectPubkeyToUsb(pubkeyPath: string): Promise<void> {
}
process.stdout.write(`iter-4.2: wrote pubkey to ${target}\n`);

// iter-5.2 (B-0792): if --host was passed, write zeta-hostname.txt
// to ESP in the same mount session (covered by the same sudo
// timestamp window; no additional Touch ID). zeta-install.sh reads
// this file at install time + writes to /etc/zeta/cluster-node-id;
// injected-hostname.nix module reads that file at NixOS evaluation
// time + overrides networking.hostName.
//
// Hostname validation already happened at flag-parse time (RFC1123
// check); re-verify shape here as defense-in-depth before writing.
if (hostOverride !== null) {
if (!/^[a-zA-Z0-9]([a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?$/.test(hostOverride)) {
unmountEsp(espPart, mountResult);
bail(
3,
`iter-5.2 inject failed: hostname '${hostOverride}' fails RFC1123 validation at write time (should have caught at flag-parse; internal bug)`,
);
}
const hostnameTarget = join(mountPoint, "zeta-hostname.txt");
try {
execFileSync("sudo", ["tee", hostnameTarget], {
input: hostOverride + "\n",
stdio: ["pipe", "ignore", "inherit"],
});
} catch (e) {
dumpDiagnostics(`sudo tee ${hostnameTarget} failed`);
unmountEsp(espPart, mountResult);
bail(
3,
`iter-5.2 inject failed: sudo tee ${hostnameTarget} failed: ${e instanceof Error ? e.message : String(e)}`,
);
}
process.stdout.write(`iter-5.2: wrote hostname '${hostOverride}' to ${hostnameTarget}\n`);
process.stdout.write(`iter-5.2: installed node will be reachable as ssh zeta@${hostOverride}.local\n`);
}

// Unmount via the matching method (diskutil-mounted → diskutil
// unmount; mount_msdos-mounted → sudo umount + rmSync tmpdir).
unmountEsp(espPart, mountResult);
Expand Down Expand Up @@ -725,14 +763,16 @@ async function main() {
"--no-inject",
"--skip-freshness-check",
"--skip-iso-pull",
"--host",
]);
const argv = process.argv.slice(2);

// Two-arg flag parsing for --ssh-key <path>
// Two-arg flag parsing for --ssh-key <path> and --host <name>
let sshKeyOverride: string | null = null;
let noInject = false;
let skipFreshnessCheck = false;
let skipIsoPull = false;
let hostOverride: string | null = null;
const rawFlags: string[] = [];
const positional: string[] = [];
for (let i = 0; i < argv.length; i++) {
Expand Down Expand Up @@ -765,6 +805,27 @@ async function main() {
skipIsoPull = true;
continue;
}
if (a === "--host") {
const next = argv[i + 1];
if (!next || next.startsWith("-")) {
bail(2, "--host requires a name argument (e.g., --host pikachu)");
}
// iter-5.2 (B-0792): hostname per RFC1123 — alphanumeric + hyphens,
// no leading/trailing hyphen, 1-63 chars. Reject empty + invalid
// shapes BEFORE writing to USB so cluster-side substrate doesn't
// have to handle garbage. Aaron 2026-05-26 architectural framing:
// hostname is a unique identity, NOT a role label — operator picks
// any short memorable name (pikachu, charizard, sapphire, etc.).
if (!/^[a-zA-Z0-9]([a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?$/.test(next)) {
bail(
2,
`--host '${next}' is not a valid RFC1123 hostname (alphanumeric + hyphens, 1-63 chars, no leading/trailing hyphen)`,
);
}
hostOverride = next;
i++;
continue;
}
if (a.startsWith("-")) {
rawFlags.push(a);
continue;
Expand Down Expand Up @@ -797,6 +858,10 @@ async function main() {
" --no-inject skip the iter-4.2 ESP pubkey write (v1 manual-edit fallback)\n" +
" --skip-freshness-check bypass iter-4.3 stale-checkout detection (NOT recommended)\n" +
" --skip-iso-pull bypass iter-4.3 CI-ISO auto-download (use local newest)\n" +
" --host <name> iter-5.2 inject node hostname (RFC1123); decoupled from\n" +
" role-stack — e.g., --host pikachu installs as pikachu\n" +
" regardless of flake role config. Default: flake config name\n" +
" (control-plane for the zero-typing single-node path)\n" +
" iso-path (optional) explicit ISO; default = newest under ~/Downloads,\n" +
" auto-pulled from CI if origin/main has fresher build\n" +
" Run zflash-setup once first to install Touch ID for sudo.\n",
Expand Down Expand Up @@ -864,9 +929,15 @@ async function main() {
}

if (willInject) {
await injectPubkeyToUsb(pubkeyPath);
await injectPubkeyToUsb(pubkeyPath, hostOverride);
} else {
process.stdout.write("\n(iter-4.2 inject skipped per --no-inject or missing pubkey)\n");
if (hostOverride !== null) {
  // This branch is reached when the ESP inject step did not run. Per the
  // skip notice printed just above, that is either because --no-inject was
  // passed or because no pubkey was available, so the hint must cover both
  // causes instead of blaming --no-inject alone.
  process.stdout.write(
    `(iter-5.2 hostname inject ALSO skipped — --host ${hostOverride} is written during the ESP pubkey inject step;\n` +
      ` re-run without --no-inject and with a usable SSH pubkey if you want the hostname to land on the USB ESP)\n`,
  );
}
}
}

Expand Down
120 changes: 120 additions & 0 deletions full-ai-cluster/usb-nixos-installer/zeta-install.sh
Original file line number Diff line number Diff line change
Expand Up @@ -348,6 +348,126 @@ else
echo "=============================="
fi

# ── Step 6.6: iter-5.2 hostname injection (B-0792) ──────────────
#
# Per the maintainer 2026-05-26: "since our different roles are
# multi install you can be control plane AND gpu node AND cpu
# node these distinctions are not very elegant and host names
# tied to them are not great either" — hostname should be just
# a unique identity, decoupled from role-stack selection.
#
# zflash on macOS writes the operator's chosen hostname to
# `zeta-hostname.txt` on the USB ESP if --host <name> was passed
# (e.g., zflash --host pikachu). This step writes that to
# /mnt/etc/zeta/cluster-node-id where the NixOS module
# `injected-hostname.nix` reads it via builtins.readFile at
# evaluation time + overrides networking.hostName.
#
Comment thread
AceHack marked this conversation as resolved.
# If no zeta-hostname.txt on ESP: skip; the flake's per-host
# config default (e.g., "control-plane") stays in effect.
# Backward-compatible with single-node zero-typing path.
echo
echo "[iter-5.2] ── probing boot USB for injected hostname ──"
HOSTNAME_DST="/mnt/etc/zeta/cluster-node-id"
HOSTNAME_FILE=""
# Reuse the SEARCH_DIRS pattern from the iter-4.2 pubkey probe;
# zflash writes zeta-hostname.txt alongside zeta-authorized-keys.pub
# in the same ESP mount session.
if [ ${#SEARCH_DIRS[@]} -gt 0 ]; then
  HOSTNAME_FILE=$(sudo find "${SEARCH_DIRS[@]}" \
    -maxdepth 5 -name "zeta-hostname.txt" -type f 2>/dev/null | head -1 || true)
fi
# Also check the PROBE_MOUNT in case the USB ESP was mounted there
# during iter-4.2 probe (don't re-mount; it's already there).
if [ -z "$HOSTNAME_FILE" ] && [ -f "$PROBE_MOUNT/zeta-hostname.txt" ]; then
  HOSTNAME_FILE="$PROBE_MOUNT/zeta-hostname.txt"
fi
if [ -n "$HOSTNAME_FILE" ]; then
  # Strip whitespace + newlines, then validate per RFC1123 (alphanumeric +
  # hyphens, no leading/trailing hyphen, 1-63 chars).
  #
  # Read at most 64 bytes: that bounds the read while keeping one byte more
  # than the maximum legal length, so an over-long name FAILS the length-
  # bounded regex below instead of being silently truncated to 63 chars and
  # accepted as a hostname the operator never chose.
  INJECTED_HOSTNAME=$(sudo cat "$HOSTNAME_FILE" | tr -d '[:space:]' | head -c 64)
  # Same pattern as the zflash-side validator ({0,61} enforces the 63 limit).
  if [ -n "$INJECTED_HOSTNAME" ] \
    && printf '%s\n' "$INJECTED_HOSTNAME" \
      | grep -Eq '^[a-zA-Z0-9]([a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?$'; then
    echo "[iter-5.2] found injected hostname: $INJECTED_HOSTNAME (source: $HOSTNAME_FILE)"
    sudo mkdir -p "$(dirname "$HOSTNAME_DST")"
    printf '%s\n' "$INJECTED_HOSTNAME" | sudo tee "$HOSTNAME_DST" >/dev/null
    sudo chmod 0644 "$HOSTNAME_DST"
    echo "[iter-5.2] wrote $HOSTNAME_DST"
    # NOTE(review): injected-hostname.nix reads /etc/zeta/cluster-node-id at
    # evaluation time, while this step writes under /mnt and nixos-install is
    # invoked with a flake (pure evaluation by default). Confirm the install-
    # time evaluation actually sees the file; otherwise the name below only
    # takes effect after a later rebuild on the installed system.
    echo "[iter-5.2] networking.hostName will be '$INJECTED_HOSTNAME' on first boot"
    echo "[iter-5.2] ssh access: ssh zeta@${INJECTED_HOSTNAME}.local"
  else
    echo "[iter-5.2] WARN: $HOSTNAME_FILE contains invalid hostname '$INJECTED_HOSTNAME'"
    echo "[iter-5.2] (must match RFC1123: alphanumeric + hyphens, 1-63 chars)"
    echo "[iter-5.2] falling back to flake default ($HOST)"
  fi
else
  echo "[iter-5.2] no zeta-hostname.txt on USB ESP"
  echo "[iter-5.2] using flake default hostname for #$HOST"
fi
echo

# ── Step 6.7: iter-5.1 wifi persistence (B-0792) ────────────────
#
# By the time this step runs, the live installer is already on the
# network — either via ethernet auto-DHCP (no profile to copy; this
# is a no-op) or via nmtui setup at first boot (`zeta-first-boot.sh`
# Step 2 launches nmtui when ethernet is absent; operator entered
# wifi creds once via TUI; NetworkManager wrote a .nmconnection
# profile to /etc/NetworkManager/system-connections/).
#
# Without this step, the freshly-installed system inherits the
# NixOS NetworkManager service but NOT the operator's connection
# profile. Result: wifi-only mini-PCs boot installed system,
# NetworkManager comes up with empty profile dir, no wifi, no SSH.
# The maintainer 2026-05-26: "we won't have ethernet for most
# machines it needs to remember the wifi on setup."
#
# Fix: copy *.nmconnection files from the live installer to /mnt.
# NetworkManager requires chmod 0600 + chown root:root on these
# files. sudo handles both during the cp.
echo
echo "[iter-5.1] ── checking for NetworkManager connection profiles to persist ──"
NM_SRC="/etc/NetworkManager/system-connections"
NM_DST="/mnt/etc/NetworkManager/system-connections"
NM_PROFILE_COUNT=0
if [ ! -d "$NM_SRC" ]; then
  echo "[iter-5.1] $NM_SRC does not exist; skipping wifi persistence (no harm; ethernet-DHCP works)"
else
  # List profiles with find rather than a glob: an empty directory simply
  # yields empty output, so no nullglob (or other shell option) is needed.
  NM_PROFILES=$(sudo find "$NM_SRC" -maxdepth 1 -name "*.nmconnection" -type f 2>/dev/null || true)
  if [ -z "$NM_PROFILES" ]; then
    echo "[iter-5.1] no .nmconnection profiles in $NM_SRC (ethernet-DHCP path; nothing to persist)"
  else
    NM_PROFILE_COUNT=$(echo "$NM_PROFILES" | wc -l | tr -d ' ')
    sudo mkdir -p "$NM_DST"
    sudo chmod 0700 "$NM_DST"
    # One profile path per line. Each copy is forced to root:root 0600,
    # because NetworkManager ignores profiles with looser permissions
    # ("permissions not strict enough" in journalctl).
    while read -r profile_path; do
      [ -n "$profile_path" ] || continue
      profile_name=$(basename "$profile_path")
      installed_path="$NM_DST/$profile_name"
      sudo cp -p "$profile_path" "$installed_path"
      sudo chown root:root "$installed_path"
      sudo chmod 0600 "$installed_path"
      # Report the SSID only — never the psk. Strip just the leading
      # 'ssid=' with sed so an SSID that itself contains '=' is shown in
      # full (splitting on '=' would cut it short).
      shown_ssid=$(sudo sed -n 's/^ssid=//p' "$installed_path" 2>/dev/null | head -1)
      if [ -z "$shown_ssid" ]; then
        shown_ssid="(unknown)"
      fi
      echo "[iter-5.1] persisted: $profile_name (ssid=$shown_ssid)"
    done <<< "$NM_PROFILES"
    echo "[iter-5.1] $NM_PROFILE_COUNT NetworkManager profile(s) persisted to installed system"
    echo "[iter-5.1] installed system will reconnect to wifi automatically on reboot"
  fi
fi
echo

echo "Running nixos-install --flake /mnt/etc/zeta/full-ai-cluster#$HOST ..."
sudo nixos-install --flake "/mnt/etc/zeta/full-ai-cluster#$HOST" --no-root-password

Expand Down
Loading