diff --git a/flake.nix b/flake.nix index ff947f47a9..bf7e3e6c2a 100644 --- a/flake.nix +++ b/flake.nix @@ -73,10 +73,23 @@ ]; }; - # Future hosts land in PR 2 (per-host configs): - # control-plane = mkSystem { modules = [ ./infra/nixos/hosts/control-plane/configuration.nix ]; }; - # worker-gpu-01 = mkSystem { modules = [ ./infra/nixos/hosts/worker-gpu-01/configuration.nix ]; }; - # worker-gpu-02 = mkSystem { modules = [ ./infra/nixos/hosts/worker-gpu-02/configuration.nix ]; }; + control-plane = mkSystem { + modules = [ + ./infra/nixos/hosts/control-plane/configuration.nix + ]; + }; + + worker-gpu-01 = mkSystem { + modules = [ + ./infra/nixos/hosts/worker-gpu-01/configuration.nix + ]; + }; + + worker-gpu-02 = mkSystem { + modules = [ + ./infra/nixos/hosts/worker-gpu-02/configuration.nix + ]; + }; }; # Shared modules exposed as flake outputs so per-host configs can diff --git a/infra/nixos/hosts/control-plane/README.md b/infra/nixos/hosts/control-plane/README.md new file mode 100644 index 0000000000..028ded7092 --- /dev/null +++ b/infra/nixos/hosts/control-plane/README.md @@ -0,0 +1,58 @@ +# control-plane + +Zeta cluster control-plane node — runs K3S server + embedded etcd + +auto-bootstraps ArgoCD on first boot. + +## Install + +```bash +# From the live USB installer (built from this same flake). Partition the disk and mount the target root at /mnt FIRST, then clone onto it: +git clone https://github.com/Lucent-Financial-Group/Zeta /mnt/etc/zeta + +# Generate the hardware config for this machine and copy it into the checkout: +nixos-generate-config --root /mnt +cp /mnt/etc/nixos/hardware-configuration.nix \ + /mnt/etc/zeta/infra/nixos/hosts/control-plane/hardware-configuration.nix + +# Install: +nixos-install --flake /mnt/etc/zeta#control-plane + +# Reboot. K3S starts, applies bootstrap manifests, ArgoCD installs, +# root-application reconciles every other workload from this repo. 
+``` + +## Post-install verification + +```bash +ssh zeta@control-plane +sudo kubectl get nodes +sudo kubectl -n argocd get pods +sudo kubectl -n argocd get applications +``` + +## What it runs + +- K3S server with embedded etcd (`clusterInit = true`) +- ArgoCD (auto-applied on first boot via `services.k3s.manifests`) +- Root Application of Applications (reconciles `infra/k8s/applications/`) + +## What it does NOT run + +No AI workloads. Heavy compute lives on `worker-gpu-*` nodes. The +control-plane is intentionally small so a single-node failure doesn't +take down both the cluster API and the work. + +## Multi-control-plane HA (future) + +The current config uses `clusterInit = true` on a single server. To +add additional control-plane nodes for HA: + +1. Drop `clusterInit = true` on the second + third nodes. +2. Set `serverAddr = "https://control-plane.zeta.local:6443"` on them. +3. Share the K3S token across all three (sops-nix or agenix). + +## Hardware config + +The `hardware-configuration.nix` committed in this directory is a +placeholder stub that keeps the flake evaluating in CI. Replace it with +the per-machine file from `nixos-generate-config` during install. diff --git a/infra/nixos/hosts/control-plane/configuration.nix b/infra/nixos/hosts/control-plane/configuration.nix new file mode 100644 index 0000000000..f6ae642901 --- /dev/null +++ b/infra/nixos/hosts/control-plane/configuration.nix @@ -0,0 +1,52 @@ +# infra/nixos/hosts/control-plane/configuration.nix +# +# Zeta cluster control-plane node. Runs K3S server + embedded etcd, and +# auto-applies the bootstrap manifests that install ArgoCD and the +# root Application of Applications. +# +# After install, ArgoCD takes over and reconciles everything else from +# this same Git repo. The control-plane node is intentionally minimal — +# heavy AI workloads run on worker-gpu-* nodes. + +{ config, pkgs, lib, ... 
}: + +{ + imports = [ + # hardware-configuration.nix is per-machine and generated by + # `nixos-generate-config --root /mnt` on the target during install. + # Commit it alongside this file once the target machine is known + # and its hardware shouldn't drift. + ./hardware-configuration.nix + + # Shared baseline. + ../../modules/common.nix + + # K3S control-plane role. + ../../modules/k3s-server.nix + ]; + + # --------------------------------------------------------------------------- + # Identity + # --------------------------------------------------------------------------- + networking.hostName = "control-plane"; + # Static IP recommended for control-plane so worker-gpu-* nodes can + # join via a stable serverAddr. Per-site override here: + # networking.interfaces.eth0.ipv4.addresses = [{ + # address = "192.168.1.10"; + # prefixLength = 24; + # }]; + # networking.defaultGateway = "192.168.1.1"; + # networking.nameservers = [ "1.1.1.1" "9.9.9.9" ]; + + # --------------------------------------------------------------------------- + # SSH keys for the zeta admin user. Add maintainer keys here: + # --------------------------------------------------------------------------- + users.users.zeta.openssh.authorizedKeys.keys = [ + # "ssh-ed25519 AAAAC3Nz... aaron@zeta" + # "ssh-ed25519 AAAAC3Nz... addison@zeta" + ]; + + # --------------------------------------------------------------------------- + # Control-plane has no GPU; nothing to import from gpu.nix. + # --------------------------------------------------------------------------- +} diff --git a/infra/nixos/hosts/control-plane/hardware-configuration.nix b/infra/nixos/hosts/control-plane/hardware-configuration.nix new file mode 100644 index 0000000000..6203942451 --- /dev/null +++ b/infra/nixos/hosts/control-plane/hardware-configuration.nix @@ -0,0 +1,43 @@ +# infra/nixos/hosts/control-plane/hardware-configuration.nix +# +# PLACEHOLDER — replace during install on the actual target machine. 
+# +# ssh into the live installer: +# nixos-generate-config --root /mnt +# cp /mnt/etc/nixos/hardware-configuration.nix \ +# /mnt/etc/zeta/infra/nixos/hosts/control-plane/hardware-configuration.nix +# +# The generator writes real kernel modules, fileSystems, and swap +# devices (boot loader settings land in its configuration.nix, not here). This stub exists so `nix flake check` and +# `nix build .#nixosConfigurations.control-plane.config.system.build.toplevel` succeed in CI +# before the real machine is provisioned. + +{ config, lib, modulesPath, ... }: + +{ + imports = [ + (modulesPath + "/installer/scan/not-detected.nix") + ]; + + # Minimal valid stub. The file written by nixos-generate-config replaces all of this. + boot.initrd.availableKernelModules = [ "xhci_pci" "ahci" "nvme" "usb_storage" "sd_mod" ]; + boot.initrd.kernelModules = [ ]; + boot.kernelModules = [ ]; + boot.extraModulePackages = [ ]; + + # PLACEHOLDER by-label devices — the generated file replaces these with the target disk's real devices. + fileSystems."/" = lib.mkDefault { + device = "/dev/disk/by-label/nixos"; + fsType = "ext4"; + }; + + fileSystems."/boot" = lib.mkDefault { + device = "/dev/disk/by-label/boot"; + fsType = "vfat"; + }; + + swapDevices = lib.mkDefault [ ]; + + networking.useDHCP = lib.mkDefault true; + nixpkgs.hostPlatform = lib.mkDefault "x86_64-linux"; +} diff --git a/infra/nixos/hosts/worker-gpu-01/configuration.nix b/infra/nixos/hosts/worker-gpu-01/configuration.nix new file mode 100644 index 0000000000..a726fd92a6 --- /dev/null +++ b/infra/nixos/hosts/worker-gpu-01/configuration.nix @@ -0,0 +1,58 @@ +# infra/nixos/hosts/worker-gpu-01/configuration.nix +# +# Worker node #01 — runs GPU-accelerated AI workloads. Joins the K3S +# cluster as an agent and advertises its NVIDIA GPU(s) for pod +# scheduling via `nvidia.com/gpu` resource requests. + +{ config, pkgs, lib, ... 
}: + +{ + imports = [ + # Per-machine hardware config — the committed file is a placeholder stub; + # replace it with the real file generated by `nixos-generate-config`. + ./hardware-configuration.nix + + # Shared baseline. 
+ ../../modules/common.nix + + # K3S agent role. + ../../modules/k3s-agent.nix + + # NVIDIA driver + container toolkit + node labels. + ../../modules/gpu.nix + ]; + + # --------------------------------------------------------------------------- + # Identity + # --------------------------------------------------------------------------- + networking.hostName = "worker-gpu-01"; + + # --------------------------------------------------------------------------- + # K3S join — point at the control-plane and provide the cluster token. + # The token file should be sops-nix / agenix decrypted at boot in + # production; for initial bootstrap copy it manually after install. NOTE(review): with tokenFile commented out below, this file configures no token — confirm k3s-agent.nix supplies token/tokenFile, since the NixOS k3s module is expected to reject an agent without one. + # --------------------------------------------------------------------------- + services.k3s.serverAddr = "https://control-plane.zeta.local:6443"; + # services.k3s.tokenFile = config.sops.secrets.k3s-token.path; + + # --------------------------------------------------------------------------- + # Worker-specific node labels — exposed to the scheduler for placement. + # gpu.nix already adds zeta.io/gpu=nvidia; k3s-agent.nix adds + # zeta.io/role=worker. Add hardware-specific labels here, e.g.: + # zeta.io/gpu-model=rtx-4090 + # zeta.io/gpu-count=2 + # zeta.io/cpu-cores=32 + # --------------------------------------------------------------------------- + services.k3s.extraFlags = lib.mkAfter [ + # "--node-label=zeta.io/gpu-model=rtx-4090" + # "--node-label=zeta.io/gpu-count=2" + ]; + + # --------------------------------------------------------------------------- + # SSH keys for the zeta admin user. + # --------------------------------------------------------------------------- + users.users.zeta.openssh.authorizedKeys.keys = [ + # "ssh-ed25519 AAAAC3Nz... aaron@zeta" + # "ssh-ed25519 AAAAC3Nz... 
addison@zeta" + ]; +} diff --git a/infra/nixos/hosts/worker-gpu-01/hardware-configuration.nix b/infra/nixos/hosts/worker-gpu-01/hardware-configuration.nix new file mode 100644 index 0000000000..3dd8c47e95 --- /dev/null +++ b/infra/nixos/hosts/worker-gpu-01/hardware-configuration.nix @@ -0,0 +1,34 @@ +# infra/nixos/hosts/worker-gpu-01/hardware-configuration.nix +# +# PLACEHOLDER — replace during install on the actual target machine +# via `nixos-generate-config --root /mnt`. See +# infra/nixos/hosts/control-plane/hardware-configuration.nix for +# the same pattern. + +{ config, lib, modulesPath, ... }: + +{ + imports = [ + (modulesPath + "/installer/scan/not-detected.nix") + ]; + + boot.initrd.availableKernelModules = [ "xhci_pci" "ahci" "nvme" "usb_storage" "sd_mod" ]; + boot.initrd.kernelModules = [ ]; + boot.kernelModules = [ ]; + boot.extraModulePackages = [ ]; + + fileSystems."/" = lib.mkDefault { + device = "/dev/disk/by-label/nixos"; + fsType = "ext4"; + }; + + fileSystems."/boot" = lib.mkDefault { + device = "/dev/disk/by-label/boot"; + fsType = "vfat"; + }; + + swapDevices = lib.mkDefault [ ]; + + networking.useDHCP = lib.mkDefault true; + nixpkgs.hostPlatform = lib.mkDefault "x86_64-linux"; +} diff --git a/infra/nixos/hosts/worker-gpu-02/configuration.nix b/infra/nixos/hosts/worker-gpu-02/configuration.nix new file mode 100644 index 0000000000..89666eabc1 --- /dev/null +++ b/infra/nixos/hosts/worker-gpu-02/configuration.nix @@ -0,0 +1,31 @@ +# infra/nixos/hosts/worker-gpu-02/configuration.nix +# +# Worker node #02 — identical shape to worker-gpu-01, separate file +# so per-machine labels / hardware specifics stay declared per host. +# Add more workers as worker-gpu-03, -04, ... following this template. + +{ config, pkgs, lib, ... 
}: + +{ + imports = [ + ./hardware-configuration.nix + ../../modules/common.nix + ../../modules/k3s-agent.nix + ../../modules/gpu.nix + ]; + + networking.hostName = "worker-gpu-02"; + + services.k3s.serverAddr = "https://control-plane.zeta.local:6443"; + # services.k3s.tokenFile = config.sops.secrets.k3s-token.path; + + services.k3s.extraFlags = lib.mkAfter [ + # "--node-label=zeta.io/gpu-model=rtx-4090" + # "--node-label=zeta.io/gpu-count=1" + ]; + + users.users.zeta.openssh.authorizedKeys.keys = [ + # "ssh-ed25519 AAAAC3Nz... aaron@zeta" + # "ssh-ed25519 AAAAC3Nz... addison@zeta" + ]; +} diff --git a/infra/nixos/hosts/worker-gpu-02/hardware-configuration.nix b/infra/nixos/hosts/worker-gpu-02/hardware-configuration.nix new file mode 100644 index 0000000000..7d640057df --- /dev/null +++ b/infra/nixos/hosts/worker-gpu-02/hardware-configuration.nix @@ -0,0 +1,34 @@ +# infra/nixos/hosts/worker-gpu-02/hardware-configuration.nix +# +# PLACEHOLDER — replace during install on the actual target machine +# via `nixos-generate-config --root /mnt`. See +# infra/nixos/hosts/control-plane/hardware-configuration.nix for +# the same pattern. + +{ config, lib, modulesPath, ... }: + +{ + imports = [ + (modulesPath + "/installer/scan/not-detected.nix") + ]; + + boot.initrd.availableKernelModules = [ "xhci_pci" "ahci" "nvme" "usb_storage" "sd_mod" ]; + boot.initrd.kernelModules = [ ]; + boot.kernelModules = [ ]; + boot.extraModulePackages = [ ]; + + fileSystems."/" = lib.mkDefault { + device = "/dev/disk/by-label/nixos"; + fsType = "ext4"; + }; + + fileSystems."/boot" = lib.mkDefault { + device = "/dev/disk/by-label/boot"; + fsType = "vfat"; + }; + + swapDevices = lib.mkDefault [ ]; + + networking.useDHCP = lib.mkDefault true; + nixpkgs.hostPlatform = lib.mkDefault "x86_64-linux"; +}