Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 17 additions & 4 deletions flake.nix
Original file line number Diff line number Diff line change
Expand Up @@ -73,10 +73,23 @@
];
};

# Future hosts land in PR 2 (per-host configs):
# control-plane = mkSystem { modules = [ ./infra/nixos/hosts/control-plane/configuration.nix ]; };
# worker-gpu-01 = mkSystem { modules = [ ./infra/nixos/hosts/worker-gpu-01/configuration.nix ]; };
# worker-gpu-02 = mkSystem { modules = [ ./infra/nixos/hosts/worker-gpu-02/configuration.nix ]; };
# Cluster control-plane host; its module list is just the per-host
# configuration under infra/nixos/hosts/control-plane/.
control-plane = mkSystem {
modules = [
./infra/nixos/hosts/control-plane/configuration.nix
];
};

# GPU worker #01; per-host configuration under
# infra/nixos/hosts/worker-gpu-01/.
worker-gpu-01 = mkSystem {
modules = [
./infra/nixos/hosts/worker-gpu-01/configuration.nix
];
};

# GPU worker #02; per-host configuration under
# infra/nixos/hosts/worker-gpu-02/.
worker-gpu-02 = mkSystem {
modules = [
./infra/nixos/hosts/worker-gpu-02/configuration.nix
];
};
};

# Shared modules exposed as flake outputs so per-host configs can
Expand Down
58 changes: 58 additions & 0 deletions infra/nixos/hosts/control-plane/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
# control-plane

Zeta cluster control-plane node — runs K3S server + embedded etcd +
auto-bootstraps ArgoCD on first boot.

## Install

```bash
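# From the live USB installer (built from this same flake).
# Partition the target disk and mount it at /mnt FIRST — cloning before
# the mount would leave the checkout hidden underneath the mountpoint.
git clone https://github.com/Lucent-Financial-Group/Zeta /mnt/etc/zeta

# Generate the per-machine hardware config and copy it into the checkout:
nixos-generate-config --root /mnt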
cp /mnt/etc/nixos/hardware-configuration.nix \
/mnt/etc/zeta/infra/nixos/hosts/control-plane/hardware-configuration.nix

# Install:
nixos-install --flake /mnt/etc/zeta#control-plane

# Reboot. K3S starts, applies bootstrap manifests, ArgoCD installs,
# root-application reconciles every other workload from this repo.
```

## Post-install verification

```bash
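# Prerequisite: at least one maintainer key must be listed in
# users.users.zeta.openssh.authorizedKeys.keys (configuration.nix)
# before install — the list ships empty, so key-based SSH fails otherwise.
ssh zeta@control-plane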
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Add credential bootstrap before SSH verification

Following this runbook verbatim on a fresh install will fail at the first verification step because ssh zeta@control-plane assumes remote auth is already configured, but this commit's host config leaves users.users.zeta.openssh.authorizedKeys.keys empty while the shared baseline uses key-only SSH and no initial password. In that state, operators cannot complete post-install verification remotely; add an explicit pre-SSH step to install a key (or set a password) before this command.

Useful? React with 👍 / 👎.

sudo kubectl get nodes
sudo kubectl -n argocd get pods
sudo kubectl -n argocd get applications
```

## What it runs

- K3S server with embedded etcd (`clusterInit = true`)
- ArgoCD (auto-applied on first boot via `services.k3s.manifests`)
- Root Application of Applications (reconciles `infra/k8s/applications/`)

## What it does NOT run

No AI workloads. Heavy compute lives on `worker-gpu-*` nodes. The
control-plane is intentionally small so a single-node failure doesn't
take down both the cluster API and the work.

## Multi-control-plane HA (future)

The current config uses `clusterInit = true` on a single server. To
add additional control-plane nodes for HA:

1. Drop `clusterInit = true` on the second + third nodes.
2. Set `serverAddr = "https://control-plane.zeta.local:6443"` on them.
3. Share the K3S token across all three (sops-nix or agenix).

## Hardware config

The `hardware-configuration.nix` committed in this directory is a
placeholder stub that keeps CI builds evaluating. Replace it during
install with the per-machine file written by `nixos-generate-config`.
52 changes: 52 additions & 0 deletions infra/nixos/hosts/control-plane/configuration.nix
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# infra/nixos/hosts/control-plane/configuration.nix
#
# Zeta cluster control-plane node. Runs K3S server + embedded etcd, and
# auto-applies the bootstrap manifests that install ArgoCD and the
# root Application of Applications.
#
# After install, ArgoCD takes over and reconciles everything else from
# this same Git repo. The control-plane node is intentionally minimal —
# heavy AI workloads run on worker-gpu-* nodes.

{ config, pkgs, lib, ... }:

let
  # Merged SSH key settings for the zeta admin user, read back from the
  # final configuration so keys added by any module are taken into account.
  zetaSshKeys = config.users.users.zeta.openssh.authorizedKeys;
in
{
  imports = [
    # hardware-configuration.nix is per-machine and generated by
    # `nixos-generate-config --root /mnt` on the target during install.
    # Commit it alongside this file once the target machine is known
    # and its hardware shouldn't drift.
    ./hardware-configuration.nix

    # Shared baseline.
    ../../modules/common.nix

    # K3S control-plane role.
    ../../modules/k3s-server.nix
  ];

  # ---------------------------------------------------------------------------
  # Identity
  # ---------------------------------------------------------------------------
  networking.hostName = "control-plane";
  # Static IP recommended for control-plane so worker-gpu-* nodes can
  # join via a stable serverAddr. Per-site override here:
  #   networking.interfaces.eth0.ipv4.addresses = [{
  #     address = "192.168.1.10";
  #     prefixLength = 24;
  #   }];
  #   networking.defaultGateway = "192.168.1.1";
  #   networking.nameservers = [ "1.1.1.1" "9.9.9.9" ];

  # ---------------------------------------------------------------------------
  # SSH keys for the zeta admin user. Add maintainer keys here:
  # ---------------------------------------------------------------------------
  users.users.zeta.openssh.authorizedKeys.keys = [
    # "ssh-ed25519 AAAAC3Nz... aaron@zeta"
    # "ssh-ed25519 AAAAC3Nz... addison@zeta"
  ];

  # Surface the "installed with no keys" case at evaluation time instead of
  # discovering it when `ssh zeta@control-plane` is refused. This is a
  # warning rather than an assertion so builds of the placeholder config
  # (no keys yet) keep succeeding.
  warnings = lib.optional
    (zetaSshKeys.keys == [ ] && zetaSshKeys.keyFiles == [ ])
    "control-plane: no SSH authorized keys are configured for user 'zeta'; key-based SSH login will fail until one is added to users.users.zeta.openssh.authorizedKeys.keys.";

  # ---------------------------------------------------------------------------
  # Control-plane has no GPU; nothing to import from gpu.nix.
  # ---------------------------------------------------------------------------
}
43 changes: 43 additions & 0 deletions infra/nixos/hosts/control-plane/hardware-configuration.nix
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
# infra/nixos/hosts/control-plane/hardware-configuration.nix
#
# PLACEHOLDER — replace during install on the actual target machine.
#
# ssh into the live installer:
# nixos-generate-config --root /mnt
# cp /mnt/etc/nixos/hardware-configuration.nix \
# /mnt/etc/zeta/infra/nixos/hosts/control-plane/hardware-configuration.nix
#
# The generator writes the real kernel modules, fileSystems, and swap
# devices for the target machine. This stub exists so `nix flake check`
# and `nix build .#nixosConfigurations.control-plane.config.system.build.toplevel`
# succeed in CI before the real machine is provisioned.

{ config, lib, modulesPath, ... }:

{
  imports = [
    (modulesPath + "/installer/scan/not-detected.nix")
  ];

  # Stub values only: the file produced by nixos-generate-config is copied
  # over this one during install, replacing everything below.
  boot = {
    initrd = {
      availableKernelModules = [ "xhci_pci" "ahci" "nvme" "usb_storage" "sd_mod" ];
      kernelModules = [ ];
    };
    kernelModules = [ ];
    extraModulePackages = [ ];
  };

  # Placeholder label-based devices, declared with mkDefault so any
  # ordinary definition elsewhere takes precedence over them.
  fileSystems = {
    "/" = lib.mkDefault {
      device = "/dev/disk/by-label/nixos";
      fsType = "ext4";
    };

    "/boot" = lib.mkDefault {
      device = "/dev/disk/by-label/boot";
      fsType = "vfat";
    };
  };

  swapDevices = lib.mkDefault [ ];

  networking.useDHCP = lib.mkDefault true;
  nixpkgs.hostPlatform = lib.mkDefault "x86_64-linux";
}
58 changes: 58 additions & 0 deletions infra/nixos/hosts/worker-gpu-01/configuration.nix
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
# infra/nixos/hosts/worker-gpu-01/configuration.nix
#
# Worker node #01 — runs GPU-accelerated AI workloads. Joins the K3S
# cluster as an agent and advertises its NVIDIA GPU(s) for pod
# scheduling via `nvidia.com/gpu` resource requests.

{ config, pkgs, lib, ... }:

let
  # Merged SSH key settings for the zeta admin user, read back from the
  # final configuration so keys added by any module are taken into account.
  zetaSshKeys = config.users.users.zeta.openssh.authorizedKeys;
in
{
  imports = [
    # Per-machine hardware config — see hardware-configuration.nix.example
    # for the template; real file generated by `nixos-generate-config`.
    ./hardware-configuration.nix

    # Shared baseline.
    ../../modules/common.nix

    # K3S agent role.
    ../../modules/k3s-agent.nix

    # NVIDIA driver + container toolkit + node labels.
    ../../modules/gpu.nix
  ];

  # ---------------------------------------------------------------------------
  # Identity
  # ---------------------------------------------------------------------------
  networking.hostName = "worker-gpu-01";

  # ---------------------------------------------------------------------------
  # K3S join — point at the control-plane and provide the cluster token.
  # The token file should be sops-nix / agenix decrypted at boot in
  # production; for initial bootstrap copy it manually after install.
  # NOTE(review): no tokenFile is set in this file; presumably
  # k3s-agent.nix supplies a default path — confirm before first join.
  # ---------------------------------------------------------------------------
  services.k3s.serverAddr = "https://control-plane.zeta.local:6443";
  # services.k3s.tokenFile = config.sops.secrets.k3s-token.path;

  # ---------------------------------------------------------------------------
  # Worker-specific node labels — exposed to the scheduler for placement.
  # gpu.nix already adds zeta.io/gpu=nvidia; k3s-agent.nix adds
  # zeta.io/role=worker. Add hardware-specific labels here, e.g.:
  #   zeta.io/gpu-model=rtx-4090
  #   zeta.io/gpu-count=2
  #   zeta.io/cpu-cores=32
  # mkAfter orders these flags after definitions from other modules.
  # ---------------------------------------------------------------------------
  services.k3s.extraFlags = lib.mkAfter [
    # "--node-label=zeta.io/gpu-model=rtx-4090"
    # "--node-label=zeta.io/gpu-count=2"
  ];

  # ---------------------------------------------------------------------------
  # SSH keys for the zeta admin user.
  # ---------------------------------------------------------------------------
  users.users.zeta.openssh.authorizedKeys.keys = [
    # "ssh-ed25519 AAAAC3Nz... aaron@zeta"
    # "ssh-ed25519 AAAAC3Nz... addison@zeta"
  ];

  # Surface the "installed with no keys" case at evaluation time instead of
  # discovering it when SSH to the node is refused. This is a warning
  # rather than an assertion so builds of the placeholder config (no keys
  # yet) keep succeeding.
  warnings = lib.optional
    (zetaSshKeys.keys == [ ] && zetaSshKeys.keyFiles == [ ])
    "worker-gpu-01: no SSH authorized keys are configured for user 'zeta'; key-based SSH login will fail until one is added to users.users.zeta.openssh.authorizedKeys.keys.";
}
34 changes: 34 additions & 0 deletions infra/nixos/hosts/worker-gpu-01/hardware-configuration.nix
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# infra/nixos/hosts/worker-gpu-01/hardware-configuration.nix
#
# PLACEHOLDER — replace during install on the actual target machine
# via `nixos-generate-config --root /mnt`. See
# infra/nixos/hosts/control-plane/hardware-configuration.nix for
# the same pattern.

{ config, lib, modulesPath, ... }:

{
  imports = [
    (modulesPath + "/installer/scan/not-detected.nix")
  ];

  # Stub kernel-module lists; the generated per-machine file replaces them.
  boot = {
    initrd = {
      availableKernelModules = [ "xhci_pci" "ahci" "nvme" "usb_storage" "sd_mod" ];
      kernelModules = [ ];
    };
    kernelModules = [ ];
    extraModulePackages = [ ];
  };

  # Placeholder label-based mounts at mkDefault priority.
  fileSystems = {
    "/" = lib.mkDefault {
      device = "/dev/disk/by-label/nixos";
      fsType = "ext4";
    };

    "/boot" = lib.mkDefault {
      device = "/dev/disk/by-label/boot";
      fsType = "vfat";
    };
  };

  swapDevices = lib.mkDefault [ ];

  networking.useDHCP = lib.mkDefault true;
  nixpkgs.hostPlatform = lib.mkDefault "x86_64-linux";
}
31 changes: 31 additions & 0 deletions infra/nixos/hosts/worker-gpu-02/configuration.nix
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# infra/nixos/hosts/worker-gpu-02/configuration.nix
#
# Worker node #02 — identical shape to worker-gpu-01, separate file
# so per-machine labels / hardware specifics stay declared per host.
# Add more workers as worker-gpu-03, -04, ... following this template.

{ config, pkgs, lib, ... }:

let
  # Merged SSH key settings for the zeta admin user, read back from the
  # final configuration so keys added by any module are taken into account.
  zetaSshKeys = config.users.users.zeta.openssh.authorizedKeys;
in
{
  imports = [
    # Per-machine hardware config (placeholder until generated on target).
    ./hardware-configuration.nix

    # Shared baseline.
    ../../modules/common.nix

    # K3S agent role.
    ../../modules/k3s-agent.nix

    # GPU support module.
    ../../modules/gpu.nix
  ];

  # ---------------------------------------------------------------------------
  # Identity
  # ---------------------------------------------------------------------------
  networking.hostName = "worker-gpu-02";

  # ---------------------------------------------------------------------------
  # K3S join — address of the control-plane API server.
  # NOTE(review): no tokenFile is set in this file; presumably
  # k3s-agent.nix supplies a default path — confirm before first join.
  # ---------------------------------------------------------------------------
  services.k3s.serverAddr = "https://control-plane.zeta.local:6443";
  # services.k3s.tokenFile = config.sops.secrets.k3s-token.path;

  # ---------------------------------------------------------------------------
  # Hardware-specific node labels for this worker. mkAfter orders these
  # flags after definitions from other modules.
  # ---------------------------------------------------------------------------
  services.k3s.extraFlags = lib.mkAfter [
    # "--node-label=zeta.io/gpu-model=rtx-4090"
    # "--node-label=zeta.io/gpu-count=1"
  ];

  # ---------------------------------------------------------------------------
  # SSH keys for the zeta admin user.
  # ---------------------------------------------------------------------------
  users.users.zeta.openssh.authorizedKeys.keys = [
    # "ssh-ed25519 AAAAC3Nz... aaron@zeta"
    # "ssh-ed25519 AAAAC3Nz... addison@zeta"
  ];

  # Surface the "installed with no keys" case at evaluation time instead of
  # discovering it when SSH to the node is refused. This is a warning
  # rather than an assertion so builds of the placeholder config (no keys
  # yet) keep succeeding.
  warnings = lib.optional
    (zetaSshKeys.keys == [ ] && zetaSshKeys.keyFiles == [ ])
    "worker-gpu-02: no SSH authorized keys are configured for user 'zeta'; key-based SSH login will fail until one is added to users.users.zeta.openssh.authorizedKeys.keys.";
}
34 changes: 34 additions & 0 deletions infra/nixos/hosts/worker-gpu-02/hardware-configuration.nix
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# infra/nixos/hosts/worker-gpu-02/hardware-configuration.nix
#
# PLACEHOLDER — replace during install on the actual target machine
# via `nixos-generate-config --root /mnt`. See
# infra/nixos/hosts/control-plane/hardware-configuration.nix for
# the same pattern.

{ config, lib, modulesPath, ... }:

{
  imports = [
    (modulesPath + "/installer/scan/not-detected.nix")
  ];

  # Stub kernel-module lists; the generated per-machine file replaces them.
  boot = {
    initrd = {
      availableKernelModules = [ "xhci_pci" "ahci" "nvme" "usb_storage" "sd_mod" ];
      kernelModules = [ ];
    };
    kernelModules = [ ];
    extraModulePackages = [ ];
  };

  # Placeholder label-based mounts at mkDefault priority.
  fileSystems = {
    "/" = lib.mkDefault {
      device = "/dev/disk/by-label/nixos";
      fsType = "ext4";
    };

    "/boot" = lib.mkDefault {
      device = "/dev/disk/by-label/boot";
      fsType = "vfat";
    };
  };

  swapDevices = lib.mkDefault [ ];

  networking.useDHCP = lib.mkDefault true;
  nixpkgs.hostPlatform = lib.mkDefault "x86_64-linux";
}
Loading