diff --git a/data/data/bootstrap/baremetal/README.md b/data/data/bootstrap/baremetal/README.md new file mode 100644 index 00000000000..4baa852f3f0 --- /dev/null +++ b/data/data/bootstrap/baremetal/README.md @@ -0,0 +1,40 @@ +# Bare Metal IPI Bootstrap Assets + +The `baremetal` platform (IPI for Bare Metal hosts) includes some additional +assets on the bootstrap node for automating some infrastructure requirements +that would have normally been handled by some cloud infrastructure service. +The [Bare Metal IPI Networking Infrastructure design +document](../../../../docs/design/baremetal/networking-infrastructure.md) +covers the high-level background, and this document explains these +bootstrap assets in more detail. + +## API failover from bootstrap to control plane machines + +`keepalived` is used to manage the failover of a VIP (Virtual IP) for the API +server. This VIP first resides on the bootstrap VM. Once the master nodes come +up, the VIP will move to the control plane machines. + +Relevant files: +* **files/etc/keepalived/keepalived.conf.tmpl** - `keepalived` configuration + template +* **files/usr/local/bin/keepalived.sh** - This script runs before `keepalived` + starts and generates the `keepalived` configuration file from the template. +* **systemd/units/keepalived.service** - systemd unit file for `keepalived`. + This runs `keepalived.sh` to generate the proper configuration from the + template and then runs podman to launch `keepalived`. + +## Internal DNS + +The bootstrap assets relating to DNS automate as much of the DNS requirements +internal to the cluster as possible. + +TODO - explain how this works ... 
+ +Relevant files: +* files/etc/coredns/Corefile +* files/etc/keepalived/keepalived.conf.tmpl +* files/etc/dhcp/dhclient.conf +* files/usr/local/bin/fletcher8 +* files/usr/local/bin/get_vip_subnet_cidr +* files/usr/local/bin/coredns.sh +* systemd/units/coredns.service diff --git a/data/data/bootstrap/baremetal/files/etc/coredns/Corefile b/data/data/bootstrap/baremetal/files/etc/coredns/Corefile new file mode 100644 index 00000000000..d292b7ee4d2 --- /dev/null +++ b/data/data/bootstrap/baremetal/files/etc/coredns/Corefile @@ -0,0 +1,15 @@ +# Configuration template for the CoreDNS instance used to provide DNS +# resolution between nodes in the cluster. + +. { + errors + health + mdns {$CLUSTER_DOMAIN} {$NUM_DNS_MEMBERS} {$CLUSTER_NAME} + forward . /etc/coredns/resolv.conf + cache 30 + reload + hosts /etc/coredns/api-int.hosts {$CLUSTER_DOMAIN} { + {$API_VIP} api-int.{$CLUSTER_DOMAIN} + fallthrough + } +} diff --git a/data/data/bootstrap/baremetal/files/etc/dhcp/dhclient.conf b/data/data/bootstrap/baremetal/files/etc/dhcp/dhclient.conf new file mode 100644 index 00000000000..6bdd4b02e04 --- /dev/null +++ b/data/data/bootstrap/baremetal/files/etc/dhcp/dhclient.conf @@ -0,0 +1,4 @@ +# Specifies that the bootstrap node should use its own local DNS server for +# name resolution. + +prepend domain-name-servers 127.0.0.1; diff --git a/data/data/bootstrap/baremetal/files/etc/keepalived/keepalived.conf.tmpl b/data/data/bootstrap/baremetal/files/etc/keepalived/keepalived.conf.tmpl new file mode 100644 index 00000000000..d3b6ae5d1f6 --- /dev/null +++ b/data/data/bootstrap/baremetal/files/etc/keepalived/keepalived.conf.tmpl @@ -0,0 +1,35 @@ +# Configuration template for Keepalived, which is used to manage the DNS and +# API VIPs. 
+# +# For more information, see installer/data/data/bootstrap/baremetal/README.md +# + +vrrp_instance ${CLUSTER_NAME}_API { + state BACKUP + interface ${INTERFACE} + virtual_router_id ${API_VRID} + priority 50 + advert_int 1 + authentication { + auth_type PASS + auth_pass ${CLUSTER_NAME}_api_vip + } + virtual_ipaddress { + ${API_VIP}/${NET_MASK} + } +} + +vrrp_instance ${CLUSTER_NAME}_DNS { + state MASTER + interface ${INTERFACE} + virtual_router_id ${DNS_VRID} + priority 140 + advert_int 1 + authentication { + auth_type PASS + auth_pass ${CLUSTER_NAME}_dns_vip + } + virtual_ipaddress { + ${DNS_VIP}/${NET_MASK} + } +} diff --git a/data/data/bootstrap/baremetal/files/usr/local/bin/coredns.sh b/data/data/bootstrap/baremetal/files/usr/local/bin/coredns.sh new file mode 100755 index 00000000000..399628c92fb --- /dev/null +++ b/data/data/bootstrap/baremetal/files/usr/local/bin/coredns.sh @@ -0,0 +1,37 @@ +#!/usr/bin/env bash +set -e + +# Script to configure and run the CoreDNS instance used to provide DNS +# resolution between nodes in the cluster. + + +mkdir --parents /etc/keepalived + +API_DNS="$(sudo awk -F[/:] '/apiServerURL/ {print $5}' /opt/openshift/manifests/cluster-infrastructure-02-config.yml)" +CLUSTER_DOMAIN="${API_DNS#*.}" +read -r -d '.' -a CLUSTER_ARR <<< "$CLUSTER_DOMAIN" +CLUSTER_NAME=${CLUSTER_ARR[0]} +API_VIP="$(dig +noall +answer "$API_DNS" | awk '{print $NF}')" +DNS_VIP="$(dig +noall +answer "ns1.${CLUSTER_DOMAIN}" | awk '{print $NF}')" +grep -Ev "${DNS_VIP}|127.0.0.1" /etc/resolv.conf | tee /etc/coredns/resolv.conf +NUM_DNS_MEMBERS=$(grep -A 5 'controlPlane' /opt/openshift/manifests/cluster-config.yaml | awk '/replicas/ {print $2}') +export API_VIP CLUSTER_DOMAIN + +COREDNS_IMAGE="quay.io/openshift-metal3/coredns-mdns:latest" +if ! podman inspect "$COREDNS_IMAGE" &>/dev/null; then + echo "Pulling release image..." 
+ podman pull "$COREDNS_IMAGE" +fi +MATCHES="$(sudo podman ps -a --format "{{.Names}}" | awk '/coredns$/ {print $0}')" +if [[ -z "$MATCHES" ]]; then + /usr/bin/podman create \ + --name coredns \ + --volume /etc/coredns:/etc/coredns:z \ + --network host \ + --env CLUSTER_DOMAIN="$CLUSTER_DOMAIN" \ + --env CLUSTER_NAME="$CLUSTER_NAME" \ + --env NUM_DNS_MEMBERS="$NUM_DNS_MEMBERS" \ + --env API_VIP="$API_VIP" \ + "${COREDNS_IMAGE}" \ + --conf /etc/coredns/Corefile +fi diff --git a/data/data/bootstrap/baremetal/files/usr/local/bin/fletcher8 b/data/data/bootstrap/baremetal/files/usr/local/bin/fletcher8 new file mode 100755 index 00000000000..cf8ce0c9d97 --- /dev/null +++ b/data/data/bootstrap/baremetal/files/usr/local/bin/fletcher8 @@ -0,0 +1,14 @@ +#!/usr/libexec/platform-python + +# Script that uses the fletcher8 algorithm to generate a hash from an input +# string. This is used to generate VRRP ids for use with Keepalived. + +import sys + +data = map(ord, sys.argv[1]) +ckA = ckB = 0 + +for b in data: + ckA = (ckA + b) & 0xf + ckB = (ckB + ckA) & 0xf +print((ckB << 4) | ckA ) diff --git a/data/data/bootstrap/baremetal/files/usr/local/bin/get_vip_subnet_cidr b/data/data/bootstrap/baremetal/files/usr/local/bin/get_vip_subnet_cidr new file mode 100755 index 00000000000..12e5c6afc15 --- /dev/null +++ b/data/data/bootstrap/baremetal/files/usr/local/bin/get_vip_subnet_cidr @@ -0,0 +1,27 @@ +#!/usr/libexec/platform-python + +# Script to determine the network CIDR for a given VIP. 
+ +import sys +import socket +import struct + +vip = sys.argv[1] +iface_cidrs = sys.argv[2].split() +vip_int = struct.unpack("!I", socket.inet_aton(vip))[0] + +for iface_cidr in iface_cidrs: + ip, prefix = iface_cidr.split('/') + ip_int = struct.unpack("!I", socket.inet_aton(ip))[0] + prefix_int = int(prefix) + mask = int('1' * prefix_int + '0' * (32 - prefix_int), 2) + subnet_ip_int_min = ip_int & mask + subnet_ip = socket.inet_ntoa(struct.pack("!I", subnet_ip_int_min)) + subnet_ip_int_max = subnet_ip_int_min | int('1' * (32 - prefix_int), 2) + subnet_ip_max = socket.inet_ntoa(struct.pack("!I", subnet_ip_int_max)) + sys.stderr.write('Is %s between %s and %s\n' % (vip, subnet_ip, subnet_ip_max)) + if subnet_ip_int_min < vip_int < subnet_ip_int_max: + subnet_ip = socket.inet_ntoa(struct.pack("!I", subnet_ip_int_min)) + print('%s/%s' % (subnet_ip, prefix)) + sys.exit(0) +sys.exit(1) diff --git a/data/data/bootstrap/baremetal/files/usr/local/bin/keepalived.sh b/data/data/bootstrap/baremetal/files/usr/local/bin/keepalived.sh new file mode 100755 index 00000000000..02d84865142 --- /dev/null +++ b/data/data/bootstrap/baremetal/files/usr/local/bin/keepalived.sh @@ -0,0 +1,55 @@ +#!/usr/bin/env bash + +# +# For more information, see installer/data/data/bootstrap/baremetal/README.md +# + +set -e + +# Script to configure and run the Keepalived instance used to manage the DNS +# and API VIPs. + +mkdir --parents /etc/keepalived + +KEEPALIVED_IMAGE=quay.io/celebdor/keepalived:latest +if ! podman inspect "$KEEPALIVED_IMAGE" &>/dev/null; then + echo "Pulling release image..." + podman pull "$KEEPALIVED_IMAGE" +fi + +API_DNS="$(sudo awk -F[/:] '/apiServerURL/ {print $5}' /opt/openshift/manifests/cluster-infrastructure-02-config.yml)" +CLUSTER_NAME="$(awk -F. 
'{print $2}' <<< "$API_DNS")" +API_VIP="$(dig +noall +answer "$API_DNS" | awk '{print $NF}')" +IFACE_CIDRS="$(ip addr show | grep -v "scope host" | grep -Po 'inet \K[\d.]+/[\d.]+' | xargs)" +SUBNET_CIDR="$(/usr/local/bin/get_vip_subnet_cidr "$API_VIP" "$IFACE_CIDRS")" +NET_MASK="$(echo "$SUBNET_CIDR" | cut -d "/" -f 2)" +INTERFACE="$(ip -o addr show to "$SUBNET_CIDR" | head -n 1 | awk '{print $2}')" +CLUSTER_DOMAIN="${API_DNS#*.}" +DNS_VIP="$(dig +noall +answer "ns1.${CLUSTER_DOMAIN}" | awk '{print $NF}')" + +# Virtual Router IDs. They must be different and 8 bit in length +API_VRID=$(/usr/local/bin/fletcher8 "$CLUSTER_NAME-api") +DNS_VRID=$(/usr/local/bin/fletcher8 "$CLUSTER_NAME-dns") + +export API_VIP +export CLUSTER_NAME +export INTERFACE +export DNS_VIP +export API_VRID +export DNS_VRID +export NET_MASK +envsubst < /etc/keepalived/keepalived.conf.tmpl | sudo tee /etc/keepalived/keepalived.conf + +MATCHES="$(sudo podman ps -a --format "{{.Names}}" | awk '/keepalived$/ {print $0}')" +if [[ -z "$MATCHES" ]]; then + # TODO(bnemec): Figure out how to run with less perms + podman create \ + --name keepalived \ + --volume /etc/keepalived:/etc/keepalived:z \ + --network=host \ + --privileged \ + --cap-add=ALL \ + "${KEEPALIVED_IMAGE}" \ + /usr/sbin/keepalived -f /etc/keepalived/keepalived.conf \ + --dont-fork -D -l -P +fi diff --git a/data/data/bootstrap/baremetal/systemd/units/coredns.service b/data/data/bootstrap/baremetal/systemd/units/coredns.service new file mode 100644 index 00000000000..bee22874b2e --- /dev/null +++ b/data/data/bootstrap/baremetal/systemd/units/coredns.service @@ -0,0 +1,20 @@ +# Systemd service file used to start the bootstrap CoreDNS instance. 
+ +[Unit] +Description=Serve cluster DNS gathered from mDNS +Wants=network-online.target +After=network-online.target + +[Service] +WorkingDirectory=/etc/coredns +ExecStartPre=/usr/local/bin/coredns.sh +ExecStart=/usr/bin/podman start -a coredns +ExecStop=/usr/bin/podman stop -t 10 coredns +ConditionPathExists=!/etc/pivot/image-pullspec + +Restart=on-failure +RestartSec=5 +TimeoutStartSec=600 + +[Install] +WantedBy=multi-user.target diff --git a/data/data/bootstrap/baremetal/systemd/units/keepalived.service b/data/data/bootstrap/baremetal/systemd/units/keepalived.service new file mode 100644 index 00000000000..8e90f1968e4 --- /dev/null +++ b/data/data/bootstrap/baremetal/systemd/units/keepalived.service @@ -0,0 +1,23 @@ +# Systemd service file used to start the bootstrap Keepalived instance. +# +# For more information, see installer/data/data/bootstrap/baremetal/README.md +# + +[Unit] +Description=Manage node VIPs with keepalived +Wants=network-online.target +After=network-online.target + +[Service] +WorkingDirectory=/etc/keepalived +ExecStartPre=/usr/local/bin/keepalived.sh +ExecStart=/usr/bin/podman start -a keepalived +ExecStop=/usr/bin/podman stop -t 10 keepalived +ConditionPathExists=!/etc/pivot/image-pullspec + +Restart=on-failure +RestartSec=5 +TimeoutStartSec=600 + +[Install] +WantedBy=multi-user.target diff --git a/docs/design/baremetal/networking-infrastructure.md b/docs/design/baremetal/networking-infrastructure.md new file mode 100644 index 00000000000..53b6ca8e690 --- /dev/null +++ b/docs/design/baremetal/networking-infrastructure.md @@ -0,0 +1,47 @@ +# Bare Metal IPI Networking Infrastructure + +The `baremetal` platform (IPI for Bare Metal hosts) automates a number +of networking infrastructure requirements that are handled on other +platforms by cloud infrastructure services. 
+ +## Load-balanced control plane access + +Access to the Kubernetes API (port 6443) from clients both external +and internal to the cluster should be load-balanced across control +plane machines. + +Access to Ignition configs (port 22623) from clients within the +cluster should also be load-balanced across control plane machines. + +In both cases, the installation process expects these ports to be +reachable on the bootstrap VM at first and then later on the +newly-deployed control plane machines. + +On other platforms (for example, see [the AWS UPI +instructions](../../user/aws/install_upi.md)) an external +load-balancer is required to be configured in advance in order to +provide this access. + +In the `baremetal` platform, a VIP (Virtual IP) is used to provide +failover of the API server across the control plane machines +(including the bootstrap VM). This "API VIP" is provided by the user +as an `install-config.yaml` parameter and the installation process +configures `keepalived` to manage this VIP. + +The API VIP first resides on the bootstrap VM. The `keepalived` +instance here is managed by systemd and a script is used to generate +the `keepalived` configuration before launching the service using +`podman`. See [here](../../../data/data/bootstrap/baremetal/README.md) +for more information about the relevant bootstrap assets. + +Once the control plane machines come up, the VIP will move to one +of these machines. This happens because the `keepalived` instances on +control plane machines are configured (in `keepalived.conf`) with a +higher +[VRRP](https://en.wikipedia.org/wiki/Virtual_Router_Redundancy_Protocol) +priority. These `keepalived` instances are run as [static +pods](https://kubernetes.io/docs/tasks/administer-cluster/static-pod/) +and the relevant assets are [rendered by the Machine Config +Operator](https://github.com/openshift/machine-config-operator/pull/795). 
See +[here](FIXME: link to a README in MCO) for more information about +these assets. diff --git a/pkg/asset/ignition/bootstrap/bootstrap.go b/pkg/asset/ignition/bootstrap/bootstrap.go index c6f5a0d795c..a7b5b639d03 100644 --- a/pkg/asset/ignition/bootstrap/bootstrap.go +++ b/pkg/asset/ignition/bootstrap/bootstrap.go @@ -132,6 +132,29 @@ func (a *Bootstrap) Generate(dependencies asset.Parents) error { if err != nil { return err } + + // Check for optional platform specific files/units + platform := installConfig.Config.Platform.Name() + platformFilePath := fmt.Sprintf("bootstrap/%s/files", platform) + directory, err := data.Assets.Open(platformFilePath) + if err == nil { + directory.Close() + err = a.addStorageFiles("/", platformFilePath, templateData) + if err != nil { + return err + } + } + + platformUnitPath := fmt.Sprintf("bootstrap/%s/systemd/units", platform) + directory, err = data.Assets.Open(platformUnitPath) + if err == nil { + directory.Close() + err = a.addSystemdUnits(platformUnitPath, templateData) + if err != nil { + return err + } + } + a.addParentFiles(dependencies) a.Config.Passwd.Users = append( @@ -256,6 +279,9 @@ func (a *Bootstrap) addSystemdUnits(uri string, templateData *bootstrapTemplateD "chown-gatewayd-key.service": {}, "systemd-journal-gatewayd.socket": {}, "approve-csr.service": {}, + // baremetal platform services + "keepalived.service": {}, + "coredns.service": {}, } directory, err := data.Assets.Open(uri)