From 32629a02c84f03edcd67c03dd53efb2ab685ba7d Mon Sep 17 00:00:00 2001
From: David Cassany
Date: Wed, 7 Jul 2021 14:19:27 +0200
Subject: [PATCH 1/2] Make use of grub2 environment block

This commit sets the grub2 configuration to load the /grubenv file, if any,
and checks the `next_entry` variable to set the default grub2 option. This is
helpful to set the default boot entry in grub only for the next boot. From cOS
it is enough to run `grub2-editenv /oem/grubenv set next_entry=recovery` to
make the next reboot default to the recovery system.

This commit also adds a new docs page devoted to grub configuration.

Finally it also moves the wiki page devoted to a K3s and Fleet deployment
example into its own docs page.

Signed-off-by: David Cassany
---
 README.md                                     |   6 +-
 docs/configure_grub.md                        |  34 +++++
 .../k3s_and_fleet_on_vanilla_image_example.md | 140 ++++++++++++++++++
 packages/cos/collection.yaml                  |   6 +-
 packages/grub-config/config/grub.cfg          |  16 +-
 packages/grub-config/definition.yaml          |   2 +-
 packages/recovery-img/definition.yaml         |   2 +-
 packages/recovery-img/squash/definition.yaml  |   2 +-
 8 files changed, 200 insertions(+), 8 deletions(-)
 create mode 100644 docs/configure_grub.md
 create mode 100644 docs/k3s_and_fleet_on_vanilla_image_example.md

diff --git a/README.md b/README.md
index 32ebb72dcc6..733ec6338c7 100644
--- a/README.md
+++ b/README.md
@@ -90,13 +90,17 @@ If you are looking after only generating a container image that can be used for
 - [Sample repository](https://github.com/rancher-sandbox/cos-toolkit-sample-repo)
 - [EpinioOS sample repository](https://github.com/rancher-sandbox/epinio-appliance-demo-sample)
 - [Use Fleet to upgrade a cOS derivative](https://github.com/rancher-sandbox/cos-fleet-upgrades-sample)
-- [Deploy Fleet on a cOS vanilla image](https://github.com/rancher-sandbox/cOS-toolkit/wiki/K3s-and-Fleet-on-top-of-cOS-Vanilla-image)
+- [Deploy Fleet on a cOS vanilla image](/docs/k3s_and_fleet_on_vanilla_image_example.md)
 
 ### cOS development
 
 - [Development notes](/docs/dev.md)
 - [High Level architecture](/docs/high_level_architecture.md)
 - [Github project](https://github.com/mudler/cOS/projects/1) for a short-term Roadmap
 
+### Usage hints
+
+- [Grub2 default boot entry setup](/docs/configure_grub.md)
+
 ## License
 
 Copyright (c) 2020-2021 [SUSE, LLC](http://suse.com)
diff --git a/docs/configure_grub.md b/docs/configure_grub.md
new file mode 100644
index 00000000000..ffefe8fcf16
--- /dev/null
+++ b/docs/configure_grub.md
@@ -0,0 +1,34 @@
+# Grub2 default boot entry setup
+
+cOS (since v0.5.8) makes use of the grub2 environment block, which can be used
+to define persistent grub2 variables across reboots.
+
+The default grub configuration loads the `/grubenv` file from any available
+device and evaluates the `next_entry` and `saved_entry` variables. By default
+neither is set.
+
+The default boot entry is set to the value of `saved_entry`; if the variable
+is not set, grub just defaults to the first menu entry.
+
+The `next_entry` variable can be used to override the default boot entry for a
+single boot. If `next_entry` is set it is only used once: grub2 unsets it
+after reading it for the first time. This is helpful to define the menu entry
+to reboot into without making any permanent configuration change.
+
+Use the `grub2-editenv` command line utility to define the desired values.
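+
+To check what is currently stored in the environment block, or to clear a
+variable again, the standard `list` and `unset` subcommands of `grub2-editenv`
+can be used (a quick sketch against the same `/oem/grubenv` file used below):
+
+```bash
+> grub2-editenv /oem/grubenv list
+> grub2-editenv /oem/grubenv unset next_entry
+```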
+
+For instance use the following command to reboot into the recovery system
+only once:
+
+```bash
+> grub2-editenv /oem/grubenv set next_entry=recovery
+```
+
+Or to set the default entry to the `fallback` system:
+
+```bash
+> grub2-editenv /oem/grubenv set saved_entry=fallback
+```
+
+These examples make use of the `COS_OEM` device, however any device detected
+by grub2 that includes the file `/grubenv` could be used. The first match
+wins.
diff --git a/docs/k3s_and_fleet_on_vanilla_image_example.md b/docs/k3s_and_fleet_on_vanilla_image_example.md
new file mode 100644
index 00000000000..540acc8f40c
--- /dev/null
+++ b/docs/k3s_and_fleet_on_vanilla_image_example.md
@@ -0,0 +1,140 @@
+# K3s + Fleet on top of cOS Vanilla image
+
+This is a work in progress example of how to deploy K3s + Fleet + System
+Upgrade Controller over a cOS vanilla image using only `yip` yaml
+configuration files (cloud-init style). The config file reproduced here is
+meant to be included as user-data in a cloud provider (aws, gcp, azure, etc.)
+or as part of a cdrom (cOS-Recovery will try to fetch the `/userdata` file
+from a cdrom device).
+
+A vanilla image is an image that only provides the cOS-Recovery system on a
+`COS_RECOVERY` partition. It does not include any other system and is meant to
+be dumped to a bigger disk in order to deploy a cOS system, or a derivative
+system, over the free space on the disk. cOS vanilla images are built as part
+of the CI workflow; see the CI artifacts to download one.
+
+The configuration file of this example has two purposes: first it deploys
+cOS, second it reboots into the deployed OS and deploys K3s + Fleet + System
+Upgrade Controller.
+
+On first boot it will fail to boot the cOS grub menu entry and fall back to
+the cOS-Recovery system. From there it will partition the vanilla image to
+create the main system partition (`COS_STATE`) and add an extra partition for
+persistent data (`COS_PERSISTENT`). It will use the full disk; a disk of at
+least 20GiB is recommended. After partitioning it will deploy the main system
+on `COS_STATE` and reboot into it.
+
+On subsequent boots it will simply boot from `COS_STATE`; there it prepares
+the persistent areas of the system (arranges a few bind mounts inside
+`COS_PERSISTENT`) and then runs a standard installation of K3s, Fleet and
+System Upgrade Controller. A few minutes after the system is up, the K3s
+cluster is up and running.
+
+Note this setup is similar to the [derivative example](https://github.com/rancher-sandbox/cos-fleet-upgrades-sample) using Fleet.
+The main difference is that this example does not require building any image;
+it is purely based on cloud-init configuration.
+
+### User data configuration file
+```yaml
+name: "Default deployment"
+stages:
+  rootfs.after:
+  - if: '[ -f "/run/cos/recovery_mode" ]'
+    name: "Repart image"
+    layout:
+      # It will partition a device including the given filesystem label or part label (filesystem label matches first)
+      device:
+        label: COS_RECOVERY
+      add_partitions:
+      - fsLabel: COS_STATE
+        # 15Gb for COS_STATE, so the disk should have, at least, 20Gb
+        size: 15360
+        pLabel: state
+      - fsLabel: COS_PERSISTENT
+        # unset size or 0 size means all available space
+        pLabel: persistent
+  initramfs:
+  - name: "Set /etc/hosts"
+    files:
+    - path: /etc/hosts
+      content: |
+        127.0.0.1       localhost
+  - if: '[ ! -f "/run/cos/recovery_mode" ]'
-f "/run/cos/recovery_mode" ]' + name: "Persist" + commands: + - | + target=/usr/local/.cos-state + + # Always want the latest update of systemd conf from the image + # TODO: This might break the fallback system + mkdir -p "${target}/etc/systemd/" + rsync -av /etc/systemd/ "${target}/etc/systemd/" + + # Only populate ssh conf once + if [ ! -e "${target}/etc/ssh" ]; then + mkdir -p "${target}/etc/ssh/" + rsync -av /etc/ssh/ "${target}/etc/ssh/" + fi + + # undo /home /opt /root mount from cos immutable-rootfs module + # TODO: we could think of configuring custom overlay paths in + # immutable rootfs package. So this part could be omitted + for i in home opt root; do + sed -i "/overlay \/${i} /d" /etc/fstab + nsenter -m -t 1 -- umount "/sysroot/${i}" + done + + # setup directories as persistent + # TODO: would it make sense defining persistent state overlayfs mounts + # as part of the immutable rootfs config? + for i in root opt home var/lib/rancher var/lib/kubelet etc/systemd etc/rancher etc/ssh; do + mkdir -p "${target}/${i}" "/${i}" + echo "${target}/${i} /${i} none defaults,bind 0 0" >> /etc/fstab + nsenter -m -t 1 -- mount -o defaults,bind "/sysroot${target}/${i}" "/sysroot/${i}" + done + + # ensure /var/log/journal exists so it's labeled correctly + mkdir -p /var/log/journal + network.before: + - name: "Setup SSH keys" + authorized_keys: + root: + # It can download ssh key from remote places, such as github user keys (e.g. `github:my_user`) + - my_custom_ssh_key + - if: '[ ! -f "/run/cos/recovery_mode" ]' + name: "Fleet deployment" + files: + - path: /etc/k3s/manifests/fleet-config.yaml + content: | + apiVersion: helm.cattle.io/v1 + kind: HelmChart + metadata: + name: fleet-crd + namespace: kube-system + spec: + chart: https://github.com/rancher/fleet/releases/download/v0.3.3/fleet-crd-0.3.3.tgz + --- + apiVersion: helm.cattle.io/v1 + kind: HelmChart + metadata: + name: fleet + namespace: kube-system + spec: + chart: https://github.com/rancher/fleet/releases/download/v0.3.3/fleet-0.3.3.tgz + network: + - if: '[ -f "/run/cos/recovery_mode" ]' + name: "Deploy cos-system" + commands: + # Deploys the latest image available in default channel (quay.io/costoolkit/releases-opensuse) + # use --docker-image to deploy a custom image + # e.g. `cos-deploy --docker-image quay.io/my_custom_repo:my_image` + - cos-deploy && shutdown -r now + - if: '[ ! 
-f "/run/cos/recovery_mode" ]' + name: "Setup k3s" + directories: + - path: "/usr/local/bin" + permissions: 0755 + owner: 0 + group: 0 + commands: + - | + curl -sfL https://get.k3s.io | \ + INSTALL_K3S_VERSION="v1.20.4+k3s1" \ + INSTALL_K3S_EXEC="--tls-san {{.Values.node.hostname}}" \ + INSTALL_K3S_SELINUX_WARN="true" \ + sh - + # Install fleet + kubectl apply -f /etc/k3s/manifests/fleet-config.yaml + # Install system-upgrade-controller + kubectl apply -f https://raw.githubusercontent.com/rancher/system-upgrade-controller/v0.6.2/manifests/system-upgrade-controller.yaml +``` diff --git a/packages/cos/collection.yaml b/packages/cos/collection.yaml index e833e1a7540..32a33da2bb8 100644 --- a/packages/cos/collection.yaml +++ b/packages/cos/collection.yaml @@ -1,7 +1,7 @@ packages: - name: "cos" category: "system" - version: "0.5.7+5" + version: "0.5.8" description: "cOS base image, used to build cOS live ISOs" brand_name: "cOS" labels: @@ -9,7 +9,7 @@ packages: autobump.revbump_related: "recovery/cos-img recovery/cos-squash" - name: "cos" category: "recovery" - version: 0.5.7+5 + version: "0.5.8" brand_name: "cOS recovery" description: "cOS recovery image, used to boot cOS for troubleshooting" labels: @@ -17,7 +17,7 @@ packages: autobump.revbump_related: "recovery/cos-img recovery/cos-squash" - name: "cos-container" category: "system" - version: 0.5.7+6 + version: "0.5.8" brand_name: "cOS" description: "cOS container image, used to build cOS derivatives from scratch" labels: diff --git a/packages/grub-config/config/grub.cfg b/packages/grub-config/config/grub.cfg index 520c8b7def4..c3156de0619 100644 --- a/packages/grub-config/config/grub.cfg +++ b/packages/grub-config/config/grub.cfg @@ -1,5 +1,19 @@ set timeout=10 -set default="${saved_entry}" + +set env_file="/grubenv" +search --file --set=env_blk "${env_file}" + +if [ "${env_blk}" ] ; then + load_env -f "(${env_blk})${env_file}" +fi + +if [ "${next_entry}" ]; then + set default="${next_entry}" + set next_entry= + save_env -f "(${env_blk})${env_file}" next_entry +else + set default="${saved_entry}" +fi set fallback="0 1 2" set gfxmode=auto diff --git a/packages/grub-config/definition.yaml b/packages/grub-config/definition.yaml index fb46af1e36d..61413292931 100644 --- a/packages/grub-config/definition.yaml +++ b/packages/grub-config/definition.yaml @@ -1,3 +1,3 @@ name: "grub-config" category: "system" -version: 0.0.8+4 +version: 0.0.9 diff --git a/packages/recovery-img/definition.yaml b/packages/recovery-img/definition.yaml index bdf4c8b8764..4fd9a6625ed 100644 --- a/packages/recovery-img/definition.yaml +++ b/packages/recovery-img/definition.yaml @@ -1,4 +1,4 @@ name: "cos-img" category: "recovery" -version: 0.5.7+7 +version: "0.5.8" brand_name: "cOS" diff --git a/packages/recovery-img/squash/definition.yaml b/packages/recovery-img/squash/definition.yaml index b852e3d5c8c..5e48a8084ca 100644 --- a/packages/recovery-img/squash/definition.yaml +++ b/packages/recovery-img/squash/definition.yaml @@ -1,3 +1,3 @@ name: "cos-squash" category: "recovery" -version: "0.5.7+5" +version: "0.5.8" From 75f594112b478d6765a76b80fd8edf63c18721d9 Mon Sep 17 00:00:00 2001 From: David Cassany Date: Wed, 7 Jul 2021 22:45:37 +0200 Subject: [PATCH 2/2] Adapt tests to new grub config This commit refactors the tests to make use of `grub2-editenv` utility to configure the default grub boot entry. 
Signed-off-by: David Cassany
---
 tests/deploys-images/deploy_test.go |  2 +-
 tests/recovery/recovery_test.go     | 50 +++++++--------------
 tests/smoke/smoke_test.go           |  5 +--
 tests/sut/sut.go                    | 68 +++++++++++++++--------------
 4 files changed, 55 insertions(+), 70 deletions(-)

diff --git a/tests/deploys-images/deploy_test.go b/tests/deploys-images/deploy_test.go
index f5fc56f5a8f..a7d590be7e7 100644
--- a/tests/deploys-images/deploy_test.go
+++ b/tests/deploys-images/deploy_test.go
@@ -40,7 +40,7 @@ var _ = Describe("cOS Deploy tests", func() {
 			Expect(err).NotTo(HaveOccurred())
 		})
 		It("force deploys from recovery", func() {
-			err := s.ChangeBoot(sut.Recovery)
+			err := s.ChangeBootOnce(sut.Recovery)
 			Expect(err).ToNot(HaveOccurred())
 			s.Reboot()
 			ExpectWithOffset(1, s.BootFrom()).To(Equal(sut.Recovery))
diff --git a/tests/recovery/recovery_test.go b/tests/recovery/recovery_test.go
index d62fa959414..335dcacc484 100644
--- a/tests/recovery/recovery_test.go
+++ b/tests/recovery/recovery_test.go
@@ -15,18 +15,17 @@ var _ = Describe("cOS Recovery upgrade tests", func() {
 		s.EventuallyConnects()
 	})
 
-	AfterEach(func() {
-		if CurrentGinkgoTestDescription().Failed == false {
-			s.Reset()
-		}
-	})
-
 	Context("upgrading COS_ACTIVE from the recovery partition", func() {
+		AfterEach(func() {
+			if CurrentGinkgoTestDescription().Failed == false {
+				s.Reset()
+			}
+		})
 		It("upgrades to the latest", func() {
 			currentName := s.GetOSRelease("NAME")
 
 			By("booting into recovery to check the OS version")
-			err := s.ChangeBoot(sut.Recovery)
+			err := s.ChangeBootOnce(sut.Recovery)
 			Expect(err).ToNot(HaveOccurred())
 
 			s.Reboot()
@@ -42,14 +41,13 @@ var _ = Describe("cOS Recovery upgrade tests", func() {
 				Expect(currentName).To(Equal(recoveryName))
 			}
 
+			By("upgrade with CURRENT=active.img")
 			out, err := s.Command("CURRENT=active.img cos-upgrade")
 			Expect(err).ToNot(HaveOccurred())
 			Expect(out).Should(ContainSubstring("Upgrade done, now you might want to reboot"))
 			Expect(out).Should(ContainSubstring("Upgrading system"))
 
-			err = s.ChangeBoot(sut.Active)
-			Expect(err).ToNot(HaveOccurred())
-
+			By("Reboot to upgraded active")
 			s.Reboot()
 			ExpectWithOffset(1, s.BootFrom()).To(Equal(sut.Active))
 		})
@@ -85,11 +83,8 @@ var _ = Describe("cOS Recovery upgrade tests", func() {
 		})
 	})
 
+	// After these tests the VM is no longer in its initial state!!
Context("upgrading recovery", func() { - AfterEach(func() { - s.Reset() - }) - When("using specific images", func() { It("upgrades to a specific image and reset back to the installed version", func() { version := s.GetOSRelease("VERSION") @@ -100,7 +95,7 @@ var _ = Describe("cOS Recovery upgrade tests", func() { Expect(out).Should(ContainSubstring("Upgrading recovery partition")) By("booting into recovery to check the OS version") - err = s.ChangeBoot(sut.Recovery) + err = s.ChangeBootOnce(sut.Recovery) Expect(err).ToNot(HaveOccurred()) s.Reboot() @@ -111,39 +106,26 @@ var _ = Describe("cOS Recovery upgrade tests", func() { Expect(out).ToNot(Equal(version)) Expect(out).To(Equal("0.5.3\n")) - By("setting back to active and rebooting") - err = s.ChangeBoot(sut.Active) - Expect(err).ToNot(HaveOccurred()) - + By("rebooting back to active") s.Reboot() ExpectWithOffset(1, s.BootFrom()).To(Equal(sut.Active)) }) }) When("using upgrade channel", func() { - // TODO: This test cannot be enabled until we have in master a published version of cOS >=0.5.3 It("upgrades to latest image", func() { - By("upgrading recovery and reboot") + By("upgrading recovery") out, err := s.Command("cos-upgrade --no-verify --recovery") Expect(err).ToNot(HaveOccurred()) Expect(out).Should(ContainSubstring("Upgrade done, now you might want to reboot")) Expect(out).Should(ContainSubstring("Upgrading recovery partition")) - err = s.ChangeBoot(sut.Recovery) + By("Reboot to upgraded recovery") + err = s.ChangeBootOnce(sut.Recovery) Expect(err).ToNot(HaveOccurred()) - s.Reboot() - - By("checking recovery version") - out, err = s.Command("source /etc/os-release && echo $VERSION") - Expect(err).ToNot(HaveOccurred()) - Expect(out).ToNot(Equal("")) - Expect(out).ToNot(Equal("0.5.1\n")) - - By("switch back to active and reboot") - err = s.ChangeBoot(sut.Active) - Expect(err).ToNot(HaveOccurred()) - + ExpectWithOffset(1, s.BootFrom()).To(Equal(sut.Recovery)) + By("rebooting back to active") s.Reboot() ExpectWithOffset(1, s.BootFrom()).To(Equal(sut.Active)) }) diff --git a/tests/smoke/smoke_test.go b/tests/smoke/smoke_test.go index 5d6ad383471..7401539fbef 100644 --- a/tests/smoke/smoke_test.go +++ b/tests/smoke/smoke_test.go @@ -15,7 +15,7 @@ var _ = Describe("cOS Smoke tests", func() { Context("After install", func() { It("can boot into passive", func() { - err := s.ChangeBoot(sut.Passive) + err := s.ChangeBootOnce(sut.Passive) Expect(err).ToNot(HaveOccurred()) By("rebooting into passive") @@ -25,8 +25,7 @@ var _ = Describe("cOS Smoke tests", func() { _, err = s.Command("cat /run/cos/recovery_mode") Expect(err).To(HaveOccurred()) - By("switching back to active") - s.ChangeBoot(sut.Active) + By("reboot back to active") s.Reboot() Expect(s.BootFrom()).To(Equal(sut.Active)) }) diff --git a/tests/sut/sut.go b/tests/sut/sut.go index d185fa02135..2353ea8f0df 100644 --- a/tests/sut/sut.go +++ b/tests/sut/sut.go @@ -10,23 +10,15 @@ import ( "strings" "time" + . "github.com/onsi/ginkgo" . 
"github.com/onsi/gomega" "github.com/pkg/errors" ssh "golang.org/x/crypto/ssh" ) const ( - grubSwap = `dev=$(blkid -L COS_STATE); \ -mount -o rw,remount $dev && \ -mount $dev /boot/grub2 && \ -sed -i 's/set default=.*/set default=%s/' /boot/grub2/grub2/grub.cfg && \ -sync` - - grubSwapRecovery = ` -dev=$(blkid -L COS_STATE); mkdir /run/state; \ -mount $dev /run/state && \ -sed -i 's/set default=.*/set default=%s/' /run/state/grub2/grub.cfg -` + grubSwapOnce = "grub2-editenv /oem/grubenv set next_entry=%s" + grubSwap = "grub2-editenv /oem/grubenv set saved_entry=%s" Passive = 0 Active = iota @@ -75,35 +67,49 @@ func (s *SUT) ChangeBoot(b int) error { bootEntry = "recovery" } - if s.BootFrom() == Recovery { - _, err := s.command(fmt.Sprintf(grubSwapRecovery, bootEntry), false) - Expect(err).ToNot(HaveOccurred()) - } else { - _, err := s.command(fmt.Sprintf(grubSwap, bootEntry), false) - Expect(err).ToNot(HaveOccurred()) + _, err := s.command(fmt.Sprintf(grubSwap, bootEntry), false) + Expect(err).ToNot(HaveOccurred()) + + return nil +} + +func (s *SUT) ChangeBootOnce(b int) error { + + var bootEntry string + + switch b { + case Active: + bootEntry = "cos" + case Passive: + bootEntry = "fallback" + case Recovery: + bootEntry = "recovery" } + _, err := s.command(fmt.Sprintf(grubSwapOnce, bootEntry), false) + Expect(err).ToNot(HaveOccurred()) + return nil } // Reset runs reboots cOS into Recovery and runs cos-reset. // It will boot back the system from the Active partition afterwards func (s *SUT) Reset() { - err := s.ChangeBoot(Recovery) - Expect(err).ToNot(HaveOccurred()) - - s.Reboot() + if s.BootFrom() != Recovery { + By("Reboot to recovery before reset") + err := s.ChangeBootOnce(Recovery) + Expect(err).ToNot(HaveOccurred()) + s.Reboot() + Expect(s.BootFrom()).To(Equal(Recovery)) + } - Expect(s.BootFrom()).To(Equal(Recovery)) + By("Running cos-reset") out, err := s.command("cos-reset", false) Expect(err).ToNot(HaveOccurred()) Expect(out).Should(ContainSubstring("Installing")) - err = s.ChangeBoot(Active) - Expect(err).ToNot(HaveOccurred()) - + By("Reboot to active after cos-reset") s.Reboot() - ExpectWithOffset(1, s.BootFrom()).To(Equal(Active)) } @@ -129,7 +135,7 @@ func (s *SUT) SquashFSRecovery() bool { out, err := s.command("cat /proc/cmdline", false) ExpectWithOffset(1, err).ToNot(HaveOccurred()) - return strings.Contains(out,"rd.live.squashimg") + return strings.Contains(out, "rd.live.squashimg") } func (s *SUT) GetOSRelease(ss string) string { @@ -204,10 +210,10 @@ func (s *SUT) connectToHost(timeout bool) (*ssh.Client, error) { } // GatherLog will try to scp the given log from the machine to a local file -func (s SUT) GatherLog(logPath string) { +func (s SUT) GatherLog(logPath string) { fmt.Printf("Trying to get file: %s\n", logPath) clientConfig, _ := auth.PasswordKey(s.Username, s.Password, ssh.InsecureIgnoreHostKey()) - scpClient := scp.NewClientWithTimeout(s.Host, &clientConfig, 10 * time.Second) + scpClient := scp.NewClientWithTimeout(s.Host, &clientConfig, 10*time.Second) err := scpClient.Connect() if err != nil { @@ -226,7 +232,6 @@ func (s SUT) GatherLog(logPath string) { defer scpClient.Close() defer f.Close() - err = scpClient.CopyFromRemote(f, logPath) if err != nil { @@ -237,7 +242,6 @@ func (s SUT) GatherLog(logPath string) { _ = os.Chmod(fmt.Sprintf("logs/%s", baseName), 0666) fmt.Printf("File %s copied!\n", baseName) - } // DialWithDeadline Dials SSH with a deadline to avoid Read timeouts @@ -270,4 +274,4 @@ func DialWithDeadline(network string, addr string, config 
 		}
 	}()
 	return ssh.NewClient(c, chans, reqs), nil
-}
\ No newline at end of file
+}