From c4c2dc296eb53927b730ee7c2eb3938729024180 Mon Sep 17 00:00:00 2001 From: Ben Cressey Date: Thu, 18 Nov 2021 04:05:01 +0000 Subject: [PATCH 1/5] systemd: exclude default repart service The repart service is a oneshot, so we can't use a drop-in to replace the `ExecStart` command that runs by default, which attempts to add or grow defined partitions on the same device as the root filesystem. This is OK for "unified" images, where the data partition is at the end of the device, but not OK for "split" images, where it lives on a different device. Signed-off-by: Ben Cressey --- packages/systemd/systemd.spec | 1 + 1 file changed, 1 insertion(+) diff --git a/packages/systemd/systemd.spec b/packages/systemd/systemd.spec index bf1efe6ab4c..7896b77dc6f 100644 --- a/packages/systemd/systemd.spec +++ b/packages/systemd/systemd.spec @@ -326,6 +326,7 @@ rm -f %{buildroot}%{_cross_libdir}/systemd/{system,user}/graphical.target %exclude %{_cross_unitdir}/systemd-ask-password-console.path %exclude %{_cross_unitdir}/systemd-ask-password-wall.path %exclude %{_cross_unitdir}/systemd-oomd.service +%exclude %{_cross_unitdir}/systemd-repart.service %exclude %{_cross_unitdir}/sysinit.target.wants/systemd-ask-password-console.path %exclude %{_cross_unitdir}/multi-user.target.wants/systemd-ask-password-wall.path From 067671a1c2372871bcad56660920859b2e685985 Mon Sep 17 00:00:00 2001 From: Ben Cressey Date: Thu, 18 Nov 2021 03:09:57 +0000 Subject: [PATCH 2/5] release: use systemd-repart for partition resizing Switch from our `growpart` tool to `systemd-repart` to resize the data partition, so that resizing also works for a unified root+data image. `growpart` uses the `gptman` crate, which calls the BLKRRPART ioctl to tell the kernel to re-read the partition table. This call fails if the device contains mounted partitions. `systemd-repart` uses the newer BLKPG ioctl, which manipulates the kernel's view of individual partitions. This works even if the root filesystem is present on the same device and already mounted. 
It also avoids the need to handle the partition symlink going away and coming back, since udev does not get the change event that triggers this. The two tools differ in how much free space is left on the device after the last partition is resized. `growpart` ends the partition one sector before the last 1 MiB boundary, while `systemd-repart` ends it just before the GPT label. Both tools run on every boot. To avoid problems on downgrade after a newer release resizes the data filesystem beyond where the older release will end the partition, we constrain `systemd-repart` to leave the older number of free sectors. Since `/local` can be mounted during the resize operation, we can use a real mount unit for it, which greatly simplifies the dependencies, and allows us to decouple the "prepare" logic from "resize" logic. Signed-off-by: Ben Cressey --- packages/release/local.mount | 14 ++++++++ packages/release/prepare-local.service | 42 ++++++---------------- packages/release/release-repart-local.conf | 12 +++++++ packages/release/release.spec | 16 +++++++-- packages/release/repart-local.service | 29 +++++++++++++++ 5 files changed, 78 insertions(+), 35 deletions(-) create mode 100644 packages/release/local.mount create mode 100644 packages/release/release-repart-local.conf create mode 100644 packages/release/repart-local.service diff --git a/packages/release/local.mount b/packages/release/local.mount new file mode 100644 index 00000000000..0d1c468bac3 --- /dev/null +++ b/packages/release/local.mount @@ -0,0 +1,14 @@ +[Unit] +Description=Local Directory (/local) +DefaultDependencies=no +Conflicts=umount.target +Before=local-fs.target umount.target + +[Mount] +What=/dev/disk/by-partlabel/BOTTLEROCKET-DATA +Where=/local +Type=ext4 +Options=defaults,noatime,nosuid,nodev + +[Install] +WantedBy=preconfigured.target diff --git a/packages/release/prepare-local.service b/packages/release/prepare-local.service index a41f79973a5..167873d9c50 100644 --- 
a/packages/release/prepare-local.service +++ b/packages/release/prepare-local.service @@ -1,44 +1,20 @@ [Unit] Description=Prepare Local Directory (/local) DefaultDependencies=no - -# We need udev to create /dev/disk/by-partlabel/BOTTLEROCKET-DATA first. -Wants=dev-disk-by\x2dpartlabel-BOTTLEROCKET\x2dDATA.device -After=dev-disk-by\x2dpartlabel-BOTTLEROCKET\x2dDATA.device +RequiresMountsFor=/local [Service] Type=oneshot -Environment=BOTTLEROCKET_DATA=/dev/disk/by-partlabel/BOTTLEROCKET-DATA Environment=LOCAL_DIR=/local -# To "grow" the partition, we delete it and recreate it at the larger size, then -# write it back to the device. udevd observes the write via inotify, and tells -# the kernel to reload the partition table. This causes the partition link to be -# deleted and then recreated. -ExecStart=/usr/sbin/growpart ${BOTTLEROCKET_DATA} - -# If the GPT label was not already at the end of the disk, the first pass will -# write it there, but any additional sectors beyond the original position were -# not included in the resized partition. Now that the kernel has reloaded the -# partition table, the second pass can find and use those sectors. -ExecStart=/usr/sbin/growpart ${BOTTLEROCKET_DATA} - -# The above note means we can't have a "normal" mount unit here, because it would -# depend on the link, and would immediately transition to the failed state when the -# link is removed. systemd will create local.mount for us as a side effect. -ExecStart=/usr/bin/mount \ - -o defaults,noatime,nosuid,nodev \ - ${BOTTLEROCKET_DATA} ${LOCAL_DIR} - -# After the mount is active, we grow the filesystem to fill the resized partition, -# and ensure that it has the directories we need for subsequent mounts. -ExecStart=/usr/lib/systemd/systemd-growfs ${LOCAL_DIR} +# Create the directories we need for our bind mounts. 
ExecStart=/usr/bin/mkdir -p ${LOCAL_DIR}/var ${LOCAL_DIR}/opt ${LOCAL_DIR}/mnt # Create the directories we need to set up a read-write overlayfs for the kernel -# development sources and the kernel modules -ExecStart=/usr/bin/rm -rf ${LOCAL_DIR}/var/lib/kernel-devel \ - %{LOCAL_DIR}/var/lib/kernel-modules +# development sources and kernel modules. +ExecStart=/usr/bin/rm -rf \ + ${LOCAL_DIR}/var/lib/kernel-devel \ + ${LOCAL_DIR}/var/lib/kernel-modules ExecStart=/usr/bin/mkdir -p \ ${LOCAL_DIR}/var/lib/kernel-devel/.overlay/lower \ ${LOCAL_DIR}/var/lib/kernel-devel/.overlay/upper \ @@ -48,11 +24,13 @@ ExecStart=/usr/bin/mkdir -p \ # Create the directories we need to set up a read-write overlayfs for any CNI # plugin binaries. -ExecStart=/usr/bin/rm -rf ${LOCAL_DIR}/opt/cni ${LOCAL_DIR}/var/lib/cni-plugins +ExecStart=/usr/bin/rm -rf \ + ${LOCAL_DIR}/opt/cni \ + ${LOCAL_DIR}/var/lib/cni-plugins ExecStart=/usr/bin/mkdir -p \ ${LOCAL_DIR}/opt/cni/bin \ ${LOCAL_DIR}/var/lib/cni-plugins/.overlay/upper \ - ${LOCAL_DIR}/var/lib/cni-plugins/.overlay/work \ + ${LOCAL_DIR}/var/lib/cni-plugins/.overlay/work RemainAfterExit=true StandardError=journal+console diff --git a/packages/release/release-repart-local.conf b/packages/release/release-repart-local.conf new file mode 100644 index 00000000000..c026191367a --- /dev/null +++ b/packages/release/release-repart-local.conf @@ -0,0 +1,12 @@ +[Partition] +# This is the partition type UUID for BOTTLEROCKET-DATA, which will be resized +# to fill the remaining sectors on the disk where it resides. +Type=626f7474-6c65-6474-6861-726d61726b73 + +# We want the partition to end on the last 1 MiB boundary before the end of +# the disk, to match the historical implementation. Assuming the disk itself is +# an even multiple of MiBs in size, and using 512 byte sectors as an example, +# we need 33 sectors for the GPT label in the last MiB, and therefore want 2015 +# sectors left, or 1031680 bytes. 
The repart tool expects a multiple of 4096, +# which is (1031680 - (1031680 % 4096)), or 1028096 bytes. +PaddingMinBytes=1028096 diff --git a/packages/release/release.spec b/packages/release/release.spec index 6cfb4c7e5c8..e1a38b9f65c 100644 --- a/packages/release/release.spec +++ b/packages/release/release.spec @@ -7,6 +7,7 @@ Summary: Bottlerocket release License: Apache-2.0 OR MIT Source11: nsswitch.conf +Source96: release-repart-local.conf Source97: release-sysctl.conf Source98: release-systemd-system.conf Source99: release-tmpfiles.conf @@ -31,6 +32,7 @@ Source1008: var-lib-bottlerocket.mount Source1009: etc-cni.mount Source1010: mnt.mount Source1012: opt-cni-bin.mount +Source1013: local.mount # CD-ROM mount & associated udev rules Source1015: media-cdrom.mount @@ -45,6 +47,7 @@ Source1023: lib-modules.mount.in # Mounts that require helper programs Source1040: prepare-boot.service Source1041: prepare-local.service +Source1042: repart-local.service # Services for kdump support Source1060: capture-kernel-dump.service @@ -101,6 +104,9 @@ install -p -m 0644 %{S:11} %{buildroot}%{_cross_factorydir}%{_cross_sysconfdir} install -d %{buildroot}%{_cross_factorydir}%{_cross_sysconfdir}/wicked/ifconfig install -p -m 0644 %{S:1000} %{buildroot}%{_cross_factorydir}%{_cross_sysconfdir}/wicked/ifconfig +install -d %{buildroot}%{_cross_libdir}/repart.d +install -p -m 0644 %{S:96} %{buildroot}%{_cross_libdir}/repart.d/80-local.conf + install -d %{buildroot}%{_cross_sysctldir} install -p -m 0644 %{S:97} %{buildroot}%{_cross_sysctldir}/80-release.conf @@ -117,9 +123,9 @@ EOF install -d %{buildroot}%{_cross_unitdir} install -p -m 0644 \ - %{S:1001} %{S:1002} %{S:1003} %{S:1004} %{S:1005} \ - %{S:1006} %{S:1007} %{S:1008} %{S:1009} %{S:1010} %{S:1011} %{S:1012} \ - %{S:1015} %{S:1040} %{S:1041} %{S:1060} %{S:1061} %{S:1062} %{S:1080} \ + %{S:1001} %{S:1002} %{S:1003} %{S:1004} %{S:1005} %{S:1006} %{S:1007} \ + %{S:1008} %{S:1009} %{S:1010} %{S:1011} %{S:1012} %{S:1013} %{S:1015} \ 
+ %{S:1040} %{S:1041} %{S:1042} %{S:1060} %{S:1061} %{S:1062} %{S:1080} \ %{buildroot}%{_cross_unitdir} install -d %{buildroot}%{_cross_unitdir}/systemd-tmpfiles-setup.service.d @@ -162,6 +168,8 @@ ln -s %{_cross_unitdir}/preconfigured.target %{buildroot}%{_cross_unitdir}/defau %{_cross_sysctldir}/80-release.conf %{_cross_tmpfilesdir}/release.conf %{_cross_libdir}/os-release +%dir %{_cross_libdir}/repart.d +%{_cross_libdir}/repart.d/80-local.conf %{_cross_libdir}/systemd/system.conf.d/80-release.conf %{_cross_unitdir}/configured.target %{_cross_unitdir}/preconfigured.target @@ -174,12 +182,14 @@ ln -s %{_cross_unitdir}/preconfigured.target %{buildroot}%{_cross_unitdir}/defau %{_cross_unitdir}/load-crash-kernel.service %{_cross_unitdir}/prepare-boot.service %{_cross_unitdir}/prepare-local.service +%{_cross_unitdir}/repart-local.service %{_cross_unitdir}/var.mount %{_cross_unitdir}/opt.mount %{_cross_unitdir}/mnt.mount %{_cross_unitdir}/etc-cni.mount %{_cross_unitdir}/opt-cni-bin.mount %{_cross_unitdir}/media-cdrom.mount +%{_cross_unitdir}/local.mount %{_cross_unitdir}/*-lower.mount %{_cross_unitdir}/*-kernels.mount %{_cross_unitdir}/*-licenses.mount diff --git a/packages/release/repart-local.service b/packages/release/repart-local.service new file mode 100644 index 00000000000..bbcb18f9214 --- /dev/null +++ b/packages/release/repart-local.service @@ -0,0 +1,29 @@ +[Unit] +Description=Resize Data Partition +DefaultDependencies=no +Conflicts=shutdown.target +Wants=dev-disk-by\x2dpartlabel-BOTTLEROCKET\x2dDATA.device +After=dev-disk-by\x2dpartlabel-BOTTLEROCKET\x2dDATA.device + +# Ensure the device is mounted first, to avoid racing with the unit that tries +# to mount it since the symlink can disappear if the partition is resized. +RequiresMountsFor=/local + +[Service] +Type=oneshot + +# Resize the partition, whether or not it resides on the same disk as /. 
+ExecStart=/usr/bin/systemd-repart --dry-run=no /dev/disk/by-partlabel/BOTTLEROCKET-DATA + +# Grow the filesystem to fill the partition. Doing this in another unit could +# introduce a race if the underlying block device is not ready after resizing. +ExecStart=/usr/lib/systemd/systemd-growfs /local + +RemainAfterExit=true +StandardError=journal+console + +# systemd-repart returns 77 if there's no existing GPT partition table +SuccessExitStatus=77 + +[Install] +WantedBy=local-fs.target From 6f2c17347832bed0e2895e386a560265abe3dbc0 Mon Sep 17 00:00:00 2001 From: Ben Cressey Date: Thu, 18 Nov 2021 00:06:32 +0000 Subject: [PATCH 3/5] build: add support for building unified images For some targets such as bare metal systems, the requirement for a separate block device to hold the data partition is unworkable. Implement a "unified" image layout, which places the data partition after the final OS partition, and is suitable for targets which may only have one disk. The old "split" layout remains the default. 
Signed-off-by: Ben Cressey --- Dockerfile | 8 ++-- tools/buildsys/src/builder.rs | 10 ++++- tools/buildsys/src/manifest.rs | 28 +++++++++++-- tools/partyplanner | 35 ++++++++++++---- tools/rpm2img | 73 ++++++++++++++++++++++++++-------- 5 files changed, 122 insertions(+), 32 deletions(-) diff --git a/Dockerfile b/Dockerfile index 9618fd58a91..e769379b5cb 100644 --- a/Dockerfile +++ b/Dockerfile @@ -166,6 +166,7 @@ ARG IMAGE_NAME ARG IMAGE_FORMAT ARG OS_IMAGE_SIZE_GIB ARG DATA_IMAGE_SIZE_GIB +ARG PARTITION_PLAN ARG KERNEL_PARAMETERS ENV VARIANT=${VARIANT} VERSION_ID=${VERSION_ID} BUILD_ID=${BUILD_ID} \ PRETTY_NAME=${PRETTY_NAME} IMAGE_NAME=${IMAGE_NAME} \ @@ -177,9 +178,10 @@ RUN --mount=target=/host \ /host/tools/rpm2img \ --package-dir=/local/rpms \ --output-dir=/local/output \ - --output-fmt=${IMAGE_FORMAT} \ - --os-image-size-gib=${OS_IMAGE_SIZE_GIB} \ - --data-image-size-gib=${DATA_IMAGE_SIZE_GIB} \ + --output-fmt="${IMAGE_FORMAT}" \ + --os-image-size-gib="${OS_IMAGE_SIZE_GIB}" \ + --data-image-size-gib="${DATA_IMAGE_SIZE_GIB}" \ + --partition-plan="${PARTITION_PLAN}" \ && echo ${NOCACHE} # =^..^= =^..^= =^..^= =^..^= =^..^= =^..^= =^..^= =^..^= =^..^= =^..^= =^..^= =^..^= =^..^= diff --git a/tools/buildsys/src/builder.rs b/tools/buildsys/src/builder.rs index a83661eb9fd..293bd561302 100644 --- a/tools/buildsys/src/builder.rs +++ b/tools/buildsys/src/builder.rs @@ -21,7 +21,7 @@ use std::path::{Path, PathBuf}; use std::process::Output; use walkdir::{DirEntry, WalkDir}; -use crate::manifest::{ImageFormat, ImageLayout, SupportedArch}; +use crate::manifest::{ImageFormat, ImageLayout, PartitionPlan, SupportedArch}; /* There's a bug in BuildKit that can lead to a build failure during parallel @@ -131,6 +131,7 @@ impl VariantBuilder { let ImageLayout { os_image_size_gib, data_image_size_gib, + partition_plan, } = image_layout; let mut args = Vec::new(); @@ -152,6 +153,13 @@ impl VariantBuilder { ); args.build_arg("OS_IMAGE_SIZE_GIB", format!("{}", 
os_image_size_gib)); args.build_arg("DATA_IMAGE_SIZE_GIB", format!("{}", data_image_size_gib)); + args.build_arg( + "PARTITION_PLAN", + match partition_plan { + PartitionPlan::Split => "split", + PartitionPlan::Unified => "unified", + }, + ); args.build_arg( "KERNEL_PARAMETERS", kernel_parameters diff --git a/tools/buildsys/src/manifest.rs b/tools/buildsys/src/manifest.rs index e4c1ba485a8..b9a4f332068 100644 --- a/tools/buildsys/src/manifest.rs +++ b/tools/buildsys/src/manifest.rs @@ -86,10 +86,15 @@ The full size will be used for the single data partition, except for the 2 MiB overhead for the GPT labels and partition alignment. The data partition will be automatically resized to fill the disk on boot, so it is usually not necessary to increase this value. + +`partition-plan` is the desired strategy for image partitioning. +This can be `split` (the default) for "os" and "data" images backed by separate +volumes, or `unified` to have "os" and "data" share the same volume. ``` [package.metadata.build-variant.image-layout] os-image-size-gib = 2 data-image-size-gib = 1 +partition-plan = "split" ``` `supported-arches` is the list of architectures the variant is able to run on. @@ -120,9 +125,6 @@ use std::fmt; use std::fs; use std::path::{Path, PathBuf}; -static DEFAULT_OS_IMAGE_SIZE_GIB: u32 = 2; -static DEFAULT_DATA_IMAGE_SIZE_GIB: u32 = 1; - /// The nested structures here are somewhat complex, but they make it trivial /// to deserialize the structure we expect to find in the manifest. #[derive(Deserialize, Debug)] @@ -251,8 +253,16 @@ pub(crate) struct ImageLayout { pub(crate) os_image_size_gib: u32, #[serde(default = "ImageLayout::default_data_image_size_gib")] pub(crate) data_image_size_gib: u32, + #[serde(default = "ImageLayout::default_partition_plan")] + pub(crate) partition_plan: PartitionPlan, } +/// These are the historical defaults for all variants, before we added support +/// for customizing these properties. 
+static DEFAULT_OS_IMAGE_SIZE_GIB: u32 = 2; +static DEFAULT_DATA_IMAGE_SIZE_GIB: u32 = 1; +static DEFAULT_PARTITION_PLAN: PartitionPlan = PartitionPlan::Split; + impl ImageLayout { fn default_os_image_size_gib() -> u32 { DEFAULT_OS_IMAGE_SIZE_GIB @@ -261,6 +271,10 @@ impl ImageLayout { fn default_data_image_size_gib() -> u32 { DEFAULT_DATA_IMAGE_SIZE_GIB } + + fn default_partition_plan() -> PartitionPlan { + DEFAULT_PARTITION_PLAN + } } impl Default for ImageLayout { @@ -268,10 +282,18 @@ impl Default for ImageLayout { Self { os_image_size_gib: Self::default_os_image_size_gib(), data_image_size_gib: Self::default_data_image_size_gib(), + partition_plan: Self::default_partition_plan(), } } } +#[derive(Deserialize, Debug, Copy, Clone)] +#[serde(rename_all = "lowercase")] +pub(crate) enum PartitionPlan { + Split, + Unified, +} + #[derive(Deserialize, Debug, PartialEq, Eq, Hash)] #[serde(rename_all = "lowercase")] pub(crate) enum SupportedArch { diff --git a/tools/partyplanner b/tools/partyplanner index 601e97c63f9..edd852f88c7 100755 --- a/tools/partyplanner +++ b/tools/partyplanner @@ -110,16 +110,21 @@ PRIVATE_SCALE_FACTOR="24" # Populate the caller's tables with sizes and offsets for known partitions. set_partition_sizes() { - local os_image_gib data_image_gib + local os_image_gib data_image_gib partition_plan local -n pp_size pp_offset os_image_gib="${1:?}" data_image_gib="${2:?}" + # Whether we're building a layout for a "split" image, where OS and data + # volumes are on separate disks, or a "unified" image, where they share the + # same disk. + partition_plan="${3:?}" + # Table for partition sizes, in MiB. - pp_size="${3:?}" + pp_size="${4:?}" # Table for partition offsets from start of disk, in MiB. - pp_offset="${4:?}" + pp_offset="${5:?}" # Most of the partitions on the main image scale with the overall size. 
local boot_mib root_mib hash_mib reserved_mib private_mib @@ -168,11 +173,25 @@ set_partition_sizes() { pp_size["PRIVATE"]="${private_mib}" ((offset += private_mib)) - # The data image is relatively easy to plan, at least until we add support - # for unified images. The first and last MiB are reserved for the GPT labels, - # and the remainder is for the lone "data" partition. - pp_size["DATA"]="$((data_image_gib * 1024 - GPT_MIB * 2))" - pp_offset["DATA"]="1" + case "${partition_plan}" in + split) + # For a split data image, the first and last MiB are reserved for the GPT + # labels, and the rest is for the "data" partition. + pp_size["DATA"]="$((data_image_gib * 1024 - GPT_MIB * 2))" + pp_offset["DATA"]="1" + ;; + unified) + # For a unified image, we've already accounted for the GPT label space in + # the earlier calculations, so all the space is for the "data" partition. + pp_size["DATA"]="$((data_image_gib * 1024))" + pp_offset["DATA"]="${offset}" + ((offset += data_image_gib * 1024)) + ;; + *) + echo "unknown partition plan '${partition_plan}'" >&2 + exit 1 + ;; + esac } # Populate the caller's table with labels for known partitions. 
diff --git a/tools/rpm2img b/tools/rpm2img index 0c00b561c9c..956d44a39d9 100755 --- a/tools/rpm2img +++ b/tools/rpm2img @@ -17,6 +17,7 @@ for opt in "$@"; do --output-fmt=*) OUTPUT_FMT="${optarg}" ;; --os-image-size-gib=*) OS_IMAGE_SIZE_GIB="${optarg}" ;; --data-image-size-gib=*) DATA_IMAGE_SIZE_GIB="${optarg}" ;; + --partition-plan=*) PARTITION_PLAN="${optarg}" ;; esac done @@ -28,6 +29,14 @@ case "${OUTPUT_FMT}" in ;; esac +case "${PARTITION_PLAN}" in + split|unified) ;; + *) + echo "unexpected partition plan '${PARTITION_PLAN}'" >&2 + exit 1 + ;; +esac + mkdir -p "${OUTPUT_DIR}" FILENAME_PREFIX="${IMAGE_NAME}-${VARIANT}-${ARCH}-${VERSION_ID}-${BUILD_ID}" @@ -62,11 +71,19 @@ VERITY_HASH_ALGORITHM=sha256 VERITY_DATA_BLOCK_SIZE=4096 VERITY_HASH_BLOCK_SIZE=4096 -truncate -s "${OS_IMAGE_SIZE_GIB}"G "${OS_IMAGE}" +case "${PARTITION_PLAN}" in + split) + truncate -s "${OS_IMAGE_SIZE_GIB}G" "${OS_IMAGE}" + truncate -s "${DATA_IMAGE_SIZE_GIB}G" "${DATA_IMAGE}" + ;; + unified) + truncate -s "$((OS_IMAGE_SIZE_GIB + DATA_IMAGE_SIZE_GIB))G" "${OS_IMAGE}" + ;; +esac declare -A partlabel parttype partsize partoff set_partition_sizes \ - "${OS_IMAGE_SIZE_GIB}" "${DATA_IMAGE_SIZE_GIB}" \ + "${OS_IMAGE_SIZE_GIB}" "${DATA_IMAGE_SIZE_GIB}" "${PARTITION_PLAN}" \ partsize partoff set_partition_labels partlabel set_partition_types parttype @@ -76,8 +93,13 @@ for part in \ BIOS \ EFI-A BOOT-A ROOT-A HASH-A RESERVED-A \ EFI-B BOOT-B ROOT-B HASH-B RESERVED-B \ - PRIVATE ; + PRIVATE DATA ; do + # We only append the data partition if we're using the unified layout. + if [ "${part}" == "DATA" ] && [ "${PARTITION_PLAN}" != "unified" ] ; then + continue + fi + # Each partition is aligned to a 1 MiB boundary, and extends to the sector # before the next partition starts. 
Specify the end point in sectors so we # can subtract a sector to fix the off-by-one error that comes from adding @@ -101,6 +123,18 @@ done sgdisk --clear "${partargs[@]}" --sort --print "${OS_IMAGE}" +# Partition the separate data disk, if we're using the split layout. +if [ "${PARTITION_PLAN}" == "split" ] ; then + data_start="${partoff[DATA]}" + data_end=$((data_start + partsize[DATA])) + data_end=$((data_end * 2048 - 1)) + sgdisk --clear \ + -n "0:${data_start}M:${data_end}" \ + -c "0:${partlabel[DATA]}" \ + -t "0:${parttype[DATA]}" \ + --sort --print "${DATA_IMAGE}" +fi + rpm -iv --root "${ROOT_MOUNT}" "${PACKAGE_DIR}"/*.rpm install -p -m 0644 /host/{COPYRIGHT,LICENSE-APACHE,LICENSE-MIT} "${ROOT_MOUNT}"/usr/share/licenses/ mksquashfs \ @@ -226,15 +260,7 @@ mkfs.ext4 -b 4096 -i 4096 -I 256 "${PRIVATE_IMAGE}" "${partsize[PRIVATE]}M" dd if="${PRIVATE_IMAGE}" of="${OS_IMAGE}" conv=notrunc bs=1M seek="${partoff[PRIVATE]}" # BOTTLEROCKET-DATA -truncate -s "${DATA_IMAGE_SIZE_GIB}"G "${DATA_IMAGE}" -data_start="${partoff[DATA]}" -data_end=$((data_start + partsize[DATA])) -data_end=$((data_end * 2048 - 1)) -sgdisk --clear \ - -n "0:${data_start}M:${data_end}" \ - -c "0:${partlabel[DATA]}" \ - -t "0:${parttype[DATA]}" \ - --sort --print "${DATA_IMAGE}" + # If we build on a host with SELinux enabled, we could end up with labels that # do not match our policy. 
Since we allow replacing the data volume at runtime, # we can't count on these labels being correct in any case, and it's better to @@ -243,21 +269,34 @@ UNLABELED=$(find "${DATA_MOUNT}" \ | awk -v root="${DATA_MOUNT}" '{gsub(root"/","/"); gsub(root,"/"); print "ea_rm", $1, "security.selinux"}') mkfs.ext4 -d "${DATA_MOUNT}" "${BOTTLEROCKET_DATA}" "${partsize[DATA]}M" echo "${UNLABELED}" | debugfs -w -f - "${BOTTLEROCKET_DATA}" -dd if="${BOTTLEROCKET_DATA}" of="${DATA_IMAGE}" conv=notrunc bs=1M seek="${partoff[DATA]}" +case "${PARTITION_PLAN}" in + split) + dd if="${BOTTLEROCKET_DATA}" of="${DATA_IMAGE}" conv=notrunc bs=1M seek="${partoff[DATA]}" + ;; + unified) + dd if="${BOTTLEROCKET_DATA}" of="${OS_IMAGE}" conv=notrunc bs=1M seek="${partoff[DATA]}" + ;; +esac sgdisk -v "${OS_IMAGE}" -sgdisk -v "${DATA_IMAGE}" +[ -s "${DATA_IMAGE}" ] && sgdisk -v "${DATA_IMAGE}" if [[ ${OUTPUT_FMT} == "raw" ]]; then lz4 -vc "${OS_IMAGE}" >"${OUTPUT_DIR}/${OS_IMAGE_BASENAME}.img.lz4" - lz4 -vc "${DATA_IMAGE}" >"${OUTPUT_DIR}/${DATA_IMAGE_BASENAME}.img.lz4" + if [ -s "${DATA_IMAGE}" ] ; then + lz4 -vc "${DATA_IMAGE}" >"${OUTPUT_DIR}/${DATA_IMAGE_BASENAME}.img.lz4" + fi elif [[ ${OUTPUT_FMT} == "qcow2" ]]; then qemu-img convert -f raw -O qcow2 "${OS_IMAGE}" "${OUTPUT_DIR}/${OS_IMAGE_BASENAME}.qcow2" - qemu-img convert -f raw -O qcow2 "${DATA_IMAGE}" "${OUTPUT_DIR}/${DATA_IMAGE_BASENAME}.qcow2" + if [ -s "${DATA_IMAGE}" ] ; then + qemu-img convert -f raw -O qcow2 "${DATA_IMAGE}" "${OUTPUT_DIR}/${DATA_IMAGE_BASENAME}.qcow2" + fi elif [[ ${OUTPUT_FMT} == "vmdk" ]]; then # Stream optimization is required for creating an Open Virtual Appliance (OVA) qemu-img convert -f raw -O vmdk -o subformat=streamOptimized "${OS_IMAGE}" "${OUTPUT_DIR}/${OS_IMAGE_BASENAME}.vmdk" - qemu-img convert -f raw -O vmdk -o subformat=streamOptimized "${DATA_IMAGE}" "${OUTPUT_DIR}/${DATA_IMAGE_BASENAME}.vmdk" + if [ -s "${DATA_IMAGE}" ] ; then + qemu-img convert -f raw -O vmdk -o subformat=streamOptimized 
"${DATA_IMAGE}" "${OUTPUT_DIR}/${DATA_IMAGE_BASENAME}.vmdk" + fi fi lz4 -9vc "${BOOT_IMAGE}" >"${OUTPUT_DIR}/${BOOT_IMAGE_NAME}" From 4c7912f4cb161c17185cc51d77899004b0143b7e Mon Sep 17 00:00:00 2001 From: Ben Cressey Date: Thu, 18 Nov 2021 19:06:40 +0000 Subject: [PATCH 4/5] pubsys: add support for publishing unified images Although the existing AWS and VMware variants use the "split" image layout, custom variants for these platforms might use the "unified" layout instead. Adapt the AMI registration and OVA creation logic to account for the possibility that we only build a single disk image. Signed-off-by: Ben Cressey --- Makefile.toml | 197 +++++++++++++++++++++----- tools/pubsys/src/aws/ami/mod.rs | 4 +- tools/pubsys/src/aws/ami/register.rs | 62 ++++---- variants/shared/template-split.ovf | 89 ++++++++++++ variants/shared/template-unified.ovf | 79 +++++++++++ variants/vmware-dev/template.ovf | 90 +----------- variants/vmware-k8s-1.20/template.ovf | 2 +- variants/vmware-k8s-1.21/template.ovf | 2 +- 8 files changed, 374 insertions(+), 151 deletions(-) create mode 100644 variants/shared/template-split.ovf create mode 100644 variants/shared/template-unified.ovf mode change 100644 => 120000 variants/vmware-dev/template.ovf diff --git a/Makefile.toml b/Makefile.toml index 7d26687ffe3..86001c3a468 100644 --- a/Makefile.toml +++ b/Makefile.toml @@ -45,6 +45,12 @@ PUBLISH_REPO = "default" # AMIs. (You can also specify PUBLISH_ROOT_VOLUME_SIZE to override the root # volume size; by default it's the image size, rounded up.) PUBLISH_DATA_VOLUME_SIZE = "20" + +# For images using a "unified" layout where both OS and data partitions are in +# the same image, the root / data distinction doesn't make sense, but we still +# have an idea of how big the overall volume should be to have "enough" space. +PUBLISH_UNIFIED_VOLUME_SIZE = "22" + # This can be overridden with -e to change the path to the file containing SSM # parameter templates. 
This file determines the parameter names and values # that will be published to SSM when you run `cargo make ssm`. See @@ -438,57 +444,140 @@ cleanup() { } trap 'cleanup' EXIT +measure_image() { + local image + image="${1:?}" + # Can't count on "realpath" availability, so assume an absolute image path underneath + # our build root directory. + image=".${image#${BUILDSYS_ROOT_DIR}}" + docker run --rm \ + --network=none \ + --user "$(id -u):$(id -g)" \ + --security-opt label:disable \ + -v "${BUILDSYS_ROOT_DIR}/build":/tmp/build \ + "${BUILDSYS_SDK_IMAGE}" \ + bash -c "set -o pipefail ; cd /tmp; qemu-img measure ${image} | awk '/required size/{print \$NF}'" +} + root_vmdk_path="${BUILDSYS_OUTPUT_DIR}/${BUILDSYS_NAME_FULL}.vmdk" data_vmdk_path="${BUILDSYS_OUTPUT_DIR}/${BUILDSYS_NAME_FULL}-data.vmdk" ova_tmp_dir="$(mktemp -d)" ovf="${BUILDSYS_NAME_FULL}.ovf" manifest="${BUILDSYS_NAME_FULL}.mf" -# Short circuit if neither VMDK images nor an OVF template exist -if [ ! -s "${BUILDSYS_OVF_TEMPLATE}" ] && \ - [[ ! -s "${root_vmdk_path}" || ! -s "${data_vmdk_path}" ]]; then - echo "No OVF template or VMDK images, skipping OVA build" - exit 0 +# Short circuit if no OVF template exists. +if [ ! -s "${BUILDSYS_OVF_TEMPLATE}" ] ; then + if [ ! -s "${root_vmdk_path}" ]; then + # If no VMDK exists either, there's nothing to do. + echo "No OVF template or VMDK images, skipping OVA build" + exit 0 + else + # Warn the user if a VMDK exists but an OVF template does not. Assume we do not + # need to build an OVA in this case + echo "VMDK image found, but OVF template '${BUILDSYS_OVF_TEMPLATE}' doesn't exist, skipping OVA build" + exit 0 + fi +fi + +# OVF templates all expect at least one disk. +if [ ! -s "${root_vmdk_path}" ] ; then + echo "OVF template exists but VMDK root image doesn't exist for the current version/commit - ${BUILDSYS_VERSION_FULL}." >&2 + echo "Unable to build an OVA" >&2 + exit 1 fi -# Warn the user if VMDK's exist but an OVF template does not. 
Assume we do not -# need to build an OVA in this case -if [ ! -s "${BUILDSYS_OVF_TEMPLATE}" ] && \ - [[ -s "${root_vmdk_path}" || -s "${data_vmdk_path}" ]]; then - echo "VMDK images exist, but OVF template '${BUILDSYS_OVF_TEMPLATE}' doesn't exist, skipping OVA build" - exit 0 +# If the template expects a data disk, make sure the image exists - it might not if we +# built the variant with the "unified" layout. +if grep -Fq '{{DATA_DISK}}' ${BUILDSYS_OVF_TEMPLATE} && [ ! -s "${data_vmdk_path}" ] ; then + echo "OVF template has data disk but VMDK data image doesn't exist for the current version/commit - ${BUILDSYS_VERSION_FULL}." >&2 + echo "Unable to build an OVA" >&2 + exit 1 fi -# If an OVF template exists but either of the images do not exist, fail -if [ -s "${BUILDSYS_OVF_TEMPLATE}" ] && \ - [[ ! -s "${root_vmdk_path}" || ! -s "${data_vmdk_path}" ]]; then - echo "OVF template exists but VMDK images don't exist for the current version/commit - ${BUILDSYS_VERSION_FULL}. Unable to build an OVA" >&2 +# If the template doesn't expect a data disk, make sure the image doesn't exist - it +# might if we built the variant with the "split" layout. +if ! grep -Fq '{{DATA_DISK}}' ${BUILDSYS_OVF_TEMPLATE} && [ -s "${data_vmdk_path}" ] ; then + echo "OVF template does not have data disk but VMDK data image exists for the current version/commit - ${BUILDSYS_VERSION_FULL}." >&2 + echo "Unable to build an OVA" >&2 exit 1 fi +is_split="no" +if [ -s "${data_vmdk_path}" ] ; then + is_split="yes" +fi + +bytes_in_gib="$((1024 * 1024 * 1024))" +root_image_size_bytes="$(measure_image "${root_vmdk_path}")" +root_image_size_gib="$((root_image_size_bytes / bytes_in_gib))" +if [ "${is_split}" == "yes" ] ; then + # If an optional root volume size is given, it must be at least as large as the root image. 
+ if [ -n "${PUBLISH_ROOT_VOLUME_SIZE}" ] ; then + if [ "${PUBLISH_ROOT_VOLUME_SIZE}" -lt "${root_image_size_gib}" ] ; then + echo "Root image is larger than the given volume size - pass '-e PUBLISH_ROOT_VOLUME_SIZE=${root_image_size_gib}' to fix" >&2 + exit 1 + fi + root_image_size_bytes="$((PUBLISH_ROOT_VOLUME_SIZE * bytes_in_gib))" + fi + data_image_size_bytes="$(measure_image "${data_vmdk_path}")" + data_image_size_gib="$((data_image_size_bytes / bytes_in_gib))" + if [ "${PUBLISH_DATA_VOLUME_SIZE}" -lt "${data_image_size_gib}" ] ; then + echo "Data image is larger than the given volume size - pass '-e PUBLISH_DATA_VOLUME_SIZE=${data_image_size_gib}' to fix" >&2 + exit 1 + fi + data_image_size_bytes="$((PUBLISH_DATA_VOLUME_SIZE * bytes_in_gib))" +else # unified + if [ "${PUBLISH_UNIFIED_VOLUME_SIZE}" -lt "${root_image_size_gib}" ] ; then + echo "Unified image is larger than the given volume size - pass '-e PUBLISH_UNIFIED_VOLUME_SIZE=${root_image_size_gib}' to fix" >&2 + exit 1 + fi + root_image_size_bytes="$((PUBLISH_UNIFIED_VOLUME_SIZE * bytes_in_gib))" + data_image_size_bytes="0" +fi + # Create the OVF with the correct values sed "${BUILDSYS_OVF_TEMPLATE}" \ -e "s/{{ROOT_DISK}}/${root_vmdk_path##*/}/g" \ -e "s/{{DATA_DISK}}/${data_vmdk_path##*/}/g" \ + -e "s/{{ROOT_DISK_BYTES}}/${root_image_size_bytes}/g" \ + -e "s/{{DATA_DISK_BYTES}}/${data_image_size_bytes}/g" \ > "${ova_tmp_dir}/${ovf}" +# Make sure we replaced all the '{{...}}' fields with real values. 
+if grep -F -e '{{' -e '}}' "${ova_tmp_dir}/${ovf}" ; then + echo "Failed to fully render the OVF template" >&2 + exit 1 +fi + # Create the manifest file with the SHA's of the VMDK's and the OVF root_sha256="$(sha256sum ${root_vmdk_path} | awk '{print $1}')" -data_sha256="$(sha256sum ${data_vmdk_path} | awk '{print $1}')" -ovf_sha256="$(sha256sum ${ova_tmp_dir}/${ovf} | awk '{print $1}')" +if [ "${is_split}" == "yes" ] ; then + data_sha256="$(sha256sum ${data_vmdk_path} | awk '{print $1}')" +fi + echo "SHA256(${root_vmdk_path##*/})= ${root_sha256}" > "${ova_tmp_dir}/${manifest}" -echo "SHA256(${data_vmdk_path##*/})= ${data_sha256}" >> "${ova_tmp_dir}/${manifest}" +if [ "${is_split}" == "yes" ] ; then + echo "SHA256(${data_vmdk_path##*/})= ${data_sha256}" >> "${ova_tmp_dir}/${manifest}" +fi + +ovf_sha256="$(sha256sum ${ova_tmp_dir}/${ovf} | awk '{print $1}')" echo "SHA256(${ovf})= ${ovf_sha256}" >> "${ova_tmp_dir}/${manifest}" cp "${root_vmdk_path}" "${ova_tmp_dir}" -cp "${data_vmdk_path}" "${ova_tmp_dir}" +if [ "${is_split}" == "yes" ] ; then + cp "${data_vmdk_path}" "${ova_tmp_dir}" +fi # According to the OVF spec: # https://www.dmtf.org/sites/default/files/standards/documents/DSP0243_2.1.1.pdf, # the OVF must be first in the tar bundle. 
Manifest is next, and then the # files must fall in the same order as listed in the References section of the # OVF file -tar -cf "${ova_tmp_dir}/${BUILDSYS_OVA}" -C "${ova_tmp_dir}" "${ovf}" "${manifest}" "${root_vmdk_path##*/}" "${data_vmdk_path##*/}" +tar -cf "${ova_tmp_dir}/${BUILDSYS_OVA}" -C "${ova_tmp_dir}" "${ovf}" "${manifest}" "${root_vmdk_path##*/}" +if [ "${is_split}" == "yes" ] ; then + tar -rf "${ova_tmp_dir}/${BUILDSYS_OVA}" -C "${ova_tmp_dir}" "${data_vmdk_path##*/}" +fi + mv "${ova_tmp_dir}/${BUILDSYS_OVA}" "${BUILDSYS_OUTPUT_DIR}/${BUILDSYS_NAME_FULL}.ova" ''' ] @@ -653,9 +742,12 @@ LINK_REPO_TARGETS=("--link-target ${BUILDSYS_KMOD_KIT_PATH}") # Include the root and data disk images in the repo if they exist os_disk_img="${BUILDSYS_OUTPUT_DIR}/${BUILDSYS_NAME_FULL}.img.lz4" -data_disk_img="${BUILDSYS_OUTPUT_DIR}/${BUILDSYS_NAME_FULL}-data.img.lz4" -if [ -s "${os_disk_img}" ] && [ -s "${data_disk_img}" ]; then +if [ -s "${os_disk_img}" ] ; then LINK_REPO_TARGETS+=("--link-target ${os_disk_img}") +fi + +data_disk_img="${BUILDSYS_OUTPUT_DIR}/${BUILDSYS_NAME_FULL}-data.img.lz4" +if [ -s "${data_disk_img}" ]; then LINK_REPO_TARGETS+=("--link-target ${data_disk_img}") fi @@ -794,22 +886,60 @@ set -e export PATH="${BUILDSYS_TOOLS_DIR}/bin:${PATH}" cleanup() { - [ -f "${root_image}" ] && rm -f "${root_image}" - [ -f "${data_image}" ] && rm -f "${data_image}" + ([ -f "${root_image}" ] && rm -f "${root_image}") ||: + ([ -f "${data_image}" ] && rm -f "${data_image}") ||: } trap 'cleanup' EXIT -# Unlz4 the root / data images +# Unlz4 the root image, and the data image if present rootlz4="${BUILDSYS_OUTPUT_DIR}/${BUILDSYS_NAME_FULL}.img.lz4" root_image="${rootlz4%.lz4}" -datalz4="${BUILDSYS_OUTPUT_DIR}/${BUILDSYS_NAME_FULL}-data.img.lz4" -data_image="${datalz4%.lz4}" -if [ ! -s "${rootlz4}" ] || [ ! -s "${datalz4}" ]; then - echo "Image files don't exist for the current version/commit - ${BUILDSYS_VERSION_FULL} - please run 'cargo make'" >&2 +if [ ! 
-s "${rootlz4}" ]; then + echo "Image file doesn't exist for the current version/commit - ${BUILDSYS_VERSION_FULL} - please run 'cargo make'" >&2 exit 1 fi lz4 -df "${rootlz4}" "${root_image}" -lz4 -df "${datalz4}" "${data_image}" + +datalz4="${BUILDSYS_OUTPUT_DIR}/${BUILDSYS_NAME_FULL}-data.img.lz4" +data_image="${datalz4%.lz4}" + +# We will only have a data image if the variant uses the "split" format. +is_split="no" +if [ -s "${datalz4}" ] ; then + lz4 -df "${datalz4}" "${data_image}" + is_split="yes" +fi + +bytes_in_gib="$((1024 * 1024 * 1024))" +root_image_size_gib="$(($(stat -c %s "${root_image}") / bytes_in_gib))" +if [ "${is_split}" == "yes" ] ; then + # If an optional root volume size is given, it must be larger than the root image. + if [ -n "${PUBLISH_ROOT_VOLUME_SIZE}" ] && [ "${PUBLISH_ROOT_VOLUME_SIZE}" -lt "${root_image_size_gib}" ] ; then + echo "Root image is larger than the given volume size - pass '-e PUBLISH_ROOT_VOLUME_SIZE=${root_image_size_gib}' to fix" >&2 + exit 1 + fi + data_image_size_gib="$(($(stat -c %s "${data_image}") / bytes_in_gib))" + if [ "${PUBLISH_DATA_VOLUME_SIZE}" -lt "${data_image_size_gib}" ] ; then + echo "Data image is larger than the given volume size - pass '-e PUBLISH_DATA_VOLUME_SIZE=${data_image_size_gib}' to fix" >&2 + exit 1 + fi +else # unified + if [ "${PUBLISH_UNIFIED_VOLUME_SIZE}" -lt "${root_image_size_gib}" ] ; then + echo "Unified image is larger than the given volume size - pass '-e PUBLISH_UNIFIED_VOLUME_SIZE=${root_image_size_gib}' to fix" >&2 + exit 1 + fi +fi + +root_volume_args=(--root-image "${root_image}") +data_volume_args=() +if [ "${is_split}" == "yes" ] ; then + # Pass the root volume size if specified, otherwise it defaults to the size of the image. + root_volume_args+=(${PUBLISH_ROOT_VOLUME_SIZE:+--root-volume-size "${PUBLISH_ROOT_VOLUME_SIZE}"}) + # Pass the data image to register as a snapshot, and its desired size. 
+ data_volume_args+=(--data-image "${data_image}" --data-volume-size "${PUBLISH_DATA_VOLUME_SIZE}") +else # unified + root_volume_args+=(--root-volume-size "${PUBLISH_UNIFIED_VOLUME_SIZE}") +fi ami_output="${BUILDSYS_OUTPUT_DIR}/${BUILDSYS_NAME_FULL}-${AMI_DATA_FILE_SUFFIX}" ami_output_latest="${BUILDSYS_OUTPUT_DIR}/latest/${BUILDSYS_NAME_VARIANT}-${AMI_DATA_FILE_SUFFIX}" @@ -821,10 +951,8 @@ pubsys \ \ ami \ \ - --root-image "${root_image}" \ - --data-image "${data_image}" \ - ${PUBLISH_ROOT_VOLUME_SIZE:+--root-volume-size "${PUBLISH_ROOT_VOLUME_SIZE}"} \ - --data-volume-size "${PUBLISH_DATA_VOLUME_SIZE}" \ + "${root_volume_args[@]}" \ + "${data_volume_args[@]}" \ \ --arch "${BUILDSYS_ARCH}" \ --name "${ami_name}" \ @@ -1101,6 +1229,7 @@ script = [ ''' for ws in sources variants/* tools/{buildsys,pubsys}; do [ -d "${ws}" ] || continue + [ "${ws}" == "variants/shared" ] && continue cargo clean --manifest-path ${ws}/Cargo.toml done rm -f ${BUILDSYS_TOOLS_DIR}/bin/{buildsys,pubsys} diff --git a/tools/pubsys/src/aws/ami/mod.rs b/tools/pubsys/src/aws/ami/mod.rs index 9f3d9acdaac..fc23716dc2e 100644 --- a/tools/pubsys/src/aws/ami/mod.rs +++ b/tools/pubsys/src/aws/ami/mod.rs @@ -37,7 +37,7 @@ pub(crate) struct AmiArgs { /// Path to the image containing the data volume #[structopt(short = "d", long, parse(from_os_str))] - data_image: PathBuf, + data_image: Option, /// Desired root volume size in gibibytes #[structopt(long)] @@ -45,7 +45,7 @@ pub(crate) struct AmiArgs { /// Desired data volume size in gibibytes #[structopt(long)] - data_volume_size: i64, + data_volume_size: Option, /// The architecture of the machine image #[structopt(short = "a", long, parse(try_from_str = parse_arch))] diff --git a/tools/pubsys/src/aws/ami/register.rs b/tools/pubsys/src/aws/ami/register.rs index 9cd02c02810..45dc4b41c4c 100644 --- a/tools/pubsys/src/aws/ami/register.rs +++ b/tools/pubsys/src/aws/ami/register.rs @@ -32,10 +32,7 @@ async fn _register_image( ec2_client: &Ec2Client, 
cleanup_snapshot_ids: &mut Vec, ) -> Result { - debug!( - "Uploading root and data images into EBS snapshots in {}", - region - ); + debug!("Uploading images into EBS snapshots in {}", region); let uploader = SnapshotUploader::new(ebs_client); let root_snapshot = snapshot_from_image(&ami_args.root_image, &uploader, None, ami_args.no_progress) @@ -46,19 +43,19 @@ async fn _register_image( })?; cleanup_snapshot_ids.push(root_snapshot.clone()); - let data_snapshot = - snapshot_from_image(&ami_args.data_image, &uploader, None, ami_args.no_progress) + let mut data_snapshot = None; + if let Some(data_image) = &ami_args.data_image { + let snapshot = snapshot_from_image(data_image, &uploader, None, ami_args.no_progress) .await .context(error::Snapshot { path: &ami_args.root_image, region, })?; - cleanup_snapshot_ids.push(data_snapshot.clone()); + cleanup_snapshot_ids.push(snapshot.clone()); + data_snapshot = Some(snapshot); + } - info!( - "Waiting for root and data snapshots to become available in {}", - region - ); + info!("Waiting for snapshots to become available in {}", region); let waiter = SnapshotWaiter::new(ec2_client.clone()); waiter .wait(&root_snapshot, Default::default()) @@ -66,12 +63,15 @@ async fn _register_image( .context(error::WaitSnapshot { snapshot_type: "root", })?; - waiter - .wait(&data_snapshot, Default::default()) - .await - .context(error::WaitSnapshot { - snapshot_type: "data", - })?; + + if let Some(ref data_snapshot) = data_snapshot { + waiter + .wait(&data_snapshot, Default::default()) + .await + .context(error::WaitSnapshot { + snapshot_type: "data", + })?; + } // Prepare parameters for AMI registration request let root_bdm = BlockDeviceMapping { @@ -86,16 +86,25 @@ async fn _register_image( ..Default::default() }; - let mut data_bdm = root_bdm.clone(); - data_bdm.device_name = Some(DATA_DEVICE_NAME.to_string()); - if let Some(ebs) = data_bdm.ebs.as_mut() { - ebs.snapshot_id = Some(data_snapshot.clone()); - ebs.volume_size = 
Some(ami_args.data_volume_size); + let mut data_bdm = None; + if let Some(ref data_snapshot) = data_snapshot { + let mut bdm = root_bdm.clone(); + bdm.device_name = Some(DATA_DEVICE_NAME.to_string()); + if let Some(ebs) = bdm.ebs.as_mut() { + ebs.snapshot_id = Some(data_snapshot.clone()); + ebs.volume_size = ami_args.data_volume_size; + } + data_bdm = Some(bdm); + } + + let mut block_device_mappings = vec![root_bdm]; + if let Some(data_bdm) = data_bdm { + block_device_mappings.push(data_bdm); } let register_request = RegisterImageRequest { architecture: Some(ami_args.arch.clone()), - block_device_mappings: Some(vec![root_bdm, data_bdm]), + block_device_mappings: Some(block_device_mappings), description: ami_args.description.clone(), ena_support: Some(ENA), name: ami_args.name.clone(), @@ -115,9 +124,14 @@ async fn _register_image( .image_id .context(error::MissingImageId { region })?; + let mut snapshot_ids = vec![root_snapshot]; + if let Some(data_snapshot) = data_snapshot { + snapshot_ids.push(data_snapshot); + } + Ok(RegisteredIds { image_id, - snapshot_ids: vec![root_snapshot, data_snapshot], + snapshot_ids, }) } diff --git a/variants/shared/template-split.ovf b/variants/shared/template-split.ovf new file mode 100644 index 00000000000..dcc87969faf --- /dev/null +++ b/variants/shared/template-split.ovf @@ -0,0 +1,89 @@ + + + + + + + + List of the virtual disks + + + + + The list of logical networks + + The network + + + + A Virtual machine + + The operating system installed + Other 4.x or later Linux (64-bit) + + + Virtual hardware requirements + + Virtual Hardware Family + 0 + vmx-15 + + + hertz * 10^6 + Number of Virtual CPUs + 2 virtual CPU(s) + 1 + 3 + 2 + + + byte * 2^20 + Memory Size + 8192MB of memory + 2 + 4 + 8192 + + + 0 + NVMe Controller + NVMe Controller 1 + 4 + vmware.nvme.controller + 20 + + + 0 + Hard Disk 1 + ovf:/disk/vmdisk1 + 6 + 4 + 17 + + + 1 + Hard Disk 2 + ovf:/disk/vmdisk2 + 7 + 4 + 17 + + + 0 + true + VM Network + Network adapter 1 + 9 + 
VmxNet3 + 10 + + + + + + + + + + + diff --git a/variants/shared/template-unified.ovf b/variants/shared/template-unified.ovf new file mode 100644 index 00000000000..1def5deb5fb --- /dev/null +++ b/variants/shared/template-unified.ovf @@ -0,0 +1,79 @@ + + + + + + + List of the virtual disks + + + + The list of logical networks + + The network + + + + A Virtual machine + + The operating system installed + Other 4.x or later Linux (64-bit) + + + Virtual hardware requirements + + Virtual Hardware Family + 0 + vmx-15 + + + hertz * 10^6 + Number of Virtual CPUs + 2 virtual CPU(s) + 1 + 3 + 2 + + + byte * 2^20 + Memory Size + 8192MB of memory + 2 + 4 + 8192 + + + 0 + NVMe Controller + NVMe Controller 1 + 4 + vmware.nvme.controller + 20 + + + 0 + Hard Disk 1 + ovf:/disk/vmdisk1 + 6 + 4 + 17 + + + 0 + true + VM Network + Network adapter 1 + 9 + VmxNet3 + 10 + + + + + + + + + + + diff --git a/variants/vmware-dev/template.ovf b/variants/vmware-dev/template.ovf deleted file mode 100644 index 451e8c99c4c..00000000000 --- a/variants/vmware-dev/template.ovf +++ /dev/null @@ -1,89 +0,0 @@ - - - - - - - - List of the virtual disks - - - - - The list of logical networks - - The network - - - - A Virtual machine - - The operating system installed - Other 4.x or later Linux (64-bit) - - - Virtual hardware requirements - - Virtual Hardware Family - 0 - vmx-15 - - - hertz * 10^6 - Number of Virtual CPUs - 2 virtual CPU(s) - 1 - 3 - 2 - - - byte * 2^20 - Memory Size - 8192MB of memory - 2 - 4 - 8192 - - - 0 - NVMe Controller - NVMe Controller 1 - 4 - vmware.nvme.controller - 20 - - - 0 - Hard Disk 1 - ovf:/disk/vmdisk1 - 6 - 4 - 17 - - - 1 - Hard Disk 2 - ovf:/disk/vmdisk2 - 7 - 4 - 17 - - - 0 - true - VM Network - Network adapter 1 - 9 - VmxNet3 - 10 - - - - - - - - - - - \ No newline at end of file diff --git a/variants/vmware-dev/template.ovf b/variants/vmware-dev/template.ovf new file mode 120000 index 00000000000..e1c6ad4baa0 --- /dev/null +++ b/variants/vmware-dev/template.ovf @@ 
-0,0 +1 @@ +../shared/template-split.ovf \ No newline at end of file diff --git a/variants/vmware-k8s-1.20/template.ovf b/variants/vmware-k8s-1.20/template.ovf index 00688303038..e1c6ad4baa0 120000 --- a/variants/vmware-k8s-1.20/template.ovf +++ b/variants/vmware-k8s-1.20/template.ovf @@ -1 +1 @@ -../vmware-dev/template.ovf \ No newline at end of file +../shared/template-split.ovf \ No newline at end of file diff --git a/variants/vmware-k8s-1.21/template.ovf b/variants/vmware-k8s-1.21/template.ovf index 00688303038..e1c6ad4baa0 120000 --- a/variants/vmware-k8s-1.21/template.ovf +++ b/variants/vmware-k8s-1.21/template.ovf @@ -1 +1 @@ -../vmware-dev/template.ovf \ No newline at end of file +../shared/template-split.ovf \ No newline at end of file From 29de89fc1bc81449acb09099202339eb8bf75247 Mon Sep 17 00:00:00 2001 From: Ben Cressey Date: Mon, 13 Dec 2021 23:01:12 +0000 Subject: [PATCH 5/5] release: label overlayfs state directories Using `state_t` as the label makes the directories read-only for all unprivileged containers, even if they have access via a host mount. Signed-off-by: Ben Cressey --- packages/release/prepare-local.service | 7 +++++++ packages/selinux-policy/fs.cil | 8 ++++++++ 2 files changed, 15 insertions(+) diff --git a/packages/release/prepare-local.service b/packages/release/prepare-local.service index 167873d9c50..bec31ad6bbe 100644 --- a/packages/release/prepare-local.service +++ b/packages/release/prepare-local.service @@ -21,6 +21,10 @@ ExecStart=/usr/bin/mkdir -p \ ${LOCAL_DIR}/var/lib/kernel-devel/.overlay/work \ ${LOCAL_DIR}/var/lib/kernel-modules/.overlay/upper \ ${LOCAL_DIR}/var/lib/kernel-modules/.overlay/work +ExecStart=/usr/sbin/setfiles -r ${LOCAL_DIR} \ + -F /etc/selinux/fortified/contexts/files/file_contexts \ + ${LOCAL_DIR}/var/lib/kernel-devel \ + ${LOCAL_DIR}/var/lib/kernel-modules # Create the directories we need to set up a read-write overlayfs for any CNI # plugin binaries. 
@@ -31,6 +35,9 @@ ExecStart=/usr/bin/mkdir -p \ ${LOCAL_DIR}/opt/cni/bin \ ${LOCAL_DIR}/var/lib/cni-plugins/.overlay/upper \ ${LOCAL_DIR}/var/lib/cni-plugins/.overlay/work +ExecStart=/usr/sbin/setfiles -r ${LOCAL_DIR} \ + -F /etc/selinux/fortified/contexts/files/file_contexts \ + ${LOCAL_DIR}/var/lib/cni-plugins RemainAfterExit=true StandardError=journal+console diff --git a/packages/selinux-policy/fs.cil b/packages/selinux-policy/fs.cil index 457aace294a..844ef56094e 100644 --- a/packages/selinux-policy/fs.cil +++ b/packages/selinux-policy/fs.cil @@ -80,6 +80,14 @@ (filecon "/var/lib/netdog" any lease) (filecon "/var/lib/netdog/.*" any lease) +; Label local directories for overlayfs mounts. +(filecon "/var/lib/cni-plugins" any state) +(filecon "/var/lib/cni-plugins/.*" any state) +(filecon "/var/lib/kernel-devel" any state) +(filecon "/var/lib/kernel-devel/.*" any state) +(filecon "/var/lib/kernel-modules" any state) +(filecon "/var/lib/kernel-modules/.*" any state) + ; Label kernel filesystem mounts. (filecon "/proc" any proc) (filecon "/proc/.*" any ())