diff --git a/images/base/files/etc/systemd/system/kubelet.service.d/10-kubeadm.conf b/images/base/files/etc/systemd/system/kubelet.service.d/10-kubeadm.conf
index 17d6239f34..04fd985301 100644
--- a/images/base/files/etc/systemd/system/kubelet.service.d/10-kubeadm.conf
+++ b/images/base/files/etc/systemd/system/kubelet.service.d/10-kubeadm.conf
@@ -8,5 +8,8 @@ EnvironmentFile=-/var/lib/kubelet/kubeadm-flags.env
 # This is a file that the user can use for overrides of the kubelet args as a last resort. Preferably, the user should use
 # the .NodeRegistration.KubeletExtraArgs object in the configuration files instead. KUBELET_EXTRA_ARGS should be sourced from this file.
 EnvironmentFile=-/etc/default/kubelet
+# On cgroup v1, the /kubelet cgroup is created by the entrypoint script before systemd is started.
+# On cgroup v2, the /kubelet cgroup is created here. (See the comments in the entrypoint script for the reason.)
+ExecStartPre=/bin/sh -euc "if [ -f /sys/fs/cgroup/cgroup.controllers ]; then create-kubelet-cgroup-v2; fi"
 ExecStart=
 ExecStart=/usr/bin/kubelet $KUBELET_KUBECONFIG_ARGS $KUBELET_CONFIG_ARGS $KUBELET_KUBEADM_ARGS $KUBELET_EXTRA_ARGS --cgroup-root=/kubelet
diff --git a/images/base/files/usr/local/bin/create-kubelet-cgroup-v2 b/images/base/files/usr/local/bin/create-kubelet-cgroup-v2
new file mode 100755
index 0000000000..3c4ddfe44d
--- /dev/null
+++ b/images/base/files/usr/local/bin/create-kubelet-cgroup-v2
@@ -0,0 +1,46 @@
+#!/bin/bash
+
+# Copyright 2021 The Kubernetes Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -o errexit
+set -o nounset
+set -o pipefail
+if [[ ! -f "/sys/fs/cgroup/cgroup.controllers" ]]; then
+  echo 'ERROR: this script should not be called on cgroup v1 hosts' >&2
+  exit 1
+fi
+
+# NOTE: we can't use `test -s` because cgroup.procs is not a regular file.
+if [[ -n "$(cat /sys/fs/cgroup/cgroup.procs)" ]]; then
+  echo 'ERROR: this script needs /sys/fs/cgroup/cgroup.procs to be empty (for writing the top-level cgroup.subtree_control)' >&2
+  # So, this script needs to be called after systemd has been launched;
+  # it cannot be called from /usr/local/bin/entrypoint.
+  exit 1
+fi
+
+ensure_subtree_control() {
+  local group=$1
+  # When cgroup.controllers is like "cpu cpuset memory io pids",
+  # cgroup.subtree_control is written as "+cpu +cpuset +memory +io +pids".
+  sed -e 's/ / +/g' -e 's/^/+/' <"/sys/fs/cgroup/$group/cgroup.controllers" >"/sys/fs/cgroup/$group/cgroup.subtree_control"
+}
+
+# kubelet requires all the controllers (including hugetlb) in /sys/fs/cgroup/cgroup.controllers to be available in
+# /sys/fs/cgroup/kubelet/cgroup.subtree_control.
+#
+# We need to update the top-level cgroup.subtree_control as well, because hugetlb is not enabled there by default.
+ensure_subtree_control /
+mkdir -p /sys/fs/cgroup/kubelet
+ensure_subtree_control /kubelet
diff --git a/images/base/files/usr/local/bin/entrypoint b/images/base/files/usr/local/bin/entrypoint
index 71bac034ee..4f1ac63cc2 100755
--- a/images/base/files/usr/local/bin/entrypoint
+++ b/images/base/files/usr/local/bin/entrypoint
@@ -83,6 +83,26 @@ mount_kubelet_cgroup_root() {
 }
 
 fix_cgroup() {
+  if [[ -f "/sys/fs/cgroup/cgroup.controllers" ]]; then
+    echo 'INFO: detected cgroup v2'
+    # Both Docker and Podman enable CgroupNS on cgroup v2 hosts by default.
+    #
+    # So we mostly do not need to mess around with the cgroup paths,
+    # but we still need to create the "/kubelet" cgroup at least.
+    # (Otherwise kubelet fails with a `cgroup-root ["kubelet"] doesn't exist` error, see #1969.)
+    #
+    # The "/kubelet" cgroup is created in ExecStartPre of the kubelet service (see the kubeadm drop-in, 10-kubeadm.conf).
+    #
+    # [FAQ: Why not create the "/kubelet" cgroup here?]
+    # We can't create the cgroup with controllers here, because /sys/fs/cgroup/cgroup.subtree_control is empty.
+    # Nor can we write controllers to /sys/fs/cgroup/cgroup.subtree_control ourselves, because
+    # /sys/fs/cgroup/cgroup.procs is not empty at this point.
+    #
+    # After this entrypoint script hands over to systemd, systemd moves the processes in the root
+    # group to "/init.scope", so we can write the root subtree_control and create the "/kubelet" cgroup.
+    return
+  fi
+  echo 'INFO: detected cgroup v1'
   echo 'INFO: fix cgroup mounts for all subsystems'
   # see: https://d2iq.com/blog/running-kind-inside-a-kubernetes-cluster-for-continuous-integration
   # capture initial state before modifying
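
Verification note (not part of the patch): a minimal sketch of how the resulting cgroup layout can be checked on a running node. It assumes a cgroup v2 host and a node container named "kind-control-plane" (the default control-plane container for `kind create cluster`); the container name is illustrative only. Once systemd is up and the kubelet service's ExecStartPre has run, the root cgroup.subtree_control should list all controllers (including hugetlb) and the delegated /kubelet cgroup should exist:

  docker exec kind-control-plane sh -euc '
    # cgroup.controllers only exists at the root of the cgroup v2 unified hierarchy.
    [ -f /sys/fs/cgroup/cgroup.controllers ] || { echo "cgroup v1 node, nothing to check"; exit 0; }
    echo "root controllers:        $(cat /sys/fs/cgroup/cgroup.controllers)"
    echo "root subtree_control:    $(cat /sys/fs/cgroup/cgroup.subtree_control)"
    echo "kubelet subtree_control: $(cat /sys/fs/cgroup/kubelet/cgroup.subtree_control)"
  '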