From b810da149008f1d7d07f481970d40c0d035958af Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Mon, 11 May 2020 01:20:26 +1000 Subject: [PATCH] cgroups: systemd: make use of Device*= properties It seems we missed that systemd added support for the devices cgroup, as a result systemd would actually *write an allow-all rule each time you did 'runc update'* if you used the systemd cgroup driver. This is obviously ... bad and was a clear security bug. Luckily the commits which introduced this were never in an actual runc release. So we simply generate the cgroupv1-style rules (which is what systemd's DeviceAllow wants) and default to a deny-all ruleset. Unfortunately it turns out that systemd is susceptible to the same spurrious error failure that we were, so that problem is out of our hands for systemd cgroup users. However, systemd has a similar bug to the one fixed in [1]. It will happily write a disruptive deny-all rule when it is not necessary. Unfortunately, we cannot even use devices.Emulator to generate a minimal set of transition rules because the DBus API is limited (you can only clear or append to the DeviceAllow= list -- so we are forced to always clear it). To work around this, we simply freeze the container during SetUnitProperties. [1]: afe83489d424 ("cgroupv1: devices: use minimal transition rules with devices.Emulator") Fixes: 1d4ccc8e0caa ("fix data inconsistent when runc update in systemd driven cgroup v1") Fixes: 7682a2b2a575 ("fix data inconsistent when runc update in systemd driven cgroup v2") Signed-off-by: Aleksa Sarai --- .../cgroups/devices/devices_emulator.go | 4 + libcontainer/cgroups/systemd/common.go | 205 ++++++++++++++++++ libcontainer/cgroups/systemd/v1.go | 36 ++- libcontainer/cgroups/systemd/v2.go | 39 ++++ 4 files changed, 283 insertions(+), 1 deletion(-) diff --git a/libcontainer/cgroups/devices/devices_emulator.go b/libcontainer/cgroups/devices/devices_emulator.go index 9d9a60e39dc..cfb535bd4d8 100644 --- a/libcontainer/cgroups/devices/devices_emulator.go +++ b/libcontainer/cgroups/devices/devices_emulator.go @@ -57,6 +57,10 @@ func (e *Emulator) IsBlacklist() bool { return e.defaultAllow } +func (e *Emulator) IsAllowAll() bool { + return e.IsBlacklist() && len(e.rules) == 0 +} + var devicesListRegexp = regexp.MustCompile(`^([abc])\s+(\d+|\*):(\d+|\*)\s+([rwm]+)$`) func parseLine(line string) (*deviceRule, error) { diff --git a/libcontainer/cgroups/systemd/common.go b/libcontainer/cgroups/systemd/common.go index 725c0d6ffc9..2c610ff0909 100644 --- a/libcontainer/cgroups/systemd/common.go +++ b/libcontainer/cgroups/systemd/common.go @@ -1,6 +1,7 @@ package systemd import ( + "bufio" "fmt" "os" "strings" @@ -9,6 +10,7 @@ import ( systemdDbus "github.com/coreos/go-systemd/v22/dbus" dbus "github.com/godbus/dbus/v5" + "github.com/opencontainers/runc/libcontainer/cgroups/devices" "github.com/opencontainers/runc/libcontainer/configs" "github.com/pkg/errors" "github.com/sirupsen/logrus" @@ -69,6 +71,209 @@ func ExpandSlice(slice string) (string, error) { return path, nil } +func groupPrefix(ruleType configs.DeviceType) (string, error) { + switch ruleType { + case configs.BlockDevice: + return "block-", nil + case configs.CharDevice: + return "char-", nil + default: + return "", errors.Errorf("device type %v has no group prefix", ruleType) + } +} + +// findDeviceGroup tries to find the device group name (as listed in +// /proc/devices) with the type prefixed as requried for DeviceAllow, for a +// given (type, major) combination. If more than one device group exists, an +// arbitrary one is chosen. +func findDeviceGroup(ruleType configs.DeviceType, ruleMajor int64) (string, error) { + fh, err := os.Open("/proc/devices") + if err != nil { + return "", err + } + defer fh.Close() + + prefix, err := groupPrefix(ruleType) + if err != nil { + return "", err + } + + scanner := bufio.NewScanner(fh) + var currentType configs.DeviceType + for scanner.Scan() { + // We need to strip spaces because the first number is column-aligned. + line := strings.TrimSpace(scanner.Text()) + + // Handle the "header" lines. + switch line { + case "Block devices:": + currentType = configs.BlockDevice + continue + case "Character devices:": + currentType = configs.CharDevice + continue + case "": + continue + } + + // Skip lines unrelated to our type. + if currentType != ruleType { + continue + } + + // Parse out the (major, name). + var ( + currMajor int64 + currName string + ) + if n, err := fmt.Sscanf(line, "%d %s", &currMajor, &currName); err != nil || n != 2 { + if err == nil { + err = errors.Errorf("wrong number of fields") + } + return "", errors.Wrapf(err, "scan /proc/devices line %q", line) + } + + if currMajor == ruleMajor { + return prefix + currName, nil + } + } + if err := scanner.Err(); err != nil { + return "", errors.Wrap(err, "reading /proc/devices") + } + // Couldn't find the device group. + return "", nil +} + +// generateDeviceProperties takes the configured device rules and generates a +// corresponding set of systemd properties to configure the devices correctly. +func generateDeviceProperties(rules []*configs.DeviceRule) ([]systemdDbus.Property, error) { + // DeviceAllow is the type "a(ss)" which means we need a temporary struct + // to represent it in Go. + type deviceAllowEntry struct { + Path string + Perms string + } + + properties := []systemdDbus.Property{ + // Always run in the strictest white-list mode. + newProp("DevicePolicy", "strict"), + // Empty the DeviceAllow array before filling it. + newProp("DeviceAllow", []deviceAllowEntry{}), + } + + // Figure out the set of rules. + configEmu := &devices.Emulator{} + for _, rule := range rules { + if err := configEmu.Apply(*rule); err != nil { + return nil, errors.Wrap(err, "apply rule for systemd") + } + } + // systemd doesn't support blacklists. So we log a warning, and tell + // systemd to act as a deny-all whitelist. This ruleset will be replaced + // with our normal fallback code. This may result in spurrious errors, but + // the only other option is to error out here. + if configEmu.IsBlacklist() { + // However, if we're dealing with an allow-all rule then we can do it. + if configEmu.IsAllowAll() { + return []systemdDbus.Property{ + // Run in white-list mode by setting to "auto" and removing all + // DeviceAllow rules. + newProp("DevicePolicy", "auto"), + newProp("DeviceAllow", []deviceAllowEntry{}), + }, nil + } + logrus.Warn("systemd doesn't support blacklist device rules -- applying temporary deny-all rule") + return properties, nil + } + + // Now generate the set of rules we actually need to apply. Unlike the + // normal devices cgroup, in "strict" mode systemd defaults to a deny-all + // whitelist which is the default for devices.Emulator. + baseEmu := &devices.Emulator{} + finalRules, err := baseEmu.Transition(configEmu) + if err != nil { + return nil, errors.Wrap(err, "get simplified rules for systemd") + } + var deviceAllowList []deviceAllowEntry + for _, rule := range finalRules { + if !rule.Allow { + // Should never happen. + return nil, errors.Errorf("[internal error] cannot add deny rule to systemd DeviceAllow list: %v", *rule) + } + switch rule.Type { + case configs.BlockDevice, configs.CharDevice: + default: + // Should never happen. + return nil, errors.Errorf("invalid device type for DeviceAllow: %v", rule.Type) + } + + entry := deviceAllowEntry{ + Perms: string(rule.Permissions), + } + + // systemd has a fairly odd (though understandable) syntax here, and + // because of the OCI configuration format we have to do quite a bit of + // trickery to convert things: + // + // * Concrete rules with non-wildcard major/minor numbers have to use + // /dev/{block,char} paths. This is slightly odd because it means + // that we cannot add whitelist rules for devices that don't exist, + // but there's not too much we can do about that. + // + // However, path globbing is not support for path-based rules so we + // need to handle wildcards in some other manner. + // + // * Wildcard-minor rules have to specify a "device group name" (the + // second column in /proc/devices). + // + // * Wildcard (major and minor) rules can just specify a glob with the + // type ("char-*" or "block-*"). + // + // The only type of rule we can't handle is wildcard-major rules, and + // so we'll give a warning in that case (note that the fallback code + // will insert any rules systemd couldn't handle). What amazing fun. + + if rule.Major == configs.Wildcard { + // "_ *:n _" rules aren't supported by systemd. + if rule.Minor != configs.Wildcard { + logrus.Warnf("systemd doesn't support '*:n' device rules -- temporarily ignoring rule: %v", *rule) + continue + } + + // "_ *:* _" rules just wildcard everything. + prefix, err := groupPrefix(rule.Type) + if err != nil { + return nil, err + } + entry.Path = prefix + "*" + } else if rule.Minor == configs.Wildcard { + // "_ n:* _" rules require a device group from /proc/devices. + group, err := findDeviceGroup(rule.Type, rule.Major) + if err != nil { + return nil, errors.Wrapf(err, "find device '%v/%d'", rule.Type, rule.Major) + } + if group == "" { + // Couldn't find a group. + logrus.Warnf("could not find device group for '%v/%d' in /proc/devices -- temporarily ignoring rule: %v", rule.Type, rule.Major, *rule) + continue + } + entry.Path = group + } else { + // "_ n:m _" rules are just a path in /dev/{block,char}/. + switch rule.Type { + case configs.BlockDevice: + entry.Path = fmt.Sprintf("/dev/block/%d:%d", rule.Major, rule.Minor) + case configs.CharDevice: + entry.Path = fmt.Sprintf("/dev/char/%d:%d", rule.Major, rule.Minor) + } + } + deviceAllowList = append(deviceAllowList, entry) + } + + properties = append(properties, newProp("DeviceAllow", deviceAllowList)) + return properties, nil +} + // getDbusConnection lazy initializes systemd dbus connection // and returns it func getDbusConnection(rootless bool) (*systemdDbus.Conn, error) { diff --git a/libcontainer/cgroups/systemd/v1.go b/libcontainer/cgroups/systemd/v1.go index 933d46e0caf..d5bc17139c1 100644 --- a/libcontainer/cgroups/systemd/v1.go +++ b/libcontainer/cgroups/systemd/v1.go @@ -15,6 +15,7 @@ import ( "github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/cgroups/fs" "github.com/opencontainers/runc/libcontainer/configs" + "github.com/sirupsen/logrus" ) type legacyManager struct { @@ -70,6 +71,13 @@ var legacySubsystems = subsystemSet{ func genV1ResourcesProperties(c *configs.Cgroup) ([]systemdDbus.Property, error) { var properties []systemdDbus.Property + + deviceProperties, err := generateDeviceProperties(c.Resources.Devices) + if err != nil { + return nil, err + } + properties = append(properties, deviceProperties...) + if c.Resources.Memory != 0 { properties = append(properties, newProp("MemoryLimit", uint64(c.Resources.Memory))) @@ -381,21 +389,47 @@ func (m *legacyManager) Set(container *configs.Config) error { if err != nil { return err } + + // Figure out the current freezer state, so we can revert to it after we + // temporarily freeze the container. + targetFreezerState, err := m.GetFreezerState() + if err != nil { + return err + } + if targetFreezerState == configs.Undefined { + targetFreezerState = configs.Thawed + } + + // We have to freeze the container while systemd sets the cgroup settings. + // The reason for this is that systemd's application of DeviceAllow rules + // is done disruptively, resulting in spurrious errors to common devices + // (unlike our fs driver, they will happily write deny-all rules to running + // containers). So we freeze the container to avoid them hitting the cgroup + // error. But if the freezer cgroup isn't supported, we just warn about it. + if err := m.Freeze(configs.Frozen); err != nil { + logrus.Infof("freeze container before SetUnitProperties failed: %v", err) + } + dbusConnection, err := getDbusConnection(false) if err != nil { + _ = m.Freeze(targetFreezerState) return err } if err := dbusConnection.SetUnitProperties(getUnitName(container.Cgroups), true, properties...); err != nil { + _ = m.Freeze(targetFreezerState) return err } + // Reset freezer state before we apply the configuration, to avoid clashing + // with the freezer setting in the configuration. + _ = m.Freeze(targetFreezerState) + for _, sys := range legacySubsystems { // Get the subsystem path, but don't error out for not found cgroups. path, err := getSubsystemPath(container.Cgroups, sys.Name()) if err != nil && !cgroups.IsNotFound(err) { return err } - if err := sys.Set(path, container.Cgroups); err != nil { return err } diff --git a/libcontainer/cgroups/systemd/v2.go b/libcontainer/cgroups/systemd/v2.go index a23b40e1ebc..59d7d8999e9 100644 --- a/libcontainer/cgroups/systemd/v2.go +++ b/libcontainer/cgroups/systemd/v2.go @@ -16,6 +16,7 @@ import ( "github.com/opencontainers/runc/libcontainer/cgroups/fs2" "github.com/opencontainers/runc/libcontainer/configs" "github.com/pkg/errors" + "github.com/sirupsen/logrus" ) type unifiedManager struct { @@ -37,6 +38,17 @@ func NewUnifiedManager(config *configs.Cgroup, path string, rootless bool) cgrou func genV2ResourcesProperties(c *configs.Cgroup) ([]systemdDbus.Property, error) { var properties []systemdDbus.Property + // NOTE: This is of questionable correctness because we insert our own + // devices eBPF program later. Two programs with identical rules + // aren't the end of the world, but it is a bit concerning. However + // it's unclear if systemd removes all eBPF programs attached when + // doing SetUnitProperties... + deviceProperties, err := generateDeviceProperties(c.Resources.Devices) + if err != nil { + return nil, err + } + properties = append(properties, deviceProperties...) + if c.Resources.Memory != 0 { properties = append(properties, newProp("MemoryMax", uint64(c.Resources.Memory))) @@ -295,14 +307,41 @@ func (m *unifiedManager) Set(container *configs.Config) error { if err != nil { return err } + + // Figure out the current freezer state, so we can revert to it after we + // temporarily freeze the container. + targetFreezerState, err := m.GetFreezerState() + if err != nil { + return err + } + if targetFreezerState == configs.Undefined { + targetFreezerState = configs.Thawed + } + + // We have to freeze the container while systemd sets the cgroup settings. + // The reason for this is that systemd's application of DeviceAllow rules + // is done disruptively, resulting in spurrious errors to common devices + // (unlike our fs driver, they will happily write deny-all rules to running + // containers). So we freeze the container to avoid them hitting the cgroup + // error. But if the freezer cgroup isn't supported, we just warn about it. + if err := m.Freeze(configs.Frozen); err != nil { + logrus.Infof("freeze container before SetUnitProperties failed: %v", err) + } + dbusConnection, err := getDbusConnection(m.rootless) if err != nil { + _ = m.Freeze(targetFreezerState) return err } if err := dbusConnection.SetUnitProperties(getUnitName(m.cgroups), true, properties...); err != nil { + _ = m.Freeze(targetFreezerState) return errors.Wrap(err, "error while setting unit properties") } + // Reset freezer state before we apply the configuration, to avoid clashing + // with the freezer setting in the configuration. + _ = m.Freeze(targetFreezerState) + fsMgr, err := m.fsManager() if err != nil { return err