diff --git a/libcontainer/cgroups/devices/devices_emulator.go b/libcontainer/cgroups/devices/devices_emulator.go index 9d9a60e39dc..cfb535bd4d8 100644 --- a/libcontainer/cgroups/devices/devices_emulator.go +++ b/libcontainer/cgroups/devices/devices_emulator.go @@ -57,6 +57,10 @@ func (e *Emulator) IsBlacklist() bool { return e.defaultAllow } +func (e *Emulator) IsAllowAll() bool { + return e.IsBlacklist() && len(e.rules) == 0 +} + var devicesListRegexp = regexp.MustCompile(`^([abc])\s+(\d+|\*):(\d+|\*)\s+([rwm]+)$`) func parseLine(line string) (*deviceRule, error) { diff --git a/libcontainer/cgroups/systemd/common.go b/libcontainer/cgroups/systemd/common.go index 725c0d6ffc9..2c610ff0909 100644 --- a/libcontainer/cgroups/systemd/common.go +++ b/libcontainer/cgroups/systemd/common.go @@ -1,6 +1,7 @@ package systemd import ( + "bufio" "fmt" "os" "strings" @@ -9,6 +10,7 @@ import ( systemdDbus "github.com/coreos/go-systemd/v22/dbus" dbus "github.com/godbus/dbus/v5" + "github.com/opencontainers/runc/libcontainer/cgroups/devices" "github.com/opencontainers/runc/libcontainer/configs" "github.com/pkg/errors" "github.com/sirupsen/logrus" @@ -69,6 +71,209 @@ func ExpandSlice(slice string) (string, error) { return path, nil } +func groupPrefix(ruleType configs.DeviceType) (string, error) { + switch ruleType { + case configs.BlockDevice: + return "block-", nil + case configs.CharDevice: + return "char-", nil + default: + return "", errors.Errorf("device type %v has no group prefix", ruleType) + } +} + +// findDeviceGroup tries to find the device group name (as listed in +// /proc/devices) with the type prefixed as requried for DeviceAllow, for a +// given (type, major) combination. If more than one device group exists, an +// arbitrary one is chosen. +func findDeviceGroup(ruleType configs.DeviceType, ruleMajor int64) (string, error) { + fh, err := os.Open("/proc/devices") + if err != nil { + return "", err + } + defer fh.Close() + + prefix, err := groupPrefix(ruleType) + if err != nil { + return "", err + } + + scanner := bufio.NewScanner(fh) + var currentType configs.DeviceType + for scanner.Scan() { + // We need to strip spaces because the first number is column-aligned. + line := strings.TrimSpace(scanner.Text()) + + // Handle the "header" lines. + switch line { + case "Block devices:": + currentType = configs.BlockDevice + continue + case "Character devices:": + currentType = configs.CharDevice + continue + case "": + continue + } + + // Skip lines unrelated to our type. + if currentType != ruleType { + continue + } + + // Parse out the (major, name). + var ( + currMajor int64 + currName string + ) + if n, err := fmt.Sscanf(line, "%d %s", &currMajor, &currName); err != nil || n != 2 { + if err == nil { + err = errors.Errorf("wrong number of fields") + } + return "", errors.Wrapf(err, "scan /proc/devices line %q", line) + } + + if currMajor == ruleMajor { + return prefix + currName, nil + } + } + if err := scanner.Err(); err != nil { + return "", errors.Wrap(err, "reading /proc/devices") + } + // Couldn't find the device group. + return "", nil +} + +// generateDeviceProperties takes the configured device rules and generates a +// corresponding set of systemd properties to configure the devices correctly. +func generateDeviceProperties(rules []*configs.DeviceRule) ([]systemdDbus.Property, error) { + // DeviceAllow is the type "a(ss)" which means we need a temporary struct + // to represent it in Go. + type deviceAllowEntry struct { + Path string + Perms string + } + + properties := []systemdDbus.Property{ + // Always run in the strictest white-list mode. + newProp("DevicePolicy", "strict"), + // Empty the DeviceAllow array before filling it. + newProp("DeviceAllow", []deviceAllowEntry{}), + } + + // Figure out the set of rules. + configEmu := &devices.Emulator{} + for _, rule := range rules { + if err := configEmu.Apply(*rule); err != nil { + return nil, errors.Wrap(err, "apply rule for systemd") + } + } + // systemd doesn't support blacklists. So we log a warning, and tell + // systemd to act as a deny-all whitelist. This ruleset will be replaced + // with our normal fallback code. This may result in spurrious errors, but + // the only other option is to error out here. + if configEmu.IsBlacklist() { + // However, if we're dealing with an allow-all rule then we can do it. + if configEmu.IsAllowAll() { + return []systemdDbus.Property{ + // Run in white-list mode by setting to "auto" and removing all + // DeviceAllow rules. + newProp("DevicePolicy", "auto"), + newProp("DeviceAllow", []deviceAllowEntry{}), + }, nil + } + logrus.Warn("systemd doesn't support blacklist device rules -- applying temporary deny-all rule") + return properties, nil + } + + // Now generate the set of rules we actually need to apply. Unlike the + // normal devices cgroup, in "strict" mode systemd defaults to a deny-all + // whitelist which is the default for devices.Emulator. + baseEmu := &devices.Emulator{} + finalRules, err := baseEmu.Transition(configEmu) + if err != nil { + return nil, errors.Wrap(err, "get simplified rules for systemd") + } + var deviceAllowList []deviceAllowEntry + for _, rule := range finalRules { + if !rule.Allow { + // Should never happen. + return nil, errors.Errorf("[internal error] cannot add deny rule to systemd DeviceAllow list: %v", *rule) + } + switch rule.Type { + case configs.BlockDevice, configs.CharDevice: + default: + // Should never happen. + return nil, errors.Errorf("invalid device type for DeviceAllow: %v", rule.Type) + } + + entry := deviceAllowEntry{ + Perms: string(rule.Permissions), + } + + // systemd has a fairly odd (though understandable) syntax here, and + // because of the OCI configuration format we have to do quite a bit of + // trickery to convert things: + // + // * Concrete rules with non-wildcard major/minor numbers have to use + // /dev/{block,char} paths. This is slightly odd because it means + // that we cannot add whitelist rules for devices that don't exist, + // but there's not too much we can do about that. + // + // However, path globbing is not support for path-based rules so we + // need to handle wildcards in some other manner. + // + // * Wildcard-minor rules have to specify a "device group name" (the + // second column in /proc/devices). + // + // * Wildcard (major and minor) rules can just specify a glob with the + // type ("char-*" or "block-*"). + // + // The only type of rule we can't handle is wildcard-major rules, and + // so we'll give a warning in that case (note that the fallback code + // will insert any rules systemd couldn't handle). What amazing fun. + + if rule.Major == configs.Wildcard { + // "_ *:n _" rules aren't supported by systemd. + if rule.Minor != configs.Wildcard { + logrus.Warnf("systemd doesn't support '*:n' device rules -- temporarily ignoring rule: %v", *rule) + continue + } + + // "_ *:* _" rules just wildcard everything. + prefix, err := groupPrefix(rule.Type) + if err != nil { + return nil, err + } + entry.Path = prefix + "*" + } else if rule.Minor == configs.Wildcard { + // "_ n:* _" rules require a device group from /proc/devices. + group, err := findDeviceGroup(rule.Type, rule.Major) + if err != nil { + return nil, errors.Wrapf(err, "find device '%v/%d'", rule.Type, rule.Major) + } + if group == "" { + // Couldn't find a group. + logrus.Warnf("could not find device group for '%v/%d' in /proc/devices -- temporarily ignoring rule: %v", rule.Type, rule.Major, *rule) + continue + } + entry.Path = group + } else { + // "_ n:m _" rules are just a path in /dev/{block,char}/. + switch rule.Type { + case configs.BlockDevice: + entry.Path = fmt.Sprintf("/dev/block/%d:%d", rule.Major, rule.Minor) + case configs.CharDevice: + entry.Path = fmt.Sprintf("/dev/char/%d:%d", rule.Major, rule.Minor) + } + } + deviceAllowList = append(deviceAllowList, entry) + } + + properties = append(properties, newProp("DeviceAllow", deviceAllowList)) + return properties, nil +} + // getDbusConnection lazy initializes systemd dbus connection // and returns it func getDbusConnection(rootless bool) (*systemdDbus.Conn, error) { diff --git a/libcontainer/cgroups/systemd/v1.go b/libcontainer/cgroups/systemd/v1.go index 933d46e0caf..d5bc17139c1 100644 --- a/libcontainer/cgroups/systemd/v1.go +++ b/libcontainer/cgroups/systemd/v1.go @@ -15,6 +15,7 @@ import ( "github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/cgroups/fs" "github.com/opencontainers/runc/libcontainer/configs" + "github.com/sirupsen/logrus" ) type legacyManager struct { @@ -70,6 +71,13 @@ var legacySubsystems = subsystemSet{ func genV1ResourcesProperties(c *configs.Cgroup) ([]systemdDbus.Property, error) { var properties []systemdDbus.Property + + deviceProperties, err := generateDeviceProperties(c.Resources.Devices) + if err != nil { + return nil, err + } + properties = append(properties, deviceProperties...) + if c.Resources.Memory != 0 { properties = append(properties, newProp("MemoryLimit", uint64(c.Resources.Memory))) @@ -381,21 +389,47 @@ func (m *legacyManager) Set(container *configs.Config) error { if err != nil { return err } + + // Figure out the current freezer state, so we can revert to it after we + // temporarily freeze the container. + targetFreezerState, err := m.GetFreezerState() + if err != nil { + return err + } + if targetFreezerState == configs.Undefined { + targetFreezerState = configs.Thawed + } + + // We have to freeze the container while systemd sets the cgroup settings. + // The reason for this is that systemd's application of DeviceAllow rules + // is done disruptively, resulting in spurrious errors to common devices + // (unlike our fs driver, they will happily write deny-all rules to running + // containers). So we freeze the container to avoid them hitting the cgroup + // error. But if the freezer cgroup isn't supported, we just warn about it. + if err := m.Freeze(configs.Frozen); err != nil { + logrus.Infof("freeze container before SetUnitProperties failed: %v", err) + } + dbusConnection, err := getDbusConnection(false) if err != nil { + _ = m.Freeze(targetFreezerState) return err } if err := dbusConnection.SetUnitProperties(getUnitName(container.Cgroups), true, properties...); err != nil { + _ = m.Freeze(targetFreezerState) return err } + // Reset freezer state before we apply the configuration, to avoid clashing + // with the freezer setting in the configuration. + _ = m.Freeze(targetFreezerState) + for _, sys := range legacySubsystems { // Get the subsystem path, but don't error out for not found cgroups. path, err := getSubsystemPath(container.Cgroups, sys.Name()) if err != nil && !cgroups.IsNotFound(err) { return err } - if err := sys.Set(path, container.Cgroups); err != nil { return err } diff --git a/libcontainer/cgroups/systemd/v2.go b/libcontainer/cgroups/systemd/v2.go index a23b40e1ebc..59d7d8999e9 100644 --- a/libcontainer/cgroups/systemd/v2.go +++ b/libcontainer/cgroups/systemd/v2.go @@ -16,6 +16,7 @@ import ( "github.com/opencontainers/runc/libcontainer/cgroups/fs2" "github.com/opencontainers/runc/libcontainer/configs" "github.com/pkg/errors" + "github.com/sirupsen/logrus" ) type unifiedManager struct { @@ -37,6 +38,17 @@ func NewUnifiedManager(config *configs.Cgroup, path string, rootless bool) cgrou func genV2ResourcesProperties(c *configs.Cgroup) ([]systemdDbus.Property, error) { var properties []systemdDbus.Property + // NOTE: This is of questionable correctness because we insert our own + // devices eBPF program later. Two programs with identical rules + // aren't the end of the world, but it is a bit concerning. However + // it's unclear if systemd removes all eBPF programs attached when + // doing SetUnitProperties... + deviceProperties, err := generateDeviceProperties(c.Resources.Devices) + if err != nil { + return nil, err + } + properties = append(properties, deviceProperties...) + if c.Resources.Memory != 0 { properties = append(properties, newProp("MemoryMax", uint64(c.Resources.Memory))) @@ -295,14 +307,41 @@ func (m *unifiedManager) Set(container *configs.Config) error { if err != nil { return err } + + // Figure out the current freezer state, so we can revert to it after we + // temporarily freeze the container. + targetFreezerState, err := m.GetFreezerState() + if err != nil { + return err + } + if targetFreezerState == configs.Undefined { + targetFreezerState = configs.Thawed + } + + // We have to freeze the container while systemd sets the cgroup settings. + // The reason for this is that systemd's application of DeviceAllow rules + // is done disruptively, resulting in spurrious errors to common devices + // (unlike our fs driver, they will happily write deny-all rules to running + // containers). So we freeze the container to avoid them hitting the cgroup + // error. But if the freezer cgroup isn't supported, we just warn about it. + if err := m.Freeze(configs.Frozen); err != nil { + logrus.Infof("freeze container before SetUnitProperties failed: %v", err) + } + dbusConnection, err := getDbusConnection(m.rootless) if err != nil { + _ = m.Freeze(targetFreezerState) return err } if err := dbusConnection.SetUnitProperties(getUnitName(m.cgroups), true, properties...); err != nil { + _ = m.Freeze(targetFreezerState) return errors.Wrap(err, "error while setting unit properties") } + // Reset freezer state before we apply the configuration, to avoid clashing + // with the freezer setting in the configuration. + _ = m.Freeze(targetFreezerState) + fsMgr, err := m.fsManager() if err != nil { return err