Skip to content

Commit

Permalink
cgroups: systemd: make use of Device*= properties
Browse files Browse the repository at this point in the history
It seems we missed that systemd added support for the devices cgroup, as
a result systemd would actually *write an allow-all rule each time you
did 'runc update'* if you used the systemd cgroup driver. This is
obviously ... bad and was a clear security bug. Luckily the commits which
introduced this were never in an actual runc release.

So we simply generate the cgroupv1-style rules (which is what systemd's
DeviceAllow wants) and default to a deny-all ruleset. Unfortunately it
turns out that systemd is susceptible to the same spurrious error
failure that we were, so that problem is out of our hands for systemd
cgroup users.

However, systemd has a similar bug to the one fixed in [1]. It will
happily write a disruptive deny-all rule when it is not necessary.
Unfortunately, we cannot even use devices.Emulator to generate a minimal
set of transition rules because the DBus API is limited (you can only
clear or append to the DeviceAllow= list -- so we are forced to always
clear it). To work around this, we simply freeze the container during
SetUnitProperties.

[1]: afe8348 ("cgroupv1: devices: use minimal transition rules with devices.Emulator")

Fixes: 1d4ccc8 ("fix data inconsistent when runc update in systemd driven cgroup v1")
Fixes: 7682a2b ("fix data inconsistent when runc update in systemd driven cgroup v2")
Signed-off-by: Aleksa Sarai <[email protected]>
  • Loading branch information
cyphar committed May 13, 2020
1 parent afe8348 commit b810da1
Show file tree
Hide file tree
Showing 4 changed files with 283 additions and 1 deletion.
4 changes: 4 additions & 0 deletions libcontainer/cgroups/devices/devices_emulator.go
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,10 @@ func (e *Emulator) IsBlacklist() bool {
return e.defaultAllow
}

func (e *Emulator) IsAllowAll() bool {
return e.IsBlacklist() && len(e.rules) == 0
}

var devicesListRegexp = regexp.MustCompile(`^([abc])\s+(\d+|\*):(\d+|\*)\s+([rwm]+)$`)

func parseLine(line string) (*deviceRule, error) {
Expand Down
205 changes: 205 additions & 0 deletions libcontainer/cgroups/systemd/common.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package systemd

import (
"bufio"
"fmt"
"os"
"strings"
Expand All @@ -9,6 +10,7 @@ import (

systemdDbus "github.com/coreos/go-systemd/v22/dbus"
dbus "github.com/godbus/dbus/v5"
"github.com/opencontainers/runc/libcontainer/cgroups/devices"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/pkg/errors"
"github.com/sirupsen/logrus"
Expand Down Expand Up @@ -69,6 +71,209 @@ func ExpandSlice(slice string) (string, error) {
return path, nil
}

func groupPrefix(ruleType configs.DeviceType) (string, error) {
switch ruleType {
case configs.BlockDevice:
return "block-", nil
case configs.CharDevice:
return "char-", nil
default:
return "", errors.Errorf("device type %v has no group prefix", ruleType)
}
}

// findDeviceGroup tries to find the device group name (as listed in
// /proc/devices) with the type prefixed as requried for DeviceAllow, for a
// given (type, major) combination. If more than one device group exists, an
// arbitrary one is chosen.
func findDeviceGroup(ruleType configs.DeviceType, ruleMajor int64) (string, error) {
fh, err := os.Open("/proc/devices")
if err != nil {
return "", err
}
defer fh.Close()

prefix, err := groupPrefix(ruleType)
if err != nil {
return "", err
}

scanner := bufio.NewScanner(fh)
var currentType configs.DeviceType
for scanner.Scan() {
// We need to strip spaces because the first number is column-aligned.
line := strings.TrimSpace(scanner.Text())

// Handle the "header" lines.
switch line {
case "Block devices:":
currentType = configs.BlockDevice
continue
case "Character devices:":
currentType = configs.CharDevice
continue
case "":
continue
}

// Skip lines unrelated to our type.
if currentType != ruleType {
continue
}

// Parse out the (major, name).
var (
currMajor int64
currName string
)
if n, err := fmt.Sscanf(line, "%d %s", &currMajor, &currName); err != nil || n != 2 {
if err == nil {
err = errors.Errorf("wrong number of fields")
}
return "", errors.Wrapf(err, "scan /proc/devices line %q", line)
}

if currMajor == ruleMajor {
return prefix + currName, nil
}
}
if err := scanner.Err(); err != nil {
return "", errors.Wrap(err, "reading /proc/devices")
}
// Couldn't find the device group.
return "", nil
}

// generateDeviceProperties takes the configured device rules and generates a
// corresponding set of systemd properties to configure the devices correctly.
func generateDeviceProperties(rules []*configs.DeviceRule) ([]systemdDbus.Property, error) {
// DeviceAllow is the type "a(ss)" which means we need a temporary struct
// to represent it in Go.
type deviceAllowEntry struct {
Path string
Perms string
}

properties := []systemdDbus.Property{
// Always run in the strictest white-list mode.
newProp("DevicePolicy", "strict"),
// Empty the DeviceAllow array before filling it.
newProp("DeviceAllow", []deviceAllowEntry{}),
}

// Figure out the set of rules.
configEmu := &devices.Emulator{}
for _, rule := range rules {
if err := configEmu.Apply(*rule); err != nil {
return nil, errors.Wrap(err, "apply rule for systemd")
}
}
// systemd doesn't support blacklists. So we log a warning, and tell
// systemd to act as a deny-all whitelist. This ruleset will be replaced
// with our normal fallback code. This may result in spurrious errors, but
// the only other option is to error out here.
if configEmu.IsBlacklist() {
// However, if we're dealing with an allow-all rule then we can do it.
if configEmu.IsAllowAll() {
return []systemdDbus.Property{
// Run in white-list mode by setting to "auto" and removing all
// DeviceAllow rules.
newProp("DevicePolicy", "auto"),
newProp("DeviceAllow", []deviceAllowEntry{}),
}, nil
}
logrus.Warn("systemd doesn't support blacklist device rules -- applying temporary deny-all rule")
return properties, nil
}

// Now generate the set of rules we actually need to apply. Unlike the
// normal devices cgroup, in "strict" mode systemd defaults to a deny-all
// whitelist which is the default for devices.Emulator.
baseEmu := &devices.Emulator{}
finalRules, err := baseEmu.Transition(configEmu)
if err != nil {
return nil, errors.Wrap(err, "get simplified rules for systemd")
}
var deviceAllowList []deviceAllowEntry
for _, rule := range finalRules {
if !rule.Allow {
// Should never happen.
return nil, errors.Errorf("[internal error] cannot add deny rule to systemd DeviceAllow list: %v", *rule)
}
switch rule.Type {
case configs.BlockDevice, configs.CharDevice:
default:
// Should never happen.
return nil, errors.Errorf("invalid device type for DeviceAllow: %v", rule.Type)
}

entry := deviceAllowEntry{
Perms: string(rule.Permissions),
}

// systemd has a fairly odd (though understandable) syntax here, and
// because of the OCI configuration format we have to do quite a bit of
// trickery to convert things:
//
// * Concrete rules with non-wildcard major/minor numbers have to use
// /dev/{block,char} paths. This is slightly odd because it means
// that we cannot add whitelist rules for devices that don't exist,
// but there's not too much we can do about that.
//
// However, path globbing is not support for path-based rules so we
// need to handle wildcards in some other manner.
//
// * Wildcard-minor rules have to specify a "device group name" (the
// second column in /proc/devices).
//
// * Wildcard (major and minor) rules can just specify a glob with the
// type ("char-*" or "block-*").
//
// The only type of rule we can't handle is wildcard-major rules, and
// so we'll give a warning in that case (note that the fallback code
// will insert any rules systemd couldn't handle). What amazing fun.

if rule.Major == configs.Wildcard {
// "_ *:n _" rules aren't supported by systemd.
if rule.Minor != configs.Wildcard {
logrus.Warnf("systemd doesn't support '*:n' device rules -- temporarily ignoring rule: %v", *rule)
continue
}

// "_ *:* _" rules just wildcard everything.
prefix, err := groupPrefix(rule.Type)
if err != nil {
return nil, err
}
entry.Path = prefix + "*"
} else if rule.Minor == configs.Wildcard {
// "_ n:* _" rules require a device group from /proc/devices.
group, err := findDeviceGroup(rule.Type, rule.Major)
if err != nil {
return nil, errors.Wrapf(err, "find device '%v/%d'", rule.Type, rule.Major)
}
if group == "" {
// Couldn't find a group.
logrus.Warnf("could not find device group for '%v/%d' in /proc/devices -- temporarily ignoring rule: %v", rule.Type, rule.Major, *rule)
continue
}
entry.Path = group
} else {
// "_ n:m _" rules are just a path in /dev/{block,char}/.
switch rule.Type {
case configs.BlockDevice:
entry.Path = fmt.Sprintf("/dev/block/%d:%d", rule.Major, rule.Minor)
case configs.CharDevice:
entry.Path = fmt.Sprintf("/dev/char/%d:%d", rule.Major, rule.Minor)
}
}
deviceAllowList = append(deviceAllowList, entry)
}

properties = append(properties, newProp("DeviceAllow", deviceAllowList))
return properties, nil
}

// getDbusConnection lazy initializes systemd dbus connection
// and returns it
func getDbusConnection(rootless bool) (*systemdDbus.Conn, error) {
Expand Down
36 changes: 35 additions & 1 deletion libcontainer/cgroups/systemd/v1.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ import (
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fs"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/sirupsen/logrus"
)

type legacyManager struct {
Expand Down Expand Up @@ -70,6 +71,13 @@ var legacySubsystems = subsystemSet{

func genV1ResourcesProperties(c *configs.Cgroup) ([]systemdDbus.Property, error) {
var properties []systemdDbus.Property

deviceProperties, err := generateDeviceProperties(c.Resources.Devices)
if err != nil {
return nil, err
}
properties = append(properties, deviceProperties...)

if c.Resources.Memory != 0 {
properties = append(properties,
newProp("MemoryLimit", uint64(c.Resources.Memory)))
Expand Down Expand Up @@ -381,21 +389,47 @@ func (m *legacyManager) Set(container *configs.Config) error {
if err != nil {
return err
}

// Figure out the current freezer state, so we can revert to it after we
// temporarily freeze the container.
targetFreezerState, err := m.GetFreezerState()
if err != nil {
return err
}
if targetFreezerState == configs.Undefined {
targetFreezerState = configs.Thawed
}

// We have to freeze the container while systemd sets the cgroup settings.
// The reason for this is that systemd's application of DeviceAllow rules
// is done disruptively, resulting in spurrious errors to common devices
// (unlike our fs driver, they will happily write deny-all rules to running
// containers). So we freeze the container to avoid them hitting the cgroup
// error. But if the freezer cgroup isn't supported, we just warn about it.
if err := m.Freeze(configs.Frozen); err != nil {
logrus.Infof("freeze container before SetUnitProperties failed: %v", err)
}

dbusConnection, err := getDbusConnection(false)
if err != nil {
_ = m.Freeze(targetFreezerState)
return err
}
if err := dbusConnection.SetUnitProperties(getUnitName(container.Cgroups), true, properties...); err != nil {
_ = m.Freeze(targetFreezerState)
return err
}

// Reset freezer state before we apply the configuration, to avoid clashing
// with the freezer setting in the configuration.
_ = m.Freeze(targetFreezerState)

for _, sys := range legacySubsystems {
// Get the subsystem path, but don't error out for not found cgroups.
path, err := getSubsystemPath(container.Cgroups, sys.Name())
if err != nil && !cgroups.IsNotFound(err) {
return err
}

if err := sys.Set(path, container.Cgroups); err != nil {
return err
}
Expand Down
39 changes: 39 additions & 0 deletions libcontainer/cgroups/systemd/v2.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ import (
"github.com/opencontainers/runc/libcontainer/cgroups/fs2"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/pkg/errors"
"github.com/sirupsen/logrus"
)

type unifiedManager struct {
Expand All @@ -37,6 +38,17 @@ func NewUnifiedManager(config *configs.Cgroup, path string, rootless bool) cgrou
func genV2ResourcesProperties(c *configs.Cgroup) ([]systemdDbus.Property, error) {
var properties []systemdDbus.Property

// NOTE: This is of questionable correctness because we insert our own
// devices eBPF program later. Two programs with identical rules
// aren't the end of the world, but it is a bit concerning. However
// it's unclear if systemd removes all eBPF programs attached when
// doing SetUnitProperties...
deviceProperties, err := generateDeviceProperties(c.Resources.Devices)
if err != nil {
return nil, err
}
properties = append(properties, deviceProperties...)

if c.Resources.Memory != 0 {
properties = append(properties,
newProp("MemoryMax", uint64(c.Resources.Memory)))
Expand Down Expand Up @@ -295,14 +307,41 @@ func (m *unifiedManager) Set(container *configs.Config) error {
if err != nil {
return err
}

// Figure out the current freezer state, so we can revert to it after we
// temporarily freeze the container.
targetFreezerState, err := m.GetFreezerState()
if err != nil {
return err
}
if targetFreezerState == configs.Undefined {
targetFreezerState = configs.Thawed
}

// We have to freeze the container while systemd sets the cgroup settings.
// The reason for this is that systemd's application of DeviceAllow rules
// is done disruptively, resulting in spurrious errors to common devices
// (unlike our fs driver, they will happily write deny-all rules to running
// containers). So we freeze the container to avoid them hitting the cgroup
// error. But if the freezer cgroup isn't supported, we just warn about it.
if err := m.Freeze(configs.Frozen); err != nil {
logrus.Infof("freeze container before SetUnitProperties failed: %v", err)
}

dbusConnection, err := getDbusConnection(m.rootless)
if err != nil {
_ = m.Freeze(targetFreezerState)
return err
}
if err := dbusConnection.SetUnitProperties(getUnitName(m.cgroups), true, properties...); err != nil {
_ = m.Freeze(targetFreezerState)
return errors.Wrap(err, "error while setting unit properties")
}

// Reset freezer state before we apply the configuration, to avoid clashing
// with the freezer setting in the configuration.
_ = m.Freeze(targetFreezerState)

fsMgr, err := m.fsManager()
if err != nil {
return err
Expand Down

0 comments on commit b810da1

Please sign in to comment.