diff --git a/MAINTAINERS b/MAINTAINERS index 9bee195..413edcb 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1,8 +1,8 @@ -This meta-project is maintained by the union of MAINTAINERS for all OCI Projects [1]. - -Other OCI Projects should list one maintainer per line, with a name, email address, and GitHub username: - -Random J Developer (@RandomJDeveloperExample) -A. U. Thor (@AUThorExample) - -[1]: https://github.com/opencontainers/ +Akihiro Suda (@AkihiroSuda) +Aleksa Sarai (@cyphar) +Kir Kolyshkin (@kolyshkin) +Mrunal Patel (@mrunalp) +Sebastiaan van Stijn (@thaJeztah) +Odin Ugedal (@odinuge) +Peter Hunt (@haircommander) +Davanum Srinivas (@dims) diff --git a/cgroups.go b/cgroups.go new file mode 100644 index 0000000..1f12755 --- /dev/null +++ b/cgroups.go @@ -0,0 +1,78 @@ +package cgroups + +import ( + "errors" +) + +var ( + // ErrDevicesUnsupported is an error returned when a cgroup manager + // is not configured to set device rules. + ErrDevicesUnsupported = errors.New("cgroup manager is not configured to set device rules") + + // ErrRootless is returned by [Manager.Apply] when there is an error + // creating cgroup directory, and cgroup.Rootless is set. In general, + // this error is to be ignored. + ErrRootless = errors.New("cgroup manager can not access cgroup (rootless container)") + + // DevicesSetV1 and DevicesSetV2 are functions to set devices for + // cgroup v1 and v2, respectively. Unless + // [github.com/opencontainers/cgroups/devices] + // package is imported, it is set to nil, so cgroup managers can't + // manage devices. + DevicesSetV1 func(path string, r *Resources) error + DevicesSetV2 func(path string, r *Resources) error +) + +type Manager interface { + // Apply creates a cgroup, if not yet created, and adds a process + // with the specified pid into that cgroup. A special value of -1 + // can be used to merely create a cgroup. + Apply(pid int) error + + // GetPids returns the PIDs of all processes inside the cgroup. 
+ GetPids() ([]int, error) + + // GetAllPids returns the PIDs of all processes inside the cgroup + // and all its sub-cgroups. + GetAllPids() ([]int, error) + + // GetStats returns cgroups statistics. + GetStats() (*Stats, error) + + // Freeze sets the freezer cgroup to the specified state. + Freeze(state FreezerState) error + + // Destroy removes cgroup. + Destroy() error + + // Path returns a cgroup path to the specified controller/subsystem. + // For cgroupv2, the argument is unused and can be empty. + Path(string) string + + // Set sets cgroup resources parameters/limits. If the argument is nil, + // the resources specified during Manager creation (or the previous call + // to Set) are used. + Set(r *Resources) error + + // GetPaths returns cgroup path(s) to save in a state file in order to + // restore later. + // + // For cgroup v1, a key is cgroup subsystem name, and the value is the + // path to the cgroup for this subsystem. + // + // For cgroup v2 unified hierarchy, a key is "", and the value is the + // unified path. + GetPaths() map[string]string + + // GetCgroups returns the cgroup data as configured. + GetCgroups() (*Cgroup, error) + + // GetFreezerState retrieves the current FreezerState of the cgroup. + GetFreezerState() (FreezerState, error) + + // Exists returns whether the cgroup path exists or not. + Exists() bool + + // OOMKillCount reports OOM kill count for the cgroup. + OOMKillCount() (uint64, error) +} diff --git a/cgroups_test.go b/cgroups_test.go new file mode 100644 index 0000000..b7ca7b1 --- /dev/null +++ b/cgroups_test.go @@ -0,0 +1,21 @@ +package cgroups + +import ( + "testing" +) + +func TestParseCgroups(t *testing.T) { + // We don't need to use /proc/thread-self here because runc always runs + // with every thread in the same cgroup. This lets us avoid having to do + // runtime.LockOSThread. 
+ cgroups, err := ParseCgroupFile("/proc/self/cgroup") + if err != nil { + t.Fatal(err) + } + if IsCgroup2UnifiedMode() { + return + } + if _, ok := cgroups["cpu"]; !ok { + t.Fail() + } +} diff --git a/config_blkio_device.go b/config_blkio_device.go new file mode 100644 index 0000000..9dc2a03 --- /dev/null +++ b/config_blkio_device.go @@ -0,0 +1,66 @@ +package cgroups + +import "fmt" + +// BlockIODevice holds major:minor format supported in blkio cgroup. +type BlockIODevice struct { + // Major is the device's major number + Major int64 `json:"major"` + // Minor is the device's minor number + Minor int64 `json:"minor"` +} + +// WeightDevice struct holds a `major:minor weight`|`major:minor leaf_weight` pair +type WeightDevice struct { + BlockIODevice + // Weight is the bandwidth rate for the device, range is from 10 to 1000 + Weight uint16 `json:"weight"` + // LeafWeight is the bandwidth rate for the device while competing with the cgroup's child cgroups, range is from 10 to 1000, cfq scheduler only + LeafWeight uint16 `json:"leafWeight"` +} + +// NewWeightDevice returns a configured WeightDevice pointer +func NewWeightDevice(major, minor int64, weight, leafWeight uint16) *WeightDevice { + wd := &WeightDevice{} + wd.Major = major + wd.Minor = minor + wd.Weight = weight + wd.LeafWeight = leafWeight + return wd +} + +// WeightString formats the struct to be writable to the cgroup specific file +func (wd *WeightDevice) WeightString() string { + return fmt.Sprintf("%d:%d %d", wd.Major, wd.Minor, wd.Weight) +} + +// LeafWeightString formats the struct to be writable to the cgroup specific file +func (wd *WeightDevice) LeafWeightString() string { + return fmt.Sprintf("%d:%d %d", wd.Major, wd.Minor, wd.LeafWeight) +} + +// ThrottleDevice struct holds a `major:minor rate_per_second` pair +type ThrottleDevice struct { + BlockIODevice + // Rate is the IO rate limit per cgroup per device + Rate uint64 `json:"rate"` +} + +// NewThrottleDevice returns a configured ThrottleDevice 
pointer +func NewThrottleDevice(major, minor int64, rate uint64) *ThrottleDevice { + td := &ThrottleDevice{} + td.Major = major + td.Minor = minor + td.Rate = rate + return td +} + +// String formats the struct to be writable to the cgroup specific file +func (td *ThrottleDevice) String() string { + return fmt.Sprintf("%d:%d %d", td.Major, td.Minor, td.Rate) +} + +// StringName formats the struct to be writable to the cgroup specific file +func (td *ThrottleDevice) StringName(name string) string { + return fmt.Sprintf("%d:%d %s=%d", td.Major, td.Minor, name, td.Rate) +} diff --git a/config_hugepages.go b/config_hugepages.go new file mode 100644 index 0000000..5357dd0 --- /dev/null +++ b/config_hugepages.go @@ -0,0 +1,9 @@ +package cgroups + +type HugepageLimit struct { + // which type of hugepage to limit. + Pagesize string `json:"page_size"` + + // usage limit for hugepage. + Limit uint64 `json:"limit"` +} diff --git a/config_ifprio_map.go b/config_ifprio_map.go new file mode 100644 index 0000000..d771603 --- /dev/null +++ b/config_ifprio_map.go @@ -0,0 +1,14 @@ +package cgroups + +import ( + "fmt" +) + +type IfPrioMap struct { + Interface string `json:"interface"` + Priority int64 `json:"priority"` +} + +func (i *IfPrioMap) CgroupString() string { + return fmt.Sprintf("%s %d", i.Interface, i.Priority) +} diff --git a/config_linux.go b/config_linux.go new file mode 100644 index 0000000..ce98b3d --- /dev/null +++ b/config_linux.go @@ -0,0 +1,169 @@ +package cgroups + +import ( + systemdDbus "github.com/coreos/go-systemd/v22/dbus" + devices "github.com/opencontainers/cgroups/devices/config" +) + +type FreezerState string + +const ( + Undefined FreezerState = "" + Frozen FreezerState = "FROZEN" + Thawed FreezerState = "THAWED" +) + +// Cgroup holds properties of a cgroup on Linux. 
+type Cgroup struct { + // Name specifies the name of the cgroup + Name string `json:"name,omitempty"` + + // Parent specifies the name of parent of cgroup or slice + Parent string `json:"parent,omitempty"` + + // Path specifies the path to cgroups that are created and/or joined by the container. + // The path is assumed to be relative to the host system cgroup mountpoint. + Path string `json:"path"` + + // ScopePrefix describes prefix for the scope name + ScopePrefix string `json:"scope_prefix"` + + // Resources contains various cgroups settings to apply + *Resources + + // Systemd tells if systemd should be used to manage cgroups. + Systemd bool + + // SystemdProps are any additional properties for systemd, + // derived from org.systemd.property.xxx annotations. + // Ignored unless systemd is used for managing cgroups. + SystemdProps []systemdDbus.Property `json:"-"` + + // Rootless tells if rootless cgroups should be used. + Rootless bool + + // The host UID that should own the cgroup, or nil to accept + // the default ownership. This should only be set when the + // cgroupfs is to be mounted read/write. + // Not all cgroup manager implementations support changing + // the ownership. + OwnerUID *int `json:"owner_uid,omitempty"` +} + +type Resources struct { + // Devices is the set of access rules for devices in the container. + Devices []*devices.Rule `json:"devices"` + + // Memory limit (in bytes) + Memory int64 `json:"memory"` + + // Memory reservation or soft_limit (in bytes) + MemoryReservation int64 `json:"memory_reservation"` + + // Total memory usage (memory + swap); set `-1` to enable unlimited swap + MemorySwap int64 `json:"memory_swap"` + + // CPU shares (relative weight vs. other containers) + CpuShares uint64 `json:"cpu_shares"` + + // CPU hardcap limit (in usecs). Allowed cpu time in a given period. + CpuQuota int64 `json:"cpu_quota"` + + // CPU hardcap burst limit (in usecs). Allowed accumulated cpu time additionally for burst in a given period. 
+ CpuBurst *uint64 `json:"cpu_burst"` //nolint:revive + + // CPU period to be used for hardcapping (in usecs). 0 to use system default. + CpuPeriod uint64 `json:"cpu_period"` + + // How many time CPU will use in realtime scheduling (in usecs). + CpuRtRuntime int64 `json:"cpu_rt_quota"` + + // CPU period to be used for realtime scheduling (in usecs). + CpuRtPeriod uint64 `json:"cpu_rt_period"` + + // CPU to use + CpusetCpus string `json:"cpuset_cpus"` + + // MEM to use + CpusetMems string `json:"cpuset_mems"` + + // cgroup SCHED_IDLE + CPUIdle *int64 `json:"cpu_idle,omitempty"` + + // Process limit; set <= `0' to disable limit. + PidsLimit int64 `json:"pids_limit"` + + // Specifies per cgroup weight, range is from 10 to 1000. + BlkioWeight uint16 `json:"blkio_weight"` + + // Specifies tasks' weight in the given cgroup while competing with the cgroup's child cgroups, range is from 10 to 1000, cfq scheduler only + BlkioLeafWeight uint16 `json:"blkio_leaf_weight"` + + // Weight per cgroup per device, can override BlkioWeight. + BlkioWeightDevice []*WeightDevice `json:"blkio_weight_device"` + + // IO read rate limit per cgroup per device, bytes per second. + BlkioThrottleReadBpsDevice []*ThrottleDevice `json:"blkio_throttle_read_bps_device"` + + // IO write rate limit per cgroup per device, bytes per second. + BlkioThrottleWriteBpsDevice []*ThrottleDevice `json:"blkio_throttle_write_bps_device"` + + // IO read rate limit per cgroup per device, IO per second. + BlkioThrottleReadIOPSDevice []*ThrottleDevice `json:"blkio_throttle_read_iops_device"` + + // IO write rate limit per cgroup per device, IO per second. 
+ BlkioThrottleWriteIOPSDevice []*ThrottleDevice `json:"blkio_throttle_write_iops_device"` + + // set the freeze value for the process + Freezer FreezerState `json:"freezer"` + + // Hugetlb limit (in bytes) + HugetlbLimit []*HugepageLimit `json:"hugetlb_limit"` + + // Whether to disable OOM Killer + OomKillDisable bool `json:"oom_kill_disable"` + + // Tuning swappiness behaviour per cgroup + MemorySwappiness *uint64 `json:"memory_swappiness"` + + // Set priority of network traffic for container + NetPrioIfpriomap []*IfPrioMap `json:"net_prio_ifpriomap"` + + // Set class identifier for container's network packets + NetClsClassid uint32 `json:"net_cls_classid_u"` + + // Rdma resource restriction configuration + Rdma map[string]LinuxRdma `json:"rdma"` + + // Used on cgroups v2: + + // CpuWeight sets a proportional bandwidth limit. + CpuWeight uint64 `json:"cpu_weight"` + + // Unified is cgroupv2-only key-value map. + Unified map[string]string `json:"unified"` + + // SkipDevices allows to skip configuring device permissions. + // Used by e.g. kubelet while creating a parent cgroup (kubepods) + // common for many containers, and by runc update. + // + // NOTE it is impossible to start a container which has this flag set. + SkipDevices bool `json:"-"` + + // SkipFreezeOnSet is a flag for cgroup manager to skip the cgroup + // freeze when setting resources. Only applicable to systemd legacy + // (i.e. cgroup v1) manager (which uses freeze by default to avoid + // spurious permission errors caused by systemd inability to update + // device rules in a non-disruptive manner). + // + // If not set, a few methods (such as looking into cgroup's + // devices.list and querying the systemd unit properties) are used + // during Set() to figure out whether the freeze is required. Those + // methods may be relatively slow, thus this flag. 
+ SkipFreezeOnSet bool `json:"-"` + + // MemoryCheckBeforeUpdate is a flag for cgroup v2 managers to check + // if the new memory limits (Memory and MemorySwap) being set are lower + // than the current memory usage, and reject if so. + MemoryCheckBeforeUpdate bool `json:"memory_check_before_update"` +} diff --git a/config_rdma.go b/config_rdma.go new file mode 100644 index 0000000..a0bd54f --- /dev/null +++ b/config_rdma.go @@ -0,0 +1,9 @@ +package cgroups + +// LinuxRdma for Linux cgroup 'rdma' resource management (Linux 4.11) +type LinuxRdma struct { + // Maximum number of HCA handles that can be opened. Default is "no limit". + HcaHandles *uint32 `json:"hca_handles,omitempty"` + // Maximum number of HCA objects that can be created. Default is "no limit". + HcaObjects *uint32 `json:"hca_objects,omitempty"` +} diff --git a/config_unsupported.go b/config_unsupported.go new file mode 100644 index 0000000..db32ec4 --- /dev/null +++ b/config_unsupported.go @@ -0,0 +1,8 @@ +//go:build !linux + +package cgroups + +// Cgroup holds properties of a cgroup on Linux +// TODO Windows: This can ultimately be entirely factored out on Windows as +// cgroups are a Unix-specific construct. +type Cgroup struct{} diff --git a/devices/config/device.go b/devices/config/device.go new file mode 100644 index 0000000..05ad3ef --- /dev/null +++ b/devices/config/device.go @@ -0,0 +1,174 @@ +package config + +import ( + "fmt" + "os" + "strconv" +) + +const ( + Wildcard = -1 +) + +type Device struct { + Rule + + // Path to the device. + Path string `json:"path"` + + // FileMode permission bits for the device. + FileMode os.FileMode `json:"file_mode"` + + // Uid of the device. + Uid uint32 `json:"uid"` + + // Gid of the device. + Gid uint32 `json:"gid"` +} + +// Permissions is a cgroupv1-style string to represent device access. It +// has to be a string for backward compatibility reasons, hence why it has +// methods to do set operations. 
+type Permissions string + +const ( + deviceRead uint = (1 << iota) + deviceWrite + deviceMknod +) + +func (p Permissions) toSet() uint { + var set uint + for _, perm := range p { + switch perm { + case 'r': + set |= deviceRead + case 'w': + set |= deviceWrite + case 'm': + set |= deviceMknod + } + } + return set +} + +func fromSet(set uint) Permissions { + var perm string + if set&deviceRead == deviceRead { + perm += "r" + } + if set&deviceWrite == deviceWrite { + perm += "w" + } + if set&deviceMknod == deviceMknod { + perm += "m" + } + return Permissions(perm) +} + +// Union returns the union of the two sets of Permissions. +func (p Permissions) Union(o Permissions) Permissions { + lhs := p.toSet() + rhs := o.toSet() + return fromSet(lhs | rhs) +} + +// Difference returns the set difference of the two sets of Permissions. +// In set notation, A.Difference(B) gives you A\B. +func (p Permissions) Difference(o Permissions) Permissions { + lhs := p.toSet() + rhs := o.toSet() + return fromSet(lhs &^ rhs) +} + +// Intersection computes the intersection of the two sets of Permissions. +func (p Permissions) Intersection(o Permissions) Permissions { + lhs := p.toSet() + rhs := o.toSet() + return fromSet(lhs & rhs) +} + +// IsEmpty returns whether the set of permissions in a Permissions is +// empty. +func (p Permissions) IsEmpty() bool { + return p == Permissions("") +} + +// IsValid returns whether the set of permissions is a subset of valid +// permissions (namely, {r,w,m}). 
+func (p Permissions) IsValid() bool { + return p == fromSet(p.toSet()) +} + +type Type rune + +const ( + WildcardDevice Type = 'a' + BlockDevice Type = 'b' + CharDevice Type = 'c' // or 'u' + FifoDevice Type = 'p' +) + +func (t Type) IsValid() bool { + switch t { + case WildcardDevice, BlockDevice, CharDevice, FifoDevice: + return true + default: + return false + } +} + +func (t Type) CanMknod() bool { + switch t { + case BlockDevice, CharDevice, FifoDevice: + return true + default: + return false + } +} + +func (t Type) CanCgroup() bool { + switch t { + case WildcardDevice, BlockDevice, CharDevice: + return true + default: + return false + } +} + +type Rule struct { + // Type of device ('c' for char, 'b' for block). If set to 'a', this rule + // acts as a wildcard and all fields other than Allow are ignored. + Type Type `json:"type"` + + // Major is the device's major number. + Major int64 `json:"major"` + + // Minor is the device's minor number. + Minor int64 `json:"minor"` + + // Permissions is the set of permissions that this rule applies to (in the + // cgroupv1 format -- any combination of "rwm"). + Permissions Permissions `json:"permissions"` + + // Allow specifies whether this rule is allowed. 
+ Allow bool `json:"allow"` +} + +func (d *Rule) CgroupString() string { + var ( + major = strconv.FormatInt(d.Major, 10) + minor = strconv.FormatInt(d.Minor, 10) + ) + if d.Major == Wildcard { + major = "*" + } + if d.Minor == Wildcard { + minor = "*" + } + return fmt.Sprintf("%c %s:%s %s", d.Type, major, minor, d.Permissions) +} + +func (d *Rule) Mkdev() (uint64, error) { + return mkDev(d) +} diff --git a/devices/config/mknod_unix.go b/devices/config/mknod_unix.go new file mode 100644 index 0000000..98cdc6e --- /dev/null +++ b/devices/config/mknod_unix.go @@ -0,0 +1,14 @@ +package config + +import ( + "errors" + + "golang.org/x/sys/unix" +) + +func mkDev(d *Rule) (uint64, error) { + if d.Major == Wildcard || d.Minor == Wildcard { + return 0, errors.New("cannot mkdev() device with wildcards") + } + return unix.Mkdev(uint32(d.Major), uint32(d.Minor)), nil +} diff --git a/devices/devicefilter.go b/devices/devicefilter.go new file mode 100644 index 0000000..aafa0d0 --- /dev/null +++ b/devices/devicefilter.go @@ -0,0 +1,207 @@ +// Implements creation of eBPF device filter program. +// +// Based on https://github.com/containers/crun/blob/0.10.2/src/libcrun/ebpf.c +// +// Although ebpf.c is originally licensed under LGPL-3.0-or-later, the author (Giuseppe Scrivano) +// agreed to relicense the file in Apache License 2.0: https://github.com/opencontainers/runc/issues/2144#issuecomment-543116397 +package devices + +import ( + "errors" + "fmt" + "math" + "strconv" + + "github.com/cilium/ebpf/asm" + devices "github.com/opencontainers/cgroups/devices/config" + "golang.org/x/sys/unix" +) + +const ( + // license string format is same as kernel MODULE_LICENSE macro + license = "Apache" +) + +// deviceFilter returns eBPF device filter program and its license string. +func deviceFilter(rules []*devices.Rule) (asm.Instructions, string, error) { + // Generate the minimum ruleset for the device rules we are given. 
While we + // don't care about minimum transitions in cgroupv2, using the emulator + // gives us a guarantee that the behaviour of devices filtering is the same + // as cgroupv1, including security hardenings to avoid misconfiguration + // (such as punching holes in wildcard rules). + emu := new(emulator) + for _, rule := range rules { + if err := emu.Apply(*rule); err != nil { + return nil, "", err + } + } + cleanRules, err := emu.Rules() + if err != nil { + return nil, "", err + } + + p := &program{ + defaultAllow: emu.IsBlacklist(), + } + p.init() + + for idx, rule := range cleanRules { + if rule.Type == devices.WildcardDevice { + // We can safely skip over wildcard entries because there should + // only be one (at most) at the very start to instruct cgroupv1 to + // go into allow-list mode. However we do double-check this here. + if idx != 0 || rule.Allow != emu.IsBlacklist() { + return nil, "", fmt.Errorf("[internal error] emulated cgroupv2 devices ruleset had bad wildcard at idx %v (%s)", idx, rule.CgroupString()) + } + continue + } + if rule.Allow == p.defaultAllow { + // There should be no rules which have an action equal to the + // default action, the emulator removes those. 
+ return nil, "", fmt.Errorf("[internal error] emulated cgroupv2 devices ruleset had no-op rule at idx %v (%s)", idx, rule.CgroupString()) + } + if err := p.appendRule(rule); err != nil { + return nil, "", err + } + } + return p.finalize(), license, nil +} + +type program struct { + insts asm.Instructions + defaultAllow bool + blockID int +} + +func (p *program) init() { + // struct bpf_cgroup_dev_ctx: https://elixir.bootlin.com/linux/v5.3.6/source/include/uapi/linux/bpf.h#L3423 + /* + u32 access_type + u32 major + u32 minor + */ + // R2 <- type (lower 16 bit of u32 access_type at R1[0]) + p.insts = append(p.insts, + asm.LoadMem(asm.R2, asm.R1, 0, asm.Word), + asm.And.Imm32(asm.R2, 0xFFFF)) + + // R3 <- access (upper 16 bit of u32 access_type at R1[0]) + p.insts = append(p.insts, + asm.LoadMem(asm.R3, asm.R1, 0, asm.Word), + // RSh: bitwise shift right + asm.RSh.Imm32(asm.R3, 16)) + + // R4 <- major (u32 major at R1[4]) + p.insts = append(p.insts, + asm.LoadMem(asm.R4, asm.R1, 4, asm.Word)) + + // R5 <- minor (u32 minor at R1[8]) + p.insts = append(p.insts, + asm.LoadMem(asm.R5, asm.R1, 8, asm.Word)) +} + +// appendRule rule converts an OCI rule to the relevant eBPF block and adds it +// to the in-progress filter program. In order to operate properly, it must be +// called with a "clean" rule list (generated by devices.Emulator.Rules() -- +// with any "a" rules removed). +func (p *program) appendRule(rule *devices.Rule) error { + if p.blockID < 0 { + return errors.New("the program is finalized") + } + + var bpfType int32 + switch rule.Type { + case devices.CharDevice: + bpfType = int32(unix.BPF_DEVCG_DEV_CHAR) + case devices.BlockDevice: + bpfType = int32(unix.BPF_DEVCG_DEV_BLOCK) + default: + // We do not permit 'a', nor any other types we don't know about. 
+ return fmt.Errorf("invalid type %q", string(rule.Type)) + } + if rule.Major > math.MaxUint32 { + return fmt.Errorf("invalid major %d", rule.Major) + } + if rule.Minor > math.MaxUint32 { + return fmt.Errorf("invalid minor %d", rule.Minor) + } + hasMajor := rule.Major >= 0 // if not specified in OCI json, major is set to -1 + hasMinor := rule.Minor >= 0 + bpfAccess := int32(0) + for _, r := range rule.Permissions { + switch r { + case 'r': + bpfAccess |= unix.BPF_DEVCG_ACC_READ + case 'w': + bpfAccess |= unix.BPF_DEVCG_ACC_WRITE + case 'm': + bpfAccess |= unix.BPF_DEVCG_ACC_MKNOD + default: + return fmt.Errorf("unknown device access %v", r) + } + } + // If the access is rwm, skip the check. + hasAccess := bpfAccess != (unix.BPF_DEVCG_ACC_READ | unix.BPF_DEVCG_ACC_WRITE | unix.BPF_DEVCG_ACC_MKNOD) + + var ( + blockSym = "block-" + strconv.Itoa(p.blockID) + nextBlockSym = "block-" + strconv.Itoa(p.blockID+1) + prevBlockLastIdx = len(p.insts) - 1 + ) + p.insts = append(p.insts, + // if (R2 != bpfType) goto next + asm.JNE.Imm(asm.R2, bpfType, nextBlockSym), + ) + if hasAccess { + p.insts = append(p.insts, + // if (R3 & bpfAccess != R3 /* use R1 as a temp var */) goto next + asm.Mov.Reg32(asm.R1, asm.R3), + asm.And.Imm32(asm.R1, bpfAccess), + asm.JNE.Reg(asm.R1, asm.R3, nextBlockSym), + ) + } + if hasMajor { + p.insts = append(p.insts, + // if (R4 != major) goto next + asm.JNE.Imm(asm.R4, int32(rule.Major), nextBlockSym), + ) + } + if hasMinor { + p.insts = append(p.insts, + // if (R5 != minor) goto next + asm.JNE.Imm(asm.R5, int32(rule.Minor), nextBlockSym), + ) + } + p.insts = append(p.insts, acceptBlock(rule.Allow)...) 
+ // set blockSym to the first instruction we added in this iteration + p.insts[prevBlockLastIdx+1] = p.insts[prevBlockLastIdx+1].WithSymbol(blockSym) + p.blockID++ + return nil +} + +func (p *program) finalize() asm.Instructions { + var v int32 + if p.defaultAllow { + v = 1 + } + blockSym := "block-" + strconv.Itoa(p.blockID) + p.insts = append(p.insts, + // R0 <- v + asm.Mov.Imm32(asm.R0, v).WithSymbol(blockSym), + asm.Return(), + ) + p.blockID = -1 + return p.insts +} + +func acceptBlock(accept bool) asm.Instructions { + var v int32 + if accept { + v = 1 + } + return []asm.Instruction{ + // R0 <- v + asm.Mov.Imm32(asm.R0, v), + asm.Return(), + } +} diff --git a/devices/devicefilter_test.go b/devices/devicefilter_test.go new file mode 100644 index 0000000..4010deb --- /dev/null +++ b/devices/devicefilter_test.go @@ -0,0 +1,336 @@ +package devices + +import ( + "strings" + "testing" + + devices "github.com/opencontainers/cgroups/devices/config" +) + +func hash(s, comm string) string { + var res []string + for _, l := range strings.Split(s, "\n") { + trimmed := strings.TrimSpace(l) + if trimmed == "" || strings.HasPrefix(trimmed, comm) { + continue + } + res = append(res, trimmed) + } + return strings.Join(res, "\n") +} + +func testDeviceFilter(t testing.TB, devices []*devices.Rule, expectedStr string) { + insts, _, err := deviceFilter(devices) + if err != nil { + t.Fatalf("%s: %v (devices: %+v)", t.Name(), err, devices) + } + s := insts.String() + if expectedStr != "" { + hashed := hash(s, "//") + expectedHashed := hash(expectedStr, "//") + if expectedHashed != hashed { + t.Fatalf("expected:\n%q\ngot\n%q", expectedHashed, hashed) + } + } +} + +func TestDeviceFilter_Nil(t *testing.T) { + expected := ` +// load parameters into registers + 0: LdXMemW dst: r2 src: r1 off: 0 imm: 0 + 1: AndImm32 dst: r2 imm: 65535 + 2: LdXMemW dst: r3 src: r1 off: 0 imm: 0 + 3: RShImm32 dst: r3 imm: 16 + 4: LdXMemW dst: r4 src: r1 off: 4 imm: 0 + 5: LdXMemW dst: r5 src: r1 off: 8 imm: 
0 +block-0: +// return 0 (reject) + 6: MovImm32 dst: r0 imm: 0 + 7: Exit + ` + testDeviceFilter(t, nil, expected) +} + +func TestDeviceFilter_BuiltInAllowList(t *testing.T) { + // This is a copy of all rules from + // github.com/opencontainers/runc/libcontainer/specconv.AllowedDevices. + devices := []*devices.Rule{ + { + Type: devices.CharDevice, + Major: devices.Wildcard, + Minor: devices.Wildcard, + Permissions: "m", + Allow: true, + }, + { + Type: devices.BlockDevice, + Major: devices.Wildcard, + Minor: devices.Wildcard, + Permissions: "m", + Allow: true, + }, + { + Type: devices.CharDevice, + Major: 1, + Minor: 3, + Permissions: "rwm", + Allow: true, + }, + { + Type: devices.CharDevice, + Major: 1, + Minor: 8, + Permissions: "rwm", + Allow: true, + }, + { + Type: devices.CharDevice, + Major: 1, + Minor: 7, + Permissions: "rwm", + Allow: true, + }, + { + Type: devices.CharDevice, + Major: 5, + Minor: 0, + Permissions: "rwm", + Allow: true, + }, + { + Type: devices.CharDevice, + Major: 1, + Minor: 5, + Permissions: "rwm", + Allow: true, + }, + { + Type: devices.CharDevice, + Major: 1, + Minor: 9, + Permissions: "rwm", + Allow: true, + }, + { + Type: devices.CharDevice, + Major: 136, + Minor: devices.Wildcard, + Permissions: "rwm", + Allow: true, + }, + { + Type: devices.CharDevice, + Major: 5, + Minor: 2, + Permissions: "rwm", + Allow: true, + }, + { + Type: devices.CharDevice, + Major: 10, + Minor: 200, + Permissions: "rwm", + Allow: true, + }, + } + + expected := ` +// load parameters into registers + 0: LdXMemW dst: r2 src: r1 off: 0 imm: 0 + 1: AndImm32 dst: r2 imm: 65535 + 2: LdXMemW dst: r3 src: r1 off: 0 imm: 0 + 3: RShImm32 dst: r3 imm: 16 + 4: LdXMemW dst: r4 src: r1 off: 4 imm: 0 + 5: LdXMemW dst: r5 src: r1 off: 8 imm: 0 +block-0: +// (b, wildcard, wildcard, m, true) + 6: JNEImm dst: r2 off: -1 imm: 1 + 7: MovReg32 dst: r1 src: r3 + 8: AndImm32 dst: r1 imm: 1 + 9: JNEReg dst: r1 off: -1 src: r3 + 10: MovImm32 dst: r0 imm: 1 + 11: Exit +block-1: +// (c, 
wildcard, wildcard, m, true) + 12: JNEImm dst: r2 off: -1 imm: 2 + 13: MovReg32 dst: r1 src: r3 + 14: AndImm32 dst: r1 imm: 1 + 15: JNEReg dst: r1 off: -1 src: r3 + 16: MovImm32 dst: r0 imm: 1 + 17: Exit +block-2: + 18: JNEImm dst: r2 off: -1 imm: 2 + 19: JNEImm dst: r4 off: -1 imm: 1 + 20: JNEImm dst: r5 off: -1 imm: 3 + 21: MovImm32 dst: r0 imm: 1 + 22: Exit +block-3: + 23: JNEImm dst: r2 off: -1 imm: 2 + 24: JNEImm dst: r4 off: -1 imm: 1 + 25: JNEImm dst: r5 off: -1 imm: 5 + 26: MovImm32 dst: r0 imm: 1 + 27: Exit +block-4: + 28: JNEImm dst: r2 off: -1 imm: 2 + 29: JNEImm dst: r4 off: -1 imm: 1 + 30: JNEImm dst: r5 off: -1 imm: 7 + 31: MovImm32 dst: r0 imm: 1 + 32: Exit +block-5: + 33: JNEImm dst: r2 off: -1 imm: 2 + 34: JNEImm dst: r4 off: -1 imm: 1 + 35: JNEImm dst: r5 off: -1 imm: 8 + 36: MovImm32 dst: r0 imm: 1 + 37: Exit +block-6: + 38: JNEImm dst: r2 off: -1 imm: 2 + 39: JNEImm dst: r4 off: -1 imm: 1 + 40: JNEImm dst: r5 off: -1 imm: 9 + 41: MovImm32 dst: r0 imm: 1 + 42: Exit +block-7: + 43: JNEImm dst: r2 off: -1 imm: 2 + 44: JNEImm dst: r4 off: -1 imm: 5 + 45: JNEImm dst: r5 off: -1 imm: 0 + 46: MovImm32 dst: r0 imm: 1 + 47: Exit +block-8: + 48: JNEImm dst: r2 off: -1 imm: 2 + 49: JNEImm dst: r4 off: -1 imm: 5 + 50: JNEImm dst: r5 off: -1 imm: 2 + 51: MovImm32 dst: r0 imm: 1 + 52: Exit +block-9: +// tuntap (c, 10, 200, rwm, true) + 53: JNEImm dst: r2 off: -1 imm: 2 + 54: JNEImm dst: r4 off: -1 imm: 10 + 55: JNEImm dst: r5 off: -1 imm: 200 + 56: MovImm32 dst: r0 imm: 1 + 57: Exit +block-10: +// /dev/pts (c, 136, wildcard, rwm, true) + 58: JNEImm dst: r2 off: -1 imm: 2 + 59: JNEImm dst: r4 off: -1 imm: 136 + 60: MovImm32 dst: r0 imm: 1 + 61: Exit +block-11: + 62: MovImm32 dst: r0 imm: 0 + 63: Exit +` + testDeviceFilter(t, devices, expected) +} + +func TestDeviceFilter_Privileged(t *testing.T) { + devices := []*devices.Rule{ + { + Type: 'a', + Major: -1, + Minor: -1, + Permissions: "rwm", + Allow: true, + }, + } + expected := ` +// load parameters into 
registers + 0: LdXMemW dst: r2 src: r1 off: 0 imm: 0 + 1: AndImm32 dst: r2 imm: 65535 + 2: LdXMemW dst: r3 src: r1 off: 0 imm: 0 + 3: RShImm32 dst: r3 imm: 16 + 4: LdXMemW dst: r4 src: r1 off: 4 imm: 0 + 5: LdXMemW dst: r5 src: r1 off: 8 imm: 0 +block-0: +// return 1 (accept) + 6: MovImm32 dst: r0 imm: 1 + 7: Exit + ` + testDeviceFilter(t, devices, expected) +} + +func TestDeviceFilter_PrivilegedExceptSingleDevice(t *testing.T) { + devices := []*devices.Rule{ + { + Type: 'a', + Major: -1, + Minor: -1, + Permissions: "rwm", + Allow: true, + }, + { + Type: 'b', + Major: 8, + Minor: 0, + Permissions: "rwm", + Allow: false, + }, + } + expected := ` +// load parameters into registers + 0: LdXMemW dst: r2 src: r1 off: 0 imm: 0 + 1: AndImm32 dst: r2 imm: 65535 + 2: LdXMemW dst: r3 src: r1 off: 0 imm: 0 + 3: RShImm32 dst: r3 imm: 16 + 4: LdXMemW dst: r4 src: r1 off: 4 imm: 0 + 5: LdXMemW dst: r5 src: r1 off: 8 imm: 0 +block-0: +// return 0 (reject) if type==b && major == 8 && minor == 0 + 6: JNEImm dst: r2 off: -1 imm: 1 + 7: JNEImm dst: r4 off: -1 imm: 8 + 8: JNEImm dst: r5 off: -1 imm: 0 + 9: MovImm32 dst: r0 imm: 0 + 10: Exit +block-1: +// return 1 (accept) + 11: MovImm32 dst: r0 imm: 1 + 12: Exit +` + testDeviceFilter(t, devices, expected) +} + +func TestDeviceFilter_Weird(t *testing.T) { + devices := []*devices.Rule{ + { + Type: 'b', + Major: 8, + Minor: 1, + Permissions: "rwm", + Allow: false, + }, + { + Type: 'a', + Major: -1, + Minor: -1, + Permissions: "rwm", + Allow: true, + }, + { + Type: 'b', + Major: 8, + Minor: 2, + Permissions: "rwm", + Allow: false, + }, + } + // 8/1 is allowed, 8/2 is not allowed. + // This conforms to runc v1.0.0-rc.9 (cgroup1) behavior. 
+ expected := ` +// load parameters into registers + 0: LdXMemW dst: r2 src: r1 off: 0 imm: 0 + 1: AndImm32 dst: r2 imm: 65535 + 2: LdXMemW dst: r3 src: r1 off: 0 imm: 0 + 3: RShImm32 dst: r3 imm: 16 + 4: LdXMemW dst: r4 src: r1 off: 4 imm: 0 + 5: LdXMemW dst: r5 src: r1 off: 8 imm: 0 +block-0: +// return 0 (reject) if type==b && major == 8 && minor == 2 + 6: JNEImm dst: r2 off: -1 imm: 1 + 7: JNEImm dst: r4 off: -1 imm: 8 + 8: JNEImm dst: r5 off: -1 imm: 2 + 9: MovImm32 dst: r0 imm: 0 + 10: Exit +block-1: +// return 1 (accept) + 11: MovImm32 dst: r0 imm: 1 + 12: Exit +` + testDeviceFilter(t, devices, expected) +} diff --git a/devices/devices.go b/devices/devices.go new file mode 100644 index 0000000..2cfd7d0 --- /dev/null +++ b/devices/devices.go @@ -0,0 +1,16 @@ +// Package devices contains functionality to manage cgroup devices, which +// is exposed indirectly via libcontainer/cgroups managers. +// +// To enable cgroup managers to manage devices, this package must be imported. +package devices + +import ( + "github.com/opencontainers/cgroups" + "github.com/opencontainers/cgroups/systemd" +) + +func init() { + cgroups.DevicesSetV1 = setV1 + cgroups.DevicesSetV2 = setV2 + systemd.GenerateDeviceProps = systemdProperties +} diff --git a/devices/devices_emulator.go b/devices/devices_emulator.go new file mode 100644 index 0000000..ab18268 --- /dev/null +++ b/devices/devices_emulator.go @@ -0,0 +1,386 @@ +// SPDX-License-Identifier: Apache-2.0 +/* + * Copyright (C) 2020 Aleksa Sarai + * Copyright (C) 2020 SUSE LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package devices + +import ( + "bufio" + "fmt" + "io" + "sort" + "strconv" + "strings" + + devices "github.com/opencontainers/cgroups/devices/config" +) + +// deviceMeta is a Rule without the Allow or Permissions fields, and no +// wildcard-type support. It's effectively the "match" portion of a metadata +// rule, for the purposes of our emulation. +type deviceMeta struct { + node devices.Type + major int64 + minor int64 +} + +// deviceRule is effectively the tuple (deviceMeta, Permissions). +type deviceRule struct { + meta deviceMeta + perms devices.Permissions +} + +// deviceRules is a mapping of device metadata rules to the associated +// permissions in the ruleset. +type deviceRules map[deviceMeta]devices.Permissions + +func (r deviceRules) orderedEntries() []deviceRule { + var rules []deviceRule + for meta, perms := range r { + rules = append(rules, deviceRule{meta: meta, perms: perms}) + } + sort.Slice(rules, func(i, j int) bool { + // Sort by (major, minor, type). + a, b := rules[i].meta, rules[j].meta + return a.major < b.major || + (a.major == b.major && a.minor < b.minor) || + (a.major == b.major && a.minor == b.minor && a.node < b.node) + }) + return rules +} + +type emulator struct { + defaultAllow bool + rules deviceRules +} + +func (e *emulator) IsBlacklist() bool { + return e.defaultAllow +} + +func (e *emulator) IsAllowAll() bool { + return e.IsBlacklist() && len(e.rules) == 0 +} + +func parseLine(line string) (*deviceRule, error) { + // Input: node major:minor perms. + fields := strings.FieldsFunc(line, func(r rune) bool { + return r == ' ' || r == ':' + }) + if len(fields) != 4 { + return nil, fmt.Errorf("malformed devices.list rule %s", line) + } + + var ( + rule deviceRule + node = fields[0] + major = fields[1] + minor = fields[2] + perms = fields[3] + ) + + // Parse the node type. 
+ switch node { + case "a": + // Super-special case -- "a" always means every device with every + // access mode. In fact, for devices.list this actually indicates that + // the cgroup is in black-list mode. + // TODO: Double-check that the entire file is "a *:* rwm". + return nil, nil + case "b": + rule.meta.node = devices.BlockDevice + case "c": + rule.meta.node = devices.CharDevice + default: + return nil, fmt.Errorf("unknown device type %q", node) + } + + // Parse the major number. + if major == "*" { + rule.meta.major = devices.Wildcard + } else { + val, err := strconv.ParseUint(major, 10, 32) + if err != nil { + return nil, fmt.Errorf("invalid major number: %w", err) + } + rule.meta.major = int64(val) + } + + // Parse the minor number. + if minor == "*" { + rule.meta.minor = devices.Wildcard + } else { + val, err := strconv.ParseUint(minor, 10, 32) + if err != nil { + return nil, fmt.Errorf("invalid minor number: %w", err) + } + rule.meta.minor = int64(val) + } + + // Parse the access permissions. + rule.perms = devices.Permissions(perms) + if !rule.perms.IsValid() || rule.perms.IsEmpty() { + return nil, fmt.Errorf("parse access mode: contained unknown modes or is empty: %q", perms) + } + return &rule, nil +} + +func (e *emulator) addRule(rule deviceRule) error { //nolint:unparam + if e.rules == nil { + e.rules = make(map[deviceMeta]devices.Permissions) + } + + // Merge with any pre-existing permissions. + oldPerms := e.rules[rule.meta] + newPerms := rule.perms.Union(oldPerms) + e.rules[rule.meta] = newPerms + return nil +} + +func (e *emulator) rmRule(rule deviceRule) error { + // Give an error if any of the permissions requested to be removed are + // present in a partially-matching wildcard rule, because such rules will + // be ignored by cgroupv1. + // + // This is a diversion from cgroupv1, but is necessary to avoid leading + // users into a false sense of security. cgroupv1 will silently(!) 
ignore + // requests to remove partial exceptions, but we really shouldn't do that. + // + // It may seem like we could just "split" wildcard rules which hit this + // issue, but unfortunately there are 2^32 possible major and minor + // numbers, which would exhaust kernel memory quickly if we did this. Not + // to mention it'd be really slow (the kernel side is implemented as a + // linked-list of exceptions). + for _, partialMeta := range []deviceMeta{ + {node: rule.meta.node, major: devices.Wildcard, minor: rule.meta.minor}, + {node: rule.meta.node, major: rule.meta.major, minor: devices.Wildcard}, + {node: rule.meta.node, major: devices.Wildcard, minor: devices.Wildcard}, + } { + // This wildcard rule is equivalent to the requested rule, so skip it. + if rule.meta == partialMeta { + continue + } + // Only give an error if the set of permissions overlap. + partialPerms := e.rules[partialMeta] + if !partialPerms.Intersection(rule.perms).IsEmpty() { + return fmt.Errorf("requested rule [%v %v] not supported by devices cgroupv1 (cannot punch hole in existing wildcard rule [%v %v])", rule.meta, rule.perms, partialMeta, partialPerms) + } + } + + // Subtract all of the permissions listed from the full match rule. If the + // rule didn't exist, all of this is a no-op. + newPerms := e.rules[rule.meta].Difference(rule.perms) + if newPerms.IsEmpty() { + delete(e.rules, rule.meta) + } else { + e.rules[rule.meta] = newPerms + } + // TODO: The actual cgroup code doesn't care if an exception didn't exist + // during removal, so not erroring out here is /accurate/ but quite + // worrying. Maybe we should do additional validation, but again we + // have to worry about backwards-compatibility. + return nil +} + +func (e *emulator) allow(rule *deviceRule) error { + // This cgroup is configured as a black-list. Reset the entire emulator, + // and put is into black-list mode. 
+ if rule == nil || rule.meta.node == devices.WildcardDevice { + *e = emulator{ + defaultAllow: true, + rules: nil, + } + return nil + } + + var err error + if e.defaultAllow { + err = wrapErr(e.rmRule(*rule), "unable to remove 'deny' exception") + } else { + err = wrapErr(e.addRule(*rule), "unable to add 'allow' exception") + } + return err +} + +func (e *emulator) deny(rule *deviceRule) error { + // This cgroup is configured as a white-list. Reset the entire emulator, + // and put is into white-list mode. + if rule == nil || rule.meta.node == devices.WildcardDevice { + *e = emulator{ + defaultAllow: false, + rules: nil, + } + return nil + } + + var err error + if e.defaultAllow { + err = wrapErr(e.addRule(*rule), "unable to add 'deny' exception") + } else { + err = wrapErr(e.rmRule(*rule), "unable to remove 'allow' exception") + } + return err +} + +func (e *emulator) Apply(rule devices.Rule) error { + if !rule.Type.CanCgroup() { + return fmt.Errorf("cannot add rule [%#v] with non-cgroup type %q", rule, rule.Type) + } + + innerRule := &deviceRule{ + meta: deviceMeta{ + node: rule.Type, + major: rule.Major, + minor: rule.Minor, + }, + perms: rule.Permissions, + } + if innerRule.meta.node == devices.WildcardDevice { + innerRule = nil + } + + if rule.Allow { + return e.allow(innerRule) + } + + return e.deny(innerRule) +} + +// emulatorFromList takes a reader to a "devices.list"-like source, and returns +// a new Emulator that represents the state of the devices cgroup. Note that +// black-list devices cgroups cannot be fully reconstructed, due to limitations +// in the devices cgroup API. Instead, such cgroups are always treated as +// "allow all" cgroups. +func emulatorFromList(list io.Reader) (*emulator, error) { + // Normally cgroups are in black-list mode by default, but the way we + // figure out the current mode is whether or not devices.list has an + // allow-all rule. 
So we default to a white-list, and the existence of an + // "a *:* rwm" entry will tell us otherwise. + e := &emulator{ + defaultAllow: false, + } + + // Parse the "devices.list". + s := bufio.NewScanner(list) + for s.Scan() { + line := s.Text() + deviceRule, err := parseLine(line) + if err != nil { + return nil, fmt.Errorf("error parsing line %q: %w", line, err) + } + // "devices.list" is an allow list. Note that this means that in + // black-list mode, we have no idea what rules are in play. As a + // result, we need to be very careful in Transition(). + if err := e.allow(deviceRule); err != nil { + return nil, fmt.Errorf("error adding devices.list rule: %w", err) + } + } + if err := s.Err(); err != nil { + return nil, fmt.Errorf("error reading devices.list lines: %w", err) + } + return e, nil +} + +// Transition calculates what is the minimally-disruptive set of rules need to +// be applied to a devices cgroup in order to transition to the given target. +// This means that any already-existing rules will not be applied, and +// disruptive rules (like denying all device access) will only be applied if +// necessary. +// +// This function is the sole reason for all of Emulator -- to allow us +// to figure out how to update a containers' cgroups without causing spurious +// device errors (if possible). +func (source *emulator) Transition(target *emulator) ([]*devices.Rule, error) { //nolint:revive // Ignore receiver-naming warning. + var transitionRules []*devices.Rule + oldRules := source.rules + + // If the default policy doesn't match, we need to include a "disruptive" + // rule (either allow-all or deny-all) in order to switch the cgroup to the + // correct default policy. + // + // However, due to a limitation in "devices.list" we cannot be sure what + // deny rules are in place in a black-list cgroup. Thus if the source is a + // black-list we also have to include a disruptive rule. 
+ if source.IsBlacklist() || source.defaultAllow != target.defaultAllow { + transitionRules = append(transitionRules, &devices.Rule{ + Type: 'a', + Major: -1, + Minor: -1, + Permissions: devices.Permissions("rwm"), + Allow: target.defaultAllow, + }) + // The old rules are only relevant if we aren't starting out with a + // disruptive rule. + oldRules = nil + } + + // NOTE: We traverse through the rules in a sorted order so we always write + // the same set of rules (this is to aid testing). + + // First, we create inverse rules for any old rules not in the new set. + // This includes partial-inverse rules for specific permissions. This is a + // no-op if we added a disruptive rule, since oldRules will be empty. + for _, rule := range oldRules.orderedEntries() { + meta, oldPerms := rule.meta, rule.perms + newPerms := target.rules[meta] + droppedPerms := oldPerms.Difference(newPerms) + if !droppedPerms.IsEmpty() { + transitionRules = append(transitionRules, &devices.Rule{ + Type: meta.node, + Major: meta.major, + Minor: meta.minor, + Permissions: droppedPerms, + Allow: target.defaultAllow, + }) + } + } + + // Add any additional rules which weren't in the old set. We happen to + // filter out rules which are present in both sets, though this isn't + // strictly necessary. + for _, rule := range target.rules.orderedEntries() { + meta, newPerms := rule.meta, rule.perms + oldPerms := oldRules[meta] + gainedPerms := newPerms.Difference(oldPerms) + if !gainedPerms.IsEmpty() { + transitionRules = append(transitionRules, &devices.Rule{ + Type: meta.node, + Major: meta.major, + Minor: meta.minor, + Permissions: gainedPerms, + Allow: !target.defaultAllow, + }) + } + } + return transitionRules, nil +} + +// Rules returns the minimum set of rules necessary to convert a *deny-all* +// cgroup to the emulated filter state (note that this is not the same as a +// default cgroupv1 cgroup -- which is allow-all). 
This is effectively just a +// wrapper around Transition() with the source emulator being an empty cgroup. +func (e *emulator) Rules() ([]*devices.Rule, error) { + defaultCgroup := &emulator{defaultAllow: false} + return defaultCgroup.Transition(e) +} + +func wrapErr(err error, text string) error { + if err == nil { + return nil + } + return fmt.Errorf(text+": %w", err) +} diff --git a/devices/devices_emulator_test.go b/devices/devices_emulator_test.go new file mode 100644 index 0000000..24c1d1e --- /dev/null +++ b/devices/devices_emulator_test.go @@ -0,0 +1,1144 @@ +// SPDX-License-Identifier: Apache-2.0 +/* + * Copyright (C) 2020 Aleksa Sarai + * Copyright (C) 2020 SUSE LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package devices + +import ( + "bufio" + "bytes" + "reflect" + "strings" + "testing" + + devices "github.com/opencontainers/cgroups/devices/config" +) + +func TestDeviceEmulatorLoad(t *testing.T) { + tests := []struct { + name, list string + expected *emulator + }{ + { + name: "BlacklistMode", + list: `a *:* rwm`, + expected: &emulator{ + defaultAllow: true, + }, + }, + { + name: "WhitelistBasic", + list: `c 4:2 rw`, + expected: &emulator{ + defaultAllow: false, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 4, + minor: 2, + }: devices.Permissions("rw"), + }, + }, + }, + { + name: "WhitelistWildcard", + list: `b 0:* m`, + expected: &emulator{ + defaultAllow: false, + rules: deviceRules{ + { + node: devices.BlockDevice, + major: 0, + minor: devices.Wildcard, + }: devices.Permissions("m"), + }, + }, + }, + { + name: "WhitelistDuplicate", + list: `c *:* rwm +c 1:1 r`, + expected: &emulator{ + defaultAllow: false, + rules: deviceRules{ + { + node: devices.CharDevice, + major: devices.Wildcard, + minor: devices.Wildcard, + }: devices.Permissions("rwm"), + // To match the kernel, we allow redundant rules. 
+ { + node: devices.CharDevice, + major: 1, + minor: 1, + }: devices.Permissions("r"), + }, + }, + }, + { + name: "WhitelistComplicated", + list: `c *:* m +b *:* m +c 1:3 rwm +c 1:5 rwm +c 1:7 rwm +c 1:8 rwm +c 1:9 rwm +c 5:0 rwm +c 5:2 rwm +c 136:* rwm +c 10:200 rwm`, + expected: &emulator{ + defaultAllow: false, + rules: deviceRules{ + { + node: devices.CharDevice, + major: devices.Wildcard, + minor: devices.Wildcard, + }: devices.Permissions("m"), + { + node: devices.BlockDevice, + major: devices.Wildcard, + minor: devices.Wildcard, + }: devices.Permissions("m"), + { + node: devices.CharDevice, + major: 1, + minor: 3, + }: devices.Permissions("rwm"), + { + node: devices.CharDevice, + major: 1, + minor: 5, + }: devices.Permissions("rwm"), + { + node: devices.CharDevice, + major: 1, + minor: 7, + }: devices.Permissions("rwm"), + { + node: devices.CharDevice, + major: 1, + minor: 8, + }: devices.Permissions("rwm"), + { + node: devices.CharDevice, + major: 1, + minor: 9, + }: devices.Permissions("rwm"), + { + node: devices.CharDevice, + major: 5, + minor: 0, + }: devices.Permissions("rwm"), + { + node: devices.CharDevice, + major: 5, + minor: 2, + }: devices.Permissions("rwm"), + { + node: devices.CharDevice, + major: 136, + minor: devices.Wildcard, + }: devices.Permissions("rwm"), + { + node: devices.CharDevice, + major: 10, + minor: 200, + }: devices.Permissions("rwm"), + }, + }, + }, + // Some invalid lists. 
+ { + name: "InvalidFieldNumber", + list: `b 1:0`, + expected: nil, + }, + { + name: "InvalidDeviceType", + list: `p *:* rwm`, + expected: nil, + }, + { + name: "InvalidMajorNumber1", + list: `p -1:3 rwm`, + expected: nil, + }, + { + name: "InvalidMajorNumber2", + list: `c foo:27 rwm`, + expected: nil, + }, + { + name: "InvalidMinorNumber1", + list: `b 1:-4 rwm`, + expected: nil, + }, + { + name: "InvalidMinorNumber2", + list: `b 1:foo rwm`, + expected: nil, + }, + { + name: "InvalidPermissions", + list: `b 1:7 rwk`, + expected: nil, + }, + } + + for _, test := range tests { + test := test // capture range variable + t.Run(test.name, func(t *testing.T) { + list := bytes.NewBufferString(test.list) + emu, err := emulatorFromList(list) + if err != nil && test.expected != nil { + t.Fatalf("unexpected failure when creating emulator: %v", err) + } else if err == nil && test.expected == nil { + t.Fatalf("unexpected success when creating emulator: %#v", emu) + } + + if !reflect.DeepEqual(emu, test.expected) { + t.Errorf("final emulator state mismatch: %#v != %#v", emu, test.expected) + } + }) + } +} + +func testDeviceEmulatorApply(t *testing.T, baseDefaultAllow bool) { + tests := []struct { + name string + rule devices.Rule + base, expected *emulator + }{ + // Switch between default modes. 
+ { + name: "SwitchToOtherMode", + rule: devices.Rule{ + Type: devices.WildcardDevice, + Major: devices.Wildcard, + Minor: devices.Wildcard, + Permissions: devices.Permissions("rwm"), + Allow: !baseDefaultAllow, + }, + base: &emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: devices.Wildcard, + minor: devices.Wildcard, + }: devices.Permissions("rwm"), + { + node: devices.CharDevice, + major: 1, + minor: 1, + }: devices.Permissions("r"), + }, + }, + expected: &emulator{ + defaultAllow: !baseDefaultAllow, + rules: nil, + }, + }, + { + name: "SwitchToSameModeNoop", + rule: devices.Rule{ + Type: devices.WildcardDevice, + Major: devices.Wildcard, + Minor: devices.Wildcard, + Permissions: devices.Permissions("rwm"), + Allow: baseDefaultAllow, + }, + base: &emulator{ + defaultAllow: baseDefaultAllow, + rules: nil, + }, + expected: &emulator{ + defaultAllow: baseDefaultAllow, + rules: nil, + }, + }, + { + name: "SwitchToSameMode", + rule: devices.Rule{ + Type: devices.WildcardDevice, + Major: devices.Wildcard, + Minor: devices.Wildcard, + Permissions: devices.Permissions("rwm"), + Allow: baseDefaultAllow, + }, + base: &emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: devices.Wildcard, + minor: devices.Wildcard, + }: devices.Permissions("rwm"), + { + node: devices.CharDevice, + major: 1, + minor: 1, + }: devices.Permissions("r"), + }, + }, + expected: &emulator{ + defaultAllow: baseDefaultAllow, + rules: nil, + }, + }, + // Rule addition logic. 
+ { + name: "RuleAdditionBasic", + rule: devices.Rule{ + Type: devices.CharDevice, + Major: 42, + Minor: 1337, + Permissions: devices.Permissions("rm"), + Allow: !baseDefaultAllow, + }, + base: &emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 2, + minor: 1, + }: devices.Permissions("rwm"), + { + node: devices.BlockDevice, + major: 1, + minor: 5, + }: devices.Permissions("r"), + }, + }, + expected: &emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 2, + minor: 1, + }: devices.Permissions("rwm"), + { + node: devices.BlockDevice, + major: 1, + minor: 5, + }: devices.Permissions("r"), + { + node: devices.CharDevice, + major: 42, + minor: 1337, + }: devices.Permissions("rm"), + }, + }, + }, + { + name: "RuleAdditionBasicDuplicate", + rule: devices.Rule{ + Type: devices.CharDevice, + Major: 42, + Minor: 1337, + Permissions: devices.Permissions("rm"), + Allow: !baseDefaultAllow, + }, + base: &emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 42, + minor: devices.Wildcard, + }: devices.Permissions("rwm"), + }, + }, + expected: &emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 42, + minor: devices.Wildcard, + }: devices.Permissions("rwm"), + // To match the kernel, we allow redundant rules. 
+ { + node: devices.CharDevice, + major: 42, + minor: 1337, + }: devices.Permissions("rm"), + }, + }, + }, + { + name: "RuleAdditionBasicDuplicateNoop", + rule: devices.Rule{ + Type: devices.CharDevice, + Major: 42, + Minor: 1337, + Permissions: devices.Permissions("rm"), + Allow: !baseDefaultAllow, + }, + base: &emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 42, + minor: 1337, + }: devices.Permissions("rm"), + }, + }, + expected: &emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 42, + minor: 1337, + }: devices.Permissions("rm"), + }, + }, + }, + { + name: "RuleAdditionMerge", + rule: devices.Rule{ + Type: devices.BlockDevice, + Major: 5, + Minor: 12, + Permissions: devices.Permissions("rm"), + Allow: !baseDefaultAllow, + }, + base: &emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 2, + minor: 1, + }: devices.Permissions("rwm"), + { + node: devices.BlockDevice, + major: 5, + minor: 12, + }: devices.Permissions("rw"), + }, + }, + expected: &emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 2, + minor: 1, + }: devices.Permissions("rwm"), + { + node: devices.BlockDevice, + major: 5, + minor: 12, + }: devices.Permissions("rwm"), + }, + }, + }, + { + name: "RuleAdditionMergeWildcard", + rule: devices.Rule{ + Type: devices.BlockDevice, + Major: 5, + Minor: devices.Wildcard, + Permissions: devices.Permissions("rm"), + Allow: !baseDefaultAllow, + }, + base: &emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 2, + minor: 1, + }: devices.Permissions("rwm"), + { + node: devices.BlockDevice, + major: 5, + minor: devices.Wildcard, + }: devices.Permissions("rw"), + }, + }, + expected: &emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 2, + 
minor: 1, + }: devices.Permissions("rwm"), + { + node: devices.BlockDevice, + major: 5, + minor: devices.Wildcard, + }: devices.Permissions("rwm"), + }, + }, + }, + { + name: "RuleAdditionMergeNoop", + rule: devices.Rule{ + Type: devices.BlockDevice, + Major: 5, + Minor: 12, + Permissions: devices.Permissions("r"), + Allow: !baseDefaultAllow, + }, + base: &emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 2, + minor: 1, + }: devices.Permissions("rwm"), + { + node: devices.BlockDevice, + major: 5, + minor: 12, + }: devices.Permissions("rw"), + }, + }, + expected: &emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 2, + minor: 1, + }: devices.Permissions("rwm"), + { + node: devices.BlockDevice, + major: 5, + minor: 12, + }: devices.Permissions("rw"), + }, + }, + }, + // Rule removal logic. + { + name: "RuleRemovalBasic", + rule: devices.Rule{ + Type: devices.CharDevice, + Major: 42, + Minor: 1337, + Permissions: devices.Permissions("rm"), + Allow: baseDefaultAllow, + }, + base: &emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 42, + minor: 1337, + }: devices.Permissions("rm"), + { + node: devices.BlockDevice, + major: 1, + minor: 5, + }: devices.Permissions("r"), + }, + }, + expected: &emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: devices.BlockDevice, + major: 1, + minor: 5, + }: devices.Permissions("r"), + }, + }, + }, + { + name: "RuleRemovalNonexistent", + rule: devices.Rule{ + Type: devices.CharDevice, + Major: 4, + Minor: 1, + Permissions: devices.Permissions("rw"), + Allow: baseDefaultAllow, + }, + base: &emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: devices.BlockDevice, + major: 1, + minor: 5, + }: devices.Permissions("r"), + }, + }, + expected: &emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: 
devices.BlockDevice, + major: 1, + minor: 5, + }: devices.Permissions("r"), + }, + }, + }, + { + name: "RuleRemovalFull", + rule: devices.Rule{ + Type: devices.CharDevice, + Major: 42, + Minor: 1337, + Permissions: devices.Permissions("rw"), + Allow: baseDefaultAllow, + }, + base: &emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 42, + minor: 1337, + }: devices.Permissions("w"), + { + node: devices.BlockDevice, + major: 1, + minor: 5, + }: devices.Permissions("r"), + }, + }, + expected: &emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: devices.BlockDevice, + major: 1, + minor: 5, + }: devices.Permissions("r"), + }, + }, + }, + { + name: "RuleRemovalPartial", + rule: devices.Rule{ + Type: devices.CharDevice, + Major: 42, + Minor: 1337, + Permissions: devices.Permissions("r"), + Allow: baseDefaultAllow, + }, + base: &emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 42, + minor: 1337, + }: devices.Permissions("rm"), + { + node: devices.BlockDevice, + major: 1, + minor: 5, + }: devices.Permissions("r"), + }, + }, + expected: &emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 42, + minor: 1337, + }: devices.Permissions("m"), + { + node: devices.BlockDevice, + major: 1, + minor: 5, + }: devices.Permissions("r"), + }, + }, + }, + // Check our non-canonical behaviour when it comes to try to "punch + // out" holes in a wildcard rule. 
+ { + name: "RuleRemovalWildcardPunchoutImpossible", + rule: devices.Rule{ + Type: devices.CharDevice, + Major: 42, + Minor: 1337, + Permissions: devices.Permissions("r"), + Allow: baseDefaultAllow, + }, + base: &emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 42, + minor: devices.Wildcard, + }: devices.Permissions("rm"), + { + node: devices.CharDevice, + major: 42, + minor: 1337, + }: devices.Permissions("r"), + }, + }, + expected: nil, + }, + { + name: "RuleRemovalWildcardPunchoutPossible", + rule: devices.Rule{ + Type: devices.CharDevice, + Major: 42, + Minor: 1337, + Permissions: devices.Permissions("r"), + Allow: baseDefaultAllow, + }, + base: &emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 42, + minor: devices.Wildcard, + }: devices.Permissions("wm"), + { + node: devices.CharDevice, + major: 42, + minor: 1337, + }: devices.Permissions("r"), + }, + }, + expected: &emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 42, + minor: devices.Wildcard, + }: devices.Permissions("wm"), + }, + }, + }, + } + + for _, test := range tests { + test := test + t.Run(test.name, func(t *testing.T) { + err := test.base.Apply(test.rule) + if err != nil && test.expected != nil { + t.Fatalf("unexpected failure when applying apply rule: %v", err) + } else if err == nil && test.expected == nil { + t.Fatalf("unexpected success when applying apply rule: %#v", test.base) + } + + if test.expected != nil && !reflect.DeepEqual(test.base, test.expected) { + t.Errorf("final emulator state mismatch: %#v != %#v", test.base, test.expected) + } + }) + } +} + +func TestDeviceEmulatorWhitelistApply(t *testing.T) { + testDeviceEmulatorApply(t, false) +} + +func TestDeviceEmulatorBlacklistApply(t *testing.T) { + testDeviceEmulatorApply(t, true) +} + +func testDeviceEmulatorTransition(t *testing.T, sourceDefaultAllow bool) { + 
tests := []struct { + name string + source, target *emulator + expected []*devices.Rule + }{ + // No-op changes. + { + name: "Noop", + source: &emulator{ + defaultAllow: sourceDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 42, + minor: devices.Wildcard, + }: devices.Permissions("wm"), + }, + }, + target: &emulator{ + defaultAllow: sourceDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 42, + minor: devices.Wildcard, + }: devices.Permissions("wm"), + }, + }, + // Identical white-lists produce no extra rules. + expected: nil, + }, + // Switching modes. + { + name: "SwitchToOtherMode", + source: &emulator{ + defaultAllow: sourceDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 1, + minor: 2, + }: devices.Permissions("rwm"), + }, + }, + target: &emulator{ + defaultAllow: !sourceDefaultAllow, + rules: deviceRules{ + { + node: devices.BlockDevice, + major: 42, + minor: devices.Wildcard, + }: devices.Permissions("wm"), + }, + }, + expected: []*devices.Rule{ + // Clear-all rule. + { + Type: devices.WildcardDevice, + Major: devices.Wildcard, + Minor: devices.Wildcard, + Permissions: devices.Permissions("rwm"), + Allow: !sourceDefaultAllow, + }, + // The actual rule-set. + { + Type: devices.BlockDevice, + Major: 42, + Minor: devices.Wildcard, + Permissions: devices.Permissions("wm"), + Allow: sourceDefaultAllow, + }, + }, + }, + // Rule changes. 
+ { + name: "RuleAddition", + source: &emulator{ + defaultAllow: sourceDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 1, + minor: 2, + }: devices.Permissions("rwm"), + }, + }, + target: &emulator{ + defaultAllow: sourceDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 1, + minor: 2, + }: devices.Permissions("rwm"), + { + node: devices.BlockDevice, + major: 42, + minor: 1337, + }: devices.Permissions("rwm"), + }, + }, + expected: []*devices.Rule{ + { + Type: devices.BlockDevice, + Major: 42, + Minor: 1337, + Permissions: devices.Permissions("rwm"), + Allow: !sourceDefaultAllow, + }, + }, + }, + { + name: "RuleRemoval", + source: &emulator{ + defaultAllow: sourceDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 1, + minor: 2, + }: devices.Permissions("rwm"), + { + node: devices.BlockDevice, + major: 42, + minor: 1337, + }: devices.Permissions("rwm"), + }, + }, + target: &emulator{ + defaultAllow: sourceDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 1, + minor: 2, + }: devices.Permissions("rwm"), + }, + }, + expected: []*devices.Rule{ + { + Type: devices.BlockDevice, + Major: 42, + Minor: 1337, + Permissions: devices.Permissions("rwm"), + Allow: sourceDefaultAllow, + }, + }, + }, + { + name: "RuleMultipleAdditionRemoval", + source: &emulator{ + defaultAllow: sourceDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 1, + minor: 2, + }: devices.Permissions("rwm"), + { + node: devices.BlockDevice, + major: 3, + minor: 9, + }: devices.Permissions("rw"), + }, + }, + target: &emulator{ + defaultAllow: sourceDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 1, + minor: 2, + }: devices.Permissions("rwm"), + }, + }, + expected: []*devices.Rule{ + { + Type: devices.BlockDevice, + Major: 3, + Minor: 9, + Permissions: devices.Permissions("rw"), + Allow: sourceDefaultAllow, + }, + }, + }, + // Modifying the access 
permissions. + { + name: "RulePartialAddition", + source: &emulator{ + defaultAllow: sourceDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 1, + minor: 2, + }: devices.Permissions("r"), + }, + }, + target: &emulator{ + defaultAllow: sourceDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 1, + minor: 2, + }: devices.Permissions("rwm"), + }, + }, + expected: []*devices.Rule{ + { + Type: devices.CharDevice, + Major: 1, + Minor: 2, + Permissions: devices.Permissions("wm"), + Allow: !sourceDefaultAllow, + }, + }, + }, + { + name: "RulePartialRemoval", + source: &emulator{ + defaultAllow: sourceDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 1, + minor: 2, + }: devices.Permissions("rw"), + }, + }, + target: &emulator{ + defaultAllow: sourceDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 1, + minor: 2, + }: devices.Permissions("w"), + }, + }, + expected: []*devices.Rule{ + { + Type: devices.CharDevice, + Major: 1, + Minor: 2, + Permissions: devices.Permissions("r"), + Allow: sourceDefaultAllow, + }, + }, + }, + { + name: "RulePartialBoth", + source: &emulator{ + defaultAllow: sourceDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 1, + minor: 2, + }: devices.Permissions("rw"), + }, + }, + target: &emulator{ + defaultAllow: sourceDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 1, + minor: 2, + }: devices.Permissions("rm"), + }, + }, + expected: []*devices.Rule{ + { + Type: devices.CharDevice, + Major: 1, + Minor: 2, + Permissions: devices.Permissions("w"), + Allow: sourceDefaultAllow, + }, + { + Type: devices.CharDevice, + Major: 1, + Minor: 2, + Permissions: devices.Permissions("m"), + Allow: !sourceDefaultAllow, + }, + }, + }, + } + + for _, test := range tests { + test := test + t.Run(test.name, func(t *testing.T) { + // If we are in black-list mode, we need to prepend the relevant + // clear-all rule 
(the expected rule lists are written with + // white-list mode in mind), and then make a full copy of the + // target rules. + if sourceDefaultAllow && test.source.defaultAllow == test.target.defaultAllow { + test.expected = []*devices.Rule{{ + Type: devices.WildcardDevice, + Major: devices.Wildcard, + Minor: devices.Wildcard, + Permissions: devices.Permissions("rwm"), + Allow: test.target.defaultAllow, + }} + for _, rule := range test.target.rules.orderedEntries() { + test.expected = append(test.expected, &devices.Rule{ + Type: rule.meta.node, + Major: rule.meta.major, + Minor: rule.meta.minor, + Permissions: rule.perms, + Allow: !test.target.defaultAllow, + }) + } + } + + rules, err := test.source.Transition(test.target) + if err != nil { + t.Fatalf("unexpected error while calculating transition rules: %#v", err) + } + + if !reflect.DeepEqual(rules, test.expected) { + t.Errorf("rules don't match expected set: %#v != %#v", rules, test.expected) + } + + // Apply the rules to the source to see if it actually transitions + // correctly. This is all emulated but it's a good thing to + // double-check. 
+ for _, rule := range rules { + if err := test.source.Apply(*rule); err != nil { + t.Fatalf("error while applying transition rule [%#v]: %v", rule, err) + } + } + if !reflect.DeepEqual(test.source, test.target) { + t.Errorf("transition incomplete after applying all rules: %#v != %#v", test.source, test.target) + } + }) + } +} + +func TestDeviceEmulatorTransitionFromBlacklist(t *testing.T) { + testDeviceEmulatorTransition(t, true) +} + +func TestDeviceEmulatorTransitionFromWhitelist(t *testing.T) { + testDeviceEmulatorTransition(t, false) +} + +func BenchmarkParseLine(b *testing.B) { + list := `c *:* m +b *:* m +c 1:3 rwm +c 1:5 rwm +c 1:7 rwm +c 1:8 rwm +c 1:9 rwm +c 5:0 rwm +c 5:2 rwm +c 136:* rwm +c 10:200 rwm` + + var r *deviceRule + var err error + for i := 0; i < b.N; i++ { + s := bufio.NewScanner(strings.NewReader(list)) + for s.Scan() { + line := s.Text() + r, err = parseLine(line) + } + if err := s.Err(); err != nil { + b.Fatal(err) + } + } + b.Logf("rule: %v, err: %v", r, err) +} diff --git a/devices/ebpf_linux.go b/devices/ebpf_linux.go new file mode 100644 index 0000000..6a41aff --- /dev/null +++ b/devices/ebpf_linux.go @@ -0,0 +1,256 @@ +package devices + +import ( + "errors" + "fmt" + "os" + "runtime" + "sync" + "unsafe" + + "github.com/cilium/ebpf" + "github.com/cilium/ebpf/asm" + "github.com/cilium/ebpf/link" + "github.com/sirupsen/logrus" + "golang.org/x/sys/unix" +) + +func nilCloser() error { + return nil +} + +func findAttachedCgroupDeviceFilters(dirFd int) ([]*ebpf.Program, error) { + type bpfAttrQuery struct { + TargetFd uint32 + AttachType uint32 + QueryType uint32 + AttachFlags uint32 + ProgIds uint64 // __aligned_u64 + ProgCnt uint32 + } + + // Currently you can only have 64 eBPF programs attached to a cgroup. 
+ size := 64 + retries := 0 + for retries < 10 { + progIds := make([]uint32, size) + query := bpfAttrQuery{ + TargetFd: uint32(dirFd), + AttachType: uint32(unix.BPF_CGROUP_DEVICE), + ProgIds: uint64(uintptr(unsafe.Pointer(&progIds[0]))), + ProgCnt: uint32(len(progIds)), + } + + // Fetch the list of program ids. + _, _, errno := unix.Syscall(unix.SYS_BPF, + uintptr(unix.BPF_PROG_QUERY), + uintptr(unsafe.Pointer(&query)), + unsafe.Sizeof(query)) + size = int(query.ProgCnt) + runtime.KeepAlive(query) + if errno != 0 { + // On ENOSPC we get the correct number of programs. + if errno == unix.ENOSPC { + retries++ + continue + } + return nil, fmt.Errorf("bpf_prog_query(BPF_CGROUP_DEVICE) failed: %w", errno) + } + + // Convert the ids to program handles. + progIds = progIds[:size] + programs := make([]*ebpf.Program, 0, len(progIds)) + for _, progId := range progIds { + program, err := ebpf.NewProgramFromID(ebpf.ProgramID(progId)) + if err != nil { + // We skip over programs that give us -EACCES or -EPERM. This + // is necessary because there may be BPF programs that have + // been attached (such as with --systemd-cgroup) which have an + // LSM label that blocks us from interacting with the program. + // + // Because additional BPF_CGROUP_DEVICE programs only can add + // restrictions, there's no real issue with just ignoring these + // programs (and stops runc from breaking on distributions with + // very strict SELinux policies). 
+ if errors.Is(err, os.ErrPermission) { + logrus.Debugf("ignoring existing CGROUP_DEVICE program (prog_id=%v) which cannot be accessed by runc -- likely due to LSM policy: %v", progId, err) + continue + } + return nil, fmt.Errorf("cannot fetch program from id: %w", err) + } + programs = append(programs, program) + } + runtime.KeepAlive(progIds) + return programs, nil + } + + return nil, errors.New("could not get complete list of CGROUP_DEVICE programs") +} + +var ( + haveBpfProgReplaceBool bool + haveBpfProgReplaceOnce sync.Once +) + +// Loosely based on the BPF_F_REPLACE support check in +// https://github.com/cilium/ebpf/blob/v0.6.0/link/syscalls.go. +// +// TODO: move this logic to cilium/ebpf +func haveBpfProgReplace() bool { + haveBpfProgReplaceOnce.Do(func() { + prog, err := ebpf.NewProgram(&ebpf.ProgramSpec{ + Type: ebpf.CGroupDevice, + License: "MIT", + Instructions: asm.Instructions{ + asm.Mov.Imm(asm.R0, 0), + asm.Return(), + }, + }) + if err != nil { + logrus.Warnf("checking for BPF_F_REPLACE support: ebpf.NewProgram failed: %v", err) + return + } + defer prog.Close() + + devnull, err := os.Open("/dev/null") + if err != nil { + logrus.Warnf("checking for BPF_F_REPLACE support: open dummy target fd: %v", err) + return + } + defer devnull.Close() + + // We know that we have BPF_PROG_ATTACH since we can load + // BPF_CGROUP_DEVICE programs. If passing BPF_F_REPLACE gives us EINVAL + // we know that the feature isn't present. + err = link.RawAttachProgram(link.RawAttachProgramOptions{ + // We rely on this fd being checked after attachFlags in the kernel. + Target: int(devnull.Fd()), + // Attempt to "replace" our BPF program with itself. This will + // always fail, but we should get -EINVAL if BPF_F_REPLACE is not + // supported. 
+ Anchor: link.ReplaceProgram(prog), + Program: prog, + Attach: ebpf.AttachCGroupDevice, + Flags: unix.BPF_F_ALLOW_MULTI, + }) + if errors.Is(err, ebpf.ErrNotSupported) || errors.Is(err, unix.EINVAL) { + // not supported + return + } + if !errors.Is(err, unix.EBADF) { + // If we see any new errors here, it's possible that there is a + // regression due to a cilium/ebpf update and the above EINVAL + // checks are not working. So, be loud about it so someone notices + // and we can get the issue fixed quicker. + logrus.Warnf("checking for BPF_F_REPLACE: got unexpected (not EBADF or EINVAL) error: %v", err) + } + haveBpfProgReplaceBool = true + }) + return haveBpfProgReplaceBool +} + +// loadAttachCgroupDeviceFilter installs eBPF device filter program to /sys/fs/cgroup/ directory. +// +// Requires the system to be running in cgroup2 unified-mode with kernel >= 4.15 . +// +// https://github.com/torvalds/linux/commit/ebc614f687369f9df99828572b1d85a7c2de3d92 +func loadAttachCgroupDeviceFilter(insts asm.Instructions, license string, dirFd int) (func() error, error) { + // Increase `ulimit -l` limit to avoid BPF_PROG_LOAD error (#2167). + // This limit is not inherited into the container. + memlockLimit := &unix.Rlimit{ + Cur: unix.RLIM_INFINITY, + Max: unix.RLIM_INFINITY, + } + _ = unix.Setrlimit(unix.RLIMIT_MEMLOCK, memlockLimit) + + // Get the list of existing programs. + oldProgs, err := findAttachedCgroupDeviceFilters(dirFd) + if err != nil { + return nilCloser, err + } + useReplaceProg := haveBpfProgReplace() && len(oldProgs) == 1 + + // Generate new program. + spec := &ebpf.ProgramSpec{ + Type: ebpf.CGroupDevice, + Instructions: insts, + License: license, + } + prog, err := ebpf.NewProgram(spec) + if err != nil { + return nilCloser, err + } + + // If there is only one old program, we can just replace it directly. 
+ + attachProgramOptions := link.RawAttachProgramOptions{ + Target: dirFd, + Program: prog, + Attach: ebpf.AttachCGroupDevice, + Flags: unix.BPF_F_ALLOW_MULTI, + } + + if useReplaceProg { + attachProgramOptions.Anchor = link.ReplaceProgram(oldProgs[0]) + } + err = link.RawAttachProgram(attachProgramOptions) + if err != nil { + return nilCloser, fmt.Errorf("failed to call BPF_PROG_ATTACH (BPF_CGROUP_DEVICE, BPF_F_ALLOW_MULTI): %w", err) + } + closer := func() error { + err = link.RawDetachProgram(link.RawDetachProgramOptions{ + Target: dirFd, + Program: prog, + Attach: ebpf.AttachCGroupDevice, + }) + if err != nil { + return fmt.Errorf("failed to call BPF_PROG_DETACH (BPF_CGROUP_DEVICE): %w", err) + } + // TODO: Should we attach the old filters back in this case? Otherwise + // we fail-open on a security feature, which is a bit scary. + return nil + } + if !useReplaceProg { + logLevel := logrus.DebugLevel + // If there was more than one old program, give a warning (since this + // really shouldn't happen with runc-managed cgroups) and then detach + // all the old programs. + if len(oldProgs) > 1 { + // NOTE: Ideally this should be a warning but it turns out that + // systemd-managed cgroups trigger this warning (apparently + // systemd doesn't delete old non-systemd programs when + // setting properties). + logrus.Infof("found more than one filter (%d) attached to a cgroup -- removing extra filters!", len(oldProgs)) + logLevel = logrus.InfoLevel + } + for idx, oldProg := range oldProgs { + // Output some extra debug info. 
+ if info, err := oldProg.Info(); err == nil { + fields := logrus.Fields{ + "type": info.Type.String(), + "tag": info.Tag, + "name": info.Name, + } + if id, ok := info.ID(); ok { + fields["id"] = id + } + if runCount, ok := info.RunCount(); ok { + fields["run_count"] = runCount + } + if runtime, ok := info.Runtime(); ok { + fields["runtime"] = runtime.String() + } + logrus.WithFields(fields).Logf(logLevel, "removing old filter %d from cgroup", idx) + } + err = link.RawDetachProgram(link.RawDetachProgramOptions{ + Target: dirFd, + Program: oldProg, + Attach: ebpf.AttachCGroupDevice, + }) + if err != nil { + return closer, fmt.Errorf("failed to call BPF_PROG_DETACH (BPF_CGROUP_DEVICE) on old filter program: %w", err) + } + } + } + return closer, nil +} diff --git a/devices/systemd.go b/devices/systemd.go new file mode 100644 index 0000000..010f7f2 --- /dev/null +++ b/devices/systemd.go @@ -0,0 +1,252 @@ +package devices + +import ( + "bufio" + "fmt" + "os" + "strconv" + "strings" + + systemdDbus "github.com/coreos/go-systemd/v22/dbus" + "github.com/godbus/dbus/v5" + "github.com/sirupsen/logrus" + + "github.com/opencontainers/cgroups" + devices "github.com/opencontainers/cgroups/devices/config" +) + +// systemdProperties takes the configured device rules and generates a +// corresponding set of systemd properties to configure the devices correctly. +func systemdProperties(r *cgroups.Resources, sdVer int) ([]systemdDbus.Property, error) { + if r.SkipDevices { + return nil, nil + } + + properties := []systemdDbus.Property{ + // When we later add DeviceAllow=/dev/foo properties, we are + // appending devices to the allow list for the unit. However, + // if this is an existing unit, it already has DeviceAllow= + // entries, and we need to clear them all before applying the + // new set. (We also do this for new units, mainly for safety + // to ensure we only enable the devices we expect.) 
+ // + // To clear any existing DeviceAllow= rules, we have to add an + // empty DeviceAllow= property. + newProp("DeviceAllow", []deviceAllowEntry{}), + // Always run in the strictest white-list mode. + newProp("DevicePolicy", "strict"), + } + + // Figure out the set of rules. + configEmu := emulator{} + for _, rule := range r.Devices { + if err := configEmu.Apply(*rule); err != nil { + return nil, fmt.Errorf("unable to apply rule for systemd: %w", err) + } + } + // systemd doesn't support blacklists. So we log a warning, and tell + // systemd to act as a deny-all whitelist. This ruleset will be replaced + // with our normal fallback code. This may result in spurious errors, but + // the only other option is to error out here. + if configEmu.IsBlacklist() { + // However, if we're dealing with an allow-all rule then we can do it. + if configEmu.IsAllowAll() { + return allowAllDevices(), nil + } + logrus.Warn("systemd doesn't support blacklist device rules -- applying temporary deny-all rule") + return properties, nil + } + + // Now generate the set of rules we actually need to apply. Unlike the + // normal devices cgroup, in "strict" mode systemd defaults to a deny-all + // whitelist which is the default for devices.Emulator. + finalRules, err := configEmu.Rules() + if err != nil { + return nil, fmt.Errorf("unable to get simplified rules for systemd: %w", err) + } + var deviceAllowList []deviceAllowEntry + for _, rule := range finalRules { + if !rule.Allow { + // Should never happen. + return nil, fmt.Errorf("[internal error] cannot add deny rule to systemd DeviceAllow list: %v", *rule) + } + switch rule.Type { + case devices.BlockDevice, devices.CharDevice: + default: + // Should never happen. 
+ return nil, fmt.Errorf("invalid device type for DeviceAllow: %v", rule.Type) + } + + entry := deviceAllowEntry{ + Perms: string(rule.Permissions), + } + + // systemd has a fairly odd (though understandable) syntax here, and + // because of the OCI configuration format we have to do quite a bit of + // trickery to convert things: + // + // * Concrete rules with non-wildcard major/minor numbers have to use + // /dev/{block,char}/MAJOR:minor paths. Before v240, systemd uses + // stat(2) on such paths to look up device properties, meaning we + // cannot add whitelist rules for devices that don't exist. Since v240, + // device properties are parsed from the path string. + // + // However, path globbing is not supported for path-based rules so we + // need to handle wildcards in some other manner. + // + // * If systemd older than v240 is used, wildcard-minor rules + // have to specify a "device group name" (the second column + // in /proc/devices). + // + // * Wildcard (major and minor) rules can just specify a glob with the + // type ("char-*" or "block-*"). + // + // The only type of rule we can't handle is wildcard-major rules, and + // so we'll give a warning in that case (note that the fallback code + // will insert any rules systemd couldn't handle). What amazing fun. + + if rule.Major == devices.Wildcard { + // "_ *:n _" rules aren't supported by systemd. + if rule.Minor != devices.Wildcard { + logrus.Warnf("systemd doesn't support '*:n' device rules -- temporarily ignoring rule: %v", *rule) + continue + } + + // "_ *:* _" rules just wildcard everything. + prefix, err := groupPrefix(rule.Type) + if err != nil { + return nil, err + } + entry.Path = prefix + "*" + } else if rule.Minor == devices.Wildcard { + if sdVer >= 240 { + // systemd v240+ allows for {block,char}-MAJOR syntax. 
+ prefix, err := groupPrefix(rule.Type) + if err != nil { + return nil, err + } + entry.Path = prefix + strconv.FormatInt(rule.Major, 10) + } else { + // For older systemd, "_ n:* _" rules require a device group from /proc/devices. + group, err := findDeviceGroup(rule.Type, rule.Major) + if err != nil { + return nil, fmt.Errorf("unable to find device '%v/%d': %w", rule.Type, rule.Major, err) + } + if group == "" { + // Couldn't find a group. + logrus.Warnf("could not find device group for '%v/%d' in /proc/devices -- temporarily ignoring rule: %v", rule.Type, rule.Major, *rule) + continue + } + entry.Path = group + } + } else { + // "_ n:m _" rules are just a path in /dev/{block,char}/. + switch rule.Type { + case devices.BlockDevice: + entry.Path = fmt.Sprintf("/dev/block/%d:%d", rule.Major, rule.Minor) + case devices.CharDevice: + entry.Path = fmt.Sprintf("/dev/char/%d:%d", rule.Major, rule.Minor) + } + if sdVer < 240 { + // Old systemd versions use stat(2) on path to find out device major:minor + // numbers and type. If the path doesn't exist, it will not add the rule, + // emitting a warning instead. + // Since all of this logic is best-effort anyway (we manually set these + // rules separately to systemd) we can safely skip entries that don't + // have a corresponding path. 
+ if _, err := os.Stat(entry.Path); err != nil { + continue + } + } + } + deviceAllowList = append(deviceAllowList, entry) + } + + properties = append(properties, newProp("DeviceAllow", deviceAllowList)) + return properties, nil +} + +func newProp(name string, units interface{}) systemdDbus.Property { + return systemdDbus.Property{ + Name: name, + Value: dbus.MakeVariant(units), + } +} + +func groupPrefix(ruleType devices.Type) (string, error) { + switch ruleType { + case devices.BlockDevice: + return "block-", nil + case devices.CharDevice: + return "char-", nil + default: + return "", fmt.Errorf("device type %v has no group prefix", ruleType) + } +} + +// findDeviceGroup tries to find the device group name (as listed in +// /proc/devices) with the type prefixed as required for DeviceAllow, for a +// given (type, major) combination. If more than one device group exists, an +// arbitrary one is chosen. +func findDeviceGroup(ruleType devices.Type, ruleMajor int64) (string, error) { + fh, err := os.Open("/proc/devices") + if err != nil { + return "", err + } + defer fh.Close() + + prefix, err := groupPrefix(ruleType) + if err != nil { + return "", err + } + ruleMajorStr := strconv.FormatInt(ruleMajor, 10) + " " + + scanner := bufio.NewScanner(fh) + var currentType devices.Type + for scanner.Scan() { + // We need to strip spaces because the first number is column-aligned. + line := strings.TrimSpace(scanner.Text()) + + // Handle the "header" lines. + switch line { + case "Block devices:": + currentType = devices.BlockDevice + continue + case "Character devices:": + currentType = devices.CharDevice + continue + case "": + continue + } + + // Skip lines unrelated to our type. + if currentType != ruleType { + continue + } + + if group, ok := strings.CutPrefix(line, ruleMajorStr); ok { + return prefix + group, nil + } + } + if err := scanner.Err(); err != nil { + return "", fmt.Errorf("reading /proc/devices: %w", err) + } + // Couldn't find the device group. 
+ return "", nil +} + +// DeviceAllow is the dbus type "a(ss)" which means we need a struct +// to represent it in Go. +type deviceAllowEntry struct { + Path string + Perms string +} + +func allowAllDevices() []systemdDbus.Property { + // Setting mode to auto and removing all DeviceAllow rules + // results in allowing access to all devices. + return []systemdDbus.Property{ + newProp("DeviceAllow", []deviceAllowEntry{}), + newProp("DevicePolicy", "auto"), + } +} diff --git a/devices/systemd_test.go b/devices/systemd_test.go new file mode 100644 index 0000000..21b8a6d --- /dev/null +++ b/devices/systemd_test.go @@ -0,0 +1,279 @@ +package devices + +import ( + "bytes" + "fmt" + "os" + "os/exec" + "strings" + "testing" + + "github.com/opencontainers/cgroups" + devices "github.com/opencontainers/cgroups/devices/config" + "github.com/opencontainers/cgroups/systemd" +) + +// TestPodSkipDevicesUpdate checks that updating a pod having SkipDevices: true +// does not result in spurious "permission denied" errors in a container +// running under the pod. The test is somewhat similar in nature to the +// @test "update devices [minimal transition rules]" in tests/integration, +// but uses a pod. +func TestPodSkipDevicesUpdate(t *testing.T) { + if !systemd.IsRunningSystemd() { + t.Skip("Test requires systemd.") + } + if os.Geteuid() != 0 { + t.Skip("Test requires root.") + } + + podName := "system-runc_test_pod" + t.Name() + ".slice" + podConfig := &cgroups.Cgroup{ + Systemd: true, + Parent: "system.slice", + Name: podName, + Resources: &cgroups.Resources{ + PidsLimit: 42, + Memory: 32 * 1024 * 1024, + SkipDevices: true, + }, + } + // Create "pod" cgroup (a systemd slice to hold containers). 
+ pm := newManager(t, podConfig) + if err := pm.Apply(-1); err != nil { + t.Fatal(err) + } + if err := pm.Set(podConfig.Resources); err != nil { + t.Fatal(err) + } + + containerConfig := &cgroups.Cgroup{ + Parent: podName, + ScopePrefix: "test", + Name: "PodSkipDevicesUpdate", + Resources: &cgroups.Resources{ + Devices: []*devices.Rule{ + // Allow access to /dev/null. + { + Type: devices.CharDevice, + Major: 1, + Minor: 3, + Permissions: "rwm", + Allow: true, + }, + }, + }, + } + + // Create a "container" within the "pod" cgroup. + // This is not a real container, just a process in the cgroup. + cmd := exec.Command("sleep", "infinity") + cmd.Env = append(os.Environ(), "LANG=C") + var stderr bytes.Buffer + cmd.Stderr = &stderr + if err := cmd.Start(); err != nil { + t.Fatal(err) + } + // Make sure to not leave a zombie. + defer func() { + // These may fail, we don't care. + _ = cmd.Process.Kill() + _ = cmd.Wait() + }() + + // Put the process into a cgroup. + cm := newManager(t, containerConfig) + if err := cm.Apply(cmd.Process.Pid); err != nil { + t.Fatal(err) + } + // Check that we put the "container" into the "pod" cgroup. + if !strings.HasPrefix(cm.Path("devices"), pm.Path("devices")) { + t.Fatalf("expected container cgroup path %q to be under pod cgroup path %q", + cm.Path("devices"), pm.Path("devices")) + } + if err := cm.Set(containerConfig.Resources); err != nil { + t.Fatal(err) + } + + // Now update the pod a few times. + for i := 0; i < 42; i++ { + podConfig.Resources.PidsLimit++ + podConfig.Resources.Memory += 1024 * 1024 + if err := pm.Set(podConfig.Resources); err != nil { + t.Fatal(err) + } + } + // Kill the "container". + if err := cmd.Process.Kill(); err != nil { + t.Fatal(err) + } + + _ = cmd.Wait() + + // "Container" stderr should be empty. 
+ if stderr.Len() != 0 { + t.Fatalf("container stderr not empty: %s", stderr.String()) + } +} + +func testSkipDevices(t *testing.T, skipDevices bool, expected []string) { + if !systemd.IsRunningSystemd() { + t.Skip("Test requires systemd.") + } + if os.Geteuid() != 0 { + t.Skip("Test requires root.") + } + + podConfig := &cgroups.Cgroup{ + Parent: "system.slice", + Name: "system-runc_test_pods.slice", + Resources: &cgroups.Resources{ + SkipDevices: skipDevices, + }, + } + // Create "pods" cgroup (a systemd slice to hold containers). + pm := newManager(t, podConfig) + if err := pm.Apply(-1); err != nil { + t.Fatal(err) + } + if err := pm.Set(podConfig.Resources); err != nil { + t.Fatal(err) + } + + config := &cgroups.Cgroup{ + Parent: "system-runc_test_pods.slice", + ScopePrefix: "test", + Name: "SkipDevices", + Resources: &cgroups.Resources{ + Devices: []*devices.Rule{ + // Allow access to /dev/full only. + { + Type: devices.CharDevice, + Major: 1, + Minor: 7, + Permissions: "rwm", + Allow: true, + }, + }, + }, + } + + // Create a "container" within the "pods" cgroup. + // This is not a real container, just a process in the cgroup. + cmd := exec.Command("bash", "-c", "read; echo > /dev/full; cat /dev/null; true") + cmd.Env = append(os.Environ(), "LANG=C") + stdinR, stdinW, err := os.Pipe() + if err != nil { + t.Fatal(err) + } + cmd.Stdin = stdinR + var stderr bytes.Buffer + cmd.Stderr = &stderr + err = cmd.Start() + stdinR.Close() + defer stdinW.Close() + if err != nil { + t.Fatal(err) + } + // Make sure to not leave a zombie. + defer func() { + // These may fail, we don't care. + _, _ = stdinW.WriteString("hey\n") + _ = cmd.Wait() + }() + + // Put the process into a cgroup. + m := newManager(t, config) + if err := m.Apply(cmd.Process.Pid); err != nil { + t.Fatal(err) + } + // Check that we put the "container" into the "pod" cgroup. 
+ if !strings.HasPrefix(m.Path("devices"), pm.Path("devices")) { + t.Fatalf("expected container cgroup path %q to be under pod cgroup path %q", + m.Path("devices"), pm.Path("devices")) + } + if err := m.Set(config.Resources); err != nil { + // failed to write "c 1:7 rwm": write /sys/fs/cgroup/devices/system.slice/system-runc_test_pods.slice/test-SkipDevices.scope/devices.allow: operation not permitted + if skipDevices == false && strings.HasSuffix(err.Error(), "/devices.allow: operation not permitted") { + // Cgroup v1 devices controller gives EPERM on trying + // to enable devices that are not enabled + // (skipDevices=false) in a parent cgroup. + // If this happens, test is passing. + return + } + t.Fatal(err) + } + + // Check that we can access /dev/full but not /dev/zero. + if _, err := stdinW.WriteString("wow\n"); err != nil { + t.Fatal(err) + } + if err := cmd.Wait(); err != nil { + t.Fatal(err) + } + for _, exp := range expected { + if !strings.Contains(stderr.String(), exp) { + t.Errorf("expected %q, got: %s", exp, stderr.String()) + } + } +} + +func TestSkipDevicesTrue(t *testing.T) { + testSkipDevices(t, true, []string{ + "echo: write error: No space left on device", + "cat: /dev/null: Operation not permitted", + }) +} + +func TestSkipDevicesFalse(t *testing.T) { + // If SkipDevices is not set for the parent slice, access to both + // devices should fail. This is done to assess the test correctness. + // For cgroup v1, we check for m.Set returning EPERM. + // For cgroup v2, we check for the errors below. 
+ testSkipDevices(t, false, []string{ + "/dev/full: Operation not permitted", + "cat: /dev/null: Operation not permitted", + }) +} + +func testFindDeviceGroup() error { + const ( + major = 136 + group = "char-pts" + ) + res, err := findDeviceGroup(devices.CharDevice, major) + if res != group || err != nil { + return fmt.Errorf("expected %v, nil, got %v, %w", group, res, err) + } + return nil +} + +func TestFindDeviceGroup(t *testing.T) { + if err := testFindDeviceGroup(); err != nil { + t.Fatal(err) + } +} + +func BenchmarkFindDeviceGroup(b *testing.B) { + for i := 0; i < b.N; i++ { + if err := testFindDeviceGroup(); err != nil { + b.Fatal(err) + } + } +} + +func newManager(t *testing.T, config *cgroups.Cgroup) (m cgroups.Manager) { + t.Helper() + var err error + + if cgroups.IsCgroup2UnifiedMode() { + m, err = systemd.NewUnifiedManager(config, "") + } else { + m, err = systemd.NewLegacyManager(config, nil) + } + if err != nil { + t.Fatal(err) + } + t.Cleanup(func() { _ = m.Destroy() }) + + return m +} diff --git a/devices/v1.go b/devices/v1.go new file mode 100644 index 0000000..8d0986d --- /dev/null +++ b/devices/v1.go @@ -0,0 +1,83 @@ +package devices + +import ( + "bytes" + "errors" + "reflect" + + "github.com/moby/sys/userns" + "github.com/opencontainers/cgroups" + devices "github.com/opencontainers/cgroups/devices/config" +) + +var testingSkipFinalCheck bool + +func setV1(path string, r *cgroups.Resources) error { + if userns.RunningInUserNS() || r.SkipDevices { + return nil + } + // Generate two emulators, one for the current state of the cgroup and one + // for the requested state by the user. + current, err := loadEmulator(path) + if err != nil { + return err + } + target, err := buildEmulator(r.Devices) + if err != nil { + return err + } + + // Compute the minimal set of transition rules needed to achieve the + // requested state. 
+ transitionRules, err := current.Transition(target) + if err != nil { + return err + } + for _, rule := range transitionRules { + file := "devices.deny" + if rule.Allow { + file = "devices.allow" + } + if err := cgroups.WriteFile(path, file, rule.CgroupString()); err != nil { + return err + } + } + + // Final safety check -- ensure that the resulting state is what was + // requested. This is only really correct for white-lists, but for + // black-lists we can at least check that the cgroup is in the right mode. + // + // This safety-check is skipped for the unit tests because we cannot + // currently mock devices.list correctly. + if !testingSkipFinalCheck { + currentAfter, err := loadEmulator(path) + if err != nil { + return err + } + if !target.IsBlacklist() && !reflect.DeepEqual(currentAfter, target) { + return errors.New("resulting devices cgroup doesn't precisely match target") + } else if target.IsBlacklist() != currentAfter.IsBlacklist() { + return errors.New("resulting devices cgroup doesn't match target mode") + } + } + return nil +} + +func loadEmulator(path string) (*emulator, error) { + list, err := cgroups.ReadFile(path, "devices.list") + if err != nil { + return nil, err + } + return emulatorFromList(bytes.NewBufferString(list)) +} + +func buildEmulator(rules []*devices.Rule) (*emulator, error) { + // This defaults to a white-list -- which is what we want! 
+ emu := &emulator{} + for _, rule := range rules { + if err := emu.Apply(*rule); err != nil { + return nil, err + } + } + return emu, nil +} diff --git a/devices/v1_test.go b/devices/v1_test.go new file mode 100644 index 0000000..29e4637 --- /dev/null +++ b/devices/v1_test.go @@ -0,0 +1,68 @@ +package devices + +import ( + "os" + "path" + "testing" + + "github.com/moby/sys/userns" + + "github.com/opencontainers/cgroups" + devices "github.com/opencontainers/cgroups/devices/config" + "github.com/opencontainers/cgroups/fscommon" +) + +func init() { + testingSkipFinalCheck = true + cgroups.TestMode = true +} + +func TestSetV1Allow(t *testing.T) { + if userns.RunningInUserNS() { + t.Skip("userns detected; setV1 does nothing") + } + dir := t.TempDir() + + for file, contents := range map[string]string{ + "devices.allow": "", + "devices.deny": "", + "devices.list": "a *:* rwm", + } { + err := os.WriteFile(path.Join(dir, file), []byte(contents), 0o600) + if err != nil { + t.Fatal(err) + } + } + + r := &cgroups.Resources{ + Devices: []*devices.Rule{ + { + Type: devices.CharDevice, + Major: 1, + Minor: 5, + Permissions: devices.Permissions("rwm"), + Allow: true, + }, + }, + } + + if err := setV1(dir, r); err != nil { + t.Fatal(err) + } + + // The default deny rule must be written. + value, err := fscommon.GetCgroupParamString(dir, "devices.deny") + if err != nil { + t.Fatal(err) + } + if value[0] != 'a' { + t.Errorf("Got the wrong value (%q), set devices.deny failed.", value) + } + + // Permitted rule must be written. 
+ if value, err := fscommon.GetCgroupParamString(dir, "devices.allow"); err != nil { + t.Fatal(err) + } else if value != "c 1:5 rwm" { + t.Errorf("Got the wrong value (%q), set devices.allow failed.", value) + } +} diff --git a/devices/v2.go b/devices/v2.go new file mode 100644 index 0000000..d54298f --- /dev/null +++ b/devices/v2.go @@ -0,0 +1,73 @@ +package devices + +import ( + "fmt" + + "github.com/moby/sys/userns" + "golang.org/x/sys/unix" + + "github.com/opencontainers/cgroups" + devices "github.com/opencontainers/cgroups/devices/config" +) + +func isRWM(perms devices.Permissions) bool { + var r, w, m bool + for _, perm := range perms { + switch perm { + case 'r': + r = true + case 'w': + w = true + case 'm': + m = true + } + } + return r && w && m +} + +// This is similar to the logic applied in crun for handling errors from bpf(2) +// . +func canSkipEBPFError(r *cgroups.Resources) bool { + // If we're running in a user namespace we can ignore eBPF rules because we + // usually cannot use bpf(2), as well as rootless containers usually don't + // have the necessary privileges to mknod(2) device inodes or access + // host-level instances (though ideally we would be blocking device access + // for rootless containers anyway). + if userns.RunningInUserNS() { + return true + } + + // We cannot ignore an eBPF load error if any rule if is a block rule or it + // doesn't permit all access modes. + // + // NOTE: This will sometimes trigger in cases where access modes are split + // between different rules but to handle this correctly would require + // using ".../libcontainer/cgroup/devices".Emulator. 
+ for _, dev := range r.Devices { + if !dev.Allow || !isRWM(dev.Permissions) { + return false + } + } + return true +} + +func setV2(dirPath string, r *cgroups.Resources) error { + if r.SkipDevices { + return nil + } + insts, license, err := deviceFilter(r.Devices) + if err != nil { + return err + } + dirFD, err := unix.Open(dirPath, unix.O_DIRECTORY|unix.O_RDONLY, 0o600) + if err != nil { + return fmt.Errorf("cannot get dir FD for %s", dirPath) + } + defer unix.Close(dirFD) + if _, err := loadAttachCgroupDeviceFilter(insts, license, dirFD); err != nil { + if !canSkipEBPFError(r) { + return err + } + } + return nil +} diff --git a/file.go b/file.go new file mode 100644 index 0000000..c1b8f5c --- /dev/null +++ b/file.go @@ -0,0 +1,216 @@ +package cgroups + +import ( + "bytes" + "errors" + "fmt" + "os" + "path/filepath" + "strconv" + "strings" + "sync" + + "github.com/sirupsen/logrus" + "golang.org/x/sys/unix" +) + +// OpenFile opens a cgroup file in a given dir with given flags. +// It is supposed to be used for cgroup files only, and returns +// an error if the file is not a cgroup file. +// +// Arguments dir and file are joined together to form an absolute path +// to a file being opened. +func OpenFile(dir, file string, flags int) (*os.File, error) { + if dir == "" { + return nil, fmt.Errorf("no directory specified for %s", file) + } + return openFile(dir, file, flags) +} + +// ReadFile reads data from a cgroup file in dir. +// It is supposed to be used for cgroup files only. +func ReadFile(dir, file string) (string, error) { + fd, err := OpenFile(dir, file, unix.O_RDONLY) + if err != nil { + return "", err + } + defer fd.Close() + var buf bytes.Buffer + + _, err = buf.ReadFrom(fd) + return buf.String(), err +} + +// WriteFile writes data to a cgroup file in dir. +// It is supposed to be used for cgroup files only. 
+func WriteFile(dir, file, data string) error { + fd, err := OpenFile(dir, file, unix.O_WRONLY) + if err != nil { + return err + } + defer fd.Close() + if _, err := fd.WriteString(data); err != nil { + // Having data in the error message helps in debugging. + return fmt.Errorf("failed to write %q: %w", data, err) + } + return nil +} + +// WriteFileByLine is the same as WriteFile, except if data contains newlines, +// it is written line by line. +func WriteFileByLine(dir, file, data string) error { + i := strings.Index(data, "\n") + if i == -1 { + return WriteFile(dir, file, data) + } + + fd, err := OpenFile(dir, file, unix.O_WRONLY) + if err != nil { + return err + } + defer fd.Close() + start := 0 + for { + var line string + if i == -1 { + line = data[start:] + } else { + line = data[start : start+i+1] + } + _, err := fd.WriteString(line) + if err != nil { + return fmt.Errorf("failed to write %q: %w", line, err) + } + if i == -1 { + break + } + start += i + 1 + i = strings.Index(data[start:], "\n") + } + return nil +} + +const ( + cgroupfsDir = "/sys/fs/cgroup" + cgroupfsPrefix = cgroupfsDir + "/" +) + +var ( + // TestMode is set to true by unit tests that need "fake" cgroupfs. 
+ TestMode bool + + cgroupRootHandle *os.File + prepOnce sync.Once + prepErr error + resolveFlags uint64 +) + +func prepareOpenat2() error { + prepOnce.Do(func() { + fd, err := unix.Openat2(-1, cgroupfsDir, &unix.OpenHow{ + Flags: unix.O_DIRECTORY | unix.O_PATH | unix.O_CLOEXEC, + }) + if err != nil { + prepErr = &os.PathError{Op: "openat2", Path: cgroupfsDir, Err: err} + if err != unix.ENOSYS { + logrus.Warnf("falling back to securejoin: %s", prepErr) + } else { + logrus.Debug("openat2 not available, falling back to securejoin") + } + return + } + file := os.NewFile(uintptr(fd), cgroupfsDir) + + var st unix.Statfs_t + if err := unix.Fstatfs(int(file.Fd()), &st); err != nil { + prepErr = &os.PathError{Op: "statfs", Path: cgroupfsDir, Err: err} + logrus.Warnf("falling back to securejoin: %s", prepErr) + return + } + + cgroupRootHandle = file + resolveFlags = unix.RESOLVE_BENEATH | unix.RESOLVE_NO_MAGICLINKS + if st.Type == unix.CGROUP2_SUPER_MAGIC { + // cgroupv2 has a single mountpoint and no "cpu,cpuacct" symlinks + resolveFlags |= unix.RESOLVE_NO_XDEV | unix.RESOLVE_NO_SYMLINKS + } + }) + + return prepErr +} + +func openFile(dir, file string, flags int) (*os.File, error) { + mode := os.FileMode(0) + if TestMode && flags&os.O_WRONLY != 0 { + // "emulate" cgroup fs for unit tests + flags |= os.O_TRUNC | os.O_CREATE + mode = 0o600 + } + // NOTE it is important to use filepath.Clean("/"+file) here + // (see https://github.com/opencontainers/runc/issues/4103)! + path := filepath.Join(dir, filepath.Clean("/"+file)) + + if prepareOpenat2() != nil { + return openFallback(path, flags, mode) + } + relPath, ok := strings.CutPrefix(path, cgroupfsPrefix) + if !ok { // Non-standard path, old system? 
+ return openFallback(path, flags, mode) + } + + fd, err := unix.Openat2(int(cgroupRootHandle.Fd()), relPath, + &unix.OpenHow{ + Resolve: resolveFlags, + Flags: uint64(flags) | unix.O_CLOEXEC, + Mode: uint64(mode), + }) + if err != nil { + err = &os.PathError{Op: "openat2", Path: path, Err: err} + // Check if cgroupRootHandle is still opened to cgroupfsDir + // (happens when this package is incorrectly used + // across the chroot/pivot_root/mntns boundary, or + // when /sys/fs/cgroup is remounted). + // + // TODO: if such usage will ever be common, amend this + // to reopen cgroupRootHandle and retry openat2. + fdDest, fdErr := os.Readlink("/proc/thread-self/fd/" + strconv.Itoa(int(cgroupRootHandle.Fd()))) + if fdErr == nil && fdDest != cgroupfsDir { + // Wrap the error so it is clear that cgroupRootHandle + // is opened to an unexpected/wrong directory. + err = fmt.Errorf("cgroupRootHandle %d unexpectedly opened to %s != %s: %w", + cgroupRootHandle.Fd(), fdDest, cgroupfsDir, err) + } + return nil, err + } + + return os.NewFile(uintptr(fd), path), nil +} + +var errNotCgroupfs = errors.New("not a cgroup file") + +// Can be changed by unit tests. +var openFallback = openAndCheck + +// openAndCheck is used when openat2(2) is not available. It checks the opened +// file is on cgroupfs, returning an error otherwise. +func openAndCheck(path string, flags int, mode os.FileMode) (*os.File, error) { + fd, err := os.OpenFile(path, flags, mode) + if err != nil { + return nil, err + } + if TestMode { + return fd, nil + } + // Check this is a cgroupfs file. 
+ var st unix.Statfs_t + if err := unix.Fstatfs(int(fd.Fd()), &st); err != nil { + _ = fd.Close() + return nil, &os.PathError{Op: "statfs", Path: path, Err: err} + } + if st.Type != unix.CGROUP_SUPER_MAGIC && st.Type != unix.CGROUP2_SUPER_MAGIC { + _ = fd.Close() + return nil, &os.PathError{Op: "open", Path: path, Err: errNotCgroupfs} + } + + return fd, nil +} diff --git a/file_test.go b/file_test.go new file mode 100644 index 0000000..3a9fac3 --- /dev/null +++ b/file_test.go @@ -0,0 +1,93 @@ +package cgroups + +import ( + "errors" + "fmt" + "os" + "path/filepath" + "strconv" + "testing" + "time" +) + +func TestWriteCgroupFileHandlesInterrupt(t *testing.T) { + const ( + memoryCgroupMount = "/sys/fs/cgroup/memory" + memoryLimit = "memory.limit_in_bytes" + ) + if _, err := os.Stat(memoryCgroupMount); err != nil { + // most probably cgroupv2 + t.Skip(err) + } + + cgroupName := fmt.Sprintf("test-eint-%d", time.Now().Nanosecond()) + cgroupPath := filepath.Join(memoryCgroupMount, cgroupName) + if err := os.MkdirAll(cgroupPath, 0o755); err != nil { + t.Fatal(err) + } + defer os.RemoveAll(cgroupPath) + + if _, err := os.Stat(filepath.Join(cgroupPath, memoryLimit)); err != nil { + // either cgroupv2, or memory controller is not available + t.Skip(err) + } + + for i := 0; i < 100000; i++ { + limit := 1024*1024 + i + if err := WriteFile(cgroupPath, memoryLimit, strconv.Itoa(limit)); err != nil { + t.Fatalf("Failed to write %d on attempt %d: %+v", limit, i, err) + } + } +} + +func TestOpenat2(t *testing.T) { + if !IsCgroup2UnifiedMode() { + // The reason is many test cases below test opening files from + // the top-level directory, where cgroup v1 has no files. + t.Skip("test requires cgroup v2") + } + + // Make sure we test openat2, not its fallback. 
+ openFallback = func(_ string, _ int, _ os.FileMode) (*os.File, error) { + return nil, errors.New("fallback") + } + defer func() { openFallback = openAndCheck }() + + for _, tc := range []struct{ dir, file string }{ + {"/sys/fs/cgroup", "cgroup.controllers"}, + {"/sys/fs/cgroup", "/cgroup.controllers"}, + {"/sys/fs/cgroup/", "cgroup.controllers"}, + {"/sys/fs/cgroup/", "/cgroup.controllers"}, + {"/", "/sys/fs/cgroup/cgroup.controllers"}, + {"/", "sys/fs/cgroup/cgroup.controllers"}, + {"/sys/fs/cgroup/cgroup.controllers", ""}, + } { + fd, err := OpenFile(tc.dir, tc.file, os.O_RDONLY) + if err != nil { + t.Errorf("case %+v: %v", tc, err) + } + fd.Close() + } +} + +func BenchmarkWriteFile(b *testing.B) { + TestMode = true + defer func() { TestMode = false }() + + dir := b.TempDir() + tc := []string{ + "one", + "one\ntwo\nthree", + "10:200 foo=bar boo=far\n300:1200 something=other\ndefault 45000\n", + "\n\n\n\n\n\n\n\n", + } + + b.ResetTimer() + for i := 0; i < b.N; i++ { + for _, val := range tc { + if err := WriteFileByLine(dir, "file", val); err != nil { + b.Fatal(err) + } + } + } +} diff --git a/fs/blkio.go b/fs/blkio.go new file mode 100644 index 0000000..f3c4c5c --- /dev/null +++ b/fs/blkio.go @@ -0,0 +1,310 @@ +package fs + +import ( + "bufio" + "os" + "path/filepath" + "strconv" + "strings" + + "github.com/opencontainers/cgroups" +) + +type BlkioGroup struct { + weightFilename string + weightDeviceFilename string +} + +func (s *BlkioGroup) Name() string { + return "blkio" +} + +func (s *BlkioGroup) Apply(path string, _ *cgroups.Resources, pid int) error { + return apply(path, pid) +} + +func (s *BlkioGroup) Set(path string, r *cgroups.Resources) error { + s.detectWeightFilenames(path) + if r.BlkioWeight != 0 { + if err := cgroups.WriteFile(path, s.weightFilename, strconv.FormatUint(uint64(r.BlkioWeight), 10)); err != nil { + return err + } + } + + if r.BlkioLeafWeight != 0 { + if err := cgroups.WriteFile(path, "blkio.leaf_weight", 
strconv.FormatUint(uint64(r.BlkioLeafWeight), 10)); err != nil { + return err + } + } + for _, wd := range r.BlkioWeightDevice { + if wd.Weight != 0 { + if err := cgroups.WriteFile(path, s.weightDeviceFilename, wd.WeightString()); err != nil { + return err + } + } + if wd.LeafWeight != 0 { + if err := cgroups.WriteFile(path, "blkio.leaf_weight_device", wd.LeafWeightString()); err != nil { + return err + } + } + } + for _, td := range r.BlkioThrottleReadBpsDevice { + if err := cgroups.WriteFile(path, "blkio.throttle.read_bps_device", td.String()); err != nil { + return err + } + } + for _, td := range r.BlkioThrottleWriteBpsDevice { + if err := cgroups.WriteFile(path, "blkio.throttle.write_bps_device", td.String()); err != nil { + return err + } + } + for _, td := range r.BlkioThrottleReadIOPSDevice { + if err := cgroups.WriteFile(path, "blkio.throttle.read_iops_device", td.String()); err != nil { + return err + } + } + for _, td := range r.BlkioThrottleWriteIOPSDevice { + if err := cgroups.WriteFile(path, "blkio.throttle.write_iops_device", td.String()); err != nil { + return err + } + } + + return nil +} + +/* +examples: + + blkio.sectors + 8:0 6792 + + blkio.io_service_bytes + 8:0 Read 1282048 + 8:0 Write 2195456 + 8:0 Sync 2195456 + 8:0 Async 1282048 + 8:0 Total 3477504 + Total 3477504 + + blkio.io_serviced + 8:0 Read 124 + 8:0 Write 104 + 8:0 Sync 104 + 8:0 Async 124 + 8:0 Total 228 + Total 228 + + blkio.io_queued + 8:0 Read 0 + 8:0 Write 0 + 8:0 Sync 0 + 8:0 Async 0 + 8:0 Total 0 + Total 0 +*/ + +func splitBlkioStatLine(r rune) bool { + return r == ' ' || r == ':' +} + +func getBlkioStat(dir, file string) ([]cgroups.BlkioStatEntry, error) { + var blkioStats []cgroups.BlkioStatEntry + f, err := cgroups.OpenFile(dir, file, os.O_RDONLY) + if err != nil { + if os.IsNotExist(err) { + return blkioStats, nil + } + return nil, err + } + defer f.Close() + + sc := bufio.NewScanner(f) + for sc.Scan() { + // format: dev type amount + fields := 
strings.FieldsFunc(sc.Text(), splitBlkioStatLine) + if len(fields) < 3 { + if len(fields) == 2 && fields[0] == "Total" { + // skip total line + continue + } else { + return nil, malformedLine(dir, file, sc.Text()) + } + } + + v, err := strconv.ParseUint(fields[0], 10, 64) + if err != nil { + return nil, &parseError{Path: dir, File: file, Err: err} + } + major := v + + v, err = strconv.ParseUint(fields[1], 10, 64) + if err != nil { + return nil, &parseError{Path: dir, File: file, Err: err} + } + minor := v + + op := "" + valueField := 2 + if len(fields) == 4 { + op = fields[2] + valueField = 3 + } + v, err = strconv.ParseUint(fields[valueField], 10, 64) + if err != nil { + return nil, &parseError{Path: dir, File: file, Err: err} + } + blkioStats = append(blkioStats, cgroups.BlkioStatEntry{Major: major, Minor: minor, Op: op, Value: v}) + } + if err := sc.Err(); err != nil { + return nil, &parseError{Path: dir, File: file, Err: err} + } + + return blkioStats, nil +} + +func (s *BlkioGroup) GetStats(path string, stats *cgroups.Stats) error { + type blkioStatInfo struct { + filename string + blkioStatEntriesPtr *[]cgroups.BlkioStatEntry + } + bfqDebugStats := []blkioStatInfo{ + { + filename: "blkio.bfq.sectors_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.SectorsRecursive, + }, + { + filename: "blkio.bfq.io_service_time_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoServiceTimeRecursive, + }, + { + filename: "blkio.bfq.io_wait_time_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoWaitTimeRecursive, + }, + { + filename: "blkio.bfq.io_merged_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoMergedRecursive, + }, + { + filename: "blkio.bfq.io_queued_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoQueuedRecursive, + }, + { + filename: "blkio.bfq.time_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoTimeRecursive, + }, + { + filename: "blkio.bfq.io_serviced_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoServicedRecursive, + }, + 
{ + filename: "blkio.bfq.io_service_bytes_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoServiceBytesRecursive, + }, + } + bfqStats := []blkioStatInfo{ + { + filename: "blkio.bfq.io_serviced_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoServicedRecursive, + }, + { + filename: "blkio.bfq.io_service_bytes_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoServiceBytesRecursive, + }, + } + cfqStats := []blkioStatInfo{ + { + filename: "blkio.sectors_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.SectorsRecursive, + }, + { + filename: "blkio.io_service_time_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoServiceTimeRecursive, + }, + { + filename: "blkio.io_wait_time_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoWaitTimeRecursive, + }, + { + filename: "blkio.io_merged_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoMergedRecursive, + }, + { + filename: "blkio.io_queued_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoQueuedRecursive, + }, + { + filename: "blkio.time_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoTimeRecursive, + }, + { + filename: "blkio.io_serviced_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoServicedRecursive, + }, + { + filename: "blkio.io_service_bytes_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoServiceBytesRecursive, + }, + } + throttleRecursiveStats := []blkioStatInfo{ + { + filename: "blkio.throttle.io_serviced_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoServicedRecursive, + }, + { + filename: "blkio.throttle.io_service_bytes_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoServiceBytesRecursive, + }, + } + baseStats := []blkioStatInfo{ + { + filename: "blkio.throttle.io_serviced", + blkioStatEntriesPtr: &stats.BlkioStats.IoServicedRecursive, + }, + { + filename: "blkio.throttle.io_service_bytes", + blkioStatEntriesPtr: &stats.BlkioStats.IoServiceBytesRecursive, + }, + } + orderedStats := [][]blkioStatInfo{ + bfqDebugStats, + bfqStats, + cfqStats, 
+ throttleRecursiveStats, + baseStats, + } + + var blkioStats []cgroups.BlkioStatEntry + var err error + + for _, statGroup := range orderedStats { + for i, statInfo := range statGroup { + if blkioStats, err = getBlkioStat(path, statInfo.filename); err != nil || blkioStats == nil { + // if error occurs on first file, move to next group + if i == 0 { + break + } + return err + } + *statInfo.blkioStatEntriesPtr = blkioStats + // finish if all stats are gathered + if i == len(statGroup)-1 { + return nil + } + } + } + return nil +} + +func (s *BlkioGroup) detectWeightFilenames(path string) { + if s.weightFilename != "" { + // Already detected. + return + } + if cgroups.PathExists(filepath.Join(path, "blkio.weight")) { + s.weightFilename = "blkio.weight" + s.weightDeviceFilename = "blkio.weight_device" + } else { + s.weightFilename = "blkio.bfq.weight" + s.weightDeviceFilename = "blkio.bfq.weight_device" + } +} diff --git a/fs/blkio_test.go b/fs/blkio_test.go new file mode 100644 index 0000000..31aafab --- /dev/null +++ b/fs/blkio_test.go @@ -0,0 +1,862 @@ +package fs + +import ( + "strconv" + "testing" + + "github.com/opencontainers/cgroups" + "github.com/opencontainers/cgroups/fscommon" +) + +const ( + sectorsRecursiveContents = `8:0 1024` + sectorsRecursiveContentsBFQ = `8:0 2048` + serviceBytesRecursiveContents = `8:0 Read 100 +8:0 Write 200 +8:0 Sync 300 +8:0 Async 500 +8:0 Total 500 +Total 500` + + serviceBytesRecursiveContentsBFQ = `8:0 Read 1100 +8:0 Write 1200 +8:0 Sync 1300 +8:0 Async 1500 +8:0 Total 1500 +Total 1500` + servicedRecursiveContents = `8:0 Read 10 +8:0 Write 40 +8:0 Sync 20 +8:0 Async 30 +8:0 Total 50 +Total 50` + servicedRecursiveContentsBFQ = `8:0 Read 11 +8:0 Write 41 +8:0 Sync 21 +8:0 Async 31 +8:0 Total 51 +Total 51` + queuedRecursiveContents = `8:0 Read 1 +8:0 Write 4 +8:0 Sync 2 +8:0 Async 3 +8:0 Total 5 +Total 5` + queuedRecursiveContentsBFQ = `8:0 Read 2 +8:0 Write 3 +8:0 Sync 4 +8:0 Async 5 +8:0 Total 6 +Total 6` + 
serviceTimeRecursiveContents = `8:0 Read 173959 +8:0 Write 0 +8:0 Sync 0 +8:0 Async 173959 +8:0 Total 17395 +Total 17395` + serviceTimeRecursiveContentsBFQ = `8:0 Read 173959 +8:0 Write 0 +8:0 Sync 0 +8:0 Async 173 +8:0 Total 174 +Total 174` + waitTimeRecursiveContents = `8:0 Read 15571 +8:0 Write 0 +8:0 Sync 0 +8:0 Async 15571 +8:0 Total 15571` + waitTimeRecursiveContentsBFQ = `8:0 Read 1557 +8:0 Write 0 +8:0 Sync 0 +8:0 Async 1557 +8:0 Total 1557` + mergedRecursiveContents = `8:0 Read 5 +8:0 Write 10 +8:0 Sync 0 +8:0 Async 0 +8:0 Total 15 +Total 15` + mergedRecursiveContentsBFQ = `8:0 Read 51 +8:0 Write 101 +8:0 Sync 0 +8:0 Async 0 +8:0 Total 151 +Total 151` + timeRecursiveContents = `8:0 8` + timeRecursiveContentsBFQ = `8:0 16` + throttleServiceBytes = `8:0 Read 11030528 +8:0 Write 23 +8:0 Sync 42 +8:0 Async 11030528 +8:0 Total 11030528 +252:0 Read 11030528 +252:0 Write 23 +252:0 Sync 42 +252:0 Async 11030528 +252:0 Total 11030528 +Total 22061056` + throttleServiceBytesRecursive = `8:0 Read 110305281 +8:0 Write 231 +8:0 Sync 421 +8:0 Async 110305281 +8:0 Total 110305281 +252:0 Read 110305281 +252:0 Write 231 +252:0 Sync 421 +252:0 Async 110305281 +252:0 Total 110305281 +Total 220610561` + throttleServiced = `8:0 Read 164 +8:0 Write 23 +8:0 Sync 42 +8:0 Async 164 +8:0 Total 164 +252:0 Read 164 +252:0 Write 23 +252:0 Sync 42 +252:0 Async 164 +252:0 Total 164 +Total 328` + throttleServicedRecursive = `8:0 Read 1641 +8:0 Write 231 +8:0 Sync 421 +8:0 Async 1641 +8:0 Total 1641 +252:0 Read 1641 +252:0 Write 231 +252:0 Sync 421 +252:0 Async 1641 +252:0 Total 1641 +Total 3281` +) + +var blkioBFQDebugStatsTestFiles = map[string]string{ + "blkio.bfq.io_service_bytes_recursive": serviceBytesRecursiveContentsBFQ, + "blkio.bfq.io_serviced_recursive": servicedRecursiveContentsBFQ, + "blkio.bfq.io_queued_recursive": queuedRecursiveContentsBFQ, + "blkio.bfq.io_service_time_recursive": serviceTimeRecursiveContentsBFQ, + "blkio.bfq.io_wait_time_recursive": 
waitTimeRecursiveContentsBFQ, + "blkio.bfq.io_merged_recursive": mergedRecursiveContentsBFQ, + "blkio.bfq.time_recursive": timeRecursiveContentsBFQ, + "blkio.bfq.sectors_recursive": sectorsRecursiveContentsBFQ, +} + +var blkioBFQStatsTestFiles = map[string]string{ + "blkio.bfq.io_service_bytes_recursive": serviceBytesRecursiveContentsBFQ, + "blkio.bfq.io_serviced_recursive": servicedRecursiveContentsBFQ, +} + +var blkioCFQStatsTestFiles = map[string]string{ + "blkio.io_service_bytes_recursive": serviceBytesRecursiveContents, + "blkio.io_serviced_recursive": servicedRecursiveContents, + "blkio.io_queued_recursive": queuedRecursiveContents, + "blkio.io_service_time_recursive": serviceTimeRecursiveContents, + "blkio.io_wait_time_recursive": waitTimeRecursiveContents, + "blkio.io_merged_recursive": mergedRecursiveContents, + "blkio.time_recursive": timeRecursiveContents, + "blkio.sectors_recursive": sectorsRecursiveContents, +} + +type blkioStatFailureTestCase struct { + desc string + filename string +} + +func appendBlkioStatEntry(blkioStatEntries *[]cgroups.BlkioStatEntry, major, minor, value uint64, op string) { //nolint:unparam + *blkioStatEntries = append(*blkioStatEntries, cgroups.BlkioStatEntry{Major: major, Minor: minor, Value: value, Op: op}) +} + +func TestBlkioSetWeight(t *testing.T) { + const ( + weightBefore = 100 + weightAfter = 200 + ) + + for _, legacyIOScheduler := range []bool{false, true} { + // Populate cgroup + path := tempDir(t, "blkio") + weightFilename := "blkio.bfq.weight" + if legacyIOScheduler { + weightFilename = "blkio.weight" + } + writeFileContents(t, path, map[string]string{ + weightFilename: strconv.Itoa(weightBefore), + }) + // Apply new configuration + r := &cgroups.Resources{ + BlkioWeight: weightAfter, + } + blkio := &BlkioGroup{} + if err := blkio.Set(path, r); err != nil { + t.Fatal(err) + } + // Verify results + if weightFilename != blkio.weightFilename { + t.Fatalf("weight filename detection failed: expected %q, detected %q", 
weightFilename, blkio.weightFilename) + } + value, err := fscommon.GetCgroupParamUint(path, weightFilename) + if err != nil { + t.Fatal(err) + } + if value != weightAfter { + t.Fatalf("Got the wrong value, set %s failed.", weightFilename) + } + } +} + +func TestBlkioSetWeightDevice(t *testing.T) { + const ( + weightDeviceBefore = "8:0 400" + ) + + for _, legacyIOScheduler := range []bool{false, true} { + // Populate cgroup + path := tempDir(t, "blkio") + weightFilename := "blkio.bfq.weight" + weightDeviceFilename := "blkio.bfq.weight_device" + if legacyIOScheduler { + weightFilename = "blkio.weight" + weightDeviceFilename = "blkio.weight_device" + } + writeFileContents(t, path, map[string]string{ + weightFilename: "", + weightDeviceFilename: weightDeviceBefore, + }) + // Apply new configuration + wd := cgroups.NewWeightDevice(8, 0, 500, 0) + weightDeviceAfter := wd.WeightString() + r := &cgroups.Resources{ + BlkioWeightDevice: []*cgroups.WeightDevice{wd}, + } + blkio := &BlkioGroup{} + if err := blkio.Set(path, r); err != nil { + t.Fatal(err) + } + // Verify results + if weightDeviceFilename != blkio.weightDeviceFilename { + t.Fatalf("weight_device filename detection failed: expected %q, detected %q", weightDeviceFilename, blkio.weightDeviceFilename) + } + value, err := fscommon.GetCgroupParamString(path, weightDeviceFilename) + if err != nil { + t.Fatal(err) + } + if value != weightDeviceAfter { + t.Fatalf("Got the wrong value, set %s failed.", weightDeviceFilename) + } + } +} + +// regression #274 +func TestBlkioSetMultipleWeightDevice(t *testing.T) { + path := tempDir(t, "blkio") + + const ( + weightDeviceBefore = "8:0 400" + ) + + wd1 := cgroups.NewWeightDevice(8, 0, 500, 0) + wd2 := cgroups.NewWeightDevice(8, 16, 500, 0) + // we cannot actually set and check both because normal os.WriteFile + // when writing to cgroup file will overwrite the whole file content instead + // of updating it as the kernel is doing. 
Just check the second device + // is present will suffice for the test to ensure multiple writes are done. + weightDeviceAfter := wd2.WeightString() + + blkio := &BlkioGroup{} + blkio.detectWeightFilenames(path) + if blkio.weightDeviceFilename != "blkio.bfq.weight_device" { + t.Fatalf("when blkio controller is unavailable, expected to use \"blkio.bfq.weight_device\", tried to use %q", blkio.weightDeviceFilename) + } + writeFileContents(t, path, map[string]string{ + blkio.weightDeviceFilename: weightDeviceBefore, + }) + + r := &cgroups.Resources{ + BlkioWeightDevice: []*cgroups.WeightDevice{wd1, wd2}, + } + if err := blkio.Set(path, r); err != nil { + t.Fatal(err) + } + + value, err := fscommon.GetCgroupParamString(path, blkio.weightDeviceFilename) + if err != nil { + t.Fatal(err) + } + if value != weightDeviceAfter { + t.Fatalf("Got the wrong value, set %s failed.", blkio.weightDeviceFilename) + } +} + +func TestBlkioBFQDebugStats(t *testing.T) { + path := tempDir(t, "blkio") + writeFileContents(t, path, blkioBFQDebugStatsTestFiles) + blkio := &BlkioGroup{} + actualStats := *cgroups.NewStats() + err := blkio.GetStats(path, &actualStats) + if err != nil { + t.Fatal(err) + } + + expectedStats := cgroups.BlkioStats{} + appendBlkioStatEntry(&expectedStats.SectorsRecursive, 8, 0, 2048, "") + + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 1100, "Read") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 1200, "Write") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 1300, "Sync") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 1500, "Async") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 1500, "Total") + + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 11, "Read") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 41, "Write") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 21, "Sync") + 
appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 31, "Async") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 51, "Total") + + appendBlkioStatEntry(&expectedStats.IoQueuedRecursive, 8, 0, 2, "Read") + appendBlkioStatEntry(&expectedStats.IoQueuedRecursive, 8, 0, 3, "Write") + appendBlkioStatEntry(&expectedStats.IoQueuedRecursive, 8, 0, 4, "Sync") + appendBlkioStatEntry(&expectedStats.IoQueuedRecursive, 8, 0, 5, "Async") + appendBlkioStatEntry(&expectedStats.IoQueuedRecursive, 8, 0, 6, "Total") + + appendBlkioStatEntry(&expectedStats.IoServiceTimeRecursive, 8, 0, 173959, "Read") + appendBlkioStatEntry(&expectedStats.IoServiceTimeRecursive, 8, 0, 0, "Write") + appendBlkioStatEntry(&expectedStats.IoServiceTimeRecursive, 8, 0, 0, "Sync") + appendBlkioStatEntry(&expectedStats.IoServiceTimeRecursive, 8, 0, 173, "Async") + appendBlkioStatEntry(&expectedStats.IoServiceTimeRecursive, 8, 0, 174, "Total") + + appendBlkioStatEntry(&expectedStats.IoWaitTimeRecursive, 8, 0, 1557, "Read") + appendBlkioStatEntry(&expectedStats.IoWaitTimeRecursive, 8, 0, 0, "Write") + appendBlkioStatEntry(&expectedStats.IoWaitTimeRecursive, 8, 0, 0, "Sync") + appendBlkioStatEntry(&expectedStats.IoWaitTimeRecursive, 8, 0, 1557, "Async") + appendBlkioStatEntry(&expectedStats.IoWaitTimeRecursive, 8, 0, 1557, "Total") + + appendBlkioStatEntry(&expectedStats.IoMergedRecursive, 8, 0, 51, "Read") + appendBlkioStatEntry(&expectedStats.IoMergedRecursive, 8, 0, 101, "Write") + appendBlkioStatEntry(&expectedStats.IoMergedRecursive, 8, 0, 0, "Sync") + appendBlkioStatEntry(&expectedStats.IoMergedRecursive, 8, 0, 0, "Async") + appendBlkioStatEntry(&expectedStats.IoMergedRecursive, 8, 0, 151, "Total") + + appendBlkioStatEntry(&expectedStats.IoTimeRecursive, 8, 0, 16, "") + + expectBlkioStatsEquals(t, expectedStats, actualStats.BlkioStats) +} + +func TestBlkioMultipleStatsFiles(t *testing.T) { + path := tempDir(t, "blkio") + writeFileContents(t, path, blkioBFQDebugStatsTestFiles) + 
writeFileContents(t, path, blkioCFQStatsTestFiles) + blkio := &BlkioGroup{} + actualStats := *cgroups.NewStats() + err := blkio.GetStats(path, &actualStats) + if err != nil { + t.Fatal(err) + } + + expectedStats := cgroups.BlkioStats{} + appendBlkioStatEntry(&expectedStats.SectorsRecursive, 8, 0, 2048, "") + + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 1100, "Read") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 1200, "Write") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 1300, "Sync") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 1500, "Async") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 1500, "Total") + + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 11, "Read") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 41, "Write") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 21, "Sync") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 31, "Async") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 51, "Total") + + appendBlkioStatEntry(&expectedStats.IoQueuedRecursive, 8, 0, 2, "Read") + appendBlkioStatEntry(&expectedStats.IoQueuedRecursive, 8, 0, 3, "Write") + appendBlkioStatEntry(&expectedStats.IoQueuedRecursive, 8, 0, 4, "Sync") + appendBlkioStatEntry(&expectedStats.IoQueuedRecursive, 8, 0, 5, "Async") + appendBlkioStatEntry(&expectedStats.IoQueuedRecursive, 8, 0, 6, "Total") + + appendBlkioStatEntry(&expectedStats.IoServiceTimeRecursive, 8, 0, 173959, "Read") + appendBlkioStatEntry(&expectedStats.IoServiceTimeRecursive, 8, 0, 0, "Write") + appendBlkioStatEntry(&expectedStats.IoServiceTimeRecursive, 8, 0, 0, "Sync") + appendBlkioStatEntry(&expectedStats.IoServiceTimeRecursive, 8, 0, 173, "Async") + appendBlkioStatEntry(&expectedStats.IoServiceTimeRecursive, 8, 0, 174, "Total") + + appendBlkioStatEntry(&expectedStats.IoWaitTimeRecursive, 8, 0, 1557, "Read") + 
appendBlkioStatEntry(&expectedStats.IoWaitTimeRecursive, 8, 0, 0, "Write") + appendBlkioStatEntry(&expectedStats.IoWaitTimeRecursive, 8, 0, 0, "Sync") + appendBlkioStatEntry(&expectedStats.IoWaitTimeRecursive, 8, 0, 1557, "Async") + appendBlkioStatEntry(&expectedStats.IoWaitTimeRecursive, 8, 0, 1557, "Total") + + appendBlkioStatEntry(&expectedStats.IoMergedRecursive, 8, 0, 51, "Read") + appendBlkioStatEntry(&expectedStats.IoMergedRecursive, 8, 0, 101, "Write") + appendBlkioStatEntry(&expectedStats.IoMergedRecursive, 8, 0, 0, "Sync") + appendBlkioStatEntry(&expectedStats.IoMergedRecursive, 8, 0, 0, "Async") + appendBlkioStatEntry(&expectedStats.IoMergedRecursive, 8, 0, 151, "Total") + + appendBlkioStatEntry(&expectedStats.IoTimeRecursive, 8, 0, 16, "") + + expectBlkioStatsEquals(t, expectedStats, actualStats.BlkioStats) +} + +func TestBlkioBFQStats(t *testing.T) { + path := tempDir(t, "blkio") + writeFileContents(t, path, blkioBFQStatsTestFiles) + blkio := &BlkioGroup{} + actualStats := *cgroups.NewStats() + err := blkio.GetStats(path, &actualStats) + if err != nil { + t.Fatal(err) + } + + expectedStats := cgroups.BlkioStats{} + + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 1100, "Read") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 1200, "Write") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 1300, "Sync") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 1500, "Async") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 1500, "Total") + + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 11, "Read") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 41, "Write") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 21, "Sync") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 31, "Async") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 51, "Total") + + expectBlkioStatsEquals(t, expectedStats, 
actualStats.BlkioStats) +} + +func TestBlkioStatsNoFilesBFQDebug(t *testing.T) { + if testing.Short() { + t.Skip("skipping test in short mode.") + } + testCases := []blkioStatFailureTestCase{ + { + desc: "missing blkio.bfq.io_service_bytes_recursive file", + filename: "blkio.bfq.io_service_bytes_recursive", + }, + { + desc: "missing blkio.bfq.io_serviced_recursive file", + filename: "blkio.bfq.io_serviced_recursive", + }, + { + desc: "missing blkio.bfq.io_queued_recursive file", + filename: "blkio.bfq.io_queued_recursive", + }, + { + desc: "missing blkio.bfq.sectors_recursive file", + filename: "blkio.bfq.sectors_recursive", + }, + { + desc: "missing blkio.bfq.io_service_time_recursive file", + filename: "blkio.bfq.io_service_time_recursive", + }, + { + desc: "missing blkio.bfq.io_wait_time_recursive file", + filename: "blkio.bfq.io_wait_time_recursive", + }, + { + desc: "missing blkio.bfq.io_merged_recursive file", + filename: "blkio.bfq.io_merged_recursive", + }, + { + desc: "missing blkio.bfq.time_recursive file", + filename: "blkio.bfq.time_recursive", + }, + } + + for _, testCase := range testCases { + path := tempDir(t, "cpuset") + + tempBlkioTestFiles := map[string]string{} + for i, v := range blkioBFQDebugStatsTestFiles { + tempBlkioTestFiles[i] = v + } + delete(tempBlkioTestFiles, testCase.filename) + + writeFileContents(t, path, tempBlkioTestFiles) + cpuset := &CpusetGroup{} + actualStats := *cgroups.NewStats() + err := cpuset.GetStats(path, &actualStats) + if err != nil { + t.Errorf("%s: want no error, got: %+v", testCase.desc, err) + } + } +} + +func TestBlkioCFQStats(t *testing.T) { + path := tempDir(t, "blkio") + writeFileContents(t, path, blkioCFQStatsTestFiles) + + blkio := &BlkioGroup{} + actualStats := *cgroups.NewStats() + err := blkio.GetStats(path, &actualStats) + if err != nil { + t.Fatal(err) + } + + // Verify expected stats. 
+ expectedStats := cgroups.BlkioStats{} + appendBlkioStatEntry(&expectedStats.SectorsRecursive, 8, 0, 1024, "") + + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 100, "Read") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 200, "Write") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 300, "Sync") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 500, "Async") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 500, "Total") + + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 10, "Read") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 40, "Write") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 20, "Sync") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 30, "Async") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 50, "Total") + + appendBlkioStatEntry(&expectedStats.IoQueuedRecursive, 8, 0, 1, "Read") + appendBlkioStatEntry(&expectedStats.IoQueuedRecursive, 8, 0, 4, "Write") + appendBlkioStatEntry(&expectedStats.IoQueuedRecursive, 8, 0, 2, "Sync") + appendBlkioStatEntry(&expectedStats.IoQueuedRecursive, 8, 0, 3, "Async") + appendBlkioStatEntry(&expectedStats.IoQueuedRecursive, 8, 0, 5, "Total") + + appendBlkioStatEntry(&expectedStats.IoServiceTimeRecursive, 8, 0, 173959, "Read") + appendBlkioStatEntry(&expectedStats.IoServiceTimeRecursive, 8, 0, 0, "Write") + appendBlkioStatEntry(&expectedStats.IoServiceTimeRecursive, 8, 0, 0, "Sync") + appendBlkioStatEntry(&expectedStats.IoServiceTimeRecursive, 8, 0, 173959, "Async") + appendBlkioStatEntry(&expectedStats.IoServiceTimeRecursive, 8, 0, 17395, "Total") + + appendBlkioStatEntry(&expectedStats.IoWaitTimeRecursive, 8, 0, 15571, "Read") + appendBlkioStatEntry(&expectedStats.IoWaitTimeRecursive, 8, 0, 0, "Write") + appendBlkioStatEntry(&expectedStats.IoWaitTimeRecursive, 8, 0, 0, "Sync") + appendBlkioStatEntry(&expectedStats.IoWaitTimeRecursive, 
8, 0, 15571, "Async") + appendBlkioStatEntry(&expectedStats.IoWaitTimeRecursive, 8, 0, 15571, "Total") + + appendBlkioStatEntry(&expectedStats.IoMergedRecursive, 8, 0, 5, "Read") + appendBlkioStatEntry(&expectedStats.IoMergedRecursive, 8, 0, 10, "Write") + appendBlkioStatEntry(&expectedStats.IoMergedRecursive, 8, 0, 0, "Sync") + appendBlkioStatEntry(&expectedStats.IoMergedRecursive, 8, 0, 0, "Async") + appendBlkioStatEntry(&expectedStats.IoMergedRecursive, 8, 0, 15, "Total") + + appendBlkioStatEntry(&expectedStats.IoTimeRecursive, 8, 0, 8, "") + + expectBlkioStatsEquals(t, expectedStats, actualStats.BlkioStats) +} + +func TestBlkioStatsNoFilesCFQ(t *testing.T) { + if testing.Short() { + t.Skip("skipping test in short mode.") + } + testCases := []blkioStatFailureTestCase{ + { + desc: "missing blkio.io_service_bytes_recursive file", + filename: "blkio.io_service_bytes_recursive", + }, + { + desc: "missing blkio.io_serviced_recursive file", + filename: "blkio.io_serviced_recursive", + }, + { + desc: "missing blkio.io_queued_recursive file", + filename: "blkio.io_queued_recursive", + }, + { + desc: "missing blkio.sectors_recursive file", + filename: "blkio.sectors_recursive", + }, + { + desc: "missing blkio.io_service_time_recursive file", + filename: "blkio.io_service_time_recursive", + }, + { + desc: "missing blkio.io_wait_time_recursive file", + filename: "blkio.io_wait_time_recursive", + }, + { + desc: "missing blkio.io_merged_recursive file", + filename: "blkio.io_merged_recursive", + }, + { + desc: "missing blkio.time_recursive file", + filename: "blkio.time_recursive", + }, + } + + // Each iteration removes one blkio.* stat file and checks that + // BlkioGroup.GetStats still succeeds (missing files are tolerated). + for _, testCase := range testCases { + path := tempDir(t, "blkio") + + tempBlkioTestFiles := map[string]string{} + for i, v := range blkioCFQStatsTestFiles { + tempBlkioTestFiles[i] = v + } + delete(tempBlkioTestFiles, testCase.filename) + + writeFileContents(t, path, tempBlkioTestFiles) + blkio := &BlkioGroup{} + actualStats := *cgroups.NewStats() + err := blkio.GetStats(path, 
&actualStats) + if err != nil { + t.Errorf("%s: want no error, got %+v", testCase.desc, err) + } + } +} + +func TestBlkioStatsUnexpectedNumberOfFields(t *testing.T) { + path := tempDir(t, "blkio") + writeFileContents(t, path, map[string]string{ + "blkio.io_service_bytes_recursive": "8:0 Read 100 100", + "blkio.io_serviced_recursive": servicedRecursiveContents, + "blkio.io_queued_recursive": queuedRecursiveContents, + "blkio.sectors_recursive": sectorsRecursiveContents, + "blkio.io_service_time_recursive": serviceTimeRecursiveContents, + "blkio.io_wait_time_recursive": waitTimeRecursiveContents, + "blkio.io_merged_recursive": mergedRecursiveContents, + "blkio.time_recursive": timeRecursiveContents, + }) + + blkio := &BlkioGroup{} + actualStats := *cgroups.NewStats() + err := blkio.GetStats(path, &actualStats) + if err == nil { + t.Fatal("Expected to fail, but did not") + } +} + +func TestBlkioStatsUnexpectedFieldType(t *testing.T) { + path := tempDir(t, "blkio") + writeFileContents(t, path, map[string]string{ + "blkio.io_service_bytes_recursive": "8:0 Read Write", + "blkio.io_serviced_recursive": servicedRecursiveContents, + "blkio.io_queued_recursive": queuedRecursiveContents, + "blkio.sectors_recursive": sectorsRecursiveContents, + "blkio.io_service_time_recursive": serviceTimeRecursiveContents, + "blkio.io_wait_time_recursive": waitTimeRecursiveContents, + "blkio.io_merged_recursive": mergedRecursiveContents, + "blkio.time_recursive": timeRecursiveContents, + }) + + blkio := &BlkioGroup{} + actualStats := *cgroups.NewStats() + err := blkio.GetStats(path, &actualStats) + if err == nil { + t.Fatal("Expected to fail, but did not") + } +} + +func TestThrottleRecursiveBlkioStats(t *testing.T) { + path := tempDir(t, "blkio") + writeFileContents(t, path, map[string]string{ + "blkio.io_service_bytes_recursive": "", + "blkio.io_serviced_recursive": "", + "blkio.io_queued_recursive": "", + "blkio.sectors_recursive": "", + "blkio.io_service_time_recursive": "", + 
"blkio.io_wait_time_recursive": "", + "blkio.io_merged_recursive": "", + "blkio.time_recursive": "", + "blkio.throttle.io_service_bytes_recursive": throttleServiceBytesRecursive, + "blkio.throttle.io_serviced_recursive": throttleServicedRecursive, + }) + + blkio := &BlkioGroup{} + actualStats := *cgroups.NewStats() + err := blkio.GetStats(path, &actualStats) + if err != nil { + t.Fatal(err) + } + + // Verify expected stats. + expectedStats := cgroups.BlkioStats{} + + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 110305281, "Read") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 231, "Write") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 421, "Sync") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 110305281, "Async") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 110305281, "Total") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 252, 0, 110305281, "Read") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 252, 0, 231, "Write") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 252, 0, 421, "Sync") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 252, 0, 110305281, "Async") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 252, 0, 110305281, "Total") + + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 1641, "Read") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 231, "Write") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 421, "Sync") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 1641, "Async") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 1641, "Total") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 252, 0, 1641, "Read") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 252, 0, 231, "Write") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 252, 0, 421, "Sync") + 
appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 252, 0, 1641, "Async") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 252, 0, 1641, "Total") + + expectBlkioStatsEquals(t, expectedStats, actualStats.BlkioStats) +} + +func TestThrottleBlkioStats(t *testing.T) { + path := tempDir(t, "blkio") + writeFileContents(t, path, map[string]string{ + "blkio.io_service_bytes_recursive": "", + "blkio.io_serviced_recursive": "", + "blkio.io_queued_recursive": "", + "blkio.sectors_recursive": "", + "blkio.io_service_time_recursive": "", + "blkio.io_wait_time_recursive": "", + "blkio.io_merged_recursive": "", + "blkio.time_recursive": "", + "blkio.throttle.io_service_bytes": throttleServiceBytes, + "blkio.throttle.io_serviced": throttleServiced, + }) + + blkio := &BlkioGroup{} + actualStats := *cgroups.NewStats() + err := blkio.GetStats(path, &actualStats) + if err != nil { + t.Fatal(err) + } + + // Verify expected stats. + expectedStats := cgroups.BlkioStats{} + + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 11030528, "Read") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 23, "Write") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 42, "Sync") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 11030528, "Async") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 11030528, "Total") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 252, 0, 11030528, "Read") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 252, 0, 23, "Write") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 252, 0, 42, "Sync") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 252, 0, 11030528, "Async") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 252, 0, 11030528, "Total") + + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 164, "Read") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 23, 
"Write") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 42, "Sync") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 164, "Async") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 164, "Total") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 252, 0, 164, "Read") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 252, 0, 23, "Write") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 252, 0, 42, "Sync") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 252, 0, 164, "Async") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 252, 0, 164, "Total") + + expectBlkioStatsEquals(t, expectedStats, actualStats.BlkioStats) +} + +func TestBlkioSetThrottleReadBpsDevice(t *testing.T) { + path := tempDir(t, "blkio") + + const ( + throttleBefore = `8:0 1024` + ) + + td := cgroups.NewThrottleDevice(8, 0, 2048) + throttleAfter := td.String() + + writeFileContents(t, path, map[string]string{ + "blkio.throttle.read_bps_device": throttleBefore, + }) + + r := &cgroups.Resources{ + BlkioThrottleReadBpsDevice: []*cgroups.ThrottleDevice{td}, + } + blkio := &BlkioGroup{} + if err := blkio.Set(path, r); err != nil { + t.Fatal(err) + } + + value, err := fscommon.GetCgroupParamString(path, "blkio.throttle.read_bps_device") + if err != nil { + t.Fatal(err) + } + if value != throttleAfter { + t.Fatal("Got the wrong value, set blkio.throttle.read_bps_device failed.") + } +} + +func TestBlkioSetThrottleWriteBpsDevice(t *testing.T) { + path := tempDir(t, "blkio") + + const ( + throttleBefore = `8:0 1024` + ) + + td := cgroups.NewThrottleDevice(8, 0, 2048) + throttleAfter := td.String() + + writeFileContents(t, path, map[string]string{ + "blkio.throttle.write_bps_device": throttleBefore, + }) + + r := &cgroups.Resources{ + BlkioThrottleWriteBpsDevice: []*cgroups.ThrottleDevice{td}, + } + blkio := &BlkioGroup{} + if err := blkio.Set(path, r); err != nil { + t.Fatal(err) + } + + value, err := 
fscommon.GetCgroupParamString(path, "blkio.throttle.write_bps_device") + if err != nil { + t.Fatal(err) + } + if value != throttleAfter { + t.Fatal("Got the wrong value, set blkio.throttle.write_bps_device failed.") + } +} + +func TestBlkioSetThrottleReadIOpsDevice(t *testing.T) { + path := tempDir(t, "blkio") + + const ( + throttleBefore = `8:0 1024` + ) + + td := cgroups.NewThrottleDevice(8, 0, 2048) + throttleAfter := td.String() + + writeFileContents(t, path, map[string]string{ + "blkio.throttle.read_iops_device": throttleBefore, + }) + + r := &cgroups.Resources{ + BlkioThrottleReadIOPSDevice: []*cgroups.ThrottleDevice{td}, + } + blkio := &BlkioGroup{} + if err := blkio.Set(path, r); err != nil { + t.Fatal(err) + } + + value, err := fscommon.GetCgroupParamString(path, "blkio.throttle.read_iops_device") + if err != nil { + t.Fatal(err) + } + if value != throttleAfter { + t.Fatal("Got the wrong value, set blkio.throttle.read_iops_device failed.") + } +} + +func TestBlkioSetThrottleWriteIOpsDevice(t *testing.T) { + path := tempDir(t, "blkio") + + const ( + throttleBefore = `8:0 1024` + ) + + td := cgroups.NewThrottleDevice(8, 0, 2048) + throttleAfter := td.String() + + writeFileContents(t, path, map[string]string{ + "blkio.throttle.write_iops_device": throttleBefore, + }) + + r := &cgroups.Resources{ + BlkioThrottleWriteIOPSDevice: []*cgroups.ThrottleDevice{td}, + } + blkio := &BlkioGroup{} + if err := blkio.Set(path, r); err != nil { + t.Fatal(err) + } + + value, err := fscommon.GetCgroupParamString(path, "blkio.throttle.write_iops_device") + if err != nil { + t.Fatal(err) + } + if value != throttleAfter { + t.Fatal("Got the wrong value, set blkio.throttle.write_iops_device failed.") + } +} diff --git a/fs/cpu.go b/fs/cpu.go new file mode 100644 index 0000000..3e05788 --- /dev/null +++ b/fs/cpu.go @@ -0,0 +1,181 @@ +package fs + +import ( + "bufio" + "errors" + "fmt" + "os" + "strconv" + + "github.com/opencontainers/cgroups" + 
"github.com/opencontainers/cgroups/fscommon" + "golang.org/x/sys/unix" +) + +type CpuGroup struct{} + +func (s *CpuGroup) Name() string { + return "cpu" +} + +func (s *CpuGroup) Apply(path string, r *cgroups.Resources, pid int) error { + if err := os.MkdirAll(path, 0o755); err != nil { + return err + } + // We should set the real-Time group scheduling settings before moving + // in the process because if the process is already in SCHED_RR mode + // and no RT bandwidth is set, adding it will fail. + if err := s.SetRtSched(path, r); err != nil { + return err + } + // Since we are not using apply(), we need to place the pid + // into the procs file. + return cgroups.WriteCgroupProc(path, pid) +} + +func (s *CpuGroup) SetRtSched(path string, r *cgroups.Resources) error { + var period string + if r.CpuRtPeriod != 0 { + period = strconv.FormatUint(r.CpuRtPeriod, 10) + if err := cgroups.WriteFile(path, "cpu.rt_period_us", period); err != nil { + // The values of cpu.rt_period_us and cpu.rt_runtime_us + // are inter-dependent and need to be set in a proper order. + // If the kernel rejects the new period value with EINVAL + // and the new runtime value is also being set, let's + // ignore the error for now and retry later. + if !errors.Is(err, unix.EINVAL) || r.CpuRtRuntime == 0 { + return err + } + } else { + period = "" + } + } + if r.CpuRtRuntime != 0 { + if err := cgroups.WriteFile(path, "cpu.rt_runtime_us", strconv.FormatInt(r.CpuRtRuntime, 10)); err != nil { + return err + } + if period != "" { + if err := cgroups.WriteFile(path, "cpu.rt_period_us", period); err != nil { + return err + } + } + } + return nil +} + +func (s *CpuGroup) Set(path string, r *cgroups.Resources) error { + if r.CpuShares != 0 { + shares := r.CpuShares + if err := cgroups.WriteFile(path, "cpu.shares", strconv.FormatUint(shares, 10)); err != nil { + return err + } + // read it back + sharesRead, err := fscommon.GetCgroupParamUint(path, "cpu.shares") + if err != nil { + return err + } + // ... 
and check + if shares > sharesRead { + return fmt.Errorf("the maximum allowed cpu-shares is %d", sharesRead) + } else if shares < sharesRead { + return fmt.Errorf("the minimum allowed cpu-shares is %d", sharesRead) + } + } + + var period string + if r.CpuPeriod != 0 { + period = strconv.FormatUint(r.CpuPeriod, 10) + if err := cgroups.WriteFile(path, "cpu.cfs_period_us", period); err != nil { + // Sometimes when the period to be set is smaller + // than the current one, it is rejected by the kernel + // (EINVAL) as old_quota/new_period exceeds the parent + // cgroup quota limit. If this happens and the quota is + // going to be set, ignore the error for now and retry + // after setting the quota. + if !errors.Is(err, unix.EINVAL) || r.CpuQuota == 0 { + return err + } + } else { + period = "" + } + } + + var burst string + if r.CpuBurst != nil { + burst = strconv.FormatUint(*r.CpuBurst, 10) + if err := cgroups.WriteFile(path, "cpu.cfs_burst_us", burst); err != nil { + if errors.Is(err, unix.ENOENT) { + // If CPU burst knob is not available (e.g. + // older kernel), ignore it. + burst = "" + } else { + // Sometimes when the burst to be set is larger + // than the current one, it is rejected by the kernel + // (EINVAL) as old_quota/new_burst exceeds the parent + // cgroup quota limit. If this happens and the quota is + // going to be set, ignore the error for now and retry + // after setting the quota. 
+ if !errors.Is(err, unix.EINVAL) || r.CpuQuota == 0 { + return err + } + } + } else { + burst = "" + } + } + if r.CpuQuota != 0 { + if err := cgroups.WriteFile(path, "cpu.cfs_quota_us", strconv.FormatInt(r.CpuQuota, 10)); err != nil { + return err + } + if period != "" { + if err := cgroups.WriteFile(path, "cpu.cfs_period_us", period); err != nil { + return err + } + } + if burst != "" { + if err := cgroups.WriteFile(path, "cpu.cfs_burst_us", burst); err != nil { + return err + } + } + } + + if r.CPUIdle != nil { + idle := strconv.FormatInt(*r.CPUIdle, 10) + if err := cgroups.WriteFile(path, "cpu.idle", idle); err != nil { + return err + } + } + + return s.SetRtSched(path, r) +} + +func (s *CpuGroup) GetStats(path string, stats *cgroups.Stats) error { + const file = "cpu.stat" + f, err := cgroups.OpenFile(path, file, os.O_RDONLY) + if err != nil { + if os.IsNotExist(err) { + return nil + } + return err + } + defer f.Close() + + sc := bufio.NewScanner(f) + for sc.Scan() { + t, v, err := fscommon.ParseKeyValue(sc.Text()) + if err != nil { + return &parseError{Path: path, File: file, Err: err} + } + switch t { + case "nr_periods": + stats.CpuStats.ThrottlingData.Periods = v + + case "nr_throttled": + stats.CpuStats.ThrottlingData.ThrottledPeriods = v + + case "throttled_time": + stats.CpuStats.ThrottlingData.ThrottledTime = v + } + } + return nil +} diff --git a/fs/cpu_test.go b/fs/cpu_test.go new file mode 100644 index 0000000..a2b64c3 --- /dev/null +++ b/fs/cpu_test.go @@ -0,0 +1,226 @@ +package fs + +import ( + "fmt" + "strconv" + "testing" + + "github.com/opencontainers/cgroups" + "github.com/opencontainers/cgroups/fscommon" +) + +func TestCpuSetShares(t *testing.T) { + path := tempDir(t, "cpu") + + const ( + sharesBefore = 1024 + sharesAfter = 512 + ) + + writeFileContents(t, path, map[string]string{ + "cpu.shares": strconv.Itoa(sharesBefore), + }) + + r := &cgroups.Resources{ + CpuShares: sharesAfter, + } + cpu := &CpuGroup{} + if err := cpu.Set(path, r); err 
!= nil { + t.Fatal(err) + } + + value, err := fscommon.GetCgroupParamUint(path, "cpu.shares") + if err != nil { + t.Fatal(err) + } + if value != sharesAfter { + t.Fatal("Got the wrong value, set cpu.shares failed.") + } +} + +func TestCpuSetBandWidth(t *testing.T) { + path := tempDir(t, "cpu") + + const ( + quotaBefore = 8000 + quotaAfter = 5000 + burstBefore = 2000 + periodBefore = 10000 + periodAfter = 7000 + rtRuntimeBefore = 8000 + rtRuntimeAfter = 5000 + rtPeriodBefore = 10000 + rtPeriodAfter = 7000 + ) + burstAfter := uint64(1000) + + writeFileContents(t, path, map[string]string{ + "cpu.cfs_quota_us": strconv.Itoa(quotaBefore), + "cpu.cfs_burst_us": strconv.Itoa(burstBefore), + "cpu.cfs_period_us": strconv.Itoa(periodBefore), + "cpu.rt_runtime_us": strconv.Itoa(rtRuntimeBefore), + "cpu.rt_period_us": strconv.Itoa(rtPeriodBefore), + }) + + r := &cgroups.Resources{ + CpuQuota: quotaAfter, + CpuBurst: &burstAfter, + CpuPeriod: periodAfter, + CpuRtRuntime: rtRuntimeAfter, + CpuRtPeriod: rtPeriodAfter, + } + cpu := &CpuGroup{} + if err := cpu.Set(path, r); err != nil { + t.Fatal(err) + } + + quota, err := fscommon.GetCgroupParamUint(path, "cpu.cfs_quota_us") + if err != nil { + t.Fatal(err) + } + if quota != quotaAfter { + t.Fatal("Got the wrong value, set cpu.cfs_quota_us failed.") + } + + burst, err := fscommon.GetCgroupParamUint(path, "cpu.cfs_burst_us") + if err != nil { + t.Fatal(err) + } + if burst != burstAfter { + t.Fatal("Got the wrong value, set cpu.cfs_burst_us failed.") + } + + period, err := fscommon.GetCgroupParamUint(path, "cpu.cfs_period_us") + if err != nil { + t.Fatal(err) + } + if period != periodAfter { + t.Fatal("Got the wrong value, set cpu.cfs_period_us failed.") + } + + rtRuntime, err := fscommon.GetCgroupParamUint(path, "cpu.rt_runtime_us") + if err != nil { + t.Fatal(err) + } + if rtRuntime != rtRuntimeAfter { + t.Fatal("Got the wrong value, set cpu.rt_runtime_us failed.") + } + + rtPeriod, err := fscommon.GetCgroupParamUint(path, 
"cpu.rt_period_us") + if err != nil { + t.Fatal(err) + } + if rtPeriod != rtPeriodAfter { + t.Fatal("Got the wrong value, set cpu.rt_period_us failed.") + } +} + +func TestCpuStats(t *testing.T) { + path := tempDir(t, "cpu") + + const ( + nrPeriods = 2000 + nrThrottled = 200 + throttledTime = uint64(18446744073709551615) + ) + + cpuStatContent := fmt.Sprintf("nr_periods %d\nnr_throttled %d\nthrottled_time %d\n", + nrPeriods, nrThrottled, throttledTime) + writeFileContents(t, path, map[string]string{ + "cpu.stat": cpuStatContent, + }) + + cpu := &CpuGroup{} + actualStats := *cgroups.NewStats() + err := cpu.GetStats(path, &actualStats) + if err != nil { + t.Fatal(err) + } + + expectedStats := cgroups.ThrottlingData{ + Periods: nrPeriods, + ThrottledPeriods: nrThrottled, + ThrottledTime: throttledTime, + } + + expectThrottlingDataEquals(t, expectedStats, actualStats.CpuStats.ThrottlingData) +} + +func TestNoCpuStatFile(t *testing.T) { + path := tempDir(t, "cpu") + + cpu := &CpuGroup{} + actualStats := *cgroups.NewStats() + err := cpu.GetStats(path, &actualStats) + if err != nil { + t.Fatal("Expected not to fail, but did") + } +} + +func TestInvalidCpuStat(t *testing.T) { + path := tempDir(t, "cpu") + + cpuStatContent := `nr_periods 2000 + nr_throttled 200 + throttled_time fortytwo` + writeFileContents(t, path, map[string]string{ + "cpu.stat": cpuStatContent, + }) + + cpu := &CpuGroup{} + actualStats := *cgroups.NewStats() + err := cpu.GetStats(path, &actualStats) + if err == nil { + t.Fatal("Expected failed stat parsing.") + } +} + +func TestCpuSetRtSchedAtApply(t *testing.T) { + path := tempDir(t, "cpu") + + const ( + rtRuntimeBefore = 0 + rtRuntimeAfter = 5000 + rtPeriodBefore = 0 + rtPeriodAfter = 7000 + ) + + writeFileContents(t, path, map[string]string{ + "cpu.rt_runtime_us": strconv.Itoa(rtRuntimeBefore), + "cpu.rt_period_us": strconv.Itoa(rtPeriodBefore), + }) + + r := &cgroups.Resources{ + CpuRtRuntime: rtRuntimeAfter, + CpuRtPeriod: rtPeriodAfter, + } + cpu 
:= &CpuGroup{} + + if err := cpu.Apply(path, r, 1234); err != nil { + t.Fatal(err) + } + + rtRuntime, err := fscommon.GetCgroupParamUint(path, "cpu.rt_runtime_us") + if err != nil { + t.Fatal(err) + } + if rtRuntime != rtRuntimeAfter { + t.Fatal("Got the wrong value, set cpu.rt_runtime_us failed.") + } + + rtPeriod, err := fscommon.GetCgroupParamUint(path, "cpu.rt_period_us") + if err != nil { + t.Fatal(err) + } + if rtPeriod != rtPeriodAfter { + t.Fatal("Got the wrong value, set cpu.rt_period_us failed.") + } + + pid, err := fscommon.GetCgroupParamUint(path, "cgroup.procs") + if err != nil { + t.Fatal(err) + } + if pid != 1234 { + t.Fatal("Got the wrong value, set cgroup.procs failed.") + } +} diff --git a/fs/cpuacct.go b/fs/cpuacct.go new file mode 100644 index 0000000..391a023 --- /dev/null +++ b/fs/cpuacct.go @@ -0,0 +1,158 @@ +package fs + +import ( + "bufio" + "os" + "strconv" + "strings" + + "github.com/opencontainers/cgroups" + "github.com/opencontainers/cgroups/fscommon" +) + +const ( + nsInSec = 1000000000 + + // The value comes from `C.sysconf(C._SC_CLK_TCK)`, and + // on Linux it's a constant which is safe to be hard coded, + // so we can avoid using cgo here. 
For details, see: + // https://github.com/containerd/cgroups/pull/12 + clockTicks uint64 = 100 +) + +type CpuacctGroup struct{} + +func (s *CpuacctGroup) Name() string { + return "cpuacct" +} + +func (s *CpuacctGroup) Apply(path string, _ *cgroups.Resources, pid int) error { + return apply(path, pid) +} + +func (s *CpuacctGroup) Set(_ string, _ *cgroups.Resources) error { + return nil +} + +func (s *CpuacctGroup) GetStats(path string, stats *cgroups.Stats) error { + if !cgroups.PathExists(path) { + return nil + } + userModeUsage, kernelModeUsage, err := getCpuUsageBreakdown(path) + if err != nil { + return err + } + + totalUsage, err := fscommon.GetCgroupParamUint(path, "cpuacct.usage") + if err != nil { + return err + } + + percpuUsage, err := getPercpuUsage(path) + if err != nil { + return err + } + + percpuUsageInKernelmode, percpuUsageInUsermode, err := getPercpuUsageInModes(path) + if err != nil { + return err + } + + stats.CpuStats.CpuUsage.TotalUsage = totalUsage + stats.CpuStats.CpuUsage.PercpuUsage = percpuUsage + stats.CpuStats.CpuUsage.PercpuUsageInKernelmode = percpuUsageInKernelmode + stats.CpuStats.CpuUsage.PercpuUsageInUsermode = percpuUsageInUsermode + stats.CpuStats.CpuUsage.UsageInUsermode = userModeUsage + stats.CpuStats.CpuUsage.UsageInKernelmode = kernelModeUsage + return nil +} + +// Returns user and kernel usage breakdown in nanoseconds. 
+func getCpuUsageBreakdown(path string) (uint64, uint64, error) { + var userModeUsage, kernelModeUsage uint64 + const ( + userField = "user" + systemField = "system" + file = "cpuacct.stat" + ) + + // Expected format: + // user + // system + data, err := cgroups.ReadFile(path, file) + if err != nil { + return 0, 0, err + } + + fields := strings.Fields(data) + if len(fields) < 4 || fields[0] != userField || fields[2] != systemField { + return 0, 0, malformedLine(path, file, data) + } + if userModeUsage, err = strconv.ParseUint(fields[1], 10, 64); err != nil { + return 0, 0, &parseError{Path: path, File: file, Err: err} + } + if kernelModeUsage, err = strconv.ParseUint(fields[3], 10, 64); err != nil { + return 0, 0, &parseError{Path: path, File: file, Err: err} + } + + return (userModeUsage * nsInSec) / clockTicks, (kernelModeUsage * nsInSec) / clockTicks, nil +} + +func getPercpuUsage(path string) ([]uint64, error) { + const file = "cpuacct.usage_percpu" + percpuUsage := []uint64{} + data, err := cgroups.ReadFile(path, file) + if err != nil { + return percpuUsage, err + } + for _, value := range strings.Fields(data) { + value, err := strconv.ParseUint(value, 10, 64) + if err != nil { + return percpuUsage, &parseError{Path: path, File: file, Err: err} + } + percpuUsage = append(percpuUsage, value) + } + return percpuUsage, nil +} + +func getPercpuUsageInModes(path string) ([]uint64, []uint64, error) { + usageKernelMode := []uint64{} + usageUserMode := []uint64{} + const file = "cpuacct.usage_all" + + fd, err := cgroups.OpenFile(path, file, os.O_RDONLY) + if os.IsNotExist(err) { + return usageKernelMode, usageUserMode, nil + } else if err != nil { + return nil, nil, err + } + defer fd.Close() + + scanner := bufio.NewScanner(fd) + scanner.Scan() // skipping header line + + for scanner.Scan() { + // Each line is: cpu user system + fields := strings.SplitN(scanner.Text(), " ", 3) + if len(fields) != 3 { + continue + } + + user, err := strconv.ParseUint(fields[1], 10, 64) 
+ if err != nil { + return nil, nil, &parseError{Path: path, File: file, Err: err} + } + usageUserMode = append(usageUserMode, user) + + kernel, err := strconv.ParseUint(fields[2], 10, 64) + if err != nil { + return nil, nil, &parseError{Path: path, File: file, Err: err} + } + usageKernelMode = append(usageKernelMode, kernel) + } + if err := scanner.Err(); err != nil { + return nil, nil, &parseError{Path: path, File: file, Err: err} + } + + return usageKernelMode, usageUserMode, nil +} diff --git a/fs/cpuacct_test.go b/fs/cpuacct_test.go new file mode 100644 index 0000000..c0c9543 --- /dev/null +++ b/fs/cpuacct_test.go @@ -0,0 +1,112 @@ +package fs + +import ( + "reflect" + "testing" + + "github.com/opencontainers/cgroups" +) + +const ( + cpuAcctUsageContents = "12262454190222160" + cpuAcctUsagePerCPUContents = "1564936537989058 1583937096487821 1604195415465681 1596445226820187 1481069084155629 1478735613864327 1477610593414743 1476362015778086" + cpuAcctStatContents = "user 452278264\nsystem 291429664" + cpuAcctUsageAll = `cpu user system + 0 962250696038415 637727786389114 + 1 981956408513304 638197595421064 + 2 1002658817529022 638956774598358 + 3 994937703492523 637985531181620 + 4 874843781648690 638837766495476 + 5 872544369885276 638763309884944 + 6 870104915696359 640081778921247 + 7 870202363887496 638716766259495 + ` +) + +func TestCpuacctStats(t *testing.T) { + path := tempDir(t, "cpuacct") + writeFileContents(t, path, map[string]string{ + "cpuacct.usage": cpuAcctUsageContents, + "cpuacct.usage_percpu": cpuAcctUsagePerCPUContents, + "cpuacct.stat": cpuAcctStatContents, + "cpuacct.usage_all": cpuAcctUsageAll, + }) + + cpuacct := &CpuacctGroup{} + actualStats := *cgroups.NewStats() + err := cpuacct.GetStats(path, &actualStats) + if err != nil { + t.Fatal(err) + } + + expectedStats := cgroups.CpuUsage{ + TotalUsage: uint64(12262454190222160), + PercpuUsage: []uint64{ + 1564936537989058, 1583937096487821, 1604195415465681, 1596445226820187, + 
1481069084155629, 1478735613864327, 1477610593414743, 1476362015778086, + }, + PercpuUsageInKernelmode: []uint64{ + 637727786389114, 638197595421064, 638956774598358, 637985531181620, + 638837766495476, 638763309884944, 640081778921247, 638716766259495, + }, + PercpuUsageInUsermode: []uint64{ + 962250696038415, 981956408513304, 1002658817529022, 994937703492523, + 874843781648690, 872544369885276, 870104915696359, 870202363887496, + }, + UsageInKernelmode: (uint64(291429664) * nsInSec) / clockTicks, + UsageInUsermode: (uint64(452278264) * nsInSec) / clockTicks, + } + + if !reflect.DeepEqual(expectedStats, actualStats.CpuStats.CpuUsage) { + t.Errorf("Expected CPU usage %#v but found %#v\n", + expectedStats, actualStats.CpuStats.CpuUsage) + } +} + +func TestCpuacctStatsWithoutUsageAll(t *testing.T) { + path := tempDir(t, "cpuacct") + writeFileContents(t, path, map[string]string{ + "cpuacct.usage": cpuAcctUsageContents, + "cpuacct.usage_percpu": cpuAcctUsagePerCPUContents, + "cpuacct.stat": cpuAcctStatContents, + }) + + cpuacct := &CpuacctGroup{} + actualStats := *cgroups.NewStats() + err := cpuacct.GetStats(path, &actualStats) + if err != nil { + t.Fatal(err) + } + + expectedStats := cgroups.CpuUsage{ + TotalUsage: uint64(12262454190222160), + PercpuUsage: []uint64{ + 1564936537989058, 1583937096487821, 1604195415465681, 1596445226820187, + 1481069084155629, 1478735613864327, 1477610593414743, 1476362015778086, + }, + PercpuUsageInKernelmode: []uint64{}, + PercpuUsageInUsermode: []uint64{}, + UsageInKernelmode: (uint64(291429664) * nsInSec) / clockTicks, + UsageInUsermode: (uint64(452278264) * nsInSec) / clockTicks, + } + + if !reflect.DeepEqual(expectedStats, actualStats.CpuStats.CpuUsage) { + t.Errorf("Expected CPU usage %#v but found %#v\n", + expectedStats, actualStats.CpuStats.CpuUsage) + } +} + +func BenchmarkGetCpuUsageBreakdown(b *testing.B) { + path := tempDir(b, "cpuacct") + writeFileContents(b, path, map[string]string{ + "cpuacct.stat": 
cpuAcctStatContents, + }) + + b.ResetTimer() + for i := 0; i < b.N; i++ { + _, _, err := getCpuUsageBreakdown(path) + if err != nil { + b.Fatal(err) + } + } +} diff --git a/fs/cpuset.go b/fs/cpuset.go new file mode 100644 index 0000000..ef6ff7d --- /dev/null +++ b/fs/cpuset.go @@ -0,0 +1,276 @@ +package fs + +import ( + "errors" + "os" + "path/filepath" + "strconv" + "strings" + "sync" + + "golang.org/x/sys/unix" + + "github.com/opencontainers/cgroups" + "github.com/opencontainers/cgroups/fscommon" +) + +var ( + cpusetLock sync.Mutex + cpusetPrefix = "cpuset." + cpusetFastPath bool +) + +func cpusetFile(path string, name string) string { + cpusetLock.Lock() + defer cpusetLock.Unlock() + + // Only the v1 cpuset cgroup is allowed to mount with noprefix. + // See kernel source: https://github.com/torvalds/linux/blob/2e1b3cc9d7f790145a80cb705b168f05dab65df2/kernel/cgroup/cgroup-v1.c#L1070 + // Cpuset cannot be mounted with and without prefix simultaneously. + // Commonly used in Android environments. 
+ + if cpusetFastPath { + return cpusetPrefix + name + } + + err := unix.Access(filepath.Join(path, cpusetPrefix+name), unix.F_OK) + if err == nil { + // Use the fast path only if we can access one type of mount for cpuset already + cpusetFastPath = true + } else { + err = unix.Access(filepath.Join(path, name), unix.F_OK) + if err == nil { + cpusetPrefix = "" + cpusetFastPath = true + } + } + + return cpusetPrefix + name +} + +type CpusetGroup struct{} + +func (s *CpusetGroup) Name() string { + return "cpuset" +} + +func (s *CpusetGroup) Apply(path string, r *cgroups.Resources, pid int) error { + return s.ApplyDir(path, r, pid) +} + +func (s *CpusetGroup) Set(path string, r *cgroups.Resources) error { + if r.CpusetCpus != "" { + if err := cgroups.WriteFile(path, cpusetFile(path, "cpus"), r.CpusetCpus); err != nil { + return err + } + } + if r.CpusetMems != "" { + if err := cgroups.WriteFile(path, cpusetFile(path, "mems"), r.CpusetMems); err != nil { + return err + } + } + return nil +} + +func getCpusetStat(path string, file string) ([]uint16, error) { + var extracted []uint16 + fileContent, err := fscommon.GetCgroupParamString(path, file) + if err != nil { + return extracted, err + } + if len(fileContent) == 0 { + return extracted, &parseError{Path: path, File: file, Err: errors.New("empty file")} + } + + for _, s := range strings.Split(fileContent, ",") { + fromStr, toStr, ok := strings.Cut(s, "-") + if ok { + from, err := strconv.ParseUint(fromStr, 10, 16) + if err != nil { + return extracted, &parseError{Path: path, File: file, Err: err} + } + to, err := strconv.ParseUint(toStr, 10, 16) + if err != nil { + return extracted, &parseError{Path: path, File: file, Err: err} + } + if from > to { + return extracted, &parseError{Path: path, File: file, Err: errors.New("invalid values, from > to")} + } + for i := from; i <= to; i++ { + extracted = append(extracted, uint16(i)) + } + } else { + value, err := strconv.ParseUint(s, 10, 16) + if err != nil { + return 
extracted, &parseError{Path: path, File: file, Err: err} + } + extracted = append(extracted, uint16(value)) + } + } + + return extracted, nil +} + +func (s *CpusetGroup) GetStats(path string, stats *cgroups.Stats) error { + var err error + + stats.CPUSetStats.CPUs, err = getCpusetStat(path, cpusetFile(path, "cpus")) + if err != nil && !errors.Is(err, os.ErrNotExist) { + return err + } + + stats.CPUSetStats.CPUExclusive, err = fscommon.GetCgroupParamUint(path, cpusetFile(path, "cpu_exclusive")) + if err != nil && !errors.Is(err, os.ErrNotExist) { + return err + } + + stats.CPUSetStats.Mems, err = getCpusetStat(path, cpusetFile(path, "mems")) + if err != nil && !errors.Is(err, os.ErrNotExist) { + return err + } + + stats.CPUSetStats.MemHardwall, err = fscommon.GetCgroupParamUint(path, cpusetFile(path, "mem_hardwall")) + if err != nil && !errors.Is(err, os.ErrNotExist) { + return err + } + + stats.CPUSetStats.MemExclusive, err = fscommon.GetCgroupParamUint(path, cpusetFile(path, "mem_exclusive")) + if err != nil && !errors.Is(err, os.ErrNotExist) { + return err + } + + stats.CPUSetStats.MemoryMigrate, err = fscommon.GetCgroupParamUint(path, cpusetFile(path, "memory_migrate")) + if err != nil && !errors.Is(err, os.ErrNotExist) { + return err + } + + stats.CPUSetStats.MemorySpreadPage, err = fscommon.GetCgroupParamUint(path, cpusetFile(path, "memory_spread_page")) + if err != nil && !errors.Is(err, os.ErrNotExist) { + return err + } + + stats.CPUSetStats.MemorySpreadSlab, err = fscommon.GetCgroupParamUint(path, cpusetFile(path, "memory_spread_slab")) + if err != nil && !errors.Is(err, os.ErrNotExist) { + return err + } + + stats.CPUSetStats.MemoryPressure, err = fscommon.GetCgroupParamUint(path, cpusetFile(path, "memory_pressure")) + if err != nil && !errors.Is(err, os.ErrNotExist) { + return err + } + + stats.CPUSetStats.SchedLoadBalance, err = fscommon.GetCgroupParamUint(path, cpusetFile(path, "sched_load_balance")) + if err != nil && !errors.Is(err, os.ErrNotExist) { 
+ return err + } + + stats.CPUSetStats.SchedRelaxDomainLevel, err = fscommon.GetCgroupParamInt(path, cpusetFile(path, "sched_relax_domain_level")) + if err != nil && !errors.Is(err, os.ErrNotExist) { + return err + } + + return nil +} + +func (s *CpusetGroup) ApplyDir(dir string, r *cgroups.Resources, pid int) error { + // This might happen if we have no cpuset cgroup mounted. + // Just do nothing and don't fail. + if dir == "" { + return nil + } + // 'ensureParent' start with parent because we don't want to + // explicitly inherit from parent, it could conflict with + // 'cpuset.cpu_exclusive'. + if err := cpusetEnsureParent(filepath.Dir(dir)); err != nil { + return err + } + if err := os.Mkdir(dir, 0o755); err != nil && !os.IsExist(err) { + return err + } + // We didn't inherit cpuset configs from parent, but we have + // to ensure cpuset configs are set before moving task into the + // cgroup. + // The logic is, if user specified cpuset configs, use these + // specified configs, otherwise, inherit from parent. This makes + // cpuset configs work correctly with 'cpuset.cpu_exclusive', and + // keep backward compatibility. + if err := s.ensureCpusAndMems(dir, r); err != nil { + return err + } + // Since we are not using apply(), we need to place the pid + // into the procs file. + return cgroups.WriteCgroupProc(dir, pid) +} + +func getCpusetSubsystemSettings(parent string) (cpus, mems string, err error) { + if cpus, err = cgroups.ReadFile(parent, cpusetFile(parent, "cpus")); err != nil { + return + } + if mems, err = cgroups.ReadFile(parent, cpusetFile(parent, "mems")); err != nil { + return + } + return cpus, mems, nil +} + +// cpusetEnsureParent makes sure that the parent directories of current +// are created and populated with the proper cpus and mems files copied +// from their respective parent. It does that recursively, starting from +// the top of the cpuset hierarchy (i.e. cpuset cgroup mount point). 
+func cpusetEnsureParent(current string) error { + var st unix.Statfs_t + + parent := filepath.Dir(current) + err := unix.Statfs(parent, &st) + if err == nil && st.Type != unix.CGROUP_SUPER_MAGIC { + return nil + } + // Treat non-existing directory as cgroupfs as it will be created, + // and the root cpuset directory obviously exists. + if err != nil && err != unix.ENOENT { + return &os.PathError{Op: "statfs", Path: parent, Err: err} + } + + if err := cpusetEnsureParent(parent); err != nil { + return err + } + if err := os.Mkdir(current, 0o755); err != nil && !os.IsExist(err) { + return err + } + return cpusetCopyIfNeeded(current, parent) +} + +// cpusetCopyIfNeeded copies the cpuset.cpus and cpuset.mems from the parent +// directory to the current directory if the file's contents are 0 +func cpusetCopyIfNeeded(current, parent string) error { + currentCpus, currentMems, err := getCpusetSubsystemSettings(current) + if err != nil { + return err + } + parentCpus, parentMems, err := getCpusetSubsystemSettings(parent) + if err != nil { + return err + } + + if isEmptyCpuset(currentCpus) { + if err := cgroups.WriteFile(current, cpusetFile(current, "cpus"), parentCpus); err != nil { + return err + } + } + if isEmptyCpuset(currentMems) { + if err := cgroups.WriteFile(current, cpusetFile(current, "mems"), parentMems); err != nil { + return err + } + } + return nil +} + +func isEmptyCpuset(str string) bool { + return str == "" || str == "\n" +} + +func (s *CpusetGroup) ensureCpusAndMems(path string, r *cgroups.Resources) error { + if err := s.Set(path, r); err != nil { + return err + } + return cpusetCopyIfNeeded(path, filepath.Dir(path)) +} diff --git a/fs/cpuset_test.go b/fs/cpuset_test.go new file mode 100644 index 0000000..58e571b --- /dev/null +++ b/fs/cpuset_test.go @@ -0,0 +1,241 @@ +package fs + +import ( + "reflect" + "testing" + + "github.com/opencontainers/cgroups" + "github.com/opencontainers/cgroups/fscommon" +) + +const ( + cpus = "0-2,7,12-14\n" + cpuExclusive 
= "1\n" + mems = "1-4,6,9\n" + memHardwall = "0\n" + memExclusive = "0\n" + memoryMigrate = "1\n" + memorySpreadPage = "0\n" + memorySpeadSlab = "1\n" + memoryPressure = "34377\n" + schedLoadBalance = "1\n" + schedRelaxDomainLevel = "-1\n" +) + +var cpusetTestFiles = map[string]string{ + "cpuset.cpus": cpus, + "cpuset.cpu_exclusive": cpuExclusive, + "cpuset.mems": mems, + "cpuset.mem_hardwall": memHardwall, + "cpuset.mem_exclusive": memExclusive, + "cpuset.memory_migrate": memoryMigrate, + "cpuset.memory_spread_page": memorySpreadPage, + "cpuset.memory_spread_slab": memorySpeadSlab, + "cpuset.memory_pressure": memoryPressure, + "cpuset.sched_load_balance": schedLoadBalance, + "cpuset.sched_relax_domain_level": schedRelaxDomainLevel, +} + +func TestCPUSetSetCpus(t *testing.T) { + path := tempDir(t, "cpuset") + + const ( + cpusBefore = "0" + cpusAfter = "1-3" + ) + + writeFileContents(t, path, map[string]string{ + "cpuset.cpus": cpusBefore, + }) + + r := &cgroups.Resources{ + CpusetCpus: cpusAfter, + } + cpuset := &CpusetGroup{} + if err := cpuset.Set(path, r); err != nil { + t.Fatal(err) + } + + value, err := fscommon.GetCgroupParamString(path, "cpuset.cpus") + if err != nil { + t.Fatal(err) + } + if value != cpusAfter { + t.Fatal("Got the wrong value, set cpuset.cpus failed.") + } +} + +func TestCPUSetSetMems(t *testing.T) { + path := tempDir(t, "cpuset") + + const ( + memsBefore = "0" + memsAfter = "1" + ) + + writeFileContents(t, path, map[string]string{ + "cpuset.mems": memsBefore, + }) + + r := &cgroups.Resources{ + CpusetMems: memsAfter, + } + cpuset := &CpusetGroup{} + if err := cpuset.Set(path, r); err != nil { + t.Fatal(err) + } + + value, err := fscommon.GetCgroupParamString(path, "cpuset.mems") + if err != nil { + t.Fatal(err) + } + if value != memsAfter { + t.Fatal("Got the wrong value, set cpuset.mems failed.") + } +} + +func TestCPUSetStatsCorrect(t *testing.T) { + path := tempDir(t, "cpuset") + writeFileContents(t, path, cpusetTestFiles) + + cpuset := 
&CpusetGroup{} + actualStats := *cgroups.NewStats() + err := cpuset.GetStats(path, &actualStats) + if err != nil { + t.Fatal(err) + } + expectedStats := cgroups.CPUSetStats{ + CPUs: []uint16{0, 1, 2, 7, 12, 13, 14}, + CPUExclusive: 1, + Mems: []uint16{1, 2, 3, 4, 6, 9}, + MemoryMigrate: 1, + MemHardwall: 0, + MemExclusive: 0, + MemorySpreadPage: 0, + MemorySpreadSlab: 1, + MemoryPressure: 34377, + SchedLoadBalance: 1, + SchedRelaxDomainLevel: -1, + } + if !reflect.DeepEqual(expectedStats, actualStats.CPUSetStats) { + t.Fatalf("Expected Cpuset stats usage %#v but found %#v", + expectedStats, actualStats.CPUSetStats) + } +} + +func TestCPUSetStatsMissingFiles(t *testing.T) { + for _, testCase := range []struct { + desc string + filename, contents string + removeFile bool + }{ + { + desc: "empty cpus file", + filename: "cpuset.cpus", + contents: "", + removeFile: false, + }, + { + desc: "empty mems file", + filename: "cpuset.mems", + contents: "", + removeFile: false, + }, + { + desc: "corrupted cpus file", + filename: "cpuset.cpus", + contents: "0-3,*4^2", + removeFile: false, + }, + { + desc: "corrupted mems file", + filename: "cpuset.mems", + contents: "0,1,2-5,8-7", + removeFile: false, + }, + { + desc: "missing cpu_exclusive file", + filename: "cpuset.cpu_exclusive", + contents: "", + removeFile: true, + }, + { + desc: "missing memory_migrate file", + filename: "cpuset.memory_migrate", + contents: "", + removeFile: true, + }, + { + desc: "missing mem_hardwall file", + filename: "cpuset.mem_hardwall", + contents: "", + removeFile: true, + }, + { + desc: "missing mem_exclusive file", + filename: "cpuset.mem_exclusive", + contents: "", + removeFile: true, + }, + { + desc: "missing memory_spread_page file", + filename: "cpuset.memory_spread_page", + contents: "", + removeFile: true, + }, + { + desc: "missing memory_spread_slab file", + filename: "cpuset.memory_spread_slab", + contents: "", + removeFile: true, + }, + { + desc: "missing memory_pressure file", + 
filename: "cpuset.memory_pressure", + contents: "", + removeFile: true, + }, + { + desc: "missing sched_load_balance file", + filename: "cpuset.sched_load_balance", + contents: "", + removeFile: true, + }, + { + desc: "missing sched_relax_domain_level file", + filename: "cpuset.sched_relax_domain_level", + contents: "", + removeFile: true, + }, + } { + t.Run(testCase.desc, func(t *testing.T) { + path := tempDir(t, "cpuset") + + tempCpusetTestFiles := map[string]string{} + for i, v := range cpusetTestFiles { + tempCpusetTestFiles[i] = v + } + + if testCase.removeFile { + delete(tempCpusetTestFiles, testCase.filename) + writeFileContents(t, path, tempCpusetTestFiles) + cpuset := &CpusetGroup{} + actualStats := *cgroups.NewStats() + err := cpuset.GetStats(path, &actualStats) + if err != nil { + t.Errorf("failed unexpectedly: %q", err) + } + } else { + tempCpusetTestFiles[testCase.filename] = testCase.contents + writeFileContents(t, path, tempCpusetTestFiles) + cpuset := &CpusetGroup{} + actualStats := *cgroups.NewStats() + err := cpuset.GetStats(path, &actualStats) + + if err == nil { + t.Error("failed to return expected error") + } + } + }) + } +} diff --git a/fs/devices.go b/fs/devices.go new file mode 100644 index 0000000..26483ec --- /dev/null +++ b/fs/devices.go @@ -0,0 +1,38 @@ +package fs + +import ( + "github.com/opencontainers/cgroups" +) + +type DevicesGroup struct{} + +func (s *DevicesGroup) Name() string { + return "devices" +} + +func (s *DevicesGroup) Apply(path string, r *cgroups.Resources, pid int) error { + if r.SkipDevices { + return nil + } + if path == "" { + // Return error here, since devices cgroup + // is a hard requirement for container's security. 
+ return errSubsystemDoesNotExist + } + + return apply(path, pid) +} + +func (s *DevicesGroup) Set(path string, r *cgroups.Resources) error { + if cgroups.DevicesSetV1 == nil { + if len(r.Devices) == 0 { + return nil + } + return cgroups.ErrDevicesUnsupported + } + return cgroups.DevicesSetV1(path, r) +} + +func (s *DevicesGroup) GetStats(path string, stats *cgroups.Stats) error { + return nil +} diff --git a/fs/error.go b/fs/error.go new file mode 100644 index 0000000..f13033e --- /dev/null +++ b/fs/error.go @@ -0,0 +1,15 @@ +package fs + +import ( + "fmt" + + "github.com/opencontainers/cgroups/fscommon" +) + +type parseError = fscommon.ParseError + +// malformedLine is used by all cgroupfs file parsers that expect a line +// in a particular format but get some garbage instead. +func malformedLine(path, file, line string) error { + return &parseError{Path: path, File: file, Err: fmt.Errorf("malformed line: %s", line)} +} diff --git a/fs/freezer.go b/fs/freezer.go new file mode 100644 index 0000000..dae4a60 --- /dev/null +++ b/fs/freezer.go @@ -0,0 +1,157 @@ +package fs + +import ( + "errors" + "fmt" + "os" + "strings" + "time" + + "github.com/opencontainers/cgroups" + "github.com/sirupsen/logrus" + "golang.org/x/sys/unix" +) + +type FreezerGroup struct{} + +func (s *FreezerGroup) Name() string { + return "freezer" +} + +func (s *FreezerGroup) Apply(path string, _ *cgroups.Resources, pid int) error { + return apply(path, pid) +} + +func (s *FreezerGroup) Set(path string, r *cgroups.Resources) (Err error) { + switch r.Freezer { + case cgroups.Frozen: + defer func() { + if Err != nil { + // Freezing failed, and it is bad and dangerous + // to leave the cgroup in FROZEN or FREEZING + // state, so (try to) thaw it back. + _ = cgroups.WriteFile(path, "freezer.state", string(cgroups.Thawed)) + } + }() + + // As per older kernel docs (freezer-subsystem.txt before + // kernel commit ef9fe980c6fcc1821), if FREEZING is seen, + // userspace should either retry or thaw. 
While current + // kernel cgroup v1 docs no longer mention a need to retry, + // even a recent kernel (v5.4, Ubuntu 20.04) can't reliably + // freeze a cgroup v1 while new processes keep appearing in it + // (either via fork/clone or by writing new PIDs to + // cgroup.procs). + // + // The numbers below are empirically chosen to have a decent + // chance to succeed in various scenarios ("runc pause/unpause + // with parallel runc exec" and "bare freeze/unfreeze on a very + // slow system"), tested on RHEL7 and Ubuntu 20.04 kernels. + // + // Adding any amount of sleep in between retries did not + // increase the chances of successful freeze in "pause/unpause + // with parallel exec" reproducer. OTOH, adding an occasional + // sleep helped for the case where the system is extremely slow + // (CentOS 7 VM on GHA CI). + // + // Alas, this is still a game of chances, since the real fix + // belongs to the kernel (cgroup v2 does not have this bug). + + for i := 0; i < 1000; i++ { + if i%50 == 49 { + // Occasional thaw and sleep improves + // the chances to succeed in freezing + // in case new processes keep appearing + // in the cgroup. + _ = cgroups.WriteFile(path, "freezer.state", string(cgroups.Thawed)) + time.Sleep(10 * time.Millisecond) + } + + if err := cgroups.WriteFile(path, "freezer.state", string(cgroups.Frozen)); err != nil { + return err + } + + if i%25 == 24 { + // Occasional short sleep before reading + // the state back also improves the chances to + // succeed in freezing in case of a very slow + // system. 
+ time.Sleep(10 * time.Microsecond) + } + state, err := cgroups.ReadFile(path, "freezer.state") + if err != nil { + return err + } + state = strings.TrimSpace(state) + switch state { + case "FREEZING": + continue + case string(cgroups.Frozen): + if i > 1 { + logrus.Debugf("frozen after %d retries", i) + } + return nil + default: + // should never happen + return fmt.Errorf("unexpected state %s while freezing", strings.TrimSpace(state)) + } + } + // Despite our best efforts, it got stuck in FREEZING. + return errors.New("unable to freeze") + case cgroups.Thawed: + return cgroups.WriteFile(path, "freezer.state", string(cgroups.Thawed)) + case cgroups.Undefined: + return nil + default: + return fmt.Errorf("Invalid argument '%s' to freezer.state", string(r.Freezer)) + } +} + +func (s *FreezerGroup) GetStats(path string, stats *cgroups.Stats) error { + return nil +} + +func (s *FreezerGroup) GetState(path string) (cgroups.FreezerState, error) { + for { + state, err := cgroups.ReadFile(path, "freezer.state") + if err != nil { + // If the kernel is too old, then we just treat the freezer as + // being in an "undefined" state. + if os.IsNotExist(err) || errors.Is(err, unix.ENODEV) { + err = nil + } + return cgroups.Undefined, err + } + switch strings.TrimSpace(state) { + case "THAWED": + return cgroups.Thawed, nil + case "FROZEN": + // Find out whether the cgroup is frozen directly, + // or indirectly via an ancestor. + self, err := cgroups.ReadFile(path, "freezer.self_freezing") + if err != nil { + // If the kernel is too old, then we just treat + // it as being frozen. 
+ if errors.Is(err, os.ErrNotExist) || errors.Is(err, unix.ENODEV) { + err = nil + } + return cgroups.Frozen, err + } + switch self { + case "0\n": + return cgroups.Thawed, nil + case "1\n": + return cgroups.Frozen, nil + default: + return cgroups.Undefined, fmt.Errorf(`unknown "freezer.self_freezing" state: %q`, self) + } + case "FREEZING": + // Make sure we get a stable freezer state, so retry if the cgroup + // is still undergoing freezing. This should be a temporary delay. + time.Sleep(1 * time.Millisecond) + continue + default: + return cgroups.Undefined, fmt.Errorf("unknown freezer.state %q", state) + } + } +} diff --git a/fs/freezer_test.go b/fs/freezer_test.go new file mode 100644 index 0000000..c76ee79 --- /dev/null +++ b/fs/freezer_test.go @@ -0,0 +1,46 @@ +package fs + +import ( + "testing" + + "github.com/opencontainers/cgroups" + "github.com/opencontainers/cgroups/fscommon" +) + +func TestFreezerSetState(t *testing.T) { + path := tempDir(t, "freezer") + + writeFileContents(t, path, map[string]string{ + "freezer.state": string(cgroups.Frozen), + }) + + r := &cgroups.Resources{ + Freezer: cgroups.Thawed, + } + freezer := &FreezerGroup{} + if err := freezer.Set(path, r); err != nil { + t.Fatal(err) + } + + value, err := fscommon.GetCgroupParamString(path, "freezer.state") + if err != nil { + t.Fatal(err) + } + if value != string(cgroups.Thawed) { + t.Fatal("Got the wrong value, set freezer.state failed.") + } +} + +func TestFreezerSetInvalidState(t *testing.T) { + path := tempDir(t, "freezer") + + const invalidArg cgroups.FreezerState = "Invalid" + + r := &cgroups.Resources{ + Freezer: invalidArg, + } + freezer := &FreezerGroup{} + if err := freezer.Set(path, r); err == nil { + t.Fatal("Failed to return invalid argument error") + } +} diff --git a/fs/fs.go b/fs/fs.go new file mode 100644 index 0000000..23a8fb8 --- /dev/null +++ b/fs/fs.go @@ -0,0 +1,265 @@ +package fs + +import ( + "errors" + "fmt" + "os" + "sync" + + "golang.org/x/sys/unix" + + 
"github.com/opencontainers/cgroups" + "github.com/opencontainers/cgroups/fscommon" +) + +var subsystems = []subsystem{ + &CpusetGroup{}, + &DevicesGroup{}, + &MemoryGroup{}, + &CpuGroup{}, + &CpuacctGroup{}, + &PidsGroup{}, + &BlkioGroup{}, + &HugetlbGroup{}, + &NetClsGroup{}, + &NetPrioGroup{}, + &PerfEventGroup{}, + &FreezerGroup{}, + &RdmaGroup{}, + &NameGroup{GroupName: "name=systemd", Join: true}, + &NameGroup{GroupName: "misc", Join: true}, +} + +var errSubsystemDoesNotExist = errors.New("cgroup: subsystem does not exist") + +func init() { + // If using cgroups-hybrid mode then add a "" controller indicating + // it should join the cgroups v2. + if cgroups.IsCgroup2HybridMode() { + subsystems = append(subsystems, &NameGroup{GroupName: "", Join: true}) + } +} + +type subsystem interface { + // Name returns the name of the subsystem. + Name() string + // GetStats fills in the stats for the subsystem. + GetStats(path string, stats *cgroups.Stats) error + // Apply creates and joins a cgroup, adding pid into it. Some + // subsystems use resources to pre-configure the cgroup parents + // before creating or joining it. + Apply(path string, r *cgroups.Resources, pid int) error + // Set sets the cgroup resources. + Set(path string, r *cgroups.Resources) error +} + +type Manager struct { + mu sync.Mutex + cgroups *cgroups.Cgroup + paths map[string]string +} + +func NewManager(cg *cgroups.Cgroup, paths map[string]string) (*Manager, error) { + // Some v1 controllers (cpu, cpuset, and devices) expect + // cgroups.Resources to not be nil in Apply. 
+ if cg.Resources == nil { + return nil, errors.New("cgroup v1 manager needs cgroups.Resources to be set during manager creation") + } + if cg.Resources.Unified != nil { + return nil, cgroups.ErrV1NoUnified + } + + if paths == nil { + var err error + paths, err = initPaths(cg) + if err != nil { + return nil, err + } + } + + return &Manager{ + cgroups: cg, + paths: paths, + }, nil +} + +// isIgnorableError returns whether err is a permission error (in the loose +// sense of the word). This includes EROFS (which for an unprivileged user is +// basically a permission error) and EACCES (for similar reasons) as well as +// the normal EPERM. +func isIgnorableError(rootless bool, err error) bool { + // We do not ignore errors if we are root. + if !rootless { + return false + } + // Is it an ordinary EPERM? + if errors.Is(err, os.ErrPermission) { + return true + } + // Handle some specific syscall errors. + var errno unix.Errno + if errors.As(err, &errno) { + return errno == unix.EROFS || errno == unix.EPERM || errno == unix.EACCES + } + return false +} + +func (m *Manager) Apply(pid int) (retErr error) { + m.mu.Lock() + defer m.mu.Unlock() + + c := m.cgroups + + for _, sys := range subsystems { + name := sys.Name() + p, ok := m.paths[name] + if !ok { + continue + } + + if err := sys.Apply(p, c.Resources, pid); err != nil { + // In the case of rootless (including euid=0 in userns), where an + // explicit cgroup path hasn't been set, we don't bail on error in + // case of permission problems here, but do delete the path from + // the m.paths map, since it is either non-existent and could not + // be created, or the pid could not be added to it. + // + // Cases where limits for the subsystem have been set are handled + // later by Set, which fails with a friendly error (see + // if path == "" in Set). 
+ if isIgnorableError(c.Rootless, err) && c.Path == "" { + retErr = cgroups.ErrRootless + delete(m.paths, name) + continue + } + return err + } + + } + return retErr +} + +func (m *Manager) Destroy() error { + m.mu.Lock() + defer m.mu.Unlock() + return cgroups.RemovePaths(m.paths) +} + +func (m *Manager) Path(subsys string) string { + m.mu.Lock() + defer m.mu.Unlock() + return m.paths[subsys] +} + +func (m *Manager) GetStats() (*cgroups.Stats, error) { + m.mu.Lock() + defer m.mu.Unlock() + stats := cgroups.NewStats() + for _, sys := range subsystems { + path := m.paths[sys.Name()] + if path == "" { + continue + } + if err := sys.GetStats(path, stats); err != nil { + return nil, err + } + } + return stats, nil +} + +func (m *Manager) Set(r *cgroups.Resources) error { + if r == nil { + return nil + } + + if r.Unified != nil { + return cgroups.ErrV1NoUnified + } + + m.mu.Lock() + defer m.mu.Unlock() + for _, sys := range subsystems { + path := m.paths[sys.Name()] + if err := sys.Set(path, r); err != nil { + // When rootless is true, errors from the device subsystem + // are ignored, as it is really not expected to work. + if m.cgroups.Rootless && sys.Name() == "devices" && !errors.Is(err, cgroups.ErrDevicesUnsupported) { + continue + } + // However, errors from other subsystems are not ignored. + // see @test "runc create (rootless + limits + no cgrouppath + no permission) fails with informative error" + if path == "" { + // We never created a path for this cgroup, so we cannot set + // limits for it (though we have already tried at this point). 
+ return fmt.Errorf("cannot set %s limit: container could not join or create cgroup", sys.Name()) + } + return err + } + } + + return nil +} + +// Freeze toggles the container's freezer cgroup depending on the state +// provided +func (m *Manager) Freeze(state cgroups.FreezerState) error { + path := m.Path("freezer") + if path == "" { + return errors.New("cannot toggle freezer: cgroups not configured for container") + } + + prevState := m.cgroups.Resources.Freezer + m.cgroups.Resources.Freezer = state + freezer := &FreezerGroup{} + if err := freezer.Set(path, m.cgroups.Resources); err != nil { + m.cgroups.Resources.Freezer = prevState + return err + } + return nil +} + +func (m *Manager) GetPids() ([]int, error) { + return cgroups.GetPids(m.Path("devices")) +} + +func (m *Manager) GetAllPids() ([]int, error) { + return cgroups.GetAllPids(m.Path("devices")) +} + +func (m *Manager) GetPaths() map[string]string { + m.mu.Lock() + defer m.mu.Unlock() + return m.paths +} + +func (m *Manager) GetCgroups() (*cgroups.Cgroup, error) { + return m.cgroups, nil +} + +func (m *Manager) GetFreezerState() (cgroups.FreezerState, error) { + dir := m.Path("freezer") + // If the container doesn't have the freezer cgroup, say it's undefined. + if dir == "" { + return cgroups.Undefined, nil + } + freezer := &FreezerGroup{} + return freezer.GetState(dir) +} + +func (m *Manager) Exists() bool { + return cgroups.PathExists(m.Path("devices")) +} + +func OOMKillCount(path string) (uint64, error) { + return fscommon.GetValueByKey(path, "memory.oom_control", "oom_kill") +} + +func (m *Manager) OOMKillCount() (uint64, error) { + c, err := OOMKillCount(m.Path("memory")) + // Ignore ENOENT when rootless as it couldn't create cgroup. 
+ if err != nil && m.cgroups.Rootless && os.IsNotExist(err) { + err = nil + } + + return c, err +} diff --git a/fs/fs_test.go b/fs/fs_test.go new file mode 100644 index 0000000..f9a0935 --- /dev/null +++ b/fs/fs_test.go @@ -0,0 +1,49 @@ +package fs + +import ( + "testing" + + "github.com/opencontainers/cgroups" +) + +func BenchmarkGetStats(b *testing.B) { + if cgroups.IsCgroup2UnifiedMode() { + b.Skip("cgroup v2 is not supported") + } + + // Unset TestMode as we work with real cgroupfs here, + // and we want OpenFile to perform the fstype check. + cgroups.TestMode = false + defer func() { + cgroups.TestMode = true + }() + + cg := &cgroups.Cgroup{ + Path: "/some/kind/of/a/path/here", + Resources: &cgroups.Resources{}, + } + m, err := NewManager(cg, nil) + if err != nil { + b.Fatal(err) + } + err = m.Apply(-1) + if err != nil { + b.Fatal(err) + } + defer func() { + _ = m.Destroy() + }() + + var st *cgroups.Stats + + b.ResetTimer() + for i := 0; i < b.N; i++ { + st, err = m.GetStats() + if err != nil { + b.Fatal(err) + } + } + if st.CpuStats.CpuUsage.TotalUsage != 0 { + b.Fatalf("stats: %+v", st) + } +} diff --git a/fs/hugetlb.go b/fs/hugetlb.go new file mode 100644 index 0000000..698fd69 --- /dev/null +++ b/fs/hugetlb.go @@ -0,0 +1,83 @@ +package fs + +import ( + "errors" + "os" + "strconv" + + "github.com/opencontainers/cgroups" + "github.com/opencontainers/cgroups/fscommon" +) + +type HugetlbGroup struct{} + +func (s *HugetlbGroup) Name() string { + return "hugetlb" +} + +func (s *HugetlbGroup) Apply(path string, _ *cgroups.Resources, pid int) error { + return apply(path, pid) +} + +func (s *HugetlbGroup) Set(path string, r *cgroups.Resources) error { + const suffix = ".limit_in_bytes" + skipRsvd := false + + for _, hugetlb := range r.HugetlbLimit { + prefix := "hugetlb." 
+ hugetlb.Pagesize + val := strconv.FormatUint(hugetlb.Limit, 10) + if err := cgroups.WriteFile(path, prefix+suffix, val); err != nil { + return err + } + if skipRsvd { + continue + } + if err := cgroups.WriteFile(path, prefix+".rsvd"+suffix, val); err != nil { + if errors.Is(err, os.ErrNotExist) { + skipRsvd = true + continue + } + return err + } + } + + return nil +} + +func (s *HugetlbGroup) GetStats(path string, stats *cgroups.Stats) error { + if !cgroups.PathExists(path) { + return nil + } + rsvd := ".rsvd" + hugetlbStats := cgroups.HugetlbStats{} + for _, pageSize := range cgroups.HugePageSizes() { + again: + prefix := "hugetlb." + pageSize + rsvd + + value, err := fscommon.GetCgroupParamUint(path, prefix+".usage_in_bytes") + if err != nil { + if rsvd != "" && errors.Is(err, os.ErrNotExist) { + rsvd = "" + goto again + } + return err + } + hugetlbStats.Usage = value + + value, err = fscommon.GetCgroupParamUint(path, prefix+".max_usage_in_bytes") + if err != nil { + return err + } + hugetlbStats.MaxUsage = value + + value, err = fscommon.GetCgroupParamUint(path, prefix+".failcnt") + if err != nil { + return err + } + hugetlbStats.Failcnt = value + + stats.HugetlbStats[pageSize] = hugetlbStats + } + + return nil +} diff --git a/fs/hugetlb_test.go b/fs/hugetlb_test.go new file mode 100644 index 0000000..c37e3ec --- /dev/null +++ b/fs/hugetlb_test.go @@ -0,0 +1,176 @@ +package fs + +import ( + "fmt" + "strconv" + "testing" + + "github.com/opencontainers/cgroups" + "github.com/opencontainers/cgroups/fscommon" +) + +const ( + hugetlbUsageContents = "128\n" + hugetlbMaxUsageContents = "256\n" + hugetlbFailcnt = "100\n" +) + +const ( + usage = "hugetlb.%s.usage_in_bytes" + limit = "hugetlb.%s.limit_in_bytes" + maxUsage = "hugetlb.%s.max_usage_in_bytes" + failcnt = "hugetlb.%s.failcnt" + + rsvdUsage = "hugetlb.%s.rsvd.usage_in_bytes" + rsvdLimit = "hugetlb.%s.rsvd.limit_in_bytes" + rsvdMaxUsage = "hugetlb.%s.rsvd.max_usage_in_bytes" + rsvdFailcnt = 
"hugetlb.%s.rsvd.failcnt" +) + +func TestHugetlbSetHugetlb(t *testing.T) { + path := tempDir(t, "hugetlb") + + const ( + hugetlbBefore = 256 + hugetlbAfter = 512 + ) + + for _, pageSize := range cgroups.HugePageSizes() { + writeFileContents(t, path, map[string]string{ + fmt.Sprintf(limit, pageSize): strconv.Itoa(hugetlbBefore), + }) + } + + r := &cgroups.Resources{} + for _, pageSize := range cgroups.HugePageSizes() { + r.HugetlbLimit = []*cgroups.HugepageLimit{ + { + Pagesize: pageSize, + Limit: hugetlbAfter, + }, + } + hugetlb := &HugetlbGroup{} + if err := hugetlb.Set(path, r); err != nil { + t.Fatal(err) + } + } + + for _, pageSize := range cgroups.HugePageSizes() { + for _, f := range []string{limit, rsvdLimit} { + limit := fmt.Sprintf(f, pageSize) + value, err := fscommon.GetCgroupParamUint(path, limit) + if err != nil { + t.Fatal(err) + } + if value != hugetlbAfter { + t.Fatalf("Set %s failed. Expected: %v, Got: %v", limit, hugetlbAfter, value) + } + } + } +} + +func TestHugetlbStats(t *testing.T) { + path := tempDir(t, "hugetlb") + for _, pageSize := range cgroups.HugePageSizes() { + writeFileContents(t, path, map[string]string{ + fmt.Sprintf(usage, pageSize): hugetlbUsageContents, + fmt.Sprintf(maxUsage, pageSize): hugetlbMaxUsageContents, + fmt.Sprintf(failcnt, pageSize): hugetlbFailcnt, + }) + } + + hugetlb := &HugetlbGroup{} + actualStats := *cgroups.NewStats() + err := hugetlb.GetStats(path, &actualStats) + if err != nil { + t.Fatal(err) + } + expectedStats := cgroups.HugetlbStats{Usage: 128, MaxUsage: 256, Failcnt: 100} + for _, pageSize := range cgroups.HugePageSizes() { + expectHugetlbStatEquals(t, expectedStats, actualStats.HugetlbStats[pageSize]) + } +} + +func TestHugetlbRStatsRsvd(t *testing.T) { + path := tempDir(t, "hugetlb") + for _, pageSize := range cgroups.HugePageSizes() { + writeFileContents(t, path, map[string]string{ + fmt.Sprintf(rsvdUsage, pageSize): hugetlbUsageContents, + fmt.Sprintf(rsvdMaxUsage, pageSize): 
hugetlbMaxUsageContents, + fmt.Sprintf(rsvdFailcnt, pageSize): hugetlbFailcnt, + }) + } + + hugetlb := &HugetlbGroup{} + actualStats := *cgroups.NewStats() + err := hugetlb.GetStats(path, &actualStats) + if err != nil { + t.Fatal(err) + } + expectedStats := cgroups.HugetlbStats{Usage: 128, MaxUsage: 256, Failcnt: 100} + for _, pageSize := range cgroups.HugePageSizes() { + expectHugetlbStatEquals(t, expectedStats, actualStats.HugetlbStats[pageSize]) + } +} + +func TestHugetlbStatsNoUsageFile(t *testing.T) { + path := tempDir(t, "hugetlb") + writeFileContents(t, path, map[string]string{ + maxUsage: hugetlbMaxUsageContents, + }) + + hugetlb := &HugetlbGroup{} + actualStats := *cgroups.NewStats() + err := hugetlb.GetStats(path, &actualStats) + if err == nil { + t.Fatal("Expected failure") + } +} + +func TestHugetlbStatsNoMaxUsageFile(t *testing.T) { + path := tempDir(t, "hugetlb") + for _, pageSize := range cgroups.HugePageSizes() { + writeFileContents(t, path, map[string]string{ + fmt.Sprintf(usage, pageSize): hugetlbUsageContents, + }) + } + + hugetlb := &HugetlbGroup{} + actualStats := *cgroups.NewStats() + err := hugetlb.GetStats(path, &actualStats) + if err == nil { + t.Fatal("Expected failure") + } +} + +func TestHugetlbStatsBadUsageFile(t *testing.T) { + path := tempDir(t, "hugetlb") + for _, pageSize := range cgroups.HugePageSizes() { + writeFileContents(t, path, map[string]string{ + fmt.Sprintf(usage, pageSize): "bad", + maxUsage: hugetlbMaxUsageContents, + }) + } + + hugetlb := &HugetlbGroup{} + actualStats := *cgroups.NewStats() + err := hugetlb.GetStats(path, &actualStats) + if err == nil { + t.Fatal("Expected failure") + } +} + +func TestHugetlbStatsBadMaxUsageFile(t *testing.T) { + path := tempDir(t, "hugetlb") + writeFileContents(t, path, map[string]string{ + usage: hugetlbUsageContents, + maxUsage: "bad", + }) + + hugetlb := &HugetlbGroup{} + actualStats := *cgroups.NewStats() + err := hugetlb.GetStats(path, &actualStats) + if err == nil { + 
t.Fatal("Expected failure") + } +} diff --git a/fs/memory.go b/fs/memory.go new file mode 100644 index 0000000..d92f232 --- /dev/null +++ b/fs/memory.go @@ -0,0 +1,356 @@ +package fs + +import ( + "bufio" + "errors" + "fmt" + "math" + "os" + "path/filepath" + "strconv" + "strings" + + "golang.org/x/sys/unix" + + "github.com/opencontainers/cgroups" + "github.com/opencontainers/cgroups/fscommon" +) + +const ( + cgroupMemorySwapLimit = "memory.memsw.limit_in_bytes" + cgroupMemoryLimit = "memory.limit_in_bytes" + cgroupMemoryUsage = "memory.usage_in_bytes" + cgroupMemoryMaxUsage = "memory.max_usage_in_bytes" +) + +type MemoryGroup struct{} + +func (s *MemoryGroup) Name() string { + return "memory" +} + +func (s *MemoryGroup) Apply(path string, _ *cgroups.Resources, pid int) error { + return apply(path, pid) +} + +func setMemory(path string, val int64) error { + if val == 0 { + return nil + } + + err := cgroups.WriteFile(path, cgroupMemoryLimit, strconv.FormatInt(val, 10)) + if !errors.Is(err, unix.EBUSY) { + return err + } + + // EBUSY means the kernel can't set new limit as it's too low + // (lower than the current usage). Return more specific error. + usage, err := fscommon.GetCgroupParamUint(path, cgroupMemoryUsage) + if err != nil { + return err + } + max, err := fscommon.GetCgroupParamUint(path, cgroupMemoryMaxUsage) + if err != nil { + return err + } + + return fmt.Errorf("unable to set memory limit to %d (current usage: %d, peak usage: %d)", val, usage, max) +} + +func setSwap(path string, val int64) error { + if val == 0 { + return nil + } + + return cgroups.WriteFile(path, cgroupMemorySwapLimit, strconv.FormatInt(val, 10)) +} + +func setMemoryAndSwap(path string, r *cgroups.Resources) error { + // If the memory update is set to -1 and the swap is not explicitly + // set, we should also set swap to -1, it means unlimited memory. 
+ if r.Memory == -1 && r.MemorySwap == 0 { + // Only set swap if it's enabled in kernel + if cgroups.PathExists(filepath.Join(path, cgroupMemorySwapLimit)) { + r.MemorySwap = -1 + } + } + + // When memory and swap memory are both set, we need to handle the cases + // for updating container. + if r.Memory != 0 && r.MemorySwap != 0 { + curLimit, err := fscommon.GetCgroupParamUint(path, cgroupMemoryLimit) + if err != nil { + return err + } + + // When update memory limit, we should adapt the write sequence + // for memory and swap memory, so it won't fail because the new + // value and the old value don't fit kernel's validation. + if r.MemorySwap == -1 || curLimit < uint64(r.MemorySwap) { + if err := setSwap(path, r.MemorySwap); err != nil { + return err + } + if err := setMemory(path, r.Memory); err != nil { + return err + } + return nil + } + } + + if err := setMemory(path, r.Memory); err != nil { + return err + } + if err := setSwap(path, r.MemorySwap); err != nil { + return err + } + + return nil +} + +func (s *MemoryGroup) Set(path string, r *cgroups.Resources) error { + if err := setMemoryAndSwap(path, r); err != nil { + return err + } + + // ignore KernelMemory and KernelMemoryTCP + + if r.MemoryReservation != 0 { + if err := cgroups.WriteFile(path, "memory.soft_limit_in_bytes", strconv.FormatInt(r.MemoryReservation, 10)); err != nil { + return err + } + } + + if r.OomKillDisable { + if err := cgroups.WriteFile(path, "memory.oom_control", "1"); err != nil { + return err + } + } + if r.MemorySwappiness == nil || int64(*r.MemorySwappiness) == -1 { + return nil + } else if *r.MemorySwappiness <= 100 { + if err := cgroups.WriteFile(path, "memory.swappiness", strconv.FormatUint(*r.MemorySwappiness, 10)); err != nil { + return err + } + } else { + return fmt.Errorf("invalid memory swappiness value: %d (valid range is 0-100)", *r.MemorySwappiness) + } + + return nil +} + +func (s *MemoryGroup) GetStats(path string, stats *cgroups.Stats) error { + const file = 
"memory.stat" + statsFile, err := cgroups.OpenFile(path, file, os.O_RDONLY) + if err != nil { + if os.IsNotExist(err) { + return nil + } + return err + } + defer statsFile.Close() + + sc := bufio.NewScanner(statsFile) + for sc.Scan() { + t, v, err := fscommon.ParseKeyValue(sc.Text()) + if err != nil { + return &parseError{Path: path, File: file, Err: err} + } + stats.MemoryStats.Stats[t] = v + } + stats.MemoryStats.Cache = stats.MemoryStats.Stats["cache"] + + memoryUsage, err := getMemoryData(path, "") + if err != nil { + return err + } + stats.MemoryStats.Usage = memoryUsage + swapUsage, err := getMemoryData(path, "memsw") + if err != nil { + return err + } + stats.MemoryStats.SwapUsage = swapUsage + stats.MemoryStats.SwapOnlyUsage = cgroups.MemoryData{ + Usage: swapUsage.Usage - memoryUsage.Usage, + Failcnt: swapUsage.Failcnt - memoryUsage.Failcnt, + } + kernelUsage, err := getMemoryData(path, "kmem") + if err != nil { + return err + } + stats.MemoryStats.KernelUsage = kernelUsage + kernelTCPUsage, err := getMemoryData(path, "kmem.tcp") + if err != nil { + return err + } + stats.MemoryStats.KernelTCPUsage = kernelTCPUsage + + value, err := fscommon.GetCgroupParamUint(path, "memory.use_hierarchy") + if err != nil { + return err + } + if value == 1 { + stats.MemoryStats.UseHierarchy = true + } + + pagesByNUMA, err := getPageUsageByNUMA(path) + if err != nil { + return err + } + stats.MemoryStats.PageUsageByNUMA = pagesByNUMA + + return nil +} + +func getMemoryData(path, name string) (cgroups.MemoryData, error) { + memoryData := cgroups.MemoryData{} + + moduleName := "memory" + if name != "" { + moduleName = "memory." 
+ name + } + var ( + usage = moduleName + ".usage_in_bytes" + maxUsage = moduleName + ".max_usage_in_bytes" + failcnt = moduleName + ".failcnt" + limit = moduleName + ".limit_in_bytes" + ) + + value, err := fscommon.GetCgroupParamUint(path, usage) + if err != nil { + if name != "" && os.IsNotExist(err) { + // Ignore ENOENT as swap and kmem controllers + // are optional in the kernel. + return cgroups.MemoryData{}, nil + } + return cgroups.MemoryData{}, err + } + memoryData.Usage = value + value, err = fscommon.GetCgroupParamUint(path, maxUsage) + if err != nil { + return cgroups.MemoryData{}, err + } + memoryData.MaxUsage = value + value, err = fscommon.GetCgroupParamUint(path, failcnt) + if err != nil { + return cgroups.MemoryData{}, err + } + memoryData.Failcnt = value + value, err = fscommon.GetCgroupParamUint(path, limit) + if err != nil { + if name == "kmem" && os.IsNotExist(err) { + // Ignore ENOENT as kmem.limit_in_bytes has + // been removed in newer kernels. + return memoryData, nil + } + + return cgroups.MemoryData{}, err + } + memoryData.Limit = value + + return memoryData, nil +} + +func getPageUsageByNUMA(path string) (cgroups.PageUsageByNUMA, error) { + const ( + maxColumns = math.MaxUint8 + 1 + file = "memory.numa_stat" + ) + stats := cgroups.PageUsageByNUMA{} + + fd, err := cgroups.OpenFile(path, file, os.O_RDONLY) + if os.IsNotExist(err) { + return stats, nil + } else if err != nil { + return stats, err + } + defer fd.Close() + + // File format is documented in linux/Documentation/cgroup-v1/memory.txt + // and it looks like this: + // + // total= N0= N1= ... + // file= N0= N1= ... + // anon= N0= N1= ... + // unevictable= N0= N1= ... + // hierarchical_= N0= N1= ... 
+ + scanner := bufio.NewScanner(fd) + for scanner.Scan() { + var field *cgroups.PageStats + + line := scanner.Text() + columns := strings.SplitN(line, " ", maxColumns) + for i, column := range columns { + key, val, ok := strings.Cut(column, "=") + // Some custom kernels have non-standard fields, like + // numa_locality 0 0 0 0 0 0 0 0 0 0 + // numa_exectime 0 + if !ok { + if i == 0 { + // Ignore/skip those. + break + } else { + // The first column was already validated, + // so be strict to the rest. + return stats, malformedLine(path, file, line) + } + } + if i == 0 { // First column: key is name, val is total. + field = getNUMAField(&stats, key) + if field == nil { // unknown field (new kernel?) + break + } + field.Total, err = strconv.ParseUint(val, 0, 64) + if err != nil { + return stats, &parseError{Path: path, File: file, Err: err} + } + field.Nodes = map[uint8]uint64{} + } else { // Subsequent columns: key is N, val is usage. + if len(key) < 2 || key[0] != 'N' { + // This is definitely an error. 
+ return stats, malformedLine(path, file, line) + } + + n, err := strconv.ParseUint(key[1:], 10, 8) + if err != nil { + return stats, &parseError{Path: path, File: file, Err: err} + } + + usage, err := strconv.ParseUint(val, 10, 64) + if err != nil { + return stats, &parseError{Path: path, File: file, Err: err} + } + + field.Nodes[uint8(n)] = usage + } + + } + } + if err := scanner.Err(); err != nil { + return cgroups.PageUsageByNUMA{}, &parseError{Path: path, File: file, Err: err} + } + + return stats, nil +} + +func getNUMAField(stats *cgroups.PageUsageByNUMA, name string) *cgroups.PageStats { + switch name { + case "total": + return &stats.Total + case "file": + return &stats.File + case "anon": + return &stats.Anon + case "unevictable": + return &stats.Unevictable + case "hierarchical_total": + return &stats.Hierarchical.Total + case "hierarchical_file": + return &stats.Hierarchical.File + case "hierarchical_anon": + return &stats.Hierarchical.Anon + case "hierarchical_unevictable": + return &stats.Hierarchical.Unevictable + } + return nil +} diff --git a/fs/memory_test.go b/fs/memory_test.go new file mode 100644 index 0000000..c94279e --- /dev/null +++ b/fs/memory_test.go @@ -0,0 +1,506 @@ +package fs + +import ( + "strconv" + "testing" + + "github.com/opencontainers/cgroups" + "github.com/opencontainers/cgroups/fscommon" +) + +const ( + memoryStatContents = `cache 512 +rss 1024` + memoryUsageContents = "2048\n" + memoryMaxUsageContents = "4096\n" + memoryFailcnt = "100\n" + memoryLimitContents = "8192\n" + memoryUseHierarchyContents = "1\n" + memoryNUMAStatContents = `total=44611 N0=32631 N1=7501 N2=1982 N3=2497 +file=44428 N0=32614 N1=7335 N2=1982 N3=2497 +anon=183 N0=17 N1=166 N2=0 N3=0 +unevictable=0 N0=0 N1=0 N2=0 N3=0 +hierarchical_total=768133 N0=509113 N1=138887 N2=20464 N3=99669 +hierarchical_file=722017 N0=496516 N1=119997 N2=20181 N3=85323 +hierarchical_anon=46096 N0=12597 N1=18890 N2=283 N3=14326 +hierarchical_unevictable=20 N0=0 N1=0 N2=0 N3=20 +` 
+ memoryNUMAStatNoHierarchyContents = `total=44611 N0=32631 N1=7501 N2=1982 N3=2497 +file=44428 N0=32614 N1=7335 N2=1982 N3=2497 +anon=183 N0=17 N1=166 N2=0 N3=0 +unevictable=0 N0=0 N1=0 N2=0 N3=0 +` + // Some custom kernels has extra fields that should be ignored + memoryNUMAStatExtraContents = `numa_locality 0 0 0 0 0 0 0 0 0 0 +numa_exectime 0 +whatever=100 N0=0 +` +) + +func TestMemorySetMemory(t *testing.T) { + path := tempDir(t, "memory") + + const ( + memoryBefore = 314572800 // 300M + memoryAfter = 524288000 // 500M + reservationBefore = 209715200 // 200M + reservationAfter = 314572800 // 300M + ) + + writeFileContents(t, path, map[string]string{ + "memory.limit_in_bytes": strconv.Itoa(memoryBefore), + "memory.soft_limit_in_bytes": strconv.Itoa(reservationBefore), + }) + + r := &cgroups.Resources{ + Memory: memoryAfter, + MemoryReservation: reservationAfter, + } + memory := &MemoryGroup{} + if err := memory.Set(path, r); err != nil { + t.Fatal(err) + } + + value, err := fscommon.GetCgroupParamUint(path, "memory.limit_in_bytes") + if err != nil { + t.Fatal(err) + } + if value != memoryAfter { + t.Fatal("Got the wrong value, set memory.limit_in_bytes failed.") + } + + value, err = fscommon.GetCgroupParamUint(path, "memory.soft_limit_in_bytes") + if err != nil { + t.Fatal(err) + } + if value != reservationAfter { + t.Fatal("Got the wrong value, set memory.soft_limit_in_bytes failed.") + } +} + +func TestMemorySetMemoryswap(t *testing.T) { + path := tempDir(t, "memory") + + const ( + memoryswapBefore = 314572800 // 300M + memoryswapAfter = 524288000 // 500M + ) + + writeFileContents(t, path, map[string]string{ + "memory.memsw.limit_in_bytes": strconv.Itoa(memoryswapBefore), + }) + + r := &cgroups.Resources{ + MemorySwap: memoryswapAfter, + } + memory := &MemoryGroup{} + if err := memory.Set(path, r); err != nil { + t.Fatal(err) + } + + value, err := fscommon.GetCgroupParamUint(path, "memory.memsw.limit_in_bytes") + if err != nil { + t.Fatal(err) + } + if value 
!= memoryswapAfter { + t.Fatal("Got the wrong value, set memory.memsw.limit_in_bytes failed.") + } +} + +func TestMemorySetMemoryLargerThanSwap(t *testing.T) { + path := tempDir(t, "memory") + + const ( + memoryBefore = 314572800 // 300M + memoryswapBefore = 524288000 // 500M + memoryAfter = 629145600 // 600M + memoryswapAfter = 838860800 // 800M + ) + + writeFileContents(t, path, map[string]string{ + "memory.limit_in_bytes": strconv.Itoa(memoryBefore), + "memory.memsw.limit_in_bytes": strconv.Itoa(memoryswapBefore), + // Set will call getMemoryData when memory and swap memory are + // both set, fake these fields so we don't get error. + "memory.usage_in_bytes": "0", + "memory.max_usage_in_bytes": "0", + "memory.failcnt": "0", + }) + + r := &cgroups.Resources{ + Memory: memoryAfter, + MemorySwap: memoryswapAfter, + } + memory := &MemoryGroup{} + if err := memory.Set(path, r); err != nil { + t.Fatal(err) + } + + value, err := fscommon.GetCgroupParamUint(path, "memory.limit_in_bytes") + if err != nil { + t.Fatal(err) + } + if value != memoryAfter { + t.Fatal("Got the wrong value, set memory.limit_in_bytes failed.") + } + + value, err = fscommon.GetCgroupParamUint(path, "memory.memsw.limit_in_bytes") + if err != nil { + t.Fatal(err) + } + if value != memoryswapAfter { + t.Fatal("Got the wrong value, set memory.memsw.limit_in_bytes failed.") + } +} + +func TestMemorySetSwapSmallerThanMemory(t *testing.T) { + path := tempDir(t, "memory") + + const ( + memoryBefore = 629145600 // 600M + memoryswapBefore = 838860800 // 800M + memoryAfter = 314572800 // 300M + memoryswapAfter = 524288000 // 500M + ) + + writeFileContents(t, path, map[string]string{ + "memory.limit_in_bytes": strconv.Itoa(memoryBefore), + "memory.memsw.limit_in_bytes": strconv.Itoa(memoryswapBefore), + }) + + r := &cgroups.Resources{ + Memory: memoryAfter, + MemorySwap: memoryswapAfter, + } + memory := &MemoryGroup{} + if err := memory.Set(path, r); err != nil { + t.Fatal(err) + } + + value, err := 
fscommon.GetCgroupParamUint(path, "memory.limit_in_bytes") + if err != nil { + t.Fatal(err) + } + if value != memoryAfter { + t.Fatalf("Got the wrong value (%d != %d), set memory.limit_in_bytes failed", value, memoryAfter) + } + + value, err = fscommon.GetCgroupParamUint(path, "memory.memsw.limit_in_bytes") + if err != nil { + t.Fatal(err) + } + if value != memoryswapAfter { + t.Fatalf("Got the wrong value (%d != %d), set memory.memsw.limit_in_bytes failed", value, memoryswapAfter) + } +} + +func TestMemorySetMemorySwappinessDefault(t *testing.T) { + path := tempDir(t, "memory") + + swappinessBefore := 60 // default is 60 + swappinessAfter := uint64(0) + + writeFileContents(t, path, map[string]string{ + "memory.swappiness": strconv.Itoa(swappinessBefore), + }) + + r := &cgroups.Resources{ + MemorySwappiness: &swappinessAfter, + } + memory := &MemoryGroup{} + if err := memory.Set(path, r); err != nil { + t.Fatal(err) + } + + value, err := fscommon.GetCgroupParamUint(path, "memory.swappiness") + if err != nil { + t.Fatal(err) + } + if value != swappinessAfter { + t.Fatalf("Got the wrong value (%d), set memory.swappiness = %d failed.", value, swappinessAfter) + } +} + +func TestMemoryStats(t *testing.T) { + path := tempDir(t, "memory") + writeFileContents(t, path, map[string]string{ + "memory.stat": memoryStatContents, + "memory.usage_in_bytes": memoryUsageContents, + "memory.limit_in_bytes": memoryLimitContents, + "memory.max_usage_in_bytes": memoryMaxUsageContents, + "memory.failcnt": memoryFailcnt, + "memory.memsw.usage_in_bytes": memoryUsageContents, + "memory.memsw.max_usage_in_bytes": memoryMaxUsageContents, + "memory.memsw.failcnt": memoryFailcnt, + "memory.memsw.limit_in_bytes": memoryLimitContents, + "memory.kmem.usage_in_bytes": memoryUsageContents, + "memory.kmem.max_usage_in_bytes": memoryMaxUsageContents, + "memory.kmem.failcnt": memoryFailcnt, + "memory.kmem.limit_in_bytes": memoryLimitContents, + "memory.use_hierarchy": memoryUseHierarchyContents, + 
"memory.numa_stat": memoryNUMAStatContents + memoryNUMAStatExtraContents, + }) + + memory := &MemoryGroup{} + actualStats := *cgroups.NewStats() + err := memory.GetStats(path, &actualStats) + if err != nil { + t.Fatal(err) + } + expectedStats := cgroups.MemoryStats{ + Cache: 512, + Usage: cgroups.MemoryData{Usage: 2048, MaxUsage: 4096, Failcnt: 100, Limit: 8192}, + SwapUsage: cgroups.MemoryData{Usage: 2048, MaxUsage: 4096, Failcnt: 100, Limit: 8192}, + SwapOnlyUsage: cgroups.MemoryData{Usage: 0, MaxUsage: 0, Failcnt: 0, Limit: 0}, + KernelUsage: cgroups.MemoryData{Usage: 2048, MaxUsage: 4096, Failcnt: 100, Limit: 8192}, + Stats: map[string]uint64{"cache": 512, "rss": 1024}, + UseHierarchy: true, + PageUsageByNUMA: cgroups.PageUsageByNUMA{ + PageUsageByNUMAInner: cgroups.PageUsageByNUMAInner{ + Total: cgroups.PageStats{Total: 44611, Nodes: map[uint8]uint64{0: 32631, 1: 7501, 2: 1982, 3: 2497}}, + File: cgroups.PageStats{Total: 44428, Nodes: map[uint8]uint64{0: 32614, 1: 7335, 2: 1982, 3: 2497}}, + Anon: cgroups.PageStats{Total: 183, Nodes: map[uint8]uint64{0: 17, 1: 166, 2: 0, 3: 0}}, + Unevictable: cgroups.PageStats{Total: 0, Nodes: map[uint8]uint64{0: 0, 1: 0, 2: 0, 3: 0}}, + }, + Hierarchical: cgroups.PageUsageByNUMAInner{ + Total: cgroups.PageStats{Total: 768133, Nodes: map[uint8]uint64{0: 509113, 1: 138887, 2: 20464, 3: 99669}}, + File: cgroups.PageStats{Total: 722017, Nodes: map[uint8]uint64{0: 496516, 1: 119997, 2: 20181, 3: 85323}}, + Anon: cgroups.PageStats{Total: 46096, Nodes: map[uint8]uint64{0: 12597, 1: 18890, 2: 283, 3: 14326}}, + Unevictable: cgroups.PageStats{Total: 20, Nodes: map[uint8]uint64{0: 0, 1: 0, 2: 0, 3: 20}}, + }, + }, + } + expectMemoryStatEquals(t, expectedStats, actualStats.MemoryStats) +} + +func TestMemoryStatsNoStatFile(t *testing.T) { + path := tempDir(t, "memory") + writeFileContents(t, path, map[string]string{ + "memory.usage_in_bytes": memoryUsageContents, + "memory.max_usage_in_bytes": memoryMaxUsageContents, + 
"memory.limit_in_bytes": memoryLimitContents, + }) + + memory := &MemoryGroup{} + actualStats := *cgroups.NewStats() + err := memory.GetStats(path, &actualStats) + if err != nil { + t.Fatal(err) + } +} + +func TestMemoryStatsNoUsageFile(t *testing.T) { + path := tempDir(t, "memory") + writeFileContents(t, path, map[string]string{ + "memory.stat": memoryStatContents, + "memory.max_usage_in_bytes": memoryMaxUsageContents, + "memory.limit_in_bytes": memoryLimitContents, + }) + + memory := &MemoryGroup{} + actualStats := *cgroups.NewStats() + err := memory.GetStats(path, &actualStats) + if err == nil { + t.Fatal("Expected failure") + } +} + +func TestMemoryStatsNoMaxUsageFile(t *testing.T) { + path := tempDir(t, "memory") + writeFileContents(t, path, map[string]string{ + "memory.stat": memoryStatContents, + "memory.usage_in_bytes": memoryUsageContents, + "memory.limit_in_bytes": memoryLimitContents, + }) + + memory := &MemoryGroup{} + actualStats := *cgroups.NewStats() + err := memory.GetStats(path, &actualStats) + if err == nil { + t.Fatal("Expected failure") + } +} + +func TestMemoryStatsNoLimitInBytesFile(t *testing.T) { + path := tempDir(t, "memory") + writeFileContents(t, path, map[string]string{ + "memory.stat": memoryStatContents, + "memory.usage_in_bytes": memoryUsageContents, + "memory.max_usage_in_bytes": memoryMaxUsageContents, + }) + + memory := &MemoryGroup{} + actualStats := *cgroups.NewStats() + err := memory.GetStats(path, &actualStats) + if err == nil { + t.Fatal("Expected failure") + } +} + +func TestMemoryStatsBadStatFile(t *testing.T) { + path := tempDir(t, "memory") + writeFileContents(t, path, map[string]string{ + "memory.stat": "rss rss", + "memory.usage_in_bytes": memoryUsageContents, + "memory.max_usage_in_bytes": memoryMaxUsageContents, + "memory.limit_in_bytes": memoryLimitContents, + }) + + memory := &MemoryGroup{} + actualStats := *cgroups.NewStats() + err := memory.GetStats(path, &actualStats) + if err == nil { + t.Fatal("Expected 
failure") + } +} + +func TestMemoryStatsBadUsageFile(t *testing.T) { + path := tempDir(t, "memory") + writeFileContents(t, path, map[string]string{ + "memory.stat": memoryStatContents, + "memory.usage_in_bytes": "bad", + "memory.max_usage_in_bytes": memoryMaxUsageContents, + "memory.limit_in_bytes": memoryLimitContents, + }) + + memory := &MemoryGroup{} + actualStats := *cgroups.NewStats() + err := memory.GetStats(path, &actualStats) + if err == nil { + t.Fatal("Expected failure") + } +} + +func TestMemoryStatsBadMaxUsageFile(t *testing.T) { + path := tempDir(t, "memory") + writeFileContents(t, path, map[string]string{ + "memory.stat": memoryStatContents, + "memory.usage_in_bytes": memoryUsageContents, + "memory.max_usage_in_bytes": "bad", + "memory.limit_in_bytes": memoryLimitContents, + }) + + memory := &MemoryGroup{} + actualStats := *cgroups.NewStats() + err := memory.GetStats(path, &actualStats) + if err == nil { + t.Fatal("Expected failure") + } +} + +func TestMemoryStatsBadLimitInBytesFile(t *testing.T) { + path := tempDir(t, "memory") + writeFileContents(t, path, map[string]string{ + "memory.stat": memoryStatContents, + "memory.usage_in_bytes": memoryUsageContents, + "memory.max_usage_in_bytes": memoryMaxUsageContents, + "memory.limit_in_bytes": "bad", + }) + + memory := &MemoryGroup{} + actualStats := *cgroups.NewStats() + err := memory.GetStats(path, &actualStats) + if err == nil { + t.Fatal("Expected failure") + } +} + +func TestMemorySetOomControl(t *testing.T) { + path := tempDir(t, "memory") + + const ( + oomKillDisable = 1 // disable oom killer, default is 0 + ) + + writeFileContents(t, path, map[string]string{ + "memory.oom_control": strconv.Itoa(oomKillDisable), + }) + + memory := &MemoryGroup{} + r := &cgroups.Resources{} + if err := memory.Set(path, r); err != nil { + t.Fatal(err) + } + + value, err := fscommon.GetCgroupParamUint(path, "memory.oom_control") + if err != nil { + t.Fatal(err) + } + if value != oomKillDisable { + t.Fatalf("Got the 
wrong value, set memory.oom_control failed.") + } +} + +func TestNoHierarchicalNumaStat(t *testing.T) { + path := tempDir(t, "memory") + writeFileContents(t, path, map[string]string{ + "memory.numa_stat": memoryNUMAStatNoHierarchyContents + memoryNUMAStatExtraContents, + }) + + actualStats, err := getPageUsageByNUMA(path) + if err != nil { + t.Fatal(err) + } + pageUsageByNUMA := cgroups.PageUsageByNUMA{ + PageUsageByNUMAInner: cgroups.PageUsageByNUMAInner{ + Total: cgroups.PageStats{Total: 44611, Nodes: map[uint8]uint64{0: 32631, 1: 7501, 2: 1982, 3: 2497}}, + File: cgroups.PageStats{Total: 44428, Nodes: map[uint8]uint64{0: 32614, 1: 7335, 2: 1982, 3: 2497}}, + Anon: cgroups.PageStats{Total: 183, Nodes: map[uint8]uint64{0: 17, 1: 166, 2: 0, 3: 0}}, + Unevictable: cgroups.PageStats{Total: 0, Nodes: map[uint8]uint64{0: 0, 1: 0, 2: 0, 3: 0}}, + }, + Hierarchical: cgroups.PageUsageByNUMAInner{}, + } + expectPageUsageByNUMAEquals(t, pageUsageByNUMA, actualStats) +} + +func TestBadNumaStat(t *testing.T) { + memoryNUMAStatBadContents := []struct { + desc, contents string + }{ + { + desc: "Nx where x is not a number", + contents: `total=44611 N0=44611, +file=44428 Nx=0 +`, + }, { + desc: "Nx where x > 255", + contents: `total=44611 N333=444`, + }, { + desc: "Nx argument missing", + contents: `total=44611 N0=123 N1=`, + }, { + desc: "Nx argument is not a number", + contents: `total=44611 N0=123 N1=a`, + }, { + desc: "Missing = after Nx", + contents: `total=44611 N0=123 N1`, + }, { + desc: "No Nx at non-first position", + contents: `total=44611 N0=32631 +file=44428 N0=32614 +anon=183 N0=12 badone +`, + }, + } + path := tempDir(t, "memory") + for _, c := range memoryNUMAStatBadContents { + writeFileContents(t, path, map[string]string{ + "memory.numa_stat": c.contents, + }) + + _, err := getPageUsageByNUMA(path) + if err == nil { + t.Errorf("case %q: expected error, got nil", c.desc) + } + } +} + +func TestWithoutNumaStat(t *testing.T) { + path := tempDir(t, "memory") + + 
actualStats, err := getPageUsageByNUMA(path) + if err != nil { + t.Fatal(err) + } + expectPageUsageByNUMAEquals(t, cgroups.PageUsageByNUMA{}, actualStats) +} diff --git a/fs/name.go b/fs/name.go new file mode 100644 index 0000000..2864351 --- /dev/null +++ b/fs/name.go @@ -0,0 +1,30 @@ +package fs + +import ( + "github.com/opencontainers/cgroups" +) + +type NameGroup struct { + GroupName string + Join bool +} + +func (s *NameGroup) Name() string { + return s.GroupName +} + +func (s *NameGroup) Apply(path string, _ *cgroups.Resources, pid int) error { + if s.Join { + // Ignore errors if the named cgroup does not exist. + _ = apply(path, pid) + } + return nil +} + +func (s *NameGroup) Set(_ string, _ *cgroups.Resources) error { + return nil +} + +func (s *NameGroup) GetStats(path string, stats *cgroups.Stats) error { + return nil +} diff --git a/fs/net_cls.go b/fs/net_cls.go new file mode 100644 index 0000000..2bd6c5a --- /dev/null +++ b/fs/net_cls.go @@ -0,0 +1,31 @@ +package fs + +import ( + "strconv" + + "github.com/opencontainers/cgroups" +) + +type NetClsGroup struct{} + +func (s *NetClsGroup) Name() string { + return "net_cls" +} + +func (s *NetClsGroup) Apply(path string, _ *cgroups.Resources, pid int) error { + return apply(path, pid) +} + +func (s *NetClsGroup) Set(path string, r *cgroups.Resources) error { + if r.NetClsClassid != 0 { + if err := cgroups.WriteFile(path, "net_cls.classid", strconv.FormatUint(uint64(r.NetClsClassid), 10)); err != nil { + return err + } + } + + return nil +} + +func (s *NetClsGroup) GetStats(path string, stats *cgroups.Stats) error { + return nil +} diff --git a/fs/net_cls_test.go b/fs/net_cls_test.go new file mode 100644 index 0000000..2252cdd --- /dev/null +++ b/fs/net_cls_test.go @@ -0,0 +1,41 @@ +package fs + +import ( + "strconv" + "testing" + + "github.com/opencontainers/cgroups" + "github.com/opencontainers/cgroups/fscommon" +) + +const ( + classidBefore = 0x100002 + classidAfter = 0x100001 +) + +func 
TestNetClsSetClassid(t *testing.T) { + path := tempDir(t, "net_cls") + + writeFileContents(t, path, map[string]string{ + "net_cls.classid": strconv.FormatUint(classidBefore, 10), + }) + + r := &cgroups.Resources{ + NetClsClassid: classidAfter, + } + netcls := &NetClsGroup{} + if err := netcls.Set(path, r); err != nil { + t.Fatal(err) + } + + // As we are in mock environment, we can't get correct value of classid from + // net_cls.classid. + // So. we just judge if we successfully write classid into file + value, err := fscommon.GetCgroupParamUint(path, "net_cls.classid") + if err != nil { + t.Fatal(err) + } + if value != classidAfter { + t.Fatal("Got the wrong value, set net_cls.classid failed.") + } +} diff --git a/fs/net_prio.go b/fs/net_prio.go new file mode 100644 index 0000000..b51682b --- /dev/null +++ b/fs/net_prio.go @@ -0,0 +1,29 @@ +package fs + +import ( + "github.com/opencontainers/cgroups" +) + +type NetPrioGroup struct{} + +func (s *NetPrioGroup) Name() string { + return "net_prio" +} + +func (s *NetPrioGroup) Apply(path string, _ *cgroups.Resources, pid int) error { + return apply(path, pid) +} + +func (s *NetPrioGroup) Set(path string, r *cgroups.Resources) error { + for _, prioMap := range r.NetPrioIfpriomap { + if err := cgroups.WriteFile(path, "net_prio.ifpriomap", prioMap.CgroupString()); err != nil { + return err + } + } + + return nil +} + +func (s *NetPrioGroup) GetStats(path string, stats *cgroups.Stats) error { + return nil +} diff --git a/fs/net_prio_test.go b/fs/net_prio_test.go new file mode 100644 index 0000000..1a82be4 --- /dev/null +++ b/fs/net_prio_test.go @@ -0,0 +1,36 @@ +package fs + +import ( + "strings" + "testing" + + "github.com/opencontainers/cgroups" + "github.com/opencontainers/cgroups/fscommon" +) + +var prioMap = []*cgroups.IfPrioMap{ + { + Interface: "test", + Priority: 5, + }, +} + +func TestNetPrioSetIfPrio(t *testing.T) { + path := tempDir(t, "net_prio") + + r := &cgroups.Resources{ + NetPrioIfpriomap: prioMap, + } + 
netPrio := &NetPrioGroup{} + if err := netPrio.Set(path, r); err != nil { + t.Fatal(err) + } + + value, err := fscommon.GetCgroupParamString(path, "net_prio.ifpriomap") + if err != nil { + t.Fatal(err) + } + if !strings.Contains(value, "test 5") { + t.Fatal("Got the wrong value, set net_prio.ifpriomap failed.") + } +} diff --git a/fs/paths.go b/fs/paths.go new file mode 100644 index 0000000..edbe041 --- /dev/null +++ b/fs/paths.go @@ -0,0 +1,169 @@ +package fs + +import ( + "errors" + "os" + "path/filepath" + "sync" + + "golang.org/x/sys/unix" + + "github.com/opencontainers/cgroups" + "github.com/opencontainers/cgroups/internal/path" +) + +// The absolute path to the root of the cgroup hierarchies. +var ( + cgroupRootLock sync.Mutex + cgroupRoot string +) + +const defaultCgroupRoot = "/sys/fs/cgroup" + +func initPaths(cg *cgroups.Cgroup) (map[string]string, error) { + root, err := rootPath() + if err != nil { + return nil, err + } + + inner, err := path.Inner(cg) + if err != nil { + return nil, err + } + + paths := make(map[string]string) + for _, sys := range subsystems { + name := sys.Name() + path, err := subsysPath(root, inner, name) + if err != nil { + // The non-presence of the devices subsystem + // is considered fatal for security reasons. + if cgroups.IsNotFound(err) && (cg.SkipDevices || name != "devices") { + continue + } + + return nil, err + } + paths[name] = path + } + + return paths, nil +} + +func tryDefaultCgroupRoot() string { + var st, pst unix.Stat_t + + // (1) it should be a directory... + err := unix.Lstat(defaultCgroupRoot, &st) + if err != nil || st.Mode&unix.S_IFDIR == 0 { + return "" + } + + // (2) ... and a mount point ... + err = unix.Lstat(filepath.Dir(defaultCgroupRoot), &pst) + if err != nil { + return "" + } + + if st.Dev == pst.Dev { + // parent dir has the same dev -- not a mount point + return "" + } + + // (3) ... of 'tmpfs' fs type. 
+ var fst unix.Statfs_t + err = unix.Statfs(defaultCgroupRoot, &fst) + if err != nil || fst.Type != unix.TMPFS_MAGIC { + return "" + } + + // (4) it should have at least 1 entry ... + dir, err := os.Open(defaultCgroupRoot) + if err != nil { + return "" + } + defer dir.Close() + names, err := dir.Readdirnames(1) + if err != nil { + return "" + } + if len(names) < 1 { + return "" + } + // ... which is a cgroup mount point. + err = unix.Statfs(filepath.Join(defaultCgroupRoot, names[0]), &fst) + if err != nil || fst.Type != unix.CGROUP_SUPER_MAGIC { + return "" + } + + return defaultCgroupRoot +} + +// rootPath finds and returns path to the root of the cgroup hierarchies. +func rootPath() (string, error) { + cgroupRootLock.Lock() + defer cgroupRootLock.Unlock() + + if cgroupRoot != "" { + return cgroupRoot, nil + } + + // fast path + cgroupRoot = tryDefaultCgroupRoot() + if cgroupRoot != "" { + return cgroupRoot, nil + } + + // slow path: parse mountinfo + mi, err := cgroups.GetCgroupMounts(false) + if err != nil { + return "", err + } + if len(mi) < 1 { + return "", errors.New("no cgroup mount found in mountinfo") + } + + // Get the first cgroup mount (e.g. "/sys/fs/cgroup/memory"), + // use its parent directory. + root := filepath.Dir(mi[0].Mountpoint) + + if _, err := os.Stat(root); err != nil { + return "", err + } + + cgroupRoot = root + return cgroupRoot, nil +} + +func subsysPath(root, inner, subsystem string) (string, error) { + // If the cgroup name/path is absolute do not look relative to the cgroup of the init process. + if filepath.IsAbs(inner) { + mnt, err := cgroups.FindCgroupMountpoint(root, subsystem) + // If we didn't mount the subsystem, there is no point we make the path. + if err != nil { + return "", err + } + + // Sometimes subsystems can be mounted together as 'cpu,cpuacct'. + return filepath.Join(root, filepath.Base(mnt), inner), nil + } + + // Use GetOwnCgroupPath for dind-like cases, when cgroupns is not + // available. This is ugly. 
+ parentPath, err := cgroups.GetOwnCgroupPath(subsystem) + if err != nil { + return "", err + } + + return filepath.Join(parentPath, inner), nil +} + +func apply(path string, pid int) error { + if path == "" { + return nil + } + if err := os.MkdirAll(path, 0o755); err != nil { + return err + } + return cgroups.WriteCgroupProc(path, pid) +} diff --git a/fs/paths_test.go b/fs/paths_test.go new file mode 100644 index 0000000..42b8b66 --- /dev/null +++ b/fs/paths_test.go @@ -0,0 +1,104 @@ +package fs + +import ( + "path/filepath" + "strings" + "testing" + + "github.com/opencontainers/cgroups" + "github.com/opencontainers/cgroups/internal/path" +) + +func TestInvalidCgroupPath(t *testing.T) { + if cgroups.IsCgroup2UnifiedMode() { + t.Skip("cgroup v2 is not supported") + } + + root, err := rootPath() + if err != nil { + t.Fatalf("couldn't get cgroup root: %v", err) + } + + testCases := []struct { + test string + path, name, parent string + }{ + { + test: "invalid cgroup path", + path: "../../../../../../../../../../some/path", + }, + { + test: "invalid absolute cgroup path", + path: "/../../../../../../../../../../some/path", + }, + { + test: "invalid cgroup parent", + parent: "../../../../../../../../../../some/path", + name: "name", + }, + { + test: "invalid absolute cgroup parent", + parent: "/../../../../../../../../../../some/path", + name: "name", + }, + { + test: "invalid cgroup name", + parent: "parent", + name: "../../../../../../../../../../some/path", + }, + { + test: "invalid absolute cgroup name", + parent: "parent", + name: "/../../../../../../../../../../some/path", + }, + { + test: "invalid cgroup name and parent", + parent: "../../../../../../../../../../some/path", + name: "../../../../../../../../../../some/path", + }, + { + test: "invalid absolute cgroup name and parent", + parent: "/../../../../../../../../../../some/path", + name: "/../../../../../../../../../../some/path", + }, + } + + for _, tc := range testCases { + t.Run(tc.test, func(t 
*testing.T) { + config := &cgroups.Cgroup{Path: tc.path, Name: tc.name, Parent: tc.parent} + + inner, err := path.Inner(config) + if err != nil { + t.Fatalf("couldn't get cgroup data: %v", err) + } + + // Make sure the final inner path doesn't go outside the cgroup mountpoint. + if strings.HasPrefix(inner, "..") { + t.Errorf("SECURITY: cgroup innerPath is outside cgroup mountpoint!") + } + + // Double-check, using an actual cgroup. + deviceRoot := filepath.Join(root, "devices") + devicePath, err := subsysPath(root, inner, "devices") + if err != nil { + t.Fatalf("couldn't get cgroup path: %v", err) + } + if !strings.HasPrefix(devicePath, deviceRoot) { + t.Errorf("SECURITY: cgroup path() is outside cgroup mountpoint!") + } + }) + } +} + +func TestTryDefaultCgroupRoot(t *testing.T) { + res := tryDefaultCgroupRoot() + exp := defaultCgroupRoot + if cgroups.IsCgroup2UnifiedMode() { + // checking that tryDefaultCgroupRoot does return "" + // in case /sys/fs/cgroup is not cgroup v1 root dir. + exp = "" + } + if res != exp { + t.Errorf("tryDefaultCgroupRoot: want %q, got %q", exp, res) + } +} diff --git a/fs/perf_event.go b/fs/perf_event.go new file mode 100644 index 0000000..929c412 --- /dev/null +++ b/fs/perf_event.go @@ -0,0 +1,23 @@ +package fs + +import ( + "github.com/opencontainers/cgroups" +) + +type PerfEventGroup struct{} + +func (s *PerfEventGroup) Name() string { + return "perf_event" +} + +func (s *PerfEventGroup) Apply(path string, _ *cgroups.Resources, pid int) error { + return apply(path, pid) +} + +func (s *PerfEventGroup) Set(_ string, _ *cgroups.Resources) error { + return nil +} + +func (s *PerfEventGroup) GetStats(path string, stats *cgroups.Stats) error { + return nil +} diff --git a/fs/pids.go b/fs/pids.go new file mode 100644 index 0000000..9319761 --- /dev/null +++ b/fs/pids.go @@ -0,0 +1,61 @@ +package fs + +import ( + "math" + "strconv" + + "github.com/opencontainers/cgroups" + "github.com/opencontainers/cgroups/fscommon" +) + +type PidsGroup 
struct{} + +func (s *PidsGroup) Name() string { + return "pids" +} + +func (s *PidsGroup) Apply(path string, _ *cgroups.Resources, pid int) error { + return apply(path, pid) +} + +func (s *PidsGroup) Set(path string, r *cgroups.Resources) error { + if r.PidsLimit != 0 { + // "max" is the fallback value. + limit := "max" + + if r.PidsLimit > 0 { + limit = strconv.FormatInt(r.PidsLimit, 10) + } + + if err := cgroups.WriteFile(path, "pids.max", limit); err != nil { + return err + } + } + + return nil +} + +func (s *PidsGroup) GetStats(path string, stats *cgroups.Stats) error { + if !cgroups.PathExists(path) { + return nil + } + current, err := fscommon.GetCgroupParamUint(path, "pids.current") + if err != nil { + return err + } + + max, err := fscommon.GetCgroupParamUint(path, "pids.max") + if err != nil { + return err + } + // If no limit is set, read from pids.max returns "max", which is + // converted to MaxUint64 by GetCgroupParamUint. Historically, we + // represent "no limit" for pids as 0, thus this conversion. 
+ if max == math.MaxUint64 { + max = 0 + } + + stats.PidsStats.Current = current + stats.PidsStats.Limit = max + return nil +} diff --git a/fs/pids_test.go b/fs/pids_test.go new file mode 100644 index 0000000..a33db7a --- /dev/null +++ b/fs/pids_test.go @@ -0,0 +1,108 @@ +package fs + +import ( + "strconv" + "testing" + + "github.com/opencontainers/cgroups" + "github.com/opencontainers/cgroups/fscommon" +) + +const ( + maxUnlimited = -1 + maxLimited = 1024 +) + +func TestPidsSetMax(t *testing.T) { + path := tempDir(t, "pids") + + writeFileContents(t, path, map[string]string{ + "pids.max": "max", + }) + + r := &cgroups.Resources{ + PidsLimit: maxLimited, + } + pids := &PidsGroup{} + if err := pids.Set(path, r); err != nil { + t.Fatal(err) + } + + value, err := fscommon.GetCgroupParamUint(path, "pids.max") + if err != nil { + t.Fatal(err) + } + if value != maxLimited { + t.Fatalf("Expected %d, got %d for setting pids.max - limited", maxLimited, value) + } +} + +func TestPidsSetUnlimited(t *testing.T) { + path := tempDir(t, "pids") + + writeFileContents(t, path, map[string]string{ + "pids.max": strconv.Itoa(maxLimited), + }) + + r := &cgroups.Resources{ + PidsLimit: maxUnlimited, + } + pids := &PidsGroup{} + if err := pids.Set(path, r); err != nil { + t.Fatal(err) + } + + value, err := fscommon.GetCgroupParamString(path, "pids.max") + if err != nil { + t.Fatal(err) + } + if value != "max" { + t.Fatalf("Expected %s, got %s for setting pids.max - unlimited", "max", value) + } +} + +func TestPidsStats(t *testing.T) { + path := tempDir(t, "pids") + + writeFileContents(t, path, map[string]string{ + "pids.current": strconv.Itoa(1337), + "pids.max": strconv.Itoa(maxLimited), + }) + + pids := &PidsGroup{} + stats := *cgroups.NewStats() + if err := pids.GetStats(path, &stats); err != nil { + t.Fatal(err) + } + + if stats.PidsStats.Current != 1337 { + t.Fatalf("Expected %d, got %d for pids.current", 1337, stats.PidsStats.Current) + } + + if stats.PidsStats.Limit != maxLimited { 
+ t.Fatalf("Expected %d, got %d for pids.max", maxLimited, stats.PidsStats.Limit) + } +} + +func TestPidsStatsUnlimited(t *testing.T) { + path := tempDir(t, "pids") + + writeFileContents(t, path, map[string]string{ + "pids.current": strconv.Itoa(4096), + "pids.max": "max", + }) + + pids := &PidsGroup{} + stats := *cgroups.NewStats() + if err := pids.GetStats(path, &stats); err != nil { + t.Fatal(err) + } + + if stats.PidsStats.Current != 4096 { + t.Fatalf("Expected %d, got %d for pids.current", 4096, stats.PidsStats.Current) + } + + if stats.PidsStats.Limit != 0 { + t.Fatalf("Expected %d, got %d for pids.max", 0, stats.PidsStats.Limit) + } +} diff --git a/fs/rdma.go b/fs/rdma.go new file mode 100644 index 0000000..4b17536 --- /dev/null +++ b/fs/rdma.go @@ -0,0 +1,24 @@ +package fs + +import ( + "github.com/opencontainers/cgroups" + "github.com/opencontainers/cgroups/fscommon" +) + +type RdmaGroup struct{} + +func (s *RdmaGroup) Name() string { + return "rdma" +} + +func (s *RdmaGroup) Apply(path string, _ *cgroups.Resources, pid int) error { + return apply(path, pid) +} + +func (s *RdmaGroup) Set(path string, r *cgroups.Resources) error { + return fscommon.RdmaSet(path, r) +} + +func (s *RdmaGroup) GetStats(path string, stats *cgroups.Stats) error { + return fscommon.RdmaGetStats(path, stats) +} diff --git a/fs/stats_util_test.go b/fs/stats_util_test.go new file mode 100644 index 0000000..330dced --- /dev/null +++ b/fs/stats_util_test.go @@ -0,0 +1,138 @@ +package fs + +import ( + "errors" + "fmt" + "reflect" + "testing" + + "github.com/opencontainers/cgroups" +) + +func blkioStatEntryEquals(expected, actual []cgroups.BlkioStatEntry) error { + if len(expected) != len(actual) { + return errors.New("blkioStatEntries length do not match") + } + for i, expValue := range expected { + actValue := actual[i] + if expValue != actValue { + return fmt.Errorf("expected: %v, actual: %v", expValue, actValue) + } + } + return nil +} + +func expectBlkioStatsEquals(t *testing.T, 
expected, actual cgroups.BlkioStats) { + t.Helper() + if err := blkioStatEntryEquals(expected.IoServiceBytesRecursive, actual.IoServiceBytesRecursive); err != nil { + t.Errorf("blkio IoServiceBytesRecursive do not match: %s", err) + } + + if err := blkioStatEntryEquals(expected.IoServicedRecursive, actual.IoServicedRecursive); err != nil { + t.Errorf("blkio IoServicedRecursive do not match: %s", err) + } + + if err := blkioStatEntryEquals(expected.IoQueuedRecursive, actual.IoQueuedRecursive); err != nil { + t.Errorf("blkio IoQueuedRecursive do not match: %s", err) + } + + if err := blkioStatEntryEquals(expected.SectorsRecursive, actual.SectorsRecursive); err != nil { + t.Errorf("blkio SectorsRecursive do not match: %s", err) + } + + if err := blkioStatEntryEquals(expected.IoServiceTimeRecursive, actual.IoServiceTimeRecursive); err != nil { + t.Errorf("blkio IoServiceTimeRecursive do not match: %s", err) + } + + if err := blkioStatEntryEquals(expected.IoWaitTimeRecursive, actual.IoWaitTimeRecursive); err != nil { + t.Errorf("blkio IoWaitTimeRecursive do not match: %s", err) + } + + if err := blkioStatEntryEquals(expected.IoMergedRecursive, actual.IoMergedRecursive); err != nil { + t.Errorf("blkio IoMergedRecursive do not match: expected: %v, actual: %v", expected.IoMergedRecursive, actual.IoMergedRecursive) + } + + if err := blkioStatEntryEquals(expected.IoTimeRecursive, actual.IoTimeRecursive); err != nil { + t.Errorf("blkio IoTimeRecursive do not match: %s", err) + } +} + +func expectThrottlingDataEquals(t *testing.T, expected, actual cgroups.ThrottlingData) { + t.Helper() + if expected != actual { + t.Errorf("Expected throttling data: %v, actual: %v", expected, actual) + } +} + +func expectHugetlbStatEquals(t *testing.T, expected, actual cgroups.HugetlbStats) { + t.Helper() + if expected != actual { + t.Errorf("Expected hugetlb stats: %v, actual: %v", expected, actual) + } +} + +func expectMemoryStatEquals(t *testing.T, expected, actual cgroups.MemoryStats) { + 
t.Helper() + expectMemoryDataEquals(t, expected.Usage, actual.Usage) + expectMemoryDataEquals(t, expected.SwapUsage, actual.SwapUsage) + expectMemoryDataEquals(t, expected.KernelUsage, actual.KernelUsage) + expectPageUsageByNUMAEquals(t, expected.PageUsageByNUMA, actual.PageUsageByNUMA) + + if expected.UseHierarchy != actual.UseHierarchy { + t.Errorf("Expected memory use hierarchy: %v, actual: %v", expected.UseHierarchy, actual.UseHierarchy) + } + + for key, expValue := range expected.Stats { + actValue, ok := actual.Stats[key] + if !ok { + t.Errorf("Expected memory stat key %s not found", key) + } + if expValue != actValue { + t.Errorf("Expected memory stat value: %d, actual: %d", expValue, actValue) + } + } +} + +func expectMemoryDataEquals(t *testing.T, expected, actual cgroups.MemoryData) { + t.Helper() + if expected.Usage != actual.Usage { + t.Errorf("Expected memory usage: %d, actual: %d", expected.Usage, actual.Usage) + } + if expected.MaxUsage != actual.MaxUsage { + t.Errorf("Expected memory max usage: %d, actual: %d", expected.MaxUsage, actual.MaxUsage) + } + if expected.Failcnt != actual.Failcnt { + t.Errorf("Expected memory failcnt %d, actual: %d", expected.Failcnt, actual.Failcnt) + } + if expected.Limit != actual.Limit { + t.Errorf("Expected memory limit: %d, actual: %d", expected.Limit, actual.Limit) + } +} + +func expectPageUsageByNUMAEquals(t *testing.T, expected, actual cgroups.PageUsageByNUMA) { + t.Helper() + if !reflect.DeepEqual(expected.Total, actual.Total) { + t.Errorf("Expected total page usage by NUMA: %#v, actual: %#v", expected.Total, actual.Total) + } + if !reflect.DeepEqual(expected.File, actual.File) { + t.Errorf("Expected file page usage by NUMA: %#v, actual: %#v", expected.File, actual.File) + } + if !reflect.DeepEqual(expected.Anon, actual.Anon) { + t.Errorf("Expected anon page usage by NUMA: %#v, actual: %#v", expected.Anon, actual.Anon) + } + if !reflect.DeepEqual(expected.Unevictable, actual.Unevictable) { + t.Errorf("Expected 
unevictable page usage by NUMA: %#v, actual: %#v", expected.Unevictable, actual.Unevictable)
+	}
+	if !reflect.DeepEqual(expected.Hierarchical.Total, actual.Hierarchical.Total) {
+		t.Errorf("Expected hierarchical total page usage by NUMA: %#v, actual: %#v", expected.Hierarchical.Total, actual.Hierarchical.Total)
+	}
+	if !reflect.DeepEqual(expected.Hierarchical.File, actual.Hierarchical.File) {
+		t.Errorf("Expected hierarchical file page usage by NUMA: %#v, actual: %#v", expected.Hierarchical.File, actual.Hierarchical.File)
+	}
+	if !reflect.DeepEqual(expected.Hierarchical.Anon, actual.Hierarchical.Anon) {
+		t.Errorf("Expected hierarchical anon page usage by NUMA: %#v, actual: %#v", expected.Hierarchical.Anon, actual.Hierarchical.Anon)
+	}
+	if !reflect.DeepEqual(expected.Hierarchical.Unevictable, actual.Hierarchical.Unevictable) {
+		// Fixed copy-paste error: this branch previously reported "total"
+		// although it compares the Unevictable field.
+		t.Errorf("Expected hierarchical unevictable page usage by NUMA: %#v, actual: %#v", expected.Hierarchical.Unevictable, actual.Hierarchical.Unevictable)
+	}
+}
diff --git a/fs/util_test.go b/fs/util_test.go
new file mode 100644
index 0000000..e620fda
--- /dev/null
+++ b/fs/util_test.go
@@ -0,0 +1,39 @@
+/*
+Utility for testing cgroup operations.
+
+Creates a mock of the cgroup filesystem for the duration of the test.
+*/
+package fs
+
+import (
+	"os"
+	"path/filepath"
+	"testing"
+
+	"github.com/opencontainers/cgroups"
+)
+
+func init() {
+	cgroups.TestMode = true
+}
+
+// tempDir creates a new test directory for the specified subsystem.
+func tempDir(t testing.TB, subsystem string) string {
+	path := filepath.Join(t.TempDir(), subsystem)
+	// Ensure the full mock cgroup path exists.
+	if err := os.Mkdir(path, 0o755); err != nil {
+		t.Fatal(err)
+	}
+	return path
+}
+
+// writeFileContents writes the specified contents on the mock of the specified
+// cgroup files.
+func writeFileContents(t testing.TB, path string, fileContents map[string]string) { + for file, contents := range fileContents { + err := cgroups.WriteFile(path, file, contents) + if err != nil { + t.Fatal(err) + } + } +} diff --git a/fs2/cpu.go b/fs2/cpu.go new file mode 100644 index 0000000..8eae673 --- /dev/null +++ b/fs2/cpu.go @@ -0,0 +1,117 @@ +package fs2 + +import ( + "bufio" + "errors" + "os" + "strconv" + + "golang.org/x/sys/unix" + + "github.com/opencontainers/cgroups" + "github.com/opencontainers/cgroups/fscommon" +) + +func isCPUSet(r *cgroups.Resources) bool { + return r.CpuWeight != 0 || r.CpuQuota != 0 || r.CpuPeriod != 0 || r.CPUIdle != nil || r.CpuBurst != nil +} + +func setCPU(dirPath string, r *cgroups.Resources) error { + if !isCPUSet(r) { + return nil + } + + if r.CPUIdle != nil { + if err := cgroups.WriteFile(dirPath, "cpu.idle", strconv.FormatInt(*r.CPUIdle, 10)); err != nil { + return err + } + } + + // NOTE: .CpuShares is not used here. Conversion is the caller's responsibility. + if r.CpuWeight != 0 { + if err := cgroups.WriteFile(dirPath, "cpu.weight", strconv.FormatUint(r.CpuWeight, 10)); err != nil { + return err + } + } + + var burst string + if r.CpuBurst != nil { + burst = strconv.FormatUint(*r.CpuBurst, 10) + if err := cgroups.WriteFile(dirPath, "cpu.max.burst", burst); err != nil { + // Sometimes when the burst to be set is larger + // than the current one, it is rejected by the kernel + // (EINVAL) as old_quota/new_burst exceeds the parent + // cgroup quota limit. If this happens and the quota is + // going to be set, ignore the error for now and retry + // after setting the quota. 
+ if !errors.Is(err, unix.EINVAL) || r.CpuQuota == 0 { + return err + } + } else { + burst = "" + } + } + if r.CpuQuota != 0 || r.CpuPeriod != 0 { + str := "max" + if r.CpuQuota > 0 { + str = strconv.FormatInt(r.CpuQuota, 10) + } + period := r.CpuPeriod + if period == 0 { + // This default value is documented in + // https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html + period = 100000 + } + str += " " + strconv.FormatUint(period, 10) + if err := cgroups.WriteFile(dirPath, "cpu.max", str); err != nil { + return err + } + if burst != "" { + if err := cgroups.WriteFile(dirPath, "cpu.max.burst", burst); err != nil { + return err + } + } + } + + return nil +} + +func statCpu(dirPath string, stats *cgroups.Stats) error { + const file = "cpu.stat" + f, err := cgroups.OpenFile(dirPath, file, os.O_RDONLY) + if err != nil { + return err + } + defer f.Close() + + sc := bufio.NewScanner(f) + for sc.Scan() { + t, v, err := fscommon.ParseKeyValue(sc.Text()) + if err != nil { + return &parseError{Path: dirPath, File: file, Err: err} + } + switch t { + case "usage_usec": + stats.CpuStats.CpuUsage.TotalUsage = v * 1000 + + case "user_usec": + stats.CpuStats.CpuUsage.UsageInUsermode = v * 1000 + + case "system_usec": + stats.CpuStats.CpuUsage.UsageInKernelmode = v * 1000 + + case "nr_periods": + stats.CpuStats.ThrottlingData.Periods = v + + case "nr_throttled": + stats.CpuStats.ThrottlingData.ThrottledPeriods = v + + case "throttled_usec": + stats.CpuStats.ThrottlingData.ThrottledTime = v * 1000 + } + } + if err := sc.Err(); err != nil { + return &parseError{Path: dirPath, File: file, Err: err} + } + return nil +} diff --git a/fs2/cpuset.go b/fs2/cpuset.go new file mode 100644 index 0000000..9399919 --- /dev/null +++ b/fs2/cpuset.go @@ -0,0 +1,27 @@ +package fs2 + +import ( + "github.com/opencontainers/cgroups" +) + +func isCpusetSet(r *cgroups.Resources) bool { + return r.CpusetCpus != "" || r.CpusetMems != "" +} + +func setCpuset(dirPath string, r 
*cgroups.Resources) error {
+	if !isCpusetSet(r) {
+		return nil
+	}
+
+	if r.CpusetCpus != "" {
+		if err := cgroups.WriteFile(dirPath, "cpuset.cpus", r.CpusetCpus); err != nil {
+			return err
+		}
+	}
+	if r.CpusetMems != "" {
+		if err := cgroups.WriteFile(dirPath, "cpuset.mems", r.CpusetMems); err != nil {
+			return err
+		}
+	}
+	return nil
+}
diff --git a/fs2/create.go b/fs2/create.go
new file mode 100644
index 0000000..565ca88
--- /dev/null
+++ b/fs2/create.go
@@ -0,0 +1,151 @@
+package fs2
+
+import (
+	"fmt"
+	"os"
+	"path/filepath"
+	"strings"
+
+	"github.com/opencontainers/cgroups"
+)
+
+func supportedControllers() (string, error) {
+	return cgroups.ReadFile(UnifiedMountpoint, "/cgroup.controllers")
+}
+
+// needAnyControllers returns whether we enable some supported controllers or not,
+// based on (1) controllers available and (2) resources that are being set.
+// We don't check "pseudo" controllers such as
+// "freezer" and "devices".
+func needAnyControllers(r *cgroups.Resources) (bool, error) {
+	if r == nil {
+		return false, nil
+	}
+
+	// list of all available controllers
+	content, err := supportedControllers()
+	if err != nil {
+		return false, err
+	}
+	avail := make(map[string]struct{})
+	for _, ctr := range strings.Fields(content) {
+		avail[ctr] = struct{}{}
+	}
+
+	// check whether the controller is available or not
+	have := func(controller string) bool {
+		_, ok := avail[controller]
+		return ok
+	}
+
+	if isPidsSet(r) && have("pids") {
+		return true, nil
+	}
+	if isMemorySet(r) && have("memory") {
+		return true, nil
+	}
+	if isIoSet(r) && have("io") {
+		return true, nil
+	}
+	if isCPUSet(r) && have("cpu") {
+		return true, nil
+	}
+	if isCpusetSet(r) && have("cpuset") {
+		return true, nil
+	}
+	if isHugeTlbSet(r) && have("hugetlb") {
+		return true, nil
+	}
+
+	return false, nil
+}
+
+// containsDomainController returns whether the current config contains domain controller or not.
+// Refer to: http://man7.org/linux/man-pages/man7/cgroups.7.html +// As at Linux 4.19, the following controllers are threaded: cpu, perf_event, and pids. +func containsDomainController(r *cgroups.Resources) bool { + return isMemorySet(r) || isIoSet(r) || isCPUSet(r) || isHugeTlbSet(r) +} + +// CreateCgroupPath creates cgroupv2 path, enabling all the supported controllers. +func CreateCgroupPath(path string, c *cgroups.Cgroup) (Err error) { + if !strings.HasPrefix(path, UnifiedMountpoint) { + return fmt.Errorf("invalid cgroup path %s", path) + } + + content, err := supportedControllers() + if err != nil { + return err + } + + const ( + cgTypeFile = "cgroup.type" + cgStCtlFile = "cgroup.subtree_control" + ) + ctrs := strings.Fields(content) + res := "+" + strings.Join(ctrs, " +") + + elements := strings.Split(path, "/") + elements = elements[3:] + current := "/sys/fs" + for i, e := range elements { + current = filepath.Join(current, e) + if i > 0 { + if err := os.Mkdir(current, 0o755); err != nil { + if !os.IsExist(err) { + return err + } + } else { + // If the directory was created, be sure it is not left around on errors. + current := current + defer func() { + if Err != nil { + os.Remove(current) + } + }() + } + cgType, _ := cgroups.ReadFile(current, cgTypeFile) + cgType = strings.TrimSpace(cgType) + switch cgType { + // If the cgroup is in an invalid mode (usually this means there's an internal + // process in the cgroup tree, because we created a cgroup under an + // already-populated-by-other-processes cgroup), then we have to error out if + // the user requested controllers which are not thread-aware. However, if all + // the controllers requested are thread-aware we can simply put the cgroup into + // threaded mode. 
+ case "domain invalid": + if containsDomainController(c.Resources) { + return fmt.Errorf("cannot enter cgroupv2 %q with domain controllers -- it is in an invalid state", current) + } else { + // Not entirely correct (in theory we'd always want to be a domain -- + // since that means we're a properly delegated cgroup subtree) but in + // this case there's not much we can do and it's better than giving an + // error. + _ = cgroups.WriteFile(current, cgTypeFile, "threaded") + } + // If the cgroup is in (threaded) or (domain threaded) mode, we can only use thread-aware controllers + // (and you cannot usually take a cgroup out of threaded mode). + case "domain threaded": + fallthrough + case "threaded": + if containsDomainController(c.Resources) { + return fmt.Errorf("cannot enter cgroupv2 %q with domain controllers -- it is in %s mode", current, cgType) + } + } + } + // enable all supported controllers + if i < len(elements)-1 { + if err := cgroups.WriteFile(current, cgStCtlFile, res); err != nil { + // try write one by one + allCtrs := strings.Split(res, " ") + for _, ctr := range allCtrs { + _ = cgroups.WriteFile(current, cgStCtlFile, ctr) + } + } + // Some controllers might not be enabled when rootless or containerized, + // but we don't catch the error here. (Caught in setXXX() functions.) + } + } + + return nil +} diff --git a/fs2/defaultpath.go b/fs2/defaultpath.go new file mode 100644 index 0000000..0bc479d --- /dev/null +++ b/fs2/defaultpath.go @@ -0,0 +1,80 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+   See the License for the specific language governing permissions and
+   limitations under the License.
+*/
+
+package fs2
+
+import (
+	"bufio"
+	"errors"
+	"io"
+	"os"
+	"path/filepath"
+	"strings"
+
+	"github.com/opencontainers/cgroups"
+	"github.com/opencontainers/cgroups/internal/path"
+)
+
+const UnifiedMountpoint = "/sys/fs/cgroup"
+
+func defaultDirPath(c *cgroups.Cgroup) (string, error) {
+	innerPath, err := path.Inner(c)
+	if err != nil {
+		return "", err
+	}
+
+	if filepath.IsAbs(innerPath) {
+		return filepath.Join(UnifiedMountpoint, innerPath), nil
+	}
+
+	// we don't need to use /proc/thread-self here because runc always runs
+	// with every thread in the same cgroup. This lets us avoid having to do
+	// runtime.LockOSThread.
+	ownCgroup, err := parseCgroupFile("/proc/self/cgroup")
+	if err != nil {
+		return "", err
+	}
+	// The current user scope most probably has tasks in it already,
+	// making it impossible to enable controllers for its sub-cgroup.
+	// A parent cgroup (with no tasks in it) is what we need.
+	ownCgroup = filepath.Dir(ownCgroup)
+
+	return filepath.Join(UnifiedMountpoint, ownCgroup, innerPath), nil
+}
+
+// parseCgroupFile parses a /proc/PID/cgroup file and returns the cgroup v2 path.
+func parseCgroupFile(path string) (string, error) {
+	f, err := os.Open(path)
+	if err != nil {
+		return "", err
+	}
+	defer f.Close()
+	return parseCgroupFromReader(f)
+}
+
+func parseCgroupFromReader(r io.Reader) (string, error) {
+	s := bufio.NewScanner(r)
+	for s.Scan() {
+		// "0::/user.slice/user-1001.slice/session-1.scope"
+		if path, ok := strings.CutPrefix(s.Text(), "0::"); ok {
+			return path, nil
+		}
+	}
+	if err := s.Err(); err != nil {
+		return "", err
+	}
+	return "", errors.New("cgroup path not found")
+}
diff --git a/fs2/defaultpath_test.go b/fs2/defaultpath_test.go
new file mode 100644
index 0000000..8fdad88
--- /dev/null
+++ b/fs2/defaultpath_test.go
@@ -0,0 +1,93 @@
+/*
+   Copyright The containerd Authors.
+ + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package fs2 + +import ( + "path/filepath" + "strings" + "testing" + + "github.com/opencontainers/cgroups" +) + +func TestParseCgroupFromReader(t *testing.T) { + cases := map[string]string{ + "0::/user.slice/user-1001.slice/session-1.scope\n": "/user.slice/user-1001.slice/session-1.scope", + "2:cpuset:/foo\n1:name=systemd:/\n": "", + "2:cpuset:/foo\n1:name=systemd:/\n0::/user.slice/user-1001.slice/session-1.scope\n": "/user.slice/user-1001.slice/session-1.scope", + } + for s, expected := range cases { + g, err := parseCgroupFromReader(strings.NewReader(s)) + if expected != "" { + if g != expected { + t.Errorf("expected %q, got %q", expected, g) + } + if err != nil { + t.Error(err) + } + } else { + if err == nil { + t.Error("error is expected") + } + } + } +} + +func TestDefaultDirPath(t *testing.T) { + if !cgroups.IsCgroup2UnifiedMode() { + t.Skip("need cgroupv2") + } + // same code as in defaultDirPath() + ownCgroup, err := parseCgroupFile("/proc/self/cgroup") + if err != nil { + // Not a test failure, but rather some weird + // environment so we can't run this test. 
+ t.Skipf("can't get own cgroup: %v", err) + } + ownCgroup = filepath.Dir(ownCgroup) + + cases := []struct { + cgPath string + cgParent string + cgName string + expected string + }{ + { + cgPath: "/foo/bar", + expected: "/sys/fs/cgroup/foo/bar", + }, + { + cgPath: "foo/bar", + expected: filepath.Join(UnifiedMountpoint, ownCgroup, "foo/bar"), + }, + } + for _, c := range cases { + cg := &cgroups.Cgroup{ + Path: c.cgPath, + Parent: c.cgParent, + Name: c.cgName, + } + + got, err := defaultDirPath(cg) + if err != nil { + t.Fatal(err) + } + if got != c.expected { + t.Fatalf("expected %q, got %q", c.expected, got) + } + } +} diff --git a/fs2/freezer.go b/fs2/freezer.go new file mode 100644 index 0000000..f0192f0 --- /dev/null +++ b/fs2/freezer.go @@ -0,0 +1,124 @@ +package fs2 + +import ( + "bufio" + "errors" + "fmt" + "os" + "strings" + "time" + + "golang.org/x/sys/unix" + + "github.com/opencontainers/cgroups" +) + +func setFreezer(dirPath string, state cgroups.FreezerState) error { + var stateStr string + switch state { + case cgroups.Undefined: + return nil + case cgroups.Frozen: + stateStr = "1" + case cgroups.Thawed: + stateStr = "0" + default: + return fmt.Errorf("invalid freezer state %q requested", state) + } + + fd, err := cgroups.OpenFile(dirPath, "cgroup.freeze", unix.O_RDWR) + if err != nil { + // We can ignore this request as long as the user didn't ask us to + // freeze the container (since without the freezer cgroup, that's a + // no-op). + if state != cgroups.Frozen { + return nil + } + return fmt.Errorf("freezer not supported: %w", err) + } + defer fd.Close() + + if _, err := fd.WriteString(stateStr); err != nil { + return err + } + // Confirm that the cgroup did actually change states. 
+ if actualState, err := readFreezer(dirPath, fd); err != nil { + return err + } else if actualState != state { + return fmt.Errorf(`expected "cgroup.freeze" to be in state %q but was in %q`, state, actualState) + } + return nil +} + +func getFreezer(dirPath string) (cgroups.FreezerState, error) { + fd, err := cgroups.OpenFile(dirPath, "cgroup.freeze", unix.O_RDONLY) + if err != nil { + // If the kernel is too old, then we just treat the freezer as being in + // an "undefined" state. + if os.IsNotExist(err) || errors.Is(err, unix.ENODEV) { + err = nil + } + return cgroups.Undefined, err + } + defer fd.Close() + + return readFreezer(dirPath, fd) +} + +func readFreezer(dirPath string, fd *os.File) (cgroups.FreezerState, error) { + if _, err := fd.Seek(0, 0); err != nil { + return cgroups.Undefined, err + } + state := make([]byte, 2) + if _, err := fd.Read(state); err != nil { + return cgroups.Undefined, err + } + switch string(state) { + case "0\n": + return cgroups.Thawed, nil + case "1\n": + return waitFrozen(dirPath) + default: + return cgroups.Undefined, fmt.Errorf(`unknown "cgroup.freeze" state: %q`, state) + } +} + +// waitFrozen polls cgroup.events until it sees "frozen 1" in it. +func waitFrozen(dirPath string) (cgroups.FreezerState, error) { + fd, err := cgroups.OpenFile(dirPath, "cgroup.events", unix.O_RDONLY) + if err != nil { + return cgroups.Undefined, err + } + defer fd.Close() + + // XXX: Simple wait/read/retry is used here. An implementation + // based on poll(2) or inotify(7) is possible, but it makes the code + // much more complicated. Maybe address this later. + const ( + // Perform maxIter with waitTime in between iterations. 
+ waitTime = 10 * time.Millisecond + maxIter = 1000 + ) + scanner := bufio.NewScanner(fd) + for i := 0; scanner.Scan(); { + if i == maxIter { + return cgroups.Undefined, fmt.Errorf("timeout of %s reached waiting for the cgroup to freeze", waitTime*maxIter) + } + if val, ok := strings.CutPrefix(scanner.Text(), "frozen "); ok { + if val[0] == '1' { + return cgroups.Frozen, nil + } + + i++ + // wait, then re-read + time.Sleep(waitTime) + _, err := fd.Seek(0, 0) + if err != nil { + return cgroups.Undefined, err + } + } + } + // Should only reach here either on read error, + // or if the file does not contain "frozen " line. + return cgroups.Undefined, scanner.Err() +} diff --git a/fs2/fs2.go b/fs2/fs2.go new file mode 100644 index 0000000..c5d5a1f --- /dev/null +++ b/fs2/fs2.go @@ -0,0 +1,316 @@ +package fs2 + +import ( + "errors" + "fmt" + "os" + "strings" + + "github.com/opencontainers/cgroups" + "github.com/opencontainers/cgroups/fscommon" +) + +type parseError = fscommon.ParseError + +type Manager struct { + config *cgroups.Cgroup + // dirPath is like "/sys/fs/cgroup/user.slice/user-1001.slice/session-1.scope" + dirPath string + // controllers is content of "cgroup.controllers" file. + // excludes pseudo-controllers ("devices" and "freezer"). + controllers map[string]struct{} +} + +// NewManager creates a manager for cgroup v2 unified hierarchy. +// dirPath is like "/sys/fs/cgroup/user.slice/user-1001.slice/session-1.scope". +// If dirPath is empty, it is automatically set using config. 
+func NewManager(config *cgroups.Cgroup, dirPath string) (*Manager, error) { + if dirPath == "" { + var err error + dirPath, err = defaultDirPath(config) + if err != nil { + return nil, err + } + } + + m := &Manager{ + config: config, + dirPath: dirPath, + } + return m, nil +} + +func (m *Manager) getControllers() error { + if m.controllers != nil { + return nil + } + + data, err := cgroups.ReadFile(m.dirPath, "cgroup.controllers") + if err != nil { + if m.config.Rootless && m.config.Path == "" { + return nil + } + return err + } + fields := strings.Fields(data) + m.controllers = make(map[string]struct{}, len(fields)) + for _, c := range fields { + m.controllers[c] = struct{}{} + } + + return nil +} + +func (m *Manager) Apply(pid int) error { + if err := CreateCgroupPath(m.dirPath, m.config); err != nil { + // Related tests: + // - "runc create (no limits + no cgrouppath + no permission) succeeds" + // - "runc create (rootless + no limits + cgrouppath + no permission) fails with permission error" + // - "runc create (rootless + limits + no cgrouppath + no permission) fails with informative error" + if m.config.Rootless { + if m.config.Path == "" { + if blNeed, nErr := needAnyControllers(m.config.Resources); nErr == nil && !blNeed { + return cgroups.ErrRootless + } + return fmt.Errorf("rootless needs no limits + no cgrouppath when no permission is granted for cgroups: %w", err) + } + } + return err + } + if err := cgroups.WriteCgroupProc(m.dirPath, pid); err != nil { + return err + } + return nil +} + +func (m *Manager) GetPids() ([]int, error) { + return cgroups.GetPids(m.dirPath) +} + +func (m *Manager) GetAllPids() ([]int, error) { + return cgroups.GetAllPids(m.dirPath) +} + +func (m *Manager) GetStats() (*cgroups.Stats, error) { + var errs []error + + st := cgroups.NewStats() + + // pids (since kernel 4.5) + if err := statPids(m.dirPath, st); err != nil { + errs = append(errs, err) + } + // memory (since kernel 4.5) + if err := statMemory(m.dirPath, st); err != 
nil && !os.IsNotExist(err) { + errs = append(errs, err) + } + // io (since kernel 4.5) + if err := statIo(m.dirPath, st); err != nil && !os.IsNotExist(err) { + errs = append(errs, err) + } + // cpu (since kernel 4.15) + // Note cpu.stat is available even if the controller is not enabled. + if err := statCpu(m.dirPath, st); err != nil && !os.IsNotExist(err) { + errs = append(errs, err) + } + // PSI (since kernel 4.20). + var err error + if st.CpuStats.PSI, err = statPSI(m.dirPath, "cpu.pressure"); err != nil { + errs = append(errs, err) + } + if st.MemoryStats.PSI, err = statPSI(m.dirPath, "memory.pressure"); err != nil { + errs = append(errs, err) + } + if st.BlkioStats.PSI, err = statPSI(m.dirPath, "io.pressure"); err != nil { + errs = append(errs, err) + } + // hugetlb (since kernel 5.6) + if err := statHugeTlb(m.dirPath, st); err != nil && !os.IsNotExist(err) { + errs = append(errs, err) + } + // rdma (since kernel 4.11) + if err := fscommon.RdmaGetStats(m.dirPath, st); err != nil && !os.IsNotExist(err) { + errs = append(errs, err) + } + // misc (since kernel 5.13) + if err := statMisc(m.dirPath, st); err != nil && !os.IsNotExist(err) { + errs = append(errs, err) + } + if len(errs) > 0 && !m.config.Rootless { + return st, fmt.Errorf("error while statting cgroup v2: %+v", errs) + } + return st, nil +} + +func (m *Manager) Freeze(state cgroups.FreezerState) error { + if m.config.Resources == nil { + return errors.New("cannot toggle freezer: cgroups not configured for container") + } + if err := setFreezer(m.dirPath, state); err != nil { + return err + } + m.config.Resources.Freezer = state + return nil +} + +func (m *Manager) Destroy() error { + return cgroups.RemovePath(m.dirPath) +} + +func (m *Manager) Path(_ string) string { + return m.dirPath +} + +func (m *Manager) Set(r *cgroups.Resources) error { + if r == nil { + return nil + } + if err := m.getControllers(); err != nil { + return err + } + // pids (since kernel 4.5) + if err := setPids(m.dirPath, r); err 
!= nil { + return err + } + // memory (since kernel 4.5) + if err := setMemory(m.dirPath, r); err != nil { + return err + } + // io (since kernel 4.5) + if err := setIo(m.dirPath, r); err != nil { + return err + } + // cpu (since kernel 4.15) + if err := setCPU(m.dirPath, r); err != nil { + return err + } + // devices (since kernel 4.15, pseudo-controller) + // + // When rootless is true, errors from the device subsystem are ignored because it is really not expected to work. + // However, errors from other subsystems are not ignored. + // see @test "runc create (rootless + limits + no cgrouppath + no permission) fails with informative error" + if err := setDevices(m.dirPath, r); err != nil { + if !m.config.Rootless || errors.Is(err, cgroups.ErrDevicesUnsupported) { + return err + } + } + // cpuset (since kernel 5.0) + if err := setCpuset(m.dirPath, r); err != nil { + return err + } + // hugetlb (since kernel 5.6) + if err := setHugeTlb(m.dirPath, r); err != nil { + return err + } + // rdma (since kernel 4.11) + if err := fscommon.RdmaSet(m.dirPath, r); err != nil { + return err + } + // freezer (since kernel 5.2, pseudo-controller) + if err := setFreezer(m.dirPath, r.Freezer); err != nil { + return err + } + if err := m.setUnified(r.Unified); err != nil { + return err + } + m.config.Resources = r + return nil +} + +func setDevices(dirPath string, r *cgroups.Resources) error { + if cgroups.DevicesSetV2 == nil { + if len(r.Devices) > 0 { + return cgroups.ErrDevicesUnsupported + } + return nil + } + return cgroups.DevicesSetV2(dirPath, r) +} + +func (m *Manager) setUnified(res map[string]string) error { + for k, v := range res { + if strings.Contains(k, "/") { + return fmt.Errorf("unified resource %q must be a file name (no slashes)", k) + } + if err := cgroups.WriteFileByLine(m.dirPath, k, v); err != nil { + // Check for both EPERM and ENOENT since O_CREAT is used by WriteFile. 
+ if errors.Is(err, os.ErrPermission) || errors.Is(err, os.ErrNotExist) { + // Check if a controller is available, + // to give more specific error if not. + c, _, ok := strings.Cut(k, ".") + if !ok { + return fmt.Errorf("unified resource %q must be in the form CONTROLLER.PARAMETER", k) + } + if _, ok := m.controllers[c]; !ok && c != "cgroup" { + return fmt.Errorf("unified resource %q can't be set: controller %q not available", k, c) + } + } + return fmt.Errorf("unable to set unified resource %q: %w", k, err) + } + } + + return nil +} + +func (m *Manager) GetPaths() map[string]string { + paths := make(map[string]string, 1) + paths[""] = m.dirPath + return paths +} + +func (m *Manager) GetCgroups() (*cgroups.Cgroup, error) { + return m.config, nil +} + +func (m *Manager) GetFreezerState() (cgroups.FreezerState, error) { + return getFreezer(m.dirPath) +} + +func (m *Manager) Exists() bool { + return cgroups.PathExists(m.dirPath) +} + +func OOMKillCount(path string) (uint64, error) { + return fscommon.GetValueByKey(path, "memory.events", "oom_kill") +} + +func (m *Manager) OOMKillCount() (uint64, error) { + c, err := OOMKillCount(m.dirPath) + if err != nil && m.config.Rootless && os.IsNotExist(err) { + err = nil + } + + return c, err +} + +func CheckMemoryUsage(dirPath string, r *cgroups.Resources) error { + if !r.MemoryCheckBeforeUpdate { + return nil + } + + if r.Memory <= 0 && r.MemorySwap <= 0 { + return nil + } + + usage, err := fscommon.GetCgroupParamUint(dirPath, "memory.current") + if err != nil { + // This check is on best-effort basis, so if we can't read the + // current usage (cgroup not yet created, or any other error), + // we should not fail. 
+ return nil + } + + if r.MemorySwap > 0 { + if uint64(r.MemorySwap) <= usage { + return fmt.Errorf("rejecting memory+swap limit %d <= usage %d", r.MemorySwap, usage) + } + } + + if r.Memory > 0 { + if uint64(r.Memory) <= usage { + return fmt.Errorf("rejecting memory limit %d <= usage %d", r.Memory, usage) + } + } + + return nil +} diff --git a/fs2/hugetlb.go b/fs2/hugetlb.go new file mode 100644 index 0000000..8e1ac87 --- /dev/null +++ b/fs2/hugetlb.go @@ -0,0 +1,69 @@ +package fs2 + +import ( + "errors" + "os" + "strconv" + + "github.com/opencontainers/cgroups" + "github.com/opencontainers/cgroups/fscommon" +) + +func isHugeTlbSet(r *cgroups.Resources) bool { + return len(r.HugetlbLimit) > 0 +} + +func setHugeTlb(dirPath string, r *cgroups.Resources) error { + if !isHugeTlbSet(r) { + return nil + } + const suffix = ".max" + skipRsvd := false + for _, hugetlb := range r.HugetlbLimit { + prefix := "hugetlb." + hugetlb.Pagesize + val := strconv.FormatUint(hugetlb.Limit, 10) + if err := cgroups.WriteFile(dirPath, prefix+suffix, val); err != nil { + return err + } + if skipRsvd { + continue + } + if err := cgroups.WriteFile(dirPath, prefix+".rsvd"+suffix, val); err != nil { + if errors.Is(err, os.ErrNotExist) { + skipRsvd = true + continue + } + return err + } + } + + return nil +} + +func statHugeTlb(dirPath string, stats *cgroups.Stats) error { + hugetlbStats := cgroups.HugetlbStats{} + rsvd := ".rsvd" + for _, pagesize := range cgroups.HugePageSizes() { + again: + prefix := "hugetlb." 
+ pagesize + rsvd
+ value, err := fscommon.GetCgroupParamUint(dirPath, prefix+".current")
+ if err != nil {
+ if rsvd != "" && errors.Is(err, os.ErrNotExist) {
+ rsvd = ""
+ goto again
+ }
+ return err
+ }
+ hugetlbStats.Usage = value
+
+ value, err = fscommon.GetValueByKey(dirPath, prefix+".events", "max")
+ if err != nil {
+ return err
+ }
+ hugetlbStats.Failcnt = value
+
+ stats.HugetlbStats[pagesize] = hugetlbStats
+ }
+
+ return nil
+}
diff --git a/fs2/io.go b/fs2/io.go
new file mode 100644
index 0000000..0f6ef7f
--- /dev/null
+++ b/fs2/io.go
@@ -0,0 +1,192 @@
+package fs2
+
+import (
+ "bufio"
+ "bytes"
+ "fmt"
+ "os"
+ "strconv"
+ "strings"
+
+ "github.com/sirupsen/logrus"
+
+ "github.com/opencontainers/cgroups"
+)
+
+func isIoSet(r *cgroups.Resources) bool {
+ return r.BlkioWeight != 0 ||
+ len(r.BlkioWeightDevice) > 0 ||
+ len(r.BlkioThrottleReadBpsDevice) > 0 ||
+ len(r.BlkioThrottleWriteBpsDevice) > 0 ||
+ len(r.BlkioThrottleReadIOPSDevice) > 0 ||
+ len(r.BlkioThrottleWriteIOPSDevice) > 0
+}
+
+// bfqDeviceWeightSupported checks for per-device BFQ weight support (added
+// in kernel v5.4, commit 795fe54c2a8) by reading from "io.bfq.weight".
+func bfqDeviceWeightSupported(bfq *os.File) bool {
+ if bfq == nil {
+ return false
+ }
+ _, _ = bfq.Seek(0, 0)
+ buf := make([]byte, 32)
+ _, _ = bfq.Read(buf)
+ // If only a single number (default weight) is read back, we have older kernel.
+ _, err := strconv.ParseInt(string(bytes.TrimSpace(buf)), 10, 64)
+ return err != nil
+}
+
+func setIo(dirPath string, r *cgroups.Resources) error {
+ if !isIoSet(r) {
+ return nil
+ }
+
+ // If BFQ IO scheduler is available, use it.
+ var bfq *os.File
+ if r.BlkioWeight != 0 || len(r.BlkioWeightDevice) > 0 {
+ var err error
+ bfq, err = cgroups.OpenFile(dirPath, "io.bfq.weight", os.O_RDWR)
+ if err == nil {
+ defer bfq.Close()
+ } else if !os.IsNotExist(err) {
+ return err
+ }
+ }
+
+ if r.BlkioWeight != 0 {
+ if bfq != nil { // Use BFQ. 
+ if _, err := bfq.WriteString(strconv.FormatUint(uint64(r.BlkioWeight), 10)); err != nil { + return err + } + } else { + // Fallback to io.weight with a conversion scheme. + v := cgroups.ConvertBlkIOToIOWeightValue(r.BlkioWeight) + if err := cgroups.WriteFile(dirPath, "io.weight", strconv.FormatUint(v, 10)); err != nil { + return err + } + } + } + if bfqDeviceWeightSupported(bfq) { + for _, wd := range r.BlkioWeightDevice { + if _, err := bfq.WriteString(wd.WeightString() + "\n"); err != nil { + return fmt.Errorf("setting device weight %q: %w", wd.WeightString(), err) + } + } + } + for _, td := range r.BlkioThrottleReadBpsDevice { + if err := cgroups.WriteFile(dirPath, "io.max", td.StringName("rbps")); err != nil { + return err + } + } + for _, td := range r.BlkioThrottleWriteBpsDevice { + if err := cgroups.WriteFile(dirPath, "io.max", td.StringName("wbps")); err != nil { + return err + } + } + for _, td := range r.BlkioThrottleReadIOPSDevice { + if err := cgroups.WriteFile(dirPath, "io.max", td.StringName("riops")); err != nil { + return err + } + } + for _, td := range r.BlkioThrottleWriteIOPSDevice { + if err := cgroups.WriteFile(dirPath, "io.max", td.StringName("wiops")); err != nil { + return err + } + } + + return nil +} + +func readCgroup2MapFile(dirPath string, name string) (map[string][]string, error) { + ret := map[string][]string{} + f, err := cgroups.OpenFile(dirPath, name, os.O_RDONLY) + if err != nil { + return nil, err + } + defer f.Close() + scanner := bufio.NewScanner(f) + for scanner.Scan() { + line := scanner.Text() + parts := strings.Fields(line) + if len(parts) < 2 { + continue + } + ret[parts[0]] = parts[1:] + } + if err := scanner.Err(); err != nil { + return nil, &parseError{Path: dirPath, File: name, Err: err} + } + return ret, nil +} + +func statIo(dirPath string, stats *cgroups.Stats) error { + const file = "io.stat" + values, err := readCgroup2MapFile(dirPath, file) + if err != nil { + return err + } + // more details on the io.stat 
file format: https://www.kernel.org/doc/Documentation/cgroup-v2.txt + var parsedStats cgroups.BlkioStats + for k, v := range values { + d := strings.Split(k, ":") + if len(d) != 2 { + continue + } + major, err := strconv.ParseUint(d[0], 10, 64) + if err != nil { + return &parseError{Path: dirPath, File: file, Err: err} + } + minor, err := strconv.ParseUint(d[1], 10, 64) + if err != nil { + return &parseError{Path: dirPath, File: file, Err: err} + } + + for _, item := range v { + d := strings.Split(item, "=") + if len(d) != 2 { + continue + } + op := d[0] + + // Map to the cgroupv1 naming and layout (in separate tables). + var targetTable *[]cgroups.BlkioStatEntry + switch op { + // Equivalent to cgroupv1's blkio.io_service_bytes. + case "rbytes": + op = "Read" + targetTable = &parsedStats.IoServiceBytesRecursive + case "wbytes": + op = "Write" + targetTable = &parsedStats.IoServiceBytesRecursive + // Equivalent to cgroupv1's blkio.io_serviced. + case "rios": + op = "Read" + targetTable = &parsedStats.IoServicedRecursive + case "wios": + op = "Write" + targetTable = &parsedStats.IoServicedRecursive + default: + // Skip over entries we cannot map to cgroupv1 stats for now. + // In the future we should expand the stats struct to include + // them. 
+ logrus.Debugf("cgroupv2 io stats: skipping over unmappable %s entry", item) + continue + } + + value, err := strconv.ParseUint(d[1], 10, 64) + if err != nil { + return &parseError{Path: dirPath, File: file, Err: err} + } + + entry := cgroups.BlkioStatEntry{ + Op: op, + Major: major, + Minor: minor, + Value: value, + } + *targetTable = append(*targetTable, entry) + } + } + stats.BlkioStats = parsedStats + return nil +} diff --git a/fs2/io_test.go b/fs2/io_test.go new file mode 100644 index 0000000..2f3f6c6 --- /dev/null +++ b/fs2/io_test.go @@ -0,0 +1,81 @@ +package fs2 + +import ( + "os" + "path/filepath" + "reflect" + "sort" + "testing" + + "github.com/opencontainers/cgroups" +) + +const exampleIoStatData = `254:1 rbytes=6901432320 wbytes=14245535744 rios=263278 wios=248603 dbytes=0 dios=0 +254:0 rbytes=2702336 wbytes=0 rios=97 wios=0 dbytes=0 dios=0 +259:0 rbytes=6911345664 wbytes=14245536256 rios=264538 wios=244914 dbytes=530485248 dios=2` + +var exampleIoStatsParsed = cgroups.BlkioStats{ + IoServiceBytesRecursive: []cgroups.BlkioStatEntry{ + {Major: 254, Minor: 1, Value: 6901432320, Op: "Read"}, + {Major: 254, Minor: 1, Value: 14245535744, Op: "Write"}, + {Major: 254, Minor: 0, Value: 2702336, Op: "Read"}, + {Major: 254, Minor: 0, Value: 0, Op: "Write"}, + {Major: 259, Minor: 0, Value: 6911345664, Op: "Read"}, + {Major: 259, Minor: 0, Value: 14245536256, Op: "Write"}, + }, + IoServicedRecursive: []cgroups.BlkioStatEntry{ + {Major: 254, Minor: 1, Value: 263278, Op: "Read"}, + {Major: 254, Minor: 1, Value: 248603, Op: "Write"}, + {Major: 254, Minor: 0, Value: 97, Op: "Read"}, + {Major: 254, Minor: 0, Value: 0, Op: "Write"}, + {Major: 259, Minor: 0, Value: 264538, Op: "Read"}, + {Major: 259, Minor: 0, Value: 244914, Op: "Write"}, + }, +} + +func lessBlkioStatEntry(a, b cgroups.BlkioStatEntry) bool { + if a.Major != b.Major { + return a.Major < b.Major + } + if a.Minor != b.Minor { + return a.Minor < b.Minor + } + if a.Op != b.Op { + return a.Op < b.Op + } + 
return a.Value < b.Value
+}
+
+func sortBlkioStats(stats *cgroups.BlkioStats) {
+ for _, table := range []*[]cgroups.BlkioStatEntry{
+ &stats.IoServicedRecursive,
+ &stats.IoServiceBytesRecursive,
+ } {
+ sort.SliceStable(*table, func(i, j int) bool { return lessBlkioStatEntry((*table)[i], (*table)[j]) })
+ }
+}
+
+func TestStatIo(t *testing.T) {
+ // We're using a fake cgroupfs.
+ cgroups.TestMode = true
+
+ fakeCgroupDir := t.TempDir()
+ statPath := filepath.Join(fakeCgroupDir, "io.stat")
+
+ if err := os.WriteFile(statPath, []byte(exampleIoStatData), 0o644); err != nil {
+ t.Fatal(err)
+ }
+
+ var gotStats cgroups.Stats
+ if err := statIo(fakeCgroupDir, &gotStats); err != nil {
+ t.Error(err)
+ }
+
+ // Sort the output since statIo uses a map internally.
+ sortBlkioStats(&gotStats.BlkioStats)
+ sortBlkioStats(&exampleIoStatsParsed)
+
+ if !reflect.DeepEqual(gotStats.BlkioStats, exampleIoStatsParsed) {
+ t.Errorf("parsed cgroupv2 io.stat doesn't match expected result: \ngot %#v\nexpected %#v\n", gotStats.BlkioStats, exampleIoStatsParsed)
+ }
+} diff --git a/fs2/memory.go b/fs2/memory.go
new file mode 100644
index 0000000..d67fd8a
--- /dev/null
+++ b/fs2/memory.go
@@ -0,0 +1,241 @@
+package fs2
+
+import (
+ "bufio"
+ "errors"
+ "math"
+ "os"
+ "strconv"
+ "strings"
+
+ "golang.org/x/sys/unix"
+
+ "github.com/opencontainers/cgroups"
+ "github.com/opencontainers/cgroups/fscommon"
+)
+
+// numToStr converts an int64 value to a string for writing to a
+// cgroupv2 file with .min, .max, .low, or .high suffix.
+// The value of -1 is converted to "max" for cgroupv1 compatibility
+// (which used to write -1 to remove the limit). 
+func numToStr(value int64) (ret string) { + switch { + case value == 0: + ret = "" + case value == -1: + ret = "max" + default: + ret = strconv.FormatInt(value, 10) + } + + return ret +} + +func isMemorySet(r *cgroups.Resources) bool { + return r.MemoryReservation != 0 || r.Memory != 0 || r.MemorySwap != 0 +} + +func setMemory(dirPath string, r *cgroups.Resources) error { + if !isMemorySet(r) { + return nil + } + + if err := CheckMemoryUsage(dirPath, r); err != nil { + return err + } + + swap, err := cgroups.ConvertMemorySwapToCgroupV2Value(r.MemorySwap, r.Memory) + if err != nil { + return err + } + swapStr := numToStr(swap) + if swapStr == "" && swap == 0 && r.MemorySwap > 0 { + // memory and memorySwap set to the same value -- disable swap + swapStr = "0" + } + // never write empty string to `memory.swap.max`, it means set to 0. + if swapStr != "" { + if err := cgroups.WriteFile(dirPath, "memory.swap.max", swapStr); err != nil { + // If swap is not enabled, silently ignore setting to max or disabling it. 
+ if !(errors.Is(err, os.ErrNotExist) && (swapStr == "max" || swapStr == "0")) { + return err + } + } + } + + if val := numToStr(r.Memory); val != "" { + if err := cgroups.WriteFile(dirPath, "memory.max", val); err != nil { + return err + } + } + + // cgroup.Resources.KernelMemory is ignored + + if val := numToStr(r.MemoryReservation); val != "" { + if err := cgroups.WriteFile(dirPath, "memory.low", val); err != nil { + return err + } + } + + return nil +} + +func statMemory(dirPath string, stats *cgroups.Stats) error { + const file = "memory.stat" + statsFile, err := cgroups.OpenFile(dirPath, file, os.O_RDONLY) + if err != nil { + return err + } + defer statsFile.Close() + + sc := bufio.NewScanner(statsFile) + for sc.Scan() { + t, v, err := fscommon.ParseKeyValue(sc.Text()) + if err != nil { + return &parseError{Path: dirPath, File: file, Err: err} + } + stats.MemoryStats.Stats[t] = v + } + if err := sc.Err(); err != nil { + return &parseError{Path: dirPath, File: file, Err: err} + } + stats.MemoryStats.Cache = stats.MemoryStats.Stats["file"] + // Unlike cgroup v1 which has memory.use_hierarchy binary knob, + // cgroup v2 is always hierarchical. + stats.MemoryStats.UseHierarchy = true + + memoryUsage, err := getMemoryDataV2(dirPath, "") + if err != nil { + if errors.Is(err, unix.ENOENT) && dirPath == UnifiedMountpoint { + // The root cgroup does not have memory.{current,max,peak} + // so emulate those using data from /proc/meminfo and + // /sys/fs/cgroup/memory.stat + return rootStatsFromMeminfo(stats) + } + return err + } + stats.MemoryStats.Usage = memoryUsage + swapOnlyUsage, err := getMemoryDataV2(dirPath, "swap") + if err != nil { + return err + } + stats.MemoryStats.SwapOnlyUsage = swapOnlyUsage + swapUsage := swapOnlyUsage + // As cgroup v1 reports SwapUsage values as mem+swap combined, + // while in cgroup v2 swap values do not include memory, + // report combined mem+swap for v1 compatibility. 
+ swapUsage.Usage += memoryUsage.Usage
+ if swapUsage.Limit != math.MaxUint64 {
+ swapUsage.Limit += memoryUsage.Limit
+ }
+ // The `MaxUsage` of mem+swap cannot simply combine mem with
+ // swap. So set it to 0 for v1 compatibility.
+ swapUsage.MaxUsage = 0
+ stats.MemoryStats.SwapUsage = swapUsage
+
+ return nil
+}
+
+func getMemoryDataV2(path, name string) (cgroups.MemoryData, error) {
+ memoryData := cgroups.MemoryData{}
+
+ moduleName := "memory"
+ if name != "" {
+ moduleName = "memory." + name
+ }
+ usage := moduleName + ".current"
+ limit := moduleName + ".max"
+ maxUsage := moduleName + ".peak"
+
+ value, err := fscommon.GetCgroupParamUint(path, usage)
+ if err != nil {
+ if name != "" && os.IsNotExist(err) {
+ // Ignore ENOENT as there's no swap accounting
+ // if kernel CONFIG_MEMCG_SWAP is not set or
+ // swapaccount=0 kernel boot parameter is given.
+ return cgroups.MemoryData{}, nil
+ }
+ return cgroups.MemoryData{}, err
+ }
+ memoryData.Usage = value
+
+ value, err = fscommon.GetCgroupParamUint(path, limit)
+ if err != nil {
+ return cgroups.MemoryData{}, err
+ }
+ memoryData.Limit = value
+
+ // `memory.peak` since kernel 5.19
+ // `memory.swap.peak` since kernel 6.5
+ value, err = fscommon.GetCgroupParamUint(path, maxUsage)
+ if err != nil && !os.IsNotExist(err) {
+ return cgroups.MemoryData{}, err
+ }
+ memoryData.MaxUsage = value
+
+ return memoryData, nil
+}
+
+func rootStatsFromMeminfo(stats *cgroups.Stats) error {
+ const file = "/proc/meminfo"
+ f, err := os.Open(file)
+ if err != nil {
+ return err
+ }
+ defer f.Close()
+
+ // Fields we are interested in.
+ var (
+ swap_free uint64
+ swap_total uint64
+ )
+ mem := map[string]*uint64{
+ "SwapFree": &swap_free,
+ "SwapTotal": &swap_total,
+ }
+
+ found := 0
+ sc := bufio.NewScanner(f)
+ for sc.Scan() {
+ parts := strings.SplitN(sc.Text(), ":", 3)
+ if len(parts) != 2 {
+ // Should not happen.
+ continue
+ }
+ k := parts[0]
+ p, ok := mem[k]
+ if !ok {
+ // Unknown field -- not interested. 
+ continue + } + vStr := strings.TrimSpace(strings.TrimSuffix(parts[1], " kB")) + *p, err = strconv.ParseUint(vStr, 10, 64) + if err != nil { + return &parseError{File: file, Err: errors.New("bad value for " + k)} + } + + found++ + if found == len(mem) { + // Got everything we need -- skip the rest. + break + } + } + if err := sc.Err(); err != nil { + return &parseError{Path: "", File: file, Err: err} + } + + // cgroup v1 `usage_in_bytes` reports memory usage as the sum of + // - rss (NR_ANON_MAPPED) + // - cache (NR_FILE_PAGES) + // cgroup v1 reports SwapUsage values as mem+swap combined + // cgroup v2 reports rss and cache as anon and file. + // sum `anon` + `file` to report the same value as `usage_in_bytes` in v1. + // sum swap usage as combined mem+swap usage for consistency as well. + stats.MemoryStats.Usage.Usage = stats.MemoryStats.Stats["anon"] + stats.MemoryStats.Stats["file"] + stats.MemoryStats.Usage.Limit = math.MaxUint64 + stats.MemoryStats.SwapUsage.Usage = (swap_total - swap_free) * 1024 + stats.MemoryStats.SwapUsage.Limit = math.MaxUint64 + stats.MemoryStats.SwapUsage.Usage += stats.MemoryStats.Usage.Usage + + return nil +} diff --git a/fs2/memory_test.go b/fs2/memory_test.go new file mode 100644 index 0000000..e46dbe6 --- /dev/null +++ b/fs2/memory_test.go @@ -0,0 +1,155 @@ +package fs2 + +import ( + "os" + "path/filepath" + "strings" + "testing" + + "github.com/opencontainers/cgroups" +) + +const exampleMemoryStatData = `anon 790425600 +file 6502666240 +kernel_stack 7012352 +pagetables 8867840 +percpu 2445520 +sock 40960 +shmem 6721536 +file_mapped 656187392 +file_dirty 1122304 +file_writeback 0 +swapcached 10 +anon_thp 438304768 +file_thp 0 +shmem_thp 0 +inactive_anon 892223488 +active_anon 2973696 +inactive_file 5307346944 +active_file 1179316224 +unevictable 31477760 +slab_reclaimable 348866240 +slab_unreclaimable 10099808 +slab 358966048 +workingset_refault_anon 0 +workingset_refault_file 0 +workingset_activate_anon 0 
+workingset_activate_file 0 +workingset_restore_anon 0 +workingset_restore_file 0 +workingset_nodereclaim 0 +pgfault 103216687 +pgmajfault 6879 +pgrefill 0 +pgscan 0 +pgsteal 0 +pgactivate 1110217 +pgdeactivate 292 +pglazyfree 267 +pglazyfreed 0 +thp_fault_alloc 57411 +thp_collapse_alloc 443` + +func TestStatMemoryPodCgroupNotFound(t *testing.T) { + // We're using a fake cgroupfs. + cgroups.TestMode = true + fakeCgroupDir := t.TempDir() + + // only write memory.stat to ensure pod cgroup usage + // still reads memory.current. + statPath := filepath.Join(fakeCgroupDir, "memory.stat") + if err := os.WriteFile(statPath, []byte(exampleMemoryStatData), 0o644); err != nil { + t.Fatal(err) + } + + gotStats := cgroups.NewStats() + + // use a fake root path to mismatch the file we wrote. + // this triggers the non-root path which should fail to find memory.current. + err := statMemory(fakeCgroupDir, gotStats) + if err == nil { + t.Errorf("expected error when statting memory for cgroupv2 root, but was nil") + } + + if !strings.Contains(err.Error(), "memory.current: no such file or directory") { + t.Errorf("expected error to contain 'memory.current: no such file or directory', but was %s", err.Error()) + } +} + +func TestStatMemoryPodCgroup(t *testing.T) { + // We're using a fake cgroupfs. 
+ cgroups.TestMode = true + fakeCgroupDir := t.TempDir() + + statPath := filepath.Join(fakeCgroupDir, "memory.stat") + if err := os.WriteFile(statPath, []byte(exampleMemoryStatData), 0o644); err != nil { + t.Fatal(err) + } + + if err := os.WriteFile(filepath.Join(fakeCgroupDir, "memory.current"), []byte("123456789"), 0o644); err != nil { + t.Fatal(err) + } + + if err := os.WriteFile(filepath.Join(fakeCgroupDir, "memory.max"), []byte("999999999"), 0o644); err != nil { + t.Fatal(err) + } + + if err := os.WriteFile(filepath.Join(fakeCgroupDir, "memory.peak"), []byte("987654321"), 0o644); err != nil { + t.Fatal(err) + } + + gotStats := cgroups.NewStats() + + // use a fake root path to trigger the pod cgroup lookup. + err := statMemory(fakeCgroupDir, gotStats) + if err != nil { + t.Errorf("expected no error when statting memory for cgroupv2 root, but got %#+v", err) + } + + // result should be "memory.current" + var expectedUsageBytes uint64 = 123456789 + if gotStats.MemoryStats.Usage.Usage != expectedUsageBytes { + t.Errorf("parsed cgroupv2 memory.stat doesn't match expected result: \ngot %#v\nexpected %#v\n", gotStats.MemoryStats.Usage.Usage, expectedUsageBytes) + } + + // result should be "memory.max" + var expectedLimitBytes uint64 = 999999999 + if gotStats.MemoryStats.Usage.Limit != expectedLimitBytes { + t.Errorf("parsed cgroupv2 memory.stat doesn't match expected result: \ngot %#v\nexpected %#v\n", gotStats.MemoryStats.Usage.Limit, expectedLimitBytes) + } + + // result should be "memory.peak" + var expectedMaxUsageBytes uint64 = 987654321 + if gotStats.MemoryStats.Usage.MaxUsage != expectedMaxUsageBytes { + t.Errorf("parsed cgroupv2 memory.stat doesn't match expected result: \ngot %#v\nexpected %#v\n", gotStats.MemoryStats.Usage.MaxUsage, expectedMaxUsageBytes) + } +} + +func TestRootStatsFromMeminfo(t *testing.T) { + stats := &cgroups.Stats{ + MemoryStats: cgroups.MemoryStats{ + Stats: map[string]uint64{ + "anon": 790425600, + "file": 6502666240, + }, + }, + } + 
+ if err := rootStatsFromMeminfo(stats); err != nil { + t.Fatal(err) + } + + // result is anon + file + var expectedUsageBytes uint64 = 7293091840 + if stats.MemoryStats.Usage.Usage != expectedUsageBytes { + t.Errorf("parsed cgroupv2 memory.stat doesn't match expected result: \ngot %d\nexpected %d\n", stats.MemoryStats.Usage.Usage, expectedUsageBytes) + } + + // swap is adjusted to mem+swap + if stats.MemoryStats.SwapUsage.Usage < stats.MemoryStats.Usage.Usage { + t.Errorf("swap usage %d should be at least mem usage %d", stats.MemoryStats.SwapUsage.Usage, stats.MemoryStats.Usage.Usage) + } + if stats.MemoryStats.SwapUsage.Limit < stats.MemoryStats.Usage.Limit { + t.Errorf("swap limit %d should be at least mem limit %d", stats.MemoryStats.SwapUsage.Limit, stats.MemoryStats.Usage.Limit) + } +} diff --git a/fs2/misc.go b/fs2/misc.go new file mode 100644 index 0000000..f20136b --- /dev/null +++ b/fs2/misc.go @@ -0,0 +1,52 @@ +package fs2 + +import ( + "bufio" + "os" + "strings" + + "github.com/opencontainers/cgroups" + "github.com/opencontainers/cgroups/fscommon" +) + +func statMisc(dirPath string, stats *cgroups.Stats) error { + for _, file := range []string{"current", "events"} { + fd, err := cgroups.OpenFile(dirPath, "misc."+file, os.O_RDONLY) + if err != nil { + return err + } + + s := bufio.NewScanner(fd) + for s.Scan() { + key, value, err := fscommon.ParseKeyValue(s.Text()) + if err != nil { + fd.Close() + return err + } + + key = strings.TrimSuffix(key, ".max") + + if _, ok := stats.MiscStats[key]; !ok { + stats.MiscStats[key] = cgroups.MiscStats{} + } + + tmp := stats.MiscStats[key] + + switch file { + case "current": + tmp.Usage = value + case "events": + tmp.Events = value + } + + stats.MiscStats[key] = tmp + } + fd.Close() + + if err := s.Err(); err != nil { + return err + } + } + + return nil +} diff --git a/fs2/misc_test.go b/fs2/misc_test.go new file mode 100644 index 0000000..01ccc0a --- /dev/null +++ b/fs2/misc_test.go @@ -0,0 +1,103 @@ +package fs2 + 
+import ( + "os" + "path/filepath" + "strings" + "testing" + + "github.com/opencontainers/cgroups" +) + +const exampleMiscCurrentData = `res_a 123 +res_b 456 +res_c 42` + +const exampleMiscEventsData = `res_a.max 1 +res_b.max 2 +res_c.max 3` + +func TestStatMiscPodCgroupEmpty(t *testing.T) { + // We're using a fake cgroupfs. + cgroups.TestMode = true + fakeCgroupDir := t.TempDir() + + // create empty misc.current and misc.events files to test the common case + // where no misc resource keys are available + for _, file := range []string{"misc.current", "misc.events"} { + if _, err := os.Create(filepath.Join(fakeCgroupDir, file)); err != nil { + t.Fatal(err) + } + } + + gotStats := cgroups.NewStats() + + err := statMisc(fakeCgroupDir, gotStats) + if err != nil { + t.Errorf("expected no error when statting empty misc.current/misc.events for cgroupv2, but got %#v", err) + } + + if len(gotStats.MiscStats) != 0 { + t.Errorf("parsed cgroupv2 misc.* returns unexpected resources: got %#v but expected nothing", gotStats.MiscStats) + } +} + +func TestStatMiscPodCgroupNotFound(t *testing.T) { + // We're using a fake cgroupfs. + cgroups.TestMode = true + fakeCgroupDir := t.TempDir() + + // only write misc.current to ensure pod cgroup usage + // still reads misc.events. + statPath := filepath.Join(fakeCgroupDir, "misc.current") + if err := os.WriteFile(statPath, []byte(exampleMiscCurrentData), 0o644); err != nil { + t.Fatal(err) + } + + gotStats := cgroups.NewStats() + + // use a fake root path to mismatch the file we wrote. + // this triggers the non-root path which should fail to find misc.events. 
+ err := statMisc(fakeCgroupDir, gotStats) + if err == nil { + t.Errorf("expected error when statting misc.current for cgroupv2 root, but was nil") + } + + if !strings.Contains(err.Error(), "misc.events: no such file or directory") { + t.Errorf("expected error to contain 'misc.events: no such file or directory', but was %s", err.Error()) + } +} + +func TestStatMiscPodCgroup(t *testing.T) { + // We're using a fake cgroupfs. + cgroups.TestMode = true + fakeCgroupDir := t.TempDir() + + currentPath := filepath.Join(fakeCgroupDir, "misc.current") + if err := os.WriteFile(currentPath, []byte(exampleMiscCurrentData), 0o644); err != nil { + t.Fatal(err) + } + + eventsPath := filepath.Join(fakeCgroupDir, "misc.events") + if err := os.WriteFile(eventsPath, []byte(exampleMiscEventsData), 0o644); err != nil { + t.Fatal(err) + } + + gotStats := cgroups.NewStats() + + // use a fake root path to trigger the pod cgroup lookup. + err := statMisc(fakeCgroupDir, gotStats) + if err != nil { + t.Errorf("expected no error when statting misc for cgroupv2 root, but got %#+v", err) + } + + // make sure all res_* from exampleMisc*Data are returned + if len(gotStats.MiscStats) != 3 { + t.Errorf("parsed cgroupv2 misc doesn't return all expected resources: \ngot %#v\nexpected %#v\n", len(gotStats.MiscStats), 3) + } + + var expectedUsageBytes uint64 = 42 + if gotStats.MiscStats["res_c"].Usage != expectedUsageBytes { + t.Errorf("parsed cgroupv2 misc.current for res_c doesn't match expected result: \ngot %#v\nexpected %#v\n", gotStats.MiscStats["res_c"].Usage, expectedUsageBytes) + } +} diff --git a/fs2/pids.go b/fs2/pids.go new file mode 100644 index 0000000..9b82b90 --- /dev/null +++ b/fs2/pids.go @@ -0,0 +1,71 @@ +package fs2 + +import ( + "errors" + "math" + "os" + "strings" + + "golang.org/x/sys/unix" + + "github.com/opencontainers/cgroups" + "github.com/opencontainers/cgroups/fscommon" +) + +func isPidsSet(r *cgroups.Resources) bool { + return r.PidsLimit != 0 +} + +func setPids(dirPath 
string, r *cgroups.Resources) error { + if !isPidsSet(r) { + return nil + } + if val := numToStr(r.PidsLimit); val != "" { + if err := cgroups.WriteFile(dirPath, "pids.max", val); err != nil { + return err + } + } + + return nil +} + +func statPidsFromCgroupProcs(dirPath string, stats *cgroups.Stats) error { + // if the controller is not enabled, let's read PIDS from cgroups.procs + // (or threads if cgroup.threads is enabled) + contents, err := cgroups.ReadFile(dirPath, "cgroup.procs") + if errors.Is(err, unix.ENOTSUP) { + contents, err = cgroups.ReadFile(dirPath, "cgroup.threads") + } + if err != nil { + return err + } + pids := strings.Count(contents, "\n") + stats.PidsStats.Current = uint64(pids) + stats.PidsStats.Limit = 0 + return nil +} + +func statPids(dirPath string, stats *cgroups.Stats) error { + current, err := fscommon.GetCgroupParamUint(dirPath, "pids.current") + if err != nil { + if os.IsNotExist(err) { + return statPidsFromCgroupProcs(dirPath, stats) + } + return err + } + + max, err := fscommon.GetCgroupParamUint(dirPath, "pids.max") + if err != nil { + return err + } + // If no limit is set, read from pids.max returns "max", which is + // converted to MaxUint64 by GetCgroupParamUint. Historically, we + // represent "no limit" for pids as 0, thus this conversion. 
+ if max == math.MaxUint64 { + max = 0 + } + + stats.PidsStats.Current = current + stats.PidsStats.Limit = max + return nil +} diff --git a/fs2/psi.go b/fs2/psi.go new file mode 100644 index 0000000..010fe0b --- /dev/null +++ b/fs2/psi.go @@ -0,0 +1,89 @@ +package fs2 + +import ( + "bufio" + "errors" + "fmt" + "os" + "strconv" + "strings" + + "golang.org/x/sys/unix" + + "github.com/opencontainers/cgroups" +) + +func statPSI(dirPath string, file string) (*cgroups.PSIStats, error) { + f, err := cgroups.OpenFile(dirPath, file, os.O_RDONLY) + if err != nil { + if errors.Is(err, os.ErrNotExist) { + // Kernel < 4.20, or CONFIG_PSI is not set, + // or PSI stats are turned off for the cgroup + // ("echo 0 > cgroup.pressure", kernel >= 6.1). + return nil, nil + } + return nil, err + } + defer f.Close() + + var psistats cgroups.PSIStats + sc := bufio.NewScanner(f) + for sc.Scan() { + parts := strings.Fields(sc.Text()) + var pv *cgroups.PSIData + switch parts[0] { + case "some": + pv = &psistats.Some + case "full": + pv = &psistats.Full + } + if pv != nil { + *pv, err = parsePSIData(parts[1:]) + if err != nil { + return nil, &parseError{Path: dirPath, File: file, Err: err} + } + } + } + if err := sc.Err(); err != nil { + if errors.Is(err, unix.ENOTSUP) { + // Some kernels (e.g. CS9) may return ENOTSUP on read + // if psi=1 kernel cmdline parameter is required. 
+ return nil, nil + } + return nil, &parseError{Path: dirPath, File: file, Err: err} + } + return &psistats, nil +} + +func parsePSIData(psi []string) (cgroups.PSIData, error) { + data := cgroups.PSIData{} + for _, f := range psi { + key, val, ok := strings.Cut(f, "=") + if !ok { + return data, fmt.Errorf("invalid psi data: %q", f) + } + var pv *float64 + switch key { + case "avg10": + pv = &data.Avg10 + case "avg60": + pv = &data.Avg60 + case "avg300": + pv = &data.Avg300 + case "total": + v, err := strconv.ParseUint(val, 10, 64) + if err != nil { + return data, fmt.Errorf("invalid %s PSI value: %w", key, err) + } + data.Total = v + } + if pv != nil { + v, err := strconv.ParseFloat(val, 64) + if err != nil { + return data, fmt.Errorf("invalid %s PSI value: %w", key, err) + } + *pv = v + } + } + return data, nil +} diff --git a/fs2/psi_test.go b/fs2/psi_test.go new file mode 100644 index 0000000..7007efe --- /dev/null +++ b/fs2/psi_test.go @@ -0,0 +1,47 @@ +package fs2 + +import ( + "os" + "path/filepath" + "reflect" + "testing" + + "github.com/opencontainers/cgroups" +) + +func TestStatCPUPSI(t *testing.T) { + const examplePSIData = `some avg10=1.71 avg60=2.36 avg300=2.57 total=230548833 +full avg10=1.00 avg60=1.01 avg300=1.00 total=157622356` + + // We're using a fake cgroupfs. 
+ cgroups.TestMode = true + + fakeCgroupDir := t.TempDir() + statPath := filepath.Join(fakeCgroupDir, "cpu.pressure") + + if err := os.WriteFile(statPath, []byte(examplePSIData), 0o644); err != nil { + t.Fatal(err) + } + + st, err := statPSI(fakeCgroupDir, "cpu.pressure") + if err != nil { + t.Fatal(err) + } + + if !reflect.DeepEqual(*st, cgroups.PSIStats{ + Some: cgroups.PSIData{ + Avg10: 1.71, + Avg60: 2.36, + Avg300: 2.57, + Total: 230548833, + }, + Full: cgroups.PSIData{ + Avg10: 1.00, + Avg60: 1.01, + Avg300: 1.00, + Total: 157622356, + }, + }) { + t.Errorf("unexpected PSI result: %+v", st) + } +} diff --git a/fscommon/rdma.go b/fscommon/rdma.go new file mode 100644 index 0000000..86e38fd --- /dev/null +++ b/fscommon/rdma.go @@ -0,0 +1,120 @@ +package fscommon + +import ( + "bufio" + "errors" + "math" + "os" + "strconv" + "strings" + + "golang.org/x/sys/unix" + + "github.com/opencontainers/cgroups" +) + +// parseRdmaKV parses raw string to RdmaEntry. +func parseRdmaKV(raw string, entry *cgroups.RdmaEntry) error { + var value uint32 + + k, v, ok := strings.Cut(raw, "=") + + if !ok { + return errors.New("Unable to parse RDMA entry") + } + + if v == "max" { + value = math.MaxUint32 + } else { + val64, err := strconv.ParseUint(v, 10, 32) + if err != nil { + return err + } + value = uint32(val64) + } + switch k { + case "hca_handle": + entry.HcaHandles = value + case "hca_object": + entry.HcaObjects = value + } + + return nil +} + +// readRdmaEntries reads and converts array of rawstrings to RdmaEntries from file. 
+// example entry: mlx4_0 hca_handle=2 hca_object=2000 +func readRdmaEntries(dir, file string) ([]cgroups.RdmaEntry, error) { + rdmaEntries := make([]cgroups.RdmaEntry, 0) + fd, err := cgroups.OpenFile(dir, file, unix.O_RDONLY) + if err != nil { + return nil, err + } + defer fd.Close() //nolint:errorlint + scanner := bufio.NewScanner(fd) + for scanner.Scan() { + parts := strings.SplitN(scanner.Text(), " ", 4) + if len(parts) == 3 { + entry := new(cgroups.RdmaEntry) + entry.Device = parts[0] + err = parseRdmaKV(parts[1], entry) + if err != nil { + continue + } + err = parseRdmaKV(parts[2], entry) + if err != nil { + continue + } + + rdmaEntries = append(rdmaEntries, *entry) + } + } + return rdmaEntries, scanner.Err() +} + +// RdmaGetStats returns rdma stats such as totalLimit and current entries. +func RdmaGetStats(path string, stats *cgroups.Stats) error { + currentEntries, err := readRdmaEntries(path, "rdma.current") + if err != nil { + if errors.Is(err, os.ErrNotExist) { + err = nil + } + return err + } + maxEntries, err := readRdmaEntries(path, "rdma.max") + if err != nil { + return err + } + // If device got removed between reading two files, ignore returning stats. + if len(currentEntries) != len(maxEntries) { + return nil + } + + stats.RdmaStats = cgroups.RdmaStats{ + RdmaLimit: maxEntries, + RdmaCurrent: currentEntries, + } + + return nil +} + +func createCmdString(device string, limits cgroups.LinuxRdma) string { + cmdString := device + if limits.HcaHandles != nil { + cmdString += " hca_handle=" + strconv.FormatUint(uint64(*limits.HcaHandles), 10) + } + if limits.HcaObjects != nil { + cmdString += " hca_object=" + strconv.FormatUint(uint64(*limits.HcaObjects), 10) + } + return cmdString +} + +// RdmaSet sets RDMA resources. 
+func RdmaSet(path string, r *cgroups.Resources) error { + for device, limits := range r.Rdma { + if err := cgroups.WriteFile(path, "rdma.max", createCmdString(device, limits)); err != nil { + return err + } + } + return nil +} diff --git a/fscommon/rdma_test.go b/fscommon/rdma_test.go new file mode 100644 index 0000000..6af3151 --- /dev/null +++ b/fscommon/rdma_test.go @@ -0,0 +1,57 @@ +package fscommon + +import ( + "os" + "path/filepath" + "testing" + + "github.com/opencontainers/cgroups" +) + +/* Roadmap for future */ +// (Low-priority) TODO: Check if it is possible to virtually mimic an actual RDMA device. +// TODO: Think of more edge-cases to add. + +// TestRdmaSet performs an E2E test of RdmaSet(), parseRdmaKV() using dummy device and a dummy cgroup file-system. +// Note: Following test does not guarantees that your host supports RDMA since this mocks underlying infrastructure. +func TestRdmaSet(t *testing.T) { + testCgroupPath := filepath.Join(t.TempDir(), "rdma") + + // Ensure the full mock cgroup path exists. + err := os.Mkdir(testCgroupPath, 0o755) + if err != nil { + t.Fatal(err) + } + + rdmaDevice := "mlx5_1" + maxHandles := uint32(100) + maxObjects := uint32(300) + + rdmaStubResource := &cgroups.Resources{ + Rdma: map[string]cgroups.LinuxRdma{ + rdmaDevice: { + HcaHandles: &maxHandles, + HcaObjects: &maxObjects, + }, + }, + } + + if err := RdmaSet(testCgroupPath, rdmaStubResource); err != nil { + t.Fatal(err) + } + + // The default rdma.max must be written. 
+ rdmaEntries, err := readRdmaEntries(testCgroupPath, "rdma.max") + if err != nil { + t.Fatal(err) + } + if len(rdmaEntries) != 1 { + t.Fatal("rdma_test: Got the wrong values while parsing entries from rdma.max") + } + if rdmaEntries[0].HcaHandles != maxHandles { + t.Fatalf("rdma_test: Got the wrong value for hca_handles") + } + if rdmaEntries[0].HcaObjects != maxObjects { + t.Fatalf("rdma_test: Got the wrong value for hca_Objects") + } +} diff --git a/fscommon/utils.go b/fscommon/utils.go new file mode 100644 index 0000000..d8f8dfc --- /dev/null +++ b/fscommon/utils.go @@ -0,0 +1,144 @@ +package fscommon + +import ( + "errors" + "fmt" + "math" + "path" + "strconv" + "strings" + + "github.com/opencontainers/cgroups" +) + +var ( + // Deprecated: use cgroups.OpenFile instead. + OpenFile = cgroups.OpenFile + // Deprecated: use cgroups.ReadFile instead. + ReadFile = cgroups.ReadFile + // Deprecated: use cgroups.WriteFile instead. + WriteFile = cgroups.WriteFile +) + +// ParseError records a parse error details, including the file path. +type ParseError struct { + Path string + File string + Err error +} + +func (e *ParseError) Error() string { + return "unable to parse " + path.Join(e.Path, e.File) + ": " + e.Err.Error() +} + +func (e *ParseError) Unwrap() error { return e.Err } + +// ParseUint converts a string to an uint64 integer. +// Negative values are returned at zero as, due to kernel bugs, +// some of the memory cgroup stats can be negative. +func ParseUint(s string, base, bitSize int) (uint64, error) { + value, err := strconv.ParseUint(s, base, bitSize) + if err != nil { + intValue, intErr := strconv.ParseInt(s, base, bitSize) + // 1. Handle negative values greater than MinInt64 (and) + // 2. 
Handle negative values lesser than MinInt64 + if intErr == nil && intValue < 0 { + return 0, nil + } else if errors.Is(intErr, strconv.ErrRange) && intValue < 0 { + return 0, nil + } + + return value, err + } + + return value, nil +} + +// ParseKeyValue parses a space-separated "key value" kind of cgroup +// parameter and returns its key as a string, and its value as uint64 +// (using [ParseUint] to convert the value). For example, +// "io_service_bytes 1234" will be returned as "io_service_bytes", 1234. +func ParseKeyValue(t string) (string, uint64, error) { + key, val, ok := strings.Cut(t, " ") + if !ok || key == "" || val == "" { + return "", 0, fmt.Errorf(`line %q is not in "key value" format`, t) + } + + value, err := ParseUint(val, 10, 64) + if err != nil { + return "", 0, err + } + + return key, value, nil +} + +// GetValueByKey reads space-separated "key value" pairs from the specified +// cgroup file, looking for a specified key, and returns its value as uint64, +// using [ParseUint] for conversion. If the value is not found, 0 is returned. +func GetValueByKey(path, file, key string) (uint64, error) { + content, err := cgroups.ReadFile(path, file) + if err != nil { + return 0, err + } + + key += " " + lines := strings.Split(content, "\n") + for _, line := range lines { + v, ok := strings.CutPrefix(line, key) + if ok { + val, err := ParseUint(v, 10, 64) + if err != nil { + err = &ParseError{Path: path, File: file, Err: err} + } + return val, err + } + } + + return 0, nil +} + +// GetCgroupParamUint reads a single uint64 value from the specified cgroup file. +// If the value read is "max", the math.MaxUint64 is returned. 
+func GetCgroupParamUint(path, file string) (uint64, error) { + contents, err := GetCgroupParamString(path, file) + if err != nil { + return 0, err + } + if contents == "max" { + return math.MaxUint64, nil + } + + res, err := ParseUint(contents, 10, 64) + if err != nil { + return res, &ParseError{Path: path, File: file, Err: err} + } + return res, nil +} + +// GetCgroupParamInt reads a single int64 value from specified cgroup file. +// If the value read is "max", the math.MaxInt64 is returned. +func GetCgroupParamInt(path, file string) (int64, error) { + contents, err := GetCgroupParamString(path, file) + if err != nil { + return 0, err + } + if contents == "max" { + return math.MaxInt64, nil + } + + res, err := strconv.ParseInt(contents, 10, 64) + if err != nil { + return res, &ParseError{Path: path, File: file, Err: err} + } + return res, nil +} + +// GetCgroupParamString reads a string from the specified cgroup file. +func GetCgroupParamString(path, file string) (string, error) { + contents, err := cgroups.ReadFile(path, file) + if err != nil { + return "", err + } + + return strings.TrimSpace(contents), nil +} diff --git a/fscommon/utils_test.go b/fscommon/utils_test.go new file mode 100644 index 0000000..2bc411a --- /dev/null +++ b/fscommon/utils_test.go @@ -0,0 +1,95 @@ +package fscommon + +import ( + "math" + "os" + "path/filepath" + "strconv" + "testing" + + "github.com/opencontainers/cgroups" +) + +const ( + cgroupFile = "cgroup.file" + floatValue = 2048.0 + floatString = "2048" +) + +func init() { + cgroups.TestMode = true +} + +func TestGetCgroupParamsInt(t *testing.T) { + // Setup tempdir. + tempDir := t.TempDir() + tempFile := filepath.Join(tempDir, cgroupFile) + + // Success. 
+ if err := os.WriteFile(tempFile, []byte(floatString), 0o755); err != nil { + t.Fatal(err) + } + value, err := GetCgroupParamUint(tempDir, cgroupFile) + if err != nil { + t.Fatal(err) + } else if value != floatValue { + t.Fatalf("Expected %d to equal %f", value, floatValue) + } + + // Success with new line. + err = os.WriteFile(tempFile, []byte(floatString+"\n"), 0o755) + if err != nil { + t.Fatal(err) + } + value, err = GetCgroupParamUint(tempDir, cgroupFile) + if err != nil { + t.Fatal(err) + } else if value != floatValue { + t.Fatalf("Expected %d to equal %f", value, floatValue) + } + + // Success with negative values + err = os.WriteFile(tempFile, []byte("-12345"), 0o755) + if err != nil { + t.Fatal(err) + } + value, err = GetCgroupParamUint(tempDir, cgroupFile) + if err != nil { + t.Fatal(err) + } else if value != 0 { + t.Fatalf("Expected %d to equal %d", value, 0) + } + + // Success with negative values lesser than min int64 + s := strconv.FormatFloat(math.MinInt64, 'f', -1, 64) + err = os.WriteFile(tempFile, []byte(s), 0o755) + if err != nil { + t.Fatal(err) + } + value, err = GetCgroupParamUint(tempDir, cgroupFile) + if err != nil { + t.Fatal(err) + } else if value != 0 { + t.Fatalf("Expected %d to equal %d", value, 0) + } + + // Not a float. + err = os.WriteFile(tempFile, []byte("not-a-float"), 0o755) + if err != nil { + t.Fatal(err) + } + _, err = GetCgroupParamUint(tempDir, cgroupFile) + if err == nil { + t.Fatal("Expecting error, got none") + } + + // Unknown file. + err = os.Remove(tempFile) + if err != nil { + t.Fatal(err) + } + _, err = GetCgroupParamUint(tempDir, cgroupFile) + if err == nil { + t.Fatal("Expecting error, got none") + } +} diff --git a/getallpids.go b/getallpids.go new file mode 100644 index 0000000..1355a51 --- /dev/null +++ b/getallpids.go @@ -0,0 +1,27 @@ +package cgroups + +import ( + "io/fs" + "path/filepath" +) + +// GetAllPids returns all pids from the cgroup identified by path, and all its +// sub-cgroups. 
+func GetAllPids(path string) ([]int, error) { + var pids []int + err := filepath.WalkDir(path, func(p string, d fs.DirEntry, iErr error) error { + if iErr != nil { + return iErr + } + if !d.IsDir() { + return nil + } + cPids, err := readProcsFile(p) + if err != nil { + return err + } + pids = append(pids, cPids...) + return nil + }) + return pids, err +} diff --git a/getallpids_test.go b/getallpids_test.go new file mode 100644 index 0000000..e6b0632 --- /dev/null +++ b/getallpids_test.go @@ -0,0 +1,17 @@ +package cgroups + +import ( + "testing" +) + +func BenchmarkGetAllPids(b *testing.B) { + total := 0 + for i := 0; i < b.N; i++ { + i, err := GetAllPids("/sys/fs/cgroup") + if err != nil { + b.Fatal(err) + } + total += len(i) + } + b.Logf("iter: %d, total: %d", b.N, total) +} diff --git a/go.mod b/go.mod new file mode 100644 index 0000000..151d458 --- /dev/null +++ b/go.mod @@ -0,0 +1,14 @@ +module github.com/opencontainers/cgroups + +go 1.23.0 + +require ( + github.com/cilium/ebpf v0.17.3 + github.com/coreos/go-systemd/v22 v22.5.0 + github.com/cyphar/filepath-securejoin v0.4.1 + github.com/godbus/dbus/v5 v5.1.0 + github.com/moby/sys/mountinfo v0.7.2 + github.com/moby/sys/userns v0.1.0 + github.com/sirupsen/logrus v1.9.3 + golang.org/x/sys v0.30.0 +) diff --git a/go.sum b/go.sum new file mode 100644 index 0000000..f63cca1 --- /dev/null +++ b/go.sum @@ -0,0 +1,53 @@ +github.com/cilium/ebpf v0.17.3 h1:FnP4r16PWYSE4ux6zN+//jMcW4nMVRvuTLVTvCjyyjg= +github.com/cilium/ebpf v0.17.3/go.mod h1:G5EDHij8yiLzaqn0WjyfJHvRa+3aDlReIaLVRMvOyJk= +github.com/coreos/go-systemd/v22 v22.5.0 h1:RrqgGjYQKalulkV8NGVIfkXQf6YYmOyiJKk8iXXhfZs= +github.com/coreos/go-systemd/v22 v22.5.0/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc= +github.com/cyphar/filepath-securejoin v0.4.1 h1:JyxxyPEaktOD+GAnqIqTf9A8tHyAG22rowi7HkoSU1s= +github.com/cyphar/filepath-securejoin v0.4.1/go.mod h1:Sdj7gXlvMcPZsbhwhQ33GguGLDGQL7h7bg04C/+u9jI= +github.com/davecgh/go-spew v1.1.0/go.mod 
h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/go-quicktest/qt v1.101.0 h1:O1K29Txy5P2OK0dGo59b7b0LR6wKfIhttaAhHUyn7eI= +github.com/go-quicktest/qt v1.101.0/go.mod h1:14Bz/f7NwaXPtdYEgzsx46kqSxVwTbzVZsDC26tQJow= +github.com/godbus/dbus/v5 v5.0.4/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA= +github.com/godbus/dbus/v5 v5.1.0 h1:4KLkAxT3aOY8Li4FRJe/KvhoNFFxo0m6fNuFUO8QJUk= +github.com/godbus/dbus/v5 v5.1.0/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA= +github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= +github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= +github.com/josharian/native v1.1.0 h1:uuaP0hAbW7Y4l0ZRQ6C9zfb7Mg1mbFKry/xzDAfmtLA= +github.com/josharian/native v1.1.0/go.mod h1:7X/raswPFr05uY3HiLlYeyQntB6OO7E/d2Cu7qoaN2w= +github.com/jsimonetti/rtnetlink/v2 v2.0.1 h1:xda7qaHDSVOsADNouv7ukSuicKZO7GgVUCXxpaIEIlM= +github.com/jsimonetti/rtnetlink/v2 v2.0.1/go.mod h1:7MoNYNbb3UaDHtF8udiJo/RH6VsTKP1pqKLUTVCvToE= +github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= +github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= +github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= +github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= +github.com/mdlayher/netlink v1.7.2 h1:/UtM3ofJap7Vl4QWCPDGXY8d3GIY2UGSDbK+QWmY8/g= +github.com/mdlayher/netlink v1.7.2/go.mod h1:xraEF7uJbxLhc5fpHL4cPe221LI2bdttWlU+ZGLfQSw= +github.com/mdlayher/socket v0.4.1 h1:eM9y2/jlbs1M615oshPQOHZzj6R6wMT7bX5NPiQvn2U= +github.com/mdlayher/socket v0.4.1/go.mod h1:cAqeGjoufqdxWkD7DkpyS+wcefOtmu5OQ8KuoJGIReA= +github.com/moby/sys/mountinfo v0.7.2 h1:1shs6aH5s4o5H2zQLn796ADW1wMrIwHsyJ2v9KouLrg= +github.com/moby/sys/mountinfo v0.7.2/go.mod 
h1:1YOa8w8Ih7uW0wALDUgT1dTTSBrZ+HiBLGws92L2RU4= +github.com/moby/sys/userns v0.1.0 h1:tVLXkFOxVu9A64/yh59slHVv9ahO9UIev4JZusOLG/g= +github.com/moby/sys/userns v0.1.0/go.mod h1:IHUYgu/kao6N8YZlp9Cf444ySSvCmDlmzUcYfDHOl28= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/rogpeppe/go-internal v1.11.0 h1:cWPaGQEPrBb5/AsnsZesgZZ9yb1OQ+GOISoDNXVBh4M= +github.com/rogpeppe/go-internal v1.11.0/go.mod h1:ddIwULY96R17DhadqLgMfk9H9tvdUzkipdSkR5nkCZA= +github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ= +github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/stretchr/testify v1.7.1 h1:5TQK59W5E3v0r2duFAb7P95B6hEeOyEnHRa8MjYSMTY= +github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +golang.org/x/net v0.33.0 h1:74SYHlV8BIgHIFC/LrYkOGIwL19eTYXQ5wc6TBuO36I= +golang.org/x/net v0.33.0/go.mod h1:HXLR5J+9DxmrqMwG9qjGCxZ+zKXxBru04zlTvWlWuN4= +golang.org/x/sync v0.1.0 h1:wsuoTGHzEhffawBOhz5CYhcrV4IdKZbEyZjBMuTp12o= +golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.30.0 h1:QjkSwP/36a20jFYWkSue1YwXzLmsV5Gfq7Eiy72C1uc= +golang.org/x/sys v0.30.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 
// cleanPath is a copy of github.com/opencontainers/runc/libcontainer/utils.CleanPath.
// It lexically normalizes a path: absolute paths are cleaned in place, and
// relative paths are cleaned as if rooted at "/" and then made relative
// again, so sequences like "../../x" cannot escape upward.
func cleanPath(path string) string {
	// The empty string stays empty.
	if path == "" {
		return ""
	}

	// XXX: cleaning is load-bearing for path safety (e.g. "/../../..").
	if filepath.IsAbs(path) {
		return filepath.Clean(path)
	}

	// Relative input: anchor it at the root, clean, then strip the root
	// back off. Rel against the separator cannot fail, as the cleaned
	// path is by construction under "/".
	rooted := filepath.Clean(string(os.PathSeparator) + path)
	rel, _ := filepath.Rel(string(os.PathSeparator), rooted)
	return rel
}
+ cg.Resources = &cgroups.Resources{} + mgr, err = New(cg) + if err != nil { + t.Fatal(err) + } + } + _ = mgr.Apply(-1) + _ = mgr.Set(nil) + _ = mgr.Freeze(cgroups.Thawed) + _ = mgr.Exists() + _, _ = mgr.GetAllPids() + _, _ = mgr.GetCgroups() + _, _ = mgr.GetFreezerState() + _ = mgr.Path("") + _ = mgr.GetPaths() + _, _ = mgr.GetStats() + _, _ = mgr.OOMKillCount() + _ = mgr.Destroy() +} diff --git a/manager/new.go b/manager/new.go new file mode 100644 index 0000000..2df39e5 --- /dev/null +++ b/manager/new.go @@ -0,0 +1,77 @@ +package manager + +import ( + "errors" + "fmt" + "path/filepath" + + "github.com/opencontainers/cgroups" + "github.com/opencontainers/cgroups/fs" + "github.com/opencontainers/cgroups/fs2" + "github.com/opencontainers/cgroups/systemd" +) + +// New returns the instance of a cgroup manager, which is chosen +// based on the local environment (whether cgroup v1 or v2 is used) +// and the config (whether config.Systemd is set or not). +func New(config *cgroups.Cgroup) (cgroups.Manager, error) { + return NewWithPaths(config, nil) +} + +// NewWithPaths is similar to New, and can be used in case cgroup paths +// are already well known, which can save some resources. +// +// For cgroup v1, the keys are controller/subsystem name, and the values +// are absolute filesystem paths to the appropriate cgroups. +// +// For cgroup v2, the only key allowed is "" (empty string), and the value +// is the unified cgroup path. +func NewWithPaths(config *cgroups.Cgroup, paths map[string]string) (cgroups.Manager, error) { + if config == nil { + return nil, errors.New("cgroups/manager.New: config must not be nil") + } + if config.Systemd && !systemd.IsRunningSystemd() { + return nil, errors.New("systemd not running on this host, cannot use systemd cgroups manager") + } + + // Cgroup v2 aka unified hierarchy. 
// getUnifiedPath is an implementation detail of libcontainer.
// Historically, libcontainer.Create saves cgroup paths as per-subsystem path
// map (as returned by cm.GetPaths(""), but with v2 we only have one single
// unified path (with "" as a key).
//
// This function converts from that map to string (using "" as a key),
// and also checks that the map itself is sane.
func getUnifiedPath(paths map[string]string) (string, error) {
	if len(paths) > 1 {
		return "", fmt.Errorf("expected a single path, got %+v", paths)
	}

	// Missing key yields "", which is a valid (empty) result.
	path := paths[""]
	if path == "" {
		return "", nil
	}

	// A non-empty path must be absolute and already clean.
	if !filepath.IsAbs(path) || filepath.Clean(path) != path {
		return "", fmt.Errorf("invalid path: %q", path)
	}
	return path, nil
}
+ PercpuUsage []uint64 `json:"percpu_usage,omitempty"` + // CPU time consumed per core in kernel mode + // Units: nanoseconds. + PercpuUsageInKernelmode []uint64 `json:"percpu_usage_in_kernelmode"` + // CPU time consumed per core in user mode + // Units: nanoseconds. + PercpuUsageInUsermode []uint64 `json:"percpu_usage_in_usermode"` + // Time spent by tasks of the cgroup in kernel mode. + // Units: nanoseconds. + UsageInKernelmode uint64 `json:"usage_in_kernelmode"` + // Time spent by tasks of the cgroup in user mode. + // Units: nanoseconds. + UsageInUsermode uint64 `json:"usage_in_usermode"` +} + +type PSIData struct { + Avg10 float64 `json:"avg10"` + Avg60 float64 `json:"avg60"` + Avg300 float64 `json:"avg300"` + Total uint64 `json:"total"` +} + +type PSIStats struct { + Some PSIData `json:"some,omitempty"` + Full PSIData `json:"full,omitempty"` +} + +type CpuStats struct { + CpuUsage CpuUsage `json:"cpu_usage,omitempty"` + ThrottlingData ThrottlingData `json:"throttling_data,omitempty"` + PSI *PSIStats `json:"psi,omitempty"` +} + +type CPUSetStats struct { + // List of the physical numbers of the CPUs on which processes + // in that cpuset are allowed to execute + CPUs []uint16 `json:"cpus,omitempty"` + // cpu_exclusive flag + CPUExclusive uint64 `json:"cpu_exclusive"` + // List of memory nodes on which processes in that cpuset + // are allowed to allocate memory + Mems []uint16 `json:"mems,omitempty"` + // mem_hardwall flag + MemHardwall uint64 `json:"mem_hardwall"` + // mem_exclusive flag + MemExclusive uint64 `json:"mem_exclusive"` + // memory_migrate flag + MemoryMigrate uint64 `json:"memory_migrate"` + // memory_spread page flag + MemorySpreadPage uint64 `json:"memory_spread_page"` + // memory_spread slab flag + MemorySpreadSlab uint64 `json:"memory_spread_slab"` + // memory_pressure + MemoryPressure uint64 `json:"memory_pressure"` + // sched_load balance flag + SchedLoadBalance uint64 `json:"sched_load_balance"` + // sched_relax_domain_level + 
SchedRelaxDomainLevel int64 `json:"sched_relax_domain_level"` +} + +type MemoryData struct { + Usage uint64 `json:"usage,omitempty"` + MaxUsage uint64 `json:"max_usage,omitempty"` + Failcnt uint64 `json:"failcnt"` + Limit uint64 `json:"limit"` +} + +type MemoryStats struct { + // memory used for cache + Cache uint64 `json:"cache,omitempty"` + // usage of memory + Usage MemoryData `json:"usage,omitempty"` + // usage of memory + swap + SwapUsage MemoryData `json:"swap_usage,omitempty"` + // usage of swap only + SwapOnlyUsage MemoryData `json:"swap_only_usage,omitempty"` + // usage of kernel memory + KernelUsage MemoryData `json:"kernel_usage,omitempty"` + // usage of kernel TCP memory + KernelTCPUsage MemoryData `json:"kernel_tcp_usage,omitempty"` + // usage of memory pages by NUMA node + // see chapter 5.6 of memory controller documentation + PageUsageByNUMA PageUsageByNUMA `json:"page_usage_by_numa,omitempty"` + // if true, memory usage is accounted for throughout a hierarchy of cgroups. + UseHierarchy bool `json:"use_hierarchy"` + + Stats map[string]uint64 `json:"stats,omitempty"` + PSI *PSIStats `json:"psi,omitempty"` +} + +type PageUsageByNUMA struct { + // Embedding is used as types can't be recursive. 
+ PageUsageByNUMAInner + Hierarchical PageUsageByNUMAInner `json:"hierarchical,omitempty"` +} + +type PageUsageByNUMAInner struct { + Total PageStats `json:"total,omitempty"` + File PageStats `json:"file,omitempty"` + Anon PageStats `json:"anon,omitempty"` + Unevictable PageStats `json:"unevictable,omitempty"` +} + +type PageStats struct { + Total uint64 `json:"total,omitempty"` + Nodes map[uint8]uint64 `json:"nodes,omitempty"` +} + +type PidsStats struct { + // number of pids in the cgroup + Current uint64 `json:"current,omitempty"` + // active pids hard limit + Limit uint64 `json:"limit,omitempty"` +} + +type BlkioStatEntry struct { + Major uint64 `json:"major,omitempty"` + Minor uint64 `json:"minor,omitempty"` + Op string `json:"op,omitempty"` + Value uint64 `json:"value,omitempty"` +} + +type BlkioStats struct { + // number of bytes transferred to and from the block device + IoServiceBytesRecursive []BlkioStatEntry `json:"io_service_bytes_recursive,omitempty"` + IoServicedRecursive []BlkioStatEntry `json:"io_serviced_recursive,omitempty"` + IoQueuedRecursive []BlkioStatEntry `json:"io_queue_recursive,omitempty"` + IoServiceTimeRecursive []BlkioStatEntry `json:"io_service_time_recursive,omitempty"` + IoWaitTimeRecursive []BlkioStatEntry `json:"io_wait_time_recursive,omitempty"` + IoMergedRecursive []BlkioStatEntry `json:"io_merged_recursive,omitempty"` + IoTimeRecursive []BlkioStatEntry `json:"io_time_recursive,omitempty"` + SectorsRecursive []BlkioStatEntry `json:"sectors_recursive,omitempty"` + PSI *PSIStats `json:"psi,omitempty"` +} + +type HugetlbStats struct { + // current res_counter usage for hugetlb + Usage uint64 `json:"usage,omitempty"` + // maximum usage ever recorded. + MaxUsage uint64 `json:"max_usage,omitempty"` + // number of times hugetlb usage allocation failure. 
+ Failcnt uint64 `json:"failcnt"` +} + +type RdmaEntry struct { + Device string `json:"device,omitempty"` + HcaHandles uint32 `json:"hca_handles,omitempty"` + HcaObjects uint32 `json:"hca_objects,omitempty"` +} + +type RdmaStats struct { + RdmaLimit []RdmaEntry `json:"rdma_limit,omitempty"` + RdmaCurrent []RdmaEntry `json:"rdma_current,omitempty"` +} + +type MiscStats struct { + // current resource usage for a key in misc + Usage uint64 `json:"usage,omitempty"` + // number of times the resource usage was about to go over the max boundary + Events uint64 `json:"events,omitempty"` +} + +type Stats struct { + CpuStats CpuStats `json:"cpu_stats,omitempty"` + CPUSetStats CPUSetStats `json:"cpuset_stats,omitempty"` + MemoryStats MemoryStats `json:"memory_stats,omitempty"` + PidsStats PidsStats `json:"pids_stats,omitempty"` + BlkioStats BlkioStats `json:"blkio_stats,omitempty"` + // the map is in the format "size of hugepage: stats of the hugepage" + HugetlbStats map[string]HugetlbStats `json:"hugetlb_stats,omitempty"` + RdmaStats RdmaStats `json:"rdma_stats,omitempty"` + // the map is in the format "misc resource name: stats of the key" + MiscStats map[string]MiscStats `json:"misc_stats,omitempty"` +} + +func NewStats() *Stats { + memoryStats := MemoryStats{Stats: make(map[string]uint64)} + hugetlbStats := make(map[string]HugetlbStats) + miscStats := make(map[string]MiscStats) + return &Stats{MemoryStats: memoryStats, HugetlbStats: hugetlbStats, MiscStats: miscStats} +} diff --git a/systemd/common.go b/systemd/common.go new file mode 100644 index 0000000..b3077bd --- /dev/null +++ b/systemd/common.go @@ -0,0 +1,362 @@ +package systemd + +import ( + "context" + "errors" + "fmt" + "math" + "os" + "strconv" + "strings" + "sync" + "time" + + systemdDbus "github.com/coreos/go-systemd/v22/dbus" + dbus "github.com/godbus/dbus/v5" + "github.com/sirupsen/logrus" + + "github.com/opencontainers/cgroups" +) + +const ( + // Default kernel value for cpu quota period is 100000 us (100 
ms), same for v1 and v2. + // v1: https://www.kernel.org/doc/html/latest/scheduler/sched-bwc.html and + // v2: https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html + defCPUQuotaPeriod = uint64(100000) +) + +var ( + versionOnce sync.Once + version int + + isRunningSystemdOnce sync.Once + isRunningSystemd bool + + // GenerateDeviceProps is a function to generate systemd device + // properties, used by Set methods. Unless + // [github.com/opencontainers/cgroups/devices] + // package is imported, it is set to nil, so cgroup managers can't + // configure devices. + GenerateDeviceProps func(r *cgroups.Resources, sdVer int) ([]systemdDbus.Property, error) +) + +// NOTE: This function comes from package github.com/coreos/go-systemd/util +// It was borrowed here to avoid a dependency on cgo. +// +// IsRunningSystemd checks whether the host was booted with systemd as its init +// system. This functions similarly to systemd's `sd_booted(3)`: internally, it +// checks whether /run/systemd/system/ exists and is a directory. +// http://www.freedesktop.org/software/systemd/man/sd_booted.html +func IsRunningSystemd() bool { + isRunningSystemdOnce.Do(func() { + fi, err := os.Lstat("/run/systemd/system") + isRunningSystemd = err == nil && fi.IsDir() + }) + return isRunningSystemd +} + +// systemd represents slice hierarchy using `-`, so we need to follow suit when +// generating the path of slice. Essentially, test-a-b.slice becomes +// /test.slice/test-a.slice/test-a-b.slice. +func ExpandSlice(slice string) (string, error) { + suffix := ".slice" + // Name has to end with ".slice", but can't be just ".slice". + if len(slice) < len(suffix) || !strings.HasSuffix(slice, suffix) { + return "", fmt.Errorf("invalid slice name: %s", slice) + } + + // Path-separators are not allowed. 
+ if strings.Contains(slice, "/") { + return "", fmt.Errorf("invalid slice name: %s", slice) + } + + var path, prefix string + sliceName := strings.TrimSuffix(slice, suffix) + // if input was -.slice, we should just return root now + if sliceName == "-" { + return "/", nil + } + for _, component := range strings.Split(sliceName, "-") { + // test--a.slice isn't permitted, nor is -test.slice. + if component == "" { + return "", fmt.Errorf("invalid slice name: %s", slice) + } + + // Append the component to the path and to the prefix. + path += "/" + prefix + component + suffix + prefix += component + "-" + } + return path, nil +} + +func newProp(name string, units interface{}) systemdDbus.Property { + return systemdDbus.Property{ + Name: name, + Value: dbus.MakeVariant(units), + } +} + +func getUnitName(c *cgroups.Cgroup) string { + // by default, we create a scope unless the user explicitly asks for a slice. + if !strings.HasSuffix(c.Name, ".slice") { + return c.ScopePrefix + "-" + c.Name + ".scope" + } + return c.Name +} + +// This code should be in sync with getUnitName. +func getUnitType(unitName string) string { + if strings.HasSuffix(unitName, ".slice") { + return "Slice" + } + return "Scope" +} + +// isDbusError returns true if the error is a specific dbus error. +func isDbusError(err error, name string) bool { + if err != nil { + var derr dbus.Error + if errors.As(err, &derr) { + return strings.Contains(derr.Name, name) + } + } + return false +} + +// isUnitExists returns true if the error is that a systemd unit already exists. 
+func isUnitExists(err error) bool { + return isDbusError(err, "org.freedesktop.systemd1.UnitExists") +} + +func startUnit(cm *dbusConnManager, unitName string, properties []systemdDbus.Property, ignoreExist bool) error { + statusChan := make(chan string, 1) + retry := true + +retry: + err := cm.retryOnDisconnect(func(c *systemdDbus.Conn) error { + _, err := c.StartTransientUnitContext(context.TODO(), unitName, "replace", properties, statusChan) + return err + }) + if err != nil { + if !isUnitExists(err) { + return err + } + if ignoreExist { + // TODO: remove this hack. + // This is kubelet making sure a slice exists (see + // https://github.com/opencontainers/runc/pull/1124). + return nil + } + if retry { + // In case a unit with the same name exists, this may + // be a leftover failed unit. Reset it, so systemd can + // remove it, and retry once. + err = resetFailedUnit(cm, unitName) + if err != nil { + logrus.Warnf("unable to reset failed unit: %v", err) + } + retry = false + goto retry + } + return err + } + + timeout := time.NewTimer(30 * time.Second) + defer timeout.Stop() + + select { + case s := <-statusChan: + close(statusChan) + // Please refer to https://pkg.go.dev/github.com/coreos/go-systemd/v22/dbus#Conn.StartUnit + if s != "done" { + _ = resetFailedUnit(cm, unitName) + return fmt.Errorf("error creating systemd unit `%s`: got `%s`", unitName, s) + } + case <-timeout.C: + _ = resetFailedUnit(cm, unitName) + return errors.New("Timeout waiting for systemd to create " + unitName) + } + + return nil +} + +func stopUnit(cm *dbusConnManager, unitName string) error { + statusChan := make(chan string, 1) + err := cm.retryOnDisconnect(func(c *systemdDbus.Conn) error { + _, err := c.StopUnitContext(context.TODO(), unitName, "replace", statusChan) + return err + }) + if err == nil { + timeout := time.NewTimer(30 * time.Second) + defer timeout.Stop() + + select { + case s := <-statusChan: + close(statusChan) + // Please refer to 
https://godoc.org/github.com/coreos/go-systemd/v22/dbus#Conn.StartUnit + if s != "done" { + logrus.Warnf("error removing unit `%s`: got `%s`. Continuing...", unitName, s) + } + case <-timeout.C: + return errors.New("Timed out while waiting for systemd to remove " + unitName) + } + } + + // In case of a failed unit, let systemd remove it. + _ = resetFailedUnit(cm, unitName) + + return nil +} + +func resetFailedUnit(cm *dbusConnManager, name string) error { + return cm.retryOnDisconnect(func(c *systemdDbus.Conn) error { + return c.ResetFailedUnitContext(context.TODO(), name) + }) +} + +func getUnitTypeProperty(cm *dbusConnManager, unitName string, unitType string, propertyName string) (*systemdDbus.Property, error) { + var prop *systemdDbus.Property + err := cm.retryOnDisconnect(func(c *systemdDbus.Conn) (Err error) { + prop, Err = c.GetUnitTypePropertyContext(context.TODO(), unitName, unitType, propertyName) + return Err + }) + return prop, err +} + +func setUnitProperties(cm *dbusConnManager, name string, properties ...systemdDbus.Property) error { + return cm.retryOnDisconnect(func(c *systemdDbus.Conn) error { + return c.SetUnitPropertiesContext(context.TODO(), name, true, properties...) + }) +} + +func getManagerProperty(cm *dbusConnManager, name string) (string, error) { + str := "" + err := cm.retryOnDisconnect(func(c *systemdDbus.Conn) error { + var err error + str, err = c.GetManagerProperty(name) + return err + }) + if err != nil { + return "", err + } + return strconv.Unquote(str) +} + +func systemdVersion(cm *dbusConnManager) int { + versionOnce.Do(func() { + version = -1 + verStr, err := getManagerProperty(cm, "Version") + if err == nil { + version, err = systemdVersionAtoi(verStr) + } + + if err != nil { + logrus.WithError(err).Error("unable to get systemd version") + } + }) + + return version +} + +// systemdVersionAtoi extracts a numeric systemd version from the argument. 
+// The argument should be of the form: "v245.4-1.fc32", "245", "v245-1.fc32", +// "245-1.fc32" (with or without quotes). The result for all of the above +// should be 245. +func systemdVersionAtoi(str string) (int, error) { + // Unconditionally remove the leading prefix ("v). + str = strings.TrimLeft(str, `"v`) + // Match on the first integer we can grab. + for i := 0; i < len(str); i++ { + if str[i] < '0' || str[i] > '9' { + // First non-digit: cut the tail. + str = str[:i] + break + } + } + ver, err := strconv.Atoi(str) + if err != nil { + return -1, fmt.Errorf("can't parse version: %w", err) + } + return ver, nil +} + +func addCpuQuota(cm *dbusConnManager, properties *[]systemdDbus.Property, quota int64, period uint64) { + if period != 0 { + // systemd only supports CPUQuotaPeriodUSec since v242 + sdVer := systemdVersion(cm) + if sdVer >= 242 { + *properties = append(*properties, + newProp("CPUQuotaPeriodUSec", period)) + } else { + logrus.Debugf("systemd v%d is too old to support CPUQuotaPeriodSec "+ + " (setting will still be applied to cgroupfs)", sdVer) + } + } + if quota != 0 || period != 0 { + // corresponds to USEC_INFINITY in systemd + cpuQuotaPerSecUSec := uint64(math.MaxUint64) + if quota > 0 { + if period == 0 { + // assume the default + period = defCPUQuotaPeriod + } + // systemd converts CPUQuotaPerSecUSec (microseconds per CPU second) to CPUQuota + // (integer percentage of CPU) internally. This means that if a fractional percent of + // CPU is indicated by Resources.CpuQuota, we need to round up to the nearest + // 10ms (1% of a second) such that child cgroups can set the cpu.cfs_quota_us they expect. 
+ cpuQuotaPerSecUSec = uint64(quota*1000000) / period + if cpuQuotaPerSecUSec%10000 != 0 { + cpuQuotaPerSecUSec = ((cpuQuotaPerSecUSec / 10000) + 1) * 10000 + } + } + *properties = append(*properties, + newProp("CPUQuotaPerSecUSec", cpuQuotaPerSecUSec)) + } +} + +func addCpuset(cm *dbusConnManager, props *[]systemdDbus.Property, cpus, mems string) error { + if cpus == "" && mems == "" { + return nil + } + + // systemd only supports AllowedCPUs/AllowedMemoryNodes since v244 + sdVer := systemdVersion(cm) + if sdVer < 244 { + logrus.Debugf("systemd v%d is too old to support AllowedCPUs/AllowedMemoryNodes"+ + " (settings will still be applied to cgroupfs)", sdVer) + return nil + } + + if cpus != "" { + bits, err := RangeToBits(cpus) + if err != nil { + return fmt.Errorf("resources.CPU.Cpus=%q conversion error: %w", + cpus, err) + } + *props = append(*props, + newProp("AllowedCPUs", bits)) + } + if mems != "" { + bits, err := RangeToBits(mems) + if err != nil { + return fmt.Errorf("resources.CPU.Mems=%q conversion error: %w", + mems, err) + } + *props = append(*props, + newProp("AllowedMemoryNodes", bits)) + } + return nil +} + +// generateDeviceProperties takes the configured device rules and generates a +// corresponding set of systemd properties to configure the devices correctly. +func generateDeviceProperties(r *cgroups.Resources, cm *dbusConnManager) ([]systemdDbus.Property, error) { + if GenerateDeviceProps == nil { + if len(r.Devices) > 0 { + return nil, cgroups.ErrDevicesUnsupported + } + return nil, nil + } + + return GenerateDeviceProps(r, systemdVersion(cm)) +} diff --git a/systemd/cpuset.go b/systemd/cpuset.go new file mode 100644 index 0000000..c6f5642 --- /dev/null +++ b/systemd/cpuset.go @@ -0,0 +1,60 @@ +package systemd + +import ( + "errors" + "math/big" + "strconv" + "strings" +) + +// RangeToBits converts a text representation of a CPU mask (as written to +// or read from cgroups' cpuset.* files, e.g. 
"1,3-5") to a slice of bytes +// with the corresponding bits set (as consumed by systemd over dbus as +// AllowedCPUs/AllowedMemoryNodes unit property value). +func RangeToBits(str string) ([]byte, error) { + bits := new(big.Int) + + for _, r := range strings.Split(str, ",") { + // allow extra spaces around + r = strings.TrimSpace(r) + // allow empty elements (extra commas) + if r == "" { + continue + } + startr, endr, ok := strings.Cut(r, "-") + if ok { + start, err := strconv.ParseUint(startr, 10, 32) + if err != nil { + return nil, err + } + end, err := strconv.ParseUint(endr, 10, 32) + if err != nil { + return nil, err + } + if start > end { + return nil, errors.New("invalid range: " + r) + } + for i := start; i <= end; i++ { + bits.SetBit(bits, int(i), 1) + } + } else { + val, err := strconv.ParseUint(startr, 10, 32) + if err != nil { + return nil, err + } + bits.SetBit(bits, int(val), 1) + } + } + + ret := bits.Bytes() + if len(ret) == 0 { + // do not allow empty values + return nil, errors.New("empty value") + } + + // fit cpuset parsing order in systemd + for l, r := 0, len(ret)-1; l < r; l, r = l+1, r-1 { + ret[l], ret[r] = ret[r], ret[l] + } + return ret, nil +} diff --git a/systemd/cpuset_test.go b/systemd/cpuset_test.go new file mode 100644 index 0000000..bda31a5 --- /dev/null +++ b/systemd/cpuset_test.go @@ -0,0 +1,55 @@ +package systemd + +import ( + "bytes" + "testing" +) + +func TestRangeToBits(t *testing.T) { + testCases := []struct { + in string + out []byte + isErr bool + }{ + {in: "", isErr: true}, + {in: "0", out: []byte{1}}, + {in: "1", out: []byte{2}}, + {in: "0-1", out: []byte{3}}, + {in: "0,1", out: []byte{3}}, + {in: ",0,1,", out: []byte{3}}, + {in: "0-3", out: []byte{0x0f}}, + {in: "0,1,2-3", out: []byte{0x0f}}, + {in: "4-7", out: []byte{0xf0}}, + {in: "0-7", out: []byte{0xff}}, + {in: "0-15", out: []byte{0xff, 0xff}}, + {in: "16", out: []byte{0, 0, 1}}, + {in: "0-3,32-33", out: []byte{0x0f, 0, 0, 0, 3}}, + // extra spaces and tabs are ok 
+ {in: "1, 2, 1-2", out: []byte{6}}, + {in: " , 1 , 3 , 5-7, ", out: []byte{0xea}}, + // somewhat large values + {in: "128-130,1", out: []byte{2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7}}, + + {in: "-", isErr: true}, + {in: "1-", isErr: true}, + {in: "-3", isErr: true}, + // bad range (start > end) + {in: "54-53", isErr: true}, + // kernel does not allow extra spaces inside a range + {in: "1 - 2", isErr: true}, + } + + for _, tc := range testCases { + out, err := RangeToBits(tc.in) + if err != nil { + if !tc.isErr { + t.Errorf("case %q: unexpected error: %v", tc.in, err) + } + + continue + } + if !bytes.Equal(out, tc.out) { + t.Errorf("case %q: expected %v, got %v", tc.in, tc.out, out) + } + } +} diff --git a/systemd/dbus.go b/systemd/dbus.go new file mode 100644 index 0000000..bb87ae8 --- /dev/null +++ b/systemd/dbus.go @@ -0,0 +1,102 @@ +package systemd + +import ( + "context" + "errors" + "fmt" + "sync" + + systemdDbus "github.com/coreos/go-systemd/v22/dbus" + dbus "github.com/godbus/dbus/v5" +) + +var ( + dbusC *systemdDbus.Conn + dbusMu sync.RWMutex + dbusInited bool + dbusRootless bool +) + +type dbusConnManager struct{} + +// newDbusConnManager initializes systemd dbus connection manager. +func newDbusConnManager(rootless bool) *dbusConnManager { + dbusMu.Lock() + defer dbusMu.Unlock() + if dbusInited && rootless != dbusRootless { + panic("can't have both root and rootless dbus") + } + dbusInited = true + dbusRootless = rootless + return &dbusConnManager{} +} + +// getConnection lazily initializes and returns systemd dbus connection. +func (d *dbusConnManager) getConnection() (*systemdDbus.Conn, error) { + // In the case where dbusC != nil + // Use the read lock the first time to ensure + // that Conn can be acquired at the same time. 
+ dbusMu.RLock() + if conn := dbusC; conn != nil { + dbusMu.RUnlock() + return conn, nil + } + dbusMu.RUnlock() + + // In the case where dbusC == nil + // Use write lock to ensure that only one + // will be created + dbusMu.Lock() + defer dbusMu.Unlock() + if conn := dbusC; conn != nil { + return conn, nil + } + + conn, err := d.newConnection() + if err != nil { + // When dbus-user-session is not installed, we can't detect whether we should try to connect to user dbus or system dbus, so d.dbusRootless is set to false. + // This may fail with a cryptic error "read unix @->/run/systemd/private: read: connection reset by peer: unknown." + // https://github.com/moby/moby/issues/42793 + return nil, fmt.Errorf("failed to connect to dbus (hint: for rootless containers, maybe you need to install dbus-user-session package, see https://github.com/opencontainers/runc/blob/master/docs/cgroup-v2.md): %w", err) + } + dbusC = conn + return conn, nil +} + +func (d *dbusConnManager) newConnection() (*systemdDbus.Conn, error) { + if dbusRootless { + return newUserSystemdDbus() + } + return systemdDbus.NewWithContext(context.TODO()) +} + +// resetConnection resets the connection to its initial state +// (so it can be reconnected if necessary). +func (d *dbusConnManager) resetConnection(conn *systemdDbus.Conn) { + dbusMu.Lock() + defer dbusMu.Unlock() + if dbusC != nil && dbusC == conn { + dbusC.Close() + dbusC = nil + } +} + +// retryOnDisconnect calls op, and if the error it returns is about closed dbus +// connection, the connection is re-established and the op is retried. This helps +// with the situation when dbus is restarted and we have a stale connection. 
+func (d *dbusConnManager) retryOnDisconnect(op func(*systemdDbus.Conn) error) error { + for { + conn, err := d.getConnection() + if err != nil { + return err + } + err = op(conn) + if err == nil { + return nil + } + if !errors.Is(err, dbus.ErrClosed) { + return err + } + d.resetConnection(conn) + } +} diff --git a/systemd/devices.go b/systemd/devices.go new file mode 100644 index 0000000..51ca7fa --- /dev/null +++ b/systemd/devices.go @@ -0,0 +1,74 @@ +package systemd + +import ( + "reflect" + + dbus "github.com/godbus/dbus/v5" + + "github.com/opencontainers/cgroups" +) + +// freezeBeforeSet answers whether there is a need to freeze the cgroup before +// applying its systemd unit properties, and thaw after, while avoiding +// unnecessary freezer state changes. +// +// The reason why we have to freeze is that systemd's application of device +// rules is done disruptively, resulting in spurious errors to common devices +// (unlike our fs driver, they will happily write deny-all rules to running +// containers). So we have to freeze the container to avoid the container get +// an occasional "permission denied" error. +func (m *LegacyManager) freezeBeforeSet(unitName string, r *cgroups.Resources) (needsFreeze, needsThaw bool, err error) { + // Special case for SkipDevices, as used by Kubernetes to create pod + // cgroups with allow-all device policy). + if r.SkipDevices { + if r.SkipFreezeOnSet { + // Both needsFreeze and needsThaw are false. + return + } + + // No need to freeze if SkipDevices is set, and either + // (1) systemd unit does not (yet) exist, or + // (2) it has DevicePolicy=auto and empty DeviceAllow list. + // + // Interestingly, (1) and (2) are the same here because + // a non-existent unit returns default properties, + // and settings in (2) are the defaults. + // + // Do not return errors from getUnitTypeProperty, as they alone + // should not prevent Set from working. 
+ + unitType := getUnitType(unitName) + + devPolicy, e := getUnitTypeProperty(m.dbus, unitName, unitType, "DevicePolicy") + if e == nil && devPolicy.Value == dbus.MakeVariant("auto") { + devAllow, e := getUnitTypeProperty(m.dbus, unitName, unitType, "DeviceAllow") + if e == nil { + if rv := reflect.ValueOf(devAllow.Value.Value()); rv.Kind() == reflect.Slice && rv.Len() == 0 { + needsFreeze = false + needsThaw = false + return + } + } + } + } + + needsFreeze = true + needsThaw = true + + // Check the current freezer state. + freezerState, err := m.GetFreezerState() + if err != nil { + return + } + if freezerState == cgroups.Frozen { + // Already frozen, and should stay frozen. + needsFreeze = false + needsThaw = false + } + + if r.Freezer == cgroups.Frozen { + // Will be frozen anyway -- no need to thaw. + needsThaw = false + } + return +} diff --git a/systemd/freeze_test.go b/systemd/freeze_test.go new file mode 100644 index 0000000..35558a8 --- /dev/null +++ b/systemd/freeze_test.go @@ -0,0 +1,354 @@ +package systemd + +import ( + "bufio" + "bytes" + "os" + "os/exec" + "strings" + "testing" + + "github.com/opencontainers/cgroups" + "golang.org/x/sys/unix" +) + +func TestFreezeBeforeSet(t *testing.T) { + requireV1(t) + + testCases := []struct { + desc string + // Test input. + cg *cgroups.Cgroup + preFreeze bool + // Expected values. + // Before unit creation (Apply). + freeze0, thaw0 bool + // After unit creation. + freeze1, thaw1 bool + }{ + { + // A slice with SkipDevices. + desc: "slice,skip-devices", + cg: &cgroups.Cgroup{ + Name: "system-runc_test_freeze_1.slice", + Parent: "system.slice", + Resources: &cgroups.Resources{ + SkipDevices: true, + }, + }, + // Expected. + freeze0: false, + thaw0: false, + freeze1: false, + thaw1: false, + }, + { + // A scope with SkipDevices. Not a realistic scenario with runc + // (as container can't have SkipDevices == true), but possible + // for a standalone cgroup manager. 
+ desc: "scope,skip-devices", + cg: &cgroups.Cgroup{ + ScopePrefix: "test", + Name: "testFreeze2", + Parent: "system.slice", + Resources: &cgroups.Resources{ + SkipDevices: true, + }, + }, + // Expected. + freeze0: false, + thaw0: false, + freeze1: false, + thaw1: false, + }, + { + // A slice that is about to be frozen in Set. + desc: "slice,will-freeze", + cg: &cgroups.Cgroup{ + Name: "system-runc_test_freeze_3.slice", + Parent: "system.slice", + Resources: &cgroups.Resources{ + Freezer: cgroups.Frozen, + }, + }, + // Expected. + freeze0: true, + thaw0: false, + freeze1: true, + thaw1: false, + }, + { + // A pre-frozen slice that should stay frozen. + desc: "slice,pre-frozen,will-freeze", + cg: &cgroups.Cgroup{ + Name: "system-runc_test_freeze_4.slice", + Parent: "system.slice", + Resources: &cgroups.Resources{ + Freezer: cgroups.Frozen, + }, + }, + preFreeze: true, + // Expected. + freeze0: true, // not actually frozen yet. + thaw0: false, + freeze1: false, + thaw1: false, + }, + { + // A pre-frozen scope with skip devices set. + desc: "scope,pre-frozen,skip-devices", + cg: &cgroups.Cgroup{ + ScopePrefix: "test", + Name: "testFreeze5", + Parent: "system.slice", + Resources: &cgroups.Resources{ + SkipDevices: true, + }, + }, + preFreeze: true, + // Expected. + freeze0: false, + thaw0: false, + freeze1: false, + thaw1: false, + }, + { + // A pre-frozen scope which will be thawed. + desc: "scope,pre-frozen", + cg: &cgroups.Cgroup{ + ScopePrefix: "test", + Name: "testFreeze6", + Parent: "system.slice", + Resources: &cgroups.Resources{}, + }, + preFreeze: true, + // Expected. + freeze0: true, // not actually frozen yet. + thaw0: true, + freeze1: false, + thaw1: false, + }, + } + + for _, tc := range testCases { + tc := tc + t.Run(tc.desc, func(t *testing.T) { + m, err := NewLegacyManager(tc.cg, nil) + if err != nil { + t.Fatal(err) + } + defer m.Destroy() //nolint:errcheck + + // Checks for a non-existent unit. 
+ freeze, thaw, err := m.freezeBeforeSet(getUnitName(tc.cg), tc.cg.Resources) + if err != nil { + t.Fatal(err) + } + if freeze != tc.freeze0 || thaw != tc.thaw0 { + t.Errorf("before Apply (non-existent unit): expected freeze: %v, thaw: %v, got freeze: %v, thaw: %v", + tc.freeze0, tc.thaw0, freeze, thaw) + } + + // Create systemd unit. + pid := -1 + if strings.HasSuffix(getUnitName(tc.cg), ".scope") { + // Scopes require a process inside. + cmd := exec.Command("bash", "-c", "sleep 1m") + if err := cmd.Start(); err != nil { + t.Fatal(err) + } + pid = cmd.Process.Pid + // Make sure to not leave a zombie. + defer func() { + // These may fail, we don't care. + _ = cmd.Process.Kill() + _ = cmd.Wait() + }() + } + if err := m.Apply(pid); err != nil { + t.Fatal(err) + } + if tc.preFreeze { + if err := m.Freeze(cgroups.Frozen); err != nil { + t.Error(err) + return // no more checks + } + } + freeze, thaw, err = m.freezeBeforeSet(getUnitName(tc.cg), tc.cg.Resources) + if err != nil { + t.Error(err) + return // no more checks + } + if freeze != tc.freeze1 || thaw != tc.thaw1 { + t.Errorf("expected freeze: %v, thaw: %v, got freeze: %v, thaw: %v", + tc.freeze1, tc.thaw1, freeze, thaw) + } + // Destroy() timeouts on a frozen container, so we need to thaw it. + if tc.preFreeze { + if err := m.Freeze(cgroups.Thawed); err != nil { + t.Error(err) + } + } + // Destroy() does not kill processes in cgroup, so we should. + if pid != -1 { + if err = unix.Kill(pid, unix.SIGKILL); err != nil { + t.Errorf("unable to kill pid %d: %s", pid, err) + } + } + // Not really needed, but may help catch some bugs. + if err := m.Destroy(); err != nil { + t.Errorf("destroy: %s", err) + } + }) + } +} + +// requireV1 skips the test unless a set of requirements (cgroup v1, +// systemd, root) is met. 
+func requireV1(t *testing.T) { + t.Helper() + if cgroups.IsCgroup2UnifiedMode() { + t.Skip("Test requires cgroup v1.") + } + if !IsRunningSystemd() { + t.Skip("Test requires systemd.") + } + if os.Geteuid() != 0 { + t.Skip("Test requires root.") + } +} + +func TestFreezePodCgroup(t *testing.T) { + if !IsRunningSystemd() { + t.Skip("Test requires systemd.") + } + if os.Geteuid() != 0 { + t.Skip("Test requires root.") + } + + podConfig := &cgroups.Cgroup{ + Parent: "system.slice", + Name: "system-runc_test_pod.slice", + Resources: &cgroups.Resources{ + SkipDevices: true, + Freezer: cgroups.Frozen, + }, + } + // Create a "pod" cgroup (a systemd slice to hold containers), + // which is frozen initially. + pm := newManager(t, podConfig) + if err := pm.Apply(-1); err != nil { + t.Fatal(err) + } + + if err := pm.Set(podConfig.Resources); err != nil { + t.Fatal(err) + } + + // Check the pod is frozen. + pf, err := pm.GetFreezerState() + if err != nil { + t.Fatal(err) + } + if pf != cgroups.Frozen { + t.Fatalf("expected pod to be frozen, got %v", pf) + } + + // Create a "container" within the "pod" cgroup. + // This is not a real container, just a process in the cgroup. + containerConfig := &cgroups.Cgroup{ + Parent: "system-runc_test_pod.slice", + ScopePrefix: "test", + Name: "inner-container", + Resources: &cgroups.Resources{}, + } + + cmd := exec.Command("bash", "-c", "while read; do echo $REPLY; done") + cmd.Env = append(os.Environ(), "LANG=C") + + // Setup stdin. + stdinR, stdinW, err := os.Pipe() + if err != nil { + t.Fatal(err) + } + cmd.Stdin = stdinR + + // Setup stdout. + stdoutR, stdoutW, err := os.Pipe() + if err != nil { + t.Fatal(err) + } + cmd.Stdout = stdoutW + rdr := bufio.NewReader(stdoutR) + + // Setup stderr. + var stderr bytes.Buffer + cmd.Stderr = &stderr + + err = cmd.Start() + stdinR.Close() + stdoutW.Close() + defer func() { + _ = stdinW.Close() + _ = stdoutR.Close() + }() + if err != nil { + t.Fatal(err) + } + // Make sure to not leave a zombie. 
+ defer func() { + // These may fail, we don't care. + _ = cmd.Process.Kill() + _ = cmd.Wait() + }() + + // Put the process into a cgroup. + cm := newManager(t, containerConfig) + + if err := cm.Apply(cmd.Process.Pid); err != nil { + t.Fatal(err) + } + if err := cm.Set(containerConfig.Resources); err != nil { + t.Fatal(err) + } + // Check that we put the "container" into the "pod" cgroup. + if !strings.HasPrefix(cm.Path("freezer"), pm.Path("freezer")) { + t.Fatalf("expected container cgroup path %q to be under pod cgroup path %q", + cm.Path("freezer"), pm.Path("freezer")) + } + // Check the container is not reported as frozen despite the frozen parent. + cf, err := cm.GetFreezerState() + if err != nil { + t.Fatal(err) + } + if cf != cgroups.Thawed { + t.Fatalf("expected container to be thawed, got %v", cf) + } + + // Unfreeze the pod. + if err := pm.Freeze(cgroups.Thawed); err != nil { + t.Fatal(err) + } + + cf, err = cm.GetFreezerState() + if err != nil { + t.Fatal(err) + } + if cf != cgroups.Thawed { + t.Fatalf("expected container to be thawed, got %v", cf) + } + + // Check the "container" works. 
+ marker := "one two\n" + _, err = stdinW.WriteString(marker) + if err != nil { + t.Fatal(err) + } + reply, err := rdr.ReadString('\n') + if err != nil { + t.Fatalf("reading from container: %v", err) + } + if reply != marker { + t.Fatalf("expected %q, got %q", marker, reply) + } +} diff --git a/systemd/systemd_test.go b/systemd/systemd_test.go new file mode 100644 index 0000000..dae851c --- /dev/null +++ b/systemd/systemd_test.go @@ -0,0 +1,180 @@ +package systemd + +import ( + "os" + "reflect" + "testing" + + systemdDbus "github.com/coreos/go-systemd/v22/dbus" + "github.com/opencontainers/cgroups" +) + +func newManager(t *testing.T, config *cgroups.Cgroup) (m cgroups.Manager) { + t.Helper() + var err error + + if cgroups.IsCgroup2UnifiedMode() { + m, err = NewUnifiedManager(config, "") + } else { + m, err = NewLegacyManager(config, nil) + } + if err != nil { + t.Fatal(err) + } + t.Cleanup(func() { _ = m.Destroy() }) + + return m +} + +func TestSystemdVersion(t *testing.T) { + systemdVersionTests := []struct { + verStr string + expectedVer int + expectErr bool + }{ + {`"219"`, 219, false}, + {`"v245.4-1.fc32"`, 245, false}, + {`"241-1"`, 241, false}, + {`"v241-1"`, 241, false}, + {`333.45"`, 333, false}, + {`v321-0`, 321, false}, + {"NaN", -1, true}, + {"", -1, true}, + {"v", -1, true}, + } + for _, sdTest := range systemdVersionTests { + ver, err := systemdVersionAtoi(sdTest.verStr) + if !sdTest.expectErr && err != nil { + t.Errorf("systemdVersionAtoi(%s); want nil; got %v", sdTest.verStr, err) + } + if sdTest.expectErr && err == nil { + t.Errorf("systemdVersionAtoi(%s); wanted failure; got nil", sdTest.verStr) + } + if ver != sdTest.expectedVer { + t.Errorf("systemdVersionAtoi(%s); want %d; got %d", sdTest.verStr, sdTest.expectedVer, ver) + } + } +} + +func TestValidUnitTypes(t *testing.T) { + testCases := []struct { + unitName string + expectedUnitType string + }{ + {"system.slice", "Slice"}, + {"kubepods.slice", "Slice"}, + {"testing-container:ab.scope", 
"Scope"}, + } + for _, sdTest := range testCases { + unitType := getUnitType(sdTest.unitName) + if unitType != sdTest.expectedUnitType { + t.Errorf("getUnitType(%s); want %q; got %q", sdTest.unitName, sdTest.expectedUnitType, unitType) + } + } +} + +func TestUnitExistsIgnored(t *testing.T) { + if !IsRunningSystemd() { + t.Skip("Test requires systemd.") + } + if os.Geteuid() != 0 { + t.Skip("Test requires root.") + } + + podConfig := &cgroups.Cgroup{ + Parent: "system.slice", + Name: "system-runc_test_exists.slice", + Resources: &cgroups.Resources{}, + } + // Create "pods" cgroup (a systemd slice to hold containers). + pm := newManager(t, podConfig) + + // create twice to make sure "UnitExists" error is ignored. + for i := 0; i < 2; i++ { + if err := pm.Apply(-1); err != nil { + t.Fatal(err) + } + } +} + +func TestUnifiedResToSystemdProps(t *testing.T) { + if !IsRunningSystemd() { + t.Skip("Test requires systemd.") + } + if !cgroups.IsCgroup2UnifiedMode() { + t.Skip("cgroup v2 is required") + } + + cm := newDbusConnManager(os.Geteuid() != 0) + + testCases := []struct { + name string + minVer int + res map[string]string + expError bool + expProps []systemdDbus.Property + }{ + { + name: "empty map", + res: map[string]string{}, + }, + { + name: "only cpu.idle=1", + minVer: cpuIdleSupportedVersion, + res: map[string]string{ + "cpu.idle": "1", + }, + expProps: []systemdDbus.Property{ + newProp("CPUWeight", uint64(0)), + }, + }, + { + name: "only cpu.idle=0", + minVer: cpuIdleSupportedVersion, + res: map[string]string{ + "cpu.idle": "0", + }, + }, + { + name: "cpu.idle=1 and cpu.weight=1000", + minVer: cpuIdleSupportedVersion, + res: map[string]string{ + "cpu.idle": "1", + "cpu.weight": "1000", + }, + expProps: []systemdDbus.Property{ + newProp("CPUWeight", uint64(0)), + }, + }, + { + name: "cpu.idle=0 and cpu.weight=1000", + minVer: cpuIdleSupportedVersion, + res: map[string]string{ + "cpu.idle": "0", + "cpu.weight": "1000", + }, + expProps: []systemdDbus.Property{ + 
newProp("CPUWeight", uint64(1000)), + }, + }, + } + + for _, tc := range testCases { + tc := tc + t.Run(tc.name, func(t *testing.T) { + if tc.minVer != 0 && systemdVersion(cm) < tc.minVer { + t.Skipf("requires systemd >= %d", tc.minVer) + } + props, err := unifiedResToSystemdProps(cm, tc.res) + if err != nil && !tc.expError { + t.Fatalf("expected no error, got: %v", err) + } + if err == nil && tc.expError { + t.Fatal("expected error, got nil") + } + if !reflect.DeepEqual(tc.expProps, props) { + t.Errorf("wrong properties (exp %+v, got %+v)", tc.expProps, props) + } + }) + } +} diff --git a/systemd/user.go b/systemd/user.go new file mode 100644 index 0000000..4a4348e --- /dev/null +++ b/systemd/user.go @@ -0,0 +1,92 @@ +package systemd + +import ( + "bufio" + "bytes" + "errors" + "fmt" + "os" + "os/exec" + "path/filepath" + "strconv" + "strings" + + systemdDbus "github.com/coreos/go-systemd/v22/dbus" + dbus "github.com/godbus/dbus/v5" + "github.com/moby/sys/userns" +) + +// newUserSystemdDbus creates a connection for systemd user-instance. +func newUserSystemdDbus() (*systemdDbus.Conn, error) { + addr, err := DetectUserDbusSessionBusAddress() + if err != nil { + return nil, err + } + uid, err := DetectUID() + if err != nil { + return nil, err + } + + return systemdDbus.NewConnection(func() (*dbus.Conn, error) { + conn, err := dbus.Dial(addr) + if err != nil { + return nil, fmt.Errorf("error while dialing %q: %w", addr, err) + } + methods := []dbus.Auth{dbus.AuthExternal(strconv.Itoa(uid))} + err = conn.Auth(methods) + if err != nil { + conn.Close() + return nil, fmt.Errorf("error while authenticating connection (address=%q, UID=%d): %w", addr, uid, err) + } + if err = conn.Hello(); err != nil { + conn.Close() + return nil, fmt.Errorf("error while sending Hello message (address=%q, UID=%d): %w", addr, uid, err) + } + return conn, nil + }) +} + +// DetectUID detects UID from the OwnerUID field of `busctl --user status` +// if running in userNS. 
The value corresponds to sd_bus_creds_get_owner_uid(3) . +// +// Otherwise returns os.Getuid() . +func DetectUID() (int, error) { + if !userns.RunningInUserNS() { + return os.Getuid(), nil + } + b, err := exec.Command("busctl", "--user", "--no-pager", "status").CombinedOutput() + if err != nil { + return -1, fmt.Errorf("could not execute `busctl --user --no-pager status` (output: %q): %w", string(b), err) + } + scanner := bufio.NewScanner(bytes.NewReader(b)) + for scanner.Scan() { + s := strings.TrimSpace(scanner.Text()) + if uidStr, ok := strings.CutPrefix(s, "OwnerUID="); ok { + i, err := strconv.Atoi(uidStr) + if err != nil { + return -1, fmt.Errorf("could not detect the OwnerUID: %w", err) + } + return i, nil + } + } + if err := scanner.Err(); err != nil { + return -1, err + } + return -1, errors.New("could not detect the OwnerUID") +} + +// DetectUserDbusSessionBusAddress returns $DBUS_SESSION_BUS_ADDRESS, if set. +// Otherwise it returns "unix:path=$XDG_RUNTIME_DIR/bus", if $XDG_RUNTIME_DIR/bus exists. 
+func DetectUserDbusSessionBusAddress() (string, error) { + if env := os.Getenv("DBUS_SESSION_BUS_ADDRESS"); env != "" { + return env, nil + } + if xdr := os.Getenv("XDG_RUNTIME_DIR"); xdr != "" { + busPath := filepath.Join(xdr, "bus") + if _, err := os.Stat(busPath); err == nil { + busAddress := "unix:path=" + dbus.EscapeBusAddressValue(busPath) + return busAddress, nil + } + } + return "", errors.New("could not detect DBUS_SESSION_BUS_ADDRESS from the environment; make sure you have installed the dbus-user-session or dbus-daemon package; note you may need to re-login") +} diff --git a/systemd/v1.go b/systemd/v1.go new file mode 100644 index 0000000..8453e9b --- /dev/null +++ b/systemd/v1.go @@ -0,0 +1,412 @@ +package systemd + +import ( + "errors" + "os" + "path/filepath" + "strings" + "sync" + + systemdDbus "github.com/coreos/go-systemd/v22/dbus" + "github.com/sirupsen/logrus" + + "github.com/opencontainers/cgroups" + "github.com/opencontainers/cgroups/fs" +) + +type LegacyManager struct { + mu sync.Mutex + cgroups *cgroups.Cgroup + paths map[string]string + dbus *dbusConnManager +} + +func NewLegacyManager(cg *cgroups.Cgroup, paths map[string]string) (*LegacyManager, error) { + if cg.Rootless { + return nil, errors.New("cannot use rootless systemd cgroups manager on cgroup v1") + } + if cg.Resources != nil && cg.Resources.Unified != nil { + return nil, cgroups.ErrV1NoUnified + } + if paths == nil { + var err error + paths, err = initPaths(cg) + if err != nil { + return nil, err + } + } + return &LegacyManager{ + cgroups: cg, + paths: paths, + dbus: newDbusConnManager(false), + }, nil +} + +type subsystem interface { + // Name returns the name of the subsystem. + Name() string + // GetStats returns the stats, as 'stats', corresponding to the cgroup under 'path'. + GetStats(path string, stats *cgroups.Stats) error + // Set sets cgroup resource limits. 
+ Set(path string, r *cgroups.Resources) error +} + +var errSubsystemDoesNotExist = errors.New("cgroup: subsystem does not exist") + +var legacySubsystems = []subsystem{ + &fs.CpusetGroup{}, + &fs.DevicesGroup{}, + &fs.MemoryGroup{}, + &fs.CpuGroup{}, + &fs.CpuacctGroup{}, + &fs.PidsGroup{}, + &fs.BlkioGroup{}, + &fs.HugetlbGroup{}, + &fs.PerfEventGroup{}, + &fs.FreezerGroup{}, + &fs.NetPrioGroup{}, + &fs.NetClsGroup{}, + &fs.NameGroup{GroupName: "name=systemd"}, + &fs.RdmaGroup{}, + &fs.NameGroup{GroupName: "misc"}, +} + +func genV1ResourcesProperties(r *cgroups.Resources, cm *dbusConnManager) ([]systemdDbus.Property, error) { + var properties []systemdDbus.Property + + deviceProperties, err := generateDeviceProperties(r, cm) + if err != nil { + return nil, err + } + properties = append(properties, deviceProperties...) + + if r.Memory != 0 { + properties = append(properties, + newProp("MemoryLimit", uint64(r.Memory))) + } + + if r.CpuShares != 0 { + properties = append(properties, + newProp("CPUShares", r.CpuShares)) + } + + addCpuQuota(cm, &properties, r.CpuQuota, r.CpuPeriod) + + if r.BlkioWeight != 0 { + properties = append(properties, + newProp("BlockIOWeight", uint64(r.BlkioWeight))) + } + + if r.PidsLimit > 0 || r.PidsLimit == -1 { + properties = append(properties, + newProp("TasksMax", uint64(r.PidsLimit))) + } + + err = addCpuset(cm, &properties, r.CpusetCpus, r.CpusetMems) + if err != nil { + return nil, err + } + + return properties, nil +} + +// initPaths figures out and returns paths to cgroups. 
+func initPaths(c *cgroups.Cgroup) (map[string]string, error) { + slice := "system.slice" + if c.Parent != "" { + var err error + slice, err = ExpandSlice(c.Parent) + if err != nil { + return nil, err + } + } + + unit := getUnitName(c) + + paths := make(map[string]string) + for _, s := range legacySubsystems { + subsystemPath, err := getSubsystemPath(slice, unit, s.Name()) + if err != nil { + // Even if it's `not found` error, we'll return err + // because devices cgroup is hard requirement for + // container security. + if s.Name() == "devices" { + return nil, err + } + // Don't fail if a cgroup hierarchy was not found, just skip this subsystem + if cgroups.IsNotFound(err) { + continue + } + return nil, err + } + paths[s.Name()] = subsystemPath + } + + // If systemd is using cgroups-hybrid mode then add the slice path of + // this container to the paths so the following process executed with + // "runc exec" joins that cgroup as well. + if cgroups.IsCgroup2HybridMode() { + // "" means cgroup-hybrid path + cgroupsHybridPath, err := getSubsystemPath(slice, unit, "") + if err != nil && cgroups.IsNotFound(err) { + return nil, err + } + paths[""] = cgroupsHybridPath + } + + return paths, nil +} + +func (m *LegacyManager) Apply(pid int) error { + var ( + c = m.cgroups + unitName = getUnitName(c) + slice = "system.slice" + properties []systemdDbus.Property + ) + + m.mu.Lock() + defer m.mu.Unlock() + + if c.Parent != "" { + slice = c.Parent + } + + properties = append(properties, systemdDbus.PropDescription("libcontainer container "+c.Name)) + + if strings.HasSuffix(unitName, ".slice") { + // If we create a slice, the parent is defined via a Wants=. + properties = append(properties, systemdDbus.PropWants(slice)) + } else { + // Otherwise it's a scope, which we put into a Slice=. + properties = append(properties, systemdDbus.PropSlice(slice)) + // Assume scopes always support delegation (supported since systemd v218). 
+ properties = append(properties, newProp("Delegate", true)) + } + + // only add pid if its valid, -1 is used w/ general slice creation. + if pid != -1 { + properties = append(properties, newProp("PIDs", []uint32{uint32(pid)})) + } + + // Always enable accounting, this gets us the same behaviour as the fs implementation, + // plus the kernel has some problems with joining the memory cgroup at a later time. + properties = append(properties, + newProp("MemoryAccounting", true), + newProp("CPUAccounting", true), + newProp("BlockIOAccounting", true), + newProp("TasksAccounting", true), + ) + + // Assume DefaultDependencies= will always work (the check for it was previously broken.) + properties = append(properties, + newProp("DefaultDependencies", false)) + + properties = append(properties, c.SystemdProps...) + + if err := startUnit(m.dbus, unitName, properties, pid == -1); err != nil { + return err + } + + if err := m.joinCgroups(pid); err != nil { + return err + } + + return nil +} + +func (m *LegacyManager) Destroy() error { + m.mu.Lock() + defer m.mu.Unlock() + + stopErr := stopUnit(m.dbus, getUnitName(m.cgroups)) + + // Both on success and on error, cleanup all the cgroups + // we are aware of, as some of them were created directly + // by Apply() and are not managed by systemd. 
+ if err := cgroups.RemovePaths(m.paths); err != nil && stopErr == nil { + return err + } + + return stopErr +} + +func (m *LegacyManager) Path(subsys string) string { + m.mu.Lock() + defer m.mu.Unlock() + return m.paths[subsys] +} + +func (m *LegacyManager) joinCgroups(pid int) error { + for _, sys := range legacySubsystems { + name := sys.Name() + switch name { + case "name=systemd": + // let systemd handle this + case "cpuset": + if path, ok := m.paths[name]; ok { + s := &fs.CpusetGroup{} + if err := s.ApplyDir(path, m.cgroups.Resources, pid); err != nil { + return err + } + } + default: + if path, ok := m.paths[name]; ok { + if err := os.MkdirAll(path, 0o755); err != nil { + return err + } + if err := cgroups.WriteCgroupProc(path, pid); err != nil { + return err + } + } + } + } + + return nil +} + +func getSubsystemPath(slice, unit, subsystem string) (string, error) { + mountpoint, err := cgroups.FindCgroupMountpoint("", subsystem) + if err != nil { + return "", err + } + + return filepath.Join(mountpoint, slice, unit), nil +} + +func (m *LegacyManager) Freeze(state cgroups.FreezerState) error { + err := m.doFreeze(state) + if err == nil { + m.cgroups.Resources.Freezer = state + } + return err +} + +// doFreeze is the same as Freeze but without +// changing the m.cgroups.Resources.Frozen field. 
+func (m *LegacyManager) doFreeze(state cgroups.FreezerState) error { + path, ok := m.paths["freezer"] + if !ok { + return errSubsystemDoesNotExist + } + freezer := &fs.FreezerGroup{} + resources := &cgroups.Resources{Freezer: state} + return freezer.Set(path, resources) +} + +func (m *LegacyManager) GetPids() ([]int, error) { + path, ok := m.paths["devices"] + if !ok { + return nil, errSubsystemDoesNotExist + } + return cgroups.GetPids(path) +} + +func (m *LegacyManager) GetAllPids() ([]int, error) { + path, ok := m.paths["devices"] + if !ok { + return nil, errSubsystemDoesNotExist + } + return cgroups.GetAllPids(path) +} + +func (m *LegacyManager) GetStats() (*cgroups.Stats, error) { + m.mu.Lock() + defer m.mu.Unlock() + stats := cgroups.NewStats() + for _, sys := range legacySubsystems { + path := m.paths[sys.Name()] + if path == "" { + continue + } + if err := sys.GetStats(path, stats); err != nil { + return nil, err + } + } + + return stats, nil +} + +func (m *LegacyManager) Set(r *cgroups.Resources) error { + if r == nil { + return nil + } + if r.Unified != nil { + return cgroups.ErrV1NoUnified + } + properties, err := genV1ResourcesProperties(r, m.dbus) + if err != nil { + return err + } + + unitName := getUnitName(m.cgroups) + needsFreeze, needsThaw, err := m.freezeBeforeSet(unitName, r) + if err != nil { + return err + } + + if needsFreeze { + if err := m.doFreeze(cgroups.Frozen); err != nil { + // If freezer cgroup isn't supported, we just warn about it. + logrus.Infof("freeze container before SetUnitProperties failed: %v", err) + // skip update the cgroup while frozen failed. #3803 + if !errors.Is(err, errSubsystemDoesNotExist) { + if needsThaw { + if thawErr := m.doFreeze(cgroups.Thawed); thawErr != nil { + logrus.Infof("thaw container after doFreeze failed: %v", thawErr) + } + } + return err + } + } + } + setErr := setUnitProperties(m.dbus, unitName, properties...) 
+ if needsThaw { + if err := m.doFreeze(cgroups.Thawed); err != nil { + logrus.Infof("thaw container after SetUnitProperties failed: %v", err) + } + } + if setErr != nil { + return setErr + } + + for _, sys := range legacySubsystems { + // Get the subsystem path, but don't error out for not found cgroups. + path, ok := m.paths[sys.Name()] + if !ok { + continue + } + if err := sys.Set(path, r); err != nil { + return err + } + } + + return nil +} + +func (m *LegacyManager) GetPaths() map[string]string { + m.mu.Lock() + defer m.mu.Unlock() + return m.paths +} + +func (m *LegacyManager) GetCgroups() (*cgroups.Cgroup, error) { + return m.cgroups, nil +} + +func (m *LegacyManager) GetFreezerState() (cgroups.FreezerState, error) { + path, ok := m.paths["freezer"] + if !ok { + return cgroups.Undefined, nil + } + freezer := &fs.FreezerGroup{} + return freezer.GetState(path) +} + +func (m *LegacyManager) Exists() bool { + return cgroups.PathExists(m.Path("devices")) +} + +func (m *LegacyManager) OOMKillCount() (uint64, error) { + return fs.OOMKillCount(m.Path("memory")) +} diff --git a/systemd/v2.go b/systemd/v2.go new file mode 100644 index 0000000..42a6e35 --- /dev/null +++ b/systemd/v2.go @@ -0,0 +1,515 @@ +package systemd + +import ( + "bufio" + "errors" + "fmt" + "math" + "os" + "path/filepath" + "strconv" + "strings" + "sync" + + systemdDbus "github.com/coreos/go-systemd/v22/dbus" + securejoin "github.com/cyphar/filepath-securejoin" + "github.com/sirupsen/logrus" + + "github.com/opencontainers/cgroups" + "github.com/opencontainers/cgroups/fs2" +) + +const ( + cpuIdleSupportedVersion = 252 +) + +type UnifiedManager struct { + mu sync.Mutex + cgroups *cgroups.Cgroup + // path is like "/sys/fs/cgroup/user.slice/user-1001.slice/session-1.scope" + path string + dbus *dbusConnManager + fsMgr cgroups.Manager +} + +func NewUnifiedManager(config *cgroups.Cgroup, path string) (*UnifiedManager, error) { + m := &UnifiedManager{ + cgroups: config, + path: path, + dbus: 
newDbusConnManager(config.Rootless), + } + if err := m.initPath(); err != nil { + return nil, err + } + + fsMgr, err := fs2.NewManager(config, m.path) + if err != nil { + return nil, err + } + m.fsMgr = fsMgr + + return m, nil +} + +func shouldSetCPUIdle(cm *dbusConnManager, v string) bool { + // The only valid values for cpu.idle are 0 and 1. As it is + // not possible to directly set cpu.idle to 0 via systemd, + // ignore 0. Ignore other values as we'll error out later + // in Set() while calling fsMgr.Set(). + return v == "1" && systemdVersion(cm) >= cpuIdleSupportedVersion +} + +// unifiedResToSystemdProps tries to convert from Cgroup.Resources.Unified +// key/value map (where key is cgroupfs file name) to systemd unit properties. +// This is on a best-effort basis, so the properties that are not known +// (to this function and/or systemd) are ignored (but logged with "debug" +// log level). +// +// For the list of keys, see https://www.kernel.org/doc/Documentation/cgroup-v2.txt +// +// For the list of systemd unit properties, see systemd.resource-control(5). +func unifiedResToSystemdProps(cm *dbusConnManager, res map[string]string) (props []systemdDbus.Property, _ error) { + var err error + + for k, v := range res { + if strings.Contains(k, "/") { + return nil, fmt.Errorf("unified resource %q must be a file name (no slashes)", k) + } + if strings.IndexByte(k, '.') <= 0 { + return nil, fmt.Errorf("unified resource %q must be in the form CONTROLLER.PARAMETER", k) + } + // Kernel is quite forgiving to extra whitespace + // around the value, and so should we. + v = strings.TrimSpace(v) + // Please keep cases in alphabetical order. + switch k { + case "cpu.idle": + if shouldSetCPUIdle(cm, v) { + // Setting CPUWeight to 0 tells systemd + // to set cpu.idle to 1. 
+ props = append(props, + newProp("CPUWeight", uint64(0))) + } + + case "cpu.max": + // value: quota [period] + quota := int64(0) // 0 means "unlimited" for addCpuQuota, if period is set + period := defCPUQuotaPeriod + sv := strings.Fields(v) + if len(sv) < 1 || len(sv) > 2 { + return nil, fmt.Errorf("unified resource %q value invalid: %q", k, v) + } + // quota + if sv[0] != "max" { + quota, err = strconv.ParseInt(sv[0], 10, 64) + if err != nil { + return nil, fmt.Errorf("unified resource %q period value conversion error: %w", k, err) + } + } + // period + if len(sv) == 2 { + period, err = strconv.ParseUint(sv[1], 10, 64) + if err != nil { + return nil, fmt.Errorf("unified resource %q quota value conversion error: %w", k, err) + } + } + addCpuQuota(cm, &props, quota, period) + + case "cpu.weight": + if shouldSetCPUIdle(cm, strings.TrimSpace(res["cpu.idle"])) { + // Do not add duplicate CPUWeight property + // (see case "cpu.idle" above). + logrus.Warn("unable to apply both cpu.weight and cpu.idle to systemd, ignoring cpu.weight") + continue + } + num, err := strconv.ParseUint(v, 10, 64) + if err != nil { + return nil, fmt.Errorf("unified resource %q value conversion error: %w", k, err) + } + props = append(props, + newProp("CPUWeight", num)) + + case "cpuset.cpus", "cpuset.mems": + bits, err := RangeToBits(v) + if err != nil { + return nil, fmt.Errorf("unified resource %q=%q conversion error: %w", k, v, err) + } + m := map[string]string{ + "cpuset.cpus": "AllowedCPUs", + "cpuset.mems": "AllowedMemoryNodes", + } + // systemd only supports these properties since v244 + sdVer := systemdVersion(cm) + if sdVer >= 244 { + props = append(props, + newProp(m[k], bits)) + } else { + logrus.Debugf("systemd v%d is too old to support %s"+ + " (setting will still be applied to cgroupfs)", + sdVer, m[k]) + } + + case "memory.high", "memory.low", "memory.min", "memory.max", "memory.swap.max": + num := uint64(math.MaxUint64) + if v != "max" { + num, err = strconv.ParseUint(v, 10, 
64) + if err != nil { + return nil, fmt.Errorf("unified resource %q value conversion error: %w", k, err) + } + } + m := map[string]string{ + "memory.high": "MemoryHigh", + "memory.low": "MemoryLow", + "memory.min": "MemoryMin", + "memory.max": "MemoryMax", + "memory.swap.max": "MemorySwapMax", + } + props = append(props, + newProp(m[k], num)) + + case "pids.max": + num := uint64(math.MaxUint64) + if v != "max" { + var err error + num, err = strconv.ParseUint(v, 10, 64) + if err != nil { + return nil, fmt.Errorf("unified resource %q value conversion error: %w", k, err) + } + } + props = append(props, + newProp("TasksMax", num)) + + case "memory.oom.group": + // Setting this to 1 is roughly equivalent to OOMPolicy=kill + // (as per systemd.service(5) and + // https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html), + // but it's not clear what to do if it is unset or set + // to 0 in runc update, as there are two other possible + // values for OOMPolicy (continue/stop). + fallthrough + + default: + // Ignore the unknown resource here -- will still be + // applied in Set which calls fs2.Set. + logrus.Debugf("don't know how to convert unified resource %q=%q to systemd unit property; skipping (will still be applied to cgroupfs)", k, v) + } + } + + return props, nil +} + +func genV2ResourcesProperties(dirPath string, r *cgroups.Resources, cm *dbusConnManager) ([]systemdDbus.Property, error) { + // We need this check before setting systemd properties, otherwise + // the container is OOM-killed and the systemd unit is removed + // before we get to fsMgr.Set(). + if err := fs2.CheckMemoryUsage(dirPath, r); err != nil { + return nil, err + } + + var properties []systemdDbus.Property + + // NOTE: This is of questionable correctness because we insert our own + // devices eBPF program later. Two programs with identical rules + // aren't the end of the world, but it is a bit concerning. 
However + // it's unclear if systemd removes all eBPF programs attached when + // doing SetUnitProperties... + deviceProperties, err := generateDeviceProperties(r, cm) + if err != nil { + return nil, err + } + properties = append(properties, deviceProperties...) + + if r.Memory != 0 { + properties = append(properties, + newProp("MemoryMax", uint64(r.Memory))) + } + if r.MemoryReservation != 0 { + properties = append(properties, + newProp("MemoryLow", uint64(r.MemoryReservation))) + } + + swap, err := cgroups.ConvertMemorySwapToCgroupV2Value(r.MemorySwap, r.Memory) + if err != nil { + return nil, err + } + if swap != 0 { + properties = append(properties, + newProp("MemorySwapMax", uint64(swap))) + } + + idleSet := false + // The logic here is the same as in shouldSetCPUIdle. + if r.CPUIdle != nil && *r.CPUIdle == 1 && systemdVersion(cm) >= cpuIdleSupportedVersion { + properties = append(properties, + newProp("CPUWeight", uint64(0))) + idleSet = true + } + if r.CpuWeight != 0 { + if idleSet { + // Ignore CpuWeight if CPUIdle is already set. + logrus.Warn("unable to apply both CPUWeight and CpuIdle to systemd, ignoring CPUWeight") + } else { + properties = append(properties, + newProp("CPUWeight", r.CpuWeight)) + } + } + + addCpuQuota(cm, &properties, r.CpuQuota, r.CpuPeriod) + + if r.PidsLimit > 0 || r.PidsLimit == -1 { + properties = append(properties, + newProp("TasksMax", uint64(r.PidsLimit))) + } + + err = addCpuset(cm, &properties, r.CpusetCpus, r.CpusetMems) + if err != nil { + return nil, err + } + + // ignore r.KernelMemory + + // convert Resources.Unified map to systemd properties + if r.Unified != nil { + unifiedProps, err := unifiedResToSystemdProps(cm, r.Unified) + if err != nil { + return nil, err + } + properties = append(properties, unifiedProps...) 
+ } + + return properties, nil +} + +func (m *UnifiedManager) Apply(pid int) error { + var ( + c = m.cgroups + unitName = getUnitName(c) + properties []systemdDbus.Property + ) + + slice := "system.slice" + if m.cgroups.Rootless { + slice = "user.slice" + } + if c.Parent != "" { + slice = c.Parent + } + + properties = append(properties, systemdDbus.PropDescription("libcontainer container "+c.Name)) + + if strings.HasSuffix(unitName, ".slice") { + // If we create a slice, the parent is defined via a Wants=. + properties = append(properties, systemdDbus.PropWants(slice)) + } else { + // Otherwise it's a scope, which we put into a Slice=. + properties = append(properties, systemdDbus.PropSlice(slice)) + // Assume scopes always support delegation (supported since systemd v218). + properties = append(properties, newProp("Delegate", true)) + } + + // only add pid if its valid, -1 is used w/ general slice creation. + if pid != -1 { + properties = append(properties, newProp("PIDs", []uint32{uint32(pid)})) + } + + // Always enable accounting, this gets us the same behaviour as the fs implementation, + // plus the kernel has some problems with joining the memory cgroup at a later time. + properties = append(properties, + newProp("MemoryAccounting", true), + newProp("CPUAccounting", true), + newProp("IOAccounting", true), + newProp("TasksAccounting", true), + ) + + // Assume DefaultDependencies= will always work (the check for it was previously broken.) + properties = append(properties, + newProp("DefaultDependencies", false)) + + properties = append(properties, c.SystemdProps...) + + if err := startUnit(m.dbus, unitName, properties, pid == -1); err != nil { + return fmt.Errorf("unable to start unit %q (properties %+v): %w", unitName, properties, err) + } + + if err := fs2.CreateCgroupPath(m.path, m.cgroups); err != nil { + return err + } + + if c.OwnerUID != nil { + // The directory itself must be chowned. 
+ err := os.Chown(m.path, *c.OwnerUID, -1) + if err != nil { + return err + } + + filesToChown, err := cgroupFilesToChown() + if err != nil { + return err + } + + for _, v := range filesToChown { + err := os.Chown(m.path+"/"+v, *c.OwnerUID, -1) + // Some files might not be present. + if err != nil && !errors.Is(err, os.ErrNotExist) { + return err + } + } + } + + return nil +} + +// The kernel exposes a list of files that should be chowned to the delegate +// uid in /sys/kernel/cgroup/delegate. If the file is not present +// (Linux < 4.15), use the initial values mentioned in cgroups(7). +func cgroupFilesToChown() ([]string, error) { + const cgroupDelegateFile = "/sys/kernel/cgroup/delegate" + + f, err := os.Open(cgroupDelegateFile) + if err != nil { + return []string{"cgroup.procs", "cgroup.subtree_control", "cgroup.threads"}, nil + } + defer f.Close() + + filesToChown := []string{} + scanner := bufio.NewScanner(f) + for scanner.Scan() { + filesToChown = append(filesToChown, scanner.Text()) + } + if err := scanner.Err(); err != nil { + return nil, fmt.Errorf("error reading %s: %w", cgroupDelegateFile, err) + } + + return filesToChown, nil +} + +func (m *UnifiedManager) Destroy() error { + m.mu.Lock() + defer m.mu.Unlock() + + unitName := getUnitName(m.cgroups) + if err := stopUnit(m.dbus, unitName); err != nil { + return err + } + + // systemd 239 do not remove sub-cgroups. + err := m.fsMgr.Destroy() + // fsMgr.Destroy has handled ErrNotExist + if err != nil { + return err + } + + return nil +} + +func (m *UnifiedManager) Path(_ string) string { + return m.path +} + +// getSliceFull value is used in initPath. +// The value is incompatible with systemdDbus.PropSlice. 
+func (m *UnifiedManager) getSliceFull() (string, error) { + c := m.cgroups + slice := "system.slice" + if c.Rootless { + slice = "user.slice" + } + if c.Parent != "" { + var err error + slice, err = ExpandSlice(c.Parent) + if err != nil { + return "", err + } + } + + if c.Rootless { + // managerCG is typically "/user.slice/user-${uid}.slice/user@${uid}.service". + managerCG, err := getManagerProperty(m.dbus, "ControlGroup") + if err != nil { + return "", err + } + slice = filepath.Join(managerCG, slice) + } + + // an example of the final slice in rootless: "/user.slice/user-1001.slice/user@1001.service/user.slice" + // NOTE: systemdDbus.PropSlice requires the "/user.slice/user-1001.slice/user@1001.service/" prefix NOT to be specified. + return slice, nil +} + +func (m *UnifiedManager) initPath() error { + if m.path != "" { + return nil + } + + sliceFull, err := m.getSliceFull() + if err != nil { + return err + } + + c := m.cgroups + path := filepath.Join(sliceFull, getUnitName(c)) + path, err = securejoin.SecureJoin(fs2.UnifiedMountpoint, path) + if err != nil { + return err + } + + // an example of the final path in rootless: + // "/sys/fs/cgroup/user.slice/user-1001.slice/user@1001.service/user.slice/libpod-132ff0d72245e6f13a3bbc6cdc5376886897b60ac59eaa8dea1df7ab959cbf1c.scope" + m.path = path + + return nil +} + +func (m *UnifiedManager) Freeze(state cgroups.FreezerState) error { + return m.fsMgr.Freeze(state) +} + +func (m *UnifiedManager) GetPids() ([]int, error) { + return cgroups.GetPids(m.path) +} + +func (m *UnifiedManager) GetAllPids() ([]int, error) { + return cgroups.GetAllPids(m.path) +} + +func (m *UnifiedManager) GetStats() (*cgroups.Stats, error) { + return m.fsMgr.GetStats() +} + +func (m *UnifiedManager) Set(r *cgroups.Resources) error { + if r == nil { + return nil + } + properties, err := genV2ResourcesProperties(m.fsMgr.Path(""), r, m.dbus) + if err != nil { + return err + } + + if err := setUnitProperties(m.dbus, getUnitName(m.cgroups), 
properties...); err != nil { + return fmt.Errorf("unable to set unit properties: %w", err) + } + + return m.fsMgr.Set(r) +} + +func (m *UnifiedManager) GetPaths() map[string]string { + paths := make(map[string]string, 1) + paths[""] = m.path + return paths +} + +func (m *UnifiedManager) GetCgroups() (*cgroups.Cgroup, error) { + return m.cgroups, nil +} + +func (m *UnifiedManager) GetFreezerState() (cgroups.FreezerState, error) { + return m.fsMgr.GetFreezerState() +} + +func (m *UnifiedManager) Exists() bool { + return cgroups.PathExists(m.path) +} + +func (m *UnifiedManager) OOMKillCount() (uint64, error) { + return m.fsMgr.OOMKillCount() +} diff --git a/utils.go b/utils.go new file mode 100644 index 0000000..9ef24b1 --- /dev/null +++ b/utils.go @@ -0,0 +1,468 @@ +package cgroups + +import ( + "bufio" + "errors" + "fmt" + "io" + "os" + "path/filepath" + "strconv" + "strings" + "sync" + "time" + + "github.com/moby/sys/userns" + "github.com/sirupsen/logrus" + "golang.org/x/sys/unix" +) + +const ( + CgroupProcesses = "cgroup.procs" + unifiedMountpoint = "/sys/fs/cgroup" + hybridMountpoint = "/sys/fs/cgroup/unified" +) + +var ( + isUnifiedOnce sync.Once + isUnified bool + isHybridOnce sync.Once + isHybrid bool +) + +// IsCgroup2UnifiedMode returns whether we are running in cgroup v2 unified mode. +func IsCgroup2UnifiedMode() bool { + isUnifiedOnce.Do(func() { + var st unix.Statfs_t + err := unix.Statfs(unifiedMountpoint, &st) + if err != nil { + level := logrus.WarnLevel + if os.IsNotExist(err) && userns.RunningInUserNS() { + // For rootless containers, sweep it under the rug. + level = logrus.DebugLevel + } + logrus.StandardLogger().Logf(level, + "statfs %s: %v; assuming cgroup v1", unifiedMountpoint, err) + } + isUnified = st.Type == unix.CGROUP2_SUPER_MAGIC + }) + return isUnified +} + +// IsCgroup2HybridMode returns whether we are running in cgroup v2 hybrid mode. 
+func IsCgroup2HybridMode() bool { + isHybridOnce.Do(func() { + var st unix.Statfs_t + err := unix.Statfs(hybridMountpoint, &st) + if err != nil { + isHybrid = false + if !os.IsNotExist(err) { + // Report unexpected errors. + logrus.WithError(err).Debugf("statfs(%q) failed", hybridMountpoint) + } + return + } + isHybrid = st.Type == unix.CGROUP2_SUPER_MAGIC + }) + return isHybrid +} + +type Mount struct { + Mountpoint string + Root string + Subsystems []string +} + +// GetCgroupMounts returns the mounts for the cgroup subsystems. +// all indicates whether to return just the first instance or all the mounts. +// This function should not be used from cgroupv2 code, as in this case +// all the controllers are available under the constant unifiedMountpoint. +func GetCgroupMounts(all bool) ([]Mount, error) { + if IsCgroup2UnifiedMode() { + // TODO: remove cgroupv2 case once all external users are converted + availableControllers, err := GetAllSubsystems() + if err != nil { + return nil, err + } + m := Mount{ + Mountpoint: unifiedMountpoint, + Root: unifiedMountpoint, + Subsystems: availableControllers, + } + return []Mount{m}, nil + } + + return getCgroupMountsV1(all) +} + +// GetAllSubsystems returns all the cgroup subsystems supported by the kernel +func GetAllSubsystems() ([]string, error) { + // /proc/cgroups is meaningless for v2 + // https://github.com/torvalds/linux/blob/v5.3/Documentation/admin-guide/cgroup-v2.rst#deprecated-v1-core-features + if IsCgroup2UnifiedMode() { + // "pseudo" controllers do not appear in /sys/fs/cgroup/cgroup.controllers. + // - devices: implemented in kernel 4.15 + // - freezer: implemented in kernel 5.2 + // We assume these are always available, as it is hard to detect availability. + pseudo := []string{"devices", "freezer"} + data, err := ReadFile("/sys/fs/cgroup", "cgroup.controllers") + if err != nil { + return nil, err + } + subsystems := append(pseudo, strings.Fields(data)...) 
+ return subsystems, nil + } + f, err := os.Open("/proc/cgroups") + if err != nil { + return nil, err + } + defer f.Close() + + subsystems := []string{} + + s := bufio.NewScanner(f) + for s.Scan() { + text := s.Text() + if text[0] != '#' { + parts := strings.Fields(text) + if len(parts) >= 4 && parts[3] != "0" { + subsystems = append(subsystems, parts[0]) + } + } + } + if err := s.Err(); err != nil { + return nil, err + } + return subsystems, nil +} + +func readProcsFile(dir string) (out []int, _ error) { + file := CgroupProcesses + retry := true + +again: + f, err := OpenFile(dir, file, os.O_RDONLY) + if err != nil { + return nil, err + } + defer f.Close() + + s := bufio.NewScanner(f) + for s.Scan() { + if t := s.Text(); t != "" { + pid, err := strconv.Atoi(t) + if err != nil { + return nil, err + } + out = append(out, pid) + } + } + if errors.Is(s.Err(), unix.ENOTSUP) && retry { + // For a threaded cgroup, read returns ENOTSUP, and we should + // read from cgroup.threads instead. + file = "cgroup.threads" + retry = false + goto again + } + return out, s.Err() +} + +// ParseCgroupFile parses the given cgroup file, typically /proc/self/cgroup +// or /proc//cgroup, into a map of subsystems to cgroup paths, e.g. +// +// "cpu": "/user.slice/user-1000.slice" +// "pids": "/user.slice/user-1000.slice" +// +// etc. +// +// Note that for cgroup v2 unified hierarchy, there are no per-controller +// cgroup paths, so the resulting map will have a single element where the key +// is empty string ("") and the value is the cgroup path the is in. 
+func ParseCgroupFile(path string) (map[string]string, error) { + f, err := os.Open(path) + if err != nil { + return nil, err + } + defer f.Close() + + return parseCgroupFromReader(f) +} + +// helper function for ParseCgroupFile to make testing easier +func parseCgroupFromReader(r io.Reader) (map[string]string, error) { + s := bufio.NewScanner(r) + cgroups := make(map[string]string) + + for s.Scan() { + text := s.Text() + // from cgroups(7): + // /proc/[pid]/cgroup + // ... + // For each cgroup hierarchy ... there is one entry + // containing three colon-separated fields of the form: + // hierarchy-ID:subsystem-list:cgroup-path + parts := strings.SplitN(text, ":", 3) + if len(parts) < 3 { + return nil, fmt.Errorf("invalid cgroup entry: must contain at least two colons: %v", text) + } + + for _, subs := range strings.Split(parts[1], ",") { + cgroups[subs] = parts[2] + } + } + if err := s.Err(); err != nil { + return nil, err + } + + return cgroups, nil +} + +func PathExists(path string) bool { + if _, err := os.Stat(path); err != nil { + return false + } + return true +} + +// rmdir tries to remove a directory, optionally retrying on EBUSY. +func rmdir(path string, retry bool) error { + delay := time.Millisecond + tries := 10 + +again: + err := unix.Rmdir(path) + switch err { // nolint:errorlint // unix errors are bare + case nil, unix.ENOENT: + return nil + case unix.EINTR: + goto again + case unix.EBUSY: + if retry && tries > 0 { + time.Sleep(delay) + delay *= 2 + tries-- + goto again + + } + } + return &os.PathError{Op: "rmdir", Path: path, Err: err} +} + +// RemovePath aims to remove cgroup path. It does so recursively, +// by removing any subdirectories (sub-cgroups) first. +func RemovePath(path string) error { + // Try the fast path first; don't retry on EBUSY yet. + if err := rmdir(path, false); err == nil { + return nil + } + + // There are many reasons why rmdir can fail, including: + // 1. cgroup have existing sub-cgroups; + // 2. 
cgroup (still) have some processes (that are about to vanish); + // 3. lack of permission (one example is read-only /sys/fs/cgroup mount, + // in which case rmdir returns EROFS even for for a non-existent path, + // see issue 4518). + // + // Using os.ReadDir here kills two birds with one stone: check if + // the directory exists (handling scenario 3 above), and use + // directory contents to remove sub-cgroups (handling scenario 1). + infos, err := os.ReadDir(path) + if err != nil { + if os.IsNotExist(err) { + return nil + } + return err + } + // Let's remove sub-cgroups, if any. + for _, info := range infos { + if info.IsDir() { + if err = RemovePath(filepath.Join(path, info.Name())); err != nil { + return err + } + } + } + // Finally, try rmdir again, this time with retries on EBUSY, + // which may help with scenario 2 above. + return rmdir(path, true) +} + +// RemovePaths iterates over the provided paths removing them. +func RemovePaths(paths map[string]string) (err error) { + for s, p := range paths { + if err := RemovePath(p); err == nil { + delete(paths, s) + } + } + if len(paths) == 0 { + clear(paths) + return nil + } + return fmt.Errorf("Failed to remove paths: %v", paths) +} + +var ( + hugePageSizes []string + initHPSOnce sync.Once +) + +func HugePageSizes() []string { + initHPSOnce.Do(func() { + dir, err := os.OpenFile("/sys/kernel/mm/hugepages", unix.O_DIRECTORY|unix.O_RDONLY, 0) + if err != nil { + return + } + files, err := dir.Readdirnames(0) + dir.Close() + if err != nil { + return + } + + hugePageSizes, err = getHugePageSizeFromFilenames(files) + if err != nil { + logrus.Warn("HugePageSizes: ", err) + } + }) + + return hugePageSizes +} + +func getHugePageSizeFromFilenames(fileNames []string) ([]string, error) { + pageSizes := make([]string, 0, len(fileNames)) + var warn error + + for _, file := range fileNames { + // example: hugepages-1048576kB + val, ok := strings.CutPrefix(file, "hugepages-") + if !ok { + // Unexpected file name: no prefix 
found, ignore it. + continue + } + // The suffix is always "kB" (as of Linux 5.13). If we find + // something else, produce an error but keep going. + eLen := len(val) - 2 + val = strings.TrimSuffix(val, "kB") + if len(val) != eLen { + // Highly unlikely. + if warn == nil { + warn = errors.New(file + `: invalid suffix (expected "kB")`) + } + continue + } + size, err := strconv.Atoi(val) + if err != nil { + // Highly unlikely. + if warn == nil { + warn = fmt.Errorf("%s: %w", file, err) + } + continue + } + // Model after https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/mm/hugetlb_cgroup.c?id=eff48ddeab782e35e58ccc8853f7386bbae9dec4#n574 + // but in our case the size is in KB already. + if size >= (1 << 20) { + val = strconv.Itoa(size>>20) + "GB" + } else if size >= (1 << 10) { + val = strconv.Itoa(size>>10) + "MB" + } else { + val += "KB" + } + pageSizes = append(pageSizes, val) + } + + return pageSizes, warn +} + +// GetPids returns all pids, that were added to cgroup at path. +func GetPids(dir string) ([]int, error) { + return readProcsFile(dir) +} + +// WriteCgroupProc writes the specified pid into the cgroup's cgroup.procs file +func WriteCgroupProc(dir string, pid int) error { + // Normally dir should not be empty, one case is that cgroup subsystem + // is not mounted, we will get empty dir, and we want it fail here. + if dir == "" { + return fmt.Errorf("no such directory for %s", CgroupProcesses) + } + + // Dont attach any pid to the cgroup if -1 is specified as a pid + if pid == -1 { + return nil + } + + file, err := OpenFile(dir, CgroupProcesses, os.O_WRONLY) + if err != nil { + return fmt.Errorf("failed to write %v: %w", pid, err) + } + defer file.Close() + + for i := 0; i < 5; i++ { + _, err = file.WriteString(strconv.Itoa(pid)) + if err == nil { + return nil + } + + // EINVAL might mean that the task being added to cgroup.procs is in state + // TASK_NEW. We should attempt to do so again. 
+ if errors.Is(err, unix.EINVAL) { + time.Sleep(30 * time.Millisecond) + continue + } + + return fmt.Errorf("failed to write %v: %w", pid, err) + } + return err +} + +// Since the OCI spec is designed for cgroup v1, in some cases +// there is need to convert from the cgroup v1 configuration to cgroup v2 +// the formula for cpuShares is y = (1 + ((x - 2) * 9999) / 262142) +// convert from [2-262144] to [1-10000] +// 262144 comes from Linux kernel definition "#define MAX_SHARES (1UL << 18)" +func ConvertCPUSharesToCgroupV2Value(cpuShares uint64) uint64 { + if cpuShares == 0 { + return 0 + } + return (1 + ((cpuShares-2)*9999)/262142) +} + +// ConvertMemorySwapToCgroupV2Value converts MemorySwap value from OCI spec +// for use by cgroup v2 drivers. A conversion is needed since Resources.MemorySwap +// is defined as memory+swap combined, while in cgroup v2 swap is a separate value, +// so we need to subtract memory from it where it makes sense. +func ConvertMemorySwapToCgroupV2Value(memorySwap, memory int64) (int64, error) { + switch { + case memory == -1 && memorySwap == 0: + // For compatibility with cgroup1 controller, set swap to unlimited in + // case the memory is set to unlimited and the swap is not explicitly set, + // treating the request as "set both memory and swap to unlimited". + return -1, nil + case memorySwap == -1, memorySwap == 0: + // Treat -1 ("max") and 0 ("unset") swap as is. + return memorySwap, nil + case memory == -1: + // Unlimited memory, so treat swap as is. + return memorySwap, nil + case memory == 0: + // Unset or unknown memory, can't calculate swap. + return 0, errors.New("unable to set swap limit without memory limit") + case memory < 0: + // Does not make sense to subtract a negative value. + return 0, fmt.Errorf("invalid memory value: %d", memory) + case memorySwap < memory: + // Sanity check. 
+ return 0, errors.New("memory+swap limit should be >= memory limit") + } + + return memorySwap - memory, nil +} + +// Since the OCI spec is designed for cgroup v1, in some cases +// there is need to convert from the cgroup v1 configuration to cgroup v2 +// the formula for BlkIOWeight to IOWeight is y = (1 + (x - 10) * 9999 / 990) +// convert linearly from [10-1000] to [1-10000] +func ConvertBlkIOToIOWeightValue(blkIoWeight uint16) uint64 { + if blkIoWeight == 0 { + return 0 + } + return 1 + (uint64(blkIoWeight)-10)*9999/990 +} diff --git a/utils_test.go b/utils_test.go new file mode 100644 index 0000000..58ac85a --- /dev/null +++ b/utils_test.go @@ -0,0 +1,691 @@ +package cgroups + +import ( + "bytes" + "errors" + "path/filepath" + "reflect" + "strings" + "testing" + + "github.com/moby/sys/mountinfo" + "golang.org/x/sys/unix" +) + +const fedoraMountinfo = `15 35 0:3 / /proc rw,nosuid,nodev,noexec,relatime shared:5 - proc proc rw +16 35 0:14 / /sys rw,nosuid,nodev,noexec,relatime shared:6 - sysfs sysfs rw,seclabel +17 35 0:5 / /dev rw,nosuid shared:2 - devtmpfs devtmpfs rw,seclabel,size=8056484k,nr_inodes=2014121,mode=755 +18 16 0:15 / /sys/kernel/security rw,nosuid,nodev,noexec,relatime shared:7 - securityfs securityfs rw +19 16 0:13 / /sys/fs/selinux rw,relatime shared:8 - selinuxfs selinuxfs rw +20 17 0:16 / /dev/shm rw,nosuid,nodev shared:3 - tmpfs tmpfs rw,seclabel +21 17 0:10 / /dev/pts rw,nosuid,noexec,relatime shared:4 - devpts devpts rw,seclabel,gid=5,mode=620,ptmxmode=000 +22 35 0:17 / /run rw,nosuid,nodev shared:21 - tmpfs tmpfs rw,seclabel,mode=755 +23 16 0:18 / /sys/fs/cgroup rw,nosuid,nodev,noexec shared:9 - tmpfs tmpfs rw,seclabel,mode=755 +24 23 0:19 / /sys/fs/cgroup/systemd rw,nosuid,nodev,noexec,relatime shared:10 - cgroup cgroup rw,xattr,release_agent=/usr/lib/systemd/systemd-cgroups-agent,name=systemd +25 16 0:20 / /sys/fs/pstore rw,nosuid,nodev,noexec,relatime shared:20 - pstore pstore rw +26 23 0:21 / /sys/fs/cgroup/cpuset 
rw,nosuid,nodev,noexec,relatime shared:11 - cgroup cgroup rw,cpuset,clone_children +27 23 0:22 / /sys/fs/cgroup/cpu,cpuacct rw,nosuid,nodev,noexec,relatime shared:12 - cgroup cgroup rw,cpuacct,cpu,clone_children +28 23 0:23 / /sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime shared:13 - cgroup cgroup rw,memory,clone_children +29 23 0:24 / /sys/fs/cgroup/devices rw,nosuid,nodev,noexec,relatime shared:14 - cgroup cgroup rw,devices,clone_children +30 23 0:25 / /sys/fs/cgroup/freezer rw,nosuid,nodev,noexec,relatime shared:15 - cgroup cgroup rw,freezer,clone_children +31 23 0:26 / /sys/fs/cgroup/net_cls rw,nosuid,nodev,noexec,relatime shared:16 - cgroup cgroup rw,net_cls,clone_children +32 23 0:27 / /sys/fs/cgroup/blkio rw,nosuid,nodev,noexec,relatime shared:17 - cgroup cgroup rw,blkio,clone_children +33 23 0:28 / /sys/fs/cgroup/perf_event rw,nosuid,nodev,noexec,relatime shared:18 - cgroup cgroup rw,perf_event,clone_children +34 23 0:29 / /sys/fs/cgroup/hugetlb rw,nosuid,nodev,noexec,relatime shared:19 - cgroup cgroup rw,hugetlb,clone_children +35 1 253:2 / / rw,relatime shared:1 - ext4 /dev/mapper/ssd-root--f20 rw,seclabel,data=ordered +36 15 0:30 / /proc/sys/fs/binfmt_misc rw,relatime shared:22 - autofs systemd-1 rw,fd=38,pgrp=1,timeout=300,minproto=5,maxproto=5,direct +37 17 0:12 / /dev/mqueue rw,relatime shared:23 - mqueue mqueue rw,seclabel +38 35 0:31 / /tmp rw shared:24 - tmpfs tmpfs rw,seclabel +39 17 0:32 / /dev/hugepages rw,relatime shared:25 - hugetlbfs hugetlbfs rw,seclabel +40 16 0:7 / /sys/kernel/debug rw,relatime shared:26 - debugfs debugfs rw +41 16 0:33 / /sys/kernel/config rw,relatime shared:27 - configfs configfs rw +42 35 0:34 / /var/lib/nfs/rpc_pipefs rw,relatime shared:28 - rpc_pipefs sunrpc rw +43 15 0:35 / /proc/fs/nfsd rw,relatime shared:29 - nfsd sunrpc rw +45 35 8:17 / /boot rw,relatime shared:30 - ext4 /dev/sdb1 rw,seclabel,data=ordered +46 35 253:4 / /home rw,relatime shared:31 - ext4 /dev/mapper/ssd-home rw,seclabel,data=ordered +47 35 
253:5 / /var/lib/libvirt/images rw,noatime,nodiratime shared:32 - ext4 /dev/mapper/ssd-virt rw,seclabel,discard,data=ordered +48 35 253:12 / /mnt/old rw,relatime shared:33 - ext4 /dev/mapper/HelpDeskRHEL6-FedoraRoot rw,seclabel,data=ordered +121 22 0:36 / /run/user/1000/gvfs rw,nosuid,nodev,relatime shared:104 - fuse.gvfsd-fuse gvfsd-fuse rw,user_id=1000,group_id=1000 +124 16 0:37 / /sys/fs/fuse/connections rw,relatime shared:107 - fusectl fusectl rw +165 38 253:3 / /tmp/mnt rw,relatime shared:147 - ext4 /dev/mapper/ssd-root rw,seclabel,data=ordered +167 35 253:15 / /var/lib/docker/devicemapper/mnt/aae4076022f0e2b80a2afbf8fc6df450c52080191fcef7fb679a73e6f073e5c2 rw,relatime shared:149 - ext4 /dev/mapper/docker-253:2-425882-aae4076022f0e2b80a2afbf8fc6df450c52080191fcef7fb679a73e6f073e5c2 rw,seclabel,discard,stripe=16,data=ordered +171 35 253:16 / /var/lib/docker/devicemapper/mnt/c71be651f114db95180e472f7871b74fa597ee70a58ccc35cb87139ddea15373 rw,relatime shared:153 - ext4 /dev/mapper/docker-253:2-425882-c71be651f114db95180e472f7871b74fa597ee70a58ccc35cb87139ddea15373 rw,seclabel,discard,stripe=16,data=ordered +175 35 253:17 / /var/lib/docker/devicemapper/mnt/1bac6ab72862d2d5626560df6197cf12036b82e258c53d981fa29adce6f06c3c rw,relatime shared:157 - ext4 /dev/mapper/docker-253:2-425882-1bac6ab72862d2d5626560df6197cf12036b82e258c53d981fa29adce6f06c3c rw,seclabel,discard,stripe=16,data=ordered +179 35 253:18 / /var/lib/docker/devicemapper/mnt/d710a357d77158e80d5b2c55710ae07c94e76d34d21ee7bae65ce5418f739b09 rw,relatime shared:161 - ext4 /dev/mapper/docker-253:2-425882-d710a357d77158e80d5b2c55710ae07c94e76d34d21ee7bae65ce5418f739b09 rw,seclabel,discard,stripe=16,data=ordered +183 35 253:19 / /var/lib/docker/devicemapper/mnt/6479f52366114d5f518db6837254baab48fab39f2ac38d5099250e9a6ceae6c7 rw,relatime shared:165 - ext4 /dev/mapper/docker-253:2-425882-6479f52366114d5f518db6837254baab48fab39f2ac38d5099250e9a6ceae6c7 rw,seclabel,discard,stripe=16,data=ordered +187 35 253:20 / 
/var/lib/docker/devicemapper/mnt/8d9df91c4cca5aef49eeb2725292aab324646f723a7feab56be34c2ad08268e1 rw,relatime shared:169 - ext4 /dev/mapper/docker-253:2-425882-8d9df91c4cca5aef49eeb2725292aab324646f723a7feab56be34c2ad08268e1 rw,seclabel,discard,stripe=16,data=ordered +191 35 253:21 / /var/lib/docker/devicemapper/mnt/c8240b768603d32e920d365dc9d1dc2a6af46cd23e7ae819947f969e1b4ec661 rw,relatime shared:173 - ext4 /dev/mapper/docker-253:2-425882-c8240b768603d32e920d365dc9d1dc2a6af46cd23e7ae819947f969e1b4ec661 rw,seclabel,discard,stripe=16,data=ordered +195 35 253:22 / /var/lib/docker/devicemapper/mnt/2eb3a01278380bbf3ed12d86ac629eaa70a4351301ee307a5cabe7b5f3b1615f rw,relatime shared:177 - ext4 /dev/mapper/docker-253:2-425882-2eb3a01278380bbf3ed12d86ac629eaa70a4351301ee307a5cabe7b5f3b1615f rw,seclabel,discard,stripe=16,data=ordered +199 35 253:23 / /var/lib/docker/devicemapper/mnt/37a17fb7c9d9b80821235d5f2662879bd3483915f245f9b49cdaa0e38779b70b rw,relatime shared:181 - ext4 /dev/mapper/docker-253:2-425882-37a17fb7c9d9b80821235d5f2662879bd3483915f245f9b49cdaa0e38779b70b rw,seclabel,discard,stripe=16,data=ordered +203 35 253:24 / /var/lib/docker/devicemapper/mnt/aea459ae930bf1de913e2f29428fd80ee678a1e962d4080019d9f9774331ee2b rw,relatime shared:185 - ext4 /dev/mapper/docker-253:2-425882-aea459ae930bf1de913e2f29428fd80ee678a1e962d4080019d9f9774331ee2b rw,seclabel,discard,stripe=16,data=ordered +207 35 253:25 / /var/lib/docker/devicemapper/mnt/928ead0bc06c454bd9f269e8585aeae0a6bd697f46dc8754c2a91309bc810882 rw,relatime shared:189 - ext4 /dev/mapper/docker-253:2-425882-928ead0bc06c454bd9f269e8585aeae0a6bd697f46dc8754c2a91309bc810882 rw,seclabel,discard,stripe=16,data=ordered +211 35 253:26 / /var/lib/docker/devicemapper/mnt/0f284d18481d671644706e7a7244cbcf63d590d634cc882cb8721821929d0420 rw,relatime shared:193 - ext4 /dev/mapper/docker-253:2-425882-0f284d18481d671644706e7a7244cbcf63d590d634cc882cb8721821929d0420 rw,seclabel,discard,stripe=16,data=ordered +215 35 253:27 / 
/var/lib/docker/devicemapper/mnt/d9dd16722ab34c38db2733e23f69e8f4803ce59658250dd63e98adff95d04919 rw,relatime shared:197 - ext4 /dev/mapper/docker-253:2-425882-d9dd16722ab34c38db2733e23f69e8f4803ce59658250dd63e98adff95d04919 rw,seclabel,discard,stripe=16,data=ordered +219 35 253:28 / /var/lib/docker/devicemapper/mnt/bc4500479f18c2c08c21ad5282e5f826a016a386177d9874c2764751c031d634 rw,relatime shared:201 - ext4 /dev/mapper/docker-253:2-425882-bc4500479f18c2c08c21ad5282e5f826a016a386177d9874c2764751c031d634 rw,seclabel,discard,stripe=16,data=ordered +223 35 253:29 / /var/lib/docker/devicemapper/mnt/7770c8b24eb3d5cc159a065910076938910d307ab2f5d94e1dc3b24c06ee2c8a rw,relatime shared:205 - ext4 /dev/mapper/docker-253:2-425882-7770c8b24eb3d5cc159a065910076938910d307ab2f5d94e1dc3b24c06ee2c8a rw,seclabel,discard,stripe=16,data=ordered +227 35 253:30 / /var/lib/docker/devicemapper/mnt/c280cd3d0bf0aa36b478b292279671624cceafc1a67eaa920fa1082601297adf rw,relatime shared:209 - ext4 /dev/mapper/docker-253:2-425882-c280cd3d0bf0aa36b478b292279671624cceafc1a67eaa920fa1082601297adf rw,seclabel,discard,stripe=16,data=ordered +231 35 253:31 / /var/lib/docker/devicemapper/mnt/8b59a7d9340279f09fea67fd6ad89ddef711e9e7050eb647984f8b5ef006335f rw,relatime shared:213 - ext4 /dev/mapper/docker-253:2-425882-8b59a7d9340279f09fea67fd6ad89ddef711e9e7050eb647984f8b5ef006335f rw,seclabel,discard,stripe=16,data=ordered +235 35 253:32 / /var/lib/docker/devicemapper/mnt/1a28059f29eda821578b1bb27a60cc71f76f846a551abefabce6efd0146dce9f rw,relatime shared:217 - ext4 /dev/mapper/docker-253:2-425882-1a28059f29eda821578b1bb27a60cc71f76f846a551abefabce6efd0146dce9f rw,seclabel,discard,stripe=16,data=ordered +239 35 253:33 / /var/lib/docker/devicemapper/mnt/e9aa60c60128cad1 rw,relatime shared:221 - ext4 /dev/mapper/docker-253:2-425882-e9aa60c60128cad1 rw,seclabel,discard,stripe=16,data=ordered +243 35 253:34 / 
/var/lib/docker/devicemapper/mnt/5fec11304b6f4713fea7b6ccdcc1adc0a1966187f590fe25a8227428a8df275d-init rw,relatime shared:225 - ext4 /dev/mapper/docker-253:2-425882-5fec11304b6f4713fea7b6ccdcc1adc0a1966187f590fe25a8227428a8df275d-init rw,seclabel,discard,stripe=16,data=ordered +247 35 253:35 / /var/lib/docker/devicemapper/mnt/5fec11304b6f4713fea7b6ccdcc1adc0a1966187f590fe25a8227428a8df275d rw,relatime shared:229 - ext4 /dev/mapper/docker-253:2-425882-5fec11304b6f4713fea7b6ccdcc1adc0a1966187f590fe25a8227428a8df275d rw,seclabel,discard,stripe=16,data=ordered +31 21 0:23 / /DATA/foo_bla_bla rw,relatime - cifs //foo/BLA\040BLA\040BLA/ rw,sec=ntlm,cache=loose,unc=\\foo\BLA BLA BLA,username=my_login,domain=mydomain.com,uid=12345678,forceuid,gid=12345678,forcegid,addr=10.1.30.10,file_mode=0755,dir_mode=0755,nounix,rsize=61440,wsize=65536,actimeo=1` + +const systemdMountinfo = `115 83 0:32 / / rw,relatime - aufs none rw,si=c0bd3d3,dio,dirperm1 +116 115 0:35 / /proc rw,nosuid,nodev,noexec,relatime - proc proc rw +117 115 0:36 / /dev rw,nosuid - tmpfs tmpfs rw,mode=755 +118 117 0:37 / /dev/pts rw,nosuid,noexec,relatime - devpts devpts rw,gid=5,mode=620,ptmxmode=666 +119 115 0:38 / /sys rw,nosuid,nodev,noexec,relatime - sysfs sysfs rw +120 119 0:39 / /sys/fs/cgroup rw,nosuid,nodev,noexec,relatime - tmpfs tmpfs rw,mode=755 +121 120 0:19 /system.slice/docker-dc4eaa1a34ec4d593bc0125d31eea823a1d76ae483aeb1409cca80304e34da2e.scope /sys/fs/cgroup/systemd rw,nosuid,nodev,noexec,relatime - cgroup cgroup rw,xattr,release_agent=/lib/systemd/systemd-cgroups-agent,name=systemd +122 120 0:20 /system.slice/docker-dc4eaa1a34ec4d593bc0125d31eea823a1d76ae483aeb1409cca80304e34da2e.scope /sys/fs/cgroup/devices rw,nosuid,nodev,noexec,relatime - cgroup cgroup rw,devices +123 120 0:21 /system.slice/docker-dc4eaa1a34ec4d593bc0125d31eea823a1d76ae483aeb1409cca80304e34da2e.scope /sys/fs/cgroup/freezer rw,nosuid,nodev,noexec,relatime - cgroup cgroup rw,freezer +124 120 0:22 
/system.slice/docker-dc4eaa1a34ec4d593bc0125d31eea823a1d76ae483aeb1409cca80304e34da2e.scope /sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime - cgroup cgroup rw,memory +125 120 0:23 /system.slice/docker-dc4eaa1a34ec4d593bc0125d31eea823a1d76ae483aeb1409cca80304e34da2e.scope /sys/fs/cgroup/net_cls,net_prio rw,nosuid,nodev,noexec,relatime - cgroup cgroup rw,net_cls,net_prio +126 120 0:24 /system.slice/docker-dc4eaa1a34ec4d593bc0125d31eea823a1d76ae483aeb1409cca80304e34da2e.scope /sys/fs/cgroup/blkio rw,nosuid,nodev,noexec,relatime - cgroup cgroup rw,blkio +127 120 0:25 /system.slice/docker-dc4eaa1a34ec4d593bc0125d31eea823a1d76ae483aeb1409cca80304e34da2e.scope /sys/fs/cgroup/cpuset rw,nosuid,nodev,noexec,relatime - cgroup cgroup rw,cpuset,clone_children +128 120 0:26 /system.slice/docker-dc4eaa1a34ec4d593bc0125d31eea823a1d76ae483aeb1409cca80304e34da2e.scope /sys/fs/cgroup/cpu,cpuacct rw,nosuid,nodev,noexec,relatime - cgroup cgroup rw,cpu,cpuacct +129 120 0:27 /system.slice/docker-dc4eaa1a34ec4d593bc0125d31eea823a1d76ae483aeb1409cca80304e34da2e.scope /sys/fs/cgroup/perf_event rw,nosuid,nodev,noexec,relatime - cgroup cgroup rw,perf_event,release_agent=/run/cgmanager/agents/cgm-release-agent.perf_event +130 115 43:0 /var/lib/docker/volumes/a44a712176377f57c094397330ee04387284c478364eb25f4c3d25f775f25c26/_data /var/lib/docker rw,relatime - ext4 /dev/nbd0 rw,data=ordered +131 115 43:0 /var/lib/docker/containers/dc4eaa1a34ec4d593bc0125d31eea823a1d76ae483aeb1409cca80304e34da2e/resolv.conf /etc/resolv.conf rw,relatime - ext4 /dev/nbd0 rw,data=ordered +132 115 43:0 /var/lib/docker/containers/dc4eaa1a34ec4d593bc0125d31eea823a1d76ae483aeb1409cca80304e34da2e/hostname /etc/hostname rw,relatime - ext4 /dev/nbd0 rw,data=ordered +133 115 43:0 /var/lib/docker/containers/dc4eaa1a34ec4d593bc0125d31eea823a1d76ae483aeb1409cca80304e34da2e/hosts /etc/hosts rw,relatime - ext4 /dev/nbd0 rw,data=ordered +134 117 0:33 / /dev/shm rw,nosuid,nodev,noexec,relatime - tmpfs shm rw,size=65536k +135 
117 0:13 / /dev/mqueue rw,nosuid,nodev,noexec,relatime - mqueue mqueue rw +136 117 0:12 /1 /dev/console rw,nosuid,noexec,relatime - devpts none rw,gid=5,mode=620,ptmxmode=000 +84 115 0:40 / /tmp rw,relatime - tmpfs none rw` + +const bedrockMountinfo = `120 17 0:28 / /sys/fs/cgroup ro,nosuid,nodev,noexec shared:16 - tmpfs tmpfs ro,mode=755 +124 28 0:28 / /bedrock/strata/arch/sys/fs/cgroup rw,nosuid,nodev,noexec shared:16 - tmpfs tmpfs ro,mode=755 +123 53 0:28 / /bedrock/strata/fallback/sys/fs/cgroup rw,nosuid,nodev,noexec shared:16 - tmpfs tmpfs ro,mode=755 +122 71 0:28 / /bedrock/strata/gentoo/sys/fs/cgroup rw,nosuid,nodev,noexec shared:16 - tmpfs tmpfs ro,mode=755 +121 89 0:28 / /bedrock/strata/kde/sys/fs/cgroup rw,nosuid,nodev,noexec shared:16 - tmpfs tmpfs ro,mode=755 +125 120 0:29 / /sys/fs/cgroup/systemd rw,nosuid,nodev,noexec,relatime shared:17 - cgroup cgroup rw,xattr,release_agent=/lib/systemd/systemd-cgroups-agent,name=systemd +129 124 0:29 / /bedrock/strata/arch/sys/fs/cgroup/systemd rw,nosuid,nodev,noexec,relatime shared:17 - cgroup cgroup rw,xattr,release_agent=/lib/systemd/systemd-cgroups-agent,name=systemd +128 123 0:29 / /bedrock/strata/fallback/sys/fs/cgroup/systemd rw,nosuid,nodev,noexec,relatime shared:17 - cgroup cgroup rw,xattr,release_agent=/lib/systemd/systemd-cgroups-agent,name=systemd +127 122 0:29 / /bedrock/strata/gentoo/sys/fs/cgroup/systemd rw,nosuid,nodev,noexec,relatime shared:17 - cgroup cgroup rw,xattr,release_agent=/lib/systemd/systemd-cgroups-agent,name=systemd +126 121 0:29 / /bedrock/strata/kde/sys/fs/cgroup/systemd rw,nosuid,nodev,noexec,relatime shared:17 - cgroup cgroup rw,xattr,release_agent=/lib/systemd/systemd-cgroups-agent,name=systemd +140 120 0:32 / /sys/fs/cgroup/net_cls,net_prio rw,nosuid,nodev,noexec,relatime shared:48 - cgroup cgroup rw,net_cls,net_prio +144 124 0:32 / /bedrock/strata/arch/sys/fs/cgroup/net_cls,net_prio rw,nosuid,nodev,noexec,relatime shared:48 - cgroup cgroup rw,net_cls,net_prio +143 123 0:32 / 
/bedrock/strata/fallback/sys/fs/cgroup/net_cls,net_prio rw,nosuid,nodev,noexec,relatime shared:48 - cgroup cgroup rw,net_cls,net_prio +142 122 0:32 / /bedrock/strata/gentoo/sys/fs/cgroup/net_cls,net_prio rw,nosuid,nodev,noexec,relatime shared:48 - cgroup cgroup rw,net_cls,net_prio +141 121 0:32 / /bedrock/strata/kde/sys/fs/cgroup/net_cls,net_prio rw,nosuid,nodev,noexec,relatime shared:48 - cgroup cgroup rw,net_cls,net_prio +145 120 0:33 / /sys/fs/cgroup/blkio rw,nosuid,nodev,noexec,relatime shared:49 - cgroup cgroup rw,blkio +149 124 0:33 / /bedrock/strata/arch/sys/fs/cgroup/blkio rw,nosuid,nodev,noexec,relatime shared:49 - cgroup cgroup rw,blkio +148 123 0:33 / /bedrock/strata/fallback/sys/fs/cgroup/blkio rw,nosuid,nodev,noexec,relatime shared:49 - cgroup cgroup rw,blkio +147 122 0:33 / /bedrock/strata/gentoo/sys/fs/cgroup/blkio rw,nosuid,nodev,noexec,relatime shared:49 - cgroup cgroup rw,blkio +146 121 0:33 / /bedrock/strata/kde/sys/fs/cgroup/blkio rw,nosuid,nodev,noexec,relatime shared:49 - cgroup cgroup rw,blkio +150 120 0:34 / /sys/fs/cgroup/cpu,cpuacct rw,nosuid,nodev,noexec,relatime shared:50 - cgroup cgroup rw,cpu,cpuacct +154 124 0:34 / /bedrock/strata/arch/sys/fs/cgroup/cpu,cpuacct rw,nosuid,nodev,noexec,relatime shared:50 - cgroup cgroup rw,cpu,cpuacct +153 123 0:34 / /bedrock/strata/fallback/sys/fs/cgroup/cpu,cpuacct rw,nosuid,nodev,noexec,relatime shared:50 - cgroup cgroup rw,cpu,cpuacct +152 122 0:34 / /bedrock/strata/gentoo/sys/fs/cgroup/cpu,cpuacct rw,nosuid,nodev,noexec,relatime shared:50 - cgroup cgroup rw,cpu,cpuacct +151 121 0:34 / /bedrock/strata/kde/sys/fs/cgroup/cpu,cpuacct rw,nosuid,nodev,noexec,relatime shared:50 - cgroup cgroup rw,cpu,cpuacct +155 120 0:35 / /sys/fs/cgroup/cpuset rw,nosuid,nodev,noexec,relatime shared:51 - cgroup cgroup rw,cpuset +159 124 0:35 / /bedrock/strata/arch/sys/fs/cgroup/cpuset rw,nosuid,nodev,noexec,relatime shared:51 - cgroup cgroup rw,cpuset +158 123 0:35 / /bedrock/strata/fallback/sys/fs/cgroup/cpuset 
rw,nosuid,nodev,noexec,relatime shared:51 - cgroup cgroup rw,cpuset +157 122 0:35 / /bedrock/strata/gentoo/sys/fs/cgroup/cpuset rw,nosuid,nodev,noexec,relatime shared:51 - cgroup cgroup rw,cpuset +156 121 0:35 / /bedrock/strata/kde/sys/fs/cgroup/cpuset rw,nosuid,nodev,noexec,relatime shared:51 - cgroup cgroup rw,cpuset +160 120 0:36 / /sys/fs/cgroup/devices rw,nosuid,nodev,noexec,relatime shared:52 - cgroup cgroup rw,devices +164 124 0:36 / /bedrock/strata/arch/sys/fs/cgroup/devices rw,nosuid,nodev,noexec,relatime shared:52 - cgroup cgroup rw,devices +163 123 0:36 / /bedrock/strata/fallback/sys/fs/cgroup/devices rw,nosuid,nodev,noexec,relatime shared:52 - cgroup cgroup rw,devices +162 122 0:36 / /bedrock/strata/gentoo/sys/fs/cgroup/devices rw,nosuid,nodev,noexec,relatime shared:52 - cgroup cgroup rw,devices +161 121 0:36 / /bedrock/strata/kde/sys/fs/cgroup/devices rw,nosuid,nodev,noexec,relatime shared:52 - cgroup cgroup rw,devices +165 120 0:37 / /sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime shared:53 - cgroup cgroup rw,memory +169 124 0:37 / /bedrock/strata/arch/sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime shared:53 - cgroup cgroup rw,memory +168 123 0:37 / /bedrock/strata/fallback/sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime shared:53 - cgroup cgroup rw,memory +167 122 0:37 / /bedrock/strata/gentoo/sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime shared:53 - cgroup cgroup rw,memory +166 121 0:37 / /bedrock/strata/kde/sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime shared:53 - cgroup cgroup rw,memory +170 120 0:38 / /sys/fs/cgroup/freezer rw,nosuid,nodev,noexec,relatime shared:54 - cgroup cgroup rw,freezer +174 124 0:38 / /bedrock/strata/arch/sys/fs/cgroup/freezer rw,nosuid,nodev,noexec,relatime shared:54 - cgroup cgroup rw,freezer +173 123 0:38 / /bedrock/strata/fallback/sys/fs/cgroup/freezer rw,nosuid,nodev,noexec,relatime shared:54 - cgroup cgroup rw,freezer +172 122 0:38 / /bedrock/strata/gentoo/sys/fs/cgroup/freezer 
rw,nosuid,nodev,noexec,relatime shared:54 - cgroup cgroup rw,freezer +171 121 0:38 / /bedrock/strata/kde/sys/fs/cgroup/freezer rw,nosuid,nodev,noexec,relatime shared:54 - cgroup cgroup rw,freezer +175 120 0:39 / /sys/fs/cgroup/pids rw,nosuid,nodev,noexec,relatime shared:55 - cgroup cgroup rw,pids +179 124 0:39 / /bedrock/strata/arch/sys/fs/cgroup/pids rw,nosuid,nodev,noexec,relatime shared:55 - cgroup cgroup rw,pids +178 123 0:39 / /bedrock/strata/fallback/sys/fs/cgroup/pids rw,nosuid,nodev,noexec,relatime shared:55 - cgroup cgroup rw,pids +177 122 0:39 / /bedrock/strata/gentoo/sys/fs/cgroup/pids rw,nosuid,nodev,noexec,relatime shared:55 - cgroup cgroup rw,pids +176 121 0:39 / /bedrock/strata/kde/sys/fs/cgroup/pids rw,nosuid,nodev,noexec,relatime shared:55 - cgroup cgroup rw,pids +180 120 0:40 / /sys/fs/cgroup/perf_event rw,nosuid,nodev,noexec,relatime shared:56 - cgroup cgroup rw,perf_event +184 124 0:40 / /bedrock/strata/arch/sys/fs/cgroup/perf_event rw,nosuid,nodev,noexec,relatime shared:56 - cgroup cgroup rw,perf_event +183 123 0:40 / /bedrock/strata/fallback/sys/fs/cgroup/perf_event rw,nosuid,nodev,noexec,relatime shared:56 - cgroup cgroup rw,perf_event +182 122 0:40 / /bedrock/strata/gentoo/sys/fs/cgroup/perf_event rw,nosuid,nodev,noexec,relatime shared:56 - cgroup cgroup rw,perf_event +181 121 0:40 / /bedrock/strata/kde/sys/fs/cgroup/perf_event rw,nosuid,nodev,noexec,relatime shared:56 - cgroup cgroup rw,perf_event` + +const cgroup2Mountinfo = `18 64 0:18 / /sys rw,nosuid,nodev,noexec,relatime shared:6 - sysfs sysfs rw,seclabel +19 64 0:4 / /proc rw,nosuid,nodev,noexec,relatime shared:5 - proc proc rw +20 64 0:6 / /dev rw,nosuid shared:2 - devtmpfs devtmpfs rw,seclabel,size=8171204k,nr_inodes=2042801,mode=755 +21 18 0:19 / /sys/kernel/security rw,nosuid,nodev,noexec,relatime shared:7 - securityfs securityfs rw +22 20 0:20 / /dev/shm rw,nosuid,nodev shared:3 - tmpfs tmpfs rw,seclabel +23 20 0:21 / /dev/pts rw,nosuid,noexec,relatime shared:4 - devpts devpts 
rw,seclabel,gid=5,mode=620,ptmxmode=000 +24 64 0:22 / /run rw,nosuid,nodev shared:24 - tmpfs tmpfs rw,seclabel,mode=755 +25 18 0:23 / /sys/fs/cgroup ro,nosuid,nodev,noexec shared:8 - tmpfs tmpfs ro,seclabel,mode=755 +26 25 0:24 / /sys/fs/cgroup/systemd rw,nosuid,nodev,noexec,relatime shared:9 - cgroup2 cgroup rw +27 18 0:25 / /sys/fs/pstore rw,nosuid,nodev,noexec,relatime shared:20 - pstore pstore rw,seclabel +28 18 0:26 / /sys/firmware/efi/efivars rw,nosuid,nodev,noexec,relatime shared:21 - efivarfs efivarfs rw +29 25 0:27 / /sys/fs/cgroup/cpu,cpuacct rw,nosuid,nodev,noexec,relatime shared:10 - cgroup cgroup rw,cpu,cpuacct +30 25 0:28 / /sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime shared:11 - cgroup cgroup rw,memory +31 25 0:29 / /sys/fs/cgroup/net_cls,net_prio rw,nosuid,nodev,noexec,relatime shared:12 - cgroup cgroup rw,net_cls,net_prio +32 25 0:30 / /sys/fs/cgroup/blkio rw,nosuid,nodev,noexec,relatime shared:13 - cgroup cgroup rw,blkio +33 25 0:31 / /sys/fs/cgroup/perf_event rw,nosuid,nodev,noexec,relatime shared:14 - cgroup cgroup rw,perf_event +34 25 0:32 / /sys/fs/cgroup/hugetlb rw,nosuid,nodev,noexec,relatime shared:15 - cgroup cgroup rw,hugetlb +35 25 0:33 / /sys/fs/cgroup/freezer rw,nosuid,nodev,noexec,relatime shared:16 - cgroup cgroup rw,freezer +36 25 0:34 / /sys/fs/cgroup/cpuset rw,nosuid,nodev,noexec,relatime shared:17 - cgroup cgroup rw,cpuset +37 25 0:35 / /sys/fs/cgroup/devices rw,nosuid,nodev,noexec,relatime shared:18 - cgroup cgroup rw,devices +38 25 0:36 / /sys/fs/cgroup/pids rw,nosuid,nodev,noexec,relatime shared:19 - cgroup cgroup rw,pids +61 18 0:37 / /sys/kernel/config rw,relatime shared:22 - configfs configfs rw +64 0 253:0 / / rw,relatime shared:1 - ext4 /dev/mapper/fedora_dhcp--16--129-root rw,seclabel,data=ordered +39 18 0:17 / /sys/fs/selinux rw,relatime shared:23 - selinuxfs selinuxfs rw +40 20 0:16 / /dev/mqueue rw,relatime shared:25 - mqueue mqueue rw,seclabel +41 20 0:39 / /dev/hugepages rw,relatime shared:26 - hugetlbfs 
hugetlbfs rw,seclabel +` + +func TestGetCgroupMounts(t *testing.T) { + type testData struct { + mountInfo string + root string + // all is the total number of records expected with all=true, + // or 0 for no extra records expected (most cases). + all int + subsystems map[string]bool + } + testTable := []testData{ + { + mountInfo: fedoraMountinfo, + root: "/", + subsystems: map[string]bool{ + "name=systemd": false, + "cpuset": false, + "cpu": false, + "cpuacct": false, + "memory": false, + "devices": false, + "freezer": false, + "net_cls": false, + "blkio": false, + "perf_event": false, + "hugetlb": false, + }, + }, + { + mountInfo: systemdMountinfo, + root: "/system.slice/docker-dc4eaa1a34ec4d593bc0125d31eea823a1d76ae483aeb1409cca80304e34da2e.scope", + subsystems: map[string]bool{ + "name=systemd": false, + "cpuset": false, + "cpu": false, + "cpuacct": false, + "memory": false, + "devices": false, + "freezer": false, + "net_cls": false, + "net_prio": false, + "blkio": false, + "perf_event": false, + }, + }, + { + mountInfo: bedrockMountinfo, + root: "/", + all: 50, + subsystems: map[string]bool{ + "name=systemd": false, + "cpuset": false, + "cpu": false, + "cpuacct": false, + "memory": false, + "devices": false, + "freezer": false, + "net_cls": false, + "net_prio": false, + "blkio": false, + "perf_event": false, + "pids": false, + }, + }, + } + for _, td := range testTable { + mi, err := mountinfo.GetMountsFromReader( + bytes.NewBufferString(td.mountInfo), + mountinfo.FSTypeFilter("cgroup"), + ) + if err != nil { + t.Fatal(err) + } + cgMounts, err := getCgroupMountsHelper(td.subsystems, mi, false) + if err != nil { + t.Fatal(err) + } + cgMap := make(map[string]Mount) + for _, m := range cgMounts { + for _, ss := range m.Subsystems { + cgMap[ss] = m + } + } + for ss := range td.subsystems { + ss = strings.TrimPrefix(ss, CgroupNamePrefix) + m, ok := cgMap[ss] + if !ok { + t.Fatalf("%s not found", ss) + } + if m.Root != td.root { + t.Fatalf("unexpected root for %s: 
%s", ss, m.Root) + } + if !strings.HasPrefix(m.Mountpoint, "/sys/fs/cgroup/") && !strings.Contains(m.Mountpoint, ss) { + t.Fatalf("unexpected mountpoint for %s: %s", ss, m.Mountpoint) + } + var ssFound bool + for _, mss := range m.Subsystems { + if mss == ss { + ssFound = true + break + } + } + if !ssFound { + t.Fatalf("subsystem %s not found in Subsystems field %v", ss, m.Subsystems) + } + } + // Test the all=true case. + + // Reset the test input. + for k := range td.subsystems { + td.subsystems[k] = false + } + cgMountsAll, err := getCgroupMountsHelper(td.subsystems, mi, true) + if err != nil { + t.Fatal(err) + } + if td.all == 0 { + // Results with and without "all" should be the same. + if len(cgMounts) != len(cgMountsAll) || !reflect.DeepEqual(cgMounts, cgMountsAll) { + t.Errorf("expected same results, got (all=false) %v, (all=true) %v", cgMounts, cgMountsAll) + } + } else { + // Make sure we got all records. + if len(cgMountsAll) != td.all { + t.Errorf("expected %d records, got %d (%+v)", td.all, len(cgMountsAll), cgMountsAll) + } + } + + } +} + +func BenchmarkGetCgroupMounts(b *testing.B) { + subsystems := map[string]bool{ + "cpuset": false, + "cpu": false, + "cpuacct": false, + "memory": false, + "devices": false, + "freezer": false, + "net_cls": false, + "blkio": false, + "perf_event": false, + "hugetlb": false, + } + mi, err := mountinfo.GetMountsFromReader( + bytes.NewBufferString(fedoraMountinfo), + mountinfo.FSTypeFilter("cgroup"), + ) + if err != nil { + b.Fatal(err) + } + b.ResetTimer() + for i := 0; i < b.N; i++ { + if _, err := getCgroupMountsHelper(subsystems, mi, false); err != nil { + b.Fatal(err) + } + } +} + +func TestParseCgroupString(t *testing.T) { + testCases := []struct { + input string + expectedError error + expectedOutput map[string]string + }{ + { + // Taken from a CoreOS instance running systemd 225 with CPU/Mem + // accounting enabled in systemd + input: `9:blkio:/ +8:freezer:/ +7:perf_event:/ 
+6:devices:/system.slice/system-sshd.slice +5:cpuset:/ +4:cpu,cpuacct:/system.slice/system-sshd.slice/sshd@126-10.240.0.15:22-xxx.yyy.zzz.aaa:33678.service +3:net_cls,net_prio:/ +2:memory:/system.slice/system-sshd.slice/sshd@126-10.240.0.15:22-xxx.yyy.zzz.aaa:33678.service +1:name=systemd:/system.slice/system-sshd.slice/sshd@126-10.240.0.15:22-xxx.yyy.zzz.aaa:33678.service`, + expectedOutput: map[string]string{ + "name=systemd": "/system.slice/system-sshd.slice/sshd@126-10.240.0.15:22-xxx.yyy.zzz.aaa:33678.service", + "blkio": "/", + "freezer": "/", + "perf_event": "/", + "devices": "/system.slice/system-sshd.slice", + "cpuset": "/", + "cpu": "/system.slice/system-sshd.slice/sshd@126-10.240.0.15:22-xxx.yyy.zzz.aaa:33678.service", + "cpuacct": "/system.slice/system-sshd.slice/sshd@126-10.240.0.15:22-xxx.yyy.zzz.aaa:33678.service", + "net_cls": "/", + "net_prio": "/", + "memory": "/system.slice/system-sshd.slice/sshd@126-10.240.0.15:22-xxx.yyy.zzz.aaa:33678.service", + }, + }, + { + input: `malformed input`, + expectedError: errors.New(`invalid cgroup entry: must contain at least two colons: malformed input`), + }, + } + + for ndx, testCase := range testCases { + out, err := parseCgroupFromReader(strings.NewReader(testCase.input)) + if err != nil { + if testCase.expectedError == nil || testCase.expectedError.Error() != err.Error() { + t.Errorf("%v: expected error %v, got error %v", ndx, testCase.expectedError, err) + } + } else { + if !reflect.DeepEqual(testCase.expectedOutput, out) { + t.Errorf("%v: expected output %v, got %v", ndx, testCase.expectedOutput, out) + } + } + } +} + +func TestIgnoreCgroup2Mount(t *testing.T) { + subsystems := map[string]bool{ + "cpuset": false, + "cpu": false, + "cpuacct": false, + "memory": false, + "devices": false, + "freezer": false, + "net_cls": false, + "blkio": false, + "perf_event": false, + "pids": false, + "name=systemd": false, + } + + mi, err := mountinfo.GetMountsFromReader( + bytes.NewBufferString(cgroup2Mountinfo), 
+ mountinfo.FSTypeFilter("cgroup"), + ) + if err != nil { + t.Fatal(err) + } + cgMounts, err := getCgroupMountsHelper(subsystems, mi, false) + if err != nil { + t.Fatal(err) + } + for _, m := range cgMounts { + if m.Mountpoint == "/sys/fs/cgroup/systemd" { + t.Errorf("parsed a cgroup2 mount at /sys/fs/cgroup/systemd instead of ignoring it") + } + } +} + +func TestFindCgroupMountpointAndRoot(t *testing.T) { + fakeMountInfo := `35 27 0:29 / /foo rw,nosuid,nodev,noexec,relatime shared:18 - cgroup cgroup rw,devices +35 27 0:29 / /sys/fs/cgroup/devices rw,nosuid,nodev,noexec,relatime shared:18 - cgroup cgroup rw,devices` + testCases := []struct { + cgroupPath string + output string + }{ + {cgroupPath: "/sys/fs", output: "/sys/fs/cgroup/devices"}, + {cgroupPath: "", output: "/foo"}, + } + + mi, err := mountinfo.GetMountsFromReader( + bytes.NewBufferString(fakeMountInfo), + mountinfo.FSTypeFilter("cgroup"), + ) + if err != nil { + t.Fatal(err) + } + + for _, c := range testCases { + mountpoint, _, _ := findCgroupMountpointAndRootFromMI(mi, c.cgroupPath, "devices") + if mountpoint != c.output { + t.Errorf("expected %s, got %s", c.output, mountpoint) + } + } +} + +func BenchmarkGetHugePageSizeImpl(b *testing.B) { + var ( + input = []string{"hugepages-1048576kB", "hugepages-2048kB", "hugepages-32768kB", "hugepages-64kB"} + output []string + err error + ) + for i := 0; i < b.N; i++ { + output, err = getHugePageSizeFromFilenames(input) + } + if err != nil || len(output) != len(input) { + b.Fatal("unexpected results") + } +} + +func TestGetHugePageSizeImpl(t *testing.T) { + testCases := []struct { + doc string + input []string + output []string + isErr bool + }{ + { + doc: "normal input", + input: []string{"hugepages-1048576kB", "hugepages-2048kB", "hugepages-32768kB", "hugepages-64kB"}, + output: []string{"1GB", "2MB", "32MB", "64KB"}, + }, + { + doc: "empty input", + input: []string{}, + output: []string{}, + }, + { + doc: "not a number", + input: []string{"hugepages-akB"}, + 
isErr: true, + }, + { + doc: "no prefix (silently skipped)", + input: []string{"1024kB"}, + }, + { + doc: "invalid prefix (silently skipped)", + input: []string{"whatever-1024kB"}, + }, + { + doc: "invalid suffix", + input: []string{"hugepages-1024gB"}, + isErr: true, + }, + { + doc: "no suffix", + input: []string{"hugepages-1024"}, + isErr: true, + }, + { + doc: "mixed valid and invalid entries", + input: []string{"hugepages-4194304kB", "hugepages-2048kB", "hugepages-akB", "hugepages-64kB"}, + output: []string{"4GB", "2MB", "64KB"}, + isErr: true, + }, + { + doc: "more mixed valid and invalid entries", + input: []string{"hugepages-2048kB", "hugepages-kB", "hugepages-64kB"}, + output: []string{"2MB", "64KB"}, + isErr: true, + }, + } + + for _, c := range testCases { + c := c + t.Run(c.doc, func(t *testing.T) { + output, err := getHugePageSizeFromFilenames(c.input) + t.Log("input:", c.input, "; output:", output, "; err:", err) + if err != nil { + if !c.isErr { + t.Errorf("input %v, expected nil, got error: %v", c.input, err) + } + // no more checks + return + } + if c.isErr { + t.Errorf("input %v, expected error, got error: nil, output: %v", c.input, output) + } + // check output + if len(output) != len(c.output) || (len(output) > 0 && !reflect.DeepEqual(output, c.output)) { + t.Errorf("input %v, expected %v, got %v", c.input, c.output, output) + } + }) + } +} + +func TestConvertCPUSharesToCgroupV2Value(t *testing.T) { + cases := map[uint64]uint64{ + 0: 0, + 2: 1, + 262144: 10000, + } + for i, expected := range cases { + got := ConvertCPUSharesToCgroupV2Value(i) + if got != expected { + t.Errorf("expected ConvertCPUSharesToCgroupV2Value(%d) to be %d, got %d", i, expected, got) + } + } +} + +func TestConvertMemorySwapToCgroupV2Value(t *testing.T) { + cases := []struct { + descr string + memswap, memory int64 + expected int64 + expErr bool + }{ + { + descr: "all unset", + memswap: 0, + memory: 0, + expected: 0, + }, + { + descr: "unlimited memory+swap, unset memory", 
+ memswap: -1, + memory: 0, + expected: -1, + }, + { + descr: "unlimited memory", + memswap: 300, + memory: -1, + expected: 300, + }, + { + descr: "all unlimited", + memswap: -1, + memory: -1, + expected: -1, + }, + { + descr: "negative memory+swap", + memswap: -2, + memory: 0, + expErr: true, + }, + { + descr: "unlimited memory+swap, set memory", + memswap: -1, + memory: 1000, + expected: -1, + }, + { + descr: "memory+swap == memory", + memswap: 1000, + memory: 1000, + expected: 0, + }, + { + descr: "memory+swap > memory", + memswap: 500, + memory: 200, + expected: 300, + }, + { + descr: "memory+swap < memory", + memswap: 300, + memory: 400, + expErr: true, + }, + { + descr: "unset memory", + memswap: 300, + memory: 0, + expErr: true, + }, + { + descr: "negative memory", + memswap: 300, + memory: -300, + expErr: true, + }, + } + + for _, c := range cases { + c := c + t.Run(c.descr, func(t *testing.T) { + swap, err := ConvertMemorySwapToCgroupV2Value(c.memswap, c.memory) + if c.expErr { + if err == nil { + t.Errorf("memswap: %d, memory %d, expected error, got %d, nil", c.memswap, c.memory, swap) + } + // No more checks. + return + } + if err != nil { + t.Errorf("memswap: %d, memory %d, expected success, got error %s", c.memswap, c.memory, err) + } + if swap != c.expected { + t.Errorf("memswap: %d, memory %d, expected %d, got %d", c.memswap, c.memory, c.expected, swap) + } + }) + } +} + +func TestConvertBlkIOToIOWeightValue(t *testing.T) { + cases := map[uint16]uint64{ + 0: 0, + 10: 1, + 1000: 10000, + } + for i, expected := range cases { + got := ConvertBlkIOToIOWeightValue(i) + if got != expected { + t.Errorf("expected ConvertBlkIOToIOWeightValue(%d) to be %d, got %d", i, expected, got) + } + } +} + +// TestRemovePathReadOnly is to test remove a non-existent dir in a ro mount point. 
+// A similar issue example: https://github.com/opencontainers/runc/issues/4518 +func TestRemovePathReadOnly(t *testing.T) { + dirTo := t.TempDir() + err := unix.Mount(t.TempDir(), dirTo, "", unix.MS_BIND, "") + if err != nil { + t.Skip("no permission of mount") + } + defer func() { + _ = unix.Unmount(dirTo, 0) + }() + err = unix.Mount("", dirTo, "", unix.MS_REMOUNT|unix.MS_BIND|unix.MS_RDONLY, "") + if err != nil { + t.Skip("no permission of mount") + } + nonExistentDir := filepath.Join(dirTo, "non-existent-dir") + err = rmdir(nonExistentDir, true) + if !errors.Is(err, unix.EROFS) { + t.Fatalf("expected the error of removing a non-existent dir %s in a ro mount point with rmdir to be unix.EROFS, but got: %v", nonExistentDir, err) + } + err = RemovePath(nonExistentDir) + if err != nil { + t.Fatalf("expected the error of removing a non-existent dir %s in a ro mount point with RemovePath to be nil, but got: %v", nonExistentDir, err) + } +} diff --git a/v1_utils.go b/v1_utils.go new file mode 100644 index 0000000..81193e2 --- /dev/null +++ b/v1_utils.go @@ -0,0 +1,277 @@ +package cgroups + +import ( + "errors" + "fmt" + "os" + "path/filepath" + "strings" + "sync" + "syscall" + + securejoin "github.com/cyphar/filepath-securejoin" + "github.com/moby/sys/mountinfo" + "golang.org/x/sys/unix" +) + +// Code in this source file is specific to cgroup v1, +// and must not be used from any cgroup v2 code. 
+ +const ( + CgroupNamePrefix = "name=" + defaultPrefix = "/sys/fs/cgroup" +) + +var ( + errUnified = errors.New("not implemented for cgroup v2 unified hierarchy") + ErrV1NoUnified = errors.New("invalid configuration: cannot use unified on cgroup v1") + + readMountinfoOnce sync.Once + readMountinfoErr error + cgroupMountinfo []*mountinfo.Info +) + +type NotFoundError struct { + Subsystem string +} + +func (e *NotFoundError) Error() string { + return fmt.Sprintf("mountpoint for %s not found", e.Subsystem) +} + +func NewNotFoundError(sub string) error { + return &NotFoundError{ + Subsystem: sub, + } +} + +func IsNotFound(err error) bool { + var nfErr *NotFoundError + return errors.As(err, &nfErr) +} + +func tryDefaultPath(cgroupPath, subsystem string) string { + if !strings.HasPrefix(defaultPrefix, cgroupPath) { + return "" + } + + // remove possible prefix + subsystem = strings.TrimPrefix(subsystem, CgroupNamePrefix) + + // Make sure we're still under defaultPrefix, and resolve + // a possible symlink (like cpu -> cpu,cpuacct). + path, err := securejoin.SecureJoin(defaultPrefix, subsystem) + if err != nil { + return "" + } + + // (1) path should be a directory. + st, err := os.Lstat(path) + if err != nil || !st.IsDir() { + return "" + } + + // (2) path should be a mount point. + pst, err := os.Lstat(filepath.Dir(path)) + if err != nil { + return "" + } + + if st.Sys().(*syscall.Stat_t).Dev == pst.Sys().(*syscall.Stat_t).Dev { + // parent dir has the same dev -- path is not a mount point + return "" + } + + // (3) path should have 'cgroup' fs type. + fst := unix.Statfs_t{} + err = unix.Statfs(path, &fst) + if err != nil || fst.Type != unix.CGROUP_SUPER_MAGIC { + return "" + } + + return path +} + +// readCgroupMountinfo returns a list of cgroup v1 mounts (i.e. the ones +// with fstype of "cgroup") for the current running process. 
+// +// The results are cached (to avoid re-reading mountinfo which is relatively +// expensive), so it is assumed that cgroup mounts are not being changed. +func readCgroupMountinfo() ([]*mountinfo.Info, error) { + readMountinfoOnce.Do(func() { + // mountinfo.GetMounts uses /proc/thread-self, so we can use it without + // issues. + cgroupMountinfo, readMountinfoErr = mountinfo.GetMounts( + mountinfo.FSTypeFilter("cgroup"), + ) + }) + return cgroupMountinfo, readMountinfoErr +} + +// https://www.kernel.org/doc/Documentation/cgroup-v1/cgroups.txt +func FindCgroupMountpoint(cgroupPath, subsystem string) (string, error) { + if IsCgroup2UnifiedMode() { + return "", errUnified + } + + // If subsystem is empty, we look for the cgroupv2 hybrid path. + if len(subsystem) == 0 { + return hybridMountpoint, nil + } + + // Avoid parsing mountinfo by trying the default path first, if possible. + if path := tryDefaultPath(cgroupPath, subsystem); path != "" { + return path, nil + } + + mnt, _, err := FindCgroupMountpointAndRoot(cgroupPath, subsystem) + return mnt, err +} + +func FindCgroupMountpointAndRoot(cgroupPath, subsystem string) (string, string, error) { + if IsCgroup2UnifiedMode() { + return "", "", errUnified + } + + mi, err := readCgroupMountinfo() + if err != nil { + return "", "", err + } + + return findCgroupMountpointAndRootFromMI(mi, cgroupPath, subsystem) +} + +func findCgroupMountpointAndRootFromMI(mounts []*mountinfo.Info, cgroupPath, subsystem string) (string, string, error) { + for _, mi := range mounts { + if strings.HasPrefix(mi.Mountpoint, cgroupPath) { + for _, opt := range strings.Split(mi.VFSOptions, ",") { + if opt == subsystem { + return mi.Mountpoint, mi.Root, nil + } + } + } + } + + return "", "", NewNotFoundError(subsystem) +} + +func (m Mount) GetOwnCgroup(cgroups map[string]string) (string, error) { + if len(m.Subsystems) == 0 { + return "", errors.New("no subsystem for mount") + } + + return getControllerPath(m.Subsystems[0], cgroups) +} + +func 
getCgroupMountsHelper(ss map[string]bool, mounts []*mountinfo.Info, all bool) ([]Mount, error) { + res := make([]Mount, 0, len(ss)) + numFound := 0 + for _, mi := range mounts { + m := Mount{ + Mountpoint: mi.Mountpoint, + Root: mi.Root, + } + for _, opt := range strings.Split(mi.VFSOptions, ",") { + seen, known := ss[opt] + if !known || (!all && seen) { + continue + } + ss[opt] = true + opt = strings.TrimPrefix(opt, CgroupNamePrefix) + m.Subsystems = append(m.Subsystems, opt) + numFound++ + } + if len(m.Subsystems) > 0 || all { + res = append(res, m) + } + if !all && numFound >= len(ss) { + break + } + } + return res, nil +} + +func getCgroupMountsV1(all bool) ([]Mount, error) { + mi, err := readCgroupMountinfo() + if err != nil { + return nil, err + } + + // We don't need to use /proc/thread-self here because runc always runs + // with every thread in the same cgroup. This lets us avoid having to do + // runtime.LockOSThread. + allSubsystems, err := ParseCgroupFile("/proc/self/cgroup") + if err != nil { + return nil, err + } + + allMap := make(map[string]bool) + for s := range allSubsystems { + allMap[s] = false + } + + return getCgroupMountsHelper(allMap, mi, all) +} + +// GetOwnCgroup returns the relative path to the cgroup the calling process is running in. +func GetOwnCgroup(subsystem string) (string, error) { + if IsCgroup2UnifiedMode() { + return "", errUnified + } + + // We don't need to use /proc/thread-self here because runc always runs + // with every thread in the same cgroup. This lets us avoid having to do + // runtime.LockOSThread. + cgroups, err := ParseCgroupFile("/proc/self/cgroup") + if err != nil { + return "", err + } + + return getControllerPath(subsystem, cgroups) +} + +func GetOwnCgroupPath(subsystem string) (string, error) { + cgroup, err := GetOwnCgroup(subsystem) + if err != nil { + return "", err + } + + // If subsystem is empty, we look for the cgroupv2 hybrid path. 
+ if len(subsystem) == 0 { + return hybridMountpoint, nil + } + + return getCgroupPathHelper(subsystem, cgroup) +} + +func getCgroupPathHelper(subsystem, cgroup string) (string, error) { + mnt, root, err := FindCgroupMountpointAndRoot("", subsystem) + if err != nil { + return "", err + } + + // This is needed for nested containers, because in /proc/self/cgroup we + // see paths from host, which don't exist in container. + relCgroup, err := filepath.Rel(root, cgroup) + if err != nil { + return "", err + } + + return filepath.Join(mnt, relCgroup), nil +} + +func getControllerPath(subsystem string, cgroups map[string]string) (string, error) { + if IsCgroup2UnifiedMode() { + return "", errUnified + } + + if p, ok := cgroups[subsystem]; ok { + return p, nil + } + + if p, ok := cgroups[CgroupNamePrefix+subsystem]; ok { + return p, nil + } + + return "", NewNotFoundError(subsystem) +}