diff --git a/MAINTAINERS b/MAINTAINERS index 9bee195..413edcb 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1,8 +1,8 @@ -This meta-project is maintained by the union of MAINTAINERS for all OCI Projects [1]. - -Other OCI Projects should list one maintainer per line, with a name, email address, and GitHub username: - -Random J Developer (@RandomJDeveloperExample) -A. U. Thor (@AUThorExample) - -[1]: https://github.com/opencontainers/ +Akihiro Suda (@AkihiroSuda) +Aleksa Sarai (@cyphar) +Kir Kolyshkin (@kolyshkin) +Mrunal Patel (@mrunalp) +Sebastiaan van Stijn (@thaJeztah) +Odin Ugedal (@odinuge) +Peter Hunt (@haircommander) +Davanum Srinivas (@dims) diff --git a/cgroups.go b/cgroups.go new file mode 100644 index 0000000..1f12755 --- /dev/null +++ b/cgroups.go @@ -0,0 +1,78 @@ +package cgroups + +import ( + "errors" +) + +var ( + // ErrDevicesUnsupported is an error returned when a cgroup manager + // is not configured to set device rules. + ErrDevicesUnsupported = errors.New("cgroup manager is not configured to set device rules") + + // ErrRootless is returned by [Manager.Apply] when there is an error + // creating cgroup directory, and cgroup.Rootless is set. In general, + // this error is to be ignored. + ErrRootless = errors.New("cgroup manager can not access cgroup (rootless container)") + + // DevicesSetV1 and DevicesSetV2 are functions to set devices for + // cgroup v1 and v2, respectively. Unless + // [github.com/opencontainers/cgroups/devices] + // package is imported, it is set to nil, so cgroup managers can't + // manage devices. + DevicesSetV1 func(path string, r *Resources) error + DevicesSetV2 func(path string, r *Resources) error +) + +type Manager interface { + // Apply creates a cgroup, if not yet created, and adds a process + // with the specified pid into that cgroup. A special value of -1 + // can be used to merely create a cgroup. + Apply(pid int) error + + // GetPids returns the PIDs of all processes inside the cgroup. 
+ GetPids() ([]int, error) + + // GetAllPids returns the PIDs of all processes inside the cgroup + // and all its sub-cgroups. + GetAllPids() ([]int, error) + + // GetStats returns cgroups statistics. + GetStats() (*Stats, error) + + // Freeze sets the freezer cgroup to the specified state. + Freeze(state FreezerState) error + + // Destroy removes cgroup. + Destroy() error + + // Path returns a cgroup path to the specified controller/subsystem. + // For cgroupv2, the argument is unused and can be empty. + Path(string) string + + // Set sets cgroup resources parameters/limits. If the argument is nil, + // the resources specified during Manager creation (or the previous call + // to Set) are used. + Set(r *Resources) error + + // GetPaths returns cgroup path(s) to save in a state file in order to + // restore later. + // + // For cgroup v1, a key is cgroup subsystem name, and the value is the + // path to the cgroup for this subsystem. + // + // For cgroup v2 unified hierarchy, a key is "", and the value is the + // unified path. + GetPaths() map[string]string + + // GetCgroups returns the cgroup data as configured. + GetCgroups() (*Cgroup, error) + + // GetFreezerState retrieves the current FreezerState of the cgroup. + GetFreezerState() (FreezerState, error) + + // Exists returns whether the cgroup path exists or not. + Exists() bool + + // OOMKillCount reports OOM kill count for the cgroup. + OOMKillCount() (uint64, error) +} diff --git a/cgroups_test.go b/cgroups_test.go new file mode 100644 index 0000000..b7ca7b1 --- /dev/null +++ b/cgroups_test.go @@ -0,0 +1,21 @@ +package cgroups + +import ( + "testing" +) + +func TestParseCgroups(t *testing.T) { + // We don't need to use /proc/thread-self here because runc always runs + // with every thread in the same cgroup. This lets us avoid having to do + // runtime.LockOSThread. 
+ cgroups, err := ParseCgroupFile("/proc/self/cgroup") + if err != nil { + t.Fatal(err) + } + if IsCgroup2UnifiedMode() { + return + } + if _, ok := cgroups["cpu"]; !ok { + t.Fail() + } +} diff --git a/config_blkio_device.go b/config_blkio_device.go new file mode 100644 index 0000000..9dc2a03 --- /dev/null +++ b/config_blkio_device.go @@ -0,0 +1,66 @@ +package cgroups + +import "fmt" + +// BlockIODevice holds major:minor format supported in blkio cgroup. +type BlockIODevice struct { + // Major is the device's major number + Major int64 `json:"major"` + // Minor is the device's minor number + Minor int64 `json:"minor"` +} + +// WeightDevice struct holds a `major:minor weight`|`major:minor leaf_weight` pair +type WeightDevice struct { + BlockIODevice + // Weight is the bandwidth rate for the device, range is from 10 to 1000 + Weight uint16 `json:"weight"` + // LeafWeight is the bandwidth rate for the device while competing with the cgroup's child cgroups, range is from 10 to 1000, cfq scheduler only + LeafWeight uint16 `json:"leafWeight"` +} + +// NewWeightDevice returns a configured WeightDevice pointer +func NewWeightDevice(major, minor int64, weight, leafWeight uint16) *WeightDevice { + wd := &WeightDevice{} + wd.Major = major + wd.Minor = minor + wd.Weight = weight + wd.LeafWeight = leafWeight + return wd +} + +// WeightString formats the struct to be writable to the cgroup specific file +func (wd *WeightDevice) WeightString() string { + return fmt.Sprintf("%d:%d %d", wd.Major, wd.Minor, wd.Weight) +} + +// LeafWeightString formats the struct to be writable to the cgroup specific file +func (wd *WeightDevice) LeafWeightString() string { + return fmt.Sprintf("%d:%d %d", wd.Major, wd.Minor, wd.LeafWeight) +} + +// ThrottleDevice struct holds a `major:minor rate_per_second` pair +type ThrottleDevice struct { + BlockIODevice + // Rate is the IO rate limit per cgroup per device + Rate uint64 `json:"rate"` +} + +// NewThrottleDevice returns a configured ThrottleDevice 
pointer +func NewThrottleDevice(major, minor int64, rate uint64) *ThrottleDevice { + td := &ThrottleDevice{} + td.Major = major + td.Minor = minor + td.Rate = rate + return td +} + +// String formats the struct to be writable to the cgroup specific file +func (td *ThrottleDevice) String() string { + return fmt.Sprintf("%d:%d %d", td.Major, td.Minor, td.Rate) +} + +// StringName formats the struct to be writable to the cgroup specific file +func (td *ThrottleDevice) StringName(name string) string { + return fmt.Sprintf("%d:%d %s=%d", td.Major, td.Minor, name, td.Rate) +} diff --git a/config_hugepages.go b/config_hugepages.go new file mode 100644 index 0000000..5357dd0 --- /dev/null +++ b/config_hugepages.go @@ -0,0 +1,9 @@ +package cgroups + +type HugepageLimit struct { + // which type of hugepage to limit. + Pagesize string `json:"page_size"` + + // usage limit for hugepage. + Limit uint64 `json:"limit"` +} diff --git a/config_ifprio_map.go b/config_ifprio_map.go new file mode 100644 index 0000000..d771603 --- /dev/null +++ b/config_ifprio_map.go @@ -0,0 +1,14 @@ +package cgroups + +import ( + "fmt" +) + +type IfPrioMap struct { + Interface string `json:"interface"` + Priority int64 `json:"priority"` +} + +func (i *IfPrioMap) CgroupString() string { + return fmt.Sprintf("%s %d", i.Interface, i.Priority) +} diff --git a/config_linux.go b/config_linux.go new file mode 100644 index 0000000..ce98b3d --- /dev/null +++ b/config_linux.go @@ -0,0 +1,169 @@ +package cgroups + +import ( + systemdDbus "github.com/coreos/go-systemd/v22/dbus" + devices "github.com/opencontainers/cgroups/devices/config" +) + +type FreezerState string + +const ( + Undefined FreezerState = "" + Frozen FreezerState = "FROZEN" + Thawed FreezerState = "THAWED" +) + +// Cgroup holds properties of a cgroup on Linux. 
+type Cgroup struct { + // Name specifies the name of the cgroup + Name string `json:"name,omitempty"` + + // Parent specifies the name of parent of cgroup or slice + Parent string `json:"parent,omitempty"` + + // Path specifies the path to cgroups that are created and/or joined by the container. + // The path is assumed to be relative to the host system cgroup mountpoint. + Path string `json:"path"` + + // ScopePrefix describes prefix for the scope name + ScopePrefix string `json:"scope_prefix"` + + // Resources contains various cgroups settings to apply + *Resources + + // Systemd tells if systemd should be used to manage cgroups. + Systemd bool + + // SystemdProps are any additional properties for systemd, + // derived from org.systemd.property.xxx annotations. + // Ignored unless systemd is used for managing cgroups. + SystemdProps []systemdDbus.Property `json:"-"` + + // Rootless tells if rootless cgroups should be used. + Rootless bool + + // The host UID that should own the cgroup, or nil to accept + // the default ownership. This should only be set when the + // cgroupfs is to be mounted read/write. + // Not all cgroup manager implementations support changing + // the ownership. + OwnerUID *int `json:"owner_uid,omitempty"` +} + +type Resources struct { + // Devices is the set of access rules for devices in the container. + Devices []*devices.Rule `json:"devices"` + + // Memory limit (in bytes) + Memory int64 `json:"memory"` + + // Memory reservation or soft_limit (in bytes) + MemoryReservation int64 `json:"memory_reservation"` + + // Total memory usage (memory + swap); set `-1` to enable unlimited swap + MemorySwap int64 `json:"memory_swap"` + + // CPU shares (relative weight vs. other containers) + CpuShares uint64 `json:"cpu_shares"` + + // CPU hardcap limit (in usecs). Allowed cpu time in a given period. + CpuQuota int64 `json:"cpu_quota"` + + // CPU hardcap burst limit (in usecs). Allowed accumulated cpu time additionally for burst in a given period. 
+ CpuBurst *uint64 `json:"cpu_burst"` //nolint:revive + + // CPU period to be used for hardcapping (in usecs). 0 to use system default. + CpuPeriod uint64 `json:"cpu_period"` + + // How many time CPU will use in realtime scheduling (in usecs). + CpuRtRuntime int64 `json:"cpu_rt_quota"` + + // CPU period to be used for realtime scheduling (in usecs). + CpuRtPeriod uint64 `json:"cpu_rt_period"` + + // CPU to use + CpusetCpus string `json:"cpuset_cpus"` + + // MEM to use + CpusetMems string `json:"cpuset_mems"` + + // cgroup SCHED_IDLE + CPUIdle *int64 `json:"cpu_idle,omitempty"` + + // Process limit; set <= `0' to disable limit. + PidsLimit int64 `json:"pids_limit"` + + // Specifies per cgroup weight, range is from 10 to 1000. + BlkioWeight uint16 `json:"blkio_weight"` + + // Specifies tasks' weight in the given cgroup while competing with the cgroup's child cgroups, range is from 10 to 1000, cfq scheduler only + BlkioLeafWeight uint16 `json:"blkio_leaf_weight"` + + // Weight per cgroup per device, can override BlkioWeight. + BlkioWeightDevice []*WeightDevice `json:"blkio_weight_device"` + + // IO read rate limit per cgroup per device, bytes per second. + BlkioThrottleReadBpsDevice []*ThrottleDevice `json:"blkio_throttle_read_bps_device"` + + // IO write rate limit per cgroup per device, bytes per second. + BlkioThrottleWriteBpsDevice []*ThrottleDevice `json:"blkio_throttle_write_bps_device"` + + // IO read rate limit per cgroup per device, IO per second. + BlkioThrottleReadIOPSDevice []*ThrottleDevice `json:"blkio_throttle_read_iops_device"` + + // IO write rate limit per cgroup per device, IO per second. 
+ BlkioThrottleWriteIOPSDevice []*ThrottleDevice `json:"blkio_throttle_write_iops_device"` + + // set the freeze value for the process + Freezer FreezerState `json:"freezer"` + + // Hugetlb limit (in bytes) + HugetlbLimit []*HugepageLimit `json:"hugetlb_limit"` + + // Whether to disable OOM Killer + OomKillDisable bool `json:"oom_kill_disable"` + + // Tuning swappiness behaviour per cgroup + MemorySwappiness *uint64 `json:"memory_swappiness"` + + // Set priority of network traffic for container + NetPrioIfpriomap []*IfPrioMap `json:"net_prio_ifpriomap"` + + // Set class identifier for container's network packets + NetClsClassid uint32 `json:"net_cls_classid_u"` + + // Rdma resource restriction configuration + Rdma map[string]LinuxRdma `json:"rdma"` + + // Used on cgroups v2: + + // CpuWeight sets a proportional bandwidth limit. + CpuWeight uint64 `json:"cpu_weight"` + + // Unified is cgroupv2-only key-value map. + Unified map[string]string `json:"unified"` + + // SkipDevices allows to skip configuring device permissions. + // Used by e.g. kubelet while creating a parent cgroup (kubepods) + // common for many containers, and by runc update. + // + // NOTE it is impossible to start a container which has this flag set. + SkipDevices bool `json:"-"` + + // SkipFreezeOnSet is a flag for cgroup manager to skip the cgroup + // freeze when setting resources. Only applicable to systemd legacy + // (i.e. cgroup v1) manager (which uses freeze by default to avoid + // spurious permission errors caused by systemd inability to update + // device rules in a non-disruptive manner). + // + // If not set, a few methods (such as looking into cgroup's + // devices.list and querying the systemd unit properties) are used + // during Set() to figure out whether the freeze is required. Those + // methods may be relatively slow, thus this flag. 
+ SkipFreezeOnSet bool `json:"-"` + + // MemoryCheckBeforeUpdate is a flag for cgroup v2 managers to check + // if the new memory limits (Memory and MemorySwap) being set are lower + // than the current memory usage, and reject if so. + MemoryCheckBeforeUpdate bool `json:"memory_check_before_update"` +} diff --git a/config_rdma.go b/config_rdma.go new file mode 100644 index 0000000..a0bd54f --- /dev/null +++ b/config_rdma.go @@ -0,0 +1,9 @@ +package cgroups + +// LinuxRdma for Linux cgroup 'rdma' resource management (Linux 4.11) +type LinuxRdma struct { + // Maximum number of HCA handles that can be opened. Default is "no limit". + HcaHandles *uint32 `json:"hca_handles,omitempty"` + // Maximum number of HCA objects that can be created. Default is "no limit". + HcaObjects *uint32 `json:"hca_objects,omitempty"` +} diff --git a/config_unsupported.go b/config_unsupported.go new file mode 100644 index 0000000..db32ec4 --- /dev/null +++ b/config_unsupported.go @@ -0,0 +1,8 @@ +//go:build !linux + +package cgroups + +// Cgroup holds properties of a cgroup on Linux +// TODO Windows: This can ultimately be entirely factored out on Windows as +// cgroups are a Unix-specific construct. +type Cgroup struct{} diff --git a/devices/config/device.go b/devices/config/device.go new file mode 100644 index 0000000..05ad3ef --- /dev/null +++ b/devices/config/device.go @@ -0,0 +1,174 @@ +package config + +import ( + "fmt" + "os" + "strconv" +) + +const ( + Wildcard = -1 +) + +type Device struct { + Rule + + // Path to the device. + Path string `json:"path"` + + // FileMode permission bits for the device. + FileMode os.FileMode `json:"file_mode"` + + // Uid of the device. + Uid uint32 `json:"uid"` + + // Gid of the device. + Gid uint32 `json:"gid"` +} + +// Permissions is a cgroupv1-style string to represent device access. It +// has to be a string for backward compatibility reasons, hence why it has +// methods to do set operations. 
+type Permissions string + +const ( + deviceRead uint = (1 << iota) + deviceWrite + deviceMknod +) + +func (p Permissions) toSet() uint { + var set uint + for _, perm := range p { + switch perm { + case 'r': + set |= deviceRead + case 'w': + set |= deviceWrite + case 'm': + set |= deviceMknod + } + } + return set +} + +func fromSet(set uint) Permissions { + var perm string + if set&deviceRead == deviceRead { + perm += "r" + } + if set&deviceWrite == deviceWrite { + perm += "w" + } + if set&deviceMknod == deviceMknod { + perm += "m" + } + return Permissions(perm) +} + +// Union returns the union of the two sets of Permissions. +func (p Permissions) Union(o Permissions) Permissions { + lhs := p.toSet() + rhs := o.toSet() + return fromSet(lhs | rhs) +} + +// Difference returns the set difference of the two sets of Permissions. +// In set notation, A.Difference(B) gives you A\B. +func (p Permissions) Difference(o Permissions) Permissions { + lhs := p.toSet() + rhs := o.toSet() + return fromSet(lhs &^ rhs) +} + +// Intersection computes the intersection of the two sets of Permissions. +func (p Permissions) Intersection(o Permissions) Permissions { + lhs := p.toSet() + rhs := o.toSet() + return fromSet(lhs & rhs) +} + +// IsEmpty returns whether the set of permissions in a Permissions is +// empty. +func (p Permissions) IsEmpty() bool { + return p == Permissions("") +} + +// IsValid returns whether the set of permissions is a subset of valid +// permissions (namely, {r,w,m}). 
+func (p Permissions) IsValid() bool { + return p == fromSet(p.toSet()) +} + +type Type rune + +const ( + WildcardDevice Type = 'a' + BlockDevice Type = 'b' + CharDevice Type = 'c' // or 'u' + FifoDevice Type = 'p' +) + +func (t Type) IsValid() bool { + switch t { + case WildcardDevice, BlockDevice, CharDevice, FifoDevice: + return true + default: + return false + } +} + +func (t Type) CanMknod() bool { + switch t { + case BlockDevice, CharDevice, FifoDevice: + return true + default: + return false + } +} + +func (t Type) CanCgroup() bool { + switch t { + case WildcardDevice, BlockDevice, CharDevice: + return true + default: + return false + } +} + +type Rule struct { + // Type of device ('c' for char, 'b' for block). If set to 'a', this rule + // acts as a wildcard and all fields other than Allow are ignored. + Type Type `json:"type"` + + // Major is the device's major number. + Major int64 `json:"major"` + + // Minor is the device's minor number. + Minor int64 `json:"minor"` + + // Permissions is the set of permissions that this rule applies to (in the + // cgroupv1 format -- any combination of "rwm"). + Permissions Permissions `json:"permissions"` + + // Allow specifies whether this rule is allowed. 
+ Allow bool `json:"allow"` +} + +func (d *Rule) CgroupString() string { + var ( + major = strconv.FormatInt(d.Major, 10) + minor = strconv.FormatInt(d.Minor, 10) + ) + if d.Major == Wildcard { + major = "*" + } + if d.Minor == Wildcard { + minor = "*" + } + return fmt.Sprintf("%c %s:%s %s", d.Type, major, minor, d.Permissions) +} + +func (d *Rule) Mkdev() (uint64, error) { + return mkDev(d) +} diff --git a/devices/config/mknod_unix.go b/devices/config/mknod_unix.go new file mode 100644 index 0000000..98cdc6e --- /dev/null +++ b/devices/config/mknod_unix.go @@ -0,0 +1,14 @@ +package config + +import ( + "errors" + + "golang.org/x/sys/unix" +) + +func mkDev(d *Rule) (uint64, error) { + if d.Major == Wildcard || d.Minor == Wildcard { + return 0, errors.New("cannot mkdev() device with wildcards") + } + return unix.Mkdev(uint32(d.Major), uint32(d.Minor)), nil +} diff --git a/devices/devicefilter.go b/devices/devicefilter.go new file mode 100644 index 0000000..aafa0d0 --- /dev/null +++ b/devices/devicefilter.go @@ -0,0 +1,207 @@ +// Implements creation of eBPF device filter program. +// +// Based on https://github.com/containers/crun/blob/0.10.2/src/libcrun/ebpf.c +// +// Although ebpf.c is originally licensed under LGPL-3.0-or-later, the author (Giuseppe Scrivano) +// agreed to relicense the file in Apache License 2.0: https://github.com/opencontainers/runc/issues/2144#issuecomment-543116397 +package devices + +import ( + "errors" + "fmt" + "math" + "strconv" + + "github.com/cilium/ebpf/asm" + devices "github.com/opencontainers/cgroups/devices/config" + "golang.org/x/sys/unix" +) + +const ( + // license string format is same as kernel MODULE_LICENSE macro + license = "Apache" +) + +// deviceFilter returns eBPF device filter program and its license string. +func deviceFilter(rules []*devices.Rule) (asm.Instructions, string, error) { + // Generate the minimum ruleset for the device rules we are given. 
While we + // don't care about minimum transitions in cgroupv2, using the emulator + // gives us a guarantee that the behaviour of devices filtering is the same + // as cgroupv1, including security hardenings to avoid misconfiguration + // (such as punching holes in wildcard rules). + emu := new(emulator) + for _, rule := range rules { + if err := emu.Apply(*rule); err != nil { + return nil, "", err + } + } + cleanRules, err := emu.Rules() + if err != nil { + return nil, "", err + } + + p := &program{ + defaultAllow: emu.IsBlacklist(), + } + p.init() + + for idx, rule := range cleanRules { + if rule.Type == devices.WildcardDevice { + // We can safely skip over wildcard entries because there should + // only be one (at most) at the very start to instruct cgroupv1 to + // go into allow-list mode. However we do double-check this here. + if idx != 0 || rule.Allow != emu.IsBlacklist() { + return nil, "", fmt.Errorf("[internal error] emulated cgroupv2 devices ruleset had bad wildcard at idx %v (%s)", idx, rule.CgroupString()) + } + continue + } + if rule.Allow == p.defaultAllow { + // There should be no rules which have an action equal to the + // default action, the emulator removes those. 
+ return nil, "", fmt.Errorf("[internal error] emulated cgroupv2 devices ruleset had no-op rule at idx %v (%s)", idx, rule.CgroupString()) + } + if err := p.appendRule(rule); err != nil { + return nil, "", err + } + } + return p.finalize(), license, nil +} + +type program struct { + insts asm.Instructions + defaultAllow bool + blockID int +} + +func (p *program) init() { + // struct bpf_cgroup_dev_ctx: https://elixir.bootlin.com/linux/v5.3.6/source/include/uapi/linux/bpf.h#L3423 + /* + u32 access_type + u32 major + u32 minor + */ + // R2 <- type (lower 16 bit of u32 access_type at R1[0]) + p.insts = append(p.insts, + asm.LoadMem(asm.R2, asm.R1, 0, asm.Word), + asm.And.Imm32(asm.R2, 0xFFFF)) + + // R3 <- access (upper 16 bit of u32 access_type at R1[0]) + p.insts = append(p.insts, + asm.LoadMem(asm.R3, asm.R1, 0, asm.Word), + // RSh: bitwise shift right + asm.RSh.Imm32(asm.R3, 16)) + + // R4 <- major (u32 major at R1[4]) + p.insts = append(p.insts, + asm.LoadMem(asm.R4, asm.R1, 4, asm.Word)) + + // R5 <- minor (u32 minor at R1[8]) + p.insts = append(p.insts, + asm.LoadMem(asm.R5, asm.R1, 8, asm.Word)) +} + +// appendRule rule converts an OCI rule to the relevant eBPF block and adds it +// to the in-progress filter program. In order to operate properly, it must be +// called with a "clean" rule list (generated by devices.Emulator.Rules() -- +// with any "a" rules removed). +func (p *program) appendRule(rule *devices.Rule) error { + if p.blockID < 0 { + return errors.New("the program is finalized") + } + + var bpfType int32 + switch rule.Type { + case devices.CharDevice: + bpfType = int32(unix.BPF_DEVCG_DEV_CHAR) + case devices.BlockDevice: + bpfType = int32(unix.BPF_DEVCG_DEV_BLOCK) + default: + // We do not permit 'a', nor any other types we don't know about. 
+ return fmt.Errorf("invalid type %q", string(rule.Type)) + } + if rule.Major > math.MaxUint32 { + return fmt.Errorf("invalid major %d", rule.Major) + } + if rule.Minor > math.MaxUint32 { + return fmt.Errorf("invalid minor %d", rule.Minor) + } + hasMajor := rule.Major >= 0 // if not specified in OCI json, major is set to -1 + hasMinor := rule.Minor >= 0 + bpfAccess := int32(0) + for _, r := range rule.Permissions { + switch r { + case 'r': + bpfAccess |= unix.BPF_DEVCG_ACC_READ + case 'w': + bpfAccess |= unix.BPF_DEVCG_ACC_WRITE + case 'm': + bpfAccess |= unix.BPF_DEVCG_ACC_MKNOD + default: + return fmt.Errorf("unknown device access %v", r) + } + } + // If the access is rwm, skip the check. + hasAccess := bpfAccess != (unix.BPF_DEVCG_ACC_READ | unix.BPF_DEVCG_ACC_WRITE | unix.BPF_DEVCG_ACC_MKNOD) + + var ( + blockSym = "block-" + strconv.Itoa(p.blockID) + nextBlockSym = "block-" + strconv.Itoa(p.blockID+1) + prevBlockLastIdx = len(p.insts) - 1 + ) + p.insts = append(p.insts, + // if (R2 != bpfType) goto next + asm.JNE.Imm(asm.R2, bpfType, nextBlockSym), + ) + if hasAccess { + p.insts = append(p.insts, + // if (R3 & bpfAccess != R3 /* use R1 as a temp var */) goto next + asm.Mov.Reg32(asm.R1, asm.R3), + asm.And.Imm32(asm.R1, bpfAccess), + asm.JNE.Reg(asm.R1, asm.R3, nextBlockSym), + ) + } + if hasMajor { + p.insts = append(p.insts, + // if (R4 != major) goto next + asm.JNE.Imm(asm.R4, int32(rule.Major), nextBlockSym), + ) + } + if hasMinor { + p.insts = append(p.insts, + // if (R5 != minor) goto next + asm.JNE.Imm(asm.R5, int32(rule.Minor), nextBlockSym), + ) + } + p.insts = append(p.insts, acceptBlock(rule.Allow)...) 
+ // set blockSym to the first instruction we added in this iteration + p.insts[prevBlockLastIdx+1] = p.insts[prevBlockLastIdx+1].WithSymbol(blockSym) + p.blockID++ + return nil +} + +func (p *program) finalize() asm.Instructions { + var v int32 + if p.defaultAllow { + v = 1 + } + blockSym := "block-" + strconv.Itoa(p.blockID) + p.insts = append(p.insts, + // R0 <- v + asm.Mov.Imm32(asm.R0, v).WithSymbol(blockSym), + asm.Return(), + ) + p.blockID = -1 + return p.insts +} + +func acceptBlock(accept bool) asm.Instructions { + var v int32 + if accept { + v = 1 + } + return []asm.Instruction{ + // R0 <- v + asm.Mov.Imm32(asm.R0, v), + asm.Return(), + } +} diff --git a/devices/devicefilter_test.go b/devices/devicefilter_test.go new file mode 100644 index 0000000..4010deb --- /dev/null +++ b/devices/devicefilter_test.go @@ -0,0 +1,336 @@ +package devices + +import ( + "strings" + "testing" + + devices "github.com/opencontainers/cgroups/devices/config" +) + +func hash(s, comm string) string { + var res []string + for _, l := range strings.Split(s, "\n") { + trimmed := strings.TrimSpace(l) + if trimmed == "" || strings.HasPrefix(trimmed, comm) { + continue + } + res = append(res, trimmed) + } + return strings.Join(res, "\n") +} + +func testDeviceFilter(t testing.TB, devices []*devices.Rule, expectedStr string) { + insts, _, err := deviceFilter(devices) + if err != nil { + t.Fatalf("%s: %v (devices: %+v)", t.Name(), err, devices) + } + s := insts.String() + if expectedStr != "" { + hashed := hash(s, "//") + expectedHashed := hash(expectedStr, "//") + if expectedHashed != hashed { + t.Fatalf("expected:\n%q\ngot\n%q", expectedHashed, hashed) + } + } +} + +func TestDeviceFilter_Nil(t *testing.T) { + expected := ` +// load parameters into registers + 0: LdXMemW dst: r2 src: r1 off: 0 imm: 0 + 1: AndImm32 dst: r2 imm: 65535 + 2: LdXMemW dst: r3 src: r1 off: 0 imm: 0 + 3: RShImm32 dst: r3 imm: 16 + 4: LdXMemW dst: r4 src: r1 off: 4 imm: 0 + 5: LdXMemW dst: r5 src: r1 off: 8 imm: 
0 +block-0: +// return 0 (reject) + 6: MovImm32 dst: r0 imm: 0 + 7: Exit + ` + testDeviceFilter(t, nil, expected) +} + +func TestDeviceFilter_BuiltInAllowList(t *testing.T) { + // This is a copy of all rules from + // github.com/opencontainers/runc/libcontainer/specconv.AllowedDevices. + devices := []*devices.Rule{ + { + Type: devices.CharDevice, + Major: devices.Wildcard, + Minor: devices.Wildcard, + Permissions: "m", + Allow: true, + }, + { + Type: devices.BlockDevice, + Major: devices.Wildcard, + Minor: devices.Wildcard, + Permissions: "m", + Allow: true, + }, + { + Type: devices.CharDevice, + Major: 1, + Minor: 3, + Permissions: "rwm", + Allow: true, + }, + { + Type: devices.CharDevice, + Major: 1, + Minor: 8, + Permissions: "rwm", + Allow: true, + }, + { + Type: devices.CharDevice, + Major: 1, + Minor: 7, + Permissions: "rwm", + Allow: true, + }, + { + Type: devices.CharDevice, + Major: 5, + Minor: 0, + Permissions: "rwm", + Allow: true, + }, + { + Type: devices.CharDevice, + Major: 1, + Minor: 5, + Permissions: "rwm", + Allow: true, + }, + { + Type: devices.CharDevice, + Major: 1, + Minor: 9, + Permissions: "rwm", + Allow: true, + }, + { + Type: devices.CharDevice, + Major: 136, + Minor: devices.Wildcard, + Permissions: "rwm", + Allow: true, + }, + { + Type: devices.CharDevice, + Major: 5, + Minor: 2, + Permissions: "rwm", + Allow: true, + }, + { + Type: devices.CharDevice, + Major: 10, + Minor: 200, + Permissions: "rwm", + Allow: true, + }, + } + + expected := ` +// load parameters into registers + 0: LdXMemW dst: r2 src: r1 off: 0 imm: 0 + 1: AndImm32 dst: r2 imm: 65535 + 2: LdXMemW dst: r3 src: r1 off: 0 imm: 0 + 3: RShImm32 dst: r3 imm: 16 + 4: LdXMemW dst: r4 src: r1 off: 4 imm: 0 + 5: LdXMemW dst: r5 src: r1 off: 8 imm: 0 +block-0: +// (b, wildcard, wildcard, m, true) + 6: JNEImm dst: r2 off: -1 imm: 1 + 7: MovReg32 dst: r1 src: r3 + 8: AndImm32 dst: r1 imm: 1 + 9: JNEReg dst: r1 off: -1 src: r3 + 10: MovImm32 dst: r0 imm: 1 + 11: Exit +block-1: +// (c, 
wildcard, wildcard, m, true) + 12: JNEImm dst: r2 off: -1 imm: 2 + 13: MovReg32 dst: r1 src: r3 + 14: AndImm32 dst: r1 imm: 1 + 15: JNEReg dst: r1 off: -1 src: r3 + 16: MovImm32 dst: r0 imm: 1 + 17: Exit +block-2: + 18: JNEImm dst: r2 off: -1 imm: 2 + 19: JNEImm dst: r4 off: -1 imm: 1 + 20: JNEImm dst: r5 off: -1 imm: 3 + 21: MovImm32 dst: r0 imm: 1 + 22: Exit +block-3: + 23: JNEImm dst: r2 off: -1 imm: 2 + 24: JNEImm dst: r4 off: -1 imm: 1 + 25: JNEImm dst: r5 off: -1 imm: 5 + 26: MovImm32 dst: r0 imm: 1 + 27: Exit +block-4: + 28: JNEImm dst: r2 off: -1 imm: 2 + 29: JNEImm dst: r4 off: -1 imm: 1 + 30: JNEImm dst: r5 off: -1 imm: 7 + 31: MovImm32 dst: r0 imm: 1 + 32: Exit +block-5: + 33: JNEImm dst: r2 off: -1 imm: 2 + 34: JNEImm dst: r4 off: -1 imm: 1 + 35: JNEImm dst: r5 off: -1 imm: 8 + 36: MovImm32 dst: r0 imm: 1 + 37: Exit +block-6: + 38: JNEImm dst: r2 off: -1 imm: 2 + 39: JNEImm dst: r4 off: -1 imm: 1 + 40: JNEImm dst: r5 off: -1 imm: 9 + 41: MovImm32 dst: r0 imm: 1 + 42: Exit +block-7: + 43: JNEImm dst: r2 off: -1 imm: 2 + 44: JNEImm dst: r4 off: -1 imm: 5 + 45: JNEImm dst: r5 off: -1 imm: 0 + 46: MovImm32 dst: r0 imm: 1 + 47: Exit +block-8: + 48: JNEImm dst: r2 off: -1 imm: 2 + 49: JNEImm dst: r4 off: -1 imm: 5 + 50: JNEImm dst: r5 off: -1 imm: 2 + 51: MovImm32 dst: r0 imm: 1 + 52: Exit +block-9: +// tuntap (c, 10, 200, rwm, true) + 53: JNEImm dst: r2 off: -1 imm: 2 + 54: JNEImm dst: r4 off: -1 imm: 10 + 55: JNEImm dst: r5 off: -1 imm: 200 + 56: MovImm32 dst: r0 imm: 1 + 57: Exit +block-10: +// /dev/pts (c, 136, wildcard, rwm, true) + 58: JNEImm dst: r2 off: -1 imm: 2 + 59: JNEImm dst: r4 off: -1 imm: 136 + 60: MovImm32 dst: r0 imm: 1 + 61: Exit +block-11: + 62: MovImm32 dst: r0 imm: 0 + 63: Exit +` + testDeviceFilter(t, devices, expected) +} + +func TestDeviceFilter_Privileged(t *testing.T) { + devices := []*devices.Rule{ + { + Type: 'a', + Major: -1, + Minor: -1, + Permissions: "rwm", + Allow: true, + }, + } + expected := ` +// load parameters into 
registers + 0: LdXMemW dst: r2 src: r1 off: 0 imm: 0 + 1: AndImm32 dst: r2 imm: 65535 + 2: LdXMemW dst: r3 src: r1 off: 0 imm: 0 + 3: RShImm32 dst: r3 imm: 16 + 4: LdXMemW dst: r4 src: r1 off: 4 imm: 0 + 5: LdXMemW dst: r5 src: r1 off: 8 imm: 0 +block-0: +// return 1 (accept) + 6: MovImm32 dst: r0 imm: 1 + 7: Exit + ` + testDeviceFilter(t, devices, expected) +} + +func TestDeviceFilter_PrivilegedExceptSingleDevice(t *testing.T) { + devices := []*devices.Rule{ + { + Type: 'a', + Major: -1, + Minor: -1, + Permissions: "rwm", + Allow: true, + }, + { + Type: 'b', + Major: 8, + Minor: 0, + Permissions: "rwm", + Allow: false, + }, + } + expected := ` +// load parameters into registers + 0: LdXMemW dst: r2 src: r1 off: 0 imm: 0 + 1: AndImm32 dst: r2 imm: 65535 + 2: LdXMemW dst: r3 src: r1 off: 0 imm: 0 + 3: RShImm32 dst: r3 imm: 16 + 4: LdXMemW dst: r4 src: r1 off: 4 imm: 0 + 5: LdXMemW dst: r5 src: r1 off: 8 imm: 0 +block-0: +// return 0 (reject) if type==b && major == 8 && minor == 0 + 6: JNEImm dst: r2 off: -1 imm: 1 + 7: JNEImm dst: r4 off: -1 imm: 8 + 8: JNEImm dst: r5 off: -1 imm: 0 + 9: MovImm32 dst: r0 imm: 0 + 10: Exit +block-1: +// return 1 (accept) + 11: MovImm32 dst: r0 imm: 1 + 12: Exit +` + testDeviceFilter(t, devices, expected) +} + +func TestDeviceFilter_Weird(t *testing.T) { + devices := []*devices.Rule{ + { + Type: 'b', + Major: 8, + Minor: 1, + Permissions: "rwm", + Allow: false, + }, + { + Type: 'a', + Major: -1, + Minor: -1, + Permissions: "rwm", + Allow: true, + }, + { + Type: 'b', + Major: 8, + Minor: 2, + Permissions: "rwm", + Allow: false, + }, + } + // 8/1 is allowed, 8/2 is not allowed. + // This conforms to runc v1.0.0-rc.9 (cgroup1) behavior. 
+ expected := ` +// load parameters into registers + 0: LdXMemW dst: r2 src: r1 off: 0 imm: 0 + 1: AndImm32 dst: r2 imm: 65535 + 2: LdXMemW dst: r3 src: r1 off: 0 imm: 0 + 3: RShImm32 dst: r3 imm: 16 + 4: LdXMemW dst: r4 src: r1 off: 4 imm: 0 + 5: LdXMemW dst: r5 src: r1 off: 8 imm: 0 +block-0: +// return 0 (reject) if type==b && major == 8 && minor == 2 + 6: JNEImm dst: r2 off: -1 imm: 1 + 7: JNEImm dst: r4 off: -1 imm: 8 + 8: JNEImm dst: r5 off: -1 imm: 2 + 9: MovImm32 dst: r0 imm: 0 + 10: Exit +block-1: +// return 1 (accept) + 11: MovImm32 dst: r0 imm: 1 + 12: Exit +` + testDeviceFilter(t, devices, expected) +} diff --git a/devices/devices.go b/devices/devices.go new file mode 100644 index 0000000..2cfd7d0 --- /dev/null +++ b/devices/devices.go @@ -0,0 +1,16 @@ +// Package devices contains functionality to manage cgroup devices, which +// is exposed indirectly via libcontainer/cgroups managers. +// +// To enable cgroup managers to manage devices, this package must be imported. +package devices + +import ( + "github.com/opencontainers/cgroups" + "github.com/opencontainers/cgroups/systemd" +) + +func init() { + cgroups.DevicesSetV1 = setV1 + cgroups.DevicesSetV2 = setV2 + systemd.GenerateDeviceProps = systemdProperties +} diff --git a/devices/devices_emulator.go b/devices/devices_emulator.go new file mode 100644 index 0000000..ab18268 --- /dev/null +++ b/devices/devices_emulator.go @@ -0,0 +1,386 @@ +// SPDX-License-Identifier: Apache-2.0 +/* + * Copyright (C) 2020 Aleksa Sarai + * Copyright (C) 2020 SUSE LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package devices + +import ( + "bufio" + "fmt" + "io" + "sort" + "strconv" + "strings" + + devices "github.com/opencontainers/cgroups/devices/config" +) + +// deviceMeta is a Rule without the Allow or Permissions fields, and no +// wildcard-type support. It's effectively the "match" portion of a metadata +// rule, for the purposes of our emulation. +type deviceMeta struct { + node devices.Type + major int64 + minor int64 +} + +// deviceRule is effectively the tuple (deviceMeta, Permissions). +type deviceRule struct { + meta deviceMeta + perms devices.Permissions +} + +// deviceRules is a mapping of device metadata rules to the associated +// permissions in the ruleset. +type deviceRules map[deviceMeta]devices.Permissions + +func (r deviceRules) orderedEntries() []deviceRule { + var rules []deviceRule + for meta, perms := range r { + rules = append(rules, deviceRule{meta: meta, perms: perms}) + } + sort.Slice(rules, func(i, j int) bool { + // Sort by (major, minor, type). + a, b := rules[i].meta, rules[j].meta + return a.major < b.major || + (a.major == b.major && a.minor < b.minor) || + (a.major == b.major && a.minor == b.minor && a.node < b.node) + }) + return rules +} + +type emulator struct { + defaultAllow bool + rules deviceRules +} + +func (e *emulator) IsBlacklist() bool { + return e.defaultAllow +} + +func (e *emulator) IsAllowAll() bool { + return e.IsBlacklist() && len(e.rules) == 0 +} + +func parseLine(line string) (*deviceRule, error) { + // Input: node major:minor perms. + fields := strings.FieldsFunc(line, func(r rune) bool { + return r == ' ' || r == ':' + }) + if len(fields) != 4 { + return nil, fmt.Errorf("malformed devices.list rule %s", line) + } + + var ( + rule deviceRule + node = fields[0] + major = fields[1] + minor = fields[2] + perms = fields[3] + ) + + // Parse the node type. 
+ switch node { + case "a": + // Super-special case -- "a" always means every device with every + // access mode. In fact, for devices.list this actually indicates that + // the cgroup is in black-list mode. + // TODO: Double-check that the entire file is "a *:* rwm". + return nil, nil + case "b": + rule.meta.node = devices.BlockDevice + case "c": + rule.meta.node = devices.CharDevice + default: + return nil, fmt.Errorf("unknown device type %q", node) + } + + // Parse the major number. + if major == "*" { + rule.meta.major = devices.Wildcard + } else { + val, err := strconv.ParseUint(major, 10, 32) + if err != nil { + return nil, fmt.Errorf("invalid major number: %w", err) + } + rule.meta.major = int64(val) + } + + // Parse the minor number. + if minor == "*" { + rule.meta.minor = devices.Wildcard + } else { + val, err := strconv.ParseUint(minor, 10, 32) + if err != nil { + return nil, fmt.Errorf("invalid minor number: %w", err) + } + rule.meta.minor = int64(val) + } + + // Parse the access permissions. + rule.perms = devices.Permissions(perms) + if !rule.perms.IsValid() || rule.perms.IsEmpty() { + return nil, fmt.Errorf("parse access mode: contained unknown modes or is empty: %q", perms) + } + return &rule, nil +} + +func (e *emulator) addRule(rule deviceRule) error { //nolint:unparam + if e.rules == nil { + e.rules = make(map[deviceMeta]devices.Permissions) + } + + // Merge with any pre-existing permissions. + oldPerms := e.rules[rule.meta] + newPerms := rule.perms.Union(oldPerms) + e.rules[rule.meta] = newPerms + return nil +} + +func (e *emulator) rmRule(rule deviceRule) error { + // Give an error if any of the permissions requested to be removed are + // present in a partially-matching wildcard rule, because such rules will + // be ignored by cgroupv1. + // + // This is a diversion from cgroupv1, but is necessary to avoid leading + // users into a false sense of security. cgroupv1 will silently(!) 
ignore + // requests to remove partial exceptions, but we really shouldn't do that. + // + // It may seem like we could just "split" wildcard rules which hit this + // issue, but unfortunately there are 2^32 possible major and minor + // numbers, which would exhaust kernel memory quickly if we did this. Not + // to mention it'd be really slow (the kernel side is implemented as a + // linked-list of exceptions). + for _, partialMeta := range []deviceMeta{ + {node: rule.meta.node, major: devices.Wildcard, minor: rule.meta.minor}, + {node: rule.meta.node, major: rule.meta.major, minor: devices.Wildcard}, + {node: rule.meta.node, major: devices.Wildcard, minor: devices.Wildcard}, + } { + // This wildcard rule is equivalent to the requested rule, so skip it. + if rule.meta == partialMeta { + continue + } + // Only give an error if the set of permissions overlap. + partialPerms := e.rules[partialMeta] + if !partialPerms.Intersection(rule.perms).IsEmpty() { + return fmt.Errorf("requested rule [%v %v] not supported by devices cgroupv1 (cannot punch hole in existing wildcard rule [%v %v])", rule.meta, rule.perms, partialMeta, partialPerms) + } + } + + // Subtract all of the permissions listed from the full match rule. If the + // rule didn't exist, all of this is a no-op. + newPerms := e.rules[rule.meta].Difference(rule.perms) + if newPerms.IsEmpty() { + delete(e.rules, rule.meta) + } else { + e.rules[rule.meta] = newPerms + } + // TODO: The actual cgroup code doesn't care if an exception didn't exist + // during removal, so not erroring out here is /accurate/ but quite + // worrying. Maybe we should do additional validation, but again we + // have to worry about backwards-compatibility. + return nil +} + +func (e *emulator) allow(rule *deviceRule) error { + // This cgroup is configured as a black-list. Reset the entire emulator, + // and put is into black-list mode. 
+ if rule == nil || rule.meta.node == devices.WildcardDevice { + *e = emulator{ + defaultAllow: true, + rules: nil, + } + return nil + } + + var err error + if e.defaultAllow { + err = wrapErr(e.rmRule(*rule), "unable to remove 'deny' exception") + } else { + err = wrapErr(e.addRule(*rule), "unable to add 'allow' exception") + } + return err +} + +func (e *emulator) deny(rule *deviceRule) error { + // This cgroup is configured as a white-list. Reset the entire emulator, + // and put is into white-list mode. + if rule == nil || rule.meta.node == devices.WildcardDevice { + *e = emulator{ + defaultAllow: false, + rules: nil, + } + return nil + } + + var err error + if e.defaultAllow { + err = wrapErr(e.addRule(*rule), "unable to add 'deny' exception") + } else { + err = wrapErr(e.rmRule(*rule), "unable to remove 'allow' exception") + } + return err +} + +func (e *emulator) Apply(rule devices.Rule) error { + if !rule.Type.CanCgroup() { + return fmt.Errorf("cannot add rule [%#v] with non-cgroup type %q", rule, rule.Type) + } + + innerRule := &deviceRule{ + meta: deviceMeta{ + node: rule.Type, + major: rule.Major, + minor: rule.Minor, + }, + perms: rule.Permissions, + } + if innerRule.meta.node == devices.WildcardDevice { + innerRule = nil + } + + if rule.Allow { + return e.allow(innerRule) + } + + return e.deny(innerRule) +} + +// emulatorFromList takes a reader to a "devices.list"-like source, and returns +// a new Emulator that represents the state of the devices cgroup. Note that +// black-list devices cgroups cannot be fully reconstructed, due to limitations +// in the devices cgroup API. Instead, such cgroups are always treated as +// "allow all" cgroups. +func emulatorFromList(list io.Reader) (*emulator, error) { + // Normally cgroups are in black-list mode by default, but the way we + // figure out the current mode is whether or not devices.list has an + // allow-all rule. 
So we default to a white-list, and the existence of an + // "a *:* rwm" entry will tell us otherwise. + e := &emulator{ + defaultAllow: false, + } + + // Parse the "devices.list". + s := bufio.NewScanner(list) + for s.Scan() { + line := s.Text() + deviceRule, err := parseLine(line) + if err != nil { + return nil, fmt.Errorf("error parsing line %q: %w", line, err) + } + // "devices.list" is an allow list. Note that this means that in + // black-list mode, we have no idea what rules are in play. As a + // result, we need to be very careful in Transition(). + if err := e.allow(deviceRule); err != nil { + return nil, fmt.Errorf("error adding devices.list rule: %w", err) + } + } + if err := s.Err(); err != nil { + return nil, fmt.Errorf("error reading devices.list lines: %w", err) + } + return e, nil +} + +// Transition calculates what is the minimally-disruptive set of rules need to +// be applied to a devices cgroup in order to transition to the given target. +// This means that any already-existing rules will not be applied, and +// disruptive rules (like denying all device access) will only be applied if +// necessary. +// +// This function is the sole reason for all of Emulator -- to allow us +// to figure out how to update a containers' cgroups without causing spurious +// device errors (if possible). +func (source *emulator) Transition(target *emulator) ([]*devices.Rule, error) { //nolint:revive // Ignore receiver-naming warning. + var transitionRules []*devices.Rule + oldRules := source.rules + + // If the default policy doesn't match, we need to include a "disruptive" + // rule (either allow-all or deny-all) in order to switch the cgroup to the + // correct default policy. + // + // However, due to a limitation in "devices.list" we cannot be sure what + // deny rules are in place in a black-list cgroup. Thus if the source is a + // black-list we also have to include a disruptive rule. 
+ if source.IsBlacklist() || source.defaultAllow != target.defaultAllow { + transitionRules = append(transitionRules, &devices.Rule{ + Type: 'a', + Major: -1, + Minor: -1, + Permissions: devices.Permissions("rwm"), + Allow: target.defaultAllow, + }) + // The old rules are only relevant if we aren't starting out with a + // disruptive rule. + oldRules = nil + } + + // NOTE: We traverse through the rules in a sorted order so we always write + // the same set of rules (this is to aid testing). + + // First, we create inverse rules for any old rules not in the new set. + // This includes partial-inverse rules for specific permissions. This is a + // no-op if we added a disruptive rule, since oldRules will be empty. + for _, rule := range oldRules.orderedEntries() { + meta, oldPerms := rule.meta, rule.perms + newPerms := target.rules[meta] + droppedPerms := oldPerms.Difference(newPerms) + if !droppedPerms.IsEmpty() { + transitionRules = append(transitionRules, &devices.Rule{ + Type: meta.node, + Major: meta.major, + Minor: meta.minor, + Permissions: droppedPerms, + Allow: target.defaultAllow, + }) + } + } + + // Add any additional rules which weren't in the old set. We happen to + // filter out rules which are present in both sets, though this isn't + // strictly necessary. + for _, rule := range target.rules.orderedEntries() { + meta, newPerms := rule.meta, rule.perms + oldPerms := oldRules[meta] + gainedPerms := newPerms.Difference(oldPerms) + if !gainedPerms.IsEmpty() { + transitionRules = append(transitionRules, &devices.Rule{ + Type: meta.node, + Major: meta.major, + Minor: meta.minor, + Permissions: gainedPerms, + Allow: !target.defaultAllow, + }) + } + } + return transitionRules, nil +} + +// Rules returns the minimum set of rules necessary to convert a *deny-all* +// cgroup to the emulated filter state (note that this is not the same as a +// default cgroupv1 cgroup -- which is allow-all). 
This is effectively just a +// wrapper around Transition() with the source emulator being an empty cgroup. +func (e *emulator) Rules() ([]*devices.Rule, error) { + defaultCgroup := &emulator{defaultAllow: false} + return defaultCgroup.Transition(e) +} + +func wrapErr(err error, text string) error { + if err == nil { + return nil + } + return fmt.Errorf(text+": %w", err) +} diff --git a/devices/devices_emulator_test.go b/devices/devices_emulator_test.go new file mode 100644 index 0000000..24c1d1e --- /dev/null +++ b/devices/devices_emulator_test.go @@ -0,0 +1,1144 @@ +// SPDX-License-Identifier: Apache-2.0 +/* + * Copyright (C) 2020 Aleksa Sarai + * Copyright (C) 2020 SUSE LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package devices + +import ( + "bufio" + "bytes" + "reflect" + "strings" + "testing" + + devices "github.com/opencontainers/cgroups/devices/config" +) + +func TestDeviceEmulatorLoad(t *testing.T) { + tests := []struct { + name, list string + expected *emulator + }{ + { + name: "BlacklistMode", + list: `a *:* rwm`, + expected: &emulator{ + defaultAllow: true, + }, + }, + { + name: "WhitelistBasic", + list: `c 4:2 rw`, + expected: &emulator{ + defaultAllow: false, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 4, + minor: 2, + }: devices.Permissions("rw"), + }, + }, + }, + { + name: "WhitelistWildcard", + list: `b 0:* m`, + expected: &emulator{ + defaultAllow: false, + rules: deviceRules{ + { + node: devices.BlockDevice, + major: 0, + minor: devices.Wildcard, + }: devices.Permissions("m"), + }, + }, + }, + { + name: "WhitelistDuplicate", + list: `c *:* rwm +c 1:1 r`, + expected: &emulator{ + defaultAllow: false, + rules: deviceRules{ + { + node: devices.CharDevice, + major: devices.Wildcard, + minor: devices.Wildcard, + }: devices.Permissions("rwm"), + // To match the kernel, we allow redundant rules. 
+ { + node: devices.CharDevice, + major: 1, + minor: 1, + }: devices.Permissions("r"), + }, + }, + }, + { + name: "WhitelistComplicated", + list: `c *:* m +b *:* m +c 1:3 rwm +c 1:5 rwm +c 1:7 rwm +c 1:8 rwm +c 1:9 rwm +c 5:0 rwm +c 5:2 rwm +c 136:* rwm +c 10:200 rwm`, + expected: &emulator{ + defaultAllow: false, + rules: deviceRules{ + { + node: devices.CharDevice, + major: devices.Wildcard, + minor: devices.Wildcard, + }: devices.Permissions("m"), + { + node: devices.BlockDevice, + major: devices.Wildcard, + minor: devices.Wildcard, + }: devices.Permissions("m"), + { + node: devices.CharDevice, + major: 1, + minor: 3, + }: devices.Permissions("rwm"), + { + node: devices.CharDevice, + major: 1, + minor: 5, + }: devices.Permissions("rwm"), + { + node: devices.CharDevice, + major: 1, + minor: 7, + }: devices.Permissions("rwm"), + { + node: devices.CharDevice, + major: 1, + minor: 8, + }: devices.Permissions("rwm"), + { + node: devices.CharDevice, + major: 1, + minor: 9, + }: devices.Permissions("rwm"), + { + node: devices.CharDevice, + major: 5, + minor: 0, + }: devices.Permissions("rwm"), + { + node: devices.CharDevice, + major: 5, + minor: 2, + }: devices.Permissions("rwm"), + { + node: devices.CharDevice, + major: 136, + minor: devices.Wildcard, + }: devices.Permissions("rwm"), + { + node: devices.CharDevice, + major: 10, + minor: 200, + }: devices.Permissions("rwm"), + }, + }, + }, + // Some invalid lists. 
+ { + name: "InvalidFieldNumber", + list: `b 1:0`, + expected: nil, + }, + { + name: "InvalidDeviceType", + list: `p *:* rwm`, + expected: nil, + }, + { + name: "InvalidMajorNumber1", + list: `p -1:3 rwm`, + expected: nil, + }, + { + name: "InvalidMajorNumber2", + list: `c foo:27 rwm`, + expected: nil, + }, + { + name: "InvalidMinorNumber1", + list: `b 1:-4 rwm`, + expected: nil, + }, + { + name: "InvalidMinorNumber2", + list: `b 1:foo rwm`, + expected: nil, + }, + { + name: "InvalidPermissions", + list: `b 1:7 rwk`, + expected: nil, + }, + } + + for _, test := range tests { + test := test // capture range variable + t.Run(test.name, func(t *testing.T) { + list := bytes.NewBufferString(test.list) + emu, err := emulatorFromList(list) + if err != nil && test.expected != nil { + t.Fatalf("unexpected failure when creating emulator: %v", err) + } else if err == nil && test.expected == nil { + t.Fatalf("unexpected success when creating emulator: %#v", emu) + } + + if !reflect.DeepEqual(emu, test.expected) { + t.Errorf("final emulator state mismatch: %#v != %#v", emu, test.expected) + } + }) + } +} + +func testDeviceEmulatorApply(t *testing.T, baseDefaultAllow bool) { + tests := []struct { + name string + rule devices.Rule + base, expected *emulator + }{ + // Switch between default modes. 
+ { + name: "SwitchToOtherMode", + rule: devices.Rule{ + Type: devices.WildcardDevice, + Major: devices.Wildcard, + Minor: devices.Wildcard, + Permissions: devices.Permissions("rwm"), + Allow: !baseDefaultAllow, + }, + base: &emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: devices.Wildcard, + minor: devices.Wildcard, + }: devices.Permissions("rwm"), + { + node: devices.CharDevice, + major: 1, + minor: 1, + }: devices.Permissions("r"), + }, + }, + expected: &emulator{ + defaultAllow: !baseDefaultAllow, + rules: nil, + }, + }, + { + name: "SwitchToSameModeNoop", + rule: devices.Rule{ + Type: devices.WildcardDevice, + Major: devices.Wildcard, + Minor: devices.Wildcard, + Permissions: devices.Permissions("rwm"), + Allow: baseDefaultAllow, + }, + base: &emulator{ + defaultAllow: baseDefaultAllow, + rules: nil, + }, + expected: &emulator{ + defaultAllow: baseDefaultAllow, + rules: nil, + }, + }, + { + name: "SwitchToSameMode", + rule: devices.Rule{ + Type: devices.WildcardDevice, + Major: devices.Wildcard, + Minor: devices.Wildcard, + Permissions: devices.Permissions("rwm"), + Allow: baseDefaultAllow, + }, + base: &emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: devices.Wildcard, + minor: devices.Wildcard, + }: devices.Permissions("rwm"), + { + node: devices.CharDevice, + major: 1, + minor: 1, + }: devices.Permissions("r"), + }, + }, + expected: &emulator{ + defaultAllow: baseDefaultAllow, + rules: nil, + }, + }, + // Rule addition logic. 
+ { + name: "RuleAdditionBasic", + rule: devices.Rule{ + Type: devices.CharDevice, + Major: 42, + Minor: 1337, + Permissions: devices.Permissions("rm"), + Allow: !baseDefaultAllow, + }, + base: &emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 2, + minor: 1, + }: devices.Permissions("rwm"), + { + node: devices.BlockDevice, + major: 1, + minor: 5, + }: devices.Permissions("r"), + }, + }, + expected: &emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 2, + minor: 1, + }: devices.Permissions("rwm"), + { + node: devices.BlockDevice, + major: 1, + minor: 5, + }: devices.Permissions("r"), + { + node: devices.CharDevice, + major: 42, + minor: 1337, + }: devices.Permissions("rm"), + }, + }, + }, + { + name: "RuleAdditionBasicDuplicate", + rule: devices.Rule{ + Type: devices.CharDevice, + Major: 42, + Minor: 1337, + Permissions: devices.Permissions("rm"), + Allow: !baseDefaultAllow, + }, + base: &emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 42, + minor: devices.Wildcard, + }: devices.Permissions("rwm"), + }, + }, + expected: &emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 42, + minor: devices.Wildcard, + }: devices.Permissions("rwm"), + // To match the kernel, we allow redundant rules. 
+ { + node: devices.CharDevice, + major: 42, + minor: 1337, + }: devices.Permissions("rm"), + }, + }, + }, + { + name: "RuleAdditionBasicDuplicateNoop", + rule: devices.Rule{ + Type: devices.CharDevice, + Major: 42, + Minor: 1337, + Permissions: devices.Permissions("rm"), + Allow: !baseDefaultAllow, + }, + base: &emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 42, + minor: 1337, + }: devices.Permissions("rm"), + }, + }, + expected: &emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 42, + minor: 1337, + }: devices.Permissions("rm"), + }, + }, + }, + { + name: "RuleAdditionMerge", + rule: devices.Rule{ + Type: devices.BlockDevice, + Major: 5, + Minor: 12, + Permissions: devices.Permissions("rm"), + Allow: !baseDefaultAllow, + }, + base: &emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 2, + minor: 1, + }: devices.Permissions("rwm"), + { + node: devices.BlockDevice, + major: 5, + minor: 12, + }: devices.Permissions("rw"), + }, + }, + expected: &emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 2, + minor: 1, + }: devices.Permissions("rwm"), + { + node: devices.BlockDevice, + major: 5, + minor: 12, + }: devices.Permissions("rwm"), + }, + }, + }, + { + name: "RuleAdditionMergeWildcard", + rule: devices.Rule{ + Type: devices.BlockDevice, + Major: 5, + Minor: devices.Wildcard, + Permissions: devices.Permissions("rm"), + Allow: !baseDefaultAllow, + }, + base: &emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 2, + minor: 1, + }: devices.Permissions("rwm"), + { + node: devices.BlockDevice, + major: 5, + minor: devices.Wildcard, + }: devices.Permissions("rw"), + }, + }, + expected: &emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 2, + 
minor: 1, + }: devices.Permissions("rwm"), + { + node: devices.BlockDevice, + major: 5, + minor: devices.Wildcard, + }: devices.Permissions("rwm"), + }, + }, + }, + { + name: "RuleAdditionMergeNoop", + rule: devices.Rule{ + Type: devices.BlockDevice, + Major: 5, + Minor: 12, + Permissions: devices.Permissions("r"), + Allow: !baseDefaultAllow, + }, + base: &emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 2, + minor: 1, + }: devices.Permissions("rwm"), + { + node: devices.BlockDevice, + major: 5, + minor: 12, + }: devices.Permissions("rw"), + }, + }, + expected: &emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 2, + minor: 1, + }: devices.Permissions("rwm"), + { + node: devices.BlockDevice, + major: 5, + minor: 12, + }: devices.Permissions("rw"), + }, + }, + }, + // Rule removal logic. + { + name: "RuleRemovalBasic", + rule: devices.Rule{ + Type: devices.CharDevice, + Major: 42, + Minor: 1337, + Permissions: devices.Permissions("rm"), + Allow: baseDefaultAllow, + }, + base: &emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 42, + minor: 1337, + }: devices.Permissions("rm"), + { + node: devices.BlockDevice, + major: 1, + minor: 5, + }: devices.Permissions("r"), + }, + }, + expected: &emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: devices.BlockDevice, + major: 1, + minor: 5, + }: devices.Permissions("r"), + }, + }, + }, + { + name: "RuleRemovalNonexistent", + rule: devices.Rule{ + Type: devices.CharDevice, + Major: 4, + Minor: 1, + Permissions: devices.Permissions("rw"), + Allow: baseDefaultAllow, + }, + base: &emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: devices.BlockDevice, + major: 1, + minor: 5, + }: devices.Permissions("r"), + }, + }, + expected: &emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: 
devices.BlockDevice, + major: 1, + minor: 5, + }: devices.Permissions("r"), + }, + }, + }, + { + name: "RuleRemovalFull", + rule: devices.Rule{ + Type: devices.CharDevice, + Major: 42, + Minor: 1337, + Permissions: devices.Permissions("rw"), + Allow: baseDefaultAllow, + }, + base: &emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 42, + minor: 1337, + }: devices.Permissions("w"), + { + node: devices.BlockDevice, + major: 1, + minor: 5, + }: devices.Permissions("r"), + }, + }, + expected: &emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: devices.BlockDevice, + major: 1, + minor: 5, + }: devices.Permissions("r"), + }, + }, + }, + { + name: "RuleRemovalPartial", + rule: devices.Rule{ + Type: devices.CharDevice, + Major: 42, + Minor: 1337, + Permissions: devices.Permissions("r"), + Allow: baseDefaultAllow, + }, + base: &emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 42, + minor: 1337, + }: devices.Permissions("rm"), + { + node: devices.BlockDevice, + major: 1, + minor: 5, + }: devices.Permissions("r"), + }, + }, + expected: &emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 42, + minor: 1337, + }: devices.Permissions("m"), + { + node: devices.BlockDevice, + major: 1, + minor: 5, + }: devices.Permissions("r"), + }, + }, + }, + // Check our non-canonical behaviour when it comes to try to "punch + // out" holes in a wildcard rule. 
+ { + name: "RuleRemovalWildcardPunchoutImpossible", + rule: devices.Rule{ + Type: devices.CharDevice, + Major: 42, + Minor: 1337, + Permissions: devices.Permissions("r"), + Allow: baseDefaultAllow, + }, + base: &emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 42, + minor: devices.Wildcard, + }: devices.Permissions("rm"), + { + node: devices.CharDevice, + major: 42, + minor: 1337, + }: devices.Permissions("r"), + }, + }, + expected: nil, + }, + { + name: "RuleRemovalWildcardPunchoutPossible", + rule: devices.Rule{ + Type: devices.CharDevice, + Major: 42, + Minor: 1337, + Permissions: devices.Permissions("r"), + Allow: baseDefaultAllow, + }, + base: &emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 42, + minor: devices.Wildcard, + }: devices.Permissions("wm"), + { + node: devices.CharDevice, + major: 42, + minor: 1337, + }: devices.Permissions("r"), + }, + }, + expected: &emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 42, + minor: devices.Wildcard, + }: devices.Permissions("wm"), + }, + }, + }, + } + + for _, test := range tests { + test := test + t.Run(test.name, func(t *testing.T) { + err := test.base.Apply(test.rule) + if err != nil && test.expected != nil { + t.Fatalf("unexpected failure when applying apply rule: %v", err) + } else if err == nil && test.expected == nil { + t.Fatalf("unexpected success when applying apply rule: %#v", test.base) + } + + if test.expected != nil && !reflect.DeepEqual(test.base, test.expected) { + t.Errorf("final emulator state mismatch: %#v != %#v", test.base, test.expected) + } + }) + } +} + +func TestDeviceEmulatorWhitelistApply(t *testing.T) { + testDeviceEmulatorApply(t, false) +} + +func TestDeviceEmulatorBlacklistApply(t *testing.T) { + testDeviceEmulatorApply(t, true) +} + +func testDeviceEmulatorTransition(t *testing.T, sourceDefaultAllow bool) { + 
tests := []struct { + name string + source, target *emulator + expected []*devices.Rule + }{ + // No-op changes. + { + name: "Noop", + source: &emulator{ + defaultAllow: sourceDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 42, + minor: devices.Wildcard, + }: devices.Permissions("wm"), + }, + }, + target: &emulator{ + defaultAllow: sourceDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 42, + minor: devices.Wildcard, + }: devices.Permissions("wm"), + }, + }, + // Identical white-lists produce no extra rules. + expected: nil, + }, + // Switching modes. + { + name: "SwitchToOtherMode", + source: &emulator{ + defaultAllow: sourceDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 1, + minor: 2, + }: devices.Permissions("rwm"), + }, + }, + target: &emulator{ + defaultAllow: !sourceDefaultAllow, + rules: deviceRules{ + { + node: devices.BlockDevice, + major: 42, + minor: devices.Wildcard, + }: devices.Permissions("wm"), + }, + }, + expected: []*devices.Rule{ + // Clear-all rule. + { + Type: devices.WildcardDevice, + Major: devices.Wildcard, + Minor: devices.Wildcard, + Permissions: devices.Permissions("rwm"), + Allow: !sourceDefaultAllow, + }, + // The actual rule-set. + { + Type: devices.BlockDevice, + Major: 42, + Minor: devices.Wildcard, + Permissions: devices.Permissions("wm"), + Allow: sourceDefaultAllow, + }, + }, + }, + // Rule changes. 
+ { + name: "RuleAddition", + source: &emulator{ + defaultAllow: sourceDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 1, + minor: 2, + }: devices.Permissions("rwm"), + }, + }, + target: &emulator{ + defaultAllow: sourceDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 1, + minor: 2, + }: devices.Permissions("rwm"), + { + node: devices.BlockDevice, + major: 42, + minor: 1337, + }: devices.Permissions("rwm"), + }, + }, + expected: []*devices.Rule{ + { + Type: devices.BlockDevice, + Major: 42, + Minor: 1337, + Permissions: devices.Permissions("rwm"), + Allow: !sourceDefaultAllow, + }, + }, + }, + { + name: "RuleRemoval", + source: &emulator{ + defaultAllow: sourceDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 1, + minor: 2, + }: devices.Permissions("rwm"), + { + node: devices.BlockDevice, + major: 42, + minor: 1337, + }: devices.Permissions("rwm"), + }, + }, + target: &emulator{ + defaultAllow: sourceDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 1, + minor: 2, + }: devices.Permissions("rwm"), + }, + }, + expected: []*devices.Rule{ + { + Type: devices.BlockDevice, + Major: 42, + Minor: 1337, + Permissions: devices.Permissions("rwm"), + Allow: sourceDefaultAllow, + }, + }, + }, + { + name: "RuleMultipleAdditionRemoval", + source: &emulator{ + defaultAllow: sourceDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 1, + minor: 2, + }: devices.Permissions("rwm"), + { + node: devices.BlockDevice, + major: 3, + minor: 9, + }: devices.Permissions("rw"), + }, + }, + target: &emulator{ + defaultAllow: sourceDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 1, + minor: 2, + }: devices.Permissions("rwm"), + }, + }, + expected: []*devices.Rule{ + { + Type: devices.BlockDevice, + Major: 3, + Minor: 9, + Permissions: devices.Permissions("rw"), + Allow: sourceDefaultAllow, + }, + }, + }, + // Modifying the access 
permissions. + { + name: "RulePartialAddition", + source: &emulator{ + defaultAllow: sourceDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 1, + minor: 2, + }: devices.Permissions("r"), + }, + }, + target: &emulator{ + defaultAllow: sourceDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 1, + minor: 2, + }: devices.Permissions("rwm"), + }, + }, + expected: []*devices.Rule{ + { + Type: devices.CharDevice, + Major: 1, + Minor: 2, + Permissions: devices.Permissions("wm"), + Allow: !sourceDefaultAllow, + }, + }, + }, + { + name: "RulePartialRemoval", + source: &emulator{ + defaultAllow: sourceDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 1, + minor: 2, + }: devices.Permissions("rw"), + }, + }, + target: &emulator{ + defaultAllow: sourceDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 1, + minor: 2, + }: devices.Permissions("w"), + }, + }, + expected: []*devices.Rule{ + { + Type: devices.CharDevice, + Major: 1, + Minor: 2, + Permissions: devices.Permissions("r"), + Allow: sourceDefaultAllow, + }, + }, + }, + { + name: "RulePartialBoth", + source: &emulator{ + defaultAllow: sourceDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 1, + minor: 2, + }: devices.Permissions("rw"), + }, + }, + target: &emulator{ + defaultAllow: sourceDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 1, + minor: 2, + }: devices.Permissions("rm"), + }, + }, + expected: []*devices.Rule{ + { + Type: devices.CharDevice, + Major: 1, + Minor: 2, + Permissions: devices.Permissions("w"), + Allow: sourceDefaultAllow, + }, + { + Type: devices.CharDevice, + Major: 1, + Minor: 2, + Permissions: devices.Permissions("m"), + Allow: !sourceDefaultAllow, + }, + }, + }, + } + + for _, test := range tests { + test := test + t.Run(test.name, func(t *testing.T) { + // If we are in black-list mode, we need to prepend the relevant + // clear-all rule 
(the expected rule lists are written with + // white-list mode in mind), and then make a full copy of the + // target rules. + if sourceDefaultAllow && test.source.defaultAllow == test.target.defaultAllow { + test.expected = []*devices.Rule{{ + Type: devices.WildcardDevice, + Major: devices.Wildcard, + Minor: devices.Wildcard, + Permissions: devices.Permissions("rwm"), + Allow: test.target.defaultAllow, + }} + for _, rule := range test.target.rules.orderedEntries() { + test.expected = append(test.expected, &devices.Rule{ + Type: rule.meta.node, + Major: rule.meta.major, + Minor: rule.meta.minor, + Permissions: rule.perms, + Allow: !test.target.defaultAllow, + }) + } + } + + rules, err := test.source.Transition(test.target) + if err != nil { + t.Fatalf("unexpected error while calculating transition rules: %#v", err) + } + + if !reflect.DeepEqual(rules, test.expected) { + t.Errorf("rules don't match expected set: %#v != %#v", rules, test.expected) + } + + // Apply the rules to the source to see if it actually transitions + // correctly. This is all emulated but it's a good thing to + // double-check. 
+ for _, rule := range rules { + if err := test.source.Apply(*rule); err != nil { + t.Fatalf("error while applying transition rule [%#v]: %v", rule, err) + } + } + if !reflect.DeepEqual(test.source, test.target) { + t.Errorf("transition incomplete after applying all rules: %#v != %#v", test.source, test.target) + } + }) + } +} + +func TestDeviceEmulatorTransitionFromBlacklist(t *testing.T) { + testDeviceEmulatorTransition(t, true) +} + +func TestDeviceEmulatorTransitionFromWhitelist(t *testing.T) { + testDeviceEmulatorTransition(t, false) +} + +func BenchmarkParseLine(b *testing.B) { + list := `c *:* m +b *:* m +c 1:3 rwm +c 1:5 rwm +c 1:7 rwm +c 1:8 rwm +c 1:9 rwm +c 5:0 rwm +c 5:2 rwm +c 136:* rwm +c 10:200 rwm` + + var r *deviceRule + var err error + for i := 0; i < b.N; i++ { + s := bufio.NewScanner(strings.NewReader(list)) + for s.Scan() { + line := s.Text() + r, err = parseLine(line) + } + if err := s.Err(); err != nil { + b.Fatal(err) + } + } + b.Logf("rule: %v, err: %v", r, err) +} diff --git a/devices/ebpf_linux.go b/devices/ebpf_linux.go new file mode 100644 index 0000000..6a41aff --- /dev/null +++ b/devices/ebpf_linux.go @@ -0,0 +1,256 @@ +package devices + +import ( + "errors" + "fmt" + "os" + "runtime" + "sync" + "unsafe" + + "github.com/cilium/ebpf" + "github.com/cilium/ebpf/asm" + "github.com/cilium/ebpf/link" + "github.com/sirupsen/logrus" + "golang.org/x/sys/unix" +) + +func nilCloser() error { + return nil +} + +func findAttachedCgroupDeviceFilters(dirFd int) ([]*ebpf.Program, error) { + type bpfAttrQuery struct { + TargetFd uint32 + AttachType uint32 + QueryType uint32 + AttachFlags uint32 + ProgIds uint64 // __aligned_u64 + ProgCnt uint32 + } + + // Currently you can only have 64 eBPF programs attached to a cgroup. 
+ size := 64 + retries := 0 + for retries < 10 { + progIds := make([]uint32, size) + query := bpfAttrQuery{ + TargetFd: uint32(dirFd), + AttachType: uint32(unix.BPF_CGROUP_DEVICE), + ProgIds: uint64(uintptr(unsafe.Pointer(&progIds[0]))), + ProgCnt: uint32(len(progIds)), + } + + // Fetch the list of program ids. + _, _, errno := unix.Syscall(unix.SYS_BPF, + uintptr(unix.BPF_PROG_QUERY), + uintptr(unsafe.Pointer(&query)), + unsafe.Sizeof(query)) + size = int(query.ProgCnt) + runtime.KeepAlive(query) + if errno != 0 { + // On ENOSPC we get the correct number of programs. + if errno == unix.ENOSPC { + retries++ + continue + } + return nil, fmt.Errorf("bpf_prog_query(BPF_CGROUP_DEVICE) failed: %w", errno) + } + + // Convert the ids to program handles. + progIds = progIds[:size] + programs := make([]*ebpf.Program, 0, len(progIds)) + for _, progId := range progIds { + program, err := ebpf.NewProgramFromID(ebpf.ProgramID(progId)) + if err != nil { + // We skip over programs that give us -EACCES or -EPERM. This + // is necessary because there may be BPF programs that have + // been attached (such as with --systemd-cgroup) which have an + // LSM label that blocks us from interacting with the program. + // + // Because additional BPF_CGROUP_DEVICE programs only can add + // restrictions, there's no real issue with just ignoring these + // programs (and stops runc from breaking on distributions with + // very strict SELinux policies). 
+ if errors.Is(err, os.ErrPermission) { + logrus.Debugf("ignoring existing CGROUP_DEVICE program (prog_id=%v) which cannot be accessed by runc -- likely due to LSM policy: %v", progId, err) + continue + } + return nil, fmt.Errorf("cannot fetch program from id: %w", err) + } + programs = append(programs, program) + } + runtime.KeepAlive(progIds) + return programs, nil + } + + return nil, errors.New("could not get complete list of CGROUP_DEVICE programs") +} + +var ( + haveBpfProgReplaceBool bool + haveBpfProgReplaceOnce sync.Once +) + +// Loosely based on the BPF_F_REPLACE support check in +// https://github.com/cilium/ebpf/blob/v0.6.0/link/syscalls.go. +// +// TODO: move this logic to cilium/ebpf +func haveBpfProgReplace() bool { + haveBpfProgReplaceOnce.Do(func() { + prog, err := ebpf.NewProgram(&ebpf.ProgramSpec{ + Type: ebpf.CGroupDevice, + License: "MIT", + Instructions: asm.Instructions{ + asm.Mov.Imm(asm.R0, 0), + asm.Return(), + }, + }) + if err != nil { + logrus.Warnf("checking for BPF_F_REPLACE support: ebpf.NewProgram failed: %v", err) + return + } + defer prog.Close() + + devnull, err := os.Open("/dev/null") + if err != nil { + logrus.Warnf("checking for BPF_F_REPLACE support: open dummy target fd: %v", err) + return + } + defer devnull.Close() + + // We know that we have BPF_PROG_ATTACH since we can load + // BPF_CGROUP_DEVICE programs. If passing BPF_F_REPLACE gives us EINVAL + // we know that the feature isn't present. + err = link.RawAttachProgram(link.RawAttachProgramOptions{ + // We rely on this fd being checked after attachFlags in the kernel. + Target: int(devnull.Fd()), + // Attempt to "replace" our BPF program with itself. This will + // always fail, but we should get -EINVAL if BPF_F_REPLACE is not + // supported. 
+ Anchor: link.ReplaceProgram(prog), + Program: prog, + Attach: ebpf.AttachCGroupDevice, + Flags: unix.BPF_F_ALLOW_MULTI, + }) + if errors.Is(err, ebpf.ErrNotSupported) || errors.Is(err, unix.EINVAL) { + // not supported + return + } + if !errors.Is(err, unix.EBADF) { + // If we see any new errors here, it's possible that there is a + // regression due to a cilium/ebpf update and the above EINVAL + // checks are not working. So, be loud about it so someone notices + // and we can get the issue fixed quicker. + logrus.Warnf("checking for BPF_F_REPLACE: got unexpected (not EBADF or EINVAL) error: %v", err) + } + haveBpfProgReplaceBool = true + }) + return haveBpfProgReplaceBool +} + +// loadAttachCgroupDeviceFilter installs eBPF device filter program to /sys/fs/cgroup/ directory. +// +// Requires the system to be running in cgroup2 unified-mode with kernel >= 4.15 . +// +// https://github.com/torvalds/linux/commit/ebc614f687369f9df99828572b1d85a7c2de3d92 +func loadAttachCgroupDeviceFilter(insts asm.Instructions, license string, dirFd int) (func() error, error) { + // Increase `ulimit -l` limit to avoid BPF_PROG_LOAD error (#2167). + // This limit is not inherited into the container. + memlockLimit := &unix.Rlimit{ + Cur: unix.RLIM_INFINITY, + Max: unix.RLIM_INFINITY, + } + _ = unix.Setrlimit(unix.RLIMIT_MEMLOCK, memlockLimit) + + // Get the list of existing programs. + oldProgs, err := findAttachedCgroupDeviceFilters(dirFd) + if err != nil { + return nilCloser, err + } + useReplaceProg := haveBpfProgReplace() && len(oldProgs) == 1 + + // Generate new program. + spec := &ebpf.ProgramSpec{ + Type: ebpf.CGroupDevice, + Instructions: insts, + License: license, + } + prog, err := ebpf.NewProgram(spec) + if err != nil { + return nilCloser, err + } + + // If there is only one old program, we can just replace it directly. 
+ + attachProgramOptions := link.RawAttachProgramOptions{ + Target: dirFd, + Program: prog, + Attach: ebpf.AttachCGroupDevice, + Flags: unix.BPF_F_ALLOW_MULTI, + } + + if useReplaceProg { + attachProgramOptions.Anchor = link.ReplaceProgram(oldProgs[0]) + } + err = link.RawAttachProgram(attachProgramOptions) + if err != nil { + return nilCloser, fmt.Errorf("failed to call BPF_PROG_ATTACH (BPF_CGROUP_DEVICE, BPF_F_ALLOW_MULTI): %w", err) + } + closer := func() error { + err = link.RawDetachProgram(link.RawDetachProgramOptions{ + Target: dirFd, + Program: prog, + Attach: ebpf.AttachCGroupDevice, + }) + if err != nil { + return fmt.Errorf("failed to call BPF_PROG_DETACH (BPF_CGROUP_DEVICE): %w", err) + } + // TODO: Should we attach the old filters back in this case? Otherwise + // we fail-open on a security feature, which is a bit scary. + return nil + } + if !useReplaceProg { + logLevel := logrus.DebugLevel + // If there was more than one old program, give a warning (since this + // really shouldn't happen with runc-managed cgroups) and then detach + // all the old programs. + if len(oldProgs) > 1 { + // NOTE: Ideally this should be a warning but it turns out that + // systemd-managed cgroups trigger this warning (apparently + // systemd doesn't delete old non-systemd programs when + // setting properties). + logrus.Infof("found more than one filter (%d) attached to a cgroup -- removing extra filters!", len(oldProgs)) + logLevel = logrus.InfoLevel + } + for idx, oldProg := range oldProgs { + // Output some extra debug info. 
+ if info, err := oldProg.Info(); err == nil { + fields := logrus.Fields{ + "type": info.Type.String(), + "tag": info.Tag, + "name": info.Name, + } + if id, ok := info.ID(); ok { + fields["id"] = id + } + if runCount, ok := info.RunCount(); ok { + fields["run_count"] = runCount + } + if runtime, ok := info.Runtime(); ok { + fields["runtime"] = runtime.String() + } + logrus.WithFields(fields).Logf(logLevel, "removing old filter %d from cgroup", idx) + } + err = link.RawDetachProgram(link.RawDetachProgramOptions{ + Target: dirFd, + Program: oldProg, + Attach: ebpf.AttachCGroupDevice, + }) + if err != nil { + return closer, fmt.Errorf("failed to call BPF_PROG_DETACH (BPF_CGROUP_DEVICE) on old filter program: %w", err) + } + } + } + return closer, nil +} diff --git a/devices/systemd.go b/devices/systemd.go new file mode 100644 index 0000000..010f7f2 --- /dev/null +++ b/devices/systemd.go @@ -0,0 +1,252 @@ +package devices + +import ( + "bufio" + "fmt" + "os" + "strconv" + "strings" + + systemdDbus "github.com/coreos/go-systemd/v22/dbus" + "github.com/godbus/dbus/v5" + "github.com/sirupsen/logrus" + + "github.com/opencontainers/cgroups" + devices "github.com/opencontainers/cgroups/devices/config" +) + +// systemdProperties takes the configured device rules and generates a +// corresponding set of systemd properties to configure the devices correctly. +func systemdProperties(r *cgroups.Resources, sdVer int) ([]systemdDbus.Property, error) { + if r.SkipDevices { + return nil, nil + } + + properties := []systemdDbus.Property{ + // When we later add DeviceAllow=/dev/foo properties, we are + // appending devices to the allow list for the unit. However, + // if this is an existing unit, it already has DeviceAllow= + // entries, and we need to clear them all before applying the + // new set. (We also do this for new units, mainly for safety + // to ensure we only enable the devices we expect.) 
+ // + // To clear any existing DeviceAllow= rules, we have to add an + // empty DeviceAllow= property. + newProp("DeviceAllow", []deviceAllowEntry{}), + // Always run in the strictest white-list mode. + newProp("DevicePolicy", "strict"), + } + + // Figure out the set of rules. + configEmu := emulator{} + for _, rule := range r.Devices { + if err := configEmu.Apply(*rule); err != nil { + return nil, fmt.Errorf("unable to apply rule for systemd: %w", err) + } + } + // systemd doesn't support blacklists. So we log a warning, and tell + // systemd to act as a deny-all whitelist. This ruleset will be replaced + // with our normal fallback code. This may result in spurious errors, but + // the only other option is to error out here. + if configEmu.IsBlacklist() { + // However, if we're dealing with an allow-all rule then we can do it. + if configEmu.IsAllowAll() { + return allowAllDevices(), nil + } + logrus.Warn("systemd doesn't support blacklist device rules -- applying temporary deny-all rule") + return properties, nil + } + + // Now generate the set of rules we actually need to apply. Unlike the + // normal devices cgroup, in "strict" mode systemd defaults to a deny-all + // whitelist which is the default for devices.Emulator. + finalRules, err := configEmu.Rules() + if err != nil { + return nil, fmt.Errorf("unable to get simplified rules for systemd: %w", err) + } + var deviceAllowList []deviceAllowEntry + for _, rule := range finalRules { + if !rule.Allow { + // Should never happen. + return nil, fmt.Errorf("[internal error] cannot add deny rule to systemd DeviceAllow list: %v", *rule) + } + switch rule.Type { + case devices.BlockDevice, devices.CharDevice: + default: + // Should never happen. 
+ return nil, fmt.Errorf("invalid device type for DeviceAllow: %v", rule.Type) + } + + entry := deviceAllowEntry{ + Perms: string(rule.Permissions), + } + + // systemd has a fairly odd (though understandable) syntax here, and + // because of the OCI configuration format we have to do quite a bit of + // trickery to convert things: + // + // * Concrete rules with non-wildcard major/minor numbers have to use + // /dev/{block,char}/MAJOR:minor paths. Before v240, systemd uses + // stat(2) on such paths to look up device properties, meaning we + // cannot add whitelist rules for devices that don't exist. Since v240, + // device properties are parsed from the path string. + // + // However, path globbing is not supported for path-based rules so we + // need to handle wildcards in some other manner. + // + // * If systemd older than v240 is used, wildcard-minor rules + // have to specify a "device group name" (the second column + // in /proc/devices). + // + // * Wildcard (major and minor) rules can just specify a glob with the + // type ("char-*" or "block-*"). + // + // The only type of rule we can't handle is wildcard-major rules, and + // so we'll give a warning in that case (note that the fallback code + // will insert any rules systemd couldn't handle). What amazing fun. + + if rule.Major == devices.Wildcard { + // "_ *:n _" rules aren't supported by systemd. + if rule.Minor != devices.Wildcard { + logrus.Warnf("systemd doesn't support '*:n' device rules -- temporarily ignoring rule: %v", *rule) + continue + } + + // "_ *:* _" rules just wildcard everything. + prefix, err := groupPrefix(rule.Type) + if err != nil { + return nil, err + } + entry.Path = prefix + "*" + } else if rule.Minor == devices.Wildcard { + if sdVer >= 240 { + // systemd v240+ allows for {block,char}-MAJOR syntax. 
+ prefix, err := groupPrefix(rule.Type) + if err != nil { + return nil, err + } + entry.Path = prefix + strconv.FormatInt(rule.Major, 10) + } else { + // For older systemd, "_ n:* _" rules require a device group from /proc/devices. + group, err := findDeviceGroup(rule.Type, rule.Major) + if err != nil { + return nil, fmt.Errorf("unable to find device '%v/%d': %w", rule.Type, rule.Major, err) + } + if group == "" { + // Couldn't find a group. + logrus.Warnf("could not find device group for '%v/%d' in /proc/devices -- temporarily ignoring rule: %v", rule.Type, rule.Major, *rule) + continue + } + entry.Path = group + } + } else { + // "_ n:m _" rules are just a path in /dev/{block,char}/. + switch rule.Type { + case devices.BlockDevice: + entry.Path = fmt.Sprintf("/dev/block/%d:%d", rule.Major, rule.Minor) + case devices.CharDevice: + entry.Path = fmt.Sprintf("/dev/char/%d:%d", rule.Major, rule.Minor) + } + if sdVer < 240 { + // Old systemd versions use stat(2) on path to find out device major:minor + // numbers and type. If the path doesn't exist, it will not add the rule, + // emitting a warning instead. + // Since all of this logic is best-effort anyway (we manually set these + // rules separately to systemd) we can safely skip entries that don't + // have a corresponding path. 
+ if _, err := os.Stat(entry.Path); err != nil { + continue + } + } + } + deviceAllowList = append(deviceAllowList, entry) + } + + properties = append(properties, newProp("DeviceAllow", deviceAllowList)) + return properties, nil +} + +func newProp(name string, units interface{}) systemdDbus.Property { + return systemdDbus.Property{ + Name: name, + Value: dbus.MakeVariant(units), + } +} + +func groupPrefix(ruleType devices.Type) (string, error) { + switch ruleType { + case devices.BlockDevice: + return "block-", nil + case devices.CharDevice: + return "char-", nil + default: + return "", fmt.Errorf("device type %v has no group prefix", ruleType) + } +} + +// findDeviceGroup tries to find the device group name (as listed in +// /proc/devices) with the type prefixed as required for DeviceAllow, for a +// given (type, major) combination. If more than one device group exists, an +// arbitrary one is chosen. +func findDeviceGroup(ruleType devices.Type, ruleMajor int64) (string, error) { + fh, err := os.Open("/proc/devices") + if err != nil { + return "", err + } + defer fh.Close() + + prefix, err := groupPrefix(ruleType) + if err != nil { + return "", err + } + ruleMajorStr := strconv.FormatInt(ruleMajor, 10) + " " + + scanner := bufio.NewScanner(fh) + var currentType devices.Type + for scanner.Scan() { + // We need to strip spaces because the first number is column-aligned. + line := strings.TrimSpace(scanner.Text()) + + // Handle the "header" lines. + switch line { + case "Block devices:": + currentType = devices.BlockDevice + continue + case "Character devices:": + currentType = devices.CharDevice + continue + case "": + continue + } + + // Skip lines unrelated to our type. + if currentType != ruleType { + continue + } + + if group, ok := strings.CutPrefix(line, ruleMajorStr); ok { + return prefix + group, nil + } + } + if err := scanner.Err(); err != nil { + return "", fmt.Errorf("reading /proc/devices: %w", err) + } + // Couldn't find the device group. 
+ return "", nil +} + +// DeviceAllow is the dbus type "a(ss)" which means we need a struct +// to represent it in Go. +type deviceAllowEntry struct { + Path string + Perms string +} + +func allowAllDevices() []systemdDbus.Property { + // Setting mode to auto and removing all DeviceAllow rules + // results in allowing access to all devices. + return []systemdDbus.Property{ + newProp("DeviceAllow", []deviceAllowEntry{}), + newProp("DevicePolicy", "auto"), + } +} diff --git a/devices/systemd_test.go b/devices/systemd_test.go new file mode 100644 index 0000000..21b8a6d --- /dev/null +++ b/devices/systemd_test.go @@ -0,0 +1,279 @@ +package devices + +import ( + "bytes" + "fmt" + "os" + "os/exec" + "strings" + "testing" + + "github.com/opencontainers/cgroups" + devices "github.com/opencontainers/cgroups/devices/config" + "github.com/opencontainers/cgroups/systemd" +) + +// TestPodSkipDevicesUpdate checks that updating a pod having SkipDevices: true +// does not result in spurious "permission denied" errors in a container +// running under the pod. The test is somewhat similar in nature to the +// @test "update devices [minimal transition rules]" in tests/integration, +// but uses a pod. +func TestPodSkipDevicesUpdate(t *testing.T) { + if !systemd.IsRunningSystemd() { + t.Skip("Test requires systemd.") + } + if os.Geteuid() != 0 { + t.Skip("Test requires root.") + } + + podName := "system-runc_test_pod" + t.Name() + ".slice" + podConfig := &cgroups.Cgroup{ + Systemd: true, + Parent: "system.slice", + Name: podName, + Resources: &cgroups.Resources{ + PidsLimit: 42, + Memory: 32 * 1024 * 1024, + SkipDevices: true, + }, + } + // Create "pod" cgroup (a systemd slice to hold containers). 
+ pm := newManager(t, podConfig) + if err := pm.Apply(-1); err != nil { + t.Fatal(err) + } + if err := pm.Set(podConfig.Resources); err != nil { + t.Fatal(err) + } + + containerConfig := &cgroups.Cgroup{ + Parent: podName, + ScopePrefix: "test", + Name: "PodSkipDevicesUpdate", + Resources: &cgroups.Resources{ + Devices: []*devices.Rule{ + // Allow access to /dev/null. + { + Type: devices.CharDevice, + Major: 1, + Minor: 3, + Permissions: "rwm", + Allow: true, + }, + }, + }, + } + + // Create a "container" within the "pod" cgroup. + // This is not a real container, just a process in the cgroup. + cmd := exec.Command("sleep", "infinity") + cmd.Env = append(os.Environ(), "LANG=C") + var stderr bytes.Buffer + cmd.Stderr = &stderr + if err := cmd.Start(); err != nil { + t.Fatal(err) + } + // Make sure to not leave a zombie. + defer func() { + // These may fail, we don't care. + _ = cmd.Process.Kill() + _ = cmd.Wait() + }() + + // Put the process into a cgroup. + cm := newManager(t, containerConfig) + if err := cm.Apply(cmd.Process.Pid); err != nil { + t.Fatal(err) + } + // Check that we put the "container" into the "pod" cgroup. + if !strings.HasPrefix(cm.Path("devices"), pm.Path("devices")) { + t.Fatalf("expected container cgroup path %q to be under pod cgroup path %q", + cm.Path("devices"), pm.Path("devices")) + } + if err := cm.Set(containerConfig.Resources); err != nil { + t.Fatal(err) + } + + // Now update the pod a few times. + for i := 0; i < 42; i++ { + podConfig.Resources.PidsLimit++ + podConfig.Resources.Memory += 1024 * 1024 + if err := pm.Set(podConfig.Resources); err != nil { + t.Fatal(err) + } + } + // Kill the "container". + if err := cmd.Process.Kill(); err != nil { + t.Fatal(err) + } + + _ = cmd.Wait() + + // "Container" stderr should be empty. 
+ if stderr.Len() != 0 { + t.Fatalf("container stderr not empty: %s", stderr.String()) + } +} + +func testSkipDevices(t *testing.T, skipDevices bool, expected []string) { + if !systemd.IsRunningSystemd() { + t.Skip("Test requires systemd.") + } + if os.Geteuid() != 0 { + t.Skip("Test requires root.") + } + + podConfig := &cgroups.Cgroup{ + Parent: "system.slice", + Name: "system-runc_test_pods.slice", + Resources: &cgroups.Resources{ + SkipDevices: skipDevices, + }, + } + // Create "pods" cgroup (a systemd slice to hold containers). + pm := newManager(t, podConfig) + if err := pm.Apply(-1); err != nil { + t.Fatal(err) + } + if err := pm.Set(podConfig.Resources); err != nil { + t.Fatal(err) + } + + config := &cgroups.Cgroup{ + Parent: "system-runc_test_pods.slice", + ScopePrefix: "test", + Name: "SkipDevices", + Resources: &cgroups.Resources{ + Devices: []*devices.Rule{ + // Allow access to /dev/full only. + { + Type: devices.CharDevice, + Major: 1, + Minor: 7, + Permissions: "rwm", + Allow: true, + }, + }, + }, + } + + // Create a "container" within the "pods" cgroup. + // This is not a real container, just a process in the cgroup. + cmd := exec.Command("bash", "-c", "read; echo > /dev/full; cat /dev/null; true") + cmd.Env = append(os.Environ(), "LANG=C") + stdinR, stdinW, err := os.Pipe() + if err != nil { + t.Fatal(err) + } + cmd.Stdin = stdinR + var stderr bytes.Buffer + cmd.Stderr = &stderr + err = cmd.Start() + stdinR.Close() + defer stdinW.Close() + if err != nil { + t.Fatal(err) + } + // Make sure to not leave a zombie. + defer func() { + // These may fail, we don't care. + _, _ = stdinW.WriteString("hey\n") + _ = cmd.Wait() + }() + + // Put the process into a cgroup. + m := newManager(t, config) + if err := m.Apply(cmd.Process.Pid); err != nil { + t.Fatal(err) + } + // Check that we put the "container" into the "pod" cgroup. 
+ if !strings.HasPrefix(m.Path("devices"), pm.Path("devices")) { + t.Fatalf("expected container cgroup path %q to be under pod cgroup path %q", + m.Path("devices"), pm.Path("devices")) + } + if err := m.Set(config.Resources); err != nil { + // failed to write "c 1:7 rwm": write /sys/fs/cgroup/devices/system.slice/system-runc_test_pods.slice/test-SkipDevices.scope/devices.allow: operation not permitted + if skipDevices == false && strings.HasSuffix(err.Error(), "/devices.allow: operation not permitted") { + // Cgroup v1 devices controller gives EPERM on trying + // to enable devices that are not enabled + // (skipDevices=false) in a parent cgroup. + // If this happens, test is passing. + return + } + t.Fatal(err) + } + + // Check that we can access /dev/full but not /dev/zero. + if _, err := stdinW.WriteString("wow\n"); err != nil { + t.Fatal(err) + } + if err := cmd.Wait(); err != nil { + t.Fatal(err) + } + for _, exp := range expected { + if !strings.Contains(stderr.String(), exp) { + t.Errorf("expected %q, got: %s", exp, stderr.String()) + } + } +} + +func TestSkipDevicesTrue(t *testing.T) { + testSkipDevices(t, true, []string{ + "echo: write error: No space left on device", + "cat: /dev/null: Operation not permitted", + }) +} + +func TestSkipDevicesFalse(t *testing.T) { + // If SkipDevices is not set for the parent slice, access to both + // devices should fail. This is done to assess the test correctness. + // For cgroup v1, we check for m.Set returning EPERM. + // For cgroup v2, we check for the errors below. 
+ testSkipDevices(t, false, []string{ + "/dev/full: Operation not permitted", + "cat: /dev/null: Operation not permitted", + }) +} + +func testFindDeviceGroup() error { + const ( + major = 136 + group = "char-pts" + ) + res, err := findDeviceGroup(devices.CharDevice, major) + if res != group || err != nil { + return fmt.Errorf("expected %v, nil, got %v, %w", group, res, err) + } + return nil +} + +func TestFindDeviceGroup(t *testing.T) { + if err := testFindDeviceGroup(); err != nil { + t.Fatal(err) + } +} + +func BenchmarkFindDeviceGroup(b *testing.B) { + for i := 0; i < b.N; i++ { + if err := testFindDeviceGroup(); err != nil { + b.Fatal(err) + } + } +} + +func newManager(t *testing.T, config *cgroups.Cgroup) (m cgroups.Manager) { + t.Helper() + var err error + + if cgroups.IsCgroup2UnifiedMode() { + m, err = systemd.NewUnifiedManager(config, "") + } else { + m, err = systemd.NewLegacyManager(config, nil) + } + if err != nil { + t.Fatal(err) + } + t.Cleanup(func() { _ = m.Destroy() }) + + return m +} diff --git a/devices/v1.go b/devices/v1.go new file mode 100644 index 0000000..8d0986d --- /dev/null +++ b/devices/v1.go @@ -0,0 +1,83 @@ +package devices + +import ( + "bytes" + "errors" + "reflect" + + "github.com/moby/sys/userns" + "github.com/opencontainers/cgroups" + devices "github.com/opencontainers/cgroups/devices/config" +) + +var testingSkipFinalCheck bool + +func setV1(path string, r *cgroups.Resources) error { + if userns.RunningInUserNS() || r.SkipDevices { + return nil + } + // Generate two emulators, one for the current state of the cgroup and one + // for the requested state by the user. + current, err := loadEmulator(path) + if err != nil { + return err + } + target, err := buildEmulator(r.Devices) + if err != nil { + return err + } + + // Compute the minimal set of transition rules needed to achieve the + // requested state. 
+ transitionRules, err := current.Transition(target) + if err != nil { + return err + } + for _, rule := range transitionRules { + file := "devices.deny" + if rule.Allow { + file = "devices.allow" + } + if err := cgroups.WriteFile(path, file, rule.CgroupString()); err != nil { + return err + } + } + + // Final safety check -- ensure that the resulting state is what was + // requested. This is only really correct for white-lists, but for + // black-lists we can at least check that the cgroup is in the right mode. + // + // This safety-check is skipped for the unit tests because we cannot + // currently mock devices.list correctly. + if !testingSkipFinalCheck { + currentAfter, err := loadEmulator(path) + if err != nil { + return err + } + if !target.IsBlacklist() && !reflect.DeepEqual(currentAfter, target) { + return errors.New("resulting devices cgroup doesn't precisely match target") + } else if target.IsBlacklist() != currentAfter.IsBlacklist() { + return errors.New("resulting devices cgroup doesn't match target mode") + } + } + return nil +} + +func loadEmulator(path string) (*emulator, error) { + list, err := cgroups.ReadFile(path, "devices.list") + if err != nil { + return nil, err + } + return emulatorFromList(bytes.NewBufferString(list)) +} + +func buildEmulator(rules []*devices.Rule) (*emulator, error) { + // This defaults to a white-list -- which is what we want! 
+ emu := &emulator{} + for _, rule := range rules { + if err := emu.Apply(*rule); err != nil { + return nil, err + } + } + return emu, nil +} diff --git a/devices/v1_test.go b/devices/v1_test.go new file mode 100644 index 0000000..29e4637 --- /dev/null +++ b/devices/v1_test.go @@ -0,0 +1,68 @@ +package devices + +import ( + "os" + "path" + "testing" + + "github.com/moby/sys/userns" + + "github.com/opencontainers/cgroups" + devices "github.com/opencontainers/cgroups/devices/config" + "github.com/opencontainers/cgroups/fscommon" +) + +func init() { + testingSkipFinalCheck = true + cgroups.TestMode = true +} + +func TestSetV1Allow(t *testing.T) { + if userns.RunningInUserNS() { + t.Skip("userns detected; setV1 does nothing") + } + dir := t.TempDir() + + for file, contents := range map[string]string{ + "devices.allow": "", + "devices.deny": "", + "devices.list": "a *:* rwm", + } { + err := os.WriteFile(path.Join(dir, file), []byte(contents), 0o600) + if err != nil { + t.Fatal(err) + } + } + + r := &cgroups.Resources{ + Devices: []*devices.Rule{ + { + Type: devices.CharDevice, + Major: 1, + Minor: 5, + Permissions: devices.Permissions("rwm"), + Allow: true, + }, + }, + } + + if err := setV1(dir, r); err != nil { + t.Fatal(err) + } + + // The default deny rule must be written. + value, err := fscommon.GetCgroupParamString(dir, "devices.deny") + if err != nil { + t.Fatal(err) + } + if value[0] != 'a' { + t.Errorf("Got the wrong value (%q), set devices.deny failed.", value) + } + + // Permitted rule must be written. 
+ if value, err := fscommon.GetCgroupParamString(dir, "devices.allow"); err != nil { + t.Fatal(err) + } else if value != "c 1:5 rwm" { + t.Errorf("Got the wrong value (%q), set devices.allow failed.", value) + } +} diff --git a/devices/v2.go b/devices/v2.go new file mode 100644 index 0000000..d54298f --- /dev/null +++ b/devices/v2.go @@ -0,0 +1,73 @@ +package devices + +import ( + "fmt" + + "github.com/moby/sys/userns" + "golang.org/x/sys/unix" + + "github.com/opencontainers/cgroups" + devices "github.com/opencontainers/cgroups/devices/config" +) + +func isRWM(perms devices.Permissions) bool { + var r, w, m bool + for _, perm := range perms { + switch perm { + case 'r': + r = true + case 'w': + w = true + case 'm': + m = true + } + } + return r && w && m +} + +// This is similar to the logic applied in crun for handling errors from bpf(2) +// . +func canSkipEBPFError(r *cgroups.Resources) bool { + // If we're running in a user namespace we can ignore eBPF rules because we + // usually cannot use bpf(2), as well as rootless containers usually don't + // have the necessary privileges to mknod(2) device inodes or access + // host-level instances (though ideally we would be blocking device access + // for rootless containers anyway). + if userns.RunningInUserNS() { + return true + } + + // We cannot ignore an eBPF load error if any rule if is a block rule or it + // doesn't permit all access modes. + // + // NOTE: This will sometimes trigger in cases where access modes are split + // between different rules but to handle this correctly would require + // using ".../libcontainer/cgroup/devices".Emulator. 
+ for _, dev := range r.Devices { + if !dev.Allow || !isRWM(dev.Permissions) { + return false + } + } + return true +} + +func setV2(dirPath string, r *cgroups.Resources) error { + if r.SkipDevices { + return nil + } + insts, license, err := deviceFilter(r.Devices) + if err != nil { + return err + } + dirFD, err := unix.Open(dirPath, unix.O_DIRECTORY|unix.O_RDONLY, 0o600) + if err != nil { + return fmt.Errorf("cannot get dir FD for %s", dirPath) + } + defer unix.Close(dirFD) + if _, err := loadAttachCgroupDeviceFilter(insts, license, dirFD); err != nil { + if !canSkipEBPFError(r) { + return err + } + } + return nil +} diff --git a/file.go b/file.go new file mode 100644 index 0000000..c1b8f5c --- /dev/null +++ b/file.go @@ -0,0 +1,216 @@ +package cgroups + +import ( + "bytes" + "errors" + "fmt" + "os" + "path/filepath" + "strconv" + "strings" + "sync" + + "github.com/sirupsen/logrus" + "golang.org/x/sys/unix" +) + +// OpenFile opens a cgroup file in a given dir with given flags. +// It is supposed to be used for cgroup files only, and returns +// an error if the file is not a cgroup file. +// +// Arguments dir and file are joined together to form an absolute path +// to a file being opened. +func OpenFile(dir, file string, flags int) (*os.File, error) { + if dir == "" { + return nil, fmt.Errorf("no directory specified for %s", file) + } + return openFile(dir, file, flags) +} + +// ReadFile reads data from a cgroup file in dir. +// It is supposed to be used for cgroup files only. +func ReadFile(dir, file string) (string, error) { + fd, err := OpenFile(dir, file, unix.O_RDONLY) + if err != nil { + return "", err + } + defer fd.Close() + var buf bytes.Buffer + + _, err = buf.ReadFrom(fd) + return buf.String(), err +} + +// WriteFile writes data to a cgroup file in dir. +// It is supposed to be used for cgroup files only. 
+func WriteFile(dir, file, data string) error { + fd, err := OpenFile(dir, file, unix.O_WRONLY) + if err != nil { + return err + } + defer fd.Close() + if _, err := fd.WriteString(data); err != nil { + // Having data in the error message helps in debugging. + return fmt.Errorf("failed to write %q: %w", data, err) + } + return nil +} + +// WriteFileByLine is the same as WriteFile, except if data contains newlines, +// it is written line by line. +func WriteFileByLine(dir, file, data string) error { + i := strings.Index(data, "\n") + if i == -1 { + return WriteFile(dir, file, data) + } + + fd, err := OpenFile(dir, file, unix.O_WRONLY) + if err != nil { + return err + } + defer fd.Close() + start := 0 + for { + var line string + if i == -1 { + line = data[start:] + } else { + line = data[start : start+i+1] + } + _, err := fd.WriteString(line) + if err != nil { + return fmt.Errorf("failed to write %q: %w", line, err) + } + if i == -1 { + break + } + start += i + 1 + i = strings.Index(data[start:], "\n") + } + return nil +} + +const ( + cgroupfsDir = "/sys/fs/cgroup" + cgroupfsPrefix = cgroupfsDir + "/" +) + +var ( + // TestMode is set to true by unit tests that need "fake" cgroupfs. 
+ TestMode bool + + cgroupRootHandle *os.File + prepOnce sync.Once + prepErr error + resolveFlags uint64 +) + +func prepareOpenat2() error { + prepOnce.Do(func() { + fd, err := unix.Openat2(-1, cgroupfsDir, &unix.OpenHow{ + Flags: unix.O_DIRECTORY | unix.O_PATH | unix.O_CLOEXEC, + }) + if err != nil { + prepErr = &os.PathError{Op: "openat2", Path: cgroupfsDir, Err: err} + if err != unix.ENOSYS { + logrus.Warnf("falling back to securejoin: %s", prepErr) + } else { + logrus.Debug("openat2 not available, falling back to securejoin") + } + return + } + file := os.NewFile(uintptr(fd), cgroupfsDir) + + var st unix.Statfs_t + if err := unix.Fstatfs(int(file.Fd()), &st); err != nil { + prepErr = &os.PathError{Op: "statfs", Path: cgroupfsDir, Err: err} + logrus.Warnf("falling back to securejoin: %s", prepErr) + return + } + + cgroupRootHandle = file + resolveFlags = unix.RESOLVE_BENEATH | unix.RESOLVE_NO_MAGICLINKS + if st.Type == unix.CGROUP2_SUPER_MAGIC { + // cgroupv2 has a single mountpoint and no "cpu,cpuacct" symlinks + resolveFlags |= unix.RESOLVE_NO_XDEV | unix.RESOLVE_NO_SYMLINKS + } + }) + + return prepErr +} + +func openFile(dir, file string, flags int) (*os.File, error) { + mode := os.FileMode(0) + if TestMode && flags&os.O_WRONLY != 0 { + // "emulate" cgroup fs for unit tests + flags |= os.O_TRUNC | os.O_CREATE + mode = 0o600 + } + // NOTE it is important to use filepath.Clean("/"+file) here + // (see https://github.com/opencontainers/runc/issues/4103)! + path := filepath.Join(dir, filepath.Clean("/"+file)) + + if prepareOpenat2() != nil { + return openFallback(path, flags, mode) + } + relPath, ok := strings.CutPrefix(path, cgroupfsPrefix) + if !ok { // Non-standard path, old system? 
+ return openFallback(path, flags, mode) + } + + fd, err := unix.Openat2(int(cgroupRootHandle.Fd()), relPath, + &unix.OpenHow{ + Resolve: resolveFlags, + Flags: uint64(flags) | unix.O_CLOEXEC, + Mode: uint64(mode), + }) + if err != nil { + err = &os.PathError{Op: "openat2", Path: path, Err: err} + // Check if cgroupRootHandle is still opened to cgroupfsDir + // (happens when this package is incorrectly used + // across the chroot/pivot_root/mntns boundary, or + // when /sys/fs/cgroup is remounted). + // + // TODO: if such usage will ever be common, amend this + // to reopen cgroupRootHandle and retry openat2. + fdDest, fdErr := os.Readlink("/proc/thread-self/fd/" + strconv.Itoa(int(cgroupRootHandle.Fd()))) + if fdErr == nil && fdDest != cgroupfsDir { + // Wrap the error so it is clear that cgroupRootHandle + // is opened to an unexpected/wrong directory. + err = fmt.Errorf("cgroupRootHandle %d unexpectedly opened to %s != %s: %w", + cgroupRootHandle.Fd(), fdDest, cgroupfsDir, err) + } + return nil, err + } + + return os.NewFile(uintptr(fd), path), nil +} + +var errNotCgroupfs = errors.New("not a cgroup file") + +// Can be changed by unit tests. +var openFallback = openAndCheck + +// openAndCheck is used when openat2(2) is not available. It checks the opened +// file is on cgroupfs, returning an error otherwise. +func openAndCheck(path string, flags int, mode os.FileMode) (*os.File, error) { + fd, err := os.OpenFile(path, flags, mode) + if err != nil { + return nil, err + } + if TestMode { + return fd, nil + } + // Check this is a cgroupfs file. 
+ var st unix.Statfs_t + if err := unix.Fstatfs(int(fd.Fd()), &st); err != nil { + _ = fd.Close() + return nil, &os.PathError{Op: "statfs", Path: path, Err: err} + } + if st.Type != unix.CGROUP_SUPER_MAGIC && st.Type != unix.CGROUP2_SUPER_MAGIC { + _ = fd.Close() + return nil, &os.PathError{Op: "open", Path: path, Err: errNotCgroupfs} + } + + return fd, nil +} diff --git a/file_test.go b/file_test.go new file mode 100644 index 0000000..3a9fac3 --- /dev/null +++ b/file_test.go @@ -0,0 +1,93 @@ +package cgroups + +import ( + "errors" + "fmt" + "os" + "path/filepath" + "strconv" + "testing" + "time" +) + +func TestWriteCgroupFileHandlesInterrupt(t *testing.T) { + const ( + memoryCgroupMount = "/sys/fs/cgroup/memory" + memoryLimit = "memory.limit_in_bytes" + ) + if _, err := os.Stat(memoryCgroupMount); err != nil { + // most probably cgroupv2 + t.Skip(err) + } + + cgroupName := fmt.Sprintf("test-eint-%d", time.Now().Nanosecond()) + cgroupPath := filepath.Join(memoryCgroupMount, cgroupName) + if err := os.MkdirAll(cgroupPath, 0o755); err != nil { + t.Fatal(err) + } + defer os.RemoveAll(cgroupPath) + + if _, err := os.Stat(filepath.Join(cgroupPath, memoryLimit)); err != nil { + // either cgroupv2, or memory controller is not available + t.Skip(err) + } + + for i := 0; i < 100000; i++ { + limit := 1024*1024 + i + if err := WriteFile(cgroupPath, memoryLimit, strconv.Itoa(limit)); err != nil { + t.Fatalf("Failed to write %d on attempt %d: %+v", limit, i, err) + } + } +} + +func TestOpenat2(t *testing.T) { + if !IsCgroup2UnifiedMode() { + // The reason is many test cases below test opening files from + // the top-level directory, where cgroup v1 has no files. + t.Skip("test requires cgroup v2") + } + + // Make sure we test openat2, not its fallback. 
+ openFallback = func(_ string, _ int, _ os.FileMode) (*os.File, error) { + return nil, errors.New("fallback") + } + defer func() { openFallback = openAndCheck }() + + for _, tc := range []struct{ dir, file string }{ + {"/sys/fs/cgroup", "cgroup.controllers"}, + {"/sys/fs/cgroup", "/cgroup.controllers"}, + {"/sys/fs/cgroup/", "cgroup.controllers"}, + {"/sys/fs/cgroup/", "/cgroup.controllers"}, + {"/", "/sys/fs/cgroup/cgroup.controllers"}, + {"/", "sys/fs/cgroup/cgroup.controllers"}, + {"/sys/fs/cgroup/cgroup.controllers", ""}, + } { + fd, err := OpenFile(tc.dir, tc.file, os.O_RDONLY) + if err != nil { + t.Errorf("case %+v: %v", tc, err) + } + fd.Close() + } +} + +func BenchmarkWriteFile(b *testing.B) { + TestMode = true + defer func() { TestMode = false }() + + dir := b.TempDir() + tc := []string{ + "one", + "one\ntwo\nthree", + "10:200 foo=bar boo=far\n300:1200 something=other\ndefault 45000\n", + "\n\n\n\n\n\n\n\n", + } + + b.ResetTimer() + for i := 0; i < b.N; i++ { + for _, val := range tc { + if err := WriteFileByLine(dir, "file", val); err != nil { + b.Fatal(err) + } + } + } +} diff --git a/fs/blkio.go b/fs/blkio.go new file mode 100644 index 0000000..f3c4c5c --- /dev/null +++ b/fs/blkio.go @@ -0,0 +1,310 @@ +package fs + +import ( + "bufio" + "os" + "path/filepath" + "strconv" + "strings" + + "github.com/opencontainers/cgroups" +) + +type BlkioGroup struct { + weightFilename string + weightDeviceFilename string +} + +func (s *BlkioGroup) Name() string { + return "blkio" +} + +func (s *BlkioGroup) Apply(path string, _ *cgroups.Resources, pid int) error { + return apply(path, pid) +} + +func (s *BlkioGroup) Set(path string, r *cgroups.Resources) error { + s.detectWeightFilenames(path) + if r.BlkioWeight != 0 { + if err := cgroups.WriteFile(path, s.weightFilename, strconv.FormatUint(uint64(r.BlkioWeight), 10)); err != nil { + return err + } + } + + if r.BlkioLeafWeight != 0 { + if err := cgroups.WriteFile(path, "blkio.leaf_weight", 
strconv.FormatUint(uint64(r.BlkioLeafWeight), 10)); err != nil { + return err + } + } + for _, wd := range r.BlkioWeightDevice { + if wd.Weight != 0 { + if err := cgroups.WriteFile(path, s.weightDeviceFilename, wd.WeightString()); err != nil { + return err + } + } + if wd.LeafWeight != 0 { + if err := cgroups.WriteFile(path, "blkio.leaf_weight_device", wd.LeafWeightString()); err != nil { + return err + } + } + } + for _, td := range r.BlkioThrottleReadBpsDevice { + if err := cgroups.WriteFile(path, "blkio.throttle.read_bps_device", td.String()); err != nil { + return err + } + } + for _, td := range r.BlkioThrottleWriteBpsDevice { + if err := cgroups.WriteFile(path, "blkio.throttle.write_bps_device", td.String()); err != nil { + return err + } + } + for _, td := range r.BlkioThrottleReadIOPSDevice { + if err := cgroups.WriteFile(path, "blkio.throttle.read_iops_device", td.String()); err != nil { + return err + } + } + for _, td := range r.BlkioThrottleWriteIOPSDevice { + if err := cgroups.WriteFile(path, "blkio.throttle.write_iops_device", td.String()); err != nil { + return err + } + } + + return nil +} + +/* +examples: + + blkio.sectors + 8:0 6792 + + blkio.io_service_bytes + 8:0 Read 1282048 + 8:0 Write 2195456 + 8:0 Sync 2195456 + 8:0 Async 1282048 + 8:0 Total 3477504 + Total 3477504 + + blkio.io_serviced + 8:0 Read 124 + 8:0 Write 104 + 8:0 Sync 104 + 8:0 Async 124 + 8:0 Total 228 + Total 228 + + blkio.io_queued + 8:0 Read 0 + 8:0 Write 0 + 8:0 Sync 0 + 8:0 Async 0 + 8:0 Total 0 + Total 0 +*/ + +func splitBlkioStatLine(r rune) bool { + return r == ' ' || r == ':' +} + +func getBlkioStat(dir, file string) ([]cgroups.BlkioStatEntry, error) { + var blkioStats []cgroups.BlkioStatEntry + f, err := cgroups.OpenFile(dir, file, os.O_RDONLY) + if err != nil { + if os.IsNotExist(err) { + return blkioStats, nil + } + return nil, err + } + defer f.Close() + + sc := bufio.NewScanner(f) + for sc.Scan() { + // format: dev type amount + fields := 
strings.FieldsFunc(sc.Text(), splitBlkioStatLine) + if len(fields) < 3 { + if len(fields) == 2 && fields[0] == "Total" { + // skip total line + continue + } else { + return nil, malformedLine(dir, file, sc.Text()) + } + } + + v, err := strconv.ParseUint(fields[0], 10, 64) + if err != nil { + return nil, &parseError{Path: dir, File: file, Err: err} + } + major := v + + v, err = strconv.ParseUint(fields[1], 10, 64) + if err != nil { + return nil, &parseError{Path: dir, File: file, Err: err} + } + minor := v + + op := "" + valueField := 2 + if len(fields) == 4 { + op = fields[2] + valueField = 3 + } + v, err = strconv.ParseUint(fields[valueField], 10, 64) + if err != nil { + return nil, &parseError{Path: dir, File: file, Err: err} + } + blkioStats = append(blkioStats, cgroups.BlkioStatEntry{Major: major, Minor: minor, Op: op, Value: v}) + } + if err := sc.Err(); err != nil { + return nil, &parseError{Path: dir, File: file, Err: err} + } + + return blkioStats, nil +} + +func (s *BlkioGroup) GetStats(path string, stats *cgroups.Stats) error { + type blkioStatInfo struct { + filename string + blkioStatEntriesPtr *[]cgroups.BlkioStatEntry + } + bfqDebugStats := []blkioStatInfo{ + { + filename: "blkio.bfq.sectors_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.SectorsRecursive, + }, + { + filename: "blkio.bfq.io_service_time_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoServiceTimeRecursive, + }, + { + filename: "blkio.bfq.io_wait_time_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoWaitTimeRecursive, + }, + { + filename: "blkio.bfq.io_merged_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoMergedRecursive, + }, + { + filename: "blkio.bfq.io_queued_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoQueuedRecursive, + }, + { + filename: "blkio.bfq.time_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoTimeRecursive, + }, + { + filename: "blkio.bfq.io_serviced_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoServicedRecursive, + }, + 
{ + filename: "blkio.bfq.io_service_bytes_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoServiceBytesRecursive, + }, + } + bfqStats := []blkioStatInfo{ + { + filename: "blkio.bfq.io_serviced_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoServicedRecursive, + }, + { + filename: "blkio.bfq.io_service_bytes_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoServiceBytesRecursive, + }, + } + cfqStats := []blkioStatInfo{ + { + filename: "blkio.sectors_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.SectorsRecursive, + }, + { + filename: "blkio.io_service_time_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoServiceTimeRecursive, + }, + { + filename: "blkio.io_wait_time_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoWaitTimeRecursive, + }, + { + filename: "blkio.io_merged_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoMergedRecursive, + }, + { + filename: "blkio.io_queued_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoQueuedRecursive, + }, + { + filename: "blkio.time_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoTimeRecursive, + }, + { + filename: "blkio.io_serviced_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoServicedRecursive, + }, + { + filename: "blkio.io_service_bytes_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoServiceBytesRecursive, + }, + } + throttleRecursiveStats := []blkioStatInfo{ + { + filename: "blkio.throttle.io_serviced_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoServicedRecursive, + }, + { + filename: "blkio.throttle.io_service_bytes_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoServiceBytesRecursive, + }, + } + baseStats := []blkioStatInfo{ + { + filename: "blkio.throttle.io_serviced", + blkioStatEntriesPtr: &stats.BlkioStats.IoServicedRecursive, + }, + { + filename: "blkio.throttle.io_service_bytes", + blkioStatEntriesPtr: &stats.BlkioStats.IoServiceBytesRecursive, + }, + } + orderedStats := [][]blkioStatInfo{ + bfqDebugStats, + bfqStats, + cfqStats, 
+ throttleRecursiveStats, + baseStats, + } + + var blkioStats []cgroups.BlkioStatEntry + var err error + + for _, statGroup := range orderedStats { + for i, statInfo := range statGroup { + if blkioStats, err = getBlkioStat(path, statInfo.filename); err != nil || blkioStats == nil { + // if error occurs on first file, move to next group + if i == 0 { + break + } + return err + } + *statInfo.blkioStatEntriesPtr = blkioStats + // finish if all stats are gathered + if i == len(statGroup)-1 { + return nil + } + } + } + return nil +} + +func (s *BlkioGroup) detectWeightFilenames(path string) { + if s.weightFilename != "" { + // Already detected. + return + } + if cgroups.PathExists(filepath.Join(path, "blkio.weight")) { + s.weightFilename = "blkio.weight" + s.weightDeviceFilename = "blkio.weight_device" + } else { + s.weightFilename = "blkio.bfq.weight" + s.weightDeviceFilename = "blkio.bfq.weight_device" + } +} diff --git a/fs/blkio_test.go b/fs/blkio_test.go new file mode 100644 index 0000000..31aafab --- /dev/null +++ b/fs/blkio_test.go @@ -0,0 +1,862 @@ +package fs + +import ( + "strconv" + "testing" + + "github.com/opencontainers/cgroups" + "github.com/opencontainers/cgroups/fscommon" +) + +const ( + sectorsRecursiveContents = `8:0 1024` + sectorsRecursiveContentsBFQ = `8:0 2048` + serviceBytesRecursiveContents = `8:0 Read 100 +8:0 Write 200 +8:0 Sync 300 +8:0 Async 500 +8:0 Total 500 +Total 500` + + serviceBytesRecursiveContentsBFQ = `8:0 Read 1100 +8:0 Write 1200 +8:0 Sync 1300 +8:0 Async 1500 +8:0 Total 1500 +Total 1500` + servicedRecursiveContents = `8:0 Read 10 +8:0 Write 40 +8:0 Sync 20 +8:0 Async 30 +8:0 Total 50 +Total 50` + servicedRecursiveContentsBFQ = `8:0 Read 11 +8:0 Write 41 +8:0 Sync 21 +8:0 Async 31 +8:0 Total 51 +Total 51` + queuedRecursiveContents = `8:0 Read 1 +8:0 Write 4 +8:0 Sync 2 +8:0 Async 3 +8:0 Total 5 +Total 5` + queuedRecursiveContentsBFQ = `8:0 Read 2 +8:0 Write 3 +8:0 Sync 4 +8:0 Async 5 +8:0 Total 6 +Total 6` + 
serviceTimeRecursiveContents = `8:0 Read 173959 +8:0 Write 0 +8:0 Sync 0 +8:0 Async 173959 +8:0 Total 17395 +Total 17395` + serviceTimeRecursiveContentsBFQ = `8:0 Read 173959 +8:0 Write 0 +8:0 Sync 0 +8:0 Async 173 +8:0 Total 174 +Total 174` + waitTimeRecursiveContents = `8:0 Read 15571 +8:0 Write 0 +8:0 Sync 0 +8:0 Async 15571 +8:0 Total 15571` + waitTimeRecursiveContentsBFQ = `8:0 Read 1557 +8:0 Write 0 +8:0 Sync 0 +8:0 Async 1557 +8:0 Total 1557` + mergedRecursiveContents = `8:0 Read 5 +8:0 Write 10 +8:0 Sync 0 +8:0 Async 0 +8:0 Total 15 +Total 15` + mergedRecursiveContentsBFQ = `8:0 Read 51 +8:0 Write 101 +8:0 Sync 0 +8:0 Async 0 +8:0 Total 151 +Total 151` + timeRecursiveContents = `8:0 8` + timeRecursiveContentsBFQ = `8:0 16` + throttleServiceBytes = `8:0 Read 11030528 +8:0 Write 23 +8:0 Sync 42 +8:0 Async 11030528 +8:0 Total 11030528 +252:0 Read 11030528 +252:0 Write 23 +252:0 Sync 42 +252:0 Async 11030528 +252:0 Total 11030528 +Total 22061056` + throttleServiceBytesRecursive = `8:0 Read 110305281 +8:0 Write 231 +8:0 Sync 421 +8:0 Async 110305281 +8:0 Total 110305281 +252:0 Read 110305281 +252:0 Write 231 +252:0 Sync 421 +252:0 Async 110305281 +252:0 Total 110305281 +Total 220610561` + throttleServiced = `8:0 Read 164 +8:0 Write 23 +8:0 Sync 42 +8:0 Async 164 +8:0 Total 164 +252:0 Read 164 +252:0 Write 23 +252:0 Sync 42 +252:0 Async 164 +252:0 Total 164 +Total 328` + throttleServicedRecursive = `8:0 Read 1641 +8:0 Write 231 +8:0 Sync 421 +8:0 Async 1641 +8:0 Total 1641 +252:0 Read 1641 +252:0 Write 231 +252:0 Sync 421 +252:0 Async 1641 +252:0 Total 1641 +Total 3281` +) + +var blkioBFQDebugStatsTestFiles = map[string]string{ + "blkio.bfq.io_service_bytes_recursive": serviceBytesRecursiveContentsBFQ, + "blkio.bfq.io_serviced_recursive": servicedRecursiveContentsBFQ, + "blkio.bfq.io_queued_recursive": queuedRecursiveContentsBFQ, + "blkio.bfq.io_service_time_recursive": serviceTimeRecursiveContentsBFQ, + "blkio.bfq.io_wait_time_recursive": 
waitTimeRecursiveContentsBFQ, + "blkio.bfq.io_merged_recursive": mergedRecursiveContentsBFQ, + "blkio.bfq.time_recursive": timeRecursiveContentsBFQ, + "blkio.bfq.sectors_recursive": sectorsRecursiveContentsBFQ, +} + +var blkioBFQStatsTestFiles = map[string]string{ + "blkio.bfq.io_service_bytes_recursive": serviceBytesRecursiveContentsBFQ, + "blkio.bfq.io_serviced_recursive": servicedRecursiveContentsBFQ, +} + +var blkioCFQStatsTestFiles = map[string]string{ + "blkio.io_service_bytes_recursive": serviceBytesRecursiveContents, + "blkio.io_serviced_recursive": servicedRecursiveContents, + "blkio.io_queued_recursive": queuedRecursiveContents, + "blkio.io_service_time_recursive": serviceTimeRecursiveContents, + "blkio.io_wait_time_recursive": waitTimeRecursiveContents, + "blkio.io_merged_recursive": mergedRecursiveContents, + "blkio.time_recursive": timeRecursiveContents, + "blkio.sectors_recursive": sectorsRecursiveContents, +} + +type blkioStatFailureTestCase struct { + desc string + filename string +} + +func appendBlkioStatEntry(blkioStatEntries *[]cgroups.BlkioStatEntry, major, minor, value uint64, op string) { //nolint:unparam + *blkioStatEntries = append(*blkioStatEntries, cgroups.BlkioStatEntry{Major: major, Minor: minor, Value: value, Op: op}) +} + +func TestBlkioSetWeight(t *testing.T) { + const ( + weightBefore = 100 + weightAfter = 200 + ) + + for _, legacyIOScheduler := range []bool{false, true} { + // Populate cgroup + path := tempDir(t, "blkio") + weightFilename := "blkio.bfq.weight" + if legacyIOScheduler { + weightFilename = "blkio.weight" + } + writeFileContents(t, path, map[string]string{ + weightFilename: strconv.Itoa(weightBefore), + }) + // Apply new configuration + r := &cgroups.Resources{ + BlkioWeight: weightAfter, + } + blkio := &BlkioGroup{} + if err := blkio.Set(path, r); err != nil { + t.Fatal(err) + } + // Verify results + if weightFilename != blkio.weightFilename { + t.Fatalf("weight filename detection failed: expected %q, detected %q", 
weightFilename, blkio.weightFilename) + } + value, err := fscommon.GetCgroupParamUint(path, weightFilename) + if err != nil { + t.Fatal(err) + } + if value != weightAfter { + t.Fatalf("Got the wrong value, set %s failed.", weightFilename) + } + } +} + +func TestBlkioSetWeightDevice(t *testing.T) { + const ( + weightDeviceBefore = "8:0 400" + ) + + for _, legacyIOScheduler := range []bool{false, true} { + // Populate cgroup + path := tempDir(t, "blkio") + weightFilename := "blkio.bfq.weight" + weightDeviceFilename := "blkio.bfq.weight_device" + if legacyIOScheduler { + weightFilename = "blkio.weight" + weightDeviceFilename = "blkio.weight_device" + } + writeFileContents(t, path, map[string]string{ + weightFilename: "", + weightDeviceFilename: weightDeviceBefore, + }) + // Apply new configuration + wd := cgroups.NewWeightDevice(8, 0, 500, 0) + weightDeviceAfter := wd.WeightString() + r := &cgroups.Resources{ + BlkioWeightDevice: []*cgroups.WeightDevice{wd}, + } + blkio := &BlkioGroup{} + if err := blkio.Set(path, r); err != nil { + t.Fatal(err) + } + // Verify results + if weightDeviceFilename != blkio.weightDeviceFilename { + t.Fatalf("weight_device filename detection failed: expected %q, detected %q", weightDeviceFilename, blkio.weightDeviceFilename) + } + value, err := fscommon.GetCgroupParamString(path, weightDeviceFilename) + if err != nil { + t.Fatal(err) + } + if value != weightDeviceAfter { + t.Fatalf("Got the wrong value, set %s failed.", weightDeviceFilename) + } + } +} + +// regression #274 +func TestBlkioSetMultipleWeightDevice(t *testing.T) { + path := tempDir(t, "blkio") + + const ( + weightDeviceBefore = "8:0 400" + ) + + wd1 := cgroups.NewWeightDevice(8, 0, 500, 0) + wd2 := cgroups.NewWeightDevice(8, 16, 500, 0) + // we cannot actually set and check both because normal os.WriteFile + // when writing to cgroup file will overwrite the whole file content instead + // of updating it as the kernel is doing. 
Just check the second device + // is present will suffice for the test to ensure multiple writes are done. + weightDeviceAfter := wd2.WeightString() + + blkio := &BlkioGroup{} + blkio.detectWeightFilenames(path) + if blkio.weightDeviceFilename != "blkio.bfq.weight_device" { + t.Fatalf("when blkio controller is unavailable, expected to use \"blkio.bfq.weight_device\", tried to use %q", blkio.weightDeviceFilename) + } + writeFileContents(t, path, map[string]string{ + blkio.weightDeviceFilename: weightDeviceBefore, + }) + + r := &cgroups.Resources{ + BlkioWeightDevice: []*cgroups.WeightDevice{wd1, wd2}, + } + if err := blkio.Set(path, r); err != nil { + t.Fatal(err) + } + + value, err := fscommon.GetCgroupParamString(path, blkio.weightDeviceFilename) + if err != nil { + t.Fatal(err) + } + if value != weightDeviceAfter { + t.Fatalf("Got the wrong value, set %s failed.", blkio.weightDeviceFilename) + } +} + +func TestBlkioBFQDebugStats(t *testing.T) { + path := tempDir(t, "blkio") + writeFileContents(t, path, blkioBFQDebugStatsTestFiles) + blkio := &BlkioGroup{} + actualStats := *cgroups.NewStats() + err := blkio.GetStats(path, &actualStats) + if err != nil { + t.Fatal(err) + } + + expectedStats := cgroups.BlkioStats{} + appendBlkioStatEntry(&expectedStats.SectorsRecursive, 8, 0, 2048, "") + + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 1100, "Read") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 1200, "Write") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 1300, "Sync") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 1500, "Async") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 1500, "Total") + + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 11, "Read") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 41, "Write") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 21, "Sync") + 
appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 31, "Async") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 51, "Total") + + appendBlkioStatEntry(&expectedStats.IoQueuedRecursive, 8, 0, 2, "Read") + appendBlkioStatEntry(&expectedStats.IoQueuedRecursive, 8, 0, 3, "Write") + appendBlkioStatEntry(&expectedStats.IoQueuedRecursive, 8, 0, 4, "Sync") + appendBlkioStatEntry(&expectedStats.IoQueuedRecursive, 8, 0, 5, "Async") + appendBlkioStatEntry(&expectedStats.IoQueuedRecursive, 8, 0, 6, "Total") + + appendBlkioStatEntry(&expectedStats.IoServiceTimeRecursive, 8, 0, 173959, "Read") + appendBlkioStatEntry(&expectedStats.IoServiceTimeRecursive, 8, 0, 0, "Write") + appendBlkioStatEntry(&expectedStats.IoServiceTimeRecursive, 8, 0, 0, "Sync") + appendBlkioStatEntry(&expectedStats.IoServiceTimeRecursive, 8, 0, 173, "Async") + appendBlkioStatEntry(&expectedStats.IoServiceTimeRecursive, 8, 0, 174, "Total") + + appendBlkioStatEntry(&expectedStats.IoWaitTimeRecursive, 8, 0, 1557, "Read") + appendBlkioStatEntry(&expectedStats.IoWaitTimeRecursive, 8, 0, 0, "Write") + appendBlkioStatEntry(&expectedStats.IoWaitTimeRecursive, 8, 0, 0, "Sync") + appendBlkioStatEntry(&expectedStats.IoWaitTimeRecursive, 8, 0, 1557, "Async") + appendBlkioStatEntry(&expectedStats.IoWaitTimeRecursive, 8, 0, 1557, "Total") + + appendBlkioStatEntry(&expectedStats.IoMergedRecursive, 8, 0, 51, "Read") + appendBlkioStatEntry(&expectedStats.IoMergedRecursive, 8, 0, 101, "Write") + appendBlkioStatEntry(&expectedStats.IoMergedRecursive, 8, 0, 0, "Sync") + appendBlkioStatEntry(&expectedStats.IoMergedRecursive, 8, 0, 0, "Async") + appendBlkioStatEntry(&expectedStats.IoMergedRecursive, 8, 0, 151, "Total") + + appendBlkioStatEntry(&expectedStats.IoTimeRecursive, 8, 0, 16, "") + + expectBlkioStatsEquals(t, expectedStats, actualStats.BlkioStats) +} + +func TestBlkioMultipleStatsFiles(t *testing.T) { + path := tempDir(t, "blkio") + writeFileContents(t, path, blkioBFQDebugStatsTestFiles) + 
writeFileContents(t, path, blkioCFQStatsTestFiles) + blkio := &BlkioGroup{} + actualStats := *cgroups.NewStats() + err := blkio.GetStats(path, &actualStats) + if err != nil { + t.Fatal(err) + } + + expectedStats := cgroups.BlkioStats{} + appendBlkioStatEntry(&expectedStats.SectorsRecursive, 8, 0, 2048, "") + + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 1100, "Read") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 1200, "Write") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 1300, "Sync") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 1500, "Async") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 1500, "Total") + + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 11, "Read") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 41, "Write") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 21, "Sync") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 31, "Async") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 51, "Total") + + appendBlkioStatEntry(&expectedStats.IoQueuedRecursive, 8, 0, 2, "Read") + appendBlkioStatEntry(&expectedStats.IoQueuedRecursive, 8, 0, 3, "Write") + appendBlkioStatEntry(&expectedStats.IoQueuedRecursive, 8, 0, 4, "Sync") + appendBlkioStatEntry(&expectedStats.IoQueuedRecursive, 8, 0, 5, "Async") + appendBlkioStatEntry(&expectedStats.IoQueuedRecursive, 8, 0, 6, "Total") + + appendBlkioStatEntry(&expectedStats.IoServiceTimeRecursive, 8, 0, 173959, "Read") + appendBlkioStatEntry(&expectedStats.IoServiceTimeRecursive, 8, 0, 0, "Write") + appendBlkioStatEntry(&expectedStats.IoServiceTimeRecursive, 8, 0, 0, "Sync") + appendBlkioStatEntry(&expectedStats.IoServiceTimeRecursive, 8, 0, 173, "Async") + appendBlkioStatEntry(&expectedStats.IoServiceTimeRecursive, 8, 0, 174, "Total") + + appendBlkioStatEntry(&expectedStats.IoWaitTimeRecursive, 8, 0, 1557, "Read") + 
appendBlkioStatEntry(&expectedStats.IoWaitTimeRecursive, 8, 0, 0, "Write") + appendBlkioStatEntry(&expectedStats.IoWaitTimeRecursive, 8, 0, 0, "Sync") + appendBlkioStatEntry(&expectedStats.IoWaitTimeRecursive, 8, 0, 1557, "Async") + appendBlkioStatEntry(&expectedStats.IoWaitTimeRecursive, 8, 0, 1557, "Total") + + appendBlkioStatEntry(&expectedStats.IoMergedRecursive, 8, 0, 51, "Read") + appendBlkioStatEntry(&expectedStats.IoMergedRecursive, 8, 0, 101, "Write") + appendBlkioStatEntry(&expectedStats.IoMergedRecursive, 8, 0, 0, "Sync") + appendBlkioStatEntry(&expectedStats.IoMergedRecursive, 8, 0, 0, "Async") + appendBlkioStatEntry(&expectedStats.IoMergedRecursive, 8, 0, 151, "Total") + + appendBlkioStatEntry(&expectedStats.IoTimeRecursive, 8, 0, 16, "") + + expectBlkioStatsEquals(t, expectedStats, actualStats.BlkioStats) +} + +func TestBlkioBFQStats(t *testing.T) { + path := tempDir(t, "blkio") + writeFileContents(t, path, blkioBFQStatsTestFiles) + blkio := &BlkioGroup{} + actualStats := *cgroups.NewStats() + err := blkio.GetStats(path, &actualStats) + if err != nil { + t.Fatal(err) + } + + expectedStats := cgroups.BlkioStats{} + + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 1100, "Read") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 1200, "Write") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 1300, "Sync") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 1500, "Async") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 1500, "Total") + + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 11, "Read") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 41, "Write") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 21, "Sync") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 31, "Async") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 51, "Total") + + expectBlkioStatsEquals(t, expectedStats, 
actualStats.BlkioStats) +} + +func TestBlkioStatsNoFilesBFQDebug(t *testing.T) { + if testing.Short() { + t.Skip("skipping test in short mode.") + } + testCases := []blkioStatFailureTestCase{ + { + desc: "missing blkio.bfq.io_service_bytes_recursive file", + filename: "blkio.bfq.io_service_bytes_recursive", + }, + { + desc: "missing blkio.bfq.io_serviced_recursive file", + filename: "blkio.bfq.io_serviced_recursive", + }, + { + desc: "missing blkio.bfq.io_queued_recursive file", + filename: "blkio.bfq.io_queued_recursive", + }, + { + desc: "missing blkio.bfq.sectors_recursive file", + filename: "blkio.bfq.sectors_recursive", + }, + { + desc: "missing blkio.bfq.io_service_time_recursive file", + filename: "blkio.bfq.io_service_time_recursive", + }, + { + desc: "missing blkio.bfq.io_wait_time_recursive file", + filename: "blkio.bfq.io_wait_time_recursive", + }, + { + desc: "missing blkio.bfq.io_merged_recursive file", + filename: "blkio.bfq.io_merged_recursive", + }, + { + desc: "missing blkio.bfq.time_recursive file", + filename: "blkio.bfq.time_recursive", + }, + } + + for _, testCase := range testCases { + path := tempDir(t, "cpuset") + + tempBlkioTestFiles := map[string]string{} + for i, v := range blkioBFQDebugStatsTestFiles { + tempBlkioTestFiles[i] = v + } + delete(tempBlkioTestFiles, testCase.filename) + + writeFileContents(t, path, tempBlkioTestFiles) + cpuset := &CpusetGroup{} + actualStats := *cgroups.NewStats() + err := cpuset.GetStats(path, &actualStats) + if err != nil { + t.Errorf("%s: want no error, got: %+v", testCase.desc, err) + } + } +} + +func TestBlkioCFQStats(t *testing.T) { + path := tempDir(t, "blkio") + writeFileContents(t, path, blkioCFQStatsTestFiles) + + blkio := &BlkioGroup{} + actualStats := *cgroups.NewStats() + err := blkio.GetStats(path, &actualStats) + if err != nil { + t.Fatal(err) + } + + // Verify expected stats. 
+ expectedStats := cgroups.BlkioStats{} + appendBlkioStatEntry(&expectedStats.SectorsRecursive, 8, 0, 1024, "") + + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 100, "Read") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 200, "Write") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 300, "Sync") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 500, "Async") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 500, "Total") + + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 10, "Read") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 40, "Write") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 20, "Sync") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 30, "Async") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 50, "Total") + + appendBlkioStatEntry(&expectedStats.IoQueuedRecursive, 8, 0, 1, "Read") + appendBlkioStatEntry(&expectedStats.IoQueuedRecursive, 8, 0, 4, "Write") + appendBlkioStatEntry(&expectedStats.IoQueuedRecursive, 8, 0, 2, "Sync") + appendBlkioStatEntry(&expectedStats.IoQueuedRecursive, 8, 0, 3, "Async") + appendBlkioStatEntry(&expectedStats.IoQueuedRecursive, 8, 0, 5, "Total") + + appendBlkioStatEntry(&expectedStats.IoServiceTimeRecursive, 8, 0, 173959, "Read") + appendBlkioStatEntry(&expectedStats.IoServiceTimeRecursive, 8, 0, 0, "Write") + appendBlkioStatEntry(&expectedStats.IoServiceTimeRecursive, 8, 0, 0, "Sync") + appendBlkioStatEntry(&expectedStats.IoServiceTimeRecursive, 8, 0, 173959, "Async") + appendBlkioStatEntry(&expectedStats.IoServiceTimeRecursive, 8, 0, 17395, "Total") + + appendBlkioStatEntry(&expectedStats.IoWaitTimeRecursive, 8, 0, 15571, "Read") + appendBlkioStatEntry(&expectedStats.IoWaitTimeRecursive, 8, 0, 0, "Write") + appendBlkioStatEntry(&expectedStats.IoWaitTimeRecursive, 8, 0, 0, "Sync") + appendBlkioStatEntry(&expectedStats.IoWaitTimeRecursive, 
8, 0, 15571, "Async") + appendBlkioStatEntry(&expectedStats.IoWaitTimeRecursive, 8, 0, 15571, "Total") + + appendBlkioStatEntry(&expectedStats.IoMergedRecursive, 8, 0, 5, "Read") + appendBlkioStatEntry(&expectedStats.IoMergedRecursive, 8, 0, 10, "Write") + appendBlkioStatEntry(&expectedStats.IoMergedRecursive, 8, 0, 0, "Sync") + appendBlkioStatEntry(&expectedStats.IoMergedRecursive, 8, 0, 0, "Async") + appendBlkioStatEntry(&expectedStats.IoMergedRecursive, 8, 0, 15, "Total") + + appendBlkioStatEntry(&expectedStats.IoTimeRecursive, 8, 0, 8, "") + + expectBlkioStatsEquals(t, expectedStats, actualStats.BlkioStats) +} + +func TestBlkioStatsNoFilesCFQ(t *testing.T) { + if testing.Short() { + t.Skip("skipping test in short mode.") + } + testCases := []blkioStatFailureTestCase{ + { + desc: "missing blkio.io_service_bytes_recursive file", + filename: "blkio.io_service_bytes_recursive", + }, + { + desc: "missing blkio.io_serviced_recursive file", + filename: "blkio.io_serviced_recursive", + }, + { + desc: "missing blkio.io_queued_recursive file", + filename: "blkio.io_queued_recursive", + }, + { + desc: "missing blkio.sectors_recursive file", + filename: "blkio.sectors_recursive", + }, + { + desc: "missing blkio.io_service_time_recursive file", + filename: "blkio.io_service_time_recursive", + }, + { + desc: "missing blkio.io_wait_time_recursive file", + filename: "blkio.io_wait_time_recursive", + }, + { + desc: "missing blkio.io_merged_recursive file", + filename: "blkio.io_merged_recursive", + }, + { + desc: "missing blkio.time_recursive file", + filename: "blkio.time_recursive", + }, + } + + // Each iteration removes one blkio.* stat file and checks that + // BlkioGroup.GetStats still succeeds (missing files are tolerated). + for _, testCase := range testCases { + path := tempDir(t, "blkio") + + tempBlkioTestFiles := map[string]string{} + for i, v := range blkioCFQStatsTestFiles { + tempBlkioTestFiles[i] = v + } + delete(tempBlkioTestFiles, testCase.filename) + + writeFileContents(t, path, tempBlkioTestFiles) + blkio := &BlkioGroup{} + actualStats := *cgroups.NewStats() + err := blkio.GetStats(path, 
&actualStats) + if err != nil { + t.Errorf("%s: want no error, got %+v", testCase.desc, err) + } + } +} + +func TestBlkioStatsUnexpectedNumberOfFields(t *testing.T) { + path := tempDir(t, "blkio") + writeFileContents(t, path, map[string]string{ + "blkio.io_service_bytes_recursive": "8:0 Read 100 100", + "blkio.io_serviced_recursive": servicedRecursiveContents, + "blkio.io_queued_recursive": queuedRecursiveContents, + "blkio.sectors_recursive": sectorsRecursiveContents, + "blkio.io_service_time_recursive": serviceTimeRecursiveContents, + "blkio.io_wait_time_recursive": waitTimeRecursiveContents, + "blkio.io_merged_recursive": mergedRecursiveContents, + "blkio.time_recursive": timeRecursiveContents, + }) + + blkio := &BlkioGroup{} + actualStats := *cgroups.NewStats() + err := blkio.GetStats(path, &actualStats) + if err == nil { + t.Fatal("Expected to fail, but did not") + } +} + +func TestBlkioStatsUnexpectedFieldType(t *testing.T) { + path := tempDir(t, "blkio") + writeFileContents(t, path, map[string]string{ + "blkio.io_service_bytes_recursive": "8:0 Read Write", + "blkio.io_serviced_recursive": servicedRecursiveContents, + "blkio.io_queued_recursive": queuedRecursiveContents, + "blkio.sectors_recursive": sectorsRecursiveContents, + "blkio.io_service_time_recursive": serviceTimeRecursiveContents, + "blkio.io_wait_time_recursive": waitTimeRecursiveContents, + "blkio.io_merged_recursive": mergedRecursiveContents, + "blkio.time_recursive": timeRecursiveContents, + }) + + blkio := &BlkioGroup{} + actualStats := *cgroups.NewStats() + err := blkio.GetStats(path, &actualStats) + if err == nil { + t.Fatal("Expected to fail, but did not") + } +} + +func TestThrottleRecursiveBlkioStats(t *testing.T) { + path := tempDir(t, "blkio") + writeFileContents(t, path, map[string]string{ + "blkio.io_service_bytes_recursive": "", + "blkio.io_serviced_recursive": "", + "blkio.io_queued_recursive": "", + "blkio.sectors_recursive": "", + "blkio.io_service_time_recursive": "", + 
"blkio.io_wait_time_recursive": "", + "blkio.io_merged_recursive": "", + "blkio.time_recursive": "", + "blkio.throttle.io_service_bytes_recursive": throttleServiceBytesRecursive, + "blkio.throttle.io_serviced_recursive": throttleServicedRecursive, + }) + + blkio := &BlkioGroup{} + actualStats := *cgroups.NewStats() + err := blkio.GetStats(path, &actualStats) + if err != nil { + t.Fatal(err) + } + + // Verify expected stats. + expectedStats := cgroups.BlkioStats{} + + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 110305281, "Read") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 231, "Write") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 421, "Sync") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 110305281, "Async") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 110305281, "Total") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 252, 0, 110305281, "Read") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 252, 0, 231, "Write") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 252, 0, 421, "Sync") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 252, 0, 110305281, "Async") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 252, 0, 110305281, "Total") + + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 1641, "Read") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 231, "Write") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 421, "Sync") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 1641, "Async") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 1641, "Total") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 252, 0, 1641, "Read") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 252, 0, 231, "Write") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 252, 0, 421, "Sync") + 
appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 252, 0, 1641, "Async") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 252, 0, 1641, "Total") + + expectBlkioStatsEquals(t, expectedStats, actualStats.BlkioStats) +} + +func TestThrottleBlkioStats(t *testing.T) { + path := tempDir(t, "blkio") + writeFileContents(t, path, map[string]string{ + "blkio.io_service_bytes_recursive": "", + "blkio.io_serviced_recursive": "", + "blkio.io_queued_recursive": "", + "blkio.sectors_recursive": "", + "blkio.io_service_time_recursive": "", + "blkio.io_wait_time_recursive": "", + "blkio.io_merged_recursive": "", + "blkio.time_recursive": "", + "blkio.throttle.io_service_bytes": throttleServiceBytes, + "blkio.throttle.io_serviced": throttleServiced, + }) + + blkio := &BlkioGroup{} + actualStats := *cgroups.NewStats() + err := blkio.GetStats(path, &actualStats) + if err != nil { + t.Fatal(err) + } + + // Verify expected stats. + expectedStats := cgroups.BlkioStats{} + + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 11030528, "Read") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 23, "Write") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 42, "Sync") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 11030528, "Async") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 11030528, "Total") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 252, 0, 11030528, "Read") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 252, 0, 23, "Write") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 252, 0, 42, "Sync") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 252, 0, 11030528, "Async") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 252, 0, 11030528, "Total") + + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 164, "Read") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 23, 
"Write") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 42, "Sync") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 164, "Async") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 164, "Total") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 252, 0, 164, "Read") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 252, 0, 23, "Write") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 252, 0, 42, "Sync") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 252, 0, 164, "Async") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 252, 0, 164, "Total") + + expectBlkioStatsEquals(t, expectedStats, actualStats.BlkioStats) +} + +func TestBlkioSetThrottleReadBpsDevice(t *testing.T) { + path := tempDir(t, "blkio") + + const ( + throttleBefore = `8:0 1024` + ) + + td := cgroups.NewThrottleDevice(8, 0, 2048) + throttleAfter := td.String() + + writeFileContents(t, path, map[string]string{ + "blkio.throttle.read_bps_device": throttleBefore, + }) + + r := &cgroups.Resources{ + BlkioThrottleReadBpsDevice: []*cgroups.ThrottleDevice{td}, + } + blkio := &BlkioGroup{} + if err := blkio.Set(path, r); err != nil { + t.Fatal(err) + } + + value, err := fscommon.GetCgroupParamString(path, "blkio.throttle.read_bps_device") + if err != nil { + t.Fatal(err) + } + if value != throttleAfter { + t.Fatal("Got the wrong value, set blkio.throttle.read_bps_device failed.") + } +} + +func TestBlkioSetThrottleWriteBpsDevice(t *testing.T) { + path := tempDir(t, "blkio") + + const ( + throttleBefore = `8:0 1024` + ) + + td := cgroups.NewThrottleDevice(8, 0, 2048) + throttleAfter := td.String() + + writeFileContents(t, path, map[string]string{ + "blkio.throttle.write_bps_device": throttleBefore, + }) + + r := &cgroups.Resources{ + BlkioThrottleWriteBpsDevice: []*cgroups.ThrottleDevice{td}, + } + blkio := &BlkioGroup{} + if err := blkio.Set(path, r); err != nil { + t.Fatal(err) + } + + value, err := 
fscommon.GetCgroupParamString(path, "blkio.throttle.write_bps_device") + if err != nil { + t.Fatal(err) + } + if value != throttleAfter { + t.Fatal("Got the wrong value, set blkio.throttle.write_bps_device failed.") + } +} + +func TestBlkioSetThrottleReadIOpsDevice(t *testing.T) { + path := tempDir(t, "blkio") + + const ( + throttleBefore = `8:0 1024` + ) + + td := cgroups.NewThrottleDevice(8, 0, 2048) + throttleAfter := td.String() + + writeFileContents(t, path, map[string]string{ + "blkio.throttle.read_iops_device": throttleBefore, + }) + + r := &cgroups.Resources{ + BlkioThrottleReadIOPSDevice: []*cgroups.ThrottleDevice{td}, + } + blkio := &BlkioGroup{} + if err := blkio.Set(path, r); err != nil { + t.Fatal(err) + } + + value, err := fscommon.GetCgroupParamString(path, "blkio.throttle.read_iops_device") + if err != nil { + t.Fatal(err) + } + if value != throttleAfter { + t.Fatal("Got the wrong value, set blkio.throttle.read_iops_device failed.") + } +} + +func TestBlkioSetThrottleWriteIOpsDevice(t *testing.T) { + path := tempDir(t, "blkio") + + const ( + throttleBefore = `8:0 1024` + ) + + td := cgroups.NewThrottleDevice(8, 0, 2048) + throttleAfter := td.String() + + writeFileContents(t, path, map[string]string{ + "blkio.throttle.write_iops_device": throttleBefore, + }) + + r := &cgroups.Resources{ + BlkioThrottleWriteIOPSDevice: []*cgroups.ThrottleDevice{td}, + } + blkio := &BlkioGroup{} + if err := blkio.Set(path, r); err != nil { + t.Fatal(err) + } + + value, err := fscommon.GetCgroupParamString(path, "blkio.throttle.write_iops_device") + if err != nil { + t.Fatal(err) + } + if value != throttleAfter { + t.Fatal("Got the wrong value, set blkio.throttle.write_iops_device failed.") + } +} diff --git a/fs/cpu.go b/fs/cpu.go new file mode 100644 index 0000000..3e05788 --- /dev/null +++ b/fs/cpu.go @@ -0,0 +1,181 @@ +package fs + +import ( + "bufio" + "errors" + "fmt" + "os" + "strconv" + + "github.com/opencontainers/cgroups" + 
"github.com/opencontainers/cgroups/fscommon" + "golang.org/x/sys/unix" +) + +type CpuGroup struct{} + +func (s *CpuGroup) Name() string { + return "cpu" +} + +func (s *CpuGroup) Apply(path string, r *cgroups.Resources, pid int) error { + if err := os.MkdirAll(path, 0o755); err != nil { + return err + } + // We should set the real-Time group scheduling settings before moving + // in the process because if the process is already in SCHED_RR mode + // and no RT bandwidth is set, adding it will fail. + if err := s.SetRtSched(path, r); err != nil { + return err + } + // Since we are not using apply(), we need to place the pid + // into the procs file. + return cgroups.WriteCgroupProc(path, pid) +} + +func (s *CpuGroup) SetRtSched(path string, r *cgroups.Resources) error { + var period string + if r.CpuRtPeriod != 0 { + period = strconv.FormatUint(r.CpuRtPeriod, 10) + if err := cgroups.WriteFile(path, "cpu.rt_period_us", period); err != nil { + // The values of cpu.rt_period_us and cpu.rt_runtime_us + // are inter-dependent and need to be set in a proper order. + // If the kernel rejects the new period value with EINVAL + // and the new runtime value is also being set, let's + // ignore the error for now and retry later. + if !errors.Is(err, unix.EINVAL) || r.CpuRtRuntime == 0 { + return err + } + } else { + period = "" + } + } + if r.CpuRtRuntime != 0 { + if err := cgroups.WriteFile(path, "cpu.rt_runtime_us", strconv.FormatInt(r.CpuRtRuntime, 10)); err != nil { + return err + } + if period != "" { + if err := cgroups.WriteFile(path, "cpu.rt_period_us", period); err != nil { + return err + } + } + } + return nil +} + +func (s *CpuGroup) Set(path string, r *cgroups.Resources) error { + if r.CpuShares != 0 { + shares := r.CpuShares + if err := cgroups.WriteFile(path, "cpu.shares", strconv.FormatUint(shares, 10)); err != nil { + return err + } + // read it back + sharesRead, err := fscommon.GetCgroupParamUint(path, "cpu.shares") + if err != nil { + return err + } + // ... 
and check + if shares > sharesRead { + return fmt.Errorf("the maximum allowed cpu-shares is %d", sharesRead) + } else if shares < sharesRead { + return fmt.Errorf("the minimum allowed cpu-shares is %d", sharesRead) + } + } + + var period string + if r.CpuPeriod != 0 { + period = strconv.FormatUint(r.CpuPeriod, 10) + if err := cgroups.WriteFile(path, "cpu.cfs_period_us", period); err != nil { + // Sometimes when the period to be set is smaller + // than the current one, it is rejected by the kernel + // (EINVAL) as old_quota/new_period exceeds the parent + // cgroup quota limit. If this happens and the quota is + // going to be set, ignore the error for now and retry + // after setting the quota. + if !errors.Is(err, unix.EINVAL) || r.CpuQuota == 0 { + return err + } + } else { + period = "" + } + } + + var burst string + if r.CpuBurst != nil { + burst = strconv.FormatUint(*r.CpuBurst, 10) + if err := cgroups.WriteFile(path, "cpu.cfs_burst_us", burst); err != nil { + if errors.Is(err, unix.ENOENT) { + // If CPU burst knob is not available (e.g. + // older kernel), ignore it. + burst = "" + } else { + // Sometimes when the burst to be set is larger + // than the current one, it is rejected by the kernel + // (EINVAL) as old_quota/new_burst exceeds the parent + // cgroup quota limit. If this happens and the quota is + // going to be set, ignore the error for now and retry + // after setting the quota. 
+ if !errors.Is(err, unix.EINVAL) || r.CpuQuota == 0 { + return err + } + } + } else { + burst = "" + } + } + if r.CpuQuota != 0 { + if err := cgroups.WriteFile(path, "cpu.cfs_quota_us", strconv.FormatInt(r.CpuQuota, 10)); err != nil { + return err + } + if period != "" { + if err := cgroups.WriteFile(path, "cpu.cfs_period_us", period); err != nil { + return err + } + } + if burst != "" { + if err := cgroups.WriteFile(path, "cpu.cfs_burst_us", burst); err != nil { + return err + } + } + } + + if r.CPUIdle != nil { + idle := strconv.FormatInt(*r.CPUIdle, 10) + if err := cgroups.WriteFile(path, "cpu.idle", idle); err != nil { + return err + } + } + + return s.SetRtSched(path, r) +} + +func (s *CpuGroup) GetStats(path string, stats *cgroups.Stats) error { + const file = "cpu.stat" + f, err := cgroups.OpenFile(path, file, os.O_RDONLY) + if err != nil { + if os.IsNotExist(err) { + return nil + } + return err + } + defer f.Close() + + sc := bufio.NewScanner(f) + for sc.Scan() { + t, v, err := fscommon.ParseKeyValue(sc.Text()) + if err != nil { + return &parseError{Path: path, File: file, Err: err} + } + switch t { + case "nr_periods": + stats.CpuStats.ThrottlingData.Periods = v + + case "nr_throttled": + stats.CpuStats.ThrottlingData.ThrottledPeriods = v + + case "throttled_time": + stats.CpuStats.ThrottlingData.ThrottledTime = v + } + } + return nil +} diff --git a/fs/cpu_test.go b/fs/cpu_test.go new file mode 100644 index 0000000..a2b64c3 --- /dev/null +++ b/fs/cpu_test.go @@ -0,0 +1,226 @@ +package fs + +import ( + "fmt" + "strconv" + "testing" + + "github.com/opencontainers/cgroups" + "github.com/opencontainers/cgroups/fscommon" +) + +func TestCpuSetShares(t *testing.T) { + path := tempDir(t, "cpu") + + const ( + sharesBefore = 1024 + sharesAfter = 512 + ) + + writeFileContents(t, path, map[string]string{ + "cpu.shares": strconv.Itoa(sharesBefore), + }) + + r := &cgroups.Resources{ + CpuShares: sharesAfter, + } + cpu := &CpuGroup{} + if err := cpu.Set(path, r); err 
!= nil { + t.Fatal(err) + } + + value, err := fscommon.GetCgroupParamUint(path, "cpu.shares") + if err != nil { + t.Fatal(err) + } + if value != sharesAfter { + t.Fatal("Got the wrong value, set cpu.shares failed.") + } +} + +func TestCpuSetBandWidth(t *testing.T) { + path := tempDir(t, "cpu") + + const ( + quotaBefore = 8000 + quotaAfter = 5000 + burstBefore = 2000 + periodBefore = 10000 + periodAfter = 7000 + rtRuntimeBefore = 8000 + rtRuntimeAfter = 5000 + rtPeriodBefore = 10000 + rtPeriodAfter = 7000 + ) + burstAfter := uint64(1000) + + writeFileContents(t, path, map[string]string{ + "cpu.cfs_quota_us": strconv.Itoa(quotaBefore), + "cpu.cfs_burst_us": strconv.Itoa(burstBefore), + "cpu.cfs_period_us": strconv.Itoa(periodBefore), + "cpu.rt_runtime_us": strconv.Itoa(rtRuntimeBefore), + "cpu.rt_period_us": strconv.Itoa(rtPeriodBefore), + }) + + r := &cgroups.Resources{ + CpuQuota: quotaAfter, + CpuBurst: &burstAfter, + CpuPeriod: periodAfter, + CpuRtRuntime: rtRuntimeAfter, + CpuRtPeriod: rtPeriodAfter, + } + cpu := &CpuGroup{} + if err := cpu.Set(path, r); err != nil { + t.Fatal(err) + } + + quota, err := fscommon.GetCgroupParamUint(path, "cpu.cfs_quota_us") + if err != nil { + t.Fatal(err) + } + if quota != quotaAfter { + t.Fatal("Got the wrong value, set cpu.cfs_quota_us failed.") + } + + burst, err := fscommon.GetCgroupParamUint(path, "cpu.cfs_burst_us") + if err != nil { + t.Fatal(err) + } + if burst != burstAfter { + t.Fatal("Got the wrong value, set cpu.cfs_burst_us failed.") + } + + period, err := fscommon.GetCgroupParamUint(path, "cpu.cfs_period_us") + if err != nil { + t.Fatal(err) + } + if period != periodAfter { + t.Fatal("Got the wrong value, set cpu.cfs_period_us failed.") + } + + rtRuntime, err := fscommon.GetCgroupParamUint(path, "cpu.rt_runtime_us") + if err != nil { + t.Fatal(err) + } + if rtRuntime != rtRuntimeAfter { + t.Fatal("Got the wrong value, set cpu.rt_runtime_us failed.") + } + + rtPeriod, err := fscommon.GetCgroupParamUint(path, 
"cpu.rt_period_us") + if err != nil { + t.Fatal(err) + } + if rtPeriod != rtPeriodAfter { + t.Fatal("Got the wrong value, set cpu.rt_period_us failed.") + } +} + +func TestCpuStats(t *testing.T) { + path := tempDir(t, "cpu") + + const ( + nrPeriods = 2000 + nrThrottled = 200 + throttledTime = uint64(18446744073709551615) + ) + + cpuStatContent := fmt.Sprintf("nr_periods %d\nnr_throttled %d\nthrottled_time %d\n", + nrPeriods, nrThrottled, throttledTime) + writeFileContents(t, path, map[string]string{ + "cpu.stat": cpuStatContent, + }) + + cpu := &CpuGroup{} + actualStats := *cgroups.NewStats() + err := cpu.GetStats(path, &actualStats) + if err != nil { + t.Fatal(err) + } + + expectedStats := cgroups.ThrottlingData{ + Periods: nrPeriods, + ThrottledPeriods: nrThrottled, + ThrottledTime: throttledTime, + } + + expectThrottlingDataEquals(t, expectedStats, actualStats.CpuStats.ThrottlingData) +} + +func TestNoCpuStatFile(t *testing.T) { + path := tempDir(t, "cpu") + + cpu := &CpuGroup{} + actualStats := *cgroups.NewStats() + err := cpu.GetStats(path, &actualStats) + if err != nil { + t.Fatal("Expected not to fail, but did") + } +} + +func TestInvalidCpuStat(t *testing.T) { + path := tempDir(t, "cpu") + + cpuStatContent := `nr_periods 2000 + nr_throttled 200 + throttled_time fortytwo` + writeFileContents(t, path, map[string]string{ + "cpu.stat": cpuStatContent, + }) + + cpu := &CpuGroup{} + actualStats := *cgroups.NewStats() + err := cpu.GetStats(path, &actualStats) + if err == nil { + t.Fatal("Expected failed stat parsing.") + } +} + +func TestCpuSetRtSchedAtApply(t *testing.T) { + path := tempDir(t, "cpu") + + const ( + rtRuntimeBefore = 0 + rtRuntimeAfter = 5000 + rtPeriodBefore = 0 + rtPeriodAfter = 7000 + ) + + writeFileContents(t, path, map[string]string{ + "cpu.rt_runtime_us": strconv.Itoa(rtRuntimeBefore), + "cpu.rt_period_us": strconv.Itoa(rtPeriodBefore), + }) + + r := &cgroups.Resources{ + CpuRtRuntime: rtRuntimeAfter, + CpuRtPeriod: rtPeriodAfter, + } + cpu 
:= &CpuGroup{} + + if err := cpu.Apply(path, r, 1234); err != nil { + t.Fatal(err) + } + + rtRuntime, err := fscommon.GetCgroupParamUint(path, "cpu.rt_runtime_us") + if err != nil { + t.Fatal(err) + } + if rtRuntime != rtRuntimeAfter { + t.Fatal("Got the wrong value, set cpu.rt_runtime_us failed.") + } + + rtPeriod, err := fscommon.GetCgroupParamUint(path, "cpu.rt_period_us") + if err != nil { + t.Fatal(err) + } + if rtPeriod != rtPeriodAfter { + t.Fatal("Got the wrong value, set cpu.rt_period_us failed.") + } + + pid, err := fscommon.GetCgroupParamUint(path, "cgroup.procs") + if err != nil { + t.Fatal(err) + } + if pid != 1234 { + t.Fatal("Got the wrong value, set cgroup.procs failed.") + } +} diff --git a/fs/cpuacct.go b/fs/cpuacct.go new file mode 100644 index 0000000..391a023 --- /dev/null +++ b/fs/cpuacct.go @@ -0,0 +1,158 @@ +package fs + +import ( + "bufio" + "os" + "strconv" + "strings" + + "github.com/opencontainers/cgroups" + "github.com/opencontainers/cgroups/fscommon" +) + +const ( + nsInSec = 1000000000 + + // The value comes from `C.sysconf(C._SC_CLK_TCK)`, and + // on Linux it's a constant which is safe to be hard coded, + // so we can avoid using cgo here. 
For details, see: + // https://github.com/containerd/cgroups/pull/12 + clockTicks uint64 = 100 +) + +type CpuacctGroup struct{} + +func (s *CpuacctGroup) Name() string { + return "cpuacct" +} + +func (s *CpuacctGroup) Apply(path string, _ *cgroups.Resources, pid int) error { + return apply(path, pid) +} + +func (s *CpuacctGroup) Set(_ string, _ *cgroups.Resources) error { + return nil +} + +func (s *CpuacctGroup) GetStats(path string, stats *cgroups.Stats) error { + if !cgroups.PathExists(path) { + return nil + } + userModeUsage, kernelModeUsage, err := getCpuUsageBreakdown(path) + if err != nil { + return err + } + + totalUsage, err := fscommon.GetCgroupParamUint(path, "cpuacct.usage") + if err != nil { + return err + } + + percpuUsage, err := getPercpuUsage(path) + if err != nil { + return err + } + + percpuUsageInKernelmode, percpuUsageInUsermode, err := getPercpuUsageInModes(path) + if err != nil { + return err + } + + stats.CpuStats.CpuUsage.TotalUsage = totalUsage + stats.CpuStats.CpuUsage.PercpuUsage = percpuUsage + stats.CpuStats.CpuUsage.PercpuUsageInKernelmode = percpuUsageInKernelmode + stats.CpuStats.CpuUsage.PercpuUsageInUsermode = percpuUsageInUsermode + stats.CpuStats.CpuUsage.UsageInUsermode = userModeUsage + stats.CpuStats.CpuUsage.UsageInKernelmode = kernelModeUsage + return nil +} + +// Returns user and kernel usage breakdown in nanoseconds. 
+func getCpuUsageBreakdown(path string) (uint64, uint64, error) { + var userModeUsage, kernelModeUsage uint64 + const ( + userField = "user" + systemField = "system" + file = "cpuacct.stat" + ) + + // Expected format: + // user + // system + data, err := cgroups.ReadFile(path, file) + if err != nil { + return 0, 0, err + } + + fields := strings.Fields(data) + if len(fields) < 4 || fields[0] != userField || fields[2] != systemField { + return 0, 0, malformedLine(path, file, data) + } + if userModeUsage, err = strconv.ParseUint(fields[1], 10, 64); err != nil { + return 0, 0, &parseError{Path: path, File: file, Err: err} + } + if kernelModeUsage, err = strconv.ParseUint(fields[3], 10, 64); err != nil { + return 0, 0, &parseError{Path: path, File: file, Err: err} + } + + return (userModeUsage * nsInSec) / clockTicks, (kernelModeUsage * nsInSec) / clockTicks, nil +} + +func getPercpuUsage(path string) ([]uint64, error) { + const file = "cpuacct.usage_percpu" + percpuUsage := []uint64{} + data, err := cgroups.ReadFile(path, file) + if err != nil { + return percpuUsage, err + } + for _, value := range strings.Fields(data) { + value, err := strconv.ParseUint(value, 10, 64) + if err != nil { + return percpuUsage, &parseError{Path: path, File: file, Err: err} + } + percpuUsage = append(percpuUsage, value) + } + return percpuUsage, nil +} + +func getPercpuUsageInModes(path string) ([]uint64, []uint64, error) { + usageKernelMode := []uint64{} + usageUserMode := []uint64{} + const file = "cpuacct.usage_all" + + fd, err := cgroups.OpenFile(path, file, os.O_RDONLY) + if os.IsNotExist(err) { + return usageKernelMode, usageUserMode, nil + } else if err != nil { + return nil, nil, err + } + defer fd.Close() + + scanner := bufio.NewScanner(fd) + scanner.Scan() // skipping header line + + for scanner.Scan() { + // Each line is: cpu user system + fields := strings.SplitN(scanner.Text(), " ", 3) + if len(fields) != 3 { + continue + } + + user, err := strconv.ParseUint(fields[1], 10, 64) 
+ if err != nil { + return nil, nil, &parseError{Path: path, File: file, Err: err} + } + usageUserMode = append(usageUserMode, user) + + kernel, err := strconv.ParseUint(fields[2], 10, 64) + if err != nil { + return nil, nil, &parseError{Path: path, File: file, Err: err} + } + usageKernelMode = append(usageKernelMode, kernel) + } + if err := scanner.Err(); err != nil { + return nil, nil, &parseError{Path: path, File: file, Err: err} + } + + return usageKernelMode, usageUserMode, nil +} diff --git a/fs/cpuacct_test.go b/fs/cpuacct_test.go new file mode 100644 index 0000000..c0c9543 --- /dev/null +++ b/fs/cpuacct_test.go @@ -0,0 +1,112 @@ +package fs + +import ( + "reflect" + "testing" + + "github.com/opencontainers/cgroups" +) + +const ( + cpuAcctUsageContents = "12262454190222160" + cpuAcctUsagePerCPUContents = "1564936537989058 1583937096487821 1604195415465681 1596445226820187 1481069084155629 1478735613864327 1477610593414743 1476362015778086" + cpuAcctStatContents = "user 452278264\nsystem 291429664" + cpuAcctUsageAll = `cpu user system + 0 962250696038415 637727786389114 + 1 981956408513304 638197595421064 + 2 1002658817529022 638956774598358 + 3 994937703492523 637985531181620 + 4 874843781648690 638837766495476 + 5 872544369885276 638763309884944 + 6 870104915696359 640081778921247 + 7 870202363887496 638716766259495 + ` +) + +func TestCpuacctStats(t *testing.T) { + path := tempDir(t, "cpuacct") + writeFileContents(t, path, map[string]string{ + "cpuacct.usage": cpuAcctUsageContents, + "cpuacct.usage_percpu": cpuAcctUsagePerCPUContents, + "cpuacct.stat": cpuAcctStatContents, + "cpuacct.usage_all": cpuAcctUsageAll, + }) + + cpuacct := &CpuacctGroup{} + actualStats := *cgroups.NewStats() + err := cpuacct.GetStats(path, &actualStats) + if err != nil { + t.Fatal(err) + } + + expectedStats := cgroups.CpuUsage{ + TotalUsage: uint64(12262454190222160), + PercpuUsage: []uint64{ + 1564936537989058, 1583937096487821, 1604195415465681, 1596445226820187, + 
1481069084155629, 1478735613864327, 1477610593414743, 1476362015778086, + }, + PercpuUsageInKernelmode: []uint64{ + 637727786389114, 638197595421064, 638956774598358, 637985531181620, + 638837766495476, 638763309884944, 640081778921247, 638716766259495, + }, + PercpuUsageInUsermode: []uint64{ + 962250696038415, 981956408513304, 1002658817529022, 994937703492523, + 874843781648690, 872544369885276, 870104915696359, 870202363887496, + }, + UsageInKernelmode: (uint64(291429664) * nsInSec) / clockTicks, + UsageInUsermode: (uint64(452278264) * nsInSec) / clockTicks, + } + + if !reflect.DeepEqual(expectedStats, actualStats.CpuStats.CpuUsage) { + t.Errorf("Expected CPU usage %#v but found %#v\n", + expectedStats, actualStats.CpuStats.CpuUsage) + } +} + +func TestCpuacctStatsWithoutUsageAll(t *testing.T) { + path := tempDir(t, "cpuacct") + writeFileContents(t, path, map[string]string{ + "cpuacct.usage": cpuAcctUsageContents, + "cpuacct.usage_percpu": cpuAcctUsagePerCPUContents, + "cpuacct.stat": cpuAcctStatContents, + }) + + cpuacct := &CpuacctGroup{} + actualStats := *cgroups.NewStats() + err := cpuacct.GetStats(path, &actualStats) + if err != nil { + t.Fatal(err) + } + + expectedStats := cgroups.CpuUsage{ + TotalUsage: uint64(12262454190222160), + PercpuUsage: []uint64{ + 1564936537989058, 1583937096487821, 1604195415465681, 1596445226820187, + 1481069084155629, 1478735613864327, 1477610593414743, 1476362015778086, + }, + PercpuUsageInKernelmode: []uint64{}, + PercpuUsageInUsermode: []uint64{}, + UsageInKernelmode: (uint64(291429664) * nsInSec) / clockTicks, + UsageInUsermode: (uint64(452278264) * nsInSec) / clockTicks, + } + + if !reflect.DeepEqual(expectedStats, actualStats.CpuStats.CpuUsage) { + t.Errorf("Expected CPU usage %#v but found %#v\n", + expectedStats, actualStats.CpuStats.CpuUsage) + } +} + +func BenchmarkGetCpuUsageBreakdown(b *testing.B) { + path := tempDir(b, "cpuacct") + writeFileContents(b, path, map[string]string{ + "cpuacct.stat": 
cpuAcctStatContents, + }) + + b.ResetTimer() + for i := 0; i < b.N; i++ { + _, _, err := getCpuUsageBreakdown(path) + if err != nil { + b.Fatal(err) + } + } +} diff --git a/fs/cpuset.go b/fs/cpuset.go new file mode 100644 index 0000000..ef6ff7d --- /dev/null +++ b/fs/cpuset.go @@ -0,0 +1,276 @@ +package fs + +import ( + "errors" + "os" + "path/filepath" + "strconv" + "strings" + "sync" + + "golang.org/x/sys/unix" + + "github.com/opencontainers/cgroups" + "github.com/opencontainers/cgroups/fscommon" +) + +var ( + cpusetLock sync.Mutex + cpusetPrefix = "cpuset." + cpusetFastPath bool +) + +func cpusetFile(path string, name string) string { + cpusetLock.Lock() + defer cpusetLock.Unlock() + + // Only the v1 cpuset cgroup is allowed to mount with noprefix. + // See kernel source: https://github.com/torvalds/linux/blob/2e1b3cc9d7f790145a80cb705b168f05dab65df2/kernel/cgroup/cgroup-v1.c#L1070 + // Cpuset cannot be mounted with and without prefix simultaneously. + // Commonly used in Android environments. 
+ + if cpusetFastPath { + return cpusetPrefix + name + } + + err := unix.Access(filepath.Join(path, cpusetPrefix+name), unix.F_OK) + if err == nil { + // Use the fast path only if we can access one type of mount for cpuset already + cpusetFastPath = true + } else { + err = unix.Access(filepath.Join(path, name), unix.F_OK) + if err == nil { + cpusetPrefix = "" + cpusetFastPath = true + } + } + + return cpusetPrefix + name +} + +type CpusetGroup struct{} + +func (s *CpusetGroup) Name() string { + return "cpuset" +} + +func (s *CpusetGroup) Apply(path string, r *cgroups.Resources, pid int) error { + return s.ApplyDir(path, r, pid) +} + +func (s *CpusetGroup) Set(path string, r *cgroups.Resources) error { + if r.CpusetCpus != "" { + if err := cgroups.WriteFile(path, cpusetFile(path, "cpus"), r.CpusetCpus); err != nil { + return err + } + } + if r.CpusetMems != "" { + if err := cgroups.WriteFile(path, cpusetFile(path, "mems"), r.CpusetMems); err != nil { + return err + } + } + return nil +} + +func getCpusetStat(path string, file string) ([]uint16, error) { + var extracted []uint16 + fileContent, err := fscommon.GetCgroupParamString(path, file) + if err != nil { + return extracted, err + } + if len(fileContent) == 0 { + return extracted, &parseError{Path: path, File: file, Err: errors.New("empty file")} + } + + for _, s := range strings.Split(fileContent, ",") { + fromStr, toStr, ok := strings.Cut(s, "-") + if ok { + from, err := strconv.ParseUint(fromStr, 10, 16) + if err != nil { + return extracted, &parseError{Path: path, File: file, Err: err} + } + to, err := strconv.ParseUint(toStr, 10, 16) + if err != nil { + return extracted, &parseError{Path: path, File: file, Err: err} + } + if from > to { + return extracted, &parseError{Path: path, File: file, Err: errors.New("invalid values, from > to")} + } + for i := from; i <= to; i++ { + extracted = append(extracted, uint16(i)) + } + } else { + value, err := strconv.ParseUint(s, 10, 16) + if err != nil { + return 
extracted, &parseError{Path: path, File: file, Err: err} + } + extracted = append(extracted, uint16(value)) + } + } + + return extracted, nil +} + +func (s *CpusetGroup) GetStats(path string, stats *cgroups.Stats) error { + var err error + + stats.CPUSetStats.CPUs, err = getCpusetStat(path, cpusetFile(path, "cpus")) + if err != nil && !errors.Is(err, os.ErrNotExist) { + return err + } + + stats.CPUSetStats.CPUExclusive, err = fscommon.GetCgroupParamUint(path, cpusetFile(path, "cpu_exclusive")) + if err != nil && !errors.Is(err, os.ErrNotExist) { + return err + } + + stats.CPUSetStats.Mems, err = getCpusetStat(path, cpusetFile(path, "mems")) + if err != nil && !errors.Is(err, os.ErrNotExist) { + return err + } + + stats.CPUSetStats.MemHardwall, err = fscommon.GetCgroupParamUint(path, cpusetFile(path, "mem_hardwall")) + if err != nil && !errors.Is(err, os.ErrNotExist) { + return err + } + + stats.CPUSetStats.MemExclusive, err = fscommon.GetCgroupParamUint(path, cpusetFile(path, "mem_exclusive")) + if err != nil && !errors.Is(err, os.ErrNotExist) { + return err + } + + stats.CPUSetStats.MemoryMigrate, err = fscommon.GetCgroupParamUint(path, cpusetFile(path, "memory_migrate")) + if err != nil && !errors.Is(err, os.ErrNotExist) { + return err + } + + stats.CPUSetStats.MemorySpreadPage, err = fscommon.GetCgroupParamUint(path, cpusetFile(path, "memory_spread_page")) + if err != nil && !errors.Is(err, os.ErrNotExist) { + return err + } + + stats.CPUSetStats.MemorySpreadSlab, err = fscommon.GetCgroupParamUint(path, cpusetFile(path, "memory_spread_slab")) + if err != nil && !errors.Is(err, os.ErrNotExist) { + return err + } + + stats.CPUSetStats.MemoryPressure, err = fscommon.GetCgroupParamUint(path, cpusetFile(path, "memory_pressure")) + if err != nil && !errors.Is(err, os.ErrNotExist) { + return err + } + + stats.CPUSetStats.SchedLoadBalance, err = fscommon.GetCgroupParamUint(path, cpusetFile(path, "sched_load_balance")) + if err != nil && !errors.Is(err, os.ErrNotExist) { 
+ return err + } + + stats.CPUSetStats.SchedRelaxDomainLevel, err = fscommon.GetCgroupParamInt(path, cpusetFile(path, "sched_relax_domain_level")) + if err != nil && !errors.Is(err, os.ErrNotExist) { + return err + } + + return nil +} + +func (s *CpusetGroup) ApplyDir(dir string, r *cgroups.Resources, pid int) error { + // This might happen if we have no cpuset cgroup mounted. + // Just do nothing and don't fail. + if dir == "" { + return nil + } + // 'ensureParent' start with parent because we don't want to + // explicitly inherit from parent, it could conflict with + // 'cpuset.cpu_exclusive'. + if err := cpusetEnsureParent(filepath.Dir(dir)); err != nil { + return err + } + if err := os.Mkdir(dir, 0o755); err != nil && !os.IsExist(err) { + return err + } + // We didn't inherit cpuset configs from parent, but we have + // to ensure cpuset configs are set before moving task into the + // cgroup. + // The logic is, if user specified cpuset configs, use these + // specified configs, otherwise, inherit from parent. This makes + // cpuset configs work correctly with 'cpuset.cpu_exclusive', and + // keep backward compatibility. + if err := s.ensureCpusAndMems(dir, r); err != nil { + return err + } + // Since we are not using apply(), we need to place the pid + // into the procs file. + return cgroups.WriteCgroupProc(dir, pid) +} + +func getCpusetSubsystemSettings(parent string) (cpus, mems string, err error) { + if cpus, err = cgroups.ReadFile(parent, cpusetFile(parent, "cpus")); err != nil { + return + } + if mems, err = cgroups.ReadFile(parent, cpusetFile(parent, "mems")); err != nil { + return + } + return cpus, mems, nil +} + +// cpusetEnsureParent makes sure that the parent directories of current +// are created and populated with the proper cpus and mems files copied +// from their respective parent. It does that recursively, starting from +// the top of the cpuset hierarchy (i.e. cpuset cgroup mount point). 
+func cpusetEnsureParent(current string) error { + var st unix.Statfs_t + + parent := filepath.Dir(current) + err := unix.Statfs(parent, &st) + if err == nil && st.Type != unix.CGROUP_SUPER_MAGIC { + return nil + } + // Treat non-existing directory as cgroupfs as it will be created, + // and the root cpuset directory obviously exists. + if err != nil && err != unix.ENOENT { + return &os.PathError{Op: "statfs", Path: parent, Err: err} + } + + if err := cpusetEnsureParent(parent); err != nil { + return err + } + if err := os.Mkdir(current, 0o755); err != nil && !os.IsExist(err) { + return err + } + return cpusetCopyIfNeeded(current, parent) +} + +// cpusetCopyIfNeeded copies the cpuset.cpus and cpuset.mems from the parent +// directory to the current directory if the file's contents are 0 +func cpusetCopyIfNeeded(current, parent string) error { + currentCpus, currentMems, err := getCpusetSubsystemSettings(current) + if err != nil { + return err + } + parentCpus, parentMems, err := getCpusetSubsystemSettings(parent) + if err != nil { + return err + } + + if isEmptyCpuset(currentCpus) { + if err := cgroups.WriteFile(current, cpusetFile(current, "cpus"), parentCpus); err != nil { + return err + } + } + if isEmptyCpuset(currentMems) { + if err := cgroups.WriteFile(current, cpusetFile(current, "mems"), parentMems); err != nil { + return err + } + } + return nil +} + +func isEmptyCpuset(str string) bool { + return str == "" || str == "\n" +} + +func (s *CpusetGroup) ensureCpusAndMems(path string, r *cgroups.Resources) error { + if err := s.Set(path, r); err != nil { + return err + } + return cpusetCopyIfNeeded(path, filepath.Dir(path)) +} diff --git a/fs/cpuset_test.go b/fs/cpuset_test.go new file mode 100644 index 0000000..58e571b --- /dev/null +++ b/fs/cpuset_test.go @@ -0,0 +1,241 @@ +package fs + +import ( + "reflect" + "testing" + + "github.com/opencontainers/cgroups" + "github.com/opencontainers/cgroups/fscommon" +) + +const ( + cpus = "0-2,7,12-14\n" + cpuExclusive 
= "1\n" + mems = "1-4,6,9\n" + memHardwall = "0\n" + memExclusive = "0\n" + memoryMigrate = "1\n" + memorySpreadPage = "0\n" + memorySpeadSlab = "1\n" + memoryPressure = "34377\n" + schedLoadBalance = "1\n" + schedRelaxDomainLevel = "-1\n" +) + +var cpusetTestFiles = map[string]string{ + "cpuset.cpus": cpus, + "cpuset.cpu_exclusive": cpuExclusive, + "cpuset.mems": mems, + "cpuset.mem_hardwall": memHardwall, + "cpuset.mem_exclusive": memExclusive, + "cpuset.memory_migrate": memoryMigrate, + "cpuset.memory_spread_page": memorySpreadPage, + "cpuset.memory_spread_slab": memorySpeadSlab, + "cpuset.memory_pressure": memoryPressure, + "cpuset.sched_load_balance": schedLoadBalance, + "cpuset.sched_relax_domain_level": schedRelaxDomainLevel, +} + +func TestCPUSetSetCpus(t *testing.T) { + path := tempDir(t, "cpuset") + + const ( + cpusBefore = "0" + cpusAfter = "1-3" + ) + + writeFileContents(t, path, map[string]string{ + "cpuset.cpus": cpusBefore, + }) + + r := &cgroups.Resources{ + CpusetCpus: cpusAfter, + } + cpuset := &CpusetGroup{} + if err := cpuset.Set(path, r); err != nil { + t.Fatal(err) + } + + value, err := fscommon.GetCgroupParamString(path, "cpuset.cpus") + if err != nil { + t.Fatal(err) + } + if value != cpusAfter { + t.Fatal("Got the wrong value, set cpuset.cpus failed.") + } +} + +func TestCPUSetSetMems(t *testing.T) { + path := tempDir(t, "cpuset") + + const ( + memsBefore = "0" + memsAfter = "1" + ) + + writeFileContents(t, path, map[string]string{ + "cpuset.mems": memsBefore, + }) + + r := &cgroups.Resources{ + CpusetMems: memsAfter, + } + cpuset := &CpusetGroup{} + if err := cpuset.Set(path, r); err != nil { + t.Fatal(err) + } + + value, err := fscommon.GetCgroupParamString(path, "cpuset.mems") + if err != nil { + t.Fatal(err) + } + if value != memsAfter { + t.Fatal("Got the wrong value, set cpuset.mems failed.") + } +} + +func TestCPUSetStatsCorrect(t *testing.T) { + path := tempDir(t, "cpuset") + writeFileContents(t, path, cpusetTestFiles) + + cpuset := 
&CpusetGroup{} + actualStats := *cgroups.NewStats() + err := cpuset.GetStats(path, &actualStats) + if err != nil { + t.Fatal(err) + } + expectedStats := cgroups.CPUSetStats{ + CPUs: []uint16{0, 1, 2, 7, 12, 13, 14}, + CPUExclusive: 1, + Mems: []uint16{1, 2, 3, 4, 6, 9}, + MemoryMigrate: 1, + MemHardwall: 0, + MemExclusive: 0, + MemorySpreadPage: 0, + MemorySpreadSlab: 1, + MemoryPressure: 34377, + SchedLoadBalance: 1, + SchedRelaxDomainLevel: -1, + } + if !reflect.DeepEqual(expectedStats, actualStats.CPUSetStats) { + t.Fatalf("Expected Cpuset stats usage %#v but found %#v", + expectedStats, actualStats.CPUSetStats) + } +} + +func TestCPUSetStatsMissingFiles(t *testing.T) { + for _, testCase := range []struct { + desc string + filename, contents string + removeFile bool + }{ + { + desc: "empty cpus file", + filename: "cpuset.cpus", + contents: "", + removeFile: false, + }, + { + desc: "empty mems file", + filename: "cpuset.mems", + contents: "", + removeFile: false, + }, + { + desc: "corrupted cpus file", + filename: "cpuset.cpus", + contents: "0-3,*4^2", + removeFile: false, + }, + { + desc: "corrupted mems file", + filename: "cpuset.mems", + contents: "0,1,2-5,8-7", + removeFile: false, + }, + { + desc: "missing cpu_exclusive file", + filename: "cpuset.cpu_exclusive", + contents: "", + removeFile: true, + }, + { + desc: "missing memory_migrate file", + filename: "cpuset.memory_migrate", + contents: "", + removeFile: true, + }, + { + desc: "missing mem_hardwall file", + filename: "cpuset.mem_hardwall", + contents: "", + removeFile: true, + }, + { + desc: "missing mem_exclusive file", + filename: "cpuset.mem_exclusive", + contents: "", + removeFile: true, + }, + { + desc: "missing memory_spread_page file", + filename: "cpuset.memory_spread_page", + contents: "", + removeFile: true, + }, + { + desc: "missing memory_spread_slab file", + filename: "cpuset.memory_spread_slab", + contents: "", + removeFile: true, + }, + { + desc: "missing memory_pressure file", + 
filename: "cpuset.memory_pressure", + contents: "", + removeFile: true, + }, + { + desc: "missing sched_load_balance file", + filename: "cpuset.sched_load_balance", + contents: "", + removeFile: true, + }, + { + desc: "missing sched_relax_domain_level file", + filename: "cpuset.sched_relax_domain_level", + contents: "", + removeFile: true, + }, + } { + t.Run(testCase.desc, func(t *testing.T) { + path := tempDir(t, "cpuset") + + tempCpusetTestFiles := map[string]string{} + for i, v := range cpusetTestFiles { + tempCpusetTestFiles[i] = v + } + + if testCase.removeFile { + delete(tempCpusetTestFiles, testCase.filename) + writeFileContents(t, path, tempCpusetTestFiles) + cpuset := &CpusetGroup{} + actualStats := *cgroups.NewStats() + err := cpuset.GetStats(path, &actualStats) + if err != nil { + t.Errorf("failed unexpectedly: %q", err) + } + } else { + tempCpusetTestFiles[testCase.filename] = testCase.contents + writeFileContents(t, path, tempCpusetTestFiles) + cpuset := &CpusetGroup{} + actualStats := *cgroups.NewStats() + err := cpuset.GetStats(path, &actualStats) + + if err == nil { + t.Error("failed to return expected error") + } + } + }) + } +} diff --git a/fs/devices.go b/fs/devices.go new file mode 100644 index 0000000..26483ec --- /dev/null +++ b/fs/devices.go @@ -0,0 +1,38 @@ +package fs + +import ( + "github.com/opencontainers/cgroups" +) + +type DevicesGroup struct{} + +func (s *DevicesGroup) Name() string { + return "devices" +} + +func (s *DevicesGroup) Apply(path string, r *cgroups.Resources, pid int) error { + if r.SkipDevices { + return nil + } + if path == "" { + // Return error here, since devices cgroup + // is a hard requirement for container's security. 
+ return errSubsystemDoesNotExist + } + + return apply(path, pid) +} + +func (s *DevicesGroup) Set(path string, r *cgroups.Resources) error { + if cgroups.DevicesSetV1 == nil { + if len(r.Devices) == 0 { + return nil + } + return cgroups.ErrDevicesUnsupported + } + return cgroups.DevicesSetV1(path, r) +} + +func (s *DevicesGroup) GetStats(path string, stats *cgroups.Stats) error { + return nil +} diff --git a/fs/error.go b/fs/error.go new file mode 100644 index 0000000..f13033e --- /dev/null +++ b/fs/error.go @@ -0,0 +1,15 @@ +package fs + +import ( + "fmt" + + "github.com/opencontainers/cgroups/fscommon" +) + +type parseError = fscommon.ParseError + +// malformedLine is used by all cgroupfs file parsers that expect a line +// in a particular format but get some garbage instead. +func malformedLine(path, file, line string) error { + return &parseError{Path: path, File: file, Err: fmt.Errorf("malformed line: %s", line)} +} diff --git a/fs/freezer.go b/fs/freezer.go new file mode 100644 index 0000000..dae4a60 --- /dev/null +++ b/fs/freezer.go @@ -0,0 +1,157 @@ +package fs + +import ( + "errors" + "fmt" + "os" + "strings" + "time" + + "github.com/opencontainers/cgroups" + "github.com/sirupsen/logrus" + "golang.org/x/sys/unix" +) + +type FreezerGroup struct{} + +func (s *FreezerGroup) Name() string { + return "freezer" +} + +func (s *FreezerGroup) Apply(path string, _ *cgroups.Resources, pid int) error { + return apply(path, pid) +} + +func (s *FreezerGroup) Set(path string, r *cgroups.Resources) (Err error) { + switch r.Freezer { + case cgroups.Frozen: + defer func() { + if Err != nil { + // Freezing failed, and it is bad and dangerous + // to leave the cgroup in FROZEN or FREEZING + // state, so (try to) thaw it back. + _ = cgroups.WriteFile(path, "freezer.state", string(cgroups.Thawed)) + } + }() + + // As per older kernel docs (freezer-subsystem.txt before + // kernel commit ef9fe980c6fcc1821), if FREEZING is seen, + // userspace should either retry or thaw. 
While current + // kernel cgroup v1 docs no longer mention a need to retry, + // even a recent kernel (v5.4, Ubuntu 20.04) can't reliably + // freeze a cgroup v1 while new processes keep appearing in it + // (either via fork/clone or by writing new PIDs to + // cgroup.procs). + // + // The numbers below are empirically chosen to have a decent + // chance to succeed in various scenarios ("runc pause/unpause + // with parallel runc exec" and "bare freeze/unfreeze on a very + // slow system"), tested on RHEL7 and Ubuntu 20.04 kernels. + // + // Adding any amount of sleep in between retries did not + // increase the chances of successful freeze in "pause/unpause + // with parallel exec" reproducer. OTOH, adding an occasional + // sleep helped for the case where the system is extremely slow + // (CentOS 7 VM on GHA CI). + // + // Alas, this is still a game of chances, since the real fix + // belongs to the kernel (cgroup v2 does not have this bug). + + for i := 0; i < 1000; i++ { + if i%50 == 49 { + // Occasional thaw and sleep improves + // the chances to succeed in freezing + // in case new processes keep appearing + // in the cgroup. + _ = cgroups.WriteFile(path, "freezer.state", string(cgroups.Thawed)) + time.Sleep(10 * time.Millisecond) + } + + if err := cgroups.WriteFile(path, "freezer.state", string(cgroups.Frozen)); err != nil { + return err + } + + if i%25 == 24 { + // Occasional short sleep before reading + // the state back also improves the chances to + // succeed in freezing in case of a very slow + // system. 
+ time.Sleep(10 * time.Microsecond) + } + state, err := cgroups.ReadFile(path, "freezer.state") + if err != nil { + return err + } + state = strings.TrimSpace(state) + switch state { + case "FREEZING": + continue + case string(cgroups.Frozen): + if i > 1 { + logrus.Debugf("frozen after %d retries", i) + } + return nil + default: + // should never happen + return fmt.Errorf("unexpected state %s while freezing", strings.TrimSpace(state)) + } + } + // Despite our best efforts, it got stuck in FREEZING. + return errors.New("unable to freeze") + case cgroups.Thawed: + return cgroups.WriteFile(path, "freezer.state", string(cgroups.Thawed)) + case cgroups.Undefined: + return nil + default: + return fmt.Errorf("Invalid argument '%s' to freezer.state", string(r.Freezer)) + } +} + +func (s *FreezerGroup) GetStats(path string, stats *cgroups.Stats) error { + return nil +} + +func (s *FreezerGroup) GetState(path string) (cgroups.FreezerState, error) { + for { + state, err := cgroups.ReadFile(path, "freezer.state") + if err != nil { + // If the kernel is too old, then we just treat the freezer as + // being in an "undefined" state. + if os.IsNotExist(err) || errors.Is(err, unix.ENODEV) { + err = nil + } + return cgroups.Undefined, err + } + switch strings.TrimSpace(state) { + case "THAWED": + return cgroups.Thawed, nil + case "FROZEN": + // Find out whether the cgroup is frozen directly, + // or indirectly via an ancestor. + self, err := cgroups.ReadFile(path, "freezer.self_freezing") + if err != nil { + // If the kernel is too old, then we just treat + // it as being frozen. 
+ if errors.Is(err, os.ErrNotExist) || errors.Is(err, unix.ENODEV) { + err = nil + } + return cgroups.Frozen, err + } + switch self { + case "0\n": + return cgroups.Thawed, nil + case "1\n": + return cgroups.Frozen, nil + default: + return cgroups.Undefined, fmt.Errorf(`unknown "freezer.self_freezing" state: %q`, self) + } + case "FREEZING": + // Make sure we get a stable freezer state, so retry if the cgroup + // is still undergoing freezing. This should be a temporary delay. + time.Sleep(1 * time.Millisecond) + continue + default: + return cgroups.Undefined, fmt.Errorf("unknown freezer.state %q", state) + } + } +} diff --git a/fs/freezer_test.go b/fs/freezer_test.go new file mode 100644 index 0000000..c76ee79 --- /dev/null +++ b/fs/freezer_test.go @@ -0,0 +1,46 @@ +package fs + +import ( + "testing" + + "github.com/opencontainers/cgroups" + "github.com/opencontainers/cgroups/fscommon" +) + +func TestFreezerSetState(t *testing.T) { + path := tempDir(t, "freezer") + + writeFileContents(t, path, map[string]string{ + "freezer.state": string(cgroups.Frozen), + }) + + r := &cgroups.Resources{ + Freezer: cgroups.Thawed, + } + freezer := &FreezerGroup{} + if err := freezer.Set(path, r); err != nil { + t.Fatal(err) + } + + value, err := fscommon.GetCgroupParamString(path, "freezer.state") + if err != nil { + t.Fatal(err) + } + if value != string(cgroups.Thawed) { + t.Fatal("Got the wrong value, set freezer.state failed.") + } +} + +func TestFreezerSetInvalidState(t *testing.T) { + path := tempDir(t, "freezer") + + const invalidArg cgroups.FreezerState = "Invalid" + + r := &cgroups.Resources{ + Freezer: invalidArg, + } + freezer := &FreezerGroup{} + if err := freezer.Set(path, r); err == nil { + t.Fatal("Failed to return invalid argument error") + } +} diff --git a/fs/fs.go b/fs/fs.go new file mode 100644 index 0000000..23a8fb8 --- /dev/null +++ b/fs/fs.go @@ -0,0 +1,265 @@ +package fs + +import ( + "errors" + "fmt" + "os" + "sync" + + "golang.org/x/sys/unix" + + 
"github.com/opencontainers/cgroups" + "github.com/opencontainers/cgroups/fscommon" +) + +var subsystems = []subsystem{ + &CpusetGroup{}, + &DevicesGroup{}, + &MemoryGroup{}, + &CpuGroup{}, + &CpuacctGroup{}, + &PidsGroup{}, + &BlkioGroup{}, + &HugetlbGroup{}, + &NetClsGroup{}, + &NetPrioGroup{}, + &PerfEventGroup{}, + &FreezerGroup{}, + &RdmaGroup{}, + &NameGroup{GroupName: "name=systemd", Join: true}, + &NameGroup{GroupName: "misc", Join: true}, +} + +var errSubsystemDoesNotExist = errors.New("cgroup: subsystem does not exist") + +func init() { + // If using cgroups-hybrid mode then add a "" controller indicating + // it should join the cgroups v2. + if cgroups.IsCgroup2HybridMode() { + subsystems = append(subsystems, &NameGroup{GroupName: "", Join: true}) + } +} + +type subsystem interface { + // Name returns the name of the subsystem. + Name() string + // GetStats fills in the stats for the subsystem. + GetStats(path string, stats *cgroups.Stats) error + // Apply creates and joins a cgroup, adding pid into it. Some + // subsystems use resources to pre-configure the cgroup parents + // before creating or joining it. + Apply(path string, r *cgroups.Resources, pid int) error + // Set sets the cgroup resources. + Set(path string, r *cgroups.Resources) error +} + +type Manager struct { + mu sync.Mutex + cgroups *cgroups.Cgroup + paths map[string]string +} + +func NewManager(cg *cgroups.Cgroup, paths map[string]string) (*Manager, error) { + // Some v1 controllers (cpu, cpuset, and devices) expect + // cgroups.Resources to not be nil in Apply. 
+ if cg.Resources == nil { + return nil, errors.New("cgroup v1 manager needs cgroups.Resources to be set during manager creation") + } + if cg.Resources.Unified != nil { + return nil, cgroups.ErrV1NoUnified + } + + if paths == nil { + var err error + paths, err = initPaths(cg) + if err != nil { + return nil, err + } + } + + return &Manager{ + cgroups: cg, + paths: paths, + }, nil +} + +// isIgnorableError returns whether err is a permission error (in the loose +// sense of the word). This includes EROFS (which for an unprivileged user is +// basically a permission error) and EACCES (for similar reasons) as well as +// the normal EPERM. +func isIgnorableError(rootless bool, err error) bool { + // We do not ignore errors if we are root. + if !rootless { + return false + } + // Is it an ordinary EPERM? + if errors.Is(err, os.ErrPermission) { + return true + } + // Handle some specific syscall errors. + var errno unix.Errno + if errors.As(err, &errno) { + return errno == unix.EROFS || errno == unix.EPERM || errno == unix.EACCES + } + return false +} + +func (m *Manager) Apply(pid int) (retErr error) { + m.mu.Lock() + defer m.mu.Unlock() + + c := m.cgroups + + for _, sys := range subsystems { + name := sys.Name() + p, ok := m.paths[name] + if !ok { + continue + } + + if err := sys.Apply(p, c.Resources, pid); err != nil { + // In the case of rootless (including euid=0 in userns), where an + // explicit cgroup path hasn't been set, we don't bail on error in + // case of permission problems here, but do delete the path from + // the m.paths map, since it is either non-existent and could not + // be created, or the pid could not be added to it. + // + // Cases where limits for the subsystem have been set are handled + // later by Set, which fails with a friendly error (see + // if path == "" in Set). 
+ if isIgnorableError(c.Rootless, err) && c.Path == "" { + retErr = cgroups.ErrRootless + delete(m.paths, name) + continue + } + return err + } + + } + return retErr +} + +func (m *Manager) Destroy() error { + m.mu.Lock() + defer m.mu.Unlock() + return cgroups.RemovePaths(m.paths) +} + +func (m *Manager) Path(subsys string) string { + m.mu.Lock() + defer m.mu.Unlock() + return m.paths[subsys] +} + +func (m *Manager) GetStats() (*cgroups.Stats, error) { + m.mu.Lock() + defer m.mu.Unlock() + stats := cgroups.NewStats() + for _, sys := range subsystems { + path := m.paths[sys.Name()] + if path == "" { + continue + } + if err := sys.GetStats(path, stats); err != nil { + return nil, err + } + } + return stats, nil +} + +func (m *Manager) Set(r *cgroups.Resources) error { + if r == nil { + return nil + } + + if r.Unified != nil { + return cgroups.ErrV1NoUnified + } + + m.mu.Lock() + defer m.mu.Unlock() + for _, sys := range subsystems { + path := m.paths[sys.Name()] + if err := sys.Set(path, r); err != nil { + // When rootless is true, errors from the device subsystem + // are ignored, as it is really not expected to work. + if m.cgroups.Rootless && sys.Name() == "devices" && !errors.Is(err, cgroups.ErrDevicesUnsupported) { + continue + } + // However, errors from other subsystems are not ignored. + // see @test "runc create (rootless + limits + no cgrouppath + no permission) fails with informative error" + if path == "" { + // We never created a path for this cgroup, so we cannot set + // limits for it (though we have already tried at this point). 
+ return fmt.Errorf("cannot set %s limit: container could not join or create cgroup", sys.Name()) + } + return err + } + } + + return nil +} + +// Freeze toggles the container's freezer cgroup depending on the state +// provided +func (m *Manager) Freeze(state cgroups.FreezerState) error { + path := m.Path("freezer") + if path == "" { + return errors.New("cannot toggle freezer: cgroups not configured for container") + } + + prevState := m.cgroups.Resources.Freezer + m.cgroups.Resources.Freezer = state + freezer := &FreezerGroup{} + if err := freezer.Set(path, m.cgroups.Resources); err != nil { + m.cgroups.Resources.Freezer = prevState + return err + } + return nil +} + +func (m *Manager) GetPids() ([]int, error) { + return cgroups.GetPids(m.Path("devices")) +} + +func (m *Manager) GetAllPids() ([]int, error) { + return cgroups.GetAllPids(m.Path("devices")) +} + +func (m *Manager) GetPaths() map[string]string { + m.mu.Lock() + defer m.mu.Unlock() + return m.paths +} + +func (m *Manager) GetCgroups() (*cgroups.Cgroup, error) { + return m.cgroups, nil +} + +func (m *Manager) GetFreezerState() (cgroups.FreezerState, error) { + dir := m.Path("freezer") + // If the container doesn't have the freezer cgroup, say it's undefined. + if dir == "" { + return cgroups.Undefined, nil + } + freezer := &FreezerGroup{} + return freezer.GetState(dir) +} + +func (m *Manager) Exists() bool { + return cgroups.PathExists(m.Path("devices")) +} + +func OOMKillCount(path string) (uint64, error) { + return fscommon.GetValueByKey(path, "memory.oom_control", "oom_kill") +} + +func (m *Manager) OOMKillCount() (uint64, error) { + c, err := OOMKillCount(m.Path("memory")) + // Ignore ENOENT when rootless as it couldn't create cgroup. 
+ if err != nil && m.cgroups.Rootless && os.IsNotExist(err) { + err = nil + } + + return c, err +} diff --git a/fs/fs_test.go b/fs/fs_test.go new file mode 100644 index 0000000..f9a0935 --- /dev/null +++ b/fs/fs_test.go @@ -0,0 +1,49 @@ +package fs + +import ( + "testing" + + "github.com/opencontainers/cgroups" +) + +func BenchmarkGetStats(b *testing.B) { + if cgroups.IsCgroup2UnifiedMode() { + b.Skip("cgroup v2 is not supported") + } + + // Unset TestMode as we work with real cgroupfs here, + // and we want OpenFile to perform the fstype check. + cgroups.TestMode = false + defer func() { + cgroups.TestMode = true + }() + + cg := &cgroups.Cgroup{ + Path: "/some/kind/of/a/path/here", + Resources: &cgroups.Resources{}, + } + m, err := NewManager(cg, nil) + if err != nil { + b.Fatal(err) + } + err = m.Apply(-1) + if err != nil { + b.Fatal(err) + } + defer func() { + _ = m.Destroy() + }() + + var st *cgroups.Stats + + b.ResetTimer() + for i := 0; i < b.N; i++ { + st, err = m.GetStats() + if err != nil { + b.Fatal(err) + } + } + if st.CpuStats.CpuUsage.TotalUsage != 0 { + b.Fatalf("stats: %+v", st) + } +} diff --git a/fs/hugetlb.go b/fs/hugetlb.go new file mode 100644 index 0000000..698fd69 --- /dev/null +++ b/fs/hugetlb.go @@ -0,0 +1,83 @@ +package fs + +import ( + "errors" + "os" + "strconv" + + "github.com/opencontainers/cgroups" + "github.com/opencontainers/cgroups/fscommon" +) + +type HugetlbGroup struct{} + +func (s *HugetlbGroup) Name() string { + return "hugetlb" +} + +func (s *HugetlbGroup) Apply(path string, _ *cgroups.Resources, pid int) error { + return apply(path, pid) +} + +func (s *HugetlbGroup) Set(path string, r *cgroups.Resources) error { + const suffix = ".limit_in_bytes" + skipRsvd := false + + for _, hugetlb := range r.HugetlbLimit { + prefix := "hugetlb." 
+ hugetlb.Pagesize + val := strconv.FormatUint(hugetlb.Limit, 10) + if err := cgroups.WriteFile(path, prefix+suffix, val); err != nil { + return err + } + if skipRsvd { + continue + } + if err := cgroups.WriteFile(path, prefix+".rsvd"+suffix, val); err != nil { + if errors.Is(err, os.ErrNotExist) { + skipRsvd = true + continue + } + return err + } + } + + return nil +} + +func (s *HugetlbGroup) GetStats(path string, stats *cgroups.Stats) error { + if !cgroups.PathExists(path) { + return nil + } + rsvd := ".rsvd" + hugetlbStats := cgroups.HugetlbStats{} + for _, pageSize := range cgroups.HugePageSizes() { + again: + prefix := "hugetlb." + pageSize + rsvd + + value, err := fscommon.GetCgroupParamUint(path, prefix+".usage_in_bytes") + if err != nil { + if rsvd != "" && errors.Is(err, os.ErrNotExist) { + rsvd = "" + goto again + } + return err + } + hugetlbStats.Usage = value + + value, err = fscommon.GetCgroupParamUint(path, prefix+".max_usage_in_bytes") + if err != nil { + return err + } + hugetlbStats.MaxUsage = value + + value, err = fscommon.GetCgroupParamUint(path, prefix+".failcnt") + if err != nil { + return err + } + hugetlbStats.Failcnt = value + + stats.HugetlbStats[pageSize] = hugetlbStats + } + + return nil +} diff --git a/fs/hugetlb_test.go b/fs/hugetlb_test.go new file mode 100644 index 0000000..c37e3ec --- /dev/null +++ b/fs/hugetlb_test.go @@ -0,0 +1,176 @@ +package fs + +import ( + "fmt" + "strconv" + "testing" + + "github.com/opencontainers/cgroups" + "github.com/opencontainers/cgroups/fscommon" +) + +const ( + hugetlbUsageContents = "128\n" + hugetlbMaxUsageContents = "256\n" + hugetlbFailcnt = "100\n" +) + +const ( + usage = "hugetlb.%s.usage_in_bytes" + limit = "hugetlb.%s.limit_in_bytes" + maxUsage = "hugetlb.%s.max_usage_in_bytes" + failcnt = "hugetlb.%s.failcnt" + + rsvdUsage = "hugetlb.%s.rsvd.usage_in_bytes" + rsvdLimit = "hugetlb.%s.rsvd.limit_in_bytes" + rsvdMaxUsage = "hugetlb.%s.rsvd.max_usage_in_bytes" + rsvdFailcnt = 
"hugetlb.%s.rsvd.failcnt" +) + +func TestHugetlbSetHugetlb(t *testing.T) { + path := tempDir(t, "hugetlb") + + const ( + hugetlbBefore = 256 + hugetlbAfter = 512 + ) + + for _, pageSize := range cgroups.HugePageSizes() { + writeFileContents(t, path, map[string]string{ + fmt.Sprintf(limit, pageSize): strconv.Itoa(hugetlbBefore), + }) + } + + r := &cgroups.Resources{} + for _, pageSize := range cgroups.HugePageSizes() { + r.HugetlbLimit = []*cgroups.HugepageLimit{ + { + Pagesize: pageSize, + Limit: hugetlbAfter, + }, + } + hugetlb := &HugetlbGroup{} + if err := hugetlb.Set(path, r); err != nil { + t.Fatal(err) + } + } + + for _, pageSize := range cgroups.HugePageSizes() { + for _, f := range []string{limit, rsvdLimit} { + limit := fmt.Sprintf(f, pageSize) + value, err := fscommon.GetCgroupParamUint(path, limit) + if err != nil { + t.Fatal(err) + } + if value != hugetlbAfter { + t.Fatalf("Set %s failed. Expected: %v, Got: %v", limit, hugetlbAfter, value) + } + } + } +} + +func TestHugetlbStats(t *testing.T) { + path := tempDir(t, "hugetlb") + for _, pageSize := range cgroups.HugePageSizes() { + writeFileContents(t, path, map[string]string{ + fmt.Sprintf(usage, pageSize): hugetlbUsageContents, + fmt.Sprintf(maxUsage, pageSize): hugetlbMaxUsageContents, + fmt.Sprintf(failcnt, pageSize): hugetlbFailcnt, + }) + } + + hugetlb := &HugetlbGroup{} + actualStats := *cgroups.NewStats() + err := hugetlb.GetStats(path, &actualStats) + if err != nil { + t.Fatal(err) + } + expectedStats := cgroups.HugetlbStats{Usage: 128, MaxUsage: 256, Failcnt: 100} + for _, pageSize := range cgroups.HugePageSizes() { + expectHugetlbStatEquals(t, expectedStats, actualStats.HugetlbStats[pageSize]) + } +} + +func TestHugetlbRStatsRsvd(t *testing.T) { + path := tempDir(t, "hugetlb") + for _, pageSize := range cgroups.HugePageSizes() { + writeFileContents(t, path, map[string]string{ + fmt.Sprintf(rsvdUsage, pageSize): hugetlbUsageContents, + fmt.Sprintf(rsvdMaxUsage, pageSize): 
hugetlbMaxUsageContents, + fmt.Sprintf(rsvdFailcnt, pageSize): hugetlbFailcnt, + }) + } + + hugetlb := &HugetlbGroup{} + actualStats := *cgroups.NewStats() + err := hugetlb.GetStats(path, &actualStats) + if err != nil { + t.Fatal(err) + } + expectedStats := cgroups.HugetlbStats{Usage: 128, MaxUsage: 256, Failcnt: 100} + for _, pageSize := range cgroups.HugePageSizes() { + expectHugetlbStatEquals(t, expectedStats, actualStats.HugetlbStats[pageSize]) + } +} + +func TestHugetlbStatsNoUsageFile(t *testing.T) { + path := tempDir(t, "hugetlb") + writeFileContents(t, path, map[string]string{ + maxUsage: hugetlbMaxUsageContents, + }) + + hugetlb := &HugetlbGroup{} + actualStats := *cgroups.NewStats() + err := hugetlb.GetStats(path, &actualStats) + if err == nil { + t.Fatal("Expected failure") + } +} + +func TestHugetlbStatsNoMaxUsageFile(t *testing.T) { + path := tempDir(t, "hugetlb") + for _, pageSize := range cgroups.HugePageSizes() { + writeFileContents(t, path, map[string]string{ + fmt.Sprintf(usage, pageSize): hugetlbUsageContents, + }) + } + + hugetlb := &HugetlbGroup{} + actualStats := *cgroups.NewStats() + err := hugetlb.GetStats(path, &actualStats) + if err == nil { + t.Fatal("Expected failure") + } +} + +func TestHugetlbStatsBadUsageFile(t *testing.T) { + path := tempDir(t, "hugetlb") + for _, pageSize := range cgroups.HugePageSizes() { + writeFileContents(t, path, map[string]string{ + fmt.Sprintf(usage, pageSize): "bad", + maxUsage: hugetlbMaxUsageContents, + }) + } + + hugetlb := &HugetlbGroup{} + actualStats := *cgroups.NewStats() + err := hugetlb.GetStats(path, &actualStats) + if err == nil { + t.Fatal("Expected failure") + } +} + +func TestHugetlbStatsBadMaxUsageFile(t *testing.T) { + path := tempDir(t, "hugetlb") + writeFileContents(t, path, map[string]string{ + usage: hugetlbUsageContents, + maxUsage: "bad", + }) + + hugetlb := &HugetlbGroup{} + actualStats := *cgroups.NewStats() + err := hugetlb.GetStats(path, &actualStats) + if err == nil { + 
t.Fatal("Expected failure") + } +} diff --git a/fs/memory.go b/fs/memory.go new file mode 100644 index 0000000..d92f232 --- /dev/null +++ b/fs/memory.go @@ -0,0 +1,356 @@ +package fs + +import ( + "bufio" + "errors" + "fmt" + "math" + "os" + "path/filepath" + "strconv" + "strings" + + "golang.org/x/sys/unix" + + "github.com/opencontainers/cgroups" + "github.com/opencontainers/cgroups/fscommon" +) + +const ( + cgroupMemorySwapLimit = "memory.memsw.limit_in_bytes" + cgroupMemoryLimit = "memory.limit_in_bytes" + cgroupMemoryUsage = "memory.usage_in_bytes" + cgroupMemoryMaxUsage = "memory.max_usage_in_bytes" +) + +type MemoryGroup struct{} + +func (s *MemoryGroup) Name() string { + return "memory" +} + +func (s *MemoryGroup) Apply(path string, _ *cgroups.Resources, pid int) error { + return apply(path, pid) +} + +func setMemory(path string, val int64) error { + if val == 0 { + return nil + } + + err := cgroups.WriteFile(path, cgroupMemoryLimit, strconv.FormatInt(val, 10)) + if !errors.Is(err, unix.EBUSY) { + return err + } + + // EBUSY means the kernel can't set new limit as it's too low + // (lower than the current usage). Return more specific error. + usage, err := fscommon.GetCgroupParamUint(path, cgroupMemoryUsage) + if err != nil { + return err + } + max, err := fscommon.GetCgroupParamUint(path, cgroupMemoryMaxUsage) + if err != nil { + return err + } + + return fmt.Errorf("unable to set memory limit to %d (current usage: %d, peak usage: %d)", val, usage, max) +} + +func setSwap(path string, val int64) error { + if val == 0 { + return nil + } + + return cgroups.WriteFile(path, cgroupMemorySwapLimit, strconv.FormatInt(val, 10)) +} + +func setMemoryAndSwap(path string, r *cgroups.Resources) error { + // If the memory update is set to -1 and the swap is not explicitly + // set, we should also set swap to -1, it means unlimited memory. 
+ if r.Memory == -1 && r.MemorySwap == 0 { + // Only set swap if it's enabled in kernel + if cgroups.PathExists(filepath.Join(path, cgroupMemorySwapLimit)) { + r.MemorySwap = -1 + } + } + + // When memory and swap memory are both set, we need to handle the cases + // for updating container. + if r.Memory != 0 && r.MemorySwap != 0 { + curLimit, err := fscommon.GetCgroupParamUint(path, cgroupMemoryLimit) + if err != nil { + return err + } + + // When update memory limit, we should adapt the write sequence + // for memory and swap memory, so it won't fail because the new + // value and the old value don't fit kernel's validation. + if r.MemorySwap == -1 || curLimit < uint64(r.MemorySwap) { + if err := setSwap(path, r.MemorySwap); err != nil { + return err + } + if err := setMemory(path, r.Memory); err != nil { + return err + } + return nil + } + } + + if err := setMemory(path, r.Memory); err != nil { + return err + } + if err := setSwap(path, r.MemorySwap); err != nil { + return err + } + + return nil +} + +func (s *MemoryGroup) Set(path string, r *cgroups.Resources) error { + if err := setMemoryAndSwap(path, r); err != nil { + return err + } + + // ignore KernelMemory and KernelMemoryTCP + + if r.MemoryReservation != 0 { + if err := cgroups.WriteFile(path, "memory.soft_limit_in_bytes", strconv.FormatInt(r.MemoryReservation, 10)); err != nil { + return err + } + } + + if r.OomKillDisable { + if err := cgroups.WriteFile(path, "memory.oom_control", "1"); err != nil { + return err + } + } + if r.MemorySwappiness == nil || int64(*r.MemorySwappiness) == -1 { + return nil + } else if *r.MemorySwappiness <= 100 { + if err := cgroups.WriteFile(path, "memory.swappiness", strconv.FormatUint(*r.MemorySwappiness, 10)); err != nil { + return err + } + } else { + return fmt.Errorf("invalid memory swappiness value: %d (valid range is 0-100)", *r.MemorySwappiness) + } + + return nil +} + +func (s *MemoryGroup) GetStats(path string, stats *cgroups.Stats) error { + const file = 
"memory.stat" + statsFile, err := cgroups.OpenFile(path, file, os.O_RDONLY) + if err != nil { + if os.IsNotExist(err) { + return nil + } + return err + } + defer statsFile.Close() + + sc := bufio.NewScanner(statsFile) + for sc.Scan() { + t, v, err := fscommon.ParseKeyValue(sc.Text()) + if err != nil { + return &parseError{Path: path, File: file, Err: err} + } + stats.MemoryStats.Stats[t] = v + } + stats.MemoryStats.Cache = stats.MemoryStats.Stats["cache"] + + memoryUsage, err := getMemoryData(path, "") + if err != nil { + return err + } + stats.MemoryStats.Usage = memoryUsage + swapUsage, err := getMemoryData(path, "memsw") + if err != nil { + return err + } + stats.MemoryStats.SwapUsage = swapUsage + stats.MemoryStats.SwapOnlyUsage = cgroups.MemoryData{ + Usage: swapUsage.Usage - memoryUsage.Usage, + Failcnt: swapUsage.Failcnt - memoryUsage.Failcnt, + } + kernelUsage, err := getMemoryData(path, "kmem") + if err != nil { + return err + } + stats.MemoryStats.KernelUsage = kernelUsage + kernelTCPUsage, err := getMemoryData(path, "kmem.tcp") + if err != nil { + return err + } + stats.MemoryStats.KernelTCPUsage = kernelTCPUsage + + value, err := fscommon.GetCgroupParamUint(path, "memory.use_hierarchy") + if err != nil { + return err + } + if value == 1 { + stats.MemoryStats.UseHierarchy = true + } + + pagesByNUMA, err := getPageUsageByNUMA(path) + if err != nil { + return err + } + stats.MemoryStats.PageUsageByNUMA = pagesByNUMA + + return nil +} + +func getMemoryData(path, name string) (cgroups.MemoryData, error) { + memoryData := cgroups.MemoryData{} + + moduleName := "memory" + if name != "" { + moduleName = "memory." 
+ name + } + var ( + usage = moduleName + ".usage_in_bytes" + maxUsage = moduleName + ".max_usage_in_bytes" + failcnt = moduleName + ".failcnt" + limit = moduleName + ".limit_in_bytes" + ) + + value, err := fscommon.GetCgroupParamUint(path, usage) + if err != nil { + if name != "" && os.IsNotExist(err) { + // Ignore ENOENT as swap and kmem controllers + // are optional in the kernel. + return cgroups.MemoryData{}, nil + } + return cgroups.MemoryData{}, err + } + memoryData.Usage = value + value, err = fscommon.GetCgroupParamUint(path, maxUsage) + if err != nil { + return cgroups.MemoryData{}, err + } + memoryData.MaxUsage = value + value, err = fscommon.GetCgroupParamUint(path, failcnt) + if err != nil { + return cgroups.MemoryData{}, err + } + memoryData.Failcnt = value + value, err = fscommon.GetCgroupParamUint(path, limit) + if err != nil { + if name == "kmem" && os.IsNotExist(err) { + // Ignore ENOENT as kmem.limit_in_bytes has + // been removed in newer kernels. + return memoryData, nil + } + + return cgroups.MemoryData{}, err + } + memoryData.Limit = value + + return memoryData, nil +} + +func getPageUsageByNUMA(path string) (cgroups.PageUsageByNUMA, error) { + const ( + maxColumns = math.MaxUint8 + 1 + file = "memory.numa_stat" + ) + stats := cgroups.PageUsageByNUMA{} + + fd, err := cgroups.OpenFile(path, file, os.O_RDONLY) + if os.IsNotExist(err) { + return stats, nil + } else if err != nil { + return stats, err + } + defer fd.Close() + + // File format is documented in linux/Documentation/cgroup-v1/memory.txt + // and it looks like this: + // + // total= N0= N1= ... + // file= N0= N1= ... + // anon= N0= N1= ... + // unevictable= N0= N1= ... + // hierarchical_= N0= N1= ... 
+ + scanner := bufio.NewScanner(fd) + for scanner.Scan() { + var field *cgroups.PageStats + + line := scanner.Text() + columns := strings.SplitN(line, " ", maxColumns) + for i, column := range columns { + key, val, ok := strings.Cut(column, "=") + // Some custom kernels have non-standard fields, like + // numa_locality 0 0 0 0 0 0 0 0 0 0 + // numa_exectime 0 + if !ok { + if i == 0 { + // Ignore/skip those. + break + } else { + // The first column was already validated, + // so be strict to the rest. + return stats, malformedLine(path, file, line) + } + } + if i == 0 { // First column: key is name, val is total. + field = getNUMAField(&stats, key) + if field == nil { // unknown field (new kernel?) + break + } + field.Total, err = strconv.ParseUint(val, 0, 64) + if err != nil { + return stats, &parseError{Path: path, File: file, Err: err} + } + field.Nodes = map[uint8]uint64{} + } else { // Subsequent columns: key is N, val is usage. + if len(key) < 2 || key[0] != 'N' { + // This is definitely an error. 
+ return stats, malformedLine(path, file, line) + } + + n, err := strconv.ParseUint(key[1:], 10, 8) + if err != nil { + return stats, &parseError{Path: path, File: file, Err: err} + } + + usage, err := strconv.ParseUint(val, 10, 64) + if err != nil { + return stats, &parseError{Path: path, File: file, Err: err} + } + + field.Nodes[uint8(n)] = usage + } + + } + } + if err := scanner.Err(); err != nil { + return cgroups.PageUsageByNUMA{}, &parseError{Path: path, File: file, Err: err} + } + + return stats, nil +} + +func getNUMAField(stats *cgroups.PageUsageByNUMA, name string) *cgroups.PageStats { + switch name { + case "total": + return &stats.Total + case "file": + return &stats.File + case "anon": + return &stats.Anon + case "unevictable": + return &stats.Unevictable + case "hierarchical_total": + return &stats.Hierarchical.Total + case "hierarchical_file": + return &stats.Hierarchical.File + case "hierarchical_anon": + return &stats.Hierarchical.Anon + case "hierarchical_unevictable": + return &stats.Hierarchical.Unevictable + } + return nil +} diff --git a/fs/memory_test.go b/fs/memory_test.go new file mode 100644 index 0000000..c94279e --- /dev/null +++ b/fs/memory_test.go @@ -0,0 +1,506 @@ +package fs + +import ( + "strconv" + "testing" + + "github.com/opencontainers/cgroups" + "github.com/opencontainers/cgroups/fscommon" +) + +const ( + memoryStatContents = `cache 512 +rss 1024` + memoryUsageContents = "2048\n" + memoryMaxUsageContents = "4096\n" + memoryFailcnt = "100\n" + memoryLimitContents = "8192\n" + memoryUseHierarchyContents = "1\n" + memoryNUMAStatContents = `total=44611 N0=32631 N1=7501 N2=1982 N3=2497 +file=44428 N0=32614 N1=7335 N2=1982 N3=2497 +anon=183 N0=17 N1=166 N2=0 N3=0 +unevictable=0 N0=0 N1=0 N2=0 N3=0 +hierarchical_total=768133 N0=509113 N1=138887 N2=20464 N3=99669 +hierarchical_file=722017 N0=496516 N1=119997 N2=20181 N3=85323 +hierarchical_anon=46096 N0=12597 N1=18890 N2=283 N3=14326 +hierarchical_unevictable=20 N0=0 N1=0 N2=0 N3=20 +` 
+ memoryNUMAStatNoHierarchyContents = `total=44611 N0=32631 N1=7501 N2=1982 N3=2497 +file=44428 N0=32614 N1=7335 N2=1982 N3=2497 +anon=183 N0=17 N1=166 N2=0 N3=0 +unevictable=0 N0=0 N1=0 N2=0 N3=0 +` + // Some custom kernels has extra fields that should be ignored + memoryNUMAStatExtraContents = `numa_locality 0 0 0 0 0 0 0 0 0 0 +numa_exectime 0 +whatever=100 N0=0 +` +) + +func TestMemorySetMemory(t *testing.T) { + path := tempDir(t, "memory") + + const ( + memoryBefore = 314572800 // 300M + memoryAfter = 524288000 // 500M + reservationBefore = 209715200 // 200M + reservationAfter = 314572800 // 300M + ) + + writeFileContents(t, path, map[string]string{ + "memory.limit_in_bytes": strconv.Itoa(memoryBefore), + "memory.soft_limit_in_bytes": strconv.Itoa(reservationBefore), + }) + + r := &cgroups.Resources{ + Memory: memoryAfter, + MemoryReservation: reservationAfter, + } + memory := &MemoryGroup{} + if err := memory.Set(path, r); err != nil { + t.Fatal(err) + } + + value, err := fscommon.GetCgroupParamUint(path, "memory.limit_in_bytes") + if err != nil { + t.Fatal(err) + } + if value != memoryAfter { + t.Fatal("Got the wrong value, set memory.limit_in_bytes failed.") + } + + value, err = fscommon.GetCgroupParamUint(path, "memory.soft_limit_in_bytes") + if err != nil { + t.Fatal(err) + } + if value != reservationAfter { + t.Fatal("Got the wrong value, set memory.soft_limit_in_bytes failed.") + } +} + +func TestMemorySetMemoryswap(t *testing.T) { + path := tempDir(t, "memory") + + const ( + memoryswapBefore = 314572800 // 300M + memoryswapAfter = 524288000 // 500M + ) + + writeFileContents(t, path, map[string]string{ + "memory.memsw.limit_in_bytes": strconv.Itoa(memoryswapBefore), + }) + + r := &cgroups.Resources{ + MemorySwap: memoryswapAfter, + } + memory := &MemoryGroup{} + if err := memory.Set(path, r); err != nil { + t.Fatal(err) + } + + value, err := fscommon.GetCgroupParamUint(path, "memory.memsw.limit_in_bytes") + if err != nil { + t.Fatal(err) + } + if value 
!= memoryswapAfter { + t.Fatal("Got the wrong value, set memory.memsw.limit_in_bytes failed.") + } +} + +func TestMemorySetMemoryLargerThanSwap(t *testing.T) { + path := tempDir(t, "memory") + + const ( + memoryBefore = 314572800 // 300M + memoryswapBefore = 524288000 // 500M + memoryAfter = 629145600 // 600M + memoryswapAfter = 838860800 // 800M + ) + + writeFileContents(t, path, map[string]string{ + "memory.limit_in_bytes": strconv.Itoa(memoryBefore), + "memory.memsw.limit_in_bytes": strconv.Itoa(memoryswapBefore), + // Set will call getMemoryData when memory and swap memory are + // both set, fake these fields so we don't get error. + "memory.usage_in_bytes": "0", + "memory.max_usage_in_bytes": "0", + "memory.failcnt": "0", + }) + + r := &cgroups.Resources{ + Memory: memoryAfter, + MemorySwap: memoryswapAfter, + } + memory := &MemoryGroup{} + if err := memory.Set(path, r); err != nil { + t.Fatal(err) + } + + value, err := fscommon.GetCgroupParamUint(path, "memory.limit_in_bytes") + if err != nil { + t.Fatal(err) + } + if value != memoryAfter { + t.Fatal("Got the wrong value, set memory.limit_in_bytes failed.") + } + + value, err = fscommon.GetCgroupParamUint(path, "memory.memsw.limit_in_bytes") + if err != nil { + t.Fatal(err) + } + if value != memoryswapAfter { + t.Fatal("Got the wrong value, set memory.memsw.limit_in_bytes failed.") + } +} + +func TestMemorySetSwapSmallerThanMemory(t *testing.T) { + path := tempDir(t, "memory") + + const ( + memoryBefore = 629145600 // 600M + memoryswapBefore = 838860800 // 800M + memoryAfter = 314572800 // 300M + memoryswapAfter = 524288000 // 500M + ) + + writeFileContents(t, path, map[string]string{ + "memory.limit_in_bytes": strconv.Itoa(memoryBefore), + "memory.memsw.limit_in_bytes": strconv.Itoa(memoryswapBefore), + }) + + r := &cgroups.Resources{ + Memory: memoryAfter, + MemorySwap: memoryswapAfter, + } + memory := &MemoryGroup{} + if err := memory.Set(path, r); err != nil { + t.Fatal(err) + } + + value, err := 
fscommon.GetCgroupParamUint(path, "memory.limit_in_bytes") + if err != nil { + t.Fatal(err) + } + if value != memoryAfter { + t.Fatalf("Got the wrong value (%d != %d), set memory.limit_in_bytes failed", value, memoryAfter) + } + + value, err = fscommon.GetCgroupParamUint(path, "memory.memsw.limit_in_bytes") + if err != nil { + t.Fatal(err) + } + if value != memoryswapAfter { + t.Fatalf("Got the wrong value (%d != %d), set memory.memsw.limit_in_bytes failed", value, memoryswapAfter) + } +} + +func TestMemorySetMemorySwappinessDefault(t *testing.T) { + path := tempDir(t, "memory") + + swappinessBefore := 60 // default is 60 + swappinessAfter := uint64(0) + + writeFileContents(t, path, map[string]string{ + "memory.swappiness": strconv.Itoa(swappinessBefore), + }) + + r := &cgroups.Resources{ + MemorySwappiness: &swappinessAfter, + } + memory := &MemoryGroup{} + if err := memory.Set(path, r); err != nil { + t.Fatal(err) + } + + value, err := fscommon.GetCgroupParamUint(path, "memory.swappiness") + if err != nil { + t.Fatal(err) + } + if value != swappinessAfter { + t.Fatalf("Got the wrong value (%d), set memory.swappiness = %d failed.", value, swappinessAfter) + } +} + +func TestMemoryStats(t *testing.T) { + path := tempDir(t, "memory") + writeFileContents(t, path, map[string]string{ + "memory.stat": memoryStatContents, + "memory.usage_in_bytes": memoryUsageContents, + "memory.limit_in_bytes": memoryLimitContents, + "memory.max_usage_in_bytes": memoryMaxUsageContents, + "memory.failcnt": memoryFailcnt, + "memory.memsw.usage_in_bytes": memoryUsageContents, + "memory.memsw.max_usage_in_bytes": memoryMaxUsageContents, + "memory.memsw.failcnt": memoryFailcnt, + "memory.memsw.limit_in_bytes": memoryLimitContents, + "memory.kmem.usage_in_bytes": memoryUsageContents, + "memory.kmem.max_usage_in_bytes": memoryMaxUsageContents, + "memory.kmem.failcnt": memoryFailcnt, + "memory.kmem.limit_in_bytes": memoryLimitContents, + "memory.use_hierarchy": memoryUseHierarchyContents, + 
"memory.numa_stat": memoryNUMAStatContents + memoryNUMAStatExtraContents, + }) + + memory := &MemoryGroup{} + actualStats := *cgroups.NewStats() + err := memory.GetStats(path, &actualStats) + if err != nil { + t.Fatal(err) + } + expectedStats := cgroups.MemoryStats{ + Cache: 512, + Usage: cgroups.MemoryData{Usage: 2048, MaxUsage: 4096, Failcnt: 100, Limit: 8192}, + SwapUsage: cgroups.MemoryData{Usage: 2048, MaxUsage: 4096, Failcnt: 100, Limit: 8192}, + SwapOnlyUsage: cgroups.MemoryData{Usage: 0, MaxUsage: 0, Failcnt: 0, Limit: 0}, + KernelUsage: cgroups.MemoryData{Usage: 2048, MaxUsage: 4096, Failcnt: 100, Limit: 8192}, + Stats: map[string]uint64{"cache": 512, "rss": 1024}, + UseHierarchy: true, + PageUsageByNUMA: cgroups.PageUsageByNUMA{ + PageUsageByNUMAInner: cgroups.PageUsageByNUMAInner{ + Total: cgroups.PageStats{Total: 44611, Nodes: map[uint8]uint64{0: 32631, 1: 7501, 2: 1982, 3: 2497}}, + File: cgroups.PageStats{Total: 44428, Nodes: map[uint8]uint64{0: 32614, 1: 7335, 2: 1982, 3: 2497}}, + Anon: cgroups.PageStats{Total: 183, Nodes: map[uint8]uint64{0: 17, 1: 166, 2: 0, 3: 0}}, + Unevictable: cgroups.PageStats{Total: 0, Nodes: map[uint8]uint64{0: 0, 1: 0, 2: 0, 3: 0}}, + }, + Hierarchical: cgroups.PageUsageByNUMAInner{ + Total: cgroups.PageStats{Total: 768133, Nodes: map[uint8]uint64{0: 509113, 1: 138887, 2: 20464, 3: 99669}}, + File: cgroups.PageStats{Total: 722017, Nodes: map[uint8]uint64{0: 496516, 1: 119997, 2: 20181, 3: 85323}}, + Anon: cgroups.PageStats{Total: 46096, Nodes: map[uint8]uint64{0: 12597, 1: 18890, 2: 283, 3: 14326}}, + Unevictable: cgroups.PageStats{Total: 20, Nodes: map[uint8]uint64{0: 0, 1: 0, 2: 0, 3: 20}}, + }, + }, + } + expectMemoryStatEquals(t, expectedStats, actualStats.MemoryStats) +} + +func TestMemoryStatsNoStatFile(t *testing.T) { + path := tempDir(t, "memory") + writeFileContents(t, path, map[string]string{ + "memory.usage_in_bytes": memoryUsageContents, + "memory.max_usage_in_bytes": memoryMaxUsageContents, + 
"memory.limit_in_bytes": memoryLimitContents, + }) + + memory := &MemoryGroup{} + actualStats := *cgroups.NewStats() + err := memory.GetStats(path, &actualStats) + if err != nil { + t.Fatal(err) + } +} + +func TestMemoryStatsNoUsageFile(t *testing.T) { + path := tempDir(t, "memory") + writeFileContents(t, path, map[string]string{ + "memory.stat": memoryStatContents, + "memory.max_usage_in_bytes": memoryMaxUsageContents, + "memory.limit_in_bytes": memoryLimitContents, + }) + + memory := &MemoryGroup{} + actualStats := *cgroups.NewStats() + err := memory.GetStats(path, &actualStats) + if err == nil { + t.Fatal("Expected failure") + } +} + +func TestMemoryStatsNoMaxUsageFile(t *testing.T) { + path := tempDir(t, "memory") + writeFileContents(t, path, map[string]string{ + "memory.stat": memoryStatContents, + "memory.usage_in_bytes": memoryUsageContents, + "memory.limit_in_bytes": memoryLimitContents, + }) + + memory := &MemoryGroup{} + actualStats := *cgroups.NewStats() + err := memory.GetStats(path, &actualStats) + if err == nil { + t.Fatal("Expected failure") + } +} + +func TestMemoryStatsNoLimitInBytesFile(t *testing.T) { + path := tempDir(t, "memory") + writeFileContents(t, path, map[string]string{ + "memory.stat": memoryStatContents, + "memory.usage_in_bytes": memoryUsageContents, + "memory.max_usage_in_bytes": memoryMaxUsageContents, + }) + + memory := &MemoryGroup{} + actualStats := *cgroups.NewStats() + err := memory.GetStats(path, &actualStats) + if err == nil { + t.Fatal("Expected failure") + } +} + +func TestMemoryStatsBadStatFile(t *testing.T) { + path := tempDir(t, "memory") + writeFileContents(t, path, map[string]string{ + "memory.stat": "rss rss", + "memory.usage_in_bytes": memoryUsageContents, + "memory.max_usage_in_bytes": memoryMaxUsageContents, + "memory.limit_in_bytes": memoryLimitContents, + }) + + memory := &MemoryGroup{} + actualStats := *cgroups.NewStats() + err := memory.GetStats(path, &actualStats) + if err == nil { + t.Fatal("Expected 
failure") + } +} + +func TestMemoryStatsBadUsageFile(t *testing.T) { + path := tempDir(t, "memory") + writeFileContents(t, path, map[string]string{ + "memory.stat": memoryStatContents, + "memory.usage_in_bytes": "bad", + "memory.max_usage_in_bytes": memoryMaxUsageContents, + "memory.limit_in_bytes": memoryLimitContents, + }) + + memory := &MemoryGroup{} + actualStats := *cgroups.NewStats() + err := memory.GetStats(path, &actualStats) + if err == nil { + t.Fatal("Expected failure") + } +} + +func TestMemoryStatsBadMaxUsageFile(t *testing.T) { + path := tempDir(t, "memory") + writeFileContents(t, path, map[string]string{ + "memory.stat": memoryStatContents, + "memory.usage_in_bytes": memoryUsageContents, + "memory.max_usage_in_bytes": "bad", + "memory.limit_in_bytes": memoryLimitContents, + }) + + memory := &MemoryGroup{} + actualStats := *cgroups.NewStats() + err := memory.GetStats(path, &actualStats) + if err == nil { + t.Fatal("Expected failure") + } +} + +func TestMemoryStatsBadLimitInBytesFile(t *testing.T) { + path := tempDir(t, "memory") + writeFileContents(t, path, map[string]string{ + "memory.stat": memoryStatContents, + "memory.usage_in_bytes": memoryUsageContents, + "memory.max_usage_in_bytes": memoryMaxUsageContents, + "memory.limit_in_bytes": "bad", + }) + + memory := &MemoryGroup{} + actualStats := *cgroups.NewStats() + err := memory.GetStats(path, &actualStats) + if err == nil { + t.Fatal("Expected failure") + } +} + +func TestMemorySetOomControl(t *testing.T) { + path := tempDir(t, "memory") + + const ( + oomKillDisable = 1 // disable oom killer, default is 0 + ) + + writeFileContents(t, path, map[string]string{ + "memory.oom_control": strconv.Itoa(oomKillDisable), + }) + + memory := &MemoryGroup{} + r := &cgroups.Resources{} + if err := memory.Set(path, r); err != nil { + t.Fatal(err) + } + + value, err := fscommon.GetCgroupParamUint(path, "memory.oom_control") + if err != nil { + t.Fatal(err) + } + if value != oomKillDisable { + t.Fatalf("Got the 
wrong value, set memory.oom_control failed.") + } +} + +func TestNoHierarchicalNumaStat(t *testing.T) { + path := tempDir(t, "memory") + writeFileContents(t, path, map[string]string{ + "memory.numa_stat": memoryNUMAStatNoHierarchyContents + memoryNUMAStatExtraContents, + }) + + actualStats, err := getPageUsageByNUMA(path) + if err != nil { + t.Fatal(err) + } + pageUsageByNUMA := cgroups.PageUsageByNUMA{ + PageUsageByNUMAInner: cgroups.PageUsageByNUMAInner{ + Total: cgroups.PageStats{Total: 44611, Nodes: map[uint8]uint64{0: 32631, 1: 7501, 2: 1982, 3: 2497}}, + File: cgroups.PageStats{Total: 44428, Nodes: map[uint8]uint64{0: 32614, 1: 7335, 2: 1982, 3: 2497}}, + Anon: cgroups.PageStats{Total: 183, Nodes: map[uint8]uint64{0: 17, 1: 166, 2: 0, 3: 0}}, + Unevictable: cgroups.PageStats{Total: 0, Nodes: map[uint8]uint64{0: 0, 1: 0, 2: 0, 3: 0}}, + }, + Hierarchical: cgroups.PageUsageByNUMAInner{}, + } + expectPageUsageByNUMAEquals(t, pageUsageByNUMA, actualStats) +} + +func TestBadNumaStat(t *testing.T) { + memoryNUMAStatBadContents := []struct { + desc, contents string + }{ + { + desc: "Nx where x is not a number", + contents: `total=44611 N0=44611, +file=44428 Nx=0 +`, + }, { + desc: "Nx where x > 255", + contents: `total=44611 N333=444`, + }, { + desc: "Nx argument missing", + contents: `total=44611 N0=123 N1=`, + }, { + desc: "Nx argument is not a number", + contents: `total=44611 N0=123 N1=a`, + }, { + desc: "Missing = after Nx", + contents: `total=44611 N0=123 N1`, + }, { + desc: "No Nx at non-first position", + contents: `total=44611 N0=32631 +file=44428 N0=32614 +anon=183 N0=12 badone +`, + }, + } + path := tempDir(t, "memory") + for _, c := range memoryNUMAStatBadContents { + writeFileContents(t, path, map[string]string{ + "memory.numa_stat": c.contents, + }) + + _, err := getPageUsageByNUMA(path) + if err == nil { + t.Errorf("case %q: expected error, got nil", c.desc) + } + } +} + +func TestWithoutNumaStat(t *testing.T) { + path := tempDir(t, "memory") + + 
actualStats, err := getPageUsageByNUMA(path) + if err != nil { + t.Fatal(err) + } + expectPageUsageByNUMAEquals(t, cgroups.PageUsageByNUMA{}, actualStats) +} diff --git a/fs/name.go b/fs/name.go new file mode 100644 index 0000000..2864351 --- /dev/null +++ b/fs/name.go @@ -0,0 +1,30 @@ +package fs + +import ( + "github.com/opencontainers/cgroups" +) + +type NameGroup struct { + GroupName string + Join bool +} + +func (s *NameGroup) Name() string { + return s.GroupName +} + +func (s *NameGroup) Apply(path string, _ *cgroups.Resources, pid int) error { + if s.Join { + // Ignore errors if the named cgroup does not exist. + _ = apply(path, pid) + } + return nil +} + +func (s *NameGroup) Set(_ string, _ *cgroups.Resources) error { + return nil +} + +func (s *NameGroup) GetStats(path string, stats *cgroups.Stats) error { + return nil +} diff --git a/fs/net_cls.go b/fs/net_cls.go new file mode 100644 index 0000000..2bd6c5a --- /dev/null +++ b/fs/net_cls.go @@ -0,0 +1,31 @@ +package fs + +import ( + "strconv" + + "github.com/opencontainers/cgroups" +) + +type NetClsGroup struct{} + +func (s *NetClsGroup) Name() string { + return "net_cls" +} + +func (s *NetClsGroup) Apply(path string, _ *cgroups.Resources, pid int) error { + return apply(path, pid) +} + +func (s *NetClsGroup) Set(path string, r *cgroups.Resources) error { + if r.NetClsClassid != 0 { + if err := cgroups.WriteFile(path, "net_cls.classid", strconv.FormatUint(uint64(r.NetClsClassid), 10)); err != nil { + return err + } + } + + return nil +} + +func (s *NetClsGroup) GetStats(path string, stats *cgroups.Stats) error { + return nil +} diff --git a/fs/net_cls_test.go b/fs/net_cls_test.go new file mode 100644 index 0000000..2252cdd --- /dev/null +++ b/fs/net_cls_test.go @@ -0,0 +1,41 @@ +package fs + +import ( + "strconv" + "testing" + + "github.com/opencontainers/cgroups" + "github.com/opencontainers/cgroups/fscommon" +) + +const ( + classidBefore = 0x100002 + classidAfter = 0x100001 +) + +func 
TestNetClsSetClassid(t *testing.T) { + path := tempDir(t, "net_cls") + + writeFileContents(t, path, map[string]string{ + "net_cls.classid": strconv.FormatUint(classidBefore, 10), + }) + + r := &cgroups.Resources{ + NetClsClassid: classidAfter, + } + netcls := &NetClsGroup{} + if err := netcls.Set(path, r); err != nil { + t.Fatal(err) + } + + // As we are in mock environment, we can't get correct value of classid from + // net_cls.classid. + // So. we just judge if we successfully write classid into file + value, err := fscommon.GetCgroupParamUint(path, "net_cls.classid") + if err != nil { + t.Fatal(err) + } + if value != classidAfter { + t.Fatal("Got the wrong value, set net_cls.classid failed.") + } +} diff --git a/fs/net_prio.go b/fs/net_prio.go new file mode 100644 index 0000000..b51682b --- /dev/null +++ b/fs/net_prio.go @@ -0,0 +1,29 @@ +package fs + +import ( + "github.com/opencontainers/cgroups" +) + +type NetPrioGroup struct{} + +func (s *NetPrioGroup) Name() string { + return "net_prio" +} + +func (s *NetPrioGroup) Apply(path string, _ *cgroups.Resources, pid int) error { + return apply(path, pid) +} + +func (s *NetPrioGroup) Set(path string, r *cgroups.Resources) error { + for _, prioMap := range r.NetPrioIfpriomap { + if err := cgroups.WriteFile(path, "net_prio.ifpriomap", prioMap.CgroupString()); err != nil { + return err + } + } + + return nil +} + +func (s *NetPrioGroup) GetStats(path string, stats *cgroups.Stats) error { + return nil +} diff --git a/fs/net_prio_test.go b/fs/net_prio_test.go new file mode 100644 index 0000000..1a82be4 --- /dev/null +++ b/fs/net_prio_test.go @@ -0,0 +1,36 @@ +package fs + +import ( + "strings" + "testing" + + "github.com/opencontainers/cgroups" + "github.com/opencontainers/cgroups/fscommon" +) + +var prioMap = []*cgroups.IfPrioMap{ + { + Interface: "test", + Priority: 5, + }, +} + +func TestNetPrioSetIfPrio(t *testing.T) { + path := tempDir(t, "net_prio") + + r := &cgroups.Resources{ + NetPrioIfpriomap: prioMap, + } + 
netPrio := &NetPrioGroup{} + if err := netPrio.Set(path, r); err != nil { + t.Fatal(err) + } + + value, err := fscommon.GetCgroupParamString(path, "net_prio.ifpriomap") + if err != nil { + t.Fatal(err) + } + if !strings.Contains(value, "test 5") { + t.Fatal("Got the wrong value, set net_prio.ifpriomap failed.") + } +} diff --git a/fs/paths.go b/fs/paths.go new file mode 100644 index 0000000..edbe041 --- /dev/null +++ b/fs/paths.go @@ -0,0 +1,169 @@ +package fs + +import ( + "errors" + "os" + "path/filepath" + "sync" + + "golang.org/x/sys/unix" + + "github.com/opencontainers/cgroups" + "github.com/opencontainers/cgroups/internal/path" +) + +// The absolute path to the root of the cgroup hierarchies. +var ( + cgroupRootLock sync.Mutex + cgroupRoot string +) + +const defaultCgroupRoot = "/sys/fs/cgroup" + +func initPaths(cg *cgroups.Cgroup) (map[string]string, error) { + root, err := rootPath() + if err != nil { + return nil, err + } + + inner, err := path.Inner(cg) + if err != nil { + return nil, err + } + + paths := make(map[string]string) + for _, sys := range subsystems { + name := sys.Name() + path, err := subsysPath(root, inner, name) + if err != nil { + // The non-presence of the devices subsystem + // is considered fatal for security reasons. + if cgroups.IsNotFound(err) && (cg.SkipDevices || name != "devices") { + continue + } + + return nil, err + } + paths[name] = path + } + + return paths, nil +} + +func tryDefaultCgroupRoot() string { + var st, pst unix.Stat_t + + // (1) it should be a directory... + err := unix.Lstat(defaultCgroupRoot, &st) + if err != nil || st.Mode&unix.S_IFDIR == 0 { + return "" + } + + // (2) ... and a mount point ... + err = unix.Lstat(filepath.Dir(defaultCgroupRoot), &pst) + if err != nil { + return "" + } + + if st.Dev == pst.Dev { + // parent dir has the same dev -- not a mount point + return "" + } + + // (3) ... of 'tmpfs' fs type. 
+ var fst unix.Statfs_t + err = unix.Statfs(defaultCgroupRoot, &fst) + if err != nil || fst.Type != unix.TMPFS_MAGIC { + return "" + } + + // (4) it should have at least 1 entry ... + dir, err := os.Open(defaultCgroupRoot) + if err != nil { + return "" + } + defer dir.Close() + names, err := dir.Readdirnames(1) + if err != nil { + return "" + } + if len(names) < 1 { + return "" + } + // ... which is a cgroup mount point. + err = unix.Statfs(filepath.Join(defaultCgroupRoot, names[0]), &fst) + if err != nil || fst.Type != unix.CGROUP_SUPER_MAGIC { + return "" + } + + return defaultCgroupRoot +} + +// rootPath finds and returns path to the root of the cgroup hierarchies. +func rootPath() (string, error) { + cgroupRootLock.Lock() + defer cgroupRootLock.Unlock() + + if cgroupRoot != "" { + return cgroupRoot, nil + } + + // fast path + cgroupRoot = tryDefaultCgroupRoot() + if cgroupRoot != "" { + return cgroupRoot, nil + } + + // slow path: parse mountinfo + mi, err := cgroups.GetCgroupMounts(false) + if err != nil { + return "", err + } + if len(mi) < 1 { + return "", errors.New("no cgroup mount found in mountinfo") + } + + // Get the first cgroup mount (e.g. "/sys/fs/cgroup/memory"), + // use its parent directory. + root := filepath.Dir(mi[0].Mountpoint) + + if _, err := os.Stat(root); err != nil { + return "", err + } + + cgroupRoot = root + return cgroupRoot, nil +} + +func subsysPath(root, inner, subsystem string) (string, error) { + // If the cgroup name/path is absolute do not look relative to the cgroup of the init process. + if filepath.IsAbs(inner) { + mnt, err := cgroups.FindCgroupMountpoint(root, subsystem) + // If we didn't mount the subsystem, there is no point we make the path. + if err != nil { + return "", err + } + + // Sometimes subsystems can be mounted together as 'cpu,cpuacct'. + return filepath.Join(root, filepath.Base(mnt), inner), nil + } + + // Use GetOwnCgroupPath for dind-like cases, when cgroupns is not + // available. This is ugly. 
+ parentPath, err := cgroups.GetOwnCgroupPath(subsystem) + if err != nil { + return "", err + } + + return filepath.Join(parentPath, inner), nil +} + +func apply(path string, pid int) error { + if path == "" { + return nil + } + if err := os.MkdirAll(path, 0o755); err != nil { + return err + } + return cgroups.WriteCgroupProc(path, pid) +} diff --git a/fs/paths_test.go b/fs/paths_test.go new file mode 100644 index 0000000..42b8b66 --- /dev/null +++ b/fs/paths_test.go @@ -0,0 +1,104 @@ +package fs + +import ( + "path/filepath" + "strings" + "testing" + + "github.com/opencontainers/cgroups" + "github.com/opencontainers/cgroups/internal/path" +) + +func TestInvalidCgroupPath(t *testing.T) { + if cgroups.IsCgroup2UnifiedMode() { + t.Skip("cgroup v2 is not supported") + } + + root, err := rootPath() + if err != nil { + t.Fatalf("couldn't get cgroup root: %v", err) + } + + testCases := []struct { + test string + path, name, parent string + }{ + { + test: "invalid cgroup path", + path: "../../../../../../../../../../some/path", + }, + { + test: "invalid absolute cgroup path", + path: "/../../../../../../../../../../some/path", + }, + { + test: "invalid cgroup parent", + parent: "../../../../../../../../../../some/path", + name: "name", + }, + { + test: "invalid absolute cgroup parent", + parent: "/../../../../../../../../../../some/path", + name: "name", + }, + { + test: "invalid cgroup name", + parent: "parent", + name: "../../../../../../../../../../some/path", + }, + { + test: "invalid absolute cgroup name", + parent: "parent", + name: "/../../../../../../../../../../some/path", + }, + { + test: "invalid cgroup name and parent", + parent: "../../../../../../../../../../some/path", + name: "../../../../../../../../../../some/path", + }, + { + test: "invalid absolute cgroup name and parent", + parent: "/../../../../../../../../../../some/path", + name: "/../../../../../../../../../../some/path", + }, + } + + for _, tc := range testCases { + t.Run(tc.test, func(t 
*testing.T) { + config := &cgroups.Cgroup{Path: tc.path, Name: tc.name, Parent: tc.parent} + + inner, err := path.Inner(config) + if err != nil { + t.Fatalf("couldn't get cgroup data: %v", err) + } + + // Make sure the final inner path doesn't go outside the cgroup mountpoint. + if strings.HasPrefix(inner, "..") { + t.Errorf("SECURITY: cgroup innerPath is outside cgroup mountpoint!") + } + + // Double-check, using an actual cgroup. + deviceRoot := filepath.Join(root, "devices") + devicePath, err := subsysPath(root, inner, "devices") + if err != nil { + t.Fatalf("couldn't get cgroup path: %v", err) + } + if !strings.HasPrefix(devicePath, deviceRoot) { + t.Errorf("SECURITY: cgroup path() is outside cgroup mountpoint!") + } + }) + } +} + +func TestTryDefaultCgroupRoot(t *testing.T) { + res := tryDefaultCgroupRoot() + exp := defaultCgroupRoot + if cgroups.IsCgroup2UnifiedMode() { + // checking that tryDefaultCgroupRoot does return "" + // in case /sys/fs/cgroup is not cgroup v1 root dir. + exp = "" + } + if res != exp { + t.Errorf("tryDefaultCgroupRoot: want %q, got %q", exp, res) + } +} diff --git a/fs/perf_event.go b/fs/perf_event.go new file mode 100644 index 0000000..929c412 --- /dev/null +++ b/fs/perf_event.go @@ -0,0 +1,23 @@ +package fs + +import ( + "github.com/opencontainers/cgroups" +) + +type PerfEventGroup struct{} + +func (s *PerfEventGroup) Name() string { + return "perf_event" +} + +func (s *PerfEventGroup) Apply(path string, _ *cgroups.Resources, pid int) error { + return apply(path, pid) +} + +func (s *PerfEventGroup) Set(_ string, _ *cgroups.Resources) error { + return nil +} + +func (s *PerfEventGroup) GetStats(path string, stats *cgroups.Stats) error { + return nil +} diff --git a/fs/pids.go b/fs/pids.go new file mode 100644 index 0000000..9319761 --- /dev/null +++ b/fs/pids.go @@ -0,0 +1,61 @@ +package fs + +import ( + "math" + "strconv" + + "github.com/opencontainers/cgroups" + "github.com/opencontainers/cgroups/fscommon" +) + +type PidsGroup 
struct{} + +func (s *PidsGroup) Name() string { + return "pids" +} + +func (s *PidsGroup) Apply(path string, _ *cgroups.Resources, pid int) error { + return apply(path, pid) +} + +func (s *PidsGroup) Set(path string, r *cgroups.Resources) error { + if r.PidsLimit != 0 { + // "max" is the fallback value. + limit := "max" + + if r.PidsLimit > 0 { + limit = strconv.FormatInt(r.PidsLimit, 10) + } + + if err := cgroups.WriteFile(path, "pids.max", limit); err != nil { + return err + } + } + + return nil +} + +func (s *PidsGroup) GetStats(path string, stats *cgroups.Stats) error { + if !cgroups.PathExists(path) { + return nil + } + current, err := fscommon.GetCgroupParamUint(path, "pids.current") + if err != nil { + return err + } + + max, err := fscommon.GetCgroupParamUint(path, "pids.max") + if err != nil { + return err + } + // If no limit is set, read from pids.max returns "max", which is + // converted to MaxUint64 by GetCgroupParamUint. Historically, we + // represent "no limit" for pids as 0, thus this conversion. 
+ if max == math.MaxUint64 { + max = 0 + } + + stats.PidsStats.Current = current + stats.PidsStats.Limit = max + return nil +} diff --git a/fs/pids_test.go b/fs/pids_test.go new file mode 100644 index 0000000..a33db7a --- /dev/null +++ b/fs/pids_test.go @@ -0,0 +1,108 @@ +package fs + +import ( + "strconv" + "testing" + + "github.com/opencontainers/cgroups" + "github.com/opencontainers/cgroups/fscommon" +) + +const ( + maxUnlimited = -1 + maxLimited = 1024 +) + +func TestPidsSetMax(t *testing.T) { + path := tempDir(t, "pids") + + writeFileContents(t, path, map[string]string{ + "pids.max": "max", + }) + + r := &cgroups.Resources{ + PidsLimit: maxLimited, + } + pids := &PidsGroup{} + if err := pids.Set(path, r); err != nil { + t.Fatal(err) + } + + value, err := fscommon.GetCgroupParamUint(path, "pids.max") + if err != nil { + t.Fatal(err) + } + if value != maxLimited { + t.Fatalf("Expected %d, got %d for setting pids.max - limited", maxLimited, value) + } +} + +func TestPidsSetUnlimited(t *testing.T) { + path := tempDir(t, "pids") + + writeFileContents(t, path, map[string]string{ + "pids.max": strconv.Itoa(maxLimited), + }) + + r := &cgroups.Resources{ + PidsLimit: maxUnlimited, + } + pids := &PidsGroup{} + if err := pids.Set(path, r); err != nil { + t.Fatal(err) + } + + value, err := fscommon.GetCgroupParamString(path, "pids.max") + if err != nil { + t.Fatal(err) + } + if value != "max" { + t.Fatalf("Expected %s, got %s for setting pids.max - unlimited", "max", value) + } +} + +func TestPidsStats(t *testing.T) { + path := tempDir(t, "pids") + + writeFileContents(t, path, map[string]string{ + "pids.current": strconv.Itoa(1337), + "pids.max": strconv.Itoa(maxLimited), + }) + + pids := &PidsGroup{} + stats := *cgroups.NewStats() + if err := pids.GetStats(path, &stats); err != nil { + t.Fatal(err) + } + + if stats.PidsStats.Current != 1337 { + t.Fatalf("Expected %d, got %d for pids.current", 1337, stats.PidsStats.Current) + } + + if stats.PidsStats.Limit != maxLimited { 
+ t.Fatalf("Expected %d, got %d for pids.max", maxLimited, stats.PidsStats.Limit) + } +} + +func TestPidsStatsUnlimited(t *testing.T) { + path := tempDir(t, "pids") + + writeFileContents(t, path, map[string]string{ + "pids.current": strconv.Itoa(4096), + "pids.max": "max", + }) + + pids := &PidsGroup{} + stats := *cgroups.NewStats() + if err := pids.GetStats(path, &stats); err != nil { + t.Fatal(err) + } + + if stats.PidsStats.Current != 4096 { + t.Fatalf("Expected %d, got %d for pids.current", 4096, stats.PidsStats.Current) + } + + if stats.PidsStats.Limit != 0 { + t.Fatalf("Expected %d, got %d for pids.max", 0, stats.PidsStats.Limit) + } +} diff --git a/fs/rdma.go b/fs/rdma.go new file mode 100644 index 0000000..4b17536 --- /dev/null +++ b/fs/rdma.go @@ -0,0 +1,24 @@ +package fs + +import ( + "github.com/opencontainers/cgroups" + "github.com/opencontainers/cgroups/fscommon" +) + +type RdmaGroup struct{} + +func (s *RdmaGroup) Name() string { + return "rdma" +} + +func (s *RdmaGroup) Apply(path string, _ *cgroups.Resources, pid int) error { + return apply(path, pid) +} + +func (s *RdmaGroup) Set(path string, r *cgroups.Resources) error { + return fscommon.RdmaSet(path, r) +} + +func (s *RdmaGroup) GetStats(path string, stats *cgroups.Stats) error { + return fscommon.RdmaGetStats(path, stats) +} diff --git a/fs/stats_util_test.go b/fs/stats_util_test.go new file mode 100644 index 0000000..330dced --- /dev/null +++ b/fs/stats_util_test.go @@ -0,0 +1,138 @@ +package fs + +import ( + "errors" + "fmt" + "reflect" + "testing" + + "github.com/opencontainers/cgroups" +) + +func blkioStatEntryEquals(expected, actual []cgroups.BlkioStatEntry) error { + if len(expected) != len(actual) { + return errors.New("blkioStatEntries length do not match") + } + for i, expValue := range expected { + actValue := actual[i] + if expValue != actValue { + return fmt.Errorf("expected: %v, actual: %v", expValue, actValue) + } + } + return nil +} + +func expectBlkioStatsEquals(t *testing.T, 
expected, actual cgroups.BlkioStats) { + t.Helper() + if err := blkioStatEntryEquals(expected.IoServiceBytesRecursive, actual.IoServiceBytesRecursive); err != nil { + t.Errorf("blkio IoServiceBytesRecursive do not match: %s", err) + } + + if err := blkioStatEntryEquals(expected.IoServicedRecursive, actual.IoServicedRecursive); err != nil { + t.Errorf("blkio IoServicedRecursive do not match: %s", err) + } + + if err := blkioStatEntryEquals(expected.IoQueuedRecursive, actual.IoQueuedRecursive); err != nil { + t.Errorf("blkio IoQueuedRecursive do not match: %s", err) + } + + if err := blkioStatEntryEquals(expected.SectorsRecursive, actual.SectorsRecursive); err != nil { + t.Errorf("blkio SectorsRecursive do not match: %s", err) + } + + if err := blkioStatEntryEquals(expected.IoServiceTimeRecursive, actual.IoServiceTimeRecursive); err != nil { + t.Errorf("blkio IoServiceTimeRecursive do not match: %s", err) + } + + if err := blkioStatEntryEquals(expected.IoWaitTimeRecursive, actual.IoWaitTimeRecursive); err != nil { + t.Errorf("blkio IoWaitTimeRecursive do not match: %s", err) + } + + if err := blkioStatEntryEquals(expected.IoMergedRecursive, actual.IoMergedRecursive); err != nil { + t.Errorf("blkio IoMergedRecursive do not match: expected: %v, actual: %v", expected.IoMergedRecursive, actual.IoMergedRecursive) + } + + if err := blkioStatEntryEquals(expected.IoTimeRecursive, actual.IoTimeRecursive); err != nil { + t.Errorf("blkio IoTimeRecursive do not match: %s", err) + } +} + +func expectThrottlingDataEquals(t *testing.T, expected, actual cgroups.ThrottlingData) { + t.Helper() + if expected != actual { + t.Errorf("Expected throttling data: %v, actual: %v", expected, actual) + } +} + +func expectHugetlbStatEquals(t *testing.T, expected, actual cgroups.HugetlbStats) { + t.Helper() + if expected != actual { + t.Errorf("Expected hugetlb stats: %v, actual: %v", expected, actual) + } +} + +func expectMemoryStatEquals(t *testing.T, expected, actual cgroups.MemoryStats) { + 
t.Helper() + expectMemoryDataEquals(t, expected.Usage, actual.Usage) + expectMemoryDataEquals(t, expected.SwapUsage, actual.SwapUsage) + expectMemoryDataEquals(t, expected.KernelUsage, actual.KernelUsage) + expectPageUsageByNUMAEquals(t, expected.PageUsageByNUMA, actual.PageUsageByNUMA) + + if expected.UseHierarchy != actual.UseHierarchy { + t.Errorf("Expected memory use hierarchy: %v, actual: %v", expected.UseHierarchy, actual.UseHierarchy) + } + + for key, expValue := range expected.Stats { + actValue, ok := actual.Stats[key] + if !ok { + t.Errorf("Expected memory stat key %s not found", key) + } + if expValue != actValue { + t.Errorf("Expected memory stat value: %d, actual: %d", expValue, actValue) + } + } +} + +func expectMemoryDataEquals(t *testing.T, expected, actual cgroups.MemoryData) { + t.Helper() + if expected.Usage != actual.Usage { + t.Errorf("Expected memory usage: %d, actual: %d", expected.Usage, actual.Usage) + } + if expected.MaxUsage != actual.MaxUsage { + t.Errorf("Expected memory max usage: %d, actual: %d", expected.MaxUsage, actual.MaxUsage) + } + if expected.Failcnt != actual.Failcnt { + t.Errorf("Expected memory failcnt %d, actual: %d", expected.Failcnt, actual.Failcnt) + } + if expected.Limit != actual.Limit { + t.Errorf("Expected memory limit: %d, actual: %d", expected.Limit, actual.Limit) + } +} + +func expectPageUsageByNUMAEquals(t *testing.T, expected, actual cgroups.PageUsageByNUMA) { + t.Helper() + if !reflect.DeepEqual(expected.Total, actual.Total) { + t.Errorf("Expected total page usage by NUMA: %#v, actual: %#v", expected.Total, actual.Total) + } + if !reflect.DeepEqual(expected.File, actual.File) { + t.Errorf("Expected file page usage by NUMA: %#v, actual: %#v", expected.File, actual.File) + } + if !reflect.DeepEqual(expected.Anon, actual.Anon) { + t.Errorf("Expected anon page usage by NUMA: %#v, actual: %#v", expected.Anon, actual.Anon) + } + if !reflect.DeepEqual(expected.Unevictable, actual.Unevictable) { + t.Errorf("Expected 
unevictable page usage by NUMA: %#v, actual: %#v", expected.Unevictable, actual.Unevictable)
+	}
+	if !reflect.DeepEqual(expected.Hierarchical.Total, actual.Hierarchical.Total) {
+		t.Errorf("Expected hierarchical total page usage by NUMA: %#v, actual: %#v", expected.Hierarchical.Total, actual.Hierarchical.Total)
+	}
+	if !reflect.DeepEqual(expected.Hierarchical.File, actual.Hierarchical.File) {
+		t.Errorf("Expected hierarchical file page usage by NUMA: %#v, actual: %#v", expected.Hierarchical.File, actual.Hierarchical.File)
+	}
+	if !reflect.DeepEqual(expected.Hierarchical.Anon, actual.Hierarchical.Anon) {
+		t.Errorf("Expected hierarchical anon page usage by NUMA: %#v, actual: %#v", expected.Hierarchical.Anon, actual.Hierarchical.Anon)
+	}
+	if !reflect.DeepEqual(expected.Hierarchical.Unevictable, actual.Hierarchical.Unevictable) {
+		// Fixed copy-paste error: this branch previously reported "total"
+		// although it compares the Unevictable field.
+		t.Errorf("Expected hierarchical unevictable page usage by NUMA: %#v, actual: %#v", expected.Hierarchical.Unevictable, actual.Hierarchical.Unevictable)
+	}
+}
diff --git a/fs/util_test.go b/fs/util_test.go
new file mode 100644
index 0000000..e620fda
--- /dev/null
+++ b/fs/util_test.go
@@ -0,0 +1,39 @@
+/*
+Utility for testing cgroup operations.
+
+Creates a mock of the cgroup filesystem for the duration of the test.
+*/
+package fs
+
+import (
+	"os"
+	"path/filepath"
+	"testing"
+
+	"github.com/opencontainers/cgroups"
+)
+
+func init() {
+	cgroups.TestMode = true
+}
+
+// tempDir creates a new test directory for the specified subsystem.
+func tempDir(t testing.TB, subsystem string) string {
+	path := filepath.Join(t.TempDir(), subsystem)
+	// Ensure the full mock cgroup path exists.
+	if err := os.Mkdir(path, 0o755); err != nil {
+		t.Fatal(err)
+	}
+	return path
+}
+
+// writeFileContents writes the specified contents on the mock of the specified
+// cgroup files.
+func writeFileContents(t testing.TB, path string, fileContents map[string]string) { + for file, contents := range fileContents { + err := cgroups.WriteFile(path, file, contents) + if err != nil { + t.Fatal(err) + } + } +} diff --git a/fs2/cpu.go b/fs2/cpu.go new file mode 100644 index 0000000..8eae673 --- /dev/null +++ b/fs2/cpu.go @@ -0,0 +1,117 @@ +package fs2 + +import ( + "bufio" + "errors" + "os" + "strconv" + + "golang.org/x/sys/unix" + + "github.com/opencontainers/cgroups" + "github.com/opencontainers/cgroups/fscommon" +) + +func isCPUSet(r *cgroups.Resources) bool { + return r.CpuWeight != 0 || r.CpuQuota != 0 || r.CpuPeriod != 0 || r.CPUIdle != nil || r.CpuBurst != nil +} + +func setCPU(dirPath string, r *cgroups.Resources) error { + if !isCPUSet(r) { + return nil + } + + if r.CPUIdle != nil { + if err := cgroups.WriteFile(dirPath, "cpu.idle", strconv.FormatInt(*r.CPUIdle, 10)); err != nil { + return err + } + } + + // NOTE: .CpuShares is not used here. Conversion is the caller's responsibility. + if r.CpuWeight != 0 { + if err := cgroups.WriteFile(dirPath, "cpu.weight", strconv.FormatUint(r.CpuWeight, 10)); err != nil { + return err + } + } + + var burst string + if r.CpuBurst != nil { + burst = strconv.FormatUint(*r.CpuBurst, 10) + if err := cgroups.WriteFile(dirPath, "cpu.max.burst", burst); err != nil { + // Sometimes when the burst to be set is larger + // than the current one, it is rejected by the kernel + // (EINVAL) as old_quota/new_burst exceeds the parent + // cgroup quota limit. If this happens and the quota is + // going to be set, ignore the error for now and retry + // after setting the quota. 
+ if !errors.Is(err, unix.EINVAL) || r.CpuQuota == 0 { + return err + } + } else { + burst = "" + } + } + if r.CpuQuota != 0 || r.CpuPeriod != 0 { + str := "max" + if r.CpuQuota > 0 { + str = strconv.FormatInt(r.CpuQuota, 10) + } + period := r.CpuPeriod + if period == 0 { + // This default value is documented in + // https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html + period = 100000 + } + str += " " + strconv.FormatUint(period, 10) + if err := cgroups.WriteFile(dirPath, "cpu.max", str); err != nil { + return err + } + if burst != "" { + if err := cgroups.WriteFile(dirPath, "cpu.max.burst", burst); err != nil { + return err + } + } + } + + return nil +} + +func statCpu(dirPath string, stats *cgroups.Stats) error { + const file = "cpu.stat" + f, err := cgroups.OpenFile(dirPath, file, os.O_RDONLY) + if err != nil { + return err + } + defer f.Close() + + sc := bufio.NewScanner(f) + for sc.Scan() { + t, v, err := fscommon.ParseKeyValue(sc.Text()) + if err != nil { + return &parseError{Path: dirPath, File: file, Err: err} + } + switch t { + case "usage_usec": + stats.CpuStats.CpuUsage.TotalUsage = v * 1000 + + case "user_usec": + stats.CpuStats.CpuUsage.UsageInUsermode = v * 1000 + + case "system_usec": + stats.CpuStats.CpuUsage.UsageInKernelmode = v * 1000 + + case "nr_periods": + stats.CpuStats.ThrottlingData.Periods = v + + case "nr_throttled": + stats.CpuStats.ThrottlingData.ThrottledPeriods = v + + case "throttled_usec": + stats.CpuStats.ThrottlingData.ThrottledTime = v * 1000 + } + } + if err := sc.Err(); err != nil { + return &parseError{Path: dirPath, File: file, Err: err} + } + return nil +} diff --git a/fs2/cpuset.go b/fs2/cpuset.go new file mode 100644 index 0000000..9399919 --- /dev/null +++ b/fs2/cpuset.go @@ -0,0 +1,27 @@ +package fs2 + +import ( + "github.com/opencontainers/cgroups" +) + +func isCpusetSet(r *cgroups.Resources) bool { + return r.CpusetCpus != "" || r.CpusetMems != "" +} + +func setCpuset(dirPath string, r 
*cgroups.Resources) error {
+	if !isCpusetSet(r) {
+		return nil
+	}
+
+	if r.CpusetCpus != "" {
+		if err := cgroups.WriteFile(dirPath, "cpuset.cpus", r.CpusetCpus); err != nil {
+			return err
+		}
+	}
+	if r.CpusetMems != "" {
+		if err := cgroups.WriteFile(dirPath, "cpuset.mems", r.CpusetMems); err != nil {
+			return err
+		}
+	}
+	return nil
+}
diff --git a/fs2/create.go b/fs2/create.go
new file mode 100644
index 0000000..565ca88
--- /dev/null
+++ b/fs2/create.go
@@ -0,0 +1,151 @@
+package fs2
+
+import (
+	"fmt"
+	"os"
+	"path/filepath"
+	"strings"
+
+	"github.com/opencontainers/cgroups"
+)
+
+func supportedControllers() (string, error) {
+	return cgroups.ReadFile(UnifiedMountpoint, "/cgroup.controllers")
+}
+
+// needAnyControllers returns whether we enable some supported controllers or not,
+// based on (1) controllers available and (2) resources that are being set.
+// We don't check "pseudo" controllers such as
+// "freezer" and "devices".
+func needAnyControllers(r *cgroups.Resources) (bool, error) {
+	if r == nil {
+		return false, nil
+	}
+
+	// list of all available controllers
+	content, err := supportedControllers()
+	if err != nil {
+		return false, err
+	}
+	avail := make(map[string]struct{})
+	for _, ctr := range strings.Fields(content) {
+		avail[ctr] = struct{}{}
+	}
+
+	// check whether the controller is available or not
+	have := func(controller string) bool {
+		_, ok := avail[controller]
+		return ok
+	}
+
+	if isPidsSet(r) && have("pids") {
+		return true, nil
+	}
+	if isMemorySet(r) && have("memory") {
+		return true, nil
+	}
+	if isIoSet(r) && have("io") {
+		return true, nil
+	}
+	if isCPUSet(r) && have("cpu") {
+		return true, nil
+	}
+	if isCpusetSet(r) && have("cpuset") {
+		return true, nil
+	}
+	if isHugeTlbSet(r) && have("hugetlb") {
+		return true, nil
+	}
+
+	return false, nil
+}
+
+// containsDomainController returns whether the current config contains domain controller or not.
+// Refer to: http://man7.org/linux/man-pages/man7/cgroups.7.html +// As at Linux 4.19, the following controllers are threaded: cpu, perf_event, and pids. +func containsDomainController(r *cgroups.Resources) bool { + return isMemorySet(r) || isIoSet(r) || isCPUSet(r) || isHugeTlbSet(r) +} + +// CreateCgroupPath creates cgroupv2 path, enabling all the supported controllers. +func CreateCgroupPath(path string, c *cgroups.Cgroup) (Err error) { + if !strings.HasPrefix(path, UnifiedMountpoint) { + return fmt.Errorf("invalid cgroup path %s", path) + } + + content, err := supportedControllers() + if err != nil { + return err + } + + const ( + cgTypeFile = "cgroup.type" + cgStCtlFile = "cgroup.subtree_control" + ) + ctrs := strings.Fields(content) + res := "+" + strings.Join(ctrs, " +") + + elements := strings.Split(path, "/") + elements = elements[3:] + current := "/sys/fs" + for i, e := range elements { + current = filepath.Join(current, e) + if i > 0 { + if err := os.Mkdir(current, 0o755); err != nil { + if !os.IsExist(err) { + return err + } + } else { + // If the directory was created, be sure it is not left around on errors. + current := current + defer func() { + if Err != nil { + os.Remove(current) + } + }() + } + cgType, _ := cgroups.ReadFile(current, cgTypeFile) + cgType = strings.TrimSpace(cgType) + switch cgType { + // If the cgroup is in an invalid mode (usually this means there's an internal + // process in the cgroup tree, because we created a cgroup under an + // already-populated-by-other-processes cgroup), then we have to error out if + // the user requested controllers which are not thread-aware. However, if all + // the controllers requested are thread-aware we can simply put the cgroup into + // threaded mode. 
+ case "domain invalid": + if containsDomainController(c.Resources) { + return fmt.Errorf("cannot enter cgroupv2 %q with domain controllers -- it is in an invalid state", current) + } else { + // Not entirely correct (in theory we'd always want to be a domain -- + // since that means we're a properly delegated cgroup subtree) but in + // this case there's not much we can do and it's better than giving an + // error. + _ = cgroups.WriteFile(current, cgTypeFile, "threaded") + } + // If the cgroup is in (threaded) or (domain threaded) mode, we can only use thread-aware controllers + // (and you cannot usually take a cgroup out of threaded mode). + case "domain threaded": + fallthrough + case "threaded": + if containsDomainController(c.Resources) { + return fmt.Errorf("cannot enter cgroupv2 %q with domain controllers -- it is in %s mode", current, cgType) + } + } + } + // enable all supported controllers + if i < len(elements)-1 { + if err := cgroups.WriteFile(current, cgStCtlFile, res); err != nil { + // try write one by one + allCtrs := strings.Split(res, " ") + for _, ctr := range allCtrs { + _ = cgroups.WriteFile(current, cgStCtlFile, ctr) + } + } + // Some controllers might not be enabled when rootless or containerized, + // but we don't catch the error here. (Caught in setXXX() functions.) + } + } + + return nil +} diff --git a/fs2/defaultpath.go b/fs2/defaultpath.go new file mode 100644 index 0000000..0bc479d --- /dev/null +++ b/fs2/defaultpath.go @@ -0,0 +1,80 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+   See the License for the specific language governing permissions and
+   limitations under the License.
+*/
+
+package fs2
+
+import (
+	"bufio"
+	"errors"
+	"io"
+	"os"
+	"path/filepath"
+	"strings"
+
+	"github.com/opencontainers/cgroups"
+	"github.com/opencontainers/cgroups/internal/path"
+)
+
+const UnifiedMountpoint = "/sys/fs/cgroup"
+
+func defaultDirPath(c *cgroups.Cgroup) (string, error) {
+	innerPath, err := path.Inner(c)
+	if err != nil {
+		return "", err
+	}
+
+	if filepath.IsAbs(innerPath) {
+		return filepath.Join(UnifiedMountpoint, innerPath), nil
+	}
+
+	// we don't need to use /proc/thread-self here because runc always runs
+	// with every thread in the same cgroup. This lets us avoid having to do
+	// runtime.LockOSThread.
+	ownCgroup, err := parseCgroupFile("/proc/self/cgroup")
+	if err != nil {
+		return "", err
+	}
+	// The current user scope most probably has tasks in it already,
+	// making it impossible to enable controllers for its sub-cgroup.
+	// A parent cgroup (with no tasks in it) is what we need.
+	ownCgroup = filepath.Dir(ownCgroup)
+
+	return filepath.Join(UnifiedMountpoint, ownCgroup, innerPath), nil
+}
+
+// parseCgroupFile parses a /proc/PID/cgroup file and returns the cgroup v2 path.
+func parseCgroupFile(path string) (string, error) {
+	f, err := os.Open(path)
+	if err != nil {
+		return "", err
+	}
+	defer f.Close()
+	return parseCgroupFromReader(f)
+}
+
+func parseCgroupFromReader(r io.Reader) (string, error) {
+	s := bufio.NewScanner(r)
+	for s.Scan() {
+		// "0::/user.slice/user-1001.slice/session-1.scope"
+		if path, ok := strings.CutPrefix(s.Text(), "0::"); ok {
+			return path, nil
+		}
+	}
+	if err := s.Err(); err != nil {
+		return "", err
+	}
+	return "", errors.New("cgroup path not found")
+}
diff --git a/fs2/defaultpath_test.go b/fs2/defaultpath_test.go
new file mode 100644
index 0000000..8fdad88
--- /dev/null
+++ b/fs2/defaultpath_test.go
@@ -0,0 +1,93 @@
+/*
+   Copyright The containerd Authors.
+ + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package fs2 + +import ( + "path/filepath" + "strings" + "testing" + + "github.com/opencontainers/cgroups" +) + +func TestParseCgroupFromReader(t *testing.T) { + cases := map[string]string{ + "0::/user.slice/user-1001.slice/session-1.scope\n": "/user.slice/user-1001.slice/session-1.scope", + "2:cpuset:/foo\n1:name=systemd:/\n": "", + "2:cpuset:/foo\n1:name=systemd:/\n0::/user.slice/user-1001.slice/session-1.scope\n": "/user.slice/user-1001.slice/session-1.scope", + } + for s, expected := range cases { + g, err := parseCgroupFromReader(strings.NewReader(s)) + if expected != "" { + if g != expected { + t.Errorf("expected %q, got %q", expected, g) + } + if err != nil { + t.Error(err) + } + } else { + if err == nil { + t.Error("error is expected") + } + } + } +} + +func TestDefaultDirPath(t *testing.T) { + if !cgroups.IsCgroup2UnifiedMode() { + t.Skip("need cgroupv2") + } + // same code as in defaultDirPath() + ownCgroup, err := parseCgroupFile("/proc/self/cgroup") + if err != nil { + // Not a test failure, but rather some weird + // environment so we can't run this test. 
+ t.Skipf("can't get own cgroup: %v", err) + } + ownCgroup = filepath.Dir(ownCgroup) + + cases := []struct { + cgPath string + cgParent string + cgName string + expected string + }{ + { + cgPath: "/foo/bar", + expected: "/sys/fs/cgroup/foo/bar", + }, + { + cgPath: "foo/bar", + expected: filepath.Join(UnifiedMountpoint, ownCgroup, "foo/bar"), + }, + } + for _, c := range cases { + cg := &cgroups.Cgroup{ + Path: c.cgPath, + Parent: c.cgParent, + Name: c.cgName, + } + + got, err := defaultDirPath(cg) + if err != nil { + t.Fatal(err) + } + if got != c.expected { + t.Fatalf("expected %q, got %q", c.expected, got) + } + } +} diff --git a/fs2/freezer.go b/fs2/freezer.go new file mode 100644 index 0000000..f0192f0 --- /dev/null +++ b/fs2/freezer.go @@ -0,0 +1,124 @@ +package fs2 + +import ( + "bufio" + "errors" + "fmt" + "os" + "strings" + "time" + + "golang.org/x/sys/unix" + + "github.com/opencontainers/cgroups" +) + +func setFreezer(dirPath string, state cgroups.FreezerState) error { + var stateStr string + switch state { + case cgroups.Undefined: + return nil + case cgroups.Frozen: + stateStr = "1" + case cgroups.Thawed: + stateStr = "0" + default: + return fmt.Errorf("invalid freezer state %q requested", state) + } + + fd, err := cgroups.OpenFile(dirPath, "cgroup.freeze", unix.O_RDWR) + if err != nil { + // We can ignore this request as long as the user didn't ask us to + // freeze the container (since without the freezer cgroup, that's a + // no-op). + if state != cgroups.Frozen { + return nil + } + return fmt.Errorf("freezer not supported: %w", err) + } + defer fd.Close() + + if _, err := fd.WriteString(stateStr); err != nil { + return err + } + // Confirm that the cgroup did actually change states. 
+ if actualState, err := readFreezer(dirPath, fd); err != nil { + return err + } else if actualState != state { + return fmt.Errorf(`expected "cgroup.freeze" to be in state %q but was in %q`, state, actualState) + } + return nil +} + +func getFreezer(dirPath string) (cgroups.FreezerState, error) { + fd, err := cgroups.OpenFile(dirPath, "cgroup.freeze", unix.O_RDONLY) + if err != nil { + // If the kernel is too old, then we just treat the freezer as being in + // an "undefined" state. + if os.IsNotExist(err) || errors.Is(err, unix.ENODEV) { + err = nil + } + return cgroups.Undefined, err + } + defer fd.Close() + + return readFreezer(dirPath, fd) +} + +func readFreezer(dirPath string, fd *os.File) (cgroups.FreezerState, error) { + if _, err := fd.Seek(0, 0); err != nil { + return cgroups.Undefined, err + } + state := make([]byte, 2) + if _, err := fd.Read(state); err != nil { + return cgroups.Undefined, err + } + switch string(state) { + case "0\n": + return cgroups.Thawed, nil + case "1\n": + return waitFrozen(dirPath) + default: + return cgroups.Undefined, fmt.Errorf(`unknown "cgroup.freeze" state: %q`, state) + } +} + +// waitFrozen polls cgroup.events until it sees "frozen 1" in it. +func waitFrozen(dirPath string) (cgroups.FreezerState, error) { + fd, err := cgroups.OpenFile(dirPath, "cgroup.events", unix.O_RDONLY) + if err != nil { + return cgroups.Undefined, err + } + defer fd.Close() + + // XXX: Simple wait/read/retry is used here. An implementation + // based on poll(2) or inotify(7) is possible, but it makes the code + // much more complicated. Maybe address this later. + const ( + // Perform maxIter with waitTime in between iterations. 
+ waitTime = 10 * time.Millisecond + maxIter = 1000 + ) + scanner := bufio.NewScanner(fd) + for i := 0; scanner.Scan(); { + if i == maxIter { + return cgroups.Undefined, fmt.Errorf("timeout of %s reached waiting for the cgroup to freeze", waitTime*maxIter) + } + if val, ok := strings.CutPrefix(scanner.Text(), "frozen "); ok { + if val[0] == '1' { + return cgroups.Frozen, nil + } + + i++ + // wait, then re-read + time.Sleep(waitTime) + _, err := fd.Seek(0, 0) + if err != nil { + return cgroups.Undefined, err + } + } + } + // Should only reach here either on read error, + // or if the file does not contain "frozen " line. + return cgroups.Undefined, scanner.Err() +} diff --git a/fs2/fs2.go b/fs2/fs2.go new file mode 100644 index 0000000..c5d5a1f --- /dev/null +++ b/fs2/fs2.go @@ -0,0 +1,316 @@ +package fs2 + +import ( + "errors" + "fmt" + "os" + "strings" + + "github.com/opencontainers/cgroups" + "github.com/opencontainers/cgroups/fscommon" +) + +type parseError = fscommon.ParseError + +type Manager struct { + config *cgroups.Cgroup + // dirPath is like "/sys/fs/cgroup/user.slice/user-1001.slice/session-1.scope" + dirPath string + // controllers is content of "cgroup.controllers" file. + // excludes pseudo-controllers ("devices" and "freezer"). + controllers map[string]struct{} +} + +// NewManager creates a manager for cgroup v2 unified hierarchy. +// dirPath is like "/sys/fs/cgroup/user.slice/user-1001.slice/session-1.scope". +// If dirPath is empty, it is automatically set using config. 
+func NewManager(config *cgroups.Cgroup, dirPath string) (*Manager, error) { + if dirPath == "" { + var err error + dirPath, err = defaultDirPath(config) + if err != nil { + return nil, err + } + } + + m := &Manager{ + config: config, + dirPath: dirPath, + } + return m, nil +} + +func (m *Manager) getControllers() error { + if m.controllers != nil { + return nil + } + + data, err := cgroups.ReadFile(m.dirPath, "cgroup.controllers") + if err != nil { + if m.config.Rootless && m.config.Path == "" { + return nil + } + return err + } + fields := strings.Fields(data) + m.controllers = make(map[string]struct{}, len(fields)) + for _, c := range fields { + m.controllers[c] = struct{}{} + } + + return nil +} + +func (m *Manager) Apply(pid int) error { + if err := CreateCgroupPath(m.dirPath, m.config); err != nil { + // Related tests: + // - "runc create (no limits + no cgrouppath + no permission) succeeds" + // - "runc create (rootless + no limits + cgrouppath + no permission) fails with permission error" + // - "runc create (rootless + limits + no cgrouppath + no permission) fails with informative error" + if m.config.Rootless { + if m.config.Path == "" { + if blNeed, nErr := needAnyControllers(m.config.Resources); nErr == nil && !blNeed { + return cgroups.ErrRootless + } + return fmt.Errorf("rootless needs no limits + no cgrouppath when no permission is granted for cgroups: %w", err) + } + } + return err + } + if err := cgroups.WriteCgroupProc(m.dirPath, pid); err != nil { + return err + } + return nil +} + +func (m *Manager) GetPids() ([]int, error) { + return cgroups.GetPids(m.dirPath) +} + +func (m *Manager) GetAllPids() ([]int, error) { + return cgroups.GetAllPids(m.dirPath) +} + +func (m *Manager) GetStats() (*cgroups.Stats, error) { + var errs []error + + st := cgroups.NewStats() + + // pids (since kernel 4.5) + if err := statPids(m.dirPath, st); err != nil { + errs = append(errs, err) + } + // memory (since kernel 4.5) + if err := statMemory(m.dirPath, st); err != 
nil && !os.IsNotExist(err) { + errs = append(errs, err) + } + // io (since kernel 4.5) + if err := statIo(m.dirPath, st); err != nil && !os.IsNotExist(err) { + errs = append(errs, err) + } + // cpu (since kernel 4.15) + // Note cpu.stat is available even if the controller is not enabled. + if err := statCpu(m.dirPath, st); err != nil && !os.IsNotExist(err) { + errs = append(errs, err) + } + // PSI (since kernel 4.20). + var err error + if st.CpuStats.PSI, err = statPSI(m.dirPath, "cpu.pressure"); err != nil { + errs = append(errs, err) + } + if st.MemoryStats.PSI, err = statPSI(m.dirPath, "memory.pressure"); err != nil { + errs = append(errs, err) + } + if st.BlkioStats.PSI, err = statPSI(m.dirPath, "io.pressure"); err != nil { + errs = append(errs, err) + } + // hugetlb (since kernel 5.6) + if err := statHugeTlb(m.dirPath, st); err != nil && !os.IsNotExist(err) { + errs = append(errs, err) + } + // rdma (since kernel 4.11) + if err := fscommon.RdmaGetStats(m.dirPath, st); err != nil && !os.IsNotExist(err) { + errs = append(errs, err) + } + // misc (since kernel 5.13) + if err := statMisc(m.dirPath, st); err != nil && !os.IsNotExist(err) { + errs = append(errs, err) + } + if len(errs) > 0 && !m.config.Rootless { + return st, fmt.Errorf("error while statting cgroup v2: %+v", errs) + } + return st, nil +} + +func (m *Manager) Freeze(state cgroups.FreezerState) error { + if m.config.Resources == nil { + return errors.New("cannot toggle freezer: cgroups not configured for container") + } + if err := setFreezer(m.dirPath, state); err != nil { + return err + } + m.config.Resources.Freezer = state + return nil +} + +func (m *Manager) Destroy() error { + return cgroups.RemovePath(m.dirPath) +} + +func (m *Manager) Path(_ string) string { + return m.dirPath +} + +func (m *Manager) Set(r *cgroups.Resources) error { + if r == nil { + return nil + } + if err := m.getControllers(); err != nil { + return err + } + // pids (since kernel 4.5) + if err := setPids(m.dirPath, r); err 
!= nil { + return err + } + // memory (since kernel 4.5) + if err := setMemory(m.dirPath, r); err != nil { + return err + } + // io (since kernel 4.5) + if err := setIo(m.dirPath, r); err != nil { + return err + } + // cpu (since kernel 4.15) + if err := setCPU(m.dirPath, r); err != nil { + return err + } + // devices (since kernel 4.15, pseudo-controller) + // + // When rootless is true, errors from the device subsystem are ignored because it is really not expected to work. + // However, errors from other subsystems are not ignored. + // see @test "runc create (rootless + limits + no cgrouppath + no permission) fails with informative error" + if err := setDevices(m.dirPath, r); err != nil { + if !m.config.Rootless || errors.Is(err, cgroups.ErrDevicesUnsupported) { + return err + } + } + // cpuset (since kernel 5.0) + if err := setCpuset(m.dirPath, r); err != nil { + return err + } + // hugetlb (since kernel 5.6) + if err := setHugeTlb(m.dirPath, r); err != nil { + return err + } + // rdma (since kernel 4.11) + if err := fscommon.RdmaSet(m.dirPath, r); err != nil { + return err + } + // freezer (since kernel 5.2, pseudo-controller) + if err := setFreezer(m.dirPath, r.Freezer); err != nil { + return err + } + if err := m.setUnified(r.Unified); err != nil { + return err + } + m.config.Resources = r + return nil +} + +func setDevices(dirPath string, r *cgroups.Resources) error { + if cgroups.DevicesSetV2 == nil { + if len(r.Devices) > 0 { + return cgroups.ErrDevicesUnsupported + } + return nil + } + return cgroups.DevicesSetV2(dirPath, r) +} + +func (m *Manager) setUnified(res map[string]string) error { + for k, v := range res { + if strings.Contains(k, "/") { + return fmt.Errorf("unified resource %q must be a file name (no slashes)", k) + } + if err := cgroups.WriteFileByLine(m.dirPath, k, v); err != nil { + // Check for both EPERM and ENOENT since O_CREAT is used by WriteFile. 
+ if errors.Is(err, os.ErrPermission) || errors.Is(err, os.ErrNotExist) { + // Check if a controller is available, + // to give more specific error if not. + c, _, ok := strings.Cut(k, ".") + if !ok { + return fmt.Errorf("unified resource %q must be in the form CONTROLLER.PARAMETER", k) + } + if _, ok := m.controllers[c]; !ok && c != "cgroup" { + return fmt.Errorf("unified resource %q can't be set: controller %q not available", k, c) + } + } + return fmt.Errorf("unable to set unified resource %q: %w", k, err) + } + } + + return nil +} + +func (m *Manager) GetPaths() map[string]string { + paths := make(map[string]string, 1) + paths[""] = m.dirPath + return paths +} + +func (m *Manager) GetCgroups() (*cgroups.Cgroup, error) { + return m.config, nil +} + +func (m *Manager) GetFreezerState() (cgroups.FreezerState, error) { + return getFreezer(m.dirPath) +} + +func (m *Manager) Exists() bool { + return cgroups.PathExists(m.dirPath) +} + +func OOMKillCount(path string) (uint64, error) { + return fscommon.GetValueByKey(path, "memory.events", "oom_kill") +} + +func (m *Manager) OOMKillCount() (uint64, error) { + c, err := OOMKillCount(m.dirPath) + if err != nil && m.config.Rootless && os.IsNotExist(err) { + err = nil + } + + return c, err +} + +func CheckMemoryUsage(dirPath string, r *cgroups.Resources) error { + if !r.MemoryCheckBeforeUpdate { + return nil + } + + if r.Memory <= 0 && r.MemorySwap <= 0 { + return nil + } + + usage, err := fscommon.GetCgroupParamUint(dirPath, "memory.current") + if err != nil { + // This check is on best-effort basis, so if we can't read the + // current usage (cgroup not yet created, or any other error), + // we should not fail. 
+ return nil + } + + if r.MemorySwap > 0 { + if uint64(r.MemorySwap) <= usage { + return fmt.Errorf("rejecting memory+swap limit %d <= usage %d", r.MemorySwap, usage) + } + } + + if r.Memory > 0 { + if uint64(r.Memory) <= usage { + return fmt.Errorf("rejecting memory limit %d <= usage %d", r.Memory, usage) + } + } + + return nil +} diff --git a/fs2/hugetlb.go b/fs2/hugetlb.go new file mode 100644 index 0000000..8e1ac87 --- /dev/null +++ b/fs2/hugetlb.go @@ -0,0 +1,69 @@ +package fs2 + +import ( + "errors" + "os" + "strconv" + + "github.com/opencontainers/cgroups" + "github.com/opencontainers/cgroups/fscommon" +) + +func isHugeTlbSet(r *cgroups.Resources) bool { + return len(r.HugetlbLimit) > 0 +} + +func setHugeTlb(dirPath string, r *cgroups.Resources) error { + if !isHugeTlbSet(r) { + return nil + } + const suffix = ".max" + skipRsvd := false + for _, hugetlb := range r.HugetlbLimit { + prefix := "hugetlb." + hugetlb.Pagesize + val := strconv.FormatUint(hugetlb.Limit, 10) + if err := cgroups.WriteFile(dirPath, prefix+suffix, val); err != nil { + return err + } + if skipRsvd { + continue + } + if err := cgroups.WriteFile(dirPath, prefix+".rsvd"+suffix, val); err != nil { + if errors.Is(err, os.ErrNotExist) { + skipRsvd = true + continue + } + return err + } + } + + return nil +} + +func statHugeTlb(dirPath string, stats *cgroups.Stats) error { + hugetlbStats := cgroups.HugetlbStats{} + rsvd := ".rsvd" + for _, pagesize := range cgroups.HugePageSizes() { + again: + prefix := "hugetlb." 
+ pagesize + rsvd
+ value, err := fscommon.GetCgroupParamUint(dirPath, prefix+".current")
+ if err != nil {
+ if rsvd != "" && errors.Is(err, os.ErrNotExist) {
+ rsvd = ""
+ goto again
+ }
+ return err
+ }
+ hugetlbStats.Usage = value
+
+ value, err = fscommon.GetValueByKey(dirPath, prefix+".events", "max")
+ if err != nil {
+ return err
+ }
+ hugetlbStats.Failcnt = value
+
+ stats.HugetlbStats[pagesize] = hugetlbStats
+ }
+
+ return nil
+}
diff --git a/fs2/io.go b/fs2/io.go
new file mode 100644
index 0000000..0f6ef7f
--- /dev/null
+++ b/fs2/io.go
@@ -0,0 +1,192 @@
+package fs2
+
+import (
+ "bufio"
+ "bytes"
+ "fmt"
+ "os"
+ "strconv"
+ "strings"
+
+ "github.com/sirupsen/logrus"
+
+ "github.com/opencontainers/cgroups"
+)
+
+func isIoSet(r *cgroups.Resources) bool {
+ return r.BlkioWeight != 0 ||
+ len(r.BlkioWeightDevice) > 0 ||
+ len(r.BlkioThrottleReadBpsDevice) > 0 ||
+ len(r.BlkioThrottleWriteBpsDevice) > 0 ||
+ len(r.BlkioThrottleReadIOPSDevice) > 0 ||
+ len(r.BlkioThrottleWriteIOPSDevice) > 0
+}
+
+// bfqDeviceWeightSupported checks for per-device BFQ weight support (added
+// in kernel v5.4, commit 795fe54c2a8) by reading from "io.bfq.weight".
+func bfqDeviceWeightSupported(bfq *os.File) bool {
+ if bfq == nil {
+ return false
+ }
+ _, _ = bfq.Seek(0, 0)
+ buf := make([]byte, 32)
+ _, _ = bfq.Read(buf)
+ // If only a single number (default weight) is read back, we have older kernel.
+ _, err := strconv.ParseInt(string(bytes.TrimSpace(buf)), 10, 64)
+ return err != nil
+}
+
+func setIo(dirPath string, r *cgroups.Resources) error {
+ if !isIoSet(r) {
+ return nil
+ }
+
+ // If BFQ IO scheduler is available, use it.
+ var bfq *os.File
+ if r.BlkioWeight != 0 || len(r.BlkioWeightDevice) > 0 {
+ var err error
+ bfq, err = cgroups.OpenFile(dirPath, "io.bfq.weight", os.O_RDWR)
+ if err == nil {
+ defer bfq.Close()
+ } else if !os.IsNotExist(err) {
+ return err
+ }
+ }
+
+ if r.BlkioWeight != 0 {
+ if bfq != nil { // Use BFQ. 
+ if _, err := bfq.WriteString(strconv.FormatUint(uint64(r.BlkioWeight), 10)); err != nil { + return err + } + } else { + // Fallback to io.weight with a conversion scheme. + v := cgroups.ConvertBlkIOToIOWeightValue(r.BlkioWeight) + if err := cgroups.WriteFile(dirPath, "io.weight", strconv.FormatUint(v, 10)); err != nil { + return err + } + } + } + if bfqDeviceWeightSupported(bfq) { + for _, wd := range r.BlkioWeightDevice { + if _, err := bfq.WriteString(wd.WeightString() + "\n"); err != nil { + return fmt.Errorf("setting device weight %q: %w", wd.WeightString(), err) + } + } + } + for _, td := range r.BlkioThrottleReadBpsDevice { + if err := cgroups.WriteFile(dirPath, "io.max", td.StringName("rbps")); err != nil { + return err + } + } + for _, td := range r.BlkioThrottleWriteBpsDevice { + if err := cgroups.WriteFile(dirPath, "io.max", td.StringName("wbps")); err != nil { + return err + } + } + for _, td := range r.BlkioThrottleReadIOPSDevice { + if err := cgroups.WriteFile(dirPath, "io.max", td.StringName("riops")); err != nil { + return err + } + } + for _, td := range r.BlkioThrottleWriteIOPSDevice { + if err := cgroups.WriteFile(dirPath, "io.max", td.StringName("wiops")); err != nil { + return err + } + } + + return nil +} + +func readCgroup2MapFile(dirPath string, name string) (map[string][]string, error) { + ret := map[string][]string{} + f, err := cgroups.OpenFile(dirPath, name, os.O_RDONLY) + if err != nil { + return nil, err + } + defer f.Close() + scanner := bufio.NewScanner(f) + for scanner.Scan() { + line := scanner.Text() + parts := strings.Fields(line) + if len(parts) < 2 { + continue + } + ret[parts[0]] = parts[1:] + } + if err := scanner.Err(); err != nil { + return nil, &parseError{Path: dirPath, File: name, Err: err} + } + return ret, nil +} + +func statIo(dirPath string, stats *cgroups.Stats) error { + const file = "io.stat" + values, err := readCgroup2MapFile(dirPath, file) + if err != nil { + return err + } + // more details on the io.stat 
file format: https://www.kernel.org/doc/Documentation/cgroup-v2.txt + var parsedStats cgroups.BlkioStats + for k, v := range values { + d := strings.Split(k, ":") + if len(d) != 2 { + continue + } + major, err := strconv.ParseUint(d[0], 10, 64) + if err != nil { + return &parseError{Path: dirPath, File: file, Err: err} + } + minor, err := strconv.ParseUint(d[1], 10, 64) + if err != nil { + return &parseError{Path: dirPath, File: file, Err: err} + } + + for _, item := range v { + d := strings.Split(item, "=") + if len(d) != 2 { + continue + } + op := d[0] + + // Map to the cgroupv1 naming and layout (in separate tables). + var targetTable *[]cgroups.BlkioStatEntry + switch op { + // Equivalent to cgroupv1's blkio.io_service_bytes. + case "rbytes": + op = "Read" + targetTable = &parsedStats.IoServiceBytesRecursive + case "wbytes": + op = "Write" + targetTable = &parsedStats.IoServiceBytesRecursive + // Equivalent to cgroupv1's blkio.io_serviced. + case "rios": + op = "Read" + targetTable = &parsedStats.IoServicedRecursive + case "wios": + op = "Write" + targetTable = &parsedStats.IoServicedRecursive + default: + // Skip over entries we cannot map to cgroupv1 stats for now. + // In the future we should expand the stats struct to include + // them. 
+ logrus.Debugf("cgroupv2 io stats: skipping over unmappable %s entry", item) + continue + } + + value, err := strconv.ParseUint(d[1], 10, 64) + if err != nil { + return &parseError{Path: dirPath, File: file, Err: err} + } + + entry := cgroups.BlkioStatEntry{ + Op: op, + Major: major, + Minor: minor, + Value: value, + } + *targetTable = append(*targetTable, entry) + } + } + stats.BlkioStats = parsedStats + return nil +} diff --git a/fs2/io_test.go b/fs2/io_test.go new file mode 100644 index 0000000..2f3f6c6 --- /dev/null +++ b/fs2/io_test.go @@ -0,0 +1,81 @@ +package fs2 + +import ( + "os" + "path/filepath" + "reflect" + "sort" + "testing" + + "github.com/opencontainers/cgroups" +) + +const exampleIoStatData = `254:1 rbytes=6901432320 wbytes=14245535744 rios=263278 wios=248603 dbytes=0 dios=0 +254:0 rbytes=2702336 wbytes=0 rios=97 wios=0 dbytes=0 dios=0 +259:0 rbytes=6911345664 wbytes=14245536256 rios=264538 wios=244914 dbytes=530485248 dios=2` + +var exampleIoStatsParsed = cgroups.BlkioStats{ + IoServiceBytesRecursive: []cgroups.BlkioStatEntry{ + {Major: 254, Minor: 1, Value: 6901432320, Op: "Read"}, + {Major: 254, Minor: 1, Value: 14245535744, Op: "Write"}, + {Major: 254, Minor: 0, Value: 2702336, Op: "Read"}, + {Major: 254, Minor: 0, Value: 0, Op: "Write"}, + {Major: 259, Minor: 0, Value: 6911345664, Op: "Read"}, + {Major: 259, Minor: 0, Value: 14245536256, Op: "Write"}, + }, + IoServicedRecursive: []cgroups.BlkioStatEntry{ + {Major: 254, Minor: 1, Value: 263278, Op: "Read"}, + {Major: 254, Minor: 1, Value: 248603, Op: "Write"}, + {Major: 254, Minor: 0, Value: 97, Op: "Read"}, + {Major: 254, Minor: 0, Value: 0, Op: "Write"}, + {Major: 259, Minor: 0, Value: 264538, Op: "Read"}, + {Major: 259, Minor: 0, Value: 244914, Op: "Write"}, + }, +} + +func lessBlkioStatEntry(a, b cgroups.BlkioStatEntry) bool { + if a.Major != b.Major { + return a.Major < b.Major + } + if a.Minor != b.Minor { + return a.Minor < b.Minor + } + if a.Op != b.Op { + return a.Op < b.Op + } + 
return a.Value < b.Value
+}
+
+func sortBlkioStats(stats *cgroups.BlkioStats) {
+ for _, table := range []*[]cgroups.BlkioStatEntry{
+ &stats.IoServicedRecursive,
+ &stats.IoServiceBytesRecursive,
+ } {
+ sort.SliceStable(*table, func(i, j int) bool { return lessBlkioStatEntry((*table)[i], (*table)[j]) })
+ }
+}
+
+func TestStatIo(t *testing.T) {
+ // We're using a fake cgroupfs.
+ cgroups.TestMode = true
+
+ fakeCgroupDir := t.TempDir()
+ statPath := filepath.Join(fakeCgroupDir, "io.stat")
+
+ if err := os.WriteFile(statPath, []byte(exampleIoStatData), 0o644); err != nil {
+ t.Fatal(err)
+ }
+
+ var gotStats cgroups.Stats
+ if err := statIo(fakeCgroupDir, &gotStats); err != nil {
+ t.Error(err)
+ }
+
+ // Sort the output since statIo uses a map internally.
+ sortBlkioStats(&gotStats.BlkioStats)
+ sortBlkioStats(&exampleIoStatsParsed)
+
+ if !reflect.DeepEqual(gotStats.BlkioStats, exampleIoStatsParsed) {
+ t.Errorf("parsed cgroupv2 io.stat doesn't match expected result: \ngot %#v\nexpected %#v\n", gotStats.BlkioStats, exampleIoStatsParsed)
+ }
+} diff --git a/fs2/memory.go b/fs2/memory.go
new file mode 100644
index 0000000..d67fd8a
--- /dev/null
+++ b/fs2/memory.go
@@ -0,0 +1,241 @@
+package fs2
+
+import (
+ "bufio"
+ "errors"
+ "math"
+ "os"
+ "strconv"
+ "strings"
+
+ "golang.org/x/sys/unix"
+
+ "github.com/opencontainers/cgroups"
+ "github.com/opencontainers/cgroups/fscommon"
+)
+
+// numToStr converts an int64 value to a string for writing to a
+// cgroupv2 file with .min, .max, .low, or .high suffix.
+// The value of -1 is converted to "max" for cgroupv1 compatibility
+// (which used to write -1 to remove the limit). 
+func numToStr(value int64) (ret string) { + switch { + case value == 0: + ret = "" + case value == -1: + ret = "max" + default: + ret = strconv.FormatInt(value, 10) + } + + return ret +} + +func isMemorySet(r *cgroups.Resources) bool { + return r.MemoryReservation != 0 || r.Memory != 0 || r.MemorySwap != 0 +} + +func setMemory(dirPath string, r *cgroups.Resources) error { + if !isMemorySet(r) { + return nil + } + + if err := CheckMemoryUsage(dirPath, r); err != nil { + return err + } + + swap, err := cgroups.ConvertMemorySwapToCgroupV2Value(r.MemorySwap, r.Memory) + if err != nil { + return err + } + swapStr := numToStr(swap) + if swapStr == "" && swap == 0 && r.MemorySwap > 0 { + // memory and memorySwap set to the same value -- disable swap + swapStr = "0" + } + // never write empty string to `memory.swap.max`, it means set to 0. + if swapStr != "" { + if err := cgroups.WriteFile(dirPath, "memory.swap.max", swapStr); err != nil { + // If swap is not enabled, silently ignore setting to max or disabling it. 
+ if !(errors.Is(err, os.ErrNotExist) && (swapStr == "max" || swapStr == "0")) { + return err + } + } + } + + if val := numToStr(r.Memory); val != "" { + if err := cgroups.WriteFile(dirPath, "memory.max", val); err != nil { + return err + } + } + + // cgroup.Resources.KernelMemory is ignored + + if val := numToStr(r.MemoryReservation); val != "" { + if err := cgroups.WriteFile(dirPath, "memory.low", val); err != nil { + return err + } + } + + return nil +} + +func statMemory(dirPath string, stats *cgroups.Stats) error { + const file = "memory.stat" + statsFile, err := cgroups.OpenFile(dirPath, file, os.O_RDONLY) + if err != nil { + return err + } + defer statsFile.Close() + + sc := bufio.NewScanner(statsFile) + for sc.Scan() { + t, v, err := fscommon.ParseKeyValue(sc.Text()) + if err != nil { + return &parseError{Path: dirPath, File: file, Err: err} + } + stats.MemoryStats.Stats[t] = v + } + if err := sc.Err(); err != nil { + return &parseError{Path: dirPath, File: file, Err: err} + } + stats.MemoryStats.Cache = stats.MemoryStats.Stats["file"] + // Unlike cgroup v1 which has memory.use_hierarchy binary knob, + // cgroup v2 is always hierarchical. + stats.MemoryStats.UseHierarchy = true + + memoryUsage, err := getMemoryDataV2(dirPath, "") + if err != nil { + if errors.Is(err, unix.ENOENT) && dirPath == UnifiedMountpoint { + // The root cgroup does not have memory.{current,max,peak} + // so emulate those using data from /proc/meminfo and + // /sys/fs/cgroup/memory.stat + return rootStatsFromMeminfo(stats) + } + return err + } + stats.MemoryStats.Usage = memoryUsage + swapOnlyUsage, err := getMemoryDataV2(dirPath, "swap") + if err != nil { + return err + } + stats.MemoryStats.SwapOnlyUsage = swapOnlyUsage + swapUsage := swapOnlyUsage + // As cgroup v1 reports SwapUsage values as mem+swap combined, + // while in cgroup v2 swap values do not include memory, + // report combined mem+swap for v1 compatibility. 
+ swapUsage.Usage += memoryUsage.Usage
+ if swapUsage.Limit != math.MaxUint64 {
+ swapUsage.Limit += memoryUsage.Limit
+ }
+ // The `MaxUsage` of mem+swap cannot simply combine mem with
+ // swap. So set it to 0 for v1 compatibility.
+ swapUsage.MaxUsage = 0
+ stats.MemoryStats.SwapUsage = swapUsage
+
+ return nil
+}
+
+func getMemoryDataV2(path, name string) (cgroups.MemoryData, error) {
+ memoryData := cgroups.MemoryData{}
+
+ moduleName := "memory"
+ if name != "" {
+ moduleName = "memory." + name
+ }
+ usage := moduleName + ".current"
+ limit := moduleName + ".max"
+ maxUsage := moduleName + ".peak"
+
+ value, err := fscommon.GetCgroupParamUint(path, usage)
+ if err != nil {
+ if name != "" && os.IsNotExist(err) {
+ // Ignore ENOENT as there's no swap accounting
+ // if kernel CONFIG_MEMCG_SWAP is not set or
+ // swapaccount=0 kernel boot parameter is given.
+ return cgroups.MemoryData{}, nil
+ }
+ return cgroups.MemoryData{}, err
+ }
+ memoryData.Usage = value
+
+ value, err = fscommon.GetCgroupParamUint(path, limit)
+ if err != nil {
+ return cgroups.MemoryData{}, err
+ }
+ memoryData.Limit = value
+
+ // `memory.peak` since kernel 5.19
+ // `memory.swap.peak` since kernel 6.5
+ value, err = fscommon.GetCgroupParamUint(path, maxUsage)
+ if err != nil && !os.IsNotExist(err) {
+ return cgroups.MemoryData{}, err
+ }
+ memoryData.MaxUsage = value
+
+ return memoryData, nil
+}
+
+func rootStatsFromMeminfo(stats *cgroups.Stats) error {
+ const file = "/proc/meminfo"
+ f, err := os.Open(file)
+ if err != nil {
+ return err
+ }
+ defer f.Close()
+
+ // Fields we are interested in.
+ var (
+ swap_free uint64
+ swap_total uint64
+ )
+ mem := map[string]*uint64{
+ "SwapFree": &swap_free,
+ "SwapTotal": &swap_total,
+ }
+
+ found := 0
+ sc := bufio.NewScanner(f)
+ for sc.Scan() {
+ parts := strings.SplitN(sc.Text(), ":", 3)
+ if len(parts) != 2 {
+ // Should not happen.
+ continue
+ }
+ k := parts[0]
+ p, ok := mem[k]
+ if !ok {
+ // Unknown field -- not interested. 
+ continue + } + vStr := strings.TrimSpace(strings.TrimSuffix(parts[1], " kB")) + *p, err = strconv.ParseUint(vStr, 10, 64) + if err != nil { + return &parseError{File: file, Err: errors.New("bad value for " + k)} + } + + found++ + if found == len(mem) { + // Got everything we need -- skip the rest. + break + } + } + if err := sc.Err(); err != nil { + return &parseError{Path: "", File: file, Err: err} + } + + // cgroup v1 `usage_in_bytes` reports memory usage as the sum of + // - rss (NR_ANON_MAPPED) + // - cache (NR_FILE_PAGES) + // cgroup v1 reports SwapUsage values as mem+swap combined + // cgroup v2 reports rss and cache as anon and file. + // sum `anon` + `file` to report the same value as `usage_in_bytes` in v1. + // sum swap usage as combined mem+swap usage for consistency as well. + stats.MemoryStats.Usage.Usage = stats.MemoryStats.Stats["anon"] + stats.MemoryStats.Stats["file"] + stats.MemoryStats.Usage.Limit = math.MaxUint64 + stats.MemoryStats.SwapUsage.Usage = (swap_total - swap_free) * 1024 + stats.MemoryStats.SwapUsage.Limit = math.MaxUint64 + stats.MemoryStats.SwapUsage.Usage += stats.MemoryStats.Usage.Usage + + return nil +} diff --git a/fs2/memory_test.go b/fs2/memory_test.go new file mode 100644 index 0000000..e46dbe6 --- /dev/null +++ b/fs2/memory_test.go @@ -0,0 +1,155 @@ +package fs2 + +import ( + "os" + "path/filepath" + "strings" + "testing" + + "github.com/opencontainers/cgroups" +) + +const exampleMemoryStatData = `anon 790425600 +file 6502666240 +kernel_stack 7012352 +pagetables 8867840 +percpu 2445520 +sock 40960 +shmem 6721536 +file_mapped 656187392 +file_dirty 1122304 +file_writeback 0 +swapcached 10 +anon_thp 438304768 +file_thp 0 +shmem_thp 0 +inactive_anon 892223488 +active_anon 2973696 +inactive_file 5307346944 +active_file 1179316224 +unevictable 31477760 +slab_reclaimable 348866240 +slab_unreclaimable 10099808 +slab 358966048 +workingset_refault_anon 0 +workingset_refault_file 0 +workingset_activate_anon 0 
+workingset_activate_file 0 +workingset_restore_anon 0 +workingset_restore_file 0 +workingset_nodereclaim 0 +pgfault 103216687 +pgmajfault 6879 +pgrefill 0 +pgscan 0 +pgsteal 0 +pgactivate 1110217 +pgdeactivate 292 +pglazyfree 267 +pglazyfreed 0 +thp_fault_alloc 57411 +thp_collapse_alloc 443` + +func TestStatMemoryPodCgroupNotFound(t *testing.T) { + // We're using a fake cgroupfs. + cgroups.TestMode = true + fakeCgroupDir := t.TempDir() + + // only write memory.stat to ensure pod cgroup usage + // still reads memory.current. + statPath := filepath.Join(fakeCgroupDir, "memory.stat") + if err := os.WriteFile(statPath, []byte(exampleMemoryStatData), 0o644); err != nil { + t.Fatal(err) + } + + gotStats := cgroups.NewStats() + + // use a fake root path to mismatch the file we wrote. + // this triggers the non-root path which should fail to find memory.current. + err := statMemory(fakeCgroupDir, gotStats) + if err == nil { + t.Errorf("expected error when statting memory for cgroupv2 root, but was nil") + } + + if !strings.Contains(err.Error(), "memory.current: no such file or directory") { + t.Errorf("expected error to contain 'memory.current: no such file or directory', but was %s", err.Error()) + } +} + +func TestStatMemoryPodCgroup(t *testing.T) { + // We're using a fake cgroupfs. 
+ cgroups.TestMode = true + fakeCgroupDir := t.TempDir() + + statPath := filepath.Join(fakeCgroupDir, "memory.stat") + if err := os.WriteFile(statPath, []byte(exampleMemoryStatData), 0o644); err != nil { + t.Fatal(err) + } + + if err := os.WriteFile(filepath.Join(fakeCgroupDir, "memory.current"), []byte("123456789"), 0o644); err != nil { + t.Fatal(err) + } + + if err := os.WriteFile(filepath.Join(fakeCgroupDir, "memory.max"), []byte("999999999"), 0o644); err != nil { + t.Fatal(err) + } + + if err := os.WriteFile(filepath.Join(fakeCgroupDir, "memory.peak"), []byte("987654321"), 0o644); err != nil { + t.Fatal(err) + } + + gotStats := cgroups.NewStats() + + // use a fake root path to trigger the pod cgroup lookup. + err := statMemory(fakeCgroupDir, gotStats) + if err != nil { + t.Errorf("expected no error when statting memory for cgroupv2 root, but got %#+v", err) + } + + // result should be "memory.current" + var expectedUsageBytes uint64 = 123456789 + if gotStats.MemoryStats.Usage.Usage != expectedUsageBytes { + t.Errorf("parsed cgroupv2 memory.stat doesn't match expected result: \ngot %#v\nexpected %#v\n", gotStats.MemoryStats.Usage.Usage, expectedUsageBytes) + } + + // result should be "memory.max" + var expectedLimitBytes uint64 = 999999999 + if gotStats.MemoryStats.Usage.Limit != expectedLimitBytes { + t.Errorf("parsed cgroupv2 memory.stat doesn't match expected result: \ngot %#v\nexpected %#v\n", gotStats.MemoryStats.Usage.Limit, expectedLimitBytes) + } + + // result should be "memory.peak" + var expectedMaxUsageBytes uint64 = 987654321 + if gotStats.MemoryStats.Usage.MaxUsage != expectedMaxUsageBytes { + t.Errorf("parsed cgroupv2 memory.stat doesn't match expected result: \ngot %#v\nexpected %#v\n", gotStats.MemoryStats.Usage.MaxUsage, expectedMaxUsageBytes) + } +} + +func TestRootStatsFromMeminfo(t *testing.T) { + stats := &cgroups.Stats{ + MemoryStats: cgroups.MemoryStats{ + Stats: map[string]uint64{ + "anon": 790425600, + "file": 6502666240, + }, + }, + } + 
+ if err := rootStatsFromMeminfo(stats); err != nil { + t.Fatal(err) + } + + // result is anon + file + var expectedUsageBytes uint64 = 7293091840 + if stats.MemoryStats.Usage.Usage != expectedUsageBytes { + t.Errorf("parsed cgroupv2 memory.stat doesn't match expected result: \ngot %d\nexpected %d\n", stats.MemoryStats.Usage.Usage, expectedUsageBytes) + } + + // swap is adjusted to mem+swap + if stats.MemoryStats.SwapUsage.Usage < stats.MemoryStats.Usage.Usage { + t.Errorf("swap usage %d should be at least mem usage %d", stats.MemoryStats.SwapUsage.Usage, stats.MemoryStats.Usage.Usage) + } + if stats.MemoryStats.SwapUsage.Limit < stats.MemoryStats.Usage.Limit { + t.Errorf("swap limit %d should be at least mem limit %d", stats.MemoryStats.SwapUsage.Limit, stats.MemoryStats.Usage.Limit) + } +} diff --git a/fs2/misc.go b/fs2/misc.go new file mode 100644 index 0000000..f20136b --- /dev/null +++ b/fs2/misc.go @@ -0,0 +1,52 @@ +package fs2 + +import ( + "bufio" + "os" + "strings" + + "github.com/opencontainers/cgroups" + "github.com/opencontainers/cgroups/fscommon" +) + +func statMisc(dirPath string, stats *cgroups.Stats) error { + for _, file := range []string{"current", "events"} { + fd, err := cgroups.OpenFile(dirPath, "misc."+file, os.O_RDONLY) + if err != nil { + return err + } + + s := bufio.NewScanner(fd) + for s.Scan() { + key, value, err := fscommon.ParseKeyValue(s.Text()) + if err != nil { + fd.Close() + return err + } + + key = strings.TrimSuffix(key, ".max") + + if _, ok := stats.MiscStats[key]; !ok { + stats.MiscStats[key] = cgroups.MiscStats{} + } + + tmp := stats.MiscStats[key] + + switch file { + case "current": + tmp.Usage = value + case "events": + tmp.Events = value + } + + stats.MiscStats[key] = tmp + } + fd.Close() + + if err := s.Err(); err != nil { + return err + } + } + + return nil +} diff --git a/fs2/misc_test.go b/fs2/misc_test.go new file mode 100644 index 0000000..01ccc0a --- /dev/null +++ b/fs2/misc_test.go @@ -0,0 +1,103 @@ +package fs2 + 
+import ( + "os" + "path/filepath" + "strings" + "testing" + + "github.com/opencontainers/cgroups" +) + +const exampleMiscCurrentData = `res_a 123 +res_b 456 +res_c 42` + +const exampleMiscEventsData = `res_a.max 1 +res_b.max 2 +res_c.max 3` + +func TestStatMiscPodCgroupEmpty(t *testing.T) { + // We're using a fake cgroupfs. + cgroups.TestMode = true + fakeCgroupDir := t.TempDir() + + // create empty misc.current and misc.events files to test the common case + // where no misc resource keys are available + for _, file := range []string{"misc.current", "misc.events"} { + if _, err := os.Create(filepath.Join(fakeCgroupDir, file)); err != nil { + t.Fatal(err) + } + } + + gotStats := cgroups.NewStats() + + err := statMisc(fakeCgroupDir, gotStats) + if err != nil { + t.Errorf("expected no error when statting empty misc.current/misc.events for cgroupv2, but got %#v", err) + } + + if len(gotStats.MiscStats) != 0 { + t.Errorf("parsed cgroupv2 misc.* returns unexpected resources: got %#v but expected nothing", gotStats.MiscStats) + } +} + +func TestStatMiscPodCgroupNotFound(t *testing.T) { + // We're using a fake cgroupfs. + cgroups.TestMode = true + fakeCgroupDir := t.TempDir() + + // only write misc.current to ensure pod cgroup usage + // still reads misc.events. + statPath := filepath.Join(fakeCgroupDir, "misc.current") + if err := os.WriteFile(statPath, []byte(exampleMiscCurrentData), 0o644); err != nil { + t.Fatal(err) + } + + gotStats := cgroups.NewStats() + + // use a fake root path to mismatch the file we wrote. + // this triggers the non-root path which should fail to find misc.events. 
+ err := statMisc(fakeCgroupDir, gotStats) + if err == nil { + t.Errorf("expected error when statting misc.current for cgroupv2 root, but was nil") + } + + if !strings.Contains(err.Error(), "misc.events: no such file or directory") { + t.Errorf("expected error to contain 'misc.events: no such file or directory', but was %s", err.Error()) + } +} + +func TestStatMiscPodCgroup(t *testing.T) { + // We're using a fake cgroupfs. + cgroups.TestMode = true + fakeCgroupDir := t.TempDir() + + currentPath := filepath.Join(fakeCgroupDir, "misc.current") + if err := os.WriteFile(currentPath, []byte(exampleMiscCurrentData), 0o644); err != nil { + t.Fatal(err) + } + + eventsPath := filepath.Join(fakeCgroupDir, "misc.events") + if err := os.WriteFile(eventsPath, []byte(exampleMiscEventsData), 0o644); err != nil { + t.Fatal(err) + } + + gotStats := cgroups.NewStats() + + // use a fake root path to trigger the pod cgroup lookup. + err := statMisc(fakeCgroupDir, gotStats) + if err != nil { + t.Errorf("expected no error when statting misc for cgroupv2 root, but got %#+v", err) + } + + // make sure all res_* from exampleMisc*Data are returned + if len(gotStats.MiscStats) != 3 { + t.Errorf("parsed cgroupv2 misc doesn't return all expected resources: \ngot %#v\nexpected %#v\n", len(gotStats.MiscStats), 3) + } + + var expectedUsageBytes uint64 = 42 + if gotStats.MiscStats["res_c"].Usage != expectedUsageBytes { + t.Errorf("parsed cgroupv2 misc.current for res_c doesn't match expected result: \ngot %#v\nexpected %#v\n", gotStats.MiscStats["res_c"].Usage, expectedUsageBytes) + } +} diff --git a/fs2/pids.go b/fs2/pids.go new file mode 100644 index 0000000..9b82b90 --- /dev/null +++ b/fs2/pids.go @@ -0,0 +1,71 @@ +package fs2 + +import ( + "errors" + "math" + "os" + "strings" + + "golang.org/x/sys/unix" + + "github.com/opencontainers/cgroups" + "github.com/opencontainers/cgroups/fscommon" +) + +func isPidsSet(r *cgroups.Resources) bool { + return r.PidsLimit != 0 +} + +func setPids(dirPath 
string, r *cgroups.Resources) error { + if !isPidsSet(r) { + return nil + } + if val := numToStr(r.PidsLimit); val != "" { + if err := cgroups.WriteFile(dirPath, "pids.max", val); err != nil { + return err + } + } + + return nil +} + +func statPidsFromCgroupProcs(dirPath string, stats *cgroups.Stats) error { + // if the controller is not enabled, let's read PIDS from cgroups.procs + // (or threads if cgroup.threads is enabled) + contents, err := cgroups.ReadFile(dirPath, "cgroup.procs") + if errors.Is(err, unix.ENOTSUP) { + contents, err = cgroups.ReadFile(dirPath, "cgroup.threads") + } + if err != nil { + return err + } + pids := strings.Count(contents, "\n") + stats.PidsStats.Current = uint64(pids) + stats.PidsStats.Limit = 0 + return nil +} + +func statPids(dirPath string, stats *cgroups.Stats) error { + current, err := fscommon.GetCgroupParamUint(dirPath, "pids.current") + if err != nil { + if os.IsNotExist(err) { + return statPidsFromCgroupProcs(dirPath, stats) + } + return err + } + + max, err := fscommon.GetCgroupParamUint(dirPath, "pids.max") + if err != nil { + return err + } + // If no limit is set, read from pids.max returns "max", which is + // converted to MaxUint64 by GetCgroupParamUint. Historically, we + // represent "no limit" for pids as 0, thus this conversion. 
+ if max == math.MaxUint64 { + max = 0 + } + + stats.PidsStats.Current = current + stats.PidsStats.Limit = max + return nil +} diff --git a/fs2/psi.go b/fs2/psi.go new file mode 100644 index 0000000..010fe0b --- /dev/null +++ b/fs2/psi.go @@ -0,0 +1,89 @@ +package fs2 + +import ( + "bufio" + "errors" + "fmt" + "os" + "strconv" + "strings" + + "golang.org/x/sys/unix" + + "github.com/opencontainers/cgroups" +) + +func statPSI(dirPath string, file string) (*cgroups.PSIStats, error) { + f, err := cgroups.OpenFile(dirPath, file, os.O_RDONLY) + if err != nil { + if errors.Is(err, os.ErrNotExist) { + // Kernel < 4.20, or CONFIG_PSI is not set, + // or PSI stats are turned off for the cgroup + // ("echo 0 > cgroup.pressure", kernel >= 6.1). + return nil, nil + } + return nil, err + } + defer f.Close() + + var psistats cgroups.PSIStats + sc := bufio.NewScanner(f) + for sc.Scan() { + parts := strings.Fields(sc.Text()) + var pv *cgroups.PSIData + switch parts[0] { + case "some": + pv = &psistats.Some + case "full": + pv = &psistats.Full + } + if pv != nil { + *pv, err = parsePSIData(parts[1:]) + if err != nil { + return nil, &parseError{Path: dirPath, File: file, Err: err} + } + } + } + if err := sc.Err(); err != nil { + if errors.Is(err, unix.ENOTSUP) { + // Some kernels (e.g. CS9) may return ENOTSUP on read + // if psi=1 kernel cmdline parameter is required. 
+ return nil, nil + } + return nil, &parseError{Path: dirPath, File: file, Err: err} + } + return &psistats, nil +} + +func parsePSIData(psi []string) (cgroups.PSIData, error) { + data := cgroups.PSIData{} + for _, f := range psi { + key, val, ok := strings.Cut(f, "=") + if !ok { + return data, fmt.Errorf("invalid psi data: %q", f) + } + var pv *float64 + switch key { + case "avg10": + pv = &data.Avg10 + case "avg60": + pv = &data.Avg60 + case "avg300": + pv = &data.Avg300 + case "total": + v, err := strconv.ParseUint(val, 10, 64) + if err != nil { + return data, fmt.Errorf("invalid %s PSI value: %w", key, err) + } + data.Total = v + } + if pv != nil { + v, err := strconv.ParseFloat(val, 64) + if err != nil { + return data, fmt.Errorf("invalid %s PSI value: %w", key, err) + } + *pv = v + } + } + return data, nil +} diff --git a/fs2/psi_test.go b/fs2/psi_test.go new file mode 100644 index 0000000..7007efe --- /dev/null +++ b/fs2/psi_test.go @@ -0,0 +1,47 @@ +package fs2 + +import ( + "os" + "path/filepath" + "reflect" + "testing" + + "github.com/opencontainers/cgroups" +) + +func TestStatCPUPSI(t *testing.T) { + const examplePSIData = `some avg10=1.71 avg60=2.36 avg300=2.57 total=230548833 +full avg10=1.00 avg60=1.01 avg300=1.00 total=157622356` + + // We're using a fake cgroupfs. 
+ cgroups.TestMode = true + + fakeCgroupDir := t.TempDir() + statPath := filepath.Join(fakeCgroupDir, "cpu.pressure") + + if err := os.WriteFile(statPath, []byte(examplePSIData), 0o644); err != nil { + t.Fatal(err) + } + + st, err := statPSI(fakeCgroupDir, "cpu.pressure") + if err != nil { + t.Fatal(err) + } + + if !reflect.DeepEqual(*st, cgroups.PSIStats{ + Some: cgroups.PSIData{ + Avg10: 1.71, + Avg60: 2.36, + Avg300: 2.57, + Total: 230548833, + }, + Full: cgroups.PSIData{ + Avg10: 1.00, + Avg60: 1.01, + Avg300: 1.00, + Total: 157622356, + }, + }) { + t.Errorf("unexpected PSI result: %+v", st) + } +} diff --git a/fscommon/rdma.go b/fscommon/rdma.go new file mode 100644 index 0000000..86e38fd --- /dev/null +++ b/fscommon/rdma.go @@ -0,0 +1,120 @@ +package fscommon + +import ( + "bufio" + "errors" + "math" + "os" + "strconv" + "strings" + + "golang.org/x/sys/unix" + + "github.com/opencontainers/cgroups" +) + +// parseRdmaKV parses raw string to RdmaEntry. +func parseRdmaKV(raw string, entry *cgroups.RdmaEntry) error { + var value uint32 + + k, v, ok := strings.Cut(raw, "=") + + if !ok { + return errors.New("Unable to parse RDMA entry") + } + + if v == "max" { + value = math.MaxUint32 + } else { + val64, err := strconv.ParseUint(v, 10, 32) + if err != nil { + return err + } + value = uint32(val64) + } + switch k { + case "hca_handle": + entry.HcaHandles = value + case "hca_object": + entry.HcaObjects = value + } + + return nil +} + +// readRdmaEntries reads and converts array of rawstrings to RdmaEntries from file. 
+// example entry: mlx4_0 hca_handle=2 hca_object=2000 +func readRdmaEntries(dir, file string) ([]cgroups.RdmaEntry, error) { + rdmaEntries := make([]cgroups.RdmaEntry, 0) + fd, err := cgroups.OpenFile(dir, file, unix.O_RDONLY) + if err != nil { + return nil, err + } + defer fd.Close() //nolint:errorlint + scanner := bufio.NewScanner(fd) + for scanner.Scan() { + parts := strings.SplitN(scanner.Text(), " ", 4) + if len(parts) == 3 { + entry := new(cgroups.RdmaEntry) + entry.Device = parts[0] + err = parseRdmaKV(parts[1], entry) + if err != nil { + continue + } + err = parseRdmaKV(parts[2], entry) + if err != nil { + continue + } + + rdmaEntries = append(rdmaEntries, *entry) + } + } + return rdmaEntries, scanner.Err() +} + +// RdmaGetStats returns rdma stats such as totalLimit and current entries. +func RdmaGetStats(path string, stats *cgroups.Stats) error { + currentEntries, err := readRdmaEntries(path, "rdma.current") + if err != nil { + if errors.Is(err, os.ErrNotExist) { + err = nil + } + return err + } + maxEntries, err := readRdmaEntries(path, "rdma.max") + if err != nil { + return err + } + // If device got removed between reading two files, ignore returning stats. + if len(currentEntries) != len(maxEntries) { + return nil + } + + stats.RdmaStats = cgroups.RdmaStats{ + RdmaLimit: maxEntries, + RdmaCurrent: currentEntries, + } + + return nil +} + +func createCmdString(device string, limits cgroups.LinuxRdma) string { + cmdString := device + if limits.HcaHandles != nil { + cmdString += " hca_handle=" + strconv.FormatUint(uint64(*limits.HcaHandles), 10) + } + if limits.HcaObjects != nil { + cmdString += " hca_object=" + strconv.FormatUint(uint64(*limits.HcaObjects), 10) + } + return cmdString +} + +// RdmaSet sets RDMA resources. 
+func RdmaSet(path string, r *cgroups.Resources) error { + for device, limits := range r.Rdma { + if err := cgroups.WriteFile(path, "rdma.max", createCmdString(device, limits)); err != nil { + return err + } + } + return nil +} diff --git a/fscommon/rdma_test.go b/fscommon/rdma_test.go new file mode 100644 index 0000000..6af3151 --- /dev/null +++ b/fscommon/rdma_test.go @@ -0,0 +1,57 @@ +package fscommon + +import ( + "os" + "path/filepath" + "testing" + + "github.com/opencontainers/cgroups" +) + +/* Roadmap for future */ +// (Low-priority) TODO: Check if it is possible to virtually mimic an actual RDMA device. +// TODO: Think of more edge-cases to add. + +// TestRdmaSet performs an E2E test of RdmaSet(), parseRdmaKV() using dummy device and a dummy cgroup file-system. +// Note: Following test does not guarantees that your host supports RDMA since this mocks underlying infrastructure. +func TestRdmaSet(t *testing.T) { + testCgroupPath := filepath.Join(t.TempDir(), "rdma") + + // Ensure the full mock cgroup path exists. + err := os.Mkdir(testCgroupPath, 0o755) + if err != nil { + t.Fatal(err) + } + + rdmaDevice := "mlx5_1" + maxHandles := uint32(100) + maxObjects := uint32(300) + + rdmaStubResource := &cgroups.Resources{ + Rdma: map[string]cgroups.LinuxRdma{ + rdmaDevice: { + HcaHandles: &maxHandles, + HcaObjects: &maxObjects, + }, + }, + } + + if err := RdmaSet(testCgroupPath, rdmaStubResource); err != nil { + t.Fatal(err) + } + + // The default rdma.max must be written. 
+ rdmaEntries, err := readRdmaEntries(testCgroupPath, "rdma.max") + if err != nil { + t.Fatal(err) + } + if len(rdmaEntries) != 1 { + t.Fatal("rdma_test: Got the wrong values while parsing entries from rdma.max") + } + if rdmaEntries[0].HcaHandles != maxHandles { + t.Fatalf("rdma_test: Got the wrong value for hca_handles") + } + if rdmaEntries[0].HcaObjects != maxObjects { + t.Fatalf("rdma_test: Got the wrong value for hca_Objects") + } +} diff --git a/fscommon/utils.go b/fscommon/utils.go new file mode 100644 index 0000000..d8f8dfc --- /dev/null +++ b/fscommon/utils.go @@ -0,0 +1,144 @@ +package fscommon + +import ( + "errors" + "fmt" + "math" + "path" + "strconv" + "strings" + + "github.com/opencontainers/cgroups" +) + +var ( + // Deprecated: use cgroups.OpenFile instead. + OpenFile = cgroups.OpenFile + // Deprecated: use cgroups.ReadFile instead. + ReadFile = cgroups.ReadFile + // Deprecated: use cgroups.WriteFile instead. + WriteFile = cgroups.WriteFile +) + +// ParseError records a parse error details, including the file path. +type ParseError struct { + Path string + File string + Err error +} + +func (e *ParseError) Error() string { + return "unable to parse " + path.Join(e.Path, e.File) + ": " + e.Err.Error() +} + +func (e *ParseError) Unwrap() error { return e.Err } + +// ParseUint converts a string to an uint64 integer. +// Negative values are returned at zero as, due to kernel bugs, +// some of the memory cgroup stats can be negative. +func ParseUint(s string, base, bitSize int) (uint64, error) { + value, err := strconv.ParseUint(s, base, bitSize) + if err != nil { + intValue, intErr := strconv.ParseInt(s, base, bitSize) + // 1. Handle negative values greater than MinInt64 (and) + // 2. 
Handle negative values lesser than MinInt64 + if intErr == nil && intValue < 0 { + return 0, nil + } else if errors.Is(intErr, strconv.ErrRange) && intValue < 0 { + return 0, nil + } + + return value, err + } + + return value, nil +} + +// ParseKeyValue parses a space-separated "key value" kind of cgroup +// parameter and returns its key as a string, and its value as uint64 +// (using [ParseUint] to convert the value). For example, +// "io_service_bytes 1234" will be returned as "io_service_bytes", 1234. +func ParseKeyValue(t string) (string, uint64, error) { + key, val, ok := strings.Cut(t, " ") + if !ok || key == "" || val == "" { + return "", 0, fmt.Errorf(`line %q is not in "key value" format`, t) + } + + value, err := ParseUint(val, 10, 64) + if err != nil { + return "", 0, err + } + + return key, value, nil +} + +// GetValueByKey reads space-separated "key value" pairs from the specified +// cgroup file, looking for a specified key, and returns its value as uint64, +// using [ParseUint] for conversion. If the value is not found, 0 is returned. +func GetValueByKey(path, file, key string) (uint64, error) { + content, err := cgroups.ReadFile(path, file) + if err != nil { + return 0, err + } + + key += " " + lines := strings.Split(content, "\n") + for _, line := range lines { + v, ok := strings.CutPrefix(line, key) + if ok { + val, err := ParseUint(v, 10, 64) + if err != nil { + err = &ParseError{Path: path, File: file, Err: err} + } + return val, err + } + } + + return 0, nil +} + +// GetCgroupParamUint reads a single uint64 value from the specified cgroup file. +// If the value read is "max", the math.MaxUint64 is returned. 
+func GetCgroupParamUint(path, file string) (uint64, error) { + contents, err := GetCgroupParamString(path, file) + if err != nil { + return 0, err + } + if contents == "max" { + return math.MaxUint64, nil + } + + res, err := ParseUint(contents, 10, 64) + if err != nil { + return res, &ParseError{Path: path, File: file, Err: err} + } + return res, nil +} + +// GetCgroupParamInt reads a single int64 value from specified cgroup file. +// If the value read is "max", the math.MaxInt64 is returned. +func GetCgroupParamInt(path, file string) (int64, error) { + contents, err := GetCgroupParamString(path, file) + if err != nil { + return 0, err + } + if contents == "max" { + return math.MaxInt64, nil + } + + res, err := strconv.ParseInt(contents, 10, 64) + if err != nil { + return res, &ParseError{Path: path, File: file, Err: err} + } + return res, nil +} + +// GetCgroupParamString reads a string from the specified cgroup file. +func GetCgroupParamString(path, file string) (string, error) { + contents, err := cgroups.ReadFile(path, file) + if err != nil { + return "", err + } + + return strings.TrimSpace(contents), nil +} diff --git a/fscommon/utils_test.go b/fscommon/utils_test.go new file mode 100644 index 0000000..2bc411a --- /dev/null +++ b/fscommon/utils_test.go @@ -0,0 +1,95 @@ +package fscommon + +import ( + "math" + "os" + "path/filepath" + "strconv" + "testing" + + "github.com/opencontainers/cgroups" +) + +const ( + cgroupFile = "cgroup.file" + floatValue = 2048.0 + floatString = "2048" +) + +func init() { + cgroups.TestMode = true +} + +func TestGetCgroupParamsInt(t *testing.T) { + // Setup tempdir. + tempDir := t.TempDir() + tempFile := filepath.Join(tempDir, cgroupFile) + + // Success. 
+ if err := os.WriteFile(tempFile, []byte(floatString), 0o755); err != nil { + t.Fatal(err) + } + value, err := GetCgroupParamUint(tempDir, cgroupFile) + if err != nil { + t.Fatal(err) + } else if value != floatValue { + t.Fatalf("Expected %d to equal %f", value, floatValue) + } + + // Success with new line. + err = os.WriteFile(tempFile, []byte(floatString+"\n"), 0o755) + if err != nil { + t.Fatal(err) + } + value, err = GetCgroupParamUint(tempDir, cgroupFile) + if err != nil { + t.Fatal(err) + } else if value != floatValue { + t.Fatalf("Expected %d to equal %f", value, floatValue) + } + + // Success with negative values + err = os.WriteFile(tempFile, []byte("-12345"), 0o755) + if err != nil { + t.Fatal(err) + } + value, err = GetCgroupParamUint(tempDir, cgroupFile) + if err != nil { + t.Fatal(err) + } else if value != 0 { + t.Fatalf("Expected %d to equal %d", value, 0) + } + + // Success with negative values lesser than min int64 + s := strconv.FormatFloat(math.MinInt64, 'f', -1, 64) + err = os.WriteFile(tempFile, []byte(s), 0o755) + if err != nil { + t.Fatal(err) + } + value, err = GetCgroupParamUint(tempDir, cgroupFile) + if err != nil { + t.Fatal(err) + } else if value != 0 { + t.Fatalf("Expected %d to equal %d", value, 0) + } + + // Not a float. + err = os.WriteFile(tempFile, []byte("not-a-float"), 0o755) + if err != nil { + t.Fatal(err) + } + _, err = GetCgroupParamUint(tempDir, cgroupFile) + if err == nil { + t.Fatal("Expecting error, got none") + } + + // Unknown file. + err = os.Remove(tempFile) + if err != nil { + t.Fatal(err) + } + _, err = GetCgroupParamUint(tempDir, cgroupFile) + if err == nil { + t.Fatal("Expecting error, got none") + } +} diff --git a/getallpids.go b/getallpids.go new file mode 100644 index 0000000..1355a51 --- /dev/null +++ b/getallpids.go @@ -0,0 +1,27 @@ +package cgroups + +import ( + "io/fs" + "path/filepath" +) + +// GetAllPids returns all pids from the cgroup identified by path, and all its +// sub-cgroups. 
+func GetAllPids(path string) ([]int, error) { + var pids []int + err := filepath.WalkDir(path, func(p string, d fs.DirEntry, iErr error) error { + if iErr != nil { + return iErr + } + if !d.IsDir() { + return nil + } + cPids, err := readProcsFile(p) + if err != nil { + return err + } + pids = append(pids, cPids...) + return nil + }) + return pids, err +} diff --git a/getallpids_test.go b/getallpids_test.go new file mode 100644 index 0000000..e6b0632 --- /dev/null +++ b/getallpids_test.go @@ -0,0 +1,17 @@ +package cgroups + +import ( + "testing" +) + +func BenchmarkGetAllPids(b *testing.B) { + total := 0 + for i := 0; i < b.N; i++ { + i, err := GetAllPids("/sys/fs/cgroup") + if err != nil { + b.Fatal(err) + } + total += len(i) + } + b.Logf("iter: %d, total: %d", b.N, total) +} diff --git a/go.mod b/go.mod new file mode 100644 index 0000000..151d458 --- /dev/null +++ b/go.mod @@ -0,0 +1,14 @@ +module github.com/opencontainers/cgroups + +go 1.23.0 + +require ( + github.com/cilium/ebpf v0.17.3 + github.com/coreos/go-systemd/v22 v22.5.0 + github.com/cyphar/filepath-securejoin v0.4.1 + github.com/godbus/dbus/v5 v5.1.0 + github.com/moby/sys/mountinfo v0.7.2 + github.com/moby/sys/userns v0.1.0 + github.com/sirupsen/logrus v1.9.3 + golang.org/x/sys v0.30.0 +) diff --git a/go.sum b/go.sum new file mode 100644 index 0000000..f63cca1 --- /dev/null +++ b/go.sum @@ -0,0 +1,53 @@ +github.com/cilium/ebpf v0.17.3 h1:FnP4r16PWYSE4ux6zN+//jMcW4nMVRvuTLVTvCjyyjg= +github.com/cilium/ebpf v0.17.3/go.mod h1:G5EDHij8yiLzaqn0WjyfJHvRa+3aDlReIaLVRMvOyJk= +github.com/coreos/go-systemd/v22 v22.5.0 h1:RrqgGjYQKalulkV8NGVIfkXQf6YYmOyiJKk8iXXhfZs= +github.com/coreos/go-systemd/v22 v22.5.0/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc= +github.com/cyphar/filepath-securejoin v0.4.1 h1:JyxxyPEaktOD+GAnqIqTf9A8tHyAG22rowi7HkoSU1s= +github.com/cyphar/filepath-securejoin v0.4.1/go.mod h1:Sdj7gXlvMcPZsbhwhQ33GguGLDGQL7h7bg04C/+u9jI= +github.com/davecgh/go-spew v1.1.0/go.mod 
h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/go-quicktest/qt v1.101.0 h1:O1K29Txy5P2OK0dGo59b7b0LR6wKfIhttaAhHUyn7eI= +github.com/go-quicktest/qt v1.101.0/go.mod h1:14Bz/f7NwaXPtdYEgzsx46kqSxVwTbzVZsDC26tQJow= +github.com/godbus/dbus/v5 v5.0.4/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA= +github.com/godbus/dbus/v5 v5.1.0 h1:4KLkAxT3aOY8Li4FRJe/KvhoNFFxo0m6fNuFUO8QJUk= +github.com/godbus/dbus/v5 v5.1.0/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA= +github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= +github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= +github.com/josharian/native v1.1.0 h1:uuaP0hAbW7Y4l0ZRQ6C9zfb7Mg1mbFKry/xzDAfmtLA= +github.com/josharian/native v1.1.0/go.mod h1:7X/raswPFr05uY3HiLlYeyQntB6OO7E/d2Cu7qoaN2w= +github.com/jsimonetti/rtnetlink/v2 v2.0.1 h1:xda7qaHDSVOsADNouv7ukSuicKZO7GgVUCXxpaIEIlM= +github.com/jsimonetti/rtnetlink/v2 v2.0.1/go.mod h1:7MoNYNbb3UaDHtF8udiJo/RH6VsTKP1pqKLUTVCvToE= +github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= +github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= +github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= +github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= +github.com/mdlayher/netlink v1.7.2 h1:/UtM3ofJap7Vl4QWCPDGXY8d3GIY2UGSDbK+QWmY8/g= +github.com/mdlayher/netlink v1.7.2/go.mod h1:xraEF7uJbxLhc5fpHL4cPe221LI2bdttWlU+ZGLfQSw= +github.com/mdlayher/socket v0.4.1 h1:eM9y2/jlbs1M615oshPQOHZzj6R6wMT7bX5NPiQvn2U= +github.com/mdlayher/socket v0.4.1/go.mod h1:cAqeGjoufqdxWkD7DkpyS+wcefOtmu5OQ8KuoJGIReA= +github.com/moby/sys/mountinfo v0.7.2 h1:1shs6aH5s4o5H2zQLn796ADW1wMrIwHsyJ2v9KouLrg= +github.com/moby/sys/mountinfo v0.7.2/go.mod 
h1:1YOa8w8Ih7uW0wALDUgT1dTTSBrZ+HiBLGws92L2RU4= +github.com/moby/sys/userns v0.1.0 h1:tVLXkFOxVu9A64/yh59slHVv9ahO9UIev4JZusOLG/g= +github.com/moby/sys/userns v0.1.0/go.mod h1:IHUYgu/kao6N8YZlp9Cf444ySSvCmDlmzUcYfDHOl28= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/rogpeppe/go-internal v1.11.0 h1:cWPaGQEPrBb5/AsnsZesgZZ9yb1OQ+GOISoDNXVBh4M= +github.com/rogpeppe/go-internal v1.11.0/go.mod h1:ddIwULY96R17DhadqLgMfk9H9tvdUzkipdSkR5nkCZA= +github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ= +github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/stretchr/testify v1.7.1 h1:5TQK59W5E3v0r2duFAb7P95B6hEeOyEnHRa8MjYSMTY= +github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +golang.org/x/net v0.33.0 h1:74SYHlV8BIgHIFC/LrYkOGIwL19eTYXQ5wc6TBuO36I= +golang.org/x/net v0.33.0/go.mod h1:HXLR5J+9DxmrqMwG9qjGCxZ+zKXxBru04zlTvWlWuN4= +golang.org/x/sync v0.1.0 h1:wsuoTGHzEhffawBOhz5CYhcrV4IdKZbEyZjBMuTp12o= +golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.30.0 h1:QjkSwP/36a20jFYWkSue1YwXzLmsV5Gfq7Eiy72C1uc= +golang.org/x/sys v0.30.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 
// cleanPath is a copy of github.com/opencontainers/runc/libcontainer/utils.CleanPath.
// It lexically normalizes a path: absolute paths are cleaned in place, and
// relative paths are cleaned as if rooted at "/" and then made relative
// again, so sequences like "../../x" cannot escape upward.
func cleanPath(path string) string {
	// The empty string stays empty.
	if path == "" {
		return ""
	}

	// XXX: cleaning is load-bearing for path safety (e.g. "/../../..").
	if filepath.IsAbs(path) {
		return filepath.Clean(path)
	}

	// Relative input: anchor it at the root, clean, then strip the root
	// back off. Rel against the separator cannot fail, as the cleaned
	// path is by construction under "/".
	rooted := filepath.Clean(string(os.PathSeparator) + path)
	rel, _ := filepath.Rel(string(os.PathSeparator), rooted)
	return rel
}
+ cg.Resources = &cgroups.Resources{} + mgr, err = New(cg) + if err != nil { + t.Fatal(err) + } + } + _ = mgr.Apply(-1) + _ = mgr.Set(nil) + _ = mgr.Freeze(cgroups.Thawed) + _ = mgr.Exists() + _, _ = mgr.GetAllPids() + _, _ = mgr.GetCgroups() + _, _ = mgr.GetFreezerState() + _ = mgr.Path("") + _ = mgr.GetPaths() + _, _ = mgr.GetStats() + _, _ = mgr.OOMKillCount() + _ = mgr.Destroy() +} diff --git a/manager/new.go b/manager/new.go new file mode 100644 index 0000000..2df39e5 --- /dev/null +++ b/manager/new.go @@ -0,0 +1,77 @@ +package manager + +import ( + "errors" + "fmt" + "path/filepath" + + "github.com/opencontainers/cgroups" + "github.com/opencontainers/cgroups/fs" + "github.com/opencontainers/cgroups/fs2" + "github.com/opencontainers/cgroups/systemd" +) + +// New returns the instance of a cgroup manager, which is chosen +// based on the local environment (whether cgroup v1 or v2 is used) +// and the config (whether config.Systemd is set or not). +func New(config *cgroups.Cgroup) (cgroups.Manager, error) { + return NewWithPaths(config, nil) +} + +// NewWithPaths is similar to New, and can be used in case cgroup paths +// are already well known, which can save some resources. +// +// For cgroup v1, the keys are controller/subsystem name, and the values +// are absolute filesystem paths to the appropriate cgroups. +// +// For cgroup v2, the only key allowed is "" (empty string), and the value +// is the unified cgroup path. +func NewWithPaths(config *cgroups.Cgroup, paths map[string]string) (cgroups.Manager, error) { + if config == nil { + return nil, errors.New("cgroups/manager.New: config must not be nil") + } + if config.Systemd && !systemd.IsRunningSystemd() { + return nil, errors.New("systemd not running on this host, cannot use systemd cgroups manager") + } + + // Cgroup v2 aka unified hierarchy. 
// getUnifiedPath is an implementation detail of libcontainer.
// Historically, libcontainer.Create saves cgroup paths as per-subsystem path
// map (as returned by cm.GetPaths(""), but with v2 we only have one single
// unified path (with "" as a key).
//
// This function converts from that map to string (using "" as a key),
// and also checks that the map itself is sane.
func getUnifiedPath(paths map[string]string) (string, error) {
	if len(paths) > 1 {
		return "", fmt.Errorf("expected a single path, got %+v", paths)
	}

	// Missing key yields "", which is a valid (empty) result.
	path := paths[""]
	if path == "" {
		return "", nil
	}

	// A non-empty path must be absolute and already clean.
	if !filepath.IsAbs(path) || filepath.Clean(path) != path {
		return "", fmt.Errorf("invalid path: %q", path)
	}
	return path, nil
}
+ PercpuUsage []uint64 `json:"percpu_usage,omitempty"` + // CPU time consumed per core in kernel mode + // Units: nanoseconds. + PercpuUsageInKernelmode []uint64 `json:"percpu_usage_in_kernelmode"` + // CPU time consumed per core in user mode + // Units: nanoseconds. + PercpuUsageInUsermode []uint64 `json:"percpu_usage_in_usermode"` + // Time spent by tasks of the cgroup in kernel mode. + // Units: nanoseconds. + UsageInKernelmode uint64 `json:"usage_in_kernelmode"` + // Time spent by tasks of the cgroup in user mode. + // Units: nanoseconds. + UsageInUsermode uint64 `json:"usage_in_usermode"` +} + +type PSIData struct { + Avg10 float64 `json:"avg10"` + Avg60 float64 `json:"avg60"` + Avg300 float64 `json:"avg300"` + Total uint64 `json:"total"` +} + +type PSIStats struct { + Some PSIData `json:"some,omitempty"` + Full PSIData `json:"full,omitempty"` +} + +type CpuStats struct { + CpuUsage CpuUsage `json:"cpu_usage,omitempty"` + ThrottlingData ThrottlingData `json:"throttling_data,omitempty"` + PSI *PSIStats `json:"psi,omitempty"` +} + +type CPUSetStats struct { + // List of the physical numbers of the CPUs on which processes + // in that cpuset are allowed to execute + CPUs []uint16 `json:"cpus,omitempty"` + // cpu_exclusive flag + CPUExclusive uint64 `json:"cpu_exclusive"` + // List of memory nodes on which processes in that cpuset + // are allowed to allocate memory + Mems []uint16 `json:"mems,omitempty"` + // mem_hardwall flag + MemHardwall uint64 `json:"mem_hardwall"` + // mem_exclusive flag + MemExclusive uint64 `json:"mem_exclusive"` + // memory_migrate flag + MemoryMigrate uint64 `json:"memory_migrate"` + // memory_spread page flag + MemorySpreadPage uint64 `json:"memory_spread_page"` + // memory_spread slab flag + MemorySpreadSlab uint64 `json:"memory_spread_slab"` + // memory_pressure + MemoryPressure uint64 `json:"memory_pressure"` + // sched_load balance flag + SchedLoadBalance uint64 `json:"sched_load_balance"` + // sched_relax_domain_level + 
SchedRelaxDomainLevel int64 `json:"sched_relax_domain_level"` +} + +type MemoryData struct { + Usage uint64 `json:"usage,omitempty"` + MaxUsage uint64 `json:"max_usage,omitempty"` + Failcnt uint64 `json:"failcnt"` + Limit uint64 `json:"limit"` +} + +type MemoryStats struct { + // memory used for cache + Cache uint64 `json:"cache,omitempty"` + // usage of memory + Usage MemoryData `json:"usage,omitempty"` + // usage of memory + swap + SwapUsage MemoryData `json:"swap_usage,omitempty"` + // usage of swap only + SwapOnlyUsage MemoryData `json:"swap_only_usage,omitempty"` + // usage of kernel memory + KernelUsage MemoryData `json:"kernel_usage,omitempty"` + // usage of kernel TCP memory + KernelTCPUsage MemoryData `json:"kernel_tcp_usage,omitempty"` + // usage of memory pages by NUMA node + // see chapter 5.6 of memory controller documentation + PageUsageByNUMA PageUsageByNUMA `json:"page_usage_by_numa,omitempty"` + // if true, memory usage is accounted for throughout a hierarchy of cgroups. + UseHierarchy bool `json:"use_hierarchy"` + + Stats map[string]uint64 `json:"stats,omitempty"` + PSI *PSIStats `json:"psi,omitempty"` +} + +type PageUsageByNUMA struct { + // Embedding is used as types can't be recursive. 
+ PageUsageByNUMAInner + Hierarchical PageUsageByNUMAInner `json:"hierarchical,omitempty"` +} + +type PageUsageByNUMAInner struct { + Total PageStats `json:"total,omitempty"` + File PageStats `json:"file,omitempty"` + Anon PageStats `json:"anon,omitempty"` + Unevictable PageStats `json:"unevictable,omitempty"` +} + +type PageStats struct { + Total uint64 `json:"total,omitempty"` + Nodes map[uint8]uint64 `json:"nodes,omitempty"` +} + +type PidsStats struct { + // number of pids in the cgroup + Current uint64 `json:"current,omitempty"` + // active pids hard limit + Limit uint64 `json:"limit,omitempty"` +} + +type BlkioStatEntry struct { + Major uint64 `json:"major,omitempty"` + Minor uint64 `json:"minor,omitempty"` + Op string `json:"op,omitempty"` + Value uint64 `json:"value,omitempty"` +} + +type BlkioStats struct { + // number of bytes transferred to and from the block device + IoServiceBytesRecursive []BlkioStatEntry `json:"io_service_bytes_recursive,omitempty"` + IoServicedRecursive []BlkioStatEntry `json:"io_serviced_recursive,omitempty"` + IoQueuedRecursive []BlkioStatEntry `json:"io_queue_recursive,omitempty"` + IoServiceTimeRecursive []BlkioStatEntry `json:"io_service_time_recursive,omitempty"` + IoWaitTimeRecursive []BlkioStatEntry `json:"io_wait_time_recursive,omitempty"` + IoMergedRecursive []BlkioStatEntry `json:"io_merged_recursive,omitempty"` + IoTimeRecursive []BlkioStatEntry `json:"io_time_recursive,omitempty"` + SectorsRecursive []BlkioStatEntry `json:"sectors_recursive,omitempty"` + PSI *PSIStats `json:"psi,omitempty"` +} + +type HugetlbStats struct { + // current res_counter usage for hugetlb + Usage uint64 `json:"usage,omitempty"` + // maximum usage ever recorded. + MaxUsage uint64 `json:"max_usage,omitempty"` + // number of times hugetlb usage allocation failure. 
+ Failcnt uint64 `json:"failcnt"` +} + +type RdmaEntry struct { + Device string `json:"device,omitempty"` + HcaHandles uint32 `json:"hca_handles,omitempty"` + HcaObjects uint32 `json:"hca_objects,omitempty"` +} + +type RdmaStats struct { + RdmaLimit []RdmaEntry `json:"rdma_limit,omitempty"` + RdmaCurrent []RdmaEntry `json:"rdma_current,omitempty"` +} + +type MiscStats struct { + // current resource usage for a key in misc + Usage uint64 `json:"usage,omitempty"` + // number of times the resource usage was about to go over the max boundary + Events uint64 `json:"events,omitempty"` +} + +type Stats struct { + CpuStats CpuStats `json:"cpu_stats,omitempty"` + CPUSetStats CPUSetStats `json:"cpuset_stats,omitempty"` + MemoryStats MemoryStats `json:"memory_stats,omitempty"` + PidsStats PidsStats `json:"pids_stats,omitempty"` + BlkioStats BlkioStats `json:"blkio_stats,omitempty"` + // the map is in the format "size of hugepage: stats of the hugepage" + HugetlbStats map[string]HugetlbStats `json:"hugetlb_stats,omitempty"` + RdmaStats RdmaStats `json:"rdma_stats,omitempty"` + // the map is in the format "misc resource name: stats of the key" + MiscStats map[string]MiscStats `json:"misc_stats,omitempty"` +} + +func NewStats() *Stats { + memoryStats := MemoryStats{Stats: make(map[string]uint64)} + hugetlbStats := make(map[string]HugetlbStats) + miscStats := make(map[string]MiscStats) + return &Stats{MemoryStats: memoryStats, HugetlbStats: hugetlbStats, MiscStats: miscStats} +} diff --git a/systemd/common.go b/systemd/common.go new file mode 100644 index 0000000..b3077bd --- /dev/null +++ b/systemd/common.go @@ -0,0 +1,362 @@ +package systemd + +import ( + "context" + "errors" + "fmt" + "math" + "os" + "strconv" + "strings" + "sync" + "time" + + systemdDbus "github.com/coreos/go-systemd/v22/dbus" + dbus "github.com/godbus/dbus/v5" + "github.com/sirupsen/logrus" + + "github.com/opencontainers/cgroups" +) + +const ( + // Default kernel value for cpu quota period is 100000 us (100 
ms), same for v1 and v2. + // v1: https://www.kernel.org/doc/html/latest/scheduler/sched-bwc.html and + // v2: https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html + defCPUQuotaPeriod = uint64(100000) +) + +var ( + versionOnce sync.Once + version int + + isRunningSystemdOnce sync.Once + isRunningSystemd bool + + // GenerateDeviceProps is a function to generate systemd device + // properties, used by Set methods. Unless + // [github.com/opencontainers/cgroups/devices] + // package is imported, it is set to nil, so cgroup managers can't + // configure devices. + GenerateDeviceProps func(r *cgroups.Resources, sdVer int) ([]systemdDbus.Property, error) +) + +// NOTE: This function comes from package github.com/coreos/go-systemd/util +// It was borrowed here to avoid a dependency on cgo. +// +// IsRunningSystemd checks whether the host was booted with systemd as its init +// system. This functions similarly to systemd's `sd_booted(3)`: internally, it +// checks whether /run/systemd/system/ exists and is a directory. +// http://www.freedesktop.org/software/systemd/man/sd_booted.html +func IsRunningSystemd() bool { + isRunningSystemdOnce.Do(func() { + fi, err := os.Lstat("/run/systemd/system") + isRunningSystemd = err == nil && fi.IsDir() + }) + return isRunningSystemd +} + +// systemd represents slice hierarchy using `-`, so we need to follow suit when +// generating the path of slice. Essentially, test-a-b.slice becomes +// /test.slice/test-a.slice/test-a-b.slice. +func ExpandSlice(slice string) (string, error) { + suffix := ".slice" + // Name has to end with ".slice", but can't be just ".slice". + if len(slice) < len(suffix) || !strings.HasSuffix(slice, suffix) { + return "", fmt.Errorf("invalid slice name: %s", slice) + } + + // Path-separators are not allowed. 
+ if strings.Contains(slice, "/") { + return "", fmt.Errorf("invalid slice name: %s", slice) + } + + var path, prefix string + sliceName := strings.TrimSuffix(slice, suffix) + // if input was -.slice, we should just return root now + if sliceName == "-" { + return "/", nil + } + for _, component := range strings.Split(sliceName, "-") { + // test--a.slice isn't permitted, nor is -test.slice. + if component == "" { + return "", fmt.Errorf("invalid slice name: %s", slice) + } + + // Append the component to the path and to the prefix. + path += "/" + prefix + component + suffix + prefix += component + "-" + } + return path, nil +} + +func newProp(name string, units interface{}) systemdDbus.Property { + return systemdDbus.Property{ + Name: name, + Value: dbus.MakeVariant(units), + } +} + +func getUnitName(c *cgroups.Cgroup) string { + // by default, we create a scope unless the user explicitly asks for a slice. + if !strings.HasSuffix(c.Name, ".slice") { + return c.ScopePrefix + "-" + c.Name + ".scope" + } + return c.Name +} + +// This code should be in sync with getUnitName. +func getUnitType(unitName string) string { + if strings.HasSuffix(unitName, ".slice") { + return "Slice" + } + return "Scope" +} + +// isDbusError returns true if the error is a specific dbus error. +func isDbusError(err error, name string) bool { + if err != nil { + var derr dbus.Error + if errors.As(err, &derr) { + return strings.Contains(derr.Name, name) + } + } + return false +} + +// isUnitExists returns true if the error is that a systemd unit already exists. 
+func isUnitExists(err error) bool { + return isDbusError(err, "org.freedesktop.systemd1.UnitExists") +} + +func startUnit(cm *dbusConnManager, unitName string, properties []systemdDbus.Property, ignoreExist bool) error { + statusChan := make(chan string, 1) + retry := true + +retry: + err := cm.retryOnDisconnect(func(c *systemdDbus.Conn) error { + _, err := c.StartTransientUnitContext(context.TODO(), unitName, "replace", properties, statusChan) + return err + }) + if err != nil { + if !isUnitExists(err) { + return err + } + if ignoreExist { + // TODO: remove this hack. + // This is kubelet making sure a slice exists (see + // https://github.com/opencontainers/runc/pull/1124). + return nil + } + if retry { + // In case a unit with the same name exists, this may + // be a leftover failed unit. Reset it, so systemd can + // remove it, and retry once. + err = resetFailedUnit(cm, unitName) + if err != nil { + logrus.Warnf("unable to reset failed unit: %v", err) + } + retry = false + goto retry + } + return err + } + + timeout := time.NewTimer(30 * time.Second) + defer timeout.Stop() + + select { + case s := <-statusChan: + close(statusChan) + // Please refer to https://pkg.go.dev/github.com/coreos/go-systemd/v22/dbus#Conn.StartUnit + if s != "done" { + _ = resetFailedUnit(cm, unitName) + return fmt.Errorf("error creating systemd unit `%s`: got `%s`", unitName, s) + } + case <-timeout.C: + _ = resetFailedUnit(cm, unitName) + return errors.New("Timeout waiting for systemd to create " + unitName) + } + + return nil +} + +func stopUnit(cm *dbusConnManager, unitName string) error { + statusChan := make(chan string, 1) + err := cm.retryOnDisconnect(func(c *systemdDbus.Conn) error { + _, err := c.StopUnitContext(context.TODO(), unitName, "replace", statusChan) + return err + }) + if err == nil { + timeout := time.NewTimer(30 * time.Second) + defer timeout.Stop() + + select { + case s := <-statusChan: + close(statusChan) + // Please refer to 
https://godoc.org/github.com/coreos/go-systemd/v22/dbus#Conn.StartUnit + if s != "done" { + logrus.Warnf("error removing unit `%s`: got `%s`. Continuing...", unitName, s) + } + case <-timeout.C: + return errors.New("Timed out while waiting for systemd to remove " + unitName) + } + } + + // In case of a failed unit, let systemd remove it. + _ = resetFailedUnit(cm, unitName) + + return nil +} + +func resetFailedUnit(cm *dbusConnManager, name string) error { + return cm.retryOnDisconnect(func(c *systemdDbus.Conn) error { + return c.ResetFailedUnitContext(context.TODO(), name) + }) +} + +func getUnitTypeProperty(cm *dbusConnManager, unitName string, unitType string, propertyName string) (*systemdDbus.Property, error) { + var prop *systemdDbus.Property + err := cm.retryOnDisconnect(func(c *systemdDbus.Conn) (Err error) { + prop, Err = c.GetUnitTypePropertyContext(context.TODO(), unitName, unitType, propertyName) + return Err + }) + return prop, err +} + +func setUnitProperties(cm *dbusConnManager, name string, properties ...systemdDbus.Property) error { + return cm.retryOnDisconnect(func(c *systemdDbus.Conn) error { + return c.SetUnitPropertiesContext(context.TODO(), name, true, properties...) + }) +} + +func getManagerProperty(cm *dbusConnManager, name string) (string, error) { + str := "" + err := cm.retryOnDisconnect(func(c *systemdDbus.Conn) error { + var err error + str, err = c.GetManagerProperty(name) + return err + }) + if err != nil { + return "", err + } + return strconv.Unquote(str) +} + +func systemdVersion(cm *dbusConnManager) int { + versionOnce.Do(func() { + version = -1 + verStr, err := getManagerProperty(cm, "Version") + if err == nil { + version, err = systemdVersionAtoi(verStr) + } + + if err != nil { + logrus.WithError(err).Error("unable to get systemd version") + } + }) + + return version +} + +// systemdVersionAtoi extracts a numeric systemd version from the argument. 
+// The argument should be of the form: "v245.4-1.fc32", "245", "v245-1.fc32", +// "245-1.fc32" (with or without quotes). The result for all of the above +// should be 245. +func systemdVersionAtoi(str string) (int, error) { + // Unconditionally remove the leading prefix ("v). + str = strings.TrimLeft(str, `"v`) + // Match on the first integer we can grab. + for i := 0; i < len(str); i++ { + if str[i] < '0' || str[i] > '9' { + // First non-digit: cut the tail. + str = str[:i] + break + } + } + ver, err := strconv.Atoi(str) + if err != nil { + return -1, fmt.Errorf("can't parse version: %w", err) + } + return ver, nil +} + +func addCpuQuota(cm *dbusConnManager, properties *[]systemdDbus.Property, quota int64, period uint64) { + if period != 0 { + // systemd only supports CPUQuotaPeriodUSec since v242 + sdVer := systemdVersion(cm) + if sdVer >= 242 { + *properties = append(*properties, + newProp("CPUQuotaPeriodUSec", period)) + } else { + logrus.Debugf("systemd v%d is too old to support CPUQuotaPeriodSec "+ + " (setting will still be applied to cgroupfs)", sdVer) + } + } + if quota != 0 || period != 0 { + // corresponds to USEC_INFINITY in systemd + cpuQuotaPerSecUSec := uint64(math.MaxUint64) + if quota > 0 { + if period == 0 { + // assume the default + period = defCPUQuotaPeriod + } + // systemd converts CPUQuotaPerSecUSec (microseconds per CPU second) to CPUQuota + // (integer percentage of CPU) internally. This means that if a fractional percent of + // CPU is indicated by Resources.CpuQuota, we need to round up to the nearest + // 10ms (1% of a second) such that child cgroups can set the cpu.cfs_quota_us they expect. 
+ cpuQuotaPerSecUSec = uint64(quota*1000000) / period + if cpuQuotaPerSecUSec%10000 != 0 { + cpuQuotaPerSecUSec = ((cpuQuotaPerSecUSec / 10000) + 1) * 10000 + } + } + *properties = append(*properties, + newProp("CPUQuotaPerSecUSec", cpuQuotaPerSecUSec)) + } +} + +func addCpuset(cm *dbusConnManager, props *[]systemdDbus.Property, cpus, mems string) error { + if cpus == "" && mems == "" { + return nil + } + + // systemd only supports AllowedCPUs/AllowedMemoryNodes since v244 + sdVer := systemdVersion(cm) + if sdVer < 244 { + logrus.Debugf("systemd v%d is too old to support AllowedCPUs/AllowedMemoryNodes"+ + " (settings will still be applied to cgroupfs)", sdVer) + return nil + } + + if cpus != "" { + bits, err := RangeToBits(cpus) + if err != nil { + return fmt.Errorf("resources.CPU.Cpus=%q conversion error: %w", + cpus, err) + } + *props = append(*props, + newProp("AllowedCPUs", bits)) + } + if mems != "" { + bits, err := RangeToBits(mems) + if err != nil { + return fmt.Errorf("resources.CPU.Mems=%q conversion error: %w", + mems, err) + } + *props = append(*props, + newProp("AllowedMemoryNodes", bits)) + } + return nil +} + +// generateDeviceProperties takes the configured device rules and generates a +// corresponding set of systemd properties to configure the devices correctly. +func generateDeviceProperties(r *cgroups.Resources, cm *dbusConnManager) ([]systemdDbus.Property, error) { + if GenerateDeviceProps == nil { + if len(r.Devices) > 0 { + return nil, cgroups.ErrDevicesUnsupported + } + return nil, nil + } + + return GenerateDeviceProps(r, systemdVersion(cm)) +} diff --git a/systemd/cpuset.go b/systemd/cpuset.go new file mode 100644 index 0000000..c6f5642 --- /dev/null +++ b/systemd/cpuset.go @@ -0,0 +1,60 @@ +package systemd + +import ( + "errors" + "math/big" + "strconv" + "strings" +) + +// RangeToBits converts a text representation of a CPU mask (as written to +// or read from cgroups' cpuset.* files, e.g. 
"1,3-5") to a slice of bytes +// with the corresponding bits set (as consumed by systemd over dbus as +// AllowedCPUs/AllowedMemoryNodes unit property value). +func RangeToBits(str string) ([]byte, error) { + bits := new(big.Int) + + for _, r := range strings.Split(str, ",") { + // allow extra spaces around + r = strings.TrimSpace(r) + // allow empty elements (extra commas) + if r == "" { + continue + } + startr, endr, ok := strings.Cut(r, "-") + if ok { + start, err := strconv.ParseUint(startr, 10, 32) + if err != nil { + return nil, err + } + end, err := strconv.ParseUint(endr, 10, 32) + if err != nil { + return nil, err + } + if start > end { + return nil, errors.New("invalid range: " + r) + } + for i := start; i <= end; i++ { + bits.SetBit(bits, int(i), 1) + } + } else { + val, err := strconv.ParseUint(startr, 10, 32) + if err != nil { + return nil, err + } + bits.SetBit(bits, int(val), 1) + } + } + + ret := bits.Bytes() + if len(ret) == 0 { + // do not allow empty values + return nil, errors.New("empty value") + } + + // fit cpuset parsing order in systemd + for l, r := 0, len(ret)-1; l < r; l, r = l+1, r-1 { + ret[l], ret[r] = ret[r], ret[l] + } + return ret, nil +} diff --git a/systemd/cpuset_test.go b/systemd/cpuset_test.go new file mode 100644 index 0000000..bda31a5 --- /dev/null +++ b/systemd/cpuset_test.go @@ -0,0 +1,55 @@ +package systemd + +import ( + "bytes" + "testing" +) + +func TestRangeToBits(t *testing.T) { + testCases := []struct { + in string + out []byte + isErr bool + }{ + {in: "", isErr: true}, + {in: "0", out: []byte{1}}, + {in: "1", out: []byte{2}}, + {in: "0-1", out: []byte{3}}, + {in: "0,1", out: []byte{3}}, + {in: ",0,1,", out: []byte{3}}, + {in: "0-3", out: []byte{0x0f}}, + {in: "0,1,2-3", out: []byte{0x0f}}, + {in: "4-7", out: []byte{0xf0}}, + {in: "0-7", out: []byte{0xff}}, + {in: "0-15", out: []byte{0xff, 0xff}}, + {in: "16", out: []byte{0, 0, 1}}, + {in: "0-3,32-33", out: []byte{0x0f, 0, 0, 0, 3}}, + // extra spaces and tabs are ok 
+ {in: "1, 2, 1-2", out: []byte{6}}, + {in: " , 1 , 3 , 5-7, ", out: []byte{0xea}}, + // somewhat large values + {in: "128-130,1", out: []byte{2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7}}, + + {in: "-", isErr: true}, + {in: "1-", isErr: true}, + {in: "-3", isErr: true}, + // bad range (start > end) + {in: "54-53", isErr: true}, + // kernel does not allow extra spaces inside a range + {in: "1 - 2", isErr: true}, + } + + for _, tc := range testCases { + out, err := RangeToBits(tc.in) + if err != nil { + if !tc.isErr { + t.Errorf("case %q: unexpected error: %v", tc.in, err) + } + + continue + } + if !bytes.Equal(out, tc.out) { + t.Errorf("case %q: expected %v, got %v", tc.in, tc.out, out) + } + } +} diff --git a/systemd/dbus.go b/systemd/dbus.go new file mode 100644 index 0000000..bb87ae8 --- /dev/null +++ b/systemd/dbus.go @@ -0,0 +1,102 @@ +package systemd + +import ( + "context" + "errors" + "fmt" + "sync" + + systemdDbus "github.com/coreos/go-systemd/v22/dbus" + dbus "github.com/godbus/dbus/v5" +) + +var ( + dbusC *systemdDbus.Conn + dbusMu sync.RWMutex + dbusInited bool + dbusRootless bool +) + +type dbusConnManager struct{} + +// newDbusConnManager initializes systemd dbus connection manager. +func newDbusConnManager(rootless bool) *dbusConnManager { + dbusMu.Lock() + defer dbusMu.Unlock() + if dbusInited && rootless != dbusRootless { + panic("can't have both root and rootless dbus") + } + dbusInited = true + dbusRootless = rootless + return &dbusConnManager{} +} + +// getConnection lazily initializes and returns systemd dbus connection. +func (d *dbusConnManager) getConnection() (*systemdDbus.Conn, error) { + // In the case where dbusC != nil + // Use the read lock the first time to ensure + // that Conn can be acquired at the same time. 
+ dbusMu.RLock() + if conn := dbusC; conn != nil { + dbusMu.RUnlock() + return conn, nil + } + dbusMu.RUnlock() + + // In the case where dbusC == nil + // Use write lock to ensure that only one + // will be created + dbusMu.Lock() + defer dbusMu.Unlock() + if conn := dbusC; conn != nil { + return conn, nil + } + + conn, err := d.newConnection() + if err != nil { + // When dbus-user-session is not installed, we can't detect whether we should try to connect to user dbus or system dbus, so d.dbusRootless is set to false. + // This may fail with a cryptic error "read unix @->/run/systemd/private: read: connection reset by peer: unknown." + // https://github.com/moby/moby/issues/42793 + return nil, fmt.Errorf("failed to connect to dbus (hint: for rootless containers, maybe you need to install dbus-user-session package, see https://github.com/opencontainers/runc/blob/master/docs/cgroup-v2.md): %w", err) + } + dbusC = conn + return conn, nil +} + +func (d *dbusConnManager) newConnection() (*systemdDbus.Conn, error) { + if dbusRootless { + return newUserSystemdDbus() + } + return systemdDbus.NewWithContext(context.TODO()) +} + +// resetConnection resets the connection to its initial state +// (so it can be reconnected if necessary). +func (d *dbusConnManager) resetConnection(conn *systemdDbus.Conn) { + dbusMu.Lock() + defer dbusMu.Unlock() + if dbusC != nil && dbusC == conn { + dbusC.Close() + dbusC = nil + } +} + +// retryOnDisconnect calls op, and if the error it returns is about closed dbus +// connection, the connection is re-established and the op is retried. This helps +// with the situation when dbus is restarted and we have a stale connection. 
+func (d *dbusConnManager) retryOnDisconnect(op func(*systemdDbus.Conn) error) error { + for { + conn, err := d.getConnection() + if err != nil { + return err + } + err = op(conn) + if err == nil { + return nil + } + if !errors.Is(err, dbus.ErrClosed) { + return err + } + d.resetConnection(conn) + } +} diff --git a/systemd/devices.go b/systemd/devices.go new file mode 100644 index 0000000..51ca7fa --- /dev/null +++ b/systemd/devices.go @@ -0,0 +1,74 @@ +package systemd + +import ( + "reflect" + + dbus "github.com/godbus/dbus/v5" + + "github.com/opencontainers/cgroups" +) + +// freezeBeforeSet answers whether there is a need to freeze the cgroup before +// applying its systemd unit properties, and thaw after, while avoiding +// unnecessary freezer state changes. +// +// The reason why we have to freeze is that systemd's application of device +// rules is done disruptively, resulting in spurious errors to common devices +// (unlike our fs driver, they will happily write deny-all rules to running +// containers). So we have to freeze the container to avoid the container get +// an occasional "permission denied" error. +func (m *LegacyManager) freezeBeforeSet(unitName string, r *cgroups.Resources) (needsFreeze, needsThaw bool, err error) { + // Special case for SkipDevices, as used by Kubernetes to create pod + // cgroups with allow-all device policy). + if r.SkipDevices { + if r.SkipFreezeOnSet { + // Both needsFreeze and needsThaw are false. + return + } + + // No need to freeze if SkipDevices is set, and either + // (1) systemd unit does not (yet) exist, or + // (2) it has DevicePolicy=auto and empty DeviceAllow list. + // + // Interestingly, (1) and (2) are the same here because + // a non-existent unit returns default properties, + // and settings in (2) are the defaults. + // + // Do not return errors from getUnitTypeProperty, as they alone + // should not prevent Set from working. 
+ + unitType := getUnitType(unitName) + + devPolicy, e := getUnitTypeProperty(m.dbus, unitName, unitType, "DevicePolicy") + if e == nil && devPolicy.Value == dbus.MakeVariant("auto") { + devAllow, e := getUnitTypeProperty(m.dbus, unitName, unitType, "DeviceAllow") + if e == nil { + if rv := reflect.ValueOf(devAllow.Value.Value()); rv.Kind() == reflect.Slice && rv.Len() == 0 { + needsFreeze = false + needsThaw = false + return + } + } + } + } + + needsFreeze = true + needsThaw = true + + // Check the current freezer state. + freezerState, err := m.GetFreezerState() + if err != nil { + return + } + if freezerState == cgroups.Frozen { + // Already frozen, and should stay frozen. + needsFreeze = false + needsThaw = false + } + + if r.Freezer == cgroups.Frozen { + // Will be frozen anyway -- no need to thaw. + needsThaw = false + } + return +} diff --git a/systemd/freeze_test.go b/systemd/freeze_test.go new file mode 100644 index 0000000..35558a8 --- /dev/null +++ b/systemd/freeze_test.go @@ -0,0 +1,354 @@ +package systemd + +import ( + "bufio" + "bytes" + "os" + "os/exec" + "strings" + "testing" + + "github.com/opencontainers/cgroups" + "golang.org/x/sys/unix" +) + +func TestFreezeBeforeSet(t *testing.T) { + requireV1(t) + + testCases := []struct { + desc string + // Test input. + cg *cgroups.Cgroup + preFreeze bool + // Expected values. + // Before unit creation (Apply). + freeze0, thaw0 bool + // After unit creation. + freeze1, thaw1 bool + }{ + { + // A slice with SkipDevices. + desc: "slice,skip-devices", + cg: &cgroups.Cgroup{ + Name: "system-runc_test_freeze_1.slice", + Parent: "system.slice", + Resources: &cgroups.Resources{ + SkipDevices: true, + }, + }, + // Expected. + freeze0: false, + thaw0: false, + freeze1: false, + thaw1: false, + }, + { + // A scope with SkipDevices. Not a realistic scenario with runc + // (as container can't have SkipDevices == true), but possible + // for a standalone cgroup manager. 
+ desc: "scope,skip-devices", + cg: &cgroups.Cgroup{ + ScopePrefix: "test", + Name: "testFreeze2", + Parent: "system.slice", + Resources: &cgroups.Resources{ + SkipDevices: true, + }, + }, + // Expected. + freeze0: false, + thaw0: false, + freeze1: false, + thaw1: false, + }, + { + // A slice that is about to be frozen in Set. + desc: "slice,will-freeze", + cg: &cgroups.Cgroup{ + Name: "system-runc_test_freeze_3.slice", + Parent: "system.slice", + Resources: &cgroups.Resources{ + Freezer: cgroups.Frozen, + }, + }, + // Expected. + freeze0: true, + thaw0: false, + freeze1: true, + thaw1: false, + }, + { + // A pre-frozen slice that should stay frozen. + desc: "slice,pre-frozen,will-freeze", + cg: &cgroups.Cgroup{ + Name: "system-runc_test_freeze_4.slice", + Parent: "system.slice", + Resources: &cgroups.Resources{ + Freezer: cgroups.Frozen, + }, + }, + preFreeze: true, + // Expected. + freeze0: true, // not actually frozen yet. + thaw0: false, + freeze1: false, + thaw1: false, + }, + { + // A pre-frozen scope with skip devices set. + desc: "scope,pre-frozen,skip-devices", + cg: &cgroups.Cgroup{ + ScopePrefix: "test", + Name: "testFreeze5", + Parent: "system.slice", + Resources: &cgroups.Resources{ + SkipDevices: true, + }, + }, + preFreeze: true, + // Expected. + freeze0: false, + thaw0: false, + freeze1: false, + thaw1: false, + }, + { + // A pre-frozen scope which will be thawed. + desc: "scope,pre-frozen", + cg: &cgroups.Cgroup{ + ScopePrefix: "test", + Name: "testFreeze6", + Parent: "system.slice", + Resources: &cgroups.Resources{}, + }, + preFreeze: true, + // Expected. + freeze0: true, // not actually frozen yet. + thaw0: true, + freeze1: false, + thaw1: false, + }, + } + + for _, tc := range testCases { + tc := tc + t.Run(tc.desc, func(t *testing.T) { + m, err := NewLegacyManager(tc.cg, nil) + if err != nil { + t.Fatal(err) + } + defer m.Destroy() //nolint:errcheck + + // Checks for a non-existent unit. 
+ freeze, thaw, err := m.freezeBeforeSet(getUnitName(tc.cg), tc.cg.Resources) + if err != nil { + t.Fatal(err) + } + if freeze != tc.freeze0 || thaw != tc.thaw0 { + t.Errorf("before Apply (non-existent unit): expected freeze: %v, thaw: %v, got freeze: %v, thaw: %v", + tc.freeze0, tc.thaw0, freeze, thaw) + } + + // Create systemd unit. + pid := -1 + if strings.HasSuffix(getUnitName(tc.cg), ".scope") { + // Scopes require a process inside. + cmd := exec.Command("bash", "-c", "sleep 1m") + if err := cmd.Start(); err != nil { + t.Fatal(err) + } + pid = cmd.Process.Pid + // Make sure to not leave a zombie. + defer func() { + // These may fail, we don't care. + _ = cmd.Process.Kill() + _ = cmd.Wait() + }() + } + if err := m.Apply(pid); err != nil { + t.Fatal(err) + } + if tc.preFreeze { + if err := m.Freeze(cgroups.Frozen); err != nil { + t.Error(err) + return // no more checks + } + } + freeze, thaw, err = m.freezeBeforeSet(getUnitName(tc.cg), tc.cg.Resources) + if err != nil { + t.Error(err) + return // no more checks + } + if freeze != tc.freeze1 || thaw != tc.thaw1 { + t.Errorf("expected freeze: %v, thaw: %v, got freeze: %v, thaw: %v", + tc.freeze1, tc.thaw1, freeze, thaw) + } + // Destroy() timeouts on a frozen container, so we need to thaw it. + if tc.preFreeze { + if err := m.Freeze(cgroups.Thawed); err != nil { + t.Error(err) + } + } + // Destroy() does not kill processes in cgroup, so we should. + if pid != -1 { + if err = unix.Kill(pid, unix.SIGKILL); err != nil { + t.Errorf("unable to kill pid %d: %s", pid, err) + } + } + // Not really needed, but may help catch some bugs. + if err := m.Destroy(); err != nil { + t.Errorf("destroy: %s", err) + } + }) + } +} + +// requireV1 skips the test unless a set of requirements (cgroup v1, +// systemd, root) is met. 
+func requireV1(t *testing.T) { + t.Helper() + if cgroups.IsCgroup2UnifiedMode() { + t.Skip("Test requires cgroup v1.") + } + if !IsRunningSystemd() { + t.Skip("Test requires systemd.") + } + if os.Geteuid() != 0 { + t.Skip("Test requires root.") + } +} + +func TestFreezePodCgroup(t *testing.T) { + if !IsRunningSystemd() { + t.Skip("Test requires systemd.") + } + if os.Geteuid() != 0 { + t.Skip("Test requires root.") + } + + podConfig := &cgroups.Cgroup{ + Parent: "system.slice", + Name: "system-runc_test_pod.slice", + Resources: &cgroups.Resources{ + SkipDevices: true, + Freezer: cgroups.Frozen, + }, + } + // Create a "pod" cgroup (a systemd slice to hold containers), + // which is frozen initially. + pm := newManager(t, podConfig) + if err := pm.Apply(-1); err != nil { + t.Fatal(err) + } + + if err := pm.Set(podConfig.Resources); err != nil { + t.Fatal(err) + } + + // Check the pod is frozen. + pf, err := pm.GetFreezerState() + if err != nil { + t.Fatal(err) + } + if pf != cgroups.Frozen { + t.Fatalf("expected pod to be frozen, got %v", pf) + } + + // Create a "container" within the "pod" cgroup. + // This is not a real container, just a process in the cgroup. + containerConfig := &cgroups.Cgroup{ + Parent: "system-runc_test_pod.slice", + ScopePrefix: "test", + Name: "inner-container", + Resources: &cgroups.Resources{}, + } + + cmd := exec.Command("bash", "-c", "while read; do echo $REPLY; done") + cmd.Env = append(os.Environ(), "LANG=C") + + // Setup stdin. + stdinR, stdinW, err := os.Pipe() + if err != nil { + t.Fatal(err) + } + cmd.Stdin = stdinR + + // Setup stdout. + stdoutR, stdoutW, err := os.Pipe() + if err != nil { + t.Fatal(err) + } + cmd.Stdout = stdoutW + rdr := bufio.NewReader(stdoutR) + + // Setup stderr. + var stderr bytes.Buffer + cmd.Stderr = &stderr + + err = cmd.Start() + stdinR.Close() + stdoutW.Close() + defer func() { + _ = stdinW.Close() + _ = stdoutR.Close() + }() + if err != nil { + t.Fatal(err) + } + // Make sure to not leave a zombie. 
+ defer func() { + // These may fail, we don't care. + _ = cmd.Process.Kill() + _ = cmd.Wait() + }() + + // Put the process into a cgroup. + cm := newManager(t, containerConfig) + + if err := cm.Apply(cmd.Process.Pid); err != nil { + t.Fatal(err) + } + if err := cm.Set(containerConfig.Resources); err != nil { + t.Fatal(err) + } + // Check that we put the "container" into the "pod" cgroup. + if !strings.HasPrefix(cm.Path("freezer"), pm.Path("freezer")) { + t.Fatalf("expected container cgroup path %q to be under pod cgroup path %q", + cm.Path("freezer"), pm.Path("freezer")) + } + // Check the container is not reported as frozen despite the frozen parent. + cf, err := cm.GetFreezerState() + if err != nil { + t.Fatal(err) + } + if cf != cgroups.Thawed { + t.Fatalf("expected container to be thawed, got %v", cf) + } + + // Unfreeze the pod. + if err := pm.Freeze(cgroups.Thawed); err != nil { + t.Fatal(err) + } + + cf, err = cm.GetFreezerState() + if err != nil { + t.Fatal(err) + } + if cf != cgroups.Thawed { + t.Fatalf("expected container to be thawed, got %v", cf) + } + + // Check the "container" works. 
+ marker := "one two\n" + _, err = stdinW.WriteString(marker) + if err != nil { + t.Fatal(err) + } + reply, err := rdr.ReadString('\n') + if err != nil { + t.Fatalf("reading from container: %v", err) + } + if reply != marker { + t.Fatalf("expected %q, got %q", marker, reply) + } +} diff --git a/systemd/systemd_test.go b/systemd/systemd_test.go new file mode 100644 index 0000000..dae851c --- /dev/null +++ b/systemd/systemd_test.go @@ -0,0 +1,180 @@ +package systemd + +import ( + "os" + "reflect" + "testing" + + systemdDbus "github.com/coreos/go-systemd/v22/dbus" + "github.com/opencontainers/cgroups" +) + +func newManager(t *testing.T, config *cgroups.Cgroup) (m cgroups.Manager) { + t.Helper() + var err error + + if cgroups.IsCgroup2UnifiedMode() { + m, err = NewUnifiedManager(config, "") + } else { + m, err = NewLegacyManager(config, nil) + } + if err != nil { + t.Fatal(err) + } + t.Cleanup(func() { _ = m.Destroy() }) + + return m +} + +func TestSystemdVersion(t *testing.T) { + systemdVersionTests := []struct { + verStr string + expectedVer int + expectErr bool + }{ + {`"219"`, 219, false}, + {`"v245.4-1.fc32"`, 245, false}, + {`"241-1"`, 241, false}, + {`"v241-1"`, 241, false}, + {`333.45"`, 333, false}, + {`v321-0`, 321, false}, + {"NaN", -1, true}, + {"", -1, true}, + {"v", -1, true}, + } + for _, sdTest := range systemdVersionTests { + ver, err := systemdVersionAtoi(sdTest.verStr) + if !sdTest.expectErr && err != nil { + t.Errorf("systemdVersionAtoi(%s); want nil; got %v", sdTest.verStr, err) + } + if sdTest.expectErr && err == nil { + t.Errorf("systemdVersionAtoi(%s); wanted failure; got nil", sdTest.verStr) + } + if ver != sdTest.expectedVer { + t.Errorf("systemdVersionAtoi(%s); want %d; got %d", sdTest.verStr, sdTest.expectedVer, ver) + } + } +} + +func TestValidUnitTypes(t *testing.T) { + testCases := []struct { + unitName string + expectedUnitType string + }{ + {"system.slice", "Slice"}, + {"kubepods.slice", "Slice"}, + {"testing-container:ab.scope", 
"Scope"}, + } + for _, sdTest := range testCases { + unitType := getUnitType(sdTest.unitName) + if unitType != sdTest.expectedUnitType { + t.Errorf("getUnitType(%s); want %q; got %q", sdTest.unitName, sdTest.expectedUnitType, unitType) + } + } +} + +func TestUnitExistsIgnored(t *testing.T) { + if !IsRunningSystemd() { + t.Skip("Test requires systemd.") + } + if os.Geteuid() != 0 { + t.Skip("Test requires root.") + } + + podConfig := &cgroups.Cgroup{ + Parent: "system.slice", + Name: "system-runc_test_exists.slice", + Resources: &cgroups.Resources{}, + } + // Create "pods" cgroup (a systemd slice to hold containers). + pm := newManager(t, podConfig) + + // create twice to make sure "UnitExists" error is ignored. + for i := 0; i < 2; i++ { + if err := pm.Apply(-1); err != nil { + t.Fatal(err) + } + } +} + +func TestUnifiedResToSystemdProps(t *testing.T) { + if !IsRunningSystemd() { + t.Skip("Test requires systemd.") + } + if !cgroups.IsCgroup2UnifiedMode() { + t.Skip("cgroup v2 is required") + } + + cm := newDbusConnManager(os.Geteuid() != 0) + + testCases := []struct { + name string + minVer int + res map[string]string + expError bool + expProps []systemdDbus.Property + }{ + { + name: "empty map", + res: map[string]string{}, + }, + { + name: "only cpu.idle=1", + minVer: cpuIdleSupportedVersion, + res: map[string]string{ + "cpu.idle": "1", + }, + expProps: []systemdDbus.Property{ + newProp("CPUWeight", uint64(0)), + }, + }, + { + name: "only cpu.idle=0", + minVer: cpuIdleSupportedVersion, + res: map[string]string{ + "cpu.idle": "0", + }, + }, + { + name: "cpu.idle=1 and cpu.weight=1000", + minVer: cpuIdleSupportedVersion, + res: map[string]string{ + "cpu.idle": "1", + "cpu.weight": "1000", + }, + expProps: []systemdDbus.Property{ + newProp("CPUWeight", uint64(0)), + }, + }, + { + name: "cpu.idle=0 and cpu.weight=1000", + minVer: cpuIdleSupportedVersion, + res: map[string]string{ + "cpu.idle": "0", + "cpu.weight": "1000", + }, + expProps: []systemdDbus.Property{ + 
newProp("CPUWeight", uint64(1000)), + }, + }, + } + + for _, tc := range testCases { + tc := tc + t.Run(tc.name, func(t *testing.T) { + if tc.minVer != 0 && systemdVersion(cm) < tc.minVer { + t.Skipf("requires systemd >= %d", tc.minVer) + } + props, err := unifiedResToSystemdProps(cm, tc.res) + if err != nil && !tc.expError { + t.Fatalf("expected no error, got: %v", err) + } + if err == nil && tc.expError { + t.Fatal("expected error, got nil") + } + if !reflect.DeepEqual(tc.expProps, props) { + t.Errorf("wrong properties (exp %+v, got %+v)", tc.expProps, props) + } + }) + } +} diff --git a/systemd/user.go b/systemd/user.go new file mode 100644 index 0000000..4a4348e --- /dev/null +++ b/systemd/user.go @@ -0,0 +1,92 @@ +package systemd + +import ( + "bufio" + "bytes" + "errors" + "fmt" + "os" + "os/exec" + "path/filepath" + "strconv" + "strings" + + systemdDbus "github.com/coreos/go-systemd/v22/dbus" + dbus "github.com/godbus/dbus/v5" + "github.com/moby/sys/userns" +) + +// newUserSystemdDbus creates a connection for systemd user-instance. +func newUserSystemdDbus() (*systemdDbus.Conn, error) { + addr, err := DetectUserDbusSessionBusAddress() + if err != nil { + return nil, err + } + uid, err := DetectUID() + if err != nil { + return nil, err + } + + return systemdDbus.NewConnection(func() (*dbus.Conn, error) { + conn, err := dbus.Dial(addr) + if err != nil { + return nil, fmt.Errorf("error while dialing %q: %w", addr, err) + } + methods := []dbus.Auth{dbus.AuthExternal(strconv.Itoa(uid))} + err = conn.Auth(methods) + if err != nil { + conn.Close() + return nil, fmt.Errorf("error while authenticating connection (address=%q, UID=%d): %w", addr, uid, err) + } + if err = conn.Hello(); err != nil { + conn.Close() + return nil, fmt.Errorf("error while sending Hello message (address=%q, UID=%d): %w", addr, uid, err) + } + return conn, nil + }) +} + +// DetectUID detects UID from the OwnerUID field of `busctl --user status` +// if running in userNS. 
The value corresponds to sd_bus_creds_get_owner_uid(3) . +// +// Otherwise returns os.Getuid() . +func DetectUID() (int, error) { + if !userns.RunningInUserNS() { + return os.Getuid(), nil + } + b, err := exec.Command("busctl", "--user", "--no-pager", "status").CombinedOutput() + if err != nil { + return -1, fmt.Errorf("could not execute `busctl --user --no-pager status` (output: %q): %w", string(b), err) + } + scanner := bufio.NewScanner(bytes.NewReader(b)) + for scanner.Scan() { + s := strings.TrimSpace(scanner.Text()) + if uidStr, ok := strings.CutPrefix(s, "OwnerUID="); ok { + i, err := strconv.Atoi(uidStr) + if err != nil { + return -1, fmt.Errorf("could not detect the OwnerUID: %w", err) + } + return i, nil + } + } + if err := scanner.Err(); err != nil { + return -1, err + } + return -1, errors.New("could not detect the OwnerUID") +} + +// DetectUserDbusSessionBusAddress returns $DBUS_SESSION_BUS_ADDRESS, if set. +// Otherwise it returns "unix:path=$XDG_RUNTIME_DIR/bus", if $XDG_RUNTIME_DIR/bus exists. 
+func DetectUserDbusSessionBusAddress() (string, error) { + if env := os.Getenv("DBUS_SESSION_BUS_ADDRESS"); env != "" { + return env, nil + } + if xdr := os.Getenv("XDG_RUNTIME_DIR"); xdr != "" { + busPath := filepath.Join(xdr, "bus") + if _, err := os.Stat(busPath); err == nil { + busAddress := "unix:path=" + dbus.EscapeBusAddressValue(busPath) + return busAddress, nil + } + } + return "", errors.New("could not detect DBUS_SESSION_BUS_ADDRESS from the environment; make sure you have installed the dbus-user-session or dbus-daemon package; note you may need to re-login") +} diff --git a/systemd/v1.go b/systemd/v1.go new file mode 100644 index 0000000..8453e9b --- /dev/null +++ b/systemd/v1.go @@ -0,0 +1,412 @@ +package systemd + +import ( + "errors" + "os" + "path/filepath" + "strings" + "sync" + + systemdDbus "github.com/coreos/go-systemd/v22/dbus" + "github.com/sirupsen/logrus" + + "github.com/opencontainers/cgroups" + "github.com/opencontainers/cgroups/fs" +) + +type LegacyManager struct { + mu sync.Mutex + cgroups *cgroups.Cgroup + paths map[string]string + dbus *dbusConnManager +} + +func NewLegacyManager(cg *cgroups.Cgroup, paths map[string]string) (*LegacyManager, error) { + if cg.Rootless { + return nil, errors.New("cannot use rootless systemd cgroups manager on cgroup v1") + } + if cg.Resources != nil && cg.Resources.Unified != nil { + return nil, cgroups.ErrV1NoUnified + } + if paths == nil { + var err error + paths, err = initPaths(cg) + if err != nil { + return nil, err + } + } + return &LegacyManager{ + cgroups: cg, + paths: paths, + dbus: newDbusConnManager(false), + }, nil +} + +type subsystem interface { + // Name returns the name of the subsystem. + Name() string + // GetStats returns the stats, as 'stats', corresponding to the cgroup under 'path'. + GetStats(path string, stats *cgroups.Stats) error + // Set sets cgroup resource limits. 
+ Set(path string, r *cgroups.Resources) error +} + +var errSubsystemDoesNotExist = errors.New("cgroup: subsystem does not exist") + +var legacySubsystems = []subsystem{ + &fs.CpusetGroup{}, + &fs.DevicesGroup{}, + &fs.MemoryGroup{}, + &fs.CpuGroup{}, + &fs.CpuacctGroup{}, + &fs.PidsGroup{}, + &fs.BlkioGroup{}, + &fs.HugetlbGroup{}, + &fs.PerfEventGroup{}, + &fs.FreezerGroup{}, + &fs.NetPrioGroup{}, + &fs.NetClsGroup{}, + &fs.NameGroup{GroupName: "name=systemd"}, + &fs.RdmaGroup{}, + &fs.NameGroup{GroupName: "misc"}, +} + +func genV1ResourcesProperties(r *cgroups.Resources, cm *dbusConnManager) ([]systemdDbus.Property, error) { + var properties []systemdDbus.Property + + deviceProperties, err := generateDeviceProperties(r, cm) + if err != nil { + return nil, err + } + properties = append(properties, deviceProperties...) + + if r.Memory != 0 { + properties = append(properties, + newProp("MemoryLimit", uint64(r.Memory))) + } + + if r.CpuShares != 0 { + properties = append(properties, + newProp("CPUShares", r.CpuShares)) + } + + addCpuQuota(cm, &properties, r.CpuQuota, r.CpuPeriod) + + if r.BlkioWeight != 0 { + properties = append(properties, + newProp("BlockIOWeight", uint64(r.BlkioWeight))) + } + + if r.PidsLimit > 0 || r.PidsLimit == -1 { + properties = append(properties, + newProp("TasksMax", uint64(r.PidsLimit))) + } + + err = addCpuset(cm, &properties, r.CpusetCpus, r.CpusetMems) + if err != nil { + return nil, err + } + + return properties, nil +} + +// initPaths figures out and returns paths to cgroups. 
+func initPaths(c *cgroups.Cgroup) (map[string]string, error) { + slice := "system.slice" + if c.Parent != "" { + var err error + slice, err = ExpandSlice(c.Parent) + if err != nil { + return nil, err + } + } + + unit := getUnitName(c) + + paths := make(map[string]string) + for _, s := range legacySubsystems { + subsystemPath, err := getSubsystemPath(slice, unit, s.Name()) + if err != nil { + // Even if it's `not found` error, we'll return err + // because devices cgroup is hard requirement for + // container security. + if s.Name() == "devices" { + return nil, err + } + // Don't fail if a cgroup hierarchy was not found, just skip this subsystem + if cgroups.IsNotFound(err) { + continue + } + return nil, err + } + paths[s.Name()] = subsystemPath + } + + // If systemd is using cgroups-hybrid mode then add the slice path of + // this container to the paths so the following process executed with + // "runc exec" joins that cgroup as well. + if cgroups.IsCgroup2HybridMode() { + // "" means cgroup-hybrid path + cgroupsHybridPath, err := getSubsystemPath(slice, unit, "") + if err != nil && cgroups.IsNotFound(err) { + return nil, err + } + paths[""] = cgroupsHybridPath + } + + return paths, nil +} + +func (m *LegacyManager) Apply(pid int) error { + var ( + c = m.cgroups + unitName = getUnitName(c) + slice = "system.slice" + properties []systemdDbus.Property + ) + + m.mu.Lock() + defer m.mu.Unlock() + + if c.Parent != "" { + slice = c.Parent + } + + properties = append(properties, systemdDbus.PropDescription("libcontainer container "+c.Name)) + + if strings.HasSuffix(unitName, ".slice") { + // If we create a slice, the parent is defined via a Wants=. + properties = append(properties, systemdDbus.PropWants(slice)) + } else { + // Otherwise it's a scope, which we put into a Slice=. + properties = append(properties, systemdDbus.PropSlice(slice)) + // Assume scopes always support delegation (supported since systemd v218). 
+ properties = append(properties, newProp("Delegate", true)) + } + + // only add pid if its valid, -1 is used w/ general slice creation. + if pid != -1 { + properties = append(properties, newProp("PIDs", []uint32{uint32(pid)})) + } + + // Always enable accounting, this gets us the same behaviour as the fs implementation, + // plus the kernel has some problems with joining the memory cgroup at a later time. + properties = append(properties, + newProp("MemoryAccounting", true), + newProp("CPUAccounting", true), + newProp("BlockIOAccounting", true), + newProp("TasksAccounting", true), + ) + + // Assume DefaultDependencies= will always work (the check for it was previously broken.) + properties = append(properties, + newProp("DefaultDependencies", false)) + + properties = append(properties, c.SystemdProps...) + + if err := startUnit(m.dbus, unitName, properties, pid == -1); err != nil { + return err + } + + if err := m.joinCgroups(pid); err != nil { + return err + } + + return nil +} + +func (m *LegacyManager) Destroy() error { + m.mu.Lock() + defer m.mu.Unlock() + + stopErr := stopUnit(m.dbus, getUnitName(m.cgroups)) + + // Both on success and on error, cleanup all the cgroups + // we are aware of, as some of them were created directly + // by Apply() and are not managed by systemd. 
+ if err := cgroups.RemovePaths(m.paths); err != nil && stopErr == nil { + return err + } + + return stopErr +} + +func (m *LegacyManager) Path(subsys string) string { + m.mu.Lock() + defer m.mu.Unlock() + return m.paths[subsys] +} + +func (m *LegacyManager) joinCgroups(pid int) error { + for _, sys := range legacySubsystems { + name := sys.Name() + switch name { + case "name=systemd": + // let systemd handle this + case "cpuset": + if path, ok := m.paths[name]; ok { + s := &fs.CpusetGroup{} + if err := s.ApplyDir(path, m.cgroups.Resources, pid); err != nil { + return err + } + } + default: + if path, ok := m.paths[name]; ok { + if err := os.MkdirAll(path, 0o755); err != nil { + return err + } + if err := cgroups.WriteCgroupProc(path, pid); err != nil { + return err + } + } + } + } + + return nil +} + +func getSubsystemPath(slice, unit, subsystem string) (string, error) { + mountpoint, err := cgroups.FindCgroupMountpoint("", subsystem) + if err != nil { + return "", err + } + + return filepath.Join(mountpoint, slice, unit), nil +} + +func (m *LegacyManager) Freeze(state cgroups.FreezerState) error { + err := m.doFreeze(state) + if err == nil { + m.cgroups.Resources.Freezer = state + } + return err +} + +// doFreeze is the same as Freeze but without +// changing the m.cgroups.Resources.Frozen field. 
+func (m *LegacyManager) doFreeze(state cgroups.FreezerState) error { + path, ok := m.paths["freezer"] + if !ok { + return errSubsystemDoesNotExist + } + freezer := &fs.FreezerGroup{} + resources := &cgroups.Resources{Freezer: state} + return freezer.Set(path, resources) +} + +func (m *LegacyManager) GetPids() ([]int, error) { + path, ok := m.paths["devices"] + if !ok { + return nil, errSubsystemDoesNotExist + } + return cgroups.GetPids(path) +} + +func (m *LegacyManager) GetAllPids() ([]int, error) { + path, ok := m.paths["devices"] + if !ok { + return nil, errSubsystemDoesNotExist + } + return cgroups.GetAllPids(path) +} + +func (m *LegacyManager) GetStats() (*cgroups.Stats, error) { + m.mu.Lock() + defer m.mu.Unlock() + stats := cgroups.NewStats() + for _, sys := range legacySubsystems { + path := m.paths[sys.Name()] + if path == "" { + continue + } + if err := sys.GetStats(path, stats); err != nil { + return nil, err + } + } + + return stats, nil +} + +func (m *LegacyManager) Set(r *cgroups.Resources) error { + if r == nil { + return nil + } + if r.Unified != nil { + return cgroups.ErrV1NoUnified + } + properties, err := genV1ResourcesProperties(r, m.dbus) + if err != nil { + return err + } + + unitName := getUnitName(m.cgroups) + needsFreeze, needsThaw, err := m.freezeBeforeSet(unitName, r) + if err != nil { + return err + } + + if needsFreeze { + if err := m.doFreeze(cgroups.Frozen); err != nil { + // If freezer cgroup isn't supported, we just warn about it. + logrus.Infof("freeze container before SetUnitProperties failed: %v", err) + // skip update the cgroup while frozen failed. #3803 + if !errors.Is(err, errSubsystemDoesNotExist) { + if needsThaw { + if thawErr := m.doFreeze(cgroups.Thawed); thawErr != nil { + logrus.Infof("thaw container after doFreeze failed: %v", thawErr) + } + } + return err + } + } + } + setErr := setUnitProperties(m.dbus, unitName, properties...) 
+ if needsThaw { + if err := m.doFreeze(cgroups.Thawed); err != nil { + logrus.Infof("thaw container after SetUnitProperties failed: %v", err) + } + } + if setErr != nil { + return setErr + } + + for _, sys := range legacySubsystems { + // Get the subsystem path, but don't error out for not found cgroups. + path, ok := m.paths[sys.Name()] + if !ok { + continue + } + if err := sys.Set(path, r); err != nil { + return err + } + } + + return nil +} + +func (m *LegacyManager) GetPaths() map[string]string { + m.mu.Lock() + defer m.mu.Unlock() + return m.paths +} + +func (m *LegacyManager) GetCgroups() (*cgroups.Cgroup, error) { + return m.cgroups, nil +} + +func (m *LegacyManager) GetFreezerState() (cgroups.FreezerState, error) { + path, ok := m.paths["freezer"] + if !ok { + return cgroups.Undefined, nil + } + freezer := &fs.FreezerGroup{} + return freezer.GetState(path) +} + +func (m *LegacyManager) Exists() bool { + return cgroups.PathExists(m.Path("devices")) +} + +func (m *LegacyManager) OOMKillCount() (uint64, error) { + return fs.OOMKillCount(m.Path("memory")) +} diff --git a/systemd/v2.go b/systemd/v2.go new file mode 100644 index 0000000..42a6e35 --- /dev/null +++ b/systemd/v2.go @@ -0,0 +1,515 @@ +package systemd + +import ( + "bufio" + "errors" + "fmt" + "math" + "os" + "path/filepath" + "strconv" + "strings" + "sync" + + systemdDbus "github.com/coreos/go-systemd/v22/dbus" + securejoin "github.com/cyphar/filepath-securejoin" + "github.com/sirupsen/logrus" + + "github.com/opencontainers/cgroups" + "github.com/opencontainers/cgroups/fs2" +) + +const ( + cpuIdleSupportedVersion = 252 +) + +type UnifiedManager struct { + mu sync.Mutex + cgroups *cgroups.Cgroup + // path is like "/sys/fs/cgroup/user.slice/user-1001.slice/session-1.scope" + path string + dbus *dbusConnManager + fsMgr cgroups.Manager +} + +func NewUnifiedManager(config *cgroups.Cgroup, path string) (*UnifiedManager, error) { + m := &UnifiedManager{ + cgroups: config, + path: path, + dbus: 
newDbusConnManager(config.Rootless), + } + if err := m.initPath(); err != nil { + return nil, err + } + + fsMgr, err := fs2.NewManager(config, m.path) + if err != nil { + return nil, err + } + m.fsMgr = fsMgr + + return m, nil +} + +func shouldSetCPUIdle(cm *dbusConnManager, v string) bool { + // The only valid values for cpu.idle are 0 and 1. As it is + // not possible to directly set cpu.idle to 0 via systemd, + // ignore 0. Ignore other values as we'll error out later + // in Set() while calling fsMgr.Set(). + return v == "1" && systemdVersion(cm) >= cpuIdleSupportedVersion +} + +// unifiedResToSystemdProps tries to convert from Cgroup.Resources.Unified +// key/value map (where key is cgroupfs file name) to systemd unit properties. +// This is on a best-effort basis, so the properties that are not known +// (to this function and/or systemd) are ignored (but logged with "debug" +// log level). +// +// For the list of keys, see https://www.kernel.org/doc/Documentation/cgroup-v2.txt +// +// For the list of systemd unit properties, see systemd.resource-control(5). +func unifiedResToSystemdProps(cm *dbusConnManager, res map[string]string) (props []systemdDbus.Property, _ error) { + var err error + + for k, v := range res { + if strings.Contains(k, "/") { + return nil, fmt.Errorf("unified resource %q must be a file name (no slashes)", k) + } + if strings.IndexByte(k, '.') <= 0 { + return nil, fmt.Errorf("unified resource %q must be in the form CONTROLLER.PARAMETER", k) + } + // Kernel is quite forgiving to extra whitespace + // around the value, and so should we. + v = strings.TrimSpace(v) + // Please keep cases in alphabetical order. + switch k { + case "cpu.idle": + if shouldSetCPUIdle(cm, v) { + // Setting CPUWeight to 0 tells systemd + // to set cpu.idle to 1. 
+ props = append(props, + newProp("CPUWeight", uint64(0))) + } + + case "cpu.max": + // value: quota [period] + quota := int64(0) // 0 means "unlimited" for addCpuQuota, if period is set + period := defCPUQuotaPeriod + sv := strings.Fields(v) + if len(sv) < 1 || len(sv) > 2 { + return nil, fmt.Errorf("unified resource %q value invalid: %q", k, v) + } + // quota + if sv[0] != "max" { + quota, err = strconv.ParseInt(sv[0], 10, 64) + if err != nil { + return nil, fmt.Errorf("unified resource %q period value conversion error: %w", k, err) + } + } + // period + if len(sv) == 2 { + period, err = strconv.ParseUint(sv[1], 10, 64) + if err != nil { + return nil, fmt.Errorf("unified resource %q quota value conversion error: %w", k, err) + } + } + addCpuQuota(cm, &props, quota, period) + + case "cpu.weight": + if shouldSetCPUIdle(cm, strings.TrimSpace(res["cpu.idle"])) { + // Do not add duplicate CPUWeight property + // (see case "cpu.idle" above). + logrus.Warn("unable to apply both cpu.weight and cpu.idle to systemd, ignoring cpu.weight") + continue + } + num, err := strconv.ParseUint(v, 10, 64) + if err != nil { + return nil, fmt.Errorf("unified resource %q value conversion error: %w", k, err) + } + props = append(props, + newProp("CPUWeight", num)) + + case "cpuset.cpus", "cpuset.mems": + bits, err := RangeToBits(v) + if err != nil { + return nil, fmt.Errorf("unified resource %q=%q conversion error: %w", k, v, err) + } + m := map[string]string{ + "cpuset.cpus": "AllowedCPUs", + "cpuset.mems": "AllowedMemoryNodes", + } + // systemd only supports these properties since v244 + sdVer := systemdVersion(cm) + if sdVer >= 244 { + props = append(props, + newProp(m[k], bits)) + } else { + logrus.Debugf("systemd v%d is too old to support %s"+ + " (setting will still be applied to cgroupfs)", + sdVer, m[k]) + } + + case "memory.high", "memory.low", "memory.min", "memory.max", "memory.swap.max": + num := uint64(math.MaxUint64) + if v != "max" { + num, err = strconv.ParseUint(v, 10, 
64) + if err != nil { + return nil, fmt.Errorf("unified resource %q value conversion error: %w", k, err) + } + } + m := map[string]string{ + "memory.high": "MemoryHigh", + "memory.low": "MemoryLow", + "memory.min": "MemoryMin", + "memory.max": "MemoryMax", + "memory.swap.max": "MemorySwapMax", + } + props = append(props, + newProp(m[k], num)) + + case "pids.max": + num := uint64(math.MaxUint64) + if v != "max" { + var err error + num, err = strconv.ParseUint(v, 10, 64) + if err != nil { + return nil, fmt.Errorf("unified resource %q value conversion error: %w", k, err) + } + } + props = append(props, + newProp("TasksMax", num)) + + case "memory.oom.group": + // Setting this to 1 is roughly equivalent to OOMPolicy=kill + // (as per systemd.service(5) and + // https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html), + // but it's not clear what to do if it is unset or set + // to 0 in runc update, as there are two other possible + // values for OOMPolicy (continue/stop). + fallthrough + + default: + // Ignore the unknown resource here -- will still be + // applied in Set which calls fs2.Set. + logrus.Debugf("don't know how to convert unified resource %q=%q to systemd unit property; skipping (will still be applied to cgroupfs)", k, v) + } + } + + return props, nil +} + +func genV2ResourcesProperties(dirPath string, r *cgroups.Resources, cm *dbusConnManager) ([]systemdDbus.Property, error) { + // We need this check before setting systemd properties, otherwise + // the container is OOM-killed and the systemd unit is removed + // before we get to fsMgr.Set(). + if err := fs2.CheckMemoryUsage(dirPath, r); err != nil { + return nil, err + } + + var properties []systemdDbus.Property + + // NOTE: This is of questionable correctness because we insert our own + // devices eBPF program later. Two programs with identical rules + // aren't the end of the world, but it is a bit concerning. 
However + // it's unclear if systemd removes all eBPF programs attached when + // doing SetUnitProperties... + deviceProperties, err := generateDeviceProperties(r, cm) + if err != nil { + return nil, err + } + properties = append(properties, deviceProperties...) + + if r.Memory != 0 { + properties = append(properties, + newProp("MemoryMax", uint64(r.Memory))) + } + if r.MemoryReservation != 0 { + properties = append(properties, + newProp("MemoryLow", uint64(r.MemoryReservation))) + } + + swap, err := cgroups.ConvertMemorySwapToCgroupV2Value(r.MemorySwap, r.Memory) + if err != nil { + return nil, err + } + if swap != 0 { + properties = append(properties, + newProp("MemorySwapMax", uint64(swap))) + } + + idleSet := false + // The logic here is the same as in shouldSetCPUIdle. + if r.CPUIdle != nil && *r.CPUIdle == 1 && systemdVersion(cm) >= cpuIdleSupportedVersion { + properties = append(properties, + newProp("CPUWeight", uint64(0))) + idleSet = true + } + if r.CpuWeight != 0 { + if idleSet { + // Ignore CpuWeight if CPUIdle is already set. + logrus.Warn("unable to apply both CPUWeight and CpuIdle to systemd, ignoring CPUWeight") + } else { + properties = append(properties, + newProp("CPUWeight", r.CpuWeight)) + } + } + + addCpuQuota(cm, &properties, r.CpuQuota, r.CpuPeriod) + + if r.PidsLimit > 0 || r.PidsLimit == -1 { + properties = append(properties, + newProp("TasksMax", uint64(r.PidsLimit))) + } + + err = addCpuset(cm, &properties, r.CpusetCpus, r.CpusetMems) + if err != nil { + return nil, err + } + + // ignore r.KernelMemory + + // convert Resources.Unified map to systemd properties + if r.Unified != nil { + unifiedProps, err := unifiedResToSystemdProps(cm, r.Unified) + if err != nil { + return nil, err + } + properties = append(properties, unifiedProps...) 
+ } + + return properties, nil +} + +func (m *UnifiedManager) Apply(pid int) error { + var ( + c = m.cgroups + unitName = getUnitName(c) + properties []systemdDbus.Property + ) + + slice := "system.slice" + if m.cgroups.Rootless { + slice = "user.slice" + } + if c.Parent != "" { + slice = c.Parent + } + + properties = append(properties, systemdDbus.PropDescription("libcontainer container "+c.Name)) + + if strings.HasSuffix(unitName, ".slice") { + // If we create a slice, the parent is defined via a Wants=. + properties = append(properties, systemdDbus.PropWants(slice)) + } else { + // Otherwise it's a scope, which we put into a Slice=. + properties = append(properties, systemdDbus.PropSlice(slice)) + // Assume scopes always support delegation (supported since systemd v218). + properties = append(properties, newProp("Delegate", true)) + } + + // only add pid if its valid, -1 is used w/ general slice creation. + if pid != -1 { + properties = append(properties, newProp("PIDs", []uint32{uint32(pid)})) + } + + // Always enable accounting, this gets us the same behaviour as the fs implementation, + // plus the kernel has some problems with joining the memory cgroup at a later time. + properties = append(properties, + newProp("MemoryAccounting", true), + newProp("CPUAccounting", true), + newProp("IOAccounting", true), + newProp("TasksAccounting", true), + ) + + // Assume DefaultDependencies= will always work (the check for it was previously broken.) + properties = append(properties, + newProp("DefaultDependencies", false)) + + properties = append(properties, c.SystemdProps...) + + if err := startUnit(m.dbus, unitName, properties, pid == -1); err != nil { + return fmt.Errorf("unable to start unit %q (properties %+v): %w", unitName, properties, err) + } + + if err := fs2.CreateCgroupPath(m.path, m.cgroups); err != nil { + return err + } + + if c.OwnerUID != nil { + // The directory itself must be chowned. 
+ err := os.Chown(m.path, *c.OwnerUID, -1) + if err != nil { + return err + } + + filesToChown, err := cgroupFilesToChown() + if err != nil { + return err + } + + for _, v := range filesToChown { + err := os.Chown(m.path+"/"+v, *c.OwnerUID, -1) + // Some files might not be present. + if err != nil && !errors.Is(err, os.ErrNotExist) { + return err + } + } + } + + return nil +} + +// The kernel exposes a list of files that should be chowned to the delegate +// uid in /sys/kernel/cgroup/delegate. If the file is not present +// (Linux < 4.15), use the initial values mentioned in cgroups(7). +func cgroupFilesToChown() ([]string, error) { + const cgroupDelegateFile = "/sys/kernel/cgroup/delegate" + + f, err := os.Open(cgroupDelegateFile) + if err != nil { + return []string{"cgroup.procs", "cgroup.subtree_control", "cgroup.threads"}, nil + } + defer f.Close() + + filesToChown := []string{} + scanner := bufio.NewScanner(f) + for scanner.Scan() { + filesToChown = append(filesToChown, scanner.Text()) + } + if err := scanner.Err(); err != nil { + return nil, fmt.Errorf("error reading %s: %w", cgroupDelegateFile, err) + } + + return filesToChown, nil +} + +func (m *UnifiedManager) Destroy() error { + m.mu.Lock() + defer m.mu.Unlock() + + unitName := getUnitName(m.cgroups) + if err := stopUnit(m.dbus, unitName); err != nil { + return err + } + + // systemd 239 do not remove sub-cgroups. + err := m.fsMgr.Destroy() + // fsMgr.Destroy has handled ErrNotExist + if err != nil { + return err + } + + return nil +} + +func (m *UnifiedManager) Path(_ string) string { + return m.path +} + +// getSliceFull value is used in initPath. +// The value is incompatible with systemdDbus.PropSlice. 
+func (m *UnifiedManager) getSliceFull() (string, error) { + c := m.cgroups + slice := "system.slice" + if c.Rootless { + slice = "user.slice" + } + if c.Parent != "" { + var err error + slice, err = ExpandSlice(c.Parent) + if err != nil { + return "", err + } + } + + if c.Rootless { + // managerCG is typically "/user.slice/user-${uid}.slice/user@${uid}.service". + managerCG, err := getManagerProperty(m.dbus, "ControlGroup") + if err != nil { + return "", err + } + slice = filepath.Join(managerCG, slice) + } + + // an example of the final slice in rootless: "/user.slice/user-1001.slice/user@1001.service/user.slice" + // NOTE: systemdDbus.PropSlice requires the "/user.slice/user-1001.slice/user@1001.service/" prefix NOT to be specified. + return slice, nil +} + +func (m *UnifiedManager) initPath() error { + if m.path != "" { + return nil + } + + sliceFull, err := m.getSliceFull() + if err != nil { + return err + } + + c := m.cgroups + path := filepath.Join(sliceFull, getUnitName(c)) + path, err = securejoin.SecureJoin(fs2.UnifiedMountpoint, path) + if err != nil { + return err + } + + // an example of the final path in rootless: + // "/sys/fs/cgroup/user.slice/user-1001.slice/user@1001.service/user.slice/libpod-132ff0d72245e6f13a3bbc6cdc5376886897b60ac59eaa8dea1df7ab959cbf1c.scope" + m.path = path + + return nil +} + +func (m *UnifiedManager) Freeze(state cgroups.FreezerState) error { + return m.fsMgr.Freeze(state) +} + +func (m *UnifiedManager) GetPids() ([]int, error) { + return cgroups.GetPids(m.path) +} + +func (m *UnifiedManager) GetAllPids() ([]int, error) { + return cgroups.GetAllPids(m.path) +} + +func (m *UnifiedManager) GetStats() (*cgroups.Stats, error) { + return m.fsMgr.GetStats() +} + +func (m *UnifiedManager) Set(r *cgroups.Resources) error { + if r == nil { + return nil + } + properties, err := genV2ResourcesProperties(m.fsMgr.Path(""), r, m.dbus) + if err != nil { + return err + } + + if err := setUnitProperties(m.dbus, getUnitName(m.cgroups), 
properties...); err != nil { + return fmt.Errorf("unable to set unit properties: %w", err) + } + + return m.fsMgr.Set(r) +} + +func (m *UnifiedManager) GetPaths() map[string]string { + paths := make(map[string]string, 1) + paths[""] = m.path + return paths +} + +func (m *UnifiedManager) GetCgroups() (*cgroups.Cgroup, error) { + return m.cgroups, nil +} + +func (m *UnifiedManager) GetFreezerState() (cgroups.FreezerState, error) { + return m.fsMgr.GetFreezerState() +} + +func (m *UnifiedManager) Exists() bool { + return cgroups.PathExists(m.path) +} + +func (m *UnifiedManager) OOMKillCount() (uint64, error) { + return m.fsMgr.OOMKillCount() +} diff --git a/utils.go b/utils.go new file mode 100644 index 0000000..9ef24b1 --- /dev/null +++ b/utils.go @@ -0,0 +1,468 @@ +package cgroups + +import ( + "bufio" + "errors" + "fmt" + "io" + "os" + "path/filepath" + "strconv" + "strings" + "sync" + "time" + + "github.com/moby/sys/userns" + "github.com/sirupsen/logrus" + "golang.org/x/sys/unix" +) + +const ( + CgroupProcesses = "cgroup.procs" + unifiedMountpoint = "/sys/fs/cgroup" + hybridMountpoint = "/sys/fs/cgroup/unified" +) + +var ( + isUnifiedOnce sync.Once + isUnified bool + isHybridOnce sync.Once + isHybrid bool +) + +// IsCgroup2UnifiedMode returns whether we are running in cgroup v2 unified mode. +func IsCgroup2UnifiedMode() bool { + isUnifiedOnce.Do(func() { + var st unix.Statfs_t + err := unix.Statfs(unifiedMountpoint, &st) + if err != nil { + level := logrus.WarnLevel + if os.IsNotExist(err) && userns.RunningInUserNS() { + // For rootless containers, sweep it under the rug. + level = logrus.DebugLevel + } + logrus.StandardLogger().Logf(level, + "statfs %s: %v; assuming cgroup v1", unifiedMountpoint, err) + } + isUnified = st.Type == unix.CGROUP2_SUPER_MAGIC + }) + return isUnified +} + +// IsCgroup2HybridMode returns whether we are running in cgroup v2 hybrid mode. 
+func IsCgroup2HybridMode() bool { + isHybridOnce.Do(func() { + var st unix.Statfs_t + err := unix.Statfs(hybridMountpoint, &st) + if err != nil { + isHybrid = false + if !os.IsNotExist(err) { + // Report unexpected errors. + logrus.WithError(err).Debugf("statfs(%q) failed", hybridMountpoint) + } + return + } + isHybrid = st.Type == unix.CGROUP2_SUPER_MAGIC + }) + return isHybrid +} + +type Mount struct { + Mountpoint string + Root string + Subsystems []string +} + +// GetCgroupMounts returns the mounts for the cgroup subsystems. +// all indicates whether to return just the first instance or all the mounts. +// This function should not be used from cgroupv2 code, as in this case +// all the controllers are available under the constant unifiedMountpoint. +func GetCgroupMounts(all bool) ([]Mount, error) { + if IsCgroup2UnifiedMode() { + // TODO: remove cgroupv2 case once all external users are converted + availableControllers, err := GetAllSubsystems() + if err != nil { + return nil, err + } + m := Mount{ + Mountpoint: unifiedMountpoint, + Root: unifiedMountpoint, + Subsystems: availableControllers, + } + return []Mount{m}, nil + } + + return getCgroupMountsV1(all) +} + +// GetAllSubsystems returns all the cgroup subsystems supported by the kernel +func GetAllSubsystems() ([]string, error) { + // /proc/cgroups is meaningless for v2 + // https://github.com/torvalds/linux/blob/v5.3/Documentation/admin-guide/cgroup-v2.rst#deprecated-v1-core-features + if IsCgroup2UnifiedMode() { + // "pseudo" controllers do not appear in /sys/fs/cgroup/cgroup.controllers. + // - devices: implemented in kernel 4.15 + // - freezer: implemented in kernel 5.2 + // We assume these are always available, as it is hard to detect availability. + pseudo := []string{"devices", "freezer"} + data, err := ReadFile("/sys/fs/cgroup", "cgroup.controllers") + if err != nil { + return nil, err + } + subsystems := append(pseudo, strings.Fields(data)...) 
+ return subsystems, nil + } + f, err := os.Open("/proc/cgroups") + if err != nil { + return nil, err + } + defer f.Close() + + subsystems := []string{} + + s := bufio.NewScanner(f) + for s.Scan() { + text := s.Text() + if text[0] != '#' { + parts := strings.Fields(text) + if len(parts) >= 4 && parts[3] != "0" { + subsystems = append(subsystems, parts[0]) + } + } + } + if err := s.Err(); err != nil { + return nil, err + } + return subsystems, nil +} + +func readProcsFile(dir string) (out []int, _ error) { + file := CgroupProcesses + retry := true + +again: + f, err := OpenFile(dir, file, os.O_RDONLY) + if err != nil { + return nil, err + } + defer f.Close() + + s := bufio.NewScanner(f) + for s.Scan() { + if t := s.Text(); t != "" { + pid, err := strconv.Atoi(t) + if err != nil { + return nil, err + } + out = append(out, pid) + } + } + if errors.Is(s.Err(), unix.ENOTSUP) && retry { + // For a threaded cgroup, read returns ENOTSUP, and we should + // read from cgroup.threads instead. + file = "cgroup.threads" + retry = false + goto again + } + return out, s.Err() +} + +// ParseCgroupFile parses the given cgroup file, typically /proc/self/cgroup +// or /proc//cgroup, into a map of subsystems to cgroup paths, e.g. +// +// "cpu": "/user.slice/user-1000.slice" +// "pids": "/user.slice/user-1000.slice" +// +// etc. +// +// Note that for cgroup v2 unified hierarchy, there are no per-controller +// cgroup paths, so the resulting map will have a single element where the key +// is empty string ("") and the value is the cgroup path the is in. 
+func ParseCgroupFile(path string) (map[string]string, error) { + f, err := os.Open(path) + if err != nil { + return nil, err + } + defer f.Close() + + return parseCgroupFromReader(f) +} + +// helper function for ParseCgroupFile to make testing easier +func parseCgroupFromReader(r io.Reader) (map[string]string, error) { + s := bufio.NewScanner(r) + cgroups := make(map[string]string) + + for s.Scan() { + text := s.Text() + // from cgroups(7): + // /proc/[pid]/cgroup + // ... + // For each cgroup hierarchy ... there is one entry + // containing three colon-separated fields of the form: + // hierarchy-ID:subsystem-list:cgroup-path + parts := strings.SplitN(text, ":", 3) + if len(parts) < 3 { + return nil, fmt.Errorf("invalid cgroup entry: must contain at least two colons: %v", text) + } + + for _, subs := range strings.Split(parts[1], ",") { + cgroups[subs] = parts[2] + } + } + if err := s.Err(); err != nil { + return nil, err + } + + return cgroups, nil +} + +func PathExists(path string) bool { + if _, err := os.Stat(path); err != nil { + return false + } + return true +} + +// rmdir tries to remove a directory, optionally retrying on EBUSY. +func rmdir(path string, retry bool) error { + delay := time.Millisecond + tries := 10 + +again: + err := unix.Rmdir(path) + switch err { // nolint:errorlint // unix errors are bare + case nil, unix.ENOENT: + return nil + case unix.EINTR: + goto again + case unix.EBUSY: + if retry && tries > 0 { + time.Sleep(delay) + delay *= 2 + tries-- + goto again + + } + } + return &os.PathError{Op: "rmdir", Path: path, Err: err} +} + +// RemovePath aims to remove cgroup path. It does so recursively, +// by removing any subdirectories (sub-cgroups) first. +func RemovePath(path string) error { + // Try the fast path first; don't retry on EBUSY yet. + if err := rmdir(path, false); err == nil { + return nil + } + + // There are many reasons why rmdir can fail, including: + // 1. cgroup have existing sub-cgroups; + // 2. 
cgroup (still) have some processes (that are about to vanish); + // 3. lack of permission (one example is read-only /sys/fs/cgroup mount, + // in which case rmdir returns EROFS even for for a non-existent path, + // see issue 4518). + // + // Using os.ReadDir here kills two birds with one stone: check if + // the directory exists (handling scenario 3 above), and use + // directory contents to remove sub-cgroups (handling scenario 1). + infos, err := os.ReadDir(path) + if err != nil { + if os.IsNotExist(err) { + return nil + } + return err + } + // Let's remove sub-cgroups, if any. + for _, info := range infos { + if info.IsDir() { + if err = RemovePath(filepath.Join(path, info.Name())); err != nil { + return err + } + } + } + // Finally, try rmdir again, this time with retries on EBUSY, + // which may help with scenario 2 above. + return rmdir(path, true) +} + +// RemovePaths iterates over the provided paths removing them. +func RemovePaths(paths map[string]string) (err error) { + for s, p := range paths { + if err := RemovePath(p); err == nil { + delete(paths, s) + } + } + if len(paths) == 0 { + clear(paths) + return nil + } + return fmt.Errorf("Failed to remove paths: %v", paths) +} + +var ( + hugePageSizes []string + initHPSOnce sync.Once +) + +func HugePageSizes() []string { + initHPSOnce.Do(func() { + dir, err := os.OpenFile("/sys/kernel/mm/hugepages", unix.O_DIRECTORY|unix.O_RDONLY, 0) + if err != nil { + return + } + files, err := dir.Readdirnames(0) + dir.Close() + if err != nil { + return + } + + hugePageSizes, err = getHugePageSizeFromFilenames(files) + if err != nil { + logrus.Warn("HugePageSizes: ", err) + } + }) + + return hugePageSizes +} + +func getHugePageSizeFromFilenames(fileNames []string) ([]string, error) { + pageSizes := make([]string, 0, len(fileNames)) + var warn error + + for _, file := range fileNames { + // example: hugepages-1048576kB + val, ok := strings.CutPrefix(file, "hugepages-") + if !ok { + // Unexpected file name: no prefix 
found, ignore it. + continue + } + // The suffix is always "kB" (as of Linux 5.13). If we find + // something else, produce an error but keep going. + eLen := len(val) - 2 + val = strings.TrimSuffix(val, "kB") + if len(val) != eLen { + // Highly unlikely. + if warn == nil { + warn = errors.New(file + `: invalid suffix (expected "kB")`) + } + continue + } + size, err := strconv.Atoi(val) + if err != nil { + // Highly unlikely. + if warn == nil { + warn = fmt.Errorf("%s: %w", file, err) + } + continue + } + // Model after https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/mm/hugetlb_cgroup.c?id=eff48ddeab782e35e58ccc8853f7386bbae9dec4#n574 + // but in our case the size is in KB already. + if size >= (1 << 20) { + val = strconv.Itoa(size>>20) + "GB" + } else if size >= (1 << 10) { + val = strconv.Itoa(size>>10) + "MB" + } else { + val += "KB" + } + pageSizes = append(pageSizes, val) + } + + return pageSizes, warn +} + +// GetPids returns all pids, that were added to cgroup at path. +func GetPids(dir string) ([]int, error) { + return readProcsFile(dir) +} + +// WriteCgroupProc writes the specified pid into the cgroup's cgroup.procs file +func WriteCgroupProc(dir string, pid int) error { + // Normally dir should not be empty, one case is that cgroup subsystem + // is not mounted, we will get empty dir, and we want it fail here. + if dir == "" { + return fmt.Errorf("no such directory for %s", CgroupProcesses) + } + + // Dont attach any pid to the cgroup if -1 is specified as a pid + if pid == -1 { + return nil + } + + file, err := OpenFile(dir, CgroupProcesses, os.O_WRONLY) + if err != nil { + return fmt.Errorf("failed to write %v: %w", pid, err) + } + defer file.Close() + + for i := 0; i < 5; i++ { + _, err = file.WriteString(strconv.Itoa(pid)) + if err == nil { + return nil + } + + // EINVAL might mean that the task being added to cgroup.procs is in state + // TASK_NEW. We should attempt to do so again. 
+ if errors.Is(err, unix.EINVAL) { + time.Sleep(30 * time.Millisecond) + continue + } + + return fmt.Errorf("failed to write %v: %w", pid, err) + } + return err +} + +// Since the OCI spec is designed for cgroup v1, in some cases +// there is need to convert from the cgroup v1 configuration to cgroup v2 +// the formula for cpuShares is y = (1 + ((x - 2) * 9999) / 262142) +// convert from [2-262144] to [1-10000] +// 262144 comes from Linux kernel definition "#define MAX_SHARES (1UL << 18)" +func ConvertCPUSharesToCgroupV2Value(cpuShares uint64) uint64 { + if cpuShares == 0 { + return 0 + } + return (1 + ((cpuShares-2)*9999)/262142) +} + +// ConvertMemorySwapToCgroupV2Value converts MemorySwap value from OCI spec +// for use by cgroup v2 drivers. A conversion is needed since Resources.MemorySwap +// is defined as memory+swap combined, while in cgroup v2 swap is a separate value, +// so we need to subtract memory from it where it makes sense. +func ConvertMemorySwapToCgroupV2Value(memorySwap, memory int64) (int64, error) { + switch { + case memory == -1 && memorySwap == 0: + // For compatibility with cgroup1 controller, set swap to unlimited in + // case the memory is set to unlimited and the swap is not explicitly set, + // treating the request as "set both memory and swap to unlimited". + return -1, nil + case memorySwap == -1, memorySwap == 0: + // Treat -1 ("max") and 0 ("unset") swap as is. + return memorySwap, nil + case memory == -1: + // Unlimited memory, so treat swap as is. + return memorySwap, nil + case memory == 0: + // Unset or unknown memory, can't calculate swap. + return 0, errors.New("unable to set swap limit without memory limit") + case memory < 0: + // Does not make sense to subtract a negative value. + return 0, fmt.Errorf("invalid memory value: %d", memory) + case memorySwap < memory: + // Sanity check. 
+ return 0, errors.New("memory+swap limit should be >= memory limit") + } + + return memorySwap - memory, nil +} + +// Since the OCI spec is designed for cgroup v1, in some cases +// there is need to convert from the cgroup v1 configuration to cgroup v2 +// the formula for BlkIOWeight to IOWeight is y = (1 + (x - 10) * 9999 / 990) +// convert linearly from [10-1000] to [1-10000] +func ConvertBlkIOToIOWeightValue(blkIoWeight uint16) uint64 { + if blkIoWeight == 0 { + return 0 + } + return 1 + (uint64(blkIoWeight)-10)*9999/990 +} diff --git a/utils_test.go b/utils_test.go new file mode 100644 index 0000000..58ac85a --- /dev/null +++ b/utils_test.go @@ -0,0 +1,691 @@ +package cgroups + +import ( + "bytes" + "errors" + "path/filepath" + "reflect" + "strings" + "testing" + + "github.com/moby/sys/mountinfo" + "golang.org/x/sys/unix" +) + +const fedoraMountinfo = `15 35 0:3 / /proc rw,nosuid,nodev,noexec,relatime shared:5 - proc proc rw +16 35 0:14 / /sys rw,nosuid,nodev,noexec,relatime shared:6 - sysfs sysfs rw,seclabel +17 35 0:5 / /dev rw,nosuid shared:2 - devtmpfs devtmpfs rw,seclabel,size=8056484k,nr_inodes=2014121,mode=755 +18 16 0:15 / /sys/kernel/security rw,nosuid,nodev,noexec,relatime shared:7 - securityfs securityfs rw +19 16 0:13 / /sys/fs/selinux rw,relatime shared:8 - selinuxfs selinuxfs rw +20 17 0:16 / /dev/shm rw,nosuid,nodev shared:3 - tmpfs tmpfs rw,seclabel +21 17 0:10 / /dev/pts rw,nosuid,noexec,relatime shared:4 - devpts devpts rw,seclabel,gid=5,mode=620,ptmxmode=000 +22 35 0:17 / /run rw,nosuid,nodev shared:21 - tmpfs tmpfs rw,seclabel,mode=755 +23 16 0:18 / /sys/fs/cgroup rw,nosuid,nodev,noexec shared:9 - tmpfs tmpfs rw,seclabel,mode=755 +24 23 0:19 / /sys/fs/cgroup/systemd rw,nosuid,nodev,noexec,relatime shared:10 - cgroup cgroup rw,xattr,release_agent=/usr/lib/systemd/systemd-cgroups-agent,name=systemd +25 16 0:20 / /sys/fs/pstore rw,nosuid,nodev,noexec,relatime shared:20 - pstore pstore rw +26 23 0:21 / /sys/fs/cgroup/cpuset 
rw,nosuid,nodev,noexec,relatime shared:11 - cgroup cgroup rw,cpuset,clone_children +27 23 0:22 / /sys/fs/cgroup/cpu,cpuacct rw,nosuid,nodev,noexec,relatime shared:12 - cgroup cgroup rw,cpuacct,cpu,clone_children +28 23 0:23 / /sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime shared:13 - cgroup cgroup rw,memory,clone_children +29 23 0:24 / /sys/fs/cgroup/devices rw,nosuid,nodev,noexec,relatime shared:14 - cgroup cgroup rw,devices,clone_children +30 23 0:25 / /sys/fs/cgroup/freezer rw,nosuid,nodev,noexec,relatime shared:15 - cgroup cgroup rw,freezer,clone_children +31 23 0:26 / /sys/fs/cgroup/net_cls rw,nosuid,nodev,noexec,relatime shared:16 - cgroup cgroup rw,net_cls,clone_children +32 23 0:27 / /sys/fs/cgroup/blkio rw,nosuid,nodev,noexec,relatime shared:17 - cgroup cgroup rw,blkio,clone_children +33 23 0:28 / /sys/fs/cgroup/perf_event rw,nosuid,nodev,noexec,relatime shared:18 - cgroup cgroup rw,perf_event,clone_children +34 23 0:29 / /sys/fs/cgroup/hugetlb rw,nosuid,nodev,noexec,relatime shared:19 - cgroup cgroup rw,hugetlb,clone_children +35 1 253:2 / / rw,relatime shared:1 - ext4 /dev/mapper/ssd-root--f20 rw,seclabel,data=ordered +36 15 0:30 / /proc/sys/fs/binfmt_misc rw,relatime shared:22 - autofs systemd-1 rw,fd=38,pgrp=1,timeout=300,minproto=5,maxproto=5,direct +37 17 0:12 / /dev/mqueue rw,relatime shared:23 - mqueue mqueue rw,seclabel +38 35 0:31 / /tmp rw shared:24 - tmpfs tmpfs rw,seclabel +39 17 0:32 / /dev/hugepages rw,relatime shared:25 - hugetlbfs hugetlbfs rw,seclabel +40 16 0:7 / /sys/kernel/debug rw,relatime shared:26 - debugfs debugfs rw +41 16 0:33 / /sys/kernel/config rw,relatime shared:27 - configfs configfs rw +42 35 0:34 / /var/lib/nfs/rpc_pipefs rw,relatime shared:28 - rpc_pipefs sunrpc rw +43 15 0:35 / /proc/fs/nfsd rw,relatime shared:29 - nfsd sunrpc rw +45 35 8:17 / /boot rw,relatime shared:30 - ext4 /dev/sdb1 rw,seclabel,data=ordered +46 35 253:4 / /home rw,relatime shared:31 - ext4 /dev/mapper/ssd-home rw,seclabel,data=ordered +47 35 
253:5 / /var/lib/libvirt/images rw,noatime,nodiratime shared:32 - ext4 /dev/mapper/ssd-virt rw,seclabel,discard,data=ordered +48 35 253:12 / /mnt/old rw,relatime shared:33 - ext4 /dev/mapper/HelpDeskRHEL6-FedoraRoot rw,seclabel,data=ordered +121 22 0:36 / /run/user/1000/gvfs rw,nosuid,nodev,relatime shared:104 - fuse.gvfsd-fuse gvfsd-fuse rw,user_id=1000,group_id=1000 +124 16 0:37 / /sys/fs/fuse/connections rw,relatime shared:107 - fusectl fusectl rw +165 38 253:3 / /tmp/mnt rw,relatime shared:147 - ext4 /dev/mapper/ssd-root rw,seclabel,data=ordered +167 35 253:15 / /var/lib/docker/devicemapper/mnt/aae4076022f0e2b80a2afbf8fc6df450c52080191fcef7fb679a73e6f073e5c2 rw,relatime shared:149 - ext4 /dev/mapper/docker-253:2-425882-aae4076022f0e2b80a2afbf8fc6df450c52080191fcef7fb679a73e6f073e5c2 rw,seclabel,discard,stripe=16,data=ordered +171 35 253:16 / /var/lib/docker/devicemapper/mnt/c71be651f114db95180e472f7871b74fa597ee70a58ccc35cb87139ddea15373 rw,relatime shared:153 - ext4 /dev/mapper/docker-253:2-425882-c71be651f114db95180e472f7871b74fa597ee70a58ccc35cb87139ddea15373 rw,seclabel,discard,stripe=16,data=ordered +175 35 253:17 / /var/lib/docker/devicemapper/mnt/1bac6ab72862d2d5626560df6197cf12036b82e258c53d981fa29adce6f06c3c rw,relatime shared:157 - ext4 /dev/mapper/docker-253:2-425882-1bac6ab72862d2d5626560df6197cf12036b82e258c53d981fa29adce6f06c3c rw,seclabel,discard,stripe=16,data=ordered +179 35 253:18 / /var/lib/docker/devicemapper/mnt/d710a357d77158e80d5b2c55710ae07c94e76d34d21ee7bae65ce5418f739b09 rw,relatime shared:161 - ext4 /dev/mapper/docker-253:2-425882-d710a357d77158e80d5b2c55710ae07c94e76d34d21ee7bae65ce5418f739b09 rw,seclabel,discard,stripe=16,data=ordered +183 35 253:19 / /var/lib/docker/devicemapper/mnt/6479f52366114d5f518db6837254baab48fab39f2ac38d5099250e9a6ceae6c7 rw,relatime shared:165 - ext4 /dev/mapper/docker-253:2-425882-6479f52366114d5f518db6837254baab48fab39f2ac38d5099250e9a6ceae6c7 rw,seclabel,discard,stripe=16,data=ordered +187 35 253:20 / 
/var/lib/docker/devicemapper/mnt/8d9df91c4cca5aef49eeb2725292aab324646f723a7feab56be34c2ad08268e1 rw,relatime shared:169 - ext4 /dev/mapper/docker-253:2-425882-8d9df91c4cca5aef49eeb2725292aab324646f723a7feab56be34c2ad08268e1 rw,seclabel,discard,stripe=16,data=ordered +191 35 253:21 / /var/lib/docker/devicemapper/mnt/c8240b768603d32e920d365dc9d1dc2a6af46cd23e7ae819947f969e1b4ec661 rw,relatime shared:173 - ext4 /dev/mapper/docker-253:2-425882-c8240b768603d32e920d365dc9d1dc2a6af46cd23e7ae819947f969e1b4ec661 rw,seclabel,discard,stripe=16,data=ordered +195 35 253:22 / /var/lib/docker/devicemapper/mnt/2eb3a01278380bbf3ed12d86ac629eaa70a4351301ee307a5cabe7b5f3b1615f rw,relatime shared:177 - ext4 /dev/mapper/docker-253:2-425882-2eb3a01278380bbf3ed12d86ac629eaa70a4351301ee307a5cabe7b5f3b1615f rw,seclabel,discard,stripe=16,data=ordered +199 35 253:23 / /var/lib/docker/devicemapper/mnt/37a17fb7c9d9b80821235d5f2662879bd3483915f245f9b49cdaa0e38779b70b rw,relatime shared:181 - ext4 /dev/mapper/docker-253:2-425882-37a17fb7c9d9b80821235d5f2662879bd3483915f245f9b49cdaa0e38779b70b rw,seclabel,discard,stripe=16,data=ordered +203 35 253:24 / /var/lib/docker/devicemapper/mnt/aea459ae930bf1de913e2f29428fd80ee678a1e962d4080019d9f9774331ee2b rw,relatime shared:185 - ext4 /dev/mapper/docker-253:2-425882-aea459ae930bf1de913e2f29428fd80ee678a1e962d4080019d9f9774331ee2b rw,seclabel,discard,stripe=16,data=ordered +207 35 253:25 / /var/lib/docker/devicemapper/mnt/928ead0bc06c454bd9f269e8585aeae0a6bd697f46dc8754c2a91309bc810882 rw,relatime shared:189 - ext4 /dev/mapper/docker-253:2-425882-928ead0bc06c454bd9f269e8585aeae0a6bd697f46dc8754c2a91309bc810882 rw,seclabel,discard,stripe=16,data=ordered +211 35 253:26 / /var/lib/docker/devicemapper/mnt/0f284d18481d671644706e7a7244cbcf63d590d634cc882cb8721821929d0420 rw,relatime shared:193 - ext4 /dev/mapper/docker-253:2-425882-0f284d18481d671644706e7a7244cbcf63d590d634cc882cb8721821929d0420 rw,seclabel,discard,stripe=16,data=ordered +215 35 253:27 / 
/var/lib/docker/devicemapper/mnt/d9dd16722ab34c38db2733e23f69e8f4803ce59658250dd63e98adff95d04919 rw,relatime shared:197 - ext4 /dev/mapper/docker-253:2-425882-d9dd16722ab34c38db2733e23f69e8f4803ce59658250dd63e98adff95d04919 rw,seclabel,discard,stripe=16,data=ordered +219 35 253:28 / /var/lib/docker/devicemapper/mnt/bc4500479f18c2c08c21ad5282e5f826a016a386177d9874c2764751c031d634 rw,relatime shared:201 - ext4 /dev/mapper/docker-253:2-425882-bc4500479f18c2c08c21ad5282e5f826a016a386177d9874c2764751c031d634 rw,seclabel,discard,stripe=16,data=ordered +223 35 253:29 / /var/lib/docker/devicemapper/mnt/7770c8b24eb3d5cc159a065910076938910d307ab2f5d94e1dc3b24c06ee2c8a rw,relatime shared:205 - ext4 /dev/mapper/docker-253:2-425882-7770c8b24eb3d5cc159a065910076938910d307ab2f5d94e1dc3b24c06ee2c8a rw,seclabel,discard,stripe=16,data=ordered +227 35 253:30 / /var/lib/docker/devicemapper/mnt/c280cd3d0bf0aa36b478b292279671624cceafc1a67eaa920fa1082601297adf rw,relatime shared:209 - ext4 /dev/mapper/docker-253:2-425882-c280cd3d0bf0aa36b478b292279671624cceafc1a67eaa920fa1082601297adf rw,seclabel,discard,stripe=16,data=ordered +231 35 253:31 / /var/lib/docker/devicemapper/mnt/8b59a7d9340279f09fea67fd6ad89ddef711e9e7050eb647984f8b5ef006335f rw,relatime shared:213 - ext4 /dev/mapper/docker-253:2-425882-8b59a7d9340279f09fea67fd6ad89ddef711e9e7050eb647984f8b5ef006335f rw,seclabel,discard,stripe=16,data=ordered +235 35 253:32 / /var/lib/docker/devicemapper/mnt/1a28059f29eda821578b1bb27a60cc71f76f846a551abefabce6efd0146dce9f rw,relatime shared:217 - ext4 /dev/mapper/docker-253:2-425882-1a28059f29eda821578b1bb27a60cc71f76f846a551abefabce6efd0146dce9f rw,seclabel,discard,stripe=16,data=ordered +239 35 253:33 / /var/lib/docker/devicemapper/mnt/e9aa60c60128cad1 rw,relatime shared:221 - ext4 /dev/mapper/docker-253:2-425882-e9aa60c60128cad1 rw,seclabel,discard,stripe=16,data=ordered +243 35 253:34 / 
/var/lib/docker/devicemapper/mnt/5fec11304b6f4713fea7b6ccdcc1adc0a1966187f590fe25a8227428a8df275d-init rw,relatime shared:225 - ext4 /dev/mapper/docker-253:2-425882-5fec11304b6f4713fea7b6ccdcc1adc0a1966187f590fe25a8227428a8df275d-init rw,seclabel,discard,stripe=16,data=ordered +247 35 253:35 / /var/lib/docker/devicemapper/mnt/5fec11304b6f4713fea7b6ccdcc1adc0a1966187f590fe25a8227428a8df275d rw,relatime shared:229 - ext4 /dev/mapper/docker-253:2-425882-5fec11304b6f4713fea7b6ccdcc1adc0a1966187f590fe25a8227428a8df275d rw,seclabel,discard,stripe=16,data=ordered +31 21 0:23 / /DATA/foo_bla_bla rw,relatime - cifs //foo/BLA\040BLA\040BLA/ rw,sec=ntlm,cache=loose,unc=\\foo\BLA BLA BLA,username=my_login,domain=mydomain.com,uid=12345678,forceuid,gid=12345678,forcegid,addr=10.1.30.10,file_mode=0755,dir_mode=0755,nounix,rsize=61440,wsize=65536,actimeo=1` + +const systemdMountinfo = `115 83 0:32 / / rw,relatime - aufs none rw,si=c0bd3d3,dio,dirperm1 +116 115 0:35 / /proc rw,nosuid,nodev,noexec,relatime - proc proc rw +117 115 0:36 / /dev rw,nosuid - tmpfs tmpfs rw,mode=755 +118 117 0:37 / /dev/pts rw,nosuid,noexec,relatime - devpts devpts rw,gid=5,mode=620,ptmxmode=666 +119 115 0:38 / /sys rw,nosuid,nodev,noexec,relatime - sysfs sysfs rw +120 119 0:39 / /sys/fs/cgroup rw,nosuid,nodev,noexec,relatime - tmpfs tmpfs rw,mode=755 +121 120 0:19 /system.slice/docker-dc4eaa1a34ec4d593bc0125d31eea823a1d76ae483aeb1409cca80304e34da2e.scope /sys/fs/cgroup/systemd rw,nosuid,nodev,noexec,relatime - cgroup cgroup rw,xattr,release_agent=/lib/systemd/systemd-cgroups-agent,name=systemd +122 120 0:20 /system.slice/docker-dc4eaa1a34ec4d593bc0125d31eea823a1d76ae483aeb1409cca80304e34da2e.scope /sys/fs/cgroup/devices rw,nosuid,nodev,noexec,relatime - cgroup cgroup rw,devices +123 120 0:21 /system.slice/docker-dc4eaa1a34ec4d593bc0125d31eea823a1d76ae483aeb1409cca80304e34da2e.scope /sys/fs/cgroup/freezer rw,nosuid,nodev,noexec,relatime - cgroup cgroup rw,freezer +124 120 0:22 
/system.slice/docker-dc4eaa1a34ec4d593bc0125d31eea823a1d76ae483aeb1409cca80304e34da2e.scope /sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime - cgroup cgroup rw,memory +125 120 0:23 /system.slice/docker-dc4eaa1a34ec4d593bc0125d31eea823a1d76ae483aeb1409cca80304e34da2e.scope /sys/fs/cgroup/net_cls,net_prio rw,nosuid,nodev,noexec,relatime - cgroup cgroup rw,net_cls,net_prio +126 120 0:24 /system.slice/docker-dc4eaa1a34ec4d593bc0125d31eea823a1d76ae483aeb1409cca80304e34da2e.scope /sys/fs/cgroup/blkio rw,nosuid,nodev,noexec,relatime - cgroup cgroup rw,blkio +127 120 0:25 /system.slice/docker-dc4eaa1a34ec4d593bc0125d31eea823a1d76ae483aeb1409cca80304e34da2e.scope /sys/fs/cgroup/cpuset rw,nosuid,nodev,noexec,relatime - cgroup cgroup rw,cpuset,clone_children +128 120 0:26 /system.slice/docker-dc4eaa1a34ec4d593bc0125d31eea823a1d76ae483aeb1409cca80304e34da2e.scope /sys/fs/cgroup/cpu,cpuacct rw,nosuid,nodev,noexec,relatime - cgroup cgroup rw,cpu,cpuacct +129 120 0:27 /system.slice/docker-dc4eaa1a34ec4d593bc0125d31eea823a1d76ae483aeb1409cca80304e34da2e.scope /sys/fs/cgroup/perf_event rw,nosuid,nodev,noexec,relatime - cgroup cgroup rw,perf_event,release_agent=/run/cgmanager/agents/cgm-release-agent.perf_event +130 115 43:0 /var/lib/docker/volumes/a44a712176377f57c094397330ee04387284c478364eb25f4c3d25f775f25c26/_data /var/lib/docker rw,relatime - ext4 /dev/nbd0 rw,data=ordered +131 115 43:0 /var/lib/docker/containers/dc4eaa1a34ec4d593bc0125d31eea823a1d76ae483aeb1409cca80304e34da2e/resolv.conf /etc/resolv.conf rw,relatime - ext4 /dev/nbd0 rw,data=ordered +132 115 43:0 /var/lib/docker/containers/dc4eaa1a34ec4d593bc0125d31eea823a1d76ae483aeb1409cca80304e34da2e/hostname /etc/hostname rw,relatime - ext4 /dev/nbd0 rw,data=ordered +133 115 43:0 /var/lib/docker/containers/dc4eaa1a34ec4d593bc0125d31eea823a1d76ae483aeb1409cca80304e34da2e/hosts /etc/hosts rw,relatime - ext4 /dev/nbd0 rw,data=ordered +134 117 0:33 / /dev/shm rw,nosuid,nodev,noexec,relatime - tmpfs shm rw,size=65536k +135 
117 0:13 / /dev/mqueue rw,nosuid,nodev,noexec,relatime - mqueue mqueue rw +136 117 0:12 /1 /dev/console rw,nosuid,noexec,relatime - devpts none rw,gid=5,mode=620,ptmxmode=000 +84 115 0:40 / /tmp rw,relatime - tmpfs none rw` + +const bedrockMountinfo = `120 17 0:28 / /sys/fs/cgroup ro,nosuid,nodev,noexec shared:16 - tmpfs tmpfs ro,mode=755 +124 28 0:28 / /bedrock/strata/arch/sys/fs/cgroup rw,nosuid,nodev,noexec shared:16 - tmpfs tmpfs ro,mode=755 +123 53 0:28 / /bedrock/strata/fallback/sys/fs/cgroup rw,nosuid,nodev,noexec shared:16 - tmpfs tmpfs ro,mode=755 +122 71 0:28 / /bedrock/strata/gentoo/sys/fs/cgroup rw,nosuid,nodev,noexec shared:16 - tmpfs tmpfs ro,mode=755 +121 89 0:28 / /bedrock/strata/kde/sys/fs/cgroup rw,nosuid,nodev,noexec shared:16 - tmpfs tmpfs ro,mode=755 +125 120 0:29 / /sys/fs/cgroup/systemd rw,nosuid,nodev,noexec,relatime shared:17 - cgroup cgroup rw,xattr,release_agent=/lib/systemd/systemd-cgroups-agent,name=systemd +129 124 0:29 / /bedrock/strata/arch/sys/fs/cgroup/systemd rw,nosuid,nodev,noexec,relatime shared:17 - cgroup cgroup rw,xattr,release_agent=/lib/systemd/systemd-cgroups-agent,name=systemd +128 123 0:29 / /bedrock/strata/fallback/sys/fs/cgroup/systemd rw,nosuid,nodev,noexec,relatime shared:17 - cgroup cgroup rw,xattr,release_agent=/lib/systemd/systemd-cgroups-agent,name=systemd +127 122 0:29 / /bedrock/strata/gentoo/sys/fs/cgroup/systemd rw,nosuid,nodev,noexec,relatime shared:17 - cgroup cgroup rw,xattr,release_agent=/lib/systemd/systemd-cgroups-agent,name=systemd +126 121 0:29 / /bedrock/strata/kde/sys/fs/cgroup/systemd rw,nosuid,nodev,noexec,relatime shared:17 - cgroup cgroup rw,xattr,release_agent=/lib/systemd/systemd-cgroups-agent,name=systemd +140 120 0:32 / /sys/fs/cgroup/net_cls,net_prio rw,nosuid,nodev,noexec,relatime shared:48 - cgroup cgroup rw,net_cls,net_prio +144 124 0:32 / /bedrock/strata/arch/sys/fs/cgroup/net_cls,net_prio rw,nosuid,nodev,noexec,relatime shared:48 - cgroup cgroup rw,net_cls,net_prio +143 123 0:32 / 
/bedrock/strata/fallback/sys/fs/cgroup/net_cls,net_prio rw,nosuid,nodev,noexec,relatime shared:48 - cgroup cgroup rw,net_cls,net_prio +142 122 0:32 / /bedrock/strata/gentoo/sys/fs/cgroup/net_cls,net_prio rw,nosuid,nodev,noexec,relatime shared:48 - cgroup cgroup rw,net_cls,net_prio +141 121 0:32 / /bedrock/strata/kde/sys/fs/cgroup/net_cls,net_prio rw,nosuid,nodev,noexec,relatime shared:48 - cgroup cgroup rw,net_cls,net_prio +145 120 0:33 / /sys/fs/cgroup/blkio rw,nosuid,nodev,noexec,relatime shared:49 - cgroup cgroup rw,blkio +149 124 0:33 / /bedrock/strata/arch/sys/fs/cgroup/blkio rw,nosuid,nodev,noexec,relatime shared:49 - cgroup cgroup rw,blkio +148 123 0:33 / /bedrock/strata/fallback/sys/fs/cgroup/blkio rw,nosuid,nodev,noexec,relatime shared:49 - cgroup cgroup rw,blkio +147 122 0:33 / /bedrock/strata/gentoo/sys/fs/cgroup/blkio rw,nosuid,nodev,noexec,relatime shared:49 - cgroup cgroup rw,blkio +146 121 0:33 / /bedrock/strata/kde/sys/fs/cgroup/blkio rw,nosuid,nodev,noexec,relatime shared:49 - cgroup cgroup rw,blkio +150 120 0:34 / /sys/fs/cgroup/cpu,cpuacct rw,nosuid,nodev,noexec,relatime shared:50 - cgroup cgroup rw,cpu,cpuacct +154 124 0:34 / /bedrock/strata/arch/sys/fs/cgroup/cpu,cpuacct rw,nosuid,nodev,noexec,relatime shared:50 - cgroup cgroup rw,cpu,cpuacct +153 123 0:34 / /bedrock/strata/fallback/sys/fs/cgroup/cpu,cpuacct rw,nosuid,nodev,noexec,relatime shared:50 - cgroup cgroup rw,cpu,cpuacct +152 122 0:34 / /bedrock/strata/gentoo/sys/fs/cgroup/cpu,cpuacct rw,nosuid,nodev,noexec,relatime shared:50 - cgroup cgroup rw,cpu,cpuacct +151 121 0:34 / /bedrock/strata/kde/sys/fs/cgroup/cpu,cpuacct rw,nosuid,nodev,noexec,relatime shared:50 - cgroup cgroup rw,cpu,cpuacct +155 120 0:35 / /sys/fs/cgroup/cpuset rw,nosuid,nodev,noexec,relatime shared:51 - cgroup cgroup rw,cpuset +159 124 0:35 / /bedrock/strata/arch/sys/fs/cgroup/cpuset rw,nosuid,nodev,noexec,relatime shared:51 - cgroup cgroup rw,cpuset +158 123 0:35 / /bedrock/strata/fallback/sys/fs/cgroup/cpuset 
rw,nosuid,nodev,noexec,relatime shared:51 - cgroup cgroup rw,cpuset +157 122 0:35 / /bedrock/strata/gentoo/sys/fs/cgroup/cpuset rw,nosuid,nodev,noexec,relatime shared:51 - cgroup cgroup rw,cpuset +156 121 0:35 / /bedrock/strata/kde/sys/fs/cgroup/cpuset rw,nosuid,nodev,noexec,relatime shared:51 - cgroup cgroup rw,cpuset +160 120 0:36 / /sys/fs/cgroup/devices rw,nosuid,nodev,noexec,relatime shared:52 - cgroup cgroup rw,devices +164 124 0:36 / /bedrock/strata/arch/sys/fs/cgroup/devices rw,nosuid,nodev,noexec,relatime shared:52 - cgroup cgroup rw,devices +163 123 0:36 / /bedrock/strata/fallback/sys/fs/cgroup/devices rw,nosuid,nodev,noexec,relatime shared:52 - cgroup cgroup rw,devices +162 122 0:36 / /bedrock/strata/gentoo/sys/fs/cgroup/devices rw,nosuid,nodev,noexec,relatime shared:52 - cgroup cgroup rw,devices +161 121 0:36 / /bedrock/strata/kde/sys/fs/cgroup/devices rw,nosuid,nodev,noexec,relatime shared:52 - cgroup cgroup rw,devices +165 120 0:37 / /sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime shared:53 - cgroup cgroup rw,memory +169 124 0:37 / /bedrock/strata/arch/sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime shared:53 - cgroup cgroup rw,memory +168 123 0:37 / /bedrock/strata/fallback/sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime shared:53 - cgroup cgroup rw,memory +167 122 0:37 / /bedrock/strata/gentoo/sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime shared:53 - cgroup cgroup rw,memory +166 121 0:37 / /bedrock/strata/kde/sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime shared:53 - cgroup cgroup rw,memory +170 120 0:38 / /sys/fs/cgroup/freezer rw,nosuid,nodev,noexec,relatime shared:54 - cgroup cgroup rw,freezer +174 124 0:38 / /bedrock/strata/arch/sys/fs/cgroup/freezer rw,nosuid,nodev,noexec,relatime shared:54 - cgroup cgroup rw,freezer +173 123 0:38 / /bedrock/strata/fallback/sys/fs/cgroup/freezer rw,nosuid,nodev,noexec,relatime shared:54 - cgroup cgroup rw,freezer +172 122 0:38 / /bedrock/strata/gentoo/sys/fs/cgroup/freezer 
rw,nosuid,nodev,noexec,relatime shared:54 - cgroup cgroup rw,freezer +171 121 0:38 / /bedrock/strata/kde/sys/fs/cgroup/freezer rw,nosuid,nodev,noexec,relatime shared:54 - cgroup cgroup rw,freezer +175 120 0:39 / /sys/fs/cgroup/pids rw,nosuid,nodev,noexec,relatime shared:55 - cgroup cgroup rw,pids +179 124 0:39 / /bedrock/strata/arch/sys/fs/cgroup/pids rw,nosuid,nodev,noexec,relatime shared:55 - cgroup cgroup rw,pids +178 123 0:39 / /bedrock/strata/fallback/sys/fs/cgroup/pids rw,nosuid,nodev,noexec,relatime shared:55 - cgroup cgroup rw,pids +177 122 0:39 / /bedrock/strata/gentoo/sys/fs/cgroup/pids rw,nosuid,nodev,noexec,relatime shared:55 - cgroup cgroup rw,pids +176 121 0:39 / /bedrock/strata/kde/sys/fs/cgroup/pids rw,nosuid,nodev,noexec,relatime shared:55 - cgroup cgroup rw,pids +180 120 0:40 / /sys/fs/cgroup/perf_event rw,nosuid,nodev,noexec,relatime shared:56 - cgroup cgroup rw,perf_event +184 124 0:40 / /bedrock/strata/arch/sys/fs/cgroup/perf_event rw,nosuid,nodev,noexec,relatime shared:56 - cgroup cgroup rw,perf_event +183 123 0:40 / /bedrock/strata/fallback/sys/fs/cgroup/perf_event rw,nosuid,nodev,noexec,relatime shared:56 - cgroup cgroup rw,perf_event +182 122 0:40 / /bedrock/strata/gentoo/sys/fs/cgroup/perf_event rw,nosuid,nodev,noexec,relatime shared:56 - cgroup cgroup rw,perf_event +181 121 0:40 / /bedrock/strata/kde/sys/fs/cgroup/perf_event rw,nosuid,nodev,noexec,relatime shared:56 - cgroup cgroup rw,perf_event` + +const cgroup2Mountinfo = `18 64 0:18 / /sys rw,nosuid,nodev,noexec,relatime shared:6 - sysfs sysfs rw,seclabel +19 64 0:4 / /proc rw,nosuid,nodev,noexec,relatime shared:5 - proc proc rw +20 64 0:6 / /dev rw,nosuid shared:2 - devtmpfs devtmpfs rw,seclabel,size=8171204k,nr_inodes=2042801,mode=755 +21 18 0:19 / /sys/kernel/security rw,nosuid,nodev,noexec,relatime shared:7 - securityfs securityfs rw +22 20 0:20 / /dev/shm rw,nosuid,nodev shared:3 - tmpfs tmpfs rw,seclabel +23 20 0:21 / /dev/pts rw,nosuid,noexec,relatime shared:4 - devpts devpts 
rw,seclabel,gid=5,mode=620,ptmxmode=000 +24 64 0:22 / /run rw,nosuid,nodev shared:24 - tmpfs tmpfs rw,seclabel,mode=755 +25 18 0:23 / /sys/fs/cgroup ro,nosuid,nodev,noexec shared:8 - tmpfs tmpfs ro,seclabel,mode=755 +26 25 0:24 / /sys/fs/cgroup/systemd rw,nosuid,nodev,noexec,relatime shared:9 - cgroup2 cgroup rw +27 18 0:25 / /sys/fs/pstore rw,nosuid,nodev,noexec,relatime shared:20 - pstore pstore rw,seclabel +28 18 0:26 / /sys/firmware/efi/efivars rw,nosuid,nodev,noexec,relatime shared:21 - efivarfs efivarfs rw +29 25 0:27 / /sys/fs/cgroup/cpu,cpuacct rw,nosuid,nodev,noexec,relatime shared:10 - cgroup cgroup rw,cpu,cpuacct +30 25 0:28 / /sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime shared:11 - cgroup cgroup rw,memory +31 25 0:29 / /sys/fs/cgroup/net_cls,net_prio rw,nosuid,nodev,noexec,relatime shared:12 - cgroup cgroup rw,net_cls,net_prio +32 25 0:30 / /sys/fs/cgroup/blkio rw,nosuid,nodev,noexec,relatime shared:13 - cgroup cgroup rw,blkio +33 25 0:31 / /sys/fs/cgroup/perf_event rw,nosuid,nodev,noexec,relatime shared:14 - cgroup cgroup rw,perf_event +34 25 0:32 / /sys/fs/cgroup/hugetlb rw,nosuid,nodev,noexec,relatime shared:15 - cgroup cgroup rw,hugetlb +35 25 0:33 / /sys/fs/cgroup/freezer rw,nosuid,nodev,noexec,relatime shared:16 - cgroup cgroup rw,freezer +36 25 0:34 / /sys/fs/cgroup/cpuset rw,nosuid,nodev,noexec,relatime shared:17 - cgroup cgroup rw,cpuset +37 25 0:35 / /sys/fs/cgroup/devices rw,nosuid,nodev,noexec,relatime shared:18 - cgroup cgroup rw,devices +38 25 0:36 / /sys/fs/cgroup/pids rw,nosuid,nodev,noexec,relatime shared:19 - cgroup cgroup rw,pids +61 18 0:37 / /sys/kernel/config rw,relatime shared:22 - configfs configfs rw +64 0 253:0 / / rw,relatime shared:1 - ext4 /dev/mapper/fedora_dhcp--16--129-root rw,seclabel,data=ordered +39 18 0:17 / /sys/fs/selinux rw,relatime shared:23 - selinuxfs selinuxfs rw +40 20 0:16 / /dev/mqueue rw,relatime shared:25 - mqueue mqueue rw,seclabel +41 20 0:39 / /dev/hugepages rw,relatime shared:26 - hugetlbfs 
hugetlbfs rw,seclabel +` + +func TestGetCgroupMounts(t *testing.T) { + type testData struct { + mountInfo string + root string + // all is the total number of records expected with all=true, + // or 0 for no extra records expected (most cases). + all int + subsystems map[string]bool + } + testTable := []testData{ + { + mountInfo: fedoraMountinfo, + root: "/", + subsystems: map[string]bool{ + "name=systemd": false, + "cpuset": false, + "cpu": false, + "cpuacct": false, + "memory": false, + "devices": false, + "freezer": false, + "net_cls": false, + "blkio": false, + "perf_event": false, + "hugetlb": false, + }, + }, + { + mountInfo: systemdMountinfo, + root: "/system.slice/docker-dc4eaa1a34ec4d593bc0125d31eea823a1d76ae483aeb1409cca80304e34da2e.scope", + subsystems: map[string]bool{ + "name=systemd": false, + "cpuset": false, + "cpu": false, + "cpuacct": false, + "memory": false, + "devices": false, + "freezer": false, + "net_cls": false, + "net_prio": false, + "blkio": false, + "perf_event": false, + }, + }, + { + mountInfo: bedrockMountinfo, + root: "/", + all: 50, + subsystems: map[string]bool{ + "name=systemd": false, + "cpuset": false, + "cpu": false, + "cpuacct": false, + "memory": false, + "devices": false, + "freezer": false, + "net_cls": false, + "net_prio": false, + "blkio": false, + "perf_event": false, + "pids": false, + }, + }, + } + for _, td := range testTable { + mi, err := mountinfo.GetMountsFromReader( + bytes.NewBufferString(td.mountInfo), + mountinfo.FSTypeFilter("cgroup"), + ) + if err != nil { + t.Fatal(err) + } + cgMounts, err := getCgroupMountsHelper(td.subsystems, mi, false) + if err != nil { + t.Fatal(err) + } + cgMap := make(map[string]Mount) + for _, m := range cgMounts { + for _, ss := range m.Subsystems { + cgMap[ss] = m + } + } + for ss := range td.subsystems { + ss = strings.TrimPrefix(ss, CgroupNamePrefix) + m, ok := cgMap[ss] + if !ok { + t.Fatalf("%s not found", ss) + } + if m.Root != td.root { + t.Fatalf("unexpected root for %s: 
%s", ss, m.Root) + } + if !strings.HasPrefix(m.Mountpoint, "/sys/fs/cgroup/") && !strings.Contains(m.Mountpoint, ss) { + t.Fatalf("unexpected mountpoint for %s: %s", ss, m.Mountpoint) + } + var ssFound bool + for _, mss := range m.Subsystems { + if mss == ss { + ssFound = true + break + } + } + if !ssFound { + t.Fatalf("subsystem %s not found in Subsystems field %v", ss, m.Subsystems) + } + } + // Test the all=true case. + + // Reset the test input. + for k := range td.subsystems { + td.subsystems[k] = false + } + cgMountsAll, err := getCgroupMountsHelper(td.subsystems, mi, true) + if err != nil { + t.Fatal(err) + } + if td.all == 0 { + // Results with and without "all" should be the same. + if len(cgMounts) != len(cgMountsAll) || !reflect.DeepEqual(cgMounts, cgMountsAll) { + t.Errorf("expected same results, got (all=false) %v, (all=true) %v", cgMounts, cgMountsAll) + } + } else { + // Make sure we got all records. + if len(cgMountsAll) != td.all { + t.Errorf("expected %d records, got %d (%+v)", td.all, len(cgMountsAll), cgMountsAll) + } + } + + } +} + +func BenchmarkGetCgroupMounts(b *testing.B) { + subsystems := map[string]bool{ + "cpuset": false, + "cpu": false, + "cpuacct": false, + "memory": false, + "devices": false, + "freezer": false, + "net_cls": false, + "blkio": false, + "perf_event": false, + "hugetlb": false, + } + mi, err := mountinfo.GetMountsFromReader( + bytes.NewBufferString(fedoraMountinfo), + mountinfo.FSTypeFilter("cgroup"), + ) + if err != nil { + b.Fatal(err) + } + b.ResetTimer() + for i := 0; i < b.N; i++ { + if _, err := getCgroupMountsHelper(subsystems, mi, false); err != nil { + b.Fatal(err) + } + } +} + +func TestParseCgroupString(t *testing.T) { + testCases := []struct { + input string + expectedError error + expectedOutput map[string]string + }{ + { + // Taken from a CoreOS instance running systemd 225 with CPU/Mem + // accounting enabled in systemd + input: `9:blkio:/ +8:freezer:/ +7:perf_event:/ 
+6:devices:/system.slice/system-sshd.slice +5:cpuset:/ +4:cpu,cpuacct:/system.slice/system-sshd.slice/sshd@126-10.240.0.15:22-xxx.yyy.zzz.aaa:33678.service +3:net_cls,net_prio:/ +2:memory:/system.slice/system-sshd.slice/sshd@126-10.240.0.15:22-xxx.yyy.zzz.aaa:33678.service +1:name=systemd:/system.slice/system-sshd.slice/sshd@126-10.240.0.15:22-xxx.yyy.zzz.aaa:33678.service`, + expectedOutput: map[string]string{ + "name=systemd": "/system.slice/system-sshd.slice/sshd@126-10.240.0.15:22-xxx.yyy.zzz.aaa:33678.service", + "blkio": "/", + "freezer": "/", + "perf_event": "/", + "devices": "/system.slice/system-sshd.slice", + "cpuset": "/", + "cpu": "/system.slice/system-sshd.slice/sshd@126-10.240.0.15:22-xxx.yyy.zzz.aaa:33678.service", + "cpuacct": "/system.slice/system-sshd.slice/sshd@126-10.240.0.15:22-xxx.yyy.zzz.aaa:33678.service", + "net_cls": "/", + "net_prio": "/", + "memory": "/system.slice/system-sshd.slice/sshd@126-10.240.0.15:22-xxx.yyy.zzz.aaa:33678.service", + }, + }, + { + input: `malformed input`, + expectedError: errors.New(`invalid cgroup entry: must contain at least two colons: malformed input`), + }, + } + + for ndx, testCase := range testCases { + out, err := parseCgroupFromReader(strings.NewReader(testCase.input)) + if err != nil { + if testCase.expectedError == nil || testCase.expectedError.Error() != err.Error() { + t.Errorf("%v: expected error %v, got error %v", ndx, testCase.expectedError, err) + } + } else { + if !reflect.DeepEqual(testCase.expectedOutput, out) { + t.Errorf("%v: expected output %v, got %v", ndx, testCase.expectedOutput, out) + } + } + } +} + +func TestIgnoreCgroup2Mount(t *testing.T) { + subsystems := map[string]bool{ + "cpuset": false, + "cpu": false, + "cpuacct": false, + "memory": false, + "devices": false, + "freezer": false, + "net_cls": false, + "blkio": false, + "perf_event": false, + "pids": false, + "name=systemd": false, + } + + mi, err := mountinfo.GetMountsFromReader( + bytes.NewBufferString(cgroup2Mountinfo), 
+ mountinfo.FSTypeFilter("cgroup"), + ) + if err != nil { + t.Fatal(err) + } + cgMounts, err := getCgroupMountsHelper(subsystems, mi, false) + if err != nil { + t.Fatal(err) + } + for _, m := range cgMounts { + if m.Mountpoint == "/sys/fs/cgroup/systemd" { + t.Errorf("parsed a cgroup2 mount at /sys/fs/cgroup/systemd instead of ignoring it") + } + } +} + +func TestFindCgroupMountpointAndRoot(t *testing.T) { + fakeMountInfo := `35 27 0:29 / /foo rw,nosuid,nodev,noexec,relatime shared:18 - cgroup cgroup rw,devices +35 27 0:29 / /sys/fs/cgroup/devices rw,nosuid,nodev,noexec,relatime shared:18 - cgroup cgroup rw,devices` + testCases := []struct { + cgroupPath string + output string + }{ + {cgroupPath: "/sys/fs", output: "/sys/fs/cgroup/devices"}, + {cgroupPath: "", output: "/foo"}, + } + + mi, err := mountinfo.GetMountsFromReader( + bytes.NewBufferString(fakeMountInfo), + mountinfo.FSTypeFilter("cgroup"), + ) + if err != nil { + t.Fatal(err) + } + + for _, c := range testCases { + mountpoint, _, _ := findCgroupMountpointAndRootFromMI(mi, c.cgroupPath, "devices") + if mountpoint != c.output { + t.Errorf("expected %s, got %s", c.output, mountpoint) + } + } +} + +func BenchmarkGetHugePageSizeImpl(b *testing.B) { + var ( + input = []string{"hugepages-1048576kB", "hugepages-2048kB", "hugepages-32768kB", "hugepages-64kB"} + output []string + err error + ) + for i := 0; i < b.N; i++ { + output, err = getHugePageSizeFromFilenames(input) + } + if err != nil || len(output) != len(input) { + b.Fatal("unexpected results") + } +} + +func TestGetHugePageSizeImpl(t *testing.T) { + testCases := []struct { + doc string + input []string + output []string + isErr bool + }{ + { + doc: "normal input", + input: []string{"hugepages-1048576kB", "hugepages-2048kB", "hugepages-32768kB", "hugepages-64kB"}, + output: []string{"1GB", "2MB", "32MB", "64KB"}, + }, + { + doc: "empty input", + input: []string{}, + output: []string{}, + }, + { + doc: "not a number", + input: []string{"hugepages-akB"}, + 
isErr: true, + }, + { + doc: "no prefix (silently skipped)", + input: []string{"1024kB"}, + }, + { + doc: "invalid prefix (silently skipped)", + input: []string{"whatever-1024kB"}, + }, + { + doc: "invalid suffix", + input: []string{"hugepages-1024gB"}, + isErr: true, + }, + { + doc: "no suffix", + input: []string{"hugepages-1024"}, + isErr: true, + }, + { + doc: "mixed valid and invalid entries", + input: []string{"hugepages-4194304kB", "hugepages-2048kB", "hugepages-akB", "hugepages-64kB"}, + output: []string{"4GB", "2MB", "64KB"}, + isErr: true, + }, + { + doc: "more mixed valid and invalid entries", + input: []string{"hugepages-2048kB", "hugepages-kB", "hugepages-64kB"}, + output: []string{"2MB", "64KB"}, + isErr: true, + }, + } + + for _, c := range testCases { + c := c + t.Run(c.doc, func(t *testing.T) { + output, err := getHugePageSizeFromFilenames(c.input) + t.Log("input:", c.input, "; output:", output, "; err:", err) + if err != nil { + if !c.isErr { + t.Errorf("input %v, expected nil, got error: %v", c.input, err) + } + // no more checks + return + } + if c.isErr { + t.Errorf("input %v, expected error, got error: nil, output: %v", c.input, output) + } + // check output + if len(output) != len(c.output) || (len(output) > 0 && !reflect.DeepEqual(output, c.output)) { + t.Errorf("input %v, expected %v, got %v", c.input, c.output, output) + } + }) + } +} + +func TestConvertCPUSharesToCgroupV2Value(t *testing.T) { + cases := map[uint64]uint64{ + 0: 0, + 2: 1, + 262144: 10000, + } + for i, expected := range cases { + got := ConvertCPUSharesToCgroupV2Value(i) + if got != expected { + t.Errorf("expected ConvertCPUSharesToCgroupV2Value(%d) to be %d, got %d", i, expected, got) + } + } +} + +func TestConvertMemorySwapToCgroupV2Value(t *testing.T) { + cases := []struct { + descr string + memswap, memory int64 + expected int64 + expErr bool + }{ + { + descr: "all unset", + memswap: 0, + memory: 0, + expected: 0, + }, + { + descr: "unlimited memory+swap, unset memory", 
+ memswap: -1, + memory: 0, + expected: -1, + }, + { + descr: "unlimited memory", + memswap: 300, + memory: -1, + expected: 300, + }, + { + descr: "all unlimited", + memswap: -1, + memory: -1, + expected: -1, + }, + { + descr: "negative memory+swap", + memswap: -2, + memory: 0, + expErr: true, + }, + { + descr: "unlimited memory+swap, set memory", + memswap: -1, + memory: 1000, + expected: -1, + }, + { + descr: "memory+swap == memory", + memswap: 1000, + memory: 1000, + expected: 0, + }, + { + descr: "memory+swap > memory", + memswap: 500, + memory: 200, + expected: 300, + }, + { + descr: "memory+swap < memory", + memswap: 300, + memory: 400, + expErr: true, + }, + { + descr: "unset memory", + memswap: 300, + memory: 0, + expErr: true, + }, + { + descr: "negative memory", + memswap: 300, + memory: -300, + expErr: true, + }, + } + + for _, c := range cases { + c := c + t.Run(c.descr, func(t *testing.T) { + swap, err := ConvertMemorySwapToCgroupV2Value(c.memswap, c.memory) + if c.expErr { + if err == nil { + t.Errorf("memswap: %d, memory %d, expected error, got %d, nil", c.memswap, c.memory, swap) + } + // No more checks. + return + } + if err != nil { + t.Errorf("memswap: %d, memory %d, expected success, got error %s", c.memswap, c.memory, err) + } + if swap != c.expected { + t.Errorf("memswap: %d, memory %d, expected %d, got %d", c.memswap, c.memory, c.expected, swap) + } + }) + } +} + +func TestConvertBlkIOToIOWeightValue(t *testing.T) { + cases := map[uint16]uint64{ + 0: 0, + 10: 1, + 1000: 10000, + } + for i, expected := range cases { + got := ConvertBlkIOToIOWeightValue(i) + if got != expected { + t.Errorf("expected ConvertBlkIOToIOWeightValue(%d) to be %d, got %d", i, expected, got) + } + } +} + +// TestRemovePathReadOnly is to test remove a non-existent dir in a ro mount point. 
+// A similar issue example: https://github.com/opencontainers/runc/issues/4518 +func TestRemovePathReadOnly(t *testing.T) { + dirTo := t.TempDir() + err := unix.Mount(t.TempDir(), dirTo, "", unix.MS_BIND, "") + if err != nil { + t.Skip("no permission of mount") + } + defer func() { + _ = unix.Unmount(dirTo, 0) + }() + err = unix.Mount("", dirTo, "", unix.MS_REMOUNT|unix.MS_BIND|unix.MS_RDONLY, "") + if err != nil { + t.Skip("no permission of mount") + } + nonExistentDir := filepath.Join(dirTo, "non-existent-dir") + err = rmdir(nonExistentDir, true) + if !errors.Is(err, unix.EROFS) { + t.Fatalf("expected the error of removing a non-existent dir %s in a ro mount point with rmdir to be unix.EROFS, but got: %v", nonExistentDir, err) + } + err = RemovePath(nonExistentDir) + if err != nil { + t.Fatalf("expected the error of removing a non-existent dir %s in a ro mount point with RemovePath to be nil, but got: %v", nonExistentDir, err) + } +} diff --git a/v1_utils.go b/v1_utils.go new file mode 100644 index 0000000..81193e2 --- /dev/null +++ b/v1_utils.go @@ -0,0 +1,277 @@ +package cgroups + +import ( + "errors" + "fmt" + "os" + "path/filepath" + "strings" + "sync" + "syscall" + + securejoin "github.com/cyphar/filepath-securejoin" + "github.com/moby/sys/mountinfo" + "golang.org/x/sys/unix" +) + +// Code in this source file is specific to cgroup v1, +// and must not be used from any cgroup v2 code. 
+ +const ( + CgroupNamePrefix = "name=" + defaultPrefix = "/sys/fs/cgroup" +) + +var ( + errUnified = errors.New("not implemented for cgroup v2 unified hierarchy") + ErrV1NoUnified = errors.New("invalid configuration: cannot use unified on cgroup v1") + + readMountinfoOnce sync.Once + readMountinfoErr error + cgroupMountinfo []*mountinfo.Info +) + +type NotFoundError struct { + Subsystem string +} + +func (e *NotFoundError) Error() string { + return fmt.Sprintf("mountpoint for %s not found", e.Subsystem) +} + +func NewNotFoundError(sub string) error { + return &NotFoundError{ + Subsystem: sub, + } +} + +func IsNotFound(err error) bool { + var nfErr *NotFoundError + return errors.As(err, &nfErr) +} + +func tryDefaultPath(cgroupPath, subsystem string) string { + if !strings.HasPrefix(defaultPrefix, cgroupPath) { + return "" + } + + // remove possible prefix + subsystem = strings.TrimPrefix(subsystem, CgroupNamePrefix) + + // Make sure we're still under defaultPrefix, and resolve + // a possible symlink (like cpu -> cpu,cpuacct). + path, err := securejoin.SecureJoin(defaultPrefix, subsystem) + if err != nil { + return "" + } + + // (1) path should be a directory. + st, err := os.Lstat(path) + if err != nil || !st.IsDir() { + return "" + } + + // (2) path should be a mount point. + pst, err := os.Lstat(filepath.Dir(path)) + if err != nil { + return "" + } + + if st.Sys().(*syscall.Stat_t).Dev == pst.Sys().(*syscall.Stat_t).Dev { + // parent dir has the same dev -- path is not a mount point + return "" + } + + // (3) path should have 'cgroup' fs type. + fst := unix.Statfs_t{} + err = unix.Statfs(path, &fst) + if err != nil || fst.Type != unix.CGROUP_SUPER_MAGIC { + return "" + } + + return path +} + +// readCgroupMountinfo returns a list of cgroup v1 mounts (i.e. the ones +// with fstype of "cgroup") for the current running process. 
+// +// The results are cached (to avoid re-reading mountinfo which is relatively +// expensive), so it is assumed that cgroup mounts are not being changed. +func readCgroupMountinfo() ([]*mountinfo.Info, error) { + readMountinfoOnce.Do(func() { + // mountinfo.GetMounts uses /proc/thread-self, so we can use it without + // issues. + cgroupMountinfo, readMountinfoErr = mountinfo.GetMounts( + mountinfo.FSTypeFilter("cgroup"), + ) + }) + return cgroupMountinfo, readMountinfoErr +} + +// https://www.kernel.org/doc/Documentation/cgroup-v1/cgroups.txt +func FindCgroupMountpoint(cgroupPath, subsystem string) (string, error) { + if IsCgroup2UnifiedMode() { + return "", errUnified + } + + // If subsystem is empty, we look for the cgroupv2 hybrid path. + if len(subsystem) == 0 { + return hybridMountpoint, nil + } + + // Avoid parsing mountinfo by trying the default path first, if possible. + if path := tryDefaultPath(cgroupPath, subsystem); path != "" { + return path, nil + } + + mnt, _, err := FindCgroupMountpointAndRoot(cgroupPath, subsystem) + return mnt, err +} + +func FindCgroupMountpointAndRoot(cgroupPath, subsystem string) (string, string, error) { + if IsCgroup2UnifiedMode() { + return "", "", errUnified + } + + mi, err := readCgroupMountinfo() + if err != nil { + return "", "", err + } + + return findCgroupMountpointAndRootFromMI(mi, cgroupPath, subsystem) +} + +func findCgroupMountpointAndRootFromMI(mounts []*mountinfo.Info, cgroupPath, subsystem string) (string, string, error) { + for _, mi := range mounts { + if strings.HasPrefix(mi.Mountpoint, cgroupPath) { + for _, opt := range strings.Split(mi.VFSOptions, ",") { + if opt == subsystem { + return mi.Mountpoint, mi.Root, nil + } + } + } + } + + return "", "", NewNotFoundError(subsystem) +} + +func (m Mount) GetOwnCgroup(cgroups map[string]string) (string, error) { + if len(m.Subsystems) == 0 { + return "", errors.New("no subsystem for mount") + } + + return getControllerPath(m.Subsystems[0], cgroups) +} + +func 
getCgroupMountsHelper(ss map[string]bool, mounts []*mountinfo.Info, all bool) ([]Mount, error) { + res := make([]Mount, 0, len(ss)) + numFound := 0 + for _, mi := range mounts { + m := Mount{ + Mountpoint: mi.Mountpoint, + Root: mi.Root, + } + for _, opt := range strings.Split(mi.VFSOptions, ",") { + seen, known := ss[opt] + if !known || (!all && seen) { + continue + } + ss[opt] = true + opt = strings.TrimPrefix(opt, CgroupNamePrefix) + m.Subsystems = append(m.Subsystems, opt) + numFound++ + } + if len(m.Subsystems) > 0 || all { + res = append(res, m) + } + if !all && numFound >= len(ss) { + break + } + } + return res, nil +} + +func getCgroupMountsV1(all bool) ([]Mount, error) { + mi, err := readCgroupMountinfo() + if err != nil { + return nil, err + } + + // We don't need to use /proc/thread-self here because runc always runs + // with every thread in the same cgroup. This lets us avoid having to do + // runtime.LockOSThread. + allSubsystems, err := ParseCgroupFile("/proc/self/cgroup") + if err != nil { + return nil, err + } + + allMap := make(map[string]bool) + for s := range allSubsystems { + allMap[s] = false + } + + return getCgroupMountsHelper(allMap, mi, all) +} + +// GetOwnCgroup returns the relative path to the cgroup the calling process is running in. +func GetOwnCgroup(subsystem string) (string, error) { + if IsCgroup2UnifiedMode() { + return "", errUnified + } + + // We don't need to use /proc/thread-self here because runc always runs + // with every thread in the same cgroup. This lets us avoid having to do + // runtime.LockOSThread. + cgroups, err := ParseCgroupFile("/proc/self/cgroup") + if err != nil { + return "", err + } + + return getControllerPath(subsystem, cgroups) +} + +func GetOwnCgroupPath(subsystem string) (string, error) { + cgroup, err := GetOwnCgroup(subsystem) + if err != nil { + return "", err + } + + // If subsystem is empty, we look for the cgroupv2 hybrid path. 
+ if len(subsystem) == 0 { + return hybridMountpoint, nil + } + + return getCgroupPathHelper(subsystem, cgroup) +} + +func getCgroupPathHelper(subsystem, cgroup string) (string, error) { + mnt, root, err := FindCgroupMountpointAndRoot("", subsystem) + if err != nil { + return "", err + } + + // This is needed for nested containers, because in /proc/self/cgroup we + // see paths from host, which don't exist in container. + relCgroup, err := filepath.Rel(root, cgroup) + if err != nil { + return "", err + } + + return filepath.Join(mnt, relCgroup), nil +} + +func getControllerPath(subsystem string, cgroups map[string]string) (string, error) { + if IsCgroup2UnifiedMode() { + return "", errUnified + } + + if p, ok := cgroups[subsystem]; ok { + return p, nil + } + + if p, ok := cgroups[CgroupNamePrefix+subsystem]; ok { + return p, nil + } + + return "", NewNotFoundError(subsystem) +}