From c7c70ce810683ec7ca68f760230cba7ea036c9c0 Mon Sep 17 00:00:00 2001
From: Aleksa Sarai <cyphar@cyphar.com>
Date: Sun, 23 May 2021 17:52:30 +1000
Subject: [PATCH 1/6] *: clean t.Skip messages

Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
---
 .../configs/validate/validator_test.go        |  2 +-
 libcontainer/integration/checkpoint_test.go   |  2 +-
 libcontainer/integration/exec_test.go         | 20 +++++++++----------
 libcontainer/integration/execin_test.go       |  4 ++--
 libcontainer/specconv/spec_linux_test.go      |  2 +-
 5 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/libcontainer/configs/validate/validator_test.go b/libcontainer/configs/validate/validator_test.go
index 327776b590e..7d1ef0f6b3d 100644
--- a/libcontainer/configs/validate/validator_test.go
+++ b/libcontainer/configs/validate/validator_test.go
@@ -154,7 +154,7 @@ func TestValidateSecurityWithoutNEWNS(t *testing.T) {
 
 func TestValidateUsernamespace(t *testing.T) {
 	if _, err := os.Stat("/proc/self/ns/user"); os.IsNotExist(err) {
-		t.Skip("userns is unsupported")
+		t.Skip("Test requires userns.")
 	}
 	config := &configs.Config{
 		Rootfs: "/var",
diff --git a/libcontainer/integration/checkpoint_test.go b/libcontainer/integration/checkpoint_test.go
index 9ec03b92d0f..398a214be41 100644
--- a/libcontainer/integration/checkpoint_test.go
+++ b/libcontainer/integration/checkpoint_test.go
@@ -41,7 +41,7 @@ func showFile(t *testing.T, fname string) error {
 
 func TestUsernsCheckpoint(t *testing.T) {
 	if _, err := os.Stat("/proc/self/ns/user"); os.IsNotExist(err) {
-		t.Skip("userns is unsupported")
+		t.Skip("Test requires userns.")
 	}
 	cmd := exec.Command("criu", "check", "--feature", "userns")
 	if err := cmd.Run(); err != nil {
diff --git a/libcontainer/integration/exec_test.go b/libcontainer/integration/exec_test.go
index 505006a89ac..c6aa394ce74 100644
--- a/libcontainer/integration/exec_test.go
+++ b/libcontainer/integration/exec_test.go
@@ -29,7 +29,7 @@ func TestExecPS(t *testing.T) {
 
 func TestUsernsExecPS(t *testing.T) {
 	if _, err := os.Stat("/proc/self/ns/user"); os.IsNotExist(err) {
-		t.Skip("userns is unsupported")
+		t.Skip("Test requires userns.")
 	}
 	testExecPS(t, true)
 }
@@ -166,7 +166,7 @@ func TestRlimit(t *testing.T) {
 
 func TestUsernsRlimit(t *testing.T) {
 	if _, err := os.Stat("/proc/self/ns/user"); os.IsNotExist(err) {
-		t.Skip("userns is unsupported")
+		t.Skip("Test requires userns.")
 	}
 
 	testRlimit(t, true)
@@ -498,7 +498,7 @@ func TestFreeze(t *testing.T) {
 
 func TestSystemdFreeze(t *testing.T) {
 	if !systemd.IsRunningSystemd() {
-		t.Skip("Systemd is unsupported")
+		t.Skip("Test requires systemd.")
 	}
 	testFreeze(t, true)
 }
@@ -555,7 +555,7 @@ func TestCpuShares(t *testing.T) {
 
 func TestCpuSharesSystemd(t *testing.T) {
 	if !systemd.IsRunningSystemd() {
-		t.Skip("Systemd is unsupported")
+		t.Skip("Test requires systemd.")
 	}
 	testCpuShares(t, true)
 }
@@ -590,7 +590,7 @@ func TestPids(t *testing.T) {
 
 func TestPidsSystemd(t *testing.T) {
 	if !systemd.IsRunningSystemd() {
-		t.Skip("Systemd is unsupported")
+		t.Skip("Test requires systemd.")
 	}
 	testPids(t, true)
 }
@@ -664,7 +664,7 @@ func TestCgroupResourcesUnifiedErrorOnV1(t *testing.T) {
 
 func TestCgroupResourcesUnifiedErrorOnV1Systemd(t *testing.T) {
 	if !systemd.IsRunningSystemd() {
-		t.Skip("Systemd is unsupported")
+		t.Skip("Test requires systemd.")
 	}
 	testCgroupResourcesUnifiedErrorOnV1(t, true)
 }
@@ -699,7 +699,7 @@ func TestCgroupResourcesUnified(t *testing.T) {
 
 func TestCgroupResourcesUnifiedSystemd(t *testing.T) {
 	if !systemd.IsRunningSystemd() {
-		t.Skip("Systemd is unsupported")
+		t.Skip("Test requires systemd.")
 	}
 	testCgroupResourcesUnified(t, true)
 }
@@ -1720,7 +1720,7 @@ func TestInitJoinPID(t *testing.T) {
 
 func TestInitJoinNetworkAndUser(t *testing.T) {
 	if _, err := os.Stat("/proc/self/ns/user"); os.IsNotExist(err) {
-		t.Skip("userns is unsupported")
+		t.Skip("Test requires userns.")
 	}
 	if testing.Short() {
 		return
@@ -1861,7 +1861,7 @@ func TestTmpfsCopyUp(t *testing.T) {
 
 func TestCGROUPPrivate(t *testing.T) {
 	if _, err := os.Stat("/proc/self/ns/cgroup"); os.IsNotExist(err) {
-		t.Skip("cgroupns is unsupported")
+		t.Skip("Test requires cgroupns.")
 	}
 	if testing.Short() {
 		return
@@ -1890,7 +1890,7 @@ func TestCGROUPPrivate(t *testing.T) {
 
 func TestCGROUPHost(t *testing.T) {
 	if _, err := os.Stat("/proc/self/ns/cgroup"); os.IsNotExist(err) {
-		t.Skip("cgroupns is unsupported")
+		t.Skip("Test requires cgroupns.")
 	}
 	if testing.Short() {
 		return
diff --git a/libcontainer/integration/execin_test.go b/libcontainer/integration/execin_test.go
index 894e54f368b..f4e58c015d8 100644
--- a/libcontainer/integration/execin_test.go
+++ b/libcontainer/integration/execin_test.go
@@ -72,7 +72,7 @@ func TestExecIn(t *testing.T) {
 
 func TestExecInUsernsRlimit(t *testing.T) {
 	if _, err := os.Stat("/proc/self/ns/user"); os.IsNotExist(err) {
-		t.Skip("userns is unsupported")
+		t.Skip("Test requires userns.")
 	}
 
 	testExecInRlimit(t, true)
@@ -528,7 +528,7 @@ func TestExecInOomScoreAdj(t *testing.T) {
 
 func TestExecInUserns(t *testing.T) {
 	if _, err := os.Stat("/proc/self/ns/user"); os.IsNotExist(err) {
-		t.Skip("userns is unsupported")
+		t.Skip("Test requires userns.")
 	}
 	if testing.Short() {
 		return
diff --git a/libcontainer/specconv/spec_linux_test.go b/libcontainer/specconv/spec_linux_test.go
index e8deabaecba..1a0aae9a2d1 100644
--- a/libcontainer/specconv/spec_linux_test.go
+++ b/libcontainer/specconv/spec_linux_test.go
@@ -470,7 +470,7 @@ func TestDupNamespaces(t *testing.T) {
 
 func TestNonZeroEUIDCompatibleSpecconvValidate(t *testing.T) {
 	if _, err := os.Stat("/proc/self/ns/user"); os.IsNotExist(err) {
-		t.Skip("userns is unsupported")
+		t.Skip("Test requires userns.")
 	}
 
 	spec := Example()

From 54904516e66a01e4817361bf5f9d0761fad44e22 Mon Sep 17 00:00:00 2001
From: Aleksa Sarai <cyphar@cyphar.com>
Date: Thu, 13 May 2021 13:16:20 +1000
Subject: [PATCH 2/6] libcontainer: fix integration failure in "make test"

When running inside a Docker container, systemd is not available. The
new TestFdLeaksSystemd forgot to include the relevant t.Skip section.

Fixes: a7feb4239534 ("libct/int: add TestFdLeaksSystemd")
Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
---
 libcontainer/integration/exec_test.go | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/libcontainer/integration/exec_test.go b/libcontainer/integration/exec_test.go
index c6aa394ce74..26de5b64f24 100644
--- a/libcontainer/integration/exec_test.go
+++ b/libcontainer/integration/exec_test.go
@@ -1921,6 +1921,9 @@ func TestFdLeaks(t *testing.T) {
 }
 
 func TestFdLeaksSystemd(t *testing.T) {
+	if !systemd.IsRunningSystemd() {
+		t.Skip("Test requires systemd.")
+	}
 	testFdLeaks(t, true)
 }
 

From dcc1cf7c1c9d9e366d7d7e28c57506a591b747fc Mon Sep 17 00:00:00 2001
From: Aleksa Sarai <cyphar@cyphar.com>
Date: Wed, 12 May 2021 15:54:31 +1000
Subject: [PATCH 3/6] devices: add emulator.Rules shorthand

The devices cgroup emulator is also useful for removing unneeded rules
as well as computing what the final default-allow state of the filter
will be (allow-list or deny-list).

Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
---
 libcontainer/cgroups/devices/devices_emulator.go | 9 +++++++++
 libcontainer/cgroups/systemd/common.go           | 3 +--
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/libcontainer/cgroups/devices/devices_emulator.go b/libcontainer/cgroups/devices/devices_emulator.go
index 3572a5eae8e..6f29ef0371e 100644
--- a/libcontainer/cgroups/devices/devices_emulator.go
+++ b/libcontainer/cgroups/devices/devices_emulator.go
@@ -371,3 +371,12 @@ func (source *Emulator) Transition(target *Emulator) ([]*devices.Rule, error) {
 	}
 	return transitionRules, nil
 }
+
+// Rules returns the minimum set of rules necessary to convert a *deny-all*
+// cgroup to the emulated filter state (note that this is not the same as a
+// default cgroupv1 cgroup -- which is allow-all). This is effectively just a
+// wrapper around Transition() with the source emulator being an empty cgroup.
+func (e *Emulator) Rules() ([]*devices.Rule, error) {
+	defaultCgroup := &Emulator{defaultAllow: false}
+	return defaultCgroup.Transition(e)
+}
diff --git a/libcontainer/cgroups/systemd/common.go b/libcontainer/cgroups/systemd/common.go
index de69617ee44..e17339452c2 100644
--- a/libcontainer/cgroups/systemd/common.go
+++ b/libcontainer/cgroups/systemd/common.go
@@ -203,8 +203,7 @@ func generateDeviceProperties(rules []*devices.Rule) ([]systemdDbus.Property, er
 	// Now generate the set of rules we actually need to apply. Unlike the
 	// normal devices cgroup, in "strict" mode systemd defaults to a deny-all
 	// whitelist which is the default for devices.Emulator.
-	baseEmu := &cgroupdevices.Emulator{}
-	finalRules, err := baseEmu.Transition(configEmu)
+	finalRules, err := configEmu.Rules()
 	if err != nil {
 		return nil, errors.Wrap(err, "get simplified rules for systemd")
 	}

From 98a3c0e4dbb68021e3f7c4cddfb43d4240ee8717 Mon Sep 17 00:00:00 2001
From: Aleksa Sarai <cyphar@cyphar.com>
Date: Wed, 12 May 2021 15:56:14 +1000
Subject: [PATCH 4/6] cgroup2: devices: switch to emulator for cgroupv1 parity

There were several issues with the previous cgroupv2 devices filter
generator implementation, stemming from the previous implementation
using a few too many tricks to implement the correct cgroup behaviour
(rules were handled in reverse order, with wildcards having particularly
special interpretations). As a result, some slightly odd configurations
with rules in specific orders could result in incorrect filters being
generated.

By switching to the emulator which is already used by cgroupv1, we can
guarantee that the behaviour of filters in both cgroup versions will be
identical, as well as making use of the hardenings in the emulator (not
allowing users to add deny rules the kernel will ignore).

(Note that because the ordering of the devices emulator rules is
deterministic and based on the rule value, the existing test rules had
to be reordered slightly.)

Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
---
 .../cgroups/ebpf/devicefilter/devicefilter.go | 106 +++++++++------
 .../ebpf/devicefilter/devicefilter_test.go    | 128 +++++++++---------
 libcontainer/cgroups/fs2/devices.go           |   3 -
 3 files changed, 131 insertions(+), 106 deletions(-)

diff --git a/libcontainer/cgroups/ebpf/devicefilter/devicefilter.go b/libcontainer/cgroups/ebpf/devicefilter/devicefilter.go
index fcd3746e023..10e575d2640 100644
--- a/libcontainer/cgroups/ebpf/devicefilter/devicefilter.go
+++ b/libcontainer/cgroups/ebpf/devicefilter/devicefilter.go
@@ -11,6 +11,7 @@ import (
 	"strconv"
 
 	"github.com/cilium/ebpf/asm"
+	devicesemulator "github.com/opencontainers/runc/libcontainer/cgroups/devices"
 	"github.com/opencontainers/runc/libcontainer/devices"
 	"github.com/pkg/errors"
 	"golang.org/x/sys/unix"
@@ -22,11 +23,44 @@ const (
 )
 
 // DeviceFilter returns eBPF device filter program and its license string
-func DeviceFilter(devices []*devices.Rule) (asm.Instructions, string, error) {
-	p := &program{}
+func DeviceFilter(rules []*devices.Rule) (asm.Instructions, string, error) {
+	// Generate the minimum ruleset for the device rules we are given. While we
+	// don't care about minimum transitions in cgroupv2, using the emulator
+	// gives us a guarantee that the behaviour of devices filtering is the same
+	// as cgroupv1, including security hardenings to avoid misconfiguration
+	// (such as punching holes in wildcard rules).
+	emu := new(devicesemulator.Emulator)
+	for _, rule := range rules {
+		if err := emu.Apply(*rule); err != nil {
+			return nil, "", err
+		}
+	}
+	cleanRules, err := emu.Rules()
+	if err != nil {
+		return nil, "", err
+	}
+
+	p := &program{
+		defaultAllow: emu.IsBlacklist(),
+	}
 	p.init()
-	for i := len(devices) - 1; i >= 0; i-- {
-		if err := p.appendDevice(devices[i]); err != nil {
+
+	for idx, rule := range cleanRules {
+		if rule.Type == devices.WildcardDevice {
+			// We can safely skip over wildcard entries because there should
+			// only be one (at most) at the very start to instruct cgroupv1 to
+			// go into allow-list mode. However we do double-check this here.
+			if idx != 0 || rule.Allow != emu.IsBlacklist() {
+				return nil, "", errors.Errorf("[internal error] emulated cgroupv2 devices ruleset had bad wildcard at idx %v (%s)", idx, rule.CgroupString())
+			}
+			continue
+		}
+		if rule.Allow == p.defaultAllow {
+			// There should be no rules which have an action equal to the
+			// default action, the emulator removes those.
+			return nil, "", errors.Errorf("[internal error] emulated cgroupv2 devices ruleset had no-op rule at idx %v (%s)", idx, rule.CgroupString())
+		}
+		if err := p.appendRule(rule); err != nil {
 			return nil, "", err
 		}
 	}
@@ -35,9 +69,9 @@ func DeviceFilter(devices []*devices.Rule) (asm.Instructions, string, error) {
 }
 
 type program struct {
-	insts       asm.Instructions
-	hasWildCard bool
-	blockID     int
+	insts        asm.Instructions
+	defaultAllow bool
+	blockID      int
 }
 
 func (p *program) init() {
@@ -67,39 +101,36 @@ func (p *program) init() {
 		asm.LoadMem(asm.R5, asm.R1, 8, asm.Word))
 }
 
-// appendDevice needs to be called from the last element of OCI linux.resources.devices to the head element.
-func (p *program) appendDevice(dev *devices.Rule) error {
+// appendRule rule converts an OCI rule to the relevant eBPF block and adds it
+// to the in-progress filter program. In order to operate properly, it must be
+// called with a "clean" rule list (generated by devices.Emulator.Rules() --
+// with any "a" rules removed).
+func (p *program) appendRule(rule *devices.Rule) error {
 	if p.blockID < 0 {
 		return errors.New("the program is finalized")
 	}
-	if p.hasWildCard {
-		// All entries after wildcard entry are ignored
-		return nil
-	}
 
 	bpfType := int32(-1)
 	hasType := true
-	switch dev.Type {
-	case 'c':
+	switch rule.Type {
+	case devices.CharDevice:
 		bpfType = int32(unix.BPF_DEVCG_DEV_CHAR)
-	case 'b':
+	case devices.BlockDevice:
 		bpfType = int32(unix.BPF_DEVCG_DEV_BLOCK)
-	case 'a':
-		hasType = false
 	default:
-		// if not specified in OCI json, typ is set to DeviceTypeAll
-		return errors.Errorf("invalid Type %q", string(dev.Type))
+		// We do not permit 'a', nor any other types we don't know about.
+		return errors.Errorf("invalid Type %q", string(rule.Type))
 	}
-	if dev.Major > math.MaxUint32 {
-		return errors.Errorf("invalid major %d", dev.Major)
+	if rule.Major > math.MaxUint32 {
+		return errors.Errorf("invalid major %d", rule.Major)
 	}
-	if dev.Minor > math.MaxUint32 {
-		return errors.Errorf("invalid minor %d", dev.Major)
+	if rule.Minor > math.MaxUint32 {
+		return errors.Errorf("invalid minor %d", rule.Major)
 	}
-	hasMajor := dev.Major >= 0 // if not specified in OCI json, major is set to -1
-	hasMinor := dev.Minor >= 0
+	hasMajor := rule.Major >= 0 // if not specified in OCI json, major is set to -1
+	hasMinor := rule.Minor >= 0
 	bpfAccess := int32(0)
-	for _, r := range dev.Permissions {
+	for _, r := range rule.Permissions {
 		switch r {
 		case 'r':
 			bpfAccess |= unix.BPF_DEVCG_ACC_READ
@@ -136,19 +167,16 @@ func (p *program) appendDevice(dev *devices.Rule) error {
 	if hasMajor {
 		p.insts = append(p.insts,
 			// if (R4 != major) goto next
-			asm.JNE.Imm(asm.R4, int32(dev.Major), nextBlockSym),
+			asm.JNE.Imm(asm.R4, int32(rule.Major), nextBlockSym),
 		)
 	}
 	if hasMinor {
 		p.insts = append(p.insts,
 			// if (R5 != minor) goto next
-			asm.JNE.Imm(asm.R5, int32(dev.Minor), nextBlockSym),
+			asm.JNE.Imm(asm.R5, int32(rule.Minor), nextBlockSym),
 		)
 	}
-	if !hasType && !hasAccess && !hasMajor && !hasMinor {
-		p.hasWildCard = true
-	}
-	p.insts = append(p.insts, acceptBlock(dev.Allow)...)
+	p.insts = append(p.insts, acceptBlock(rule.Allow)...)
 	// set blockSym to the first instruction we added in this iteration
 	p.insts[prevBlockLastIdx+1] = p.insts[prevBlockLastIdx+1].Sym(blockSym)
 	p.blockID++
@@ -156,14 +184,14 @@ func (p *program) appendDevice(dev *devices.Rule) error {
 }
 
 func (p *program) finalize() (asm.Instructions, error) {
-	if p.hasWildCard {
-		// acceptBlock with asm.Return() is already inserted
-		return p.insts, nil
+	var v int32
+	if p.defaultAllow {
+		v = 1
 	}
 	blockSym := "block-" + strconv.Itoa(p.blockID)
 	p.insts = append(p.insts,
-		// R0 <- 0
-		asm.Mov.Imm32(asm.R0, 0).Sym(blockSym),
+		// R0 <- v
+		asm.Mov.Imm32(asm.R0, v).Sym(blockSym),
 		asm.Return(),
 	)
 	p.blockID = -1
@@ -171,7 +199,7 @@ func (p *program) finalize() (asm.Instructions, error) {
 }
 
 func acceptBlock(accept bool) asm.Instructions {
-	v := int32(0)
+	var v int32
 	if accept {
 		v = 1
 	}
diff --git a/libcontainer/cgroups/ebpf/devicefilter/devicefilter_test.go b/libcontainer/cgroups/ebpf/devicefilter/devicefilter_test.go
index cfcaa20306a..a8fc562d038 100644
--- a/libcontainer/cgroups/ebpf/devicefilter/devicefilter_test.go
+++ b/libcontainer/cgroups/ebpf/devicefilter/devicefilter_test.go
@@ -55,81 +55,81 @@ block-0:
 func TestDeviceFilter_BuiltInAllowList(t *testing.T) {
 	expected := `
 // load parameters into registers
-         0: LdXMemW dst: r2 src: r1 off: 0 imm: 0
-         1: And32Imm dst: r2 imm: 65535
-         2: LdXMemW dst: r3 src: r1 off: 0 imm: 0
-         3: RSh32Imm dst: r3 imm: 16
-         4: LdXMemW dst: r4 src: r1 off: 4 imm: 0
-         5: LdXMemW dst: r5 src: r1 off: 8 imm: 0
+        0: LdXMemW dst: r2 src: r1 off: 0 imm: 0
+        1: And32Imm dst: r2 imm: 65535
+        2: LdXMemW dst: r3 src: r1 off: 0 imm: 0
+        3: RSh32Imm dst: r3 imm: 16
+        4: LdXMemW dst: r4 src: r1 off: 4 imm: 0
+        5: LdXMemW dst: r5 src: r1 off: 8 imm: 0
 block-0:
-// tuntap (c, 10, 200, rwm, allow)
-         6: JNEImm dst: r2 off: -1 imm: 2 <block-1>
-         7: JNEImm dst: r4 off: -1 imm: 10 <block-1>
-         8: JNEImm dst: r5 off: -1 imm: 200 <block-1>
-         9: Mov32Imm dst: r0 imm: 1
-        10: Exit
+// (b, wildcard, wildcard, m, true)
+        6: JNEImm dst: r2 off: -1 imm: 1 <block-1>
+        7: Mov32Reg dst: r1 src: r3
+        8: And32Imm dst: r1 imm: 1
+        9: JNEReg dst: r1 off: -1 src: r3 <block-1>
+        10: Mov32Imm dst: r0 imm: 1
+        11: Exit
 block-1:
-        11: JNEImm dst: r2 off: -1 imm: 2 <block-2>
-        12: JNEImm dst: r4 off: -1 imm: 5 <block-2>
-        13: JNEImm dst: r5 off: -1 imm: 2 <block-2>
-        14: Mov32Imm dst: r0 imm: 1
-        15: Exit
+// (c, wildcard, wildcard, m, true)
+        12: JNEImm dst: r2 off: -1 imm: 2 <block-2>
+        13: Mov32Reg dst: r1 src: r3
+        14: And32Imm dst: r1 imm: 1
+        15: JNEReg dst: r1 off: -1 src: r3 <block-2>
+        16: Mov32Imm dst: r0 imm: 1
+        17: Exit
 block-2:
-// /dev/pts (c, 136, wildcard, rwm, true)
-        16: JNEImm dst: r2 off: -1 imm: 2 <block-3>
-        17: JNEImm dst: r4 off: -1 imm: 136 <block-3>
-        18: Mov32Imm dst: r0 imm: 1
-        19: Exit
+        18: JNEImm dst: r2 off: -1 imm: 2 <block-3>
+        19: JNEImm dst: r4 off: -1 imm: 1 <block-3>
+        20: JNEImm dst: r5 off: -1 imm: 3 <block-3>
+        21: Mov32Imm dst: r0 imm: 1
+        22: Exit
 block-3:
-        20: JNEImm dst: r2 off: -1 imm: 2 <block-4>
-        21: JNEImm dst: r4 off: -1 imm: 1 <block-4>
-        22: JNEImm dst: r5 off: -1 imm: 9 <block-4>
-        23: Mov32Imm dst: r0 imm: 1
-        24: Exit
+        23: JNEImm dst: r2 off: -1 imm: 2 <block-4>
+        24: JNEImm dst: r4 off: -1 imm: 1 <block-4>
+        25: JNEImm dst: r5 off: -1 imm: 5 <block-4>
+        26: Mov32Imm dst: r0 imm: 1
+        27: Exit
 block-4:
-        25: JNEImm dst: r2 off: -1 imm: 2 <block-5>
-        26: JNEImm dst: r4 off: -1 imm: 1 <block-5>
-        27: JNEImm dst: r5 off: -1 imm: 5 <block-5>
-        28: Mov32Imm dst: r0 imm: 1
-        29: Exit
+        28: JNEImm dst: r2 off: -1 imm: 2 <block-5>
+        29: JNEImm dst: r4 off: -1 imm: 1 <block-5>
+        30: JNEImm dst: r5 off: -1 imm: 7 <block-5>
+        31: Mov32Imm dst: r0 imm: 1
+        32: Exit
 block-5:
-        30: JNEImm dst: r2 off: -1 imm: 2 <block-6>
-        31: JNEImm dst: r4 off: -1 imm: 5 <block-6>
-        32: JNEImm dst: r5 off: -1 imm: 0 <block-6>
-        33: Mov32Imm dst: r0 imm: 1
-        34: Exit
+        33: JNEImm dst: r2 off: -1 imm: 2 <block-6>
+        34: JNEImm dst: r4 off: -1 imm: 1 <block-6>
+        35: JNEImm dst: r5 off: -1 imm: 8 <block-6>
+        36: Mov32Imm dst: r0 imm: 1
+        37: Exit
 block-6:
-        35: JNEImm dst: r2 off: -1 imm: 2 <block-7>
-        36: JNEImm dst: r4 off: -1 imm: 1 <block-7>
-        37: JNEImm dst: r5 off: -1 imm: 7 <block-7>
-        38: Mov32Imm dst: r0 imm: 1
-        39: Exit
+        38: JNEImm dst: r2 off: -1 imm: 2 <block-7>
+        39: JNEImm dst: r4 off: -1 imm: 1 <block-7>
+        40: JNEImm dst: r5 off: -1 imm: 9 <block-7>
+        41: Mov32Imm dst: r0 imm: 1
+        42: Exit
 block-7:
-        40: JNEImm dst: r2 off: -1 imm: 2 <block-8>
-        41: JNEImm dst: r4 off: -1 imm: 1 <block-8>
-        42: JNEImm dst: r5 off: -1 imm: 8 <block-8>
-        43: Mov32Imm dst: r0 imm: 1
-        44: Exit
+        43: JNEImm dst: r2 off: -1 imm: 2 <block-8>
+        44: JNEImm dst: r4 off: -1 imm: 5 <block-8>
+        45: JNEImm dst: r5 off: -1 imm: 0 <block-8>
+        46: Mov32Imm dst: r0 imm: 1
+        47: Exit
 block-8:
-        45: JNEImm dst: r2 off: -1 imm: 2 <block-9>
-        46: JNEImm dst: r4 off: -1 imm: 1 <block-9>
-        47: JNEImm dst: r5 off: -1 imm: 3 <block-9>
-        48: Mov32Imm dst: r0 imm: 1
-        49: Exit
+        48: JNEImm dst: r2 off: -1 imm: 2 <block-9>
+        49: JNEImm dst: r4 off: -1 imm: 5 <block-9>
+        50: JNEImm dst: r5 off: -1 imm: 2 <block-9>
+        51: Mov32Imm dst: r0 imm: 1
+        52: Exit
 block-9:
-// (b, wildcard, wildcard, m, true)
-        50: JNEImm dst: r2 off: -1 imm: 1 <block-10>
-        51: Mov32Reg dst: r1 src: r3
-        52: And32Imm dst: r1 imm: 1
-        53: JNEReg dst: r1 off: -1 src: r3 <block-10>
-        54: Mov32Imm dst: r0 imm: 1
-        55: Exit
+// tuntap (c, 10, 200, rwm, allow)
+        53: JNEImm dst: r2 off: -1 imm: 2 <block-10>
+        54: JNEImm dst: r4 off: -1 imm: 10 <block-10>
+        55: JNEImm dst: r5 off: -1 imm: 200 <block-10>
+        56: Mov32Imm dst: r0 imm: 1
+        57: Exit
 block-10:
-// (c, wildcard, wildcard, m, true)
-        56: JNEImm dst: r2 off: -1 imm: 2 <block-11>
-        57: Mov32Reg dst: r1 src: r3
-        58: And32Imm dst: r1 imm: 1
-        59: JNEReg dst: r1 off: -1 src: r3 <block-11>
+// /dev/pts (c, 136, wildcard, rwm, true)
+        58: JNEImm dst: r2 off: -1 imm: 2 <block-11>
+        59: JNEImm dst: r4 off: -1 imm: 136 <block-11>
         60: Mov32Imm dst: r0 imm: 1
         61: Exit
 block-11:
diff --git a/libcontainer/cgroups/fs2/devices.go b/libcontainer/cgroups/fs2/devices.go
index fa9194b557a..de1930a800e 100644
--- a/libcontainer/cgroups/fs2/devices.go
+++ b/libcontainer/cgroups/fs2/devices.go
@@ -58,9 +58,6 @@ func setDevices(dirPath string, r *configs.Resources) error {
 	if r.SkipDevices {
 		return nil
 	}
-	// XXX: This is currently a white-list (but all callers pass a blacklist of
-	//      devices). This is bad for a whole variety of reasons, but will need
-	//      to be fixed with co-ordinated effort with downstreams.
 	insts, license, err := devicefilter.DeviceFilter(r.Devices)
 	if err != nil {
 		return err

From d0f2c25f521e22d25390a8bd4af554fcb1735a58 Mon Sep 17 00:00:00 2001
From: Aleksa Sarai <cyphar@cyphar.com>
Date: Thu, 13 May 2021 00:32:26 +1000
Subject: [PATCH 5/6] cgroup2: devices: replace all existing filters when
 attaching

In the normal cases (only one existing filter or no existing filters),
just make use of BPF_F_REPLACE if there is one existing filter. However
if there is more than one filter applied, we should probably remove all
other filters since the alternative is that we will never remove our old
filters.

The only two other viable ways of solving this problem would be to use
BPF pins to either pin the eBPF program using a predictable name (so we
can always only replace *our* programs) or to switch away from custom
programs and instead use eBPF maps (which are pinned) and thus we just
update the map conntents to update the ruleset. Unfortunately these both
would add a hard requirement of bpffs and would require at least a minor
rewrite of the eBPF filtering code -- which is better left for another
time.

Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
---
 libcontainer/cgroups/ebpf/ebpf.go   | 106 +++++++++++++++++++++++++---
 libcontainer/cgroups/fs2/devices.go |  11 ---
 2 files changed, 98 insertions(+), 19 deletions(-)

diff --git a/libcontainer/cgroups/ebpf/ebpf.go b/libcontainer/cgroups/ebpf/ebpf.go
index 3bee21cb7de..6066d08c800 100644
--- a/libcontainer/cgroups/ebpf/ebpf.go
+++ b/libcontainer/cgroups/ebpf/ebpf.go
@@ -1,22 +1,83 @@
 package ebpf
 
 import (
+	"fmt"
+	"runtime"
+	"unsafe"
+
 	"github.com/cilium/ebpf"
 	"github.com/cilium/ebpf/asm"
 	"github.com/cilium/ebpf/link"
 	"github.com/pkg/errors"
+	"github.com/sirupsen/logrus"
 	"golang.org/x/sys/unix"
 )
 
+func nilCloser() error {
+	return nil
+}
+
+func findAttachedCgroupDeviceFilters(dirFd int) ([]*ebpf.Program, error) {
+	type bpfAttrQuery struct {
+		TargetFd    uint32
+		AttachType  uint32
+		QueryType   uint32
+		AttachFlags uint32
+		ProgIds     uint64 // __aligned_u64
+		ProgCnt     uint32
+	}
+
+	// Currently you can only have 64 eBPF programs attached to a cgroup.
+	size := 64
+	retries := 0
+	for retries < 10 {
+		progIds := make([]uint32, size)
+		query := bpfAttrQuery{
+			TargetFd:   uint32(dirFd),
+			AttachType: uint32(unix.BPF_CGROUP_DEVICE),
+			ProgIds:    uint64(uintptr(unsafe.Pointer(&progIds[0]))),
+			ProgCnt:    uint32(len(progIds)),
+		}
+
+		// Fetch the list of program ids.
+		_, _, errno := unix.Syscall(unix.SYS_BPF,
+			uintptr(unix.BPF_PROG_QUERY),
+			uintptr(unsafe.Pointer(&query)),
+			unsafe.Sizeof(query))
+		size = int(query.ProgCnt)
+		runtime.KeepAlive(query)
+		if errno != 0 {
+			// On ENOSPC we get the correct number of programs.
+			if errno == unix.ENOSPC {
+				retries++
+				continue
+			}
+			return nil, fmt.Errorf("bpf_prog_query(BPF_CGROUP_DEVICE) failed: %w", errno)
+		}
+
+		// Convert the ids to program handles.
+		progIds = progIds[:size]
+		programs := make([]*ebpf.Program, len(progIds))
+		for idx, progId := range progIds {
+			program, err := ebpf.NewProgramFromID(ebpf.ProgramID(progId))
+			if err != nil {
+				return nil, fmt.Errorf("cannot fetch program from id: %w", err)
+			}
+			programs[idx] = program
+		}
+		runtime.KeepAlive(progIds)
+		return programs, nil
+	}
+
+	return nil, errors.New("could not get complete list of CGROUP_DEVICE programs")
+}
+
 // LoadAttachCgroupDeviceFilter installs eBPF device filter program to /sys/fs/cgroup/<foo> directory.
 //
 // Requires the system to be running in cgroup2 unified-mode with kernel >= 4.15 .
 //
 // https://github.com/torvalds/linux/commit/ebc614f687369f9df99828572b1d85a7c2de3d92
-func LoadAttachCgroupDeviceFilter(insts asm.Instructions, license string, dirFD int) (func() error, error) {
-	nilCloser := func() error {
-		return nil
-	}
+func LoadAttachCgroupDeviceFilter(insts asm.Instructions, license string, dirFd int) (func() error, error) {
 	// Increase `ulimit -l` limit to avoid BPF_PROG_LOAD error (#2167).
 	// This limit is not inherited into the container.
 	memlockLimit := &unix.Rlimit{
@@ -24,6 +85,11 @@ func LoadAttachCgroupDeviceFilter(insts asm.Instructions, license string, dirFD
 		Max: unix.RLIM_INFINITY,
 	}
 	_ = unix.Setrlimit(unix.RLIMIT_MEMLOCK, memlockLimit)
+	// Get the list of existing programs.
+	oldProgs, err := findAttachedCgroupDeviceFilters(dirFd)
+	if err != nil {
+		return nilCloser, err
+	}
 	spec := &ebpf.ProgramSpec{
 		Type:         ebpf.CGroupDevice,
 		Instructions: insts,
@@ -33,25 +99,49 @@ func LoadAttachCgroupDeviceFilter(insts asm.Instructions, license string, dirFD
 	if err != nil {
 		return nilCloser, err
 	}
+	// If there is only one old program, we can just replace it directly.
+	var replaceProg *ebpf.Program
+	if len(oldProgs) == 1 {
+		replaceProg = oldProgs[0]
+	}
 	err = link.RawAttachProgram(link.RawAttachProgramOptions{
-		Target:  dirFD,
+		Target:  dirFd,
 		Program: prog,
+		Replace: replaceProg,
 		Attach:  ebpf.AttachCGroupDevice,
 		Flags:   unix.BPF_F_ALLOW_MULTI,
 	})
 	if err != nil {
-		return nilCloser, errors.Wrap(err, "failed to call BPF_PROG_ATTACH (BPF_CGROUP_DEVICE, BPF_F_ALLOW_MULTI)")
+		return nilCloser, fmt.Errorf("failed to call BPF_PROG_ATTACH (BPF_CGROUP_DEVICE, BPF_F_ALLOW_MULTI): %w", err)
 	}
 	closer := func() error {
 		err = link.RawDetachProgram(link.RawDetachProgramOptions{
-			Target:  dirFD,
+			Target:  dirFd,
 			Program: prog,
 			Attach:  ebpf.AttachCGroupDevice,
 		})
 		if err != nil {
-			return errors.Wrap(err, "failed to call BPF_PROG_DETACH (BPF_CGROUP_DEVICE)")
+			return fmt.Errorf("failed to call BPF_PROG_DETACH (BPF_CGROUP_DEVICE): %w", err)
 		}
+		// TODO: Should we attach the old filters back in this case? Otherwise
+		//       we fail-open on a security feature, which is a bit scary.
 		return nil
 	}
+	// If there was more than one old program, give a warning (since this
+	// really shouldn't happen with runc-managed cgroups) and then detach all
+	// the old programs.
+	if len(oldProgs) > 1 {
+		logrus.Warnf("found more than one filter (%d) attached to a cgroup -- removing extra filters!", len(oldProgs))
+		for _, oldProg := range oldProgs {
+			err = link.RawDetachProgram(link.RawDetachProgramOptions{
+				Target:  dirFd,
+				Program: oldProg,
+				Attach:  ebpf.AttachCGroupDevice,
+			})
+			if err != nil {
+				return closer, fmt.Errorf("failed to call BPF_PROG_DETACH (BPF_CGROUP_DEVICE) on old filter program: %w", err)
+			}
+		}
+	}
 	return closer, nil
 }
diff --git a/libcontainer/cgroups/fs2/devices.go b/libcontainer/cgroups/fs2/devices.go
index de1930a800e..6b4b73a134f 100644
--- a/libcontainer/cgroups/fs2/devices.go
+++ b/libcontainer/cgroups/fs2/devices.go
@@ -67,17 +67,6 @@ func setDevices(dirPath string, r *configs.Resources) error {
 		return errors.Errorf("cannot get dir FD for %s", dirPath)
 	}
 	defer unix.Close(dirFD)
-	// XXX: This code is currently incorrect when it comes to updating an
-	//      existing cgroup with new rules (new rulesets are just appended to
-	//      the program list because this uses BPF_F_ALLOW_MULTI). If we didn't
-	//      use BPF_F_ALLOW_MULTI we could actually atomically swap the
-	//      programs.
-	//
-	//      The real issue is that BPF_F_ALLOW_MULTI makes it hard to have a
-	//      race-free blacklist because it acts as a whitelist by default, and
-	//      having a deny-everything program cannot be overridden by other
-	//      programs. You could temporarily insert a deny-everything program
-	//      but that would result in spurrious failures during updates.
 	if _, err := ebpf.LoadAttachCgroupDeviceFilter(insts, license, dirFD); err != nil {
 		if !canSkipEBPFError(r) {
 			return err

From 00119c85a9232700a27d9f80d67a1c7e947e124f Mon Sep 17 00:00:00 2001
From: Aleksa Sarai <cyphar@cyphar.com>
Date: Tue, 18 May 2021 16:44:24 +1000
Subject: [PATCH 6/6] integration: add repeated "runc update" test

This is to ensure that we aren't leaking eBPF programs after "runc
update". Unfortunately we cannot directly test the behaviour of cgroup
program updates in an integration test because "runc update" doesn't
support that behaviour at the moment.

So instead we rely on the fact that each "runc update" implicitly
triggers the devices rules to be updated. Without the previous patches
applied, this new test will fail with errors (on cgroupv2 systems).

Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
---
 tests/integration/update.bats | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/tests/integration/update.bats b/tests/integration/update.bats
index e204ea49550..41344a9dc22 100644
--- a/tests/integration/update.bats
+++ b/tests/integration/update.bats
@@ -648,3 +648,29 @@ EOF
 	runc resume test_update
 	[ "$status" -eq 0 ]
 }
+
+@test "runc update replaces devices cgroup program" {
+	[[ "$ROOTLESS" -ne 0 ]] && requires rootless_cgroup
+
+	# Unfortunately we can't update device rules directly with runc ("runc
+	# update" doesn't support it, and adding support would require ironing out
+	# some long-standing design issues with device configuration). So instead
+	# we just run "runc update" many times, relying on the fact that runc will
+	# re-apply devices cgroup rules on each runc update.
+	#
+	# In the past runc would not delete old cgroupv2 eBPF programs, so this
+	# test ensures that once we go past the program limit (64 stacked programs
+	# at time of writing) you can still run "runc" update.
+
+	# Run the container in the background.
+	runc run -d --console-socket "$CONSOLE_SOCKET" test_update
+	[ "$status" -eq 0 ]
+
+	for new_limit in $(seq 300); do
+		runc update --pids-limit "$((2 * new_limit))" test_update
+		[ "$status" -eq 0 ]
+	done
+
+	# The container should still be running.
+	testcontainer test_update running
+}