From 4c530c95516fea701946511baa5f198162c70703 Mon Sep 17 00:00:00 2001
From: Kir Kolyshkin <kolyshkin@gmail.com>
Date: Thu, 5 Feb 2026 17:15:03 -0800
Subject: [PATCH 1/4] libct: factor out initProcessCgroupPath

Separate initProcessCgroupPath code out of addIntoCgroupV2.
To be used by the next patch.

While at it, describe the new scenario in which the container's
configured cgroup might not be available.

Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
(cherry picked from commit 94133fab970c2ff9011cc9531b7415934b9fcd61)
Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
---
 libcontainer/process_linux.go | 53 +++++++++++++++++++++++++----------
 1 file changed, 38 insertions(+), 15 deletions(-)

diff --git a/libcontainer/process_linux.go b/libcontainer/process_linux.go
index 8114cc50bbf..b5da931b4cc 100644
--- a/libcontainer/process_linux.go
+++ b/libcontainer/process_linux.go
@@ -19,10 +19,11 @@ import (
 	"syscall"
 	"time"
 
-	"github.com/opencontainers/runtime-spec/specs-go"
 	"github.com/sirupsen/logrus"
 	"golang.org/x/sys/unix"
 
+	"github.com/opencontainers/runtime-spec/specs-go"
+
 	"github.com/opencontainers/cgroups"
 	"github.com/opencontainers/cgroups/fs2"
 	"github.com/opencontainers/runc/libcontainer/configs"
@@ -274,24 +275,46 @@ func (p *setnsProcess) addIntoCgroupV1() error {
 	return nil
 }
 
+// initProcessCgroupPath returns container init's cgroup path,
+// as read from /proc/PID/cgroup. Only works for cgroup v2.
+// Returns empty string if the path can not be obtained.
+//
+// This is used by runc exec in these cases:
+//
+//  1. On cgroup v2 + nesting + domain controllers, adding to initial cgroup
+//     may fail with EBUSY (https://github.com/opencontainers/runc/issues/2356);
+//
+//  2. A container init process with no cgroupns and /sys/fs/cgroup rw access
+//     may move itself to any other cgroup, and the original cgroup will disappear.
+func (p *setnsProcess) initProcessCgroupPath() string {
+	if p.initProcessPid == 0 || !cgroups.IsCgroup2UnifiedMode() {
+		return ""
+	}
+
+	cg, err := cgroups.ParseCgroupFile("/proc/" + strconv.Itoa(p.initProcessPid) + "/cgroup")
+	if err != nil {
+		return ""
+	}
+	cgroup, ok := cg[""]
+	if !ok {
+		return ""
+	}
+
+	return fs2.UnifiedMountpoint + cgroup
+}
+
 func (p *setnsProcess) addIntoCgroupV2() error {
 	sub := p.process.SubCgroupPaths[""]
 	err := p.manager.AddPid(sub, p.pid())
 	if err != nil && !p.rootlessCgroups {
-		// On cgroup v2 + nesting + domain controllers, adding to initial cgroup may fail with EBUSY.
-		// https://github.com/opencontainers/runc/issues/2356#issuecomment-621277643
-		// Try to join the cgroup of InitProcessPid, unless sub-cgroup is explicitly set.
-		if p.initProcessPid != 0 && sub == "" {
-			initProcCgroupFile := fmt.Sprintf("/proc/%d/cgroup", p.initProcessPid)
-			initCg, initCgErr := cgroups.ParseCgroupFile(initProcCgroupFile)
-			if initCgErr == nil {
-				if initCgPath, ok := initCg[""]; ok {
-					initCgDirpath := filepath.Join(fs2.UnifiedMountpoint, initCgPath)
-					logrus.Debugf("adding pid %d to cgroup failed (%v), attempting to join %s",
-						p.pid(), err, initCgDirpath)
-					// NOTE: initCgDirPath is not guaranteed to exist because we didn't pause the container.
-					err = cgroups.WriteCgroupProc(initCgDirpath, p.pid())
-				}
+		// Failed to join the configured cgroup, fall back to container init's cgroup
+		// unless sub-cgroup is explicitly requested.
+		if sub == "" {
+			if path := p.initProcessCgroupPath(); path != "" {
+				logrus.Debugf("adding pid %d to configured cgroup failed (%v), will join container init cgroup %q",
+					p.pid(), err, path)
+				// NOTE: path is not guaranteed to exist because we didn't pause the container.
+				err = cgroups.WriteCgroupProc(path, p.pid())
 			}
 		}
 		if err != nil {

From 404dce8e53fcdd6069cf0cecd092e58f1a210b33 Mon Sep 17 00:00:00 2001
From: Kir Kolyshkin <kolyshkin@gmail.com>
Date: Mon, 9 Feb 2026 17:01:51 -0800
Subject: [PATCH 2/4] libct: refactor addIntoCgroupV2, fix wrt rootless

1. Refactor addIntoCgroupV2 in an attempt to simplify it.

2. Fix the bug of not trying the init cgroup fallback if
   rootlessCgroups is set. This is a bug because rootlessCgroups
   means that cgroup join errors are to be ignored, not that the
   fallback is never to be tried.

Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
(cherry picked from commit 1d030fab7dd856c0709e102b61bd1792e85d13d3)
Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
---
 libcontainer/process_linux.go | 40 +++++++++++++++++++++++------------
 1 file changed, 26 insertions(+), 14 deletions(-)

diff --git a/libcontainer/process_linux.go b/libcontainer/process_linux.go
index b5da931b4cc..6251b326bfd 100644
--- a/libcontainer/process_linux.go
+++ b/libcontainer/process_linux.go
@@ -306,23 +306,35 @@ func (p *setnsProcess) initProcessCgroupPath() string {
 func (p *setnsProcess) addIntoCgroupV2() error {
 	sub := p.process.SubCgroupPaths[""]
 	err := p.manager.AddPid(sub, p.pid())
-	if err != nil && !p.rootlessCgroups {
-		// Failed to join the configured cgroup, fall back to container init's cgroup
-		// unless sub-cgroup is explicitly requested.
-		if sub == "" {
-			if path := p.initProcessCgroupPath(); path != "" {
-				logrus.Debugf("adding pid %d to configured cgroup failed (%v), will join container init cgroup %q",
-					p.pid(), err, path)
-				// NOTE: path is not guaranteed to exist because we didn't pause the container.
-				err = cgroups.WriteCgroupProc(path, p.pid())
-			}
-		}
-		if err != nil {
-			return fmt.Errorf("error adding pid %d to cgroups: %w", p.pid(), err)
-		}
+	if err == nil {
+		return nil
 	}
 
+	// Failed to join the configured cgroup. Fall back to container init's cgroup
+	// unless sub-cgroup is explicitly requested.
+	var path string
+	if sub != "" {
+		goto fail
+	}
+	path = p.initProcessCgroupPath()
+	if path == "" {
+		goto fail
+	}
+	logrus.Debugf("adding pid %d to configured cgroup failed (%v), will join container init cgroup %q", p.pid(), err, path)
+	// NOTE: path is not guaranteed to exist because we didn't pause the container.
+	err = cgroups.WriteCgroupProc(path, p.pid())
+	if err != nil {
+		goto fail
+	}
 	return nil
+
+fail:
+	if p.rootlessCgroups {
+		// Ignore cgroup join errors when rootless.
+		return nil
+	}
+
+	return fmt.Errorf("error adding pid %d to cgroups: %w", p.pid(), err)
 }
 
 func (p *setnsProcess) addIntoCgroup() error {

From 8c8c41675cf78e0849f548c82a6edc949fe3a80a Mon Sep 17 00:00:00 2001
From: Kir Kolyshkin <kolyshkin@gmail.com>
Date: Thu, 5 Feb 2026 17:41:55 -0800
Subject: [PATCH 3/4] libct: prepareCgroupFD: fall back to container init
 cgroup

Previously, when prepareCgroupFD could not open the container's cgroup
(as configured in config.json and saved to state.json), it returned
a fatal error, as we presumed a container can't exist without its own
cgroup.

Apparently, it can. When a container is configured without a cgroupns
(i.e. it uses the host's cgroups), and /sys/fs/cgroup is mounted
read-write, a rootful container's init can move itself to an entirely
different cgroup (even a new one that it just created), and then the
original container cgroup is removed by the kernel (or systemd?) as
it has no processes left. By the way, from the systemd point of view
the container is gone. And yet it is still there, and users want
runc exec to work!

And it worked, thanks to the "let's try container init's cgroup"
fallback added by commit c91fe9aebac83 ("cgroup2: exec: join the
cgroup of the init process on EBUSY"). The fallback was added for
an entirely different reason, but it happened to work in this very
case, too.

This behavior was broken with the introduction of CLONE_INTO_CGROUP
support.

While it is debatable whether this is a valid scenario when a container
moves itself into a different cgroup, this very setup is used by e.g.
buildkitd running in a privileged kubernetes container (see issue 5089).

To restore the way things are expected to work, add the same "try
container init's cgroup" fallback into prepareCgroupFD.

While at it, simplify the code flow.

Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
(cherry picked from commit 6c07a37a585db26a3117683456c9c06f97dc7485)
Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
---
 libcontainer/process_linux.go | 33 ++++++++++++++++++++++++++++-----
 1 file changed, 28 insertions(+), 5 deletions(-)

diff --git a/libcontainer/process_linux.go b/libcontainer/process_linux.go
index 6251b326bfd..29d00ae063e 100644
--- a/libcontainer/process_linux.go
+++ b/libcontainer/process_linux.go
@@ -353,6 +353,8 @@ func (p *setnsProcess) addIntoCgroup() error {
 // to join cgroup early, in p.cmd.Start. Returns an *os.File which
 // must be closed by the caller after p.Cmd.Start return.
 func (p *setnsProcess) prepareCgroupFD() (*os.File, error) {
+	const openFlags = unix.O_PATH | unix.O_DIRECTORY | unix.O_CLOEXEC
+
 	if !cgroups.IsCgroup2UnifiedMode() {
 		return nil, nil
 	}
@@ -370,14 +372,28 @@ func (p *setnsProcess) prepareCgroupFD() (*os.File, error) {
 		return nil, fmt.Errorf("bad sub cgroup path: %s", sub)
 	}
 
-	fd, err := cgroups.OpenFile(base, sub, unix.O_PATH|unix.O_DIRECTORY|unix.O_CLOEXEC)
+	fd, err := cgroups.OpenFile(base, sub, openFlags)
+	if err == nil {
+		goto success
+	}
+	// Failed to open the configured cgroup. Fall back to container init's cgroup
+	// unless sub-cgroup is explicitly requested. The fallback logic should be
+	// the same as in addIntoCgroupV2.
+	if sub != "" {
+		goto fail
+	}
+	cgroup = p.initProcessCgroupPath()
+	if cgroup == "" {
+		goto fail
+	}
+	logrus.Debugf("failed to open configured cgroup (%v), will open container init cgroup %q", err, cgroup)
+	// NOTE: path is not guaranteed to exist because we didn't pause the container.
+	fd, err = cgroups.OpenFile(cgroup, "", openFlags)
 	if err != nil {
-		if p.rootlessCgroups {
-			return nil, nil
-		}
-		return nil, fmt.Errorf("can't open cgroup: %w", err)
+		goto fail
 	}
 
+success:
 	logrus.Debugf("using CLONE_INTO_CGROUP %q", cgroup)
 	if p.cmd.SysProcAttr == nil {
 		p.cmd.SysProcAttr = &syscall.SysProcAttr{}
@@ -386,6 +402,13 @@ func (p *setnsProcess) prepareCgroupFD() (*os.File, error) {
 	p.cmd.SysProcAttr.CgroupFD = int(fd.Fd())
 
 	return fd, nil
+
+fail:
+	// Ignore cgroup join error for rootless.
+	if p.rootlessCgroups {
+		return nil, nil
+	}
+	return nil, fmt.Errorf("can't open cgroup: %w", err)
 }
 
 // startWithCgroupFD starts a process via clone3 with CLONE_INTO_CGROUP,

From 9ac76a086ebed93112d213f695c7a07674cbe9b9 Mon Sep 17 00:00:00 2001
From: Kir Kolyshkin <kolyshkin@gmail.com>
Date: Fri, 6 Feb 2026 15:58:06 -0800
Subject: [PATCH 4/4] tests/int: add "runc exec [init changes cgroup]"

Add a test case to reproduce runc issue 5089.

Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
(cherry picked from commit 1fdbab8107c61876eb69f88730497d250d67e0e6)
Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
---
 tests/integration/exec.bats | 35 +++++++++++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)

diff --git a/tests/integration/exec.bats b/tests/integration/exec.bats
index 1426e198269..93ba6e90806 100644
--- a/tests/integration/exec.bats
+++ b/tests/integration/exec.bats
@@ -335,6 +335,41 @@ function check_exec_debug() {
 	[ "$status" -eq 0 ]
 }
 
+# https://github.com/opencontainers/runc/issues/5089
+@test "runc exec [init changes cgroup]" {
+	requires root cgroups_v2
+
+	NEW_CGROUP_REL=/runc-tst-$$
+	NEW_CGROUP=/sys/fs/cgroup$NEW_CGROUP_REL
+	mkdir $NEW_CGROUP
+
+	# The container is placed into a $CGROUP_V2_PATH cgroup.
+	set_cgroups_path
+	# And upon the start it moves itself into $NEW_CGROUP.
+	set_cgroup_mount_writable
+	update_config '	  .linux.namespaces -= [{"type": "cgroup"}]
+			| .process.args = ["sh", "-c", "echo 1 > '$NEW_CGROUP'/cgroup.procs && exec sleep 1h"]'
+
+	runc run -d --console-socket "$CONSOLE_SOCKET" test_busybox
+	[ $status -eq 0 ]
+	testcontainer test_busybox running
+	sleep 1
+	# Remove the original container cgroup. If systemd cgroup manager is used by runc,
+	# the cgroup might have already been deleted by systemd, so we ignore rmdir errors.
+	rmdir "$CGROUP_V2_PATH" || true
+	test -d "$CGROUP_V2_PATH" && false
+
+	# Test that runc exec is able to fall back to the container's init cgroup
+	# even if the original cgroup is gone.
+	runc exec test_busybox cat /proc/self/cgroup
+	[ $status -eq 0 ]
+	[ "$output" = "0::$NEW_CGROUP_REL" ]
+
+	# Cleanup.
+	runc delete -f test_busybox
+	rmdir "$NEW_CGROUP"
+}
+
 @test "runc exec [execve error]" {
 	cat <<EOF >rootfs/run.sh
 #!/mmnnttbb foo bar