diff --git a/libcontainer/process_linux.go b/libcontainer/process_linux.go index 8114cc50bbf..29d00ae063e 100644 --- a/libcontainer/process_linux.go +++ b/libcontainer/process_linux.go @@ -19,10 +19,11 @@ import ( "syscall" "time" - "github.com/opencontainers/runtime-spec/specs-go" "github.com/sirupsen/logrus" "golang.org/x/sys/unix" + "github.com/opencontainers/runtime-spec/specs-go" + "github.com/opencontainers/cgroups" "github.com/opencontainers/cgroups/fs2" "github.com/opencontainers/runc/libcontainer/configs" @@ -274,32 +275,66 @@ func (p *setnsProcess) addIntoCgroupV1() error { return nil } +// initProcessCgroupPath returns container init's cgroup path, +// as read from /proc/PID/cgroup. Only works for cgroup v2. +// Returns empty string if the path can not be obtained. +// +// This is used by runc exec in these cases: +// +// 1. On cgroup v2 + nesting + domain controllers, adding to initial cgroup +// may fail with EBUSY (https://github.com/opencontainers/runc/issues/2356); +// +// 2. A container init process with no cgroupns and /sys/fs/cgroup rw access +// may move itself to any other cgroup, and the original cgroup will disappear. +func (p *setnsProcess) initProcessCgroupPath() string { + if p.initProcessPid == 0 || !cgroups.IsCgroup2UnifiedMode() { + return "" + } + + cg, err := cgroups.ParseCgroupFile("/proc/" + strconv.Itoa(p.initProcessPid) + "/cgroup") + if err != nil { + return "" + } + cgroup, ok := cg[""] + if !ok { + return "" + } + + return fs2.UnifiedMountpoint + cgroup +} + func (p *setnsProcess) addIntoCgroupV2() error { sub := p.process.SubCgroupPaths[""] err := p.manager.AddPid(sub, p.pid()) - if err != nil && !p.rootlessCgroups { - // On cgroup v2 + nesting + domain controllers, adding to initial cgroup may fail with EBUSY. - // https://github.com/opencontainers/runc/issues/2356#issuecomment-621277643 - // Try to join the cgroup of InitProcessPid, unless sub-cgroup is explicitly set. 
- if p.initProcessPid != 0 && sub == "" { - initProcCgroupFile := fmt.Sprintf("/proc/%d/cgroup", p.initProcessPid) - initCg, initCgErr := cgroups.ParseCgroupFile(initProcCgroupFile) - if initCgErr == nil { - if initCgPath, ok := initCg[""]; ok { - initCgDirpath := filepath.Join(fs2.UnifiedMountpoint, initCgPath) - logrus.Debugf("adding pid %d to cgroup failed (%v), attempting to join %s", - p.pid(), err, initCgDirpath) - // NOTE: initCgDirPath is not guaranteed to exist because we didn't pause the container. - err = cgroups.WriteCgroupProc(initCgDirpath, p.pid()) - } - } - } - if err != nil { - return fmt.Errorf("error adding pid %d to cgroups: %w", p.pid(), err) - } + if err == nil { + return nil } + // Failed to join the configured cgroup. Fall back to container init's cgroup + // unless sub-cgroup is explicitly requested. + var path string + if sub != "" { + goto fail + } + path = p.initProcessCgroupPath() + if path == "" { + goto fail + } + logrus.Debugf("adding pid %d to configured cgroup failed (%v), will join container init cgroup %q", p.pid(), err, path) + // NOTE: path is not guaranteed to exist because we didn't pause the container. + err = cgroups.WriteCgroupProc(path, p.pid()) + if err != nil { + goto fail + } return nil + +fail: + if p.rootlessCgroups { + // Ignore cgroup join errors when rootless. + return nil + } + + return fmt.Errorf("error adding pid %d to cgroups: %w", p.pid(), err) } func (p *setnsProcess) addIntoCgroup() error { @@ -318,6 +353,8 @@ func (p *setnsProcess) addIntoCgroup() error { // to join cgroup early, in p.cmd.Start. Returns an *os.File which // must be closed by the caller after p.Cmd.Start return. 
func (p *setnsProcess) prepareCgroupFD() (*os.File, error) { + const openFlags = unix.O_PATH | unix.O_DIRECTORY | unix.O_CLOEXEC + if !cgroups.IsCgroup2UnifiedMode() { return nil, nil } @@ -335,14 +372,28 @@ func (p *setnsProcess) prepareCgroupFD() (*os.File, error) { return nil, fmt.Errorf("bad sub cgroup path: %s", sub) } - fd, err := cgroups.OpenFile(base, sub, unix.O_PATH|unix.O_DIRECTORY|unix.O_CLOEXEC) + fd, err := cgroups.OpenFile(base, sub, openFlags) + if err == nil { + goto success + } + // Failed to open the configured cgroup. Fall back to container init's cgroup + // unless sub-cgroup is explicitly requested. The fallback logic should be + // the same as in addIntoCgroupV2. + if sub != "" { + goto fail + } + cgroup = p.initProcessCgroupPath() + if cgroup == "" { + goto fail + } + logrus.Debugf("failed to open configured cgroup (%v), will open container init cgroup %q", err, cgroup) + // NOTE: path is not guaranteed to exist because we didn't pause the container. + fd, err = cgroups.OpenFile(cgroup, "", openFlags) if err != nil { - if p.rootlessCgroups { - return nil, nil - } - return nil, fmt.Errorf("can't open cgroup: %w", err) + goto fail } +success: logrus.Debugf("using CLONE_INTO_CGROUP %q", cgroup) if p.cmd.SysProcAttr == nil { p.cmd.SysProcAttr = &syscall.SysProcAttr{} @@ -351,6 +402,13 @@ func (p *setnsProcess) prepareCgroupFD() (*os.File, error) { p.cmd.SysProcAttr.CgroupFD = int(fd.Fd()) return fd, nil + +fail: + // Ignore cgroup join error for rootless. 
+ if p.rootlessCgroups { + return nil, nil + } + return nil, fmt.Errorf("can't open cgroup: %w", err) } // startWithCgroupFD starts a process via clone3 with CLONE_INTO_CGROUP, diff --git a/tests/integration/exec.bats b/tests/integration/exec.bats index 1426e198269..93ba6e90806 100644 --- a/tests/integration/exec.bats +++ b/tests/integration/exec.bats @@ -335,6 +335,41 @@ function check_exec_debug() { [ "$status" -eq 0 ] } +# https://github.com/opencontainers/runc/issues/5089 +@test "runc exec [init changes cgroup]" { + requires root cgroups_v2 + + NEW_CGROUP_REL=/runc-tst-$$ + NEW_CGROUP=/sys/fs/cgroup$NEW_CGROUP_REL + mkdir $NEW_CGROUP + + # The container is placed into a $CGROUP_V2_PATH cgroup. + set_cgroups_path + # And upon the start it moves itself into $NEW_CGROUP. + set_cgroup_mount_writable + update_config ' .linux.namespaces -= [{"type": "cgroup"}] + | .process.args = ["sh", "-c", "echo 1 > '$NEW_CGROUP'/cgroup.procs && exec sleep 1h"]' + + runc run -d --console-socket "$CONSOLE_SOCKET" test_busybox + [ $status -eq 0 ] + testcontainer test_busybox running + sleep 1 + # Remove the original container cgroup. If systemd cgroup manager is used by runc, + # the cgroup might have already been deleted by systemd, so we ignore rmdir errors. + rmdir "$CGROUP_V2_PATH" || true + test -d "$CGROUP_V2_PATH" && false + + # Test that runc exec is able to fall back to container's init cgroup + # even if the original cgroup is gone. + runc exec test_busybox cat /proc/self/cgroup + [ $status -eq 0 ] + [ "$output" = "0::$NEW_CGROUP_REL" ] + + # Cleanup. + runc delete -f test_busybox + rmdir "$NEW_CGROUP" +} + @test "runc exec [execve error]" { cat <rootfs/run.sh #!/mmnnttbb foo bar