diff --git a/libcontainer/init_linux.go b/libcontainer/init_linux.go index b6fae2787c4..7e56f02a19b 100644 --- a/libcontainer/init_linux.go +++ b/libcontainer/init_linux.go @@ -90,7 +90,7 @@ func Init() { } // Normally, StartInitialization() never returns, meaning // if we are here, it had failed. - os.Exit(1) + os.Exit(255) } // Normally, this function does not return. If it returns, with or without an diff --git a/libcontainer/logs/logs.go b/libcontainer/logs/logs.go index 95deb0d6ca7..349a18ed383 100644 --- a/libcontainer/logs/logs.go +++ b/libcontainer/logs/logs.go @@ -4,10 +4,19 @@ import ( "bufio" "encoding/json" "io" + "os" "github.com/sirupsen/logrus" ) +// IsLogrusFd returns whether the provided fd matches the one that logrus is +// currently outputting to. This should only ever be called by UnsafeCloseFrom +// from `runc init`. +func IsLogrusFd(fd uintptr) bool { + file, ok := logrus.StandardLogger().Out.(*os.File) + return ok && file.Fd() == fd +} + func ForwardLogs(logPipe io.ReadCloser) chan error { done := make(chan error, 1) s := bufio.NewScanner(logPipe) diff --git a/libcontainer/setns_init_linux.go b/libcontainer/setns_init_linux.go index 0dd72f95e7f..a4288d241a2 100644 --- a/libcontainer/setns_init_linux.go +++ b/libcontainer/setns_init_linux.go @@ -15,6 +15,7 @@ import ( "github.com/opencontainers/runc/libcontainer/keys" "github.com/opencontainers/runc/libcontainer/seccomp" "github.com/opencontainers/runc/libcontainer/system" + "github.com/opencontainers/runc/libcontainer/utils" ) // linuxSetnsInit performs the container's initialization for running a new process @@ -139,5 +140,23 @@ func (l *linuxSetnsInit) Init() error { l.config.Args[0] = name return system.Fexecve(l.dmzExe.Fd(), l.config.Args, os.Environ()) } + // Close all file descriptors we are not passing to the container. This is + // necessary because the execve target could use internal runc fds as the + // execve path, potentially giving access to binary files from the host + // (which can then be opened by container processes, leading to container + // escapes). Note that because this operation will close any open file + // descriptors that are referenced by (*os.File) handles from underneath + // the Go runtime, we must not do any file operations after this point + // (otherwise the (*os.File) finaliser could close the wrong file). See + // CVE-2024-21626 for more information as to why this protection is + // necessary. + // + // This is not needed for runc-dmz, because the extra execve(2) step means + // that all O_CLOEXEC file descriptors have already been closed and thus + // the second execve(2) from runc-dmz cannot access internal file + // descriptors from runc. + if err := utils.UnsafeCloseFrom(l.config.PassedFilesCount + 3); err != nil { + return err + } return system.Exec(name, l.config.Args, os.Environ()) } diff --git a/libcontainer/standard_init_linux.go b/libcontainer/standard_init_linux.go index 3096d0d81ee..b5652c5a7da 100644 --- a/libcontainer/standard_init_linux.go +++ b/libcontainer/standard_init_linux.go @@ -282,5 +282,23 @@ func (l *linuxStandardInit) Init() error { l.config.Args[0] = name return system.Fexecve(l.dmzExe.Fd(), l.config.Args, os.Environ()) } + // Close all file descriptors we are not passing to the container. This is + // necessary because the execve target could use internal runc fds as the + // execve path, potentially giving access to binary files from the host + // (which can then be opened by container processes, leading to container + // escapes). Note that because this operation will close any open file + // descriptors that are referenced by (*os.File) handles from underneath + // the Go runtime, we must not do any file operations after this point + // (otherwise the (*os.File) finaliser could close the wrong file). See + // CVE-2024-21626 for more information as to why this protection is + // necessary. + // + // This is not needed for runc-dmz, because the extra execve(2) step means + // that all O_CLOEXEC file descriptors have already been closed and thus + // the second execve(2) from runc-dmz cannot access internal file + // descriptors from runc. + if err := utils.UnsafeCloseFrom(l.config.PassedFilesCount + 3); err != nil { + return err + } return system.Exec(name, l.config.Args, os.Environ()) } diff --git a/libcontainer/utils/utils_unix.go b/libcontainer/utils/utils_unix.go index a48221b000a..c90c2b60545 100644 --- a/libcontainer/utils/utils_unix.go +++ b/libcontainer/utils/utils_unix.go @@ -11,10 +11,13 @@ import ( "runtime" "strconv" "sync" + _ "unsafe" // for go:linkname securejoin "github.com/cyphar/filepath-securejoin" "github.com/sirupsen/logrus" "golang.org/x/sys/unix" + + "github.com/opencontainers/runc/libcontainer/logs" ) // EnsureProcHandle returns whether or not the given file handle is on procfs. @@ -53,14 +56,11 @@ func haveCloseRangeCloexec() bool { return haveCloseRangeCloexecBool } -// CloseExecFrom applies O_CLOEXEC to all file descriptors currently open for -// the process (except for those below the given fd value). -func CloseExecFrom(minFd int) error { - if haveCloseRangeCloexec() { - err := unix.CloseRange(uint(minFd), math.MaxUint, unix.CLOSE_RANGE_CLOEXEC) - return os.NewSyscallError("close_range", err) - } +type fdFunc func(fd int) +// fdRangeFrom calls the passed fdFunc for each file descriptor that is open in +// the current process. +func fdRangeFrom(minFd int, fn fdFunc) error { procSelfFd, closer := ProcThreadSelf("fd") defer closer() @@ -88,15 +88,73 @@ func CloseExecFrom(minFd int) error { if fd < minFd { continue } - // Intentionally ignore errors from unix.CloseOnExec -- the cases where - // this might fail are basically file descriptors that have already - // been closed (including and especially the one that was created when - // os.ReadDir did the "opendir" syscall). - unix.CloseOnExec(fd) + // Ignore the file descriptor we used for readdir, as it will be closed + // when we return. + if uintptr(fd) == fdDir.Fd() { + continue + } + // Run the closure. + fn(fd) } return nil } +// CloseExecFrom sets the O_CLOEXEC flag on all file descriptors greater or +// equal to minFd in the current process. +func CloseExecFrom(minFd int) error { + // Use close_range(CLOSE_RANGE_CLOEXEC) if possible. + if haveCloseRangeCloexec() { + err := unix.CloseRange(uint(minFd), math.MaxUint, unix.CLOSE_RANGE_CLOEXEC) + return os.NewSyscallError("close_range", err) + } + // Otherwise, fall back to the standard loop. + return fdRangeFrom(minFd, unix.CloseOnExec) +} + +//go:linkname runtime_IsPollDescriptor internal/poll.IsPollDescriptor + +// In order to make sure we do not close the internal epoll descriptors the Go +// runtime uses, we need to ensure that we skip descriptors that match +// "internal/poll".IsPollDescriptor. Yes, this is a Go runtime internal thing, +// unfortunately there's no other way to be sure we're only keeping the file +// descriptors the Go runtime needs. Hopefully nothing blows up doing this... +func runtime_IsPollDescriptor(fd uintptr) bool //nolint:revive + +// UnsafeCloseFrom closes all file descriptors greater or equal to minFd in the +// current process, except for those critical to Go's runtime (such as the +// netpoll management descriptors). +// +// NOTE: That this function is incredibly dangerous to use in most Go code, as +// closing file descriptors from underneath *os.File handles can lead to very +// bad behaviour (the closed file descriptor can be re-used and then any +// *os.File operations would apply to the wrong file). This function is only +// intended to be called from the last stage of runc init. +func UnsafeCloseFrom(minFd int) error { + // We cannot use close_range(2) even if it is available, because we must + // not close some file descriptors. + return fdRangeFrom(minFd, func(fd int) { + if runtime_IsPollDescriptor(uintptr(fd)) { + // These are the Go runtimes internal netpoll file descriptors. + // These file descriptors are operated on deep in the Go scheduler, + // and closing those files from underneath Go can result in panics. + // There is no issue with keeping them because they are not + // executable and are not useful to an attacker anyway. Also we + // don't have any choice. + return + } + if logs.IsLogrusFd(uintptr(fd)) { + // Do not close the logrus output fd. We cannot exec a pipe, and + // the contents are quite limited (very little attacker control, + // JSON-encoded) making shellcode attacks unlikely. + return + } + // There's nothing we can do about errors from close(2), and the + // only likely error to be seen is EBADF which indicates the fd was + // already closed (in which case, we got what we wanted). + _ = unix.Close(fd) + }) +} + // NewSockPair returns a new SOCK_STREAM unix socket pair. func NewSockPair(name string) (parent, child *os.File, err error) { fds, err := unix.Socketpair(unix.AF_LOCAL, unix.SOCK_STREAM|unix.SOCK_CLOEXEC, 0)