From 0e9a3358f84813120b38f01cd8ad153ddf3a1694 Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Fri, 4 Aug 2023 11:59:31 +1000 Subject: [PATCH] nsexec: migrate memfd /proc/self/exe logic to Go code This allows us to reduce the amount of C code in runc quite substantially, as well as removing a whole execve(2) from the nsexec path because we no longer spawn "runc init" only to re-exec "runc init" after doing the clone. Signed-off-by: Aleksa Sarai --- libcontainer/container_linux.go | 74 +++- libcontainer/dmz/cloned_binary_linux.go | 192 ++++++++ libcontainer/nsenter/cloned_binary.c | 567 ------------------------ libcontainer/nsenter/nsexec.c | 11 - libcontainer/process.go | 12 + libcontainer/system/linux.go | 28 ++ 6 files changed, 286 insertions(+), 598 deletions(-) create mode 100644 libcontainer/dmz/cloned_binary_linux.go delete mode 100644 libcontainer/nsenter/cloned_binary.c diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go index c941239b841..9eb72f86600 100644 --- a/libcontainer/container_linux.go +++ b/libcontainer/container_linux.go @@ -24,6 +24,7 @@ import ( "github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/dmz" "github.com/opencontainers/runc/libcontainer/intelrdt" "github.com/opencontainers/runc/libcontainer/system" "github.com/opencontainers/runc/libcontainer/utils" @@ -316,6 +317,8 @@ func (c *Container) start(process *Process) (retErr error) { if err != nil { return fmt.Errorf("unable to create new parent process: %w", err) } + // We do not need the cloned binaries once the process is spawned. 
+ defer process.closeClonedExes() logsDone := parent.forwardChildLogs() if logsDone != nil { @@ -454,24 +457,30 @@ func (c *Container) newParentProcess(p *Process) (parentProcess, error) { } logFilePair := filePair{parentLogPipe, childLogPipe} - cmd := c.commandTemplate(p, childInitPipe, childLogPipe) - if !p.Init { - return c.newSetnsProcess(p, cmd, messageSockPair, logFilePair) + // Make sure we use a new safe copy of /proc/self/exe each time this is + // called, to make sure that if a container manages to overwrite the file + // it cannot affect other containers on the system. For runc, this code + // will only ever be called once, but libcontainer users might call this + // more than once. + p.closeClonedExes() + var ( + exePath string + safeExe *os.File + ) + if dmz.IsSelfExeCloned() { + // /proc/self/exe is already a cloned binary -- no need to do anything + logrus.Debug("skipping binary cloning -- /proc/self/exe is already cloned!") + exePath = "/proc/self/exe" + } else { + safeExe, err = dmz.CloneSelfExe(c.root) + if err != nil { + return nil, fmt.Errorf("unable to create safe /proc/self/exe clone for runc init: %w", err) + } + exePath = "/proc/self/fd/" + strconv.Itoa(int(safeExe.Fd())) + p.clonedExes = append(p.clonedExes, safeExe) } - // We only set up fifoFd if we're not doing a `runc exec`. The historic - // reason for this is that previously we would pass a dirfd that allowed - // for container rootfs escape (and not doing it in `runc exec` avoided - // that problem), but we no longer do that. However, there's no need to do - // this for `runc exec` so we just keep it this way to be safe. 
- if err := c.includeExecFifo(cmd); err != nil { - return nil, fmt.Errorf("unable to setup exec fifo: %w", err) - } - return c.newInitProcess(p, cmd, messageSockPair, logFilePair) -} - -func (c *Container) commandTemplate(p *Process, childInitPipe *os.File, childLogPipe *os.File) *exec.Cmd { - cmd := exec.Command("/proc/self/exe", "init") + cmd := exec.Command(exePath, "init") cmd.Args[0] = os.Args[0] cmd.Stdin = p.Stdin cmd.Stdout = p.Stdout @@ -501,13 +510,38 @@ func (c *Container) commandTemplate(p *Process, childInitPipe *os.File, childLog cmd.Env = append(cmd.Env, "_LIBCONTAINER_LOGLEVEL="+p.LogLevel) } - // NOTE: when running a container with no PID namespace and the parent process spawning the container is - // PID1 the pdeathsig is being delivered to the container's init process by the kernel for some reason - // even with the parent still running. + if safeExe != nil { + // Due to a Go stdlib bug, we need to add safeExe to the set of + // ExtraFiles otherwise it is possible for the stdlib to clobber the fd + // during forkAndExecInChild1 and replace it with some other file that + // might be malicious. This is less than ideal (because the descriptor + // will be non-O_CLOEXEC) however we have protections in "runc init" to + // stop us from leaking extra file descriptors. + // + // See <https://github.com/golang/go/issues/61751>. + cmd.ExtraFiles = append(cmd.ExtraFiles, safeExe) + } + + // NOTE: when running a container with no PID namespace and the parent + // process spawning the container is PID1 the pdeathsig is being + // delivered to the container's init process by the kernel for some + // reason even with the parent still running. if c.config.ParentDeathSignal > 0 { cmd.SysProcAttr.Pdeathsig = unix.Signal(c.config.ParentDeathSignal) } - return cmd + + if p.Init { + // We only set up fifoFd if we're not doing a `runc exec`. 
The historic + // reason for this is that previously we would pass a dirfd that allowed + // for container rootfs escape (and not doing it in `runc exec` avoided + // that problem), but we no longer do that. However, there's no need to do + // this for `runc exec` so we just keep it this way to be safe. + if err := c.includeExecFifo(cmd); err != nil { + return nil, fmt.Errorf("unable to setup exec fifo: %w", err) + } + return c.newInitProcess(p, cmd, messageSockPair, logFilePair) + } + return c.newSetnsProcess(p, cmd, messageSockPair, logFilePair) } // shouldSendMountSources says whether the child process must setup bind mounts with diff --git a/libcontainer/dmz/cloned_binary_linux.go b/libcontainer/dmz/cloned_binary_linux.go new file mode 100644 index 00000000000..c962ea6fcba --- /dev/null +++ b/libcontainer/dmz/cloned_binary_linux.go @@ -0,0 +1,192 @@ +package dmz + +import ( + "errors" + "fmt" + "io" + "os" + + "github.com/sirupsen/logrus" + "golang.org/x/sys/unix" + + "github.com/opencontainers/runc/libcontainer/system" +) + +type SealFunc func(**os.File) error + +var ( + _ SealFunc = sealMemfd + _ SealFunc = sealFile +) + +const baseMemfdSeals = unix.F_SEAL_SEAL | unix.F_SEAL_SHRINK | unix.F_SEAL_GROW | unix.F_SEAL_WRITE + +func sealMemfd(f **os.File) error { + if err := (*f).Chmod(0o511); err != nil { + return err + } + // Try to set the newer memfd sealing flags, but we ignore + // errors because they are not needed and we want to continue + // to work on older kernels. + fd := (*f).Fd() + // F_SEAL_FUTURE_WRITE -- Linux 5.1 + _, _ = unix.FcntlInt(fd, unix.F_ADD_SEALS, unix.F_SEAL_FUTURE_WRITE) + // F_SEAL_EXEC -- Linux 6.3 + const F_SEAL_EXEC = 0x20 //nolint:revive // this matches the unix.* name + _, _ = unix.FcntlInt(fd, unix.F_ADD_SEALS, F_SEAL_EXEC) + // Apply all original memfd seals. 
+ _, err := unix.FcntlInt(fd, unix.F_ADD_SEALS, baseMemfdSeals) + return os.NewSyscallError("fcntl(F_ADD_SEALS)", err) +} + +// Memfd creates a sealable executable memfd (supported since Linux 3.17). +func Memfd(comment string) (*os.File, SealFunc, error) { + file, err := system.ExecutableMemfd("runc_cloned:"+comment, unix.MFD_ALLOW_SEALING|unix.MFD_CLOEXEC) + return file, sealMemfd, err +} + +func sealFile(f **os.File) error { + if err := (*f).Chmod(0o511); err != nil { + return err + } + // When sealing an O_TMPFILE-style descriptor we need to + // re-open the path as O_PATH to clear the existing write + // handle we have. + opath, err := os.OpenFile(fmt.Sprintf("/proc/self/fd/%d", (*f).Fd()), unix.O_PATH|unix.O_CLOEXEC, 0) + if err != nil { + return fmt.Errorf("reopen tmpfile: %w", err) + } + _ = (*f).Close() + *f = opath + return nil +} + +// otmpfile creates an open(O_TMPFILE) file in the given directory (supported +// since Linux 3.11). +func otmpfile(dir string) (*os.File, SealFunc, error) { + file, err := os.OpenFile(dir, unix.O_TMPFILE|unix.O_RDWR|unix.O_EXCL|unix.O_CLOEXEC, 0o700) + if err != nil { + return nil, nil, fmt.Errorf("O_TMPFILE creation failed: %w", err) + } + // Make sure we actually got an unlinked O_TMPFILE descriptor. + var stat unix.Stat_t + if err := unix.Fstat(int(file.Fd()), &stat); err != nil { + file.Close() + return nil, nil, fmt.Errorf("cannot fstat O_TMPFILE fd: %w", err) + } else if stat.Nlink != 0 { + file.Close() + return nil, nil, errors.New("O_TMPFILE has non-zero nlink") + } + return file, sealFile, err +} + +// mktemp creates a classic unlinked file in the given directory. +func mktemp(dir string) (*os.File, SealFunc, error) { + file, err := os.CreateTemp(dir, "runc.") + if err != nil { + return nil, nil, err + } + // Unlink the file and verify it was unlinked. 
+ if err := os.Remove(file.Name()); err != nil { + return nil, nil, fmt.Errorf("unlinking classic tmpfile: %w", err) + } + var stat unix.Stat_t + if err := unix.Fstat(int(file.Fd()), &stat); err != nil { + return nil, nil, fmt.Errorf("cannot fstat classic tmpfile: %w", err) + } else if stat.Nlink != 0 { + return nil, nil, fmt.Errorf("classic tmpfile %s has non-zero nlink after unlink", file.Name()) + } + return file, sealFile, err +} + +func getSealableFile(comment, tmpDir string) (file *os.File, sealFn SealFunc, err error) { + // First, try an executable memfd (supported since Linux 3.17). + file, sealFn, err = Memfd(comment) + if err == nil { + return + } + logrus.Debugf("memfd cloned binary failed, falling back to O_TMPFILE: %v", err) + // Try to fallback to O_TMPFILE (supported since Linux 3.11). + file, sealFn, err = otmpfile(tmpDir) + if err == nil { + return + } + logrus.Debugf("O_TMPFILE cloned binary failed, falling back to mktemp(): %v", err) + // Finally, try a classic unlinked temporary file. + file, sealFn, err = mktemp(tmpDir) + if err == nil { + return + } + return nil, nil, fmt.Errorf("could not create sealable file for cloned binary: %w", err) +} + +// CloneBinary creates a "sealed" clone of a given binary, which can be used to +// thwart attempts by the container process to gain access to host binaries +// through procfs magic-link shenanigans. For more details on why this is +// necessary, see CVE-2019-5736. 
+func CloneBinary(src io.Reader, size int64, name, tmpDir string) (*os.File, error) { + logrus.Debugf("cloning %s binary (%d bytes)", name, size) + file, sealFn, err := getSealableFile(name, tmpDir) + if err != nil { + return nil, err + } + copied, err := io.Copy(file, src) + if err != nil { + file.Close() + return nil, fmt.Errorf("copy binary: %w", err) + } else if copied != size { + file.Close() + return nil, fmt.Errorf("copied binary size mismatch: %d != %d", copied, size) + } + if err := sealFn(&file); err != nil { + file.Close() + return nil, fmt.Errorf("could not seal fd: %w", err) + } + return file, nil +} + +// IsCloned returns whether the given file can be guaranteed to be a safe exe. +func IsCloned(exe *os.File) bool { + seals, err := unix.FcntlInt(exe.Fd(), unix.F_GET_SEALS, 0) + if err != nil { + // /proc/self/exe is probably not a memfd + logrus.Debugf("F_GET_SEALS on %s failed: %v", exe.Name(), err) + return false + } + // The memfd must have all of the base seals applied. + logrus.Debugf("checking %s memfd seals: 0x%x", exe.Name(), seals) + return seals&baseMemfdSeals == baseMemfdSeals +} + +// CloneSelfExe makes a clone of the current process's binary (through +// /proc/self/exe). This binary can then be used for "runc init" in order to +// make sure the container process can never resolve the original runc binary. +// For more details on why this is necessary, see CVE-2019-5736. +func CloneSelfExe(tmpDir string) (*os.File, error) { + selfExe, err := os.Open("/proc/self/exe") + if err != nil { + return nil, fmt.Errorf("opening current binary: %w", err) + } + defer selfExe.Close() + + stat, err := selfExe.Stat() + if err != nil { + return nil, fmt.Errorf("checking /proc/self/exe size: %w", err) + } + size := stat.Size() + + return CloneBinary(selfExe, size, "/proc/self/exe", tmpDir) +} + +// IsSelfExeCloned returns whether /proc/self/exe is a cloned binary that can +// be guaranteed to be safe. This means that it must be a sealed memfd. 
Other +// types of clones cannot be completely verified as safe. +func IsSelfExeCloned() bool { + selfExe, err := os.Open("/proc/self/exe") + if err != nil { + logrus.Debugf("open /proc/self/exe failed: %v", err) + return false + } + defer selfExe.Close() + return IsCloned(selfExe) +} diff --git a/libcontainer/nsenter/cloned_binary.c b/libcontainer/nsenter/cloned_binary.c deleted file mode 100644 index a7f992fddd7..00000000000 --- a/libcontainer/nsenter/cloned_binary.c +++ /dev/null @@ -1,567 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 OR LGPL-2.1-or-later -/* - * Copyright (C) 2019 Aleksa Sarai - * Copyright (C) 2019 SUSE LLC - * - * This work is dual licensed under the following licenses. You may use, - * redistribute, and/or modify the work under the conditions of either (or - * both) licenses. - * - * === Apache-2.0 === - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * - * === LGPL-2.1-or-later === - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with this library. If not, see - * . - * - */ - -#define _GNU_SOURCE -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "ipc.h" -#include "log.h" - -/* Use our own wrapper for memfd_create. */ -#ifndef SYS_memfd_create -# ifdef __NR_memfd_create -# define SYS_memfd_create __NR_memfd_create -# else -/* These values come from . */ -# warning "libc is outdated -- using hard-coded SYS_memfd_create" -# if defined(__x86_64__) -# define SYS_memfd_create 319 -# elif defined(__i386__) -# define SYS_memfd_create 356 -# elif defined(__ia64__) -# define SYS_memfd_create 1340 -# elif defined(__arm__) -# define SYS_memfd_create 385 -# elif defined(__aarch64__) -# define SYS_memfd_create 279 -# elif defined(__ppc__) || defined(__PPC64__) || defined(__powerpc64__) -# define SYS_memfd_create 360 -# elif defined(__s390__) || defined(__s390x__) -# define SYS_memfd_create 350 -# else -# warning "unknown architecture -- cannot hard-code SYS_memfd_create" -# endif -# endif -#endif - -/* memfd_create(2) flags -- copied from . */ -#ifndef MFD_CLOEXEC -# define MFD_CLOEXEC 0x0001U -# define MFD_ALLOW_SEALING 0x0002U -#endif -#ifndef MFD_EXEC -# define MFD_EXEC 0x0010U -#endif - -int memfd_create(const char *name, unsigned int flags) -{ -#ifdef SYS_memfd_create - return syscall(SYS_memfd_create, name, flags); -#else - errno = ENOSYS; - return -1; -#endif -} - -/* This comes directly from . 
*/ -#ifndef F_LINUX_SPECIFIC_BASE -# define F_LINUX_SPECIFIC_BASE 1024 -#endif -#ifndef F_ADD_SEALS -# define F_ADD_SEALS (F_LINUX_SPECIFIC_BASE + 9) -# define F_GET_SEALS (F_LINUX_SPECIFIC_BASE + 10) -#endif -#ifndef F_SEAL_SEAL -# define F_SEAL_SEAL 0x0001 /* prevent further seals from being set */ -# define F_SEAL_SHRINK 0x0002 /* prevent file from shrinking */ -# define F_SEAL_GROW 0x0004 /* prevent file from growing */ -# define F_SEAL_WRITE 0x0008 /* prevent writes */ -#endif -#ifndef F_SEAL_FUTURE_WRITE -# define F_SEAL_FUTURE_WRITE 0x0010 /* prevent future writes while mapped */ -#endif -#ifndef F_SEAL_EXEC -# define F_SEAL_EXEC 0x0020 /* prevent chmod modifying exec bits */ -#endif - -#define CLONED_BINARY_ENV "_LIBCONTAINER_CLONED_BINARY" -#define RUNC_MEMFD_COMMENT "runc_cloned:/proc/self/exe" -/* - * There are newer memfd seals (such as F_SEAL_FUTURE_WRITE and F_SEAL_EXEC), - * which we use opportunistically. However, this set is the original set of - * memfd seals, and we require them all to be set to trust our /proc/self/exe - * if it is a memfd. - */ -#define RUNC_MEMFD_MIN_SEALS \ - (F_SEAL_SEAL | F_SEAL_SHRINK | F_SEAL_GROW | F_SEAL_WRITE) - -static void *must_realloc(void *ptr, size_t size) -{ - void *old = ptr; - do { - ptr = realloc(old, size); - } while (!ptr); - return ptr; -} - -/* - * Verify whether we are currently in a self-cloned program (namely, is - * /proc/self/exe a memfd). F_GET_SEALS will only succeed for memfds (or rather - * for shmem files), and we want to be sure it's actually sealed. - */ -static int is_self_cloned(void) -{ - int fd, seals = 0, is_cloned = false; - struct stat statbuf = { }; - struct statfs fsbuf = { }; - - fd = open("/proc/self/exe", O_RDONLY | O_CLOEXEC); - if (fd < 0) { - write_log(ERROR, "cannot open runc binary for reading: open /proc/self/exe: %m"); - return -ENOTRECOVERABLE; - } - - /* - * Is the binary a fully-sealed memfd? 
We don't need CLONED_BINARY_ENV for - * this, because you cannot write to a sealed memfd no matter what. - */ - seals = fcntl(fd, F_GET_SEALS); - if (seals >= 0) { - write_log(DEBUG, "checking /proc/self/exe memfd seals: 0x%x", seals); - is_cloned = (seals & RUNC_MEMFD_MIN_SEALS) == RUNC_MEMFD_MIN_SEALS; - if (is_cloned) - goto out; - } - - /* - * All other forms require CLONED_BINARY_ENV, since they are potentially - * writeable (or we can't tell if they're fully safe) and thus we must - * check the environment as an extra layer of defence. - */ - if (!getenv(CLONED_BINARY_ENV)) { - is_cloned = false; - goto out; - } - - /* - * Is the binary on a read-only filesystem? We can't detect bind-mounts in - * particular (in-kernel they are identical to regular mounts) but we can - * at least be sure that it's read-only. In addition, to make sure that - * it's *our* bind-mount we check CLONED_BINARY_ENV. - */ - if (fstatfs(fd, &fsbuf) >= 0) - is_cloned |= (fsbuf.f_flags & MS_RDONLY); - - /* - * Okay, we're a tmpfile -- or we're currently running on RHEL <=7.6 - * which appears to have a borked backport of F_GET_SEALS. Either way, - * having a file which has no hardlinks indicates that we aren't using - * a host-side "runc" binary and this is something that a container - * cannot fake (because unlinking requires being able to resolve the - * path that you want to unlink). - */ - if (fstat(fd, &statbuf) >= 0) - is_cloned |= (statbuf.st_nlink == 0); - -out: - close(fd); - return is_cloned; -} - -/* Read a given file into a new buffer, and providing the length. 
*/ -static char *read_file(char *path, size_t *length) -{ - int fd; - char buf[4096], *copy = NULL; - - if (!length) - return NULL; - - fd = open(path, O_RDONLY | O_CLOEXEC); - if (fd < 0) - return NULL; - - *length = 0; - for (;;) { - ssize_t n; - - n = read(fd, buf, sizeof(buf)); - if (n < 0) - goto error; - if (!n) - break; - - copy = must_realloc(copy, (*length + n) * sizeof(*copy)); - memcpy(copy + *length, buf, n); - *length += n; - } - close(fd); - return copy; - -error: - close(fd); - free(copy); - return NULL; -} - -/* - * A poor-man's version of "xargs -0". Basically parses a given block of - * NUL-delimited data, within the given length and adds a pointer to each entry - * to the array of pointers. - */ -static int parse_xargs(char *data, int data_length, char ***output) -{ - int num = 0; - char *cur = data; - - if (!data || *output != NULL) - return -1; - - while (cur < data + data_length) { - num++; - *output = must_realloc(*output, (num + 1) * sizeof(**output)); - (*output)[num - 1] = cur; - cur += strlen(cur) + 1; - } - (*output)[num] = NULL; - return num; -} - -/* - * "Parse" out argv from /proc/self/cmdline. - * This is necessary because we are running in a context where we don't have a - * main() that we can just get the arguments from. - */ -static int fetchve(char ***argv) -{ - char *cmdline = NULL; - size_t cmdline_size; - - cmdline = read_file("/proc/self/cmdline", &cmdline_size); - if (!cmdline) - goto error; - - if (parse_xargs(cmdline, cmdline_size, argv) <= 0) - goto error; - - return 0; - -error: - free(cmdline); - return -EINVAL; -} - -enum { - EFD_NONE = 0, - EFD_MEMFD, - EFD_FILE, -}; - -/* - * This comes from . We can't hard-code __O_TMPFILE because it - * changes depending on the architecture. If we don't have O_TMPFILE we always - * have the mkostemp(3) fallback. 
- */ -#ifndef O_TMPFILE -# if defined(__O_TMPFILE) && defined(O_DIRECTORY) -# define O_TMPFILE (__O_TMPFILE | O_DIRECTORY) -# endif -#endif - -static inline bool is_memfd_unsupported_error(int err) -{ - /* - * - ENOSYS is obviously an "unsupported" error. - * - * - EINVAL could be hit if MFD_EXEC is not supported (pre-6.3 kernel), - * but it can also be hit if vm.memfd_noexec=2 (in kernels without - * [1] applied) and the flags does not contain MFD_EXEC. However, - * there was a bug in the original 6.3 implementation of - * vm.memfd_noexec=2, which meant that MFD_EXEC would work even in - * the "strict" mode. Because we try MFD_EXEC first, we won't get - * EINVAL in the vm.memfd_noexec=2 case (which means we don't need to - * figure out whether to log the message about memfd_create). - * - * - EACCES is returned in kernels that contain [1] in the - * vm.memfd_noexec=2 case. - * - * At time of writing, [1] is not in Linus's tree and it't not clear if - * it will be backported to stable, so what exact versions apply here - * is unclear. But the bug is present in 6.3-6.5 at the very least. - * - * [1]: https://lore.kernel.org/all/20230705063315.3680666-2-jeffxu@google.com/ - */ - if (err == EACCES) - write_log(INFO, - "memfd_create(MFD_EXEC) failed, possibly due to vm.memfd_noexec=2 -- falling back to less secure O_TMPFILE"); - return err == ENOSYS || err == EINVAL || err == EACCES; -} - -static int make_execfd(int *fdtype) -{ - int fd = -1; - char template[PATH_MAX] = { 0 }; - char *prefix = getenv("_LIBCONTAINER_STATEDIR"); - - if (!prefix || *prefix != '/') - prefix = "/tmp"; - if (snprintf(template, sizeof(template), "%s/runc.XXXXXX", prefix) < 0) - return -1; - - /* - * Now try memfd, it's much nicer than actually creating a file in STATEDIR - * since it's easily detected thanks to sealing and also doesn't require - * assumptions about STATEDIR. - */ - *fdtype = EFD_MEMFD; - /* - * On newer kernels we should set MFD_EXEC to indicate we need +x - * permissions. 
Otherwise an admin with vm.memfd_noexec=1 would subtly - * break runc. vm.memfd_noexec=2 is a little bit more complicated, see the - * comment in is_memfd_unsupported_error() -- the upshot is that doing it - * this way works, but only because of two overlapping bugs in the sysctl - * implementation. - */ - fd = memfd_create(RUNC_MEMFD_COMMENT, MFD_EXEC | MFD_CLOEXEC | MFD_ALLOW_SEALING); - if (fd < 0 && is_memfd_unsupported_error(errno)) - fd = memfd_create(RUNC_MEMFD_COMMENT, MFD_CLOEXEC | MFD_ALLOW_SEALING); - if (fd >= 0) - return fd; - if (!is_memfd_unsupported_error(errno)) - goto error; - -#ifdef O_TMPFILE - /* - * Try O_TMPFILE to avoid races where someone might snatch our file. Note - * that O_EXCL isn't actually a security measure here (since you can just - * fd re-open it and clear O_EXCL). - */ - *fdtype = EFD_FILE; - fd = open(prefix, O_TMPFILE | O_EXCL | O_RDWR | O_CLOEXEC, 0700); - if (fd >= 0) { - struct stat statbuf = { }; - bool working_otmpfile = false; - - /* - * open(2) ignores unknown O_* flags -- yeah, I was surprised when I - * found this out too. As a result we can't check for EINVAL. However, - * if we get nlink != 0 (or EISDIR) then we know that this kernel - * doesn't support O_TMPFILE. - */ - if (fstat(fd, &statbuf) >= 0) - working_otmpfile = (statbuf.st_nlink == 0); - - if (working_otmpfile) - return fd; - - /* Pretend that we got EISDIR since O_TMPFILE failed. */ - close(fd); - errno = EISDIR; - } - if (errno != EISDIR) - goto error; -#endif /* defined(O_TMPFILE) */ - - /* - * Our final option is to create a temporary file the old-school way, and - * then unlink it so that nothing else sees it by accident. 
- */ - *fdtype = EFD_FILE; - fd = mkostemp(template, O_CLOEXEC); - if (fd >= 0) { - if (unlink(template) >= 0) - return fd; - close(fd); - } - -error: - *fdtype = EFD_NONE; - return -1; -} - -static int seal_execfd(int *fd, int fdtype) -{ - switch (fdtype) { - case EFD_MEMFD:{ - /* - * Try to seal with newer seals, but we ignore errors because older - * kernels don't support some of them. For container security only - * RUNC_MEMFD_MIN_SEALS are strictly required, but the rest are - * nice-to-haves. We apply RUNC_MEMFD_MIN_SEALS at the end because it - * contains F_SEAL_SEAL. - */ - int __attribute__((unused)) _err1 = fcntl(*fd, F_ADD_SEALS, F_SEAL_FUTURE_WRITE); // Linux 5.1 - int __attribute__((unused)) _err2 = fcntl(*fd, F_ADD_SEALS, F_SEAL_EXEC); // Linux 6.3 - return fcntl(*fd, F_ADD_SEALS, RUNC_MEMFD_MIN_SEALS); - } - case EFD_FILE:{ - /* Need to re-open our pseudo-memfd as an O_PATH to avoid execve(2) giving -ETXTBSY. */ - int newfd; - char fdpath[PATH_MAX] = { 0 }; - - if (fchmod(*fd, 0100) < 0) - return -1; - - if (snprintf(fdpath, sizeof(fdpath), "/proc/self/fd/%d", *fd) < 0) - return -1; - - newfd = open(fdpath, O_PATH | O_CLOEXEC); - if (newfd < 0) - return -1; - - close(*fd); - *fd = newfd; - return 0; - } - default: - break; - } - return -1; -} - -static ssize_t fd_to_fd(int outfd, int infd) -{ - ssize_t total = 0; - char buffer[4096]; - - for (;;) { - ssize_t nread, nwritten = 0; - - nread = read(infd, buffer, sizeof(buffer)); - if (nread < 0) - return -1; - if (!nread) - break; - - do { - ssize_t n = write(outfd, buffer + nwritten, nread - nwritten); - if (n < 0) - return -1; - nwritten += n; - } while (nwritten < nread); - - total += nwritten; - } - - return total; -} - -static int clone_binary(void) -{ - int binfd, execfd; - struct stat statbuf = { }; - size_t sent = 0; - int fdtype = EFD_NONE; - - execfd = make_execfd(&fdtype); - if (execfd < 0 || fdtype == EFD_NONE) - return -ENOTRECOVERABLE; - - binfd = open("/proc/self/exe", O_RDONLY | 
O_CLOEXEC); - if (binfd < 0) - goto error; - - if (fstat(binfd, &statbuf) < 0) - goto error_binfd; - - while (sent < statbuf.st_size) { - int n = sendfile(execfd, binfd, NULL, statbuf.st_size - sent); - if (n < 0) { - /* sendfile can fail so we fallback to a dumb user-space copy. */ - n = fd_to_fd(execfd, binfd); - if (n < 0) - goto error_binfd; - } - sent += n; - } - close(binfd); - if (sent != statbuf.st_size) - goto error; - - if (seal_execfd(&execfd, fdtype) < 0) - goto error; - - return execfd; - -error_binfd: - close(binfd); -error: - close(execfd); - return -EIO; -} - -/* Get cheap access to the environment. */ -extern char **environ; - -int ensure_cloned_binary(void) -{ - int execfd; - char **argv = NULL; - - /* Check that we're not self-cloned, and if we are then bail. */ - int cloned = is_self_cloned(); - if (cloned > 0 || cloned == -ENOTRECOVERABLE) - return cloned; - - if (fetchve(&argv) < 0) - return -EINVAL; - - execfd = clone_binary(); - if (execfd < 0) - return -EIO; - - if (putenv(CLONED_BINARY_ENV "=1")) - goto error; - - fexecve(execfd, argv, environ); -error: - close(execfd); - return -ENOEXEC; -} diff --git a/libcontainer/nsenter/nsexec.c b/libcontainer/nsenter/nsexec.c index 17e0468c6af..9b10b232528 100644 --- a/libcontainer/nsenter/nsexec.c +++ b/libcontainer/nsenter/nsexec.c @@ -536,9 +536,6 @@ void join_namespaces(char *nslist) free(namespaces); } -/* Defined in cloned_binary.c. */ -extern int ensure_cloned_binary(void); - static inline int sane_kill(pid_t pid, int signum) { if (pid > 0) @@ -791,14 +788,6 @@ void nsexec(void) return; } - /* - * We need to re-exec if we are not in a cloned binary. This is necessary - * to ensure that containers won't be able to access the host binary - * through /proc/self/exe. See CVE-2019-5736. - */ - if (ensure_cloned_binary() < 0) - bail("could not ensure we are a cloned binary"); - /* * Inform the parent we're past initial setup. * For the other side of this, see initWaiter. 
diff --git a/libcontainer/process.go b/libcontainer/process.go index 4de4a9e75c2..d2c7bfcda36 100644 --- a/libcontainer/process.go +++ b/libcontainer/process.go @@ -49,6 +49,9 @@ type Process struct { // ExtraFiles specifies additional open files to be inherited by the container ExtraFiles []*os.File + // open handles to cloned binaries -- see dmz.CloneBinary for more details + clonedExes []*os.File + // Initial sizings for the console ConsoleWidth uint16 ConsoleHeight uint16 @@ -121,6 +124,15 @@ func (p Process) Signal(sig os.Signal) error { return p.ops.signal(sig) } +// closeClonedExes cleans up any existing cloned binaries associated with the +// Process. +func (p *Process) closeClonedExes() { + for _, exe := range p.clonedExes { + _ = exe.Close() + } + p.clonedExes = nil +} + // IO holds the process's STDIO type IO struct { Stdin io.WriteCloser diff --git a/libcontainer/system/linux.go b/libcontainer/system/linux.go index d2ad5cea229..5acaa4df384 100644 --- a/libcontainer/system/linux.go +++ b/libcontainer/system/linux.go @@ -4,10 +4,12 @@ package system import ( + "fmt" "os" "os/exec" "unsafe" + "github.com/sirupsen/logrus" "golang.org/x/sys/unix" ) @@ -102,3 +104,29 @@ func GetSubreaper() (int, error) { return int(i), nil } + +func ExecutableMemfd(comment string, flags int) (*os.File, error) { + // Try to use MFD_EXEC first. On pre-6.3 kernels we get -EINVAL for this + // flag. On post-6.3 kernels, with vm.memfd_noexec=1 this ensures we get an + // executable memfd. For vm.memfd_noexec=2 this is a bit more complicated. + // The original vm.memfd_noexec=2 implementation incorrectly silently + // allowed MFD_EXEC[1] -- this should be fixed in 6.6. On 6.6 and newer + // kernels, we will get -EACCES if we try to use MFD_EXEC with + // vm.memfd_noexec=2 (for 6.3-6.5, -EINVAL was the intended return value). 
+ // + // The upshot is we only need to retry without MFD_EXEC on -EINVAL because + // it just so happens that passing MFD_EXEC bypasses vm.memfd_noexec=2 on + // kernels where -EINVAL is actually a security denial. + memfd, err := unix.MemfdCreate(comment, flags|unix.MFD_EXEC) + if err == unix.EINVAL { + memfd, err = unix.MemfdCreate(comment, flags) + } + if err != nil { + if err == unix.EACCES { + logrus.Info("memfd_create(MFD_EXEC) failed, possibly due to vm.memfd_noexec=2 -- falling back to less secure O_TMPFILE") + } + err := os.NewSyscallError("memfd_create", err) + return nil, fmt.Errorf("failed to create executable memfd: %w", err) + } + return os.NewFile(uintptr(memfd), "/memfd:"+comment), nil +}