Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

nsexec: spring cleaning #3953

Closed
wants to merge 11 commits into from
9 changes: 9 additions & 0 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,15 @@ jobs:
with:
bats-version: 1.9.0

- name: procfs mount
run: |
# Get the list of mounts to help with debugging.
cat /proc/self/mounts
# Create a procfs mount that is not masked, to ensure that container
# procfs mounts will succeed.
sudo mkdir -p /tmp/.procfs-stashed-mount
sudo unshare -pf mount -t proc -o subset=pid proc /tmp/.procfs-stashed-mount
Comment on lines +81 to +88
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't know why this has become necessary with this PR (the "problem" commit is the /proc/self/exe cloning change) but this solves the issue and this is a CI-weirdness issue.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@cyphar Does that mean that runc won't work as-is inside "your usual" GHA, or is it only for the sake of testing?


- name: unit test
if: matrix.rootless != 'rootless'
run: sudo -E PATH="$PATH" -- make TESTFLAGS="${{ matrix.race }}" localunittest
Expand Down
4 changes: 2 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ unittest: runcimage
-t --privileged --rm \
-v /lib/modules:/lib/modules:ro \
-v $(CURDIR):/go/src/$(PROJECT) \
$(RUNC_IMAGE) make localunittest TESTFLAGS=$(TESTFLAGS)
$(RUNC_IMAGE) make localunittest TESTFLAGS="$(TESTFLAGS)"

localunittest: all
$(GO) test -timeout 3m -tags "$(BUILDTAGS)" $(TESTFLAGS) -v ./...
Expand All @@ -115,7 +115,7 @@ integration: runcimage
-t --privileged --rm \
-v /lib/modules:/lib/modules:ro \
-v $(CURDIR):/go/src/$(PROJECT) \
$(RUNC_IMAGE) make localintegration TESTPATH=$(TESTPATH)
$(RUNC_IMAGE) make localintegration TESTPATH="$(TESTPATH)"

localintegration: all
bats -t tests/integration$(TESTPATH)
Expand Down
4 changes: 2 additions & 2 deletions contrib/cmd/recvtty/recvtty.go
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ func handleSingle(path string, noStdin bool) error {
defer socket.Close()

// Get the master file descriptor from runC.
master, err := utils.RecvFd(socket)
master, err := utils.RecvFile(socket)
if err != nil {
return err
}
Expand Down Expand Up @@ -171,7 +171,7 @@ func handleNull(path string) error {
defer socket.Close()

// Get the master file descriptor from runC.
master, err := utils.RecvFd(socket)
master, err := utils.RecvFile(socket)
if err != nil {
return
}
Expand Down
2 changes: 1 addition & 1 deletion libcontainer/criu_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -1185,7 +1185,7 @@ func (c *Container) criuNotifications(resp *criurpc.CriuResp, process *Process,
defer master.Close()

// While we can access console.master, using the API is a good idea.
if err := utils.SendFd(process.ConsoleSocket, master.Name(), master.Fd()); err != nil {
if err := utils.SendFile(process.ConsoleSocket, master); err != nil {
return err
}
case "status-ready":
Expand Down
62 changes: 26 additions & 36 deletions libcontainer/init_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,9 @@ import (
"encoding/json"
"errors"
"fmt"
"io"
"net"
"os"
"runtime"
"runtime/debug"
"strconv"
"strings"
Expand Down Expand Up @@ -102,11 +102,8 @@ func StartInitialization() (retErr error) {
defer func() {
// We have an error during the initialization of the container's init,
// send it back to the parent process in the form of an initError.
if err := writeSync(pipe, procError); err != nil {
fmt.Fprintln(os.Stderr, retErr)
return
}
if err := utils.WriteJSON(pipe, &initError{Message: retErr.Error()}); err != nil {
ierr := initError{Message: retErr.Error()}
if err := writeSyncArg(pipe, procError, ierr); err != nil {
fmt.Fprintln(os.Stderr, retErr)
return
}
Expand Down Expand Up @@ -317,7 +314,6 @@ func setupConsole(socket *os.File, config *initConfig, mount bool) error {
if err != nil {
return err
}

// After we return from here, we don't need the console anymore.
defer pty.Close()

Expand All @@ -339,67 +335,61 @@ func setupConsole(socket *os.File, config *initConfig, mount bool) error {
}
}
// While we can access console.master, using the API is a good idea.
if err := utils.SendFd(socket, pty.Name(), pty.Fd()); err != nil {
if err := utils.SendRawFd(socket, pty.Name(), pty.Fd()); err != nil {
return err
}
runtime.KeepAlive(pty)

// Now, dup over all the things.
return dupStdio(slavePath)
}

// syncParentReady sends to the given pipe a JSON payload which indicates that
// the init is ready to Exec the child process. It then waits for the parent to
// indicate that it is cleared to Exec.
func syncParentReady(pipe io.ReadWriter) error {
func syncParentReady(pipe *os.File) error {
// Tell parent.
if err := writeSync(pipe, procReady); err != nil {
return err
}

// Wait for parent to give the all-clear.
return readSync(pipe, procRun)
}

// syncParentHooks sends to the given pipe a JSON payload which indicates that
// the parent should execute pre-start hooks. It then waits for the parent to
// indicate that it is cleared to resume.
func syncParentHooks(pipe io.ReadWriter) error {
func syncParentHooks(pipe *os.File) error {
// Tell parent.
if err := writeSync(pipe, procHooks); err != nil {
return err
}

// Wait for parent to give the all-clear.
return readSync(pipe, procResume)
}

// syncParentSeccomp sends to the given pipe a JSON payload which
// indicates that the parent should pick up the seccomp fd with pidfd_getfd()
// and send it to the seccomp agent over a unix socket. It then waits for
// the parent to indicate that it is cleared to resume and closes the seccompFd.
// If the seccompFd is -1, there isn't anything to sync with the parent, so it
// returns no error.
func syncParentSeccomp(pipe io.ReadWriter, seccompFd int) error {
if seccompFd == -1 {
// syncParentSeccomp sends the fd associated with the seccomp file descriptor
// to the parent, and wait for the parent to do pidfd_getfd() to grab a copy.
func syncParentSeccomp(pipe *os.File, seccompFd int) error {
if seccompFd >= 0 {

This comment was marked as resolved.

This comment was marked as resolved.

return nil
}

// Tell parent.
if err := writeSyncWithFd(pipe, procSeccomp, seccompFd); err != nil {
unix.Close(seccompFd)
defer unix.Close(seccompFd)

// Tell parent to grab our fd.
//
// Notably, we do not use writeSyncFile here because a container might have
// an SCMP_ACT_NOTIFY action on sendmsg(2) so we need to use the smallest
// possible number of system calls here because all of those syscalls
// cannot be used with SCMP_ACT_NOTIFY as a result (any syscall we use here
// before the parent gets the file descriptor would deadlock "runc init" if
// we allowed it for SCMP_ACT_NOTIFY). See seccomp.InitSeccomp() for more
// details.
if err := writeSyncArg(pipe, procSeccomp, seccompFd); err != nil {
return err
}

// Wait for parent to give the all-clear.
if err := readSync(pipe, procSeccompDone); err != nil {
unix.Close(seccompFd)
return fmt.Errorf("sync parent seccomp: %w", err)
}

if err := unix.Close(seccompFd); err != nil {
return fmt.Errorf("close seccomp fd: %w", err)
}

return nil
// Wait for parent to tell us they've grabbed the seccompfd.
return readSync(pipe, procSeccompDone)
}

// setupUser changes the groups, gid, and uid for the user inside the container
Expand Down
4 changes: 2 additions & 2 deletions libcontainer/integration/execin_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -277,9 +277,9 @@ func TestExecInTTY(t *testing.T) {

done := make(chan (error))
go func() {
f, err := utils.RecvFd(parent)
f, err := utils.RecvFile(parent)
if err != nil {
done <- fmt.Errorf("RecvFd: %w", err)
done <- fmt.Errorf("RecvFile: %w", err)
return
}
c, err := console.ConsoleFromFile(f)
Expand Down
83 changes: 48 additions & 35 deletions libcontainer/process_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import (
"os"
"os/exec"
"path/filepath"
"runtime"
"strconv"
"time"

Expand Down Expand Up @@ -171,14 +172,27 @@ func (p *setnsProcess) start() (retErr error) {
panic("unexpected procHooks in setns")
case procSeccomp:
if p.config.Config.Seccomp.ListenerPath == "" {
return errors.New("listenerPath is not set")
return errors.New("seccomp listenerPath is not set")
}

seccompFd, err := recvSeccompFd(uintptr(p.pid()), uintptr(sync.Fd))
if sync.Arg == nil {
return fmt.Errorf("sync %q is missing an argument", sync.Type)
}
var srcFd int
if err := json.Unmarshal(*sync.Arg, &srcFd); err != nil {
return fmt.Errorf("sync %q passed invalid fd arg: %w", sync.Type, err)
}
seccompFd, err := pidGetFd(p.pid(), srcFd)
if err != nil {
return fmt.Errorf("sync %q get fd %d from child failed: %w", sync.Type, srcFd, err)
}
defer seccompFd.Close()
// We have a copy, the child can keep working. We don't need to
// wait for the seccomp notify listener to get the fd before we
// permit the child to continue because the child will happily wait
// for the listener if it hits SCMP_ACT_NOTIFY.
if err := writeSync(p.messageSockPair.parent, procSeccompDone); err != nil {
Comment on lines +195 to +199

This comment was marked as resolved.

return err
}
defer unix.Close(seccompFd)

bundle, annotations := utils.Annotations(p.config.Config.Labels)
containerProcessState := &specs.ContainerProcessState{
Expand All @@ -199,15 +213,10 @@ func (p *setnsProcess) start() (retErr error) {
containerProcessState, seccompFd); err != nil {
return err
}

// Sync with child.
if err := writeSync(p.messageSockPair.parent, procSeccompDone); err != nil {
return err
}
return nil
default:
return errors.New("invalid JSON payload from child")
}
return nil
})

if err := unix.Shutdown(int(p.messageSockPair.parent.Fd()), unix.SHUT_WR); err != nil {
Expand Down Expand Up @@ -460,14 +469,27 @@ func (p *initProcess) start() (retErr error) {
switch sync.Type {
case procSeccomp:
if p.config.Config.Seccomp.ListenerPath == "" {
return errors.New("listenerPath is not set")
return errors.New("seccomp listenerPath is not set")
}

seccompFd, err := recvSeccompFd(uintptr(childPid), uintptr(sync.Fd))
var srcFd int
if sync.Arg == nil {
return fmt.Errorf("sync %q is missing an argument", sync.Type)
}
if err := json.Unmarshal(*sync.Arg, &srcFd); err != nil {
return fmt.Errorf("sync %q passed invalid fd arg: %w", sync.Type, err)
}
seccompFd, err := pidGetFd(p.pid(), srcFd)
if err != nil {
return fmt.Errorf("sync %q get fd %d from child failed: %w", sync.Type, srcFd, err)
}
defer seccompFd.Close()
// We have a copy, the child can keep working. We don't need to
// wait for the seccomp notify listener to get the fd before we
// permit the child to continue because the child will happily wait
// for the listener if it hits SCMP_ACT_NOTIFY.
if err := writeSync(p.messageSockPair.parent, procSeccompDone); err != nil {
return err
}
defer unix.Close(seccompFd)

s, err := p.container.currentOCIState()
if err != nil {
Expand All @@ -488,11 +510,6 @@ func (p *initProcess) start() (retErr error) {
containerProcessState, seccompFd); err != nil {
return err
}

// Sync with child.
if err := writeSync(p.messageSockPair.parent, procSeccompDone); err != nil {
return err
}
case procReady:
// set rlimits, this has to be done here because we lose permissions
// to raise the limits once we enter a user-namespace
Expand Down Expand Up @@ -591,7 +608,6 @@ func (p *initProcess) start() (retErr error) {
default:
return errors.New("invalid JSON payload from child")
}

return nil
})

Expand Down Expand Up @@ -684,22 +700,20 @@ func (p *initProcess) forwardChildLogs() chan error {
return logs.ForwardLogs(p.logFilePair.parent)
}

func recvSeccompFd(childPid, childFd uintptr) (int, error) {
pidfd, _, errno := unix.Syscall(unix.SYS_PIDFD_OPEN, childPid, 0, 0)
if errno != 0 {
return -1, fmt.Errorf("performing SYS_PIDFD_OPEN syscall: %w", errno)
func pidGetFd(pid, srcFd int) (*os.File, error) {
pidFd, err := unix.PidfdOpen(pid, 0)
if err != nil {
return nil, os.NewSyscallError("pidfd_open", err)
}
defer unix.Close(int(pidfd))

seccompFd, _, errno := unix.Syscall(unix.SYS_PIDFD_GETFD, pidfd, childFd, 0)
if errno != 0 {
return -1, fmt.Errorf("performing SYS_PIDFD_GETFD syscall: %w", errno)
defer unix.Close(pidFd)
fd, err := unix.PidfdGetfd(pidFd, srcFd, 0)
if err != nil {
return nil, os.NewSyscallError("pidfd_getfd", err)
}

return int(seccompFd), nil
return os.NewFile(uintptr(fd), "[pidfd_getfd]"), nil
}

func sendContainerProcessState(listenerPath string, state *specs.ContainerProcessState, fd int) error {
func sendContainerProcessState(listenerPath string, state *specs.ContainerProcessState, file *os.File) error {
conn, err := net.Dial("unix", listenerPath)
if err != nil {
return fmt.Errorf("failed to connect with seccomp agent specified in the seccomp profile: %w", err)
Expand All @@ -716,11 +730,10 @@ func sendContainerProcessState(listenerPath string, state *specs.ContainerProces
return fmt.Errorf("cannot marshall seccomp state: %w", err)
}

err = utils.SendFds(socket, b, fd)
if err != nil {
if err := utils.SendRawFd(socket, string(b), file.Fd()); err != nil {
return fmt.Errorf("cannot send seccomp fd to %s: %w", listenerPath, err)
}

runtime.KeepAlive(file)
return nil
}

Expand Down
3 changes: 1 addition & 2 deletions libcontainer/rootfs_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@ package libcontainer
import (
"errors"
"fmt"
"io"
"os"
"path"
"path/filepath"
Expand Down Expand Up @@ -64,7 +63,7 @@ func needsSetupDev(config *configs.Config) bool {
// prepareRootfs sets up the devices, mount points, and filesystems for use
// inside a new mount namespace. It doesn't set anything as ro. You must call
// finalizeRootfs after this function to finish setting up the rootfs.
func prepareRootfs(pipe io.ReadWriter, iConfig *initConfig, mountFds mountFds) (err error) {
func prepareRootfs(pipe *os.File, iConfig *initConfig, mountFds mountFds) (err error) {
config := iConfig.Config
if err := prepareRoot(config); err != nil {
return fmt.Errorf("error preparing rootfs: %w", err)
Expand Down
2 changes: 0 additions & 2 deletions libcontainer/setns_init_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,6 @@ func (l *linuxSetnsInit) Init() error {
if err != nil {
return err
}

if err := syncParentSeccomp(l.pipe, seccompFd); err != nil {
return err
}
Expand All @@ -94,7 +93,6 @@ func (l *linuxSetnsInit) Init() error {
if err != nil {
return fmt.Errorf("unable to init seccomp: %w", err)
}

if err := syncParentSeccomp(l.pipe, seccompFd); err != nil {
return err
}
Expand Down
Loading