Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
99 changes: 62 additions & 37 deletions libcontainer/container_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -479,52 +479,34 @@
// container cannot access the statedir (and the FIFO itself remains
// un-opened). It then adds the FifoFd to the given exec.Cmd as an inherited
// fd, with _LIBCONTAINER_FIFOFD set to its fd number.
func (c *Container) includeExecFifo(cmd *exec.Cmd) error {
func (c *Container) includeExecFifo() error {
fifoName := filepath.Join(c.stateDir, execFifoFilename)
fifo, err := os.OpenFile(fifoName, unix.O_PATH|unix.O_CLOEXEC, 0)
if err != nil {
return err
}
c.fifo = fifo

cmd.ExtraFiles = append(cmd.ExtraFiles, fifo)
cmd.Env = append(cmd.Env,
"_LIBCONTAINER_FIFOFD="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1))
return nil
}

func (c *Container) newParentProcess(p *Process) (parentProcess, error) {
comm, err := newProcessComm()
if err != nil {
return nil, err
}

// Make sure we use a new safe copy of /proc/self/exe binary each time, this
// is called to make sure that if a container manages to overwrite the file,
// it cannot affect other containers on the system. For runc, this code will
// only ever be called once, but libcontainer users might call this more than
// once.
p.closeClonedExes()
// After https://go-review.googlesource.com/c/go/+/728642 is merged, we need to recreate
// cmd Object to retry it.
// This logic can be replaced if https://github.com/golang/go/issues/77075 will be solved.
func (c *Container) createCmdObject(p *Process, comm *processComm, cgroupFD *os.File) *exec.Cmd {
var (
exePath string
safeExe *os.File
)
if exeseal.IsSelfExeCloned() {
// /proc/self/exe is already a cloned binary -- no need to do anything
logrus.Debug("skipping binary cloning -- /proc/self/exe is already cloned!")

if len(p.clonedExes) > 0 {
safeExe = p.clonedExes[0]
exePath = "/proc/self/fd/" + strconv.Itoa(int(safeExe.Fd()))
} else {
// We don't need to use /proc/thread-self here because the exe mm of a
// thread-group is guaranteed to be the same for all threads by
// definition. This lets us avoid having to do runtime.LockOSThread.
exePath = "/proc/self/exe"
} else {
var err error
safeExe, err = exeseal.CloneSelfExe(c.stateDir)
if err != nil {
return nil, fmt.Errorf("unable to create safe /proc/self/exe clone for runc init: %w", err)
}
exePath = "/proc/self/fd/" + strconv.Itoa(int(safeExe.Fd()))
p.clonedExes = append(p.clonedExes, safeExe)
logrus.Debug("runc exeseal: using /proc/self/exe clone") // used for tests
}

cmd := exec.Command(exePath, "init")
Expand Down Expand Up @@ -602,22 +584,67 @@
cmd.SysProcAttr.Pdeathsig = unix.Signal(c.config.ParentDeathSignal)
}

if c.fifo != nil {
cmd.ExtraFiles = append(cmd.ExtraFiles, c.fifo)
cmd.Env = append(cmd.Env,
"_LIBCONTAINER_FIFOFD="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1))

Check failure on line 590 in libcontainer/container_linux.go

View workflow job for this annotation

GitHub Actions / lint

File is not properly formatted (gofumpt)
}

if p.Init {
cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initStandard))
} else {
cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initSetns))
}

if cgroupFD != nil {
cmd.SysProcAttr.UseCgroupFD = true
cmd.SysProcAttr.CgroupFD = int(cgroupFD.Fd())
}

return cmd
}

func (c *Container) newParentProcess(p *Process) (parentProcess, error) {
comm, err := newProcessComm()
if err != nil {
return nil, err
}

// Make sure we use a new safe copy of /proc/self/exe binary each time, this
// is called to make sure that if a container manages to overwrite the file,
// it cannot affect other containers on the system. For runc, this code will
// only ever be called once, but libcontainer users might call this more than
// once.
p.closeClonedExes()
var safeExe *os.File
if exeseal.IsSelfExeCloned() {
Copy link
Copy Markdown
Member

@rata rata Jan 27, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd expect this commit (" libct: add Container method to recreate cmd Object in setnsProcess") to only move cmd.* assignments to a function that we then just call. And keep all the sealing of the binary untouched.

Is not possible to do something like that, that is super straight-forward?

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Actually I have kept the sealing untouched. I have deleted only exePath (it is not used in this function). In createCmdObject function I use the created safeExe if runc binary is cloned. I can move createCmdObject after newParentProcess function that's why there won't be any changes in newParentProcess function header.

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also, I can keep the function signatures and should create cmd without cgroup fd. In Setns Process Cmd retry it will be ok.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What I meant is we create a function that does the cmd and the commit is just moving cmd.X = asd lines from one place to the other. That will be quite clear we are not missing anything.

// /proc/self/exe is already a cloned binary -- no need to do anything
logrus.Debug("skipping binary cloning -- /proc/self/exe is already cloned!")
} else {
var err error
safeExe, err = exeseal.CloneSelfExe(c.stateDir)
if err != nil {
return nil, fmt.Errorf("unable to create safe /proc/self/exe clone for runc init: %w", err)
}
p.clonedExes = append(p.clonedExes, safeExe)
logrus.Debug("runc exeseal: using /proc/self/exe clone") // used for tests
}

if p.Init {
// We only set up fifoFd if we're not doing a `runc exec`. The historic
// reason for this is that previously we would pass a dirfd that allowed
// for container rootfs escape (and not doing it in `runc exec` avoided
// that problem), but we no longer do that. However, there's no need to do
// this for `runc exec` so we just keep it this way to be safe.
if err := c.includeExecFifo(cmd); err != nil {
if err := c.includeExecFifo(); err != nil {
return nil, fmt.Errorf("unable to setup exec fifo: %w", err)
}
return c.newInitProcess(p, cmd, comm)
return c.newInitProcess(p, comm)
}
return c.newSetnsProcess(p, cmd, comm)
return c.newSetnsProcess(p, comm)
}

func (c *Container) newInitProcess(p *Process, cmd *exec.Cmd, comm *processComm) (*initProcess, error) {
cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initStandard))
func (c *Container) newInitProcess(p *Process, comm *processComm) (*initProcess, error) {
nsMaps := make(map[configs.NamespaceType]string)
for _, ns := range c.config.Namespaces {
if ns.Path != "" {
Expand All @@ -631,7 +658,7 @@

init := &initProcess{
containerProcess: containerProcess{
cmd: cmd,
cmd: c.createCmdObject(p, comm, nil),
comm: comm,
manager: c.cgroupManager,
config: c.newInitConfig(p),
Expand All @@ -645,8 +672,7 @@
return init, nil
}

func (c *Container) newSetnsProcess(p *Process, cmd *exec.Cmd, comm *processComm) (*setnsProcess, error) {
cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initSetns))
func (c *Container) newSetnsProcess(p *Process, comm *processComm) (*setnsProcess, error) {
state := c.currentState()
// for setns process, we don't have to set cloneflags as the process namespaces
// will only be set via setns syscall
Expand All @@ -656,7 +682,6 @@
}
proc := &setnsProcess{
containerProcess: containerProcess{
cmd: cmd,
comm: comm,
manager: c.cgroupManager,
config: c.newInitConfig(p),
Expand Down
121 changes: 121 additions & 0 deletions libcontainer/integration/exec_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,14 +14,17 @@ import (
"strings"
"syscall"
"testing"
"time"

securejoin "github.com/cyphar/filepath-securejoin"
"github.com/opencontainers/cgroups"
"github.com/opencontainers/cgroups/systemd"
"github.com/opencontainers/runc/internal/linux"
"github.com/opencontainers/runc/internal/pathrs"
"github.com/opencontainers/runc/libcontainer"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/internal/userns"
"github.com/opencontainers/runc/libcontainer/utils"
"github.com/opencontainers/runtime-spec/specs-go"

"golang.org/x/sys/unix"
Expand Down Expand Up @@ -231,6 +234,124 @@ func TestEnter(t *testing.T) {
}
}

const stateFilename = "state.json"

// We need to dump changed state into state file.
func saveState(stateDir string, s *libcontainer.State) (retErr error) {
tmpFile, err := os.CreateTemp(stateDir, "state-")
if err != nil {
return err
}

defer func() {
if retErr != nil {
tmpFile.Close()
os.Remove(tmpFile.Name())
}
}()

err = utils.WriteJSON(tmpFile, s)
if err != nil {
return err
}
err = tmpFile.Close()
if err != nil {
return err
}

stateFilePath := filepath.Join(stateDir, stateFilename)
return os.Rename(tmpFile.Name(), stateFilePath)
}

func TestCmdRetry(t *testing.T) {
if testing.Short() {
return
}

if !cgroups.IsCgroup2UnifiedMode() {
t.Skip("CLONE_INTO_CGROUP works only with cgroup v2")
}

// Create dir to "pseudo" cgroup path
pseudoPath := t.TempDir()

config := newTemplateConfig(t, nil)

name := strings.ReplaceAll(t.Name(), "/", "_") + strconv.FormatInt(-int64(time.Now().Nanosecond()), 35)
root := t.TempDir()

// Container State Dir
stateDir, err := securejoin.SecureJoin(root, name)
ok(t, err)

// Create Container
container, err := libcontainer.Create(root, name, config)

ok(t, err)
defer destroyContainer(container)

// Execute a first process in the container
stdinR, stdinW, err := os.Pipe()
ok(t, err)

var stdout, stdout2 bytes.Buffer

pconfig := libcontainer.Process{
Cwd: "/",
Args: []string{"sh", "-c", "cat"},
Env: standardEnvironment,
Stdin: stdinR,
Stdout: &stdout,
Init: true,
}

err = container.Run(&pconfig)
_ = stdinR.Close()
defer stdinW.Close()
ok(t, err)

state, err := container.State()
ok(t, err)

// Dump our "pseudo" cgroup path as dirPath
state.CgroupPaths[""] = pseudoPath
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This assumes (and will work only with) cgroup v2

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have checked the cgroup version in the call of cgroups.IsCgroup2UnifiedMode. As for cgroup v1 it is deprecated :) #4955 .

err = saveState(stateDir, state)
ok(t, err)

// Load container from dumped state
container2, err := libcontainer.Load(root, name)
ok(t, err)

// Execute another process in the container
stdinR2, stdinW2, err := os.Pipe()
ok(t, err)
pconfig2 := libcontainer.Process{
Cwd: "/",
Env: standardEnvironment,
}
pconfig2.Args = []string{"sh", "-c", "cat"}
pconfig2.Stdin = stdinR2
pconfig2.Stdout = &stdout2

// Turn on TestMode in cgroups library to not check path mode and create all necessary files.
cgroups.TestMode = true
defer func() {
cgroups.TestMode = false
}()

err = container2.Run(&pconfig2)
_ = stdinR2.Close()
defer stdinW2.Close()
ok(t, err)

// Wait processes
_ = stdinW2.Close()
waitProcess(&pconfig2, t)

_ = stdinW.Close()
waitProcess(&pconfig, t)
}

func TestProcessEnv(t *testing.T) {
if testing.Short() {
return
Expand Down
14 changes: 6 additions & 8 deletions libcontainer/process_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@ import (
"runtime"
"strconv"
"strings"
"syscall"
"time"

"github.com/opencontainers/runtime-spec/specs-go"
Expand Down Expand Up @@ -113,6 +112,8 @@ func (c *processComm) closeParent() {
}

type containerProcess struct {
// Do not use the same exec.Cmd to retry Start method
// https://github.com/opencontainers/runc/issues/5060
cmd *exec.Cmd
comm *processComm
config *initConfig
Expand Down Expand Up @@ -169,6 +170,7 @@ func (p *containerProcess) wait() (*os.ProcessState, error) { //nolint:unparam
}

type setnsProcess struct {
// cmd will be created after startWithCgroupFD call.
containerProcess
rootlessCgroups bool
intelRdtPath string
Expand Down Expand Up @@ -357,11 +359,6 @@ func (p *setnsProcess) prepareCgroupFD() (*os.File, error) {
}

logrus.Debugf("using CLONE_INTO_CGROUP %q", cgroup)
if p.cmd.SysProcAttr == nil {
p.cmd.SysProcAttr = &syscall.SysProcAttr{}
}
p.cmd.SysProcAttr.UseCgroupFD = true
p.cmd.SysProcAttr.CgroupFD = int(fd.Fd())

return fd, nil
}
Expand All @@ -380,11 +377,12 @@ func (p *setnsProcess) startWithCgroupFD() error {
defer fd.Close()
}

p.cmd = p.containerProcess.container.createCmdObject(p.process, p.comm, fd)
err = p.startWithCPUAffinity()
if err != nil && p.cmd.SysProcAttr.UseCgroupFD {
logrus.Debugf("exec with CLONE_INTO_CGROUP failed: %v; retrying without", err)
// SysProcAttr.CgroupFD is never used when UseCgroupFD is unset.
p.cmd.SysProcAttr.UseCgroupFD = false
// We need to recreate the exec.Cmd object
p.cmd = p.containerProcess.container.createCmdObject(p.process, p.comm, nil)
err = p.startWithCPUAffinity()
}

Expand Down
Loading