From 133885aa390eed20ebaf18c9ef146cf437029b80 Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Thu, 13 Apr 2023 11:13:13 -0700 Subject: [PATCH] libct: move killing logic to kill By default, the container has its own PID namespace, and killing (with SIGKILL) its init process from the parent PID namespace also kills all the other processes. Obviously, it does not work that way when the container is sharing its PID namespace with the host or another container, since init is no longer special (it's not PID 1). In this case, killing container's init will result in a bunch of other processes left running (and thus the inability to remove the cgroup). The solution to the above problem is killing all the container processes, not just init. The problem with the current implementation is, the killing logic is implemented in libcontainer's initProcess.wait (rather than kill), and therefore was only available to libcontainer users, but not the runc kill command (which does not call wait as it's not a parent of container's init). To untangle this, the patch moves killing all processes from initProcess.wait to initProcess.kill, and documents the new behavior. In essence, this also makes `runc kill` to automatically kill all container processes when the container does not have its own PID namespace. Document that as well. Signed-off-by: Kir Kolyshkin --- libcontainer/container_linux.go | 2 +- libcontainer/process_linux.go | 10 ++++++---- libcontainer/state_linux.go | 7 ------- man/runc-kill.8.md | 4 ++-- 4 files changed, 9 insertions(+), 14 deletions(-) diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go index ff570aec703..9c4d46517d4 100644 --- a/libcontainer/container_linux.go +++ b/libcontainer/container_linux.go @@ -360,7 +360,7 @@ func (c *Container) start(process *Process) (retErr error) { // Signal sends a specified signal to container's init, or, if all is true, // true, to all container's proceses (as determined by container's cgroup). // -// Setting all=true is useful when s is SIGKILL and the container does not have +// Note all=true is implied when s is SIGKILL and the container does not have // its own PID namespace. In this scenario, the libcontainer user may be required // to implement a proper child reaper. func (c *Container) Signal(s os.Signal, all bool) error { diff --git a/libcontainer/process_linux.go b/libcontainer/process_linux.go index ba2990a951b..0537992d94f 100644 --- a/libcontainer/process_linux.go +++ b/libcontainer/process_linux.go @@ -616,10 +616,6 @@ func (p *initProcess) start() (retErr error) { func (p *initProcess) wait() (*os.ProcessState, error) { err := p.cmd.Wait() - // we should kill all processes in cgroup when init is died if we use host PID namespace - if p.sharePidns { - _ = signalAllProcesses(p.manager, unix.SIGKILL) - } return p.cmd.ProcessState, err } @@ -678,6 +674,12 @@ func (p *initProcess) signal(sig os.Signal) error { if !ok { return errors.New("os: unsupported signal type") } + // When a container has its own pidns, init pid is 1, and killing + // init with SIGKILL will kill all processes in its PID namespace. + // Otherwise, we should kill all pids to avoid leftover processes. + if p.sharePidns && s == unix.SIGKILL { + return signalAllProcesses(p.manager, unix.SIGKILL) + } return unix.Kill(p.pid(), s) } diff --git a/libcontainer/state_linux.go b/libcontainer/state_linux.go index 4895612e257..442d0baaf5b 100644 --- a/libcontainer/state_linux.go +++ b/libcontainer/state_linux.go @@ -7,7 +7,6 @@ import ( "github.com/opencontainers/runc/libcontainer/configs" "github.com/opencontainers/runtime-spec/specs-go" - "github.com/sirupsen/logrus" "golang.org/x/sys/unix" ) @@ -36,12 +35,6 @@ type containerState interface { } func destroy(c *Container) error { - if !c.config.Namespaces.Contains(configs.NEWPID) || - c.config.Namespaces.PathOf(configs.NEWPID) != "" { - if err := signalAllProcesses(c.cgroupManager, unix.SIGKILL); err != nil { - logrus.Warn(err) - } - } err := c.cgroupManager.Destroy() if c.intelRdtManager != nil { if ierr := c.intelRdtManager.Destroy(); err == nil { diff --git a/man/runc-kill.8.md b/man/runc-kill.8.md index 5f0f51ed9b9..684666396bd 100644 --- a/man/runc-kill.8.md +++ b/man/runc-kill.8.md @@ -18,8 +18,8 @@ to list available signals. # OPTIONS **--all**|**-a** : Send the signal to all processes inside the container, rather than -the container's init only. This option is useful when the container does not -have its own PID namespace. +the container's init only. This option is implied when the _signal_ is **KILL** +and the container does not have its own PID namespace. : When this option is set, no error is returned if the container is stopped or does not exist.