From 4cf7a53626d945d3778e11282500efa11ff4b457 Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Mon, 22 Feb 2021 17:13:47 -0800 Subject: [PATCH] runc exec: report possible OOM kill An exec may fail due to memory shortage (cgroup memory limits being too tight), and the error message provided in this case gives no clue about the cause: > $ sudo ../runc/runc exec xx56 top > ERRO[0000] exec failed: container_linux.go:367: starting container process caused: read init-p: connection reset by peer Same as the previous commit for run/start, check the OOM kill counter and report an OOM kill. The differences from run are: 1. The container is already running and OOM kill counter might not be zero. This is why we have to read the counter before exec and after it fails. 2. An unrelated OOM kill event might occur in parallel with our exec (and I see no way to find out which process was killed, except to parse kernel logs, which seems excessive and not very reliable). This is why we report _possible_ OOM kill. Signed-off-by: Kir Kolyshkin --- libcontainer/container_linux.go | 1 + libcontainer/process_linux.go | 7 +++++++ 2 files changed, 8 insertions(+) diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go index 3dca29e4c3f..65e2eace8c8 100644 --- a/libcontainer/container_linux.go +++ b/libcontainer/container_linux.go @@ -570,6 +570,7 @@ func (c *linuxContainer) newSetnsProcess(p *Process, cmd *exec.Cmd, messageSockP intelRdtPath: state.IntelRdtPath, messageSockPair: messageSockPair, logFilePair: logFilePair, + manager: c.cgroupManager, config: c.newInitConfig(p), process: p, bootstrapData: data, diff --git a/libcontainer/process_linux.go b/libcontainer/process_linux.go index 0a5acf7baeb..26686764d83 100644 --- a/libcontainer/process_linux.go +++ b/libcontainer/process_linux.go @@ -65,6 +65,7 @@ type setnsProcess struct { logFilePair filePair cgroupPaths map[string]string rootlessCgroups bool + manager cgroups.Manager intelRdtPath string config *initConfig fds []string @@ 
-88,6 +89,8 @@ func (p *setnsProcess) signal(sig os.Signal) error { func (p *setnsProcess) start() (retErr error) { defer p.messageSockPair.parent.Close() + // get the "before" value of oom kill count + oom, _ := p.manager.OOMKill() err := p.cmd.Start() // close the write-side of the pipes (controlled by child) p.messageSockPair.child.Close() @@ -97,6 +100,10 @@ func (p *setnsProcess) start() (retErr error) { } defer func() { if retErr != nil { + if newOom, err := p.manager.OOMKill(); err == nil && newOom != oom { + // Someone in this cgroup was killed, this _might_ be us. + retErr = newSystemErrorWithCause(retErr, "possibly OOM-killed") + } err := ignoreTerminateErrors(p.terminate()) if err != nil { logrus.WithError(err).Warn("unable to terminate setnsProcess")