diff --git a/libcontainer/init_linux.go b/libcontainer/init_linux.go index 632d7c3491b..a5f2b2e147a 100644 --- a/libcontainer/init_linux.go +++ b/libcontainer/init_linux.go @@ -8,6 +8,7 @@ import ( "io" "net" "os" + "os/signal" "path/filepath" "runtime" "runtime/debug" @@ -41,6 +42,36 @@ type pid struct { PidFirstChild int `json:"stage1_pid"` } +func setupPreExecSignalExit() (chan bool, func()) { + if unix.Getpid() != 1 { + return nil, nil + } + + // The Go runtime assumes the kernel will finish terminating _SigKill + // signals by calling dieFromSignal: + // https://github.com/golang/go/blob/c60392da8b6f18b2aa92db5d22c4963ec25ae0ad/src/runtime/signal_unix.go#L993 + // But Linux suppresses the default action for PID 1. While this helper is + // still the container's PID 1, translate the affected signals to 128+signo + // instead of leaking Go's fallback exit status 2. + setupDone := make(chan bool, 1) + signals := make(chan os.Signal, 1) + signal.Notify(signals, unix.SIGTERM, unix.SIGINT, unix.SIGHUP) + go func() { + setupDone <- true + sig, ok := <-signals + if !ok { + return + } + if s, ok := sig.(unix.Signal); ok { + os.Exit(128 + int(s)) + } + }() + return setupDone, func() { + signal.Stop(signals) + close(signals) + } +} + // network is an internal struct used to setup container networks. type network struct { configs.Network @@ -103,6 +134,28 @@ type initConfig struct { // Init is part of "runc init" implementation. func Init() { + // Translate termination signals to conventional shell-style exit codes + // while PID 1 is still the Go-based runc init helper. + // NOTE: This setup must occur BEFORE `syncParentHooks`. Once the parent receives the + // sync notification, it may immediately enforce a strict cgroup PID limit (e.g., <5). + // If the signal handler isn't ready by then, any subsequent OS thread creation + // (even by the Go runtime) could exceed the limit and cause the kernel to kill PID 1. + preExecSignalSetupDone, preExecSignalCloser := setupPreExecSignalExit() + if preExecSignalCloser != nil { + // This defer is technically unnecessary today, since any error leads to os.Exit(255) + // and the process terminates immediately. However, we retain it as a safeguard: + // if the error handling is later refactored to return instead of calling os.Exit, + // resource cleanup won’t be accidentally omitted. + defer preExecSignalCloser() + } + + // Wait for the signal handler goroutine to finish initialization. Without this, + // there's a race: `syncParentHooks` could notify the parent before the handler is + // ready, triggering PID enforcement while the runtime is still setting up threads. + if preExecSignalSetupDone != nil { + <-preExecSignalSetupDone + } + runtime.GOMAXPROCS(1) runtime.LockOSThread() diff --git a/libcontainer/integration/preexec_signal_test.go b/libcontainer/integration/preexec_signal_test.go new file mode 100644 index 00000000000..b09f9046cdb --- /dev/null +++ b/libcontainer/integration/preexec_signal_test.go @@ -0,0 +1,167 @@ +package integration + +import ( + "errors" + "os" + "path/filepath" + "syscall" + "testing" + "time" + + "github.com/opencontainers/runc/libcontainer" + "github.com/opencontainers/runc/libcontainer/configs" + + "golang.org/x/sys/unix" +) + +// Test to simulate problem we saw in https://github.com/kubernetes/kubernetes/issues/135713 +func TestPreExecSignalMapping(t *testing.T) { + if testing.Short() { + return + } + + testCases := []struct { + name string + sig unix.Signal + }{ + {name: "SIGTERM", sig: unix.SIGTERM}, + {name: "SIGINT", sig: unix.SIGINT}, + {name: "SIGHUP", sig: unix.SIGHUP}, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + container, process, hookRelease := newPreExecSignalProcess(t) + defer destroyContainer(container) + + ok(t, container.Signal(tc.sig)) + if got, want := waitForExitCodeWhileHookBlocked(t, process, hookRelease), 128+int(tc.sig); got != want { + t.Fatalf("expected %s to exit %d, got %d", tc.name, want, got) + } + }) + } +} + +func newPreExecSignalProcess(t testing.TB) (*libcontainer.Container, *libcontainer.Process, string) { + t.Helper() + + config := newTemplateConfig(t, nil) + hookReady := filepath.Join(config.Rootfs, "startContainer-hook-ready") + hookRelease := filepath.Join(config.Rootfs, "startContainer-hook-release") + ok(t, unix.Mkfifo(hookRelease, 0o600)) + config.Hooks = configs.Hooks{ + configs.StartContainer: configs.HookList{ + configs.NewCommandHook(&configs.Command{ + Path: "/bin/sh", + Args: []string{"/bin/sh", "-c", "touch /startContainer-hook-ready && cat /startContainer-hook-release >/dev/null"}, + }), + }, + } + + container, err := newContainer(t, config) + ok(t, err) + + process := &libcontainer.Process{ + Cwd: "/", + Args: []string{"false"}, + Env: standardEnvironment, + Init: true, + } + + ok(t, container.Run(process)) + waitForFile(t, hookReady) + return container, process, hookRelease +} + +func waitForFile(t testing.TB, path string) { + t.Helper() + + deadline := time.Now().Add(5 * time.Second) + for time.Now().Before(deadline) { + if _, err := os.Stat(path); err == nil { + return + } else if !errors.Is(err, os.ErrNotExist) { + t.Fatalf("stat %s: %v", path, err) + } + time.Sleep(10 * time.Millisecond) + } + + t.Fatalf("timed out waiting for %s", path) +} + +func processExitCode(process *libcontainer.Process) (int, error) { + ps, err := process.Wait() + if err != nil && ps == nil { + return 0, err + } + if ps == nil { + return 0, errors.New("wait returned nil process state") + } + + status, ok := ps.Sys().(syscall.WaitStatus) + if !ok { + return 0, errors.New("unexpected wait status type") + } + if !status.Exited() { + return 0, errors.New("process did not exit") + } + return status.ExitStatus(), nil +} + +func waitForExitCodeWhileHookBlocked(t testing.TB, process *libcontainer.Process, hookRelease string) int { + t.Helper() + + type waitResult struct { + code int + err error + } + + results := make(chan waitResult, 1) + go func() { + code, err := processExitCode(process) + results <- waitResult{code: code, err: err} + }() + + select { + case result := <-results: + releaseHook(t, hookRelease) + if result.err != nil { + t.Fatalf("wait: %v", result.err) + } + return result.code + case <-time.After(500 * time.Millisecond): + releaseHook(t, hookRelease) + t.Fatal("process did not exit while startContainer hook was still blocking") + return 0 + } +} + +func releaseHook(t testing.TB, path string) { + t.Helper() + + fd, err := unix.Open(path, unix.O_WRONLY|unix.O_NONBLOCK|unix.O_CLOEXEC, 0) + if errors.Is(err, unix.ENXIO) || errors.Is(err, os.ErrNotExist) { + return + } + ok(t, err) + f := os.NewFile(uintptr(fd), path) + if _, err := f.Write([]byte("ok\n")); err != nil { + _ = f.Close() + t.Fatalf("write %s: %v", path, err) + } + ok(t, f.Close()) + + deadline := time.Now().Add(500 * time.Millisecond) + for time.Now().Before(deadline) { + fd, err := unix.Open(path, unix.O_WRONLY|unix.O_NONBLOCK|unix.O_CLOEXEC, 0) + if errors.Is(err, unix.ENXIO) || errors.Is(err, os.ErrNotExist) { + return + } + if err == nil { + _ = unix.Close(fd) + } + time.Sleep(10 * time.Millisecond) + } + + t.Fatalf("timed out waiting for startContainer hook to exit") +}