diff --git a/internal/os/linux/pidfd.go b/internal/os/linux/pidfd.go new file mode 100644 index 000000000000..40d07cd93c71 --- /dev/null +++ b/internal/os/linux/pidfd.go @@ -0,0 +1,62 @@ +//go:build linux + +// SPDX-FileCopyrightText: 2025 k0s authors +// SPDX-License-Identifier: Apache-2.0 + +package linux + +import ( + "cmp" + "errors" + "fmt" + "os" + "syscall" + + "golang.org/x/sys/unix" +) + +// Sends a signal to the process. +func SendSignal(pidfd syscall.Conn, signal os.Signal) error { + sig, ok := signal.(syscall.Signal) + if !ok { + return fmt.Errorf("%w: %s", errors.ErrUnsupported, signal) + } + + conn, err := pidfd.SyscallConn() + if err != nil { + return err + } + + outerErr := conn.Control(func(fd uintptr) { + err = pidfdSendSignal(int(fd), sig) + }) + + return cmp.Or(err, outerErr) +} + +// Send a signal to a process specified by a file descriptor. +// +// The calling process must either be in the same PID namespace as the process +// referred to by pidfd, or be in an ancestor of that namespace. +// +// Since Linux 5.1. +// https://man7.org/linux/man-pages/man2/pidfd_send_signal.2.html +// https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?h=3eb39f47934f9d5a3027fe00d906a45fe3a15fad +func pidfdSendSignal(pidfd int, sig syscall.Signal) error { + // If the info argument is a NULL pointer, this is equivalent to specifying + // a pointer to a siginfo_t buffer whose fields match the values that are + // implicitly supplied when a signal is sent using kill(2): + // + // * si_signo is set to the signal number; + // * si_errno is set to 0; + // * si_code is set to SI_USER; + // * si_pid is set to the caller's PID; and + // * si_uid is set to the caller's real user ID. + info := (*unix.Siginfo)(nil) + + // The flags argument is reserved for future use; currently, this + // argument must be specified as 0. 
// PIDDir is a process-specific subdirectory within the proc(5) file system,
// i.e., a /proc/<pid> directory. It exposes methods to parse the contents of
// the well-known files inside it.
type PIDDir struct{ fs.FS }

// Compile-time check that PIDDir provides its own ReadFile.
var _ fs.ReadFileFS = (*PIDDir)(nil)

// ReadFile implements [fs.ReadFileFS].
//
// It opens name, reads it until EOF and closes it again. The error from
// closing the file is joined with the read error.
func (d *PIDDir) ReadFile(name string) (_ []byte, err error) {
	// The generic io/fs ReadFile helper stats the file first in order to size
	// its read buffer. For procfs this is pointless, even counter-productive:
	// such files usually report a size of zero bytes, which is not what
	// reading them yields. So skip the stat and let the buffer grow on demand.

	file, err := d.Open(name)
	if err != nil {
		return nil, err
	}
	defer func() {
		closeErr := file.Close()
		err = errors.Join(err, closeErr)
	}()

	return io.ReadAll(file)
}
+// https://man7.org/linux/man-pages/man5/proc_pid_environ.5.html +func (d *PIDDir) Environ() ([]string, error) { + return d.readNulTerminatedStrings("environ") +} + +func (d *PIDDir) readNulTerminatedStrings(name string) (items []string, _ error) { + raw, err := d.ReadFile(name) + if err != nil { + return nil, err + } + + for len(raw) > 0 { + current, rest, ok := bytes.Cut(raw, []byte{0}) + if !ok { + return nil, fmt.Errorf("not properly terminated: %q", raw) + } + items = append(items, string(current)) + raw = rest + } + return items, nil +} diff --git a/internal/os/linux/procfs/procfs.go b/internal/os/linux/procfs/procfs.go new file mode 100644 index 000000000000..873f3e756c85 --- /dev/null +++ b/internal/os/linux/procfs/procfs.go @@ -0,0 +1,104 @@ +//go:build linux + +// SPDX-FileCopyrightText: 2024 k0s authors +// SPDX-License-Identifier: Apache-2.0 + +package procfs + +import ( + "errors" + "fmt" + "io/fs" + "os" + "path/filepath" + "strconv" + "syscall" + + "github.com/k0sproject/k0s/internal/os/linux" + osunix "github.com/k0sproject/k0s/internal/os/unix" + + "golang.org/x/sys/unix" +) + +var _ = linux.SendSignal // for godoc links + +// A proc(5) filesystem. +// +// See https://www.kernel.org/doc/html/latest/filesystems/proc.html. +// See https://man7.org/linux/man-pages/man5/proc.5.html. +type ProcFS string + +const ( + DefaultMountPoint = "/proc" + Default ProcFS = DefaultMountPoint +) + +func At(mountPoint string) ProcFS { + return ProcFS(mountPoint) +} + +func (p ProcFS) String() string { + return string(p) +} + +// Delegates to [Default]. +// See [ProcFS.OpenPID]. +func OpenPID(pid int) (*osunix.Dir, error) { + return Default.OpenPID(pid) +} + +// Returns a [*osunix.Dir] that points to a process-specific subdirectory inside +// the proc(5) filesystem. It therefore refers to a process or thread, and may +// be used in some syscalls that accept pidfds, most notably [linux.SendSignal]. 
+// +// Operations on open /proc/ Dirs corresponding to dead processes never act +// on any new process that the kernel may, through chance, have also assigned +// the same process ID. Instead, operations on these Dirs usually fail with +// [syscall.ESRCH]. +// +// The underlying file descriptor of the Dir obtained in this way is not +// pollable and can't be waited on with waitid(2). +// +// https://docs.kernel.org/filesystems/proc.html#process-specific-subdirectories +func (p ProcFS) OpenPID(pid int) (*osunix.Dir, error) { + path := filepath.Join(p.String(), strconv.Itoa(pid)) + path, err := filepath.Abs(path) + if err != nil { + return nil, err + } + + pidDir, err := osunix.OpenDir(path, 0) + if err != nil { + // If there was an error, check if the procfs is actually valid. + verifyErr := p.Verify() + if verifyErr != nil { + err = fmt.Errorf("%w (%v)", verifyErr, err) //nolint:errorlint // shadow open err + } + return nil, err + } + + return pidDir, nil +} + +func (p ProcFS) Verify() error { + path, err := filepath.Abs(p.String()) + if err != nil { + return fmt.Errorf("proc(5) filesystem check failed: %w", err) + } + + var st syscall.Statfs_t + if err := syscall.Statfs(path, &st); err != nil { + statErr := &fs.PathError{Op: "statfs", Path: path, Err: err} + if errors.Is(err, os.ErrNotExist) { + err = fmt.Errorf("%w: proc(5) filesystem unavailable", errors.ErrUnsupported) + } else { + err = errors.New("proc(5) filesystem check failed") + } + return fmt.Errorf("%w: %v", err, statErr) //nolint:errorlint // shadow stat err + } + + if st.Type != unix.PROC_SUPER_MAGIC { + return fmt.Errorf("%w: not a proc(5) filesystem: %s: type is 0x%x", errors.ErrUnsupported, p, st.Type) + } + return nil +} diff --git a/internal/os/linux/procfs/stat.go b/internal/os/linux/procfs/stat.go new file mode 100644 index 000000000000..30be54dbd1f6 --- /dev/null +++ b/internal/os/linux/procfs/stat.go @@ -0,0 +1,54 @@ +//go:build linux + +// SPDX-FileCopyrightText: 2024 k0s authors +// 
// PIDState is the single-byte process state code as found in the third field
// of /proc/<pid>/stat.
type PIDState byte

// Known values of the process state values as used in the third field of
// /proc/<pid>/stat.
//
// Note that PIDStatePaging and PIDStateWaking share the same character 'W':
// according to the annotations below, the two meanings belong to different
// kernel version ranges, so the character alone doesn't tell them apart.
const (
	PIDStateRunning     PIDState = 'R'
	PIDStateSleeping    PIDState = 'S' // in an interruptible wait
	PIDStateWaiting     PIDState = 'D' // in uninterruptible disk sleep
	PIDStateZombie      PIDState = 'Z'
	PIDStateStopped     PIDState = 'T' // (on a signal) or (before Linux 2.6.33) trace stopped
	PIDStateTracingStop PIDState = 't' // (Linux 2.6.33 onward)
	PIDStatePaging      PIDState = 'W' // (only before Linux 2.6.0)
	PIDStateDead        PIDState = 'X' // (from Linux 2.6.0 onward)
	PIDStateDeadX       PIDState = 'x' // (Linux 2.6.33 to 3.13 only)
	PIDStateWakekill    PIDState = 'K' // (Linux 2.6.33 to 3.13 only)
	PIDStateWaking      PIDState = 'W' // (Linux 2.6.33 to 3.13 only)
	PIDStateParked      PIDState = 'P' // (Linux 3.9 to 3.13 only)
	PIDStateIdle        PIDState = 'I' // (Linux 4.14 onward)
)
+ if idx := bytes.LastIndexByte(raw, ')'); idx < 0 { + return 0, fmt.Errorf("no closing parenthesis: %q", raw) + } else { + raw = raw[idx+1:] + } + + if len(raw) < 3 || raw[0] != ' ' || raw[2] != ' ' { + return 0, fmt.Errorf("failed to locate state field: %q", raw) + } + + return PIDState(raw[1]), nil +} diff --git a/internal/os/linux/procfs/status.go b/internal/os/linux/procfs/status.go new file mode 100644 index 000000000000..facd52928d2f --- /dev/null +++ b/internal/os/linux/procfs/status.go @@ -0,0 +1,51 @@ +//go:build linux + +// SPDX-FileCopyrightText: 2024 k0s authors +// SPDX-License-Identifier: Apache-2.0 + +package procfs + +import ( + "bytes" + "errors" + "fmt" + "strconv" +) + +type PIDStatus map[string]string + +var ErrNoSuchStatusField = errors.New("no such status field") + +// Reads and parses /proc//status. +// https://man7.org/linux/man-pages/man5/proc_pid_status.5.html +func (d *PIDDir) Status() (PIDStatus, error) { + raw, err := d.ReadFile("status") + if err != nil { + return nil, err + } + + status := make(PIDStatus, 64) + for len(raw) > 0 { + line, rest, ok := bytes.Cut(raw, []byte{'\n'}) + if !ok { + return nil, fmt.Errorf("status file not properly terminated: %q", raw) + } + name, val, ok := bytes.Cut(line, []byte{':'}) + if !ok { + return nil, fmt.Errorf("line without colon: %q", line) + } + status[string(name)] = string(bytes.TrimSpace(val)) + raw = rest + } + + return status, nil +} + +// Thread group ID (i.e., Process ID). 
+func (s PIDStatus) ThreadGroupID() (int, error) { + if tgid, ok := s["Tgid"]; ok { + tgid, err := strconv.Atoi(tgid) + return tgid, err + } + return 0, ErrNoSuchStatusField +} diff --git a/internal/os/unix/dir.go b/internal/os/unix/dir.go new file mode 100644 index 000000000000..5291664ce0ed --- /dev/null +++ b/internal/os/unix/dir.go @@ -0,0 +1,300 @@ +//go:build unix + +// SPDX-FileCopyrightText: 2024 k0s authors +// SPDX-License-Identifier: Apache-2.0 + +package unix + +import ( + "errors" + "io" + "io/fs" + "os" + "path/filepath" + "syscall" + "time" + + "golang.org/x/sys/unix" +) + +// An open handle to some path on the file system. +type Path interface { + io.Closer + syscall.Conn + Name() string // Delegates to [os.File.Name]. + Stat() (os.FileInfo, error) // Delegates to [os.File.Stat]. + + // Converts this pointer to an [*os.File] without any additional checks. + // + // Note that both [os.File.ReadDir] and [os.File.Readdir] will NOT work, + // even if this path is pointing to a directory. + UnwrapFile() *os.File + + // Converts this pointer to a [*Dir] without any additional checks. + // + // Note that [Dir.Readdirnames] will NOT work if this path is not pointing + // to a directory. + UnwrapDir() *Dir +} + +// Opens a [Path] referring to the given path. +// +// This function can be used to open a path without knowing if it's a directory +// or a file, then use [Path.Stat] to figure out if [Path.UnwrapFile] or +// [Path.UnwrapDir] is appropriate. +// +// Note that, in contrast to [os.Open] and [os.OpenFile], the returned +// descriptor is not put into non-blocking mode automatically. Callers may +// decide if they want this by setting the [syscall.O_NONBLOCK] flag. +func OpenPath(path string, flags int, perm os.FileMode) (Path, error) { + // Use the raw syscall instead of os.OpenFile here, as the latter tries to + // put the fds into non-blocking mode. 
// A file descriptor pointing to a directory (a.k.a. dirfd). It uses the
// syscalls that accept a dirfd, i.e. openat, fstatat ...
//
// Using a Dir, as opposed to using a path (or path prefix) for all
// operations, offers some unique features:
//
//   - Operations are more consistent. A Dir ensures that all operations are
//     relative to the same directory instance. If the directory is renamed or
//     moved, the Dir remains valid and operations continue to work as
//     expected, which is not the case when using paths.
//   - Using a Dir can also be more secure. If a directory path is given as a
//     string and used repeatedly, there's a risk that the path could be
//     maliciously altered (e.g., through symbolic link attacks). Using a Dir
//     ensures that operations use the original directory, mitigating this
//     type of attack.
//
// Dir implements [fs.StatFS] and can be used as such. However, it can't be
// meaningfully used with [fs.WalkDir]: That function is implemented in terms of
// file system path manipulation, which contradicts the nature of a Dir. For
// this reason, the [fs.File] instances returned by [Dir.Open] won't implement
// [fs.ReadDirFile].
type Dir os.File

// Compile-time check that [*Dir] implements [fs.StatFS].
var _ fs.StatFS = (*Dir)(nil)
+ fd, err := syscall.Open(path, flags|syscall.O_DIRECTORY|syscall.O_CLOEXEC, 0) + if err != nil { + return nil, &os.PathError{Op: "open", Path: path, Err: err} + } + + return (*Dir)(os.NewFile(uintptr(fd), path)), nil +} + +// Delegates to [os.File.Close]. +func (d *Dir) Close() error { return (*os.File)(d).Close() } + +// Delegates to [os.File.SyscallConn]. +func (d *Dir) SyscallConn() (syscall.RawConn, error) { return (*os.File)(d).SyscallConn() } + +// Delegates to [os.File.Name]. +func (d *Dir) Name() string { return (*os.File)(d).Name() } + +// Delegates to [io.File.Stat]. +func (d *Dir) StatSelf() (os.FileInfo, error) { return (*os.File)(d).Stat() } + +// Opens the path with the given name. +// The path is opened relative to the receiver, using the openat syscall. +// +// Note that, in contrast to [os.Open] and [os.OpenFile], the returned +// descriptor is not put into non-blocking mode automatically. Callers may +// decide if they want this by setting the [unix.O_NONBLOCK] flag. +// +// https://www.man7.org/linux/man-pages/man2/open.2.html +func (d *Dir) OpenAt(name string, flags int, mode os.FileMode) (Path, error) { + f, err := d.openAt(name, flags, mode) + return (*pathFD)(f), err +} + +func (d *Dir) openAt(name string, flags int, mode os.FileMode) (*os.File, error) { + var opened int + err := syscallControl(d, func(fd uintptr) error { + flags, mode, err := sysOpenFlags(flags, mode) + if err != nil { + return &os.PathError{Op: "openat", Path: name, Err: err} + } + + opened, err = unix.Openat(int(fd), name, flags, mode) + if err != nil { + return &os.PathError{Op: "openat", Path: name, Err: err} + } + + return nil + }) + if err != nil { + return nil, err + } + + return os.NewFile(uintptr(opened), name), nil +} + +// Open implements [fs.FS]. +// +// Note that files and directories opened via this method won't implement +// [fs.ReadDirFile] and hence cannot be traversed via the io/fs package. 
+// However, the returned files will implement [Path], and can then be unwrapped +// to a [*Dir], if appropriate. +func (d *Dir) Open(name string) (fs.File, error) { + f, err := d.openAt(name, syscall.O_NONBLOCK, 0) + return (*pathFD)(f), err +} + +// Implements [Path] and [fs.File], but hides the [os.File.ReadDir] method. +type pathFD os.File + +func (f *pathFD) Close() error { return (*os.File)(f).Close() } +func (f *pathFD) Name() string { return (*os.File)(f).Name() } +func (f *pathFD) SyscallConn() (syscall.RawConn, error) { return (*os.File)(f).SyscallConn() } +func (f *pathFD) Read(b []byte) (int, error) { return (*os.File)(f).Read(b) } +func (f *pathFD) Stat() (fs.FileInfo, error) { return (*os.File)(f).Stat() } +func (f *pathFD) UnwrapFile() *os.File { return (*os.File)(f) } +func (f *pathFD) UnwrapDir() *Dir { return (*Dir)(f) } + +// Delegates to [os.File.Readdirnames]. +// +// This is the preferred way of listing directory contents. Traversing can be +// done via [Dir.OpenAt], followed by [Path.Stat] and [Path.UnwrapDir], if +// appropriate. Both [os.File.ReadDir] and [os.File.Readdir] won't make sense +// for Dirs, as they are path based, and not file descriptor based. +func (d *Dir) Readdirnames(n int) ([]string, error) { + return (*os.File)(d).Readdirnames(n) +} + +// Stats the path with the given name. +// The name is interpreted relative to the receiver, using the fstatat syscall. +// +// https://www.man7.org/linux/man-pages/man2/stat.2.html +func (d *Dir) StatAt(name string, flags int) (*FileInfo, error) { + info := FileInfo{Path: name} + if err := syscallControl(d, func(fd uintptr) error { + if err := unix.Fstatat(int(fd), name, (*unix.Stat_t)(&info.Stat), flags); err != nil { + return &os.PathError{Op: "fstatat", Path: name, Err: err} + } + + return nil + }); err != nil { + return nil, err + } + + return &info, nil +} + +// Stat implements [fs.StatFS]. 
+func (d *Dir) Stat(name string) (fs.FileInfo, error) { + fileInfo, err := d.StatAt(name, 0) + return fileInfo, err +} + +type Stat unix.Stat_t + +func (s *Stat) ToFileMode() os.FileMode { return toFileMode(s.Mode) } +func (s *Stat) IsDir() bool { return s.Mode&unix.S_IFMT == unix.S_IFDIR } +func (s *Stat) ModTime() time.Time { return time.Unix(s.Mtim.Unix()) } +func (s *Stat) Sys() any { return (*unix.Stat_t)(s) } + +type FileInfo struct { + Path string + Stat +} + +func (i *FileInfo) Name() string { return filepath.Base(i.Path) } +func (i *FileInfo) Size() int64 { return i.Stat.Size } +func (i *FileInfo) Mode() os.FileMode { return i.ToFileMode() } + +func toFileMode[T ~uint16 | ~uint32](unixMode T) os.FileMode { + fileMode := os.FileMode(unixMode) & os.ModePerm + + // https://www.man7.org/linux/man-pages/man2/fstatat.2.html#EXAMPLES + + switch unixMode & unix.S_IFMT { + case unix.S_IFREG: // regular file + // nothing to do + case unix.S_IFDIR: // directory + fileMode |= os.ModeDir + case unix.S_IFIFO: // FIFO/pipe + fileMode |= os.ModeNamedPipe + case unix.S_IFLNK: // symlink + fileMode |= os.ModeSymlink + case unix.S_IFSOCK: // socket + fileMode |= os.ModeSocket + case unix.S_IFCHR: // character device + fileMode |= os.ModeCharDevice + fallthrough + case unix.S_IFBLK: // block device + fileMode |= os.ModeDevice + default: // unknown? 
// sysOpenFlags validates flags and mode and converts them into the values to
// be passed to the open/openat syscalls.
//
// Only the permission bits and the setuid, setgid and sticky bits are allowed
// in mode, and a non-zero mode is only accepted if flags contain
// [os.O_CREATE]. The returned flags always have [syscall.O_CLOEXEC] set.
func sysOpenFlags(flags int, mode os.FileMode) (int, uint32, error) {
	const mask = os.ModePerm | os.ModeSetuid | os.ModeSetgid | os.ModeSticky
	if mode != (mode & mask) {
		return 0, 0, errors.New("invalid mode bits")
	}

	// Test for the O_CREATE bit with a bitwise AND. The bitwise OR that was
	// used here before can never be zero, so the check was never effective.
	//
	// NOTE(review): on Linux, O_TMPFILE consumes the mode argument as well.
	// That flag is not considered here -- confirm whether it needs support.
	if mode != 0 && flags&os.O_CREATE == 0 {
		return 0, 0, errors.New("mode may only be used when creating")
	}

	return flags | syscall.O_CLOEXEC, toSysMode(mode), nil
}

// toSysMode converts mode into the representation used by syscalls: the
// permission bits are taken over as they are, and the setuid, setgid and
// sticky bits are translated into S_ISUID, S_ISGID and S_ISVTX. All other
// bits of mode are dropped.
func toSysMode(mode os.FileMode) uint32 {
	sysMode := uint32(mode & os.ModePerm)
	if mode&os.ModeSetuid != 0 {
		sysMode |= syscall.S_ISUID
	}
	if mode&os.ModeSetgid != 0 {
		sysMode |= syscall.S_ISGID
	}
	if mode&os.ModeSticky != 0 {
		sysMode |= syscall.S_ISVTX
	}
	return sysMode
}
assert.ErrorIs(t, err, os.ErrNotExist) +} + +func TestDir_Empty(t *testing.T) { + path := t.TempDir() + + d, err := osunix.OpenDir(path, 0) + require.NoError(t, err) + t.Cleanup(func() { assert.NoError(t, d.Close()) }) + + foo := "foo" + assertENOENT := func(t *testing.T, op string, err error) { + var pathErr *os.PathError + if assert.ErrorAs(t, err, &pathErr) { + assert.Equal(t, op, pathErr.Op) + assert.Equal(t, foo, pathErr.Path) + assert.Equal(t, syscall.ENOENT, pathErr.Err) + } + } + + _, err = d.Open(foo) + assertENOENT(t, "openat", err) + + _, err = fs.Stat(d, foo) + assertENOENT(t, "fstatat", err) + + _, err = fs.ReadFile(d, foo) + assertENOENT(t, "openat", err) + + var pathErr *os.PathError + + // Reading a directory as a file should yield the right error. + if _, err = fs.ReadFile(d, "."); assert.ErrorAs(t, err, &pathErr) { + assert.Equal(t, "read", pathErr.Op) + assert.Equal(t, ".", pathErr.Path) + assert.Equal(t, syscall.EISDIR, pathErr.Err) + } + + // We don't want to allow reading directories via io/fs. 
+ if _, err = fs.ReadDir(d, "."); assert.ErrorAs(t, err, &pathErr) { + assert.Equal(t, "readdir", pathErr.Op) + assert.Equal(t, ".", pathErr.Path) + assert.ErrorContains(t, pathErr.Err, "not implemented") + } + + if entries, err := d.Readdirnames(1); assert.Equal(t, io.EOF, err) { + assert.Empty(t, entries) + } +} + +func TestDir_Filled(t *testing.T) { + dirPath := t.TempDir() + require.NoError(t, os.WriteFile(filepath.Join(dirPath, "foo"), []byte("lorem"), 0644)) + require.NoError(t, os.Mkdir(filepath.Join(dirPath, "bar"), 0755)) + require.NoError(t, os.WriteFile(filepath.Join(dirPath, "bar", "baz"), []byte("ipsum"), 0400)) + + now := time.Now() + require.NoError(t, os.Chtimes(filepath.Join(dirPath, "foo"), time.Time{}, now.Add(-3*time.Minute))) + require.NoError(t, os.Chtimes(filepath.Join(dirPath, "bar", "baz"), time.Time{}, now.Add(-2*time.Minute))) + require.NoError(t, os.Chtimes(filepath.Join(dirPath, "bar"), time.Time{}, now.Add(-1*time.Minute))) + + d, err := osunix.OpenDir(dirPath, 0) + require.NoError(t, err) + t.Cleanup(func() { assert.NoError(t, d.Close()) }) + + // Read foo and match contents. + if data, err := fs.ReadFile(d, "foo"); assert.NoError(t, err) { + assert.Equal(t, []byte("lorem"), data) + } + + // Stat foo and match contents. + if stat, err := fs.Stat(d, "foo"); assert.NoError(t, err) { + assert.Equal(t, "foo", stat.Name()) + assert.Equal(t, int64(5), stat.Size()) + assert.WithinDuration(t, now.Add(-3*time.Minute), stat.ModTime(), 0) + assert.Equal(t, os.FileMode(0644), stat.Mode()) + assert.False(t, stat.IsDir()) + assert.IsType(t, new(unix.Stat_t), stat.Sys()) + } + + // Stat bar and match contents. 
+ if stat, err := fs.Stat(d, "bar"); assert.NoError(t, err) { + assert.Equal(t, "bar", stat.Name()) + assert.Positive(t, stat.Size(), int64(0)) + assert.WithinDuration(t, now.Add(-1*time.Minute), stat.ModTime(), 0) + assert.Equal(t, os.FileMode(0755)|os.ModeDir, stat.Mode()) + assert.True(t, stat.IsDir()) + assert.IsType(t, new(unix.Stat_t), stat.Sys()) + } + + // Stat bar/baz and match contents. + if stat, err := fs.Stat(d, filepath.Join("bar", "baz")); assert.NoError(t, err) { + assert.Equal(t, "baz", stat.Name()) + assert.Equal(t, int64(5), stat.Size()) + assert.WithinDuration(t, now.Add(-2*time.Minute), stat.ModTime(), 0) + assert.Equal(t, os.FileMode(0400), stat.Mode()) + assert.False(t, stat.IsDir()) + assert.IsType(t, new(unix.Stat_t), stat.Sys()) + } + + // List directory contents and match for correctness. + entries, err := d.Readdirnames(10) + if assert.NoError(t, err) && assert.Len(t, entries, 2) { + assert.ElementsMatch(t, entries, []string{"foo", "bar"}) + } + entries, err = d.Readdirnames(10) + assert.Empty(t, entries) + assert.Same(t, io.EOF, err) + + // Read bar/baz and match contents. + if data, err := fs.ReadFile(d, filepath.Join("bar", "baz")); assert.NoError(t, err) { + assert.Equal(t, []byte("ipsum"), data) + } +} diff --git a/pkg/supervisor/prochandle.go b/pkg/supervisor/prochandle.go index fe4cdb7aabb2..631f4bca269c 100644 --- a/pkg/supervisor/prochandle.go +++ b/pkg/supervisor/prochandle.go @@ -12,6 +12,9 @@ import ( type procHandle interface { io.Closer + // Checks whether the process has terminated. + hasTerminated() (bool, error) + // Reads and returns the process's command line. cmdline() ([]string, error) @@ -19,8 +22,5 @@ type procHandle interface { environ() ([]string, error) // Requests graceful process termination. - requestGracefulShutdown() error - - // Kills the process. 
- kill() error + requestGracefulTermination() error } diff --git a/pkg/supervisor/prochandle_darwin.go b/pkg/supervisor/prochandle_darwin.go new file mode 100644 index 000000000000..7aeaa17655de --- /dev/null +++ b/pkg/supervisor/prochandle_darwin.go @@ -0,0 +1,20 @@ +// SPDX-FileCopyrightText: 2025 k0s authors +// SPDX-License-Identifier: Apache-2.0 + +package supervisor + +import ( + "errors" + "fmt" + "syscall" +) + +// openPID can only check the existence of a PID on macOS. +func openPID(pid int) (procHandle, error) { + // Send "the null signal" to probe if the PID still exists. + if err := syscall.Kill(pid, syscall.Signal(0)); err != nil { + return nil, err + } + + return nil, fmt.Errorf("%w on Darwin", errors.ErrUnsupported) +} diff --git a/pkg/supervisor/prochandle_linux.go b/pkg/supervisor/prochandle_linux.go new file mode 100644 index 000000000000..b80c18488585 --- /dev/null +++ b/pkg/supervisor/prochandle_linux.go @@ -0,0 +1,106 @@ +// SPDX-FileCopyrightText: 2022 k0s authors +// SPDX-License-Identifier: Apache-2.0 + +package supervisor + +import ( + "errors" + "fmt" + "os" + "syscall" + + "github.com/k0sproject/k0s/internal/os/linux" + "github.com/k0sproject/k0s/internal/os/linux/procfs" + osunix "github.com/k0sproject/k0s/internal/os/unix" +) + +type unixProcess struct { + pid int + pidDir *osunix.Dir +} + +func openPID(pid int) (_ *unixProcess, err error) { + p := &unixProcess{pid: pid} + p.pidDir, err = procfs.OpenPID(pid) + if err != nil { + if errors.Is(err, os.ErrNotExist) { + return nil, syscall.ESRCH + } + return nil, err + } + defer func() { + if err != nil { + err = errors.Join(err, p.Close()) + } + }() + + // The dir is open. It might refer to a thread, though. + // Check if the thread group ID is the process ID. 
+ if status, err := p.dir().Status(); err != nil { + return nil, err + } else if tgid, err := status.ThreadGroupID(); err != nil { + return nil, fmt.Errorf("failed to get thread group ID: %w", err) + } else if tgid != pid { + return nil, fmt.Errorf("%w (thread group ID is %d)", syscall.ESRCH, tgid) + } + + return p, nil +} + +func (p *unixProcess) Close() error { + return p.pidDir.Close() +} + +func (p *unixProcess) hasTerminated() (bool, error) { + // Checking for termination is harder than one might think when there are + // open file descriptors to that process. The "null signal" trick won't work + // because the process remains a zombie as long as there are open file + // descriptors to it. Rely on the proc filesystem once again to check if the + // process has terminated or is a zombie. + state, err := p.dir().State() + if err != nil { + if errors.Is(err, syscall.ESRCH) { + return true, nil + } + return false, err + } + + return state == procfs.PIDStateZombie, nil +} + +// cmdline implements [procHandle]. +func (p *unixProcess) cmdline() ([]string, error) { + cmdline, err := p.dir().Cmdline() + if errors.Is(err, syscall.ESRCH) { + return nil, os.ErrProcessDone + } + return cmdline, err +} + +// environ implements [procHandle]. +func (p *unixProcess) environ() ([]string, error) { + env, err := p.dir().Environ() + if errors.Is(err, syscall.ESRCH) { + return nil, os.ErrProcessDone + } + return env, err +} + +// requestGracefulTermination implements [procHandle]. 
+func (p *unixProcess) requestGracefulTermination() error { + if err := linux.SendSignal(p.pidDir, syscall.SIGTERM); errors.Is(err, syscall.ESRCH) { + return os.ErrProcessDone + } else if !errors.Is(err, errors.ErrUnsupported) { + return err + } + + if err := syscall.Kill(p.pid, syscall.SIGTERM); errors.Is(err, syscall.ESRCH) { + return os.ErrProcessDone + } else { + return err + } +} + +func (p *unixProcess) dir() *procfs.PIDDir { + return &procfs.PIDDir{FS: p.pidDir} +} diff --git a/pkg/supervisor/prochandle_unix.go b/pkg/supervisor/prochandle_unix.go index 028a3eba7aa3..0c0ed2df34a0 100644 --- a/pkg/supervisor/prochandle_unix.go +++ b/pkg/supervisor/prochandle_unix.go @@ -1,77 +1,17 @@ //go:build unix -// SPDX-FileCopyrightText: 2022 k0s authors +// SPDX-FileCopyrightText: 2025 k0s authors // SPDX-License-Identifier: Apache-2.0 package supervisor import ( - "errors" "fmt" "os" - "path/filepath" - "strconv" - "strings" "syscall" ) -type unixProcess struct { - // The PID that was used when opening the process. - // Note: Don't rely on [os.Process.Pid] here, as it's not thread safe. - pid int - process *os.Process -} - -func openPID(pid int) (procHandle, error) { - process, err := os.FindProcess(pid) - if err != nil { - return nil, err - } - - return &unixProcess{pid, process}, nil -} - -func (p *unixProcess) Close() error { - return p.process.Release() -} - -// cmdline implements [procHandle]. -func (p *unixProcess) cmdline() ([]string, error) { - cmdline, err := os.ReadFile(filepath.Join("/proc", strconv.Itoa(p.pid), "cmdline")) - if err != nil { - if errors.Is(err, os.ErrNotExist) { - return nil, fmt.Errorf("%w: %w", syscall.ESRCH, err) - } - return nil, fmt.Errorf("failed to read process cmdline: %w", err) - } - - return strings.Split(string(cmdline), "\x00"), nil -} - -// environ implements [procHandle]. 
-func (p *unixProcess) environ() ([]string, error) { - env, err := os.ReadFile(filepath.Join("/proc", strconv.Itoa(p.pid), "environ")) - if err != nil { - if errors.Is(err, os.ErrNotExist) { - return nil, fmt.Errorf("%w: %w", syscall.ESRCH, err) - } - return nil, fmt.Errorf("failed to read process environ: %w", err) - } - - return strings.Split(string(env), "\x00"), nil -} - -// requestGracefulShutdown implements [procHandle]. -func (p *unixProcess) requestGracefulShutdown() error { - return p.process.Signal(syscall.SIGTERM) -} - -// kill implements [procHandle]. -func (p *unixProcess) kill() error { - return p.process.Kill() -} - -func requestGracefulShutdown(p *os.Process) error { +func requestGracefulTermination(p *os.Process) error { if err := p.Signal(syscall.SIGTERM); err != nil { return fmt.Errorf("failed to send SIGTERM: %w", err) } diff --git a/pkg/supervisor/prochandle_windows.go b/pkg/supervisor/prochandle_windows.go index 1f4f369251e2..5a1d4190d8dd 100644 --- a/pkg/supervisor/prochandle_windows.go +++ b/pkg/supervisor/prochandle_windows.go @@ -15,12 +15,12 @@ func openPID(int) (procHandle, error) { return nil, syscall.EWINDOWS } -func requestGracefulShutdown(p *os.Process) error { +func requestGracefulTermination(p *os.Process) error { // According to https://stackoverflow.com/q/1798771/, the _only_ somewhat // straight-forward option is to send Ctrl+Break events to processes which // have been started with the CREATE_NEW_PROCESS_GROUP flag. Sending Ctrl+C // seems to require at least some helper process. If Ctrl+Break will - // _actually_ trigger a graceful process shutdown is dependent of the + // _actually_ trigger a graceful process termination is dependent of the // program being run. According to the above Stack Overflow question, this // is e.g. not the case for Python. 
// diff --git a/pkg/supervisor/supervisor.go b/pkg/supervisor/supervisor.go index 509100e3cb7e..919a74aed0da 100644 --- a/pkg/supervisor/supervisor.go +++ b/pkg/supervisor/supervisor.go @@ -8,6 +8,7 @@ import ( "errors" "fmt" "io" + "math" "os" "os/exec" "path" @@ -20,6 +21,7 @@ import ( "time" "github.com/sirupsen/logrus" + "k8s.io/apimachinery/pkg/util/wait" "github.com/k0sproject/k0s/internal/pkg/dir" "github.com/k0sproject/k0s/pkg/constant" @@ -66,9 +68,15 @@ func (s *Supervisor) processWaitQuit(ctx context.Context) bool { select { case <-ctx.Done(): for { - s.log.Info("Requesting graceful shutdown") - if err := requestGracefulShutdown(s.cmd.Process); err != nil { - s.log.WithError(err).Warn("Failed to request graceful shutdown") + s.log.Debug("Requesting graceful termination") + if err := requestGracefulTermination(s.cmd.Process); err != nil { + if errors.Is(err, os.ErrProcessDone) { + s.log.Info("Failed to request graceful termination: process has already terminated") + } else { + s.log.WithError(err).Error("Failed to request graceful termination") + } + } else { + s.log.Info("Requested graceful termination") } select { case <-time.After(s.TimeoutStop): @@ -106,7 +114,7 @@ func (s *Supervisor) Supervise() error { s.TimeoutRespawn = 5 * time.Second } - if err := s.maybeKillPidFile(); err != nil { + if err := s.maybeCleanupPIDFile(); err != nil { if !errors.Is(err, errors.ErrUnsupported) { return err } @@ -216,11 +224,11 @@ func (s *Supervisor) Stop() { } } -// maybeKillPidFile checks kills the process in the pidFile if it's has -// the same binary as the supervisor's and also checks that the env -// `_KOS_MANAGED=yes`. This function does not delete the old pidFile as -// this is done by the caller. -func (s *Supervisor) maybeKillPidFile() error { +// Checks if the process referenced in the PID file is a k0s-managed process. +// If so, requests graceful termination and waits for the process to terminate. 
+//
+// The PID file itself is not removed here; that is handled by the caller.
+func (s *Supervisor) maybeCleanupPIDFile() error {
 	pid, err := os.ReadFile(s.PidFile)
 	if os.IsNotExist(err) {
 		return nil
@@ -235,80 +243,58 @@ func (s *Supervisor) maybeKillPidFile() error {
 
 	ph, err := openPID(p)
 	if err != nil {
+		if errors.Is(err, syscall.ESRCH) {
+			return nil // no such process, nothing to clean up
+		}
 		return fmt.Errorf("cannot interact with PID %d from PID file %s: %w", p, s.PidFile, err)
 	}
 	defer ph.Close()
 
-	if err := s.killProcess(ph); err != nil {
-		return fmt.Errorf("failed to kill PID %d from PID file %s: %w", p, s.PidFile, err)
-	}
-
-	return nil
-}
-
-const exitCheckInterval = 200 * time.Millisecond
-
-// Tries to terminate a process gracefully. If it's still running after
-// s.TimeoutStop, the process is killed.
-func (s *Supervisor) killProcess(ph procHandle) error {
-	if shouldKill, err := s.shouldKillProcess(ph); err != nil || !shouldKill {
+	if managed, err := s.isK0sManaged(ph); err != nil {
+		if errors.Is(err, os.ErrProcessDone) {
+			return nil // the process is gone already, nothing to clean up
+		}
 		return err
-	}
-
-	if err := ph.requestGracefulShutdown(); errors.Is(err, syscall.ESRCH) {
+	} else if !managed { // not a k0s-managed process: leave it alone
 		return nil
-	} else if err != nil {
-		return fmt.Errorf("failed to request graceful termination: %w", err)
 	}
 
-	if terminate, err := s.waitForTermination(ph); err != nil || !terminate {
-		return err
-	}
-
-	if err := ph.kill(); errors.Is(err, syscall.ESRCH) {
-		return nil
-	} else if err != nil {
-		return fmt.Errorf("failed to kill: %w", err)
+	if err := s.terminateAndWait(ph); err != nil {
+		return fmt.Errorf("while waiting for termination of PID %d from PID file %s: %w", p, s.PidFile, err)
 	}
 
 	return nil
 }
 
-func (s *Supervisor) waitForTermination(ph procHandle) (bool, error) {
-	deadlineTimer := time.NewTimer(s.TimeoutStop)
-	defer deadlineTimer.Stop()
-	checkTicker := time.NewTicker(exitCheckInterval)
-	defer checkTicker.Stop()
-
-	for {
-		select {
-		case <-checkTicker.C:
-			if shouldKill, err := 
s.shouldKillProcess(ph); err != nil || !shouldKill {
-				return false, nil
-			}
-
-		case <-deadlineTimer.C:
-			return true, nil
+// Requests graceful termination of a process and waits for it to exit. If the
+// process is still running once s.TimeoutStop has elapsed, it returns an error
+// instead of forcefully killing the process.
+func (s *Supervisor) terminateAndWait(ph procHandle) error {
+	if err := ph.requestGracefulTermination(); err != nil {
+		if errors.Is(err, os.ErrProcessDone) {
+			return nil // already gone: nothing to wait for
 		}
+		return fmt.Errorf("failed to request graceful termination: %w", err)
 	}
+
+	errTimeout := errors.New("process did not terminate in time") // set as the cause of ctx when the timeout expires
+	ctx, cancel := context.WithTimeoutCause(context.TODO(), s.TimeoutStop, errTimeout)
+	defer cancel()
+	return s.awaitTermination(ctx, ph)
 }
 
-func (s *Supervisor) shouldKillProcess(ph procHandle) (bool, error) {
-	// only kill process if it has the expected cmd
+// Checks if the process handle refers to a k0s-managed process. A process is
+// considered k0s-managed if:
+//   - The first element of its command line, if any, equals s.BinPath.
+//   - The process environment contains `_K0S_MANAGED=yes`.
+func (s *Supervisor) isK0sManaged(ph procHandle) (bool, error) {
 	if cmd, err := ph.cmdline(); err != nil {
-		if errors.Is(err, syscall.ESRCH) {
-			return false, nil
-		}
 		return false, err
 	} else if len(cmd) > 0 && cmd[0] != s.BinPath {
 		return false, nil
 	}
 
-	// only kill process if it has the _KOS_MANAGED env set
 	if env, err := ph.environ(); err != nil {
-		if errors.Is(err, syscall.ESRCH) {
-			return false, nil
-		}
 		return false, err
 	} else if !slices.Contains(env, k0sManaged) {
 		return false, nil
@@ -317,6 +303,46 @@ func (s *Supervisor) shouldKillProcess(ph procHandle) (bool, error) {
 	return true, nil
 }
 
+// Polls the process handle until the process has terminated or ctx is done.
+// The polling interval starts at 25 ms and grows by a factor of 1.5 per poll
+// (plus up to 10% jitter) until it reaches 3 s. Returns nil once the process
+// has terminated, the error of a failed poll, or the cause of ctx if it's done.
+func (s *Supervisor) awaitTermination(ctx context.Context, ph procHandle) error {
+	s.log.Debug("Polling for process termination")
+
+	// The interval is clamped manually instead of via wait.Backoff.Cap: Once
+	// the cap is reached, the backoff's remaining steps are set to zero, which
+	// makes wait.ExponentialBackoffWithContext give up with a generic timeout
+	// error, even though ctx is not done yet.
+	const maxInterval = 3 * time.Second
+	backoff := wait.Backoff{
+		Duration: 25 * time.Millisecond,
+		Steps:    math.MaxInt32, // effectively unlimited; the interval only grows while steps remain
+		Factor:   1.5,
+		Jitter:   0.1,
+	}
+
+	for {
+		if ctx.Err() != nil {
+			return context.Cause(ctx)
+		}
+
+		if terminated, err := ph.hasTerminated(); err != nil || terminated {
+			return err
+		}
+
+		timer := time.NewTimer(backoff.Step())
+		backoff.Duration = min(backoff.Duration, maxInterval)
+
+		select {
+		case <-ctx.Done():
+			timer.Stop()
+			return context.Cause(ctx)
+		case <-timer.C:
+		}
+	}
+}
+
 // Prepare the env for exec:
 // - handle component specific env
 // - inject k0s embedded bins into path
diff --git a/pkg/supervisor/supervisor_test.go b/pkg/supervisor/supervisor_test.go
index f4c56b4bad76..09c76b5fd42f 100644
--- a/pkg/supervisor/supervisor_test.go
+++ b/pkg/supervisor/supervisor_test.go
@@ -225,7 +225,7 @@ func TestCleanupPIDFile_Gracefully(t *testing.T) {
 	case "windows":
 		t.Skip("PID file cleanup not yet implemented on Windows")
 	case "darwin":
-		t.Skip("FIXME: times out on macOS, needs debugging")
+		t.Skip("PID file cleanup not implemented on macOS")
 	}
 
 	// Start some k0s-managed process.
@@ -261,12 +261,12 @@ func TestCleanupPIDFile_Gracefully(t *testing.T) { assert.NoFileExists(t, pidFilePath) } -func TestCleanupPIDFile_Forcefully(t *testing.T) { +func TestCleanupPIDFile_LingeringProcess(t *testing.T) { switch runtime.GOOS { case "windows": t.Skip("PID file cleanup not yet implemented on Windows") case "darwin": - t.Skip("FIXME: times out on macOS, needs debugging") + t.Skip("PID file cleanup not implemented on macOS") } // Start some k0s-managed process that won't terminate gracefully. @@ -285,7 +285,7 @@ func TestCleanupPIDFile_Forcefully(t *testing.T) { BinPath: pingPong.BinPath(), RunDir: t.TempDir(), Args: pingPong.BinArgs(), - TimeoutStop: 1 * time.Second, + TimeoutStop: 10 * time.Millisecond, TimeoutRespawn: 1 * time.Hour, } @@ -293,17 +293,24 @@ func TestCleanupPIDFile_Forcefully(t *testing.T) { pidFilePath := filepath.Join(s.RunDir, s.Name+".pid") require.NoError(t, os.WriteFile(pidFilePath, fmt.Appendf(nil, "%d\n", prevCmd.Process.Pid), 0644)) - // Start to supervise the new process. - require.NoError(t, s.Supervise()) - t.Cleanup(s.Stop) + // Start to supervise the new process and expect it to fail because the + // previous process won't terminate. + err := s.Supervise() + if !assert.Error(t, err) { + s.Stop() + } else { + assert.ErrorContains(t, err, "while waiting for termination of PID") + assert.ErrorContains(t, err, pidFilePath) + assert.ErrorContains(t, err, "process did not terminate in time") + } - // Expect the previous process to be forcefully terminated. - assert.ErrorContains(t, prevCmd.Wait(), "signal: killed") + // Expect the previous process to still be alive. + require.NoError(t, prevPingPong.SendPong()) - // Stop the supervisor and check if the PID file is gone. - assert.NoError(t, pingPong.AwaitPing()) - s.Stop() - assert.NoFileExists(t, pidFilePath) + // PID file should still point to the previous PID. 
+ if pid, err := os.ReadFile(pidFilePath); assert.NoError(t, err) { + assert.Equal(t, fmt.Appendf(nil, "%d\n", prevCmd.Process.Pid), pid) + } } func TestCleanupPIDFile_WrongProcess(t *testing.T) { @@ -367,10 +374,6 @@ func TestCleanupPIDFile_NonexistingProcess(t *testing.T) { } func TestCleanupPIDFile_BogusPIDFile(t *testing.T) { - if runtime.GOOS == "windows" { - t.Skip("PID file cleanup not yet implemented on Windows") - } - // Prepare some supervised process that should never be started. s := Supervisor{ Name: t.Name(),