From ecb15b2e305a1cc61384a46f2fde0f553f49d94c Mon Sep 17 00:00:00 2001 From: reederc42 Date: Thu, 16 Jan 2025 14:46:27 -0800 Subject: [PATCH] Adds program to allow syscalls to be canceled --- Dockerfile | 2 + Makefile | 2 + cmd/syswrap/main.go | 76 +++++++++++++++++++ frontend/csi/node_server.go | 4 +- internal/syswrap/README.md | 33 ++++++++ internal/syswrap/syswrap.go | 14 ++++ internal/syswrap/syswrap_linux.go | 55 ++++++++++++++ internal/syswrap/syswrap_linux_test.go | 27 +++++++ internal/syswrap/unix/syscall.go | 27 +++++++ mocks/mock_utils/mock_osutils/mock_osutils.go | 15 ++++ utils/filesystem/filesystem_linux.go | 39 ++-------- utils/osutils/osutils.go | 17 ++++- 12 files changed, 272 insertions(+), 39 deletions(-) create mode 100644 cmd/syswrap/main.go create mode 100644 internal/syswrap/README.md create mode 100644 internal/syswrap/syswrap.go create mode 100644 internal/syswrap/syswrap_linux.go create mode 100644 internal/syswrap/syswrap_linux_test.go create mode 100644 internal/syswrap/unix/syscall.go diff --git a/Dockerfile b/Dockerfile index 7029bbb0c..e5af358bd 100644 --- a/Dockerfile +++ b/Dockerfile @@ -23,10 +23,12 @@ ARG BIN=trident_orchestrator ARG CLI_BIN=tridentctl ARG CHWRAP_BIN=chwrap.tar ARG NODE_PREP_BIN=node_prep +ARG SYSWRAP_BIN=syswrap COPY ${BIN} /trident_orchestrator COPY ${CLI_BIN} /bin/tridentctl COPY ${NODE_PREP_BIN} /node_prep +COPY ${SYSWRAP_BIN} /syswrap ADD ${CHWRAP_BIN} / ENTRYPOINT ["/bin/tridentctl"] diff --git a/Makefile b/Makefile index e5be3fbc6..82251c9c8 100644 --- a/Makefile +++ b/Makefile @@ -200,6 +200,7 @@ binaries_for_platform = $(call go_build,tridentctl,./cli,$1,$2)\ $(if $(findstring darwin,$1),,\ && $(call go_build,trident_orchestrator,.,$1,$2)\ $(if $(findstring linux,$1),\ + && $(call go_build,syswrap,./cmd/syswrap,$1,$2) \ && $(call chwrap_build,$1,$2) \ && $(call node_prep_build,$1,$2) )) @@ -239,6 +240,7 @@ docker_build_linux = $1 build \ --build-arg CLI_BIN=$(call binary_path,tridentctl,$2) \ --build-arg NODE_PREP_BIN=$(call binary_path,node_prep,$2) \ --build-arg CHWRAP_BIN=$(call binary_path,chwrap.tar,$2) \ + --build-arg SYSWRAP_BIN=${call binary_path,syswrap,$2} \ --tag $3 \ --rm \ $(if $(findstring $(DOCKER_BUILDX_BUILD_CLI),$1),--builder trident-builder) \ diff --git a/cmd/syswrap/main.go b/cmd/syswrap/main.go new file mode 100644 index 000000000..943e1f5b9 --- /dev/null +++ b/cmd/syswrap/main.go @@ -0,0 +1,76 @@ +package main + +import ( + "encoding/json" + "fmt" + "os" + "time" + + "github.com/netapp/trident/internal/syswrap/unix" +) + +var syscalls = map[string]func([]string) (interface{}, error){ + "statfs": unix.Statfs, + "exists": unix.Exists, +} + +type result struct { + output interface{} + err error +} + +func main() { + timeout, syscall, args, err := parseArgs(os.Args) + if err != nil { + exit(err) + } + + select { + case <-time.After(timeout): + exit(fmt.Errorf("timed out waiting for %s", syscall)) + case res := <-func() chan result { + r := make(chan result) + go func() { + s, ok := syscalls[syscall] + if !ok { + r <- result{ + err: fmt.Errorf("unknown syscall: %s", syscall), + } + } + i, e := s(args) + r <- result{ + output: i, + err: e, + } + }() + return r + }(): + if res.err != nil { + exit(res.err) + } + _ = json.NewEncoder(os.Stdout).Encode(res.output) + } +} + +func exit(err error) { + _, _ = fmt.Fprintf(os.Stderr, "error: %v", err) + os.Exit(1) +} + +func parseArgs(args []string) (timeout time.Duration, syscall string, syscallArgs []string, err error) { + if len(args) < 3 { + err = fmt.Errorf("expected at least 3 arguments") + return + } + timeout, err = time.ParseDuration(args[1]) + if err != nil { + return + } + + syscall = args[2] + + if len(args) > 3 { + syscallArgs = args[3:] + } + return +} diff --git a/frontend/csi/node_server.go b/frontend/csi/node_server.go index ef67aecb4..42d5ce5d4 100644 --- a/frontend/csi/node_server.go +++ b/frontend/csi/node_server.go @@ -24,6 +24,7 @@ import ( tridentconfig "github.com/netapp/trident/config" "github.com/netapp/trident/internal/fiji" + "github.com/netapp/trident/internal/syswrap" . "github.com/netapp/trident/logging" "github.com/netapp/trident/pkg/collection" "github.com/netapp/trident/pkg/convert" @@ -54,6 +55,7 @@ const ( maximumNodeReconciliationJitter = 5000 * time.Millisecond nvmeMaxFlushWaitDuration = 6 * time.Minute csiNodeLockTimeout = 60 * time.Second + fsUnavailableTimeout = 5 * time.Second ) var ( @@ -350,7 +352,7 @@ func (p *Plugin) NodeGetVolumeStats( } // Ensure volume is published at path - exists, err := p.osutils.PathExists(req.GetVolumePath()) + exists, err := syswrap.Exists(ctx, req.GetVolumePath(), fsUnavailableTimeout) if !exists || err != nil { return nil, status.Error(codes.NotFound, fmt.Sprintf("could not find volume mount at path: %s; %v", req.GetVolumePath(), err)) diff --git a/internal/syswrap/README.md b/internal/syswrap/README.md new file mode 100644 index 000000000..2fa2a3808 --- /dev/null +++ b/internal/syswrap/README.md @@ -0,0 +1,33 @@ +# syswrap + +`syswrap` is a small program to wrap system calls with timeouts. In Go, syscalls cannot be cleaned up if they are +blocked, for example if `lstat` is called on an inaccessible NFS mount, but because Go can create additional goroutines +the calling application will not block. This can cause resource exhaustion if syscalls are not bounded. + +Linux will clean up syscalls if the owning process ends, so `syswrap` exists to allow Trident to create a new process +that owns a potentially blocking syscall. + +The Trident-accessible interface is in this package. + +## Usage + +`syswrap ` + +`` is in Go format, i.e. `30s` + +`` is the name of the call, see `cmd/syswrap/main.go` + +`` are string representations of any arguments required by the call. + +## Adding Syscalls + +See `syswrap.Exists` for a cross-platform example, and `syswrap.Statfs` for a Linux-only example. + +There are 3 steps to adding a new syscall: + +1. Add func to `internal/syswrap` package. This func calls the syswrap binary, and it should also call the syscall +itself if the syswrap binary is not found. +2. Add func to `internal/syswrap/unix` package. The func must have the signature +`func(args []string) (output interface{}, err error)`, and parse its args then call the syscall. Any fields in output +that need to be used by Trident must be exported. +3. Add func name to `cmd/syswrap/main.syscalls` map. \ No newline at end of file diff --git a/internal/syswrap/syswrap.go b/internal/syswrap/syswrap.go new file mode 100644 index 000000000..3a11d7551 --- /dev/null +++ b/internal/syswrap/syswrap.go @@ -0,0 +1,14 @@ +//go:build windows || darwin + +package syswrap + +import ( + "context" + "os" + "time" +) + +func Exists(_ context.Context, path string, _ time.Duration) (bool, error) { + _, err := os.Stat(path) + return err == nil, nil +} diff --git a/internal/syswrap/syswrap_linux.go b/internal/syswrap/syswrap_linux.go new file mode 100644 index 000000000..a521b36a0 --- /dev/null +++ b/internal/syswrap/syswrap_linux.go @@ -0,0 +1,55 @@ +// Package syswrap wraps syscalls that need to be canceled +package syswrap + +import ( + "context" + "encoding/json" + "errors" + "os" + "time" + + "golang.org/x/sys/unix" + + "github.com/netapp/trident/utils/exec" +) + +const syswrapBin = "/syswrap" + +func Statfs(ctx context.Context, path string, timeout time.Duration) (unix.Statfs_t, error) { + buf, err := exec.NewCommand().Execute(ctx, syswrapBin, timeout.String(), "statfs", path) + if err != nil { + // If syswrap is unavailable fall back to blocking call. This may hang if NFS backend is unreachable + var pe *os.PathError + ok := errors.As(err, &pe) + if !ok { + return unix.Statfs_t{}, err + } + + var fsStat unix.Statfs_t + err = unix.Statfs(path, &fsStat) + return fsStat, err + } + + var b unix.Statfs_t + err = json.Unmarshal(buf, &b) + return b, err +} + +func Exists(ctx context.Context, path string, timeout time.Duration) (bool, error) { + buf, err := exec.NewCommand().Execute(ctx, syswrapBin, timeout.String(), "exists", path) + if err != nil { + // If syswrap is unavailable fall back to blocking call. This may hang if NFS backend is unreachable + var pe *os.PathError + ok := errors.As(err, &pe) + if !ok { + return false, err + } + + _, err = os.Stat(path) + return err == nil, err + } + + var b bool + err = json.Unmarshal(buf, &b) + return b, err +} diff --git a/internal/syswrap/syswrap_linux_test.go b/internal/syswrap/syswrap_linux_test.go new file mode 100644 index 000000000..7267e1441 --- /dev/null +++ b/internal/syswrap/syswrap_linux_test.go @@ -0,0 +1,27 @@ +package syswrap + +import ( + "context" + "errors" + "os" + "testing" + + "github.com/stretchr/testify/assert" + + "github.com/netapp/trident/utils/exec" +) + +// errors.Is should work to detect PathError from Execute, but it currently does not. If this test starts to fail +// then the errors.As calls in this package should be converted to errors.Is. +func TestSyswrapUnavailableRequiresAs(t *testing.T) { + _, err := os.Stat(syswrapBin) + assert.Error(t, err) + wd, err := os.Getwd() + assert.NoError(t, err) + _, err = exec.NewCommand().Execute(context.Background(), syswrapBin, "1s", "statfs", wd) + assert.Error(t, err) + assert.False(t, errors.Is(err, &os.PathError{})) + + var pe *os.PathError + assert.True(t, errors.As(err, &pe)) +} diff --git a/internal/syswrap/unix/syscall.go b/internal/syswrap/unix/syscall.go new file mode 100644 index 000000000..69f0aa06d --- /dev/null +++ b/internal/syswrap/unix/syscall.go @@ -0,0 +1,27 @@ +// Package unix parses string arguments and calls the system call +package unix + +import ( + "fmt" + "os" + + "golang.org/x/sys/unix" +) + +func Statfs(args []string) (interface{}, error) { + if len(args) != 1 { + return nil, fmt.Errorf("expected 1 argument") + } + var buf unix.Statfs_t + err := unix.Statfs(args[0], &buf) + return &buf, err +} + +func Exists(args []string) (interface{}, error) { + if len(args) != 1 { + return nil, fmt.Errorf("expected 1 argument") + } + _, err := os.Stat(args[0]) + exists := err == nil + return &exists, err +} diff --git a/mocks/mock_utils/mock_osutils/mock_osutils.go b/mocks/mock_utils/mock_osutils/mock_osutils.go index aca4c396a..e96590829 100644 --- a/mocks/mock_utils/mock_osutils/mock_osutils.go +++ b/mocks/mock_utils/mock_osutils/mock_osutils.go @@ -158,6 +158,21 @@ func (mr *MockUtilsMockRecorder) PathExists(arg0 any) *gomock.Call { return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "PathExists", reflect.TypeOf((*MockUtils)(nil).PathExists), arg0) } +// PathExistsWithTimeout mocks base method. +func (m *MockUtils) PathExistsWithTimeout(arg0 context.Context, arg1 string, arg2 time.Duration) (bool, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "PathExistsWithTimeout", arg0, arg1, arg2) + ret0, _ := ret[0].(bool) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// PathExistsWithTimeout indicates an expected call of PathExistsWithTimeout. +func (mr *MockUtilsMockRecorder) PathExistsWithTimeout(arg0, arg1, arg2 any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "PathExistsWithTimeout", reflect.TypeOf((*MockUtils)(nil).PathExistsWithTimeout), arg0, arg1, arg2) +} + // ServiceActiveOnHost mocks base method. func (m *MockUtils) ServiceActiveOnHost(arg0 context.Context, arg1 string) (bool, error) { m.ctrl.T.Helper() diff --git a/utils/filesystem/filesystem_linux.go b/utils/filesystem/filesystem_linux.go index 012671ad6..a30685ecd 100644 --- a/utils/filesystem/filesystem_linux.go +++ b/utils/filesystem/filesystem_linux.go @@ -9,18 +9,12 @@ import ( "fmt" "time" - "golang.org/x/sys/unix" - + "github.com/netapp/trident/internal/syswrap" . "github.com/netapp/trident/logging" "github.com/netapp/trident/utils/errors" "github.com/netapp/trident/utils/models" ) -type statFSResult struct { - Output unix.Statfs_t - Error error -} - // GetFilesystemStats returns the size of the filesystem for the given path. // The caller of the func is responsible for verifying the mountPoint existence and readiness. func (f *FSClient) GetFilesystemStats( @@ -29,35 +23,12 @@ func (f *FSClient) GetFilesystemStats( Logc(ctx).Debug(">>>> filesystem_linux.GetFilesystemStats") defer Logc(ctx).Debug("<<<< filesystem_linux.GetFilesystemStats") - timedOut := false - timeout := 30 * time.Second - done := make(chan statFSResult, 1) - var result statFSResult - - go func() { - // Warning: syscall.Statfs_t uses types that are OS and arch dependent. The following code has been - // confirmed to work with Linux/amd64 and Darwin/amd64. - var buf unix.Statfs_t - err := unix.Statfs(path, &buf) - done <- statFSResult{Output: buf, Error: err} - }() - - select { - case <-time.After(timeout): - timedOut = true - case result = <-done: - break - } - - if result.Error != nil { - Logc(ctx).WithField("path", path).Errorf("Failed to statfs: %s", result.Error) - return 0, 0, 0, 0, 0, 0, fmt.Errorf("couldn't get filesystem stats %s: %s", path, result.Error) - } else if timedOut { - Logc(ctx).WithField("path", path).Errorf("Failed to statfs due to timeout") - return 0, 0, 0, 0, 0, 0, fmt.Errorf("couldn't get filesystem stats %s: timeout", path) + buf, err := syswrap.Statfs(ctx, path, 30*time.Second) + if err != nil { + Logc(ctx).WithField("path", path).Errorf("Failed to statfs: %s", err) + return 0, 0, 0, 0, 0, 0, fmt.Errorf("couldn't get filesystem stats %s: %s", path, err) } - buf := result.Output //goland:noinspection GoRedundantConversion size := int64(buf.Blocks) * int64(buf.Bsize) diff --git a/utils/osutils/osutils.go b/utils/osutils/osutils.go index 76f4bd129..644d3b709 100644 --- a/utils/osutils/osutils.go +++ b/utils/osutils/osutils.go @@ -17,6 +17,7 @@ import ( "github.com/cenkalti/backoff/v4" "github.com/spf13/afero" + "github.com/netapp/trident/internal/syswrap" . "github.com/netapp/trident/logging" "github.com/netapp/trident/utils/exec" "github.com/netapp/trident/utils/models" @@ -37,6 +38,7 @@ type Utils interface { GetHostSystemInfo(ctx context.Context) (*models.HostSystem, error) GetIPAddresses(ctx context.Context) ([]string, error) PathExists(path string) (bool, error) + PathExistsWithTimeout(ctx context.Context, path string, timeout time.Duration) (bool, error) IsLikelyDir(mountpoint string) (bool, error) DeleteResourceAtPath(ctx context.Context, resource string) error WaitForResourceDeletionAtPath(ctx context.Context, resource string, maxDuration time.Duration) error @@ -107,11 +109,18 @@ func (o *OSUtils) GetIPAddresses(ctx context.Context) ([]string, error) { return ipAddrs, nil } +// PathExists returns if path exists. This should only be used if the call will not block if the path or volume is +// inaccessible. func (o *OSUtils) PathExists(path string) (bool, error) { - if _, err := o.osFs.Stat(path); err == nil { - return true, nil - } - return false, nil + _, err := o.osFs.Stat(path) + return err == nil, nil +} + +// PathExistsWithTimeout returns if path exists, and can return a timeout error on linux; windows ignores the timeout. +// Context timeouts may be ignored and are handled by the underlying implementation. This should be used instead of +// PathExists if there is a chance the call will block if the path or volume is inaccessible, such as on an NFS mount. +func (o *OSUtils) PathExistsWithTimeout(ctx context.Context, path string, timeout time.Duration) (bool, error) { + return syswrap.Exists(ctx, path, timeout) } // EnsureFileExists makes sure that file of given name exists