From bdb3409c59363882ddb9a85be55b530573d1a9d8 Mon Sep 17 00:00:00 2001 From: hc-github-team-nomad-core <82989552+hc-github-team-nomad-core@users.noreply.github.com> Date: Mon, 26 Sep 2022 16:09:31 -0400 Subject: [PATCH] client: recover from getter panics (#14696) (#14705) The artifact getter uses the go-getter library to fetch files from different sources. Any bug in this library that results in a panic can cause the entire Nomad client to crash due to a single file download attempt. This change aims to guard against this types of crashes by recovering from panics when the getter attempts to download an artifact. The resulting panic is converted to an error that is stored as a task event for operator visibility and the panic stack trace is logged to the client's log. --- .changelog/14696.txt | 3 +++ .../allocrunner/taskrunner/getter/getter.go | 21 ++++++++++++++-- .../taskrunner/getter/getter_test.go | 25 ++++++++++++++++++- .../allocrunner/taskrunner/getter/testing.go | 3 ++- client/client.go | 2 +- 5 files changed, 49 insertions(+), 5 deletions(-) create mode 100644 .changelog/14696.txt diff --git a/.changelog/14696.txt b/.changelog/14696.txt new file mode 100644 index 00000000000..f1b4af4b453 --- /dev/null +++ b/.changelog/14696.txt @@ -0,0 +1,3 @@ +```release-note:security +client: recover from panics caused by artifact download to prevent the Nomad client from crashing +``` diff --git a/client/allocrunner/taskrunner/getter/getter.go b/client/allocrunner/taskrunner/getter/getter.go index f1f42b1ce11..d46c730fcd7 100644 --- a/client/allocrunner/taskrunner/getter/getter.go +++ b/client/allocrunner/taskrunner/getter/getter.go @@ -5,10 +5,12 @@ import ( "fmt" "net/http" "net/url" + "runtime/debug" "strings" "github.com/hashicorp/go-cleanhttp" gg "github.com/hashicorp/go-getter" + "github.com/hashicorp/go-hclog" "github.com/hashicorp/nomad/client/config" "github.com/hashicorp/nomad/client/interfaces" @@ -22,6 +24,8 @@ const ( // Getter wraps go-getter calls in an artifact configuration. type Getter struct { + logger hclog.Logger + // httpClient is a shared HTTP client for use across all http/https // Getter instantiations. The HTTP client is designed to be // thread-safe, and using a pooled transport will help reduce excessive @@ -32,8 +36,9 @@ type Getter struct { // NewGetter returns a new Getter instance. This function is called once per // client and shared across alloc and task runners. -func NewGetter(config *config.ArtifactConfig) *Getter { +func NewGetter(logger hclog.Logger, config *config.ArtifactConfig) *Getter { return &Getter{ + logger: logger, httpClient: &http.Client{ Transport: cleanhttp.DefaultPooledTransport(), }, @@ -42,7 +47,19 @@ func NewGetter(config *config.ArtifactConfig) *Getter { } // GetArtifact downloads an artifact into the specified task directory. -func (g *Getter) GetArtifact(taskEnv interfaces.EnvReplacer, artifact *structs.TaskArtifact) error { +func (g *Getter) GetArtifact(taskEnv interfaces.EnvReplacer, artifact *structs.TaskArtifact) (returnErr error) { + // Recover from panics to avoid crashing the entire Nomad client due to + // artifact download failures, such as bugs in go-getter. + defer func() { + if r := recover(); r != nil { + g.logger.Error("panic while downloading artifact", + "artifact", artifact.GetterSource, + "error", r, + "stack", string(debug.Stack())) + returnErr = fmt.Errorf("getter panic: %v", r) + } + }() + ggURL, err := getGetterUrl(taskEnv, artifact) if err != nil { return newGetError(artifact.GetterSource, err, false) diff --git a/client/allocrunner/taskrunner/getter/getter_test.go b/client/allocrunner/taskrunner/getter/getter_test.go index b5d2c8289e9..92e7e364f44 100644 --- a/client/allocrunner/taskrunner/getter/getter_test.go +++ b/client/allocrunner/taskrunner/getter/getter_test.go @@ -16,6 +16,7 @@ import ( "time" gg "github.com/hashicorp/go-getter" + "github.com/hashicorp/go-hclog" clientconfig "github.com/hashicorp/nomad/client/config" "github.com/hashicorp/nomad/client/interfaces" "github.com/hashicorp/nomad/client/taskenv" @@ -56,6 +57,19 @@ func noopTaskEnv(taskDir string) interfaces.EnvReplacer { } } +// panicReplacer is a version of taskenv.TaskEnv.ReplaceEnv that panics. +type panicReplacer struct{} + +func (panicReplacer) ReplaceEnv(_ string) string { + panic("panic") +} +func (panicReplacer) ClientPath(_ string, _ bool) (string, bool) { + panic("panic") +} +func panicTaskEnv() interfaces.EnvReplacer { + return panicReplacer{} +} + // upperReplacer is a version of taskenv.TaskEnv.ReplaceEnv that upper-cases // the given input. type upperReplacer struct { @@ -76,7 +90,7 @@ func removeAllT(t *testing.T, path string) { } func TestGetter_getClient(t *testing.T) { - getter := NewGetter(&clientconfig.ArtifactConfig{ + getter := NewGetter(hclog.NewNullLogger(), &clientconfig.ArtifactConfig{ HTTPReadTimeout: time.Minute, HTTPMaxBytes: 100_000, GCSTimeout: 1 * time.Minute, @@ -464,6 +478,15 @@ func TestGetArtifact_Setuid(t *testing.T) { } } +// TestGetArtifact_handlePanic tests that a panic during the getter execution +// does not cause its goroutine to crash. +func TestGetArtifact_handlePanic(t *testing.T) { + getter := TestDefaultGetter(t) + err := getter.GetArtifact(panicTaskEnv(), &structs.TaskArtifact{}) + require.Error(t, err) + require.Contains(t, err.Error(), "panic") +} + func TestGetGetterUrl_Queries(t *testing.T) { cases := []struct { name string diff --git a/client/allocrunner/taskrunner/getter/testing.go b/client/allocrunner/taskrunner/getter/testing.go index c30b7fc5af6..9dc974802ca 100644 --- a/client/allocrunner/taskrunner/getter/testing.go +++ b/client/allocrunner/taskrunner/getter/testing.go @@ -6,6 +6,7 @@ package getter import ( "testing" + "github.com/hashicorp/go-hclog" clientconfig "github.com/hashicorp/nomad/client/config" "github.com/hashicorp/nomad/nomad/structs/config" "github.com/stretchr/testify/require" @@ -14,5 +15,5 @@ import ( func TestDefaultGetter(t *testing.T) *Getter { getterConf, err := clientconfig.ArtifactConfigFromAgent(config.DefaultArtifactConfig()) require.NoError(t, err) - return NewGetter(getterConf) + return NewGetter(hclog.NewNullLogger(), getterConf) } diff --git a/client/client.go b/client/client.go index c95807ff384..6a59669c0eb 100644 --- a/client/client.go +++ b/client/client.go @@ -379,7 +379,7 @@ func NewClient(cfg *config.Config, consulCatalog consul.CatalogAPI, consulProxie serversContactedCh: make(chan struct{}), serversContactedOnce: sync.Once{}, cpusetManager: cgutil.NewCpusetManager(cfg.CgroupParent, logger.Named("cpuset_manager")), - getter: getter.NewGetter(cfg.Artifact), + getter: getter.NewGetter(logger.Named("artifact_getter"), cfg.Artifact), EnterpriseClient: newEnterpriseClient(logger), }