diff --git a/.changelog/14696.txt b/.changelog/14696.txt new file mode 100644 index 00000000000..f1b4af4b453 --- /dev/null +++ b/.changelog/14696.txt @@ -0,0 +1,3 @@ +```release-note:security +client: recover from panics caused by artifact download to prevent the Nomad client from crashing +``` diff --git a/client/allocrunner/taskrunner/getter/getter.go b/client/allocrunner/taskrunner/getter/getter.go index f1f42b1ce11..d46c730fcd7 100644 --- a/client/allocrunner/taskrunner/getter/getter.go +++ b/client/allocrunner/taskrunner/getter/getter.go @@ -5,10 +5,12 @@ import ( "fmt" "net/http" "net/url" + "runtime/debug" "strings" "github.com/hashicorp/go-cleanhttp" gg "github.com/hashicorp/go-getter" + "github.com/hashicorp/go-hclog" "github.com/hashicorp/nomad/client/config" "github.com/hashicorp/nomad/client/interfaces" @@ -22,6 +24,8 @@ const ( // Getter wraps go-getter calls in an artifact configuration. type Getter struct { + logger hclog.Logger + // httpClient is a shared HTTP client for use across all http/https // Getter instantiations. The HTTP client is designed to be // thread-safe, and using a pooled transport will help reduce excessive @@ -32,8 +36,9 @@ type Getter struct { // NewGetter returns a new Getter instance. This function is called once per // client and shared across alloc and task runners. -func NewGetter(config *config.ArtifactConfig) *Getter { +func NewGetter(logger hclog.Logger, config *config.ArtifactConfig) *Getter { return &Getter{ + logger: logger, httpClient: &http.Client{ Transport: cleanhttp.DefaultPooledTransport(), }, @@ -42,7 +47,19 @@ func NewGetter(config *config.ArtifactConfig) *Getter { } // GetArtifact downloads an artifact into the specified task directory. -func (g *Getter) GetArtifact(taskEnv interfaces.EnvReplacer, artifact *structs.TaskArtifact) error { +func (g *Getter) GetArtifact(taskEnv interfaces.EnvReplacer, artifact *structs.TaskArtifact) (returnErr error) { + // Recover from panics to avoid crashing the entire Nomad client due to + // artifact download failures, such as bugs in go-getter. + defer func() { + if r := recover(); r != nil { + g.logger.Error("panic while downloading artifact", + "artifact", artifact.GetterSource, + "error", r, + "stack", string(debug.Stack())) + returnErr = fmt.Errorf("getter panic: %v", r) + } + }() + ggURL, err := getGetterUrl(taskEnv, artifact) if err != nil { return newGetError(artifact.GetterSource, err, false) diff --git a/client/allocrunner/taskrunner/getter/getter_test.go b/client/allocrunner/taskrunner/getter/getter_test.go index b5d2c8289e9..92e7e364f44 100644 --- a/client/allocrunner/taskrunner/getter/getter_test.go +++ b/client/allocrunner/taskrunner/getter/getter_test.go @@ -16,6 +16,7 @@ import ( "time" gg "github.com/hashicorp/go-getter" + "github.com/hashicorp/go-hclog" clientconfig "github.com/hashicorp/nomad/client/config" "github.com/hashicorp/nomad/client/interfaces" "github.com/hashicorp/nomad/client/taskenv" @@ -56,6 +57,19 @@ func noopTaskEnv(taskDir string) interfaces.EnvReplacer { } } +// panicReplacer is a version of taskenv.TaskEnv.ReplaceEnv that panics. +type panicReplacer struct{} + +func (panicReplacer) ReplaceEnv(_ string) string { + panic("panic") +} +func (panicReplacer) ClientPath(_ string, _ bool) (string, bool) { + panic("panic") +} +func panicTaskEnv() interfaces.EnvReplacer { + return panicReplacer{} +} + // upperReplacer is a version of taskenv.TaskEnv.ReplaceEnv that upper-cases // the given input. type upperReplacer struct { @@ -76,7 +90,7 @@ func removeAllT(t *testing.T, path string) { } func TestGetter_getClient(t *testing.T) { - getter := NewGetter(&clientconfig.ArtifactConfig{ + getter := NewGetter(hclog.NewNullLogger(), &clientconfig.ArtifactConfig{ HTTPReadTimeout: time.Minute, HTTPMaxBytes: 100_000, GCSTimeout: 1 * time.Minute, @@ -464,6 +478,15 @@ func TestGetArtifact_Setuid(t *testing.T) { } } +// TestGetArtifact_handlePanic tests that a panic during the getter execution +// does not cause its goroutine to crash. +func TestGetArtifact_handlePanic(t *testing.T) { + getter := TestDefaultGetter(t) + err := getter.GetArtifact(panicTaskEnv(), &structs.TaskArtifact{}) + require.Error(t, err) + require.Contains(t, err.Error(), "panic") +} + func TestGetGetterUrl_Queries(t *testing.T) { cases := []struct { name string diff --git a/client/allocrunner/taskrunner/getter/testing.go b/client/allocrunner/taskrunner/getter/testing.go index c30b7fc5af6..9dc974802ca 100644 --- a/client/allocrunner/taskrunner/getter/testing.go +++ b/client/allocrunner/taskrunner/getter/testing.go @@ -6,6 +6,7 @@ package getter import ( "testing" + "github.com/hashicorp/go-hclog" clientconfig "github.com/hashicorp/nomad/client/config" "github.com/hashicorp/nomad/nomad/structs/config" "github.com/stretchr/testify/require" @@ -14,5 +15,5 @@ import ( func TestDefaultGetter(t *testing.T) *Getter { getterConf, err := clientconfig.ArtifactConfigFromAgent(config.DefaultArtifactConfig()) require.NoError(t, err) - return NewGetter(getterConf) + return NewGetter(hclog.NewNullLogger(), getterConf) } diff --git a/client/client.go b/client/client.go index c95807ff384..6a59669c0eb 100644 --- a/client/client.go +++ b/client/client.go @@ -379,7 +379,7 @@ func NewClient(cfg *config.Config, consulCatalog consul.CatalogAPI, consulProxie serversContactedCh: make(chan struct{}), serversContactedOnce: sync.Once{}, cpusetManager: cgutil.NewCpusetManager(cfg.CgroupParent, logger.Named("cpuset_manager")), - getter: getter.NewGetter(cfg.Artifact), + getter: getter.NewGetter(logger.Named("artifact_getter"), cfg.Artifact), EnterpriseClient: newEnterpriseClient(logger), }