diff --git a/changelog/fragments/1754346816-fleet-enc-save-retries.yaml b/changelog/fragments/1754346816-fleet-enc-save-retries.yaml new file mode 100644 index 00000000000..e5cbb87b5b9 --- /dev/null +++ b/changelog/fragments/1754346816-fleet-enc-save-retries.yaml @@ -0,0 +1,36 @@ +# Kind can be one of: +# - breaking-change: a change to previously-documented behavior +# - deprecation: functionality that is being removed in a later release +# - bug-fix: fixes a problem in a previous version +# - enhancement: extends functionality but does not break or fix existing behavior +# - feature: new functionality +# - known-issue: problems that we are aware of in a given version +# - security: impacts on the security of a product or a user’s deployment. +# - upgrade: important information for someone upgrading from a prior version +# - other: does not fit into any of the other categories +kind: bug-fix + +# Change summary; a 80ish characters long description of the change. +summary: On Windows, retry saving the Agent information file to disk + +# Long description; in case the summary is not enough to describe the change +# this field accommodate a description without length limits. +# NOTE: This field will be rendered only for breaking-change and known-issue kinds at the moment. +description: > + Saving the Agent information file involves renaming/moving a file to its final + destination. However, on Windows, it is sometimes not possible to rename/move a + file to its destination file because the destination file is locked by another + process (e.g. antivirus software). For such situations, we now retry the save operation + on Windows. +# Affected component; usually one of "elastic-agent", "fleet-server", "filebeat", "metricbeat", "auditbeat", "all", etc. +component: elastic-agent + +# PR URL; optional; the PR number that added the changeset. +# If not present is automatically filled by the tooling finding the PR where this changelog fragment has been added. +# NOTE: the tooling supports backports, so it's able to fill the original PR number instead of the backport PR number. +# Please provide it if you are adding a fragment for a different PR. +pr: https://github.com/elastic/elastic-agent/pull/9224 + +# Issue URL; optional; the GitHub issue related to this changeset (either closes or is part of). +# If not present is automatically filled by the tooling with the issue linked to the PR number. +issue: https://github.com/elastic/elastic-agent/issues/5862 diff --git a/internal/pkg/agent/application/actions/handlers/handler_action_policy_change.go b/internal/pkg/agent/application/actions/handlers/handler_action_policy_change.go index 0efa5f0d918..634f2020e55 100644 --- a/internal/pkg/agent/application/actions/handlers/handler_action_policy_change.go +++ b/internal/pkg/agent/application/actions/handlers/handler_action_policy_change.go @@ -291,7 +291,7 @@ func (h *PolicyChangeHandler) handlePolicyChange(ctx context.Context, c *config. } // persist configuration - err = saveConfig(h.agentInfo, h.config, h.store) + err = saveConfig(h.agentInfo, h.config, h.store, h.log) if err != nil { return fmt.Errorf("saving config: %w", err) } @@ -379,7 +379,7 @@ func (h *PolicyChangeHandler) applyLoggingConfig(ctx context.Context, loggingCon return h.policyLogLevelSetter.SetLogLevel(ctx, policyLogLevel) } -func saveConfig(agentInfo info.Agent, validatedConfig *configuration.Configuration, store storage.Store) error { +func saveConfig(agentInfo info.Agent, validatedConfig *configuration.Configuration, store storage.Store, log *logger.Logger) error { if validatedConfig == nil { // nothing to do for fleet hosts return nil @@ -391,8 +391,7 @@ func saveConfig(agentInfo info.Agent, validatedConfig *configuration.Configurati errors.TypeUnexpected, errors.M("hosts", validatedConfig.Fleet.Client.Hosts)) } - err = store.Save(reader) - if err != nil { + if err := saveConfigToStore(store, reader, log); err != nil { return errors.New( err, "fail to persist new Fleet Server API client hosts", errors.TypeFilesystem, errors.M("hosts", validatedConfig.Fleet.Client.Hosts)) @@ -448,7 +447,7 @@ func clientEqual(k1 remote.Config, k2 remote.Config) bool { return true } -func fleetToReader(agentID string, headers map[string]string, cfg *configuration.Configuration) (io.Reader, error) { +func fleetToReader(agentID string, headers map[string]string, cfg *configuration.Configuration) (io.ReadSeeker, error) { configToStore := map[string]interface{}{ "fleet": cfg.Fleet, "agent": map[string]interface{}{ // Add event logging configuration here! diff --git a/internal/pkg/agent/application/actions/handlers/handler_helpers_other.go b/internal/pkg/agent/application/actions/handlers/handler_helpers_other.go new file mode 100644 index 00000000000..f314d25b103 --- /dev/null +++ b/internal/pkg/agent/application/actions/handlers/handler_helpers_other.go @@ -0,0 +1,20 @@ +// Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one +// or more contributor license agreements. Licensed under the Elastic License 2.0; +// you may not use this file except in compliance with the Elastic License 2.0. + +//go:build !windows + +package handlers + +import ( + "io" + + "github.com/elastic/elastic-agent/pkg/core/logger" + + "github.com/elastic/elastic-agent/internal/pkg/agent/storage" +) + +// saveConfigToStore saves the given configuration (reader) to the given store +func saveConfigToStore(store storage.Store, reader io.Reader, _ *logger.Logger) error { + return store.Save(reader) +} diff --git a/internal/pkg/agent/application/actions/handlers/handler_helpers_test.go b/internal/pkg/agent/application/actions/handlers/handler_helpers_test.go index 087c15331be..3cd2e378bb2 100644 --- a/internal/pkg/agent/application/actions/handlers/handler_helpers_test.go +++ b/internal/pkg/agent/application/actions/handlers/handler_helpers_test.go @@ -7,17 +7,23 @@ package handlers import ( "context" "math" + "math/rand/v2" + "os" + "path/filepath" + "strings" "testing" "time" + "github.com/google/go-cmp/cmp" + "github.com/google/go-cmp/cmp/cmpopts" + "github.com/stretchr/testify/require" + "github.com/elastic/elastic-agent-client/v7/pkg/proto" "github.com/elastic/elastic-agent-libs/logp" - "github.com/elastic/elastic-agent/internal/pkg/agent/errors" + "github.com/elastic/elastic-agent/internal/pkg/agent/storage" "github.com/elastic/elastic-agent/pkg/component" - - "github.com/google/go-cmp/cmp" - "github.com/google/go-cmp/cmp/cmpopts" + "github.com/elastic/elastic-agent/pkg/core/logger/loggertest" ) type testAction struct { @@ -215,3 +221,36 @@ func TestProxiedActionsNotifier(t *testing.T) { }) } } + +func TestSaveConfigToStore(t *testing.T) { + // Create destination file + tmpDir := t.TempDir() + dest := filepath.Join(tmpDir, "dest.txt") + err := os.WriteFile(dest, []byte("existing content"), 0644) + require.NoError(t, err) + + // Open handle on destination file and keep it open for a random duration + // between [200ms, 1800ms). + openDuration := time.Duration(200+rand.Int64N(1800-200)) * time.Millisecond + destFile, err := os.Open(dest) + require.NoError(t, err) + time.AfterFunc(openDuration, func() { + destFile.Close() + }) + defer destFile.Close() + + // Create disk store + store, err := storage.NewDiskStore(dest) + require.NoError(t, err) + + // Try to save content to store + reader := strings.NewReader("new content") + log, _ := loggertest.New("test") + err = saveConfigToStore(store, reader, log) + require.NoError(t, err) + + // Check that dest file has been replaced with new file + data, err := os.ReadFile(dest) + require.NoError(t, err) + require.Equal(t, "new content", string(data)) +} diff --git a/internal/pkg/agent/application/actions/handlers/handler_helpers_windows.go b/internal/pkg/agent/application/actions/handlers/handler_helpers_windows.go new file mode 100644 index 00000000000..306b52f4aa7 --- /dev/null +++ b/internal/pkg/agent/application/actions/handlers/handler_helpers_windows.go @@ -0,0 +1,56 @@ +// Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one +// or more contributor license agreements. Licensed under the Elastic License 2.0; +// you may not use this file except in compliance with the Elastic License 2.0. + +//go:build windows + +package handlers + +import ( + "context" + "errors" + "io" + "syscall" + "time" + + "github.com/cenkalti/backoff/v4" + + "github.com/elastic/elastic-agent/internal/pkg/agent/storage" + "github.com/elastic/elastic-agent/pkg/core/logger" +) + +const saveRetryInterval = 50 * time.Millisecond +const saveRetryDuration = 2 * time.Second + +// saveConfigToStore saves the given configuration (reader) to the given store. +// On Windows platforms, the save operation is retried if the error is an +// ACCESS_DENIED error, which can happen if the file is locked by another process. +func saveConfigToStore(store storage.Store, reader io.ReadSeeker, log *logger.Logger) error { + ctx, cancel := context.WithTimeout(context.Background(), saveRetryDuration) + defer cancel() + + bo := backoff.WithContext(backoff.NewConstantBackOff(saveRetryInterval), ctx) + + return backoff.Retry(func() error { + err := store.Save(reader) + if err == nil { + // Save succeeded + return nil + } + + if !errors.Is(err, syscall.ERROR_ACCESS_DENIED) { + // Save failed due to an error that is not ACCESS_DENIED. Immediately + // indicate failure without retrying further. + log.Debugf("Saving configuration to store failed: %v. Not retrying.", err) + return backoff.Permanent(err) + } + + if _, seekErr := reader.Seek(0, io.SeekStart); seekErr != nil { + log.Debugf("Saving configuration to store failed: %v. Failed to reset reader: %v. Not retrying.", err, seekErr) + return backoff.Permanent(errors.Join(err, seekErr)) + } + + log.Debugf("Saving configuration to store failed: %v. Retrying...", err) + return err + }, bo) +}