diff --git a/docs/operator-manual/config-management-plugins.md b/docs/operator-manual/config-management-plugins.md index 48cc0d0ff871a..f8dcef3a6c36a 100644 --- a/docs/operator-manual/config-management-plugins.md +++ b/docs/operator-manual/config-management-plugins.md @@ -326,7 +326,9 @@ If you don't need to set any environment variables, you can set an empty plugin if version was mentioned in the `ConfigManagementPlugin` spec or else just use ``. You can also remove the name altogether and let the automatic discovery to identify the plugin. !!! note - If a CMP renders blank manifests, and `prune` is set to `true`, Argo CD will automatically remove resources. CMP plugin authors should ensure errors are part of the exit code. Commonly something like `kustomize build . | cat` won't pass errors because of the pipe. Consider setting `set -o pipefail` so anything piped will pass errors on failure. + If a CMP renders blank manfiests, and `prune` is set to `true`, Argo CD will automatically remove resources. CMP plugin authors should ensure errors are part of the exit code. Commonly something like `kustomize build . | cat` won't pass errors because of the pipe. Consider setting `set -o pipefail` so anything piped will pass errors on failure. +!!! note + If a CMP command fails to gracefully exit on `ARGOCD_EXEC_TIMEOUT`, it will be forcefully killed after an additional timeout of `ARGOCD_EXEC_FATAL_TIMEOUT`. ## Debugging a CMP diff --git a/docs/operator-manual/high_availability.md b/docs/operator-manual/high_availability.md index 599df0810c357..ee27ee406fa3f 100644 --- a/docs/operator-manual/high_availability.md +++ b/docs/operator-manual/high_availability.md @@ -34,6 +34,8 @@ and might fail. To avoid failed syncs use the `ARGOCD_GIT_ATTEMPTS_COUNT` enviro * `argocd-repo-server` executes config management tools such as `helm` or `kustomize` and enforces a 90 second timeout. This timeout can be changed by using the `ARGOCD_EXEC_TIMEOUT` env variable. The value should be in the Go time duration string format, for example, `2m30s`. +* `argocd-repo-server` will issue a `SIGTERM` signal to a command that has elapsed the `ARGOCD_EXEC_TIMEOUT`. In most cases, well-behaved commands will exit immediately when receiving the signal. However, if this does not happen, `argocd-repo-server` will wait an additional timeout of `ARGOCD_EXEC_FATAL_TIMEOUT` and then forcefully exit the command with a `SIGKILL` to prevent stalling. Note that a failure to exit with `SIGTERM` is usually a bug in either the offending command or in the way `argocd-repo-server` calls it and should be reported to the issue tracker for further investigation. + **metrics:** * `argocd_git_request_total` - Number of git requests. This metric provides two tags: diff --git a/util/exec/exec.go b/util/exec/exec.go index f2aa375300beb..0df69aeba7f9b 100644 --- a/util/exec/exec.go +++ b/util/exec/exec.go @@ -20,8 +20,9 @@ import ( ) var ( - timeout time.Duration - Unredacted = Redact(nil) + timeout time.Duration + fatalTimeout time.Duration + Unredacted = Redact(nil) ) type ExecRunOpts struct { @@ -45,6 +46,10 @@ func initTimeout() { if err != nil { timeout = 90 * time.Second } + fatalTimeout, err = time.ParseDuration(os.Getenv("ARGOCD_EXEC_FATAL_TIMEOUT")) + if err != nil { + fatalTimeout = 10 * time.Second + } } func Run(cmd *exec.Cmd) (string, error) { @@ -57,7 +62,7 @@ func RunWithRedactor(cmd *exec.Cmd, redactor func(text string) string) (string, } func RunWithExecRunOpts(cmd *exec.Cmd, opts ExecRunOpts) (string, error) { - cmdOpts := CmdOpts{Timeout: timeout, Redactor: opts.Redactor, TimeoutBehavior: opts.TimeoutBehavior, SkipErrorLogging: opts.SkipErrorLogging} + cmdOpts := CmdOpts{Timeout: timeout, FatalTimeout: fatalTimeout, Redactor: opts.Redactor, TimeoutBehavior: opts.TimeoutBehavior, SkipErrorLogging: opts.SkipErrorLogging} span := tracing.NewLoggingTracer(log.NewLogrusLogger(log.NewWithCurrentConfig())).StartSpan(fmt.Sprintf("exec %v", cmd.Args[0])) span.SetBaggageItem("dir", cmd.Dir) if cmdOpts.Redactor != nil { @@ -130,6 +135,8 @@ type TimeoutBehavior struct { type CmdOpts struct { // Timeout determines how long to wait for the command to exit Timeout time.Duration + // FatalTimeout is the amount of additional time to wait after Timeout before fatal SIGKILL + FatalTimeout time.Duration // Redactor redacts tokens from the output Redactor func(text string) string // TimeoutBehavior configures what to do in case of timeout @@ -142,6 +149,7 @@ type CmdOpts struct { var DefaultCmdOpts = CmdOpts{ Timeout: time.Duration(0), + FatalTimeout: time.Duration(0), Redactor: Unredacted, TimeoutBehavior: TimeoutBehavior{syscall.SIGKILL, false}, SkipErrorLogging: false, @@ -189,19 +197,30 @@ func RunCommandExt(cmd *exec.Cmd, opts CmdOpts) (string, error) { done := make(chan error) go func() { done <- cmd.Wait() }() - // Start a timer + // Start timers for timeout timeout := DefaultCmdOpts.Timeout + fatalTimeout := DefaultCmdOpts.FatalTimeout if opts.Timeout != time.Duration(0) { timeout = opts.Timeout } + if opts.FatalTimeout != time.Duration(0) { + fatalTimeout = opts.FatalTimeout + } + var timoutCh <-chan time.Time if timeout != 0 { timoutCh = time.NewTimer(timeout).C } + var fatalTimeoutCh <-chan time.Time + if fatalTimeout != 0 { + fatalTimeoutCh = time.NewTimer(timeout + fatalTimeout).C + } + timeoutBehavior := DefaultCmdOpts.TimeoutBehavior + fatalTimeoutBehaviour := syscall.SIGKILL if opts.TimeoutBehavior.Signal != syscall.Signal(0) { timeoutBehavior = opts.TimeoutBehavior } @@ -209,10 +228,29 @@ func RunCommandExt(cmd *exec.Cmd, opts CmdOpts) (string, error) { select { // noinspection ALL case <-timoutCh: + // send timeout signal _ = cmd.Process.Signal(timeoutBehavior.Signal) + // wait on timeout signal and fallback to fatal timeout signal if timeoutBehavior.ShouldWait { - <-done + select { + case <-done: + case <-fatalTimeoutCh: + // upgrades to SIGKILL if cmd does not respect SIGTERM + _ = cmd.Process.Signal(fatalTimeoutBehaviour) + // now original cmd should exit immediately after SIGKILL + <-done + // return error with a marker indicating that cmd exited only after fatal SIGKILL + output := stdout.String() + if opts.CaptureStderr { + output += stderr.String() + } + logCtx.WithFields(logrus.Fields{"duration": time.Since(start)}).Debug(redactor(output)) + err = newCmdError(redactor(args), fmt.Errorf("fatal timeout after %v", timeout+fatalTimeout), "") + logCtx.Error(err.Error()) + return strings.TrimSuffix(output, "\n"), err + } } + // either did not wait for timeout or cmd did respect SIGTERM output := stdout.String() if opts.CaptureStderr { output += stderr.String() @@ -235,7 +273,6 @@ func RunCommandExt(cmd *exec.Cmd, opts CmdOpts) (string, error) { return strings.TrimSuffix(output, "\n"), err } } - output := stdout.String() if opts.CaptureStderr { output += stderr.String() diff --git a/util/exec/exec_test.go b/util/exec/exec_test.go index 9bae21f4d847e..d8113ce260762 100644 --- a/util/exec/exec_test.go +++ b/util/exec/exec_test.go @@ -17,11 +17,14 @@ func Test_timeout(t *testing.T) { t.Run("Default", func(t *testing.T) { initTimeout() assert.Equal(t, 90*time.Second, timeout) + assert.Equal(t, 10*time.Second, fatalTimeout) }) t.Run("Default", func(t *testing.T) { t.Setenv("ARGOCD_EXEC_TIMEOUT", "1s") + t.Setenv("ARGOCD_EXEC_FATAL_TIMEOUT", "2s") initTimeout() assert.Equal(t, 1*time.Second, timeout) + assert.Equal(t, 2*time.Second, fatalTimeout) }) } @@ -42,6 +45,7 @@ func TestHideUsernamePassword(t *testing.T) { require.Error(t, err) } +// This tests a cmd that properly handles a SIGTERM signal func TestRunWithExecRunOpts(t *testing.T) { t.Setenv("ARGOCD_EXEC_TIMEOUT", "200ms") initTimeout() @@ -56,6 +60,25 @@ func TestRunWithExecRunOpts(t *testing.T) { assert.ErrorContains(t, err, "failed timeout after 200ms") } +// This tests a mis-behaved cmd that stalls on SIGTERM and requires a SIGKILL +func TestRunWithExecRunOptsFatal(t *testing.T) { + t.Setenv("ARGOCD_EXEC_TIMEOUT", "200ms") + t.Setenv("ARGOCD_EXEC_FATAL_TIMEOUT", "100ms") + + initTimeout() + + opts := ExecRunOpts{ + TimeoutBehavior: TimeoutBehavior{ + Signal: syscall.SIGTERM, + ShouldWait: true, + }, + } + // The returned error string in this case should contain a "fatal" in this case + _, err := RunWithExecRunOpts(exec.Command("sh", "-c", "trap 'trap - 15 && echo captured && sleep 10000' 15 && sleep 2"), opts) + // The expected timeout is ARGOCD_EXEC_TIMEOUT + ARGOCD_EXEC_FATAL_TIMEOUT = 200ms + 100ms = 300ms + assert.ErrorContains(t, err, "failed fatal timeout after 300ms") +} + func Test_getCommandArgsToLog(t *testing.T) { t.Parallel()