Skip to content
Merged
4 changes: 3 additions & 1 deletion docs/operator-manual/config-management-plugins.md
Original file line number Diff line number Diff line change
Expand Up @@ -326,7 +326,9 @@ If you don't need to set any environment variables, you can set an empty plugin
if version was mentioned in the `ConfigManagementPlugin` spec or else just use `<metadata.name>`. You can also remove the name altogether
and let the automatic discovery to identify the plugin.
!!! note
If a CMP renders blank manifests, and `prune` is set to `true`, Argo CD will automatically remove resources. CMP plugin authors should ensure errors are part of the exit code. Commonly something like `kustomize build . | cat` won't pass errors because of the pipe. Consider setting `set -o pipefail` so anything piped will pass errors on failure.
If a CMP renders blank manfiests, and `prune` is set to `true`, Argo CD will automatically remove resources. CMP plugin authors should ensure errors are part of the exit code. Commonly something like `kustomize build . | cat` won't pass errors because of the pipe. Consider setting `set -o pipefail` so anything piped will pass errors on failure.
!!! note
If a CMP command fails to gracefully exit on `ARGOCD_EXEC_TIMEOUT`, it will be forcefully killed after an additional timeout of `ARGOCD_EXEC_FATAL_TIMEOUT`.

## Debugging a CMP

Expand Down
2 changes: 2 additions & 0 deletions docs/operator-manual/high_availability.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@ and might fail. To avoid failed syncs use the `ARGOCD_GIT_ATTEMPTS_COUNT` enviro

* `argocd-repo-server` executes config management tools such as `helm` or `kustomize` and enforces a 90 second timeout. This timeout can be changed by using the `ARGOCD_EXEC_TIMEOUT` env variable. The value should be in the Go time duration string format, for example, `2m30s`.

* `argocd-repo-server` will issue a `SIGTERM` signal to a command that has elapsed the `ARGOCD_EXEC_TIMEOUT`. In most cases, well-behaved commands will exit immediately when receiving the signal. However, if this does not happen, `argocd-repo-server` will wait an additional timeout of `ARGOCD_EXEC_FATAL_TIMEOUT` and then forcefully exit the command with a `SIGKILL` to prevent stalling. Note that a failure to exit with `SIGTERM` is usually a bug in either the offending command or in the way `argocd-repo-server` calls it and should be reported to the issue tracker for further investigation.

**metrics:**

* `argocd_git_request_total` - Number of git requests. This metric provides two tags:
Expand Down
49 changes: 43 additions & 6 deletions util/exec/exec.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,9 @@ import (
)

var (
timeout time.Duration
Unredacted = Redact(nil)
timeout time.Duration
fatalTimeout time.Duration
Unredacted = Redact(nil)
)

type ExecRunOpts struct {
Expand All @@ -45,6 +46,10 @@ func initTimeout() {
if err != nil {
timeout = 90 * time.Second
}
fatalTimeout, err = time.ParseDuration(os.Getenv("ARGOCD_EXEC_FATAL_TIMEOUT"))
Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think this is something that should be exposed to users but in a followup I think we should add metrics for when the fatal SIGKILL is sent. My guess is that the SIGTERM stall is either some kind of race-condition corruption in argo multithreading or an actual rare race condition in git itself but I don't have a good idea of what to debug without further understanding of the issue.

if err != nil {
fatalTimeout = 10 * time.Second
}
}

func Run(cmd *exec.Cmd) (string, error) {
Expand All @@ -57,7 +62,7 @@ func RunWithRedactor(cmd *exec.Cmd, redactor func(text string) string) (string,
}

func RunWithExecRunOpts(cmd *exec.Cmd, opts ExecRunOpts) (string, error) {
cmdOpts := CmdOpts{Timeout: timeout, Redactor: opts.Redactor, TimeoutBehavior: opts.TimeoutBehavior, SkipErrorLogging: opts.SkipErrorLogging}
cmdOpts := CmdOpts{Timeout: timeout, FatalTimeout: fatalTimeout, Redactor: opts.Redactor, TimeoutBehavior: opts.TimeoutBehavior, SkipErrorLogging: opts.SkipErrorLogging}
span := tracing.NewLoggingTracer(log.NewLogrusLogger(log.NewWithCurrentConfig())).StartSpan(fmt.Sprintf("exec %v", cmd.Args[0]))
span.SetBaggageItem("dir", cmd.Dir)
if cmdOpts.Redactor != nil {
Expand Down Expand Up @@ -130,6 +135,8 @@ type TimeoutBehavior struct {
type CmdOpts struct {
// Timeout determines how long to wait for the command to exit
Timeout time.Duration
// FatalTimeout is the amount of additional time to wait after Timeout before fatal SIGKILL
FatalTimeout time.Duration
// Redactor redacts tokens from the output
Redactor func(text string) string
// TimeoutBehavior configures what to do in case of timeout
Expand All @@ -142,6 +149,7 @@ type CmdOpts struct {

var DefaultCmdOpts = CmdOpts{
Timeout: time.Duration(0),
FatalTimeout: time.Duration(0),
Redactor: Unredacted,
TimeoutBehavior: TimeoutBehavior{syscall.SIGKILL, false},
SkipErrorLogging: false,
Expand Down Expand Up @@ -189,30 +197,60 @@ func RunCommandExt(cmd *exec.Cmd, opts CmdOpts) (string, error) {
done := make(chan error)
go func() { done <- cmd.Wait() }()

// Start a timer
// Start timers for timeout
timeout := DefaultCmdOpts.Timeout
fatalTimeout := DefaultCmdOpts.FatalTimeout

if opts.Timeout != time.Duration(0) {
timeout = opts.Timeout
}

if opts.FatalTimeout != time.Duration(0) {
fatalTimeout = opts.FatalTimeout
}

var timoutCh <-chan time.Time
if timeout != 0 {
timoutCh = time.NewTimer(timeout).C
}

var fatalTimeoutCh <-chan time.Time
if fatalTimeout != 0 {
fatalTimeoutCh = time.NewTimer(timeout + fatalTimeout).C
}

timeoutBehavior := DefaultCmdOpts.TimeoutBehavior
fatalTimeoutBehaviour := syscall.SIGKILL
if opts.TimeoutBehavior.Signal != syscall.Signal(0) {
timeoutBehavior = opts.TimeoutBehavior
}

select {
// noinspection ALL
case <-timoutCh:
// send timeout signal
_ = cmd.Process.Signal(timeoutBehavior.Signal)
// wait on timeout signal and fallback to fatal timeout signal
if timeoutBehavior.ShouldWait {
<-done
select {
case <-done:
case <-fatalTimeoutCh:
// upgrades to SIGKILL if cmd does not respect SIGTERM
_ = cmd.Process.Signal(fatalTimeoutBehaviour)
// now original cmd should exit immediately after SIGKILL
<-done
// return error with a marker indicating that cmd exited only after fatal SIGKILL
output := stdout.String()
if opts.CaptureStderr {
output += stderr.String()
}
logCtx.WithFields(logrus.Fields{"duration": time.Since(start)}).Debug(redactor(output))
Copy link
Copy Markdown
Author

@hazel-sudz hazel-sudz Apr 18, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

some overlap here with the return logic below but with different error code. I'm open to refactoring the exit to a new function if this is desired but also fine leaving it as is since it's not too bad

err = newCmdError(redactor(args), fmt.Errorf("fatal timeout after %v", timeout+fatalTimeout), "")
logCtx.Error(err.Error())
return strings.TrimSuffix(output, "\n"), err
}
}
// either did not wait for timeout or cmd did respect SIGTERM
output := stdout.String()
if opts.CaptureStderr {
output += stderr.String()
Expand All @@ -235,7 +273,6 @@ func RunCommandExt(cmd *exec.Cmd, opts CmdOpts) (string, error) {
return strings.TrimSuffix(output, "\n"), err
}
}

output := stdout.String()
if opts.CaptureStderr {
output += stderr.String()
Expand Down
23 changes: 23 additions & 0 deletions util/exec/exec_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,14 @@ func Test_timeout(t *testing.T) {
t.Run("Default", func(t *testing.T) {
initTimeout()
assert.Equal(t, 90*time.Second, timeout)
assert.Equal(t, 10*time.Second, fatalTimeout)
})
t.Run("Default", func(t *testing.T) {
t.Setenv("ARGOCD_EXEC_TIMEOUT", "1s")
t.Setenv("ARGOCD_EXEC_FATAL_TIMEOUT", "2s")
initTimeout()
assert.Equal(t, 1*time.Second, timeout)
assert.Equal(t, 2*time.Second, fatalTimeout)
})
}

Expand All @@ -42,6 +45,7 @@ func TestHideUsernamePassword(t *testing.T) {
require.Error(t, err)
}

// This tests a cmd that properly handles a SIGTERM signal
func TestRunWithExecRunOpts(t *testing.T) {
t.Setenv("ARGOCD_EXEC_TIMEOUT", "200ms")
initTimeout()
Expand All @@ -56,6 +60,25 @@ func TestRunWithExecRunOpts(t *testing.T) {
assert.ErrorContains(t, err, "failed timeout after 200ms")
}

// This tests a mis-behaved cmd that stalls on SIGTERM and requires a SIGKILL
func TestRunWithExecRunOptsFatal(t *testing.T) {
t.Setenv("ARGOCD_EXEC_TIMEOUT", "200ms")
t.Setenv("ARGOCD_EXEC_FATAL_TIMEOUT", "100ms")

initTimeout()

opts := ExecRunOpts{
TimeoutBehavior: TimeoutBehavior{
Signal: syscall.SIGTERM,
ShouldWait: true,
},
}
// The returned error string in this case should contain a "fatal" in this case
Copy link
Copy Markdown
Author

@hazel-sudz hazel-sudz Apr 18, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

added test for new error flow case. previously this cmd would have forced argocd to deadlock.

_, err := RunWithExecRunOpts(exec.Command("sh", "-c", "trap 'trap - 15 && echo captured && sleep 10000' 15 && sleep 2"), opts)
// The expected timeout is ARGOCD_EXEC_TIMEOUT + ARGOCD_EXEC_FATAL_TIMEOUT = 200ms + 100ms = 300ms
Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

maybe ARGOCD_EXEC_FATAL_TIMEOUT is a little bit confusing naming since this is timeout in addition too ARGOCD_EXEC_TIMEOUT. open to suggestions on a better name here

assert.ErrorContains(t, err, "failed fatal timeout after 300ms")
}

func Test_getCommandArgsToLog(t *testing.T) {
t.Parallel()

Expand Down
Loading