-
Notifications
You must be signed in to change notification settings - Fork 7k
fix: add fatal timeout upgrade with SIGKILL to ARGO_EXEC_TIMEOUT (closes #20785, #18478) #22713
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
2148f16
ac9c4dc
d2aa6ca
478f866
349c5f9
4fee1b7
a1c653f
4c8df06
8b62cf5
a615cf4
32b44ae
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -20,8 +20,9 @@ import ( | |
| ) | ||
|
|
||
| var ( | ||
| timeout time.Duration | ||
| Unredacted = Redact(nil) | ||
| timeout time.Duration | ||
| fatalTimeout time.Duration | ||
| Unredacted = Redact(nil) | ||
| ) | ||
|
|
||
| type ExecRunOpts struct { | ||
|
|
@@ -45,6 +46,10 @@ func initTimeout() { | |
| if err != nil { | ||
| timeout = 90 * time.Second | ||
| } | ||
| fatalTimeout, err = time.ParseDuration(os.Getenv("ARGOCD_EXEC_FATAL_TIMEOUT")) | ||
| if err != nil { | ||
| fatalTimeout = 10 * time.Second | ||
| } | ||
| } | ||
|
|
||
| func Run(cmd *exec.Cmd) (string, error) { | ||
|
|
@@ -57,7 +62,7 @@ func RunWithRedactor(cmd *exec.Cmd, redactor func(text string) string) (string, | |
| } | ||
|
|
||
| func RunWithExecRunOpts(cmd *exec.Cmd, opts ExecRunOpts) (string, error) { | ||
| cmdOpts := CmdOpts{Timeout: timeout, Redactor: opts.Redactor, TimeoutBehavior: opts.TimeoutBehavior, SkipErrorLogging: opts.SkipErrorLogging} | ||
| cmdOpts := CmdOpts{Timeout: timeout, FatalTimeout: fatalTimeout, Redactor: opts.Redactor, TimeoutBehavior: opts.TimeoutBehavior, SkipErrorLogging: opts.SkipErrorLogging} | ||
| span := tracing.NewLoggingTracer(log.NewLogrusLogger(log.NewWithCurrentConfig())).StartSpan(fmt.Sprintf("exec %v", cmd.Args[0])) | ||
| span.SetBaggageItem("dir", cmd.Dir) | ||
| if cmdOpts.Redactor != nil { | ||
|
|
@@ -130,6 +135,8 @@ type TimeoutBehavior struct { | |
| type CmdOpts struct { | ||
| // Timeout determines how long to wait for the command to exit | ||
| Timeout time.Duration | ||
| // FatalTimeout is the amount of additional time to wait after Timeout before fatal SIGKILL | ||
| FatalTimeout time.Duration | ||
| // Redactor redacts tokens from the output | ||
| Redactor func(text string) string | ||
| // TimeoutBehavior configures what to do in case of timeout | ||
|
|
@@ -142,6 +149,7 @@ type CmdOpts struct { | |
|
|
||
| var DefaultCmdOpts = CmdOpts{ | ||
| Timeout: time.Duration(0), | ||
| FatalTimeout: time.Duration(0), | ||
| Redactor: Unredacted, | ||
| TimeoutBehavior: TimeoutBehavior{syscall.SIGKILL, false}, | ||
| SkipErrorLogging: false, | ||
|
|
@@ -189,30 +197,60 @@ func RunCommandExt(cmd *exec.Cmd, opts CmdOpts) (string, error) { | |
| done := make(chan error) | ||
| go func() { done <- cmd.Wait() }() | ||
|
|
||
| // Start a timer | ||
| // Start timers for timeout | ||
| timeout := DefaultCmdOpts.Timeout | ||
| fatalTimeout := DefaultCmdOpts.FatalTimeout | ||
|
|
||
| if opts.Timeout != time.Duration(0) { | ||
| timeout = opts.Timeout | ||
| } | ||
|
|
||
| if opts.FatalTimeout != time.Duration(0) { | ||
| fatalTimeout = opts.FatalTimeout | ||
| } | ||
|
|
||
| var timoutCh <-chan time.Time | ||
| if timeout != 0 { | ||
| timoutCh = time.NewTimer(timeout).C | ||
| } | ||
|
|
||
| var fatalTimeoutCh <-chan time.Time | ||
| if fatalTimeout != 0 { | ||
| fatalTimeoutCh = time.NewTimer(timeout + fatalTimeout).C | ||
| } | ||
|
|
||
| timeoutBehavior := DefaultCmdOpts.TimeoutBehavior | ||
| fatalTimeoutBehaviour := syscall.SIGKILL | ||
| if opts.TimeoutBehavior.Signal != syscall.Signal(0) { | ||
| timeoutBehavior = opts.TimeoutBehavior | ||
| } | ||
|
|
||
| select { | ||
| // noinspection ALL | ||
| case <-timoutCh: | ||
| // send timeout signal | ||
| _ = cmd.Process.Signal(timeoutBehavior.Signal) | ||
| // wait on timeout signal and fallback to fatal timeout signal | ||
| if timeoutBehavior.ShouldWait { | ||
| <-done | ||
| select { | ||
| case <-done: | ||
| case <-fatalTimeoutCh: | ||
| // upgrades to SIGKILL if cmd does not respect SIGTERM | ||
| _ = cmd.Process.Signal(fatalTimeoutBehaviour) | ||
| // now original cmd should exit immediately after SIGKILL | ||
| <-done | ||
| // return error with a marker indicating that cmd exited only after fatal SIGKILL | ||
| output := stdout.String() | ||
| if opts.CaptureStderr { | ||
| output += stderr.String() | ||
| } | ||
| logCtx.WithFields(logrus.Fields{"duration": time.Since(start)}).Debug(redactor(output)) | ||
|
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. some overlap here with the return logic below but with different error code. I'm open to refactoring the exit to a new function if this is desired but also fine leaving it as is since it's not too bad |
||
| err = newCmdError(redactor(args), fmt.Errorf("fatal timeout after %v", timeout+fatalTimeout), "") | ||
| logCtx.Error(err.Error()) | ||
| return strings.TrimSuffix(output, "\n"), err | ||
| } | ||
| } | ||
| // either did not wait for timeout or cmd did respect SIGTERM | ||
| output := stdout.String() | ||
| if opts.CaptureStderr { | ||
| output += stderr.String() | ||
|
|
@@ -235,7 +273,6 @@ func RunCommandExt(cmd *exec.Cmd, opts CmdOpts) (string, error) { | |
| return strings.TrimSuffix(output, "\n"), err | ||
| } | ||
| } | ||
|
|
||
| output := stdout.String() | ||
| if opts.CaptureStderr { | ||
| output += stderr.String() | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -17,11 +17,14 @@ func Test_timeout(t *testing.T) { | |
| t.Run("Default", func(t *testing.T) { | ||
| initTimeout() | ||
| assert.Equal(t, 90*time.Second, timeout) | ||
| assert.Equal(t, 10*time.Second, fatalTimeout) | ||
| }) | ||
| t.Run("Default", func(t *testing.T) { | ||
| t.Setenv("ARGOCD_EXEC_TIMEOUT", "1s") | ||
| t.Setenv("ARGOCD_EXEC_FATAL_TIMEOUT", "2s") | ||
| initTimeout() | ||
| assert.Equal(t, 1*time.Second, timeout) | ||
| assert.Equal(t, 2*time.Second, fatalTimeout) | ||
| }) | ||
| } | ||
|
|
||
|
|
@@ -42,6 +45,7 @@ func TestHideUsernamePassword(t *testing.T) { | |
| require.Error(t, err) | ||
| } | ||
|
|
||
| // This tests a cmd that properly handles a SIGTERM signal | ||
| func TestRunWithExecRunOpts(t *testing.T) { | ||
| t.Setenv("ARGOCD_EXEC_TIMEOUT", "200ms") | ||
| initTimeout() | ||
|
|
@@ -56,6 +60,25 @@ func TestRunWithExecRunOpts(t *testing.T) { | |
| assert.ErrorContains(t, err, "failed timeout after 200ms") | ||
| } | ||
|
|
||
| // This tests a mis-behaved cmd that stalls on SIGTERM and requires a SIGKILL | ||
| func TestRunWithExecRunOptsFatal(t *testing.T) { | ||
| t.Setenv("ARGOCD_EXEC_TIMEOUT", "200ms") | ||
| t.Setenv("ARGOCD_EXEC_FATAL_TIMEOUT", "100ms") | ||
|
|
||
| initTimeout() | ||
|
|
||
| opts := ExecRunOpts{ | ||
| TimeoutBehavior: TimeoutBehavior{ | ||
| Signal: syscall.SIGTERM, | ||
| ShouldWait: true, | ||
| }, | ||
| } | ||
| // The returned error string in this case should contain a "fatal" in this case | ||
|
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. added test for new error flow case. previously this cmd would have forced argocd to deadlock. |
||
| _, err := RunWithExecRunOpts(exec.Command("sh", "-c", "trap 'trap - 15 && echo captured && sleep 10000' 15 && sleep 2"), opts) | ||
| // The expected timeout is ARGOCD_EXEC_TIMEOUT + ARGOCD_EXEC_FATAL_TIMEOUT = 200ms + 100ms = 300ms | ||
|
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. maybe ARGOCD_EXEC_FATAL_TIMEOUT is a little bit confusing naming since this is timeout in addition too ARGOCD_EXEC_TIMEOUT. open to suggestions on a better name here |
||
| assert.ErrorContains(t, err, "failed fatal timeout after 300ms") | ||
| } | ||
|
|
||
| func Test_getCommandArgsToLog(t *testing.T) { | ||
| t.Parallel() | ||
|
|
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't think this is something that should be exposed to users but in a followup I think we should add metrics for when the fatal SIGKILL is sent. My guess is that the SIGTERM stall is either some kind of race-condition corruption in argo multithreading or an actual rare race condition in git itself but I don't have a good idea of what to debug without further understanding of the issue.