-
Notifications
You must be signed in to change notification settings - Fork 68
feat (postStart) : Allow debugging poststart failures with sleep by trapping errors #1522
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -15,9 +15,12 @@ package lifecycle | |
|
|
||
| import ( | ||
| "fmt" | ||
| "regexp" | ||
| "strings" | ||
| "time" | ||
|
|
||
| "github.com/go-logr/logr" | ||
|
|
||
| dw "github.com/devfile/api/v2/pkg/apis/workspaces/v1alpha2" | ||
| corev1 "k8s.io/api/core/v1" | ||
| "sigs.k8s.io/controller-runtime/pkg/log" | ||
|
|
@@ -41,7 +44,9 @@ const ( | |
| ` | ||
| ) | ||
|
|
||
| func AddPostStartLifecycleHooks(wksp *dw.DevWorkspaceTemplateSpec, containers []corev1.Container, postStartTimeout string) error { | ||
| var trapErrRegex = regexp.MustCompile(`\btrap\b.*\bERR\b`) | ||
|
|
||
| func AddPostStartLifecycleHooks(wksp *dw.DevWorkspaceTemplateSpec, containers []corev1.Container, postStartTimeout string, postStartDebugTrapSleepDuration string) error { | ||
| if wksp.Events == nil || len(wksp.Events.PostStart) == 0 { | ||
| return nil | ||
| } | ||
|
|
@@ -69,7 +74,7 @@ func AddPostStartLifecycleHooks(wksp *dw.DevWorkspaceTemplateSpec, containers [] | |
| return fmt.Errorf("failed to process postStart event %s: %w", commands[0].Id, err) | ||
| } | ||
|
|
||
| postStartHandler, err := processCommandsForPostStart(commands, postStartTimeout) | ||
| postStartHandler, err := processCommandsForPostStart(commands, postStartTimeout, postStartDebugTrapSleepDuration) | ||
| if err != nil { | ||
| return fmt.Errorf("failed to process postStart event %s: %w", commands[0].Id, err) | ||
| } | ||
|
|
@@ -85,10 +90,10 @@ func AddPostStartLifecycleHooks(wksp *dw.DevWorkspaceTemplateSpec, containers [] | |
|
|
||
| // processCommandsForPostStart processes a list of DevWorkspace commands | ||
| // and generates a corev1.LifecycleHandler for the PostStart lifecycle hook. | ||
| func processCommandsForPostStart(commands []dw.Command, postStartTimeout string) (*corev1.LifecycleHandler, error) { | ||
| func processCommandsForPostStart(commands []dw.Command, postStartTimeout string, postStartDebugTrapSleepDuration string) (*corev1.LifecycleHandler, error) { | ||
| if postStartTimeout == "" { | ||
| // use the fallback if no timeout propagated | ||
| return processCommandsWithoutTimeoutFallback(commands) | ||
| return processCommandsWithoutTimeoutFallback(commands, postStartDebugTrapSleepDuration) | ||
| } | ||
|
|
||
| originalUserScript, err := buildUserScript(commands) | ||
|
|
@@ -101,7 +106,7 @@ func processCommandsForPostStart(commands []dw.Command, postStartTimeout string) | |
| scriptToExecute := "set -e\n" + originalUserScript | ||
| escapedUserScriptForTimeoutWrapper := strings.ReplaceAll(scriptToExecute, "'", `'\''`) | ||
|
|
||
| fullScriptWithTimeout := generateScriptWithTimeout(escapedUserScriptForTimeoutWrapper, postStartTimeout) | ||
| fullScriptWithTimeout := generateScriptWithTimeout(escapedUserScriptForTimeoutWrapper, postStartTimeout, postStartDebugTrapSleepDuration) | ||
|
|
||
| finalScriptForHook := fmt.Sprintf(redirectOutputFmt, fullScriptWithTimeout) | ||
|
|
||
|
|
@@ -128,8 +133,10 @@ func processCommandsForPostStart(commands []dw.Command, postStartTimeout string) | |
| // - | | ||
| // cd <workingDir> | ||
| // <commandline> | ||
| func processCommandsWithoutTimeoutFallback(commands []dw.Command) (*corev1.LifecycleHandler, error) { | ||
| func processCommandsWithoutTimeoutFallback(commands []dw.Command, postStartDebugTrapSleepDuration string) (*corev1.LifecycleHandler, error) { | ||
| var dwCommands []string | ||
| postStartFailureDebugSleepSeconds := parsePostStartFailureDebugSleepDurationToSeconds(log.Log, postStartDebugTrapSleepDuration) | ||
| hasErrTrapInUserScript := false | ||
| for _, command := range commands { | ||
| execCmd := command.Exec | ||
| if len(execCmd.Env) > 0 { | ||
|
|
@@ -139,6 +146,21 @@ func processCommandsWithoutTimeoutFallback(commands []dw.Command) (*corev1.Lifec | |
| dwCommands = append(dwCommands, fmt.Sprintf("cd %s", execCmd.WorkingDir)) | ||
| } | ||
| dwCommands = append(dwCommands, execCmd.CommandLine) | ||
| if trapErrRegex.MatchString(execCmd.CommandLine) { | ||
| hasErrTrapInUserScript = true | ||
| } | ||
| } | ||
|
|
||
| if postStartFailureDebugSleepSeconds > 0 && !hasErrTrapInUserScript { | ||
| debugTrap := fmt.Sprintf(` | ||
| trap 'echo "[postStart] failure encountered, sleep for debugging"; sleep %d' ERR | ||
| `, postStartFailureDebugSleepSeconds) | ||
| debugTrapLine := strings.ReplaceAll(strings.TrimSpace(debugTrap), "\n", " ") | ||
|
|
||
| dwCommands = append([]string{ | ||
| "set -e", | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What is the purpose of the
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
I checked if we can remove apiVersion: workspace.devfile.io/v1alpha2
kind: DevWorkspace
metadata:
name: dig-fail-debug
annotations:
controller.devfile.io/debug-start: "true"
spec:
started: true
template:
components:
- name: tools
container:
image: quay.io/devfile/universal-developer-image:ubi9-latest
mountSources: false
command: ["tail"]
args: ["-f", "/dev/null"]
commands:
- id: poststart-wrapper
exec:
component: tools
commandLine: |
echo "Start"
wget 'https://wrongexample.com' # should fail
echo "After failure"
events:
postStart:
- poststart-wrapperWhen I tested with a version that had $ oc get pods
NAME READY STATUS RESTARTS AGE
workspacef6c14b383c1240e6-7f98c9ffb7-t6m6g 1/1 Running 0 7m54s
$ kubectl exec -it pod/workspacef6c14b383c1240e6-7f98c9ffb7-t6m6g -- /bin/bash
projects $ cat /tmp/poststart-stderr.txt
--2025-11-05 09:53:47-- https://wrongexample.com/
Resolving wrongexample.com (wrongexample.com)... 172.67.177.31, 104.21.83.138, 2606:4700:3033::6815:538a, ...
Connecting to wrongexample.com (wrongexample.com)|172.67.177.31|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [text/html]
index.html: Permission denied
Cannot write to 'index.html' (No such file or directory).
projects $ cat /tmp/poststart-stdout.txt
Start
[postStart] failure encountered, sleep for debugging
After failure
projects $ exit
exitWhen I tested with oc get pods
NAME READY STATUS RESTARTS AGE
workspace50b3197366ca4c18-84cfdf596c-8rfz5 0/1 PostStartHookError 0 7m43s |
||
| debugTrapLine, | ||
| }, dwCommands...) | ||
| } | ||
|
|
||
| joinedCommands := strings.Join(dwCommands, "\n") | ||
|
|
@@ -187,7 +209,7 @@ func buildUserScript(commands []dw.Command) (string, error) { | |
| // environment variable exports, and specific exit code handling. | ||
| // The killAfterDurationSeconds is hardcoded to 5s within this generated script. | ||
| // It conditionally prefixes the user script with the timeout command if available. | ||
| func generateScriptWithTimeout(escapedUserScript string, postStartTimeout string) string { | ||
| func generateScriptWithTimeout(escapedUserScript string, postStartTimeout string, postStartDebugTrapSleepDuration string) string { | ||
| // Convert `postStartTimeout` into the `timeout` format | ||
| var timeoutSeconds int64 | ||
| if postStartTimeout != "" && postStartTimeout != "0" { | ||
|
|
@@ -199,10 +221,12 @@ func generateScriptWithTimeout(escapedUserScript string, postStartTimeout string | |
| timeoutSeconds = int64(duration.Seconds()) | ||
| } | ||
| } | ||
| postStartFailureDebugSleepSeconds := parsePostStartFailureDebugSleepDurationToSeconds(log.Log, postStartDebugTrapSleepDuration) | ||
|
|
||
| return fmt.Sprintf(` | ||
| export POSTSTART_TIMEOUT_DURATION="%d" | ||
| export POSTSTART_KILL_AFTER_DURATION="5" | ||
| export DEBUG_ENABLED="%t" | ||
|
|
||
| _TIMEOUT_COMMAND_PART="" | ||
| _WAS_TIMEOUT_USED="false" # Use strings "true" or "false" for shell boolean | ||
|
|
@@ -219,6 +243,11 @@ fi | |
| ${_TIMEOUT_COMMAND_PART} /bin/sh -c '%s' | ||
| exit_code=$? | ||
|
|
||
| if [ "$DEBUG_ENABLED" = "true" ] && [ $exit_code -ne 0 ]; then | ||
| echo "[postStart] failure encountered, sleep for debugging" >&2 | ||
| sleep %d | ||
| fi | ||
|
|
||
| # Check the exit code based on whether timeout was attempted | ||
| if [ "$_WAS_TIMEOUT_USED" = "true" ]; then | ||
| if [ $exit_code -eq 143 ]; then # 128 + 15 (SIGTERM) | ||
|
|
@@ -239,5 +268,19 @@ else | |
| fi | ||
|
|
||
| exit $exit_code | ||
| `, timeoutSeconds, escapedUserScript) | ||
| `, timeoutSeconds, postStartFailureDebugSleepSeconds > 0, escapedUserScript, postStartFailureDebugSleepSeconds) | ||
| } | ||
|
|
||
| func parsePostStartFailureDebugSleepDurationToSeconds(logger logr.Logger, durationStr string) int { | ||
| if durationStr == "" { | ||
| return 0 | ||
| } | ||
|
|
||
| d, err := time.ParseDuration(durationStr) | ||
| if err != nil { | ||
| logger.Error(err, "Failed to parse postStart failure debug sleep duration for ", "durationStr", durationStr) | ||
| return 0 | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Could we also log the error in this case?
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks, I’ve added the log statement as you advised. |
||
| } | ||
|
|
||
| return int(d.Seconds()) | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
To be honest I don't like using
ProgressTimeoutfor this purpose.But on the other hand I don't have another solution but some constant
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Actually, I use
ProgressTimeoutto be consistent with the behavior of the Debug annotation when it fails for the main component.We do not scale down the failing workspace until the failing timeout is satisfied:
devworkspace-operator/controllers/workspace/devworkspace_controller.go
Lines 170 to 173 in b61eaed
Inside the
checkForFailingTimeout, we're parsing ProgressTimeout:devworkspace-operator/controllers/workspace/status.go
Line 317 in b61eaed