Skip to content

Commit a0ddd30

Browse files
committed
Bail out after stop timeout
Previously, the shutdown code looped endlessly until the child process finished, requesting graceful termination over and over again. Change this to a single request-termination -> wait -> bail-out sequence. This ensures that k0s won't hang when the supervised processes can't be terminated for whatever reason: the code will return at the latest once the timeout has expired. Use a buffered channel for the wait result, so that the goroutine will be able to exit even if nothing reads from the channel anymore. Introduce fine-grained error reporting to differentiate shutdown outcomes (graceful shutdown, forced kill, failure, and so on). Signed-off-by: Tom Wieczorek <[email protected]>
1 parent 932e435 commit a0ddd30

File tree

1 file changed

+64
-18
lines changed

1 file changed

+64
-18
lines changed

pkg/supervisor/supervisor.go

Lines changed: 64 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ const k0sManaged = "_K0S_MANAGED=yes"
5858
// processWaitQuit waits for a process to exit or a shutdown signal
5959
// returns true if shutdown is requested
6060
func (s *Supervisor) processWaitQuit(ctx context.Context, cmd *exec.Cmd) bool {
61-
waitresult := make(chan error)
61+
waitresult := make(chan error, 1)
6262
go func() {
6363
waitresult <- cmd.Wait()
6464
}()
@@ -67,24 +67,14 @@ func (s *Supervisor) processWaitQuit(ctx context.Context, cmd *exec.Cmd) bool {
6767

6868
select {
6969
case <-ctx.Done():
70-
for {
71-
s.log.Debug("Requesting graceful termination")
72-
if err := requestGracefulTermination(cmd.Process); err != nil {
73-
if errors.Is(err, os.ErrProcessDone) {
74-
s.log.Info("Failed to request graceful termination: process has already terminated")
75-
} else {
76-
s.log.WithError(err).Error("Failed to request graceful termination")
77-
}
78-
} else {
79-
s.log.Info("Requested graceful termination")
80-
}
81-
select {
82-
case <-time.After(s.TimeoutStop):
83-
continue
84-
case <-waitresult:
85-
return true
86-
}
70+
err := s.terminateSupervisedProcess(cmd, waitresult)
71+
if err != nil {
72+
s.log.WithError(err).Error("Error while terminating process")
73+
} else {
74+
s.log.Info("Process terminated successfully")
8775
}
76+
return true
77+
8878
case err := <-waitresult:
8979
var exitErr *exec.ExitError
9080
state := cmd.ProcessState
@@ -101,6 +91,62 @@ func (s *Supervisor) processWaitQuit(ctx context.Context, cmd *exec.Cmd) bool {
10191
}
10292
}
10393

94+
func (s *Supervisor) terminateSupervisedProcess(cmd *exec.Cmd, waitresult <-chan error) error {
95+
s.log.Debug("Requesting graceful termination")
96+
err := requestGracefulTermination(cmd.Process)
97+
switch {
98+
case err == nil:
99+
// Termination request sent, wait for process to finish.
100+
s.log.Debug("Awaiting graceful process termination for ", s.TimeoutStop)
101+
102+
select {
103+
case err := <-waitresult:
104+
var exitErr *exec.ExitError
105+
switch {
106+
case err == nil:
107+
return nil
108+
case errors.As(err, &exitErr):
109+
if status, ok := exitErr.Sys().(syscall.WaitStatus); ok && status.Signal() == syscall.SIGTERM {
110+
return errors.New("process terminated without handling SIGTERM")
111+
}
112+
return exitErr
113+
default:
114+
return fmt.Errorf("failed to wait for process: %w", err)
115+
}
116+
117+
case <-time.After(s.TimeoutStop):
118+
err = fmt.Errorf("timed out after %s while waiting for process to terminate", s.TimeoutStop)
119+
}
120+
121+
return err
122+
123+
case errors.Is(err, os.ErrProcessDone):
124+
// The process has finished even before the termination could be requested.
125+
select {
126+
case err = <-waitresult:
127+
var exitErr *exec.ExitError
128+
state := cmd.ProcessState
129+
switch {
130+
case errors.As(err, &exitErr):
131+
state = exitErr.ProcessState
132+
fallthrough
133+
case err == nil:
134+
err = errors.New(state.String())
135+
default:
136+
return fmt.Errorf("failed to wait for process: %s (%w)", state, err)
137+
}
138+
default:
139+
err = errors.New("process state unavailable")
140+
}
141+
142+
return fmt.Errorf("process terminated before graceful termination could be requested: %w", err)
143+
144+
default:
145+
// Something else went wrong
146+
return fmt.Errorf("failed to request graceful termination: %w", err)
147+
}
148+
}
149+
104150
// Supervise Starts supervising the given process
105151
func (s *Supervisor) Supervise() error {
106152
s.startStopMutex.Lock()

0 commit comments

Comments
 (0)