diff --git a/lib/service/signals.go b/lib/service/signals.go index 68496351eabe9..fadd884e01728 100644 --- a/lib/service/signals.go +++ b/lib/service/signals.go @@ -40,6 +40,16 @@ import ( "github.com/gravitational/teleport/lib/utils" ) +const ( + // fastShutdownTimeout is how long we're going to wait before connections + // are forcibly terminated during a fast shutdown. + fastShutdownTimeout = time.Second * 3 + + // fastShutdownGrace is how long we're going to wait for the shutdown + // procedure to complete after the fastShutdownTimeout is hit. + fastShutdownGrace = time.Second * 2 +) + // printShutdownStatus prints running services until shut down func (process *TeleportProcess) printShutdownStatus(ctx context.Context) { statusInterval := defaults.HighResPollingPeriod @@ -91,19 +101,25 @@ func (process *TeleportProcess) WaitForSignals(ctx context.Context) error { process.log.Infof("All services stopped, exiting.") return nil case syscall.SIGTERM, syscall.SIGINT: - timeout := getShutdownTimeout(process.log) - timeoutCtx, cancel := context.WithTimeout(ctx, timeout) - process.log.Infof("Got signal %q, exiting within %vs.", signal, timeout.Seconds()) - // we run the shutdown in a goroutine and return when the - // context is done even if Shutdown hasn't returned because we - // want to ensure that we exit shortly after SIGTERM even in - // case of bugs + process.log.Infof("Got signal %q, exiting within %s.", signal, fastShutdownTimeout) + // we run the shutdown in a goroutine so we can return and exit + // the process even if Shutdown takes longer to return than we + // expected (due to bugs, for example) + shutdownDone := make(chan struct{}) go func() { + defer close(shutdownDone) + timeoutCtx, cancel := context.WithTimeout(ctx, fastShutdownTimeout) defer cancel() process.Shutdown(timeoutCtx) }() - <-timeoutCtx.Done() - process.log.Infof("All services stopped or timeout passed, exiting immediately.") + graceTimer := time.NewTimer(fastShutdownTimeout + fastShutdownGrace) + defer graceTimer.Stop() + select { + case <-graceTimer.C: + process.log.Warn("Shutdown still hasn't completed, exiting anyway.") + case <-shutdownDone: + process.log.Info("All services stopped, exiting.") + } return nil case syscall.SIGUSR1: // All programs placed diagnostics on the standard output. @@ -180,33 +196,6 @@ func (process *TeleportProcess) WaitForSignals(ctx context.Context) error { } } -const ( - defaultShutdownTimeout = time.Second * 3 - maxShutdownTimeout = time.Minute * 10 -) - -func getShutdownTimeout(log logrus.FieldLogger) time.Duration { - timeout := defaultShutdownTimeout - - // read undocumented env var TELEPORT_UNSTABLE_SHUTDOWN_TIMEOUT. - // TODO(Tener): DELETE IN 15.0. after ironing out all possible shutdown bugs. - override := os.Getenv("TELEPORT_UNSTABLE_SHUTDOWN_TIMEOUT") - if override != "" { - t, err := time.ParseDuration(override) - if err != nil { - log.Warnf("Cannot parse timeout override %q, using default instead.", override) - } - if err == nil { - if t > maxShutdownTimeout { - log.Warnf("Timeout override %q exceeds maximum value, reducing.", override) - t = maxShutdownTimeout - } - timeout = t - } - } - return timeout -} - // ErrTeleportReloading is returned when signal waiter exits // because the teleport process has initiaded shutdown var ErrTeleportReloading = &trace.CompareFailedError{Message: "teleport process is reloading"} diff --git a/lib/service/signals_test.go b/lib/service/signals_test.go deleted file mode 100644 index b69e963b50171..0000000000000 --- a/lib/service/signals_test.go +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Teleport - * Copyright (C) 2023 Gravitational, Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . - */ - -package service - -import ( - "testing" - "time" - - "github.com/sirupsen/logrus" - "github.com/stretchr/testify/require" -) - -func Test_getShutdownTimeout(t *testing.T) { - tests := []struct { - name string - envValue string - want time.Duration - }{ - { - name: "no override", - envValue: "", - want: defaultShutdownTimeout, - }, - { - name: "accept valid override, one second", - envValue: "1s", - want: time.Second * 1, - }, - { - name: "accept valid override, one minute", - envValue: "1m", - want: time.Minute * 1, - }, - { - name: "ignore invalid override", - envValue: "one moment", - want: defaultShutdownTimeout, - }, - { - name: "valid override above maximum, trim", - envValue: "3000h", - want: maxShutdownTimeout, - }, - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - t.Setenv("TELEPORT_UNSTABLE_SHUTDOWN_TIMEOUT", tt.envValue) - require.Equal(t, tt.want, getShutdownTimeout(logrus.StandardLogger())) - }) - } -}